diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2016-12-07 16:20:31 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2016-12-07 16:20:31 +1100 |
commit | e794ca9f1b701134528e7248a7e25d5c7760d3f3 (patch) | |
tree | 27a63f6a5582b4ea0c4855319845636ee5cbdb2c | |
parent | e64e6ec3a48aa4daf403fe17d99abd161bf5547f (diff) | |
parent | 755edccbffa1e954ca31206b7e04b2499c88aef1 (diff) | |
download | linux-next-e794ca9f1b701134528e7248a7e25d5c7760d3f3.tar.gz |
Merge branch 'akpm/master'
129 files changed, 4817 insertions, 3300 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 1b5f15653b1b..ace63cd7af8c 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -20,7 +20,7 @@ prototypes: void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); struct vfsmount *(*d_automount)(struct path *path); - int (*d_manage)(struct dentry *, bool); + int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, unsigned int); @@ -556,7 +556,7 @@ till "end_pgoff". ->map_pages() is called with page table locked and must not block. If it's not possible to reach a page without blocking, filesystem should skip it. Filesystem should use do_set_pte() to setup page table entry. Pointer to entry associated with the page is passed in -"pte" field in fault_env structure. Pointers to entries for other offsets +"pte" field in vm_fault structure. Pointers to entries for other offsets should be calculated relative to "pte". ->page_mkwrite() is called when a previously read-only pte is diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index b5039a00caaf..3893f4d44cd4 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -948,7 +948,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); - int (*d_manage)(struct dentry *, bool); + int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, unsigned int); }; diff --git a/arch/Kconfig b/arch/Kconfig index abab6590f08f..13f27c1a39ba 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -5,6 +5,9 @@ config KEXEC_CORE bool +config HAVE_IMA_KEXEC + bool + config OPROFILE tristate "OProfile system profiling" depends on PROFILING diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3da87e198878..a8ee573fe610 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -469,6 +469,7 @@ config KEXEC config KEXEC_FILE bool "kexec file based system call" select KEXEC_CORE + select HAVE_IMA_KEXEC select BUILD_BIN2C depends on PPC64 depends on CRYPTO=y diff --git a/arch/powerpc/include/asm/ima.h b/arch/powerpc/include/asm/ima.h new file mode 100644 index 000000000000..2313bdface34 --- /dev/null +++ b/arch/powerpc/include/asm/ima.h @@ -0,0 +1,29 @@ +#ifndef _ASM_POWERPC_IMA_H +#define _ASM_POWERPC_IMA_H + +struct kimage; + +int ima_get_kexec_buffer(void **addr, size_t *size); +int ima_free_kexec_buffer(void); + +#ifdef CONFIG_IMA +void remove_ima_buffer(void *fdt, int chosen_node); +#else +static inline void remove_ima_buffer(void *fdt, int chosen_node) {} +#endif + +#ifdef CONFIG_IMA_KEXEC +int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr, + size_t size); + +int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node); +#else +static inline int setup_ima_buffer(const struct kimage *image, void *fdt, + int chosen_node) +{ + remove_ima_buffer(fdt, chosen_node); + return 0; +} +#endif /* CONFIG_IMA_KEXEC */ + +#endif /* _ASM_POWERPC_IMA_H */ diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 6c3b71502fbc..25668bc8cb2a 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -94,11 +94,22 @@ static inline bool kdump_in_progress(void) #ifdef CONFIG_KEXEC_FILE extern struct kexec_file_ops kexec_elf64_ops; +#ifdef CONFIG_IMA_KEXEC +#define ARCH_HAS_KIMAGE_ARCH + +struct kimage_arch { + phys_addr_t ima_buffer_addr; + size_t ima_buffer_size; +}; +#endif + int setup_purgatory(struct kimage *image, const void *slave_code, const void *fdt, unsigned long kernel_load_addr, unsigned long fdt_load_addr); -int setup_new_fdt(void *fdt, unsigned long initrd_load_addr, - unsigned long initrd_len, const char *cmdline); +int setup_new_fdt(const struct kimage *image, void *fdt, + unsigned long initrd_load_addr, unsigned long initrd_len, + const char *cmdline); +int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size); #endif /* CONFIG_KEXEC_FILE */ #else /* !CONFIG_KEXEC_CORE */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index a3a6047fd395..23f8082d7bfa 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -112,6 +112,10 @@ obj-$(CONFIG_PCI_MSI) += msi.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o crash.o \ machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file_$(BITS).o kexec_elf_$(BITS).o +ifeq ($(CONFIG_HAVE_IMA_KEXEC)$(CONFIG_IMA),yy) +obj-y += ima_kexec.o +endif + obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o diff --git a/arch/powerpc/kernel/ima_kexec.c b/arch/powerpc/kernel/ima_kexec.c new file mode 100644 index 000000000000..5ea42c937ca9 --- /dev/null +++ b/arch/powerpc/kernel/ima_kexec.c @@ -0,0 +1,223 @@ +/* + * Copyright (C) 2016 IBM Corporation + * + * Authors: + * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/slab.h> +#include <linux/kexec.h> +#include <linux/of.h> +#include <linux/memblock.h> +#include <linux/libfdt.h> + +static int get_addr_size_cells(int *addr_cells, int *size_cells) +{ + struct device_node *root; + + root = of_find_node_by_path("/"); + if (!root) + return -EINVAL; + + *addr_cells = of_n_addr_cells(root); + *size_cells = of_n_size_cells(root); + + of_node_put(root); + + return 0; +} + +static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr, + size_t *size) +{ + int ret, addr_cells, size_cells; + + ret = get_addr_size_cells(&addr_cells, &size_cells); + if (ret) + return ret; + + if (len < 4 * (addr_cells + size_cells)) + return -ENOENT; + + *addr = of_read_number(prop, addr_cells); + *size = of_read_number(prop + 4 * addr_cells, size_cells); + + return 0; +} + +/** + * ima_get_kexec_buffer - get IMA buffer from the previous kernel + * @addr: On successful return, set to point to the buffer contents. + * @size: On successful return, set to the buffer size. + * + * Return: 0 on success, negative errno on error. + */ +int ima_get_kexec_buffer(void **addr, size_t *size) +{ + int ret, len; + unsigned long tmp_addr; + size_t tmp_size; + const void *prop; + + prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len); + if (!prop) + return -ENOENT; + + ret = do_get_kexec_buffer(prop, len, &tmp_addr, &tmp_size); + if (ret) + return ret; + + *addr = __va(tmp_addr); + *size = tmp_size; + + return 0; +} + +/** + * ima_free_kexec_buffer - free memory used by the IMA buffer + */ +int ima_free_kexec_buffer(void) +{ + int ret; + unsigned long addr; + size_t size; + struct property *prop; + + prop = of_find_property(of_chosen, "linux,ima-kexec-buffer", NULL); + if (!prop) + return -ENOENT; + + ret = do_get_kexec_buffer(prop->value, prop->length, &addr, &size); + if (ret) + return ret; + + ret = of_remove_property(of_chosen, prop); + if (ret) + return ret; + + return memblock_free(addr, size); + +} + +/** + * remove_ima_buffer - remove the IMA buffer property and reservation from @fdt + * + * The IMA measurement buffer is of no use to a subsequent kernel, so we always + * remove it from the device tree. + */ +void remove_ima_buffer(void *fdt, int chosen_node) +{ + int ret, len; + unsigned long addr; + size_t size; + const void *prop; + + prop = fdt_getprop(fdt, chosen_node, "linux,ima-kexec-buffer", &len); + if (!prop) + return; + + ret = do_get_kexec_buffer(prop, len, &addr, &size); + fdt_delprop(fdt, chosen_node, "linux,ima-kexec-buffer"); + if (ret) + return; + + ret = delete_fdt_mem_rsv(fdt, addr, size); + if (!ret) + pr_debug("Removed old IMA buffer reservation.\n"); +} + +#ifdef CONFIG_IMA_KEXEC +/** + * arch_ima_add_kexec_buffer - do arch-specific steps to add the IMA buffer + * + * Architectures should use this function to pass on the IMA buffer + * information to the next kernel. + * + * Return: 0 on success, negative errno on error. + */ +int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr, + size_t size) +{ + image->arch.ima_buffer_addr = load_addr; + image->arch.ima_buffer_size = size; + + return 0; +} + +static int write_number(void *p, u64 value, int cells) +{ + if (cells == 1) { + u32 tmp; + + if (value > U32_MAX) + return -EINVAL; + + tmp = cpu_to_be32(value); + memcpy(p, &tmp, sizeof(tmp)); + } else if (cells == 2) { + u64 tmp; + + tmp = cpu_to_be64(value); + memcpy(p, &tmp, sizeof(tmp)); + } else + return -EINVAL; + + return 0; +} + +/** + * setup_ima_buffer - add IMA buffer information to the fdt + * @image: kexec image being loaded. + * @fdt: Flattened device tree for the next kernel. + * @chosen_node: Offset to the chosen node. + * + * Return: 0 on success, or negative errno on error. + */ +int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node) +{ + int ret, addr_cells, size_cells, entry_size; + u8 value[16]; + + remove_ima_buffer(fdt, chosen_node); + if (!image->arch.ima_buffer_size) + return 0; + + ret = get_addr_size_cells(&addr_cells, &size_cells); + if (ret) + return ret; + + entry_size = 4 * (addr_cells + size_cells); + + if (entry_size > sizeof(value)) + return -EINVAL; + + ret = write_number(value, image->arch.ima_buffer_addr, addr_cells); + if (ret) + return ret; + + ret = write_number(value + 4 * addr_cells, image->arch.ima_buffer_size, + size_cells); + if (ret) + return ret; + + ret = fdt_setprop(fdt, chosen_node, "linux,ima-kexec-buffer", value, + entry_size); + if (ret < 0) + return -EINVAL; + + ret = fdt_add_mem_rsv(fdt, image->arch.ima_buffer_addr, + image->arch.ima_buffer_size); + if (ret) + return -EINVAL; + + pr_debug("IMA buffer at 0x%llx, size = 0x%zx\n", + image->arch.ima_buffer_addr, image->arch.ima_buffer_size); + + return 0; +} +#endif /* CONFIG_IMA_KEXEC */ diff --git a/arch/powerpc/kernel/kexec_elf_64.c b/arch/powerpc/kernel/kexec_elf_64.c index 6acffd34a70f..9a42309b091a 100644 --- a/arch/powerpc/kernel/kexec_elf_64.c +++ b/arch/powerpc/kernel/kexec_elf_64.c @@ -627,7 +627,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf, goto out; } - ret = setup_new_fdt(fdt, initrd_load_addr, initrd_len, cmdline); + ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline); if (ret) goto out; diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c index 7abc8a75ee48..992c0d258e5d 100644 --- a/arch/powerpc/kernel/machine_kexec_file_64.c +++ b/arch/powerpc/kernel/machine_kexec_file_64.c @@ -27,6 +27,7 @@ #include <linux/memblock.h> #include <linux/of_fdt.h> #include <linux/libfdt.h> +#include <asm/ima.h> #define SLAVE_CODE_SIZE 256 @@ -180,7 +181,7 @@ int setup_purgatory(struct kimage *image, const void *slave_code, * * Return: 0 on success, or negative errno on error. */ -static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size) +int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size) { int i, ret, num_rsvs = fdt_num_mem_rsv(fdt); @@ -209,6 +210,7 @@ static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size /* * setup_new_fdt - modify /chosen and memory reservation for the next kernel + * @image: kexec image being loaded. * @fdt: Flattened device tree for the next kernel. * @initrd_load_addr: Address where the next initrd will be loaded. * @initrd_len: Size of the next initrd, or 0 if there will be none. @@ -217,8 +219,9 @@ static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size * * Return: 0 on success, or negative errno on error. */ -int setup_new_fdt(void *fdt, unsigned long initrd_load_addr, - unsigned long initrd_len, const char *cmdline) +int setup_new_fdt(const struct kimage *image, void *fdt, + unsigned long initrd_load_addr, unsigned long initrd_len, + const char *cmdline) { int ret, chosen_node; const void *prop; @@ -328,6 +331,12 @@ int setup_new_fdt(void *fdt, unsigned long initrd_load_addr, } } + ret = setup_ima_buffer(image, fdt, chosen_node); + if (ret) { + pr_err("Error setting up the new device tree.\n"); + return ret; + } + ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0); if (ret) { pr_err("Error setting up the new device tree.\n"); diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 06254467e4dd..3a147122bc98 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -236,7 +236,6 @@ static int spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct spu_context *ctx = vma->vm_file->private_data; - unsigned long address = (unsigned long)vmf->virtual_address; unsigned long pfn, offset; offset = vmf->pgoff << PAGE_SHIFT; @@ -244,7 +243,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; pr_debug("spufs_mem_mmap_fault address=0x%lx, offset=0x%lx\n", - address, offset); + vmf->address, offset); if (spu_acquire(ctx)) return VM_FAULT_NOPAGE; @@ -256,7 +255,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT; } - vm_insert_pfn(vma, address, pfn); + vm_insert_pfn(vma, vmf->address, pfn); spu_release(ctx); @@ -355,8 +354,7 @@ static int spufs_ps_fault(struct vm_area_struct *vma, down_read(¤t->mm->mmap_sem); } else { area = ctx->spu->problem_phys + ps_offs; - vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, - (area + offset) >> PAGE_SHIFT); + vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu); } diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c index a9973bb4a1b2..95e73c63c99d 100644 --- a/arch/sparc/kernel/nmi.c +++ b/arch/sparc/kernel/nmi.c @@ -42,7 +42,7 @@ static int panic_on_timeout; */ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ EXPORT_SYMBOL(nmi_active); - +static int nmi_init_done; static unsigned int nmi_hz = HZ; static DEFINE_PER_CPU(short, wd_enabled); static int endflag __initdata; @@ -153,6 +153,8 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count) void stop_nmi_watchdog(void *unused) { + if (!__this_cpu_read(wd_enabled)) + return; pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable); __this_cpu_write(wd_enabled, 0); atomic_dec(&nmi_active); @@ -207,6 +209,9 @@ error: void start_nmi_watchdog(void *unused) { + if (__this_cpu_read(wd_enabled)) + return; + __this_cpu_write(wd_enabled, 1); atomic_inc(&nmi_active); @@ -259,6 +264,8 @@ int __init nmi_init(void) } } + nmi_init_done = 1; + return err; } @@ -270,3 +277,38 @@ static int __init setup_nmi_watchdog(char *str) return 0; } __setup("nmi_watchdog=", setup_nmi_watchdog); + +/* + * sparc specific NMI watchdog enable function. + * Enables watchdog if it is not enabled already. + */ +int watchdog_nmi_enable(unsigned int cpu) +{ + if (atomic_read(&nmi_active) == -1) { + pr_warn("NMI watchdog cannot be enabled or disabled\n"); + return -1; + } + + /* + * watchdog thread could start even before nmi_init is called. + * Just Return in that case. Let nmi_init finish the init + * process first. + */ + if (!nmi_init_done) + return 0; + + smp_call_function_single(cpu, start_nmi_watchdog, NULL, 1); + + return 0; +} +/* + * sparc specific NMI watchdog disable function. + * Disables watchdog if it is not disabled already. + */ +void watchdog_nmi_disable(unsigned int cpu) +{ + if (atomic_read(&nmi_active) == -1) + pr_warn_once("NMI watchdog cannot be enabled or disabled\n"); + else + smp_call_function_single(cpu, stop_nmi_watchdog, NULL, 1); +} diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index e739002427ed..40121d14d34d 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -109,7 +109,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, return VM_FAULT_SIGBUS; if (sym_offset == image->sym_vvar_page) { - ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, + ret = vm_insert_pfn(vma, vmf->address, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = @@ -117,7 +117,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { ret = vm_insert_pfn( vma, - (unsigned long)vmf->virtual_address, + vmf->address, __pa(pvti) >> PAGE_SHIFT); } } diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c index 199b8e99f7d7..737187865269 100644 --- a/drivers/char/agp/alpha-agp.c +++ b/drivers/char/agp/alpha-agp.c @@ -19,8 +19,7 @@ static int alpha_core_agp_vm_fault(struct vm_area_struct *vma, unsigned long pa; struct page *page; - dma_addr = (unsigned long)vmf->virtual_address - vma->vm_start - + agp->aperture.bus_base; + dma_addr = vmf->address - vma->vm_start + agp->aperture.bus_base; pa = agp->ops->translate(agp, dma_addr); if (pa == (unsigned long)-EINVAL) diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index f3f92d5fcda0..a697ca0cab1e 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -227,7 +227,7 @@ mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * be because another thread has installed the pte first, so it * is no problem. */ - vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); + vm_insert_pfn(vma, vmf->address, pfn); return VM_FAULT_NOPAGE; } diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c index 7a4869151d3b..a77262d31911 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c @@ -84,7 +84,7 @@ EXPORT_SYMBOL_GPL(tpm_put_ops); * * The return'd chip has been tpm_try_get_ops'd and must be released via * tpm_put_ops - */ + */ struct tpm_chip *tpm_chip_find_get(int chip_num) { struct tpm_chip *chip, *res = NULL; @@ -103,7 +103,7 @@ struct tpm_chip *tpm_chip_find_get(int chip_num) } } while (chip_prev != chip_num); } else { - chip = idr_find_slowpath(&dev_nums_idr, chip_num); + chip = idr_find(&dev_nums_idr, chip_num); if (chip && !tpm_try_get_ops(chip)) res = chip; } diff --git a/drivers/clk/uniphier/clk-uniphier-mio.c b/drivers/clk/uniphier/clk-uniphier-mio.c index 4974d38623c0..002b333b4060 100644 --- a/drivers/clk/uniphier/clk-uniphier-mio.c +++ b/drivers/clk/uniphier/clk-uniphier-mio.c @@ -30,38 +30,40 @@ .name = "sd" #ch "-sel", \ .type = UNIPHIER_CLK_TYPE_MUX, \ .idx = -1, \ - .mux = { \ - .parent_names = { \ - "sd-44m", \ - "sd-33m", \ - "sd-50m", \ - "sd-67m", \ - "sd-100m", \ - "sd-40m", \ - "sd-25m", \ - "sd-22m", \ - }, \ - .num_parents = 8, \ - .reg = 0x30 + 0x200 * (ch), \ - .masks = { \ - 0x00031000, \ - 0x00031000, \ - 0x00031000, \ - 0x00031000, \ - 0x00001300, \ - 0x00001300, \ - 0x00001300, \ - 0x00001300, \ - }, \ - .vals = { \ - 0x00000000, \ - 0x00010000, \ - 0x00020000, \ - 0x00030000, \ - 0x00001000, \ - 0x00001100, \ - 0x00001200, \ - 0x00001300, \ + { \ + .mux = { \ + .parent_names = { \ + "sd-44m", \ + "sd-33m", \ + "sd-50m", \ + "sd-67m", \ + "sd-100m", \ + "sd-40m", \ + "sd-25m", \ + "sd-22m", \ + }, \ + .num_parents = 8, \ + .reg = 0x30 + 0x200 * (ch), \ + .masks = { \ + 0x00031000, \ + 0x00031000, \ + 0x00031000, \ + 0x00031000, \ + 0x00001300, \ + 0x00001300, \ + 0x00001300, \ + 0x00001300, \ + }, \ + .vals = { \ + 0x00000000, \ + 0x00010000, \ + 0x00020000, \ + 0x00030000, \ + 0x00001000, \ + 0x00001100, \ + 0x00001200, \ + 0x00001300, \ + }, \ }, \ }, \ }, \ diff --git a/drivers/clk/uniphier/clk-uniphier.h b/drivers/clk/uniphier/clk-uniphier.h index 81d7e5c9bd86..23489923314a 100644 --- a/drivers/clk/uniphier/clk-uniphier.h +++ b/drivers/clk/uniphier/clk-uniphier.h @@ -81,12 +81,14 @@ struct uniphier_clk_data { .name = (_name), \ .type = UNIPHIER_CLK_TYPE_CPUGEAR, \ .idx = (_idx), \ - .cpugear = { \ - .parent_names = { __VA_ARGS__ }, \ - .num_parents = (_num_parents), \ - .regbase = (_regbase), \ - .mask = (_mask) \ - }, \ + { \ + .cpugear = { \ + .parent_names = { __VA_ARGS__ },\ + .num_parents = (_num_parents), \ + .regbase = (_regbase), \ + .mask = (_mask) \ + }, \ + } \ } #define UNIPHIER_CLK_FACTOR(_name, _idx, _parent, _mult, _div) \ @@ -94,10 +96,12 @@ struct uniphier_clk_data { .name = (_name), \ .type = UNIPHIER_CLK_TYPE_FIXED_FACTOR, \ .idx = (_idx), \ - .factor = { \ - .parent_name = (_parent), \ - .mult = (_mult), \ - .div = (_div), \ + { \ + .factor = { \ + .parent_name = (_parent), \ + .mult = (_mult), \ + .div = (_div), \ + }, \ }, \ } @@ -106,11 +110,13 @@ struct uniphier_clk_data { .name = (_name), \ .type = UNIPHIER_CLK_TYPE_GATE, \ .idx = (_idx), \ - .gate = { \ - .parent_name = (_parent), \ - .reg = (_reg), \ - .bit = (_bit), \ - }, \ + { \ + .gate = { \ + .parent_name = (_parent), \ + .reg = (_reg), \ + .bit = (_bit), \ + }, \ + } \ } #define UNIPHIER_CLK_DIV(parent, div) \ diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 3d94ff20fdca..028b56ab9e31 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -328,7 +328,6 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long vaddr = (unsigned long) vmf->virtual_address; struct device *dev = &dax_dev->dev; struct dax_region *dax_region; int rc = VM_FAULT_SIGBUS; @@ -353,7 +352,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - rc = vm_insert_mixed(vma, vaddr, pfn); + rc = vm_insert_mixed(vma, vmf->address, pfn); if (rc == -ENOMEM) return VM_FAULT_OOM; diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c index 768087ddb046..a293c8be232c 100644 --- a/drivers/gpu/drm/armada/armada_gem.c +++ b/drivers/gpu/drm/armada/armada_gem.c @@ -17,12 +17,11 @@ static int armada_gem_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct armada_gem_object *obj = drm_to_armada_gem(vma->vm_private_data); - unsigned long addr = (unsigned long)vmf->virtual_address; unsigned long pfn = obj->phys_addr >> PAGE_SHIFT; int ret; - pfn += (addr - vma->vm_start) >> PAGE_SHIFT; - ret = vm_insert_pfn(vma, addr, pfn); + pfn += (vmf->address - vma->vm_start) >> PAGE_SHIFT; + ret = vm_insert_pfn(vma, vmf->address, pfn); switch (ret) { case 0: diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index caa4e4ca616d..bd311c77c254 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -124,8 +124,7 @@ static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Using vm_pgoff as a selector forces us to use this unusual * addressing scheme. */ - resource_size_t offset = (unsigned long)vmf->virtual_address - - vma->vm_start; + resource_size_t offset = vmf->address - vma->vm_start; resource_size_t baddr = map->offset + offset; struct drm_agp_mem *agpmem; struct page *page; @@ -195,7 +194,7 @@ static int drm_do_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!map) return VM_FAULT_SIGBUS; /* Nothing allocated */ - offset = (unsigned long)vmf->virtual_address - vma->vm_start; + offset = vmf->address - vma->vm_start; i = (unsigned long)map->handle + offset; page = vmalloc_to_page((void *)i); if (!page) @@ -301,7 +300,8 @@ static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!dma->pagelist) return VM_FAULT_SIGBUS; /* Nothing allocated */ - offset = (unsigned long)vmf->virtual_address - vma->vm_start; /* vm_[pg]off[set] should be 0 */ + offset = vmf->address - vma->vm_start; + /* vm_[pg]off[set] should be 0 */ page_nr = offset >> PAGE_SHIFT; /* page_nr could just be vmf->pgoff */ page = virt_to_page((void *)dma->pagelist[page_nr]); @@ -337,7 +337,7 @@ static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!entry->pagelist) return VM_FAULT_SIGBUS; /* Nothing allocated */ - offset = (unsigned long)vmf->virtual_address - vma->vm_start; + offset = vmf->address - vma->vm_start; map_offset = map->offset - (unsigned long)dev->sg->virtual; page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT); page = entry->pagelist[page_offset]; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 7d066a91d778..114dddbd297b 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -202,15 +202,14 @@ int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } /* We don't use vmf->pgoff since that has the fake offset: */ - pgoff = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; page = pages[pgoff]; - VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, + VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, page_to_pfn(page), page_to_pfn(page) << PAGE_SHIFT); - ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); + ret = vm_insert_page(vma, vmf->address, page); out: switch (ret) { @@ -759,7 +758,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages( down_read(&mm->mmap_sem); while (pinned < npages) { ret = get_user_pages_remote(task, mm, ptr, npages - pinned, - flags, pvec + pinned, NULL); + flags, pvec + pinned, NULL, NULL); if (ret < 0) break; diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index ea7a18230888..57b81460fec8 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -455,8 +455,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) pgoff_t page_offset; int ret; - page_offset = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; if (page_offset >= (exynos_gem->size >> PAGE_SHIFT)) { DRM_ERROR("invalid page offset\n"); @@ -465,8 +464,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } pfn = page_to_pfn(exynos_gem->pages[page_offset]); - ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, - __pfn_to_pfn_t(pfn, PFN_DEV)); + ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); out: switch (ret) { diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index 4071b2d1e8cf..8b44fa542562 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c @@ -125,7 +125,7 @@ static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) psbfb->gtt->offset; page_num = vma_pages(vma); - address = (unsigned long)vmf->virtual_address - (vmf->pgoff << PAGE_SHIFT); + address = vmf->address - (vmf->pgoff << PAGE_SHIFT); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/gma500/gem.c b/drivers/gpu/drm/gma500/gem.c index 6d1cb6b370b1..527c62917660 100644 --- a/drivers/gpu/drm/gma500/gem.c +++ b/drivers/gpu/drm/gma500/gem.c @@ -197,15 +197,14 @@ int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) /* Page relative to the VMA start - we must calculate this ourselves because vmf->pgoff is the fake GEM offset */ - page_offset = ((unsigned long) vmf->virtual_address - vma->vm_start) - >> PAGE_SHIFT; + page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; /* CPU view of the page, don't go via the GART for CPU writes */ if (r->stolen) pfn = (dev_priv->stolen_base + r->offset) >> PAGE_SHIFT; else pfn = page_to_pfn(r->pages[page_offset]); - ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); + ret = vm_insert_pfn(vma, vmf->address, pfn); fail: mutex_unlock(&dev_priv->mmap_mutex); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d0dcaf35b429..412f3513f269 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1796,8 +1796,7 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) int ret; /* We don't use vmf->pgoff since that has the fake offset */ - page_offset = ((unsigned long)vmf->virtual_address - area->vm_start) >> - PAGE_SHIFT; + page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT; trace_i915_gem_object_fault(obj, page_offset, true, write); diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 107ddf51065e..d068af2ec3a3 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -515,7 +515,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) obj->userptr.ptr + pinned * PAGE_SIZE, npages - pinned, flags, - pvec + pinned, NULL); + pvec + pinned, NULL, NULL); if (ret < 0) break; diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index cd06cfd94687..d8bc59c7e261 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -225,16 +225,14 @@ int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } /* We don't use vmf->pgoff since that has the fake offset: */ - pgoff = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; pfn = page_to_pfn(pages[pgoff]); - VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, + VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, pfn, pfn << PAGE_SHIFT); - ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, - __pfn_to_pfn_t(pfn, PFN_DEV)); + ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); out_unlock: mutex_unlock(&dev->struct_mutex); diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index d4e1e11466f8..4a90c690f09e 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -398,8 +398,7 @@ static int fault_1d(struct drm_gem_object *obj, pgoff_t pgoff; /* We don't use vmf->pgoff since that has the fake offset: */ - pgoff = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; if (omap_obj->pages) { omap_gem_cpu_sync(obj, pgoff); @@ -409,11 +408,10 @@ static int fault_1d(struct drm_gem_object *obj, pfn = (omap_obj->paddr >> PAGE_SHIFT) + pgoff; } - VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, + VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, pfn, pfn << PAGE_SHIFT); - return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, - __pfn_to_pfn_t(pfn, PFN_DEV)); + return vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); } /* Special handling for the case of faulting in 2d tiled buffers */ @@ -427,7 +425,7 @@ static int fault_2d(struct drm_gem_object *obj, struct page *pages[64]; /* XXX is this too much to have on stack? */ unsigned long pfn; pgoff_t pgoff, base_pgoff; - void __user *vaddr; + unsigned long vaddr; int i, ret, slots; /* @@ -447,8 +445,7 @@ static int fault_2d(struct drm_gem_object *obj, const int m = 1 + ((omap_obj->width << fmt) / PAGE_SIZE); /* We don't use vmf->pgoff since that has the fake offset: */ - pgoff = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; /* * Actual address we start mapping at is rounded down to previous slot @@ -459,7 +456,7 @@ static int fault_2d(struct drm_gem_object *obj, /* figure out buffer width in slots */ slots = omap_obj->width >> priv->usergart[fmt].slot_shift; - vaddr = vmf->virtual_address - ((pgoff - base_pgoff) << PAGE_SHIFT); + vaddr = vmf->address - ((pgoff - base_pgoff) << PAGE_SHIFT); entry = &priv->usergart[fmt].entry[priv->usergart[fmt].last]; @@ -503,12 +500,11 @@ static int fault_2d(struct drm_gem_object *obj, pfn = entry->paddr >> PAGE_SHIFT; - VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, + VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, pfn, pfn << PAGE_SHIFT); for (i = n; i > 0; i--) { - vm_insert_mixed(vma, (unsigned long)vaddr, - __pfn_to_pfn_t(pfn, PFN_DEV)); + vm_insert_mixed(vma, vaddr, __pfn_to_pfn_t(pfn, PFN_DEV)); pfn += priv->usergart[fmt].stride_pfn; vaddr += PAGE_SIZE * m; } diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index c08e5279eeac..7d853e6b5ff0 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -452,10 +452,10 @@ static int tegra_bo_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!bo->pages) return VM_FAULT_SIGBUS; - offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT; + offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; page = bo->pages[offset]; - err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); + err = vm_insert_page(vma, vmf->address, page); switch (err) { case -EAGAIN: case 0: diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 4748aedc933a..68ef993ab431 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -101,7 +101,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *page; int ret; int i; - unsigned long address = (unsigned long)vmf->virtual_address; + unsigned long address = vmf->address; int retval = VM_FAULT_NOPAGE; struct ttm_mem_type_manager *man = &bdev->man[bo->mem.mem_type]; diff --git a/drivers/gpu/drm/udl/udl_gem.c b/drivers/gpu/drm/udl/udl_gem.c index 818e70712b18..3c0c4bd3f750 100644 --- a/drivers/gpu/drm/udl/udl_gem.c +++ b/drivers/gpu/drm/udl/udl_gem.c @@ -107,14 +107,13 @@ int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned int page_offset; int ret = 0; - page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> - PAGE_SHIFT; + page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; if (!obj->pages) return VM_FAULT_SIGBUS; page = obj->pages[page_offset]; - ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); + ret = vm_insert_page(vma, vmf->address, page); switch (ret) { case -EAGAIN: case 0: diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index f36c14729b55..477e07f0ecb6 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -54,7 +54,7 @@ static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct drm_vgem_gem_object *obj = vma->vm_private_data; /* We don't use vmf->pgoff since that has the fake offset */ - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; struct page *page; page = shmem_read_mapping_page(file_inode(obj->base.filp)->i_mapping, diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 1f0fe3217f23..6b079a31dced 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -578,7 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, */ npages = get_user_pages_remote(owning_process, owning_mm, user_virt, gup_num_pages, - flags, local_page_list, NULL); + flags, local_page_list, NULL, NULL); up_read(&owning_mm->mmap_sem); if (npages < 0) diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 1db0af6c7f94..ba63ca57ed7e 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -439,13 +439,12 @@ static int videobuf_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *page; dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n", - (unsigned long)vmf->virtual_address, - vma->vm_start, vma->vm_end); + vmf->address, vma->vm_start, vma->vm_end); page = alloc_page(GFP_USER | __GFP_DMA32); if (!page) return VM_FAULT_OOM; - clear_user_highpage(page, (unsigned long)vmf->virtual_address); + clear_user_highpage(page, vmf->address); vmf->page = page; return 0; diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index ff5e7e8cb1d1..3907387b6d15 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -124,13 +124,12 @@ void cxl_context_set_mapping(struct cxl_context *ctx, static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct cxl_context *ctx = vma->vm_file->private_data; - unsigned long address = (unsigned long)vmf->virtual_address; u64 area, offset; offset = vmf->pgoff << PAGE_SHIFT; pr_devel("%s: pe: %i address: 0x%lx offset: 0x%llx\n", - __func__, ctx->pe, address, offset); + __func__, ctx->pe, vmf->address, offset); if (ctx->afu->current_mode == CXL_MODE_DEDICATED) { area = ctx->afu->psn_phys; @@ -162,7 +161,7 @@ static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } - vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT); + vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); mutex_unlock(&ctx->status_mutex); diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index 33741ad4a74a..af2e077da4b8 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c @@ -932,7 +932,7 @@ int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned long paddr, vaddr; unsigned long expires; - vaddr = (unsigned long)vmf->virtual_address; + vaddr = vmf->address; gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n", vma, vaddr, GSEG_BASE(vaddr)); STAT(nopfn); diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c index e9cef9de9ed8..c96f9b1d948a 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c @@ -900,8 +900,7 @@ static void iwlagn_gain_computation(struct iwl_priv *priv, /* bound gain by 2 bits value max, 3rd bit is sign */ data->delta_gain_code[i] = - min(abs(delta_g), - (s32) CHAIN_NOISE_MAX_DELTA_GAIN_CODE); + min(abs(delta_g), CHAIN_NOISE_MAX_DELTA_GAIN_CODE); if (delta_g < 0) /* diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index d5cc3070e83f..b653451843c8 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -882,7 +882,7 @@ static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]); pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff])); - ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); + ret = vm_insert_pfn(vma, vmf->address, pfn); mutex_unlock(&buffer->lock); if (ret) return VM_FAULT_ERROR; diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index 0b6d388d8aa4..697cbfbe9374 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -1014,7 +1014,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n", vmf->page, vmf->page->mapping, vmf->page->index, (long)vmf->page->flags, page_count(vmf->page), - page_private(vmf->page), vmf->virtual_address); + page_private(vmf->page), (void *)vmf->address); if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { lock_page(vmf->page); cfio->ft_flags |= VM_FAULT_LOCKED; @@ -1025,12 +1025,12 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) } if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { - CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address); + CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", (void *)vmf->address); return -EFAULT; } if (cfio->ft_flags & VM_FAULT_OOM) { - CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address); + CDEBUG(D_PAGE, "got addr %p - OOM\n", (void *)vmf->address); return -ENOMEM; } diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c index 7abd70b2a588..3151d2a0fe59 100644 --- a/drivers/usb/gadget/function/f_hid.c +++ b/drivers/usb/gadget/function/f_hid.c @@ -905,7 +905,7 @@ static void hidg_free_inst(struct usb_function_instance *f) mutex_lock(&hidg_ida_lock); hidg_put_minor(opts->minor); - if (idr_is_empty(&hidg_ida.idr)) + if (ida_is_empty(&hidg_ida)) ghid_cleanup(); mutex_unlock(&hidg_ida_lock); @@ -931,7 +931,7 @@ static struct usb_function_instance *hidg_alloc_inst(void) mutex_lock(&hidg_ida_lock); - if (idr_is_empty(&hidg_ida.idr)) { + if (ida_is_empty(&hidg_ida)) { status = ghid_setup(NULL, HIDG_MINORS); if (status) { ret = ERR_PTR(status); @@ -944,7 +944,7 @@ static struct usb_function_instance *hidg_alloc_inst(void) if (opts->minor < 0) { ret = ERR_PTR(opts->minor); kfree(opts); - if (idr_is_empty(&hidg_ida.idr)) + if (ida_is_empty(&hidg_ida)) ghid_cleanup(); goto unlock; } diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c index 0de36cda6e41..8054da9276dd 100644 --- a/drivers/usb/gadget/function/f_printer.c +++ b/drivers/usb/gadget/function/f_printer.c @@ -1265,7 +1265,7 @@ static void gprinter_free_inst(struct usb_function_instance *f) mutex_lock(&printer_ida_lock); gprinter_put_minor(opts->minor); - if (idr_is_empty(&printer_ida.idr)) + if (ida_is_empty(&printer_ida)) gprinter_cleanup(); mutex_unlock(&printer_ida_lock); @@ -1289,7 +1289,7 @@ static struct usb_function_instance *gprinter_alloc_inst(void) mutex_lock(&printer_ida_lock); - if (idr_is_empty(&printer_ida.idr)) { + if (ida_is_empty(&printer_ida)) { status = gprinter_setup(PRINTER_MINORS); if (status) { ret = ERR_PTR(status); @@ -1302,7 +1302,7 @@ static struct usb_function_instance *gprinter_alloc_inst(void) if (opts->minor < 0) { ret = ERR_PTR(opts->minor); kfree(opts); - if (idr_is_empty(&printer_ida.idr)) + if (ida_is_empty(&printer_ida)) gprinter_cleanup(); goto unlock; } diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9815e45c23c4..f3726ba12aa6 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -362,7 +362,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, down_read(&mm->mmap_sem); ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, - NULL); + NULL, NULL); up_read(&mm->mmap_sem); } diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 702040fe2001..6e3306f4a525 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -602,7 +602,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", vma, vma->vm_start, vma->vm_end, - vmf->pgoff, vmf->virtual_address); + vmf->pgoff, (void *)vmf->address); return VM_FAULT_SIGBUS; } diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index a1fba4285277..c885daae68c8 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -145,7 +145,7 @@ void autofs4_free_ino(struct autofs_info *); /* Expiration */ int is_autofs4_dentry(struct dentry *); -int autofs4_expire_wait(struct dentry *dentry, int rcu_walk); +int autofs4_expire_wait(const struct path *path, int rcu_walk); int autofs4_expire_run(struct super_block *, struct vfsmount *, struct autofs_sb_info *, struct autofs_packet_expire __user *); @@ -217,7 +217,8 @@ static inline int autofs_prepare_pipe(struct file *pipe) /* Queue management functions */ -int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify); +int autofs4_wait(struct autofs_sb_info *, + const struct path *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); void autofs4_catatonic_mode(struct autofs_sb_info *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index fc09eb77ddf3..afacdaa8dd5a 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -468,7 +468,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, ino = autofs4_dentry_ino(path.dentry); if (ino) { err = 0; - autofs4_expire_wait(path.dentry, 0); + autofs4_expire_wait(&path, 0); spin_lock(&sbi->fs_lock); param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); @@ -575,7 +575,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, devid = new_encode_dev(dev); - err = have_submounts(path.dentry); + err = path_has_submounts(&path); if (follow_down_one(&path)) magic = path.dentry->d_sb->s_magic; diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index d8e6d421c27f..d44ae324df50 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -52,7 +52,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) goto done; } - /* Update the expiry counter if fs is busy */ + /* Update the expiry counter if fs is busy in any namespace */ if (!may_umount_tree(path.mnt)) { struct autofs_info *ino; @@ -191,7 +191,7 @@ static int autofs4_direct_busy(struct vfsmount *mnt, { pr_debug("top %p %pd\n", top, top); - /* If it's busy update the expiry counters */ + /* If it's busy in any namespace update the expiry counters */ if (!may_umount_tree(mnt)) { struct autofs_info *ino; @@ -310,26 +310,29 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, now = jiffies; timeout = sbi->exp_timeout; - spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(root); - /* No point expiring a pending mount */ - if (ino->flags & AUTOFS_INF_PENDING) - goto out; if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(root); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + goto out; + } ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - spin_lock(&sbi->fs_lock); if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + spin_lock(&sbi->fs_lock); ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; } + spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; + spin_unlock(&sbi->fs_lock); } out: - spin_unlock(&sbi->fs_lock); dput(root); return NULL; @@ -495,8 +498,9 @@ found: return expired; } -int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) +int autofs4_expire_wait(const struct path *path, int rcu_walk) { + struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; @@ -525,7 +529,7 @@ retry: pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); - status = autofs4_wait(sbi, dentry, NFY_NONE); + status = autofs4_wait(sbi, path, NFY_NONE); wait_for_completion(&ino->expire_complete); pr_debug("expire done status=%d\n", status); @@ -592,11 +596,12 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (dentry) { struct autofs_info *ino = autofs4_dentry_ino(dentry); + const struct path path = { .mnt = mnt, .dentry = dentry }; /* This is synchronous because it makes the daemon a * little easier */ - ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); + ret = autofs4_wait(sbi, &path, NFY_EXPIRE); spin_lock(&sbi->fs_lock); /* avoid rapid-fire expire attempts if expiry fails */ diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index a11f73174877..08718d30a6ac 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -32,7 +32,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs4_lookup(struct inode *, struct dentry *, unsigned int); static struct vfsmount *autofs4_d_automount(struct path *); -static int autofs4_d_manage(struct dentry *, bool); +static int autofs4_d_manage(const struct path *, bool); static void autofs4_dentry_release(struct dentry *); const struct file_operations autofs4_root_operations = { @@ -107,12 +107,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + const struct path *path; pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry); if (autofs4_oz_mode(sbi)) goto out; + path = &file->f_path; + /* * An empty directory in an autofs file system is always a * mount point. The daemon must have failed to mount this @@ -123,7 +126,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) * it. */ spin_lock(&sbi->lookup_lock); - if (!d_mountpoint(dentry) && simple_empty(dentry)) { + if (!path_is_mountpoint(path) && simple_empty(dentry)) { spin_unlock(&sbi->lookup_lock); return -ENOENT; } @@ -269,39 +272,41 @@ next: return NULL; } -static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk) +static int autofs4_mount_wait(const struct path *path, bool rcu_walk) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs4_sbi(path->dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(path->dentry); int status = 0; if (ino->flags & AUTOFS_INF_PENDING) { if (rcu_walk) return -ECHILD; - pr_debug("waiting for mount name=%pd\n", dentry); - status = autofs4_wait(sbi, dentry, NFY_MOUNT); + pr_debug("waiting for mount name=%pd\n", path->dentry); + status = autofs4_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); } ino->last_used = jiffies; return status; } -static int do_expire_wait(struct dentry *dentry, bool rcu_walk) +static int do_expire_wait(const struct path *path, bool rcu_walk) { + struct dentry *dentry = path->dentry; struct dentry *expiring; expiring = autofs4_lookup_expiring(dentry, rcu_walk); if (IS_ERR(expiring)) return PTR_ERR(expiring); if (!expiring) - return autofs4_expire_wait(dentry, rcu_walk); + return autofs4_expire_wait(path, rcu_walk); else { + const struct path this = { .mnt = path->mnt, .dentry = expiring }; /* * If we are racing with expire the request might not * be quite complete, but the directory has been removed * so it must have been successful, just wait for it. */ - autofs4_expire_wait(expiring, 0); + autofs4_expire_wait(&this, 0); autofs4_del_expiring(expiring); dput(expiring); } @@ -354,7 +359,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * and the directory was removed, so just go ahead and try * the mount. */ - status = do_expire_wait(dentry, 0); + status = do_expire_wait(path, 0); if (status && status != -EAGAIN) return NULL; @@ -362,7 +367,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) spin_lock(&sbi->fs_lock); if (ino->flags & AUTOFS_INF_PENDING) { spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(dentry, 0); + status = autofs4_mount_wait(path, 0); if (status) return ERR_PTR(status); goto done; @@ -370,28 +375,28 @@ static struct vfsmount *autofs4_d_automount(struct path *path) /* * If the dentry is a symlink it's equivalent to a directory - * having d_mountpoint() true, so there's no need to call back - * to the daemon. + * having path_is_mountpoint() true, so there's no need to call + * back to the daemon. */ if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { spin_unlock(&sbi->fs_lock); goto done; } - if (!d_mountpoint(dentry)) { + if (!path_is_mountpoint(path)) { /* * It's possible that user space hasn't removed directories * after umounting a rootless multi-mount, although it - * should. For v5 have_submounts() is sufficient to handle - * this because the leaves of the directory tree under the - * mount never trigger mounts themselves (they have an autofs - * trigger mount mounted on them). But v4 pseudo direct mounts - * do need the leaves to trigger mounts. In this case we - * have no choice but to use the list_empty() check and + * should. For v5 path_has_submounts() is sufficient to + * handle this because the leaves of the directory tree under + * the mount never trigger mounts themselves (they have an + * autofs trigger mount mounted on them). But v4 pseudo direct + * mounts do need the leaves to trigger mounts. In this case + * we have no choice but to use the list_empty() check and * require user space behave. */ if (sbi->version > 4) { - if (have_submounts(dentry)) { + if (path_has_submounts(path)) { spin_unlock(&sbi->fs_lock); goto done; } @@ -403,7 +408,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) } ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(dentry, 0); + status = autofs4_mount_wait(path, 0); spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_PENDING; if (status) { @@ -421,8 +426,9 @@ done: return NULL; } -static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) +static int autofs4_d_manage(const struct path *path, bool rcu_walk) { + struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; @@ -431,20 +437,20 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) /* The daemon never waits. */ if (autofs4_oz_mode(sbi)) { - if (!d_mountpoint(dentry)) + if (!path_is_mountpoint(path)) return -EISDIR; return 0; } /* Wait for pending expires */ - if (do_expire_wait(dentry, rcu_walk) == -ECHILD) + if (do_expire_wait(path, rcu_walk) == -ECHILD) return -ECHILD; /* * This dentry may be under construction so wait on mount * completion. */ - status = autofs4_mount_wait(dentry, rcu_walk); + status = autofs4_mount_wait(path, rcu_walk); if (status) return status; @@ -460,7 +466,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) if (ino->flags & AUTOFS_INF_WANT_EXPIRE) return 0; - if (d_mountpoint(dentry)) + if (path_is_mountpoint(path)) return 0; inode = d_inode_rcu(dentry); if (inode && S_ISLNK(inode->i_mode)) @@ -487,7 +493,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) * we can avoid needless calls ->d_automount() and avoid * an incorrect ELOOP error return. */ - if ((!d_mountpoint(dentry) && !simple_empty(dentry)) || + if ((!path_is_mountpoint(path) && !simple_empty(dentry)) || (d_really_is_positive(dentry) && d_is_symlink(dentry))) status = -EISDIR; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index e44271dfceb6..1278335ce366 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -250,8 +250,9 @@ autofs4_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) static int validate_request(struct autofs_wait_queue **wait, struct autofs_sb_info *sbi, const struct qstr *qstr, - struct dentry *dentry, enum autofs_notify notify) + const struct path *path, enum autofs_notify notify) { + struct dentry *dentry = path->dentry; struct autofs_wait_queue *wq; struct autofs_info *ino; @@ -314,6 +315,7 @@ static int validate_request(struct autofs_wait_queue **wait, */ if (notify == NFY_MOUNT) { struct dentry *new = NULL; + struct path this; int valid = 1; /* @@ -333,7 +335,9 @@ static int validate_request(struct autofs_wait_queue **wait, dentry = new; } } - if (have_submounts(dentry)) + this.mnt = path->mnt; + this.dentry = dentry; + if (path_has_submounts(&this)) valid = 0; if (new) @@ -345,8 +349,9 @@ static int validate_request(struct autofs_wait_queue **wait, } int autofs4_wait(struct autofs_sb_info *sbi, - struct dentry *dentry, enum autofs_notify notify) + const struct path *path, enum autofs_notify notify) { + struct dentry *dentry = path->dentry; struct autofs_wait_queue *wq; struct qstr qstr; char *name; @@ -405,7 +410,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, return -EINTR; } - ret = validate_request(&wq, sbi, &qstr, dentry, notify); + ret = validate_request(&wq, sbi, &qstr, path, notify); if (ret <= 0) { if (ret != -EINTR) mutex_unlock(&sbi->wq_mutex); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index bf62ad919a95..00ee006a8aa2 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -162,6 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) slot = radix_tree_iter_retry(&iter); continue; } + slot = radix_tree_iter_resume(slot, &iter); spin_unlock(&fs_info->buffer_lock); free_extent_buffer_stale(eb); spin_lock(&fs_info->buffer_lock); @@ -31,6 +31,7 @@ #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> +#include <linux/mmu_notifier.h> #include <linux/iomap.h> #include "internal.h" @@ -240,6 +241,23 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, } } +static void dax_unlock_mapping_entry(struct address_space *mapping, + pgoff_t index) +{ + void *entry, **slot; + + spin_lock_irq(&mapping->tree_lock); + entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || + !slot_locked(mapping, slot))) { + spin_unlock_irq(&mapping->tree_lock); + return; + } + unlock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, entry, false); +} + static void put_locked_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry) { @@ -434,22 +452,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); } -void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - void *entry, **slot; - - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || - !slot_locked(mapping, slot))) { - spin_unlock_irq(&mapping->tree_lock); - return; - } - unlock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); - dax_wake_mapping_entry_waiter(mapping, index, entry, false); -} - /* * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree * entry to get unlocked before deleting it. @@ -501,10 +503,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry, /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) { - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + if (!page) return VM_FAULT_OOM; - } vmf->page = page; return VM_FAULT_LOCKED; } @@ -616,36 +616,107 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, return new_entry; } +static inline unsigned long +pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) +{ + unsigned long address; + + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); + return address; +} + +/* Walk all mappings of a given index of a file and writeprotect them */ +static void dax_mapping_entry_mkclean(struct address_space *mapping, + pgoff_t index, unsigned long pfn) +{ + struct vm_area_struct *vma; + pte_t *ptep; + pte_t pte; + spinlock_t *ptl; + bool changed; + + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { + unsigned long address; + + cond_resched(); + + if (!(vma->vm_flags & VM_SHARED)) + continue; + + address = pgoff_address(index, vma); + changed = false; + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + continue; + if (pfn != pte_pfn(*ptep)) + goto unlock; + if (!pte_dirty(*ptep) && !pte_write(*ptep)) + goto unlock; + + flush_cache_page(vma, address, pfn); + pte = ptep_clear_flush(vma, address, ptep); + pte = pte_wrprotect(pte); + pte = pte_mkclean(pte); + set_pte_at(vma->vm_mm, address, ptep, pte); + changed = true; +unlock: + pte_unmap_unlock(ptep, ptl); + + if (changed) + mmu_notifier_invalidate_page(vma->vm_mm, address); + } + i_mmap_unlock_read(mapping); +} + static int dax_writeback_one(struct block_device *bdev, struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; - struct radix_tree_node *node; struct blk_dax_ctl dax; - void **slot; + void *entry2, **slot; int ret = 0; - spin_lock_irq(&mapping->tree_lock); /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. + * A page got tagged dirty in DAX mapping? Something is seriously + * wrong. */ - if (!__radix_tree_lookup(page_tree, index, &node, &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - - /* another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) - goto unlock; + if (WARN_ON(!radix_tree_exceptional_entry(entry))) + return -EIO; + spin_lock_irq(&mapping->tree_lock); + entry2 = get_unlocked_mapping_entry(mapping, index, &slot); + /* Entry got punched out / reallocated? */ + if (!entry2 || !radix_tree_exceptional_entry(entry2)) + goto put_unlocked; + /* + * Entry got reallocated elsewhere? No need to writeback. We have to + * compare sectors as we must not bail out due to difference in lockbit + * or entry type. + */ + if (dax_radix_sector(entry2) != dax_radix_sector(entry)) + goto put_unlocked; if (WARN_ON_ONCE(dax_is_empty_entry(entry) || dax_is_zero_entry(entry))) { ret = -EIO; - goto unlock; + goto put_unlocked; } + /* Another fsync thread may have already written back this entry */ + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto put_unlocked; + /* Lock the entry to serialize with page faults */ + entry = lock_slot(mapping, slot); + /* + * We can clear the tag now but we have to be careful so that concurrent + * dax_writeback_one() calls for the same index cannot finish before we + * actually flush the caches. This is achieved as the calls will look + * at the entry only under tree_lock and once they do that they will + * see the entry locked and wait for it to unlock. + */ + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + spin_unlock_irq(&mapping->tree_lock); + /* * Even if dax_writeback_mapping_range() was given a wbc->range_start * in the middle of a PMD, the 'index' we are given will be aligned to @@ -655,31 +726,40 @@ static int dax_writeback_one(struct block_device *bdev, */ dax.sector = dax_radix_sector(entry); dax.size = PAGE_SIZE << dax_radix_order(entry); - spin_unlock_irq(&mapping->tree_lock); /* * We cannot hold tree_lock while calling dax_map_atomic() because it * eventually calls cond_resched(). */ ret = dax_map_atomic(bdev, &dax); - if (ret < 0) + if (ret < 0) { + put_locked_mapping_entry(mapping, index, entry); return ret; + } if (WARN_ON_ONCE(ret < dax.size)) { ret = -EIO; goto unmap; } + dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); wb_cache_pmem(dax.addr, dax.size); - + /* + * After we have flushed the cache, we can clear the dirty tag. There + * cannot be new dirty data in the pfn after the flush has completed as + * the pfn mappings are writeprotected and fault waits for mapping + * entry lock. + */ spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); unmap: dax_unmap_atomic(bdev, &dax); + put_locked_mapping_entry(mapping, index, entry); return ret; - unlock: + put_unlocked: + put_unlocked_mapping_entry(mapping, index, entry2); spin_unlock_irq(&mapping->tree_lock); return ret; } @@ -739,7 +819,7 @@ static int dax_insert_mapping(struct address_space *mapping, struct block_device *bdev, sector_t sector, size_t size, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; struct blk_dax_ctl dax = { .sector = sector, .size = size, @@ -768,17 +848,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; - void *entry; + void *entry, **slot; pgoff_t index = vmf->pgoff; spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || !radix_tree_exceptional_entry(entry)) - goto out; + entry = get_unlocked_mapping_entry(mapping, index, &slot); + if (!entry || !radix_tree_exceptional_entry(entry)) { + if (entry) + put_unlocked_mapping_entry(mapping, index, entry); + spin_unlock_irq(&mapping->tree_lock); + return VM_FAULT_NOPAGE; + } radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); - put_unlocked_mapping_entry(mapping, index, entry); -out: + entry = lock_slot(mapping, slot); spin_unlock_irq(&mapping->tree_lock); + /* + * If we race with somebody updating the PTE and finish_mkwrite_fault() + * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry + * the fault in either case. + */ + finish_mkwrite_fault(vmf); + put_locked_mapping_entry(mapping, index, entry); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -949,13 +1039,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; sector_t sector; struct iomap iomap = { 0 }; unsigned flags = IOMAP_FAULT; int error, major = 0; - int locked_status = 0; + int vmf_ret = 0; void *entry; /* @@ -1008,13 +1098,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (error) goto finish_iomap; - if (!radix_tree_exceptional_entry(entry)) { - vmf->page = entry; - locked_status = VM_FAULT_LOCKED; - } else { - vmf->entry = entry; - locked_status = VM_FAULT_DAX_LOCKED; - } + + __SetPageUptodate(vmf->cow_page); + vmf_ret = finish_fault(vmf); + if (!vmf_ret) + vmf_ret = VM_FAULT_DONE_COW; goto finish_iomap; } @@ -1031,7 +1119,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { - locked_status = dax_load_hole(mapping, entry, vmf); + vmf_ret = dax_load_hole(mapping, entry, vmf); break; } /*FALLTHRU*/ @@ -1043,7 +1131,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, finish_iomap: if (ops->iomap_end) { - if (error) { + if (error || (vmf_ret & VM_FAULT_ERROR)) { /* keep previous error */ ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, &iomap); @@ -1053,7 +1141,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } } unlock_entry: - if (!locked_status || error) + if (vmf_ret != VM_FAULT_LOCKED || error) put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: if (error == -ENOMEM) @@ -1061,9 +1149,9 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error < 0 && error != -EBUSY) return VM_FAULT_SIGBUS | major; - if (locked_status) { + if (vmf_ret) { WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ - return locked_status; + return vmf_ret; } return VM_FAULT_NOPAGE | major; } diff --git a/fs/dcache.c b/fs/dcache.c index 5c7cc953ac81..29bc9109537a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1273,38 +1273,40 @@ rename_retry: goto again; } -/* - * Search for at least 1 mount point in the dentry's subdirs. - * We descend to the next level whenever the d_subdirs - * list is non-empty and continue searching. - */ +struct check_mount { + struct vfsmount *mnt; + unsigned int mounted; +}; -static enum d_walk_ret check_mount(void *data, struct dentry *dentry) +static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { - int *ret = data; - if (d_mountpoint(dentry)) { - *ret = 1; + struct check_mount *info = data; + struct path path = { .mnt = info->mnt, .dentry = dentry }; + + if (path_is_mountpoint(&path)) { + info->mounted = 1; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** - * have_submounts - check for mounts over a dentry - * @parent: dentry to check. + * path_has_submounts - check for mounts over a dentry in the + * current namespace. + * @parent: path to check. * * Return true if the parent or its subdirectories contain - * a mount point + * a mount point in the current namespace. */ -int have_submounts(struct dentry *parent) +int path_has_submounts(const struct path *parent) { - int ret = 0; + struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; - d_walk(parent, &ret, check_mount, NULL); + d_walk(parent->dentry, &data, path_check_mount, NULL); - return ret; + return data.mounted; } -EXPORT_SYMBOL(have_submounts); +EXPORT_SYMBOL(path_has_submounts); /* * Called by mount code to set a mountpoint and check if the mountpoint is diff --git a/fs/exec.c b/fs/exec.c index 88b5e1efdbd6..8112eacf10f3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * doing the exec and bprm->mm is the new process's mm. */ ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, - &page, NULL); + &page, NULL, NULL); if (ret <= 0) return NULL; diff --git a/fs/namei.c b/fs/namei.c index 31d04d993a2d..8e9a3589230f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1200,7 +1200,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage(path->dentry, false); + ret = path->dentry->d_op->d_manage(path, false); if (ret < 0) break; } @@ -1263,10 +1263,10 @@ int follow_down_one(struct path *path) } EXPORT_SYMBOL(follow_down_one); -static inline int managed_dentry_rcu(struct dentry *dentry) +static inline int managed_dentry_rcu(const struct path *path) { - return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? - dentry->d_op->d_manage(dentry, true) : 0; + return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? + path->dentry->d_op->d_manage(path, true) : 0; } /* @@ -1282,7 +1282,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ - switch (managed_dentry_rcu(path->dentry)) { + switch (managed_dentry_rcu(path)) { case -ECHILD: default: return false; @@ -1392,8 +1392,7 @@ int follow_down(struct path *path) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage( - path->dentry, false); + ret = path->dentry->d_op->d_manage(path, false); if (ret < 0) return ret == -EISDIR ? 0 : ret; } diff --git a/fs/namespace.c b/fs/namespace.c index ff1cd146dd4c..77afe78ef0b2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1160,6 +1160,36 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); +/* __path_is_mountpoint() - Check if path is a mount in the current + * namespace. + * + * d_mountpoint() can only be used reliably to establish if a dentry is + * not mounted in any namespace and that common case is handled inline. + * d_mountpoint() isn't aware of the possibility there may be multiple + * mounts using a given dentry in a different namespace. This function + * checks if the passed in path is a mountpoint rather than the dentry + * alone. + */ +bool __path_is_mountpoint(const struct path *path) +{ + struct mount *mount; + struct vfsmount *mnt; + unsigned seq; + + rcu_read_lock(); + do { + seq = read_seqbegin(&mount_lock); + mount = __lookup_mnt(path->mnt, path->dentry); + mnt = mount ? &mount->mnt : NULL; + } while (mnt && + !(mnt->mnt_flags & MNT_SYNC_UMOUNT) && + read_seqretry(&mount_lock, seq)); + rcu_read_unlock(); + + return mnt != NULL; +} +EXPORT_SYMBOL(__path_is_mountpoint); + struct vfsmount *mnt_clone_internal(struct path *path) { struct mount *p; @@ -1281,6 +1311,33 @@ const struct seq_operations mounts_op = { }; #endif /* CONFIG_PROC_FS */ +struct mnt_tree_refs { + struct mount *root; + unsigned int refs; + unsigned int min_refs; +}; + +static void mnt_get_tree_refs(struct mnt_tree_refs *mtr) +{ + struct mount *mnt = mtr->root; + struct mount *p; + + /* + * Each propagated tree contribues 2 * #mounts - 1 to + * the minimal reference count. But when a mount is + * umounted and connected the mount doesn't hold a + * reference to its parent so it contributes a single + * reference. + */ + for (p = mnt; p; p = next_mnt(p, mnt)) { + mtr->refs += mnt_get_count(p); + if (p == mnt || p->mnt.mnt_flags & MNT_UMOUNT) + mtr->min_refs++; + else + mtr->min_refs += 2; + } +} + /** * may_umount_tree - check if a mount tree is busy * @mnt: root of mount tree @@ -1292,25 +1349,51 @@ const struct seq_operations mounts_op = { int may_umount_tree(struct vfsmount *m) { struct mount *mnt = real_mount(m); - int actual_refs = 0; - int minimum_refs = 0; - struct mount *p; + struct mount *parent = mnt->mnt_parent; + struct mnt_tree_refs mtr; + struct mount *p, *child; + BUG_ON(!m); - /* write lock needed for mnt_get_count */ + down_read(&namespace_sem); lock_mount_hash(); - for (p = mnt; p; p = next_mnt(p, mnt)) { - actual_refs += mnt_get_count(p); - minimum_refs += 2; + + mtr.root = mnt; + mtr.refs = 0; + mtr.min_refs = 0; + + mnt_get_tree_refs(&mtr); + /* + * Caller holds a mount reference so minimum references + * to the tree at mnt is one greater than the minimum + * references. + */ + mtr.min_refs++; + + /* The pnode.c propagation_next() function (as used below) + * returns each mount propogated from a given mount. Using + * the parent of mnt and matching the mnt->mnt_mountpoint + * gets the list of mounts propogated from mnt. To work + * out if the tree is in use (eg. open file or pwd) the + * reference counts of each of these mounts needs to be + * checked as well as mnt itself. + */ + for (p = propagation_next(parent, parent); p; + p = propagation_next(p, parent)) { + child = __lookup_mnt_last(&p->mnt, mnt->mnt_mountpoint); + if (child) { + mtr.root = child; + mnt_get_tree_refs(&mtr); + } } unlock_mount_hash(); + up_read(&namespace_sem); - if (actual_refs > minimum_refs) + if (mtr.refs > mtr.min_refs) return 0; return 1; } - EXPORT_SYMBOL(may_umount_tree); /** diff --git a/fs/pnode.c b/fs/pnode.c index 234a9ac49958..ae6f95e149b2 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -143,8 +143,7 @@ void change_mnt_propagation(struct mount *mnt, int type) * vfsmount found while iterating with propagation_next() is * a peer of one we'd found earlier. */ -static struct mount *propagation_next(struct mount *m, - struct mount *origin) +struct mount *propagation_next(struct mount *m, struct mount *origin) { /* are there any slaves of this mount? */ if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) diff --git a/fs/pnode.h b/fs/pnode.h index 550f5a8b4fcf..2accd2d0f527 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -38,6 +38,7 @@ static inline void set_mnt_shared(struct mount *mnt) mnt->mnt.mnt_flags |= MNT_SHARED; } +struct mount *propagation_next(struct mount *, struct mount *); void change_mnt_propagation(struct mount *, int); int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct hlist_head *); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 85959d8324df..d96e2f30084b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -257,9 +257,9 @@ out: * fatal_signal_pending()s, and the mmap_sem must be released before * returning it. */ -int handle_userfault(struct fault_env *fe, unsigned long reason) +int handle_userfault(struct vm_fault *vmf, unsigned long reason) { - struct mm_struct *mm = fe->vma->vm_mm; + struct mm_struct *mm = vmf->vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; int ret; @@ -268,7 +268,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); ret = VM_FAULT_SIGBUS; - ctx = fe->vma->vm_userfaultfd_ctx.ctx; + ctx = vmf->vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -301,17 +301,18 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * without first stopping userland access to the memory. For * VM_UFFD_MISSING userfaults this is enough for now. */ - if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { + if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { /* * Validate the invariant that nowait must allow retry * to be sure not to return SIGBUS erroneously on * nowait invocations. */ - BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); + BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { printk(KERN_WARNING - "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); + "FAULT_FLAG_ALLOW_RETRY missing %x\n", + vmf->flags); dump_stack(); } #endif @@ -323,7 +324,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * and wait. */ ret = VM_FAULT_RETRY; - if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; /* take the reference before dropping the mmap_sem */ @@ -331,11 +332,11 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(fe->address, fe->flags, reason); + uwq.msg = userfault_msg(vmf->address, vmf->flags, reason); uwq.ctx = ctx; return_to_userland = - (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == + (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); spin_lock(&ctx->fault_pending_wqh.lock); @@ -353,7 +354,8 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) TASK_KILLABLE); spin_unlock(&ctx->fault_pending_wqh.lock); - must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); + must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, + reason); up_read(&mm->mmap_sem); if (likely(must_wait && !ACCESS_ONCE(ctx->released) && diff --git a/include/linux/dax.h b/include/linux/dax.h index 0afade8bd3d7..f97bcfe79472 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -46,7 +46,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); -void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index); int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length); #else @@ -55,12 +54,6 @@ static inline struct page *read_dax_sector(struct block_device *bdev, { return ERR_PTR(-ENXIO); } -/* Shouldn't ever be called when dax is disabled. */ -static inline void dax_unlock_mapping_entry(struct address_space *mapping, - pgoff_t index) -{ - BUG(); -} static inline int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length) { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 5beed7b30561..c965e4469499 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -139,7 +139,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); - int (*d_manage)(struct dentry *, bool); + int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, unsigned int); } ____cacheline_aligned; @@ -254,7 +254,7 @@ extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); /* test whether we have any submounts in a subdir tree */ -extern int have_submounts(struct dentry *); +extern int path_has_submounts(const struct path *); /* * This adds the entry to the hash queues. diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1f782aa1d8e6..97e478d6b690 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -1,12 +1,12 @@ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H -extern int do_huge_pmd_anonymous_page(struct fault_env *fe); +extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf); extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); -extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd); -extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd); +extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); +extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, @@ -142,7 +142,7 @@ static inline int hpage_nr_pages(struct page *page) return 1; } -extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd); +extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *huge_zero_page; @@ -212,7 +212,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, return NULL; } -static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd) +static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd) { return 0; } diff --git a/include/linux/idr.h b/include/linux/idr.h index 083d61e92706..00357991af8f 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -12,48 +12,48 @@ #ifndef __IDR_H__ #define __IDR_H__ -#include <linux/types.h> -#include <linux/bitops.h> -#include <linux/init.h> -#include <linux/rcupdate.h> - -/* - * We want shallower trees and thus more bits covered at each layer. 8 - * bits gives us large enough first layer for most use cases and maximum - * tree depth of 4. Each idr_layer is slightly larger than 2k on 64bit and - * 1k on 32bit. - */ -#define IDR_BITS 8 -#define IDR_SIZE (1 << IDR_BITS) -#define IDR_MASK ((1 << IDR_BITS)-1) - -struct idr_layer { - int prefix; /* the ID prefix of this idr_layer */ - int layer; /* distance from leaf */ - struct idr_layer __rcu *ary[1<<IDR_BITS]; - int count; /* When zero, we can release it */ - union { - /* A zero bit means "space here" */ - DECLARE_BITMAP(bitmap, IDR_SIZE); - struct rcu_head rcu_head; - }; -}; +#include <linux/radix-tree.h> +#include <linux/gfp.h> struct idr { - struct idr_layer __rcu *hint; /* the last layer allocated from */ - struct idr_layer __rcu *top; - int layers; /* only valid w/o concurrent changes */ - int cur; /* current pos for cyclic allocation */ - spinlock_t lock; - int id_free_cnt; - struct idr_layer *id_free; + struct radix_tree_root idr_rt; + unsigned int idr_next; }; -#define IDR_INIT(name) \ +/* Set the IDR flag and the IDR_FREE tag */ +#define IDR_RT_MARKER ((__force gfp_t)(3 << __GFP_BITS_SHIFT)) + +#define IDR_INIT \ { \ - .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ + .idr_rt = RADIX_TREE_INIT(IDR_RT_MARKER) \ +} +#define DEFINE_IDR(name) struct idr name = IDR_INIT + +/** + * idr_get_cursor - Return the current position of the cyclic allocator + * @idr: idr handle + * + * The value returned is the value that will be next returned from + * idr_alloc_cyclic() if it is free (otherwise the search will start from + * this position). + */ +static inline unsigned int idr_get_cursor(struct idr *idr) +{ + return READ_ONCE(idr->idr_next); +} + +/** + * idr_set_cursor - Set the current position of the cyclic allocator + * @idr: idr handle + * @val: new position + * + * The next call to idr_alloc_cyclic() will return @val if it is free + * (otherwise the search will start from this position). + */ +static inline void idr_set_cursor(struct idr *idr, unsigned int val) +{ + WRITE_ONCE(idr->idr_next, val); } -#define DEFINE_IDR(name) struct idr name = IDR_INIT(name) /** * DOC: idr sync @@ -72,22 +72,30 @@ struct idr { * period). */ -/* - * This is what we export. - */ - -void *idr_find_slowpath(struct idr *idp, int id); void idr_preload(gfp_t gfp_mask); -int idr_alloc(struct idr *idp, void *ptr, int start, int end, gfp_t gfp_mask); -int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask); -int idr_for_each(struct idr *idp, +int idr_alloc(struct idr *, void *, int start, int end, gfp_t gfp_mask); +int idr_alloc_cyclic(struct idr *, void *, int start, int end, gfp_t gfp_mask); +int idr_for_each(struct idr *, int (*fn)(int id, void *p, void *data), void *data); -void *idr_get_next(struct idr *idp, int *nextid); -void *idr_replace(struct idr *idp, void *ptr, int id); -void idr_remove(struct idr *idp, int id); -void idr_destroy(struct idr *idp); -void idr_init(struct idr *idp); -bool idr_is_empty(struct idr *idp); +void *idr_get_next(struct idr *, int *nextid); +void *idr_replace(struct idr *, void *, int id); +void idr_destroy(struct idr *); + +static inline void idr_remove(struct idr *idp, int id) +{ + radix_tree_delete(&idp->idr_rt, id); +} + +static inline void idr_init(struct idr *idp) +{ + memset(idp, 0, sizeof(*idp)); + idp->idr_rt.gfp_mask = IDR_RT_MARKER; +} + +static inline bool idr_is_empty(struct idr *idp) +{ + return radix_tree_empty(&idp->idr_rt); +} /** * idr_preload_end - end preload section started with idr_preload() @@ -114,17 +122,12 @@ static inline void idr_preload_end(void) */ static inline void *idr_find(struct idr *idr, int id) { - struct idr_layer *hint = rcu_dereference_raw(idr->hint); - - if (hint && (id & ~IDR_MASK) == hint->prefix) - return rcu_dereference_raw(hint->ary[id & IDR_MASK]); - - return idr_find_slowpath(idr, id); + return radix_tree_lookup(&idr->idr_rt, id); } /** * idr_for_each_entry - iterate over an idr's elements of a given type - * @idp: idr handle + * @idr: idr handle * @entry: the type * to use as cursor * @id: id entry's key * @@ -132,57 +135,58 @@ static inline void *idr_find(struct idr *idr, int id) * after normal terminatinon @entry is left with the value NULL. This * is convenient for a "not found" value. */ -#define idr_for_each_entry(idp, entry, id) \ - for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id) +#define idr_for_each_entry(idr, entry, id) \ + for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; ++id) /** - * idr_for_each_entry - continue iteration over an idr's elements of a given type - * @idp: idr handle + * idr_for_each_entry_continue - continue iteration over an idr's elements of a given type + * @idr: idr handle * @entry: the type * to use as cursor * @id: id entry's key * * Continue to iterate over list of given type, continuing after * the current position. */ -#define idr_for_each_entry_continue(idp, entry, id) \ - for ((entry) = idr_get_next((idp), &(id)); \ +#define idr_for_each_entry_continue(idr, entry, id) \ + for ((entry) = idr_get_next((idr), &(id)); \ entry; \ - ++id, (entry) = idr_get_next((idp), &(id))) + ++id, (entry) = idr_get_next((idr), &(id))) /* * IDA - IDR based id allocator, use when translation from id to * pointer isn't necessary. - * - * IDA_BITMAP_LONGS is calculated to be one less to accommodate - * ida_bitmap->nr_busy so that the whole struct fits in 128 bytes. */ #define IDA_CHUNK_SIZE 128 /* 128 bytes per chunk */ -#define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long) - 1) +#define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long)) #define IDA_BITMAP_BITS (IDA_BITMAP_LONGS * sizeof(long) * 8) struct ida_bitmap { - long nr_busy; unsigned long bitmap[IDA_BITMAP_LONGS]; }; struct ida { - struct idr idr; + struct radix_tree_root ida_rt; struct ida_bitmap *free_bitmap; }; -#define IDA_INIT(name) { .idr = IDR_INIT((name).idr), .free_bitmap = NULL, } -#define DEFINE_IDA(name) struct ida name = IDA_INIT(name) +#define IDA_INIT { .ida_rt = RADIX_TREE_INIT(IDR_RT_MARKER), } +#define DEFINE_IDA(name) struct ida name = IDA_INIT int ida_pre_get(struct ida *ida, gfp_t gfp_mask); int ida_get_new_above(struct ida *ida, int starting_id, int *p_id); void ida_remove(struct ida *ida, int id); void ida_destroy(struct ida *ida); -void ida_init(struct ida *ida); int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end, gfp_t gfp_mask); void ida_simple_remove(struct ida *ida, unsigned int id); +static inline void ida_init(struct ida *ida) +{ + memset(ida, 0, sizeof(*ida)); + ida->ida_rt.gfp_mask = IDR_RT_MARKER; +} + /** * ida_get_new - allocate new ID * @ida: idr handle @@ -195,6 +199,8 @@ static inline int ida_get_new(struct ida *ida, int *p_id) return ida_get_new_above(ida, 0, p_id); } -void __init idr_init_cache(void); - +static inline bool ida_is_empty(struct ida *ida) +{ + return radix_tree_empty(&ida->ida_rt); +} #endif /* __IDR_H__ */ diff --git a/include/linux/ima.h b/include/linux/ima.h index 0eb7c2e7f0d6..7f6952f8d6aa 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -11,6 +11,7 @@ #define _LINUX_IMA_H #include <linux/fs.h> +#include <linux/kexec.h> struct linux_binprm; #ifdef CONFIG_IMA @@ -23,6 +24,10 @@ extern int ima_post_read_file(struct file *file, void *buf, loff_t size, enum kernel_read_file_id id); extern void ima_post_path_mknod(struct dentry *dentry); +#ifdef CONFIG_IMA_KEXEC +extern void ima_add_kexec_buffer(struct kimage *image); +#endif + #else static inline int ima_bprm_check(struct linux_binprm *bprm) { @@ -62,6 +67,13 @@ static inline void ima_post_path_mknod(struct dentry *dentry) #endif /* CONFIG_IMA */ +#ifndef CONFIG_IMA_KEXEC +struct kimage; + +static inline void ima_add_kexec_buffer(struct kimage *image) +{} +#endif + #ifdef CONFIG_IMA_APPRAISE extern void ima_inode_post_setattr(struct dentry *dentry); extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name, diff --git a/include/linux/mm.h b/include/linux/mm.h index 0b5b2e4df14e..4424784ac374 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -292,36 +292,23 @@ extern pgprot_t protection_map[16]; * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { + struct vm_area_struct *vma; /* Target VMA */ unsigned int flags; /* FAULT_FLAG_xxx flags */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ - void __user *virtual_address; /* Faulting virtual address */ + unsigned long address; /* Faulting virtual address */ + pmd_t *pmd; /* Pointer to pmd entry matching + * the 'address' */ + pte_t orig_pte; /* Value of PTE at the time of fault */ - struct page *cow_page; /* Handler may choose to COW */ + struct page *cow_page; /* Page handler may use for COW fault */ + struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by * VM_FAULT_ERROR). */ - void *entry; /* ->fault handler can alternatively - * return locked DAX entry. In that - * case handler should return - * VM_FAULT_DAX_LOCKED and fill in - * entry here. - */ -}; - -/* - * Page fault context: passes though page fault handler instead of endless list - * of function arguments. - */ -struct fault_env { - struct vm_area_struct *vma; /* Target VMA */ - unsigned long address; /* Faulting virtual address */ - unsigned int flags; /* FAULT_FLAG_xxx flags */ - pmd_t *pmd; /* Pointer to pmd entry matching - * the 'address' - */ + /* These three entries are valid only while holding ptl lock */ pte_t *pte; /* Pointer to pte entry matching * the 'address'. NULL if the page * table hasn't been allocated. @@ -351,7 +338,7 @@ struct vm_operations_struct { int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); int (*pmd_fault)(struct vm_area_struct *, unsigned long address, pmd_t *, unsigned int flags); - void (*map_pages)(struct fault_env *fe, + void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); /* notification that a previously read-only page is about to become @@ -625,8 +612,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, +int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page); +int finish_fault(struct vm_fault *vmf); +int finish_mkwrite_fault(struct vm_fault *vmf); #endif /* @@ -1110,7 +1099,7 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ -#define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */ +#define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ @@ -1221,6 +1210,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); +int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, + spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, @@ -1276,15 +1267,12 @@ extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); + struct vm_area_struct **vmas, int *locked); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); -long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); int get_user_pages_fast(unsigned long start, int nr_pages, int write, @@ -2099,7 +2087,7 @@ extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); -extern void filemap_map_pages(struct fault_env *fe, +extern void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); diff --git a/include/linux/mount.h b/include/linux/mount.h index 1172cce949a4..42dc62b60202 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -15,6 +15,8 @@ #include <linux/spinlock.h> #include <linux/seqlock.h> #include <linux/atomic.h> +#include <linux/path.h> +#include <linux/dcache.h> struct super_block; struct vfsmount; @@ -98,4 +100,13 @@ extern dev_t name_to_dev_t(const char *name); extern unsigned int sysctl_mount_max; +extern bool __path_is_mountpoint(const struct path *path); +static inline bool path_is_mountpoint(const struct path *path) +{ + if (!d_mountpoint(path->dentry)) + return 0; + + return __path_is_mountpoint(path); +} + #endif /* _LINUX_MOUNT_H */ diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a78c35cff1ae..aacca824a6ae 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -7,6 +7,23 @@ #include <linux/sched.h> #include <asm/irq.h> +/* + * The run state of the lockup detectors is controlled by the content of the + * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - + * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. + * + * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' + * are variables that are only used as an 'interface' between the parameters + * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The + * 'watchdog_thresh' variable is handled differently because its value is not + * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' + * is equal zero. + */ +#define NMI_WATCHDOG_ENABLED_BIT 0 +#define SOFT_WATCHDOG_ENABLED_BIT 1 +#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) +#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) + /** * touch_nmi_watchdog - restart NMI watchdog timeout. * @@ -91,9 +108,16 @@ extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; +extern unsigned long watchdog_enabled; extern unsigned long *watchdog_cpumask_bits; +#ifdef CONFIG_SMP extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace; +#else +#define sysctl_softlockup_all_cpu_backtrace 0 +#define sysctl_hardlockup_all_cpu_backtrace 0 +#endif +extern bool is_hardlockup(void); struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 744486057e9e..d0690691d9bf 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -80,28 +80,33 @@ static inline bool radix_tree_is_internal_node(void *ptr) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) +/* + * @count is the count of every non-NULL element in the ->slots array + * whether that is an exceptional entry, a retry entry, a user pointer, + * a sibling entry or a pointer to the next level of the tree. + * @exceptional is the count of every element in ->slots which is + * either radix_tree_exceptional_entry() or is a sibling entry for an + * exceptional entry. + */ struct radix_tree_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned char count; /* Total entry count */ unsigned char exceptional; /* Exceptional entry count */ + struct radix_tree_node *parent; /* Used when ascending tree */ + void *private_data; /* For tree user */ union { - struct { - /* Used when ascending tree */ - struct radix_tree_node *parent; - /* For tree user */ - void *private_data; - }; - /* Used when freeing node */ - struct rcu_head rcu_head; + struct list_head private_list; /* For tree user */ + struct rcu_head rcu_head; /* Used when freeing node */ }; - /* For tree user */ - struct list_head private_list; void __rcu *slots[RADIX_TREE_MAP_SIZE]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; -/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ +/* The top bits of gfp_mask are used to store the root tags and the IDR flag */ +#define ROOT_IS_IDR (1 << __GFP_BITS_SHIFT) +#define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT + 1) + struct radix_tree_root { gfp_t gfp_mask; struct radix_tree_node __rcu *rnode; @@ -127,6 +132,41 @@ static inline bool radix_tree_empty(struct radix_tree_root *root) } /** + * struct radix_tree_iter - radix tree iterator state + * + * @index: index of current slot + * @next_index: one beyond the last index for this chunk + * @tags: bit-mask for tag-iterating + * @node: node that contains current slot + * @shift: shift for the node that holds our slots + * + * This radix tree iterator works in terms of "chunks" of slots. A chunk is a + * subinterval of slots contained within one radix tree leaf node. It is + * described by a pointer to its first slot and a struct radix_tree_iter + * which holds the chunk's position in the tree and its size. For tagged + * iteration radix_tree_iter also holds the slots' bit-mask for one chosen + * radix tree tag. + */ +struct radix_tree_iter { + unsigned long index; + unsigned long next_index; + unsigned long tags; + struct radix_tree_node *node; +#ifdef CONFIG_RADIX_TREE_MULTIORDER + unsigned int shift; +#endif +}; + +static inline unsigned int iter_shift(const struct radix_tree_iter *iter) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + return iter->shift; +#else + return 0; +#endif +} + +/** * Radix-tree synchronization * * The radix-tree API requires that users provide all synchronisation (with @@ -264,6 +304,8 @@ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void **slot, void *item, radix_tree_update_node_t update_node, void *private); +void radix_tree_iter_replace(struct radix_tree_root *, + const struct radix_tree_iter *, void **slot, void *item); void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item); void __radix_tree_delete_node(struct radix_tree_root *root, @@ -289,6 +331,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, unsigned long index, unsigned int tag); int radix_tree_tag_get(struct radix_tree_root *root, unsigned long index, unsigned int tag); +void radix_tree_iter_tag_set(struct radix_tree_root *root, + const struct radix_tree_iter *iter, unsigned int tag); unsigned int radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items, @@ -297,50 +341,18 @@ unsigned int radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, unsigned long first_index, unsigned int max_items, unsigned int tag); -unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, - unsigned long *first_indexp, unsigned long last_index, - unsigned long nr_to_tag, - unsigned int fromtag, unsigned int totag); int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); -unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item); static inline void radix_tree_preload_end(void) { preempt_enable(); } -/** - * struct radix_tree_iter - radix tree iterator state - * - * @index: index of current slot - * @next_index: one beyond the last index for this chunk - * @tags: bit-mask for tag-iterating - * @shift: shift for the node that holds our slots - * - * This radix tree iterator works in terms of "chunks" of slots. A chunk is a - * subinterval of slots contained within one radix tree leaf node. It is - * described by a pointer to its first slot and a struct radix_tree_iter - * which holds the chunk's position in the tree and its size. For tagged - * iteration radix_tree_iter also holds the slots' bit-mask for one chosen - * radix tree tag. - */ -struct radix_tree_iter { - unsigned long index; - unsigned long next_index; - unsigned long tags; -#ifdef CONFIG_RADIX_TREE_MULTIORDER - unsigned int shift; -#endif -}; - -static inline unsigned int iter_shift(struct radix_tree_iter *iter) -{ -#ifdef CONFIG_RADIX_TREE_MULTIORDER - return iter->shift; -#else - return 0; -#endif -} +int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t); +int radix_tree_split(struct radix_tree_root *, unsigned long index, + unsigned new_order); +int radix_tree_join(struct radix_tree_root *, unsigned long index, + unsigned new_order, void *); #define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ #define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ @@ -409,20 +421,17 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) } /** - * radix_tree_iter_next - resume iterating when the chunk may be invalid - * @iter: iterator state + * radix_tree_iter_resume - resume iterating when the chunk may be invalid + * @slot: pointer to current slot + * @iter: iterator state + * Returns: New slot pointer * * If the iterator needs to release then reacquire a lock, the chunk may * have been invalidated by an insertion or deletion. Call this function - * to continue the iteration from the next index. + * before releasing the lock to continue the iteration from the next index. */ -static inline __must_check -void **radix_tree_iter_next(struct radix_tree_iter *iter) -{ - iter->next_index = __radix_tree_iter_add(iter, 1); - iter->tags = 0; - return NULL; -} +void **__must_check radix_tree_iter_resume(void **slot, + struct radix_tree_iter *iter); /** * radix_tree_chunk_size - get current chunk size @@ -436,10 +445,17 @@ radix_tree_chunk_size(struct radix_tree_iter *iter) return (iter->next_index - iter->index) >> iter_shift(iter); } -static inline struct radix_tree_node *entry_to_node(void *ptr) +#ifdef CONFIG_RADIX_TREE_MULTIORDER +void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, + unsigned flags); +#else +/* Can't happen without sibling entries, but the compiler can't tell that */ +static inline void ** __radix_tree_next_slot(void **slot, + struct radix_tree_iter *iter, unsigned flags) { - return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); + return slot; } +#endif /** * radix_tree_next_slot - find next slot in chunk @@ -453,7 +469,7 @@ static inline struct radix_tree_node *entry_to_node(void *ptr) * For tagged lookup it also eats @iter->tags. * * There are several cases where 'slot' can be passed in as NULL to this - * function. These cases result from the use of radix_tree_iter_next() or + * function. These cases result from the use of radix_tree_iter_resume() or * radix_tree_iter_retry(). In these cases we don't end up dereferencing * 'slot' because either: * a) we are doing tagged iteration and iter->tags has been set to 0, or @@ -464,51 +480,31 @@ static __always_inline void ** radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) { if (flags & RADIX_TREE_ITER_TAGGED) { - void *canon = slot; - iter->tags >>= 1; if (unlikely(!iter->tags)) return NULL; - while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && - radix_tree_is_internal_node(slot[1])) { - if (entry_to_node(slot[1]) == canon) { - iter->tags >>= 1; - iter->index = __radix_tree_iter_add(iter, 1); - slot++; - continue; - } - iter->next_index = __radix_tree_iter_add(iter, 1); - return NULL; - } if (likely(iter->tags & 1ul)) { iter->index = __radix_tree_iter_add(iter, 1); - return slot + 1; + slot++; + goto found; } if (!(flags & RADIX_TREE_ITER_CONTIG)) { unsigned offset = __ffs(iter->tags); - iter->tags >>= offset; - iter->index = __radix_tree_iter_add(iter, offset + 1); - return slot + offset + 1; + iter->tags >>= offset++; + iter->index = __radix_tree_iter_add(iter, offset); + slot += offset; + goto found; } } else { long count = radix_tree_chunk_size(iter); - void *canon = slot; while (--count > 0) { slot++; iter->index = __radix_tree_iter_add(iter, 1); - if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && - radix_tree_is_internal_node(*slot)) { - if (entry_to_node(*slot) == canon) - continue; - iter->next_index = iter->index; - break; - } - if (likely(*slot)) - return slot; + goto found; if (flags & RADIX_TREE_ITER_CONTIG) { /* forbid switching to the next chunk */ iter->next_index = 0; @@ -517,6 +513,11 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) } } return NULL; + + found: + if (unlikely(radix_tree_is_internal_node(*slot))) + return __radix_tree_next_slot(slot, iter, flags); + return slot; } /** @@ -567,6 +568,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) slot || (slot = radix_tree_next_chunk(root, iter, \ RADIX_TREE_ITER_TAGGED | tag)) ; \ slot = radix_tree_next_slot(slot, iter, \ - RADIX_TREE_ITER_TAGGED)) + RADIX_TREE_ITER_TAGGED | tag)) #endif /* _LINUX_RADIX_TREE_H */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index dd66a952e8cd..11b92b047a1e 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -27,7 +27,7 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) -extern int handle_userfault(struct fault_env *fe, unsigned long reason); +extern int handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len); @@ -55,7 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) #else /* CONFIG_USERFAULTFD */ /* mm helpers */ -static inline int handle_userfault(struct fault_env *fe, unsigned long reason) +static inline int handle_userfault(struct vm_fault *vmf, unsigned long reason) { return VM_FAULT_SIGBUS; } diff --git a/init/main.c b/init/main.c index 74cf81e2b9d0..8161208d4ece 100644 --- a/init/main.c +++ b/init/main.c @@ -551,7 +551,7 @@ asmlinkage __visible void __init start_kernel(void) if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); - idr_init_cache(); + radix_tree_init(); /* * Allow workqueue creation and work item queueing/cancelling @@ -566,7 +566,6 @@ asmlinkage __visible void __init start_kernel(void) trace_init(); context_tracking_init(); - radix_tree_init(); /* init some links before init_ISA_irqs() */ early_irq_init(); init_IRQ(); diff --git a/ipc/sem.c b/ipc/sem.c index e84703d5024e..31eaa87ecba4 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -11,6 +11,7 @@ * (c) 2001 Red Hat Inc * Lockless wakeup * (c) 2003 Manfred Spraul <manfred@colorfullife.com> + * (c) 2016 Davidlohr Bueso <dave@stgolabs.net> * Further wakeup optimizations, documentation * (c) 2010 Manfred Spraul <manfred@colorfullife.com> * @@ -53,15 +54,11 @@ * Semaphores are actively given to waiting tasks (necessary for FIFO). * (see update_queue()) * - To improve the scalability, the actual wake-up calls are performed after - * dropping all locks. (see wake_up_sem_queue_prepare(), - * wake_up_sem_queue_do()) + * dropping all locks. (see wake_up_sem_queue_prepare()) * - All work is done by the waker, the woken up task does not have to do * anything - not even acquiring a lock or dropping a refcount. * - A woken up task may not even touch the semaphore array anymore, it may * have been destroyed already by a semctl(RMID). - * - The synchronizations between wake-ups due to a timeout/signal and a - * wake-up due to a completed semaphore operation is achieved by using an - * intermediate state (IN_WAKEUP). * - UNDO values are stored in an array (one per process and per * semaphore array, lazily allocated). For backwards compatibility, multiple * modes for the UNDO variables are supported (per process, per thread) @@ -118,7 +115,8 @@ struct sem_queue { struct sembuf *sops; /* array of pending operations */ struct sembuf *blocking; /* the operation that blocked */ int nsops; /* number of operations */ - int alter; /* does *sops alter the array? */ + bool alter; /* does *sops alter the array? */ + bool dupsop; /* sops on more than one sem_num */ }; /* Each task has a list of undo requests. They are executed automatically @@ -433,29 +431,6 @@ static inline void sem_unlock(struct sem_array *sma, int locknum) * * The caller holds the RCU read lock. */ -static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns, - int id, struct sembuf *sops, int nsops, int *locknum) -{ - struct kern_ipc_perm *ipcp; - struct sem_array *sma; - - ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); - if (IS_ERR(ipcp)) - return ERR_CAST(ipcp); - - sma = container_of(ipcp, struct sem_array, sem_perm); - *locknum = sem_lock(sma, sops, nsops); - - /* ipc_rmid() may have already freed the ID while sem_lock - * was spinning: verify that the structure is still valid - */ - if (ipc_valid_object(ipcp)) - return container_of(ipcp, struct sem_array, sem_perm); - - sem_unlock(sma, *locknum); - return ERR_PTR(-EINVAL); -} - static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); @@ -488,40 +463,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) ipc_rmid(&sem_ids(ns), &s->sem_perm); } -/* - * Lockless wakeup algorithm: - * Without the check/retry algorithm a lockless wakeup is possible: - * - queue.status is initialized to -EINTR before blocking. - * - wakeup is performed by - * * unlinking the queue entry from the pending list - * * setting queue.status to IN_WAKEUP - * This is the notification for the blocked thread that a - * result value is imminent. - * * call wake_up_process - * * set queue.status to the final value. - * - the previously blocked thread checks queue.status: - * * if it's IN_WAKEUP, then it must wait until the value changes - * * if it's not -EINTR, then the operation was completed by - * update_queue. semtimedop can return queue.status without - * performing any operation on the sem array. - * * otherwise it must acquire the spinlock and check what's up. - * - * The two-stage algorithm is necessary to protect against the following - * races: - * - if queue.status is set after wake_up_process, then the woken up idle - * thread could race forward and try (and fail) to acquire sma->lock - * before update_queue had a chance to set queue.status - * - if queue.status is written before wake_up_process and if the - * blocked process is woken up by a signal between writing - * queue.status and the wake_up_process, then the woken up - * process could return from semtimedop and die by calling - * sys_exit before wake_up_process is called. Then wake_up_process - * will oops, because the task structure is already invalid. - * (yes, this happened on s390 with sysv msg). - * - */ -#define IN_WAKEUP 1 - /** * newary - Create a new semaphore set * @ns: namespace @@ -641,15 +582,23 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg) } /** - * perform_atomic_semop - Perform (if possible) a semaphore operation + * perform_atomic_semop[_slow] - Attempt to perform semaphore + * operations on a given array. * @sma: semaphore array * @q: struct sem_queue that describes the operation * + * Caller blocking are as follows, based the value + * indicated by the semaphore operation (sem_op): + * + * (1) >0 never blocks. + * (2) 0 (wait-for-zero operation): semval is non-zero. + * (3) <0 attempting to decrement semval to a value smaller than zero. + * * Returns 0 if the operation was possible. * Returns 1 if the operation is impossible, the caller must sleep. - * Negative values are error codes. + * Returns <0 for error codes. */ -static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) +static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q) { int result, sem_op, nsops, pid; struct sembuf *sop; @@ -720,51 +669,84 @@ undo: return result; } -/** wake_up_sem_queue_prepare(q, error): Prepare wake-up - * @q: queue entry that must be signaled - * @error: Error value for the signal - * - * Prepare the wake-up of the queue entry q. - */ -static void wake_up_sem_queue_prepare(struct list_head *pt, - struct sem_queue *q, int error) +static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) { - if (list_empty(pt)) { - /* - * Hold preempt off so that we don't get preempted and have the - * wakee busy-wait until we're scheduled back on. - */ - preempt_disable(); + int result, sem_op, nsops; + struct sembuf *sop; + struct sem *curr; + struct sembuf *sops; + struct sem_undo *un; + + sops = q->sops; + nsops = q->nsops; + un = q->undo; + + if (unlikely(q->dupsop)) + return perform_atomic_semop_slow(sma, q); + + /* + * We scan the semaphore set twice, first to ensure that the entire + * operation can succeed, therefore avoiding any pointless writes + * to shared memory and having to undo such changes in order to block + * until the operations can go through. + */ + for (sop = sops; sop < sops + nsops; sop++) { + curr = sma->sem_base + sop->sem_num; + sem_op = sop->sem_op; + result = curr->semval; + + if (!sem_op && result) + goto would_block; /* wait-for-zero */ + + result += sem_op; + if (result < 0) + goto would_block; + + if (result > SEMVMX) + return -ERANGE; + + if (sop->sem_flg & SEM_UNDO) { + int undo = un->semadj[sop->sem_num] - sem_op; + + /* Exceeding the undo range is an error. */ + if (undo < (-SEMAEM - 1) || undo > SEMAEM) + return -ERANGE; + } + } + + for (sop = sops; sop < sops + nsops; sop++) { + curr = sma->sem_base + sop->sem_num; + sem_op = sop->sem_op; + result = curr->semval; + + if (sop->sem_flg & SEM_UNDO) { + int undo = un->semadj[sop->sem_num] - sem_op; + + un->semadj[sop->sem_num] = undo; + } + curr->semval += sem_op; + curr->sempid = q->pid; } - q->status = IN_WAKEUP; - q->pid = error; - list_add_tail(&q->list, pt); + return 0; + +would_block: + q->blocking = sop; + return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1; } -/** - * wake_up_sem_queue_do - do the actual wake-up - * @pt: list of tasks to be woken up - * - * Do the actual wake-up. - * The function is called without any locks held, thus the semaphore array - * could be destroyed already and the tasks can disappear as soon as the - * status is set to the actual return code. - */ -static void wake_up_sem_queue_do(struct list_head *pt) +static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error, + struct wake_q_head *wake_q) { - struct sem_queue *q, *t; - int did_something; - - did_something = !list_empty(pt); - list_for_each_entry_safe(q, t, pt, list) { - wake_up_process(q->sleeper); - /* q can disappear immediately after writing q->status. */ - smp_wmb(); - q->status = q->pid; - } - if (did_something) - preempt_enable(); + wake_q_add(wake_q, q->sleeper); + /* + * Rely on the above implicit barrier, such that we can + * ensure that we hold reference to the task before setting + * q->status. Otherwise we could race with do_exit if the + * task is awoken by an external event before calling + * wake_up_process(). + */ + WRITE_ONCE(q->status, error); } static void unlink_queue(struct sem_array *sma, struct sem_queue *q) @@ -784,7 +766,7 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q) * modified the array. * Note that wait-for-zero operations are handled without restart. */ -static int check_restart(struct sem_array *sma, struct sem_queue *q) +static inline int check_restart(struct sem_array *sma, struct sem_queue *q) { /* pending complex alter operations are too difficult to analyse */ if (!list_empty(&sma->pending_alter)) @@ -812,21 +794,20 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q) * wake_const_ops - wake up non-alter tasks * @sma: semaphore array. * @semnum: semaphore that was modified. - * @pt: list head for the tasks that must be woken up. + * @wake_q: lockless wake-queue head. * * wake_const_ops must be called after a semaphore in a semaphore array * was set to 0. If complex const operations are pending, wake_const_ops must * be called with semnum = -1, as well as with the number of each modified * semaphore. - * The tasks that must be woken up are added to @pt. The return code + * The tasks that must be woken up are added to @wake_q. The return code * is stored in q->pid. * The function returns 1 if at least one operation was completed successfully. */ static int wake_const_ops(struct sem_array *sma, int semnum, - struct list_head *pt) + struct wake_q_head *wake_q) { - struct sem_queue *q; - struct list_head *walk; + struct sem_queue *q, *tmp; struct list_head *pending_list; int semop_completed = 0; @@ -835,25 +816,19 @@ static int wake_const_ops(struct sem_array *sma, int semnum, else pending_list = &sma->sem_base[semnum].pending_const; - walk = pending_list->next; - while (walk != pending_list) { - int error; - - q = container_of(walk, struct sem_queue, list); - walk = walk->next; - - error = perform_atomic_semop(sma, q); - - if (error <= 0) { - /* operation completed, remove from queue & wakeup */ + list_for_each_entry_safe(q, tmp, pending_list, list) { + int error = perform_atomic_semop(sma, q); - unlink_queue(sma, q); + if (error > 0) + continue; + /* operation completed, remove from queue & wakeup */ + unlink_queue(sma, q); - wake_up_sem_queue_prepare(pt, q, error); - if (error == 0) - semop_completed = 1; - } + wake_up_sem_queue_prepare(q, error, wake_q); + if (error == 0) + semop_completed = 1; } + return semop_completed; } @@ -862,14 +837,14 @@ static int wake_const_ops(struct sem_array *sma, int semnum, * @sma: semaphore array * @sops: operations that were performed * @nsops: number of operations - * @pt: list head of the tasks that must be woken up. + * @wake_q: lockless wake-queue head * * Checks all required queue for wait-for-zero operations, based * on the actual changes that were performed on the semaphore array. * The function returns 1 if at least one operation was completed successfully. */ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, - int nsops, struct list_head *pt) + int nsops, struct wake_q_head *wake_q) { int i; int semop_completed = 0; @@ -882,7 +857,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, if (sma->sem_base[num].semval == 0) { got_zero = 1; - semop_completed |= wake_const_ops(sma, num, pt); + semop_completed |= wake_const_ops(sma, num, wake_q); } } } else { @@ -893,7 +868,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, for (i = 0; i < sma->sem_nsems; i++) { if (sma->sem_base[i].semval == 0) { got_zero = 1; - semop_completed |= wake_const_ops(sma, i, pt); + semop_completed |= wake_const_ops(sma, i, wake_q); } } } @@ -902,7 +877,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, * then check the global queue, too. */ if (got_zero) - semop_completed |= wake_const_ops(sma, -1, pt); + semop_completed |= wake_const_ops(sma, -1, wake_q); return semop_completed; } @@ -912,22 +887,21 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, * update_queue - look for tasks that can be completed. * @sma: semaphore array. * @semnum: semaphore that was modified. - * @pt: list head for the tasks that must be woken up. + * @wake_q: lockless wake-queue head. * * update_queue must be called after a semaphore in a semaphore array * was modified. If multiple semaphores were modified, update_queue must * be called with semnum = -1, as well as with the number of each modified * semaphore. - * The tasks that must be woken up are added to @pt. The return code + * The tasks that must be woken up are added to @wake_q. The return code * is stored in q->pid. * The function internally checks if const operations can now succeed. * * The function return 1 if at least one semop was completed successfully. */ -static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) +static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q) { - struct sem_queue *q; - struct list_head *walk; + struct sem_queue *q, *tmp; struct list_head *pending_list; int semop_completed = 0; @@ -937,13 +911,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) pending_list = &sma->sem_base[semnum].pending_alter; again: - walk = pending_list->next; - while (walk != pending_list) { + list_for_each_entry_safe(q, tmp, pending_list, list) { int error, restart; - q = container_of(walk, struct sem_queue, list); - walk = walk->next; - /* If we are scanning the single sop, per-semaphore list of * one semaphore and that semaphore is 0, then it is not * necessary to scan further: simple increments @@ -966,11 +936,11 @@ again: restart = 0; } else { semop_completed = 1; - do_smart_wakeup_zero(sma, q->sops, q->nsops, pt); + do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q); restart = check_restart(sma, q); } - wake_up_sem_queue_prepare(pt, q, error); + wake_up_sem_queue_prepare(q, error, wake_q); if (restart) goto again; } @@ -1001,24 +971,24 @@ static void set_semotime(struct sem_array *sma, struct sembuf *sops) * @sops: operations that were performed * @nsops: number of operations * @otime: force setting otime - * @pt: list head of the tasks that must be woken up. + * @wake_q: lockless wake-queue head * * do_smart_update() does the required calls to update_queue and wakeup_zero, * based on the actual changes that were performed on the semaphore array. * Note that the function does not do the actual wake-up: the caller is - * responsible for calling wake_up_sem_queue_do(@pt). + * responsible for calling wake_up_q(). * It is safe to perform this call after dropping all locks. */ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, - int otime, struct list_head *pt) + int otime, struct wake_q_head *wake_q) { int i; - otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); + otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q); if (!list_empty(&sma->pending_alter)) { /* semaphore array uses the global queue - just process it. */ - otime |= update_queue(sma, -1, pt); + otime |= update_queue(sma, -1, wake_q); } else { if (!sops) { /* @@ -1026,7 +996,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop * known. Check all. */ for (i = 0; i < sma->sem_nsems; i++) - otime |= update_queue(sma, i, pt); + otime |= update_queue(sma, i, wake_q); } else { /* * Check the semaphores that were increased: @@ -1040,7 +1010,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop for (i = 0; i < nsops; i++) { if (sops[i].sem_op > 0) { otime |= update_queue(sma, - sops[i].sem_num, pt); + sops[i].sem_num, wake_q); } } } @@ -1128,8 +1098,8 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) struct sem_undo *un, *tu; struct sem_queue *q, *tq; struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); - struct list_head tasks; int i; + DEFINE_WAKE_Q(wake_q); /* Free the existing undo structures for this semaphore set. */ ipc_assert_locked_object(&sma->sem_perm); @@ -1143,25 +1113,24 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) } /* Wake up all pending processes and let them fail with EIDRM. */ - INIT_LIST_HEAD(&tasks); list_for_each_entry_safe(q, tq, &sma->pending_const, list) { unlink_queue(sma, q); - wake_up_sem_queue_prepare(&tasks, q, -EIDRM); + wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { unlink_queue(sma, q); - wake_up_sem_queue_prepare(&tasks, q, -EIDRM); + wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } for (i = 0; i < sma->sem_nsems; i++) { struct sem *sem = sma->sem_base + i; list_for_each_entry_safe(q, tq, &sem->pending_const, list) { unlink_queue(sma, q); - wake_up_sem_queue_prepare(&tasks, q, -EIDRM); + wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { unlink_queue(sma, q); - wake_up_sem_queue_prepare(&tasks, q, -EIDRM); + wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } } @@ -1170,7 +1139,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) sem_unlock(sma, -1); rcu_read_unlock(); - wake_up_sem_queue_do(&tasks); + wake_up_q(&wake_q); ns->used_sems -= sma->sem_nsems; ipc_rcu_putref(sma, sem_rcu_free); } @@ -1309,9 +1278,9 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, struct sem_undo *un; struct sem_array *sma; struct sem *curr; - int err; - struct list_head tasks; - int val; + int err, val; + DEFINE_WAKE_Q(wake_q); + #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) /* big-endian 64bit */ val = arg >> 32; @@ -1323,8 +1292,6 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, if (val > SEMVMX || val < 0) return -ERANGE; - INIT_LIST_HEAD(&tasks); - rcu_read_lock(); sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { @@ -1367,10 +1334,10 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, curr->sempid = task_tgid_vnr(current); sma->sem_ctime = get_seconds(); /* maybe some queued-up processes were waiting for this */ - do_smart_update(sma, NULL, 0, 0, &tasks); + do_smart_update(sma, NULL, 0, 0, &wake_q); sem_unlock(sma, -1); rcu_read_unlock(); - wake_up_sem_queue_do(&tasks); + wake_up_q(&wake_q); return 0; } @@ -1382,9 +1349,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, int err, nsems; ushort fast_sem_io[SEMMSL_FAST]; ushort *sem_io = fast_sem_io; - struct list_head tasks; - - INIT_LIST_HEAD(&tasks); + DEFINE_WAKE_Q(wake_q); rcu_read_lock(); sma = sem_obtain_object_check(ns, semid); @@ -1495,7 +1460,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, } sma->sem_ctime = get_seconds(); /* maybe some queued-up processes were waiting for this */ - do_smart_update(sma, NULL, 0, 0, &tasks); + do_smart_update(sma, NULL, 0, 0, &wake_q); err = 0; goto out_unlock; } @@ -1531,7 +1496,7 @@ out_unlock: sem_unlock(sma, -1); out_rcu_wakeup: rcu_read_unlock(); - wake_up_sem_queue_do(&tasks); + wake_up_q(&wake_q); out_free: if (sem_io != fast_sem_io) ipc_free(sem_io); @@ -1804,32 +1769,6 @@ out: return un; } - -/** - * get_queue_result - retrieve the result code from sem_queue - * @q: Pointer to queue structure - * - * Retrieve the return code from the pending queue. If IN_WAKEUP is found in - * q->status, then we must loop until the value is replaced with the final - * value: This may happen if a task is woken up by an unrelated event (e.g. - * signal) and in parallel the task is woken up by another task because it got - * the requested semaphores. - * - * The function can be called with or without holding the semaphore spinlock. - */ -static int get_queue_result(struct sem_queue *q) -{ - int error; - - error = q->status; - while (unlikely(error == IN_WAKEUP)) { - cpu_relax(); - error = q->status; - } - - return error; -} - SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, unsigned, nsops, const struct timespec __user *, timeout) { @@ -1838,11 +1777,11 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, struct sembuf fast_sops[SEMOPM_FAST]; struct sembuf *sops = fast_sops, *sop; struct sem_undo *un; - int undos = 0, alter = 0, max, locknum; + int max, locknum; + bool undos = false, alter = false, dupsop = false; struct sem_queue queue; - unsigned long jiffies_left = 0; + unsigned long dup = 0, jiffies_left = 0; struct ipc_namespace *ns; - struct list_head tasks; ns = current->nsproxy->ipc_ns; @@ -1855,10 +1794,12 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, if (sops == NULL) return -ENOMEM; } + if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { error = -EFAULT; goto out_free; } + if (timeout) { struct timespec _timeout; if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) { @@ -1872,18 +1813,30 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, } jiffies_left = timespec_to_jiffies(&_timeout); } + max = 0; for (sop = sops; sop < sops + nsops; sop++) { + unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG); + if (sop->sem_num >= max) max = sop->sem_num; if (sop->sem_flg & SEM_UNDO) - undos = 1; - if (sop->sem_op != 0) - alter = 1; + undos = true; + if (dup & mask) { + /* + * There was a previous alter access that appears + * to have accessed the same semaphore, thus use + * the dupsop logic. "appears", because the detection + * can only check % BITS_PER_LONG. + */ + dupsop = true; + } + if (sop->sem_op != 0) { + alter = true; + dup |= mask; + } } - INIT_LIST_HEAD(&tasks); - if (undos) { /* On success, find_alloc_undo takes the rcu_read_lock */ un = find_alloc_undo(ns, semid); @@ -1904,16 +1857,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, } error = -EFBIG; - if (max >= sma->sem_nsems) - goto out_rcu_wakeup; + if (max >= sma->sem_nsems) { + rcu_read_unlock(); + goto out_free; + } error = -EACCES; - if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) - goto out_rcu_wakeup; + if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) { + rcu_read_unlock(); + goto out_free; + } error = security_sem_semop(sma, sops, nsops, alter); - if (error) - goto out_rcu_wakeup; + if (error) { + rcu_read_unlock(); + goto out_free; + } error = -EIDRM; locknum = sem_lock(sma, sops, nsops); @@ -1942,24 +1901,34 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, queue.undo = un; queue.pid = task_tgid_vnr(current); queue.alter = alter; + queue.dupsop = dupsop; error = perform_atomic_semop(sma, &queue); - if (error == 0) { - /* If the operation was successful, then do + if (error == 0) { /* non-blocking succesfull path */ + DEFINE_WAKE_Q(wake_q); + + /* + * If the operation was successful, then do * the required updates. */ if (alter) - do_smart_update(sma, sops, nsops, 1, &tasks); + do_smart_update(sma, sops, nsops, 1, &wake_q); else set_semotime(sma, sops); + + sem_unlock(sma, locknum); + rcu_read_unlock(); + wake_up_q(&wake_q); + + goto out_free; } - if (error <= 0) + if (error < 0) /* non-blocking error path */ goto out_unlock_free; - /* We need to sleep on this operation, so we put the current + /* + * We need to sleep on this operation, so we put the current * task into the pending queue and go to sleep. */ - if (nsops == 1) { struct sem *curr; curr = &sma->sem_base[sops->sem_num]; @@ -1988,77 +1957,69 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, sma->complex_count++; } - queue.status = -EINTR; - queue.sleeper = current; + do { + queue.status = -EINTR; + queue.sleeper = current; -sleep_again: - __set_current_state(TASK_INTERRUPTIBLE); - sem_unlock(sma, locknum); - rcu_read_unlock(); - - if (timeout) - jiffies_left = schedule_timeout(jiffies_left); - else - schedule(); + __set_current_state(TASK_INTERRUPTIBLE); + sem_unlock(sma, locknum); + rcu_read_unlock(); - error = get_queue_result(&queue); + if (timeout) + jiffies_left = schedule_timeout(jiffies_left); + else + schedule(); - if (error != -EINTR) { - /* fast path: update_queue already obtained all requested - * resources. - * Perform a smp_mb(): User space could assume that semop() - * is a memory barrier: Without the mb(), the cpu could - * speculatively read in user space stale data that was - * overwritten by the previous owner of the semaphore. + /* + * fastpath: the semop has completed, either successfully or + * not, from the syscall pov, is quite irrelevant to us at this + * point; we're done. + * + * We _do_ care, nonetheless, about being awoken by a signal or + * spuriously. The queue.status is checked again in the + * slowpath (aka after taking sem_lock), such that we can detect + * scenarios where we were awakened externally, during the + * window between wake_q_add() and wake_up_q(). */ - smp_mb(); - - goto out_free; - } - - rcu_read_lock(); - sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum); - - /* - * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing. - */ - error = get_queue_result(&queue); + error = READ_ONCE(queue.status); + if (error != -EINTR) { + /* + * User space could assume that semop() is a memory + * barrier: Without the mb(), the cpu could + * speculatively read in userspace stale data that was + * overwritten by the previous owner of the semaphore. + */ + smp_mb(); + goto out_free; + } - /* - * Array removed? If yes, leave without sem_unlock(). - */ - if (IS_ERR(sma)) { - rcu_read_unlock(); - goto out_free; - } + rcu_read_lock(); + sem_lock(sma, sops, nsops); + if (!ipc_valid_object(&sma->sem_perm)) + goto out_unlock_free; - /* - * If queue.status != -EINTR we are woken up by another process. - * Leave without unlink_queue(), but with sem_unlock(). - */ - if (error != -EINTR) - goto out_unlock_free; + error = READ_ONCE(queue.status); - /* - * If an interrupt occurred we have to clean up the queue - */ - if (timeout && jiffies_left == 0) - error = -EAGAIN; + /* + * If queue.status != -EINTR we are woken up by another process. + * Leave without unlink_queue(), but with sem_unlock(). + */ + if (error != -EINTR) + goto out_unlock_free; - /* - * If the wakeup was spurious, just retry - */ - if (error == -EINTR && !signal_pending(current)) - goto sleep_again; + /* + * If an interrupt occurred we have to clean up the queue. + */ + if (timeout && jiffies_left == 0) + error = -EAGAIN; + } while (error == -EINTR && !signal_pending(current)); /* spurious */ unlink_queue(sma, &queue); out_unlock_free: sem_unlock(sma, locknum); -out_rcu_wakeup: rcu_read_unlock(); - wake_up_sem_queue_do(&tasks); out_free: if (sops != fast_sops) kfree(sops); @@ -2119,8 +2080,8 @@ void exit_sem(struct task_struct *tsk) for (;;) { struct sem_array *sma; struct sem_undo *un; - struct list_head tasks; int semid, i; + DEFINE_WAKE_Q(wake_q); cond_resched(); @@ -2208,11 +2169,10 @@ void exit_sem(struct task_struct *tsk) } } /* maybe some queued-up processes were waiting for this */ - INIT_LIST_HEAD(&tasks); - do_smart_update(sma, NULL, 0, 1, &tasks); + do_smart_update(sma, NULL, 0, 1, &wake_q); sem_unlock(sma, -1); rcu_read_unlock(); - wake_up_sem_queue_do(&tasks); + wake_up_q(&wake_q); kfree_rcu(un, rcu); } diff --git a/kernel/Makefile b/kernel/Makefile index eb26e12c6c2a..314e7d62f5f0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f9ec9add2164..215871bda3a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, - &vma); + &vma, NULL); if (ret <= 0) return ret; @@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * essentially a kernel access to the memory. */ result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, - NULL); + NULL, NULL); if (result < 0) return result; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 0c2df7f73792..b56a558e406d 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -19,6 +19,7 @@ #include <linux/mutex.h> #include <linux/list.h> #include <linux/fs.h> +#include <linux/ima.h> #include <crypto/hash.h> #include <crypto/sha.h> #include <linux/syscalls.h> @@ -132,6 +133,9 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, return ret; image->kernel_buf_len = size; + /* IMA needs to pass the measurement list to the next kernel. */ + ima_add_kexec_buffer(image); + /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, image->kernel_buf_len); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9b08ca391aed..3921cf7fea8e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -516,7 +516,8 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, spin_lock_irqsave(&ptr->it_lock, flags); if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { - if (posix_timer_event(ptr, 0) != 0) + if (IS_ENABLED(CONFIG_POSIX_TIMERS) && + posix_timer_event(ptr, 0) != 0) ptr->it_overrun++; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6d1020c03d41..d4b0fa01cae3 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,32 +24,14 @@ #include <asm/irq_regs.h> #include <linux/kvm_para.h> -#include <linux/perf_event.h> #include <linux/kthread.h> -/* - * The run state of the lockup detectors is controlled by the content of the - * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - - * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. - * - * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' - * are variables that are only used as an 'interface' between the parameters - * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The - * 'watchdog_thresh' variable is handled differently because its value is not - * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' - * is equal zero. - */ -#define NMI_WATCHDOG_ENABLED_BIT 0 -#define SOFT_WATCHDOG_ENABLED_BIT 1 -#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) -#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) - static DEFINE_MUTEX(watchdog_proc_mutex); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; #else -static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif int __read_mostly nmi_watchdog_enabled; int __read_mostly soft_watchdog_enabled; @@ -59,9 +41,6 @@ int __read_mostly watchdog_thresh = 10; #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#else -#define sysctl_softlockup_all_cpu_backtrace 0 -#define sysctl_hardlockup_all_cpu_backtrace 0 #endif static struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -100,50 +79,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static DEFINE_PER_CPU(bool, hard_watchdog_warn); -static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); -static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -#endif static unsigned long soft_lockup_nmi_warn; -/* boot commands */ -/* - * Should we panic when a soft-lockup or hard-lockup occurs: - */ -#ifdef CONFIG_HARDLOCKUP_DETECTOR -unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; -static unsigned long hardlockup_allcpu_dumped; -/* - * We may not want to enable hard lockup detection by default in all cases, - * for example when running the kernel as a guest on a hypervisor. In these - * cases this function can be called to disable hard lockup detection. This - * function should only be executed once by the boot processor before the - * kernel command line parameters are parsed, because otherwise it is not - * possible to override this in hardlockup_panic_setup(). - */ -void hardlockup_detector_disable(void) -{ - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; -} - -static int __init hardlockup_panic_setup(char *str) -{ - if (!strncmp(str, "panic", 5)) - hardlockup_panic = 1; - else if (!strncmp(str, "nopanic", 7)) - hardlockup_panic = 0; - else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; - else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - return 1; -} -__setup("nmi_watchdog=", hardlockup_panic_setup); -#endif - unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -264,32 +202,14 @@ void touch_all_softlockup_watchdogs(void) wq_watchdog_touch(-1); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR -void touch_nmi_watchdog(void) -{ - /* - * Using __raw here because some code paths have - * preemption enabled. If preemption is enabled - * then interrupts should be enabled too, in which - * case we shouldn't have to worry about the watchdog - * going off. - */ - raw_cpu_write(watchdog_nmi_touch, true); - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -#endif - void touch_softlockup_watchdog_sync(void) { __this_cpu_write(softlockup_touch_sync, true); __this_cpu_write(watchdog_touch_ts, 0); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ -static bool is_hardlockup(void) +bool is_hardlockup(void) { unsigned long hrint = __this_cpu_read(hrtimer_interrupts); @@ -299,7 +219,6 @@ static bool is_hardlockup(void) __this_cpu_write(hrtimer_interrupts_saved, hrint); return false; } -#endif static int is_softlockup(unsigned long touch_ts) { @@ -313,77 +232,22 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -#ifdef CONFIG_HARDLOCKUP_DETECTOR - -static struct perf_event_attr wd_hw_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, -}; - -/* Callback function for perf event subsystem */ -static void watchdog_overflow_callback(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - /* Ensure the watchdog never gets throttled */ - event->hw.interrupts = 0; - - if (__this_cpu_read(watchdog_nmi_touch) == true) { - __this_cpu_write(watchdog_nmi_touch, false); - return; - } - - /* check for a hardlockup - * This is done by making sure our timer interrupt - * is incrementing. The timer interrupt should have - * fired multiple times before we overflow'd. If it hasn't - * then this is a good indication the cpu is stuck - */ - if (is_hardlockup()) { - int this_cpu = smp_processor_id(); - - /* only print hardlockups once */ - if (__this_cpu_read(hard_watchdog_warn) == true) - return; - - pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - - /* - * Perform all-CPU dump only once to avoid multiple hardlockups - * generating interleaving traces - */ - if (sysctl_hardlockup_all_cpu_backtrace && - !test_and_set_bit(0, &hardlockup_allcpu_dumped)) - trigger_allbutself_cpu_backtrace(); - - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); - - __this_cpu_write(hard_watchdog_warn, true); - return; - } - - __this_cpu_write(hard_watchdog_warn, false); - return; -} -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ - static void watchdog_interrupt_count(void) { __this_cpu_inc(hrtimer_interrupts); } -static int watchdog_nmi_enable(unsigned int cpu); -static void watchdog_nmi_disable(unsigned int cpu); +/* + * These two functions are mostly architecture specific + * defining them as weak here. + */ +int __weak watchdog_nmi_enable(unsigned int cpu) +{ + return 0; +} +void __weak watchdog_nmi_disable(unsigned int cpu) +{ +} static int watchdog_enable_all_cpus(void); static void watchdog_disable_all_cpus(void); @@ -576,109 +440,6 @@ static void watchdog(unsigned int cpu) watchdog_nmi_disable(cpu); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR -/* - * People like the simple clean cpu node info on boot. - * Reduce the watchdog noise by only printing messages - * that are different from what cpu0 displayed. - */ -static unsigned long cpu0_err; - -static int watchdog_nmi_enable(unsigned int cpu) -{ - struct perf_event_attr *wd_attr; - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - /* nothing to do if the hard lockup detector is disabled */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto out; - - /* is it already setup and enabled? */ - if (event && event->state > PERF_EVENT_STATE_OFF) - goto out; - - /* it is setup but not enabled */ - if (event != NULL) - goto out_enable; - - wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - - /* Try to register using hardware perf events */ - event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - - /* save cpu0 error for future comparision */ - if (cpu == 0 && IS_ERR(event)) - cpu0_err = PTR_ERR(event); - - if (!IS_ERR(event)) { - /* only print for cpu0 or different than cpu0 */ - if (cpu == 0 || cpu0_err) - pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); - goto out_save; - } - - /* - * Disable the hard lockup detector if _any_ CPU fails to set up - * set up the hardware perf event. The watchdog() function checks - * the NMI_WATCHDOG_ENABLED bit periodically. - * - * The barriers are for syncing up watchdog_enabled across all the - * cpus, as clear_bit() does not use barriers. - */ - smp_mb__before_atomic(); - clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); - smp_mb__after_atomic(); - - /* skip displaying the same error again */ - if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) - return PTR_ERR(event); - - /* vary the KERN level based on the returned errno */ - if (PTR_ERR(event) == -EOPNOTSUPP) - pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); - else if (PTR_ERR(event) == -ENOENT) - pr_warn("disabled (cpu%i): hardware events not enabled\n", - cpu); - else - pr_err("disabled (cpu%i): unable to create perf event: %ld\n", - cpu, PTR_ERR(event)); - - pr_info("Shutting down hard lockup detector on all cpus\n"); - - return PTR_ERR(event); - - /* success path */ -out_save: - per_cpu(watchdog_ev, cpu) = event; -out_enable: - perf_event_enable(per_cpu(watchdog_ev, cpu)); -out: - return 0; -} - -static void watchdog_nmi_disable(unsigned int cpu) -{ - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - if (event) { - perf_event_disable(event); - per_cpu(watchdog_ev, cpu) = NULL; - - /* should be in cleanup, but blocks oprofile */ - perf_event_release_kernel(event); - } - if (cpu == 0) { - /* watchdog_nmi_enable() expects this to be zero initially. */ - cpu0_err = 0; - } -} - -#else -static int watchdog_nmi_enable(unsigned int cpu) { return 0; } -static void watchdog_nmi_disable(unsigned int cpu) { return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ - static struct smp_hotplug_thread watchdog_threads = { .store = &softlockup_watchdog, .thread_should_run = watchdog_should_run, diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c new file mode 100644 index 000000000000..84016c8aee6b --- /dev/null +++ b/kernel/watchdog_hld.c @@ -0,0 +1,227 @@ +/* + * Detect hard lockups on a system + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Note: Most of this code is borrowed heavily from the original softlockup + * detector, so thanks to Ingo for the initial implementation. + * Some chunks also taken from the old x86-specific nmi watchdog code, thanks + * to those contributors as well. + */ + +#define pr_fmt(fmt) "NMI watchdog: " fmt + +#include <linux/nmi.h> +#include <linux/module.h> +#include <asm/irq_regs.h> +#include <linux/perf_event.h> + +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); + +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +unsigned int __read_mostly hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +static unsigned long hardlockup_allcpu_dumped; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void hardlockup_detector_disable(void) +{ + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +} + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; + else if (!strncmp(str, "0", 1)) + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); + +void touch_nmi_watchdog(void) +{ + /* + * Using __raw here because some code paths have + * preemption enabled. If preemption is enabled + * then interrupts should be enabled too, in which + * case we shouldn't have to worry about the watchdog + * going off. + */ + raw_cpu_write(watchdog_nmi_touch, true); + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +static void watchdog_overflow_callback(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + /* Ensure the watchdog never gets throttled */ + event->hw.interrupts = 0; + + if (__this_cpu_read(watchdog_nmi_touch) == true) { + __this_cpu_write(watchdog_nmi_touch, false); + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup()) { + int this_cpu = smp_processor_id(); + + /* only print hardlockups once */ + if (__this_cpu_read(hard_watchdog_warn) == true) + return; + + pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + /* + * Perform all-CPU dump only once to avoid multiple hardlockups + * generating interleaving traces + */ + if (sysctl_hardlockup_all_cpu_backtrace && + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); + + __this_cpu_write(hard_watchdog_warn, true); + return; + } + + __this_cpu_write(hard_watchdog_warn, false); + return; +} + +/* + * People like the simple clean cpu node info on boot. + * Reduce the watchdog noise by only printing messages + * that are different from what cpu0 displayed. + */ +static unsigned long cpu0_err; + +int watchdog_nmi_enable(unsigned int cpu) +{ + struct perf_event_attr *wd_attr; + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* nothing to do if the hard lockup detector is disabled */ + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + goto out; + + /* is it already setup and enabled? */ + if (event && event->state > PERF_EVENT_STATE_OFF) + goto out; + + /* it is setup but not enabled */ + if (event != NULL) + goto out_enable; + + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + + /* Try to register using hardware perf events */ + event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + + /* save cpu0 error for future comparision */ + if (cpu == 0 && IS_ERR(event)) + cpu0_err = PTR_ERR(event); + + if (!IS_ERR(event)) { + /* only print for cpu0 or different than cpu0 */ + if (cpu == 0 || cpu0_err) + pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); + goto out_save; + } + + /* + * Disable the hard lockup detector if _any_ CPU fails to set up + * set up the hardware perf event. The watchdog() function checks + * the NMI_WATCHDOG_ENABLED bit periodically. + * + * The barriers are for syncing up watchdog_enabled across all the + * cpus, as clear_bit() does not use barriers. + */ + smp_mb__before_atomic(); + clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); + smp_mb__after_atomic(); + + /* skip displaying the same error again */ + if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + return PTR_ERR(event); + + /* vary the KERN level based on the returned errno */ + if (PTR_ERR(event) == -EOPNOTSUPP) + pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + else if (PTR_ERR(event) == -ENOENT) + pr_warn("disabled (cpu%i): hardware events not enabled\n", + cpu); + else + pr_err("disabled (cpu%i): unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); + + pr_info("Shutting down hard lockup detector on all cpus\n"); + + return PTR_ERR(event); + + /* success path */ +out_save: + per_cpu(watchdog_ev, cpu) = event; +out_enable: + perf_event_enable(per_cpu(watchdog_ev, cpu)); +out: + return 0; +} + +void watchdog_nmi_disable(unsigned int cpu) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) { + perf_event_disable(event); + per_cpu(watchdog_ev, cpu) = NULL; + + /* should be in cleanup, but blocks oprofile */ + perf_event_release_kernel(event); + } + if (cpu == 0) { + /* watchdog_nmi_enable() expects this to be zero initially. */ + cpu0_err = 0; + } +} diff --git a/lib/idr.c b/lib/idr.c index 52d2979a05e8..918b595d9455 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -1,1071 +1,8 @@ -/* - * 2002-10-18 written by Jim Houston jim.houston@ccur.com - * Copyright (C) 2002 by Concurrent Computer Corporation - * Distributed under the GNU GPL license version 2. - * - * Modified by George Anzinger to reuse immediately and to use - * find bit instructions. Also removed _irq on spinlocks. - * - * Modified by Nadia Derbey to make it RCU safe. - * - * Small id to pointer translation service. - * - * It uses a radix tree like structure as a sparse array indexed - * by the id to obtain the pointer. The bitmap makes allocating - * a new id quick. - * - * You call it to allocate an id (an int) an associate with that id a - * pointer or what ever, we treat it as a (void *). You can pass this - * id to a user for him to pass back at a later time. You then pass - * that id to this code and it returns your pointer. - */ - -#ifndef TEST // to test in user space... -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/export.h> -#endif -#include <linux/err.h> -#include <linux/string.h> #include <linux/idr.h> #include <linux/spinlock.h> -#include <linux/percpu.h> - -#define MAX_IDR_SHIFT (sizeof(int) * 8 - 1) -#define MAX_IDR_BIT (1U << MAX_IDR_SHIFT) - -/* Leave the possibility of an incomplete final layer */ -#define MAX_IDR_LEVEL ((MAX_IDR_SHIFT + IDR_BITS - 1) / IDR_BITS) - -/* Number of id_layer structs to leave in free list */ -#define MAX_IDR_FREE (MAX_IDR_LEVEL * 2) -static struct kmem_cache *idr_layer_cache; -static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head); -static DEFINE_PER_CPU(int, idr_preload_cnt); static DEFINE_SPINLOCK(simple_ida_lock); -/* the maximum ID which can be allocated given idr->layers */ -static int idr_max(int layers) -{ - int bits = min_t(int, layers * IDR_BITS, MAX_IDR_SHIFT); - - return (1 << bits) - 1; -} - -/* - * Prefix mask for an idr_layer at @layer. For layer 0, the prefix mask is - * all bits except for the lower IDR_BITS. For layer 1, 2 * IDR_BITS, and - * so on. - */ -static int idr_layer_prefix_mask(int layer) -{ - return ~idr_max(layer + 1); -} - -static struct idr_layer *get_from_free_list(struct idr *idp) -{ - struct idr_layer *p; - unsigned long flags; - - spin_lock_irqsave(&idp->lock, flags); - if ((p = idp->id_free)) { - idp->id_free = p->ary[0]; - idp->id_free_cnt--; - p->ary[0] = NULL; - } - spin_unlock_irqrestore(&idp->lock, flags); - return(p); -} - -/** - * idr_layer_alloc - allocate a new idr_layer - * @gfp_mask: allocation mask - * @layer_idr: optional idr to allocate from - * - * If @layer_idr is %NULL, directly allocate one using @gfp_mask or fetch - * one from the per-cpu preload buffer. If @layer_idr is not %NULL, fetch - * an idr_layer from @idr->id_free. - * - * @layer_idr is to maintain backward compatibility with the old alloc - * interface - idr_pre_get() and idr_get_new*() - and will be removed - * together with per-pool preload buffer. - */ -static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr) -{ - struct idr_layer *new; - - /* this is the old path, bypass to get_from_free_list() */ - if (layer_idr) - return get_from_free_list(layer_idr); - - /* - * Try to allocate directly from kmem_cache. We want to try this - * before preload buffer; otherwise, non-preloading idr_alloc() - * users will end up taking advantage of preloading ones. As the - * following is allowed to fail for preloaded cases, suppress - * warning this time. - */ - new = kmem_cache_zalloc(idr_layer_cache, gfp_mask | __GFP_NOWARN); - if (new) - return new; - - /* - * Try to fetch one from the per-cpu preload buffer if in process - * context. See idr_preload() for details. - */ - if (!in_interrupt()) { - preempt_disable(); - new = __this_cpu_read(idr_preload_head); - if (new) { - __this_cpu_write(idr_preload_head, new->ary[0]); - __this_cpu_dec(idr_preload_cnt); - new->ary[0] = NULL; - } - preempt_enable(); - if (new) - return new; - } - - /* - * Both failed. Try kmem_cache again w/o adding __GFP_NOWARN so - * that memory allocation failure warning is printed as intended. - */ - return kmem_cache_zalloc(idr_layer_cache, gfp_mask); -} - -static void idr_layer_rcu_free(struct rcu_head *head) -{ - struct idr_layer *layer; - - layer = container_of(head, struct idr_layer, rcu_head); - kmem_cache_free(idr_layer_cache, layer); -} - -static inline void free_layer(struct idr *idr, struct idr_layer *p) -{ - if (idr->hint == p) - RCU_INIT_POINTER(idr->hint, NULL); - call_rcu(&p->rcu_head, idr_layer_rcu_free); -} - -/* only called when idp->lock is held */ -static void __move_to_free_list(struct idr *idp, struct idr_layer *p) -{ - p->ary[0] = idp->id_free; - idp->id_free = p; - idp->id_free_cnt++; -} - -static void move_to_free_list(struct idr *idp, struct idr_layer *p) -{ - unsigned long flags; - - /* - * Depends on the return element being zeroed. - */ - spin_lock_irqsave(&idp->lock, flags); - __move_to_free_list(idp, p); - spin_unlock_irqrestore(&idp->lock, flags); -} - -static void idr_mark_full(struct idr_layer **pa, int id) -{ - struct idr_layer *p = pa[0]; - int l = 0; - - __set_bit(id & IDR_MASK, p->bitmap); - /* - * If this layer is full mark the bit in the layer above to - * show that this part of the radix tree is full. This may - * complete the layer above and require walking up the radix - * tree. - */ - while (bitmap_full(p->bitmap, IDR_SIZE)) { - if (!(p = pa[++l])) - break; - id = id >> IDR_BITS; - __set_bit((id & IDR_MASK), p->bitmap); - } -} - -static int __idr_pre_get(struct idr *idp, gfp_t gfp_mask) -{ - while (idp->id_free_cnt < MAX_IDR_FREE) { - struct idr_layer *new; - new = kmem_cache_zalloc(idr_layer_cache, gfp_mask); - if (new == NULL) - return (0); - move_to_free_list(idp, new); - } - return 1; -} - -/** - * sub_alloc - try to allocate an id without growing the tree depth - * @idp: idr handle - * @starting_id: id to start search at - * @pa: idr_layer[MAX_IDR_LEVEL] used as backtrack buffer - * @gfp_mask: allocation mask for idr_layer_alloc() - * @layer_idr: optional idr passed to idr_layer_alloc() - * - * Allocate an id in range [@starting_id, INT_MAX] from @idp without - * growing its depth. Returns - * - * the allocated id >= 0 if successful, - * -EAGAIN if the tree needs to grow for allocation to succeed, - * -ENOSPC if the id space is exhausted, - * -ENOMEM if more idr_layers need to be allocated. - */ -static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa, - gfp_t gfp_mask, struct idr *layer_idr) -{ - int n, m, sh; - struct idr_layer *p, *new; - int l, id, oid; - - id = *starting_id; - restart: - p = idp->top; - l = idp->layers; - pa[l--] = NULL; - while (1) { - /* - * We run around this while until we reach the leaf node... - */ - n = (id >> (IDR_BITS*l)) & IDR_MASK; - m = find_next_zero_bit(p->bitmap, IDR_SIZE, n); - if (m == IDR_SIZE) { - /* no space available go back to previous layer. */ - l++; - oid = id; - id = (id | ((1 << (IDR_BITS * l)) - 1)) + 1; - - /* if already at the top layer, we need to grow */ - if (id > idr_max(idp->layers)) { - *starting_id = id; - return -EAGAIN; - } - p = pa[l]; - BUG_ON(!p); - - /* If we need to go up one layer, continue the - * loop; otherwise, restart from the top. - */ - sh = IDR_BITS * (l + 1); - if (oid >> sh == id >> sh) - continue; - else - goto restart; - } - if (m != n) { - sh = IDR_BITS*l; - id = ((id >> sh) ^ n ^ m) << sh; - } - if ((id >= MAX_IDR_BIT) || (id < 0)) - return -ENOSPC; - if (l == 0) - break; - /* - * Create the layer below if it is missing. - */ - if (!p->ary[m]) { - new = idr_layer_alloc(gfp_mask, layer_idr); - if (!new) - return -ENOMEM; - new->layer = l-1; - new->prefix = id & idr_layer_prefix_mask(new->layer); - rcu_assign_pointer(p->ary[m], new); - p->count++; - } - pa[l--] = p; - p = p->ary[m]; - } - - pa[l] = p; - return id; -} - -static int idr_get_empty_slot(struct idr *idp, int starting_id, - struct idr_layer **pa, gfp_t gfp_mask, - struct idr *layer_idr) -{ - struct idr_layer *p, *new; - int layers, v, id; - unsigned long flags; - - id = starting_id; -build_up: - p = idp->top; - layers = idp->layers; - if (unlikely(!p)) { - if (!(p = idr_layer_alloc(gfp_mask, layer_idr))) - return -ENOMEM; - p->layer = 0; - layers = 1; - } - /* - * Add a new layer to the top of the tree if the requested - * id is larger than the currently allocated space. - */ - while (id > idr_max(layers)) { - layers++; - if (!p->count) { - /* special case: if the tree is currently empty, - * then we grow the tree by moving the top node - * upwards. - */ - p->layer++; - WARN_ON_ONCE(p->prefix); - continue; - } - if (!(new = idr_layer_alloc(gfp_mask, layer_idr))) { - /* - * The allocation failed. If we built part of - * the structure tear it down. - */ - spin_lock_irqsave(&idp->lock, flags); - for (new = p; p && p != idp->top; new = p) { - p = p->ary[0]; - new->ary[0] = NULL; - new->count = 0; - bitmap_clear(new->bitmap, 0, IDR_SIZE); - __move_to_free_list(idp, new); - } - spin_unlock_irqrestore(&idp->lock, flags); - return -ENOMEM; - } - new->ary[0] = p; - new->count = 1; - new->layer = layers-1; - new->prefix = id & idr_layer_prefix_mask(new->layer); - if (bitmap_full(p->bitmap, IDR_SIZE)) - __set_bit(0, new->bitmap); - p = new; - } - rcu_assign_pointer(idp->top, p); - idp->layers = layers; - v = sub_alloc(idp, &id, pa, gfp_mask, layer_idr); - if (v == -EAGAIN) - goto build_up; - return(v); -} - -/* - * @id and @pa are from a successful allocation from idr_get_empty_slot(). - * Install the user pointer @ptr and mark the slot full. - */ -static void idr_fill_slot(struct idr *idr, void *ptr, int id, - struct idr_layer **pa) -{ - /* update hint used for lookup, cleared from free_layer() */ - rcu_assign_pointer(idr->hint, pa[0]); - - rcu_assign_pointer(pa[0]->ary[id & IDR_MASK], (struct idr_layer *)ptr); - pa[0]->count++; - idr_mark_full(pa, id); -} - - -/** - * idr_preload - preload for idr_alloc() - * @gfp_mask: allocation mask to use for preloading - * - * Preload per-cpu layer buffer for idr_alloc(). Can only be used from - * process context and each idr_preload() invocation should be matched with - * idr_preload_end(). Note that preemption is disabled while preloaded. - * - * The first idr_alloc() in the preloaded section can be treated as if it - * were invoked with @gfp_mask used for preloading. This allows using more - * permissive allocation masks for idrs protected by spinlocks. - * - * For example, if idr_alloc() below fails, the failure can be treated as - * if idr_alloc() were called with GFP_KERNEL rather than GFP_NOWAIT. - * - * idr_preload(GFP_KERNEL); - * spin_lock(lock); - * - * id = idr_alloc(idr, ptr, start, end, GFP_NOWAIT); - * - * spin_unlock(lock); - * idr_preload_end(); - * if (id < 0) - * error; - */ -void idr_preload(gfp_t gfp_mask) -{ - /* - * Consuming preload buffer from non-process context breaks preload - * allocation guarantee. Disallow usage from those contexts. - */ - WARN_ON_ONCE(in_interrupt()); - might_sleep_if(gfpflags_allow_blocking(gfp_mask)); - - preempt_disable(); - - /* - * idr_alloc() is likely to succeed w/o full idr_layer buffer and - * return value from idr_alloc() needs to be checked for failure - * anyway. Silently give up if allocation fails. The caller can - * treat failures from idr_alloc() as if idr_alloc() were called - * with @gfp_mask which should be enough. - */ - while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) { - struct idr_layer *new; - - preempt_enable(); - new = kmem_cache_zalloc(idr_layer_cache, gfp_mask); - preempt_disable(); - if (!new) - break; - - /* link the new one to per-cpu preload list */ - new->ary[0] = __this_cpu_read(idr_preload_head); - __this_cpu_write(idr_preload_head, new); - __this_cpu_inc(idr_preload_cnt); - } -} -EXPORT_SYMBOL(idr_preload); - -/** - * idr_alloc - allocate new idr entry - * @idr: the (initialized) idr - * @ptr: pointer to be associated with the new id - * @start: the minimum id (inclusive) - * @end: the maximum id (exclusive, <= 0 for max) - * @gfp_mask: memory allocation flags - * - * Allocate an id in [start, end) and associate it with @ptr. If no ID is - * available in the specified range, returns -ENOSPC. On memory allocation - * failure, returns -ENOMEM. - * - * Note that @end is treated as max when <= 0. This is to always allow - * using @start + N as @end as long as N is inside integer range. - * - * The user is responsible for exclusively synchronizing all operations - * which may modify @idr. However, read-only accesses such as idr_find() - * or iteration can be performed under RCU read lock provided the user - * destroys @ptr in RCU-safe way after removal from idr. - */ -int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) -{ - int max = end > 0 ? end - 1 : INT_MAX; /* inclusive upper limit */ - struct idr_layer *pa[MAX_IDR_LEVEL + 1]; - int id; - - might_sleep_if(gfpflags_allow_blocking(gfp_mask)); - - /* sanity checks */ - if (WARN_ON_ONCE(start < 0)) - return -EINVAL; - if (unlikely(max < start)) - return -ENOSPC; - - /* allocate id */ - id = idr_get_empty_slot(idr, start, pa, gfp_mask, NULL); - if (unlikely(id < 0)) - return id; - if (unlikely(id > max)) - return -ENOSPC; - - idr_fill_slot(idr, ptr, id, pa); - return id; -} -EXPORT_SYMBOL_GPL(idr_alloc); - -/** - * idr_alloc_cyclic - allocate new idr entry in a cyclical fashion - * @idr: the (initialized) idr - * @ptr: pointer to be associated with the new id - * @start: the minimum id (inclusive) - * @end: the maximum id (exclusive, <= 0 for max) - * @gfp_mask: memory allocation flags - * - * Essentially the same as idr_alloc, but prefers to allocate progressively - * higher ids if it can. If the "cur" counter wraps, then it will start again - * at the "start" end of the range and allocate one that has already been used. - */ -int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, - gfp_t gfp_mask) -{ - int id; - - id = idr_alloc(idr, ptr, max(start, idr->cur), end, gfp_mask); - if (id == -ENOSPC) - id = idr_alloc(idr, ptr, start, end, gfp_mask); - - if (likely(id >= 0)) - idr->cur = id + 1; - return id; -} -EXPORT_SYMBOL(idr_alloc_cyclic); - -static void idr_remove_warning(int id) -{ - WARN(1, "idr_remove called for id=%d which is not allocated.\n", id); -} - -static void sub_remove(struct idr *idp, int shift, int id) -{ - struct idr_layer *p = idp->top; - struct idr_layer **pa[MAX_IDR_LEVEL + 1]; - struct idr_layer ***paa = &pa[0]; - struct idr_layer *to_free; - int n; - - *paa = NULL; - *++paa = &idp->top; - - while ((shift > 0) && p) { - n = (id >> shift) & IDR_MASK; - __clear_bit(n, p->bitmap); - *++paa = &p->ary[n]; - p = p->ary[n]; - shift -= IDR_BITS; - } - n = id & IDR_MASK; - if (likely(p != NULL && test_bit(n, p->bitmap))) { - __clear_bit(n, p->bitmap); - RCU_INIT_POINTER(p->ary[n], NULL); - to_free = NULL; - while(*paa && ! --((**paa)->count)){ - if (to_free) - free_layer(idp, to_free); - to_free = **paa; - **paa-- = NULL; - } - if (!*paa) - idp->layers = 0; - if (to_free) - free_layer(idp, to_free); - } else - idr_remove_warning(id); -} - -/** - * idr_remove - remove the given id and free its slot - * @idp: idr handle - * @id: unique key - */ -void idr_remove(struct idr *idp, int id) -{ - struct idr_layer *p; - struct idr_layer *to_free; - - if (id < 0) - return; - - if (id > idr_max(idp->layers)) { - idr_remove_warning(id); - return; - } - - sub_remove(idp, (idp->layers - 1) * IDR_BITS, id); - if (idp->top && idp->top->count == 1 && (idp->layers > 1) && - idp->top->ary[0]) { - /* - * Single child at leftmost slot: we can shrink the tree. - * This level is not needed anymore since when layers are - * inserted, they are inserted at the top of the existing - * tree. - */ - to_free = idp->top; - p = idp->top->ary[0]; - rcu_assign_pointer(idp->top, p); - --idp->layers; - to_free->count = 0; - bitmap_clear(to_free->bitmap, 0, IDR_SIZE); - free_layer(idp, to_free); - } -} -EXPORT_SYMBOL(idr_remove); - -static void __idr_remove_all(struct idr *idp) -{ - int n, id, max; - int bt_mask; - struct idr_layer *p; - struct idr_layer *pa[MAX_IDR_LEVEL + 1]; - struct idr_layer **paa = &pa[0]; - - n = idp->layers * IDR_BITS; - *paa = idp->top; - RCU_INIT_POINTER(idp->top, NULL); - max = idr_max(idp->layers); - - id = 0; - while (id >= 0 && id <= max) { - p = *paa; - while (n > IDR_BITS && p) { - n -= IDR_BITS; - p = p->ary[(id >> n) & IDR_MASK]; - *++paa = p; - } - - bt_mask = id; - id += 1 << n; - /* Get the highest bit that the above add changed from 0->1. */ - while (n < fls(id ^ bt_mask)) { - if (*paa) - free_layer(idp, *paa); - n += IDR_BITS; - --paa; - } - } - idp->layers = 0; -} - -/** - * idr_destroy - release all cached layers within an idr tree - * @idp: idr handle - * - * Free all id mappings and all idp_layers. After this function, @idp is - * completely unused and can be freed / recycled. The caller is - * responsible for ensuring that no one else accesses @idp during or after - * idr_destroy(). - * - * A typical clean-up sequence for objects stored in an idr tree will use - * idr_for_each() to free all objects, if necessary, then idr_destroy() to - * free up the id mappings and cached idr_layers. - */ -void idr_destroy(struct idr *idp) -{ - __idr_remove_all(idp); - - while (idp->id_free_cnt) { - struct idr_layer *p = get_from_free_list(idp); - kmem_cache_free(idr_layer_cache, p); - } -} -EXPORT_SYMBOL(idr_destroy); - -void *idr_find_slowpath(struct idr *idp, int id) -{ - int n; - struct idr_layer *p; - - if (id < 0) - return NULL; - - p = rcu_dereference_raw(idp->top); - if (!p) - return NULL; - n = (p->layer+1) * IDR_BITS; - - if (id > idr_max(p->layer + 1)) - return NULL; - BUG_ON(n == 0); - - while (n > 0 && p) { - n -= IDR_BITS; - BUG_ON(n != p->layer*IDR_BITS); - p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]); - } - return((void *)p); -} -EXPORT_SYMBOL(idr_find_slowpath); - -/** - * idr_for_each - iterate through all stored pointers - * @idp: idr handle - * @fn: function to be called for each pointer - * @data: data passed back to callback function - * - * Iterate over the pointers registered with the given idr. The - * callback function will be called for each pointer currently - * registered, passing the id, the pointer and the data pointer passed - * to this function. It is not safe to modify the idr tree while in - * the callback, so functions such as idr_get_new and idr_remove are - * not allowed. - * - * We check the return of @fn each time. If it returns anything other - * than %0, we break out and return that value. - * - * The caller must serialize idr_for_each() vs idr_get_new() and idr_remove(). - */ -int idr_for_each(struct idr *idp, - int (*fn)(int id, void *p, void *data), void *data) -{ - int n, id, max, error = 0; - struct idr_layer *p; - struct idr_layer *pa[MAX_IDR_LEVEL + 1]; - struct idr_layer **paa = &pa[0]; - - n = idp->layers * IDR_BITS; - *paa = rcu_dereference_raw(idp->top); - max = idr_max(idp->layers); - - id = 0; - while (id >= 0 && id <= max) { - p = *paa; - while (n > 0 && p) { - n -= IDR_BITS; - p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]); - *++paa = p; - } - - if (p) { - error = fn(id, (void *)p, data); - if (error) - break; - } - - id += 1 << n; - while (n < fls(id)) { - n += IDR_BITS; - --paa; - } - } - - return error; -} -EXPORT_SYMBOL(idr_for_each); - -/** - * idr_get_next - lookup next object of id to given id. - * @idp: idr handle - * @nextidp: pointer to lookup key - * - * Returns pointer to registered object with id, which is next number to - * given id. After being looked up, *@nextidp will be updated for the next - * iteration. - * - * This function can be called under rcu_read_lock(), given that the leaf - * pointers lifetimes are correctly managed. - */ -void *idr_get_next(struct idr *idp, int *nextidp) -{ - struct idr_layer *p, *pa[MAX_IDR_LEVEL + 1]; - struct idr_layer **paa = &pa[0]; - int id = *nextidp; - int n, max; - - /* find first ent */ - p = *paa = rcu_dereference_raw(idp->top); - if (!p) - return NULL; - n = (p->layer + 1) * IDR_BITS; - max = idr_max(p->layer + 1); - - while (id >= 0 && id <= max) { - p = *paa; - while (n > 0 && p) { - n -= IDR_BITS; - p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]); - *++paa = p; - } - - if (p) { - *nextidp = id; - return p; - } - - /* - * Proceed to the next layer at the current level. Unlike - * idr_for_each(), @id isn't guaranteed to be aligned to - * layer boundary at this point and adding 1 << n may - * incorrectly skip IDs. Make sure we jump to the - * beginning of the next layer using round_up(). - */ - id = round_up(id + 1, 1 << n); - while (n < fls(id)) { - n += IDR_BITS; - --paa; - } - } - return NULL; -} -EXPORT_SYMBOL(idr_get_next); - - -/** - * idr_replace - replace pointer for given id - * @idp: idr handle - * @ptr: pointer you want associated with the id - * @id: lookup key - * - * Replace the pointer registered with an id and return the old value. - * A %-ENOENT return indicates that @id was not found. - * A %-EINVAL return indicates that @id was not within valid constraints. - * - * The caller must serialize with writers. - */ -void *idr_replace(struct idr *idp, void *ptr, int id) -{ - int n; - struct idr_layer *p, *old_p; - - if (id < 0) - return ERR_PTR(-EINVAL); - - p = idp->top; - if (!p) - return ERR_PTR(-ENOENT); - - if (id > idr_max(p->layer + 1)) - return ERR_PTR(-ENOENT); - - n = p->layer * IDR_BITS; - while ((n > 0) && p) { - p = p->ary[(id >> n) & IDR_MASK]; - n -= IDR_BITS; - } - - n = id & IDR_MASK; - if (unlikely(p == NULL || !test_bit(n, p->bitmap))) - return ERR_PTR(-ENOENT); - - old_p = p->ary[n]; - rcu_assign_pointer(p->ary[n], ptr); - - return old_p; -} -EXPORT_SYMBOL(idr_replace); - -void __init idr_init_cache(void) -{ - idr_layer_cache = kmem_cache_create("idr_layer_cache", - sizeof(struct idr_layer), 0, SLAB_PANIC, NULL); -} - -/** - * idr_init - initialize idr handle - * @idp: idr handle - * - * This function is use to set up the handle (@idp) that you will pass - * to the rest of the functions. - */ -void idr_init(struct idr *idp) -{ - memset(idp, 0, sizeof(struct idr)); - spin_lock_init(&idp->lock); -} -EXPORT_SYMBOL(idr_init); - -static int idr_has_entry(int id, void *p, void *data) -{ - return 1; -} - -bool idr_is_empty(struct idr *idp) -{ - return !idr_for_each(idp, idr_has_entry, NULL); -} -EXPORT_SYMBOL(idr_is_empty); - -/** - * DOC: IDA description - * IDA - IDR based ID allocator - * - * This is id allocator without id -> pointer translation. Memory - * usage is much lower than full blown idr because each id only - * occupies a bit. ida uses a custom leaf node which contains - * IDA_BITMAP_BITS slots. - * - * 2007-04-25 written by Tejun Heo <htejun@gmail.com> - */ - -static void free_bitmap(struct ida *ida, struct ida_bitmap *bitmap) -{ - unsigned long flags; - - if (!ida->free_bitmap) { - spin_lock_irqsave(&ida->idr.lock, flags); - if (!ida->free_bitmap) { - ida->free_bitmap = bitmap; - bitmap = NULL; - } - spin_unlock_irqrestore(&ida->idr.lock, flags); - } - - kfree(bitmap); -} - -/** - * ida_pre_get - reserve resources for ida allocation - * @ida: ida handle - * @gfp_mask: memory allocation flag - * - * This function should be called prior to locking and calling the - * following function. It preallocates enough memory to satisfy the - * worst possible allocation. - * - * If the system is REALLY out of memory this function returns %0, - * otherwise %1. - */ -int ida_pre_get(struct ida *ida, gfp_t gfp_mask) -{ - /* allocate idr_layers */ - if (!__idr_pre_get(&ida->idr, gfp_mask)) - return 0; - - /* allocate free_bitmap */ - if (!ida->free_bitmap) { - struct ida_bitmap *bitmap; - - bitmap = kmalloc(sizeof(struct ida_bitmap), gfp_mask); - if (!bitmap) - return 0; - - free_bitmap(ida, bitmap); - } - - return 1; -} -EXPORT_SYMBOL(ida_pre_get); - -/** - * ida_get_new_above - allocate new ID above or equal to a start id - * @ida: ida handle - * @starting_id: id to start search at - * @p_id: pointer to the allocated handle - * - * Allocate new ID above or equal to @starting_id. It should be called - * with any required locks. - * - * If memory is required, it will return %-EAGAIN, you should unlock - * and go back to the ida_pre_get() call. If the ida is full, it will - * return %-ENOSPC. - * - * Note that callers must ensure that concurrent access to @ida is not possible. - * See ida_simple_get() for a varaint which takes care of locking. - * - * @p_id returns a value in the range @starting_id ... %0x7fffffff. - */ -int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) -{ - struct idr_layer *pa[MAX_IDR_LEVEL + 1]; - struct ida_bitmap *bitmap; - unsigned long flags; - int idr_id = starting_id / IDA_BITMAP_BITS; - int offset = starting_id % IDA_BITMAP_BITS; - int t, id; - - restart: - /* get vacant slot */ - t = idr_get_empty_slot(&ida->idr, idr_id, pa, 0, &ida->idr); - if (t < 0) - return t == -ENOMEM ? -EAGAIN : t; - - if (t * IDA_BITMAP_BITS >= MAX_IDR_BIT) - return -ENOSPC; - - if (t != idr_id) - offset = 0; - idr_id = t; - - /* if bitmap isn't there, create a new one */ - bitmap = (void *)pa[0]->ary[idr_id & IDR_MASK]; - if (!bitmap) { - spin_lock_irqsave(&ida->idr.lock, flags); - bitmap = ida->free_bitmap; - ida->free_bitmap = NULL; - spin_unlock_irqrestore(&ida->idr.lock, flags); - - if (!bitmap) - return -EAGAIN; - - memset(bitmap, 0, sizeof(struct ida_bitmap)); - rcu_assign_pointer(pa[0]->ary[idr_id & IDR_MASK], - (void *)bitmap); - pa[0]->count++; - } - - /* lookup for empty slot */ - t = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, offset); - if (t == IDA_BITMAP_BITS) { - /* no empty slot after offset, continue to the next chunk */ - idr_id++; - offset = 0; - goto restart; - } - - id = idr_id * IDA_BITMAP_BITS + t; - if (id >= MAX_IDR_BIT) - return -ENOSPC; - - __set_bit(t, bitmap->bitmap); - if (++bitmap->nr_busy == IDA_BITMAP_BITS) - idr_mark_full(pa, idr_id); - - *p_id = id; - - /* Each leaf node can handle nearly a thousand slots and the - * whole idea of ida is to have small memory foot print. - * Throw away extra resources one by one after each successful - * allocation. - */ - if (ida->idr.id_free_cnt || ida->free_bitmap) { - struct idr_layer *p = get_from_free_list(&ida->idr); - if (p) - kmem_cache_free(idr_layer_cache, p); - } - - return 0; -} -EXPORT_SYMBOL(ida_get_new_above); - -/** - * ida_remove - remove the given ID - * @ida: ida handle - * @id: ID to free - */ -void ida_remove(struct ida *ida, int id) -{ - struct idr_layer *p = ida->idr.top; - int shift = (ida->idr.layers - 1) * IDR_BITS; - int idr_id = id / IDA_BITMAP_BITS; - int offset = id % IDA_BITMAP_BITS; - int n; - struct ida_bitmap *bitmap; - - if (idr_id > idr_max(ida->idr.layers)) - goto err; - - /* clear full bits while looking up the leaf idr_layer */ - while ((shift > 0) && p) { - n = (idr_id >> shift) & IDR_MASK; - __clear_bit(n, p->bitmap); - p = p->ary[n]; - shift -= IDR_BITS; - } - - if (p == NULL) - goto err; - - n = idr_id & IDR_MASK; - __clear_bit(n, p->bitmap); - - bitmap = (void *)p->ary[n]; - if (!bitmap || !test_bit(offset, bitmap->bitmap)) - goto err; - - /* update bitmap and remove it if empty */ - __clear_bit(offset, bitmap->bitmap); - if (--bitmap->nr_busy == 0) { - __set_bit(n, p->bitmap); /* to please idr_remove() */ - idr_remove(&ida->idr, idr_id); - free_bitmap(ida, bitmap); - } - - return; - - err: - WARN(1, "ida_remove called for id=%d which is not allocated.\n", id); -} -EXPORT_SYMBOL(ida_remove); - -/** - * ida_destroy - release all cached layers within an ida tree - * @ida: ida handle - */ -void ida_destroy(struct ida *ida) -{ - idr_destroy(&ida->idr); - kfree(ida->free_bitmap); -} -EXPORT_SYMBOL(ida_destroy); - /** * ida_simple_get - get a new id. * @ida: the (initialized) ida. @@ -1141,18 +78,3 @@ void ida_simple_remove(struct ida *ida, unsigned int id) spin_unlock_irqrestore(&simple_ida_lock, flags); } EXPORT_SYMBOL(ida_simple_remove); - -/** - * ida_init - initialize ida handle - * @ida: ida handle - * - * This function is use to set up the handle (@ida) that you will pass - * to the rest of the functions. - */ -void ida_init(struct ida *ida) -{ - memset(ida, 0, sizeof(struct ida)); - idr_init(&ida->idr); - -} -EXPORT_SYMBOL(ida_init); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index e0290d11331a..5e8fc32697b1 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -22,19 +22,21 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include <linux/bitmap.h> +#include <linux/bitops.h> +#include <linux/cpu.h> #include <linux/errno.h> +#include <linux/export.h> +#include <linux/idr.h> #include <linux/init.h> #include <linux/kernel.h> -#include <linux/export.h> -#include <linux/radix-tree.h> +#include <linux/kmemleak.h> #include <linux/percpu.h> +#include <linux/preempt.h> /* in_interrupt() */ +#include <linux/radix-tree.h> +#include <linux/rcupdate.h> #include <linux/slab.h> -#include <linux/kmemleak.h> -#include <linux/cpu.h> #include <linux/string.h> -#include <linux/bitops.h> -#include <linux/rcupdate.h> -#include <linux/preempt.h> /* in_interrupt() */ /* Number of nodes in fully populated tree of given height */ @@ -59,6 +61,15 @@ static struct kmem_cache *radix_tree_node_cachep; #define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1) /* + * The IDR does not have to be as high since it can only store a 31-bit integer + * at its maximum height + */ +#define IDR_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(int) - 1) +#define IDR_MAX_PATH (DIV_ROUND_UP(IDR_INDEX_BITS, \ + RADIX_TREE_MAP_SHIFT)) +#define IDR_PRELOAD_SIZE (IDR_MAX_PATH * 2 - 1) + +/* * Per-cpu pool of preloaded nodes */ struct radix_tree_preload { @@ -68,6 +79,11 @@ struct radix_tree_preload { }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +static inline struct radix_tree_node *entry_to_node(void *ptr) +{ + return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); +} + static inline void *node_to_entry(void *ptr) { return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); @@ -141,27 +157,38 @@ static inline int tag_get(struct radix_tree_node *node, unsigned int tag, static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag) { - root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT)); + root->gfp_mask |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag) { - root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT)); + root->gfp_mask &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear_all(struct radix_tree_root *root) { - root->gfp_mask &= __GFP_BITS_MASK; + root->gfp_mask &= (1 << ROOT_TAG_SHIFT) - 1; } static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag) { - return (__force int)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT)); + return (__force int)root->gfp_mask & (1 << (tag + ROOT_TAG_SHIFT)); } static inline unsigned root_tags_get(struct radix_tree_root *root) { - return (__force unsigned)root->gfp_mask >> __GFP_BITS_SHIFT; + return (__force unsigned)root->gfp_mask >> ROOT_TAG_SHIFT; +} + +/* + * IDRs do not expose the tagging functionality of the radix tree to their + * users. Reuse tag 0 to track whether a node has free space below it. + */ +#define IDR_FREE 0 + +static inline bool is_idr(struct radix_tree_root *root) +{ + return (__force unsigned)root->gfp_mask & ROOT_IS_IDR; } /* @@ -178,6 +205,11 @@ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) return 0; } +static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag) +{ + bitmap_fill(node->tags[tag], RADIX_TREE_MAP_SIZE); +} + /** * radix_tree_find_next_bit - find the next set bit in a memory region * @@ -190,13 +222,12 @@ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) * Returns next bit offset, or size if nothing found. */ static __always_inline unsigned long -radix_tree_find_next_bit(const unsigned long *addr, - unsigned long size, unsigned long offset) +radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, + unsigned long offset) { - if (!__builtin_constant_p(size)) - return find_next_bit(addr, size, offset); + const unsigned long *addr = node->tags[tag]; - if (offset < size) { + if (offset < RADIX_TREE_MAP_SIZE) { unsigned long tmp; addr += offset / BITS_PER_LONG; @@ -204,14 +235,39 @@ radix_tree_find_next_bit(const unsigned long *addr, if (tmp) return __ffs(tmp) + offset; offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); - while (offset < size) { + while (offset < RADIX_TREE_MAP_SIZE) { tmp = *++addr; if (tmp) return __ffs(tmp) + offset; offset += BITS_PER_LONG; } } - return size; + return RADIX_TREE_MAP_SIZE; +} + +static unsigned int iter_offset(const struct radix_tree_iter *iter) +{ + return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK; +} + +/* + * The maximum index which can be stored in a radix tree + */ +static inline unsigned long shift_maxindex(unsigned int shift) +{ + return (RADIX_TREE_MAP_SIZE << shift) - 1; +} + +static inline unsigned long node_maxindex(struct radix_tree_node *node) +{ + return shift_maxindex(node->shift); +} + +static unsigned long next_index(unsigned long index, + struct radix_tree_node *node, + unsigned long offset) +{ + return (index & ~node_maxindex(node)) + (offset << node->shift); } #ifndef __KERNEL__ @@ -219,10 +275,11 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n", - node, node->offset, + pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n", + node, node->offset, index, index | node_maxindex(node), + node->parent, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->shift, node->count, node->exceptional, node->parent); + node->shift, node->count, node->exceptional); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << node->shift); @@ -230,14 +287,16 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) void *entry = node->slots[i]; if (!entry) continue; - if (is_sibling_entry(node, entry)) { - pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n", - entry, i, - *(void **)entry_to_node(entry), - first, last); + if (entry == RADIX_TREE_RETRY) { + pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n", + i, first, last, node); } else if (!radix_tree_is_internal_node(entry)) { - pr_debug("radix entry %p offset %ld indices %ld-%ld\n", - entry, i, first, last); + pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n", + entry, i, first, last, node); + } else if (is_sibling_entry(node, entry)) { + pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n", + entry, i, first, last, node, + *(void **)entry_to_node(entry)); } else { dump_node(entry_to_node(entry), first); } @@ -249,11 +308,47 @@ static void radix_tree_dump(struct radix_tree_root *root) { pr_debug("radix root: %p rnode %p tags %x\n", root, root->rnode, - root->gfp_mask >> __GFP_BITS_SHIFT); + root->gfp_mask >> ROOT_TAG_SHIFT); if (!radix_tree_is_internal_node(root->rnode)) return; dump_node(entry_to_node(root->rnode), 0); } + +static void dump_ida_node(void *entry, unsigned long index) +{ + unsigned long i; + + if (!entry) + return; + + if (radix_tree_is_internal_node(entry)) { + struct radix_tree_node *node = entry_to_node(entry); + + pr_debug("ida node: %p offset %d indices %lu-%lu parent %p free %lx shift %d count %d\n", + node, node->offset, index, index | node_maxindex(node), + node->parent, node->tags[0][0], node->shift, + node->count); + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) + dump_ida_node(node->slots[i], + index | (i << node->shift)); + } else { + struct ida_bitmap *bitmap = entry; + + pr_debug("ida btmp: %p index %lu data", bitmap, index); + for (i = 0; i < IDA_BITMAP_LONGS; i++) + pr_cont(" %lx", bitmap->bitmap[i]); + pr_cont("\n"); + } +} + +static void ida_dump(struct ida *ida) +{ + struct radix_tree_root *root = &ida->ida_rt; + pr_debug("ida: %p %p free %d bitmap %p\n", ida, root->rnode, + root->gfp_mask >> ROOT_TAG_SHIFT, + ida->free_bitmap); + dump_ida_node(root->rnode, 0); +} #endif /* @@ -261,10 +356,9 @@ static void radix_tree_dump(struct radix_tree_root *root) * that the caller has pinned this thread of control to the current CPU. */ static struct radix_tree_node * -radix_tree_node_alloc(struct radix_tree_root *root) +radix_tree_node_alloc(gfp_t gfp_mask) { struct radix_tree_node *ret = NULL; - gfp_t gfp_mask = root_gfp_mask(root); /* * Preload code isn't irq safe and it doesn't make sense to use @@ -313,17 +407,15 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); - int i; /* - * must only free zeroed nodes into the slab. radix_tree_shrink - * can leave us with a non-NULL entry in the first slot, so clear - * that here to make sure. + * Must only free zeroed nodes into the slab. We can be left with + * non-NULL entries by radix_tree_free_nodes, so clear the entries + * and tags here. */ - for (i = 0; i < RADIX_TREE_MAX_TAGS; i++) - tag_clear(node, i, 0); - - node->slots[0] = NULL; + memset(node->slots, 0, sizeof(node->slots)); + memset(node->tags, 0, sizeof(node->tags)); + INIT_LIST_HEAD(&node->private_list); kmem_cache_free(radix_tree_node_cachep, node); } @@ -343,7 +435,7 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ -static int __radix_tree_preload(gfp_t gfp_mask, int nr) +static int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) { struct radix_tree_preload *rtp; struct radix_tree_node *node; @@ -409,6 +501,28 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) } EXPORT_SYMBOL(radix_tree_maybe_preload); +#ifdef CONFIG_RADIX_TREE_MULTIORDER +/* + * Preload with enough objects to ensure that we can split a single entry + * of order @old_order into many entries of size @new_order + */ +int radix_tree_split_preload(unsigned int old_order, unsigned int new_order, + gfp_t gfp_mask) +{ + unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT); + unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) - + (new_order / RADIX_TREE_MAP_SHIFT); + unsigned nr = 0; + + WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); + BUG_ON(new_order >= old_order); + + while (layers--) + nr = nr * RADIX_TREE_MAP_SIZE + 1; + return __radix_tree_preload(gfp_mask, top * nr); +} +#endif + /* * The same as function above, but preload number of nodes required to insert * (1 << order) continuous naturally-aligned elements. @@ -454,19 +568,6 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) return __radix_tree_preload(gfp_mask, nr_nodes); } -/* - * The maximum index which can be stored in a radix tree - */ -static inline unsigned long shift_maxindex(unsigned int shift) -{ - return (RADIX_TREE_MAP_SIZE << shift) - 1; -} - -static inline unsigned long node_maxindex(struct radix_tree_node *node) -{ - return shift_maxindex(node->shift); -} - static unsigned radix_tree_load_root(struct radix_tree_root *root, struct radix_tree_node **nodep, unsigned long *maxindex) { @@ -487,7 +588,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, /* * Extend a radix tree so it can store key @index. */ -static int radix_tree_extend(struct radix_tree_root *root, +static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp_mask, unsigned long index, unsigned int shift) { struct radix_tree_node *slot; @@ -504,15 +605,22 @@ static int radix_tree_extend(struct radix_tree_root *root, goto out; do { - struct radix_tree_node *node = radix_tree_node_alloc(root); + struct radix_tree_node *node = radix_tree_node_alloc(gfp_mask); if (!node) return -ENOMEM; - /* Propagate the aggregated tag info into the new root */ - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (root_tag_get(root, tag)) - tag_set(node, tag, 0); + if (is_idr(root)) { + all_tag_set(node, IDR_FREE); + if (!root_tag_get(root, IDR_FREE)) + tag_clear(node, IDR_FREE, 0); + root_tag_set(root, IDR_FREE); + } else { + /* Propagate the aggregated tag info to the new child */ + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { + if (root_tag_get(root, tag)) + tag_set(node, tag, 0); + } } BUG_ON(shift > BITS_PER_LONG); @@ -576,6 +684,8 @@ static inline void radix_tree_shrink(struct radix_tree_root *root, * one (root->rnode) as far as dependent read barriers go. */ root->rnode = child; + if (is_idr(root) && !tag_get(node, IDR_FREE, 0)) + root_tag_clear(root, IDR_FREE); /* * We have a dilemma here. The node's slot[0] must not be @@ -624,7 +734,12 @@ static void delete_node(struct radix_tree_root *root, parent->slots[node->offset] = NULL; parent->count--; } else { - root_tag_clear_all(root); + /* + * Shouldn't the tags already have all been cleared + * by the caller? + */ + if (!is_idr(root)) + root_tag_clear_all(root); root->rnode = NULL; } @@ -634,26 +749,9 @@ static void delete_node(struct radix_tree_root *root, } while (node); } -/** - * __radix_tree_create - create a slot in a radix tree - * @root: radix tree root - * @index: index key - * @order: index occupies 2^order aligned slots - * @nodep: returns node - * @slotp: returns slot - * - * Create, if necessary, and return the node and slot for an item - * at position @index in the radix tree @root. - * - * Until there is more than one item in the tree, no nodes are - * allocated and @root->rnode is used as a direct slot instead of - * pointing to a node, in which case *@nodep will be NULL. - * - * Returns -ENOMEM, or 0 for success. - */ -int __radix_tree_create(struct radix_tree_root *root, unsigned long index, - unsigned order, struct radix_tree_node **nodep, - void ***slotp) +static int _radix_tree_create(struct radix_tree_root *root, gfp_t gfp_mask, + unsigned long index, unsigned int order, + struct radix_tree_node **nodep, void ***slotp) { struct radix_tree_node *node = NULL, *child; void **slot = (void **)&root->rnode; @@ -664,25 +762,27 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, shift = radix_tree_load_root(root, &child, &maxindex); /* Make sure the tree is high enough. */ + if (order > 0 && max == ((1UL << order) - 1)) + max++; if (max > maxindex) { - int error = radix_tree_extend(root, max, shift); + int error = radix_tree_extend(root, gfp_mask, max, shift); if (error < 0) return error; shift = error; child = root->rnode; - if (order == shift) - shift += RADIX_TREE_MAP_SHIFT; } while (shift > order) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ - child = radix_tree_node_alloc(root); + child = radix_tree_node_alloc(gfp_mask); if (!child) return -ENOMEM; child->shift = shift; child->offset = offset; + child->count = 0; + child->exceptional = 0; child->parent = node; rcu_assign_pointer(*slot, node_to_entry(child)); if (node) @@ -696,30 +796,149 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, slot = &node->slots[offset]; } + if (nodep) + *nodep = node; + if (slotp) + *slotp = slot; + return 0; +} + +/* + * Free any nodes below this node. The tree is presumed to not need + * shrinking, and any user data in the tree is presumed to not need a + * destructor called on it. If we need to add a destructor, we can + * add that functionality later. Note that we may not clear tags or + * slots from the tree as an RCU walker may still have a pointer into + * this subtree. We could replace the entries with RADIX_TREE_RETRY, + * but we'll still have to clear those in rcu_free. + */ +static void radix_tree_free_nodes(struct radix_tree_node *node) +{ + unsigned offset = 0; + struct radix_tree_node *child = entry_to_node(node); + + for (;;) { + void *entry = child->slots[offset]; + if (radix_tree_is_internal_node(entry) && + !is_sibling_entry(child, entry)) { + child = entry_to_node(entry); + offset = 0; + continue; + } + offset++; + while (offset == RADIX_TREE_MAP_SIZE) { + struct radix_tree_node *old = child; + offset = child->offset + 1; + child = child->parent; + radix_tree_node_free(old); + if (old == entry_to_node(node)) + return; + } + } +} + #ifdef CONFIG_RADIX_TREE_MULTIORDER - /* Insert pointers to the canonical entry */ - if (order > shift) { - unsigned i, n = 1 << (order - shift); +static inline int insert_entries(struct radix_tree_node *node, void **slot, + void *item, unsigned order, bool replace) +{ + struct radix_tree_node *child; + unsigned i, n, tag, offset, tags = 0; + + if (node) { + if (order > node->shift) + n = 1 << (order - node->shift); + else + n = 1; + offset = get_slot_offset(node, slot); + } else { + n = 1; + offset = 0; + } + + if (n > 1) { offset = offset & ~(n - 1); slot = &node->slots[offset]; - child = node_to_entry(slot); - for (i = 0; i < n; i++) { - if (slot[i]) + } + child = node_to_entry(slot); + + for (i = 0; i < n; i++) { + if (slot[i]) { + if (replace) { + node->count--; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tag_get(node, tag, offset + i)) + tags |= 1 << tag; + } else return -EEXIST; } + } - for (i = 1; i < n; i++) { + for (i = 0; i < n; i++) { + struct radix_tree_node *old = slot[i]; + if (i) { rcu_assign_pointer(slot[i], child); - node->count++; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_clear(node, tag, offset + i); + } else { + rcu_assign_pointer(slot[i], item); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(node, tag, offset); } + if (radix_tree_is_internal_node(old) && + !is_sibling_entry(node, old) && + (old != RADIX_TREE_RETRY)) + radix_tree_free_nodes(old); + if (radix_tree_exceptional_entry(old)) + node->exceptional--; + } + if (node) { + node->count += n; + if (radix_tree_exceptional_entry(item)) + node->exceptional += n; + } + return n; +} +#else +static inline int insert_entries(struct radix_tree_node *node, void **slot, + void *item, unsigned order, bool replace) +{ + if (*slot) + return -EEXIST; + rcu_assign_pointer(*slot, item); + if (node) { + node->count++; + if (radix_tree_exceptional_entry(item)) + node->exceptional++; } + return 1; +} #endif - if (nodep) - *nodep = node; - if (slotp) - *slotp = slot; - return 0; +/** + * __radix_tree_create - create a slot in a radix tree + * @root: radix tree root + * @index: index key + * @order: index occupies 2^order aligned slots + * @nodep: returns node + * @slotp: returns slot + * + * Create, if necessary, and return the node and slot for an item + * at position @index in the radix tree @root. + * + * Until there is more than one item in the tree, no nodes are + * allocated and @root->rnode is used as a direct slot instead of + * pointing to a node, in which case *@nodep will be NULL. + * + * Returns -ENOMEM, or 0 for success. + */ +int __radix_tree_create(struct radix_tree_root *root, unsigned long index, + unsigned order, struct radix_tree_node **nodep, + void ***slotp) +{ + return _radix_tree_create(root, root_gfp_mask(root), index, order, + nodep, slotp); } /** @@ -743,15 +962,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, error = __radix_tree_create(root, index, order, &node, &slot); if (error) return error; - if (*slot != NULL) - return -EEXIST; - rcu_assign_pointer(*slot, item); + + error = insert_entries(node, slot, item, order, false); + if (error < 0) + return error; if (node) { unsigned offset = get_slot_offset(node, slot); - node->count++; - if (radix_tree_exceptional_entry(item)) - node->exceptional++; BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); @@ -849,6 +1066,24 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); +static inline int slot_count(struct radix_tree_node *node, + void **slot) +{ + int n = 1; +#ifdef CONFIG_RADIX_TREE_MULTIORDER + void *ptr = node_to_entry(slot); + unsigned offset = get_slot_offset(node, slot); + int i; + + for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[offset + i] != ptr) + break; + n++; + } +#endif + return n; +} + static void replace_slot(struct radix_tree_root *root, struct radix_tree_node *node, void **slot, void *item, @@ -867,12 +1102,35 @@ static void replace_slot(struct radix_tree_root *root, if (node) { node->count += count; - node->exceptional += exceptional; + if (exceptional) { + exceptional *= slot_count(node, slot); + node->exceptional += exceptional; + } } rcu_assign_pointer(*slot, item); } +static inline void delete_sibling_entries(struct radix_tree_node *node, + void **slot) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + bool exceptional = radix_tree_exceptional_entry(*slot); + void *ptr = node_to_entry(slot); + unsigned offset = get_slot_offset(node, slot); + int i; + + for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[offset + i] != ptr) + break; + node->slots[offset + i] = NULL; + node->count--; + if (exceptional) + node->exceptional--; + } +#endif +} + /** * __radix_tree_replace - replace item in a slot * @root: radix tree root @@ -890,6 +1148,8 @@ void __radix_tree_replace(struct radix_tree_root *root, void **slot, void *item, radix_tree_update_node_t update_node, void *private) { + if (!item) + delete_sibling_entries(node, slot); /* * This function supports replacing exceptional entries and * deleting entries, but that needs accounting against the @@ -920,7 +1180,8 @@ void __radix_tree_replace(struct radix_tree_root *root, * NOTE: This cannot be used to switch between non-entries (empty slots), * regular entries, and exceptional entries, as that requires accounting * inside the radix tree node. When switching from one type of entry or - * deleting, use __radix_tree_lookup() and __radix_tree_replace(). + * deleting, use __radix_tree_lookup() and __radix_tree_replace() or + * radix_tree_iter_replace(). */ void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item) @@ -929,6 +1190,167 @@ void radix_tree_replace_slot(struct radix_tree_root *root, } /** + * radix_tree_iter_replace - replace item in a slot + * @root: radix tree root + * @slot: pointer to slot + * @item: new item to store in the slot. + * + * For use with radix_tree_split() and radix_tree_for_each_slot(). + * Caller must hold tree write locked across split and replacement. + */ +void radix_tree_iter_replace(struct radix_tree_root *root, + const struct radix_tree_iter *iter, void **slot, void *item) +{ + __radix_tree_replace(root, iter->node, slot, item, NULL, NULL); +} + +#ifdef CONFIG_RADIX_TREE_MULTIORDER +/** + * radix_tree_join - replace multiple entries with one multiorder entry + * @root: radix tree root + * @index: an index inside the new entry + * @order: order of the new entry + * @item: new entry + * + * Call this function to replace several entries with one larger entry. + * The existing entries are presumed to not need freeing as a result of + * this call. + * + * The replacement entry will have all the tags set on it that were set + * on any of the entries it is replacing. + */ +int radix_tree_join(struct radix_tree_root *root, unsigned long index, + unsigned order, void *item) +{ + struct radix_tree_node *node; + void **slot; + int error; + + BUG_ON(radix_tree_is_internal_node(item)); + + error = __radix_tree_create(root, index, order, &node, &slot); + if (!error) + error = insert_entries(node, slot, item, order, true); + if (error > 0) + error = 0; + + return error; +} + +/** + * radix_tree_split - Split an entry into smaller entries + * @root: radix tree root + * @index: An index within the large entry + * @order: Order of new entries + * + * Call this function as the first step in replacing a multiorder entry + * with several entries of lower order. After this function returns, + * loop over the relevant portion of the tree using radix_tree_for_each_slot() + * and call radix_tree_iter_replace() to set up each new entry. + * + * The tags from this entry are replicated to all the new entries. + * + * The radix tree should be locked against modification during the entire + * replacement operation. Lock-free lookups will see RADIX_TREE_RETRY which + * should prompt RCU walkers to restart the lookup from the root. + */ +int radix_tree_split(struct radix_tree_root *root, unsigned long index, + unsigned order) +{ + struct radix_tree_node *parent, *node, *child; + void **slot; + unsigned int offset, end; + unsigned n, tag, tags = 0; + gfp_t gfp = root_gfp_mask(root); + + if (!__radix_tree_lookup(root, index, &parent, &slot)) + return -ENOENT; + if (!parent) + return -ENOENT; + + offset = get_slot_offset(parent, slot); + + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tag_get(parent, tag, offset)) + tags |= 1 << tag; + + for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) { + if (!is_sibling_entry(parent, parent->slots[end])) + break; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(parent, tag, end); + /* rcu_assign_pointer ensures tags are set before RETRY */ + rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY); + } + rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY); + parent->exceptional -= (end - offset); + + if (order == parent->shift) + return 0; + if (order > parent->shift) { + while (offset < end) + offset += insert_entries(parent, &parent->slots[offset], + RADIX_TREE_RETRY, order, true); + return 0; + } + + node = parent; + + for (;;) { + if (node->shift > order) { + child = radix_tree_node_alloc(gfp); + if (!child) + goto nomem; + child->shift = node->shift - RADIX_TREE_MAP_SHIFT; + child->offset = offset; + child->count = 0; + child->parent = node; + if (node != parent) { + node->count++; + node->slots[offset] = node_to_entry(child); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(node, tag, offset); + } + + node = child; + offset = 0; + continue; + } + + n = insert_entries(node, &node->slots[offset], + RADIX_TREE_RETRY, order, false); + BUG_ON(n > RADIX_TREE_MAP_SIZE); + + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(node, tag, offset); + offset += n; + + while (offset == RADIX_TREE_MAP_SIZE) { + if (node == parent) + break; + offset = node->offset; + child = node; + node = node->parent; + rcu_assign_pointer(node->slots[offset], + node_to_entry(child)); + offset++; + } + if ((node == parent) && (offset == end)) + return 0; + } + + nomem: + /* Shouldn't happen; did user forget to preload? */ + /* TODO: free all the allocated nodes */ + WARN_ON(1); + return -ENOMEM; +} +#endif + +/** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root * @index: index key @@ -989,6 +1411,34 @@ static void node_tag_clear(struct radix_tree_root *root, root_tag_clear(root, tag); } +static void node_tag_set(struct radix_tree_root *root, + struct radix_tree_node *node, + unsigned int tag, unsigned int offset) +{ + while (node) { + if (tag_get(node, tag, offset)) + return; + tag_set(node, tag, offset); + offset = node->offset; + node = node->parent; + } + + if (!root_tag_get(root, tag)) + root_tag_set(root, tag); +} + +/** + * radix_tree_iter_tag_set - set a tag on the current iterator entry + * @root: radix tree root + * @iter: iterator state + * @tag: tag to set + */ +void radix_tree_iter_tag_set(struct radix_tree_root *root, + const struct radix_tree_iter *iter, unsigned int tag) +{ + node_tag_set(root, iter->node, tag, iter_offset(iter)); +} + /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root @@ -1084,6 +1534,121 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter, #endif } +/* Construct iter->tags bit-mask from node->tags[tag] array */ +static void set_iter_tags(struct radix_tree_iter *iter, + struct radix_tree_node *node, unsigned offset, + unsigned tag) +{ + unsigned tag_long = offset / BITS_PER_LONG; + unsigned tag_bit = offset % BITS_PER_LONG; + + iter->tags = node->tags[tag][tag_long] >> tag_bit; + + /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ + if (tag_long < RADIX_TREE_TAG_LONGS - 1) { + /* Pick tags from next element */ + if (tag_bit) + iter->tags |= node->tags[tag][tag_long + 1] << + (BITS_PER_LONG - tag_bit); + /* Clip chunk size, here only BITS_PER_LONG tags */ + iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG); + } +} + +#ifdef CONFIG_RADIX_TREE_MULTIORDER +static void **skip_siblings(struct radix_tree_node **nodep, + void **slot, struct radix_tree_iter *iter) +{ + void *sib = node_to_entry(slot - 1); + + while (iter->index < iter->next_index) { + *nodep = rcu_dereference_raw(*slot); + if (*nodep && *nodep != sib) + return slot; + slot++; + iter->index = __radix_tree_iter_add(iter, 1); + iter->tags >>= 1; + } + + *nodep = NULL; + return NULL; +} + +void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, + unsigned flags) +{ + unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; + struct radix_tree_node *node = rcu_dereference_raw(*slot); + + slot = skip_siblings(&node, slot, iter); + + while (radix_tree_is_internal_node(node)) { + unsigned offset; + unsigned long next_index; + + if (node == RADIX_TREE_RETRY) + return slot; + node = entry_to_node(node); + iter->node = node; + iter->shift = node->shift; + + if (flags & RADIX_TREE_ITER_TAGGED) { + offset = radix_tree_find_next_bit(node, tag, 0); + if (offset == RADIX_TREE_MAP_SIZE) + return NULL; + slot = &node->slots[offset]; + iter->index = __radix_tree_iter_add(iter, offset); + set_iter_tags(iter, node, offset, tag); + node = rcu_dereference_raw(*slot); + } else { + offset = 0; + slot = &node->slots[0]; + for (;;) { + node = rcu_dereference_raw(*slot); + if (node) + break; + slot++; + offset++; + if (offset == RADIX_TREE_MAP_SIZE) + return NULL; + } + iter->index = __radix_tree_iter_add(iter, offset); + } + if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0)) + goto none; + next_index = (iter->index | shift_maxindex(iter->shift)) + 1; + if (next_index < iter->next_index) + iter->next_index = next_index; + } + + return slot; + none: + iter->next_index = 0; + return NULL; +} +EXPORT_SYMBOL(__radix_tree_next_slot); +#else +static void **skip_siblings(struct radix_tree_node **nodep, + void **slot, struct radix_tree_iter *iter) +{ + return slot; +} +#endif + +void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter) +{ + struct radix_tree_node *node; + + slot++; + iter->index = __radix_tree_iter_add(iter, 1); + node = rcu_dereference_raw(*slot); + skip_siblings(&node, slot, iter); + iter->next_index = iter->index; + iter->tags = 0; + return NULL; +} +EXPORT_SYMBOL(radix_tree_iter_resume); + /** * radix_tree_next_chunk - find next chunk of slots for iteration * @@ -1109,7 +1674,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. * * This condition also used by radix_tree_next_slot() to stop - * contiguous iterating, and forbid swithing to the next chunk. + * contiguous iterating, and forbid switching to the next chunk. */ index = iter->next_index; if (!index && iter->index) @@ -1127,6 +1692,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, iter->index = index; iter->next_index = maxindex + 1; iter->tags = 1; + iter->node = NULL; __set_iter_shift(iter, 0); return (void **)&root->rnode; } @@ -1142,9 +1708,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, return NULL; if (flags & RADIX_TREE_ITER_TAGGED) - offset = radix_tree_find_next_bit( - node->tags[tag], - RADIX_TREE_MAP_SIZE, + offset = radix_tree_find_next_bit(node, tag, offset + 1); else while (++offset < RADIX_TREE_MAP_SIZE) { @@ -1164,154 +1728,26 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, child = rcu_dereference_raw(node->slots[offset]); } - if ((child == NULL) || (child == RADIX_TREE_RETRY)) + if (!child) goto restart; + if (child == RADIX_TREE_RETRY) + break; } while (radix_tree_is_internal_node(child)); /* Update the iterator state */ iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); iter->next_index = (index | node_maxindex(node)) + 1; + iter->node = node; __set_iter_shift(iter, node->shift); - /* Construct iter->tags bit-mask from node->tags[tag] array */ - if (flags & RADIX_TREE_ITER_TAGGED) { - unsigned tag_long, tag_bit; - - tag_long = offset / BITS_PER_LONG; - tag_bit = offset % BITS_PER_LONG; - iter->tags = node->tags[tag][tag_long] >> tag_bit; - /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ - if (tag_long < RADIX_TREE_TAG_LONGS - 1) { - /* Pick tags from next element */ - if (tag_bit) - iter->tags |= node->tags[tag][tag_long + 1] << - (BITS_PER_LONG - tag_bit); - /* Clip chunk size, here only BITS_PER_LONG tags */ - iter->next_index = index + BITS_PER_LONG; - } - } + if (flags & RADIX_TREE_ITER_TAGGED) + set_iter_tags(iter, node, offset, tag); return node->slots + offset; } EXPORT_SYMBOL(radix_tree_next_chunk); /** - * radix_tree_range_tag_if_tagged - for each item in given range set given - * tag if item has another tag set - * @root: radix tree root - * @first_indexp: pointer to a starting index of a range to scan - * @last_index: last index of a range to scan - * @nr_to_tag: maximum number items to tag - * @iftag: tag index to test - * @settag: tag index to set if tested tag is set - * - * This function scans range of radix tree from first_index to last_index - * (inclusive). For each item in the range if iftag is set, the function sets - * also settag. The function stops either after tagging nr_to_tag items or - * after reaching last_index. - * - * The tags must be set from the leaf level only and propagated back up the - * path to the root. We must do this so that we resolve the full path before - * setting any tags on intermediate nodes. If we set tags as we descend, then - * we can get to the leaf node and find that the index that has the iftag - * set is outside the range we are scanning. This reults in dangling tags and - * can lead to problems with later tag operations (e.g. livelocks on lookups). - * - * The function returns the number of leaves where the tag was set and sets - * *first_indexp to the first unscanned index. - * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must - * be prepared to handle that. - */ -unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, - unsigned long *first_indexp, unsigned long last_index, - unsigned long nr_to_tag, - unsigned int iftag, unsigned int settag) -{ - struct radix_tree_node *parent, *node, *child; - unsigned long maxindex; - unsigned long tagged = 0; - unsigned long index = *first_indexp; - - radix_tree_load_root(root, &child, &maxindex); - last_index = min(last_index, maxindex); - if (index > last_index) - return 0; - if (!nr_to_tag) - return 0; - if (!root_tag_get(root, iftag)) { - *first_indexp = last_index + 1; - return 0; - } - if (!radix_tree_is_internal_node(child)) { - *first_indexp = last_index + 1; - root_tag_set(root, settag); - return 1; - } - - node = entry_to_node(child); - - for (;;) { - unsigned offset = radix_tree_descend(node, &child, index); - if (!child) - goto next; - if (!tag_get(node, iftag, offset)) - goto next; - /* Sibling slots never have tags set on them */ - if (radix_tree_is_internal_node(child)) { - node = entry_to_node(child); - continue; - } - - /* tag the leaf */ - tagged++; - tag_set(node, settag, offset); - - /* walk back up the path tagging interior nodes */ - parent = node; - for (;;) { - offset = parent->offset; - parent = parent->parent; - if (!parent) - break; - /* stop if we find a node with the tag already set */ - if (tag_get(parent, settag, offset)) - break; - tag_set(parent, settag, offset); - } - next: - /* Go to next entry in node */ - index = ((index >> node->shift) + 1) << node->shift; - /* Overflow can happen when last_index is ~0UL... */ - if (index > last_index || !index) - break; - offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; - while (offset == 0) { - /* - * We've fully scanned this node. Go up. Because - * last_index is guaranteed to be in the tree, what - * we do below cannot wander astray. - */ - node = node->parent; - offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; - } - if (is_sibling_entry(node, node->slots[offset])) - goto next; - if (tagged >= nr_to_tag) - break; - } - /* - * We need not to tag the root tag if there is no tag which is set with - * settag within the range from *first_indexp to last_index. - */ - if (tagged > 0) - root_tag_set(root, settag); - *first_indexp = index; - - return tagged; -} -EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); - -/** * radix_tree_gang_lookup - perform multiple lookup on a radix tree * @root: radix tree root * @results: where the results of the lookup are placed @@ -1476,105 +1912,6 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, } EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); -#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP) -#include <linux/sched.h> /* for cond_resched() */ - -struct locate_info { - unsigned long found_index; - bool stop; -}; - -/* - * This linear search is at present only useful to shmem_unuse_inode(). - */ -static unsigned long __locate(struct radix_tree_node *slot, void *item, - unsigned long index, struct locate_info *info) -{ - unsigned long i; - - do { - unsigned int shift = slot->shift; - - for (i = (index >> shift) & RADIX_TREE_MAP_MASK; - i < RADIX_TREE_MAP_SIZE; - i++, index += (1UL << shift)) { - struct radix_tree_node *node = - rcu_dereference_raw(slot->slots[i]); - if (node == RADIX_TREE_RETRY) - goto out; - if (!radix_tree_is_internal_node(node)) { - if (node == item) { - info->found_index = index; - info->stop = true; - goto out; - } - continue; - } - node = entry_to_node(node); - if (is_sibling_entry(slot, node)) - continue; - slot = node; - break; - } - } while (i < RADIX_TREE_MAP_SIZE); - -out: - if ((index == 0) && (i == RADIX_TREE_MAP_SIZE)) - info->stop = true; - return index; -} - -/** - * radix_tree_locate_item - search through radix tree for item - * @root: radix tree root - * @item: item to be found - * - * Returns index where item was found, or -1 if not found. - * Caller must hold no lock (since this time-consuming function needs - * to be preemptible), and must check afterwards if item is still there. - */ -unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) -{ - struct radix_tree_node *node; - unsigned long max_index; - unsigned long cur_index = 0; - struct locate_info info = { - .found_index = -1, - .stop = false, - }; - - do { - rcu_read_lock(); - node = rcu_dereference_raw(root->rnode); - if (!radix_tree_is_internal_node(node)) { - rcu_read_unlock(); - if (node == item) - info.found_index = 0; - break; - } - - node = entry_to_node(node); - - max_index = node_maxindex(node); - if (cur_index > max_index) { - rcu_read_unlock(); - break; - } - - cur_index = __locate(node, item, cur_index, &info); - rcu_read_unlock(); - cond_resched(); - } while (!info.stop && cur_index <= max_index); - - return info.found_index; -} -#else -unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) -{ - return -1; -} -#endif /* CONFIG_SHMEM && CONFIG_SWAP */ - /** * __radix_tree_delete_node - try to free node after clearing a slot * @root: radix tree root @@ -1590,18 +1927,14 @@ void __radix_tree_delete_node(struct radix_tree_root *root, delete_node(root, node, NULL, NULL); } -static inline void delete_sibling_entries(struct radix_tree_node *node, - void *ptr, unsigned offset) +static void radix_tree_iter_delete(struct radix_tree_root *root, + const struct radix_tree_iter *iter) { -#ifdef CONFIG_RADIX_TREE_MULTIORDER - int i; - for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { - if (node->slots[offset + i] != ptr) - break; - node->slots[offset + i] = NULL; - node->count--; - } -#endif + struct radix_tree_node *node = iter->node; + + node->slots[iter_offset(iter)] = NULL; + node->count--; + __radix_tree_delete_node(root, node); } /** @@ -1612,7 +1945,7 @@ static inline void delete_sibling_entries(struct radix_tree_node *node, * * Remove @item at @index from the radix tree rooted at @root. * - * Returns the address of the deleted item, or NULL if it was not present + * Returns the value of the deleted item, or NULL if it was not present * or the entry at the given @index was not @item. */ void *radix_tree_delete_item(struct radix_tree_root *root, @@ -1632,18 +1965,22 @@ void *radix_tree_delete_item(struct radix_tree_root *root, return NULL; if (!node) { - root_tag_clear_all(root); + if (is_idr(root)) + root_tag_set(root, IDR_FREE); + else + root_tag_clear_all(root); root->rnode = NULL; return entry; } offset = get_slot_offset(node, slot); - /* Clear all tags associated with the item to be deleted. */ - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - node_tag_clear(root, node, tag, offset); + if (is_idr(root)) + node_tag_set(root, node, IDR_FREE, offset); + else + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + node_tag_clear(root, node, tag, offset); - delete_sibling_entries(node, node_to_entry(slot), offset); __radix_tree_replace(root, node, slot, NULL, NULL, NULL); return entry; @@ -1657,7 +1994,7 @@ EXPORT_SYMBOL(radix_tree_delete_item); * * Remove the item at @index from the radix tree rooted at @root. * - * Returns the address of the deleted item, or NULL if it was not present. + * Returns the value of the deleted item, or NULL if it was not present. */ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) { @@ -1674,8 +2011,7 @@ void radix_tree_clear_tags(struct radix_tree_root *root, for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) node_tag_clear(root, node, tag, offset); } else { - /* Clear root node tags */ - root->gfp_mask &= __GFP_BITS_MASK; + root_tag_clear_all(root); } } @@ -1690,6 +2026,425 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag) } EXPORT_SYMBOL(radix_tree_tagged); +/** + * idr_preload - preload for idr_alloc() + * @gfp_mask: allocation mask to use for preloading + * + * Preallocate memory to use for the next call to idr_alloc(). This function + * returns with preemption disabled. It will be enabled by idr_preload_end(). + */ +void idr_preload(gfp_t gfp_mask) +{ + __radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE); +} +EXPORT_SYMBOL(idr_preload); + +static int __idr_get_empty(struct radix_tree_root *root, gfp_t gfp, + unsigned long start, int end, + struct radix_tree_node **nodep, void ***slotp) +{ + struct radix_tree_node *node = NULL, *child; + void **slot = (void **)&root->rnode; + unsigned long maxindex; + unsigned long max = end > 0 ? end - 1 : INT_MAX; + unsigned int shift, offset = 0; + + grow: + shift = radix_tree_load_root(root, &child, &maxindex); + if (!radix_tree_tagged(root, IDR_FREE)) + start = max(start, maxindex + 1); + if (start > max) + return -ENOSPC; + + if (start > maxindex) { + int error = radix_tree_extend(root, gfp, start, shift); + if (error < 0) + return error; + shift = error; + child = root->rnode; + } + + while (shift) { + shift -= RADIX_TREE_MAP_SHIFT; + if (child == NULL) { + /* Have to add a child node. */ + child = radix_tree_node_alloc(gfp); + if (!child) + return -ENOMEM; + child->shift = shift; + child->offset = offset; + child->parent = node; + all_tag_set(child, IDR_FREE); + rcu_assign_pointer(*slot, node_to_entry(child)); + if (node) + node->count++; + } else if (!radix_tree_is_internal_node(child)) + break; + + node = entry_to_node(child); + offset = radix_tree_descend(node, &child, start); + if (!tag_get(node, IDR_FREE, offset)) { + offset = radix_tree_find_next_bit(node, IDR_FREE, + offset + 1); + start = next_index(start, node, offset); + if (start > max) + return -ENOSPC; + while (offset == RADIX_TREE_MAP_SIZE) { + offset = node->offset + 1; + node = node->parent; + if (!node) + goto grow; + shift = node->shift; + } + child = node->slots[offset]; + } + slot = &node->slots[offset]; + } + + *nodep = node; + *slotp = slot; + return start; +} + +/** + * idr_alloc - allocate an id + * @idr: idr handle + * @ptr: pointer to be associated with the new id + * @start: the minimum id (inclusive) + * @end: the maximum id (exclusive) + * @gfp: memory allocation flags + * + * Allocates an unused ID in the range [start, end). Returns -ENOSPC + * if there are no unused IDs in that range. + */ +int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp) +{ + struct radix_tree_node *node; + void **slot; + int id; + + if (WARN_ON_ONCE(start < 0)) + return -EINVAL; + + id = __idr_get_empty(&idr->idr_rt, gfp, start, end, &node, &slot); + if (id < 0) + return id; + + BUG_ON(radix_tree_is_internal_node(ptr)); + + rcu_assign_pointer(*slot, ptr); + if (node) + node->count++; + node_tag_clear(&idr->idr_rt, node, IDR_FREE, + get_slot_offset(node, slot)); + return id; +} +EXPORT_SYMBOL(idr_alloc); + +/** + * idr_alloc_cyclic - allocate new idr entry in a cyclical fashion + * @idr: idr handle + * @ptr: pointer to be associated with the new id + * @start: the minimum id (inclusive) + * @end: the maximum id (exclusive) + * @gfp: memory allocation flags + * + * Allocates an ID larger than the last ID allocated if one is available. + * If not, it will attempt to allocate the smallest ID that is larger or + * equal to @start. + */ +int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp) +{ + int id, curr = idr->idr_next; + + if (curr < start) + curr = start; + + id = idr_alloc(idr, ptr, curr, end, gfp); + if ((id == -ENOSPC) && (curr > start)) + id = idr_alloc(idr, ptr, start, curr, gfp); + + if (id >= 0) + idr->idr_next = id + 1U; + + return id; +} +EXPORT_SYMBOL(idr_alloc_cyclic); + +/** + * idr_for_each - iterate through all stored pointers + * @idr: idr handle + * @fn: function to be called for each pointer + * @data: data passed to callback function + * + * The callback function will be called for each entry in @idr, passing + * the id, the pointer and the data pointer passed to this function. + * + * If @fn returns anything other than %0, the iteration stops and that + * value is returned from this function. + * + * idr_for_each() can be called concurrently with idr_get_new() and + * idr_remove() if protected by RCU. Newly added entries may not be + * seen and deleted entries may be seen, but adding and removing entries + * will not cause other entries to be skipped, nor spurious ones to be seen. + */ +int idr_for_each(struct idr *idr, + int (*fn)(int id, void *p, void *data), void *data) +{ + struct radix_tree_iter iter; + void **slot; + + radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) { + int ret = fn(iter.index, *slot, data); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL(idr_for_each); + +/** + * idr_get_next - Find next populated entry + * @idr: idr handle + * @nextid: Pointer to lowest possible ID to return + * + * Returns the next populated entry in the tree with an ID greater than + * or equal to the value pointed to by @nextid. On exit, @nextid is updated + * to the ID of the found value. To use in a loop, the value pointed to by + * nextid must be incremented by the user. + */ +void *idr_get_next(struct idr *idr, int *nextid) +{ + struct radix_tree_iter iter; + void **slot; + + radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, *nextid) { + *nextid = iter.index; + return *slot; + } + + return NULL; +} +EXPORT_SYMBOL(idr_get_next); + +/** + * idr_replace - replace pointer for given id + * @idr: idr handle + * @ptr: New pointer to associate with the ID + * @id: Lookup key + * + * Replace the pointer registered with an id and return the old value. + * A %-ENOENT return indicates that @id was not found. + * A %-EINVAL return indicates that @id was not within valid constraints. + * + * This function can be called under the RCU read lock concurrently with + * idr_remove(). + */ +void *idr_replace(struct idr *idr, void *ptr, int id) +{ + struct radix_tree_node *node; + void **slot; + void *entry; + + if (id < 0) + return ERR_PTR(-EINVAL); + if (!ptr || radix_tree_is_internal_node(ptr)) + return ERR_PTR(-EINVAL); + + entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot); + + if (!entry) + return ERR_PTR(-ENOENT); + + __radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL, NULL); + + return entry; +} +EXPORT_SYMBOL(idr_replace); + +/** + * idr_destroy - release all internal memory from an IDR + * @idr: idr handle + * + * After this function is called, the IDR is empty, and may be reused or + * the data structure containing it may be freed. + * + * A typical clean-up sequence for objects stored in an idr tree will use + * idr_for_each() to free all objects, if necessary, then idr_destroy() to + * free the memory used to keep track of those objects. + */ +void idr_destroy(struct idr *idr) +{ + struct radix_tree_node **slot = &idr->idr_rt.rnode; + if (radix_tree_is_internal_node(*slot)) + radix_tree_free_nodes(*slot); + *slot = NULL; + root_tag_set(&idr->idr_rt, IDR_FREE); +} +EXPORT_SYMBOL(idr_destroy); + +/** + * ida_pre_get - reserve resources for ida allocation + * @ida: ida handle + * @gfp: memory allocation flags + * + * This function should be called before calling ida_get_new_above(). If it + * is unable to allocate memory, it will return %0. On success, it returns %1. + */ +int ida_pre_get(struct ida *ida, gfp_t gfp) +{ + struct ida_bitmap *bitmap; + + idr_preload(gfp); + idr_preload_end(); + + if (!ida->free_bitmap) { + bitmap = kmalloc(sizeof(struct ida_bitmap), gfp); + if (!bitmap) + return 0; + bitmap = xchg(&ida->free_bitmap, bitmap); + kfree(bitmap); + } + + return 1; +} +EXPORT_SYMBOL(ida_pre_get); + +/** + * ida_get_new_above - allocate new ID above or equal to a start id + * @ida: ida handle + * @starting_id: id to start search at + * @p_id: pointer to the allocated handle + * + * Allocate new ID above or equal to @starting_id. It should be called + * with any required locks to ensure that concurrent calls to + * ida_get_new_above() / ida_get_new() / ida_remove() are not allowed. + * Consider using ida_simple_get() if you do not have complex locking + * requirements. + * + * If memory is required, it will return %-EAGAIN, you should unlock + * and go back to the ida_pre_get() call. If the ida is full, it will + * return %-ENOSPC. + * + * @p_id returns a value in the range @starting_id ... %0x7fffffff. + */ +int ida_get_new_above(struct ida *ida, int start, int *id) +{ + struct radix_tree_root *root = &ida->ida_rt; + void **slot = (void **)&root->rnode; + struct radix_tree_node *node; + struct ida_bitmap *bitmap = NULL; + unsigned long index; + unsigned bit, offset = 0; + + index = start / IDA_BITMAP_BITS; + bit = start % IDA_BITMAP_BITS; + + restart: + index = __idr_get_empty(root, GFP_ATOMIC, index, INT_MAX, &node, &slot); + if (index > INT_MAX) + return index; + + index *= IDA_BITMAP_BITS; + if (index > INT_MAX) + return -ENOSPC; + + if (index > start) + bit = 0; + offset = get_slot_offset(node, slot); + + bitmap = *slot; + if (bitmap) { + bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit); + index += bit; + if (index > INT_MAX) + return -ENOSPC; + if (bit == IDA_BITMAP_BITS) { + index /= IDA_BITMAP_BITS; + goto restart; + } + __set_bit(bit, bitmap->bitmap); + if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS)) + node_tag_clear(root, node, IDR_FREE, offset); + bitmap = xchg(&ida->free_bitmap, NULL); + kfree(bitmap); + } else { + index += bit; + bitmap = xchg(&ida->free_bitmap, NULL); + if (!bitmap) + return -EAGAIN; + memset(bitmap, 0, sizeof(*bitmap)); + __set_bit(bit, bitmap->bitmap); + rcu_assign_pointer(*slot, bitmap); + if (node) + node->count++; + } + + *id = index; + return 0; +} +EXPORT_SYMBOL(ida_get_new_above); + +/** + * ida_remove - Free the given ID + * @ida: ida handle + * @id: ID to free + * + * This function should not be called at the same time as ida_get_new_above(). + */ +void ida_remove(struct ida *ida, int id) +{ + unsigned long index = id / IDA_BITMAP_BITS; + unsigned offset = id % IDA_BITMAP_BITS; + struct ida_bitmap *bitmap; + struct radix_tree_node *node; + void **slot; + + bitmap = __radix_tree_lookup(&ida->ida_rt, index, &node, &slot); + if (!bitmap || !test_bit(offset, bitmap->bitmap)) + goto err; + + __clear_bit(offset, bitmap->bitmap); + node_tag_set(&ida->ida_rt, node, IDR_FREE, get_slot_offset(node, slot)); + if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) { + *slot = NULL; + kfree(bitmap); + if (node) { + node->count--; + __radix_tree_delete_node(&ida->ida_rt, node); + } + } + return; + err: + WARN(1, "ida_remove called for id=%d which is not allocated.\n", id); +} +EXPORT_SYMBOL(ida_remove); + +/** + * ida_destroy - Free the contents of an ida + * @ida: ida handle + * + * Calling this function releases all resources associated with an IDA. When + * this call returns, the IDA is empty and can be reused or freed. The caller + * should not allow ida_remove() or ida_get_new_above() to be called at the + * same time. + */ +void ida_destroy(struct ida *ida) +{ + struct radix_tree_iter iter; + void **slot; + + radix_tree_for_each_slot(slot, &ida->ida_rt, &iter, 0) { + struct ida_bitmap *bitmap = *slot; + kfree(bitmap); + radix_tree_iter_delete(&ida->ida_rt, &iter); + } + + kfree(ida->free_bitmap); +} +EXPORT_SYMBOL(ida_destroy); + static void radix_tree_node_ctor(void *arg) { diff --git a/mm/filemap.c b/mm/filemap.c index 69568388c699..235021e361eb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2164,12 +2164,12 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct fault_env *fe, +void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct radix_tree_iter iter; void **slot; - struct file *file = fe->vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; loff_t size; @@ -2225,11 +2225,11 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; - if (fe->pte) - fe->pte += iter.index - last_pgoff; + vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; + if (vmf->pte) + vmf->pte += iter.index - last_pgoff; last_pgoff = iter.index; - if (alloc_set_pte(fe, NULL, page)) + if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); goto next; @@ -2239,7 +2239,7 @@ skip: put_page(page); next: /* Huge page is mapped? No need to proceed. */ - if (pmd_trans_huge(*fe->pmd)) + if (pmd_trans_huge(*vmf->pmd)) break; if (iter.index == end_pgoff) break; @@ -865,9 +865,10 @@ EXPORT_SYMBOL(get_user_pages_locked); * caller if required (just like with __get_user_pages). "FOLL_GET" * is set implicitly if "pages" is non-NULL. */ -__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) +static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + unsigned int gup_flags) { long ret; int locked = 1; @@ -879,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m up_read(&mm->mmap_sem); return ret; } -EXPORT_SYMBOL(__get_user_pages_unlocked); /* * get_user_pages_unlocked() is suitable to replace the form: @@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages @@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, int *locked) { return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - NULL, false, + locked, true, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote); /* * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's. We also - * obviously don't pass FOLL_REMOTE in here. + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cee42cf05477..10eedbf14421 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -542,13 +542,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, +static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, gfp_t gfp) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; pgtable_t pgtable; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -573,9 +573,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, */ __SetPageUptodate(page); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) { - spin_unlock(fe->ptl); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); @@ -586,11 +586,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, if (userfaultfd_missing(vma)) { int ret; - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); - ret = handle_userfault(fe, VM_UFFD_MISSING); + ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } @@ -600,11 +600,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, page_add_new_anon_rmap(page, vma, haddr, true); mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); - pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); atomic_long_inc(&vma->vm_mm->nr_ptes); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -651,12 +651,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return true; } -int do_huge_pmd_anonymous_page(struct fault_env *fe) +int do_huge_pmd_anonymous_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; gfp_t gfp; struct page *page; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return VM_FAULT_FALLBACK; @@ -664,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; - if (!(fe->flags & FAULT_FLAG_WRITE) && + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; @@ -680,22 +680,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); ret = 0; set = false; - if (pmd_none(*fe->pmd)) { + if (pmd_none(*vmf->pmd)) { if (userfaultfd_missing(vma)) { - spin_unlock(fe->ptl); - ret = handle_userfault(fe, VM_UFFD_MISSING); + spin_unlock(vmf->ptl); + ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { set_huge_zero_page(pgtable, vma->vm_mm, vma, - haddr, fe->pmd, zero_page); - spin_unlock(fe->ptl); + haddr, vmf->pmd, zero_page); + spin_unlock(vmf->ptl); set = true; } } else - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); if (!set) pte_free(vma->vm_mm, pgtable); return ret; @@ -707,7 +707,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) return VM_FAULT_FALLBACK; } prep_transhuge_page(page); - return __do_huge_pmd_anonymous_page(fe, page, gfp); + return __do_huge_pmd_anonymous_page(vmf, page, gfp); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -879,30 +879,30 @@ out: return ret; } -void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) +void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) { pmd_t entry; unsigned long haddr; - fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto unlock; entry = pmd_mkyoung(orig_pmd); - haddr = fe->address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, - fe->flags & FAULT_FLAG_WRITE)) - update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); + haddr = vmf->address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, + vmf->flags & FAULT_FLAG_WRITE)) + update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); } -static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, +static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, struct page *page) { - struct vm_area_struct *vma = fe->vma; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; struct mem_cgroup *memcg; pgtable_t pgtable; pmd_t _pmd; @@ -921,7 +921,7 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, for (i = 0; i < HPAGE_PMD_NR; i++) { pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | __GFP_OTHER_NODE, vma, - fe->address, page_to_nid(page)); + vmf->address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_try_charge(pages[i], vma->vm_mm, GFP_KERNEL, &memcg, false))) { @@ -952,15 +952,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); + pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); + pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); pmd_populate(vma->vm_mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -969,20 +969,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); + page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); mem_cgroup_commit_charge(pages[i], memcg, false, false); lru_cache_add_active_or_unevictable(pages[i], vma); - fe->pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*fe->pte)); - set_pte_at(vma->vm_mm, haddr, fe->pte, entry); - pte_unmap(fe->pte); + vmf->pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*vmf->pte)); + set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); + pte_unmap(vmf->pte); } kfree(pages); smp_wmb(); /* make pte visible before pmd */ - pmd_populate(vma->vm_mm, fe->pmd, pgtable); + pmd_populate(vma->vm_mm, vmf->pmd, pgtable); page_remove_rmap(page, true); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); @@ -993,7 +993,7 @@ out: return ret; out_free_pages: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); @@ -1005,23 +1005,23 @@ out_free_pages: goto out; } -int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) +int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *new_page; struct mem_cgroup *memcg; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t huge_gfp; /* for allocation and charge */ int ret = 0; - fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); + vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(fe->ptl); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_unlock; page = pmd_page(orig_pmd); @@ -1034,13 +1034,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } get_page(page); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { @@ -1053,12 +1053,12 @@ alloc: prep_transhuge_page(new_page); } else { if (!page) { - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); ret |= VM_FAULT_FALLBACK; } else { - ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); + ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); ret |= VM_FAULT_FALLBACK; } put_page(page); @@ -1070,7 +1070,7 @@ alloc: if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, huge_gfp, &memcg, true))) { put_page(new_page); - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) put_page(page); ret |= VM_FAULT_FALLBACK; @@ -1090,11 +1090,11 @@ alloc: mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - spin_lock(fe->ptl); + spin_lock(vmf->ptl); if (page) put_page(page); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { - spin_unlock(fe->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(new_page, memcg, true); put_page(new_page); goto out_mn; @@ -1102,12 +1102,12 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); + pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); page_add_new_anon_rmap(new_page, vma, haddr, true); mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); if (!page) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); } else { @@ -1117,13 +1117,13 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); out_mn: mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); return ret; } @@ -1196,12 +1196,12 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) +int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct anon_vma *anon_vma = NULL; struct page *page; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; bool page_locked; @@ -1209,8 +1209,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) bool was_writable; int flags = 0; - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(pmd, *fe->pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(pmd, *vmf->pmd))) goto out_unlock; /* @@ -1218,9 +1218,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * without disrupting NUMA hinting information. Do not relock and * check_same as the page may no longer be mapped. */ - if (unlikely(pmd_trans_migrating(*fe->pmd))) { - page = pmd_page(*fe->pmd); - spin_unlock(fe->ptl); + if (unlikely(pmd_trans_migrating(*vmf->pmd))) { + page = pmd_page(*vmf->pmd); + spin_unlock(vmf->ptl); wait_on_page_locked(page); goto out; } @@ -1253,7 +1253,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); wait_on_page_locked(page); page_nid = -1; goto out; @@ -1264,12 +1264,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * to serialises splits */ get_page(page); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ - spin_lock(fe->ptl); - if (unlikely(!pmd_same(pmd, *fe->pmd))) { + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(pmd, *vmf->pmd))) { unlock_page(page); put_page(page); page_nid = -1; @@ -1287,9 +1287,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * Migrate the THP to the requested node, returns with page unlocked * and access rights restored. */ - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, - fe->pmd, pmd, fe->address, page, target_nid); + vmf->pmd, pmd, vmf->address, page, target_nid); if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; @@ -1304,18 +1304,19 @@ clear_pmdnuma: pmd = pmd_mkyoung(pmd); if (was_writable) pmd = pmd_mkwrite(pmd); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); unlock_page(page); out_unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, + vmf->flags); return 0; } diff --git a/mm/internal.h b/mm/internal.h index 537ac9951f5f..44d68895a9b9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -36,7 +36,7 @@ /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) -int do_swap_page(struct fault_env *fe, pte_t orig_pte); +int do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 09460955e818..e32389a97030 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -875,13 +875,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int referenced) { - pte_t pteval; int swapped_in = 0, ret = 0; - struct fault_env fe = { + struct vm_fault vmf = { .vma = vma, .address = address, .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, + .pgoff = linear_page_index(vma, address), }; /* we only decide to swapin, if there is enough young ptes */ @@ -889,19 +889,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } - fe.pte = pte_offset_map(pmd, address); - for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; - fe.pte++, fe.address += PAGE_SIZE) { - pteval = *fe.pte; - if (!is_swap_pte(pteval)) + vmf.pte = pte_offset_map(pmd, address); + for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; + vmf.pte++, vmf.address += PAGE_SIZE) { + vmf.orig_pte = *vmf.pte; + if (!is_swap_pte(vmf.orig_pte)) continue; swapped_in++; - ret = do_swap_page(&fe, pteval); + ret = do_swap_page(&vmf); /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ if (ret & VM_FAULT_RETRY) { down_read(&mm->mmap_sem); - if (hugepage_vma_revalidate(mm, address, &fe.vma)) { + if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; @@ -915,10 +915,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, return false; } /* pte is unmapped now, we need to map it */ - fe.pte = pte_offset_map(pmd, fe.address); + vmf.pte = pte_offset_map(pmd, vmf.address); } - fe.pte--; - pte_unmap(fe.pte); + vmf.pte--; + pte_unmap(vmf.pte); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); return true; } @@ -1446,7 +1446,7 @@ static void collapse_shmem(struct mm_struct *mm, radix_tree_replace_slot(&mapping->page_tree, slot, new_page + (index % HPAGE_PMD_NR)); - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_resume(slot, &iter); index++; continue; out_lru: @@ -1546,7 +1546,6 @@ tree_unlocked: /* Put holes back where they were */ radix_tree_delete(&mapping->page_tree, iter.index); - slot = radix_tree_iter_next(&iter); continue; } @@ -1557,11 +1556,11 @@ tree_unlocked: page_ref_unfreeze(page, 2); radix_tree_replace_slot(&mapping->page_tree, slot, page); + slot = radix_tree_iter_resume(slot, &iter); spin_unlock_irq(&mapping->tree_lock); putback_lru_page(page); unlock_page(page); spin_lock_irq(&mapping->tree_lock); - slot = radix_tree_iter_next(&iter); } VM_BUG_ON(nr_none); spin_unlock_irq(&mapping->tree_lock); @@ -1641,8 +1640,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, present++; if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); diff --git a/mm/memory.c b/mm/memory.c index 08d8da39de28..455c3e628d52 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) * * We do this without the lock held, so that it can sleep if it needs to. */ -static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, - unsigned long address) +static int do_page_mkwrite(struct vm_fault *vmf) { - struct vm_fault vmf; int ret; + struct page *page = vmf->page; + unsigned int old_flags = vmf->flags; - vmf.virtual_address = (void __user *)(address & PAGE_MASK); - vmf.pgoff = page->index; - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vmf.page = page; - vmf.cow_page = NULL; + vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - ret = vma->vm_ops->page_mkwrite(vma, &vmf); + ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf); + /* Restore original flags so that caller is not surprised */ + vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { @@ -2063,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, } /* + * Handle dirtying of a page in shared file mapping on a write fault. + * + * The function expects the page to be locked and unlocks it. + */ +static void fault_dirty_shared_page(struct vm_area_struct *vma, + struct page *page) +{ + struct address_space *mapping; + bool dirtied; + bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; + + dirtied = set_page_dirty(page); + VM_BUG_ON_PAGE(PageAnon(page), page); + /* + * Take a local copy of the address_space - page.mapping may be zeroed + * by truncate after unlock_page(). The address_space itself remains + * pinned by vma->vm_file's reference. We rely on unlock_page()'s + * release semantics to prevent the compiler from undoing this copying. + */ + mapping = page_rmapping(page); + unlock_page(page); + + if ((dirtied || page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + if (!page_mkwrite) + file_update_time(vma->vm_file); +} + +/* * Handle write page faults for pages that can be reused in the current vma * * This can happen either due to the mapping being with the VM_SHARED flag, @@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, - struct page *page, int page_mkwrite, int dirty_shared) - __releases(fe->ptl) +static inline void wp_page_reuse(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; + struct page *page = vmf->page; pte_t entry; /* * Clear the pages cpupid information as the existing @@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, if (page) page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); - flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); - entry = pte_mkyoung(orig_pte); + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); + entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) - update_mmu_cache(vma, fe->address, fe->pte); - pte_unmap_unlock(fe->pte, fe->ptl); - - if (dirty_shared) { - struct address_space *mapping; - int dirtied; - - if (!page_mkwrite) - lock_page(page); - - dirtied = set_page_dirty(page); - VM_BUG_ON_PAGE(PageAnon(page), page); - mapping = page->mapping; - unlock_page(page); - put_page(page); - - if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - - if (!page_mkwrite) - file_update_time(vma->vm_file); - } - - return VM_FAULT_WRITE; + if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) + update_mmu_cache(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); } /* @@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ -static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, - struct page *old_page) +static int wp_page_copy(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; + struct page *old_page = vmf->page; struct page *new_page = NULL; pte_t entry; int page_copied = 0; - const unsigned long mmun_start = fe->address & PAGE_MASK; + const unsigned long mmun_start = vmf->address & PAGE_MASK; const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; if (unlikely(anon_vma_prepare(vma))) goto oom; - if (is_zero_pfn(pte_pfn(orig_pte))) { - new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); + if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { + new_page = alloc_zeroed_user_highpage_movable(vma, + vmf->address); if (!new_page) goto oom; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - fe->address); + vmf->address); if (!new_page) goto oom; - cow_user_page(new_page, old_page, fe->address, vma); + cow_user_page(new_page, old_page, vmf->address, vma); } if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) @@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, /* * Re-check the pte - we dropped the lock */ - fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); - if (likely(pte_same(*fe->pte, orig_pte))) { + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, @@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } - flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -2192,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush_notify(vma, fe->address, fe->pte); - page_add_new_anon_rmap(new_page, vma, fe->address, false); + ptep_clear_flush_notify(vma, vmf->address, vmf->pte); + page_add_new_anon_rmap(new_page, vma, vmf->address, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* @@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */ - set_pte_at_notify(mm, fe->address, fe->pte, entry); - update_mmu_cache(vma, fe->address, fe->pte); + set_pte_at_notify(mm, vmf->address, vmf->pte, entry); + update_mmu_cache(vma, vmf->address, vmf->pte); if (old_page) { /* * Only after switching the pte to the new page may @@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, if (new_page) put_page(new_page); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* @@ -2263,79 +2269,91 @@ oom: return VM_FAULT_OOM; } +/** + * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE + * writeable once the page is prepared + * + * @vmf: structure describing the fault + * + * This function handles all that is needed to finish a write page fault in a + * shared mapping due to PTE being read-only once the mapped page is prepared. + * It handles locking of PTE and modifying it. The function returns + * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE + * lock. + * + * The function expects the page to be locked or other protection against + * concurrent faults / writeback (such as DAX radix tree locks). + */ +int finish_mkwrite_fault(struct vm_fault *vmf) +{ + WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + /* + * We might have raced with another page fault while we released the + * pte_offset_map_lock. + */ + if (!pte_same(*vmf->pte, vmf->orig_pte)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return VM_FAULT_NOPAGE; + } + wp_page_reuse(vmf); + return 0; +} + /* * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ -static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) +static int wp_pfn_shared(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { - struct vm_fault vmf = { - .page = NULL, - .pgoff = linear_page_index(vma, fe->address), - .virtual_address = - (void __user *)(fe->address & PAGE_MASK), - .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, - }; int ret; - pte_unmap_unlock(fe->pte, fe->ptl); - ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); - if (ret & VM_FAULT_ERROR) + pte_unmap_unlock(vmf->pte, vmf->ptl); + vmf->flags |= FAULT_FLAG_MKWRITE; + ret = vma->vm_ops->pfn_mkwrite(vma, vmf); + if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - /* - * We might have raced with another page fault while we - * released the pte_offset_map_lock. - */ - if (!pte_same(*fe->pte, orig_pte)) { - pte_unmap_unlock(fe->pte, fe->ptl); - return 0; - } + return finish_mkwrite_fault(vmf); } - return wp_page_reuse(fe, orig_pte, NULL, 0, 0); + wp_page_reuse(vmf); + return VM_FAULT_WRITE; } -static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, - struct page *old_page) - __releases(fe->ptl) +static int wp_page_shared(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; - int page_mkwrite = 0; + struct vm_area_struct *vma = vmf->vma; - get_page(old_page); + get_page(vmf->page); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; - pte_unmap_unlock(fe->pte, fe->ptl); - tmp = do_page_mkwrite(vma, old_page, fe->address); + pte_unmap_unlock(vmf->pte, vmf->ptl); + tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(old_page); + put_page(vmf->page); return tmp; } - /* - * Since we dropped the lock we need to revalidate - * the PTE as someone else may have changed it. If - * they did, we just return, as we can count on the - * MMU to tell us if they didn't also make it writable. - */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_same(*fe->pte, orig_pte)) { - unlock_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - put_page(old_page); - return 0; + tmp = finish_mkwrite_fault(vmf); + if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + unlock_page(vmf->page); + put_page(vmf->page); + return tmp; } - page_mkwrite = 1; + } else { + wp_page_reuse(vmf); + lock_page(vmf->page); } + fault_dirty_shared_page(vma, vmf->page); + put_page(vmf->page); - return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); + return VM_FAULT_WRITE; } /* @@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_wp_page(struct fault_env *fe, pte_t orig_pte) - __releases(fe->ptl) +static int do_wp_page(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; - struct page *old_page; + struct vm_area_struct *vma = vmf->vma; - old_page = vm_normal_page(vma, fe->address, orig_pte); - if (!old_page) { + vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); + if (!vmf->page) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. @@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - return wp_pfn_shared(fe, orig_pte); + return wp_pfn_shared(vmf); - pte_unmap_unlock(fe->pte, fe->ptl); - return wp_page_copy(fe, orig_pte, old_page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return wp_page_copy(vmf); } /* * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(old_page) && !PageKsm(old_page)) { + if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { int total_mapcount; - if (!trylock_page(old_page)) { - get_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - lock_page(old_page); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); - if (!pte_same(*fe->pte, orig_pte)) { - unlock_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - put_page(old_page); + if (!trylock_page(vmf->page)) { + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + lock_page(vmf->page); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_same(*vmf->pte, vmf->orig_pte)) { + unlock_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + put_page(vmf->page); return 0; } - put_page(old_page); + put_page(vmf->page); } - if (reuse_swap_page(old_page, &total_mapcount)) { + if (reuse_swap_page(vmf->page, &total_mapcount)) { if (total_mapcount == 1) { /* * The page is all ours. Move it to @@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) * Protected against the rmap code by * the page lock. */ - page_move_anon_rmap(old_page, vma); + page_move_anon_rmap(vmf->page, vma); } - unlock_page(old_page); - return wp_page_reuse(fe, orig_pte, old_page, 0, 0); + unlock_page(vmf->page); + wp_page_reuse(vmf); + return VM_FAULT_WRITE; } - unlock_page(old_page); + unlock_page(vmf->page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { - return wp_page_shared(fe, orig_pte, old_page); + return wp_page_shared(vmf); } /* * Ok, we need to copy. Oh, well.. */ - get_page(old_page); + get_page(vmf->page); - pte_unmap_unlock(fe->pte, fe->ptl); - return wp_page_copy(fe, orig_pte, old_page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return wp_page_copy(vmf); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range); * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). */ -int do_swap_page(struct fault_env *fe, pte_t orig_pte) +int do_swap_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page, *swapcache; struct mem_cgroup *memcg; swp_entry_t entry; @@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) int exclusive = 0; int ret = 0; - if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; - entry = pte_to_swp_entry(orig_pte); + entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { - migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); + migration_entry_wait(vma->vm_mm, vmf->pmd, + vmf->address); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { - print_bad_pte(vma, fe->address, orig_pte, NULL); + print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; @@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, fe->address); + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, + vmf->address); if (!page) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); - if (likely(pte_same(*fe->pte, orig_pte))) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; @@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) } swapcache = page; - locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); + locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (!locked) { @@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; - page = ksm_might_need_to_copy(page, vma, fe->address); + page = ksm_might_need_to_copy(page, vma, vmf->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; page = swapcache; @@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) /* * Back out if somebody else already faulted in this pte. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (unlikely(!pte_same(*fe->pte, orig_pte))) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) goto out_nomap; if (unlikely(!PageUptodate(page))) { @@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); - fe->flags &= ~FAULT_FLAG_WRITE; + vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); - if (pte_swp_soft_dirty(orig_pte)) + if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + vmf->orig_pte = pte; if (page == swapcache) { - do_page_add_anon_rmap(page, vma, fe->address, exclusive); + do_page_add_anon_rmap(page, vma, vmf->address, exclusive); mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } @@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) put_page(swapcache); } - if (fe->flags & FAULT_FLAG_WRITE) { - ret |= do_wp_page(fe, pte); + if (vmf->flags & FAULT_FLAG_WRITE) { + ret |= do_wp_page(vmf); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out: return ret; out_nomap: mem_cgroup_cancel_charge(page, memcg, false); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); out_release: @@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_anonymous_page(struct fault_env *fe) +static int do_anonymous_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; pte_t entry; @@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe) return VM_FAULT_SIGBUS; /* Check if we need to add a guard page to the stack */ - if (check_stack_guard_page(vma, fe->address) < 0) + if (check_stack_guard_page(vma, vmf->address) < 0) return VM_FAULT_SIGSEGV; /* @@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe) * * Here we only have down_read(mmap_sem). */ - if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) + if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) return VM_FAULT_OOM; /* See the comment in pte_alloc_one_map() */ - if (unlikely(pmd_trans_unstable(fe->pmd))) + if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; /* Use the zero-page for reads */ - if (!(fe->flags & FAULT_FLAG_WRITE) && + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { - entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), + entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_none(*fe->pte)) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_none(*vmf->pte)) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(fe->pte, fe->ptl); - return handle_userfault(fe, VM_UFFD_MISSING); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_MISSING); } goto setpte; } @@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe) /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_zeroed_user_highpage_movable(vma, fe->address); + page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!page) goto oom; @@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe) if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_none(*fe->pte)) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (!pte_none(*vmf->pte)) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return handle_userfault(fe, VM_UFFD_MISSING); + return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: - set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; release: mem_cgroup_cancel_charge(page, memcg, false); @@ -2843,62 +2863,50 @@ oom: * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ -static int __do_fault(struct fault_env *fe, pgoff_t pgoff, - struct page *cow_page, struct page **page, void **entry) +static int __do_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct vm_fault vmf; + struct vm_area_struct *vma = vmf->vma; int ret; - vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); - vmf.pgoff = pgoff; - vmf.flags = fe->flags; - vmf.page = NULL; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vmf.cow_page = cow_page; - - ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - return ret; - if (ret & VM_FAULT_DAX_LOCKED) { - *entry = vmf.entry; + ret = vma->vm_ops->fault(vma, vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | + VM_FAULT_DONE_COW))) return ret; - } - if (unlikely(PageHWPoison(vmf.page))) { + if (unlikely(PageHWPoison(vmf->page))) { if (ret & VM_FAULT_LOCKED) - unlock_page(vmf.page); - put_page(vmf.page); + unlock_page(vmf->page); + put_page(vmf->page); + vmf->page = NULL; return VM_FAULT_HWPOISON; } if (unlikely(!(ret & VM_FAULT_LOCKED))) - lock_page(vmf.page); + lock_page(vmf->page); else - VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); - *page = vmf.page; return ret; } -static int pte_alloc_one_map(struct fault_env *fe) +static int pte_alloc_one_map(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; - if (!pmd_none(*fe->pmd)) + if (!pmd_none(*vmf->pmd)) goto map_pte; - if (fe->prealloc_pte) { - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) { - spin_unlock(fe->ptl); + if (vmf->prealloc_pte) { + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + spin_unlock(vmf->ptl); goto map_pte; } atomic_long_inc(&vma->vm_mm->nr_ptes); - pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); - spin_unlock(fe->ptl); - fe->prealloc_pte = 0; - } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { + pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); + spin_unlock(vmf->ptl); + vmf->prealloc_pte = 0; + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { return VM_FAULT_OOM; } map_pte: @@ -2913,11 +2921,11 @@ map_pte: * through an atomic read in C, which is what pmd_trans_unstable() * provides. */ - if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) return VM_FAULT_NOPAGE; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); return 0; } @@ -2935,24 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } -static void deposit_prealloc_pte(struct fault_env *fe) +static void deposit_prealloc_pte(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; - pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); /* * We are going to consume the prealloc table, * count that as nr_ptes. */ atomic_long_inc(&vma->vm_mm->nr_ptes); - fe->prealloc_pte = 0; + vmf->prealloc_pte = 0; } -static int do_set_pmd(struct fault_env *fe, struct page *page) +static int do_set_pmd(struct vm_fault *vmf, struct page *page) { - struct vm_area_struct *vma = fe->vma; - bool write = fe->flags & FAULT_FLAG_WRITE; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; int i, ret; @@ -2966,15 +2974,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * Archs like ppc64 need additonal space to store information * related to pte entry. Use the preallocated table for that. */ - if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { - fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); - if (!fe->prealloc_pte) + if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); + if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) goto out; for (i = 0; i < HPAGE_PMD_NR; i++) @@ -2990,11 +2998,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * deposit and withdraw with pmd lock held */ if (arch_needs_pgtable_deposit()) - deposit_prealloc_pte(fe); + deposit_prealloc_pte(vmf); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, haddr, fe->pmd); + update_mmu_cache_pmd(vma, haddr, vmf->pmd); /* fault is handled */ ret = 0; @@ -3005,13 +3013,13 @@ out: * withdraw with pmd lock held. */ if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) - fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, - fe->pmd); - spin_unlock(fe->ptl); + vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, + vmf->pmd); + spin_unlock(vmf->ptl); return ret; } #else -static int do_set_pmd(struct fault_env *fe, struct page *page) +static int do_set_pmd(struct vm_fault *vmf, struct page *page) { BUILD_BUG(); return 0; @@ -3022,41 +3030,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * alloc_set_pte - setup new PTE entry for given page and add reverse page * mapping. If needed, the fucntion allocates page table or use pre-allocated. * - * @fe: fault environment + * @vmf: fault environment * @memcg: memcg to charge page (only for private mappings) * @page: page to map * - * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. + * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on + * return. * * Target users are page handler itself and implementations of * vm_ops->map_pages. */ -int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, +int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) { - struct vm_area_struct *vma = fe->vma; - bool write = fe->flags & FAULT_FLAG_WRITE; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; int ret; - if (pmd_none(*fe->pmd) && PageTransCompound(page) && + if (pmd_none(*vmf->pmd) && PageTransCompound(page) && IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { /* THP on COW? */ VM_BUG_ON_PAGE(memcg, page); - ret = do_set_pmd(fe, page); + ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) goto fault_handled; } - if (!fe->pte) { - ret = pte_alloc_one_map(fe); + if (!vmf->pte) { + ret = pte_alloc_one_map(vmf); if (ret) goto fault_handled; } /* Re-check under ptl */ - if (unlikely(!pte_none(*fe->pte))) { + if (unlikely(!pte_none(*vmf->pte))) { ret = VM_FAULT_NOPAGE; goto fault_handled; } @@ -3068,28 +3077,60 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); } - set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); ret = 0; fault_handled: /* preallocated pagetable is unused: free it */ - if (fe->prealloc_pte) { - pte_free(fe->vma->vm_mm, fe->prealloc_pte); - fe->prealloc_pte = 0; + if (vmf->prealloc_pte) { + pte_free(vmf->vma->vm_mm, vmf->prealloc_pte); + vmf->prealloc_pte = 0; } return ret; } + +/** + * finish_fault - finish page fault once we have prepared the page to fault + * + * @vmf: structure describing the fault + * + * This function handles all that is needed to finish a page fault once the + * page to fault in is prepared. It handles locking of PTEs, inserts PTE for + * given page, adds reverse page mapping, handles memcg charges and LRU + * addition. The function returns 0 on success, VM_FAULT_ code in case of + * error. + * + * The function expects the page to be locked and on success it consumes a + * reference of a page being mapped (for the PTE which maps it). + */ +int finish_fault(struct vm_fault *vmf) +{ + struct page *page; + int ret; + + /* Did we COW the page? */ + if ((vmf->flags & FAULT_FLAG_WRITE) && + !(vmf->vma->vm_flags & VM_SHARED)) + page = vmf->cow_page; + else + page = vmf->page; + ret = alloc_set_pte(vmf, vmf->memcg, page); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); + return ret; +} + static unsigned long fault_around_bytes __read_mostly = rounddown_pow_of_two(65536); @@ -3154,17 +3195,18 @@ late_initcall(fault_around_debugfs); * fault_around_pages() value (and therefore to page order). This way it's * easier to guarantee that we don't cross page table boundaries. */ -static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) +static int do_fault_around(struct vm_fault *vmf) { - unsigned long address = fe->address, nr_pages, mask; + unsigned long address = vmf->address, nr_pages, mask; + pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; int off, ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; - fe->address = max(address & mask, fe->vma->vm_start); - off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + vmf->address = max(address & mask, vmf->vma->vm_start); + off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); start_pgoff -= off; /* @@ -3172,45 +3214,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) * or fault_around_pages() from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - - ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; - end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, + end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); - if (pmd_none(*fe->pmd)) { - fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); - if (!fe->prealloc_pte) + if (pmd_none(*vmf->pmd)) { + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, + vmf->address); + if (!vmf->prealloc_pte) goto out; smp_wmb(); /* See comment in __pte_alloc() */ } - fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); + vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); /* Huge page is mapped? Page fault is solved */ - if (pmd_trans_huge(*fe->pmd)) { + if (pmd_trans_huge(*vmf->pmd)) { ret = VM_FAULT_NOPAGE; goto out; } /* ->map_pages() haven't done anything useful. Cold page cache? */ - if (!fe->pte) + if (!vmf->pte) goto out; /* check if the page fault is solved */ - fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); - if (!pte_none(*fe->pte)) + vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); + if (!pte_none(*vmf->pte)) ret = VM_FAULT_NOPAGE; - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out: - fe->address = address; - fe->pte = NULL; + vmf->address = address; + vmf->pte = NULL; return ret; } -static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_read_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page; + struct vm_area_struct *vma = vmf->vma; int ret = 0; /* @@ -3219,80 +3261,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - ret = do_fault_around(fe, pgoff); + ret = do_fault_around(vmf); if (ret) return ret; } - ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - ret |= alloc_set_pte(fe, NULL, fault_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); - unlock_page(fault_page); + ret |= finish_fault(vmf); + unlock_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - put_page(fault_page); + put_page(vmf->page); return ret; } -static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_cow_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page, *new_page; - void *fault_entry; - struct mem_cgroup *memcg; + struct vm_area_struct *vma = vmf->vma; int ret; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); - if (!new_page) + vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); + if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { - put_page(new_page); + if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, + &vmf->memcg, false)) { + put_page(vmf->cow_page); return VM_FAULT_OOM; } - ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; + if (ret & VM_FAULT_DONE_COW) + return ret; - if (!(ret & VM_FAULT_DAX_LOCKED)) - copy_user_highpage(new_page, fault_page, fe->address, vma); - __SetPageUptodate(new_page); + copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); + __SetPageUptodate(vmf->cow_page); - ret |= alloc_set_pte(fe, memcg, new_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); - if (!(ret & VM_FAULT_DAX_LOCKED)) { - unlock_page(fault_page); - put_page(fault_page); - } else { - dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); - } + ret |= finish_fault(vmf); + unlock_page(vmf->page); + put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; return ret; uncharge_out: - mem_cgroup_cancel_charge(new_page, memcg, false); - put_page(new_page); + mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); + put_page(vmf->cow_page); return ret; } -static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_shared_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page; - struct address_space *mapping; - int dirtied = 0; + struct vm_area_struct *vma = vmf->vma; int ret, tmp; - ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3301,46 +3330,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) * about to become writable */ if (vma->vm_ops->page_mkwrite) { - unlock_page(fault_page); - tmp = do_page_mkwrite(vma, fault_page, fe->address); + unlock_page(vmf->page); + tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(fault_page); + put_page(vmf->page); return tmp; } } - ret |= alloc_set_pte(fe, NULL, fault_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); + ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { - unlock_page(fault_page); - put_page(fault_page); + unlock_page(vmf->page); + put_page(vmf->page); return ret; } - if (set_page_dirty(fault_page)) - dirtied = 1; - /* - * Take a local copy of the address_space - page.mapping may be zeroed - * by truncate after unlock_page(). The address_space itself remains - * pinned by vma->vm_file's reference. We rely on unlock_page()'s - * release semantics to prevent the compiler from undoing this copying. - */ - mapping = page_rmapping(fault_page); - unlock_page(fault_page); - if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping but still - * dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - - if (!vma->vm_ops->page_mkwrite) - file_update_time(vma->vm_file); - + fault_dirty_shared_page(vma, vmf->page); return ret; } @@ -3350,19 +3357,18 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_fault(struct fault_env *fe) +static int do_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - pgoff_t pgoff = linear_page_index(vma, fe->address); + struct vm_area_struct *vma = vmf->vma; /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) return VM_FAULT_SIGBUS; - if (!(fe->flags & FAULT_FLAG_WRITE)) - return do_read_fault(fe, pgoff); + if (!(vmf->flags & FAULT_FLAG_WRITE)) + return do_read_fault(vmf); if (!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(fe, pgoff); - return do_shared_fault(fe, pgoff); + return do_cow_fault(vmf); + return do_shared_fault(vmf); } static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, @@ -3380,14 +3386,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -static int do_numa_page(struct fault_env *fe, pte_t pte) +static int do_numa_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; int page_nid = -1; int last_cpupid; int target_nid; bool migrated = false; + pte_t pte = vmf->orig_pte; bool was_writable = pte_write(pte); int flags = 0; @@ -3400,10 +3407,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) * page table entry is not accessible, so there would be no * concurrent hardware modifications to the PTE. */ - fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); - if (unlikely(!pte_same(*fe->pte, pte))) { - pte_unmap_unlock(fe->pte, fe->ptl); + vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -3412,18 +3419,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); - update_mmu_cache(vma, fe->address, fe->pte); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + update_mmu_cache(vma, vmf->address, vmf->pte); - page = vm_normal_page(vma, fe->address, pte); + page = vm_normal_page(vma, vmf->address, pte); if (!page) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } /* TODO: handle PTE-mapped THP */ if (PageCompound(page)) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -3447,9 +3454,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, + target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); if (target_nid == -1) { put_page(page); goto out; @@ -3469,28 +3476,28 @@ out: return 0; } -static int create_huge_pmd(struct fault_env *fe) +static int create_huge_pmd(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; if (vma_is_anonymous(vma)) - return do_huge_pmd_anonymous_page(fe); + return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, - fe->flags); + return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, + vmf->flags); return VM_FAULT_FALLBACK; } -static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) +static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { - if (vma_is_anonymous(fe->vma)) - return do_huge_pmd_wp_page(fe, orig_pmd); - if (fe->vma->vm_ops->pmd_fault) - return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, - fe->flags); + if (vma_is_anonymous(vmf->vma)) + return do_huge_pmd_wp_page(vmf, orig_pmd); + if (vmf->vma->vm_ops->pmd_fault) + return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, + vmf->pmd, vmf->flags); /* COW handled on pte level: split pmd */ - VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); - __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); + VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); + __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } @@ -3515,21 +3522,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) * The mmap_sem may have been released depending on flags and our return value. * See filemap_fault() and __lock_page_or_retry(). */ -static int handle_pte_fault(struct fault_env *fe) +static int handle_pte_fault(struct vm_fault *vmf) { pte_t entry; - if (unlikely(pmd_none(*fe->pmd))) { + if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may * want to allocate huge page, and if we expose page table * for an instant, it will be difficult to retract from * concurrent faults and from rmap lookups. */ - fe->pte = NULL; + vmf->pte = NULL; } else { /* See comment in pte_alloc_one_map() */ - if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) return 0; /* * A regular pmd is established and it can't morph into a huge @@ -3537,9 +3544,8 @@ static int handle_pte_fault(struct fault_env *fe) * mmap_sem read mode and khugepaged takes it in write mode. * So now it's safe to run pte_offset_map(). */ - fe->pte = pte_offset_map(fe->pmd, fe->address); - - entry = *fe->pte; + vmf->pte = pte_offset_map(vmf->pmd, vmf->address); + vmf->orig_pte = *vmf->pte; /* * some architectures can have larger ptes than wordsize, @@ -3550,38 +3556,39 @@ static int handle_pte_fault(struct fault_env *fe) * ptl lock held. So here a barrier will do. */ barrier(); - if (pte_none(entry)) { - pte_unmap(fe->pte); - fe->pte = NULL; + if (pte_none(vmf->orig_pte)) { + pte_unmap(vmf->pte); + vmf->pte = NULL; } } - if (!fe->pte) { - if (vma_is_anonymous(fe->vma)) - return do_anonymous_page(fe); + if (!vmf->pte) { + if (vma_is_anonymous(vmf->vma)) + return do_anonymous_page(vmf); else - return do_fault(fe); + return do_fault(vmf); } - if (!pte_present(entry)) - return do_swap_page(fe, entry); + if (!pte_present(vmf->orig_pte)) + return do_swap_page(vmf); - if (pte_protnone(entry) && vma_is_accessible(fe->vma)) - return do_numa_page(fe, entry); + if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) + return do_numa_page(vmf); - fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); - if (unlikely(!pte_same(*fe->pte, entry))) + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + entry = vmf->orig_pte; + if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; - if (fe->flags & FAULT_FLAG_WRITE) { + if (vmf->flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) - return do_wp_page(fe, entry); + return do_wp_page(vmf); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, - fe->flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(fe->vma, fe->address, fe->pte); + if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, + vmf->flags & FAULT_FLAG_WRITE)) { + update_mmu_cache(vmf->vma, vmf->address, vmf->pte); } else { /* * This is needed only for protection faults but the arch code @@ -3589,11 +3596,11 @@ static int handle_pte_fault(struct fault_env *fe) * This still avoids useless tlb flushes for .text page faults * with threads. */ - if (fe->flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(fe->vma, fe->address); + if (vmf->flags & FAULT_FLAG_WRITE) + flush_tlb_fix_spurious_fault(vmf->vma, vmf->address); } unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -3606,10 +3613,12 @@ unlock: static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - struct fault_env fe = { + struct vm_fault vmf = { .vma = vma, - .address = address, + .address = address & PAGE_MASK, .flags = flags, + .pgoff = linear_page_index(vma, address), + .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -3619,35 +3628,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, pud = pud_alloc(mm, pgd, address); if (!pud) return VM_FAULT_OOM; - fe.pmd = pmd_alloc(mm, pud, address); - if (!fe.pmd) + vmf.pmd = pmd_alloc(mm, pud, address); + if (!vmf.pmd) return VM_FAULT_OOM; - if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { - int ret = create_huge_pmd(&fe); + if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { + int ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - pmd_t orig_pmd = *fe.pmd; + pmd_t orig_pmd = *vmf.pmd; int ret; barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) - return do_huge_pmd_numa_page(&fe, orig_pmd); + return do_huge_pmd_numa_page(&vmf, orig_pmd); - if ((fe.flags & FAULT_FLAG_WRITE) && + if ((vmf.flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) { - ret = wp_huge_pmd(&fe, orig_pmd); + ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - huge_pmd_set_accessed(&fe, orig_pmd); + huge_pmd_set_accessed(&vmf, orig_pmd); return 0; } } } - return handle_pte_fault(&fe); + return handle_pte_fault(&vmf); } /* @@ -3808,8 +3817,8 @@ out: return -EINVAL; } -static inline int follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) +int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, + spinlock_t **ptlp) { int res; @@ -3919,7 +3928,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - gup_flags, &page, &vma); + gup_flags, &page, &vma, NULL); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; diff --git a/mm/nommu.c b/mm/nommu.c index 27bc543128e5..210d7ec2843c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -176,9 +176,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages_locked); -long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) +static long __get_user_pages_unlocked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + unsigned int gup_flags) { long ret; down_read(&mm->mmap_sem); @@ -187,7 +188,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, up_read(&mm->mmap_sem); return ret; } -EXPORT_SYMBOL(__get_user_pages_unlocked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) @@ -1801,7 +1801,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct fault_env *fe, +void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { BUG(); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 52e2f8e3b472..290e8b7d3181 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2106,18 +2106,26 @@ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { #define WRITEBACK_TAG_BATCH 4096 - unsigned long tagged; - - do { - spin_lock_irq(&mapping->tree_lock); - tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, - &start, end, WRITEBACK_TAG_BATCH, - PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); + unsigned long tagged = 0; + struct radix_tree_iter iter; + void **slot; + + spin_lock_irq(&mapping->tree_lock); + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, + PAGECACHE_TAG_DIRTY) { + if (iter.index > end) + break; + radix_tree_iter_tag_set(&mapping->page_tree, &iter, + PAGECACHE_TAG_TOWRITE); + tagged++; + if ((tagged % WRITEBACK_TAG_BATCH) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); spin_unlock_irq(&mapping->tree_lock); - WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); cond_resched(); - /* We check 'start' to handle wrapping when end == ~0UL */ - } while (tagged >= WRITEBACK_TAG_BATCH && start); + spin_lock_irq(&mapping->tree_lock); + } + spin_unlock_irq(&mapping->tree_lock); } EXPORT_SYMBOL(tag_pages_for_writeback); diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index be8dc8d1edb9..84d0c7eada2b 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -88,7 +88,7 @@ static int process_vm_rw_single_vec(unsigned long addr, ssize_t rc = 0; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); - unsigned int flags = FOLL_REMOTE; + unsigned int flags = 0; /* Work out address and page range required */ if (len == 0) @@ -100,15 +100,19 @@ static int process_vm_rw_single_vec(unsigned long addr, while (!rc && nr_pages && iov_iter_count(iter)) { int pages = min(nr_pages, max_pages_per_loop); + int locked = 1; size_t bytes; /* * Get the pages we're interested in. We must - * add FOLL_REMOTE because task/mm might not + * access remotely because task/mm might not * current/current->mm */ - pages = __get_user_pages_unlocked(task, mm, pa, pages, - process_pages, flags); + down_read(&mm->mmap_sem); + pages = get_user_pages_remote(task, mm, pa, pages, flags, + process_pages, NULL, &locked); + if (locked) + up_read(&mm->mmap_sem); if (pages <= 0) return -EFAULT; diff --git a/mm/shmem.c b/mm/shmem.c index abd7403aba41..54287d443806 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -661,8 +661,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, swapped++; if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } @@ -1049,6 +1049,30 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } +static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) +{ + struct radix_tree_iter iter; + void **slot; + unsigned long found = -1; + unsigned int checked = 0; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, root, &iter, 0) { + if (*slot == item) { + found = iter.index; + break; + } + checked++; + if ((checked % 4096) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + + rcu_read_unlock(); + return found; +} + /* * If swap found in inode, free it and move page from swapcache to filecache. */ @@ -1062,7 +1086,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, int error = 0; radswap = swp_to_radix_entry(swap); - index = radix_tree_locate_item(&mapping->page_tree, radswap); + index = find_swap_entry(&mapping->page_tree, radswap); if (index == -1) return -EAGAIN; /* tell shmem_unuse we found nothing */ @@ -2447,8 +2471,8 @@ static void shmem_tag_pins(struct address_space *mapping) } if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -2517,8 +2541,8 @@ static int shmem_wait_for_pins(struct address_space *mapping) spin_unlock_irq(&mapping->tree_lock); continue_resched: if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2d59c9be40e1..5f63f6dcaabb 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -762,16 +762,17 @@ static const struct net_proto_family rxrpc_family_ops = { static int __init af_rxrpc_init(void) { int ret = -1; + unsigned int tmp; BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch)); rxrpc_epoch |= RXRPC_RANDOM_EPOCH; - get_random_bytes(&rxrpc_client_conn_ids.cur, - sizeof(rxrpc_client_conn_ids.cur)); - rxrpc_client_conn_ids.cur &= 0x3fffffff; - if (rxrpc_client_conn_ids.cur == 0) - rxrpc_client_conn_ids.cur = 1; + get_random_bytes(&tmp, sizeof(tmp)); + tmp &= 0x3fffffff; + if (tmp == 0) + tmp = 1; + idr_set_cursor(&rxrpc_client_conn_ids, tmp); ret = -ENOMEM; rxrpc_call_jar = kmem_cache_create( diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 60ef9605167e..6cbcdcc29853 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -263,12 +263,12 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) * times the maximum number of client conns away from the current * allocation point to try and keep the IDs concentrated. */ - id_cursor = READ_ONCE(rxrpc_client_conn_ids.cur); + id_cursor = idr_get_cursor(&rxrpc_client_conn_ids); id = conn->proto.cid >> RXRPC_CIDSHIFT; distance = id - id_cursor; if (distance < 0) distance = -distance; - limit = round_up(rxrpc_max_client_connections, IDR_SIZE) * 4; + limit = max(rxrpc_max_client_connections * 4, 1024U); if (distance > limit) goto mark_dont_reuse; diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig index 5487827fa86c..370eb2f4dd37 100644 --- a/security/integrity/ima/Kconfig +++ b/security/integrity/ima/Kconfig @@ -27,6 +27,18 @@ config IMA to learn more about IMA. If unsure, say N. +config IMA_KEXEC + bool "Enable carrying the IMA measurement list across a soft boot" + depends on IMA && TCG_TPM && HAVE_IMA_KEXEC + default n + help + TPM PCRs are only reset on a hard reboot. In order to validate + a TPM's quote after a soft boot, the IMA measurement list of the + running kernel must be saved and restored on boot. + + Depending on the IMA policy, the measurement list can grow to + be very large. + config IMA_MEASURE_PCR_IDX int depends on IMA diff --git a/security/integrity/ima/Makefile b/security/integrity/ima/Makefile index 9aeaedad1e2b..29f198bde02b 100644 --- a/security/integrity/ima/Makefile +++ b/security/integrity/ima/Makefile @@ -8,4 +8,5 @@ obj-$(CONFIG_IMA) += ima.o ima-y := ima_fs.o ima_queue.o ima_init.o ima_main.o ima_crypto.o ima_api.o \ ima_policy.o ima_template.o ima_template_lib.o ima-$(CONFIG_IMA_APPRAISE) += ima_appraise.o +ima-$(CONFIG_HAVE_IMA_KEXEC) += ima_kexec.o obj-$(CONFIG_IMA_BLACKLIST_KEYRING) += ima_mok.o diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index db25f54a04fe..5e6180a4da7d 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -28,6 +28,10 @@ #include "../integrity.h" +#ifdef CONFIG_HAVE_IMA_KEXEC +#include <asm/ima.h> +#endif + enum ima_show_type { IMA_SHOW_BINARY, IMA_SHOW_BINARY_NO_FIELD_LEN, IMA_SHOW_BINARY_OLD_STRING_FMT, IMA_SHOW_ASCII }; enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8 }; @@ -81,6 +85,7 @@ struct ima_template_field { /* IMA template descriptor definition */ struct ima_template_desc { + struct list_head list; char *name; char *fmt; int num_fields; @@ -102,6 +107,27 @@ struct ima_queue_entry { }; extern struct list_head ima_measurements; /* list of all measurements */ +/* Some details preceding the binary serialized measurement list */ +struct ima_kexec_hdr { + u16 version; + u16 _reserved0; + u32 _reserved1; + u64 buffer_size; + u64 count; +}; + +#ifdef CONFIG_HAVE_IMA_KEXEC +void ima_load_kexec_buffer(void); +#else +static inline void ima_load_kexec_buffer(void) {} +#endif /* CONFIG_HAVE_IMA_KEXEC */ + +/* + * The default binary_runtime_measurements list format is defined as the + * platform native format. The canonical format is defined as little-endian. + */ +extern bool ima_canonical_fmt; + /* Internal IMA function definitions */ int ima_init(void); int ima_fs_init(void); @@ -122,7 +148,12 @@ int ima_init_crypto(void); void ima_putc(struct seq_file *m, void *data, int datalen); void ima_print_digest(struct seq_file *m, u8 *digest, u32 size); struct ima_template_desc *ima_template_desc_current(void); +int ima_restore_measurement_entry(struct ima_template_entry *entry); +int ima_restore_measurement_list(loff_t bufsize, void *buf); +int ima_measurements_show(struct seq_file *m, void *v); +unsigned long ima_get_binary_runtime_size(void); int ima_init_template(void); +void ima_init_template_list(void); /* * used to protect h_table and sha_table diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 38f2ed830dd6..802d5d20f36f 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -477,11 +477,13 @@ static int ima_calc_field_array_hash_tfm(struct ima_field_data *field_data, u8 buffer[IMA_EVENT_NAME_LEN_MAX + 1] = { 0 }; u8 *data_to_hash = field_data[i].data; u32 datalen = field_data[i].len; + u32 datalen_to_hash = + !ima_canonical_fmt ? datalen : cpu_to_le32(datalen); if (strcmp(td->name, IMA_TEMPLATE_IMA_NAME) != 0) { rc = crypto_shash_update(shash, - (const u8 *) &field_data[i].len, - sizeof(field_data[i].len)); + (const u8 *) &datalen_to_hash, + sizeof(datalen_to_hash)); if (rc) break; } else if (strcmp(td->fields[i]->field_id, "n") == 0) { diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index 3df46906492d..ca303e5d2b94 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -28,6 +28,16 @@ static DEFINE_MUTEX(ima_write_mutex); +bool ima_canonical_fmt; +static int __init default_canonical_fmt_setup(char *str) +{ +#ifdef __BIG_ENDIAN + ima_canonical_fmt = 1; +#endif + return 1; +} +__setup("ima_canonical_fmt", default_canonical_fmt_setup); + static int valid_policy = 1; #define TMPBUFLEN 12 static ssize_t ima_show_htable_value(char __user *buf, size_t count, @@ -116,13 +126,13 @@ void ima_putc(struct seq_file *m, void *data, int datalen) * [eventdata length] * eventdata[n]=template specific data */ -static int ima_measurements_show(struct seq_file *m, void *v) +int ima_measurements_show(struct seq_file *m, void *v) { /* the list never shrinks, so we don't need a lock here */ struct ima_queue_entry *qe = v; struct ima_template_entry *e; char *template_name; - int namelen; + u32 pcr, namelen, template_data_len; /* temporary fields */ bool is_ima_template = false; int i; @@ -139,25 +149,29 @@ static int ima_measurements_show(struct seq_file *m, void *v) * PCR used defaults to the same (config option) in * little-endian format, unless set in policy */ - ima_putc(m, &e->pcr, sizeof(e->pcr)); + pcr = !ima_canonical_fmt ? e->pcr : cpu_to_le32(e->pcr); + ima_putc(m, &pcr, sizeof(e->pcr)); /* 2nd: template digest */ ima_putc(m, e->digest, TPM_DIGEST_SIZE); /* 3rd: template name size */ - namelen = strlen(template_name); + namelen = !ima_canonical_fmt ? strlen(template_name) : + cpu_to_le32(strlen(template_name)); ima_putc(m, &namelen, sizeof(namelen)); /* 4th: template name */ - ima_putc(m, template_name, namelen); + ima_putc(m, template_name, strlen(template_name)); /* 5th: template length (except for 'ima' template) */ if (strcmp(template_name, IMA_TEMPLATE_IMA_NAME) == 0) is_ima_template = true; - if (!is_ima_template) - ima_putc(m, &e->template_data_len, - sizeof(e->template_data_len)); + if (!is_ima_template) { + template_data_len = !ima_canonical_fmt ? e->template_data_len : + cpu_to_le32(e->template_data_len); + ima_putc(m, &template_data_len, sizeof(e->template_data_len)); + } /* 6th: template specific data */ for (i = 0; i < e->template_desc->num_fields; i++) { diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c index 2ac1f41db5c0..2967d497a665 100644 --- a/security/integrity/ima/ima_init.c +++ b/security/integrity/ima/ima_init.c @@ -129,6 +129,8 @@ int __init ima_init(void) if (rc != 0) return rc; + ima_load_kexec_buffer(); + rc = ima_add_boot_aggregate(); /* boot aggregate must be first entry */ if (rc != 0) return rc; diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c new file mode 100644 index 000000000000..e473eee913cb --- /dev/null +++ b/security/integrity/ima/ima_kexec.c @@ -0,0 +1,168 @@ +/* + * Copyright (C) 2016 IBM Corporation + * + * Authors: + * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com> + * Mimi Zohar <zohar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#include <linux/seq_file.h> +#include <linux/vmalloc.h> +#include <linux/kexec.h> +#include "ima.h" + +#ifdef CONFIG_IMA_KEXEC +static int ima_dump_measurement_list(unsigned long *buffer_size, void **buffer, + unsigned long segment_size) +{ + struct ima_queue_entry *qe; + struct seq_file file; + struct ima_kexec_hdr khdr; + int ret = 0; + + /* segment size can't change between kexec load and execute */ + file.buf = vmalloc(segment_size); + if (!file.buf) { + ret = -ENOMEM; + goto out; + } + + file.size = segment_size; + file.read_pos = 0; + file.count = sizeof(khdr); /* reserved space */ + + memset(&khdr, 0, sizeof(khdr)); + khdr.version = 1; + list_for_each_entry_rcu(qe, &ima_measurements, later) { + if (file.count < file.size) { + khdr.count++; + ima_measurements_show(&file, qe); + } else { + ret = -EINVAL; + break; + } + } + + if (ret < 0) + goto out; + + /* + * fill in reserved space with some buffer details + * (eg. version, buffer size, number of measurements) + */ + khdr.buffer_size = file.count; + if (ima_canonical_fmt) { + khdr.version = cpu_to_le16(khdr.version); + khdr.count = cpu_to_le64(khdr.count); + khdr.buffer_size = cpu_to_le64(khdr.buffer_size); + } + memcpy(file.buf, &khdr, sizeof(khdr)); + + print_hex_dump(KERN_DEBUG, "ima dump: ", DUMP_PREFIX_NONE, + 16, 1, file.buf, + file.count < 100 ? file.count : 100, true); + + *buffer_size = file.count; + *buffer = file.buf; +out: + if (ret == -EINVAL) + vfree(file.buf); + return ret; +} + +/* + * Called during kexec_file_load so that IMA can add a segment to the kexec + * image for the measurement list for the next kernel. + * + * This function assumes that kexec_mutex is held. + */ +void ima_add_kexec_buffer(struct kimage *image) +{ + struct kexec_buf kbuf = { .image = image, .buf_align = PAGE_SIZE, + .buf_min = 0, .buf_max = ULONG_MAX, + .top_down = true }; + unsigned long binary_runtime_size; + + /* use more understandable variable names than defined in kbuf */ + void *kexec_buffer = NULL; + size_t kexec_buffer_size; + size_t kexec_segment_size; + int ret; + + /* + * Reserve an extra half page of memory for additional measurements + * added during the kexec load. + */ + binary_runtime_size = ima_get_binary_runtime_size(); + if (binary_runtime_size >= ULONG_MAX - PAGE_SIZE) + kexec_segment_size = ULONG_MAX; + else + kexec_segment_size = ALIGN(ima_get_binary_runtime_size() + + PAGE_SIZE / 2, PAGE_SIZE); + if ((kexec_segment_size == ULONG_MAX) || + ((kexec_segment_size >> PAGE_SHIFT) > totalram_pages / 2)) { + pr_err("Binary measurement list too large.\n"); + return; + } + + ima_dump_measurement_list(&kexec_buffer_size, &kexec_buffer, + kexec_segment_size); + if (!kexec_buffer) { + pr_err("Not enough memory for the kexec measurement buffer.\n"); + return; + } + + kbuf.buffer = kexec_buffer; + kbuf.bufsz = kexec_buffer_size; + kbuf.memsz = kexec_segment_size; + ret = kexec_add_buffer(&kbuf); + if (ret) { + pr_err("Error passing over kexec measurement buffer.\n"); + return; + } + + ret = arch_ima_add_kexec_buffer(image, kbuf.mem, kexec_segment_size); + if (ret) { + pr_err("Error passing over kexec measurement buffer.\n"); + return; + } + + pr_debug("kexec measurement buffer for the loaded kernel at 0x%lx.\n", + kbuf.mem); +} +#endif /* IMA_KEXEC */ + +/* + * Restore the measurement list from the previous kernel. + */ +void ima_load_kexec_buffer(void) +{ + void *kexec_buffer = NULL; + size_t kexec_buffer_size = 0; + int rc; + + rc = ima_get_kexec_buffer(&kexec_buffer, &kexec_buffer_size); + switch (rc) { + case 0: + rc = ima_restore_measurement_list(kexec_buffer_size, + kexec_buffer); + if (rc != 0) + pr_err("Failed to restore the measurement list: %d\n", + rc); + + ima_free_kexec_buffer(); + break; + case -ENOTSUPP: + pr_debug("Restoring the measurement list not supported\n"); + break; + case -ENOENT: + pr_debug("No measurement list to restore\n"); + break; + default: + pr_debug("Error restoring the measurement list: %d\n", rc); + } +} diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 423d111b3b94..50818c60538b 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -418,6 +418,7 @@ static int __init init_ima(void) { int error; + ima_init_template_list(); hash_setup(CONFIG_IMA_DEFAULT_HASH); error = ima_init(); if (!error) { diff --git a/security/integrity/ima/ima_queue.c b/security/integrity/ima/ima_queue.c index 32f6ac0f96df..d9aa5ab71204 100644 --- a/security/integrity/ima/ima_queue.c +++ b/security/integrity/ima/ima_queue.c @@ -29,6 +29,11 @@ #define AUDIT_CAUSE_LEN_MAX 32 LIST_HEAD(ima_measurements); /* list of all measurements */ +#ifdef CONFIG_IMA_KEXEC +static unsigned long binary_runtime_size; +#else +static unsigned long binary_runtime_size = ULONG_MAX; +#endif /* key: inode (before secure-hashing a file) */ struct ima_h_table ima_htable = { @@ -64,12 +69,32 @@ static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value, return ret; } +/* + * Calculate the memory required for serializing a single + * binary_runtime_measurement list entry, which contains a + * couple of variable length fields (e.g template name and data). + */ +static int get_binary_runtime_size(struct ima_template_entry *entry) +{ + int size = 0; + + size += sizeof(u32); /* pcr */ + size += sizeof(entry->digest); + size += sizeof(int); /* template name size field */ + size += strlen(entry->template_desc->name) + 1; + size += sizeof(entry->template_data_len); + size += entry->template_data_len; + return size; +} + /* ima_add_template_entry helper function: - * - Add template entry to measurement list and hash table. + * - Add template entry to the measurement list and hash table, for + * all entries except those carried across kexec. * * (Called with ima_extend_list_mutex held.) */ -static int ima_add_digest_entry(struct ima_template_entry *entry) +static int ima_add_digest_entry(struct ima_template_entry *entry, + bool update_htable) { struct ima_queue_entry *qe; unsigned int key; @@ -85,11 +110,34 @@ static int ima_add_digest_entry(struct ima_template_entry *entry) list_add_tail_rcu(&qe->later, &ima_measurements); atomic_long_inc(&ima_htable.len); - key = ima_hash_key(entry->digest); - hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]); + if (update_htable) { + key = ima_hash_key(entry->digest); + hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]); + } + + if (binary_runtime_size != ULONG_MAX) { + int size; + + size = get_binary_runtime_size(entry); + binary_runtime_size = (binary_runtime_size < ULONG_MAX - size) ? + binary_runtime_size + size : ULONG_MAX; + } return 0; } +/* + * Return the amount of memory required for serializing the + * entire binary_runtime_measurement list, including the ima_kexec_hdr + * structure. + */ +unsigned long ima_get_binary_runtime_size(void) +{ + if (binary_runtime_size >= (ULONG_MAX - sizeof(struct ima_kexec_hdr))) + return ULONG_MAX; + else + return binary_runtime_size + sizeof(struct ima_kexec_hdr); +}; + static int ima_pcr_extend(const u8 *hash, int pcr) { int result = 0; @@ -103,8 +151,13 @@ static int ima_pcr_extend(const u8 *hash, int pcr) return result; } -/* Add template entry to the measurement list and hash table, - * and extend the pcr. +/* + * Add template entry to the measurement list and hash table, and + * extend the pcr. + * + * On systems which support carrying the IMA measurement list across + * kexec, maintain the total memory size required for serializing the + * binary_runtime_measurements. */ int ima_add_template_entry(struct ima_template_entry *entry, int violation, const char *op, struct inode *inode, @@ -126,7 +179,7 @@ int ima_add_template_entry(struct ima_template_entry *entry, int violation, } } - result = ima_add_digest_entry(entry); + result = ima_add_digest_entry(entry, 1); if (result < 0) { audit_cause = "ENOMEM"; audit_info = 0; @@ -149,3 +202,13 @@ out: op, audit_cause, result, audit_info); return result; } + +int ima_restore_measurement_entry(struct ima_template_entry *entry) +{ + int result = 0; + + mutex_lock(&ima_extend_list_mutex); + result = ima_add_digest_entry(entry, 0); + mutex_unlock(&ima_extend_list_mutex); + return result; +} diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index febd12ed9b55..cebb37c63629 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -15,16 +15,20 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/rculist.h> #include "ima.h" #include "ima_template_lib.h" -static struct ima_template_desc defined_templates[] = { +static struct ima_template_desc builtin_templates[] = { {.name = IMA_TEMPLATE_IMA_NAME, .fmt = IMA_TEMPLATE_IMA_FMT}, {.name = "ima-ng", .fmt = "d-ng|n-ng"}, {.name = "ima-sig", .fmt = "d-ng|n-ng|sig"}, {.name = "", .fmt = ""}, /* placeholder for a custom format */ }; +static LIST_HEAD(defined_templates); +static DEFINE_SPINLOCK(template_list); + static struct ima_template_field supported_fields[] = { {.field_id = "d", .field_init = ima_eventdigest_init, .field_show = ima_show_template_digest}, @@ -37,6 +41,7 @@ static struct ima_template_field supported_fields[] = { {.field_id = "sig", .field_init = ima_eventsig_init, .field_show = ima_show_template_sig}, }; +#define MAX_TEMPLATE_NAME_LEN 15 static struct ima_template_desc *ima_template; static struct ima_template_desc *lookup_template_desc(const char *name); @@ -52,6 +57,8 @@ static int __init ima_template_setup(char *str) if (ima_template) return 1; + ima_init_template_list(); + /* * Verify that a template with the supplied name exists. * If not, use CONFIG_IMA_DEFAULT_TEMPLATE. @@ -80,7 +87,7 @@ __setup("ima_template=", ima_template_setup); static int __init ima_template_fmt_setup(char *str) { - int num_templates = ARRAY_SIZE(defined_templates); + int num_templates = ARRAY_SIZE(builtin_templates); if (ima_template) return 1; @@ -91,22 +98,28 @@ static int __init ima_template_fmt_setup(char *str) return 1; } - defined_templates[num_templates - 1].fmt = str; - ima_template = defined_templates + num_templates - 1; + builtin_templates[num_templates - 1].fmt = str; + ima_template = builtin_templates + num_templates - 1; + return 1; } __setup("ima_template_fmt=", ima_template_fmt_setup); static struct ima_template_desc *lookup_template_desc(const char *name) { - int i; - - for (i = 0; i < ARRAY_SIZE(defined_templates); i++) { - if (strcmp(defined_templates[i].name, name) == 0) - return defined_templates + i; + struct ima_template_desc *template_desc; + int found = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(template_desc, &defined_templates, list) { + if ((strcmp(template_desc->name, name) == 0) || + (strcmp(template_desc->fmt, name) == 0)) { + found = 1; + break; + } } - - return NULL; + rcu_read_unlock(); + return found ? template_desc : NULL; } static struct ima_template_field *lookup_template_field(const char *field_id) @@ -142,9 +155,14 @@ static int template_desc_init_fields(const char *template_fmt, { const char *template_fmt_ptr; struct ima_template_field *found_fields[IMA_TEMPLATE_NUM_FIELDS_MAX]; - int template_num_fields = template_fmt_size(template_fmt); + int template_num_fields; int i, len; + if (num_fields && *num_fields > 0) /* already initialized? */ + return 0; + + template_num_fields = template_fmt_size(template_fmt); + if (template_num_fields > IMA_TEMPLATE_NUM_FIELDS_MAX) { pr_err("format string '%s' contains too many fields\n", template_fmt); @@ -182,11 +200,28 @@ static int template_desc_init_fields(const char *template_fmt, return 0; } +void ima_init_template_list(void) +{ + int i; + + if (!list_empty(&defined_templates)) + return; + + spin_lock(&template_list); + for (i = 0; i < ARRAY_SIZE(builtin_templates); i++) { + list_add_tail_rcu(&builtin_templates[i].list, + &defined_templates); + } + spin_unlock(&template_list); +} + struct ima_template_desc *ima_template_desc_current(void) { - if (!ima_template) + if (!ima_template) { + ima_init_template_list(); ima_template = lookup_template_desc(CONFIG_IMA_DEFAULT_TEMPLATE); + } return ima_template; } @@ -205,3 +240,239 @@ int __init ima_init_template(void) return result; } + +static struct ima_template_desc *restore_template_fmt(char *template_name) +{ + struct ima_template_desc *template_desc = NULL; + int ret; + + ret = template_desc_init_fields(template_name, NULL, NULL); + if (ret < 0) { + pr_err("attempting to initialize the template \"%s\" failed\n", + template_name); + goto out; + } + + template_desc = kzalloc(sizeof(*template_desc), GFP_KERNEL); + if (!template_desc) + goto out; + + template_desc->name = ""; + template_desc->fmt = kstrdup(template_name, GFP_KERNEL); + if (!template_desc->fmt) + goto out; + + spin_lock(&template_list); + list_add_tail_rcu(&template_desc->list, &defined_templates); + spin_unlock(&template_list); +out: + return template_desc; +} + +static int ima_restore_template_data(struct ima_template_desc *template_desc, + void *template_data, + int template_data_size, + struct ima_template_entry **entry) +{ + struct binary_field_data { + u32 len; + u8 data[0]; + } __packed; + + struct binary_field_data *field_data; + int offset = 0; + int ret = 0; + int i; + + *entry = kzalloc(sizeof(**entry) + + template_desc->num_fields * sizeof(struct ima_field_data), + GFP_NOFS); + if (!*entry) + return -ENOMEM; + + (*entry)->template_desc = template_desc; + for (i = 0; i < template_desc->num_fields; i++) { + field_data = template_data + offset; + + /* Each field of the template data is prefixed with a length. */ + if (offset > (template_data_size - sizeof(*field_data))) { + pr_err("Restoring the template field failed\n"); + ret = -EINVAL; + break; + } + offset += sizeof(*field_data); + + if (ima_canonical_fmt) + field_data->len = le32_to_cpu(field_data->len); + + if (offset > (template_data_size - field_data->len)) { + pr_err("Restoring the template field data failed\n"); + ret = -EINVAL; + break; + } + offset += field_data->len; + + (*entry)->template_data[i].len = field_data->len; + (*entry)->template_data_len += sizeof(field_data->len); + + (*entry)->template_data[i].data = + kzalloc(field_data->len + 1, GFP_KERNEL); + if (!(*entry)->template_data[i].data) { + ret = -ENOMEM; + break; + } + memcpy((*entry)->template_data[i].data, field_data->data, + field_data->len); + (*entry)->template_data_len += field_data->len; + } + + if (ret < 0) { + ima_free_template_entry(*entry); + *entry = NULL; + } + + return ret; +} + +/* Restore the serialized binary measurement list without extending PCRs. */ +int ima_restore_measurement_list(loff_t size, void *buf) +{ + struct binary_hdr_v1 { + u32 pcr; + u8 digest[TPM_DIGEST_SIZE]; + u32 template_name_len; + char template_name[0]; + } __packed; + char template_name[MAX_TEMPLATE_NAME_LEN]; + + struct binary_data_v1 { + u32 template_data_size; + char template_data[0]; + } __packed; + + struct ima_kexec_hdr *khdr = buf; + struct binary_hdr_v1 *hdr_v1; + struct binary_data_v1 *data_v1; + + void *bufp = buf + sizeof(*khdr); + void *bufendp; + struct ima_template_entry *entry; + struct ima_template_desc *template_desc; + unsigned long count = 0; + int ret = 0; + + if (!buf || size < sizeof(*khdr)) + return 0; + + if (ima_canonical_fmt) { + khdr->version = le16_to_cpu(khdr->version); + khdr->count = le64_to_cpu(khdr->count); + khdr->buffer_size = le64_to_cpu(khdr->buffer_size); + } + + if (khdr->version != 1) { + pr_err("attempting to restore a incompatible measurement list"); + return -EINVAL; + } + + if (khdr->count > ULONG_MAX - 1) { + pr_err("attempting to restore too many measurements"); + return -EINVAL; + } + + /* + * ima kexec buffer prefix: version, buffer size, count + * v1 format: pcr, digest, template-name-len, template-name, + * template-data-size, template-data + */ + bufendp = buf + khdr->buffer_size; + while ((bufp < bufendp) && (count++ < khdr->count)) { + hdr_v1 = bufp; + if (bufp > (bufendp - sizeof(*hdr_v1))) { + pr_err("attempting to restore partial measurement\n"); + ret = -EINVAL; + break; + } + bufp += sizeof(*hdr_v1); + + if (ima_canonical_fmt) + hdr_v1->template_name_len = + le32_to_cpu(hdr_v1->template_name_len); + + if ((hdr_v1->template_name_len >= MAX_TEMPLATE_NAME_LEN) || + (bufp > (bufendp - hdr_v1->template_name_len))) { + pr_err("attempting to restore a template name \ + that is too long\n"); + ret = -EINVAL; + break; + } + data_v1 = bufp += (u_int8_t)hdr_v1->template_name_len; + + /* template name is not null terminated */ + memcpy(template_name, hdr_v1->template_name, + hdr_v1->template_name_len); + template_name[hdr_v1->template_name_len] = 0; + + if (strcmp(template_name, "ima") == 0) { + pr_err("attempting to restore an unsupported \ + template \"%s\" failed\n", template_name); + ret = -EINVAL; + break; + } + + template_desc = lookup_template_desc(template_name); + if (!template_desc) { + template_desc = restore_template_fmt(template_name); + if (!template_desc) + break; + } + + /* + * Only the running system's template format is initialized + * on boot. As needed, initialize the other template formats. + */ + ret = template_desc_init_fields(template_desc->fmt, + &(template_desc->fields), + &(template_desc->num_fields)); + if (ret < 0) { + pr_err("attempting to restore the template fmt \"%s\" \ + failed\n", template_desc->fmt); + ret = -EINVAL; + break; + } + + if (bufp > (bufendp - sizeof(data_v1->template_data_size))) { + pr_err("restoring the template data size failed\n"); + ret = -EINVAL; + break; + } + bufp += (u_int8_t) sizeof(data_v1->template_data_size); + + if (ima_canonical_fmt) + data_v1->template_data_size = + le32_to_cpu(data_v1->template_data_size); + + if (bufp > (bufendp - data_v1->template_data_size)) { + pr_err("restoring the template data failed\n"); + ret = -EINVAL; + break; + } + bufp += data_v1->template_data_size; + + ret = ima_restore_template_data(template_desc, + data_v1->template_data, + data_v1->template_data_size, + &entry); + if (ret < 0) + break; + + memcpy(entry->digest, hdr_v1->digest, TPM_DIGEST_SIZE); + entry->pcr = + !ima_canonical_fmt ? hdr_v1->pcr : le32_to_cpu(hdr_v1->pcr); + ret = ima_restore_measurement_entry(entry); + if (ret < 0) + break; + + } + return ret; +} diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c index f9bae04ba176..f9ba37b3928d 100644 --- a/security/integrity/ima/ima_template_lib.c +++ b/security/integrity/ima/ima_template_lib.c @@ -103,8 +103,11 @@ static void ima_show_template_data_binary(struct seq_file *m, u32 len = (show == IMA_SHOW_BINARY_OLD_STRING_FMT) ? strlen(field_data->data) : field_data->len; - if (show != IMA_SHOW_BINARY_NO_FIELD_LEN) - ima_putc(m, &len, sizeof(len)); + if (show != IMA_SHOW_BINARY_NO_FIELD_LEN) { + u32 field_len = !ima_canonical_fmt ? len : cpu_to_le32(len); + + ima_putc(m, &field_len, sizeof(field_len)); + } if (!len) return; diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 682b73af7766..838ffa78cfda 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, * the execve(). */ if (get_user_pages_remote(current, bprm->mm, pos, 1, - FOLL_FORCE, &page, NULL) <= 0) + FOLL_FORCE, &page, NULL, NULL) <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; diff --git a/tools/include/asm/bug.h b/tools/include/asm/bug.h index 9e5f4846967f..beda1a884b50 100644 --- a/tools/include/asm/bug.h +++ b/tools/include/asm/bug.h @@ -12,6 +12,17 @@ unlikely(__ret_warn_on); \ }) +#define WARN_ON_ONCE(condition) ({ \ + static int __warned; \ + int __ret_warn_once = !!(condition); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + __warned = true; \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn_once); \ +}) + #define WARN_ONCE(condition, format...) ({ \ static int __warned; \ int __ret_warn_once = !!(condition); \ diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 43c1c5021e4b..eef41d500e9e 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -35,6 +35,32 @@ static inline void bitmap_zero(unsigned long *dst, int nbits) } } +static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) +{ + unsigned int nlongs = BITS_TO_LONGS(nbits); + if (!small_const_nbits(nbits)) { + unsigned int len = (nlongs - 1) * sizeof(unsigned long); + memset(dst, 0xff, len); + } + dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits); +} + +static inline int bitmap_empty(const unsigned long *src, unsigned nbits) +{ + if (small_const_nbits(nbits)) + return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); + + return find_first_bit(src, nbits) == nbits; +} + +static inline int bitmap_full(const unsigned long *src, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); + + return find_first_zero_bit(src, nbits) == nbits; +} + static inline int bitmap_weight(const unsigned long *src, int nbits) { if (small_const_nbits(nbits)) diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index d08e214ec6e7..be93ab02b490 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -719,14 +719,14 @@ sub set_value { if ($buildonly && $lvalue =~ /^TEST_TYPE(\[.*\])?$/ && $prvalue ne "build") { # Note if a test is something other than build, then we - # will need other manditory options. + # will need other mandatory options. if ($prvalue ne "install") { # for bisect, we need to check BISECT_TYPE if ($prvalue ne "bisect") { $buildonly = 0; } } else { - # install still limits some manditory options. + # install still limits some mandatory options. $buildonly = 2; } } @@ -735,7 +735,7 @@ sub set_value { if ($prvalue ne "install") { $buildonly = 0; } else { - # install still limits some manditory options. + # install still limits some mandatory options. $buildonly = 2; } } @@ -3989,7 +3989,7 @@ sub make_min_config { } } - # Save off all the current mandidory configs + # Save off all the current mandatory configs open (OUT, ">$temp_config") or die "Can't write to $temp_config"; foreach my $config (keys %keep_configs) { diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index f2e07f2fd4b4..5a616a3a61b4 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -1,10 +1,14 @@ -CFLAGS += -I. -g -O2 -Wall -D_LGPL_SOURCE +CFLAGS += -I. -I../../include -g -O2 -Wall -D_LGPL_SOURCE LDFLAGS += -lpthread -lurcu TARGETS = main OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \ - regression1.o regression2.o regression3.o multiorder.o \ - iteration_check.o + regression1.o regression2.o regression3.o multiorder.o idr.o \ + iteration_check.o benchmark.o + +ifdef BENCHMARK + CFLAGS += -DBENCHMARK=1 +endif targets: $(TARGETS) @@ -14,7 +18,13 @@ main: $(OFILES) clean: $(RM) -f $(TARGETS) *.o radix-tree.c -$(OFILES): *.h */*.h ../../../include/linux/radix-tree.h ../../include/linux/*.h +find_next_bit.o: ../../lib/find_bit.c + $(CC) $(CFLAGS) -c -o $@ $< + +$(OFILES): *.h */*.h \ + ../../include/linux/*.h \ + ../../../include/linux/radix-tree.h \ + ../../../include/linux/idr.h radix-tree.c: ../../../lib/radix-tree.c sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ diff --git a/tools/testing/radix-tree/benchmark.c b/tools/testing/radix-tree/benchmark.c new file mode 100644 index 000000000000..215ca86c7605 --- /dev/null +++ b/tools/testing/radix-tree/benchmark.c @@ -0,0 +1,98 @@ +/* + * benchmark.c: + * Author: Konstantin Khlebnikov <koct9i@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/radix-tree.h> +#include <linux/slab.h> +#include <linux/errno.h> +#include <time.h> +#include "test.h" + +#define NSEC_PER_SEC 1000000000L + +static long long benchmark_iter(struct radix_tree_root *root, bool tagged) +{ + volatile unsigned long sink = 0; + struct radix_tree_iter iter; + struct timespec start, finish; + long long nsec; + int l, loops = 1; + void **slot; + +#ifdef BENCHMARK +again: +#endif + clock_gettime(CLOCK_MONOTONIC, &start); + for (l = 0; l < loops; l++) { + if (tagged) { + radix_tree_for_each_tagged(slot, root, &iter, 0, 0) + sink ^= (unsigned long)slot; + } else { + radix_tree_for_each_slot(slot, root, &iter, 0) + sink ^= (unsigned long)slot; + } + } + clock_gettime(CLOCK_MONOTONIC, &finish); + + nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC + + (finish.tv_nsec - start.tv_nsec); + +#ifdef BENCHMARK + if (loops == 1 && nsec * 5 < NSEC_PER_SEC) { + loops = NSEC_PER_SEC / nsec / 4 + 1; + goto again; + } +#endif + + nsec /= loops; + return nsec; +} + +static void benchmark_size(unsigned long size, unsigned long step, int order) +{ + RADIX_TREE(tree, GFP_KERNEL); + long long normal, tagged; + unsigned long index; + + for (index = 0 ; index < size ; index += step) { + item_insert_order(&tree, index, order); + radix_tree_tag_set(&tree, index, 0); + } + + tagged = benchmark_iter(&tree, true); + normal = benchmark_iter(&tree, false); + + printf("Size %ld, step %6ld, order %d tagged %10lld ns, normal %10lld ns\n", + size, step, order, tagged, normal); + + item_kill_tree(&tree); + rcu_barrier(); +} + +void benchmark(void) +{ + unsigned long size[] = {1 << 10, 1 << 20, 0}; + unsigned long step[] = {1, 2, 7, 15, 63, 64, 65, + 128, 256, 512, 12345, 0}; + int c, s; + + printf("starting benchmarks\n"); + printf("RADIX_TREE_MAP_SHIFT = %d\n", RADIX_TREE_MAP_SHIFT); + + for (c = 0; size[c]; c++) + for (s = 0; step[s]; s++) + benchmark_size(size[c], step[s], 0); + + for (c = 0; size[c]; c++) + for (s = 0; step[s]; s++) + benchmark_size(size[c], step[s] << 9, 9); +} diff --git a/tools/testing/radix-tree/find_next_bit.c b/tools/testing/radix-tree/find_next_bit.c deleted file mode 100644 index d1c2178bb2d4..000000000000 --- a/tools/testing/radix-tree/find_next_bit.c +++ /dev/null @@ -1,57 +0,0 @@ -/* find_next_bit.c: fallback find next bit implementation - * - * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/types.h> -#include <linux/bitops.h> - -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) - -/* - * Find the next set bit in a memory region. - */ -unsigned long find_next_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) -{ - const unsigned long *p = addr + BITOP_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG-1); - unsigned long tmp; - - if (offset >= size) - return size; - size -= result; - offset %= BITS_PER_LONG; - if (offset) { - tmp = *(p++); - tmp &= (~0UL << offset); - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - } - while (size & ~(BITS_PER_LONG-1)) { - if ((tmp = *(p++))) - goto found_middle; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; - } - if (!size) - return result; - tmp = *p; - -found_first: - tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __ffs(tmp); -} diff --git a/tools/testing/radix-tree/idr.c b/tools/testing/radix-tree/idr.c new file mode 100644 index 000000000000..4033dbcbd873 --- /dev/null +++ b/tools/testing/radix-tree/idr.c @@ -0,0 +1,160 @@ +/* + * idr.c: Test the IDR API + * Copyright (c) 2016 Matthew Wilcox <willy@infradead.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/idr.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/errno.h> + +#include "test.h" + +#define DUMMY_PTR ((void *)0x12) + +int item_idr_free(int id, void *p, void *data) +{ + struct item *item = p; + assert(item->index == id); + idr_remove(data, id); + free(p); + + return 0; +} + +void item_idr_remove(struct idr *idr, int id) +{ + struct item *item = idr_find(idr, id); + assert(item->index == id); + idr_remove(idr, id); + free(item); +} + +void idr_alloc_test(void) +{ + unsigned long i; + DEFINE_IDR(idr); + + assert(idr_alloc_cyclic(&idr, DUMMY_PTR, 0, 0x4000, GFP_KERNEL) == 0); + assert(idr_alloc_cyclic(&idr, DUMMY_PTR, 0x3ffd, 0x4000, GFP_KERNEL) == 0x3ffd); + idr_remove(&idr, 0x3ffd); + idr_remove(&idr, 0); + + for (i = 0x3ffe; i < 0x4003; i++) { + int id; + struct item *item; + + if (i < 0x4000) + item = item_create(i, 0); + else + item = item_create(i - 0x3fff, 0); + + id = idr_alloc_cyclic(&idr, item, 1, 0x4000, GFP_KERNEL); + assert(id == item->index); + } + + idr_for_each(&idr, item_idr_free, &idr); +} + +void idr_replace_test(void) +{ + DEFINE_IDR(idr); + + idr_alloc(&idr, (void *)-1, 10, 11, GFP_KERNEL); + idr_replace(&idr, &idr, 10); + + idr_destroy(&idr); +} + +void idr_checks(void) +{ + unsigned long i; + DEFINE_IDR(idr); + + for (i = 0; i < 10000; i++) { + struct item *item = item_create(i, 0); + assert(idr_alloc(&idr, item, 0, 20000, GFP_KERNEL) == i); + } + + assert(idr_alloc(&idr, DUMMY_PTR, 5, 30, GFP_KERNEL) < 0); + + for (i = 0; i < 5000; i++) + item_idr_remove(&idr, i); + + idr_for_each(&idr, item_idr_free, &idr); + + assert(idr_is_empty(&idr)); + + for (i = INT_MAX - 3UL; i < INT_MAX + 1UL; i++) { + struct item *item = item_create(i, 0); + assert(idr_alloc(&idr, item, i, i + 10, GFP_KERNEL) == i); + } + assert(idr_alloc(&idr, DUMMY_PTR, i - 2, i, GFP_KERNEL) == -ENOSPC); + + idr_destroy(&idr); + idr_destroy(&idr); + + assert(idr_is_empty(&idr)); + + for (i = 1; i < 10000; i++) { + struct item *item = item_create(i, 0); + assert(idr_alloc(&idr, item, 1, 20000, GFP_KERNEL) == i); + } + + idr_destroy(&idr); + + idr_replace_test(); + + idr_alloc_test(); +} + +void ida_checks(void) +{ + DEFINE_IDA(ida); + + unsigned long i; + int id; + + for (i = 0; i < 10000; i++) { + ida_pre_get(&ida, GFP_KERNEL); + ida_get_new(&ida, &id); + assert(id == i); + } + + ida_remove(&ida, 20); + ida_remove(&ida, 21); + for (i = 0; i < 3; i++) { + ida_pre_get(&ida, GFP_KERNEL); + ida_get_new(&ida, &id); + if (i == 2) + assert(id == 10000); + } + + for (i = 0; i < 5000; i++) + ida_remove(&ida, i); + + ida_pre_get(&ida, GFP_KERNEL); + ida_get_new_above(&ida, 5000, &id); + assert(id == 10001); + + ida_destroy(&ida); + + assert(ida_is_empty(&ida)); + + ida_pre_get(&ida, GFP_KERNEL); + ida_get_new_above(&ida, 1, &id); + assert(id == 1); + + ida_remove(&ida, id); + ida_destroy(&ida); + + radix_tree_cpu_dead(1); +} diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c index 9adb8e7415a6..7572b7ed930e 100644 --- a/tools/testing/radix-tree/iteration_check.c +++ b/tools/testing/radix-tree/iteration_check.c @@ -16,35 +16,50 @@ #include <pthread.h> #include "test.h" -#define NUM_THREADS 4 -#define TAG 0 +#define NUM_THREADS 5 +#define MAX_IDX 100 +#define TAG 0 +#define NEW_TAG 1 + static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_t threads[NUM_THREADS]; -RADIX_TREE(tree, GFP_KERNEL); -bool test_complete; +static unsigned int seeds[3]; +static RADIX_TREE(tree, GFP_KERNEL); +static bool test_complete; +static int max_order; /* relentlessly fill the tree with tagged entries */ static void *add_entries_fn(void *arg) { - int pgoff; + rcu_register_thread(); while (!test_complete) { - for (pgoff = 0; pgoff < 100; pgoff++) { + unsigned long pgoff; + int order; + + for (pgoff = 0; pgoff < MAX_IDX; pgoff++) { pthread_mutex_lock(&tree_lock); - if (item_insert(&tree, pgoff) == 0) - item_tag_set(&tree, pgoff, TAG); + for (order = max_order; order >= 0; order--) { + if (item_insert_order(&tree, pgoff, order) + == 0) { + item_tag_set(&tree, pgoff, TAG); + break; + } + } pthread_mutex_unlock(&tree_lock); } } + rcu_unregister_thread(); + return NULL; } /* * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find * things that have been removed and randomly resetting our iteration to the - * next chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and - * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a + * next chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and + * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a * NULL 'slot' variable. */ static void *tagged_iteration_fn(void *arg) @@ -52,17 +67,12 @@ static void *tagged_iteration_fn(void *arg) struct radix_tree_iter iter; void **slot; + rcu_register_thread(); + while (!test_complete) { rcu_read_lock(); radix_tree_for_each_tagged(slot, &tree, &iter, 0, TAG) { - void *entry; - int i; - - /* busy wait to let removals happen */ - for (i = 0; i < 1000000; i++) - ; - - entry = radix_tree_deref_slot(slot); + void *entry = radix_tree_deref_slot(slot); if (unlikely(!entry)) continue; @@ -71,20 +81,26 @@ static void *tagged_iteration_fn(void *arg) continue; } - if (rand() % 50 == 0) - slot = radix_tree_iter_next(&iter); + if (rand_r(&seeds[0]) % 50 == 0) { + slot = radix_tree_iter_resume(slot, &iter); + rcu_read_unlock(); + rcu_barrier(); + rcu_read_lock(); + } } rcu_read_unlock(); } + rcu_unregister_thread(); + return NULL; } /* * Iterate over the entries, doing a radix_tree_iter_retry() as we find things * that have been removed and randomly resetting our iteration to the next - * chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and - * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a + * chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and + * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a * NULL 'slot' variable. */ static void *untagged_iteration_fn(void *arg) @@ -92,17 +108,12 @@ static void *untagged_iteration_fn(void *arg) struct radix_tree_iter iter; void **slot; + rcu_register_thread(); + while (!test_complete) { rcu_read_lock(); radix_tree_for_each_slot(slot, &tree, &iter, 0) { - void *entry; - int i; - - /* busy wait to let removals happen */ - for (i = 0; i < 1000000; i++) - ; - - entry = radix_tree_deref_slot(slot); + void *entry = radix_tree_deref_slot(slot); if (unlikely(!entry)) continue; @@ -111,12 +122,18 @@ static void *untagged_iteration_fn(void *arg) continue; } - if (rand() % 50 == 0) - slot = radix_tree_iter_next(&iter); + if (rand_r(&seeds[1]) % 50 == 0) { + slot = radix_tree_iter_resume(slot, &iter); + rcu_read_unlock(); + rcu_barrier(); + rcu_read_lock(); + } } rcu_read_unlock(); } + rcu_unregister_thread(); + return NULL; } @@ -126,47 +143,71 @@ static void *untagged_iteration_fn(void *arg) */ static void *remove_entries_fn(void *arg) { + rcu_register_thread(); + while (!test_complete) { int pgoff; - pgoff = rand() % 100; + pgoff = rand_r(&seeds[2]) % MAX_IDX; pthread_mutex_lock(&tree_lock); item_delete(&tree, pgoff); pthread_mutex_unlock(&tree_lock); } + rcu_unregister_thread(); + + return NULL; +} + +static void *tag_entries_fn(void *arg) +{ + rcu_register_thread(); + + while (!test_complete) { + tag_tagged_items(&tree, &tree_lock, 0, MAX_IDX, 10, TAG, + NEW_TAG); + } + rcu_unregister_thread(); return NULL; } /* This is a unit test for a bug found by the syzkaller tester */ -void iteration_test(void) +void iteration_test(unsigned order, unsigned test_duration) { int i; - printf("Running iteration tests for 10 seconds\n"); + printf("Running %siteration tests for %d seconds\n", + order > 0 ? "multiorder " : "", test_duration); - srand(time(0)); + max_order = order; test_complete = false; + for (i = 0; i < 3; i++) + seeds[i] = rand(); + if (pthread_create(&threads[0], NULL, tagged_iteration_fn, NULL)) { - perror("pthread_create"); + perror("create tagged iteration thread"); exit(1); } if (pthread_create(&threads[1], NULL, untagged_iteration_fn, NULL)) { - perror("pthread_create"); + perror("create untagged iteration thread"); exit(1); } if (pthread_create(&threads[2], NULL, add_entries_fn, NULL)) { - perror("pthread_create"); + perror("create add entry thread"); exit(1); } if (pthread_create(&threads[3], NULL, remove_entries_fn, NULL)) { - perror("pthread_create"); + perror("create remove entry thread"); + exit(1); + } + if (pthread_create(&threads[4], NULL, tag_entries_fn, NULL)) { + perror("create tag entry thread"); exit(1); } - sleep(10); + sleep(test_duration); test_complete = true; for (i = 0; i < NUM_THREADS; i++) { diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c index 154823737b20..ff0452e8a0c4 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/radix-tree/linux.c @@ -9,6 +9,7 @@ #include <urcu/uatomic.h> int nr_allocated; +int preempt_count; void *mempool_alloc(mempool_t *pool, int gfp_mask) { @@ -33,7 +34,12 @@ mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, void *kmem_cache_alloc(struct kmem_cache *cachep, int flags) { - void *ret = malloc(cachep->size); + void *ret; + + if (flags & __GFP_NOWARN) + return NULL; + + ret = malloc(cachep->size); if (cachep->ctor) cachep->ctor(ret); uatomic_inc(&nr_allocated); @@ -48,6 +54,21 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) free(objp); } +void *kmalloc(size_t size, gfp_t gfp) +{ + void *ret = malloc(size); + uatomic_inc(&nr_allocated); + return ret; +} + +void kfree(void *p) +{ + if (!p) + return; + uatomic_dec(&nr_allocated); + free(p); +} + struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t offset, unsigned long flags, void (*ctor)(void *)) diff --git a/tools/testing/radix-tree/linux/bitops.h b/tools/testing/radix-tree/linux/bitops.h index 71d58427ab60..a13e9bc76eec 100644 --- a/tools/testing/radix-tree/linux/bitops.h +++ b/tools/testing/radix-tree/linux/bitops.h @@ -2,9 +2,14 @@ #define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ #include <linux/types.h> +#include <linux/bitops/find.h> +#include <linux/bitops/hweight.h> +#include <linux/kernel.h> -#define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) +#define BITS_PER_BYTE 8 +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) /** * __set_bit - Set a bit in memory @@ -17,16 +22,16 @@ */ static inline void __set_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); *p |= mask; } static inline void __clear_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); *p &= ~mask; } @@ -42,8 +47,8 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) */ static inline void __change_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); *p ^= mask; } @@ -59,8 +64,8 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) */ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old | mask; @@ -78,8 +83,8 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) */ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old & ~mask; @@ -90,8 +95,8 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old ^ mask; @@ -105,7 +110,7 @@ static inline int __test_and_change_bit(int nr, */ static inline int test_bit(int nr, const volatile unsigned long *addr) { - return 1UL & (addr[BITOP_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); + return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } /** @@ -147,4 +152,9 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset); +static inline unsigned long hweight_long(unsigned long w) +{ + return sizeof(w) == 4 ? hweight32(w) : hweight64(w); +} + #endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */ diff --git a/tools/testing/radix-tree/linux/bitops/non-atomic.h b/tools/testing/radix-tree/linux/bitops/non-atomic.h index 46a825cf2ae1..6a1bcb9d2c4a 100644 --- a/tools/testing/radix-tree/linux/bitops/non-atomic.h +++ b/tools/testing/radix-tree/linux/bitops/non-atomic.h @@ -3,7 +3,6 @@ #include <asm/types.h> -#define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) /** @@ -17,7 +16,7 @@ */ static inline void __set_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); + unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); *p |= mask; @@ -25,7 +24,7 @@ static inline void __set_bit(int nr, volatile unsigned long *addr) static inline void __clear_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); + unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); *p &= ~mask; @@ -42,7 +41,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) */ static inline void __change_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); + unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); *p ^= mask; @@ -59,7 +58,7 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) */ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); + unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); unsigned long old = *p; @@ -78,7 +77,7 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) */ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); + unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); unsigned long old = *p; @@ -90,7 +89,7 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) { - unsigned long mask = BITOP_MASK(nr); + unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); unsigned long old = *p; diff --git a/tools/testing/radix-tree/linux/bug.h b/tools/testing/radix-tree/linux/bug.h index ccbe444977df..23b8ed52f8c8 100644 --- a/tools/testing/radix-tree/linux/bug.h +++ b/tools/testing/radix-tree/linux/bug.h @@ -1 +1 @@ -#define WARN_ON_ONCE(x) assert(x) +#include "asm/bug.h" diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h index 5201b915f631..5b09b2ce6c33 100644 --- a/tools/testing/radix-tree/linux/gfp.h +++ b/tools/testing/radix-tree/linux/gfp.h @@ -3,8 +3,24 @@ #define __GFP_BITS_SHIFT 26 #define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) -#define __GFP_WAIT 1 -#define __GFP_ACCOUNT 0 -#define __GFP_NOWARN 0 + +#define __GFP_HIGH 0x20u +#define __GFP_IO 0x40u +#define __GFP_FS 0x80u +#define __GFP_NOWARN 0x200u +#define __GFP_ATOMIC 0x80000u +#define __GFP_ACCOUNT 0x100000u +#define __GFP_DIRECT_RECLAIM 0x400000u +#define __GFP_KSWAPD_RECLAIM 0x2000000u + +#define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM) + +#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) + +static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) +{ + return !!(gfp_flags & __GFP_DIRECT_RECLAIM); +} #endif diff --git a/tools/testing/radix-tree/linux/idr.h b/tools/testing/radix-tree/linux/idr.h new file mode 100644 index 000000000000..4e342f2e37cf --- /dev/null +++ b/tools/testing/radix-tree/linux/idr.h @@ -0,0 +1 @@ +#include "../../../../include/linux/idr.h" diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index be98a47b4e1b..7d214e9ce021 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -8,9 +8,14 @@ #include <limits.h> #include "../../include/linux/compiler.h" +#include "../../include/linux/err.h" #include "../../../include/linux/kconfig.h" +#ifdef BENCHMARK +#define RADIX_TREE_MAP_SHIFT 6 +#else #define RADIX_TREE_MAP_SHIFT 3 +#endif #ifndef NULL #define NULL 0 @@ -25,6 +30,7 @@ #define __force #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) #define pr_debug printk +#define pr_cont printk #define smp_rmb() barrier() #define smp_wmb() barrier() @@ -36,6 +42,7 @@ const typeof( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type, member) );}) #define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) < (b) ? (b) : (a)) #define cond_resched() sched_yield() @@ -43,4 +50,17 @@ static inline int in_interrupt(void) { return 0; } + +/* + * This looks more complex than it should be. But we need to + * get the type for the ~ right in round_down (it needs to be + * as wide as the result!), and we want to evaluate the macro + * arguments just once each. + */ +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) + +#define xchg(ptr, x) uatomic_xchg(ptr, x) + #endif /* _KERNEL_H */ diff --git a/tools/testing/radix-tree/linux/preempt.h b/tools/testing/radix-tree/linux/preempt.h index 6210672e3baa..65c04c226965 100644 --- a/tools/testing/radix-tree/linux/preempt.h +++ b/tools/testing/radix-tree/linux/preempt.h @@ -1,4 +1,4 @@ -/* */ +extern int preempt_count; -#define preempt_disable() do { } while (0) -#define preempt_enable() do { } while (0) +#define preempt_disable() uatomic_inc(&preempt_count) +#define preempt_enable() uatomic_dec(&preempt_count) diff --git a/tools/testing/radix-tree/linux/slab.h b/tools/testing/radix-tree/linux/slab.h index 6d5a34770fd4..446639f78fc1 100644 --- a/tools/testing/radix-tree/linux/slab.h +++ b/tools/testing/radix-tree/linux/slab.h @@ -7,10 +7,8 @@ #define SLAB_PANIC 2 #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ -static inline int gfpflags_allow_blocking(gfp_t mask) -{ - return 1; -} +void *kmalloc(size_t size, gfp_t); +void kfree(void *); struct kmem_cache { int size; diff --git a/tools/testing/radix-tree/linux/types.h b/tools/testing/radix-tree/linux/types.h index faa0b6ff9ca8..8491d89873bb 100644 --- a/tools/testing/radix-tree/linux/types.h +++ b/tools/testing/radix-tree/linux/types.h @@ -6,8 +6,6 @@ #define __rcu #define __read_mostly -#define BITS_PER_LONG (sizeof(long) * 8) - static inline void INIT_LIST_HEAD(struct list_head *list) { list->next = list; diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c index daa9010693e8..ddd90a11db3f 100644 --- a/tools/testing/radix-tree/main.c +++ b/tools/testing/radix-tree/main.c @@ -3,6 +3,7 @@ #include <unistd.h> #include <time.h> #include <assert.h> +#include <limits.h> #include <linux/slab.h> #include <linux/radix-tree.h> @@ -67,7 +68,6 @@ void big_gang_check(bool long_run) for (i = 0; i < (long_run ? 1000 : 3); i++) { __big_gang_check(); - srand(time(0)); printf("%d ", i); fflush(stdout); } @@ -206,8 +206,7 @@ void copy_tag_check(void) } // printf("\ncopying tags...\n"); - cur = start; - tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, ITEMS, 0, 1); + tagged = tag_tagged_items(&tree, NULL, start, end, ITEMS, 0, 1); // printf("checking copied tags\n"); assert(tagged == count); @@ -215,16 +214,13 @@ void copy_tag_check(void) /* Copy tags in several rounds */ // printf("\ncopying tags...\n"); - cur = start; - do { - tmp = rand() % (count/10+2); - tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, tmp, 0, 2); - } while (tmp == tagged); + tmp = rand() % (count / 10 + 2); + tagged = tag_tagged_items(&tree, NULL, start, end, tmp, 0, 2); + assert(tagged == count); // printf("%lu %lu %lu\n", tagged, tmp, count); // printf("checking copied tags\n"); check_copied_tags(&tree, start, end, idx, ITEMS, 0, 2); - assert(tagged < tmp); verify_tag_consistency(&tree, 0); verify_tag_consistency(&tree, 1); verify_tag_consistency(&tree, 2); @@ -240,7 +236,7 @@ static void __locate_check(struct radix_tree_root *tree, unsigned long index, item_insert_order(tree, index, order); item = item_lookup(tree, index); - index2 = radix_tree_locate_item(tree, item); + index2 = find_item(tree, item); if (index != index2) { printf("index %ld order %d inserted; found %ld\n", index, order, index2); @@ -274,17 +270,17 @@ static void locate_check(void) index += (1UL << order)) { __locate_check(&tree, index + offset, order); } - if (radix_tree_locate_item(&tree, &tree) != -1) + if (find_item(&tree, &tree) != -1) abort(); item_kill_tree(&tree); } } - if (radix_tree_locate_item(&tree, &tree) != -1) + if (find_item(&tree, &tree) != -1) abort(); __locate_check(&tree, -1, 0); - if (radix_tree_locate_item(&tree, &tree) != -1) + if (find_item(&tree, &tree) != -1) abort(); item_kill_tree(&tree); } @@ -293,50 +289,85 @@ static void single_thread_tests(bool long_run) { int i; - printf("starting single_thread_tests: %d allocated\n", nr_allocated); + printf("starting single_thread_tests: %d allocated, preempt %d\n", + nr_allocated, preempt_count); multiorder_checks(); - printf("after multiorder_check: %d allocated\n", nr_allocated); + rcu_barrier(); + printf("after multiorder_check: %d allocated, preempt %d\n", + nr_allocated, preempt_count); locate_check(); - printf("after locate_check: %d allocated\n", nr_allocated); + rcu_barrier(); + printf("after locate_check: %d allocated, preempt %d\n", + nr_allocated, preempt_count); tag_check(); - printf("after tag_check: %d allocated\n", nr_allocated); + rcu_barrier(); + printf("after tag_check: %d allocated, preempt %d\n", + nr_allocated, preempt_count); gang_check(); - printf("after gang_check: %d allocated\n", nr_allocated); + rcu_barrier(); + printf("after gang_check: %d allocated, preempt %d\n", + nr_allocated, preempt_count); add_and_check(); - printf("after add_and_check: %d allocated\n", nr_allocated); + rcu_barrier(); + printf("after add_and_check: %d allocated, preempt %d\n", + nr_allocated, preempt_count); dynamic_height_check(); - printf("after dynamic_height_check: %d allocated\n", nr_allocated); + rcu_barrier(); + printf("after dynamic_height_check: %d allocated, preempt %d\n", + nr_allocated, preempt_count); + idr_checks(); + ida_checks(); + rcu_barrier(); + printf("after idr_checks: %d allocated, preempt %d\n", + nr_allocated, preempt_count); big_gang_check(long_run); - printf("after big_gang_check: %d allocated\n", nr_allocated); + rcu_barrier(); + printf("after big_gang_check: %d allocated, preempt %d\n", + nr_allocated, preempt_count); for (i = 0; i < (long_run ? 2000 : 3); i++) { copy_tag_check(); printf("%d ", i); fflush(stdout); } - printf("after copy_tag_check: %d allocated\n", nr_allocated); + rcu_barrier(); + printf("after copy_tag_check: %d allocated, preempt %d\n", + nr_allocated, preempt_count); } int main(int argc, char **argv) { bool long_run = false; int opt; + unsigned int seed = time(NULL); - while ((opt = getopt(argc, argv, "l")) != -1) { + while ((opt = getopt(argc, argv, "ls:")) != -1) { if (opt == 'l') long_run = true; + else if (opt == 's') + seed = strtoul(optarg, NULL, 0); } + printf("random seed %u\n", seed); + srand(seed); + rcu_register_thread(); radix_tree_init(); regression1_test(); regression2_test(); regression3_test(); - iteration_test(); + iteration_test(0, 10); + iteration_test(7, 20); single_thread_tests(long_run); - sleep(1); - printf("after sleep(1): %d allocated\n", nr_allocated); + /* Free any remaining preallocated nodes */ + radix_tree_cpu_dead(0); + + benchmark(); + + rcu_barrier(); + printf("after rcu_barrier: %d allocated, preempt %d\n", + nr_allocated, preempt_count); rcu_unregister_thread(); exit(0); diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index d1be94667a30..08b4e16dc86f 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -26,7 +26,6 @@ static void __multiorder_tag_test(int index, int order) { RADIX_TREE(tree, GFP_KERNEL); int base, err, i; - unsigned long first = 0; /* our canonical entry */ base = index & ~((1 << order) - 1); @@ -60,7 +59,7 @@ static void __multiorder_tag_test(int index, int order) assert(!radix_tree_tag_get(&tree, i, 1)); } - assert(radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, 10, 0, 1) == 1); + assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 1); assert(radix_tree_tag_clear(&tree, index, 0)); for_each_index(i, base, order) { @@ -76,8 +75,27 @@ static void __multiorder_tag_test(int index, int order) item_kill_tree(&tree); } +static void __multiorder_tag_test2(unsigned order, unsigned long index2) +{ + RADIX_TREE(tree, GFP_KERNEL); + unsigned long index = (1 << order); + index2 += index; + + assert(item_insert_order(&tree, 0, order) == 0); + assert(item_insert(&tree, index2) == 0); + + assert(radix_tree_tag_set(&tree, 0, 0)); + assert(radix_tree_tag_set(&tree, index2, 0)); + + assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 2); + + item_kill_tree(&tree); +} + static void multiorder_tag_tests(void) { + int i, j; + /* test multi-order entry for indices 0-7 with no sibling pointers */ __multiorder_tag_test(0, 3); __multiorder_tag_test(5, 3); @@ -117,6 +135,10 @@ static void multiorder_tag_tests(void) __multiorder_tag_test(300, 8); __multiorder_tag_test(0x12345678UL, 8); + + for (i = 1; i < 10; i++) + for (j = 0; j < (10 << i); j++) + __multiorder_tag_test2(i, j); } static void multiorder_check(unsigned long index, int order) @@ -125,7 +147,7 @@ static void multiorder_check(unsigned long index, int order) unsigned long min = index & ~((1UL << order) - 1); unsigned long max = min + (1UL << order); void **slot; - struct item *item2 = item_create(min); + struct item *item2 = item_create(min, order); RADIX_TREE(tree, GFP_KERNEL); printf("Multiorder index %ld, order %d\n", index, order); @@ -231,11 +253,14 @@ void multiorder_iteration(void) radix_tree_for_each_slot(slot, &tree, &iter, j) { int height = order[i] / RADIX_TREE_MAP_SHIFT; int shift = height * RADIX_TREE_MAP_SHIFT; - int mask = (1 << order[i]) - 1; + unsigned long mask = (1UL << order[i]) - 1; + struct item *item = *slot; - assert(iter.index >= (index[i] &~ mask)); - assert(iter.index <= (index[i] | mask)); + assert((iter.index | mask) == (index[i] | mask)); assert(iter.shift == shift); + assert(!radix_tree_is_internal_node(item)); + assert((item->index | mask) == (index[i] | mask)); + assert(item->order == order[i]); i++; } } @@ -248,7 +273,6 @@ void multiorder_tagged_iteration(void) RADIX_TREE(tree, GFP_KERNEL); struct radix_tree_iter iter; void **slot; - unsigned long first = 0; int i, j; printf("Multiorder tagged iteration test\n"); @@ -269,7 +293,7 @@ void multiorder_tagged_iteration(void) assert(radix_tree_tag_set(&tree, tag_index[i], 1)); for (j = 0; j < 256; j++) { - int mask, k; + int k; for (i = 0; i < TAG_ENTRIES; i++) { for (k = i; index[k] < tag_index[i]; k++) @@ -279,18 +303,22 @@ void multiorder_tagged_iteration(void) } radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) { + unsigned long mask; + struct item *item = *slot; for (k = i; index[k] < tag_index[i]; k++) ; - mask = (1 << order[k]) - 1; + mask = (1UL << order[k]) - 1; - assert(iter.index >= (tag_index[i] &~ mask)); - assert(iter.index <= (tag_index[i] | mask)); + assert((iter.index | mask) == (tag_index[i] | mask)); + assert(!radix_tree_is_internal_node(item)); + assert((item->index | mask) == (tag_index[i] | mask)); + assert(item->order == order[k]); i++; } } - radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, - MT_NUM_ENTRIES, 1, 2); + assert(tag_tagged_items(&tree, NULL, 0, ~0UL, TAG_ENTRIES, 1, 2) == + TAG_ENTRIES); for (j = 0; j < 256; j++) { int mask, k; @@ -303,19 +331,21 @@ void multiorder_tagged_iteration(void) } radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) { + struct item *item = *slot; for (k = i; index[k] < tag_index[i]; k++) ; mask = (1 << order[k]) - 1; - assert(iter.index >= (tag_index[i] &~ mask)); - assert(iter.index <= (tag_index[i] | mask)); + assert((iter.index | mask) == (tag_index[i] | mask)); + assert(!radix_tree_is_internal_node(item)); + assert((item->index | mask) == (tag_index[i] | mask)); + assert(item->order == order[k]); i++; } } - first = 1; - radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, - MT_NUM_ENTRIES, 1, 0); + assert(tag_tagged_items(&tree, NULL, 1, ~0UL, MT_NUM_ENTRIES * 2, 1, 0) + == TAG_ENTRIES); i = 0; radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) { assert(iter.index == tag_index[i]); @@ -325,6 +355,224 @@ void multiorder_tagged_iteration(void) item_kill_tree(&tree); } +static void __multiorder_join(unsigned long index, + unsigned order1, unsigned order2) +{ + unsigned long loc; + void *item, *item2 = item_create(index + 1, order1); + RADIX_TREE(tree, GFP_KERNEL); + + item_insert_order(&tree, index, order2); + item = radix_tree_lookup(&tree, index); + radix_tree_join(&tree, index + 1, order1, item2); + loc = find_item(&tree, item); + if (loc == -1) + free(item); + item = radix_tree_lookup(&tree, index + 1); + assert(item == item2); + item_kill_tree(&tree); +} + +static void __multiorder_join2(unsigned order1, unsigned order2) +{ + RADIX_TREE(tree, GFP_KERNEL); + struct radix_tree_node *node; + void *item1 = item_create(0, order1); + void *item2; + + item_insert_order(&tree, 0, order2); + radix_tree_insert(&tree, 1 << order2, (void *)0x12UL); + item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL); + assert(item2 == (void *)0x12UL); + assert(node->exceptional == 1); + + radix_tree_join(&tree, 0, order1, item1); + item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL); + assert(item2 == item1); + assert(node->exceptional == 0); + item_kill_tree(&tree); +} + +static void multiorder_join(void) +{ + int i, j, idx; + + for (idx = 0; idx < 1024; idx = idx * 2 + 3) { + for (i = 1; i < 15; i++) { + for (j = 0; j < i; j++) { + __multiorder_join(idx, i, j); + } + } + } + + for (i = 1; i < 15; i++) { + for (j = 0; j < i; j++) { + __multiorder_join2(i, j); + } + } +} + +static void check_mem(unsigned old_order, unsigned new_order, unsigned alloc) +{ + struct radix_tree_preload *rtp = &radix_tree_preloads; + if (rtp->nr != 0) + printf("split(%u %u) remaining %u\n", old_order, new_order, + rtp->nr); + /* + * Can't check for equality here as some nodes may have been + * RCU-freed while we ran. But we should never finish with more + * nodes allocated since they should have all been preloaded. + */ + if (nr_allocated > alloc) + printf("split(%u %u) allocated %u %u\n", old_order, new_order, + alloc, nr_allocated); +} + +static void __multiorder_split(int old_order, int new_order) +{ + RADIX_TREE(tree, GFP_ATOMIC); + void **slot; + struct radix_tree_iter iter; + unsigned alloc; + + radix_tree_preload(GFP_KERNEL); + assert(item_insert_order(&tree, 0, old_order) == 0); + radix_tree_preload_end(); + + /* Wipe out the preloaded cache or it'll confuse check_mem() */ + radix_tree_cpu_dead(0); + + radix_tree_tag_set(&tree, 0, 2); + + radix_tree_split_preload(old_order, new_order, GFP_KERNEL); + alloc = nr_allocated; + radix_tree_split(&tree, 0, new_order); + check_mem(old_order, new_order, alloc); + radix_tree_for_each_slot(slot, &tree, &iter, 0) { + radix_tree_iter_replace(&tree, &iter, slot, + item_create(iter.index, new_order)); + } + radix_tree_preload_end(); + + item_kill_tree(&tree); +} + +static void __multiorder_split2(int old_order, int new_order) +{ + RADIX_TREE(tree, GFP_KERNEL); + void **slot; + struct radix_tree_iter iter; + struct radix_tree_node *node; + void *item; + + __radix_tree_insert(&tree, 0, old_order, (void *)0x12); + + item = __radix_tree_lookup(&tree, 0, &node, NULL); + assert(item == (void *)0x12); + assert(node->exceptional > 0); + + radix_tree_split(&tree, 0, new_order); + radix_tree_for_each_slot(slot, &tree, &iter, 0) { + radix_tree_iter_replace(&tree, &iter, slot, + item_create(iter.index, new_order)); + } + + item = __radix_tree_lookup(&tree, 0, &node, NULL); + assert(item != (void *)0x12); + assert(node->exceptional == 0); + + item_kill_tree(&tree); +} + +static void __multiorder_split3(int old_order, int new_order) +{ + RADIX_TREE(tree, GFP_KERNEL); + void **slot; + struct radix_tree_iter iter; + struct radix_tree_node *node; + void *item; + + __radix_tree_insert(&tree, 0, old_order, (void *)0x12); + + item = __radix_tree_lookup(&tree, 0, &node, NULL); + assert(item == (void *)0x12); + assert(node->exceptional > 0); + + radix_tree_split(&tree, 0, new_order); + radix_tree_for_each_slot(slot, &tree, &iter, 0) { + radix_tree_iter_replace(&tree, &iter, slot, (void *)0x16); + } + + item = __radix_tree_lookup(&tree, 0, &node, NULL); + assert(item == (void *)0x16); + assert(node->exceptional > 0); + + item_kill_tree(&tree); + + __radix_tree_insert(&tree, 0, old_order, (void *)0x12); + + item = __radix_tree_lookup(&tree, 0, &node, NULL); + assert(item == (void *)0x12); + assert(node->exceptional > 0); + + radix_tree_split(&tree, 0, new_order); + radix_tree_for_each_slot(slot, &tree, &iter, 0) { + if (iter.index == (1 << new_order)) + radix_tree_iter_replace(&tree, &iter, slot, + (void *)0x16); + else + radix_tree_iter_replace(&tree, &iter, slot, NULL); + } + + item = __radix_tree_lookup(&tree, 1 << new_order, &node, NULL); + assert(item == (void *)0x16); + assert(node->count == node->exceptional); + do { + node = node->parent; + if (!node) + break; + assert(node->count == 1); + assert(node->exceptional == 0); + } while (1); + + item_kill_tree(&tree); +} + +static void multiorder_split(void) +{ + int i, j; + + for (i = 3; i < 11; i++) + for (j = 0; j < i; j++) { + __multiorder_split(i, j); + __multiorder_split2(i, j); + __multiorder_split3(i, j); + } +} + +static void multiorder_account(void) +{ + RADIX_TREE(tree, GFP_KERNEL); + struct radix_tree_node *node; + void **slot; + + item_insert_order(&tree, 0, 5); + + __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12); + __radix_tree_lookup(&tree, 0, &node, NULL); + assert(node->count == node->exceptional * 2); + radix_tree_delete(&tree, 1 << 5); + assert(node->exceptional == 0); + + __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12); + __radix_tree_lookup(&tree, 1 << 5, &node, &slot); + assert(node->count == node->exceptional * 2); + __radix_tree_replace(&tree, node, slot, NULL, NULL, NULL); + assert(node->exceptional == 0); + + item_kill_tree(&tree); +} + void multiorder_checks(void) { int i; @@ -342,4 +590,9 @@ void multiorder_checks(void) multiorder_tag_tests(); multiorder_iteration(); multiorder_tagged_iteration(); + multiorder_join(); + multiorder_split(); + multiorder_account(); + + radix_tree_cpu_dead(0); } diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c index 63bf347aaf33..a41325d7a170 100644 --- a/tools/testing/radix-tree/regression2.c +++ b/tools/testing/radix-tree/regression2.c @@ -50,6 +50,7 @@ #include <stdio.h> #include "regression.h" +#include "test.h" #define PAGECACHE_TAG_DIRTY 0 #define PAGECACHE_TAG_WRITEBACK 1 @@ -90,7 +91,7 @@ void regression2_test(void) /* 1. */ start = 0; end = max_slots - 2; - radix_tree_range_tag_if_tagged(&mt_tree, &start, end, 1, + tag_tagged_items(&mt_tree, NULL, start, end, 1, PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); /* 2. */ diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c index 1f06ed73d0a8..b594841fae85 100644 --- a/tools/testing/radix-tree/regression3.c +++ b/tools/testing/radix-tree/regression3.c @@ -5,7 +5,7 @@ * In following radix_tree_next_slot current chunk size becomes zero. * This isn't checked and it tries to dereference null pointer in slot. * - * Helper radix_tree_iter_next reset slot to NULL and next_index to index + 1, + * Helper radix_tree_iter_resume reset slot to NULL and next_index to index + 1, * for tagger iteraction it also must reset cached tags in iterator to abort * next radix_tree_next_slot and go to slow-path into radix_tree_next_chunk. * @@ -88,7 +88,7 @@ void regression3_test(void) printf("slot %ld %p\n", iter.index, *slot); if (!iter.index) { printf("next at %ld\n", iter.index); - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_resume(slot, &iter); } } @@ -96,7 +96,7 @@ void regression3_test(void) printf("contig %ld %p\n", iter.index, *slot); if (!iter.index) { printf("next at %ld\n", iter.index); - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_resume(slot, &iter); } } @@ -106,7 +106,7 @@ void regression3_test(void) printf("tagged %ld %p\n", iter.index, *slot); if (!iter.index) { printf("next at %ld\n", iter.index); - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_resume(slot, &iter); } } diff --git a/tools/testing/radix-tree/tag_check.c b/tools/testing/radix-tree/tag_check.c index b0ac05741750..ed5f87d1dce2 100644 --- a/tools/testing/radix-tree/tag_check.c +++ b/tools/testing/radix-tree/tag_check.c @@ -23,7 +23,7 @@ __simple_checks(struct radix_tree_root *tree, unsigned long index, int tag) item_tag_set(tree, index, tag); ret = item_tag_get(tree, index, tag); assert(ret != 0); - ret = radix_tree_range_tag_if_tagged(tree, &first, ~0UL, 10, tag, !tag); + ret = tag_tagged_items(tree, NULL, first, ~0UL, 10, tag, !tag); assert(ret == 1); ret = item_tag_get(tree, index, !tag); assert(ret != 0); @@ -51,6 +51,7 @@ void simple_checks(void) verify_tag_consistency(&tree, 1); printf("before item_kill_tree: %d allocated\n", nr_allocated); item_kill_tree(&tree); + rcu_barrier(); printf("after item_kill_tree: %d allocated\n", nr_allocated); } @@ -319,7 +320,7 @@ static void single_check(void) assert(ret == 0); verify_tag_consistency(&tree, 0); verify_tag_consistency(&tree, 1); - ret = radix_tree_range_tag_if_tagged(&tree, &first, 10, 10, 0, 1); + ret = tag_tagged_items(&tree, NULL, first, 10, 10, 0, 1); assert(ret == 1); ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1); assert(ret == 1); @@ -331,12 +332,16 @@ void tag_check(void) single_check(); extend_checks(); contract_checks(); + rcu_barrier(); printf("after extend_checks: %d allocated\n", nr_allocated); __leak_check(); leak_check(); + rcu_barrier(); printf("after leak_check: %d allocated\n", nr_allocated); simple_checks(); + rcu_barrier(); printf("after simple_checks: %d allocated\n", nr_allocated); thrash_tags(); + rcu_barrier(); printf("after thrash_tags: %d allocated\n", nr_allocated); } diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index a6e8099eaf4f..e5726e373646 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -24,21 +24,29 @@ int item_tag_get(struct radix_tree_root *root, unsigned long index, int tag) return radix_tree_tag_get(root, index, tag); } -int __item_insert(struct radix_tree_root *root, struct item *item, - unsigned order) +int __item_insert(struct radix_tree_root *root, struct item *item) { - return __radix_tree_insert(root, item->index, order, item); + return __radix_tree_insert(root, item->index, item->order, item); } int item_insert(struct radix_tree_root *root, unsigned long index) { - return __item_insert(root, item_create(index), 0); + return __item_insert(root, item_create(index, 0)); } int item_insert_order(struct radix_tree_root *root, unsigned long index, unsigned order) { - return __item_insert(root, item_create(index), order); + return __item_insert(root, item_create(index, order)); +} + +void item_sanity(struct item *item, unsigned long index) +{ + unsigned long mask; + assert(!radix_tree_is_internal_node(item)); + assert(item->order < BITS_PER_LONG); + mask = (1UL << item->order) - 1; + assert((item->index | mask) == (index | mask)); } int item_delete(struct radix_tree_root *root, unsigned long index) @@ -46,18 +54,19 @@ int item_delete(struct radix_tree_root *root, unsigned long index) struct item *item = radix_tree_delete(root, index); if (item) { - assert(item->index == index); + item_sanity(item, index); free(item); return 1; } return 0; } -struct item *item_create(unsigned long index) +struct item *item_create(unsigned long index, unsigned int order) { struct item *ret = malloc(sizeof(*ret)); ret->index = index; + ret->order = order; return ret; } @@ -66,8 +75,8 @@ void item_check_present(struct radix_tree_root *root, unsigned long index) struct item *item; item = radix_tree_lookup(root, index); - assert(item != 0); - assert(item->index == index); + assert(item != NULL); + item_sanity(item, index); } struct item *item_lookup(struct radix_tree_root *root, unsigned long index) @@ -80,7 +89,7 @@ void item_check_absent(struct radix_tree_root *root, unsigned long index) struct item *item; item = radix_tree_lookup(root, index); - assert(item == 0); + assert(item == NULL); } /* @@ -142,6 +151,62 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start, assert(nfound == 0); } +/* Use the same pattern as tag_pages_for_writeback() in mm/page-writeback.c */ +int tag_tagged_items(struct radix_tree_root *root, pthread_mutex_t *lock, + unsigned long start, unsigned long end, unsigned batch, + unsigned iftag, unsigned thentag) +{ + unsigned long tagged = 0; + struct radix_tree_iter iter; + void **slot; + + if (batch == 0) + batch = 1; + + if (lock) + pthread_mutex_lock(lock); + radix_tree_for_each_tagged(slot, root, &iter, start, iftag) { + if (iter.index > end) + break; + radix_tree_iter_tag_set(root, &iter, thentag); + tagged++; + if ((tagged % batch) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); + if (lock) { + pthread_mutex_unlock(lock); + rcu_barrier(); + pthread_mutex_lock(lock); + } + } + if (lock) + pthread_mutex_unlock(lock); + + return tagged; +} + +/* Use the same pattern as find_swap_entry() in mm/shmem.c */ +unsigned long find_item(struct radix_tree_root *root, void *item) +{ + struct radix_tree_iter iter; + void **slot; + unsigned long found = -1; + unsigned long checked = 0; + + radix_tree_for_each_slot(slot, root, &iter, 0) { + if (*slot == item) { + found = iter.index; + break; + } + checked++; + if ((checked % 4) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); + } + + return found; +} + static int verify_node(struct radix_tree_node *slot, unsigned int tag, int tagged) { @@ -200,9 +265,16 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag) void item_kill_tree(struct radix_tree_root *root) { + struct radix_tree_iter iter; + void **slot; struct item *items[32]; int nfound; + radix_tree_for_each_slot(slot, root, &iter, 0) { + if (radix_tree_exceptional_entry(*slot)) + radix_tree_delete(root, iter.index); + } + while ((nfound = radix_tree_gang_lookup(root, (void **)items, 0, 32))) { int i; diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 217fb2403f09..b30e11d9d271 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -5,11 +5,11 @@ struct item { unsigned long index; + unsigned int order; }; -struct item *item_create(unsigned long index); -int __item_insert(struct radix_tree_root *root, struct item *item, - unsigned order); +struct item *item_create(unsigned long index, unsigned int order); +int __item_insert(struct radix_tree_root *root, struct item *item); int item_insert(struct radix_tree_root *root, unsigned long index); int item_insert_order(struct radix_tree_root *root, unsigned long index, unsigned order); @@ -25,9 +25,17 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start, unsigned long nr, int chunk); void item_kill_tree(struct radix_tree_root *root); +int tag_tagged_items(struct radix_tree_root *, pthread_mutex_t *, + unsigned long start, unsigned long end, unsigned batch, + unsigned iftag, unsigned thentag); +unsigned long find_item(struct radix_tree_root *, void *item); + void tag_check(void); void multiorder_checks(void); -void iteration_test(void); +void iteration_test(unsigned order, unsigned duration); +void benchmark(void); +void idr_checks(void); +void ida_checks(void); struct item * item_tag_set(struct radix_tree_root *root, unsigned long index, int tag); @@ -40,7 +48,14 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag); extern int nr_allocated; /* Normally private parts of lib/radix-tree.c */ +struct radix_tree_node *entry_to_node(void *ptr); void radix_tree_dump(struct radix_tree_root *root); int root_tag_get(struct radix_tree_root *root, unsigned int tag); unsigned long node_maxindex(struct radix_tree_node *); unsigned long shift_maxindex(unsigned int shift); +int radix_tree_cpu_dead(unsigned int cpu); +struct radix_tree_preload { + unsigned nr; + struct radix_tree_node *nodes; +}; +extern struct radix_tree_preload radix_tree_preloads; diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index efeceb0a222d..3815e940fbea 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -76,16 +76,20 @@ static void async_pf_execute(struct work_struct *work) struct kvm_vcpu *vcpu = apf->vcpu; unsigned long addr = apf->addr; gva_t gva = apf->gva; + int locked = 1; might_sleep(); /* * This work is run asynchromously to the task which owns * mm and might be done in another context, so we must - * use FOLL_REMOTE. + * access remotely. */ - __get_user_pages_unlocked(NULL, mm, addr, 1, NULL, - FOLL_WRITE | FOLL_REMOTE); + down_read(&mm->mmap_sem); + get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL, + &locked); + if (locked) + up_read(&mm->mmap_sem); kvm_async_page_present_sync(vcpu, apf); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 823544c166be..de102cae7125 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1418,13 +1418,12 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, npages = get_user_page_nowait(addr, write_fault, page); up_read(¤t->mm->mmap_sem); } else { - unsigned int flags = FOLL_TOUCH | FOLL_HWPOISON; + unsigned int flags = FOLL_HWPOISON; if (write_fault) flags |= FOLL_WRITE; - npages = __get_user_pages_unlocked(current, current->mm, addr, 1, - page, flags); + npages = get_user_pages_unlocked(addr, 1, page, flags); } if (npages != 1) return npages; |