summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-class-bdi8
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst18
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt5
-rw-r--r--Documentation/admin-guide/mm/idle_page_tracking.rst5
-rw-r--r--Documentation/admin-guide/mm/pagemap.rst3
-rw-r--r--Documentation/filesystems/proc.txt3
-rw-r--r--Documentation/filesystems/seq_file.txt63
-rw-r--r--Documentation/sysctl/fs.txt36
-rw-r--r--Documentation/sysctl/kernel.txt18
-rw-r--r--Documentation/sysctl/vm.txt2
-rw-r--r--arch/Kconfig10
-rw-r--r--arch/alpha/mm/fault.c3
-rw-r--r--arch/arc/mm/fault.c4
-rw-r--r--arch/arm/Kconfig4
-rw-r--r--arch/arm/include/asm/page.h2
-rw-r--r--arch/arm/kernel/process.c9
-rw-r--r--arch/arm/mm/dma-mapping.c5
-rw-r--r--arch/arm/mm/fault.c7
-rw-r--r--arch/arm64/Kconfig5
-rw-r--r--arch/arm64/mm/dma-mapping.c4
-rw-r--r--arch/arm64/mm/fault.c6
-rw-r--r--arch/hexagon/mm/vm_fault.c2
-rw-r--r--arch/ia64/include/asm/pgtable.h1
-rw-r--r--arch/ia64/mm/fault.c2
-rw-r--r--arch/m68k/mm/fault.c4
-rw-r--r--arch/microblaze/mm/fault.c2
-rw-r--r--arch/mips/mm/fault.c2
-rw-r--r--arch/nds32/mm/fault.c2
-rw-r--r--arch/nios2/mm/fault.c2
-rw-r--r--arch/openrisc/mm/fault.c2
-rw-r--r--arch/parisc/mm/fault.c2
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/include/asm/copro.h4
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c2
-rw-r--r--arch/powerpc/mm/copro_fault.c2
-rw-r--r--arch/powerpc/mm/fault.c7
-rw-r--r--arch/powerpc/platforms/cell/spufs/fault.c2
-rw-r--r--arch/riscv/mm/fault.c3
-rw-r--r--arch/s390/mm/fault.c13
-rw-r--r--arch/sh/boards/of-generic.c7
-rw-r--r--arch/sh/include/asm/kexec.h3
-rw-r--r--arch/sh/kernel/dwarf.c2
-rw-r--r--arch/sh/mm/fault.c4
-rw-r--r--arch/sparc/mm/fault_32.c3
-rw-r--r--arch/sparc/mm/fault_64.c3
-rw-r--r--arch/um/kernel/trap.c2
-rw-r--r--arch/unicore32/mm/fault.c9
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/boot/compressed/kaslr.c5
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/export.h5
-rw-r--r--arch/x86/kvm/x86.c7
-rw-r--r--arch/x86/mm/fault.c5
-rw-r--r--arch/xtensa/kernel/pci-dma.c2
-rw-r--r--arch/xtensa/mm/fault.c2
-rw-r--r--block/genhd.c2
-rw-r--r--drivers/base/firmware_loader/fallback.c5
-rw-r--r--drivers/base/memory.c2
-rw-r--r--drivers/base/node.c49
-rw-r--r--drivers/dax/device.c2
-rw-r--r--drivers/firewire/core-cdev.c8
-rw-r--r--drivers/firmware/efi/libstub/Makefile1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c43
-rw-r--r--drivers/gpu/drm/i915/i915_gem_userptr.c13
-rw-r--r--drivers/gpu/drm/radeon/radeon_mn.c22
-rw-r--r--drivers/infiniband/core/umem_odp.c33
-rw-r--r--drivers/infiniband/hw/hfi1/mmu_rb.c11
-rw-r--r--drivers/infiniband/hw/mlx5/odp.c2
-rw-r--r--drivers/iommu/amd_iommu.c2
-rw-r--r--drivers/iommu/amd_iommu_v2.c2
-rw-r--r--drivers/iommu/intel-iommu.c3
-rw-r--r--drivers/iommu/intel-svm.c4
-rw-r--r--drivers/md/bcache/Kconfig1
-rw-r--r--drivers/md/bcache/util.c131
-rw-r--r--drivers/md/bcache/util.h21
-rw-r--r--drivers/misc/cxl/fault.c2
-rw-r--r--drivers/misc/mic/scif/scif_dma.c7
-rw-r--r--drivers/misc/ocxl/link.c3
-rw-r--r--drivers/misc/sgi-gru/grutlbpurge.c7
-rw-r--r--drivers/pci/quirks.c12
-rw-r--r--drivers/rapidio/devices/rio_mport_cdev.c2
-rw-r--r--drivers/s390/char/vmcp.c2
-rw-r--r--drivers/staging/android/ion/ion_cma_heap.c2
-rw-r--r--drivers/xen/gntdev.c44
-rw-r--r--fs/adfs/inode.c11
-rw-r--r--fs/autofs/autofs_i.h8
-rw-r--r--fs/autofs/expire.c143
-rw-r--r--fs/autofs/root.c33
-rw-r--r--fs/bfs/inode.c9
-rw-r--r--fs/btrfs/extent_io.c2
-rw-r--r--fs/buffer.c62
-rw-r--r--fs/dcache.c3
-rw-r--r--fs/eventpoll.c118
-rw-r--r--fs/exec.c1
-rw-r--r--fs/ext2/file.c1
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/inode.c5
-rw-r--r--fs/ext4/readpage.c5
-rw-r--r--fs/fat/cache.c19
-rw-r--r--fs/fat/dir.c2
-rw-r--r--fs/fat/fat.h12
-rw-r--r--fs/fat/fatent.c108
-rw-r--r--fs/fat/file.c33
-rw-r--r--fs/fat/inode.c18
-rw-r--r--fs/fat/misc.c12
-rw-r--r--fs/fat/namei_msdos.c17
-rw-r--r--fs/fat/namei_vfat.c20
-rw-r--r--fs/hfs/brec.c7
-rw-r--r--fs/hfsplus/Kconfig15
-rw-r--r--fs/hfsplus/Makefile2
-rw-r--r--fs/hfsplus/acl.h28
-rw-r--r--fs/hfsplus/brec.c7
-rw-r--r--fs/hfsplus/dir.c13
-rw-r--r--fs/hfsplus/extents.c18
-rw-r--r--fs/hfsplus/hfsplus_fs.h1
-rw-r--r--fs/hfsplus/inode.c11
-rw-r--r--fs/hfsplus/posix_acl.c144
-rw-r--r--fs/hfsplus/super.c8
-rw-r--r--fs/hfsplus/unicode.c62
-rw-r--r--fs/hfsplus/xattr.c6
-rw-r--r--fs/hfsplus/xattr.h3
-rw-r--r--fs/hfsplus/xattr_security.c13
-rw-r--r--fs/hostfs/hostfs.h2
-rw-r--r--fs/hpfs/hpfs_fn.h13
-rw-r--r--fs/hpfs/namei.c12
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/mpage.c118
-rw-r--r--fs/namei.c53
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/notify/dnotify/dnotify.c5
-rw-r--r--fs/notify/fanotify/fanotify.c14
-rw-r--r--fs/notify/fanotify/fanotify_user.c5
-rw-r--r--fs/notify/group.c3
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c7
-rw-r--r--fs/notify/inotify/inotify_user.c5
-rw-r--r--fs/ntfs/aops.c9
-rw-r--r--fs/ntfs/compress.c28
-rw-r--r--fs/ntfs/inode.c12
-rw-r--r--fs/ntfs/mft.c12
-rw-r--r--fs/ntfs/time.h27
-rw-r--r--fs/ocfs2/alloc.c60
-rw-r--r--fs/ocfs2/buffer_head_io.c77
-rw-r--r--fs/ocfs2/cluster/heartbeat.c12
-rw-r--r--fs/ocfs2/cluster/nodemanager.c6
-rw-r--r--fs/ocfs2/cluster/tcp.c2
-rw-r--r--fs/ocfs2/dir.c3
-rw-r--r--fs/ocfs2/dlmglue.c30
-rw-r--r--fs/ocfs2/file.c4
-rw-r--r--fs/ocfs2/inode.c5
-rw-r--r--fs/ocfs2/journal.c51
-rw-r--r--fs/ocfs2/localalloc.c9
-rw-r--r--fs/ocfs2/quota_local.c15
-rw-r--r--fs/ocfs2/stackglue.c6
-rw-r--r--fs/ocfs2/stackglue.h3
-rw-r--r--fs/proc/Kconfig1
-rw-r--r--fs/proc/base.c50
-rw-r--r--fs/proc/generic.c4
-rw-r--r--fs/proc/inode.c6
-rw-r--r--fs/proc/internal.h25
-rw-r--r--fs/proc/kcore.c532
-rw-r--r--fs/proc/meminfo.c2
-rw-r--r--fs/proc/stat.c2
-rw-r--r--fs/proc/task_mmu.c319
-rw-r--r--fs/proc/task_nommu.c39
-rw-r--r--fs/proc/uptime.c4
-rw-r--r--fs/proc/vmcore.c2
-rw-r--r--fs/reiserfs/item_ops.c16
-rw-r--r--fs/reiserfs/journal.c22
-rw-r--r--fs/reiserfs/procfs.c11
-rw-r--r--fs/reiserfs/reiserfs.h4
-rw-r--r--fs/seq_file.c54
-rw-r--r--fs/super.c11
-rw-r--r--fs/sysv/inode.c6
-rw-r--r--fs/ufs/balloc.c4
-rw-r--r--fs/ufs/ialloc.c2
-rw-r--r--fs/ufs/super.c4
-rw-r--r--fs/ufs/util.h14
-rw-r--r--fs/userfaultfd.c9
-rw-r--r--fs/xfs/xfs_file.c2
-rw-r--r--include/asm-generic/bug.h16
-rw-r--r--include/asm-generic/export.h12
-rw-r--r--include/asm-generic/pgtable.h18
-rw-r--r--include/linux/backing-dev-defs.h3
-rw-r--r--include/linux/backing-dev.h4
-rw-r--r--include/linux/bits.h1
-rw-r--r--include/linux/cma.h2
-rw-r--r--include/linux/compiler.h19
-rw-r--r--include/linux/crash_core.h2
-rw-r--r--include/linux/crc64.h11
-rw-r--r--include/linux/dma-contiguous.h4
-rw-r--r--include/linux/export.h57
-rw-r--r--include/linux/fs.h7
-rw-r--r--include/linux/fsnotify_backend.h12
-rw-r--r--include/linux/hugetlb.h3
-rw-r--r--include/linux/init.h44
-rw-r--r--include/linux/ipc_namespace.h3
-rw-r--r--include/linux/kasan.h13
-rw-r--r--include/linux/kcore.h2
-rw-r--r--include/linux/kernel.h35
-rw-r--r--include/linux/kvm_host.h4
-rw-r--r--include/linux/list_lru.h43
-rw-r--r--include/linux/memblock.h2
-rw-r--r--include/linux/memcontrol.h92
-rw-r--r--include/linux/memory_hotplug.h1
-rw-r--r--include/linux/mm.h22
-rw-r--r--include/linux/mmu_notifier.h33
-rw-r--r--include/linux/mmzone.h73
-rw-r--r--include/linux/net_dim.h1
-rw-r--r--include/linux/node.h12
-rw-r--r--include/linux/oom.h2
-rw-r--r--include/linux/page-flags.h5
-rw-r--r--include/linux/page_ext.h15
-rw-r--r--include/linux/pci.h20
-rw-r--r--include/linux/percpu.h2
-rw-r--r--include/linux/proc_fs.h2
-rw-r--r--include/linux/sched.h15
-rw-r--r--include/linux/sched/mm.h37
-rw-r--r--include/linux/sched/signal.h5
-rw-r--r--include/linux/sched/sysctl.h1
-rw-r--r--include/linux/sched/user.h5
-rw-r--r--include/linux/shrinker.h21
-rw-r--r--include/linux/signal.h4
-rw-r--r--include/linux/slab.h2
-rw-r--r--include/linux/swap.h15
-rw-r--r--include/linux/swapops.h10
-rw-r--r--include/linux/tracepoint.h19
-rw-r--r--include/linux/vmacache.h6
-rw-r--r--include/rdma/ib_umem_odp.h3
-rw-r--r--include/uapi/linux/auto_fs.h8
-rw-r--r--include/uapi/linux/prctl.h4
-rw-r--r--init/Kconfig33
-rw-r--r--init/do_mounts.c10
-rw-r--r--init/do_mounts_initrd.c10
-rw-r--r--init/do_mounts_md.c10
-rw-r--r--init/do_mounts_rd.c10
-rw-r--r--init/initramfs.c10
-rw-r--r--init/main.c33
-rw-r--r--ipc/msg.c38
-rw-r--r--ipc/namespace.c20
-rw-r--r--ipc/sem.c38
-rw-r--r--ipc/shm.c65
-rw-r--r--ipc/util.c214
-rw-r--r--ipc/util.h33
-rw-r--r--kernel/crash_core.c8
-rw-r--r--kernel/cred.c1
-rw-r--r--kernel/dma/contiguous.c6
-rw-r--r--kernel/dma/direct.c3
-rw-r--r--kernel/exit.c4
-rw-r--r--kernel/fork.c11
-rw-r--r--kernel/hung_task.c15
-rw-r--r--kernel/memremap.c10
-rw-r--r--kernel/module.c32
-rw-r--r--kernel/printk/printk.c16
-rw-r--r--kernel/sched/wait.c2
-rw-r--r--kernel/signal.c170
-rw-r--r--kernel/sys.c11
-rw-r--r--kernel/sysctl.c37
-rw-r--r--kernel/tracepoint.c49
-rw-r--r--kernel/user.c11
-rw-r--r--lib/.gitignore2
-rw-r--r--lib/Kconfig9
-rw-r--r--lib/Kconfig.debug23
-rw-r--r--lib/Makefile11
-rw-r--r--lib/bitmap.c9
-rw-r--r--lib/crc64.c56
-rw-r--r--lib/gen_crc64table.c68
-rw-r--r--lib/rhashtable.c21
-rw-r--r--lib/test_debug_virtual.c2
-rw-r--r--lib/test_hexdump.c28
-rw-r--r--mm/Kconfig14
-rw-r--r--mm/Kconfig.debug6
-rw-r--r--mm/backing-dev.c50
-rw-r--r--mm/cma.c8
-rw-r--r--mm/cma_debug.c2
-rw-r--r--mm/fadvise.c8
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/hmm.c26
-rw-r--r--mm/huge_memory.c11
-rw-r--r--mm/hugetlb.c50
-rw-r--r--mm/internal.h12
-rw-r--r--mm/kasan/kasan_init.c316
-rw-r--r--mm/khugepaged.c60
-rw-r--r--mm/ksm.c9
-rw-r--r--mm/list_lru.c154
-rw-r--r--mm/memblock.c130
-rw-r--r--mm/memcontrol.c569
-rw-r--r--mm/memory-failure.c46
-rw-r--r--mm/memory.c150
-rw-r--r--mm/memory_hotplug.c112
-rw-r--r--mm/mempolicy.c5
-rw-r--r--mm/mempool.c13
-rw-r--r--mm/migrate.c7
-rw-r--r--mm/mincore.c12
-rw-r--r--mm/mlock.c3
-rw-r--r--mm/mm_init.c9
-rw-r--r--mm/mmap.c13
-rw-r--r--mm/mmu_notifier.c19
-rw-r--r--mm/nommu.c4
-rw-r--r--mm/oom_kill.c235
-rw-r--r--mm/page-writeback.c4
-rw-r--r--mm/page_alloc.c334
-rw-r--r--mm/page_ext.c4
-rw-r--r--mm/page_owner.c4
-rw-r--r--mm/percpu.c29
-rw-r--r--mm/shmem.c4
-rw-r--r--mm/slab.h6
-rw-r--r--mm/slab_common.c8
-rw-r--r--mm/slub.c3
-rw-r--r--mm/sparse-vmemmap.c61
-rw-r--r--mm/sparse.c290
-rw-r--r--mm/swap_slots.c12
-rw-r--r--mm/swap_state.c16
-rw-r--r--mm/swapfile.c349
-rw-r--r--mm/vmacache.c38
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/vmscan.c243
-rw-r--r--mm/workingset.c30
-rw-r--r--mm/z3fold.c8
-rw-r--r--mm/zsmalloc.c36
-rwxr-xr-xscripts/checkpatch.pl244
-rwxr-xr-xscripts/get_maintainer.pl57
-rwxr-xr-xscripts/spdxcheck.py11
-rw-r--r--scripts/spelling.txt88
-rw-r--r--security/apparmor/lsm.c1
-rw-r--r--security/security.c17
-rw-r--r--security/selinux/hooks.c1
-rw-r--r--tools/testing/selftests/proc/.gitignore2
-rw-r--r--tools/testing/selftests/proc/Makefile3
-rw-r--r--tools/testing/selftests/proc/proc.h12
-rw-r--r--tools/testing/selftests/proc/self.c39
-rw-r--r--tools/testing/selftests/proc/thread-self.c64
-rw-r--r--tools/testing/selftests/vm/.gitignore1
-rw-r--r--tools/testing/selftests/vm/Makefile1
-rw-r--r--tools/testing/selftests/vm/map_populate.c113
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests11
-rw-r--r--tools/vm/page-types.c118
-rw-r--r--virt/kvm/kvm_main.c15
339 files changed, 6102 insertions, 3642 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi
index d773d5697cf5..3187a18af6da 100644
--- a/Documentation/ABI/testing/sysfs-class-bdi
+++ b/Documentation/ABI/testing/sysfs-class-bdi
@@ -53,3 +53,11 @@ stable_pages_required (read-only)
If set, the backing device requires that all pages comprising a write
request must not be changed until writeout is complete.
+
+strictlimit (read-write)
+
+ Forces per-BDI checks for the share of given device in the write-back
+ cache even before the global background dirty limit is reached. This
+ is useful in situations where the global limit is much higher than
+ affordable for given relatively slow (or untrusted) device. Turning
+ strictlimit on has no visible effect if max_ratio is equal to 100%.
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 1746131bc9cb..184193bcb262 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1072,6 +1072,24 @@ PAGE_SIZE multiple when read back.
high limit is used and monitored properly, this limit's
utility is limited to providing the final safety net.
+ memory.oom.group
+ A read-write single value file which exists on non-root
+ cgroups. The default value is "0".
+
+ Determines whether the cgroup should be treated as
+ an indivisible workload by the OOM killer. If set,
+ all tasks belonging to the cgroup or to its descendants
+ (if the memory cgroup is not a leaf cgroup) are killed
+ together or not at all. This can be used to avoid
+ partial kills to guarantee workload integrity.
+
+ Tasks with the OOM protection (oom_score_adj set to -1000)
+ are treated as an exception and are never killed.
+
+ If the OOM killer is invoked in a cgroup, it's not going
+ to kill any tasks outside of this cgroup, regardless
+ memory.oom.group values of ancestor cgroups.
+
memory.events
A read-only flat-keyed file which exists on non-root cgroups.
The following entries are defined. Unless specified
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index e35f6bc7a89f..9871e649ffef 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3042,8 +3042,9 @@
on: enable the feature
page_poison= [KNL] Boot-time parameter changing the state of
- poisoning on the buddy allocator.
- off: turn off poisoning
+ poisoning on the buddy allocator, available with
+ CONFIG_PAGE_POISONING=y.
+ off: turn off poisoning (default)
on: turn on poisoning
panic= [KNL] Kernel behaviour on panic: delay <timeout>
diff --git a/Documentation/admin-guide/mm/idle_page_tracking.rst b/Documentation/admin-guide/mm/idle_page_tracking.rst
index 6f7b7ca1add3..df9394fb39c2 100644
--- a/Documentation/admin-guide/mm/idle_page_tracking.rst
+++ b/Documentation/admin-guide/mm/idle_page_tracking.rst
@@ -65,6 +65,11 @@ workload one should:
are not reclaimable, he or she can filter them out using
``/proc/kpageflags``.
+The page-types tool in the tools/vm directory can be used to assist in this.
+If the tool is run initially with the appropriate option, it will mark all the
+queried pages as idle. Subsequent runs of the tool can then show which pages have
+their idle flag cleared in the interim.
+
See :ref:`Documentation/admin-guide/mm/pagemap.rst <pagemap>` for more
information about ``/proc/pid/pagemap``, ``/proc/kpageflags``, and
``/proc/kpagecgroup``.
diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst
index 577af85beb41..3f7bade2c231 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -44,6 +44,9 @@ There are four components to pagemap:
* ``/proc/kpagecount``. This file contains a 64-bit count of the number of
times each page is mapped, indexed by PFN.
+The page-types tool in the tools/vm directory can be used to query the
+number of times a page is mapped.
+
* ``/proc/kpageflags``. This file contains a 64-bit set of flags for each
page, indexed by PFN.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 1605acbb23b6..1beab5b2fc60 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -871,6 +871,7 @@ VmallocTotal: 112216 kB
VmallocUsed: 428 kB
VmallocChunk: 111088 kB
HardwareCorrupted: 0 kB
+Percpu: 62080 kB
AnonHugePages: 49152 kB
ShmemHugePages: 0 kB
ShmemPmdMapped: 0 kB
@@ -962,6 +963,8 @@ Committed_AS: The amount of memory presently allocated on the system.
VmallocTotal: total size of vmalloc memory area
VmallocUsed: amount of vmalloc area which is used
VmallocChunk: largest contiguous block of vmalloc area which is free
+ Percpu: Memory allocated to the percpu allocator used to back percpu
+ allocations. This stat excludes the cost of metadata.
..............................................................................
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt
index 9de4303201e1..d412b236a9d6 100644
--- a/Documentation/filesystems/seq_file.txt
+++ b/Documentation/filesystems/seq_file.txt
@@ -66,23 +66,39 @@ kernel 3.10. Current versions require the following update
The iterator interface
-Modules implementing a virtual file with seq_file must implement a simple
-iterator object that allows stepping through the data of interest.
-Iterators must be able to move to a specific position - like the file they
-implement - but the interpretation of that position is up to the iterator
-itself. A seq_file implementation that is formatting firewall rules, for
-example, could interpret position N as the Nth rule in the chain.
-Positioning can thus be done in whatever way makes the most sense for the
-generator of the data, which need not be aware of how a position translates
-to an offset in the virtual file. The one obvious exception is that a
-position of zero should indicate the beginning of the file.
+Modules implementing a virtual file with seq_file must implement an
+iterator object that allows stepping through the data of interest
+during a "session" (roughly one read() system call). If the iterator
+is able to move to a specific position - like the file they implement,
+though with freedom to map the position number to a sequence location
+in whatever way is convenient - the iterator need only exist
+transiently during a session. If the iterator cannot easily find a
+numerical position but works well with a first/next interface, the
+iterator can be stored in the private data area and continue from one
+session to the next.
+
+A seq_file implementation that is formatting firewall rules from a
+table, for example, could provide a simple iterator that interprets
+position N as the Nth rule in the chain. A seq_file implementation
+that presents the content of a, potentially volatile, linked list
+might record a pointer into that list, providing that can be done
+without risk of the current location being removed.
+
+Positioning can thus be done in whatever way makes the most sense for
+the generator of the data, which need not be aware of how a position
+translates to an offset in the virtual file. The one obvious exception
+is that a position of zero should indicate the beginning of the file.
The /proc/sequence iterator just uses the count of the next number it
will output as its position.
-Four functions must be implemented to make the iterator work. The first,
-called start() takes a position as an argument and returns an iterator
-which will start reading at that position. For our simple sequence example,
+Four functions must be implemented to make the iterator work. The
+first, called start(), starts a session and takes a position as an
+argument, returning an iterator which will start reading at that
+position. The pos passed to start() will always be either zero, or
+the most recent pos used in the previous session.
+
+For our simple sequence example,
the start() function looks like:
static void *ct_seq_start(struct seq_file *s, loff_t *pos)
@@ -101,11 +117,12 @@ implementations; in most cases the start() function should check for a
"past end of file" condition and return NULL if need be.
For more complicated applications, the private field of the seq_file
-structure can be used. There is also a special value which can be returned
-by the start() function called SEQ_START_TOKEN; it can be used if you wish
-to instruct your show() function (described below) to print a header at the
-top of the output. SEQ_START_TOKEN should only be used if the offset is
-zero, however.
+structure can be used to hold state from session to session. There is
+also a special value which can be returned by the start() function
+called SEQ_START_TOKEN; it can be used if you wish to instruct your
+show() function (described below) to print a header at the top of the
+output. SEQ_START_TOKEN should only be used if the offset is zero,
+however.
The next function to implement is called, amazingly, next(); its job is to
move the iterator forward to the next position in the sequence. The
@@ -121,9 +138,13 @@ complete. Here's the example version:
return spos;
}
-The stop() function is called when iteration is complete; its job, of
-course, is to clean up. If dynamic memory is allocated for the iterator,
-stop() is the place to free it.
+The stop() function closes a session; its job, of course, is to clean
+up. If dynamic memory is allocated for the iterator, stop() is the
+place to free it; if a lock was taken by start(), stop() must release
+that lock. The value that *pos was set to by the last next() call
+before stop() is remembered, and used for the first start() call of
+the next session unless lseek() has been called on the file; in that
+case next start() will be asked to start at position zero.
static void ct_seq_stop(struct seq_file *s, void *v)
{
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 6c00c1e2743f..819caf8ca05f 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -34,7 +34,9 @@ Currently, these files are in /proc/sys/fs:
- overflowgid
- pipe-user-pages-hard
- pipe-user-pages-soft
+- protected_fifos
- protected_hardlinks
+- protected_regular
- protected_symlinks
- suid_dumpable
- super-max
@@ -182,6 +184,24 @@ applied.
==============================================================
+protected_fifos:
+
+The intent of this protection is to avoid unintentional writes to
+an attacker-controlled FIFO, where a program expected to create a regular
+file.
+
+When set to "0", writing to FIFOs is unrestricted.
+
+When set to "1" don't allow O_CREAT open on FIFOs that we don't own
+in world writable sticky directories, unless they are owned by the
+owner of the directory.
+
+When set to "2" it also applies to group writable sticky directories.
+
+This protection is based on the restrictions in Openwall.
+
+==============================================================
+
protected_hardlinks:
A long-standing class of security issues is the hardlink-based
@@ -202,6 +222,22 @@ This protection is based on the restrictions in Openwall and grsecurity.
==============================================================
+protected_regular:
+
+This protection is similar to protected_fifos, but it
+avoids writes to an attacker-controlled regular file, where a program
+expected to create one.
+
+When set to "0", writing to regular files is unrestricted.
+
+When set to "1" don't allow O_CREAT open on regular files that we
+don't own in world writable sticky directories, unless they are
+owned by the owner of the directory.
+
+When set to "2" it also applies to group writable sticky directories.
+
+==============================================================
+
protected_symlinks:
A long-standing class of security issues is the symlink-based
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index ffd8701232ce..1b8775298cf7 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -38,6 +38,7 @@ show up in /proc/sys/kernel:
- hung_task_panic
- hung_task_check_count
- hung_task_timeout_secs
+- hung_task_check_interval_secs
- hung_task_warnings
- hyperv_record_panic_msg
- kexec_load_disabled
@@ -356,7 +357,7 @@ This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
hung_task_timeout_secs:
-Check interval. When a task in D state did not get scheduled
+When a task in D state did not get scheduled
for more than this value report a warning.
This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
@@ -365,6 +366,18 @@ Possible values to set are in range {0..LONG_MAX/HZ}.
==============================================================
+hung_task_check_interval_secs:
+
+Hung task check interval. If hung task checking is enabled
+(see hung_task_timeout_secs), the check is done every
+hung_task_check_interval_secs seconds.
+This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+
+0 (default): means use hung_task_timeout_secs as checking interval.
+Possible values to set are in range {0..LONG_MAX/HZ}.
+
+==============================================================
+
hung_task_warnings:
The maximum number of warnings to report. During a check interval
@@ -452,7 +465,8 @@ Notes:
1) kernel doesn't guarantee, that new object will have desired id. So,
it's up to userspace, how to handle an object with "wrong" id.
2) Toggle with non-default value will be set back to -1 by kernel after
-successful IPC object allocation.
+successful IPC object allocation. If an IPC object allocation syscall
+fails, it is undefined if the value remains unmodified or is reset to -1.
==============================================================
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index e72853b6d725..7d73882e2c27 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -691,7 +691,7 @@ and don't use much of it.
The default value is 0.
See Documentation/vm/overcommit-accounting.rst and
-mm/mmap.c::__vm_enough_memory() for more information.
+mm/util.c::__vm_enough_memory() for more information.
==============================================================
diff --git a/arch/Kconfig b/arch/Kconfig
index c40293a9a335..f8c45bbd0d44 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -848,6 +848,16 @@ config REFCOUNT_FULL
against various use-after-free conditions that can be used in
security flaw exploits.
+config HAVE_ARCH_PREL32_RELOCATIONS
+ bool
+ help
+ May be selected by an architecture if it supports place-relative
+ 32-bit relocations, both in the toolchain and in the module loader,
+ in which case relative references can be used in special sections
+ for PCI fixup, initcalls etc which are only half the size on 64 bit
+ architectures, and don't require runtime relocation on relocatable
+ kernels.
+
source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index de2bd217adad..d73dc473fbb9 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -87,7 +87,8 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
const struct exception_table_entry *fixup;
- int fault, si_code = SEGV_MAPERR;
+ int si_code = SEGV_MAPERR;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
/* As of EV6, a load into $31/$f31 is a prefetch, and never faults
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index b884bbd6f354..db6913094be3 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -15,6 +15,7 @@
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/perf_event.h>
+#include <linux/mm_types.h>
#include <asm/pgalloc.h>
#include <asm/mmu.h>
@@ -66,7 +67,8 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
siginfo_t info;
- int fault, ret;
+ int ret;
+ vm_fault_t fault;
int write = regs->ecr_cause & ECR_C_PROTV_STORE; /* ST/EX */
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e8cd55a5b04c..ed74be437e32 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1635,6 +1635,10 @@ config ARCH_SELECT_MEMORY_MODEL
config HAVE_ARCH_PFN_VALID
def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM
+config HAVE_MEMBLOCK_PFN_VALID
+ def_bool y
+ depends on HAVE_ARCH_PFN_VALID
+
config HAVE_GENERIC_GUP
def_bool y
depends on ARM_LPAE
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 4355f0ec44d6..f98baaec0a15 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -17,6 +17,8 @@
#ifndef __ASSEMBLY__
+#include <linux/personality.h> /* For READ_IMPLIES_EXEC */
+
#ifndef CONFIG_MMU
#include <asm/page-nommu.h>
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index d9c299133111..82ab015bf42b 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -330,16 +330,15 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
* atomic helpers. Insert it into the gate_vma so that it is visible
* through ptrace and /proc/<pid>/mem.
*/
-static struct vm_area_struct gate_vma = {
- .vm_start = 0xffff0000,
- .vm_end = 0xffff0000 + PAGE_SIZE,
- .vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC,
-};
+static struct vm_area_struct gate_vma;
static int __init gate_vma_init(void)
{
vma_init(&gate_vma, NULL);
gate_vma.vm_page_prot = PAGE_READONLY_EXEC;
+ gate_vma.vm_start = 0xffff0000;
+ gate_vma.vm_end = 0xffff0000 + PAGE_SIZE;
+ gate_vma.vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC;
return 0;
}
arch_initcall(gate_vma_init);
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index ba0e786c952e..66566472c153 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -594,7 +594,7 @@ static void *__alloc_from_contiguous(struct device *dev, size_t size,
struct page *page;
void *ptr = NULL;
- page = dma_alloc_from_contiguous(dev, count, order, gfp);
+ page = dma_alloc_from_contiguous(dev, count, order, gfp & __GFP_NOWARN);
if (!page)
return NULL;
@@ -1299,7 +1299,8 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
unsigned long order = get_order(size);
struct page *page;
- page = dma_alloc_from_contiguous(dev, count, order, gfp);
+ page = dma_alloc_from_contiguous(dev, count, order,
+ gfp & __GFP_NOWARN);
if (!page)
goto error;
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 84becc911ee3..3232afb6fdc0 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -224,12 +224,12 @@ static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma)
return vma->vm_flags & mask ? false : true;
}
-static int __kprobes
+static vm_fault_t __kprobes
__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
unsigned int flags, struct task_struct *tsk)
{
struct vm_area_struct *vma;
- int fault;
+ vm_fault_t fault;
vma = find_vma(mm, addr);
fault = VM_FAULT_BADMAP;
@@ -264,7 +264,8 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
struct task_struct *tsk;
struct mm_struct *mm;
- int fault, sig, code;
+ int sig, code;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
if (notify_page_fault(regs, fsr))
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index d0a53cc6293a..0dec01a0c81c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -108,6 +108,7 @@ config ARM64
select HAVE_ARCH_KGDB
select HAVE_ARCH_MMAP_RND_BITS
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
+ select HAVE_ARCH_PREL32_RELOCATIONS
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
@@ -788,6 +789,10 @@ config ARCH_FLATMEM_ENABLE
config HAVE_ARCH_PFN_VALID
def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM
+config HAVE_MEMBLOCK_PFN_VALID
+ def_bool y
+ depends on HAVE_ARCH_PFN_VALID
+
config HW_PERF_EVENTS
def_bool y
depends on ARM_PMU
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 61e93f0b5482..072c51fb07d7 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -355,7 +355,7 @@ static int __init atomic_pool_init(void)
if (dev_get_cma_area(NULL))
page = dma_alloc_from_contiguous(NULL, nr_pages,
- pool_size_order, GFP_KERNEL);
+ pool_size_order, false);
else
page = alloc_pages(GFP_DMA32, pool_size_order);
@@ -573,7 +573,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size,
struct page *page;
page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
- get_order(size), gfp);
+ get_order(size), gfp & __GFP_NOWARN);
if (!page)
return NULL;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 9943690a3924..50b30ff30de4 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -379,12 +379,12 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
#define VM_FAULT_BADMAP 0x010000
#define VM_FAULT_BADACCESS 0x020000
-static int __do_page_fault(struct mm_struct *mm, unsigned long addr,
+static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
unsigned int mm_flags, unsigned long vm_flags,
struct task_struct *tsk)
{
struct vm_area_struct *vma;
- int fault;
+ vm_fault_t fault;
vma = find_vma(mm, addr);
fault = VM_FAULT_BADMAP;
@@ -427,7 +427,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
struct task_struct *tsk;
struct mm_struct *mm;
struct siginfo si;
- int fault, major = 0;
+ vm_fault_t fault, major = 0;
unsigned long vm_flags = VM_READ | VM_WRITE;
unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
index 933bbcef5363..eb263e61daf4 100644
--- a/arch/hexagon/mm/vm_fault.c
+++ b/arch/hexagon/mm/vm_fault.c
@@ -52,7 +52,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
struct mm_struct *mm = current->mm;
int si_signo;
int si_code = SEGV_MAPERR;
- int fault;
+ vm_fault_t fault;
const struct exception_table_entry *fixup;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 165827774bea..b1e7468eb65a 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -544,7 +544,6 @@ extern struct page *zero_page_memmap_ptr;
# ifdef CONFIG_VIRTUAL_MEM_MAP
/* arch mem_map init routine is needed due to holes in a virtual mem_map */
-# define __HAVE_ARCH_MEMMAP_INIT
extern void memmap_init (unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn);
# endif /* CONFIG_VIRTUAL_MEM_MAP */
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 817fa120645f..a9d55ad8d67b 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -86,7 +86,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
struct vm_area_struct *vma, *prev_vma;
struct mm_struct *mm = current->mm;
unsigned long mask;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index f2ff3779875a..9b6163c05a75 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -70,7 +70,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct * vma;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
pr_debug("do page fault:\nregs->sr=%#x, regs->pc=%#lx, address=%#lx, %ld, %p\n",
@@ -136,7 +136,7 @@ good_area:
*/
fault = handle_mm_fault(vma, address, flags);
- pr_debug("handle_mm_fault returns %d\n", fault);
+ pr_debug("handle_mm_fault returns %x\n", fault);
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
return 0;
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index af607447c683..202ad6a494f5 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -90,7 +90,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
struct mm_struct *mm = current->mm;
int code = SEGV_MAPERR;
int is_write = error_code & ESR_S;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
regs->ear = address;
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 5f71f2b903b7..73d8a0f0b810 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -43,7 +43,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
struct mm_struct *mm = tsk->mm;
const int field = sizeof(unsigned long) * 2;
int si_code;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c
index 9bdb7c3ecbb6..b740534b152c 100644
--- a/arch/nds32/mm/fault.c
+++ b/arch/nds32/mm/fault.c
@@ -73,7 +73,7 @@ void do_page_fault(unsigned long entry, unsigned long addr,
struct mm_struct *mm;
struct vm_area_struct *vma;
int si_code;
- int fault;
+ vm_fault_t fault;
unsigned int mask = VM_READ | VM_WRITE | VM_EXEC;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c
index b804dd06ea1c..24fd84cf6006 100644
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -47,7 +47,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause,
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
int code = SEGV_MAPERR;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
cause >>= 2;
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index 9f011d16cc46..dc4dbafc1d83 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -53,7 +53,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
struct mm_struct *mm;
struct vm_area_struct *vma;
int si_code;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index a80117980fc2..c8e8b7c05558 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -262,7 +262,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
struct task_struct *tsk;
struct mm_struct *mm;
unsigned long acc_type;
- int fault = 0;
+ vm_fault_t fault = 0;
unsigned int flags;
if (faulthandler_disabled())
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a80669209155..db0b6eebbfa5 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -177,6 +177,7 @@ config PPC
select HAVE_ARCH_KGDB
select HAVE_ARCH_MMAP_RND_BITS
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
+ select HAVE_ARCH_PREL32_RELOCATIONS
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
select HAVE_CBPF_JIT if !PPC64
diff --git a/arch/powerpc/include/asm/copro.h b/arch/powerpc/include/asm/copro.h
index ce216df31381..48616fe7ea75 100644
--- a/arch/powerpc/include/asm/copro.h
+++ b/arch/powerpc/include/asm/copro.h
@@ -10,13 +10,15 @@
#ifndef _ASM_POWERPC_COPRO_H
#define _ASM_POWERPC_COPRO_H
+#include <linux/mm_types.h>
+
struct copro_slb
{
u64 esid, vsid;
};
int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
- unsigned long dsisr, unsigned *flt);
+ unsigned long dsisr, vm_fault_t *flt);
int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index d4a3f4da409b..fc6bb9630a9c 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -77,7 +77,7 @@ struct page *kvm_alloc_hpt_cma(unsigned long nr_pages)
VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES),
- GFP_KERNEL);
+ false);
}
EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma);
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index 7d0945bd3a61..c8da352e8686 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -34,7 +34,7 @@
* to handle fortunately.
*/
int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
- unsigned long dsisr, unsigned *flt)
+ unsigned long dsisr, vm_fault_t *flt)
{
struct vm_area_struct *vma;
unsigned long is_write;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 7d262c6437c4..d51cf5f4e45e 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -155,7 +155,7 @@ static noinline int bad_access(struct pt_regs *regs, unsigned long address)
}
static int do_sigbus(struct pt_regs *regs, unsigned long address,
- unsigned int fault)
+ vm_fault_t fault)
{
siginfo_t info;
unsigned int lsb = 0;
@@ -186,7 +186,8 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
return 0;
}
-static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
+static int mm_fault_error(struct pt_regs *regs, unsigned long addr,
+ vm_fault_t fault)
{
/*
* Kernel page fault interrupted by SIGKILL. We have no reason to
@@ -414,7 +415,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
int is_exec = TRAP(regs) == 0x400;
int is_user = user_mode(regs);
int is_write = page_fault_is_write(error_code);
- int fault, major = 0;
+ vm_fault_t fault, major = 0;
bool must_retry = false;
if (notify_page_fault(regs))
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index 1e002e94d0f6..83cf58daaa79 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -111,7 +111,7 @@ int spufs_handle_class1(struct spu_context *ctx)
{
u64 ea, dsisr, access;
unsigned long flags;
- unsigned flt = 0;
+ vm_fault_t flt = 0;
int ret;
/*
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 148c98ca9b45..88401d5125bc 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -41,7 +41,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs)
struct mm_struct *mm;
unsigned long addr, cause;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
- int fault, code = SEGV_MAPERR;
+ int code = SEGV_MAPERR;
+ vm_fault_t fault;
cause = regs->scause;
addr = regs->sbadaddr;
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 4cc3f06b0ab3..72af23bacbb5 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -341,7 +341,8 @@ static noinline int signal_return(struct pt_regs *regs)
return -EACCES;
}
-static noinline void do_fault_error(struct pt_regs *regs, int access, int fault)
+static noinline void do_fault_error(struct pt_regs *regs, int access,
+ vm_fault_t fault)
{
int si_code;
@@ -401,7 +402,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, int fault)
* 11 Page translation -> Not present (nullification)
* 3b Region third trans. -> Not present (nullification)
*/
-static inline int do_exception(struct pt_regs *regs, int access)
+static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
{
struct gmap *gmap;
struct task_struct *tsk;
@@ -411,7 +412,7 @@ static inline int do_exception(struct pt_regs *regs, int access)
unsigned long trans_exc_code;
unsigned long address;
unsigned int flags;
- int fault;
+ vm_fault_t fault;
tsk = current;
/*
@@ -564,7 +565,8 @@ out:
void do_protection_exception(struct pt_regs *regs)
{
unsigned long trans_exc_code;
- int access, fault;
+ int access;
+ vm_fault_t fault;
trans_exc_code = regs->int_parm_long;
/*
@@ -599,7 +601,8 @@ NOKPROBE_SYMBOL(do_protection_exception);
void do_dat_exception(struct pt_regs *regs)
{
- int access, fault;
+ int access;
+ vm_fault_t fault;
access = VM_READ | VM_EXEC | VM_WRITE;
fault = do_exception(regs, access);
diff --git a/arch/sh/boards/of-generic.c b/arch/sh/boards/of-generic.c
index 46b2481eec90..26789ad28193 100644
--- a/arch/sh/boards/of-generic.c
+++ b/arch/sh/boards/of-generic.c
@@ -56,15 +56,15 @@ const struct of_cpu_method __cpu_method_of_table_sentinel
static void sh_of_smp_probe(void)
{
- struct device_node *np = 0;
- const char *method = 0;
+ struct device_node *np;
+ const char *method = NULL;
const struct of_cpu_method *m = __cpu_method_of_table;
pr_info("SH generic board support: scanning for cpus\n");
init_cpu_possible(cpumask_of(0));
- while ((np = of_find_node_by_type(np, "cpu"))) {
+ for_each_node_by_type(np, "cpu") {
const __be32 *cell = of_get_property(np, "reg", NULL);
u64 id = -1;
if (cell) id = of_read_number(cell, of_n_addr_cells(np));
@@ -80,6 +80,7 @@ static void sh_of_smp_probe(void)
if (!method) {
np = of_find_node_by_name(NULL, "cpus");
of_property_read_string(np, "enable-method", &method);
+ of_node_put(np);
}
pr_info("CPU enable method: %s\n", method);
diff --git a/arch/sh/include/asm/kexec.h b/arch/sh/include/asm/kexec.h
index fd5f331a3912..927d80ba2332 100644
--- a/arch/sh/include/asm/kexec.h
+++ b/arch/sh/include/asm/kexec.h
@@ -4,6 +4,7 @@
#include <asm/ptrace.h>
#include <asm/string.h>
+#include <linux/kernel.h>
/*
* KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
@@ -61,7 +62,7 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
__asm__ __volatile__ ("stc gbr, %0" : "=r" (newregs->gbr));
__asm__ __volatile__ ("stc sr, %0" : "=r" (newregs->sr));
- newregs->pc = (unsigned long)current_text_addr();
+ newregs->pc = _THIS_IP_;
}
}
#else
diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c
index 1a2526676a87..bb511e2d9d68 100644
--- a/arch/sh/kernel/dwarf.c
+++ b/arch/sh/kernel/dwarf.c
@@ -599,7 +599,7 @@ struct dwarf_frame *dwarf_unwind_stack(unsigned long pc,
* time this function makes its first function call.
*/
if (!pc || !prev)
- pc = (unsigned long)current_text_addr();
+ pc = _THIS_IP_;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/*
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index b8e7bb84b6b1..6defd2c6d9b1 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -313,7 +313,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
static noinline int
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, unsigned int fault)
+ unsigned long address, vm_fault_t fault)
{
/*
* Pagefault was interrupted by SIGKILL. We have no reason to
@@ -396,7 +396,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
struct task_struct *tsk;
struct mm_struct *mm;
struct vm_area_struct * vma;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index 9f75b6444bf1..b0440b0edd97 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -166,7 +166,8 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
unsigned int fixup;
unsigned long g2;
int from_user = !(regs->psr & PSR_PS);
- int fault, code;
+ int code;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
if (text_fault)
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 63166fcf9e25..8f8a604c1300 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -278,7 +278,8 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned int insn = 0;
- int si_code, fault_code, fault;
+ int si_code, fault_code;
+ vm_fault_t fault;
unsigned long address, mm_rss;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index ec9a42c14c56..cced82946042 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -72,7 +72,7 @@ good_area:
}
do {
- int fault;
+ vm_fault_t fault;
fault = handle_mm_fault(vma, address, flags);
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index 381473412937..8f12a5b50a42 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -168,11 +168,11 @@ static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma)
return vma->vm_flags & mask ? false : true;
}
-static int __do_pf(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
- unsigned int flags, struct task_struct *tsk)
+static vm_fault_t __do_pf(struct mm_struct *mm, unsigned long addr,
+ unsigned int fsr, unsigned int flags, struct task_struct *tsk)
{
struct vm_area_struct *vma;
- int fault;
+ vm_fault_t fault;
vma = find_vma(mm, addr);
fault = VM_FAULT_BADMAP;
@@ -209,7 +209,8 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
struct task_struct *tsk;
struct mm_struct *mm;
- int fault, sig, code;
+ int sig, code;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 662de7cc8848..b7f34a074840 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -123,6 +123,7 @@ config X86
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT
+ select HAVE_ARCH_PREL32_RELOCATIONS
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_STACKLEAK
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 302517929932..d1e19f358b6e 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -23,11 +23,8 @@
* _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h.
* While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL
* which is meaningless and will cause compiling error in some cases.
- * So do not include linux/export.h and define EXPORT_SYMBOL(sym)
- * as empty.
*/
-#define _LINUX_EXPORT_H
-#define EXPORT_SYMBOL(sym)
+#define __DISABLE_EXPORTS
#include "misc.h"
#include "error.h"
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index de690c2d2e33..a0ab9ab61c75 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -8,5 +8,6 @@ generated-y += xen-hypercalls.h
generic-y += dma-contiguous.h
generic-y += early_ioremap.h
+generic-y += export.h
generic-y += mcs_spinlock.h
generic-y += mm-arch-hooks.h
diff --git a/arch/x86/include/asm/export.h b/arch/x86/include/asm/export.h
deleted file mode 100644
index 2a51d66689c5..000000000000
--- a/arch/x86/include/asm/export.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifdef CONFIG_64BIT
-#define KSYM_ALIGN 16
-#endif
-#include <asm-generic/export.h>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8b7fbda649b2..2714131c96e9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7301,8 +7301,9 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
}
-void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
- unsigned long start, unsigned long end)
+int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ unsigned long start, unsigned long end,
+ bool blockable)
{
unsigned long apic_address;
@@ -7313,6 +7314,8 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
if (start <= apic_address && apic_address < end)
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+
+ return 0;
}
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index db1c042e9853..b9123c497e0a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -16,6 +16,7 @@
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
+#include <linux/mm_types.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
@@ -999,7 +1000,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, u32 *pkey, unsigned int fault)
+ unsigned long address, u32 *pkey, vm_fault_t fault)
{
if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, 0, 0);
@@ -1213,7 +1214,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
- int fault, major = 0;
+ vm_fault_t fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
u32 pkey;
diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c
index fe3343ddccaf..1fc138b6bc0a 100644
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
@@ -157,7 +157,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
if (gfpflags_allow_blocking(flag))
page = dma_alloc_from_contiguous(dev, count, get_order(size),
- flag);
+ flag & __GFP_NOWARN);
if (!page)
page = alloc_pages(flag, get_order(size));
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index c111a833205a..2ab0e0dcd166 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -42,7 +42,7 @@ void do_page_fault(struct pt_regs *regs)
int code;
int is_write, is_exec;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
code = SEGV_MAPERR;
diff --git a/block/genhd.c b/block/genhd.c
index 8cc719a37b32..434066b206a8 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1003,7 +1003,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/drivers/base/firmware_loader/fallback.c b/drivers/base/firmware_loader/fallback.c
index 202324291542..b5c865fe263b 100644
--- a/drivers/base/firmware_loader/fallback.c
+++ b/drivers/base/firmware_loader/fallback.c
@@ -219,11 +219,6 @@ static ssize_t firmware_loading_show(struct device *dev,
return sprintf(buf, "%d\n", loading);
}
-/* Some architectures don't have PAGE_KERNEL_RO */
-#ifndef PAGE_KERNEL_RO
-#define PAGE_KERNEL_RO PAGE_KERNEL
-#endif
-
/* one pages buffer should be mapped/unmapped only once */
static int map_fw_priv_pages(struct fw_priv *fw_priv)
{
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index f5e560188a18..c8a1cb0b6136 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -736,8 +736,6 @@ int hotplug_memory_register(int nid, struct mem_section *section)
mem->section_count++;
}
- if (mem->section_count == sections_per_block)
- ret = register_mem_sect_under_node(mem, nid, false);
out:
mutex_unlock(&mem_sysfs_mutex);
return ret;
diff --git a/drivers/base/node.c b/drivers/base/node.c
index a5e821d09656..1ac4c36e13bb 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -399,18 +399,12 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
}
/* register memory section under specified node if it spans that node */
-int register_mem_sect_under_node(struct memory_block *mem_blk, int nid,
- bool check_nid)
+int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
{
- int ret;
+ int ret, nid = *(int *)arg;
unsigned long pfn, sect_start_pfn, sect_end_pfn;
- if (!mem_blk)
- return -EFAULT;
-
mem_blk->nid = nid;
- if (!node_online(nid))
- return 0;
sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
@@ -433,7 +427,7 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid,
* case, during hotplug we know that all pages in the memory
* block belong to the same node.
*/
- if (check_nid) {
+ if (system_state == SYSTEM_BOOTING) {
page_nid = get_nid_for_pfn(pfn);
if (page_nid < 0)
continue;
@@ -490,41 +484,10 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
return 0;
}
-int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages,
- bool check_nid)
+int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long end_pfn = start_pfn + nr_pages;
- unsigned long pfn;
- struct memory_block *mem_blk = NULL;
- int err = 0;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- unsigned long section_nr = pfn_to_section_nr(pfn);
- struct mem_section *mem_sect;
- int ret;
-
- if (!present_section_nr(section_nr))
- continue;
- mem_sect = __nr_to_section(section_nr);
-
- /* same memblock ? */
- if (mem_blk)
- if ((section_nr >= mem_blk->start_section_nr) &&
- (section_nr <= mem_blk->end_section_nr))
- continue;
-
- mem_blk = find_memory_block_hinted(mem_sect, mem_blk);
-
- ret = register_mem_sect_under_node(mem_blk, nid, check_nid);
- if (!err)
- err = ret;
-
- /* discard ref obtained in find_memory_block() */
- }
-
- if (mem_blk)
- kobject_put(&mem_blk->dev.kobj);
- return err;
+ return walk_memory_range(start_pfn, end_pfn, (void *)&nid,
+ register_mem_sect_under_node);
}
#ifdef CONFIG_HUGETLBFS
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 77fff4da9a03..6fd46083e629 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -495,7 +495,7 @@ static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
return rc;
vma->vm_ops = &dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ vma->vm_flags |= VM_HUGEPAGE;
return 0;
}
diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index f0587273940e..d8e185582642 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -1205,7 +1205,7 @@ static int ioctl_get_cycle_timer2(struct client *client, union ioctl_arg *arg)
{
struct fw_cdev_get_cycle_timer2 *a = &arg->get_cycle_timer2;
struct fw_card *card = client->device->card;
- struct timespec ts = {0, 0};
+ struct timespec64 ts = {0, 0};
u32 cycle_time;
int ret = 0;
@@ -1214,9 +1214,9 @@ static int ioctl_get_cycle_timer2(struct client *client, union ioctl_arg *arg)
cycle_time = card->driver->read_csr(card, CSR_CYCLE_TIME);
switch (a->clk_id) {
- case CLOCK_REALTIME: getnstimeofday(&ts); break;
- case CLOCK_MONOTONIC: ktime_get_ts(&ts); break;
- case CLOCK_MONOTONIC_RAW: getrawmonotonic(&ts); break;
+ case CLOCK_REALTIME: ktime_get_real_ts64(&ts); break;
+ case CLOCK_MONOTONIC: ktime_get_ts64(&ts); break;
+ case CLOCK_MONOTONIC_RAW: ktime_get_raw_ts64(&ts); break;
default:
ret = -EINVAL;
}
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 88c322d7c71e..14c40a7750d1 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -24,6 +24,7 @@ KBUILD_CFLAGS := $(cflags-y) -DDISABLE_BRANCH_PROFILING \
-D__NO_FORTIFY \
$(call cc-option,-ffreestanding) \
$(call cc-option,-fno-stack-protector) \
+ -D__DISABLE_EXPORTS
GCOV_PROFILE := n
KASAN_SANITIZE := n
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index a365ea2383d1..e55508b39496 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -178,12 +178,18 @@ void amdgpu_mn_unlock(struct amdgpu_mn *mn)
*
* @amn: our notifier
*/
-static void amdgpu_mn_read_lock(struct amdgpu_mn *amn)
+static int amdgpu_mn_read_lock(struct amdgpu_mn *amn, bool blockable)
{
- mutex_lock(&amn->read_lock);
+ if (blockable)
+ mutex_lock(&amn->read_lock);
+ else if (!mutex_trylock(&amn->read_lock))
+ return -EAGAIN;
+
if (atomic_inc_return(&amn->recursion) == 1)
down_read_non_owner(&amn->lock);
mutex_unlock(&amn->read_lock);
+
+ return 0;
}
/**
@@ -239,10 +245,11 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node,
* Block for operations on BOs to finish and mark pages as accessed and
* potentially dirty.
*/
-static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
+static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool blockable)
{
struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn);
struct interval_tree_node *it;
@@ -250,17 +257,28 @@ static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
/* notification is exclusive, but interval is inclusive */
end -= 1;
- amdgpu_mn_read_lock(amn);
+ /* TODO we should be able to split locking for interval tree and
+ * amdgpu_mn_invalidate_node
+ */
+ if (amdgpu_mn_read_lock(amn, blockable))
+ return -EAGAIN;
it = interval_tree_iter_first(&amn->objects, start, end);
while (it) {
struct amdgpu_mn_node *node;
+ if (!blockable) {
+ amdgpu_mn_read_unlock(amn);
+ return -EAGAIN;
+ }
+
node = container_of(it, struct amdgpu_mn_node, it);
it = interval_tree_iter_next(it, start, end);
amdgpu_mn_invalidate_node(node, start, end);
}
+
+ return 0;
}
/**
@@ -275,10 +293,11 @@ static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
* necessitates evicting all user-mode queues of the process. The BOs
* are restorted in amdgpu_mn_invalidate_range_end_hsa.
*/
-static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
+static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool blockable)
{
struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn);
struct interval_tree_node *it;
@@ -286,13 +305,19 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
/* notification is exclusive, but interval is inclusive */
end -= 1;
- amdgpu_mn_read_lock(amn);
+ if (amdgpu_mn_read_lock(amn, blockable))
+ return -EAGAIN;
it = interval_tree_iter_first(&amn->objects, start, end);
while (it) {
struct amdgpu_mn_node *node;
struct amdgpu_bo *bo;
+ if (!blockable) {
+ amdgpu_mn_read_unlock(amn);
+ return -EAGAIN;
+ }
+
node = container_of(it, struct amdgpu_mn_node, it);
it = interval_tree_iter_next(it, start, end);
@@ -304,6 +329,8 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
amdgpu_amdkfd_evict_userptr(mem, mm);
}
}
+
+ return 0;
}
/**
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index dcd6e230d16a..2c9b284036d1 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -112,10 +112,11 @@ static void del_object(struct i915_mmu_object *mo)
mo->attached = false;
}
-static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
+static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
struct mm_struct *mm,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool blockable)
{
struct i915_mmu_notifier *mn =
container_of(_mn, struct i915_mmu_notifier, mn);
@@ -124,7 +125,7 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
LIST_HEAD(cancelled);
if (RB_EMPTY_ROOT(&mn->objects.rb_root))
- return;
+ return 0;
/* interval ranges are inclusive, but invalidate range is exclusive */
end--;
@@ -132,6 +133,10 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
spin_lock(&mn->lock);
it = interval_tree_iter_first(&mn->objects, start, end);
while (it) {
+ if (!blockable) {
+ spin_unlock(&mn->lock);
+ return -EAGAIN;
+ }
/* The mmu_object is released late when destroying the
* GEM object so it is entirely possible to gain a
* reference on an object in the process of being freed
@@ -154,6 +159,8 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
if (!list_empty(&cancelled))
flush_workqueue(mn->wq);
+
+ return 0;
}
static const struct mmu_notifier_ops i915_gem_userptr_notifier = {
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c
index abd24975c9b1..f8b35df44c60 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -118,19 +118,27 @@ static void radeon_mn_release(struct mmu_notifier *mn,
* We block for all BOs between start and end to be idle and
* unmap them by move them into system domain again.
*/
-static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
+static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool blockable)
{
struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
struct ttm_operation_ctx ctx = { false, false };
struct interval_tree_node *it;
+ int ret = 0;
/* notification is exclusive, but interval is inclusive */
end -= 1;
- mutex_lock(&rmn->lock);
+ /* TODO we should be able to split locking for interval tree and
+ * the tear down.
+ */
+ if (blockable)
+ mutex_lock(&rmn->lock);
+ else if (!mutex_trylock(&rmn->lock))
+ return -EAGAIN;
it = interval_tree_iter_first(&rmn->objects, start, end);
while (it) {
@@ -138,6 +146,11 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
struct radeon_bo *bo;
long r;
+ if (!blockable) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
node = container_of(it, struct radeon_mn_node, it);
it = interval_tree_iter_next(it, start, end);
@@ -166,7 +179,10 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
}
}
+out_unlock:
mutex_unlock(&rmn->lock);
+
+ return ret;
}
static const struct mmu_notifier_ops radeon_mn_ops = {
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 182436b92ba9..6ec748eccff7 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -186,6 +186,7 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn,
rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
ULLONG_MAX,
ib_umem_notifier_release_trampoline,
+ true,
NULL);
up_read(&context->umem_rwsem);
}
@@ -207,22 +208,31 @@ static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
return 0;
}
-static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
+static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool blockable)
{
struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+ int ret;
if (!context->invalidate_range)
- return;
+ return 0;
+
+ if (blockable)
+ down_read(&context->umem_rwsem);
+ else if (!down_read_trylock(&context->umem_rwsem))
+ return -EAGAIN;
ib_ucontext_notifier_start_account(context);
- down_read(&context->umem_rwsem);
- rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+ ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
end,
- invalidate_range_start_trampoline, NULL);
+ invalidate_range_start_trampoline,
+ blockable, NULL);
up_read(&context->umem_rwsem);
+
+ return ret;
}
static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
@@ -242,10 +252,15 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
if (!context->invalidate_range)
return;
+ /*
+ * TODO: we currently bail out if there is any sleepable work to be done
+ * in ib_umem_notifier_invalidate_range_start so we shouldn't really block
+ * here. But this is ugly and fragile.
+ */
down_read(&context->umem_rwsem);
rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
end,
- invalidate_range_end_trampoline, NULL);
+ invalidate_range_end_trampoline, true, NULL);
up_read(&context->umem_rwsem);
ib_ucontext_notifier_end_account(context);
}
@@ -798,6 +813,7 @@ EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
u64 start, u64 last,
umem_call_back cb,
+ bool blockable,
void *cookie)
{
int ret_val = 0;
@@ -809,6 +825,9 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
for (node = rbt_ib_umem_iter_first(root, start, last - 1);
node; node = next) {
+ /* TODO move the blockable decision up to the callback */
+ if (!blockable)
+ return -EAGAIN;
next = rbt_ib_umem_iter_next(node, start, last - 1);
umem = container_of(node, struct ib_umem_odp, interval_tree);
ret_val = cb(umem->umem, start, last, cookie) || ret_val;
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
index 70aceefe14d5..e1c7996c018e 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -67,9 +67,9 @@ struct mmu_rb_handler {
static unsigned long mmu_node_start(struct mmu_rb_node *);
static unsigned long mmu_node_last(struct mmu_rb_node *);
-static void mmu_notifier_range_start(struct mmu_notifier *,
+static int mmu_notifier_range_start(struct mmu_notifier *,
struct mm_struct *,
- unsigned long, unsigned long);
+ unsigned long, unsigned long, bool);
static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
unsigned long, unsigned long);
static void do_remove(struct mmu_rb_handler *handler,
@@ -284,10 +284,11 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
handler->ops->remove(handler->ops_arg, node);
}
-static void mmu_notifier_range_start(struct mmu_notifier *mn,
+static int mmu_notifier_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool blockable)
{
struct mmu_rb_handler *handler =
container_of(mn, struct mmu_rb_handler, mn);
@@ -313,6 +314,8 @@ static void mmu_notifier_range_start(struct mmu_notifier *mn,
if (added)
queue_work(handler->wq, &handler->del_work);
+
+ return 0;
}
/*
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index f1a87a690a4c..d216e0d2921d 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -488,7 +488,7 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
down_read(&ctx->umem_rwsem);
rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
- mr_leaf_free, imr);
+ mr_leaf_free, true, imr);
up_read(&ctx->umem_rwsem);
wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 2aded63548c7..4e04fff23977 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2610,7 +2610,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
return NULL;
page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
- get_order(size), flag);
+ get_order(size), flag & __GFP_NOWARN);
if (!page)
return NULL;
}
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 1d0b53a04a08..58da65df03f5 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -508,7 +508,7 @@ static void do_fault(struct work_struct *work)
{
struct fault *fault = container_of(work, struct fault, work);
struct vm_area_struct *vma;
- int ret = VM_FAULT_ERROR;
+ vm_fault_t ret = VM_FAULT_ERROR;
unsigned int flags = 0;
struct mm_struct *mm;
u64 address;
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index d611a3e9c81a..5f3f10cf9d9d 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3763,7 +3763,8 @@ static void *intel_alloc_coherent(struct device *dev, size_t size,
if (gfpflags_allow_blocking(flags)) {
unsigned int count = size >> PAGE_SHIFT;
- page = dma_alloc_from_contiguous(dev, count, order, flags);
+ page = dma_alloc_from_contiguous(dev, count, order,
+ flags & __GFP_NOWARN);
if (page && iommu_no_mapping(dev) &&
page_to_phys(page) + size > dev->coherent_dma_mask) {
dma_release_from_contiguous(dev, page, count);
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index eb308363e541..4a03e5090952 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -24,6 +24,7 @@
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
+#include <linux/mm_types.h>
#include <asm/page.h>
#include "intel-pasid.h"
@@ -581,7 +582,8 @@ static irqreturn_t prq_event_thread(int irq, void *d)
struct vm_area_struct *vma;
struct page_req_dsc *req;
struct qi_desc resp;
- int ret, result;
+ int result;
+ vm_fault_t ret;
u64 address;
handled = 1;
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 817b9fba50db..f6e0a8b3a61e 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -1,6 +1,7 @@
config BCACHE
tristate "Block device as cache"
+ select CRC64
help
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index e873b0f7a82a..20eddeac1531 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -281,134 +281,3 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
return 0;
}
-
-/*
- * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
- * use permitted, subject to terms of PostgreSQL license; see.)
-
- * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
- * usual sort of implementation. (See Ross Williams' excellent introduction
- * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
- * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
- * If we have no working 64-bit type, then fake it with two 32-bit registers.
- *
- * The present implementation is a normal (not "reflected", in Williams'
- * terms) 64-bit CRC, using initial all-ones register contents and a final
- * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
- * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
- *
- * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
- * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
- * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
- * x^7 + x^4 + x + 1
-*/
-
-static const uint64_t crc_table[256] = {
- 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
- 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
- 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
- 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
- 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
- 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
- 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
- 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
- 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
- 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
- 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
- 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
- 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
- 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
- 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
- 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
- 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
- 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
- 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
- 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
- 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
- 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
- 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
- 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
- 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
- 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
- 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
- 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
- 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
- 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
- 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
- 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
- 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
- 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
- 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
- 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
- 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
- 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
- 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
- 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
- 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
- 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
- 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
- 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
- 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
- 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
- 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
- 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
- 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
- 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
- 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
- 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
- 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
- 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
- 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
- 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
- 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
- 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
- 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
- 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
- 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
- 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
- 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
- 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
- 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
- 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
- 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
- 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
- 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
- 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
- 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
- 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
- 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
- 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
- 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
- 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
- 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
- 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
- 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
- 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
- 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
- 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
- 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
- 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
- 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
- 0x9AFCE626CE85B507ULL,
-};
-
-uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len)
-{
- const unsigned char *data = _data;
-
- while (len--) {
- int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
- crc = crc_table[i] ^ (crc << 8);
- }
-
- return crc;
-}
-
-uint64_t bch_crc64(const void *data, size_t len)
-{
- uint64_t crc = 0xffffffffffffffffULL;
-
- crc = bch_crc64_update(crc, data, len);
-
- return crc ^ 0xffffffffffffffffULL;
-}
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 4e0ed19e32d3..00aab6abcfe4 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -11,6 +11,7 @@
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
+#include <linux/crc64.h>
#include "closure.h"
@@ -543,6 +544,22 @@ dup: \
#define RB_PREV(ptr, member) \
container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
+static inline uint64_t bch_crc64(const void *p, size_t len)
+{
+ uint64_t crc = 0xffffffffffffffffULL;
+
+ crc = crc64_be(crc, p, len);
+ return crc ^ 0xffffffffffffffffULL;
+}
+
+static inline uint64_t bch_crc64_update(uint64_t crc,
+ const void *p,
+ size_t len)
+{
+ crc = crc64_be(crc, p, len);
+ return crc;
+}
+
/* Does linear interpolation between powers of two */
static inline unsigned int fract_exp_two(unsigned int x,
unsigned int fract_bits)
@@ -563,8 +580,4 @@ static inline sector_t bdev_sectors(struct block_device *bdev)
{
return bdev->bd_inode->i_size >> 9;
}
-
-uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len);
-uint64_t bch_crc64(const void *data, size_t len);
-
#endif /* _BCACHE_UTIL_H */
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index d45f3e6b17d2..dc7b34174f85 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -134,7 +134,7 @@ static int cxl_handle_segment_miss(struct cxl_context *ctx,
int cxl_handle_mm_fault(struct mm_struct *mm, u64 dsisr, u64 dar)
{
- unsigned flt = 0;
+ vm_fault_t flt = 0;
int result;
unsigned long access, flags, inv_flags = 0;
diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c
index 63d6246d6dff..6369aeaa7056 100644
--- a/drivers/misc/mic/scif/scif_dma.c
+++ b/drivers/misc/mic/scif/scif_dma.c
@@ -200,15 +200,18 @@ static void scif_mmu_notifier_release(struct mmu_notifier *mn,
schedule_work(&scif_info.misc_work);
}
-static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+static int scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool blockable)
{
struct scif_mmu_notif *mmn;
mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier);
scif_rma_destroy_tcw(mmn, start, end - start);
+
+ return 0;
}
static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
index a963b0a4a3c5..31695a078485 100644
--- a/drivers/misc/ocxl/link.c
+++ b/drivers/misc/ocxl/link.c
@@ -2,6 +2,7 @@
// Copyright 2017 IBM Corp.
#include <linux/sched/mm.h>
#include <linux/mutex.h>
+#include <linux/mm_types.h>
#include <linux/mmu_context.h>
#include <asm/copro.h>
#include <asm/pnv-ocxl.h>
@@ -126,7 +127,7 @@ static void ack_irq(struct spa *spa, enum xsl_response r)
static void xsl_fault_handler_bh(struct work_struct *fault_work)
{
- unsigned int flt = 0;
+ vm_fault_t flt = 0;
unsigned long access, flags, inv_flags = 0;
enum xsl_response r;
struct xsl_fault *fault = container_of(fault_work, struct xsl_fault,
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
index a3454eb56fbf..be28f05bfafa 100644
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -219,9 +219,10 @@ void gru_flush_all_tlb(struct gru_state *gru)
/*
* MMUOPS notifier callout functions
*/
-static void gru_invalidate_range_start(struct mmu_notifier *mn,
+static int gru_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end,
+ bool blockable)
{
struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
ms_notifier);
@@ -231,6 +232,8 @@ static void gru_invalidate_range_start(struct mmu_notifier *mn,
gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms,
start, end, atomic_read(&gms->ms_range_active));
gru_flush_tlb_range(gms, start, end - start);
+
+ return 0;
}
static void gru_invalidate_range_end(struct mmu_notifier *mn,
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 46f58a9771d7..ef7143a274e0 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -66,9 +66,15 @@ static void pci_do_fixups(struct pci_dev *dev, struct pci_fixup *f,
f->vendor == (u16) PCI_ANY_ID) &&
(f->device == dev->device ||
f->device == (u16) PCI_ANY_ID)) {
- calltime = fixup_debug_start(dev, f->hook);
- f->hook(dev);
- fixup_debug_report(dev, calltime, f->hook);
+ void (*hook)(struct pci_dev *dev);
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ hook = offset_to_ptr(&f->hook_offset);
+#else
+ hook = f->hook;
+#endif
+ calltime = fixup_debug_start(dev, hook);
+ hook(dev);
+ fixup_debug_report(dev, calltime, hook);
}
}
diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
index a8cb8d2f2abb..cbe467ff1aba 100644
--- a/drivers/rapidio/devices/rio_mport_cdev.c
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -1006,7 +1006,6 @@ out_free:
static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg)
{
struct mport_cdev_priv *priv;
- struct mport_dev *md;
struct rio_async_tx_wait w_param;
struct mport_dma_req *req;
dma_cookie_t cookie;
@@ -1016,7 +1015,6 @@ static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg)
int ret;
priv = (struct mport_cdev_priv *)filp->private_data;
- md = priv->md;
if (unlikely(copy_from_user(&w_param, arg, sizeof(w_param))))
return -EFAULT;
diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c
index 948ce82a7725..0fa1b6b1491a 100644
--- a/drivers/s390/char/vmcp.c
+++ b/drivers/s390/char/vmcp.c
@@ -68,7 +68,7 @@ static void vmcp_response_alloc(struct vmcp_session *session)
* anymore the system won't work anyway.
*/
if (order > 2)
- page = cma_alloc(vmcp_cma, nr_pages, 0, GFP_KERNEL);
+ page = cma_alloc(vmcp_cma, nr_pages, 0, false);
if (page) {
session->response = (char *)page_to_phys(page);
session->cma_alloc = 1;
diff --git a/drivers/staging/android/ion/ion_cma_heap.c b/drivers/staging/android/ion/ion_cma_heap.c
index 49718c96bf9e..3fafd013d80a 100644
--- a/drivers/staging/android/ion/ion_cma_heap.c
+++ b/drivers/staging/android/ion/ion_cma_heap.c
@@ -39,7 +39,7 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer,
if (align > CONFIG_CMA_ALIGNMENT)
align = CONFIG_CMA_ALIGNMENT;
- pages = cma_alloc(cma_heap->cma, nr_pages, align, GFP_KERNEL);
+ pages = cma_alloc(cma_heap->cma, nr_pages, align, false);
if (!pages)
return -ENOMEM;
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index c866a62f766d..57390c7666e5 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -479,18 +479,25 @@ static const struct vm_operations_struct gntdev_vmops = {
/* ------------------------------------------------------------------ */
+static bool in_range(struct gntdev_grant_map *map,
+ unsigned long start, unsigned long end)
+{
+ if (!map->vma)
+ return false;
+ if (map->vma->vm_start >= end)
+ return false;
+ if (map->vma->vm_end <= start)
+ return false;
+
+ return true;
+}
+
static void unmap_if_in_range(struct gntdev_grant_map *map,
unsigned long start, unsigned long end)
{
unsigned long mstart, mend;
int err;
- if (!map->vma)
- return;
- if (map->vma->vm_start >= end)
- return;
- if (map->vma->vm_end <= start)
- return;
mstart = max(start, map->vma->vm_start);
mend = min(end, map->vma->vm_end);
pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
@@ -503,21 +510,40 @@ static void unmap_if_in_range(struct gntdev_grant_map *map,
WARN_ON(err);
}
-static void mn_invl_range_start(struct mmu_notifier *mn,
+static int mn_invl_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end,
+ bool blockable)
{
struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
struct gntdev_grant_map *map;
+ int ret = 0;
+
+ /* TODO do we really need a mutex here? */
+ if (blockable)
+ mutex_lock(&priv->lock);
+ else if (!mutex_trylock(&priv->lock))
+ return -EAGAIN;
- mutex_lock(&priv->lock);
list_for_each_entry(map, &priv->maps, next) {
+ if (in_range(map, start, end)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
unmap_if_in_range(map, start, end);
}
list_for_each_entry(map, &priv->freeable_maps, next) {
+ if (in_range(map, start, end)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
unmap_if_in_range(map, start, end);
}
+
+out_unlock:
mutex_unlock(&priv->lock);
+
+ return ret;
}
static void mn_release(struct mmu_notifier *mn,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index e91028d4340a..66621e96f9af 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -167,7 +167,7 @@ adfs_mode2atts(struct super_block *sb, struct inode *inode)
* of time to convert from RISC OS epoch to Unix epoch.
*/
static void
-adfs_adfs2unix_time(struct timespec *tv, struct inode *inode)
+adfs_adfs2unix_time(struct timespec64 *tv, struct inode *inode)
{
unsigned int high, low;
/* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since
@@ -195,11 +195,11 @@ adfs_adfs2unix_time(struct timespec *tv, struct inode *inode)
/* convert from RISC OS to Unix epoch */
nsec -= nsec_unix_epoch_diff_risc_os_epoch;
- *tv = ns_to_timespec(nsec);
+ *tv = ns_to_timespec64(nsec);
return;
cur_time:
- *tv = timespec64_to_timespec(current_time(inode));
+ *tv = current_time(inode);
return;
too_early:
@@ -242,7 +242,6 @@ adfs_unix2adfs_time(struct inode *inode, unsigned int secs)
struct inode *
adfs_iget(struct super_block *sb, struct object_info *obj)
{
- struct timespec ts;
struct inode *inode;
inode = new_inode(sb);
@@ -271,9 +270,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000);
inode->i_mode = adfs_atts2mode(sb, inode);
- ts = timespec64_to_timespec(inode->i_mtime);
- adfs_adfs2unix_time(&ts, inode);
- inode->i_mtime = timespec_to_timespec64(ts);
+ adfs_adfs2unix_time(&inode->i_mtime, inode);
inode->i_atime = inode->i_mtime;
inode->i_ctime = inode->i_mtime;
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 94ce15633809..25e7131fd230 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -152,15 +152,9 @@ int autofs_expire_run(struct super_block *, struct vfsmount *,
struct autofs_sb_info *,
struct autofs_packet_expire __user *);
int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
- struct autofs_sb_info *sbi, int when);
+ struct autofs_sb_info *sbi, unsigned int how);
int autofs_expire_multi(struct super_block *, struct vfsmount *,
struct autofs_sb_info *, int __user *);
-struct dentry *autofs_expire_direct(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi, int how);
-struct dentry *autofs_expire_indirect(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi, int how);
/* Device node initialization */
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index b332d3f6e730..d441244b79df 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -10,11 +10,9 @@
#include "autofs_i.h"
-static unsigned long now;
-
/* Check if a dentry can be expired */
static inline int autofs_can_expire(struct dentry *dentry,
- unsigned long timeout, int do_now)
+ unsigned long timeout, unsigned int how)
{
struct autofs_info *ino = autofs_dentry_ino(dentry);
@@ -22,16 +20,17 @@ static inline int autofs_can_expire(struct dentry *dentry,
if (ino == NULL)
return 0;
- if (!do_now) {
+ if (!(how & AUTOFS_EXP_IMMEDIATE)) {
/* Too young to die */
- if (!timeout || time_after(ino->last_used + timeout, now))
+ if (!timeout || time_after(ino->last_used + timeout, jiffies))
return 0;
}
return 1;
}
/* Check a mount point for busyness */
-static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
+static int autofs_mount_busy(struct vfsmount *mnt,
+ struct dentry *dentry, unsigned int how)
{
struct dentry *top = dentry;
struct path path = {.mnt = mnt, .dentry = dentry};
@@ -52,6 +51,12 @@ static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
goto done;
}
+ /* Not a submount, has a forced expire been requested */
+ if (how & AUTOFS_EXP_FORCED) {
+ status = 0;
+ goto done;
+ }
+
/* Update the expiry counter if fs is busy */
if (!may_umount_tree(path.mnt)) {
struct autofs_info *ino;
@@ -187,10 +192,14 @@ again:
static int autofs_direct_busy(struct vfsmount *mnt,
struct dentry *top,
unsigned long timeout,
- int do_now)
+ unsigned int how)
{
pr_debug("top %p %pd\n", top, top);
+ /* Forced expire, user space handles busy mounts */
+ if (how & AUTOFS_EXP_FORCED)
+ return 0;
+
/* If it's busy update the expiry counters */
if (!may_umount_tree(mnt)) {
struct autofs_info *ino;
@@ -202,7 +211,7 @@ static int autofs_direct_busy(struct vfsmount *mnt,
}
/* Timeout of a direct mount is determined by its top dentry */
- if (!autofs_can_expire(top, timeout, do_now))
+ if (!autofs_can_expire(top, timeout, how))
return 1;
return 0;
@@ -215,7 +224,7 @@ static int autofs_direct_busy(struct vfsmount *mnt,
static int autofs_tree_busy(struct vfsmount *mnt,
struct dentry *top,
unsigned long timeout,
- int do_now)
+ unsigned int how)
{
struct autofs_info *top_ino = autofs_dentry_ino(top);
struct dentry *p;
@@ -237,7 +246,7 @@ static int autofs_tree_busy(struct vfsmount *mnt,
* If the fs is busy update the expiry counter.
*/
if (d_mountpoint(p)) {
- if (autofs_mount_busy(mnt, p)) {
+ if (autofs_mount_busy(mnt, p, how)) {
top_ino->last_used = jiffies;
dput(p);
return 1;
@@ -260,8 +269,12 @@ static int autofs_tree_busy(struct vfsmount *mnt,
}
}
+ /* Forced expire, user space handles busy mounts */
+ if (how & AUTOFS_EXP_FORCED)
+ return 0;
+
/* Timeout of a tree mount is ultimately determined by its top dentry */
- if (!autofs_can_expire(top, timeout, do_now))
+ if (!autofs_can_expire(top, timeout, how))
return 1;
return 0;
@@ -270,7 +283,7 @@ static int autofs_tree_busy(struct vfsmount *mnt,
static struct dentry *autofs_check_leaves(struct vfsmount *mnt,
struct dentry *parent,
unsigned long timeout,
- int do_now)
+ unsigned int how)
{
struct dentry *p;
@@ -282,11 +295,17 @@ static struct dentry *autofs_check_leaves(struct vfsmount *mnt,
if (d_mountpoint(p)) {
/* Can we umount this guy */
- if (autofs_mount_busy(mnt, p))
+ if (autofs_mount_busy(mnt, p, how))
continue;
+ /* This isn't a submount so if a forced expire
+ * has been requested, user space handles busy
+ * mounts */
+ if (how & AUTOFS_EXP_FORCED)
+ return p;
+
/* Can we expire this guy */
- if (autofs_can_expire(p, timeout, do_now))
+ if (autofs_can_expire(p, timeout, how))
return p;
}
}
@@ -294,23 +313,21 @@ static struct dentry *autofs_check_leaves(struct vfsmount *mnt,
}
/* Check if we can expire a direct mount (possibly a tree) */
-struct dentry *autofs_expire_direct(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi,
- int how)
+static struct dentry *autofs_expire_direct(struct super_block *sb,
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi,
+ unsigned int how)
{
- unsigned long timeout;
struct dentry *root = dget(sb->s_root);
- int do_now = how & AUTOFS_EXP_IMMEDIATE;
struct autofs_info *ino;
+ unsigned long timeout;
if (!root)
return NULL;
- now = jiffies;
timeout = sbi->exp_timeout;
- if (!autofs_direct_busy(mnt, root, timeout, do_now)) {
+ if (!autofs_direct_busy(mnt, root, timeout, how)) {
spin_lock(&sbi->fs_lock);
ino = autofs_dentry_ino(root);
/* No point expiring a pending mount */
@@ -321,7 +338,7 @@ struct dentry *autofs_expire_direct(struct super_block *sb,
ino->flags |= AUTOFS_INF_WANT_EXPIRE;
spin_unlock(&sbi->fs_lock);
synchronize_rcu();
- if (!autofs_direct_busy(mnt, root, timeout, do_now)) {
+ if (!autofs_direct_busy(mnt, root, timeout, how)) {
spin_lock(&sbi->fs_lock);
ino->flags |= AUTOFS_INF_EXPIRING;
init_completion(&ino->expire_complete);
@@ -346,10 +363,8 @@ out:
static struct dentry *should_expire(struct dentry *dentry,
struct vfsmount *mnt,
unsigned long timeout,
- int how)
+ unsigned int how)
{
- int do_now = how & AUTOFS_EXP_IMMEDIATE;
- int exp_leaves = how & AUTOFS_EXP_LEAVES;
struct autofs_info *ino = autofs_dentry_ino(dentry);
unsigned int ino_count;
@@ -367,22 +382,33 @@ static struct dentry *should_expire(struct dentry *dentry,
pr_debug("checking mountpoint %p %pd\n", dentry, dentry);
/* Can we umount this guy */
- if (autofs_mount_busy(mnt, dentry))
+ if (autofs_mount_busy(mnt, dentry, how))
return NULL;
+ /* This isn't a submount so if a forced expire
+ * has been requested, user space handles busy
+ * mounts */
+ if (how & AUTOFS_EXP_FORCED)
+ return dentry;
+
/* Can we expire this guy */
- if (autofs_can_expire(dentry, timeout, do_now))
+ if (autofs_can_expire(dentry, timeout, how))
return dentry;
return NULL;
}
if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
pr_debug("checking symlink %p %pd\n", dentry, dentry);
+
+ /* Forced expire, user space handles busy mounts */
+ if (how & AUTOFS_EXP_FORCED)
+ return dentry;
+
/*
* A symlink can't be "busy" in the usual sense so
* just check last used for expire timeout.
*/
- if (autofs_can_expire(dentry, timeout, do_now))
+ if (autofs_can_expire(dentry, timeout, how))
return dentry;
return NULL;
}
@@ -391,27 +417,33 @@ static struct dentry *should_expire(struct dentry *dentry,
return NULL;
/* Case 2: tree mount, expire iff entire tree is not busy */
- if (!exp_leaves) {
- /* Path walk currently on this dentry? */
- ino_count = atomic_read(&ino->count) + 1;
- if (d_count(dentry) > ino_count)
- return NULL;
+ if (!(how & AUTOFS_EXP_LEAVES)) {
+ /* Not a forced expire? */
+ if (!(how & AUTOFS_EXP_FORCED)) {
+ /* ref-walk currently on this dentry? */
+ ino_count = atomic_read(&ino->count) + 1;
+ if (d_count(dentry) > ino_count)
+ return NULL;
+ }
- if (!autofs_tree_busy(mnt, dentry, timeout, do_now))
+ if (!autofs_tree_busy(mnt, dentry, timeout, how))
return dentry;
/*
* Case 3: pseudo direct mount, expire individual leaves
* (autofs-4.1).
*/
} else {
- /* Path walk currently on this dentry? */
struct dentry *expired;
- ino_count = atomic_read(&ino->count) + 1;
- if (d_count(dentry) > ino_count)
- return NULL;
+ /* Not a forced expire? */
+ if (!(how & AUTOFS_EXP_FORCED)) {
+ /* ref-walk currently on this dentry? */
+ ino_count = atomic_read(&ino->count) + 1;
+ if (d_count(dentry) > ino_count)
+ return NULL;
+ }
- expired = autofs_check_leaves(mnt, dentry, timeout, do_now);
+ expired = autofs_check_leaves(mnt, dentry, timeout, how);
if (expired) {
if (expired == dentry)
dput(dentry);
@@ -427,10 +459,10 @@ static struct dentry *should_expire(struct dentry *dentry,
* - it is unused by any user process
* - it has been unused for exp_timeout time
*/
-struct dentry *autofs_expire_indirect(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi,
- int how)
+static struct dentry *autofs_expire_indirect(struct super_block *sb,
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi,
+ unsigned int how)
{
unsigned long timeout;
struct dentry *root = sb->s_root;
@@ -442,13 +474,10 @@ struct dentry *autofs_expire_indirect(struct super_block *sb,
if (!root)
return NULL;
- now = jiffies;
timeout = sbi->exp_timeout;
dentry = NULL;
while ((dentry = get_next_positive_subdir(dentry, root))) {
- int flags = how;
-
spin_lock(&sbi->fs_lock);
ino = autofs_dentry_ino(dentry);
if (ino->flags & AUTOFS_INF_WANT_EXPIRE) {
@@ -457,7 +486,7 @@ struct dentry *autofs_expire_indirect(struct super_block *sb,
}
spin_unlock(&sbi->fs_lock);
- expired = should_expire(dentry, mnt, timeout, flags);
+ expired = should_expire(dentry, mnt, timeout, how);
if (!expired)
continue;
@@ -470,7 +499,7 @@ struct dentry *autofs_expire_indirect(struct super_block *sb,
/* Make sure a reference is not taken on found if
* things have changed.
*/
- flags &= ~AUTOFS_EXP_LEAVES;
+ how &= ~AUTOFS_EXP_LEAVES;
found = should_expire(expired, mnt, timeout, how);
if (!found || found != expired)
/* Something has changed, continue */
@@ -575,7 +604,7 @@ int autofs_expire_run(struct super_block *sb,
spin_lock(&sbi->fs_lock);
ino = autofs_dentry_ino(dentry);
/* avoid rapid-fire expire attempts if expiry fails */
- ino->last_used = now;
+ ino->last_used = jiffies;
ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE);
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
@@ -584,15 +613,15 @@ int autofs_expire_run(struct super_block *sb,
}
int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
- struct autofs_sb_info *sbi, int when)
+ struct autofs_sb_info *sbi, unsigned int how)
{
struct dentry *dentry;
int ret = -EAGAIN;
if (autofs_type_trigger(sbi->type))
- dentry = autofs_expire_direct(sb, mnt, sbi, when);
+ dentry = autofs_expire_direct(sb, mnt, sbi, how);
else
- dentry = autofs_expire_indirect(sb, mnt, sbi, when);
+ dentry = autofs_expire_indirect(sb, mnt, sbi, how);
if (dentry) {
struct autofs_info *ino = autofs_dentry_ino(dentry);
@@ -605,7 +634,7 @@ int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
spin_lock(&sbi->fs_lock);
/* avoid rapid-fire expire attempts if expiry fails */
- ino->last_used = now;
+ ino->last_used = jiffies;
ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE);
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
@@ -622,10 +651,10 @@ int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
int autofs_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_sb_info *sbi, int __user *arg)
{
- int do_now = 0;
+ unsigned int how = 0;
- if (arg && get_user(do_now, arg))
+ if (arg && get_user(how, arg))
return -EFAULT;
- return autofs_do_expire_multi(sb, mnt, sbi, do_now);
+ return autofs_do_expire_multi(sb, mnt, sbi, how);
}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index a3d414150578..782e57b911ab 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -559,6 +559,13 @@ static int autofs_dir_symlink(struct inode *dir,
if (!autofs_oz_mode(sbi))
return -EACCES;
+ /* autofs_oz_mode() needs to allow path walks when the
+ * autofs mount is catatonic but the state of an autofs
+ * file system needs to be preserved over restarts.
+ */
+ if (sbi->catatonic)
+ return -EACCES;
+
BUG_ON(!ino);
autofs_clean_ino(ino);
@@ -612,9 +619,15 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
struct autofs_info *ino = autofs_dentry_ino(dentry);
struct autofs_info *p_ino;
- /* This allows root to remove symlinks */
- if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if (!autofs_oz_mode(sbi))
+ return -EACCES;
+
+ /* autofs_oz_mode() needs to allow path walks when the
+ * autofs mount is catatonic but the state of an autofs
+ * file system needs to be preserved over restarts.
+ */
+ if (sbi->catatonic)
+ return -EACCES;
if (atomic_dec_and_test(&ino->count)) {
p_ino = autofs_dentry_ino(dentry->d_parent);
@@ -697,6 +710,13 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
if (!autofs_oz_mode(sbi))
return -EACCES;
+ /* autofs_oz_mode() needs to allow path walks when the
+ * autofs mount is catatonic but the state of an autofs
+ * file system needs to be preserved over restarts.
+ */
+ if (sbi->catatonic)
+ return -EACCES;
+
spin_lock(&sbi->lookup_lock);
if (!simple_empty(dentry)) {
spin_unlock(&sbi->lookup_lock);
@@ -735,6 +755,13 @@ static int autofs_dir_mkdir(struct inode *dir,
if (!autofs_oz_mode(sbi))
return -EACCES;
+ /* autofs_oz_mode() needs to allow path walks when the
+ * autofs mount is catatonic but the state of an autofs
+ * file system needs to be preserved over restarts.
+ */
+ if (sbi->catatonic)
+ return -EACCES;
+
pr_debug("dentry %p, creating %pd\n", dentry, dentry);
BUG_ON(!ino);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 6e76e4e762e8..f7ef2913bd9d 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -351,7 +351,8 @@ static int bfs_fill_super(struct super_block *s, void *data, size_t data_size,
s->s_magic = BFS_MAGIC;
- if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
+ if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) ||
+ le32_to_cpu(bfs_sb->s_start) < BFS_BSIZE) {
printf("Superblock is corrupted\n");
goto out1;
}
@@ -360,9 +361,11 @@ static int bfs_fill_super(struct super_block *s, void *data, size_t data_size,
sizeof(struct bfs_inode)
+ BFS_ROOT_INO - 1;
imap_len = (info->si_lasti / 8) + 1;
- info->si_imap = kzalloc(imap_len, GFP_KERNEL);
- if (!info->si_imap)
+ info->si_imap = kzalloc(imap_len, GFP_KERNEL | __GFP_NOWARN);
+ if (!info->si_imap) {
+ printf("Cannot allocate %u bytes\n", imap_len);
goto out1;
+ }
for (i = 0; i < BFS_ROOT_INO; i++)
set_bit(i, info->si_imap);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9025a2ba4f68..e1d479e0d0fc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3102,7 +3102,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
for (index = 0; index < nr_pages; index++) {
__do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
- bio, 0, bio_flags, 0, prev_em_start);
+ bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
}
diff --git a/fs/buffer.c b/fs/buffer.c
index b413023b64b6..ebf78abb7770 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -45,6 +45,7 @@
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
#include <linux/pagevec.h>
+#include <linux/sched/mm.h>
#include <trace/events/block.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
@@ -813,12 +814,16 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
bool retry)
{
struct buffer_head *bh, *head;
- gfp_t gfp = GFP_NOFS;
+ gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
long offset;
+ struct mem_cgroup *memcg;
if (retry)
gfp |= __GFP_NOFAIL;
+ memcg = get_mem_cgroup_from_page(page);
+ memalloc_use_memcg(memcg);
+
head = NULL;
offset = PAGE_SIZE;
while ((offset -= size) >= 0) {
@@ -835,6 +840,9 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
/* Link the buffer to its page */
set_bh_page(bh, page, offset);
}
+out:
+ memalloc_unuse_memcg();
+ mem_cgroup_put(memcg);
return head;
/*
* In case anything failed, we just free everything we got.
@@ -848,7 +856,7 @@ no_grow:
} while (head);
}
- return NULL;
+ goto out;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);
@@ -947,10 +955,20 @@ grow_dev_page(struct block_device *bdev, sector_t block,
end_block = init_page_buffers(page, bdev,
(sector_t)index << sizebits,
size);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x01;
+#endif
goto done;
}
- if (!try_to_free_buffers(page))
+ if (!try_to_free_buffers(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x02;
+#endif
goto failed;
+ }
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x04;
+#endif
}
/*
@@ -970,6 +988,9 @@ grow_dev_page(struct block_device *bdev, sector_t block,
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x08;
+#endif
failed:
unlock_page(page);
put_page(page);
@@ -1025,6 +1046,12 @@ __getblk_slow(struct block_device *bdev, sector_t block,
return NULL;
}
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_stamp = jiffies;
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+#endif
for (;;) {
struct buffer_head *bh;
int ret;
@@ -1036,6 +1063,18 @@ __getblk_slow(struct block_device *bdev, sector_t block,
ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
+
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ if (!time_after(jiffies, current->getblk_stamp + 3 * HZ))
+ continue;
+ printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx\n",
+ current->comm, current->pid, current->getblk_executed,
+ current->getblk_bh_count, current->getblk_bh_state);
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+ current->getblk_stamp = jiffies;
+#endif
}
}
@@ -3208,6 +3247,11 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*/
static inline int buffer_busy(struct buffer_head *bh)
{
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x80;
+ current->getblk_bh_count = atomic_read(&bh->b_count);
+ current->getblk_bh_state = bh->b_state;
+#endif
return atomic_read(&bh->b_count) |
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
@@ -3246,11 +3290,18 @@ int try_to_free_buffers(struct page *page)
int ret = 0;
BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
+ if (PageWriteback(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x10;
+#endif
return 0;
+ }
if (mapping == NULL) { /* can this still happen? */
ret = drop_buffers(page, &buffers_to_free);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x20;
+#endif
goto out;
}
@@ -3274,6 +3325,9 @@ int try_to_free_buffers(struct page *page)
if (ret)
cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x40;
+#endif
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
diff --git a/fs/dcache.c b/fs/dcache.c
index 8d2ec4898c2b..2e7e8d85e9b4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -292,7 +292,8 @@ void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry
spin_unlock(&dentry->d_lock);
name->name = p->name;
} else {
- memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN);
+ memcpy(name->inline_name, dentry->d_iname,
+ dentry->d_name.len + 1);
spin_unlock(&dentry->d_lock);
name->name = name->inline_name;
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 67db22fe99c5..42bbe6824b4b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -50,10 +50,10 @@
*
* 1) epmutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->lock (spinlock)
+ * 3) ep->wq.lock (spinlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->lock) because we manipulate objects
+ * We need a spinlock (ep->wq.lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -85,7 +85,7 @@
* of epoll file descriptors, we use the current recursion depth as
* the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->lock") to have it working,
+ * mutex "epmutex" (together with "ep->wq.lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
* Events that require holding "epmutex" are very rare, while for
* normal operations the epoll private "ep->mtx" will guarantee
@@ -182,11 +182,10 @@ struct epitem {
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
+ *
+ * Access to it is protected by the lock inside wq.
*/
struct eventpoll {
- /* Protect the access to this structure */
- spinlock_t lock;
-
/*
* This mutex is used to ensure that files are not removed
* while epoll is using them. This is held during the event
@@ -210,7 +209,7 @@ struct eventpoll {
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
- * holding ->lock.
+ * holding ->wq.lock.
*/
struct epitem *ovflist;
@@ -337,9 +336,9 @@ static inline int ep_cmp_ffd(struct epoll_filefd *p1,
}
/* Tells us if the item is currently linked */
-static inline int ep_is_linked(struct list_head *p)
+static inline int ep_is_linked(struct epitem *epi)
{
- return !list_empty(p);
+ return !list_empty(&epi->rdllink);
}
static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
@@ -392,7 +391,6 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
return ep_events_available(ep) || busy_loop_timeout(start_time);
}
-#endif /* CONFIG_NET_RX_BUSY_POLL */
/*
* Busy poll if globally on and supporting sockets found && no events,
@@ -402,20 +400,16 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
*/
static void ep_busy_loop(struct eventpoll *ep, int nonblock)
{
-#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int napi_id = READ_ONCE(ep->napi_id);
if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
-#endif
}
static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
{
-#ifdef CONFIG_NET_RX_BUSY_POLL
if (ep->napi_id)
ep->napi_id = 0;
-#endif
}
/*
@@ -423,7 +417,6 @@ static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
*/
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
-#ifdef CONFIG_NET_RX_BUSY_POLL
struct eventpoll *ep;
unsigned int napi_id;
struct socket *sock;
@@ -453,9 +446,24 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
/* record NAPI ID for use in next busy poll */
ep->napi_id = napi_id;
-#endif
}
+#else
+
+static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
+{
+}
+
+static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+{
+}
+
+static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
+{
+}
+
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
/**
* ep_call_nested - Perform a bound (possibly) nested call, by checking
* that the recursion limit is not exceeded, and that
@@ -668,10 +676,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
{
__poll_t res;
int pwake = 0;
- unsigned long flags;
struct epitem *epi, *nepi;
LIST_HEAD(txlist);
+ lockdep_assert_irqs_enabled();
+
/*
* We need to lock this because we could be hit by
* eventpoll_release_file() and epoll_ctl().
@@ -688,17 +697,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irq(&ep->wq.lock);
list_splice_init(&ep->rdllist, &txlist);
ep->ovflist = NULL;
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irq(&ep->wq.lock);
/*
* Now call the callback function.
*/
res = (*sproc)(ep, &txlist, priv);
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irq(&ep->wq.lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
@@ -712,7 +721,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* queued into ->ovflist but the "txlist" might already
* contain them, and the list_splice() below takes care of them.
*/
- if (!ep_is_linked(&epi->rdllink)) {
+ if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
@@ -740,7 +749,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irq(&ep->wq.lock);
if (!ep_locked)
mutex_unlock(&ep->mtx);
@@ -764,16 +773,12 @@ static void epi_rcu_free(struct rcu_head *head)
*/
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
- unsigned long flags;
struct file *file = epi->ffd.file;
+ lockdep_assert_irqs_enabled();
+
/*
- * Removes poll wait queue hooks. We _have_ to do this without holding
- * the "ep->lock" otherwise a deadlock might occur. This because of the
- * sequence of the lock acquisition. Here we do "ep->lock" then the wait
- * queue head lock when unregistering the wait queue. The wakeup callback
- * will run by holding the wait queue head lock and will call our callback
- * that will try to get "ep->lock".
+ * Removes poll wait queue hooks.
*/
ep_unregister_pollwait(ep, epi);
@@ -784,10 +789,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
rb_erase_cached(&epi->rbn, &ep->rbr);
- spin_lock_irqsave(&ep->lock, flags);
- if (ep_is_linked(&epi->rdllink))
+ spin_lock_irq(&ep->wq.lock);
+ if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irq(&ep->wq.lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -837,7 +842,7 @@ static void ep_free(struct eventpoll *ep)
* Walks through the whole tree by freeing each "struct epitem". At this
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
- * us during this operation. So we can avoid the lock on "ep->lock".
+ * us during this operation. So we can avoid the lock on "ep->wq.lock".
* We do not need to lock ep->mtx, either, we only do it to prevent
* a lockdep warning.
*/
@@ -1017,7 +1022,6 @@ static int ep_alloc(struct eventpoll **pep)
if (unlikely(!ep))
goto free_uid;
- spin_lock_init(&ep->lock);
mutex_init(&ep->mtx);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
@@ -1122,7 +1126,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
__poll_t pollflags = key_to_poll(key);
int ewake = 0;
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->wq.lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1167,7 +1171,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
}
/* If this file is already in the ready list we exit soon */
- if (!ep_is_linked(&epi->rdllink)) {
+ if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake_rcu(epi);
}
@@ -1199,7 +1203,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
pwake++;
out_unlock:
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->wq.lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1417,11 +1421,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
{
int error, pwake = 0;
__poll_t revents;
- unsigned long flags;
long user_watches;
struct epitem *epi;
struct ep_pqueue epq;
+ lockdep_assert_irqs_enabled();
+
user_watches = atomic_long_read(&ep->user->epoll_watches);
if (unlikely(user_watches >= max_user_watches))
return -ENOSPC;
@@ -1484,13 +1489,13 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
goto error_remove_epi;
/* We have to drop the new item inside our item list to keep track of it */
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irq(&ep->wq.lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
/* If the file is already "ready" we drop it inside the ready list */
- if (revents && !ep_is_linked(&epi->rdllink)) {
+ if (revents && !ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
@@ -1501,7 +1506,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
pwake++;
}
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irq(&ep->wq.lock);
atomic_long_inc(&ep->user->epoll_watches);
@@ -1527,10 +1532,10 @@ error_unregister:
* list, since that is used/cleaned only inside a section bound by "mtx".
* And ep_insert() is called with "mtx" held.
*/
- spin_lock_irqsave(&ep->lock, flags);
- if (ep_is_linked(&epi->rdllink))
+ spin_lock_irq(&ep->wq.lock);
+ if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irq(&ep->wq.lock);
wakeup_source_unregister(ep_wakeup_source(epi));
@@ -1550,6 +1555,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
int pwake = 0;
poll_table pt;
+ lockdep_assert_irqs_enabled();
+
init_poll_funcptr(&pt, NULL);
/*
@@ -1572,9 +1579,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* 1) Flush epi changes above to other CPUs. This ensures
* we do not miss events from ep_poll_callback if an
* event occurs immediately after we call f_op->poll().
- * We need this because we did not take ep->lock while
+ * We need this because we did not take ep->wq.lock while
* changing epi above (but ep_poll_callback does take
- * ep->lock).
+ * ep->wq.lock).
*
* 2) We also need to ensure we do not miss _past_ events
* when calling f_op->poll(). This barrier also
@@ -1593,8 +1600,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- spin_lock_irq(&ep->lock);
- if (!ep_is_linked(&epi->rdllink)) {
+ spin_lock_irq(&ep->wq.lock);
+ if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
@@ -1604,7 +1611,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->wq.lock);
}
/* We have to call this outside the lock */
@@ -1739,11 +1746,12 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
int res = 0, eavail, timed_out = 0;
- unsigned long flags;
u64 slack = 0;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
+ lockdep_assert_irqs_enabled();
+
if (timeout > 0) {
struct timespec64 end_time = ep_set_mstimeout(timeout);
@@ -1756,7 +1764,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* caller specified a non blocking operation.
*/
timed_out = 1;
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irq(&ep->wq.lock);
goto check_events;
}
@@ -1765,7 +1773,7 @@ fetch_events:
if (!ep_events_available(ep))
ep_busy_loop(ep, timed_out);
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irq(&ep->wq.lock);
if (!ep_events_available(ep)) {
/*
@@ -1807,11 +1815,11 @@ fetch_events:
break;
}
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irq(&ep->wq.lock);
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
timed_out = 1;
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irq(&ep->wq.lock);
}
__remove_wait_queue(&ep->wq, &wait);
@@ -1821,7 +1829,7 @@ check_events:
/* Is it worth to try to dig for events ? */
eavail = ep_events_available(ep);
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irq(&ep->wq.lock);
/*
* Try to transfer events to user space. In case we get 0 events and
diff --git a/fs/exec.c b/fs/exec.c
index 1ebf6e5a521d..a756fb674c9a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1338,6 +1338,7 @@ void setup_new_exec(struct linux_binprm * bprm)
if (bprm->secureexec) {
/* Make sure parent cannot signal privileged process. */
current->pdeath_signal = 0;
+ current->signal->pdeath_signal_proc = 0;
/*
* For secureexec, reset the stack limit to sane default to
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 047c327a6b23..28b2609f25c1 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -126,7 +126,6 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &ext2_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
return 0;
}
#else
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1fc013f3d944..0f0edd1cd0cd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3062,7 +3062,7 @@ static inline void ext4_set_de_type(struct super_block *sb,
/* readpages.c */
extern int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
- unsigned nr_pages);
+ unsigned nr_pages, bool is_readahead);
/* symlink.c */
extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7f8023340eb8..69d65d49837b 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -374,7 +374,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ vma->vm_flags |= VM_HUGEPAGE;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 15a8c5c88dc8..82adee7533bd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3325,7 +3325,8 @@ static int ext4_readpage(struct file *file, struct page *page)
ret = ext4_readpage_inline(inode, page);
if (ret == -EAGAIN)
- return ext4_mpage_readpages(page->mapping, NULL, page, 1);
+ return ext4_mpage_readpages(page->mapping, NULL, page, 1,
+ false);
return ret;
}
@@ -3340,7 +3341,7 @@ ext4_readpages(struct file *file, struct address_space *mapping,
if (ext4_has_inline_data(inode))
return 0;
- return ext4_mpage_readpages(mapping, pages, NULL, nr_pages);
+ return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true);
}
static void ext4_invalidatepage(struct page *page, unsigned int offset,
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 19b87a8de6ff..f461d75ac049 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -98,7 +98,7 @@ static void mpage_end_io(struct bio *bio)
int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
- unsigned nr_pages)
+ unsigned nr_pages, bool is_readahead)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
@@ -259,7 +259,8 @@ int ext4_mpage_readpages(struct address_space *mapping,
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
bio->bi_private = ctx;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio_set_op_attrs(bio, REQ_OP_READ,
+ is_readahead ? REQ_RAHEAD : 0);
}
length = first_hole << blkbits;
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index e9bed49df6b7..78d501c1fb65 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -225,7 +225,8 @@ static inline void cache_init(struct fat_cache_id *cid, int fclus, int dclus)
int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
{
struct super_block *sb = inode->i_sb;
- const int limit = sb->s_maxbytes >> MSDOS_SB(sb)->cluster_bits;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ const int limit = sb->s_maxbytes >> sbi->cluster_bits;
struct fat_entry fatent;
struct fat_cache_id cid;
int nr;
@@ -234,6 +235,12 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
*fclus = 0;
*dclus = MSDOS_I(inode)->i_start;
+ if (!fat_valid_entry(sbi, *dclus)) {
+ fat_fs_error_ratelimit(sb,
+ "%s: invalid start cluster (i_pos %lld, start %08x)",
+ __func__, MSDOS_I(inode)->i_pos, *dclus);
+ return -EIO;
+ }
if (cluster == 0)
return 0;
@@ -250,9 +257,8 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
/* prevent the infinite loop of cluster chain */
if (*fclus > limit) {
fat_fs_error_ratelimit(sb,
- "%s: detected the cluster chain loop"
- " (i_pos %lld)", __func__,
- MSDOS_I(inode)->i_pos);
+ "%s: detected the cluster chain loop (i_pos %lld)",
+ __func__, MSDOS_I(inode)->i_pos);
nr = -EIO;
goto out;
}
@@ -262,9 +268,8 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
goto out;
else if (nr == FAT_ENT_FREE) {
fat_fs_error_ratelimit(sb,
- "%s: invalid cluster chain (i_pos %lld)",
- __func__,
- MSDOS_I(inode)->i_pos);
+ "%s: invalid cluster chain (i_pos %lld)",
+ __func__, MSDOS_I(inode)->i_pos);
nr = -EIO;
goto out;
} else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 8e100c3bf72c..7f5f3699fc6c 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1130,7 +1130,7 @@ error:
return err;
}
-int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
+int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts)
{
struct super_block *sb = dir->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 8fc1093da47d..9d7d2d5da28b 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -304,7 +304,7 @@ extern int fat_scan_logstart(struct inode *dir, int i_logstart,
struct fat_slot_info *sinfo);
extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
struct msdos_dir_entry **de);
-extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
+extern int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts);
extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
struct fat_slot_info *sinfo);
extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo);
@@ -348,6 +348,11 @@ static inline void fatent_brelse(struct fat_entry *fatent)
fatent->fat_inode = NULL;
}
+static inline bool fat_valid_entry(struct msdos_sb_info *sbi, int entry)
+{
+ return FAT_START_ENT <= entry && entry < sbi->max_cluster;
+}
+
extern void fat_ent_access_init(struct super_block *sb);
extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent,
int entry);
@@ -357,6 +362,7 @@ extern int fat_alloc_clusters(struct inode *inode, int *cluster,
int nr_cluster);
extern int fat_free_clusters(struct inode *inode, int cluster);
extern int fat_count_free_clusters(struct super_block *sb);
+extern int fat_trim_fs(struct inode *inode, struct fstrim_range *range);
/* fat/file.c */
extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
@@ -406,9 +412,9 @@ void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...);
} while (0)
extern int fat_clusters_flush(struct super_block *sb);
extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
-extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
+extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 __time, __le16 __date, u8 time_cs);
-extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
+extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 *time, __le16 *date, u8 *time_cs);
extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index bac10de678cc..defc2168de91 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -4,6 +4,7 @@
*/
#include <linux/blkdev.h>
+#include <linux/sched/signal.h>
#include "fat.h"
struct fatent_operations {
@@ -23,7 +24,7 @@ static void fat12_ent_blocknr(struct super_block *sb, int entry,
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
int bytes = entry + (entry >> 1);
- WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry);
+ WARN_ON(!fat_valid_entry(sbi, entry));
*offset = bytes & (sb->s_blocksize - 1);
*blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits);
}
@@ -33,7 +34,7 @@ static void fat_ent_blocknr(struct super_block *sb, int entry,
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
int bytes = (entry << sbi->fatent_shift);
- WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry);
+ WARN_ON(!fat_valid_entry(sbi, entry));
*offset = bytes & (sb->s_blocksize - 1);
*blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits);
}
@@ -353,7 +354,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
int err, offset;
sector_t blocknr;
- if (entry < FAT_START_ENT || sbi->max_cluster <= entry) {
+ if (!fat_valid_entry(sbi, entry)) {
fatent_brelse(fatent);
fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry);
return -EIO;
@@ -690,3 +691,104 @@ out:
unlock_fat(sbi);
return err;
}
+
+static int fat_trim_clusters(struct super_block *sb, u32 clus, u32 nr_clus)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ return sb_issue_discard(sb, fat_clus_to_blknr(sbi, clus),
+ nr_clus * sbi->sec_per_clus, GFP_NOFS, 0);
+}
+
+int fat_trim_fs(struct inode *inode, struct fstrim_range *range)
+{
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ const struct fatent_operations *ops = sbi->fatent_ops;
+ struct fat_entry fatent;
+ u64 ent_start, ent_end, minlen, trimmed = 0;
+ u32 free = 0;
+ unsigned long reada_blocks, reada_mask, cur_block = 0;
+ int err = 0;
+
+ /*
+ * FAT data is organized as clusters, trim at the granulary of cluster.
+ *
+ * fstrim_range is in byte, convert vaules to cluster index.
+ * Treat sectors before data region as all used, not to trim them.
+ */
+ ent_start = max_t(u64, range->start>>sbi->cluster_bits, FAT_START_ENT);
+ ent_end = ent_start + (range->len >> sbi->cluster_bits) - 1;
+ minlen = range->minlen >> sbi->cluster_bits;
+
+ if (ent_start >= sbi->max_cluster || range->len < sbi->cluster_size)
+ return -EINVAL;
+ if (ent_end >= sbi->max_cluster)
+ ent_end = sbi->max_cluster - 1;
+
+ reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits;
+ reada_mask = reada_blocks - 1;
+
+ fatent_init(&fatent);
+ lock_fat(sbi);
+ fatent_set_entry(&fatent, ent_start);
+ while (fatent.entry <= ent_end) {
+ /* readahead of fat blocks */
+ if ((cur_block & reada_mask) == 0) {
+ unsigned long rest = sbi->fat_length - cur_block;
+ fat_ent_reada(sb, &fatent, min(reada_blocks, rest));
+ }
+ cur_block++;
+
+ err = fat_ent_read_block(sb, &fatent);
+ if (err)
+ goto error;
+ do {
+ if (ops->ent_get(&fatent) == FAT_ENT_FREE) {
+ free++;
+ } else if (free) {
+ if (free >= minlen) {
+ u32 clus = fatent.entry - free;
+
+ err = fat_trim_clusters(sb, clus, free);
+ if (err && err != -EOPNOTSUPP)
+ goto error;
+ if (!err)
+ trimmed += free;
+ err = 0;
+ }
+ free = 0;
+ }
+ } while (fat_ent_next(sbi, &fatent) && fatent.entry <= ent_end);
+
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ goto error;
+ }
+
+ if (need_resched()) {
+ fatent_brelse(&fatent);
+ unlock_fat(sbi);
+ cond_resched();
+ lock_fat(sbi);
+ }
+ }
+ /* handle scenario when tail entries are all free */
+ if (free && free >= minlen) {
+ u32 clus = fatent.entry - free;
+
+ err = fat_trim_clusters(sb, clus, free);
+ if (err && err != -EOPNOTSUPP)
+ goto error;
+ if (!err)
+ trimmed += free;
+ err = 0;
+ }
+
+error:
+ fatent_brelse(&fatent);
+ unlock_fat(sbi);
+
+ range->len = trimmed << sbi->cluster_bits;
+
+ return err;
+}
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 4724cc9ad650..4f3d72fb1e60 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -121,6 +121,37 @@ static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr)
return put_user(sbi->vol_id, user_attr);
}
+static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg)
+{
+ struct super_block *sb = inode->i_sb;
+ struct fstrim_range __user *user_range;
+ struct fstrim_range range;
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ user_range = (struct fstrim_range __user *)arg;
+ if (copy_from_user(&range, user_range, sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max_t(unsigned int, range.minlen,
+ q->limits.discard_granularity);
+
+ err = fat_trim_fs(inode, &range);
+ if (err < 0)
+ return err;
+
+ if (copy_to_user(user_range, &range, sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+}
+
long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -133,6 +164,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return fat_ioctl_set_attributes(filp, user_attr);
case FAT_IOCTL_GET_VOLUME_ID:
return fat_ioctl_get_volume_id(inode, user_attr);
+ case FITRIM:
+ return fat_ioctl_fitrim(inode, arg);
default:
return -ENOTTY; /* Inappropriate ioctl for device */
}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 725c893105e1..56203cf2bb27 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -508,8 +508,8 @@ static int fat_validate_dir(struct inode *dir)
/* doesn't deal with root inode */
int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
{
- struct timespec ts;
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ struct timespec64 ts;
int error;
MSDOS_I(inode)->i_pos = 0;
@@ -560,13 +560,13 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
& ~((loff_t)sbi->cluster_size - 1)) >> 9;
fat_time_fat2unix(sbi, &ts, de->time, de->date, 0);
- inode->i_mtime = timespec_to_timespec64(ts);
+ inode->i_mtime = ts;
if (sbi->options.isvfat) {
fat_time_fat2unix(sbi, &ts, de->ctime,
de->cdate, de->ctime_cs);
- inode->i_ctime = timespec_to_timespec64(ts);
+ inode->i_ctime = ts;
fat_time_fat2unix(sbi, &ts, 0, de->adate, 0);
- inode->i_atime = timespec_to_timespec64(ts);
+ inode->i_atime = ts;
} else
inode->i_ctime = inode->i_atime = inode->i_mtime;
@@ -844,7 +844,6 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
static int __fat_write_inode(struct inode *inode, int wait)
{
- struct timespec ts;
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
struct buffer_head *bh;
@@ -882,16 +881,13 @@ retry:
raw_entry->size = cpu_to_le32(inode->i_size);
raw_entry->attr = fat_make_attrs(inode);
fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart);
- ts = timespec64_to_timespec(inode->i_mtime);
- fat_time_unix2fat(sbi, &ts, &raw_entry->time,
+ fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
&raw_entry->date, NULL);
if (sbi->options.isvfat) {
__le16 atime;
- ts = timespec64_to_timespec(inode->i_ctime);
- fat_time_unix2fat(sbi, &ts, &raw_entry->ctime,
+ fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime,
&raw_entry->cdate, &raw_entry->ctime_cs);
- ts = timespec64_to_timespec(inode->i_atime);
- fat_time_unix2fat(sbi, &ts, &atime,
+ fat_time_unix2fat(sbi, &inode->i_atime, &atime,
&raw_entry->adate, NULL);
}
spin_unlock(&sbi->inode_hash_lock);
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index f9bdc1e01c98..2206a6821264 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -180,17 +180,19 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
#define IS_LEAP_YEAR(y) (!((y) & 3) && (y) != YEAR_2100)
/* Linear day numbers of the respective 1sts in non-leap years. */
-static time_t days_in_year[] = {
+static time64_t days_in_year[] = {
/* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */
0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
};
/* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */
-void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
+void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 __time, __le16 __date, u8 time_cs)
{
u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date);
- time_t second, day, leap_day, month, year;
+ time64_t second;
+ long day, leap_day, month;
+ long long year;
year = date >> 9;
month = max(1, (date >> 5) & 0xf);
@@ -224,11 +226,11 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
}
/* Convert linear UNIX date to a FAT time/date pair. */
-void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
+void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 *time, __le16 *date, u8 *time_cs)
{
struct tm tm;
- time_to_tm(ts->tv_sec,
+ time64_to_tm(ts->tv_sec,
(sbi->options.tz_set ? sbi->options.time_offset :
-sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 06e1c9d81d65..8c49575b4da8 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -225,7 +225,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
/***** Creates a directory entry (name is already formatted). */
static int msdos_add_entry(struct inode *dir, const unsigned char *name,
int is_dir, int is_hid, int cluster,
- struct timespec *ts, struct fat_slot_info *sinfo)
+ struct timespec64 *ts, struct fat_slot_info *sinfo)
{
struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
struct msdos_dir_entry de;
@@ -250,7 +250,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
if (err)
return err;
- dir->i_ctime = dir->i_mtime = timespec_to_timespec64(*ts);
+ dir->i_ctime = dir->i_mtime = *ts;
if (IS_DIRSYNC(dir))
(void)fat_sync_inode(dir);
else
@@ -267,7 +267,6 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct inode *inode = NULL;
struct fat_slot_info sinfo;
struct timespec64 ts;
- struct timespec t;
unsigned char msdos_name[MSDOS_NAME];
int err, is_hid;
@@ -286,8 +285,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
}
ts = current_time(dir);
- t = timespec64_to_timespec(ts);
- err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &t, &sinfo);
+ err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo);
if (err)
goto out;
inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
@@ -347,7 +345,6 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct inode *inode;
unsigned char msdos_name[MSDOS_NAME];
struct timespec64 ts;
- struct timespec t;
int err, is_hid, cluster;
mutex_lock(&MSDOS_SB(sb)->s_lock);
@@ -365,13 +362,12 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
}
ts = current_time(dir);
- t = timespec64_to_timespec(ts);
- cluster = fat_alloc_new_dir(dir, &t);
+ cluster = fat_alloc_new_dir(dir, &ts);
if (cluster < 0) {
err = cluster;
goto out;
}
- err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &t, &sinfo);
+ err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo);
if (err)
goto out_free;
inc_nlink(dir);
@@ -503,9 +499,8 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
new_i_pos = MSDOS_I(new_inode)->i_pos;
fat_detach(new_inode);
} else {
- struct timespec t = timespec64_to_timespec(ts);
err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0,
- &t, &sinfo);
+ &ts, &sinfo);
if (err)
goto out;
new_i_pos = sinfo.i_pos;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 4ee41d511e05..993f186705b1 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -577,7 +577,7 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
static int vfat_build_slots(struct inode *dir, const unsigned char *name,
int len, int is_dir, int cluster,
- struct timespec *ts,
+ struct timespec64 *ts,
struct msdos_dir_slot *slots, int *nr_slots)
{
struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
@@ -653,7 +653,7 @@ out_free:
}
static int vfat_add_entry(struct inode *dir, const struct qstr *qname,
- int is_dir, int cluster, struct timespec *ts,
+ int is_dir, int cluster, struct timespec64 *ts,
struct fat_slot_info *sinfo)
{
struct msdos_dir_slot *slots;
@@ -678,7 +678,7 @@ static int vfat_add_entry(struct inode *dir, const struct qstr *qname,
goto cleanup;
/* update timestamp */
- dir->i_ctime = dir->i_mtime = dir->i_atime = timespec_to_timespec64(*ts);
+ dir->i_ctime = dir->i_mtime = dir->i_atime = *ts;
if (IS_DIRSYNC(dir))
(void)fat_sync_inode(dir);
else
@@ -762,14 +762,12 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct inode *inode;
struct fat_slot_info sinfo;
struct timespec64 ts;
- struct timespec t;
int err;
mutex_lock(&MSDOS_SB(sb)->s_lock);
ts = current_time(dir);
- t = timespec64_to_timespec(ts);
- err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &t, &sinfo);
+ err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
if (err)
goto out;
inode_inc_iversion(dir);
@@ -853,19 +851,17 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct inode *inode;
struct fat_slot_info sinfo;
struct timespec64 ts;
- struct timespec t;
int err, cluster;
mutex_lock(&MSDOS_SB(sb)->s_lock);
ts = current_time(dir);
- t = timespec64_to_timespec(ts);
- cluster = fat_alloc_new_dir(dir, &t);
+ cluster = fat_alloc_new_dir(dir, &ts);
if (cluster < 0) {
err = cluster;
goto out;
}
- err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &t, &sinfo);
+ err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo);
if (err)
goto out_free;
inode_inc_iversion(dir);
@@ -904,7 +900,6 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode, *new_inode;
struct fat_slot_info old_sinfo, sinfo;
struct timespec64 ts;
- struct timespec t;
loff_t new_i_pos;
int err, is_dir, update_dotdot, corrupt = 0;
struct super_block *sb = old_dir->i_sb;
@@ -939,9 +934,8 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
new_i_pos = MSDOS_I(new_inode)->i_pos;
fat_detach(new_inode);
} else {
- t = timespec64_to_timespec(ts);
err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0,
- &t, &sinfo);
+ &ts, &sinfo);
if (err)
goto out;
new_i_pos = sinfo.i_pos;
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index ad04a5741016..9a8772465a90 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -75,9 +75,10 @@ int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len)
if (!fd->bnode) {
if (!tree->root)
hfs_btree_inc_height(tree);
- fd->bnode = hfs_bnode_find(tree, tree->leaf_head);
- if (IS_ERR(fd->bnode))
- return PTR_ERR(fd->bnode);
+ node = hfs_bnode_find(tree, tree->leaf_head);
+ if (IS_ERR(node))
+ return PTR_ERR(node);
+ fd->bnode = node;
fd->record = -1;
}
new_node = NULL;
diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
index 7cc8b4acf66a..a63371815aab 100644
--- a/fs/hfsplus/Kconfig
+++ b/fs/hfsplus/Kconfig
@@ -11,18 +11,3 @@ config HFSPLUS_FS
MacOS 8. It includes all Mac specific filesystem data such as
data forks and creator codes, but it also has several UNIX
style features such as file ownership and permissions.
-
-config HFSPLUS_FS_POSIX_ACL
- bool "HFS+ POSIX Access Control Lists"
- depends on HFSPLUS_FS
- select FS_POSIX_ACL
- help
- POSIX Access Control Lists (ACLs) support permissions for users and
- groups beyond the owner/group/world scheme.
-
- It needs to understand that POSIX ACLs are treated only under
- Linux. POSIX ACLs doesn't mean something under Mac OS X.
- Mac OS X beginning with version 10.4 ("Tiger") support NFSv4 ACLs,
- which are part of the NFSv4 standard.
-
- If you don't know what Access Control Lists are, say N
diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile
index f6a56542f8d7..9ed20e64b983 100644
--- a/fs/hfsplus/Makefile
+++ b/fs/hfsplus/Makefile
@@ -8,5 +8,3 @@ obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o
hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \
bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \
attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o
-
-hfsplus-$(CONFIG_HFSPLUS_FS_POSIX_ACL) += posix_acl.o
diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h
deleted file mode 100644
index 488c2b75cf41..000000000000
--- a/fs/hfsplus/acl.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * linux/fs/hfsplus/acl.h
- *
- * Vyacheslav Dubeyko <slava@dubeyko.com>
- *
- * Handler for Posix Access Control Lists (ACLs) support.
- */
-
-#include <linux/posix_acl_xattr.h>
-
-#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
-
-/* posix_acl.c */
-struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type);
-int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
- int type);
-extern int hfsplus_init_posix_acl(struct inode *, struct inode *);
-
-#else /* CONFIG_HFSPLUS_FS_POSIX_ACL */
-#define hfsplus_get_posix_acl NULL
-#define hfsplus_set_posix_acl NULL
-
-static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
-{
- return 0;
-}
-#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 808f4d8c859c..ed8eacb34452 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -73,9 +73,10 @@ int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len)
if (!fd->bnode) {
if (!tree->root)
hfs_btree_inc_height(tree);
- fd->bnode = hfs_bnode_find(tree, tree->leaf_head);
- if (IS_ERR(fd->bnode))
- return PTR_ERR(fd->bnode);
+ node = hfs_bnode_find(tree, tree->leaf_head);
+ if (IS_ERR(node))
+ return PTR_ERR(node);
+ fd->bnode = node;
fd->record = -1;
}
new_node = NULL;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index b5254378f011..f37662675c3a 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -18,7 +18,6 @@
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
#include "xattr.h"
-#include "acl.h"
static inline void hfsplus_instantiate(struct dentry *dentry,
struct inode *inode, u32 cnid)
@@ -78,13 +77,13 @@ again:
cpu_to_be32(HFSP_HARDLINK_TYPE) &&
entry.file.user_info.fdCreator ==
cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
+ HFSPLUS_SB(sb)->hidden_dir &&
(entry.file.create_date ==
HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->
create_date ||
entry.file.create_date ==
HFSPLUS_I(d_inode(sb->s_root))->
- create_date) &&
- HFSPLUS_SB(sb)->hidden_dir) {
+ create_date)) {
struct qstr str;
char name[32];
@@ -455,7 +454,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
if (res)
goto out_err;
- res = hfsplus_init_inode_security(inode, dir, &dentry->d_name);
+ res = hfsplus_init_security(inode, dir, &dentry->d_name);
if (res == -EOPNOTSUPP)
res = 0; /* Operation is not supported. */
else if (res) {
@@ -496,7 +495,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
if (res)
goto failed_mknod;
- res = hfsplus_init_inode_security(inode, dir, &dentry->d_name);
+ res = hfsplus_init_security(inode, dir, &dentry->d_name);
if (res == -EOPNOTSUPP)
res = 0; /* Operation is not supported. */
else if (res) {
@@ -567,10 +566,6 @@ const struct inode_operations hfsplus_dir_inode_operations = {
.mknod = hfsplus_mknod,
.rename = hfsplus_rename,
.listxattr = hfsplus_listxattr,
-#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
- .get_acl = hfsplus_get_posix_acl,
- .set_acl = hfsplus_set_posix_acl,
-#endif
};
const struct file_operations hfsplus_dir_operations = {
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index e8770935ce6d..8e0f59767694 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -336,6 +336,9 @@ static int hfsplus_free_extents(struct super_block *sb,
int i;
int err = 0;
+ /* Mapping the allocation file may lock the extent tree */
+ WARN_ON(mutex_is_locked(&HFSPLUS_SB(sb)->ext_tree->tree_lock));
+
hfsplus_dump_extent(extent);
for (i = 0; i < 8; extent++, i++) {
count = be32_to_cpu(extent->block_count);
@@ -415,11 +418,13 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid,
if (res)
break;
start = be32_to_cpu(fd.key->ext.start_block);
- hfsplus_free_extents(sb, ext_entry,
- total_blocks - start,
- total_blocks);
hfs_brec_remove(&fd);
+
+ mutex_unlock(&fd.tree->tree_lock);
+ hfsplus_free_extents(sb, ext_entry, total_blocks - start,
+ total_blocks);
total_blocks = start;
+ mutex_lock(&fd.tree->tree_lock);
} while (total_blocks > blocks);
hfs_find_exit(&fd);
@@ -576,15 +581,20 @@ void hfsplus_file_truncate(struct inode *inode)
}
while (1) {
if (alloc_cnt == hip->first_blocks) {
+ mutex_unlock(&fd.tree->tree_lock);
hfsplus_free_extents(sb, hip->first_extents,
alloc_cnt, alloc_cnt - blk_cnt);
hfsplus_dump_extent(hip->first_extents);
hip->first_blocks = blk_cnt;
+ mutex_lock(&fd.tree->tree_lock);
break;
}
res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
if (res)
break;
+ hfs_brec_remove(&fd);
+
+ mutex_unlock(&fd.tree->tree_lock);
start = hip->cached_start;
hfsplus_free_extents(sb, hip->cached_extents,
alloc_cnt - start, alloc_cnt - blk_cnt);
@@ -596,7 +606,7 @@ void hfsplus_file_truncate(struct inode *inode)
alloc_cnt = start;
hip->cached_start = hip->cached_blocks = 0;
hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
- hfs_brec_remove(&fd);
+ mutex_lock(&fd.tree->tree_lock);
}
hfs_find_exit(&fd);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index d9255abafb81..8e039435958a 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -31,7 +31,6 @@
#define DBG_EXTENT 0x00000020
#define DBG_BITMAP 0x00000040
#define DBG_ATTR_MOD 0x00000080
-#define DBG_ACL_MOD 0x00000100
#if 0
#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c824f702feec..8e9427a42b81 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -21,7 +21,6 @@
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
#include "xattr.h"
-#include "acl.h"
static int hfsplus_readpage(struct file *file, struct page *page)
{
@@ -267,12 +266,6 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
setattr_copy(inode, attr);
mark_inode_dirty(inode);
- if (attr->ia_valid & ATTR_MODE) {
- error = posix_acl_chmod(inode, inode->i_mode);
- if (unlikely(error))
- return error;
- }
-
return 0;
}
@@ -336,10 +329,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
static const struct inode_operations hfsplus_file_inode_operations = {
.setattr = hfsplus_setattr,
.listxattr = hfsplus_listxattr,
-#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
- .get_acl = hfsplus_get_posix_acl,
- .set_acl = hfsplus_set_posix_acl,
-#endif
};
static const struct file_operations hfsplus_file_operations = {
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
deleted file mode 100644
index 066114dcc3a2..000000000000
--- a/fs/hfsplus/posix_acl.c
+++ /dev/null
@@ -1,144 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/hfsplus/posix_acl.c
- *
- * Vyacheslav Dubeyko <slava@dubeyko.com>
- *
- * Handler for Posix Access Control Lists (ACLs) support.
- */
-
-#include "hfsplus_fs.h"
-#include "xattr.h"
-#include "acl.h"
-
-struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
-{
- struct posix_acl *acl;
- char *xattr_name;
- char *value = NULL;
- ssize_t size;
-
- hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
- break;
- default:
- return ERR_PTR(-EINVAL);
- }
-
- size = __hfsplus_getxattr(inode, xattr_name, NULL, 0);
-
- if (size > 0) {
- value = (char *)hfsplus_alloc_attr_entry();
- if (unlikely(!value))
- return ERR_PTR(-ENOMEM);
- size = __hfsplus_getxattr(inode, xattr_name, value, size);
- }
-
- if (size > 0)
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- else if (size == -ENODATA)
- acl = NULL;
- else
- acl = ERR_PTR(size);
-
- hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value);
-
- return acl;
-}
-
-static int __hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
- int type)
-{
- int err;
- char *xattr_name;
- size_t size = 0;
- char *value = NULL;
-
- hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
- break;
-
- case ACL_TYPE_DEFAULT:
- xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
- if (!S_ISDIR(inode->i_mode))
- return acl ? -EACCES : 0;
- break;
-
- default:
- return -EINVAL;
- }
-
- if (acl) {
- size = posix_acl_xattr_size(acl->a_count);
- if (unlikely(size > HFSPLUS_MAX_INLINE_DATA_SIZE))
- return -ENOMEM;
- value = (char *)hfsplus_alloc_attr_entry();
- if (unlikely(!value))
- return -ENOMEM;
- err = posix_acl_to_xattr(&init_user_ns, acl, value, size);
- if (unlikely(err < 0))
- goto end_set_acl;
- }
-
- err = __hfsplus_setxattr(inode, xattr_name, value, size, 0);
-
-end_set_acl:
- hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value);
-
- if (!err)
- set_cached_acl(inode, type, acl);
-
- return err;
-}
-
-int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, int type)
-{
- int err;
-
- if (type == ACL_TYPE_ACCESS && acl) {
- err = posix_acl_update_mode(inode, &inode->i_mode, &acl);
- if (err)
- return err;
- }
- return __hfsplus_set_posix_acl(inode, acl, type);
-}
-
-int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
-{
- int err = 0;
- struct posix_acl *default_acl, *acl;
-
- hfs_dbg(ACL_MOD,
- "[%s]: ino %lu, dir->ino %lu\n",
- __func__, inode->i_ino, dir->i_ino);
-
- if (S_ISLNK(inode->i_mode))
- return 0;
-
- err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
- if (err)
- return err;
-
- if (default_acl) {
- err = __hfsplus_set_posix_acl(inode, default_acl,
- ACL_TYPE_DEFAULT);
- posix_acl_release(default_acl);
- }
-
- if (acl) {
- if (!err)
- err = __hfsplus_set_posix_acl(inode, acl,
- ACL_TYPE_ACCESS);
- posix_acl_release(acl);
- }
- return err;
-}
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9c5f19922e4a..e12e051d46b4 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -526,8 +526,10 @@ static int hfsplus_fill_super(struct super_block *sb,
goto out_put_root;
if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
hfs_find_exit(&fd);
- if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
+ if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) {
+ err = -EINVAL;
goto out_put_root;
+ }
inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
@@ -564,8 +566,8 @@ static int hfsplus_fill_super(struct super_block *sb,
goto out_put_hidden_dir;
}
- err = hfsplus_init_inode_security(sbi->hidden_dir,
- root, &str);
+ err = hfsplus_init_security(sbi->hidden_dir,
+ root, &str);
if (err == -EOPNOTSUPP)
err = 0; /* Operation is not supported. */
else if (err) {
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index dfa90c21948f..c8d1b2be7854 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -272,8 +272,8 @@ static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
return size;
}
-/* Decomposes a single unicode character. */
-static inline u16 *decompose_unichar(wchar_t uc, int *size)
+/* Decomposes a non-Hangul unicode character. */
+static u16 *hfsplus_decompose_nonhangul(wchar_t uc, int *size)
{
int off;
@@ -296,6 +296,51 @@ static inline u16 *decompose_unichar(wchar_t uc, int *size)
return hfsplus_decompose_table + (off / 4);
}
+/*
+ * Try to decompose a unicode character as Hangul. Return 0 if @uc is not
+ * precomposed Hangul, otherwise return the length of the decomposition.
+ *
+ * This function was adapted from sample code from the Unicode Standard
+ * Annex #15: Unicode Normalization Forms, version 3.2.0.
+ *
+ * Copyright (C) 1991-2018 Unicode, Inc. All rights reserved. Distributed
+ * under the Terms of Use in http://www.unicode.org/copyright.html.
+ */
+static int hfsplus_try_decompose_hangul(wchar_t uc, u16 *result)
+{
+ int index;
+ int l, v, t;
+
+ index = uc - Hangul_SBase;
+ if (index < 0 || index >= Hangul_SCount)
+ return 0;
+
+ l = Hangul_LBase + index / Hangul_NCount;
+ v = Hangul_VBase + (index % Hangul_NCount) / Hangul_TCount;
+ t = Hangul_TBase + index % Hangul_TCount;
+
+ result[0] = l;
+ result[1] = v;
+ if (t != Hangul_TBase) {
+ result[2] = t;
+ return 3;
+ }
+ return 2;
+}
+
+/* Decomposes a single unicode character. */
+static u16 *decompose_unichar(wchar_t uc, int *size, u16 *hangul_buffer)
+{
+ u16 *result;
+
+ /* Hangul is handled separately */
+ result = hangul_buffer;
+ *size = hfsplus_try_decompose_hangul(uc, result);
+ if (*size == 0)
+ result = hfsplus_decompose_nonhangul(uc, size);
+ return result;
+}
+
int hfsplus_asc2uni(struct super_block *sb,
struct hfsplus_unistr *ustr, int max_unistr_len,
const char *astr, int len)
@@ -303,13 +348,14 @@ int hfsplus_asc2uni(struct super_block *sb,
int size, dsize, decompose;
u16 *dstr, outlen = 0;
wchar_t c;
+ u16 dhangul[3];
decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
while (outlen < max_unistr_len && len > 0) {
size = asc2unichar(sb, astr, len, &c);
if (decompose)
- dstr = decompose_unichar(c, &dsize);
+ dstr = decompose_unichar(c, &dsize, dhangul);
else
dstr = NULL;
if (dstr) {
@@ -344,6 +390,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
unsigned long hash;
wchar_t c;
u16 c2;
+ u16 dhangul[3];
casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
@@ -357,7 +404,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
len -= size;
if (decompose)
- dstr = decompose_unichar(c, &dsize);
+ dstr = decompose_unichar(c, &dsize, dhangul);
else
dstr = NULL;
if (dstr) {
@@ -396,6 +443,7 @@ int hfsplus_compare_dentry(const struct dentry *dentry,
const char *astr1, *astr2;
u16 c1, c2;
wchar_t c;
+ u16 dhangul_1[3], dhangul_2[3];
casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
@@ -413,7 +461,8 @@ int hfsplus_compare_dentry(const struct dentry *dentry,
len1 -= size;
if (decompose)
- dstr1 = decompose_unichar(c, &dsize1);
+ dstr1 = decompose_unichar(c, &dsize1,
+ dhangul_1);
if (!decompose || !dstr1) {
c1 = c;
dstr1 = &c1;
@@ -427,7 +476,8 @@ int hfsplus_compare_dentry(const struct dentry *dentry,
len2 -= size;
if (decompose)
- dstr2 = decompose_unichar(c, &dsize2);
+ dstr2 = decompose_unichar(c, &dsize2,
+ dhangul_2);
if (!decompose || !dstr2) {
c2 = c;
dstr2 = &c2;
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index e538b758c448..d5403b4004c9 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -8,10 +8,8 @@
*/
#include "hfsplus_fs.h"
-#include <linux/posix_acl_xattr.h>
#include <linux/nls.h>
#include "xattr.h"
-#include "acl.h"
static int hfsplus_removexattr(struct inode *inode, const char *name);
@@ -19,10 +17,6 @@ const struct xattr_handler *hfsplus_xattr_handlers[] = {
&hfsplus_xattr_osx_handler,
&hfsplus_xattr_user_handler,
&hfsplus_xattr_trusted_handler,
-#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
-#endif
&hfsplus_xattr_security_handler,
NULL
};
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index a4e611d69710..d14e362b3eba 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -38,7 +38,4 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
int hfsplus_init_security(struct inode *inode, struct inode *dir,
const struct qstr *qstr);
-int hfsplus_init_inode_security(struct inode *inode, struct inode *dir,
- const struct qstr *qstr);
-
#endif
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index f5550b006e88..cfbe6a3bfb1e 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -12,7 +12,6 @@
#include "hfsplus_fs.h"
#include "xattr.h"
-#include "acl.h"
static int hfsplus_security_getxattr(const struct xattr_handler *handler,
struct dentry *unused, struct inode *inode,
@@ -72,18 +71,6 @@ int hfsplus_init_security(struct inode *inode, struct inode *dir,
&hfsplus_initxattrs, NULL);
}
-int hfsplus_init_inode_security(struct inode *inode,
- struct inode *dir,
- const struct qstr *qstr)
-{
- int err;
-
- err = hfsplus_init_posix_acl(inode, dir);
- if (!err)
- err = hfsplus_init_security(inode, dir, qstr);
- return err;
-}
-
const struct xattr_handler hfsplus_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.get = hfsplus_security_getxattr,
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index cb8374af08a6..33b8423ef0c9 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -19,7 +19,7 @@
#define HOSTFS_ATTR_ATIME_SET 128
#define HOSTFS_ATTR_MTIME_SET 256
-/* These two are unused by hostfs. */
+/* This one is unused by hostfs. */
#define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */
#define HOSTFS_ATTR_ATTR_FLAG 1024
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 2a153aed4c19..ab2e7cc2ff33 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -334,16 +334,23 @@ long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg);
* local time (HPFS) to GMT (Unix)
*/
-static inline time_t local_to_gmt(struct super_block *s, time32_t t)
+static inline time64_t local_to_gmt(struct super_block *s, time32_t t)
{
extern struct timezone sys_tz;
return t + sys_tz.tz_minuteswest * 60 + hpfs_sb(s)->sb_timeshift;
}
-static inline time32_t gmt_to_local(struct super_block *s, time_t t)
+static inline time32_t gmt_to_local(struct super_block *s, time64_t t)
{
extern struct timezone sys_tz;
- return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
+ t = t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
+
+ return clamp_t(time64_t, t, 0, U32_MAX);
+}
+
+static inline time32_t local_get_seconds(struct super_block *s)
+{
+ return gmt_to_local(s, ktime_get_real_seconds());
}
/*
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index a3615e4c730d..082b7c76dd0c 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -11,7 +11,7 @@
static void hpfs_update_directory_times(struct inode *dir)
{
- time_t t = get_seconds();
+ time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb));
if (t == dir->i_mtime.tv_sec &&
t == dir->i_ctime.tv_sec)
return;
@@ -50,7 +50,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
/*dee.archive = 0;*/
dee.hidden = name[0] == '.';
dee.fnode = cpu_to_le32(fno);
- dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+ dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
result = new_inode(dir->i_sb);
if (!result)
goto bail2;
@@ -91,7 +91,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
dnode->root_dnode = 1;
dnode->up = cpu_to_le32(fno);
de = hpfs_add_de(dir->i_sb, dnode, "\001\001", 2, 0);
- de->creation_date = de->write_date = de->read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+ de->creation_date = de->write_date = de->read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
if (!(mode & 0222)) de->read_only = 1;
de->first = de->directory = 1;
/*de->hidden = de->system = 0;*/
@@ -151,7 +151,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, b
dee.archive = 1;
dee.hidden = name[0] == '.';
dee.fnode = cpu_to_le32(fno);
- dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+ dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
result = new_inode(dir->i_sb);
if (!result)
@@ -238,7 +238,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de
dee.archive = 1;
dee.hidden = name[0] == '.';
dee.fnode = cpu_to_le32(fno);
- dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+ dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
result = new_inode(dir->i_sb);
if (!result)
@@ -314,7 +314,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
dee.archive = 1;
dee.hidden = name[0] == '.';
dee.fnode = cpu_to_le32(fno);
- dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+ dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
result = new_inode(dir->i_sb);
if (!result)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 99c33a90f86c..d8a7ba625603 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -439,7 +439,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
int i, freed = 0;
bool truncate_op = (lend == LLONG_MAX);
- memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
vma_init(&pseudo_vma, current->mm);
pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pagevec_init(&pvec);
@@ -624,7 +623,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
* allocation routines. If NUMA is configured, use page index
* as input to create an allocation policy.
*/
- memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
vma_init(&pseudo_vma, mm);
pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pseudo_vma.vm_file = file;
diff --git a/fs/mpage.c b/fs/mpage.c
index b73638db9866..c820dc9bebab 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -133,6 +133,17 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
} while (page_bh != head);
}
+struct mpage_readpage_args {
+ struct bio *bio;
+ struct page *page;
+ unsigned int nr_pages;
+ bool is_readahead;
+ sector_t last_block_in_bio;
+ struct buffer_head map_bh;
+ unsigned long first_logical_block;
+ get_block_t *get_block;
+};
+
/*
* This is the worker routine which does all the work of mapping the disk
* blocks and constructs largest possible bios, submits them for IO if the
@@ -142,16 +153,14 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
* represent the validity of its disk mapping and to decide when to do the next
* get_block() call.
*/
-static struct bio *
-do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
- sector_t *last_block_in_bio, struct buffer_head *map_bh,
- unsigned long *first_logical_block, get_block_t get_block,
- gfp_t gfp)
+static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
{
+ struct page *page = args->page;
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
+ struct buffer_head *map_bh = &args->map_bh;
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
@@ -161,14 +170,24 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
struct block_device *bdev = NULL;
int length;
int fully_mapped = 1;
+ int op_flags;
unsigned nblocks;
unsigned relative_block;
+ gfp_t gfp;
+
+ if (args->is_readahead) {
+ op_flags = REQ_RAHEAD;
+ gfp = readahead_gfp_mask(page->mapping);
+ } else {
+ op_flags = 0;
+ gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+ }
if (page_has_buffers(page))
goto confused;
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
- last_block = block_in_file + nr_pages * blocks_per_page;
+ last_block = block_in_file + args->nr_pages * blocks_per_page;
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
@@ -178,9 +197,10 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
* Map blocks using the result from the previous get_blocks call first.
*/
nblocks = map_bh->b_size >> blkbits;
- if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
- block_in_file < (*first_logical_block + nblocks)) {
- unsigned map_offset = block_in_file - *first_logical_block;
+ if (buffer_mapped(map_bh) &&
+ block_in_file > args->first_logical_block &&
+ block_in_file < (args->first_logical_block + nblocks)) {
+ unsigned map_offset = block_in_file - args->first_logical_block;
unsigned last = nblocks - map_offset;
for (relative_block = 0; ; relative_block++) {
@@ -208,9 +228,9 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
if (block_in_file < last_block) {
map_bh->b_size = (last_block-block_in_file) << blkbits;
- if (get_block(inode, block_in_file, map_bh, 0))
+ if (args->get_block(inode, block_in_file, map_bh, 0))
goto confused;
- *first_logical_block = block_in_file;
+ args->first_logical_block = block_in_file;
}
if (!buffer_mapped(map_bh)) {
@@ -273,43 +293,45 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
- if (bio && (*last_block_in_bio != blocks[0] - 1))
- bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
+ if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
+ args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
alloc_new:
- if (bio == NULL) {
+ if (args->bio == NULL) {
if (first_hole == blocks_per_page) {
if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9),
page))
goto out;
}
- bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- min_t(int, nr_pages, BIO_MAX_PAGES), gfp);
- if (bio == NULL)
+ args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
+ min_t(int, args->nr_pages,
+ BIO_MAX_PAGES),
+ gfp);
+ if (args->bio == NULL)
goto confused;
}
length = first_hole << blkbits;
- if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
+ if (bio_add_page(args->bio, page, length, 0) < length) {
+ args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
goto alloc_new;
}
- relative_block = block_in_file - *first_logical_block;
+ relative_block = block_in_file - args->first_logical_block;
nblocks = map_bh->b_size >> blkbits;
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
(first_hole != blocks_per_page))
- bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
+ args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
else
- *last_block_in_bio = blocks[blocks_per_page - 1];
+ args->last_block_in_bio = blocks[blocks_per_page - 1];
out:
- return bio;
+ return args->bio;
confused:
- if (bio)
- bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
+ if (args->bio)
+ args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
if (!PageUptodate(page))
- block_read_full_page(page, get_block);
+ block_read_full_page(page, args->get_block);
else
unlock_page(page);
goto out;
@@ -363,15 +385,12 @@ int
mpage_readpages(struct address_space *mapping, struct list_head *pages,
unsigned nr_pages, get_block_t get_block)
{
- struct bio *bio = NULL;
+ struct mpage_readpage_args args = {
+ .get_block = get_block,
+ .is_readahead = true,
+ };
unsigned page_idx;
- sector_t last_block_in_bio = 0;
- struct buffer_head map_bh;
- unsigned long first_logical_block = 0;
- gfp_t gfp = readahead_gfp_mask(mapping);
- map_bh.b_state = 0;
- map_bh.b_size = 0;
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = lru_to_page(pages);
@@ -379,18 +398,16 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
list_del(&page->lru);
if (!add_to_page_cache_lru(page, mapping,
page->index,
- gfp)) {
- bio = do_mpage_readpage(bio, page,
- nr_pages - page_idx,
- &last_block_in_bio, &map_bh,
- &first_logical_block,
- get_block, gfp);
+ readahead_gfp_mask(mapping))) {
+ args.page = page;
+ args.nr_pages = nr_pages - page_idx;
+ args.bio = do_mpage_readpage(&args);
}
put_page(page);
}
BUG_ON(!list_empty(pages));
- if (bio)
- mpage_bio_submit(REQ_OP_READ, 0, bio);
+ if (args.bio)
+ mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio);
return 0;
}
EXPORT_SYMBOL(mpage_readpages);
@@ -400,18 +417,15 @@ EXPORT_SYMBOL(mpage_readpages);
*/
int mpage_readpage(struct page *page, get_block_t get_block)
{
- struct bio *bio = NULL;
- sector_t last_block_in_bio = 0;
- struct buffer_head map_bh;
- unsigned long first_logical_block = 0;
- gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+ struct mpage_readpage_args args = {
+ .page = page,
+ .nr_pages = 1,
+ .get_block = get_block,
+ };
- map_bh.b_state = 0;
- map_bh.b_size = 0;
- bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
- &map_bh, &first_logical_block, get_block, gfp);
- if (bio)
- mpage_bio_submit(REQ_OP_READ, 0, bio);
+ args.bio = do_mpage_readpage(&args);
+ if (args.bio)
+ mpage_bio_submit(REQ_OP_READ, 0, args.bio);
return 0;
}
EXPORT_SYMBOL(mpage_readpage);
diff --git a/fs/namei.c b/fs/namei.c
index 4cfa5228352e..fb913148d4d1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -887,6 +887,8 @@ static inline void put_link(struct nameidata *nd)
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
+int sysctl_protected_fifos __read_mostly;
+int sysctl_protected_regular __read_mostly;
/**
* may_follow_link - Check symlink following for unsafe situations
@@ -1003,6 +1005,45 @@ static int may_linkat(struct path *link)
return -EPERM;
}
+/**
+ * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
+ * should be allowed, or not, on files that already
+ * exist.
+ * @dir: the sticky parent directory
+ * @inode: the inode of the file to open
+ *
+ * Block an O_CREAT open of a FIFO (or a regular file) when:
+ * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
+ * - the file already exists
+ * - we are in a sticky directory
+ * - we don't own the file
+ * - the owner of the directory doesn't own the file
+ * - the directory is world writable
+ * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
+ * the directory doesn't have to be world writable: being group writable will
+ * be enough.
+ *
+ * Returns 0 if the open is allowed, -ve on error.
+ */
+static int may_create_in_sticky(struct dentry * const dir,
+ struct inode * const inode)
+{
+ if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
+ (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
+ likely(!(dir->d_inode->i_mode & S_ISVTX)) ||
+ uid_eq(inode->i_uid, dir->d_inode->i_uid) ||
+ uid_eq(current_fsuid(), inode->i_uid))
+ return 0;
+
+ if (likely(dir->d_inode->i_mode & 0002) ||
+ (dir->d_inode->i_mode & 0020 &&
+ ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
+ (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
+ return -EACCES;
+ }
+ return 0;
+}
+
static __always_inline
const char *get_link(struct nameidata *nd)
{
@@ -3348,9 +3389,15 @@ finish_open:
if (error)
return error;
audit_inode(nd->name, nd->path.dentry, 0);
- error = -EISDIR;
- if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
- goto out;
+ if (open_flag & O_CREAT) {
+ error = -EISDIR;
+ if (d_is_dir(nd->path.dentry))
+ goto out;
+ error = may_create_in_sticky(dir,
+ d_backing_inode(nd->path.dentry));
+ if (unlikely(error))
+ goto out;
+ }
error = -ENOTDIR;
if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
goto out;
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index c5fa3dee72fc..7da0fac71dc2 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -51,7 +51,7 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
return err;
}
-static int nilfs_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 3a21a1ab141f..3afbade97833 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -835,7 +835,7 @@ static int nilfs_setup_super(struct super_block *sb, int is_mount)
sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
- sbp[0]->s_mtime = cpu_to_le64(get_seconds());
+ sbp[0]->s_mtime = cpu_to_le64(ktime_get_real_seconds());
skip_mount_setup:
sbp[0]->s_state =
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 484f2c3a33bb..58d77dc696eb 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -385,8 +385,9 @@ out_err:
static int __init dnotify_init(void)
{
- dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
- dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC);
+ dnotify_struct_cache = KMEM_CACHE(dnotify_struct,
+ SLAB_PANIC|SLAB_ACCOUNT);
+ dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT);
dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
if (IS_ERR(dnotify_group))
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 6e828cb82e5e..94b52157bf8d 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/wait.h>
#include <linux/audit.h>
+#include <linux/sched/mm.h>
#include "fanotify.h"
@@ -141,8 +142,8 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
struct inode *inode, u32 mask,
const struct path *path)
{
- struct fanotify_event_info *event;
- gfp_t gfp = GFP_KERNEL;
+ struct fanotify_event_info *event = NULL;
+ gfp_t gfp = GFP_KERNEL_ACCOUNT;
/*
* For queues with unlimited length lost events are not expected and
@@ -152,19 +153,22 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
if (group->max_events == UINT_MAX)
gfp |= __GFP_NOFAIL;
+ /* Whoever is interested in the event, pays for the allocation. */
+ memalloc_use_memcg(group->memcg);
+
if (fanotify_is_perm_event(mask)) {
struct fanotify_perm_event_info *pevent;
pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
if (!pevent)
- return NULL;
+ goto out;
event = &pevent->fae;
pevent->response = 0;
goto init;
}
event = kmem_cache_alloc(fanotify_event_cachep, gfp);
if (!event)
- return NULL;
+ goto out;
init: __maybe_unused
fsnotify_init_event(&event->fse, inode, mask);
event->tgid = get_pid(task_tgid(current));
@@ -175,6 +179,8 @@ init: __maybe_unused
event->path.mnt = NULL;
event->path.dentry = NULL;
}
+out:
+ memalloc_unuse_memcg();
return event;
}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index d736a833fe39..69054886915b 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -16,6 +16,7 @@
#include <linux/uaccess.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
+#include <linux/memcontrol.h>
#include <asm/ioctls.h>
@@ -731,6 +732,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->fanotify_data.user = user;
atomic_inc(&user->fanotify_listeners);
+ group->memcg = get_mem_cgroup_from_mm(current->mm);
oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL);
if (unlikely(!oevent)) {
@@ -932,7 +934,8 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
*/
static int __init fanotify_user_setup(void)
{
- fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
+ fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
+ SLAB_PANIC|SLAB_ACCOUNT);
fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
fanotify_perm_event_cachep =
diff --git a/fs/notify/group.c b/fs/notify/group.c
index aa5468f23e45..c03b83662876 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -22,6 +22,7 @@
#include <linux/srcu.h>
#include <linux/rculist.h>
#include <linux/wait.h>
+#include <linux/memcontrol.h>
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
@@ -36,6 +37,8 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
if (group->ops->free_group_priv)
group->ops->free_group_priv(group);
+ mem_cgroup_put(group->memcg);
+
kfree(group);
}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 9ab6dde38a14..f4184b4f3815 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -31,6 +31,7 @@
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/user.h>
+#include <linux/sched/mm.h>
#include "inotify.h"
@@ -98,7 +99,11 @@ int inotify_handle_event(struct fsnotify_group *group,
i_mark = container_of(inode_mark, struct inotify_inode_mark,
fsn_mark);
- event = kmalloc(alloc_len, GFP_KERNEL);
+ /* Whoever is interested in the event, pays for the allocation. */
+ memalloc_use_memcg(group->memcg);
+ event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT);
+ memalloc_unuse_memcg();
+
if (unlikely(!event)) {
/*
* Treat lost event due to ENOMEM the same way as queue
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 6f48d325c350..ac6978d3208c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -38,6 +38,7 @@
#include <linux/uaccess.h>
#include <linux/poll.h>
#include <linux/wait.h>
+#include <linux/memcontrol.h>
#include "inotify.h"
#include "../fdinfo.h"
@@ -639,6 +640,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
oevent->name_len = 0;
group->max_events = max_events;
+ group->memcg = get_mem_cgroup_from_mm(current->mm);
spin_lock_init(&group->inotify_data.idr_lock);
idr_init(&group->inotify_data.idr);
@@ -815,7 +817,8 @@ static int __init inotify_user_setup(void)
BUG_ON(hweight32(ALL_INOTIFY_BITS) != 22);
- inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
+ inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark,
+ SLAB_PANIC|SLAB_ACCOUNT);
inotify_max_queued_events = 16384;
init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 3a2e509c77c5..8946130c87ad 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -93,13 +93,11 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
ofs = 0;
if (file_ofs < init_size)
ofs = init_size - file_ofs;
- local_irq_save(flags);
kaddr = kmap_atomic(page);
memset(kaddr + bh_offset(bh) + ofs, 0,
bh->b_size - ofs);
flush_dcache_page(page);
kunmap_atomic(kaddr);
- local_irq_restore(flags);
}
} else {
clear_buffer_uptodate(bh);
@@ -146,13 +144,11 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
recs = PAGE_SIZE / rec_size;
/* Should have been verified before we got here... */
BUG_ON(!recs);
- local_irq_save(flags);
kaddr = kmap_atomic(page);
for (i = 0; i < recs; i++)
post_read_mst_fixup((NTFS_RECORD*)(kaddr +
i * rec_size), rec_size);
kunmap_atomic(kaddr);
- local_irq_restore(flags);
flush_dcache_page(page);
if (likely(page_uptodate && !PageError(page)))
SetPageUptodate(page);
@@ -926,7 +922,7 @@ static int ntfs_write_mst_block(struct page *page,
ntfs_volume *vol = ni->vol;
u8 *kaddr;
unsigned int rec_size = ni->itype.index.block_size;
- ntfs_inode *locked_nis[PAGE_SIZE / rec_size];
+ ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE];
struct buffer_head *bh, *head, *tbh, *rec_start_bh;
struct buffer_head *bhs[MAX_BUF_PER_PAGE];
runlist_element *rl;
@@ -935,6 +931,9 @@ static int ntfs_write_mst_block(struct page *page,
bool sync, is_mft, page_is_dirty, rec_is_dirty;
unsigned char bh_size_bits;
+ if (WARN_ON(rec_size < NTFS_BLOCK_SIZE))
+ return -EINVAL;
+
ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
"0x%lx.", vi->i_ino, ni->type, page->index);
BUG_ON(!NInoNonResident(ni));
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index fbd0090d7d0c..df7c32b5fac7 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -128,6 +128,7 @@ static inline void handle_bounds_compressed_page(struct page *page,
/**
* ntfs_decompress - decompress a compression block into an array of pages
* @dest_pages: destination array of pages
+ * @completed_pages: scratch space to track completed pages
* @dest_index: current index into @dest_pages (IN/OUT)
* @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT)
* @dest_max_index: maximum index into @dest_pages (IN)
@@ -162,10 +163,10 @@ static inline void handle_bounds_compressed_page(struct page *page,
* Note to hackers: This function may not sleep until it has finished accessing
* the compression block @cb_start as it is a per-CPU buffer.
*/
-static int ntfs_decompress(struct page *dest_pages[], int *dest_index,
- int *dest_ofs, const int dest_max_index, const int dest_max_ofs,
- const int xpage, char *xpage_done, u8 *const cb_start,
- const u32 cb_size, const loff_t i_size,
+static int ntfs_decompress(struct page *dest_pages[], int completed_pages[],
+ int *dest_index, int *dest_ofs, const int dest_max_index,
+ const int dest_max_ofs, const int xpage, char *xpage_done,
+ u8 *const cb_start, const u32 cb_size, const loff_t i_size,
const s64 initialized_size)
{
/*
@@ -190,9 +191,6 @@ static int ntfs_decompress(struct page *dest_pages[], int *dest_index,
/* Variables for tag and token parsing. */
u8 tag; /* Current tag. */
int token; /* Loop counter for the eight tokens in tag. */
-
- /* Need this because we can't sleep, so need two stages. */
- int completed_pages[dest_max_index - *dest_index + 1];
int nr_completed_pages = 0;
/* Default error code. */
@@ -516,6 +514,7 @@ int ntfs_read_compressed_block(struct page *page)
unsigned int cb_clusters, cb_max_ofs;
int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0;
struct page **pages;
+ int *completed_pages;
unsigned char xpage_done = 0;
ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = "
@@ -528,14 +527,16 @@ int ntfs_read_compressed_block(struct page *page)
BUG_ON(ni->name_len);
pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
+ completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS);
/* Allocate memory to store the buffer heads we need. */
bhs_size = cb_size / block_size * sizeof(struct buffer_head *);
bhs = kmalloc(bhs_size, GFP_NOFS);
- if (unlikely(!pages || !bhs)) {
+ if (unlikely(!pages || !bhs || !completed_pages)) {
kfree(bhs);
kfree(pages);
+ kfree(completed_pages);
unlock_page(page);
ntfs_error(vol->sb, "Failed to allocate internal buffers.");
return -ENOMEM;
@@ -562,6 +563,7 @@ int ntfs_read_compressed_block(struct page *page)
if (xpage >= max_page) {
kfree(bhs);
kfree(pages);
+ kfree(completed_pages);
zero_user(page, 0, PAGE_SIZE);
ntfs_debug("Compressed read outside i_size - truncated?");
SetPageUptodate(page);
@@ -854,10 +856,10 @@ lock_retry_remap:
unsigned int prev_cur_page = cur_page;
ntfs_debug("Found compressed compression block.");
- err = ntfs_decompress(pages, &cur_page, &cur_ofs,
- cb_max_page, cb_max_ofs, xpage, &xpage_done,
- cb_pos, cb_size - (cb_pos - cb), i_size,
- initialized_size);
+ err = ntfs_decompress(pages, completed_pages, &cur_page,
+ &cur_ofs, cb_max_page, cb_max_ofs, xpage,
+ &xpage_done, cb_pos, cb_size - (cb_pos - cb),
+ i_size, initialized_size);
/*
* We can sleep from now on, lock already dropped by
* ntfs_decompress().
@@ -912,6 +914,7 @@ lock_retry_remap:
/* We no longer need the list of pages. */
kfree(pages);
+ kfree(completed_pages);
/* If we have completed the requested page, we return success. */
if (likely(xpage_done))
@@ -956,5 +959,6 @@ err_out:
}
}
kfree(pages);
+ kfree(completed_pages);
return -EIO;
}
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index decaf75d1cd5..bd3221cbdd95 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -667,18 +667,18 @@ static int ntfs_read_locked_inode(struct inode *vi)
* mtime is the last change of the data within the file. Not changed
* when only metadata is changed, e.g. a rename doesn't affect mtime.
*/
- vi->i_mtime = timespec_to_timespec64(ntfs2utc(si->last_data_change_time));
+ vi->i_mtime = ntfs2utc(si->last_data_change_time);
/*
* ctime is the last change of the metadata of the file. This obviously
* always changes, when mtime is changed. ctime can be changed on its
* own, mtime is then not changed, e.g. when a file is renamed.
*/
- vi->i_ctime = timespec_to_timespec64(ntfs2utc(si->last_mft_change_time));
+ vi->i_ctime = ntfs2utc(si->last_mft_change_time);
/*
* Last access to the data within the file. Not changed during a rename
* for example but changed whenever the file is written to.
*/
- vi->i_atime = timespec_to_timespec64(ntfs2utc(si->last_access_time));
+ vi->i_atime = ntfs2utc(si->last_access_time);
/* Find the attribute list attribute if present. */
ntfs_attr_reinit_search_ctx(ctx);
@@ -2997,7 +2997,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
le16_to_cpu(ctx->attr->data.resident.value_offset));
/* Update the access times if they have changed. */
- nt = utc2ntfs(timespec64_to_timespec(vi->i_mtime));
+ nt = utc2ntfs(vi->i_mtime);
if (si->last_data_change_time != nt) {
ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
"new = 0x%llx", vi->i_ino, (long long)
@@ -3006,7 +3006,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
si->last_data_change_time = nt;
modified = true;
}
- nt = utc2ntfs(timespec64_to_timespec(vi->i_ctime));
+ nt = utc2ntfs(vi->i_ctime);
if (si->last_mft_change_time != nt) {
ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
"new = 0x%llx", vi->i_ino, (long long)
@@ -3015,7 +3015,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
si->last_mft_change_time = nt;
modified = true;
}
- nt = utc2ntfs(timespec64_to_timespec(vi->i_atime));
+ nt = utc2ntfs(vi->i_atime);
if (si->last_access_time != nt) {
ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
"new = 0x%llx", vi->i_ino,
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 32c523cf5a2d..fb14d17666c8 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -35,6 +35,8 @@
#include "mft.h"
#include "ntfs.h"
+#define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE)
+
/**
* map_mft_record_page - map the page in which a specific mft record resides
* @ni: ntfs inode whose mft record page to map
@@ -469,7 +471,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
struct page *page;
unsigned int blocksize = vol->sb->s_blocksize;
int max_bhs = vol->mft_record_size / blocksize;
- struct buffer_head *bhs[max_bhs];
+ struct buffer_head *bhs[MAX_BHS];
struct buffer_head *bh, *head;
u8 *kmirr;
runlist_element *rl;
@@ -479,6 +481,8 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
ntfs_debug("Entering for inode 0x%lx.", mft_no);
BUG_ON(!max_bhs);
+ if (WARN_ON(max_bhs > MAX_BHS))
+ return -EINVAL;
if (unlikely(!vol->mftmirr_ino)) {
/* This could happen during umount... */
err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
@@ -674,7 +678,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
unsigned int blocksize = vol->sb->s_blocksize;
unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
int max_bhs = vol->mft_record_size / blocksize;
- struct buffer_head *bhs[max_bhs];
+ struct buffer_head *bhs[MAX_BHS];
struct buffer_head *bh, *head;
runlist_element *rl;
unsigned int block_start, block_end, m_start, m_end;
@@ -684,6 +688,10 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
BUG_ON(NInoAttr(ni));
BUG_ON(!max_bhs);
BUG_ON(!PageLocked(page));
+ if (WARN_ON(max_bhs > MAX_BHS)) {
+ err = -EINVAL;
+ goto err_out;
+ }
/*
* If the ntfs_inode is clean no need to do anything. If it is dirty,
* mark it as clean now so that it can be redirtied later on if needed.
diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h
index 01233989d5d1..24cd719f1fd2 100644
--- a/fs/ntfs/time.h
+++ b/fs/ntfs/time.h
@@ -36,16 +36,16 @@
* Convert the Linux UTC time @ts to its corresponding NTFS time and return
* that in little endian format.
*
- * Linux stores time in a struct timespec consisting of a time_t (long at
- * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second
- * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of
- * 1-nano-second intervals since the value of tv_sec.
+ * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
+ * and a long tv_nsec where tv_sec is the number of 1-second intervals since
+ * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
+ * intervals since the value of tv_sec.
*
* NTFS uses Microsoft's standard time format which is stored in a s64 and is
* measured as the number of 100-nano-second intervals since 1st January 1601,
* 00:00:00 UTC.
*/
-static inline sle64 utc2ntfs(const struct timespec ts)
+static inline sle64 utc2ntfs(const struct timespec64 ts)
{
/*
* Convert the seconds to 100ns intervals, add the nano-seconds
@@ -63,7 +63,10 @@ static inline sle64 utc2ntfs(const struct timespec ts)
*/
static inline sle64 get_current_ntfs_time(void)
{
- return utc2ntfs(current_kernel_time());
+ struct timespec64 ts;
+
+ ktime_get_coarse_real_ts64(&ts);
+ return utc2ntfs(ts);
}
/**
@@ -73,18 +76,18 @@ static inline sle64 get_current_ntfs_time(void)
* Convert the little endian NTFS time @time to its corresponding Linux UTC
* time and return that in cpu format.
*
- * Linux stores time in a struct timespec consisting of a time_t (long at
- * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second
- * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of
- * 1-nano-second intervals since the value of tv_sec.
+ * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
+ * and a long tv_nsec where tv_sec is the number of 1-second intervals since
+ * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
+ * intervals since the value of tv_sec.
*
* NTFS uses Microsoft's standard time format which is stored in a s64 and is
* measured as the number of 100 nano-second intervals since 1st January 1601,
* 00:00:00 UTC.
*/
-static inline struct timespec ntfs2utc(const sle64 time)
+static inline struct timespec64 ntfs2utc(const sle64 time)
{
- struct timespec ts;
+ struct timespec64 ts;
/* Subtract the NTFS time offset. */
u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0f157bbd3e0f..a342f008e42f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -932,13 +932,11 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
goto bail;
}
- if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+ if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation)
rc = ocfs2_error(sb,
"Extent block #%llu has an invalid h_fs_generation of #%u\n",
(unsigned long long)bh->b_blocknr,
le32_to_cpu(eb->h_fs_generation));
- goto bail;
- }
bail:
return rc;
}
@@ -1481,19 +1479,17 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
while(le16_to_cpu(el->l_tree_depth) > 1) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
- ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty extent list (next_free_rec == 0)\n",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
- status = -EIO;
+ status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has empty extent list (next_free_rec == 0)\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
goto bail;
}
i = le16_to_cpu(el->l_next_free_rec) - 1;
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (!blkno) {
- ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has extent list where extent # %d has no physical block start\n",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
- status = -EIO;
+ status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has extent list where extent # %d has no physical block start\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
goto bail;
}
@@ -1598,10 +1594,8 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
* the new data. */
ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
meta_ac);
- if (ret < 0) {
+ if (ret < 0)
mlog_errno(ret);
- goto out;
- }
out:
if (final_depth)
@@ -3214,11 +3208,10 @@ rightmost_no_delete:
goto rightmost_no_delete;
if (le16_to_cpu(el->l_next_free_rec) == 0) {
- ret = -EIO;
- ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty extent block at %llu\n",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
- (unsigned long long)le64_to_cpu(eb->h_blkno));
+ ret = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has empty extent block at %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ (unsigned long long)le64_to_cpu(eb->h_blkno));
goto out;
}
@@ -4411,12 +4404,11 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
le16_to_cpu(new_el->l_count)) {
bh = path_leaf_bh(left_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
- ocfs2_error(sb,
- "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno),
- le16_to_cpu(new_el->l_next_free_rec),
- le16_to_cpu(new_el->l_count));
- status = -EINVAL;
+ status = ocfs2_error(sb,
+ "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n",
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(new_el->l_next_free_rec),
+ le16_to_cpu(new_el->l_count));
goto free_left_path;
}
rec = &new_el->l_recs[
@@ -4466,11 +4458,10 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
bh = path_leaf_bh(right_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
- ocfs2_error(sb,
- "Extent block #%llu has an invalid l_next_free_rec of %d\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno),
- le16_to_cpu(new_el->l_next_free_rec));
- status = -EINVAL;
+ status = ocfs2_error(sb,
+ "Extent block #%llu has an invalid l_next_free_rec of %d\n",
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(new_el->l_next_free_rec));
goto free_right_path;
}
rec = &new_el->l_recs[1];
@@ -5523,10 +5514,8 @@ static int ocfs2_truncate_rec(handle_t *handle,
ocfs2_journal_dirty(handle, path_leaf_bh(path));
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
- if (ret) {
+ if (ret)
mlog_errno(ret);
- goto out;
- }
out:
ocfs2_free_path(left_path);
@@ -5659,10 +5648,8 @@ int ocfs2_remove_extent(handle_t *handle,
ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
cpos, len);
- if (ret) {
+ if (ret)
mlog_errno(ret);
- goto out;
- }
}
out:
@@ -5707,7 +5694,6 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
- goto out;
}
}
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d9ebe11c8990..9f0304f89fc3 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -99,25 +99,34 @@ out:
return ret;
}
+/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it
+ * will be easier to handle read failure.
+ */
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int nr, struct buffer_head *bhs[])
{
int status = 0;
unsigned int i;
struct buffer_head *bh;
+ int new_bh = 0;
trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
if (!nr)
goto bail;
+ /* Don't put buffer head and re-assign it to NULL if it is allocated
+ * outside since the caller can't be aware of this alternation!
+ */
+ new_bh = (bhs[0] == NULL);
+
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(osb->sb, block++);
if (bhs[i] == NULL) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ break;
}
}
bh = bhs[i];
@@ -158,9 +167,26 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
submit_bh(REQ_OP_READ, 0, bh);
}
+read_failure:
for (i = nr; i > 0; i--) {
bh = bhs[i - 1];
+ if (unlikely(status)) {
+ if (new_bh && bh) {
+ /* If middle bh fails, let previous bh
+ * finish its read and then put it to
+ * aovoid bh leak
+ */
+ if (!buffer_jbd(bh))
+ wait_on_buffer(bh);
+ put_bh(bh);
+ bhs[i - 1] = NULL;
+ } else if (bh && buffer_uptodate(bh)) {
+ clear_buffer_uptodate(bh);
+ }
+ continue;
+ }
+
/* No need to wait on the buffer if it's managed by JBD. */
if (!buffer_jbd(bh))
wait_on_buffer(bh);
@@ -170,8 +196,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
* so we can safely record this and loop back
* to cleanup the other buffers. */
status = -EIO;
- put_bh(bh);
- bhs[i - 1] = NULL;
+ goto read_failure;
}
}
@@ -179,6 +204,9 @@ bail:
return status;
}
+/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it
+ * will be easier to handle read failure.
+ */
int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
@@ -188,6 +216,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
int i, ignore_cache = 0;
struct buffer_head *bh;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ int new_bh = 0;
trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
@@ -213,6 +242,11 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
goto bail;
}
+ /* Don't put buffer head and re-assign it to NULL if it is allocated
+ * outside since the caller can't be aware of this alternation!
+ */
+ new_bh = (bhs[0] == NULL);
+
ocfs2_metadata_cache_io_lock(ci);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
@@ -221,7 +255,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
ocfs2_metadata_cache_io_unlock(ci);
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ /* Don't forget to put previous bh! */
+ break;
}
}
bh = bhs[i];
@@ -316,16 +351,27 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
}
}
- status = 0;
-
+read_failure:
for (i = (nr - 1); i >= 0; i--) {
bh = bhs[i];
if (!(flags & OCFS2_BH_READAHEAD)) {
- if (status) {
- /* Clear the rest of the buffers on error */
- put_bh(bh);
- bhs[i] = NULL;
+ if (unlikely(status)) {
+ /* Clear the buffers on error including those
+ * ever succeeded in reading
+ */
+ if (new_bh && bh) {
+ /* If middle bh fails, let previous bh
+ * finish its read and then put it to
+ * aovoid bh leak
+ */
+ if (!buffer_jbd(bh))
+ wait_on_buffer(bh);
+ put_bh(bh);
+ bhs[i] = NULL;
+ } else if (bh && buffer_uptodate(bh)) {
+ clear_buffer_uptodate(bh);
+ }
continue;
}
/* We know this can't have changed as we hold the
@@ -342,9 +388,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
* for this bh as it's not marked locally
* uptodate. */
status = -EIO;
- put_bh(bh);
- bhs[i] = NULL;
- continue;
+ goto read_failure;
}
if (buffer_needs_validate(bh)) {
@@ -354,11 +398,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
BUG_ON(buffer_jbd(bh));
clear_buffer_needs_validate(bh);
status = validate(sb, bh);
- if (status) {
- put_bh(bh);
- bhs[i] = NULL;
- continue;
- }
+ if (status)
+ goto read_failure;
}
}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index ea8c551bcd7e..9b2ed62dd638 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -127,13 +127,13 @@ enum o2hb_heartbeat_modes {
O2HB_HEARTBEAT_NUM_MODES,
};
-char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
- "local", /* O2HB_HEARTBEAT_LOCAL */
- "global", /* O2HB_HEARTBEAT_GLOBAL */
+static const char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
+ "local", /* O2HB_HEARTBEAT_LOCAL */
+ "global", /* O2HB_HEARTBEAT_GLOBAL */
};
unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
-unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
+static unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
/*
* o2hb_dependent_users tracks the number of registered callbacks that depend
@@ -141,7 +141,7 @@ unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
* However only o2dlm depends on the heartbeat. It does not want the heartbeat
* to stop while a dlm domain is still active.
*/
-unsigned int o2hb_dependent_users;
+static unsigned int o2hb_dependent_users;
/*
* In global heartbeat mode, all regions are pinned if there are one or more
@@ -2486,7 +2486,7 @@ unlock:
return ret;
}
-void o2hb_region_dec_user(const char *region_uuid)
+static void o2hb_region_dec_user(const char *region_uuid)
{
spin_lock(&o2hb_live_lock);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index da64c3a20eeb..0e4166cc23a0 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -35,9 +35,9 @@
* cluster references throughout where nodes are looked up */
struct o2nm_cluster *o2nm_single_cluster = NULL;
-char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
- "reset", /* O2NM_FENCE_RESET */
- "panic", /* O2NM_FENCE_PANIC */
+static const char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
+ "reset", /* O2NM_FENCE_RESET */
+ "panic", /* O2NM_FENCE_PANIC */
};
static inline void o2nm_lock_subsystem(void);
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1296f78ae966..7d9eea7d4a87 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -872,8 +872,6 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
"for type %u key %08x\n", msg_type, key);
}
write_unlock(&o2net_handler_lock);
- if (ret)
- goto out;
out:
if (ret)
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index b048d4fa3959..c121abbdfc7d 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1897,8 +1897,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
/* On error, skip the f_pos to the
next block. */
ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
- brelse(bh);
- continue;
+ break;
}
if (le64_to_cpu(de->inode)) {
unsigned char d_type = DT_UNKNOWN;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 0ff424c6d17c..040ddb6e7dab 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -96,7 +96,7 @@ struct ocfs2_unblock_ctl {
};
/* Lockdep class keys */
-struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
+static struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
int new_level);
@@ -2121,10 +2121,10 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
/* LVB only has room for 64 bits of time here so we pack it for
* now. */
-static u64 ocfs2_pack_timespec(struct timespec *spec)
+static u64 ocfs2_pack_timespec(struct timespec64 *spec)
{
u64 res;
- u64 sec = spec->tv_sec;
+ u64 sec = clamp_t(time64_t, spec->tv_sec, 0, 0x3ffffffffull);
u32 nsec = spec->tv_nsec;
res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
@@ -2140,7 +2140,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- struct timespec ts;
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2161,15 +2160,12 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_igid = cpu_to_be32(i_gid_read(inode));
lvb->lvb_imode = cpu_to_be16(inode->i_mode);
lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
- ts = timespec64_to_timespec(inode->i_atime);
lvb->lvb_iatime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&ts));
- ts = timespec64_to_timespec(inode->i_ctime);
+ cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
lvb->lvb_ictime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&ts));
- ts = timespec64_to_timespec(inode->i_mtime);
+ cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
lvb->lvb_imtime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&ts));
+ cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
@@ -2178,7 +2174,7 @@ out:
mlog_meta_lvb(0, lockres);
}
-static void ocfs2_unpack_timespec(struct timespec *spec,
+static void ocfs2_unpack_timespec(struct timespec64 *spec,
u64 packed_time)
{
spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
@@ -2187,7 +2183,6 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
- struct timespec ts;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
@@ -2215,15 +2210,12 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
inode->i_mode = be16_to_cpu(lvb->lvb_imode);
set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
- ocfs2_unpack_timespec(&ts,
+ ocfs2_unpack_timespec(&inode->i_atime,
be64_to_cpu(lvb->lvb_iatime_packed));
- inode->i_atime = timespec_to_timespec64(ts);
- ocfs2_unpack_timespec(&ts,
+ ocfs2_unpack_timespec(&inode->i_mtime,
be64_to_cpu(lvb->lvb_imtime_packed));
- inode->i_mtime = timespec_to_timespec64(ts);
- ocfs2_unpack_timespec(&ts,
+ ocfs2_unpack_timespec(&inode->i_ctime,
be64_to_cpu(lvb->lvb_ictime_packed));
- inode->i_ctime = timespec_to_timespec64(ts);
spin_unlock(&oi->ip_lock);
}
@@ -3601,7 +3593,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
* we can recover correctly from node failure. Otherwise, we may get
* invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
*/
- if (!ocfs2_is_o2cb_active() &&
+ if (ocfs2_userspace_stack(osb) &&
lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
lvb = 1;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9fa35cb6f6e0..1ac9547a2cd7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2343,7 +2343,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
written = __generic_file_write_iter(iocb, from);
/* buffered aio wouldn't have proper lock coverage today */
- BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+ BUG_ON(written == -EIOCBQUEUED && !direct_io);
/*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -2463,7 +2463,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
trace_generic_file_read_iter_ret(ret);
/* buffered aio wouldn't have proper lock coverage today */
- BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+ BUG_ON(ret == -EIOCBQUEUED && !direct_io);
/* see ocfs2_file_write_iter */
if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index ddc3e9470c87..79279240fb6e 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -637,10 +637,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
handle = NULL;
status = ocfs2_commit_truncate(osb, inode, fe_bh);
- if (status < 0) {
+ if (status < 0)
mlog_errno(status);
- goto out;
- }
}
out:
@@ -1499,7 +1497,6 @@ static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
(unsigned long long)bh->b_blocknr,
le32_to_cpu(di->i_fs_generation));
rc = -OCFS2_FILECHECK_ERR_GENERATION;
- goto bail;
}
bail:
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index bd3475694e83..b63c97f4318e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1378,15 +1378,23 @@ static int __ocfs2_recovery_thread(void *arg)
int rm_quota_used = 0, i;
struct ocfs2_quota_recovery *qrec;
+ /* Whether the quota supported. */
+ int quota_enabled = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+ || OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA);
+
status = ocfs2_wait_on_mount(osb);
if (status < 0) {
goto bail;
}
- rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS);
- if (!rm_quota) {
- status = -ENOMEM;
- goto bail;
+ if (quota_enabled) {
+ rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS);
+ if (!rm_quota) {
+ status = -ENOMEM;
+ goto bail;
+ }
}
restart:
status = ocfs2_super_lock(osb, 1);
@@ -1422,9 +1430,14 @@ restart:
* then quota usage would be out of sync until some node takes
* the slot. So we remember which nodes need quota recovery
* and when everything else is done, we recover quotas. */
- for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
- if (i == rm_quota_used)
- rm_quota[rm_quota_used++] = slot_num;
+ if (quota_enabled) {
+ for (i = 0; i < rm_quota_used
+ && rm_quota[i] != slot_num; i++)
+ ;
+
+ if (i == rm_quota_used)
+ rm_quota[rm_quota_used++] = slot_num;
+ }
status = ocfs2_recover_node(osb, node_num, slot_num);
skip_recovery:
@@ -1452,16 +1465,19 @@ skip_recovery:
/* Now it is right time to recover quotas... We have to do this under
* superblock lock so that no one can start using the slot (and crash)
* before we recover it */
- for (i = 0; i < rm_quota_used; i++) {
- qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
- if (IS_ERR(qrec)) {
- status = PTR_ERR(qrec);
- mlog_errno(status);
- continue;
+ if (quota_enabled) {
+ for (i = 0; i < rm_quota_used; i++) {
+ qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+ if (IS_ERR(qrec)) {
+ status = PTR_ERR(qrec);
+ mlog_errno(status);
+ continue;
+ }
+ ocfs2_queue_recovery_completion(osb->journal,
+ rm_quota[i],
+ NULL, NULL, qrec,
+ ORPHAN_NEED_TRUNCATE);
}
- ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
- NULL, NULL, qrec,
- ORPHAN_NEED_TRUNCATE);
}
ocfs2_super_unlock(osb, 1);
@@ -1483,7 +1499,8 @@ bail:
mutex_unlock(&osb->recovery_lock);
- kfree(rm_quota);
+ if (quota_enabled)
+ kfree(rm_quota);
/* no one is callint kthread_stop() for us so the kthread() api
* requires that we call do_exit(). And it isn't exported, but
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index fe0d1f9571bb..7642b6712c39 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -663,11 +663,10 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
#ifdef CONFIG_OCFS2_DEBUG_FS
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
- ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
- (unsigned long long)le64_to_cpu(alloc->i_blkno),
- le32_to_cpu(alloc->id1.bitmap1.i_used),
- ocfs2_local_alloc_count_bits(alloc));
- status = -EIO;
+ status = ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
+ (unsigned long long)le64_to_cpu(alloc->i_blkno),
+ le32_to_cpu(alloc->id1.bitmap1.i_used),
+ ocfs2_local_alloc_count_bits(alloc));
goto bail;
}
#endif
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 16c42ed0dca8..b1a8b046f4c2 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -137,14 +137,13 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
int rc = 0;
struct buffer_head *tmp = *bh;
- if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
- ocfs2_error(inode->i_sb,
- "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)v_block,
- (unsigned long long)i_size_read(inode));
- return -EIO;
- }
+ if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block)
+ return ocfs2_error(inode->i_sb,
+ "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)v_block,
+ (unsigned long long)i_size_read(inode));
+
rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
ocfs2_validate_quota_block);
if (rc)
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index d6c350ba25b9..c4b029c43464 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -48,12 +48,6 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
*/
static struct ocfs2_stack_plugin *active_stack;
-inline int ocfs2_is_o2cb_active(void)
-{
- return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB);
-}
-EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active);
-
static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
{
struct ocfs2_stack_plugin *p;
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index e3036e1790e8..f2dce10fae54 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,9 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
-/* In ocfs2_downconvert_lock(), we need to know which stack we are using */
-int ocfs2_is_o2cb_active(void);
-
extern struct kset *ocfs2_kset;
#endif /* STACKGLUE_H */
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 0eaeb41453f5..817c02b13b1d 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -31,6 +31,7 @@ config PROC_FS
config PROC_KCORE
bool "/proc/kcore support" if !ARM
depends on PROC_FS && MMU
+ select CRASH_CORE
help
Provides a virtual ELF core file of the live kernel. This can
be read with gdb and other ELF tools. No modifications can be
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 66ee5b7d7937..2a238d68610e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -463,7 +463,7 @@ static int lstats_show_proc(struct seq_file *m, void *v)
if (!task)
return -ESRCH;
seq_puts(m, "Latency Top version : v0.1\n");
- for (i = 0; i < 32; i++) {
+ for (i = 0; i < LT_SAVECOUNT; i++) {
struct latency_record *lr = &task->latency_record[i];
if (lr->backtrace[0]) {
int q;
@@ -1366,10 +1366,8 @@ static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
if (!task)
return -ESRCH;
len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth);
- len = simple_read_from_buffer(buf, count, ppos, numbuf, len);
put_task_struct(task);
-
- return len;
+ return simple_read_from_buffer(buf, count, ppos, numbuf, len);
}
static const struct file_operations proc_fail_nth_operations = {
@@ -2519,47 +2517,47 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
struct inode * inode = file_inode(file);
+ struct task_struct *task;
void *page;
- ssize_t length;
- struct task_struct *task = get_proc_task(inode);
-
- length = -ESRCH;
- if (!task)
- goto out_no_task;
+ int rv;
+ rcu_read_lock();
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (!task) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
/* A task may only write its own attributes. */
- length = -EACCES;
- if (current != task)
- goto out;
+ if (current != task) {
+ rcu_read_unlock();
+ return -EACCES;
+ }
+ rcu_read_unlock();
if (count > PAGE_SIZE)
count = PAGE_SIZE;
/* No partial writes. */
- length = -EINVAL;
if (*ppos != 0)
- goto out;
+ return -EINVAL;
page = memdup_user(buf, count);
if (IS_ERR(page)) {
- length = PTR_ERR(page);
+ rv = PTR_ERR(page);
goto out;
}
/* Guard against adverse ptrace interaction */
- length = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
- if (length < 0)
+ rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
+ if (rv < 0)
goto out_free;
- length = security_setprocattr(file->f_path.dentry->d_name.name,
- page, count);
+ rv = security_setprocattr(file->f_path.dentry->d_name.name, page, count);
mutex_unlock(&current->signal->cred_guard_mutex);
out_free:
kfree(page);
out:
- put_task_struct(task);
-out_no_task:
- return length;
+ return rv;
}
static const struct file_operations proc_pid_attr_operations = {
@@ -3327,12 +3325,12 @@ static const struct pid_entry tid_base_stuff[] = {
REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
ONE("stat", S_IRUGO, proc_tid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
- REG("maps", S_IRUGO, proc_tid_maps_operations),
+ REG("maps", S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_PROC_CHILDREN
REG("children", S_IRUGO, proc_tid_children_operations),
#endif
#ifdef CONFIG_NUMA
- REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations),
+ REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
LNK("cwd", proc_cwd_link),
@@ -3342,7 +3340,7 @@ static const struct pid_entry tid_base_stuff[] = {
REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
- REG("smaps", S_IRUGO, proc_tid_smaps_operations),
+ REG("smaps", S_IRUGO, proc_pid_smaps_operations),
REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index bb1c1625b158..8ae109429a88 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -286,9 +286,9 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx,
if (!dir_emit_dots(file, ctx))
return 0;
+ i = ctx->pos - 2;
read_lock(&proc_subdir_lock);
de = pde_subdir_first(de);
- i = ctx->pos - 2;
for (;;) {
if (!de) {
read_unlock(&proc_subdir_lock);
@@ -309,8 +309,8 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx,
pde_put(de);
return 0;
}
- read_lock(&proc_subdir_lock);
ctx->pos++;
+ read_lock(&proc_subdir_lock);
next = pde_subdir_next(de);
pde_put(de);
de = next;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 38155bec4a54..82a157f8e932 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -104,8 +104,10 @@ void __init proc_init_kmemcache(void)
kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0,
SLAB_ACCOUNT|SLAB_PANIC, NULL);
proc_dir_entry_cache = kmem_cache_create_usercopy(
- "proc_dir_entry", SIZEOF_PDE_SLOT, 0, SLAB_PANIC,
- OFFSETOF_PDE_NAME, SIZEOF_PDE_INLINE_NAME, NULL);
+ "proc_dir_entry", SIZEOF_PDE, 0, SLAB_PANIC,
+ offsetof(struct proc_dir_entry, inline_name),
+ SIZEOF_PDE_INLINE_NAME, NULL);
+ BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);
}
static int proc_show_options(struct seq_file *seq, struct dentry *root)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index ea8c5468eafc..3ced8ff06eb7 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -65,16 +65,13 @@ struct proc_dir_entry {
char inline_name[];
} __randomize_layout;
-#define OFFSETOF_PDE_NAME offsetof(struct proc_dir_entry, inline_name)
-#define SIZEOF_PDE_SLOT \
- (OFFSETOF_PDE_NAME + 34 <= 64 ? 64 : \
- OFFSETOF_PDE_NAME + 34 <= 128 ? 128 : \
- OFFSETOF_PDE_NAME + 34 <= 192 ? 192 : \
- OFFSETOF_PDE_NAME + 34 <= 256 ? 256 : \
- OFFSETOF_PDE_NAME + 34 <= 512 ? 512 : \
- 0)
-
-#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE_SLOT - OFFSETOF_PDE_NAME)
+#define SIZEOF_PDE ( \
+ sizeof(struct proc_dir_entry) < 128 ? 128 : \
+ sizeof(struct proc_dir_entry) < 192 ? 192 : \
+ sizeof(struct proc_dir_entry) < 256 ? 256 : \
+ sizeof(struct proc_dir_entry) < 512 ? 512 : \
+ 0)
+#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
@@ -116,12 +113,12 @@ static inline void *__PDE_DATA(const struct inode *inode)
return PDE(inode)->data;
}
-static inline struct pid *proc_pid(struct inode *inode)
+static inline struct pid *proc_pid(const struct inode *inode)
{
return PROC_I(inode)->pid;
}
-static inline struct task_struct *get_proc_task(struct inode *inode)
+static inline struct task_struct *get_proc_task(const struct inode *inode)
{
return get_pid_task(proc_pid(inode), PIDTYPE_PID);
}
@@ -283,7 +280,6 @@ struct proc_maps_private {
struct inode *inode;
struct task_struct *task;
struct mm_struct *mm;
- struct mem_size_stats *rollup;
#ifdef CONFIG_MMU
struct vm_area_struct *tail_vma;
#endif
@@ -295,12 +291,9 @@ struct proc_maps_private {
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);
extern const struct file_operations proc_pid_maps_operations;
-extern const struct file_operations proc_tid_maps_operations;
extern const struct file_operations proc_pid_numa_maps_operations;
-extern const struct file_operations proc_tid_numa_maps_operations;
extern const struct file_operations proc_pid_smaps_operations;
extern const struct file_operations proc_pid_smaps_rollup_operations;
-extern const struct file_operations proc_tid_smaps_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e64ecb9f2720..80464432dfe6 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -10,6 +10,7 @@
* Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
*/
+#include <linux/crash_core.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/kcore.h>
@@ -49,32 +50,23 @@ static struct proc_dir_entry *proc_root_kcore;
#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
#endif
-/* An ELF note in memory */
-struct memelfnote
-{
- const char *name;
- int type;
- unsigned int datasz;
- void *data;
-};
-
static LIST_HEAD(kclist_head);
-static DEFINE_RWLOCK(kclist_lock);
+static DECLARE_RWSEM(kclist_lock);
static int kcore_need_update = 1;
-void
-kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
+/* This doesn't grab kclist_lock, so it should only be used at init time. */
+void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
+ int type)
{
new->addr = (unsigned long)addr;
new->size = size;
new->type = type;
- write_lock(&kclist_lock);
list_add_tail(&new->list, &kclist_head);
- write_unlock(&kclist_lock);
}
-static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
+static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len,
+ size_t *data_offset)
{
size_t try, size;
struct kcore_list *m;
@@ -88,53 +80,19 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
size = try;
*nphdr = *nphdr + 1;
}
- *elf_buflen = sizeof(struct elfhdr) +
- (*nphdr + 2)*sizeof(struct elf_phdr) +
- 3 * ((sizeof(struct elf_note)) +
- roundup(sizeof(CORE_STR), 4)) +
- roundup(sizeof(struct elf_prstatus), 4) +
- roundup(sizeof(struct elf_prpsinfo), 4) +
- roundup(arch_task_struct_size, 4);
- *elf_buflen = PAGE_ALIGN(*elf_buflen);
- return size + *elf_buflen;
-}
-
-static void free_kclist_ents(struct list_head *head)
-{
- struct kcore_list *tmp, *pos;
- list_for_each_entry_safe(pos, tmp, head, list) {
- list_del(&pos->list);
- kfree(pos);
- }
+ *phdrs_len = *nphdr * sizeof(struct elf_phdr);
+ *notes_len = (4 * sizeof(struct elf_note) +
+ 3 * ALIGN(sizeof(CORE_STR), 4) +
+ VMCOREINFO_NOTE_NAME_BYTES +
+ ALIGN(sizeof(struct elf_prstatus), 4) +
+ ALIGN(sizeof(struct elf_prpsinfo), 4) +
+ ALIGN(arch_task_struct_size, 4) +
+ ALIGN(vmcoreinfo_size, 4));
+ *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
+ *notes_len);
+ return *data_offset + size;
}
-/*
- * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list.
- */
-static void __kcore_update_ram(struct list_head *list)
-{
- int nphdr;
- size_t size;
- struct kcore_list *tmp, *pos;
- LIST_HEAD(garbage);
-
- write_lock(&kclist_lock);
- if (kcore_need_update) {
- list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
- if (pos->type == KCORE_RAM
- || pos->type == KCORE_VMEMMAP)
- list_move(&pos->list, &garbage);
- }
- list_splice_tail(list, &kclist_head);
- } else
- list_splice(list, &garbage);
- kcore_need_update = 0;
- proc_root_kcore->size = get_kcore_size(&nphdr, &size);
- write_unlock(&kclist_lock);
-
- free_kclist_ents(&garbage);
-}
-
#ifdef CONFIG_HIGHMEM
/*
@@ -142,11 +100,9 @@ static void __kcore_update_ram(struct list_head *list)
* because memory hole is not as big as !HIGHMEM case.
* (HIGHMEM is special because part of memory is _invisible_ from the kernel.)
*/
-static int kcore_update_ram(void)
+static int kcore_ram_list(struct list_head *head)
{
- LIST_HEAD(head);
struct kcore_list *ent;
- int ret = 0;
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
@@ -154,9 +110,8 @@ static int kcore_update_ram(void)
ent->addr = (unsigned long)__va(0);
ent->size = max_low_pfn << PAGE_SHIFT;
ent->type = KCORE_RAM;
- list_add(&ent->list, &head);
- __kcore_update_ram(&head);
- return ret;
+ list_add(&ent->list, head);
+ return 0;
}
#else /* !CONFIG_HIGHMEM */
@@ -255,11 +210,10 @@ free_out:
return 1;
}
-static int kcore_update_ram(void)
+static int kcore_ram_list(struct list_head *list)
{
int nid, ret;
unsigned long end_pfn;
- LIST_HEAD(head);
/* Not inialized....update now */
/* find out "max pfn" */
@@ -271,258 +225,255 @@ static int kcore_update_ram(void)
end_pfn = node_end;
}
/* scan 0 to max_pfn */
- ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private);
- if (ret) {
- free_kclist_ents(&head);
+ ret = walk_system_ram_range(0, end_pfn, list, kclist_add_private);
+ if (ret)
return -ENOMEM;
- }
- __kcore_update_ram(&head);
- return ret;
+ return 0;
}
#endif /* CONFIG_HIGHMEM */
-/*****************************************************************************/
-/*
- * determine size of ELF note
- */
-static int notesize(struct memelfnote *en)
-{
- int sz;
-
- sz = sizeof(struct elf_note);
- sz += roundup((strlen(en->name) + 1), 4);
- sz += roundup(en->datasz, 4);
-
- return sz;
-} /* end notesize() */
-
-/*****************************************************************************/
-/*
- * store a note in the header buffer
- */
-static char *storenote(struct memelfnote *men, char *bufp)
+static int kcore_update_ram(void)
{
- struct elf_note en;
-
-#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0)
-
- en.n_namesz = strlen(men->name) + 1;
- en.n_descsz = men->datasz;
- en.n_type = men->type;
-
- DUMP_WRITE(&en, sizeof(en));
- DUMP_WRITE(men->name, en.n_namesz);
-
- /* XXX - cast from long long to long to avoid need for libgcc.a */
- bufp = (char*) roundup((unsigned long)bufp,4);
- DUMP_WRITE(men->data, men->datasz);
- bufp = (char*) roundup((unsigned long)bufp,4);
-
-#undef DUMP_WRITE
-
- return bufp;
-} /* end storenote() */
+ LIST_HEAD(list);
+ LIST_HEAD(garbage);
+ int nphdr;
+ size_t phdrs_len, notes_len, data_offset;
+ struct kcore_list *tmp, *pos;
+ int ret = 0;
-/*
- * store an ELF coredump header in the supplied buffer
- * nphdr is the number of elf_phdr to insert
- */
-static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
-{
- struct elf_prstatus prstatus; /* NT_PRSTATUS */
- struct elf_prpsinfo prpsinfo; /* NT_PRPSINFO */
- struct elf_phdr *nhdr, *phdr;
- struct elfhdr *elf;
- struct memelfnote notes[3];
- off_t offset = 0;
- struct kcore_list *m;
+ down_write(&kclist_lock);
+ if (!xchg(&kcore_need_update, 0))
+ goto out;
- /* setup ELF header */
- elf = (struct elfhdr *) bufp;
- bufp += sizeof(struct elfhdr);
- offset += sizeof(struct elfhdr);
- memcpy(elf->e_ident, ELFMAG, SELFMAG);
- elf->e_ident[EI_CLASS] = ELF_CLASS;
- elf->e_ident[EI_DATA] = ELF_DATA;
- elf->e_ident[EI_VERSION]= EV_CURRENT;
- elf->e_ident[EI_OSABI] = ELF_OSABI;
- memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
- elf->e_type = ET_CORE;
- elf->e_machine = ELF_ARCH;
- elf->e_version = EV_CURRENT;
- elf->e_entry = 0;
- elf->e_phoff = sizeof(struct elfhdr);
- elf->e_shoff = 0;
- elf->e_flags = ELF_CORE_EFLAGS;
- elf->e_ehsize = sizeof(struct elfhdr);
- elf->e_phentsize= sizeof(struct elf_phdr);
- elf->e_phnum = nphdr;
- elf->e_shentsize= 0;
- elf->e_shnum = 0;
- elf->e_shstrndx = 0;
-
- /* setup ELF PT_NOTE program header */
- nhdr = (struct elf_phdr *) bufp;
- bufp += sizeof(struct elf_phdr);
- offset += sizeof(struct elf_phdr);
- nhdr->p_type = PT_NOTE;
- nhdr->p_offset = 0;
- nhdr->p_vaddr = 0;
- nhdr->p_paddr = 0;
- nhdr->p_filesz = 0;
- nhdr->p_memsz = 0;
- nhdr->p_flags = 0;
- nhdr->p_align = 0;
-
- /* setup ELF PT_LOAD program header for every area */
- list_for_each_entry(m, &kclist_head, list) {
- phdr = (struct elf_phdr *) bufp;
- bufp += sizeof(struct elf_phdr);
- offset += sizeof(struct elf_phdr);
-
- phdr->p_type = PT_LOAD;
- phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_offset = kc_vaddr_to_offset(m->addr) + dataoff;
- phdr->p_vaddr = (size_t)m->addr;
- if (m->type == KCORE_RAM || m->type == KCORE_TEXT)
- phdr->p_paddr = __pa(m->addr);
- else
- phdr->p_paddr = (elf_addr_t)-1;
- phdr->p_filesz = phdr->p_memsz = m->size;
- phdr->p_align = PAGE_SIZE;
+ ret = kcore_ram_list(&list);
+ if (ret) {
+ /* Couldn't get the RAM list, try again next time. */
+ WRITE_ONCE(kcore_need_update, 1);
+ list_splice_tail(&list, &garbage);
+ goto out;
}
- /*
- * Set up the notes in similar form to SVR4 core dumps made
- * with info from their /proc.
- */
- nhdr->p_offset = offset;
-
- /* set up the process status */
- notes[0].name = CORE_STR;
- notes[0].type = NT_PRSTATUS;
- notes[0].datasz = sizeof(struct elf_prstatus);
- notes[0].data = &prstatus;
-
- memset(&prstatus, 0, sizeof(struct elf_prstatus));
-
- nhdr->p_filesz = notesize(&notes[0]);
- bufp = storenote(&notes[0], bufp);
-
- /* set up the process info */
- notes[1].name = CORE_STR;
- notes[1].type = NT_PRPSINFO;
- notes[1].datasz = sizeof(struct elf_prpsinfo);
- notes[1].data = &prpsinfo;
-
- memset(&prpsinfo, 0, sizeof(struct elf_prpsinfo));
- prpsinfo.pr_state = 0;
- prpsinfo.pr_sname = 'R';
- prpsinfo.pr_zomb = 0;
-
- strcpy(prpsinfo.pr_fname, "vmlinux");
- strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs));
-
- nhdr->p_filesz += notesize(&notes[1]);
- bufp = storenote(&notes[1], bufp);
+ list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
+ if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP)
+ list_move(&pos->list, &garbage);
+ }
+ list_splice_tail(&list, &kclist_head);
- /* set up the task structure */
- notes[2].name = CORE_STR;
- notes[2].type = NT_TASKSTRUCT;
- notes[2].datasz = arch_task_struct_size;
- notes[2].data = current;
+ proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, &notes_len,
+ &data_offset);
- nhdr->p_filesz += notesize(&notes[2]);
- bufp = storenote(&notes[2], bufp);
+out:
+ up_write(&kclist_lock);
+ list_for_each_entry_safe(pos, tmp, &garbage, list) {
+ list_del(&pos->list);
+ kfree(pos);
+ }
+ return ret;
+}
-} /* end elf_kcore_store_hdr() */
+static void append_kcore_note(char *notes, size_t *i, const char *name,
+ unsigned int type, const void *desc,
+ size_t descsz)
+{
+ struct elf_note *note = (struct elf_note *)&notes[*i];
+
+ note->n_namesz = strlen(name) + 1;
+ note->n_descsz = descsz;
+ note->n_type = type;
+ *i += sizeof(*note);
+ memcpy(&notes[*i], name, note->n_namesz);
+ *i = ALIGN(*i + note->n_namesz, 4);
+ memcpy(&notes[*i], desc, descsz);
+ *i = ALIGN(*i + descsz, 4);
+}
-/*****************************************************************************/
-/*
- * read from the ELF header and then kernel memory
- */
static ssize_t
read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
{
char *buf = file->private_data;
- ssize_t acc = 0;
- size_t size, tsz;
- size_t elf_buflen;
+ size_t phdrs_offset, notes_offset, data_offset;
+ size_t phdrs_len, notes_len;
+ struct kcore_list *m;
+ size_t tsz;
int nphdr;
unsigned long start;
+ size_t orig_buflen = buflen;
+ int ret = 0;
- read_lock(&kclist_lock);
- size = get_kcore_size(&nphdr, &elf_buflen);
+ down_read(&kclist_lock);
+
+ get_kcore_size(&nphdr, &phdrs_len, &notes_len, &data_offset);
+ phdrs_offset = sizeof(struct elfhdr);
+ notes_offset = phdrs_offset + phdrs_len;
+
+ /* ELF file header. */
+ if (buflen && *fpos < sizeof(struct elfhdr)) {
+ struct elfhdr ehdr = {
+ .e_ident = {
+ [EI_MAG0] = ELFMAG0,
+ [EI_MAG1] = ELFMAG1,
+ [EI_MAG2] = ELFMAG2,
+ [EI_MAG3] = ELFMAG3,
+ [EI_CLASS] = ELF_CLASS,
+ [EI_DATA] = ELF_DATA,
+ [EI_VERSION] = EV_CURRENT,
+ [EI_OSABI] = ELF_OSABI,
+ },
+ .e_type = ET_CORE,
+ .e_machine = ELF_ARCH,
+ .e_version = EV_CURRENT,
+ .e_phoff = sizeof(struct elfhdr),
+ .e_flags = ELF_CORE_EFLAGS,
+ .e_ehsize = sizeof(struct elfhdr),
+ .e_phentsize = sizeof(struct elf_phdr),
+ .e_phnum = nphdr,
+ };
+
+ tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
+ if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
- if (buflen == 0 || *fpos >= size) {
- read_unlock(&kclist_lock);
- return 0;
+ buffer += tsz;
+ buflen -= tsz;
+ *fpos += tsz;
}
- /* trim buflen to not go beyond EOF */
- if (buflen > size - *fpos)
- buflen = size - *fpos;
-
- /* construct an ELF core header if we'll need some of it */
- if (*fpos < elf_buflen) {
- char * elf_buf;
-
- tsz = elf_buflen - *fpos;
- if (buflen < tsz)
- tsz = buflen;
- elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
- if (!elf_buf) {
- read_unlock(&kclist_lock);
- return -ENOMEM;
+ /* ELF program headers. */
+ if (buflen && *fpos < phdrs_offset + phdrs_len) {
+ struct elf_phdr *phdrs, *phdr;
+
+ phdrs = kzalloc(phdrs_len, GFP_KERNEL);
+ if (!phdrs) {
+ ret = -ENOMEM;
+ goto out;
}
- elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
- read_unlock(&kclist_lock);
- if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
- kfree(elf_buf);
- return -EFAULT;
+
+ phdrs[0].p_type = PT_NOTE;
+ phdrs[0].p_offset = notes_offset;
+ phdrs[0].p_filesz = notes_len;
+
+ phdr = &phdrs[1];
+ list_for_each_entry(m, &kclist_head, list) {
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R | PF_W | PF_X;
+ phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset;
+ phdr->p_vaddr = (size_t)m->addr;
+ if (m->type == KCORE_RAM)
+ phdr->p_paddr = __pa(m->addr);
+ else if (m->type == KCORE_TEXT)
+ phdr->p_paddr = __pa_symbol(m->addr);
+ else
+ phdr->p_paddr = (elf_addr_t)-1;
+ phdr->p_filesz = phdr->p_memsz = m->size;
+ phdr->p_align = PAGE_SIZE;
+ phdr++;
}
- kfree(elf_buf);
+
+ tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
+ if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset,
+ tsz)) {
+ kfree(phdrs);
+ ret = -EFAULT;
+ goto out;
+ }
+ kfree(phdrs);
+
+ buffer += tsz;
buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
- acc += tsz;
+ }
+
+ /* ELF note segment. */
+ if (buflen && *fpos < notes_offset + notes_len) {
+ struct elf_prstatus prstatus = {};
+ struct elf_prpsinfo prpsinfo = {
+ .pr_sname = 'R',
+ .pr_fname = "vmlinux",
+ };
+ char *notes;
+ size_t i = 0;
+
+ strlcpy(prpsinfo.pr_psargs, saved_command_line,
+ sizeof(prpsinfo.pr_psargs));
+
+ notes = kzalloc(notes_len, GFP_KERNEL);
+ if (!notes) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus,
+ sizeof(prstatus));
+ append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo,
+ sizeof(prpsinfo));
+ append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
+ arch_task_struct_size);
+ /*
+ * vmcoreinfo_size is mostly constant after init time, but it
+ * can be changed by crash_save_vmcoreinfo(). Racing here with a
+ * panic on another CPU before the machine goes down is insanely
+ * unlikely, but it's better to not leave potential buffer
+ * overflows lying around, regardless.
+ */
+ append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0,
+ vmcoreinfo_data,
+ min(vmcoreinfo_size, notes_len - i));
+
+ tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
+ if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
+ kfree(notes);
+ ret = -EFAULT;
+ goto out;
+ }
+ kfree(notes);
- /* leave now if filled buffer already */
- if (buflen == 0)
- return acc;
- } else
- read_unlock(&kclist_lock);
+ buffer += tsz;
+ buflen -= tsz;
+ *fpos += tsz;
+ }
/*
* Check to see if our file offset matches with any of
* the addresses in the elf_phdr on our list.
*/
- start = kc_offset_to_vaddr(*fpos - elf_buflen);
+ start = kc_offset_to_vaddr(*fpos - data_offset);
if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
tsz = buflen;
-
- while (buflen) {
- struct kcore_list *m;
- read_lock(&kclist_lock);
- list_for_each_entry(m, &kclist_head, list) {
- if (start >= m->addr && start < (m->addr+m->size))
- break;
+ m = NULL;
+ while (buflen) {
+ /*
+ * If this is the first iteration or the address is not within
+ * the previous entry, search for a matching entry.
+ */
+ if (!m || start < m->addr || start >= m->addr + m->size) {
+ list_for_each_entry(m, &kclist_head, list) {
+ if (start >= m->addr &&
+ start < m->addr + m->size)
+ break;
+ }
}
- read_unlock(&kclist_lock);
if (&m->list == &kclist_head) {
- if (clear_user(buffer, tsz))
- return -EFAULT;
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
} else if (m->type == KCORE_VMALLOC) {
vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
- if (copy_to_user(buffer, buf, tsz))
- return -EFAULT;
+ if (copy_to_user(buffer, buf, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
} else if (m->type == KCORE_USER) {
/* User page is handled prior to normal kernel page: */
- if (copy_to_user(buffer, (char *)start, tsz))
- return -EFAULT;
+ if (copy_to_user(buffer, (char *)start, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
} else {
if (kern_addr_valid(start)) {
/*
@@ -530,29 +481,37 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
* hardened user copy kernel text checks.
*/
if (probe_kernel_read(buf, (void *) start, tsz)) {
- if (clear_user(buffer, tsz))
- return -EFAULT;
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
} else {
- if (copy_to_user(buffer, buf, tsz))
- return -EFAULT;
+ if (copy_to_user(buffer, buf, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
} else {
- if (clear_user(buffer, tsz))
- return -EFAULT;
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
}
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
- acc += tsz;
start += tsz;
tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
}
- return acc;
+out:
+ up_read(&kclist_lock);
+ if (ret)
+ return ret;
+ return orig_buflen - buflen;
}
-
static int open_kcore(struct inode *inode, struct file *filp)
{
if (!capable(CAP_SYS_RAWIO))
@@ -592,9 +551,8 @@ static int __meminit kcore_callback(struct notifier_block *self,
switch (action) {
case MEM_ONLINE:
case MEM_OFFLINE:
- write_lock(&kclist_lock);
kcore_need_update = 1;
- write_unlock(&kclist_lock);
+ break;
}
return NOTIFY_OK;
}
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 2fb04846ed11..edda898714eb 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -7,6 +7,7 @@
#include <linux/mman.h>
#include <linux/mmzone.h>
#include <linux/proc_fs.h>
+#include <linux/percpu.h>
#include <linux/quicklist.h>
#include <linux/seq_file.h>
#include <linux/swap.h>
@@ -121,6 +122,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
(unsigned long)VMALLOC_TOTAL >> 10);
show_val_kb(m, "VmallocUsed: ", 0ul);
show_val_kb(m, "VmallocChunk: ", 0ul);
+ show_val_kb(m, "Percpu: ", pcpu_nr_pages());
#ifdef CONFIG_MEMORY_FAILURE
seq_printf(m, "HardwareCorrupted: %5lu kB\n",
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 59749dfaef67..535eda7857cf 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -183,7 +183,7 @@ static int show_stat(struct seq_file *p, void *v)
static int stat_open(struct inode *inode, struct file *file)
{
- size_t size = 1024 + 128 * num_online_cpus();
+ unsigned int size = 1024 + 128 * num_online_cpus();
/* minimum size to display an interrupt count : 2 bytes */
size += 2 * nr_irqs;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5c7950d8200b..669abb617321 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -247,7 +247,6 @@ static int proc_map_release(struct inode *inode, struct file *file)
if (priv->mm)
mmdrop(priv->mm);
- kfree(priv->rollup);
return seq_release_private(inode, file);
}
@@ -294,7 +293,7 @@ static void show_vma_header_prefix(struct seq_file *m,
}
static void
-show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
+show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
@@ -357,35 +356,18 @@ done:
seq_putc(m, '\n');
}
-static int show_map(struct seq_file *m, void *v, int is_pid)
+static int show_map(struct seq_file *m, void *v)
{
- show_map_vma(m, v, is_pid);
+ show_map_vma(m, v);
m_cache_vma(m, v);
return 0;
}
-static int show_pid_map(struct seq_file *m, void *v)
-{
- return show_map(m, v, 1);
-}
-
-static int show_tid_map(struct seq_file *m, void *v)
-{
- return show_map(m, v, 0);
-}
-
static const struct seq_operations proc_pid_maps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_pid_map
-};
-
-static const struct seq_operations proc_tid_maps_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_tid_map
+ .show = show_map
};
static int pid_maps_open(struct inode *inode, struct file *file)
@@ -393,11 +375,6 @@ static int pid_maps_open(struct inode *inode, struct file *file)
return do_maps_open(inode, file, &proc_pid_maps_op);
}
-static int tid_maps_open(struct inode *inode, struct file *file)
-{
- return do_maps_open(inode, file, &proc_tid_maps_op);
-}
-
const struct file_operations proc_pid_maps_operations = {
.open = pid_maps_open,
.read = seq_read,
@@ -405,13 +382,6 @@ const struct file_operations proc_pid_maps_operations = {
.release = proc_map_release,
};
-const struct file_operations proc_tid_maps_operations = {
- .open = tid_maps_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = proc_map_release,
-};
-
/*
* Proportional Set Size(PSS): my share of RSS.
*
@@ -433,7 +403,6 @@ const struct file_operations proc_tid_maps_operations = {
#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
- bool first;
unsigned long resident;
unsigned long shared_clean;
unsigned long shared_dirty;
@@ -447,7 +416,6 @@ struct mem_size_stats {
unsigned long swap;
unsigned long shared_hugetlb;
unsigned long private_hugetlb;
- unsigned long first_vma_start;
u64 pss;
u64 pss_locked;
u64 swap_pss;
@@ -731,14 +699,9 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
}
#endif /* HUGETLB_PAGE */
-#define SEQ_PUT_DEC(str, val) \
- seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
-static int show_smap(struct seq_file *m, void *v, int is_pid)
+static void smap_gather_stats(struct vm_area_struct *vma,
+ struct mem_size_stats *mss)
{
- struct proc_maps_private *priv = m->private;
- struct vm_area_struct *vma = v;
- struct mem_size_stats mss_stack;
- struct mem_size_stats *mss;
struct mm_walk smaps_walk = {
.pmd_entry = smaps_pte_range,
#ifdef CONFIG_HUGETLB_PAGE
@@ -746,23 +709,6 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
#endif
.mm = vma->vm_mm,
};
- int ret = 0;
- bool rollup_mode;
- bool last_vma;
-
- if (priv->rollup) {
- rollup_mode = true;
- mss = priv->rollup;
- if (mss->first) {
- mss->first_vma_start = vma->vm_start;
- mss->first = false;
- }
- last_vma = !m_next_vma(priv, vma);
- } else {
- rollup_mode = false;
- memset(&mss_stack, 0, sizeof(mss_stack));
- mss = &mss_stack;
- }
smaps_walk.private = mss;
@@ -794,79 +740,116 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
walk_page_vma(vma, &smaps_walk);
if (vma->vm_flags & VM_LOCKED)
mss->pss_locked += mss->pss;
+}
- if (!rollup_mode) {
- show_map_vma(m, vma, is_pid);
- } else if (last_vma) {
- show_vma_header_prefix(
- m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0);
- seq_pad(m, ' ');
- seq_puts(m, "[rollup]\n");
- } else {
- ret = SEQ_SKIP;
- }
-
- if (!rollup_mode) {
- SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start);
- SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
- SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma));
- seq_puts(m, " kB\n");
- }
+#define SEQ_PUT_DEC(str, val) \
+ seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
- if (!rollup_mode || last_vma) {
- SEQ_PUT_DEC("Rss: ", mss->resident);
- SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
- SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean);
- SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty);
- SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean);
- SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
- SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
- SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
- SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
- SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
- SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
- SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
- seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
- mss->private_hugetlb >> 10, 7);
- SEQ_PUT_DEC(" kB\nSwap: ", mss->swap);
- SEQ_PUT_DEC(" kB\nSwapPss: ",
- mss->swap_pss >> PSS_SHIFT);
- SEQ_PUT_DEC(" kB\nLocked: ",
- mss->pss_locked >> PSS_SHIFT);
- seq_puts(m, " kB\n");
- }
- if (!rollup_mode) {
- if (arch_pkeys_enabled())
- seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
- show_smap_vma_flags(m, vma);
- }
- m_cache_vma(m, vma);
- return ret;
+/* Show the contents common for smaps and smaps_rollup */
+static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss)
+{
+ SEQ_PUT_DEC("Rss: ", mss->resident);
+ SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean);
+ SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty);
+ SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean);
+ SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
+ SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
+ SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
+ SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
+ SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
+ SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
+ SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
+ seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
+ mss->private_hugetlb >> 10, 7);
+ SEQ_PUT_DEC(" kB\nSwap: ", mss->swap);
+ SEQ_PUT_DEC(" kB\nSwapPss: ",
+ mss->swap_pss >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nLocked: ",
+ mss->pss_locked >> PSS_SHIFT);
+ seq_puts(m, " kB\n");
}
-#undef SEQ_PUT_DEC
-static int show_pid_smap(struct seq_file *m, void *v)
+static int show_smap(struct seq_file *m, void *v)
{
- return show_smap(m, v, 1);
+ struct vm_area_struct *vma = v;
+ struct mem_size_stats mss;
+
+ memset(&mss, 0, sizeof(mss));
+
+ smap_gather_stats(vma, &mss);
+
+ show_map_vma(m, vma);
+
+ SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start);
+ SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
+ SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma));
+ seq_puts(m, " kB\n");
+
+ __show_smap(m, &mss);
+
+ if (arch_pkeys_enabled())
+ seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
+ show_smap_vma_flags(m, vma);
+
+ m_cache_vma(m, vma);
+
+ return 0;
}
-static int show_tid_smap(struct seq_file *m, void *v)
+static int show_smaps_rollup(struct seq_file *m, void *v)
{
- return show_smap(m, v, 0);
+ struct proc_maps_private *priv = m->private;
+ struct mem_size_stats mss;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ unsigned long last_vma_end = 0;
+ int ret = 0;
+
+ priv->task = get_proc_task(priv->inode);
+ if (!priv->task)
+ return -ESRCH;
+
+ mm = priv->mm;
+ if (!mm || !mmget_not_zero(mm)) {
+ ret = -ESRCH;
+ goto out_put_task;
+ }
+
+ memset(&mss, 0, sizeof(mss));
+
+ down_read(&mm->mmap_sem);
+ hold_task_mempolicy(priv);
+
+ for (vma = priv->mm->mmap; vma; vma = vma->vm_next) {
+ smap_gather_stats(vma, &mss);
+ last_vma_end = vma->vm_end;
+ }
+
+ show_vma_header_prefix(m, priv->mm->mmap->vm_start,
+ last_vma_end, 0, 0, 0, 0);
+ seq_pad(m, ' ');
+ seq_puts(m, "[rollup]\n");
+
+ __show_smap(m, &mss);
+
+ release_task_mempolicy(priv);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+
+out_put_task:
+ put_task_struct(priv->task);
+ priv->task = NULL;
+
+ return ret;
}
+#undef SEQ_PUT_DEC
static const struct seq_operations proc_pid_smaps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_pid_smap
-};
-
-static const struct seq_operations proc_tid_smaps_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_tid_smap
+ .show = show_smap
};
static int pid_smaps_open(struct inode *inode, struct file *file)
@@ -874,28 +857,45 @@ static int pid_smaps_open(struct inode *inode, struct file *file)
return do_maps_open(inode, file, &proc_pid_smaps_op);
}
-static int pid_smaps_rollup_open(struct inode *inode, struct file *file)
+static int smaps_rollup_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
+ int ret;
struct proc_maps_private *priv;
- int ret = do_maps_open(inode, file, &proc_pid_smaps_op);
-
- if (ret < 0)
- return ret;
- seq = file->private_data;
- priv = seq->private;
- priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL);
- if (!priv->rollup) {
- proc_map_release(inode, file);
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT);
+ if (!priv)
return -ENOMEM;
+
+ ret = single_open(file, show_smaps_rollup, priv);
+ if (ret)
+ goto out_free;
+
+ priv->inode = inode;
+ priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(priv->mm)) {
+ ret = PTR_ERR(priv->mm);
+
+ single_release(inode, file);
+ goto out_free;
}
- priv->rollup->first = true;
+
return 0;
+
+out_free:
+ kfree(priv);
+ return ret;
}
-static int tid_smaps_open(struct inode *inode, struct file *file)
+static int smaps_rollup_release(struct inode *inode, struct file *file)
{
- return do_maps_open(inode, file, &proc_tid_smaps_op);
+ struct seq_file *seq = file->private_data;
+ struct proc_maps_private *priv = seq->private;
+
+ if (priv->mm)
+ mmdrop(priv->mm);
+
+ kfree(priv);
+ return single_release(inode, file);
}
const struct file_operations proc_pid_smaps_operations = {
@@ -906,17 +906,10 @@ const struct file_operations proc_pid_smaps_operations = {
};
const struct file_operations proc_pid_smaps_rollup_operations = {
- .open = pid_smaps_rollup_open,
+ .open = smaps_rollup_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = proc_map_release,
-};
-
-const struct file_operations proc_tid_smaps_operations = {
- .open = tid_smaps_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = proc_map_release,
+ .release = smaps_rollup_release,
};
enum clear_refs_types {
@@ -1728,7 +1721,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
/*
* Display pages allocated per node and memory policy via /proc.
*/
-static int show_numa_map(struct seq_file *m, void *v, int is_pid)
+static int show_numa_map(struct seq_file *m, void *v)
{
struct numa_maps_private *numa_priv = m->private;
struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
@@ -1812,45 +1805,17 @@ out:
return 0;
}
-static int show_pid_numa_map(struct seq_file *m, void *v)
-{
- return show_numa_map(m, v, 1);
-}
-
-static int show_tid_numa_map(struct seq_file *m, void *v)
-{
- return show_numa_map(m, v, 0);
-}
-
static const struct seq_operations proc_pid_numa_maps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_pid_numa_map,
+ .show = show_numa_map,
};
-static const struct seq_operations proc_tid_numa_maps_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_tid_numa_map,
-};
-
-static int numa_maps_open(struct inode *inode, struct file *file,
- const struct seq_operations *ops)
-{
- return proc_maps_open(inode, file, ops,
- sizeof(struct numa_maps_private));
-}
-
static int pid_numa_maps_open(struct inode *inode, struct file *file)
{
- return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
-}
-
-static int tid_numa_maps_open(struct inode *inode, struct file *file)
-{
- return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
+ return proc_maps_open(inode, file, &proc_pid_numa_maps_op,
+ sizeof(struct numa_maps_private));
}
const struct file_operations proc_pid_numa_maps_operations = {
@@ -1860,10 +1825,4 @@ const struct file_operations proc_pid_numa_maps_operations = {
.release = proc_map_release,
};
-const struct file_operations proc_tid_numa_maps_operations = {
- .open = tid_numa_maps_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = proc_map_release,
-};
#endif /* CONFIG_NUMA */
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5b62f57bd9bc..0b63d68dedb2 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -142,8 +142,7 @@ static int is_stack(struct vm_area_struct *vma)
/*
* display a single VMA to a sequenced file
*/
-static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
- int is_pid)
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long ino = 0;
@@ -189,22 +188,11 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
/*
* display mapping lines for a particular process's /proc/pid/maps
*/
-static int show_map(struct seq_file *m, void *_p, int is_pid)
+static int show_map(struct seq_file *m, void *_p)
{
struct rb_node *p = _p;
- return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb),
- is_pid);
-}
-
-static int show_pid_map(struct seq_file *m, void *_p)
-{
- return show_map(m, _p, 1);
-}
-
-static int show_tid_map(struct seq_file *m, void *_p)
-{
- return show_map(m, _p, 0);
+ return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
}
static void *m_start(struct seq_file *m, loff_t *pos)
@@ -260,14 +248,7 @@ static const struct seq_operations proc_pid_maps_ops = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_pid_map
-};
-
-static const struct seq_operations proc_tid_maps_ops = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_tid_map
+ .show = show_map
};
static int maps_open(struct inode *inode, struct file *file,
@@ -308,11 +289,6 @@ static int pid_maps_open(struct inode *inode, struct file *file)
return maps_open(inode, file, &proc_pid_maps_ops);
}
-static int tid_maps_open(struct inode *inode, struct file *file)
-{
- return maps_open(inode, file, &proc_tid_maps_ops);
-}
-
const struct file_operations proc_pid_maps_operations = {
.open = pid_maps_open,
.read = seq_read,
@@ -320,10 +296,3 @@ const struct file_operations proc_pid_maps_operations = {
.release = map_release,
};
-const struct file_operations proc_tid_maps_operations = {
- .open = tid_maps_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = map_release,
-};
-
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 3f723cb478af..a4c2791ab70b 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -9,7 +9,7 @@
static int uptime_proc_show(struct seq_file *m, void *v)
{
- struct timespec uptime;
+ struct timespec64 uptime;
struct timespec64 idle;
u64 nsec;
u32 rem;
@@ -19,7 +19,7 @@ static int uptime_proc_show(struct seq_file *m, void *v)
for_each_possible_cpu(i)
nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
- get_monotonic_boottime(&uptime);
+ ktime_get_boottime_ts64(&uptime);
idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
idle.tv_nsec = rem;
seq_printf(m, "%lu.%02lu %lu.%02lu\n",
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cfb6674331fd..6c1c2607e9e4 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -379,7 +379,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
* On s390 the fault handler is used for memory regions that can't be mapped
* directly with remap_pfn_range().
*/
-static int mmap_vmcore_fault(struct vm_fault *vmf)
+static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
{
#ifdef CONFIG_S390
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index e3c558d1b78c..3a5a752d96c7 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -33,30 +33,22 @@ static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize)
return 0;
}
-static char *print_time(time_t t)
-{
- static char timebuf[256];
-
- sprintf(timebuf, "%ld", t);
- return timebuf;
-}
-
static void sd_print_item(struct item_head *ih, char *item)
{
printk("\tmode | size | nlinks | first direct | mtime\n");
if (stat_data_v1(ih)) {
struct stat_data_v1 *sd = (struct stat_data_v1 *)item;
- printk("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd),
+ printk("\t0%-6o | %6u | %2u | %d | %u\n", sd_v1_mode(sd),
sd_v1_size(sd), sd_v1_nlink(sd),
sd_v1_first_direct_byte(sd),
- print_time(sd_v1_mtime(sd)));
+ sd_v1_mtime(sd));
} else {
struct stat_data *sd = (struct stat_data *)item;
- printk("\t0%-6o | %6llu | %2u | %d | %s\n", sd_v2_mode(sd),
+ printk("\t0%-6o | %6llu | %2u | %d | %u\n", sd_v2_mode(sd),
(unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd),
- sd_v2_rdev(sd), print_time(sd_v2_mtime(sd)));
+ sd_v2_rdev(sd), sd_v2_mtime(sd));
}
}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 52eb5d293a34..8a76f9d14bc6 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2381,7 +2381,7 @@ static int journal_read(struct super_block *sb)
struct reiserfs_journal_desc *desc;
unsigned int oldest_trans_id = 0;
unsigned int oldest_invalid_trans_id = 0;
- time_t start;
+ time64_t start;
unsigned long oldest_start = 0;
unsigned long cur_dblock = 0;
unsigned long newest_mount_id = 9;
@@ -2395,7 +2395,7 @@ static int journal_read(struct super_block *sb)
cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
reiserfs_info(sb, "checking transaction log (%pg)\n",
journal->j_dev_bd);
- start = get_seconds();
+ start = ktime_get_seconds();
/*
* step 1, read in the journal header block. Check the transaction
@@ -2556,7 +2556,7 @@ start_log_replay:
if (replay_count > 0) {
reiserfs_info(sb,
"replayed %d transactions in %lu seconds\n",
- replay_count, get_seconds() - start);
+ replay_count, ktime_get_seconds() - start);
}
/* needed to satisfy the locking in _update_journal_header_block */
reiserfs_write_lock(sb);
@@ -2914,7 +2914,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
int new_alloc)
{
struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
- time_t now = get_seconds();
+ time64_t now = ktime_get_seconds();
/* cannot restart while nested */
BUG_ON(!th->t_trans_id);
if (th->t_refcount > 1)
@@ -3023,7 +3023,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
struct super_block *sb, unsigned long nblocks,
int join)
{
- time_t now = get_seconds();
+ time64_t now = ktime_get_seconds();
unsigned int old_trans_id;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
struct reiserfs_transaction_handle myth;
@@ -3056,7 +3056,7 @@ relock:
PROC_INFO_INC(sb, journal.journal_relock_writers);
goto relock;
}
- now = get_seconds();
+ now = ktime_get_seconds();
/*
* if there is no room in the journal OR
@@ -3119,7 +3119,7 @@ relock:
}
/* we are the first writer, set trans_id */
if (journal->j_trans_start_time == 0) {
- journal->j_trans_start_time = get_seconds();
+ journal->j_trans_start_time = ktime_get_seconds();
}
atomic_inc(&journal->j_wcount);
journal->j_len_alloc += nblocks;
@@ -3559,11 +3559,11 @@ static void flush_async_commits(struct work_struct *work)
*/
void reiserfs_flush_old_commits(struct super_block *sb)
{
- time_t now;
+ time64_t now;
struct reiserfs_transaction_handle th;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
- now = get_seconds();
+ now = ktime_get_seconds();
/*
* safety check so we don't flush while we are replaying the log during
* mount
@@ -3613,7 +3613,7 @@ void reiserfs_flush_old_commits(struct super_block *sb)
static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
{
- time_t now;
+ time64_t now;
int flush = flags & FLUSH_ALL;
int commit_now = flags & COMMIT_NOW;
int wait_on_commit = flags & WAIT;
@@ -3694,7 +3694,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
}
/* deal with old transactions where we are the last writers */
- now = get_seconds();
+ now = ktime_get_seconds();
if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
commit_now = 1;
journal->j_next_async_flush = 1;
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index e39b3910d24d..f2cf3441fdfc 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -297,6 +297,13 @@ static int show_oidmap(struct seq_file *m, void *unused)
return 0;
}
+static time64_t ktime_mono_to_real_seconds(time64_t mono)
+{
+ ktime_t kt = ktime_set(mono, NSEC_PER_SEC/2);
+
+ return ktime_divns(ktime_mono_to_real(kt), NSEC_PER_SEC);
+}
+
static int show_journal(struct seq_file *m, void *unused)
{
struct super_block *sb = m->private;
@@ -325,7 +332,7 @@ static int show_journal(struct seq_file *m, void *unused)
"j_bcount: \t%lu\n"
"j_first_unflushed_offset: \t%lu\n"
"j_last_flush_trans_id: \t%u\n"
- "j_trans_start_time: \t%li\n"
+ "j_trans_start_time: \t%lli\n"
"j_list_bitmap_index: \t%i\n"
"j_must_wait: \t%i\n"
"j_next_full_flush: \t%i\n"
@@ -366,7 +373,7 @@ static int show_journal(struct seq_file *m, void *unused)
JF(j_bcount),
JF(j_first_unflushed_offset),
JF(j_last_flush_trans_id),
- JF(j_trans_start_time),
+ ktime_mono_to_real_seconds(JF(j_trans_start_time)),
JF(j_list_bitmap_index),
JF(j_must_wait),
JF(j_next_full_flush),
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index ae4811fecc1f..e5ca9ed79e54 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -271,7 +271,7 @@ struct reiserfs_journal_list {
struct mutex j_commit_mutex;
unsigned int j_trans_id;
- time_t j_timestamp;
+ time64_t j_timestamp; /* write-only but useful for crash dump analysis */
struct reiserfs_list_bitmap *j_list_bitmap;
struct buffer_head *j_commit_bh; /* commit buffer head */
struct reiserfs_journal_cnode *j_realblock;
@@ -331,7 +331,7 @@ struct reiserfs_journal {
struct buffer_head *j_header_bh;
- time_t j_trans_start_time; /* time this transaction started */
+ time64_t j_trans_start_time; /* time this transaction started */
struct mutex j_mutex;
struct mutex j_flush_mutex;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 4cc090b50cc5..1dea7a8a5255 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -90,23 +90,22 @@ EXPORT_SYMBOL(seq_open);
static int traverse(struct seq_file *m, loff_t offset)
{
- loff_t pos = 0, index;
+ loff_t pos = 0;
int error = 0;
void *p;
m->version = 0;
- index = 0;
+ m->index = 0;
m->count = m->from = 0;
- if (!offset) {
- m->index = index;
+ if (!offset)
return 0;
- }
+
if (!m->buf) {
m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
if (!m->buf)
return -ENOMEM;
}
- p = m->op->start(m, &index);
+ p = m->op->start(m, &m->index);
while (p) {
error = PTR_ERR(p);
if (IS_ERR(p))
@@ -123,20 +122,15 @@ static int traverse(struct seq_file *m, loff_t offset)
if (pos + m->count > offset) {
m->from = offset - pos;
m->count -= m->from;
- m->index = index;
break;
}
pos += m->count;
m->count = 0;
- if (pos == offset) {
- index++;
- m->index = index;
+ p = m->op->next(m, p, &m->index);
+ if (pos == offset)
break;
- }
- p = m->op->next(m, p, &index);
}
m->op->stop(m, p);
- m->index = index;
return error;
Eoverflow:
@@ -160,7 +154,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{
struct seq_file *m = file->private_data;
size_t copied = 0;
- loff_t pos;
size_t n;
void *p;
int err = 0;
@@ -223,16 +216,12 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
size -= n;
buf += n;
copied += n;
- if (!m->count) {
- m->from = 0;
- m->index++;
- }
if (!size)
goto Done;
}
/* we need at least one record in buffer */
- pos = m->index;
- p = m->op->start(m, &pos);
+ m->from = 0;
+ p = m->op->start(m, &m->index);
while (1) {
err = PTR_ERR(p);
if (!p || IS_ERR(p))
@@ -243,8 +232,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
if (unlikely(err))
m->count = 0;
if (unlikely(!m->count)) {
- p = m->op->next(m, p, &pos);
- m->index = pos;
+ p = m->op->next(m, p, &m->index);
continue;
}
if (m->count < m->size)
@@ -256,29 +244,33 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
if (!m->buf)
goto Enomem;
m->version = 0;
- pos = m->index;
- p = m->op->start(m, &pos);
+ p = m->op->start(m, &m->index);
}
m->op->stop(m, p);
m->count = 0;
goto Done;
Fill:
/* they want more? let's try to get some more */
- while (m->count < size) {
+ while (1) {
size_t offs = m->count;
- loff_t next = pos;
- p = m->op->next(m, p, &next);
+ loff_t pos = m->index;
+
+ p = m->op->next(m, p, &m->index);
+ if (pos == m->index)
+ /* Buggy ->next function */
+ m->index++;
if (!p || IS_ERR(p)) {
err = PTR_ERR(p);
break;
}
+ if (m->count >= size)
+ break;
err = m->op->show(m, p);
if (seq_has_overflowed(m) || err) {
m->count = offs;
if (likely(err <= 0))
break;
}
- pos = next;
}
m->op->stop(m, p);
n = min(m->count, size);
@@ -287,11 +279,7 @@ Fill:
goto Efault;
copied += n;
m->count -= n;
- if (m->count)
- m->from = n;
- else
- pos++;
- m->index = pos;
+ m->from = n;
Done:
if (!copied)
copied = err;
diff --git a/fs/super.c b/fs/super.c
index a3adcad07fc9..426161360af3 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -146,6 +146,9 @@ static unsigned long super_cache_count(struct shrinker *shrink,
total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
+ if (!total_objects)
+ return SHRINK_EMPTY;
+
total_objects = vfs_pressure_ratio(total_objects);
return total_objects;
}
@@ -243,10 +246,6 @@ static struct super_block *alloc_super(struct fs_context *fc)
INIT_LIST_HEAD(&s->s_inodes_wb);
spin_lock_init(&s->s_inode_wblist_lock);
- if (list_lru_init_memcg(&s->s_dentry_lru))
- goto fail;
- if (list_lru_init_memcg(&s->s_inode_lru))
- goto fail;
s->s_count = 1;
atomic_set(&s->s_active, 1);
mutex_init(&s->s_vfs_rename_mutex);
@@ -264,6 +263,10 @@ static struct super_block *alloc_super(struct fs_context *fc)
s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
if (prealloc_shrinker(&s->s_shrink))
goto fail;
+ if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
+ goto fail;
+ if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
+ goto fail;
return s;
fail:
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 47f66bbc4578..e8927ea70d12 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -35,7 +35,7 @@
static int sysv_sync_fs(struct super_block *sb, int wait)
{
struct sysv_sb_info *sbi = SYSV_SB(sb);
- unsigned long time = get_seconds(), old_time;
+ u32 time = (u32)ktime_get_real_seconds(), old_time;
mutex_lock(&sbi->s_lock);
@@ -46,8 +46,8 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
*/
old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
if (sbi->s_type == FSTYPE_SYSV4) {
- if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time))
- *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38 - time);
+ if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38u - old_time))
+ *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38u - time);
*sbi->s_sb_time = cpu_to_fs32(sbi, time);
mark_buffer_dirty(sbi->s_bh2);
}
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index e727ee07dbe4..075d3d9114c8 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -547,7 +547,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
/*
* Block can be extended
*/
- ucg->cg_time = cpu_to_fs32(sb, get_seconds());
+ ucg->cg_time = ufs_get_seconds(sb);
for (i = newcount; i < (uspi->s_fpb - fragoff); i++)
if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
break;
@@ -639,7 +639,7 @@ cg_found:
if (!ufs_cg_chkmagic(sb, ucg))
ufs_panic (sb, "ufs_alloc_fragments",
"internal error, bad magic number on cg %u", cgno);
- ucg->cg_time = cpu_to_fs32(sb, get_seconds());
+ ucg->cg_time = ufs_get_seconds(sb);
if (count == uspi->s_fpb) {
result = ufs_alloccg_block (inode, ucpi, goal, err);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 02c0a4be4212..969fd60436d3 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -89,7 +89,7 @@ void ufs_free_inode (struct inode * inode)
if (!ufs_cg_chkmagic(sb, ucg))
ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number");
- ucg->cg_time = cpu_to_fs32(sb, get_seconds());
+ ucg->cg_time = ufs_get_seconds(sb);
is_directory = S_ISDIR(inode->i_mode);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 96a20a76e3c4..f48a5b802221 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -698,7 +698,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
usb1 = ubh_get_usb_first(uspi);
usb3 = ubh_get_usb_third(uspi);
- usb1->fs_time = cpu_to_fs32(sb, get_seconds());
+ usb1->fs_time = ufs_get_seconds(sb);
if ((flags & UFS_ST_MASK) == UFS_ST_SUN ||
(flags & UFS_ST_MASK) == UFS_ST_SUNOS ||
(flags & UFS_ST_MASK) == UFS_ST_SUNx86)
@@ -1344,7 +1344,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags,
*/
if (*mount_flags & SB_RDONLY) {
ufs_put_super_internal(sb);
- usb1->fs_time = cpu_to_fs32(sb, get_seconds());
+ usb1->fs_time = ufs_get_seconds(sb);
if ((flags & UFS_ST_MASK) == UFS_ST_SUN
|| (flags & UFS_ST_MASK) == UFS_ST_SUNOS
|| (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 1907be6d5808..1fd3011ea623 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -590,3 +590,17 @@ static inline int ufs_is_data_ptr_zero(struct ufs_sb_private_info *uspi,
else
return *(__fs32 *)p == 0;
}
+
+static inline __fs32 ufs_get_seconds(struct super_block *sbp)
+{
+ time64_t now = ktime_get_real_seconds();
+
+ /* Signed 32-bit interpretation wraps around in 2038, which
+ * happens in ufs1 inode stamps but not ufs2 using 64-bits
+ * stamps. For superblock and blockgroup, let's assume
+ * unsigned 32-bit stamps, which are good until y2106.
+ * Wrap around rather than clamp here to make the dirty
+ * file system detection work in the superblock stamp.
+ */
+ return cpu_to_fs32(sbp, lower_32_bits(now));
+}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index bad9cea37f12..f649023b19b5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -910,7 +910,7 @@ wakeup:
*/
spin_lock(&ctx->fault_pending_wqh.lock);
__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
- __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
+ __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
spin_unlock(&ctx->fault_pending_wqh.lock);
/* Flush pending events that may still wait on event_wqh */
@@ -1066,7 +1066,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
* anyway.
*/
list_del(&uwq->wq.entry);
- __add_wait_queue(&ctx->fault_wqh, &uwq->wq);
+ add_wait_queue(&ctx->fault_wqh, &uwq->wq);
write_seqcount_end(&ctx->refile_seq);
@@ -1215,7 +1215,7 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx,
__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
range);
if (waitqueue_active(&ctx->fault_wqh))
- __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range);
+ __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
spin_unlock(&ctx->fault_pending_wqh.lock);
}
@@ -1849,17 +1849,14 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
struct userfaultfd_ctx *ctx = f->private_data;
wait_queue_entry_t *wq;
- struct userfaultfd_wait_queue *uwq;
unsigned long pending = 0, total = 0;
spin_lock(&ctx->fault_pending_wqh.lock);
list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
- uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
pending++;
total++;
}
list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
- uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
total++;
}
spin_unlock(&ctx->fault_pending_wqh.lock);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index bcce532ca90c..61a5ad2600e8 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1154,7 +1154,7 @@ xfs_file_mmap(
file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp)))
- vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ vma->vm_flags |= VM_HUGEPAGE;
return 0;
}
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index a7613e1b0c87..20561a60db9c 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -75,9 +75,19 @@ struct bug_entry {
/*
* WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report
- * significant issues that need prompt attention if they should ever
- * appear at runtime. Use the versions with printk format strings
- * to provide better diagnostics.
+ * significant kernel issues that need prompt attention if they should ever
+ * appear at runtime.
+ *
+ * Do not use these macros when checking for invalid external inputs
+ * (e.g. invalid system call arguments, or invalid data coming from
+ * network/devices), and on transient conditions like ENOMEM or EAGAIN.
+ * These macros should be used for recoverable kernel issues only.
+ * For invalid external inputs, transient conditions, etc use
+ * pr_err[_once/_ratelimited]() followed by dump_stack(), if necessary.
+ * Do not include "BUG"/"WARNING" in format strings manually to make these
+ * conditions distinguishable from kernel issues.
+ *
+ * Use the versions with printk format strings to provide better diagnostics.
*/
#ifndef __WARN_TAINT
extern __printf(3, 4)
diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h
index 68efb950a918..4d73e6e3c66c 100644
--- a/include/asm-generic/export.h
+++ b/include/asm-generic/export.h
@@ -5,12 +5,10 @@
#define KSYM_FUNC(x) x
#endif
#ifdef CONFIG_64BIT
-#define __put .quad
#ifndef KSYM_ALIGN
#define KSYM_ALIGN 8
#endif
#else
-#define __put .long
#ifndef KSYM_ALIGN
#define KSYM_ALIGN 4
#endif
@@ -19,6 +17,16 @@
#define KCRC_ALIGN 4
#endif
+.macro __put, val, name
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ .long \val - ., \name - .
+#elif defined(CONFIG_64BIT)
+ .quad \val, \name
+#else
+ .long \val, \name
+#endif
+.endm
+
/*
* note on .section use: @progbits vs %progbits nastiness doesn't matter,
* since we immediately emit into those sections anyway.
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index a75cb371cd19..88ebc6102c7c 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1095,6 +1095,24 @@ static inline bool arch_has_pfn_modify_check(void)
}
#endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */
+/*
+ * Architecture PAGE_KERNEL_* fallbacks
+ *
+ * Some architectures don't define certain PAGE_KERNEL_* flags. This is either
+ * because they really don't support them, or the port needs to be updated to
+ * reflect the required functionality. Below are a set of relatively safe
+ * fallbacks, as best effort, which we can count on in lieu of the architectures
+ * not defining them on their own yet.
+ */
+
+#ifndef PAGE_KERNEL_RO
+# define PAGE_KERNEL_RO PAGE_KERNEL
+#endif
+
+#ifndef PAGE_KERNEL_EXEC
+# define PAGE_KERNEL_EXEC PAGE_KERNEL
+#endif
+
#endif /* !__ASSEMBLY__ */
#ifndef io_remap_pfn_range
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 24251762c20c..9a6bc0951cfa 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -12,6 +12,7 @@
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/kref.h>
+#include <linux/refcount.h>
struct page;
struct device;
@@ -75,7 +76,7 @@ enum wb_reason {
*/
struct bdi_writeback_congested {
unsigned long state; /* WB_[a]sync_congested flags */
- atomic_t refcnt; /* nr of attached wb's and blkg */
+ refcount_t refcnt; /* nr of attached wb's and blkg */
#ifdef CONFIG_CGROUP_WRITEBACK
struct backing_dev_info *__bdi; /* the associated bdi, set to NULL
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 72ca0f3d39f3..c28a47cbe355 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -404,13 +404,13 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
static inline struct bdi_writeback_congested *
wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
{
- atomic_inc(&bdi->wb_congested->refcnt);
+ refcount_inc(&bdi->wb_congested->refcnt);
return bdi->wb_congested;
}
static inline void wb_congested_put(struct bdi_writeback_congested *congested)
{
- if (atomic_dec_and_test(&congested->refcnt))
+ if (refcount_dec_and_test(&congested->refcnt))
kfree(congested);
}
diff --git a/include/linux/bits.h b/include/linux/bits.h
index 2b7b532c1d51..94d09b8ab3f0 100644
--- a/include/linux/bits.h
+++ b/include/linux/bits.h
@@ -10,6 +10,7 @@
#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG))
#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG)
#define BITS_PER_BYTE 8
+#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
/*
* Create a contiguous bitmask starting at bit position @l and ending at
diff --git a/include/linux/cma.h b/include/linux/cma.h
index bf90f0bb42bd..190184b5ff32 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -33,7 +33,7 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
const char *name,
struct cma **res_cma);
extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
- gfp_t gfp_mask);
+ bool no_warn);
extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 42506e4d1f53..61c844d4ab48 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -280,6 +280,25 @@ unsigned long read_word_at_a_time(const void *addr)
#endif /* __KERNEL__ */
+/*
+ * Force the compiler to emit 'sym' as a symbol, so that we can reference
+ * it from inline assembler. Necessary in case 'sym' could be inlined
+ * otherwise, or eliminated entirely due to lack of references that are
+ * visible to the compiler.
+ */
+#define __ADDRESSABLE(sym) \
+ static void * __attribute__((section(".discard.addressable"), used)) \
+ __PASTE(__addressable_##sym, __LINE__) = (void *)&sym;
+
+/**
+ * offset_to_ptr - convert a relative memory offset to an absolute pointer
+ * @off: the address of the 32-bit offset value
+ */
+static inline void *offset_to_ptr(const int *off)
+{
+ return (void *)((unsigned long)off + *off);
+}
+
#endif /* __ASSEMBLY__ */
#ifndef __optimize
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index b511f6d24b42..525510a9f965 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -60,6 +60,8 @@ phys_addr_t paddr_vmcoreinfo_note(void);
#define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+extern unsigned char *vmcoreinfo_data;
+extern size_t vmcoreinfo_size;
extern u32 *vmcoreinfo_note;
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
diff --git a/include/linux/crc64.h b/include/linux/crc64.h
new file mode 100644
index 000000000000..c756e65a1b58
--- /dev/null
+++ b/include/linux/crc64.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * See lib/crc64.c for the related specification and polynomial arithmetic.
+ */
+#ifndef _LINUX_CRC64_H
+#define _LINUX_CRC64_H
+
+#include <linux/types.h>
+
+u64 __pure crc64_be(u64 crc, const void *p, size_t len);
+#endif /* _LINUX_CRC64_H */
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
index 3c5a4cb3eb95..f247e8aa5e3d 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -112,7 +112,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size,
}
struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
- unsigned int order, gfp_t gfp_mask);
+ unsigned int order, bool no_warn);
bool dma_release_from_contiguous(struct device *dev, struct page *pages,
int count);
@@ -145,7 +145,7 @@ int dma_declare_contiguous(struct device *dev, phys_addr_t size,
static inline
struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
- unsigned int order, gfp_t gfp_mask)
+ unsigned int order, bool no_warn)
{
return NULL;
}
diff --git a/include/linux/export.h b/include/linux/export.h
index b768d6dd3c90..ae072bc5aacf 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -18,12 +18,6 @@
#define VMLINUX_SYMBOL_STR(x) __VMLINUX_SYMBOL_STR(x)
#ifndef __ASSEMBLY__
-struct kernel_symbol
-{
- unsigned long value;
- const char *name;
-};
-
#ifdef MODULE
extern struct module __this_module;
#define THIS_MODULE (&__this_module)
@@ -54,19 +48,58 @@ extern struct module __this_module;
#define __CRC_SYMBOL(sym, sec)
#endif
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+#include <linux/compiler.h>
+/*
+ * Emit the ksymtab entry as a pair of relative references: this reduces
+ * the size by half on 64-bit architectures, and eliminates the need for
+ * absolute relocations that require runtime processing on relocatable
+ * kernels.
+ */
+#define __KSYMTAB_ENTRY(sym, sec) \
+ __ADDRESSABLE(sym) \
+ asm(" .section \"___ksymtab" sec "+" #sym "\", \"a\" \n" \
+ " .balign 8 \n" \
+ "__ksymtab_" #sym ": \n" \
+ " .long " #sym "- . \n" \
+ " .long __kstrtab_" #sym "- . \n" \
+ " .previous \n")
+
+struct kernel_symbol {
+ int value_offset;
+ int name_offset;
+};
+#else
+#define __KSYMTAB_ENTRY(sym, sec) \
+ static const struct kernel_symbol __ksymtab_##sym \
+ __attribute__((section("___ksymtab" sec "+" #sym), used)) \
+ = { (unsigned long)&sym, __kstrtab_##sym }
+
+struct kernel_symbol {
+ unsigned long value;
+ const char *name;
+};
+#endif
+
/* For every exported symbol, place a struct in the __ksymtab section */
#define ___EXPORT_SYMBOL(sym, sec) \
extern typeof(sym) sym; \
__CRC_SYMBOL(sym, sec) \
static const char __kstrtab_##sym[] \
- __attribute__((section("__ksymtab_strings"), aligned(1))) \
+ __attribute__((section("__ksymtab_strings"), used, aligned(1))) \
= #sym; \
- static const struct kernel_symbol __ksymtab_##sym \
- __used \
- __attribute__((section("___ksymtab" sec "+" #sym), used)) \
- = { (unsigned long)&sym, __kstrtab_##sym }
+ __KSYMTAB_ENTRY(sym, sec)
+
+#if defined(__DISABLE_EXPORTS)
+
+/*
+ * Allow symbol exports to be disabled completely so that C code may
+ * be reused in other execution contexts such as the UEFI stub or the
+ * decompressor.
+ */
+#define __EXPORT_SYMBOL(sym, sec)
-#if defined(__KSYM_DEPS__)
+#elif defined(__KSYM_DEPS__)
/*
* For fine grained build dependencies, we want to tell the build system
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f8ef2c5d0085..5e3bc3747015 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -79,6 +79,8 @@ extern struct inodes_stat_t inodes_stat;
extern int leases_enable, lease_break_time;
extern int sysctl_protected_symlinks;
extern int sysctl_protected_hardlinks;
+extern int sysctl_protected_fifos;
+extern int sysctl_protected_regular;
typedef __kernel_rwf_t rwf_t;
@@ -190,7 +192,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
#define ATTR_ATIME_SET (1 << 7)
#define ATTR_MTIME_SET (1 << 8)
#define ATTR_FORCE (1 << 9) /* Not a change, but a change it */
-#define ATTR_ATTR_FLAG (1 << 10)
#define ATTR_KILL_SUID (1 << 11)
#define ATTR_KILL_SGID (1 << 12)
#define ATTR_FILE (1 << 13)
@@ -356,6 +357,10 @@ struct address_space_operations {
/* Set a page dirty. Return true if this dirtied it */
int (*set_page_dirty)(struct page *page);
+ /*
+ * Reads in the requested pages. Unlike ->readpage(), this is
+ * PURELY used for read-ahead!.
+ */
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 2b9b6f1ff5e0..b8f4182f42f1 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -84,6 +84,8 @@ struct fsnotify_event_private_data;
struct fsnotify_fname;
struct fsnotify_iter_info;
+struct mem_cgroup;
+
/*
* Each group much define these ops. The fsnotify infrastructure will call
* these operations for each relevant group.
@@ -127,6 +129,8 @@ struct fsnotify_event {
* everything will be cleaned up.
*/
struct fsnotify_group {
+ const struct fsnotify_ops *ops; /* how this group handles things */
+
/*
* How the refcnt is used is up to each group. When the refcnt hits 0
* fsnotify will clean up all of the resources associated with this group.
@@ -137,8 +141,6 @@ struct fsnotify_group {
*/
refcount_t refcnt; /* things with interest in this group */
- const struct fsnotify_ops *ops; /* how this group handles things */
-
/* needed to send notification to userspace */
spinlock_t notification_lock; /* protect the notification_list */
struct list_head notification_list; /* list of event_holder this group needs to send to userspace */
@@ -160,6 +162,8 @@ struct fsnotify_group {
atomic_t num_marks; /* 1 for each mark and 1 for not being
* past the point of no return when freeing
* a group */
+ atomic_t user_waits; /* Number of tasks waiting for user
+ * response */
struct list_head marks_list; /* all inode marks for this group */
struct fasync_struct *fsn_fa; /* async notification */
@@ -167,8 +171,8 @@ struct fsnotify_group {
struct fsnotify_event *overflow_event; /* Event we queue when the
* notification list is too
* full */
- atomic_t user_waits; /* Number of tasks waiting for user
- * response */
+
+ struct mem_cgroup *memcg; /* memcg to charge allocations */
/* groups can define private fields here or use the void *private */
union {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 36fa6a2a82e3..c39d9170a8a0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -348,9 +348,6 @@ struct hstate {
struct huge_bootmem_page {
struct list_head list;
struct hstate *hstate;
-#ifdef CONFIG_HIGHMEM
- phys_addr_t phys;
-#endif
};
struct page *alloc_huge_page(struct vm_area_struct *vma,
diff --git a/include/linux/init.h b/include/linux/init.h
index bc27cf03c41e..2538d176dd1f 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -116,8 +116,24 @@
typedef int (*initcall_t)(void);
typedef void (*exitcall_t)(void);
-extern initcall_t __con_initcall_start[], __con_initcall_end[];
-extern initcall_t __security_initcall_start[], __security_initcall_end[];
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+typedef int initcall_entry_t;
+
+static inline initcall_t initcall_from_entry(initcall_entry_t *entry)
+{
+ return offset_to_ptr(entry);
+}
+#else
+typedef initcall_t initcall_entry_t;
+
+static inline initcall_t initcall_from_entry(initcall_entry_t *entry)
+{
+ return *entry;
+}
+#endif
+
+extern initcall_entry_t __con_initcall_start[], __con_initcall_end[];
+extern initcall_entry_t __security_initcall_start[], __security_initcall_end[];
/* Used for contructor calls. */
typedef void (*ctor_fn_t)(void);
@@ -167,9 +183,20 @@ extern bool initcall_debug;
* as KEEP() in the linker script.
*/
-#define __define_initcall(fn, id) \
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+#define ___define_initcall(fn, id, __sec) \
+ __ADDRESSABLE(fn) \
+ asm(".section \"" #__sec ".init\", \"a\" \n" \
+ "__initcall_" #fn #id ": \n" \
+ ".long " #fn " - . \n" \
+ ".previous \n");
+#else
+#define ___define_initcall(fn, id, __sec) \
static initcall_t __initcall_##fn##id __used \
- __attribute__((__section__(".initcall" #id ".init"))) = fn;
+ __attribute__((__section__(#__sec ".init"))) = fn;
+#endif
+
+#define __define_initcall(fn, id) ___define_initcall(fn, id, .initcall##id)
/*
* Early initcalls run before initializing SMP.
@@ -208,13 +235,8 @@ extern bool initcall_debug;
#define __exitcall(fn) \
static exitcall_t __exitcall_##fn __exit_call = fn
-#define console_initcall(fn) \
- static initcall_t __initcall_##fn \
- __used __section(.con_initcall.init) = fn
-
-#define security_initcall(fn) \
- static initcall_t __initcall_##fn \
- __used __section(.security_initcall.init) = fn
+#define console_initcall(fn) ___define_initcall(fn,, .con_initcall)
+#define security_initcall(fn) ___define_initcall(fn,, .security_initcall)
struct obs_kernel_param {
const char *str;
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 6cea726612b7..6ab8c1bada3f 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -16,10 +16,9 @@ struct user_namespace;
struct ipc_ids {
int in_use;
unsigned short seq;
- bool tables_initialized;
struct rw_semaphore rwsem;
struct idr ipcs_idr;
- int max_id;
+ int max_idx;
#ifdef CONFIG_CHECKPOINT_RESTORE
int next_id;
#endif
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index de784fd11d12..46aae129917c 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -20,7 +20,7 @@ extern pmd_t kasan_zero_pmd[PTRS_PER_PMD];
extern pud_t kasan_zero_pud[PTRS_PER_PUD];
extern p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D];
-void kasan_populate_zero_shadow(const void *shadow_start,
+int kasan_populate_zero_shadow(const void *shadow_start,
const void *shadow_end);
static inline void *kasan_mem_to_shadow(const void *addr)
@@ -71,6 +71,9 @@ struct kasan_cache {
int kasan_module_alloc(void *addr, size_t size);
void kasan_free_shadow(const struct vm_struct *vm);
+int kasan_add_zero_shadow(void *start, unsigned long size);
+void kasan_remove_zero_shadow(void *start, unsigned long size);
+
size_t ksize(const void *);
static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); }
size_t kasan_metadata_size(struct kmem_cache *cache);
@@ -124,6 +127,14 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
static inline void kasan_free_shadow(const struct vm_struct *vm) {}
+static inline int kasan_add_zero_shadow(void *start, unsigned long size)
+{
+ return 0;
+}
+static inline void kasan_remove_zero_shadow(void *start,
+ unsigned long size)
+{}
+
static inline void kasan_unpoison_slab(const void *ptr) { }
static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index 8de55e4b5ee9..c20f296438fb 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -35,7 +35,7 @@ struct vmcoredd_node {
};
#ifdef CONFIG_PROC_KCORE
-extern void kclist_add(struct kcore_list *, void *, size_t, int type);
+void __init kclist_add(struct kcore_list *, void *, size_t, int type);
#else
static inline
void kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 941dc0a5a877..d6aac75b51ba 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -85,7 +85,23 @@
* arguments just once each.
*/
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
+/**
+ * round_up - round up to next specified power of 2
+ * @x: the value to round
+ * @y: multiple to round up to (must be a power of 2)
+ *
+ * Rounds @x up to next multiple of @y (which must be a power of 2).
+ * To perform arbitrary rounding up, use roundup() below.
+ */
#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
+/**
+ * round_down - round down to next specified power of 2
+ * @x: the value to round
+ * @y: multiple to round down to (must be a power of 2)
+ *
+ * Rounds @x down to next multiple of @y (which must be a power of 2).
+ * To perform arbitrary rounding down, use rounddown() below.
+ */
#define round_down(x, y) ((x) & ~__round_mask(x, y))
/**
@@ -110,13 +126,30 @@
# define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP(ll,d)
#endif
-/* The `const' in roundup() prevents gcc-3.3 from calling __divdi3 */
+/**
+ * roundup - round up to the next specified multiple
+ * @x: the value to up
+ * @y: multiple to round up to
+ *
+ * Rounds @x up to next multiple of @y. If @y will always be a power
+ * of 2, consider using the faster round_up().
+ *
+ * The `const' here prevents gcc-3.3 from calling __divdi3
+ */
#define roundup(x, y) ( \
{ \
const typeof(y) __y = y; \
(((x) + (__y - 1)) / __y) * __y; \
} \
)
+/**
+ * rounddown - round down to next specified multiple
+ * @x: the value to round
+ * @y: multiple to round down to
+ *
+ * Rounds @x down to next multiple of @y. If @y will always be a power
+ * of 2, consider using the faster round_down().
+ */
#define rounddown(x, y) ( \
{ \
typeof(x) __x = (x); \
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7c7362dd2faa..0205aee44ded 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1289,8 +1289,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
}
#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
-void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
- unsigned long start, unsigned long end);
+int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ unsigned long start, unsigned long end, bool blockable);
#ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu);
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 96def9d15b1b..aa5efd9351eb 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -42,7 +42,7 @@ struct list_lru_node {
spinlock_t lock;
/* global list, used for the root cgroup in cgroup aware lrus */
struct list_lru_one lru;
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
/* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
struct list_lru_memcg __rcu *memcg_lrus;
#endif
@@ -51,21 +51,25 @@ struct list_lru_node {
struct list_lru {
struct list_lru_node *node;
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
struct list_head list;
+ int shrinker_id;
#endif
};
void list_lru_destroy(struct list_lru *lru);
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
- struct lock_class_key *key);
+ struct lock_class_key *key, struct shrinker *shrinker);
-#define list_lru_init(lru) __list_lru_init((lru), false, NULL)
-#define list_lru_init_key(lru, key) __list_lru_init((lru), false, (key))
-#define list_lru_init_memcg(lru) __list_lru_init((lru), true, NULL)
+#define list_lru_init(lru) \
+ __list_lru_init((lru), false, NULL, NULL)
+#define list_lru_init_key(lru, key) \
+ __list_lru_init((lru), false, (key), NULL)
+#define list_lru_init_memcg(lru, shrinker) \
+ __list_lru_init((lru), true, NULL, shrinker)
int memcg_update_all_list_lrus(int num_memcgs);
-void memcg_drain_all_list_lrus(int src_idx, int dst_idx);
+void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg);
/**
* list_lru_add: add an element to the lru list's tail
@@ -162,6 +166,23 @@ unsigned long list_lru_walk_one(struct list_lru *lru,
int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk);
+/**
+ * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items.
+ * @lru: the lru pointer.
+ * @nid: the node id to scan from.
+ * @memcg: the cgroup to scan from.
+ * @isolate: callback function that is resposible for deciding what to do with
+ * the item currently being scanned
+ * @cb_arg: opaque type that will be passed to @isolate
+ * @nr_to_walk: how many items to scan.
+ *
+ * Same as @list_lru_walk_one except that the spinlock is acquired with
+ * spin_lock_irq().
+ */
+unsigned long list_lru_walk_one_irq(struct list_lru *lru,
+ int nid, struct mem_cgroup *memcg,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk);
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk);
@@ -175,6 +196,14 @@ list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
}
static inline unsigned long
+list_lru_shrink_walk_irq(struct list_lru *lru, struct shrink_control *sc,
+ list_lru_walk_cb isolate, void *cb_arg)
+{
+ return list_lru_walk_one_irq(lru, sc->nid, sc->memcg, isolate, cb_arg,
+ &sc->nr_to_scan);
+}
+
+static inline unsigned long
list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
void *cb_arg, unsigned long nr_to_walk)
{
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 516920549378..58697ad64cd3 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -232,6 +232,8 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+int memblock_search_pfn_regions(unsigned long pfn);
+
/**
* for_each_free_mem_range - iterate through free memblock areas
* @i: u64 used as loop variable
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 680d3395fc83..652f602167df 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -112,6 +112,15 @@ struct lruvec_stat {
};
/*
+ * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
+ * which have elements charged to this memcg.
+ */
+struct memcg_shrinker_map {
+ struct rcu_head rcu;
+ unsigned long map[0];
+};
+
+/*
* per-zone information in memory controller.
*/
struct mem_cgroup_per_node {
@@ -124,6 +133,9 @@ struct mem_cgroup_per_node {
struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
+#ifdef CONFIG_MEMCG_KMEM
+ struct memcg_shrinker_map __rcu *shrinker_map;
+#endif
struct rb_node tree_node; /* RB tree node */
unsigned long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
@@ -213,6 +225,11 @@ struct mem_cgroup {
*/
bool use_hierarchy;
+ /*
+ * Should the OOM killer kill all belonging tasks, had it kill one?
+ */
+ bool oom_group;
+
/* protected by memcg_oom_lock */
bool oom_lock;
int under_oom;
@@ -271,7 +288,7 @@ struct mem_cgroup {
bool tcpmem_active;
int tcpmem_pressure;
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
/* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
enum memcg_kmem_state kmem_state;
@@ -306,6 +323,11 @@ struct mem_cgroup {
extern struct mem_cgroup *root_mem_cgroup;
+static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
+{
+ return (memcg == root_mem_cgroup);
+}
+
static inline bool mem_cgroup_disabled(void)
{
return !cgroup_subsys_enabled(memory_cgrp_subsys);
@@ -373,11 +395,21 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
+
+struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
+
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
}
+static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+ if (memcg)
+ css_put(&memcg->css);
+}
+
#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -497,16 +529,16 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p);
-static inline void mem_cgroup_oom_enable(void)
+static inline void mem_cgroup_enter_user_fault(void)
{
- WARN_ON(current->memcg_may_oom);
- current->memcg_may_oom = 1;
+ WARN_ON(current->in_user_fault);
+ current->in_user_fault = 1;
}
-static inline void mem_cgroup_oom_disable(void)
+static inline void mem_cgroup_exit_user_fault(void)
{
- WARN_ON(!current->memcg_may_oom);
- current->memcg_may_oom = 0;
+ WARN_ON(!current->in_user_fault);
+ current->in_user_fault = 0;
}
static inline bool task_in_memcg_oom(struct task_struct *p)
@@ -515,6 +547,9 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
}
bool mem_cgroup_oom_synchronize(bool wait);
+struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
+ struct mem_cgroup *oom_domain);
+void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
#ifdef CONFIG_MEMCG_SWAP
extern int do_swap_account;
@@ -762,6 +797,11 @@ void mem_cgroup_split_huge_fixup(struct page *head);
struct mem_cgroup;
+static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
+{
+ return true;
+}
+
static inline bool mem_cgroup_disabled(void)
{
return true;
@@ -850,6 +890,20 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
return true;
}
+static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+{
+ return NULL;
+}
+
+static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
+{
+ return NULL;
+}
+
+static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+}
+
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -937,11 +991,11 @@ static inline void mem_cgroup_handle_over_high(void)
{
}
-static inline void mem_cgroup_oom_enable(void)
+static inline void mem_cgroup_enter_user_fault(void)
{
}
-static inline void mem_cgroup_oom_disable(void)
+static inline void mem_cgroup_exit_user_fault(void)
{
}
@@ -955,6 +1009,16 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
return false;
}
+static inline struct mem_cgroup *mem_cgroup_get_oom_group(
+ struct task_struct *victim, struct mem_cgroup *oom_domain)
+{
+ return NULL;
+}
+
+static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
+{
+}
+
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
int idx)
{
@@ -1207,7 +1271,7 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
int memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
void memcg_kmem_uncharge(struct page *page, int order);
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
extern struct static_key_false memcg_kmem_enabled_key;
extern struct workqueue_struct *memcg_kmem_cache_wq;
@@ -1238,6 +1302,10 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
}
+extern int memcg_expand_shrinker_maps(int new_id);
+
+extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
+ int nid, int shrinker_id);
#else
#define for_each_memcg_cache_index(_idx) \
for (; NULL; )
@@ -1260,6 +1328,8 @@ static inline void memcg_put_cache_ids(void)
{
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
+ int nid, int shrinker_id) { }
+#endif /* CONFIG_MEMCG_KMEM */
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 4e9828cda7a2..34a28227068d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -319,6 +319,7 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
static inline void remove_memory(int nid, u64 start, u64 size) {}
#endif /* CONFIG_MEMORY_HOTREMOVE */
+extern void __ref free_area_init_core_hotplug(int nid);
extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
void *arg, int (*func)(struct memory_block *, void *));
extern int add_memory(int nid, u64 start, u64 size);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ebe53e98d4b8..ddf844296060 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -456,6 +456,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
static const struct vm_operations_struct dummy_vm_ops = {};
+ memset(vma, 0, sizeof(*vma));
vma->vm_mm = mm;
vma->vm_ops = &dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
@@ -959,15 +960,6 @@ static inline int page_zone_id(struct page *page)
return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
}
-static inline int zone_to_nid(struct zone *zone)
-{
-#ifdef CONFIG_NUMA
- return zone->node;
-#else
- return 0;
-#endif
-}
-
#ifdef NODE_NOT_IN_PAGE_FLAGS
extern int page_to_nid(const struct page *page);
#else
@@ -2023,7 +2015,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
extern void __init pagecache_init(void);
extern void free_area_init(unsigned long * zones_size);
-extern void free_area_init_node(int nid, unsigned long * zones_size,
+extern void __init free_area_init_node(int nid, unsigned long * zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
extern void free_initmem(void);
@@ -2665,12 +2657,7 @@ extern int randomize_va_space;
const char * arch_vma_name(struct vm_area_struct *vma);
void print_vma_addr(char *prefix, unsigned long rip);
-void sparse_mem_maps_populate_node(struct page **map_map,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count,
- int nodeid);
-
+void *sparse_buffer_alloc(unsigned long size);
struct page *sparse_mem_map_populate(unsigned long pnum, int nid,
struct vmem_altmap *altmap);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
@@ -2753,7 +2740,8 @@ extern void clear_huge_page(struct page *page,
unsigned long addr_hint,
unsigned int pages_per_huge_page);
extern void copy_user_huge_page(struct page *dst, struct page *src,
- unsigned long addr, struct vm_area_struct *vma,
+ unsigned long addr_hint,
+ struct vm_area_struct *vma,
unsigned int pages_per_huge_page);
extern long copy_huge_page_from_user(struct page *dst_page,
const void __user *usr_src,
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 392e6af82701..133ba78820ee 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -151,13 +151,15 @@ struct mmu_notifier_ops {
* address space but may still be referenced by sptes until
* the last refcount is dropped.
*
- * If both of these callbacks cannot block, and invalidate_range
- * cannot block, mmu_notifier_ops.flags should have
- * MMU_INVALIDATE_DOES_NOT_BLOCK set.
+ * If blockable argument is set to false then the callback cannot
+ * sleep and has to return with -EAGAIN. 0 should be returned
+ * otherwise.
+ *
*/
- void (*invalidate_range_start)(struct mmu_notifier *mn,
+ int (*invalidate_range_start)(struct mmu_notifier *mn,
struct mm_struct *mm,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end,
+ bool blockable);
void (*invalidate_range_end)(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start, unsigned long end);
@@ -229,8 +231,9 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address);
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
unsigned long address, pte_t pte);
-extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
- unsigned long start, unsigned long end);
+extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ bool blockable);
extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
unsigned long start, unsigned long end,
bool only_end);
@@ -281,7 +284,15 @@ static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
if (mm_has_notifiers(mm))
- __mmu_notifier_invalidate_range_start(mm, start, end);
+ __mmu_notifier_invalidate_range_start(mm, start, end, true);
+}
+
+static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ if (mm_has_notifiers(mm))
+ return __mmu_notifier_invalidate_range_start(mm, start, end, false);
+ return 0;
}
static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -461,6 +472,12 @@ static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
{
}
+static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ return 0;
+}
+
static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 32699b2dc52a..0859130e4db8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -755,25 +755,6 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
}
-static inline int zone_id(const struct zone *zone)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
-
- return zone - pgdat->node_zones;
-}
-
-#ifdef CONFIG_ZONE_DEVICE
-static inline bool is_dev_zone(const struct zone *zone)
-{
- return zone_id(zone) == ZONE_DEVICE;
-}
-#else
-static inline bool is_dev_zone(const struct zone *zone)
-{
- return false;
-}
-#endif
-
#include <linux/memory_hotplug.h>
void build_all_zonelists(pg_data_t *pgdat);
@@ -824,6 +805,18 @@ static inline int local_memory_node(int node_id) { return node_id; };
*/
#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
+#ifdef CONFIG_ZONE_DEVICE
+static inline bool is_dev_zone(const struct zone *zone)
+{
+ return zone_idx(zone) == ZONE_DEVICE;
+}
+#else
+static inline bool is_dev_zone(const struct zone *zone)
+{
+ return false;
+}
+#endif
+
/*
* Returns true if a zone has pages managed by the buddy allocator.
* All the reclaim decisions have to use this function rather than
@@ -841,6 +834,25 @@ static inline bool populated_zone(struct zone *zone)
return zone->present_pages;
}
+#ifdef CONFIG_NUMA
+static inline int zone_to_nid(struct zone *zone)
+{
+ return zone->node;
+}
+
+static inline void zone_set_nid(struct zone *zone, int nid)
+{
+ zone->node = nid;
+}
+#else
+static inline int zone_to_nid(struct zone *zone)
+{
+ return 0;
+}
+
+static inline void zone_set_nid(struct zone *zone, int nid) {}
+#endif
+
extern int movable_zone;
#ifdef CONFIG_HIGHMEM
@@ -956,12 +968,7 @@ static inline int zonelist_zone_idx(struct zoneref *zoneref)
static inline int zonelist_node_idx(struct zoneref *zoneref)
{
-#ifdef CONFIG_NUMA
- /* zone_to_nid not available in this context */
- return zoneref->zone->node;
-#else
- return 0;
-#endif /* CONFIG_NUMA */
+ return zone_to_nid(zoneref->zone);
}
struct zoneref *__next_zones_zonelist(struct zoneref *z,
@@ -1241,6 +1248,8 @@ static inline int pfn_valid(unsigned long pfn)
return 0;
return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
}
+
+#define next_valid_pfn(pfn) (pfn + 1)
#endif
static inline int pfn_present(unsigned long pfn)
@@ -1265,7 +1274,16 @@ static inline int pfn_present(unsigned long pfn)
#define pfn_to_nid(pfn) (0)
#endif
+#ifdef CONFIG_HAVE_MEMBLOCK_PFN_VALID
+extern ulong memblock_next_valid_pfn(ulong pfn);
+#define next_valid_pfn(pfn) memblock_next_valid_pfn(pfn)
+
+extern int pfn_valid_region(ulong pfn);
+#define early_pfn_valid(pfn) pfn_valid_region(pfn)
+#else
#define early_pfn_valid(pfn) pfn_valid(pfn)
+#endif /*CONFIG_HAVE_ARCH_PFN_VALID*/
+
void sparse_init(void);
#else
#define sparse_init() do {} while (0)
@@ -1287,6 +1305,11 @@ struct mminit_pfnnid_cache {
#define early_pfn_valid(pfn) (1)
#endif
+/* fallback to default definitions*/
+#ifndef next_valid_pfn
+#define next_valid_pfn(pfn) (pfn + 1)
+#endif
+
void memory_present(int nid, unsigned long start, unsigned long end);
/*
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index db99240d00bd..c79e859408e6 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -363,7 +363,6 @@ static inline void net_dim_sample(u16 event_ctr,
}
#define NET_DIM_NEVENTS 64
-#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
#define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) & (BIT_ULL(bits) - 1))
static inline void net_dim_calc_stats(struct net_dim_sample *start,
diff --git a/include/linux/node.h b/include/linux/node.h
index 6d336e38d155..257bb3d6d014 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -33,10 +33,10 @@ typedef void (*node_registration_func_t)(struct node *);
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA)
extern int link_mem_sections(int nid, unsigned long start_pfn,
- unsigned long nr_pages, bool check_nid);
+ unsigned long end_pfn);
#else
static inline int link_mem_sections(int nid, unsigned long start_pfn,
- unsigned long nr_pages, bool check_nid)
+ unsigned long end_pfn)
{
return 0;
}
@@ -54,12 +54,14 @@ static inline int register_one_node(int nid)
if (node_online(nid)) {
struct pglist_data *pgdat = NODE_DATA(nid);
+ unsigned long start_pfn = pgdat->node_start_pfn;
+ unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
error = __register_one_node(nid);
if (error)
return error;
/* link memory sections under this node */
- error = link_mem_sections(nid, pgdat->node_start_pfn, pgdat->node_spanned_pages, true);
+ error = link_mem_sections(nid, start_pfn, end_pfn);
}
return error;
@@ -69,7 +71,7 @@ extern void unregister_one_node(int nid);
extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
extern int register_mem_sect_under_node(struct memory_block *mem_blk,
- int nid, bool check_nid);
+ void *arg);
extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
unsigned long phys_index);
@@ -99,7 +101,7 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
return 0;
}
static inline int register_mem_sect_under_node(struct memory_block *mem_blk,
- int nid, bool check_nid)
+ void *arg)
{
return 0;
}
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6adac113e96d..92f70e4c6252 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -95,7 +95,7 @@ static inline int check_stable_address_space(struct mm_struct *mm)
return 0;
}
-void __oom_reap_task_mm(struct mm_struct *mm);
+bool __oom_reap_task_mm(struct mm_struct *mm);
extern unsigned long oom_badness(struct task_struct *p,
struct mem_cgroup *memcg, const nodemask_t *nodemask,
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 901943e4754b..74bee8cecf4c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -369,8 +369,13 @@ PAGEFLAG_FALSE(Uncached)
PAGEFLAG(HWPoison, hwpoison, PF_ANY)
TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
+extern bool set_hwpoison_free_buddy_page(struct page *page);
#else
PAGEFLAG_FALSE(HWPoison)
+static inline bool set_hwpoison_free_buddy_page(struct page *page)
+{
+ return 0;
+}
#define __PG_HWPOISON 0
#endif
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index ca5461efae2f..f84f167ec04c 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -16,18 +16,7 @@ struct page_ext_operations {
#ifdef CONFIG_PAGE_EXTENSION
-/*
- * page_ext->flags bits:
- *
- * PAGE_EXT_DEBUG_POISON is set for poisoned pages. This is used to
- * implement generic debug pagealloc feature. The pages are filled with
- * poison patterns and set this flag after free_pages(). The poisoned
- * pages are verified whether the patterns are not corrupted and clear
- * the flag before alloc_pages().
- */
-
enum page_ext_flags {
- PAGE_EXT_DEBUG_POISON, /* Page is poisoned */
PAGE_EXT_DEBUG_GUARD,
PAGE_EXT_OWNER,
#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
@@ -61,7 +50,7 @@ static inline void page_ext_init(void)
}
#endif
-struct page_ext *lookup_page_ext(struct page *page);
+struct page_ext *lookup_page_ext(const struct page *page);
#else /* !CONFIG_PAGE_EXTENSION */
struct page_ext;
@@ -70,7 +59,7 @@ static inline void pgdat_page_ext_init(struct pglist_data *pgdat)
{
}
-static inline struct page_ext *lookup_page_ext(struct page *page)
+static inline struct page_ext *lookup_page_ext(const struct page *page)
{
return NULL;
}
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 9b87f1936906..e72ca8dd6241 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1809,7 +1809,11 @@ struct pci_fixup {
u16 device; /* Or PCI_ANY_ID */
u32 class; /* Or PCI_ANY_ID */
unsigned int class_shift; /* should be 0, 8, 16 */
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ int hook_offset;
+#else
void (*hook)(struct pci_dev *dev);
+#endif
};
enum pci_fixup_pass {
@@ -1823,12 +1827,28 @@ enum pci_fixup_pass {
pci_fixup_suspend_late, /* pci_device_suspend_late() */
};
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+#define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class, \
+ class_shift, hook) \
+ __ADDRESSABLE(hook) \
+ asm(".section " #sec ", \"a\" \n" \
+ ".balign 16 \n" \
+ ".short " #vendor ", " #device " \n" \
+ ".long " #class ", " #class_shift " \n" \
+ ".long " #hook " - . \n" \
+ ".previous \n");
+#define DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class, \
+ class_shift, hook) \
+ __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class, \
+ class_shift, hook)
+#else
/* Anonymous variables would be nice... */
#define DECLARE_PCI_FIXUP_SECTION(section, name, vendor, device, class, \
class_shift, hook) \
static const struct pci_fixup __PASTE(__pci_fixup_##name,__LINE__) __used \
__attribute__((__section__(#section), aligned((sizeof(void *))))) \
= { vendor, device, class, class_shift, hook };
+#endif
#define DECLARE_PCI_FIXUP_CLASS_EARLY(vendor, device, class, \
class_shift, hook) \
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 296bbe49d5d1..70b7123f38c7 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -149,4 +149,6 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
(typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
__alignof__(type))
+extern unsigned long pcpu_nr_pages(void);
+
#endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 626fc65c4336..d0e1f1522a78 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -129,7 +129,7 @@ int open_related_ns(struct ns_common *ns,
struct ns_common *(*get_ns)(struct ns_common *ns));
/* get the associated pid namespace for a file in procfs */
-static inline struct pid_namespace *proc_pid_ns(struct inode *inode)
+static inline struct pid_namespace *proc_pid_ns(const struct inode *inode)
{
return inode->i_sb->s_fs_info;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 10e754c03d97..42e7ee7ea2b9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -722,8 +722,8 @@ struct task_struct {
unsigned restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
- unsigned memcg_may_oom:1;
-#ifndef CONFIG_SLOB
+ unsigned in_user_fault:1;
+#ifdef CONFIG_MEMCG_KMEM
unsigned memcg_kmem_skip_account:1;
#endif
#endif
@@ -854,6 +854,7 @@ struct task_struct {
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long last_switch_count;
+ unsigned long last_switch_time;
#endif
/* Filesystem information: */
struct fs_struct *fs;
@@ -1153,6 +1154,9 @@ struct task_struct {
/* Number of pages to reclaim on returning to userland: */
unsigned int memcg_nr_pages_over_high;
+
+ /* Used by memcontrol for targeted memcg charge: */
+ struct mem_cgroup *active_memcg;
#endif
#ifdef CONFIG_BLK_CGROUP
@@ -1188,6 +1192,13 @@ struct task_struct {
void *security;
#endif
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ unsigned long getblk_stamp;
+ unsigned int getblk_executed;
+ unsigned int getblk_bh_count;
+ unsigned long getblk_bh_state;
+#endif
+
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
unsigned long lowest_stack;
unsigned long prev_lowest_stack;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 44d356f5e47c..aebb370a0006 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -248,6 +248,43 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
current->flags = (current->flags & ~PF_MEMALLOC) | flags;
}
+#ifdef CONFIG_MEMCG
+/**
+ * memalloc_use_memcg - Starts the remote memcg charging scope.
+ * @memcg: memcg to charge.
+ *
+ * This function marks the beginning of the remote memcg charging scope. All the
+ * __GFP_ACCOUNT allocations till the end of the scope will be charged to the
+ * given memcg.
+ *
+ * NOTE: This function is not nesting safe.
+ */
+static inline void memalloc_use_memcg(struct mem_cgroup *memcg)
+{
+ WARN_ON_ONCE(current->active_memcg);
+ current->active_memcg = memcg;
+}
+
+/**
+ * memalloc_unuse_memcg - Ends the remote memcg charging scope.
+ *
+ * This function marks the end of the remote memcg charging scope started by
+ * memalloc_use_memcg().
+ */
+static inline void memalloc_unuse_memcg(void)
+{
+ current->active_memcg = NULL;
+}
+#else
+static inline void memalloc_use_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void memalloc_unuse_memcg(void)
+{
+}
+#endif
+
#ifdef CONFIG_MEMBARRIER
enum {
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 4e9b77fb702d..9d223c91008d 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -112,6 +112,9 @@ struct signal_struct {
int group_stop_count;
unsigned int flags; /* see SIGNAL_* flags below */
+ /* The signal sent when the parent dies: */
+ int pdeath_signal_proc;
+
/*
* PR_SET_CHILD_SUBREAPER marks a process, like a service
* manager, to re-parent orphan (double-forking) child processes
@@ -323,7 +326,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey);
int force_sig_ptrace_errno_trap(int errno, void __user *addr);
extern int send_sig_info(int, struct siginfo *, struct task_struct *);
-extern int force_sigsegv(int, struct task_struct *);
+extern void force_sigsegv(int sig, struct task_struct *p);
extern int force_sig_info(int, struct siginfo *, struct task_struct *);
extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 913488d828cb..a9c32daeb9d8 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -10,6 +10,7 @@ struct ctl_table;
extern int sysctl_hung_task_check_count;
extern unsigned int sysctl_hung_task_panic;
extern unsigned long sysctl_hung_task_timeout_secs;
+extern unsigned long sysctl_hung_task_check_interval_secs;
extern int sysctl_hung_task_warnings;
extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
void __user *buffer,
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 96fe289c4c6e..39ad98c09c58 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -4,6 +4,7 @@
#include <linux/uidgid.h>
#include <linux/atomic.h>
+#include <linux/refcount.h>
#include <linux/ratelimit.h>
struct key;
@@ -12,7 +13,7 @@ struct key;
* Some day this will be a full-fledged user tracking system..
*/
struct user_struct {
- atomic_t __count; /* reference count */
+ refcount_t __count; /* reference count */
atomic_t processes; /* How many processes does this user have? */
atomic_t sigpending; /* How many pending signals does this user have? */
#ifdef CONFIG_FANOTIFY
@@ -59,7 +60,7 @@ extern struct user_struct root_user;
extern struct user_struct * alloc_uid(kuid_t);
static inline struct user_struct *get_uid(struct user_struct *u)
{
- atomic_inc(&u->__count);
+ refcount_inc(&u->__count);
return u;
}
extern void free_uid(struct user_struct *);
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 6794490f25b2..9443cafd1969 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -12,6 +12,9 @@
struct shrink_control {
gfp_t gfp_mask;
+ /* current node being shrunk (for NUMA aware shrinkers) */
+ int nid;
+
/*
* How many objects scan_objects should scan and try to reclaim.
* This is reset before every call, so it is safe for callees
@@ -26,20 +29,20 @@ struct shrink_control {
*/
unsigned long nr_scanned;
- /* current node being shrunk (for NUMA aware shrinkers) */
- int nid;
-
/* current memcg being shrunk (for memcg aware shrinkers) */
struct mem_cgroup *memcg;
};
#define SHRINK_STOP (~0UL)
+#define SHRINK_EMPTY (~0UL - 1)
/*
* A callback you can register to apply pressure to ageable caches.
*
* @count_objects should return the number of freeable items in the cache. If
- * there are no objects to free or the number of freeable items cannot be
- * determined, it should return 0. No deadlock checks should be done during the
+ * there are no objects to free, it should return SHRINK_EMPTY, while 0 is
+ * returned in cases of the number of freeable items cannot be determined
+ * or shrinker should skip this cache for this time (e.g., their number
+ * is below shrinkable limit). No deadlock checks should be done during the
* count callback - the shrinker relies on aggregating scan counts that couldn't
* be executed due to potential deadlocks to be run at a later call when the
* deadlock condition is no longer pending.
@@ -60,12 +63,16 @@ struct shrinker {
unsigned long (*scan_objects)(struct shrinker *,
struct shrink_control *sc);
- int seeks; /* seeks to recreate an obj */
long batch; /* reclaim batch size, 0 = default */
- unsigned long flags;
+ int seeks; /* seeks to recreate an obj */
+ unsigned flags;
/* These are for internal use */
struct list_head list;
+#ifdef CONFIG_MEMCG_KMEM
+ /* ID in shrinker_idr */
+ int id;
+#endif
/* objs pending delete, per node */
atomic_long_t *nr_deferred;
};
diff --git a/include/linux/signal.h b/include/linux/signal.h
index fe125b0335f7..3d4cd5db30a9 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -267,7 +267,7 @@ extern void set_current_blocked(sigset_t *);
extern void __set_current_blocked(const sigset_t *);
extern int show_unhandled_signals;
-extern int get_signal(struct ksignal *ksig);
+extern bool get_signal(struct ksignal *ksig);
extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping);
extern void exit_signals(struct task_struct *tsk);
extern void kernel_sigaction(int, __sighandler_t);
@@ -289,7 +289,7 @@ static inline void disallow_signal(int sig)
extern struct kmem_cache *sighand_cachep;
-int unhandled_signal(struct task_struct *tsk, int sig);
+extern bool unhandled_signal(struct task_struct *tsk, int sig);
/*
* In POSIX a signal is sent either to a specific thread (Linux task)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 14e3fe4bd6a1..ed9cbddeb4a6 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -97,7 +97,7 @@
# define SLAB_FAILSLAB 0
#endif
/* Account to memcg */
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000U)
#else
# define SLAB_ACCOUNT 0
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8de15e44842f..378792e41043 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -172,8 +172,9 @@ enum {
SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */
SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */
+ SWP_VALID = (1 << 12), /* swap is valid to be operated on? */
/* add others here before... */
- SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */
+ SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */
};
#define SWAP_CLUSTER_MAX 32UL
@@ -442,7 +443,7 @@ extern void si_swapinfo(struct sysinfo *);
extern swp_entry_t get_swap_page(struct page *page);
extern void put_swap_page(struct page *page, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
-extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]);
+extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t);
extern int swap_duplicate(swp_entry_t);
@@ -455,7 +456,7 @@ extern unsigned int count_swap_pages(int, int);
extern sector_t map_swap_page(struct page *, struct block_device **);
extern sector_t swapdev_block(int, pgoff_t);
extern int page_swapcount(struct page *);
-extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry);
+extern int __swap_count(swp_entry_t entry);
extern int __swp_swapcount(swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
@@ -465,6 +466,12 @@ extern int try_to_free_swap(struct page *);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
+extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
+
+static inline void put_swap_device(struct swap_info_struct *si)
+{
+ preempt_enable();
+}
#else /* CONFIG_SWAP */
@@ -571,7 +578,7 @@ static inline int page_swapcount(struct page *page)
return 0;
}
-static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+static inline int __swap_count(swp_entry_t entry)
{
return 0;
}
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 9c0eb4d4f444..fe8e08b08c38 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -335,11 +335,6 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
return swp_type(entry) == SWP_HWPOISON;
}
-static inline bool test_set_page_hwpoison(struct page *page)
-{
- return TestSetPageHWPoison(page);
-}
-
static inline void num_poisoned_pages_inc(void)
{
atomic_long_inc(&num_poisoned_pages);
@@ -362,11 +357,6 @@ static inline int is_hwpoison_entry(swp_entry_t swp)
return 0;
}
-static inline bool test_set_page_hwpoison(struct page *page)
-{
- return false;
-}
-
static inline void num_poisoned_pages_inc(void)
{
}
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index d9a084c72541..7f2e16e76ac4 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -249,6 +249,19 @@ extern void syscall_unregfunc(void);
return static_key_false(&__tracepoint_##name.key); \
}
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+#define __TRACEPOINT_ENTRY(name) \
+ asm(" .section \"__tracepoints_ptrs\", \"a\" \n" \
+ " .balign 4 \n" \
+ " .long __tracepoint_" #name " - . \n" \
+ " .previous \n")
+#else
+#define __TRACEPOINT_ENTRY(name) \
+ static struct tracepoint * const __tracepoint_ptr_##name __used \
+ __attribute__((section("__tracepoints_ptrs"))) = \
+ &__tracepoint_##name
+#endif
+
/*
* We have no guarantee that gcc and the linker won't up-align the tracepoint
* structures, so we create an array of pointers that will be used for iteration
@@ -258,11 +271,9 @@ extern void syscall_unregfunc(void);
static const char __tpstrtab_##name[] \
__attribute__((section("__tracepoints_strings"))) = #name; \
struct tracepoint __tracepoint_##name \
- __attribute__((section("__tracepoints"))) = \
+ __attribute__((section("__tracepoints"), used)) = \
{ __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\
- static struct tracepoint * const __tracepoint_ptr_##name __used \
- __attribute__((section("__tracepoints_ptrs"))) = \
- &__tracepoint_##name;
+ __TRACEPOINT_ENTRY(name);
#define DEFINE_TRACE(name) \
DEFINE_TRACE_FN(name, NULL, NULL);
diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h
index a5b3aa8d281f..3e9a963edd6a 100644
--- a/include/linux/vmacache.h
+++ b/include/linux/vmacache.h
@@ -5,12 +5,6 @@
#include <linux/sched.h>
#include <linux/mm.h>
-/*
- * Hash based on the page number. Provides a good hit rate for
- * workloads with good locality and those with random accesses as well.
- */
-#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK)
-
static inline void vmacache_flush(struct task_struct *tsk)
{
memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas));
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 6a17f856f841..381cdf5a9bd1 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -119,7 +119,8 @@ typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end,
*/
int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
u64 start, u64 end,
- umem_call_back cb, void *cookie);
+ umem_call_back cb,
+ bool blockable, void *cookie);
/*
* Find first region intersecting with address range.
diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
index e13eec3dfb2f..df31aa9c9a8c 100644
--- a/include/uapi/linux/auto_fs.h
+++ b/include/uapi/linux/auto_fs.h
@@ -23,7 +23,7 @@
#define AUTOFS_MIN_PROTO_VERSION 3
#define AUTOFS_MAX_PROTO_VERSION 5
-#define AUTOFS_PROTO_SUBVERSION 2
+#define AUTOFS_PROTO_SUBVERSION 3
/*
* The wait_queue_token (autofs_wqt_t) is part of a structure which is passed
@@ -90,8 +90,10 @@ enum {
/* autofs version 4 and later definitions */
/* Mask for expire behaviour */
-#define AUTOFS_EXP_IMMEDIATE 1
-#define AUTOFS_EXP_LEAVES 2
+#define AUTOFS_EXP_NORMAL 0x00
+#define AUTOFS_EXP_IMMEDIATE 0x01
+#define AUTOFS_EXP_LEAVES 0x02
+#define AUTOFS_EXP_FORCED 0x04
#define AUTOFS_TYPE_ANY 0U
#define AUTOFS_TYPE_INDIRECT 1U
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index c0d7ea0bf5b6..76a12ee8ccfd 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -219,4 +219,8 @@ struct prctl_mm_map {
# define PR_SPEC_DISABLE (1UL << 2)
# define PR_SPEC_FORCE_DISABLE (1UL << 3)
+/* Process-based variant of PDEATHSIG */
+#define PR_SET_PDEATHSIG_PROC 48
+#define PR_GET_PDEATHSIG_PROC 49
+
#endif /* _LINUX_PRCTL_H */
diff --git a/init/Kconfig b/init/Kconfig
index c174655be2b2..db78a8e74464 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -436,7 +436,7 @@ config BSD_PROCESS_ACCT_V3
help
If you say Y here, the process accounting information is written
in a new file format that also logs the process IDs of each
- process and it's parent. Note that this file format is incompatible
+ process and its parent. Note that this file format is incompatible
with previous v0/v1/v2 file formats, so you will need updated tools
for processing it. A preliminary version of these tools is available
at <http://www.gnu.org/software/acct/>.
@@ -708,6 +708,11 @@ config MEMCG_SWAP_ENABLED
select this option (if, for some reason, they need to disable it
then swapaccount=0 does the trick).
+config MEMCG_KMEM
+ bool
+ depends on MEMCG && !SLOB
+ default y
+
config BLK_CGROUP
bool "IO controller"
depends on BLOCK
@@ -962,6 +967,18 @@ config NET_NS
endif # NAMESPACES
+config CHECKPOINT_RESTORE
+ bool "Checkpoint/restore support"
+ select PROC_CHILDREN
+ default n
+ help
+ Enables additional kernel features in a sake of checkpoint/restore.
+ In particular it adds auxiliary prctl codes to setup process text,
+ data and heap segment sizes, and a few additional /proc filesystem
+ entries.
+
+ If unsure, say N here.
+
config SCHED_AUTOGROUP
bool "Automatic process group scheduling"
select CGROUPS
@@ -1371,18 +1388,6 @@ config MEMBARRIER
If unsure, say Y.
-config CHECKPOINT_RESTORE
- bool "Checkpoint/restore support" if EXPERT
- select PROC_CHILDREN
- default n
- help
- Enables additional kernel features in a sake of checkpoint/restore.
- In particular it adds auxiliary prctl codes to setup process text,
- data and heap segment sizes, and a few additional /proc filesystem
- entries.
-
- If unsure, say N here.
-
config KALLSYMS
bool "Load all symbols for debugging/ksymoops" if EXPERT
default y
@@ -1687,7 +1692,7 @@ config MMAP_ALLOW_UNINITIALIZED
default n
help
Normally, and according to the Linux spec, anonymous memory obtained
- from mmap() has it's contents cleared before it is passed to
+ from mmap() has its contents cleared before it is passed to
userspace. Enabling this config option allows you to request that
mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus
providing a huge performance boost. If this option is not enabled,
diff --git a/init/do_mounts.c b/init/do_mounts.c
index d4fc2a5afdb6..d95435fd37b5 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -1,13 +1,3 @@
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers. To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/ctype.h>
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index 65de0412f80f..56a557403d39 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -1,14 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers. To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
#include <linux/unistd.h>
#include <linux/kernel.h>
#include <linux/fs.h>
diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c
index 7d85d172bc7e..b84031528dd4 100644
--- a/init/do_mounts_md.c
+++ b/init/do_mounts_md.c
@@ -1,14 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers. To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
#include <linux/delay.h>
#include <linux/raid/md_u.h>
#include <linux/raid/md_p.h>
diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c
index 035a5f0ab26b..32fb049d18f9 100644
--- a/init/do_mounts_rd.c
+++ b/init/do_mounts_rd.c
@@ -1,14 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers. To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/minix_fs.h>
diff --git a/init/initramfs.c b/init/initramfs.c
index 13643c46ebab..640557788026 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -1,14 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers. To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/slab.h>
diff --git a/init/main.c b/init/main.c
index b729e1f22838..18f8f0140fa0 100644
--- a/init/main.c
+++ b/init/main.c
@@ -902,18 +902,18 @@ int __init_or_module do_one_initcall(initcall_t fn)
}
-extern initcall_t __initcall_start[];
-extern initcall_t __initcall0_start[];
-extern initcall_t __initcall1_start[];
-extern initcall_t __initcall2_start[];
-extern initcall_t __initcall3_start[];
-extern initcall_t __initcall4_start[];
-extern initcall_t __initcall5_start[];
-extern initcall_t __initcall6_start[];
-extern initcall_t __initcall7_start[];
-extern initcall_t __initcall_end[];
-
-static initcall_t *initcall_levels[] __initdata = {
+extern initcall_entry_t __initcall_start[];
+extern initcall_entry_t __initcall0_start[];
+extern initcall_entry_t __initcall1_start[];
+extern initcall_entry_t __initcall2_start[];
+extern initcall_entry_t __initcall3_start[];
+extern initcall_entry_t __initcall4_start[];
+extern initcall_entry_t __initcall5_start[];
+extern initcall_entry_t __initcall6_start[];
+extern initcall_entry_t __initcall7_start[];
+extern initcall_entry_t __initcall_end[];
+
+static initcall_entry_t *initcall_levels[] __initdata = {
__initcall0_start,
__initcall1_start,
__initcall2_start,
@@ -939,7 +939,7 @@ static char *initcall_level_names[] __initdata = {
static void __init do_initcall_level(int level)
{
- initcall_t *fn;
+ initcall_entry_t *fn;
strcpy(initcall_command_line, saved_command_line);
parse_args(initcall_level_names[level],
@@ -950,7 +950,7 @@ static void __init do_initcall_level(int level)
trace_initcall_level(initcall_level_names[level]);
for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
- do_one_initcall(*fn);
+ do_one_initcall(initcall_from_entry(fn));
}
static void __init do_initcalls(void)
@@ -981,11 +981,11 @@ static void __init do_basic_setup(void)
static void __init do_pre_smp_initcalls(void)
{
- initcall_t *fn;
+ initcall_entry_t *fn;
trace_initcall_level("early");
for (fn = __initcall_start; fn < __initcall0_start; fn++)
- do_one_initcall(*fn);
+ do_one_initcall(initcall_from_entry(fn));
}
/*
@@ -1002,6 +1002,7 @@ void __init load_default_modules(void)
static int run_init_process(const char *init_filename)
{
argv_init[0] = init_filename;
+ pr_info("Run %s as init process\n", init_filename);
return do_execve(getname_kernel(init_filename),
(const char __user *const __user *)argv_init,
(const char __user *const __user *)envp_init);
diff --git a/ipc/msg.c b/ipc/msg.c
index 203281198079..883642cf2b27 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -163,7 +163,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
/* ipc_addid() locks msq upon success. */
retval = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
if (retval < 0) {
- call_rcu(&msq->q_perm.rcu, msg_rcu_free);
+ ipc_rcu_putref(&msq->q_perm, msg_rcu_free);
return retval;
}
@@ -386,7 +386,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
down_write(&msg_ids(ns).rwsem);
rcu_read_lock();
- ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd,
+ ipcp = ipcctl_obtain_check(ns, &msg_ids(ns), msqid, cmd,
&msqid64->msg_perm, msqid64->msg_qbytes);
if (IS_ERR(ipcp)) {
err = PTR_ERR(ipcp);
@@ -456,7 +456,7 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
int cmd, struct msginfo *msginfo)
{
int err;
- int max_id;
+ int max_idx;
/*
* We must not return kernel stack data.
@@ -483,16 +483,15 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
msginfo->msgpool = MSGPOOL;
msginfo->msgtql = MSGTQL;
}
- max_id = ipc_get_maxid(&msg_ids(ns));
+ max_idx = ipc_get_maxidx(&msg_ids(ns));
up_read(&msg_ids(ns).rwsem);
- return (max_id < 0) ? 0 : max_id;
+ return (max_idx < 0) ? 0 : max_idx;
}
static int msgctl_stat(struct ipc_namespace *ns, int msqid,
int cmd, struct msqid64_ds *p)
{
struct msg_queue *msq;
- int id = 0;
int err;
memset(p, 0, sizeof(*p));
@@ -504,7 +503,6 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid,
err = PTR_ERR(msq);
goto out_unlock;
}
- id = msq->q_perm.id;
} else { /* IPC_STAT */
msq = msq_obtain_object_check(ns, msqid);
if (IS_ERR(msq)) {
@@ -549,10 +547,21 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid,
p->msg_lspid = pid_vnr(msq->q_lspid);
p->msg_lrpid = pid_vnr(msq->q_lrpid);
- ipc_unlock_object(&msq->q_perm);
- rcu_read_unlock();
- return id;
+ if (cmd == IPC_STAT) {
+ /*
+ * As defined in SUS:
+ * Return 0 on success
+ */
+ err = 0;
+ } else {
+ /*
+ * MSG_STAT and MSG_STAT_ANY (both Linux specific)
+ * Return the full id, including the sequence number
+ */
+ err = msq->q_perm.id;
+ }
+ ipc_unlock_object(&msq->q_perm);
out_unlock:
rcu_read_unlock();
return err;
@@ -1229,7 +1238,7 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp,
}
#endif
-int msg_init_ns(struct ipc_namespace *ns)
+void msg_init_ns(struct ipc_namespace *ns)
{
ns->msg_ctlmax = MSGMAX;
ns->msg_ctlmnb = MSGMNB;
@@ -1237,7 +1246,7 @@ int msg_init_ns(struct ipc_namespace *ns)
atomic_set(&ns->msg_bytes, 0);
atomic_set(&ns->msg_hdrs, 0);
- return ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
+ ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
}
#ifdef CONFIG_IPC_NS
@@ -1278,12 +1287,11 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
}
#endif
-int __init msg_init(void)
+void __init msg_init(void)
{
- const int err = msg_init_ns(&init_ipc_ns);
+ msg_init_ns(&init_ipc_ns);
ipc_init_proc_interface("sysvipc/msg",
" key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n",
IPC_MSG_IDS, sysvipc_msg_proc_show);
- return err;
}
diff --git a/ipc/namespace.c b/ipc/namespace.c
index f59a89966f92..21607791d62c 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -55,28 +55,16 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
- err = sem_init_ns(ns);
+ err = mq_init_ns(ns);
if (err)
goto fail_put;
- err = msg_init_ns(ns);
- if (err)
- goto fail_destroy_sem;
- err = shm_init_ns(ns);
- if (err)
- goto fail_destroy_msg;
- err = mq_init_ns(ns);
- if (err)
- goto fail_destroy_shm;
+ sem_init_ns(ns);
+ msg_init_ns(ns);
+ shm_init_ns(ns);
return ns;
-fail_destroy_shm:
- shm_exit_ns(ns);
-fail_destroy_msg:
- msg_exit_ns(ns);
-fail_destroy_sem:
- sem_exit_ns(ns);
fail_put:
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
diff --git a/ipc/sem.c b/ipc/sem.c
index 00ef2f743a62..26f8e37fcdcb 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -221,14 +221,14 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
#define sc_semopm sem_ctls[2]
#define sc_semmni sem_ctls[3]
-int sem_init_ns(struct ipc_namespace *ns)
+void sem_init_ns(struct ipc_namespace *ns)
{
ns->sc_semmsl = SEMMSL;
ns->sc_semmns = SEMMNS;
ns->sc_semopm = SEMOPM;
ns->sc_semmni = SEMMNI;
ns->used_sems = 0;
- return ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
+ ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
}
#ifdef CONFIG_IPC_NS
@@ -240,14 +240,12 @@ void sem_exit_ns(struct ipc_namespace *ns)
}
#endif
-int __init sem_init(void)
+void __init sem_init(void)
{
- const int err = sem_init_ns(&init_ipc_ns);
-
+ sem_init_ns(&init_ipc_ns);
ipc_init_proc_interface("sysvipc/sem",
" key semid perms nsems uid gid cuid cgid otime ctime\n",
IPC_SEM_IDS, sysvipc_sem_proc_show);
- return err;
}
/**
@@ -557,7 +555,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
/* ipc_addid() locks sma upon success. */
retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
if (retval < 0) {
- call_rcu(&sma->sem_perm.rcu, sem_rcu_free);
+ ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
return retval;
}
ns->used_sems += nsems;
@@ -1223,7 +1221,6 @@ static int semctl_stat(struct ipc_namespace *ns, int semid,
{
struct sem_array *sma;
time64_t semotime;
- int id = 0;
int err;
memset(semid64, 0, sizeof(*semid64));
@@ -1235,7 +1232,6 @@ static int semctl_stat(struct ipc_namespace *ns, int semid,
err = PTR_ERR(sma);
goto out_unlock;
}
- id = sma->sem_perm.id;
} else { /* IPC_STAT */
sma = sem_obtain_object_check(ns, semid);
if (IS_ERR(sma)) {
@@ -1275,10 +1271,20 @@ static int semctl_stat(struct ipc_namespace *ns, int semid,
#endif
semid64->sem_nsems = sma->sem_nsems;
+ if (cmd == IPC_STAT) {
+ /*
+ * As defined in SUS:
+ * Return 0 on success
+ */
+ err = 0;
+ } else {
+ /*
+ * SEM_STAT and SEM_STAT_ANY (both Linux specific)
+ * Return the full id, including the sequence number
+ */
+ err = sma->sem_perm.id;
+ }
ipc_unlock_object(&sma->sem_perm);
- rcu_read_unlock();
- return id;
-
out_unlock:
rcu_read_unlock();
return err;
@@ -1288,7 +1294,7 @@ static int semctl_info(struct ipc_namespace *ns, int semid,
int cmd, void __user *p)
{
struct seminfo seminfo;
- int max_id;
+ int max_idx;
int err;
err = security_sem_semctl(NULL, cmd);
@@ -1312,11 +1318,11 @@ static int semctl_info(struct ipc_namespace *ns, int semid,
seminfo.semusz = SEMUSZ;
seminfo.semaem = SEMAEM;
}
- max_id = ipc_get_maxid(&sem_ids(ns));
+ max_idx = ipc_get_maxidx(&sem_ids(ns));
up_read(&sem_ids(ns).rwsem);
if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
return -EFAULT;
- return (max_id < 0) ? 0 : max_id;
+ return (max_idx < 0) ? 0 : max_idx;
}
static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
@@ -1588,7 +1594,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid,
down_write(&sem_ids(ns).rwsem);
rcu_read_lock();
- ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd,
+ ipcp = ipcctl_obtain_check(ns, &sem_ids(ns), semid, cmd,
&semid64->sem_perm, 0);
if (IS_ERR(ipcp)) {
err = PTR_ERR(ipcp);
diff --git a/ipc/shm.c b/ipc/shm.c
index b204feb38274..b0eb3757ab89 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -96,14 +96,14 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp);
static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
#endif
-int shm_init_ns(struct ipc_namespace *ns)
+void shm_init_ns(struct ipc_namespace *ns)
{
ns->shm_ctlmax = SHMMAX;
ns->shm_ctlall = SHMALL;
ns->shm_ctlmni = SHMMNI;
ns->shm_rmid_forced = 0;
ns->shm_tot = 0;
- return ipc_init_ids(&shm_ids(ns));
+ ipc_init_ids(&shm_ids(ns));
}
/*
@@ -136,9 +136,8 @@ void shm_exit_ns(struct ipc_namespace *ns)
static int __init ipc_ns_init(void)
{
- const int err = shm_init_ns(&init_ipc_ns);
- WARN(err, "ipc: sysv shm_init_ns failed: %d\n", err);
- return err;
+ shm_init_ns(&init_ipc_ns);
+ return 0;
}
pure_initcall(ipc_ns_init);
@@ -180,16 +179,33 @@ static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace
*/
static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
{
- struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
+ struct kern_ipc_perm *ipcp;
+
+ rcu_read_lock();
+ ipcp = ipc_obtain_object_idr(&shm_ids(ns), id);
+ if (IS_ERR(ipcp))
+ goto err;
+ ipc_lock_object(ipcp);
+ /*
+ * ipc_rmid() may have already freed the ID while ipc_lock_object()
+ * was spinning: here verify that the structure is still valid.
+ * Upon races with RMID, return -EIDRM, thus indicating that
+ * the ID points to a removed identifier.
+ */
+ if (ipc_valid_object(ipcp)) {
+ /* return a locked ipc object upon success */
+ return container_of(ipcp, struct shmid_kernel, shm_perm);
+ }
+
+ ipc_unlock_object(ipcp);
+err:
+ rcu_read_unlock();
/*
* Callers of shm_lock() must validate the status of the returned ipc
- * object pointer (as returned by ipc_lock()), and error out as
- * appropriate.
+ * object pointer and error out as appropriate.
*/
- if (IS_ERR(ipcp))
- return (void *)ipcp;
- return container_of(ipcp, struct shmid_kernel, shm_perm);
+ return (void *)ipcp;
}
static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
@@ -684,6 +700,8 @@ no_id:
if (is_file_hugepages(file) && shp->mlock_user)
user_shm_unlock(size, shp->mlock_user);
fput(file);
+ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
+ return error;
no_file:
call_rcu(&shp->shm_perm.rcu, shm_rcu_free);
return error;
@@ -879,7 +897,7 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
down_write(&shm_ids(ns).rwsem);
rcu_read_lock();
- ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd,
+ ipcp = ipcctl_obtain_check(ns, &shm_ids(ns), shmid, cmd,
&shmid64->shm_perm, 0);
if (IS_ERR(ipcp)) {
err = PTR_ERR(ipcp);
@@ -930,7 +948,7 @@ static int shmctl_ipc_info(struct ipc_namespace *ns,
shminfo->shmall = ns->shm_ctlall;
shminfo->shmmin = SHMMIN;
down_read(&shm_ids(ns).rwsem);
- err = ipc_get_maxid(&shm_ids(ns));
+ err = ipc_get_maxidx(&shm_ids(ns));
up_read(&shm_ids(ns).rwsem);
if (err < 0)
err = 0;
@@ -950,7 +968,7 @@ static int shmctl_shm_info(struct ipc_namespace *ns,
shm_info->shm_tot = ns->shm_tot;
shm_info->swap_attempts = 0;
shm_info->swap_successes = 0;
- err = ipc_get_maxid(&shm_ids(ns));
+ err = ipc_get_maxidx(&shm_ids(ns));
up_read(&shm_ids(ns).rwsem);
if (err < 0)
err = 0;
@@ -962,7 +980,6 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid,
int cmd, struct shmid64_ds *tbuf)
{
struct shmid_kernel *shp;
- int id = 0;
int err;
memset(tbuf, 0, sizeof(*tbuf));
@@ -974,7 +991,6 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid,
err = PTR_ERR(shp);
goto out_unlock;
}
- id = shp->shm_perm.id;
} else { /* IPC_STAT */
shp = shm_obtain_object_check(ns, shmid);
if (IS_ERR(shp)) {
@@ -1024,10 +1040,21 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid,
tbuf->shm_lpid = pid_vnr(shp->shm_lprid);
tbuf->shm_nattch = shp->shm_nattch;
- ipc_unlock_object(&shp->shm_perm);
- rcu_read_unlock();
- return id;
+ if (cmd == IPC_STAT) {
+ /*
+ * As defined in SUS:
+ * Return 0 on success
+ */
+ err = 0;
+ } else {
+ /*
+ * SHM_STAT and SHM_STAT_ANY (both Linux specific)
+ * Return the full id, including the sequence number
+ */
+ err = shp->shm_perm.id;
+ }
+ ipc_unlock_object(&shp->shm_perm);
out_unlock:
rcu_read_unlock();
return err;
diff --git a/ipc/util.c b/ipc/util.c
index fdffff41f65b..0af05752969f 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -88,16 +88,12 @@ struct ipc_proc_iface {
*/
static int __init ipc_init(void)
{
- int err_sem, err_msg;
-
proc_mkdir("sysvipc", NULL);
- err_sem = sem_init();
- WARN(err_sem, "ipc: sysv sem_init failed: %d\n", err_sem);
- err_msg = msg_init();
- WARN(err_msg, "ipc: sysv msg_init failed: %d\n", err_msg);
+ sem_init();
+ msg_init();
shm_init();
- return err_msg ? err_msg : err_sem;
+ return 0;
}
device_initcall(ipc_init);
@@ -116,22 +112,17 @@ static const struct rhashtable_params ipc_kht_params = {
* Set up the sequence range to use for the ipc identifier range (limited
* below IPCMNI) then initialise the keys hashtable and ids idr.
*/
-int ipc_init_ids(struct ipc_ids *ids)
+void ipc_init_ids(struct ipc_ids *ids)
{
- int err;
ids->in_use = 0;
ids->seq = 0;
init_rwsem(&ids->rwsem);
- err = rhashtable_init(&ids->key_ht, &ipc_kht_params);
- if (err)
- return err;
+ rhashtable_init(&ids->key_ht, &ipc_kht_params);
idr_init(&ids->ipcs_idr);
- ids->tables_initialized = true;
- ids->max_id = -1;
+ ids->max_idx = -1;
#ifdef CONFIG_CHECKPOINT_RESTORE
ids->next_id = -1;
#endif
- return 0;
}
#ifdef CONFIG_PROC_FS
@@ -179,61 +170,66 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
*/
static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
{
- struct kern_ipc_perm *ipcp = NULL;
+ struct kern_ipc_perm *ipcp;
- if (likely(ids->tables_initialized))
- ipcp = rhashtable_lookup_fast(&ids->key_ht, &key,
+ ipcp = rhashtable_lookup_fast(&ids->key_ht, &key,
ipc_kht_params);
+ if (!ipcp)
+ return NULL;
- if (ipcp) {
- rcu_read_lock();
- ipc_lock_object(ipcp);
- return ipcp;
- }
-
- return NULL;
+ rcu_read_lock();
+ ipc_lock_object(ipcp);
+ return ipcp;
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
/*
- * Specify desired id for next allocated IPC object.
+ * Insert new IPC object into idr tree, and set sequence number and id
+ * in the correct order.
+ * Especially:
+ * - the sequence number must be set before inserting the object into the idr,
+ * because the sequence number is accessed without a lock.
+ * - the id can/must be set after inserting the object into the idr.
+ * All accesses must be done after getting kern_ipc_perm.lock.
+ *
+ * The caller must own kern_ipc_perm.lock.of the new object.
+ * On error, the function returns a (negative) error code.
*/
-#define ipc_idr_alloc(ids, new) \
- idr_alloc(&(ids)->ipcs_idr, (new), \
- (ids)->next_id < 0 ? 0 : ipcid_to_idx((ids)->next_id),\
- 0, GFP_NOWAIT)
-
-static inline int ipc_buildid(int id, struct ipc_ids *ids,
- struct kern_ipc_perm *new)
+static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
{
- if (ids->next_id < 0) { /* default, behave as !CHECKPOINT_RESTORE */
+ int idx, next_id = -1;
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ next_id = ids->next_id;
+ ids->next_id = -1;
+#endif
+
+ /*
+ * As soon as a new object is inserted into the idr,
+ * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it,
+ * and the lockless preparations for ipc operations can start.
+ * This means especially: permission checks, audit calls, allocation
+ * of undo structures, ...
+ *
+ * Thus the object must be fully initialized, and if something fails,
+ * then the full tear-down sequence must be followed.
+ * (i.e.: set new->deleted, reduce refcount, call_rcu())
+ */
+
+ if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */
new->seq = ids->seq++;
if (ids->seq > IPCID_SEQ_MAX)
ids->seq = 0;
+ idx = idr_alloc(&ids->ipcs_idr, new, 0, 0, GFP_NOWAIT);
} else {
- new->seq = ipcid_to_seqx(ids->next_id);
- ids->next_id = -1;
+ new->seq = ipcid_to_seqx(next_id);
+ idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id),
+ 0, GFP_NOWAIT);
}
-
- return SEQ_MULTIPLIER * new->seq + id;
+ if (idx >= 0)
+ new->id = SEQ_MULTIPLIER * new->seq + idx;
+ return idx;
}
-#else
-#define ipc_idr_alloc(ids, new) \
- idr_alloc(&(ids)->ipcs_idr, (new), 0, 0, GFP_NOWAIT)
-
-static inline int ipc_buildid(int id, struct ipc_ids *ids,
- struct kern_ipc_perm *new)
-{
- new->seq = ids->seq++;
- if (ids->seq > IPCID_SEQ_MAX)
- ids->seq = 0;
-
- return SEQ_MULTIPLIER * new->seq + id;
-}
-
-#endif /* CONFIG_CHECKPOINT_RESTORE */
-
/**
* ipc_addid - add an ipc identifier
* @ids: ipc identifier set
@@ -241,9 +237,11 @@ static inline int ipc_buildid(int id, struct ipc_ids *ids,
* @limit: limit for the number of used ids
*
* Add an entry 'new' to the ipc ids idr. The permissions object is
- * initialised and the first free entry is set up and the id assigned
+ * initialised and the first free entry is set up and the index assigned
* is returned. The 'new' entry is returned in a locked state on success.
+ *
* On failure the entry is not locked and a negative err-code is returned.
+ * The caller must use ipc_rcu_putref() to free the identifier.
*
* Called with writer ipc_ids.rwsem held.
*/
@@ -251,19 +249,20 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
{
kuid_t euid;
kgid_t egid;
- int id, err;
+ int idx, err;
+
+ /* 1) Initialize the refcount so that ipc_rcu_putref works */
+ refcount_set(&new->refcount, 1);
if (limit > IPCMNI)
limit = IPCMNI;
- if (!ids->tables_initialized || ids->in_use >= limit)
+ if (ids->in_use >= limit)
return -ENOSPC;
idr_preload(GFP_KERNEL);
- refcount_set(&new->refcount, 1);
spin_lock_init(&new->lock);
- new->deleted = false;
rcu_read_lock();
spin_lock(&new->lock);
@@ -271,30 +270,30 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
new->cuid = new->uid = euid;
new->gid = new->cgid = egid;
- id = ipc_idr_alloc(ids, new);
+ new->deleted = false;
+
+ idx = ipc_idr_alloc(ids, new);
idr_preload_end();
- if (id >= 0 && new->key != IPC_PRIVATE) {
+ if (idx >= 0 && new->key != IPC_PRIVATE) {
err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
ipc_kht_params);
if (err < 0) {
- idr_remove(&ids->ipcs_idr, id);
- id = err;
+ idr_remove(&ids->ipcs_idr, idx);
+ idx = err;
}
}
- if (id < 0) {
+ if (idx < 0) {
+ new->deleted = true;
spin_unlock(&new->lock);
rcu_read_unlock();
- return id;
+ return idx;
}
ids->in_use++;
- if (id > ids->max_id)
- ids->max_id = id;
-
- new->id = ipc_buildid(id, ids, new);
-
- return id;
+ if (idx > ids->max_idx)
+ ids->max_idx = idx;
+ return idx;
}
/**
@@ -432,20 +431,20 @@ static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
*/
void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
{
- int lid = ipcid_to_idx(ipcp->id);
+ int idx = ipcid_to_idx(ipcp->id);
- idr_remove(&ids->ipcs_idr, lid);
+ idr_remove(&ids->ipcs_idr, idx);
ipc_kht_remove(ids, ipcp);
ids->in_use--;
ipcp->deleted = true;
- if (unlikely(lid == ids->max_id)) {
+ if (unlikely(idx == ids->max_idx)) {
do {
- lid--;
- if (lid == -1)
+ idx--;
+ if (idx == -1)
break;
- } while (!idr_find(&ids->ipcs_idr, lid));
- ids->max_id = lid;
+ } while (!idr_find(&ids->ipcs_idr, idx));
+ ids->max_idx = idx;
}
}
@@ -463,7 +462,7 @@ void ipc_set_key_private(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
ipcp->key = IPC_PRIVATE;
}
-int ipc_rcu_getref(struct kern_ipc_perm *ptr)
+bool ipc_rcu_getref(struct kern_ipc_perm *ptr)
{
return refcount_inc_not_zero(&ptr->refcount);
}
@@ -565,12 +564,9 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out)
struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id)
{
struct kern_ipc_perm *out;
- int lid = ipcid_to_idx(id);
-
- if (unlikely(!ids->tables_initialized))
- return ERR_PTR(-EINVAL);
+ int idx = ipcid_to_idx(id);
- out = idr_find(&ids->ipcs_idr, lid);
+ out = idr_find(&ids->ipcs_idr, idx);
if (!out)
return ERR_PTR(-EINVAL);
@@ -578,48 +574,12 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id)
}
/**
- * ipc_lock - lock an ipc structure without rwsem held
- * @ids: ipc identifier set
- * @id: ipc id to look for
- *
- * Look for an id in the ipc ids idr and lock the associated ipc object.
- *
- * The ipc object is locked on successful exit.
- */
-struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id)
-{
- struct kern_ipc_perm *out;
-
- rcu_read_lock();
- out = ipc_obtain_object_idr(ids, id);
- if (IS_ERR(out))
- goto err;
-
- spin_lock(&out->lock);
-
- /*
- * ipc_rmid() may have already freed the ID while ipc_lock()
- * was spinning: here verify that the structure is still valid.
- * Upon races with RMID, return -EIDRM, thus indicating that
- * the ID points to a removed identifier.
- */
- if (ipc_valid_object(out))
- return out;
-
- spin_unlock(&out->lock);
- out = ERR_PTR(-EIDRM);
-err:
- rcu_read_unlock();
- return out;
-}
-
-/**
* ipc_obtain_object_check
* @ids: ipc identifier set
* @id: ipc id to look for
*
- * Similar to ipc_obtain_object_idr() but also checks
- * the ipc object reference counter.
+ * Similar to ipc_obtain_object_idr() but also checks the ipc object
+ * sequence number.
*
* Call inside the RCU critical section.
* The ipc object is *not* locked on exit.
@@ -677,7 +637,7 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out)
}
/**
- * ipcctl_pre_down_nolock - retrieve an ipc and check permissions for some IPC_XXX cmd
+ * ipcctl_obtain_check - retrieve an ipc object and check permissions
* @ns: ipc namespace
* @ids: the table of ids where to look for the ipc
* @id: the id of the ipc to retrieve
@@ -687,16 +647,16 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out)
*
* This function does some common audit and permissions check for some IPC_XXX
* cmd and is called from semctl_down, shmctl_down and msgctl_down.
- * It must be called without any lock held and:
*
- * - retrieves the ipc with the given id in the given table.
+ * It:
+ * - retrieves the ipc object with the given id in the given table.
* - performs some audit and permission check, depending on the given cmd
* - returns a pointer to the ipc object or otherwise, the corresponding
* error.
*
* Call holding the both the rwsem and the rcu read lock.
*/
-struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns,
+struct kern_ipc_perm *ipcctl_obtain_check(struct ipc_namespace *ns,
struct ipc_ids *ids, int id, int cmd,
struct ipc64_perm *perm, int extra_perm)
{
diff --git a/ipc/util.h b/ipc/util.h
index 0aba3230d007..0a159f69b3bb 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -18,8 +18,8 @@
#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
#define SEQ_MULTIPLIER (IPCMNI)
-int sem_init(void);
-int msg_init(void);
+void sem_init(void);
+void msg_init(void);
void shm_init(void);
struct ipc_namespace;
@@ -34,17 +34,17 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { }
#endif
#ifdef CONFIG_SYSVIPC
-int sem_init_ns(struct ipc_namespace *ns);
-int msg_init_ns(struct ipc_namespace *ns);
-int shm_init_ns(struct ipc_namespace *ns);
+void sem_init_ns(struct ipc_namespace *ns);
+void msg_init_ns(struct ipc_namespace *ns);
+void shm_init_ns(struct ipc_namespace *ns);
void sem_exit_ns(struct ipc_namespace *ns);
void msg_exit_ns(struct ipc_namespace *ns);
void shm_exit_ns(struct ipc_namespace *ns);
#else
-static inline int sem_init_ns(struct ipc_namespace *ns) { return 0; }
-static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; }
-static inline int shm_init_ns(struct ipc_namespace *ns) { return 0; }
+static inline void sem_init_ns(struct ipc_namespace *ns) { }
+static inline void msg_init_ns(struct ipc_namespace *ns) { }
+static inline void shm_init_ns(struct ipc_namespace *ns) { }
static inline void sem_exit_ns(struct ipc_namespace *ns) { }
static inline void msg_exit_ns(struct ipc_namespace *ns) { }
@@ -83,7 +83,7 @@ struct ipc_ops {
struct seq_file;
struct ipc_ids;
-int ipc_init_ids(struct ipc_ids *);
+void ipc_init_ids(struct ipc_ids *ids);
#ifdef CONFIG_PROC_FS
void __init ipc_init_proc_interface(const char *path, const char *header,
int ids, int (*show)(struct seq_file *, void *));
@@ -113,12 +113,12 @@ void ipc_set_key_private(struct ipc_ids *, struct kern_ipc_perm *);
int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);
/**
- * ipc_get_maxid - get the last assigned id
+ * ipc_get_maxidx - get the highest assigned index
* @ids: ipc identifier set
*
* Called with ipc_ids.rwsem held for reading.
*/
-static inline int ipc_get_maxid(struct ipc_ids *ids)
+static inline int ipc_get_maxidx(struct ipc_ids *ids)
{
if (ids->in_use == 0)
return -1;
@@ -126,7 +126,7 @@ static inline int ipc_get_maxid(struct ipc_ids *ids)
if (ids->in_use == IPCMNI)
return IPCMNI - 1;
- return ids->max_id;
+ return ids->max_idx;
}
/*
@@ -138,17 +138,16 @@ static inline int ipc_get_maxid(struct ipc_ids *ids)
* refcount is initialized by ipc_addid(), before that point call_rcu()
* must be used.
*/
-int ipc_rcu_getref(struct kern_ipc_perm *ptr);
+bool ipc_rcu_getref(struct kern_ipc_perm *ptr);
void ipc_rcu_putref(struct kern_ipc_perm *ptr,
void (*func)(struct rcu_head *head));
-struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id);
void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out);
void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out);
int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out);
-struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns,
+struct kern_ipc_perm *ipcctl_obtain_check(struct ipc_namespace *ns,
struct ipc_ids *ids, int id, int cmd,
struct ipc64_perm *perm, int extra_perm);
@@ -173,9 +172,9 @@ extern struct msg_msg *load_msg(const void __user *src, size_t len);
extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst);
extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len);
-static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid)
+static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int id)
{
- return uid / SEQ_MULTIPLIER != ipcp->seq;
+ return ipcid_to_seqx(id) != ipcp->seq;
}
static inline void ipc_lock_object(struct kern_ipc_perm *perm)
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b66aced5e8c2..933cb3e45b98 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -14,8 +14,8 @@
#include <asm/sections.h>
/* vmcoreinfo stuff */
-static unsigned char *vmcoreinfo_data;
-static size_t vmcoreinfo_size;
+unsigned char *vmcoreinfo_data;
+size_t vmcoreinfo_size;
u32 *vmcoreinfo_note;
/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
@@ -344,7 +344,7 @@ void crash_save_vmcoreinfo(void)
if (vmcoreinfo_data_safecopy)
vmcoreinfo_data = vmcoreinfo_data_safecopy;
- vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+ vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds());
update_vmcoreinfo_note();
}
@@ -401,7 +401,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(init_uts_ns);
VMCOREINFO_SYMBOL(node_online_map);
#ifdef CONFIG_MMU
- VMCOREINFO_SYMBOL(swapper_pg_dir);
+ VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
#endif
VMCOREINFO_SYMBOL(_stext);
VMCOREINFO_SYMBOL(vmap_area_list);
diff --git a/kernel/cred.c b/kernel/cred.c
index ecf03657e71c..0192a94670e1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -448,6 +448,7 @@ int commit_creds(struct cred *new)
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
+ task->signal->pdeath_signal_proc = 0;
smp_wmb();
}
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index d987dcd1bd56..286d82329eb0 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -178,7 +178,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
* @dev: Pointer to device for which the allocation is performed.
* @count: Requested number of pages.
* @align: Requested alignment of pages (in PAGE_SIZE order).
- * @gfp_mask: GFP flags to use for this allocation.
+ * @no_warn: Avoid printing message about failed allocation.
*
* This function allocates memory buffer for specified device. It uses
* device specific contiguous memory area if available or the default
@@ -186,12 +186,12 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
* function.
*/
struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
- unsigned int align, gfp_t gfp_mask)
+ unsigned int align, bool no_warn)
{
if (align > CONFIG_CMA_ALIGNMENT)
align = CONFIG_CMA_ALIGNMENT;
- return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask);
+ return cma_alloc(dev_get_cma_area(dev), count, align, no_warn);
}
/**
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index c2860c5a9e96..1c35b7b945d0 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -78,7 +78,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
again:
/* CMA can be used only in the context which permits sleeping */
if (gfpflags_allow_blocking(gfp)) {
- page = dma_alloc_from_contiguous(dev, count, page_order, gfp);
+ page = dma_alloc_from_contiguous(dev, count, page_order,
+ gfp & __GFP_NOWARN);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_release_from_contiguous(dev, page, count);
page = NULL;
diff --git a/kernel/exit.c b/kernel/exit.c
index 0e21e6d21f35..066c66448258 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -636,6 +636,10 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
if (unlikely(p->exit_state == EXIT_DEAD))
return;
+ if (p->signal->pdeath_signal_proc)
+ group_send_sig_info(p->signal->pdeath_signal_proc,
+ SEND_SIG_NOINFO, p, PIDTYPE_TGID);
+
/* We don't want people slaying init. */
p->exit_signal = SIGCHLD;
diff --git a/kernel/fork.c b/kernel/fork.c
index feb906c4c5d0..2680886d9f6c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -311,8 +311,9 @@ static struct kmem_cache *mm_cachep;
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
- struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ struct vm_area_struct *vma;
+ vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (vma)
vma_init(vma, mm);
return vma;
@@ -872,6 +873,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->use_memdelay = 0;
#endif
+#ifdef CONFIG_MEMCG
+ tsk->active_memcg = NULL;
+#endif
return tsk;
free_stack:
@@ -1299,6 +1303,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+ tsk->last_switch_time = 0;
#endif
tsk->mm = NULL;
@@ -1423,7 +1428,9 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
return -ENOMEM;
atomic_set(&sig->count, 1);
+ spin_lock_irq(&current->sighand->siglock);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
+ spin_unlock_irq(&current->sighand->siglock);
return 0;
}
@@ -1509,6 +1516,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
mutex_init(&sig->cred_guard_mutex);
+ sig->pdeath_signal_proc = current->signal->pdeath_signal_proc;
+
return 0;
}
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 32b479468e4d..b9132d1269ef 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -40,6 +40,11 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
+/*
+ * Zero (default value) means use sysctl_hung_task_timeout_secs:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
+
int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
@@ -98,8 +103,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
if (switch_count != t->last_switch_count) {
t->last_switch_count = switch_count;
+ t->last_switch_time = jiffies;
return;
}
+ if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
+ return;
trace_sched_process_hang(t);
@@ -245,8 +253,13 @@ static int watchdog(void *dummy)
for ( ; ; ) {
unsigned long timeout = sysctl_hung_task_timeout_secs;
- long t = hung_timeout_jiffies(hung_last_checked, timeout);
+ unsigned long interval = sysctl_hung_task_check_interval_secs;
+ long t;
+ if (interval == 0)
+ interval = timeout;
+ interval = min_t(unsigned long, interval, timeout);
+ t = hung_timeout_jiffies(hung_last_checked, interval);
if (t <= 0) {
if (!atomic_xchg(&reset_hung_task, 0))
check_hung_uninterruptible_tasks(timeout);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index cfb750105e1e..c3172f69421d 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/pfn_t.h>
#include <linux/io.h>
+#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
#include <linux/swap.h>
@@ -137,6 +138,7 @@ static void devm_memremap_pages_release(void *data)
mem_hotplug_begin();
arch_remove_memory(align_start, align_size, pgmap->altmap_valid ?
&pgmap->altmap : NULL);
+ kasan_remove_zero_shadow(__va(align_start), align_size);
mem_hotplug_done();
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
@@ -239,6 +241,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
goto err_pfn_remap;
mem_hotplug_begin();
+ error = kasan_add_zero_shadow(__va(align_start), align_size);
+ if (error) {
+ mem_hotplug_done();
+ goto err_kasan;
+ }
+
error = arch_add_memory(nid, align_start, align_size, altmap, false);
if (!error)
move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
@@ -267,6 +275,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
return __va(res->start);
err_add_memory:
+ kasan_remove_zero_shadow(__va(align_start), align_size);
+ err_kasan:
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
err_pfn_remap:
err_radix:
diff --git a/kernel/module.c b/kernel/module.c
index b046a32520d8..6746c85511fe 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -529,12 +529,30 @@ static bool check_symbol(const struct symsearch *syms,
return true;
}
+static unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ return (unsigned long)offset_to_ptr(&sym->value_offset);
+#else
+ return sym->value;
+#endif
+}
+
+static const char *kernel_symbol_name(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ return offset_to_ptr(&sym->name_offset);
+#else
+ return sym->name;
+#endif
+}
+
static int cmp_name(const void *va, const void *vb)
{
const char *a;
const struct kernel_symbol *b;
a = va; b = vb;
- return strcmp(a, b->name);
+ return strcmp(a, kernel_symbol_name(b));
}
static bool find_symbol_in_section(const struct symsearch *syms,
@@ -2170,7 +2188,7 @@ void *__symbol_get(const char *symbol)
sym = NULL;
preempt_enable();
- return sym ? (void *)sym->value : NULL;
+ return sym ? (void *)kernel_symbol_value(sym) : NULL;
}
EXPORT_SYMBOL_GPL(__symbol_get);
@@ -2200,10 +2218,12 @@ static int verify_export_symbols(struct module *mod)
for (i = 0; i < ARRAY_SIZE(arr); i++) {
for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
- if (find_symbol(s->name, &owner, NULL, true, false)) {
+ if (find_symbol(kernel_symbol_name(s), &owner, NULL,
+ true, false)) {
pr_err("%s: exports duplicate symbol %s"
" (owned by %s)\n",
- mod->name, s->name, module_name(owner));
+ mod->name, kernel_symbol_name(s),
+ module_name(owner));
return -ENOEXEC;
}
}
@@ -2252,7 +2272,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
ksym = resolve_symbol_wait(mod, info, name);
/* Ok if resolved. */
if (ksym && !IS_ERR(ksym)) {
- sym[i].st_value = ksym->value;
+ sym[i].st_value = kernel_symbol_value(ksym);
break;
}
@@ -2516,7 +2536,7 @@ static int is_exported(const char *name, unsigned long value,
ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
else
ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
- return ks != NULL && ks->value == value;
+ return ks != NULL && kernel_symbol_value(ks) == value;
}
/* As per nm */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 7fdb5e236309..924e37fb1620 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2791,7 +2791,8 @@ EXPORT_SYMBOL(unregister_console);
void __init console_init(void)
{
int ret;
- initcall_t *call;
+ initcall_t call;
+ initcall_entry_t *ce;
/* Setup the default TTY line discipline. */
n_tty_init();
@@ -2800,13 +2801,14 @@ void __init console_init(void)
* set up the console device so that later boot sequences can
* inform about problems etc..
*/
- call = __con_initcall_start;
+ ce = __con_initcall_start;
trace_initcall_level("console");
- while (call < __con_initcall_end) {
- trace_initcall_start((*call));
- ret = (*call)();
- trace_initcall_finish((*call), ret);
- call++;
+ while (ce < __con_initcall_end) {
+ call = initcall_from_entry(ce);
+ trace_initcall_start(call);
+ ret = call();
+ trace_initcall_finish(call, ret);
+ ce++;
}
}
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 870f97b313e3..5dd47f1103d1 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -69,6 +69,8 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
wait_queue_entry_t *curr, *next;
int cnt = 0;
+ lockdep_assert_held(&wq_head->lock);
+
if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
curr = list_next_entry(bookmark, entry);
diff --git a/kernel/signal.c b/kernel/signal.c
index cfa9d10e731a..5843c541fda9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -65,14 +65,14 @@ static void __user *sig_handler(struct task_struct *t, int sig)
return t->sighand->action[sig - 1].sa.sa_handler;
}
-static int sig_handler_ignored(void __user *handler, int sig)
+static inline bool sig_handler_ignored(void __user *handler, int sig)
{
/* Is it explicitly or implicitly ignored? */
return handler == SIG_IGN ||
- (handler == SIG_DFL && sig_kernel_ignore(sig));
+ (handler == SIG_DFL && sig_kernel_ignore(sig));
}
-static int sig_task_ignored(struct task_struct *t, int sig, bool force)
+static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
{
void __user *handler;
@@ -80,12 +80,12 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force)
if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
handler == SIG_DFL && !(force && sig_kernel_only(sig)))
- return 1;
+ return true;
return sig_handler_ignored(handler, sig);
}
-static int sig_ignored(struct task_struct *t, int sig, bool force)
+static bool sig_ignored(struct task_struct *t, int sig, bool force)
{
/*
* Blocked signals are never ignored, since the
@@ -93,7 +93,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
* unblocked.
*/
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
- return 0;
+ return false;
/*
* Tracers may want to know about even ignored signal unless it
@@ -101,7 +101,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
* by SIGNAL_UNKILLABLE task.
*/
if (t->ptrace && sig != SIGKILL)
- return 0;
+ return false;
return sig_task_ignored(t, sig, force);
}
@@ -110,7 +110,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
* Re-calculate pending state from the set of locally pending
* signals, globally pending signals, and blocked signals.
*/
-static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
+static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked)
{
unsigned long ready;
long i;
@@ -138,20 +138,21 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
#define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
-static int recalc_sigpending_tsk(struct task_struct *t)
+static bool recalc_sigpending_tsk(struct task_struct *t)
{
if ((t->jobctl & JOBCTL_PENDING_MASK) ||
PENDING(&t->pending, &t->blocked) ||
PENDING(&t->signal->shared_pending, &t->blocked)) {
set_tsk_thread_flag(t, TIF_SIGPENDING);
- return 1;
+ return true;
}
+
/*
* We must never clear the flag in another thread, or in current
* when it's possible the current syscall is returning -ERESTART*.
* So we don't clear it here, and only callers who know they should do.
*/
- return 0;
+ return false;
}
/*
@@ -529,13 +530,15 @@ flush_signal_handlers(struct task_struct *t, int force_default)
}
}
-int unhandled_signal(struct task_struct *tsk, int sig)
+bool unhandled_signal(struct task_struct *tsk, int sig)
{
void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
if (is_global_init(tsk))
- return 1;
+ return true;
+
if (handler != SIG_IGN && handler != SIG_DFL)
- return 0;
+ return false;
+
/* if ptraced, let the tracer determine */
return !tsk->ptrace;
}
@@ -709,14 +712,14 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state)
*
* All callers must be holding the siglock.
*/
-static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
+static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
{
struct sigqueue *q, *n;
sigset_t m;
sigandsets(&m, mask, &s->signal);
if (sigisemptyset(&m))
- return 0;
+ return;
sigandnsets(&s->signal, &s->signal, mask);
list_for_each_entry_safe(q, n, &s->list, list) {
@@ -725,7 +728,6 @@ static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
__sigqueue_free(q);
}
}
- return 1;
}
static inline int is_si_special(const struct siginfo *info)
@@ -742,21 +744,16 @@ static inline bool si_fromuser(const struct siginfo *info)
/*
* called with RCU read lock from check_kill_permission()
*/
-static int kill_ok_by_cred(struct task_struct *t)
+static bool kill_ok_by_cred(struct task_struct *t)
{
const struct cred *cred = current_cred();
const struct cred *tcred = __task_cred(t);
- if (uid_eq(cred->euid, tcred->suid) ||
- uid_eq(cred->euid, tcred->uid) ||
- uid_eq(cred->uid, tcred->suid) ||
- uid_eq(cred->uid, tcred->uid))
- return 1;
-
- if (ns_capable(tcred->user_ns, CAP_KILL))
- return 1;
-
- return 0;
+ return uid_eq(cred->euid, tcred->suid) ||
+ uid_eq(cred->euid, tcred->uid) ||
+ uid_eq(cred->uid, tcred->suid) ||
+ uid_eq(cred->uid, tcred->uid) ||
+ ns_capable(tcred->user_ns, CAP_KILL);
}
/*
@@ -907,16 +904,20 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
* as soon as they're available, so putting the signal on the shared queue
* will be equivalent to sending it to one such thread.
*/
-static inline int wants_signal(int sig, struct task_struct *p)
+static inline bool wants_signal(int sig, struct task_struct *p)
{
if (sigismember(&p->blocked, sig))
- return 0;
+ return false;
+
if (p->flags & PF_EXITING)
- return 0;
+ return false;
+
if (sig == SIGKILL)
- return 1;
+ return true;
+
if (task_is_stopped_or_traced(p))
- return 0;
+ return false;
+
return task_curr(p) || !signal_pending(p);
}
@@ -996,7 +997,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
return;
}
-static inline int legacy_queue(struct sigpending *signals, int sig)
+static inline bool legacy_queue(struct sigpending *signals, int sig)
{
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
@@ -1380,14 +1381,15 @@ static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
return error;
}
-static int kill_as_cred_perm(const struct cred *cred,
- struct task_struct *target)
+static inline bool kill_as_cred_perm(const struct cred *cred,
+ struct task_struct *target)
{
const struct cred *pcred = __task_cred(target);
- if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) &&
- !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid))
- return 0;
- return 1;
+
+ return uid_eq(cred->euid, pcred->suid) ||
+ uid_eq(cred->euid, pcred->uid) ||
+ uid_eq(cred->uid, pcred->suid) ||
+ uid_eq(cred->uid, pcred->uid);
}
/* like kill_pid_info(), but doesn't use uid/euid of "current" */
@@ -1500,8 +1502,7 @@ send_sig(int sig, struct task_struct *p, int priv)
return send_sig_info(sig, __si_special(priv), p);
}
-void
-force_sig(int sig, struct task_struct *p)
+void force_sig(int sig, struct task_struct *p)
{
force_sig_info(sig, SEND_SIG_PRIV, p);
}
@@ -1512,8 +1513,7 @@ force_sig(int sig, struct task_struct *p)
* the problem was already a SIGSEGV, we'll want to
* make sure we don't even try to deliver the signal..
*/
-int
-force_sigsegv(int sig, struct task_struct *p)
+void force_sigsegv(int sig, struct task_struct *p)
{
if (sig == SIGSEGV) {
unsigned long flags;
@@ -1522,7 +1522,6 @@ force_sigsegv(int sig, struct task_struct *p)
spin_unlock_irqrestore(&p->sighand->siglock, flags);
}
force_sig(SIGSEGV, p);
- return 0;
}
int force_sig_fault(int sig, int code, void __user *addr
@@ -1923,10 +1922,10 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
spin_unlock_irqrestore(&sighand->siglock, flags);
}
-static inline int may_ptrace_stop(void)
+static inline bool may_ptrace_stop(void)
{
if (!likely(current->ptrace))
- return 0;
+ return false;
/*
* Are we in the middle of do_coredump?
* If so and our tracer is also part of the coredump stopping
@@ -1942,19 +1941,19 @@ static inline int may_ptrace_stop(void)
*/
if (unlikely(current->mm->core_state) &&
unlikely(current->mm == current->parent->mm))
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
* Return non-zero if there is a SIGKILL that should be waking us up.
* Called with the siglock held.
*/
-static int sigkill_pending(struct task_struct *tsk)
+static bool sigkill_pending(struct task_struct *tsk)
{
- return sigismember(&tsk->pending.signal, SIGKILL) ||
- sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
+ return sigismember(&tsk->pending.signal, SIGKILL) ||
+ sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
}
/*
@@ -2334,7 +2333,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
return signr;
}
-int get_signal(struct ksignal *ksig)
+bool get_signal(struct ksignal *ksig)
{
struct sighand_struct *sighand = current->sighand;
struct signal_struct *signal = current->signal;
@@ -2344,7 +2343,7 @@ int get_signal(struct ksignal *ksig)
task_work_run();
if (unlikely(uprobe_deny_signal()))
- return 0;
+ return false;
/*
* Do this once, we can't return to user-mode if freezing() == T.
@@ -2801,7 +2800,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
}
#endif
-static int do_sigpending(sigset_t *set)
+static void do_sigpending(sigset_t *set)
{
spin_lock_irq(&current->sighand->siglock);
sigorsets(set, &current->pending.signal,
@@ -2810,7 +2809,6 @@ static int do_sigpending(sigset_t *set)
/* Outside the lock because only this thread touches it. */
sigandsets(set, &current->blocked, set);
- return 0;
}
/**
@@ -2822,15 +2820,16 @@ static int do_sigpending(sigset_t *set)
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
sigset_t set;
- int err;
if (sigsetsize > sizeof(*uset))
return -EINVAL;
- err = do_sigpending(&set);
- if (!err && copy_to_user(uset, &set, sigsetsize))
- err = -EFAULT;
- return err;
+ do_sigpending(&set);
+
+ if (copy_to_user(uset, &set, sigsetsize))
+ return -EFAULT;
+
+ return 0;
}
#ifdef CONFIG_COMPAT
@@ -2838,15 +2837,13 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
compat_size_t, sigsetsize)
{
sigset_t set;
- int err;
if (sigsetsize > sizeof(*uset))
return -EINVAL;
- err = do_sigpending(&set);
- if (!err)
- err = put_compat_sigset(uset, &set, sigsetsize);
- return err;
+ do_sigpending(&set);
+
+ return put_compat_sigset(uset, &set, sigsetsize);
}
#endif
@@ -3608,25 +3605,26 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset)
{
sigset_t set;
- int err;
if (sizeof(old_sigset_t) > sizeof(*uset))
return -EINVAL;
- err = do_sigpending(&set);
- if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t)))
- err = -EFAULT;
- return err;
+ do_sigpending(&set);
+
+ if (copy_to_user(uset, &set, sizeof(old_sigset_t)))
+ return -EFAULT;
+
+ return 0;
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
sigset_t set;
- int err = do_sigpending(&set);
- if (!err)
- err = put_user(set.sig[0], set32);
- return err;
+
+ do_sigpending(&set);
+
+ return put_user(set.sig[0], set32);
}
#endif
@@ -3697,25 +3695,23 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig,
size_t, sigsetsize)
{
struct k_sigaction new_sa, old_sa;
- int ret = -EINVAL;
+ int ret;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
- goto out;
+ return -EINVAL;
- if (act) {
- if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
- return -EFAULT;
- }
+ if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
+ return -EFAULT;
ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
+ if (ret)
+ return ret;
- if (!ret && oact) {
- if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
- return -EFAULT;
- }
-out:
- return ret;
+ if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
+ return -EFAULT;
+
+ return 0;
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
diff --git a/kernel/sys.c b/kernel/sys.c
index e27b51d3facd..45dc5527a8d2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2289,6 +2289,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_GET_PDEATHSIG:
error = put_user(me->pdeath_signal, (int __user *)arg2);
break;
+ case PR_SET_PDEATHSIG_PROC:
+ if (!valid_signal(arg2)) {
+ error = -EINVAL;
+ break;
+ }
+ me->signal->pdeath_signal_proc = arg2;
+ break;
+ case PR_GET_PDEATHSIG_PROC:
+ error = put_user(me->signal->pdeath_signal_proc,
+ (int __user *)arg2);
+ break;
case PR_GET_DUMPABLE:
error = get_dumpable(me->mm);
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index faa216dde482..3ae223f7b5df 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -147,7 +147,10 @@ static int minolduid;
static int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
-/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */
+/*
+ * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
+ * and hung_task_check_interval_secs
+ */
#ifdef CONFIG_DETECT_HUNG_TASK
static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
#endif
@@ -224,7 +227,7 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#ifdef CONFIG_MAGIC_SYSRQ
-/* Note: sysrq code uses it's own private copy */
+/* Note: sysrq code uses its own private copy */
static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
static int sysrq_sysctl_handler(struct ctl_table *table, int write,
@@ -1093,6 +1096,14 @@ static struct ctl_table kern_table[] = {
.extra2 = &hung_task_timeout_max,
},
{
+ .procname = "hung_task_check_interval_secs",
+ .data = &sysctl_hung_task_check_interval_secs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = &hung_task_timeout_max,
+ },
+ {
.procname = "hung_task_warnings",
.data = &sysctl_hung_task_warnings,
.maxlen = sizeof(int),
@@ -1810,6 +1821,24 @@ static struct ctl_table fs_table[] = {
.extra2 = &one,
},
{
+ .procname = "protected_fifos",
+ .data = &sysctl_protected_fifos,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+ {
+ .procname = "protected_regular",
+ .data = &sysctl_protected_regular,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+ {
.procname = "suid_dumpable",
.data = &suid_dumpable,
.maxlen = sizeof(int),
@@ -1978,13 +2007,13 @@ static void warn_sysctl_write(struct ctl_table *table)
}
/**
- * proc_first_pos_non_zero_ignore - check if firs position is allowed
+ * proc_first_pos_non_zero_ignore - check if first position is allowed
* @ppos: file position
* @table: the sysctl table
*
* Returns true if the first position is non-zero and the sysctl_writes_strict
* mode indicates this is not allowed for numeric input types. String proc
- * hadlers can ignore the return value.
+ * handlers can ignore the return value.
*/
static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
struct ctl_table *table)
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 96db841bf0fc..bf2c06ef9afc 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -371,6 +371,27 @@ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data)
}
EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
+static void for_each_tracepoint_range(struct tracepoint * const *begin,
+ struct tracepoint * const *end,
+ void (*fct)(struct tracepoint *tp, void *priv),
+ void *priv)
+{
+ if (!begin)
+ return;
+
+ if (IS_ENABLED(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS)) {
+ const int *iter;
+
+ for (iter = (const int *)begin; iter < (const int *)end; iter++)
+ fct(offset_to_ptr(iter), priv);
+ } else {
+ struct tracepoint * const *iter;
+
+ for (iter = begin; iter < end; iter++)
+ fct(*iter, priv);
+ }
+}
+
#ifdef CONFIG_MODULES
bool trace_module_has_bad_taint(struct module *mod)
{
@@ -435,15 +456,9 @@ EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier);
* Ensure the tracer unregistered the module's probes before the module
* teardown is performed. Prevents leaks of probe and data pointers.
*/
-static void tp_module_going_check_quiescent(struct tracepoint * const *begin,
- struct tracepoint * const *end)
+static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv)
{
- struct tracepoint * const *iter;
-
- if (!begin)
- return;
- for (iter = begin; iter < end; iter++)
- WARN_ON_ONCE((*iter)->funcs);
+ WARN_ON_ONCE(tp->funcs);
}
static int tracepoint_module_coming(struct module *mod)
@@ -494,8 +509,9 @@ static void tracepoint_module_going(struct module *mod)
* Called the going notifier before checking for
* quiescence.
*/
- tp_module_going_check_quiescent(mod->tracepoints_ptrs,
- mod->tracepoints_ptrs + mod->num_tracepoints);
+ for_each_tracepoint_range(mod->tracepoints_ptrs,
+ mod->tracepoints_ptrs + mod->num_tracepoints,
+ tp_module_going_check_quiescent, NULL);
break;
}
}
@@ -547,19 +563,6 @@ static __init int init_tracepoints(void)
__initcall(init_tracepoints);
#endif /* CONFIG_MODULES */
-static void for_each_tracepoint_range(struct tracepoint * const *begin,
- struct tracepoint * const *end,
- void (*fct)(struct tracepoint *tp, void *priv),
- void *priv)
-{
- struct tracepoint * const *iter;
-
- if (!begin)
- return;
- for (iter = begin; iter < end; iter++)
- fct(*iter, priv);
-}
-
/**
* for_each_kernel_tracepoint - iteration on all kernel tracepoints
* @fct: callback
diff --git a/kernel/user.c b/kernel/user.c
index 36288d840675..0df9b1640b2a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -96,7 +96,7 @@ static DEFINE_SPINLOCK(uidhash_lock);
/* root_user.__count is 1, for init task cred */
struct user_struct root_user = {
- .__count = ATOMIC_INIT(1),
+ .__count = REFCOUNT_INIT(1),
.processes = ATOMIC_INIT(1),
.sigpending = ATOMIC_INIT(0),
.locked_shm = 0,
@@ -123,7 +123,7 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
hlist_for_each_entry(user, hashent, uidhash_node) {
if (uid_eq(user->uid, uid)) {
- atomic_inc(&user->__count);
+ refcount_inc(&user->__count);
return user;
}
}
@@ -169,11 +169,8 @@ void free_uid(struct user_struct *up)
if (!up)
return;
- local_irq_save(flags);
- if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
+ if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags))
free_user(up, flags);
- else
- local_irq_restore(flags);
}
struct user_struct *alloc_uid(kuid_t uid)
@@ -191,7 +188,7 @@ struct user_struct *alloc_uid(kuid_t uid)
goto out_unlock;
new->uid = uid;
- atomic_set(&new->__count, 1);
+ refcount_set(&new->__count, 1);
ratelimit_state_init(&new->ratelimit, HZ, 100);
ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE);
diff --git a/lib/.gitignore b/lib/.gitignore
index 09aae85418ab..f2a39c9e5485 100644
--- a/lib/.gitignore
+++ b/lib/.gitignore
@@ -2,5 +2,7 @@
# Generated files
#
gen_crc32table
+gen_crc64table
crc32table.h
+crc64table.h
oid_registry_data.c
diff --git a/lib/Kconfig b/lib/Kconfig
index 0017443b322a..40bfa6ccd294 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -170,6 +170,14 @@ config CRC32_BIT
endchoice
+config CRC64
+ tristate "CRC64 functions"
+ help
+ This option is provided for the case where no in-kernel-tree
+ modules require CRC64 functions, but a module built outside
+ the kernel tree does. Such modules that use library CRC64
+ functions require M here.
+
config CRC4
tristate "CRC4 functions"
help
@@ -223,7 +231,6 @@ config AUDIT_COMPAT_GENERIC
config RANDOM32_SELFTEST
bool "PRNG perform self test on init"
- default n
help
This option enables the 32 bit PRNG library functions to perform a
self test on initialization.
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 46f755a60872..8142dd33720d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1228,7 +1228,6 @@ config LOCK_TORTURE_TEST
tristate "torture tests for locking"
depends on DEBUG_KERNEL
select TORTURE_TEST
- default n
help
This option provides a kernel module that runs torture tests
on kernel locking primitives. The kernel module may be built
@@ -1695,7 +1694,6 @@ config LKDTM
tristate "Linux Kernel Dump Test Tool Module"
depends on DEBUG_FS
depends on BLOCK
- default n
help
This module enables testing of the different dumping mechanisms by
inducing system failures at predefined crash points.
@@ -1729,7 +1727,6 @@ config KPROBES_SANITY_TEST
bool "Kprobes sanity tests"
depends on DEBUG_KERNEL
depends on KPROBES
- default n
help
This option provides for testing basic kprobes functionality on
boot. Samples of kprobe and kretprobe are inserted and
@@ -1740,7 +1737,6 @@ config KPROBES_SANITY_TEST
config BACKTRACE_SELF_TEST
tristate "Self test for the backtrace code"
depends on DEBUG_KERNEL
- default n
help
This option provides a kernel module that can be used to test
the kernel stack backtrace code. This option is not useful
@@ -1810,7 +1806,6 @@ config TEST_PRINTF
config TEST_BITMAP
tristate "Test bitmap_*() family of functions at runtime"
- default n
help
Enable this option to test the bitmap functions at boot.
@@ -1834,7 +1829,6 @@ config TEST_OVERFLOW
config TEST_RHASHTABLE
tristate "Perform selftest on resizable hash table"
- default n
help
Enable this option to test the rhashtable functions at boot.
@@ -1842,7 +1836,6 @@ config TEST_RHASHTABLE
config TEST_HASH
tristate "Perform selftest on hash functions"
- default n
help
Enable this option to test the kernel's integer (<linux/hash.h>),
string (<linux/stringhash.h>), and siphash (<linux/siphash.h>)
@@ -1856,7 +1849,6 @@ config TEST_IDA
config TEST_PARMAN
tristate "Perform selftest on priority array manager"
- default n
depends on PARMAN
help
Enable this option to test priority array manager on boot
@@ -1866,7 +1858,6 @@ config TEST_PARMAN
config TEST_LKM
tristate "Test module loading with 'hello world' module"
- default n
depends on m
help
This builds the "test_module" module that emits "Hello, world"
@@ -1880,7 +1871,6 @@ config TEST_LKM
config TEST_USER_COPY
tristate "Test user/kernel boundary protections"
- default n
depends on m
help
This builds the "test_user_copy" module that runs sanity checks
@@ -1893,7 +1883,6 @@ config TEST_USER_COPY
config TEST_BPF
tristate "Test BPF filter functionality"
- default n
depends on m && NET
help
This builds the "test_bpf" module that runs various test vectors
@@ -1907,7 +1896,6 @@ config TEST_BPF
config FIND_BIT_BENCHMARK
tristate "Test find_bit functions"
- default n
help
This builds the "test_find_bit" module that measure find_*_bit()
functions performance.
@@ -1916,7 +1904,6 @@ config FIND_BIT_BENCHMARK
config TEST_FIRMWARE
tristate "Test firmware loading via userspace interface"
- default n
depends on FW_LOADER
help
This builds the "test_firmware" module that creates a userspace
@@ -1929,7 +1916,6 @@ config TEST_FIRMWARE
config TEST_SYSCTL
tristate "sysctl test driver"
- default n
depends on PROC_SYSCTL
help
This builds the "test_sysctl" module. This driver enables to test the
@@ -1940,7 +1926,6 @@ config TEST_SYSCTL
config TEST_UDELAY
tristate "udelay test driver"
- default n
help
This builds the "udelay_test" module that helps to make sure
that udelay() is working properly.
@@ -1949,7 +1934,6 @@ config TEST_UDELAY
config TEST_STATIC_KEYS
tristate "Test static keys"
- default n
depends on m
help
Test the static key interfaces.
@@ -1958,7 +1942,6 @@ config TEST_STATIC_KEYS
config TEST_KMOD
tristate "kmod stress tester"
- default n
depends on m
depends on BLOCK && (64BIT || LBDAF) # for XFS, BTRFS
depends on NETDEVICES && NET_CORE && INET # for TUN
@@ -2062,6 +2045,12 @@ config IO_STRICT_DEVMEM
If in doubt, say Y.
+config DEBUG_AID_FOR_SYZBOT
+ bool "Additional debug code for syzbot"
+ default n
+ help
+ This option is intended for testing by syzbot.
+
source "arch/$(SRCARCH)/Kconfig.debug"
endmenu # Kernel hacking
diff --git a/lib/Makefile b/lib/Makefile
index 78631d5062fb..e6f809556af5 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -105,6 +105,7 @@ obj-$(CONFIG_CRC16) += crc16.o
obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o
obj-$(CONFIG_CRC_ITU_T) += crc-itu-t.o
obj-$(CONFIG_CRC32) += crc32.o
+obj-$(CONFIG_CRC64) += crc64.o
obj-$(CONFIG_CRC32_SELFTEST) += crc32test.o
obj-$(CONFIG_CRC4) += crc4.o
obj-$(CONFIG_CRC7) += crc7.o
@@ -219,7 +220,9 @@ obj-$(CONFIG_FONT_SUPPORT) += fonts/
obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o
hostprogs-y := gen_crc32table
+hostprogs-y += gen_crc64table
clean-files := crc32table.h
+clean-files += crc64table.h
$(obj)/crc32.o: $(obj)/crc32table.h
@@ -229,6 +232,14 @@ quiet_cmd_crc32 = GEN $@
$(obj)/crc32table.h: $(obj)/gen_crc32table
$(call cmd,crc32)
+$(obj)/crc64.o: $(obj)/crc64table.h
+
+quiet_cmd_crc64 = GEN $@
+ cmd_crc64 = $< > $@
+
+$(obj)/crc64table.h: $(obj)/gen_crc64table
+ $(call cmd,crc64)
+
#
# Build a fast OID lookip registry from include/linux/oid_registry.h
#
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 730969c681cb..2fd07f6df0b8 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -1152,14 +1152,10 @@ EXPORT_SYMBOL(bitmap_free);
* @buf: array of u32 (in host byte order), the source bitmap
* @nbits: number of bits in @bitmap
*/
-void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf,
- unsigned int nbits)
+void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits)
{
unsigned int i, halfwords;
- if (!nbits)
- return;
-
halfwords = DIV_ROUND_UP(nbits, 32);
for (i = 0; i < halfwords; i++) {
bitmap[i/2] = (unsigned long) buf[i];
@@ -1183,9 +1179,6 @@ void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits)
{
unsigned int i, halfwords;
- if (!nbits)
- return;
-
halfwords = DIV_ROUND_UP(nbits, 32);
for (i = 0; i < halfwords; i++) {
buf[i] = (u32) (bitmap[i/2] & UINT_MAX);
diff --git a/lib/crc64.c b/lib/crc64.c
new file mode 100644
index 000000000000..0ef8ae6ac047
--- /dev/null
+++ b/lib/crc64.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Normal 64-bit CRC calculation.
+ *
+ * This is a basic crc64 implementation following ECMA-182 specification,
+ * which can be found from,
+ * http://www.ecma-international.org/publications/standards/Ecma-182.htm
+ *
+ * Dr. Ross N. Williams has a great document to introduce the idea of CRC
+ * algorithm, here the CRC64 code is also inspired by the table-driven
+ * algorithm and detail example from this paper. This paper can be found
+ * from,
+ * http://www.ross.net/crc/download/crc_v3.txt
+ *
+ * crc64table[256] is the lookup table of a table-driven 64-bit CRC
+ * calculation, which is generated by gen_crc64table.c in kernel build
+ * time. The polynomial of crc64 arithmetic is from ECMA-182 specification
+ * as well, which is defined as,
+ *
+ * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x + 1
+ *
+ * Copyright 2018 SUSE Linux.
+ * Author: Coly Li <colyli@suse.de>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include "crc64table.h"
+
+MODULE_DESCRIPTION("CRC64 calculations");
+MODULE_LICENSE("GPL v2");
+
+/**
+ * crc64_be - Calculate bitwise big-endian ECMA-182 CRC64
+ * @crc: seed value for computation. 0 or (u64)~0 for a new CRC calculation,
+ or the previous crc64 value if computing incrementally.
+ * @p: pointer to buffer over which CRC64 is run
+ * @len: length of buffer @p
+ */
+u64 __pure crc64_be(u64 crc, const void *p, size_t len)
+{
+ size_t i, t;
+
+ const unsigned char *_p = p;
+
+ for (i = 0; i < len; i++) {
+ t = ((crc >> 56) ^ (*_p++)) & 0xFF;
+ crc = crc64table[t] ^ (crc << 8);
+ }
+
+ return crc;
+}
+EXPORT_SYMBOL_GPL(crc64_be);
diff --git a/lib/gen_crc64table.c b/lib/gen_crc64table.c
new file mode 100644
index 000000000000..9011926e4162
--- /dev/null
+++ b/lib/gen_crc64table.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate lookup table for the table-driven CRC64 calculation.
+ *
+ * gen_crc64table is executed in kernel build time and generates
+ * lib/crc64table.h. This header is included by lib/crc64.c for
+ * the table-driven CRC64 calculation.
+ *
+ * See lib/crc64.c for more information about which specification
+ * and polynomial arithmetic that gen_crc64table.c follows to
+ * generate the lookup table.
+ *
+ * Copyright 2018 SUSE Linux.
+ * Author: Coly Li <colyli@suse.de>
+ */
+#include <inttypes.h>
+#include <stdio.h>
+
+#include <linux/swab.h>
+
+#define CRC64_ECMA182_POLY 0x42F0E1EBA9EA3693ULL
+
+static uint64_t crc64_table[256] = {0};
+
+static void generate_crc64_table(void)
+{
+ uint64_t i, j, c, crc;
+
+ for (i = 0; i < 256; i++) {
+ crc = 0;
+ c = i << 56;
+
+ for (j = 0; j < 8; j++) {
+ if ((crc ^ c) & 0x8000000000000000ULL)
+ crc = (crc << 1) ^ CRC64_ECMA182_POLY;
+ else
+ crc <<= 1;
+ c <<= 1;
+ }
+
+ crc64_table[i] = crc;
+ }
+}
+
+static void print_crc64_table(void)
+{
+ int i;
+
+ printf("/* this file is generated - do not edit */\n\n");
+ printf("#include <linux/types.h>\n");
+ printf("#include <linux/cache.h>\n\n");
+ printf("static const u64 ____cacheline_aligned crc64table[256] = {\n");
+ for (i = 0; i < 256; i++) {
+ printf("\t0x%016" PRIx64 "ULL", crc64_table[i]);
+ if (i & 0x1)
+ printf(",\n");
+ else
+ printf(", ");
+ }
+ printf("};\n");
+}
+
+int main(int argc, char *argv[])
+{
+ generate_crc64_table();
+ print_crc64_table();
+ return 0;
+}
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index ae4223e0f5bc..310e29b51507 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -174,17 +174,15 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
int i;
size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
- if (gfp != GFP_KERNEL)
- tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
- else
- tbl = kvzalloc(size, gfp);
+ tbl = kvzalloc(size, gfp);
size = nbuckets;
- if (tbl == NULL && gfp != GFP_KERNEL) {
+ if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
nbuckets = 0;
}
+
if (tbl == NULL)
return NULL;
@@ -450,7 +448,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
err = -ENOMEM;
- new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC);
+ new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
if (new_tbl == NULL)
goto fail;
@@ -1060,9 +1058,16 @@ int rhashtable_init(struct rhashtable *ht,
}
}
+ /*
+ * This is api initialization and thus we need to guarantee the
+ * initial rhashtable allocation. Upon failure, retry with the
+ * smallest possible size with __GFP_NOFAIL semantics.
+ */
tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
- if (tbl == NULL)
- return -ENOMEM;
+ if (unlikely(tbl == NULL)) {
+ size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
+ tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
+ }
atomic_set(&ht->nelems, 0);
diff --git a/lib/test_debug_virtual.c b/lib/test_debug_virtual.c
index b9cdeecc19dc..d5a06addeb27 100644
--- a/lib/test_debug_virtual.c
+++ b/lib/test_debug_virtual.c
@@ -15,7 +15,7 @@ struct foo {
unsigned int bar;
};
-struct foo *foo;
+static struct foo *foo;
static int __init test_debug_virtual_init(void)
{
diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c
index 3f415d8101f3..626f580b4ff7 100644
--- a/lib/test_hexdump.c
+++ b/lib/test_hexdump.c
@@ -18,7 +18,7 @@ static const unsigned char data_b[] = {
static const unsigned char data_a[] = ".2.{....p..$}.4...1.....L...C...";
-static const char * const test_data_1_le[] __initconst = {
+static const char * const test_data_1[] __initconst = {
"be", "32", "db", "7b", "0a", "18", "93", "b2",
"70", "ba", "c4", "24", "7d", "83", "34", "9b",
"a6", "9c", "31", "ad", "9c", "0f", "ac", "e9",
@@ -32,16 +32,33 @@ static const char * const test_data_2_le[] __initconst = {
"d14c", "9919", "b143", "0caf",
};
+static const char * const test_data_2_be[] __initconst = {
+ "be32", "db7b", "0a18", "93b2",
+ "70ba", "c424", "7d83", "349b",
+ "a69c", "31ad", "9c0f", "ace9",
+ "4cd1", "1999", "43b1", "af0c",
+};
+
static const char * const test_data_4_le[] __initconst = {
"7bdb32be", "b293180a", "24c4ba70", "9b34837d",
"ad319ca6", "e9ac0f9c", "9919d14c", "0cafb143",
};
+static const char * const test_data_4_be[] __initconst = {
+ "be32db7b", "0a1893b2", "70bac424", "7d83349b",
+ "a69c31ad", "9c0face9", "4cd11999", "43b1af0c",
+};
+
static const char * const test_data_8_le[] __initconst = {
"b293180a7bdb32be", "9b34837d24c4ba70",
"e9ac0f9cad319ca6", "0cafb1439919d14c",
};
+static const char * const test_data_8_be[] __initconst = {
+ "be32db7b0a1893b2", "70bac4247d83349b",
+ "a69c31ad9c0face9", "4cd1199943b1af0c",
+};
+
#define FILL_CHAR '#'
static unsigned total_tests __initdata;
@@ -56,6 +73,7 @@ static void __init test_hexdump_prepare_test(size_t len, int rowsize,
size_t l = len;
int gs = groupsize, rs = rowsize;
unsigned int i;
+ const bool is_be = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN);
if (rs != 16 && rs != 32)
rs = 16;
@@ -67,13 +85,13 @@ static void __init test_hexdump_prepare_test(size_t len, int rowsize,
gs = 1;
if (gs == 8)
- result = test_data_8_le;
+ result = is_be ? test_data_8_be : test_data_8_le;
else if (gs == 4)
- result = test_data_4_le;
+ result = is_be ? test_data_4_be : test_data_4_le;
else if (gs == 2)
- result = test_data_2_le;
+ result = is_be ? test_data_2_be : test_data_2_le;
else
- result = test_data_1_le;
+ result = test_data_1;
/* hex dump */
p = test;
diff --git a/mm/Kconfig b/mm/Kconfig
index 9ae1b6a8e30f..ce5782ff3110 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -118,10 +118,6 @@ config SPARSEMEM_EXTREME
config SPARSEMEM_VMEMMAP_ENABLE
bool
-config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- def_bool y
- depends on SPARSEMEM && X86_64
-
config SPARSEMEM_VMEMMAP
bool "Sparse Memory virtual memmap"
depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
@@ -140,6 +136,9 @@ config HAVE_MEMBLOCK_NODE_MAP
config HAVE_MEMBLOCK_PHYS_MAP
bool
+config HAVE_MEMBLOCK_PFN_VALID
+ bool
+
config HAVE_GENERIC_GUP
bool
@@ -423,10 +422,11 @@ config ARCH_WANTS_THP_SWAP
config THP_SWAP
def_bool y
- depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP
+ depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
help
Swap transparent huge pages in one piece, without splitting.
- XXX: For now this only does clustered swap space allocation.
+ XXX: For now, swap cluster backing transparent huge page
+ will be split after swapout.
For selection by architectures with reasonable THP sizes.
@@ -638,7 +638,7 @@ config DEFERRED_STRUCT_PAGE_INIT
bool "Defer initialisation of struct pages to kthreads"
default n
depends on NO_BOOTMEM
- depends on !FLATMEM
+ depends on SPARSEMEM
depends on !NEED_PER_CPU_KM
help
Ordinarily all struct pages are initialised during early boot in a
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index e5e606ee5f71..9a7b8b049d04 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -46,7 +46,8 @@ config PAGE_POISONING
Fill the pages with poison patterns after free_pages() and verify
the patterns before alloc_pages. The filling of the memory helps
reduce the risk of information leaks from freed data. This does
- have a potential performance impact.
+ have a potential performance impact if enabled with the
+ "page_poison=1" kernel boot option.
Note that "poison" here is not the same thing as the "HWPoison"
for CONFIG_MEMORY_FAILURE. This is software poisoning only.
@@ -65,7 +66,7 @@ config PAGE_POISONING_NO_SANITY
say N.
config PAGE_POISONING_ZERO
- bool "Use zero for poisoning instead of random data"
+ bool "Use zero for poisoning instead of debugging value"
depends on PAGE_POISONING
---help---
Instead of using the existing poison value, fill the pages with
@@ -75,7 +76,6 @@ config PAGE_POISONING_ZERO
allocation.
If unsure, say N
- bool
config DEBUG_PAGE_REF
bool "Enable tracepoint to track down page reference manipulation"
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 2e5d3df0853d..800f579ca16c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -221,11 +221,46 @@ static ssize_t stable_pages_required_show(struct device *dev,
}
static DEVICE_ATTR_RO(stable_pages_required);
+static ssize_t strictlimit_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int val;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ switch (val) {
+ case 0:
+ bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
+ break;
+ case 1:
+ bdi->capabilities |= BDI_CAP_STRICTLIMIT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return count;
+}
+static ssize_t strictlimit_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n",
+ !!(bdi->capabilities & BDI_CAP_STRICTLIMIT));
+}
+static DEVICE_ATTR_RW(strictlimit);
+
static struct attribute *bdi_dev_attrs[] = {
&dev_attr_read_ahead_kb.attr,
&dev_attr_min_ratio.attr,
&dev_attr_max_ratio.attr,
&dev_attr_stable_pages_required.attr,
+ &dev_attr_strictlimit.attr,
NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);
@@ -438,10 +473,10 @@ retry:
if (new_congested) {
/* !found and storage for new one already allocated, insert */
congested = new_congested;
- new_congested = NULL;
rb_link_node(&congested->rb_node, parent, node);
rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
- goto found;
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+ return congested;
}
spin_unlock_irqrestore(&cgwb_lock, flags);
@@ -451,13 +486,13 @@ retry:
if (!new_congested)
return NULL;
- atomic_set(&new_congested->refcnt, 0);
+ refcount_set(&new_congested->refcnt, 1);
new_congested->__bdi = bdi;
new_congested->blkcg_id = blkcg_id;
goto retry;
found:
- atomic_inc(&congested->refcnt);
+ refcount_inc(&congested->refcnt);
spin_unlock_irqrestore(&cgwb_lock, flags);
kfree(new_congested);
return congested;
@@ -473,11 +508,8 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
{
unsigned long flags;
- local_irq_save(flags);
- if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
- local_irq_restore(flags);
+ if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags))
return;
- }
/* bdi might already have been destroyed leaving @congested unlinked */
if (congested->__bdi) {
@@ -804,7 +836,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
if (!bdi->wb_congested)
return -ENOMEM;
- atomic_set(&bdi->wb_congested->refcnt, 1);
+ refcount_set(&bdi->wb_congested->refcnt, 1);
err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (err) {
diff --git a/mm/cma.c b/mm/cma.c
index 5809bbe360d7..4cb76121a3ab 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -395,13 +395,13 @@ static inline void cma_debug_show_areas(struct cma *cma) { }
* @cma: Contiguous memory region for which the allocation is performed.
* @count: Requested number of pages.
* @align: Requested alignment of pages (in PAGE_SIZE order).
- * @gfp_mask: GFP mask to use during compaction
+ * @no_warn: Avoid printing message about failed allocation
*
* This function allocates part of contiguous memory on specific
* contiguous memory area.
*/
struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
- gfp_t gfp_mask)
+ bool no_warn)
{
unsigned long mask, offset;
unsigned long pfn = -1;
@@ -447,7 +447,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
mutex_lock(&cma_mutex);
ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
- gfp_mask);
+ GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0));
mutex_unlock(&cma_mutex);
if (ret == 0) {
page = pfn_to_page(pfn);
@@ -466,7 +466,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
trace_cma_alloc(pfn, page, count, align);
- if (ret && !(gfp_mask & __GFP_NOWARN)) {
+ if (ret && !no_warn) {
pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
__func__, count, ret);
cma_debug_show_areas(cma);
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index f23467291cfb..ad6723e9d110 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -139,7 +139,7 @@ static int cma_alloc_mem(struct cma *cma, int count)
if (!mem)
return -ENOMEM;
- p = cma_alloc(cma, count, 0, GFP_KERNEL);
+ p = cma_alloc(cma, count, 0, false);
if (!p) {
kfree(mem);
return -ENOMEM;
diff --git a/mm/fadvise.c b/mm/fadvise.c
index afa41491d324..2d8376e3c640 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -72,8 +72,12 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
goto out;
}
- /* Careful about overflows. Len == 0 means "as much as possible" */
- endbyte = offset + len;
+ /*
+ * Careful about overflows. Len == 0 means "as much as possible". Use
+ * unsigned math because signed overflows are undefined and UBSan
+ * complains.
+ */
+ endbyte = (u64)offset + (u64)len;
if (!len || endbyte < len)
endbyte = -1;
else
diff --git a/mm/filemap.c b/mm/filemap.c
index 4602314ed62e..655878c3dcd5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1976,7 +1976,7 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
return 0;
- iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
+ iov_iter_truncate(iter, inode->i_sb->s_maxbytes - *ppos);
index = *ppos >> PAGE_SHIFT;
prev_index = ra->prev_pos >> PAGE_SHIFT;
diff --git a/mm/hmm.c b/mm/hmm.c
index f9d1d89dec4d..c968e49f7a0c 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -177,16 +177,19 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
up_write(&hmm->mirrors_sem);
}
-static void hmm_invalidate_range_start(struct mmu_notifier *mn,
+static int hmm_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool blockable)
{
struct hmm *hmm = mm->hmm;
VM_BUG_ON(!hmm);
atomic_inc(&hmm->sequence);
+
+ return 0;
}
static void hmm_invalidate_range_end(struct mmu_notifier *mn,
@@ -299,14 +302,14 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
- int r;
+ vm_fault_t ret;
flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
flags |= write_fault ? FAULT_FLAG_WRITE : 0;
- r = handle_mm_fault(vma, addr, flags);
- if (r & VM_FAULT_RETRY)
+ ret = handle_mm_fault(vma, addr, flags);
+ if (ret & VM_FAULT_RETRY)
return -EBUSY;
- if (r & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_ERROR) {
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
}
@@ -676,7 +679,8 @@ int hmm_vma_get_pfns(struct hmm_range *range)
return -EINVAL;
/* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
+ vma_is_dax(vma)) {
hmm_pfns_special(range);
return -EINVAL;
}
@@ -849,7 +853,8 @@ int hmm_vma_fault(struct hmm_range *range, bool block)
return -EINVAL;
/* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
+ vma_is_dax(vma)) {
hmm_pfns_special(range);
return -EINVAL;
}
@@ -973,10 +978,7 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
static void hmm_devmem_radix_release(struct resource *resource)
{
- resource_size_t key, align_start, align_size;
-
- align_start = resource->start & ~(PA_SECTION_SIZE - 1);
- align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);
+ resource_size_t key;
mutex_lock(&hmm_devmem_lock);
for (key = resource->start;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a4afaed29e19..be1f953e8f9d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -762,11 +762,11 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
* but we need to be consistent with PTEs and architectures that
* can't support a 'special' bit.
*/
- BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
+ !pfn_t_devmap(pfn));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- BUG_ON(!pfn_t_devmap(pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
@@ -1328,7 +1328,8 @@ alloc:
if (!page)
clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
else
- copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+ copy_user_huge_page(new_page, page, vmf->address,
+ vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
mmun_start = haddr;
@@ -1740,7 +1741,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
} else {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
- add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
+ add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
}
spin_unlock(ptl);
@@ -2090,7 +2091,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
SetPageReferenced(page);
page_remove_rmap(page, true);
put_page(page);
- add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
+ add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
} else if (is_huge_zero_pmd(*pmd)) {
/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3103099f64fd..9f1c853f67b5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1479,22 +1479,20 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
/*
* Dissolve a given free hugepage into free buddy pages. This function does
* nothing for in-use (including surplus) hugepages. Returns -EBUSY if the
- * number of free hugepages would be reduced below the number of reserved
- * hugepages.
+ * dissolution fails because a give page is not a free hugepage, or because
+ * free hugepages are fully reserved.
*/
int dissolve_free_huge_page(struct page *page)
{
- int rc = 0;
+ int rc = -EBUSY;
spin_lock(&hugetlb_lock);
if (PageHuge(page) && !page_count(page)) {
struct page *head = compound_head(page);
struct hstate *h = page_hstate(head);
int nid = page_to_nid(head);
- if (h->free_huge_pages - h->resv_huge_pages == 0) {
- rc = -EBUSY;
+ if (h->free_huge_pages - h->resv_huge_pages == 0)
goto out;
- }
/*
* Move PageHWPoison flag from head page to the raw error page,
* which makes any subpages rather than the error page reusable.
@@ -1508,6 +1506,7 @@ int dissolve_free_huge_page(struct page *page)
h->free_huge_pages_node[nid]--;
h->max_huge_pages--;
update_and_free_page(h, head);
+ rc = 0;
}
out:
spin_unlock(&hugetlb_lock);
@@ -2101,7 +2100,7 @@ int __alloc_bootmem_huge_page(struct hstate *h)
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;
- addr = memblock_virt_alloc_try_nid_nopanic(
+ addr = memblock_virt_alloc_try_nid_raw(
huge_page_size(h), huge_page_size(h),
0, BOOTMEM_ALLOC_ACCESSIBLE, node);
if (addr) {
@@ -2119,6 +2118,7 @@ int __alloc_bootmem_huge_page(struct hstate *h)
found:
BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
/* Put them into a private list first because mem_map is not up yet */
+ INIT_LIST_HEAD(&m->list);
list_add(&m->list, &huge_boot_pages);
m->hstate = h;
return 1;
@@ -2139,16 +2139,9 @@ static void __init gather_bootmem_prealloc(void)
struct huge_bootmem_page *m;
list_for_each_entry(m, &huge_boot_pages, list) {
+ struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
- struct page *page;
-#ifdef CONFIG_HIGHMEM
- page = pfn_to_page(m->phys >> PAGE_SHIFT);
- memblock_free_late(__pa(m),
- sizeof(struct huge_bootmem_page));
-#else
- page = virt_to_page(m);
-#endif
WARN_ON(page_count(page) != 1);
prep_compound_huge_page(page, h->order);
WARN_ON(PageReserved(page));
@@ -3518,6 +3511,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
int ret = 0, outside_reserve = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ unsigned long haddr = address & huge_page_mask(h);
pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
@@ -3527,7 +3521,7 @@ retry_avoidcopy:
* and just make the page writable */
if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
page_move_anon_rmap(old_page, vma);
- set_huge_ptep_writable(vma, address, ptep);
+ set_huge_ptep_writable(vma, haddr, ptep);
return 0;
}
@@ -3551,7 +3545,7 @@ retry_avoidcopy:
* be acquired again before returning to the caller, as expected.
*/
spin_unlock(ptl);
- new_page = alloc_huge_page(vma, address, outside_reserve);
+ new_page = alloc_huge_page(vma, haddr, outside_reserve);
if (IS_ERR(new_page)) {
/*
@@ -3564,11 +3558,10 @@ retry_avoidcopy:
if (outside_reserve) {
put_page(old_page);
BUG_ON(huge_pte_none(pte));
- unmap_ref_private(mm, vma, old_page, address);
+ unmap_ref_private(mm, vma, old_page, haddr);
BUG_ON(huge_pte_none(pte));
spin_lock(ptl);
- ptep = huge_pte_offset(mm, address & huge_page_mask(h),
- huge_page_size(h));
+ ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep &&
pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
@@ -3598,7 +3591,7 @@ retry_avoidcopy:
__SetPageUptodate(new_page);
set_page_huge_active(new_page);
- mmun_start = address & huge_page_mask(h);
+ mmun_start = haddr;
mmun_end = mmun_start + huge_page_size(h);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
@@ -3607,25 +3600,24 @@ retry_avoidcopy:
* before the page tables are altered
*/
spin_lock(ptl);
- ptep = huge_pte_offset(mm, address & huge_page_mask(h),
- huge_page_size(h));
+ ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
ClearPagePrivate(new_page);
/* Break COW */
- huge_ptep_clear_flush(vma, address, ptep);
+ huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
- set_huge_pte_at(mm, address, ptep,
+ set_huge_pte_at(mm, haddr, ptep,
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
- hugepage_add_new_anon_rmap(new_page, vma, address);
+ hugepage_add_new_anon_rmap(new_page, vma, haddr);
/* Make the old page be freed below */
new_page = old_page;
}
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out_release_all:
- restore_reserve_on_error(h, vma, address, new_page);
+ restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
out_release_old:
put_page(old_page);
@@ -3828,7 +3820,7 @@ retry:
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, haddr, ptep, page, ptl);
+ ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
}
spin_unlock(ptl);
@@ -3982,7 +3974,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
- ret = hugetlb_cow(mm, vma, haddr, ptep,
+ ret = hugetlb_cow(mm, vma, address, ptep,
pagecache_page, ptl);
goto out_put_page;
}
diff --git a/mm/internal.h b/mm/internal.h
index 9e3654d70289..dab088cb6937 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -389,18 +389,6 @@ static inline struct page *mem_map_next(struct page *iter,
return iter + 1;
}
-/*
- * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
- * so all functions starting at paging_init should be marked __init
- * in those cases. SPARSEMEM, however, allows for memory hotplug,
- * and alloc_bootmem_node is not used.
- */
-#ifdef CONFIG_SPARSEMEM
-#define __paginginit __meminit
-#else
-#define __paginginit __init
-#endif
-
/* Memory initialisation debug and verification */
enum mminit_level {
MMINIT_WARNING,
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
index f436246ccc79..7a2a2f13f86f 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/kasan_init.c
@@ -17,10 +17,13 @@
#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/pfn.h>
+#include <linux/slab.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
+#include "kasan.h"
+
/*
* This page serves two purposes:
* - It used as early shadow memory. The entire shadow region populated
@@ -32,22 +35,59 @@ unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
#if CONFIG_PGTABLE_LEVELS > 4
p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss;
+static inline bool kasan_p4d_table(pgd_t pgd)
+{
+ return pgd_page(pgd) == virt_to_page(lm_alias(kasan_zero_p4d));
+}
+#else
+static inline bool kasan_p4d_table(pgd_t pgd)
+{
+ return 0;
+}
#endif
#if CONFIG_PGTABLE_LEVELS > 3
pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
+static inline bool kasan_pud_table(p4d_t p4d)
+{
+ return p4d_page(p4d) == virt_to_page(lm_alias(kasan_zero_pud));
+}
+#else
+static inline bool kasan_pud_table(p4d_t p4d)
+{
+ return 0;
+}
#endif
#if CONFIG_PGTABLE_LEVELS > 2
pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
+static inline bool kasan_pmd_table(pud_t pud)
+{
+ return pud_page(pud) == virt_to_page(lm_alias(kasan_zero_pmd));
+}
+#else
+static inline bool kasan_pmd_table(pud_t pud)
+{
+ return 0;
+}
#endif
pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
+static inline bool kasan_pte_table(pmd_t pmd)
+{
+ return pmd_page(pmd) == virt_to_page(lm_alias(kasan_zero_pte));
+}
+
+static inline bool kasan_zero_page_entry(pte_t pte)
+{
+ return pte_page(pte) == virt_to_page(lm_alias(kasan_zero_page));
+}
+
static __init void *early_alloc(size_t size, int node)
{
return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS),
BOOTMEM_ALLOC_ACCESSIBLE, node);
}
-static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
+static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr,
unsigned long end)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
@@ -63,7 +103,7 @@ static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
}
}
-static void __init zero_pmd_populate(pud_t *pud, unsigned long addr,
+static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
unsigned long end)
{
pmd_t *pmd = pmd_offset(pud, addr);
@@ -78,14 +118,24 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr,
}
if (pmd_none(*pmd)) {
- pmd_populate_kernel(&init_mm, pmd,
- early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ pte_t *p;
+
+ if (slab_is_available())
+ p = pte_alloc_one_kernel(&init_mm, addr);
+ else
+ p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
+ if (!p)
+ return -ENOMEM;
+
+ pmd_populate_kernel(&init_mm, pmd, p);
}
zero_pte_populate(pmd, addr, next);
} while (pmd++, addr = next, addr != end);
+
+ return 0;
}
-static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr,
+static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
unsigned long end)
{
pud_t *pud = pud_offset(p4d, addr);
@@ -103,14 +153,24 @@ static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr,
}
if (pud_none(*pud)) {
- pud_populate(&init_mm, pud,
- early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ pmd_t *p;
+
+ if (slab_is_available()) {
+ p = pmd_alloc(&init_mm, pud, addr);
+ if (!p)
+ return -ENOMEM;
+ } else {
+ pud_populate(&init_mm, pud,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
}
zero_pmd_populate(pud, addr, next);
} while (pud++, addr = next, addr != end);
+
+ return 0;
}
-static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr,
+static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
unsigned long end)
{
p4d_t *p4d = p4d_offset(pgd, addr);
@@ -132,11 +192,21 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr,
}
if (p4d_none(*p4d)) {
- p4d_populate(&init_mm, p4d,
- early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ pud_t *p;
+
+ if (slab_is_available()) {
+ p = pud_alloc(&init_mm, p4d, addr);
+ if (!p)
+ return -ENOMEM;
+ } else {
+ p4d_populate(&init_mm, p4d,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
}
zero_pud_populate(p4d, addr, next);
} while (p4d++, addr = next, addr != end);
+
+ return 0;
}
/**
@@ -145,7 +215,7 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr,
* @shadow_start - start of the memory range to populate
* @shadow_end - end of the memory range to populate
*/
-void __init kasan_populate_zero_shadow(const void *shadow_start,
+int __ref kasan_populate_zero_shadow(const void *shadow_start,
const void *shadow_end)
{
unsigned long addr = (unsigned long)shadow_start;
@@ -191,9 +261,229 @@ void __init kasan_populate_zero_shadow(const void *shadow_start,
}
if (pgd_none(*pgd)) {
- pgd_populate(&init_mm, pgd,
- early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ p4d_t *p;
+
+ if (slab_is_available()) {
+ p = p4d_alloc(&init_mm, pgd, addr);
+ if (!p)
+ return -ENOMEM;
+ } else {
+ pgd_populate(&init_mm, pgd,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
}
zero_p4d_populate(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
+
+ return 0;
+}
+
+static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd)
+{
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (!pte_none(*pte))
+ return;
+ }
+
+ pte_free_kernel(&init_mm, (pte_t *)page_to_virt(pmd_page(*pmd)));
+ pmd_clear(pmd);
+}
+
+static void kasan_free_pmd(pmd_t *pmd_start, pud_t *pud)
+{
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (!pmd_none(*pmd))
+ return;
+ }
+
+ pmd_free(&init_mm, (pmd_t *)page_to_virt(pud_page(*pud)));
+ pud_clear(pud);
+}
+
+static void kasan_free_pud(pud_t *pud_start, p4d_t *p4d)
+{
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (!pud_none(*pud))
+ return;
+ }
+
+ pud_free(&init_mm, (pud_t *)page_to_virt(p4d_page(*p4d)));
+ p4d_clear(p4d);
+}
+
+static void kasan_free_p4d(p4d_t *p4d_start, pgd_t *pgd)
+{
+ p4d_t *p4d;
+ int i;
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ p4d = p4d_start + i;
+ if (!p4d_none(*p4d))
+ return;
+ }
+
+ p4d_free(&init_mm, (p4d_t *)page_to_virt(pgd_page(*pgd)));
+ pgd_clear(pgd);
+}
+
+static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, pte++) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ if (!pte_present(*pte))
+ continue;
+
+ if (WARN_ON(!kasan_zero_page_entry(*pte)))
+ continue;
+ pte_clear(&init_mm, addr, pte);
+ }
+}
+
+static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, pmd++) {
+ pte_t *pte;
+
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(*pmd))
+ continue;
+
+ if (kasan_pte_table(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE))
+ pmd_clear(pmd);
+ continue;
+ }
+ pte = pte_offset_kernel(pmd, addr);
+ kasan_remove_pte_table(pte, addr, next);
+ kasan_free_pte(pte_offset_kernel(pmd, 0), pmd);
+ }
+}
+
+static void kasan_remove_pud_table(pud_t *pud, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, pud++) {
+ pmd_t *pmd, *pmd_base;
+
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(*pud))
+ continue;
+
+ if (kasan_pmd_table(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE))
+ pud_clear(pud);
+ continue;
+ }
+ pmd = pmd_offset(pud, addr);
+ pmd_base = pmd_offset(pud, 0);
+ kasan_remove_pmd_table(pmd, addr, next);
+ kasan_free_pmd(pmd_base, pud);
+ }
+}
+
+static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, p4d++) {
+ pud_t *pud;
+
+ next = p4d_addr_end(addr, end);
+
+ if (!p4d_present(*p4d))
+ continue;
+
+ if (kasan_pud_table(*p4d)) {
+ if (IS_ALIGNED(addr, P4D_SIZE) &&
+ IS_ALIGNED(next, P4D_SIZE))
+ p4d_clear(p4d);
+ continue;
+ }
+ pud = pud_offset(p4d, addr);
+ kasan_remove_pud_table(pud, addr, next);
+ kasan_free_pud(pud_offset(p4d, 0), p4d);
+ }
+}
+
+void kasan_remove_zero_shadow(void *start, unsigned long size)
+{
+ unsigned long addr, end, next;
+ pgd_t *pgd;
+
+ addr = (unsigned long)kasan_mem_to_shadow(start);
+ end = addr + (size >> KASAN_SHADOW_SCALE_SHIFT);
+
+ if (WARN_ON((unsigned long)start %
+ (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) ||
+ WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
+ return;
+
+ for (; addr < end; addr = next) {
+ p4d_t *p4d;
+
+ next = pgd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ if (!pgd_present(*pgd))
+ continue;
+
+ if (kasan_p4d_table(*pgd)) {
+ if (IS_ALIGNED(addr, PGDIR_SIZE) &&
+ IS_ALIGNED(next, PGDIR_SIZE))
+ pgd_clear(pgd);
+ continue;
+ }
+
+ p4d = p4d_offset(pgd, addr);
+ kasan_remove_p4d_table(p4d, addr, next);
+ kasan_free_p4d(p4d_offset(pgd, 0), pgd);
+ }
+}
+
+int kasan_add_zero_shadow(void *start, unsigned long size)
+{
+ int ret;
+ void *shadow_start, *shadow_end;
+
+ shadow_start = kasan_mem_to_shadow(start);
+ shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT);
+
+ if (WARN_ON((unsigned long)start %
+ (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) ||
+ WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
+ return -EINVAL;
+
+ ret = kasan_populate_zero_shadow(shadow_start, shadow_end);
+ if (ret)
+ kasan_remove_zero_shadow(shadow_start,
+ size >> KASAN_SHADOW_SCALE_SHIFT);
+ return ret;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 98edecfac764..35c5b0b98ebe 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -397,6 +397,26 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
+static bool hugepage_vma_check(struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+ (vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ return false;
+ if (shmem_file(vma->vm_file)) {
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ return false;
+ return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+ HPAGE_PMD_NR);
+ }
+ if (!vma->anon_vma || vma->vm_ops)
+ return false;
+ if (is_vma_temporary_stack(vma))
+ return false;
+ return !(vm_flags & VM_NO_KHUGEPAGED);
+}
+
int __khugepaged_enter(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
@@ -434,15 +454,14 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
unsigned long vm_flags)
{
unsigned long hstart, hend;
- if (!vma->anon_vma)
- /*
- * Not yet faulted in so we will register later in the
- * page fault if needed.
- */
- return 0;
- if (vma->vm_ops || (vm_flags & VM_NO_KHUGEPAGED))
- /* khugepaged not yet working on file or special mappings */
+
+ /*
+ * khugepaged does not yet work on non-shmem files or special
+ * mappings. And file-private shmem THP is not supported.
+ */
+ if (!hugepage_vma_check(vma, vm_flags))
return 0;
+
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
@@ -819,25 +838,6 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
}
#endif
-static bool hugepage_vma_check(struct vm_area_struct *vma)
-{
- if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
- (vma->vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
- return false;
- if (shmem_file(vma->vm_file)) {
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
- return false;
- return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
- HPAGE_PMD_NR);
- }
- if (!vma->anon_vma || vma->vm_ops)
- return false;
- if (is_vma_temporary_stack(vma))
- return false;
- return !(vma->vm_flags & VM_NO_KHUGEPAGED);
-}
-
/*
* If mmap_sem temporarily dropped, revalidate vma
* before taking mmap_sem.
@@ -862,7 +862,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
hend = vma->vm_end & HPAGE_PMD_MASK;
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
return SCAN_ADDRESS_RANGE;
- if (!hugepage_vma_check(vma))
+ if (!hugepage_vma_check(vma, vma->vm_flags))
return SCAN_VMA_CHECK;
return 0;
}
@@ -1490,6 +1490,8 @@ xa_unlocked:
unlock_page(new_page);
*hpage = NULL;
+
+ khugepaged_pages_collapsed++;
} else {
struct page *page;
/* Something went wrong: roll back page cache changes */
@@ -1660,7 +1662,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
progress++;
break;
}
- if (!hugepage_vma_check(vma)) {
+ if (!hugepage_vma_check(vma, vma->vm_flags)) {
skip:
progress++;
continue;
diff --git a/mm/ksm.c b/mm/ksm.c
index a6d43cf9a982..1bd514c9e5d0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -470,7 +470,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
- int ret = 0;
+ vm_fault_t ret = 0;
do {
cond_resched();
@@ -703,7 +703,7 @@ again:
* We cannot do anything with the page while its refcount is 0.
* Usually 0 means free, or tail of a higher-order page: in which
* case this node is no longer referenced, and should be freed;
- * however, it might mean that the page is under page_freeze_refs().
+ * however, it might mean that the page is under page_ref_freeze().
* The __remove_mapping() case is easy, again the node is now stale;
* but if page is swapcache in migrate_page_move_mapping(), it might
* still be our page, in which case it's essential to keep the node.
@@ -714,7 +714,7 @@ again:
* work here too. We have chosen the !PageSwapCache test to
* optimize the common case, when the page is or is about to
* be freed: PageSwapCache is cleared (under spin_lock_irq)
- * in the freeze_refs section of __remove_mapping(); but Anon
+ * in the ref_freeze section of __remove_mapping(); but Anon
* page->mapping reset to NULL later, in free_pages_prepare().
*/
if (!PageSwapCache(page))
@@ -2430,6 +2430,9 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
VM_HUGETLB | VM_MIXEDMAP))
return 0; /* just ignore the advice */
+ if (vma_is_dax(vma))
+ return 0;
+
#ifdef VM_SAO
if (*vm_flags & VM_SAO)
return 0;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index fcfb6c89ed47..89349a0276de 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -8,11 +8,12 @@
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/list_lru.h>
+#include <linux/prefetch.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/memcontrol.h>
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
static LIST_HEAD(list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);
@@ -29,17 +30,12 @@ static void list_lru_unregister(struct list_lru *lru)
list_del(&lru->list);
mutex_unlock(&list_lrus_mutex);
}
-#else
-static void list_lru_register(struct list_lru *lru)
-{
-}
-static void list_lru_unregister(struct list_lru *lru)
+static int lru_shrinker_id(struct list_lru *lru)
{
+ return lru->shrinker_id;
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
/*
@@ -75,20 +71,39 @@ static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
}
static inline struct list_lru_one *
-list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
+ struct mem_cgroup **memcg_ptr)
{
- struct mem_cgroup *memcg;
+ struct list_lru_one *l = &nlru->lru;
+ struct mem_cgroup *memcg = NULL;
if (!nlru->memcg_lrus)
- return &nlru->lru;
+ goto out;
memcg = mem_cgroup_from_kmem(ptr);
if (!memcg)
- return &nlru->lru;
+ goto out;
- return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+ l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+out:
+ if (memcg_ptr)
+ *memcg_ptr = memcg;
+ return l;
}
#else
+static void list_lru_register(struct list_lru *lru)
+{
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+}
+
+static int lru_shrinker_id(struct list_lru *lru)
+{
+ return -1;
+}
+
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
return false;
@@ -101,23 +116,30 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
}
static inline struct list_lru_one *
-list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
+ struct mem_cgroup **memcg_ptr)
{
+ if (memcg_ptr)
+ *memcg_ptr = NULL;
return &nlru->lru;
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
+ struct mem_cgroup *memcg;
struct list_lru_one *l;
spin_lock(&nlru->lock);
if (list_empty(item)) {
- l = list_lru_from_kmem(nlru, item);
+ l = list_lru_from_kmem(nlru, item, &memcg);
list_add_tail(item, &l->list);
- l->nr_items++;
+ /* Set shrinker bit if the first element was added */
+ if (!l->nr_items++)
+ memcg_set_shrinker_bit(memcg, nid,
+ lru_shrinker_id(lru));
nlru->nr_items++;
spin_unlock(&nlru->lock);
return true;
@@ -133,9 +155,15 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
+ /*
+ * Prefetch the neighboring list entries to reduce lock hold time.
+ */
+ prefetchw(item->prev);
+ prefetchw(item->next);
+
spin_lock(&nlru->lock);
if (!list_empty(item)) {
- l = list_lru_from_kmem(nlru, item);
+ l = list_lru_from_kmem(nlru, item, NULL);
list_del_init(item);
l->nr_items--;
nlru->nr_items--;
@@ -162,26 +190,20 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
}
EXPORT_SYMBOL_GPL(list_lru_isolate_move);
-static unsigned long __list_lru_count_one(struct list_lru *lru,
- int nid, int memcg_idx)
+unsigned long list_lru_count_one(struct list_lru *lru,
+ int nid, struct mem_cgroup *memcg)
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
unsigned long count;
rcu_read_lock();
- l = list_lru_from_memcg_idx(nlru, memcg_idx);
+ l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
count = l->nr_items;
rcu_read_unlock();
return count;
}
-
-unsigned long list_lru_count_one(struct list_lru *lru,
- int nid, struct mem_cgroup *memcg)
-{
- return __list_lru_count_one(lru, nid, memcg_cache_id(memcg));
-}
EXPORT_SYMBOL_GPL(list_lru_count_one);
unsigned long list_lru_count_node(struct list_lru *lru, int nid)
@@ -194,17 +216,15 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid)
EXPORT_SYMBOL_GPL(list_lru_count_node);
static unsigned long
-__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
+__list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
- struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
struct list_head *item, *n;
unsigned long isolated = 0;
- spin_lock(&nlru->lock);
l = list_lru_from_memcg_idx(nlru, memcg_idx);
restart:
list_for_each_safe(item, n, &l->list) {
@@ -250,8 +270,6 @@ restart:
BUG();
}
}
-
- spin_unlock(&nlru->lock);
return isolated;
}
@@ -260,11 +278,32 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
- return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg),
- isolate, cb_arg, nr_to_walk);
+ struct list_lru_node *nlru = &lru->node[nid];
+ unsigned long ret;
+
+ spin_lock(&nlru->lock);
+ ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
+ nr_to_walk);
+ spin_unlock(&nlru->lock);
+ return ret;
}
EXPORT_SYMBOL_GPL(list_lru_walk_one);
+unsigned long
+list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
+{
+ struct list_lru_node *nlru = &lru->node[nid];
+ unsigned long ret;
+
+ spin_lock_irq(&nlru->lock);
+ ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
+ nr_to_walk);
+ spin_unlock_irq(&nlru->lock);
+ return ret;
+}
+
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
@@ -272,12 +311,18 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
long isolated = 0;
int memcg_idx;
- isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg,
- nr_to_walk);
+ isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
+ nr_to_walk);
if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
for_each_memcg_cache_index(memcg_idx) {
- isolated += __list_lru_walk_one(lru, nid, memcg_idx,
- isolate, cb_arg, nr_to_walk);
+ struct list_lru_node *nlru = &lru->node[nid];
+
+ spin_lock(&nlru->lock);
+ isolated += __list_lru_walk_one(nlru, memcg_idx,
+ isolate, cb_arg,
+ nr_to_walk);
+ spin_unlock(&nlru->lock);
+
if (*nr_to_walk <= 0)
break;
}
@@ -292,7 +337,7 @@ static void init_one_lru(struct list_lru_one *l)
l->nr_items = 0;
}
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
int begin, int end)
{
@@ -500,10 +545,13 @@ fail:
goto out;
}
-static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
- int src_idx, int dst_idx)
+static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
+ int src_idx, struct mem_cgroup *dst_memcg)
{
+ struct list_lru_node *nlru = &lru->node[nid];
+ int dst_idx = dst_memcg->kmemcg_id;
struct list_lru_one *src, *dst;
+ bool set;
/*
* Since list_lru_{add,del} may be called under an IRQ-safe lock,
@@ -515,14 +563,17 @@ static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
dst = list_lru_from_memcg_idx(nlru, dst_idx);
list_splice_init(&src->list, &dst->list);
+ set = (!dst->nr_items && src->nr_items);
dst->nr_items += src->nr_items;
+ if (set)
+ memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
src->nr_items = 0;
spin_unlock_irq(&nlru->lock);
}
static void memcg_drain_list_lru(struct list_lru *lru,
- int src_idx, int dst_idx)
+ int src_idx, struct mem_cgroup *dst_memcg)
{
int i;
@@ -530,16 +581,16 @@ static void memcg_drain_list_lru(struct list_lru *lru,
return;
for_each_node(i)
- memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
+ memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg);
}
-void memcg_drain_all_list_lrus(int src_idx, int dst_idx)
+void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg)
{
struct list_lru *lru;
mutex_lock(&list_lrus_mutex);
list_for_each_entry(lru, &list_lrus, list)
- memcg_drain_list_lru(lru, src_idx, dst_idx);
+ memcg_drain_list_lru(lru, src_idx, dst_memcg);
mutex_unlock(&list_lrus_mutex);
}
#else
@@ -551,15 +602,21 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
static void memcg_destroy_list_lru(struct list_lru *lru)
{
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
- struct lock_class_key *key)
+ struct lock_class_key *key, struct shrinker *shrinker)
{
int i;
size_t size = sizeof(*lru->node) * nr_node_ids;
int err = -ENOMEM;
+#ifdef CONFIG_MEMCG_KMEM
+ if (shrinker)
+ lru->shrinker_id = shrinker->id;
+ else
+ lru->shrinker_id = -1;
+#endif
memcg_get_cache_ids();
lru->node = kzalloc(size, GFP_KERNEL);
@@ -602,6 +659,9 @@ void list_lru_destroy(struct list_lru *lru)
kfree(lru->node);
lru->node = NULL;
+#ifdef CONFIG_MEMCG_KMEM
+ lru->shrinker_id = -1;
+#endif
memcg_put_cache_ids();
}
EXPORT_SYMBOL_GPL(list_lru_destroy);
diff --git a/mm/memblock.c b/mm/memblock.c
index b4ad05764745..d894b5ffc8c2 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -392,7 +392,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
{
struct memblock_region *new_array, *old_array;
phys_addr_t old_alloc_size, new_alloc_size;
- phys_addr_t old_size, new_size, addr;
+ phys_addr_t old_size, new_size, addr, new_end;
int use_slab = slab_is_available();
int *in_slab;
@@ -453,9 +453,9 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
return -1;
}
- memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
- type->name, type->max * 2, (u64)addr,
- (u64)addr + new_size - 1);
+ new_end = addr + new_size - 1;
+ memblock_dbg("memblock: %s is doubled to %ld at [%pa-%pa]",
+ type->name, type->max * 2, &addr, &new_end);
/*
* Found space, we now need to move the array over before we add the
@@ -1231,6 +1231,81 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#ifdef CONFIG_HAVE_MEMBLOCK_PFN_VALID
+static int early_region_idx __init_memblock = -1;
+ulong __init_memblock memblock_next_valid_pfn(ulong pfn)
+{
+ struct memblock_type *type = &memblock.memory;
+ struct memblock_region *regions = type->regions;
+ uint right = type->cnt;
+ uint mid, left = 0;
+ ulong start_pfn, end_pfn, next_start_pfn;
+ phys_addr_t addr = PFN_PHYS(++pfn);
+
+ /* fast path, return pfn+1 if next pfn is in the same region */
+ if (early_region_idx != -1) {
+ start_pfn = PFN_DOWN(regions[early_region_idx].base);
+ end_pfn = PFN_DOWN(regions[early_region_idx].base +
+ regions[early_region_idx].size);
+
+ if (pfn >= start_pfn && pfn < end_pfn)
+ return pfn;
+
+ early_region_idx++;
+ next_start_pfn = PFN_DOWN(regions[early_region_idx].base);
+
+ if (pfn >= end_pfn && pfn <= next_start_pfn)
+ return next_start_pfn;
+ }
+
+ /* slow path, do the binary searching */
+ do {
+ mid = (right + left) / 2;
+
+ if (addr < regions[mid].base)
+ right = mid;
+ else if (addr >= (regions[mid].base + regions[mid].size))
+ left = mid + 1;
+ else {
+ early_region_idx = mid;
+ return pfn;
+ }
+ } while (left < right);
+
+ if (right == type->cnt)
+ return -1UL;
+
+ early_region_idx = right;
+
+ return PHYS_PFN(regions[early_region_idx].base);
+}
+EXPORT_SYMBOL(memblock_next_valid_pfn);
+
+int pfn_valid_region(ulong pfn)
+{
+ ulong start_pfn, end_pfn;
+ struct memblock_type *type = &memblock.memory;
+ struct memblock_region *regions = type->regions;
+
+ if (early_region_idx != -1) {
+ start_pfn = PFN_DOWN(regions[early_region_idx].base);
+ end_pfn = PFN_DOWN(regions[early_region_idx].base +
+ regions[early_region_idx].size);
+
+ if (pfn >= start_pfn && pfn < end_pfn)
+ return !memblock_is_nomap(
+ &regions[early_region_idx]);
+ }
+
+ early_region_idx = memblock_search_pfn_regions(pfn);
+ if (early_region_idx == -1)
+ return false;
+
+ return !memblock_is_nomap(&regions[early_region_idx]);
+}
+EXPORT_SYMBOL(pfn_valid_region);
+#endif /*CONFIG_HAVE_MEMBLOCK_PFN_VALID*/
+
static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
phys_addr_t end, int nid,
@@ -1438,9 +1513,9 @@ void * __init memblock_virt_alloc_try_nid_raw(
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr, (void *)_RET_IP_);
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr,
+ &max_addr, (void *)_RET_IP_);
ptr = memblock_virt_alloc_internal(size, align,
min_addr, max_addr, nid);
@@ -1475,9 +1550,9 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr, (void *)_RET_IP_);
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr,
+ &max_addr, (void *)_RET_IP_);
ptr = memblock_virt_alloc_internal(size, align,
min_addr, max_addr, nid);
@@ -1511,9 +1586,9 @@ void * __init memblock_virt_alloc_try_nid(
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr, (void *)_RET_IP_);
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr,
+ &max_addr, (void *)_RET_IP_);
ptr = memblock_virt_alloc_internal(size, align,
min_addr, max_addr, nid);
if (ptr) {
@@ -1521,9 +1596,8 @@ void * __init memblock_virt_alloc_try_nid(
return ptr;
}
- panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr);
+ panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr);
return NULL;
}
#endif
@@ -1538,9 +1612,10 @@ void * __init memblock_virt_alloc_try_nid(
*/
void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
{
- memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
- __func__, (u64)base, (u64)base + size - 1,
- (void *)_RET_IP_);
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("%s: [%pa-%pa] %pF\n",
+ __func__, &base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
memblock_remove_range(&memblock.reserved, base, size);
}
@@ -1556,11 +1631,11 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
*/
void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
{
- u64 cursor, end;
+ phys_addr_t cursor, end;
- memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
- __func__, (u64)base, (u64)base + size - 1,
- (void *)_RET_IP_);
+ end = base + size - 1;
+ memblock_dbg("%s: [%pa-%pa] %pF\n",
+ __func__, &base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
cursor = PFN_UP(base);
end = PFN_DOWN(base + size);
@@ -1718,6 +1793,15 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr
return -1;
}
+/* search memblock with the input pfn, return the region idx */
+int __init_memblock memblock_search_pfn_regions(unsigned long pfn)
+{
+ struct memblock_type *type = &memblock.memory;
+ int mid = memblock_search(type, PFN_PHYS(pfn));
+
+ return mid;
+}
+
bool __init memblock_is_reserved(phys_addr_t addr)
{
return memblock_search(&memblock.reserved, addr) != -1;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 298f0d41483d..4e3c1315b1de 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -233,6 +233,21 @@ enum res_type {
/* Used for OOM nofiier */
#define OOM_CONTROL (0)
+/*
+ * Iteration constructs for visiting all cgroups (under a tree). If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root) \
+ for (iter = mem_cgroup_iter(root, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(root, iter, NULL))
+
+#define for_each_mem_cgroup(iter) \
+ for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(NULL, iter, NULL))
+
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -246,12 +261,7 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
-static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
-{
- return (memcg == root_mem_cgroup);
-}
-
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
* The main reason for not using cgroup id for this:
@@ -305,7 +315,135 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
struct workqueue_struct *memcg_kmem_cache_wq;
-#endif /* !CONFIG_SLOB */
+static int memcg_shrinker_map_size;
+static DEFINE_MUTEX(memcg_shrinker_map_mutex);
+
+static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
+{
+ kvfree(container_of(head, struct memcg_shrinker_map, rcu));
+}
+
+static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
+ int size, int old_size)
+{
+ struct memcg_shrinker_map *new, *old;
+ int nid;
+
+ lockdep_assert_held(&memcg_shrinker_map_mutex);
+
+ for_each_node(nid) {
+ old = rcu_dereference_protected(
+ mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
+ /* Not yet online memcg */
+ if (!old)
+ return 0;
+
+ new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ /* Set all old bits, clear all new bits */
+ memset(new->map, (int)0xff, old_size);
+ memset((void *)new->map + old_size, 0, size - old_size);
+
+ rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
+ call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
+ }
+
+ return 0;
+}
+
+static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_per_node *pn;
+ struct memcg_shrinker_map *map;
+ int nid;
+
+ if (mem_cgroup_is_root(memcg))
+ return;
+
+ for_each_node(nid) {
+ pn = mem_cgroup_nodeinfo(memcg, nid);
+ map = rcu_dereference_protected(pn->shrinker_map, true);
+ if (map)
+ kvfree(map);
+ rcu_assign_pointer(pn->shrinker_map, NULL);
+ }
+}
+
+static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
+{
+ struct memcg_shrinker_map *map;
+ int nid, size, ret = 0;
+
+ if (mem_cgroup_is_root(memcg))
+ return 0;
+
+ mutex_lock(&memcg_shrinker_map_mutex);
+ size = memcg_shrinker_map_size;
+ for_each_node(nid) {
+ map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
+ if (!map) {
+ memcg_free_shrinker_maps(memcg);
+ ret = -ENOMEM;
+ break;
+ }
+ rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
+ }
+ mutex_unlock(&memcg_shrinker_map_mutex);
+
+ return ret;
+}
+
+int memcg_expand_shrinker_maps(int new_id)
+{
+ int size, old_size, ret = 0;
+ struct mem_cgroup *memcg;
+
+ size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
+ old_size = memcg_shrinker_map_size;
+ if (size <= old_size)
+ return 0;
+
+ mutex_lock(&memcg_shrinker_map_mutex);
+ if (!root_mem_cgroup)
+ goto unlock;
+
+ for_each_mem_cgroup(memcg) {
+ if (mem_cgroup_is_root(memcg))
+ continue;
+ ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
+ if (ret)
+ goto unlock;
+ }
+unlock:
+ if (!ret)
+ memcg_shrinker_map_size = size;
+ mutex_unlock(&memcg_shrinker_map_mutex);
+ return ret;
+}
+
+void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
+{
+ if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
+ struct memcg_shrinker_map *map;
+
+ rcu_read_lock();
+ map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
+ /* Pairs with smp mb in shrink_slab() */
+ smp_mb__before_atomic();
+ set_bit(shrinker_id, map->map);
+ rcu_read_unlock();
+ }
+}
+
+#else /* CONFIG_MEMCG_KMEM */
+static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
+#endif /* CONFIG_MEMCG_KMEM */
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
@@ -678,9 +816,20 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
}
EXPORT_SYMBOL(mem_cgroup_from_task);
-static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+/**
+ * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
+ * @mm: mm from which memcg should be extracted. It can be NULL.
+ *
+ * Obtain a reference on mm->memcg and returns it if successful. Otherwise
+ * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
+ * returned.
+ */
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
- struct mem_cgroup *memcg = NULL;
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled())
+ return NULL;
rcu_read_lock();
do {
@@ -700,6 +849,46 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
rcu_read_unlock();
return memcg;
}
+EXPORT_SYMBOL(get_mem_cgroup_from_mm);
+
+/**
+ * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
+ * @page: page from which memcg should be extracted.
+ *
+ * Obtain a reference on page->memcg and returns it if successful. Otherwise
+ * root_mem_cgroup is returned.
+ */
+struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
+{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ rcu_read_lock();
+ if (!memcg || !css_tryget_online(&memcg->css))
+ memcg = root_mem_cgroup;
+ rcu_read_unlock();
+ return memcg;
+}
+EXPORT_SYMBOL(get_mem_cgroup_from_page);
+
+/**
+ * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
+ */
+static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+{
+ if (unlikely(current->active_memcg)) {
+ struct mem_cgroup *memcg = root_mem_cgroup;
+
+ rcu_read_lock();
+ if (css_tryget_online(&current->active_memcg->css))
+ memcg = current->active_memcg;
+ rcu_read_unlock();
+ return memcg;
+ }
+ return get_mem_cgroup_from_mm(current->mm);
+}
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
@@ -862,21 +1051,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
}
}
-/*
- * Iteration constructs for visiting all cgroups (under a tree). If
- * loops are exited prematurely (break), mem_cgroup_iter_break() must
- * be used for reference counting.
- */
-#define for_each_mem_cgroup_tree(iter, root) \
- for (iter = mem_cgroup_iter(root, NULL, NULL); \
- iter != NULL; \
- iter = mem_cgroup_iter(root, iter, NULL))
-
-#define for_each_mem_cgroup(iter) \
- for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
- iter != NULL; \
- iter = mem_cgroup_iter(NULL, iter, NULL))
-
/**
* mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
* @memcg: hierarchy root
@@ -1483,28 +1657,53 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
-static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+enum oom_status {
+ OOM_SUCCESS,
+ OOM_FAILED,
+ OOM_ASYNC,
+ OOM_SKIPPED
+};
+
+static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER)
- return;
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ return OOM_SKIPPED;
+
/*
* We are in the middle of the charge context here, so we
* don't want to block when potentially sitting on a callstack
* that holds all kinds of filesystem and mm locks.
*
- * Also, the caller may handle a failed allocation gracefully
- * (like optional page cache readahead) and so an OOM killer
- * invocation might not even be necessary.
+ * cgroup1 allows disabling the OOM killer and waiting for outside
+ * handling until the charge can succeed; remember the context and put
+ * the task to sleep at the end of the page fault when all locks are
+ * released.
*
- * That's why we don't do anything here except remember the
- * OOM context and then deal with it at the end of the page
- * fault when the stack is unwound, the locks are released,
- * and when we know whether the fault was overall successful.
+ * On the other hand, in-kernel OOM killer allows for an async victim
+ * memory reclaim (oom_reaper) and that means that we are not solely
+ * relying on the oom victim to make a forward progress and we can
+ * invoke the oom killer here.
+ *
+ * Please note that mem_cgroup_out_of_memory might fail to find a
+ * victim and then we have to bail out from the charge path.
*/
- css_get(&memcg->css);
- current->memcg_in_oom = memcg;
- current->memcg_oom_gfp_mask = mask;
- current->memcg_oom_order = order;
+ if (memcg->oom_kill_disable) {
+ if (!current->in_user_fault)
+ return OOM_SKIPPED;
+ css_get(&memcg->css);
+ current->memcg_in_oom = memcg;
+ current->memcg_oom_gfp_mask = mask;
+ current->memcg_oom_order = order;
+
+ return OOM_ASYNC;
+ }
+
+ if (mem_cgroup_out_of_memory(memcg, mask, order))
+ return OOM_SUCCESS;
+
+ WARN(1,"Memory cgroup charge failed because of no reclaimable memory! "
+ "This looks like a misconfiguration or a kernel bug.");
+ return OOM_FAILED;
}
/**
@@ -1578,6 +1777,62 @@ cleanup:
}
/**
+ * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
+ * @victim: task to be killed by the OOM killer
+ * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
+ *
+ * Returns a pointer to a memory cgroup, which has to be cleaned up
+ * by killing all belonging OOM-killable tasks.
+ *
+ * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
+ */
+struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
+ struct mem_cgroup *oom_domain)
+{
+ struct mem_cgroup *oom_group = NULL;
+ struct mem_cgroup *memcg;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return NULL;
+
+ if (!oom_domain)
+ oom_domain = root_mem_cgroup;
+
+ rcu_read_lock();
+
+ memcg = mem_cgroup_from_task(victim);
+ if (memcg == root_mem_cgroup)
+ goto out;
+
+ /*
+ * Traverse the memory cgroup hierarchy from the victim task's
+ * cgroup up to the OOMing cgroup (or root) to find the
+ * highest-level memory cgroup with oom.group set.
+ */
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ if (memcg->oom_group)
+ oom_group = memcg;
+
+ if (memcg == oom_domain)
+ break;
+ }
+
+ if (oom_group)
+ css_get(&oom_group->css);
+out:
+ rcu_read_unlock();
+
+ return oom_group;
+}
+
+void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
+{
+ pr_info("Tasks in ");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ pr_cont(" are going to be killed due to memory.oom.group set\n");
+}
+
+/**
* lock_page_memcg - lock a page->mem_cgroup binding
* @page: the page
*
@@ -1899,6 +2154,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
+ bool oomed = false;
+ enum oom_status oom_status;
if (mem_cgroup_is_root(memcg))
return 0;
@@ -1986,6 +2243,9 @@ retry:
if (nr_retries--)
goto retry;
+ if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
+ goto nomem;
+
if (gfp_mask & __GFP_NOFAIL)
goto force;
@@ -1994,8 +2254,23 @@ retry:
memcg_memory_event(mem_over_limit, MEMCG_OOM);
- mem_cgroup_oom(mem_over_limit, gfp_mask,
+ /*
+ * keep retrying as long as the memcg oom killer is able to make
+ * a forward progress or bypass the charge if the oom killer
+ * couldn't make any progress.
+ */
+ oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
get_order(nr_pages * PAGE_SIZE));
+ switch (oom_status) {
+ case OOM_SUCCESS:
+ nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ oomed = true;
+ goto retry;
+ case OOM_FAILED:
+ goto force;
+ default:
+ goto nomem;
+ }
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
@@ -2119,7 +2394,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
unlock_page_lru(page, isolated);
}
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2261,7 +2536,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
if (current->memcg_kmem_skip_account)
return cachep;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ memcg = get_mem_cgroup_from_current();
kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
goto out;
@@ -2345,7 +2620,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
if (memcg_kmem_bypass())
return 0;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ memcg = get_mem_cgroup_from_current();
if (!mem_cgroup_is_root(memcg)) {
ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
if (!ret)
@@ -2384,7 +2659,7 @@ void memcg_kmem_uncharge(struct page *page, int order)
css_put_many(&memcg->css, nr_pages);
}
-#endif /* !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2680,29 +2955,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return retval;
}
-static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
-{
- struct mem_cgroup *iter;
- int i;
-
- memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
-
- for_each_mem_cgroup_tree(iter, memcg) {
- for (i = 0; i < MEMCG_NR_STAT; i++)
- stat[i] += memcg_page_state(iter, i);
- }
-}
+struct accumulated_stats {
+ unsigned long stat[MEMCG_NR_STAT];
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ unsigned long lru_pages[NR_LRU_LISTS];
+ const unsigned int *stats_array;
+ const unsigned int *events_array;
+ int stats_size;
+ int events_size;
+};
-static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
+static void accumulate_memcg_tree(struct mem_cgroup *memcg,
+ struct accumulated_stats *acc)
{
- struct mem_cgroup *iter;
+ struct mem_cgroup *mi;
int i;
- memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS);
+ for_each_mem_cgroup_tree(mi, memcg) {
+ for (i = 0; i < acc->stats_size; i++)
+ acc->stat[i] += memcg_page_state(mi,
+ acc->stats_array ? acc->stats_array[i] : i);
- for_each_mem_cgroup_tree(iter, memcg) {
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- events[i] += memcg_sum_events(iter, i);
+ for (i = 0; i < acc->events_size; i++)
+ acc->events[i] += memcg_sum_events(mi,
+ acc->events_array ? acc->events_array[i] : i);
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ acc->lru_pages[i] +=
+ mem_cgroup_nr_lru_pages(mi, BIT(i));
}
}
@@ -2779,7 +3059,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
int memcg_id;
@@ -2851,7 +3131,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
}
rcu_read_unlock();
- memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+ memcg_drain_all_list_lrus(kmemcg_id, parent);
memcg_free_cache_id(kmemcg_id);
}
@@ -2879,7 +3159,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
static void memcg_free_kmem(struct mem_cgroup *memcg)
{
}
-#endif /* !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
static int memcg_update_kmem_max(struct mem_cgroup *memcg,
unsigned long max)
@@ -3113,6 +3393,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
+ struct accumulated_stats acc;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -3145,32 +3426,27 @@ static int memcg_stat_show(struct seq_file *m, void *v)
seq_printf(m, "hierarchical_memsw_limit %llu\n",
(u64)memsw * PAGE_SIZE);
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
- unsigned long long val = 0;
+ memset(&acc, 0, sizeof(acc));
+ acc.stats_size = ARRAY_SIZE(memcg1_stats);
+ acc.stats_array = memcg1_stats;
+ acc.events_size = ARRAY_SIZE(memcg1_events);
+ acc.events_array = memcg1_events;
+ accumulate_memcg_tree(memcg, &acc);
+ for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
- for_each_mem_cgroup_tree(mi, memcg)
- val += memcg_page_state(mi, memcg1_stats[i]) *
- PAGE_SIZE;
- seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
+ seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
+ (u64)acc.stat[i] * PAGE_SIZE);
}
- for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
- unsigned long long val = 0;
-
- for_each_mem_cgroup_tree(mi, memcg)
- val += memcg_sum_events(mi, memcg1_events[i]);
- seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
- }
-
- for (i = 0; i < NR_LRU_LISTS; i++) {
- unsigned long long val = 0;
+ for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
+ seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
+ (u64)acc.events[i]);
- for_each_mem_cgroup_tree(mi, memcg)
- val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
- seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
- }
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
+ (u64)acc.lru_pages[i] * PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM
{
@@ -4183,7 +4459,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
memcg->socket_pressure = jiffies;
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -4260,6 +4536,16 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ /*
+ * A memcg must be visible for memcg_expand_shrinker_maps()
+ * by the time the maps are allocated. So, we allocate maps
+ * here, when for_each_mem_cgroup() can't skip it.
+ */
+ if (memcg_alloc_shrinker_maps(memcg)) {
+ mem_cgroup_id_remove(memcg);
+ return -ENOMEM;
+ }
+
/* Online state pins memcg ID, memcg ID pins CSS */
atomic_set(&memcg->id.ref, 1);
css_get(css);
@@ -4312,6 +4598,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
+ memcg_free_shrinker_maps(memcg);
memcg_free_kmem(memcg);
mem_cgroup_free(memcg);
}
@@ -5256,8 +5543,7 @@ static int memory_events_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long stat[MEMCG_NR_STAT];
- unsigned long events[NR_VM_EVENT_ITEMS];
+ struct accumulated_stats acc;
int i;
/*
@@ -5271,70 +5557,97 @@ static int memory_stat_show(struct seq_file *m, void *v)
* Current memory state:
*/
- tree_stat(memcg, stat);
- tree_events(memcg, events);
+ memset(&acc, 0, sizeof(acc));
+ acc.stats_size = MEMCG_NR_STAT;
+ acc.events_size = NR_VM_EVENT_ITEMS;
+ accumulate_memcg_tree(memcg, &acc);
seq_printf(m, "anon %llu\n",
- (u64)stat[MEMCG_RSS] * PAGE_SIZE);
+ (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
seq_printf(m, "file %llu\n",
- (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
+ (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
seq_printf(m, "kernel_stack %llu\n",
- (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
+ (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
seq_printf(m, "slab %llu\n",
- (u64)(stat[NR_SLAB_RECLAIMABLE] +
- stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
+ (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
+ acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
seq_printf(m, "sock %llu\n",
- (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
+ (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
seq_printf(m, "shmem %llu\n",
- (u64)stat[NR_SHMEM] * PAGE_SIZE);
+ (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n",
- (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
+ (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n",
- (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
+ (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
seq_printf(m, "file_writeback %llu\n",
- (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
-
- for (i = 0; i < NR_LRU_LISTS; i++) {
- struct mem_cgroup *mi;
- unsigned long val = 0;
+ (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
- for_each_mem_cgroup_tree(mi, memcg)
- val += mem_cgroup_nr_lru_pages(mi, BIT(i));
- seq_printf(m, "%s %llu\n",
- mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
- }
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
+ (u64)acc.lru_pages[i] * PAGE_SIZE);
seq_printf(m, "slab_reclaimable %llu\n",
- (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
+ (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
seq_printf(m, "slab_unreclaimable %llu\n",
- (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+ (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
/* Accumulated memory events */
- seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
- seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+ seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
+ seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
- seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
- seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
- events[PGSCAN_DIRECT]);
- seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
- events[PGSTEAL_DIRECT]);
- seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
- seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
- seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
- seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
+ seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
+ seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
+ acc.events[PGSCAN_DIRECT]);
+ seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
+ acc.events[PGSTEAL_DIRECT]);
+ seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
+ seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
+ seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
+ seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
seq_printf(m, "workingset_refault %lu\n",
- stat[WORKINGSET_REFAULT]);
+ acc.stat[WORKINGSET_REFAULT]);
seq_printf(m, "workingset_activate %lu\n",
- stat[WORKINGSET_ACTIVATE]);
+ acc.stat[WORKINGSET_ACTIVATE]);
seq_printf(m, "workingset_nodereclaim %lu\n",
- stat[WORKINGSET_NODERECLAIM]);
+ acc.stat[WORKINGSET_NODERECLAIM]);
return 0;
}
+static int memory_oom_group_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+ seq_printf(m, "%d\n", memcg->oom_group);
+
+ return 0;
+}
+
+static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ int ret, oom_group;
+
+ buf = strstrip(buf);
+ if (!buf)
+ return -EINVAL;
+
+ ret = kstrtoint(buf, 0, &oom_group);
+ if (ret)
+ return ret;
+
+ if (oom_group != 0 && oom_group != 1)
+ return -EINVAL;
+
+ memcg->oom_group = oom_group;
+
+ return nbytes;
+}
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -5376,6 +5689,12 @@ static struct cftype memory_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,
},
+ {
+ .name = "oom.group",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_oom_group_show,
+ .write = memory_oom_group_write,
+ },
{ } /* terminate */
};
@@ -6023,7 +6342,7 @@ static int __init mem_cgroup_init(void)
{
int cpu, node;
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
/*
* Kmem cache creation is mostly done with the slab_mutex held,
* so use a workqueue with limited concurrency to avoid stalling
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index effaa7c7a1a4..3e4f6912789b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -58,6 +58,7 @@
#include <linux/memremap.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
+#include <linux/page-isolation.h>
#include "internal.h"
#include "ras/ras_event.h"
@@ -1287,7 +1288,7 @@ int memory_failure(unsigned long pfn, int flags)
* R/W the page; let's pray that the page has been
* used and will be freed some time later.
* In fact it's dangerous to directly bump up page count from 0,
- * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
+ * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
*/
if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
if (is_free_buddy_page(p)) {
@@ -1718,8 +1719,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
if (ret > 0)
ret = -EIO;
} else {
- if (PageHuge(page))
- dissolve_free_huge_page(page);
+ /*
+ * We set PG_hwpoison only when the migration source hugepage
+ * was successfully dissolved, because otherwise hwpoisoned
+ * hugepage remains on free hugepage list, then userspace will
+ * find it as SIGBUS by allocation failure. That's not expected
+ * in soft-offlining.
+ */
+ ret = dissolve_free_huge_page(page);
+ if (!ret) {
+ if (set_hwpoison_free_buddy_page(page))
+ num_poisoned_pages_inc();
+ }
}
return ret;
}
@@ -1807,6 +1818,7 @@ static int __soft_offline_page(struct page *page, int flags)
static int soft_offline_in_use_page(struct page *page, int flags)
{
int ret;
+ int mt;
struct page *hpage = compound_head(page);
if (!PageHuge(page) && PageTransHuge(hpage)) {
@@ -1825,23 +1837,37 @@ static int soft_offline_in_use_page(struct page *page, int flags)
put_hwpoison_page(hpage);
}
+ /*
+ * Setting MIGRATE_ISOLATE here ensures that the page will be linked
+ * to free list immediately (not via pcplist) when released after
+ * successful page migration. Otherwise we can't guarantee that the
+ * page is really free after put_page() returns, so
+ * set_hwpoison_free_buddy_page() highly likely fails.
+ */
+ mt = get_pageblock_migratetype(page);
+ set_pageblock_migratetype(page, MIGRATE_ISOLATE);
if (PageHuge(page))
ret = soft_offline_huge_page(page, flags);
else
ret = __soft_offline_page(page, flags);
-
+ set_pageblock_migratetype(page, mt);
return ret;
}
-static void soft_offline_free_page(struct page *page)
+static int soft_offline_free_page(struct page *page)
{
+ int rc = 0;
struct page *head = compound_head(page);
- if (!TestSetPageHWPoison(head)) {
- num_poisoned_pages_inc();
- if (PageHuge(head))
- dissolve_free_huge_page(page);
+ if (PageHuge(head))
+ rc = dissolve_free_huge_page(page);
+ if (!rc) {
+ if (set_hwpoison_free_buddy_page(page))
+ num_poisoned_pages_inc();
+ else
+ rc = -EBUSY;
}
+ return rc;
}
/**
@@ -1893,7 +1919,7 @@ int soft_offline_page(struct page *page, int flags)
if (ret > 0)
ret = soft_offline_in_use_page(page, flags);
else if (ret == 0)
- soft_offline_free_page(page);
+ ret = soft_offline_free_page(page);
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 348279ff6e51..daaf0b6f60a0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -859,6 +859,10 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return NULL;
}
}
+
+ if (pte_devmap(pte))
+ return NULL;
+
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
@@ -923,6 +927,8 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
}
}
+ if (pmd_devmap(pmd))
+ return NULL;
if (is_zero_pfn(pfn))
return NULL;
if (unlikely(pfn > highest_memmap_pfn))
@@ -1607,20 +1613,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, start, end);
- for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
+ for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
unmap_single_vma(&tlb, vma, start, end, NULL);
-
- /*
- * zap_page_range does not specify whether mmap_sem should be
- * held for read or write. That allows parallel zap_page_range
- * operations to unmap a PTE and defer a flush meaning that
- * this call observes pte_none and fails to flush the TLB.
- * Rather than adding a complex API, ensure that no stale
- * TLB entries exist when this call returns.
- */
- flush_tlb_range(vma, start, end);
- }
-
mmu_notifier_invalidate_range_end(mm, start, end);
tlb_finish_mmu(&tlb, start, end);
}
@@ -2955,7 +2949,7 @@ int do_swap_page(struct vm_fault *vmf)
struct swap_info_struct *si = swp_swap_info(entry);
if (si->flags & SWP_SYNCHRONOUS_IO &&
- __swap_count(si, entry) == 1) {
+ __swap_count(entry) == 1) {
/* skip swapcache */
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
@@ -3065,7 +3059,6 @@ int do_swap_page(struct vm_fault *vmf)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
@@ -3079,6 +3072,7 @@ int do_swap_page(struct vm_fault *vmf)
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
}
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
@@ -3394,7 +3388,7 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page)
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
+ add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
page_add_file_rmap(page, true);
/*
* deposit and withdraw with pmd lock held
@@ -4147,7 +4141,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)
- mem_cgroup_oom_enable();
+ mem_cgroup_enter_user_fault();
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
@@ -4155,7 +4149,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
ret = __handle_mm_fault(vma, address, flags);
if (flags & FAULT_FLAG_USER) {
- mem_cgroup_oom_disable();
+ mem_cgroup_exit_user_fault();
/*
* The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no
@@ -4593,71 +4587,93 @@ EXPORT_SYMBOL(__might_fault);
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
-static void clear_gigantic_page(struct page *page,
- unsigned long addr,
- unsigned int pages_per_huge_page)
-{
- int i;
- struct page *p = page;
-
- might_sleep();
- for (i = 0; i < pages_per_huge_page;
- i++, p = mem_map_next(p, page, i)) {
- cond_resched();
- clear_user_highpage(p, addr + i * PAGE_SIZE);
- }
-}
-void clear_huge_page(struct page *page,
- unsigned long addr_hint, unsigned int pages_per_huge_page)
+/*
+ * Process all subpages of the specified huge page with the specified
+ * operation. The target subpage will be processed last to keep its
+ * cache lines hot.
+ */
+static inline void process_huge_page(
+ unsigned long addr_hint, unsigned int pages_per_huge_page,
+ void (*process_subpage)(unsigned long addr, int idx, void *arg),
+ void *arg)
{
int i, n, base, l;
unsigned long addr = addr_hint &
~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
- if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
- clear_gigantic_page(page, addr, pages_per_huge_page);
- return;
- }
-
- /* Clear sub-page to access last to keep its cache lines hot */
+ /* Process target subpage last to keep its cache lines hot */
might_sleep();
n = (addr_hint - addr) / PAGE_SIZE;
if (2 * n <= pages_per_huge_page) {
- /* If sub-page to access in first half of huge page */
+ /* If target subpage in first half of huge page */
base = 0;
l = n;
- /* Clear sub-pages at the end of huge page */
+ /* Process subpages at the end of huge page */
for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ process_subpage(addr + i * PAGE_SIZE, i, arg);
}
} else {
- /* If sub-page to access in second half of huge page */
+ /* If target subpage in second half of huge page */
base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
l = pages_per_huge_page - n;
- /* Clear sub-pages at the begin of huge page */
+ /* Process subpages at the begin of huge page */
for (i = 0; i < base; i++) {
cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ process_subpage(addr + i * PAGE_SIZE, i, arg);
}
}
/*
- * Clear remaining sub-pages in left-right-left-right pattern
- * towards the sub-page to access
+ * Process remaining subpages in left-right-left-right pattern
+ * towards the target subpage
*/
for (i = 0; i < l; i++) {
int left_idx = base + i;
int right_idx = base + 2 * l - 1 - i;
cond_resched();
- clear_user_highpage(page + left_idx,
- addr + left_idx * PAGE_SIZE);
+ process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
cond_resched();
- clear_user_highpage(page + right_idx,
- addr + right_idx * PAGE_SIZE);
+ process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
}
}
+static void clear_gigantic_page(struct page *page,
+ unsigned long addr,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *p = page;
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page;
+ i++, p = mem_map_next(p, page, i)) {
+ cond_resched();
+ clear_user_highpage(p, addr + i * PAGE_SIZE);
+ }
+}
+
+static void clear_subpage(unsigned long addr, int idx, void *arg)
+{
+ struct page *page = arg;
+
+ clear_user_highpage(page + idx, addr);
+}
+
+void clear_huge_page(struct page *page,
+ unsigned long addr_hint, unsigned int pages_per_huge_page)
+{
+ unsigned long addr = addr_hint &
+ ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ clear_gigantic_page(page, addr, pages_per_huge_page);
+ return;
+ }
+
+ process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
+}
+
static void copy_user_gigantic_page(struct page *dst, struct page *src,
unsigned long addr,
struct vm_area_struct *vma,
@@ -4677,11 +4693,31 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src,
}
}
+struct copy_subpage_arg {
+ struct page *dst;
+ struct page *src;
+ struct vm_area_struct *vma;
+};
+
+static void copy_subpage(unsigned long addr, int idx, void *arg)
+{
+ struct copy_subpage_arg *copy_arg = arg;
+
+ copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
+ addr, copy_arg->vma);
+}
+
void copy_user_huge_page(struct page *dst, struct page *src,
- unsigned long addr, struct vm_area_struct *vma,
+ unsigned long addr_hint, struct vm_area_struct *vma,
unsigned int pages_per_huge_page)
{
- int i;
+ unsigned long addr = addr_hint &
+ ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
+ struct copy_subpage_arg arg = {
+ .dst = dst,
+ .src = src,
+ .vma = vma,
+ };
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
copy_user_gigantic_page(dst, src, addr, vma,
@@ -4689,11 +4725,7 @@ void copy_user_huge_page(struct page *dst, struct page *src,
return;
}
- might_sleep();
- for (i = 0; i < pages_per_huge_page; i++) {
- cond_resched();
- copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
- }
+ process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
}
long copy_huge_page_from_user(struct page *dst_page,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7deb49f69e27..9eea6e809a4e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -982,8 +982,6 @@ static void reset_node_present_pages(pg_data_t *pgdat)
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
struct pglist_data *pgdat;
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
unsigned long start_pfn = PFN_DOWN(start);
pgdat = NODE_DATA(nid);
@@ -1006,8 +1004,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
/* we can use NODE_DATA(nid) from here */
+ pgdat->node_id = nid;
+ pgdat->node_start_pfn = start_pfn;
+
/* init node's zones as empty zones, we don't have any present pages.*/
- free_area_init_node(nid, zones_size, start_pfn, zholes_size);
+ free_area_init_core_hotplug(nid);
pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
/*
@@ -1017,25 +1018,20 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
build_all_zonelists(pgdat);
/*
- * zone->managed_pages is set to an approximate value in
- * free_area_init_core(), which will cause
- * /sys/device/system/node/nodeX/meminfo has wrong data.
- * So reset it to 0 before any memory is onlined.
- */
- reset_node_managed_pages(pgdat);
-
- /*
* When memory is hot-added, all the memory is in offline state. So
* clear all zones' present_pages because they will be updated in
* online_pages() and offline_pages().
*/
+ reset_node_managed_pages(pgdat);
reset_node_present_pages(pgdat);
return pgdat;
}
-static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
+static void rollback_node_hotadd(int nid)
{
+ pg_data_t *pgdat = NODE_DATA(nid);
+
arch_refresh_nodedata(nid, NULL);
free_percpu(pgdat->per_cpu_nodestats);
arch_free_nodedata(pgdat);
@@ -1046,28 +1042,48 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
/**
* try_online_node - online a node if offlined
* @nid: the node ID
- *
+ * @start: start addr of the node
+ * @set_node_online: Whether we want to online the node
* called by cpu_up() to online a node without onlined memory.
+ *
+ * Returns:
+ * 1 -> a new node has been allocated
+ * 0 -> the node is already online
+ * -ENOMEM -> the node could not be allocated
*/
-int try_online_node(int nid)
+static int __try_online_node(int nid, u64 start, bool set_node_online)
{
- pg_data_t *pgdat;
- int ret;
+ pg_data_t *pgdat;
+ int ret = 1;
if (node_online(nid))
return 0;
- mem_hotplug_begin();
- pgdat = hotadd_new_pgdat(nid, 0);
+ pgdat = hotadd_new_pgdat(nid, start);
if (!pgdat) {
pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
goto out;
}
- node_set_online(nid);
- ret = register_one_node(nid);
- BUG_ON(ret);
+
+ if (set_node_online) {
+ node_set_online(nid);
+ ret = register_one_node(nid);
+ BUG_ON(ret);
+ }
out:
+ return ret;
+}
+
+/*
+ * Users of this function always want to online/register the node
+ */
+int try_online_node(int nid)
+{
+ int ret;
+
+ mem_hotplug_begin();
+ ret = __try_online_node(nid, 0, true);
mem_hotplug_done();
return ret;
}
@@ -1099,9 +1115,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
int __ref add_memory_resource(int nid, struct resource *res, bool online)
{
u64 start, size;
- pg_data_t *pgdat = NULL;
- bool new_pgdat;
- bool new_node;
+ bool new_node = false;
int ret;
start = res->start;
@@ -1111,11 +1125,6 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
if (ret)
return ret;
- { /* Stupid hack to suppress address-never-null warning */
- void *p = NODE_DATA(nid);
- new_pgdat = !p;
- }
-
mem_hotplug_begin();
/*
@@ -1126,48 +1135,31 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
*/
memblock_add_node(start, size, nid);
- new_node = !node_online(nid);
- if (new_node) {
- pgdat = hotadd_new_pgdat(nid, start);
- ret = -ENOMEM;
- if (!pgdat)
- goto error;
- }
+ ret = __try_online_node(nid, start, false);
+ if (ret < 0)
+ goto error;
+ new_node = ret;
/* call arch's memory hotadd */
ret = arch_add_memory(nid, start, size, NULL, true);
-
if (ret < 0)
goto error;
- /* we online node here. we can't roll back from here. */
- node_set_online(nid);
-
if (new_node) {
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long nr_pages = size >> PAGE_SHIFT;
-
- ret = __register_one_node(nid);
- if (ret)
- goto register_fail;
-
- /*
- * link memory sections under this node. This is already
- * done when creatig memory section in register_new_memory
- * but that depends to have the node registered so offline
- * nodes have to go through register_node.
- * TODO clean up this mess.
- */
- ret = link_mem_sections(nid, start_pfn, nr_pages, false);
-register_fail:
- /*
- * If sysfs file of new node can't create, cpu on the node
+ /* If sysfs file of new node can't be created, cpu on the node
* can't be hot-added. There is no rollback way now.
* So, check by BUG_ON() to catch it reluctantly..
+ * We online node here. We can't roll back from here.
*/
+ node_set_online(nid);
+ ret = __register_one_node(nid);
BUG_ON(ret);
}
+ /* link memory sections under this node.*/
+ ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1));
+ BUG_ON(ret);
+
/* create new memmap entry */
firmware_map_add_hotplug(start, start + size, "System RAM");
@@ -1180,8 +1172,8 @@ register_fail:
error:
/* rollback pgdat allocation and others */
- if (new_pgdat && pgdat)
- rollback_node_hotadd(nid, pgdat);
+ if (new_node)
+ rollback_node_hotadd(nid);
memblock_remove(start, size);
out:
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 01f1a14facc4..da858f794eb6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1784,7 +1784,7 @@ unsigned int mempolicy_slab_node(void)
zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes);
- return z->zone ? z->zone->node : node;
+ return z->zone ? zone_to_nid(z->zone) : node;
}
default:
@@ -2326,7 +2326,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->v.nodes);
- polnid = z->zone->node;
+ polnid = zone_to_nid(z->zone);
break;
default:
@@ -2504,7 +2504,6 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
goto put_new;
/* Create pseudo-vma that contains just the policy */
- memset(&pvma, 0, sizeof(struct vm_area_struct));
vma_init(&pvma, NULL);
pvma.vm_end = TASK_SIZE; /* policy covers entire file */
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
diff --git a/mm/mempool.c b/mm/mempool.c
index b54f2c20e5e0..0ef8cc8d1602 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -111,7 +111,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
kasan_free_pages(element, (unsigned long)pool->pool_data);
}
-static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags)
+static void kasan_unpoison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_unpoison_slab(element);
@@ -127,12 +127,12 @@ static __always_inline void add_element(mempool_t *pool, void *element)
pool->elements[pool->curr_nr++] = element;
}
-static void *remove_element(mempool_t *pool, gfp_t flags)
+static void *remove_element(mempool_t *pool)
{
void *element = pool->elements[--pool->curr_nr];
BUG_ON(pool->curr_nr < 0);
- kasan_unpoison_element(pool, element, flags);
+ kasan_unpoison_element(pool, element);
check_element(pool, element);
return element;
}
@@ -151,7 +151,7 @@ static void *remove_element(mempool_t *pool, gfp_t flags)
void mempool_exit(mempool_t *pool)
{
while (pool->curr_nr) {
- void *element = remove_element(pool, GFP_KERNEL);
+ void *element = remove_element(pool);
pool->free(element, pool->pool_data);
}
kfree(pool->elements);
@@ -213,6 +213,7 @@ EXPORT_SYMBOL(mempool_init_node);
/**
* mempool_init - initialize a memory pool
+ * @pool: pointer to the memory pool that should be initialized
* @min_nr: the minimum number of elements guaranteed to be
* allocated for this pool.
* @alloc_fn: user-defined element-allocation function.
@@ -301,7 +302,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
spin_lock_irqsave(&pool->lock, flags);
if (new_min_nr <= pool->min_nr) {
while (new_min_nr < pool->curr_nr) {
- element = remove_element(pool, GFP_KERNEL);
+ element = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
pool->free(element, pool->pool_data);
spin_lock_irqsave(&pool->lock, flags);
@@ -387,7 +388,7 @@ repeat_alloc:
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) {
- element = remove_element(pool, gfp_temp);
+ element = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
/* paired with rmb in mempool_free(), read comment there */
smp_wmb();
diff --git a/mm/migrate.c b/mm/migrate.c
index 80dc5b8c9738..e7723234d0c6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1199,7 +1199,7 @@ out:
* intentionally. Although it's rather weird,
* it's how HWPoison flag works at the moment.
*/
- if (!test_set_page_hwpoison(page))
+ if (set_hwpoison_free_buddy_page(page))
num_poisoned_pages_inc();
}
} else {
@@ -1318,8 +1318,6 @@ put_anon:
out:
if (rc != -EAGAIN)
putback_active_hugepage(hpage);
- if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage))
- num_poisoned_pages_inc();
/*
* If migration was not successful and there's a freeing callback, use
@@ -2939,7 +2937,8 @@ int migrate_vma(const struct migrate_vma_ops *ops,
/* Sanity check the arguments */
start &= PAGE_MASK;
end &= PAGE_MASK;
- if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
+ if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
+ vma_is_dax(vma))
return -EINVAL;
if (start < vma->vm_start || start >= vma->vm_end)
return -EINVAL;
diff --git a/mm/mincore.c b/mm/mincore.c
index 4985965aa20a..aa0e542569f9 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -68,8 +68,16 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
*/
if (xa_is_value(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
- page = find_get_page(swap_address_space(swp),
- swp_offset(swp));
+ struct swap_info_struct *si;
+
+ /* Prevent swap device to being swapoff under us */
+ si = get_swap_device(swp);
+ if (si) {
+ page = find_get_page(swap_address_space(swp),
+ swp_offset(swp));
+ put_swap_device(si);
+ } else
+ page = NULL;
}
} else
page = find_get_page(mapping, pgoff);
diff --git a/mm/mlock.c b/mm/mlock.c
index 74e5a6547c3d..41cc47e28ad6 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -527,7 +527,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
vm_flags_t old_flags = vma->vm_flags;
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
- is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
+ is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
+ vma_is_dax(vma))
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 5b72266b4b03..6838a530789b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -53,13 +53,8 @@ void __init mminit_verify_zonelist(void)
zone->name);
/* Iterate the zonelist */
- for_each_zone_zonelist(zone, z, zonelist, zoneid) {
-#ifdef CONFIG_NUMA
- pr_cont("%d:%s ", zone->node, zone->name);
-#else
- pr_cont("0:%s ", zone->name);
-#endif /* CONFIG_NUMA */
- }
+ for_each_zone_zonelist(zone, z, zonelist, zoneid)
+ pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
pr_cont("\n");
}
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 17bbf4d3e24f..5f2b2b184c60 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1796,11 +1796,12 @@ out:
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
- if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current->mm)))
- mm->locked_vm += (len >> PAGE_SHIFT);
- else
+ if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
+ is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ else
+ mm->locked_vm += (len >> PAGE_SHIFT);
}
if (file)
@@ -3062,9 +3063,7 @@ void exit_mmap(struct mm_struct *mm)
* which clears VM_LOCKED, otherwise the oom reaper cannot
* reliably test it.
*/
- mutex_lock(&oom_lock);
- __oom_reap_task_mm(mm);
- mutex_unlock(&oom_lock);
+ (void)__oom_reap_task_mm(mm);
set_bit(MMF_OOM_SKIP, &mm->flags);
down_write(&mm->mmap_sem);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index eff6b88a993f..82bb1a939c0e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -174,18 +174,29 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
srcu_read_unlock(&srcu, id);
}
-void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
- unsigned long start, unsigned long end)
+int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ bool blockable)
{
struct mmu_notifier *mn;
+ int ret = 0;
int id;
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
- if (mn->ops->invalidate_range_start)
- mn->ops->invalidate_range_start(mn, mm, start, end);
+ if (mn->ops->invalidate_range_start) {
+ int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable);
+ if (_ret) {
+ pr_info("%pS callback failed with %d in %sblockable context.\n",
+ mn->ops->invalidate_range_start, _ret,
+ !blockable ? "non-" : "");
+ ret = _ret;
+ }
+ }
}
srcu_read_unlock(&srcu, id);
+
+ return ret;
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
diff --git a/mm/nommu.c b/mm/nommu.c
index 9fc9e43335b6..e4aac33216ae 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -364,10 +364,6 @@ void *vzalloc_node(unsigned long size, int node)
}
EXPORT_SYMBOL(vzalloc_node);
-#ifndef PAGE_KERNEL_EXEC
-# define PAGE_KERNEL_EXEC PAGE_KERNEL
-#endif
-
/**
* vmalloc_exec - allocate virtually contiguous, executable memory
* @size: allocation size
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2cc9b238368f..b5b25e4dcbbb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -53,6 +53,14 @@ int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
+/*
+ * Serializes oom killer invocations (out_of_memory()) from all contexts to
+ * prevent from over eager oom killing (e.g. when the oom killer is invoked
+ * from different domains).
+ *
+ * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
+ * and mark_oom_victim
+ */
DEFINE_MUTEX(oom_lock);
#ifdef CONFIG_NUMA
@@ -392,7 +400,8 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
struct task_struct *p;
struct task_struct *task;
- pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
+ pr_info("Tasks state (memory values in pages):\n");
+ pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
rcu_read_lock();
for_each_process(p) {
if (oom_unkillable_task(p, memcg, nodemask))
@@ -408,7 +417,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
continue;
}
- pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
+ pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
mm_pgtables_bytes(task->mm),
@@ -479,9 +488,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
-void __oom_reap_task_mm(struct mm_struct *mm)
+bool __oom_reap_task_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
+ bool ret = true;
/*
* Tell all users of get_user/copy_from_user etc... that the content
@@ -511,50 +521,32 @@ void __oom_reap_task_mm(struct mm_struct *mm)
struct mmu_gather tlb;
tlb_gather_mmu(&tlb, mm, start, end);
- mmu_notifier_invalidate_range_start(mm, start, end);
+ if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
+ ret = false;
+ continue;
+ }
unmap_page_range(&tlb, vma, start, end, NULL);
mmu_notifier_invalidate_range_end(mm, start, end);
tlb_finish_mmu(&tlb, start, end);
}
}
+
+ return ret;
}
+/*
+ * Reaps the address space of the give task.
+ *
+ * Returns true on success and false if none or part of the address space
+ * has been reclaimed and the caller should retry later.
+ */
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
bool ret = true;
- /*
- * We have to make sure to not race with the victim exit path
- * and cause premature new oom victim selection:
- * oom_reap_task_mm exit_mm
- * mmget_not_zero
- * mmput
- * atomic_dec_and_test
- * exit_oom_victim
- * [...]
- * out_of_memory
- * select_bad_process
- * # no TIF_MEMDIE task selects new victim
- * unmap_page_range # frees some memory
- */
- mutex_lock(&oom_lock);
-
if (!down_read_trylock(&mm->mmap_sem)) {
- ret = false;
trace_skip_task_reaping(tsk->pid);
- goto unlock_oom;
- }
-
- /*
- * If the mm has invalidate_{start,end}() notifiers that could block,
- * sleep to give the oom victim some more time.
- * TODO: we really want to get rid of this ugly hack and make sure that
- * notifiers cannot block for unbounded amount of time
- */
- if (mm_has_blockable_invalidate_notifiers(mm)) {
- up_read(&mm->mmap_sem);
- schedule_timeout_idle(HZ);
- goto unlock_oom;
+ return false;
}
/*
@@ -564,25 +556,27 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* down_write();up_write() cycle in exit_mmap().
*/
if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
- up_read(&mm->mmap_sem);
trace_skip_task_reaping(tsk->pid);
- goto unlock_oom;
+ goto out_unlock;
}
trace_start_task_reaping(tsk->pid);
- __oom_reap_task_mm(mm);
+ /* failed to reap part of the address space. Try again later */
+ ret = __oom_reap_task_mm(mm);
+ if (!ret)
+ goto out_finish;
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(tsk), tsk->comm,
K(get_mm_counter(mm, MM_ANONPAGES)),
K(get_mm_counter(mm, MM_FILEPAGES)),
K(get_mm_counter(mm, MM_SHMEMPAGES)));
+out_finish:
+ trace_finish_task_reaping(tsk->pid);
+out_unlock:
up_read(&mm->mmap_sem);
- trace_finish_task_reaping(tsk->pid);
-unlock_oom:
- mutex_unlock(&oom_lock);
return ret;
}
@@ -835,68 +829,12 @@ static bool task_will_free_mem(struct task_struct *task)
return ret;
}
-static void oom_kill_process(struct oom_control *oc, const char *message)
+static void __oom_kill_process(struct task_struct *victim)
{
- struct task_struct *p = oc->chosen;
- unsigned int points = oc->chosen_points;
- struct task_struct *victim = p;
- struct task_struct *child;
- struct task_struct *t;
+ struct task_struct *p;
struct mm_struct *mm;
- unsigned int victim_points = 0;
- static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
bool can_oom_reap = true;
- /*
- * If the task is already exiting, don't alarm the sysadmin or kill
- * its children or threads, just give it access to memory reserves
- * so it can die quickly
- */
- task_lock(p);
- if (task_will_free_mem(p)) {
- mark_oom_victim(p);
- wake_oom_reaper(p);
- task_unlock(p);
- put_task_struct(p);
- return;
- }
- task_unlock(p);
-
- if (__ratelimit(&oom_rs))
- dump_header(oc, p);
-
- pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
- message, task_pid_nr(p), p->comm, points);
-
- /*
- * If any of p's children has a different mm and is eligible for kill,
- * the one with the highest oom_badness() score is sacrificed for its
- * parent. This attempts to lose the minimal amount of work done while
- * still freeing memory.
- */
- read_lock(&tasklist_lock);
- for_each_thread(p, t) {
- list_for_each_entry(child, &t->children, sibling) {
- unsigned int child_points;
-
- if (process_shares_mm(child, p->mm))
- continue;
- /*
- * oom_badness() returns 0 if the thread is unkillable
- */
- child_points = oom_badness(child,
- oc->memcg, oc->nodemask, oc->totalpages);
- if (child_points > victim_points) {
- put_task_struct(victim);
- victim = child;
- victim_points = child_points;
- get_task_struct(victim);
- }
- }
- }
- read_unlock(&tasklist_lock);
-
p = find_lock_task_mm(victim);
if (!p) {
put_task_struct(victim);
@@ -971,6 +909,99 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
#undef K
/*
+ * Kill provided task unless it's secured by setting
+ * oom_score_adj to OOM_SCORE_ADJ_MIN.
+ */
+static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+{
+ if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+ get_task_struct(task);
+ __oom_kill_process(task);
+ }
+ return 0;
+}
+
+static void oom_kill_process(struct oom_control *oc, const char *message)
+{
+ struct task_struct *p = oc->chosen;
+ unsigned int points = oc->chosen_points;
+ struct task_struct *victim = p;
+ struct task_struct *child;
+ struct task_struct *t;
+ struct mem_cgroup *oom_group;
+ unsigned int victim_points = 0;
+ static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ /*
+ * If the task is already exiting, don't alarm the sysadmin or kill
+ * its children or threads, just give it access to memory reserves
+ * so it can die quickly
+ */
+ task_lock(p);
+ if (task_will_free_mem(p)) {
+ mark_oom_victim(p);
+ wake_oom_reaper(p);
+ task_unlock(p);
+ put_task_struct(p);
+ return;
+ }
+ task_unlock(p);
+
+ if (__ratelimit(&oom_rs))
+ dump_header(oc, p);
+
+ pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
+ message, task_pid_nr(p), p->comm, points);
+
+ /*
+ * If any of p's children has a different mm and is eligible for kill,
+ * the one with the highest oom_badness() score is sacrificed for its
+ * parent. This attempts to lose the minimal amount of work done while
+ * still freeing memory.
+ */
+ read_lock(&tasklist_lock);
+ for_each_thread(p, t) {
+ list_for_each_entry(child, &t->children, sibling) {
+ unsigned int child_points;
+
+ if (process_shares_mm(child, p->mm))
+ continue;
+ /*
+ * oom_badness() returns 0 if the thread is unkillable
+ */
+ child_points = oom_badness(child,
+ oc->memcg, oc->nodemask, oc->totalpages);
+ if (child_points > victim_points) {
+ put_task_struct(victim);
+ victim = child;
+ victim_points = child_points;
+ get_task_struct(victim);
+ }
+ }
+ }
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Do we need to kill the entire memory cgroup?
+ * Or even one of the ancestor memory cgroups?
+ * Check this out before killing the victim task.
+ */
+ oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
+
+ __oom_kill_process(victim);
+
+ /*
+ * If necessary, kill all tasks in the selected memory cgroup.
+ */
+ if (oom_group) {
+ mem_cgroup_print_oom_group(oom_group);
+ mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
+ mem_cgroup_put(oom_group);
+ }
+}
+
+/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
static void check_panic_on_oom(struct oom_control *oc,
@@ -1077,15 +1108,9 @@ bool out_of_memory(struct oom_control *oc)
dump_header(oc, NULL);
panic("Out of memory and no killable processes...\n");
}
- if (oc->chosen && oc->chosen != (void *)-1UL) {
+ if (oc->chosen && oc->chosen != (void *)-1UL)
oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
"Memory cgroup out of memory");
- /*
- * Give the killed process a good chance to exit before trying
- * to allocate memory again.
- */
- schedule_timeout_killable(1);
- }
return !!oc->chosen;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0e502bd13ad9..651cfc64d577 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2481,8 +2481,8 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers);
/*
* Call this whenever redirtying a page, to de-account the dirty counters
- * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
- * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to
+ * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
+ * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
* systematic errors in balanced_dirty_ratelimit and the dirty pages position
* control.
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0922ef5d2e46..839e0cc17f2c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -307,24 +307,33 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
}
/*
- * Returns false when the remaining initialisation should be deferred until
+ * Returns true when the remaining initialisation should be deferred until
* later in the boot cycle when it can be parallelised.
*/
-static inline bool update_defer_init(pg_data_t *pgdat,
- unsigned long pfn, unsigned long zone_end,
- unsigned long *nr_initialised)
+static bool __meminit
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
+ static unsigned long prev_end_pfn, nr_initialised;
+
+ /*
+ * prev_end_pfn static that contains the end of previous zone
+ * No need to protect because called very early in boot before smp_init.
+ */
+ if (prev_end_pfn != end_pfn) {
+ prev_end_pfn = end_pfn;
+ nr_initialised = 0;
+ }
+
/* Always populate low zones for address-constrained allocations */
- if (zone_end < pgdat_end_pfn(pgdat))
- return true;
- (*nr_initialised)++;
- if ((*nr_initialised > pgdat->static_init_pgcnt) &&
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
- pgdat->first_deferred_pfn = pfn;
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
return false;
+ nr_initialised++;
+ if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) &&
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
+ return true;
}
-
- return true;
+ return false;
}
#else
static inline bool early_page_uninitialised(unsigned long pfn)
@@ -332,11 +341,9 @@ static inline bool early_page_uninitialised(unsigned long pfn)
return false;
}
-static inline bool update_defer_init(pg_data_t *pgdat,
- unsigned long pfn, unsigned long zone_end,
- unsigned long *nr_initialised)
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
- return true;
+ return false;
}
#endif
@@ -2909,10 +2916,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
if (!static_branch_likely(&vm_numa_stat_key))
return;
- if (z->node != numa_node_id())
+ if (zone_to_nid(z) != numa_node_id())
local_stat = NUMA_OTHER;
- if (z->node == preferred_zone->node)
+ if (zone_to_nid(z) == zone_to_nid(preferred_zone))
__inc_numa_state(z, NUMA_HIT);
else {
__inc_numa_state(z, NUMA_MISS);
@@ -4165,11 +4172,12 @@ retry:
alloc_flags = reserve_flags;
/*
- * Reset the zonelist iterators if memory policies can be ignored.
- * These allocations are high priority and system rather than user
- * orientated.
+ * Reset the nodemask and zonelist iterators if memory policies can be
+ * ignored. These allocations are high priority and system rather than
+ * user oriented.
*/
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
+ ac->nodemask = NULL;
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
@@ -4403,19 +4411,15 @@ out:
EXPORT_SYMBOL(__alloc_pages_nodemask);
/*
- * Common helper functions.
+ * Common helper functions. Never use with __GFP_HIGHMEM because the returned
+ * address cannot represent highmem pages. Use alloc_pages and then kmap if
+ * you need to access high mem.
*/
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *page;
- /*
- * __get_free_pages() returns a virtual address, which cannot represent
- * a highmem page
- */
- VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
-
- page = alloc_pages(gfp_mask, order);
+ page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
if (!page)
return 0;
return (unsigned long) page_address(page);
@@ -5281,7 +5285,7 @@ int local_memory_node(int node)
z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
gfp_zone(GFP_KERNEL),
NULL);
- return z->zone->node;
+ return zone_to_nid(z->zone);
}
#endif
@@ -5453,6 +5457,30 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
#endif
}
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
+static bool __meminit
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
+{
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ static struct memblock_region *r;
+
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_memblock(memory, r) {
+ if (*pfn < memblock_region_memory_end_pfn(r))
+ break;
+ }
+ }
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ *pfn = memblock_region_memory_end_pfn(r);
+ return true;
+ }
+ }
+#endif
+ return false;
+}
+
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
@@ -5462,14 +5490,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context,
struct vmem_altmap *altmap)
{
- unsigned long end_pfn = start_pfn + size;
- pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long pfn;
- unsigned long nr_initialised = 0;
+ unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- struct memblock_region *r = NULL, *tmp;
-#endif
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
@@ -5486,39 +5508,19 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
- if (context != MEMMAP_EARLY)
- goto not_early;
-
- if (!early_pfn_valid(pfn))
- continue;
- if (!early_pfn_in_nid(pfn, nid))
- continue;
- if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
- break;
-
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- /*
- * Check given memblock attribute by firmware which can affect
- * kernel memory layout. If zone==ZONE_MOVABLE but memory is
- * mirrored, it's an overlapped memmap init. skip it.
- */
- if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
- if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
- for_each_memblock(memory, tmp)
- if (pfn < memblock_region_memory_end_pfn(tmp))
- break;
- r = tmp;
- }
- if (pfn >= memblock_region_memory_base_pfn(r) &&
- memblock_is_mirror(r)) {
- /* already initialized as NORMAL */
- pfn = memblock_region_memory_end_pfn(r);
+ if (context == MEMMAP_EARLY) {
+ if (!early_pfn_valid(pfn)) {
+ pfn = next_valid_pfn(pfn) - 1;
continue;
}
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ if (overlap_memmap_init(zone, &pfn))
+ continue;
+ if (defer_init(nid, pfn, end_pfn))
+ break;
}
-#endif
-not_early:
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
if (context == MEMMAP_HOTPLUG)
@@ -5535,9 +5537,6 @@ not_early:
* can be created for invalid pages (for alignment)
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
- *
- * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
- * because this is done early in sparse_add_one_section
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
@@ -5555,10 +5554,11 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
}
-#ifndef __HAVE_ARCH_MEMMAP_INIT
-#define memmap_init(size, nid, zone, start_pfn) \
- memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL)
-#endif
+void __meminit __weak memmap_init(unsigned long size, int nid,
+ unsigned long zone, unsigned long start_pfn)
+{
+ memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
+}
static int zone_batchsize(struct zone *zone)
{
@@ -5567,13 +5567,12 @@ static int zone_batchsize(struct zone *zone)
/*
* The per-cpu-pages pools are set to around 1000th of the
- * size of the zone. But no more than 1/2 of a meg.
- *
- * OK, so we don't know how big the cache is. So guess.
+ * size of the zone.
*/
batch = zone->managed_pages / 1024;
- if (batch * PAGE_SIZE > 512 * 1024)
- batch = (512 * 1024) / PAGE_SIZE;
+ /* But no more than a meg. */
+ if (batch * PAGE_SIZE > 1024 * 1024)
+ batch = (1024 * 1024) / PAGE_SIZE;
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;
@@ -6124,7 +6123,7 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
return usemapsize / 8;
}
-static void __init setup_usemap(struct pglist_data *pgdat,
+static void __ref setup_usemap(struct pglist_data *pgdat,
struct zone *zone,
unsigned long zone_start_pfn,
unsigned long zonesize)
@@ -6144,7 +6143,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-void __paginginit set_pageblock_order(void)
+void __init set_pageblock_order(void)
{
unsigned int order;
@@ -6172,14 +6171,14 @@ void __paginginit set_pageblock_order(void)
* include/linux/pageblock-flags.h for the values of pageblock_order based on
* the kernel config
*/
-void __paginginit set_pageblock_order(void)
+void __init set_pageblock_order(void)
{
}
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
-static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
- unsigned long present_pages)
+static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
+ unsigned long present_pages)
{
unsigned long pages = spanned_pages;
@@ -6198,39 +6197,99 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
}
-/*
- * Set up the zone data structures:
- * - mark all pages reserved
- * - mark all memory queues empty
- * - clear the memory bitmaps
- *
- * NOTE: pgdat should get zeroed by caller.
- */
-static void __paginginit free_area_init_core(struct pglist_data *pgdat)
-{
- enum zone_type j;
- int nid = pgdat->node_id;
-
- pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
+static void pgdat_init_numabalancing(struct pglist_data *pgdat)
+{
spin_lock_init(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies;
+}
+#else
+static void pgdat_init_numabalancing(struct pglist_data *pgdat) {}
#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pgdat_init_split_queue(struct pglist_data *pgdat)
+{
spin_lock_init(&pgdat->split_queue_lock);
INIT_LIST_HEAD(&pgdat->split_queue);
pgdat->split_queue_len = 0;
+}
+#else
+static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
#endif
- init_waitqueue_head(&pgdat->kswapd_wait);
- init_waitqueue_head(&pgdat->pfmemalloc_wait);
+
#ifdef CONFIG_COMPACTION
+static void pgdat_init_kcompactd(struct pglist_data *pgdat)
+{
init_waitqueue_head(&pgdat->kcompactd_wait);
+}
+#else
+static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
#endif
+
+static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
+{
+ pgdat_resize_init(pgdat);
+
+ pgdat_init_numabalancing(pgdat);
+ pgdat_init_split_queue(pgdat);
+ pgdat_init_kcompactd(pgdat);
+
+ init_waitqueue_head(&pgdat->kswapd_wait);
+ init_waitqueue_head(&pgdat->pfmemalloc_wait);
+
pgdat_page_ext_init(pgdat);
spin_lock_init(&pgdat->lru_lock);
lruvec_init(node_lruvec(pgdat));
+}
+
+static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
+ unsigned long remaining_pages)
+{
+ zone->managed_pages = remaining_pages;
+ zone_set_nid(zone, nid);
+ zone->name = zone_names[idx];
+ zone->zone_pgdat = NODE_DATA(nid);
+ spin_lock_init(&zone->lock);
+ zone_seqlock_init(zone);
+ zone_pcp_init(zone);
+}
+
+/*
+ * Set up the zone data structures
+ * - init pgdat internals
+ * - init all zones belonging to this node
+ *
+ * NOTE: this function is only called during memory hotplug
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __ref free_area_init_core_hotplug(int nid)
+{
+ enum zone_type z;
+ pg_data_t *pgdat = NODE_DATA(nid);
+ pgdat_init_internals(pgdat);
+ for (z = 0; z < MAX_NR_ZONES; z++)
+ zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
+}
+#endif
+
+/*
+ * Set up the zone data structures:
+ * - mark all pages reserved
+ * - mark all memory queues empty
+ * - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
+ * NOTE: this function is only called during early init.
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat)
+{
+ enum zone_type j;
+ int nid = pgdat->node_id;
+
+ pgdat_init_internals(pgdat);
pgdat->per_cpu_nodestats = &boot_nodestats;
for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -6278,15 +6337,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
* when the bootmem allocator frees pages into the buddy system.
* And all highmem pages will be managed by the buddy system.
*/
- zone->managed_pages = freesize;
-#ifdef CONFIG_NUMA
- zone->node = nid;
-#endif
- zone->name = zone_names[j];
- zone->zone_pgdat = pgdat;
- spin_lock_init(&zone->lock);
- zone_seqlock_init(zone);
- zone_pcp_init(zone);
+ zone_init_internals(zone, j, nid, freesize);
if (!size)
continue;
@@ -6346,8 +6397,24 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
-void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
- unsigned long node_start_pfn, unsigned long *zholes_size)
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
+{
+ /*
+ * We start only with one section of pages, more pages are added as
+ * needed until the rest of deferred pages are initialized.
+ */
+ pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
+ pgdat->node_spanned_pages);
+ pgdat->first_deferred_pfn = ULONG_MAX;
+}
+#else
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
+#endif
+
+void __init free_area_init_node(int nid, unsigned long *zones_size,
+ unsigned long node_start_pfn,
+ unsigned long *zholes_size)
{
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long start_pfn = 0;
@@ -6371,16 +6438,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
zones_size, zholes_size);
alloc_node_mem_map(pgdat);
+ pgdat_set_deferred_range(pgdat);
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- /*
- * We start only with one section of pages, more pages are added as
- * needed until the rest of deferred pages are initialized.
- */
- pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
- pgdat->node_spanned_pages);
- pgdat->first_deferred_pfn = ULONG_MAX;
-#endif
free_area_init_core(pgdat);
}
@@ -6392,7 +6451,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
* may be accessed (for example page_to_pfn() on some configuration accesses
* flags). We must explicitly zero those struct pages.
*/
-void __paginginit zero_resv_unavail(void)
+void __init zero_resv_unavail(void)
{
phys_addr_t start, end;
unsigned long pfn;
@@ -6405,8 +6464,11 @@ void __paginginit zero_resv_unavail(void)
pgcnt = 0;
for_each_resv_unavail_range(i, &start, &end) {
for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages)))
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ + pageblock_nr_pages - 1;
continue;
+ }
mm_zero_struct_page(pfn_to_page(pfn));
pgcnt++;
}
@@ -8037,3 +8099,33 @@ bool is_free_buddy_page(struct page *page)
return order < MAX_ORDER;
}
+
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
+ * test is performed under the zone lock to prevent a race against page
+ * allocation.
+ */
+bool set_hwpoison_free_buddy_page(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long flags;
+ unsigned int order;
+ bool hwpoisoned = false;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ struct page *page_head = page - (pfn & ((1 << order) - 1));
+
+ if (PageBuddy(page_head) && page_order(page_head) >= order) {
+ if (!TestSetPageHWPoison(page))
+ hwpoisoned = true;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ return hwpoisoned;
+}
+#endif
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 5295ef331165..a9826da84ccb 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -120,7 +120,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
pgdat->node_page_ext = NULL;
}
-struct page_ext *lookup_page_ext(struct page *page)
+struct page_ext *lookup_page_ext(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
unsigned long index;
@@ -195,7 +195,7 @@ fail:
#else /* CONFIG_FLAT_NODE_MEM_MAP */
-struct page_ext *lookup_page_ext(struct page *page)
+struct page_ext *lookup_page_ext(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
diff --git a/mm/page_owner.c b/mm/page_owner.c
index d80adfe702d3..c2494f034d02 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -274,7 +274,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
*/
for (; pfn < end_pfn; ) {
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
@@ -541,7 +541,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
unsigned long block_end_pfn;
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 0b6480979ac7..a749d4d96e3e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -170,6 +170,14 @@ static LIST_HEAD(pcpu_map_extend_chunks);
int pcpu_nr_empty_pop_pages;
/*
+ * The number of populated pages in use by the allocator, protected by
+ * pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets
+ * allocated/deallocated, it is allocated/deallocated in all units of a chunk
+ * and increments/decrements this count by 1).
+ */
+static unsigned long pcpu_nr_populated;
+
+/*
* Balance work is used to populate or destroy chunks asynchronously. We
* try to keep the number of populated free pages between
* PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
@@ -1232,6 +1240,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
bitmap_set(chunk->populated, page_start, nr);
chunk->nr_populated += nr;
+ pcpu_nr_populated += nr;
if (!for_alloc) {
chunk->nr_empty_pop_pages += nr;
@@ -1260,6 +1269,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
chunk->nr_populated -= nr;
chunk->nr_empty_pop_pages -= nr;
pcpu_nr_empty_pop_pages -= nr;
+ pcpu_nr_populated -= nr;
}
/*
@@ -2176,6 +2186,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
+ /* include all regions of the first chunk */
+ pcpu_nr_populated += PFN_DOWN(size_sum);
+
pcpu_stats_chunk_alloc();
trace_percpu_create_chunk(base_addr);
@@ -2746,6 +2759,22 @@ void __init setup_per_cpu_areas(void)
#endif /* CONFIG_SMP */
/*
+ * pcpu_nr_pages - calculate total number of populated backing pages
+ *
+ * This reflects the number of pages populated to back chunks. Metadata is
+ * excluded in the number exposed in meminfo as the number of backing pages
+ * scales with the number of cpus and can quickly outweigh the memory used for
+ * metadata. It also keeps this calculation nice and simple.
+ *
+ * RETURNS:
+ * Total number of populated backing pages in use by the allocator.
+ */
+unsigned long pcpu_nr_pages(void)
+{
+ return pcpu_nr_populated * pcpu_nr_units;
+}
+
+/*
* Percpu allocator is initialized early during boot when neither slab or
* workqueue is available. Plug async management until everything is up
* and running.
diff --git a/mm/shmem.c b/mm/shmem.c
index 9c1302b78cdd..df108fbfe1d7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,6 +29,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
+#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/swap.h>
@@ -1388,7 +1389,6 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
struct shmem_inode_info *info, pgoff_t index)
{
/* Create a pseudo vma that just contains the policy */
- memset(vma, 0, sizeof(*vma));
vma_init(vma, NULL);
/* Bias interleave by inode number to distribute better across nodes */
vma->vm_pgoff = index + info->vfs_inode.i_ino;
@@ -2144,7 +2144,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
- inode->i_generation = get_seconds();
+ inode->i_generation = prandom_u32();
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
diff --git a/mm/slab.h b/mm/slab.h
index 68bdf498da3b..58c6c1c2a78e 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -203,7 +203,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
/* List of all root caches. */
extern struct list_head slab_root_caches;
@@ -296,7 +296,7 @@ extern void memcg_link_cache(struct kmem_cache *s);
extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
void (*deact_fn)(struct kmem_cache *));
-#else /* CONFIG_MEMCG && !CONFIG_SLOB */
+#else /* CONFIG_MEMCG_KMEM */
/* If !memcg, all caches are root. */
#define slab_root_caches slab_caches
@@ -351,7 +351,7 @@ static inline void memcg_link_cache(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 2296caf87bfb..fea3376f9816 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -127,7 +127,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
return i;
}
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
LIST_HEAD(slab_root_caches);
@@ -256,7 +256,7 @@ static inline void destroy_memcg_params(struct kmem_cache *s)
static inline void memcg_unlink_cache(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
/*
* Figure out what the alignment of the objects will be given a set of
@@ -584,7 +584,7 @@ static int shutdown_cache(struct kmem_cache *s)
return 0;
}
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
/*
* memcg_create_kmem_cache - Create a cache for a memory cgroup.
* @memcg: The memory cgroup the new cache is for.
@@ -861,7 +861,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s)
static inline void flush_memcg_workqueue(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
void slab_kmem_cache_release(struct kmem_cache *s)
{
diff --git a/mm/slub.c b/mm/slub.c
index 51258eff4178..ce2b9e5cea77 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -271,8 +271,7 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
- if (object)
- prefetch(freelist_dereference(s, object + s->offset));
+ prefetch(object + s->offset);
}
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index bd0276d5f66b..8301293331a2 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -43,12 +43,9 @@ static void * __ref __earlyonly_bootmem_alloc(int node,
unsigned long goal)
{
return memblock_virt_alloc_try_nid_raw(size, align, goal,
- BOOTMEM_ALLOC_ACCESSIBLE, node);
+ BOOTMEM_ALLOC_ACCESSIBLE, node);
}
-static void *vmemmap_buf;
-static void *vmemmap_buf_end;
-
void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
/* If the main allocator is up use that, fallback to bootmem. */
@@ -76,18 +73,10 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
/* need to make sure size is all the same during early stage */
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
{
- void *ptr;
-
- if (!vmemmap_buf)
- return vmemmap_alloc_block(size, node);
-
- /* take the from buf */
- ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
- if (ptr + size > vmemmap_buf_end)
- return vmemmap_alloc_block(size, node);
-
- vmemmap_buf = ptr + size;
+ void *ptr = sparse_buffer_alloc(size);
+ if (!ptr)
+ ptr = vmemmap_alloc_block(size, node);
return ptr;
}
@@ -272,45 +261,3 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid,
return map;
}
-
-void __init sparse_mem_maps_populate_node(struct page **map_map,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count, int nodeid)
-{
- unsigned long pnum;
- unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
- void *vmemmap_buf_start;
-
- size = ALIGN(size, PMD_SIZE);
- vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
- PMD_SIZE, __pa(MAX_DMA_ADDRESS));
-
- if (vmemmap_buf_start) {
- vmemmap_buf = vmemmap_buf_start;
- vmemmap_buf_end = vmemmap_buf_start + size * map_count;
- }
-
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- struct mem_section *ms;
-
- if (!present_section_nr(pnum))
- continue;
-
- map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL);
- if (map_map[pnum])
- continue;
- ms = __nr_to_section(pnum);
- pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
- __func__);
- ms->section_mem_map = 0;
- }
-
- if (vmemmap_buf_start) {
- /* need to free left buf */
- memblock_free_early(__pa(vmemmap_buf),
- vmemmap_buf_end - vmemmap_buf);
- vmemmap_buf = NULL;
- vmemmap_buf_end = NULL;
- }
-}
diff --git a/mm/sparse.c b/mm/sparse.c
index f13f2723950a..10b07eea9a6e 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -200,6 +200,11 @@ static inline int next_present_section_nr(int section_nr)
(section_nr <= __highest_present_section_nr)); \
section_nr = next_present_section_nr(section_nr))
+static inline unsigned long first_present_section_nr(void)
+{
+ return next_present_section_nr(-1);
+}
+
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
@@ -257,19 +262,14 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
-static int __meminit sparse_init_one_section(struct mem_section *ms,
+static void __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long pnum, struct page *mem_map,
unsigned long *pageblock_bitmap)
{
- if (!present_section(ms))
- return -EINVAL;
-
ms->section_mem_map &= ~SECTION_MAP_MASK;
ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
SECTION_HAS_MEM_MAP;
ms->pageblock_flags = pageblock_bitmap;
-
- return 1;
}
unsigned long usemap_size(void)
@@ -370,160 +370,121 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
-static void __init sparse_early_usemaps_alloc_node(void *data,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long usemap_count, int nodeid)
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static unsigned long __init section_map_size(void)
{
- void *usemap;
- unsigned long pnum;
- unsigned long **usemap_map = (unsigned long **)data;
- int size = usemap_size();
-
- usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
- size * usemap_count);
- if (!usemap) {
- pr_warn("%s: allocation failed\n", __func__);
- return;
- }
+ return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
+}
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- if (!present_section_nr(pnum))
- continue;
- usemap_map[pnum] = usemap;
- usemap += size;
- check_usemap_section_nr(nodeid, usemap_map[pnum]);
- }
+#else
+static unsigned long __init section_map_size(void)
+{
+ return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
struct vmem_altmap *altmap)
{
- struct page *map;
- unsigned long size;
+ unsigned long size = section_map_size();
+ struct page *map = sparse_buffer_alloc(size);
+
+ if (map)
+ return map;
- size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
map = memblock_virt_alloc_try_nid(size,
PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
BOOTMEM_ALLOC_ACCESSIBLE, nid);
return map;
}
-void __init sparse_mem_maps_populate_node(struct page **map_map,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count, int nodeid)
-{
- void *map;
- unsigned long pnum;
- unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
-
- size = PAGE_ALIGN(size);
- map = memblock_virt_alloc_try_nid_raw(size * map_count,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
- if (map) {
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- if (!present_section_nr(pnum))
- continue;
- map_map[pnum] = map;
- map += size;
- }
- return;
- }
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
- /* fallback */
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- struct mem_section *ms;
+static void *sparsemap_buf __meminitdata;
+static void *sparsemap_buf_end __meminitdata;
- if (!present_section_nr(pnum))
- continue;
- map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL);
- if (map_map[pnum])
- continue;
- ms = __nr_to_section(pnum);
- pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
- __func__);
- ms->section_mem_map = 0;
- }
+static void __init sparse_buffer_init(unsigned long size, int nid)
+{
+ WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
+ sparsemap_buf =
+ memblock_virt_alloc_try_nid_raw(size, PAGE_SIZE,
+ __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ sparsemap_buf_end = sparsemap_buf + size;
}
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
-static void __init sparse_early_mem_maps_alloc_node(void *data,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count, int nodeid)
+static void __init sparse_buffer_fini(void)
{
- struct page **map_map = (struct page **)data;
- sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
- map_count, nodeid);
+ unsigned long size = sparsemap_buf_end - sparsemap_buf;
+
+ if (sparsemap_buf && size > 0)
+ memblock_free_early(__pa(sparsemap_buf), size);
+ sparsemap_buf = NULL;
}
-#else
-static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
-{
- struct page *map;
- struct mem_section *ms = __nr_to_section(pnum);
- int nid = sparse_early_nid(ms);
- map = sparse_mem_map_populate(pnum, nid, NULL);
- if (map)
- return map;
+void * __meminit sparse_buffer_alloc(unsigned long size)
+{
+ void *ptr = NULL;
- pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
- __func__);
- ms->section_mem_map = 0;
- return NULL;
+ if (sparsemap_buf) {
+ ptr = PTR_ALIGN(sparsemap_buf, size);
+ if (ptr + size > sparsemap_buf_end)
+ ptr = NULL;
+ else
+ sparsemap_buf = ptr + size;
+ }
+ return ptr;
}
-#endif
void __weak __meminit vmemmap_populate_print_last(void)
{
}
-/**
- * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap
- * @map: usemap_map for pageblock flags or mmap_map for vmemmap
+/*
+ * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
+ * And number of present sections in this node is map_count.
*/
-static void __init alloc_usemap_and_memmap(void (*alloc_func)
- (void *, unsigned long, unsigned long,
- unsigned long, int), void *data)
+static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count)
{
- unsigned long pnum;
- unsigned long map_count;
- int nodeid_begin = 0;
- unsigned long pnum_begin = 0;
-
- for_each_present_section_nr(0, pnum) {
- struct mem_section *ms;
+ unsigned long pnum, usemap_longs, *usemap;
+ struct page *map;
- ms = __nr_to_section(pnum);
- nodeid_begin = sparse_early_nid(ms);
- pnum_begin = pnum;
- break;
+ usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
+ usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
+ usemap_size() *
+ map_count);
+ if (!usemap) {
+ pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
+ goto failed;
+ }
+ sparse_buffer_init(map_count * section_map_size(), nid);
+ for_each_present_section_nr(pnum_begin, pnum) {
+ if (pnum >= pnum_end)
+ break;
+
+ map = sparse_mem_map_populate(pnum, nid, NULL);
+ if (!map) {
+ pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
+ __func__, nid);
+ pnum_begin = pnum;
+ goto failed;
+ }
+ check_usemap_section_nr(nid, usemap);
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
+ usemap += usemap_longs;
}
- map_count = 1;
- for_each_present_section_nr(pnum_begin + 1, pnum) {
+ sparse_buffer_fini();
+ return;
+failed:
+ /* We failed to allocate, mark all the following pnums as not present */
+ for_each_present_section_nr(pnum_begin, pnum) {
struct mem_section *ms;
- int nodeid;
+ if (pnum >= pnum_end)
+ break;
ms = __nr_to_section(pnum);
- nodeid = sparse_early_nid(ms);
- if (nodeid == nodeid_begin) {
- map_count++;
- continue;
- }
- /* ok, we need to take cake of from pnum_begin to pnum - 1*/
- alloc_func(data, pnum_begin, pnum,
- map_count, nodeid_begin);
- /* new start, update count etc*/
- nodeid_begin = nodeid;
- pnum_begin = pnum;
- map_count = 1;
+ ms->section_mem_map = 0;
}
- /* ok, last chunk */
- alloc_func(data, pnum_begin, __highest_present_section_nr+1,
- map_count, nodeid_begin);
}
/*
@@ -532,72 +493,29 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func)
*/
void __init sparse_init(void)
{
- unsigned long pnum;
- struct page *map;
- unsigned long *usemap;
- unsigned long **usemap_map;
- int size;
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- int size2;
- struct page **map_map;
-#endif
-
- /* see include/linux/mmzone.h 'struct mem_section' definition */
- BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
+ unsigned long pnum_begin = first_present_section_nr();
+ int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
+ unsigned long pnum_end, map_count = 1;
/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
set_pageblock_order();
- /*
- * map is using big page (aka 2M in x86 64 bit)
- * usemap is less one page (aka 24 bytes)
- * so alloc 2M (with 2M align) and 24 bytes in turn will
- * make next 2M slip to one more 2M later.
- * then in big system, the memory will have a lot of holes...
- * here try to allocate 2M pages continuously.
- *
- * powerpc need to call sparse_init_one_section right after each
- * sparse_early_mem_map_alloc, so allocate usemap_map at first.
- */
- size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
- usemap_map = memblock_virt_alloc(size, 0);
- if (!usemap_map)
- panic("can not allocate usemap_map\n");
- alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
- (void *)usemap_map);
-
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
- map_map = memblock_virt_alloc(size2, 0);
- if (!map_map)
- panic("can not allocate map_map\n");
- alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
- (void *)map_map);
-#endif
-
- for_each_present_section_nr(0, pnum) {
- usemap = usemap_map[pnum];
- if (!usemap)
- continue;
+ for_each_present_section_nr(pnum_begin + 1, pnum_end) {
+ int nid = sparse_early_nid(__nr_to_section(pnum_end));
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- map = map_map[pnum];
-#else
- map = sparse_early_mem_map_alloc(pnum);
-#endif
- if (!map)
+ if (nid == nid_begin) {
+ map_count++;
continue;
-
- sparse_init_one_section(__nr_to_section(pnum), pnum, map,
- usemap);
+ }
+ /* Init node with sections in range [pnum_begin, pnum_end) */
+ sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
+ nid_begin = nid;
+ pnum_begin = pnum_end;
+ map_count = 1;
}
-
+ /* cover the last node */
+ sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
vmemmap_populate_print_last();
-
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- memblock_free_early(__pa(map_map), size2);
-#endif
- memblock_free_early(__pa(usemap_map), size);
}
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -760,6 +678,7 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
ret = sparse_index_init(section_nr, pgdat->node_id);
if (ret < 0 && ret != -EEXIST)
return ret;
+ ret = 0;
memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, altmap);
if (!memmap)
return -ENOMEM;
@@ -786,12 +705,11 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
#endif
section_mark_present(ms);
-
- ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
+ sparse_init_one_section(ms, section_nr, memmap, usemap);
out:
pgdat_resize_unlock(pgdat, &flags);
- if (ret <= 0) {
+ if (ret < 0) {
kfree(usemap);
__kfree_section_memmap(memmap, altmap);
}
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index a791411fed71..63a7b4563a57 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -38,9 +38,9 @@ static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
static bool swap_slot_cache_active;
bool swap_slot_cache_enabled;
static bool swap_slot_cache_initialized;
-DEFINE_MUTEX(swap_slots_cache_mutex);
+static DEFINE_MUTEX(swap_slots_cache_mutex);
/* Serialize swap slots cache enable/disable operations */
-DEFINE_MUTEX(swap_slots_cache_enable_mutex);
+static DEFINE_MUTEX(swap_slots_cache_enable_mutex);
static void __drain_swap_slots_cache(unsigned int type);
static void deactivate_swap_slots_cache(void);
@@ -269,8 +269,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
cache->cur = 0;
if (swap_slot_cache_active)
- cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false,
- cache->slots);
+ cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
+ cache->slots, 1);
return cache->nr;
}
@@ -316,7 +316,7 @@ swp_entry_t get_swap_page(struct page *page)
if (PageTransHuge(page)) {
if (IS_ENABLED(CONFIG_THP_SWAP))
- get_swap_pages(1, true, &entry);
+ get_swap_pages(1, &entry, HPAGE_PMD_NR);
goto out;
}
@@ -350,7 +350,7 @@ repeat:
goto out;
}
- get_swap_pages(1, false, &entry);
+ get_swap_pages(1, &entry, 1);
out:
if (mem_cgroup_try_charge_swap(page, entry)) {
put_swap_page(page, entry);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 31c45a25b2d3..605d211012a1 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -310,8 +310,13 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
unsigned long addr)
{
struct page *page;
+ struct swap_info_struct *si;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
page = find_get_page(swap_address_space(entry), swp_offset(entry));
+ put_swap_device(si);
INC_CACHE_INFO(find_total);
if (page) {
@@ -354,8 +359,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
bool *new_page_allocated)
{
- struct page *found_page, *new_page = NULL;
- struct address_space *swapper_space = swap_address_space(entry);
+ struct page *found_page = NULL, *new_page = NULL;
+ struct swap_info_struct *si;
int err;
*new_page_allocated = false;
@@ -365,7 +370,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
- found_page = find_get_page(swapper_space, swp_offset(entry));
+ si = get_swap_device(entry);
+ if (!si)
+ break;
+ found_page = find_get_page(swap_address_space(entry),
+ swp_offset(entry));
+ put_swap_device(si);
if (found_page)
break;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8837b22c848d..9211c427a9ad 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -38,6 +38,7 @@
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
+#include <linux/stop_machine.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -204,8 +205,16 @@ static void discard_swap_cluster(struct swap_info_struct *si,
#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER HPAGE_PMD_NR
+
+#define swap_entry_size(size) (size)
#else
#define SWAPFILE_CLUSTER 256
+
+/*
+ * Define swap_entry_size() as constant to let compiler to optimize
+ * out some code if !CONFIG_THP_SWAP
+ */
+#define swap_entry_size(size) 1
#endif
#define LATENCY_LIMIT 256
@@ -269,7 +278,9 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
static inline bool cluster_is_huge(struct swap_cluster_info *info)
{
- return info->flags & CLUSTER_FLAG_HUGE;
+ if (IS_ENABLED(CONFIG_THP_SWAP))
+ return info->flags & CLUSTER_FLAG_HUGE;
+ return false;
}
static inline void cluster_clear_huge(struct swap_cluster_info *info)
@@ -296,13 +307,18 @@ static inline void unlock_cluster(struct swap_cluster_info *ci)
spin_unlock(&ci->lock);
}
+/*
+ * Determine the locking method in use for this device. Return
+ * swap_cluster_info if SSD-style cluster-based locking is in place.
+ */
static inline struct swap_cluster_info *lock_cluster_or_swap_info(
- struct swap_info_struct *si,
- unsigned long offset)
+ struct swap_info_struct *si, unsigned long offset)
{
struct swap_cluster_info *ci;
+ /* Try to use fine-grained SSD-style locking if available: */
ci = lock_cluster(si, offset);
+ /* Otherwise, fall back to traditional, coarse locking: */
if (!ci)
spin_lock(&si->lock);
@@ -863,7 +879,6 @@ no_page:
return n_ret;
}
-#ifdef CONFIG_THP_SWAP
static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
{
unsigned long idx;
@@ -871,6 +886,15 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
unsigned long offset, i;
unsigned char *map;
+ /*
+ * Should not even be attempting cluster allocations when huge
+ * page swap is disabled. Warn and fail the allocation.
+ */
+ if (!IS_ENABLED(CONFIG_THP_SWAP)) {
+ VM_WARN_ON_ONCE(1);
+ return 0;
+ }
+
if (cluster_list_empty(&si->free_clusters))
return 0;
@@ -901,13 +925,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
unlock_cluster(ci);
swap_range_free(si, offset, SWAPFILE_CLUSTER);
}
-#else
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
-{
- VM_WARN_ON_ONCE(1);
- return 0;
-}
-#endif /* CONFIG_THP_SWAP */
static unsigned long scan_swap_map(struct swap_info_struct *si,
unsigned char usage)
@@ -924,18 +941,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
}
-int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
{
- unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1;
+ unsigned long size = swap_entry_size(entry_size);
struct swap_info_struct *si, *next;
long avail_pgs;
int n_ret = 0;
int node;
/* Only single cluster request supported */
- WARN_ON_ONCE(n_goal > 1 && cluster);
+ WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
- avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages;
+ avail_pgs = atomic_long_read(&nr_swap_pages) / size;
if (avail_pgs <= 0)
goto noswap;
@@ -945,7 +962,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
if (n_goal > avail_pgs)
n_goal = avail_pgs;
- atomic_long_sub(n_goal * nr_pages, &nr_swap_pages);
+ atomic_long_sub(n_goal * size, &nr_swap_pages);
spin_lock(&swap_avail_lock);
@@ -972,14 +989,14 @@ start_over:
spin_unlock(&si->lock);
goto nextsi;
}
- if (cluster) {
+ if (size == SWAPFILE_CLUSTER) {
if (!(si->flags & SWP_FILE))
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
n_goal, swp_entries);
spin_unlock(&si->lock);
- if (n_ret || cluster)
+ if (n_ret || size == SWAPFILE_CLUSTER)
goto check_out;
pr_debug("scan_swap_map of si %d failed to find offset\n",
si->type);
@@ -1005,7 +1022,7 @@ nextsi:
check_out:
if (n_ret < n_goal)
- atomic_long_add((long)(n_goal - n_ret) * nr_pages,
+ atomic_long_add((long)(n_goal - n_ret) * size,
&nr_swap_pages);
noswap:
return n_ret;
@@ -1107,16 +1124,13 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
return p;
}
-static unsigned char __swap_entry_free(struct swap_info_struct *p,
- swp_entry_t entry, unsigned char usage)
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+ unsigned long offset,
+ unsigned char usage)
{
- struct swap_cluster_info *ci;
- unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
- ci = lock_cluster_or_swap_info(p, offset);
-
count = p->swap_map[offset];
has_cache = count & SWAP_HAS_CACHE;
@@ -1144,6 +1158,75 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
usage = count | has_cache;
p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
+ return usage;
+}
+
+/*
+ * Check whether swap entry is valid in the swap device. If so,
+ * return pointer to swap_info_struct, and keep the swap entry valid
+ * via preventing the swap device from being swapoff, until
+ * put_swap_device() is called. Otherwise return NULL.
+ *
+ * Notice that swapoff or swapoff+swapon can still happen before the
+ * preempt_disable() in get_swap_device() or after the
+ * preempt_enable() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc. The
+ * caller must be prepared for that. For example, the following
+ * situation is possible.
+ *
+ * CPU1 CPU2
+ * do_swap_page()
+ * ... swapoff+swapon
+ * __read_swap_cache_async()
+ * swapcache_prepare()
+ * __swap_duplicate()
+ * // check swap_map
+ * // verify PTE not changed
+ *
+ * In __swap_duplicate(), the swap_map need to be checked before
+ * changing partly because the specified swap entry may be for another
+ * swap device which has been swapoff. And in do_swap_page(), after
+ * the page is read from the swap device, the PTE is verified not
+ * changed with the page table locked to check whether the swap device
+ * has been swapoff or swapoff+swapon.
+ */
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ unsigned long type, offset;
+
+ if (!entry.val)
+ goto out;
+ type = swp_type(entry);
+ if (type >= nr_swapfiles)
+ goto bad_nofile;
+ si = swap_info[type];
+
+ preempt_disable();
+ if (!(si->flags & SWP_VALID))
+ goto unlock_out;
+ offset = swp_offset(entry);
+ if (offset >= si->max)
+ goto unlock_out;
+
+ return si;
+bad_nofile:
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
+out:
+ return NULL;
+unlock_out:
+ preempt_enable();
+ return NULL;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t entry, unsigned char usage)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+
+ ci = lock_cluster_or_swap_info(p, offset);
+ usage = __swap_entry_free_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
return usage;
@@ -1184,19 +1267,7 @@ void swap_free(swp_entry_t entry)
/*
* Called after dropping swapcache to decrease refcnt to swap entries.
*/
-static void swapcache_free(swp_entry_t entry)
-{
- struct swap_info_struct *p;
-
- p = _swap_info_get(entry);
- if (p) {
- if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
- free_swap_slot(entry);
- }
-}
-
-#ifdef CONFIG_THP_SWAP
-static void swapcache_free_cluster(swp_entry_t entry)
+void put_swap_page(struct page *page, swp_entry_t entry)
{
unsigned long offset = swp_offset(entry);
unsigned long idx = offset / SWAPFILE_CLUSTER;
@@ -1205,42 +1276,48 @@ static void swapcache_free_cluster(swp_entry_t entry)
unsigned char *map;
unsigned int i, free_entries = 0;
unsigned char val;
+ int size = swap_entry_size(hpage_nr_pages(page));
si = _swap_info_get(entry);
if (!si)
return;
- ci = lock_cluster(si, offset);
- VM_BUG_ON(!cluster_is_huge(ci));
- map = si->swap_map + offset;
- for (i = 0; i < SWAPFILE_CLUSTER; i++) {
- val = map[i];
- VM_BUG_ON(!(val & SWAP_HAS_CACHE));
- if (val == SWAP_HAS_CACHE)
- free_entries++;
- }
- if (!free_entries) {
- for (i = 0; i < SWAPFILE_CLUSTER; i++)
- map[i] &= ~SWAP_HAS_CACHE;
+ ci = lock_cluster_or_swap_info(si, offset);
+ if (size == SWAPFILE_CLUSTER) {
+ VM_BUG_ON(!cluster_is_huge(ci));
+ map = si->swap_map + offset;
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ val = map[i];
+ VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+ if (val == SWAP_HAS_CACHE)
+ free_entries++;
+ }
+ cluster_clear_huge(ci);
+ if (free_entries == SWAPFILE_CLUSTER) {
+ unlock_cluster_or_swap_info(si, ci);
+ spin_lock(&si->lock);
+ ci = lock_cluster(si, offset);
+ memset(map, 0, SWAPFILE_CLUSTER);
+ unlock_cluster(ci);
+ mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+ swap_free_cluster(si, idx);
+ spin_unlock(&si->lock);
+ return;
+ }
}
- cluster_clear_huge(ci);
- unlock_cluster(ci);
- if (free_entries == SWAPFILE_CLUSTER) {
- spin_lock(&si->lock);
- ci = lock_cluster(si, offset);
- memset(map, 0, SWAPFILE_CLUSTER);
- unlock_cluster(ci);
- mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
- swap_free_cluster(si, idx);
- spin_unlock(&si->lock);
- } else if (free_entries) {
- for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
- if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
- free_swap_slot(entry);
+ for (i = 0; i < size; i++, entry.val++) {
+ if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
+ unlock_cluster_or_swap_info(si, ci);
+ free_swap_slot(entry);
+ if (i == size - 1)
+ return;
+ lock_cluster_or_swap_info(si, offset);
}
}
+ unlock_cluster_or_swap_info(si, ci);
}
+#ifdef CONFIG_THP_SWAP
int split_swap_cluster(swp_entry_t entry)
{
struct swap_info_struct *si;
@@ -1255,19 +1332,7 @@ int split_swap_cluster(swp_entry_t entry)
unlock_cluster(ci);
return 0;
}
-#else
-static inline void swapcache_free_cluster(swp_entry_t entry)
-{
-}
-#endif /* CONFIG_THP_SWAP */
-
-void put_swap_page(struct page *page, swp_entry_t entry)
-{
- if (!PageTransHuge(page))
- swapcache_free(entry);
- else
- swapcache_free_cluster(entry);
-}
+#endif
static int swp_entry_cmp(const void *ent1, const void *ent2)
{
@@ -1328,11 +1393,18 @@ int page_swapcount(struct page *page)
return count;
}
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+int __swap_count(swp_entry_t entry)
{
+ struct swap_info_struct *si;
pgoff_t offset = swp_offset(entry);
+ int count = 0;
- return swap_count(si->swap_map[offset]);
+ si = get_swap_device(entry);
+ if (si) {
+ count = swap_count(si->swap_map[offset]);
+ put_swap_device(si);
+ }
+ return count;
}
static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
@@ -1357,9 +1429,11 @@ int __swp_swapcount(swp_entry_t entry)
int count = 0;
struct swap_info_struct *si;
- si = __swap_info_get(entry);
- if (si)
+ si = get_swap_device(entry);
+ if (si) {
count = swap_swapcount(si, entry);
+ put_swap_device(si);
+ }
return count;
}
@@ -1409,7 +1483,6 @@ out:
return count;
}
-#ifdef CONFIG_THP_SWAP
static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
swp_entry_t entry)
{
@@ -1422,12 +1495,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
- if (map[roffset] != SWAP_HAS_CACHE)
+ if (swap_count(map[roffset]))
ret = true;
goto unlock_out;
}
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
- if (map[offset + i] != SWAP_HAS_CACHE) {
+ if (swap_count(map[offset + i])) {
ret = true;
break;
}
@@ -1442,7 +1515,7 @@ static bool page_swapped(struct page *page)
swp_entry_t entry;
struct swap_info_struct *si;
- if (likely(!PageTransCompound(page)))
+ if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
return page_swapcount(page) != 0;
page = compound_head(page);
@@ -1466,10 +1539,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
/* hugetlbfs shouldn't call it */
VM_BUG_ON_PAGE(PageHuge(page), page);
- if (likely(!PageTransCompound(page))) {
- mapcount = atomic_read(&page->_mapcount) + 1;
- if (total_mapcount)
- *total_mapcount = mapcount;
+ if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
+ mapcount = page_trans_huge_mapcount(page, total_mapcount);
if (PageSwapCache(page))
swapcount = page_swapcount(page);
if (total_swapcount)
@@ -1516,26 +1587,6 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
return map_swapcount;
}
-#else
-#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry)
-#define page_swapped(page) (page_swapcount(page) != 0)
-
-static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
- int *total_swapcount)
-{
- int mapcount, swapcount = 0;
-
- /* hugetlbfs shouldn't call it */
- VM_BUG_ON_PAGE(PageHuge(page), page);
-
- mapcount = page_trans_huge_mapcount(page, total_mapcount);
- if (PageSwapCache(page))
- swapcount = page_swapcount(page);
- if (total_swapcount)
- *total_swapcount = swapcount;
- return mapcount + swapcount;
-}
-#endif
/*
* We can write to an anon page without COW if there are no other references
@@ -1800,8 +1851,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
- set_pte_at(vma->vm_mm, addr, pte,
- pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
page_add_anon_rmap(page, vma, addr, false);
mem_cgroup_commit_charge(page, memcg, true, false);
@@ -1810,6 +1859,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
+ set_pte_at(vma->vm_mm, addr, pte,
+ pte_mkold(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
/*
* Move the page to the active list so it is not
@@ -2451,9 +2502,9 @@ static int swap_node(struct swap_info_struct *p)
return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
}
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
- unsigned char *swap_map,
- struct swap_cluster_info *cluster_info)
+static void setup_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info)
{
int i;
@@ -2478,7 +2529,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
}
p->swap_map = swap_map;
p->cluster_info = cluster_info;
- p->flags |= SWP_WRITEOK;
+}
+
+static void _enable_swap_info(struct swap_info_struct *p)
+{
+ p->flags |= SWP_WRITEOK | SWP_VALID;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
@@ -2497,6 +2552,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
add_to_avail_list(p);
}
+static int swap_onoff_stop(void *arg)
+{
+ return 0;
+}
+
static void enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info,
@@ -2505,7 +2565,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
frontswap_init(p->type, frontswap_map);
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, prio, swap_map, cluster_info);
+ setup_swap_info(p, prio, swap_map, cluster_info);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * Guarantee swap_map, cluster_info, etc. fields are used
+ * between get/put_swap_device() only if SWP_VALID bit is set
+ */
+ stop_machine(swap_onoff_stop, NULL, cpu_online_mask);
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2514,7 +2584,8 @@ static void reinsert_swap_info(struct swap_info_struct *p)
{
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2617,6 +2688,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
reenable_swap_slots_cache_unlock();
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ p->flags &= ~SWP_VALID; /* mark swap device as invalid */
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * wait for swap operations protected by get/put_swap_device()
+ * to complete
+ */
+ stop_machine(swap_onoff_stop, NULL, cpu_online_mask);
+
flush_work(&p->discard_work);
destroy_swap_extents(p);
@@ -3380,22 +3462,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
struct swap_info_struct *p;
struct swap_cluster_info *ci;
- unsigned long offset, type;
+ unsigned long offset;
unsigned char count;
unsigned char has_cache;
int err = -EINVAL;
- if (non_swap_entry(entry))
+ p = get_swap_device(entry);
+ if (!p)
goto out;
- type = swp_type(entry);
- if (type >= nr_swapfiles)
- goto bad_file;
- p = swap_info[type];
offset = swp_offset(entry);
- if (unlikely(offset >= p->max))
- goto out;
-
ci = lock_cluster_or_swap_info(p, offset);
count = p->swap_map[offset];
@@ -3441,11 +3517,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unlock_out:
unlock_cluster_or_swap_info(p, ci);
out:
+ if (p)
+ put_swap_device(p);
return err;
-
-bad_file:
- pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
- goto out;
}
/*
@@ -3537,6 +3611,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
struct page *list_page;
pgoff_t offset;
unsigned char count;
+ int ret = 0;
/*
* When debugging, it's easier to use __GFP_ZERO here; but it's better
@@ -3544,15 +3619,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
*/
page = alloc_page(gfp_mask | __GFP_HIGHMEM);
- si = swap_info_get(entry);
+ si = get_swap_device(entry);
if (!si) {
/*
* An acceptable race has occurred since the failing
- * __swap_duplicate(): the swap entry has been freed,
- * perhaps even the whole swap_map cleared for swapoff.
+ * __swap_duplicate(): the swap device may be swapoff
*/
goto outer;
}
+ spin_lock(&si->lock);
offset = swp_offset(entry);
@@ -3570,9 +3645,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
}
if (!page) {
- unlock_cluster(ci);
- spin_unlock(&si->lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
/*
@@ -3624,10 +3698,11 @@ out_unlock_cont:
out:
unlock_cluster(ci);
spin_unlock(&si->lock);
+ put_swap_device(si);
outer:
if (page)
__free_page(page);
- return 0;
+ return ret;
}
/*
diff --git a/mm/vmacache.c b/mm/vmacache.c
index db7596eb6132..ea517bef7dc5 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -6,6 +6,18 @@
#include <linux/sched/task.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
+#include <asm/pgtable.h>
+
+/*
+ * Hash based on the pmd of addr if configured with MMU, which provides a good
+ * hit rate for workloads with spatial locality. Otherwise, use pages.
+ */
+#ifdef CONFIG_MMU
+#define VMACACHE_SHIFT PMD_SHIFT
+#else
+#define VMACACHE_SHIFT PAGE_SHIFT
+#endif
+#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK)
/*
* Flush vma caches for threads that share a given mm.
@@ -87,6 +99,7 @@ static bool vmacache_valid(struct mm_struct *mm)
struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
{
+ int idx = VMACACHE_HASH(addr);
int i;
count_vm_vmacache_event(VMACACHE_FIND_CALLS);
@@ -95,16 +108,20 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
return NULL;
for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache.vmas[i];
+ struct vm_area_struct *vma = current->vmacache.vmas[idx];
- if (!vma)
- continue;
- if (WARN_ON_ONCE(vma->vm_mm != mm))
- break;
- if (vma->vm_start <= addr && vma->vm_end > addr) {
- count_vm_vmacache_event(VMACACHE_FIND_HITS);
- return vma;
+ if (vma) {
+#ifdef CONFIG_DEBUG_VM_VMACACHE
+ if (WARN_ON_ONCE(vma->vm_mm != mm))
+ break;
+#endif
+ if (vma->vm_start <= addr && vma->vm_end > addr) {
+ count_vm_vmacache_event(VMACACHE_FIND_HITS);
+ return vma;
+ }
}
+ if (++idx == VMACACHE_SIZE)
+ idx = 0;
}
return NULL;
@@ -115,6 +132,7 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
+ int idx = VMACACHE_HASH(start);
int i;
count_vm_vmacache_event(VMACACHE_FIND_CALLS);
@@ -123,12 +141,14 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
return NULL;
for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache.vmas[i];
+ struct vm_area_struct *vma = current->vmacache.vmas[idx];
if (vma && vma->vm_start == start && vma->vm_end == end) {
count_vm_vmacache_event(VMACACHE_FIND_HITS);
return vma;
}
+ if (++idx == VMACACHE_SIZE)
+ idx = 0;
}
return NULL;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index cfea25be7754..a728fc492557 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1907,10 +1907,6 @@ void *vzalloc_node(unsigned long size, int node)
}
EXPORT_SYMBOL(vzalloc_node);
-#ifndef PAGE_KERNEL_EXEC
-# define PAGE_KERNEL_EXEC PAGE_KERNEL
-#endif
-
/**
* vmalloc_exec - allocate virtually contiguous, executable memory
* @size: allocation size
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 575747728ee6..0ff97e860759 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -65,12 +65,6 @@ struct scan_control {
/* How many pages shrink_list() should reclaim */
unsigned long nr_to_reclaim;
- /* This context's GFP mask */
- gfp_t gfp_mask;
-
- /* Allocation order */
- int order;
-
/*
* Nodemask of nodes allowed by the caller. If NULL, all nodes
* are scanned.
@@ -83,12 +77,6 @@ struct scan_control {
*/
struct mem_cgroup *target_mem_cgroup;
- /* Scan (total_size >> priority) pages at once */
- int priority;
-
- /* The highest zone to isolate pages for reclaim from */
- enum zone_type reclaim_idx;
-
/* Writepage batching in laptop mode; RECLAIM_WRITE */
unsigned int may_writepage:1;
@@ -111,6 +99,18 @@ struct scan_control {
/* One of the zones is ready for compaction */
unsigned int compaction_ready:1;
+ /* Allocation order */
+ s8 order;
+
+ /* Scan (total_size >> priority) pages at once */
+ s8 priority;
+
+ /* The highest zone to isolate pages for reclaim from */
+ s8 reclaim_idx;
+
+ /* This context's GFP mask */
+ gfp_t gfp_mask;
+
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
@@ -169,6 +169,70 @@ unsigned long vm_total_pages;
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
+#ifdef CONFIG_MEMCG_KMEM
+
+/*
+ * We allow subsystems to populate their shrinker-related
+ * LRU lists before register_shrinker_prepared() is called
+ * for the shrinker, since we don't want to impose
+ * restrictions on their internal registration order.
+ * In this case shrink_slab_memcg() may find corresponding
+ * bit is set in the shrinkers map.
+ *
+ * This value is used by the function to detect registering
+ * shrinkers and to skip do_shrink_slab() calls for them.
+ */
+#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
+
+static DEFINE_IDR(shrinker_idr);
+static int shrinker_nr_max;
+
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ int id, ret = -ENOMEM;
+
+ down_write(&shrinker_rwsem);
+ /* This may call shrinker, so it must use down_read_trylock() */
+ id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ goto unlock;
+
+ if (id >= shrinker_nr_max) {
+ if (memcg_expand_shrinker_maps(id)) {
+ idr_remove(&shrinker_idr, id);
+ goto unlock;
+ }
+
+ shrinker_nr_max = id + 1;
+ }
+ shrinker->id = id;
+ ret = 0;
+unlock:
+ up_write(&shrinker_rwsem);
+ return ret;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+ int id = shrinker->id;
+
+ BUG_ON(id < 0);
+
+ down_write(&shrinker_rwsem);
+ idr_remove(&shrinker_idr, id);
+ up_write(&shrinker_rwsem);
+}
+#else /* CONFIG_MEMCG_KMEM */
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ return 0;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
@@ -313,11 +377,28 @@ int prealloc_shrinker(struct shrinker *shrinker)
shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
if (!shrinker->nr_deferred)
return -ENOMEM;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+ if (prealloc_memcg_shrinker(shrinker))
+ goto free_deferred;
+ }
+
return 0;
+
+free_deferred:
+ kfree(shrinker->nr_deferred);
+ shrinker->nr_deferred = NULL;
+ return -ENOMEM;
}
void free_prealloced_shrinker(struct shrinker *shrinker)
{
+ if (!shrinker->nr_deferred)
+ return;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ unregister_memcg_shrinker(shrinker);
+
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
}
@@ -326,6 +407,9 @@ void register_shrinker_prepared(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
+#ifdef CONFIG_MEMCG_KMEM
+ idr_replace(&shrinker_idr, shrinker, shrinker->id);
+#endif
up_write(&shrinker_rwsem);
}
@@ -347,6 +431,8 @@ void unregister_shrinker(struct shrinker *shrinker)
{
if (!shrinker->nr_deferred)
return;
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ unregister_memcg_shrinker(shrinker);
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
up_write(&shrinker_rwsem);
@@ -371,9 +457,12 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
: SHRINK_BATCH;
long scanned = 0, next_deferred;
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ nid = 0;
+
freeable = shrinker->count_objects(shrinker, shrinkctl);
- if (freeable == 0)
- return 0;
+ if (freeable == 0 || freeable == SHRINK_EMPTY)
+ return freeable;
/*
* copy the current shrinker scan count into a local variable
@@ -474,6 +563,84 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
return freed;
}
+#ifdef CONFIG_MEMCG_KMEM
+static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg, int priority)
+{
+ struct memcg_shrinker_map *map;
+ unsigned long freed = 0;
+ int ret, i;
+
+ if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
+ return 0;
+
+ if (!down_read_trylock(&shrinker_rwsem))
+ return 0;
+
+ map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
+ true);
+ if (unlikely(!map))
+ goto unlock;
+
+ for_each_set_bit(i, map->map, shrinker_nr_max) {
+ struct shrink_control sc = {
+ .gfp_mask = gfp_mask,
+ .nid = nid,
+ .memcg = memcg,
+ };
+ struct shrinker *shrinker;
+
+ shrinker = idr_find(&shrinker_idr, i);
+ if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
+ if (!shrinker)
+ clear_bit(i, map->map);
+ continue;
+ }
+
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY) {
+ clear_bit(i, map->map);
+ /*
+ * After the shrinker reported that it had no objects to
+ * free, but before we cleared the corresponding bit in
+ * the memcg shrinker map, a new object might have been
+ * added. To make sure, we have the bit set in this
+ * case, we invoke the shrinker one more time and reset
+ * the bit if it reports that it is not empty anymore.
+ * The memory barrier here pairs with the barrier in
+ * memcg_set_shrinker_bit():
+ *
+ * list_lru_add() shrink_slab_memcg()
+ * list_add_tail() clear_bit()
+ * <MB> <MB>
+ * set_bit() do_shrink_slab()
+ */
+ smp_mb__after_atomic();
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY)
+ ret = 0;
+ else
+ memcg_set_shrinker_bit(memcg, nid, i);
+ }
+ freed += ret;
+
+ if (rwsem_is_contended(&shrinker_rwsem)) {
+ freed = freed ? : 1;
+ break;
+ }
+ }
+unlock:
+ up_read(&shrinker_rwsem);
+ return freed;
+}
+#else /* CONFIG_MEMCG_KMEM */
+static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg, int priority)
+{
+ return 0;
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
/**
* shrink_slab - shrink slab caches
* @gfp_mask: allocation context
@@ -486,10 +653,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
* @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
* unaware shrinkers will receive a node id of 0 instead.
*
- * @memcg specifies the memory cgroup to target. If it is not NULL,
- * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
- * objects from the memory cgroup specified. Otherwise, only unaware
- * shrinkers are called.
+ * @memcg specifies the memory cgroup to target. Unaware shrinkers
+ * are called only if it is the root cgroup.
*
* @priority is sc->priority, we take the number of objects and >> by priority
* in order to get the scan target.
@@ -502,9 +667,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
{
struct shrinker *shrinker;
unsigned long freed = 0;
+ int ret;
- if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
- return 0;
+ if (!mem_cgroup_is_root(memcg))
+ return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
if (!down_read_trylock(&shrinker_rwsem))
goto out;
@@ -516,19 +682,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
.memcg = memcg,
};
- /*
- * If kernel memory accounting is disabled, we ignore
- * SHRINKER_MEMCG_AWARE flag and call all shrinkers
- * passing NULL for memcg.
- */
- if (memcg_kmem_enabled() &&
- !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
- continue;
-
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
- sc.nid = 0;
-
- freed += do_shrink_slab(&sc, shrinker, priority);
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY)
+ ret = 0;
+ freed += ret;
/*
* Bail out if someone want to register a new shrinker to
* prevent the regsitration from being stalled for long periods
@@ -554,6 +711,7 @@ void drop_slab_node(int nid)
struct mem_cgroup *memcg = NULL;
freed = 0;
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
@@ -744,7 +902,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
refcount = 2;
if (!page_ref_freeze(page, refcount))
goto cannot_free;
- /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+ /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
if (unlikely(PageDirty(page))) {
page_ref_unfreeze(page, refcount);
goto cannot_free;
@@ -2573,9 +2731,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
node_lru_pages += lru_pages;
- if (memcg)
- shrink_slab(sc->gfp_mask, pgdat->node_id,
- memcg, sc->priority);
+ shrink_slab(sc->gfp_mask, pgdat->node_id,
+ memcg, sc->priority);
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
@@ -2599,10 +2756,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
}
} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
- if (global_reclaim(sc))
- shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
- sc->priority);
-
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
@@ -3064,6 +3217,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
};
/*
+ * scan_control uses s8 fields for order, priority, and reclaim_idx.
+ * Confirm they are large enough for max values.
+ */
+ BUILD_BUG_ON(MAX_ORDER > S8_MAX);
+ BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
+ BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
+
+ /*
* Do not enter reclaim if fatal signal was delivered while throttled.
* 1 is returned so that the page allocator does not OOM kill at this
* point.
diff --git a/mm/workingset.c b/mm/workingset.c
index c17e040ce271..5cfb29ec3fd9 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -365,10 +365,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
unsigned long nodes;
unsigned long cache;
- /* list_lru lock nests inside the IRQ-safe i_pages lock */
- local_irq_disable();
nodes = list_lru_shrink_count(&shadow_nodes, sc);
- local_irq_enable();
/*
* Approximate a reasonable limit for the nodes
@@ -401,6 +398,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
}
max_nodes = cache >> (XA_CHUNK_SHIFT - 3);
+ if (!nodes)
+ return SHRINK_EMPTY;
+
if (nodes <= max_nodes)
return 0;
return nodes - max_nodes;
@@ -432,7 +432,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
/* Coming from the list, invert the lock order */
if (!xa_trylock(&mapping->i_pages)) {
- spin_unlock(lru_lock);
+ spin_unlock_irq(lru_lock);
ret = LRU_RETRY;
goto out;
}
@@ -462,26 +462,20 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
out_invalid:
- xa_unlock(&mapping->i_pages);
+ xa_unlock_irq(&mapping->i_pages);
ret = LRU_REMOVED_RETRY;
out:
- local_irq_enable();
cond_resched();
- local_irq_disable();
- spin_lock(lru_lock);
+ spin_lock_irq(lru_lock);
return ret;
}
static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
struct shrink_control *sc)
{
- unsigned long ret;
-
/* list_lru lock nests inside the IRQ-safe i_pages lock */
- local_irq_disable();
- ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
- local_irq_enable();
- return ret;
+ return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
+ NULL);
}
static struct shrinker workingset_shadow_shrinker = {
@@ -518,15 +512,17 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
- ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key);
+ ret = prealloc_shrinker(&workingset_shadow_shrinker);
if (ret)
goto err;
- ret = register_shrinker(&workingset_shadow_shrinker);
+ ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
+ &workingset_shadow_shrinker);
if (ret)
goto err_list_lru;
+ register_shrinker_prepared(&workingset_shadow_shrinker);
return 0;
err_list_lru:
- list_lru_destroy(&shadow_nodes);
+ free_prealloced_shrinker(&workingset_shadow_shrinker);
err:
return ret;
}
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 4b366d181f35..201a8ac6342e 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -746,6 +746,9 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
}
if (bud == HEADLESS) {
+ if (test_bit(UNDER_RECLAIM, &page->private))
+ return;
+
spin_lock(&pool->lock);
list_del(&page->lru);
spin_unlock(&pool->lock);
@@ -836,20 +839,20 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
}
list_for_each_prev(pos, &pool->lru) {
page = list_entry(pos, struct page, lru);
+ zhdr = page_address(page);
if (test_bit(PAGE_HEADLESS, &page->private))
/* candidate found */
break;
- zhdr = page_address(page);
if (!z3fold_page_trylock(zhdr))
continue; /* can't evict at this point */
kref_get(&zhdr->refcount);
list_del_init(&zhdr->buddy);
zhdr->cpu = -1;
- set_bit(UNDER_RECLAIM, &page->private);
break;
}
+ set_bit(UNDER_RECLAIM, &page->private);
list_del_init(&page->lru);
spin_unlock(&pool->lock);
@@ -898,6 +901,7 @@ next:
if (test_bit(PAGE_HEADLESS, &page->private)) {
if (ret == 0) {
free_z3fold_page(page);
+ atomic64_dec(&pool->pages_nr);
return 0;
}
spin_lock(&pool->lock);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index a1a9debb6fc8..58886d40786b 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -924,20 +924,7 @@ static void reset_page(struct page *page)
page->freelist = NULL;
}
-/*
- * To prevent zspage destroy during migration, zspage freeing should
- * hold locks of all pages in the zspage.
- */
-void lock_zspage(struct zspage *zspage)
-{
- struct page *page = get_first_page(zspage);
-
- do {
- lock_page(page);
- } while ((page = get_next_page(page)) != NULL);
-}
-
-int trylock_zspage(struct zspage *zspage)
+static int trylock_zspage(struct zspage *zspage)
{
struct page *cursor, *fail;
@@ -1814,6 +1801,19 @@ static enum fullness_group putback_zspage(struct size_class *class,
}
#ifdef CONFIG_COMPACTION
+/*
+ * To prevent zspage destroy during migration, zspage freeing should
+ * hold locks of all pages in the zspage.
+ */
+static void lock_zspage(struct zspage *zspage)
+{
+ struct page *page = get_first_page(zspage);
+
+ do {
+ lock_page(page);
+ } while ((page = get_next_page(page)) != NULL);
+}
+
static struct dentry *zs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name,
void *data, size_t data_size)
@@ -1906,7 +1906,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
__SetPageMovable(newpage, page_mapping(oldpage));
}
-bool zs_page_isolate(struct page *page, isolate_mode_t mode)
+static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
{
struct zs_pool *pool;
struct size_class *class;
@@ -1961,7 +1961,7 @@ bool zs_page_isolate(struct page *page, isolate_mode_t mode)
return true;
}
-int zs_page_migrate(struct address_space *mapping, struct page *newpage,
+static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
struct page *page, enum migrate_mode mode)
{
struct zs_pool *pool;
@@ -2077,7 +2077,7 @@ unpin_objects:
return ret;
}
-void zs_page_putback(struct page *page)
+static void zs_page_putback(struct page *page)
{
struct zs_pool *pool;
struct size_class *class;
@@ -2109,7 +2109,7 @@ void zs_page_putback(struct page *page)
spin_unlock(&class->lock);
}
-const struct address_space_operations zsmalloc_aops = {
+static const struct address_space_operations zsmalloc_aops = {
.isolate_page = zs_page_isolate,
.migratepage = zs_page_migrate,
.putback_page = zs_page_putback,
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 447857ffaf6b..67ef4d3c7c04 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -13,6 +13,7 @@ use POSIX;
use File::Basename;
use Cwd 'abs_path';
use Term::ANSIColor qw(:constants);
+use Encode qw(decode encode);
my $P = $0;
my $D = dirname(abs_path($P));
@@ -240,11 +241,11 @@ $check_orig = $check;
my $exit = 0;
+my $perl_version_ok = 1;
if ($^V && $^V lt $minimum_perl_version) {
+ $perl_version_ok = 0;
printf "$P: requires at least perl version %vd\n", $minimum_perl_version;
- if (!$ignore_perl_version) {
- exit(1);
- }
+ exit(1) if (!$ignore_perl_version);
}
#if no filenames are given, push '-' to read patch from stdin
@@ -346,9 +347,10 @@ our $Sparse = qr{
__force|
__iomem|
__must_check|
- __init_refok|
__kprobes|
__ref|
+ __refconst|
+ __refdata|
__rcu|
__private
}x;
@@ -847,6 +849,17 @@ sub is_maintained_obsolete {
return $status =~ /obsolete/i;
}
+sub is_SPDX_License_valid {
+ my ($license) = @_;
+
+ return 1 if (!$tree || which("python") eq "" || !(-e "$root/scripts/spdxcheck.py") || !(-e "$root/.git"));
+
+ my $root_path = abs_path($root);
+ my $status = `cd "$root_path"; echo "$license" | python scripts/spdxcheck.py -`;
+ return 0 if ($status ne "");
+ return 1;
+}
+
my $camelcase_seeded = 0;
sub seed_camelcase_includes {
return if ($camelcase_seeded);
@@ -1026,11 +1039,11 @@ if (!$quiet) {
hash_show_words(\%use_type, "Used");
hash_show_words(\%ignore_type, "Ignored");
- if ($^V lt 5.10.0) {
+ if (!$perl_version_ok) {
print << "EOM"
NOTE: perl $^V is not modern enough to detect all possible issues.
- An upgrade to at least perl v5.10.0 is suggested.
+ An upgrade to at least perl $minimum_perl_version is suggested.
EOM
}
if ($exit) {
@@ -2235,10 +2248,14 @@ sub process {
our $clean = 1;
my $signoff = 0;
+ my $author = '';
+ my $authorsignoff = 0;
my $is_patch = 0;
+ my $is_binding_patch = -1;
my $in_header_lines = $file ? 0 : 1;
my $in_commit_log = 0; #Scanning lines before patch
my $has_commit_log = 0; #Encountered lines before patch
+ my $commit_log_lines = 0; #Number of commit log lines
my $commit_log_possible_stack_dump = 0;
my $commit_log_long_line = 0;
my $commit_log_has_diff = 0;
@@ -2485,6 +2502,27 @@ sub process {
$check = $check_orig;
}
$checklicenseline = 1;
+
+ if ($realfile !~ /MAINTAINERS/) {
+ my $mixed = 0;
+ if ($realfile =~ /(Documentation\/devicetree|include\/dt-bindings).*/) {
+ if ($is_binding_patch == 0) {
+ $mixed = 1;
+ }
+ $is_binding_patch = 1;
+ } else {
+ if ($is_binding_patch == 1) {
+ $mixed = 1;
+ }
+ $is_binding_patch = 0;
+ }
+
+ if ($mixed == 1) {
+ WARN("DT_SPLIT_BINDING_PATCH",
+ "DT binding docs and includes should be a separate patch\n");
+ }
+ }
+
next;
}
@@ -2496,6 +2534,18 @@ sub process {
$cnt_lines++ if ($realcnt != 0);
+# Verify the existence of a commit log if appropriate
+# 2 is used because a $signature is counted in $commit_log_lines
+ if ($in_commit_log) {
+ if ($line !~ /^\s*$/) {
+ $commit_log_lines++; #could be a $signature
+ }
+ } elsif ($has_commit_log && $commit_log_lines < 2) {
+ WARN("COMMIT_MESSAGE",
+ "Missing commit description - Add an appropriate one\n");
+ $commit_log_lines = 2; #warn only once
+ }
+
# Check if the commit log has what seems like a diff which can confuse patch
if ($in_commit_log && !$commit_log_has_diff &&
(($line =~ m@^\s+diff\b.*a/[\w/]+@ &&
@@ -2517,10 +2567,24 @@ sub process {
}
}
+# Check the patch for a From:
+ if (decode("MIME-Header", $line) =~ /^From:\s*(.*)/) {
+ $author = $1;
+ $author = encode("utf8", $author) if ($line =~ /=\?utf-8\?/i);
+ $author =~ s/"//g;
+ }
+
# Check the patch for a signoff:
if ($line =~ /^\s*signed-off-by:/i) {
$signoff++;
$in_commit_log = 0;
+ if ($author ne '') {
+ my $l = $line;
+ $l =~ s/"//g;
+ if ($l =~ /^\s*signed-off-by:\s*\Q$author\E/i) {
+ $authorsignoff = 1;
+ }
+ }
}
# Check if MAINTAINERS is being updated. If so, there's probably no need to
@@ -2960,8 +3024,14 @@ sub process {
if ($comment !~ /^$/ &&
$rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) {
- WARN("SPDX_LICENSE_TAG",
- "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr);
+ WARN("SPDX_LICENSE_TAG",
+ "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr);
+ } elsif ($rawline =~ /(SPDX-License-Identifier: .*)/) {
+ my $spdx_license = $1;
+ if (!is_SPDX_License_valid($spdx_license)) {
+ WARN("SPDX_LICENSE_TAG",
+ "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr);
+ }
}
}
}
@@ -3079,7 +3149,7 @@ sub process {
}
# check indentation starts on a tab stop
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
$sline =~ /^\+\t+( +)(?:$c90_Keywords\b|\{\s*$|\}\s*(?:else\b|while\b|\s*$)|$Declare\s*$Ident\s*[;=])/) {
my $indent = length($1);
if ($indent % 8) {
@@ -3092,7 +3162,7 @@ sub process {
}
# check multi-line statement indentation matches previous line
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
$prevline =~ /^\+([ \t]*)((?:$c90_Keywords(?:\s+if)\s*)|(?:$Declare\s*)?(?:$Ident|\(\s*\*\s*$Ident\s*\))\s*|(?:\*\s*)*$Lval\s*=\s*$Ident\s*)\(.*(\&\&|\|\||,)\s*$/) {
$prevline =~ /^\+(\t*)(.*)$/;
my $oldindent = $1;
@@ -3781,6 +3851,26 @@ sub process {
"type '$tmp' should be specified in [[un]signed] [short|int|long|long long] order\n" . $herecurr);
}
+# check for unnecessary <signed> int declarations of short/long/long long
+ while ($sline =~ m{\b($TypeMisordered(\s*\*)*|$C90_int_types)\b}g) {
+ my $type = trim($1);
+ next if ($type !~ /\bint\b/);
+ next if ($type !~ /\b(?:short|long\s+long|long)\b/);
+ my $new_type = $type;
+ $new_type =~ s/\b\s*int\s*\b/ /;
+ $new_type =~ s/\b\s*(?:un)?signed\b\s*/ /;
+ $new_type =~ s/^const\s+//;
+ $new_type = "unsigned $new_type" if ($type =~ /\bunsigned\b/);
+ $new_type = "const $new_type" if ($type =~ /^const\b/);
+ $new_type =~ s/\s+/ /g;
+ $new_type = trim($new_type);
+ if (WARN("UNNECESSARY_INT",
+ "Prefer '$new_type' over '$type' as the int is unnecessary\n" . $herecurr) &&
+ $fix) {
+ $fixed[$fixlinenr] =~ s/\b\Q$type\E\b/$new_type/;
+ }
+ }
+
# check for static const char * arrays.
if ($line =~ /\bstatic\s+const\s+char\s*\*\s*(\w+)\s*\[\s*\]\s*=\s*/) {
WARN("STATIC_CONST_CHAR_ARRAY",
@@ -3967,7 +4057,7 @@ sub process {
# function brace can't be on same line, except for #defines of do while,
# or if closed on same line
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
$sline =~ /$Type\s*$Ident\s*$balanced_parens\s*\{/ &&
$sline !~ /\#\s*define\b.*do\s*\{/ &&
$sline !~ /}/) {
@@ -4483,11 +4573,11 @@ sub process {
#need space before brace following if, while, etc
if (($line =~ /\(.*\)\{/ && $line !~ /\($Type\)\{/) ||
- $line =~ /do\{/) {
+ $line =~ /\b(?:else|do)\{/) {
if (ERROR("SPACING",
"space required before the open brace '{'\n" . $herecurr) &&
$fix) {
- $fixed[$fixlinenr] =~ s/^(\+.*(?:do|\)))\{/$1 {/;
+ $fixed[$fixlinenr] =~ s/^(\+.*(?:do|else|\)))\{/$1 {/;
}
}
@@ -4578,7 +4668,7 @@ sub process {
# check for unnecessary parentheses around comparisons in if uses
# when !drivers/staging or command-line uses --strict
if (($realfile !~ m@^(?:drivers/staging/)@ || $check_orig) &&
- $^V && $^V ge 5.10.0 && defined($stat) &&
+ $perl_version_ok && defined($stat) &&
$stat =~ /(^.\s*if\s*($balanced_parens))/) {
my $if_stat = $1;
my $test = substr($2, 1, -1);
@@ -4615,7 +4705,7 @@ sub process {
# return is not a function
if (defined($stat) && $stat =~ /^.\s*return(\s*)\(/s) {
my $spacing = $1;
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
$stat =~ /^.\s*return\s*($balanced_parens)\s*;\s*$/) {
my $value = $1;
$value = deparenthesize($value);
@@ -4642,7 +4732,7 @@ sub process {
}
# if statements using unnecessary parentheses - ie: if ((foo == bar))
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
$line =~ /\bif\s*((?:\(\s*){2,})/) {
my $openparens = $1;
my $count = $openparens =~ tr@\(@\(@;
@@ -4659,7 +4749,7 @@ sub process {
# avoid cases like "foo + BAR < baz"
# only fix matches surrounded by parentheses to avoid incorrect
# conversions like "FOO < baz() + 5" being "misfixed" to "baz() > FOO + 5"
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
$line =~ /^\+(.*)\b($Constant|[A-Z_][A-Z0-9_]*)\s*($Compare)\s*($LvalOrFunc)/) {
my $lead = $1;
my $const = $2;
@@ -4949,6 +5039,7 @@ sub process {
if (defined $define_args && $define_args ne "") {
$define_args = substr($define_args, 1, length($define_args) - 2);
$define_args =~ s/\s*//g;
+ $define_args =~ s/\\\+?//g;
@def_args = split(",", $define_args);
}
@@ -5084,7 +5175,7 @@ sub process {
# do {} while (0) macro tests:
# single-statement macros do not need to be enclosed in do while (0) loop,
# macro should not end with a semicolon
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
$realfile !~ m@/vmlinux.lds.h$@ &&
$line =~ /^.\s*\#\s*define\s+$Ident(\()?/) {
my $ln = $linenr;
@@ -5330,15 +5421,28 @@ sub process {
}
# concatenated string without spaces between elements
- if ($line =~ /$String[A-Z_]/ || $line =~ /[A-Za-z0-9_]$String/) {
- CHK("CONCATENATED_STRING",
- "Concatenated strings should use spaces between elements\n" . $herecurr);
+ if ($line =~ /$String[A-Za-z0-9_]/ || $line =~ /[A-Za-z0-9_]$String/) {
+ if (CHK("CONCATENATED_STRING",
+ "Concatenated strings should use spaces between elements\n" . $herecurr) &&
+ $fix) {
+ while ($line =~ /($String)/g) {
+ my $extracted_string = substr($rawline, $-[0], $+[0] - $-[0]);
+ $fixed[$fixlinenr] =~ s/\Q$extracted_string\E([A-Za-z0-9_])/$extracted_string $1/;
+ $fixed[$fixlinenr] =~ s/([A-Za-z0-9_])\Q$extracted_string\E/$1 $extracted_string/;
+ }
+ }
}
# uncoalesced string fragments
if ($line =~ /$String\s*"/) {
- WARN("STRING_FRAGMENTS",
- "Consecutive strings are generally better as a single string\n" . $herecurr);
+ if (WARN("STRING_FRAGMENTS",
+ "Consecutive strings are generally better as a single string\n" . $herecurr) &&
+ $fix) {
+ while ($line =~ /($String)(?=\s*")/g) {
+ my $extracted_string = substr($rawline, $-[0], $+[0] - $-[0]);
+ $fixed[$fixlinenr] =~ s/\Q$extracted_string\E\s*"/substr($extracted_string, 0, -1)/e;
+ }
+ }
}
# check for non-standard and hex prefixed decimal printf formats
@@ -5374,9 +5478,14 @@ sub process {
# warn about #if 0
if ($line =~ /^.\s*\#\s*if\s+0\b/) {
- CHK("REDUNDANT_CODE",
- "if this code is redundant consider removing it\n" .
- $herecurr);
+ WARN("IF_0",
+ "Consider removing the code enclosed by this #if 0 and its #endif\n" . $herecurr);
+ }
+
+# warn about #if 1
+ if ($line =~ /^.\s*\#\s*if\s+1\b/) {
+ WARN("IF_1",
+ "Consider removing the #if 1 and its #endif\n" . $herecurr);
}
# check for needless "if (<foo>) fn(<foo>)" uses
@@ -5447,7 +5556,7 @@ sub process {
}
# check for mask then right shift without a parentheses
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
$line =~ /$LvalOrFunc\s*\&\s*($LvalOrFunc)\s*>>/ &&
$4 !~ /^\&/) { # $LvalOrFunc may be &foo, ignore if so
WARN("MASK_THEN_SHIFT",
@@ -5455,7 +5564,7 @@ sub process {
}
# check for pointer comparisons to NULL
- if ($^V && $^V ge 5.10.0) {
+ if ($perl_version_ok) {
while ($line =~ /\b$LvalOrFunc\s*(==|\!=)\s*NULL\b/g) {
my $val = $1;
my $equal = "!";
@@ -5727,7 +5836,7 @@ sub process {
}
# Check for __attribute__ weak, or __weak declarations (may have link issues)
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
$line =~ /(?:$Declare|$DeclareMisordered)\s*$Ident\s*$balanced_parens\s*(?:$Attribute)?\s*;/ &&
($line =~ /\b__attribute__\s*\(\s*\(.*\bweak\b/ ||
$line =~ /\b__weak\b/)) {
@@ -5809,7 +5918,7 @@ sub process {
}
# check for vsprintf extension %p<foo> misuses
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
defined $stat &&
$stat =~ /^\+(?![^\{]*\{\s*).*\b(\w+)\s*\(.*$String\s*,/s &&
$1 !~ /^_*volatile_*$/) {
@@ -5856,7 +5965,7 @@ sub process {
}
# Check for misused memsets
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
defined $stat &&
$stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*$FuncArg\s*\)/) {
@@ -5874,7 +5983,7 @@ sub process {
}
# Check for memcpy(foo, bar, ETH_ALEN) that could be ether_addr_copy(foo, bar)
-# if ($^V && $^V ge 5.10.0 &&
+# if ($perl_version_ok &&
# defined $stat &&
# $stat =~ /^\+(?:.*?)\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) {
# if (WARN("PREFER_ETHER_ADDR_COPY",
@@ -5885,7 +5994,7 @@ sub process {
# }
# Check for memcmp(foo, bar, ETH_ALEN) that could be ether_addr_equal*(foo, bar)
-# if ($^V && $^V ge 5.10.0 &&
+# if ($perl_version_ok &&
# defined $stat &&
# $stat =~ /^\+(?:.*?)\bmemcmp\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) {
# WARN("PREFER_ETHER_ADDR_EQUAL",
@@ -5894,7 +6003,7 @@ sub process {
# check for memset(foo, 0x0, ETH_ALEN) that could be eth_zero_addr
# check for memset(foo, 0xFF, ETH_ALEN) that could be eth_broadcast_addr
-# if ($^V && $^V ge 5.10.0 &&
+# if ($perl_version_ok &&
# defined $stat &&
# $stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) {
#
@@ -5916,7 +6025,7 @@ sub process {
# }
# typecasts on min/max could be min_t/max_t
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
defined $stat &&
$stat =~ /^\+(?:.*?)\b(min|max)\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\)/) {
if (defined $2 || defined $7) {
@@ -5940,7 +6049,7 @@ sub process {
}
# check usleep_range arguments
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
defined $stat &&
$stat =~ /^\+(?:.*?)\busleep_range\s*\(\s*($FuncArg)\s*,\s*($FuncArg)\s*\)/) {
my $min = $1;
@@ -5956,7 +6065,7 @@ sub process {
}
# check for naked sscanf
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
defined $stat &&
$line =~ /\bsscanf\b/ &&
($stat !~ /$Ident\s*=\s*sscanf\s*$balanced_parens/ &&
@@ -5970,7 +6079,7 @@ sub process {
}
# check for simple sscanf that should be kstrto<foo>
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
defined $stat &&
$line =~ /\bsscanf\b/) {
my $lc = $stat =~ tr@\n@@;
@@ -6042,10 +6151,12 @@ sub process {
}
# check for function definitions
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
defined $stat &&
- $stat =~ /^.\s*(?:$Storage\s+)?$Type\s*($Ident)\s*$balanced_parens\s*{/s) {
+ $stat =~ /^.\s*(?:$Storage\s+)?$Type\s*($Ident)\s*$balanced_parens\s*([\{;])/s) {
$context_function = $1;
+ my $args = $2;
+ my $term = $3;
# check for multiline function definition with misplaced open brace
my $ok = 0;
@@ -6056,12 +6167,26 @@ sub process {
$herectx .= $rl . "\n";
$ok = 1 if ($rl =~ /^[ \+]\{/);
$ok = 1 if ($rl =~ /\{/ && $n == 0);
- last if $rl =~ /^[ \+].*\{/;
+ last if ($rl =~ /^[ \+].*[\{;]/);
}
- if (!$ok) {
+ if (!$ok && $term eq '{') {
ERROR("OPEN_BRACE",
"open brace '{' following function definitions go on the next line\n" . $herectx);
}
+
+# check for 'passed by value' uses of a struct or a union that might
+# be better 'passed by reference'
+
+ while ($args =~ /(?:$Storage\s+)?($Type)\s*($Ident(?:\s*\[\s*\])?)?\s*,?/g) {
+ my $type = trim($1);
+ my $ident = defined($2) ? trim($2) : "";
+ if ($type =~ /\b(?:union|struct)\b/ &&
+ !($type =~ /(?:\*|\bconst|\])$/ ||
+ $ident =~ /\]$/)) {
+ WARN("AGGREGATE_BY_VALUE",
+ "Unusual 'passed by value' use of '$type'\n" . $herectx);
+ }
+ }
}
# checks for new __setup's
@@ -6082,14 +6207,14 @@ sub process {
# alloc style
# p = alloc(sizeof(struct foo), ...) should be p = alloc(sizeof(*p), ...)
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
$line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*([kv][mz]alloc(?:_node)?)\s*\(\s*(sizeof\s*\(\s*struct\s+$Lval\s*\))/) {
CHK("ALLOC_SIZEOF_STRUCT",
"Prefer $3(sizeof(*$1)...) over $3($4...)\n" . $herecurr);
}
# check for k[mz]alloc with multiplies that could be kmalloc_array/kcalloc
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
defined $stat &&
$stat =~ /^\+\s*($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)\s*,/) {
my $oldfunc = $3;
@@ -6118,8 +6243,9 @@ sub process {
}
# check for krealloc arg reuse
- if ($^V && $^V ge 5.10.0 &&
- $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*krealloc\s*\(\s*\1\s*,/) {
+ if ($perl_version_ok &&
+ $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*krealloc\s*\(\s*($Lval)\s*,/ &&
+ $1 eq $3) {
WARN("KREALLOC_ARG_REUSE",
"Reusing the krealloc arg is almost always a bug\n" . $herecurr);
}
@@ -6187,7 +6313,7 @@ sub process {
}
# check for switch/default statements without a break;
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
defined $stat &&
$stat =~ /^\+[$;\s]*(?:case[$;\s]+\w+[$;\s]*:[$;\s]*|)*[$;\s]*\bdefault[$;\s]*:[$;\s]*;/g) {
my $cnt = statement_rawlines($stat);
@@ -6251,6 +6377,13 @@ sub process {
"Avoid using bool as bitfield. Prefer bool bitfields as unsigned int or u<8|16|32>\n" . $herecurr);
}
+# check for bool use in .h files
+ if ($realfile =~ /\.h$/ &&
+ $sline =~ /^.\s+bool\s*$Ident\s*(?::\s*d+\s*)?;/) {
+ CHK("BOOL_MEMBER",
+ "Avoid using bool structure members because of possible alignment issues - see: https://lkml.org/lkml/2017/11/21/384\n" . $herecurr);
+ }
+
# check for semaphores initialized locked
if ($line =~ /^.\s*sema_init.+,\W?0\W?\)/) {
WARN("CONSIDER_COMPLETION",
@@ -6297,7 +6430,7 @@ sub process {
}
# likely/unlikely comparisons similar to "(likely(foo) > 0)"
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
$line =~ /\b((?:un)?likely)\s*\(\s*$FuncArg\s*\)\s*$Compare/) {
WARN("LIKELY_MISUSE",
"Using $1 should generally have parentheses around the comparison\n" . $herecurr);
@@ -6340,7 +6473,7 @@ sub process {
# check for DEVICE_ATTR uses that could be DEVICE_ATTR_<FOO>
# and whether or not function naming is typical and if
# DEVICE_ATTR permissions uses are unusual too
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
defined $stat &&
$stat =~ /\bDEVICE_ATTR\s*\(\s*(\w+)\s*,\s*\(?\s*(\s*(?:${multi_mode_perms_string_search}|0[0-7]{3,3})\s*)\s*\)?\s*,\s*(\w+)\s*,\s*(\w+)\s*\)/) {
my $var = $1;
@@ -6400,7 +6533,7 @@ sub process {
# specific definition of not visible in sysfs.
# o Ignore proc_create*(...) uses with a decimal 0 permission as that means
# use the default permissions
- if ($^V && $^V ge 5.10.0 &&
+ if ($perl_version_ok &&
defined $stat &&
$line =~ /$mode_perms_search/) {
foreach my $entry (@mode_permission_funcs) {
@@ -6486,9 +6619,14 @@ sub process {
ERROR("NOT_UNIFIED_DIFF",
"Does not appear to be a unified-diff format patch\n");
}
- if ($is_patch && $has_commit_log && $chk_signoff && $signoff == 0) {
- ERROR("MISSING_SIGN_OFF",
- "Missing Signed-off-by: line(s)\n");
+ if ($is_patch && $has_commit_log && $chk_signoff) {
+ if ($signoff == 0) {
+ ERROR("MISSING_SIGN_OFF",
+ "Missing Signed-off-by: line(s)\n");
+ } elsif (!$authorsignoff) {
+ WARN("NO_AUTHOR_SIGN_OFF",
+ "Missing Signed-off-by: line by nominal patch author '$author'\n");
+ }
}
print report_dump();
diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index c87fa734e3e1..0ebdefe74d5b 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -48,6 +48,7 @@ my $output_roles = 0;
my $output_rolestats = 1;
my $output_section_maxlen = 50;
my $scm = 0;
+my $tree = 1;
my $web = 0;
my $subsystem = 0;
my $status = 0;
@@ -61,7 +62,7 @@ my $self_test = undef;
my $version = 0;
my $help = 0;
my $find_maintainer_files = 0;
-
+my $maintainer_path;
my $vcs_used = 0;
my $exit = 0;
@@ -255,6 +256,7 @@ if (!GetOptions(
'subsystem!' => \$subsystem,
'status!' => \$status,
'scm!' => \$scm,
+ 'tree!' => \$tree,
'web!' => \$web,
'letters=s' => \$letters,
'pattern-depth=i' => \$pattern_depth,
@@ -263,6 +265,7 @@ if (!GetOptions(
'fe|file-emails!' => \$file_emails,
'f|file' => \$from_filename,
'find-maintainer-files' => \$find_maintainer_files,
+ 'mpath|maintainer-path=s' => \$maintainer_path,
'self-test:s' => \$self_test,
'v|version' => \$version,
'h|help|usage' => \$help,
@@ -319,7 +322,7 @@ if ($email &&
die "$P: Please select at least 1 email option\n";
}
-if (!top_of_kernel_tree($lk_path)) {
+if ($tree && !top_of_kernel_tree($lk_path)) {
die "$P: The current directory does not appear to be "
. "a linux kernel source tree.\n";
}
@@ -384,26 +387,37 @@ sub find_ignore_git {
read_all_maintainer_files();
sub read_all_maintainer_files {
- if (-d "${lk_path}MAINTAINERS") {
- opendir(DIR, "${lk_path}MAINTAINERS") or die $!;
- my @files = readdir(DIR);
- closedir(DIR);
- foreach my $file (@files) {
- push(@mfiles, "${lk_path}MAINTAINERS/$file") if ($file !~ /^\./);
- }
- }
-
- if ($find_maintainer_files) {
- find( { wanted => \&find_is_maintainer_file,
- preprocess => \&find_ignore_git,
- no_chdir => 1,
- }, "${lk_path}");
+ my $path = "${lk_path}MAINTAINERS";
+ if (defined $maintainer_path) {
+ $path = $maintainer_path;
+ # Perl Cookbook tilde expansion if necessary
+ $path =~ s@^~([^/]*)@ $1 ? (getpwnam($1))[7] : ( $ENV{HOME} || $ENV{LOGDIR} || (getpwuid($<))[7])@ex;
+ }
+
+ if (-d $path) {
+ $path .= '/' if ($path !~ m@/$@);
+ if ($path eq "${lk_path}MAINTAINERS/") {
+ opendir(DIR, "$path") or die $!;
+ my @files = readdir(DIR);
+ closedir(DIR);
+ foreach my $file (@files) {
+ push(@mfiles, "$path$file") if ($file !~ /^\./);
+ }
+ }
+ if ($find_maintainer_files) {
+ find( { wanted => \&find_is_maintainer_file,
+ preprocess => \&find_ignore_git,
+ no_chdir => 1,
+ }, "$path");
+ }
+ } elsif (-f "$path") {
+ push(@mfiles, "$path");
} else {
- push(@mfiles, "${lk_path}MAINTAINERS") if -f "${lk_path}MAINTAINERS";
+ die "$P: MAINTAINER file not found '$path'\n";
}
-
+ die "$P: No MAINTAINER files found in '$path'\n" if (scalar(@mfiles) == 0);
foreach my $file (@mfiles) {
- read_maintainer_file("$file");
+ read_maintainer_file("$file");
}
}
@@ -1031,13 +1045,14 @@ Other options:
--sections => print all of the subsystem sections with pattern matches
--letters => print all matching 'letter' types from all matching sections
--mailmap => use .mailmap file (default: $email_use_mailmap)
+ --no-tree => run without a kernel tree
--self-test => show potential issues with MAINTAINERS file content
--version => show version
--help => show this help information
Default options:
- [--email --nogit --git-fallback --m --r --n --l --multiline --pattern-depth=0
- --remove-duplicates --rolestats]
+ [--email --tree --nogit --git-fallback --m --r --n --l --multiline
+ --pattern-depth=0 --remove-duplicates --rolestats]
Notes:
Using "-f directory" may give unexpected results:
diff --git a/scripts/spdxcheck.py b/scripts/spdxcheck.py
index 7deaef297f52..839e190bbd7a 100755
--- a/scripts/spdxcheck.py
+++ b/scripts/spdxcheck.py
@@ -4,6 +4,7 @@
from argparse import ArgumentParser
from ply import lex, yacc
+import locale
import traceback
import sys
import git
@@ -32,7 +33,7 @@ def read_spdxdata(repo):
# The subdirectories of LICENSES in the kernel source
license_dirs = [ "preferred", "other", "exceptions" ]
- lictree = repo.heads.master.commit.tree['LICENSES']
+ lictree = repo.head.commit.tree['LICENSES']
spdx = SPDXdata()
@@ -102,7 +103,7 @@ class id_parser(object):
raise ParserException(tok, 'Invalid License ID')
self.lastid = id
elif tok.type == 'EXC':
- if not self.spdx.exceptions.has_key(id):
+ if id not in self.spdx.exceptions:
raise ParserException(tok, 'Invalid Exception ID')
if self.lastid not in self.spdx.exceptions[id]:
raise ParserException(tok, 'Exception not valid for license %s' %self.lastid)
@@ -167,6 +168,7 @@ class id_parser(object):
self.curline = 0
try:
for line in fd:
+ line = line.decode(locale.getpreferredencoding(False), errors='ignore')
self.curline += 1
if self.curline > maxlines:
break
@@ -199,11 +201,10 @@ def scan_git_tree(tree):
continue
if el.path.find("license-rules.rst") >= 0:
continue
- if el.path == 'scripts/checkpatch.pl':
- continue
if not os.path.isfile(el.path):
continue
- parser.parse_lines(open(el.path), args.maxlines, el.path)
+ with open(el.path, 'rb') as fd:
+ parser.parse_lines(fd, args.maxlines, el.path)
def scan_git_subtree(tree, path):
for p in path.strip('/').split('/'):
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 9a058cff49d4..d2688d55c34b 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -10,6 +10,8 @@
abandonning||abandoning
abigious||ambiguous
abitrate||arbitrate
+abord||abort
+aboslute||absolute
abov||above
abreviated||abbreviated
absense||absence
@@ -25,6 +27,7 @@ accessable||accessible
accesss||access
accidentaly||accidentally
accidentually||accidentally
+acclerated||accelerated
accoding||according
accomodate||accommodate
accomodates||accommodates
@@ -58,12 +61,15 @@ addres||address
adddress||address
addreses||addresses
addresss||address
+addrress||address
aditional||additional
aditionally||additionally
aditionaly||additionally
adminstrative||administrative
adress||address
adresses||addresses
+adrresses||addresses
+advertisment||advertisement
adviced||advised
afecting||affecting
againt||against
@@ -100,6 +106,7 @@ alue||value
ambigious||ambiguous
amoung||among
amout||amount
+amplifer||amplifier
an union||a union
an user||a user
an userspace||a userspace
@@ -145,11 +152,15 @@ assistent||assistant
assocation||association
associcated||associated
assotiated||associated
+asssert||assert
assum||assume
assumtpion||assumption
asuming||assuming
asycronous||asynchronous
asynchnous||asynchronous
+asynchromous||asynchronous
+asymetric||asymmetric
+asymmeric||asymmetric
atomatically||automatically
atomicly||atomically
atempt||attempt
@@ -172,6 +183,7 @@ avaible||available
availabe||available
availabled||available
availablity||availability
+availaible||available
availale||available
availavility||availability
availble||available
@@ -206,8 +218,11 @@ borad||board
boundry||boundary
brievely||briefly
broadcat||broadcast
+bufufer||buffer
cacluated||calculated
+caculate||calculate
caculation||calculation
+cadidate||candidate
calender||calendar
calescing||coalescing
calle||called
@@ -221,12 +236,14 @@ capabilty||capability
capabitilies||capabilities
capatibilities||capabilities
capapbilities||capabilities
+caputure||capture
carefuly||carefully
cariage||carriage
catagory||category
cehck||check
challange||challenge
challanges||challenges
+chache||cache
chanell||channel
changable||changeable
chanined||chained
@@ -240,6 +257,7 @@ charaters||characters
charcter||character
chcek||check
chck||check
+checksumed||checksummed
checksuming||checksumming
childern||children
childs||children
@@ -292,8 +310,10 @@ comunication||communication
conbination||combination
conditionaly||conditionally
conected||connected
+conector||connector
connecetd||connected
configuartion||configuration
+configuation||configuration
configuratoin||configuration
configuraton||configuration
configuretion||configuration
@@ -315,6 +335,7 @@ continous||continuous
continously||continuously
continueing||continuing
contraints||constraints
+contruct||construct
contol||control
contoller||controller
controled||controlled
@@ -343,6 +364,7 @@ dafault||default
deafult||default
deamon||daemon
decompres||decompress
+decsribed||described
decription||description
dectected||detected
defailt||default
@@ -379,6 +401,7 @@ desctiptor||descriptor
desriptor||descriptor
desriptors||descriptors
destionation||destination
+destoried||destroyed
destory||destroy
destoryed||destroyed
destorys||destroys
@@ -404,18 +427,25 @@ diffrentiate||differentiate
difinition||definition
dimesions||dimensions
diplay||display
+directon||direction
direectly||directly
+diregard||disregard
disassocation||disassociation
disapear||disappear
disapeared||disappeared
disappared||disappeared
+disbale||disable
+disbaled||disabled
disble||disable
disbled||disabled
disconnet||disconnect
discontinous||discontinuous
+disharge||discharge
dispertion||dispersion
dissapears||disappears
distiction||distinction
+divisable||divisible
+divsiors||divisors
docuentation||documentation
documantation||documentation
documentaion||documentation
@@ -427,6 +457,7 @@ downlad||download
downlads||downloads
druing||during
dynmaic||dynamic
+eanable||enable
easilly||easily
ecspecially||especially
edditable||editable
@@ -484,9 +515,12 @@ exprimental||experimental
extened||extended
extensability||extensibility
extention||extension
+extenstion||extension
extracter||extractor
+faield||failed
falied||failed
faild||failed
+failer||failure
faill||fail
failied||failed
faillure||failure
@@ -520,6 +554,7 @@ forseeable||foreseeable
forse||force
fortan||fortran
forwardig||forwarding
+frambuffer||framebuffer
framming||framing
framwork||framework
frequncy||frequency
@@ -527,6 +562,7 @@ frome||from
fucntion||function
fuction||function
fuctions||functions
+funcation||function
funcion||function
functionallity||functionality
functionaly||functionally
@@ -540,6 +576,7 @@ futrue||future
gaurenteed||guaranteed
generiously||generously
genereate||generate
+genereted||generated
genric||generic
globel||global
grabing||grabbing
@@ -553,6 +590,7 @@ guarentee||guarantee
halfs||halves
hander||handler
handfull||handful
+hanlde||handle
hanled||handled
happend||happened
harware||hardware
@@ -561,6 +599,7 @@ helpfull||helpful
hybernate||hibernate
hierachy||hierarchy
hierarchie||hierarchy
+homogenous||homogeneous
howver||however
hsould||should
hypervior||hypervisor
@@ -568,6 +607,8 @@ hypter||hyper
identidier||identifier
iligal||illegal
illigal||illegal
+illgal||illegal
+iomaped||iomapped
imblance||imbalance
immeadiately||immediately
immedaite||immediate
@@ -618,10 +659,12 @@ initation||initiation
initators||initiators
initialiazation||initialization
initializiation||initialization
+initialze||initialize
initialzed||initialized
initilization||initialization
initilize||initialize
inofficial||unofficial
+inrerface||interface
insititute||institute
instal||install
instanciated||instantiated
@@ -657,6 +700,7 @@ intregral||integral
intrrupt||interrupt
intterrupt||interrupt
intuative||intuitive
+inavlid||invalid
invaid||invalid
invald||invalid
invalde||invalid
@@ -683,6 +727,7 @@ langauge||language
langugage||language
lauch||launch
layed||laid
+legnth||length
leightweight||lightweight
lengh||length
lenght||length
@@ -696,6 +741,7 @@ licenceing||licencing
loggging||logging
loggin||login
logile||logfile
+loobpack||loopback
loosing||losing
losted||lost
machinary||machinery
@@ -703,6 +749,7 @@ maintainance||maintenance
maintainence||maintenance
maintan||maintain
makeing||making
+mailformed||malformed
malplaced||misplaced
malplace||misplace
managable||manageable
@@ -710,6 +757,7 @@ managment||management
mangement||management
manoeuvering||maneuvering
mappping||mapping
+matchs||matches
mathimatical||mathematical
mathimatic||mathematic
mathimatics||mathematics
@@ -725,6 +773,7 @@ messsage||message
messsages||messages
micropone||microphone
microprocesspr||microprocessor
+migrateable||migratable
milliseonds||milliseconds
minium||minimum
minimam||minimum
@@ -741,6 +790,7 @@ missmatch||mismatch
miximum||maximum
mmnemonic||mnemonic
mnay||many
+modfiy||modify
modulues||modules
momery||memory
memomry||memory
@@ -777,6 +827,7 @@ notifed||notified
numebr||number
numner||number
obtaion||obtain
+obusing||abusing
occassionally||occasionally
occationally||occasionally
occurance||occurrence
@@ -787,6 +838,7 @@ occure||occurred
occured||occurred
occuring||occurring
offet||offset
+offloded||offloaded
omited||omitted
omiting||omitting
omitt||omit
@@ -829,6 +881,7 @@ parametes||parameters
parametised||parametrised
paramter||parameter
paramters||parameters
+parmaters||parameters
particuarly||particularly
particularily||particularly
partiton||partition
@@ -837,6 +890,7 @@ passin||passing
pathes||paths
pecularities||peculiarities
peformance||performance
+peforming||performing
peice||piece
pendantic||pedantic
peprocessor||preprocessor
@@ -846,6 +900,7 @@ peroid||period
persistance||persistence
persistant||persistent
plalform||platform
+platfoem||platform
platfrom||platform
plattform||platform
pleaes||please
@@ -858,6 +913,7 @@ posible||possible
positon||position
possibilites||possibilities
powerfull||powerful
+preamle||preamble
preample||preamble
preapre||prepare
preceeded||preceded
@@ -870,6 +926,7 @@ prefered||preferred
prefferably||preferably
premption||preemption
prepaired||prepared
+preperation||preparation
pressre||pressure
primative||primitive
princliple||principle
@@ -935,6 +992,7 @@ recommanded||recommended
recyle||recycle
redircet||redirect
redirectrion||redirection
+redundacy||redundancy
reename||rename
refcounf||refcount
refence||reference
@@ -945,6 +1003,7 @@ refernces||references
refernnce||reference
refrence||reference
registerd||registered
+registeration||registration
registeresd||registered
registerred||registered
registes||registers
@@ -973,7 +1032,9 @@ requirment||requirement
requred||required
requried||required
requst||request
+reregisteration||reregistration
reseting||resetting
+reseverd||reserved
resizeable||resizable
resouce||resource
resouces||resources
@@ -982,6 +1043,7 @@ responce||response
ressizes||resizes
ressource||resource
ressources||resources
+restesting||retesting
retransmited||retransmitted
retreived||retrieved
retreive||retrieve
@@ -1006,6 +1068,7 @@ sacrifying||sacrificing
safly||safely
safty||safety
savable||saveable
+scaleing||scaling
scaned||scanned
scaning||scanning
scarch||search
@@ -1014,6 +1077,8 @@ searchs||searches
secquence||sequence
secund||second
segement||segment
+semaphone||semaphore
+senario||scenario
senarios||scenarios
sentivite||sensitive
separatly||separately
@@ -1025,9 +1090,13 @@ seperate||separate
seperatly||separately
seperator||separator
sepperate||separate
+seqeunce||sequence
+seqeuncer||sequencer
+seqeuencer||sequencer
sequece||sequence
sequencial||sequential
serveral||several
+servive||service
setts||sets
settting||setting
shotdown||shutdown
@@ -1073,6 +1142,7 @@ standartization||standardization
standart||standard
staticly||statically
stoped||stopped
+stoping||stopping
stoppped||stopped
straming||streaming
struc||struct
@@ -1085,6 +1155,7 @@ subdirectoires||subdirectories
suble||subtle
substract||subtract
submition||submission
+suceed||succeed
succesfully||successfully
succesful||successful
successed||succeeded
@@ -1108,6 +1179,7 @@ surpressed||suppressed
surpresses||suppresses
susbsystem||subsystem
suspeneded||suspended
+suspsend||suspend
suspicously||suspiciously
swaping||swapping
switchs||switches
@@ -1122,6 +1194,7 @@ swtich||switch
symetric||symmetric
synax||syntax
synchonized||synchronized
+synchronuously||synchronously
syncronize||synchronize
syncronized||synchronized
syncronizing||synchronizing
@@ -1130,11 +1203,14 @@ syste||system
sytem||system
sythesis||synthesis
taht||that
+tansmit||transmit
targetted||targeted
targetting||targeting
+taskelt||tasklet
teh||the
temorary||temporary
temproarily||temporarily
+thead||thread
therfore||therefore
thier||their
threds||threads
@@ -1143,11 +1219,14 @@ thresold||threshold
throught||through
troughput||throughput
thses||these
+tiggers||triggers
tiggered||triggered
tipically||typically
+timeing||timing
timout||timeout
tmis||this
torerable||tolerable
+traking||tracking
tramsmitted||transmitted
tramsmit||transmit
tranasction||transaction
@@ -1162,6 +1241,7 @@ transormed||transformed
trasfer||transfer
trasmission||transmission
treshold||threshold
+trigerred||triggered
trigerring||triggering
trun||turn
tunning||tuning
@@ -1169,6 +1249,8 @@ ture||true
tyep||type
udpate||update
uesd||used
+uknown||unknown
+usupported||unsupported
uncommited||uncommitted
unconditionaly||unconditionally
underun||underrun
@@ -1181,11 +1263,14 @@ unexpeted||unexpected
unexpexted||unexpected
unfortunatelly||unfortunately
unifiy||unify
+uniterrupted||uninterrupted
unintialized||uninitialized
unkmown||unknown
unknonw||unknown
unknow||unknown
unkown||unknown
+unamed||unnamed
+uneeded||unneeded
unneded||unneeded
unneccecary||unnecessary
unneccesary||unnecessary
@@ -1210,6 +1295,7 @@ usefull||useful
usege||usage
usera||users
usualy||usually
+usupported||unsupported
utilites||utilities
utillities||utilities
utilties||utilities
@@ -1233,7 +1319,9 @@ virtaul||virtual
virtiual||virtual
visiters||visitors
vitual||virtual
+vunerable||vulnerable
wakeus||wakeups
+wathdog||watchdog
wating||waiting
wiat||wait
wether||whether
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index c316d4237d94..f0ae4b40add6 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -796,6 +796,7 @@ static void apparmor_bprm_committing_creds(struct linux_binprm *bprm)
aa_inherit_files(bprm->cred, current->files);
current->pdeath_signal = 0;
+ current->signal->pdeath_signal_proc = 0;
/* reset soft limits and set hard limits for the new label */
__aa_transition_rlimits(label, new_label);
diff --git a/security/security.c b/security/security.c
index 7503a1741c38..570405bf4155 100644
--- a/security/security.c
+++ b/security/security.c
@@ -48,14 +48,17 @@ static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] =
static void __init do_security_initcalls(void)
{
int ret;
- initcall_t *call;
- call = __security_initcall_start;
+ initcall_t call;
+ initcall_entry_t *ce;
+
+ ce = __security_initcall_start;
trace_initcall_level("security");
- while (call < __security_initcall_end) {
- trace_initcall_start((*call));
- ret = (*call) ();
- trace_initcall_finish((*call), ret);
- call++;
+ while (ce < __security_initcall_end) {
+ call = initcall_from_entry(ce);
+ trace_initcall_start(call);
+ ret = call();
+ trace_initcall_finish(call, ret);
+ ce++;
}
}
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4618762c764a..2b9f9433e5fa 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2656,6 +2656,7 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm)
/* Always clear parent death signal on SID transitions. */
current->pdeath_signal = 0;
+ current->signal->pdeath_signal_proc = 0;
/* Check whether the new SID can inherit resource limits from the old
* SID. If not, reset all soft limits to the lower of the current
diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore
index 74e5912e9f2e..82121a81681f 100644
--- a/tools/testing/selftests/proc/.gitignore
+++ b/tools/testing/selftests/proc/.gitignore
@@ -9,3 +9,5 @@
/proc-uptime-001
/proc-uptime-002
/read
+/self
+/thread-self
diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile
index db310eedc268..1c12c34cf85d 100644
--- a/tools/testing/selftests/proc/Makefile
+++ b/tools/testing/selftests/proc/Makefile
@@ -1,4 +1,5 @@
CFLAGS += -Wall -O2 -Wno-unused-function
+CFLAGS += -D_GNU_SOURCE
TEST_GEN_PROGS :=
TEST_GEN_PROGS += fd-001-lookup
@@ -12,5 +13,7 @@ TEST_GEN_PROGS += proc-self-wchan
TEST_GEN_PROGS += proc-uptime-001
TEST_GEN_PROGS += proc-uptime-002
TEST_GEN_PROGS += read
+TEST_GEN_PROGS += self
+TEST_GEN_PROGS += thread-self
include ../lib.mk
diff --git a/tools/testing/selftests/proc/proc.h b/tools/testing/selftests/proc/proc.h
index 4e178166fd84..b7d57ea40237 100644
--- a/tools/testing/selftests/proc/proc.h
+++ b/tools/testing/selftests/proc/proc.h
@@ -6,6 +6,18 @@
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+static inline pid_t sys_getpid(void)
+{
+ return syscall(SYS_getpid);
+}
+
+static inline pid_t sys_gettid(void)
+{
+ return syscall(SYS_gettid);
+}
static inline bool streq(const char *s1, const char *s2)
{
diff --git a/tools/testing/selftests/proc/self.c b/tools/testing/selftests/proc/self.c
new file mode 100644
index 000000000000..21c15a1ffefb
--- /dev/null
+++ b/tools/testing/selftests/proc/self.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+// Test that /proc/self gives correct TGID.
+#undef NDEBUG
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "proc.h"
+
+int main(void)
+{
+ char buf1[64], buf2[64];
+ pid_t pid;
+ ssize_t rv;
+
+ pid = sys_getpid();
+ snprintf(buf1, sizeof(buf1), "%u", pid);
+
+ rv = readlink("/proc/self", buf2, sizeof(buf2));
+ assert(rv == strlen(buf1));
+ buf2[rv] = '\0';
+ assert(streq(buf1, buf2));
+
+ return 0;
+}
diff --git a/tools/testing/selftests/proc/thread-self.c b/tools/testing/selftests/proc/thread-self.c
new file mode 100644
index 000000000000..4b23b39b7ae0
--- /dev/null
+++ b/tools/testing/selftests/proc/thread-self.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+// Test that /proc/thread-self gives correct TGID/PID.
+#undef NDEBUG
+#include <assert.h>
+#include <sched.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+
+#include "proc.h"
+
+int f(void *arg)
+{
+ char buf1[64], buf2[64];
+ pid_t pid, tid;
+ ssize_t rv;
+
+ pid = sys_getpid();
+ tid = sys_gettid();
+ snprintf(buf1, sizeof(buf1), "%u/task/%u", pid, tid);
+
+ rv = readlink("/proc/thread-self", buf2, sizeof(buf2));
+ assert(rv == strlen(buf1));
+ buf2[rv] = '\0';
+ assert(streq(buf1, buf2));
+
+ if (arg)
+ exit(0);
+ return 0;
+}
+
+int main(void)
+{
+ const int PAGE_SIZE = sysconf(_SC_PAGESIZE);
+ pid_t pid;
+ void *stack;
+
+ /* main thread */
+ f((void *)0);
+
+ stack = mmap(NULL, 2 * PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ assert(stack != MAP_FAILED);
+ /* side thread */
+ pid = clone(f, stack + PAGE_SIZE, CLONE_THREAD|CLONE_SIGHAND|CLONE_VM, (void *)1);
+ assert(pid > 0);
+ pause();
+
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 342c7bc9dc8c..af5ff83f6d7f 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -1,6 +1,7 @@
hugepage-mmap
hugepage-shm
map_hugetlb
+map_populate
thuge-gen
compaction_test
mlock2-tests
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index fdefa2295ddc..9881876d2aa0 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -12,6 +12,7 @@ TEST_GEN_FILES += gup_benchmark
TEST_GEN_FILES += hugepage-mmap
TEST_GEN_FILES += hugepage-shm
TEST_GEN_FILES += map_hugetlb
+TEST_GEN_FILES += map_populate
TEST_GEN_FILES += mlock-random-test
TEST_GEN_FILES += mlock2-tests
TEST_GEN_FILES += on-fault-limit
diff --git a/tools/testing/selftests/vm/map_populate.c b/tools/testing/selftests/vm/map_populate.c
new file mode 100644
index 000000000000..df3eacc8eecb
--- /dev/null
+++ b/tools/testing/selftests/vm/map_populate.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Dmitry Safonov, Arista Networks
+ *
+ * MAP_POPULATE | MAP_PRIVATE should COW VMA pages.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#ifndef MMAP_SZ
+#define MMAP_SZ 4096
+#endif
+
+#define BUG_ON(condition, description) \
+ do { \
+ if (condition) { \
+ fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \
+ __LINE__, (description), strerror(errno)); \
+ exit(1); \
+ } \
+ } while (0)
+
+static int parent_f(int sock, unsigned long *smap, int child)
+{
+ int status, ret;
+
+ ret = read(sock, &status, sizeof(int));
+ BUG_ON(ret <= 0, "read(sock)");
+
+ *smap = 0x22222BAD;
+ ret = msync(smap, MMAP_SZ, MS_SYNC);
+ BUG_ON(ret, "msync()");
+
+ ret = write(sock, &status, sizeof(int));
+ BUG_ON(ret <= 0, "write(sock)");
+
+ waitpid(child, &status, 0);
+ BUG_ON(!WIFEXITED(status), "child hasn't suicide by own will");
+
+ return WEXITSTATUS(status);
+}
+
+static int child_f(int sock, unsigned long *smap, int fd)
+{
+ int ret, buf = 0;
+
+ smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_POPULATE, fd, 0);
+ BUG_ON(smap == MAP_FAILED, "mmap()");
+
+ BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file");
+
+ ret = write(sock, &buf, sizeof(int));
+ BUG_ON(ret <= 0, "write(sock)");
+
+ ret = read(sock, &buf, sizeof(int));
+ BUG_ON(ret <= 0, "read(sock)");
+
+ BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page");
+ BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted");
+
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ int sock[2], child, ret;
+ FILE *ftmp;
+ unsigned long *smap;
+
+ ftmp = tmpfile();
+ BUG_ON(ftmp == 0, "tmpfile()");
+
+ ret = ftruncate(fileno(ftmp), MMAP_SZ);
+ BUG_ON(ret, "ftruncate()");
+
+ smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fileno(ftmp), 0);
+ BUG_ON(smap == MAP_FAILED, "mmap()");
+
+ *smap = 0xdeadbabe;
+ /* Probably unnecessary, but let it be. */
+ ret = msync(smap, MMAP_SZ, MS_SYNC);
+ BUG_ON(ret, "msync()");
+
+ ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock);
+ BUG_ON(ret, "socketpair()");
+
+ child = fork();
+ BUG_ON(child == -1, "fork()");
+
+ if (child) {
+ ret = close(sock[0]);
+ BUG_ON(ret, "close()");
+
+ return parent_f(sock[1], smap, child);
+ }
+
+ ret = close(sock[1]);
+ BUG_ON(ret, "close()");
+
+ return child_f(sock[0], smap, fileno(ftmp));
+}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index 88cbe5575f0c..584a91ae4a8f 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -168,6 +168,17 @@ else
fi
echo "--------------------"
+echo "running map_populate"
+echo "--------------------"
+./map_populate
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "--------------------"
echo "running mlock2-tests"
echo "--------------------"
./mlock2-tests
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index cce853dca691..30cb0a0713ff 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -75,8 +75,11 @@
#define KPF_BYTES 8
#define PROC_KPAGEFLAGS "/proc/kpageflags"
+#define PROC_KPAGECOUNT "/proc/kpagecount"
#define PROC_KPAGECGROUP "/proc/kpagecgroup"
+#define SYS_KERNEL_MM_PAGE_IDLE "/sys/kernel/mm/page_idle/bitmap"
+
/* [32-] kernel hacking assistances */
#define KPF_RESERVED 32
#define KPF_MLOCKED 33
@@ -168,11 +171,13 @@ static const char * const debugfs_known_mountpoints[] = {
static int opt_raw; /* for kernel developers */
static int opt_list; /* list pages (in ranges) */
+static int opt_mark_idle; /* set accessed bit */
static int opt_no_summary; /* don't show summary */
static pid_t opt_pid; /* process to walk */
const char *opt_file; /* file or directory path */
static uint64_t opt_cgroup; /* cgroup inode */
static int opt_list_cgroup;/* list page cgroup */
+static int opt_list_mapcnt;/* list page map count */
static const char *opt_kpageflags;/* kpageflags file to parse */
#define MAX_ADDR_RANGES 1024
@@ -194,7 +199,9 @@ static int page_size;
static int pagemap_fd;
static int kpageflags_fd;
+static int kpagecount_fd = -1;
static int kpagecgroup_fd = -1;
+static int page_idle_fd = -1;
static int opt_hwpoison;
static int opt_unpoison;
@@ -298,6 +305,15 @@ static unsigned long kpagecgroup_read(uint64_t *buf,
return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages);
}
+static unsigned long kpagecount_read(uint64_t *buf,
+ unsigned long index,
+ unsigned long pages)
+{
+ return kpagecount_fd < 0 ? pages :
+ do_u64_read(kpagecount_fd, PROC_KPAGECOUNT,
+ buf, index, pages);
+}
+
static unsigned long pagemap_read(uint64_t *buf,
unsigned long index,
unsigned long pages)
@@ -370,16 +386,18 @@ static char *page_flag_longname(uint64_t flags)
*/
static void show_page_range(unsigned long voffset, unsigned long offset,
- unsigned long size, uint64_t flags, uint64_t cgroup)
+ unsigned long size, uint64_t flags,
+ uint64_t cgroup, uint64_t mapcnt)
{
static uint64_t flags0;
static uint64_t cgroup0;
+ static uint64_t mapcnt0;
static unsigned long voff;
static unsigned long index;
static unsigned long count;
- if (flags == flags0 && cgroup == cgroup0 && offset == index + count &&
- size && voffset == voff + count) {
+ if (flags == flags0 && cgroup == cgroup0 && mapcnt == mapcnt0 &&
+ offset == index + count && size && voffset == voff + count) {
count += size;
return;
}
@@ -391,12 +409,15 @@ static void show_page_range(unsigned long voffset, unsigned long offset,
printf("%lu\t", voff);
if (opt_list_cgroup)
printf("@%llu\t", (unsigned long long)cgroup0);
+ if (opt_list_mapcnt)
+ printf("%lu\t", mapcnt0);
printf("%lx\t%lx\t%s\n",
index, count, page_flag_name(flags0));
}
flags0 = flags;
- cgroup0= cgroup;
+ cgroup0 = cgroup;
+ mapcnt0 = mapcnt;
index = offset;
voff = voffset;
count = size;
@@ -404,11 +425,11 @@ static void show_page_range(unsigned long voffset, unsigned long offset,
static void flush_page_range(void)
{
- show_page_range(0, 0, 0, 0, 0);
+ show_page_range(0, 0, 0, 0, 0, 0);
}
static void show_page(unsigned long voffset, unsigned long offset,
- uint64_t flags, uint64_t cgroup)
+ uint64_t flags, uint64_t cgroup, uint64_t mapcnt)
{
if (opt_pid)
printf("%lx\t", voffset);
@@ -416,6 +437,9 @@ static void show_page(unsigned long voffset, unsigned long offset,
printf("%lu\t", voffset);
if (opt_list_cgroup)
printf("@%llu\t", (unsigned long long)cgroup);
+ if (opt_list_mapcnt)
+ printf("%lu\t", mapcnt);
+
printf("%lx\t%s\n", offset, page_flag_name(flags));
}
@@ -567,6 +591,30 @@ static int unpoison_page(unsigned long offset)
return 0;
}
+static int mark_page_idle(unsigned long offset)
+{
+ static unsigned long off;
+ static uint64_t buf;
+ int len;
+
+ if ((offset / 64 == off / 64) || buf == 0) {
+ buf |= 1UL << (offset % 64);
+ off = offset;
+ return 0;
+ }
+
+ len = pwrite(page_idle_fd, &buf, 8, 8 * (off / 64));
+ if (len < 0) {
+ perror("mark page idle");
+ return len;
+ }
+
+ buf = 1UL << (offset % 64);
+ off = offset;
+
+ return 0;
+}
+
/*
* page frame walker
*/
@@ -599,7 +647,8 @@ static size_t hash_slot(uint64_t flags)
}
static void add_page(unsigned long voffset, unsigned long offset,
- uint64_t flags, uint64_t cgroup, uint64_t pme)
+ uint64_t flags, uint64_t cgroup, uint64_t mapcnt,
+ uint64_t pme)
{
flags = kpageflags_flags(flags, pme);
@@ -614,10 +663,13 @@ static void add_page(unsigned long voffset, unsigned long offset,
if (opt_unpoison)
unpoison_page(offset);
+ if (opt_mark_idle)
+ mark_page_idle(offset);
+
if (opt_list == 1)
- show_page_range(voffset, offset, 1, flags, cgroup);
+ show_page_range(voffset, offset, 1, flags, cgroup, mapcnt);
else if (opt_list == 2)
- show_page(voffset, offset, flags, cgroup);
+ show_page(voffset, offset, flags, cgroup, mapcnt);
nr_pages[hash_slot(flags)]++;
total_pages++;
@@ -631,6 +683,7 @@ static void walk_pfn(unsigned long voffset,
{
uint64_t buf[KPAGEFLAGS_BATCH];
uint64_t cgi[KPAGEFLAGS_BATCH];
+ uint64_t cnt[KPAGEFLAGS_BATCH];
unsigned long batch;
unsigned long pages;
unsigned long i;
@@ -654,8 +707,12 @@ static void walk_pfn(unsigned long voffset,
if (kpagecgroup_read(cgi, index, pages) != pages)
fatal("kpagecgroup returned fewer pages than expected");
+ if (kpagecount_read(cnt, index, batch) != pages)
+ fatal("kpagecount returned fewer pages than expected");
+
for (i = 0; i < pages; i++)
- add_page(voffset + i, index + i, buf[i], cgi[i], pme);
+ add_page(voffset + i, index + i,
+ buf[i], cgi[i], cnt[i], pme);
index += pages;
count -= pages;
@@ -673,9 +730,10 @@ static void walk_swap(unsigned long voffset, uint64_t pme)
return;
if (opt_list == 1)
- show_page_range(voffset, pagemap_swap_offset(pme), 1, flags, 0);
+ show_page_range(voffset, pagemap_swap_offset(pme),
+ 1, flags, 0, 0);
else if (opt_list == 2)
- show_page(voffset, pagemap_swap_offset(pme), flags, 0);
+ show_page(voffset, pagemap_swap_offset(pme), flags, 0, 0);
nr_pages[hash_slot(flags)]++;
total_pages++;
@@ -756,6 +814,9 @@ static void walk_addr_ranges(void)
else
walk_task(opt_offset[i], opt_size[i]);
+ if (opt_mark_idle)
+ mark_page_idle(0);
+
close(kpageflags_fd);
}
@@ -786,9 +847,11 @@ static void usage(void)
" -c|--cgroup path|@inode Walk pages within memory cgroup\n"
" -p|--pid pid Walk process address space\n"
" -f|--file filename Walk file address space\n"
+" -i|--mark-idle Mark pages idle\n"
" -l|--list Show page details in ranges\n"
" -L|--list-each Show page details one by one\n"
" -C|--list-cgroup Show cgroup inode for pages\n"
+" -M|--list-mapcnt Show page map count\n"
" -N|--no-summary Don't show summary info\n"
" -X|--hwpoison hwpoison pages\n"
" -x|--unpoison unpoison pages\n"
@@ -925,6 +988,7 @@ static void walk_file(const char *name, const struct stat *st)
uint8_t vec[PAGEMAP_BATCH];
uint64_t buf[PAGEMAP_BATCH], flags;
uint64_t cgroup = 0;
+ uint64_t mapcnt = 0;
unsigned long nr_pages, pfn, i;
off_t off, end = st->st_size;
int fd;
@@ -984,13 +1048,15 @@ got_sigbus:
continue;
if (!kpagecgroup_read(&cgroup, pfn, 1))
fatal("kpagecgroup_read failed");
+ if (!kpagecount_read(&mapcnt, pfn, 1))
+ fatal("kpagecount_read failed");
if (first && opt_list) {
first = 0;
flush_page_range();
show_file(name, st);
}
add_page(off / page_size + i, pfn,
- flags, cgroup, buf[i]);
+ flags, cgroup, mapcnt, buf[i]);
}
}
@@ -1190,9 +1256,11 @@ static const struct option opts[] = {
{ "bits" , 1, NULL, 'b' },
{ "cgroup" , 1, NULL, 'c' },
{ "describe" , 1, NULL, 'd' },
+ { "mark-idle" , 0, NULL, 'i' },
{ "list" , 0, NULL, 'l' },
{ "list-each" , 0, NULL, 'L' },
{ "list-cgroup", 0, NULL, 'C' },
+ { "list-mapcnt", 0, NULL, 'M' },
{ "no-summary", 0, NULL, 'N' },
{ "hwpoison" , 0, NULL, 'X' },
{ "unpoison" , 0, NULL, 'x' },
@@ -1208,7 +1276,8 @@ int main(int argc, char *argv[])
page_size = getpagesize();
while ((c = getopt_long(argc, argv,
- "rp:f:a:b:d:c:ClLNXxF:h", opts, NULL)) != -1) {
+ "rp:f:a:b:d:c:CilLMNXxF:h",
+ opts, NULL)) != -1) {
switch (c) {
case 'r':
opt_raw = 1;
@@ -1234,12 +1303,18 @@ int main(int argc, char *argv[])
case 'd':
describe_flags(optarg);
exit(0);
+ case 'i':
+ opt_mark_idle = 1;
+ break;
case 'l':
opt_list = 1;
break;
case 'L':
opt_list = 2;
break;
+ case 'M':
+ opt_list_mapcnt = 1;
+ break;
case 'N':
opt_no_summary = 1;
break;
@@ -1269,12 +1344,21 @@ int main(int argc, char *argv[])
if (opt_cgroup || opt_list_cgroup)
kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY);
+ if (opt_list && opt_list_mapcnt)
+ kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY);
+
+ if (opt_mark_idle && opt_file)
+ page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR);
+
if (opt_list && opt_pid)
printf("voffset\t");
if (opt_list && opt_file)
printf("foffset\t");
if (opt_list && opt_list_cgroup)
printf("cgroup\t");
+ if (opt_list && opt_list_mapcnt)
+ printf("map-cnt\t");
+
if (opt_list == 1)
printf("offset\tlen\tflags\n");
if (opt_list == 2)
@@ -1296,5 +1380,11 @@ int main(int argc, char *argv[])
show_summary();
+ if (opt_list_mapcnt)
+ close(kpagecount_fd);
+
+ if (page_idle_fd >= 0)
+ close(page_idle_fd);
+
return 0;
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0df592c4f09f..f986e31fa68c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -140,9 +140,10 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;
-__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
- unsigned long start, unsigned long end)
+__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ unsigned long start, unsigned long end, bool blockable)
{
+ return 0;
}
bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
@@ -360,13 +361,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
srcu_read_unlock(&kvm->srcu, idx);
}
-static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool blockable)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int need_tlb_flush = 0, idx;
+ int ret;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
@@ -384,9 +387,11 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
spin_unlock(&kvm->mmu_lock);
- kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
+ ret = kvm_arch_mmu_notifier_invalidate_range(kvm, start, end, blockable);
srcu_read_unlock(&kvm->srcu, idx);
+
+ return ret;
}
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,