summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2021-07-21 16:31:13 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2021-07-21 16:31:13 +1000
commit43d1a8e000c2651686da00e5a81d32090e7ec8ed (patch)
treec2d125ac28002bd9663d8133aed22db82e2d432a
parentb4e39e59e64e0cf011d0770798ffece3d6a76277 (diff)
parentace6e27b90197751ad3e6baadae3b8b065fd4e2d (diff)
downloadlinux-next-43d1a8e000c2651686da00e5a81d32090e7ec8ed.tar.gz
Merge branch 'akpm-current/current'
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm-numa24
-rw-r--r--Documentation/admin-guide/mm/memory-hotplug.rst800
-rw-r--r--Documentation/admin-guide/mm/numa_memory_policy.rst16
-rw-r--r--Documentation/arm64/tagged-address-abi.rst26
-rw-r--r--Documentation/core-api/cachetlb.rst86
-rw-r--r--Documentation/dev-tools/kasan.rst13
-rw-r--r--Documentation/dev-tools/kfence.rst98
-rw-r--r--Documentation/translations/zh_CN/core-api/cachetlb.rst9
-rw-r--r--arch/Kconfig30
-rw-r--r--arch/arm/include/asm/cacheflush.h4
-rw-r--r--arch/arm/mach-rpc/ecard.c2
-rw-r--r--arch/arm/mm/flush.c33
-rw-r--r--arch/arm/mm/nommu.c6
-rw-r--r--arch/arm64/mm/mmu.c3
-rw-r--r--arch/csky/abiv1/cacheflush.c11
-rw-r--r--arch/csky/abiv1/inc/abi/cacheflush.h4
-rw-r--r--arch/csky/kernel/probes/kprobes.c3
-rw-r--r--arch/ia64/mm/init.c3
-rw-r--r--arch/microblaze/include/asm/pgtable.h2
-rw-r--r--arch/microblaze/mm/init.c12
-rw-r--r--arch/microblaze/mm/pgtable.c17
-rw-r--r--arch/mips/include/asm/cacheflush.h8
-rw-r--r--arch/nds32/include/asm/cacheflush.h3
-rw-r--r--arch/nds32/mm/cacheflush.c9
-rw-r--r--arch/parisc/include/asm/cacheflush.h8
-rw-r--r--arch/parisc/kernel/cache.c3
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/kernel/smp.c2
-rw-r--r--arch/powerpc/mm/book3s64/radix_tlb.c4
-rw-r--r--arch/powerpc/mm/mem.c3
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-memory.c13
-rw-r--r--arch/s390/mm/init.c3
-rw-r--r--arch/sh/include/asm/cacheflush.h8
-rw-r--r--arch/sh/mm/init.c3
-rw-r--r--arch/x86/mm/init_32.c3
-rw-r--r--arch/x86/mm/init_64.c3
-rw-r--r--block/blk-map.c2
-rw-r--r--drivers/acpi/acpi_memhotplug.c11
-rw-r--r--drivers/base/memory.c4
-rw-r--r--drivers/base/node.c2
-rw-r--r--drivers/block/ps3disk.c2
-rw-r--r--drivers/dax/kmem.c3
-rw-r--r--drivers/mmc/host/jz4740_mmc.c4
-rw-r--r--drivers/mmc/host/mmc_spi.c2
-rw-r--r--drivers/of/kexec.c1
-rw-r--r--drivers/virtio/virtio_mem.c4
-rw-r--r--fs/buffer.c56
-rw-r--r--fs/drop_caches.c3
-rw-r--r--fs/exec.c10
-rw-r--r--fs/fs-writeback.c11
-rw-r--r--fs/fs_context.c4
-rw-r--r--fs/hfsplus/catalog.c16
-rw-r--r--fs/hfsplus/dir.c4
-rw-r--r--fs/hfsplus/hfsplus_raw.h12
-rw-r--r--fs/hfsplus/xattr.c18
-rw-r--r--fs/inode.c48
-rw-r--r--fs/internal.h1
-rw-r--r--fs/namei.c8
-rw-r--r--fs/nilfs2/sysfs.c26
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/namei.c17
-rw-r--r--fs/proc/base.c5
-rw-r--r--fs/proc/fd.c65
-rw-r--r--fs/proc/kcore.c73
-rw-r--r--fs/proc/page.c40
-rw-r--r--fs/userfaultfd.c26
-rw-r--r--include/linux/backing-dev-defs.h2
-rw-r--r--include/linux/backing-dev.h19
-rw-r--r--include/linux/fs.h1
-rw-r--r--include/linux/highmem.h39
-rw-r--r--include/linux/memblock.h4
-rw-r--r--include/linux/memcontrol.h75
-rw-r--r--include/linux/memory.h2
-rw-r--r--include/linux/memory_hotplug.h17
-rw-r--r--include/linux/mempolicy.h9
-rw-r--r--include/linux/memremap.h6
-rw-r--r--include/linux/migrate.h14
-rw-r--r--include/linux/mm.h32
-rw-r--r--include/linux/mm_types.h2
-rw-r--r--include/linux/mm_types_task.h5
-rw-r--r--include/linux/mmzone.h15
-rw-r--r--include/linux/pagemap.h50
-rw-r--r--include/linux/rmap.h8
-rw-r--r--include/linux/sched.h8
-rw-r--r--include/linux/sched/mm.h21
-rw-r--r--include/linux/slab.h32
-rw-r--r--include/linux/swap.h28
-rw-r--r--include/linux/threads.h2
-rw-r--r--include/linux/vm_event_item.h2
-rw-r--r--include/linux/vmpressure.h2
-rw-r--r--include/linux/writeback.h2
-rw-r--r--include/trace/events/migrate.h3
-rw-r--r--include/uapi/linux/mempolicy.h1
-rw-r--r--init/main.c25
-rw-r--r--ipc/shm.c10
-rw-r--r--ipc/util.c6
-rw-r--r--kernel/cpu.c2
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c51
-rw-r--r--kernel/hung_task.c44
-rw-r--r--kernel/kthread.c21
-rw-r--r--kernel/pid_namespace.c3
-rw-r--r--kernel/sched/core.c35
-rw-r--r--kernel/sched/sched.h4
-rw-r--r--kernel/sys.c7
-rw-r--r--kernel/sysctl.c8
-rw-r--r--lib/Kconfig3
-rw-r--r--lib/Kconfig.debug17
-rw-r--r--lib/math/Kconfig2
-rw-r--r--lib/math/rational.c3
-rw-r--r--lib/scatterlist.c5
-rw-r--r--lib/string.c130
-rw-r--r--lib/test_sort.c40
-rw-r--r--lib/test_vmalloc.c5
-rw-r--r--mm/Kconfig5
-rw-r--r--mm/backing-dev.c13
-rw-r--r--mm/compaction.c22
-rw-r--r--mm/debug_vm_pgtable.c849
-rw-r--r--mm/filemap.c23
-rw-r--r--mm/gup.c2
-rw-r--r--mm/huge_memory.c26
-rw-r--r--mm/hugetlb.c164
-rw-r--r--mm/internal.h9
-rw-r--r--mm/kasan/hw_tags.c43
-rw-r--r--mm/kasan/kasan.h1
-rw-r--r--mm/kasan/report.c29
-rw-r--r--mm/kfence/core.c22
-rw-r--r--mm/kfence/kfence.h2
-rw-r--r--mm/kfence/kfence_test.c2
-rw-r--r--mm/kfence/report.c19
-rw-r--r--mm/memblock.c23
-rw-r--r--mm/memcontrol.c186
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c2
-rw-r--r--mm/memory_hotplug.c51
-rw-r--r--mm/mempolicy.c152
-rw-r--r--mm/memremap.c25
-rw-r--r--mm/migrate.c315
-rw-r--r--mm/mmap_lock.c4
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/oom_kill.c7
-rw-r--r--mm/page-writeback.c121
-rw-r--r--mm/page_alloc.c95
-rw-r--r--mm/page_isolation.c7
-rw-r--r--mm/page_owner.c14
-rw-r--r--mm/rmap.c14
-rw-r--r--mm/secretmem.c1
-rw-r--r--mm/slub.c6
-rw-r--r--mm/sparse.c43
-rw-r--r--mm/swapfile.c8
-rw-r--r--mm/truncate.c47
-rw-r--r--mm/vmalloc.c79
-rw-r--r--mm/vmpressure.c10
-rw-r--r--mm/vmscan.c206
-rw-r--r--mm/vmstat.c25
-rw-r--r--mm/workingset.c10
-rw-r--r--mm/zsmalloc.c10
-rw-r--r--tools/testing/scatterlist/linux/mm.h1
-rw-r--r--tools/testing/selftests/memfd/memfd_test.c2
-rw-r--r--tools/testing/selftests/vm/.gitignore1
-rw-r--r--tools/testing/selftests/vm/Makefile3
-rw-r--r--tools/testing/selftests/vm/ksm_tests.c516
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests.sh96
-rw-r--r--tools/testing/selftests/vm/userfaultfd.c6
164 files changed, 3993 insertions, 1908 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-numa b/Documentation/ABI/testing/sysfs-kernel-mm-numa
new file mode 100644
index 000000000000..77e559d4ed80
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-numa
@@ -0,0 +1,24 @@
+What: /sys/kernel/mm/numa/
+Date: June 2021
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Interface for NUMA
+
+What: /sys/kernel/mm/numa/demotion_enabled
+Date: June 2021
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Enable/disable demoting pages during reclaim
+
+ Page migration during reclaim is intended for systems
+ with tiered memory configurations. These systems have
+ multiple types of memory with varied performance
+ characteristics instead of plain NUMA systems where
+ the same kind of memory is found at varied distances.
+ Allowing page migration during reclaim enables these
+ systems to migrate pages from fast tiers to slow tiers
+ when the fast tier is under pressure. This migration
+ is performed before swap. It may move data to a NUMA
+ node that does not fall into the cpuset of the
+ allocating process which might be construed to violate
+ the guarantees of cpusets. This should not be enabled
+ on systems which need strict cpuset location
+ guarantees.
diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
index c6bae2d77160..03dfbc925252 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -1,466 +1,576 @@
.. _admin_guide_memory_hotplug:
-==============
-Memory Hotplug
-==============
+==================
+Memory Hot(Un)Plug
+==================
-:Created: Jul 28 2007
-:Updated: Add some details about locking internals: Aug 20 2018
-
-This document is about memory hotplug including how-to-use and current status.
-Because Memory Hotplug is still under development, contents of this text will
-be changed often.
+This document describes generic Linux support for memory hot(un)plug with
+a focus on System RAM, including ZONE_MOVABLE support.
.. contents:: :local:
-.. note::
+Introduction
+============
- (1) x86_64's has special implementation for memory hotplug.
- This text does not describe it.
- (2) This text assumes that sysfs is mounted at ``/sys``.
+Memory hot(un)plug allows for increasing and decreasing the size of physical
+memory available to a machine at runtime. In the simplest case, it consists of
+physically plugging or unplugging a DIMM at runtime, coordinated with the
+operating system.
+Memory hot(un)plug is used for various purposes:
-Introduction
-============
+- The physical memory available to a machine can be adjusted at runtime, up- or
+ downgrading the memory capacity. This dynamic memory resizing, sometimes
+ referred to as "capacity on demand", is frequently used with virtual machines
+ and logical partitions.
+
+- Replacing hardware, such as DIMMs or whole NUMA nodes, without downtime. One
+ example is replacing failing memory modules.
-Purpose of memory hotplug
--------------------------
+- Reducing energy consumption either by physically unplugging memory modules or
+ by logically unplugging (parts of) memory modules from Linux.
-Memory Hotplug allows users to increase/decrease the amount of memory.
-Generally, there are two purposes.
+Further, the basic memory hot(un)plug infrastructure in Linux is nowadays also
+used to expose persistent memory, other performance-differentiated memory and
+reserved memory regions as ordinary system RAM to Linux.
-(A) For changing the amount of memory.
- This is to allow a feature like capacity on demand.
-(B) For installing/removing DIMMs or NUMA-nodes physically.
- This is to exchange DIMMs/NUMA-nodes, reduce power consumption, etc.
+Linux only supports memory hot(un)plug on selected 64 bit architectures, such as
+x86_64, arm64, ppc64, s390x and ia64.
-(A) is required by highly virtualized environments and (B) is required by
-hardware which supports memory power management.
+Memory Hot(Un)Plug Granularity
+------------------------------
-Linux memory hotplug is designed for both purpose.
+Memory hot(un)plug in Linux uses the SPARSEMEM memory model, which divides the
+physical memory address space into chunks of the same size: memory sections. The
+size of a memory section is architecture dependent. For example, x86_64 uses
+128 MiB and ppc64 uses 16 MiB.
-Phases of memory hotplug
+Memory sections are combined into chunks referred to as "memory blocks". The
+size of a memory block is architecture dependent and corresponds to the smallest
+granularity that can be hot(un)plugged. The default size of a memory block is
+the same as memory section size, unless an architecture specifies otherwise.
+
+All memory blocks have the same size.
+
+Phases of Memory Hotplug
------------------------
-There are 2 phases in Memory Hotplug:
+Memory hotplug consists of two phases:
- 1) Physical Memory Hotplug phase
- 2) Logical Memory Hotplug phase.
+(1) Adding the memory to Linux
+(2) Onlining memory blocks
-The First phase is to communicate hardware/firmware and make/erase
-environment for hotplugged memory. Basically, this phase is necessary
-for the purpose (B), but this is good phase for communication between
-highly virtualized environments too.
+In the first phase, metadata, such as the memory map ("memmap") and page tables
+for the direct mapping, is allocated and initialized, and memory blocks are
+created; the latter also creates sysfs files for managing newly created memory
+blocks.
-When memory is hotplugged, the kernel recognizes new memory, makes new memory
-management tables, and makes sysfs files for new memory's operation.
+In the second phase, added memory is exposed to the page allocator. After this
+phase, the memory is visible in memory statistics, such as free and total
+memory, of the system.
-If firmware supports notification of connection of new memory to OS,
-this phase is triggered automatically. ACPI can notify this event. If not,
-"probe" operation by system administration is used instead.
-(see :ref:`memory_hotplug_physical_mem`).
+Phases of Memory Hotunplug
+--------------------------
-Logical Memory Hotplug phase is to change memory state into
-available/unavailable for users. Amount of memory from user's view is
-changed by this phase. The kernel makes all memory in it as free pages
-when a memory range is available.
+Memory hotunplug consists of two phases:
-In this document, this phase is described as online/offline.
+(1) Offlining memory blocks
+(2) Removing the memory from Linux
-Logical Memory Hotplug phase is triggered by write of sysfs file by system
-administrator. For the hot-add case, it must be executed after Physical Hotplug
-phase by hand.
-(However, if you writes udev's hotplug scripts for memory hotplug, these
-phases can be execute in seamless way.)
+In the fist phase, memory is "hidden" from the page allocator again, for
+example, by migrating busy memory to other memory locations and removing all
+relevant free pages from the page allocator After this phase, the memory is no
+longer visible in memory statistics of the system.
-Unit of Memory online/offline operation
----------------------------------------
+In the second phase, the memory blocks are removed and metadata is freed.
-Memory hotplug uses SPARSEMEM memory model which allows memory to be divided
-into chunks of the same size. These chunks are called "sections". The size of
-a memory section is architecture dependent. For example, power uses 16MiB, ia64
-uses 1GiB.
+Memory Hotplug Notifications
+============================
-Memory sections are combined into chunks referred to as "memory blocks". The
-size of a memory block is architecture dependent and represents the logical
-unit upon which memory online/offline operations are to be performed. The
-default size of a memory block is the same as memory section size unless an
-architecture specifies otherwise. (see :ref:`memory_hotplug_sysfs_files`.)
+There are various ways how Linux is notified about memory hotplug events such
+that it can start adding hotplugged memory. This description is limited to
+systems that support ACPI; mechanisms specific to other firmware interfaces or
+virtual machines are not described.
-To determine the size (in bytes) of a memory block please read this file::
+ACPI Notifications
+------------------
- /sys/devices/system/memory/block_size_bytes
+Platforms that support ACPI, such as x86_64, can support memory hotplug
+notifications via ACPI.
-Kernel Configuration
-====================
+In general, a firmware supporting memory hotplug defines a memory class object
+HID "PNP0C80". When notified about hotplug of a new memory device, the ACPI
+driver will hotplug the memory to Linux.
-To use memory hotplug feature, kernel must be compiled with following
-config options.
+If the firmware supports hotplug of NUMA nodes, it defines an object _HID
+"ACPI0004", "PNP0A05", or "PNP0A06". When notified about an hotplug event, all
+assigned memory devices are added to Linux by the ACPI driver.
-- For all memory hotplug:
- - Memory model -> Sparse Memory (``CONFIG_SPARSEMEM``)
- - Allow for memory hot-add (``CONFIG_MEMORY_HOTPLUG``)
+Similarly, Linux can be notified about requests to hotunplug a memory device or
+a NUMA node via ACPI. The ACPI driver will try offlining all relevant memory
+blocks, and, if successful, hotunplug the memory from Linux.
-- To enable memory removal, the following are also necessary:
- - Allow for memory hot remove (``CONFIG_MEMORY_HOTREMOVE``)
- - Page Migration (``CONFIG_MIGRATION``)
+Manual Probing
+--------------
-- For ACPI memory hotplug, the following are also necessary:
- - Memory hotplug (under ACPI Support menu) (``CONFIG_ACPI_HOTPLUG_MEMORY``)
- - This option can be kernel module.
+On some architectures, the firmware may not be able to notify the operating
+system about a memory hotplug event. Instead, the memory has to be manually
+probed from user space.
-- As a related configuration, if your box has a feature of NUMA-node hotplug
- via ACPI, then this option is necessary too.
+The probe interface is located at::
- - ACPI0004,PNP0A05 and PNP0A06 Container Driver (under ACPI Support menu)
- (``CONFIG_ACPI_CONTAINER``).
+ /sys/devices/system/memory/probe
- This option can be kernel module too.
+Only complete memory blocks can be probed. Individual memory blocks are probed
+by providing the physical start address of the memory block::
+ % echo addr > /sys/devices/system/memory/probe
-.. _memory_hotplug_sysfs_files:
+Which results in a memory block for the range [addr, addr + memory_block_size)
+being created.
-sysfs files for memory hotplug
-==============================
+.. note::
-All memory blocks have their device information in sysfs. Each memory block
-is described under ``/sys/devices/system/memory`` as::
+ Using the probe interface is discouraged as it is easy to crash the kernel,
+ because Linux cannot validate user input; this interface might be removed in
+ the future.
- /sys/devices/system/memory/memoryXXX
+Onlining and Offlining Memory Blocks
+====================================
-where XXX is the memory block id.
+After a memory block has been created, Linux has to be instructed to actually
+make use of that memory: the memory block has to be "online".
-For the memory block covered by the sysfs directory. It is expected that all
-memory sections in this range are present and no memory holes exist in the
-range. Currently there is no way to determine if there is a memory hole, but
-the existence of one should not affect the hotplug capabilities of the memory
-block.
+Before a memory block can be removed, Linux has to stop using any memory part of
+the memory block: the memory block has to be "offlined".
-For example, assume 1GiB memory block size. A device for a memory starting at
-0x100000000 is ``/sys/device/system/memory/memory4``::
+The Linux kernel can be configured to automatically online added memory blocks
+and drivers automatically trigger offlining of memory blocks when trying
+hotunplug of memory. Memory blocks can only be removed once offlining succeeded
+and drivers may trigger offlining of memory blocks when attempting hotunplug of
+memory.
- (0x100000000 / 1Gib = 4)
+Onlining Memory Blocks Manually
+-------------------------------
-This device covers address range [0x100000000 ... 0x140000000)
+If auto-onlining of memory blocks isn't enabled, user-space has to manually
+trigger onlining of memory blocks. Often, udev rules are used to automate this
+task in user space.
-Under each memory block, you can see 5 files:
+Onlining of a memory block can be triggered via::
-- ``/sys/devices/system/memory/memoryXXX/phys_index``
-- ``/sys/devices/system/memory/memoryXXX/phys_device``
-- ``/sys/devices/system/memory/memoryXXX/state``
-- ``/sys/devices/system/memory/memoryXXX/removable``
-- ``/sys/devices/system/memory/memoryXXX/valid_zones``
+ % echo online > /sys/devices/system/memory/memoryXXX/state
-=================== ============================================================
-``phys_index`` read-only and contains memory block id, same as XXX.
-``state`` read-write
+Or alternatively::
- - at read: contains online/offline state of memory.
- - at write: user can specify "online_kernel",
+ % echo 1 > /sys/devices/system/memory/memoryXXX/online
- "online_movable", "online", "offline" command
- which will be performed on all sections in the block.
-``phys_device`` read-only: legacy interface only ever used on s390x to
- expose the covered storage increment.
-``removable`` read-only: legacy interface that indicated whether a memory
- block was likely to be offlineable or not. Newer kernel
- versions return "1" if and only if the kernel supports
- memory offlining.
-``valid_zones`` read-only: designed to show by which zone memory provided by
- a memory block is managed, and to show by which zone memory
- provided by an offline memory block could be managed when
- onlining.
-
- The first column shows it`s default zone.
-
- "memory6/valid_zones: Normal Movable" shows this memoryblock
- can be onlined to ZONE_NORMAL by default and to ZONE_MOVABLE
- by online_movable.
-
- "memory7/valid_zones: Movable Normal" shows this memoryblock
- can be onlined to ZONE_MOVABLE by default and to ZONE_NORMAL
- by online_kernel.
-=================== ============================================================
+The kernel will select the target zone automatically, usually defaulting to
+``ZONE_NORMAL`` unless ``movablecore=1`` has been specified on the kernel
+command line or if the memory block would intersect the ZONE_MOVABLE already.
-.. note::
+One can explicitly request to associate an offline memory block with
+ZONE_MOVABLE by::
- These directories/files appear after physical memory hotplug phase.
+ % echo online_movable > /sys/devices/system/memory/memoryXXX/state
-If CONFIG_NUMA is enabled the memoryXXX/ directories can also be accessed
-via symbolic links located in the ``/sys/devices/system/node/node*`` directories.
+Or one can explicitly request a kernel zone (usually ZONE_NORMAL) by::
-For example::
+ % echo online_kernel > /sys/devices/system/memory/memoryXXX/state
- /sys/devices/system/node/node0/memory9 -> ../../memory/memory9
+In any case, if onlining succeeds, the state of the memory block is changed to
+be "online". If it fails, the state of the memory block will remain unchanged
+and the above commands will fail.
-A backlink will also be created::
+Onlining Memory Blocks Automatically
+------------------------------------
- /sys/devices/system/memory/memory9/node0 -> ../../node/node0
+The kernel can be configured to try auto-onlining of newly added memory blocks.
+If this feature is disabled, the memory blocks will stay offline until
+explicitly onlined from user space.
-.. _memory_hotplug_physical_mem:
+The configured auto-online behavior can be observed via::
-Physical memory hot-add phase
-=============================
+ % cat /sys/devices/system/memory/auto_online_blocks
-Hardware(Firmware) Support
---------------------------
+Auto-onlining can be enabled by writing ``online``, ``online_kernel`` or
+``online_movable`` to that file, like::
-On x86_64/ia64 platform, memory hotplug by ACPI is supported.
+ % echo online > /sys/devices/system/memory/auto_online_blocks
-In general, the firmware (ACPI) which supports memory hotplug defines
-memory class object of _HID "PNP0C80". When a notify is asserted to PNP0C80,
-Linux's ACPI handler does hot-add memory to the system and calls a hotplug udev
-script. This will be done automatically.
+Modifying the auto-online behavior will only affect all subsequently added
+memory blocks only.
-But scripts for memory hotplug are not contained in generic udev package(now).
-You may have to write it by yourself or online/offline memory by hand.
-Please see :ref:`memory_hotplug_how_to_online_memory` and
-:ref:`memory_hotplug_how_to_offline_memory`.
+.. note::
-If firmware supports NUMA-node hotplug, and defines an object _HID "ACPI0004",
-"PNP0A05", or "PNP0A06", notification is asserted to it, and ACPI handler
-calls hotplug code for all of objects which are defined in it.
-If memory device is found, memory hotplug code will be called.
+ In corner cases, auto-onlining can fail. The kernel won't retry. Note that
+ auto-onlining is not expected to fail in default configurations.
-Notify memory hot-add event by hand
------------------------------------
+.. note::
-On some architectures, the firmware may not notify the kernel of a memory
-hotplug event. Therefore, the memory "probe" interface is supported to
-explicitly notify the kernel. This interface depends on
-CONFIG_ARCH_MEMORY_PROBE and can be configured on powerpc, sh, and x86
-if hotplug is supported, although for x86 this should be handled by ACPI
-notification.
+ DLPAR on ppc64 ignores the ``offline`` setting and will still online added
+ memory blocks; if onlining fails, memory blocks are removed again.
-Probe interface is located at::
+Offlining Memory Blocks
+-----------------------
- /sys/devices/system/memory/probe
+In the current implementation, Linux's memory offlining will try migrating all
+movable pages off the affected memory block. As most kernel allocations, such as
+page tables, are unmovable, page migration can fail and, therefore, inhibit
+memory offlining from succeeding.
-You can tell the physical address of new memory to the kernel by::
+Having the memory provided by memory block managed by ZONE_MOVABLE significantly
+increases memory offlining reliability; still, memory offlining can fail in
+some corner cases.
- % echo start_address_of_new_memory > /sys/devices/system/memory/probe
+Further, memory offlining might retry for a long time (or even forever), until
+aborted by the user.
-Then, [start_address_of_new_memory, start_address_of_new_memory +
-memory_block_size] memory range is hot-added. In this case, hotplug script is
-not called (in current implementation). You'll have to online memory by
-yourself. Please see :ref:`memory_hotplug_how_to_online_memory`.
+Offlining of a memory block can be triggered via::
-Logical Memory hot-add phase
-============================
+ % echo offline > /sys/devices/system/memory/memoryXXX/state
-State of memory
----------------
+Or alternatively::
-To see (online/offline) state of a memory block, read 'state' file::
+ % echo 0 > /sys/devices/system/memory/memoryXXX/online
- % cat /sys/device/system/memory/memoryXXX/state
+If offlining succeeds, the state of the memory block is changed to be "offline".
+If it fails, the state of the memory block will remain unchanged and the above
+commands will fail, for example, via::
+ bash: echo: write error: Device or resource busy
-- If the memory block is online, you'll read "online".
-- If the memory block is offline, you'll read "offline".
+or via::
+ bash: echo: write error: Invalid argument
-.. _memory_hotplug_how_to_online_memory:
+Observing the State of Memory Blocks
+------------------------------------
-How to online memory
---------------------
+The state (online/offline/going-offline) of a memory block can be observed
+either via::
-When the memory is hot-added, the kernel decides whether or not to "online"
-it according to the policy which can be read from "auto_online_blocks" file::
+ % cat /sys/device/system/memory/memoryXXX/state
- % cat /sys/devices/system/memory/auto_online_blocks
+Or alternatively (1/0) via::
-The default depends on the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
-option. If it is disabled the default is "offline" which means the newly added
-memory is not in a ready-to-use state and you have to "online" the newly added
-memory blocks manually. Automatic onlining can be requested by writing "online"
-to "auto_online_blocks" file::
+ % cat /sys/device/system/memory/memoryXXX/online
- % echo online > /sys/devices/system/memory/auto_online_blocks
+For an online memory block, the managing zone can be observed via::
-This sets a global policy and impacts all memory blocks that will subsequently
-be hotplugged. Currently offline blocks keep their state. It is possible, under
-certain circumstances, that some memory blocks will be added but will fail to
-online. User space tools can check their "state" files
-(``/sys/devices/system/memory/memoryXXX/state``) and try to online them manually.
+ % cat /sys/device/system/memory/memoryXXX/valid_zones
-If the automatic onlining wasn't requested, failed, or some memory block was
-offlined it is possible to change the individual block's state by writing to the
-"state" file::
+Configuring Memory Hot(Un)Plug
+==============================
- % echo online > /sys/devices/system/memory/memoryXXX/state
+There are various ways how system administrators can configure memory
+hot(un)plug and interact with memory blocks, especially, to online them.
-This onlining will not change the ZONE type of the target memory block,
-If the memory block doesn't belong to any zone an appropriate kernel zone
-(usually ZONE_NORMAL) will be used unless movable_node kernel command line
-option is specified when ZONE_MOVABLE will be used.
+Memory Hot(Un)Plug Configuration via Sysfs
+------------------------------------------
-You can explicitly request to associate it with ZONE_MOVABLE by::
+Some memory hot(un)plug properties can be configured or inspected via sysfs in::
- % echo online_movable > /sys/devices/system/memory/memoryXXX/state
+ /sys/devices/system/memory/
-.. note:: current limit: this memory block must be adjacent to ZONE_MOVABLE
+The following files are currently defined:
-Or you can explicitly request a kernel zone (usually ZONE_NORMAL) by::
+====================== =========================================================
+``auto_online_blocks`` read-write: set or get the default state of new memory
+ blocks; configure auto-onlining.
- % echo online_kernel > /sys/devices/system/memory/memoryXXX/state
+ The default value depends on the
+ CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel configuration
+ option.
-.. note:: current limit: this memory block must be adjacent to ZONE_NORMAL
+ See the ``state`` property of memory blocks for details.
+``block_size_bytes`` read-only: the size in bytes of a memory block.
+``probe`` write-only: add (probe) selected memory blocks manually
+ from user space by supplying the physical start address.
-An explicit zone onlining can fail (e.g. when the range is already within
-and existing and incompatible zone already).
+ Availability depends on the CONFIG_ARCH_MEMORY_PROBE
+ kernel configuration option.
+``uevent`` read-write: generic udev file for device subsystems.
+====================== =========================================================
-After this, memory block XXX's state will be 'online' and the amount of
-available memory will be increased.
+.. note::
-This may be changed in future.
+ When the CONFIG_MEMORY_FAILURE kernel configuration option is enabled, two
+ additional files ``hard_offline_page`` and ``soft_offline_page`` are available
+ to trigger hwpoisoning of pages, for example, for testing purposes. Note that
+ this functionality is not really related to memory hot(un)plug or actual
+ offlining of memory blocks.
-Logical memory remove
-=====================
+Memory Block Configuration via Sysfs
+------------------------------------
-Memory offline and ZONE_MOVABLE
--------------------------------
+Each memory block is represented as a memory block device that can be
+onlined or offlined. All memory blocks have their device information located in
+sysfs. Each present memory block is listed under
+``/sys/devices/system/memory`` as::
-Memory offlining is more complicated than memory online. Because memory offline
-has to make the whole memory block be unused, memory offline can fail if
-the memory block includes memory which cannot be freed.
+ /sys/devices/system/memory/memoryXXX
-In general, memory offline can use 2 techniques.
+where XXX is the memory block id; the number of digits is variable.
-(1) reclaim and free all memory in the memory block.
-(2) migrate all pages in the memory block.
+A present memory block indicates that some memory in the range is present;
+however, a memory block might span memory holes. A memory block spanning memory
+holes cannot be offlined.
-In the current implementation, Linux's memory offline uses method (2), freeing
-all pages in the memory block by page migration. But not all pages are
-migratable. Under current Linux, migratable pages are anonymous pages and
-page caches. For offlining a memory block by migration, the kernel has to
-guarantee that the memory block contains only migratable pages.
+For example, assume 1 GiB memory block size. A device for a memory starting at
+0x100000000 is ``/sys/device/system/memory/memory4``::
-Now, a boot option for making a memory block which consists of migratable pages
-is supported. By specifying "kernelcore=" or "movablecore=" boot option, you can
-create ZONE_MOVABLE...a zone which is just used for movable pages.
-(See also Documentation/admin-guide/kernel-parameters.rst)
+ (0x100000000 / 1Gib = 4)
-Assume the system has "TOTAL" amount of memory at boot time, this boot option
-creates ZONE_MOVABLE as following.
+This device covers address range [0x100000000 ... 0x140000000)
-1) When kernelcore=YYYY boot option is used,
- Size of memory not for movable pages (not for offline) is YYYY.
- Size of memory for movable pages (for offline) is TOTAL-YYYY.
+The following files are currently defined:
-2) When movablecore=ZZZZ boot option is used,
- Size of memory not for movable pages (not for offline) is TOTAL - ZZZZ.
- Size of memory for movable pages (for offline) is ZZZZ.
+=================== ============================================================
+``online`` read-write: simplified interface to trigger onlining /
+ offlining and to observe the state of a memory block.
+ When onlining, the zone is selected automatically.
+``phys_device`` read-only: legacy interface only ever used on s390x to
+ expose the covered storage increment.
+``phys_index`` read-only: the memory block id (XXX).
+``removable`` read-only: legacy interface that indicated whether a memory
+ block was likely to be offlineable or not. Nowadays, the
+ kernel return ``1`` if and only if it supports memory
+ offlining.
+``state`` read-write: advanced interface to trigger onlining /
+ offlining and to observe the state of a memory block.
+
+ When writing, ``online``, ``offline``, ``online_kernel`` and
+ ``online_movable`` are supported.
+
+ ``online_movable`` specifies onlining to ZONE_MOVABLE.
+ ``online_kernel`` specifies onlining to the default kernel
+ zone for the memory block, such as ZONE_NORMAL.
+ ``online`` let's the kernel select the zone automatically.
+
+ When reading, ``online``, ``offline`` and ``going-offline``
+ may be returned.
+``uevent`` read-write: generic uevent file for devices.
+``valid_zones`` read-only: when a block is online, shows the zone it
+ belongs to; when a block is offline, shows what zone will
+ manage it when the block will be onlined.
+
+ For online memory blocks, ``DMA``, ``DMA32``, ``Normal``,
+ ``Movable`` and ``none`` may be returned. ``none`` indicates
+ that memory provided by a memory block is managed by
+ multiple zones or spans multiple nodes; such memory blocks
+ cannot be offlined. ``Movable`` indicates ZONE_MOVABLE.
+ Other values indicate a kernel zone.
+
+ For offline memory blocks, the first column shows the
+ zone the kernel would select when onlining the memory block
+ right now without further specifying a zone.
+
+ Availability depends on the CONFIG_MEMORY_HOTREMOVE
+ kernel configuration option.
+=================== ============================================================
.. note::
- Unfortunately, there is no information to show which memory block belongs
- to ZONE_MOVABLE. This is TBD.
+ If the CONFIG_NUMA kernel configuration option is enabled, the memoryXXX/
+ directories can also be accessed via symbolic links located in the
+ ``/sys/devices/system/node/node*`` directories.
+
+ For example::
+
+ /sys/devices/system/node/node0/memory9 -> ../../memory/memory9
+
+ A backlink will also be created::
+
+ /sys/devices/system/memory/memory9/node0 -> ../../node/node0
+
+Command Line Parameters
+-----------------------
+
+Some command line parameters affect memory hot(un)plug handling. The following
+command line parameters are relevant:
+
+======================== =======================================================
+``memhp_default_state`` configure auto-onlining by essentially setting
+ ``/sys/devices/system/memory/auto_online_blocks``.
+``movablecore`` configure automatic zone selection of the kernel. When
+ set, the kernel will default to ZONE_MOVABLE, unless
+ other zones can be kept contiguous.
+======================== =======================================================
+
+Module Parameters
+------------------
- Memory offlining can fail when dissolving a free huge page on ZONE_MOVABLE
- and the feature of freeing unused vmemmap pages associated with each hugetlb
- page is enabled.
+Instead of additional command line parameters or sysfs files, the
+``memory_hotplug`` subsystem now provides a dedicated namespace for module
+parameters. Module parameters can be set via the command line by predicating
+them with ``memory_hotplug.`` such as::
+
+ memory_hotplug.memmap_on_memory=1
+
+and they can be observed (and some even modified at runtime) via::
+
+ /sys/modules/memory_hotplug/parameters/
+
+The following module parameters are currently defined:
+
+======================== =======================================================
+``memmap_on_memory`` read-write: Allocate memory for the memmap from the
+ added memory block itself. Even if enabled, actual
+ support depends on various other system properties and
+ should only be regarded as a hint whether the behavior
+ would be desired.
+
+ While allocating the memmap from the memory block
+ itself makes memory hotplug less likely to fail and
+ keeps the memmap on the same NUMA node in any case, it
+ can fragment physical memory in a way that huge pages
+ in bigger granularity cannot be formed on hotplugged
+ memory.
+======================== =======================================================
+
+ZONE_MOVABLE
+============
+
+ZONE_MOVABLE is an important mechanism for more reliable memory offlining.
+Further, having system RAM managed by ZONE_MOVABLE instead of one of the
+kernel zones can increase the number of possible transparent huge pages and
+dynamically allocated huge pages.
+
+Most kernel allocations are unmovable. Important examples include the memory
+map (usually 1/64ths of memory), page tables, and kmalloc(). Such allocations
+can only be served from the kernel zones.
+
+Most user space pages, such as anonymous memory, and page cache pages are
+movable. Such allocations can be served from ZONE_MOVABLE and the kernel zones.
+
+Only movable allocations are served from ZONE_MOVABLE, resulting in unmovable
+allocations being limited to the kernel zones. Without ZONE_MOVABLE, there is
+absolutely no guarantee whether a memory block can be offlined successfully.
+
+Zone Imbalances
+---------------
- This can happen when we have plenty of ZONE_MOVABLE memory, but not enough
- kernel memory to allocate vmemmmap pages. We may even be able to migrate
- huge page contents, but will not be able to dissolve the source huge page.
- This will prevent an offline operation and is unfortunate as memory offlining
- is expected to succeed on movable zones. Users that depend on memory hotplug
- to succeed for movable zones should carefully consider whether the memory
- savings gained from this feature are worth the risk of possibly not being
- able to offline memory in certain situations.
+Having too much system RAM managed by ZONE_MOVABLE is called a zone imbalance,
+which can harm the system or degrade performance. As one example, the kernel
+might crash because it runs out of free memory for unmovable allocations,
+although there is still plenty of free memory left in ZONE_MOVABLE.
+
+Usually, MOVABLE:KERNEL ratios of up to 3:1 or even 4:1 are fine. Ratios of 63:1
+are definitely impossible due to the overhead for the memory map.
+
+Actual safe zone ratios depend on the workload. Extreme cases, like excessive
+long-term pinning of pages, might not be able to deal with ZONE_MOVABLE at all.
.. note::
- Techniques that rely on long-term pinnings of memory (especially, RDMA and
- vfio) are fundamentally problematic with ZONE_MOVABLE and, therefore, memory
- hot remove. Pinned pages cannot reside on ZONE_MOVABLE, to guarantee that
- memory can still get hot removed - be aware that pinning can fail even if
- there is plenty of free memory in ZONE_MOVABLE. In addition, using
- ZONE_MOVABLE might make page pinning more expensive, because pages have to be
- migrated off that zone first.
-.. _memory_hotplug_how_to_offline_memory:
+ CMA memory part of a kernel zone essentially behaves like memory in
+ ZONE_MOVABLE and similar considerations apply, especially when combining
+ CMA with ZONE_MOVABLE.
-How to offline memory
----------------------
+ZONE_MOVABLE Sizing Considerations
+----------------------------------
-You can offline a memory block by using the same sysfs interface that was used
-in memory onlining::
+We usually expect that a large portion of available system RAM will actually
+be consumed by user space, either directly or indirectly via the page cache. In
+the normal case, ZONE_MOVABLE can be used when allocating such pages just fine.
- % echo offline > /sys/devices/system/memory/memoryXXX/state
+With that in mind, it makes sense that we can have a big portion of system RAM
+managed by ZONE_MOVABLE. However, there are some things to consider when using
+ZONE_MOVABLE, especially when fine-tuning zone ratios:
+
+- Having a lot of offline memory blocks. Even offline memory blocks consume
+ memory for metadata and page tables in the direct map; having a lot of offline
+ memory blocks is not a typical case, though.
+
+- Memory ballooning without balloon compaction is incompatible with
+ ZONE_MOVABLE. Only some implementations, such as virtio-balloon and
+ pseries CMM, fully support balloon compaction.
+
+ Further, the CONFIG_BALLOON_COMPACTION kernel configuration option might be
+ disabled. In that case, balloon inflation will only perform unmovable
+ allocations and silently create a zone imbalance, usually triggered by
+ inflation requests from the hypervisor.
+
+- Gigantic pages are unmovable, resulting in user space consuming a
+ lot of unmovable memory.
+
+- Huge pages are unmovable when an architectures does not support huge
+ page migration, resulting in a similar issue as with gigantic pages.
+
+- Page tables are unmovable. Excessive swapping, mapping extremely large
+ files or ZONE_DEVICE memory can be problematic, although only really relevant
+ in corner cases. When we manage a lot of user space memory that has been
+ swapped out or is served from a file/persistent memory/... we still need a lot
+ of page tables to manage that memory once user space accessed that memory.
+
+- In certain DAX configurations the memory map for the device memory will be
+ allocated from the kernel zones.
+
+- KASAN can have a significant memory overhead, for example, consuming 1/8th of
+ the total system memory size as (unmovable) tracking metadata.
+
+- Long-term pinning of pages. Techniques that rely on long-term pinnings
+ (especially, RDMA and vfio/mdev) are fundamentally problematic with
+ ZONE_MOVABLE, and therefore, memory offlining. Pinned pages cannot reside
+ on ZONE_MOVABLE as that would turn these pages unmovable. Therefore, they
+ have to be migrated off that zone while pinning. Pinning a page can fail
+ even if there is plenty of free memory in ZONE_MOVABLE.
+
+ In addition, using ZONE_MOVABLE might make page pinning more expensive,
+ because of the page migration overhead.
+
+By default, all the memory configured at boot time is managed by the kernel
+zones and ZONE_MOVABLE is not used.
+
+To enable ZONE_MOVABLE to include the memory present at boot and to control the
+ratio between movable and kernel zones there are two command line options:
+``kernelcore=`` and ``movablecore=``. See
+Documentation/admin-guide/kernel-parameters.rst for their description.
+
+Memory Offlining and ZONE_MOVABLE
+---------------------------------
+
+Even with ZONE_MOVABLE, there are some corner cases where offlining a memory
+block might fail:
+
+- Memory blocks with memory holes; this applies to memory blocks present during
+ boot and can apply to memory blocks hotplugged via the XEN balloon and the
+ Hyper-V balloon.
+
+- Mixed NUMA nodes and mixed zones within a single memory block prevent memory
+ offlining; this applies to memory blocks present during boot only.
+
+- Special memory blocks prevented by the system from getting offlined. Examples
+ include any memory available during boot on arm64 or memory blocks spanning
+ the crashkernel area on s390x; this usually applies to memory blocks present
+ during boot only.
+
+- Memory blocks overlapping with CMA areas cannot be offlined, this applies to
+ memory blocks present during boot only.
+
+- Concurrent activity that operates on the same physical memory area, such as
+ allocating gigantic pages, can result in temporary offlining failures.
+
+- Out of memory when dissolving huge pages, especially when freeing unused
+ vmemmap pages associated with each hugetlb page is enabled.
+
+ Offlining code may be able to migrate huge page contents, but may not be able
+ to dissolve the source huge page because it fails allocating (unmovable) pages
+ for the vmemmap, because the system might not have free memory in the kernel
+ zones left.
+
+ Users that depend on memory offlining to succeed for movable zones should
+ carefully consider whether the memory savings gained from this feature are
+ worth the risk of possibly not being able to offline memory in certain
+ situations.
+
+Further, when running into out of memory situations while migrating pages, or
+when still encountering permanently unmovable pages within ZONE_MOVABLE
+(-> BUG), memory offlining will keep retrying until it eventually succeeds.
+
+When offlining is triggered from user space, the offlining context can be
+terminated by sending a fatal signal. A timeout based offlining can easily be
+implemented via::
-If offline succeeds, the state of the memory block is changed to be "offline".
-If it fails, some error core (like -EBUSY) will be returned by the kernel.
-Even if a memory block does not belong to ZONE_MOVABLE, you can try to offline
-it. If it doesn't contain 'unmovable' memory, you'll get success.
-
-A memory block under ZONE_MOVABLE is considered to be able to be offlined
-easily. But under some busy state, it may return -EBUSY. Even if a memory
-block cannot be offlined due to -EBUSY, you can retry offlining it and may be
-able to offline it (or not). (For example, a page is referred to by some kernel
-internal call and released soon.)
-
-Consideration:
- Memory hotplug's design direction is to make the possibility of memory
- offlining higher and to guarantee unplugging memory under any situation. But
- it needs more work. Returning -EBUSY under some situation may be good because
- the user can decide to retry more or not by himself. Currently, memory
- offlining code does some amount of retry with 120 seconds timeout.
-
-Physical memory remove
-======================
-
-Need more implementation yet....
- - Notification completion of remove works by OS to firmware.
- - Guard from remove if not yet.
-
-
-Locking Internals
-=================
-
-When adding/removing memory that uses memory block devices (i.e. ordinary RAM),
-the device_hotplug_lock should be held to:
-
-- synchronize against online/offline requests (e.g. via sysfs). This way, memory
- block devices can only be accessed (.online/.state attributes) by user
- space once memory has been fully added. And when removing memory, we
- know nobody is in critical sections.
-- synchronize against CPU hotplug and similar (e.g. relevant for ACPI and PPC)
-
-Especially, there is a possible lock inversion that is avoided using
-device_hotplug_lock when adding memory and user space tries to online that
-memory faster than expected:
-
-- device_online() will first take the device_lock(), followed by
- mem_hotplug_lock
-- add_memory_resource() will first take the mem_hotplug_lock, followed by
- the device_lock() (while creating the devices, during bus_add_device()).
-
-As the device is visible to user space before taking the device_lock(), this
-can result in a lock inversion.
-
-onlining/offlining of memory should be done via device_online()/
-device_offline() - to make sure it is properly synchronized to actions
-via sysfs. Holding device_hotplug_lock is advised (to e.g. protect online_type)
-
-When adding/removing/onlining/offlining memory or adding/removing
-heterogeneous/device memory, we should always hold the mem_hotplug_lock in
-write mode to serialise memory hotplug (e.g. access to global/zone
-variables).
-
-In addition, mem_hotplug_lock (in contrast to device_hotplug_lock) in read
-mode allows for a quite efficient get_online_mems/put_online_mems
-implementation, so code accessing memory can protect from that memory
-vanishing.
-
-
-Future Work
-===========
-
- - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
- sysctl or new control file.
- - showing memory block and physical device relationship.
- - test and make it better memory offlining.
- - support HugeTLB page migration and offlining.
- - memmap removing at memory offline.
- - physical remove memory.
+ % timeout $TIMEOUT offline_block | failure_handling
diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index 067a90a1499c..cd653561e531 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -245,6 +245,14 @@ MPOL_INTERLEAVED
address range or file. During system boot up, the temporary
interleaved system default policy works in this mode.
+MPOL_PREFERRED_MANY
+ This mode specifies that the allocation should be attempted from the
+ nodemask specified in the policy. If that allocation fails, the kernel
+ will search other nodes, in order of increasing distance from the first
+ set bit in the nodemask based on information provided by the platform
+ firmware. It is similar to MPOL_PREFERRED with the main exception that
+ is an error to have an empty nodemask.
+
NUMA memory policy supports the following optional mode flags:
MPOL_F_STATIC_NODES
@@ -253,10 +261,10 @@ MPOL_F_STATIC_NODES
nodes changes after the memory policy has been defined.
Without this flag, any time a mempolicy is rebound because of a
- change in the set of allowed nodes, the node (Preferred) or
- nodemask (Bind, Interleave) is remapped to the new set of
- allowed nodes. This may result in nodes being used that were
- previously undesired.
+ change in the set of allowed nodes, the preferred nodemask (Preferred
+ Many), preferred node (Preferred) or nodemask (Bind, Interleave) is
+ remapped to the new set of allowed nodes. This may result in nodes
+ being used that were previously undesired.
With this flag, if the user-specified nodes overlap with the
nodes allowed by the task's cpuset, then the memory policy is
diff --git a/Documentation/arm64/tagged-address-abi.rst b/Documentation/arm64/tagged-address-abi.rst
index 459e6b66ff68..0c9120ec58ae 100644
--- a/Documentation/arm64/tagged-address-abi.rst
+++ b/Documentation/arm64/tagged-address-abi.rst
@@ -45,14 +45,24 @@ how the user addresses are used by the kernel:
1. User addresses not accessed by the kernel but used for address space
management (e.g. ``mprotect()``, ``madvise()``). The use of valid
- tagged pointers in this context is allowed with the exception of
- ``brk()``, ``mmap()`` and the ``new_address`` argument to
- ``mremap()`` as these have the potential to alias with existing
- user addresses.
-
- NOTE: This behaviour changed in v5.6 and so some earlier kernels may
- incorrectly accept valid tagged pointers for the ``brk()``,
- ``mmap()`` and ``mremap()`` system calls.
+ tagged pointers in this context is allowed with these exceptions:
+
+ - ``brk()``, ``mmap()`` and the ``new_address`` argument to
+ ``mremap()`` as these have the potential to alias with existing
+ user addresses.
+
+ NOTE: This behaviour changed in v5.6 and so some earlier kernels may
+ incorrectly accept valid tagged pointers for the ``brk()``,
+ ``mmap()`` and ``mremap()`` system calls.
+
+ - The ``range.start``, ``start`` and ``dst`` arguments to the
+ ``UFFDIO_*`` ``ioctl()``s used on a file descriptor obtained from
+ ``userfaultfd()``, as fault addresses subsequently obtained by reading
+ the file descriptor will be untagged, which may otherwise confuse
+ tag-unaware programs.
+
+ NOTE: This behaviour changed in v5.14 and so some earlier kernels may
+ incorrectly accept valid tagged pointers for this system call.
2. User addresses accessed by the kernel (e.g. ``write()``). This ABI
relaxation is disabled by default and the application thread needs to
diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst
index 29682f69a915..5c0552e78c58 100644
--- a/Documentation/core-api/cachetlb.rst
+++ b/Documentation/core-api/cachetlb.rst
@@ -271,10 +271,15 @@ maps this page at its virtual address.
``void flush_dcache_page(struct page *page)``
- Any time the kernel writes to a page cache page, _OR_
- the kernel is about to read from a page cache page and
- user space shared/writable mappings of this page potentially
- exist, this routine is called.
+ This routines must be called when:
+
+ a) the kernel did write to a page that is in the page cache page
+ and / or in high memory
+ b) the kernel is about to read from a page cache page and user space
+ shared/writable mappings of this page potentially exist. Note
+ that {get,pin}_user_pages{_fast} already call flush_dcache_page
+ on any page found in the user address space and thus driver
+ code rarely needs to take this into account.
.. note::
@@ -284,38 +289,34 @@ maps this page at its virtual address.
handling vfs symlinks in the page cache need not call
this interface at all.
- The phrase "kernel writes to a page cache page" means,
- specifically, that the kernel executes store instructions
- that dirty data in that page at the page->virtual mapping
- of that page. It is important to flush here to handle
- D-cache aliasing, to make sure these kernel stores are
- visible to user space mappings of that page.
-
- The corollary case is just as important, if there are users
- which have shared+writable mappings of this file, we must make
- sure that kernel reads of these pages will see the most recent
- stores done by the user.
-
- If D-cache aliasing is not an issue, this routine may
- simply be defined as a nop on that architecture.
-
- There is a bit set aside in page->flags (PG_arch_1) as
- "architecture private". The kernel guarantees that,
- for pagecache pages, it will clear this bit when such
- a page first enters the pagecache.
-
- This allows these interfaces to be implemented much more
- efficiently. It allows one to "defer" (perhaps indefinitely)
- the actual flush if there are currently no user processes
- mapping this page. See sparc64's flush_dcache_page and
- update_mmu_cache implementations for an example of how to go
- about doing this.
-
- The idea is, first at flush_dcache_page() time, if
- page->mapping->i_mmap is an empty tree, just mark the architecture
- private page flag bit. Later, in update_mmu_cache(), a check is
- made of this flag bit, and if set the flush is done and the flag
- bit is cleared.
+ The phrase "kernel writes to a page cache page" means, specifically,
+ that the kernel executes store instructions that dirty data in that
+ page at the page->virtual mapping of that page. It is important to
+ flush here to handle D-cache aliasing, to make sure these kernel stores
+ are visible to user space mappings of that page.
+
+ The corollary case is just as important, if there are users which have
+ shared+writable mappings of this file, we must make sure that kernel
+ reads of these pages will see the most recent stores done by the user.
+
+ If D-cache aliasing is not an issue, this routine may simply be defined
+ as a nop on that architecture.
+
+ There is a bit set aside in page->flags (PG_arch_1) as "architecture
+ private". The kernel guarantees that, for pagecache pages, it will
+ clear this bit when such a page first enters the pagecache.
+
+ This allows these interfaces to be implemented much more efficiently.
+ It allows one to "defer" (perhaps indefinitely) the actual flush if
+ there are currently no user processes mapping this page. See sparc64's
+ flush_dcache_page and update_mmu_cache implementations for an example
+ of how to go about doing this.
+
+ The idea is, first at flush_dcache_page() time, if page_file_mapping()
+ returns a mapping, and mapping_mapped on that mapping returns %false,
+ just mark the architecture private page flag bit. Later, in
+ update_mmu_cache(), a check is made of this flag bit, and if set the
+ flush is done and the flag bit is cleared.
.. important::
@@ -357,19 +358,6 @@ maps this page at its virtual address.
architectures). For incoherent architectures, it should flush
the cache of the page at vmaddr.
- ``void flush_kernel_dcache_page(struct page *page)``
-
- When the kernel needs to modify a user page is has obtained
- with kmap, it calls this function after all modifications are
- complete (but before kunmapping it) to bring the underlying
- page up to date. It is assumed here that the user has no
- incoherent cached copies (i.e. the original page was obtained
- from a mechanism like get_user_pages()). The default
- implementation is a nop and should remain so on all coherent
- architectures. On incoherent architectures, this should flush
- the kernel cache for page (using page_address(page)).
-
-
``void flush_icache_range(unsigned long start, unsigned long end)``
When the kernel stores into addresses that it will execute
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index 83ec4a556c19..21dc03bc10a4 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -181,9 +181,16 @@ By default, KASAN prints a bug report only for the first invalid memory access.
With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This
effectively disables ``panic_on_warn`` for KASAN reports.
+Alternatively, independent of ``panic_on_warn`` the ``kasan.fault=`` boot
+parameter can be used to control panic and reporting behaviour:
+
+- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN
+ report or also panic the kernel (default: ``report``). The panic happens even
+ if ``kasan_multi_shot`` is enabled.
+
Hardware tag-based KASAN mode (see the section about various modes below) is
intended for use in production as a security mitigation. Therefore, it supports
-boot parameters that allow disabling KASAN or controlling its features.
+additional boot parameters that allow disabling KASAN or controlling features:
- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).
@@ -199,10 +206,6 @@ boot parameters that allow disabling KASAN or controlling its features.
- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
traces collection (default: ``on``).
-- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN
- report or also panic the kernel (default: ``report``). The panic happens even
- if ``kasan_multi_shot`` is enabled.
-
Implementation details
----------------------
diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst
index fdf04e741ea5..0fbe3308bf37 100644
--- a/Documentation/dev-tools/kfence.rst
+++ b/Documentation/dev-tools/kfence.rst
@@ -65,25 +65,27 @@ Error reports
A typical out-of-bounds access looks like this::
==================================================================
- BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0xa3/0x22b
+ BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0xa6/0x234
- Out-of-bounds read at 0xffffffffb672efff (1B left of kfence-#17):
- test_out_of_bounds_read+0xa3/0x22b
- kunit_try_run_case+0x51/0x85
+ Out-of-bounds read at 0xffff8c3f2e291fff (1B left of kfence-#72):
+ test_out_of_bounds_read+0xa6/0x234
+ kunit_try_run_case+0x61/0xa0
kunit_generic_run_threadfn_adapter+0x16/0x30
- kthread+0x137/0x160
+ kthread+0x176/0x1b0
ret_from_fork+0x22/0x30
- kfence-#17 [0xffffffffb672f000-0xffffffffb672f01f, size=32, cache=kmalloc-32] allocated by task 507:
- test_alloc+0xf3/0x25b
- test_out_of_bounds_read+0x98/0x22b
- kunit_try_run_case+0x51/0x85
+ kfence-#72: 0xffff8c3f2e292000-0xffff8c3f2e29201f, size=32, cache=kmalloc-32
+
+ allocated by task 484 on cpu 0 at 32.919330s:
+ test_alloc+0xfe/0x738
+ test_out_of_bounds_read+0x9b/0x234
+ kunit_try_run_case+0x61/0xa0
kunit_generic_run_threadfn_adapter+0x16/0x30
- kthread+0x137/0x160
+ kthread+0x176/0x1b0
ret_from_fork+0x22/0x30
- CPU: 4 PID: 107 Comm: kunit_try_catch Not tainted 5.8.0-rc6+ #7
- Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
+ CPU: 0 PID: 484 Comm: kunit_try_catch Not tainted 5.13.0-rc3+ #7
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
==================================================================
The header of the report provides a short summary of the function involved in
@@ -96,30 +98,32 @@ Use-after-free accesses are reported as::
==================================================================
BUG: KFENCE: use-after-free read in test_use_after_free_read+0xb3/0x143
- Use-after-free read at 0xffffffffb673dfe0 (in kfence-#24):
+ Use-after-free read at 0xffff8c3f2e2a0000 (in kfence-#79):
test_use_after_free_read+0xb3/0x143
- kunit_try_run_case+0x51/0x85
+ kunit_try_run_case+0x61/0xa0
kunit_generic_run_threadfn_adapter+0x16/0x30
- kthread+0x137/0x160
+ kthread+0x176/0x1b0
ret_from_fork+0x22/0x30
- kfence-#24 [0xffffffffb673dfe0-0xffffffffb673dfff, size=32, cache=kmalloc-32] allocated by task 507:
- test_alloc+0xf3/0x25b
+ kfence-#79: 0xffff8c3f2e2a0000-0xffff8c3f2e2a001f, size=32, cache=kmalloc-32
+
+ allocated by task 488 on cpu 2 at 33.871326s:
+ test_alloc+0xfe/0x738
test_use_after_free_read+0x76/0x143
- kunit_try_run_case+0x51/0x85
+ kunit_try_run_case+0x61/0xa0
kunit_generic_run_threadfn_adapter+0x16/0x30
- kthread+0x137/0x160
+ kthread+0x176/0x1b0
ret_from_fork+0x22/0x30
- freed by task 507:
+ freed by task 488 on cpu 2 at 33.871358s:
test_use_after_free_read+0xa8/0x143
- kunit_try_run_case+0x51/0x85
+ kunit_try_run_case+0x61/0xa0
kunit_generic_run_threadfn_adapter+0x16/0x30
- kthread+0x137/0x160
+ kthread+0x176/0x1b0
ret_from_fork+0x22/0x30
- CPU: 4 PID: 109 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7
- Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
+ CPU: 2 PID: 488 Comm: kunit_try_catch Tainted: G B 5.13.0-rc3+ #7
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
==================================================================
KFENCE also reports on invalid frees, such as double-frees::
@@ -127,30 +131,32 @@ KFENCE also reports on invalid frees, such as double-frees::
==================================================================
BUG: KFENCE: invalid free in test_double_free+0xdc/0x171
- Invalid free of 0xffffffffb6741000:
+ Invalid free of 0xffff8c3f2e2a4000 (in kfence-#81):
test_double_free+0xdc/0x171
- kunit_try_run_case+0x51/0x85
+ kunit_try_run_case+0x61/0xa0
kunit_generic_run_threadfn_adapter+0x16/0x30
- kthread+0x137/0x160
+ kthread+0x176/0x1b0
ret_from_fork+0x22/0x30
- kfence-#26 [0xffffffffb6741000-0xffffffffb674101f, size=32, cache=kmalloc-32] allocated by task 507:
- test_alloc+0xf3/0x25b
+ kfence-#81: 0xffff8c3f2e2a4000-0xffff8c3f2e2a401f, size=32, cache=kmalloc-32
+
+ allocated by task 490 on cpu 1 at 34.175321s:
+ test_alloc+0xfe/0x738
test_double_free+0x76/0x171
- kunit_try_run_case+0x51/0x85
+ kunit_try_run_case+0x61/0xa0
kunit_generic_run_threadfn_adapter+0x16/0x30
- kthread+0x137/0x160
+ kthread+0x176/0x1b0
ret_from_fork+0x22/0x30
- freed by task 507:
+ freed by task 490 on cpu 1 at 34.175348s:
test_double_free+0xa8/0x171
- kunit_try_run_case+0x51/0x85
+ kunit_try_run_case+0x61/0xa0
kunit_generic_run_threadfn_adapter+0x16/0x30
- kthread+0x137/0x160
+ kthread+0x176/0x1b0
ret_from_fork+0x22/0x30
- CPU: 4 PID: 111 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7
- Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
+ CPU: 1 PID: 490 Comm: kunit_try_catch Tainted: G B 5.13.0-rc3+ #7
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
==================================================================
KFENCE also uses pattern-based redzones on the other side of an object's guard
@@ -160,23 +166,25 @@ These are reported on frees::
==================================================================
BUG: KFENCE: memory corruption in test_kmalloc_aligned_oob_write+0xef/0x184
- Corrupted memory at 0xffffffffb6797ff9 [ 0xac . . . . . . ] (in kfence-#69):
+ Corrupted memory at 0xffff8c3f2e33aff9 [ 0xac . . . . . . ] (in kfence-#156):
test_kmalloc_aligned_oob_write+0xef/0x184
- kunit_try_run_case+0x51/0x85
+ kunit_try_run_case+0x61/0xa0
kunit_generic_run_threadfn_adapter+0x16/0x30
- kthread+0x137/0x160
+ kthread+0x176/0x1b0
ret_from_fork+0x22/0x30
- kfence-#69 [0xffffffffb6797fb0-0xffffffffb6797ff8, size=73, cache=kmalloc-96] allocated by task 507:
- test_alloc+0xf3/0x25b
+ kfence-#156: 0xffff8c3f2e33afb0-0xffff8c3f2e33aff8, size=73, cache=kmalloc-96
+
+ allocated by task 502 on cpu 7 at 42.159302s:
+ test_alloc+0xfe/0x738
test_kmalloc_aligned_oob_write+0x57/0x184
- kunit_try_run_case+0x51/0x85
+ kunit_try_run_case+0x61/0xa0
kunit_generic_run_threadfn_adapter+0x16/0x30
- kthread+0x137/0x160
+ kthread+0x176/0x1b0
ret_from_fork+0x22/0x30
- CPU: 4 PID: 120 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7
- Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
+ CPU: 7 PID: 502 Comm: kunit_try_catch Tainted: G B 5.13.0-rc3+ #7
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
==================================================================
For such errors, the address where the corruption occurred as well as the
diff --git a/Documentation/translations/zh_CN/core-api/cachetlb.rst b/Documentation/translations/zh_CN/core-api/cachetlb.rst
index 8376485a534d..55827b8a7c53 100644
--- a/Documentation/translations/zh_CN/core-api/cachetlb.rst
+++ b/Documentation/translations/zh_CN/core-api/cachetlb.rst
@@ -298,15 +298,6 @@ HyperSparc cpu就是这样一个具有这种属性的cpu。
用。默认的实现是nop(对于所有相干的架构应该保持这样)。对于不一致性
的架构,它应该刷新vmaddr处的页面缓存。
- ``void flush_kernel_dcache_page(struct page *page)``
-
- 当内核需要修改一个用kmap获得的用户页时,它会在所有修改完成后(但在
- kunmapping之前)调用这个函数,以使底层页面达到最新状态。这里假定用
- 户没有不一致性的缓存副本(即原始页面是从类似get_user_pages()的机制
- 中获得的)。默认的实现是一个nop,在所有相干的架构上都应该如此。在不
- 一致性的架构上,这应该刷新内核缓存中的页面(使用page_address(page))。
-
-
``void flush_icache_range(unsigned long start, unsigned long end)``
当内核存储到它将执行的地址中时(例如在加载模块时),这个函数被调用。
diff --git a/arch/Kconfig b/arch/Kconfig
index 129df498a8e1..4f7596092ad8 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -425,6 +425,34 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
irqs disabled over activate_mm. Architectures that do IPI based TLB
shootdowns should enable this.
+# Use normal mm refcounting for MMU_LAZY_TLB kernel thread references.
+# MMU_LAZY_TLB_REFCOUNT=n can improve the scalability of context switching
+# to/from kernel threads when the same mm is running on a lot of CPUs (a large
+# multi-threaded application), by reducing contention on the mm refcount.
+#
+# This can be disabled if the architecture ensures no CPUs are using an mm as a
+# "lazy tlb" beyond its final refcount (i.e., by the time __mmdrop frees the mm
+# or its kernel page tables). This could be arranged by arch_exit_mmap(), or
+# final exit(2) TLB flush, for example. arch code must also ensure the
+# _lazy_tlb variants of mmgrab/mmdrop are used when dropping the lazy reference
+# to a kthread ->active_mm (non-arch code has been converted already).
+config MMU_LAZY_TLB_REFCOUNT
+ def_bool y
+ depends on !MMU_LAZY_TLB_SHOOTDOWN
+
+# This option allows MMU_LAZY_TLB_REFCOUNT=n. It ensures no CPUs are using an
+# mm as a lazy tlb beyond its last reference count, by shooting down these
+# users before the mm is deallocated. __mmdrop() first IPIs all CPUs that may
+# be using the mm as a lazy tlb, so that they may switch themselves to using
+# init_mm for their active mm. mm_cpumask(mm) is used to determine which CPUs
+# may be using mm as a lazy tlb mm.
+#
+# To implement this, an arch must ensure mm_cpumask(mm) contains at least all
+# possible CPUs in which the mm is lazy, and it must meet the requirements for
+# MMU_LAZY_TLB_REFCOUNT=n (see above).
+config MMU_LAZY_TLB_SHOOTDOWN
+ bool
+
config ARCH_HAVE_NMI_SAFE_CMPXCHG
bool
@@ -886,7 +914,7 @@ config HAVE_SOFTIRQ_ON_OWN_STACK
bool
help
Architecture provides a function to run __do_softirq() on a
- seperate stack.
+ separate stack.
config PGTABLE_LEVELS
int
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 23bf823376e1..e68fb879e4f9 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -292,6 +292,7 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr
extern void flush_dcache_page(struct page *);
void flush_dcache_folio(struct folio *folio);
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
static inline void flush_kernel_vmap_range(void *addr, int size)
{
if ((cache_is_vivt() || cache_is_vipt_aliasing()))
@@ -313,9 +314,6 @@ static inline void flush_anon_page(struct vm_area_struct *vma,
__flush_anon_page(vma, page, vmaddr);
}
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-extern void flush_kernel_dcache_page(struct page *);
-
#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages)
#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index 827b50f1c73e..1b4a41aad793 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -253,7 +253,7 @@ static int ecard_init_mm(void)
current->mm = mm;
current->active_mm = mm;
activate_mm(active_mm, mm);
- mmdrop(active_mm);
+ mmdrop_lazy_tlb(active_mm);
ecard_init_pgtables(mm);
return 0;
}
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 6d89db7895d1..7ff9feea13a6 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -346,39 +346,6 @@ void flush_dcache_page(struct page *page)
EXPORT_SYMBOL(flush_dcache_page);
/*
- * Ensure cache coherency for the kernel mapping of this page. We can
- * assume that the page is pinned via kmap.
- *
- * If the page only exists in the page cache and there are no user
- * space mappings, this is a no-op since the page was already marked
- * dirty at creation. Otherwise, we need to flush the dirty kernel
- * cache lines directly.
- */
-void flush_kernel_dcache_page(struct page *page)
-{
- if (cache_is_vivt() || cache_is_vipt_aliasing()) {
- struct address_space *mapping;
-
- mapping = page_mapping_file(page);
-
- if (!mapping || mapping_mapped(mapping)) {
- void *addr;
-
- addr = page_address(page);
- /*
- * kmap_atomic() doesn't set the page virtual
- * address for highmem pages, and
- * kunmap_atomic() takes care of cache
- * flushing already.
- */
- if (!IS_ENABLED(CONFIG_HIGHMEM) || addr)
- __cpuc_flush_dcache_area(addr, PAGE_SIZE);
- }
- }
-}
-EXPORT_SYMBOL(flush_kernel_dcache_page);
-
-/*
* Flush an anonymous page so that users of get_user_pages()
* can safely access the data. The expected sequence is:
*
diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c
index 8b3d7191e2b8..2658f52903da 100644
--- a/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c
@@ -166,12 +166,6 @@ void flush_dcache_page(struct page *page)
}
EXPORT_SYMBOL(flush_dcache_page);
-void flush_kernel_dcache_page(struct page *page)
-{
- __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
-}
-EXPORT_SYMBOL(flush_kernel_dcache_page);
-
void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
unsigned long uaddr, void *dst, const void *src,
unsigned long len)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index d74586508448..af8ab553a268 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1506,8 +1506,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return ret;
}
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c
index 07ff17ea33de..fb91b069dc69 100644
--- a/arch/csky/abiv1/cacheflush.c
+++ b/arch/csky/abiv1/cacheflush.c
@@ -56,17 +56,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr,
}
}
-void flush_kernel_dcache_page(struct page *page)
-{
- struct address_space *mapping;
-
- mapping = page_mapping_file(page);
-
- if (!mapping || mapping_mapped(mapping))
- dcache_wbinv_all();
-}
-EXPORT_SYMBOL(flush_kernel_dcache_page);
-
void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h
index 6cab7afae962..ed62e2066ba7 100644
--- a/arch/csky/abiv1/inc/abi/cacheflush.h
+++ b/arch/csky/abiv1/inc/abi/cacheflush.h
@@ -14,12 +14,10 @@ extern void flush_dcache_page(struct page *);
#define flush_cache_page(vma, page, pfn) cache_wbinv_all()
#define flush_cache_dup_mm(mm) cache_wbinv_all()
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-extern void flush_kernel_dcache_page(struct page *);
-
#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages)
#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
static inline void flush_kernel_vmap_range(void *addr, int size)
{
dcache_wbinv_all();
diff --git a/arch/csky/kernel/probes/kprobes.c b/arch/csky/kernel/probes/kprobes.c
index 68b22b499aeb..8fffa34d4e1c 100644
--- a/arch/csky/kernel/probes/kprobes.c
+++ b/arch/csky/kernel/probes/kprobes.c
@@ -283,8 +283,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr)
* normal page fault.
*/
regs->pc = (unsigned long) cur->addr;
- if (!instruction_pointer(regs))
- BUG();
+ BUG_ON(!instruction_pointer(regs));
if (kcb->kprobe_status == KPROBE_REENTER)
restore_previous_kprobe(kcb);
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 064a967a7b6e..5c6da8d83c1a 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -484,8 +484,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return ret;
}
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index 71cd547655d9..c136a01e467e 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -443,8 +443,6 @@ extern int mem_init_done;
asmlinkage void __init mmu_init(void);
-void __init *early_get_page(void);
-
#endif /* __ASSEMBLY__ */
#endif /* __KERNEL__ */
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index ab55c70380a5..952f35b335b2 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -265,18 +265,6 @@ asmlinkage void __init mmu_init(void)
dma_contiguous_reserve(memory_start + lowmem_size - 1);
}
-/* This is only called until mem_init is done. */
-void __init *early_get_page(void)
-{
- /*
- * Mem start + kernel_tlb -> here is limit
- * because of mem mapping from head.S
- */
- return memblock_alloc_try_nid_raw(PAGE_SIZE, PAGE_SIZE,
- MEMBLOCK_LOW_LIMIT, memory_start + kernel_tlb,
- NUMA_NO_NODE);
-}
-
void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask)
{
void *p;
diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c
index 38ccb909bc9d..c1833b159d3b 100644
--- a/arch/microblaze/mm/pgtable.c
+++ b/arch/microblaze/mm/pgtable.c
@@ -33,6 +33,7 @@
#include <linux/init.h>
#include <linux/mm_types.h>
#include <linux/pgtable.h>
+#include <linux/memblock.h>
#include <asm/pgalloc.h>
#include <linux/io.h>
@@ -242,15 +243,13 @@ unsigned long iopa(unsigned long addr)
__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
{
- pte_t *pte;
- if (mem_init_done) {
- pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
- } else {
- pte = (pte_t *)early_get_page();
- if (pte)
- clear_page(pte);
- }
- return pte;
+ if (mem_init_done)
+ return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ else
+ return memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE,
+ MEMBLOCK_LOW_LIMIT,
+ memory_start + kernel_tlb,
+ NUMA_NO_NODE);
}
void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h
index d687b40b9fbb..b3dc9c589442 100644
--- a/arch/mips/include/asm/cacheflush.h
+++ b/arch/mips/include/asm/cacheflush.h
@@ -125,13 +125,7 @@ static inline void kunmap_noncoherent(void)
kunmap_coherent();
}
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-static inline void flush_kernel_dcache_page(struct page *page)
-{
- BUG_ON(cpu_has_dc_aliases && PageHighMem(page));
- flush_dcache_page(page);
-}
-
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
/*
* For now flush_kernel_vmap_range and invalidate_kernel_vmap_range both do a
* cache writeback and invalidate operation.
diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h
index b5037981f023..3fc0bb7d6487 100644
--- a/arch/nds32/include/asm/cacheflush.h
+++ b/arch/nds32/include/asm/cacheflush.h
@@ -37,8 +37,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
void flush_anon_page(struct vm_area_struct *vma,
struct page *page, unsigned long vaddr);
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-void flush_kernel_dcache_page(struct page *page);
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
void flush_kernel_vmap_range(void *addr, int size);
void invalidate_kernel_vmap_range(void *addr, int size);
#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&(mapping)->i_pages)
diff --git a/arch/nds32/mm/cacheflush.c b/arch/nds32/mm/cacheflush.c
index ad5344ef5d33..07aac65d1cab 100644
--- a/arch/nds32/mm/cacheflush.c
+++ b/arch/nds32/mm/cacheflush.c
@@ -318,15 +318,6 @@ void flush_anon_page(struct vm_area_struct *vma,
local_irq_restore(flags);
}
-void flush_kernel_dcache_page(struct page *page)
-{
- unsigned long flags;
- local_irq_save(flags);
- cpu_dcache_wbinval_page((unsigned long)page_address(page));
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(flush_kernel_dcache_page);
-
void flush_kernel_vmap_range(void *addr, int size)
{
unsigned long flags;
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index 99663fc1f997..eef0096db5f8 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -36,16 +36,12 @@ void flush_cache_all_local(void);
void flush_cache_all(void);
void flush_cache_mm(struct mm_struct *mm);
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
void flush_kernel_dcache_page_addr(void *addr);
-static inline void flush_kernel_dcache_page(struct page *page)
-{
- flush_kernel_dcache_page_addr(page_address(page));
-}
#define flush_kernel_dcache_range(start,size) \
flush_kernel_dcache_range_asm((start), (start)+(size));
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
void flush_kernel_vmap_range(void *vaddr, int size);
void invalidate_kernel_vmap_range(void *vaddr, int size);
@@ -59,7 +55,7 @@ extern void flush_dcache_page(struct page *page);
#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
#define flush_icache_page(vma,page) do { \
- flush_kernel_dcache_page(page); \
+ flush_kernel_dcache_page_addr(page_address(page)); \
flush_kernel_icache_page(page_address(page)); \
} while (0)
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index 86a1a63563fd..39e02227e231 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -334,7 +334,7 @@ void flush_dcache_page(struct page *page)
return;
}
- flush_kernel_dcache_page(page);
+ flush_kernel_dcache_page_addr(page_address(page));
if (!mapping)
return;
@@ -375,7 +375,6 @@ EXPORT_SYMBOL(flush_dcache_page);
/* Defined in arch/parisc/kernel/pacache.S */
EXPORT_SYMBOL(flush_kernel_dcache_range_asm);
-EXPORT_SYMBOL(flush_kernel_dcache_page_asm);
EXPORT_SYMBOL(flush_data_cache_local);
EXPORT_SYMBOL(flush_kernel_icache_range_asm);
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d01e3401581d..53db06ba4223 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -254,6 +254,7 @@ config PPC
select IRQ_FORCED_THREADING
select MMU_GATHER_PAGE_SIZE
select MMU_GATHER_RCU_TABLE_FREE
+ select MMU_LAZY_TLB_SHOOTDOWN if PPC_BOOK3S_64
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE if PPC64 || NOT_COHERENT_CACHE
select NEED_SG_DMA_LENGTH
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 447b78a87c8f..a492170b5ba0 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1556,7 +1556,7 @@ void start_secondary(void *unused)
if (IS_ENABLED(CONFIG_PPC32))
setup_kup();
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
smp_store_cpu_info(cpu);
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index aefc100d79a7..2710a61d7ef2 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -785,10 +785,10 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush)
if (current->active_mm == mm) {
WARN_ON_ONCE(current->mm != NULL);
/* Is a kernel thread and is using mm as the lazy tlb */
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
switch_mm_irqs_off(mm, &init_mm, current);
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
}
/*
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index ad198b439222..c3c4e31462ec 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -119,8 +119,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
return rc;
}
-void __ref arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 377d852f5a9a..533cb1335b7f 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -211,13 +211,11 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb)
static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb)
{
unsigned long section_nr;
- struct mem_section *mem_sect;
struct memory_block *mem_block;
section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr));
- mem_sect = __nr_to_section(section_nr);
- mem_block = find_memory_block(mem_sect);
+ mem_block = find_memory_block(section_nr);
return mem_block;
}
@@ -286,7 +284,7 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si
{
unsigned long block_sz, start_pfn;
int sections_per_block;
- int i, nid;
+ int i;
start_pfn = base >> PAGE_SHIFT;
@@ -297,10 +295,9 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si
block_sz = pseries_memory_block_size();
sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
- nid = memory_add_physaddr_to_nid(base);
for (i = 0; i < sections_per_block; i++) {
- __remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE);
+ __remove_memory(base, MIN_MEMORY_BLOCK_SIZE);
base += MIN_MEMORY_BLOCK_SIZE;
}
@@ -387,7 +384,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
block_sz = pseries_memory_block_size();
- __remove_memory(mem_block->nid, lmb->base_addr, block_sz);
+ __remove_memory(lmb->base_addr, block_sz);
put_device(&mem_block->dev);
/* Update memory regions for memory remove */
@@ -660,7 +657,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
rc = dlpar_online_lmb(lmb);
if (rc) {
- __remove_memory(nid, lmb->base_addr, block_sz);
+ __remove_memory(lmb->base_addr, block_sz);
invalidate_lmb_associativity_index(lmb);
} else {
lmb->flags |= DRCONF_MEM_ASSIGNED;
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 8ac710de1ab1..d85bd7f5d8dc 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -306,8 +306,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return rc;
}
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h
index 4486a865ff62..372afa82fee6 100644
--- a/arch/sh/include/asm/cacheflush.h
+++ b/arch/sh/include/asm/cacheflush.h
@@ -63,6 +63,8 @@ static inline void flush_anon_page(struct vm_area_struct *vma,
if (boot_cpu_data.dcache.n_aliases && PageAnon(page))
__flush_anon_page(page, vmaddr);
}
+
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
static inline void flush_kernel_vmap_range(void *addr, int size)
{
__flush_wback_region(addr, size);
@@ -72,12 +74,6 @@ static inline void invalidate_kernel_vmap_range(void *addr, int size)
__flush_invalidate_region(addr, size);
}
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-static inline void flush_kernel_dcache_page(struct page *page)
-{
- flush_dcache_page(page);
-}
-
extern void copy_to_user_page(struct vm_area_struct *vma,
struct page *page, unsigned long vaddr, void *dst, const void *src,
unsigned long len);
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index ce26c7f8950a..506784702430 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -414,8 +414,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return ret;
}
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 74b78840182d..bd90b8fe81e4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -801,8 +801,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return __add_pages(nid, start_pfn, nr_pages, params);
}
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ddeaba947eb3..a6e11763763f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1255,8 +1255,7 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
remove_pagetable(start, end, true, NULL);
}
-void __ref arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/block/blk-map.c b/block/blk-map.c
index 3743158ddaeb..4639bc6b5c62 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -309,7 +309,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
static void bio_invalidate_vmalloc_pages(struct bio *bio)
{
-#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
+#ifdef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE
if (bio->bi_private && !op_is_write(bio_op(bio))) {
unsigned long i, len = 0;
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 8cc195c4c861..eb4faf7c5cad 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -182,10 +182,6 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
* (i.e. memory-hot-remove function)
*/
list_for_each_entry(info, &mem_device->res_list, list) {
- if (info->enabled) { /* just sanity check...*/
- num_enabled++;
- continue;
- }
/*
* If the memory block size is zero, please ignore it.
* Don't try to do the following memory hotplug flowchart.
@@ -239,19 +235,14 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
static void acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
{
- acpi_handle handle = mem_device->device->handle;
struct acpi_memory_info *info, *n;
- int nid = acpi_get_node(handle);
list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
if (!info->enabled)
continue;
- if (nid == NUMA_NO_NODE)
- nid = memory_add_physaddr_to_nid(info->start_addr);
-
acpi_unbind_memory_blocks(info);
- __remove_memory(nid, info->start_addr, info->length);
+ __remove_memory(info->start_addr, info->length);
list_del(&info->list);
kfree(info);
}
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index aa31a21f33d7..e3fd2dbf4eea 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -578,9 +578,9 @@ static struct memory_block *find_memory_block_by_id(unsigned long block_id)
/*
* Called under device_hotplug_lock.
*/
-struct memory_block *find_memory_block(struct mem_section *section)
+struct memory_block *find_memory_block(unsigned long section_nr)
{
- unsigned long block_id = memory_block_id(__section_nr(section));
+ unsigned long block_id = memory_block_id(section_nr);
return find_memory_block_by_id(block_id);
}
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 4a4ae868ad9f..8ec6b7dfbb0f 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -768,8 +768,6 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static int __ref get_nid_for_pfn(unsigned long pfn)
{
- if (!pfn_valid_within(pfn))
- return -1;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
if (system_state < SYSTEM_RUNNING)
return early_pfn_to_nid(pfn);
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index f374ea2c67ce..32bfb0487bdb 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -100,7 +100,7 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
else
memcpy(buf, dev->bounce_buf+offset, size);
offset += size;
- flush_kernel_dcache_page(bvec.bv_page);
+ flush_dcache_page(bvec.bv_page);
bvec_kunmap_irq(buf, &flags);
i++;
}
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index ac231cc36359..99e0f60c4c26 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -156,8 +156,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
if (rc)
continue;
- rc = remove_memory(dev_dax->target_node, range.start,
- range_len(&range));
+ rc = remove_memory(range.start, range_len(&range));
if (rc == 0) {
release_resource(data->res[i]);
kfree(data->res[i]);
diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c
index cb1a64a5c256..80a2c270d502 100644
--- a/drivers/mmc/host/jz4740_mmc.c
+++ b/drivers/mmc/host/jz4740_mmc.c
@@ -578,10 +578,6 @@ static bool jz4740_mmc_read_data(struct jz4740_mmc_host *host,
}
}
data->bytes_xfered += miter->length;
-
- /* This can go away once MIPS implements
- * flush_kernel_dcache_page */
- flush_dcache_page(miter->page);
}
sg_miter_stop(miter);
diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c
index a1bcde3395a6..f4c8e1a61f53 100644
--- a/drivers/mmc/host/mmc_spi.c
+++ b/drivers/mmc/host/mmc_spi.c
@@ -941,7 +941,7 @@ mmc_spi_data_do(struct mmc_spi_host *host, struct mmc_command *cmd,
/* discard mappings */
if (direction == DMA_FROM_DEVICE)
- flush_kernel_dcache_page(sg_page(sg));
+ flush_dcache_page(sg_page(sg));
kunmap(sg_page(sg));
if (dma_dev)
dma_unmap_page(dma_dev, dma_addr, PAGE_SIZE, dir);
diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c
index 761fd870d1db..053e241f593c 100644
--- a/drivers/of/kexec.c
+++ b/drivers/of/kexec.c
@@ -16,6 +16,7 @@
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <linux/types.h>
#define RNG_SEED_SIZE 128
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 09ed55de07d7..774986695dc4 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -677,7 +677,7 @@ static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
addr + size - 1);
- rc = remove_memory(vm->nid, addr, size);
+ rc = remove_memory(addr, size);
if (!rc) {
atomic64_sub(size, &vm->offline_size);
/*
@@ -720,7 +720,7 @@ static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
addr + size - 1);
- rc = offline_and_remove_memory(vm->nid, addr, size);
+ rc = offline_and_remove_memory(addr, size);
if (!rc) {
atomic64_sub(size, &vm->offline_size);
/*
diff --git a/fs/buffer.c b/fs/buffer.c
index 6290c3afdba4..f5384cff7e0c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -956,10 +956,20 @@ grow_dev_page(struct block_device *bdev, sector_t block,
end_block = init_page_buffers(page, bdev,
(sector_t)index << sizebits,
size);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x01;
+#endif
goto done;
}
- if (!try_to_free_buffers(page))
+ if (!try_to_free_buffers(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x02;
+#endif
goto failed;
+ }
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x04;
+#endif
}
/*
@@ -979,6 +989,9 @@ grow_dev_page(struct block_device *bdev, sector_t block,
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x08;
+#endif
failed:
unlock_page(page);
put_page(page);
@@ -1030,6 +1043,12 @@ __getblk_slow(struct block_device *bdev, sector_t block,
return NULL;
}
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_stamp = jiffies;
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+#endif
for (;;) {
struct buffer_head *bh;
int ret;
@@ -1041,6 +1060,24 @@ __getblk_slow(struct block_device *bdev, sector_t block,
ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
+
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ if (!time_after(jiffies, current->getblk_stamp + 3 * HZ))
+ continue;
+ printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n",
+ current->comm, current->pid, current->getblk_executed,
+ current->getblk_bh_count, current->getblk_bh_state,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1L :
+ bdev->bd_super->s_blocksize, size,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1 :
+ bdev->bd_super->s_blocksize_bits,
+ IS_ERR_OR_NULL(bdev->bd_inode) ? -1 :
+ bdev->bd_inode->i_blkbits);
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+ current->getblk_stamp = jiffies;
+#endif
}
}
@@ -3187,6 +3224,11 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*/
static inline int buffer_busy(struct buffer_head *bh)
{
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x80;
+ current->getblk_bh_count = atomic_read(&bh->b_count);
+ current->getblk_bh_state = bh->b_state;
+#endif
return atomic_read(&bh->b_count) |
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
@@ -3225,11 +3267,18 @@ int try_to_free_buffers(struct page *page)
int ret = 0;
BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
+ if (PageWriteback(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x10;
+#endif
return 0;
+ }
if (mapping == NULL) { /* can this still happen? */
ret = drop_buffers(page, &buffers_to_free);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x20;
+#endif
goto out;
}
@@ -3253,6 +3302,9 @@ int try_to_free_buffers(struct page *page)
if (ret)
cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x40;
+#endif
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index f00fcc4a4f72..e619c31b6bd9 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -3,6 +3,7 @@
* Implement the manual drop-all-pagecache function
*/
+#include <linux/pagemap.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
@@ -27,7 +28,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
* we need to reschedule to avoid softlockups.
*/
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
- (inode->i_mapping->nrpages == 0 && !need_resched())) {
+ (mapping_empty(inode->i_mapping) && !need_resched())) {
spin_unlock(&inode->i_lock);
continue;
}
diff --git a/fs/exec.c b/fs/exec.c
index 38f63451b928..eb2a99793018 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -574,7 +574,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
}
if (kmapped_page) {
- flush_kernel_dcache_page(kmapped_page);
+ flush_dcache_page(kmapped_page);
kunmap(kmapped_page);
put_arg_page(kmapped_page);
}
@@ -592,7 +592,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
ret = 0;
out:
if (kmapped_page) {
- flush_kernel_dcache_page(kmapped_page);
+ flush_dcache_page(kmapped_page);
kunmap(kmapped_page);
put_arg_page(kmapped_page);
}
@@ -634,7 +634,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
kaddr = kmap_atomic(page);
flush_arg_page(bprm, pos & PAGE_MASK, page);
memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
- flush_kernel_dcache_page(page);
+ flush_dcache_page(page);
kunmap_atomic(kaddr);
put_arg_page(page);
}
@@ -1026,9 +1026,9 @@ static int exec_mmap(struct mm_struct *mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
mm_update_next_owner(old_mm);
mmput(old_mm);
- return 0;
+ } else {
+ mmdrop_lazy_tlb(active_mm);
}
- mmdrop(active_mm);
return 0;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 06d04a74ab6c..867984e778c3 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -406,6 +406,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
inc_wb_stat(new_wb, WB_WRITEBACK);
}
+ if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
+ atomic_dec(&old_wb->writeback_inodes);
+ atomic_inc(&new_wb->writeback_inodes);
+ }
+
wb_get(new_wb);
/*
@@ -521,6 +526,9 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
*/
smp_mb();
+ if (IS_DAX(inode))
+ return false;
+
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
@@ -1996,7 +2004,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
static long wb_writeback(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
- unsigned long wb_start = jiffies;
long nr_pages = work->nr_pages;
unsigned long dirtied_before = jiffies;
struct inode *inode;
@@ -2050,8 +2057,6 @@ static long wb_writeback(struct bdi_writeback *wb,
progress = __writeback_inodes_wb(wb, work);
trace_writeback_written(wb, work);
- wb_update_bandwidth(wb, wb_start);
-
/*
* Did we write something? Try for more
*
diff --git a/fs/fs_context.c b/fs/fs_context.c
index de1985eae535..b7e43a780a62 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -254,7 +254,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
struct fs_context *fc;
int ret = -ENOMEM;
- fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+ fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT);
if (!fc)
return ERR_PTR(-ENOMEM);
@@ -649,7 +649,7 @@ const struct fs_context_operations legacy_fs_context_ops = {
*/
static int legacy_init_fs_context(struct fs_context *fc)
{
- fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL);
+ fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT);
if (!fc->fs_private)
return -ENOMEM;
fc->ops = &legacy_fs_context_ops;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 35472cba750e..9cdc6550b468 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -124,7 +124,7 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
hfsplus_cat_set_perms(inode, &folder->permissions);
if (inode == sbi->hidden_dir)
/* invisible and namelocked */
- folder->user_info.frFlags = cpu_to_be16(0x5000);
+ folder->info.user.frFlags = cpu_to_be16(0x5000);
return sizeof(*folder);
} else {
struct hfsplus_cat_file *file;
@@ -142,14 +142,14 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
if (cnid == inode->i_ino) {
hfsplus_cat_set_perms(inode, &file->permissions);
if (S_ISLNK(inode->i_mode)) {
- file->user_info.fdType =
+ file->info.user.fdType =
cpu_to_be32(HFSP_SYMLINK_TYPE);
- file->user_info.fdCreator =
+ file->info.user.fdCreator =
cpu_to_be32(HFSP_SYMLINK_CREATOR);
} else {
- file->user_info.fdType =
+ file->info.user.fdType =
cpu_to_be32(sbi->type);
- file->user_info.fdCreator =
+ file->info.user.fdCreator =
cpu_to_be32(sbi->creator);
}
if (HFSPLUS_FLG_IMMUTABLE &
@@ -158,11 +158,11 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
file->flags |=
cpu_to_be16(HFSPLUS_FILE_LOCKED);
} else {
- file->user_info.fdType =
+ file->info.user.fdType =
cpu_to_be32(HFSP_HARDLINK_TYPE);
- file->user_info.fdCreator =
+ file->info.user.fdCreator =
cpu_to_be32(HFSP_HFSPLUS_CREATOR);
- file->user_info.fdFlags =
+ file->info.user.fdFlags =
cpu_to_be16(0x100);
file->create_date =
HFSPLUS_I(sbi->hidden_dir)->create_date;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 84714bbccc12..135279a19b55 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -73,9 +73,9 @@ again:
goto fail;
}
cnid = be32_to_cpu(entry.file.id);
- if (entry.file.user_info.fdType ==
+ if (entry.file.info.user.fdType ==
cpu_to_be32(HFSP_HARDLINK_TYPE) &&
- entry.file.user_info.fdCreator ==
+ entry.file.info.user.fdCreator ==
cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
HFSPLUS_SB(sb)->hidden_dir &&
(entry.file.create_date ==
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 456e87aec7fd..005a043bc7ee 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -260,8 +260,10 @@ struct hfsplus_cat_folder {
__be32 access_date;
__be32 backup_date;
struct hfsplus_perm permissions;
- struct DInfo user_info;
- struct DXInfo finder_info;
+ struct {
+ struct DInfo user;
+ struct DXInfo finder;
+ } info;
__be32 text_encoding;
__be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+. */
} __packed;
@@ -294,8 +296,10 @@ struct hfsplus_cat_file {
__be32 access_date;
__be32 backup_date;
struct hfsplus_perm permissions;
- struct FInfo user_info;
- struct FXInfo finder_info;
+ struct {
+ struct FInfo user;
+ struct FXInfo finder;
+ } info;
__be32 text_encoding;
u32 reserved2;
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index e2855ceefd39..b21811baab0f 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -261,10 +261,8 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
struct hfs_find_data cat_fd;
hfsplus_cat_entry entry;
u16 cat_entry_flags, cat_entry_type;
- u16 folder_finderinfo_len = sizeof(struct DInfo) +
- sizeof(struct DXInfo);
- u16 file_finderinfo_len = sizeof(struct FInfo) +
- sizeof(struct FXInfo);
+ u16 folder_finderinfo_len = sizeof(entry.folder.info);
+ u16 file_finderinfo_len = sizeof(entry.file.info);
if ((!S_ISREG(inode->i_mode) &&
!S_ISDIR(inode->i_mode)) ||
@@ -296,7 +294,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
sizeof(hfsplus_cat_entry));
if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) {
if (size == folder_finderinfo_len) {
- memcpy(&entry.folder.user_info, value,
+ memcpy(&entry.folder.info, value,
folder_finderinfo_len);
hfs_bnode_write(cat_fd.bnode, &entry,
cat_fd.entryoffset,
@@ -309,7 +307,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
}
} else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) {
if (size == file_finderinfo_len) {
- memcpy(&entry.file.user_info, value,
+ memcpy(&entry.file.info, value,
file_finderinfo_len);
hfs_bnode_write(cat_fd.bnode, &entry,
cat_fd.entryoffset,
@@ -462,14 +460,14 @@ static ssize_t hfsplus_getxattr_finder_info(struct inode *inode,
if (entry_type == HFSPLUS_FOLDER) {
hfs_bnode_read(fd.bnode, folder_finder_info,
fd.entryoffset +
- offsetof(struct hfsplus_cat_folder, user_info),
+ offsetof(struct hfsplus_cat_folder, info.user),
folder_rec_len);
memcpy(value, folder_finder_info, folder_rec_len);
res = folder_rec_len;
} else if (entry_type == HFSPLUS_FILE) {
hfs_bnode_read(fd.bnode, file_finder_info,
fd.entryoffset +
- offsetof(struct hfsplus_cat_file, user_info),
+ offsetof(struct hfsplus_cat_file, info.user),
file_rec_len);
memcpy(value, file_finder_info, file_rec_len);
res = file_rec_len;
@@ -630,14 +628,14 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry,
len = sizeof(struct DInfo) + sizeof(struct DXInfo);
hfs_bnode_read(fd.bnode, folder_finder_info,
fd.entryoffset +
- offsetof(struct hfsplus_cat_folder, user_info),
+ offsetof(struct hfsplus_cat_folder, info.user),
len);
found_bit = find_first_bit((void *)folder_finder_info, len*8);
} else if (entry_type == HFSPLUS_FILE) {
len = sizeof(struct FInfo) + sizeof(struct FXInfo);
hfs_bnode_read(fd.bnode, file_finder_info,
fd.entryoffset +
- offsetof(struct hfsplus_cat_file, user_info),
+ offsetof(struct hfsplus_cat_file, info.user),
len);
found_bit = find_first_bit((void *)file_finder_info, len*8);
} else {
diff --git a/fs/inode.c b/fs/inode.c
index 84c528cd1955..cb41f02d8ced 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -426,11 +426,20 @@ void ihold(struct inode *inode)
}
EXPORT_SYMBOL(ihold);
-static void inode_lru_list_add(struct inode *inode)
+static void __inode_add_lru(struct inode *inode, bool rotate)
{
+ if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
+ return;
+ if (atomic_read(&inode->i_count))
+ return;
+ if (!(inode->i_sb->s_flags & SB_ACTIVE))
+ return;
+ if (!mapping_shrinkable(&inode->i_data))
+ return;
+
if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_inc(nr_unused);
- else
+ else if (rotate)
inode->i_state |= I_REFERENCED;
}
@@ -441,16 +450,11 @@ static void inode_lru_list_add(struct inode *inode)
*/
void inode_add_lru(struct inode *inode)
{
- if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
- I_FREEING | I_WILL_FREE)) &&
- !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
- inode_lru_list_add(inode);
+ __inode_add_lru(inode, false);
}
-
static void inode_lru_list_del(struct inode *inode)
{
-
if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_dec(nr_unused);
}
@@ -726,10 +730,6 @@ again:
/*
* Isolate the inode from the LRU in preparation for freeing it.
*
- * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed. If the inode has metadata buffers attached to
- * mapping->private_list then try to remove them.
- *
* If the inode has the I_REFERENCED flag set, then it means that it has been
* used recently - the flag is set in iput_final(). When we encounter such an
* inode, clear the flag and move it to the back of the LRU so it gets another
@@ -745,32 +745,40 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
struct inode *inode = container_of(item, struct inode, i_lru);
/*
- * we are inverting the lru lock/inode->i_lock here, so use a trylock.
- * If we fail to get the lock, just skip it.
+ * We are inverting the lru lock/inode->i_lock here, so use a
+ * trylock. If we fail to get the lock, just skip it.
*/
if (!spin_trylock(&inode->i_lock))
return LRU_SKIP;
/*
- * Referenced or dirty inodes are still in use. Give them another pass
- * through the LRU as we canot reclaim them now.
+ * Inodes can get referenced, redirtied, or repopulated while
+ * they're already on the LRU, and this can make them
+ * unreclaimable for a while. Remove them lazily here; iput,
+ * sync, or the last page cache deletion will requeue them.
*/
if (atomic_read(&inode->i_count) ||
- (inode->i_state & ~I_REFERENCED)) {
+ (inode->i_state & ~I_REFERENCED) ||
+ !mapping_shrinkable(&inode->i_data)) {
list_lru_isolate(lru, &inode->i_lru);
spin_unlock(&inode->i_lock);
this_cpu_dec(nr_unused);
return LRU_REMOVED;
}
- /* recently referenced inodes get one more pass */
+ /* Recently referenced inodes get one more pass */
if (inode->i_state & I_REFERENCED) {
inode->i_state &= ~I_REFERENCED;
spin_unlock(&inode->i_lock);
return LRU_ROTATE;
}
- if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ /*
+ * On highmem systems, mapping_shrinkable() permits dropping
+ * page cache in order to free up struct inodes: lowmem might
+ * be under pressure before the cache inside the highmem zone.
+ */
+ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
__iget(inode);
spin_unlock(&inode->i_lock);
spin_unlock(lru_lock);
@@ -1636,7 +1644,7 @@ static void iput_final(struct inode *inode)
if (!drop &&
!(inode->i_state & I_DONTCACHE) &&
(sb->s_flags & SB_ACTIVE)) {
- inode_add_lru(inode);
+ __inode_add_lru(inode, true);
spin_unlock(&inode->i_lock);
return;
}
diff --git a/fs/internal.h b/fs/internal.h
index 82e8eb32ff3d..693a88430da6 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -145,7 +145,6 @@ extern int vfs_open(const struct path *, struct file *);
* inode.c
*/
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
-extern void inode_add_lru(struct inode *inode);
extern int dentry_needs_remove_privs(struct dentry *dentry);
/*
diff --git a/fs/namei.c b/fs/namei.c
index bf6d8a738c59..ff866c07f4d2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4024,7 +4024,9 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
return -EPERM;
inode_lock(target);
- if (is_local_mountpoint(dentry))
+ if (IS_SWAPFILE(target))
+ error = -EPERM;
+ else if (is_local_mountpoint(dentry))
error = -EBUSY;
else {
error = security_inode_unlink(dir, dentry);
@@ -4526,6 +4528,10 @@ int vfs_rename(struct renamedata *rd)
else if (target)
inode_lock(target);
+ error = -EPERM;
+ if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
+ goto out;
+
error = -EBUSY;
if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
goto out;
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 68e8d61e28dd..62f8a7ac19c8 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -51,11 +51,9 @@ static const struct sysfs_ops nilfs_##name##_attr_ops = { \
#define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \
static void nilfs_##name##_attr_release(struct kobject *kobj) \
{ \
- struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \
- struct the_nilfs *nilfs = container_of(kobj->parent, \
- struct the_nilfs, \
- ns_##parent_name##_kobj); \
- subgroups = nilfs->ns_##parent_name##_subgroups; \
+ struct nilfs_sysfs_##parent_name##_subgroups *subgroups = container_of(kobj, \
+ struct nilfs_sysfs_##parent_name##_subgroups, \
+ sg_##name##_kobj); \
complete(&subgroups->sg_##name##_kobj_unregister); \
} \
static struct kobj_type nilfs_##name##_ktype = { \
@@ -81,12 +79,12 @@ static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \
err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \
#name); \
if (err) \
- return err; \
- return 0; \
+ kobject_put(kobj); \
+ return err; \
} \
static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \
{ \
- kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \
+ kobject_put(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \
}
/************************************************************************
@@ -197,14 +195,14 @@ int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root)
}
if (err)
- return err;
+ kobject_put(&root->snapshot_kobj);
- return 0;
+ return err;
}
void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root)
{
- kobject_del(&root->snapshot_kobj);
+ kobject_put(&root->snapshot_kobj);
}
/************************************************************************
@@ -986,7 +984,7 @@ int nilfs_sysfs_create_device_group(struct super_block *sb)
err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL,
"%s", sb->s_id);
if (err)
- goto free_dev_subgroups;
+ goto cleanup_dev_kobject;
err = nilfs_sysfs_create_mounted_snapshots_group(nilfs);
if (err)
@@ -1023,9 +1021,7 @@ delete_mounted_snapshots_group:
nilfs_sysfs_delete_mounted_snapshots_group(nilfs);
cleanup_dev_kobject:
- kobject_del(&nilfs->ns_dev_kobj);
-
-free_dev_subgroups:
+ kobject_put(&nilfs->ns_dev_kobj);
kfree(nilfs->ns_dev_subgroups);
failed_create_device_group:
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 48fd369c29a4..33fbdc823278 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2721,7 +2721,7 @@ int ocfs2_inode_lock_tracker(struct inode *inode,
return status;
}
}
- return tmp_oh ? 1 : 0;
+ return 1;
}
void ocfs2_inode_unlock_tracker(struct inode *inode,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2c46ff6ba4ea..307dd2222874 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -453,8 +453,12 @@ roll_back:
leave:
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && new_fe_bh != NULL)
+ ocfs2_set_links_count((struct ocfs2_dinode *)
+ new_fe_bh->b_data, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@@ -598,6 +602,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,
leave:
if (status < 0) {
if (*new_fe_bh) {
+ if (fe)
+ ocfs2_set_links_count(fe, 0);
brelse(*new_fe_bh);
*new_fe_bh = NULL;
}
@@ -634,7 +640,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
parent_fe_bh, handle, inode_ac,
fe_blkno, suballoc_loc, suballoc_bit);
- if (status < 0) {
+ if (status < 0 && !(OCFS2_I(inode)->ip_inode_lockres.l_flags &
+ OCFS2_LOCK_INITIALIZED)) {
u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
@@ -2027,8 +2034,12 @@ bail:
ocfs2_clusters_to_bytes(osb->sb, 1));
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && new_fe_bh != NULL)
+ ocfs2_set_links_count((struct ocfs2_dinode *)
+ new_fe_bh->b_data, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e5b5f7709d48..533d5836eb9a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -95,6 +95,7 @@
#include <linux/posix-timers.h>
#include <linux/time_namespace.h>
#include <linux/resctrl.h>
+#include <linux/cn_proc.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
@@ -1674,8 +1675,10 @@ static ssize_t comm_write(struct file *file, const char __user *buf,
if (!p)
return -ESRCH;
- if (same_thread_group(current, p))
+ if (same_thread_group(current, p)) {
set_task_comm(p, buffer);
+ proc_comm_connector(p);
+ }
else
count = -EINVAL;
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 172c86270b31..aea59e243bae 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -72,7 +72,7 @@ out:
return 0;
}
-static int seq_fdinfo_open(struct inode *inode, struct file *file)
+static int proc_fdinfo_access_allowed(struct inode *inode)
{
bool allowed = false;
struct task_struct *task = get_proc_task(inode);
@@ -86,13 +86,44 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file)
if (!allowed)
return -EACCES;
+ return 0;
+}
+
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
return single_open(file, seq_show, inode);
}
+static ssize_t seq_fdinfo_read(struct file *file, char __user *buf, size_t size,
+ loff_t *ppos)
+{
+ int ret = proc_fdinfo_access_allowed(file_inode(file));
+
+ if (ret)
+ return ret;
+
+ return seq_read(file, buf, size, ppos);
+}
+
+static loff_t seq_fdinfo_lseek(struct file *file, loff_t offset, int whence)
+{
+ int ret = proc_fdinfo_access_allowed(file_inode(file));
+
+ if (ret)
+ return ret;
+
+ return seq_lseek(file, offset, whence);
+}
+
static const struct file_operations proc_fdinfo_file_operations = {
.open = seq_fdinfo_open,
- .read = seq_read,
- .llseek = seq_lseek,
+ .read = seq_fdinfo_read,
+ .llseek = seq_fdinfo_lseek,
.release = single_release,
};
@@ -344,17 +375,43 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
{
+ int ret = proc_fdinfo_access_allowed(file_inode(file));
+
+ if (ret)
+ return ret;
+
return proc_readfd_common(file, ctx,
proc_fdinfo_instantiate);
}
+static loff_t proc_llseek_fdinfo(struct file *file, loff_t offset, int whence)
+{
+ int ret = proc_fdinfo_access_allowed(file_inode(file));
+
+ if (ret)
+ return ret;
+
+ return generic_file_llseek(file, offset, whence);
+}
+
+static int proc_open_fdinfo(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
const struct inode_operations proc_fdinfo_inode_operations = {
.lookup = proc_lookupfdinfo,
.setattr = proc_setattr,
};
const struct file_operations proc_fdinfo_operations = {
+ .open = proc_open_fdinfo,
.read = generic_read_dir,
.iterate_shared = proc_readfdinfo,
- .llseek = generic_file_llseek,
+ .llseek = proc_llseek_fdinfo,
};
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 982e694aae77..3f148759a5fd 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -614,11 +614,84 @@ static int release_kcore(struct inode *inode, struct file *file)
return 0;
}
+static vm_fault_t mmap_kcore_fault(struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_operations_struct kcore_mmap_ops = {
+ .fault = mmap_kcore_fault,
+};
+
+static int mmap_kcore(struct file *file, struct vm_area_struct *vma)
+{
+ size_t size = vma->vm_end - vma->vm_start;
+ u64 start, end, pfn;
+ int nphdr;
+ size_t data_offset;
+ size_t phdrs_len, notes_len;
+ struct kcore_list *m = NULL;
+ int ret = 0;
+
+ down_read(&kclist_lock);
+
+ get_kcore_size(&nphdr, &phdrs_len, &notes_len, &data_offset);
+
+ data_offset &= PAGE_MASK;
+ start = (u64)vma->vm_pgoff << PAGE_SHIFT;
+ if (start < data_offset) {
+ ret = -EINVAL;
+ goto out;
+ }
+ start = kc_offset_to_vaddr(start - data_offset);
+ end = start + size;
+
+ list_for_each_entry(m, &kclist_head, list) {
+ if (start >= m->addr && end <= m->addr + m->size)
+ break;
+ }
+
+ if (&m->list == &kclist_head) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC)) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+ vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_ops = &kcore_mmap_ops;
+
+ if (kern_addr_valid(start)) {
+ if (m->type == KCORE_RAM)
+ pfn = __pa(start) >> PAGE_SHIFT;
+ else if (m->type == KCORE_TEXT)
+ pfn = __pa_symbol(start) >> PAGE_SHIFT;
+ else {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = remap_pfn_range(vma, vma->vm_start, pfn, size,
+ vma->vm_page_prot);
+ } else {
+ ret = -EFAULT;
+ }
+
+out:
+ up_read(&kclist_lock);
+ return ret;
+}
+
static const struct proc_ops kcore_proc_ops = {
.proc_read = read_kcore,
.proc_open = open_kcore,
.proc_release = release_kcore,
.proc_lseek = default_llseek,
+ .proc_mmap = mmap_kcore,
};
/* just remember that we have to update kcore */
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 9f1077d94cde..4dcbcd506cb6 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -115,7 +115,10 @@ u64 stable_page_flags(struct page *page)
* it differentiates a memory hole from a page with no flags
*/
if (!page)
- return 1 << KPF_NOPAGE;
+ return BIT_ULL(KPF_NOPAGE);
+
+ if (pfn_zone_device_reserved(page_to_pfn(page)))
+ return BIT_ULL(KPF_RESERVED);
k = page->flags;
u = 0;
@@ -127,22 +130,22 @@ u64 stable_page_flags(struct page *page)
* simple test in page_mapped() is not enough.
*/
if (!PageSlab(page) && page_mapped(page))
- u |= 1 << KPF_MMAP;
+ u |= BIT_ULL(KPF_MMAP);
if (PageAnon(page))
- u |= 1 << KPF_ANON;
+ u |= BIT_ULL(KPF_ANON);
if (PageKsm(page))
- u |= 1 << KPF_KSM;
+ u |= BIT_ULL(KPF_KSM);
/*
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
if (PageHead(page))
- u |= 1 << KPF_COMPOUND_HEAD;
+ u |= BIT_ULL(KPF_COMPOUND_HEAD);
if (PageTail(page))
- u |= 1 << KPF_COMPOUND_TAIL;
+ u |= BIT_ULL(KPF_COMPOUND_TAIL);
if (PageHuge(page))
- u |= 1 << KPF_HUGE;
+ u |= BIT_ULL(KPF_HUGE);
/*
* PageTransCompound can be true for non-huge compound pages (slab
* pages or pages allocated by drivers with __GFP_COMP) because it
@@ -153,14 +156,13 @@ u64 stable_page_flags(struct page *page)
struct page *head = compound_head(page);
if (PageLRU(head) || PageAnon(head))
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_THP);
else if (is_huge_zero_page(head)) {
- u |= 1 << KPF_ZERO_PAGE;
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_ZERO_PAGE);
+ u |= BIT_ULL(KPF_THP);
}
} else if (is_zero_pfn(page_to_pfn(page)))
- u |= 1 << KPF_ZERO_PAGE;
-
+ u |= BIT_ULL(KPF_ZERO_PAGE);
/*
* Caveats on high order pages: page->_refcount will only be set
@@ -168,23 +170,23 @@ u64 stable_page_flags(struct page *page)
* SLOB won't set PG_slab at all on compound pages.
*/
if (PageBuddy(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
else if (page_count(page) == 0 && is_free_buddy_page(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
if (PageOffline(page))
- u |= 1 << KPF_OFFLINE;
+ u |= BIT_ULL(KPF_OFFLINE);
if (PageTable(page))
- u |= 1 << KPF_PGTABLE;
+ u |= BIT_ULL(KPF_PGTABLE);
if (page_is_idle(page))
- u |= 1 << KPF_IDLE;
+ u |= BIT_ULL(KPF_IDLE);
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
if (PageTail(page) && PageSlab(compound_head(page)))
- u |= 1 << KPF_SLAB;
+ u |= BIT_ULL(KPF_SLAB);
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
@@ -197,7 +199,7 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
if (PageSwapCache(page))
- u |= 1 << KPF_SWAPCACHE;
+ u |= BIT_ULL(KPF_SWAPCACHE);
u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f6e0f0c0d0e5..5c2d806e6ae5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1236,23 +1236,21 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
}
static __always_inline int validate_range(struct mm_struct *mm,
- __u64 *start, __u64 len)
+ __u64 start, __u64 len)
{
__u64 task_size = mm->task_size;
- *start = untagged_addr(*start);
-
- if (*start & ~PAGE_MASK)
+ if (start & ~PAGE_MASK)
return -EINVAL;
if (len & ~PAGE_MASK)
return -EINVAL;
if (!len)
return -EINVAL;
- if (*start < mmap_min_addr)
+ if (start < mmap_min_addr)
return -EINVAL;
- if (*start >= task_size)
+ if (start >= task_size)
return -EINVAL;
- if (len > task_size - *start)
+ if (len > task_size - start)
return -EINVAL;
return 0;
}
@@ -1316,7 +1314,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vm_flags |= VM_UFFD_MINOR;
}
- ret = validate_range(mm, &uffdio_register.range.start,
+ ret = validate_range(mm, uffdio_register.range.start,
uffdio_register.range.len);
if (ret)
goto out;
@@ -1522,7 +1520,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
goto out;
- ret = validate_range(mm, &uffdio_unregister.start,
+ ret = validate_range(mm, uffdio_unregister.start,
uffdio_unregister.len);
if (ret)
goto out;
@@ -1671,7 +1669,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
+ ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
if (ret)
goto out;
@@ -1711,7 +1709,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
sizeof(uffdio_copy)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
+ ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
if (ret)
goto out;
/*
@@ -1768,7 +1766,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
sizeof(uffdio_zeropage)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
+ ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
uffdio_zeropage.range.len);
if (ret)
goto out;
@@ -1818,7 +1816,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
sizeof(struct uffdio_writeprotect)))
return -EFAULT;
- ret = validate_range(ctx->mm, &uffdio_wp.range.start,
+ ret = validate_range(ctx->mm, uffdio_wp.range.start,
uffdio_wp.range.len);
if (ret)
return ret;
@@ -1866,7 +1864,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
sizeof(uffdio_continue) - (sizeof(__s64))))
goto out;
- ret = validate_range(ctx->mm, &uffdio_continue.range.start,
+ ret = validate_range(ctx->mm, uffdio_continue.range.start,
uffdio_continue.range.len);
if (ret)
goto out;
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 1d7edad9914f..33207004cfde 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -116,6 +116,7 @@ struct bdi_writeback {
struct list_head b_dirty_time; /* time stamps are dirty */
spinlock_t list_lock; /* protects the b_* lists */
+ atomic_t writeback_inodes; /* number of inodes under writeback */
struct percpu_counter stat[NR_WB_STAT_ITEMS];
unsigned long congested; /* WB_[a]sync_congested flags */
@@ -142,6 +143,7 @@ struct bdi_writeback {
spinlock_t work_lock; /* protects work_list & dwork scheduling */
struct list_head work_list;
struct delayed_work dwork; /* work item used for writeback */
+ struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
unsigned long dirty_sleep; /* last wait */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index a852876bb6e2..84ba698eea0b 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -288,6 +288,17 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
return inode->i_wb;
}
+static inline struct bdi_writeback *inode_to_wb_wbc(
+ struct inode *inode,
+ struct writeback_control *wbc)
+{
+ /*
+ * If wbc does not have inode attached, it means cgroup writeback was
+ * disabled when wbc started. Just use the default wb in that case.
+ */
+ return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb;
+}
+
/**
* unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
* @inode: target inode
@@ -366,6 +377,14 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
return &inode_to_bdi(inode)->wb;
}
+static inline struct bdi_writeback *inode_to_wb_wbc(
+ struct inode *inode,
+ struct writeback_control *wbc)
+{
+ return inode_to_wb(inode);
+}
+
+
static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2676dfbca3b7..50b7cca019d7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3258,6 +3258,7 @@ static inline void remove_inode_hash(struct inode *inode)
}
extern void inode_sb_list_add(struct inode *inode);
+extern void inode_add_lru(struct inode *inode);
extern int sb_set_blocksize(struct super_block *, int);
extern int sb_min_blocksize(struct super_block *, int);
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 85de3bd0b47d..7fb09a10aa92 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -168,10 +168,7 @@ static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page
}
#endif
-#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-static inline void flush_kernel_dcache_page(struct page *page)
-{
-}
+#ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE
static inline void flush_kernel_vmap_range(void *vaddr, int size)
{
}
@@ -184,9 +181,9 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
#ifndef clear_user_highpage
static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
{
- void *addr = kmap_atomic(page);
+ void *addr = kmap_local_page(page);
clear_user_page(addr, vaddr, page);
- kunmap_atomic(addr);
+ kunmap_local(addr);
}
#endif
@@ -218,9 +215,9 @@ alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
static inline void clear_highpage(struct page *page)
{
- void *kaddr = kmap_atomic(page);
+ void *kaddr = kmap_local_page(page);
clear_page(kaddr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
}
#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
@@ -243,7 +240,7 @@ static inline void zero_user_segments(struct page *page,
unsigned start1, unsigned end1,
unsigned start2, unsigned end2)
{
- void *kaddr = kmap_atomic(page);
+ void *kaddr = kmap_local_page(page);
unsigned int i;
BUG_ON(end1 > page_size(page) || end2 > page_size(page));
@@ -254,7 +251,7 @@ static inline void zero_user_segments(struct page *page,
if (end2 > start2)
memset(kaddr + start2, 0, end2 - start2);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
for (i = 0; i < compound_nr(page); i++)
flush_dcache_page(page + i);
}
@@ -279,11 +276,11 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
{
char *vfrom, *vto;
- vfrom = kmap_atomic(from);
- vto = kmap_atomic(to);
+ vfrom = kmap_local_page(from);
+ vto = kmap_local_page(to);
copy_user_page(vto, vfrom, vaddr, to);
- kunmap_atomic(vto);
- kunmap_atomic(vfrom);
+ kunmap_local(vto);
+ kunmap_local(vfrom);
}
#endif
@@ -294,11 +291,11 @@ static inline void copy_highpage(struct page *to, struct page *from)
{
char *vfrom, *vto;
- vfrom = kmap_atomic(from);
- vto = kmap_atomic(to);
+ vfrom = kmap_local_page(from);
+ vto = kmap_local_page(to);
copy_page(vto, vfrom);
- kunmap_atomic(vto);
- kunmap_atomic(vfrom);
+ kunmap_local(vto);
+ kunmap_local(vfrom);
}
#endif
@@ -356,14 +353,16 @@ static inline void memcpy_to_page(struct page *page, size_t offset,
VM_BUG_ON(offset + len > PAGE_SIZE);
memcpy(to + offset, from, len);
+ flush_dcache_page(page);
kunmap_local(to);
}
static inline void memzero_page(struct page *page, size_t offset, size_t len)
{
- char *addr = kmap_atomic(page);
+ char *addr = kmap_local_page(page);
memset(addr + offset, 0, len);
- kunmap_atomic(addr);
+ flush_dcache_page(page);
+ kunmap_local(addr);
}
#endif /* _LINUX_HIGHMEM_H */
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index cbf46f56d105..4a53c3ca86bd 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -209,7 +209,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
*/
#define for_each_mem_range(i, p_start, p_end) \
__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, \
- MEMBLOCK_NONE, p_start, p_end, NULL)
+ MEMBLOCK_HOTPLUG, p_start, p_end, NULL)
/**
* for_each_mem_range_rev - reverse iterate through memblock areas from
@@ -220,7 +220,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
*/
#define for_each_mem_range_rev(i, p_start, p_end) \
__for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \
- MEMBLOCK_NONE, p_start, p_end, NULL)
+ MEMBLOCK_HOTPLUG, p_start, p_end, NULL)
/**
* for_each_reserved_mem_range - iterate over all reserved memblock areas
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index af9c44bb1e42..e2a42a3bedf6 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -105,14 +105,6 @@ struct mem_cgroup_reclaim_iter {
unsigned int generation;
};
-struct lruvec_stat {
- long count[NR_VM_NODE_STAT_ITEMS];
-};
-
-struct batched_lruvec_stat {
- s32 count[NR_VM_NODE_STAT_ITEMS];
-};
-
/*
* Bitmap and deferred work of shrinker::id corresponding to memcg-aware
* shrinkers, which have elements charged to this memcg.
@@ -123,24 +115,30 @@ struct shrinker_info {
unsigned long *map;
};
+struct lruvec_stats_percpu {
+ /* Local (CPU and cgroup) state */
+ long state[NR_VM_NODE_STAT_ITEMS];
+
+ /* Delta calculation for lockless upward propagation */
+ long state_prev[NR_VM_NODE_STAT_ITEMS];
+};
+
+struct lruvec_stats {
+ /* Aggregated (CPU and subtree) state */
+ long state[NR_VM_NODE_STAT_ITEMS];
+
+ /* Pending child counts during tree propagation */
+ long state_pending[NR_VM_NODE_STAT_ITEMS];
+};
+
/*
* per-node information in memory controller.
*/
struct mem_cgroup_per_node {
struct lruvec lruvec;
- /*
- * Legacy local VM stats. This should be struct lruvec_stat and
- * cannot be optimized to struct batched_lruvec_stat. Because
- * the threshold of the lruvec_stat_cpu can be as big as
- * MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this
- * filed has no upper limit.
- */
- struct lruvec_stat __percpu *lruvec_stat_local;
-
- /* Subtree VM stats (batched updates) */
- struct batched_lruvec_stat __percpu *lruvec_stat_cpu;
- atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS];
+ struct lruvec_stats_percpu __percpu *lruvec_stats_percpu;
+ struct lruvec_stats lruvec_stats;
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
@@ -704,13 +702,34 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
page_counter_read(&memcg->memory);
}
-int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);
+int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);
+static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
+ gfp_t gfp)
+{
+ if (mem_cgroup_disabled())
+ return 0;
+ return __mem_cgroup_charge(folio, mm, gfp);
+}
+
int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
gfp_t gfp, swp_entry_t entry);
void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
-void mem_cgroup_uncharge(struct folio *folio);
-void mem_cgroup_uncharge_list(struct list_head *page_list);
+void __mem_cgroup_uncharge(struct folio *folio);
+static inline void mem_cgroup_uncharge(struct folio *folio)
+{
+ if (mem_cgroup_disabled())
+ return;
+ __mem_cgroup_uncharge(folio);
+}
+
+void __mem_cgroup_uncharge_list(struct list_head *page_list);
+static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
+{
+ if (mem_cgroup_disabled())
+ return;
+ __mem_cgroup_uncharge_list(page_list);
+}
void mem_cgroup_migrate(struct folio *old, struct folio *new);
@@ -978,7 +997,7 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
return node_page_state(lruvec_pgdat(lruvec), idx);
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- x = atomic_long_read(&pn->lruvec_stat[idx]);
+ x = READ_ONCE(pn->lruvec_stats.state[idx]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -998,7 +1017,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
for_each_possible_cpu(cpu)
- x += per_cpu(pn->lruvec_stat_local->count[idx], cpu);
+ x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -1006,6 +1025,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return x;
}
+void mem_cgroup_flush_stats(void);
+
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val);
void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);
@@ -1433,6 +1454,10 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return node_page_state(lruvec_pgdat(lruvec), idx);
}
+static inline void mem_cgroup_flush_stats(void)
+{
+}
+
static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx, int val)
{
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 97e92e8b556a..d9a0b61cd432 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -90,7 +90,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
void remove_memory_block_devices(unsigned long start, unsigned long size);
extern void memory_dev_init(void);
extern int memory_notify(unsigned long val, void *v);
-extern struct memory_block *find_memory_block(struct mem_section *);
+extern struct memory_block *find_memory_block(unsigned long section_nr);
typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
extern int walk_memory_blocks(unsigned long start, unsigned long size,
void *arg, walk_memory_blocks_func_t func);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index a7fd2c3ccb77..068e3dcf4690 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -130,8 +130,7 @@ static inline bool movable_node_is_enabled(void)
return movable_node_enabled;
}
-extern void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap);
+extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap);
extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages,
struct vmem_altmap *altmap);
@@ -293,9 +292,9 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
extern void try_offline_node(int nid);
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
-extern int remove_memory(int nid, u64 start, u64 size);
-extern void __remove_memory(int nid, u64 start, u64 size);
-extern int offline_and_remove_memory(int nid, u64 start, u64 size);
+extern int remove_memory(u64 start, u64 size);
+extern void __remove_memory(u64 start, u64 size);
+extern int offline_and_remove_memory(u64 start, u64 size);
#else
static inline void try_offline_node(int nid) {}
@@ -305,12 +304,12 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
return -EINVAL;
}
-static inline int remove_memory(int nid, u64 start, u64 size)
+static inline int remove_memory(u64 start, u64 size)
{
return -EBUSY;
}
-static inline void __remove_memory(int nid, u64 start, u64 size) {}
+static inline void __remove_memory(u64 start, u64 size) {}
#endif /* CONFIG_MEMORY_HOTREMOVE */
extern void set_zone_contiguous(struct zone *zone);
@@ -339,8 +338,8 @@ extern void sparse_remove_section(struct mem_section *ms,
unsigned long map_offset, struct vmem_altmap *altmap);
extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
unsigned long pnum);
-extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
- unsigned long nr_pages);
+extern struct zone *zone_for_pfn_range(int online_type, int nid,
+ unsigned long start_pfn, unsigned long nr_pages);
extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
struct mhp_params *params);
void arch_remove_linear_mapping(u64 start, u64 size);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 0aaf91b496e2..0117e1ec7b1e 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -6,6 +6,7 @@
#ifndef _LINUX_MEMPOLICY_H
#define _LINUX_MEMPOLICY_H 1
+#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/dax.h>
@@ -43,7 +44,7 @@ struct mm_struct;
* to 1, representing the caller of mpol_dup().
*/
struct mempolicy {
- atomic_t refcnt;
+ refcount_t refcnt;
unsigned short mode; /* See MPOL_* above */
unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
nodemask_t nodes; /* interleave/bind/perfer */
@@ -94,7 +95,7 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
static inline void mpol_get(struct mempolicy *pol)
{
if (pol)
- atomic_inc(&pol->refcnt);
+ refcount_inc(&pol->refcnt);
}
extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
@@ -184,6 +185,8 @@ extern bool vma_migratable(struct vm_area_struct *vma);
extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
extern void mpol_put_task_policy(struct task_struct *);
+extern bool numa_demotion_enabled;
+
#else
struct mempolicy {};
@@ -292,5 +295,7 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
{
return NULL;
}
+
+#define numa_demotion_enabled false
#endif /* CONFIG_NUMA */
#endif
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index c0e9d35889e8..119f130ef8f1 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -131,6 +131,7 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
}
#ifdef CONFIG_ZONE_DEVICE
+bool pfn_zone_device_reserved(unsigned long pfn);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
@@ -143,6 +144,11 @@ unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
unsigned long memremap_compat_align(void);
#else
+static inline bool pfn_zone_device_reserved(unsigned long pfn)
+{
+ return false;
+}
+
static inline void *devm_memremap_pages(struct device *dev,
struct dev_pagemap *pgmap)
{
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 6a01de9faff5..b5ad813dce6e 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -28,6 +28,7 @@ enum migrate_reason {
MR_NUMA_MISPLACED,
MR_CONTIG_RANGE,
MR_LONGTERM_PIN,
+ MR_DEMOTION,
MR_TYPES
};
@@ -41,7 +42,8 @@ extern int migrate_page(struct address_space *mapping,
struct page *newpage, struct page *page,
enum migrate_mode mode);
extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
- unsigned long private, enum migrate_mode mode, int reason);
+ unsigned long private, enum migrate_mode mode, int reason,
+ unsigned int *ret_succeeded);
extern struct page *alloc_migration_target(struct page *page, unsigned long private);
extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
@@ -60,7 +62,7 @@ int folio_migrate_mapping(struct address_space *mapping,
static inline void putback_movable_pages(struct list_head *l) {}
static inline int migrate_pages(struct list_head *l, new_page_t new,
free_page_t free, unsigned long private, enum migrate_mode mode,
- int reason)
+ int reason, unsigned int *ret_succeeded)
{ return -ENOSYS; }
static inline struct page *alloc_migration_target(struct page *page,
unsigned long private)
@@ -170,6 +172,14 @@ struct migrate_vma {
int migrate_vma_setup(struct migrate_vma *args);
void migrate_vma_pages(struct migrate_vma *migrate);
void migrate_vma_finalize(struct migrate_vma *migrate);
+int next_demotion_node(int node);
+
+#else /* CONFIG_MIGRATION disabled: */
+
+static inline int next_demotion_node(int node)
+{
+ return NUMA_NO_NODE;
+}
#endif /* CONFIG_MIGRATION */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 99450a007574..7bc883cd1207 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -795,38 +795,6 @@ static inline int is_vmalloc_or_module_addr(const void *x)
}
#endif
-extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
-static inline void *kvmalloc(size_t size, gfp_t flags)
-{
- return kvmalloc_node(size, flags, NUMA_NO_NODE);
-}
-static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
-{
- return kvmalloc_node(size, flags | __GFP_ZERO, node);
-}
-static inline void *kvzalloc(size_t size, gfp_t flags)
-{
- return kvmalloc(size, flags | __GFP_ZERO);
-}
-
-static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
-{
- size_t bytes;
-
- if (unlikely(check_mul_overflow(n, size, &bytes)))
- return NULL;
-
- return kvmalloc(bytes, flags);
-}
-
-static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
-{
- return kvmalloc_array(n, size, flags | __GFP_ZERO);
-}
-
-extern void kvfree(const void *addr);
-extern void kvfree_sensitive(const void *addr, size_t len);
-
static inline int head_compound_mapcount(struct page *head)
{
return atomic_read(compound_mapcount_ptr(head)) + 1;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1066afc9a06d..85e9f5b517c8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -159,11 +159,13 @@ struct page {
struct mm_struct *pt_mm; /* x86 pgds only */
atomic_t pt_frag_refcount; /* powerpc */
};
+#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
spinlock_t *ptl;
#else
spinlock_t ptl;
#endif
+#endif
};
struct { /* ZONE_DEVICE pages */
/** @pgmap: Points to the hosting device page map. */
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index c1bc6731125c..1b222f8039d1 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -22,7 +22,12 @@
#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \
IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
+
+#if USE_SPLIT_PTE_PTLOCKS
#define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8)
+#else
+#define ALLOC_SPLIT_PTLOCKS 0
+#endif
/*
* The per task VMA cache array:
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fcb535560028..5c0318509f9e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1342,7 +1342,6 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
return NULL;
return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
}
-extern unsigned long __section_nr(struct mem_section *ms);
extern size_t mem_section_usage_size(void);
/*
@@ -1365,7 +1364,7 @@ extern size_t mem_section_usage_size(void);
#define SECTION_TAINT_ZONE_DEVICE (1UL<<4)
#define SECTION_MAP_LAST_BIT (1UL<<5)
#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
-#define SECTION_NID_SHIFT 3
+#define SECTION_NID_SHIFT 6
static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
@@ -1525,18 +1524,6 @@ void sparse_init(void);
#define subsection_map_init(_pfn, _nr_pages) do {} while (0)
#endif /* CONFIG_SPARSEMEM */
-/*
- * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
- * need to check pfn validity within that MAX_ORDER_NR_PAGES block.
- * pfn_valid_within() should be used in this case; we optimise this away
- * when we have no holes within a MAX_ORDER_NR_PAGES block.
- */
-#ifdef CONFIG_HOLES_IN_ZONE
-#define pfn_valid_within(pfn) pfn_valid(pfn)
-#else
-#define pfn_valid_within(pfn) (1)
-#endif
-
#endif /* !__GENERATING_BOUNDS.H */
#endif /* !__ASSEMBLY__ */
#endif /* _LINUX_MMZONE_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 83c1a798265f..79ec90e97e94 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -24,6 +24,56 @@ static inline bool mapping_empty(struct address_space *mapping)
}
/*
+ * mapping_shrinkable - test if page cache state allows inode reclaim
+ * @mapping: the page cache mapping
+ *
+ * This checks the mapping's cache state for the pupose of inode
+ * reclaim and LRU management.
+ *
+ * The caller is expected to hold the i_lock, but is not required to
+ * hold the i_pages lock, which usually protects cache state. That's
+ * because the i_lock and the list_lru lock that protect the inode and
+ * its LRU state don't nest inside the irq-safe i_pages lock.
+ *
+ * Cache deletions are performed under the i_lock, which ensures that
+ * when an inode goes empty, it will reliably get queued on the LRU.
+ *
+ * Cache additions do not acquire the i_lock and may race with this
+ * check, in which case we'll report the inode as shrinkable when it
+ * has cache pages. This is okay: the shrinker also checks the
+ * refcount and the referenced bit, which will be elevated or set in
+ * the process of adding new cache pages to an inode.
+ */
+static inline bool mapping_shrinkable(struct address_space *mapping)
+{
+ void *head;
+
+ /*
+ * On highmem systems, there could be lowmem pressure from the
+ * inodes before there is highmem pressure from the page
+ * cache. Make inodes shrinkable regardless of cache state.
+ */
+ if (IS_ENABLED(CONFIG_HIGHMEM))
+ return true;
+
+ /* Cache completely empty? Shrink away. */
+ head = rcu_access_pointer(mapping->i_pages.xa_head);
+ if (!head)
+ return true;
+
+ /*
+ * The xarray stores single offset-0 entries directly in the
+ * head pointer, which allows non-resident page cache entries
+ * to escape the shadow shrinker's list of xarray nodes. The
+ * inode shrinker needs to pick them up under memory pressure.
+ */
+ if (!xa_is_node(head) && xa_is_value(head))
+ return true;
+
+ return false;
+}
+
+/*
* Bits in mapping->flags.
*/
enum mapping_flags {
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index e704b1a4c06c..221c3c6438a7 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -12,6 +12,8 @@
#include <linux/memcontrol.h>
#include <linux/highmem.h>
+#include <linux/refcount.h>
+
/*
* The anon_vma heads a list of private "related" vmas, to scan if
* an anonymous page pointing to this anon_vma needs to be unmapped:
@@ -36,7 +38,7 @@ struct anon_vma {
* the reference is responsible for clearing up the
* anon_vma if they are the last user on release
*/
- atomic_t refcount;
+ refcount_t refcount;
/*
* Count of child anon_vmas and VMAs which points to this anon_vma.
@@ -100,14 +102,14 @@ enum ttu_flags {
#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
- atomic_inc(&anon_vma->refcount);
+ refcount_inc(&anon_vma->refcount);
}
void __put_anon_vma(struct anon_vma *anon_vma);
static inline void put_anon_vma(struct anon_vma *anon_vma)
{
- if (atomic_dec_and_test(&anon_vma->refcount))
+ if (refcount_dec_and_test(&anon_vma->refcount))
__put_anon_vma(anon_vma);
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c64119aa2e60..0e1d14c2bf20 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -991,6 +991,7 @@ struct task_struct {
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long last_switch_count;
unsigned long last_switch_time;
+ unsigned long killed_time;
#endif
/* Filesystem information: */
struct fs_struct *fs;
@@ -1403,6 +1404,13 @@ struct task_struct {
struct llist_head kretprobe_instances;
#endif
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ unsigned long getblk_stamp;
+ unsigned int getblk_executed;
+ unsigned int getblk_bh_count;
+ unsigned long getblk_bh_state;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index e24b1fe348e3..8a5d6db25430 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,27 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
}
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+ mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
+ mmdrop(mm);
+ } else {
+ /*
+ * mmdrop_lazy_tlb must provide a full memory barrier, see the
+ * membarrier comment in finish_task_switch which relies on
+ * this.
+ */
+ smp_mb();
+ }
+}
+
/**
* mmget() - Pin the address space associated with a &struct mm_struct.
* @mm: The address space to pin.
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 083f3ce550bc..2c0d80cca6b8 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -732,6 +732,38 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
return kmalloc_node(size, flags | __GFP_ZERO, node);
}
+void *kvmalloc_node(size_t size, gfp_t flags, int node);
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+ return kvmalloc_node(size, flags, NUMA_NO_NODE);
+}
+static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
+{
+ return kvmalloc_node(size, flags | __GFP_ZERO, node);
+}
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+ return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+ size_t bytes;
+
+ if (unlikely(check_mul_overflow(n, size, &bytes)))
+ return NULL;
+
+ return kvmalloc(bytes, flags);
+}
+
+static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
+{
+ return kvmalloc_array(n, size, flags | __GFP_ZERO);
+}
+
+void kvfree(const void *addr);
+void kvfree_sensitive(const void *addr, size_t len);
+
unsigned int kmem_cache_size(struct kmem_cache *s);
void __init kmem_cache_init_late(void);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 81801ba78b1e..cdf0957a88a4 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -415,7 +415,7 @@ static inline bool node_reclaim_enabled(void)
extern void check_move_unevictable_pages(struct pagevec *pvec);
-extern int kswapd_run(int nid);
+extern void kswapd_run(int nid);
extern void kswapd_stop(int nid);
#ifdef CONFIG_SWAP
@@ -728,7 +728,13 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
#endif
#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-extern void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask);
+extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask);
+static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+{
+ if (mem_cgroup_disabled())
+ return;
+ __cgroup_throttle_swaprate(page, gfp_mask);
+}
#else
static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
@@ -737,8 +743,22 @@ static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
#ifdef CONFIG_MEMCG_SWAP
extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
-extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
-extern void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
+extern int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
+static inline int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+{
+ if (mem_cgroup_disabled())
+ return 0;
+ return __mem_cgroup_try_charge_swap(page, entry);
+}
+
+extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
+static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+{
+ if (mem_cgroup_disabled())
+ return;
+ __mem_cgroup_uncharge_swap(entry, nr_pages);
+}
+
extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
extern bool mem_cgroup_swap_full(struct page *page);
#else
diff --git a/include/linux/threads.h b/include/linux/threads.h
index 18d5a74bcc3d..c34173e6c5f1 100644
--- a/include/linux/threads.h
+++ b/include/linux/threads.h
@@ -38,7 +38,7 @@
* Define a minimum number of pids per cpu. Heuristically based
* on original pid max of 32k for 32 cpus. Also, increase the
* minimum settable value for pid_max on the running system based
- * on similar defaults. See kernel/pid.c:pidmap_init() for details.
+ * on similar defaults. See kernel/pid.c:pid_idr_init() for details.
*/
#define PIDS_PER_CPU_DEFAULT 1024
#define PIDS_PER_CPU_MIN 8
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index ae0dd1948c2b..a185cc75ff52 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -33,6 +33,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
PGREUSE,
PGSTEAL_KSWAPD,
PGSTEAL_DIRECT,
+ PGDEMOTE_KSWAPD,
+ PGDEMOTE_DIRECT,
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_DIRECT_THROTTLE,
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 6d28bc433c1c..6a2f51ebbfd3 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -37,7 +37,7 @@ extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
extern void vmpressure_init(struct vmpressure *vmpr);
extern void vmpressure_cleanup(struct vmpressure *vmpr);
extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
-extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
+extern struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr);
extern int vmpressure_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd,
const char *args);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 5383f7e39816..1f34ddf284dc 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -379,7 +379,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
-void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time);
+void wb_update_bandwidth(struct bdi_writeback *wb);
void balance_dirty_pages_ratelimited(struct address_space *mapping);
bool wb_over_bg_thresh(struct bdi_writeback *wb);
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 9fb2a3bbcdfb..779f3fad9ecd 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -21,7 +21,8 @@
EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \
EM( MR_NUMA_MISPLACED, "numa_misplaced") \
EM( MR_CONTIG_RANGE, "contig_range") \
- EMe(MR_LONGTERM_PIN, "longterm_pin")
+ EM( MR_LONGTERM_PIN, "longterm_pin") \
+ EMe(MR_DEMOTION, "demotion")
/*
* First define the enums in the above macros to be exported to userspace
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 19a00bc7fe86..046d0ccba4cd 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -22,6 +22,7 @@ enum {
MPOL_BIND,
MPOL_INTERLEAVE,
MPOL_LOCAL,
+ MPOL_PREFERRED_MANY,
MPOL_MAX, /* always last member of enum */
};
diff --git a/init/main.c b/init/main.c
index f5b8246e8aa1..5339b8026cec 100644
--- a/init/main.c
+++ b/init/main.c
@@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(static_key_initialized);
unsigned int reset_devices;
EXPORT_SYMBOL(reset_devices);
-static int __init set_reset_devices(char *str)
+static int __init set_reset_devices(char *str __always_unused)
{
reset_devices = 1;
return 1;
@@ -231,13 +231,13 @@ static bool __init obsolete_checksetup(char *line)
unsigned long loops_per_jiffy = (1<<12);
EXPORT_SYMBOL(loops_per_jiffy);
-static int __init debug_kernel(char *str)
+static int __init debug_kernel(char *str __always_unused)
{
console_loglevel = CONSOLE_LOGLEVEL_DEBUG;
return 0;
}
-static int __init quiet_kernel(char *str)
+static int __init quiet_kernel(char *str __always_unused)
{
console_loglevel = CONSOLE_LOGLEVEL_QUIET;
return 0;
@@ -470,7 +470,7 @@ static void __init setup_boot_config(void)
get_boot_config_from_initrd(NULL, NULL);
}
-static int __init warn_bootconfig(char *str)
+static int __init warn_bootconfig(char *str __always_unused)
{
pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n");
return 0;
@@ -496,7 +496,8 @@ static void __init repair_env_string(char *param, char *val)
/* Anything after -- gets handed straight to init. */
static int __init set_init_arg(char *param, char *val,
- const char *unused, void *arg)
+ const char *unused __always_unused,
+ void *arg __always_unused)
{
unsigned int i;
@@ -521,7 +522,8 @@ static int __init set_init_arg(char *param, char *val,
* unused parameters (modprobe will find them in /proc/cmdline).
*/
static int __init unknown_bootoption(char *param, char *val,
- const char *unused, void *arg)
+ const char *unused __always_unused,
+ void *arg __always_unused)
{
size_t len = strlen(param);
@@ -716,7 +718,8 @@ noinline void __ref rest_init(void)
/* Check for early params. */
static int __init do_early_param(char *param, char *val,
- const char *unused, void *arg)
+ const char *unused __always_unused,
+ void *arg __always_unused)
{
const struct obs_kernel_param *p;
@@ -1334,8 +1337,10 @@ static const char *initcall_level_names[] __initdata = {
"late",
};
-static int __init ignore_unknown_bootoption(char *param, char *val,
- const char *unused, void *arg)
+static int __init ignore_unknown_bootoption(char *param __always_unused,
+ char *val __always_unused,
+ const char *unused __always_unused,
+ void *arg __always_unused)
{
return 0;
}
@@ -1473,7 +1478,7 @@ void __weak free_initmem(void)
free_initmem_default(POISON_FREE_INITMEM);
}
-static int __ref kernel_init(void *unused)
+static int __ref kernel_init(void *unused __always_unused)
{
int ret;
diff --git a/ipc/shm.c b/ipc/shm.c
index 748933e376ca..70a41171b8bb 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -173,6 +173,14 @@ static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace
return container_of(ipcp, struct shmid_kernel, shm_perm);
}
+static inline bool is_shm_in_ns(struct ipc_namespace *ns, struct shmid_kernel *shp)
+{
+ int idx = ipcid_to_idx(shp->shm_perm.id);
+ struct shmid_kernel *tshp = shm_obtain_object(ns, idx);
+
+ return !IS_ERR(tshp) && tshp == shp;
+}
+
/*
* shm_lock_(check_) routines are called in the paths where the rwsem
* is not necessarily held.
@@ -415,7 +423,7 @@ void exit_shm(struct task_struct *task)
list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
shp->shm_creator = NULL;
- if (shm_may_destroy(ns, shp)) {
+ if (is_shm_in_ns(ns, shp) && shm_may_destroy(ns, shp)) {
shm_lock_by_ptr(shp);
shm_destroy(ns, shp);
}
diff --git a/ipc/util.c b/ipc/util.c
index 0027e47626b7..45bb8ce6c42c 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -447,8 +447,8 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
{
if (ipcp->key != IPC_PRIVATE)
- rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode,
- ipc_kht_params);
+ WARN_ON(rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode,
+ ipc_kht_params));
}
/**
@@ -498,7 +498,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
{
int idx = ipcid_to_idx(ipcp->id);
- idr_remove(&ids->ipcs_idr, idx);
+ WARN_ON(idr_remove(&ids->ipcs_idr, idx) != ipcp);
ipc_kht_remove(ids, ipcp);
ids->in_use--;
ipcp->deleted = true;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 804b847912dc..79882ce1f2b5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -603,7 +603,7 @@ static int finish_cpu(unsigned int cpu)
*/
if (mm != &init_mm)
idle->active_mm = &init_mm;
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
return 0;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 9a89e7f36acb..3e9ec041a4e5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -475,7 +475,7 @@ static void exit_mm(void)
__set_current_state(TASK_RUNNING);
mmap_read_lock(mm);
}
- mmgrab(mm);
+ mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
diff --git a/kernel/fork.c b/kernel/fork.c
index e8b41e212110..af408d06e1fb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -674,6 +674,53 @@ static void check_mm(struct mm_struct *mm)
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+static void do_shoot_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (current->active_mm == mm) {
+ WARN_ON_ONCE(current->mm);
+ current->active_mm = &init_mm;
+ switch_mm(mm, &init_mm, current);
+ }
+}
+
+static void do_check_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void shoot_lazy_tlbs(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * IPI overheads have not found to be expensive, but they could
+ * be reduced in a number of possible ways, for example (in
+ * roughly increasing order of complexity):
+ * - A batch of mms requiring IPIs could be gathered and freed
+ * at once.
+ * - CPUs could store their active mm somewhere that can be
+ * remotely checked without a lock, to filter out
+ * false-positives in the cpumask.
+ * - After mm_users or mm_count reaches zero, switching away
+ * from the mm could clear mm_cpumask to reduce some IPIs
+ * (some batching or delaying would help).
+ * - A delayed freeing and RCU-like quiescing sequence based on
+ * mm switching to avoid IPIs completely.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+ if (IS_ENABLED(CONFIG_DEBUG_VM))
+ on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+ } else {
+ /*
+ * In this case, lazy tlb mms are refounted and would not reach
+ * __mmdrop until all CPUs have switched away and mmdrop()ed.
+ */
+ }
+}
+
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
@@ -683,6 +730,10 @@ void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
+
+ /* Ensure no CPUs are using this as their lazy tlb mm */
+ shoot_lazy_tlbs(mm);
+
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 9888e2bc8c76..8cc07e7f29aa 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -145,6 +145,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
touch_nmi_watchdog();
}
+static void check_killed_task(struct task_struct *t, unsigned long timeout)
+{
+ unsigned long stamp = t->killed_time;
+
+ /*
+ * Ensure the task is not frozen.
+ * Also, skip vfork and any other user process that freezer should skip.
+ */
+ if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
+ return;
+ /*
+ * Skip threads which are already inside do_exit(), for exit_mm() etc.
+ * might take many seconds.
+ */
+ if (t->flags & PF_EXITING)
+ return;
+ if (!stamp) {
+ stamp = jiffies;
+ if (!stamp)
+ stamp++;
+ t->killed_time = stamp;
+ return;
+ }
+ if (time_is_after_jiffies(stamp + timeout * HZ))
+ return;
+ trace_sched_process_hang(t);
+ if (sysctl_hung_task_panic) {
+ console_verbose();
+ hung_task_call_panic = true;
+ }
+ /*
+ * This thread failed to terminate for more than
+ * sysctl_hung_task_timeout_secs seconds, complain:
+ */
+ pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n",
+ t->comm, t->pid, (jiffies - stamp) / HZ);
+ sched_show_task(t);
+ hung_task_show_lock = true;
+ touch_nmi_watchdog();
+}
+
/*
* To avoid extending the RCU grace period for an unbounded amount of time,
* periodically exit the critical section and enter a new one.
@@ -196,6 +237,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
goto unlock;
last_break = jiffies;
}
+ /* Check threads which are about to terminate. */
+ if (unlikely(fatal_signal_pending(t)))
+ check_killed_task(t, timeout);
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5b37a8567168..83ed75d531b4 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1350,14 +1350,19 @@ void kthread_use_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(tsk->mm);
+ /*
+ * It's possible that tsk->active_mm == mm here, but we must
+ * still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), because lazy
+ * mm may not have its own refcount (see mmgrab/drop_lazy_tlb()).
+ */
+ mmgrab(mm);
+
task_lock(tsk);
/* Hold off tlb flush IPIs while switching mm's */
local_irq_disable();
active_mm = tsk->active_mm;
- if (active_mm != mm) {
- mmgrab(mm);
+ if (active_mm != mm)
tsk->active_mm = mm;
- }
tsk->mm = mm;
membarrier_update_current_mm(mm);
switch_mm_irqs_off(active_mm, mm, tsk);
@@ -1374,12 +1379,9 @@ void kthread_use_mm(struct mm_struct *mm)
* memory barrier after storing to tsk->mm, before accessing
* user-space memory. A full memory barrier for membarrier
* {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
- * mmdrop(), or explicitly with smp_mb().
+ * mmdrop_lazy_tlb().
*/
- if (active_mm != mm)
- mmdrop(active_mm);
- else
- smp_mb();
+ mmdrop_lazy_tlb(active_mm);
to_kthread(tsk)->oldfs = force_uaccess_begin();
}
@@ -1411,10 +1413,13 @@ void kthread_unuse_mm(struct mm_struct *mm)
local_irq_disable();
tsk->mm = NULL;
membarrier_update_current_mm(NULL);
+ mmgrab_lazy_tlb(mm);
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
local_irq_enable();
task_unlock(tsk);
+
+ mmdrop(mm);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ca43239a255a..cb5a25a8a0cc 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -51,7 +51,8 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
mutex_lock(&pid_caches_mutex);
/* Name collision forces to do allocation under mutex. */
if (!*pkc)
- *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0);
+ *pkc = kmem_cache_create(name, len, 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0);
mutex_unlock(&pid_caches_mutex);
/* current can fail, but someone else can succeed. */
return READ_ONCE(*pkc);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 67fd8faa7a7f..014ce8b385c7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4517,7 +4517,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
{
struct rq *rq = this_rq();
- struct mm_struct *mm = rq->prev_mm;
+ struct mm_struct *mm = NULL;
long prev_state;
/*
@@ -4536,7 +4536,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
current->comm, current->pid, preempt_count()))
preempt_count_set(FORK_PREEMPT_COUNT);
- rq->prev_mm = NULL;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+ mm = rq->prev_lazy_mm;
+ rq->prev_lazy_mm = NULL;
+#endif
/*
* A task struct has one reference for the use as "current".
@@ -4576,13 +4579,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* rq->curr, before returning to userspace, so provide them here:
*
* - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
- * provided by mmdrop(),
+ * provided by mmdrop_lazy_tlb(),
* - a sync_core for SYNC_CORE.
*/
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
}
+
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
@@ -4645,9 +4649,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
/*
* kernel -> kernel lazy + transfer active
- * user -> kernel lazy + mmgrab() active
+ * user -> kernel lazy + mmgrab_lazy_tlb() active
*
- * kernel -> user switch + mmdrop() active
+ * kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
*/
if (!next->mm) { // to kernel
@@ -4655,7 +4659,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
next->active_mm = prev->active_mm;
if (prev->mm) // from user
- mmgrab(prev->active_mm);
+ mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
} else { // to user
@@ -4671,9 +4675,20 @@ context_switch(struct rq *rq, struct task_struct *prev,
switch_mm_irqs_off(prev->active_mm, next->mm, next);
if (!prev->mm) { // from kernel
- /* will mmdrop() in finish_task_switch(). */
- rq->prev_mm = prev->active_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+ /* Will mmdrop_lazy_tlb() in finish_task_switch(). */
+ rq->prev_lazy_mm = prev->active_mm;
prev->active_mm = NULL;
+#else
+ /*
+ * Without MMU_LAZY_TLB_REFCOUNT there is no lazy
+ * tracking (because no rq->prev_lazy_mm) in
+ * finish_task_switch, so no mmdrop_lazy_tlb(), so no
+ * memory barrier for membarrier (see the membarrier
+ * comment in finish_task_switch()). Do it here.
+ */
+ smp_mb();
+#endif
}
}
@@ -9070,7 +9085,7 @@ void __init sched_init(void)
/*
* The boot idle thread does lazy MMU switching as well:
*/
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
enter_lazy_tlb(&init_mm, current);
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a2c6f6ae6392..0fa583db5c4a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -967,7 +967,9 @@ struct rq {
struct task_struct *idle;
struct task_struct *stop;
unsigned long next_balance;
- struct mm_struct *prev_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+ struct mm_struct *prev_lazy_mm;
+#endif
unsigned int clock_update_flags;
u64 clock;
diff --git a/kernel/sys.c b/kernel/sys.c
index ef1a78f5d71c..6ec50924b517 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1960,13 +1960,6 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map)
error = -EINVAL;
/*
- * @brk should be after @end_data in traditional maps.
- */
- if (prctl_map->start_brk <= prctl_map->end_data ||
- prctl_map->brk <= prctl_map->end_data)
- goto out;
-
- /*
* Neither we should allow to override limits if they set.
*/
if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 272f4a272f8c..82d6ff6d85cd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3275,7 +3275,7 @@ static struct ctl_table fs_table[] = {
.procname = "protected_symlinks",
.data = &sysctl_protected_symlinks,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
@@ -3284,7 +3284,7 @@ static struct ctl_table fs_table[] = {
.procname = "protected_hardlinks",
.data = &sysctl_protected_hardlinks,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
@@ -3293,7 +3293,7 @@ static struct ctl_table fs_table[] = {
.procname = "protected_fifos",
.data = &sysctl_protected_fifos,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &two,
@@ -3302,7 +3302,7 @@ static struct ctl_table fs_table[] = {
.procname = "protected_regular",
.data = &sysctl_protected_regular,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &two,
diff --git a/lib/Kconfig b/lib/Kconfig
index d241fe476fda..5c9c0687f76d 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -683,9 +683,6 @@ config PARMAN
config OBJAGG
tristate "objagg" if COMPILE_TEST
-config STRING_SELFTEST
- tristate "Test string functions"
-
endmenu
config GENERIC_IOREMAP
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b41252e664d7..4654e838d68b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1806,6 +1806,12 @@ config IO_STRICT_DEVMEM
menu "$(SRCARCH) Debugging"
+config DEBUG_AID_FOR_SYZBOT
+ bool "Additional debug code for syzbot"
+ default n
+ help
+ This option is intended for testing by syzbot.
+
source "arch/$(SRCARCH)/Kconfig.debug"
endmenu
@@ -2077,8 +2083,9 @@ config TEST_MIN_HEAP
If unsure, say N.
config TEST_SORT
- tristate "Array-based sort test"
- depends on DEBUG_KERNEL || m
+ tristate "Array-based sort test" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
help
This option enables the self-test function of 'sort()' at boot,
or at module load time.
@@ -2179,6 +2186,9 @@ config ASYNC_RAID6_TEST
config TEST_HEXDUMP
tristate "Test functions located in the hexdump module at runtime"
+config STRING_SELFTEST
+ tristate "Test string functions at runtime"
+
config TEST_STRING_HELPERS
tristate "Test functions located in the string_helpers module at runtime"
@@ -2456,8 +2466,7 @@ config SLUB_KUNIT_TEST
config RATIONAL_KUNIT_TEST
tristate "KUnit test for rational.c" if !KUNIT_ALL_TESTS
- depends on KUNIT
- select RATIONAL
+ depends on KUNIT && RATIONAL
default KUNIT_ALL_TESTS
help
This builds the rational math unit test.
diff --git a/lib/math/Kconfig b/lib/math/Kconfig
index f19bc9734fa7..0634b428d0cb 100644
--- a/lib/math/Kconfig
+++ b/lib/math/Kconfig
@@ -14,4 +14,4 @@ config PRIME_NUMBERS
If unsure, say N.
config RATIONAL
- bool
+ tristate
diff --git a/lib/math/rational.c b/lib/math/rational.c
index c0ab51d8fbb9..ec59d426ea63 100644
--- a/lib/math/rational.c
+++ b/lib/math/rational.c
@@ -13,6 +13,7 @@
#include <linux/export.h>
#include <linux/minmax.h>
#include <linux/limits.h>
+#include <linux/module.h>
/*
* calculate best rational approximation for a given fraction
@@ -106,3 +107,5 @@ void rational_best_approximation(
}
EXPORT_SYMBOL(rational_best_approximation);
+
+MODULE_LICENSE("GPL v2");
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 27efa6178153..627aa84f8bbd 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -887,9 +887,8 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
miter->__offset += miter->consumed;
miter->__remaining -= miter->consumed;
- if ((miter->__flags & SG_MITER_TO_SG) &&
- !PageSlab(miter->page))
- flush_kernel_dcache_page(miter->page);
+ if (miter->__flags & SG_MITER_TO_SG)
+ flush_dcache_page(miter->page);
if (miter->__flags & SG_MITER_ATOMIC) {
WARN_ON_ONCE(preemptible());
diff --git a/lib/string.c b/lib/string.c
index 77bd0b1d3296..4fec38fc6e58 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -33,6 +33,23 @@
#include <asm/word-at-a-time.h>
#include <asm/page.h>
+#define BYTES_LONG sizeof(long)
+#define WORD_MASK (BYTES_LONG - 1)
+#define MIN_THRESHOLD (BYTES_LONG * 2)
+
+/* convenience union to avoid cast between different pointer types */
+union types {
+ u8 *as_u8;
+ unsigned long *as_ulong;
+ uintptr_t as_uptr;
+};
+
+union const_types {
+ const u8 *as_u8;
+ const unsigned long *as_ulong;
+ uintptr_t as_uptr;
+};
+
#ifndef __HAVE_ARCH_STRNCASECMP
/**
* strncasecmp - Case insensitive, length-limited string comparison
@@ -793,10 +810,38 @@ EXPORT_SYMBOL(__sysfs_match_string);
*/
void *memset(void *s, int c, size_t count)
{
- char *xs = s;
+ union types dest = { .as_u8 = s };
+
+ if (count >= MIN_THRESHOLD) {
+ unsigned long cu = (unsigned long)c;
+ /* Compose an ulong with 'c' repeated 4/8 times */
+#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
+ cu *= 0x0101010101010101UL;
+#else
+ cu |= cu << 8;
+ cu |= cu << 16;
+ /* Suppress warning on 32 bit machines */
+ cu |= (cu << 16) << 16;
+#endif
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+ /*
+ * Fill the buffer one byte at time until
+ * the destination is word aligned.
+ */
+ for (; count && dest.as_uptr & WORD_MASK; count--)
+ *dest.as_u8++ = c;
+ }
+
+ /* Copy using the largest size allowed */
+ for (; count >= BYTES_LONG; count -= BYTES_LONG)
+ *dest.as_ulong++ = cu;
+ }
+
+ /* copy the remainder */
while (count--)
- *xs++ = c;
+ *dest.as_u8++ = c;
+
return s;
}
EXPORT_SYMBOL(memset);
@@ -869,6 +914,13 @@ EXPORT_SYMBOL(memset64);
#endif
#ifndef __HAVE_ARCH_MEMCPY
+
+#ifdef __BIG_ENDIAN
+#define MERGE_UL(h, l, d) ((h) << ((d) * 8) | (l) >> ((BYTES_LONG - (d)) * 8))
+#else
+#define MERGE_UL(h, l, d) ((h) >> ((d) * 8) | (l) << ((BYTES_LONG - (d)) * 8))
+#endif
+
/**
* memcpy - Copy one area of memory to another
* @dest: Where to copy to
@@ -880,14 +932,64 @@ EXPORT_SYMBOL(memset64);
*/
void *memcpy(void *dest, const void *src, size_t count)
{
- char *tmp = dest;
- const char *s = src;
+ union const_types s = { .as_u8 = src };
+ union types d = { .as_u8 = dest };
+ int distance = 0;
+
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+ if (count < MIN_THRESHOLD)
+ goto copy_remainder;
+ /* Copy a byte at time until destination is aligned. */
+ for (; d.as_uptr & WORD_MASK; count--)
+ *d.as_u8++ = *s.as_u8++;
+
+ distance = s.as_uptr & WORD_MASK;
+ }
+
+ if (distance) {
+ unsigned long last, next;
+
+ /*
+ * s is distance bytes ahead of d, and d just reached
+ * the alignment boundary. Move s backward to word align it
+ * and shift data to compensate for distance, in order to do
+ * word-by-word copy.
+ */
+ s.as_u8 -= distance;
+
+ next = s.as_ulong[0];
+ for (; count >= BYTES_LONG; count -= BYTES_LONG) {
+ last = next;
+ next = s.as_ulong[1];
+
+ d.as_ulong[0] = MERGE_UL(last, next, distance);
+
+ d.as_ulong++;
+ s.as_ulong++;
+ }
+
+ /* Restore s with the original offset. */
+ s.as_u8 += distance;
+ } else {
+ /*
+ * If the source and dest lower bits are the same, do a simple
+ * 32/64 bit wide copy.
+ */
+ for (; count >= BYTES_LONG; count -= BYTES_LONG)
+ *d.as_ulong++ = *s.as_ulong++;
+ }
+
+copy_remainder:
while (count--)
- *tmp++ = *s++;
+ *d.as_u8++ = *s.as_u8++;
+
return dest;
}
EXPORT_SYMBOL(memcpy);
+
+#undef MERGE_UL
+
#endif
#ifndef __HAVE_ARCH_MEMMOVE
@@ -901,19 +1003,13 @@ EXPORT_SYMBOL(memcpy);
*/
void *memmove(void *dest, const void *src, size_t count)
{
- char *tmp;
- const char *s;
+ if (dest < src || src + count <= dest)
+ return memcpy(dest, src, count);
+
+ if (dest > src) {
+ const char *s = src + count;
+ char *tmp = dest + count;
- if (dest <= src) {
- tmp = dest;
- s = src;
- while (count--)
- *tmp++ = *s++;
- } else {
- tmp = dest;
- tmp += count;
- s = src;
- s += count;
while (count--)
*--tmp = *--s;
}
diff --git a/lib/test_sort.c b/lib/test_sort.c
index 52edbe10f2e5..be02e3a098cf 100644
--- a/lib/test_sort.c
+++ b/lib/test_sort.c
@@ -1,4 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
+
+#include <kunit/test.h>
+
#include <linux/sort.h>
#include <linux/slab.h>
#include <linux/module.h>
@@ -7,18 +10,17 @@
#define TEST_LEN 1000
-static int __init cmpint(const void *a, const void *b)
+static int cmpint(const void *a, const void *b)
{
return *(int *)a - *(int *)b;
}
-static int __init test_sort_init(void)
+static void test_sort(struct kunit *test)
{
- int *a, i, r = 1, err = -ENOMEM;
+ int *a, i, r = 1;
- a = kmalloc_array(TEST_LEN, sizeof(*a), GFP_KERNEL);
- if (!a)
- return err;
+ a = kunit_kmalloc_array(test, TEST_LEN, sizeof(*a), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a);
for (i = 0; i < TEST_LEN; i++) {
r = (r * 725861) % 6599;
@@ -27,24 +29,20 @@ static int __init test_sort_init(void)
sort(a, TEST_LEN, sizeof(*a), cmpint, NULL);
- err = -EINVAL;
for (i = 0; i < TEST_LEN-1; i++)
- if (a[i] > a[i+1]) {
- pr_err("test has failed\n");
- goto exit;
- }
- err = 0;
- pr_info("test passed\n");
-exit:
- kfree(a);
- return err;
+ KUNIT_ASSERT_LE(test, a[i], a[i + 1]);
}
-static void __exit test_sort_exit(void)
-{
-}
+static struct kunit_case sort_test_cases[] = {
+ KUNIT_CASE(test_sort),
+ {}
+};
+
+static struct kunit_suite sort_test_suite = {
+ .name = "lib_sort",
+ .test_cases = sort_test_cases,
+};
-module_init(test_sort_init);
-module_exit(test_sort_exit);
+kunit_test_suites(&sort_test_suite);
MODULE_LICENSE("GPL");
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index 01e9543de566..e14993bc84d2 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -35,6 +35,9 @@ __param(int, test_repeat_count, 1,
__param(int, test_loop_count, 1000000,
"Set test loop counter");
+__param(int, nr_pages, 0,
+ "Set number of pages for fix_size_alloc_test(default: 1)");
+
__param(int, run_test_mask, INT_MAX,
"Set tests specified in the mask.\n\n"
"\t\tid: 1, name: fix_size_alloc_test\n"
@@ -262,7 +265,7 @@ static int fix_size_alloc_test(void)
int i;
for (i = 0; i < test_loop_count; i++) {
- ptr = vmalloc(3 * PAGE_SIZE);
+ ptr = vmalloc((nr_pages > 0 ? nr_pages:1) * PAGE_SIZE);
if (!ptr)
return -1;
diff --git a/mm/Kconfig b/mm/Kconfig
index 40a9bfcd5062..1f9bd3371765 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -96,9 +96,6 @@ config HAVE_FAST_GUP
depends on MMU
bool
-config HOLES_IN_ZONE
- bool
-
# Don't discard allocated memory used to track "memory" and "reserved" memblocks
# after early boot, so it can still be used to test for validity of memory.
# Also, memblocks are updated with memory hot(un)plug.
@@ -744,7 +741,7 @@ config DEFERRED_STRUCT_PAGE_INIT
config IDLE_PAGE_TRACKING
bool "Enable idle page tracking"
- depends on SYSFS && MMU
+ depends on SYSFS && MMU && BROKEN
select PAGE_EXTENSION if !64BIT
help
This feature allows to estimate the amount of user pages that have
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 271f2ca862c8..6122c78ce914 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -271,6 +271,14 @@ void wb_wakeup_delayed(struct bdi_writeback *wb)
spin_unlock_bh(&wb->work_lock);
}
+static void wb_update_bandwidth_workfn(struct work_struct *work)
+{
+ struct bdi_writeback *wb = container_of(to_delayed_work(work),
+ struct bdi_writeback, bw_dwork);
+
+ wb_update_bandwidth(wb);
+}
+
/*
* Initial write bandwidth: 100 MB/s
*/
@@ -293,6 +301,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->b_dirty_time);
spin_lock_init(&wb->list_lock);
+ atomic_set(&wb->writeback_inodes, 0);
wb->bw_time_stamp = jiffies;
wb->balanced_dirty_ratelimit = INIT_BW;
wb->dirty_ratelimit = INIT_BW;
@@ -302,6 +311,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
spin_lock_init(&wb->work_lock);
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+ INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
@@ -350,6 +360,7 @@ static void wb_shutdown(struct bdi_writeback *wb)
mod_delayed_work(bdi_wq, &wb->dwork, 0);
flush_delayed_work(&wb->dwork);
WARN_ON(!list_empty(&wb->work_list));
+ flush_delayed_work(&wb->bw_dwork);
}
static void wb_exit(struct bdi_writeback *wb)
@@ -398,12 +409,12 @@ static void cgwb_release_workfn(struct work_struct *work)
blkcg_unpin_online(blkcg);
fprop_local_destroy_percpu(&wb->memcg_completions);
- percpu_ref_exit(&wb->refcnt);
spin_lock_irq(&cgwb_lock);
list_del(&wb->offline_node);
spin_unlock_irq(&cgwb_lock);
+ percpu_ref_exit(&wb->refcnt);
wb_exit(wb);
WARN_ON_ONCE(!list_empty(&wb->b_attached));
kfree_rcu(wb, rcu);
diff --git a/mm/compaction.c b/mm/compaction.c
index 6f77577be248..b86bcfea321b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -306,16 +306,14 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
* is necessary for the block to be a migration source/target.
*/
do {
- if (pfn_valid_within(pfn)) {
- if (check_source && PageLRU(page)) {
- clear_pageblock_skip(page);
- return true;
- }
+ if (check_source && PageLRU(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
- if (check_target && PageBuddy(page)) {
- clear_pageblock_skip(page);
- return true;
- }
+ if (check_target && PageBuddy(page)) {
+ clear_pageblock_skip(page);
+ return true;
}
page += (1 << PAGE_ALLOC_COSTLY_ORDER);
@@ -585,8 +583,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
break;
nr_scanned++;
- if (!pfn_valid_within(blockpfn))
- goto isolate_fail;
/*
* For compound pages such as THP and hugetlbfs, we can save
@@ -885,8 +881,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
cond_resched();
}
- if (!pfn_valid_within(low_pfn))
- goto isolate_fail;
nr_scanned++;
page = pfn_to_page(low_pfn);
@@ -2398,7 +2392,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
- MR_COMPACTION);
+ MR_COMPACTION, NULL);
trace_mm_compaction_migratepages(cc->nr_migratepages, err,
&cc->migratepages);
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 1c922691aa61..de9def6f7ce1 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -29,6 +29,8 @@
#include <linux/start_kernel.h>
#include <linux/sched/mm.h>
#include <linux/io.h>
+
+#include <asm/cacheflush.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
@@ -58,10 +60,40 @@
#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
#define RANDOM_NZVALUE GENMASK(7, 0)
-static void __init pte_basic_tests(unsigned long pfn, int idx)
+struct pgtable_debug_args {
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ p4d_t *start_p4dp;
+ pud_t *start_pudp;
+ pmd_t *start_pmdp;
+ pgtable_t start_ptep;
+
+ unsigned long vaddr;
+ pgprot_t page_prot;
+ pgprot_t page_prot_none;
+
+ unsigned long pud_pfn;
+ unsigned long pmd_pfn;
+ unsigned long pte_pfn;
+
+ unsigned long fixed_pgd_pfn;
+ unsigned long fixed_p4d_pfn;
+ unsigned long fixed_pud_pfn;
+ unsigned long fixed_pmd_pfn;
+ unsigned long fixed_pte_pfn;
+};
+
+static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx)
{
pgprot_t prot = protection_map[idx];
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, prot);
unsigned long val = idx, *ptr = &val;
pr_debug("Validating PTE basic (%pGv)\n", ptr);
@@ -86,11 +118,9 @@ static void __init pte_basic_tests(unsigned long pfn, int idx)
WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte))));
}
-static void __init pte_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pte_t *ptep,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+static void __init pte_advanced_tests(struct pgtable_debug_args *args)
{
+ struct page *page;
pte_t pte;
/*
@@ -100,39 +130,49 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
*/
pr_debug("Validating PTE advanced\n");
- pte = pfn_pte(pfn, prot);
- set_pte_at(mm, vaddr, ptep, pte);
- ptep_set_wrprotect(mm, vaddr, ptep);
- pte = ptep_get(ptep);
+ page = (args->pte_pfn != ULONG_MAX) ?
+ pfn_to_page(args->pte_pfn) : NULL;
+ if (!page) {
+ pr_debug("%s: Skipped\n", __func__);
+ return;
+ }
+
+ pte = pfn_pte(args->pte_pfn, args->page_prot);
+ set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+ flush_dcache_page(page);
+ ptep_set_wrprotect(args->mm, args->vaddr, args->ptep);
+ pte = ptep_get(args->ptep);
WARN_ON(pte_write(pte));
- ptep_get_and_clear(mm, vaddr, ptep);
- pte = ptep_get(ptep);
+ ptep_get_and_clear(args->mm, args->vaddr, args->ptep);
+ pte = ptep_get(args->ptep);
WARN_ON(!pte_none(pte));
- pte = pfn_pte(pfn, prot);
+ pte = pfn_pte(args->pte_pfn, args->page_prot);
pte = pte_wrprotect(pte);
pte = pte_mkclean(pte);
- set_pte_at(mm, vaddr, ptep, pte);
+ set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+ flush_dcache_page(page);
pte = pte_mkwrite(pte);
pte = pte_mkdirty(pte);
- ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
- pte = ptep_get(ptep);
+ ptep_set_access_flags(args->vma, args->vaddr, args->ptep, pte, 1);
+ pte = ptep_get(args->ptep);
WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
- ptep_get_and_clear_full(mm, vaddr, ptep, 1);
- pte = ptep_get(ptep);
+ ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1);
+ pte = ptep_get(args->ptep);
WARN_ON(!pte_none(pte));
- pte = pfn_pte(pfn, prot);
+ pte = pfn_pte(args->pte_pfn, args->page_prot);
pte = pte_mkyoung(pte);
- set_pte_at(mm, vaddr, ptep, pte);
- ptep_test_and_clear_young(vma, vaddr, ptep);
- pte = ptep_get(ptep);
+ set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+ flush_dcache_page(page);
+ ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep);
+ pte = ptep_get(args->ptep);
WARN_ON(pte_young(pte));
}
-static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_savedwrite_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none);
if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
return;
@@ -143,7 +183,7 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_basic_tests(unsigned long pfn, int idx)
+static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx)
{
pgprot_t prot = protection_map[idx];
unsigned long val = idx, *ptr = &val;
@@ -153,7 +193,7 @@ static void __init pmd_basic_tests(unsigned long pfn, int idx)
return;
pr_debug("Validating PMD basic (%pGv)\n", ptr);
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, prot);
/*
* This test needs to be executed after the given page table entry
@@ -181,57 +221,63 @@ static void __init pmd_basic_tests(unsigned long pfn, int idx)
WARN_ON(!pmd_bad(pmd_mkhuge(pmd)));
}
-static void __init pmd_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pmd_t *pmdp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot, pgtable_t pgtable)
+static void __init pmd_advanced_tests(struct pgtable_debug_args *args)
{
+ struct page *page;
pmd_t pmd;
+ unsigned long vaddr = (args->vaddr & HPAGE_PMD_MASK);
if (!has_transparent_hugepage())
return;
pr_debug("Validating PMD advanced\n");
- /* Align the address wrt HPAGE_PMD_SIZE */
- vaddr &= HPAGE_PMD_MASK;
+ page = (args->pmd_pfn != ULONG_MAX) ?
+ pfn_to_page(args->pmd_pfn) : NULL;
+ if (!page) {
+ pr_debug("%s: Skipped\n", __func__);
+ return;
+ }
- pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+ pgtable_trans_huge_deposit(args->mm, args->pmdp, args->start_ptep);
- pmd = pfn_pmd(pfn, prot);
- set_pmd_at(mm, vaddr, pmdp, pmd);
- pmdp_set_wrprotect(mm, vaddr, pmdp);
- pmd = READ_ONCE(*pmdp);
+ pmd = pfn_pmd(args->pmd_pfn, args->page_prot);
+ set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
+ flush_dcache_page(page);
+ pmdp_set_wrprotect(args->mm, vaddr, args->pmdp);
+ pmd = READ_ONCE(*(args->pmdp));
WARN_ON(pmd_write(pmd));
- pmdp_huge_get_and_clear(mm, vaddr, pmdp);
- pmd = READ_ONCE(*pmdp);
+ pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp);
+ pmd = READ_ONCE(*(args->pmdp));
WARN_ON(!pmd_none(pmd));
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->pmd_pfn, args->page_prot);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
- set_pmd_at(mm, vaddr, pmdp, pmd);
+ set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
+ flush_dcache_page(page);
pmd = pmd_mkwrite(pmd);
pmd = pmd_mkdirty(pmd);
- pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
- pmd = READ_ONCE(*pmdp);
+ pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1);
+ pmd = READ_ONCE(*(args->pmdp));
WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
- pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
- pmd = READ_ONCE(*pmdp);
+ pmdp_huge_get_and_clear_full(args->vma, vaddr, args->pmdp, 1);
+ pmd = READ_ONCE(*(args->pmdp));
WARN_ON(!pmd_none(pmd));
- pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+ pmd = pmd_mkhuge(pfn_pmd(args->pmd_pfn, args->page_prot));
pmd = pmd_mkyoung(pmd);
- set_pmd_at(mm, vaddr, pmdp, pmd);
- pmdp_test_and_clear_young(vma, vaddr, pmdp);
- pmd = READ_ONCE(*pmdp);
+ set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
+ flush_dcache_page(page);
+ pmdp_test_and_clear_young(args->vma, vaddr, args->pmdp);
+ pmd = READ_ONCE(*(args->pmdp));
WARN_ON(pmd_young(pmd));
/* Clear the pte entries */
- pmdp_huge_get_and_clear(mm, vaddr, pmdp);
- pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
+ pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp);
+ pgtable_trans_huge_withdraw(args->mm, args->pmdp);
}
-static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_leaf_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -239,7 +285,7 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PMD leaf\n");
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
/*
* PMD based THP is a leaf entry.
@@ -248,7 +294,7 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_leaf(pmd));
}
-static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -259,13 +305,13 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PMD saved write\n");
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none);
WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx)
+static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx)
{
pgprot_t prot = protection_map[idx];
unsigned long val = idx, *ptr = &val;
@@ -275,7 +321,7 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int
return;
pr_debug("Validating PUD basic (%pGv)\n", ptr);
- pud = pfn_pud(pfn, prot);
+ pud = pfn_pud(args->fixed_pud_pfn, prot);
/*
* This test needs to be executed after the given page table entry
@@ -296,7 +342,7 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int
WARN_ON(pud_dirty(pud_wrprotect(pud_mkclean(pud))));
WARN_ON(!pud_dirty(pud_wrprotect(pud_mkdirty(pud))));
- if (mm_pmd_folded(mm))
+ if (mm_pmd_folded(args->mm))
return;
/*
@@ -306,58 +352,64 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int
WARN_ON(!pud_bad(pud_mkhuge(pud)));
}
-static void __init pud_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pud_t *pudp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+static void __init pud_advanced_tests(struct pgtable_debug_args *args)
{
+ struct page *page;
+ unsigned long vaddr = (args->vaddr & HPAGE_PUD_MASK);
pud_t pud;
if (!has_transparent_hugepage())
return;
pr_debug("Validating PUD advanced\n");
- /* Align the address wrt HPAGE_PUD_SIZE */
- vaddr &= HPAGE_PUD_MASK;
+ page = (args->pud_pfn != ULONG_MAX) ?
+ pfn_to_page(args->pud_pfn) : NULL;
+ if (!page) {
+ pr_debug("%s: Skipped\n", __func__);
+ return;
+ }
- pud = pfn_pud(pfn, prot);
- set_pud_at(mm, vaddr, pudp, pud);
- pudp_set_wrprotect(mm, vaddr, pudp);
- pud = READ_ONCE(*pudp);
+ pud = pfn_pud(args->pud_pfn, args->page_prot);
+ set_pud_at(args->mm, vaddr, args->pudp, pud);
+ flush_dcache_page(page);
+ pudp_set_wrprotect(args->mm, vaddr, args->pudp);
+ pud = READ_ONCE(*(args->pudp));
WARN_ON(pud_write(pud));
#ifndef __PAGETABLE_PMD_FOLDED
- pudp_huge_get_and_clear(mm, vaddr, pudp);
- pud = READ_ONCE(*pudp);
+ pudp_huge_get_and_clear(args->mm, vaddr, args->pudp);
+ pud = READ_ONCE(*(args->pudp));
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
- pud = pfn_pud(pfn, prot);
+ pud = pfn_pud(args->pud_pfn, args->page_prot);
pud = pud_wrprotect(pud);
pud = pud_mkclean(pud);
- set_pud_at(mm, vaddr, pudp, pud);
+ set_pud_at(args->mm, vaddr, args->pudp, pud);
+ flush_dcache_page(page);
pud = pud_mkwrite(pud);
pud = pud_mkdirty(pud);
- pudp_set_access_flags(vma, vaddr, pudp, pud, 1);
- pud = READ_ONCE(*pudp);
+ pudp_set_access_flags(args->vma, vaddr, args->pudp, pud, 1);
+ pud = READ_ONCE(*(args->pudp));
WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
#ifndef __PAGETABLE_PMD_FOLDED
- pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
- pud = READ_ONCE(*pudp);
+ pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1);
+ pud = READ_ONCE(*(args->pudp));
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
- pud = pfn_pud(pfn, prot);
+ pud = pfn_pud(args->pud_pfn, args->page_prot);
pud = pud_mkyoung(pud);
- set_pud_at(mm, vaddr, pudp, pud);
- pudp_test_and_clear_young(vma, vaddr, pudp);
- pud = READ_ONCE(*pudp);
+ set_pud_at(args->mm, vaddr, args->pudp, pud);
+ flush_dcache_page(page);
+ pudp_test_and_clear_young(args->vma, vaddr, args->pudp);
+ pud = READ_ONCE(*(args->pudp));
WARN_ON(pud_young(pud));
- pudp_huge_get_and_clear(mm, vaddr, pudp);
+ pudp_huge_get_and_clear(args->mm, vaddr, args->pudp);
}
-static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
+static void __init pud_leaf_tests(struct pgtable_debug_args *args)
{
pud_t pud;
@@ -365,7 +417,7 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PUD leaf\n");
- pud = pfn_pud(pfn, prot);
+ pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
/*
* PUD based THP is a leaf entry.
*/
@@ -373,41 +425,26 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pud_leaf(pud));
}
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
-static void __init pud_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pud_t *pudp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
-{
-}
-static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { }
+static void __init pud_advanced_tests(struct pgtable_debug_args *args) { }
+static void __init pud_leaf_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_basic_tests(unsigned long pfn, int idx) { }
-static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
-static void __init pmd_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pmd_t *pmdp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot, pgtable_t pgtable)
-{
-}
-static void __init pud_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma, pud_t *pudp,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
-{
-}
-static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { }
+static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { }
+static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { }
+static void __init pud_advanced_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { }
+static void __init pud_leaf_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+static void __init pmd_huge_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
- if (!arch_vmap_pmd_supported(prot))
+ if (!arch_vmap_pmd_supported(args->page_prot))
return;
pr_debug("Validating PMD huge\n");
@@ -415,18 +452,19 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
* X86 defined pmd_set_huge() verifies that the given
* PMD is not a populated non-leaf entry.
*/
- WRITE_ONCE(*pmdp, __pmd(0));
- WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
- WARN_ON(!pmd_clear_huge(pmdp));
- pmd = READ_ONCE(*pmdp);
+ WRITE_ONCE(*(args->pmdp), __pmd(0));
+ WARN_ON(!pmd_set_huge(args->pmdp, __pfn_to_phys(args->fixed_pmd_pfn),
+ args->page_prot));
+ WARN_ON(!pmd_clear_huge(args->pmdp));
+ pmd = READ_ONCE(*(args->pmdp));
WARN_ON(!pmd_none(pmd));
}
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+static void __init pud_huge_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (!arch_vmap_pud_supported(prot))
+ if (!arch_vmap_pud_supported(args->page_prot))
return;
pr_debug("Validating PUD huge\n");
@@ -434,18 +472,19 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
* X86 defined pud_set_huge() verifies that the given
* PUD is not a populated non-leaf entry.
*/
- WRITE_ONCE(*pudp, __pud(0));
- WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
- WARN_ON(!pud_clear_huge(pudp));
- pud = READ_ONCE(*pudp);
+ WRITE_ONCE(*(args->pudp), __pud(0));
+ WARN_ON(!pud_set_huge(args->pudp, __pfn_to_phys(args->fixed_pud_pfn),
+ args->page_prot));
+ WARN_ON(!pud_clear_huge(args->pudp));
+ pud = READ_ONCE(*(args->pudp));
WARN_ON(!pud_none(pud));
}
#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_huge_tests(struct pgtable_debug_args *args) { }
+static void __init pud_huge_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init p4d_basic_tests(void)
{
p4d_t p4d;
@@ -454,7 +493,7 @@ static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!p4d_same(p4d, p4d));
}
-static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init pgd_basic_tests(void)
{
pgd_t pgd;
@@ -464,27 +503,26 @@ static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
}
#ifndef __PAGETABLE_PUD_FOLDED
-static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp)
+static void __init pud_clear_tests(struct pgtable_debug_args *args)
{
- pud_t pud = READ_ONCE(*pudp);
+ pud_t pud = READ_ONCE(*(args->pudp));
- if (mm_pmd_folded(mm))
+ if (mm_pmd_folded(args->mm))
return;
pr_debug("Validating PUD clear\n");
pud = __pud(pud_val(pud) | RANDOM_ORVALUE);
- WRITE_ONCE(*pudp, pud);
- pud_clear(pudp);
- pud = READ_ONCE(*pudp);
+ WRITE_ONCE(*(args->pudp), pud);
+ pud_clear(args->pudp);
+ pud = READ_ONCE(*(args->pudp));
WARN_ON(!pud_none(pud));
}
-static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
- pmd_t *pmdp)
+static void __init pud_populate_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (mm_pmd_folded(mm))
+ if (mm_pmd_folded(args->mm))
return;
pr_debug("Validating PUD populate\n");
@@ -492,40 +530,36 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
* This entry points to next level page table page.
* Hence this must not qualify as pud_bad().
*/
- pud_populate(mm, pudp, pmdp);
- pud = READ_ONCE(*pudp);
+ pud_populate(args->mm, args->pudp, args->start_pmdp);
+ pud = READ_ONCE(*(args->pudp));
WARN_ON(pud_bad(pud));
}
#else /* !__PAGETABLE_PUD_FOLDED */
-static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) { }
-static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
- pmd_t *pmdp)
-{
-}
+static void __init pud_clear_tests(struct pgtable_debug_args *args) { }
+static void __init pud_populate_tests(struct pgtable_debug_args *args) { }
#endif /* PAGETABLE_PUD_FOLDED */
#ifndef __PAGETABLE_P4D_FOLDED
-static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp)
+static void __init p4d_clear_tests(struct pgtable_debug_args *args)
{
- p4d_t p4d = READ_ONCE(*p4dp);
+ p4d_t p4d = READ_ONCE(*(args->p4dp));
- if (mm_pud_folded(mm))
+ if (mm_pud_folded(args->mm))
return;
pr_debug("Validating P4D clear\n");
p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE);
- WRITE_ONCE(*p4dp, p4d);
- p4d_clear(p4dp);
- p4d = READ_ONCE(*p4dp);
+ WRITE_ONCE(*(args->p4dp), p4d);
+ p4d_clear(args->p4dp);
+ p4d = READ_ONCE(*(args->p4dp));
WARN_ON(!p4d_none(p4d));
}
-static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
- pud_t *pudp)
+static void __init p4d_populate_tests(struct pgtable_debug_args *args)
{
p4d_t p4d;
- if (mm_pud_folded(mm))
+ if (mm_pud_folded(args->mm))
return;
pr_debug("Validating P4D populate\n");
@@ -533,34 +567,33 @@ static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
* This entry points to next level page table page.
* Hence this must not qualify as p4d_bad().
*/
- pud_clear(pudp);
- p4d_clear(p4dp);
- p4d_populate(mm, p4dp, pudp);
- p4d = READ_ONCE(*p4dp);
+ pud_clear(args->pudp);
+ p4d_clear(args->p4dp);
+ p4d_populate(args->mm, args->p4dp, args->start_pudp);
+ p4d = READ_ONCE(*(args->p4dp));
WARN_ON(p4d_bad(p4d));
}
-static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp)
+static void __init pgd_clear_tests(struct pgtable_debug_args *args)
{
- pgd_t pgd = READ_ONCE(*pgdp);
+ pgd_t pgd = READ_ONCE(*(args->pgdp));
- if (mm_p4d_folded(mm))
+ if (mm_p4d_folded(args->mm))
return;
pr_debug("Validating PGD clear\n");
pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE);
- WRITE_ONCE(*pgdp, pgd);
- pgd_clear(pgdp);
- pgd = READ_ONCE(*pgdp);
+ WRITE_ONCE(*(args->pgdp), pgd);
+ pgd_clear(args->pgdp);
+ pgd = READ_ONCE(*(args->pgdp));
WARN_ON(!pgd_none(pgd));
}
-static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
- p4d_t *p4dp)
+static void __init pgd_populate_tests(struct pgtable_debug_args *args)
{
pgd_t pgd;
- if (mm_p4d_folded(mm))
+ if (mm_p4d_folded(args->mm))
return;
pr_debug("Validating PGD populate\n");
@@ -568,56 +601,57 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
* This entry points to next level page table page.
* Hence this must not qualify as pgd_bad().
*/
- p4d_clear(p4dp);
- pgd_clear(pgdp);
- pgd_populate(mm, pgdp, p4dp);
- pgd = READ_ONCE(*pgdp);
+ p4d_clear(args->p4dp);
+ pgd_clear(args->pgdp);
+ pgd_populate(args->mm, args->pgdp, args->start_p4dp);
+ pgd = READ_ONCE(*(args->pgdp));
WARN_ON(pgd_bad(pgd));
}
#else /* !__PAGETABLE_P4D_FOLDED */
-static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) { }
-static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) { }
-static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
- pud_t *pudp)
-{
-}
-static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
- p4d_t *p4dp)
-{
-}
+static void __init p4d_clear_tests(struct pgtable_debug_args *args) { }
+static void __init pgd_clear_tests(struct pgtable_debug_args *args) { }
+static void __init p4d_populate_tests(struct pgtable_debug_args *args) { }
+static void __init pgd_populate_tests(struct pgtable_debug_args *args) { }
#endif /* PAGETABLE_P4D_FOLDED */
-static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
- unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+static void __init pte_clear_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ struct page *page;
+ pte_t pte;
pr_debug("Validating PTE clear\n");
+ page = (args->pte_pfn != ULONG_MAX) ?
+ pfn_to_page(args->pte_pfn) : NULL;
+ if (!page) {
+ pr_debug("%s: Skipped\n", __func__);
+ return;
+ }
+
+ pte = pfn_pte(args->pte_pfn, args->page_prot);
#ifndef CONFIG_RISCV
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
#endif
- set_pte_at(mm, vaddr, ptep, pte);
+ set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+ flush_dcache_page(page);
barrier();
- pte_clear(mm, vaddr, ptep);
- pte = ptep_get(ptep);
+ pte_clear(args->mm, args->vaddr, args->ptep);
+ pte = ptep_get(args->ptep);
WARN_ON(!pte_none(pte));
}
-static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp)
+static void __init pmd_clear_tests(struct pgtable_debug_args *args)
{
- pmd_t pmd = READ_ONCE(*pmdp);
+ pmd_t pmd = READ_ONCE(*(args->pmdp));
pr_debug("Validating PMD clear\n");
pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE);
- WRITE_ONCE(*pmdp, pmd);
- pmd_clear(pmdp);
- pmd = READ_ONCE(*pmdp);
+ WRITE_ONCE(*(args->pmdp), pmd);
+ pmd_clear(args->pmdp);
+ pmd = READ_ONCE(*(args->pmdp));
WARN_ON(!pmd_none(pmd));
}
-static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
- pgtable_t pgtable)
+static void __init pmd_populate_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -626,14 +660,14 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
* This entry points to next level page table page.
* Hence this must not qualify as pmd_bad().
*/
- pmd_populate(mm, pmdp, pgtable);
- pmd = READ_ONCE(*pmdp);
+ pmd_populate(args->mm, args->pmdp, args->start_ptep);
+ pmd = READ_ONCE(*(args->pmdp));
WARN_ON(pmd_bad(pmd));
}
-static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_special_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL))
return;
@@ -642,9 +676,9 @@ static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pte_special(pte_mkspecial(pte)));
}
-static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_protnone_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none);
if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
return;
@@ -655,7 +689,7 @@ static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_protnone_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -666,25 +700,25 @@ static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PMD protnone\n");
- pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+ pmd = pmd_mkhuge(pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none));
WARN_ON(!pmd_protnone(pmd));
WARN_ON(!pmd_present(pmd));
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_devmap_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
pr_debug("Validating PTE devmap\n");
WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_devmap_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -692,12 +726,12 @@ static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PMD devmap\n");
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pud_devmap_tests(struct pgtable_debug_args *args)
{
pud_t pud;
@@ -705,25 +739,25 @@ static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PUD devmap\n");
- pud = pfn_pud(pfn, prot);
+ pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
}
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
+static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#else
-static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pte_devmap_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
+static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
-static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
return;
@@ -733,9 +767,9 @@ static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(pte_soft_dirty(pte_clear_soft_dirty(pte)));
}
-static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
return;
@@ -746,7 +780,7 @@ static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -757,12 +791,12 @@ static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PMD soft dirty\n");
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd)));
WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd)));
}
-static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -774,31 +808,29 @@ static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PMD swap soft dirty\n");
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
-{
-}
+static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_swap_tests(struct pgtable_debug_args *args)
{
swp_entry_t swp;
pte_t pte;
pr_debug("Validating PTE swap\n");
- pte = pfn_pte(pfn, prot);
+ pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
swp = __pte_to_swp_entry(pte);
pte = __swp_entry_to_pte(swp);
- WARN_ON(pfn != pte_pfn(pte));
+ WARN_ON(args->fixed_pte_pfn != pte_pfn(pte));
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_swap_tests(struct pgtable_debug_args *args)
{
swp_entry_t swp;
pmd_t pmd;
@@ -807,16 +839,16 @@ static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PMD swap\n");
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
swp = __pmd_to_swp_entry(pmd);
pmd = __swp_entry_to_pmd(swp);
- WARN_ON(pfn != pmd_pfn(pmd));
+ WARN_ON(args->fixed_pmd_pfn != pmd_pfn(pmd));
}
#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
-static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_swap_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
-static void __init swap_migration_tests(void)
+static void __init swap_migration_tests(struct pgtable_debug_args *args)
{
struct page *page;
swp_entry_t swp;
@@ -832,9 +864,10 @@ static void __init swap_migration_tests(void)
* problematic. Lets allocate a dedicated page explicitly for this
* purpose that will be freed subsequently.
*/
- page = alloc_page(GFP_KERNEL);
+ page = (args->pte_pfn != ULONG_MAX) ?
+ pfn_to_page(args->pte_pfn) : NULL;
if (!page) {
- pr_err("page allocation failed\n");
+ pr_err("no page available\n");
return;
}
@@ -855,11 +888,10 @@ static void __init swap_migration_tests(void)
WARN_ON(!is_migration_entry(swp));
WARN_ON(is_writable_migration_entry(swp));
__ClearPageLocked(page);
- __free_page(page);
}
#ifdef CONFIG_HUGETLB_PAGE
-static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init hugetlb_basic_tests(struct pgtable_debug_args *args)
{
struct page *page;
pte_t pte;
@@ -869,25 +901,25 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
* Accessing the page associated with the pfn is safe here,
* as it was previously derived from a real kernel symbol.
*/
- page = pfn_to_page(pfn);
- pte = mk_huge_pte(page, prot);
+ page = pfn_to_page(args->fixed_pmd_pfn);
+ pte = mk_huge_pte(page, args->page_prot);
WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
- pte = pfn_pte(pfn, prot);
+ pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
WARN_ON(!pte_huge(pte_mkhuge(pte)));
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
}
#else /* !CONFIG_HUGETLB_PAGE */
-static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_thp_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
@@ -906,7 +938,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
* needs to return true. pmd_present() should be true whenever
* pmd_trans_huge() returns true.
*/
- pmd = pfn_pmd(pfn, prot);
+ pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
WARN_ON(!pmd_trans_huge(pmd_mkhuge(pmd)));
#ifndef __HAVE_ARCH_PMDP_INVALIDATE
@@ -916,7 +948,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
+static void __init pud_thp_tests(struct pgtable_debug_args *args)
{
pud_t pud;
@@ -924,7 +956,7 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
return;
pr_debug("Validating PUD based THP\n");
- pud = pfn_pud(pfn, prot);
+ pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
WARN_ON(!pud_trans_huge(pud_mkhuge(pud)));
/*
@@ -936,11 +968,11 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
*/
}
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_thp_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_thp_tests(struct pgtable_debug_args *args) { }
+static void __init pud_thp_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static unsigned long __init get_random_vaddr(void)
@@ -955,85 +987,174 @@ static unsigned long __init get_random_vaddr(void)
return random_vaddr;
}
-static int __init debug_vm_pgtable(void)
+static void __init destroy_args(struct pgtable_debug_args *args)
{
- struct vm_area_struct *vma;
- struct mm_struct *mm;
- pgd_t *pgdp;
- p4d_t *p4dp, *saved_p4dp;
- pud_t *pudp, *saved_pudp;
- pmd_t *pmdp, *saved_pmdp, pmd;
- pte_t *ptep;
- pgtable_t saved_ptep;
- pgprot_t prot, protnone;
- phys_addr_t paddr;
- unsigned long vaddr, pte_aligned, pmd_aligned;
- unsigned long pud_aligned, p4d_aligned, pgd_aligned;
- spinlock_t *ptl = NULL;
- int idx;
+ struct page *page = NULL;
+
+ /* Free (huge) page */
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
+ has_transparent_hugepage() &&
+ args->pud_pfn != ULONG_MAX) {
+ page = pfn_to_page(args->pud_pfn);
+ __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT);
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ has_transparent_hugepage() &&
+ args->pmd_pfn != ULONG_MAX) {
+ page = pfn_to_page(args->pmd_pfn);
+ __free_pages(page, HPAGE_PMD_ORDER);
+ } else if (args->pte_pfn != ULONG_MAX) {
+ page = pfn_to_page(args->pte_pfn);
+ __free_pages(page, 0);
+ }
- pr_info("Validating architecture page table helpers\n");
- prot = vm_get_page_prot(VMFLAGS);
- vaddr = get_random_vaddr();
- mm = mm_alloc();
- if (!mm) {
- pr_err("mm_struct allocation failed\n");
- return 1;
+ /* Free page table */
+ if (args->start_ptep) {
+ pte_free(args->mm, args->start_ptep);
+ mm_dec_nr_ptes(args->mm);
}
- /*
- * __P000 (or even __S000) will help create page table entries with
- * PROT_NONE permission as required for pxx_protnone_tests().
- */
- protnone = __P000;
+ if (args->start_pmdp) {
+ pmd_free(args->mm, args->start_pmdp);
+ mm_dec_nr_pmds(args->mm);
+ }
+
+ if (args->start_pudp) {
+ pud_free(args->mm, args->start_pudp);
+ mm_dec_nr_puds(args->mm);
+ }
+
+ if (args->start_p4dp)
+ p4d_free(args->mm, args->p4dp);
+
+ /* Free vma and mm struct */
+ if (args->vma)
+ vm_area_free(args->vma);
+ if (args->mm)
+ mmdrop(args->mm);
+}
- vma = vm_area_alloc(mm);
- if (!vma) {
- pr_err("vma allocation failed\n");
- return 1;
+static int __init init_args(struct pgtable_debug_args *args)
+{
+ struct page *page = NULL;
+ phys_addr_t phys;
+ int ret = 0;
+
+ /* Initialize the debugging data */
+ memset(args, 0, sizeof(*args));
+ args->page_prot = vm_get_page_prot(VMFLAGS);
+ args->page_prot_none = __P000;
+ args->pud_pfn = ULONG_MAX;
+ args->pmd_pfn = ULONG_MAX;
+ args->pte_pfn = ULONG_MAX;
+ args->fixed_pgd_pfn = ULONG_MAX;
+ args->fixed_p4d_pfn = ULONG_MAX;
+ args->fixed_pud_pfn = ULONG_MAX;
+ args->fixed_pmd_pfn = ULONG_MAX;
+ args->fixed_pte_pfn = ULONG_MAX;
+
+ /* Allocate mm and vma */
+ args->mm = mm_alloc();
+ if (!args->mm) {
+ pr_err("Failed to allocate mm struct\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ args->vma = vm_area_alloc(args->mm);
+ if (!args->vma) {
+ pr_err("Failed to allocate vma\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ /* Figure out the virtual address and allocate page table entries */
+ args->vaddr = get_random_vaddr();
+ args->pgdp = pgd_offset(args->mm, args->vaddr);
+ args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr);
+ args->pudp = args->p4dp ?
+ pud_alloc(args->mm, args->p4dp, args->vaddr) : NULL;
+ args->pmdp = args->pudp ?
+ pmd_alloc(args->mm, args->pudp, args->vaddr) : NULL;
+ args->ptep = args->pmdp ?
+ pte_alloc_map(args->mm, args->pmdp, args->vaddr) : NULL;
+ if (!args->ptep) {
+ pr_err("Failed to allocate page table\n");
+ ret = -ENOMEM;
+ goto error;
}
/*
- * PFN for mapping at PTE level is determined from a standard kernel
- * text symbol. But pfns for higher page table levels are derived by
- * masking lower bits of this real pfn. These derived pfns might not
- * exist on the platform but that does not really matter as pfn_pxx()
- * helpers will still create appropriate entries for the test. This
- * helps avoid large memory block allocations to be used for mapping
- * at higher page table levels.
+ * The above page table entries will be modified. Lets save the
+ * page table entries so that they can be released when the tests
+ * are completed.
*/
- paddr = __pa_symbol(&start_kernel);
-
- pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT;
- pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT;
- pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT;
- p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT;
- pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT;
- WARN_ON(!pfn_valid(pte_aligned));
-
- pgdp = pgd_offset(mm, vaddr);
- p4dp = p4d_alloc(mm, pgdp, vaddr);
- pudp = pud_alloc(mm, p4dp, vaddr);
- pmdp = pmd_alloc(mm, pudp, vaddr);
+ args->start_p4dp = p4d_offset(args->pgdp, 0UL);
+ args->start_pudp = pud_offset(args->p4dp, 0UL);
+ args->start_pmdp = pmd_offset(args->pudp, 0UL);
+ args->start_ptep = pmd_pgtable(READ_ONCE(*(args->pmdp)));
+
/*
- * Allocate pgtable_t
+ * Figure out the fixed addresses, which are all around the kernel
+ * symbol (@start_kernel). The corresponding PFNs might be invalid,
+ * but it's fine as the following tests won't access the pages.
*/
- if (pte_alloc(mm, pmdp)) {
- pr_err("pgtable allocation failed\n");
- return 1;
- }
+ phys = __pa_symbol(&start_kernel);
+ args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
+ args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
+ args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
+ args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
+ args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
/*
- * Save all the page table page addresses as the page table
- * entries will be used for testing with random or garbage
- * values. These saved addresses will be used for freeing
- * page table pages.
+ * Allocate (huge) pages because some of the tests need to access
+ * the data in the pages. The corresponding tests will be skipped
+ * if we fail to allocate (huge) pages.
*/
- pmd = READ_ONCE(*pmdp);
- saved_p4dp = p4d_offset(pgdp, 0UL);
- saved_pudp = pud_offset(p4dp, 0UL);
- saved_pmdp = pmd_offset(pudp, 0UL);
- saved_ptep = pmd_pgtable(pmd);
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
+ has_transparent_hugepage()) {
+ page = alloc_pages(GFP_KERNEL | __GFP_NOWARN,
+ HPAGE_PUD_SHIFT - PAGE_SHIFT);
+ if (page) {
+ args->pud_pfn = page_to_pfn(page);
+ args->pmd_pfn = args->pud_pfn;
+ args->pte_pfn = args->pud_pfn;
+ return 0;
+ }
+ }
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ has_transparent_hugepage()) {
+ page = alloc_pages(GFP_KERNEL | __GFP_NOWARN, HPAGE_PMD_ORDER);
+ if (page) {
+ args->pmd_pfn = page_to_pfn(page);
+ args->pte_pfn = args->pmd_pfn;
+ return 0;
+ }
+ }
+
+ page = alloc_pages(GFP_KERNEL, 0);
+ if (page)
+ args->pte_pfn = page_to_pfn(page);
+
+ return 0;
+
+error:
+ destroy_args(args);
+ return ret;
+}
+
+static int __init debug_vm_pgtable(void)
+{
+ struct pgtable_debug_args args;
+ spinlock_t *ptl = NULL;
+ int idx, ret;
+
+ pr_info("Validating architecture page table helpers\n");
+ ret = init_args(&args);
+ if (ret)
+ return ret;
/*
* Iterate over the protection_map[] to make sure that all
@@ -1042,9 +1163,9 @@ static int __init debug_vm_pgtable(void)
* given page table entry.
*/
for (idx = 0; idx < ARRAY_SIZE(protection_map); idx++) {
- pte_basic_tests(pte_aligned, idx);
- pmd_basic_tests(pmd_aligned, idx);
- pud_basic_tests(mm, pud_aligned, idx);
+ pte_basic_tests(&args, idx);
+ pmd_basic_tests(&args, idx);
+ pud_basic_tests(&args, idx);
}
/*
@@ -1054,79 +1175,69 @@ static int __init debug_vm_pgtable(void)
* the above iteration for now to save some test execution
* time.
*/
- p4d_basic_tests(p4d_aligned, prot);
- pgd_basic_tests(pgd_aligned, prot);
+ p4d_basic_tests();
+ pgd_basic_tests();
+ hugetlb_basic_tests(&args);
- pmd_leaf_tests(pmd_aligned, prot);
- pud_leaf_tests(pud_aligned, prot);
+ pmd_leaf_tests(&args);
+ pud_leaf_tests(&args);
- pte_savedwrite_tests(pte_aligned, protnone);
- pmd_savedwrite_tests(pmd_aligned, protnone);
+ pte_savedwrite_tests(&args);
+ pmd_savedwrite_tests(&args);
- pte_special_tests(pte_aligned, prot);
- pte_protnone_tests(pte_aligned, protnone);
- pmd_protnone_tests(pmd_aligned, protnone);
+ pte_special_tests(&args);
+ pte_protnone_tests(&args);
+ pmd_protnone_tests(&args);
- pte_devmap_tests(pte_aligned, prot);
- pmd_devmap_tests(pmd_aligned, prot);
- pud_devmap_tests(pud_aligned, prot);
+ pte_devmap_tests(&args);
+ pmd_devmap_tests(&args);
+ pud_devmap_tests(&args);
- pte_soft_dirty_tests(pte_aligned, prot);
- pmd_soft_dirty_tests(pmd_aligned, prot);
- pte_swap_soft_dirty_tests(pte_aligned, prot);
- pmd_swap_soft_dirty_tests(pmd_aligned, prot);
+ pte_soft_dirty_tests(&args);
+ pmd_soft_dirty_tests(&args);
+ pte_swap_soft_dirty_tests(&args);
+ pmd_swap_soft_dirty_tests(&args);
- pte_swap_tests(pte_aligned, prot);
- pmd_swap_tests(pmd_aligned, prot);
+ pte_swap_tests(&args);
+ pmd_swap_tests(&args);
- swap_migration_tests();
+ swap_migration_tests(&args);
- pmd_thp_tests(pmd_aligned, prot);
- pud_thp_tests(pud_aligned, prot);
-
- hugetlb_basic_tests(pte_aligned, prot);
+ pmd_thp_tests(&args);
+ pud_thp_tests(&args);
/*
* Page table modifying tests. They need to hold
* proper page table lock.
*/
+ ptl = pte_lockptr(args.mm, args.pmdp);
+ spin_lock(ptl);
+ pte_clear_tests(&args);
+ pte_advanced_tests(&args);
+ spin_unlock(ptl);
- ptep = pte_offset_map_lock(mm, pmdp, vaddr, &ptl);
- pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot);
- pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
- pte_unmap_unlock(ptep, ptl);
-
- ptl = pmd_lock(mm, pmdp);
- pmd_clear_tests(mm, pmdp);
- pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
- pmd_huge_tests(pmdp, pmd_aligned, prot);
- pmd_populate_tests(mm, pmdp, saved_ptep);
+ ptl = pmd_lock(args.mm, args.pmdp);
+ pmd_clear_tests(&args);
+ pmd_advanced_tests(&args);
+ pmd_huge_tests(&args);
+ pmd_populate_tests(&args);
spin_unlock(ptl);
- ptl = pud_lock(mm, pudp);
- pud_clear_tests(mm, pudp);
- pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
- pud_huge_tests(pudp, pud_aligned, prot);
- pud_populate_tests(mm, pudp, saved_pmdp);
+ ptl = pud_lock(args.mm, args.pudp);
+ pud_clear_tests(&args);
+ pud_advanced_tests(&args);
+ pud_huge_tests(&args);
+ pud_populate_tests(&args);
spin_unlock(ptl);
- spin_lock(&mm->page_table_lock);
- p4d_clear_tests(mm, p4dp);
- pgd_clear_tests(mm, pgdp);
- p4d_populate_tests(mm, p4dp, saved_pudp);
- pgd_populate_tests(mm, pgdp, saved_p4dp);
- spin_unlock(&mm->page_table_lock);
-
- p4d_free(mm, saved_p4dp);
- pud_free(mm, saved_pudp);
- pmd_free(mm, saved_pmdp);
- pte_free(mm, saved_ptep);
-
- vm_area_free(vma);
- mm_dec_nr_puds(mm);
- mm_dec_nr_pmds(mm);
- mm_dec_nr_ptes(mm);
- mmdrop(mm);
+ spin_lock(&(args.mm->page_table_lock));
+ p4d_clear_tests(&args);
+ pgd_clear_tests(&args);
+ p4d_populate_tests(&args);
+ pgd_populate_tests(&args);
+ spin_unlock(&(args.mm->page_table_lock));
+
+ destroy_args(&args);
return 0;
}
late_initcall(debug_vm_pgtable);
diff --git a/mm/filemap.c b/mm/filemap.c
index 119a8b14e261..680991e81e17 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -260,12 +260,15 @@ static void page_cache_free_page(struct address_space *mapping,
void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- unsigned long flags;
BUG_ON(!PageLocked(page));
- xa_lock_irqsave(&mapping->i_pages, flags);
+ spin_lock(&mapping->host->i_lock);
+ xa_lock_irq(&mapping->i_pages);
__delete_from_page_cache(page, NULL);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
page_cache_free_page(mapping, page);
}
@@ -337,19 +340,22 @@ void delete_from_page_cache_batch(struct address_space *mapping,
struct pagevec *pvec)
{
int i;
- unsigned long flags;
if (!pagevec_count(pvec))
return;
- xa_lock_irqsave(&mapping->i_pages, flags);
+ spin_lock(&mapping->host->i_lock);
+ xa_lock_irq(&mapping->i_pages);
for (i = 0; i < pagevec_count(pvec); i++) {
trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
unaccount_page_cache_page(mapping, pvec->pages[i]);
}
page_cache_delete_batch(mapping, pvec);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
for (i = 0; i < pagevec_count(pvec); i++)
page_cache_free_page(mapping, pvec->pages[i]);
@@ -825,7 +831,6 @@ void replace_page_cache_page(struct page *old, struct page *new)
void (*freepage)(struct page *) = mapping->a_ops->freepage;
pgoff_t offset = old->index;
XA_STATE(xas, &mapping->i_pages, offset);
- unsigned long flags;
VM_BUG_ON_PAGE(!PageLocked(old), old);
VM_BUG_ON_PAGE(!PageLocked(new), new);
@@ -837,7 +842,7 @@ void replace_page_cache_page(struct page *old, struct page *new)
mem_cgroup_migrate(fold, fnew);
- xas_lock_irqsave(&xas, flags);
+ xas_lock_irq(&xas);
xas_store(&xas, new);
old->mapping = NULL;
@@ -850,7 +855,7 @@ void replace_page_cache_page(struct page *old, struct page *new)
__dec_lruvec_page_state(old, NR_SHMEM);
if (PageSwapBacked(new))
__inc_lruvec_page_state(new, NR_SHMEM);
- xas_unlock_irqrestore(&xas, flags);
+ xas_unlock_irq(&xas);
if (freepage)
freepage(old);
put_page(old);
diff --git a/mm/gup.c b/mm/gup.c
index 42b8b1fa6521..c4441fc4cfba 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1772,7 +1772,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
if (!list_empty(&movable_page_list)) {
ret = migrate_pages(&movable_page_list, alloc_migration_target,
NULL, (unsigned long)&mtc, MIGRATE_SYNC,
- MR_LONGTERM_PIN);
+ MR_LONGTERM_PIN, NULL);
if (ret && !list_empty(&movable_page_list))
putback_movable_pages(&movable_page_list);
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 763bf687ca92..ade81c123d87 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1440,32 +1440,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
goto out;
}
- /*
- * Since we took the NUMA fault, we must have observed the !accessible
- * bit. Make sure all other CPUs agree with that, to avoid them
- * modifying the page we're about to migrate.
- *
- * Must be done under PTL such that we'll observe the relevant
- * inc_tlb_flush_pending().
- *
- * We are not sure a pending tlb flush here is for a huge page
- * mapping or not. Hence use the tlb range variant
- */
- if (mm_tlb_flush_pending(vma->vm_mm)) {
- flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
- /*
- * change_huge_pmd() released the pmd lock before
- * invalidating the secondary MMUs sharing the primary
- * MMU pagetables (with ->invalidate_range()). The
- * mmu_notifier_invalidate_range_end() (which
- * internally calls ->invalidate_range()) in
- * change_pmd_range() will run after us, so we can't
- * rely on it here and we need an explicit invalidate.
- */
- mmu_notifier_invalidate_range(vma->vm_mm, haddr,
- haddr + HPAGE_PMD_SIZE);
- }
-
pmd = pmd_modify(oldpmd, vma->vm_page_prot);
page = vm_normal_page_pmd(vma, haddr, pmd);
if (!page)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 68eead0259cc..ce79d76c42ce 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1072,6 +1072,8 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
int nid = page_to_nid(page);
lockdep_assert_held(&hugetlb_lock);
+ VM_BUG_ON_PAGE(page_count(page), page);
+
list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
@@ -1164,7 +1166,20 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
gfp_mask = htlb_alloc_mask(h);
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+#ifdef CONFIG_NUMA
+ if (mpol->mode == MPOL_PREFERRED_MANY) {
+ page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+ if (page)
+ goto check_reserve;
+ /* Fallback to all nodes */
+ nodemask = NULL;
+ }
+#endif
page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+
+#ifdef CONFIG_NUMA
+check_reserve:
+#endif
if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
SetHPageRestoreReserve(page);
h->resv_huge_pages--;
@@ -1368,8 +1383,28 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
h->surplus_huge_pages_node[nid]--;
}
+ /*
+ * Very subtle
+ *
+ * For non-gigantic pages set the destructor to the normal compound
+ * page dtor. This is needed in case someone takes an additional
+ * temporary ref to the page, and freeing is delayed until they drop
+ * their reference.
+ *
+ * For gigantic pages set the destructor to the null dtor. This
+ * destructor will never be called. Before freeing the gigantic
+ * page destroy_compound_gigantic_page will turn the compound page
+ * into a simple group of pages. After this the destructor does not
+ * apply.
+ *
+ * This handles the case where more than one ref is held when and
+ * after update_and_free_page is called.
+ */
set_page_refcounted(page);
- set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ if (hstate_is_gigantic(h))
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ else
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
h->nr_huge_pages--;
h->nr_huge_pages_node[nid]--;
@@ -1399,11 +1434,20 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
SetHPageVmemmapOptimized(page);
/*
- * This page is now managed by the hugetlb allocator and has
- * no users -- drop the last reference.
+ * This page is about to be managed by the hugetlb allocator and
+ * should have no users. Drop our reference, and check for others
+ * just in case.
*/
zeroed = put_page_testzero(page);
- VM_BUG_ON_PAGE(!zeroed, page);
+ if (!zeroed)
+ /*
+ * It is VERY unlikely soneone else has taken a ref on
+ * the page. In this case, we simply return as the
+ * hugetlb destructor (free_huge_page) will be called
+ * when this other ref is dropped.
+ */
+ return;
+
arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
}
@@ -1657,16 +1701,12 @@ static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
* cache adding could take a ref on a 'to be' tail page.
* We need to respect any increased ref count, and only set
* the ref count to zero if count is currently 1. If count
- * is not 1, we call synchronize_rcu in the hope that a rcu
- * grace period will cause ref count to drop and then retry.
- * If count is still inflated on retry we return an error and
- * must discard the pages.
+ * is not 1, we return an error and caller must discard the
+ * pages.
*/
if (!page_ref_freeze(p, 1)) {
- pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
- synchronize_rcu();
- if (!page_ref_freeze(p, 1))
- goto out_error;
+ pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+ goto out_error;
}
set_page_count(p, 0);
set_compound_head(p, page);
@@ -1830,7 +1870,6 @@ retry:
retry = true;
goto retry;
}
- pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
return NULL;
}
}
@@ -2020,9 +2059,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
* Allocates a fresh surplus page from the page allocator.
*/
static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask)
+ int nid, nodemask_t *nmask, bool zero_ref)
{
struct page *page = NULL;
+ bool retry = false;
if (hstate_is_gigantic(h))
return NULL;
@@ -2032,6 +2072,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
+retry:
page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
@@ -2049,11 +2090,35 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
spin_unlock_irq(&hugetlb_lock);
put_page(page);
return NULL;
- } else {
- h->surplus_huge_pages++;
- h->surplus_huge_pages_node[page_to_nid(page)]++;
}
+ if (zero_ref) {
+ /*
+ * Caller requires a page with zero ref count.
+ * We will drop ref count here. If someone else is holding
+ * a ref, the page will be freed when they drop it. Abuse
+ * temporary page flag to accomplish this.
+ */
+ SetHPageTemporary(page);
+ if (!put_page_testzero(page)) {
+ /*
+ * Unexpected inflated ref count on freshly allocated
+ * huge. Retry once.
+ */
+ pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
+ spin_unlock_irq(&hugetlb_lock);
+ if (retry)
+ return NULL;
+
+ retry = true;
+ goto retry;
+ }
+ ClearHPageTemporary(page);
+ }
+
+ h->surplus_huge_pages++;
+ h->surplus_huge_pages_node[page_to_nid(page)]++;
+
out_unlock:
spin_unlock_irq(&hugetlb_lock);
@@ -2095,7 +2160,21 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
nodemask_t *nodemask;
nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
- page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
+#ifdef CONFIG_NUMA
+ if (mpol->mode == MPOL_PREFERRED_MANY) {
+ gfp_t gfp = (gfp_mask | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;
+
+ page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
+ if (page) {
+ mpol_cond_put(mpol);
+ return page;
+ }
+
+ /* Fallback to all nodes */
+ nodemask = NULL;
+ }
+#endif
+ page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
mpol_cond_put(mpol);
return page;
@@ -2167,7 +2246,7 @@ retry:
spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
- NUMA_NO_NODE, NULL);
+ NUMA_NO_NODE, NULL, true);
if (!page) {
alloc_ok = false;
break;
@@ -2208,24 +2287,20 @@ retry:
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
- int zeroed;
-
if ((--needed) < 0)
break;
- /*
- * This page is now managed by the hugetlb allocator and has
- * no users -- drop the buddy allocator's reference.
- */
- zeroed = put_page_testzero(page);
- VM_BUG_ON_PAGE(!zeroed, page);
+ /* Add the page to the hugetlb allocator */
enqueue_huge_page(h, page);
}
free:
spin_unlock_irq(&hugetlb_lock);
- /* Free unnecessary surplus pages to the buddy allocator */
+ /*
+ * Free unnecessary surplus pages to the buddy allocator.
+ * Pages have no ref count, call free_huge_page directly.
+ */
list_for_each_entry_safe(page, tmp, &surplus_list, lru)
- put_page(page);
+ free_huge_page(page);
spin_lock_irq(&hugetlb_lock);
return ret;
@@ -2534,6 +2609,7 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
int nid = page_to_nid(old_page);
+ bool alloc_retry = false;
struct page *new_page;
int ret = 0;
@@ -2544,9 +2620,30 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
* the pool. This simplifies and let us do most of the processing
* under the lock.
*/
+alloc_retry:
new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
if (!new_page)
return -ENOMEM;
+ /*
+ * If all goes well, this page will be directly added to the free
+ * list in the pool. For this the ref count needs to be zero.
+ * Attempt to drop now, and retry once if needed. It is VERY
+ * unlikely there is another ref on the page.
+ *
+ * If someone else has a reference to the page, it will be freed
+ * when they drop their ref. Abuse temporary page flag to accomplish
+ * this. Retry once if there is an inflated ref count.
+ */
+ SetHPageTemporary(new_page);
+ if (!put_page_testzero(new_page)) {
+ if (alloc_retry)
+ return -EBUSY;
+
+ alloc_retry = true;
+ goto alloc_retry;
+ }
+ ClearHPageTemporary(new_page);
+
__prep_new_huge_page(h, new_page);
retry:
@@ -2586,11 +2683,10 @@ retry:
remove_hugetlb_page(h, old_page, false);
/*
- * Reference count trick is needed because allocator gives us
- * referenced page but the pool requires pages with 0 refcount.
+ * Ref count on new page is already zero as it was dropped
+ * earlier. It can be directly added to the pool free list.
*/
__prep_account_new_huge_page(h, nid);
- page_ref_dec(new_page);
enqueue_huge_page(h, new_page);
/*
@@ -2604,6 +2700,8 @@ retry:
free_new:
spin_unlock_irq(&hugetlb_lock);
+ /* Page has a zero ref count, but needs a ref to be freed */
+ set_page_refcounted(new_page);
update_and_free_page(h, new_page, false);
return ret;
@@ -2828,8 +2926,8 @@ static void __init gather_bootmem_prealloc(void)
prep_new_huge_page(h, page, page_to_nid(page));
put_page(page); /* add to the hugepage allocator */
} else {
+ /* VERY unlikely inflated ref count on a tail page */
free_gigantic_page(page, huge_page_order(h));
- pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
}
/*
diff --git a/mm/internal.h b/mm/internal.h
index 0910efec5821..b1001ebeb286 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -231,6 +231,10 @@ extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
+extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr,
+ int nid, bool exact_nid);
+
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
@@ -559,12 +563,17 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
#ifdef CONFIG_NUMA
extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
+extern int find_next_best_node(int node, nodemask_t *used_node_mask);
#else
static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
unsigned int order)
{
return NODE_RECLAIM_NOSCAN;
}
+static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+ return NUMA_NO_NODE;
+}
#endif
extern int hwpoison_filter(struct page *p);
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 4ea8c368b5b8..51903639e55f 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -37,16 +37,9 @@ enum kasan_arg_stacktrace {
KASAN_ARG_STACKTRACE_ON,
};
-enum kasan_arg_fault {
- KASAN_ARG_FAULT_DEFAULT,
- KASAN_ARG_FAULT_REPORT,
- KASAN_ARG_FAULT_PANIC,
-};
-
static enum kasan_arg kasan_arg __ro_after_init;
static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
-static enum kasan_arg_fault kasan_arg_fault __ro_after_init;
/* Whether KASAN is enabled at all. */
DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
@@ -59,9 +52,6 @@ EXPORT_SYMBOL_GPL(kasan_flag_async);
/* Whether to collect alloc/free stack traces. */
DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
-/* Whether to panic or print a report and disable tag checking on fault. */
-bool kasan_flag_panic __ro_after_init;
-
/* kasan=off/on */
static int __init early_kasan_flag(char *arg)
{
@@ -113,23 +103,6 @@ static int __init early_kasan_flag_stacktrace(char *arg)
}
early_param("kasan.stacktrace", early_kasan_flag_stacktrace);
-/* kasan.fault=report/panic */
-static int __init early_kasan_fault(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (!strcmp(arg, "report"))
- kasan_arg_fault = KASAN_ARG_FAULT_REPORT;
- else if (!strcmp(arg, "panic"))
- kasan_arg_fault = KASAN_ARG_FAULT_PANIC;
- else
- return -EINVAL;
-
- return 0;
-}
-early_param("kasan.fault", early_kasan_fault);
-
/* kasan_init_hw_tags_cpu() is called for each CPU. */
void kasan_init_hw_tags_cpu(void)
{
@@ -197,22 +170,6 @@ void __init kasan_init_hw_tags(void)
break;
}
- switch (kasan_arg_fault) {
- case KASAN_ARG_FAULT_DEFAULT:
- /*
- * Default to no panic on report.
- * Do nothing, kasan_flag_panic keeps its default value.
- */
- break;
- case KASAN_ARG_FAULT_REPORT:
- /* Do nothing, kasan_flag_panic keeps its default value. */
- break;
- case KASAN_ARG_FAULT_PANIC:
- /* Enable panic on report. */
- kasan_flag_panic = true;
- break;
- }
-
pr_info("KernelAddressSanitizer initialized\n");
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index d739cdd1621a..fa02c88b6948 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -37,7 +37,6 @@ static inline bool kasan_async_mode_enabled(void)
#endif
-extern bool kasan_flag_panic __ro_after_init;
extern bool kasan_flag_async __ro_after_init;
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 8fff1825b22c..884a950c7026 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -39,6 +39,31 @@ static unsigned long kasan_flags;
#define KASAN_BIT_REPORTED 0
#define KASAN_BIT_MULTI_SHOT 1
+enum kasan_arg_fault {
+ KASAN_ARG_FAULT_DEFAULT,
+ KASAN_ARG_FAULT_REPORT,
+ KASAN_ARG_FAULT_PANIC,
+};
+
+static enum kasan_arg_fault kasan_arg_fault __ro_after_init = KASAN_ARG_FAULT_DEFAULT;
+
+/* kasan.fault=report/panic */
+static int __init early_kasan_fault(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "report"))
+ kasan_arg_fault = KASAN_ARG_FAULT_REPORT;
+ else if (!strcmp(arg, "panic"))
+ kasan_arg_fault = KASAN_ARG_FAULT_PANIC;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.fault", early_kasan_fault);
+
bool kasan_save_enable_multi_shot(void)
{
return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
@@ -102,10 +127,8 @@ static void end_report(unsigned long *flags, unsigned long addr)
panic_on_warn = 0;
panic("panic_on_warn set ...\n");
}
-#ifdef CONFIG_KASAN_HW_TAGS
- if (kasan_flag_panic)
+ if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC)
panic("kasan.fault=panic set ...\n");
-#endif
kasan_enable_current();
}
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index d7666ace9d2e..7a97db8bc8e7 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -20,6 +20,7 @@
#include <linux/moduleparam.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
+#include <linux/sched/clock.h>
#include <linux/sched/sysctl.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
@@ -196,6 +197,8 @@ static noinline void metadata_update_state(struct kfence_metadata *meta,
*/
track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
track->pid = task_pid_nr(current);
+ track->cpu = raw_smp_processor_id();
+ track->ts_nsec = local_clock(); /* Same source as printk timestamps. */
/*
* Pairs with READ_ONCE() in
@@ -734,6 +737,22 @@ void kfence_shutdown_cache(struct kmem_cache *s)
void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
{
/*
+ * Perform size check before switching kfence_allocation_gate, so that
+ * we don't disable KFENCE without making an allocation.
+ */
+ if (size > PAGE_SIZE)
+ return NULL;
+
+ /*
+ * Skip allocations from non-default zones, including DMA. We cannot
+ * guarantee that pages in the KFENCE pool will have the requested
+ * properties (e.g. reside in DMAable memory).
+ */
+ if ((flags & GFP_ZONEMASK) ||
+ (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
+ return NULL;
+
+ /*
* allocation_gate only needs to become non-zero, so it doesn't make
* sense to continue writing to it and pay the associated contention
* cost, in case we have a large number of concurrent allocations.
@@ -757,9 +776,6 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
if (!READ_ONCE(kfence_enabled))
return NULL;
- if (size > PAGE_SIZE)
- return NULL;
-
return kfence_guarded_alloc(s, size, flags);
}
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index 24065321ff8a..c1f23c61e5f9 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -36,6 +36,8 @@ enum kfence_object_state {
/* Alloc/free tracking information. */
struct kfence_track {
pid_t pid;
+ int cpu;
+ u64 ts_nsec;
int num_stack_entries;
unsigned long stack_entries[KFENCE_STACK_DEPTH];
};
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 7f24b9bcb2ec..942cbc16ad26 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -852,7 +852,7 @@ static void kfence_test_exit(void)
tracepoint_synchronize_unregister();
}
-late_initcall(kfence_test_init);
+late_initcall_sync(kfence_test_init);
module_exit(kfence_test_exit);
MODULE_LICENSE("GPL v2");
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index 2a319c21c939..cbdd8d442d0b 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/lockdep.h>
+#include <linux/math.h>
#include <linux/printk.h>
#include <linux/sched/debug.h>
#include <linux/seq_file.h>
@@ -100,6 +101,13 @@ static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadat
bool show_alloc)
{
const struct kfence_track *track = show_alloc ? &meta->alloc_track : &meta->free_track;
+ u64 ts_sec = track->ts_nsec;
+ unsigned long rem_nsec = do_div(ts_sec, NSEC_PER_SEC);
+
+ /* Timestamp matches printk timestamp format. */
+ seq_con_printf(seq, "%s by task %d on cpu %d at %lu.%06lus:\n",
+ show_alloc ? "allocated" : "freed", track->pid,
+ track->cpu, (unsigned long)ts_sec, rem_nsec / 1000);
if (track->num_stack_entries) {
/* Skip allocation/free internals stack. */
@@ -126,15 +134,14 @@ void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *met
return;
}
- seq_con_printf(seq,
- "kfence-#%td [0x%p-0x%p"
- ", size=%d, cache=%s] allocated by task %d:\n",
- meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size,
- (cache && cache->name) ? cache->name : "<destroyed>", meta->alloc_track.pid);
+ seq_con_printf(seq, "kfence-#%td: 0x%p-0x%p, size=%d, cache=%s\n\n",
+ meta - kfence_metadata, (void *)start, (void *)(start + size - 1),
+ size, (cache && cache->name) ? cache->name : "<destroyed>");
+
kfence_print_stack(seq, meta, true);
if (meta->state == KFENCE_OBJECT_FREED) {
- seq_con_printf(seq, "\nfreed by task %d:\n", meta->free_track.pid);
+ seq_con_printf(seq, "\n");
kfence_print_stack(seq, meta, false);
}
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 0041ff62c584..a69449bffc8d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -947,7 +947,8 @@ static bool should_skip_region(struct memblock_type *type,
return true;
/* skip hotpluggable memory regions if needed */
- if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
+ if (movable_node_is_enabled() && memblock_is_hotpluggable(m) &&
+ !(flags & MEMBLOCK_HOTPLUG))
return true;
/* if we want mirror memory skip non-mirror memory regions */
@@ -1490,18 +1491,12 @@ void * __init memblock_alloc_exact_nid_raw(
phys_addr_t min_addr, phys_addr_t max_addr,
int nid)
{
- void *ptr;
-
memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
- ptr = memblock_alloc_internal(size, align,
- min_addr, max_addr, nid, true);
- if (ptr && size > 0)
- page_init_poison(ptr, size);
-
- return ptr;
+ return memblock_alloc_internal(size, align, min_addr, max_addr, nid,
+ true);
}
/**
@@ -1528,18 +1523,12 @@ void * __init memblock_alloc_try_nid_raw(
phys_addr_t min_addr, phys_addr_t max_addr,
int nid)
{
- void *ptr;
-
memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
- ptr = memblock_alloc_internal(size, align,
- min_addr, max_addr, nid, false);
- if (ptr && size > 0)
- page_init_poison(ptr, size);
-
- return ptr;
+ return memblock_alloc_internal(size, align, min_addr, max_addr, nid,
+ false);
}
/**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1d77c873463c..35bb5f8f9ea8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -103,6 +103,14 @@ static bool do_memsw_account(void)
return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}
+/* memcg and lruvec stats flushing */
+static void flush_memcg_stats_dwork(struct work_struct *w);
+static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+static void flush_memcg_stats_work(struct work_struct *w);
+static DECLARE_WORK(stats_flush_work, flush_memcg_stats_work);
+static DEFINE_PER_CPU(unsigned int, stats_flush_threshold);
+static DEFINE_SPINLOCK(stats_flush_lock);
+
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
@@ -248,9 +256,9 @@ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
return &memcg->vmpressure;
}
-struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
+struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
{
- return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
+ return container_of(vmpr, struct mem_cgroup, vmpressure);
}
#ifdef CONFIG_MEMCG_KMEM
@@ -649,23 +657,11 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
return x;
}
-static struct mem_cgroup_per_node *
-parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
-{
- struct mem_cgroup *parent;
-
- parent = parent_mem_cgroup(pn->memcg);
- if (!parent)
- return NULL;
- return parent->nodeinfo[nid];
-}
-
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
- long x, threshold = MEMCG_CHARGE_BATCH;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
@@ -674,21 +670,9 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__mod_memcg_state(memcg, idx, val);
/* Update lruvec */
- __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
-
- if (vmstat_item_in_bytes(idx))
- threshold <<= PAGE_SHIFT;
-
- x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
- if (unlikely(abs(x) > threshold)) {
- pg_data_t *pgdat = lruvec_pgdat(lruvec);
- struct mem_cgroup_per_node *pi;
-
- for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
- atomic_long_add(x, &pi->lruvec_stat[idx]);
- x = 0;
- }
- __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+ if (!(__this_cpu_inc_return(stats_flush_threshold) % MEMCG_CHARGE_BATCH))
+ queue_work(system_unbound_wq, &stats_flush_work);
}
/**
@@ -2274,40 +2258,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
mutex_unlock(&percpu_charge_mutex);
}
-static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
-{
- int nid;
-
- for_each_node(nid) {
- struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
- unsigned long stat[NR_VM_NODE_STAT_ITEMS];
- struct batched_lruvec_stat *lstatc;
- int i;
-
- lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
- stat[i] = lstatc->count[i];
- lstatc->count[i] = 0;
- }
-
- do {
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- atomic_long_add(stat[i], &pn->lruvec_stat[i]);
- } while ((pn = parent_nodeinfo(pn, nid)));
- }
-}
-
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
- struct mem_cgroup *memcg;
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
- for_each_mem_cgroup(memcg)
- memcg_flush_lruvec_page_state(memcg, cpu);
-
return 0;
}
@@ -5113,17 +5070,9 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;
- pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
- GFP_KERNEL_ACCOUNT);
- if (!pn->lruvec_stat_local) {
- kfree(pn);
- return 1;
- }
-
- pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat,
- GFP_KERNEL_ACCOUNT);
- if (!pn->lruvec_stat_cpu) {
- free_percpu(pn->lruvec_stat_local);
+ pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
+ GFP_KERNEL_ACCOUNT);
+ if (!pn->lruvec_stats_percpu) {
kfree(pn);
return 1;
}
@@ -5144,8 +5093,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return;
- free_percpu(pn->lruvec_stat_cpu);
- free_percpu(pn->lruvec_stat_local);
+ free_percpu(pn->lruvec_stats_percpu);
kfree(pn);
}
@@ -5161,15 +5109,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
- int cpu;
-
memcg_wb_domain_exit(memcg);
- /*
- * Flush percpu lruvec stats to guarantee the value
- * correctness on parent's and all ancestor levels.
- */
- for_each_online_cpu(cpu)
- memcg_flush_lruvec_page_state(memcg, cpu);
__mem_cgroup_free(memcg);
}
@@ -5305,6 +5245,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
/* Online state pins memcg ID, memcg ID pins CSS */
refcount_set(&memcg->id.ref, 1);
css_get(css);
+
+ if (unlikely(mem_cgroup_is_root(memcg)))
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
+ 2UL*HZ);
return 0;
}
@@ -5396,13 +5340,33 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
memcg_wb_domain_size_changed(memcg);
}
+void mem_cgroup_flush_stats(void)
+{
+ if (!spin_trylock(&stats_flush_lock))
+ return;
+
+ cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
+ spin_unlock(&stats_flush_lock);
+}
+
+static void flush_memcg_stats_dwork(struct work_struct *w)
+{
+ mem_cgroup_flush_stats();
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+}
+
+static void flush_memcg_stats_work(struct work_struct *w)
+{
+ mem_cgroup_flush_stats();
+}
+
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
struct memcg_vmstats_percpu *statc;
long delta, v;
- int i;
+ int i, nid;
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
@@ -5450,6 +5414,36 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (parent)
parent->vmstats.events_pending[i] += delta;
}
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct mem_cgroup_per_node *ppn = NULL;
+ struct lruvec_stats_percpu *lstatc;
+
+ if (parent)
+ ppn = parent->nodeinfo[nid];
+
+ lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ delta = pn->lruvec_stats.state_pending[i];
+ if (delta)
+ pn->lruvec_stats.state_pending[i] = 0;
+
+ v = READ_ONCE(lstatc->state[i]);
+ if (v != lstatc->state_prev[i]) {
+ delta += v - lstatc->state_prev[i];
+ lstatc->state_prev[i] = v;
+ }
+
+ if (!delta)
+ continue;
+
+ pn->lruvec_stats.state[i] += delta;
+ if (ppn)
+ ppn->lruvec_stats.state_pending[i] += delta;
+ }
+ }
}
#ifdef CONFIG_MMU
@@ -6385,6 +6379,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
int i;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ cgroup_rstat_flush(memcg->css.cgroup);
+
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
int nid;
@@ -6712,7 +6708,7 @@ out:
}
/**
- * mem_cgroup_charge - Charge a newly allocated folio to a cgroup.
+ * __mem_cgroup_charge - Charge a newly allocated folio to a cgroup.
* @folio: Folio to charge.
* @mm: mm context of the allocating task.
* @gfp: Reclaim mode.
@@ -6725,14 +6721,11 @@ out:
*
* Return: 0 on success. Otherwise, an error code is returned.
*/
-int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
+int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
{
struct mem_cgroup *memcg;
int ret;
- if (mem_cgroup_disabled())
- return 0;
-
memcg = get_mem_cgroup_from_mm(mm);
ret = charge_memcg(folio, memcg, gfp);
css_put(&memcg->css);
@@ -6906,18 +6899,15 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
}
/**
- * mem_cgroup_uncharge - Uncharge a folio.
+ * __mem_cgroup_uncharge - Uncharge a folio.
* @folio: Folio to uncharge.
*
* Uncharge a folio previously charged with mem_cgroup_charge().
*/
-void mem_cgroup_uncharge(struct folio *folio)
+void __mem_cgroup_uncharge(struct folio *folio)
{
struct uncharge_gather ug;
- if (mem_cgroup_disabled())
- return;
-
/* Don't touch folio->lru of any random page, pre-check: */
if (!folio_memcg(folio))
return;
@@ -6928,20 +6918,17 @@ void mem_cgroup_uncharge(struct folio *folio)
}
/**
- * mem_cgroup_uncharge_list - uncharge a list of page
+ * __mem_cgroup_uncharge_list - uncharge a list of page
* @page_list: list of pages to uncharge
*
* Uncharge a list of pages previously charged with
- * mem_cgroup_charge().
+ * __mem_cgroup_charge().
*/
-void mem_cgroup_uncharge_list(struct list_head *page_list)
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
{
struct uncharge_gather ug;
struct folio *folio;
- if (mem_cgroup_disabled())
- return;
-
uncharge_gather_clear(&ug);
list_for_each_entry(folio, page_list, lru)
uncharge_folio(folio, &ug);
@@ -7230,7 +7217,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
}
/**
- * mem_cgroup_try_charge_swap - try charging swap space for a page
+ * __mem_cgroup_try_charge_swap - try charging swap space for a page
* @page: page being added to swap
* @entry: swap entry to charge
*
@@ -7238,16 +7225,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
*
* Returns 0 on success, -ENOMEM on failure.
*/
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
{
unsigned int nr_pages = thp_nr_pages(page);
struct page_counter *counter;
struct mem_cgroup *memcg;
unsigned short oldid;
- if (mem_cgroup_disabled())
- return 0;
-
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
@@ -7283,11 +7267,11 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
}
/**
- * mem_cgroup_uncharge_swap - uncharge swap space
+ * __mem_cgroup_uncharge_swap - uncharge swap space
* @entry: swap entry to uncharge
* @nr_pages: the amount of swap space to uncharge
*/
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
struct mem_cgroup *memcg;
unsigned short id;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index de92e3cf09ee..03f83e7d075b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2093,7 +2093,7 @@ static int __soft_offline_page(struct page *page)
if (isolate_page(hpage, &pagelist)) {
ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
if (!ret) {
bool release = !huge;
diff --git a/mm/memory.c b/mm/memory.c
index 627e7836ade6..7a050ed0b2b0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5434,7 +5434,7 @@ long copy_huge_page_from_user(struct page *dst_page,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
-#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
+#if ALLOC_SPLIT_PTLOCKS
static struct kmem_cache *page_ptl_cachep;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8cb75b26ea4f..d45c69d78b83 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -708,8 +708,8 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn
return movable_node_enabled ? movable_zone : kernel_zone;
}
-struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
- unsigned long nr_pages)
+struct zone *zone_for_pfn_range(int online_type, int nid,
+ unsigned long start_pfn, unsigned long nr_pages)
{
if (online_type == MMOP_ONLINE_KERNEL)
return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
@@ -1106,7 +1106,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
/* create memory block devices after memory was added */
ret = create_memory_block_devices(start, size, mhp_altmap.alloc);
if (ret) {
- arch_remove_memory(nid, start, size, NULL);
+ arch_remove_memory(start, size, NULL);
goto error;
}
@@ -1298,7 +1298,7 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn,
unsigned long pfn, sec_end_pfn;
struct zone *zone = NULL;
struct page *page;
- int i;
+
for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
pfn < end_pfn;
pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
@@ -1307,17 +1307,10 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn,
continue;
for (; pfn < sec_end_pfn && pfn < end_pfn;
pfn += MAX_ORDER_NR_PAGES) {
- i = 0;
- /* This is just a CONFIG_HOLES_IN_ZONE check.*/
- while ((i < MAX_ORDER_NR_PAGES) &&
- !pfn_valid_within(pfn + i))
- i++;
- if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
- continue;
/* Check if we got outside of the zone */
- if (zone && !zone_spans_pfn(zone, pfn + i))
+ if (zone && !zone_spans_pfn(zone, pfn))
return NULL;
- page = pfn_to_page(pfn + i);
+ page = pfn_to_page(pfn);
if (zone && page_zone(page) != zone)
return NULL;
zone = page_zone(page);
@@ -1469,7 +1462,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (nodes_empty(nmask))
node_set(mtc.nid, nmask);
ret = migrate_pages(&source, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL);
if (ret) {
list_for_each_entry(page, &source, lru) {
if (__ratelimit(&migrate_rs)) {
@@ -1745,7 +1738,9 @@ failed_removal:
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
+ int *nid = arg;
+ *nid = mem->nid;
if (unlikely(ret)) {
phys_addr_t beginpa, endpa;
@@ -1838,12 +1833,12 @@ void try_offline_node(int nid)
}
EXPORT_SYMBOL(try_offline_node);
-static int __ref try_remove_memory(int nid, u64 start, u64 size)
+static int __ref try_remove_memory(u64 start, u64 size)
{
- int rc = 0;
struct vmem_altmap mhp_altmap = {};
struct vmem_altmap *altmap = NULL;
unsigned long nr_vmemmap_pages;
+ int rc = 0, nid = NUMA_NO_NODE;
BUG_ON(check_hotplug_memory_range(start, size));
@@ -1851,8 +1846,12 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
* All memory blocks must be offlined before removing memory. Check
* whether all memory blocks in question are offline and return error
* if this is not the case.
+ *
+ * While at it, determine the nid. Note that if we'd have mixed nodes,
+ * we'd only try to offline the last determined one -- which is good
+ * enough for the cases we care about.
*/
- rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
+ rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb);
if (rc)
return rc;
@@ -1892,7 +1891,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
mem_hotplug_begin();
- arch_remove_memory(nid, start, size, altmap);
+ arch_remove_memory(start, size, altmap);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
memblock_free(start, size);
@@ -1901,7 +1900,8 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
release_mem_region_adjustable(start, size);
- try_offline_node(nid);
+ if (nid != NUMA_NO_NODE)
+ try_offline_node(nid);
mem_hotplug_done();
return 0;
@@ -1909,7 +1909,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
/**
* __remove_memory - Remove memory if every memory block is offline
- * @nid: the node ID
* @start: physical address of the region to remove
* @size: size of the region to remove
*
@@ -1917,14 +1916,14 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
* and online/offline operations before this call, as required by
* try_offline_node().
*/
-void __remove_memory(int nid, u64 start, u64 size)
+void __remove_memory(u64 start, u64 size)
{
/*
* trigger BUG() if some memory is not offlined prior to calling this
* function
*/
- if (try_remove_memory(nid, start, size))
+ if (try_remove_memory(start, size))
BUG();
}
@@ -1932,12 +1931,12 @@ void __remove_memory(int nid, u64 start, u64 size)
* Remove memory if every memory block is offline, otherwise return -EBUSY is
* some memory is not offline
*/
-int remove_memory(int nid, u64 start, u64 size)
+int remove_memory(u64 start, u64 size)
{
int rc;
lock_device_hotplug();
- rc = try_remove_memory(nid, start, size);
+ rc = try_remove_memory(start, size);
unlock_device_hotplug();
return rc;
@@ -1997,7 +1996,7 @@ static int try_reonline_memory_block(struct memory_block *mem, void *arg)
* unplugged all memory (so it's no longer in use) and want to offline + remove
* that memory.
*/
-int offline_and_remove_memory(int nid, u64 start, u64 size)
+int offline_and_remove_memory(u64 start, u64 size)
{
const unsigned long mb_count = size / memory_block_size_bytes();
uint8_t *online_types, *tmp;
@@ -2033,7 +2032,7 @@ int offline_and_remove_memory(int nid, u64 start, u64 size)
* This cannot fail as it cannot get onlined in the meantime.
*/
if (!rc) {
- rc = try_remove_memory(nid, start, size);
+ rc = try_remove_memory(start, size);
if (rc)
pr_err("%s: Failed to remove memory: %d", __func__, rc);
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 95d0cf05f7ca..cedc81679f03 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -31,6 +31,9 @@
* but useful to set in a VMA when you have a non default
* process policy.
*
+ * preferred many Try a set of nodes first before normal fallback. This is
+ * similar to preferred without the special case.
+ *
* default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default.
@@ -120,7 +123,7 @@ enum zone_type policy_zone = 0;
* run-time system-wide default policy => local allocation
*/
static struct mempolicy default_policy = {
- .refcnt = ATOMIC_INIT(1), /* never free it */
+ .refcnt = { ATOMIC_INIT(1), }, /* never free it */
.mode = MPOL_LOCAL,
};
@@ -189,7 +192,7 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
nodes_onto(*ret, tmp, *rel);
}
-static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
+static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
@@ -207,14 +210,6 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
return 0;
}
-static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
-{
- if (nodes_empty(*nodes))
- return -EINVAL;
- pol->nodes = *nodes;
- return 0;
-}
-
/*
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
* any, for the new policy. mpol_new() has already validated the nodes
@@ -298,7 +293,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
- atomic_set(&policy->refcnt, 1);
+ refcount_set(&policy->refcnt, 1);
policy->mode = mode;
policy->flags = flags;
@@ -308,7 +303,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
- if (!atomic_dec_and_test(&p->refcnt))
+ if (!refcount_dec_and_test(&p->refcnt))
return;
kmem_cache_free(policy_cache, p);
}
@@ -394,7 +389,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.rebind = mpol_rebind_default,
},
[MPOL_INTERLEAVE] = {
- .create = mpol_new_interleave,
+ .create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
[MPOL_PREFERRED] = {
@@ -402,12 +397,16 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.rebind = mpol_rebind_preferred,
},
[MPOL_BIND] = {
- .create = mpol_new_bind,
+ .create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
[MPOL_LOCAL] = {
.rebind = mpol_rebind_default,
},
+ [MPOL_PREFERRED_MANY] = {
+ .create = mpol_new_nodemask,
+ .rebind = mpol_rebind_preferred,
+ },
};
static int migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -900,6 +899,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
*nodes = p->nodes;
break;
case MPOL_LOCAL:
@@ -1084,7 +1084,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
if (err)
putback_movable_pages(&pagelist);
}
@@ -1338,7 +1338,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_page, NULL,
- start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+ start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
if (nr_failed)
putback_movable_pages(&pagelist);
}
@@ -1446,7 +1446,8 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
*flags = *mode & MPOL_MODE_FLAGS;
*mode &= ~MPOL_MODE_FLAGS;
- if ((unsigned int)(*mode) >= MPOL_MAX)
+
+ if ((unsigned int)(*mode) >= MPOL_MAX)
return -EINVAL;
if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
@@ -1887,7 +1888,8 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
/* Return the node id preferred by the given mempolicy, or the given id */
static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
- if (policy->mode == MPOL_PREFERRED) {
+ if (policy->mode == MPOL_PREFERRED ||
+ policy->mode == MPOL_PREFERRED_MANY) {
nd = first_node(policy->nodes);
} else {
/*
@@ -1931,6 +1933,7 @@ unsigned int mempolicy_slab_node(void)
switch (policy->mode) {
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
return first_node(policy->nodes);
case MPOL_INTERLEAVE:
@@ -2030,7 +2033,8 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
huge_page_shift(hstate_vma(vma)));
} else {
nid = policy_node(gfp_flags, *mpol, numa_node_id());
- if ((*mpol)->mode == MPOL_BIND)
+ if ((*mpol)->mode == MPOL_BIND ||
+ (*mpol)->mode == MPOL_PREFERRED_MANY)
*nodemask = &(*mpol)->nodes;
}
return nid;
@@ -2063,6 +2067,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
*mask = mempolicy->nodes;
@@ -2128,6 +2133,25 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
return page;
}
+static struct page *alloc_page_preferred_many(gfp_t gfp, unsigned int order,
+ struct mempolicy *pol)
+{
+ struct page *page;
+
+ /*
+ * This is a two pass approach. The first pass will only try the
+ * preferred nodes but skip the direct reclaim and allow the
+ * allocation to fail, while the second pass will try all the
+ * nodes in system.
+ */
+ page = __alloc_pages(((gfp | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM),
+ order, first_node(pol->nodes), &pol->nodes);
+ if (!page)
+ page = __alloc_pages(gfp, order, numa_node_id(), NULL);
+
+ return page;
+}
+
/**
* alloc_pages_vma - Allocate a page for a VMA.
* @gfp: GFP flags.
@@ -2163,6 +2187,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
goto out;
}
+ if (pol->mode == MPOL_PREFERRED_MANY) {
+ page = alloc_page_preferred_many(gfp, order, pol);
+ mpol_cond_put(pol);
+ goto out;
+ }
+
if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
int hpage_node = node;
@@ -2173,10 +2203,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
* node and don't fall back to other nodes, as the cost of
* remote accesses would likely offset THP benefits.
*
- * If the policy is interleave, or does not allow the current
- * node in its nodemask, we allocate the standard way.
+ * If the policy is interleave or multiple preferred nodes, or
+ * does not allow the current node in its nodemask, we allocate
+ * the standard way.
*/
- if (pol->mode == MPOL_PREFERRED)
+ if ((pol->mode == MPOL_PREFERRED ||
+ pol->mode == MPOL_PREFERRED_MANY))
hpage_node = first_node(pol->nodes);
nmask = policy_nodemask(gfp, pol);
@@ -2240,6 +2272,8 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
*/
if (pol->mode == MPOL_INTERLEAVE)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+ else if (pol->mode == MPOL_PREFERRED_MANY)
+ page = alloc_page_preferred_many(gfp, order, pol);
else
page = __alloc_pages(gfp, order,
policy_node(gfp, pol, numa_node_id()),
@@ -2300,7 +2334,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
nodemask_t mems = cpuset_mems_allowed(current);
mpol_rebind_policy(new, &mems);
}
- atomic_set(&new->refcnt, 1);
+ refcount_set(&new->refcnt, 1);
return new;
}
@@ -2321,6 +2355,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
return !!nodes_equal(a->nodes, b->nodes);
case MPOL_LOCAL:
return true;
@@ -2461,6 +2496,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
break;
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
+ if (node_isset(curnid, pol->nodes))
+ goto out;
polnid = first_node(pol->nodes);
break;
@@ -2591,7 +2629,7 @@ restart:
goto alloc_new;
*mpol_new = *n->policy;
- atomic_set(&mpol_new->refcnt, 1);
+ refcount_set(&mpol_new->refcnt, 1);
sp_node_init(n_new, end, n->end, mpol_new);
n->end = start;
sp_insert(sp, n_new);
@@ -2785,7 +2823,7 @@ void __init numa_policy_init(void)
for_each_node(nid) {
preferred_node_policy[nid] = (struct mempolicy) {
- .refcnt = ATOMIC_INIT(1),
+ .refcnt = { ATOMIC_INIT(1), },
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
.nodes = nodemask_of_node(nid),
@@ -2839,6 +2877,7 @@ static const char * const policy_modes[] =
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
[MPOL_LOCAL] = "local",
+ [MPOL_PREFERRED_MANY] = "prefer (many)",
};
@@ -2917,6 +2956,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
if (!nodelist)
err = 0;
goto out;
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
/*
* Insist on a nodelist
@@ -3003,6 +3043,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
case MPOL_LOCAL:
break;
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
nodes = pol->nodes;
@@ -3031,3 +3072,64 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}
+
+bool numa_demotion_enabled = false;
+
+#ifdef CONFIG_SYSFS
+static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n",
+ numa_demotion_enabled? "true" : "false");
+}
+
+static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
+ numa_demotion_enabled = true;
+ else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
+ numa_demotion_enabled = false;
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static struct kobj_attribute numa_demotion_enabled_attr =
+ __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
+ numa_demotion_enabled_store);
+
+static struct attribute *numa_attrs[] = {
+ &numa_demotion_enabled_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group numa_attr_group = {
+ .attrs = numa_attrs,
+};
+
+static int __init numa_init_sysfs(void)
+{
+ int err;
+ struct kobject *numa_kobj;
+
+ numa_kobj = kobject_create_and_add("numa", mm_kobj);
+ if (!numa_kobj) {
+ pr_err("failed to create numa kobject\n");
+ return -ENOMEM;
+ }
+ err = sysfs_create_group(numa_kobj, &numa_attr_group);
+ if (err) {
+ pr_err("failed to register numa group\n");
+ goto delete_obj;
+ }
+ return 0;
+
+delete_obj:
+ kobject_put(numa_kobj);
+ return err;
+}
+subsys_initcall(numa_init_sysfs);
+#endif
diff --git a/mm/memremap.c b/mm/memremap.c
index 6eac40f9f62a..84de22c14567 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -109,6 +109,26 @@ static unsigned long pfn_next(unsigned long pfn)
return pfn + 1;
}
+/*
+ * This returns true if the page is reserved by ZONE_DEVICE driver.
+ */
+bool pfn_zone_device_reserved(unsigned long pfn)
+{
+ struct dev_pagemap *pgmap;
+ struct vmem_altmap *altmap;
+ bool ret = false;
+
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (!pgmap)
+ return ret;
+ altmap = pgmap_altmap(pgmap);
+ if (altmap && pfn < (altmap->base_pfn + altmap->reserve))
+ ret = true;
+ put_dev_pagemap(pgmap);
+
+ return ret;
+}
+
#define for_each_device_pfn(pfn, map, i) \
for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
@@ -140,14 +160,11 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
{
struct range *range = &pgmap->ranges[range_id];
struct page *first_page;
- int nid;
/* make sure to access a memmap that was actually initialized */
first_page = pfn_to_page(pfn_first(pgmap, range_id));
/* pages are dead and unused, undo the arch mapping */
- nid = page_to_nid(first_page);
-
mem_hotplug_begin();
remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(range->start),
PHYS_PFN(range_len(range)));
@@ -155,7 +172,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
__remove_pages(PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), NULL);
} else {
- arch_remove_memory(nid, range->start, range_len(range),
+ arch_remove_memory(range->start, range_len(range),
pgmap_altmap(pgmap));
kasan_remove_zero_shadow(__va(range->start), range_len(range));
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 36cdae0a1235..7d06515d94de 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -49,6 +49,7 @@
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
#include <linux/oom.h>
+#include <linux/memory.h>
#include <asm/tlbflush.h>
@@ -1098,6 +1099,80 @@ out:
return rc;
}
+
+/*
+ * node_demotion[] example:
+ *
+ * Consider a system with two sockets. Each socket has
+ * three classes of memory attached: fast, medium and slow.
+ * Each memory class is placed in its own NUMA node. The
+ * CPUs are placed in the node with the "fast" memory. The
+ * 6 NUMA nodes (0-5) might be split among the sockets like
+ * this:
+ *
+ * Socket A: 0, 1, 2
+ * Socket B: 3, 4, 5
+ *
+ * When Node 0 fills up, its memory should be migrated to
+ * Node 1. When Node 1 fills up, it should be migrated to
+ * Node 2. The migration path start on the nodes with the
+ * processors (since allocations default to this node) and
+ * fast memory, progress through medium and end with the
+ * slow memory:
+ *
+ * 0 -> 1 -> 2 -> stop
+ * 3 -> 4 -> 5 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ *
+ * { 1, // Node 0 migrates to 1
+ * 2, // Node 1 migrates to 2
+ * -1, // Node 2 does not migrate
+ * 4, // Node 3 migrates to 4
+ * 5, // Node 4 migrates to 5
+ * -1} // Node 5 does not migrate
+ */
+
+/*
+ * Writes to this array occur without locking. Cycles are
+ * not allowed: Node X demotes to Y which demotes to X...
+ *
+ * If multiple reads are performed, a single rcu_read_lock()
+ * must be held over all reads to ensure that no cycles are
+ * observed.
+ */
+static int node_demotion[MAX_NUMNODES] __read_mostly =
+ {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE};
+
+/**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node to lookup the next node
+ *
+ * @returns: node id for next memory node in the demotion path hierarchy
+ * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
+ * @node online or guarantee that it *continues* to be the next demotion
+ * target.
+ */
+int next_demotion_node(int node)
+{
+ int target;
+
+ /*
+ * node_demotion[] is updated without excluding this
+ * function from running. RCU doesn't provide any
+ * compiler barriers, so the READ_ONCE() is required
+ * to avoid compiler reordering or read merging.
+ *
+ * Make sure to use RCU over entire code blocks if
+ * node_demotion[] reads need to be consistent.
+ */
+ rcu_read_lock();
+ target = READ_ONCE(node_demotion[node]);
+ rcu_read_unlock();
+
+ return target;
+}
+
/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
@@ -1353,6 +1428,8 @@ static inline int try_split_thp(struct page *page, struct page **page2,
* @mode: The migration mode that specifies the constraints for
* page migration, if any.
* @reason: The reason for page migration.
+ * @ret_succeeded: Set to the number of pages migrated successfully if
+ * the caller passes a non-NULL pointer.
*
* The function returns after 10 attempts or if no pages are movable any more
* because the list has become empty or no retryable pages exist any more.
@@ -1363,7 +1440,7 @@ static inline int try_split_thp(struct page *page, struct page **page2,
*/
int migrate_pages(struct list_head *from, new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
- enum migrate_mode mode, int reason)
+ enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
{
int retry = 1;
int thp_retry = 1;
@@ -1518,6 +1595,9 @@ out:
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
+ if (ret_succeeded)
+ *ret_succeeded = nr_succeeded;
+
return rc;
}
@@ -1587,7 +1667,7 @@ static int do_move_pages_to_node(struct mm_struct *mm,
};
err = migrate_pages(pagelist, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
if (err)
putback_movable_pages(pagelist);
return err;
@@ -2102,7 +2182,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
list_add(&page->lru, &migratepages);
nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
- MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+ MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL);
if (nr_remaining) {
if (!list_empty(&migratepages)) {
list_del(&page->lru);
@@ -2981,3 +3061,232 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
}
EXPORT_SYMBOL(migrate_vma_finalize);
#endif /* CONFIG_DEVICE_PRIVATE */
+
+#if defined(CONFIG_MEMORY_HOTPLUG)
+/* Disable reclaim-based migration. */
+static void __disable_all_migrate_targets(void)
+{
+ int node;
+
+ for_each_online_node(node)
+ node_demotion[node] = NUMA_NO_NODE;
+}
+
+static void disable_all_migrate_targets(void)
+{
+ __disable_all_migrate_targets();
+
+ /*
+ * Ensure that the "disable" is visible across the system.
+ * Readers will see either a combination of before+disable
+ * state or disable+after. They will never see before and
+ * after state together.
+ *
+ * The before+after state together might have cycles and
+ * could cause readers to do things like loop until this
+ * function finishes. This ensures they can only see a
+ * single "bad" read and would, for instance, only loop
+ * once.
+ */
+ synchronize_rcu();
+}
+
+/*
+ * Find an automatic demotion target for 'node'.
+ * Failing here is OK. It might just indicate
+ * being at the end of a chain.
+ */
+static int establish_migrate_target(int node, nodemask_t *used)
+{
+ int migration_target;
+
+ /*
+ * Can not set a migration target on a
+ * node with it already set.
+ *
+ * No need for READ_ONCE() here since this
+ * in the write path for node_demotion[].
+ * This should be the only thread writing.
+ */
+ if (node_demotion[node] != NUMA_NO_NODE)
+ return NUMA_NO_NODE;
+
+ migration_target = find_next_best_node(node, used);
+ if (migration_target == NUMA_NO_NODE)
+ return NUMA_NO_NODE;
+
+ node_demotion[node] = migration_target;
+
+ return migration_target;
+}
+
+/*
+ * When memory fills up on a node, memory contents can be
+ * automatically migrated to another node instead of
+ * discarded at reclaim.
+ *
+ * Establish a "migration path" which will start at nodes
+ * with CPUs and will follow the priorities used to build the
+ * page allocator zonelists.
+ *
+ * The difference here is that cycles must be avoided. If
+ * node0 migrates to node1, then neither node1, nor anything
+ * node1 migrates to can migrate to node0.
+ *
+ * This function can run simultaneously with readers of
+ * node_demotion[]. However, it can not run simultaneously
+ * with itself. Exclusion is provided by memory hotplug events
+ * being single-threaded.
+ */
+static void __set_migration_target_nodes(void)
+{
+ nodemask_t next_pass = NODE_MASK_NONE;
+ nodemask_t this_pass = NODE_MASK_NONE;
+ nodemask_t used_targets = NODE_MASK_NONE;
+ int node;
+
+ /*
+ * Avoid any oddities like cycles that could occur
+ * from changes in the topology. This will leave
+ * a momentary gap when migration is disabled.
+ */
+ disable_all_migrate_targets();
+
+ /*
+ * Allocations go close to CPUs, first. Assume that
+ * the migration path starts at the nodes with CPUs.
+ */
+ next_pass = node_states[N_CPU];
+again:
+ this_pass = next_pass;
+ next_pass = NODE_MASK_NONE;
+ /*
+ * To avoid cycles in the migration "graph", ensure
+ * that migration sources are not future targets by
+ * setting them in 'used_targets'. Do this only
+ * once per pass so that multiple source nodes can
+ * share a target node.
+ *
+ * 'used_targets' will become unavailable in future
+ * passes. This limits some opportunities for
+ * multiple source nodes to share a destination.
+ */
+ nodes_or(used_targets, used_targets, this_pass);
+ for_each_node_mask(node, this_pass) {
+ int target_node = establish_migrate_target(node, &used_targets);
+
+ if (target_node == NUMA_NO_NODE)
+ continue;
+
+ /*
+ * Visit targets from this pass in the next pass.
+ * Eventually, every node will have been part of
+ * a pass, and will become set in 'used_targets'.
+ */
+ node_set(target_node, next_pass);
+ }
+ /*
+ * 'next_pass' contains nodes which became migration
+ * targets in this pass. Make additional passes until
+ * no more migrations targets are available.
+ */
+ if (!nodes_empty(next_pass))
+ goto again;
+}
+
+/*
+ * For callers that do not hold get_online_mems() already.
+ */
+static void set_migration_target_nodes(void)
+{
+ get_online_mems();
+ __set_migration_target_nodes();
+ put_online_mems();
+}
+
+/*
+ * React to hotplug events that might affect the migration targets
+ * like events that online or offline NUMA nodes.
+ *
+ * The ordering is also currently dependent on which nodes have
+ * CPUs. That means we need CPU on/offline notification too.
+ */
+static int migration_online_cpu(unsigned int cpu)
+{
+ set_migration_target_nodes();
+ return 0;
+}
+
+static int migration_offline_cpu(unsigned int cpu)
+{
+ set_migration_target_nodes();
+ return 0;
+}
+
+/*
+ * This leaves migrate-on-reclaim transiently disabled between
+ * the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs
+ * whether reclaim-based migration is enabled or not, which
+ * ensures that the user can turn reclaim-based migration at
+ * any time without needing to recalculate migration targets.
+ *
+ * These callbacks already hold get_online_mems(). That is why
+ * __set_migration_target_nodes() can be used as opposed to
+ * set_migration_target_nodes().
+ */
+static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ /*
+ * Make sure there are not transient states where
+ * an offline node is a migration target. This
+ * will leave migration disabled until the offline
+ * completes and the MEM_OFFLINE case below runs.
+ */
+ disable_all_migrate_targets();
+ break;
+ case MEM_OFFLINE:
+ case MEM_ONLINE:
+ /*
+ * Recalculate the target nodes once the node
+ * reaches its final state (online or offline).
+ */
+ __set_migration_target_nodes();
+ break;
+ case MEM_CANCEL_OFFLINE:
+ /*
+ * MEM_GOING_OFFLINE disabled all the migration
+ * targets. Reenable them.
+ */
+ __set_migration_target_nodes();
+ break;
+ case MEM_GOING_ONLINE:
+ case MEM_CANCEL_ONLINE:
+ break;
+ }
+
+ return notifier_from_errno(0);
+}
+
+static int __init migrate_on_reclaim_init(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim",
+ migration_online_cpu,
+ migration_offline_cpu);
+ /*
+ * In the unlikely case that this fails, the automatic
+ * migration targets may become suboptimal for nodes
+ * where N_CPU changes. With such a small impact in a
+ * rare case, do not bother trying to do anything special.
+ */
+ WARN_ON(ret < 0);
+
+ hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+ return 0;
+}
+late_initcall(migrate_on_reclaim_init);
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index f5852a058ce0..1854850b4b89 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -156,14 +156,14 @@ static inline void put_memcg_path_buf(void)
#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
do { \
const char *memcg_path; \
- preempt_disable(); \
+ local_lock(&memcg_paths.lock); \
memcg_path = get_mm_memcg_path(mm); \
trace_mmap_lock_##type(mm, \
memcg_path != NULL ? memcg_path : "", \
##__VA_ARGS__); \
if (likely(memcg_path != NULL)) \
put_memcg_path_buf(); \
- preempt_enable(); \
+ local_unlock(&memcg_paths.lock); \
} while (0)
#else /* !CONFIG_MEMCG */
diff --git a/mm/mremap.c b/mm/mremap.c
index 5989d3990020..badfe17ade1f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -686,7 +686,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
/* OOM: unable to split vma, just get accounts right */
if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
- vm_acct_memory(new_len >> PAGE_SHIFT);
+ vm_acct_memory(old_len >> PAGE_SHIFT);
excess = 0;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c729a4c4a1ac..a15ddf7fc3b8 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -233,8 +233,11 @@ long oom_badness(struct task_struct *p, unsigned long totalpages)
mm_pgtables_bytes(p->mm) / PAGE_SIZE;
task_unlock(p);
- /* Normalize to oom_score_adj units */
- adj *= totalpages / 1000;
+ /*
+ * Normalize to oom_score_adj units. You should never
+ * multiply by zero here, or oom_score_adj will not work.
+ */
+ adj *= (totalpages + 1000) / 1000;
points += adj;
return points;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c2987f05c944..96b69365de65 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -183,7 +183,7 @@ static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
static void wb_min_max_ratio(struct bdi_writeback *wb,
unsigned long *minp, unsigned long *maxp)
{
- unsigned long this_bw = wb->avg_write_bandwidth;
+ unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
unsigned long long min = wb->bdi->min_ratio;
unsigned long long max = wb->bdi->max_ratio;
@@ -892,7 +892,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
static void wb_position_ratio(struct dirty_throttle_control *dtc)
{
struct bdi_writeback *wb = dtc->wb;
- unsigned long write_bw = wb->avg_write_bandwidth;
+ unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
unsigned long wb_thresh = dtc->wb_thresh;
@@ -1115,7 +1115,7 @@ out:
&wb->bdi->tot_write_bandwidth) <= 0);
}
wb->write_bandwidth = bw;
- wb->avg_write_bandwidth = avg;
+ WRITE_ONCE(wb->avg_write_bandwidth, avg);
}
static void update_dirty_limit(struct dirty_throttle_control *dtc)
@@ -1147,8 +1147,8 @@ update:
dom->dirty_limit = limit;
}
-static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
- unsigned long now)
+static void domain_update_dirty_limit(struct dirty_throttle_control *dtc,
+ unsigned long now)
{
struct wb_domain *dom = dtc_dom(dtc);
@@ -1324,7 +1324,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
else
dirty_ratelimit -= step;
- wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+ WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL));
wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
@@ -1332,35 +1332,21 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
struct dirty_throttle_control *mdtc,
- unsigned long start_time,
bool update_ratelimit)
{
struct bdi_writeback *wb = gdtc->wb;
unsigned long now = jiffies;
- unsigned long elapsed = now - wb->bw_time_stamp;
+ unsigned long elapsed;
unsigned long dirtied;
unsigned long written;
- lockdep_assert_held(&wb->list_lock);
-
- /*
- * rate-limit, only update once every 200ms.
- */
- if (elapsed < BANDWIDTH_INTERVAL)
- return;
-
+ spin_lock(&wb->list_lock);
+ elapsed = now - wb->bw_time_stamp;
dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
- /*
- * Skip quiet periods when disk bandwidth is under-utilized.
- * (at least 1s idle time between two flusher runs)
- */
- if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
- goto snapshot;
-
if (update_ratelimit) {
- domain_update_bandwidth(gdtc, now);
+ domain_update_dirty_limit(gdtc, now);
wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
/*
@@ -1368,23 +1354,41 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
* compiler has no way to figure that out. Help it.
*/
if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
- domain_update_bandwidth(mdtc, now);
+ domain_update_dirty_limit(mdtc, now);
wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
}
}
wb_update_write_bandwidth(wb, elapsed, written);
-snapshot:
wb->dirtied_stamp = dirtied;
wb->written_stamp = written;
- wb->bw_time_stamp = now;
+ WRITE_ONCE(wb->bw_time_stamp, now);
+ spin_unlock(&wb->list_lock);
}
-void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
+void wb_update_bandwidth(struct bdi_writeback *wb)
{
struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
- __wb_update_bandwidth(&gdtc, NULL, start_time, false);
+ __wb_update_bandwidth(&gdtc, NULL, false);
+}
+
+/* Interval after which we consider wb idle and don't estimate bandwidth */
+#define WB_BANDWIDTH_IDLE_JIF (HZ)
+
+static void wb_bandwidth_estimate_start(struct bdi_writeback *wb)
+{
+ unsigned long now = jiffies;
+ unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp);
+
+ if (elapsed > WB_BANDWIDTH_IDLE_JIF &&
+ !atomic_read(&wb->writeback_inodes)) {
+ spin_lock(&wb->list_lock);
+ wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED);
+ wb->written_stamp = wb_stat(wb, WB_WRITTEN);
+ WRITE_ONCE(wb->bw_time_stamp, now);
+ spin_unlock(&wb->list_lock);
+ }
}
/*
@@ -1407,7 +1411,7 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
static unsigned long wb_max_pause(struct bdi_writeback *wb,
unsigned long wb_dirty)
{
- unsigned long bw = wb->avg_write_bandwidth;
+ unsigned long bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long t;
/*
@@ -1429,8 +1433,8 @@ static long wb_min_pause(struct bdi_writeback *wb,
unsigned long dirty_ratelimit,
int *nr_dirtied_pause)
{
- long hi = ilog2(wb->avg_write_bandwidth);
- long lo = ilog2(wb->dirty_ratelimit);
+ long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth));
+ long lo = ilog2(READ_ONCE(wb->dirty_ratelimit));
long t; /* target pause */
long pause; /* estimated next pause */
int pages; /* target nr_dirtied_pause */
@@ -1710,15 +1714,12 @@ free_running:
if (dirty_exceeded && !wb->dirty_exceeded)
wb->dirty_exceeded = 1;
- if (time_is_before_jiffies(wb->bw_time_stamp +
- BANDWIDTH_INTERVAL)) {
- spin_lock(&wb->list_lock);
- __wb_update_bandwidth(gdtc, mdtc, start_time, true);
- spin_unlock(&wb->list_lock);
- }
+ if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
+ BANDWIDTH_INTERVAL))
+ __wb_update_bandwidth(gdtc, mdtc, true);
/* throttle according to the chosen dtc */
- dirty_ratelimit = wb->dirty_ratelimit;
+ dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
RATELIMIT_CALC_SHIFT;
max_pause = wb_max_pause(wb, sdtc->wb_dirty);
@@ -2347,9 +2348,12 @@ EXPORT_SYMBOL(generic_writepages);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
+ struct bdi_writeback *wb;
if (wbc->nr_to_write <= 0)
return 0;
+ wb = inode_to_wb_wbc(mapping->host, wbc);
+ wb_bandwidth_estimate_start(wb);
while (1) {
if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
@@ -2360,6 +2364,14 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
cond_resched();
congestion_wait(BLK_RW_ASYNC, HZ/50);
}
+ /*
+ * Usually few pages are written by now from those we've just submitted
+ * but if there's constant writeback being submitted, this makes sure
+ * writeback bandwidth is updated once in a while.
+ */
+ if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
+ BANDWIDTH_INTERVAL))
+ wb_update_bandwidth(wb);
return ret;
}
@@ -2755,6 +2767,24 @@ bool folio_clear_dirty_for_io(struct folio *folio)
}
EXPORT_SYMBOL(folio_clear_dirty_for_io);
+static void wb_inode_writeback_start(struct bdi_writeback *wb)
+{
+ atomic_inc(&wb->writeback_inodes);
+}
+
+static void wb_inode_writeback_end(struct bdi_writeback *wb)
+{
+ atomic_dec(&wb->writeback_inodes);
+ /*
+ * Make sure estimate of writeback throughput gets updated after
+ * writeback completed. We delay the update by BANDWIDTH_INTERVAL
+ * (which is the interval other bandwidth updates use for batching) so
+ * that if multiple inodes end writeback at a similar time, they get
+ * batched into one bandwidth update.
+ */
+ queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
+}
+
bool __folio_end_writeback(struct folio *folio)
{
long nr = folio_nr_pages(folio);
@@ -2777,6 +2807,9 @@ bool __folio_end_writeback(struct folio *folio)
wb_stat_mod(wb, WB_WRITEBACK, -nr);
__wb_writeout_add(wb, nr);
+ if (!mapping_tagged(mapping,
+ PAGECACHE_TAG_WRITEBACK))
+ wb_inode_writeback_end(wb);
}
}
@@ -2821,9 +2854,13 @@ bool __folio_start_writeback(struct folio *folio, bool keep_write)
PAGECACHE_TAG_WRITEBACK);
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
- if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
- wb_stat_mod(inode_to_wb(inode), WB_WRITEBACK,
- nr);
+ if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ wb_stat_mod(wb, WB_WRITEBACK, nr);
+ if (!on_wblist)
+ wb_inode_writeback_start(wb);
+ }
/*
* We can come through here when swapping
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 77d2a3811bdd..4d986e2d70b9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -594,8 +594,6 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
static int page_is_consistent(struct zone *zone, struct page *page)
{
- if (!pfn_valid_within(page_to_pfn(page)))
- return 0;
if (zone != page_zone(page))
return 0;
@@ -840,21 +838,24 @@ void init_mem_debugging_and_hardening(void)
}
#endif
- if (_init_on_alloc_enabled_early) {
- if (page_poisoning_requested)
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
- "will take precedence over init_on_alloc\n");
- else
- static_branch_enable(&init_on_alloc);
- }
- if (_init_on_free_enabled_early) {
- if (page_poisoning_requested)
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
- "will take precedence over init_on_free\n");
- else
- static_branch_enable(&init_on_free);
+ if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
+ page_poisoning_requested) {
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
+ "will take precedence over init_on_alloc and init_on_free\n");
+ _init_on_alloc_enabled_early = false;
+ _init_on_free_enabled_early = false;
}
+ if (_init_on_alloc_enabled_early)
+ static_branch_enable(&init_on_alloc);
+ else
+ static_branch_disable(&init_on_alloc);
+
+ if (_init_on_free_enabled_early)
+ static_branch_enable(&init_on_free);
+ else
+ static_branch_disable(&init_on_free);
+
#ifdef CONFIG_DEBUG_PAGEALLOC
if (!debug_pagealloc_enabled())
return;
@@ -1022,16 +1023,12 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
if (order >= MAX_ORDER - 2)
return false;
- if (!pfn_valid_within(buddy_pfn))
- return false;
-
combined_pfn = buddy_pfn & pfn;
higher_page = page + (combined_pfn - pfn);
buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
higher_buddy = higher_page + (buddy_pfn - combined_pfn);
- return pfn_valid_within(buddy_pfn) &&
- page_is_buddy(higher_page, higher_buddy, order + 1);
+ return page_is_buddy(higher_page, higher_buddy, order + 1);
}
/*
@@ -1092,8 +1089,6 @@ continue_merging:
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
- if (!pfn_valid_within(buddy_pfn))
- goto done_merging;
if (!page_is_buddy(page, buddy, order))
goto done_merging;
/*
@@ -1751,9 +1746,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
/*
* Check that the whole (or subset of) a pageblock given by the interval of
* [start_pfn, end_pfn) is valid and within the same zone, before scanning it
- * with the migration of free compaction scanner. The scanners then need to
- * use only pfn_valid_within() check for arches that allow holes within
- * pageblocks.
+ * with the migration of free compaction scanner.
*
* Return struct page pointer of start_pfn, or NULL if checks were not passed.
*
@@ -1869,8 +1862,6 @@ static inline void __init pgdat_init_report_one_done(void)
*/
static inline bool __init deferred_pfn_valid(unsigned long pfn)
{
- if (!pfn_valid_within(pfn))
- return false;
if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
return false;
return true;
@@ -2517,11 +2508,6 @@ static int move_freepages(struct zone *zone,
int pages_moved = 0;
for (pfn = start_pfn; pfn <= end_pfn;) {
- if (!pfn_valid_within(pfn)) {
- pfn++;
- continue;
- }
-
page = pfn_to_page(pfn);
if (!PageBuddy(page)) {
/*
@@ -6165,7 +6151,7 @@ static int node_load[MAX_NUMNODES];
*
* Return: node id of the found node or %NUMA_NO_NODE if no node is found.
*/
-static int find_next_best_node(int node, nodemask_t *used_node_mask)
+int find_next_best_node(int node, nodemask_t *used_node_mask)
{
int n, val;
int min_val = INT_MAX;
@@ -6650,7 +6636,6 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
}
-#if !defined(CONFIG_FLATMEM)
/*
* Only struct pages that correspond to ranges defined by memblock.memory
* are zeroed and initialized by going through __init_single_page() during
@@ -6695,13 +6680,6 @@ static void __init init_unavailable_range(unsigned long spfn,
pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
node, zone_names[zone], pgcnt);
}
-#else
-static inline void init_unavailable_range(unsigned long spfn,
- unsigned long epfn,
- int zone, int node)
-{
-}
-#endif
static void __init memmap_init_zone_range(struct zone *zone,
unsigned long start_pfn,
@@ -6731,7 +6709,7 @@ static void __init memmap_init(void)
{
unsigned long start_pfn, end_pfn;
unsigned long hole_pfn = 0;
- int i, j, zone_id, nid;
+ int i, j, zone_id = 0, nid;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
struct pglist_data *node = NODE_DATA(nid);
@@ -6764,6 +6742,26 @@ static void __init memmap_init(void)
init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}
+void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, int nid, bool exact_nid)
+{
+ void *ptr;
+
+ if (exact_nid)
+ ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
+ MEMBLOCK_ALLOC_ACCESSIBLE,
+ nid);
+ else
+ ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
+ MEMBLOCK_ALLOC_ACCESSIBLE,
+ nid);
+
+ if (ptr && size > 0)
+ page_init_poison(ptr, size);
+
+ return ptr;
+}
+
static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
@@ -7511,7 +7509,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
}
#ifdef CONFIG_FLATMEM
-static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
{
unsigned long __maybe_unused start = 0;
unsigned long __maybe_unused offset = 0;
@@ -7535,8 +7533,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
end = pgdat_end_pfn(pgdat);
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
- map = memblock_alloc_node(size, SMP_CACHE_BYTES,
- pgdat->node_id);
+ map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
+ pgdat->node_id, false);
if (!map)
panic("Failed to allocate %ld bytes for node %d memory map\n",
size, pgdat->node_id);
@@ -7557,7 +7555,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
#endif
}
#else
-static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
+static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
#endif /* CONFIG_FLATMEM */
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -8824,9 +8822,6 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
}
for (; iter < pageblock_nr_pages - offset; iter++) {
- if (!pfn_valid_within(pfn + iter))
- continue;
-
page = pfn_to_page(pfn + iter);
/*
@@ -8986,7 +8981,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
cc->nr_migratepages -= nr_reclaimed;
ret = migrate_pages(&cc->migratepages, alloc_migration_target,
- NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
+ NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
/*
* On -ENOMEM, migrate_pages() bails out right away. It is pointless
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index bddf788f45bf..471e3a13b541 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -93,8 +93,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
- if (pfn_valid_within(buddy_pfn) &&
- !is_migrate_isolate_page(buddy)) {
+ if (!is_migrate_isolate_page(buddy)) {
__isolate_free_page(page, order);
isolated_page = true;
}
@@ -250,10 +249,6 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
struct page *page;
while (pfn < end_pfn) {
- if (!pfn_valid_within(pfn)) {
- pfn++;
- continue;
- }
page = pfn_to_page(pfn);
if (PageBuddy(page))
/*
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 23bfb074ca3f..d24ed221357c 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -276,9 +276,6 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
pageblock_mt = get_pageblock_migratetype(page);
for (; pfn < block_end_pfn; pfn++) {
- if (!pfn_valid_within(pfn))
- continue;
-
/* The pageblock is online, no need to recheck. */
page = pfn_to_page(pfn);
@@ -479,10 +476,6 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
continue;
}
- /* Check for holes within a MAX_ORDER area */
- if (!pfn_valid_within(pfn))
- continue;
-
page = pfn_to_page(pfn);
if (PageBuddy(page)) {
unsigned long freepage_order = buddy_order_unsafe(page);
@@ -560,14 +553,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
block_end_pfn = min(block_end_pfn, end_pfn);
for (; pfn < block_end_pfn; pfn++) {
- struct page *page;
+ struct page *page = pfn_to_page(pfn);
struct page_ext *page_ext;
- if (!pfn_valid_within(pfn))
- continue;
-
- page = pfn_to_page(pfn);
-
if (page_zone(page) != zone)
continue;
diff --git a/mm/rmap.c b/mm/rmap.c
index 80588dba16b6..09c41e1f44d8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -89,7 +89,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
if (anon_vma) {
- atomic_set(&anon_vma->refcount, 1);
+ refcount_set(&anon_vma->refcount, 1);
anon_vma->degree = 1; /* Reference for first vma */
anon_vma->parent = anon_vma;
/*
@@ -104,7 +104,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
static inline void anon_vma_free(struct anon_vma *anon_vma)
{
- VM_BUG_ON(atomic_read(&anon_vma->refcount));
+ VM_BUG_ON(refcount_read(&anon_vma->refcount));
/*
* Synchronize against page_lock_anon_vma_read() such that
@@ -446,7 +446,7 @@ static void anon_vma_ctor(void *data)
struct anon_vma *anon_vma = data;
init_rwsem(&anon_vma->rwsem);
- atomic_set(&anon_vma->refcount, 0);
+ refcount_set(&anon_vma->refcount, 0);
anon_vma->rb_root = RB_ROOT_CACHED;
}
@@ -496,7 +496,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ if (!refcount_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
goto out;
}
@@ -555,7 +555,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
}
/* trylock failed, we got to sleep */
- if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ if (!refcount_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
goto out;
}
@@ -570,7 +570,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
rcu_read_unlock();
anon_vma_lock_read(anon_vma);
- if (atomic_dec_and_test(&anon_vma->refcount)) {
+ if (refcount_dec_and_test(&anon_vma->refcount)) {
/*
* Oops, we held the last refcount, release the lock
* and bail -- can't simply use put_anon_vma() because
@@ -2222,7 +2222,7 @@ void __put_anon_vma(struct anon_vma *anon_vma)
struct anon_vma *root = anon_vma->root;
anon_vma_free(anon_vma);
- if (root != anon_vma && atomic_dec_and_test(&root->refcount))
+ if (root != anon_vma && refcount_dec_and_test(&root->refcount))
anon_vma_free(root);
}
diff --git a/mm/secretmem.c b/mm/secretmem.c
index f77d25467a14..030f02ddc7c1 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -152,6 +152,7 @@ static void secretmem_freepage(struct page *page)
}
const struct address_space_operations secretmem_aops = {
+ .set_page_dirty = __set_page_dirty_no_writeback,
.freepage = secretmem_freepage,
.migratepage = secretmem_migratepage,
.isolate_page = secretmem_isolate_page,
diff --git a/mm/slub.c b/mm/slub.c
index 090fa14628f9..6dad2b6fda6f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1404,6 +1404,7 @@ static int __init setup_slub_debug(char *str)
char *slab_list;
bool global_slub_debug_changed = false;
bool slab_list_specified = false;
+ bool slab_list_debug_disable = true;
slub_debug = DEBUG_DEFAULT_FLAGS;
if (*str++ != '=' || !*str)
@@ -1411,7 +1412,6 @@ static int __init setup_slub_debug(char *str)
* No options specified. Switch on full debugging.
*/
goto out;
-
saved_str = str;
while (str) {
str = parse_slub_debug_flags(str, &flags, &slab_list, true);
@@ -1420,6 +1420,8 @@ static int __init setup_slub_debug(char *str)
slub_debug = flags;
global_slub_debug_changed = true;
} else {
+ if (flags || !IS_ENABLED(CONFIG_SLUB_DEBUG_ON))
+ slab_list_debug_disable = false;
slab_list_specified = true;
}
}
@@ -1431,7 +1433,7 @@ static int __init setup_slub_debug(char *str)
* long as there is no option specifying flags without a slab list.
*/
if (slab_list_specified) {
- if (!global_slub_debug_changed)
+ if (!global_slub_debug_changed && !slab_list_debug_disable)
slub_debug = 0;
slub_debug_string = saved_str;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 6326cdf36c4f..6a0e7b692dcd 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -109,32 +109,6 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
}
#endif
-#ifdef CONFIG_SPARSEMEM_EXTREME
-unsigned long __section_nr(struct mem_section *ms)
-{
- unsigned long root_nr;
- struct mem_section *root = NULL;
-
- for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
- root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
- if (!root)
- continue;
-
- if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
- break;
- }
-
- VM_BUG_ON(!root);
-
- return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
-}
-#else
-unsigned long __section_nr(struct mem_section *ms)
-{
- return (unsigned long)(ms - mem_section[0]);
-}
-#endif
-
/*
* During early boot, before section_mem_map is used for an actual
* mem_map, we use section_mem_map to store the section's NUMA
@@ -143,7 +117,7 @@ unsigned long __section_nr(struct mem_section *ms)
*/
static inline unsigned long sparse_encode_early_nid(int nid)
{
- return (nid << SECTION_NID_SHIFT);
+ return ((unsigned long)nid << SECTION_NID_SHIFT);
}
static inline int sparse_early_nid(struct mem_section *section)
@@ -187,10 +161,9 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
* those loops early.
*/
unsigned long __highest_present_section_nr;
-static void section_mark_present(struct mem_section *ms)
+static void __section_mark_present(struct mem_section *ms,
+ unsigned long section_nr)
{
- unsigned long section_nr = __section_nr(ms);
-
if (section_nr > __highest_present_section_nr)
__highest_present_section_nr = section_nr;
@@ -280,7 +253,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en
if (!ms->section_mem_map) {
ms->section_mem_map = sparse_encode_early_nid(nid) |
SECTION_IS_ONLINE;
- section_mark_present(ms);
+ __section_mark_present(ms, section);
}
}
}
@@ -462,8 +435,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
if (map)
return map;
- map = memblock_alloc_try_nid_raw(size, size, addr,
- MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ map = memmap_alloc(size, size, addr, nid, false);
if (!map)
panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
__func__, size, PAGE_SIZE, nid, &addr);
@@ -490,8 +462,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid)
* and we want it to be properly aligned to the section size - this is
* especially the case for VMEMMAP which maps memmap to PMDs
*/
- sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(),
- addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
sparsemap_buf_end = sparsemap_buf + size;
}
@@ -934,7 +905,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
ms = __nr_to_section(section_nr);
set_section_nid(section_nr, nid);
- section_mark_present(ms);
+ __section_mark_present(ms, section_nr);
/* Align memmap to section boundary in the subsection case */
if (section_nr_to_pfn(section_nr) != start_pfn)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 3a6c094310da..e3dcaeecc50f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3130,6 +3130,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
struct filename *name;
struct file *swap_file = NULL;
struct address_space *mapping;
+ struct dentry *dentry;
int prio;
int error;
union swap_header *swap_header;
@@ -3173,6 +3174,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->swap_file = swap_file;
mapping = swap_file->f_mapping;
+ dentry = swap_file->f_path.dentry;
inode = mapping->host;
error = claim_swapfile(p, inode);
@@ -3180,6 +3182,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap;
inode_lock(inode);
+ if (d_unlinked(dentry) || cant_mount(dentry)) {
+ error = -ENOENT;
+ goto bad_swap_unlock_inode;
+ }
if (IS_SWAPFILE(inode)) {
error = -EBUSY;
goto bad_swap_unlock_inode;
@@ -3773,7 +3779,7 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
struct swap_info_struct *si, *next;
int nid = page_to_nid(page);
diff --git a/mm/truncate.c b/mm/truncate.c
index 44ad5e515140..cc83a3f7c1ad 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -45,9 +45,13 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
void *entry)
{
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
__clear_shadow_entry(mapping, index, entry);
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
}
/*
@@ -73,8 +77,10 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
return;
dax = dax_mapping(mapping);
- if (!dax)
+ if (!dax) {
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
+ }
for (i = j; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
@@ -93,8 +99,12 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
__clear_shadow_entry(mapping, index, page);
}
- if (!dax)
+ if (!dax) {
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
+ }
pvec->nr = j;
}
@@ -484,8 +494,9 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
index = indices[i];
if (xa_is_value(page)) {
- invalidate_exceptional_entry(mapping, index,
- page);
+ count += invalidate_exceptional_entry(mapping,
+ index,
+ page);
continue;
}
index += thp_nr_pages(page) - 1;
@@ -513,19 +524,18 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
}
/**
- * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
- * @mapping: the address_space which holds the pages to invalidate
+ * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode
+ * @mapping: the address_space which holds the cache to invalidate
* @start: the offset 'from' which to invalidate
* @end: the offset 'to' which to invalidate (inclusive)
*
- * This function only removes the unlocked pages, if you want to
- * remove all the pages of one inode, you must call truncate_inode_pages.
+ * This function removes pages that are clean, unmapped and unlocked,
+ * as well as shadow entries. It will not block on IO activity.
*
- * invalidate_mapping_pages() will not block on IO activity. It will not
- * invalidate pages which are dirty, locked, under writeback or mapped into
- * pagetables.
+ * If you want to remove all the pages of one inode, regardless of
+ * their use and writeback state, use truncate_inode_pages().
*
- * Return: the number of the pages that were invalidated
+ * Return: the number of the cache entries that were invalidated
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
@@ -561,21 +571,23 @@ void invalidate_mapping_pagevec(struct address_space *mapping,
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
- unsigned long flags;
-
if (page->mapping != mapping)
return 0;
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
- xa_lock_irqsave(&mapping->i_pages, flags);
+ spin_lock(&mapping->host->i_lock);
+ xa_lock_irq(&mapping->i_pages);
if (PageDirty(page))
goto failed;
BUG_ON(page_has_private(page));
__delete_from_page_cache(page, NULL);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
@@ -583,7 +595,8 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
put_page(page); /* pagecache ref */
return 1;
failed:
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
+ spin_unlock(&mapping->host->i_lock);
return 0;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d5cd52805149..3824dc16ce1c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -787,6 +787,28 @@ unsigned long vmalloc_nr_pages(void)
return atomic_long_read(&nr_vmalloc_pages);
}
+static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
+{
+ struct vmap_area *va = NULL;
+ struct rb_node *n = vmap_area_root.rb_node;
+
+ while (n) {
+ struct vmap_area *tmp;
+
+ tmp = rb_entry(n, struct vmap_area, rb_node);
+ if (tmp->va_end > addr) {
+ va = tmp;
+ if (tmp->va_start <= addr)
+ break;
+
+ n = n->rb_left;
+ } else
+ n = n->rb_right;
+ }
+
+ return va;
+}
+
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
struct rb_node *n = vmap_area_root.rb_node;
@@ -1479,6 +1501,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
int node, gfp_t gfp_mask)
{
struct vmap_area *va;
+ unsigned long freed;
unsigned long addr;
int purged = 0;
int ret;
@@ -1542,13 +1565,12 @@ overflow:
goto retry;
}
- if (gfpflags_allow_blocking(gfp_mask)) {
- unsigned long freed = 0;
- blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
- if (freed > 0) {
- purged = 0;
- goto retry;
- }
+ freed = 0;
+ blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
+
+ if (freed > 0) {
+ purged = 0;
+ goto retry;
}
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
@@ -2779,7 +2801,7 @@ EXPORT_SYMBOL_GPL(vmap_pfn);
static inline unsigned int
vm_area_alloc_pages(gfp_t gfp, int nid,
- unsigned int order, unsigned long nr_pages, struct page **pages)
+ unsigned int order, unsigned int nr_pages, struct page **pages)
{
unsigned int nr_allocated = 0;
@@ -2789,10 +2811,32 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* to fails, fallback to a single page allocator that is
* more permissive.
*/
- if (!order)
- nr_allocated = alloc_pages_bulk_array_node(
- gfp, nid, nr_pages, pages);
- else
+ if (!order) {
+ while (nr_allocated < nr_pages) {
+ unsigned int nr, nr_pages_request;
+
+ /*
+ * A maximum allowed request is hard-coded and is 100
+ * pages per call. That is done in order to prevent a
+ * long preemption off scenario in the bulk-allocator
+ * so the range is [1:100].
+ */
+ nr_pages_request = min(100U, nr_pages - nr_allocated);
+
+ nr = alloc_pages_bulk_array_node(gfp, nid,
+ nr_pages_request, pages + nr_allocated);
+
+ nr_allocated += nr;
+ cond_resched();
+
+ /*
+ * If zero or pages were obtained partly,
+ * fallback to a single page allocator.
+ */
+ if (nr != nr_pages_request)
+ break;
+ }
+ } else
/*
* Compound pages required for remap_vmalloc_page if
* high-order pages.
@@ -2816,9 +2860,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
for (i = 0; i < (1U << order); i++)
pages[nr_allocated + i] = page + i;
- if (gfpflags_allow_blocking(gfp))
- cond_resched();
-
+ cond_resched();
nr_allocated += 1U << order;
}
@@ -3267,9 +3309,14 @@ long vread(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
spin_lock(&vmap_area_lock);
- va = __find_vmap_area((unsigned long)addr);
+ va = find_vmap_area_exceed_addr((unsigned long)addr);
if (!va)
goto finished;
+
+ /* no intersects with alive vmap_area */
+ if ((unsigned long)addr + count <= va->va_start)
+ goto finished;
+
list_for_each_entry_from(va, &vmap_area_list, list) {
if (!count)
break;
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index d69019fc3789..76518e4166dc 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -74,8 +74,7 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work)
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
- struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr);
memcg = parent_mem_cgroup(memcg);
if (!memcg)
@@ -240,7 +239,12 @@ static void vmpressure_work_fn(struct work_struct *work)
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
{
- struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+ struct vmpressure *vmpr;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ vmpr = memcg_to_vmpressure(memcg);
/*
* Here we only want to account pressure that userland is able to
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7a2f25b904d9..7faf6509b009 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -41,6 +41,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
+#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
@@ -118,6 +119,9 @@ struct scan_control {
/* The file pages on the current node are dangerously low */
unsigned int file_is_tiny:1;
+ /* Always discard instead of demoting to lower tier memory */
+ unsigned int no_demotion:1;
+
/* Allocation order */
s8 order;
@@ -515,6 +519,48 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker,
return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}
+static bool can_demote_anon_pages(int nid, struct scan_control *sc)
+{
+ if (!numa_demotion_enabled)
+ return false;
+ if (sc) {
+ if (sc->no_demotion)
+ return false;
+ /* It is pointless to do demotion in memcg reclaim */
+ if (cgroup_reclaim(sc))
+ return false;
+ }
+ if (next_demotion_node(nid) == NUMA_NO_NODE)
+ return false;
+
+ return true;
+}
+
+static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
+ int nid,
+ struct scan_control *sc)
+{
+ if (memcg == NULL) {
+ /*
+ * For non-memcg reclaim, is there
+ * space in any swap device?
+ */
+ if (get_nr_swap_pages() > 0)
+ return true;
+ } else {
+ /* Is the memcg below its swap limit? */
+ if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
+ return true;
+ }
+
+ /*
+ * The page can not be swapped.
+ *
+ * Can it be reclaimed from this node via demotion?
+ */
+ return can_demote_anon_pages(nid, sc);
+}
+
/*
* This misses isolated pages which are not accounted for to save counters.
* As the data only determines if reclaim or compaction continues, it is
@@ -526,7 +572,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
+ if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
@@ -1049,14 +1095,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed, struct mem_cgroup *target_memcg)
{
- unsigned long flags;
int refcount;
void *shadow = NULL;
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- xa_lock_irqsave(&mapping->i_pages, flags);
+ if (!PageSwapCache(page))
+ spin_lock(&mapping->host->i_lock);
+ xa_lock_irq(&mapping->i_pages);
/*
* The non racy check for a busy page.
*
@@ -1097,7 +1144,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(page, target_memcg);
__delete_from_swap_cache(page, swap, shadow);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
put_swap_page(page, swap);
} else {
void (*freepage)(struct page *);
@@ -1123,7 +1170,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(page, target_memcg);
__delete_from_page_cache(page, shadow);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
if (freepage != NULL)
freepage(page);
@@ -1132,7 +1182,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
return 1;
cannot_free:
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
+ if (!PageSwapCache(page))
+ spin_unlock(&mapping->host->i_lock);
return 0;
}
@@ -1261,6 +1313,54 @@ static void page_check_dirty_writeback(struct page *page,
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}
+static struct page *alloc_demote_page(struct page *page, unsigned long node)
+{
+ struct migration_target_control mtc = {
+ /*
+ * Allocate from 'node', or fail quickly and quietly.
+ * When this happens, 'page' will likely just be discarded
+ * instead of migrated.
+ */
+ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
+ __GFP_THISNODE | __GFP_NOWARN |
+ __GFP_NOMEMALLOC | GFP_NOWAIT,
+ .nid = node
+ };
+
+ return alloc_migration_target(page, (unsigned long)&mtc);
+}
+
+/*
+ * Take pages on @demote_list and attempt to demote them to
+ * another node. Pages which are not demoted are left on
+ * @demote_pages.
+ */
+static unsigned int demote_page_list(struct list_head *demote_pages,
+ struct pglist_data *pgdat)
+{
+ int target_nid = next_demotion_node(pgdat->node_id);
+ unsigned int nr_succeeded;
+ int err;
+
+ if (list_empty(demote_pages))
+ return 0;
+
+ if (target_nid == NUMA_NO_NODE)
+ return 0;
+
+ /* Demotion ignores all cpuset and mempolicy settings */
+ err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+ target_nid, MIGRATE_ASYNC, MR_DEMOTION,
+ &nr_succeeded);
+
+ if (current_is_kswapd())
+ __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
+ else
+ __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+
+ return nr_succeeded;
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -1272,12 +1372,16 @@ static unsigned int shrink_page_list(struct list_head *page_list,
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
+ LIST_HEAD(demote_pages);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
+ bool do_demote_pass;
memset(stat, 0, sizeof(*stat));
cond_resched();
+ do_demote_pass = can_demote_anon_pages(pgdat->node_id, sc);
+retry:
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
@@ -1427,6 +1531,17 @@ static unsigned int shrink_page_list(struct list_head *page_list,
}
/*
+ * Before reclaiming the page, try to relocate
+ * its contents to another node.
+ */
+ if (do_demote_pass &&
+ (thp_migration_supported() || !PageTransHuge(page))) {
+ list_add(&page->lru, &demote_pages);
+ unlock_page(page);
+ continue;
+ }
+
+ /*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
* Lazyfree page could be freed directly
@@ -1621,11 +1736,14 @@ static unsigned int shrink_page_list(struct list_head *page_list,
/* follow __remove_mapping for reference */
if (!page_ref_freeze(page, 1))
goto keep_locked;
- if (PageDirty(page)) {
- page_ref_unfreeze(page, 1);
- goto keep_locked;
- }
-
+ /*
+ * The page has only one reference left, which is
+ * from the isolation. After the caller puts the
+ * page back on lru and drops the reference, the
+ * page will be freed anyway. It doesn't matter
+ * which lru it goes. So we don't bother checking
+ * PageDirty here.
+ */
count_vm_event(PGLAZYFREED);
count_memcg_page_event(page, PGLAZYFREED);
} else if (!mapping || !__remove_mapping(mapping, page, true,
@@ -1677,6 +1795,17 @@ keep:
list_add(&page->lru, &ret_pages);
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
+ /* 'page_list' is always empty here */
+
+ /* Migrate pages selected for demotion */
+ nr_reclaimed += demote_page_list(&demote_pages, pgdat);
+ /* Pages that could not be demoted are still in @demote_pages */
+ if (!list_empty(&demote_pages)) {
+ /* Pages which failed to demoted go back on @page_list for retry: */
+ list_splice_init(&demote_pages, page_list);
+ do_demote_pass = false;
+ goto retry;
+ }
pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
@@ -1695,7 +1824,6 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
{
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
- .priority = DEF_PRIORITY,
.may_unmap = 1,
};
struct reclaim_stat stat;
@@ -2321,10 +2449,10 @@ unsigned long reclaim_pages(struct list_head *page_list)
unsigned int noreclaim_flag;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
- .priority = DEF_PRIORITY,
.may_writepage = 1,
.may_unmap = 1,
.may_swap = 1,
+ .no_demotion = 1,
};
noreclaim_flag = memalloc_noreclaim_save();
@@ -2450,6 +2578,7 @@ enum scan_balance {
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
{
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
unsigned long anon_cost, file_cost, total_cost;
int swappiness = mem_cgroup_swappiness(memcg);
@@ -2460,7 +2589,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
enum lru_list lru;
/* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
+ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2635,6 +2764,21 @@ out:
}
}
+/*
+ * Anonymous LRU management is a waste if there is
+ * ultimately no way to reclaim the memory.
+ */
+static bool can_age_anon_pages(struct pglist_data *pgdat,
+ struct scan_control *sc)
+{
+ /* Aging the anon LRU is valuable if swap is present: */
+ if (total_swap_pages > 0)
+ return true;
+
+ /* Also valuable if anon pages can be demoted: */
+ return can_demote_anon_pages(pgdat->node_id, sc);
+}
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
@@ -2744,7 +2888,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+ if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+ inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
@@ -2814,7 +2959,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
*/
pages_for_compaction = compact_gap(sc->order);
inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
+ if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
return inactive_lru_pages > pages_for_compaction;
@@ -2888,6 +3033,12 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
+ /*
+ * Flush the memory cgroup stats, so that we read accurate per-memcg
+ * lruvec stats for heuristics.
+ */
+ mem_cgroup_flush_stats();
+
memset(&sc->nr, 0, sizeof(sc->nr));
nr_reclaimed = sc->nr_reclaimed;
@@ -3424,18 +3575,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
* blocked waiting on the same lock. Instead, throttle for up to a
* second before continuing.
*/
- if (!(gfp_mask & __GFP_FS)) {
+ if (!(gfp_mask & __GFP_FS))
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
allow_direct_reclaim(pgdat), HZ);
+ else
+ /* Throttle until kswapd wakes the process */
+ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+ allow_direct_reclaim(pgdat));
- goto check_pending;
- }
-
- /* Throttle until kswapd wakes the process */
- wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
- allow_direct_reclaim(pgdat));
-
-check_pending:
if (fatal_signal_pending(current))
return true;
@@ -3573,7 +3720,7 @@ static void age_active_anon(struct pglist_data *pgdat,
struct mem_cgroup *memcg;
struct lruvec *lruvec;
- if (!total_swap_pages)
+ if (!can_age_anon_pages(pgdat, sc))
return;
lruvec = mem_cgroup_lruvec(NULL, pgdat);
@@ -4280,23 +4427,20 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
* This kswapd start function will be called by init and node-hot-add.
* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
*/
-int kswapd_run(int nid)
+void kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- int ret = 0;
if (pgdat->kswapd)
- return 0;
+ return;
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
- ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;
}
- return ret;
}
/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b0534e068166..13ff25d0d96a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -204,7 +204,7 @@ int calculate_normal_threshold(struct zone *zone)
*
* Some sample thresholds:
*
- * Threshold Processors (fls) Zonesize fls(mem+1)
+ * Threshold Processors (fls) Zonesize fls(mem)+1
* ------------------------------------------------------------------
* 8 1 1 0.9-1 GB 4
* 16 2 2 0.9-1 GB 4
@@ -1217,6 +1217,8 @@ const char * const vmstat_text[] = {
"pgreuse",
"pgsteal_kswapd",
"pgsteal_direct",
+ "pgdemote_kswapd",
+ "pgdemote_direct",
"pgscan_kswapd",
"pgscan_direct",
"pgscan_direct_throttle",
@@ -1452,7 +1454,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
}
/* Print out the free pages at each order for each migatetype */
-static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
+static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
int order;
pg_data_t *pgdat = (pg_data_t *)arg;
@@ -1464,8 +1466,6 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
seq_putc(m, '\n');
walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
-
- return 0;
}
static void pagetypeinfo_showblockcount_print(struct seq_file *m,
@@ -1501,7 +1501,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
}
/* Print out the number of pageblocks for each migratetype */
-static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
+static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
int mtype;
pg_data_t *pgdat = (pg_data_t *)arg;
@@ -1512,8 +1512,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
seq_putc(m, '\n');
walk_zones_in_node(m, pgdat, true, false,
pagetypeinfo_showblockcount_print);
-
- return 0;
}
/*
@@ -1874,11 +1872,6 @@ static void vmstat_update(struct work_struct *w)
}
/*
- * Switch off vmstat processing and then fold all the remaining differentials
- * until the diffs stay at zero. The function is used by NOHZ and can only be
- * invoked when tick processing is not active.
- */
-/*
* Check if the diffs for a certain cpu indicate that
* an update is needed.
*/
@@ -1894,17 +1887,15 @@ static bool need_update(int cpu)
/*
* The fast way of checking if there are any vmstat diffs.
*/
- if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
- sizeof(pzstats->vm_stat_diff[0])))
+ if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
return true;
if (last_pgdat == zone->zone_pgdat)
continue;
last_pgdat = zone->zone_pgdat;
n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
- if (memchr_inv(n->vm_node_stat_diff, 0, NR_VM_NODE_STAT_ITEMS *
- sizeof(n->vm_node_stat_diff[0])))
- return true;
+ if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
+ return true;
}
return false;
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 10830211a187..a67c384fea05 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -540,6 +540,13 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out;
}
+ if (!spin_trylock(&mapping->host->i_lock)) {
+ xa_unlock(&mapping->i_pages);
+ spin_unlock_irq(lru_lock);
+ ret = LRU_RETRY;
+ goto out;
+ }
+
list_lru_isolate(lru, item);
__dec_lruvec_kmem_state(node, WORKINGSET_NODES);
@@ -559,6 +566,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
out_invalid:
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
ret = LRU_REMOVED_RETRY;
out:
cond_resched();
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 68e8831068f4..3e713ff6261e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1828,13 +1828,13 @@ static void putback_zspage_deferred(struct zs_pool *pool,
static inline void zs_pool_dec_isolated(struct zs_pool *pool)
{
VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
- atomic_long_dec(&pool->isolated_pages);
/*
- * There's no possibility of racing, since wait_for_isolated_drain()
- * checks the isolated count under &class->lock after enqueuing
- * on migration_wait.
+ * Checking pool->destroying must happen after atomic_long_dec()
+ * for pool->isolated_pages above. Paired with the smp_mb() in
+ * zs_unregister_migration().
*/
- if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
+ smp_mb__after_atomic();
+ if (atomic_long_dec_and_test(&pool->isolated_pages) && pool->destroying)
wake_up_all(&pool->migration_wait);
}
diff --git a/tools/testing/scatterlist/linux/mm.h b/tools/testing/scatterlist/linux/mm.h
index f9a12005fcea..16ec895bbe5f 100644
--- a/tools/testing/scatterlist/linux/mm.h
+++ b/tools/testing/scatterlist/linux/mm.h
@@ -127,7 +127,6 @@ kmalloc_array(unsigned int n, unsigned int size, unsigned int flags)
#define kmemleak_free(a)
#define PageSlab(p) (0)
-#define flush_kernel_dcache_page(p)
#define MAX_ERRNO 4095
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 74baab83fec3..192a2899bae8 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -56,7 +56,7 @@ static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
static int mfd_assert_reopen_fd(int fd_in)
{
- int r, fd;
+ int fd;
char path[100];
sprintf(path, "/proc/self/fd/%d", fd_in);
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index f0fd80ef17df..b02eac613fdd 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -27,3 +27,4 @@ hmm-tests
memfd_secret
local_config.*
split_huge_page_test
+ksm_tests
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 521243770f26..d9605bd10f2d 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -45,6 +45,7 @@ TEST_GEN_FILES += thuge-gen
TEST_GEN_FILES += transhuge-stress
TEST_GEN_FILES += userfaultfd
TEST_GEN_FILES += split_huge_page_test
+TEST_GEN_FILES += ksm_tests
ifeq ($(MACHINE),x86_64)
CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
@@ -145,6 +146,8 @@ $(OUTPUT)/hmm-tests: local_config.h
# HMM_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
$(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS)
+$(OUTPUT)/ksm_tests: LDLIBS += -lnuma
+
local_config.mk local_config.h: check_config.sh
/bin/sh ./check_config.sh $(CC)
diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c
new file mode 100644
index 000000000000..cdeb4a028538
--- /dev/null
+++ b/tools/testing/selftests/vm/ksm_tests.c
@@ -0,0 +1,516 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <time.h>
+#include <string.h>
+#include <numa.h>
+
+#include "../kselftest.h"
+
+#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/"
+#define KSM_FP(s) (KSM_SYSFS_PATH s)
+#define KSM_SCAN_LIMIT_SEC_DEFAULT 120
+#define KSM_PAGE_COUNT_DEFAULT 10l
+#define KSM_PROT_STR_DEFAULT "rw"
+#define KSM_USE_ZERO_PAGES_DEFAULT false
+#define KSM_MERGE_ACROSS_NODES_DEFAULT true
+
+struct ksm_sysfs {
+ unsigned long max_page_sharing;
+ unsigned long merge_across_nodes;
+ unsigned long pages_to_scan;
+ unsigned long run;
+ unsigned long sleep_millisecs;
+ unsigned long stable_node_chains_prune_millisecs;
+ unsigned long use_zero_pages;
+};
+
+enum ksm_test_name {
+ CHECK_KSM_MERGE,
+ CHECK_KSM_UNMERGE,
+ CHECK_KSM_ZERO_PAGE_MERGE,
+ CHECK_KSM_NUMA_MERGE
+};
+
+static int ksm_write_sysfs(const char *file_path, unsigned long val)
+{
+ FILE *f = fopen(file_path, "w");
+
+ if (!f) {
+ fprintf(stderr, "f %s\n", file_path);
+ perror("fopen");
+ return 1;
+ }
+ if (fprintf(f, "%lu", val) < 0) {
+ perror("fprintf");
+ return 1;
+ }
+ fclose(f);
+
+ return 0;
+}
+
+static int ksm_read_sysfs(const char *file_path, unsigned long *val)
+{
+ FILE *f = fopen(file_path, "r");
+
+ if (!f) {
+ fprintf(stderr, "f %s\n", file_path);
+ perror("fopen");
+ return 1;
+ }
+ if (fscanf(f, "%lu", val) != 1) {
+ perror("fscanf");
+ return 1;
+ }
+ fclose(f);
+
+ return 0;
+}
+
+static int str_to_prot(char *prot_str)
+{
+ int prot = 0;
+
+ if ((strchr(prot_str, 'r')) != NULL)
+ prot |= PROT_READ;
+ if ((strchr(prot_str, 'w')) != NULL)
+ prot |= PROT_WRITE;
+ if ((strchr(prot_str, 'x')) != NULL)
+ prot |= PROT_EXEC;
+
+ return prot;
+}
+
+static void print_help(void)
+{
+ printf("usage: ksm_tests [-h] <test type> [-a prot] [-p page_count] [-l timeout]\n"
+ "[-z use_zero_pages] [-m merge_across_nodes]\n");
+
+ printf("Supported <test type>:\n"
+ " -M (page merging)\n"
+ " -Z (zero pages merging)\n"
+ " -N (merging of pages in different NUMA nodes)\n"
+ " -U (page unmerging)\n\n");
+
+ printf(" -a: specify the access protections of pages.\n"
+ " <prot> must be of the form [rwx].\n"
+ " Default: %s\n", KSM_PROT_STR_DEFAULT);
+ printf(" -p: specify the number of pages to test.\n"
+ " Default: %ld\n", KSM_PAGE_COUNT_DEFAULT);
+ printf(" -l: limit the maximum running time (in seconds) for a test.\n"
+ " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT);
+ printf(" -z: change use_zero_pages tunable\n"
+ " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT);
+ printf(" -m: change merge_across_nodes tunable\n"
+ " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT);
+
+ exit(0);
+}
+
+static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size)
+{
+ void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0);
+
+ if (!map_ptr) {
+ perror("mmap");
+ return NULL;
+ }
+ memset(map_ptr, data, map_size);
+ if (mprotect(map_ptr, map_size, prot)) {
+ perror("mprotect");
+ munmap(map_ptr, map_size);
+ return NULL;
+ }
+
+ return map_ptr;
+}
+
+static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout)
+{
+ struct timespec cur_time;
+ unsigned long cur_scan, init_scan;
+
+ if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan))
+ return 1;
+ cur_scan = init_scan;
+
+ while (cur_scan < init_scan + scan_count) {
+ if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan))
+ return 1;
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) {
+ perror("clock_gettime");
+ return 1;
+ }
+ if ((cur_time.tv_sec - start_time.tv_sec) > timeout) {
+ printf("Scan time limit exceeded\n");
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout)
+{
+ if (madvise(addr, size, MADV_MERGEABLE)) {
+ perror("madvise");
+ return 1;
+ }
+ if (ksm_write_sysfs(KSM_FP("run"), 1))
+ return 1;
+
+ /* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */
+ if (ksm_do_scan(2, start_time, timeout))
+ return 1;
+
+ return 0;
+}
+
+static bool assert_ksm_pages_count(long dupl_page_count)
+{
+ unsigned long max_page_sharing, pages_sharing, pages_shared;
+
+ if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) ||
+ ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) ||
+ ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing))
+ return false;
+
+ /*
+ * Since there must be at least 2 pages for merging and 1 page can be
+ * shared with the limited number of pages (max_page_sharing), sometimes
+ * there are 'leftover' pages that cannot be merged. For example, if there
+ * are 11 pages and max_page_sharing = 10, then only 10 pages will be
+ * merged and the 11th page won't be affected. As a result, when the number
+ * of duplicate pages is divided by max_page_sharing and the remainder is 1,
+ * pages_shared and pages_sharing values will be equal between dupl_page_count
+ * and dupl_page_count - 1.
+ */
+ if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) {
+ if (pages_shared == dupl_page_count / max_page_sharing &&
+ pages_sharing == pages_shared * (max_page_sharing - 1))
+ return true;
+ } else {
+ if (pages_shared == (dupl_page_count / max_page_sharing + 1) &&
+ pages_sharing == dupl_page_count - pages_shared)
+ return true;
+ }
+
+ return false;
+}
+
+static int ksm_save_def(struct ksm_sysfs *ksm_sysfs)
+{
+ if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) ||
+ ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) ||
+ ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) ||
+ ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) ||
+ ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) ||
+ ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
+ &ksm_sysfs->stable_node_chains_prune_millisecs) ||
+ ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages))
+ return 1;
+
+ return 0;
+}
+
+static int ksm_restore(struct ksm_sysfs *ksm_sysfs)
+{
+ if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) ||
+ ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) ||
+ ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) ||
+ ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) ||
+ ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) ||
+ ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
+ ksm_sysfs->stable_node_chains_prune_millisecs) ||
+ ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages))
+ return 1;
+
+ return 0;
+}
+
+static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size)
+{
+ void *map_ptr;
+ struct timespec start_time;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+
+ /* fill pages with the same data and merge them */
+ map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
+ if (!map_ptr)
+ return KSFT_FAIL;
+
+ if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+ goto err_out;
+
+ /* verify that the right number of pages are merged */
+ if (assert_ksm_pages_count(page_count)) {
+ printf("OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_PASS;
+ }
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_FAIL;
+}
+
+static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size)
+{
+ void *map_ptr;
+ struct timespec start_time;
+ int page_count = 2;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+
+ /* fill pages with the same data and merge them */
+ map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
+ if (!map_ptr)
+ return KSFT_FAIL;
+
+ if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+ goto err_out;
+
+ /* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */
+ memset(map_ptr, '-', 1);
+ memset(map_ptr + page_size, '+', 1);
+
+ /* get at least 1 scan, so KSM can detect that the pages were modified */
+ if (ksm_do_scan(1, start_time, timeout))
+ goto err_out;
+
+ /* check that unmerging was successful and 0 pages are currently merged */
+ if (assert_ksm_pages_count(0)) {
+ printf("OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_PASS;
+ }
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_FAIL;
+}
+
+static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout,
+ bool use_zero_pages, size_t page_size)
+{
+ void *map_ptr;
+ struct timespec start_time;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+
+ if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages))
+ return KSFT_FAIL;
+
+ /* fill pages with zero and try to merge them */
+ map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count);
+ if (!map_ptr)
+ return KSFT_FAIL;
+
+ if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+ goto err_out;
+
+ /*
+ * verify that the right number of pages are merged:
+ * 1) if use_zero_pages is set to 1, empty pages are merged
+ * with the kernel zero page instead of with each other;
+ * 2) if use_zero_pages is set to 0, empty pages are not treated specially
+ * and merged as usual.
+ */
+ if (use_zero_pages && !assert_ksm_pages_count(0))
+ goto err_out;
+ else if (!use_zero_pages && !assert_ksm_pages_count(page_count))
+ goto err_out;
+
+ printf("OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_PASS;
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_FAIL;
+}
+
+static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes,
+ size_t page_size)
+{
+ void *numa1_map_ptr, *numa2_map_ptr;
+ struct timespec start_time;
+ int page_count = 2;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+
+ if (numa_available() < 0) {
+ perror("NUMA support not enabled");
+ return KSFT_SKIP;
+ }
+ if (numa_max_node() < 1) {
+ printf("At least 2 NUMA nodes must be available\n");
+ return KSFT_SKIP;
+ }
+ if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes))
+ return KSFT_FAIL;
+
+ /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */
+ numa1_map_ptr = numa_alloc_onnode(page_size, 0);
+ numa2_map_ptr = numa_alloc_onnode(page_size, 1);
+ if (!numa1_map_ptr || !numa2_map_ptr) {
+ perror("numa_alloc_onnode");
+ return KSFT_FAIL;
+ }
+
+ memset(numa1_map_ptr, '*', page_size);
+ memset(numa2_map_ptr, '*', page_size);
+
+ /* try to merge the pages */
+ if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) ||
+ ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout))
+ goto err_out;
+
+ /*
+ * verify that the right number of pages are merged:
+ * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged;
+ * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is
+ * only 1 unique page in each node and they can't be shared.
+ */
+ if (merge_across_nodes && !assert_ksm_pages_count(page_count))
+ goto err_out;
+ else if (!merge_across_nodes && !assert_ksm_pages_count(0))
+ goto err_out;
+
+ numa_free(numa1_map_ptr, page_size);
+ numa_free(numa2_map_ptr, page_size);
+ printf("OK\n");
+ return KSFT_PASS;
+
+err_out:
+ numa_free(numa1_map_ptr, page_size);
+ numa_free(numa2_map_ptr, page_size);
+ printf("Not OK\n");
+ return KSFT_FAIL;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret, opt;
+ int prot = 0;
+ int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT;
+ long page_count = KSM_PAGE_COUNT_DEFAULT;
+ size_t page_size = sysconf(_SC_PAGESIZE);
+ struct ksm_sysfs ksm_sysfs_old;
+ int test_name = CHECK_KSM_MERGE;
+ bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT;
+ bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT;
+
+ while ((opt = getopt(argc, argv, "ha:p:l:z:m:MUZN")) != -1) {
+ switch (opt) {
+ case 'a':
+ prot = str_to_prot(optarg);
+ break;
+ case 'p':
+ page_count = atol(optarg);
+ if (page_count <= 0) {
+ printf("The number of pages must be greater than 0\n");
+ return KSFT_FAIL;
+ }
+ break;
+ case 'l':
+ ksm_scan_limit_sec = atoi(optarg);
+ if (ksm_scan_limit_sec <= 0) {
+ printf("Timeout value must be greater than 0\n");
+ return KSFT_FAIL;
+ }
+ break;
+ case 'h':
+ print_help();
+ break;
+ case 'z':
+ if (strcmp(optarg, "0") == 0)
+ use_zero_pages = 0;
+ else
+ use_zero_pages = 1;
+ break;
+ case 'm':
+ if (strcmp(optarg, "0") == 0)
+ merge_across_nodes = 0;
+ else
+ merge_across_nodes = 1;
+ break;
+ case 'M':
+ break;
+ case 'U':
+ test_name = CHECK_KSM_UNMERGE;
+ break;
+ case 'Z':
+ test_name = CHECK_KSM_ZERO_PAGE_MERGE;
+ break;
+ case 'N':
+ test_name = CHECK_KSM_NUMA_MERGE;
+ break;
+ default:
+ return KSFT_FAIL;
+ }
+ }
+
+ if (prot == 0)
+ prot = str_to_prot(KSM_PROT_STR_DEFAULT);
+
+ if (access(KSM_SYSFS_PATH, F_OK)) {
+ printf("Config KSM not enabled\n");
+ return KSFT_SKIP;
+ }
+
+ if (ksm_save_def(&ksm_sysfs_old)) {
+ printf("Cannot save default tunables\n");
+ return KSFT_FAIL;
+ }
+
+ if (ksm_write_sysfs(KSM_FP("run"), 2) ||
+ ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) ||
+ ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) ||
+ ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count))
+ return KSFT_FAIL;
+
+ switch (test_name) {
+ case CHECK_KSM_MERGE:
+ ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
+ ksm_scan_limit_sec, page_size);
+ break;
+ case CHECK_KSM_UNMERGE:
+ ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+ page_size);
+ break;
+ case CHECK_KSM_ZERO_PAGE_MERGE:
+ ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
+ ksm_scan_limit_sec, use_zero_pages, page_size);
+ break;
+ case CHECK_KSM_NUMA_MERGE:
+ ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+ merge_across_nodes, page_size);
+ break;
+ }
+
+ if (ksm_restore(&ksm_sysfs_old)) {
+ printf("Cannot restore default tunables\n");
+ return KSFT_FAIL;
+ }
+
+ return ret;
+}
diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
index d09a6b71f1e9..45e803af7c77 100755
--- a/tools/testing/selftests/vm/run_vmtests.sh
+++ b/tools/testing/selftests/vm/run_vmtests.sh
@@ -377,6 +377,102 @@ else
exitcode=1
fi
+echo "-------------------------------------------------------"
+echo "running KSM MADV_MERGEABLE test with 10 identical pages"
+echo "-------------------------------------------------------"
+./ksm_tests -M -p 10
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
+echo "------------------------"
+echo "running KSM unmerge test"
+echo "------------------------"
+./ksm_tests -U
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
+echo "----------------------------------------------------------"
+echo "running KSM test with 10 zero pages and use_zero_pages = 0"
+echo "----------------------------------------------------------"
+./ksm_tests -Z -p 10 -z 0
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
+echo "----------------------------------------------------------"
+echo "running KSM test with 10 zero pages and use_zero_pages = 1"
+echo "----------------------------------------------------------"
+./ksm_tests -Z -p 10 -z 1
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
+echo "-------------------------------------------------------------"
+echo "running KSM test with 2 NUMA nodes and merge_across_nodes = 1"
+echo "-------------------------------------------------------------"
+./ksm_tests -N -m 1
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
+echo "-------------------------------------------------------------"
+echo "running KSM test with 2 NUMA nodes and merge_across_nodes = 0"
+echo "-------------------------------------------------------------"
+./ksm_tests -N -m 0
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
exit $exitcode
exit $exitcode
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index e363bdaff59d..2ea438e6b8b1 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -210,8 +210,10 @@ static void anon_release_pages(char *rel_area)
static void anon_allocate_area(void **alloc_area)
{
- if (posix_memalign(alloc_area, page_size, nr_pages * page_size))
- err("posix_memalign() failed");
+ *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (*alloc_area == MAP_FAILED)
+ err("mmap of anonymous memory failed");
}
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)