path: root/include/linux
Diffstat (limited to 'include/linux')
-rw-r--r--  include/linux/compiler-gcc.h       11
-rw-r--r--  include/linux/damon.h              10
-rw-r--r--  include/linux/dax.h                56
-rw-r--r--  include/linux/fs.h                 13
-rw-r--r--  include/linux/gfp.h                12
-rw-r--r--  include/linux/highmem.h            10
-rw-r--r--  include/linux/huge_mm.h            88
-rw-r--r--  include/linux/hugetlb.h            24
-rw-r--r--  include/linux/kfifo.h               2
-rw-r--r--  include/linux/khugepaged.h         30
-rw-r--r--  include/linux/maple_tree.h        685
-rw-r--r--  include/linux/memcontrol.h        239
-rw-r--r--  include/linux/memory_hotplug.h      9
-rw-r--r--  include/linux/memremap.h           31
-rw-r--r--  include/linux/migrate.h             1
-rw-r--r--  include/linux/mm.h                121
-rw-r--r--  include/linux/mm_inline.h           6
-rw-r--r--  include/linux/mm_types.h           48
-rw-r--r--  include/linux/mm_types_task.h      12
-rw-r--r--  include/linux/mmzone.h             59
-rw-r--r--  include/linux/net.h                 2
-rw-r--r--  include/linux/once.h                2
-rw-r--r--  include/linux/oom.h                11
-rw-r--r--  include/linux/page-flags.h         23
-rw-r--r--  include/linux/pagemap.h             2
-rw-r--r--  include/linux/pagevec.h             1
-rw-r--r--  include/linux/rmap.h                4
-rw-r--r--  include/linux/sched.h               1
-rw-r--r--  include/linux/sched/coredump.h      7
-rw-r--r--  include/linux/shrinker.h           31
-rw-r--r--  include/linux/swap.h                5
-rw-r--r--  include/linux/swapops.h            12
-rw-r--r--  include/linux/userfaultfd_k.h       7
-rw-r--r--  include/linux/vm_event_item.h       4
-rw-r--r--  include/linux/vmacache.h           28
-rw-r--r--  include/linux/vmstat.h              6
36 files changed, 1217 insertions, 396 deletions
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index a0c55eeaeaf1..9b157b71036f 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -66,17 +66,6 @@
__builtin_unreachable(); \
} while (0)
-/*
- * GCC 'asm goto' miscompiles certain code sequences:
- *
- * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
- *
- * Work it around via a compiler barrier quirk suggested by Jakub Jelinek.
- *
- * (asm goto is automatically volatile - the naming reflects this.)
- */
-#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0)
-
#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP)
#define __HAVE_BUILTIN_BSWAP32__
#define __HAVE_BUILTIN_BSWAP64__
diff --git a/include/linux/damon.h b/include/linux/damon.h
index 2765c7d99beb..7b1f4a488230 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -86,6 +86,8 @@ struct damon_target {
* @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT.
* @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE.
* @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
+ * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists.
+ * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists.
* @DAMOS_STAT: Do nothing but count the stat.
* @NR_DAMOS_ACTIONS: Total number of DAMOS actions
*/
@@ -95,6 +97,8 @@ enum damos_action {
DAMOS_PAGEOUT,
DAMOS_HUGEPAGE,
DAMOS_NOHUGEPAGE,
+ DAMOS_LRU_PRIO,
+ DAMOS_LRU_DEPRIO,
DAMOS_STAT, /* Do nothing but only record the stat */
NR_DAMOS_ACTIONS,
};
@@ -525,6 +529,12 @@ bool damon_is_registered_ops(enum damon_ops_id id);
int damon_register_ops(struct damon_operations *ops);
int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id);
+static inline bool damon_target_has_pid(const struct damon_ctx *ctx)
+{
+ return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR;
+}
+
+
int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);
int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
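The new damon_target_has_pid() helper distinguishes targets that hold a pid reference (vaddr/fvaddr ops) from physical-address targets. A minimal illustrative sketch, not part of the patch, assuming the existing damon_for_each_target() iterator and the target->pid field from the DAMON API:

    #include <linux/damon.h>
    #include <linux/pid.h>

    /* Drop the pid references held by targets, but only for ops that use them. */
    static void example_put_target_pids(struct damon_ctx *ctx)
    {
            struct damon_target *t;

            if (!damon_target_has_pid(ctx))
                    return;         /* e.g. paddr targets carry no pid */

            damon_for_each_target(t, ctx)
                    put_pid(t->pid);
    }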
diff --git a/include/linux/dax.h b/include/linux/dax.h
index e7b81634c52a..ba985333e26b 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -43,8 +43,21 @@ struct dax_operations {
void *addr, size_t bytes, struct iov_iter *iter);
};
+struct dax_holder_operations {
+ /*
+ * notify_failure - notify memory failure into inner holder device
+ * @dax_dev: the dax device which contains the holder
+ * @offset: offset on this dax device where memory failure occurs
+ * @len: length of this memory failure event
+ * @flags: action flags for memory failure handler
+ */
+ int (*notify_failure)(struct dax_device *dax_dev, u64 offset,
+ u64 len, int mf_flags);
+};
+
#if IS_ENABLED(CONFIG_DAX)
struct dax_device *alloc_dax(void *private, const struct dax_operations *ops);
+void *dax_holder(struct dax_device *dax_dev);
void put_dax(struct dax_device *dax_dev);
void kill_dax(struct dax_device *dax_dev);
void dax_write_cache(struct dax_device *dax_dev, bool wc);
@@ -66,6 +79,10 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
return dax_synchronous(dax_dev);
}
#else
+static inline void *dax_holder(struct dax_device *dax_dev)
+{
+ return NULL;
+}
static inline struct dax_device *alloc_dax(void *private,
const struct dax_operations *ops)
{
@@ -114,12 +131,9 @@ struct writeback_control;
#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
void dax_remove_host(struct gendisk *disk);
-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
- u64 *start_off);
-static inline void fs_put_dax(struct dax_device *dax_dev)
-{
- put_dax(dax_dev);
-}
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
+ void *holder, const struct dax_holder_operations *ops);
+void fs_put_dax(struct dax_device *dax_dev, void *holder);
#else
static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
{
@@ -129,11 +143,12 @@ static inline void dax_remove_host(struct gendisk *disk)
{
}
static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
- u64 *start_off)
+ u64 *start_off, void *holder,
+ const struct dax_holder_operations *ops)
{
return NULL;
}
-static inline void fs_put_dax(struct dax_device *dax_dev)
+static inline void fs_put_dax(struct dax_device *dax_dev, void *holder)
{
}
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
@@ -146,6 +161,10 @@ struct page *dax_layout_busy_page(struct address_space *mapping);
struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
dax_entry_t dax_lock_page(struct page *page);
void dax_unlock_page(struct page *page, dax_entry_t cookie);
+dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
+ unsigned long index, struct page **page);
+void dax_unlock_mapping_entry(struct address_space *mapping,
+ unsigned long index, dax_entry_t cookie);
#else
static inline struct page *dax_layout_busy_page(struct address_space *mapping)
{
@@ -173,6 +192,17 @@ static inline dax_entry_t dax_lock_page(struct page *page)
static inline void dax_unlock_page(struct page *page, dax_entry_t cookie)
{
}
+
+static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
+ unsigned long index, struct page **page)
+{
+ return 0;
+}
+
+static inline void dax_unlock_mapping_entry(struct address_space *mapping,
+ unsigned long index, dax_entry_t cookie)
+{
+}
#endif
int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
@@ -203,6 +233,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t bytes, struct iov_iter *i);
int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
size_t nr_pages);
+int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len,
+ int mf_flags);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
@@ -214,6 +246,14 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
+int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dest, loff_t destoff,
+ loff_t len, bool *is_same,
+ const struct iomap_ops *ops);
+int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags,
+ const struct iomap_ops *ops);
static inline bool dax_mapping(struct address_space *mapping)
{
return mapping->host && IS_DAX(mapping->host);
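An illustrative sketch, not from the patch, of how a filesystem might use the new holder interface: the superblock structure and the callback body are hypothetical, but the signatures are the ones declared above (fs_dax_get_by_bdev(), dax_holder(), fs_put_dax(), and dax_holder_operations.notify_failure):

    #include <linux/dax.h>
    #include <linux/blkdev.h>
    #include <linux/printk.h>

    struct example_sb_info {
            struct dax_device *dax_dev;
            u64 dax_part_off;
    };

    /* Called by the dax core when poison is found in the range we hold. */
    static int example_notify_failure(struct dax_device *dax_dev,
                                      u64 offset, u64 len, int mf_flags)
    {
            struct example_sb_info *sbi = dax_holder(dax_dev);

            /* Map [offset, offset + len) back to inodes via sbi and kill the
             * affected mappings; stubbed out here. */
            pr_warn("dax failure at %llu (+%llu), flags 0x%x\n",
                    offset, len, mf_flags);
            return sbi ? 0 : -ENXIO;
    }

    static const struct dax_holder_operations example_dax_holder_ops = {
            .notify_failure = example_notify_failure,
    };

    static int example_fill_super(struct example_sb_info *sbi,
                                  struct block_device *bdev)
    {
            sbi->dax_dev = fs_dax_get_by_bdev(bdev, &sbi->dax_part_off,
                                              sbi, &example_dax_holder_ops);
            return sbi->dax_dev ? 0 : -ENODEV;
    }

    static void example_put_super(struct example_sb_info *sbi)
    {
            fs_put_dax(sbi->dax_dev, sbi);  /* drops the holder registration too */
    }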
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0af108dac569..13f06941953e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -74,6 +74,7 @@ struct fsverity_operations;
struct fs_context;
struct fs_parameter_spec;
struct fileattr;
+struct iomap_ops;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -2199,10 +2200,13 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t len, unsigned int flags);
-extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *count,
- unsigned int remap_flags);
+int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags,
+ const struct iomap_ops *dax_read_ops);
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *count, unsigned int remap_flags);
extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
@@ -3241,6 +3245,7 @@ extern void page_put_link(void *);
extern int page_symlink(struct inode *inode, const char *symname, int len);
extern const struct inode_operations page_symlink_inode_operations;
extern void kfree_link(void *);
+extern bool is_anon_inode(struct inode *inode);
void generic_fillattr(struct user_namespace *, struct inode *, struct kstat *);
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 2d2ccae933c2..9a88cce23e17 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -39,7 +39,7 @@ struct vm_area_struct;
#define ___GFP_IO 0x40u
#define ___GFP_FS 0x80u
#define ___GFP_ZERO 0x100u
-#define ___GFP_ATOMIC 0x200u
+/* 0x200u unused */
#define ___GFP_DIRECT_RECLAIM 0x400u
#define ___GFP_KSWAPD_RECLAIM 0x800u
#define ___GFP_WRITE 0x1000u
@@ -124,11 +124,8 @@ struct vm_area_struct;
*
* %__GFP_HIGH indicates that the caller is high-priority and that granting
* the request is necessary before the system can make forward progress.
- * For example, creating an IO context to clean pages.
- *
- * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
- * high priority. Users are typically interrupt handlers. This may be
- * used in conjunction with %__GFP_HIGH
+ * For example creating an IO context to clean pages and requests
+ * from atomic context.
*
* %__GFP_MEMALLOC allows access to all memory. This should only be used when
* the caller guarantees the allocation will allow more memory to be freed
@@ -143,7 +140,6 @@ struct vm_area_struct;
* %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
* This takes precedence over the %__GFP_MEMALLOC flag if both are set.
*/
-#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC)
#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH)
#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
@@ -337,7 +333,7 @@ struct vm_area_struct;
* version does not attempt reclaim/compaction at all and is by default used
* in page fault path, while the non-light is used by khugepaged.
*/
-#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
+#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM)
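With __GFP_ATOMIC gone, callers are unchanged: they keep passing GFP_ATOMIC, the allocator infers "cannot sleep" from the absence of __GFP_DIRECT_RECLAIM, and __GFP_HIGH still grants access to reserves. A trivial, hypothetical driver sketch for context:

    #include <linux/slab.h>
    #include <linux/interrupt.h>

    struct example_event {
            u64 timestamp;
    };

    static irqreturn_t example_irq_handler(int irq, void *dev_id)
    {
            /* Atomic context: GFP_ATOMIC (never GFP_KERNEL), and the
             * allocation may still fail, so handle NULL gracefully. */
            struct example_event *ev = kmalloc(sizeof(*ev), GFP_ATOMIC);

            if (!ev)
                    return IRQ_NONE;
            /* ...fill in and hand off ev... */
            return IRQ_HANDLED;
    }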
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 158a3a2907dc..177b07944640 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -243,6 +243,16 @@ static inline void clear_highpage(struct page *page)
kunmap_local(kaddr);
}
+static inline void clear_highpage_kasan_tagged(struct page *page)
+{
+ u8 tag;
+
+ tag = page_kasan_tag(page);
+ page_kasan_tag_reset(page);
+ clear_highpage(page);
+ page_kasan_tag_set(page, tag);
+}
+
#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
static inline void tag_clear_highpage(struct page *page)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index de29821231c9..ae3d8e2fd9e2 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -116,9 +116,30 @@ extern struct kobj_attribute shmem_enabled_attr;
extern unsigned long transparent_hugepage_flags;
+#define hugepage_flags_enabled() \
+ (transparent_hugepage_flags & \
+ ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
+#define hugepage_flags_always() \
+ (transparent_hugepage_flags & \
+ (1<<TRANSPARENT_HUGEPAGE_FLAG))
+
+/*
+ * Do the below checks:
+ * - For file vma, check if the linear page offset of vma is
+ * HPAGE_PMD_NR aligned within the file. The hugepage is
+ * guaranteed to be hugepage-aligned within the file, but we must
+ * check that the PMD-aligned addresses in the VMA map to
+ * PMD-aligned offsets within the file, else the hugepage will
+ * not be PMD-mappable.
+ * - For all vmas, check if the haddr is in an aligned HPAGE_PMD_SIZE
+ * area.
+ */
static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
- unsigned long haddr)
+ unsigned long addr)
{
+ unsigned long haddr;
+
/* Don't have to check pgoff for anonymous vma */
if (!vma_is_anonymous(vma)) {
if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
@@ -126,53 +147,13 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
return false;
}
- if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
- return false;
- return true;
-}
+ haddr = addr & HPAGE_PMD_MASK;
-static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
- unsigned long vm_flags)
-{
- /* Explicitly disabled through madvise. */
- if ((vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
return false;
return true;
}
-/*
- * to be used on vmas which are known to support THP.
- * Use transparent_hugepage_active otherwise
- */
-static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
-{
-
- /*
- * If the hardware/firmware marked hugepage support disabled.
- */
- if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
- return false;
-
- if (!transhuge_vma_enabled(vma, vma->vm_flags))
- return false;
-
- if (vma_is_temporary_stack(vma))
- return false;
-
- if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
- return true;
-
- if (vma_is_dax(vma))
- return true;
-
- if (transparent_hugepage_flags &
- (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
- return !!(vma->vm_flags & VM_HUGEPAGE);
-
- return false;
-}
-
static inline bool file_thp_enabled(struct vm_area_struct *vma)
{
struct inode *inode;
@@ -187,7 +168,9 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
!inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
}
-bool transparent_hugepage_active(struct vm_area_struct *vma);
+bool hugepage_vma_check(struct vm_area_struct *vma,
+ unsigned long vm_flags,
+ bool smaps, bool in_pf);
#define transparent_hugepage_use_zero_page() \
(transparent_hugepage_flags & \
@@ -331,24 +314,15 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
return false;
}
-static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
-{
- return false;
-}
-
-static inline bool transparent_hugepage_active(struct vm_area_struct *vma)
-{
- return false;
-}
-
static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
- unsigned long haddr)
+ unsigned long addr)
{
return false;
}
-static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
- unsigned long vm_flags)
+static inline bool hugepage_vma_check(struct vm_area_struct *vma,
+ unsigned long vm_flags,
+ bool smaps, bool in_pf)
{
return false;
}
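A hedged sketch, not part of the patch, of how the reworked helpers compose on a fault-style path: hugepage_vma_check() (with smaps == false, in_pf == true) gates eligibility, and transhuge_vma_suitable() now takes the raw faulting address and does the PMD alignment check itself:

    #include <linux/mm.h>
    #include <linux/huge_mm.h>

    static bool example_may_use_pmd(struct vm_area_struct *vma,
                                    unsigned long addr)
    {
            if (!hugepage_vma_check(vma, vma->vm_flags, false, true))
                    return false;   /* THP not usable for this vma */

            /* Checks file-offset alignment and that the aligned PMD range
             * fits inside the vma. */
            return transhuge_vma_suitable(vma, addr);
    }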
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e4cff27d1198..75ee739d815b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -170,7 +170,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
vm_flags_t vm_flags);
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed);
-bool isolate_huge_page(struct page *page, struct list_head *list);
+int isolate_hugetlb(struct page *page, struct list_head *list);
int get_hwpoison_huge_page(struct page *page, bool *hugetlb);
int get_huge_page_for_hwpoison(unsigned long pfn, int flags);
void putback_active_hugepage(struct page *page);
@@ -194,8 +194,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz);
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
+unsigned long hugetlb_mask_last_page(struct hstate *h);
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long *addr, pte_t *ptep);
+ unsigned long addr, pte_t *ptep);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end);
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
@@ -242,7 +243,7 @@ static inline struct address_space *hugetlb_page_mapping_lock_write(
static inline int huge_pmd_unshare(struct mm_struct *mm,
struct vm_area_struct *vma,
- unsigned long *addr, pte_t *ptep)
+ unsigned long addr, pte_t *ptep)
{
return 0;
}
@@ -376,9 +377,9 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
return NULL;
}
-static inline bool isolate_huge_page(struct page *page, struct list_head *list)
+static inline int isolate_hugetlb(struct page *page, struct list_head *list)
{
- return false;
+ return -EBUSY;
}
static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
@@ -903,14 +904,6 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
atomic_long_sub(l, &mm->hugetlb_usage);
}
-#ifndef set_huge_swap_pte_at
-static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte, unsigned long sz)
-{
- set_huge_pte_at(mm, addr, ptep, pte);
-}
-#endif
-
#ifndef huge_ptep_modify_prot_start
#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
@@ -1094,11 +1087,6 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
{
}
-static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte, unsigned long sz)
-{
-}
-
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index 86249476b57f..0b35a41440ff 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -688,7 +688,7 @@ __kfifo_uint_must_check_helper( \
* writer, you don't need extra locking to use these macro.
*/
#define kfifo_to_user(fifo, to, len, copied) \
-__kfifo_uint_must_check_helper( \
+__kfifo_int_must_check_helper( \
({ \
typeof((fifo) + 1) __tmp = (fifo); \
void __user *__to = (to); \
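The fix above matters to callers that propagate the macro's return value: kfifo_to_user() returns an int (0 or -EFAULT), so it must go through the int must-check helper rather than the unsigned one. A small illustrative read() path, with hypothetical names:

    #include <linux/fs.h>
    #include <linux/kfifo.h>

    static DEFINE_KFIFO(example_fifo, char, 128);

    static ssize_t example_read(struct file *file, char __user *buf,
                                size_t count, loff_t *ppos)
    {
            unsigned int copied;
            int ret = kfifo_to_user(&example_fifo, buf, count, &copied);

            return ret ? ret : copied;
    }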
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 392d34c3c59a..384f034ae947 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -10,8 +10,6 @@ extern struct attribute_group khugepaged_attr_group;
extern int khugepaged_init(void);
extern void khugepaged_destroy(void);
extern int start_stop_khugepaged(void);
-extern bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags);
extern void __khugepaged_enter(struct mm_struct *mm);
extern void __khugepaged_exit(struct mm_struct *mm);
extern void khugepaged_enter_vma(struct vm_area_struct *vma,
@@ -26,20 +24,6 @@ static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
}
#endif
-#define khugepaged_enabled() \
- (transparent_hugepage_flags & \
- ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \
- (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
-#define khugepaged_always() \
- (transparent_hugepage_flags & \
- (1<<TRANSPARENT_HUGEPAGE_FLAG))
-#define khugepaged_req_madv() \
- (transparent_hugepage_flags & \
- (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
-#define khugepaged_defrag() \
- (transparent_hugepage_flags & \
- (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
-
static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
@@ -51,16 +35,6 @@ static inline void khugepaged_exit(struct mm_struct *mm)
if (test_bit(MMF_VM_HUGEPAGE, &mm->flags))
__khugepaged_exit(mm);
}
-
-static inline void khugepaged_enter(struct vm_area_struct *vma,
- unsigned long vm_flags)
-{
- if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
- khugepaged_enabled()) {
- if (hugepage_vma_check(vma, vm_flags))
- __khugepaged_enter(vma->vm_mm);
- }
-}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
@@ -68,10 +42,6 @@ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm
static inline void khugepaged_exit(struct mm_struct *mm)
{
}
-static inline void khugepaged_enter(struct vm_area_struct *vma,
- unsigned long vm_flags)
-{
-}
static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
unsigned long vm_flags)
{
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
new file mode 100644
index 000000000000..3c6642358904
--- /dev/null
+++ b/include/linux/maple_tree.h
@@ -0,0 +1,685 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef _LINUX_MAPLE_TREE_H
+#define _LINUX_MAPLE_TREE_H
+/*
+ * Maple Tree - An RCU-safe adaptive tree for storing ranges
+ * Copyright (c) 2018-2022 Oracle
+ * Authors: Liam R. Howlett <Liam.Howlett@Oracle.com>
+ * Matthew Wilcox <willy@infradead.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+/* #define CONFIG_MAPLE_RCU_DISABLED */
+/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */
+
+/*
+ * Allocated nodes are mutable until they have been inserted into the tree,
+ * at which time they cannot change their type until they have been removed
+ * from the tree and an RCU grace period has passed.
+ *
+ * Removed nodes have their ->parent set to point to themselves. RCU readers
+ * check ->parent before relying on the value that they loaded from the
+ * slots array. This lets us reuse the slots array for the RCU head.
+ *
+ * Nodes in the tree point to their parent unless bit 0 is set.
+ */
+#if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64)
+/* 64bit sizes */
+#define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */
+#define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */
+#define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */
+#define MAPLE_ARANGE64_META_MAX 15 /* Out of range for metadata */
+#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1)
+#else
+/* 32bit sizes */
+#define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */
+#define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */
+#define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */
+#define MAPLE_ARANGE64_META_MAX 31 /* Out of range for metadata */
+#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2)
+#endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */
+
+#define MAPLE_NODE_MASK 255UL
+
+/*
+ * The node->parent of the root node has bit 0 set and the rest of the pointer
+ * is a pointer to the tree itself. No more bits are available in this pointer
+ * (on m68k, the data structure may only be 2-byte aligned).
+ *
+ * Internal non-root nodes can only have maple_range_* nodes as parents. The
+ * parent pointer is 256B aligned like all other tree nodes. When storing a 32
+ * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an
+ * extra bit to store the offset. This extra bit comes from a reuse of the last
+ * bit in the node type. This is possible by using bit 1 to indicate if bit 2
+ * is part of the type or the slot.
+ *
+ * Once the type is decided, the decision of an allocation range type or a range
+ * type is done by examining the immutable tree flag for the MAPLE_ALLOC_RANGE
+ * flag.
+ *
+ * Node types:
+ * 0x??1 = Root
+ * 0x?00 = 16 bit nodes
+ * 0x010 = 32 bit nodes
+ * 0x110 = 64 bit nodes
+ *
+ * Slot size and location in the parent pointer:
+ * type : slot location
+ * 0x??1 : Root
+ * 0x?00 : 16 bit values, type in 0-1, slot in 2-6
+ * 0x010 : 32 bit values, type in 0-2, slot in 3-6
+ * 0x110 : 64 bit values, type in 0-2, slot in 3-6
+ */
+typedef struct maple_enode *maple_enode; /* encoded node */
+typedef struct maple_pnode *maple_pnode; /* parent node */
+
+/*
+ * This metadata is used to optimize the gap updating code and in reverse
+ * searching for gaps or any other code that needs to find the end of the data.
+ */
+struct maple_metadata {
+ unsigned char end;
+ unsigned char gap;
+};
+
+/*
+ * Leaf nodes do not store pointers to nodes, they store user data. Users may
+ * store almost any bit pattern. As noted above, the optimisation of storing an
+ * entry at 0 in the root pointer cannot be done for data which have the bottom
+ * two bits set to '10'. We also reserve values with the bottom two bits set to
+ * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs
+ * return errnos as a negative errno shifted right by two bits and the bottom
+ * two bits set to '10', and while choosing to store these values in the array
+ * is not an error, it may lead to confusion if you're testing for an error with
+ * mas_is_err().
+ *
+ * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits
+ * 3-6), bit 2 is reserved. That leaves bits 0-1 unused for now.
+ *
+ * In regular B-Tree terms, pivots are called keys. The term pivot is used to
+ * indicate that the tree is specifying ranges. Pivots may appear in the
+ * subtree with an entry attached to the value whereas keys are unique to a
+ * specific position of a B-tree. Pivot values are inclusive of the slot with
+ * the same index.
+ */
+
+struct maple_range_64 {
+ struct maple_pnode *parent;
+ unsigned long pivot[MAPLE_RANGE64_SLOTS - 1];
+ union {
+ void __rcu *slot[MAPLE_RANGE64_SLOTS];
+ struct {
+ void __rcu *pad[MAPLE_RANGE64_SLOTS - 1];
+ struct maple_metadata meta;
+ };
+ };
+};
+
+/*
+ * At tree creation time, the user can specify that they're willing to trade off
+ * storing fewer entries in a tree in return for storing more information in
+ * each node.
+ *
+ * The maple tree supports recording the largest range of NULL entries available
+ * in this node, also called gaps. This optimises the tree for allocating a
+ * range.
+ */
+struct maple_arange_64 {
+ struct maple_pnode *parent;
+ unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1];
+ void __rcu *slot[MAPLE_ARANGE64_SLOTS];
+ unsigned long gap[MAPLE_ARANGE64_SLOTS];
+ struct maple_metadata meta;
+};
+
+struct maple_alloc {
+ unsigned long total;
+ unsigned char node_count;
+ unsigned int request_count;
+ struct maple_alloc *slot[MAPLE_ALLOC_SLOTS];
+};
+
+struct maple_topiary {
+ struct maple_pnode *parent;
+ struct maple_enode *next; /* Overlaps the pivot */
+};
+
+enum maple_type {
+ maple_dense,
+ maple_leaf_64,
+ maple_range_64,
+ maple_arange_64,
+};
+
+
+/**
+ * DOC: Maple tree flags
+ *
+ * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree
+ * * MT_FLAGS_USE_RCU - Operate in RCU mode
+ * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags
+ * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value
+ * * MT_FLAGS_LOCK_MASK - How the mt_lock is used
+ * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe
+ * * MT_FLAGS_LOCK_BH - Acquired bh-safe
+ * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used
+ *
+ * MAPLE_HEIGHT_MAX The largest height that can be stored
+ */
+#define MT_FLAGS_ALLOC_RANGE 0x01
+#define MT_FLAGS_USE_RCU 0x02
+#define MT_FLAGS_HEIGHT_OFFSET 0x02
+#define MT_FLAGS_HEIGHT_MASK 0x7C
+#define MT_FLAGS_LOCK_MASK 0x300
+#define MT_FLAGS_LOCK_IRQ 0x100
+#define MT_FLAGS_LOCK_BH 0x200
+#define MT_FLAGS_LOCK_EXTERN 0x300
+
+#define MAPLE_HEIGHT_MAX 31
+
+
+#define MAPLE_NODE_TYPE_MASK 0x0F
+#define MAPLE_NODE_TYPE_SHIFT 0x03
+
+#define MAPLE_RESERVED_RANGE 4096
+
+#ifdef CONFIG_LOCKDEP
+typedef struct lockdep_map *lockdep_map_p;
+#define mt_lock_is_held(mt) lock_is_held(mt->ma_external_lock)
+#define mt_set_external_lock(mt, lock) \
+ (mt)->ma_external_lock = &(lock)->dep_map
+#else
+typedef struct { /* nothing */ } lockdep_map_p;
+#define mt_lock_is_held(mt) 1
+#define mt_set_external_lock(mt, lock) do { } while (0)
+#endif
+
+/*
+ * If the tree contains a single entry at index 0, it is usually stored in
+ * tree->ma_root. To optimise for the page cache, an entry which ends in '00',
+ * '01' or '11' is stored in the root, but an entry which ends in '10' will be
+ * stored in a node. Bits 3-6 are used to store enum maple_type.
+ *
+ * The flags are used both to store some immutable information about this tree
+ * (set at tree creation time) and dynamic information set under the spinlock.
+ *
+ * Another use of flags are to indicate global states of the tree. This is the
+ * case with the MAPLE_USE_RCU flag, which indicates the tree is currently in
+ * RCU mode. This mode was added to allow the tree to reuse nodes instead of
+ * re-allocating and RCU freeing nodes when there is a single user.
+ */
+struct maple_tree {
+ union {
+ spinlock_t ma_lock;
+ lockdep_map_p ma_external_lock;
+ };
+ void __rcu *ma_root;
+ unsigned int ma_flags;
+};
+
+/**
+ * MTREE_INIT() - Initialize a maple tree
+ * @name: The maple tree name
+ * @flags: The maple tree flags
+ *
+ */
+#define MTREE_INIT(name, flags) { \
+ .ma_lock = __SPIN_LOCK_UNLOCKED(name.ma_lock), \
+ .ma_flags = flags, \
+ .ma_root = NULL, \
+}
+
+/**
+ * MTREE_INIT_EXT() - Initialize a maple tree with an external lock.
+ * @name: The tree name
+ * @flags: The maple tree flags
+ * @lock: The external lock
+ */
+#ifdef CONFIG_LOCKDEP
+#define MTREE_INIT_EXT(name, flags, lock) { \
+ .ma_external_lock = &(lock).dep_map, \
+ .ma_flags = flags, \
+ .ma_root = NULL, \
+}
+#else
+#define MTREE_INIT_EXT(name, flags, lock) MTREE_INIT(name, flags)
+#endif
+
+#define DEFINE_MTREE(name) \
+ struct maple_tree name = MTREE_INIT(name, 0)
+
+#define mtree_lock(mt) spin_lock((&(mt)->ma_lock))
+#define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock))
+
+/*
+ * The Maple Tree squeezes various bits in at various points which aren't
+ * necessarily obvious. Usually, this is done by observing that pointers are
+ * N-byte aligned and thus the bottom log_2(N) bits are available for use. We
+ * don't use the high bits of pointers to store additional information because
+ * we don't know what bits are unused on any given architecture.
+ *
+ * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8
+ * low bits for our own purposes. Nodes are currently of 4 types:
+ * 1. Single pointer (Range is 0-0)
+ * 2. Non-leaf Allocation Range nodes
+ * 3. Non-leaf Range nodes
+ * 4. Leaf Range nodes All nodes consist of a number of node slots,
+ * pivots, and a parent pointer.
+ */
+
+struct maple_node {
+ union {
+ struct {
+ struct maple_pnode *parent;
+ void __rcu *slot[MAPLE_NODE_SLOTS];
+ };
+ struct {
+ void *pad;
+ struct rcu_head rcu;
+ struct maple_enode *piv_parent;
+ unsigned char parent_slot;
+ enum maple_type type;
+ unsigned char slot_len;
+ unsigned int ma_flags;
+ };
+ struct maple_range_64 mr64;
+ struct maple_arange_64 ma64;
+ struct maple_alloc alloc;
+ };
+};
+
+/*
+ * More complicated stores can cause two nodes to become one or three and
+ * potentially alter the height of the tree. Either half of the tree may need
+ * to be rebalanced against the other. The ma_topiary struct is used to track
+ * which nodes have been 'cut' from the tree so that the change can be done
+ * safely at a later date. This is done to support RCU.
+ */
+struct ma_topiary {
+ struct maple_enode *head;
+ struct maple_enode *tail;
+ struct maple_tree *mtree;
+};
+
+void *mtree_load(struct maple_tree *mt, unsigned long index);
+
+int mtree_insert(struct maple_tree *mt, unsigned long index,
+ void *entry, gfp_t gfp);
+int mtree_insert_range(struct maple_tree *mt, unsigned long first,
+ unsigned long last, void *entry, gfp_t gfp);
+int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp,
+ void *entry, unsigned long size, unsigned long min,
+ unsigned long max, gfp_t gfp);
+int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp,
+ void *entry, unsigned long size, unsigned long min,
+ unsigned long max, gfp_t gfp);
+
+int mtree_store_range(struct maple_tree *mt, unsigned long first,
+ unsigned long last, void *entry, gfp_t gfp);
+int mtree_store(struct maple_tree *mt, unsigned long index,
+ void *entry, gfp_t gfp);
+void *mtree_erase(struct maple_tree *mt, unsigned long index);
+
+void mtree_destroy(struct maple_tree *mt);
+void __mt_destroy(struct maple_tree *mt);
+
+/**
+ * mtree_empty() - Determine if a tree has any present entries.
+ * @mt: Maple Tree.
+ *
+ * Context: Any context.
+ * Return: %true if the tree contains only NULL pointers.
+ */
+static inline bool mtree_empty(const struct maple_tree *mt)
+{
+ return mt->ma_root == NULL;
+}
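A short usage sketch of the basic API declared above (illustrative only, with hypothetical names): ranges are inclusive, and erasing any index inside a range removes the whole entry:

    #include <linux/maple_tree.h>
    #include <linux/gfp.h>

    static DEFINE_MTREE(example_mt);

    static int example_basic(void *item)
    {
            int ret;

            /* Associate indices 10..19 (inclusive) with @item. */
            ret = mtree_store_range(&example_mt, 10, 19, item, GFP_KERNEL);
            if (ret)
                    return ret;

            WARN_ON(mtree_load(&example_mt, 15) != item);   /* inside the range */
            WARN_ON(mtree_load(&example_mt, 20) != NULL);   /* outside the range */

            mtree_erase(&example_mt, 12);   /* removes the whole 10..19 entry */
            mtree_destroy(&example_mt);
            return 0;
    }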
+
+/* Advanced API */
+
+/*
+ * The maple state is defined in the struct ma_state and is used to keep track
+ * of information during operations, and even between operations when using the
+ * advanced API.
+ *
+ * If state->node has bit 0 set then it references a tree location which is not
+ * a node (eg the root). If bit 1 is set, the rest of the bits are a negative
+ * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the
+ * node type.
+ *
+ * state->alloc either has a requested number of nodes or an allocated node. If
+ * state->alloc has a requested number of nodes, the first bit will be set (0x1)
+ * and the remaining bits are the value. If state->alloc is a node, then the
+ * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for
+ * storing more allocated nodes, a total number of nodes allocated, and the
+ * node_count in this node. node_count is the number of allocated nodes in this
+ * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further
+ * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc
+ * by removing a node from the state->alloc node until state->alloc->node_count
+ * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted
+ * to state->alloc. Nodes are pushed onto state->alloc by putting the current
+ * state->alloc into the pushed node's slot[0].
+ *
+ * The state also contains the implied min/max of the state->node, the depth of
+ * this search, and the offset. The implied min/max are either from the parent
+ * node or are 0-oo for the root node. The depth is incremented or decremented
+ * every time a node is walked down or up. The offset is the slot/pivot of
+ * interest in the node - either for reading or writing.
+ *
+ * When returning a value the maple state index and last respectively contain
+ * the start and end of the range for the entry. Ranges are inclusive in the
+ * Maple Tree.
+ */
+struct ma_state {
+ struct maple_tree *tree; /* The tree we're operating in */
+ unsigned long index; /* The index we're operating on - range start */
+ unsigned long last; /* The last index we're operating on - range end */
+ struct maple_enode *node; /* The node containing this entry */
+ unsigned long min; /* The minimum index of this node - implied pivot min */
+ unsigned long max; /* The maximum index of this node - implied pivot max */
+ struct maple_alloc *alloc; /* Allocated nodes for this operation */
+ unsigned char depth; /* depth of tree descent during write */
+ unsigned char offset;
+ unsigned char mas_flags;
+};
+
+struct ma_wr_state {
+ struct ma_state *mas;
+ struct maple_node *node; /* Decoded mas->node */
+ unsigned long r_min; /* range min */
+ unsigned long r_max; /* range max */
+ enum maple_type type; /* mas->node type */
+ unsigned char offset_end; /* The offset where the write ends */
+ unsigned char node_end; /* mas->node end */
+ unsigned long *pivots; /* mas->node->pivots pointer */
+ unsigned long end_piv; /* The pivot at the offset end */
+ void __rcu **slots; /* mas->node->slots pointer */
+ void *entry; /* The entry to write */
+ void *content; /* The existing entry that is being overwritten */
+};
+
+#define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock))
+#define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock))
+
+
+/*
+ * Special values for ma_state.node.
+ * MAS_START means we have not searched the tree.
+ * MAS_ROOT means we have searched the tree and the entry we found lives in
+ * the root of the tree (ie it has index 0, length 1 and is the only entry in
+ * the tree).
+ * MAS_NONE means we have searched the tree and there is no node in the
+ * tree for this entry. For example, we searched for index 1 in an empty
+ * tree. Or we have a tree which points to a full leaf node and we
+ * searched for an entry which is larger than can be contained in that
+ * leaf node.
+ * MA_ERROR represents an errno. After dropping the lock and attempting
+ * to resolve the error, the walk would have to be restarted from the
+ * top of the tree as the tree may have been modified.
+ */
+#define MAS_START ((struct maple_enode *)1UL)
+#define MAS_ROOT ((struct maple_enode *)5UL)
+#define MAS_NONE ((struct maple_enode *)9UL)
+#define MAS_PAUSE ((struct maple_enode *)17UL)
+#define MA_ERROR(err) \
+ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL))
+
+#define MA_STATE(name, mt, first, end) \
+ struct ma_state name = { \
+ .tree = mt, \
+ .index = first, \
+ .last = end, \
+ .node = MAS_START, \
+ .min = 0, \
+ .max = ULONG_MAX, \
+ }
+
+#define MA_WR_STATE(name, ma_state, wr_entry) \
+ struct ma_wr_state name = { \
+ .mas = ma_state, \
+ .content = NULL, \
+ .entry = wr_entry, \
+ }
+
+#define MA_TOPIARY(name, tree) \
+ struct ma_topiary name = { \
+ .head = NULL, \
+ .tail = NULL, \
+ .mtree = tree, \
+ }
+
+void *mas_walk(struct ma_state *mas);
+void *mas_store(struct ma_state *mas, void *entry);
+void *mas_erase(struct ma_state *mas);
+int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp);
+void mas_store_prealloc(struct ma_state *mas, void *entry);
+void *mas_find(struct ma_state *mas, unsigned long max);
+void *mas_find_rev(struct ma_state *mas, unsigned long min);
+int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp);
+
+bool mas_nomem(struct ma_state *mas, gfp_t gfp);
+void mas_pause(struct ma_state *mas);
+void maple_tree_init(void);
+void mas_destroy(struct ma_state *mas);
+int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries);
+
+void *mas_prev(struct ma_state *mas, unsigned long min);
+void *mas_next(struct ma_state *mas, unsigned long max);
+
+int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max,
+ unsigned long size);
+
+/* Checks if a mas has not found anything */
+static inline bool mas_is_none(struct ma_state *mas)
+{
+ return mas->node == MAS_NONE;
+}
+
+/* Checks if a mas has been paused */
+static inline bool mas_is_paused(struct ma_state *mas)
+{
+ return mas->node == MAS_PAUSE;
+}
+
+void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas);
+void mas_dup_store(struct ma_state *mas, void *entry);
+
+/*
+ * This finds an empty area from the highest address to the lowest.
+ * AKA "Topdown" version,
+ */
+int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
+ unsigned long max, unsigned long size);
+/**
+ * mas_reset() - Reset a Maple Tree operation state.
+ * @mas: Maple Tree operation state.
+ *
+ * Resets the error or walk state of the @mas so future walks of the
+ * array will start from the root. Use this if you have dropped the
+ * lock and want to reuse the ma_state.
+ *
+ * Context: Any context.
+ */
+static inline void mas_reset(struct ma_state *mas)
+{
+ mas->node = MAS_START;
+}
+
+/**
+ * mas_for_each() - Iterate over a range of the maple tree.
+ * @mas: Maple Tree operation state (maple_state)
+ * @entry: Entry retrieved from the tree
+ * @max: maximum index to retrieve from the tree
+ *
+ * When returned, mas->index and mas->last will hold the entire range for the
+ * entry.
+ *
+ * Note: may return the zero entry.
+ *
+ */
+#define mas_for_each(mas, entry, max) \
+ while (((entry) = mas_find((mas), (max))) != NULL)
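An illustrative walk with the advanced API: MA_STATE() declares the cursor, and here the iteration runs under the internal tree lock (an RCU reader would instead hold rcu_read_lock() and use mas_pause() before dropping it):

    #include <linux/maple_tree.h>

    static void example_dump_entries(struct maple_tree *mt)
    {
            void *entry;
            MA_STATE(mas, mt, 0, 0);        /* start the walk at index 0 */

            mas_lock(&mas);
            mas_for_each(&mas, entry, ULONG_MAX)
                    pr_info("range %lu-%lu -> %p\n",
                            mas.index, mas.last, entry);
            mas_unlock(&mas);
    }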
+
+
+/**
+ * mas_set_range() - Set up Maple Tree operation state for a different index.
+ * @mas: Maple Tree operation state.
+ * @start: New start of range in the Maple Tree.
+ * @last: New end of range in the Maple Tree.
+ *
+ * Move the operation state to refer to a different range. This will
+ * have the effect of starting a walk from the top; see mas_next()
+ * to move to an adjacent index.
+ */
+static inline
+void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last)
+{
+ mas->index = start;
+ mas->last = last;
+ mas->node = MAS_START;
+}
+
+/**
+ * mas_set() - Set up Maple Tree operation state for a different index.
+ * @mas: Maple Tree operation state.
+ * @index: New index into the Maple Tree.
+ *
+ * Move the operation state to refer to a different index. This will
+ * have the effect of starting a walk from the top; see mas_next()
+ * to move to an adjacent index.
+ */
+static inline void mas_set(struct ma_state *mas, unsigned long index)
+{
+
+ mas_set_range(mas, index, index);
+}
+
+static inline bool mt_external_lock(const struct maple_tree *mt)
+{
+ return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN;
+}
+
+/**
+ * mt_init_flags() - Initialise an empty maple tree with flags.
+ * @mt: Maple Tree
+ * @flags: maple tree flags.
+ *
+ * If you need to initialise a Maple Tree with special flags (eg, an
+ * allocation tree), use this function.
+ *
+ * Context: Any context.
+ */
+static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags)
+{
+ mt->ma_flags = flags;
+ if (!mt_external_lock(mt))
+ spin_lock_init(&mt->ma_lock);
+ rcu_assign_pointer(mt->ma_root, NULL);
+}
+
+/**
+ * mt_init() - Initialise an empty maple tree.
+ * @mt: Maple Tree
+ *
+ * An empty Maple Tree.
+ *
+ * Context: Any context.
+ */
+static inline void mt_init(struct maple_tree *mt)
+{
+ mt_init_flags(mt, 0);
+}
+
+static inline bool mt_in_rcu(struct maple_tree *mt)
+{
+#ifdef CONFIG_MAPLE_RCU_DISABLED
+ return false;
+#endif
+ return mt->ma_flags & MT_FLAGS_USE_RCU;
+}
+
+/**
+ * mt_clear_in_rcu() - Switch the tree to non-RCU mode.
+ * @mt: The Maple Tree
+ */
+static inline void mt_clear_in_rcu(struct maple_tree *mt)
+{
+ if (!mt_in_rcu(mt))
+ return;
+
+ if (mt_external_lock(mt)) {
+ BUG_ON(!mt_lock_is_held(mt));
+ mt->ma_flags &= ~MT_FLAGS_USE_RCU;
+ } else {
+ mtree_lock(mt);
+ mt->ma_flags &= ~MT_FLAGS_USE_RCU;
+ mtree_unlock(mt);
+ }
+}
+
+/**
+ * mt_set_in_rcu() - Switch the tree to RCU safe mode.
+ * @mt: The Maple Tree
+ */
+static inline void mt_set_in_rcu(struct maple_tree *mt)
+{
+ if (mt_in_rcu(mt))
+ return;
+
+ if (mt_external_lock(mt)) {
+ BUG_ON(!mt_lock_is_held(mt));
+ mt->ma_flags |= MT_FLAGS_USE_RCU;
+ } else {
+ mtree_lock(mt);
+ mt->ma_flags |= MT_FLAGS_USE_RCU;
+ mtree_unlock(mt);
+ }
+}
+
+void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max);
+void *mt_find_after(struct maple_tree *mt, unsigned long *index,
+ unsigned long max);
+void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min);
+void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max);
+
+/**
+ * mt_for_each - Iterate over each entry starting at index until max.
+ * @tree: The Maple Tree
+ * @entry: The current entry
+ * @index: The index to update to track the location in the tree
+ * @max: The maximum limit for @index
+ *
+ * Note: Will not return the zero entry.
+ */
+#define mt_for_each(tree, entry, index, max) \
+ for (entry = mt_find(tree, &(index), max); \
+ entry; entry = mt_find_after(tree, &(index), max))
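The non-mas counterpart, sketched for illustration: mt_find() advances @index past each entry it returns, so mt_for_each() resumes after the previous hit and skips NULL ranges:

    #include <linux/maple_tree.h>

    static unsigned int example_count_entries(struct maple_tree *mt)
    {
            unsigned long index = 0;
            unsigned int nr = 0;
            void *entry;

            mt_for_each(mt, entry, index, ULONG_MAX)
                    nr++;

            return nr;
    }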
+
+
+#ifdef CONFIG_DEBUG_MAPLE_TREE
+extern atomic_t maple_tree_tests_run;
+extern atomic_t maple_tree_tests_passed;
+
+void mt_dump(const struct maple_tree *mt);
+void mt_validate(struct maple_tree *mt);
+#define MT_BUG_ON(tree, x) do { \
+ atomic_inc(&maple_tree_tests_run); \
+ if (x) { \
+ pr_info("BUG at %s:%d (%u)\n", \
+ __func__, __LINE__, x); \
+ mt_dump(tree); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ dump_stack(); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+} while (0)
+#else
+#define MT_BUG_ON(tree, x) BUG_ON(x)
+#endif /* CONFIG_DEBUG_MAPLE_TREE */
+
+#endif /*_LINUX_MAPLE_TREE_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 04f2f33607e9..744cde2b2368 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -321,10 +321,10 @@ struct mem_cgroup {
#ifdef CONFIG_MEMCG_KMEM
int kmemcg_id;
+#endif
struct obj_cgroup __rcu *objcg;
/* list of inherited objcgs, protected by objcg_lock */
struct list_head objcg_list;
-#endif
MEMCG_PADDING(_pad2_);
@@ -353,6 +353,26 @@ struct mem_cgroup {
struct mem_cgroup_per_node *nodeinfo[];
};
+struct memcg_reparent_ops {
+ /*
+ * Note that interrupt is disabled before calling those callbacks,
+ * so the interrupt should remain disabled when leaving those callbacks.
+ */
+ void (*lock)(struct mem_cgroup *src, struct mem_cgroup *dst);
+ void (*relocate)(struct mem_cgroup *src, struct mem_cgroup *dst);
+ void (*unlock)(struct mem_cgroup *src, struct mem_cgroup *dst);
+};
+
+#define DEFINE_MEMCG_REPARENT_OPS(name) \
+ const struct memcg_reparent_ops memcg_##name##_reparent_ops = { \
+ .lock = name##_reparent_lock, \
+ .relocate = name##_reparent_relocate, \
+ .unlock = name##_reparent_unlock, \
+ }
+
+#define DECLARE_MEMCG_REPARENT_OPS(name) \
+ extern const struct memcg_reparent_ops memcg_##name##_reparent_ops
+
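To show how the helper macros are meant to be wired up, here is a hypothetical "example" user, purely for illustration: the callbacks follow the name##_reparent_{lock,relocate,unlock} convention that DEFINE_MEMCG_REPARENT_OPS() expects, and interrupts stay disabled across all three:

    #include <linux/memcontrol.h>

    static void example_reparent_lock(struct mem_cgroup *src, struct mem_cgroup *dst)
    {
            /* Take the subsystem locks of both memcgs; irqs are already off. */
    }

    static void example_reparent_relocate(struct mem_cgroup *src, struct mem_cgroup *dst)
    {
            /* Move objects charged to @src over to @dst. */
    }

    static void example_reparent_unlock(struct mem_cgroup *src, struct mem_cgroup *dst)
    {
            /* Release the locks taken in ->lock(). */
    }

    DEFINE_MEMCG_REPARENT_OPS(example);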
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
@@ -372,14 +392,12 @@ enum page_memcg_data_flags {
#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
-static inline bool folio_memcg_kmem(struct folio *folio);
-
/*
* After the initialization objcg->memcg is always pointing at
* a valid memcg, but can be atomically swapped to the parent memcg.
*
* The caller must ensure that the returned memcg won't be released:
- * e.g. acquire the rcu_read_lock or css_set_lock.
+ * e.g. acquire the rcu_read_lock or objcg_lock or cgroup_mutex.
*/
static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
{
@@ -387,43 +405,19 @@ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
}
/*
- * __folio_memcg - Get the memory cgroup associated with a non-kmem folio
- * @folio: Pointer to the folio.
- *
- * Returns a pointer to the memory cgroup associated with the folio,
- * or NULL. This function assumes that the folio is known to have a
- * proper memory cgroup pointer. It's not safe to call this function
- * against some type of folios, e.g. slab folios or ex-slab folios or
- * kmem folios.
- */
-static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
-{
- unsigned long memcg_data = folio->memcg_data;
-
- VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);
-
- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
-}
-
-/*
- * __folio_objcg - get the object cgroup associated with a kmem folio.
+ * folio_objcg - get the object cgroup associated with a folio.
* @folio: Pointer to the folio.
*
* Returns a pointer to the object cgroup associated with the folio,
* or NULL. This function assumes that the folio is known to have a
- * proper object cgroup pointer. It's not safe to call this function
- * against some type of folios, e.g. slab folios or ex-slab folios or
- * LRU folios.
+ * proper object cgroup pointer.
*/
-static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
+static inline struct obj_cgroup *folio_objcg(struct folio *folio)
{
unsigned long memcg_data = folio->memcg_data;
VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
- VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);
return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
@@ -437,30 +431,79 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
* proper memory cgroup pointer. It's not safe to call this function
* against some type of folios, e.g. slab folios or ex-slab folios.
*
- * For a non-kmem folio any of the following ensures folio and memcg binding
- * stability:
+ * For a folio any of the following ensures folio and objcg binding stability:
*
* - the folio lock
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
*
- * For a kmem folio a caller should hold an rcu read lock to protect memcg
- * associated with a kmem folio from being released.
+ * Based on the stable binding of folio and objcg, for a folio any of the
+ * following ensures folio and memcg binding stability:
+ *
+ * - objcg_lock
+ * - cgroup_mutex
+ * - the lruvec lock
+ * - the split queue lock (only THP page)
+ *
+ * If the caller only wants to ensure that the page counters of the memcg are
+ * updated correctly, ensuring the binding stability of folio and objcg
+ * is sufficient.
+ *
+ * Note: The caller should hold an rcu read lock to protect memcg associated
+ * with a folio from being released.
*/
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
- if (folio_memcg_kmem(folio))
- return obj_cgroup_memcg(__folio_objcg(folio));
- return __folio_memcg(folio);
+ struct obj_cgroup *objcg = folio_objcg(folio);
+
+ return objcg ? obj_cgroup_memcg(objcg) : NULL;
}
+/*
+ * page_memcg - Get the memory cgroup associated with a page.
+ * @page: Pointer to the page.
+ *
+ * See the comments in folio_memcg().
+ */
static inline struct mem_cgroup *page_memcg(struct page *page)
{
return folio_memcg(page_folio(page));
}
-/**
+/*
+ * get_mem_cgroup_from_folio - Obtain a reference on the memory cgroup
+ * associated with a folio.
+ * @folio: Pointer to the folio.
+ *
+ * Returns a pointer to the memory cgroup (and obtain a reference on it)
+ * associated with the folio, or NULL. This function assumes that the
+ * folio is known to have a proper memory cgroup pointer. It's not safe
+ * to call this function against some type of pages, e.g. slab pages or
+ * ex-slab pages.
+ *
+ * The page and objcg or memcg binding rules can refer to folio_memcg().
+ */
+static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+retry:
+ memcg = folio_memcg(folio);
+ if (unlikely(memcg && !css_tryget(&memcg->css)))
+ goto retry;
+ rcu_read_unlock();
+
+ return memcg;
+}
+
+static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
+{
+ return get_mem_cgroup_from_folio(page_folio(page));
+}
+
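A small usage sketch (illustrative): the returned memcg carries a css reference, so the caller drops it when done, e.g. with mem_cgroup_put():

    #include <linux/memcontrol.h>

    static void example_inspect_folio_memcg(struct folio *folio)
    {
            struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);

            if (!memcg)
                    return;
            /* ...use memcg while the reference is held... */
            mem_cgroup_put(memcg);
    }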
+/*
* folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
* @folio: Pointer to the folio.
*
@@ -470,22 +513,20 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
*
* Return: A pointer to the memory cgroup associated with the folio,
* or NULL.
+ *
+ * The folio and objcg or memcg binding rules can refer to folio_memcg().
*/
static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
{
unsigned long memcg_data = READ_ONCE(folio->memcg_data);
+ struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
WARN_ON_ONCE(!rcu_read_lock_held());
- if (memcg_data & MEMCG_DATA_KMEM) {
- struct obj_cgroup *objcg;
+ objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
- objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
- return obj_cgroup_memcg(objcg);
- }
-
- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ return objcg ? obj_cgroup_memcg(objcg) : NULL;
}
/*
@@ -498,16 +539,10 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
* has an associated memory cgroup pointer or an object cgroups vector or
* an object cgroup.
*
- * For a non-kmem page any of the following ensures page and memcg binding
- * stability:
+ * The page and objcg or memcg binding rules can refer to page_memcg().
*
- * - the page lock
- * - LRU isolation
- * - lock_page_memcg()
- * - exclusive reference
- *
- * For a kmem page a caller should hold an rcu read lock to protect memcg
- * associated with a kmem page from being released.
+ * A caller should hold an rcu read lock to protect memcg associated with a
+ * page from being released.
*/
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
@@ -516,18 +551,14 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
* for slab pages, READ_ONCE() should be used here.
*/
unsigned long memcg_data = READ_ONCE(page->memcg_data);
+ struct obj_cgroup *objcg;
if (memcg_data & MEMCG_DATA_OBJCGS)
return NULL;
- if (memcg_data & MEMCG_DATA_KMEM) {
- struct obj_cgroup *objcg;
-
- objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
- return obj_cgroup_memcg(objcg);
- }
+ objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ return objcg ? obj_cgroup_memcg(objcg) : NULL;
}
static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
@@ -758,7 +789,9 @@ out:
* folio_lruvec - return lruvec for isolating/putting an LRU folio
* @folio: Pointer to the folio.
*
- * This function relies on folio->mem_cgroup being stable.
+ * The lruvec can be changed to its parent lruvec when the page is reparented.
+ * The caller needs to recheck if it cares about this change (just like
+ * folio_lruvec_lock() does).
*/
static inline struct lruvec *folio_lruvec(struct folio *folio)
{
@@ -777,15 +810,6 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
unsigned long *flags);
-#ifdef CONFIG_DEBUG_VM
-void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio);
-#else
-static inline
-void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
-{
-}
-#endif
-
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -837,6 +861,15 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
}
struct mem_cgroup *mem_cgroup_from_id(unsigned short id);
+#ifdef CONFIG_SHRINKER_DEBUG
+static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
+{
+ return memcg ? cgroup_ino(memcg->css.cgroup) : 0;
+}
+
+struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino);
+#endif
+
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
return mem_cgroup_from_css(seq_css(m));
@@ -857,8 +890,7 @@ static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
* parent_mem_cgroup - find the accounting parent of a memcg
* @memcg: memcg whose parent to find
*
- * Returns the parent memcg, or NULL if this is the root or the memory
- * controller is in legacy no-hierarchy mode.
+ * Returns the parent memcg, or NULL if this is the root.
*/
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
@@ -887,7 +919,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
return match;
}
-struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page);
ino_t page_cgroup_ino(struct page *page);
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
@@ -1057,19 +1089,25 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
static inline void count_memcg_page_event(struct page *page,
enum vm_event_item idx)
{
- struct mem_cgroup *memcg = page_memcg(page);
+ struct mem_cgroup *memcg;
+ rcu_read_lock();
+ memcg = page_memcg(page);
if (memcg)
count_memcg_events(memcg, idx, 1);
+ rcu_read_unlock();
}
static inline void count_memcg_folio_events(struct folio *folio,
enum vm_event_item idx, unsigned long nr)
{
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct mem_cgroup *memcg;
+ rcu_read_lock();
+ memcg = folio_memcg(folio);
if (memcg)
count_memcg_events(memcg, idx, nr);
+ rcu_read_unlock();
}
static inline void count_memcg_event_mm(struct mm_struct *mm,
@@ -1148,6 +1186,16 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
return NULL;
}
+static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
+{
+ return NULL;
+}
+
+static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
+{
+ return NULL;
+}
+
static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
{
WARN_ON_ONCE(!rcu_read_lock_held());
@@ -1252,11 +1300,6 @@ static inline struct lruvec *folio_lruvec(struct folio *folio)
return &pgdat->__lruvec;
}
-static inline
-void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
-{
-}
-
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
return NULL;
@@ -1343,6 +1386,18 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
return NULL;
}
+#ifdef CONFIG_SHRINKER_DEBUG
+static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
+static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
+{
+ return NULL;
+}
+#endif
+
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
return NULL;
@@ -1559,17 +1614,17 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
}
-static inline void unlock_page_lruvec(struct lruvec *lruvec)
+static inline void lruvec_unlock(struct lruvec *lruvec)
{
spin_unlock(&lruvec->lru_lock);
}
-static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
+static inline void lruvec_unlock_irq(struct lruvec *lruvec)
{
spin_unlock_irq(&lruvec->lru_lock);
}
-static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
+static inline void lruvec_unlock_irqrestore(struct lruvec *lruvec,
unsigned long flags)
{
spin_unlock_irqrestore(&lruvec->lru_lock, flags);
@@ -1591,7 +1646,7 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
if (folio_matches_lruvec(folio, locked_lruvec))
return locked_lruvec;
- unlock_page_lruvec_irq(locked_lruvec);
+ lruvec_unlock_irq(locked_lruvec);
}
return folio_lruvec_lock_irq(folio);
@@ -1605,7 +1660,7 @@ static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio,
if (folio_matches_lruvec(folio, locked_lruvec))
return locked_lruvec;
- unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
+ lruvec_unlock_irqrestore(locked_lruvec, *flags);
}
return folio_lruvec_lock_irqsave(folio, flags);
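The renamed lruvec_unlock_irqrestore() pairs with folio_lruvec_relock_irqsave() above in the batch-walk pattern that the reparenting comment on folio_lruvec() alludes to. A minimal sketch, assuming a caller-provided batch; walk_folios() and process_folio() are hypothetical:

/*
 * Take the lruvec lock lazily and re-take it only when a folio belongs
 * to a different lruvec than the previous one.
 */
static void walk_folios(struct folio **folios, unsigned int nr)
{
	struct lruvec *lruvec = NULL;
	unsigned long flags;
	unsigned int i;

	for (i = 0; i < nr; i++) {
		/* Relocks only when folio_matches_lruvec() fails. */
		lruvec = folio_lruvec_relock_irqsave(folios[i], lruvec, &flags);
		process_folio(lruvec, folios[i]);
	}
	if (lruvec)
		lruvec_unlock_irqrestore(lruvec, flags);
}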
@@ -1683,6 +1738,11 @@ int alloc_shrinker_info(struct mem_cgroup *memcg);
void free_shrinker_info(struct mem_cgroup *memcg);
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
void reparent_shrinker_deferred(struct mem_cgroup *memcg);
+
+static inline int shrinker_id(struct shrinker *shrinker)
+{
+ return shrinker->id;
+}
#else
#define mem_cgroup_sockets_enabled 0
static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
@@ -1696,6 +1756,11 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg,
int nid, int shrinker_id)
{
}
+
+static inline int shrinker_id(struct shrinker *shrinker)
+{
+ return -1;
+}
#endif
#ifdef CONFIG_MEMCG_KMEM
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 20d7edf62a6a..e0b2209ab71c 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -351,13 +351,4 @@ void arch_remove_linear_mapping(u64 start, u64 size);
extern bool mhp_supports_memmap_on_memory(unsigned long size);
#endif /* CONFIG_MEMORY_HOTPLUG */
-#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
-bool mhp_memmap_on_memory(void);
-#else
-static inline bool mhp_memmap_on_memory(void)
-{
- return false;
-}
-#endif
-
#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 9f5ee49482de..2e20b20a0543 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -41,6 +41,13 @@ struct vmem_altmap {
* A more complete discussion of unaddressable memory may be found in
* include/linux/hmm.h and Documentation/mm/hmm.rst.
*
+ * MEMORY_DEVICE_COHERENT:
+ * Device memory that is cache coherent from the device's and the CPU's point
+ * of view. This is used on platforms that have an advanced system bus (like
+ * CAPI or CXL). A driver can hotplug the device memory using ZONE_DEVICE with
+ * this memory type. Any page of a process can be migrated to such memory.
+ * However, no one should be allowed to pin such memory so that it can always
+ * be evicted.
+ *
* MEMORY_DEVICE_FS_DAX:
* Host memory that has similar access semantics as System RAM i.e. DMA
* coherent and supports page pinning. In support of coordinating page
@@ -61,6 +68,7 @@ struct vmem_altmap {
enum memory_type {
/* 0 is reserved to catch uninitialized type fields */
MEMORY_DEVICE_PRIVATE = 1,
+ MEMORY_DEVICE_COHERENT,
MEMORY_DEVICE_FS_DAX,
MEMORY_DEVICE_GENERIC,
MEMORY_DEVICE_PCI_P2PDMA,
@@ -79,6 +87,18 @@ struct dev_pagemap_ops {
* the page back to a CPU accessible page.
*/
vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
+
+ /*
+ * Handle a memory failure that happens on a range of pfns. Notify the
+ * processes that are using these pfns, and try to recover the data on
+ * them if necessary. mf_flags is passed through the whole notification
+ * chain to the recovery function.
+ *
+ * When this is not implemented, or it returns -EOPNOTSUPP, the caller
+ * will fall back to a common handler called mf_generic_kill_procs().
+ */
+ int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn,
+ unsigned long nr_pages, int mf_flags);
};
#define PGMAP_ALTMAP_VALID (1 << 0)
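To show how the MEMORY_DEVICE_COHERENT type and the new memory_failure() callback fit together, here is a hedged driver-side sketch. Everything prefixed mydev_ is hypothetical, and the pgmap range setup is assumed to happen elsewhere:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/memremap.h>

/* Hypothetical per-device state; only the pgmap member matters here. */
struct mydev {
	struct dev_pagemap pgmap;
};

static void mydev_page_free(struct page *page)
{
	/* Hand the device page back to the driver's own allocator. */
}

static int mydev_memory_failure(struct dev_pagemap *pgmap, unsigned long pfn,
				unsigned long nr_pages, int mf_flags)
{
	/*
	 * Notify users of the poisoned pfns here, or return -EOPNOTSUPP to
	 * make the core fall back to mf_generic_kill_procs().
	 */
	return -EOPNOTSUPP;
}

static const struct dev_pagemap_ops mydev_pgmap_ops = {
	.page_free	= mydev_page_free,
	.memory_failure	= mydev_memory_failure,
};

static int mydev_add_coherent_memory(struct device *dev, struct mydev *mydev)
{
	void *addr;

	/* Assumes mydev->pgmap.range and .nr_range were filled in already. */
	mydev->pgmap.type = MEMORY_DEVICE_COHERENT;
	mydev->pgmap.ops = &mydev_pgmap_ops;

	addr = devm_memremap_pages(dev, &mydev->pgmap);
	return PTR_ERR_OR_ZERO(addr);
}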
@@ -143,6 +163,17 @@ static inline bool folio_is_device_private(const struct folio *folio)
return is_device_private_page(&folio->page);
}
+static inline bool is_device_coherent_page(const struct page *page)
+{
+ return is_zone_device_page(page) &&
+ page->pgmap->type == MEMORY_DEVICE_COHERENT;
+}
+
+static inline bool folio_is_device_coherent(const struct folio *folio)
+{
+ return is_device_coherent_page(&folio->page);
+}
+
static inline bool is_pci_p2pdma_page(const struct page *page)
{
return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index ae5bb67a9ba1..22c0a0cf5e0c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -182,6 +182,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
enum migrate_vma_direction {
MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
+ MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
};
struct migrate_vma {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf3d0d673f6b..cf746d37e504 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -601,7 +601,7 @@ struct vm_operations_struct {
#endif
/*
* Called by vm_normal_page() for special PTEs to find the
- * page for @addr. This is useful if the default behavior
+ * page for @addr. This is useful if the default behavior
* (using pte_page()) would not find the correct page.
*/
struct page *(*find_special_page)(struct vm_area_struct *vma,
@@ -658,6 +658,37 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
return vma->vm_flags & VM_ACCESS_FLAGS;
}
+static inline
+struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
+{
+ return mas_find(&vmi->mas, max);
+}
+
+static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
+{
+ /*
+ * Uses vma_find() to get the first VMA when the iterator starts.
+ * Calling mas_next() could skip the first entry.
+ */
+ return vma_find(vmi, ULONG_MAX);
+}
+
+static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
+{
+ return mas_prev(&vmi->mas, 0);
+}
+
+static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
+{
+ return vmi->mas.index;
+}
+
+#define for_each_vma(vmi, vma) while ((vma = vma_next(&(vmi))) != NULL)
+
+/* The MM code likes to work with exclusive end addresses */
+#define for_each_vma_range(vmi, vma, end) \
+ while ((vma = vma_find(&(vmi), (end) - 1)) != NULL)
+
#ifdef CONFIG_SHMEM
/*
* The vma_is_shmem is not inline because it is used only by slow
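Since the iterator above replaces the old vm_next list walk, a short usage sketch may help. dump_vmas() is a hypothetical helper, but VMA_ITERATOR() and for_each_vma() are exactly the interfaces added here:

#include <linux/mm.h>

/* Print every VMA of an mm while holding the mmap read lock. */
static void dump_vmas(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_read_lock(mm);
	for_each_vma(vmi, vma)
		pr_info("vma %#lx-%#lx flags %#lx\n",
			vma->vm_start, vma->vm_end, vma->vm_flags);
	mmap_read_unlock(mm);
}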
@@ -855,7 +886,7 @@ static inline struct folio *virt_to_folio(const void *x)
return page_folio(page);
}
-void __put_page(struct page *page);
+void __folio_put(struct folio *folio);
void put_pages_list(struct list_head *pages);
@@ -892,11 +923,7 @@ static inline void set_compound_page_dtor(struct page *page,
page[1].compound_dtor = compound_dtor;
}
-static inline void destroy_compound_page(struct page *page)
-{
- VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
- compound_page_dtors[page[1].compound_dtor](page);
-}
+void destroy_large_folio(struct folio *folio);
static inline int head_compound_pincount(struct page *head)
{
@@ -1197,7 +1224,7 @@ static inline __must_check bool try_get_page(struct page *page)
static inline void folio_put(struct folio *folio)
{
if (folio_put_testzero(folio))
- __put_page(&folio->page);
+ __folio_put(folio);
}
/**
@@ -1217,7 +1244,26 @@ static inline void folio_put(struct folio *folio)
static inline void folio_put_refs(struct folio *folio, int refs)
{
if (folio_ref_sub_and_test(folio, refs))
- __put_page(&folio->page);
+ __folio_put(folio);
+}
+
+void release_pages(struct page **pages, int nr);
+
+/**
+ * folios_put - Decrement the reference count on an array of folios.
+ * @folios: The folios.
+ * @nr: How many folios there are.
+ *
+ * Like folio_put(), but for an array of folios. This is more efficient
+ * than writing the loop yourself as it will optimise the locks which
+ * need to be taken if the folios are freed.
+ *
+ * Context: May be called in process or interrupt context, but not in NMI
+ * context. May be called while holding a spinlock.
+ */
+static inline void folios_put(struct folio **folios, unsigned int nr)
+{
+ release_pages((struct page **)folios, nr);
}
static inline void put_page(struct page *page)
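A tiny hedged example of the batched drop that folios_put() enables; drop_two_folios() is illustrative only:

static void drop_two_folios(struct folio *a, struct folio *b)
{
	struct folio *batch[] = { a, b };

	/* One call instead of two folio_put()s; any freeing is batched. */
	folios_put(batch, ARRAY_SIZE(batch));
}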
@@ -1842,8 +1888,9 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
void zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
-void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
- unsigned long start, unsigned long end);
+void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
+ struct vm_area_struct *start_vma, unsigned long start,
+ unsigned long end);
struct mmu_notifier_range;
@@ -1962,8 +2009,12 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
* for now all the callers are only use one of the flags at the same
* time.
*/
-/* Whether we should allow dirty bit accounting */
-#define MM_CP_DIRTY_ACCT (1UL << 0)
+/*
+ * Whether we should manually check if we can map individual PTEs writable,
+ * because something (e.g., COW, uffd-wp) blocks that from happening for all
+ * PTEs automatically in a writable mapping.
+ */
+#define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0)
/* Whether this protection change is for NUMA hints */
#define MM_CP_PROT_NUMA (1UL << 1)
/* Whether this change is for write protecting */
@@ -2636,14 +2687,15 @@ extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
extern int split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
-extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
- struct rb_node **, struct rb_node *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
unsigned long addr, unsigned long len, pgoff_t pgoff,
bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);
+void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas);
+void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas);
+
static inline int check_data_rlimit(unsigned long rlim,
unsigned long new,
unsigned long start,
@@ -2691,8 +2743,9 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr,
extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
unsigned long pgoff, unsigned long *populate, struct list_head *uf);
-extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
- struct list_head *uf, bool downgrade);
+extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
+ unsigned long start, size_t len, struct list_head *uf,
+ bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
@@ -2759,26 +2812,12 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
struct vm_area_struct **pprev);
-/**
- * find_vma_intersection() - Look up the first VMA which intersects the interval
- * @mm: The process address space.
- * @start_addr: The inclusive start user address.
- * @end_addr: The exclusive end user address.
- *
- * Returns: The first VMA within the provided range, %NULL otherwise. Assumes
- * start_addr < end_addr.
+/*
+ * Look up the first VMA which intersects the interval [start_addr, end_addr),
+ * or NULL if none is found. Assumes start_addr < end_addr.
*/
-static inline
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
- unsigned long start_addr,
- unsigned long end_addr)
-{
- struct vm_area_struct *vma = find_vma(mm, start_addr);
-
- if (vma && end_addr <= vma->vm_start)
- vma = NULL;
- return vma;
-}
+ unsigned long start_addr, unsigned long end_addr);
/**
* vma_lookup() - Find a VMA at a specific address
@@ -2790,12 +2829,7 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
static inline
struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma = find_vma(mm, addr);
-
- if (vma && addr < vma->vm_start)
- vma = NULL;
-
- return vma;
+ return mtree_load(&mm->mm_mt, addr);
}
static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
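vma_lookup() keeps its exact-containment semantics (unlike find_vma(), which may return the next VMA above the address) while now reading straight from the maple tree. A hedged sketch; addr_is_mapped() is a hypothetical helper:

static bool addr_is_mapped(struct mm_struct *mm, unsigned long addr)
{
	bool mapped;

	mmap_read_lock(mm);
	mapped = vma_lookup(mm, addr) != NULL;
	mmap_read_unlock(mm);

	return mapped;
}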
@@ -2831,7 +2865,7 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
unsigned long vm_start, unsigned long vm_end)
{
- struct vm_area_struct *vma = find_vma(mm, vm_start);
+ struct vm_area_struct *vma = vma_lookup(mm, vm_start);
if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
vma = NULL;
@@ -2934,6 +2968,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
+#define FOLL_LRU 0x1000 /* return only LRU (anon or page cache) */
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
#define FOLL_COW 0x4000 /* internal GUP flag */
#define FOLL_ANON 0x8000 /* don't do file mappings */
@@ -3234,6 +3269,8 @@ enum mf_flags {
MF_UNPOISON = 1 << 4,
MF_SW_SIMULATED = 1 << 5,
};
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+ unsigned long count, int mf_flags);
extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 7b25b53c474a..6585198b19e2 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -99,6 +99,8 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
+
update_lru_size(lruvec, lru, folio_zonenum(folio),
folio_nr_pages(folio));
if (lru != LRU_UNEVICTABLE)
@@ -116,6 +118,8 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
+
update_lru_size(lruvec, lru, folio_zonenum(folio),
folio_nr_pages(folio));
/* This is not expected to be used on LRU_UNEVICTABLE */
@@ -133,6 +137,8 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
+
if (lru != LRU_UNEVICTABLE)
list_del(&folio->lru);
update_lru_size(lruvec, lru, folio_zonenum(folio),
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6b961a29bf26..acbd8d03e01e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -9,6 +9,7 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
+#include <linux/maple_tree.h>
#include <linux/rwsem.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
@@ -87,6 +88,7 @@ struct page {
*/
union {
struct list_head lru;
+
/* Or, for the Unevictable "LRU list" slot */
struct {
/* Always even, to negate PageTail */
@@ -94,6 +96,10 @@ struct page {
/* Count page's or folio's mlocks */
unsigned int mlock_count;
};
+
+ /* Or, free page */
+ struct list_head buddy_list;
+ struct list_head pcp_list;
};
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
@@ -402,21 +408,6 @@ struct vm_area_struct {
unsigned long vm_end; /* The first byte after our end address
within vm_mm. */
- /* linked list of VM areas per task, sorted by address */
- struct vm_area_struct *vm_next, *vm_prev;
-
- struct rb_node vm_rb;
-
- /*
- * Largest free memory gap in bytes to the left of this VMA.
- * Either between this VMA and vma->vm_prev, or between one of the
- * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
- * get_unmapped_area find a free area of the right size.
- */
- unsigned long rb_subtree_gap;
-
- /* Second cache line starts here. */
-
struct mm_struct *vm_mm; /* The address space we belong to. */
/*
@@ -480,9 +471,7 @@ struct vm_area_struct {
struct kioctx_table;
struct mm_struct {
struct {
- struct vm_area_struct *mmap; /* list of VMAs */
- struct rb_root mm_rb;
- u64 vmacache_seqnum; /* per-thread vmacache */
+ struct maple_tree mm_mt;
#ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
@@ -496,7 +485,6 @@ struct mm_struct {
unsigned long mmap_compat_legacy_base;
#endif
unsigned long task_size; /* size of task vm space */
- unsigned long highest_vm_end; /* highest vma end address */
pgd_t * pgd;
#ifdef CONFIG_MEMBARRIER
@@ -676,6 +664,7 @@ struct mm_struct {
unsigned long cpu_bitmap[];
};
+#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN)
extern struct mm_struct init_mm;
/* Pointer magic because the dynamic array size confuses some compilers. */
@@ -693,6 +682,27 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
return (struct cpumask *)&mm->cpu_bitmap;
}
+struct vma_iterator {
+ struct ma_state mas;
+};
+
+#define VMA_ITERATOR(name, mm, addr) \
+ struct vma_iterator name = { \
+ .mas = { \
+ .tree = &mm->mm_mt, \
+ .index = addr, \
+ .node = MAS_START, \
+ }, \
+ }
+
+static inline void vma_iter_init(struct vma_iterator *vmi,
+ struct mm_struct *mm, unsigned long addr)
+{
+ vmi->mas.tree = &mm->mm_mt;
+ vmi->mas.index = addr;
+ vmi->mas.node = MAS_START;
+}
+
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index c1bc6731125c..0bb4b6da9993 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -25,18 +25,6 @@
#define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8)
/*
- * The per task VMA cache array:
- */
-#define VMACACHE_BITS 2
-#define VMACACHE_SIZE (1U << VMACACHE_BITS)
-#define VMACACHE_MASK (VMACACHE_SIZE - 1)
-
-struct vmacache {
- u64 seqnum;
- struct vm_area_struct *vmas[VMACACHE_SIZE];
-};
-
-/*
* When updating this, please also update struct resident_page_types[] in
* kernel/fork.c
*/
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index aab70355d64f..ed3ebe5e9e50 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -355,15 +355,18 @@ enum zone_watermarks {
};
/*
- * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional
- * for pageblock size for THP if configured.
+ * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list
+ * for THP, which will usually be GFP_MOVABLE. Even if it is another type,
+ * it should not contribute to serious fragmentation that causes THP
+ * allocation failures.
*/
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define NR_PCP_THP 1
#else
#define NR_PCP_THP 0
#endif
-#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP))
+#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
+#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)
/*
* Shift to encode migratetype and order in the same integer, with order
@@ -379,6 +382,7 @@ enum zone_watermarks {
/* Fields and list protected by pagesets local_lock in page_alloc.c */
struct per_cpu_pages {
+ spinlock_t lock; /* Protects lists field */
int count; /* number of pages in the list */
int high; /* high watermark, emptying needed */
int batch; /* chunk size for buddy add/remove */
@@ -389,7 +393,7 @@ struct per_cpu_pages {
/* Lists of pages, one per migrate type stored on the pcp-lists */
struct list_head lists[NR_PCP_LISTS];
-};
+} ____cacheline_aligned_in_smp;
struct per_cpu_zonestat {
#ifdef CONFIG_SMP
@@ -591,8 +595,8 @@ struct zone {
* give them a chance of being in the same cacheline.
*
* Write access to present_pages at runtime should be protected by
- * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
- * present_pages should get_online_mems() to get a stable value.
+ * mem_hotplug_begin/done(). Any reader who can't tolerate drift of
+ * present_pages should use get_online_mems() to get a stable value.
*/
atomic_long_t managed_pages;
unsigned long spanned_pages;
@@ -870,7 +874,7 @@ typedef struct pglist_data {
unsigned long nr_reclaim_start; /* nr pages written while throttled
* when throttling started. */
struct task_struct *kswapd; /* Protected by
- mem_hotplug_begin/end() */
+ mem_hotplug_begin/done() */
int kswapd_order;
enum zone_type kswapd_highest_zoneidx;
@@ -1418,16 +1422,32 @@ extern size_t mem_section_usage_size(void);
* (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
* worst combination is powerpc with 256k pages,
* which results in PFN_SECTION_SHIFT equal 6.
- * To sum it up, at least 6 bits are available.
+ * To sum it up, at least 6 bits are available on all architectures.
+ * However, architectures other than powerpc can exceed 6 bits (e.g. 15 bits
+ * are available on x86_64, and 13 bits in the worst case of 64K pages on
+ * arm64), as long as any extra bit is not used on powerpc.
*/
-#define SECTION_MARKED_PRESENT (1UL<<0)
-#define SECTION_HAS_MEM_MAP (1UL<<1)
-#define SECTION_IS_ONLINE (1UL<<2)
-#define SECTION_IS_EARLY (1UL<<3)
-#define SECTION_TAINT_ZONE_DEVICE (1UL<<4)
-#define SECTION_MAP_LAST_BIT (1UL<<5)
-#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
-#define SECTION_NID_SHIFT 6
+enum {
+ SECTION_MARKED_PRESENT_BIT,
+ SECTION_HAS_MEM_MAP_BIT,
+ SECTION_IS_ONLINE_BIT,
+ SECTION_IS_EARLY_BIT,
+#ifdef CONFIG_ZONE_DEVICE
+ SECTION_TAINT_ZONE_DEVICE_BIT,
+#endif
+ SECTION_MAP_LAST_BIT,
+};
+
+#define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT)
+#define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT)
+#define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT)
+#define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT)
+#ifdef CONFIG_ZONE_DEVICE
+#define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT)
+#endif
+#define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1))
+#define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT
static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
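A sketch of how the nid packing works with the new bit layout, under the assumption that it mirrors sparse_encode_early_nid() in mm/sparse.c; section_encode_nid() is illustrative only:

/*
 * The node id lives in the bits above SECTION_MAP_LAST_BIT, so making
 * SECTION_TAINT_ZONE_DEVICE conditional on CONFIG_ZONE_DEVICE gives
 * !ZONE_DEVICE configurations one extra bit for the nid.
 */
static inline unsigned long section_encode_nid(int nid)
{
	return (unsigned long)nid << SECTION_NID_SHIFT;
}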
@@ -1466,12 +1486,19 @@ static inline int online_section(struct mem_section *section)
return (section && (section->section_mem_map & SECTION_IS_ONLINE));
}
+#ifdef CONFIG_ZONE_DEVICE
static inline int online_device_section(struct mem_section *section)
{
unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;
return section && ((section->section_mem_map & flags) == flags);
}
+#else
+static inline int online_device_section(struct mem_section *section)
+{
+ return 0;
+}
+#endif
static inline int online_section_nr(unsigned long nr)
{
diff --git a/include/linux/net.h b/include/linux/net.h
index a03485e8cbb2..711c3593c3b8 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -307,8 +307,6 @@ do { \
#define net_get_random_once(buf, nbytes) \
get_random_once((buf), (nbytes))
-#define net_get_random_once_wait(buf, nbytes) \
- get_random_once_wait((buf), (nbytes))
/*
* E.g. XFS meta- & log-data is in slab pages, or bcache meta
diff --git a/include/linux/once.h b/include/linux/once.h
index f54523052bbc..b14d8b309d52 100644
--- a/include/linux/once.h
+++ b/include/linux/once.h
@@ -54,7 +54,5 @@ void __do_once_done(bool *done, struct static_key_true *once_key,
#define get_random_once(buf, nbytes) \
DO_ONCE(get_random_bytes, (buf), (nbytes))
-#define get_random_once_wait(buf, nbytes) \
- DO_ONCE(get_random_bytes_wait, (buf), (nbytes)) \
#endif /* _LINUX_ONCE_H */
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 02d1e7bbd8cd..7d0c9c48a0c5 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -78,15 +78,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
}
/*
- * Use this helper if tsk->mm != mm and the victim mm needs a special
- * handling. This is guaranteed to stay true after once set.
- */
-static inline bool mm_is_oom_victim(struct mm_struct *mm)
-{
- return test_bit(MMF_OOM_VICTIM, &mm->flags);
-}
-
-/*
* Checks whether a page fault on the given mm is still reliable.
* This is no longer true if the oom reaper started to reap the
* address space which is reflected by MMF_UNSTABLE flag set in
@@ -106,8 +97,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
return 0;
}
-bool __oom_reap_task_mm(struct mm_struct *mm);
-
long oom_badness(struct task_struct *p,
unsigned long totalpages);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 3f5490f6f038..ea19528564d1 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -193,6 +193,11 @@ enum pageflags {
/* Only valid for buddy pages. Used to track pages that are reported */
PG_reported = PG_uptodate,
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /* For self-hosted memmap pages */
+ PG_vmemmap_self_hosted = PG_owner_priv_1,
+#endif
};
#define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
@@ -628,6 +633,12 @@ PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
*/
__PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
+#ifdef CONFIG_MEMORY_HOTPLUG
+PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY)
+#else
+PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
+#endif
+
/*
* On an anonymous page mapped into a user virtual memory area,
* page->mapping points to its anon_vma, not to a struct address_space;
@@ -650,6 +661,12 @@ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
#define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
+/*
+ * Unlike the flags above, this flag is used only in fsdax mode. It
+ * indicates that this page->mapping is currently in the reflink case.
+ */
+#define PAGE_MAPPING_DAX_COW 0x1
+
static __always_inline bool folio_mapping_flags(struct folio *folio)
{
return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0;
@@ -670,6 +687,12 @@ static __always_inline bool PageAnon(struct page *page)
return folio_test_anon(page_folio(page));
}
+static __always_inline bool __folio_test_movable(const struct folio *folio)
+{
+ return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) ==
+ PAGE_MAPPING_MOVABLE;
+}
+
static __always_inline int __PageMovable(struct page *page)
{
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2a67c0ad7348..7c4748a62841 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -345,8 +345,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping)
#endif
}
-void release_pages(struct page **pages, int nr);
-
struct address_space *page_mapping(struct page *);
struct address_space *folio_mapping(struct folio *);
struct address_space *swapcache_mapping(struct folio *);
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 6649154a2115..215eb6c3bdc9 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -26,7 +26,6 @@ struct pagevec {
};
void __pagevec_release(struct pagevec *pvec);
-void __pagevec_lru_add(struct pagevec *pvec);
unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *index, pgoff_t end,
xa_mark_t tag);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 9ec23138e410..bf80adca980b 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -325,8 +325,8 @@ struct page_vma_mapped_walk {
#define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags) \
struct page_vma_mapped_walk name = { \
.pfn = page_to_pfn(_page), \
- .nr_pages = compound_nr(page), \
- .pgoff = page_to_pgoff(page), \
+ .nr_pages = compound_nr(_page), \
+ .pgoff = page_to_pgoff(_page), \
.vma = _vma, \
.address = _address, \
.flags = _flags, \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 72242bc73d85..9776dee75048 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -860,7 +860,6 @@ struct task_struct {
struct mm_struct *active_mm;
/* Per-thread vma caching: */
- struct vmacache vmacache;
#ifdef SPLIT_RSS_COUNTING
struct task_rss_stat rss_stat;
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 4d0a5be28b70..8270ad7ae14c 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -71,9 +71,8 @@ static inline int get_dumpable(struct mm_struct *mm)
#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */
#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
-#define MMF_OOM_VICTIM 25 /* mm is the oom victim */
-#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */
-#define MMF_MULTIPROCESS 27 /* mm is shared between processes */
+#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */
+#define MMF_MULTIPROCESS 26 /* mm is shared between processes */
/*
* MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either
* replaced in the future by mm.pinned_vm when it becomes stable, or grow into
@@ -81,7 +80,7 @@ static inline int get_dumpable(struct mm_struct *mm)
* pinned pages were unpinned later on, we'll still keep this bit set for the
* lifecycle of this mm, just for simplicity.
*/
-#define MMF_HAS_PINNED 28 /* FOLL_PIN has run, never cleared */
+#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */
#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 76fbf92b04d9..64416f3e0a1f 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -73,6 +73,11 @@ struct shrinker {
/* ID in shrinker_idr */
int id;
#endif
+#ifdef CONFIG_SHRINKER_DEBUG
+ int debugfs_id;
+ const char *name;
+ struct dentry *debugfs_entry;
+#endif
/* objs pending delete, per node */
atomic_long_t *nr_deferred;
};
@@ -88,10 +93,30 @@ struct shrinker {
*/
#define SHRINKER_NONSLAB (1 << 3)
-extern int prealloc_shrinker(struct shrinker *shrinker);
+extern int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...);
extern void register_shrinker_prepared(struct shrinker *shrinker);
-extern int register_shrinker(struct shrinker *shrinker);
+extern int register_shrinker(struct shrinker *shrinker, const char *fmt, ...);
extern void unregister_shrinker(struct shrinker *shrinker);
extern void free_prealloced_shrinker(struct shrinker *shrinker);
extern void synchronize_shrinkers(void);
-#endif
+
+#ifdef CONFIG_SHRINKER_DEBUG
+extern int shrinker_debugfs_add(struct shrinker *shrinker);
+extern void shrinker_debugfs_remove(struct shrinker *shrinker);
+extern int shrinker_debugfs_rename(struct shrinker *shrinker,
+ const char *fmt, ...);
+#else /* CONFIG_SHRINKER_DEBUG */
+static inline int shrinker_debugfs_add(struct shrinker *shrinker)
+{
+ return 0;
+}
+static inline void shrinker_debugfs_remove(struct shrinker *shrinker)
+{
+}
+static inline int shrinker_debugfs_rename(struct shrinker *shrinker,
+ const char *fmt, ...)
+{
+ return 0;
+}
+#endif /* CONFIG_SHRINKER_DEBUG */
+#endif /* _LINUX_SHRINKER_H */
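With this change every shrinker registration supplies a printf-style name, which also names the debugfs directory when CONFIG_SHRINKER_DEBUG is enabled. A hedged sketch of a caller adapting to the new signature; my_shrinker, my_count and my_scan are hypothetical:

#include <linux/shrinker.h>

static unsigned long my_count(struct shrinker *s, struct shrink_control *sc)
{
	return 0;	/* nothing to reclaim in this sketch */
}

static unsigned long my_scan(struct shrinker *s, struct shrink_control *sc)
{
	return SHRINK_STOP;
}

static struct shrinker my_shrinker = {
	.count_objects	= my_count,
	.scan_objects	= my_scan,
	.seeks		= DEFAULT_SEEKS,
};

static int __init my_cache_init(void)
{
	/* The name may contain format specifiers, e.g. an instance id. */
	return register_shrinker(&my_shrinker, "my-cache");
}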
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 33c8b0a4c401..4e84f8b55309 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -456,6 +456,7 @@ static inline unsigned long total_swapcache_pages(void)
return global_node_page_state(NR_SWAPCACHE);
}
+extern void free_swap_cache(struct page *page);
extern void free_page_and_swap_cache(struct page *);
extern void free_pages_and_swap_cache(struct page **, int);
/* linux/mm/swapfile.c */
@@ -540,6 +541,10 @@ static inline void put_swap_device(struct swap_info_struct *si)
/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
#define free_swap_and_cache(e) is_pfn_swap_entry(e)
+static inline void free_swap_cache(struct page *page)
+{
+}
+
static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
return 0;
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index f24775b41880..bb7afd03a324 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -244,8 +244,10 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
spinlock_t *ptl);
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address);
-extern void migration_entry_wait_huge(struct vm_area_struct *vma,
- struct mm_struct *mm, pte_t *pte);
+#ifdef CONFIG_HUGETLB_PAGE
+extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl);
+extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
+#endif
#else
static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
{
@@ -271,8 +273,10 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
spinlock_t *ptl) { }
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address) { }
-static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
- struct mm_struct *mm, pte_t *pte) { }
+#ifdef CONFIG_HUGETLB_PAGE
+static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { }
+static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { }
+#endif
static inline int is_writable_migration_entry(swp_entry_t entry)
{
return 0;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 732b522bacb7..eee374c29c85 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -173,9 +173,8 @@ extern bool userfaultfd_remove(struct vm_area_struct *vma,
unsigned long start,
unsigned long end);
-extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct list_head *uf);
+extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *uf);
extern void userfaultfd_unmap_complete(struct mm_struct *mm,
struct list_head *uf);
@@ -256,7 +255,7 @@ static inline bool userfaultfd_remove(struct vm_area_struct *vma,
return true;
}
-static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
+static inline int userfaultfd_unmap_prep(struct mm_struct *mm,
unsigned long start, unsigned long end,
struct list_head *uf)
{
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 404024486fa5..1ce8fadb2b1c 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -122,10 +122,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
NR_TLB_LOCAL_FLUSH_ALL,
NR_TLB_LOCAL_FLUSH_ONE,
#endif /* CONFIG_DEBUG_TLBFLUSH */
-#ifdef CONFIG_DEBUG_VM_VMACACHE
- VMACACHE_FIND_CALLS,
- VMACACHE_FIND_HITS,
-#endif
#ifdef CONFIG_SWAP
SWAP_RA,
SWAP_RA_HIT,
diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h
deleted file mode 100644
index 6fce268a4588..000000000000
--- a/include/linux/vmacache.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __LINUX_VMACACHE_H
-#define __LINUX_VMACACHE_H
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-
-static inline void vmacache_flush(struct task_struct *tsk)
-{
- memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas));
-}
-
-extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma);
-extern struct vm_area_struct *vmacache_find(struct mm_struct *mm,
- unsigned long addr);
-
-#ifndef CONFIG_MMU
-extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
- unsigned long start,
- unsigned long end);
-#endif
-
-static inline void vmacache_invalidate(struct mm_struct *mm)
-{
- mm->vmacache_seqnum++;
-}
-
-#endif /* __LINUX_VMACACHE_H */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index bfe38869498d..19cf5b6892ce 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -125,12 +125,6 @@ static inline void vm_events_fold_cpu(int cpu)
#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
#endif
-#ifdef CONFIG_DEBUG_VM_VMACACHE
-#define count_vm_vmacache_event(x) count_vm_event(x)
-#else
-#define count_vm_vmacache_event(x) do {} while (0)
-#endif
-
#define __count_zid_vm_events(item, zid, delta) \
__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)