Diffstat (limited to 'mm/memcontrol.c'):
 mm/memcontrol.c | 592
 1 file changed, 401 insertions(+), 191 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 655c09393ad5..a04903fd733b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -77,6 +77,7 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
+static struct obj_cgroup *root_obj_cgroup __read_mostly;
/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
@@ -252,9 +253,14 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
return container_of(vmpr, struct mem_cgroup, vmpressure);
}
-#ifdef CONFIG_MEMCG_KMEM
static DEFINE_SPINLOCK(objcg_lock);
+static inline bool obj_cgroup_is_root(struct obj_cgroup *objcg)
+{
+ return objcg == root_obj_cgroup;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
bool mem_cgroup_kmem_disabled(void)
{
return cgroup_memory_nokmem;
@@ -263,12 +269,10 @@ bool mem_cgroup_kmem_disabled(void)
static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
unsigned int nr_pages);
-static void obj_cgroup_release(struct percpu_ref *ref)
+static void obj_cgroup_release_bytes(struct obj_cgroup *objcg)
{
- struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
unsigned int nr_bytes;
unsigned int nr_pages;
- unsigned long flags;
/*
* At this point all allocated objects are freed, and
@@ -282,9 +286,9 @@ static void obj_cgroup_release(struct percpu_ref *ref)
* 3) CPU1: a process from another memcg is allocating something,
* the stock is flushed,
* objcg->nr_charged_bytes = PAGE_SIZE - 92
- * 5) CPU0: we do release this object,
+ * 4) CPU0: we do release this object,
* 92 bytes are added to stock->nr_bytes
- * 6) CPU0: stock is flushed,
+ * 5) CPU0: stock is flushed,
* 92 bytes are added to objcg->nr_charged_bytes
*
* In the result, nr_charged_bytes == PAGE_SIZE.
@@ -296,6 +300,19 @@ static void obj_cgroup_release(struct percpu_ref *ref)
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
+}
+#else
+static inline void obj_cgroup_release_bytes(struct obj_cgroup *objcg)
+{
+}
+#endif
+
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+ struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+ unsigned long flags;
+
+ obj_cgroup_release_bytes(objcg);
spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
@@ -324,28 +341,135 @@ static struct obj_cgroup *obj_cgroup_alloc(void)
return objcg;
}
-static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
- struct mem_cgroup *parent)
+static void objcg_reparent_lock(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+ spin_lock(&objcg_lock);
+}
+
+static void objcg_reparent_relocate(struct mem_cgroup *src, struct mem_cgroup *dst)
{
struct obj_cgroup *objcg, *iter;
- objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+ objcg = rcu_replace_pointer(src->objcg, NULL, true);
+ /* 1) Ready to reparent active objcg. */
+ list_add(&objcg->list, &src->objcg_list);
+ /* 2) Reparent active objcg and already reparented objcgs to dst. */
+ list_for_each_entry(iter, &src->objcg_list, list)
+ WRITE_ONCE(iter->memcg, dst);
+ /* 3) Move already reparented objcgs to the dst's list */
+ list_splice(&src->objcg_list, &dst->objcg_list);
+}
- spin_lock_irq(&objcg_lock);
+static void objcg_reparent_unlock(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+ spin_unlock(&objcg_lock);
+}
- /* 1) Ready to reparent active objcg. */
- list_add(&objcg->list, &memcg->objcg_list);
- /* 2) Reparent active objcg and already reparented objcgs to parent. */
- list_for_each_entry(iter, &memcg->objcg_list, list)
- WRITE_ONCE(iter->memcg, parent);
- /* 3) Move already reparented objcgs to the parent's list */
- list_splice(&memcg->objcg_list, &parent->objcg_list);
+static DEFINE_MEMCG_REPARENT_OPS(objcg);
+
+static void lruvec_reparent_lock(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+ int nid, nest = 0;
+
+ for_each_node(nid) {
+ spin_lock_nested(&mem_cgroup_lruvec(src,
+ NODE_DATA(nid))->lru_lock, nest++);
+ spin_lock_nested(&mem_cgroup_lruvec(dst,
+ NODE_DATA(nid))->lru_lock, nest++);
+ }
+}
+
+static void lruvec_reparent_lru(struct lruvec *src, struct lruvec *dst,
+ enum lru_list lru)
+{
+ int zid;
+ struct mem_cgroup_per_node *mz_src, *mz_dst;
+
+ mz_src = container_of(src, struct mem_cgroup_per_node, lruvec);
+ mz_dst = container_of(dst, struct mem_cgroup_per_node, lruvec);
- spin_unlock_irq(&objcg_lock);
+ if (lru != LRU_UNEVICTABLE)
+ list_splice_tail_init(&src->lists[lru], &dst->lists[lru]);
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz_dst->lru_zone_size[zid][lru] += mz_src->lru_zone_size[zid][lru];
+ mz_src->lru_zone_size[zid][lru] = 0;
+ }
+}
+
+static void lruvec_reparent_relocate(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+ int nid;
+
+ for_each_node(nid) {
+ enum lru_list lru;
+ struct lruvec *src_lruvec, *dst_lruvec;
+
+ src_lruvec = mem_cgroup_lruvec(src, NODE_DATA(nid));
+ dst_lruvec = mem_cgroup_lruvec(dst, NODE_DATA(nid));
+
+ dst_lruvec->anon_cost += src_lruvec->anon_cost;
+ dst_lruvec->file_cost += src_lruvec->file_cost;
+
+ for_each_lru(lru)
+ lruvec_reparent_lru(src_lruvec, dst_lruvec, lru);
+ }
+}
+
+static void lruvec_reparent_unlock(struct mem_cgroup *src, struct mem_cgroup *dst)
+{
+ int nid;
+
+ for_each_node(nid) {
+ spin_unlock(&mem_cgroup_lruvec(dst, NODE_DATA(nid))->lru_lock);
+ spin_unlock(&mem_cgroup_lruvec(src, NODE_DATA(nid))->lru_lock);
+ }
+}
+
+static DEFINE_MEMCG_REPARENT_OPS(lruvec);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+DECLARE_MEMCG_REPARENT_OPS(thp_sq);
+#endif
+
+/* The lock order depends on the order of elements in this array. */
+static const struct memcg_reparent_ops *memcg_reparent_ops[] = {
+ &memcg_objcg_reparent_ops,
+ &memcg_lruvec_reparent_ops,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ &memcg_thp_sq_reparent_ops,
+#endif
+};
+
+#define DEFINE_MEMCG_REPARENT_FUNC(phase) \
+ static void memcg_reparent_##phase(struct mem_cgroup *src, \
+ struct mem_cgroup *dst) \
+ { \
+ int i; \
+ \
+ for (i = 0; i < ARRAY_SIZE(memcg_reparent_ops); i++) \
+ memcg_reparent_ops[i]->phase(src, dst); \
+ }
+
+DEFINE_MEMCG_REPARENT_FUNC(lock)
+DEFINE_MEMCG_REPARENT_FUNC(relocate)
+DEFINE_MEMCG_REPARENT_FUNC(unlock)
+
+static void memcg_reparent_objcgs(struct mem_cgroup *src)
+{
+ struct mem_cgroup *dst = parent_mem_cgroup(src);
+ struct obj_cgroup *objcg = rcu_dereference_protected(src->objcg, true);
+
+ local_irq_disable();
+ memcg_reparent_lock(src, dst);
+ memcg_reparent_relocate(src, dst);
+ memcg_reparent_unlock(src, dst);
+ local_irq_enable();
percpu_ref_kill(&objcg->refcnt);
}
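The reparent ops table above depends on DECLARE_MEMCG_REPARENT_OPS()/DEFINE_MEMCG_REPARENT_OPS(), which are not part of the hunks shown here (they presumably live in a header). A minimal reconstruction of what they would have to expand to, inferred only from how memcg_objcg_reparent_ops and the *_reparent_lock/relocate/unlock callbacks are used in this file, might look like:

    /* Hypothetical reconstruction -- the real definitions are outside this diff. */
    struct memcg_reparent_ops {
            void (*lock)(struct mem_cgroup *src, struct mem_cgroup *dst);
            void (*relocate)(struct mem_cgroup *src, struct mem_cgroup *dst);
            void (*unlock)(struct mem_cgroup *src, struct mem_cgroup *dst);
    };

    #define DECLARE_MEMCG_REPARENT_OPS(name) \
            extern const struct memcg_reparent_ops memcg_##name##_reparent_ops

    #define DEFINE_MEMCG_REPARENT_OPS(name)                                 \
            const struct memcg_reparent_ops memcg_##name##_reparent_ops = { \
                    .lock = name##_reparent_lock,                           \
                    .relocate = name##_reparent_relocate,                   \
                    .unlock = name##_reparent_unlock,                       \
            }

Splitting reparenting into per-subsystem lock/relocate/unlock callbacks lets memcg_reparent_objcgs() hold every involved lock across one relocation pass, with the lock order fixed by the order of the array.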
+#ifdef CONFIG_MEMCG_KMEM
/*
* A lot of the calls to the cache allocation functions are expected to be
* inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
@@ -357,7 +481,7 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
#endif
/**
- * mem_cgroup_css_from_page - css of the memcg associated with a page
+ * get_mem_cgroup_css_from_page - get css of the memcg associated with a page
* @page: page of interest
*
* If memcg is bound to the default hierarchy, css of the memcg associated
@@ -367,13 +491,15 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
* If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
* is returned.
*/
-struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page)
{
struct mem_cgroup *memcg;
- memcg = page_memcg(page);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return &root_mem_cgroup->css;
- if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ memcg = get_mem_cgroup_from_page(page);
+ if (!memcg)
memcg = root_mem_cgroup;
return &memcg->css;
@@ -756,13 +882,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
int val)
{
- struct page *head = compound_head(page); /* rmap on tail pages */
+ struct folio *folio = page_folio(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
rcu_read_lock();
- memcg = page_memcg(head);
+ memcg = folio_memcg(folio);
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!memcg) {
rcu_read_unlock();
@@ -1183,23 +1309,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
return ret;
}
-#ifdef CONFIG_DEBUG_VM
-void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
-{
- struct mem_cgroup *memcg;
-
- if (mem_cgroup_disabled())
- return;
-
- memcg = folio_memcg(folio);
-
- if (!memcg)
- VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio);
- else
- VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
-}
-#endif
-
/**
* folio_lruvec_lock - Lock the lruvec for a folio.
* @folio: Pointer to the folio.
@@ -1214,10 +1323,18 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
*/
struct lruvec *folio_lruvec_lock(struct folio *folio)
{
- struct lruvec *lruvec = folio_lruvec(folio);
+ struct lruvec *lruvec;
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
spin_lock(&lruvec->lru_lock);
- lruvec_memcg_debug(lruvec, folio);
+
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock(&lruvec->lru_lock);
+ goto retry;
+ }
+ rcu_read_unlock();
return lruvec;
}
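For orientation, a hypothetical caller of the relocking helpers (not part of this patch; folio_test_clear_lru(), lruvec_del_folio() and unlock_page_lruvec_irq() are assumed from the existing folio/LRU API) would follow the usual isolate pattern:

    /* Hypothetical example: isolate a folio while its memcg binding is pinned by lru_lock. */
    static void example_isolate_folio(struct folio *folio, struct list_head *dst)
    {
            struct lruvec *lruvec;

            /* Retries internally until folio_memcg() matches the locked lruvec. */
            lruvec = folio_lruvec_lock_irq(folio);
            if (folio_test_clear_lru(folio)) {
                    lruvec_del_folio(lruvec, folio);
                    list_add(&folio->lru, dst);
            }
            unlock_page_lruvec_irq(lruvec);
    }

Once lru_lock is held, lruvec_reparent_lock() cannot proceed, so the folio cannot be relocated away from this lruvec; that is the guarantee the removed lruvec_memcg_debug() check used to assert.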
@@ -1237,10 +1354,18 @@ struct lruvec *folio_lruvec_lock(struct folio *folio)
*/
struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
- struct lruvec *lruvec = folio_lruvec(folio);
+ struct lruvec *lruvec;
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
spin_lock_irq(&lruvec->lru_lock);
- lruvec_memcg_debug(lruvec, folio);
+
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ goto retry;
+ }
+ rcu_read_unlock();
return lruvec;
}
@@ -1262,10 +1387,18 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
unsigned long *flags)
{
- struct lruvec *lruvec = folio_lruvec(folio);
+ struct lruvec *lruvec;
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
spin_lock_irqsave(&lruvec->lru_lock, *flags);
- lruvec_memcg_debug(lruvec, folio);
+
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock_irqrestore(&lruvec->lru_lock, *flags);
+ goto retry;
+ }
+ rcu_read_unlock();
return lruvec;
}
@@ -2037,7 +2170,9 @@ void folio_memcg_lock(struct folio *folio)
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
- */
+ *
+ * The RCU lock also protects the memcg from being freed.
+ */
rcu_read_lock();
if (mem_cgroup_disabled())
@@ -2766,18 +2901,33 @@ static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages
page_counter_uncharge(&memcg->memsw, nr_pages);
}
-static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+static void commit_charge(struct folio *folio, struct obj_cgroup *objcg)
{
- VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
+ VM_BUG_ON_FOLIO(folio_objcg(folio), folio);
/*
- * Any of the following ensures page's memcg stability:
+ * Any of the following ensures page's objcg stability:
*
* - the page lock
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
*/
- folio->memcg_data = (unsigned long)memcg;
+ folio->memcg_data = (unsigned long)objcg;
+}
+
+static struct obj_cgroup *get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
+{
+ struct obj_cgroup *objcg = NULL;
+
+ rcu_read_lock();
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ objcg = rcu_dereference(memcg->objcg);
+ if (objcg && obj_cgroup_tryget(objcg))
+ break;
+ }
+ rcu_read_unlock();
+
+ return objcg;
}
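Unlike the old __get_obj_cgroup_from_memcg(), this walk can terminate at the root memcg, which now owns an objcg as well (set up in mem_cgroup_css_online() below), so the result is always non-NULL and carries a reference. A hypothetical helper showing the expected get/put discipline:

    /* Hypothetical helper, not part of this patch. */
    static bool memcg_is_accounted(struct mem_cgroup *memcg)
    {
            struct obj_cgroup *objcg = get_obj_cgroup_from_memcg(memcg);
            bool accounted = !obj_cgroup_is_root(objcg);

            obj_cgroup_put(objcg);
            return accounted;
    }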
#ifdef CONFIG_MEMCG_KMEM
@@ -2923,19 +3073,6 @@ struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
}
-static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
-{
- struct obj_cgroup *objcg = NULL;
-
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
- objcg = rcu_dereference(memcg->objcg);
- if (objcg && obj_cgroup_tryget(objcg))
- break;
- objcg = NULL;
- }
- return objcg;
-}
-
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
struct obj_cgroup *objcg = NULL;
@@ -2949,7 +3086,16 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
memcg = active_memcg();
else
memcg = mem_cgroup_from_task(current);
- objcg = __get_obj_cgroup_from_memcg(memcg);
+
+ if (mem_cgroup_is_root(memcg))
+ goto out;
+
+ objcg = get_obj_cgroup_from_memcg(memcg);
+ if (obj_cgroup_is_root(objcg)) {
+ obj_cgroup_put(objcg);
+ objcg = NULL;
+ }
+out:
rcu_read_unlock();
return objcg;
}
@@ -2961,20 +3107,10 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
if (!memcg_kmem_enabled() || memcg_kmem_bypass())
return NULL;
- if (PageMemcgKmem(page)) {
- objcg = __folio_objcg(page_folio(page));
+ objcg = folio_objcg(page_folio(page));
+ if (objcg)
obj_cgroup_get(objcg);
- } else {
- struct mem_cgroup *memcg;
- rcu_read_lock();
- memcg = __folio_memcg(page_folio(page));
- if (memcg)
- objcg = __get_obj_cgroup_from_memcg(memcg);
- else
- objcg = NULL;
- rcu_read_unlock();
- }
return objcg;
}
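A hypothetical consumer of the simplified get_obj_cgroup_from_page() (not part of this patch), charging some per-object metadata against whichever cgroup owns the page through the existing obj_cgroup_charge()/obj_cgroup_put() interface:

    static int example_charge_meta(struct page *page, size_t size)
    {
            struct obj_cgroup *objcg = get_obj_cgroup_from_page(page);
            int ret;

            if (!objcg)     /* kmem accounting disabled or an uncharged page */
                    return 0;

            ret = obj_cgroup_charge(objcg, GFP_KERNEL, size);
            obj_cgroup_put(objcg);
            return ret;
    }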
@@ -3069,13 +3205,13 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
struct folio *folio = page_folio(page);
- struct obj_cgroup *objcg;
+ struct obj_cgroup *objcg = folio_objcg(folio);
unsigned int nr_pages = 1 << order;
- if (!folio_memcg_kmem(folio))
+ if (!objcg)
return;
- objcg = __folio_objcg(folio);
+ VM_BUG_ON_FOLIO(!folio_memcg_kmem(folio), folio);
obj_cgroup_uncharge_pages(objcg, nr_pages);
folio->memcg_data = 0;
obj_cgroup_put(objcg);
@@ -3329,24 +3465,21 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
#endif /* CONFIG_MEMCG_KMEM */
/*
- * Because page_memcg(head) is not set on tails, set it now.
+ * Because page_objcg(head) is not set on tails, set it now.
*/
void split_page_memcg(struct page *head, unsigned int nr)
{
struct folio *folio = page_folio(head);
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct obj_cgroup *objcg = folio_objcg(folio);
int i;
- if (mem_cgroup_disabled() || !memcg)
+ if (mem_cgroup_disabled() || !objcg)
return;
for (i = 1; i < nr; i++)
folio_page(folio, i)->memcg_data = folio->memcg_data;
- if (folio_memcg_kmem(folio))
- obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
- else
- css_get_many(&memcg->css, nr - 1);
+ obj_cgroup_get_many(objcg, nr - 1);
}
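To make the reference math concrete: splitting an order-9 THP leaves 512 pages pointing at the same objcg, so the single obj_cgroup_get_many(objcg, nr - 1) above takes the 511 extra references the tail pages now hold. A hypothetical caller, mirroring what split_huge_page()-style code does:

    static void example_split_thp(struct page *head)
    {
            /* 512 subpages end up sharing one objcg; 511 extra refs are taken. */
            split_page_memcg(head, 512);
    }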
#ifdef CONFIG_MEMCG_SWAP
@@ -3651,21 +3784,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
- struct obj_cgroup *objcg;
-
- if (cgroup_memory_nokmem)
+ if (mem_cgroup_kmem_disabled())
return 0;
if (unlikely(mem_cgroup_is_root(memcg)))
return 0;
- objcg = obj_cgroup_alloc();
- if (!objcg)
- return -ENOMEM;
-
- objcg->memcg = memcg;
- rcu_assign_pointer(memcg->objcg, objcg);
-
static_branch_enable(&memcg_kmem_enabled_key);
memcg->kmemcg_id = memcg->id.id;
@@ -3675,27 +3799,13 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
- struct mem_cgroup *parent;
-
- if (cgroup_memory_nokmem)
+ if (mem_cgroup_kmem_disabled())
return;
if (unlikely(mem_cgroup_is_root(memcg)))
return;
- parent = parent_mem_cgroup(memcg);
- if (!parent)
- parent = root_mem_cgroup;
-
- memcg_reparent_objcgs(memcg, parent);
-
- /*
- * After we have finished memcg_reparent_objcgs(), all list_lrus
- * corresponding to this cgroup are guaranteed to remain empty.
- * The ordering is imposed by list_lru_node->lock taken by
- * memcg_reparent_list_lrus().
- */
- memcg_reparent_list_lrus(memcg, parent);
+ memcg_reparent_list_lrus(memcg, parent_mem_cgroup(memcg));
}
#else
static int memcg_online_kmem(struct mem_cgroup *memcg)
@@ -4562,7 +4672,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
struct bdi_writeback *wb)
{
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
struct memcg_cgwb_frn *frn;
u64 now = get_jiffies_64();
u64 oldest_at = now;
@@ -4609,6 +4719,7 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
frn->memcg_id = wb->memcg_css->id;
frn->at = now;
}
+ css_put(&memcg->css);
}
/* issue foreign writeback flushes for recorded foreign dirtying events */
@@ -5088,6 +5199,29 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
return idr_find(&mem_cgroup_idr, id);
}
+#ifdef CONFIG_SHRINKER_DEBUG
+struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
+{
+ struct cgroup *cgrp;
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *memcg;
+
+ cgrp = cgroup_get_from_id(ino);
+ if (!cgrp)
+ return ERR_PTR(-ENOENT);
+
+ css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
+ if (css)
+ memcg = container_of(css, struct mem_cgroup, css);
+ else
+ memcg = ERR_PTR(-ENOENT);
+
+ cgroup_put(cgrp);
+
+ return memcg;
+}
+#endif
+
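A hypothetical consumer of mem_cgroup_get_from_ino() (not from this patch; mem_cgroup_put() is assumed as the usual css-dropping helper), e.g. a shrinker debugfs handler resolving a cgroup inode number:

    static int example_report(unsigned long ino)
    {
            struct mem_cgroup *memcg = mem_cgroup_get_from_ino(ino);

            if (IS_ERR(memcg))
                    return PTR_ERR(memcg);

            pr_info("memcg id %u found for inode %lu\n", mem_cgroup_id(memcg), ino);

            mem_cgroup_put(memcg); /* drop the reference taken via cgroup_get_e_css() */
            return 0;
    }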
static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
@@ -5152,6 +5286,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
if (memcg->id.id < 0) {
error = memcg->id.id;
+ if (error == -ENOSPC)
+ pr_notice_ratelimited("mem_cgroup_id space is exhausted\n");
goto fail;
}
@@ -5177,8 +5313,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->socket_pressure = jiffies;
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
- INIT_LIST_HEAD(&memcg->objcg_list);
#endif
+ INIT_LIST_HEAD(&memcg->objcg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
@@ -5243,6 +5379,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct obj_cgroup *objcg;
if (memcg_online_kmem(memcg))
goto remove_id;
@@ -5255,6 +5392,16 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (alloc_shrinker_info(memcg))
goto offline_kmem;
+ objcg = obj_cgroup_alloc();
+ if (!objcg)
+ goto free_shrinker;
+
+ objcg->memcg = memcg;
+ rcu_assign_pointer(memcg->objcg, objcg);
+
+ if (unlikely(mem_cgroup_is_root(memcg)))
+ root_obj_cgroup = objcg;
+
/* Online state pins memcg ID, memcg ID pins CSS */
refcount_set(&memcg->id.ref, 1);
css_get(css);
@@ -5263,6 +5410,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
2UL*HZ);
return 0;
+free_shrinker:
+ free_shrinker_info(memcg);
offline_kmem:
memcg_offline_kmem(memcg);
remove_id:
@@ -5290,6 +5439,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
+ memcg_reparent_objcgs(memcg);
memcg_offline_kmem(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
@@ -5656,10 +5806,12 @@ static int mem_cgroup_move_account(struct page *page,
*/
smp_mb();
- css_get(&to->css);
- css_put(&from->css);
+ rcu_read_lock();
+ obj_cgroup_get(rcu_dereference(to->objcg));
+ obj_cgroup_put(rcu_dereference(from->objcg));
+ rcu_read_unlock();
- folio->memcg_data = (unsigned long)to;
+ folio->memcg_data = (unsigned long)rcu_access_pointer(to->objcg);
__folio_memcg_unlock(from);
@@ -5693,8 +5845,8 @@ out:
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
- * (so ZONE_DEVICE page and thus not on the lru).
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and
+ * thus not on the lru.
* For now such a page is charged like a regular page would be, as for all
* intents and purposes it is just special memory taking the place of a
* regular page.
@@ -5732,7 +5884,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
- if (is_device_private_page(page))
+ if (is_device_private_page(page) ||
+ is_device_coherent_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
@@ -5833,7 +5986,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
unsigned long precharge;
mmap_read_lock(mm);
- walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
+ walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
mmap_read_unlock(mm);
precharge = mc.precharge;
@@ -6131,13 +6284,55 @@ retry:
* When we have consumed all precharges and failed in doing
* additional charge, the page walk just aborts.
*/
- walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
- NULL);
-
+ walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
mmap_read_unlock(mc.mm);
atomic_dec(&mc.from->moving_account);
+
+ /*
+ * Moving pages to another memcg has finished. Wait for already
+ * started RCU-only updates to finish to make sure that the caller
+ * of lock_page_memcg() can unlock the correct move_lock. A possible
+ * bad scenario would look like:
+ *
+ * CPU0: CPU1:
+ * mem_cgroup_move_charge()
+ * walk_page_range()
+ *
+ * lock_page_memcg(page)
+ * memcg = folio_memcg()
+ * spin_lock_irqsave(&memcg->move_lock)
+ * memcg->move_lock_task = current
+ *
+ * atomic_dec(&mc.from->moving_account)
+ *
+ * mem_cgroup_css_offline()
+ * memcg_offline_kmem()
+ * memcg_reparent_objcgs() <== reparented
+ *
+ * unlock_page_memcg(page)
+ * memcg = folio_memcg() <== memcg has been changed
+ * if (memcg->move_lock_task == current) <== false
+ * spin_unlock_irqrestore(&memcg->move_lock)
+ *
+ * Once mem_cgroup_move_charge() returns (meaning that cgroup_mutex will
+ * be released soon), the page can be reparented to its parent memcg.
+ * When unlock_page_memcg() is then called for the page, it would fail to
+ * unlock the correct move_lock. So use synchronize_rcu() to wait for
+ * already started RCU-only updates to finish before this function
+ * returns (mem_cgroup_move_charge() and mem_cgroup_css_offline() are
+ * serialized by cgroup_mutex).
+ */
+ synchronize_rcu();
}
+/*
+ * Cgroup migration and memory cgroup offlining are serialized by
+ * cgroup_mutex. If we reach here, it means that the LRU pages cannot be
+ * reparented to their parent memory cgroup, so page_memcg(page) is stable
+ * during the whole process of mem_cgroup_move_task(). Hence we do not
+ * need to worry about the memcg (returned from page_memcg()) being
+ * released even if we do not hold an rcu read lock.
+ */
static void mem_cgroup_move_task(void)
{
if (mc.to) {
@@ -6742,21 +6937,26 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
gfp_t gfp)
{
+ struct obj_cgroup *objcg;
long nr_pages = folio_nr_pages(folio);
- int ret;
+ int ret = 0;
- ret = try_charge(memcg, gfp, nr_pages);
+ objcg = get_obj_cgroup_from_memcg(memcg);
+ /* Do not account at the root objcg level. */
+ if (!obj_cgroup_is_root(objcg))
+ ret = try_charge(memcg, gfp, nr_pages);
if (ret)
goto out;
- css_get(&memcg->css);
- commit_charge(folio, memcg);
+ obj_cgroup_get(objcg);
+ commit_charge(folio, objcg);
local_irq_disable();
mem_cgroup_charge_statistics(memcg, nr_pages);
memcg_check_events(memcg, folio_nid(folio));
local_irq_enable();
out:
+ obj_cgroup_put(objcg);
return ret;
}
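charge_memcg() relies on @memcg staying alive while it runs; that is guaranteed because its (unchanged, elided) callers hold a css reference on it. A rough reconstruction of such a caller, not part of this patch:

    int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
    {
            struct mem_cgroup *memcg;
            int ret;

            memcg = get_mem_cgroup_from_mm(mm);     /* pins memcg->css */
            ret = charge_memcg(folio, memcg, gfp);
            css_put(&memcg->css);

            return ret;
    }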
@@ -6842,7 +7042,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
}
struct uncharge_gather {
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
unsigned long nr_memory;
unsigned long pgpgout;
unsigned long nr_kmem;
@@ -6857,63 +7057,56 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
unsigned long flags;
+ struct mem_cgroup *memcg;
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(ug->objcg);
if (ug->nr_memory) {
- page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
+ page_counter_uncharge(&memcg->memory, ug->nr_memory);
if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
+ page_counter_uncharge(&memcg->memsw, ug->nr_memory);
if (ug->nr_kmem)
- memcg_account_kmem(ug->memcg, -ug->nr_kmem);
- memcg_oom_recover(ug->memcg);
+ memcg_account_kmem(memcg, -ug->nr_kmem);
+ memcg_oom_recover(memcg);
}
local_irq_save(flags);
- __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
- memcg_check_events(ug->memcg, ug->nid);
+ __count_memcg_events(memcg, PGPGOUT, ug->pgpgout);
+ __this_cpu_add(memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
+ memcg_check_events(memcg, ug->nid);
local_irq_restore(flags);
+ rcu_read_unlock();
/* drop reference from uncharge_folio */
- css_put(&ug->memcg->css);
+ obj_cgroup_put(ug->objcg);
}
static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
{
long nr_pages;
- struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
/*
* Nobody should be changing or seriously looking at
- * folio memcg or objcg at this point, we have fully
- * exclusive access to the folio.
+ * folio objcg at this point, we have fully exclusive
+ * access to the folio.
*/
- if (folio_memcg_kmem(folio)) {
- objcg = __folio_objcg(folio);
- /*
- * This get matches the put at the end of the function and
- * kmem pages do not hold memcg references anymore.
- */
- memcg = get_mem_cgroup_from_objcg(objcg);
- } else {
- memcg = __folio_memcg(folio);
- }
-
- if (!memcg)
+ objcg = folio_objcg(folio);
+ if (!objcg)
return;
- if (ug->memcg != memcg) {
- if (ug->memcg) {
+ if (ug->objcg != objcg) {
+ if (ug->objcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
- ug->memcg = memcg;
+ ug->objcg = objcg;
ug->nid = folio_nid(folio);
- /* pairs with css_put in uncharge_batch */
- css_get(&memcg->css);
+ /* pairs with obj_cgroup_put in uncharge_batch */
+ obj_cgroup_get(objcg);
}
nr_pages = folio_nr_pages(folio);
@@ -6921,19 +7114,15 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
if (folio_memcg_kmem(folio)) {
ug->nr_memory += nr_pages;
ug->nr_kmem += nr_pages;
-
- folio->memcg_data = 0;
- obj_cgroup_put(objcg);
} else {
/* LRU pages aren't accounted at the root level */
- if (!mem_cgroup_is_root(memcg))
+ if (!obj_cgroup_is_root(objcg))
ug->nr_memory += nr_pages;
ug->pgpgout++;
-
- folio->memcg_data = 0;
}
- css_put(&memcg->css);
+ folio->memcg_data = 0;
+ obj_cgroup_put(objcg);
}
void __mem_cgroup_uncharge(struct folio *folio)
@@ -6941,7 +7130,7 @@ void __mem_cgroup_uncharge(struct folio *folio)
struct uncharge_gather ug;
/* Don't touch folio->lru of any random page, pre-check: */
- if (!folio_memcg(folio))
+ if (!folio_objcg(folio))
return;
uncharge_gather_clear(&ug);
@@ -6964,7 +7153,7 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list)
uncharge_gather_clear(&ug);
list_for_each_entry(folio, page_list, lru)
uncharge_folio(folio, &ug);
- if (ug.memcg)
+ if (ug.objcg)
uncharge_batch(&ug);
}
@@ -6981,6 +7170,7 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list)
void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
long nr_pages = folio_nr_pages(new);
unsigned long flags;
@@ -6993,28 +7183,33 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
return;
/* Page cache replacement: new folio already charged? */
- if (folio_memcg(new))
+ if (folio_objcg(new))
return;
- memcg = folio_memcg(old);
- VM_WARN_ON_ONCE_FOLIO(!memcg, old);
- if (!memcg)
+ objcg = folio_objcg(old);
+ VM_WARN_ON_ONCE_FOLIO(!objcg, old);
+ if (!objcg)
return;
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+
/* Force-charge the new page. The old one will be freed soon */
- if (!mem_cgroup_is_root(memcg)) {
+ if (!obj_cgroup_is_root(objcg)) {
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
}
- css_get(&memcg->css);
- commit_charge(new, memcg);
+ obj_cgroup_get(objcg);
+ commit_charge(new, objcg);
local_irq_save(flags);
mem_cgroup_charge_statistics(memcg, nr_pages);
memcg_check_events(memcg, folio_nid(new));
local_irq_restore(flags);
+
+ rcu_read_unlock();
}
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
@@ -7173,8 +7368,6 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
break;
}
memcg = parent_mem_cgroup(memcg);
- if (!memcg)
- memcg = root_mem_cgroup;
}
return memcg;
}
@@ -7189,6 +7382,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
{
struct mem_cgroup *memcg, *swap_memcg;
+ struct obj_cgroup *objcg;
unsigned int nr_entries;
unsigned short oldid;
@@ -7201,13 +7395,18 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
- memcg = folio_memcg(folio);
-
- VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
- if (!memcg)
+ objcg = folio_objcg(folio);
+ VM_WARN_ON_ONCE_FOLIO(!objcg, folio);
+ if (!objcg)
return;
/*
+ * Interrupts should be disabled by the caller (see the comments below);
+ * a disabled-interrupt section serves as an RCU read-side critical section.
+ */
+ memcg = obj_cgroup_memcg(objcg);
+
+ /*
* In case the memcg owning these pages has been offlined and doesn't
* have an ID allocated to it anymore, charge the closest online
* ancestor for the swap instead and transfer the memory+swap charge.
@@ -7224,7 +7423,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
folio->memcg_data = 0;
- if (!mem_cgroup_is_root(memcg))
+ if (!obj_cgroup_is_root(objcg))
page_counter_uncharge(&memcg->memory, nr_entries);
if (!cgroup_memory_noswap && memcg != swap_memcg) {
@@ -7244,7 +7443,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
memcg_stats_unlock();
memcg_check_events(memcg, folio_nid(folio));
- css_put(&memcg->css);
+ obj_cgroup_put(objcg);
}
/**
@@ -7262,19 +7461,21 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
struct page_counter *counter;
struct mem_cgroup *memcg;
unsigned short oldid;
+ int ret = 0;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
+ rcu_read_lock();
memcg = folio_memcg(folio);
VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
if (!memcg)
- return 0;
+ goto out;
if (!entry.val) {
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
- return 0;
+ goto out;
}
memcg = mem_cgroup_id_get_online(memcg);
@@ -7284,7 +7485,8 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
memcg_memory_event(memcg, MEMCG_SWAP_MAX);
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
mem_cgroup_id_put(memcg);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
/* Get references for the tail pages, too */
@@ -7293,8 +7495,10 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
+out:
+ rcu_read_unlock();
- return 0;
+ return ret;
}
/**
@@ -7339,6 +7543,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
bool mem_cgroup_swap_full(struct page *page)
{
struct mem_cgroup *memcg;
+ bool ret = false;
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -7347,19 +7552,24 @@ bool mem_cgroup_swap_full(struct page *page)
if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
return false;
+ rcu_read_lock();
memcg = page_memcg(page);
if (!memcg)
- return false;
+ goto out;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
unsigned long usage = page_counter_read(&memcg->swap);
if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
- usage * 2 >= READ_ONCE(memcg->swap.max))
- return true;
+ usage * 2 >= READ_ONCE(memcg->swap.max)) {
+ ret = true;
+ goto out;
+ }
}
+out:
+ rcu_read_unlock();
- return false;
+ return ret;
}
static int __init setup_swap_account(char *s)