summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShakeel Butt <shakeelb@google.com>2019-04-26 10:27:12 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2019-04-28 01:04:18 +1000
commitdbc36a394d064c04c8097df07703cdcfbeebdde7 (patch)
tree2a16c181ab40d3ebf0fd12d4143c74df7c9013c2
parentcb3ed54bba982a537a1c87a97f68aaa36c6193b9 (diff)
downloadlinux-next-dbc36a394d064c04c8097df07703cdcfbeebdde7.tar.gz
memcg: schedule high reclaim for remote memcgs on high_work
If a memcg is over its high limit, memory reclaim is scheduled to run on return-to-userland. However, it is assumed that the memcg is the current process's memcg. With remote memcg charging for kmem, or swapping in a page charged to a remote memcg, the current process can trigger reclaim on a remote memcg. So, scheduling reclaim on return-to-userland for remote memcgs will ignore the high reclaim altogether. So, record the memcg needing high reclaim and trigger high reclaim for that memcg on return-to-userland. However, if a memcg is already recorded for high reclaim and the recorded memcg is not a descendant of the memcg needing high reclaim, punt the high reclaim to the work queue. Link: http://lkml.kernel.org/r/20190108200538.80371-1-shakeelb@google.com Signed-off-by: Shakeel Butt <shakeelb@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Vladimir Davydov <vdavydov.dev@gmail.com> Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
-rw-r--r--include/linux/sched.h3
-rw-r--r--kernel/fork.c1
-rw-r--r--mm/memcontrol.c18
3 files changed, 17 insertions, 5 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1549584a1538..493634654257 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1160,6 +1160,9 @@ struct task_struct {
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;
+
+ /* Used by memcontrol for high reclaim: */
+ struct mem_cgroup *memcg_high_reclaim;
#endif
#ifdef CONFIG_BLK_CGROUP
diff --git a/kernel/fork.c b/kernel/fork.c
index dfc7c69a5c9b..dc8871d0b0fe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -918,6 +918,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
+ tsk->memcg_high_reclaim = NULL;
#endif
return tsk;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 287933005e11..d03aa7a07041 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2149,7 +2149,8 @@ void mem_cgroup_handle_over_high(void)
if (likely(!nr_pages))
return;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ memcg = current->memcg_high_reclaim;
+ current->memcg_high_reclaim = NULL;
reclaim_high(memcg, nr_pages, GFP_KERNEL);
css_put(&memcg->css);
current->memcg_nr_pages_over_high = 0;
@@ -2303,10 +2304,10 @@ done_restock:
* If the hierarchy is above the normal consumption range, schedule
* reclaim on returning to userland. We can perform reclaim here
* if __GFP_RECLAIM but let's always punt for simplicity and so that
- * GFP_KERNEL can consistently be used during reclaim. @memcg is
- * not recorded as it most likely matches current's and won't
- * change in the meantime. As high limit is checked again before
- * reclaim, the cost of mismatch is negligible.
+ * GFP_KERNEL can consistently be used during reclaim. Record the memcg
+ * for the return-to-userland high reclaim. If the memcg is already
+ * recorded and the recorded memcg is not the descendant of the memcg
+ * needing high reclaim, punt the high reclaim to the work queue.
*/
do {
if (page_counter_read(&memcg->memory) > memcg->high) {
@@ -2314,6 +2315,13 @@ done_restock:
if (in_interrupt()) {
schedule_work(&memcg->high_work);
break;
+ } else if (!current->memcg_high_reclaim) {
+ css_get(&memcg->css);
+ current->memcg_high_reclaim = memcg;
+ } else if (!mem_cgroup_is_descendant(
+ current->memcg_high_reclaim, memcg)) {
+ schedule_work(&memcg->high_work);
+ break;
}
current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);