From: Sebastian Andrzej Siewior
Date: Tue, 18 Jan 2022 17:28:07 +0100
Subject: [PATCH 1/4] mm/memcg: Disable threshold event handlers on PREEMPT_RT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During the integration of PREEMPT_RT support, the code flow around
memcg_check_events() resulted in `twisted code'. Moving the code around
to avoid this would then lead to an additional local-irq-save section
within memcg_check_events(). While looking better, it adds a
local-irq-save section to a code flow which is usually within a
local-irq-off block on non-PREEMPT_RT configurations.

The threshold event handler is a deprecated memcg v1 feature. Instead
of trying to get it to work under PREEMPT_RT just disable it. There
should be no users on PREEMPT_RT. From that perspective it makes even
less sense to get it to work under PREEMPT_RT while having zero users.

Make memory.soft_limit_in_bytes and cgroup.event_control return
-EOPNOTSUPP on PREEMPT_RT. Provide an empty memcg_check_events() and a
memcg_write_event_control() which returns only -EOPNOTSUPP on
PREEMPT_RT. Document that the two knobs are disabled on PREEMPT_RT.
Shuffle the code around so that all functions which are unused on
PREEMPT_RT end up in one #ifdef block.

Suggested-by: Michal Hocko
Suggested-by: Michal Koutný
Signed-off-by: Sebastian Andrzej Siewior
---
 Documentation/admin-guide/cgroup-v1/memory.rst |    2 
 mm/memcontrol.c                                |  728 ++++++++++++-------------
 2 files changed, 374 insertions(+), 356 deletions(-)

--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -64,6 +64,7 @@ Brief summary of control files.
				     threads
 cgroup.procs			     show list of processes
 cgroup.event_control		     an interface for event_fd()
+				     This knob is not available on CONFIG_PREEMPT_RT systems.
 memory.usage_in_bytes		     show current usage for memory
				     (See 5.5 for details)
 memory.memsw.usage_in_bytes	     show current usage for memory+Swap
@@ -75,6 +76,7 @@ Brief summary of control files.
 memory.max_usage_in_bytes	     show max memory usage recorded
 memory.memsw.max_usage_in_bytes     show max memory+Swap usage recorded
 memory.soft_limit_in_bytes	     set/show soft limit of memory usage
+				     This knob is not available on CONFIG_PREEMPT_RT systems.
 memory.stat			     show various statistics
 memory.use_hierarchy		     set/show hierarchical account enabled
				     This knob is deprecated and shouldn't be
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -169,7 +169,6 @@ struct mem_cgroup_event {
 	struct work_struct remove;
 };

-static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

 /* Stuffs for move charges at task migration. */
@@ -521,43 +520,6 @@ static unsigned long soft_limit_excess(s
 	return excess;
 }

-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
-{
-	unsigned long excess;
-	struct mem_cgroup_per_node *mz;
-	struct mem_cgroup_tree_per_node *mctz;
-
-	mctz = soft_limit_tree.rb_tree_per_node[nid];
-	if (!mctz)
-		return;
-	/*
-	 * Necessary to update all ancestors when hierarchy is used.
-	 * because their event counter is not touched.
-	 */
-	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-		mz = memcg->nodeinfo[nid];
-		excess = soft_limit_excess(memcg);
-		/*
-		 * We have to update the tree if mz is on RB-tree or
-		 * mem is over its softlimit.
-		 */
-		if (excess || mz->on_tree) {
-			unsigned long flags;
-
-			spin_lock_irqsave(&mctz->lock, flags);
-			/* if on-tree, remove it */
-			if (mz->on_tree)
-				__mem_cgroup_remove_exceeded(mz, mctz);
-			/*
-			 * Insert again.
mz->usage_in_excess will be updated. - * If excess is 0, no tree ops. - */ - __mem_cgroup_insert_exceeded(mz, mctz, excess); - spin_unlock_irqrestore(&mctz->lock, flags); - } - } -} - static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) { struct mem_cgroup_tree_per_node *mctz; @@ -827,50 +789,6 @@ static void mem_cgroup_charge_statistics __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); } -static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, - enum mem_cgroup_events_target target) -{ - unsigned long val, next; - - val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); - next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); - /* from time_after() in jiffies.h */ - if ((long)(next - val) < 0) { - switch (target) { - case MEM_CGROUP_TARGET_THRESH: - next = val + THRESHOLDS_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_SOFTLIMIT: - next = val + SOFTLIMIT_EVENTS_TARGET; - break; - default: - break; - } - __this_cpu_write(memcg->vmstats_percpu->targets[target], next); - return true; - } - return false; -} - -/* - * Check events in order. - * - */ -static void memcg_check_events(struct mem_cgroup *memcg, int nid) -{ - /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(mem_cgroup_event_ratelimit(memcg, - MEM_CGROUP_TARGET_THRESH))) { - bool do_softlimit; - - do_softlimit = mem_cgroup_event_ratelimit(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT); - mem_cgroup_threshold(memcg); - if (unlikely(do_softlimit)) - mem_cgroup_update_tree(memcg, nid); - } -} - struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { /* @@ -3763,8 +3681,12 @@ static ssize_t mem_cgroup_write(struct k } break; case RES_SOFT_LIMIT: +#ifndef CONFIG_PREEMPT_RT memcg->soft_limit = nr_pages; ret = 0; +#else + ret = -EOPNOTSUPP; +#endif break; } return ret ?: nbytes; @@ -4069,6 +3991,343 @@ static int mem_cgroup_swappiness_write(s return 0; } +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) +{ + struct mem_cgroup_eventfd_list *ev; + + spin_lock(&memcg_oom_lock); + + list_for_each_entry(ev, &memcg->oom_notify, list) + eventfd_signal(ev->eventfd, 1); + + spin_unlock(&memcg_oom_lock); + return 0; +} + +static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) +{ + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, memcg) + mem_cgroup_oom_notify_cb(iter); +} + +static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); + + seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); + seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); + seq_printf(sf, "oom_kill %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); + return 0; +} + +static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + /* cannot set to root cgroup and only 0 and 1 are allowed */ + if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) + return -EINVAL; + + memcg->oom_kill_disable = val; + if (!val) + memcg_oom_recover(memcg); + + return 0; +} + +#ifdef CONFIG_CGROUP_WRITEBACK + +#include + +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return wb_domain_init(&memcg->cgwb_domain, gfp); +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ + wb_domain_exit(&memcg->cgwb_domain); +} + +static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +{ + wb_domain_size_changed(&memcg->cgwb_domain); +} + 
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + + if (!memcg->css.parent) + return NULL; + + return &memcg->cgwb_domain; +} + +/** + * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg + * @wb: bdi_writeback in question + * @pfilepages: out parameter for number of file pages + * @pheadroom: out parameter for number of allocatable pages according to memcg + * @pdirty: out parameter for number of dirty pages + * @pwriteback: out parameter for number of pages under writeback + * + * Determine the numbers of file, headroom, dirty, and writeback pages in + * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom + * is a bit more involved. + * + * A memcg's headroom is "min(max, high) - used". In the hierarchy, the + * headroom is calculated as the lowest headroom of itself and the + * ancestors. Note that this doesn't consider the actual amount of + * available memory in the system. The caller should further cap + * *@pheadroom accordingly. + */ +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, + unsigned long *pheadroom, unsigned long *pdirty, + unsigned long *pwriteback) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + struct mem_cgroup *parent; + + mem_cgroup_flush_stats(); + + *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); + *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); + *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + + memcg_page_state(memcg, NR_ACTIVE_FILE); + + *pheadroom = PAGE_COUNTER_MAX; + while ((parent = parent_mem_cgroup(memcg))) { + unsigned long ceiling = min(READ_ONCE(memcg->memory.max), + READ_ONCE(memcg->memory.high)); + unsigned long used = page_counter_read(&memcg->memory); + + *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); + memcg = parent; + } +} + +/* + * Foreign dirty flushing + * + * There's an inherent mismatch between memcg and writeback. The former + * tracks ownership per-page while the latter per-inode. This was a + * deliberate design decision because honoring per-page ownership in the + * writeback path is complicated, may lead to higher CPU and IO overheads + * and deemed unnecessary given that write-sharing an inode across + * different cgroups isn't a common use-case. + * + * Combined with inode majority-writer ownership switching, this works well + * enough in most cases but there are some pathological cases. For + * example, let's say there are two cgroups A and B which keep writing to + * different but confined parts of the same inode. B owns the inode and + * A's memory is limited far below B's. A's dirty ratio can rise enough to + * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid + * triggering background writeback. A will be slowed down without a way to + * make writeback of the dirty pages happen. + * + * Conditions like the above can lead to a cgroup getting repeatedly and + * severely throttled after making some progress after each + * dirty_expire_interval while the underlying IO device is almost + * completely idle. + * + * Solving this problem completely requires matching the ownership tracking + * granularities between memcg and writeback in either direction. However, + * the more egregious behaviors can be avoided by simply remembering the + * most recent foreign dirtying events and initiating remote flushes on + * them when local writeback isn't enough to keep the memory clean enough. 
+ * + * The following two functions implement such mechanism. When a foreign + * page - a page whose memcg and writeback ownerships don't match - is + * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning + * bdi_writeback on the page owning memcg. When balance_dirty_pages() + * decides that the memcg needs to sleep due to high dirty ratio, it calls + * mem_cgroup_flush_foreign() which queues writeback on the recorded + * foreign bdi_writebacks which haven't expired. Both the numbers of + * recorded bdi_writebacks and concurrent in-flight foreign writebacks are + * limited to MEMCG_CGWB_FRN_CNT. + * + * The mechanism only remembers IDs and doesn't hold any object references. + * As being wrong occasionally doesn't matter, updates and accesses to the + * records are lockless and racy. + */ +void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, + struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = folio_memcg(folio); + struct memcg_cgwb_frn *frn; + u64 now = get_jiffies_64(); + u64 oldest_at = now; + int oldest = -1; + int i; + + trace_track_foreign_dirty(folio, wb); + + /* + * Pick the slot to use. If there is already a slot for @wb, keep + * using it. If not replace the oldest one which isn't being + * written out. + */ + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { + frn = &memcg->cgwb_frn[i]; + if (frn->bdi_id == wb->bdi->id && + frn->memcg_id == wb->memcg_css->id) + break; + if (time_before64(frn->at, oldest_at) && + atomic_read(&frn->done.cnt) == 1) { + oldest = i; + oldest_at = frn->at; + } + } + + if (i < MEMCG_CGWB_FRN_CNT) { + /* + * Re-using an existing one. Update timestamp lazily to + * avoid making the cacheline hot. We want them to be + * reasonably up-to-date and significantly shorter than + * dirty_expire_interval as that's what expires the record. + * Use the shorter of 1s and dirty_expire_interval / 8. + */ + unsigned long update_intv = + min_t(unsigned long, HZ, + msecs_to_jiffies(dirty_expire_interval * 10) / 8); + + if (time_before64(frn->at, now - update_intv)) + frn->at = now; + } else if (oldest >= 0) { + /* replace the oldest free one */ + frn = &memcg->cgwb_frn[oldest]; + frn->bdi_id = wb->bdi->id; + frn->memcg_id = wb->memcg_css->id; + frn->at = now; + } +} + +/* issue foreign writeback flushes for recorded foreign dirtying events */ +void mem_cgroup_flush_foreign(struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); + u64 now = jiffies_64; + int i; + + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { + struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; + + /* + * If the record is older than dirty_expire_interval, + * writeback on it has already started. No need to kick it + * off again. Also, don't start a new one if there's + * already one in flight. + */ + if (time_after64(frn->at, now - intv) && + atomic_read(&frn->done.cnt) == 1) { + frn->at = 0; + trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); + cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, + WB_REASON_FOREIGN_FLUSH, + &frn->done); + } + } +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return 0; +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ +} + +static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +#ifndef CONFIG_PREEMPT_RT +/* + * DO NOT USE IN NEW FILES. + * + * "cgroup.event_control" implementation. 
+ * + * This is way over-engineered. It tries to support fully configurable + * events for each user. Such level of flexibility is completely + * unnecessary especially in the light of the planned unified hierarchy. + * + * Please deprecate this and replace with something simpler if at all + * possible. + */ + +static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, + enum mem_cgroup_events_target target) +{ + unsigned long val, next; + + val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); + next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); + /* from time_after() in jiffies.h */ + if ((long)(next - val) < 0) { + switch (target) { + case MEM_CGROUP_TARGET_THRESH: + next = val + THRESHOLDS_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; + default: + break; + } + __this_cpu_write(memcg->vmstats_percpu->targets[target], next); + return true; + } + return false; +} + +static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) +{ + unsigned long excess; + struct mem_cgroup_per_node *mz; + struct mem_cgroup_tree_per_node *mctz; + + mctz = soft_limit_tree.rb_tree_per_node[nid]; + if (!mctz) + return; + /* + * Necessary to update all ancestors when hierarchy is used. + * because their event counter is not touched. + */ + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + mz = memcg->nodeinfo[nid]; + excess = soft_limit_excess(memcg); + /* + * We have to update the tree if mz is on RB-tree or + * mem is over its softlimit. + */ + if (excess || mz->on_tree) { + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); + /* if on-tree, remove it */ + if (mz->on_tree) + __mem_cgroup_remove_exceeded(mz, mctz); + /* + * Insert again. mz->usage_in_excess will be updated. + * If excess is 0, no tree ops. + */ + __mem_cgroup_insert_exceeded(mz, mctz, excess); + spin_unlock_irqrestore(&mctz->lock, flags); + } + } +} + static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) { struct mem_cgroup_threshold_ary *t; @@ -4131,6 +4390,25 @@ static void mem_cgroup_threshold(struct } } +/* + * Check events in order. 
+ * + */ +static void memcg_check_events(struct mem_cgroup *memcg, int nid) +{ + /* threshold event is triggered in finer grain than soft limit */ + if (unlikely(mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_THRESH))) { + bool do_softlimit; + + do_softlimit = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_SOFTLIMIT); + mem_cgroup_threshold(memcg); + if (unlikely(do_softlimit)) + mem_cgroup_update_tree(memcg, nid); + } +} + static int compare_thresholds(const void *a, const void *b) { const struct mem_cgroup_threshold *_a = a; @@ -4145,27 +4423,6 @@ static int compare_thresholds(const void return 0; } -static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) -{ - struct mem_cgroup_eventfd_list *ev; - - spin_lock(&memcg_oom_lock); - - list_for_each_entry(ev, &memcg->oom_notify, list) - eventfd_signal(ev->eventfd, 1); - - spin_unlock(&memcg_oom_lock); - return 0; -} - -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) -{ - struct mem_cgroup *iter; - - for_each_mem_cgroup_tree(iter, memcg) - mem_cgroup_oom_notify_cb(iter); -} - static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args, enum res_type type) { @@ -4394,259 +4651,6 @@ static void mem_cgroup_oom_unregister_ev spin_unlock(&memcg_oom_lock); } -static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) -{ - struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); - - seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); - seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); - seq_printf(sf, "oom_kill %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); - return 0; -} - -static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - - /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) - return -EINVAL; - - memcg->oom_kill_disable = val; - if (!val) - memcg_oom_recover(memcg); - - return 0; -} - -#ifdef CONFIG_CGROUP_WRITEBACK - -#include - -static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) -{ - return wb_domain_init(&memcg->cgwb_domain, gfp); -} - -static void memcg_wb_domain_exit(struct mem_cgroup *memcg) -{ - wb_domain_exit(&memcg->cgwb_domain); -} - -static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) -{ - wb_domain_size_changed(&memcg->cgwb_domain); -} - -struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); - - if (!memcg->css.parent) - return NULL; - - return &memcg->cgwb_domain; -} - -/** - * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg - * @wb: bdi_writeback in question - * @pfilepages: out parameter for number of file pages - * @pheadroom: out parameter for number of allocatable pages according to memcg - * @pdirty: out parameter for number of dirty pages - * @pwriteback: out parameter for number of pages under writeback - * - * Determine the numbers of file, headroom, dirty, and writeback pages in - * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom - * is a bit more involved. - * - * A memcg's headroom is "min(max, high) - used". In the hierarchy, the - * headroom is calculated as the lowest headroom of itself and the - * ancestors. Note that this doesn't consider the actual amount of - * available memory in the system. 
The caller should further cap - * *@pheadroom accordingly. - */ -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, - unsigned long *pheadroom, unsigned long *pdirty, - unsigned long *pwriteback) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); - struct mem_cgroup *parent; - - mem_cgroup_flush_stats(); - - *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); - *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); - *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + - memcg_page_state(memcg, NR_ACTIVE_FILE); - - *pheadroom = PAGE_COUNTER_MAX; - while ((parent = parent_mem_cgroup(memcg))) { - unsigned long ceiling = min(READ_ONCE(memcg->memory.max), - READ_ONCE(memcg->memory.high)); - unsigned long used = page_counter_read(&memcg->memory); - - *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); - memcg = parent; - } -} - -/* - * Foreign dirty flushing - * - * There's an inherent mismatch between memcg and writeback. The former - * tracks ownership per-page while the latter per-inode. This was a - * deliberate design decision because honoring per-page ownership in the - * writeback path is complicated, may lead to higher CPU and IO overheads - * and deemed unnecessary given that write-sharing an inode across - * different cgroups isn't a common use-case. - * - * Combined with inode majority-writer ownership switching, this works well - * enough in most cases but there are some pathological cases. For - * example, let's say there are two cgroups A and B which keep writing to - * different but confined parts of the same inode. B owns the inode and - * A's memory is limited far below B's. A's dirty ratio can rise enough to - * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid - * triggering background writeback. A will be slowed down without a way to - * make writeback of the dirty pages happen. - * - * Conditions like the above can lead to a cgroup getting repeatedly and - * severely throttled after making some progress after each - * dirty_expire_interval while the underlying IO device is almost - * completely idle. - * - * Solving this problem completely requires matching the ownership tracking - * granularities between memcg and writeback in either direction. However, - * the more egregious behaviors can be avoided by simply remembering the - * most recent foreign dirtying events and initiating remote flushes on - * them when local writeback isn't enough to keep the memory clean enough. - * - * The following two functions implement such mechanism. When a foreign - * page - a page whose memcg and writeback ownerships don't match - is - * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning - * bdi_writeback on the page owning memcg. When balance_dirty_pages() - * decides that the memcg needs to sleep due to high dirty ratio, it calls - * mem_cgroup_flush_foreign() which queues writeback on the recorded - * foreign bdi_writebacks which haven't expired. Both the numbers of - * recorded bdi_writebacks and concurrent in-flight foreign writebacks are - * limited to MEMCG_CGWB_FRN_CNT. - * - * The mechanism only remembers IDs and doesn't hold any object references. - * As being wrong occasionally doesn't matter, updates and accesses to the - * records are lockless and racy. 
- */ -void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, - struct bdi_writeback *wb) -{ - struct mem_cgroup *memcg = folio_memcg(folio); - struct memcg_cgwb_frn *frn; - u64 now = get_jiffies_64(); - u64 oldest_at = now; - int oldest = -1; - int i; - - trace_track_foreign_dirty(folio, wb); - - /* - * Pick the slot to use. If there is already a slot for @wb, keep - * using it. If not replace the oldest one which isn't being - * written out. - */ - for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { - frn = &memcg->cgwb_frn[i]; - if (frn->bdi_id == wb->bdi->id && - frn->memcg_id == wb->memcg_css->id) - break; - if (time_before64(frn->at, oldest_at) && - atomic_read(&frn->done.cnt) == 1) { - oldest = i; - oldest_at = frn->at; - } - } - - if (i < MEMCG_CGWB_FRN_CNT) { - /* - * Re-using an existing one. Update timestamp lazily to - * avoid making the cacheline hot. We want them to be - * reasonably up-to-date and significantly shorter than - * dirty_expire_interval as that's what expires the record. - * Use the shorter of 1s and dirty_expire_interval / 8. - */ - unsigned long update_intv = - min_t(unsigned long, HZ, - msecs_to_jiffies(dirty_expire_interval * 10) / 8); - - if (time_before64(frn->at, now - update_intv)) - frn->at = now; - } else if (oldest >= 0) { - /* replace the oldest free one */ - frn = &memcg->cgwb_frn[oldest]; - frn->bdi_id = wb->bdi->id; - frn->memcg_id = wb->memcg_css->id; - frn->at = now; - } -} - -/* issue foreign writeback flushes for recorded foreign dirtying events */ -void mem_cgroup_flush_foreign(struct bdi_writeback *wb) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); - unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); - u64 now = jiffies_64; - int i; - - for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { - struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; - - /* - * If the record is older than dirty_expire_interval, - * writeback on it has already started. No need to kick it - * off again. Also, don't start a new one if there's - * already one in flight. - */ - if (time_after64(frn->at, now - intv) && - atomic_read(&frn->done.cnt) == 1) { - frn->at = 0; - trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); - cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, - WB_REASON_FOREIGN_FLUSH, - &frn->done); - } - } -} - -#else /* CONFIG_CGROUP_WRITEBACK */ - -static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) -{ - return 0; -} - -static void memcg_wb_domain_exit(struct mem_cgroup *memcg) -{ -} - -static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) -{ -} - -#endif /* CONFIG_CGROUP_WRITEBACK */ - -/* - * DO NOT USE IN NEW FILES. - * - * "cgroup.event_control" implementation. - * - * This is way over-engineered. It tries to support fully configurable - * events for each user. Such level of flexibility is completely - * unnecessary especially in the light of the planned unified hierarchy. - * - * Please deprecate this and replace with something simpler if at all - * possible. - */ - /* * Unregister event and free resources. * @@ -4857,6 +4861,18 @@ static ssize_t memcg_write_event_control return ret; } +#else + +static ssize_t memcg_write_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return -EOPNOTSUPP; +} + +static void memcg_check_events(struct mem_cgroup *memcg, int nid) { } + +#endif + #if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) static int mem_cgroup_slab_show(struct seq_file *m, void *p) {