author     Sebastian Andrzej Siewior <bigeasy@linutronix.de>   2022-01-19 18:35:12 +0100
committer  Sebastian Andrzej Siewior <bigeasy@linutronix.de>   2022-01-19 18:35:12 +0100
commit     715da4c19855eb48714f5e1e4a41f7412a850cb3 (patch)
tree       d3081a76d1009045aa3a34c251859451066ba918
parent     b8f3cf6ce0d58825dd408951611589aaea129838 (diff)
download   linux-rt-715da4c19855eb48714f5e1e4a41f7412a850cb3.tar.gz

[ANNOUNCE] v5.16.1-rt17  (tag: v5.16.1-rt17-patches)

Dear RT folks!

I'm pleased to announce the v5.16.1-rt17 patch set.

Changes since v5.16.1-rt16:

  - Make sure that the local_lock_*() functions are completely optimized
    away on !RT without debug.

  - Updates to memcg: disable the threshold event handler on RT; it is a
    deprecated cgroup v1 feature.

  - i2c:

    - Host notify on SMBus does not seem to work on RT. Reported by
      Michael Below, waiting for feedback.

    - The rcar host driver must not disable force threading.

Known issues:

  - netconsole triggers WARN.

  - Valentin Schneider reported a few splats on ARM64, see
    https://lkml.kernel.org/r/20210810134127.1394269-1-valentin.schneider@arm.com

The delta patch against v5.16.1-rt16 is appended below and can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.16/incr/patch-5.16.1-rt16-rt17.patch.xz

You can get this release via the git tree at:

    git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.16.1-rt17

The RT patch against v5.16.1 can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.16/older/patch-5.16.1-rt17.patch.xz

The split quilt queue is available at:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.16/older/patches-5.16.1-rt17.tar.xz

Sebastian

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
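
As a quick orientation for the memcg delta below, here is a minimal sketch of
the guard those patches apply around the non-atomic per-CPU updates. The
function and counter names are illustrative, not the kernel's; the real call
sites are __mod_memcg_lruvec_state(), __count_memcg_events() and friends in
the diff. On PREEMPT_RT a spinlock_t held by the caller does not disable
interrupts, so a short preempt-disabled section protects the read-modify-write
instead:

    #include <linux/percpu.h>
    #include <linux/preempt.h>

    /* Illustrative per-CPU counter standing in for memcg's vmstats_percpu state. */
    static DEFINE_PER_CPU(unsigned long, example_stat);

    static void example_mod_stat(long val)
    {
            /* On !PREEMPT_RT the callers already run with interrupts disabled. */
            if (IS_ENABLED(CONFIG_PREEMPT_RT))
                    preempt_disable();

            /* Non-atomic per-CPU RMW; safe only while it cannot be preempted. */
            __this_cpu_add(example_stat, val);

            if (IS_ENABLED(CONFIG_PREEMPT_RT))
                    preempt_enable();
    }

On !RT builds the two IS_ENABLED() branches compile away entirely, so the
fast path is unchanged there.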
-rw-r--r--  patches/0001-mm-memcg-Disable-threshold-event-handlers-on-PREEMPT.patch  | 846
-rw-r--r--  patches/0001-mm-memcg-Protect-per-CPU-counter-by-disabling-preemp.patch  | 210
-rw-r--r--  patches/0001_random_remove_unused_irq_flags_argument_from_add_interrupt_randomness.patch  |   2
-rw-r--r--  patches/0002-mm-memcg-Protect-per-CPU-counter-by-disabling-preemp.patch  |  85
-rw-r--r--  patches/0003-mm-memcg-Add-a-local_lock_t-for-IRQ-and-TASK-object.patch (renamed from patches/0002-mm-memcg-Add-a-local_lock_t-for-IRQ-and-TASK-object.patch)  |  51
-rw-r--r--  patches/0003_random_split_add_interrupt_randomness.patch  |   4
-rw-r--r--  patches/0004-mm-memcg-Allow-the-task_obj-optimization-only-on-non.patch (renamed from patches/0003-mm-memcg-Allow-the-task_obj-optimization-only-on-non.patch)  |  20
-rw-r--r--  patches/0004_random_move_the_fast_pool_reset_into_the_caller.patch  |   4
-rw-r--r--  patches/0005_random_defer_processing_of_randomness_on_preempt_rt.patch  |   4
-rw-r--r--  patches/Add_localversion_for_-RT_release.patch  |   2
-rw-r--r--  patches/i2c-core-Let-i2c_handle_smbus_host_notify-use-handle.patch  |  40
-rw-r--r--  patches/i2c-rcar-Allow-interrupt-handler-to-be-threaded.patch  |  49
-rw-r--r--  patches/locking-local_lock-Make-the-empty-local_lock_-functi.patch  |  40
-rw-r--r--  patches/printk__remove_deferred_printing.patch  |   8
-rw-r--r--  patches/series  |  12
15 files changed, 1118 insertions(+), 259 deletions(-)
diff --git a/patches/0001-mm-memcg-Disable-threshold-event-handlers-on-PREEMPT.patch b/patches/0001-mm-memcg-Disable-threshold-event-handlers-on-PREEMPT.patch
new file mode 100644
index 000000000000..d299bfa5b069
--- /dev/null
+++ b/patches/0001-mm-memcg-Disable-threshold-event-handlers-on-PREEMPT.patch
@@ -0,0 +1,846 @@
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Tue, 18 Jan 2022 17:28:07 +0100
+Subject: [PATCH 1/4] mm/memcg: Disable threshold event handlers on PREEMPT_RT
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+During the integration of PREEMPT_RT support, the code flow around
+memcg_check_events() resulted in `twisted code'. Moving the code around
+and avoiding that would then lead to an additional local-irq-save
+section within memcg_check_events(). While looking better, it adds a
+local-irq-save section to a code flow which is usually within a
+local-irq-off block on non-PREEMPT_RT configurations.
+
+The threshold event handler is a deprecated memcg v1 feature. Instead of
+trying to get it to work under PREEMPT_RT just disable it. There should
+be no users on PREEMPT_RT. From that perspective it makes even less
+sense to get it to work under PREEMPT_RT while having zero users.
+
+Make memory.soft_limit_in_bytes and cgroup.event_control return
+-EOPNOTSUPP on PREEMPT_RT. Make an empty memcg_check_events() and
+memcg_write_event_control() which return only -EOPNOTSUPP on PREEMPT_RT.
+Document that the two knobs are disabled on PREEMPT_RT. Shuffle the code
+around so that all unused functions are in one #ifdef block.
+
+Suggested-by: Michal Hocko <mhocko@kernel.org>
+Suggested-by: Michal Koutný <mkoutny@suse.com>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ Documentation/admin-guide/cgroup-v1/memory.rst | 2
+ mm/memcontrol.c | 728 ++++++++++++-------------
+ 2 files changed, 374 insertions(+), 356 deletions(-)
+
+--- a/Documentation/admin-guide/cgroup-v1/memory.rst
++++ b/Documentation/admin-guide/cgroup-v1/memory.rst
+@@ -64,6 +64,7 @@ Brief summary of control files.
+ threads
+ cgroup.procs show list of processes
+ cgroup.event_control an interface for event_fd()
++ This knob is not available on CONFIG_PREEMPT_RT systems.
+ memory.usage_in_bytes show current usage for memory
+ (See 5.5 for details)
+ memory.memsw.usage_in_bytes show current usage for memory+Swap
+@@ -75,6 +76,7 @@ Brief summary of control files.
+ memory.max_usage_in_bytes show max memory usage recorded
+ memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded
+ memory.soft_limit_in_bytes set/show soft limit of memory usage
++ This knob is not available on CONFIG_PREEMPT_RT systems.
+ memory.stat show various statistics
+ memory.use_hierarchy set/show hierarchical account enabled
+ This knob is deprecated and shouldn't be
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -169,7 +169,6 @@ struct mem_cgroup_event {
+ struct work_struct remove;
+ };
+
+-static void mem_cgroup_threshold(struct mem_cgroup *memcg);
+ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
+
+ /* Stuffs for move charges at task migration. */
+@@ -521,43 +520,6 @@ static unsigned long soft_limit_excess(s
+ return excess;
+ }
+
+-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
+-{
+- unsigned long excess;
+- struct mem_cgroup_per_node *mz;
+- struct mem_cgroup_tree_per_node *mctz;
+-
+- mctz = soft_limit_tree.rb_tree_per_node[nid];
+- if (!mctz)
+- return;
+- /*
+- * Necessary to update all ancestors when hierarchy is used.
+- * because their event counter is not touched.
+- */
+- for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+- mz = memcg->nodeinfo[nid];
+- excess = soft_limit_excess(memcg);
+- /*
+- * We have to update the tree if mz is on RB-tree or
+- * mem is over its softlimit.
+- */
+- if (excess || mz->on_tree) {
+- unsigned long flags;
+-
+- spin_lock_irqsave(&mctz->lock, flags);
+- /* if on-tree, remove it */
+- if (mz->on_tree)
+- __mem_cgroup_remove_exceeded(mz, mctz);
+- /*
+- * Insert again. mz->usage_in_excess will be updated.
+- * If excess is 0, no tree ops.
+- */
+- __mem_cgroup_insert_exceeded(mz, mctz, excess);
+- spin_unlock_irqrestore(&mctz->lock, flags);
+- }
+- }
+-}
+-
+ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+ {
+ struct mem_cgroup_tree_per_node *mctz;
+@@ -821,50 +783,6 @@ static void mem_cgroup_charge_statistics
+ __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
+ }
+
+-static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+- enum mem_cgroup_events_target target)
+-{
+- unsigned long val, next;
+-
+- val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
+- next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
+- /* from time_after() in jiffies.h */
+- if ((long)(next - val) < 0) {
+- switch (target) {
+- case MEM_CGROUP_TARGET_THRESH:
+- next = val + THRESHOLDS_EVENTS_TARGET;
+- break;
+- case MEM_CGROUP_TARGET_SOFTLIMIT:
+- next = val + SOFTLIMIT_EVENTS_TARGET;
+- break;
+- default:
+- break;
+- }
+- __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
+- return true;
+- }
+- return false;
+-}
+-
+-/*
+- * Check events in order.
+- *
+- */
+-static void memcg_check_events(struct mem_cgroup *memcg, int nid)
+-{
+- /* threshold event is triggered in finer grain than soft limit */
+- if (unlikely(mem_cgroup_event_ratelimit(memcg,
+- MEM_CGROUP_TARGET_THRESH))) {
+- bool do_softlimit;
+-
+- do_softlimit = mem_cgroup_event_ratelimit(memcg,
+- MEM_CGROUP_TARGET_SOFTLIMIT);
+- mem_cgroup_threshold(memcg);
+- if (unlikely(do_softlimit))
+- mem_cgroup_update_tree(memcg, nid);
+- }
+-}
+-
+ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+ {
+ /*
+@@ -3751,8 +3669,12 @@ static ssize_t mem_cgroup_write(struct k
+ }
+ break;
+ case RES_SOFT_LIMIT:
++#ifndef CONFIG_PREEMPT_RT
+ memcg->soft_limit = nr_pages;
+ ret = 0;
++#else
++ ret = -EOPNOTSUPP;
++#endif
+ break;
+ }
+ return ret ?: nbytes;
+@@ -4057,6 +3979,343 @@ static int mem_cgroup_swappiness_write(s
+ return 0;
+ }
+
++static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
++{
++ struct mem_cgroup_eventfd_list *ev;
++
++ spin_lock(&memcg_oom_lock);
++
++ list_for_each_entry(ev, &memcg->oom_notify, list)
++ eventfd_signal(ev->eventfd, 1);
++
++ spin_unlock(&memcg_oom_lock);
++ return 0;
++}
++
++static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
++{
++ struct mem_cgroup *iter;
++
++ for_each_mem_cgroup_tree(iter, memcg)
++ mem_cgroup_oom_notify_cb(iter);
++}
++
++static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
++{
++ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
++
++ seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
++ seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
++ seq_printf(sf, "oom_kill %lu\n",
++ atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
++ return 0;
++}
++
++static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
++ struct cftype *cft, u64 val)
++{
++ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
++
++ /* cannot set to root cgroup and only 0 and 1 are allowed */
++ if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
++ return -EINVAL;
++
++ memcg->oom_kill_disable = val;
++ if (!val)
++ memcg_oom_recover(memcg);
++
++ return 0;
++}
++
++#ifdef CONFIG_CGROUP_WRITEBACK
++
++#include <trace/events/writeback.h>
++
++static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
++{
++ return wb_domain_init(&memcg->cgwb_domain, gfp);
++}
++
++static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
++{
++ wb_domain_exit(&memcg->cgwb_domain);
++}
++
++static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
++{
++ wb_domain_size_changed(&memcg->cgwb_domain);
++}
++
++struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
++{
++ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
++
++ if (!memcg->css.parent)
++ return NULL;
++
++ return &memcg->cgwb_domain;
++}
++
++/**
++ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
++ * @wb: bdi_writeback in question
++ * @pfilepages: out parameter for number of file pages
++ * @pheadroom: out parameter for number of allocatable pages according to memcg
++ * @pdirty: out parameter for number of dirty pages
++ * @pwriteback: out parameter for number of pages under writeback
++ *
++ * Determine the numbers of file, headroom, dirty, and writeback pages in
++ * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
++ * is a bit more involved.
++ *
++ * A memcg's headroom is "min(max, high) - used". In the hierarchy, the
++ * headroom is calculated as the lowest headroom of itself and the
++ * ancestors. Note that this doesn't consider the actual amount of
++ * available memory in the system. The caller should further cap
++ * *@pheadroom accordingly.
++ */
++void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
++ unsigned long *pheadroom, unsigned long *pdirty,
++ unsigned long *pwriteback)
++{
++ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
++ struct mem_cgroup *parent;
++
++ mem_cgroup_flush_stats();
++
++ *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
++ *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
++ *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
++ memcg_page_state(memcg, NR_ACTIVE_FILE);
++
++ *pheadroom = PAGE_COUNTER_MAX;
++ while ((parent = parent_mem_cgroup(memcg))) {
++ unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
++ READ_ONCE(memcg->memory.high));
++ unsigned long used = page_counter_read(&memcg->memory);
++
++ *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
++ memcg = parent;
++ }
++}
++
++/*
++ * Foreign dirty flushing
++ *
++ * There's an inherent mismatch between memcg and writeback. The former
++ * tracks ownership per-page while the latter per-inode. This was a
++ * deliberate design decision because honoring per-page ownership in the
++ * writeback path is complicated, may lead to higher CPU and IO overheads
++ * and deemed unnecessary given that write-sharing an inode across
++ * different cgroups isn't a common use-case.
++ *
++ * Combined with inode majority-writer ownership switching, this works well
++ * enough in most cases but there are some pathological cases. For
++ * example, let's say there are two cgroups A and B which keep writing to
++ * different but confined parts of the same inode. B owns the inode and
++ * A's memory is limited far below B's. A's dirty ratio can rise enough to
++ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
++ * triggering background writeback. A will be slowed down without a way to
++ * make writeback of the dirty pages happen.
++ *
++ * Conditions like the above can lead to a cgroup getting repeatedly and
++ * severely throttled after making some progress after each
++ * dirty_expire_interval while the underlying IO device is almost
++ * completely idle.
++ *
++ * Solving this problem completely requires matching the ownership tracking
++ * granularities between memcg and writeback in either direction. However,
++ * the more egregious behaviors can be avoided by simply remembering the
++ * most recent foreign dirtying events and initiating remote flushes on
++ * them when local writeback isn't enough to keep the memory clean enough.
++ *
++ * The following two functions implement such mechanism. When a foreign
++ * page - a page whose memcg and writeback ownerships don't match - is
++ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
++ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
++ * decides that the memcg needs to sleep due to high dirty ratio, it calls
++ * mem_cgroup_flush_foreign() which queues writeback on the recorded
++ * foreign bdi_writebacks which haven't expired. Both the numbers of
++ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
++ * limited to MEMCG_CGWB_FRN_CNT.
++ *
++ * The mechanism only remembers IDs and doesn't hold any object references.
++ * As being wrong occasionally doesn't matter, updates and accesses to the
++ * records are lockless and racy.
++ */
++void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
++ struct bdi_writeback *wb)
++{
++ struct mem_cgroup *memcg = folio_memcg(folio);
++ struct memcg_cgwb_frn *frn;
++ u64 now = get_jiffies_64();
++ u64 oldest_at = now;
++ int oldest = -1;
++ int i;
++
++ trace_track_foreign_dirty(folio, wb);
++
++ /*
++ * Pick the slot to use. If there is already a slot for @wb, keep
++ * using it. If not replace the oldest one which isn't being
++ * written out.
++ */
++ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
++ frn = &memcg->cgwb_frn[i];
++ if (frn->bdi_id == wb->bdi->id &&
++ frn->memcg_id == wb->memcg_css->id)
++ break;
++ if (time_before64(frn->at, oldest_at) &&
++ atomic_read(&frn->done.cnt) == 1) {
++ oldest = i;
++ oldest_at = frn->at;
++ }
++ }
++
++ if (i < MEMCG_CGWB_FRN_CNT) {
++ /*
++ * Re-using an existing one. Update timestamp lazily to
++ * avoid making the cacheline hot. We want them to be
++ * reasonably up-to-date and significantly shorter than
++ * dirty_expire_interval as that's what expires the record.
++ * Use the shorter of 1s and dirty_expire_interval / 8.
++ */
++ unsigned long update_intv =
++ min_t(unsigned long, HZ,
++ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
++
++ if (time_before64(frn->at, now - update_intv))
++ frn->at = now;
++ } else if (oldest >= 0) {
++ /* replace the oldest free one */
++ frn = &memcg->cgwb_frn[oldest];
++ frn->bdi_id = wb->bdi->id;
++ frn->memcg_id = wb->memcg_css->id;
++ frn->at = now;
++ }
++}
++
++/* issue foreign writeback flushes for recorded foreign dirtying events */
++void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
++{
++ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
++ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
++ u64 now = jiffies_64;
++ int i;
++
++ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
++ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
++
++ /*
++ * If the record is older than dirty_expire_interval,
++ * writeback on it has already started. No need to kick it
++ * off again. Also, don't start a new one if there's
++ * already one in flight.
++ */
++ if (time_after64(frn->at, now - intv) &&
++ atomic_read(&frn->done.cnt) == 1) {
++ frn->at = 0;
++ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
++ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
++ WB_REASON_FOREIGN_FLUSH,
++ &frn->done);
++ }
++ }
++}
++
++#else /* CONFIG_CGROUP_WRITEBACK */
++
++static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
++{
++ return 0;
++}
++
++static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
++{
++}
++
++static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
++{
++}
++
++#endif /* CONFIG_CGROUP_WRITEBACK */
++
++#ifndef CONFIG_PREEMPT_RT
++/*
++ * DO NOT USE IN NEW FILES.
++ *
++ * "cgroup.event_control" implementation.
++ *
++ * This is way over-engineered. It tries to support fully configurable
++ * events for each user. Such level of flexibility is completely
++ * unnecessary especially in the light of the planned unified hierarchy.
++ *
++ * Please deprecate this and replace with something simpler if at all
++ * possible.
++ */
++
++static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
++ enum mem_cgroup_events_target target)
++{
++ unsigned long val, next;
++
++ val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
++ next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
++ /* from time_after() in jiffies.h */
++ if ((long)(next - val) < 0) {
++ switch (target) {
++ case MEM_CGROUP_TARGET_THRESH:
++ next = val + THRESHOLDS_EVENTS_TARGET;
++ break;
++ case MEM_CGROUP_TARGET_SOFTLIMIT:
++ next = val + SOFTLIMIT_EVENTS_TARGET;
++ break;
++ default:
++ break;
++ }
++ __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
++ return true;
++ }
++ return false;
++}
++
++static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
++{
++ unsigned long excess;
++ struct mem_cgroup_per_node *mz;
++ struct mem_cgroup_tree_per_node *mctz;
++
++ mctz = soft_limit_tree.rb_tree_per_node[nid];
++ if (!mctz)
++ return;
++ /*
++ * Necessary to update all ancestors when hierarchy is used.
++ * because their event counter is not touched.
++ */
++ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
++ mz = memcg->nodeinfo[nid];
++ excess = soft_limit_excess(memcg);
++ /*
++ * We have to update the tree if mz is on RB-tree or
++ * mem is over its softlimit.
++ */
++ if (excess || mz->on_tree) {
++ unsigned long flags;
++
++ spin_lock_irqsave(&mctz->lock, flags);
++ /* if on-tree, remove it */
++ if (mz->on_tree)
++ __mem_cgroup_remove_exceeded(mz, mctz);
++ /*
++ * Insert again. mz->usage_in_excess will be updated.
++ * If excess is 0, no tree ops.
++ */
++ __mem_cgroup_insert_exceeded(mz, mctz, excess);
++ spin_unlock_irqrestore(&mctz->lock, flags);
++ }
++ }
++}
++
+ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
+ {
+ struct mem_cgroup_threshold_ary *t;
+@@ -4119,6 +4378,25 @@ static void mem_cgroup_threshold(struct
+ }
+ }
+
++/*
++ * Check events in order.
++ *
++ */
++static void memcg_check_events(struct mem_cgroup *memcg, int nid)
++{
++ /* threshold event is triggered in finer grain than soft limit */
++ if (unlikely(mem_cgroup_event_ratelimit(memcg,
++ MEM_CGROUP_TARGET_THRESH))) {
++ bool do_softlimit;
++
++ do_softlimit = mem_cgroup_event_ratelimit(memcg,
++ MEM_CGROUP_TARGET_SOFTLIMIT);
++ mem_cgroup_threshold(memcg);
++ if (unlikely(do_softlimit))
++ mem_cgroup_update_tree(memcg, nid);
++ }
++}
++
+ static int compare_thresholds(const void *a, const void *b)
+ {
+ const struct mem_cgroup_threshold *_a = a;
+@@ -4133,27 +4411,6 @@ static int compare_thresholds(const void
+ return 0;
+ }
+
+-static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
+-{
+- struct mem_cgroup_eventfd_list *ev;
+-
+- spin_lock(&memcg_oom_lock);
+-
+- list_for_each_entry(ev, &memcg->oom_notify, list)
+- eventfd_signal(ev->eventfd, 1);
+-
+- spin_unlock(&memcg_oom_lock);
+- return 0;
+-}
+-
+-static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
+-{
+- struct mem_cgroup *iter;
+-
+- for_each_mem_cgroup_tree(iter, memcg)
+- mem_cgroup_oom_notify_cb(iter);
+-}
+-
+ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args, enum res_type type)
+ {
+@@ -4382,259 +4639,6 @@ static void mem_cgroup_oom_unregister_ev
+ spin_unlock(&memcg_oom_lock);
+ }
+
+-static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
+-{
+- struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
+-
+- seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
+- seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
+- seq_printf(sf, "oom_kill %lu\n",
+- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
+- return 0;
+-}
+-
+-static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
+- struct cftype *cft, u64 val)
+-{
+- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+-
+- /* cannot set to root cgroup and only 0 and 1 are allowed */
+- if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
+- return -EINVAL;
+-
+- memcg->oom_kill_disable = val;
+- if (!val)
+- memcg_oom_recover(memcg);
+-
+- return 0;
+-}
+-
+-#ifdef CONFIG_CGROUP_WRITEBACK
+-
+-#include <trace/events/writeback.h>
+-
+-static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+-{
+- return wb_domain_init(&memcg->cgwb_domain, gfp);
+-}
+-
+-static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+-{
+- wb_domain_exit(&memcg->cgwb_domain);
+-}
+-
+-static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+-{
+- wb_domain_size_changed(&memcg->cgwb_domain);
+-}
+-
+-struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+-{
+- struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+-
+- if (!memcg->css.parent)
+- return NULL;
+-
+- return &memcg->cgwb_domain;
+-}
+-
+-/**
+- * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+- * @wb: bdi_writeback in question
+- * @pfilepages: out parameter for number of file pages
+- * @pheadroom: out parameter for number of allocatable pages according to memcg
+- * @pdirty: out parameter for number of dirty pages
+- * @pwriteback: out parameter for number of pages under writeback
+- *
+- * Determine the numbers of file, headroom, dirty, and writeback pages in
+- * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
+- * is a bit more involved.
+- *
+- * A memcg's headroom is "min(max, high) - used". In the hierarchy, the
+- * headroom is calculated as the lowest headroom of itself and the
+- * ancestors. Note that this doesn't consider the actual amount of
+- * available memory in the system. The caller should further cap
+- * *@pheadroom accordingly.
+- */
+-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+- unsigned long *pheadroom, unsigned long *pdirty,
+- unsigned long *pwriteback)
+-{
+- struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+- struct mem_cgroup *parent;
+-
+- mem_cgroup_flush_stats();
+-
+- *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+- *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+- *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
+- memcg_page_state(memcg, NR_ACTIVE_FILE);
+-
+- *pheadroom = PAGE_COUNTER_MAX;
+- while ((parent = parent_mem_cgroup(memcg))) {
+- unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
+- READ_ONCE(memcg->memory.high));
+- unsigned long used = page_counter_read(&memcg->memory);
+-
+- *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
+- memcg = parent;
+- }
+-}
+-
+-/*
+- * Foreign dirty flushing
+- *
+- * There's an inherent mismatch between memcg and writeback. The former
+- * tracks ownership per-page while the latter per-inode. This was a
+- * deliberate design decision because honoring per-page ownership in the
+- * writeback path is complicated, may lead to higher CPU and IO overheads
+- * and deemed unnecessary given that write-sharing an inode across
+- * different cgroups isn't a common use-case.
+- *
+- * Combined with inode majority-writer ownership switching, this works well
+- * enough in most cases but there are some pathological cases. For
+- * example, let's say there are two cgroups A and B which keep writing to
+- * different but confined parts of the same inode. B owns the inode and
+- * A's memory is limited far below B's. A's dirty ratio can rise enough to
+- * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+- * triggering background writeback. A will be slowed down without a way to
+- * make writeback of the dirty pages happen.
+- *
+- * Conditions like the above can lead to a cgroup getting repeatedly and
+- * severely throttled after making some progress after each
+- * dirty_expire_interval while the underlying IO device is almost
+- * completely idle.
+- *
+- * Solving this problem completely requires matching the ownership tracking
+- * granularities between memcg and writeback in either direction. However,
+- * the more egregious behaviors can be avoided by simply remembering the
+- * most recent foreign dirtying events and initiating remote flushes on
+- * them when local writeback isn't enough to keep the memory clean enough.
+- *
+- * The following two functions implement such mechanism. When a foreign
+- * page - a page whose memcg and writeback ownerships don't match - is
+- * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+- * bdi_writeback on the page owning memcg. When balance_dirty_pages()
+- * decides that the memcg needs to sleep due to high dirty ratio, it calls
+- * mem_cgroup_flush_foreign() which queues writeback on the recorded
+- * foreign bdi_writebacks which haven't expired. Both the numbers of
+- * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+- * limited to MEMCG_CGWB_FRN_CNT.
+- *
+- * The mechanism only remembers IDs and doesn't hold any object references.
+- * As being wrong occasionally doesn't matter, updates and accesses to the
+- * records are lockless and racy.
+- */
+-void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
+- struct bdi_writeback *wb)
+-{
+- struct mem_cgroup *memcg = folio_memcg(folio);
+- struct memcg_cgwb_frn *frn;
+- u64 now = get_jiffies_64();
+- u64 oldest_at = now;
+- int oldest = -1;
+- int i;
+-
+- trace_track_foreign_dirty(folio, wb);
+-
+- /*
+- * Pick the slot to use. If there is already a slot for @wb, keep
+- * using it. If not replace the oldest one which isn't being
+- * written out.
+- */
+- for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+- frn = &memcg->cgwb_frn[i];
+- if (frn->bdi_id == wb->bdi->id &&
+- frn->memcg_id == wb->memcg_css->id)
+- break;
+- if (time_before64(frn->at, oldest_at) &&
+- atomic_read(&frn->done.cnt) == 1) {
+- oldest = i;
+- oldest_at = frn->at;
+- }
+- }
+-
+- if (i < MEMCG_CGWB_FRN_CNT) {
+- /*
+- * Re-using an existing one. Update timestamp lazily to
+- * avoid making the cacheline hot. We want them to be
+- * reasonably up-to-date and significantly shorter than
+- * dirty_expire_interval as that's what expires the record.
+- * Use the shorter of 1s and dirty_expire_interval / 8.
+- */
+- unsigned long update_intv =
+- min_t(unsigned long, HZ,
+- msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+-
+- if (time_before64(frn->at, now - update_intv))
+- frn->at = now;
+- } else if (oldest >= 0) {
+- /* replace the oldest free one */
+- frn = &memcg->cgwb_frn[oldest];
+- frn->bdi_id = wb->bdi->id;
+- frn->memcg_id = wb->memcg_css->id;
+- frn->at = now;
+- }
+-}
+-
+-/* issue foreign writeback flushes for recorded foreign dirtying events */
+-void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+-{
+- struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+- unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+- u64 now = jiffies_64;
+- int i;
+-
+- for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+- struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+-
+- /*
+- * If the record is older than dirty_expire_interval,
+- * writeback on it has already started. No need to kick it
+- * off again. Also, don't start a new one if there's
+- * already one in flight.
+- */
+- if (time_after64(frn->at, now - intv) &&
+- atomic_read(&frn->done.cnt) == 1) {
+- frn->at = 0;
+- trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
+- cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
+- WB_REASON_FOREIGN_FLUSH,
+- &frn->done);
+- }
+- }
+-}
+-
+-#else /* CONFIG_CGROUP_WRITEBACK */
+-
+-static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+-{
+- return 0;
+-}
+-
+-static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+-{
+-}
+-
+-static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+-{
+-}
+-
+-#endif /* CONFIG_CGROUP_WRITEBACK */
+-
+-/*
+- * DO NOT USE IN NEW FILES.
+- *
+- * "cgroup.event_control" implementation.
+- *
+- * This is way over-engineered. It tries to support fully configurable
+- * events for each user. Such level of flexibility is completely
+- * unnecessary especially in the light of the planned unified hierarchy.
+- *
+- * Please deprecate this and replace with something simpler if at all
+- * possible.
+- */
+-
+ /*
+ * Unregister event and free resources.
+ *
+@@ -4845,6 +4849,18 @@ static ssize_t memcg_write_event_control
+ return ret;
+ }
+
++#else
++
++static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
++ char *buf, size_t nbytes, loff_t off)
++{
++ return -EOPNOTSUPP;
++}
++
++static void memcg_check_events(struct mem_cgroup *memcg, int nid) { }
++
++#endif
++
+ static struct cftype mem_cgroup_legacy_files[] = {
+ {
+ .name = "usage_in_bytes",
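For illustration only (not taken from the patch): with the RES_SOFT_LIMIT hunk
above, writing the v1 soft-limit knob on a PREEMPT_RT kernel is expected to
fail with EOPNOTSUPP. The cgroup path below is an assumption (cgroup v1 memory
controller mounted at /sys/fs/cgroup/memory with a group named "test"):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            /* Hypothetical path; adjust to the local cgroup v1 mount. */
            const char *knob = "/sys/fs/cgroup/memory/test/memory.soft_limit_in_bytes";
            const char *val = "104857600";  /* 100 MiB */
            int fd = open(knob, O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, val, strlen(val)) < 0)
                    /* Expected on PREEMPT_RT after this change: EOPNOTSUPP. */
                    printf("write: %s\n", strerror(errno));
            close(fd);
            return 0;
    }

On a non-PREEMPT_RT kernel the same write is expected to keep working as
before.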
diff --git a/patches/0001-mm-memcg-Protect-per-CPU-counter-by-disabling-preemp.patch b/patches/0001-mm-memcg-Protect-per-CPU-counter-by-disabling-preemp.patch
deleted file mode 100644
index 9ac4dc028011..000000000000
--- a/patches/0001-mm-memcg-Protect-per-CPU-counter-by-disabling-preemp.patch
+++ /dev/null
@@ -1,210 +0,0 @@
-From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
-Date: Fri, 17 Dec 2021 18:19:19 +0100
-Subject: [PATCH 1/3] mm/memcg: Protect per-CPU counter by disabling preemption
- on PREEMPT_RT
-
-The per-CPU counter are modified with the non-atomic modifier. The
-consistency is ensure by disabling interrupts for the update.
-This breaks on PREEMPT_RT because some sections additionally
-acquire a spinlock_t lock (which becomes sleeping and must not be
-acquired with disabled interrupts). Another problem is that
-mem_cgroup_swapout() expects to be invoked with disabled interrupts
-because the caller has to acquire a spinlock_t which is acquired with
-disabled interrupts. Since spinlock_t never disables interrupts on
-PREEMPT_RT the interrupts are never disabled at this point.
-
-The code is never called from in_irq() context on PREEMPT_RT therefore
-disabling preemption during the update is sufficient on PREEMPT_RT. The
-sections with disabled preemption must exclude memcg_check_events() so
-that spinlock_t locks can still be acquired (for instance in
-eventfd_signal()).
-
-Don't disable interrupts during updates of the per-CPU variables,
-instead use shorter sections which disable preemption.
-
-Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
-Link: https://lkml.kernel.org/r/20211222114111.2206248-2-bigeasy@linutronix.de
----
- mm/memcontrol.c | 74 +++++++++++++++++++++++++++++++++++++++++++++-----------
- 1 file changed, 60 insertions(+), 14 deletions(-)
-
---- a/mm/memcontrol.c
-+++ b/mm/memcontrol.c
-@@ -671,8 +671,14 @@ void __mod_memcg_state(struct mem_cgroup
- if (mem_cgroup_disabled())
- return;
-
-+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
-+ preempt_disable();
-+
- __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
- memcg_rstat_updated(memcg);
-+
-+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
-+ preempt_enable();
- }
-
- /* idx can be of type enum memcg_stat_item or node_stat_item. */
-@@ -699,6 +705,9 @@ void __mod_memcg_lruvec_state(struct lru
- pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- memcg = pn->memcg;
-
-+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
-+ preempt_disable();
-+
- /* Update memcg */
- __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
-
-@@ -706,6 +715,9 @@ void __mod_memcg_lruvec_state(struct lru
- __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
-
- memcg_rstat_updated(memcg);
-+
-+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
-+ preempt_enable();
- }
-
- /**
-@@ -788,8 +800,13 @@ void __count_memcg_events(struct mem_cgr
- if (mem_cgroup_disabled())
- return;
-
-+ if (IS_ENABLED(PREEMPT_RT))
-+ preempt_disable();
-+
- __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
- memcg_rstat_updated(memcg);
-+ if (IS_ENABLED(PREEMPT_RT))
-+ preempt_enable();
- }
-
- static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
-@@ -810,6 +827,9 @@ static unsigned long memcg_events_local(
- static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
- int nr_pages)
- {
-+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
-+ preempt_disable();
-+
- /* pagein of a big page is an event. So, ignore page size */
- if (nr_pages > 0)
- __count_memcg_events(memcg, PGPGIN, 1);
-@@ -819,12 +839,19 @@ static void mem_cgroup_charge_statistics
- }
-
- __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
-+
-+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
-+ preempt_enable();
- }
-
- static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
- enum mem_cgroup_events_target target)
- {
- unsigned long val, next;
-+ bool ret = false;
-+
-+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
-+ preempt_disable();
-
- val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
- next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
-@@ -841,9 +868,11 @@ static bool mem_cgroup_event_ratelimit(s
- break;
- }
- __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
-- return true;
-+ ret = true;
- }
-- return false;
-+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
-+ preempt_enable();
-+ return ret;
- }
-
- /*
-@@ -5645,12 +5674,14 @@ static int mem_cgroup_move_account(struc
- ret = 0;
- nid = folio_nid(folio);
-
-- local_irq_disable();
-+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
-+ local_irq_disable();
- mem_cgroup_charge_statistics(to, nr_pages);
- memcg_check_events(to, nid);
- mem_cgroup_charge_statistics(from, -nr_pages);
- memcg_check_events(from, nid);
-- local_irq_enable();
-+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
-+ local_irq_enable();
- out_unlock:
- folio_unlock(folio);
- out:
-@@ -6670,10 +6701,12 @@ static int charge_memcg(struct folio *fo
- css_get(&memcg->css);
- commit_charge(folio, memcg);
-
-- local_irq_disable();
-+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
-+ local_irq_disable();
- mem_cgroup_charge_statistics(memcg, nr_pages);
- memcg_check_events(memcg, folio_nid(folio));
-- local_irq_enable();
-+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
-+ local_irq_enable();
- out:
- return ret;
- }
-@@ -6785,11 +6818,20 @@ static void uncharge_batch(const struct
- memcg_oom_recover(ug->memcg);
- }
-
-- local_irq_save(flags);
-- __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
-- __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
-- memcg_check_events(ug->memcg, ug->nid);
-- local_irq_restore(flags);
-+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
-+ local_irq_save(flags);
-+ __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
-+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
-+ memcg_check_events(ug->memcg, ug->nid);
-+ local_irq_restore(flags);
-+ } else {
-+ preempt_disable();
-+ __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
-+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
-+ preempt_enable();
-+
-+ memcg_check_events(ug->memcg, ug->nid);
-+ }
-
- /* drop reference from uncharge_folio */
- css_put(&ug->memcg->css);
-@@ -6930,10 +6972,12 @@ void mem_cgroup_migrate(struct folio *ol
- css_get(&memcg->css);
- commit_charge(new, memcg);
-
-- local_irq_save(flags);
-+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
-+ local_irq_save(flags);
- mem_cgroup_charge_statistics(memcg, nr_pages);
- memcg_check_events(memcg, folio_nid(new));
-- local_irq_restore(flags);
-+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
-+ local_irq_restore(flags);
- }
-
- DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
-@@ -7157,8 +7201,10 @@ void mem_cgroup_swapout(struct page *pag
- * i_pages lock which is taken with interrupts-off. It is
- * important here to have the interrupts disabled because it is the
- * only synchronisation we have for updating the per-CPU variables.
-+ * On PREEMPT_RT interrupts are never disabled and the updates to per-CPU
-+ * variables are synchronised by keeping preemption disabled.
- */
-- VM_BUG_ON(!irqs_disabled());
-+ VM_BUG_ON(!IS_ENABLED(CONFIG_PREEMPT_RT) && !irqs_disabled());
- mem_cgroup_charge_statistics(memcg, -nr_entries);
- memcg_check_events(memcg, page_to_nid(page));
-
diff --git a/patches/0001_random_remove_unused_irq_flags_argument_from_add_interrupt_randomness.patch b/patches/0001_random_remove_unused_irq_flags_argument_from_add_interrupt_randomness.patch
index e596a409f7ec..01dcb9789602 100644
--- a/patches/0001_random_remove_unused_irq_flags_argument_from_add_interrupt_randomness.patch
+++ b/patches/0001_random_remove_unused_irq_flags_argument_from_add_interrupt_randomness.patch
@@ -53,7 +53,7 @@ Link: https://lore.kernel.org/r/20211207121737.2347312-2-bigeasy@linutronix.de
* void add_disk_randomness(struct gendisk *disk);
*
* add_device_randomness() is for adding data to the random pool that
-@@ -1242,7 +1242,7 @@ static __u32 get_reg(struct fast_pool *f
+@@ -1260,7 +1260,7 @@ static __u32 get_reg(struct fast_pool *f
return *ptr;
}
diff --git a/patches/0002-mm-memcg-Protect-per-CPU-counter-by-disabling-preemp.patch b/patches/0002-mm-memcg-Protect-per-CPU-counter-by-disabling-preemp.patch
new file mode 100644
index 000000000000..7ed1f73e4ea1
--- /dev/null
+++ b/patches/0002-mm-memcg-Protect-per-CPU-counter-by-disabling-preemp.patch
@@ -0,0 +1,85 @@
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Fri, 17 Dec 2021 18:19:19 +0100
+Subject: [PATCH 2/4] mm/memcg: Protect per-CPU counter by disabling preemption
+ on PREEMPT_RT where needed.
+
+The per-CPU counters are modified with the non-atomic modifier. The
+consistency is ensured by disabling interrupts for the update.
+On non-PREEMPT_RT configurations this works because acquiring a
+spinlock_t typed lock with the _irq() suffix disables interrupts. On
+PREEMPT_RT configurations the RMW operation can be interrupted.
+
+Another problem is that mem_cgroup_swapout() expects to be invoked with
+disabled interrupts because the caller has to acquire a spinlock_t which
+is acquired with disabled interrupts. Since spinlock_t never disables
+interrupts on PREEMPT_RT the interrupts are never disabled at this
+point.
+
+The code is never called from in_irq() context on PREEMPT_RT therefore
+disabling preemption during the update is sufficient on PREEMPT_RT.
+The sections which explicitly disable interrupts can remain on
+PREEMPT_RT because the sections remain short and they don't involve
+sleeping locks (memcg_check_events() is doing nothing on PREEMPT_RT).
+
+Disable preemption during update of the per-CPU variables which do not
+explicitly disable interrupts.
+
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ mm/memcontrol.c | 21 +++++++++++++++++++--
+ 1 file changed, 19 insertions(+), 2 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -661,6 +661,8 @@ void __mod_memcg_lruvec_state(struct lru
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+ memcg = pn->memcg;
+
++ if (IS_ENABLED(CONFIG_PREEMPT_RT))
++ preempt_disable();
+ /* Update memcg */
+ __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
+
+@@ -668,6 +670,8 @@ void __mod_memcg_lruvec_state(struct lru
+ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+
+ memcg_rstat_updated(memcg);
++ if (IS_ENABLED(CONFIG_PREEMPT_RT))
++ preempt_enable();
+ }
+
+ /**
+@@ -750,8 +754,12 @@ void __count_memcg_events(struct mem_cgr
+ if (mem_cgroup_disabled())
+ return;
+
++ if (IS_ENABLED(CONFIG_PREEMPT_RT))
++ preempt_disable();
+ __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
+ memcg_rstat_updated(memcg);
++ if (IS_ENABLED(CONFIG_PREEMPT_RT))
++ preempt_enable();
+ }
+
+ static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
+@@ -7173,9 +7181,18 @@ void mem_cgroup_swapout(struct page *pag
+ * i_pages lock which is taken with interrupts-off. It is
+ * important here to have the interrupts disabled because it is the
+ * only synchronisation we have for updating the per-CPU variables.
++ * On PREEMPT_RT interrupts are never disabled and the updates to per-CPU
++ * variables are synchronised by keeping preemption disabled.
+ */
+- VM_BUG_ON(!irqs_disabled());
+- mem_cgroup_charge_statistics(memcg, -nr_entries);
++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
++ VM_BUG_ON(!irqs_disabled());
++ mem_cgroup_charge_statistics(memcg, -nr_entries);
++ } else {
++ preempt_disable();
++ mem_cgroup_charge_statistics(memcg, -nr_entries);
++ preempt_enable();
++ }
++
+ memcg_check_events(memcg, page_to_nid(page));
+
+ css_put(&memcg->css);
diff --git a/patches/0002-mm-memcg-Add-a-local_lock_t-for-IRQ-and-TASK-object.patch b/patches/0003-mm-memcg-Add-a-local_lock_t-for-IRQ-and-TASK-object.patch
index 42b457e4a0f2..13ba236cffa9 100644
--- a/patches/0002-mm-memcg-Add-a-local_lock_t-for-IRQ-and-TASK-object.patch
+++ b/patches/0003-mm-memcg-Add-a-local_lock_t-for-IRQ-and-TASK-object.patch
@@ -1,6 +1,6 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 20 Dec 2021 11:14:17 +0100
-Subject: [PATCH 2/3] mm/memcg: Add a local_lock_t for IRQ and TASK object.
+Subject: [PATCH 3/4] mm/memcg: Add a local_lock_t for IRQ and TASK object.
The members of the per-CPU structure memcg_stock_pcp are protected
either by disabling interrupts or by disabling preemption if the
@@ -50,14 +50,13 @@ interrupts with a local_lock_t. This change requires some factoring:
complains.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
-Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
---
mm/memcontrol.c | 176 ++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 115 insertions(+), 61 deletions(-)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
-@@ -261,8 +261,10 @@ bool mem_cgroup_kmem_disabled(void)
+@@ -260,8 +260,10 @@ bool mem_cgroup_kmem_disabled(void)
return cgroup_memory_nokmem;
}
@@ -69,7 +68,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
static void obj_cgroup_release(struct percpu_ref *ref)
{
-@@ -296,7 +298,7 @@ static void obj_cgroup_release(struct pe
+@@ -295,7 +297,7 @@ static void obj_cgroup_release(struct pe
nr_pages = nr_bytes >> PAGE_SHIFT;
if (nr_pages)
@@ -78,7 +77,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
spin_lock_irqsave(&css_set_lock, flags);
list_del(&objcg->list);
-@@ -2120,26 +2122,40 @@ struct obj_stock {
+@@ -2017,26 +2019,40 @@ struct obj_stock {
};
struct memcg_stock_pcp {
@@ -122,7 +121,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
}
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg)
-@@ -2168,7 +2184,7 @@ static bool consume_stock(struct mem_cgr
+@@ -2065,7 +2081,7 @@ static bool consume_stock(struct mem_cgr
if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
@@ -131,7 +130,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
stock = this_cpu_ptr(&memcg_stock);
if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
-@@ -2176,7 +2192,7 @@ static bool consume_stock(struct mem_cgr
+@@ -2073,7 +2089,7 @@ static bool consume_stock(struct mem_cgr
ret = true;
}
@@ -140,7 +139,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
return ret;
}
-@@ -2204,38 +2220,43 @@ static void drain_stock(struct memcg_sto
+@@ -2101,38 +2117,43 @@ static void drain_stock(struct memcg_sto
static void drain_local_stock(struct work_struct *dummy)
{
@@ -192,16 +191,16 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
{
- struct memcg_stock_pcp *stock;
- unsigned long flags;
+-
+- local_irq_save(flags);
+ struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
-- local_irq_save(flags);
--
- stock = this_cpu_ptr(&memcg_stock);
+ lockdep_assert_held(&stock->stock_lock);
if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
css_get(&memcg->css);
-@@ -2245,8 +2266,20 @@ static void refill_stock(struct mem_cgro
+@@ -2142,8 +2163,20 @@ static void refill_stock(struct mem_cgro
if (stock->nr_pages > MEMCG_CHARGE_BATCH)
drain_stock(stock);
@@ -223,7 +222,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
}
/*
-@@ -2255,7 +2288,7 @@ static void refill_stock(struct mem_cgro
+@@ -2152,7 +2185,7 @@ static void refill_stock(struct mem_cgro
*/
static void drain_all_stock(struct mem_cgroup *root_memcg)
{
@@ -232,7 +231,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
/* If someone's already draining, avoid adding running more workers. */
if (!mutex_trylock(&percpu_charge_mutex))
-@@ -2266,7 +2299,7 @@ static void drain_all_stock(struct mem_c
+@@ -2163,7 +2196,7 @@ static void drain_all_stock(struct mem_c
* as well as workers from this path always operate on the local
* per-cpu data. CPU up doesn't touch memcg_stock at all.
*/
@@ -241,7 +240,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
-@@ -2282,14 +2315,10 @@ static void drain_all_stock(struct mem_c
+@@ -2179,14 +2212,10 @@ static void drain_all_stock(struct mem_c
rcu_read_unlock();
if (flush &&
@@ -259,7 +258,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
mutex_unlock(&percpu_charge_mutex);
}
-@@ -2690,7 +2719,7 @@ static int try_charge_memcg(struct mem_c
+@@ -2587,7 +2616,7 @@ static int try_charge_memcg(struct mem_c
done_restock:
if (batch > nr_pages)
@@ -268,7 +267,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
/*
* If the hierarchy is above the normal consumption range, schedule
-@@ -2803,28 +2832,36 @@ static struct mem_cgroup *get_mem_cgroup
+@@ -2700,28 +2729,36 @@ static struct mem_cgroup *get_mem_cgroup
* can only be accessed after disabling interrupt. User context code can
* access interrupt object stock, but not vice versa.
*/
@@ -314,7 +313,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
}
/*
-@@ -3002,7 +3039,8 @@ static void memcg_free_cache_id(int id)
+@@ -2899,7 +2936,8 @@ static void memcg_free_cache_id(int id)
* @nr_pages: number of pages to uncharge
*/
static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
@@ -324,7 +323,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
{
struct mem_cgroup *memcg;
-@@ -3010,7 +3048,7 @@ static void obj_cgroup_uncharge_pages(st
+@@ -2907,7 +2945,7 @@ static void obj_cgroup_uncharge_pages(st
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->kmem, nr_pages);
@@ -333,7 +332,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
css_put(&memcg->css);
}
-@@ -3084,7 +3122,7 @@ void __memcg_kmem_uncharge_page(struct p
+@@ -2981,7 +3019,7 @@ void __memcg_kmem_uncharge_page(struct p
return;
objcg = __folio_objcg(folio);
@@ -342,7 +341,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
folio->memcg_data = 0;
obj_cgroup_put(objcg);
}
-@@ -3092,17 +3130,21 @@ void __memcg_kmem_uncharge_page(struct p
+@@ -2989,17 +3027,21 @@ void __memcg_kmem_uncharge_page(struct p
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
@@ -366,7 +365,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
obj_cgroup_get(objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
-@@ -3146,38 +3188,43 @@ void mod_objcg_state(struct obj_cgroup *
+@@ -3043,38 +3085,43 @@ void mod_objcg_state(struct obj_cgroup *
if (nr)
mod_objcg_mlstate(objcg, pgdat, idx, nr);
@@ -416,7 +415,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
/*
* The leftover is flushed to the centralized per-memcg value.
-@@ -3212,8 +3259,8 @@ static void drain_obj_stock(struct obj_s
+@@ -3109,8 +3156,8 @@ static void drain_obj_stock(struct obj_s
stock->cached_pgdat = NULL;
}
@@ -426,7 +425,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
}
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
-@@ -3221,11 +3268,13 @@ static bool obj_stock_flush_required(str
+@@ -3118,11 +3165,13 @@ static bool obj_stock_flush_required(str
{
struct mem_cgroup *memcg;
@@ -440,7 +439,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
if (stock->irq_obj.cached_objcg) {
memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
-@@ -3238,12 +3287,15 @@ static bool obj_stock_flush_required(str
+@@ -3135,12 +3184,15 @@ static bool obj_stock_flush_required(str
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
bool allow_uncharge)
{
@@ -458,7 +457,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
obj_cgroup_get(objcg);
stock->cached_objcg = objcg;
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
-@@ -3257,10 +3309,12 @@ static void refill_obj_stock(struct obj_
+@@ -3154,10 +3206,12 @@ static void refill_obj_stock(struct obj_
stock->nr_bytes &= (PAGE_SIZE - 1);
}
@@ -473,7 +472,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-3-bigeasy@linutronix.de
}
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
-@@ -7061,7 +7115,7 @@ void mem_cgroup_uncharge_skmem(struct me
+@@ -7041,7 +7095,7 @@ void mem_cgroup_uncharge_skmem(struct me
mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
diff --git a/patches/0003_random_split_add_interrupt_randomness.patch b/patches/0003_random_split_add_interrupt_randomness.patch
index bd0750993a37..d94c0ca75a94 100644
--- a/patches/0003_random_split_add_interrupt_randomness.patch
+++ b/patches/0003_random_split_add_interrupt_randomness.patch
@@ -21,7 +21,7 @@ Link: https://lore.kernel.org/r/20211207121737.2347312-4-bigeasy@linutronix.de
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
-@@ -1242,29 +1242,10 @@ static __u32 get_reg(struct fast_pool *f
+@@ -1260,29 +1260,10 @@ static __u32 get_reg(struct fast_pool *f
return *ptr;
}
@@ -52,7 +52,7 @@ Link: https://lore.kernel.org/r/20211207121737.2347312-4-bigeasy@linutronix.de
if (unlikely(crng_init == 0)) {
if ((fast_pool->count >= 64) &&
-@@ -1293,6 +1274,32 @@ void add_interrupt_randomness(int irq)
+@@ -1311,6 +1292,32 @@ void add_interrupt_randomness(int irq)
/* award one bit for the contents of the fast pool */
credit_entropy_bits(r, 1);
}
diff --git a/patches/0003-mm-memcg-Allow-the-task_obj-optimization-only-on-non.patch b/patches/0004-mm-memcg-Allow-the-task_obj-optimization-only-on-non.patch
index 624edfa49778..1bfba2709995 100644
--- a/patches/0003-mm-memcg-Allow-the-task_obj-optimization-only-on-non.patch
+++ b/patches/0004-mm-memcg-Allow-the-task_obj-optimization-only-on-non.patch
@@ -1,6 +1,6 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 22 Dec 2021 12:16:27 +0100
-Subject: [PATCH 3/3] mm/memcg: Allow the task_obj optimization only on
+Subject: [PATCH 4/4] mm/memcg: Allow the task_obj optimization only on
non-PREEMPTIBLE kernels.
Based on my understanding the optimisation with task_obj for in_task()
@@ -11,17 +11,21 @@ With CONFIG_PREEMPT_DYNAMIC a non-PREEMPTIBLE kernel can also be
configured but these kernels always have preempt_disable()/enable()
present so it probably makes no sense here for the optimisation.
+I did a micro benchmark with disabled interrupts and a loop of
+100.000.000 invocations of kfree(kmalloc()). Based on the results it
+makes no sense to add an exception based on dynamic preemption.
+
Restrict the optimisation to !CONFIG_PREEMPTION kernels.
+Link: https://lore.kernel.org/all/YdX+INO9gQje6d0S@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
-Link: https://lkml.kernel.org/r/20211222114111.2206248-4-bigeasy@linutronix.de
---
mm/memcontrol.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
-@@ -2126,7 +2126,7 @@ struct memcg_stock_pcp {
+@@ -2023,7 +2023,7 @@ struct memcg_stock_pcp {
local_lock_t stock_lock;
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
@@ -30,7 +34,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-4-bigeasy@linutronix.de
/* Protects only task_obj */
local_lock_t task_obj_lock;
struct obj_stock task_obj;
-@@ -2139,7 +2139,7 @@ struct memcg_stock_pcp {
+@@ -2036,7 +2036,7 @@ struct memcg_stock_pcp {
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
.stock_lock = INIT_LOCAL_LOCK(stock_lock),
@@ -39,7 +43,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-4-bigeasy@linutronix.de
.task_obj_lock = INIT_LOCAL_LOCK(task_obj_lock),
#endif
};
-@@ -2228,7 +2228,7 @@ static void drain_local_stock(struct wor
+@@ -2125,7 +2125,7 @@ static void drain_local_stock(struct wor
* drain_stock races is that we always operate on local CPU stock
* here with IRQ disabled
*/
@@ -48,7 +52,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-4-bigeasy@linutronix.de
local_lock(&memcg_stock.task_obj_lock);
old = drain_obj_stock(&this_cpu_ptr(&memcg_stock)->task_obj, NULL);
local_unlock(&memcg_stock.task_obj_lock);
-@@ -2837,7 +2837,7 @@ static inline struct obj_stock *get_obj_
+@@ -2734,7 +2734,7 @@ static inline struct obj_stock *get_obj_
{
struct memcg_stock_pcp *stock;
@@ -57,7 +61,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-4-bigeasy@linutronix.de
if (likely(in_task())) {
*pflags = 0UL;
*stock_lock_acquried = false;
-@@ -2855,7 +2855,7 @@ static inline struct obj_stock *get_obj_
+@@ -2752,7 +2752,7 @@ static inline struct obj_stock *get_obj_
static inline void put_obj_stock(unsigned long flags,
bool stock_lock_acquried)
{
@@ -66,7 +70,7 @@ Link: https://lkml.kernel.org/r/20211222114111.2206248-4-bigeasy@linutronix.de
if (likely(!stock_lock_acquried)) {
local_unlock(&memcg_stock.task_obj_lock);
return;
-@@ -3268,7 +3268,7 @@ static bool obj_stock_flush_required(str
+@@ -3165,7 +3165,7 @@ static bool obj_stock_flush_required(str
{
struct mem_cgroup *memcg;
diff --git a/patches/0004_random_move_the_fast_pool_reset_into_the_caller.patch b/patches/0004_random_move_the_fast_pool_reset_into_the_caller.patch
index 62e8ae44d325..9c8b5dc57660 100644
--- a/patches/0004_random_move_the_fast_pool_reset_into_the_caller.patch
+++ b/patches/0004_random_move_the_fast_pool_reset_into_the_caller.patch
@@ -16,7 +16,7 @@ Link: https://lore.kernel.org/r/20211207121737.2347312-5-bigeasy@linutronix.de
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
-@@ -1242,37 +1242,35 @@ static __u32 get_reg(struct fast_pool *f
+@@ -1260,37 +1260,35 @@ static __u32 get_reg(struct fast_pool *f
return *ptr;
}
@@ -65,7 +65,7 @@ Link: https://lore.kernel.org/r/20211207121737.2347312-5-bigeasy@linutronix.de
}
void add_interrupt_randomness(int irq)
-@@ -1298,7 +1296,10 @@ void add_interrupt_randomness(int irq)
+@@ -1316,7 +1314,10 @@ void add_interrupt_randomness(int irq)
fast_mix(fast_pool);
add_interrupt_bench(cycles);
diff --git a/patches/0005_random_defer_processing_of_randomness_on_preempt_rt.patch b/patches/0005_random_defer_processing_of_randomness_on_preempt_rt.patch
index 6a1164699e75..70bb19289067 100644
--- a/patches/0005_random_defer_processing_of_randomness_on_preempt_rt.patch
+++ b/patches/0005_random_defer_processing_of_randomness_on_preempt_rt.patch
@@ -39,7 +39,7 @@ Link: https://lore.kernel.org/r/20211207121737.2347312-6-bigeasy@linutronix.de
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
-@@ -1273,6 +1273,32 @@ static bool process_interrupt_randomness
+@@ -1291,6 +1291,32 @@ static bool process_interrupt_randomness
return true;
}
@@ -72,7 +72,7 @@ Link: https://lore.kernel.org/r/20211207121737.2347312-6-bigeasy@linutronix.de
void add_interrupt_randomness(int irq)
{
struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
-@@ -1296,9 +1322,16 @@ void add_interrupt_randomness(int irq)
+@@ -1314,9 +1340,16 @@ void add_interrupt_randomness(int irq)
fast_mix(fast_pool);
add_interrupt_bench(cycles);
diff --git a/patches/Add_localversion_for_-RT_release.patch b/patches/Add_localversion_for_-RT_release.patch
index 22146ab020cb..efeddd431fc4 100644
--- a/patches/Add_localversion_for_-RT_release.patch
+++ b/patches/Add_localversion_for_-RT_release.patch
@@ -15,4 +15,4 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
--- /dev/null
+++ b/localversion-rt
@@ -0,0 +1 @@
-+-rt16
++-rt17
diff --git a/patches/i2c-core-Let-i2c_handle_smbus_host_notify-use-handle.patch b/patches/i2c-core-Let-i2c_handle_smbus_host_notify-use-handle.patch
new file mode 100644
index 000000000000..719ffc5da81d
--- /dev/null
+++ b/patches/i2c-core-Let-i2c_handle_smbus_host_notify-use-handle.patch
@@ -0,0 +1,40 @@
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Wed, 19 Jan 2022 16:10:39 +0100
+Subject: [PATCH] i2c: core: Let i2c_handle_smbus_host_notify() use
+ handle_nested_irq() on PREEMPT_RT.
+
+The i2c-i801 driver invokes i2c_handle_smbus_host_notify() from its
+interrupt service routine. On PREEMPT_RT i2c-i801's handler is forced
+threaded and runs with enabled interrupts, which leads to a warning from
+handle_irq_event_percpu() assuming that irq_default_primary_handler()
+enabled interrupts.
+
+i2c-i801's interrupt handler can't be made non-threaded because the
+interrupt line is shared with other devices.
+All i2c host drivers' interrupt handlers are (force-)threaded on
+PREEMPT_RT.
+
+Handle the IRQs by invoking handle_nested_irq() on PREEMPT_RT.
+
+Reported-by: Michael Below <below@judiz.de>
+Link: https://bugs.debian.org/1002537
+Cc: Salvatore Bonaccorso <carnil@debian.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ drivers/i2c/i2c-core-base.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/drivers/i2c/i2c-core-base.c
++++ b/drivers/i2c/i2c-core-base.c
+@@ -1423,7 +1423,10 @@ int i2c_handle_smbus_host_notify(struct
+ if (irq <= 0)
+ return -ENXIO;
+
+- generic_handle_irq(irq);
++ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
++ generic_handle_irq(irq);
++ else
++ handle_nested_irq(irq);
+
+ return 0;
+ }
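
For illustration, a hedged sketch of the calling context this patch is about:
a host driver's interrupt handler, force-threaded on PREEMPT_RT because the
line is shared, forwarding a Host Notify event to the core. The foo_* names
and the register layout are hypothetical; only i2c_handle_smbus_host_notify()
and devm_request_irq() are real APIs.

#include <linux/device.h>
#include <linux/i2c.h>
#include <linux/interrupt.h>
#include <linux/io.h>

struct foo_smbus {                      /* hypothetical driver state */
        struct i2c_adapter *adapter;
        void __iomem *regs;
};

static irqreturn_t foo_smbus_isr(int irq, void *dev_id)
{
        struct foo_smbus *priv = dev_id;
        unsigned short addr = readw(priv->regs) & 0x7f; /* assumed layout */

        /*
         * On PREEMPT_RT this runs in a threaded handler with interrupts
         * enabled; the core therefore has to dispatch the Host Notify IRQ
         * via handle_nested_irq() instead of generic_handle_irq().
         */
        i2c_handle_smbus_host_notify(priv->adapter, addr);
        return IRQ_HANDLED;
}

static int foo_smbus_request_irq(struct device *dev, int irq,
                                 struct foo_smbus *priv)
{
        /* Shared line: IRQF_NO_THREAD is not an option here. */
        return devm_request_irq(dev, irq, foo_smbus_isr, IRQF_SHARED,
                                dev_name(dev), priv);
}
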
diff --git a/patches/i2c-rcar-Allow-interrupt-handler-to-be-threaded.patch b/patches/i2c-rcar-Allow-interrupt-handler-to-be-threaded.patch
new file mode 100644
index 000000000000..bccd17f553fd
--- /dev/null
+++ b/patches/i2c-rcar-Allow-interrupt-handler-to-be-threaded.patch
@@ -0,0 +1,49 @@
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Wed, 19 Jan 2022 15:56:50 +0100
+Subject: [PATCH] i2c: rcar: Allow interrupt handler to be threaded.
+
+This is a revert of commit
+ 24c6d4bc56388 ("i2c: rcar: make sure irq is not threaded on Gen2 and earlier")
+
+The IRQ handler is not threaded unless requested. On PREEMPT_RT the
+handler must be threaded because it acquires spinlock_t locks, which
+are sleeping locks on PREEMPT_RT. The locks are taken either via the
+DMA API or through a possible wake_up() invocation.
+
+Remove IRQF_NO_THREAD from flags passed to request_irq().
+
+Fixes: 24c6d4bc56388 ("i2c: rcar: make sure irq is not threaded on Gen2 and earlier")
+Cc: Wolfram Sang <wsa+renesas@sang-engineering.com>
+Cc: linux-renesas-soc@vger.kernel.org
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ drivers/i2c/busses/i2c-rcar.c | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+--- a/drivers/i2c/busses/i2c-rcar.c
++++ b/drivers/i2c/busses/i2c-rcar.c
+@@ -1025,7 +1025,6 @@ static int rcar_i2c_probe(struct platfor
+ struct rcar_i2c_priv *priv;
+ struct i2c_adapter *adap;
+ struct device *dev = &pdev->dev;
+- unsigned long irqflags = 0;
+ irqreturn_t (*irqhandler)(int irq, void *ptr) = rcar_i2c_gen3_irq;
+ int ret;
+
+@@ -1076,7 +1075,6 @@ static int rcar_i2c_probe(struct platfor
+ rcar_i2c_write(priv, ICSAR, 0); /* Gen2: must be 0 if not using slave */
+
+ if (priv->devtype < I2C_RCAR_GEN3) {
+- irqflags |= IRQF_NO_THREAD;
+ irqhandler = rcar_i2c_gen2_irq;
+ }
+
+@@ -1102,7 +1100,7 @@ static int rcar_i2c_probe(struct platfor
+ if (ret < 0)
+ goto out_pm_disable;
+ priv->irq = ret;
+- ret = devm_request_irq(dev, priv->irq, irqhandler, irqflags, dev_name(dev), priv);
++ ret = devm_request_irq(dev, priv->irq, irqhandler, 0, dev_name(dev), priv);
+ if (ret < 0) {
+ dev_err(dev, "cannot get irq %d\n", priv->irq);
+ goto out_pm_disable;
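
As a brief illustration of the constraint behind the revert (the foo_* names
are hypothetical): on PREEMPT_RT spinlock_t is a sleeping lock, so a handler
that takes one, e.g. via the DMA API or a wake_up() call, must be allowed to
run in a threaded context.

#include <linux/interrupt.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(foo_lock);       /* spinlock_t: sleeps on PREEMPT_RT */

static irqreturn_t foo_irq(int irq, void *data)
{
        /*
         * Legal in a (force-)threaded handler; not legal in hard IRQ
         * context on PREEMPT_RT, where spin_lock() may sleep.
         */
        spin_lock(&foo_lock);
        /* ... update shared state, possibly wake_up() a waiter ... */
        spin_unlock(&foo_lock);
        return IRQ_HANDLED;
}

/*
 * Hence the IRQ is requested without IRQF_NO_THREAD so that the core is
 * free to thread it:
 *
 *      devm_request_irq(dev, irq, foo_irq, 0, dev_name(dev), priv);
 */
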
diff --git a/patches/locking-local_lock-Make-the-empty-local_lock_-functi.patch b/patches/locking-local_lock-Make-the-empty-local_lock_-functi.patch
new file mode 100644
index 000000000000..52501259d245
--- /dev/null
+++ b/patches/locking-local_lock-Make-the-empty-local_lock_-functi.patch
@@ -0,0 +1,40 @@
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Wed, 5 Jan 2022 10:53:55 +0100
+Subject: [PATCH] locking/local_lock: Make the empty local_lock_*() function a
+ macro.
+
+It has been said that local_lock() does not add any overhead compared to
+preempt_disable() in a !LOCKDEP configuration. A microbenchmark showed
+an unexpected result which can be reduced to the fact that local_lock()
+was not entirely optimized away.
+In the !LOCKDEP configuration local_lock_acquire() is an empty static
+inline function. On x86 the this_cpu_ptr() argument of that function is
+fully evaluated, leading to additional mov+add instructions which are
+neither needed nor used.
+
+Replace the static inline function with a macro. The typecheck() macro
+ensures that the argument is of the proper type, while the resulting
+disassembly shows no traces of this_cpu_ptr().
+
+Link: https://lkml.kernel.org/r/20220105202623.1118172-1-bigeasy@linutronix.de
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Reviewed-by: Waiman Long <longman@redhat.com>
+---
+ include/linux/local_lock_internal.h | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/include/linux/local_lock_internal.h
++++ b/include/linux/local_lock_internal.h
+@@ -44,9 +44,9 @@ static inline void local_lock_debug_init
+ }
+ #else /* CONFIG_DEBUG_LOCK_ALLOC */
+ # define LOCAL_LOCK_DEBUG_INIT(lockname)
+-static inline void local_lock_acquire(local_lock_t *l) { }
+-static inline void local_lock_release(local_lock_t *l) { }
+-static inline void local_lock_debug_init(local_lock_t *l) { }
++# define local_lock_acquire(__ll) do { typecheck(local_lock_t *, __ll); } while (0)
++# define local_lock_release(__ll) do { typecheck(local_lock_t *, __ll); } while (0)
++# define local_lock_debug_init(__ll) do { typecheck(local_lock_t *, __ll); } while (0)
+ #endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+ #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) }
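
For illustration, a reduced sketch of the idea (my_typecheck and the simplified
local_lock_t are stand-ins, not the kernel's definitions from
include/linux/typecheck.h and include/linux/local_lock_internal.h): a macro
built around a typecheck() style construct inspects only the type of its
argument, so an expression such as this_cpu_ptr(&lock) is never evaluated,
while the old empty inline function still received, and thus evaluated, that
expression.

typedef struct { int dummy; } local_lock_t;    /* simplified stand-in */

#define my_typecheck(type, x)                   \
({                                              \
        type __dummy;                           \
        typeof(x) __dummy2;                     \
        (void)(&__dummy == &__dummy2);          \
        1;                                      \
})

/* Old form: the argument expression (e.g. this_cpu_ptr(&lock)) is an
 * actual function argument and may be materialized as mov+add even
 * though the body is empty.
 */
static inline void old_local_lock_acquire(local_lock_t *l) { }

/* New form: only the type is checked, the expression is never evaluated,
 * so no per-CPU address computation is emitted.
 */
#define new_local_lock_acquire(__ll)            \
        do { my_typecheck(local_lock_t *, __ll); } while (0)
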
diff --git a/patches/printk__remove_deferred_printing.patch b/patches/printk__remove_deferred_printing.patch
index 6e99ed585191..17af3e09682d 100644
--- a/patches/printk__remove_deferred_printing.patch
+++ b/patches/printk__remove_deferred_printing.patch
@@ -162,7 +162,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
({ \
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
-@@ -1507,9 +1507,8 @@ static void _warn_unseeded_randomness(co
+@@ -1525,9 +1525,8 @@ static void _warn_unseeded_randomness(co
print_once = true;
#endif
if (__ratelimit(&unseeded_warning))
@@ -782,7 +782,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
-@@ -4826,9 +4826,7 @@ void show_one_workqueue(struct workqueue
+@@ -4845,9 +4845,7 @@ void show_one_workqueue(struct workqueue
* drivers that queue work while holding locks
* also taken in their write paths.
*/
@@ -792,7 +792,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
}
raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
/*
-@@ -4859,7 +4857,6 @@ static void show_one_worker_pool(struct
+@@ -4878,7 +4876,6 @@ static void show_one_worker_pool(struct
* queue work while holding locks also taken in their write
* paths.
*/
@@ -800,7 +800,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
pr_info("pool %d:", pool->id);
pr_cont_pool_info(pool);
pr_cont(" hung=%us workers=%d",
-@@ -4874,7 +4871,6 @@ static void show_one_worker_pool(struct
+@@ -4893,7 +4890,6 @@ static void show_one_worker_pool(struct
first = false;
}
pr_cont("\n");
diff --git a/patches/series b/patches/series
index 44e701ef85e9..b8a4fe43d56a 100644
--- a/patches/series
+++ b/patches/series
@@ -58,6 +58,7 @@ smp_wake_ksoftirqd_on_preempt_rt_instead_do_softirq.patch
fscache-Use-only-one-fscache_object_cong_wait.patch
tcp-Don-t-acquire-inet_listen_hashbucket-lock-with-d.patch
panic_remove_oops_id.patch
+locking-local_lock-Make-the-empty-local_lock_-functi.patch
# sched
0001_kernel_fork_redo_ifdefs_around_task_s_handling.patch
@@ -77,9 +78,10 @@ panic_remove_oops_id.patch
0005_random_defer_processing_of_randomness_on_preempt_rt.patch
# cgroup
-0001-mm-memcg-Protect-per-CPU-counter-by-disabling-preemp.patch
-0002-mm-memcg-Add-a-local_lock_t-for-IRQ-and-TASK-object.patch
-0003-mm-memcg-Allow-the-task_obj-optimization-only-on-non.patch
+0001-mm-memcg-Disable-threshold-event-handlers-on-PREEMPT.patch
+0002-mm-memcg-Protect-per-CPU-counter-by-disabling-preemp.patch
+0003-mm-memcg-Add-a-local_lock_t-for-IRQ-and-TASK-object.patch
+0004-mm-memcg-Allow-the-task_obj-optimization-only-on-non.patch
###########################################################################
# Post
@@ -88,6 +90,10 @@ cgroup__use_irqsave_in_cgroup_rstat_flush_locked.patch
mm__workingset__replace_IRQ-off_check_with_a_lockdep_assert..patch
softirq-Use-a-dedicated-thread-for-timer-wakeups.patch
+# These two need some feedback.
+i2c-rcar-Allow-interrupt-handler-to-be-threaded.patch
+i2c-core-Let-i2c_handle_smbus_host_notify-use-handle.patch
+
###########################################################################
# Kconfig bits:
###########################################################################