From 5beb0b705c1e1173ebf6e53d33f7f5e0045a2835 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 7 Sep 2021 12:43:49 +0200 Subject: [ANNOUNCE] v5.14.1-rt19 Dear RT folks! I'm pleased to announce the v5.14.1-rt19 patch set. Changes since v5.14.1-rt18: - Dan Carpenter reported a possible NULL pointer dereference in rt_mutex_adjust_prio_chain(). Patch by Peter Zijlstra. - Unused rt_rwlock_is_contended() has been removed. Reported by the kernel test robot. - The "play idle" timer was missing a _HARD annotation. It would freeze the system upon activation of the intel powerclamp driver. Reported by Thomas Gleixner. - Vlastimil Babka's SLUB queue has been updated to v6r2. Known issues - netconsole triggers WARN. - The "Memory controller" (CONFIG_MEMCG) has been disabled. - An RCU and ARM64 warning has been fixed by Valentin Schneider. It is still not clear if the RCU-related change is correct. - Clark Williams reported issues in i915 (execlists_dequeue_irq()). - Valentin Schneider reported a few splats on ARM64, see https://lkml.kernel.org/r/20210810134127.1394269-1-valentin.schneider@arm.com/ The delta patch against v5.14.1-rt18 is appended below and can be found here: https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/incr/patch-5.14.1-rt18-rt19.patch.xz You can get this release via the git tree at: git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.14.1-rt19 The RT patch against v5.14.1 can be found here: https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/older/patch-5.14.1-rt19.patch.xz The split quilt queue is available at: https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/older/patches-5.14.1-rt19.tar.xz Sebastian Signed-off-by: Sebastian Andrzej Siewior --- ...-t-call-flush_all-from-slab_debug_trace_o.patch | 5 +- ...ocate-private-object-map-for-debugfs-list.patch | 5 +- ...ocate-private-object-map-for-validate_sla.patch | 5 +- ...-t-disable-irq-for-debug_check_no_locks_f.patch | 5 +- ...ove-redundant-unfreeze_partials-from-put_.patch | 5 +- ...extract-get_partial-from-new_slab_objects.patch | 57 +++ ...fy-cmpxchg_double_slab-and-__cmpxchg_doub.patch | 122 ------ ...solve-new_slab_objects-into-___slab_alloc.patch | 98 +++++ ...extract-get_partial-from-new_slab_objects.patch | 58 --- ...solve-new_slab_objects-into-___slab_alloc.patch | 99 ----- ...urn-slab-page-from-get_partial-and-set-c-.patch | 101 +++++ ...tructure-new-page-checks-in-___slab_alloc.patch | 58 +++ ...urn-slab-page-from-get_partial-and-set-c-.patch | 102 ----- ...tructure-new-page-checks-in-___slab_alloc.patch | 59 --- ...lub-simplify-kmem_cache_cpu-and-tid-setup.patch | 61 +++ ...e-disabling-enabling-irqs-to-___slab_allo.patch | 179 +++++++++ ...lub-simplify-kmem_cache_cpu-and-tid-setup.patch | 62 --- ...initial-checks-in-___slab_alloc-with-irqs.patch | 153 ++++++++ ...e-disabling-enabling-irqs-to-___slab_allo.patch | 180 --------- ...initial-checks-in-___slab_alloc-with-irqs.patch | 154 -------- ...e-disabling-irqs-closer-to-get_partial-in.patch | 96 +++++ ...e-disabling-irqs-closer-to-get_partial-in.patch | 97 ----- ...slub-restore-irqs-around-calling-new_slab.patch | 54 +++ ...slub-restore-irqs-around-calling-new_slab.patch | 55 --- ...idate-slab-from-partial-list-or-page-allo.patch | 76 ++++ ...m-slub-check-new-pages-with-restored-irqs.patch | 69 ++++ ...idate-slab-from-partial-list-or-page-allo.patch | 77 ---- ...m-slub-check-new-pages-with-restored-irqs.patch | 70 ---- ...ub-stop-disabling-irqs-around-get_partial.patch | 86 +++++
...e-reset-of-c-page-and-freelist-out-of-dea.patch | 93 +++++ ...ub-stop-disabling-irqs-around-get_partial.patch | 87 ----- ...-make-locking-in-deactivate_slab-irq-safe.patch | 61 +++ ...e-reset-of-c-page-and-freelist-out-of-dea.patch | 94 ----- ...ll-deactivate_slab-without-disabling-irqs.patch | 71 ++++ ...-make-locking-in-deactivate_slab-irq-safe.patch | 62 --- ...ll-deactivate_slab-without-disabling-irqs.patch | 72 ---- ...b-move-irq-control-into-unfreeze_partials.patch | 61 +++ ...card-slabs-in-unfreeze_partials-without-i.patch | 32 ++ ...b-move-irq-control-into-unfreeze_partials.patch | 62 --- ...ach-whole-partial-list-at-once-in-unfreez.patch | 39 ++ ...card-slabs-in-unfreeze_partials-without-i.patch | 33 -- ...ach-whole-partial-list-at-once-in-unfreez.patch | 40 -- ...arate-detaching-of-partial-list-in-unfree.patch | 156 ++++++++ ...y-disable-irq-with-spin_lock-in-__unfreez.patch | 51 +++ ...arate-detaching-of-partial-list-in-unfree.patch | 157 -------- ...-slub-don-t-disable-irqs-in-slub_cpu_dead.patch | 30 ++ ...y-disable-irq-with-spin_lock-in-__unfreez.patch | 52 --- ...it-out-the-cpu-offline-variant-of-flush_s.patch | 44 +++ ...-slub-don-t-disable-irqs-in-slub_cpu_dead.patch | 31 -- ...e-flush_slab-possible-to-call-with-irqs-e.patch | 69 ---- ...e-flush_cpu_slab-invocations-__free_slab-.patch | 211 +++++++++++ ...e-flush_cpu_slab-invocations-__free_slab-.patch | 179 --------- ...lub-make-object_map_lock-a-raw_spinlock_t.patch | 44 +++ ...lub-Make-object_map_lock-a-raw_spinlock_t.patch | 45 --- ...ke-slab_lock-disable-irqs-with-PREEMPT_RT.patch | 186 +++++++++ ...ionally-save-restore-irqs-in-slab_-un-loc.patch | 150 -------- ...tect-put_cpu_partial-with-disabled-irqs-i.patch | 167 ++++++++ ...ke-slab_lock-disable-irqs-with-PREEMPT_RT.patch | 59 --- ...mm-slub-use-migrate_disable-on-PREEMPT_RT.patch | 120 ++++++ ...vert-kmem_cpu_slab-protection-to-local_lo.patch | 420 +++++++++++++++++++++ ...tect-put_cpu_partial-with-disabled-irqs-i.patch | 168 --------- ...mm-slub-use-migrate_disable-on-PREEMPT_RT.patch | 121 ------ ...vert-kmem_cpu_slab-protection-to-local_lo.patch | 403 -------------------- patches/Add_localversion_for_-RT_release.patch | 2 +- .../locking-Remove-rt_rwlock_is_contended.patch | 33 ++ ...cking-rtmutex-Fix-ww_mutex-deadlock-check.patch | 38 ++ ...the-idle-timer-expire-always-in-hardirq-c.patch | 38 ++ patches/series | 63 ++-- 68 files changed, 3026 insertions(+), 3066 deletions(-) create mode 100644 patches/0006-mm-slub-extract-get_partial-from-new_slab_objects.patch delete mode 100644 patches/0006-mm-slub-unify-cmpxchg_double_slab-and-__cmpxchg_doub.patch create mode 100644 patches/0007-mm-slub-dissolve-new_slab_objects-into-___slab_alloc.patch delete mode 100644 patches/0007-mm-slub-extract-get_partial-from-new_slab_objects.patch delete mode 100644 patches/0008-mm-slub-dissolve-new_slab_objects-into-___slab_alloc.patch create mode 100644 patches/0008-mm-slub-return-slab-page-from-get_partial-and-set-c-.patch create mode 100644 patches/0009-mm-slub-restructure-new-page-checks-in-___slab_alloc.patch delete mode 100644 patches/0009-mm-slub-return-slab-page-from-get_partial-and-set-c-.patch delete mode 100644 patches/0010-mm-slub-restructure-new-page-checks-in-___slab_alloc.patch create mode 100644 patches/0010-mm-slub-simplify-kmem_cache_cpu-and-tid-setup.patch create mode 100644 patches/0011-mm-slub-move-disabling-enabling-irqs-to-___slab_allo.patch delete mode 100644 patches/0011-mm-slub-simplify-kmem_cache_cpu-and-tid-setup.patch create mode 100644 
patches/0012-mm-slub-do-initial-checks-in-___slab_alloc-with-irqs.patch delete mode 100644 patches/0012-mm-slub-move-disabling-enabling-irqs-to-___slab_allo.patch delete mode 100644 patches/0013-mm-slub-do-initial-checks-in-___slab_alloc-with-irqs.patch create mode 100644 patches/0013-mm-slub-move-disabling-irqs-closer-to-get_partial-in.patch delete mode 100644 patches/0014-mm-slub-move-disabling-irqs-closer-to-get_partial-in.patch create mode 100644 patches/0014-mm-slub-restore-irqs-around-calling-new_slab.patch delete mode 100644 patches/0015-mm-slub-restore-irqs-around-calling-new_slab.patch create mode 100644 patches/0015-mm-slub-validate-slab-from-partial-list-or-page-allo.patch create mode 100644 patches/0016-mm-slub-check-new-pages-with-restored-irqs.patch delete mode 100644 patches/0016-mm-slub-validate-slab-from-partial-list-or-page-allo.patch delete mode 100644 patches/0017-mm-slub-check-new-pages-with-restored-irqs.patch create mode 100644 patches/0017-mm-slub-stop-disabling-irqs-around-get_partial.patch create mode 100644 patches/0018-mm-slub-move-reset-of-c-page-and-freelist-out-of-dea.patch delete mode 100644 patches/0018-mm-slub-stop-disabling-irqs-around-get_partial.patch create mode 100644 patches/0019-mm-slub-make-locking-in-deactivate_slab-irq-safe.patch delete mode 100644 patches/0019-mm-slub-move-reset-of-c-page-and-freelist-out-of-dea.patch create mode 100644 patches/0020-mm-slub-call-deactivate_slab-without-disabling-irqs.patch delete mode 100644 patches/0020-mm-slub-make-locking-in-deactivate_slab-irq-safe.patch delete mode 100644 patches/0021-mm-slub-call-deactivate_slab-without-disabling-irqs.patch create mode 100644 patches/0021-mm-slub-move-irq-control-into-unfreeze_partials.patch create mode 100644 patches/0022-mm-slub-discard-slabs-in-unfreeze_partials-without-i.patch delete mode 100644 patches/0022-mm-slub-move-irq-control-into-unfreeze_partials.patch create mode 100644 patches/0023-mm-slub-detach-whole-partial-list-at-once-in-unfreez.patch delete mode 100644 patches/0023-mm-slub-discard-slabs-in-unfreeze_partials-without-i.patch delete mode 100644 patches/0024-mm-slub-detach-whole-partial-list-at-once-in-unfreez.patch create mode 100644 patches/0024-mm-slub-separate-detaching-of-partial-list-in-unfree.patch create mode 100644 patches/0025-mm-slub-only-disable-irq-with-spin_lock-in-__unfreez.patch delete mode 100644 patches/0025-mm-slub-separate-detaching-of-partial-list-in-unfree.patch create mode 100644 patches/0026-mm-slub-don-t-disable-irqs-in-slub_cpu_dead.patch delete mode 100644 patches/0026-mm-slub-only-disable-irq-with-spin_lock-in-__unfreez.patch create mode 100644 patches/0027-mm-slab-split-out-the-cpu-offline-variant-of-flush_s.patch delete mode 100644 patches/0027-mm-slub-don-t-disable-irqs-in-slub_cpu_dead.patch delete mode 100644 patches/0028-mm-slab-make-flush_slab-possible-to-call-with-irqs-e.patch create mode 100644 patches/0028-mm-slub-move-flush_cpu_slab-invocations-__free_slab-.patch delete mode 100644 patches/0029-mm-slub-Move-flush_cpu_slab-invocations-__free_slab-.patch create mode 100644 patches/0029-mm-slub-make-object_map_lock-a-raw_spinlock_t.patch delete mode 100644 patches/0030-mm-slub-Make-object_map_lock-a-raw_spinlock_t.patch create mode 100644 patches/0030-mm-slub-make-slab_lock-disable-irqs-with-PREEMPT_RT.patch delete mode 100644 patches/0031-mm-slub-optionally-save-restore-irqs-in-slab_-un-loc.patch create mode 100644 patches/0031-mm-slub-protect-put_cpu_partial-with-disabled-irqs-i.patch delete mode 100644 
patches/0032-mm-slub-make-slab_lock-disable-irqs-with-PREEMPT_RT.patch create mode 100644 patches/0032-mm-slub-use-migrate_disable-on-PREEMPT_RT.patch create mode 100644 patches/0033-mm-slub-convert-kmem_cpu_slab-protection-to-local_lo.patch delete mode 100644 patches/0033-mm-slub-protect-put_cpu_partial-with-disabled-irqs-i.patch delete mode 100644 patches/0034-mm-slub-use-migrate_disable-on-PREEMPT_RT.patch delete mode 100644 patches/0035-mm-slub-convert-kmem_cpu_slab-protection-to-local_lo.patch create mode 100644 patches/locking-Remove-rt_rwlock_is_contended.patch create mode 100644 patches/locking-rtmutex-Fix-ww_mutex-deadlock-check.patch create mode 100644 patches/sched-Make-the-idle-timer-expire-always-in-hardirq-c.patch diff --git a/patches/0001-mm-slub-don-t-call-flush_all-from-slab_debug_trace_o.patch b/patches/0001-mm-slub-don-t-call-flush_all-from-slab_debug_trace_o.patch index 1e0a3c0e60c0..0c84435c45a3 100644 --- a/patches/0001-mm-slub-don-t-call-flush_all-from-slab_debug_trace_o.patch +++ b/patches/0001-mm-slub-don-t-call-flush_all-from-slab_debug_trace_o.patch @@ -1,6 +1,6 @@ From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:20:58 +0200 -Subject: [PATCH 01/35] mm, slub: don't call flush_all() from +Date: Fri, 28 May 2021 14:32:10 +0200 +Subject: [PATCH 01/33] mm, slub: don't call flush_all() from slab_debug_trace_open() slab_debug_trace_open() can only be called on caches with SLAB_STORE_USER flag @@ -9,7 +9,6 @@ slabs altogether, so there's nothing to flush. Signed-off-by: Vlastimil Babka Acked-by: Christoph Lameter -Signed-off-by: Sebastian Andrzej Siewior --- mm/slub.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/patches/0002-mm-slub-allocate-private-object-map-for-debugfs-list.patch b/patches/0002-mm-slub-allocate-private-object-map-for-debugfs-list.patch index 1d9cf5525b15..4900e14e9304 100644 --- a/patches/0002-mm-slub-allocate-private-object-map-for-debugfs-list.patch +++ b/patches/0002-mm-slub-allocate-private-object-map-for-debugfs-list.patch @@ -1,6 +1,6 @@ From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:20:59 +0200 -Subject: [PATCH 02/35] mm, slub: allocate private object map for debugfs +Date: Sun, 23 May 2021 01:28:37 +0200 +Subject: [PATCH 02/33] mm, slub: allocate private object map for debugfs listings Slub has a static spinlock protected bitmap for marking which objects are on @@ -16,7 +16,6 @@ to use a private bitmap. Signed-off-by: Vlastimil Babka Acked-by: Christoph Lameter Acked-by: Mel Gorman -Signed-off-by: Sebastian Andrzej Siewior --- mm/slub.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/patches/0003-mm-slub-allocate-private-object-map-for-validate_sla.patch b/patches/0003-mm-slub-allocate-private-object-map-for-validate_sla.patch index dd9c26775a55..6fc0b18f9099 100644 --- a/patches/0003-mm-slub-allocate-private-object-map-for-validate_sla.patch +++ b/patches/0003-mm-slub-allocate-private-object-map-for-validate_sla.patch @@ -1,6 +1,6 @@ From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:00 +0200 -Subject: [PATCH 03/35] mm, slub: allocate private object map for +Date: Sun, 23 May 2021 01:37:07 +0200 +Subject: [PATCH 03/33] mm, slub: allocate private object map for validate_slab_cache() validate_slab_cache() is called either to handle a sysfs write, or from a @@ -11,7 +11,6 @@ critical sections, so let's do that. 
Signed-off-by: Vlastimil Babka Acked-by: Christoph Lameter Acked-by: Mel Gorman -Signed-off-by: Sebastian Andrzej Siewior --- mm/slub.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/patches/0004-mm-slub-don-t-disable-irq-for-debug_check_no_locks_f.patch b/patches/0004-mm-slub-don-t-disable-irq-for-debug_check_no_locks_f.patch index aebe358c82ba..f2c670fa2b35 100644 --- a/patches/0004-mm-slub-don-t-disable-irq-for-debug_check_no_locks_f.patch +++ b/patches/0004-mm-slub-don-t-disable-irq-for-debug_check_no_locks_f.patch @@ -1,6 +1,6 @@ From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:01 +0200 -Subject: [PATCH 04/35] mm, slub: don't disable irq for +Date: Fri, 21 May 2021 01:25:06 +0200 +Subject: [PATCH 04/33] mm, slub: don't disable irq for debug_check_no_locks_freed() In slab_free_hook() we disable irqs around the debug_check_no_locks_freed() @@ -13,7 +13,6 @@ Mel noted: Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman -Signed-off-by: Sebastian Andrzej Siewior --- mm/slub.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/patches/0005-mm-slub-remove-redundant-unfreeze_partials-from-put_.patch b/patches/0005-mm-slub-remove-redundant-unfreeze_partials-from-put_.patch index e9adac1c5455..a07027ba2f30 100644 --- a/patches/0005-mm-slub-remove-redundant-unfreeze_partials-from-put_.patch +++ b/patches/0005-mm-slub-remove-redundant-unfreeze_partials-from-put_.patch @@ -1,6 +1,6 @@ From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:02 +0200 -Subject: [PATCH 05/35] mm, slub: remove redundant unfreeze_partials() from +Date: Tue, 8 Jun 2021 01:19:03 +0200 +Subject: [PATCH 05/33] mm, slub: remove redundant unfreeze_partials() from put_cpu_partial() Commit d6e0b7fa1186 ("slub: make dead caches discard free slabs immediately") @@ -17,7 +17,6 @@ unfreeze_partials() which could be thus also considered unnecessary. But further patches will rely on it, so keep it. Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior --- mm/slub.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/patches/0006-mm-slub-extract-get_partial-from-new_slab_objects.patch b/patches/0006-mm-slub-extract-get_partial-from-new_slab_objects.patch new file mode 100644 index 000000000000..6da43addac38 --- /dev/null +++ b/patches/0006-mm-slub-extract-get_partial-from-new_slab_objects.patch @@ -0,0 +1,57 @@ +From: Vlastimil Babka +Date: Tue, 11 May 2021 12:45:48 +0200 +Subject: [PATCH 06/33] mm, slub: extract get_partial() from new_slab_objects() + +The later patches will need more fine grained control over individual actions +in ___slab_alloc(), the only caller of new_slab_objects(), so this is a first +preparatory step with no functional change. + +This adds a goto label that appears unnecessary at this point, but will be +useful for later changes. 
+ +Signed-off-by: Vlastimil Babka +Acked-by: Christoph Lameter +--- + mm/slub.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2613,17 +2613,12 @@ slab_out_of_memory(struct kmem_cache *s, + static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, + int node, struct kmem_cache_cpu **pc) + { +- void *freelist; ++ void *freelist = NULL; + struct kmem_cache_cpu *c = *pc; + struct page *page; + + WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); + +- freelist = get_partial(s, flags, node, c); +- +- if (freelist) +- return freelist; +- + page = new_slab(s, flags, node); + if (page) { + c = raw_cpu_ptr(s->cpu_slab); +@@ -2787,6 +2782,10 @@ static void *___slab_alloc(struct kmem_c + goto redo; + } + ++ freelist = get_partial(s, gfpflags, node, c); ++ if (freelist) ++ goto check_new_page; ++ + freelist = new_slab_objects(s, gfpflags, node, &c); + + if (unlikely(!freelist)) { +@@ -2794,6 +2793,7 @@ static void *___slab_alloc(struct kmem_c + return NULL; + } + ++check_new_page: + page = c->page; + if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) + goto load_freelist; diff --git a/patches/0006-mm-slub-unify-cmpxchg_double_slab-and-__cmpxchg_doub.patch b/patches/0006-mm-slub-unify-cmpxchg_double_slab-and-__cmpxchg_doub.patch deleted file mode 100644 index b76ba63b77ba..000000000000 --- a/patches/0006-mm-slub-unify-cmpxchg_double_slab-and-__cmpxchg_doub.patch +++ /dev/null @@ -1,122 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:03 +0200 -Subject: [PATCH 06/35] mm, slub: unify cmpxchg_double_slab() and - __cmpxchg_double_slab() - -These functions differ only in irq disabling in the slow path. We can create a -common function with an extra bool parameter to control the irq disabling. -As the functions are inline and the parameter compile-time constant, there -will be no runtime overhead due to this change. - -Also change the DEBUG_VM based irqs disable assert to the more standard -lockdep_assert based one. 
- -Signed-off-by: Vlastimil Babka -Acked-by: Christoph Lameter -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 62 ++++++++++++++++++++++++-------------------------------------- - 1 file changed, 24 insertions(+), 38 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -371,13 +371,13 @@ static __always_inline void slab_unlock( - __bit_spin_unlock(PG_locked, &page->flags); - } - --/* Interrupts must be disabled (for the fallback code to work right) */ --static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, -+static inline bool ___cmpxchg_double_slab(struct kmem_cache *s, struct page *page, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new, -- const char *n) -+ const char *n, bool disable_irqs) - { -- VM_BUG_ON(!irqs_disabled()); -+ if (!disable_irqs) -+ lockdep_assert_irqs_disabled(); - #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - if (s->flags & __CMPXCHG_DOUBLE) { -@@ -388,15 +388,23 @@ static inline bool __cmpxchg_double_slab - } else - #endif - { -+ unsigned long flags; -+ -+ if (disable_irqs) -+ local_irq_save(flags); - slab_lock(page); - if (page->freelist == freelist_old && - page->counters == counters_old) { - page->freelist = freelist_new; - page->counters = counters_new; - slab_unlock(page); -+ if (disable_irqs) -+ local_irq_restore(flags); - return true; - } - slab_unlock(page); -+ if (disable_irqs) -+ local_irq_restore(flags); - } - - cpu_relax(); -@@ -409,45 +417,23 @@ static inline bool __cmpxchg_double_slab - return false; - } - --static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, -+/* Interrupts must be disabled (for the fallback code to work right) */ -+static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new, - const char *n) - { --#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ -- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) -- if (s->flags & __CMPXCHG_DOUBLE) { -- if (cmpxchg_double(&page->freelist, &page->counters, -- freelist_old, counters_old, -- freelist_new, counters_new)) -- return true; -- } else --#endif -- { -- unsigned long flags; -- -- local_irq_save(flags); -- slab_lock(page); -- if (page->freelist == freelist_old && -- page->counters == counters_old) { -- page->freelist = freelist_new; -- page->counters = counters_new; -- slab_unlock(page); -- local_irq_restore(flags); -- return true; -- } -- slab_unlock(page); -- local_irq_restore(flags); -- } -- -- cpu_relax(); -- stat(s, CMPXCHG_DOUBLE_FAIL); -- --#ifdef SLUB_DEBUG_CMPXCHG -- pr_info("%s %s: cmpxchg double redo ", n, s->name); --#endif -+ return ___cmpxchg_double_slab(s, page, freelist_old, counters_old, -+ freelist_new, counters_new, n, false); -+} - -- return false; -+static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, -+ void *freelist_old, unsigned long counters_old, -+ void *freelist_new, unsigned long counters_new, -+ const char *n) -+{ -+ return ___cmpxchg_double_slab(s, page, freelist_old, counters_old, -+ freelist_new, counters_new, n, true); - } - - #ifdef CONFIG_SLUB_DEBUG diff --git a/patches/0007-mm-slub-dissolve-new_slab_objects-into-___slab_alloc.patch b/patches/0007-mm-slub-dissolve-new_slab_objects-into-___slab_alloc.patch new file mode 100644 index 000000000000..9deaec0605cc --- /dev/null +++ b/patches/0007-mm-slub-dissolve-new_slab_objects-into-___slab_alloc.patch @@ -0,0 +1,98 @@ 
+From: Vlastimil Babka +Date: Tue, 11 May 2021 13:01:34 +0200 +Subject: [PATCH 07/33] mm, slub: dissolve new_slab_objects() into + ___slab_alloc() + +The later patches will need more fine grained control over individual actions +in ___slab_alloc(), the only caller of new_slab_objects(), so dissolve it +there. This is a preparatory step with no functional change. + +The only minor change is moving WARN_ON_ONCE() for using a constructor together +with __GFP_ZERO to new_slab(), which makes it somewhat less frequent, but still +able to catch a development change introducing a systematic misuse. + +Signed-off-by: Vlastimil Babka +Acked-by: Christoph Lameter +Acked-by: Mel Gorman +--- + mm/slub.c | 50 ++++++++++++++++++-------------------------------- + 1 file changed, 18 insertions(+), 32 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -1885,6 +1885,8 @@ static struct page *new_slab(struct kmem + if (unlikely(flags & GFP_SLAB_BUG_MASK)) + flags = kmalloc_fix_flags(flags); + ++ WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); ++ + return allocate_slab(s, + flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); + } +@@ -2610,36 +2612,6 @@ slab_out_of_memory(struct kmem_cache *s, + #endif + } + +-static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, +- int node, struct kmem_cache_cpu **pc) +-{ +- void *freelist = NULL; +- struct kmem_cache_cpu *c = *pc; +- struct page *page; +- +- WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); +- +- page = new_slab(s, flags, node); +- if (page) { +- c = raw_cpu_ptr(s->cpu_slab); +- if (c->page) +- flush_slab(s, c); +- +- /* +- * No other reference to the page yet so we can +- * muck around with it freely without cmpxchg +- */ +- freelist = page->freelist; +- page->freelist = NULL; +- +- stat(s, ALLOC_SLAB); +- c->page = page; +- *pc = c; +- } +- +- return freelist; +-} +- + static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) + { + if (unlikely(PageSlabPfmemalloc(page))) +@@ -2786,13 +2758,27 @@ static void *___slab_alloc(struct kmem_c + if (freelist) + goto check_new_page; + +- freelist = new_slab_objects(s, gfpflags, node, &c); ++ page = new_slab(s, gfpflags, node); + +- if (unlikely(!freelist)) { ++ if (unlikely(!page)) { + slab_out_of_memory(s, gfpflags, node); + return NULL; + } + ++ c = raw_cpu_ptr(s->cpu_slab); ++ if (c->page) ++ flush_slab(s, c); ++ ++ /* ++ * No other reference to the page yet so we can ++ * muck around with it freely without cmpxchg ++ */ ++ freelist = page->freelist; ++ page->freelist = NULL; ++ ++ stat(s, ALLOC_SLAB); ++ c->page = page; ++ + check_new_page: + page = c->page; + if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) diff --git a/patches/0007-mm-slub-extract-get_partial-from-new_slab_objects.patch b/patches/0007-mm-slub-extract-get_partial-from-new_slab_objects.patch deleted file mode 100644 index 38665c4767ff..000000000000 --- a/patches/0007-mm-slub-extract-get_partial-from-new_slab_objects.patch +++ /dev/null @@ -1,58 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:04 +0200 -Subject: [PATCH 07/35] mm, slub: extract get_partial() from new_slab_objects() - -The later patches will need more fine grained control over individual actions -in ___slab_alloc(), the only caller of new_slab_objects(), so this is a first -preparatory step with no functional change. - -This adds a goto label that appears unnecessary at this point, but will be -useful for later changes. 
- -Signed-off-by: Vlastimil Babka -Acked-by: Christoph Lameter -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 12 ++++++------ - 1 file changed, 6 insertions(+), 6 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2599,17 +2599,12 @@ slab_out_of_memory(struct kmem_cache *s, - static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, - int node, struct kmem_cache_cpu **pc) - { -- void *freelist; -+ void *freelist = NULL; - struct kmem_cache_cpu *c = *pc; - struct page *page; - - WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); - -- freelist = get_partial(s, flags, node, c); -- -- if (freelist) -- return freelist; -- - page = new_slab(s, flags, node); - if (page) { - c = raw_cpu_ptr(s->cpu_slab); -@@ -2773,6 +2768,10 @@ static void *___slab_alloc(struct kmem_c - goto redo; - } - -+ freelist = get_partial(s, gfpflags, node, c); -+ if (freelist) -+ goto check_new_page; -+ - freelist = new_slab_objects(s, gfpflags, node, &c); - - if (unlikely(!freelist)) { -@@ -2780,6 +2779,7 @@ static void *___slab_alloc(struct kmem_c - return NULL; - } - -+check_new_page: - page = c->page; - if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) - goto load_freelist; diff --git a/patches/0008-mm-slub-dissolve-new_slab_objects-into-___slab_alloc.patch b/patches/0008-mm-slub-dissolve-new_slab_objects-into-___slab_alloc.patch deleted file mode 100644 index f9434d20cd3e..000000000000 --- a/patches/0008-mm-slub-dissolve-new_slab_objects-into-___slab_alloc.patch +++ /dev/null @@ -1,99 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:05 +0200 -Subject: [PATCH 08/35] mm, slub: dissolve new_slab_objects() into - ___slab_alloc() - -The later patches will need more fine grained control over individual actions -in ___slab_alloc(), the only caller of new_slab_objects(), so dissolve it -there. This is a preparatory step with no functional change. - -The only minor change is moving WARN_ON_ONCE() for using a constructor together -with __GFP_ZERO to new_slab(), which makes it somewhat less frequent, but still -able to catch a development change introducing a systematic misuse. 
- -Signed-off-by: Vlastimil Babka -Acked-by: Christoph Lameter -Acked-by: Mel Gorman -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 50 ++++++++++++++++++-------------------------------- - 1 file changed, 18 insertions(+), 32 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -1871,6 +1871,8 @@ static struct page *new_slab(struct kmem - if (unlikely(flags & GFP_SLAB_BUG_MASK)) - flags = kmalloc_fix_flags(flags); - -+ WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); -+ - return allocate_slab(s, - flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); - } -@@ -2596,36 +2598,6 @@ slab_out_of_memory(struct kmem_cache *s, - #endif - } - --static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, -- int node, struct kmem_cache_cpu **pc) --{ -- void *freelist = NULL; -- struct kmem_cache_cpu *c = *pc; -- struct page *page; -- -- WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); -- -- page = new_slab(s, flags, node); -- if (page) { -- c = raw_cpu_ptr(s->cpu_slab); -- if (c->page) -- flush_slab(s, c); -- -- /* -- * No other reference to the page yet so we can -- * muck around with it freely without cmpxchg -- */ -- freelist = page->freelist; -- page->freelist = NULL; -- -- stat(s, ALLOC_SLAB); -- c->page = page; -- *pc = c; -- } -- -- return freelist; --} -- - static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) - { - if (unlikely(PageSlabPfmemalloc(page))) -@@ -2772,13 +2744,27 @@ static void *___slab_alloc(struct kmem_c - if (freelist) - goto check_new_page; - -- freelist = new_slab_objects(s, gfpflags, node, &c); -+ page = new_slab(s, gfpflags, node); - -- if (unlikely(!freelist)) { -+ if (unlikely(!page)) { - slab_out_of_memory(s, gfpflags, node); - return NULL; - } - -+ c = raw_cpu_ptr(s->cpu_slab); -+ if (c->page) -+ flush_slab(s, c); -+ -+ /* -+ * No other reference to the page yet so we can -+ * muck around with it freely without cmpxchg -+ */ -+ freelist = page->freelist; -+ page->freelist = NULL; -+ -+ stat(s, ALLOC_SLAB); -+ c->page = page; -+ - check_new_page: - page = c->page; - if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) diff --git a/patches/0008-mm-slub-return-slab-page-from-get_partial-and-set-c-.patch b/patches/0008-mm-slub-return-slab-page-from-get_partial-and-set-c-.patch new file mode 100644 index 000000000000..eb941ec46be4 --- /dev/null +++ b/patches/0008-mm-slub-return-slab-page-from-get_partial-and-set-c-.patch @@ -0,0 +1,101 @@ +From: Vlastimil Babka +Date: Tue, 11 May 2021 14:05:22 +0200 +Subject: [PATCH 08/33] mm, slub: return slab page from get_partial() and set + c->page afterwards + +The function get_partial() finds a suitable page on a partial list, acquires +and returns its freelist and assigns the page pointer to kmem_cache_cpu. +In later patch we will need more control over the kmem_cache_cpu.page +assignment, so instead of passing a kmem_cache_cpu pointer, pass a pointer to a +pointer to a page that get_partial() can fill and the caller can assign the +kmem_cache_cpu.page pointer. No functional change as all of this still happens +with disabled IRQs. + +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 21 +++++++++++---------- + 1 file changed, 11 insertions(+), 10 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2017,7 +2017,7 @@ static inline bool pfmemalloc_match(stru + * Try to allocate a partial slab from a specific node. 
+ */ + static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, +- struct kmem_cache_cpu *c, gfp_t flags) ++ struct page **ret_page, gfp_t flags) + { + struct page *page, *page2; + void *object = NULL; +@@ -2046,7 +2046,7 @@ static void *get_partial_node(struct kme + + available += objects; + if (!object) { +- c->page = page; ++ *ret_page = page; + stat(s, ALLOC_FROM_PARTIAL); + object = t; + } else { +@@ -2066,7 +2066,7 @@ static void *get_partial_node(struct kme + * Get a page from somewhere. Search in increasing NUMA distances. + */ + static void *get_any_partial(struct kmem_cache *s, gfp_t flags, +- struct kmem_cache_cpu *c) ++ struct page **ret_page) + { + #ifdef CONFIG_NUMA + struct zonelist *zonelist; +@@ -2108,7 +2108,7 @@ static void *get_any_partial(struct kmem + + if (n && cpuset_zone_allowed(zone, flags) && + n->nr_partial > s->min_partial) { +- object = get_partial_node(s, n, c, flags); ++ object = get_partial_node(s, n, ret_page, flags); + if (object) { + /* + * Don't check read_mems_allowed_retry() +@@ -2130,7 +2130,7 @@ static void *get_any_partial(struct kmem + * Get a partial page, lock it and return it. + */ + static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, +- struct kmem_cache_cpu *c) ++ struct page **ret_page) + { + void *object; + int searchnode = node; +@@ -2138,11 +2138,11 @@ static void *get_partial(struct kmem_cac + if (node == NUMA_NO_NODE) + searchnode = numa_mem_id(); + +- object = get_partial_node(s, get_node(s, searchnode), c, flags); ++ object = get_partial_node(s, get_node(s, searchnode), ret_page, flags); + if (object || node != NUMA_NO_NODE) + return object; + +- return get_any_partial(s, flags, c); ++ return get_any_partial(s, flags, ret_page); + } + + #ifdef CONFIG_PREEMPTION +@@ -2754,9 +2754,11 @@ static void *___slab_alloc(struct kmem_c + goto redo; + } + +- freelist = get_partial(s, gfpflags, node, c); +- if (freelist) ++ freelist = get_partial(s, gfpflags, node, &page); ++ if (freelist) { ++ c->page = page; + goto check_new_page; ++ } + + page = new_slab(s, gfpflags, node); + +@@ -2780,7 +2782,6 @@ static void *___slab_alloc(struct kmem_c + c->page = page; + + check_new_page: +- page = c->page; + if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) + goto load_freelist; + diff --git a/patches/0009-mm-slub-restructure-new-page-checks-in-___slab_alloc.patch b/patches/0009-mm-slub-restructure-new-page-checks-in-___slab_alloc.patch new file mode 100644 index 000000000000..00b71cf553b5 --- /dev/null +++ b/patches/0009-mm-slub-restructure-new-page-checks-in-___slab_alloc.patch @@ -0,0 +1,58 @@ +From: Vlastimil Babka +Date: Tue, 11 May 2021 18:25:09 +0200 +Subject: [PATCH 09/33] mm, slub: restructure new page checks in + ___slab_alloc() + +When we allocate slab object from a newly acquired page (from node's partial +list or page allocator), we usually also retain the page as a new percpu slab. +There are two exceptions - when pfmemalloc status of the page doesn't match our +gfp flags, or when the cache has debugging enabled. + +The current code for these decisions is not easy to follow, so restructure it +and add comments. The new structure will also help with the following changes. +No functional change. 
+ +Signed-off-by: Vlastimil Babka +Acked-by: Mel Gorman +--- + mm/slub.c | 28 ++++++++++++++++++++++------ + 1 file changed, 22 insertions(+), 6 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2782,13 +2782,29 @@ static void *___slab_alloc(struct kmem_c + c->page = page; + + check_new_page: +- if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) +- goto load_freelist; + +- /* Only entered in the debug case */ +- if (kmem_cache_debug(s) && +- !alloc_debug_processing(s, page, freelist, addr)) +- goto new_slab; /* Slab failed checks. Next slab needed */ ++ if (kmem_cache_debug(s)) { ++ if (!alloc_debug_processing(s, page, freelist, addr)) ++ /* Slab failed checks. Next slab needed */ ++ goto new_slab; ++ else ++ /* ++ * For debug case, we don't load freelist so that all ++ * allocations go through alloc_debug_processing() ++ */ ++ goto return_single; ++ } ++ ++ if (unlikely(!pfmemalloc_match(page, gfpflags))) ++ /* ++ * For !pfmemalloc_match() case we don't load freelist so that ++ * we don't make further mismatched allocations easier. ++ */ ++ goto return_single; ++ ++ goto load_freelist; ++ ++return_single: + + deactivate_slab(s, page, get_freepointer(s, freelist), c); + return freelist; diff --git a/patches/0009-mm-slub-return-slab-page-from-get_partial-and-set-c-.patch b/patches/0009-mm-slub-return-slab-page-from-get_partial-and-set-c-.patch deleted file mode 100644 index 2c06299abe75..000000000000 --- a/patches/0009-mm-slub-return-slab-page-from-get_partial-and-set-c-.patch +++ /dev/null @@ -1,102 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:06 +0200 -Subject: [PATCH 09/35] mm, slub: return slab page from get_partial() and set - c->page afterwards - -The function get_partial() finds a suitable page on a partial list, acquires -and returns its freelist and assigns the page pointer to kmem_cache_cpu. -In later patch we will need more control over the kmem_cache_cpu.page -assignment, so instead of passing a kmem_cache_cpu pointer, pass a pointer to a -pointer to a page that get_partial() can fill and the caller can assign the -kmem_cache_cpu.page pointer. No functional change as all of this still happens -with disabled IRQs. - -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 21 +++++++++++---------- - 1 file changed, 11 insertions(+), 10 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2003,7 +2003,7 @@ static inline bool pfmemalloc_match(stru - * Try to allocate a partial slab from a specific node. - */ - static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, -- struct kmem_cache_cpu *c, gfp_t flags) -+ struct page **ret_page, gfp_t flags) - { - struct page *page, *page2; - void *object = NULL; -@@ -2032,7 +2032,7 @@ static void *get_partial_node(struct kme - - available += objects; - if (!object) { -- c->page = page; -+ *ret_page = page; - stat(s, ALLOC_FROM_PARTIAL); - object = t; - } else { -@@ -2052,7 +2052,7 @@ static void *get_partial_node(struct kme - * Get a page from somewhere. Search in increasing NUMA distances. 
- */ - static void *get_any_partial(struct kmem_cache *s, gfp_t flags, -- struct kmem_cache_cpu *c) -+ struct page **ret_page) - { - #ifdef CONFIG_NUMA - struct zonelist *zonelist; -@@ -2094,7 +2094,7 @@ static void *get_any_partial(struct kmem - - if (n && cpuset_zone_allowed(zone, flags) && - n->nr_partial > s->min_partial) { -- object = get_partial_node(s, n, c, flags); -+ object = get_partial_node(s, n, ret_page, flags); - if (object) { - /* - * Don't check read_mems_allowed_retry() -@@ -2116,7 +2116,7 @@ static void *get_any_partial(struct kmem - * Get a partial page, lock it and return it. - */ - static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, -- struct kmem_cache_cpu *c) -+ struct page **ret_page) - { - void *object; - int searchnode = node; -@@ -2124,11 +2124,11 @@ static void *get_partial(struct kmem_cac - if (node == NUMA_NO_NODE) - searchnode = numa_mem_id(); - -- object = get_partial_node(s, get_node(s, searchnode), c, flags); -+ object = get_partial_node(s, get_node(s, searchnode), ret_page, flags); - if (object || node != NUMA_NO_NODE) - return object; - -- return get_any_partial(s, flags, c); -+ return get_any_partial(s, flags, ret_page); - } - - #ifdef CONFIG_PREEMPTION -@@ -2740,9 +2740,11 @@ static void *___slab_alloc(struct kmem_c - goto redo; - } - -- freelist = get_partial(s, gfpflags, node, c); -- if (freelist) -+ freelist = get_partial(s, gfpflags, node, &page); -+ if (freelist) { -+ c->page = page; - goto check_new_page; -+ } - - page = new_slab(s, gfpflags, node); - -@@ -2766,7 +2768,6 @@ static void *___slab_alloc(struct kmem_c - c->page = page; - - check_new_page: -- page = c->page; - if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) - goto load_freelist; - diff --git a/patches/0010-mm-slub-restructure-new-page-checks-in-___slab_alloc.patch b/patches/0010-mm-slub-restructure-new-page-checks-in-___slab_alloc.patch deleted file mode 100644 index b5511a0f418e..000000000000 --- a/patches/0010-mm-slub-restructure-new-page-checks-in-___slab_alloc.patch +++ /dev/null @@ -1,59 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:07 +0200 -Subject: [PATCH 10/35] mm, slub: restructure new page checks in - ___slab_alloc() - -When we allocate slab object from a newly acquired page (from node's partial -list or page allocator), we usually also retain the page as a new percpu slab. -There are two exceptions - when pfmemalloc status of the page doesn't match our -gfp flags, or when the cache has debugging enabled. - -The current code for these decisions is not easy to follow, so restructure it -and add comments. The new structure will also help with the following changes. -No functional change. - -Signed-off-by: Vlastimil Babka -Acked-by: Mel Gorman -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 28 ++++++++++++++++++++++------ - 1 file changed, 22 insertions(+), 6 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2768,13 +2768,29 @@ static void *___slab_alloc(struct kmem_c - c->page = page; - - check_new_page: -- if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) -- goto load_freelist; - -- /* Only entered in the debug case */ -- if (kmem_cache_debug(s) && -- !alloc_debug_processing(s, page, freelist, addr)) -- goto new_slab; /* Slab failed checks. Next slab needed */ -+ if (kmem_cache_debug(s)) { -+ if (!alloc_debug_processing(s, page, freelist, addr)) -+ /* Slab failed checks. 
Next slab needed */ -+ goto new_slab; -+ else -+ /* -+ * For debug case, we don't load freelist so that all -+ * allocations go through alloc_debug_processing() -+ */ -+ goto return_single; -+ } -+ -+ if (unlikely(!pfmemalloc_match(page, gfpflags))) -+ /* -+ * For !pfmemalloc_match() case we don't load freelist so that -+ * we don't make further mismatched allocations easier. -+ */ -+ goto return_single; -+ -+ goto load_freelist; -+ -+return_single: - - deactivate_slab(s, page, get_freepointer(s, freelist), c); - return freelist; diff --git a/patches/0010-mm-slub-simplify-kmem_cache_cpu-and-tid-setup.patch b/patches/0010-mm-slub-simplify-kmem_cache_cpu-and-tid-setup.patch new file mode 100644 index 000000000000..8bb5b3f0f758 --- /dev/null +++ b/patches/0010-mm-slub-simplify-kmem_cache_cpu-and-tid-setup.patch @@ -0,0 +1,61 @@ +From: Vlastimil Babka +Date: Tue, 18 May 2021 02:01:39 +0200 +Subject: [PATCH 10/33] mm, slub: simplify kmem_cache_cpu and tid setup + +In slab_alloc_node() and do_slab_free() fastpaths we need to guarantee that +our kmem_cache_cpu pointer is from the same cpu as the tid value. Currently +that's done by reading the tid first using this_cpu_read(), then the +kmem_cache_cpu pointer and verifying we read the same tid using the pointer and +plain READ_ONCE(). + +This can be simplified to just fetching kmem_cache_cpu pointer and then reading +tid using the pointer. That guarantees they are from the same cpu. We don't +need to read the tid using this_cpu_read() because the value will be validated +by this_cpu_cmpxchg_double(), making sure we are on the correct cpu and the +freelist didn't change by anyone preempting us since reading the tid. + +Signed-off-by: Vlastimil Babka +Acked-by: Mel Gorman +--- + mm/slub.c | 22 +++++++++------------- + 1 file changed, 9 insertions(+), 13 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2882,15 +2882,14 @@ static __always_inline void *slab_alloc_ + * reading from one cpu area. That does not matter as long + * as we end up on the original cpu again when doing the cmpxchg. + * +- * We should guarantee that tid and kmem_cache are retrieved on +- * the same cpu. It could be different if CONFIG_PREEMPTION so we need +- * to check if it is matched or not. ++ * We must guarantee that tid and kmem_cache_cpu are retrieved on the ++ * same cpu. We read first the kmem_cache_cpu pointer and use it to read ++ * the tid. If we are preempted and switched to another cpu between the ++ * two reads, it's OK as the two are still associated with the same cpu ++ * and cmpxchg later will validate the cpu. + */ +- do { +- tid = this_cpu_read(s->cpu_slab->tid); +- c = raw_cpu_ptr(s->cpu_slab); +- } while (IS_ENABLED(CONFIG_PREEMPTION) && +- unlikely(tid != READ_ONCE(c->tid))); ++ c = raw_cpu_ptr(s->cpu_slab); ++ tid = READ_ONCE(c->tid); + + /* + * Irqless object alloc/free algorithm used here depends on sequence +@@ -3164,11 +3163,8 @@ static __always_inline void do_slab_free + * data is retrieved via this pointer. If we are on the same cpu + * during the cmpxchg then the free will succeed. 
+ */ +- do { +- tid = this_cpu_read(s->cpu_slab->tid); +- c = raw_cpu_ptr(s->cpu_slab); +- } while (IS_ENABLED(CONFIG_PREEMPTION) && +- unlikely(tid != READ_ONCE(c->tid))); ++ c = raw_cpu_ptr(s->cpu_slab); ++ tid = READ_ONCE(c->tid); + + /* Same with comment on barrier() in slab_alloc_node() */ + barrier(); diff --git a/patches/0011-mm-slub-move-disabling-enabling-irqs-to-___slab_allo.patch b/patches/0011-mm-slub-move-disabling-enabling-irqs-to-___slab_allo.patch new file mode 100644 index 000000000000..a66be68ec007 --- /dev/null +++ b/patches/0011-mm-slub-move-disabling-enabling-irqs-to-___slab_allo.patch @@ -0,0 +1,179 @@ +From: Vlastimil Babka +Date: Fri, 7 May 2021 19:32:31 +0200 +Subject: [PATCH 11/33] mm, slub: move disabling/enabling irqs to + ___slab_alloc() + +Currently __slab_alloc() disables irqs around the whole ___slab_alloc(). This +includes cases where this is not needed, such as when the allocation ends up in +the page allocator and has to awkwardly enable irqs back based on gfp flags. +Also the whole kmem_cache_alloc_bulk() is executed with irqs disabled even when +it hits the __slab_alloc() slow path, and long periods with disabled interrupts +are undesirable. + +As a first step towards reducing irq disabled periods, move irq handling into +___slab_alloc(). Callers will instead prevent the s->cpu_slab percpu pointer +from becoming invalid via get_cpu_ptr(), thus preempt_disable(). This does not +protect against modification by an irq handler, which is still done by disabled +irq for most of ___slab_alloc(). As a small immediate benefit, +slab_out_of_memory() from ___slab_alloc() is now called with irqs enabled. + +kmem_cache_alloc_bulk() disables irqs for its fastpath and then re-enables them +before calling ___slab_alloc(), which then disables them at its discretion. The +whole kmem_cache_alloc_bulk() operation also disables preemption. + +When ___slab_alloc() calls new_slab() to allocate a new page, re-enable +preemption, because new_slab() will re-enable interrupts in contexts that allow +blocking (this will be improved by later patches). + +The patch itself will thus increase overhead a bit due to disabled preemption +(on configs where it matters) and increased disabling/enabling irqs in +kmem_cache_alloc_bulk(), but that will be gradually improved in the following +patches. + +Note in __slab_alloc() we need to change the #ifdef CONFIG_PREEMPT guard to +CONFIG_PREEMPT_COUNT to make sure preempt disable/enable is properly paired in +all configurations. On configs without involuntary preemption and debugging +the re-read of kmem_cache_cpu pointer is still compiled out as it was before. + +[ Mike Galbraith : Fix kmem_cache_alloc_bulk() error path ] +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 36 ++++++++++++++++++++++++------------ + 1 file changed, 24 insertions(+), 12 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2670,7 +2670,7 @@ static inline void *get_freelist(struct + * we need to allocate a new slab. This is the slowest path since it involves + * a call to the page allocator and the setup of a new slab. + * +- * Version of __slab_alloc to use when we know that interrupts are ++ * Version of __slab_alloc to use when we know that preemption is + * already disabled (which is the case for bulk allocation). 
+ */ + static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +@@ -2678,9 +2678,11 @@ static void *___slab_alloc(struct kmem_c + { + void *freelist; + struct page *page; ++ unsigned long flags; + + stat(s, ALLOC_SLOWPATH); + ++ local_irq_save(flags); + page = c->page; + if (!page) { + /* +@@ -2743,6 +2745,7 @@ static void *___slab_alloc(struct kmem_c + VM_BUG_ON(!c->page->frozen); + c->freelist = get_freepointer(s, freelist); + c->tid = next_tid(c->tid); ++ local_irq_restore(flags); + return freelist; + + new_slab: +@@ -2760,14 +2763,16 @@ static void *___slab_alloc(struct kmem_c + goto check_new_page; + } + ++ put_cpu_ptr(s->cpu_slab); + page = new_slab(s, gfpflags, node); ++ c = get_cpu_ptr(s->cpu_slab); + + if (unlikely(!page)) { ++ local_irq_restore(flags); + slab_out_of_memory(s, gfpflags, node); + return NULL; + } + +- c = raw_cpu_ptr(s->cpu_slab); + if (c->page) + flush_slab(s, c); + +@@ -2807,31 +2812,33 @@ static void *___slab_alloc(struct kmem_c + return_single: + + deactivate_slab(s, page, get_freepointer(s, freelist), c); ++ local_irq_restore(flags); + return freelist; + } + + /* +- * Another one that disabled interrupt and compensates for possible +- * cpu changes by refetching the per cpu area pointer. ++ * A wrapper for ___slab_alloc() for contexts where preemption is not yet ++ * disabled. Compensates for possible cpu changes by refetching the per cpu area ++ * pointer. + */ + static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c) + { + void *p; +- unsigned long flags; + +- local_irq_save(flags); +-#ifdef CONFIG_PREEMPTION ++#ifdef CONFIG_PREEMPT_COUNT + /* + * We may have been preempted and rescheduled on a different +- * cpu before disabling interrupts. Need to reload cpu area ++ * cpu before disabling preemption. Need to reload cpu area + * pointer. + */ +- c = this_cpu_ptr(s->cpu_slab); ++ c = get_cpu_ptr(s->cpu_slab); + #endif + + p = ___slab_alloc(s, gfpflags, node, addr, c); +- local_irq_restore(flags); ++#ifdef CONFIG_PREEMPT_COUNT ++ put_cpu_ptr(s->cpu_slab); ++#endif + return p; + } + +@@ -3359,8 +3366,8 @@ int kmem_cache_alloc_bulk(struct kmem_ca + * IRQs, which protects against PREEMPT and interrupts + * handlers invoking normal fastpath. + */ ++ c = get_cpu_ptr(s->cpu_slab); + local_irq_disable(); +- c = this_cpu_ptr(s->cpu_slab); + + for (i = 0; i < size; i++) { + void *object = kfence_alloc(s, s->object_size, flags); +@@ -3381,6 +3388,8 @@ int kmem_cache_alloc_bulk(struct kmem_ca + */ + c->tid = next_tid(c->tid); + ++ local_irq_enable(); ++ + /* + * Invoking slow path likely have side-effect + * of re-populating per CPU c->freelist +@@ -3393,6 +3402,8 @@ int kmem_cache_alloc_bulk(struct kmem_ca + c = this_cpu_ptr(s->cpu_slab); + maybe_wipe_obj_freeptr(s, p[i]); + ++ local_irq_disable(); ++ + continue; /* goto for-loop */ + } + c->freelist = get_freepointer(s, object); +@@ -3401,6 +3412,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca + } + c->tid = next_tid(c->tid); + local_irq_enable(); ++ put_cpu_ptr(s->cpu_slab); + + /* + * memcg and kmem_cache debug support and memory initialization. 
+@@ -3410,7 +3422,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca + slab_want_init_on_alloc(flags, s)); + return i; + error: +- local_irq_enable(); ++ put_cpu_ptr(s->cpu_slab); + slab_post_alloc_hook(s, objcg, flags, i, p, false); + __kmem_cache_free_bulk(s, i, p); + return 0; diff --git a/patches/0011-mm-slub-simplify-kmem_cache_cpu-and-tid-setup.patch b/patches/0011-mm-slub-simplify-kmem_cache_cpu-and-tid-setup.patch deleted file mode 100644 index 37f53c3424ae..000000000000 --- a/patches/0011-mm-slub-simplify-kmem_cache_cpu-and-tid-setup.patch +++ /dev/null @@ -1,62 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:08 +0200 -Subject: [PATCH 11/35] mm, slub: simplify kmem_cache_cpu and tid setup - -In slab_alloc_node() and do_slab_free() fastpaths we need to guarantee that -our kmem_cache_cpu pointer is from the same cpu as the tid value. Currently -that's done by reading the tid first using this_cpu_read(), then the -kmem_cache_cpu pointer and verifying we read the same tid using the pointer and -plain READ_ONCE(). - -This can be simplified to just fetching kmem_cache_cpu pointer and then reading -tid using the pointer. That guarantees they are from the same cpu. We don't -need to read the tid using this_cpu_read() because the value will be validated -by this_cpu_cmpxchg_double(), making sure we are on the correct cpu and the -freelist didn't change by anyone preempting us since reading the tid. - -Signed-off-by: Vlastimil Babka -Acked-by: Mel Gorman -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 22 +++++++++------------- - 1 file changed, 9 insertions(+), 13 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2868,15 +2868,14 @@ static __always_inline void *slab_alloc_ - * reading from one cpu area. That does not matter as long - * as we end up on the original cpu again when doing the cmpxchg. - * -- * We should guarantee that tid and kmem_cache are retrieved on -- * the same cpu. It could be different if CONFIG_PREEMPTION so we need -- * to check if it is matched or not. -+ * We must guarantee that tid and kmem_cache_cpu are retrieved on the -+ * same cpu. We read first the kmem_cache_cpu pointer and use it to read -+ * the tid. If we are preempted and switched to another cpu between the -+ * two reads, it's OK as the two are still associated with the same cpu -+ * and cmpxchg later will validate the cpu. - */ -- do { -- tid = this_cpu_read(s->cpu_slab->tid); -- c = raw_cpu_ptr(s->cpu_slab); -- } while (IS_ENABLED(CONFIG_PREEMPTION) && -- unlikely(tid != READ_ONCE(c->tid))); -+ c = raw_cpu_ptr(s->cpu_slab); -+ tid = READ_ONCE(c->tid); - - /* - * Irqless object alloc/free algorithm used here depends on sequence -@@ -3150,11 +3149,8 @@ static __always_inline void do_slab_free - * data is retrieved via this pointer. If we are on the same cpu - * during the cmpxchg then the free will succeed. 
- */ -- do { -- tid = this_cpu_read(s->cpu_slab->tid); -- c = raw_cpu_ptr(s->cpu_slab); -- } while (IS_ENABLED(CONFIG_PREEMPTION) && -- unlikely(tid != READ_ONCE(c->tid))); -+ c = raw_cpu_ptr(s->cpu_slab); -+ tid = READ_ONCE(c->tid); - - /* Same with comment on barrier() in slab_alloc_node() */ - barrier(); diff --git a/patches/0012-mm-slub-do-initial-checks-in-___slab_alloc-with-irqs.patch b/patches/0012-mm-slub-do-initial-checks-in-___slab_alloc-with-irqs.patch new file mode 100644 index 000000000000..2e9b8e7fd07d --- /dev/null +++ b/patches/0012-mm-slub-do-initial-checks-in-___slab_alloc-with-irqs.patch @@ -0,0 +1,153 @@ +From: Vlastimil Babka +Date: Sat, 8 May 2021 02:28:02 +0200 +Subject: [PATCH 12/33] mm, slub: do initial checks in ___slab_alloc() with + irqs enabled + +As another step of shortening irq disabled sections in ___slab_alloc(), delay +disabling irqs until we pass the initial checks if there is a cached percpu +slab and it's suitable for our allocation. + +Now we have to recheck c->page after actually disabling irqs as an allocation +in irq handler might have replaced it. + +Because we call pfmemalloc_match() as one of the checks, we might hit +VM_BUG_ON_PAGE(!PageSlab(page)) in PageSlabPfmemalloc in case we get +interrupted and the page is freed. Thus introduce a pfmemalloc_match_unsafe() +variant that lacks the PageSlab check. + +Signed-off-by: Vlastimil Babka +Acked-by: Mel Gorman +--- + include/linux/page-flags.h | 9 +++++++ + mm/slub.c | 54 +++++++++++++++++++++++++++++++++++++-------- + 2 files changed, 54 insertions(+), 9 deletions(-) + +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -815,6 +815,15 @@ static inline int PageSlabPfmemalloc(str + return PageActive(page); + } + ++/* ++ * A version of PageSlabPfmemalloc() for opportunistic checks where the page ++ * might have been freed under us and not be a PageSlab anymore. ++ */ ++static inline int __PageSlabPfmemalloc(struct page *page) ++{ ++ return PageActive(page); ++} ++ + static inline void SetPageSlabPfmemalloc(struct page *page) + { + VM_BUG_ON_PAGE(!PageSlab(page), page); +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2621,6 +2621,19 @@ static inline bool pfmemalloc_match(stru + } + + /* ++ * A variant of pfmemalloc_match() that tests page flags without asserting ++ * PageSlab. Intended for opportunistic checks before taking a lock and ++ * rechecking that nobody else freed the page under us. ++ */ ++static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags) ++{ ++ if (unlikely(__PageSlabPfmemalloc(page))) ++ return gfp_pfmemalloc_allowed(gfpflags); ++ ++ return true; ++} ++ ++/* + * Check the page->freelist of a page and either transfer the freelist to the + * per cpu freelist or deactivate the page. 
+ * +@@ -2682,8 +2695,9 @@ static void *___slab_alloc(struct kmem_c + + stat(s, ALLOC_SLOWPATH); + +- local_irq_save(flags); +- page = c->page; ++reread_page: ++ ++ page = READ_ONCE(c->page); + if (!page) { + /* + * if the node is not online or has no normal memory, just +@@ -2692,6 +2706,11 @@ static void *___slab_alloc(struct kmem_c + if (unlikely(node != NUMA_NO_NODE && + !node_isset(node, slab_nodes))) + node = NUMA_NO_NODE; ++ local_irq_save(flags); ++ if (unlikely(c->page)) { ++ local_irq_restore(flags); ++ goto reread_page; ++ } + goto new_slab; + } + redo: +@@ -2706,8 +2725,7 @@ static void *___slab_alloc(struct kmem_c + goto redo; + } else { + stat(s, ALLOC_NODE_MISMATCH); +- deactivate_slab(s, page, c->freelist, c); +- goto new_slab; ++ goto deactivate_slab; + } + } + +@@ -2716,12 +2734,15 @@ static void *___slab_alloc(struct kmem_c + * PFMEMALLOC but right now, we are losing the pfmemalloc + * information when the page leaves the per-cpu allocator + */ +- if (unlikely(!pfmemalloc_match(page, gfpflags))) { +- deactivate_slab(s, page, c->freelist, c); +- goto new_slab; +- } ++ if (unlikely(!pfmemalloc_match_unsafe(page, gfpflags))) ++ goto deactivate_slab; + +- /* must check again c->freelist in case of cpu migration or IRQ */ ++ /* must check again c->page in case IRQ handler changed it */ ++ local_irq_save(flags); ++ if (unlikely(page != c->page)) { ++ local_irq_restore(flags); ++ goto reread_page; ++ } + freelist = c->freelist; + if (freelist) + goto load_freelist; +@@ -2737,6 +2758,9 @@ static void *___slab_alloc(struct kmem_c + stat(s, ALLOC_REFILL); + + load_freelist: ++ ++ lockdep_assert_irqs_disabled(); ++ + /* + * freelist is pointing to the list of objects to be used. + * page is pointing to the page from which the objects are obtained. +@@ -2748,11 +2772,23 @@ static void *___slab_alloc(struct kmem_c + local_irq_restore(flags); + return freelist; + ++deactivate_slab: ++ ++ local_irq_save(flags); ++ if (page != c->page) { ++ local_irq_restore(flags); ++ goto reread_page; ++ } ++ deactivate_slab(s, page, c->freelist, c); ++ + new_slab: + ++ lockdep_assert_irqs_disabled(); ++ + if (slub_percpu_partial(c)) { + page = c->page = slub_percpu_partial(c); + slub_set_percpu_partial(c, page); ++ local_irq_restore(flags); + stat(s, CPU_PARTIAL_ALLOC); + goto redo; + } diff --git a/patches/0012-mm-slub-move-disabling-enabling-irqs-to-___slab_allo.patch b/patches/0012-mm-slub-move-disabling-enabling-irqs-to-___slab_allo.patch deleted file mode 100644 index 73ca5d4db92c..000000000000 --- a/patches/0012-mm-slub-move-disabling-enabling-irqs-to-___slab_allo.patch +++ /dev/null @@ -1,180 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:09 +0200 -Subject: [PATCH 12/35] mm, slub: move disabling/enabling irqs to - ___slab_alloc() - -Currently __slab_alloc() disables irqs around the whole ___slab_alloc(). This -includes cases where this is not needed, such as when the allocation ends up in -the page allocator and has to awkwardly enable irqs back based on gfp flags. -Also the whole kmem_cache_alloc_bulk() is executed with irqs disabled even when -it hits the __slab_alloc() slow path, and long periods with disabled interrupts -are undesirable. - -As a first step towards reducing irq disabled periods, move irq handling into -___slab_alloc(). Callers will instead prevent the s->cpu_slab percpu pointer -from becoming invalid via get_cpu_ptr(), thus preempt_disable(). 
This does not -protect against modification by an irq handler, which is still done by disabled -irq for most of ___slab_alloc(). As a small immediate benefit, -slab_out_of_memory() from ___slab_alloc() is now called with irqs enabled. - -kmem_cache_alloc_bulk() disables irqs for its fastpath and then re-enables them -before calling ___slab_alloc(), which then disables them at its discretion. The -whole kmem_cache_alloc_bulk() operation also disables preemption. - -When ___slab_alloc() calls new_slab() to allocate a new page, re-enable -preemption, because new_slab() will re-enable interrupts in contexts that allow -blocking (this will be improved by later patches). - -The patch itself will thus increase overhead a bit due to disabled preemption -(on configs where it matters) and increased disabling/enabling irqs in -kmem_cache_alloc_bulk(), but that will be gradually improved in the following -patches. - -Note in __slab_alloc() we need to change the #ifdef CONFIG_PREEMPT guard to -CONFIG_PREEMPT_COUNT to make sure preempt disable/enable is properly paired in -all configurations. On configs without involuntary preemption and debugging -the re-read of kmem_cache_cpu pointer is still compiled out as it was before. - -[ Mike Galbraith : Fix kmem_cache_alloc_bulk() error path ] -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 36 ++++++++++++++++++++++++------------ - 1 file changed, 24 insertions(+), 12 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2656,7 +2656,7 @@ static inline void *get_freelist(struct - * we need to allocate a new slab. This is the slowest path since it involves - * a call to the page allocator and the setup of a new slab. - * -- * Version of __slab_alloc to use when we know that interrupts are -+ * Version of __slab_alloc to use when we know that preemption is - * already disabled (which is the case for bulk allocation). - */ - static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, -@@ -2664,9 +2664,11 @@ static void *___slab_alloc(struct kmem_c - { - void *freelist; - struct page *page; -+ unsigned long flags; - - stat(s, ALLOC_SLOWPATH); - -+ local_irq_save(flags); - page = c->page; - if (!page) { - /* -@@ -2729,6 +2731,7 @@ static void *___slab_alloc(struct kmem_c - VM_BUG_ON(!c->page->frozen); - c->freelist = get_freepointer(s, freelist); - c->tid = next_tid(c->tid); -+ local_irq_restore(flags); - return freelist; - - new_slab: -@@ -2746,14 +2749,16 @@ static void *___slab_alloc(struct kmem_c - goto check_new_page; - } - -+ put_cpu_ptr(s->cpu_slab); - page = new_slab(s, gfpflags, node); -+ c = get_cpu_ptr(s->cpu_slab); - - if (unlikely(!page)) { -+ local_irq_restore(flags); - slab_out_of_memory(s, gfpflags, node); - return NULL; - } - -- c = raw_cpu_ptr(s->cpu_slab); - if (c->page) - flush_slab(s, c); - -@@ -2793,31 +2798,33 @@ static void *___slab_alloc(struct kmem_c - return_single: - - deactivate_slab(s, page, get_freepointer(s, freelist), c); -+ local_irq_restore(flags); - return freelist; - } - - /* -- * Another one that disabled interrupt and compensates for possible -- * cpu changes by refetching the per cpu area pointer. -+ * A wrapper for ___slab_alloc() for contexts where preemption is not yet -+ * disabled. Compensates for possible cpu changes by refetching the per cpu area -+ * pointer. 
- */ - static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) - { - void *p; -- unsigned long flags; - -- local_irq_save(flags); --#ifdef CONFIG_PREEMPTION -+#ifdef CONFIG_PREEMPT_COUNT - /* - * We may have been preempted and rescheduled on a different -- * cpu before disabling interrupts. Need to reload cpu area -+ * cpu before disabling preemption. Need to reload cpu area - * pointer. - */ -- c = this_cpu_ptr(s->cpu_slab); -+ c = get_cpu_ptr(s->cpu_slab); - #endif - - p = ___slab_alloc(s, gfpflags, node, addr, c); -- local_irq_restore(flags); -+#ifdef CONFIG_PREEMPT_COUNT -+ put_cpu_ptr(s->cpu_slab); -+#endif - return p; - } - -@@ -3345,8 +3352,8 @@ int kmem_cache_alloc_bulk(struct kmem_ca - * IRQs, which protects against PREEMPT and interrupts - * handlers invoking normal fastpath. - */ -+ c = get_cpu_ptr(s->cpu_slab); - local_irq_disable(); -- c = this_cpu_ptr(s->cpu_slab); - - for (i = 0; i < size; i++) { - void *object = kfence_alloc(s, s->object_size, flags); -@@ -3367,6 +3374,8 @@ int kmem_cache_alloc_bulk(struct kmem_ca - */ - c->tid = next_tid(c->tid); - -+ local_irq_enable(); -+ - /* - * Invoking slow path likely have side-effect - * of re-populating per CPU c->freelist -@@ -3379,6 +3388,8 @@ int kmem_cache_alloc_bulk(struct kmem_ca - c = this_cpu_ptr(s->cpu_slab); - maybe_wipe_obj_freeptr(s, p[i]); - -+ local_irq_disable(); -+ - continue; /* goto for-loop */ - } - c->freelist = get_freepointer(s, object); -@@ -3387,6 +3398,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca - } - c->tid = next_tid(c->tid); - local_irq_enable(); -+ put_cpu_ptr(s->cpu_slab); - - /* - * memcg and kmem_cache debug support and memory initialization. -@@ -3396,7 +3408,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca - slab_want_init_on_alloc(flags, s)); - return i; - error: -- local_irq_enable(); -+ put_cpu_ptr(s->cpu_slab); - slab_post_alloc_hook(s, objcg, flags, i, p, false); - __kmem_cache_free_bulk(s, i, p); - return 0; diff --git a/patches/0013-mm-slub-do-initial-checks-in-___slab_alloc-with-irqs.patch b/patches/0013-mm-slub-do-initial-checks-in-___slab_alloc-with-irqs.patch deleted file mode 100644 index d9c1bc411bab..000000000000 --- a/patches/0013-mm-slub-do-initial-checks-in-___slab_alloc-with-irqs.patch +++ /dev/null @@ -1,154 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:10 +0200 -Subject: [PATCH 13/35] mm, slub: do initial checks in ___slab_alloc() with - irqs enabled - -As another step of shortening irq disabled sections in ___slab_alloc(), delay -disabling irqs until we pass the initial checks if there is a cached percpu -slab and it's suitable for our allocation. - -Now we have to recheck c->page after actually disabling irqs as an allocation -in irq handler might have replaced it. - -Because we call pfmemalloc_match() as one of the checks, we might hit -VM_BUG_ON_PAGE(!PageSlab(page)) in PageSlabPfmemalloc in case we get -interrupted and the page is freed. Thus introduce a pfmemalloc_match_unsafe() -variant that lacks the PageSlab check. 
- -Signed-off-by: Vlastimil Babka -Acked-by: Mel Gorman -Signed-off-by: Sebastian Andrzej Siewior ---- - include/linux/page-flags.h | 9 +++++++ - mm/slub.c | 54 +++++++++++++++++++++++++++++++++++++-------- - 2 files changed, 54 insertions(+), 9 deletions(-) - ---- a/include/linux/page-flags.h -+++ b/include/linux/page-flags.h -@@ -815,6 +815,15 @@ static inline int PageSlabPfmemalloc(str - return PageActive(page); - } - -+/* -+ * A version of PageSlabPfmemalloc() for opportunistic checks where the page -+ * might have been freed under us and not be a PageSlab anymore. -+ */ -+static inline int __PageSlabPfmemalloc(struct page *page) -+{ -+ return PageActive(page); -+} -+ - static inline void SetPageSlabPfmemalloc(struct page *page) - { - VM_BUG_ON_PAGE(!PageSlab(page), page); ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2607,6 +2607,19 @@ static inline bool pfmemalloc_match(stru - } - - /* -+ * A variant of pfmemalloc_match() that tests page flags without asserting -+ * PageSlab. Intended for opportunistic checks before taking a lock and -+ * rechecking that nobody else freed the page under us. -+ */ -+static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags) -+{ -+ if (unlikely(__PageSlabPfmemalloc(page))) -+ return gfp_pfmemalloc_allowed(gfpflags); -+ -+ return true; -+} -+ -+/* - * Check the page->freelist of a page and either transfer the freelist to the - * per cpu freelist or deactivate the page. - * -@@ -2668,8 +2681,9 @@ static void *___slab_alloc(struct kmem_c - - stat(s, ALLOC_SLOWPATH); - -- local_irq_save(flags); -- page = c->page; -+reread_page: -+ -+ page = READ_ONCE(c->page); - if (!page) { - /* - * if the node is not online or has no normal memory, just -@@ -2678,6 +2692,11 @@ static void *___slab_alloc(struct kmem_c - if (unlikely(node != NUMA_NO_NODE && - !node_isset(node, slab_nodes))) - node = NUMA_NO_NODE; -+ local_irq_save(flags); -+ if (unlikely(c->page)) { -+ local_irq_restore(flags); -+ goto reread_page; -+ } - goto new_slab; - } - redo: -@@ -2692,8 +2711,7 @@ static void *___slab_alloc(struct kmem_c - goto redo; - } else { - stat(s, ALLOC_NODE_MISMATCH); -- deactivate_slab(s, page, c->freelist, c); -- goto new_slab; -+ goto deactivate_slab; - } - } - -@@ -2702,12 +2720,15 @@ static void *___slab_alloc(struct kmem_c - * PFMEMALLOC but right now, we are losing the pfmemalloc - * information when the page leaves the per-cpu allocator - */ -- if (unlikely(!pfmemalloc_match(page, gfpflags))) { -- deactivate_slab(s, page, c->freelist, c); -- goto new_slab; -- } -+ if (unlikely(!pfmemalloc_match_unsafe(page, gfpflags))) -+ goto deactivate_slab; - -- /* must check again c->freelist in case of cpu migration or IRQ */ -+ /* must check again c->page in case IRQ handler changed it */ -+ local_irq_save(flags); -+ if (unlikely(page != c->page)) { -+ local_irq_restore(flags); -+ goto reread_page; -+ } - freelist = c->freelist; - if (freelist) - goto load_freelist; -@@ -2723,6 +2744,9 @@ static void *___slab_alloc(struct kmem_c - stat(s, ALLOC_REFILL); - - load_freelist: -+ -+ lockdep_assert_irqs_disabled(); -+ - /* - * freelist is pointing to the list of objects to be used. - * page is pointing to the page from which the objects are obtained. 
-@@ -2734,11 +2758,23 @@ static void *___slab_alloc(struct kmem_c - local_irq_restore(flags); - return freelist; - -+deactivate_slab: -+ -+ local_irq_save(flags); -+ if (page != c->page) { -+ local_irq_restore(flags); -+ goto reread_page; -+ } -+ deactivate_slab(s, page, c->freelist, c); -+ - new_slab: - -+ lockdep_assert_irqs_disabled(); -+ - if (slub_percpu_partial(c)) { - page = c->page = slub_percpu_partial(c); - slub_set_percpu_partial(c, page); -+ local_irq_restore(flags); - stat(s, CPU_PARTIAL_ALLOC); - goto redo; - } diff --git a/patches/0013-mm-slub-move-disabling-irqs-closer-to-get_partial-in.patch b/patches/0013-mm-slub-move-disabling-irqs-closer-to-get_partial-in.patch new file mode 100644 index 000000000000..ef3f9b15596c --- /dev/null +++ b/patches/0013-mm-slub-move-disabling-irqs-closer-to-get_partial-in.patch @@ -0,0 +1,96 @@ +From: Vlastimil Babka +Date: Mon, 10 May 2021 13:56:17 +0200 +Subject: [PATCH 13/33] mm, slub: move disabling irqs closer to get_partial() + in ___slab_alloc() + +Continue reducing the irq disabled scope. Check for per-cpu partial slabs with +first with irqs enabled and then recheck with irqs disabled before grabbing +the slab page. Mostly preparatory for the following patches. + +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 34 +++++++++++++++++++++++++--------- + 1 file changed, 25 insertions(+), 9 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2706,11 +2706,6 @@ static void *___slab_alloc(struct kmem_c + if (unlikely(node != NUMA_NO_NODE && + !node_isset(node, slab_nodes))) + node = NUMA_NO_NODE; +- local_irq_save(flags); +- if (unlikely(c->page)) { +- local_irq_restore(flags); +- goto reread_page; +- } + goto new_slab; + } + redo: +@@ -2751,6 +2746,7 @@ static void *___slab_alloc(struct kmem_c + + if (!freelist) { + c->page = NULL; ++ local_irq_restore(flags); + stat(s, DEACTIVATE_BYPASS); + goto new_slab; + } +@@ -2780,12 +2776,19 @@ static void *___slab_alloc(struct kmem_c + goto reread_page; + } + deactivate_slab(s, page, c->freelist, c); ++ local_irq_restore(flags); + + new_slab: + +- lockdep_assert_irqs_disabled(); +- + if (slub_percpu_partial(c)) { ++ local_irq_save(flags); ++ if (unlikely(c->page)) { ++ local_irq_restore(flags); ++ goto reread_page; ++ } ++ if (unlikely(!slub_percpu_partial(c))) ++ goto new_objects; /* stolen by an IRQ handler */ ++ + page = c->page = slub_percpu_partial(c); + slub_set_percpu_partial(c, page); + local_irq_restore(flags); +@@ -2793,6 +2796,16 @@ static void *___slab_alloc(struct kmem_c + goto redo; + } + ++ local_irq_save(flags); ++ if (unlikely(c->page)) { ++ local_irq_restore(flags); ++ goto reread_page; ++ } ++ ++new_objects: ++ ++ lockdep_assert_irqs_disabled(); ++ + freelist = get_partial(s, gfpflags, node, &page); + if (freelist) { + c->page = page; +@@ -2825,15 +2838,18 @@ static void *___slab_alloc(struct kmem_c + check_new_page: + + if (kmem_cache_debug(s)) { +- if (!alloc_debug_processing(s, page, freelist, addr)) ++ if (!alloc_debug_processing(s, page, freelist, addr)) { + /* Slab failed checks. 
Next slab needed */ ++ c->page = NULL; ++ local_irq_restore(flags); + goto new_slab; +- else ++ } else { + /* + * For debug case, we don't load freelist so that all + * allocations go through alloc_debug_processing() + */ + goto return_single; ++ } + } + + if (unlikely(!pfmemalloc_match(page, gfpflags))) diff --git a/patches/0014-mm-slub-move-disabling-irqs-closer-to-get_partial-in.patch b/patches/0014-mm-slub-move-disabling-irqs-closer-to-get_partial-in.patch deleted file mode 100644 index a4baed5461da..000000000000 --- a/patches/0014-mm-slub-move-disabling-irqs-closer-to-get_partial-in.patch +++ /dev/null @@ -1,97 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:11 +0200 -Subject: [PATCH 14/35] mm, slub: move disabling irqs closer to get_partial() - in ___slab_alloc() - -Continue reducing the irq disabled scope. Check for per-cpu partial slabs with -first with irqs enabled and then recheck with irqs disabled before grabbing -the slab page. Mostly preparatory for the following patches. - -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 34 +++++++++++++++++++++++++--------- - 1 file changed, 25 insertions(+), 9 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2692,11 +2692,6 @@ static void *___slab_alloc(struct kmem_c - if (unlikely(node != NUMA_NO_NODE && - !node_isset(node, slab_nodes))) - node = NUMA_NO_NODE; -- local_irq_save(flags); -- if (unlikely(c->page)) { -- local_irq_restore(flags); -- goto reread_page; -- } - goto new_slab; - } - redo: -@@ -2737,6 +2732,7 @@ static void *___slab_alloc(struct kmem_c - - if (!freelist) { - c->page = NULL; -+ local_irq_restore(flags); - stat(s, DEACTIVATE_BYPASS); - goto new_slab; - } -@@ -2766,12 +2762,19 @@ static void *___slab_alloc(struct kmem_c - goto reread_page; - } - deactivate_slab(s, page, c->freelist, c); -+ local_irq_restore(flags); - - new_slab: - -- lockdep_assert_irqs_disabled(); -- - if (slub_percpu_partial(c)) { -+ local_irq_save(flags); -+ if (unlikely(c->page)) { -+ local_irq_restore(flags); -+ goto reread_page; -+ } -+ if (unlikely(!slub_percpu_partial(c))) -+ goto new_objects; /* stolen by an IRQ handler */ -+ - page = c->page = slub_percpu_partial(c); - slub_set_percpu_partial(c, page); - local_irq_restore(flags); -@@ -2779,6 +2782,16 @@ static void *___slab_alloc(struct kmem_c - goto redo; - } - -+ local_irq_save(flags); -+ if (unlikely(c->page)) { -+ local_irq_restore(flags); -+ goto reread_page; -+ } -+ -+new_objects: -+ -+ lockdep_assert_irqs_disabled(); -+ - freelist = get_partial(s, gfpflags, node, &page); - if (freelist) { - c->page = page; -@@ -2811,15 +2824,18 @@ static void *___slab_alloc(struct kmem_c - check_new_page: - - if (kmem_cache_debug(s)) { -- if (!alloc_debug_processing(s, page, freelist, addr)) -+ if (!alloc_debug_processing(s, page, freelist, addr)) { - /* Slab failed checks. 
Next slab needed */ -+ c->page = NULL; -+ local_irq_restore(flags); - goto new_slab; -- else -+ } else { - /* - * For debug case, we don't load freelist so that all - * allocations go through alloc_debug_processing() - */ - goto return_single; -+ } - } - - if (unlikely(!pfmemalloc_match(page, gfpflags))) diff --git a/patches/0014-mm-slub-restore-irqs-around-calling-new_slab.patch b/patches/0014-mm-slub-restore-irqs-around-calling-new_slab.patch new file mode 100644 index 000000000000..d1801d8cd0a4 --- /dev/null +++ b/patches/0014-mm-slub-restore-irqs-around-calling-new_slab.patch @@ -0,0 +1,54 @@ +From: Vlastimil Babka +Date: Mon, 10 May 2021 16:30:01 +0200 +Subject: [PATCH 14/33] mm, slub: restore irqs around calling new_slab() + +allocate_slab() currently re-enables irqs before calling to the page allocator. +It depends on gfpflags_allow_blocking() to determine if it's safe to do so. +Now we can instead simply restore irq before calling it through new_slab(). +The other caller early_kmem_cache_node_alloc() is unaffected by this. + +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 8 ++------ + 1 file changed, 2 insertions(+), 6 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -1809,9 +1809,6 @@ static struct page *allocate_slab(struct + + flags &= gfp_allowed_mask; + +- if (gfpflags_allow_blocking(flags)) +- local_irq_enable(); +- + flags |= s->allocflags; + + /* +@@ -1870,8 +1867,6 @@ static struct page *allocate_slab(struct + page->frozen = 1; + + out: +- if (gfpflags_allow_blocking(flags)) +- local_irq_disable(); + if (!page) + return NULL; + +@@ -2812,16 +2807,17 @@ static void *___slab_alloc(struct kmem_c + goto check_new_page; + } + ++ local_irq_restore(flags); + put_cpu_ptr(s->cpu_slab); + page = new_slab(s, gfpflags, node); + c = get_cpu_ptr(s->cpu_slab); + + if (unlikely(!page)) { +- local_irq_restore(flags); + slab_out_of_memory(s, gfpflags, node); + return NULL; + } + ++ local_irq_save(flags); + if (c->page) + flush_slab(s, c); + diff --git a/patches/0015-mm-slub-restore-irqs-around-calling-new_slab.patch b/patches/0015-mm-slub-restore-irqs-around-calling-new_slab.patch deleted file mode 100644 index 09feac10c056..000000000000 --- a/patches/0015-mm-slub-restore-irqs-around-calling-new_slab.patch +++ /dev/null @@ -1,55 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:12 +0200 -Subject: [PATCH 15/35] mm, slub: restore irqs around calling new_slab() - -allocate_slab() currently re-enables irqs before calling to the page allocator. -It depends on gfpflags_allow_blocking() to determine if it's safe to do so. -Now we can instead simply restore irq before calling it through new_slab(). -The other caller early_kmem_cache_node_alloc() is unaffected by this. 
- -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 8 ++------ - 1 file changed, 2 insertions(+), 6 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -1795,9 +1795,6 @@ static struct page *allocate_slab(struct - - flags &= gfp_allowed_mask; - -- if (gfpflags_allow_blocking(flags)) -- local_irq_enable(); -- - flags |= s->allocflags; - - /* -@@ -1856,8 +1853,6 @@ static struct page *allocate_slab(struct - page->frozen = 1; - - out: -- if (gfpflags_allow_blocking(flags)) -- local_irq_disable(); - if (!page) - return NULL; - -@@ -2798,16 +2793,17 @@ static void *___slab_alloc(struct kmem_c - goto check_new_page; - } - -+ local_irq_restore(flags); - put_cpu_ptr(s->cpu_slab); - page = new_slab(s, gfpflags, node); - c = get_cpu_ptr(s->cpu_slab); - - if (unlikely(!page)) { -- local_irq_restore(flags); - slab_out_of_memory(s, gfpflags, node); - return NULL; - } - -+ local_irq_save(flags); - if (c->page) - flush_slab(s, c); - diff --git a/patches/0015-mm-slub-validate-slab-from-partial-list-or-page-allo.patch b/patches/0015-mm-slub-validate-slab-from-partial-list-or-page-allo.patch new file mode 100644 index 000000000000..4373ad8a486b --- /dev/null +++ b/patches/0015-mm-slub-validate-slab-from-partial-list-or-page-allo.patch @@ -0,0 +1,76 @@ +From: Vlastimil Babka +Date: Tue, 11 May 2021 16:37:51 +0200 +Subject: [PATCH 15/33] mm, slub: validate slab from partial list or page + allocator before making it cpu slab + +When we obtain a new slab page from node partial list or page allocator, we +assign it to kmem_cache_cpu, perform some checks, and if they fail, we undo +the assignment. + +In order to allow doing the checks without irq disabled, restructure the code +so that the checks are done first, and kmem_cache_cpu.page assignment only +after they pass. + +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 17 +++++++++-------- + 1 file changed, 9 insertions(+), 8 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2802,10 +2802,8 @@ static void *___slab_alloc(struct kmem_c + lockdep_assert_irqs_disabled(); + + freelist = get_partial(s, gfpflags, node, &page); +- if (freelist) { +- c->page = page; ++ if (freelist) + goto check_new_page; +- } + + local_irq_restore(flags); + put_cpu_ptr(s->cpu_slab); +@@ -2818,9 +2816,6 @@ static void *___slab_alloc(struct kmem_c + } + + local_irq_save(flags); +- if (c->page) +- flush_slab(s, c); +- + /* + * No other reference to the page yet so we can + * muck around with it freely without cmpxchg +@@ -2829,14 +2824,12 @@ static void *___slab_alloc(struct kmem_c + page->freelist = NULL; + + stat(s, ALLOC_SLAB); +- c->page = page; + + check_new_page: + + if (kmem_cache_debug(s)) { + if (!alloc_debug_processing(s, page, freelist, addr)) { + /* Slab failed checks. 
Next slab needed */ +- c->page = NULL; + local_irq_restore(flags); + goto new_slab; + } else { +@@ -2855,10 +2848,18 @@ static void *___slab_alloc(struct kmem_c + */ + goto return_single; + ++ if (unlikely(c->page)) ++ flush_slab(s, c); ++ c->page = page; ++ + goto load_freelist; + + return_single: + ++ if (unlikely(c->page)) ++ flush_slab(s, c); ++ c->page = page; ++ + deactivate_slab(s, page, get_freepointer(s, freelist), c); + local_irq_restore(flags); + return freelist; diff --git a/patches/0016-mm-slub-check-new-pages-with-restored-irqs.patch b/patches/0016-mm-slub-check-new-pages-with-restored-irqs.patch new file mode 100644 index 000000000000..72776818ac30 --- /dev/null +++ b/patches/0016-mm-slub-check-new-pages-with-restored-irqs.patch @@ -0,0 +1,69 @@ +From: Vlastimil Babka +Date: Tue, 11 May 2021 16:56:09 +0200 +Subject: [PATCH 16/33] mm, slub: check new pages with restored irqs + +Building on top of the previous patch, re-enable irqs before checking new +pages. alloc_debug_processing() is now called with enabled irqs so we need to +remove VM_BUG_ON(!irqs_disabled()); in check_slab() - there doesn't seem to be +a need for it anyway. + +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -1009,8 +1009,6 @@ static int check_slab(struct kmem_cache + { + int maxobj; + +- VM_BUG_ON(!irqs_disabled()); +- + if (!PageSlab(page)) { + slab_err(s, page, "Not a valid slab page"); + return 0; +@@ -2802,10 +2800,10 @@ static void *___slab_alloc(struct kmem_c + lockdep_assert_irqs_disabled(); + + freelist = get_partial(s, gfpflags, node, &page); ++ local_irq_restore(flags); + if (freelist) + goto check_new_page; + +- local_irq_restore(flags); + put_cpu_ptr(s->cpu_slab); + page = new_slab(s, gfpflags, node); + c = get_cpu_ptr(s->cpu_slab); +@@ -2815,7 +2813,6 @@ static void *___slab_alloc(struct kmem_c + return NULL; + } + +- local_irq_save(flags); + /* + * No other reference to the page yet so we can + * muck around with it freely without cmpxchg +@@ -2830,7 +2827,6 @@ static void *___slab_alloc(struct kmem_c + if (kmem_cache_debug(s)) { + if (!alloc_debug_processing(s, page, freelist, addr)) { + /* Slab failed checks. Next slab needed */ +- local_irq_restore(flags); + goto new_slab; + } else { + /* +@@ -2848,6 +2844,7 @@ static void *___slab_alloc(struct kmem_c + */ + goto return_single; + ++ local_irq_save(flags); + if (unlikely(c->page)) + flush_slab(s, c); + c->page = page; +@@ -2856,6 +2853,7 @@ static void *___slab_alloc(struct kmem_c + + return_single: + ++ local_irq_save(flags); + if (unlikely(c->page)) + flush_slab(s, c); + c->page = page; diff --git a/patches/0016-mm-slub-validate-slab-from-partial-list-or-page-allo.patch b/patches/0016-mm-slub-validate-slab-from-partial-list-or-page-allo.patch deleted file mode 100644 index 3926e84c00e7..000000000000 --- a/patches/0016-mm-slub-validate-slab-from-partial-list-or-page-allo.patch +++ /dev/null @@ -1,77 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:13 +0200 -Subject: [PATCH 16/35] mm, slub: validate slab from partial list or page - allocator before making it cpu slab - -When we obtain a new slab page from node partial list or page allocator, we -assign it to kmem_cache_cpu, perform some checks, and if they fail, we undo -the assignment. - -In order to allow doing the checks without irq disabled, restructure the code -so that the checks are done first, and kmem_cache_cpu.page assignment only -after they pass. 
- -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 17 +++++++++-------- - 1 file changed, 9 insertions(+), 8 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2788,10 +2788,8 @@ static void *___slab_alloc(struct kmem_c - lockdep_assert_irqs_disabled(); - - freelist = get_partial(s, gfpflags, node, &page); -- if (freelist) { -- c->page = page; -+ if (freelist) - goto check_new_page; -- } - - local_irq_restore(flags); - put_cpu_ptr(s->cpu_slab); -@@ -2804,9 +2802,6 @@ static void *___slab_alloc(struct kmem_c - } - - local_irq_save(flags); -- if (c->page) -- flush_slab(s, c); -- - /* - * No other reference to the page yet so we can - * muck around with it freely without cmpxchg -@@ -2815,14 +2810,12 @@ static void *___slab_alloc(struct kmem_c - page->freelist = NULL; - - stat(s, ALLOC_SLAB); -- c->page = page; - - check_new_page: - - if (kmem_cache_debug(s)) { - if (!alloc_debug_processing(s, page, freelist, addr)) { - /* Slab failed checks. Next slab needed */ -- c->page = NULL; - local_irq_restore(flags); - goto new_slab; - } else { -@@ -2841,10 +2834,18 @@ static void *___slab_alloc(struct kmem_c - */ - goto return_single; - -+ if (unlikely(c->page)) -+ flush_slab(s, c); -+ c->page = page; -+ - goto load_freelist; - - return_single: - -+ if (unlikely(c->page)) -+ flush_slab(s, c); -+ c->page = page; -+ - deactivate_slab(s, page, get_freepointer(s, freelist), c); - local_irq_restore(flags); - return freelist; diff --git a/patches/0017-mm-slub-check-new-pages-with-restored-irqs.patch b/patches/0017-mm-slub-check-new-pages-with-restored-irqs.patch deleted file mode 100644 index 356f13571bc5..000000000000 --- a/patches/0017-mm-slub-check-new-pages-with-restored-irqs.patch +++ /dev/null @@ -1,70 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:14 +0200 -Subject: [PATCH 17/35] mm, slub: check new pages with restored irqs - -Building on top of the previous patch, re-enable irqs before checking new -pages. alloc_debug_processing() is now called with enabled irqs so we need to -remove VM_BUG_ON(!irqs_disabled()); in check_slab() - there doesn't seem to be -a need for it anyway. - -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 8 +++----- - 1 file changed, 3 insertions(+), 5 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -995,8 +995,6 @@ static int check_slab(struct kmem_cache - { - int maxobj; - -- VM_BUG_ON(!irqs_disabled()); -- - if (!PageSlab(page)) { - slab_err(s, page, "Not a valid slab page"); - return 0; -@@ -2788,10 +2786,10 @@ static void *___slab_alloc(struct kmem_c - lockdep_assert_irqs_disabled(); - - freelist = get_partial(s, gfpflags, node, &page); -+ local_irq_restore(flags); - if (freelist) - goto check_new_page; - -- local_irq_restore(flags); - put_cpu_ptr(s->cpu_slab); - page = new_slab(s, gfpflags, node); - c = get_cpu_ptr(s->cpu_slab); -@@ -2801,7 +2799,6 @@ static void *___slab_alloc(struct kmem_c - return NULL; - } - -- local_irq_save(flags); - /* - * No other reference to the page yet so we can - * muck around with it freely without cmpxchg -@@ -2816,7 +2813,6 @@ static void *___slab_alloc(struct kmem_c - if (kmem_cache_debug(s)) { - if (!alloc_debug_processing(s, page, freelist, addr)) { - /* Slab failed checks. 
Next slab needed */ -- local_irq_restore(flags); - goto new_slab; - } else { - /* -@@ -2834,6 +2830,7 @@ static void *___slab_alloc(struct kmem_c - */ - goto return_single; - -+ local_irq_save(flags); - if (unlikely(c->page)) - flush_slab(s, c); - c->page = page; -@@ -2842,6 +2839,7 @@ static void *___slab_alloc(struct kmem_c - - return_single: - -+ local_irq_save(flags); - if (unlikely(c->page)) - flush_slab(s, c); - c->page = page; diff --git a/patches/0017-mm-slub-stop-disabling-irqs-around-get_partial.patch b/patches/0017-mm-slub-stop-disabling-irqs-around-get_partial.patch new file mode 100644 index 000000000000..8a837152186c --- /dev/null +++ b/patches/0017-mm-slub-stop-disabling-irqs-around-get_partial.patch @@ -0,0 +1,86 @@ +From: Vlastimil Babka +Date: Tue, 11 May 2021 17:45:26 +0200 +Subject: [PATCH 17/33] mm, slub: stop disabling irqs around get_partial() + +The function get_partial() does not need to have irqs disabled as a whole. It's +sufficient to convert spin_lock operations to their irq saving/restoring +versions. + +As a result, it's now possible to reach the page allocator from the slab +allocator without disabling and re-enabling interrupts on the way. + +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 22 ++++++++-------------- + 1 file changed, 8 insertions(+), 14 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2010,11 +2010,12 @@ static inline bool pfmemalloc_match(stru + * Try to allocate a partial slab from a specific node. + */ + static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, +- struct page **ret_page, gfp_t flags) ++ struct page **ret_page, gfp_t gfpflags) + { + struct page *page, *page2; + void *object = NULL; + unsigned int available = 0; ++ unsigned long flags; + int objects; + + /* +@@ -2026,11 +2027,11 @@ static void *get_partial_node(struct kme + if (!n || !n->nr_partial) + return NULL; + +- spin_lock(&n->list_lock); ++ spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry_safe(page, page2, &n->partial, slab_list) { + void *t; + +- if (!pfmemalloc_match(page, flags)) ++ if (!pfmemalloc_match(page, gfpflags)) + continue; + + t = acquire_slab(s, n, page, object == NULL, &objects); +@@ -2051,7 +2052,7 @@ static void *get_partial_node(struct kme + break; + + } +- spin_unlock(&n->list_lock); ++ spin_unlock_irqrestore(&n->list_lock, flags); + return object; + } + +@@ -2779,8 +2780,10 @@ static void *___slab_alloc(struct kmem_c + local_irq_restore(flags); + goto reread_page; + } +- if (unlikely(!slub_percpu_partial(c))) ++ if (unlikely(!slub_percpu_partial(c))) { ++ local_irq_restore(flags); + goto new_objects; /* stolen by an IRQ handler */ ++ } + + page = c->page = slub_percpu_partial(c); + slub_set_percpu_partial(c, page); +@@ -2789,18 +2792,9 @@ static void *___slab_alloc(struct kmem_c + goto redo; + } + +- local_irq_save(flags); +- if (unlikely(c->page)) { +- local_irq_restore(flags); +- goto reread_page; +- } +- + new_objects: + +- lockdep_assert_irqs_disabled(); +- + freelist = get_partial(s, gfpflags, node, &page); +- local_irq_restore(flags); + if (freelist) + goto check_new_page; + diff --git a/patches/0018-mm-slub-move-reset-of-c-page-and-freelist-out-of-dea.patch b/patches/0018-mm-slub-move-reset-of-c-page-and-freelist-out-of-dea.patch new file mode 100644 index 000000000000..71dc9fbb2803 --- /dev/null +++ b/patches/0018-mm-slub-move-reset-of-c-page-and-freelist-out-of-dea.patch @@ -0,0 +1,93 @@ +From: Vlastimil Babka +Date: Wed, 12 May 2021 13:53:34 +0200 +Subject: [PATCH 18/33] mm, slub: move reset of 
c->page and freelist out of + deactivate_slab() + +deactivate_slab() removes the cpu slab by merging the cpu freelist with slab's +freelist and putting the slab on the proper node's list. It also sets the +respective kmem_cache_cpu pointers to NULL. + +By extracting the kmem_cache_cpu operations from the function, we can make it +not dependent on disabled irqs. + +Also if we return a single free pointer from ___slab_alloc, we no longer have +to assign kmem_cache_cpu.page before deactivation or care if somebody preempted +us and assigned a different page to our kmem_cache_cpu in the process. + +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 31 ++++++++++++++++++------------- + 1 file changed, 18 insertions(+), 13 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2209,10 +2209,13 @@ static void init_kmem_cache_cpus(struct + } + + /* +- * Remove the cpu slab ++ * Finishes removing the cpu slab. Merges cpu's freelist with page's freelist, ++ * unfreezes the slabs and puts it on the proper list. ++ * Assumes the slab has been already safely taken away from kmem_cache_cpu ++ * by the caller. + */ + static void deactivate_slab(struct kmem_cache *s, struct page *page, +- void *freelist, struct kmem_cache_cpu *c) ++ void *freelist) + { + enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); +@@ -2341,9 +2344,6 @@ static void deactivate_slab(struct kmem_ + discard_slab(s, page); + stat(s, FREE_SLAB); + } +- +- c->page = NULL; +- c->freelist = NULL; + } + + /* +@@ -2468,10 +2468,16 @@ static void put_cpu_partial(struct kmem_ + + static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) + { +- stat(s, CPUSLAB_FLUSH); +- deactivate_slab(s, c->page, c->freelist, c); ++ void *freelist = c->freelist; ++ struct page *page = c->page; + ++ c->page = NULL; ++ c->freelist = NULL; + c->tid = next_tid(c->tid); ++ ++ deactivate_slab(s, page, freelist); ++ ++ stat(s, CPUSLAB_FLUSH); + } + + /* +@@ -2769,7 +2775,10 @@ static void *___slab_alloc(struct kmem_c + local_irq_restore(flags); + goto reread_page; + } +- deactivate_slab(s, page, c->freelist, c); ++ freelist = c->freelist; ++ c->page = NULL; ++ c->freelist = NULL; ++ deactivate_slab(s, page, freelist); + local_irq_restore(flags); + + new_slab: +@@ -2848,11 +2857,7 @@ static void *___slab_alloc(struct kmem_c + return_single: + + local_irq_save(flags); +- if (unlikely(c->page)) +- flush_slab(s, c); +- c->page = page; +- +- deactivate_slab(s, page, get_freepointer(s, freelist), c); ++ deactivate_slab(s, page, get_freepointer(s, freelist)); + local_irq_restore(flags); + return freelist; + } diff --git a/patches/0018-mm-slub-stop-disabling-irqs-around-get_partial.patch b/patches/0018-mm-slub-stop-disabling-irqs-around-get_partial.patch deleted file mode 100644 index 147016573331..000000000000 --- a/patches/0018-mm-slub-stop-disabling-irqs-around-get_partial.patch +++ /dev/null @@ -1,87 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:15 +0200 -Subject: [PATCH 18/35] mm, slub: stop disabling irqs around get_partial() - -The function get_partial() does not need to have irqs disabled as a whole. It's -sufficient to convert spin_lock operations to their irq saving/restoring -versions. - -As a result, it's now possible to reach the page allocator from the slab -allocator without disabling and re-enabling interrupts on the way. 
- -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 22 ++++++++-------------- - 1 file changed, 8 insertions(+), 14 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -1996,11 +1996,12 @@ static inline bool pfmemalloc_match(stru - * Try to allocate a partial slab from a specific node. - */ - static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, -- struct page **ret_page, gfp_t flags) -+ struct page **ret_page, gfp_t gfpflags) - { - struct page *page, *page2; - void *object = NULL; - unsigned int available = 0; -+ unsigned long flags; - int objects; - - /* -@@ -2012,11 +2013,11 @@ static void *get_partial_node(struct kme - if (!n || !n->nr_partial) - return NULL; - -- spin_lock(&n->list_lock); -+ spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry_safe(page, page2, &n->partial, slab_list) { - void *t; - -- if (!pfmemalloc_match(page, flags)) -+ if (!pfmemalloc_match(page, gfpflags)) - continue; - - t = acquire_slab(s, n, page, object == NULL, &objects); -@@ -2037,7 +2038,7 @@ static void *get_partial_node(struct kme - break; - - } -- spin_unlock(&n->list_lock); -+ spin_unlock_irqrestore(&n->list_lock, flags); - return object; - } - -@@ -2765,8 +2766,10 @@ static void *___slab_alloc(struct kmem_c - local_irq_restore(flags); - goto reread_page; - } -- if (unlikely(!slub_percpu_partial(c))) -+ if (unlikely(!slub_percpu_partial(c))) { -+ local_irq_restore(flags); - goto new_objects; /* stolen by an IRQ handler */ -+ } - - page = c->page = slub_percpu_partial(c); - slub_set_percpu_partial(c, page); -@@ -2775,18 +2778,9 @@ static void *___slab_alloc(struct kmem_c - goto redo; - } - -- local_irq_save(flags); -- if (unlikely(c->page)) { -- local_irq_restore(flags); -- goto reread_page; -- } -- - new_objects: - -- lockdep_assert_irqs_disabled(); -- - freelist = get_partial(s, gfpflags, node, &page); -- local_irq_restore(flags); - if (freelist) - goto check_new_page; - diff --git a/patches/0019-mm-slub-make-locking-in-deactivate_slab-irq-safe.patch b/patches/0019-mm-slub-make-locking-in-deactivate_slab-irq-safe.patch new file mode 100644 index 000000000000..3373cebc4aa4 --- /dev/null +++ b/patches/0019-mm-slub-make-locking-in-deactivate_slab-irq-safe.patch @@ -0,0 +1,61 @@ +From: Vlastimil Babka +Date: Wed, 12 May 2021 13:59:58 +0200 +Subject: [PATCH 19/33] mm, slub: make locking in deactivate_slab() irq-safe + +dectivate_slab() now no longer touches the kmem_cache_cpu structure, so it will +be possible to call it with irqs enabled. Just convert the spin_lock calls to +their irq saving/restoring variants to make it irq-safe. + +Note we now have to use cmpxchg_double_slab() for irq-safe slab_lock(), because +in some situations we don't take the list_lock, which would disable irqs. 
+ +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2223,6 +2223,7 @@ static void deactivate_slab(struct kmem_ + enum slab_modes l = M_NONE, m = M_NONE; + void *nextfree, *freelist_iter, *freelist_tail; + int tail = DEACTIVATE_TO_HEAD; ++ unsigned long flags = 0; + struct page new; + struct page old; + +@@ -2298,7 +2299,7 @@ static void deactivate_slab(struct kmem_ + * that acquire_slab() will see a slab page that + * is frozen + */ +- spin_lock(&n->list_lock); ++ spin_lock_irqsave(&n->list_lock, flags); + } + } else { + m = M_FULL; +@@ -2309,7 +2310,7 @@ static void deactivate_slab(struct kmem_ + * slabs from diagnostic functions will not see + * any frozen slabs. + */ +- spin_lock(&n->list_lock); ++ spin_lock_irqsave(&n->list_lock, flags); + } + } + +@@ -2326,14 +2327,14 @@ static void deactivate_slab(struct kmem_ + } + + l = m; +- if (!__cmpxchg_double_slab(s, page, ++ if (!cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")) + goto redo; + + if (lock) +- spin_unlock(&n->list_lock); ++ spin_unlock_irqrestore(&n->list_lock, flags); + + if (m == M_PARTIAL) + stat(s, tail); diff --git a/patches/0019-mm-slub-move-reset-of-c-page-and-freelist-out-of-dea.patch b/patches/0019-mm-slub-move-reset-of-c-page-and-freelist-out-of-dea.patch deleted file mode 100644 index a7beb833a7c1..000000000000 --- a/patches/0019-mm-slub-move-reset-of-c-page-and-freelist-out-of-dea.patch +++ /dev/null @@ -1,94 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:16 +0200 -Subject: [PATCH 19/35] mm, slub: move reset of c->page and freelist out of - deactivate_slab() - -deactivate_slab() removes the cpu slab by merging the cpu freelist with slab's -freelist and putting the slab on the proper node's list. It also sets the -respective kmem_cache_cpu pointers to NULL. - -By extracting the kmem_cache_cpu operations from the function, we can make it -not dependent on disabled irqs. - -Also if we return a single free pointer from ___slab_alloc, we no longer have -to assign kmem_cache_cpu.page before deactivation or care if somebody preempted -us and assigned a different page to our kmem_cache_cpu in the process. - -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 31 ++++++++++++++++++------------- - 1 file changed, 18 insertions(+), 13 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2195,10 +2195,13 @@ static void init_kmem_cache_cpus(struct - } - - /* -- * Remove the cpu slab -+ * Finishes removing the cpu slab. Merges cpu's freelist with page's freelist, -+ * unfreezes the slabs and puts it on the proper list. -+ * Assumes the slab has been already safely taken away from kmem_cache_cpu -+ * by the caller. 
- */ - static void deactivate_slab(struct kmem_cache *s, struct page *page, -- void *freelist, struct kmem_cache_cpu *c) -+ void *freelist) - { - enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); -@@ -2327,9 +2330,6 @@ static void deactivate_slab(struct kmem_ - discard_slab(s, page); - stat(s, FREE_SLAB); - } -- -- c->page = NULL; -- c->freelist = NULL; - } - - /* -@@ -2454,10 +2454,16 @@ static void put_cpu_partial(struct kmem_ - - static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) - { -- stat(s, CPUSLAB_FLUSH); -- deactivate_slab(s, c->page, c->freelist, c); -+ void *freelist = c->freelist; -+ struct page *page = c->page; - -+ c->page = NULL; -+ c->freelist = NULL; - c->tid = next_tid(c->tid); -+ -+ deactivate_slab(s, page, freelist); -+ -+ stat(s, CPUSLAB_FLUSH); - } - - /* -@@ -2755,7 +2761,10 @@ static void *___slab_alloc(struct kmem_c - local_irq_restore(flags); - goto reread_page; - } -- deactivate_slab(s, page, c->freelist, c); -+ freelist = c->freelist; -+ c->page = NULL; -+ c->freelist = NULL; -+ deactivate_slab(s, page, freelist); - local_irq_restore(flags); - - new_slab: -@@ -2834,11 +2843,7 @@ static void *___slab_alloc(struct kmem_c - return_single: - - local_irq_save(flags); -- if (unlikely(c->page)) -- flush_slab(s, c); -- c->page = page; -- -- deactivate_slab(s, page, get_freepointer(s, freelist), c); -+ deactivate_slab(s, page, get_freepointer(s, freelist)); - local_irq_restore(flags); - return freelist; - } diff --git a/patches/0020-mm-slub-call-deactivate_slab-without-disabling-irqs.patch b/patches/0020-mm-slub-call-deactivate_slab-without-disabling-irqs.patch new file mode 100644 index 000000000000..90b6157b1bde --- /dev/null +++ b/patches/0020-mm-slub-call-deactivate_slab-without-disabling-irqs.patch @@ -0,0 +1,71 @@ +From: Vlastimil Babka +Date: Wed, 12 May 2021 14:04:43 +0200 +Subject: [PATCH 20/33] mm, slub: call deactivate_slab() without disabling irqs + +The function is now safe to be called with irqs enabled, so move the calls +outside of irq disabled sections. + +When called from ___slab_alloc() -> flush_slab() we have irqs disabled, so to +reenable them before deactivate_slab() we need to open-code flush_slab() in +___slab_alloc() and reenable irqs after modifying the kmem_cache_cpu fields. +But that means a IRQ handler meanwhile might have assigned a new page to +kmem_cache_cpu.page so we have to retry the whole check. + +The remaining callers of flush_slab() are the IPI handler which has disabled +irqs anyway, and slub_cpu_dead() which will be dealt with in the following +patch. 
+ +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 24 +++++++++++++++++++----- + 1 file changed, 19 insertions(+), 5 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2779,8 +2779,8 @@ static void *___slab_alloc(struct kmem_c + freelist = c->freelist; + c->page = NULL; + c->freelist = NULL; +- deactivate_slab(s, page, freelist); + local_irq_restore(flags); ++ deactivate_slab(s, page, freelist); + + new_slab: + +@@ -2848,18 +2848,32 @@ static void *___slab_alloc(struct kmem_c + */ + goto return_single; + ++retry_load_page: ++ + local_irq_save(flags); +- if (unlikely(c->page)) +- flush_slab(s, c); ++ if (unlikely(c->page)) { ++ void *flush_freelist = c->freelist; ++ struct page *flush_page = c->page; ++ ++ c->page = NULL; ++ c->freelist = NULL; ++ c->tid = next_tid(c->tid); ++ ++ local_irq_restore(flags); ++ ++ deactivate_slab(s, flush_page, flush_freelist); ++ ++ stat(s, CPUSLAB_FLUSH); ++ ++ goto retry_load_page; ++ } + c->page = page; + + goto load_freelist; + + return_single: + +- local_irq_save(flags); + deactivate_slab(s, page, get_freepointer(s, freelist)); +- local_irq_restore(flags); + return freelist; + } + diff --git a/patches/0020-mm-slub-make-locking-in-deactivate_slab-irq-safe.patch b/patches/0020-mm-slub-make-locking-in-deactivate_slab-irq-safe.patch deleted file mode 100644 index 678355c7d274..000000000000 --- a/patches/0020-mm-slub-make-locking-in-deactivate_slab-irq-safe.patch +++ /dev/null @@ -1,62 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:17 +0200 -Subject: [PATCH 20/35] mm, slub: make locking in deactivate_slab() irq-safe - -dectivate_slab() now no longer touches the kmem_cache_cpu structure, so it will -be possible to call it with irqs enabled. Just convert the spin_lock calls to -their irq saving/restoring variants to make it irq-safe. - -Note we now have to use cmpxchg_double_slab() for irq-safe slab_lock(), because -in some situations we don't take the list_lock, which would disable irqs. - -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 9 +++++---- - 1 file changed, 5 insertions(+), 4 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2209,6 +2209,7 @@ static void deactivate_slab(struct kmem_ - enum slab_modes l = M_NONE, m = M_NONE; - void *nextfree, *freelist_iter, *freelist_tail; - int tail = DEACTIVATE_TO_HEAD; -+ unsigned long flags = 0; - struct page new; - struct page old; - -@@ -2284,7 +2285,7 @@ static void deactivate_slab(struct kmem_ - * that acquire_slab() will see a slab page that - * is frozen - */ -- spin_lock(&n->list_lock); -+ spin_lock_irqsave(&n->list_lock, flags); - } - } else { - m = M_FULL; -@@ -2295,7 +2296,7 @@ static void deactivate_slab(struct kmem_ - * slabs from diagnostic functions will not see - * any frozen slabs. 
- */ -- spin_lock(&n->list_lock); -+ spin_lock_irqsave(&n->list_lock, flags); - } - } - -@@ -2312,14 +2313,14 @@ static void deactivate_slab(struct kmem_ - } - - l = m; -- if (!__cmpxchg_double_slab(s, page, -+ if (!cmpxchg_double_slab(s, page, - old.freelist, old.counters, - new.freelist, new.counters, - "unfreezing slab")) - goto redo; - - if (lock) -- spin_unlock(&n->list_lock); -+ spin_unlock_irqrestore(&n->list_lock, flags); - - if (m == M_PARTIAL) - stat(s, tail); diff --git a/patches/0021-mm-slub-call-deactivate_slab-without-disabling-irqs.patch b/patches/0021-mm-slub-call-deactivate_slab-without-disabling-irqs.patch deleted file mode 100644 index 576b90ba7e59..000000000000 --- a/patches/0021-mm-slub-call-deactivate_slab-without-disabling-irqs.patch +++ /dev/null @@ -1,72 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:18 +0200 -Subject: [PATCH 21/35] mm, slub: call deactivate_slab() without disabling irqs - -The function is now safe to be called with irqs enabled, so move the calls -outside of irq disabled sections. - -When called from ___slab_alloc() -> flush_slab() we have irqs disabled, so to -reenable them before deactivate_slab() we need to open-code flush_slab() in -___slab_alloc() and reenable irqs after modifying the kmem_cache_cpu fields. -But that means a IRQ handler meanwhile might have assigned a new page to -kmem_cache_cpu.page so we have to retry the whole check. - -The remaining callers of flush_slab() are the IPI handler which has disabled -irqs anyway, and slub_cpu_dead() which will be dealt with in the following -patch. - -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 24 +++++++++++++++++++----- - 1 file changed, 19 insertions(+), 5 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2765,8 +2765,8 @@ static void *___slab_alloc(struct kmem_c - freelist = c->freelist; - c->page = NULL; - c->freelist = NULL; -- deactivate_slab(s, page, freelist); - local_irq_restore(flags); -+ deactivate_slab(s, page, freelist); - - new_slab: - -@@ -2834,18 +2834,32 @@ static void *___slab_alloc(struct kmem_c - */ - goto return_single; - -+retry_load_page: -+ - local_irq_save(flags); -- if (unlikely(c->page)) -- flush_slab(s, c); -+ if (unlikely(c->page)) { -+ void *flush_freelist = c->freelist; -+ struct page *flush_page = c->page; -+ -+ c->page = NULL; -+ c->freelist = NULL; -+ c->tid = next_tid(c->tid); -+ -+ local_irq_restore(flags); -+ -+ deactivate_slab(s, flush_page, flush_freelist); -+ -+ stat(s, CPUSLAB_FLUSH); -+ -+ goto retry_load_page; -+ } - c->page = page; - - goto load_freelist; - - return_single: - -- local_irq_save(flags); - deactivate_slab(s, page, get_freepointer(s, freelist)); -- local_irq_restore(flags); - return freelist; - } - diff --git a/patches/0021-mm-slub-move-irq-control-into-unfreeze_partials.patch b/patches/0021-mm-slub-move-irq-control-into-unfreeze_partials.patch new file mode 100644 index 000000000000..55d97a8a4893 --- /dev/null +++ b/patches/0021-mm-slub-move-irq-control-into-unfreeze_partials.patch @@ -0,0 +1,61 @@ +From: Vlastimil Babka +Date: Thu, 20 May 2021 14:00:03 +0200 +Subject: [PATCH 21/33] mm, slub: move irq control into unfreeze_partials() + +unfreeze_partials() can be optimized so that it doesn't need irqs disabled for +the whole time. As the first step, move irq control into the function and +remove it from the put_cpu_partial() caller. 
+ +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2350,9 +2350,8 @@ static void deactivate_slab(struct kmem_ + /* + * Unfreeze all the cpu partial slabs. + * +- * This function must be called with interrupts disabled +- * for the cpu using c (or some other guarantee must be there +- * to guarantee no concurrent accesses). ++ * This function must be called with preemption or migration ++ * disabled with c local to the cpu. + */ + static void unfreeze_partials(struct kmem_cache *s, + struct kmem_cache_cpu *c) +@@ -2360,6 +2359,9 @@ static void unfreeze_partials(struct kme + #ifdef CONFIG_SLUB_CPU_PARTIAL + struct kmem_cache_node *n = NULL, *n2 = NULL; + struct page *page, *discard_page = NULL; ++ unsigned long flags; ++ ++ local_irq_save(flags); + + while ((page = slub_percpu_partial(c))) { + struct page new; +@@ -2412,6 +2414,8 @@ static void unfreeze_partials(struct kme + discard_slab(s, page); + stat(s, FREE_SLAB); + } ++ ++ local_irq_restore(flags); + #endif /* CONFIG_SLUB_CPU_PARTIAL */ + } + +@@ -2439,14 +2443,11 @@ static void put_cpu_partial(struct kmem_ + pobjects = oldpage->pobjects; + pages = oldpage->pages; + if (drain && pobjects > slub_cpu_partial(s)) { +- unsigned long flags; + /* + * partial array is full. Move the existing + * set to the per node partial list. + */ +- local_irq_save(flags); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); +- local_irq_restore(flags); + oldpage = NULL; + pobjects = 0; + pages = 0; diff --git a/patches/0022-mm-slub-discard-slabs-in-unfreeze_partials-without-i.patch b/patches/0022-mm-slub-discard-slabs-in-unfreeze_partials-without-i.patch new file mode 100644 index 000000000000..5a81f66bd24a --- /dev/null +++ b/patches/0022-mm-slub-discard-slabs-in-unfreeze_partials-without-i.patch @@ -0,0 +1,32 @@ +From: Vlastimil Babka +Date: Thu, 20 May 2021 14:01:57 +0200 +Subject: [PATCH 22/33] mm, slub: discard slabs in unfreeze_partials() without + irqs disabled + +No need for disabled irqs when discarding slabs, so restore them before +discarding. + +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2406,6 +2406,8 @@ static void unfreeze_partials(struct kme + if (n) + spin_unlock(&n->list_lock); + ++ local_irq_restore(flags); ++ + while (discard_page) { + page = discard_page; + discard_page = discard_page->next; +@@ -2415,7 +2417,6 @@ static void unfreeze_partials(struct kme + stat(s, FREE_SLAB); + } + +- local_irq_restore(flags); + #endif /* CONFIG_SLUB_CPU_PARTIAL */ + } + diff --git a/patches/0022-mm-slub-move-irq-control-into-unfreeze_partials.patch b/patches/0022-mm-slub-move-irq-control-into-unfreeze_partials.patch deleted file mode 100644 index 6a79f28bb712..000000000000 --- a/patches/0022-mm-slub-move-irq-control-into-unfreeze_partials.patch +++ /dev/null @@ -1,62 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:19 +0200 -Subject: [PATCH 22/35] mm, slub: move irq control into unfreeze_partials() - -unfreeze_partials() can be optimized so that it doesn't need irqs disabled for -the whole time. As the first step, move irq control into the function and -remove it from the put_cpu_partial() caller. 
- -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 13 +++++++------ - 1 file changed, 7 insertions(+), 6 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2336,9 +2336,8 @@ static void deactivate_slab(struct kmem_ - /* - * Unfreeze all the cpu partial slabs. - * -- * This function must be called with interrupts disabled -- * for the cpu using c (or some other guarantee must be there -- * to guarantee no concurrent accesses). -+ * This function must be called with preemption or migration -+ * disabled with c local to the cpu. - */ - static void unfreeze_partials(struct kmem_cache *s, - struct kmem_cache_cpu *c) -@@ -2346,6 +2345,9 @@ static void unfreeze_partials(struct kme - #ifdef CONFIG_SLUB_CPU_PARTIAL - struct kmem_cache_node *n = NULL, *n2 = NULL; - struct page *page, *discard_page = NULL; -+ unsigned long flags; -+ -+ local_irq_save(flags); - - while ((page = slub_percpu_partial(c))) { - struct page new; -@@ -2398,6 +2400,8 @@ static void unfreeze_partials(struct kme - discard_slab(s, page); - stat(s, FREE_SLAB); - } -+ -+ local_irq_restore(flags); - #endif /* CONFIG_SLUB_CPU_PARTIAL */ - } - -@@ -2425,14 +2429,11 @@ static void put_cpu_partial(struct kmem_ - pobjects = oldpage->pobjects; - pages = oldpage->pages; - if (drain && pobjects > slub_cpu_partial(s)) { -- unsigned long flags; - /* - * partial array is full. Move the existing - * set to the per node partial list. - */ -- local_irq_save(flags); - unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); -- local_irq_restore(flags); - oldpage = NULL; - pobjects = 0; - pages = 0; diff --git a/patches/0023-mm-slub-detach-whole-partial-list-at-once-in-unfreez.patch b/patches/0023-mm-slub-detach-whole-partial-list-at-once-in-unfreez.patch new file mode 100644 index 000000000000..627a8c94488e --- /dev/null +++ b/patches/0023-mm-slub-detach-whole-partial-list-at-once-in-unfreez.patch @@ -0,0 +1,39 @@ +From: Vlastimil Babka +Date: Thu, 20 May 2021 14:18:12 +0200 +Subject: [PATCH 23/33] mm, slub: detach whole partial list at once in + unfreeze_partials() + +Instead of iterating through the live percpu partial list, detach it from the +kmem_cache_cpu at once. This is simpler and will allow further optimization. 
+ +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2358,16 +2358,20 @@ static void unfreeze_partials(struct kme + { + #ifdef CONFIG_SLUB_CPU_PARTIAL + struct kmem_cache_node *n = NULL, *n2 = NULL; +- struct page *page, *discard_page = NULL; ++ struct page *page, *partial_page, *discard_page = NULL; + unsigned long flags; + + local_irq_save(flags); + +- while ((page = slub_percpu_partial(c))) { ++ partial_page = slub_percpu_partial(c); ++ c->partial = NULL; ++ ++ while (partial_page) { + struct page new; + struct page old; + +- slub_set_percpu_partial(c, page); ++ page = partial_page; ++ partial_page = page->next; + + n2 = get_node(s, page_to_nid(page)); + if (n != n2) { diff --git a/patches/0023-mm-slub-discard-slabs-in-unfreeze_partials-without-i.patch b/patches/0023-mm-slub-discard-slabs-in-unfreeze_partials-without-i.patch deleted file mode 100644 index 58d0c6a18180..000000000000 --- a/patches/0023-mm-slub-discard-slabs-in-unfreeze_partials-without-i.patch +++ /dev/null @@ -1,33 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:20 +0200 -Subject: [PATCH 23/35] mm, slub: discard slabs in unfreeze_partials() without - irqs disabled - -No need for disabled irqs when discarding slabs, so restore them before -discarding. - -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2392,6 +2392,8 @@ static void unfreeze_partials(struct kme - if (n) - spin_unlock(&n->list_lock); - -+ local_irq_restore(flags); -+ - while (discard_page) { - page = discard_page; - discard_page = discard_page->next; -@@ -2401,7 +2403,6 @@ static void unfreeze_partials(struct kme - stat(s, FREE_SLAB); - } - -- local_irq_restore(flags); - #endif /* CONFIG_SLUB_CPU_PARTIAL */ - } - diff --git a/patches/0024-mm-slub-detach-whole-partial-list-at-once-in-unfreez.patch b/patches/0024-mm-slub-detach-whole-partial-list-at-once-in-unfreez.patch deleted file mode 100644 index 4b36f4376cb4..000000000000 --- a/patches/0024-mm-slub-detach-whole-partial-list-at-once-in-unfreez.patch +++ /dev/null @@ -1,40 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:21 +0200 -Subject: [PATCH 24/35] mm, slub: detach whole partial list at once in - unfreeze_partials() - -Instead of iterating through the live percpu partial list, detach it from the -kmem_cache_cpu at once. This is simpler and will allow further optimization. 
- -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 10 +++++++--- - 1 file changed, 7 insertions(+), 3 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2344,16 +2344,20 @@ static void unfreeze_partials(struct kme - { - #ifdef CONFIG_SLUB_CPU_PARTIAL - struct kmem_cache_node *n = NULL, *n2 = NULL; -- struct page *page, *discard_page = NULL; -+ struct page *page, *partial_page, *discard_page = NULL; - unsigned long flags; - - local_irq_save(flags); - -- while ((page = slub_percpu_partial(c))) { -+ partial_page = slub_percpu_partial(c); -+ c->partial = NULL; -+ -+ while (partial_page) { - struct page new; - struct page old; - -- slub_set_percpu_partial(c, page); -+ page = partial_page; -+ partial_page = page->next; - - n2 = get_node(s, page_to_nid(page)); - if (n != n2) { diff --git a/patches/0024-mm-slub-separate-detaching-of-partial-list-in-unfree.patch b/patches/0024-mm-slub-separate-detaching-of-partial-list-in-unfree.patch new file mode 100644 index 000000000000..7a01091b8456 --- /dev/null +++ b/patches/0024-mm-slub-separate-detaching-of-partial-list-in-unfree.patch @@ -0,0 +1,156 @@ +From: Vlastimil Babka +Date: Thu, 20 May 2021 16:39:51 +0200 +Subject: [PATCH 24/33] mm, slub: separate detaching of partial list in + unfreeze_partials() from unfreezing + +Unfreezing partial list can be split to two phases - detaching the list from +struct kmem_cache_cpu, and processing the list. The whole operation does not +need to be protected by disabled irqs. Restructure the code to separate the +detaching (with disabled irqs) and unfreezing (with irq disabling to be reduced +in the next patch). + +Also, unfreeze_partials() can be called from another cpu on behalf of a cpu +that is being offlined, where disabling irqs on the local cpu has no sense, so +restructure the code as follows: + +- __unfreeze_partials() is the bulk of unfreeze_partials() that processes the + detached percpu partial list +- unfreeze_partials() detaches list from current cpu with irqs disabled and + calls __unfreeze_partials() +- unfreeze_partials_cpu() is to be called for the offlined cpu so it needs no + irq disabling, and is called from __flush_cpu_slab() +- flush_cpu_slab() is for the local cpu thus it needs to call + unfreeze_partials(). So it can't simply call + __flush_cpu_slab(smp_processor_id()) anymore and we have to open-code the + proper calls. + +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 73 +++++++++++++++++++++++++++++++++++++++++++------------------- + 1 file changed, 51 insertions(+), 22 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2347,25 +2347,15 @@ static void deactivate_slab(struct kmem_ + } + } + +-/* +- * Unfreeze all the cpu partial slabs. +- * +- * This function must be called with preemption or migration +- * disabled with c local to the cpu. 
+- */ +-static void unfreeze_partials(struct kmem_cache *s, +- struct kmem_cache_cpu *c) +-{ + #ifdef CONFIG_SLUB_CPU_PARTIAL ++static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page) ++{ + struct kmem_cache_node *n = NULL, *n2 = NULL; +- struct page *page, *partial_page, *discard_page = NULL; ++ struct page *page, *discard_page = NULL; + unsigned long flags; + + local_irq_save(flags); + +- partial_page = slub_percpu_partial(c); +- c->partial = NULL; +- + while (partial_page) { + struct page new; + struct page old; +@@ -2420,10 +2410,45 @@ static void unfreeze_partials(struct kme + discard_slab(s, page); + stat(s, FREE_SLAB); + } ++} + +-#endif /* CONFIG_SLUB_CPU_PARTIAL */ ++/* ++ * Unfreeze all the cpu partial slabs. ++ */ ++static void unfreeze_partials(struct kmem_cache *s) ++{ ++ struct page *partial_page; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ partial_page = this_cpu_read(s->cpu_slab->partial); ++ this_cpu_write(s->cpu_slab->partial, NULL); ++ local_irq_restore(flags); ++ ++ if (partial_page) ++ __unfreeze_partials(s, partial_page); ++} ++ ++static void unfreeze_partials_cpu(struct kmem_cache *s, ++ struct kmem_cache_cpu *c) ++{ ++ struct page *partial_page; ++ ++ partial_page = slub_percpu_partial(c); ++ c->partial = NULL; ++ ++ if (partial_page) ++ __unfreeze_partials(s, partial_page); + } + ++#else /* CONFIG_SLUB_CPU_PARTIAL */ ++ ++static inline void unfreeze_partials(struct kmem_cache *s) { } ++static inline void unfreeze_partials_cpu(struct kmem_cache *s, ++ struct kmem_cache_cpu *c) { } ++ ++#endif /* CONFIG_SLUB_CPU_PARTIAL */ ++ + /* + * Put a page that was just frozen (in __slab_free|get_partial_node) into a + * partial page slot if available. +@@ -2452,7 +2477,7 @@ static void put_cpu_partial(struct kmem_ + * partial array is full. Move the existing + * set to the per node partial list. + */ +- unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); ++ unfreeze_partials(s); + oldpage = NULL; + pobjects = 0; + pages = 0; +@@ -2487,11 +2512,6 @@ static inline void flush_slab(struct kme + stat(s, CPUSLAB_FLUSH); + } + +-/* +- * Flush cpu slab. +- * +- * Called from IPI handler with interrupts disabled. +- */ + static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) + { + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); +@@ -2499,14 +2519,23 @@ static inline void __flush_cpu_slab(stru + if (c->page) + flush_slab(s, c); + +- unfreeze_partials(s, c); ++ unfreeze_partials_cpu(s, c); + } + ++/* ++ * Flush cpu slab. ++ * ++ * Called from IPI handler with interrupts disabled. ++ */ + static void flush_cpu_slab(void *d) + { + struct kmem_cache *s = d; ++ struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); ++ ++ if (c->page) ++ flush_slab(s, c); + +- __flush_cpu_slab(s, smp_processor_id()); ++ unfreeze_partials(s); + } + + static bool has_cpu_slab(int cpu, void *info) diff --git a/patches/0025-mm-slub-only-disable-irq-with-spin_lock-in-__unfreez.patch b/patches/0025-mm-slub-only-disable-irq-with-spin_lock-in-__unfreez.patch new file mode 100644 index 000000000000..8d8f92eb7a96 --- /dev/null +++ b/patches/0025-mm-slub-only-disable-irq-with-spin_lock-in-__unfreez.patch @@ -0,0 +1,51 @@ +From: Vlastimil Babka +Date: Fri, 21 May 2021 01:16:54 +0200 +Subject: [PATCH 25/33] mm, slub: only disable irq with spin_lock in + __unfreeze_partials() + +__unfreeze_partials() no longer needs to have irqs disabled, except for making +the spin_lock operations irq-safe, so convert the spin_locks operations and +remove the separate irq handling. 
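The change is the usual conversion from a separate irq save/restore around a plain spin_lock() to the irq-safe lock variants; schematically:

	/* before: irqs were disabled across the whole list walk */
	local_irq_save(flags);
	spin_lock(&n->list_lock);
	/* move detached pages to the node partial list */
	spin_unlock(&n->list_lock);
	local_irq_restore(flags);

	/* after: only the list_lock sections disable irqs */
	spin_lock_irqsave(&n->list_lock, flags);
	/* move detached pages to the node partial list */
	spin_unlock_irqrestore(&n->list_lock, flags);

The flags variable is initialized to 0 in the hunk, presumably only to avoid a spurious maybe-uninitialized warning, since the compiler cannot see that the first spin_lock_irqsave() always precedes the first spin_unlock_irqrestore().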
+ +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 12 ++++-------- + 1 file changed, 4 insertions(+), 8 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2352,9 +2352,7 @@ static void __unfreeze_partials(struct k + { + struct kmem_cache_node *n = NULL, *n2 = NULL; + struct page *page, *discard_page = NULL; +- unsigned long flags; +- +- local_irq_save(flags); ++ unsigned long flags = 0; + + while (partial_page) { + struct page new; +@@ -2366,10 +2364,10 @@ static void __unfreeze_partials(struct k + n2 = get_node(s, page_to_nid(page)); + if (n != n2) { + if (n) +- spin_unlock(&n->list_lock); ++ spin_unlock_irqrestore(&n->list_lock, flags); + + n = n2; +- spin_lock(&n->list_lock); ++ spin_lock_irqsave(&n->list_lock, flags); + } + + do { +@@ -2398,9 +2396,7 @@ static void __unfreeze_partials(struct k + } + + if (n) +- spin_unlock(&n->list_lock); +- +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&n->list_lock, flags); + + while (discard_page) { + page = discard_page; diff --git a/patches/0025-mm-slub-separate-detaching-of-partial-list-in-unfree.patch b/patches/0025-mm-slub-separate-detaching-of-partial-list-in-unfree.patch deleted file mode 100644 index 363cfdd1c429..000000000000 --- a/patches/0025-mm-slub-separate-detaching-of-partial-list-in-unfree.patch +++ /dev/null @@ -1,157 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:22 +0200 -Subject: [PATCH 25/35] mm, slub: separate detaching of partial list in - unfreeze_partials() from unfreezing - -Unfreezing partial list can be split to two phases - detaching the list from -struct kmem_cache_cpu, and processing the list. The whole operation does not -need to be protected by disabled irqs. Restructure the code to separate the -detaching (with disabled irqs) and unfreezing (with irq disabling to be reduced -in the next patch). - -Also, unfreeze_partials() can be called from another cpu on behalf of a cpu -that is being offlined, where disabling irqs on the local cpu has no sense, so -restructure the code as follows: - -- __unfreeze_partials() is the bulk of unfreeze_partials() that processes the - detached percpu partial list -- unfreeze_partials() detaches list from current cpu with irqs disabled and - calls __unfreeze_partials() -- unfreeze_partials_cpu() is to be called for the offlined cpu so it needs no - irq disabling, and is called from __flush_cpu_slab() -- flush_cpu_slab() is for the local cpu thus it needs to call - unfreeze_partials(). So it can't simply call - __flush_cpu_slab(smp_processor_id()) anymore and we have to open-code the - proper calls. - -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 73 +++++++++++++++++++++++++++++++++++++++++++------------------- - 1 file changed, 51 insertions(+), 22 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2333,25 +2333,15 @@ static void deactivate_slab(struct kmem_ - } - } - --/* -- * Unfreeze all the cpu partial slabs. -- * -- * This function must be called with preemption or migration -- * disabled with c local to the cpu. 
-- */ --static void unfreeze_partials(struct kmem_cache *s, -- struct kmem_cache_cpu *c) --{ - #ifdef CONFIG_SLUB_CPU_PARTIAL -+static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page) -+{ - struct kmem_cache_node *n = NULL, *n2 = NULL; -- struct page *page, *partial_page, *discard_page = NULL; -+ struct page *page, *discard_page = NULL; - unsigned long flags; - - local_irq_save(flags); - -- partial_page = slub_percpu_partial(c); -- c->partial = NULL; -- - while (partial_page) { - struct page new; - struct page old; -@@ -2406,10 +2396,45 @@ static void unfreeze_partials(struct kme - discard_slab(s, page); - stat(s, FREE_SLAB); - } -+} - --#endif /* CONFIG_SLUB_CPU_PARTIAL */ -+/* -+ * Unfreeze all the cpu partial slabs. -+ */ -+static void unfreeze_partials(struct kmem_cache *s) -+{ -+ struct page *partial_page; -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ partial_page = this_cpu_read(s->cpu_slab->partial); -+ this_cpu_write(s->cpu_slab->partial, NULL); -+ local_irq_restore(flags); -+ -+ if (partial_page) -+ __unfreeze_partials(s, partial_page); -+} -+ -+static void unfreeze_partials_cpu(struct kmem_cache *s, -+ struct kmem_cache_cpu *c) -+{ -+ struct page *partial_page; -+ -+ partial_page = slub_percpu_partial(c); -+ c->partial = NULL; -+ -+ if (partial_page) -+ __unfreeze_partials(s, partial_page); - } - -+#else /* CONFIG_SLUB_CPU_PARTIAL */ -+ -+static inline void unfreeze_partials(struct kmem_cache *s) { } -+static inline void unfreeze_partials_cpu(struct kmem_cache *s, -+ struct kmem_cache_cpu *c) { } -+ -+#endif /* CONFIG_SLUB_CPU_PARTIAL */ -+ - /* - * Put a page that was just frozen (in __slab_free|get_partial_node) into a - * partial page slot if available. -@@ -2438,7 +2463,7 @@ static void put_cpu_partial(struct kmem_ - * partial array is full. Move the existing - * set to the per node partial list. - */ -- unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); -+ unfreeze_partials(s); - oldpage = NULL; - pobjects = 0; - pages = 0; -@@ -2473,11 +2498,6 @@ static inline void flush_slab(struct kme - stat(s, CPUSLAB_FLUSH); - } - --/* -- * Flush cpu slab. -- * -- * Called from IPI handler with interrupts disabled. -- */ - static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) - { - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); -@@ -2485,14 +2505,23 @@ static inline void __flush_cpu_slab(stru - if (c->page) - flush_slab(s, c); - -- unfreeze_partials(s, c); -+ unfreeze_partials_cpu(s, c); - } - -+/* -+ * Flush cpu slab. -+ * -+ * Called from IPI handler with interrupts disabled. -+ */ - static void flush_cpu_slab(void *d) - { - struct kmem_cache *s = d; -+ struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); -+ -+ if (c->page) -+ flush_slab(s, c); - -- __flush_cpu_slab(s, smp_processor_id()); -+ unfreeze_partials(s); - } - - static bool has_cpu_slab(int cpu, void *info) diff --git a/patches/0026-mm-slub-don-t-disable-irqs-in-slub_cpu_dead.patch b/patches/0026-mm-slub-don-t-disable-irqs-in-slub_cpu_dead.patch new file mode 100644 index 000000000000..ecc53e05a8e6 --- /dev/null +++ b/patches/0026-mm-slub-don-t-disable-irqs-in-slub_cpu_dead.patch @@ -0,0 +1,30 @@ +From: Vlastimil Babka +Date: Fri, 21 May 2021 01:48:56 +0200 +Subject: [PATCH 26/33] mm, slub: don't disable irqs in slub_cpu_dead() + +slub_cpu_dead() cleans up for an offlined cpu from another cpu and calls only +functions that are now irq safe, so we don't need to disable irqs anymore. 
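For reference, with the irq handling gone the function reduces to (reconstructed from the hunk below):

static int slub_cpu_dead(unsigned int cpu)
{
	struct kmem_cache *s;

	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_caches, list)
		__flush_cpu_slab(s, cpu);
	mutex_unlock(&slab_mutex);
	return 0;
}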
+ +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 6 +----- + 1 file changed, 1 insertion(+), 5 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2554,14 +2554,10 @@ static void flush_all(struct kmem_cache + static int slub_cpu_dead(unsigned int cpu) + { + struct kmem_cache *s; +- unsigned long flags; + + mutex_lock(&slab_mutex); +- list_for_each_entry(s, &slab_caches, list) { +- local_irq_save(flags); ++ list_for_each_entry(s, &slab_caches, list) + __flush_cpu_slab(s, cpu); +- local_irq_restore(flags); +- } + mutex_unlock(&slab_mutex); + return 0; + } diff --git a/patches/0026-mm-slub-only-disable-irq-with-spin_lock-in-__unfreez.patch b/patches/0026-mm-slub-only-disable-irq-with-spin_lock-in-__unfreez.patch deleted file mode 100644 index 6f52a063a54b..000000000000 --- a/patches/0026-mm-slub-only-disable-irq-with-spin_lock-in-__unfreez.patch +++ /dev/null @@ -1,52 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:23 +0200 -Subject: [PATCH 26/35] mm, slub: only disable irq with spin_lock in - __unfreeze_partials() - -__unfreeze_partials() no longer needs to have irqs disabled, except for making -the spin_lock operations irq-safe, so convert the spin_locks operations and -remove the separate irq handling. - -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 12 ++++-------- - 1 file changed, 4 insertions(+), 8 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2338,9 +2338,7 @@ static void __unfreeze_partials(struct k - { - struct kmem_cache_node *n = NULL, *n2 = NULL; - struct page *page, *discard_page = NULL; -- unsigned long flags; -- -- local_irq_save(flags); -+ unsigned long flags = 0; - - while (partial_page) { - struct page new; -@@ -2352,10 +2350,10 @@ static void __unfreeze_partials(struct k - n2 = get_node(s, page_to_nid(page)); - if (n != n2) { - if (n) -- spin_unlock(&n->list_lock); -+ spin_unlock_irqrestore(&n->list_lock, flags); - - n = n2; -- spin_lock(&n->list_lock); -+ spin_lock_irqsave(&n->list_lock, flags); - } - - do { -@@ -2384,9 +2382,7 @@ static void __unfreeze_partials(struct k - } - - if (n) -- spin_unlock(&n->list_lock); -- -- local_irq_restore(flags); -+ spin_unlock_irqrestore(&n->list_lock, flags); - - while (discard_page) { - page = discard_page; diff --git a/patches/0027-mm-slab-split-out-the-cpu-offline-variant-of-flush_s.patch b/patches/0027-mm-slab-split-out-the-cpu-offline-variant-of-flush_s.patch new file mode 100644 index 000000000000..d4a58b39704b --- /dev/null +++ b/patches/0027-mm-slab-split-out-the-cpu-offline-variant-of-flush_s.patch @@ -0,0 +1,44 @@ +From: Vlastimil Babka +Date: Thu, 3 Jun 2021 19:17:42 +0200 +Subject: [PATCH 27/33] mm, slab: split out the cpu offline variant of + flush_slab() + +flush_slab() is called either as part IPI handler on given live cpu, or as a +cleanup on behalf of another cpu that went offline. The first case needs to +protect updating the kmem_cache_cpu fields with disabled irqs. Currently the +whole call happens with irqs disabled by the IPI handler, but the following +patch will change from IPI to workqueue, and flush_slab() will have to disable +irqs (to be replaced with a local lock later) in the critical part. + +To prepare for this change, replace the call to flush_slab() for the dead cpu +handling with an opencoded variant that will not disable irqs nor take a local +lock. 
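The open-coded variant is flush_slab() minus the irq handling: the kmem_cache_cpu fields are snapshotted and reset directly, and the old page is handed to deactivate_slab(). Condensed from the hunk below, the cpu-offline path in __flush_cpu_slab() becomes:

	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
	void *freelist = c->freelist;
	struct page *page = c->page;

	c->page = NULL;
	c->freelist = NULL;
	c->tid = next_tid(c->tid);

	if (page) {
		deactivate_slab(s, page, freelist);
		stat(s, CPUSLAB_FLUSH);
	}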
+ +Suggested-by: Mike Galbraith +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2511,9 +2511,17 @@ static inline void flush_slab(struct kme + static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) + { + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); ++ void *freelist = c->freelist; ++ struct page *page = c->page; + +- if (c->page) +- flush_slab(s, c); ++ c->page = NULL; ++ c->freelist = NULL; ++ c->tid = next_tid(c->tid); ++ ++ if (page) { ++ deactivate_slab(s, page, freelist); ++ stat(s, CPUSLAB_FLUSH); ++ } + + unfreeze_partials_cpu(s, c); + } diff --git a/patches/0027-mm-slub-don-t-disable-irqs-in-slub_cpu_dead.patch b/patches/0027-mm-slub-don-t-disable-irqs-in-slub_cpu_dead.patch deleted file mode 100644 index cf715091134f..000000000000 --- a/patches/0027-mm-slub-don-t-disable-irqs-in-slub_cpu_dead.patch +++ /dev/null @@ -1,31 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:24 +0200 -Subject: [PATCH 27/35] mm, slub: don't disable irqs in slub_cpu_dead() - -slub_cpu_dead() cleans up for an offlined cpu from another cpu and calls only -functions that are now irq safe, so we don't need to disable irqs anymore. - -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 6 +----- - 1 file changed, 1 insertion(+), 5 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2540,14 +2540,10 @@ static void flush_all(struct kmem_cache - static int slub_cpu_dead(unsigned int cpu) - { - struct kmem_cache *s; -- unsigned long flags; - - mutex_lock(&slab_mutex); -- list_for_each_entry(s, &slab_caches, list) { -- local_irq_save(flags); -+ list_for_each_entry(s, &slab_caches, list) - __flush_cpu_slab(s, cpu); -- local_irq_restore(flags); -- } - mutex_unlock(&slab_mutex); - return 0; - } diff --git a/patches/0028-mm-slab-make-flush_slab-possible-to-call-with-irqs-e.patch b/patches/0028-mm-slab-make-flush_slab-possible-to-call-with-irqs-e.patch deleted file mode 100644 index dd475caaf163..000000000000 --- a/patches/0028-mm-slab-make-flush_slab-possible-to-call-with-irqs-e.patch +++ /dev/null @@ -1,69 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:25 +0200 -Subject: [PATCH 28/35] mm, slab: make flush_slab() possible to call with irqs - enabled - -Currently flush_slab() is always called with disabled IRQs if it's needed, but -the following patches will change that, so add a parameter to control IRQ -disabling within the function, which only protects the kmem_cache_cpu -manipulation and not the call to deactivate_slab() which doesn't need it. 
- -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 24 ++++++++++++++++++------ - 1 file changed, 18 insertions(+), 6 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2480,16 +2480,28 @@ static void put_cpu_partial(struct kmem_ - #endif /* CONFIG_SLUB_CPU_PARTIAL */ - } - --static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) -+static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c, -+ bool lock) - { -- void *freelist = c->freelist; -- struct page *page = c->page; -+ unsigned long flags; -+ void *freelist; -+ struct page *page; -+ -+ if (lock) -+ local_irq_save(flags); -+ -+ freelist = c->freelist; -+ page = c->page; - - c->page = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - -- deactivate_slab(s, page, freelist); -+ if (lock) -+ local_irq_restore(flags); -+ -+ if (page) -+ deactivate_slab(s, page, freelist); - - stat(s, CPUSLAB_FLUSH); - } -@@ -2499,7 +2511,7 @@ static inline void __flush_cpu_slab(stru - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - - if (c->page) -- flush_slab(s, c); -+ flush_slab(s, c, false); - - unfreeze_partials_cpu(s, c); - } -@@ -2515,7 +2527,7 @@ static void flush_cpu_slab(void *d) - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); - - if (c->page) -- flush_slab(s, c); -+ flush_slab(s, c, false); - - unfreeze_partials(s); - } diff --git a/patches/0028-mm-slub-move-flush_cpu_slab-invocations-__free_slab-.patch b/patches/0028-mm-slub-move-flush_cpu_slab-invocations-__free_slab-.patch new file mode 100644 index 000000000000..78b29d9aeabd --- /dev/null +++ b/patches/0028-mm-slub-move-flush_cpu_slab-invocations-__free_slab-.patch @@ -0,0 +1,211 @@ +From: Sebastian Andrzej Siewior +Date: Fri, 26 Feb 2021 17:11:55 +0100 +Subject: [PATCH 28/33] mm: slub: move flush_cpu_slab() invocations + __free_slab() invocations out of IRQ context + +flush_all() flushes a specific SLAB cache on each CPU (where the cache +is present). The deactivate_slab()/__free_slab() invocation happens +within IPI handler and is problematic for PREEMPT_RT. + +The flush operation is not a frequent operation or a hot path. The +per-CPU flush operation can be moved to within a workqueue. + +Because a workqueue handler, unlike IPI handler, does not disable irqs, +flush_slab() now has to disable them for working with the kmem_cache_cpu +fields. deactivate_slab() is safe to call with irqs enabled. 
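Condensed from the hunk below, the IPI broadcast is replaced by per-cpu work items: one pass queues a flush work on every cpu that has something to flush, a second pass waits for them, all serialized by a mutex and with cpu hotplug held off by the caller:

	mutex_lock(&flush_lock);

	for_each_online_cpu(cpu) {
		sfw = &per_cpu(slub_flush, cpu);
		sfw->skip = !has_cpu_slab(cpu, s);
		if (sfw->skip)
			continue;
		INIT_WORK(&sfw->work, flush_cpu_slab);
		sfw->s = s;
		schedule_work_on(cpu, &sfw->work);
	}

	for_each_online_cpu(cpu) {
		sfw = &per_cpu(slub_flush, cpu);
		if (!sfw->skip)
			flush_work(&sfw->work);
	}

	mutex_unlock(&flush_lock);

flush_all() wraps this in cpus_read_lock()/cpus_read_unlock(), and kmem_cache_destroy() now takes cpus_read_lock() as well, so the set of online cpus cannot change while the works are queued and flushed.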
+ +[vbabka@suse.cz: adapt to new SLUB changes] +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Vlastimil Babka +--- + mm/slab_common.c | 2 + + mm/slub.c | 94 +++++++++++++++++++++++++++++++++++++++++++++---------- + 2 files changed, 80 insertions(+), 16 deletions(-) + +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -502,6 +502,7 @@ void kmem_cache_destroy(struct kmem_cach + if (unlikely(!s)) + return; + ++ cpus_read_lock(); + mutex_lock(&slab_mutex); + + s->refcount--; +@@ -516,6 +517,7 @@ void kmem_cache_destroy(struct kmem_cach + } + out_unlock: + mutex_unlock(&slab_mutex); ++ cpus_read_unlock(); + } + EXPORT_SYMBOL(kmem_cache_destroy); + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2496,16 +2496,25 @@ static void put_cpu_partial(struct kmem_ + + static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) + { +- void *freelist = c->freelist; +- struct page *page = c->page; ++ unsigned long flags; ++ struct page *page; ++ void *freelist; ++ ++ local_irq_save(flags); ++ ++ page = c->page; ++ freelist = c->freelist; + + c->page = NULL; + c->freelist = NULL; + c->tid = next_tid(c->tid); + +- deactivate_slab(s, page, freelist); ++ local_irq_restore(flags); + +- stat(s, CPUSLAB_FLUSH); ++ if (page) { ++ deactivate_slab(s, page, freelist); ++ stat(s, CPUSLAB_FLUSH); ++ } + } + + static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) +@@ -2526,15 +2535,27 @@ static inline void __flush_cpu_slab(stru + unfreeze_partials_cpu(s, c); + } + ++struct slub_flush_work { ++ struct work_struct work; ++ struct kmem_cache *s; ++ bool skip; ++}; ++ + /* + * Flush cpu slab. + * +- * Called from IPI handler with interrupts disabled. ++ * Called from CPU work handler with migration disabled. + */ +-static void flush_cpu_slab(void *d) ++static void flush_cpu_slab(struct work_struct *w) + { +- struct kmem_cache *s = d; +- struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); ++ struct kmem_cache *s; ++ struct kmem_cache_cpu *c; ++ struct slub_flush_work *sfw; ++ ++ sfw = container_of(w, struct slub_flush_work, work); ++ ++ s = sfw->s; ++ c = this_cpu_ptr(s->cpu_slab); + + if (c->page) + flush_slab(s, c); +@@ -2542,17 +2563,51 @@ static void flush_cpu_slab(void *d) + unfreeze_partials(s); + } + +-static bool has_cpu_slab(int cpu, void *info) ++static bool has_cpu_slab(int cpu, struct kmem_cache *s) + { +- struct kmem_cache *s = info; + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return c->page || slub_percpu_partial(c); + } + ++static DEFINE_MUTEX(flush_lock); ++static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); ++ ++static void flush_all_cpus_locked(struct kmem_cache *s) ++{ ++ struct slub_flush_work *sfw; ++ unsigned int cpu; ++ ++ lockdep_assert_cpus_held(); ++ mutex_lock(&flush_lock); ++ ++ for_each_online_cpu(cpu) { ++ sfw = &per_cpu(slub_flush, cpu); ++ if (!has_cpu_slab(cpu, s)) { ++ sfw->skip = true; ++ continue; ++ } ++ INIT_WORK(&sfw->work, flush_cpu_slab); ++ sfw->skip = false; ++ sfw->s = s; ++ schedule_work_on(cpu, &sfw->work); ++ } ++ ++ for_each_online_cpu(cpu) { ++ sfw = &per_cpu(slub_flush, cpu); ++ if (sfw->skip) ++ continue; ++ flush_work(&sfw->work); ++ } ++ ++ mutex_unlock(&flush_lock); ++} ++ + static void flush_all(struct kmem_cache *s) + { +- on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1); ++ cpus_read_lock(); ++ flush_all_cpus_locked(s); ++ cpus_read_unlock(); + } + + /* +@@ -4097,7 +4152,7 @@ int __kmem_cache_shutdown(struct kmem_ca + int node; + struct kmem_cache_node *n; + +- flush_all(s); ++ flush_all_cpus_locked(s); 
+ /* Attempt to free all objects */ + for_each_kmem_cache_node(s, node, n) { + free_partial(s, n); +@@ -4373,7 +4428,7 @@ EXPORT_SYMBOL(kfree); + * being allocated from last increasing the chance that the last objects + * are freed in them. + */ +-int __kmem_cache_shrink(struct kmem_cache *s) ++static int __kmem_cache_do_shrink(struct kmem_cache *s) + { + int node; + int i; +@@ -4385,7 +4440,6 @@ int __kmem_cache_shrink(struct kmem_cach + unsigned long flags; + int ret = 0; + +- flush_all(s); + for_each_kmem_cache_node(s, node, n) { + INIT_LIST_HEAD(&discard); + for (i = 0; i < SHRINK_PROMOTE_MAX; i++) +@@ -4435,13 +4489,21 @@ int __kmem_cache_shrink(struct kmem_cach + return ret; + } + ++int __kmem_cache_shrink(struct kmem_cache *s) ++{ ++ flush_all(s); ++ return __kmem_cache_do_shrink(s); ++} ++ + static int slab_mem_going_offline_callback(void *arg) + { + struct kmem_cache *s; + + mutex_lock(&slab_mutex); +- list_for_each_entry(s, &slab_caches, list) +- __kmem_cache_shrink(s); ++ list_for_each_entry(s, &slab_caches, list) { ++ flush_all_cpus_locked(s); ++ __kmem_cache_do_shrink(s); ++ } + mutex_unlock(&slab_mutex); + + return 0; diff --git a/patches/0029-mm-slub-Move-flush_cpu_slab-invocations-__free_slab-.patch b/patches/0029-mm-slub-Move-flush_cpu_slab-invocations-__free_slab-.patch deleted file mode 100644 index 641577dcc7e7..000000000000 --- a/patches/0029-mm-slub-Move-flush_cpu_slab-invocations-__free_slab-.patch +++ /dev/null @@ -1,179 +0,0 @@ -From: Sebastian Andrzej Siewior -Date: Thu, 29 Jul 2021 15:21:26 +0200 -Subject: [PATCH 29/35] mm: slub: Move flush_cpu_slab() invocations - __free_slab() invocations out of IRQ context - -flush_all() flushes a specific SLAB cache on each CPU (where the cache -is present). The deactivate_slab()/__free_slab() invocation happens -within IPI handler and is problematic for PREEMPT_RT. - -The flush operation is not a frequent operation or a hot path. The -per-CPU flush operation can be moved to within a workqueue. - -[vbabka@suse.cz: adapt to new SLUB changes] -Signed-off-by: Sebastian Andrzej Siewior -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slab_common.c | 2 + - mm/slub.c | 79 +++++++++++++++++++++++++++++++++++++++++++++---------- - 2 files changed, 68 insertions(+), 13 deletions(-) - ---- a/mm/slab_common.c -+++ b/mm/slab_common.c -@@ -502,6 +502,7 @@ void kmem_cache_destroy(struct kmem_cach - if (unlikely(!s)) - return; - -+ cpus_read_lock(); - mutex_lock(&slab_mutex); - - s->refcount--; -@@ -516,6 +517,7 @@ void kmem_cache_destroy(struct kmem_cach - } - out_unlock: - mutex_unlock(&slab_mutex); -+ cpus_read_unlock(); - } - EXPORT_SYMBOL(kmem_cache_destroy); - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2516,33 +2516,79 @@ static inline void __flush_cpu_slab(stru - unfreeze_partials_cpu(s, c); - } - -+struct slub_flush_work { -+ struct work_struct work; -+ struct kmem_cache *s; -+ bool skip; -+}; -+ - /* - * Flush cpu slab. - * -- * Called from IPI handler with interrupts disabled. -+ * Called from CPU work handler with migration disabled. 
- */ --static void flush_cpu_slab(void *d) -+static void flush_cpu_slab(struct work_struct *w) - { -- struct kmem_cache *s = d; -- struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); -+ struct kmem_cache *s; -+ struct kmem_cache_cpu *c; -+ struct slub_flush_work *sfw; -+ -+ sfw = container_of(w, struct slub_flush_work, work); -+ -+ s = sfw->s; -+ c = this_cpu_ptr(s->cpu_slab); - - if (c->page) -- flush_slab(s, c, false); -+ flush_slab(s, c, true); - - unfreeze_partials(s); - } - --static bool has_cpu_slab(int cpu, void *info) -+static bool has_cpu_slab(int cpu, struct kmem_cache *s) - { -- struct kmem_cache *s = info; - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - - return c->page || slub_percpu_partial(c); - } - -+static DEFINE_MUTEX(flush_lock); -+static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); -+ -+static void flush_all_cpus_locked(struct kmem_cache *s) -+{ -+ struct slub_flush_work *sfw; -+ unsigned int cpu; -+ -+ lockdep_assert_cpus_held(); -+ mutex_lock(&flush_lock); -+ -+ for_each_online_cpu(cpu) { -+ sfw = &per_cpu(slub_flush, cpu); -+ if (!has_cpu_slab(cpu, s)) { -+ sfw->skip = true; -+ continue; -+ } -+ INIT_WORK(&sfw->work, flush_cpu_slab); -+ sfw->skip = false; -+ sfw->s = s; -+ schedule_work_on(cpu, &sfw->work); -+ } -+ -+ for_each_online_cpu(cpu) { -+ sfw = &per_cpu(slub_flush, cpu); -+ if (sfw->skip) -+ continue; -+ flush_work(&sfw->work); -+ } -+ -+ mutex_unlock(&flush_lock); -+} -+ - static void flush_all(struct kmem_cache *s) - { -- on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1); -+ cpus_read_lock(); -+ flush_all_cpus_locked(s); -+ cpus_read_unlock(); - } - - /* -@@ -4087,7 +4133,7 @@ int __kmem_cache_shutdown(struct kmem_ca - int node; - struct kmem_cache_node *n; - -- flush_all(s); -+ flush_all_cpus_locked(s); - /* Attempt to free all objects */ - for_each_kmem_cache_node(s, node, n) { - free_partial(s, n); -@@ -4363,7 +4409,7 @@ EXPORT_SYMBOL(kfree); - * being allocated from last increasing the chance that the last objects - * are freed in them. - */ --int __kmem_cache_shrink(struct kmem_cache *s) -+static int __kmem_cache_do_shrink(struct kmem_cache *s) - { - int node; - int i; -@@ -4375,7 +4421,6 @@ int __kmem_cache_shrink(struct kmem_cach - unsigned long flags; - int ret = 0; - -- flush_all(s); - for_each_kmem_cache_node(s, node, n) { - INIT_LIST_HEAD(&discard); - for (i = 0; i < SHRINK_PROMOTE_MAX; i++) -@@ -4425,13 +4470,21 @@ int __kmem_cache_shrink(struct kmem_cach - return ret; - } - -+int __kmem_cache_shrink(struct kmem_cache *s) -+{ -+ flush_all(s); -+ return __kmem_cache_do_shrink(s); -+} -+ - static int slab_mem_going_offline_callback(void *arg) - { - struct kmem_cache *s; - - mutex_lock(&slab_mutex); -- list_for_each_entry(s, &slab_caches, list) -- __kmem_cache_shrink(s); -+ list_for_each_entry(s, &slab_caches, list) { -+ flush_all_cpus_locked(s); -+ __kmem_cache_do_shrink(s); -+ } - mutex_unlock(&slab_mutex); - - return 0; diff --git a/patches/0029-mm-slub-make-object_map_lock-a-raw_spinlock_t.patch b/patches/0029-mm-slub-make-object_map_lock-a-raw_spinlock_t.patch new file mode 100644 index 000000000000..a058bfd30507 --- /dev/null +++ b/patches/0029-mm-slub-make-object_map_lock-a-raw_spinlock_t.patch @@ -0,0 +1,44 @@ +From: Sebastian Andrzej Siewior +Date: Thu, 16 Jul 2020 18:47:50 +0200 +Subject: [PATCH 29/33] mm: slub: make object_map_lock a raw_spinlock_t + +The variable object_map is protected by object_map_lock. 
The lock is always +acquired in debug code and within already atomic context + +Make object_map_lock a raw_spinlock_t. + +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -452,7 +452,7 @@ static inline bool cmpxchg_double_slab(s + + #ifdef CONFIG_SLUB_DEBUG + static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; +-static DEFINE_SPINLOCK(object_map_lock); ++static DEFINE_RAW_SPINLOCK(object_map_lock); + + static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, + struct page *page) +@@ -497,7 +497,7 @@ static unsigned long *get_map(struct kme + { + VM_BUG_ON(!irqs_disabled()); + +- spin_lock(&object_map_lock); ++ raw_spin_lock(&object_map_lock); + + __fill_map(object_map, s, page); + +@@ -507,7 +507,7 @@ static unsigned long *get_map(struct kme + static void put_map(unsigned long *map) __releases(&object_map_lock) + { + VM_BUG_ON(map != object_map); +- spin_unlock(&object_map_lock); ++ raw_spin_unlock(&object_map_lock); + } + + static inline unsigned int size_from_object(struct kmem_cache *s) diff --git a/patches/0030-mm-slub-Make-object_map_lock-a-raw_spinlock_t.patch b/patches/0030-mm-slub-Make-object_map_lock-a-raw_spinlock_t.patch deleted file mode 100644 index 0cdb2c71066e..000000000000 --- a/patches/0030-mm-slub-Make-object_map_lock-a-raw_spinlock_t.patch +++ /dev/null @@ -1,45 +0,0 @@ -From: Sebastian Andrzej Siewior -Date: Thu, 29 Jul 2021 15:21:27 +0200 -Subject: [PATCH 30/35] mm: slub: Make object_map_lock a raw_spinlock_t - -The variable object_map is protected by object_map_lock. The lock is always -acquired in debug code and within already atomic context - -Make object_map_lock a raw_spinlock_t. - -Signed-off-by: Sebastian Andrzej Siewior -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -438,7 +438,7 @@ static inline bool cmpxchg_double_slab(s - - #ifdef CONFIG_SLUB_DEBUG - static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; --static DEFINE_SPINLOCK(object_map_lock); -+static DEFINE_RAW_SPINLOCK(object_map_lock); - - static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, - struct page *page) -@@ -483,7 +483,7 @@ static unsigned long *get_map(struct kme - { - VM_BUG_ON(!irqs_disabled()); - -- spin_lock(&object_map_lock); -+ raw_spin_lock(&object_map_lock); - - __fill_map(object_map, s, page); - -@@ -493,7 +493,7 @@ static unsigned long *get_map(struct kme - static void put_map(unsigned long *map) __releases(&object_map_lock) - { - VM_BUG_ON(map != object_map); -- spin_unlock(&object_map_lock); -+ raw_spin_unlock(&object_map_lock); - } - - static inline unsigned int size_from_object(struct kmem_cache *s) diff --git a/patches/0030-mm-slub-make-slab_lock-disable-irqs-with-PREEMPT_RT.patch b/patches/0030-mm-slub-make-slab_lock-disable-irqs-with-PREEMPT_RT.patch new file mode 100644 index 000000000000..9453152ed8f2 --- /dev/null +++ b/patches/0030-mm-slub-make-slab_lock-disable-irqs-with-PREEMPT_RT.patch @@ -0,0 +1,186 @@ +From: Vlastimil Babka +Date: Fri, 4 Jun 2021 12:55:55 +0200 +Subject: [PATCH 30/33] mm, slub: make slab_lock() disable irqs with PREEMPT_RT + +We need to disable irqs around slab_lock() (a bit spinlock) to make it +irq-safe. 
Most calls to slab_lock() are nested under spin_lock_irqsave() which +doesn't disable irqs on PREEMPT_RT, so add explicit disabling with PREEMPT_RT. +The exception is cmpxchg_double_slab() which already disables irqs, so use a +__slab_[un]lock() variant without irq disable there. + +slab_[un]lock() thus needs a flags pointer parameter, which is unused on !RT. +free_debug_processing() now has two flags variables, which looks odd, but only +one is actually used - the one used in spin_lock_irqsave() on !RT and the one +used in slab_lock() on RT. + +As a result, __cmpxchg_double_slab() and cmpxchg_double_slab() become +effectively identical on RT, as both will disable irqs, which is necessary on +RT as most callers of this function also rely on irqsaving lock operations. +Thus, assert that irqs are already disabled in __cmpxchg_double_slab() only on +!RT and also change the VM_BUG_ON assertion to the more standard lockdep_assert +one. + +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 58 +++++++++++++++++++++++++++++++++++++++++----------------- + 1 file changed, 41 insertions(+), 17 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -359,25 +359,44 @@ static inline unsigned int oo_objects(st + /* + * Per slab locking using the pagelock + */ +-static __always_inline void slab_lock(struct page *page) ++static __always_inline void __slab_lock(struct page *page) + { + VM_BUG_ON_PAGE(PageTail(page), page); + bit_spin_lock(PG_locked, &page->flags); + } + +-static __always_inline void slab_unlock(struct page *page) ++static __always_inline void __slab_unlock(struct page *page) + { + VM_BUG_ON_PAGE(PageTail(page), page); + __bit_spin_unlock(PG_locked, &page->flags); + } + +-/* Interrupts must be disabled (for the fallback code to work right) */ ++static __always_inline void slab_lock(struct page *page, unsigned long *flags) ++{ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_save(*flags); ++ __slab_lock(page); ++} ++ ++static __always_inline void slab_unlock(struct page *page, unsigned long *flags) ++{ ++ __slab_unlock(page); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_restore(*flags); ++} ++ ++/* ++ * Interrupts must be disabled (for the fallback code to work right), typically ++ * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different ++ * so we disable interrupts as part of slab_[un]lock(). 
++ */ + static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) + { +- VM_BUG_ON(!irqs_disabled()); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ lockdep_assert_irqs_disabled(); + #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + if (s->flags & __CMPXCHG_DOUBLE) { +@@ -388,15 +407,18 @@ static inline bool __cmpxchg_double_slab + } else + #endif + { +- slab_lock(page); ++ /* init to 0 to prevent spurious warnings */ ++ unsigned long flags = 0; ++ ++ slab_lock(page, &flags); + if (page->freelist == freelist_old && + page->counters == counters_old) { + page->freelist = freelist_new; + page->counters = counters_new; +- slab_unlock(page); ++ slab_unlock(page, &flags); + return true; + } +- slab_unlock(page); ++ slab_unlock(page, &flags); + } + + cpu_relax(); +@@ -427,16 +449,16 @@ static inline bool cmpxchg_double_slab(s + unsigned long flags; + + local_irq_save(flags); +- slab_lock(page); ++ __slab_lock(page); + if (page->freelist == freelist_old && + page->counters == counters_old) { + page->freelist = freelist_new; + page->counters = counters_new; +- slab_unlock(page); ++ __slab_unlock(page); + local_irq_restore(flags); + return true; + } +- slab_unlock(page); ++ __slab_unlock(page); + local_irq_restore(flags); + } + +@@ -1269,11 +1291,11 @@ static noinline int free_debug_processin + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + void *object = head; + int cnt = 0; +- unsigned long flags; ++ unsigned long flags, flags2; + int ret = 0; + + spin_lock_irqsave(&n->list_lock, flags); +- slab_lock(page); ++ slab_lock(page, &flags2); + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, page)) +@@ -1306,7 +1328,7 @@ static noinline int free_debug_processin + slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", + bulk_cnt, cnt); + +- slab_unlock(page); ++ slab_unlock(page, &flags2); + spin_unlock_irqrestore(&n->list_lock, flags); + if (!ret) + slab_fix(s, "Object at 0x%p not freed", object); +@@ -4087,11 +4109,12 @@ static void list_slab_objects(struct kme + { + #ifdef CONFIG_SLUB_DEBUG + void *addr = page_address(page); ++ unsigned long flags; + unsigned long *map; + void *p; + + slab_err(s, page, text, s->name); +- slab_lock(page); ++ slab_lock(page, &flags); + + map = get_map(s, page); + for_each_object(p, s, addr, page->objects) { +@@ -4102,7 +4125,7 @@ static void list_slab_objects(struct kme + } + } + put_map(map); +- slab_unlock(page); ++ slab_unlock(page, &flags); + #endif + } + +@@ -4834,8 +4857,9 @@ static void validate_slab(struct kmem_ca + { + void *p; + void *addr = page_address(page); ++ unsigned long flags; + +- slab_lock(page); ++ slab_lock(page, &flags); + + if (!check_slab(s, page) || !on_freelist(s, page, NULL)) + goto unlock; +@@ -4850,7 +4874,7 @@ static void validate_slab(struct kmem_ca + break; + } + unlock: +- slab_unlock(page); ++ slab_unlock(page, &flags); + } + + static int validate_slab_node(struct kmem_cache *s, diff --git a/patches/0031-mm-slub-optionally-save-restore-irqs-in-slab_-un-loc.patch b/patches/0031-mm-slub-optionally-save-restore-irqs-in-slab_-un-loc.patch deleted file mode 100644 index e80e03924e66..000000000000 --- a/patches/0031-mm-slub-optionally-save-restore-irqs-in-slab_-un-loc.patch +++ /dev/null @@ -1,150 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:28 +0200 -Subject: [PATCH 31/35] mm, slub: optionally save/restore irqs in - 
slab_[un]lock()/ - -For PREEMPT_RT we will need to disable irqs for this bit spinlock. As a -preparation, add a flags parameter, and an internal version that takes -additional bool parameter to control irq saving/restoring (the flags -parameter is compile-time unused if the bool is a constant false). - -Convert ___cmpxchg_double_slab(), which also comes with the same bool -parameter, to use the internal version. - -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 52 +++++++++++++++++++++++++++++++++------------------- - 1 file changed, 33 insertions(+), 19 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -359,16 +359,33 @@ static inline unsigned int oo_objects(st - /* - * Per slab locking using the pagelock - */ --static __always_inline void slab_lock(struct page *page) -+static __always_inline void -+__slab_lock(struct page *page, unsigned long *flags, bool disable_irqs) - { - VM_BUG_ON_PAGE(PageTail(page), page); -+ if (disable_irqs) -+ local_irq_save(*flags); - bit_spin_lock(PG_locked, &page->flags); - } - --static __always_inline void slab_unlock(struct page *page) -+static __always_inline void -+__slab_unlock(struct page *page, unsigned long *flags, bool disable_irqs) - { - VM_BUG_ON_PAGE(PageTail(page), page); - __bit_spin_unlock(PG_locked, &page->flags); -+ if (disable_irqs) -+ local_irq_restore(*flags); -+} -+ -+static __always_inline void -+slab_lock(struct page *page, unsigned long *flags) -+{ -+ __slab_lock(page, flags, false); -+} -+ -+static __always_inline void slab_unlock(struct page *page, unsigned long *flags) -+{ -+ __slab_unlock(page, flags, false); - } - - static inline bool ___cmpxchg_double_slab(struct kmem_cache *s, struct page *page, -@@ -388,23 +405,18 @@ static inline bool ___cmpxchg_double_sla - } else - #endif - { -- unsigned long flags; -+ /* init to 0 to prevent spurious warnings */ -+ unsigned long flags = 0; - -- if (disable_irqs) -- local_irq_save(flags); -- slab_lock(page); -+ __slab_lock(page, &flags, disable_irqs); - if (page->freelist == freelist_old && - page->counters == counters_old) { - page->freelist = freelist_new; - page->counters = counters_new; -- slab_unlock(page); -- if (disable_irqs) -- local_irq_restore(flags); -+ __slab_unlock(page, &flags, disable_irqs); - return true; - } -- slab_unlock(page); -- if (disable_irqs) -- local_irq_restore(flags); -+ __slab_unlock(page, &flags, disable_irqs); - } - - cpu_relax(); -@@ -1255,11 +1267,11 @@ static noinline int free_debug_processin - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - void *object = head; - int cnt = 0; -- unsigned long flags; -+ unsigned long flags, flags2; - int ret = 0; - - spin_lock_irqsave(&n->list_lock, flags); -- slab_lock(page); -+ slab_lock(page, &flags2); - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!check_slab(s, page)) -@@ -1292,7 +1304,7 @@ static noinline int free_debug_processin - slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", - bulk_cnt, cnt); - -- slab_unlock(page); -+ slab_unlock(page, &flags2); - spin_unlock_irqrestore(&n->list_lock, flags); - if (!ret) - slab_fix(s, "Object at 0x%p not freed", object); -@@ -4070,9 +4082,10 @@ static void list_slab_objects(struct kme - void *addr = page_address(page); - unsigned long *map; - void *p; -+ unsigned long flags; - - slab_err(s, page, text, s->name); -- slab_lock(page); -+ slab_lock(page, &flags); - - map = get_map(s, page); - for_each_object(p, s, addr, page->objects) { -@@ -4083,7 +4096,7 @@ static void list_slab_objects(struct kme - } - } 
- put_map(map); -- slab_unlock(page); -+ slab_unlock(page, &flags); - #endif - } - -@@ -4815,8 +4828,9 @@ static void validate_slab(struct kmem_ca - { - void *p; - void *addr = page_address(page); -+ unsigned long flags; - -- slab_lock(page); -+ slab_lock(page, &flags); - - if (!check_slab(s, page) || !on_freelist(s, page, NULL)) - goto unlock; -@@ -4831,7 +4845,7 @@ static void validate_slab(struct kmem_ca - break; - } - unlock: -- slab_unlock(page); -+ slab_unlock(page, &flags); - } - - static int validate_slab_node(struct kmem_cache *s, diff --git a/patches/0031-mm-slub-protect-put_cpu_partial-with-disabled-irqs-i.patch b/patches/0031-mm-slub-protect-put_cpu_partial-with-disabled-irqs-i.patch new file mode 100644 index 000000000000..84d123a5489d --- /dev/null +++ b/patches/0031-mm-slub-protect-put_cpu_partial-with-disabled-irqs-i.patch @@ -0,0 +1,167 @@ +From: Vlastimil Babka +Date: Wed, 28 Jul 2021 12:26:27 +0200 +Subject: [PATCH 31/33] mm, slub: protect put_cpu_partial() with disabled irqs + instead of cmpxchg + +Jann Horn reported [1] the following theoretically possible race: + + task A: put_cpu_partial() calls preempt_disable() + task A: oldpage = this_cpu_read(s->cpu_slab->partial) + interrupt: kfree() reaches unfreeze_partials() and discards the page + task B (on another CPU): reallocates page as page cache + task A: reads page->pages and page->pobjects, which are actually + halves of the pointer page->lru.prev + task B (on another CPU): frees page + interrupt: allocates page as SLUB page and places it on the percpu partial list + task A: this_cpu_cmpxchg() succeeds + + which would cause page->pages and page->pobjects to end up containing + halves of pointers that would then influence when put_cpu_partial() + happens and show up in root-only sysfs files. Maybe that's acceptable, + I don't know. But there should probably at least be a comment for now + to point out that we're reading union fields of a page that might be + in a completely different state. + +Additionally, the this_cpu_cmpxchg() approach in put_cpu_partial() is only safe +against s->cpu_slab->partial manipulation in ___slab_alloc() if the latter +disables irqs, otherwise a __slab_free() in an irq handler could call +put_cpu_partial() in the middle of ___slab_alloc() manipulating ->partial +and corrupt it. This becomes an issue on RT after a local_lock is introduced +in later patch. The fix means taking the local_lock also in put_cpu_partial() +on RT. + +After debugging this issue, Mike Galbraith suggested [2] that to avoid +different locking schemes on RT and !RT, we can just protect put_cpu_partial() +with disabled irqs (to be converted to local_lock_irqsave() later) everywhere. +This should be acceptable as it's not a fast path, and moving the actual +partial unfreezing outside of the irq disabled section makes it short, and with +the retry loop gone the code can be also simplified. In addition, the race +reported by Jann should no longer be possible. 
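Condensed from the hunk below, put_cpu_partial() now does all of the percpu ->partial manipulation in a single irq-disabled section (turned into a local_lock section by the last patch of the series) and postpones the expensive unfreeze until after irqs are restored:

	local_irq_save(flags);

	oldpage = this_cpu_read(s->cpu_slab->partial);
	if (oldpage && drain && oldpage->pobjects > slub_cpu_partial(s)) {
		/* list is full: unfreeze it outside the critical section */
		page_to_unfreeze = oldpage;
		oldpage = NULL;
	}

	page->pages = (oldpage ? oldpage->pages : 0) + 1;
	page->pobjects = (oldpage ? oldpage->pobjects : 0) +
			 page->objects - page->inuse;
	page->next = oldpage;
	this_cpu_write(s->cpu_slab->partial, page);

	local_irq_restore(flags);

	if (page_to_unfreeze) {
		__unfreeze_partials(s, page_to_unfreeze);
		stat(s, CPU_PARTIAL_DRAIN);
	}

With the this_cpu_cmpxchg() retry loop gone, a __slab_free() from an irq handler can no longer slip in between reading and writing ->partial.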
+ +[1] https://lore.kernel.org/lkml/CAG48ez1mvUuXwg0YPH5ANzhQLpbphqk-ZS+jbRz+H66fvm4FcA@mail.gmail.com/ +[2] https://lore.kernel.org/linux-rt-users/e3470ab357b48bccfbd1f5133b982178a7d2befb.camel@gmx.de/ + +Reported-by: Jann Horn +Suggested-by: Mike Galbraith +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 83 +++++++++++++++++++++++++++++++++----------------------------- + 1 file changed, 45 insertions(+), 38 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2025,7 +2025,12 @@ static inline void *acquire_slab(struct + return freelist; + } + ++#ifdef CONFIG_SLUB_CPU_PARTIAL + static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); ++#else ++static inline void put_cpu_partial(struct kmem_cache *s, struct page *page, ++ int drain) { } ++#endif + static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); + + /* +@@ -2459,14 +2464,6 @@ static void unfreeze_partials_cpu(struct + __unfreeze_partials(s, partial_page); + } + +-#else /* CONFIG_SLUB_CPU_PARTIAL */ +- +-static inline void unfreeze_partials(struct kmem_cache *s) { } +-static inline void unfreeze_partials_cpu(struct kmem_cache *s, +- struct kmem_cache_cpu *c) { } +- +-#endif /* CONFIG_SLUB_CPU_PARTIAL */ +- + /* + * Put a page that was just frozen (in __slab_free|get_partial_node) into a + * partial page slot if available. +@@ -2476,46 +2473,56 @@ static inline void unfreeze_partials_cpu + */ + static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) + { +-#ifdef CONFIG_SLUB_CPU_PARTIAL + struct page *oldpage; +- int pages; +- int pobjects; ++ struct page *page_to_unfreeze = NULL; ++ unsigned long flags; ++ int pages = 0; ++ int pobjects = 0; + +- preempt_disable(); +- do { +- pages = 0; +- pobjects = 0; +- oldpage = this_cpu_read(s->cpu_slab->partial); ++ local_irq_save(flags); ++ ++ oldpage = this_cpu_read(s->cpu_slab->partial); + +- if (oldpage) { ++ if (oldpage) { ++ if (drain && oldpage->pobjects > slub_cpu_partial(s)) { ++ /* ++ * Partial array is full. Move the existing set to the ++ * per node partial list. Postpone the actual unfreezing ++ * outside of the critical section. ++ */ ++ page_to_unfreeze = oldpage; ++ oldpage = NULL; ++ } else { + pobjects = oldpage->pobjects; + pages = oldpage->pages; +- if (drain && pobjects > slub_cpu_partial(s)) { +- /* +- * partial array is full. Move the existing +- * set to the per node partial list. 
+- */ +- unfreeze_partials(s); +- oldpage = NULL; +- pobjects = 0; +- pages = 0; +- stat(s, CPU_PARTIAL_DRAIN); +- } + } ++ } + +- pages++; +- pobjects += page->objects - page->inuse; ++ pages++; ++ pobjects += page->objects - page->inuse; + +- page->pages = pages; +- page->pobjects = pobjects; +- page->next = oldpage; +- +- } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) +- != oldpage); +- preempt_enable(); +-#endif /* CONFIG_SLUB_CPU_PARTIAL */ ++ page->pages = pages; ++ page->pobjects = pobjects; ++ page->next = oldpage; ++ ++ this_cpu_write(s->cpu_slab->partial, page); ++ ++ local_irq_restore(flags); ++ ++ if (page_to_unfreeze) { ++ __unfreeze_partials(s, page_to_unfreeze); ++ stat(s, CPU_PARTIAL_DRAIN); ++ } + } + ++#else /* CONFIG_SLUB_CPU_PARTIAL */ ++ ++static inline void unfreeze_partials(struct kmem_cache *s) { } ++static inline void unfreeze_partials_cpu(struct kmem_cache *s, ++ struct kmem_cache_cpu *c) { } ++ ++#endif /* CONFIG_SLUB_CPU_PARTIAL */ ++ + static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) + { + unsigned long flags; diff --git a/patches/0032-mm-slub-make-slab_lock-disable-irqs-with-PREEMPT_RT.patch b/patches/0032-mm-slub-make-slab_lock-disable-irqs-with-PREEMPT_RT.patch deleted file mode 100644 index a87acd032ff6..000000000000 --- a/patches/0032-mm-slub-make-slab_lock-disable-irqs-with-PREEMPT_RT.patch +++ /dev/null @@ -1,59 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:29 +0200 -Subject: [PATCH 32/35] mm, slub: make slab_lock() disable irqs with PREEMPT_RT - -We need to disable irqs around slab_lock() (a bit spinlock) to make it -irq-safe. The calls to slab_lock() are nested under spin_lock_irqsave() which -doesn't disable irqs on PREEMPT_RT, so add explicit disabling with PREEMPT_RT. - -We also distinguish cmpxchg_double_slab() where we do the disabling explicitly -and __cmpxchg_double_slab() for contexts with already disabled irqs. However -these context are also typically spin_lock_irqsave() thus insufficient on -PREEMPT_RT. Thus, change __cmpxchg_double_slab() to be same as -cmpxchg_double_slab() on PREEMPT_RT. - -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 13 +++++++++---- - 1 file changed, 9 insertions(+), 4 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -380,12 +380,12 @@ static __always_inline void - static __always_inline void - slab_lock(struct page *page, unsigned long *flags) - { -- __slab_lock(page, flags, false); -+ __slab_lock(page, flags, IS_ENABLED(CONFIG_PREEMPT_RT)); - } - - static __always_inline void slab_unlock(struct page *page, unsigned long *flags) - { -- __slab_unlock(page, flags, false); -+ __slab_unlock(page, flags, IS_ENABLED(CONFIG_PREEMPT_RT)); - } - - static inline bool ___cmpxchg_double_slab(struct kmem_cache *s, struct page *page, -@@ -429,14 +429,19 @@ static inline bool ___cmpxchg_double_sla - return false; - } - --/* Interrupts must be disabled (for the fallback code to work right) */ -+/* -+ * Interrupts must be disabled (for the fallback code to work right), typically -+ * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different -+ * so we disable interrupts explicitly here. 
-+ */ - static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new, - const char *n) - { - return ___cmpxchg_double_slab(s, page, freelist_old, counters_old, -- freelist_new, counters_new, n, false); -+ freelist_new, counters_new, n, -+ IS_ENABLED(CONFIG_PREEMPT_RT)); - } - - static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, diff --git a/patches/0032-mm-slub-use-migrate_disable-on-PREEMPT_RT.patch b/patches/0032-mm-slub-use-migrate_disable-on-PREEMPT_RT.patch new file mode 100644 index 000000000000..abeb6ac4d6a7 --- /dev/null +++ b/patches/0032-mm-slub-use-migrate_disable-on-PREEMPT_RT.patch @@ -0,0 +1,120 @@ +From: Vlastimil Babka +Date: Fri, 21 May 2021 14:03:23 +0200 +Subject: [PATCH 32/33] mm, slub: use migrate_disable() on PREEMPT_RT + +We currently use preempt_disable() (directly or via get_cpu_ptr()) to stabilize +the pointer to kmem_cache_cpu. On PREEMPT_RT this would be incompatible with +the list_lock spinlock. We can use migrate_disable() instead, but that +increases overhead on !PREEMPT_RT as it's an unconditional function call. + +In order to get the best available mechanism on both PREEMPT_RT and +!PREEMPT_RT, introduce private slub_get_cpu_ptr() and slub_put_cpu_ptr() +wrappers and use them. + +Signed-off-by: Vlastimil Babka +--- + mm/slub.c | 39 ++++++++++++++++++++++++++++++--------- + 1 file changed, 30 insertions(+), 9 deletions(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -118,6 +118,26 @@ + * the fast path and disables lockless freelists. + */ + ++/* ++ * We could simply use migrate_disable()/enable() but as long as it's a ++ * function call even on !PREEMPT_RT, use inline preempt_disable() there. ++ */ ++#ifndef CONFIG_PREEMPT_RT ++#define slub_get_cpu_ptr(var) get_cpu_ptr(var) ++#define slub_put_cpu_ptr(var) put_cpu_ptr(var) ++#else ++#define slub_get_cpu_ptr(var) \ ++({ \ ++ migrate_disable(); \ ++ this_cpu_ptr(var); \ ++}) ++#define slub_put_cpu_ptr(var) \ ++do { \ ++ (void)(var); \ ++ migrate_enable(); \ ++} while (0) ++#endif ++ + #ifdef CONFIG_SLUB_DEBUG + #ifdef CONFIG_SLUB_DEBUG_ON + DEFINE_STATIC_KEY_TRUE(slub_debug_enabled); +@@ -2852,7 +2872,7 @@ static void *___slab_alloc(struct kmem_c + if (unlikely(!pfmemalloc_match_unsafe(page, gfpflags))) + goto deactivate_slab; + +- /* must check again c->page in case IRQ handler changed it */ ++ /* must check again c->page in case we got preempted and it changed */ + local_irq_save(flags); + if (unlikely(page != c->page)) { + local_irq_restore(flags); +@@ -2911,7 +2931,8 @@ static void *___slab_alloc(struct kmem_c + } + if (unlikely(!slub_percpu_partial(c))) { + local_irq_restore(flags); +- goto new_objects; /* stolen by an IRQ handler */ ++ /* we were preempted and partial list got empty */ ++ goto new_objects; + } + + page = c->page = slub_percpu_partial(c); +@@ -2927,9 +2948,9 @@ static void *___slab_alloc(struct kmem_c + if (freelist) + goto check_new_page; + +- put_cpu_ptr(s->cpu_slab); ++ slub_put_cpu_ptr(s->cpu_slab); + page = new_slab(s, gfpflags, node); +- c = get_cpu_ptr(s->cpu_slab); ++ c = slub_get_cpu_ptr(s->cpu_slab); + + if (unlikely(!page)) { + slab_out_of_memory(s, gfpflags, node); +@@ -3012,12 +3033,12 @@ static void *__slab_alloc(struct kmem_ca + * cpu before disabling preemption. Need to reload cpu area + * pointer. 
+ */ +- c = get_cpu_ptr(s->cpu_slab); ++ c = slub_get_cpu_ptr(s->cpu_slab); + #endif + + p = ___slab_alloc(s, gfpflags, node, addr, c); + #ifdef CONFIG_PREEMPT_COUNT +- put_cpu_ptr(s->cpu_slab); ++ slub_put_cpu_ptr(s->cpu_slab); + #endif + return p; + } +@@ -3546,7 +3567,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca + * IRQs, which protects against PREEMPT and interrupts + * handlers invoking normal fastpath. + */ +- c = get_cpu_ptr(s->cpu_slab); ++ c = slub_get_cpu_ptr(s->cpu_slab); + local_irq_disable(); + + for (i = 0; i < size; i++) { +@@ -3592,7 +3613,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca + } + c->tid = next_tid(c->tid); + local_irq_enable(); +- put_cpu_ptr(s->cpu_slab); ++ slub_put_cpu_ptr(s->cpu_slab); + + /* + * memcg and kmem_cache debug support and memory initialization. +@@ -3602,7 +3623,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca + slab_want_init_on_alloc(flags, s)); + return i; + error: +- put_cpu_ptr(s->cpu_slab); ++ slub_put_cpu_ptr(s->cpu_slab); + slab_post_alloc_hook(s, objcg, flags, i, p, false); + __kmem_cache_free_bulk(s, i, p); + return 0; diff --git a/patches/0033-mm-slub-convert-kmem_cpu_slab-protection-to-local_lo.patch b/patches/0033-mm-slub-convert-kmem_cpu_slab-protection-to-local_lo.patch new file mode 100644 index 000000000000..0c04004739fc --- /dev/null +++ b/patches/0033-mm-slub-convert-kmem_cpu_slab-protection-to-local_lo.patch @@ -0,0 +1,420 @@ +From: Vlastimil Babka +Date: Sat, 22 May 2021 01:59:38 +0200 +Subject: [PATCH 33/33] mm, slub: convert kmem_cpu_slab protection to + local_lock + +Embed local_lock into struct kmem_cpu_slab and use the irq-safe versions of +local_lock instead of plain local_irq_save/restore. On !PREEMPT_RT that's +equivalent, with better lockdep visibility. On PREEMPT_RT that means better +preemption. + +However, the cost on PREEMPT_RT is the loss of lockless fast paths which only +work with cpu freelist. Those are designed to detect and recover from being +preempted by other conflicting operations (both fast or slow path), but the +slow path operations assume they cannot be preempted by a fast path operation, +which is guaranteed naturally with disabled irqs. With local locks on +PREEMPT_RT, the fast paths now also need to take the local lock to avoid races. + +In the allocation fastpath slab_alloc_node() we can just defer to the slowpath +__slab_alloc() which also works with cpu freelist, but under the local lock. +In the free fastpath do_slab_free() we have to add a new local lock protected +version of freeing to the cpu freelist, as the existing slowpath only works +with the page freelist. + +Also update the comment about locking scheme in SLUB to reflect changes done +by this series. + +[ Mike Galbraith : use local_lock() without irq in PREEMPT_RT + scope; debugging of RT crashes resulting in put_cpu_partial() locking changes ] +Signed-off-by: Vlastimil Babka +--- + include/linux/slub_def.h | 6 + + mm/slub.c | 146 +++++++++++++++++++++++++++++++++++------------ + 2 files changed, 117 insertions(+), 35 deletions(-) + +--- a/include/linux/slub_def.h ++++ b/include/linux/slub_def.h +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + + enum stat_item { + ALLOC_FASTPATH, /* Allocation from cpu slab */ +@@ -40,6 +41,10 @@ enum stat_item { + CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ + NR_SLUB_STAT_ITEMS }; + ++/* ++ * When changing the layout, make sure freelist and tid are still compatible ++ * with this_cpu_cmpxchg_double() alignment requirements. 
++ */ + struct kmem_cache_cpu { + void **freelist; /* Pointer to next available object */ + unsigned long tid; /* Globally unique transaction id */ +@@ -47,6 +52,7 @@ struct kmem_cache_cpu { + #ifdef CONFIG_SLUB_CPU_PARTIAL + struct page *partial; /* Partially allocated frozen slabs */ + #endif ++ local_lock_t lock; /* Protects the fields above */ + #ifdef CONFIG_SLUB_STATS + unsigned stat[NR_SLUB_STAT_ITEMS]; + #endif +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -46,13 +46,21 @@ + /* + * Lock order: + * 1. slab_mutex (Global Mutex) +- * 2. node->list_lock +- * 3. slab_lock(page) (Only on some arches and for debugging) ++ * 2. node->list_lock (Spinlock) ++ * 3. kmem_cache->cpu_slab->lock (Local lock) ++ * 4. slab_lock(page) (Only on some arches or for debugging) ++ * 5. object_map_lock (Only for debugging) + * + * slab_mutex + * + * The role of the slab_mutex is to protect the list of all the slabs + * and to synchronize major metadata changes to slab cache structures. ++ * Also synchronizes memory hotplug callbacks. ++ * ++ * slab_lock ++ * ++ * The slab_lock is a wrapper around the page lock, thus it is a bit ++ * spinlock. + * + * The slab_lock is only used for debugging and on arches that do not + * have the ability to do a cmpxchg_double. It only protects: +@@ -61,6 +69,8 @@ + * C. page->objects -> Number of objects in page + * D. page->frozen -> frozen state + * ++ * Frozen slabs ++ * + * If a slab is frozen then it is exempt from list management. It is not + * on any list except per cpu partial list. The processor that froze the + * slab is the one who can perform list operations on the page. Other +@@ -68,6 +78,8 @@ + * froze the slab is the only one that can retrieve the objects from the + * page's freelist. + * ++ * list_lock ++ * + * The list_lock protects the partial and full list on each node and + * the partial slab counter. If taken then no new slabs may be added or + * removed from the lists nor make the number of partial slabs be modified. +@@ -79,10 +91,36 @@ + * slabs, operations can continue without any centralized lock. F.e. + * allocating a long series of objects that fill up slabs does not require + * the list lock. +- * Interrupts are disabled during allocation and deallocation in order to +- * make the slab allocator safe to use in the context of an irq. In addition +- * interrupts are disabled to ensure that the processor does not change +- * while handling per_cpu slabs, due to kernel preemption. ++ * ++ * cpu_slab->lock local lock ++ * ++ * This locks protect slowpath manipulation of all kmem_cache_cpu fields ++ * except the stat counters. This is a percpu structure manipulated only by ++ * the local cpu, so the lock protects against being preempted or interrupted ++ * by an irq. Fast path operations rely on lockless operations instead. ++ * On PREEMPT_RT, the local lock does not actually disable irqs (and thus ++ * prevent the lockless operations), so fastpath operations also need to take ++ * the lock and are no longer lockless. ++ * ++ * lockless fastpaths ++ * ++ * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free()) ++ * are fully lockless when satisfied from the percpu slab (and when ++ * cmpxchg_double is possible to use, otherwise slab_lock is taken). ++ * They also don't disable preemption or migration or irqs. They rely on ++ * the transaction id (tid) field to detect being preempted or moved to ++ * another cpu. 
++ * ++ * irq, preemption, migration considerations ++ * ++ * Interrupts are disabled as part of list_lock or local_lock operations, or ++ * around the slab_lock operation, in order to make the slab allocator safe ++ * to use in the context of an irq. ++ * ++ * In addition, preemption (or migration on PREEMPT_RT) is disabled in the ++ * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the ++ * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer ++ * doesn't have to be revalidated in each section protected by the local lock. + * + * SLUB assigns one slab for allocation to each processor. + * Allocations only occur from these slabs called cpu slabs. +@@ -2250,9 +2288,13 @@ static inline void note_cmpxchg_failure( + static void init_kmem_cache_cpus(struct kmem_cache *s) + { + int cpu; ++ struct kmem_cache_cpu *c; + +- for_each_possible_cpu(cpu) +- per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); ++ for_each_possible_cpu(cpu) { ++ c = per_cpu_ptr(s->cpu_slab, cpu); ++ local_lock_init(&c->lock); ++ c->tid = init_tid(cpu); ++ } + } + + /* +@@ -2463,10 +2505,10 @@ static void unfreeze_partials(struct kme + struct page *partial_page; + unsigned long flags; + +- local_irq_save(flags); ++ local_lock_irqsave(&s->cpu_slab->lock, flags); + partial_page = this_cpu_read(s->cpu_slab->partial); + this_cpu_write(s->cpu_slab->partial, NULL); +- local_irq_restore(flags); ++ local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + if (partial_page) + __unfreeze_partials(s, partial_page); +@@ -2499,7 +2541,7 @@ static void put_cpu_partial(struct kmem_ + int pages = 0; + int pobjects = 0; + +- local_irq_save(flags); ++ local_lock_irqsave(&s->cpu_slab->lock, flags); + + oldpage = this_cpu_read(s->cpu_slab->partial); + +@@ -2527,7 +2569,7 @@ static void put_cpu_partial(struct kmem_ + + this_cpu_write(s->cpu_slab->partial, page); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + if (page_to_unfreeze) { + __unfreeze_partials(s, page_to_unfreeze); +@@ -2549,7 +2591,7 @@ static inline void flush_slab(struct kme + struct page *page; + void *freelist; + +- local_irq_save(flags); ++ local_lock_irqsave(&s->cpu_slab->lock, flags); + + page = c->page; + freelist = c->freelist; +@@ -2558,7 +2600,7 @@ static inline void flush_slab(struct kme + c->freelist = NULL; + c->tid = next_tid(c->tid); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + if (page) { + deactivate_slab(s, page, freelist); +@@ -2780,8 +2822,6 @@ static inline bool pfmemalloc_match_unsa + * The page is still frozen if the return value is not NULL. + * + * If this function returns NULL then the page has been unfrozen. +- * +- * This function must be called with interrupt disabled. 
+ */ + static inline void *get_freelist(struct kmem_cache *s, struct page *page) + { +@@ -2789,6 +2829,8 @@ static inline void *get_freelist(struct + unsigned long counters; + void *freelist; + ++ lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); ++ + do { + freelist = page->freelist; + counters = page->counters; +@@ -2873,9 +2915,9 @@ static void *___slab_alloc(struct kmem_c + goto deactivate_slab; + + /* must check again c->page in case we got preempted and it changed */ +- local_irq_save(flags); ++ local_lock_irqsave(&s->cpu_slab->lock, flags); + if (unlikely(page != c->page)) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&s->cpu_slab->lock, flags); + goto reread_page; + } + freelist = c->freelist; +@@ -2886,7 +2928,7 @@ static void *___slab_alloc(struct kmem_c + + if (!freelist) { + c->page = NULL; +- local_irq_restore(flags); ++ local_unlock_irqrestore(&s->cpu_slab->lock, flags); + stat(s, DEACTIVATE_BYPASS); + goto new_slab; + } +@@ -2895,7 +2937,7 @@ static void *___slab_alloc(struct kmem_c + + load_freelist: + +- lockdep_assert_irqs_disabled(); ++ lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); + + /* + * freelist is pointing to the list of objects to be used. +@@ -2905,39 +2947,39 @@ static void *___slab_alloc(struct kmem_c + VM_BUG_ON(!c->page->frozen); + c->freelist = get_freepointer(s, freelist); + c->tid = next_tid(c->tid); +- local_irq_restore(flags); ++ local_unlock_irqrestore(&s->cpu_slab->lock, flags); + return freelist; + + deactivate_slab: + +- local_irq_save(flags); ++ local_lock_irqsave(&s->cpu_slab->lock, flags); + if (page != c->page) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&s->cpu_slab->lock, flags); + goto reread_page; + } + freelist = c->freelist; + c->page = NULL; + c->freelist = NULL; +- local_irq_restore(flags); ++ local_unlock_irqrestore(&s->cpu_slab->lock, flags); + deactivate_slab(s, page, freelist); + + new_slab: + + if (slub_percpu_partial(c)) { +- local_irq_save(flags); ++ local_lock_irqsave(&s->cpu_slab->lock, flags); + if (unlikely(c->page)) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&s->cpu_slab->lock, flags); + goto reread_page; + } + if (unlikely(!slub_percpu_partial(c))) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&s->cpu_slab->lock, flags); + /* we were preempted and partial list got empty */ + goto new_objects; + } + + page = c->page = slub_percpu_partial(c); + slub_set_percpu_partial(c, page); +- local_irq_restore(flags); ++ local_unlock_irqrestore(&s->cpu_slab->lock, flags); + stat(s, CPU_PARTIAL_ALLOC); + goto redo; + } +@@ -2990,7 +3032,7 @@ static void *___slab_alloc(struct kmem_c + + retry_load_page: + +- local_irq_save(flags); ++ local_lock_irqsave(&s->cpu_slab->lock, flags); + if (unlikely(c->page)) { + void *flush_freelist = c->freelist; + struct page *flush_page = c->page; +@@ -2999,7 +3041,7 @@ static void *___slab_alloc(struct kmem_c + c->freelist = NULL; + c->tid = next_tid(c->tid); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + deactivate_slab(s, flush_page, flush_freelist); + +@@ -3118,7 +3160,15 @@ static __always_inline void *slab_alloc_ + + object = c->freelist; + page = c->page; +- if (unlikely(!object || !page || !node_match(page, node))) { ++ /* ++ * We cannot use the lockless fastpath on PREEMPT_RT because if a ++ * slowpath has taken the local_lock_irqsave(), it is not protected ++ * against a fast path operation in an irq handler. So we need to take ++ * the slow path which uses local_lock. 
It is still relatively fast if ++ * there is a suitable cpu freelist. ++ */ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) || ++ unlikely(!object || !page || !node_match(page, node))) { + object = __slab_alloc(s, gfpflags, node, addr, c); + } else { + void *next_object = get_freepointer_safe(s, object); +@@ -3378,6 +3428,7 @@ static __always_inline void do_slab_free + barrier(); + + if (likely(page == c->page)) { ++#ifndef CONFIG_PREEMPT_RT + void **freelist = READ_ONCE(c->freelist); + + set_freepointer(s, tail_obj, freelist); +@@ -3390,6 +3441,31 @@ static __always_inline void do_slab_free + note_cmpxchg_failure("slab_free", s, tid); + goto redo; + } ++#else /* CONFIG_PREEMPT_RT */ ++ /* ++ * We cannot use the lockless fastpath on PREEMPT_RT because if ++ * a slowpath has taken the local_lock_irqsave(), it is not ++ * protected against a fast path operation in an irq handler. So ++ * we need to take the local_lock. We shouldn't simply defer to ++ * __slab_free() as that wouldn't use the cpu freelist at all. ++ */ ++ void **freelist; ++ ++ local_lock(&s->cpu_slab->lock); ++ c = this_cpu_ptr(s->cpu_slab); ++ if (unlikely(page != c->page)) { ++ local_unlock(&s->cpu_slab->lock); ++ goto redo; ++ } ++ tid = c->tid; ++ freelist = c->freelist; ++ ++ set_freepointer(s, tail_obj, freelist); ++ c->freelist = head; ++ c->tid = next_tid(tid); ++ ++ local_unlock(&s->cpu_slab->lock); ++#endif + stat(s, FREE_FASTPATH); + } else + __slab_free(s, page, head, tail_obj, cnt, addr); +@@ -3568,7 +3644,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca + * handlers invoking normal fastpath. + */ + c = slub_get_cpu_ptr(s->cpu_slab); +- local_irq_disable(); ++ local_lock_irq(&s->cpu_slab->lock); + + for (i = 0; i < size; i++) { + void *object = kfence_alloc(s, s->object_size, flags); +@@ -3589,7 +3665,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca + */ + c->tid = next_tid(c->tid); + +- local_irq_enable(); ++ local_unlock_irq(&s->cpu_slab->lock); + + /* + * Invoking slow path likely have side-effect +@@ -3603,7 +3679,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca + c = this_cpu_ptr(s->cpu_slab); + maybe_wipe_obj_freeptr(s, p[i]); + +- local_irq_disable(); ++ local_lock_irq(&s->cpu_slab->lock); + + continue; /* goto for-loop */ + } +@@ -3612,7 +3688,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca + maybe_wipe_obj_freeptr(s, p[i]); + } + c->tid = next_tid(c->tid); +- local_irq_enable(); ++ local_unlock_irq(&s->cpu_slab->lock); + slub_put_cpu_ptr(s->cpu_slab); + + /* diff --git a/patches/0033-mm-slub-protect-put_cpu_partial-with-disabled-irqs-i.patch b/patches/0033-mm-slub-protect-put_cpu_partial-with-disabled-irqs-i.patch deleted file mode 100644 index 7ecc48a04726..000000000000 --- a/patches/0033-mm-slub-protect-put_cpu_partial-with-disabled-irqs-i.patch +++ /dev/null @@ -1,168 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:30 +0200 -Subject: [PATCH 33/35] mm, slub: protect put_cpu_partial() with disabled irqs - instead of cmpxchg - -Jann Horn reported [1] the following theoretically possible race: - - task A: put_cpu_partial() calls preempt_disable() - task A: oldpage = this_cpu_read(s->cpu_slab->partial) - interrupt: kfree() reaches unfreeze_partials() and discards the page - task B (on another CPU): reallocates page as page cache - task A: reads page->pages and page->pobjects, which are actually - halves of the pointer page->lru.prev - task B (on another CPU): frees page - interrupt: allocates page as SLUB page and places it on the percpu partial list - task A: this_cpu_cmpxchg() succeeds - - which would cause 
page->pages and page->pobjects to end up containing - halves of pointers that would then influence when put_cpu_partial() - happens and show up in root-only sysfs files. Maybe that's acceptable, - I don't know. But there should probably at least be a comment for now - to point out that we're reading union fields of a page that might be - in a completely different state. - -Additionally, the this_cpu_cmpxchg() approach in put_cpu_partial() is only safe -against s->cpu_slab->partial manipulation in ___slab_alloc() if the latter -disables irqs, otherwise a __slab_free() in an irq handler could call -put_cpu_partial() in the middle of ___slab_alloc() manipulating ->partial -and corrupt it. This becomes an issue on RT after a local_lock is introduced -in later patch. The fix means taking the local_lock also in put_cpu_partial() -on RT. - -After debugging this issue, Mike Galbraith suggested [2] that to avoid -different locking schemes on RT and !RT, we can just protect put_cpu_partial() -with disabled irqs (to be converted to local_lock_irqsave() later) everywhere. -This should be acceptable as it's not a fast path, and moving the actual -partial unfreezing outside of the irq disabled section makes it short, and with -the retry loop gone the code can be also simplified. In addition, the race -reported by Jann should no longer be possible. - -[1] https://lore.kernel.org/lkml/CAG48ez1mvUuXwg0YPH5ANzhQLpbphqk-ZS+jbRz+H66fvm4FcA@mail.gmail.com/ -[2] https://lore.kernel.org/linux-rt-users/e3470ab357b48bccfbd1f5133b982178a7d2befb.camel@gmx.de/ - -Reported-by: Jann Horn -Suggested-by: Mike Galbraith -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 83 +++++++++++++++++++++++++++++++++----------------------------- - 1 file changed, 45 insertions(+), 38 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -2006,7 +2006,12 @@ static inline void *acquire_slab(struct - return freelist; - } - -+#ifdef CONFIG_SLUB_CPU_PARTIAL - static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); -+#else -+static inline void put_cpu_partial(struct kmem_cache *s, struct page *page, -+ int drain) { } -+#endif - static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); - - /* -@@ -2440,14 +2445,6 @@ static void unfreeze_partials_cpu(struct - __unfreeze_partials(s, partial_page); - } - --#else /* CONFIG_SLUB_CPU_PARTIAL */ -- --static inline void unfreeze_partials(struct kmem_cache *s) { } --static inline void unfreeze_partials_cpu(struct kmem_cache *s, -- struct kmem_cache_cpu *c) { } -- --#endif /* CONFIG_SLUB_CPU_PARTIAL */ -- - /* - * Put a page that was just frozen (in __slab_free|get_partial_node) into a - * partial page slot if available. -@@ -2457,46 +2454,56 @@ static inline void unfreeze_partials_cpu - */ - static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) - { --#ifdef CONFIG_SLUB_CPU_PARTIAL - struct page *oldpage; -- int pages; -- int pobjects; -+ struct page *page_to_unfreeze = NULL; -+ unsigned long flags; -+ int pages = 0; -+ int pobjects = 0; - -- preempt_disable(); -- do { -- pages = 0; -- pobjects = 0; -- oldpage = this_cpu_read(s->cpu_slab->partial); -+ local_irq_save(flags); -+ -+ oldpage = this_cpu_read(s->cpu_slab->partial); - -- if (oldpage) { -+ if (oldpage) { -+ if (drain && oldpage->pobjects > slub_cpu_partial(s)) { -+ /* -+ * Partial array is full. Move the existing set to the -+ * per node partial list. Postpone the actual unfreezing -+ * outside of the critical section. 
-+ */ -+ page_to_unfreeze = oldpage; -+ oldpage = NULL; -+ } else { - pobjects = oldpage->pobjects; - pages = oldpage->pages; -- if (drain && pobjects > slub_cpu_partial(s)) { -- /* -- * partial array is full. Move the existing -- * set to the per node partial list. -- */ -- unfreeze_partials(s); -- oldpage = NULL; -- pobjects = 0; -- pages = 0; -- stat(s, CPU_PARTIAL_DRAIN); -- } - } -+ } - -- pages++; -- pobjects += page->objects - page->inuse; -+ pages++; -+ pobjects += page->objects - page->inuse; - -- page->pages = pages; -- page->pobjects = pobjects; -- page->next = oldpage; -- -- } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) -- != oldpage); -- preempt_enable(); --#endif /* CONFIG_SLUB_CPU_PARTIAL */ -+ page->pages = pages; -+ page->pobjects = pobjects; -+ page->next = oldpage; -+ -+ this_cpu_write(s->cpu_slab->partial, page); -+ -+ local_irq_restore(flags); -+ -+ if (page_to_unfreeze) { -+ __unfreeze_partials(s, page_to_unfreeze); -+ stat(s, CPU_PARTIAL_DRAIN); -+ } - } - -+#else /* CONFIG_SLUB_CPU_PARTIAL */ -+ -+static inline void unfreeze_partials(struct kmem_cache *s) { } -+static inline void unfreeze_partials_cpu(struct kmem_cache *s, -+ struct kmem_cache_cpu *c) { } -+ -+#endif /* CONFIG_SLUB_CPU_PARTIAL */ -+ - static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c, - bool lock) - { diff --git a/patches/0034-mm-slub-use-migrate_disable-on-PREEMPT_RT.patch b/patches/0034-mm-slub-use-migrate_disable-on-PREEMPT_RT.patch deleted file mode 100644 index 07772dcd3f4a..000000000000 --- a/patches/0034-mm-slub-use-migrate_disable-on-PREEMPT_RT.patch +++ /dev/null @@ -1,121 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:31 +0200 -Subject: [PATCH 34/35] mm, slub: use migrate_disable() on PREEMPT_RT - -We currently use preempt_disable() (directly or via get_cpu_ptr()) to stabilize -the pointer to kmem_cache_cpu. On PREEMPT_RT this would be incompatible with -the list_lock spinlock. We can use migrate_disable() instead, but that -increases overhead on !PREEMPT_RT as it's an unconditional function call. - -In order to get the best available mechanism on both PREEMPT_RT and -!PREEMPT_RT, introduce private slub_get_cpu_ptr() and slub_put_cpu_ptr() -wrappers and use them. - -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - mm/slub.c | 39 ++++++++++++++++++++++++++++++--------- - 1 file changed, 30 insertions(+), 9 deletions(-) - ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -118,6 +118,26 @@ - * the fast path and disables lockless freelists. - */ - -+/* -+ * We could simply use migrate_disable()/enable() but as long as it's a -+ * function call even on !PREEMPT_RT, use inline preempt_disable() there. 
-+ */ -+#ifndef CONFIG_PREEMPT_RT -+#define slub_get_cpu_ptr(var) get_cpu_ptr(var) -+#define slub_put_cpu_ptr(var) put_cpu_ptr(var) -+#else -+#define slub_get_cpu_ptr(var) \ -+({ \ -+ migrate_disable(); \ -+ this_cpu_ptr(var); \ -+}) -+#define slub_put_cpu_ptr(var) \ -+do { \ -+ (void)(var); \ -+ migrate_enable(); \ -+} while (0) -+#endif -+ - #ifdef CONFIG_SLUB_DEBUG - #ifdef CONFIG_SLUB_DEBUG_ON - DEFINE_STATIC_KEY_TRUE(slub_debug_enabled); -@@ -2828,7 +2848,7 @@ static void *___slab_alloc(struct kmem_c - if (unlikely(!pfmemalloc_match_unsafe(page, gfpflags))) - goto deactivate_slab; - -- /* must check again c->page in case IRQ handler changed it */ -+ /* must check again c->page in case we got preempted and it changed */ - local_irq_save(flags); - if (unlikely(page != c->page)) { - local_irq_restore(flags); -@@ -2887,7 +2907,8 @@ static void *___slab_alloc(struct kmem_c - } - if (unlikely(!slub_percpu_partial(c))) { - local_irq_restore(flags); -- goto new_objects; /* stolen by an IRQ handler */ -+ /* we were preempted and partial list got empty */ -+ goto new_objects; - } - - page = c->page = slub_percpu_partial(c); -@@ -2903,9 +2924,9 @@ static void *___slab_alloc(struct kmem_c - if (freelist) - goto check_new_page; - -- put_cpu_ptr(s->cpu_slab); -+ slub_put_cpu_ptr(s->cpu_slab); - page = new_slab(s, gfpflags, node); -- c = get_cpu_ptr(s->cpu_slab); -+ c = slub_get_cpu_ptr(s->cpu_slab); - - if (unlikely(!page)) { - slab_out_of_memory(s, gfpflags, node); -@@ -2988,12 +3009,12 @@ static void *__slab_alloc(struct kmem_ca - * cpu before disabling preemption. Need to reload cpu area - * pointer. - */ -- c = get_cpu_ptr(s->cpu_slab); -+ c = slub_get_cpu_ptr(s->cpu_slab); - #endif - - p = ___slab_alloc(s, gfpflags, node, addr, c); - #ifdef CONFIG_PREEMPT_COUNT -- put_cpu_ptr(s->cpu_slab); -+ slub_put_cpu_ptr(s->cpu_slab); - #endif - return p; - } -@@ -3522,7 +3543,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca - * IRQs, which protects against PREEMPT and interrupts - * handlers invoking normal fastpath. - */ -- c = get_cpu_ptr(s->cpu_slab); -+ c = slub_get_cpu_ptr(s->cpu_slab); - local_irq_disable(); - - for (i = 0; i < size; i++) { -@@ -3568,7 +3589,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca - } - c->tid = next_tid(c->tid); - local_irq_enable(); -- put_cpu_ptr(s->cpu_slab); -+ slub_put_cpu_ptr(s->cpu_slab); - - /* - * memcg and kmem_cache debug support and memory initialization. -@@ -3578,7 +3599,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca - slab_want_init_on_alloc(flags, s)); - return i; - error: -- put_cpu_ptr(s->cpu_slab); -+ slub_put_cpu_ptr(s->cpu_slab); - slab_post_alloc_hook(s, objcg, flags, i, p, false); - __kmem_cache_free_bulk(s, i, p); - return 0; diff --git a/patches/0035-mm-slub-convert-kmem_cpu_slab-protection-to-local_lo.patch b/patches/0035-mm-slub-convert-kmem_cpu_slab-protection-to-local_lo.patch deleted file mode 100644 index 5778979fe3c7..000000000000 --- a/patches/0035-mm-slub-convert-kmem_cpu_slab-protection-to-local_lo.patch +++ /dev/null @@ -1,403 +0,0 @@ -From: Vlastimil Babka -Date: Thu, 29 Jul 2021 15:21:32 +0200 -Subject: [PATCH 35/35] mm, slub: convert kmem_cpu_slab protection to - local_lock - -Embed local_lock into struct kmem_cpu_slab and use the irq-safe versions of -local_lock instead of plain local_irq_save/restore. On !PREEMPT_RT that's -equivalent, with better lockdep visibility. On PREEMPT_RT that means better -preemption. - -However, the cost on PREEMPT_RT is the loss of lockless fast paths which only -work with cpu freelist. 
Those are designed to detect and recover from being -preempted by other conflicting operations (both fast or slow path), but the -slow path operations assume they cannot be preempted by a fast path operation, -which is guaranteed naturally with disabled irqs. With local locks on -PREEMPT_RT, the fast paths now also need to take the local lock to avoid races. - -In the allocation fastpath slab_alloc_node() we can just defer to the slowpath -__slab_alloc() which also works with cpu freelist, but under the local lock. -In the free fastpath do_slab_free() we have to add a new local lock protected -version of freeing to the cpu freelist, as the existing slowpath only works -with the page freelist. - -Also update the comment about locking scheme in SLUB to reflect changes done -by this series. - -[ Mike Galbraith : use local_lock() without irq in PREEMPT_RT - scope; debugging of RT crashes resulting in put_cpu_partial() locking changes ] -Signed-off-by: Vlastimil Babka -Signed-off-by: Sebastian Andrzej Siewior ---- - include/linux/slub_def.h | 6 + - mm/slub.c | 142 ++++++++++++++++++++++++++++++++++++----------- - 2 files changed, 115 insertions(+), 33 deletions(-) - ---- a/include/linux/slub_def.h -+++ b/include/linux/slub_def.h -@@ -10,6 +10,7 @@ - #include - #include - #include -+#include - - enum stat_item { - ALLOC_FASTPATH, /* Allocation from cpu slab */ -@@ -40,6 +41,10 @@ enum stat_item { - CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ - NR_SLUB_STAT_ITEMS }; - -+/* -+ * When changing the layout, make sure freelist and tid are still compatible -+ * with this_cpu_cmpxchg_double() alignment requirements. -+ */ - struct kmem_cache_cpu { - void **freelist; /* Pointer to next available object */ - unsigned long tid; /* Globally unique transaction id */ -@@ -47,6 +52,7 @@ struct kmem_cache_cpu { - #ifdef CONFIG_SLUB_CPU_PARTIAL - struct page *partial; /* Partially allocated frozen slabs */ - #endif -+ local_lock_t lock; /* Protects the fields above */ - #ifdef CONFIG_SLUB_STATS - unsigned stat[NR_SLUB_STAT_ITEMS]; - #endif ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -46,13 +46,21 @@ - /* - * Lock order: - * 1. slab_mutex (Global Mutex) -- * 2. node->list_lock -- * 3. slab_lock(page) (Only on some arches and for debugging) -+ * 2. node->list_lock (Spinlock) -+ * 3. kmem_cache->cpu_slab->lock (Local lock) -+ * 4. slab_lock(page) (Only on some arches or for debugging) -+ * 5. object_map_lock (Only for debugging) - * - * slab_mutex - * - * The role of the slab_mutex is to protect the list of all the slabs - * and to synchronize major metadata changes to slab cache structures. -+ * Also synchronizes memory hotplug callbacks. -+ * -+ * slab_lock -+ * -+ * The slab_lock is a wrapper around the page lock, thus it is a bit -+ * spinlock. - * - * The slab_lock is only used for debugging and on arches that do not - * have the ability to do a cmpxchg_double. It only protects: -@@ -61,6 +69,8 @@ - * C. page->objects -> Number of objects in page - * D. page->frozen -> frozen state - * -+ * Frozen slabs -+ * - * If a slab is frozen then it is exempt from list management. It is not - * on any list except per cpu partial list. The processor that froze the - * slab is the one who can perform list operations on the page. Other -@@ -68,6 +78,8 @@ - * froze the slab is the only one that can retrieve the objects from the - * page's freelist. - * -+ * list_lock -+ * - * The list_lock protects the partial and full list on each node and - * the partial slab counter. 
If taken then no new slabs may be added or - * removed from the lists nor make the number of partial slabs be modified. -@@ -79,10 +91,36 @@ - * slabs, operations can continue without any centralized lock. F.e. - * allocating a long series of objects that fill up slabs does not require - * the list lock. -- * Interrupts are disabled during allocation and deallocation in order to -- * make the slab allocator safe to use in the context of an irq. In addition -- * interrupts are disabled to ensure that the processor does not change -- * while handling per_cpu slabs, due to kernel preemption. -+ * -+ * cpu_slab->lock local lock -+ * -+ * This locks protect slowpath manipulation of all kmem_cache_cpu fields -+ * except the stat counters. This is a percpu structure manipulated only by -+ * the local cpu, so the lock protects against being preempted or interrupted -+ * by an irq. Fast path operations rely on lockless operations instead. -+ * On PREEMPT_RT, the local lock does not actually disable irqs (and thus -+ * prevent the lockless operations), so fastpath operations also need to take -+ * the lock and are no longer lockless. -+ * -+ * lockless fastpaths -+ * -+ * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free()) -+ * are fully lockless when satisfied from the percpu slab (and when -+ * cmpxchg_double is possible to use, otherwise slab_lock is taken). -+ * They also don't disable preemption or migration or irqs. They rely on -+ * the transaction id (tid) field to detect being preempted or moved to -+ * another cpu. -+ * -+ * irq, preemption, migration considerations -+ * -+ * Interrupts are disabled as part of list_lock or local_lock operations, or -+ * around the slab_lock operation, in order to make the slab allocator safe -+ * to use in the context of an irq. -+ * -+ * In addition, preemption (or migration on PREEMPT_RT) is disabled in the -+ * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the -+ * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer -+ * doesn't have to be revalidated in each section protected by the local lock. - * - * SLUB assigns one slab for allocation to each processor. - * Allocations only occur from these slabs called cpu slabs. 
-@@ -2231,9 +2269,13 @@ static inline void note_cmpxchg_failure( - static void init_kmem_cache_cpus(struct kmem_cache *s) - { - int cpu; -+ struct kmem_cache_cpu *c; - -- for_each_possible_cpu(cpu) -- per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); -+ for_each_possible_cpu(cpu) { -+ c = per_cpu_ptr(s->cpu_slab, cpu); -+ local_lock_init(&c->lock); -+ c->tid = init_tid(cpu); -+ } - } - - /* -@@ -2444,10 +2486,10 @@ static void unfreeze_partials(struct kme - struct page *partial_page; - unsigned long flags; - -- local_irq_save(flags); -+ local_lock_irqsave(&s->cpu_slab->lock, flags); - partial_page = this_cpu_read(s->cpu_slab->partial); - this_cpu_write(s->cpu_slab->partial, NULL); -- local_irq_restore(flags); -+ local_unlock_irqrestore(&s->cpu_slab->lock, flags); - - if (partial_page) - __unfreeze_partials(s, partial_page); -@@ -2480,7 +2522,7 @@ static void put_cpu_partial(struct kmem_ - int pages = 0; - int pobjects = 0; - -- local_irq_save(flags); -+ local_lock_irqsave(&s->cpu_slab->lock, flags); - - oldpage = this_cpu_read(s->cpu_slab->partial); - -@@ -2508,7 +2550,7 @@ static void put_cpu_partial(struct kmem_ - - this_cpu_write(s->cpu_slab->partial, page); - -- local_irq_restore(flags); -+ local_unlock_irqrestore(&s->cpu_slab->lock, flags); - - if (page_to_unfreeze) { - __unfreeze_partials(s, page_to_unfreeze); -@@ -2532,7 +2574,7 @@ static inline void flush_slab(struct kme - struct page *page; - - if (lock) -- local_irq_save(flags); -+ local_lock_irqsave(&s->cpu_slab->lock, flags); - - freelist = c->freelist; - page = c->page; -@@ -2542,7 +2584,7 @@ static inline void flush_slab(struct kme - c->tid = next_tid(c->tid); - - if (lock) -- local_irq_restore(flags); -+ local_unlock_irqrestore(&s->cpu_slab->lock, flags); - - if (page) - deactivate_slab(s, page, freelist); -@@ -2849,9 +2891,9 @@ static void *___slab_alloc(struct kmem_c - goto deactivate_slab; - - /* must check again c->page in case we got preempted and it changed */ -- local_irq_save(flags); -+ local_lock_irqsave(&s->cpu_slab->lock, flags); - if (unlikely(page != c->page)) { -- local_irq_restore(flags); -+ local_unlock_irqrestore(&s->cpu_slab->lock, flags); - goto reread_page; - } - freelist = c->freelist; -@@ -2862,7 +2904,7 @@ static void *___slab_alloc(struct kmem_c - - if (!freelist) { - c->page = NULL; -- local_irq_restore(flags); -+ local_unlock_irqrestore(&s->cpu_slab->lock, flags); - stat(s, DEACTIVATE_BYPASS); - goto new_slab; - } -@@ -2871,7 +2913,7 @@ static void *___slab_alloc(struct kmem_c - - load_freelist: - -- lockdep_assert_irqs_disabled(); -+ lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); - - /* - * freelist is pointing to the list of objects to be used. 
-@@ -2881,39 +2923,39 @@ static void *___slab_alloc(struct kmem_c - VM_BUG_ON(!c->page->frozen); - c->freelist = get_freepointer(s, freelist); - c->tid = next_tid(c->tid); -- local_irq_restore(flags); -+ local_unlock_irqrestore(&s->cpu_slab->lock, flags); - return freelist; - - deactivate_slab: - -- local_irq_save(flags); -+ local_lock_irqsave(&s->cpu_slab->lock, flags); - if (page != c->page) { -- local_irq_restore(flags); -+ local_unlock_irqrestore(&s->cpu_slab->lock, flags); - goto reread_page; - } - freelist = c->freelist; - c->page = NULL; - c->freelist = NULL; -- local_irq_restore(flags); -+ local_unlock_irqrestore(&s->cpu_slab->lock, flags); - deactivate_slab(s, page, freelist); - - new_slab: - - if (slub_percpu_partial(c)) { -- local_irq_save(flags); -+ local_lock_irqsave(&s->cpu_slab->lock, flags); - if (unlikely(c->page)) { -- local_irq_restore(flags); -+ local_unlock_irqrestore(&s->cpu_slab->lock, flags); - goto reread_page; - } - if (unlikely(!slub_percpu_partial(c))) { -- local_irq_restore(flags); -+ local_unlock_irqrestore(&s->cpu_slab->lock, flags); - /* we were preempted and partial list got empty */ - goto new_objects; - } - - page = c->page = slub_percpu_partial(c); - slub_set_percpu_partial(c, page); -- local_irq_restore(flags); -+ local_unlock_irqrestore(&s->cpu_slab->lock, flags); - stat(s, CPU_PARTIAL_ALLOC); - goto redo; - } -@@ -2966,7 +3008,7 @@ static void *___slab_alloc(struct kmem_c - - retry_load_page: - -- local_irq_save(flags); -+ local_lock_irqsave(&s->cpu_slab->lock, flags); - if (unlikely(c->page)) { - void *flush_freelist = c->freelist; - struct page *flush_page = c->page; -@@ -2975,7 +3017,7 @@ static void *___slab_alloc(struct kmem_c - c->freelist = NULL; - c->tid = next_tid(c->tid); - -- local_irq_restore(flags); -+ local_unlock_irqrestore(&s->cpu_slab->lock, flags); - - deactivate_slab(s, flush_page, flush_freelist); - -@@ -3094,7 +3136,15 @@ static __always_inline void *slab_alloc_ - - object = c->freelist; - page = c->page; -- if (unlikely(!object || !page || !node_match(page, node))) { -+ /* -+ * We cannot use the lockless fastpath on PREEMPT_RT because if a -+ * slowpath has taken the local_lock_irqsave(), it is not protected -+ * against a fast path operation in an irq handler. So we need to take -+ * the slow path which uses local_lock. It is still relatively fast if -+ * there is a suitable cpu freelist. -+ */ -+ if (IS_ENABLED(CONFIG_PREEMPT_RT) || -+ unlikely(!object || !page || !node_match(page, node))) { - object = __slab_alloc(s, gfpflags, node, addr, c); - } else { - void *next_object = get_freepointer_safe(s, object); -@@ -3354,6 +3404,7 @@ static __always_inline void do_slab_free - barrier(); - - if (likely(page == c->page)) { -+#ifndef CONFIG_PREEMPT_RT - void **freelist = READ_ONCE(c->freelist); - - set_freepointer(s, tail_obj, freelist); -@@ -3366,6 +3417,31 @@ static __always_inline void do_slab_free - note_cmpxchg_failure("slab_free", s, tid); - goto redo; - } -+#else /* CONFIG_PREEMPT_RT */ -+ /* -+ * We cannot use the lockless fastpath on PREEMPT_RT because if -+ * a slowpath has taken the local_lock_irqsave(), it is not -+ * protected against a fast path operation in an irq handler. So -+ * we need to take the local_lock. We shouldn't simply defer to -+ * __slab_free() as that wouldn't use the cpu freelist at all. 
-+ */ -+ void **freelist; -+ -+ local_lock(&s->cpu_slab->lock); -+ c = this_cpu_ptr(s->cpu_slab); -+ if (unlikely(page != c->page)) { -+ local_unlock(&s->cpu_slab->lock); -+ goto redo; -+ } -+ tid = c->tid; -+ freelist = c->freelist; -+ -+ set_freepointer(s, tail_obj, freelist); -+ c->freelist = head; -+ c->tid = next_tid(tid); -+ -+ local_unlock(&s->cpu_slab->lock); -+#endif - stat(s, FREE_FASTPATH); - } else - __slab_free(s, page, head, tail_obj, cnt, addr); -@@ -3544,7 +3620,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca - * handlers invoking normal fastpath. - */ - c = slub_get_cpu_ptr(s->cpu_slab); -- local_irq_disable(); -+ local_lock_irq(&s->cpu_slab->lock); - - for (i = 0; i < size; i++) { - void *object = kfence_alloc(s, s->object_size, flags); -@@ -3565,7 +3641,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca - */ - c->tid = next_tid(c->tid); - -- local_irq_enable(); -+ local_unlock_irq(&s->cpu_slab->lock); - - /* - * Invoking slow path likely have side-effect -@@ -3579,7 +3655,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca - c = this_cpu_ptr(s->cpu_slab); - maybe_wipe_obj_freeptr(s, p[i]); - -- local_irq_disable(); -+ local_lock_irq(&s->cpu_slab->lock); - - continue; /* goto for-loop */ - } -@@ -3588,7 +3664,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca - maybe_wipe_obj_freeptr(s, p[i]); - } - c->tid = next_tid(c->tid); -- local_irq_enable(); -+ local_unlock_irq(&s->cpu_slab->lock); - slub_put_cpu_ptr(s->cpu_slab); - - /* diff --git a/patches/Add_localversion_for_-RT_release.patch b/patches/Add_localversion_for_-RT_release.patch index 5da6b6218f1c..c0ab4419d1ec 100644 --- a/patches/Add_localversion_for_-RT_release.patch +++ b/patches/Add_localversion_for_-RT_release.patch @@ -15,4 +15,4 @@ Signed-off-by: Thomas Gleixner --- /dev/null +++ b/localversion-rt @@ -0,0 +1 @@ -+-rt18 ++-rt19 diff --git a/patches/locking-Remove-rt_rwlock_is_contended.patch b/patches/locking-Remove-rt_rwlock_is_contended.patch new file mode 100644 index 000000000000..06fb1d6c4216 --- /dev/null +++ b/patches/locking-Remove-rt_rwlock_is_contended.patch @@ -0,0 +1,33 @@ +From: Sebastian Andrzej Siewior +Date: Tue, 7 Sep 2021 12:11:47 +0200 +Subject: [PATCH] locking: Remove rt_rwlock_is_contended() + +rt_rwlock_is_contended() has not users. It makes no sense to use it as +rwlock_is_contended() because it is a sleeping lock on RT and preemption +is possible. It reports always != 0 if used by a writer and even if +there is a waiter then the lock might not be handed over if the +current owner has the highest priority. + +Remove rt_rwlock_is_contended(). 
+ +Reported-by: kernel test robot +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/locking/spinlock_rt.c | 6 ------ + 1 file changed, 6 deletions(-) + +--- a/kernel/locking/spinlock_rt.c ++++ b/kernel/locking/spinlock_rt.c +@@ -246,12 +246,6 @@ void __sched rt_write_unlock(rwlock_t *r + } + EXPORT_SYMBOL(rt_write_unlock); + +-int __sched rt_rwlock_is_contended(rwlock_t *rwlock) +-{ +- return rw_base_is_contended(&rwlock->rwbase); +-} +-EXPORT_SYMBOL(rt_rwlock_is_contended); +- + #ifdef CONFIG_DEBUG_LOCK_ALLOC + void __rt_rwlock_init(rwlock_t *rwlock, const char *name, + struct lock_class_key *key) diff --git a/patches/locking-rtmutex-Fix-ww_mutex-deadlock-check.patch b/patches/locking-rtmutex-Fix-ww_mutex-deadlock-check.patch new file mode 100644 index 000000000000..ead382f482cd --- /dev/null +++ b/patches/locking-rtmutex-Fix-ww_mutex-deadlock-check.patch @@ -0,0 +1,38 @@ +From: Peter Zijlstra +Date: Wed, 1 Sep 2021 11:44:11 +0200 +Subject: [PATCH] locking/rtmutex: Fix ww_mutex deadlock check + +Dan reported that rt_mutex_adjust_prio_chain() can be called with +.orig_waiter == NULL however commit a055fcc132d4 ("locking/rtmutex: +Return success on deadlock for ww_mutex waiters") unconditionally +dereferences it. + +Since both call-sites that have .orig_waiter == NULL don't care for the +return value, simply disable the deadlock squash by adding the NULL +check. + +Notably, both callers use the deadlock condition as a termination +condition for the iteration; once detected, we're sure (de)boosting is +done. Arguably [3] would be a more natural termination point, but I'm +not sure adding a third deadlock detection state would improve the code. + +Fixes: a055fcc132d4 ("locking/rtmutex: Return success on deadlock for ww_mutex waiters") +Reported-by: Dan Carpenter +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Sebastian Andrzej Siewior +Link: https://lore.kernel.org/r/YS9La56fHMiCCo75@hirez.programming.kicks-ass.net +--- + kernel/locking/rtmutex.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -753,7 +753,7 @@ static int __sched rt_mutex_adjust_prio_ + * other configuration and we fail to report; also, see + * lockdep. + */ +- if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter->ww_ctx) ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx) + ret = 0; + + raw_spin_unlock(&lock->wait_lock); diff --git a/patches/sched-Make-the-idle-timer-expire-always-in-hardirq-c.patch b/patches/sched-Make-the-idle-timer-expire-always-in-hardirq-c.patch new file mode 100644 index 000000000000..a10d1f9569d3 --- /dev/null +++ b/patches/sched-Make-the-idle-timer-expire-always-in-hardirq-c.patch @@ -0,0 +1,38 @@ +From: Sebastian Andrzej Siewior +Date: Mon, 6 Sep 2021 11:40:48 +0200 +Subject: [PATCH] sched: Make the idle timer expire always in hardirq context. + +The intel powerclamp driver will setup a per-CPU worker with RT +priority. The worker will then invoke play_idle() in which it remains in +the idle poll loop until it is stopped by the timer it started earlier. + +That timer needs to expire in hardirq context on PREEMPT_RT. Otherwise +the timer will expire in ksoftirqd as a SOFT timer but that task won't +be scheduled on the CPU because its priority is lower than the priority +of the worker which is in the idle loop. + +Always expire the idle timer in hardirq context. 
+ +Fixes:c1de45ca831ac ("sched/idle: Add support for tasks that inject idle") +Reported-by: Thomas Gleixner +Signed-off-by: Sebastian Andrzej Siewior +Link: https://lkml.kernel.org/r/20210906113034.jgfxrjdvxnjqgtmc@linutronix.de +--- + kernel/sched/idle.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -379,10 +379,10 @@ void play_idle_precise(u64 duration_ns, + cpuidle_use_deepest_state(latency_ns); + + it.done = 0; +- hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); + it.timer.function = idle_inject_timer_fn; + hrtimer_start(&it.timer, ns_to_ktime(duration_ns), +- HRTIMER_MODE_REL_PINNED); ++ HRTIMER_MODE_REL_PINNED_HARD); + + while (!READ_ONCE(it.done)) + do_idle(); diff --git a/patches/series b/patches/series index 7fb2aa6ec52a..11ea9abed892 100644 --- a/patches/series +++ b/patches/series @@ -37,43 +37,41 @@ printk__Enhance_the_condition_check_of_msleep_in_pr_flush.patch ########################################################################### # mm bits polished by Mel and Vlastimil -# slub-local-lock-v4r3 +# slub-local-lock-v6r2 ########################################################################### 0001-mm-slub-don-t-call-flush_all-from-slab_debug_trace_o.patch 0002-mm-slub-allocate-private-object-map-for-debugfs-list.patch 0003-mm-slub-allocate-private-object-map-for-validate_sla.patch 0004-mm-slub-don-t-disable-irq-for-debug_check_no_locks_f.patch 0005-mm-slub-remove-redundant-unfreeze_partials-from-put_.patch -0006-mm-slub-unify-cmpxchg_double_slab-and-__cmpxchg_doub.patch -0007-mm-slub-extract-get_partial-from-new_slab_objects.patch -0008-mm-slub-dissolve-new_slab_objects-into-___slab_alloc.patch -0009-mm-slub-return-slab-page-from-get_partial-and-set-c-.patch -0010-mm-slub-restructure-new-page-checks-in-___slab_alloc.patch -0011-mm-slub-simplify-kmem_cache_cpu-and-tid-setup.patch -0012-mm-slub-move-disabling-enabling-irqs-to-___slab_allo.patch -0013-mm-slub-do-initial-checks-in-___slab_alloc-with-irqs.patch -0014-mm-slub-move-disabling-irqs-closer-to-get_partial-in.patch -0015-mm-slub-restore-irqs-around-calling-new_slab.patch -0016-mm-slub-validate-slab-from-partial-list-or-page-allo.patch -0017-mm-slub-check-new-pages-with-restored-irqs.patch -0018-mm-slub-stop-disabling-irqs-around-get_partial.patch -0019-mm-slub-move-reset-of-c-page-and-freelist-out-of-dea.patch -0020-mm-slub-make-locking-in-deactivate_slab-irq-safe.patch -0021-mm-slub-call-deactivate_slab-without-disabling-irqs.patch -0022-mm-slub-move-irq-control-into-unfreeze_partials.patch -0023-mm-slub-discard-slabs-in-unfreeze_partials-without-i.patch -0024-mm-slub-detach-whole-partial-list-at-once-in-unfreez.patch -0025-mm-slub-separate-detaching-of-partial-list-in-unfree.patch -0026-mm-slub-only-disable-irq-with-spin_lock-in-__unfreez.patch -0027-mm-slub-don-t-disable-irqs-in-slub_cpu_dead.patch -0028-mm-slab-make-flush_slab-possible-to-call-with-irqs-e.patch -0029-mm-slub-Move-flush_cpu_slab-invocations-__free_slab-.patch -0030-mm-slub-Make-object_map_lock-a-raw_spinlock_t.patch -0031-mm-slub-optionally-save-restore-irqs-in-slab_-un-loc.patch -0032-mm-slub-make-slab_lock-disable-irqs-with-PREEMPT_RT.patch -0033-mm-slub-protect-put_cpu_partial-with-disabled-irqs-i.patch -0034-mm-slub-use-migrate_disable-on-PREEMPT_RT.patch -0035-mm-slub-convert-kmem_cpu_slab-protection-to-local_lo.patch 
+0006-mm-slub-extract-get_partial-from-new_slab_objects.patch +0007-mm-slub-dissolve-new_slab_objects-into-___slab_alloc.patch +0008-mm-slub-return-slab-page-from-get_partial-and-set-c-.patch +0009-mm-slub-restructure-new-page-checks-in-___slab_alloc.patch +0010-mm-slub-simplify-kmem_cache_cpu-and-tid-setup.patch +0011-mm-slub-move-disabling-enabling-irqs-to-___slab_allo.patch +0012-mm-slub-do-initial-checks-in-___slab_alloc-with-irqs.patch +0013-mm-slub-move-disabling-irqs-closer-to-get_partial-in.patch +0014-mm-slub-restore-irqs-around-calling-new_slab.patch +0015-mm-slub-validate-slab-from-partial-list-or-page-allo.patch +0016-mm-slub-check-new-pages-with-restored-irqs.patch +0017-mm-slub-stop-disabling-irqs-around-get_partial.patch +0018-mm-slub-move-reset-of-c-page-and-freelist-out-of-dea.patch +0019-mm-slub-make-locking-in-deactivate_slab-irq-safe.patch +0020-mm-slub-call-deactivate_slab-without-disabling-irqs.patch +0021-mm-slub-move-irq-control-into-unfreeze_partials.patch +0022-mm-slub-discard-slabs-in-unfreeze_partials-without-i.patch +0023-mm-slub-detach-whole-partial-list-at-once-in-unfreez.patch +0024-mm-slub-separate-detaching-of-partial-list-in-unfree.patch +0025-mm-slub-only-disable-irq-with-spin_lock-in-__unfreez.patch +0026-mm-slub-don-t-disable-irqs-in-slub_cpu_dead.patch +0027-mm-slab-split-out-the-cpu-offline-variant-of-flush_s.patch +0028-mm-slub-move-flush_cpu_slab-invocations-__free_slab-.patch +0029-mm-slub-make-object_map_lock-a-raw_spinlock_t.patch +0030-mm-slub-make-slab_lock-disable-irqs-with-PREEMPT_RT.patch +0031-mm-slub-protect-put_cpu_partial-with-disabled-irqs-i.patch +0032-mm-slub-use-migrate_disable-on-PREEMPT_RT.patch +0033-mm-slub-convert-kmem_cpu_slab-protection-to-local_lo.patch ########################################################################### # Posted @@ -82,6 +80,7 @@ highmem-Don-t-disable-preemption-on-RT-in-kmap_atomi.patch sched-Switch-wait_task_inactive-to-HRTIMER_MODE_REL_.patch sched-Prevent-balance_push-on-remote-runqueues.patch lockdep-Let-lock_is_held_type-detect-recursive-read-.patch +sched-Make-the-idle-timer-expire-always-in-hardirq-c.patch #KCOV 0001_documentation_kcov_include_types_h_in_the_example.patch @@ -227,6 +226,8 @@ locking-rtmutex-Prevent-spurious-EDEADLK-return-caus.patch ########################################################################### # Locking: RT bits. Need review ########################################################################### +locking-rtmutex-Fix-ww_mutex-deadlock-check.patch +locking-Remove-rt_rwlock_is_contended.patch lockdep-selftests-Avoid-using-local_lock_-acquire-re.patch 0001-sched-Trigger-warning-if-migration_disabled-counter-.patch 0003-rtmutex-Add-a-special-case-for-ww-mutex-handling.patch -- cgit v1.2.1