summary refs log tree commit diff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig 2
-rw-r--r-- mm/compaction.c 10
-rw-r--r-- mm/highmem.c 5
-rw-r--r-- mm/memcontrol.c 28
-rw-r--r-- mm/page_alloc.c 175
-rw-r--r-- mm/slab.c 90
-rw-r--r-- mm/slab.h 2
-rw-r--r-- mm/slub.c 146
-rw-r--r-- mm/swap.c 38
-rw-r--r-- mm/vmalloc.c 13
-rw-r--r-- mm/vmstat.c 12
-rw-r--r-- mm/workingset.c 5
-rw-r--r-- mm/zsmalloc.c 80
-rw-r--r-- mm/zswap.c 12
14 files changed, 438 insertions, 180 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ab80933be65f..f5b974844da5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -370,7 +370,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
select COMPACTION
select XARRAY_MULTI
help
diff --git a/mm/compaction.c b/mm/compaction.c
index 672d3c78c6ab..31e6e103f38b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1590,7 +1590,11 @@ typedef enum {
* Allow userspace to control policy on scanning the unevictable LRU for
* compactable pages.
*/
+#ifdef CONFIG_PREEMPT_RT
+int sysctl_compact_unevictable_allowed __read_mostly = 0;
+#else
int sysctl_compact_unevictable_allowed __read_mostly = 1;
+#endif
static inline void
update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
@@ -2240,10 +2244,12 @@ check_drain:
block_start_pfn(cc->migrate_pfn, cc->order);
if (last_migrated_pfn < current_block_start) {
- cpu = get_cpu();
+ cpu = get_cpu_light();
+ local_lock_irq(swapvec_lock);
lru_add_drain_cpu(cpu);
+ local_unlock_irq(swapvec_lock);
drain_local_pages(cc->zone);
- put_cpu();
+ put_cpu_light();
/* No more flushing until we migrate again */
last_migrated_pfn = 0;
}
diff --git a/mm/highmem.c b/mm/highmem.c
index 64d8dea47dd1..7d3065719ce8 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -31,8 +31,11 @@
#include <asm/tlbflush.h>
#include <linux/vmalloc.h>
+#ifndef CONFIG_PREEMPT_RT
#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
DEFINE_PER_CPU(int, __kmap_atomic_idx);
+EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
+#endif
#endif
/*
@@ -108,8 +111,6 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
atomic_long_t _totalhigh_pages __read_mostly;
EXPORT_SYMBOL(_totalhigh_pages);
-EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
-
unsigned int nr_free_highpages (void)
{
struct zone *zone;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7ddf91c4295f..33472eeaa1ee 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,6 +63,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
+#include <linux/locallock.h>
#include <linux/uaccess.h>
@@ -92,6 +93,8 @@ int do_swap_account __read_mostly;
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
+static DEFINE_LOCAL_IRQ_LOCK(event_lock);
+
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
@@ -2161,7 +2164,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
* as well as workers from this path always operate on the local
* per-cpu data. CPU up doesn't touch memcg_stock at all.
*/
- curcpu = get_cpu();
+ curcpu = get_cpu_light();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
@@ -2182,7 +2185,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
schedule_work_on(cpu, &stock->work);
}
}
- put_cpu();
+ put_cpu_light();
mutex_unlock(&percpu_charge_mutex);
}
@@ -5422,12 +5425,12 @@ static int mem_cgroup_move_account(struct page *page,
ret = 0;
- local_irq_disable();
+ local_lock_irq(event_lock);
mem_cgroup_charge_statistics(to, page, compound, nr_pages);
memcg_check_events(to, page);
mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
memcg_check_events(from, page);
- local_irq_enable();
+ local_unlock_irq(event_lock);
out_unlock:
unlock_page(page);
out:
@@ -6491,10 +6494,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
commit_charge(page, memcg, lrucare);
- local_irq_disable();
+ local_lock_irq(event_lock);
mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
memcg_check_events(memcg, page);
- local_irq_enable();
+ local_unlock_irq(event_lock);
if (do_memsw_account() && PageSwapCache(page)) {
swp_entry_t entry = { .val = page_private(page) };
@@ -6563,7 +6566,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
memcg_oom_recover(ug->memcg);
}
- local_irq_save(flags);
+ local_lock_irqsave(event_lock, flags);
__mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
__mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
@@ -6571,7 +6574,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
- local_irq_restore(flags);
+ local_unlock_irqrestore(event_lock, flags);
if (!mem_cgroup_is_root(ug->memcg))
css_put_many(&ug->memcg->css, nr_pages);
@@ -6732,11 +6735,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
commit_charge(newpage, memcg, false);
- local_irq_save(flags);
+ local_lock_irqsave(event_lock, flags);
mem_cgroup_charge_statistics(memcg, newpage, PageTransHuge(newpage),
nr_pages);
memcg_check_events(memcg, newpage);
- local_irq_restore(flags);
+ local_unlock_irqrestore(event_lock, flags);
}
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
@@ -6918,6 +6921,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
struct mem_cgroup *memcg, *swap_memcg;
unsigned int nr_entries;
unsigned short oldid;
+ unsigned long flags;
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
@@ -6963,10 +6967,14 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* important here to have the interrupts disabled because it is the
* only synchronisation we have for updating the per-CPU variables.
*/
+ local_lock_irqsave(event_lock, flags);
+#ifndef CONFIG_PREEMPT_RT
VM_BUG_ON(!irqs_disabled());
+#endif
mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
-nr_entries);
memcg_check_events(memcg, page);
+ local_unlock_irqrestore(event_lock, flags);
if (!mem_cgroup_is_root(memcg))
css_put_many(&memcg->css, nr_entries);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c4eb750a199..d8cb18d589cd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,6 +61,7 @@
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
+#include <linux/locallock.h>
#include <linux/page_owner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
@@ -357,6 +358,8 @@ EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
+static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
+
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1236,7 +1239,7 @@ static inline void prefetch_buddy(struct page *page)
}
/*
- * Frees a number of pages from the PCP lists
+ * Frees a number of pages which have been collected from the pcp lists.
* Assumes all pages on list are in same zone, and of same order.
* count is the number of pages to free.
*
@@ -1246,15 +1249,57 @@ static inline void prefetch_buddy(struct page *page)
* And clear the zone's pages_scanned counter, to hold off the "all pages are
* pinned" detection logic.
*/
-static void free_pcppages_bulk(struct zone *zone, int count,
- struct per_cpu_pages *pcp)
+static void free_pcppages_bulk(struct zone *zone, struct list_head *head,
+ bool zone_retry)
+{
+ bool isolated_pageblocks;
+ struct page *page, *tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ isolated_pageblocks = has_isolate_pageblock(zone);
+
+ /*
+ * Use safe version since after __free_one_page(),
+ * page->lru.next will not point to original list.
+ */
+ list_for_each_entry_safe(page, tmp, head, lru) {
+ int mt = get_pcppage_migratetype(page);
+
+ if (page_zone(page) != zone) {
+ /*
+ * free_unref_page_list() sorts pages by zone. If we end
+ * up with pages from different NUMA nodes belonging
+ * to the same ZONE index then we need to redo with the
+ * correct ZONE pointer. Skip the page for now, redo it
+ * on the next iteration.
+ */
+ WARN_ON_ONCE(zone_retry == false);
+ if (zone_retry)
+ continue;
+ }
+
+ /* MIGRATE_ISOLATE page should not go to pcplists */
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+ /* Pageblock could have been isolated meanwhile */
+ if (unlikely(isolated_pageblocks))
+ mt = get_pageblock_migratetype(page);
+
+ list_del(&page->lru);
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt);
+ trace_mm_page_pcpu_drain(page, 0, mt);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp,
+ struct list_head *dst)
+
{
int migratetype = 0;
int batch_free = 0;
int prefetch_nr = 0;
- bool isolated_pageblocks;
- struct page *page, *tmp;
- LIST_HEAD(head);
+ struct page *page;
while (count) {
struct list_head *list;
@@ -1286,7 +1331,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
if (bulkfree_pcp_prepare(page))
continue;
- list_add_tail(&page->lru, &head);
+ list_add_tail(&page->lru, dst);
/*
* We are going to put the page back to the global
@@ -1301,26 +1346,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
prefetch_buddy(page);
} while (--count && --batch_free && !list_empty(list));
}
-
- spin_lock(&zone->lock);
- isolated_pageblocks = has_isolate_pageblock(zone);
-
- /*
- * Use safe version since after __free_one_page(),
- * page->lru.next will not point to original list.
- */
- list_for_each_entry_safe(page, tmp, &head, lru) {
- int mt = get_pcppage_migratetype(page);
- /* MIGRATE_ISOLATE page should not go to pcplists */
- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
- /* Pageblock could have been isolated meanwhile */
- if (unlikely(isolated_pageblocks))
- mt = get_pageblock_migratetype(page);
-
- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
- trace_mm_page_pcpu_drain(page, 0, mt);
- }
- spin_unlock(&zone->lock);
}
static void free_one_page(struct zone *zone,
@@ -1421,10 +1446,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
return;
migratetype = get_pfnblock_migratetype(page, pfn);
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, pfn, order, migratetype);
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
}
void __free_pages_core(struct page *page, unsigned int order)
@@ -2788,13 +2813,18 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
unsigned long flags;
int to_drain, batch;
+ LIST_HEAD(dst);
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
- free_pcppages_bulk(zone, to_drain, pcp);
- local_irq_restore(flags);
+ isolate_pcp_pages(to_drain, pcp, &dst);
+
+ local_unlock_irqrestore(pa_lock, flags);
+
+ if (to_drain > 0)
+ free_pcppages_bulk(zone, &dst, false);
}
#endif
@@ -2810,14 +2840,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
unsigned long flags;
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
+ LIST_HEAD(dst);
+ int count;
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
- if (pcp->count)
- free_pcppages_bulk(zone, pcp->count, pcp);
- local_irq_restore(flags);
+ count = pcp->count;
+ if (count)
+ isolate_pcp_pages(count, pcp, &dst);
+
+ local_unlock_irqrestore(pa_lock, flags);
+
+ if (count)
+ free_pcppages_bulk(zone, &dst, false);
}
/*
@@ -2865,9 +2902,9 @@ static void drain_local_pages_wq(struct work_struct *work)
* cpu which is allright but we also have to make sure to not move to
* a different one.
*/
- preempt_disable();
+ migrate_disable();
drain_local_pages(drain->zone);
- preempt_enable();
+ migrate_enable();
}
/*
@@ -3016,7 +3053,8 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
return true;
}
-static void free_unref_page_commit(struct page *page, unsigned long pfn)
+static void free_unref_page_commit(struct page *page, unsigned long pfn,
+ struct list_head *dst)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
@@ -3045,7 +3083,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
pcp->count++;
if (pcp->count >= pcp->high) {
unsigned long batch = READ_ONCE(pcp->batch);
- free_pcppages_bulk(zone, batch, pcp);
+
+ isolate_pcp_pages(batch, pcp, dst);
}
}
@@ -3056,13 +3095,17 @@ void free_unref_page(struct page *page)
{
unsigned long flags;
unsigned long pfn = page_to_pfn(page);
+ struct zone *zone = page_zone(page);
+ LIST_HEAD(dst);
if (!free_unref_page_prepare(page, pfn))
return;
- local_irq_save(flags);
- free_unref_page_commit(page, pfn);
- local_irq_restore(flags);
+ local_lock_irqsave(pa_lock, flags);
+ free_unref_page_commit(page, pfn, &dst);
+ local_unlock_irqrestore(pa_lock, flags);
+ if (!list_empty(&dst))
+ free_pcppages_bulk(zone, &dst, false);
}
/*
@@ -3073,6 +3116,11 @@ void free_unref_page_list(struct list_head *list)
struct page *page, *next;
unsigned long flags, pfn;
int batch_count = 0;
+ struct list_head dsts[__MAX_NR_ZONES];
+ int i;
+
+ for (i = 0; i < __MAX_NR_ZONES; i++)
+ INIT_LIST_HEAD(&dsts[i]);
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
@@ -3082,25 +3130,42 @@ void free_unref_page_list(struct list_head *list)
set_page_private(page, pfn);
}
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
unsigned long pfn = page_private(page);
+ enum zone_type type;
set_page_private(page, 0);
trace_mm_page_free_batched(page);
- free_unref_page_commit(page, pfn);
+ type = page_zonenum(page);
+ free_unref_page_commit(page, pfn, &dsts[type]);
/*
* Guard against excessive IRQ disabled times when we get
* a large list of pages to free.
*/
if (++batch_count == SWAP_CLUSTER_MAX) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
batch_count = 0;
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
}
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
+
+ for (i = 0; i < __MAX_NR_ZONES; ) {
+ struct page *page;
+ struct zone *zone;
+
+ if (list_empty(&dsts[i])) {
+ i++;
+ continue;
+ }
+
+ page = list_first_entry(&dsts[i], struct page, lru);
+ zone = page_zone(page);
+
+ free_pcppages_bulk(zone, &dsts[i], true);
+ }
}
/*
@@ -3235,7 +3300,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct page *page;
unsigned long flags;
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
@@ -3243,7 +3308,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone);
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
return page;
}
@@ -3270,7 +3335,7 @@ struct page *rmqueue(struct zone *preferred_zone,
* allocate greater than order-1 page units with __GFP_NOFAIL.
*/
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
- spin_lock_irqsave(&zone->lock, flags);
+ local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
do {
page = NULL;
@@ -3290,7 +3355,7 @@ struct page *rmqueue(struct zone *preferred_zone,
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
out:
/* Separate test+clear to avoid unnecessary atomics */
@@ -3303,7 +3368,7 @@ out:
return page;
failed:
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
return NULL;
}
@@ -8651,7 +8716,7 @@ void zone_pcp_reset(struct zone *zone)
struct per_cpu_pageset *pset;
/* avoid races with drain_pages() */
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
if (zone->pageset != &boot_pageset) {
for_each_online_cpu(cpu) {
pset = per_cpu_ptr(zone->pageset, cpu);
@@ -8660,7 +8725,7 @@ void zone_pcp_reset(struct zone *zone)
free_percpu(zone->pageset);
zone->pageset = &boot_pageset;
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/mm/slab.c b/mm/slab.c
index a89633603b2d..1e3a586ed116 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -233,7 +233,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
parent->shared = NULL;
parent->alien = NULL;
parent->colour_next = 0;
- spin_lock_init(&parent->list_lock);
+ raw_spin_lock_init(&parent->list_lock);
parent->free_objects = 0;
parent->free_touched = 0;
}
@@ -558,9 +558,9 @@ static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
page_node = page_to_nid(page);
n = get_node(cachep, page_node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
free_block(cachep, &objp, 1, page_node, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -688,7 +688,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
struct kmem_cache_node *n = get_node(cachep, node);
if (ac->avail) {
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
/*
* Stuff objects into the remote nodes shared array first.
* That way we could avoid the overhead of putting the objects
@@ -699,7 +699,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
free_block(cachep, ac->entry, ac->avail, node, list);
ac->avail = 0;
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
}
}
@@ -772,9 +772,9 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
slabs_destroy(cachep, &list);
} else {
n = get_node(cachep, page_node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
free_block(cachep, &objp, 1, page_node, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
}
return 1;
@@ -815,10 +815,10 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
*/
n = get_node(cachep, node);
if (n) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
cachep->num;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
return 0;
}
@@ -897,7 +897,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep,
goto fail;
n = get_node(cachep, node);
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
if (n->shared && force_change) {
free_block(cachep, n->shared->entry,
n->shared->avail, node, &list);
@@ -915,7 +915,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep,
new_alien = NULL;
}
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
/*
@@ -954,7 +954,7 @@ static void cpuup_canceled(long cpu)
if (!n)
continue;
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
/* Free limit for this kmem_cache_node */
n->free_limit -= cachep->batchcount;
@@ -965,7 +965,7 @@ static void cpuup_canceled(long cpu)
nc->avail = 0;
if (!cpumask_empty(mask)) {
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
goto free_slab;
}
@@ -979,7 +979,7 @@ static void cpuup_canceled(long cpu)
alien = n->alien;
n->alien = NULL;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
kfree(shared);
if (alien) {
@@ -1163,7 +1163,7 @@ static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
- spin_lock_init(&ptr->list_lock);
+ raw_spin_lock_init(&ptr->list_lock);
MAKE_ALL_LISTS(cachep, ptr, nodeid);
cachep->node[nodeid] = ptr;
@@ -1335,11 +1335,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
for_each_kmem_cache_node(cachep, node, n) {
unsigned long total_slabs, free_slabs, free_objs;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
total_slabs = n->total_slabs;
free_slabs = n->free_slabs;
free_objs = n->free_objects;
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
node, total_slabs - free_slabs, total_slabs,
@@ -2097,7 +2097,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
+ assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
#endif
}
@@ -2105,7 +2105,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&get_node(cachep, node)->list_lock);
+ assert_raw_spin_locked(&get_node(cachep, node)->list_lock);
#endif
}
@@ -2145,9 +2145,9 @@ static void do_drain(void *arg)
check_irq_off();
ac = cpu_cache_get(cachep);
n = get_node(cachep, node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
free_block(cachep, ac->entry, ac->avail, node, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
ac->avail = 0;
}
@@ -2165,9 +2165,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
drain_alien_cache(cachep, n->alien);
for_each_kmem_cache_node(cachep, node, n) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
drain_array_locked(cachep, n->shared, node, true, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -2189,10 +2189,10 @@ static int drain_freelist(struct kmem_cache *cache,
nr_freed = 0;
while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
p = n->slabs_free.prev;
if (p == &n->slabs_free) {
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
goto out;
}
@@ -2205,7 +2205,7 @@ static int drain_freelist(struct kmem_cache *cache,
* to the cache.
*/
n->free_objects -= cache->num;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slab_destroy(cache, page);
nr_freed++;
}
@@ -2658,7 +2658,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
INIT_LIST_HEAD(&page->slab_list);
n = get_node(cachep, page_to_nid(page));
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
n->total_slabs++;
if (!page->active) {
list_add_tail(&page->slab_list, &n->slabs_free);
@@ -2668,7 +2668,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
}
@@ -2834,7 +2834,7 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
struct page *page;
- assert_spin_locked(&n->list_lock);
+ assert_raw_spin_locked(&n->list_lock);
page = list_first_entry_or_null(&n->slabs_partial, struct page,
slab_list);
if (!page) {
@@ -2861,10 +2861,10 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
if (!gfp_pfmemalloc_allowed(flags))
return NULL;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
page = get_first_slab(n, true);
if (!page) {
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
return NULL;
}
@@ -2873,7 +2873,7 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
fixup_slab_list(cachep, n, page, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
return obj;
@@ -2932,7 +2932,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
if (!n->free_objects && (!shared || !shared->avail))
goto direct_grow;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
shared = READ_ONCE(n->shared);
/* See if we can refill from the shared array */
@@ -2956,7 +2956,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
must_grow:
n->free_objects -= ac->avail;
alloc_done:
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
direct_grow:
@@ -3181,7 +3181,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
BUG_ON(!n);
check_irq_off();
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
page = get_first_slab(n, false);
if (!page)
goto must_grow;
@@ -3199,12 +3199,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
fixup_slab_list(cachep, n, page, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
return obj;
must_grow:
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
if (page) {
/* This slab isn't counted yet so don't update free_objects */
@@ -3380,7 +3380,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
check_irq_off();
n = get_node(cachep, node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
if (n->shared) {
struct array_cache *shared_array = n->shared;
int max = shared_array->limit - shared_array->avail;
@@ -3409,7 +3409,7 @@ free_done:
STATS_SET_FREEABLE(cachep, i);
}
#endif
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
ac->avail -= batchcount;
memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
@@ -3831,9 +3831,9 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
node = cpu_to_mem(cpu);
n = get_node(cachep, node);
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
free_block(cachep, ac->entry, ac->avail, node, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
free_percpu(prev);
@@ -3958,9 +3958,9 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
return;
}
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
drain_array_locked(cachep, ac, node, false, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -4044,7 +4044,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
for_each_kmem_cache_node(cachep, node, n) {
check_irq_on();
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
total_slabs += n->total_slabs;
free_slabs += n->free_slabs;
@@ -4053,7 +4053,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
if (n->shared)
shared_avail += n->shared->avail;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
}
num_objs = total_slabs * cachep->num;
active_slabs = total_slabs - free_slabs;
diff --git a/mm/slab.h b/mm/slab.h
index 7e94700aa78c..caa5027fc282 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -596,7 +596,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
* The slab lists for all objects.
*/
struct kmem_cache_node {
- spinlock_t list_lock;
+ raw_spinlock_t list_lock;
#ifdef CONFIG_SLAB
struct list_head slabs_partial; /* partial list first, better asm code */
diff --git a/mm/slub.c b/mm/slub.c
index 3b17e774831a..15c194ff16e6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1196,7 +1196,7 @@ static noinline int free_debug_processing(
unsigned long uninitialized_var(flags);
int ret = 0;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
slab_lock(page);
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
@@ -1231,7 +1231,7 @@ out:
bulk_cnt, cnt);
slab_unlock(page);
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
if (!ret)
slab_fix(s, "Object at 0x%p not freed", object);
return ret;
@@ -1401,6 +1401,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
#endif /* CONFIG_SLUB_DEBUG */
+struct slub_free_list {
+ raw_spinlock_t lock;
+ struct list_head list;
+};
+static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
+
/*
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
@@ -1641,10 +1647,18 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
void *start, *p, *next;
int idx;
bool shuffle;
+ bool enableirqs = false;
flags &= gfp_allowed_mask;
if (gfpflags_allow_blocking(flags))
+ enableirqs = true;
+
+#ifdef CONFIG_PREEMPT_RT
+ if (system_state > SYSTEM_BOOTING)
+ enableirqs = true;
+#endif
+ if (enableirqs)
local_irq_enable();
flags |= s->allocflags;
@@ -1703,7 +1717,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
page->frozen = 1;
out:
- if (gfpflags_allow_blocking(flags))
+ if (enableirqs)
local_irq_disable();
if (!page)
return NULL;
@@ -1751,6 +1765,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__free_pages(page, order);
}
+static void free_delayed(struct list_head *h)
+{
+ while (!list_empty(h)) {
+ struct page *page = list_first_entry(h, struct page, lru);
+
+ list_del(&page->lru);
+ __free_slab(page->slab_cache, page);
+ }
+}
+
static void rcu_free_slab(struct rcu_head *h)
{
struct page *page = container_of(h, struct page, rcu_head);
@@ -1762,6 +1786,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
{
if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
call_rcu(&page->rcu_head, rcu_free_slab);
+ } else if (irqs_disabled()) {
+ struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
+
+ raw_spin_lock(&f->lock);
+ list_add(&page->lru, &f->list);
+ raw_spin_unlock(&f->lock);
} else
__free_slab(s, page);
}
@@ -1869,7 +1899,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
if (!n || !n->nr_partial)
return NULL;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
void *t;
@@ -1894,7 +1924,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
break;
}
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
return object;
}
@@ -2140,7 +2170,7 @@ redo:
* that acquire_slab() will see a slab page that
* is frozen
*/
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
}
} else {
m = M_FULL;
@@ -2151,7 +2181,7 @@ redo:
* slabs from diagnostic functions will not see
* any frozen slabs.
*/
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
}
}
@@ -2175,7 +2205,7 @@ redo:
goto redo;
if (lock)
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
if (m == M_PARTIAL)
stat(s, tail);
@@ -2214,10 +2244,10 @@ static void unfreeze_partials(struct kmem_cache *s,
n2 = get_node(s, page_to_nid(page));
if (n != n2) {
if (n)
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
n = n2;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
}
do {
@@ -2246,7 +2276,7 @@ static void unfreeze_partials(struct kmem_cache *s,
}
if (n)
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
while (discard_page) {
page = discard_page;
@@ -2283,14 +2313,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
pobjects = oldpage->pobjects;
pages = oldpage->pages;
if (drain && pobjects > s->cpu_partial) {
+ struct slub_free_list *f;
unsigned long flags;
+ LIST_HEAD(tofree);
/*
* partial array is full. Move the existing
* set to the per node partial list.
*/
local_irq_save(flags);
unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+ f = this_cpu_ptr(&slub_free_list);
+ raw_spin_lock(&f->lock);
+ list_splice_init(&f->list, &tofree);
+ raw_spin_unlock(&f->lock);
local_irq_restore(flags);
+ free_delayed(&tofree);
oldpage = NULL;
pobjects = 0;
pages = 0;
@@ -2358,7 +2395,22 @@ static bool has_cpu_slab(int cpu, void *info)
static void flush_all(struct kmem_cache *s)
{
+ LIST_HEAD(tofree); /* pages collected from the per-CPU slub_free_list */
+ int cpu;
+
on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
+ for_each_online_cpu(cpu) {
+ struct slub_free_list *f;
+
+ /* Drain unconditionally: a CPU that flush_cpu_slab() just emptied
+ * may still have pages queued on its slub_free_list.
+ */
+ f = &per_cpu(slub_free_list, cpu);
+ raw_spin_lock_irq(&f->lock);
+ list_splice_init(&f->list, &tofree);
+ raw_spin_unlock_irq(&f->lock);
+ free_delayed(&tofree); /* called after f->lock is dropped */
+ }
}
/*
@@ -2413,10 +2465,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
unsigned long x = 0;
struct page *page;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list)
x += get_count(page);
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
return x;
}
#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
@@ -2555,8 +2607,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
* already disabled (which is the case for bulk allocation).
*/
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c)
+ unsigned long addr, struct kmem_cache_cpu *c,
+ struct list_head *to_free)
{
+ struct slub_free_list *f;
void *freelist;
struct page *page;
@@ -2622,6 +2676,13 @@ load_freelist:
VM_BUG_ON(!c->page->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
+
+out:
+ f = this_cpu_ptr(&slub_free_list);
+ raw_spin_lock(&f->lock);
+ list_splice_init(&f->list, to_free);
+ raw_spin_unlock(&f->lock);
+
return freelist;
new_slab:
@@ -2637,7 +2698,7 @@ new_slab:
if (unlikely(!freelist)) {
slab_out_of_memory(s, gfpflags, node);
- return NULL;
+ goto out;
}
page = c->page;
@@ -2650,7 +2711,7 @@ new_slab:
goto new_slab; /* Slab failed checks. Next slab needed */
deactivate_slab(s, page, get_freepointer(s, freelist), c);
- return freelist;
+ goto out;
}
/*
@@ -2662,6 +2723,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
{
void *p;
unsigned long flags;
+ LIST_HEAD(tofree);
local_irq_save(flags);
#ifdef CONFIG_PREEMPTION
@@ -2673,8 +2735,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
c = this_cpu_ptr(s->cpu_slab);
#endif
- p = ___slab_alloc(s, gfpflags, node, addr, c);
+ p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
local_irq_restore(flags);
+ free_delayed(&tofree);
return p;
}
@@ -2707,6 +2770,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
struct page *page;
unsigned long tid;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && IS_ENABLED(CONFIG_DEBUG_ATOMIC_SLEEP))
+ WARN_ON_ONCE(!preemptible() && system_state >= SYSTEM_SCHEDULING);
+
s = slab_pre_alloc_hook(s, gfpflags);
if (!s)
return NULL;
@@ -2873,7 +2939,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
do {
if (unlikely(n)) {
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
n = NULL;
}
prior = page->freelist;
@@ -2905,7 +2971,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
* Otherwise the list_lock will synchronize with
* other processors updating the list of slabs.
*/
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
}
}
@@ -2946,7 +3012,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
return;
slab_empty:
@@ -2961,7 +3027,7 @@ slab_empty:
remove_full(s, n, page);
}
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, FREE_SLAB);
discard_slab(s, page);
}
@@ -3166,8 +3232,12 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
struct kmem_cache_cpu *c;
+ LIST_HEAD(to_free);
int i;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && IS_ENABLED(CONFIG_DEBUG_ATOMIC_SLEEP))
+ WARN_ON_ONCE(!preemptible() && system_state >= SYSTEM_SCHEDULING);
+
/* memcg and kmem_cache debug support */
s = slab_pre_alloc_hook(s, flags);
if (unlikely(!s))
@@ -3198,7 +3268,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
* of re-populating per CPU c->freelist
*/
p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
- _RET_IP_, c);
+ _RET_IP_, c, &to_free);
if (unlikely(!p[i]))
goto error;
@@ -3213,6 +3283,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
}
c->tid = next_tid(c->tid);
local_irq_enable();
+ free_delayed(&to_free);
/* Clear memory outside IRQ disabled fastpath loop */
if (unlikely(slab_want_init_on_alloc(flags, s))) {
@@ -3227,6 +3298,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
return i;
error:
local_irq_enable();
+ free_delayed(&to_free);
slab_post_alloc_hook(s, flags, i, p);
__kmem_cache_free_bulk(s, i, p);
return 0;
@@ -3362,7 +3434,7 @@ static void
init_kmem_cache_node(struct kmem_cache_node *n)
{
n->nr_partial = 0;
- spin_lock_init(&n->list_lock);
+ raw_spin_lock_init(&n->list_lock);
INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
atomic_long_set(&n->nr_slabs, 0);
@@ -3711,6 +3783,11 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_PREEMPT_RT
+ /* XXX move out of irq-off section */
+ slab_err(s, page, text, s->name);
+#else
+
void *addr = page_address(page);
void *p;
unsigned long *map;
@@ -3730,6 +3807,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
slab_unlock(page);
#endif
+#endif
}
/*
@@ -3743,7 +3821,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
struct page *page, *h;
BUG_ON(irqs_disabled());
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
list_for_each_entry_safe(page, h, &n->partial, slab_list) {
if (!page->inuse) {
remove_partial(n, page);
@@ -3753,7 +3831,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
"Objects remaining in %s on __kmem_cache_shutdown()");
}
}
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
list_for_each_entry_safe(page, h, &discard, slab_list)
discard_slab(s, page);
@@ -4025,7 +4103,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
INIT_LIST_HEAD(promote + i);
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
/*
* Build lists of slabs to discard or promote.
@@ -4056,7 +4134,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
list_splice(promote + i, &n->partial);
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
list_for_each_entry_safe(page, t, &discard, slab_list)
@@ -4263,6 +4341,12 @@ void __init kmem_cache_init(void)
{
static __initdata struct kmem_cache boot_kmem_cache,
boot_kmem_cache_node;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
+ INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
+ }
if (debug_guardpage_minorder())
slub_max_order = 0;
@@ -4454,7 +4538,7 @@ static int validate_slab_node(struct kmem_cache *s,
struct page *page;
unsigned long flags;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list) {
validate_slab(s, page);
@@ -4476,7 +4560,7 @@ static int validate_slab_node(struct kmem_cache *s,
s->name, count, atomic_long_read(&n->nr_slabs));
out:
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
return count;
}
@@ -4655,12 +4739,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
if (!atomic_long_read(&n->nr_slabs))
continue;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list)
process_slab(&t, s, page, alloc);
list_for_each_entry(page, &n->full, slab_list)
process_slab(&t, s, page, alloc);
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
}
for (i = 0; i < t.count; i++) {
diff --git a/mm/swap.c b/mm/swap.c
index cf39d24ada2a..953d0edc2a5a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -33,6 +33,7 @@
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
+#include <linux/locallock.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
@@ -52,6 +53,8 @@ static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
#endif
+static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
+DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
/*
* This path almost never happens for VM activity - pages are normally
@@ -254,11 +257,11 @@ void rotate_reclaimable_page(struct page *page)
unsigned long flags;
get_page(page);
- local_irq_save(flags);
+ local_lock_irqsave(rotate_lock, flags);
pvec = this_cpu_ptr(&lru_rotate_pvecs);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_move_tail(pvec);
- local_irq_restore(flags);
+ local_unlock_irqrestore(rotate_lock, flags);
}
}
@@ -308,12 +311,13 @@ void activate_page(struct page *page)
{
page = compound_head(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
+ activate_page_pvecs);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, __activate_page, NULL);
- put_cpu_var(activate_page_pvecs);
+ put_locked_var(swapvec_lock, activate_page_pvecs);
}
}
@@ -335,7 +339,7 @@ void activate_page(struct page *page)
static void __lru_cache_activate_page(struct page *page)
{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
int i;
/*
@@ -357,7 +361,7 @@ static void __lru_cache_activate_page(struct page *page)
}
}
- put_cpu_var(lru_add_pvec);
+ put_locked_var(swapvec_lock, lru_add_pvec);
}
/*
@@ -404,12 +408,12 @@ EXPORT_SYMBOL(mark_page_accessed);
static void __lru_cache_add(struct page *page)
{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec); /* pairs with put_locked_var() below */
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
__pagevec_lru_add(pvec);
- put_cpu_var(lru_add_pvec);
+ put_locked_var(swapvec_lock, lru_add_pvec); /* pairs with get_locked_var() above */
}
/**
@@ -603,9 +607,9 @@ void lru_add_drain_cpu(int cpu)
unsigned long flags;
/* No harm done if a racing interrupt already did this */
- local_irq_save(flags);
+ local_lock_irqsave(rotate_lock, flags);
pagevec_move_tail(pvec);
- local_irq_restore(flags);
+ local_unlock_irqrestore(rotate_lock, flags);
}
pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
@@ -641,11 +645,12 @@ void deactivate_file_page(struct page *page)
return;
if (likely(get_page_unless_zero(page))) {
- struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
+ lru_deactivate_file_pvecs);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
- put_cpu_var(lru_deactivate_file_pvecs);
+ put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
}
}
@@ -680,19 +685,20 @@ void mark_page_lazyfree(struct page *page)
{
if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
!PageSwapCache(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
+ lru_lazyfree_pvecs);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
- put_cpu_var(lru_lazyfree_pvecs);
+ put_locked_var(swapvec_lock, lru_lazyfree_pvecs);
}
}
void lru_add_drain(void)
{
- lru_add_drain_cpu(get_cpu());
- put_cpu();
+ lru_add_drain_cpu(local_lock_cpu(swapvec_lock)); /* drains the CPU id returned by local_lock_cpu() */
+ local_unlock_cpu(swapvec_lock); /* pairs with local_lock_cpu() above */
}
#ifdef CONFIG_SMP
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6b8eeb0ecee5..2e89ac817e3e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1501,7 +1501,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
struct vmap_block *vb;
struct vmap_area *va;
unsigned long vb_idx;
- int node, err;
+ int node, err, cpu;
void *vaddr;
node = numa_node_id();
@@ -1544,11 +1544,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
BUG_ON(err);
radix_tree_preload_end();
- vbq = &get_cpu_var(vmap_block_queue);
+ cpu = get_cpu_light();
+ vbq = this_cpu_ptr(&vmap_block_queue);
spin_lock(&vbq->lock);
list_add_tail_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
- put_cpu_var(vmap_block_queue);
+ put_cpu_light();
return vaddr;
}
@@ -1617,6 +1618,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
struct vmap_block *vb;
void *vaddr = NULL;
unsigned int order;
+ int cpu;
BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -1631,7 +1633,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
order = get_order(size);
rcu_read_lock();
- vbq = &get_cpu_var(vmap_block_queue);
+ cpu = get_cpu_light();
+ vbq = this_cpu_ptr(&vmap_block_queue);
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
unsigned long pages_off;
@@ -1654,7 +1657,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
break;
}
- put_cpu_var(vmap_block_queue);
+ put_cpu_light();
rcu_read_unlock();
/* Allocate new block if nothing was found */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 78d53378db99..5385f81c3b7c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -321,6 +321,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
long x;
long t;
+ preempt_disable_rt();
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -330,6 +331,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
x = 0;
}
__this_cpu_write(*p, x);
+ preempt_enable_rt();
}
EXPORT_SYMBOL(__mod_zone_page_state);
@@ -341,6 +343,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long x;
long t;
+ preempt_disable_rt();
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -350,6 +353,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
x = 0;
}
__this_cpu_write(*p, x);
+ preempt_enable_rt();
}
EXPORT_SYMBOL(__mod_node_page_state);
@@ -382,6 +386,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
+ preempt_disable_rt();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -390,6 +395,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
zone_page_state_add(v + overstep, zone, item);
__this_cpu_write(*p, -overstep);
}
+ preempt_enable_rt();
}
void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -398,6 +404,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
s8 __percpu *p = pcp->vm_node_stat_diff + item;
s8 v, t;
+ preempt_disable_rt();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -406,6 +413,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
node_page_state_add(v + overstep, pgdat, item);
__this_cpu_write(*p, -overstep);
}
+ preempt_enable_rt();
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -426,6 +434,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
+ preempt_disable_rt();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -434,6 +443,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
zone_page_state_add(v - overstep, zone, item);
__this_cpu_write(*p, overstep);
}
+ preempt_enable_rt();
}
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -442,6 +452,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
s8 __percpu *p = pcp->vm_node_stat_diff + item;
s8 v, t;
+ preempt_disable_rt();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -450,6 +461,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
node_page_state_add(v - overstep, pgdat, item);
__this_cpu_write(*p, overstep);
}
+ preempt_enable_rt();
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
diff --git a/mm/workingset.c b/mm/workingset.c
index 474186b76ced..0d3d8c947b09 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -404,6 +404,8 @@ static struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
+ struct address_space *mapping;
+
/*
* Track non-empty nodes that contain only shadow entries;
* unlink those that contain pages or are being freed.
@@ -412,7 +414,8 @@ void workingset_update_node(struct xa_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
+ mapping = container_of(node->array, struct address_space, i_pages);
+ lockdep_assert_held(&mapping->i_pages.xa_lock);
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 22d17ecfe7df..e494ebf279ea 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -57,6 +57,7 @@
#include <linux/wait.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
+#include <linux/locallock.h>
#define ZSPAGE_MAGIC 0x58
@@ -74,9 +75,22 @@
*/
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
-
#define ZS_HANDLE_SIZE (sizeof(unsigned long))
+#ifdef CONFIG_PREEMPT_RT
+
+struct zsmalloc_handle {
+ unsigned long addr; /* object value, written by record_obj() */
+ struct mutex lock; /* taken by pin_tag()/trypin_tag(), dropped by unpin_tag() */
+};
+
+#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
+
+#else /* !CONFIG_PREEMPT_RT */
+
+#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
+#endif /* CONFIG_PREEMPT_RT */
+
/*
* Object location (<PFN>, <obj_idx>) is encoded as
* as single (unsigned long) handle value.
@@ -326,7 +340,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
static int create_cache(struct zs_pool *pool)
{
- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
0, 0, NULL);
if (!pool->handle_cachep)
return 1;
@@ -350,10 +364,27 @@ static void destroy_cache(struct zs_pool *pool)
static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
{
- return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ void *p;
+
+ p = kmem_cache_alloc(pool->handle_cachep,
+ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+#ifdef CONFIG_PREEMPT_RT
+ if (p) { /* only a successfully allocated handle gets its mutex initialised */
+ struct zsmalloc_handle *zh = p;
+
+ mutex_init(&zh->lock);
+ }
+#endif /* CONFIG_PREEMPT_RT */
+ return (unsigned long)p; /* 0 when kmem_cache_alloc() returned NULL */
}
+#ifdef CONFIG_PREEMPT_RT
+static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
+{
+ return (void *)(handle & ~((1UL << OBJ_TAG_BITS) - 1)); /* clear the low OBJ_TAG_BITS bits; mask built in unsigned long */
+}
+#endif /* CONFIG_PREEMPT_RT */
+
static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
{
kmem_cache_free(pool->handle_cachep, (void *)handle);
@@ -372,12 +403,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
static void record_obj(unsigned long handle, unsigned long obj)
{
+#ifdef CONFIG_PREEMPT_RT
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ WRITE_ONCE(zh->addr, obj); /* RT: the value is kept in zsmalloc_handle.addr */
+#else /* !CONFIG_PREEMPT_RT */
/*
* lsb of @obj represents handle lock while other bits
* represent object value the handle is pointing so
* updating shouldn't do store tearing.
*/
WRITE_ONCE(*(unsigned long *)handle, obj);
+#endif /* CONFIG_PREEMPT_RT */
}
/* zpool driver */
@@ -460,6 +497,7 @@ MODULE_ALIAS("zpool-zsmalloc");
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
+static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
static bool is_zspage_isolated(struct zspage *zspage)
{
@@ -869,7 +907,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
static unsigned long handle_to_obj(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return zh->addr; /* RT: read back the value stored by record_obj() */
+#else /* !CONFIG_PREEMPT_RT */
return *(unsigned long *)handle;
+#endif /* CONFIG_PREEMPT_RT */
}
static unsigned long obj_to_head(struct page *page, void *obj)
@@ -883,22 +927,46 @@ static unsigned long obj_to_head(struct page *page, void *obj)
static inline int testpin_tag(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return mutex_is_locked(&zh->lock); /* RT: pin state is the per-handle mutex */
+#else /* !CONFIG_PREEMPT_RT */
return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif /* CONFIG_PREEMPT_RT */
}
static inline int trypin_tag(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return mutex_trylock(&zh->lock); /* RT: try to take the per-handle mutex instead of the pin bit */
+#else /* !CONFIG_PREEMPT_RT */
return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif /* CONFIG_PREEMPT_RT */
}
static void pin_tag(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ mutex_lock(&zh->lock); /* RT: pin by taking the per-handle mutex; no value to return from a void function */
+#else /* !CONFIG_PREEMPT_RT */
bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif /* CONFIG_PREEMPT_RT */
}
static void unpin_tag(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ mutex_unlock(&zh->lock); /* RT: unpin by releasing the per-handle mutex; no value to return from a void function */
+#else /* !CONFIG_PREEMPT_RT */
bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif /* CONFIG_PREEMPT_RT */
}
static void reset_page(struct page *page)
@@ -1324,7 +1392,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
class = pool->size_class[class_idx];
off = (class->size * obj_idx) & ~PAGE_MASK;
- area = &get_cpu_var(zs_map_area);
+ area = &get_locked_var(zs_map_area_lock, zs_map_area);
area->vm_mm = mm;
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
@@ -1378,7 +1446,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
__zs_unmap_object(area, pages, off, class->size);
}
- put_cpu_var(zs_map_area);
+ put_locked_var(zs_map_area_lock, zs_map_area);
migrate_read_unlock(zspage);
unpin_tag(handle);
diff --git a/mm/zswap.c b/mm/zswap.c
index 55094e63b72d..1591b5d37f65 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -18,6 +18,7 @@
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/locallock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
@@ -995,6 +996,8 @@ static void zswap_fill_page(void *ptr, unsigned long value)
memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
}
+/* protect zswap_dstmem from concurrency */
+static DEFINE_LOCAL_IRQ_LOCK(zswap_dstmem_lock);
/*********************************
* frontswap hooks
**********************************/
@@ -1074,12 +1077,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
}
/* compress */
- dst = get_cpu_var(zswap_dstmem);
- tfm = *get_cpu_ptr(entry->pool->tfm);
+ dst = get_locked_var(zswap_dstmem_lock, zswap_dstmem);
+ tfm = *this_cpu_ptr(entry->pool->tfm);
src = kmap_atomic(page);
ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
kunmap_atomic(src);
- put_cpu_ptr(entry->pool->tfm);
if (ret) {
ret = -EINVAL;
goto put_dstmem;
@@ -1103,7 +1105,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
memcpy(buf, &zhdr, hlen);
memcpy(buf + hlen, dst, dlen);
zpool_unmap_handle(entry->pool->zpool, handle);
- put_cpu_var(zswap_dstmem);
+ put_locked_var(zswap_dstmem_lock, zswap_dstmem);
/* populate entry */
entry->offset = offset;
@@ -1131,7 +1133,7 @@ insert_entry:
return 0;
put_dstmem:
- put_cpu_var(zswap_dstmem);
+ put_locked_var(zswap_dstmem_lock, zswap_dstmem);
zswap_pool_put(entry->pool);
freepage:
zswap_entry_cache_free(entry);