author     Sebastian Andrzej Siewior <bigeasy@linutronix.de>  2021-11-22 13:28:14 +0100
committer  Sebastian Andrzej Siewior <bigeasy@linutronix.de>  2021-11-22 13:28:14 +0100
commit     e8dc317fedb7105a8d4b23d3c808d2680ccd7dcd (patch)
tree       4a373e98e3f7d51833e07f6b2c39fdcd9f3f150c
parent     ee740a0bb8014e7981d6d179ce8579a685e8d1f4 (diff)
download   linux-rt-e8dc317fedb7105a8d4b23d3c808d2680ccd7dcd.tar.gz
[ANNOUNCE] v5.16-rc2-rt4
Dear RT folks!
I'm pleased to announce the v5.16-rc2-rt4 patch set.
Changes since v5.16-rc2-rt3:
- Replace the current zsmalloc patch with patches from Minchan Kim
which have been applied to the akpm tree.
Known issues
- netconsole triggers WARN.
- The "Memory controller" (CONFIG_MEMCG) has been disabled.
- Valentin Schneider reported a few splats on ARM64, see
https://lkml.kernel.org/r/20210810134127.1394269-1-valentin.schneider@arm.com
The delta patch against v5.16-rc2-rt3 is appended below and can be found here:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.16/incr/patch-5.16-rc2-rt3-rt4.patch.xz
You can get this release via the git tree at:
git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.16-rc2-rt4
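For example, assuming an existing clone of the linux-rt-devel tree, one way to
fetch and check out that release tag is:

  # fetch the v5.16-rc2-rt4 tag; FETCH_HEAD points at it afterwards
  git fetch git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.16-rc2-rt4
  git checkout FETCH_HEAD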
The RT patch against v5.16-rc2 can be found here:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.16/older/patch-5.16-rc2-rt4.patch.xz
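For example, once downloaded the patch can be applied on top of a clean
v5.16-rc2 source tree with:

  # run from inside the v5.16-rc2 kernel source directory
  xz -dc patch-5.16-rc2-rt4.patch.xz | patch -p1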
The split quilt queue is available at:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.16/older/patches-5.16-rc2-rt4.tar.xz
Sebastian
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
22 files changed, 1732 insertions, 268 deletions
diff --git a/patches/0001_zsmalloc_introduce_some_helper_functions.patch b/patches/0001_zsmalloc_introduce_some_helper_functions.patch new file mode 100644 index 000000000000..bdadaf2d44fb --- /dev/null +++ b/patches/0001_zsmalloc_introduce_some_helper_functions.patch @@ -0,0 +1,187 @@ +From: Minchan Kim <minchan@kernel.org> +Subject: zsmalloc: introduce some helper functions +Date: Mon, 15 Nov 2021 10:59:01 -0800 + +get_zspage_mapping returns fullness as well as class_idx. However, +the fullness is usually not used since it could be stale in some +contexts. It causes misleading as well as unnecessary instructions +so this patch introduces zspage_class. + +obj_to_location also produces page and index but we don't need +always the index, either so this patch introduces obj_to_page. + +Signed-off-by: Minchan Kim <minchan@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Link: https://lore.kernel.org/r/20211115185909.3949505-2-minchan@kernel.org +--- + mm/zsmalloc.c | 54 +++++++++++++++++++++++------------------------------- + 1 file changed, 23 insertions(+), 31 deletions(-) + +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -517,6 +517,12 @@ static void get_zspage_mapping(struct zs + *class_idx = zspage->class; + } + ++static struct size_class *zspage_class(struct zs_pool *pool, ++ struct zspage *zspage) ++{ ++ return pool->size_class[zspage->class]; ++} ++ + static void set_zspage_mapping(struct zspage *zspage, + unsigned int class_idx, + enum fullness_group fullness) +@@ -844,6 +850,12 @@ static void obj_to_location(unsigned lon + *obj_idx = (obj & OBJ_INDEX_MASK); + } + ++static void obj_to_page(unsigned long obj, struct page **page) ++{ ++ obj >>= OBJ_TAG_BITS; ++ *page = pfn_to_page(obj >> OBJ_INDEX_BITS); ++} ++ + /** + * location_to_obj - get obj value encoded from (<page>, <obj_idx>) + * @page: page object resides in zspage +@@ -1246,8 +1258,6 @@ void *zs_map_object(struct zs_pool *pool + unsigned long obj, off; + unsigned int obj_idx; + +- unsigned int class_idx; +- enum fullness_group fg; + struct size_class *class; + struct mapping_area *area; + struct page *pages[2]; +@@ -1270,8 +1280,7 @@ void *zs_map_object(struct zs_pool *pool + /* migration cannot move any subpage in this zspage */ + migrate_read_lock(zspage); + +- get_zspage_mapping(zspage, &class_idx, &fg); +- class = pool->size_class[class_idx]; ++ class = zspage_class(pool, zspage); + off = (class->size * obj_idx) & ~PAGE_MASK; + + area = &get_cpu_var(zs_map_area); +@@ -1304,16 +1313,13 @@ void zs_unmap_object(struct zs_pool *poo + unsigned long obj, off; + unsigned int obj_idx; + +- unsigned int class_idx; +- enum fullness_group fg; + struct size_class *class; + struct mapping_area *area; + + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); + zspage = get_zspage(page); +- get_zspage_mapping(zspage, &class_idx, &fg); +- class = pool->size_class[class_idx]; ++ class = zspage_class(pool, zspage); + off = (class->size * obj_idx) & ~PAGE_MASK; + + area = this_cpu_ptr(&zs_map_area); +@@ -1491,8 +1497,6 @@ void zs_free(struct zs_pool *pool, unsig + struct zspage *zspage; + struct page *f_page; + unsigned long obj; +- unsigned int f_objidx; +- int class_idx; + struct size_class *class; + enum fullness_group fullness; + bool isolated; +@@ -1502,13 +1506,11 @@ void zs_free(struct zs_pool *pool, unsig + + pin_tag(handle); + obj = handle_to_obj(handle); +- obj_to_location(obj, &f_page, &f_objidx); ++ obj_to_page(obj, &f_page); + zspage = get_zspage(f_page); + + 
migrate_read_lock(zspage); +- +- get_zspage_mapping(zspage, &class_idx, &fullness); +- class = pool->size_class[class_idx]; ++ class = zspage_class(pool, zspage); + + spin_lock(&class->lock); + obj_free(class, obj); +@@ -1866,8 +1868,6 @@ static bool zs_page_isolate(struct page + { + struct zs_pool *pool; + struct size_class *class; +- int class_idx; +- enum fullness_group fullness; + struct zspage *zspage; + struct address_space *mapping; + +@@ -1880,15 +1880,10 @@ static bool zs_page_isolate(struct page + + zspage = get_zspage(page); + +- /* +- * Without class lock, fullness could be stale while class_idx is okay +- * because class_idx is constant unless page is freed so we should get +- * fullness again under class lock. +- */ +- get_zspage_mapping(zspage, &class_idx, &fullness); + mapping = page_mapping(page); + pool = mapping->private_data; +- class = pool->size_class[class_idx]; ++ ++ class = zspage_class(pool, zspage); + + spin_lock(&class->lock); + if (get_zspage_inuse(zspage) == 0) { +@@ -1907,6 +1902,9 @@ static bool zs_page_isolate(struct page + * size_class to prevent further object allocation from the zspage. + */ + if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { ++ enum fullness_group fullness; ++ unsigned int class_idx; ++ + get_zspage_mapping(zspage, &class_idx, &fullness); + atomic_long_inc(&pool->isolated_pages); + remove_zspage(class, zspage, fullness); +@@ -1923,8 +1921,6 @@ static int zs_page_migrate(struct addres + { + struct zs_pool *pool; + struct size_class *class; +- int class_idx; +- enum fullness_group fullness; + struct zspage *zspage; + struct page *dummy; + void *s_addr, *d_addr, *addr; +@@ -1949,9 +1945,8 @@ static int zs_page_migrate(struct addres + + /* Concurrent compactor cannot migrate any subpage in zspage */ + migrate_write_lock(zspage); +- get_zspage_mapping(zspage, &class_idx, &fullness); + pool = mapping->private_data; +- class = pool->size_class[class_idx]; ++ class = zspage_class(pool, zspage); + offset = get_first_obj_offset(page); + + spin_lock(&class->lock); +@@ -2049,8 +2044,6 @@ static void zs_page_putback(struct page + { + struct zs_pool *pool; + struct size_class *class; +- int class_idx; +- enum fullness_group fg; + struct address_space *mapping; + struct zspage *zspage; + +@@ -2058,10 +2051,9 @@ static void zs_page_putback(struct page + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + zspage = get_zspage(page); +- get_zspage_mapping(zspage, &class_idx, &fg); + mapping = page_mapping(page); + pool = mapping->private_data; +- class = pool->size_class[class_idx]; ++ class = zspage_class(pool, zspage); + + spin_lock(&class->lock); + dec_zspage_isolation(zspage); diff --git a/patches/0002_zsmalloc_rename_zs_stat_type_to_class_stat_type.patch b/patches/0002_zsmalloc_rename_zs_stat_type_to_class_stat_type.patch new file mode 100644 index 000000000000..8efd92883d8a --- /dev/null +++ b/patches/0002_zsmalloc_rename_zs_stat_type_to_class_stat_type.patch @@ -0,0 +1,105 @@ +From: Minchan Kim <minchan@kernel.org> +Subject: zsmalloc: rename zs_stat_type to class_stat_type +Date: Mon, 15 Nov 2021 10:59:02 -0800 + +The stat aims for class stat, not zspage so rename it. 
+ +Signed-off-by: Minchan Kim <minchan@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Link: https://lore.kernel.org/r/20211115185909.3949505-3-minchan@kernel.org +--- + mm/zsmalloc.c | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -158,7 +158,7 @@ enum fullness_group { + NR_ZS_FULLNESS, + }; + +-enum zs_stat_type { ++enum class_stat_type { + CLASS_EMPTY, + CLASS_ALMOST_EMPTY, + CLASS_ALMOST_FULL, +@@ -549,21 +549,21 @@ static int get_size_class_index(int size + return min_t(int, ZS_SIZE_CLASSES - 1, idx); + } + +-/* type can be of enum type zs_stat_type or fullness_group */ +-static inline void zs_stat_inc(struct size_class *class, ++/* type can be of enum type class_stat_type or fullness_group */ ++static inline void class_stat_inc(struct size_class *class, + int type, unsigned long cnt) + { + class->stats.objs[type] += cnt; + } + +-/* type can be of enum type zs_stat_type or fullness_group */ +-static inline void zs_stat_dec(struct size_class *class, ++/* type can be of enum type class_stat_type or fullness_group */ ++static inline void class_stat_dec(struct size_class *class, + int type, unsigned long cnt) + { + class->stats.objs[type] -= cnt; + } + +-/* type can be of enum type zs_stat_type or fullness_group */ ++/* type can be of enum type class_stat_type or fullness_group */ + static inline unsigned long zs_stat_get(struct size_class *class, + int type) + { +@@ -725,7 +725,7 @@ static void insert_zspage(struct size_cl + { + struct zspage *head; + +- zs_stat_inc(class, fullness, 1); ++ class_stat_inc(class, fullness, 1); + head = list_first_entry_or_null(&class->fullness_list[fullness], + struct zspage, list); + /* +@@ -750,7 +750,7 @@ static void remove_zspage(struct size_cl + VM_BUG_ON(is_zspage_isolated(zspage)); + + list_del_init(&zspage->list); +- zs_stat_dec(class, fullness, 1); ++ class_stat_dec(class, fullness, 1); + } + + /* +@@ -964,7 +964,7 @@ static void __free_zspage(struct zs_pool + + cache_free_zspage(pool, zspage); + +- zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); ++ class_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); + } +@@ -1394,7 +1394,7 @@ static unsigned long obj_malloc(struct s + + kunmap_atomic(vaddr); + mod_zspage_inuse(zspage, 1); +- zs_stat_inc(class, OBJ_USED, 1); ++ class_stat_inc(class, OBJ_USED, 1); + + obj = location_to_obj(m_page, obj); + +@@ -1458,7 +1458,7 @@ unsigned long zs_malloc(struct zs_pool * + record_obj(handle, obj); + atomic_long_add(class->pages_per_zspage, + &pool->pages_allocated); +- zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); ++ class_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); + + /* We completely set up zspage so mark them as movable */ + SetZsPageMovable(pool, zspage); +@@ -1489,7 +1489,7 @@ static void obj_free(struct size_class * + kunmap_atomic(vaddr); + set_freeobj(zspage, f_objidx); + mod_zspage_inuse(zspage, -1); +- zs_stat_dec(class, OBJ_USED, 1); ++ class_stat_dec(class, OBJ_USED, 1); + } + + void zs_free(struct zs_pool *pool, unsigned long handle) diff --git a/patches/0003_zsmalloc_decouple_class_actions_from_zspage_works.patch b/patches/0003_zsmalloc_decouple_class_actions_from_zspage_works.patch new file mode 100644 index 000000000000..a1af4aa766fc --- /dev/null +++ b/patches/0003_zsmalloc_decouple_class_actions_from_zspage_works.patch @@ -0,0 +1,131 @@ +From: Minchan Kim 
<minchan@kernel.org> +Subject: zsmalloc: decouple class actions from zspage works +Date: Mon, 15 Nov 2021 10:59:03 -0800 + +This patch moves class stat update out of obj_malloc since +it's not related to zspage operation. +This is a preparation to introduce new lock scheme in next +patch. + +Signed-off-by: Minchan Kim <minchan@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Link: https://lore.kernel.org/r/20211115185909.3949505-4-minchan@kernel.org +--- + mm/zsmalloc.c | 23 +++++++++++++---------- + 1 file changed, 13 insertions(+), 10 deletions(-) + +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -1360,17 +1360,19 @@ size_t zs_huge_class_size(struct zs_pool + } + EXPORT_SYMBOL_GPL(zs_huge_class_size); + +-static unsigned long obj_malloc(struct size_class *class, ++static unsigned long obj_malloc(struct zs_pool *pool, + struct zspage *zspage, unsigned long handle) + { + int i, nr_page, offset; + unsigned long obj; + struct link_free *link; ++ struct size_class *class; + + struct page *m_page; + unsigned long m_offset; + void *vaddr; + ++ class = pool->size_class[zspage->class]; + handle |= OBJ_ALLOCATED_TAG; + obj = get_freeobj(zspage); + +@@ -1394,7 +1396,6 @@ static unsigned long obj_malloc(struct s + + kunmap_atomic(vaddr); + mod_zspage_inuse(zspage, 1); +- class_stat_inc(class, OBJ_USED, 1); + + obj = location_to_obj(m_page, obj); + +@@ -1433,10 +1434,11 @@ unsigned long zs_malloc(struct zs_pool * + spin_lock(&class->lock); + zspage = find_get_zspage(class); + if (likely(zspage)) { +- obj = obj_malloc(class, zspage, handle); ++ obj = obj_malloc(pool, zspage, handle); + /* Now move the zspage to another fullness group, if required */ + fix_fullness_group(class, zspage); + record_obj(handle, obj); ++ class_stat_inc(class, OBJ_USED, 1); + spin_unlock(&class->lock); + + return handle; +@@ -1451,7 +1453,7 @@ unsigned long zs_malloc(struct zs_pool * + } + + spin_lock(&class->lock); +- obj = obj_malloc(class, zspage, handle); ++ obj = obj_malloc(pool, zspage, handle); + newfg = get_fullness_group(class, zspage); + insert_zspage(class, zspage, newfg); + set_zspage_mapping(zspage, class->index, newfg); +@@ -1459,6 +1461,7 @@ unsigned long zs_malloc(struct zs_pool * + atomic_long_add(class->pages_per_zspage, + &pool->pages_allocated); + class_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); ++ class_stat_inc(class, OBJ_USED, 1); + + /* We completely set up zspage so mark them as movable */ + SetZsPageMovable(pool, zspage); +@@ -1468,7 +1471,7 @@ unsigned long zs_malloc(struct zs_pool * + } + EXPORT_SYMBOL_GPL(zs_malloc); + +-static void obj_free(struct size_class *class, unsigned long obj) ++static void obj_free(int class_size, unsigned long obj) + { + struct link_free *link; + struct zspage *zspage; +@@ -1478,7 +1481,7 @@ static void obj_free(struct size_class * + void *vaddr; + + obj_to_location(obj, &f_page, &f_objidx); +- f_offset = (class->size * f_objidx) & ~PAGE_MASK; ++ f_offset = (class_size * f_objidx) & ~PAGE_MASK; + zspage = get_zspage(f_page); + + vaddr = kmap_atomic(f_page); +@@ -1489,7 +1492,6 @@ static void obj_free(struct size_class * + kunmap_atomic(vaddr); + set_freeobj(zspage, f_objidx); + mod_zspage_inuse(zspage, -1); +- class_stat_dec(class, OBJ_USED, 1); + } + + void zs_free(struct zs_pool *pool, unsigned long handle) +@@ -1513,7 +1515,8 @@ void zs_free(struct zs_pool *pool, unsig + class = zspage_class(pool, zspage); + + spin_lock(&class->lock); +- obj_free(class, obj); ++ obj_free(class->size, obj); ++ class_stat_dec(class, OBJ_USED, 
1); + fullness = fix_fullness_group(class, zspage); + if (fullness != ZS_EMPTY) { + migrate_read_unlock(zspage); +@@ -1671,7 +1674,7 @@ static int migrate_zspage(struct zs_pool + } + + used_obj = handle_to_obj(handle); +- free_obj = obj_malloc(class, get_zspage(d_page), handle); ++ free_obj = obj_malloc(pool, get_zspage(d_page), handle); + zs_object_copy(class, free_obj, used_obj); + obj_idx++; + /* +@@ -1683,7 +1686,7 @@ static int migrate_zspage(struct zs_pool + free_obj |= BIT(HANDLE_PIN_BIT); + record_obj(handle, free_obj); + unpin_tag(handle); +- obj_free(class, used_obj); ++ obj_free(class->size, used_obj); + } + + /* Remember last position in this iteration */ diff --git a/patches/0004_zsmalloc_introduce_obj_allocated.patch b/patches/0004_zsmalloc_introduce_obj_allocated.patch new file mode 100644 index 000000000000..eb344cd8835f --- /dev/null +++ b/patches/0004_zsmalloc_introduce_obj_allocated.patch @@ -0,0 +1,102 @@ +From: Minchan Kim <minchan@kernel.org> +Subject: zsmalloc: introduce obj_allocated +Date: Mon, 15 Nov 2021 10:59:04 -0800 + +The usage pattern for obj_to_head is to check whether the zpage +is allocated or not. Thus, introduce obj_allocated. + +Signed-off-by: Minchan Kim <minchan@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Link: https://lore.kernel.org/r/20211115185909.3949505-5-minchan@kernel.org +--- + mm/zsmalloc.c | 33 ++++++++++++++++----------------- + 1 file changed, 16 insertions(+), 17 deletions(-) + +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -877,13 +877,21 @@ static unsigned long handle_to_obj(unsig + return *(unsigned long *)handle; + } + +-static unsigned long obj_to_head(struct page *page, void *obj) ++static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) + { ++ unsigned long handle; ++ + if (unlikely(PageHugeObject(page))) { + VM_BUG_ON_PAGE(!is_first_page(page), page); +- return page->index; ++ handle = page->index; + } else +- return *(unsigned long *)obj; ++ handle = *(unsigned long *)obj; ++ ++ if (!(handle & OBJ_ALLOCATED_TAG)) ++ return false; ++ ++ *phandle = handle & ~OBJ_ALLOCATED_TAG; ++ return true; + } + + static inline int testpin_tag(unsigned long handle) +@@ -1606,7 +1614,6 @@ static void zs_object_copy(struct size_c + static unsigned long find_alloced_obj(struct size_class *class, + struct page *page, int *obj_idx) + { +- unsigned long head; + int offset = 0; + int index = *obj_idx; + unsigned long handle = 0; +@@ -1616,9 +1623,7 @@ static unsigned long find_alloced_obj(st + offset += class->size * index; + + while (offset < PAGE_SIZE) { +- head = obj_to_head(page, addr + offset); +- if (head & OBJ_ALLOCATED_TAG) { +- handle = head & ~OBJ_ALLOCATED_TAG; ++ if (obj_allocated(page, addr + offset, &handle)) { + if (trypin_tag(handle)) + break; + handle = 0; +@@ -1928,7 +1933,7 @@ static int zs_page_migrate(struct addres + struct page *dummy; + void *s_addr, *d_addr, *addr; + int offset, pos; +- unsigned long handle, head; ++ unsigned long handle; + unsigned long old_obj, new_obj; + unsigned int obj_idx; + int ret = -EAGAIN; +@@ -1964,9 +1969,7 @@ static int zs_page_migrate(struct addres + pos = offset; + s_addr = kmap_atomic(page); + while (pos < PAGE_SIZE) { +- head = obj_to_head(page, s_addr + pos); +- if (head & OBJ_ALLOCATED_TAG) { +- handle = head & ~OBJ_ALLOCATED_TAG; ++ if (obj_allocated(page, s_addr + pos, &handle)) { + if (!trypin_tag(handle)) + goto unpin_objects; + } +@@ -1982,9 +1985,7 @@ static int zs_page_migrate(struct addres + + for (addr = s_addr + offset; addr 
< s_addr + pos; + addr += class->size) { +- head = obj_to_head(page, addr); +- if (head & OBJ_ALLOCATED_TAG) { +- handle = head & ~OBJ_ALLOCATED_TAG; ++ if (obj_allocated(page, addr, &handle)) { + BUG_ON(!testpin_tag(handle)); + + old_obj = handle_to_obj(handle); +@@ -2029,9 +2030,7 @@ static int zs_page_migrate(struct addres + unpin_objects: + for (addr = s_addr + offset; addr < s_addr + pos; + addr += class->size) { +- head = obj_to_head(page, addr); +- if (head & OBJ_ALLOCATED_TAG) { +- handle = head & ~OBJ_ALLOCATED_TAG; ++ if (obj_allocated(page, addr, &handle)) { + BUG_ON(!testpin_tag(handle)); + unpin_tag(handle); + } diff --git a/patches/0005_zsmalloc_move_huge_compressed_obj_from_page_to_zspage.patch b/patches/0005_zsmalloc_move_huge_compressed_obj_from_page_to_zspage.patch new file mode 100644 index 000000000000..954d95c74977 --- /dev/null +++ b/patches/0005_zsmalloc_move_huge_compressed_obj_from_page_to_zspage.patch @@ -0,0 +1,150 @@ +From: Minchan Kim <minchan@kernel.org> +Subject: zsmalloc: move huge compressed obj from page to zspage +Date: Mon, 15 Nov 2021 10:59:05 -0800 + +the flag aims for zspage, not per page. Let's move it to zspage. + +Signed-off-by: Minchan Kim <minchan@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Link: https://lore.kernel.org/r/20211115185909.3949505-6-minchan@kernel.org +--- + mm/zsmalloc.c | 50 ++++++++++++++++++++++++++------------------------ + 1 file changed, 26 insertions(+), 24 deletions(-) + +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -121,6 +121,7 @@ + #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) + #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) + ++#define HUGE_BITS 1 + #define FULLNESS_BITS 2 + #define CLASS_BITS 8 + #define ISOLATED_BITS 3 +@@ -213,22 +214,6 @@ struct size_class { + struct zs_size_stat stats; + }; + +-/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ +-static void SetPageHugeObject(struct page *page) +-{ +- SetPageOwnerPriv1(page); +-} +- +-static void ClearPageHugeObject(struct page *page) +-{ +- ClearPageOwnerPriv1(page); +-} +- +-static int PageHugeObject(struct page *page) +-{ +- return PageOwnerPriv1(page); +-} +- + /* + * Placed within free objects to form a singly linked list. + * For every zspage, zspage->freeobj gives head of this list. 
+@@ -278,6 +263,7 @@ struct zs_pool { + + struct zspage { + struct { ++ unsigned int huge:HUGE_BITS; + unsigned int fullness:FULLNESS_BITS; + unsigned int class:CLASS_BITS + 1; + unsigned int isolated:ISOLATED_BITS; +@@ -298,6 +284,17 @@ struct mapping_area { + enum zs_mapmode vm_mm; /* mapping mode */ + }; + ++/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ ++static void SetZsHugePage(struct zspage *zspage) ++{ ++ zspage->huge = 1; ++} ++ ++static bool ZsHugePage(struct zspage *zspage) ++{ ++ return zspage->huge; ++} ++ + #ifdef CONFIG_COMPACTION + static int zs_register_migration(struct zs_pool *pool); + static void zs_unregister_migration(struct zs_pool *pool); +@@ -830,7 +827,9 @@ static struct zspage *get_zspage(struct + + static struct page *get_next_page(struct page *page) + { +- if (unlikely(PageHugeObject(page))) ++ struct zspage *zspage = get_zspage(page); ++ ++ if (unlikely(ZsHugePage(zspage))) + return NULL; + + return page->freelist; +@@ -880,8 +879,9 @@ static unsigned long handle_to_obj(unsig + static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) + { + unsigned long handle; ++ struct zspage *zspage = get_zspage(page); + +- if (unlikely(PageHugeObject(page))) { ++ if (unlikely(ZsHugePage(zspage))) { + VM_BUG_ON_PAGE(!is_first_page(page), page); + handle = page->index; + } else +@@ -920,7 +920,6 @@ static void reset_page(struct page *page + ClearPagePrivate(page); + set_page_private(page, 0); + page_mapcount_reset(page); +- ClearPageHugeObject(page); + page->freelist = NULL; + } + +@@ -1062,7 +1061,7 @@ static void create_page_chain(struct siz + SetPagePrivate(page); + if (unlikely(class->objs_per_zspage == 1 && + class->pages_per_zspage == 1)) +- SetPageHugeObject(page); ++ SetZsHugePage(zspage); + } else { + prev_page->freelist = page; + } +@@ -1307,7 +1306,7 @@ void *zs_map_object(struct zs_pool *pool + + ret = __zs_map_object(area, pages, off, class->size); + out: +- if (likely(!PageHugeObject(page))) ++ if (likely(!ZsHugePage(zspage))) + ret += ZS_HANDLE_SIZE; + + return ret; +@@ -1395,7 +1394,7 @@ static unsigned long obj_malloc(struct z + vaddr = kmap_atomic(m_page); + link = (struct link_free *)vaddr + m_offset / sizeof(*link); + set_freeobj(zspage, link->next >> OBJ_TAG_BITS); +- if (likely(!PageHugeObject(m_page))) ++ if (likely(!ZsHugePage(zspage))) + /* record handle in the header of allocated chunk */ + link->handle = handle; + else +@@ -1496,7 +1495,10 @@ static void obj_free(int class_size, uns + + /* Insert this object in containing zspage's freelist */ + link = (struct link_free *)(vaddr + f_offset); +- link->next = get_freeobj(zspage) << OBJ_TAG_BITS; ++ if (likely(!ZsHugePage(zspage))) ++ link->next = get_freeobj(zspage) << OBJ_TAG_BITS; ++ else ++ f_page->index = 0; + kunmap_atomic(vaddr); + set_freeobj(zspage, f_objidx); + mod_zspage_inuse(zspage, -1); +@@ -1867,7 +1869,7 @@ static void replace_sub_page(struct size + + create_page_chain(class, zspage, pages); + set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); +- if (unlikely(PageHugeObject(oldpage))) ++ if (unlikely(ZsHugePage(zspage))) + newpage->index = oldpage->index; + __SetPageMovable(newpage, page_mapping(oldpage)); + } diff --git a/patches/0006_zsmalloc_remove_zspage_isolation_for_migration.patch b/patches/0006_zsmalloc_remove_zspage_isolation_for_migration.patch new file mode 100644 index 000000000000..f9d3f7596660 --- /dev/null +++ b/patches/0006_zsmalloc_remove_zspage_isolation_for_migration.patch @@ -0,0 +1,309 @@ +From: Minchan Kim 
<minchan@kernel.org> +Subject: zsmalloc: remove zspage isolation for migration +Date: Mon, 15 Nov 2021 10:59:06 -0800 + +zspage isolation for migration introduced additional exceptions +to be dealt with since the zspage was isolated from class list. +The reason why I isolated zspage from class list was to prevent +race between obj_malloc and page migration via allocating zpage +from the zspage further. However, it couldn't prevent object +freeing from zspage so it needed corner case handling. + +This patch removes the whole mess. Now, we are fine since +class->lock and zspage->lock can prevent the race. + +Signed-off-by: Minchan Kim <minchan@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Link: https://lore.kernel.org/r/20211115185909.3949505-7-minchan@kernel.org +--- + mm/zsmalloc.c | 157 ++-------------------------------------------------------- + 1 file changed, 8 insertions(+), 149 deletions(-) + +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -254,10 +254,6 @@ struct zs_pool { + #ifdef CONFIG_COMPACTION + struct inode *inode; + struct work_struct free_work; +- /* A wait queue for when migration races with async_free_zspage() */ +- struct wait_queue_head migration_wait; +- atomic_long_t isolated_pages; +- bool destroying; + #endif + }; + +@@ -454,11 +450,6 @@ MODULE_ALIAS("zpool-zsmalloc"); + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ + static DEFINE_PER_CPU(struct mapping_area, zs_map_area); + +-static bool is_zspage_isolated(struct zspage *zspage) +-{ +- return zspage->isolated; +-} +- + static __maybe_unused int is_first_page(struct page *page) + { + return PagePrivate(page); +@@ -744,7 +735,6 @@ static void remove_zspage(struct size_cl + enum fullness_group fullness) + { + VM_BUG_ON(list_empty(&class->fullness_list[fullness])); +- VM_BUG_ON(is_zspage_isolated(zspage)); + + list_del_init(&zspage->list); + class_stat_dec(class, fullness, 1); +@@ -770,13 +760,9 @@ static enum fullness_group fix_fullness_ + if (newfg == currfg) + goto out; + +- if (!is_zspage_isolated(zspage)) { +- remove_zspage(class, zspage, currfg); +- insert_zspage(class, zspage, newfg); +- } +- ++ remove_zspage(class, zspage, currfg); ++ insert_zspage(class, zspage, newfg); + set_zspage_mapping(zspage, class_idx, newfg); +- + out: + return newfg; + } +@@ -1511,7 +1497,6 @@ void zs_free(struct zs_pool *pool, unsig + unsigned long obj; + struct size_class *class; + enum fullness_group fullness; +- bool isolated; + + if (unlikely(!handle)) + return; +@@ -1533,11 +1518,9 @@ void zs_free(struct zs_pool *pool, unsig + goto out; + } + +- isolated = is_zspage_isolated(zspage); + migrate_read_unlock(zspage); + /* If zspage is isolated, zs_page_putback will free the zspage */ +- if (likely(!isolated)) +- free_zspage(pool, class, zspage); ++ free_zspage(pool, class, zspage); + out: + + spin_unlock(&class->lock); +@@ -1718,7 +1701,6 @@ static struct zspage *isolate_zspage(str + zspage = list_first_entry_or_null(&class->fullness_list[fg[i]], + struct zspage, list); + if (zspage) { +- VM_BUG_ON(is_zspage_isolated(zspage)); + remove_zspage(class, zspage, fg[i]); + return zspage; + } +@@ -1739,8 +1721,6 @@ static enum fullness_group putback_zspag + { + enum fullness_group fullness; + +- VM_BUG_ON(is_zspage_isolated(zspage)); +- + fullness = get_fullness_group(class, zspage); + insert_zspage(class, zspage, fullness); + set_zspage_mapping(zspage, class->index, fullness); +@@ -1822,35 +1802,10 @@ static void inc_zspage_isolation(struct + + static void 
dec_zspage_isolation(struct zspage *zspage) + { ++ VM_BUG_ON(zspage->isolated == 0); + zspage->isolated--; + } + +-static void putback_zspage_deferred(struct zs_pool *pool, +- struct size_class *class, +- struct zspage *zspage) +-{ +- enum fullness_group fg; +- +- fg = putback_zspage(class, zspage); +- if (fg == ZS_EMPTY) +- schedule_work(&pool->free_work); +- +-} +- +-static inline void zs_pool_dec_isolated(struct zs_pool *pool) +-{ +- VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); +- atomic_long_dec(&pool->isolated_pages); +- /* +- * Checking pool->destroying must happen after atomic_long_dec() +- * for pool->isolated_pages above. Paired with the smp_mb() in +- * zs_unregister_migration(). +- */ +- smp_mb__after_atomic(); +- if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) +- wake_up_all(&pool->migration_wait); +-} +- + static void replace_sub_page(struct size_class *class, struct zspage *zspage, + struct page *newpage, struct page *oldpage) + { +@@ -1876,10 +1831,7 @@ static void replace_sub_page(struct size + + static bool zs_page_isolate(struct page *page, isolate_mode_t mode) + { +- struct zs_pool *pool; +- struct size_class *class; + struct zspage *zspage; +- struct address_space *mapping; + + /* + * Page is locked so zspage couldn't be destroyed. For detail, look at +@@ -1889,39 +1841,9 @@ static bool zs_page_isolate(struct page + VM_BUG_ON_PAGE(PageIsolated(page), page); + + zspage = get_zspage(page); +- +- mapping = page_mapping(page); +- pool = mapping->private_data; +- +- class = zspage_class(pool, zspage); +- +- spin_lock(&class->lock); +- if (get_zspage_inuse(zspage) == 0) { +- spin_unlock(&class->lock); +- return false; +- } +- +- /* zspage is isolated for object migration */ +- if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { +- spin_unlock(&class->lock); +- return false; +- } +- +- /* +- * If this is first time isolation for the zspage, isolate zspage from +- * size_class to prevent further object allocation from the zspage. +- */ +- if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { +- enum fullness_group fullness; +- unsigned int class_idx; +- +- get_zspage_mapping(zspage, &class_idx, &fullness); +- atomic_long_inc(&pool->isolated_pages); +- remove_zspage(class, zspage, fullness); +- } +- ++ migrate_write_lock(zspage); + inc_zspage_isolation(zspage); +- spin_unlock(&class->lock); ++ migrate_write_unlock(zspage); + + return true; + } +@@ -2004,21 +1926,6 @@ static int zs_page_migrate(struct addres + + dec_zspage_isolation(zspage); + +- /* +- * Page migration is done so let's putback isolated zspage to +- * the list if @page is final isolated subpage in the zspage. +- */ +- if (!is_zspage_isolated(zspage)) { +- /* +- * We cannot race with zs_destroy_pool() here because we wait +- * for isolation to hit zero before we start destroying. +- * Also, we ensure that everyone can see pool->destroying before +- * we start waiting. 
+- */ +- putback_zspage_deferred(pool, class, zspage); +- zs_pool_dec_isolated(pool); +- } +- + if (page_zone(newpage) != page_zone(page)) { + dec_zone_page_state(page, NR_ZSPAGES); + inc_zone_page_state(newpage, NR_ZSPAGES); +@@ -2046,30 +1953,15 @@ static int zs_page_migrate(struct addres + + static void zs_page_putback(struct page *page) + { +- struct zs_pool *pool; +- struct size_class *class; +- struct address_space *mapping; + struct zspage *zspage; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + zspage = get_zspage(page); +- mapping = page_mapping(page); +- pool = mapping->private_data; +- class = zspage_class(pool, zspage); +- +- spin_lock(&class->lock); ++ migrate_write_lock(zspage); + dec_zspage_isolation(zspage); +- if (!is_zspage_isolated(zspage)) { +- /* +- * Due to page_lock, we cannot free zspage immediately +- * so let's defer. +- */ +- putback_zspage_deferred(pool, class, zspage); +- zs_pool_dec_isolated(pool); +- } +- spin_unlock(&class->lock); ++ migrate_write_unlock(zspage); + } + + static const struct address_space_operations zsmalloc_aops = { +@@ -2091,36 +1983,8 @@ static int zs_register_migration(struct + return 0; + } + +-static bool pool_isolated_are_drained(struct zs_pool *pool) +-{ +- return atomic_long_read(&pool->isolated_pages) == 0; +-} +- +-/* Function for resolving migration */ +-static void wait_for_isolated_drain(struct zs_pool *pool) +-{ +- +- /* +- * We're in the process of destroying the pool, so there are no +- * active allocations. zs_page_isolate() fails for completely free +- * zspages, so we need only wait for the zs_pool's isolated +- * count to hit zero. +- */ +- wait_event(pool->migration_wait, +- pool_isolated_are_drained(pool)); +-} +- + static void zs_unregister_migration(struct zs_pool *pool) + { +- pool->destroying = true; +- /* +- * We need a memory barrier here to ensure global visibility of +- * pool->destroying. Thus pool->isolated pages will either be 0 in which +- * case we don't care, or it will be > 0 and pool->destroying will +- * ensure that we wake up once isolation hits 0. +- */ +- smp_mb(); +- wait_for_isolated_drain(pool); /* This can block */ + flush_work(&pool->free_work); + iput(pool->inode); + } +@@ -2150,7 +2014,6 @@ static void async_free_zspage(struct wor + spin_unlock(&class->lock); + } + +- + list_for_each_entry_safe(zspage, tmp, &free_pages, list) { + list_del(&zspage->list); + lock_zspage(zspage); +@@ -2363,10 +2226,6 @@ struct zs_pool *zs_create_pool(const cha + if (!pool->name) + goto err; + +-#ifdef CONFIG_COMPACTION +- init_waitqueue_head(&pool->migration_wait); +-#endif +- + if (create_cache(pool)) + goto err; + diff --git a/patches/0007_locking_rwlocks_introduce_write_lock_nested.patch b/patches/0007_locking_rwlocks_introduce_write_lock_nested.patch new file mode 100644 index 000000000000..ae6d1ae4d54b --- /dev/null +++ b/patches/0007_locking_rwlocks_introduce_write_lock_nested.patch @@ -0,0 +1,146 @@ +From: Minchan Kim <minchan@kernel.org> +Subject: locking/rwlocks: introduce write_lock_nested +Date: Mon, 15 Nov 2021 10:59:07 -0800 + +In preparation for converting bit_spin_lock to rwlock in zsmalloc +so that multiple writers of zspages can run at the same time but +those zspages are supposed to be different zspage instance. Thus, +it's not deadlock. This patch adds write_lock_nested to support +the case for LOCKDEP. + +[bigeasy: folded write_lock_nested() fixups for PREEMPT_RT.] 
+ +Cc: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Minchan Kim <minchan@kernel.org> +Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Link: https://lore.kernel.org/r/20211115185909.3949505-8-minchan@kernel.org +Link: https://lkml.kernel.org/r/YZfrMTAXV56HFWJY@google.com +--- + include/linux/rwlock.h | 6 ++++++ + include/linux/rwlock_api_smp.h | 9 +++++++++ + include/linux/rwlock_rt.h | 10 ++++++++++ + include/linux/spinlock_api_up.h | 1 + + kernel/locking/spinlock.c | 6 ++++++ + kernel/locking/spinlock_rt.c | 12 ++++++++++++ + 6 files changed, 44 insertions(+) + +--- a/include/linux/rwlock.h ++++ b/include/linux/rwlock.h +@@ -55,6 +55,12 @@ do { \ + #define write_lock(lock) _raw_write_lock(lock) + #define read_lock(lock) _raw_read_lock(lock) + ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++#define write_lock_nested(lock, subclass) _raw_write_lock_nested(lock, subclass) ++#else ++#define write_lock_nested(lock, subclass) _raw_write_lock(lock) ++#endif ++ + #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + + #define read_lock_irqsave(lock, flags) \ +--- a/include/linux/rwlock_api_smp.h ++++ b/include/linux/rwlock_api_smp.h +@@ -17,6 +17,7 @@ + + void __lockfunc _raw_read_lock(rwlock_t *lock) __acquires(lock); + void __lockfunc _raw_write_lock(rwlock_t *lock) __acquires(lock); ++void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) __acquires(lock); + void __lockfunc _raw_read_lock_bh(rwlock_t *lock) __acquires(lock); + void __lockfunc _raw_write_lock_bh(rwlock_t *lock) __acquires(lock); + void __lockfunc _raw_read_lock_irq(rwlock_t *lock) __acquires(lock); +@@ -46,6 +47,7 @@ void __lockfunc + + #ifdef CONFIG_INLINE_WRITE_LOCK + #define _raw_write_lock(lock) __raw_write_lock(lock) ++#define _raw_write_lock_nested(lock, subclass) __raw_write_lock_nested(lock, subclass) + #endif + + #ifdef CONFIG_INLINE_READ_LOCK_BH +@@ -209,6 +211,13 @@ static inline void __raw_write_lock(rwlo + LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); + } + ++static inline void __raw_write_lock_nested(rwlock_t *lock, int subclass) ++{ ++ preempt_disable(); ++ rwlock_acquire(&lock->dep_map, subclass, 0, _RET_IP_); ++ LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); ++} ++ + #endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */ + + static inline void __raw_write_unlock(rwlock_t *lock) +--- a/include/linux/rwlock_rt.h ++++ b/include/linux/rwlock_rt.h +@@ -28,6 +28,7 @@ extern void rt_read_lock(rwlock_t *rwloc + extern int rt_read_trylock(rwlock_t *rwlock); + extern void rt_read_unlock(rwlock_t *rwlock); + extern void rt_write_lock(rwlock_t *rwlock); ++extern void rt_write_lock_nested(rwlock_t *rwlock, int subclass); + extern int rt_write_trylock(rwlock_t *rwlock); + extern void rt_write_unlock(rwlock_t *rwlock); + +@@ -83,6 +84,15 @@ static __always_inline void write_lock(r + rt_write_lock(rwlock); + } + ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++static __always_inline void write_lock_nested(rwlock_t *rwlock, int subclass) ++{ ++ rt_write_lock_nested(rwlock, subclass); ++} ++#else ++#define write_lock_nested(lock, subclass) rt_write_lock(((void)(subclass), (lock))) ++#endif ++ + static __always_inline void write_lock_bh(rwlock_t *rwlock) + { + local_bh_disable(); +--- a/include/linux/spinlock_api_up.h ++++ b/include/linux/spinlock_api_up.h +@@ -59,6 +59,7 @@ + #define _raw_spin_lock_nested(lock, subclass) __LOCK(lock) + #define _raw_read_lock(lock) __LOCK(lock) + #define 
_raw_write_lock(lock) __LOCK(lock) ++#define _raw_write_lock_nested(lock, subclass) __LOCK(lock) + #define _raw_spin_lock_bh(lock) __LOCK_BH(lock) + #define _raw_read_lock_bh(lock) __LOCK_BH(lock) + #define _raw_write_lock_bh(lock) __LOCK_BH(lock) +--- a/kernel/locking/spinlock.c ++++ b/kernel/locking/spinlock.c +@@ -300,6 +300,12 @@ void __lockfunc _raw_write_lock(rwlock_t + __raw_write_lock(lock); + } + EXPORT_SYMBOL(_raw_write_lock); ++ ++void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) ++{ ++ __raw_write_lock_nested(lock, subclass); ++} ++EXPORT_SYMBOL(_raw_write_lock_nested); + #endif + + #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE +--- a/kernel/locking/spinlock_rt.c ++++ b/kernel/locking/spinlock_rt.c +@@ -239,6 +239,18 @@ void __sched rt_write_lock(rwlock_t *rwl + } + EXPORT_SYMBOL(rt_write_lock); + ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) ++{ ++ rtlock_might_resched(); ++ rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_); ++ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); ++ rcu_read_lock(); ++ migrate_disable(); ++} ++EXPORT_SYMBOL(rt_write_lock_nested); ++#endif ++ + void __sched rt_read_unlock(rwlock_t *rwlock) + { + rwlock_release(&rwlock->dep_map, _RET_IP_); diff --git a/patches/0008_zsmalloc_replace_per_zpage_lock_with_pool_migrate_lock.patch b/patches/0008_zsmalloc_replace_per_zpage_lock_with_pool_migrate_lock.patch new file mode 100644 index 000000000000..10e82cf97255 --- /dev/null +++ b/patches/0008_zsmalloc_replace_per_zpage_lock_with_pool_migrate_lock.patch @@ -0,0 +1,469 @@ +From: Minchan Kim <minchan@kernel.org> +Subject: zsmalloc: replace per zpage lock with pool->migrate_lock +Date: Mon, 15 Nov 2021 10:59:08 -0800 + +The zsmalloc has used a bit for spin_lock in zpage handle to keep +zpage object alive during several operations. However, it causes +the problem for PREEMPT_RT as well as introducing too complicated. + +This patch replaces the bit spin_lock with pool->migrate_lock +rwlock. It could make the code simple as well as zsmalloc work +under PREEMPT_RT. + +The drawback is the pool->migrate_lock is bigger granuarity than +per zpage lock so the contention would be higher than old when +both IO-related operations(i.e., zsmalloc, zsfree, zs_[map|unmap]) +and compaction(page/zpage migration) are going in parallel(*, +the migrate_lock is rwlock and IO related functions are all read +side lock so there is no contention). However, the write-side +is fast enough(dominant overhead is just page copy) so it wouldn't +affect much. If the lock granurity becomes more problem later, +we could introduce table locks based on handle as a hash value. 
+ +Signed-off-by: Minchan Kim <minchan@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Link: https://lore.kernel.org/r/20211115185909.3949505-9-minchan@kernel.org +--- + mm/zsmalloc.c | 205 +++++++++++++++++++++++++++------------------------------- + 1 file changed, 96 insertions(+), 109 deletions(-) + +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -30,6 +30,14 @@ + + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + ++/* ++ * lock ordering: ++ * page_lock ++ * pool->migrate_lock ++ * class->lock ++ * zspage->lock ++ */ ++ + #include <linux/module.h> + #include <linux/kernel.h> + #include <linux/sched.h> +@@ -101,15 +109,6 @@ + #define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) + + /* +- * Memory for allocating for handle keeps object position by +- * encoding <page, obj_idx> and the encoded value has a room +- * in least bit(ie, look at obj_to_location). +- * We use the bit to synchronize between object access by +- * user and migration. +- */ +-#define HANDLE_PIN_BIT 0 +- +-/* + * Head in allocated object should have OBJ_ALLOCATED_TAG + * to identify the object was allocated or not. + * It's okay to add the status bit in the least bit because +@@ -255,6 +254,8 @@ struct zs_pool { + struct inode *inode; + struct work_struct free_work; + #endif ++ /* protect page/zspage migration */ ++ rwlock_t migrate_lock; + }; + + struct zspage { +@@ -297,6 +298,9 @@ static void zs_unregister_migration(stru + static void migrate_lock_init(struct zspage *zspage); + static void migrate_read_lock(struct zspage *zspage); + static void migrate_read_unlock(struct zspage *zspage); ++static void migrate_write_lock(struct zspage *zspage); ++static void migrate_write_lock_nested(struct zspage *zspage); ++static void migrate_write_unlock(struct zspage *zspage); + static void kick_deferred_free(struct zs_pool *pool); + static void init_deferred_free(struct zs_pool *pool); + static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); +@@ -308,6 +312,9 @@ static void zs_unregister_migration(stru + static void migrate_lock_init(struct zspage *zspage) {} + static void migrate_read_lock(struct zspage *zspage) {} + static void migrate_read_unlock(struct zspage *zspage) {} ++static void migrate_write_lock(struct zspage *zspage) {} ++static void migrate_write_lock_nested(struct zspage *zspage) {} ++static void migrate_write_unlock(struct zspage *zspage) {} + static void kick_deferred_free(struct zs_pool *pool) {} + static void init_deferred_free(struct zs_pool *pool) {} + static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} +@@ -359,14 +366,10 @@ static void cache_free_zspage(struct zs_ + kmem_cache_free(pool->zspage_cachep, zspage); + } + ++/* class->lock(which owns the handle) synchronizes races */ + static void record_obj(unsigned long handle, unsigned long obj) + { +- /* +- * lsb of @obj represents handle lock while other bits +- * represent object value the handle is pointing so +- * updating shouldn't do store tearing. 
+- */ +- WRITE_ONCE(*(unsigned long *)handle, obj); ++ *(unsigned long *)handle = obj; + } + + /* zpool driver */ +@@ -880,26 +883,6 @@ static bool obj_allocated(struct page *p + return true; + } + +-static inline int testpin_tag(unsigned long handle) +-{ +- return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); +-} +- +-static inline int trypin_tag(unsigned long handle) +-{ +- return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); +-} +- +-static void pin_tag(unsigned long handle) __acquires(bitlock) +-{ +- bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); +-} +- +-static void unpin_tag(unsigned long handle) __releases(bitlock) +-{ +- bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); +-} +- + static void reset_page(struct page *page) + { + __ClearPageMovable(page); +@@ -968,6 +951,11 @@ static void free_zspage(struct zs_pool * + VM_BUG_ON(get_zspage_inuse(zspage)); + VM_BUG_ON(list_empty(&zspage->list)); + ++ /* ++ * Since zs_free couldn't be sleepable, this function cannot call ++ * lock_page. The page locks trylock_zspage got will be released ++ * by __free_zspage. ++ */ + if (!trylock_zspage(zspage)) { + kick_deferred_free(pool); + return; +@@ -1263,15 +1251,20 @@ void *zs_map_object(struct zs_pool *pool + */ + BUG_ON(in_interrupt()); + +- /* From now on, migration cannot move the object */ +- pin_tag(handle); +- ++ /* It guarantees it can get zspage from handle safely */ ++ read_lock(&pool->migrate_lock); + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); + zspage = get_zspage(page); + +- /* migration cannot move any subpage in this zspage */ ++ /* ++ * migration cannot move any zpages in this zspage. Here, class->lock ++ * is too heavy since callers would take some time until they calls ++ * zs_unmap_object API so delegate the locking from class to zspage ++ * which is smaller granularity. ++ */ + migrate_read_lock(zspage); ++ read_unlock(&pool->migrate_lock); + + class = zspage_class(pool, zspage); + off = (class->size * obj_idx) & ~PAGE_MASK; +@@ -1330,7 +1323,6 @@ void zs_unmap_object(struct zs_pool *poo + put_cpu_var(zs_map_area); + + migrate_read_unlock(zspage); +- unpin_tag(handle); + } + EXPORT_SYMBOL_GPL(zs_unmap_object); + +@@ -1424,6 +1416,7 @@ unsigned long zs_malloc(struct zs_pool * + size += ZS_HANDLE_SIZE; + class = pool->size_class[get_size_class_index(size)]; + ++ /* class->lock effectively protects the zpage migration */ + spin_lock(&class->lock); + zspage = find_get_zspage(class); + if (likely(zspage)) { +@@ -1501,30 +1494,27 @@ void zs_free(struct zs_pool *pool, unsig + if (unlikely(!handle)) + return; + +- pin_tag(handle); ++ /* ++ * The pool->migrate_lock protects the race with zpage's migration ++ * so it's safe to get the page from handle. 
++ */ ++ read_lock(&pool->migrate_lock); + obj = handle_to_obj(handle); + obj_to_page(obj, &f_page); + zspage = get_zspage(f_page); +- +- migrate_read_lock(zspage); + class = zspage_class(pool, zspage); +- + spin_lock(&class->lock); ++ read_unlock(&pool->migrate_lock); ++ + obj_free(class->size, obj); + class_stat_dec(class, OBJ_USED, 1); + fullness = fix_fullness_group(class, zspage); +- if (fullness != ZS_EMPTY) { +- migrate_read_unlock(zspage); ++ if (fullness != ZS_EMPTY) + goto out; +- } + +- migrate_read_unlock(zspage); +- /* If zspage is isolated, zs_page_putback will free the zspage */ + free_zspage(pool, class, zspage); + out: +- + spin_unlock(&class->lock); +- unpin_tag(handle); + cache_free_handle(pool, handle); + } + EXPORT_SYMBOL_GPL(zs_free); +@@ -1608,11 +1598,8 @@ static unsigned long find_alloced_obj(st + offset += class->size * index; + + while (offset < PAGE_SIZE) { +- if (obj_allocated(page, addr + offset, &handle)) { +- if (trypin_tag(handle)) +- break; +- handle = 0; +- } ++ if (obj_allocated(page, addr + offset, &handle)) ++ break; + + offset += class->size; + index++; +@@ -1658,7 +1645,6 @@ static int migrate_zspage(struct zs_pool + + /* Stop if there is no more space */ + if (zspage_full(class, get_zspage(d_page))) { +- unpin_tag(handle); + ret = -ENOMEM; + break; + } +@@ -1667,15 +1653,7 @@ static int migrate_zspage(struct zs_pool + free_obj = obj_malloc(pool, get_zspage(d_page), handle); + zs_object_copy(class, free_obj, used_obj); + obj_idx++; +- /* +- * record_obj updates handle's value to free_obj and it will +- * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which +- * breaks synchronization using pin_tag(e,g, zs_free) so +- * let's keep the lock bit. +- */ +- free_obj |= BIT(HANDLE_PIN_BIT); + record_obj(handle, free_obj); +- unpin_tag(handle); + obj_free(class->size, used_obj); + } + +@@ -1789,6 +1767,11 @@ static void migrate_write_lock(struct zs + write_lock(&zspage->lock); + } + ++static void migrate_write_lock_nested(struct zspage *zspage) ++{ ++ write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING); ++} ++ + static void migrate_write_unlock(struct zspage *zspage) + { + write_unlock(&zspage->lock); +@@ -1856,11 +1839,10 @@ static int zs_page_migrate(struct addres + struct zspage *zspage; + struct page *dummy; + void *s_addr, *d_addr, *addr; +- int offset, pos; ++ int offset; + unsigned long handle; + unsigned long old_obj, new_obj; + unsigned int obj_idx; +- int ret = -EAGAIN; + + /* + * We cannot support the _NO_COPY case here, because copy needs to +@@ -1873,32 +1855,25 @@ static int zs_page_migrate(struct addres + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + +- zspage = get_zspage(page); +- +- /* Concurrent compactor cannot migrate any subpage in zspage */ +- migrate_write_lock(zspage); + pool = mapping->private_data; ++ ++ /* ++ * The pool migrate_lock protects the race between zpage migration ++ * and zs_free. ++ */ ++ write_lock(&pool->migrate_lock); ++ zspage = get_zspage(page); + class = zspage_class(pool, zspage); +- offset = get_first_obj_offset(page); + ++ /* ++ * the class lock protects zpage alloc/free in the zspage. ++ */ + spin_lock(&class->lock); +- if (!get_zspage_inuse(zspage)) { +- /* +- * Set "offset" to end of the page so that every loops +- * skips unnecessary object scanning. 
+- */ +- offset = PAGE_SIZE; +- } ++ /* the migrate_write_lock protects zpage access via zs_map_object */ ++ migrate_write_lock(zspage); + +- pos = offset; ++ offset = get_first_obj_offset(page); + s_addr = kmap_atomic(page); +- while (pos < PAGE_SIZE) { +- if (obj_allocated(page, s_addr + pos, &handle)) { +- if (!trypin_tag(handle)) +- goto unpin_objects; +- } +- pos += class->size; +- } + + /* + * Here, any user cannot access all objects in the zspage so let's move. +@@ -1907,25 +1882,30 @@ static int zs_page_migrate(struct addres + memcpy(d_addr, s_addr, PAGE_SIZE); + kunmap_atomic(d_addr); + +- for (addr = s_addr + offset; addr < s_addr + pos; ++ for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; + addr += class->size) { + if (obj_allocated(page, addr, &handle)) { +- BUG_ON(!testpin_tag(handle)); + + old_obj = handle_to_obj(handle); + obj_to_location(old_obj, &dummy, &obj_idx); + new_obj = (unsigned long)location_to_obj(newpage, + obj_idx); +- new_obj |= BIT(HANDLE_PIN_BIT); + record_obj(handle, new_obj); + } + } ++ kunmap_atomic(s_addr); + + replace_sub_page(class, zspage, newpage, page); +- get_page(newpage); +- ++ /* ++ * Since we complete the data copy and set up new zspage structure, ++ * it's okay to release migration_lock. ++ */ ++ write_unlock(&pool->migrate_lock); ++ spin_unlock(&class->lock); + dec_zspage_isolation(zspage); ++ migrate_write_unlock(zspage); + ++ get_page(newpage); + if (page_zone(newpage) != page_zone(page)) { + dec_zone_page_state(page, NR_ZSPAGES); + inc_zone_page_state(newpage, NR_ZSPAGES); +@@ -1933,22 +1913,8 @@ static int zs_page_migrate(struct addres + + reset_page(page); + put_page(page); +- page = newpage; +- +- ret = MIGRATEPAGE_SUCCESS; +-unpin_objects: +- for (addr = s_addr + offset; addr < s_addr + pos; +- addr += class->size) { +- if (obj_allocated(page, addr, &handle)) { +- BUG_ON(!testpin_tag(handle)); +- unpin_tag(handle); +- } +- } +- kunmap_atomic(s_addr); +- spin_unlock(&class->lock); +- migrate_write_unlock(zspage); + +- return ret; ++ return MIGRATEPAGE_SUCCESS; + } + + static void zs_page_putback(struct page *page) +@@ -2077,8 +2043,13 @@ static unsigned long __zs_compact(struct + struct zspage *dst_zspage = NULL; + unsigned long pages_freed = 0; + ++ /* protect the race between zpage migration and zs_free */ ++ write_lock(&pool->migrate_lock); ++ /* protect zpage allocation/free */ + spin_lock(&class->lock); + while ((src_zspage = isolate_zspage(class, true))) { ++ /* protect someone accessing the zspage(i.e., zs_map_object) */ ++ migrate_write_lock(src_zspage); + + if (!zs_can_compact(class)) + break; +@@ -2087,6 +2058,8 @@ static unsigned long __zs_compact(struct + cc.s_page = get_first_page(src_zspage); + + while ((dst_zspage = isolate_zspage(class, false))) { ++ migrate_write_lock_nested(dst_zspage); ++ + cc.d_page = get_first_page(dst_zspage); + /* + * If there is no more space in dst_page, resched +@@ -2096,6 +2069,10 @@ static unsigned long __zs_compact(struct + break; + + putback_zspage(class, dst_zspage); ++ migrate_write_unlock(dst_zspage); ++ dst_zspage = NULL; ++ if (rwlock_is_contended(&pool->migrate_lock)) ++ break; + } + + /* Stop if we couldn't find slot */ +@@ -2103,19 +2080,28 @@ static unsigned long __zs_compact(struct + break; + + putback_zspage(class, dst_zspage); ++ migrate_write_unlock(dst_zspage); ++ + if (putback_zspage(class, src_zspage) == ZS_EMPTY) { ++ migrate_write_unlock(src_zspage); + free_zspage(pool, class, src_zspage); + pages_freed += class->pages_per_zspage; +- } ++ } else ++ 
migrate_write_unlock(src_zspage); + spin_unlock(&class->lock); ++ write_unlock(&pool->migrate_lock); + cond_resched(); ++ write_lock(&pool->migrate_lock); + spin_lock(&class->lock); + } + +- if (src_zspage) ++ if (src_zspage) { + putback_zspage(class, src_zspage); ++ migrate_write_unlock(src_zspage); ++ } + + spin_unlock(&class->lock); ++ write_unlock(&pool->migrate_lock); + + return pages_freed; + } +@@ -2221,6 +2207,7 @@ struct zs_pool *zs_create_pool(const cha + return NULL; + + init_deferred_free(pool); ++ rwlock_init(&pool->migrate_lock); + + pool->name = kstrdup(name, GFP_KERNEL); + if (!pool->name) diff --git a/patches/0009_zsmalloc_replace_get_cpu_var_with_local_lock.patch b/patches/0009_zsmalloc_replace_get_cpu_var_with_local_lock.patch new file mode 100644 index 000000000000..f90bd47c90f2 --- /dev/null +++ b/patches/0009_zsmalloc_replace_get_cpu_var_with_local_lock.patch @@ -0,0 +1,70 @@ +From: Mike Galbraith <umgwanakikbuti@gmail.com> +Subject: zsmalloc: replace get_cpu_var with local_lock +Date: Mon, 15 Nov 2021 10:59:09 -0800 + +The usage of get_cpu_var() in zs_map_object() is problematic because +it disables preemption and makes it impossible to acquire any sleeping +lock on PREEMPT_RT such as a spinlock_t. +Replace the get_cpu_var() usage with a local_lock_t which is embedded +struct mapping_area. It ensures that the access the struct is +synchronized against all users on the same CPU. + +Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +[minchan: remove the bit_spin_lock part and change the title] +Signed-off-by: Minchan Kim <minchan@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Link: https://lore.kernel.org/r/20211115185909.3949505-10-minchan@kernel.org +--- + mm/zsmalloc.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -65,6 +65,7 @@ + #include <linux/wait.h> + #include <linux/pagemap.h> + #include <linux/fs.h> ++#include <linux/local_lock.h> + + #define ZSPAGE_MAGIC 0x58 + +@@ -276,6 +277,7 @@ struct zspage { + }; + + struct mapping_area { ++ local_lock_t lock; + char *vm_buf; /* copy buffer for objects that span pages */ + char *vm_addr; /* address of kmap_atomic()'ed pages */ + enum zs_mapmode vm_mm; /* mapping mode */ +@@ -451,7 +453,9 @@ MODULE_ALIAS("zpool-zsmalloc"); + #endif /* CONFIG_ZPOOL */ + + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ +-static DEFINE_PER_CPU(struct mapping_area, zs_map_area); ++static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { ++ .lock = INIT_LOCAL_LOCK(lock), ++}; + + static __maybe_unused int is_first_page(struct page *page) + { +@@ -1269,7 +1273,8 @@ void *zs_map_object(struct zs_pool *pool + class = zspage_class(pool, zspage); + off = (class->size * obj_idx) & ~PAGE_MASK; + +- area = &get_cpu_var(zs_map_area); ++ local_lock(&zs_map_area.lock); ++ area = this_cpu_ptr(&zs_map_area); + area->vm_mm = mm; + if (off + class->size <= PAGE_SIZE) { + /* this object is contained entirely within a page */ +@@ -1320,7 +1325,7 @@ void zs_unmap_object(struct zs_pool *poo + + __zs_unmap_object(area, pages, off, class->size); + } +- put_cpu_var(zs_map_area); ++ local_unlock(&zs_map_area.lock); + + migrate_read_unlock(zspage); + } diff --git a/patches/Add_localversion_for_-RT_release.patch b/patches/Add_localversion_for_-RT_release.patch index 53b69a97ca19..41fc0b58e69e 100644 --- 
a/patches/Add_localversion_for_-RT_release.patch +++ b/patches/Add_localversion_for_-RT_release.patch @@ -15,4 +15,4 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- /dev/null +++ b/localversion-rt @@ -0,0 +1 @@ -+-rt3 ++-rt4 diff --git a/patches/KVM__arm_arm64__downgrade_preempt_disabled_region_to_migrate_disable.patch b/patches/KVM__arm_arm64__downgrade_preempt_disabled_region_to_migrate_disable.patch index b3f09f2a67d5..e32167ddd734 100644 --- a/patches/KVM__arm_arm64__downgrade_preempt_disabled_region_to_migrate_disable.patch +++ b/patches/KVM__arm_arm64__downgrade_preempt_disabled_region_to_migrate_disable.patch @@ -27,7 +27,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c -@@ -813,7 +813,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_v +@@ -820,7 +820,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_v * involves poking the GIC, which must be done in a * non-preemptible context. */ @@ -36,7 +36,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> kvm_pmu_flush_hwstate(vcpu); -@@ -837,7 +837,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_v +@@ -844,7 +844,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_v kvm_timer_sync_user(vcpu); kvm_vgic_sync_hwstate(vcpu); local_irq_enable(); @@ -45,7 +45,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> continue; } -@@ -909,7 +909,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_v +@@ -916,7 +916,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_v /* Exit types that need handling before we can be preempted */ handle_exit_early(vcpu, ret); diff --git a/patches/console__add_write_atomic_interface.patch b/patches/console__add_write_atomic_interface.patch index ee5b697b9199..ec5663190789 100644 --- a/patches/console__add_write_atomic_interface.patch +++ b/patches/console__add_write_atomic_interface.patch @@ -144,7 +144,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> #endif /* _KGDB_H_ */ --- a/include/linux/printk.h +++ b/include/linux/printk.h -@@ -280,10 +280,18 @@ static inline void dump_stack(void) +@@ -284,10 +284,18 @@ static inline void printk_trigger_flush( extern int __printk_cpu_trylock(void); extern void __printk_wait_on_cpu_lock(void); extern void __printk_cpu_unlock(void); @@ -163,7 +163,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> #endif /* CONFIG_SMP */ /** -@@ -315,6 +323,21 @@ extern void __printk_cpu_unlock(void); +@@ -319,6 +327,21 @@ extern void __printk_cpu_unlock(void); local_irq_restore(flags); \ } while (0) @@ -259,7 +259,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> -@@ -3583,6 +3584,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); +@@ -3588,6 +3589,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #ifdef CONFIG_SMP static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); static atomic_t printk_cpulock_nested = ATOMIC_INIT(0); @@ -267,7 +267,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> /** * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant -@@ -3662,6 +3664,9 @@ EXPORT_SYMBOL(__printk_cpu_trylock); +@@ -3667,6 +3669,9 @@ EXPORT_SYMBOL(__printk_cpu_trylock); */ void __printk_cpu_unlock(void) { @@ -277,7 +277,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> if (atomic_read(&printk_cpulock_nested)) { atomic_dec(&printk_cpulock_nested); return; -@@ -3672,6 +3677,12 @@ void __printk_cpu_unlock(void) +@@ -3677,6 +3682,12 @@ void __printk_cpu_unlock(void) * LMM(__printk_cpu_unlock:A) */ @@ -290,7 +290,7 @@ Signed-off-by: 
Thomas Gleixner <tglx@linutronix.de> /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs -@@ -3692,6 +3703,21 @@ void __printk_cpu_unlock(void) +@@ -3697,6 +3708,21 @@ void __printk_cpu_unlock(void) */ atomic_set_release(&printk_cpulock_owner, -1); /* LMM(__printk_cpu_unlock:B) */ diff --git a/patches/mm-zsmalloc-Replace-bit-spinlock-and-get_cpu_var-usa.patch b/patches/mm-zsmalloc-Replace-bit-spinlock-and-get_cpu_var-usa.patch deleted file mode 100644 index 8a4c80446b7f..000000000000 --- a/patches/mm-zsmalloc-Replace-bit-spinlock-and-get_cpu_var-usa.patch +++ /dev/null @@ -1,244 +0,0 @@ -From: Mike Galbraith <umgwanakikbuti@gmail.com> -Date: Tue, 28 Sep 2021 09:38:47 +0200 -Subject: [PATCH] mm/zsmalloc: Replace bit spinlock and get_cpu_var() usage. - -For efficiency reasons, zsmalloc is using a slim `handle'. The value is -the address of a memory allocation of 4 or 8 bytes depending on the size -of the long data type. The lowest bit in that allocated memory is used -as a bit spin lock. -The usage of the bit spin lock is problematic because with the bit spin -lock held zsmalloc acquires a rwlock_t and spinlock_t which are both -sleeping locks on PREEMPT_RT and therefore must not be acquired with -disabled preemption. - -Extend the handle to struct zsmalloc_handle which holds the old handle as -addr and a spinlock_t which replaces the bit spinlock. Replace all the -wrapper functions accordingly. - -The usage of get_cpu_var() in zs_map_object() is problematic because -it disables preemption and makes it impossible to acquire any sleeping -lock on PREEMPT_RT such as a spinlock_t. -Replace the get_cpu_var() usage with a local_lock_t which is embedded -struct mapping_area. It ensures that the access the struct is -synchronized against all users on the same CPU. - -This survived LTP testing. - -Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> -Signed-off-by: Thomas Gleixner <tglx@linutronix.de> -[bigeasy: replace the bitspin_lock() with a mutex, get_locked_var() and - patch description. Mike then fixed the size magic and made handle lock - spinlock_t.] -Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> ---- - mm/Kconfig | 3 -- - mm/zsmalloc.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- - 2 files changed, 79 insertions(+), 8 deletions(-) - ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -643,7 +643,6 @@ config ZSWAP_ZPOOL_DEFAULT_Z3FOLD - - config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC - bool "zsmalloc" -- depends on !PREEMPT_RT - select ZSMALLOC - help - Use the zsmalloc allocator as the default allocator. -@@ -694,7 +693,7 @@ config Z3FOLD - - config ZSMALLOC - tristate "Memory allocator for compressed pages" -- depends on MMU && !PREEMPT_RT -+ depends on MMU - help - zsmalloc is a slab-based memory allocator designed to store - compressed RAM pages. 
zsmalloc uses virtual memory mapping ---- a/mm/zsmalloc.c -+++ b/mm/zsmalloc.c -@@ -57,6 +57,7 @@ - #include <linux/wait.h> - #include <linux/pagemap.h> - #include <linux/fs.h> -+#include <linux/local_lock.h> - - #define ZSPAGE_MAGIC 0x58 - -@@ -77,6 +78,20 @@ - - #define ZS_HANDLE_SIZE (sizeof(unsigned long)) - -+#ifdef CONFIG_PREEMPT_RT -+ -+struct zsmalloc_handle { -+ unsigned long addr; -+ spinlock_t lock; -+}; -+ -+#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle)) -+ -+#else -+ -+#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long)) -+#endif -+ - /* - * Object location (<PFN>, <obj_idx>) is encoded as - * a single (unsigned long) handle value. -@@ -293,6 +308,7 @@ struct zspage { - }; - - struct mapping_area { -+ local_lock_t lock; - char *vm_buf; /* copy buffer for objects that span pages */ - char *vm_addr; /* address of kmap_atomic()'ed pages */ - enum zs_mapmode vm_mm; /* mapping mode */ -@@ -322,7 +338,7 @@ static void SetZsPageMovable(struct zs_p - - static int create_cache(struct zs_pool *pool) - { -- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, -+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE, - 0, 0, NULL); - if (!pool->handle_cachep) - return 1; -@@ -346,10 +362,27 @@ static void destroy_cache(struct zs_pool - - static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) - { -- return (unsigned long)kmem_cache_alloc(pool->handle_cachep, -- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); -+ void *p; -+ -+ p = kmem_cache_alloc(pool->handle_cachep, -+ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); -+#ifdef CONFIG_PREEMPT_RT -+ if (p) { -+ struct zsmalloc_handle *zh = p; -+ -+ spin_lock_init(&zh->lock); -+ } -+#endif -+ return (unsigned long)p; - } - -+#ifdef CONFIG_PREEMPT_RT -+static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle) -+{ -+ return (void *)(handle & ~((1 << OBJ_TAG_BITS) - 1)); -+} -+#endif -+ - static void cache_free_handle(struct zs_pool *pool, unsigned long handle) - { - kmem_cache_free(pool->handle_cachep, (void *)handle); -@@ -368,12 +401,18 @@ static void cache_free_zspage(struct zs_ - - static void record_obj(unsigned long handle, unsigned long obj) - { -+#ifdef CONFIG_PREEMPT_RT -+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); -+ -+ WRITE_ONCE(zh->addr, obj); -+#else - /* - * lsb of @obj represents handle lock while other bits - * represent object value the handle is pointing so - * updating shouldn't do store tearing. 
- */ - WRITE_ONCE(*(unsigned long *)handle, obj); -+#endif - } - - /* zpool driver */ -@@ -455,7 +494,9 @@ MODULE_ALIAS("zpool-zsmalloc"); - #endif /* CONFIG_ZPOOL */ - - /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ --static DEFINE_PER_CPU(struct mapping_area, zs_map_area); -+static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { -+ .lock = INIT_LOCAL_LOCK(lock), -+}; - - static bool is_zspage_isolated(struct zspage *zspage) - { -@@ -862,7 +903,13 @@ static unsigned long location_to_obj(str - - static unsigned long handle_to_obj(unsigned long handle) - { -+#ifdef CONFIG_PREEMPT_RT -+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); -+ -+ return zh->addr; -+#else - return *(unsigned long *)handle; -+#endif - } - - static unsigned long obj_to_head(struct page *page, void *obj) -@@ -876,22 +923,46 @@ static unsigned long obj_to_head(struct - - static inline int testpin_tag(unsigned long handle) - { -+#ifdef CONFIG_PREEMPT_RT -+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); -+ -+ return spin_is_locked(&zh->lock); -+#else - return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); -+#endif - } - - static inline int trypin_tag(unsigned long handle) - { -+#ifdef CONFIG_PREEMPT_RT -+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); -+ -+ return spin_trylock(&zh->lock); -+#else - return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); -+#endif - } - - static void pin_tag(unsigned long handle) __acquires(bitlock) - { -+#ifdef CONFIG_PREEMPT_RT -+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); -+ -+ return spin_lock(&zh->lock); -+#else - bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); -+#endif - } - - static void unpin_tag(unsigned long handle) __releases(bitlock) - { -+#ifdef CONFIG_PREEMPT_RT -+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); -+ -+ return spin_unlock(&zh->lock); -+#else - bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); -+#endif - } - - static void reset_page(struct page *page) -@@ -1274,7 +1345,8 @@ void *zs_map_object(struct zs_pool *pool - class = pool->size_class[class_idx]; - off = (class->size * obj_idx) & ~PAGE_MASK; - -- area = &get_cpu_var(zs_map_area); -+ local_lock(&zs_map_area.lock); -+ area = this_cpu_ptr(&zs_map_area); - area->vm_mm = mm; - if (off + class->size <= PAGE_SIZE) { - /* this object is contained entirely within a page */ -@@ -1328,7 +1400,7 @@ void zs_unmap_object(struct zs_pool *poo - - __zs_unmap_object(area, pages, off, class->size); - } -- put_cpu_var(zs_map_area); -+ local_unlock(&zs_map_area.lock); - - migrate_read_unlock(zspage); - unpin_tag(handle); diff --git a/patches/printk__Enhance_the_condition_check_of_msleep_in_pr_flush.patch b/patches/printk__Enhance_the_condition_check_of_msleep_in_pr_flush.patch index 18a51f7130f9..0fa777df022d 100644 --- a/patches/printk__Enhance_the_condition_check_of_msleep_in_pr_flush.patch +++ b/patches/printk__Enhance_the_condition_check_of_msleep_in_pr_flush.patch @@ -27,7 +27,7 @@ Link: https://lore.kernel.org/lkml/20210719022649.3444072-1-chao.qin@intel.com --- --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c -@@ -3650,7 +3650,9 @@ bool pr_flush(int timeout_ms, bool reset +@@ -3655,7 +3655,9 @@ bool pr_flush(int timeout_ms, bool reset u64 diff; u64 seq; diff --git a/patches/printk__add_pr_flush.patch b/patches/printk__add_pr_flush.patch index d83ac06a2099..ebf6398747e7 100644 --- a/patches/printk__add_pr_flush.patch +++ b/patches/printk__add_pr_flush.patch @@ -33,7 +33,7 @@ 
Signed-off-by: Thomas Gleixner <tglx@linutronix.de> /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use -@@ -201,6 +203,11 @@ int _printk(const char *s, ...) +@@ -202,6 +204,11 @@ int _printk(const char *s, ...) return 0; } @@ -111,7 +111,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> /* --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c -@@ -3286,6 +3286,12 @@ void kmsg_dump(enum kmsg_dump_reason rea +@@ -3291,6 +3291,12 @@ void kmsg_dump(enum kmsg_dump_reason rea sync_mode = true; pr_info("enabled sync mode\n"); } @@ -124,7 +124,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> } rcu_read_lock(); -@@ -3607,3 +3613,78 @@ bool kgdb_roundup_delay(unsigned int cpu +@@ -3612,3 +3618,78 @@ bool kgdb_roundup_delay(unsigned int cpu } EXPORT_SYMBOL(kgdb_roundup_delay); #endif /* CONFIG_SMP */ diff --git a/patches/printk__introduce_kernel_sync_mode.patch b/patches/printk__introduce_kernel_sync_mode.patch index 6585490b66c5..c4a6812c190c 100644 --- a/patches/printk__introduce_kernel_sync_mode.patch +++ b/patches/printk__introduce_kernel_sync_mode.patch @@ -353,7 +353,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); -@@ -3412,6 +3558,18 @@ void kmsg_dump(enum kmsg_dump_reason rea +@@ -3417,6 +3563,18 @@ void kmsg_dump(enum kmsg_dump_reason rea { struct kmsg_dumper *dumper; diff --git a/patches/printk__rename_printk_cpulock_API_and_always_disable_interrupts.patch b/patches/printk__rename_printk_cpulock_API_and_always_disable_interrupts.patch index 49f49d009869..a03c43bd4afa 100644 --- a/patches/printk__rename_printk_cpulock_API_and_always_disable_interrupts.patch +++ b/patches/printk__rename_printk_cpulock_API_and_always_disable_interrupts.patch @@ -26,7 +26,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- --- a/include/linux/printk.h +++ b/include/linux/printk.h -@@ -280,17 +280,22 @@ static inline void dump_stack(void) +@@ -284,17 +284,22 @@ static inline void printk_trigger_flush( extern int __printk_cpu_trylock(void); extern void __printk_wait_on_cpu_lock(void); extern void __printk_cpu_unlock(void); @@ -53,7 +53,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> for (;;) { \ local_irq_save(flags); \ if (__printk_cpu_trylock()) \ -@@ -300,22 +305,15 @@ extern void __printk_cpu_unlock(void); +@@ -304,22 +309,15 @@ extern void __printk_cpu_unlock(void); } /** @@ -97,7 +97,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c -@@ -93,7 +93,7 @@ bool nmi_cpu_backtrace(struct pt_regs *r +@@ -99,7 +99,7 @@ bool nmi_cpu_backtrace(struct pt_regs *r * Allow nested NMI backtraces while serializing * against other CPUs. 
*/ @@ -106,7 +106,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> if (!READ_ONCE(backtrace_idle) && regs && cpu_in_idle(instruction_pointer(regs))) { pr_warn("NMI backtrace for cpu %d skipped: idling at %pS\n", cpu, (void *)instruction_pointer(regs)); -@@ -104,7 +104,7 @@ bool nmi_cpu_backtrace(struct pt_regs *r +@@ -110,7 +110,7 @@ bool nmi_cpu_backtrace(struct pt_regs *r else dump_stack(); } diff --git a/patches/sched__Add_support_for_lazy_preemption.patch b/patches/sched__Add_support_for_lazy_preemption.patch index dbf76920b1a9..f5112bc0cb1d 100644 --- a/patches/sched__Add_support_for_lazy_preemption.patch +++ b/patches/sched__Add_support_for_lazy_preemption.patch @@ -552,7 +552,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } -@@ -4170,15 +4176,17 @@ unsigned long trace_total_entries(struct +@@ -4182,15 +4188,17 @@ unsigned long trace_total_entries(struct static void print_lat_help_header(struct seq_file *m) { @@ -579,7 +579,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> } static void print_event_info(struct array_buffer *buf, struct seq_file *m) -@@ -4212,14 +4220,16 @@ static void print_func_help_header_irq(s +@@ -4224,14 +4232,16 @@ static void print_func_help_header_irq(s print_event_info(buf, m); diff --git a/patches/series b/patches/series index ff3c6f427b73..2a0ffdb3ef3e 100644 --- a/patches/series +++ b/patches/series @@ -172,11 +172,22 @@ x86__Enable_RT_also_on_32bit.patch ########################################################################### virt-acrn-Remove-unsued-acrn_irqfds_mutex.patch tpm_tis__fix_stall_after_iowrites.patch -mm-zsmalloc-Replace-bit-spinlock-and-get_cpu_var-usa.patch drivers_block_zram__Replace_bit_spinlocks_with_rtmutex_for_-rt.patch generic-softirq-Disable-softirq-stacks-on-PREEMPT_RT.patch softirq-Disable-softirq-stacks-on-PREEMPT_RT.patch +# zsmalloc +0001_zsmalloc_introduce_some_helper_functions.patch +0002_zsmalloc_rename_zs_stat_type_to_class_stat_type.patch +0003_zsmalloc_decouple_class_actions_from_zspage_works.patch +0004_zsmalloc_introduce_obj_allocated.patch +0005_zsmalloc_move_huge_compressed_obj_from_page_to_zspage.patch +0006_zsmalloc_remove_zspage_isolation_for_migration.patch +0007_locking_rwlocks_introduce_write_lock_nested.patch +0008_zsmalloc_replace_per_zpage_lock_with_pool_migrate_lock.patch +0009_zsmalloc_replace_get_cpu_var_with_local_lock.patch +zsmalloc-enable.patch + ########################################################################### # Lazy preemption ########################################################################### diff --git a/patches/signal__Revert_ptrace_preempt_magic.patch b/patches/signal__Revert_ptrace_preempt_magic.patch index 266271bd329e..74fa095b3fca 100644 --- a/patches/signal__Revert_ptrace_preempt_magic.patch +++ b/patches/signal__Revert_ptrace_preempt_magic.patch @@ -17,7 +17,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- --- a/kernel/signal.c +++ b/kernel/signal.c -@@ -2249,16 +2249,8 @@ static void ptrace_stop(int exit_code, i +@@ -2271,16 +2271,8 @@ static void ptrace_stop(int exit_code, i if (gstop_done && ptrace_reparented(current)) do_notify_parent_cldstop(current, false, why); diff --git a/patches/signal_x86__Delay_calling_signals_in_atomic.patch b/patches/signal_x86__Delay_calling_signals_in_atomic.patch index 4abe2b5aeaa6..81f9b8fb5fa7 100644 --- a/patches/signal_x86__Delay_calling_signals_in_atomic.patch +++ b/patches/signal_x86__Delay_calling_signals_in_atomic.patch @@ -96,7 +96,7 
@@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- a/kernel/signal.c +++ b/kernel/signal.c -@@ -1317,6 +1317,34 @@ force_sig_info_to_task(struct kernel_sig +@@ -1324,6 +1324,34 @@ force_sig_info_to_task(struct kernel_sig struct k_sigaction *action; int sig = info->si_signo; diff --git a/patches/zsmalloc-enable.patch b/patches/zsmalloc-enable.patch new file mode 100644 index 000000000000..e1b1fef3ac9c --- /dev/null +++ b/patches/zsmalloc-enable.patch @@ -0,0 +1,28 @@ +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 28 Sep 2021 09:38:47 +0200 +Subject: [PATCH] mm/zsmalloc: Enable again. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/Kconfig | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -643,7 +643,6 @@ config ZSWAP_ZPOOL_DEFAULT_Z3FOLD + + config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC + bool "zsmalloc" +- depends on !PREEMPT_RT + select ZSMALLOC + help + Use the zsmalloc allocator as the default allocator. +@@ -694,7 +693,7 @@ config Z3FOLD + + config ZSMALLOC + tristate "Memory allocator for compressed pages" +- depends on MMU && !PREEMPT_RT ++ depends on MMU + help + zsmalloc is a slab-based memory allocator designed to store + compressed RAM pages. zsmalloc uses virtual memory mapping |
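
A note for readers skimming the queue: the PREEMPT_RT-relevant core of the
zsmalloc series is the conversion of the per-CPU mapping area from
get_cpu_var()/put_cpu_var() to a local_lock_t, as the hunks of
0009_zsmalloc_replace_get_cpu_var_with_local_lock.patch above show. The
condensed sketch below is illustrative only, not the literal upstream code;
the struct layout and the zs_map_area definition follow mm/zsmalloc.c, while
the helper names map_object_sketch()/unmap_object_sketch() are invented for
the example.

/*
 * Illustrative sketch: a local_lock_t embedded in the per-CPU struct
 * replaces get_cpu_var()/put_cpu_var(), so the mapping section is still
 * serialized per CPU but no longer relies on disabled preemption.
 */
#include <linux/local_lock.h>
#include <linux/percpu.h>

struct mapping_area {
	local_lock_t lock;
	char *vm_buf;	/* copy buffer for objects that span pages */
	char *vm_addr;	/* address of kmap_atomic()'ed pages */
};

static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void *map_object_sketch(void)
{
	struct mapping_area *area;

	/* was: area = &get_cpu_var(zs_map_area); -- disables preemption */
	local_lock(&zs_map_area.lock);
	area = this_cpu_ptr(&zs_map_area);

	/* ... set up area->vm_addr / area->vm_buf for the object ... */
	return area->vm_addr;
}

static void unmap_object_sketch(void)
{
	/* was: put_cpu_var(zs_map_area); */
	local_unlock(&zs_map_area.lock);
}

On a non-RT configuration local_lock() still boils down to disabling
preemption, so behaviour should be unchanged there; on PREEMPT_RT it becomes
a per-CPU sleeping lock, which is what makes it legal to take spinlock_t
based locks inside the mapped section. The migration/compaction side relies
on the new pool->migrate_lock being taken ahead of class->lock and the
per-zspage migrate_write_lock(), in that order, as the 0008 hunks above show.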