From b2ca916ce392a9d4cea3489a3efb2b627b839eaf Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 16 Feb 2020 12:00:48 -0800 Subject: ACPI: NUMA: Up-level "map to online node" functionality The acpi_map_pxm_to_online_node() helper is used to find the closest online node to a given proximity domain. This is used to map devices in a proximity domain with no online memory or cpus to the closest online node and populate a device's 'numa_node' property. The numa_node property allows applications to be migrated "close" to a resource. In preparation for providing a generic facility to optionally map an address range to its closest online node, or the node the range would represent were it to be onlined (target_node), up-level the core of acpi_map_pxm_to_online_node() to a generic mm/numa helper. Cc: Michal Hocko Acked-by: Rafael J. Wysocki Reviewed-by: Ingo Molnar Signed-off-by: Dan Williams Link: https://lore.kernel.org/r/158188324802.894464.13128795207831894206.stgit@dwillia2-desk3.amr.corp.intel.com --- mm/mempolicy.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 977c641f78cf..756d6e5bb59f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -127,6 +127,36 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; +/** + * numa_map_to_online_node - Find closest online node + * @nid: Node id to start the search + * + * Lookup the next closest node by distance if @nid is not online. + */ +int numa_map_to_online_node(int node) +{ + int min_node; + + if (node == NUMA_NO_NODE) + node = 0; + + min_node = node; + if (!node_online(node)) { + int min_dist = INT_MAX, dist, n; + + for_each_online_node(n) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; + } + } + } + + return min_node; +} +EXPORT_SYMBOL_GPL(numa_map_to_online_node); + struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; -- cgit v1.2.1 From 4fcbe96e4d0bc4abe22ee10573ff663b9ebbcf17 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 16 Feb 2020 12:00:53 -0800 Subject: mm/numa: Skip NUMA_NO_NODE and online nodes in numa_map_to_online_node() Update numa_map_to_online_node() to stop falling back to numa node 0 when the input is NUMA_NO_NODE. Also, skip the lookup if @node is online. This makes the routine compatible with other arch node mapping routines. Reported-by: Aneesh Kumar K.V Reviewed-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/157401275716.43284.13185549705765009174.stgit@dwillia2-desk3.amr.corp.intel.com Reviewed-by: Ingo Molnar Signed-off-by: Dan Williams Link: https://lore.kernel.org/r/158188325316.894464.15650888748083329531.stgit@dwillia2-desk3.amr.corp.intel.com --- mm/mempolicy.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 756d6e5bb59f..19f7e71945a7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -135,21 +135,17 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES]; */ int numa_map_to_online_node(int node) { - int min_node; + int min_dist = INT_MAX, dist, n, min_node; - if (node == NUMA_NO_NODE) - node = 0; + if (node == NUMA_NO_NODE || node_online(node)) + return node; min_node = node; - if (!node_online(node)) { - int min_dist = INT_MAX, dist, n; - - for_each_online_node(n) { - dist = node_distance(node, n); - if (dist < min_dist) { - min_dist = dist; - min_node = n; - } + for_each_online_node(n) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; } } -- cgit v1.2.1 From 1e5d8e1e47afde23e3249aed25d7d124feff5c1c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 16 Feb 2020 12:01:04 -0800 Subject: x86/mm: Introduce CONFIG_NUMA_KEEP_MEMINFO Currently x86 numa_meminfo is marked __initdata in the CONFIG_MEMORY_HOTPLUG=n case. In support of a new facility to allow drivers to map reserved memory to a 'target_node' (phys_to_target_node()), add support for removing the __initdata designation for those users. Both memory hotplug and phys_to_target_node() users select CONFIG_NUMA_KEEP_MEMINFO to tell the arch to maintain its physical address to NUMA mapping infrastructure post init. Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Cc: Andrew Morton Cc: David Hildenbrand Cc: Michal Hocko Reviewed-by: Ingo Molnar Signed-off-by: Dan Williams Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/158188326422.894464.15742054998046628934.stgit@dwillia2-desk3.amr.corp.intel.com --- mm/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index ab80933be65f..328268473fec 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -139,6 +139,10 @@ config HAVE_FAST_GUP config ARCH_KEEP_MEMBLOCK bool +# Keep arch NUMA mapping infrastructure post-init. +config NUMA_KEEP_MEMINFO + bool + config MEMORY_ISOLATION bool @@ -154,6 +158,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG + select NUMA_KEEP_MEMINFO if NUMA config MEMORY_HOTPLUG_SPARSE def_bool y -- cgit v1.2.1 From 9ffc1d19fc4a6dfcfe06c91c2861ad6d44fdd92d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 30 Jan 2020 12:06:07 -0800 Subject: mm/memremap_pages: Introduce memremap_compat_align() The "sub-section memory hotplug" facility allows memremap_pages() users like libnvdimm to compensate for hardware platforms like x86 that have a section size larger than their hardware memory mapping granularity. The compensation that sub-section support affords is being tolerant of physical memory resources shifting by units smaller (64MiB on x86) than the memory-hotplug section size (128 MiB). Where the platform physical-memory mapping granularity is limited by the number and capability of address-decode-registers in the memory controller. While the sub-section support allows memremap_pages() to operate on sub-section (2MiB) granularity, the Power architecture may still require 16MiB alignment on "!radix_enabled()" platforms. In order for libnvdimm to be able to detect and manage this per-arch limitation, introduce memremap_compat_align() as a common minimum alignment across all driver-facing memory-mapping interfaces, and let Power override it to 16MiB in the "!radix_enabled()" case. The assumption / requirement for 16MiB to be a viable memremap_compat_align() value is that Power does not have platforms where its equivalent of address-decode-registers never hardware remaps a persistent memory resource on smaller than 16MiB boundaries. Note that I tried my best to not add a new Kconfig symbol, but header include entanglements defeated the #ifndef memremap_compat_align design pattern and the need to export it defeats the __weak design pattern for arch overrides. Based on an initial patch by Aneesh. Link: http://lore.kernel.org/r/CAPcyv4gBGNP95APYaBcsocEa50tQj9b5h__83vgngjq3ouGX_Q@mail.gmail.com Reported-by: Aneesh Kumar K.V Reported-by: Jeff Moyer Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Reviewed-by: Aneesh Kumar K.V Acked-by: Michael Ellerman (powerpc) Signed-off-by: Dan Williams --- mm/memremap.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'mm') diff --git a/mm/memremap.c b/mm/memremap.c index 09b5b7adc773..3e7afaf05639 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -14,6 +15,28 @@ static DEFINE_XARRAY(pgmap_array); +/* + * The memremap() and memremap_pages() interfaces are alternately used + * to map persistent memory namespaces. These interfaces place different + * constraints on the alignment and size of the mapping (namespace). + * memremap() can map individual PAGE_SIZE pages. memremap_pages() can + * only map subsections (2MB), and at least one architecture (PowerPC) + * the minimum mapping granularity of memremap_pages() is 16MB. + * + * The role of memremap_compat_align() is to communicate the minimum + * arch supported alignment of a namespace such that it can freely + * switch modes without violating the arch constraint. Namely, do not + * allow a namespace to be PAGE_SIZE aligned since that namespace may be + * reconfigured into a mode that requires SUBSECTION_SIZE alignment. + */ +#ifndef CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN +unsigned long memremap_compat_align(void) +{ + return SUBSECTION_SIZE; +} +EXPORT_SYMBOL_GPL(memremap_compat_align); +#endif + #ifdef CONFIG_DEV_PAGEMAP_OPS DEFINE_STATIC_KEY_FALSE(devmap_managed_key); EXPORT_SYMBOL(devmap_managed_key); -- cgit v1.2.1