mm/hmm/mirror: mirror process address space on device with HMM helpers

This is a heterogeneous memory management (HMM) process address space mirroring. In a nutshell this provide an API to mirror process address space on a device. This boils down to keeping CPU and device page table synchronize (we assume that both device and CPU are cache coherent like PCIe device can be). This patch provide a simple API for device driver to achieve address space mirroring thus avoiding each device driver to grow its own CPU page table walker and its own CPU page table synchronization mechanism. This is useful for NVidia GPU >= Pascal, Mellanox IB >= mlx5 and more hardware in the future. Link: http://lkml.kernel.org/r/20170405204026.3940-10-jglisse@redhat.com Signed-off-by: Jérôme Glisse <jglisse@redhat.com> Signed-off-by: Evgeny Baskakov <ebaskakov@nvidia.com> Signed-off-by: John Hubbard <jhubbard@nvidia.com> Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com> Signed-off-by: Sherry Cheung <SCheung@nvidia.com> Signed-off-by: Subhash Gutti <sgutti@nvidia.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Chris Metcalf <cmetcalf@mellanox.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: David Nellans <dnellans@nvidia.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Rich Felker <dalias@libc.org> Cc: Ross Zwisler <ross.zwisler@linux.intel.com> Cc: Russell King <linux@armlinux.org.uk> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Cc: "Figo.zhang" <figo1802@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
author: Jérôme Glisse <jglisse@redhat.com> 2017-04-08 14:02:27 +1000
committer: Stephen Rothwell <sfr@canb.auug.org.au> 2017-04-11 15:14:35 +1000
commit: 6b0bf7486427babdcfebc0f466ac5c5f2229f8a7 (patch)
tree: f4087bcc7aa04714d6f590c60847c2d5744d7caf /mm
parent: 55b3003579180185f1a173c0808432c63753b3fb (diff)
download: linux-next-6b0bf7486427babdcfebc0f466ac5c5f2229f8a7.tar.gz
2 files changed, 167 insertions, 15 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 195e5cc58823..0813dd4e35fc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -303,6 +303,18 @@ config HMM
 	  This is primarily useful for devices like GPU, for GPGPU compute workload,
 	  with APIs such as OpenCL or CUDA. See Documentation/vm/hmm.txt.
 
+config HMM_MIRROR
+	bool "HMM mirror CPU page table into a device page table"
+	depends on MMU && 64BIT
+	select HMM
+	select MMU_NOTIFIER
+	help
+	  Select HMM_MIRROR if you want to mirror range of the CPU page table of a
+	  process into a device page table. Here, mirror means "keep synchronized".
+	  Prerequisites: the device must provide the ability to write-protect its
+	  page tables (at PAGE_SIZE granularity), and must be able to recover from
+	  the resulting potential page faults.
+
 config PHYS_ADDR_T_64BIT
 	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
 
diff --git a/mm/hmm.c b/mm/hmm.c
index acadb49f897e..7ed4b4c12efb 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -21,14 +21,26 @@
 #include <linux/hmm.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/mmu_notifier.h>
+
+static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
+
 
 /*
  * struct hmm - HMM per mm struct
  *
  * @mm: mm struct this HMM struct is bound to
+ * @sequence: we track updates to the CPU page table with a sequence number
+ * @mirrors: list of mirrors for this mm
+ * @mmu_notifier: mmu notifier to track updates to CPU page table
+ * @mirrors_sem: read/write semaphore protecting the mirrors list
  */
 struct hmm {
 	struct mm_struct	*mm;
+	atomic_t		sequence;
+	struct list_head	mirrors;
+	struct mmu_notifier	mmu_notifier;
+	struct rw_semaphore	mirrors_sem;
 };
 
 /*
@@ -41,27 +53,48 @@ struct hmm {
  */
 static struct hmm *hmm_register(struct mm_struct *mm)
 {
-	if (!mm->hmm) {
-		struct hmm *hmm = NULL;
-
-		hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
-		if (!hmm)
-			return NULL;
-		hmm->mm = mm;
-
-		spin_lock(&mm->page_table_lock);
-		if (!mm->hmm)
-			mm->hmm = hmm;
-		else
-			kfree(hmm);
-		spin_unlock(&mm->page_table_lock);
-	}
+	struct hmm *hmm = READ_ONCE(mm->hmm);
+	bool cleanup = false;
 
 	/*
 	 * The hmm struct can only be freed once the mm_struct goes away,
 	 * hence we should always have pre-allocated an new hmm struct
 	 * above.
 	 */
+	if (hmm)
+		return hmm;
+
+	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
+	if (!hmm)
+		return NULL;
+	INIT_LIST_HEAD(&hmm->mirrors);
+	init_rwsem(&hmm->mirrors_sem);
+	atomic_set(&hmm->sequence, 0);
+	hmm->mmu_notifier.ops = NULL;
+	hmm->mm = mm;
+
+	/*
+	 * We should only get here if hold the mmap_sem in write mode ie on
+	 * registration of first mirror through hmm_mirror_register()
+	 */
+	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
+	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
+		kfree(hmm);
+		return NULL;
+	}
+
+	spin_lock(&mm->page_table_lock);
+	if (!mm->hmm)
+		mm->hmm = hmm;
+	else
+		cleanup = true;
+	spin_unlock(&mm->page_table_lock);
+
+	if (cleanup) {
+		mmu_notifier_unregister(&hmm->mmu_notifier, mm);
+		kfree(hmm);
+	}
+
 	return mm->hmm;
 }
 
@@ -69,3 +102,110 @@ void hmm_mm_destroy(struct mm_struct *mm)
 {
 	kfree(mm->hmm);
 }
+
+
+#if IS_ENABLED(CONFIG_HMM_MIRROR)
+static void hmm_invalidate_range(struct hmm *hmm,
+				 enum hmm_update_type action,
+				 unsigned long start,
+				 unsigned long end)
+{
+	struct hmm_mirror *mirror;
+
+	down_read(&hmm->mirrors_sem);
+	list_for_each_entry(mirror, &hmm->mirrors, list)
+		mirror->ops->sync_cpu_device_pagetables(mirror, action,
+							start, end);
+	up_read(&hmm->mirrors_sem);
+}
+
+static void hmm_invalidate_page(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long addr)
+{
+	unsigned long start = addr & PAGE_MASK;
+	unsigned long end = start + PAGE_SIZE;
+	struct hmm *hmm = mm->hmm;
+
+	VM_BUG_ON(!hmm);
+
+	atomic_inc(&hmm->sequence);
+	hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
+}
+
+static void hmm_invalidate_range_start(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long start,
+				       unsigned long end)
+{
+	struct hmm *hmm = mm->hmm;
+
+	VM_BUG_ON(!hmm);
+
+	atomic_inc(&hmm->sequence);
+}
+
+static void hmm_invalidate_range_end(struct mmu_notifier *mn,
+				     struct mm_struct *mm,
+				     unsigned long start,
+				     unsigned long end)
+{
+	struct hmm *hmm = mm->hmm;
+
+	VM_BUG_ON(!hmm);
+
+	hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
+}
+
+static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
+	.invalidate_page	= hmm_invalidate_page,
+	.invalidate_range_start	= hmm_invalidate_range_start,
+	.invalidate_range_end	= hmm_invalidate_range_end,
+};
+
+/*
+ * hmm_mirror_register() - register a mirror against an mm
+ *
+ * @mirror: new mirror struct to register
+ * @mm: mm to register against
+ *
+ * To start mirroring a process address space, the device driver must register
+ * an HMM mirror struct.
+ *
+ * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
+ */
+int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
+{
+	/* Sanity check */
+	if (!mm || !mirror || !mirror->ops)
+		return -EINVAL;
+
+	mirror->hmm = hmm_register(mm);
+	if (!mirror->hmm)
+		return -ENOMEM;
+
+	down_write(&mirror->hmm->mirrors_sem);
+	list_add(&mirror->list, &mirror->hmm->mirrors);
+	up_write(&mirror->hmm->mirrors_sem);
+
+	return 0;
+}
+EXPORT_SYMBOL(hmm_mirror_register);
+
+/*
+ * hmm_mirror_unregister() - unregister a mirror
+ *
+ * @mirror: new mirror struct to register
+ *
+ * Stop mirroring a process address space, and cleanup.
+ */
+void hmm_mirror_unregister(struct hmm_mirror *mirror)
+{
+	struct hmm *hmm = mirror->hmm;
+
+	down_write(&hmm->mirrors_sem);
+	list_del(&mirror->list);
+	up_write(&hmm->mirrors_sem);
+}
+EXPORT_SYMBOL(hmm_mirror_unregister);
+#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
author	Jérôme Glisse <jglisse@redhat.com>	2017-04-08 14:02:27 +1000
committer	Stephen Rothwell <sfr@canb.auug.org.au>	2017-04-11 15:14:35 +1000
commit	6b0bf7486427babdcfebc0f466ac5c5f2229f8a7 (patch)
tree	f4087bcc7aa04714d6f590c60847c2d5744d7caf /mm
parent	55b3003579180185f1a173c0808432c63753b3fb (diff)
download	linux-next-6b0bf7486427babdcfebc0f466ac5c5f2229f8a7.tar.gz