From 476134070c037820bd909ff6e43e0d3eae33f376 Mon Sep 17 00:00:00 2001
From: Hamish Martin <hamish.martin@alliedtelesis.co.nz>
Date: Fri, 24 Feb 2017 13:52:09 +1300
Subject: powerpc: Move THREAD_SHIFT config to Kconfig

Shift the logic for defining THREAD_SHIFT logic to Kconfig in order to
allow override by users.

Signed-off-by: Hamish Martin <hamish.martin@alliedtelesis.co.nz>
Reviewed-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig                   | 10 ++++++++++
 arch/powerpc/include/asm/thread_info.h | 10 +---------
 2 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 97a8bc8a095c..45724923a69c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -678,6 +678,16 @@ config PPC_256K_PAGES
 
 endchoice
 
+config THREAD_SHIFT
+	int "Thread shift" if EXPERT
+	range 13 15
+	default "15" if PPC_256K_PAGES
+	default "14" if PPC64
+	default "13"
+	help
+	  Used to define the stack size. The default is almost always what you
+	  want. Only change this if you know what you are doing.
+
 config FORCE_MAX_ZONEORDER
 	int "Maximum zone order"
 	range 8 9 if PPC64 && PPC_64K_PAGES
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 87e4b2d8dcd4..2e17d668c472 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -10,15 +10,7 @@
 
 #ifdef __KERNEL__
 
-/* We have 8k stacks on ppc32 and 16k on ppc64 */
-
-#if defined(CONFIG_PPC64)
-#define THREAD_SHIFT		14
-#elif defined(CONFIG_PPC_256K_PAGES)
-#define THREAD_SHIFT		15
-#else
-#define THREAD_SHIFT		13
-#endif
+#define THREAD_SHIFT		CONFIG_THREAD_SHIFT
 
 #define THREAD_SIZE		(1 << THREAD_SHIFT)
 
-- 
cgit v1.2.1


From cabed14891c52618688533bd413b206bf2ebf5c5 Mon Sep 17 00:00:00 2001
From: Hamish Martin <hamish.martin@alliedtelesis.co.nz>
Date: Fri, 24 Feb 2017 13:52:10 +1300
Subject: powerpc/64: Allow for THREAD_SIZE > 16k

Fix an assembler error when the THREAD_SIZE is greater than 16k.

Signed-off-by: Hamish Martin <hamish.martin@alliedtelesis.co.nz>
Reviewed-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/head_64.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 1dc5eae2ced3..0ddc602b33a4 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -949,7 +949,8 @@ start_here_multiplatform:
 	LOAD_REG_ADDR(r3,init_thread_union)
 
 	/* set up a stack pointer */
-	addi	r1,r3,THREAD_SIZE
+	LOAD_REG_IMMEDIATE(r1,THREAD_SIZE)
+	add	r1,r3,r1
 	li	r0,0
 	stdu	r0,-STACK_FRAME_OVERHEAD(r1)
 
-- 
cgit v1.2.1


From 017614a5d6c09ec9e0dc3fd46a5018c20b407d92 Mon Sep 17 00:00:00 2001
From: "Tobin C. Harding" <me@tobin.cc>
Date: Tue, 7 Mar 2017 20:32:42 +1100
Subject: powerpc/pseries: Move struct hcall_stats to hvCall_inst.c

struct hcall_stats is only used in hvCall_inst.c, so move it there.

Signed-off-by: Tobin C. Harding <me@tobin.cc>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/hvcall.h            | 10 ----------
 arch/powerpc/platforms/pseries/hvCall_inst.c | 10 ++++++++++
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 3cc12a86ef5d..d73755fafbb0 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -377,16 +377,6 @@ long plpar_hcall_raw(unsigned long opcode, unsigned long *retbuf, ...);
 long plpar_hcall9(unsigned long opcode, unsigned long *retbuf, ...);
 long plpar_hcall9_raw(unsigned long opcode, unsigned long *retbuf, ...);
 
-/* For hcall instrumentation.  One structure per-hcall, per-CPU */
-struct hcall_stats {
-	unsigned long	num_calls;	/* number of calls (on this CPU) */
-	unsigned long	tb_total;	/* total wall time (mftb) of calls. */
-	unsigned long	purr_total;	/* total cpu time (PURR) of calls. */
-	unsigned long	tb_start;
-	unsigned long	purr_start;
-};
-#define HCALL_STAT_ARRAY_SIZE	((MAX_HCALL_OPCODE >> 2) + 1)
-
 struct hvcall_mpp_data {
 	unsigned long entitled_mem;
 	unsigned long mapped_mem;
diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c
index f02ec3ab428c..957ae347b0b3 100644
--- a/arch/powerpc/platforms/pseries/hvCall_inst.c
+++ b/arch/powerpc/platforms/pseries/hvCall_inst.c
@@ -29,6 +29,16 @@
 #include <asm/trace.h>
 #include <asm/machdep.h>
 
+/* For hcall instrumentation. One structure per-hcall, per-CPU */
+struct hcall_stats {
+	unsigned long	num_calls;	/* number of calls (on this CPU) */
+	unsigned long	tb_total;	/* total wall time (mftb) of calls. */
+	unsigned long	purr_total;	/* total cpu time (PURR) of calls. */
+	unsigned long	tb_start;
+	unsigned long	purr_start;
+};
+#define HCALL_STAT_ARRAY_SIZE	((MAX_HCALL_OPCODE >> 2) + 1)
+
 DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats);
 
 /*
-- 
cgit v1.2.1


From 1fc439c81312cd27aed553964c0d9d48946582ce Mon Sep 17 00:00:00 2001
From: "Tobin C. Harding" <me@tobin.cc>
Date: Mon, 6 Mar 2017 19:25:31 +1100
Subject: powerpc/swsusp: Include suspend.h to silence sparse warnings

Sparse emits two symbol not declared warnings for swsusp.c. The two
functions, save_processor_state() and restore_processor_state() are
declared already in suspend.h, so include it.

Signed-off-by: Tobin C. Harding <me@tobin.cc>
Reviewed-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/swsusp.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c
index 6ae9bd5086a4..0050b2d2ff7a 100644
--- a/arch/powerpc/kernel/swsusp.c
+++ b/arch/powerpc/kernel/swsusp.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/suspend.h>
 #include <asm/current.h>
 #include <asm/mmu_context.h>
 #include <asm/switch_to.h>
-- 
cgit v1.2.1


From b3a7864c6feb0fb30bc2cd3726570746bec41697 Mon Sep 17 00:00:00 2001
From: "Tobin C. Harding" <me@tobin.cc>
Date: Mon, 6 Mar 2017 19:49:46 +1100
Subject: powerpc/ftrace: Add prototype for prepare_ftrace_return()

Sparse emits a warning: symbol 'prepare_ftrace_return' was not
declared. Should it be static? prepare_ftrace_return() is called from
assembler and should not be static.

Add a prototype for it to asm-prototypes.h and include that in ftrace.c.

Signed-off-by: Tobin C. Harding <me@tobin.cc>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/asm-prototypes.h | 2 ++
 arch/powerpc/kernel/ftrace.c              | 1 +
 2 files changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index f6c5264287e5..e02db66a77e3 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -120,6 +120,8 @@ extern s64 __ashrdi3(s64, int);
 extern int __cmpdi2(s64, s64);
 extern int __ucmpdi2(u64, u64);
 
+/* tracing */
 void _mcount(void);
+unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip);
 
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index 5c9f50c1aa99..32509de6ce4c 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/list.h>
 
+#include <asm/asm-prototypes.h>
 #include <asm/cacheflush.h>
 #include <asm/code-patching.h>
 #include <asm/ftrace.h>
-- 
cgit v1.2.1


From 81d5fe1a3b1acfaadc7921d08609e097c6345d7f Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Tue, 21 Feb 2017 13:38:54 +1100
Subject: powerpc/powernv: Fix it_ops::get() callback to return in cpu endian

The iommu_table_ops callbacks are declared CPU endian as they take and
return "unsigned long"; underlying hardware tables are big-endian.

However get() was missing be64_to_cpu(), this adds the missing conversion.

The only caller of this is crash dump at arch/powerpc/kernel/iommu.c,
iommu_table_clear() which only compares TCE to zero so this change
should not cause behavioral change.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index eb835e977e33..a43f22dc069e 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -758,7 +758,7 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 
 unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
 {
-	return *(pnv_tce(tbl, index - tbl->it_offset));
+	return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset)));
 }
 
 struct iommu_table *pnv_pci_table_alloc(int nid)
-- 
cgit v1.2.1


From 20f13b95eef1b3ca75535e313357b0b096d39336 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Tue, 21 Feb 2017 13:40:20 +1100
Subject: powerpc/powernv/npu: Remove dead iommu code

PNV_IODA_PE_DEV is only used for NPU devices (emulated PCI bridges
representing NVLink). These are added to IOMMU groups with corresponding
NVIDIA devices after all non-NPU PEs are setup; a special helper -
pnv_pci_ioda_setup_iommu_api() - handles this in pnv_pci_ioda_fixup().

The pnv_pci_ioda2_setup_dma_pe() helper sets up DMA for a PE. It is called
for VFs (so it does not handle NPU case) and PCI bridges but only
IODA1 and IODA2 types. An NPU bridge has its own type id (PNV_PHB_NPU)
so pnv_pci_ioda2_setup_dma_pe() cannot be called on NPU and therefore
(pe->flags & PNV_IODA_PE_DEV) is always "false".

This removes not used iommu_add_device(). This should not cause any
behavioral change.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index e36738291c32..7461322f0a1a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2735,9 +2735,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	if (rc)
 		return;
 
-	if (pe->flags & PNV_IODA_PE_DEV)
-		iommu_add_device(&pe->pdev->dev);
-	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+	if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
 		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
 }
 
-- 
cgit v1.2.1


From c2294e0ffe741c8b34c630a71c7dc44d30503538 Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Date: Tue, 14 Feb 2017 17:45:10 +0100
Subject: powerpc/mm: Move mmap_sem unlock up from do_sigbus

Move mmap_sem releasing in the do_sigbus()'s unique caller : mm_fault_error()

No functional changes.

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/fault.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 51def8a515be..2d77c1c8014a 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -120,8 +120,6 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
 	siginfo_t info;
 	unsigned int lsb = 0;
 
-	up_read(&current->mm->mmap_sem);
-
 	if (!user_mode(regs))
 		return MM_FAULT_ERR(SIGBUS);
 
@@ -185,8 +183,10 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
 		return MM_FAULT_RETURN;
 	}
 
-	if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE))
+	if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
+		up_read(&current->mm->mmap_sem);
 		return do_sigbus(regs, addr, fault);
+	}
 
 	/* We don't understand the fault code, this is fatal */
 	BUG();
-- 
cgit v1.2.1


From 14c02e419a395c4bd7950175fc47eda1ecaa8c69 Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Date: Tue, 14 Feb 2017 17:45:11 +0100
Subject: powerpc/mm: Handle VM_FAULT_RETRY earlier

In do_page_fault() if handle_mm_fault() returns VM_FAULT_RETRY, retry
the page fault handling before anything else.

This would simplify the handling of the mmap_sem lock in this part of
the code.

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/fault.c | 67 ++++++++++++++++++++++++++++---------------------
 1 file changed, 38 insertions(+), 29 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 2d77c1c8014a..5809a3c86161 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -458,6 +458,26 @@ good_area:
 	 * the fault.
 	 */
 	fault = handle_mm_fault(vma, address, flags);
+
+	/*
+	 * Handle the retry right now, the mmap_sem has been released in that
+	 * case.
+	 */
+	if (unlikely(fault & VM_FAULT_RETRY)) {
+		/* We retry only once */
+		if (flags & FAULT_FLAG_ALLOW_RETRY) {
+			/*
+			 * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+			 * of starvation.
+			 */
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+			flags |= FAULT_FLAG_TRIED;
+			if (!fatal_signal_pending(current))
+				goto retry;
+		}
+		/* We will enter mm_fault_error() below */
+	}
+
 	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
 		if (fault & VM_FAULT_SIGSEGV)
 			goto bad_area;
@@ -469,38 +489,27 @@ good_area:
 	}
 
 	/*
-	 * Major/minor page fault accounting is only done on the
-	 * initial attempt. If we go through a retry, it is extremely
-	 * likely that the page will be found in page cache at that point.
+	 * Major/minor page fault accounting.
 	 */
-	if (flags & FAULT_FLAG_ALLOW_RETRY) {
-		if (fault & VM_FAULT_MAJOR) {
-			current->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
-				      regs, address);
+	if (fault & VM_FAULT_MAJOR) {
+		current->maj_flt++;
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
+			      regs, address);
 #ifdef CONFIG_PPC_SMLPAR
-			if (firmware_has_feature(FW_FEATURE_CMO)) {
-				u32 page_ins;
-
-				preempt_disable();
-				page_ins = be32_to_cpu(get_lppaca()->page_ins);
-				page_ins += 1 << PAGE_FACTOR;
-				get_lppaca()->page_ins = cpu_to_be32(page_ins);
-				preempt_enable();
-			}
-#endif /* CONFIG_PPC_SMLPAR */
-		} else {
-			current->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
-				      regs, address);
-		}
-		if (fault & VM_FAULT_RETRY) {
-			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
-			 * of starvation. */
-			flags &= ~FAULT_FLAG_ALLOW_RETRY;
-			flags |= FAULT_FLAG_TRIED;
-			goto retry;
+		if (firmware_has_feature(FW_FEATURE_CMO)) {
+			u32 page_ins;
+
+			preempt_disable();
+			page_ins = be32_to_cpu(get_lppaca()->page_ins);
+			page_ins += 1 << PAGE_FACTOR;
+			get_lppaca()->page_ins = cpu_to_be32(page_ins);
+			preempt_enable();
 		}
+#endif /* CONFIG_PPC_SMLPAR */
+	} else {
+		current->min_flt++;
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
+			      regs, address);
 	}
 
 	up_read(&mm->mmap_sem);
-- 
cgit v1.2.1


From 819cdcdb7b2ac6f8294a969eb2d9003b1be222bf Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Date: Tue, 14 Feb 2017 17:45:12 +0100
Subject: powerpc/mm: Move mmap_sem unlocking in do_page_fault()

Since the fault retry is now handled earlier, we can release the
mmap_sem lock earlier too and remove later unlocking previously done in
mm_fault_error().

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/fault.c | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 5809a3c86161..fd6484fc2fa9 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -152,13 +152,6 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
 	 * continue the pagefault.
 	 */
 	if (fatal_signal_pending(current)) {
-		/*
-		 * If we have retry set, the mmap semaphore will have
-		 * alrady been released in __lock_page_or_retry(). Else
-		 * we release it now.
-		 */
-		if (!(fault & VM_FAULT_RETRY))
-			up_read(&current->mm->mmap_sem);
 		/* Coming from kernel, we need to deal with uaccess fixups */
 		if (user_mode(regs))
 			return MM_FAULT_RETURN;
@@ -171,8 +164,6 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
 
 	/* Out of memory */
 	if (fault & VM_FAULT_OOM) {
-		up_read(&current->mm->mmap_sem);
-
 		/*
 		 * We ran out of memory, or some other thing happened to us that
 		 * made us unable to handle the page fault gracefully.
@@ -183,10 +174,8 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
 		return MM_FAULT_RETURN;
 	}
 
-	if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
-		up_read(&current->mm->mmap_sem);
+	if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE))
 		return do_sigbus(regs, addr, fault);
-	}
 
 	/* We don't understand the fault code, this is fatal */
 	BUG();
@@ -476,11 +465,12 @@ good_area:
 				goto retry;
 		}
 		/* We will enter mm_fault_error() below */
-	}
+	} else
+		up_read(&current->mm->mmap_sem);
 
 	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
 		if (fault & VM_FAULT_SIGSEGV)
-			goto bad_area;
+			goto bad_area_nosemaphore;
 		rc = mm_fault_error(regs, address, fault);
 		if (rc >= MM_FAULT_RETURN)
 			goto bail;
@@ -512,7 +502,6 @@ good_area:
 			      regs, address);
 	}
 
-	up_read(&mm->mmap_sem);
 	goto bail;
 
 bad_area:
-- 
cgit v1.2.1


From 3072601375ffff92bddd0ce810dd64f6e3c7f45c Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Fri, 2 Dec 2016 02:35:52 +0000
Subject: powerpc/32: Remove Mac-on-Linux/rtlinux hooks

The symbols exported for use by MOL/rtlinux aren't getting CRCs and I
was about to fix that. But MOL is dead upstream, and the latest work on
it was to make it use KVM instead of its own kernel module. So remove
them instead.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/head_32.S | 16 +---------------
 arch/powerpc/mm/hash_low_32.S |  2 --
 2 files changed, 1 insertion(+), 17 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 1607be7c0ef2..e22734278458 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -735,11 +735,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU)
 	EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_EE)
 	EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_EE)
 	EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_EE)
-	EXCEPTION(0x2f00, MOLTrampoline, unknown_exception, EXC_XFER_EE_LITE)
-
-	.globl mol_trampoline
-	.set mol_trampoline, i0x2f00
-	EXPORT_SYMBOL(mol_trampoline)
+	EXCEPTION(0x2f00, Trap_2f, unknown_exception, EXC_XFER_EE)
 
 	. = 0x3000
 
@@ -1278,16 +1274,6 @@ EXPORT_SYMBOL(empty_zero_page)
 swapper_pg_dir:
 	.space	PGD_TABLE_SIZE
 
-	.globl intercept_table
-intercept_table:
-	.long 0, 0, i0x200, i0x300, i0x400, 0, i0x600, i0x700
-	.long i0x800, 0, 0, 0, 0, i0xd00, 0, 0
-	.long 0, 0, 0, i0x1300, 0, 0, 0, 0
-	.long 0, 0, 0, 0, 0, 0, 0, 0
-	.long 0, 0, 0, 0, 0, 0, 0, 0
-	.long 0, 0, 0, 0, 0, 0, 0, 0
-EXPORT_SYMBOL(intercept_table)
-
 /* Room for two PTE pointers, usually the kernel and current user pointers
  * to their respective root page table.
  */
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 09cc50c8dace..6f962e5cb5e1 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -31,10 +31,8 @@
 #ifdef CONFIG_SMP
 	.section .bss
 	.align	2
-	.globl mmu_hash_lock
 mmu_hash_lock:
 	.space	4
-EXPORT_SYMBOL(mmu_hash_lock)
 #endif /* CONFIG_SMP */
 
 /*
-- 
cgit v1.2.1


From 43a8888f0a70db62488fc3ad1505f6c6a2267803 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Fri, 2 Dec 2016 02:38:38 +0000
Subject: powerpc: Fix missing CRCs, add more asm-prototypes.h declarations

Add declarations for:
  - __mfdcr, __mtdcr (if CONFIG_PPC_DCR_NATIVE=y; through <asm/dcr.h>)
  - switch_mmu_context (if CONFIG_PPC_BOOK3S_64=n; through <asm/mmu_context.h>)

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/asm-prototypes.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index e02db66a77e3..7330150bfe34 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -17,6 +17,8 @@
 #include <asm/checksum.h>
 #include <linux/uaccess.h>
 #include <asm/epapr_hcalls.h>
+#include <asm/dcr.h>
+#include <asm/mmu_context.h>
 
 #include <uapi/asm/ucontext.h>
 
-- 
cgit v1.2.1


From fc84427b7e1471b0a1220c56072f574618335edc Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 14 Mar 2017 22:36:43 +1000
Subject: powerpc/64s: Machine check print NIP

Print the faulting address of the machine check that may help with
debugging. The effective address reported can be a target memory address
rather than the faulting instruction address.

Fix up a dangling bracket while here.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/mce.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index a1475e6aef3a..399aeafb7dd4 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -310,7 +310,8 @@ void machine_check_print_event_info(struct machine_check_event *evt)
 
 	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
 	       evt->disposition == MCE_DISPOSITION_RECOVERED ?
-	       "Recovered" : "[Not recovered");
+	       "Recovered" : "Not recovered");
+	printk("%s  NIP: %016llx\n", level, evt->srr0);
 	printk("%s  Initiator: %s\n", level,
 	       evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
 	switch (evt->error_type) {
-- 
cgit v1.2.1


From 88c6511a8c0c09bbc5f9792c4a2c252c7e47e49b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 14 Mar 2017 22:36:44 +1000
Subject: powerpc/64s: Clean up machine check recovery flushing

Use the flush function introduced with the POWER9 machine check handler
for POWER7 and 8, rather than open coding it multiple times in callers.

There is a specific ERAT flush type introduced for POWER9, but the
POWER7-8 ERAT errors continue to do SLB flushing (which also flushes
ERAT), so as not to introduce functional changes with this cleanup
patch.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/mce_power.c | 97 ++++++++++-------------------------------
 1 file changed, 23 insertions(+), 74 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 763d6f58caa8..0f35a88e3655 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -161,81 +161,27 @@ static int mce_handle_flush_derrors(uint64_t dsisr, uint64_t slb, uint64_t tlb,
 	return 1;
 }
 
-static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
-{
-	long handled = 1;
-
-	/*
-	 * flush and reload SLBs for SLB errors and flush TLBs for TLB errors.
-	 * reset the error bits whenever we handle them so that at the end
-	 * we can check whether we handled all of them or not.
-	 * */
-#ifdef CONFIG_PPC_STD_MMU_64
-	if (dsisr & slb_error_bits) {
-		flush_and_reload_slb();
-		/* reset error bits */
-		dsisr &= ~(slb_error_bits);
-	}
-	if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
-		if (cur_cpu_spec && cur_cpu_spec->flush_tlb)
-			cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL);
-		/* reset error bits */
-		dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB;
-	}
-#endif
-	/* Any other errors we don't understand? */
-	if (dsisr & 0xffffffffUL)
-		handled = 0;
-
-	return handled;
-}
-
 static long mce_handle_derror_p7(uint64_t dsisr)
 {
-	return mce_handle_derror(dsisr, P7_DSISR_MC_SLB_ERRORS);
+	return mce_handle_flush_derrors(dsisr,
+			P7_DSISR_MC_SLB_ERRORS,
+			P7_DSISR_MC_TLB_MULTIHIT_MFTLB,
+			0);
 }
 
-static long mce_handle_common_ierror(uint64_t srr1)
+static long mce_handle_ierror_p7(uint64_t srr1)
 {
-	long handled = 0;
-
 	switch (P7_SRR1_MC_IFETCH(srr1)) {
-	case 0:
-		break;
-#ifdef CONFIG_PPC_STD_MMU_64
 	case P7_SRR1_MC_IFETCH_SLB_PARITY:
 	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
-		/* flush and reload SLBs for SLB errors. */
-		flush_and_reload_slb();
-		handled = 1;
-		break;
+	case P7_SRR1_MC_IFETCH_SLB_BOTH:
+		return mce_flush(MCE_FLUSH_SLB);
+
 	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
-		if (cur_cpu_spec && cur_cpu_spec->flush_tlb) {
-			cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL);
-			handled = 1;
-		}
-		break;
-#endif
+		return mce_flush(MCE_FLUSH_TLB);
 	default:
-		break;
-	}
-
-	return handled;
-}
-
-static long mce_handle_ierror_p7(uint64_t srr1)
-{
-	long handled = 0;
-
-	handled = mce_handle_common_ierror(srr1);
-
-#ifdef CONFIG_PPC_STD_MMU_64
-	if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
-		flush_and_reload_slb();
-		handled = 1;
+		return 0;
 	}
-#endif
-	return handled;
 }
 
 static void mce_get_common_ierror(struct mce_error_info *mce_err, uint64_t srr1)
@@ -376,22 +322,25 @@ static void mce_get_derror_p8(struct mce_error_info *mce_err, uint64_t dsisr)
 
 static long mce_handle_ierror_p8(uint64_t srr1)
 {
-	long handled = 0;
-
-	handled = mce_handle_common_ierror(srr1);
+	switch (P7_SRR1_MC_IFETCH(srr1)) {
+	case P7_SRR1_MC_IFETCH_SLB_PARITY:
+	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
+	case P8_SRR1_MC_IFETCH_ERAT_MULTIHIT:
+		return mce_flush(MCE_FLUSH_SLB);
 
-#ifdef CONFIG_PPC_STD_MMU_64
-	if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
-		flush_and_reload_slb();
-		handled = 1;
+	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
+		return mce_flush(MCE_FLUSH_TLB);
+	default:
+		return 0;
 	}
-#endif
-	return handled;
 }
 
 static long mce_handle_derror_p8(uint64_t dsisr)
 {
-	return mce_handle_derror(dsisr, P8_DSISR_MC_SLB_ERRORS);
+	return mce_handle_flush_derrors(dsisr,
+			P8_DSISR_MC_SLB_ERRORS,
+			P7_DSISR_MC_TLB_MULTIHIT_MFTLB,
+			0);
 }
 
 long __machine_check_early_realmode_p8(struct pt_regs *regs)
-- 
cgit v1.2.1


From 58c8d17f2ed7367f9886908633fbe85624e41e78 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 14 Mar 2017 22:36:45 +1000
Subject: powerpc/64s: Move POWER machine check defines into mce_power.c

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/mce.h  | 91 ----------------------------------------
 arch/powerpc/kernel/mce_power.c | 92 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+), 91 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index ed62efe01e49..e3498b446788 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -24,97 +24,6 @@
 
 #include <linux/bitops.h>
 
-/*
- * Machine Check bits on power7 and power8
- */
-#define P7_SRR1_MC_LOADSTORE(srr1)	((srr1) & PPC_BIT(42)) /* P8 too */
-
-/* SRR1 bits for machine check (On Power7 and Power8) */
-#define P7_SRR1_MC_IFETCH(srr1)	((srr1) & PPC_BITMASK(43, 45)) /* P8 too */
-
-#define P7_SRR1_MC_IFETCH_UE		(0x1 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_SLB_PARITY	(0x2 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_SLB_MULTIHIT	(0x3 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_SLB_BOTH	(0x4 << PPC_BITLSHIFT(45))
-#define P7_SRR1_MC_IFETCH_TLB_MULTIHIT	(0x5 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_UE_TLB_RELOAD	(0x6 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL	(0x7 << PPC_BITLSHIFT(45))
-
-/* SRR1 bits for machine check (On Power8) */
-#define P8_SRR1_MC_IFETCH_ERAT_MULTIHIT	(0x4 << PPC_BITLSHIFT(45))
-
-/* DSISR bits for machine check (On Power7 and Power8) */
-#define P7_DSISR_MC_UE			(PPC_BIT(48))	/* P8 too */
-#define P7_DSISR_MC_UE_TABLEWALK	(PPC_BIT(49))	/* P8 too */
-#define P7_DSISR_MC_ERAT_MULTIHIT	(PPC_BIT(52))	/* P8 too */
-#define P7_DSISR_MC_TLB_MULTIHIT_MFTLB	(PPC_BIT(53))	/* P8 too */
-#define P7_DSISR_MC_SLB_PARITY_MFSLB	(PPC_BIT(55))	/* P8 too */
-#define P7_DSISR_MC_SLB_MULTIHIT	(PPC_BIT(56))	/* P8 too */
-#define P7_DSISR_MC_SLB_MULTIHIT_PARITY	(PPC_BIT(57))	/* P8 too */
-
-/*
- * DSISR bits for machine check (Power8) in addition to above.
- * Secondary DERAT Multihit
- */
-#define P8_DSISR_MC_ERAT_MULTIHIT_SEC	(PPC_BIT(54))
-
-/* SLB error bits */
-#define P7_DSISR_MC_SLB_ERRORS		(P7_DSISR_MC_ERAT_MULTIHIT | \
-					 P7_DSISR_MC_SLB_PARITY_MFSLB | \
-					 P7_DSISR_MC_SLB_MULTIHIT | \
-					 P7_DSISR_MC_SLB_MULTIHIT_PARITY)
-
-#define P8_DSISR_MC_SLB_ERRORS		(P7_DSISR_MC_SLB_ERRORS | \
-					 P8_DSISR_MC_ERAT_MULTIHIT_SEC)
-
-/*
- * Machine Check bits on power9
- */
-#define P9_SRR1_MC_LOADSTORE(srr1)	(((srr1) >> PPC_BITLSHIFT(42)) & 1)
-
-#define P9_SRR1_MC_IFETCH(srr1)	(	\
-	PPC_BITEXTRACT(srr1, 45, 0) |	\
-	PPC_BITEXTRACT(srr1, 44, 1) |	\
-	PPC_BITEXTRACT(srr1, 43, 2) |	\
-	PPC_BITEXTRACT(srr1, 36, 3) )
-
-/* 0 is reserved */
-#define P9_SRR1_MC_IFETCH_UE				1
-#define P9_SRR1_MC_IFETCH_SLB_PARITY			2
-#define P9_SRR1_MC_IFETCH_SLB_MULTIHIT			3
-#define P9_SRR1_MC_IFETCH_ERAT_MULTIHIT			4
-#define P9_SRR1_MC_IFETCH_TLB_MULTIHIT			5
-#define P9_SRR1_MC_IFETCH_UE_TLB_RELOAD			6
-/* 7 is reserved */
-#define P9_SRR1_MC_IFETCH_LINK_TIMEOUT			8
-#define P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT	9
-/* 10 ? */
-#define P9_SRR1_MC_IFETCH_RA			11
-#define P9_SRR1_MC_IFETCH_RA_TABLEWALK		12
-#define P9_SRR1_MC_IFETCH_RA_ASYNC_STORE		13
-#define P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT	14
-#define P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN	15
-
-/* DSISR bits for machine check (On Power9) */
-#define P9_DSISR_MC_UE					(PPC_BIT(48))
-#define P9_DSISR_MC_UE_TABLEWALK			(PPC_BIT(49))
-#define P9_DSISR_MC_LINK_LOAD_TIMEOUT			(PPC_BIT(50))
-#define P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT		(PPC_BIT(51))
-#define P9_DSISR_MC_ERAT_MULTIHIT			(PPC_BIT(52))
-#define P9_DSISR_MC_TLB_MULTIHIT_MFTLB			(PPC_BIT(53))
-#define P9_DSISR_MC_USER_TLBIE				(PPC_BIT(54))
-#define P9_DSISR_MC_SLB_PARITY_MFSLB			(PPC_BIT(55))
-#define P9_DSISR_MC_SLB_MULTIHIT_MFSLB			(PPC_BIT(56))
-#define P9_DSISR_MC_RA_LOAD				(PPC_BIT(57))
-#define P9_DSISR_MC_RA_TABLEWALK			(PPC_BIT(58))
-#define P9_DSISR_MC_RA_TABLEWALK_FOREIGN		(PPC_BIT(59))
-#define P9_DSISR_MC_RA_FOREIGN				(PPC_BIT(60))
-
-/* SLB error bits */
-#define P9_DSISR_MC_SLB_ERRORS		(P9_DSISR_MC_ERAT_MULTIHIT | \
-					 P9_DSISR_MC_SLB_PARITY_MFSLB | \
-					 P9_DSISR_MC_SLB_MULTIHIT_MFSLB)
-
 enum MCE_Version {
 	MCE_V1 = 1,
 };
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 0f35a88e3655..9b3bcd1213bf 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -161,6 +161,98 @@ static int mce_handle_flush_derrors(uint64_t dsisr, uint64_t slb, uint64_t tlb,
 	return 1;
 }
 
+
+/*
+ * Machine Check bits on power7 and power8
+ */
+#define P7_SRR1_MC_LOADSTORE(srr1)	((srr1) & PPC_BIT(42)) /* P8 too */
+
+/* SRR1 bits for machine check (On Power7 and Power8) */
+#define P7_SRR1_MC_IFETCH(srr1)	((srr1) & PPC_BITMASK(43, 45)) /* P8 too */
+
+#define P7_SRR1_MC_IFETCH_UE		(0x1 << PPC_BITLSHIFT(45)) /* P8 too */
+#define P7_SRR1_MC_IFETCH_SLB_PARITY	(0x2 << PPC_BITLSHIFT(45)) /* P8 too */
+#define P7_SRR1_MC_IFETCH_SLB_MULTIHIT	(0x3 << PPC_BITLSHIFT(45)) /* P8 too */
+#define P7_SRR1_MC_IFETCH_SLB_BOTH	(0x4 << PPC_BITLSHIFT(45))
+#define P7_SRR1_MC_IFETCH_TLB_MULTIHIT	(0x5 << PPC_BITLSHIFT(45)) /* P8 too */
+#define P7_SRR1_MC_IFETCH_UE_TLB_RELOAD	(0x6 << PPC_BITLSHIFT(45)) /* P8 too */
+#define P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL	(0x7 << PPC_BITLSHIFT(45))
+
+/* SRR1 bits for machine check (On Power8) */
+#define P8_SRR1_MC_IFETCH_ERAT_MULTIHIT	(0x4 << PPC_BITLSHIFT(45))
+
+/* DSISR bits for machine check (On Power7 and Power8) */
+#define P7_DSISR_MC_UE			(PPC_BIT(48))	/* P8 too */
+#define P7_DSISR_MC_UE_TABLEWALK	(PPC_BIT(49))	/* P8 too */
+#define P7_DSISR_MC_ERAT_MULTIHIT	(PPC_BIT(52))	/* P8 too */
+#define P7_DSISR_MC_TLB_MULTIHIT_MFTLB	(PPC_BIT(53))	/* P8 too */
+#define P7_DSISR_MC_SLB_PARITY_MFSLB	(PPC_BIT(55))	/* P8 too */
+#define P7_DSISR_MC_SLB_MULTIHIT	(PPC_BIT(56))	/* P8 too */
+#define P7_DSISR_MC_SLB_MULTIHIT_PARITY	(PPC_BIT(57))	/* P8 too */
+
+/*
+ * DSISR bits for machine check (Power8) in addition to above.
+ * Secondary DERAT Multihit
+ */
+#define P8_DSISR_MC_ERAT_MULTIHIT_SEC	(PPC_BIT(54))
+
+/* SLB error bits */
+#define P7_DSISR_MC_SLB_ERRORS		(P7_DSISR_MC_ERAT_MULTIHIT | \
+					 P7_DSISR_MC_SLB_PARITY_MFSLB | \
+					 P7_DSISR_MC_SLB_MULTIHIT | \
+					 P7_DSISR_MC_SLB_MULTIHIT_PARITY)
+
+#define P8_DSISR_MC_SLB_ERRORS		(P7_DSISR_MC_SLB_ERRORS | \
+					 P8_DSISR_MC_ERAT_MULTIHIT_SEC)
+
+/*
+ * Machine Check bits on power9
+ */
+#define P9_SRR1_MC_LOADSTORE(srr1)	(((srr1) >> PPC_BITLSHIFT(42)) & 1)
+
+#define P9_SRR1_MC_IFETCH(srr1)	(	\
+	PPC_BITEXTRACT(srr1, 45, 0) |	\
+	PPC_BITEXTRACT(srr1, 44, 1) |	\
+	PPC_BITEXTRACT(srr1, 43, 2) |	\
+	PPC_BITEXTRACT(srr1, 36, 3) )
+
+/* 0 is reserved */
+#define P9_SRR1_MC_IFETCH_UE				1
+#define P9_SRR1_MC_IFETCH_SLB_PARITY			2
+#define P9_SRR1_MC_IFETCH_SLB_MULTIHIT			3
+#define P9_SRR1_MC_IFETCH_ERAT_MULTIHIT			4
+#define P9_SRR1_MC_IFETCH_TLB_MULTIHIT			5
+#define P9_SRR1_MC_IFETCH_UE_TLB_RELOAD			6
+/* 7 is reserved */
+#define P9_SRR1_MC_IFETCH_LINK_TIMEOUT			8
+#define P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT	9
+/* 10 ? */
+#define P9_SRR1_MC_IFETCH_RA			11
+#define P9_SRR1_MC_IFETCH_RA_TABLEWALK		12
+#define P9_SRR1_MC_IFETCH_RA_ASYNC_STORE		13
+#define P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT	14
+#define P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN	15
+
+/* DSISR bits for machine check (On Power9) */
+#define P9_DSISR_MC_UE					(PPC_BIT(48))
+#define P9_DSISR_MC_UE_TABLEWALK			(PPC_BIT(49))
+#define P9_DSISR_MC_LINK_LOAD_TIMEOUT			(PPC_BIT(50))
+#define P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT		(PPC_BIT(51))
+#define P9_DSISR_MC_ERAT_MULTIHIT			(PPC_BIT(52))
+#define P9_DSISR_MC_TLB_MULTIHIT_MFTLB			(PPC_BIT(53))
+#define P9_DSISR_MC_USER_TLBIE				(PPC_BIT(54))
+#define P9_DSISR_MC_SLB_PARITY_MFSLB			(PPC_BIT(55))
+#define P9_DSISR_MC_SLB_MULTIHIT_MFSLB			(PPC_BIT(56))
+#define P9_DSISR_MC_RA_LOAD				(PPC_BIT(57))
+#define P9_DSISR_MC_RA_TABLEWALK			(PPC_BIT(58))
+#define P9_DSISR_MC_RA_TABLEWALK_FOREIGN		(PPC_BIT(59))
+#define P9_DSISR_MC_RA_FOREIGN				(PPC_BIT(60))
+
+/* SLB error bits */
+#define P9_DSISR_MC_SLB_ERRORS		(P9_DSISR_MC_ERAT_MULTIHIT | \
+					 P9_DSISR_MC_SLB_PARITY_MFSLB | \
+					 P9_DSISR_MC_SLB_MULTIHIT_MFSLB)
+
 static long mce_handle_derror_p7(uint64_t dsisr)
 {
 	return mce_handle_flush_derrors(dsisr,
-- 
cgit v1.2.1


From 631bc46c8c8e7014f2e90c94e356bf59e1fbf800 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 14 Mar 2017 22:36:46 +1000
Subject: powerpc/64s: Data driven machine check evaluation

Have machine types define i-side and d-side tables to describe their
machine check encodings, and match entries to evaluate (for reporting)
machine checks.

Functionality is mostly unchanged (tested with a userspace harness), but
it does make a change in that it no longer records DAR as the effective
address for those errors where it is specified to be invalid (which is a
reporting change only).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/mce_power.c | 569 +++++++++++++++++++++++-----------------
 1 file changed, 326 insertions(+), 243 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 9b3bcd1213bf..c5ea992fec45 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -253,91 +253,302 @@ static int mce_handle_flush_derrors(uint64_t dsisr, uint64_t slb, uint64_t tlb,
 					 P9_DSISR_MC_SLB_PARITY_MFSLB | \
 					 P9_DSISR_MC_SLB_MULTIHIT_MFSLB)
 
-static long mce_handle_derror_p7(uint64_t dsisr)
-{
-	return mce_handle_flush_derrors(dsisr,
-			P7_DSISR_MC_SLB_ERRORS,
-			P7_DSISR_MC_TLB_MULTIHIT_MFTLB,
-			0);
-}
-
-static long mce_handle_ierror_p7(uint64_t srr1)
+struct mce_ierror_table {
+	unsigned long srr1_mask;
+	unsigned long srr1_value;
+	bool nip_valid; /* nip is a valid indicator of faulting address */
+	unsigned int error_type;
+	unsigned int error_subtype;
+	unsigned int initiator;
+	unsigned int severity;
+};
+
+static const struct mce_ierror_table mce_p7_ierror_table[] = {
+{ 0x00000000001c0000, 0x0000000000040000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000080000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x00000000000c0000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000100000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000140000, true,
+  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000180000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x00000000001c0000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0, 0, 0, 0, 0, 0 } };
+
+static const struct mce_ierror_table mce_p8_ierror_table[] = {
+{ 0x00000000001c0000, 0x0000000000040000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000080000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x00000000000c0000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000100000, true,
+  MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000140000, true,
+  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000180000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x00000000001c0000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0, 0, 0, 0, 0, 0 } };
+
+static const struct mce_ierror_table mce_p9_ierror_table[] = {
+{ 0x00000000081c0000, 0x0000000000040000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000080000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000000c0000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000100000, true,
+  MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000140000, true,
+  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000180000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008000000, true,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008040000, true,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000080c0000, true,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008100000, true,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008140000, false,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_STORE,
+  MCE_INITIATOR_CPU,  MCE_SEV_FATAL, }, /* ASYNC is fatal */
+{ 0x00000000081c0000, 0x0000000008180000, false,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_STORE_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_FATAL, }, /* ASYNC is fatal */
+{ 0x00000000081c0000, 0x00000000081c0000, true,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0, 0, 0, 0, 0, 0 } };
+
+struct mce_derror_table {
+	unsigned long dsisr_value;
+	bool dar_valid; /* dar is a valid indicator of faulting address */
+	unsigned int error_type;
+	unsigned int error_subtype;
+	unsigned int initiator;
+	unsigned int severity;
+};
+
+static const struct mce_derror_table mce_p7_derror_table[] = {
+{ 0x00008000, false,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00004000, true,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000800, true,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000400, true,
+  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000100, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000080, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000040, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0, false, 0, 0, 0, 0 } };
+
+static const struct mce_derror_table mce_p8_derror_table[] = {
+{ 0x00008000, false,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00004000, true,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000800, true,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000400, true,
+  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000200, true,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000100, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000080, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0, false, 0, 0, 0, 0 } };
+
+static const struct mce_derror_table mce_p9_derror_table[] = {
+{ 0x00008000, false,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00004000, true,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00002000, true,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00001000, true,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000800, true,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000400, true,
+  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000200, false,
+  MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000100, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000080, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000040, true,
+  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_LOAD,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000020, false,
+  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000010, false,
+  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000008, false,
+  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_LOAD_STORE_FOREIGN,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0, false, 0, 0, 0, 0 } };
+
+static void mce_get_ierror(struct pt_regs *regs,
+		const struct mce_ierror_table table[],
+		struct mce_error_info *mce_err, uint64_t *addr)
 {
-	switch (P7_SRR1_MC_IFETCH(srr1)) {
-	case P7_SRR1_MC_IFETCH_SLB_PARITY:
-	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
-	case P7_SRR1_MC_IFETCH_SLB_BOTH:
-		return mce_flush(MCE_FLUSH_SLB);
-
-	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
-		return mce_flush(MCE_FLUSH_TLB);
-	default:
-		return 0;
+	uint64_t srr1 = regs->msr;
+	int i;
+
+	*addr = 0;
+
+	for (i = 0; table[i].srr1_mask; i++) {
+		if ((srr1 & table[i].srr1_mask) != table[i].srr1_value)
+			continue;
+
+		mce_err->error_type = table[i].error_type;
+		switch (table[i].error_type) {
+		case MCE_ERROR_TYPE_UE:
+			mce_err->u.ue_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_SLB:
+			mce_err->u.slb_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_ERAT:
+			mce_err->u.erat_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_TLB:
+			mce_err->u.tlb_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_USER:
+			mce_err->u.user_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_RA:
+			mce_err->u.ra_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_LINK:
+			mce_err->u.link_error_type = table[i].error_subtype;
+			break;
+		}
+		mce_err->severity = table[i].severity;
+		mce_err->initiator = table[i].initiator;
+		if (table[i].nip_valid)
+			*addr = regs->nip;
+		return;
 	}
-}
 
-static void mce_get_common_ierror(struct mce_error_info *mce_err, uint64_t srr1)
-{
-	switch (P7_SRR1_MC_IFETCH(srr1)) {
-	case P7_SRR1_MC_IFETCH_SLB_PARITY:
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
-		break;
-	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
-		break;
-	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
-		mce_err->error_type = MCE_ERROR_TYPE_TLB;
-		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
-		break;
-	case P7_SRR1_MC_IFETCH_UE:
-	case P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL:
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH;
-		break;
-	case P7_SRR1_MC_IFETCH_UE_TLB_RELOAD:
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type =
-				MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
-		break;
-	}
+	mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
+	mce_err->severity = MCE_SEV_ERROR_SYNC;
+	mce_err->initiator = MCE_INITIATOR_CPU;
 }
 
-static void mce_get_ierror_p7(struct mce_error_info *mce_err, uint64_t srr1)
+static void mce_get_derror(struct pt_regs *regs,
+		const struct mce_derror_table table[],
+		struct mce_error_info *mce_err, uint64_t *addr)
 {
-	mce_get_common_ierror(mce_err, srr1);
-	if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
-	}
-}
+	uint64_t dsisr = regs->dsisr;
+	int i;
+
+	*addr = 0;
+
+	for (i = 0; table[i].dsisr_value; i++) {
+		if (!(dsisr & table[i].dsisr_value))
+			continue;
+
+		mce_err->error_type = table[i].error_type;
+		switch (table[i].error_type) {
+		case MCE_ERROR_TYPE_UE:
+			mce_err->u.ue_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_SLB:
+			mce_err->u.slb_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_ERAT:
+			mce_err->u.erat_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_TLB:
+			mce_err->u.tlb_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_USER:
+			mce_err->u.user_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_RA:
+			mce_err->u.ra_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_LINK:
+			mce_err->u.link_error_type = table[i].error_subtype;
+			break;
+		}
+		mce_err->severity = table[i].severity;
+		mce_err->initiator = table[i].initiator;
+		if (table[i].dar_valid)
+			*addr = regs->dar;
 
-static void mce_get_derror_p7(struct mce_error_info *mce_err, uint64_t dsisr)
-{
-	if (dsisr & P7_DSISR_MC_UE) {
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
-	} else if (dsisr & P7_DSISR_MC_UE_TABLEWALK) {
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type =
-				MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
-	} else if (dsisr & P7_DSISR_MC_ERAT_MULTIHIT) {
-		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
-		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
-	} else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT) {
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
-	} else if (dsisr & P7_DSISR_MC_SLB_PARITY_MFSLB) {
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
-	} else if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
-		mce_err->error_type = MCE_ERROR_TYPE_TLB;
-		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
-	} else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT_PARITY) {
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
+		return;
 	}
+
+	mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
+	mce_err->severity = MCE_SEV_ERROR_SYNC;
+	mce_err->initiator = MCE_INITIATOR_CPU;
 }
 
 static long mce_handle_ue_error(struct pt_regs *regs)
@@ -358,18 +569,41 @@ static long mce_handle_ue_error(struct pt_regs *regs)
 	return handled;
 }
 
+static long mce_handle_derror_p7(uint64_t dsisr)
+{
+	return mce_handle_flush_derrors(dsisr,
+			P7_DSISR_MC_SLB_ERRORS,
+			P7_DSISR_MC_TLB_MULTIHIT_MFTLB,
+			0);
+}
+
+static long mce_handle_ierror_p7(uint64_t srr1)
+{
+	switch (P7_SRR1_MC_IFETCH(srr1)) {
+	case P7_SRR1_MC_IFETCH_SLB_PARITY:
+	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
+	case P7_SRR1_MC_IFETCH_SLB_BOTH:
+		return mce_flush(MCE_FLUSH_SLB);
+
+	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
+		return mce_flush(MCE_FLUSH_TLB);
+	default:
+		return 0;
+	}
+}
+
 long __machine_check_early_realmode_p7(struct pt_regs *regs)
 {
 	uint64_t srr1, nip, addr;
 	long handled = 1;
 	struct mce_error_info mce_error_info = { 0 };
 
-	mce_error_info.severity = MCE_SEV_ERROR_SYNC;
-	mce_error_info.initiator = MCE_INITIATOR_CPU;
-
 	srr1 = regs->msr;
 	nip = regs->nip;
 
+	/* P7 DD1 leaves top bits of DSISR undefined */
+	regs->dsisr &= 0x0000ffff;
+
 	/*
 	 * Handle memory errors depending whether this was a load/store or
 	 * ifetch exception. Also, populate the mce error_type and
@@ -378,12 +612,12 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs)
 	 */
 	if (P7_SRR1_MC_LOADSTORE(srr1)) {
 		handled = mce_handle_derror_p7(regs->dsisr);
-		mce_get_derror_p7(&mce_error_info, regs->dsisr);
-		addr = regs->dar;
+		mce_get_derror(regs, mce_p7_derror_table,
+				&mce_error_info, &addr);
 	} else {
 		handled = mce_handle_ierror_p7(srr1);
-		mce_get_ierror_p7(&mce_error_info, srr1);
-		addr = regs->nip;
+		mce_get_ierror(regs, mce_p7_ierror_table,
+				&mce_error_info, &addr);
 	}
 
 	/* Handle UE error. */
@@ -394,24 +628,6 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs)
 	return handled;
 }
 
-static void mce_get_ierror_p8(struct mce_error_info *mce_err, uint64_t srr1)
-{
-	mce_get_common_ierror(mce_err, srr1);
-	if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
-		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
-		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
-	}
-}
-
-static void mce_get_derror_p8(struct mce_error_info *mce_err, uint64_t dsisr)
-{
-	mce_get_derror_p7(mce_err, dsisr);
-	if (dsisr & P8_DSISR_MC_ERAT_MULTIHIT_SEC) {
-		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
-		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
-	}
-}
-
 static long mce_handle_ierror_p8(uint64_t srr1)
 {
 	switch (P7_SRR1_MC_IFETCH(srr1)) {
@@ -441,20 +657,17 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs)
 	long handled = 1;
 	struct mce_error_info mce_error_info = { 0 };
 
-	mce_error_info.severity = MCE_SEV_ERROR_SYNC;
-	mce_error_info.initiator = MCE_INITIATOR_CPU;
-
 	srr1 = regs->msr;
 	nip = regs->nip;
 
 	if (P7_SRR1_MC_LOADSTORE(srr1)) {
 		handled = mce_handle_derror_p8(regs->dsisr);
-		mce_get_derror_p8(&mce_error_info, regs->dsisr);
-		addr = regs->dar;
+		mce_get_derror(regs, mce_p8_derror_table,
+				&mce_error_info, &addr);
 	} else {
 		handled = mce_handle_ierror_p8(srr1);
-		mce_get_ierror_p8(&mce_error_info, srr1);
-		addr = regs->nip;
+		mce_get_ierror(regs, mce_p8_ierror_table,
+				&mce_error_info, &addr);
 	}
 
 	/* Handle UE error. */
@@ -495,138 +708,6 @@ static int mce_handle_ierror_p9(struct pt_regs *regs)
 	}
 }
 
-static void mce_get_derror_p9(struct pt_regs *regs,
-		struct mce_error_info *mce_err, uint64_t *addr)
-{
-	uint64_t dsisr = regs->dsisr;
-
-	mce_err->severity = MCE_SEV_ERROR_SYNC;
-	mce_err->initiator = MCE_INITIATOR_CPU;
-
-	if (dsisr & P9_DSISR_MC_USER_TLBIE)
-		*addr = regs->nip;
-	else
-		*addr = regs->dar;
-
-	if (dsisr & P9_DSISR_MC_UE) {
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
-	} else if (dsisr & P9_DSISR_MC_UE_TABLEWALK) {
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
-	} else if (dsisr & P9_DSISR_MC_LINK_LOAD_TIMEOUT) {
-		mce_err->error_type = MCE_ERROR_TYPE_LINK;
-		mce_err->u.link_error_type = MCE_LINK_ERROR_LOAD_TIMEOUT;
-	} else if (dsisr & P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT) {
-		mce_err->error_type = MCE_ERROR_TYPE_LINK;
-		mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT;
-	} else if (dsisr & P9_DSISR_MC_ERAT_MULTIHIT) {
-		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
-		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
-	} else if (dsisr & P9_DSISR_MC_TLB_MULTIHIT_MFTLB) {
-		mce_err->error_type = MCE_ERROR_TYPE_TLB;
-		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
-	} else if (dsisr & P9_DSISR_MC_USER_TLBIE) {
-		mce_err->error_type = MCE_ERROR_TYPE_USER;
-		mce_err->u.user_error_type = MCE_USER_ERROR_TLBIE;
-	} else if (dsisr & P9_DSISR_MC_SLB_PARITY_MFSLB) {
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
-	} else if (dsisr & P9_DSISR_MC_SLB_MULTIHIT_MFSLB) {
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
-	} else if (dsisr & P9_DSISR_MC_RA_LOAD) {
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD;
-	} else if (dsisr & P9_DSISR_MC_RA_TABLEWALK) {
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
-	} else if (dsisr & P9_DSISR_MC_RA_TABLEWALK_FOREIGN) {
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN;
-	} else if (dsisr & P9_DSISR_MC_RA_FOREIGN) {
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD_STORE_FOREIGN;
-	}
-}
-
-static void mce_get_ierror_p9(struct pt_regs *regs,
-		struct mce_error_info *mce_err, uint64_t *addr)
-{
-	uint64_t srr1 = regs->msr;
-
-	switch (P9_SRR1_MC_IFETCH(srr1)) {
-	case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE:
-	case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT:
-		mce_err->severity = MCE_SEV_FATAL;
-		break;
-	default:
-		mce_err->severity = MCE_SEV_ERROR_SYNC;
-		break;
-	}
-
-	mce_err->initiator = MCE_INITIATOR_CPU;
-
-	*addr = regs->nip;
-
-	switch (P9_SRR1_MC_IFETCH(srr1)) {
-	case P9_SRR1_MC_IFETCH_UE:
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH;
-		break;
-	case P9_SRR1_MC_IFETCH_SLB_PARITY:
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
-		break;
-	case P9_SRR1_MC_IFETCH_SLB_MULTIHIT:
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
-		break;
-	case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT:
-		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
-		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
-		break;
-	case P9_SRR1_MC_IFETCH_TLB_MULTIHIT:
-		mce_err->error_type = MCE_ERROR_TYPE_TLB;
-		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
-		break;
-	case P9_SRR1_MC_IFETCH_UE_TLB_RELOAD:
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
-		break;
-	case P9_SRR1_MC_IFETCH_LINK_TIMEOUT:
-		mce_err->error_type = MCE_ERROR_TYPE_LINK;
-		mce_err->u.link_error_type = MCE_LINK_ERROR_IFETCH_TIMEOUT;
-		break;
-	case P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT:
-		mce_err->error_type = MCE_ERROR_TYPE_LINK;
-		mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT;
-		break;
-	case P9_SRR1_MC_IFETCH_RA:
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_IFETCH;
-		break;
-	case P9_SRR1_MC_IFETCH_RA_TABLEWALK:
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH;
-		break;
-	case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE:
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_STORE;
-		break;
-	case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT:
-		mce_err->error_type = MCE_ERROR_TYPE_LINK;
-		mce_err->u.link_error_type = MCE_LINK_ERROR_STORE_TIMEOUT;
-		break;
-	case P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN:
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN;
-		break;
-	default:
-		break;
-	}
-}
-
 long __machine_check_early_realmode_p9(struct pt_regs *regs)
 {
 	uint64_t nip, addr;
@@ -637,10 +718,12 @@ long __machine_check_early_realmode_p9(struct pt_regs *regs)
 
 	if (P9_SRR1_MC_LOADSTORE(regs->msr)) {
 		handled = mce_handle_derror_p9(regs);
-		mce_get_derror_p9(regs, &mce_error_info, &addr);
+		mce_get_derror(regs, mce_p9_derror_table,
+				&mce_error_info, &addr);
 	} else {
 		handled = mce_handle_ierror_p9(regs);
-		mce_get_ierror_p9(regs, &mce_error_info, &addr);
+		mce_get_ierror(regs, mce_p9_ierror_table,
+				&mce_error_info, &addr);
 	}
 
 	/* Handle UE error. */
-- 
cgit v1.2.1


From 755309be770695cafa4137520789c584903caa39 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 14 Mar 2017 22:36:47 +1000
Subject: powerpc/64s: Data driven machine check handling

Move the handling (corrective action) of machine checks to the table
based evaluation.

This changes P7 and P8 ERAT flushing from using SLB flush to using ERAT
flush.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/mce_power.c | 328 +++++++++-------------------------------
 1 file changed, 74 insertions(+), 254 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index c5ea992fec45..4529be2b5bf7 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -147,111 +147,7 @@ static int mce_flush(int what)
 	return 0;
 }
 
-static int mce_handle_flush_derrors(uint64_t dsisr, uint64_t slb, uint64_t tlb, uint64_t erat)
-{
-	if ((dsisr & slb) && mce_flush(MCE_FLUSH_SLB))
-		dsisr &= ~slb;
-	if ((dsisr & erat) && mce_flush(MCE_FLUSH_ERAT))
-		dsisr &= ~erat;
-	if ((dsisr & tlb) && mce_flush(MCE_FLUSH_TLB))
-		dsisr &= ~tlb;
-	/* Any other errors we don't understand? */
-	if (dsisr)
-		return 0;
-	return 1;
-}
-
-
-/*
- * Machine Check bits on power7 and power8
- */
-#define P7_SRR1_MC_LOADSTORE(srr1)	((srr1) & PPC_BIT(42)) /* P8 too */
-
-/* SRR1 bits for machine check (On Power7 and Power8) */
-#define P7_SRR1_MC_IFETCH(srr1)	((srr1) & PPC_BITMASK(43, 45)) /* P8 too */
-
-#define P7_SRR1_MC_IFETCH_UE		(0x1 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_SLB_PARITY	(0x2 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_SLB_MULTIHIT	(0x3 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_SLB_BOTH	(0x4 << PPC_BITLSHIFT(45))
-#define P7_SRR1_MC_IFETCH_TLB_MULTIHIT	(0x5 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_UE_TLB_RELOAD	(0x6 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL	(0x7 << PPC_BITLSHIFT(45))
-
-/* SRR1 bits for machine check (On Power8) */
-#define P8_SRR1_MC_IFETCH_ERAT_MULTIHIT	(0x4 << PPC_BITLSHIFT(45))
-
-/* DSISR bits for machine check (On Power7 and Power8) */
-#define P7_DSISR_MC_UE			(PPC_BIT(48))	/* P8 too */
-#define P7_DSISR_MC_UE_TABLEWALK	(PPC_BIT(49))	/* P8 too */
-#define P7_DSISR_MC_ERAT_MULTIHIT	(PPC_BIT(52))	/* P8 too */
-#define P7_DSISR_MC_TLB_MULTIHIT_MFTLB	(PPC_BIT(53))	/* P8 too */
-#define P7_DSISR_MC_SLB_PARITY_MFSLB	(PPC_BIT(55))	/* P8 too */
-#define P7_DSISR_MC_SLB_MULTIHIT	(PPC_BIT(56))	/* P8 too */
-#define P7_DSISR_MC_SLB_MULTIHIT_PARITY	(PPC_BIT(57))	/* P8 too */
-
-/*
- * DSISR bits for machine check (Power8) in addition to above.
- * Secondary DERAT Multihit
- */
-#define P8_DSISR_MC_ERAT_MULTIHIT_SEC	(PPC_BIT(54))
-
-/* SLB error bits */
-#define P7_DSISR_MC_SLB_ERRORS		(P7_DSISR_MC_ERAT_MULTIHIT | \
-					 P7_DSISR_MC_SLB_PARITY_MFSLB | \
-					 P7_DSISR_MC_SLB_MULTIHIT | \
-					 P7_DSISR_MC_SLB_MULTIHIT_PARITY)
-
-#define P8_DSISR_MC_SLB_ERRORS		(P7_DSISR_MC_SLB_ERRORS | \
-					 P8_DSISR_MC_ERAT_MULTIHIT_SEC)
-
-/*
- * Machine Check bits on power9
- */
-#define P9_SRR1_MC_LOADSTORE(srr1)	(((srr1) >> PPC_BITLSHIFT(42)) & 1)
-
-#define P9_SRR1_MC_IFETCH(srr1)	(	\
-	PPC_BITEXTRACT(srr1, 45, 0) |	\
-	PPC_BITEXTRACT(srr1, 44, 1) |	\
-	PPC_BITEXTRACT(srr1, 43, 2) |	\
-	PPC_BITEXTRACT(srr1, 36, 3) )
-
-/* 0 is reserved */
-#define P9_SRR1_MC_IFETCH_UE				1
-#define P9_SRR1_MC_IFETCH_SLB_PARITY			2
-#define P9_SRR1_MC_IFETCH_SLB_MULTIHIT			3
-#define P9_SRR1_MC_IFETCH_ERAT_MULTIHIT			4
-#define P9_SRR1_MC_IFETCH_TLB_MULTIHIT			5
-#define P9_SRR1_MC_IFETCH_UE_TLB_RELOAD			6
-/* 7 is reserved */
-#define P9_SRR1_MC_IFETCH_LINK_TIMEOUT			8
-#define P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT	9
-/* 10 ? */
-#define P9_SRR1_MC_IFETCH_RA			11
-#define P9_SRR1_MC_IFETCH_RA_TABLEWALK		12
-#define P9_SRR1_MC_IFETCH_RA_ASYNC_STORE		13
-#define P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT	14
-#define P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN	15
-
-/* DSISR bits for machine check (On Power9) */
-#define P9_DSISR_MC_UE					(PPC_BIT(48))
-#define P9_DSISR_MC_UE_TABLEWALK			(PPC_BIT(49))
-#define P9_DSISR_MC_LINK_LOAD_TIMEOUT			(PPC_BIT(50))
-#define P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT		(PPC_BIT(51))
-#define P9_DSISR_MC_ERAT_MULTIHIT			(PPC_BIT(52))
-#define P9_DSISR_MC_TLB_MULTIHIT_MFTLB			(PPC_BIT(53))
-#define P9_DSISR_MC_USER_TLBIE				(PPC_BIT(54))
-#define P9_DSISR_MC_SLB_PARITY_MFSLB			(PPC_BIT(55))
-#define P9_DSISR_MC_SLB_MULTIHIT_MFSLB			(PPC_BIT(56))
-#define P9_DSISR_MC_RA_LOAD				(PPC_BIT(57))
-#define P9_DSISR_MC_RA_TABLEWALK			(PPC_BIT(58))
-#define P9_DSISR_MC_RA_TABLEWALK_FOREIGN		(PPC_BIT(59))
-#define P9_DSISR_MC_RA_FOREIGN				(PPC_BIT(60))
-
-/* SLB error bits */
-#define P9_DSISR_MC_SLB_ERRORS		(P9_DSISR_MC_ERAT_MULTIHIT | \
-					 P9_DSISR_MC_SLB_PARITY_MFSLB | \
-					 P9_DSISR_MC_SLB_MULTIHIT_MFSLB)
+#define SRR1_MC_LOADSTORE(srr1)	((srr1) & PPC_BIT(42))
 
 struct mce_ierror_table {
 	unsigned long srr1_mask;
@@ -452,11 +348,12 @@ static const struct mce_derror_table mce_p9_derror_table[] = {
   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
 { 0, false, 0, 0, 0, 0 } };
 
-static void mce_get_ierror(struct pt_regs *regs,
+static int mce_handle_ierror(struct pt_regs *regs,
 		const struct mce_ierror_table table[],
 		struct mce_error_info *mce_err, uint64_t *addr)
 {
 	uint64_t srr1 = regs->msr;
+	int handled = 0;
 	int i;
 
 	*addr = 0;
@@ -465,6 +362,20 @@ static void mce_get_ierror(struct pt_regs *regs,
 		if ((srr1 & table[i].srr1_mask) != table[i].srr1_value)
 			continue;
 
+		/* attempt to correct the error */
+		switch (table[i].error_type) {
+		case MCE_ERROR_TYPE_SLB:
+			handled = mce_flush(MCE_FLUSH_SLB);
+			break;
+		case MCE_ERROR_TYPE_ERAT:
+			handled = mce_flush(MCE_FLUSH_ERAT);
+			break;
+		case MCE_ERROR_TYPE_TLB:
+			handled = mce_flush(MCE_FLUSH_TLB);
+			break;
+		}
+
+		/* now fill in mce_error_info */
 		mce_err->error_type = table[i].error_type;
 		switch (table[i].error_type) {
 		case MCE_ERROR_TYPE_UE:
@@ -493,19 +404,23 @@ static void mce_get_ierror(struct pt_regs *regs,
 		mce_err->initiator = table[i].initiator;
 		if (table[i].nip_valid)
 			*addr = regs->nip;
-		return;
+		return handled;
 	}
 
 	mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
 	mce_err->severity = MCE_SEV_ERROR_SYNC;
 	mce_err->initiator = MCE_INITIATOR_CPU;
+
+	return 0;
 }
 
-static void mce_get_derror(struct pt_regs *regs,
+static int mce_handle_derror(struct pt_regs *regs,
 		const struct mce_derror_table table[],
 		struct mce_error_info *mce_err, uint64_t *addr)
 {
 	uint64_t dsisr = regs->dsisr;
+	int handled = 0;
+	int found = 0;
 	int i;
 
 	*addr = 0;
@@ -514,6 +429,31 @@ static void mce_get_derror(struct pt_regs *regs,
 		if (!(dsisr & table[i].dsisr_value))
 			continue;
 
+		/* attempt to correct the error */
+		switch (table[i].error_type) {
+		case MCE_ERROR_TYPE_SLB:
+			if (mce_flush(MCE_FLUSH_SLB))
+				handled = 1;
+			break;
+		case MCE_ERROR_TYPE_ERAT:
+			if (mce_flush(MCE_FLUSH_ERAT))
+				handled = 1;
+			break;
+		case MCE_ERROR_TYPE_TLB:
+			if (mce_flush(MCE_FLUSH_TLB))
+				handled = 1;
+			break;
+		}
+
+		/*
+		 * Attempt to handle multiple conditions, but only return
+		 * one. Ensure uncorrectable errors are first in the table
+		 * to match.
+		 */
+		if (found)
+			continue;
+
+		/* now fill in mce_error_info */
 		mce_err->error_type = table[i].error_type;
 		switch (table[i].error_type) {
 		case MCE_ERROR_TYPE_UE:
@@ -543,12 +483,17 @@ static void mce_get_derror(struct pt_regs *regs,
 		if (table[i].dar_valid)
 			*addr = regs->dar;
 
-		return;
+		found = 1;
 	}
 
+	if (found)
+		return handled;
+
 	mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
 	mce_err->severity = MCE_SEV_ERROR_SYNC;
 	mce_err->initiator = MCE_INITIATOR_CPU;
+
+	return 0;
 }
 
 static long mce_handle_ue_error(struct pt_regs *regs)
@@ -569,167 +514,42 @@ static long mce_handle_ue_error(struct pt_regs *regs)
 	return handled;
 }
 
-static long mce_handle_derror_p7(uint64_t dsisr)
-{
-	return mce_handle_flush_derrors(dsisr,
-			P7_DSISR_MC_SLB_ERRORS,
-			P7_DSISR_MC_TLB_MULTIHIT_MFTLB,
-			0);
-}
-
-static long mce_handle_ierror_p7(uint64_t srr1)
-{
-	switch (P7_SRR1_MC_IFETCH(srr1)) {
-	case P7_SRR1_MC_IFETCH_SLB_PARITY:
-	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
-	case P7_SRR1_MC_IFETCH_SLB_BOTH:
-		return mce_flush(MCE_FLUSH_SLB);
-
-	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
-		return mce_flush(MCE_FLUSH_TLB);
-	default:
-		return 0;
-	}
-}
-
-long __machine_check_early_realmode_p7(struct pt_regs *regs)
+static long mce_handle_error(struct pt_regs *regs,
+		const struct mce_derror_table dtable[],
+		const struct mce_ierror_table itable[])
 {
-	uint64_t srr1, nip, addr;
-	long handled = 1;
-	struct mce_error_info mce_error_info = { 0 };
-
-	srr1 = regs->msr;
-	nip = regs->nip;
+	struct mce_error_info mce_err = { 0 };
+	uint64_t addr;
+	uint64_t srr1 = regs->msr;
+	long handled;
 
-	/* P7 DD1 leaves top bits of DSISR undefined */
-	regs->dsisr &= 0x0000ffff;
+	if (SRR1_MC_LOADSTORE(srr1))
+		handled = mce_handle_derror(regs, dtable, &mce_err, &addr);
+	else
+		handled = mce_handle_ierror(regs, itable, &mce_err, &addr);
 
-	/*
-	 * Handle memory errors depending whether this was a load/store or
-	 * ifetch exception. Also, populate the mce error_type and
-	 * type-specific error_type from either SRR1 or DSISR, depending
-	 * whether this was a load/store or ifetch exception
-	 */
-	if (P7_SRR1_MC_LOADSTORE(srr1)) {
-		handled = mce_handle_derror_p7(regs->dsisr);
-		mce_get_derror(regs, mce_p7_derror_table,
-				&mce_error_info, &addr);
-	} else {
-		handled = mce_handle_ierror_p7(srr1);
-		mce_get_ierror(regs, mce_p7_ierror_table,
-				&mce_error_info, &addr);
-	}
-
-	/* Handle UE error. */
-	if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
+	if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
 		handled = mce_handle_ue_error(regs);
 
-	save_mce_event(regs, handled, &mce_error_info, nip, addr);
-	return handled;
-}
-
-static long mce_handle_ierror_p8(uint64_t srr1)
-{
-	switch (P7_SRR1_MC_IFETCH(srr1)) {
-	case P7_SRR1_MC_IFETCH_SLB_PARITY:
-	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
-	case P8_SRR1_MC_IFETCH_ERAT_MULTIHIT:
-		return mce_flush(MCE_FLUSH_SLB);
-
-	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
-		return mce_flush(MCE_FLUSH_TLB);
-	default:
-		return 0;
-	}
-}
-
-static long mce_handle_derror_p8(uint64_t dsisr)
-{
-	return mce_handle_flush_derrors(dsisr,
-			P8_DSISR_MC_SLB_ERRORS,
-			P7_DSISR_MC_TLB_MULTIHIT_MFTLB,
-			0);
-}
-
-long __machine_check_early_realmode_p8(struct pt_regs *regs)
-{
-	uint64_t srr1, nip, addr;
-	long handled = 1;
-	struct mce_error_info mce_error_info = { 0 };
-
-	srr1 = regs->msr;
-	nip = regs->nip;
-
-	if (P7_SRR1_MC_LOADSTORE(srr1)) {
-		handled = mce_handle_derror_p8(regs->dsisr);
-		mce_get_derror(regs, mce_p8_derror_table,
-				&mce_error_info, &addr);
-	} else {
-		handled = mce_handle_ierror_p8(srr1);
-		mce_get_ierror(regs, mce_p8_ierror_table,
-				&mce_error_info, &addr);
-	}
-
-	/* Handle UE error. */
-	if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
-		handled = mce_handle_ue_error(regs);
+	save_mce_event(regs, handled, &mce_err, regs->nip, addr);
 
-	save_mce_event(regs, handled, &mce_error_info, nip, addr);
 	return handled;
 }
 
-static int mce_handle_derror_p9(struct pt_regs *regs)
+long __machine_check_early_realmode_p7(struct pt_regs *regs)
 {
-	uint64_t dsisr = regs->dsisr;
-
-	return mce_handle_flush_derrors(dsisr,
-			P9_DSISR_MC_SLB_PARITY_MFSLB |
-			P9_DSISR_MC_SLB_MULTIHIT_MFSLB,
-
-			P9_DSISR_MC_TLB_MULTIHIT_MFTLB,
+	/* P7 DD1 leaves top bits of DSISR undefined */
+	regs->dsisr &= 0x0000ffff;
 
-			P9_DSISR_MC_ERAT_MULTIHIT);
+	return mce_handle_error(regs, mce_p7_derror_table, mce_p7_ierror_table);
 }
 
-static int mce_handle_ierror_p9(struct pt_regs *regs)
+long __machine_check_early_realmode_p8(struct pt_regs *regs)
 {
-	uint64_t srr1 = regs->msr;
-
-	switch (P9_SRR1_MC_IFETCH(srr1)) {
-	case P9_SRR1_MC_IFETCH_SLB_PARITY:
-	case P9_SRR1_MC_IFETCH_SLB_MULTIHIT:
-		return mce_flush(MCE_FLUSH_SLB);
-	case P9_SRR1_MC_IFETCH_TLB_MULTIHIT:
-		return mce_flush(MCE_FLUSH_TLB);
-	case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT:
-		return mce_flush(MCE_FLUSH_ERAT);
-	default:
-		return 0;
-	}
+	return mce_handle_error(regs, mce_p8_derror_table, mce_p8_ierror_table);
 }
 
 long __machine_check_early_realmode_p9(struct pt_regs *regs)
 {
-	uint64_t nip, addr;
-	long handled;
-	struct mce_error_info mce_error_info = { 0 };
-
-	nip = regs->nip;
-
-	if (P9_SRR1_MC_LOADSTORE(regs->msr)) {
-		handled = mce_handle_derror_p9(regs);
-		mce_get_derror(regs, mce_p9_derror_table,
-				&mce_error_info, &addr);
-	} else {
-		handled = mce_handle_ierror_p9(regs);
-		mce_get_ierror(regs, mce_p9_ierror_table,
-				&mce_error_info, &addr);
-	}
-
-	/* Handle UE error. */
-	if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
-		handled = mce_handle_ue_error(regs);
-
-	save_mce_event(regs, handled, &mce_error_info, nip, addr);
-	return handled;
+	return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table);
 }
-- 
cgit v1.2.1


From c7e790c5f4c3e923664f45dfbc84492406bef045 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 14 Mar 2017 22:36:48 +1000
Subject: powerpc/64s: POWER8 add missing machine check definitions

POWER8 uses bit 36 in SRR1 like POWER9 for i-side machine checks, and
contains several conditions for link timeouts that are not currently
handled.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/mce_power.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 4529be2b5bf7..de242b4bbd20 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -184,27 +184,33 @@ static const struct mce_ierror_table mce_p7_ierror_table[] = {
 { 0, 0, 0, 0, 0, 0 } };
 
 static const struct mce_ierror_table mce_p8_ierror_table[] = {
-{ 0x00000000001c0000, 0x0000000000040000, true,
+{ 0x00000000081c0000, 0x0000000000040000, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
-{ 0x00000000001c0000, 0x0000000000080000, true,
+{ 0x00000000081c0000, 0x0000000000080000, true,
   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
-{ 0x00000000001c0000, 0x00000000000c0000, true,
+{ 0x00000000081c0000, 0x00000000000c0000, true,
   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
-{ 0x00000000001c0000, 0x0000000000100000, true,
+{ 0x00000000081c0000, 0x0000000000100000, true,
   MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
-{ 0x00000000001c0000, 0x0000000000140000, true,
+{ 0x00000000081c0000, 0x0000000000140000, true,
   MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
-{ 0x00000000001c0000, 0x0000000000180000, true,
+{ 0x00000000081c0000, 0x0000000000180000, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
-{ 0x00000000001c0000, 0x00000000001c0000, true,
+{ 0x00000000081c0000, 0x00000000001c0000, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008000000, true,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008040000, true,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
 { 0, 0, 0, 0, 0, 0 } };
 
 static const struct mce_ierror_table mce_p9_ierror_table[] = {
@@ -289,6 +295,12 @@ static const struct mce_derror_table mce_p8_derror_table[] = {
 { 0x00004000, true,
   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00002000, true,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00001000, true,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
 { 0x00000800, true,
   MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
-- 
cgit v1.2.1


From 6b5c19c55266f6efd10ffac0e9f9f2b7aa420a58 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Wed, 22 Mar 2017 15:21:47 +1100
Subject: powerpc/mmu: Add real mode support for IOMMU preregistered memory

This makes mm_iommu_lookup() able to work in realmode by replacing
list_for_each_entry_rcu() (which can do debug stuff which can fail in
real mode) with list_for_each_entry_lockless().

This adds realmode version of mm_iommu_ua_to_hpa() which adds
explicit vmalloc'd-to-linear address conversion.
Unlike mm_iommu_ua_to_hpa(), mm_iommu_ua_to_hpa_rm() can fail.

This changes mm_iommu_preregistered() to receive @mm as in real mode
@current does not always have a correct pointer.

This adds realmode version of mm_iommu_lookup() which receives @mm
(for the same reason as for mm_iommu_preregistered()) and uses
lockless version of list_for_each_entry_rcu().

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/mmu_context.h |  4 ++++
 arch/powerpc/mm/mmu_context_iommu.c    | 39 ++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index b9e3f0aca261..c70c8272523d 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -29,10 +29,14 @@ extern void mm_iommu_init(struct mm_struct *mm);
 extern void mm_iommu_cleanup(struct mm_struct *mm);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
 		unsigned long ua, unsigned long size);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
+		struct mm_struct *mm, unsigned long ua, unsigned long size);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
 		unsigned long ua, unsigned long entries);
 extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
 		unsigned long ua, unsigned long *hpa);
+extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
+		unsigned long ua, unsigned long *hpa);
 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
 #endif
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 497130c5c742..fc67bd766eaf 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -314,6 +314,25 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
 }
 EXPORT_SYMBOL_GPL(mm_iommu_lookup);
 
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
+		unsigned long ua, unsigned long size)
+{
+	struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+	list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list,
+			next) {
+		if ((mem->ua <= ua) &&
+				(ua + size <= mem->ua +
+				 (mem->entries << PAGE_SHIFT))) {
+			ret = mem;
+			break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm);
+
 struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
 		unsigned long ua, unsigned long entries)
 {
@@ -345,6 +364,26 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
 }
 EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
 
+long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
+		unsigned long ua, unsigned long *hpa)
+{
+	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+	void *va = &mem->hpas[entry];
+	unsigned long *pa;
+
+	if (entry >= mem->entries)
+		return -EFAULT;
+
+	pa = (void *) vmalloc_to_phys(va);
+	if (!pa)
+		return -EFAULT;
+
+	*hpa = *pa | (ua & ~PAGE_MASK);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm);
+
 long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
 {
 	if (atomic64_inc_not_zero(&mem->mapped))
-- 
cgit v1.2.1


From 3b5bf42b81d56085fd58692b5117f69aa77fdff7 Mon Sep 17 00:00:00 2001
From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 16:27:49 -0300
Subject: powerpc/xmon: Fix an unexpected xmon on/off state change

Once xmon is triggered by sysrq-x, it is enabled always afterwards even
if it is disabled during boot. This will cause a system reset interrupt
fail to dump. So keep xmon in its original state after exit.

We have several ways to set xmon on or off.
1) by a build config CONFIG_XMON_DEFAULT.
2) by a boot cmdline with xmon or xmon=early or xmon=on to enable xmon
and xmon=off to disable xmon. This value will override that in step 1.
3) by a debugfs interface, as proposed in this patchset.
And this value can override those in step 1 and 2.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/xmon/xmon.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 16321ad9e70c..a89db1b3f66d 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -76,6 +76,7 @@ static int xmon_gate;
 #endif /* CONFIG_SMP */
 
 static unsigned long in_xmon __read_mostly = 0;
+static int xmon_on = IS_ENABLED(CONFIG_XMON_DEFAULT);
 
 static unsigned long adrs;
 static int size = 1;
@@ -3302,6 +3303,8 @@ static void sysrq_handle_xmon(int key)
 	/* ensure xmon is enabled */
 	xmon_init(1);
 	debugger(get_irq_regs());
+	if (!xmon_on)
+		xmon_init(0);
 }
 
 static struct sysrq_key_op sysrq_xmon_op = {
@@ -3318,7 +3321,7 @@ static int __init setup_xmon_sysrq(void)
 __initcall(setup_xmon_sysrq);
 #endif /* CONFIG_MAGIC_SYSRQ */
 
-static int __initdata xmon_early, xmon_off;
+static int __initdata xmon_early;
 
 static int __init early_parse_xmon(char *p)
 {
@@ -3326,10 +3329,12 @@ static int __init early_parse_xmon(char *p)
 		/* just "xmon" is equivalent to "xmon=early" */
 		xmon_init(1);
 		xmon_early = 1;
-	} else if (strncmp(p, "on", 2) == 0)
+		xmon_on = 1;
+	} else if (strncmp(p, "on", 2) == 0) {
 		xmon_init(1);
-	else if (strncmp(p, "off", 3) == 0)
-		xmon_off = 1;
+		xmon_on = 1;
+	} else if (strncmp(p, "off", 3) == 0)
+		xmon_on = 0;
 	else if (strncmp(p, "nobt", 4) == 0)
 		xmon_no_auto_backtrace = 1;
 	else
@@ -3341,10 +3346,8 @@ early_param("xmon", early_parse_xmon);
 
 void __init xmon_setup(void)
 {
-#ifdef CONFIG_XMON_DEFAULT
-	if (!xmon_off)
+	if (xmon_on)
 		xmon_init(1);
-#endif
 	if (xmon_early)
 		debugger(NULL);
 }
-- 
cgit v1.2.1


From b561783c7bfd4bcb18ad30e1410ee19500c36a7c Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli" <gpiccoli@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 16:27:50 -0300
Subject: powerpc/xmon: drop the nobt option from xmon plus minor fixes

The xmon parameter nobt was added long time ago, by commit 26c8af5f01df
("[POWERPC] print backtrace when entering xmon"). The problem that time
was that during a crash in a machine with USB keyboard, xmon wouldn't
respond to commands from the keyboard, so printing the backtrace wouldn't
be possible.

Idea then was to show automatically the backtrace on xmon crash for the
first time it's invoked (if it recovers, next time xmon won't show
backtrace automatically). The nobt parameter was added _only_ to prevent
this automatic trace show. Seems long time ago USB keyboards didn't work
that well!

We don't need this parameter anymore, the feature of auto showing the
backtrace is interesting (imagine a case of auto-reboot script),
so this patch extends the functionality, by always showing the backtrace
automatically when xmon is invoked; it removes the nobt parameter too.

Also, this patch fixes __initdata placement on xmon_early and replaces
__initcall() with modern device_initcall() on sysrq handler.

Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/xmon/xmon.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index a89db1b3f66d..25a32815f310 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -185,8 +185,6 @@ static void dump_tlb_44x(void);
 static void dump_tlb_book3e(void);
 #endif
 
-static int xmon_no_auto_backtrace;
-
 #ifdef CONFIG_PPC64
 #define REG		"%.16lx"
 #else
@@ -885,10 +883,7 @@ cmds(struct pt_regs *excp)
 	last_cmd = NULL;
 	xmon_regs = excp;
 
-	if (!xmon_no_auto_backtrace) {
-		xmon_no_auto_backtrace = 1;
-		xmon_show_stack(excp->gpr[1], excp->link, excp->nip);
-	}
+	xmon_show_stack(excp->gpr[1], excp->link, excp->nip);
 
 	for(;;) {
 #ifdef CONFIG_SMP
@@ -3318,10 +3313,10 @@ static int __init setup_xmon_sysrq(void)
 	register_sysrq_key('x', &sysrq_xmon_op);
 	return 0;
 }
-__initcall(setup_xmon_sysrq);
+device_initcall(setup_xmon_sysrq);
 #endif /* CONFIG_MAGIC_SYSRQ */
 
-static int __initdata xmon_early;
+static int xmon_early __initdata;
 
 static int __init early_parse_xmon(char *p)
 {
@@ -3335,8 +3330,6 @@ static int __init early_parse_xmon(char *p)
 		xmon_on = 1;
 	} else if (strncmp(p, "off", 3) == 0)
 		xmon_on = 0;
-	else if (strncmp(p, "nobt", 4) == 0)
-		xmon_no_auto_backtrace = 1;
 	else
 		return 1;
 
-- 
cgit v1.2.1


From de78ae6c9ef55d9ded036c8e29d9364e1ce5f6b7 Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli" <gpiccoli@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 16:27:51 -0300
Subject: powerpc/xmon: add debugfs entry for xmon

Currently the xmon debugger is set only via kernel boot command-line.
It's disabled by default, and can be enabled with "xmon=on" on the
command-line. Also, xmon may be accessed via sysrq mechanism.
But we cannot enable/disable xmon in runtime, it needs kernel reload.

This patch introduces a debugfs entry for xmon, allowing user to query
its current state and change it if desired. Basically, the "xmon" file
to read from/write to is under the debugfs mount point, on powerpc
directory. It's a simple attribute, value 0 meaning xmon is disabled
and value 1 the opposite. Writing these states to the file will take
immediate effect in the debugger.

Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/xmon/xmon.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 25a32815f310..0fab9d7349eb 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -29,6 +29,10 @@
 #include <linux/nmi.h>
 #include <linux/ctype.h>
 
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#endif
+
 #include <asm/ptrace.h>
 #include <asm/string.h>
 #include <asm/prom.h>
@@ -3316,6 +3320,33 @@ static int __init setup_xmon_sysrq(void)
 device_initcall(setup_xmon_sysrq);
 #endif /* CONFIG_MAGIC_SYSRQ */
 
+#ifdef CONFIG_DEBUG_FS
+static int xmon_dbgfs_set(void *data, u64 val)
+{
+	xmon_on = !!val;
+	xmon_init(xmon_on);
+
+	return 0;
+}
+
+static int xmon_dbgfs_get(void *data, u64 *val)
+{
+	*val = xmon_on;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(xmon_dbgfs_ops, xmon_dbgfs_get,
+			xmon_dbgfs_set, "%llu\n");
+
+static int __init setup_xmon_dbgfs(void)
+{
+	debugfs_create_file("xmon", 0600, powerpc_debugfs_root, NULL,
+				&xmon_dbgfs_ops);
+	return 0;
+}
+device_initcall(setup_xmon_dbgfs);
+#endif /* CONFIG_DEBUG_FS */
+
 static int xmon_early __initdata;
 
 static int __init early_parse_xmon(char *p)
-- 
cgit v1.2.1


From c3a08e93d66e9ba59ffbfb610af2f76b82200185 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Thu, 23 Mar 2017 18:54:01 +1100
Subject: powerpc/powernv: de-deuplicate OPAL call wrappers

Currently the code to perform an OPAL call is duplicated between the
normal path and path taken when tracepoints are enabled. There's no
real need for this and combining them makes opal_tracepoint_entry
considerably easier to understand.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-wrappers.S | 53 +++++++++++---------------
 1 file changed, 22 insertions(+), 31 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index da8a0f7a035c..ebf6719d241a 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -50,21 +50,13 @@ END_FTR_SECTION(0, 1);						\
 #define OPAL_BRANCH(LABEL)
 #endif
 
-/* TODO:
- *
- * - Trace irqs in/off (needs saving/restoring all args, argh...)
- * - Get r11 feed up by Dave so I can have better register usage
+/*
+ * DO_OPAL_CALL assumes:
+ * r0  = opal call token
+ * r12 = msr
+ * LR has been saved
  */
-
-#define OPAL_CALL(name, token)		\
- _GLOBAL_TOC(name);			\
-	mfmsr	r12;			\
-	mflr	r0;			\
-	andi.	r11,r12,MSR_IR|MSR_DR; 	\
-	std	r0,PPC_LR_STKOFF(r1);	\
-	li	r0,token;		\
-	beq	opal_real_call;         \
-	OPAL_BRANCH(opal_tracepoint_entry) \
+#define DO_OPAL_CALL()			\
 	mfcr	r11;			\
 	stw	r11,8(r1);		\
 	li	r11,0;			\
@@ -83,6 +75,18 @@ END_FTR_SECTION(0, 1);						\
 	mtspr	SPRN_HSRR0,r12;		\
 	hrfid
 
+#define OPAL_CALL(name, token)		\
+ _GLOBAL_TOC(name);			\
+	mfmsr	r12;			\
+	mflr	r0;			\
+	andi.	r11,r12,MSR_IR|MSR_DR; 	\
+	std	r0,PPC_LR_STKOFF(r1);	\
+	li	r0,token;		\
+	beq	opal_real_call;         \
+	OPAL_BRANCH(opal_tracepoint_entry) \
+	DO_OPAL_CALL()
+
+
 opal_return:
 	/*
 	 * Fixup endian on OPAL return... we should be able to simplify
@@ -148,26 +152,13 @@ opal_tracepoint_entry:
 	ld	r8,STK_REG(R29)(r1)
 	ld	r9,STK_REG(R30)(r1)
 	ld	r10,STK_REG(R31)(r1)
+
+	/* setup LR so we return via tracepoint_return */
 	LOAD_REG_ADDR(r11,opal_tracepoint_return)
-	mfcr	r12
 	std	r11,16(r1)
-	stw	r12,8(r1)
-	li	r11,0
+
 	mfmsr	r12
-	ori	r11,r11,MSR_EE
-	std	r12,PACASAVEDMSR(r13)
-	andc	r12,r12,r11
-	mtmsrd	r12,1
-	LOAD_REG_ADDR(r11,opal_return)
-	mtlr	r11
-	li	r11,MSR_DR|MSR_IR|MSR_LE
-	andc	r12,r12,r11
-	mtspr	SPRN_HSRR1,r12
-	LOAD_REG_ADDR(r11,opal)
-	ld	r12,8(r11)
-	ld	r2,0(r11)
-	mtspr	SPRN_HSRR0,r12
-	hrfid
+	DO_OPAL_CALL()
 
 opal_tracepoint_return:
 	std	r3,STK_REG(R31)(r1)
-- 
cgit v1.2.1


From 517c27570cf38f182e7a688d50a9b978333f8ea8 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Fri, 24 Mar 2017 21:20:56 +1100
Subject: powerpc/powernv: Fix XSCOM address mangling for form 1 indirect

POWER9 adds form 1 scoms. The form of the indirection is specified in
the top nibble of the scom address.

Currently we do some (ugly) bit mangling so that we can fit a 64 bit
scom address into the debugfs interface. The current code only shifts
the top bit (indirect bit).

This patch changes it to shift the whole top nibble so that the form
of the indirection is also shifted.

This patch is backwards compatible with older scoms.

(This change isn't required in the arch/powerpc/platforms/powernv/opal-prd.c
scom interface as it passes the whole 64bit scom address without any bit
mangling)

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-xscom.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index d0ac535cf5d7..28651fb25417 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -73,25 +73,32 @@ static int opal_xscom_err_xlate(int64_t rc)
 
 static u64 opal_scom_unmangle(u64 addr)
 {
+	u64 tmp;
+
 	/*
-	 * XSCOM indirect addresses have the top bit set. Additionally
-	 * the rest of the top 3 nibbles is always 0.
+	 * XSCOM addresses use the top nibble to set indirect mode and
+	 * its form.  Bits 4-11 are always 0.
 	 *
 	 * Because the debugfs interface uses signed offsets and shifts
 	 * the address left by 3, we basically cannot use the top 4 bits
 	 * of the 64-bit address, and thus cannot use the indirect bit.
 	 *
-	 * To deal with that, we support the indirect bit being in bit
-	 * 4 (IBM notation) instead of bit 0 in this API, we do the
-	 * conversion here. To leave room for further xscom address
-	 * expansion, we only clear out the top byte
+	 * To deal with that, we support the indirect bits being in
+	 * bits 4-7 (IBM notation) instead of bit 0-3 in this API, we
+	 * do the conversion here.
 	 *
-	 * For in-kernel use, we also support the real indirect bit, so
-	 * we test for any of the top 5 bits
+	 * For in-kernel use, we don't need to do this mangling.  In
+	 * kernel won't have bits 4-7 set.
 	 *
+	 * So:
+	 *   debugfs will always   set 0-3 = 0 and clear 4-7
+	 *    kernel will always clear 0-3 = 0 and   set 4-7
 	 */
-	if (addr & (0x1full << 59))
-		addr = (addr & ~(0xffull << 56)) | (1ull << 63);
+	tmp = addr;
+	tmp  &= 0x0f00000000000000;
+	addr &= 0xf0ffffffffffffff;
+	addr |= tmp << 4;
+
 	return addr;
 }
 
-- 
cgit v1.2.1


From 4ce410d65a4703aa3fc29c8a1f172e829892f39c Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 23 Mar 2017 08:21:59 +1100
Subject: powerpc/configs: Re-enable ISO9660_FS as a built-in in 64 bit configs

It turns out cloud-config uses ISO9660 filesystems to inject
configuration data into cloud images. The cloud-config failures when
ISO9660_FS is not enabled are cryptic, and building it in makes
mainline testing easier, so re-enable it.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/configs/powernv_defconfig | 2 +-
 arch/powerpc/configs/ppc64_defconfig   | 2 +-
 arch/powerpc/configs/pseries_defconfig | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index ac8b8332ed82..eb78c741e944 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -261,7 +261,7 @@ CONFIG_NILFS2_FS=m
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=m
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index 4f1288b04303..bdca32e2b639 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -291,7 +291,7 @@ CONFIG_NILFS2_FS=m
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=m
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 4ff68b752618..cd26091c86ca 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -259,7 +259,7 @@ CONFIG_NILFS2_FS=m
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=m
-- 
cgit v1.2.1


From 4e215d2584b2e5b9448924f374f60ad2734144d8 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 23 Mar 2017 08:22:00 +1100
Subject: powerpc/configs: Make oprofile a module

Most people use perf these days, so save about 31kB by making oprofile
a module.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/configs/powernv_defconfig | 2 +-
 arch/powerpc/configs/ppc64_defconfig   | 2 +-
 arch/powerpc/configs/pseries_defconfig | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index eb78c741e944..4926d7f3d8da 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -33,7 +33,7 @@ CONFIG_BLK_DEV_INITRD=y
 CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 CONFIG_KPROBES=y
 CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index bdca32e2b639..dfac33cef5ae 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -19,7 +19,7 @@ CONFIG_BLK_DEV_INITRD=y
 CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 CONFIG_KPROBES=y
 CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index cd26091c86ca..47f72c86af4f 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -34,7 +34,7 @@ CONFIG_BLK_DEV_INITRD=y
 CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 CONFIG_KPROBES=y
 CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
-- 
cgit v1.2.1


From 1127d5f7fdcd99c28f2b70e12578372c330c8fa1 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 23 Mar 2017 08:22:01 +1100
Subject: powerpc/configs: Re-enable POWER8 crc32c

The config option for the POWER8 crc32c recently changed from
CONFIG_CRYPT_CRC32C_VPMSUM to CONFIG_CRYPTO_CRC32C_VPMSUM. Update
the configs.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/configs/powernv_defconfig | 2 +-
 arch/powerpc/configs/ppc64_defconfig   | 2 +-
 arch/powerpc/configs/pseries_defconfig | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index 4926d7f3d8da..0695ce047d56 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -306,7 +306,7 @@ CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CCM=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
 CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA256=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index dfac33cef5ae..e353168f98a7 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -340,7 +340,7 @@ CONFIG_PPC_EARLY_DEBUG=y
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
 CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA256=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 47f72c86af4f..1a61aa20dfba 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -303,7 +303,7 @@ CONFIG_XMON=y
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
 CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA256=y
-- 
cgit v1.2.1


From 5511a45fc134f0784c403ef3488e2c07cd15bf14 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 21 Mar 2017 16:24:38 +1100
Subject: powerpc/64: Don't use early_cpu_has_feature() in
 cpu_ready_for_interrupts()

cpu_ready_for_interrupts() is called after feature patching, so there's
no need to use early_cpu_has_feature().

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/setup_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 9cfaa8b69b5f..729e990a019d 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -230,8 +230,8 @@ static void cpu_ready_for_interrupts(void)
 	 * If we are not in hypervisor mode the job is done once for
 	 * the whole partition in configure_exceptions().
 	 */
-	if (early_cpu_has_feature(CPU_FTR_HVMODE) &&
-	    early_cpu_has_feature(CPU_FTR_ARCH_207S)) {
+	if (cpu_has_feature(CPU_FTR_HVMODE) &&
+	    cpu_has_feature(CPU_FTR_ARCH_207S)) {
 		unsigned long lpcr = mfspr(SPRN_LPCR);
 		mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
 	}
-- 
cgit v1.2.1


From 4f9b514b765a3057341f3236c94877d9413babc7 Mon Sep 17 00:00:00 2001
From: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
Date: Mon, 27 Mar 2017 16:32:33 -0300
Subject: powerpc: Make /proc/self/stack always print the current stack

For the current task, the kernel stack would only tell the last time the
process was rescheduled, if ever. Use the current stack pointer for the
current task.

Otherwise, every once in a while, the stacktrace printed when reading
/proc/self/stack would look like the process is running in userspace,
while it's not, which some may consider as a bug.

This is also consistent with some other architectures, like x86 and arm,
at least.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/stacktrace.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index 66711958493c..d534ed901538 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -59,7 +59,14 @@ EXPORT_SYMBOL_GPL(save_stack_trace);
 
 void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 {
-	save_context_stack(trace, tsk->thread.ksp, tsk, 0);
+	unsigned long sp;
+
+	if (tsk == current)
+		sp = current_stack_pointer();
+	else
+		sp = tsk->thread.ksp;
+
+	save_context_stack(trace, sp, tsk, 0);
 }
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
 
-- 
cgit v1.2.1


From a540aa56ba3d29084f28710c8b93cc9c3c422943 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Wed, 22 Mar 2017 15:21:48 +1100
Subject: powerpc/powernv/iommu: Add real mode version of
 iommu_table_ops::exchange()

In real mode, TCE tables are invalidated using special
cache-inhibited store instructions which are not available in
virtual mode

This defines and implements exchange_rm() callback. This does not
define set_rm/clear_rm/flush_rm callbacks as there is no user for those -
exchange/exchange_rm are only to be used by KVM for VFIO.

The exchange_rm callback is defined for IODA1/IODA2 powernv platforms.

This replaces list_for_each_entry_rcu with its lockless version as
from now on pnv_pci_ioda2_tce_invalidate() can be called in
the real mode too.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/iommu.h          |  7 +++++++
 arch/powerpc/kernel/iommu.c               | 25 +++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci-ioda.c | 26 +++++++++++++++++++++++++-
 3 files changed, 57 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 2c1d50792944..4554699aec02 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -64,6 +64,11 @@ struct iommu_table_ops {
 			long index,
 			unsigned long *hpa,
 			enum dma_data_direction *direction);
+	/* Real mode */
+	int (*exchange_rm)(struct iommu_table *tbl,
+			long index,
+			unsigned long *hpa,
+			enum dma_data_direction *direction);
 #endif
 	void (*clear)(struct iommu_table *tbl,
 			long index, long npages);
@@ -208,6 +213,8 @@ extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
 extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
 		unsigned long *hpa, enum dma_data_direction *direction);
+extern long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *hpa, enum dma_data_direction *direction);
 #else
 static inline void iommu_register_group(struct iommu_table_group *table_group,
 					int pci_domain_number,
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 5f202a566ec5..685a4767b722 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1004,6 +1004,31 @@ long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_xchg);
 
+#ifdef CONFIG_PPC_BOOK3S_64
+long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret;
+
+	ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+
+	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
+			(*direction == DMA_BIDIRECTIONAL))) {
+		struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT);
+
+		if (likely(pg)) {
+			SetPageDirty(pg);
+		} else {
+			tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+			ret = -EFAULT;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm);
+#endif
+
 int iommu_take_ownership(struct iommu_table *tbl)
 {
 	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index e36738291c32..572e9c9f1ea0 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1860,6 +1860,17 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
 
 	return ret;
 }
+
+static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+
+	if (!ret)
+		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
+
+	return ret;
+}
 #endif
 
 static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
@@ -1874,6 +1885,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 	.set = pnv_ioda1_tce_build,
 #ifdef CONFIG_IOMMU_API
 	.exchange = pnv_ioda1_tce_xchg,
+	.exchange_rm = pnv_ioda1_tce_xchg_rm,
 #endif
 	.clear = pnv_ioda1_tce_free,
 	.get = pnv_tce_get,
@@ -1948,7 +1960,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 {
 	struct iommu_table_group_link *tgl;
 
-	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+	list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
 		struct pnv_ioda_pe *pe = container_of(tgl->table_group,
 				struct pnv_ioda_pe, table_group);
 		struct pnv_phb *phb = pe->phb;
@@ -2004,6 +2016,17 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
 
 	return ret;
 }
+
+static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+
+	if (!ret)
+		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
+
+	return ret;
+}
 #endif
 
 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
@@ -2024,6 +2047,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 	.set = pnv_ioda2_tce_build,
 #ifdef CONFIG_IOMMU_API
 	.exchange = pnv_ioda2_tce_xchg,
+	.exchange_rm = pnv_ioda2_tce_xchg_rm,
 #endif
 	.clear = pnv_ioda2_tce_free,
 	.get = pnv_tce_get,
-- 
cgit v1.2.1


From 11edf116e3a6352cfee6b1437d41603c9aff79c9 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Wed, 22 Mar 2017 15:21:49 +1100
Subject: powerpc/iommu/vfio_spapr_tce: Cleanup iommu_table disposal

At the moment iommu_table can be disposed by either calling
iommu_table_free() directly or it_ops::free(); the only implementation
of free() is in IODA2 - pnv_ioda2_table_free() - and it calls
iommu_table_free() anyway.

As we are going to have reference counting on tables, we need an unified
way of disposing tables.

This moves it_ops::free() call into iommu_free_table() and makes use
of the latter. The free() callback now handles only platform-specific
data.

As from now on the iommu_free_table() calls it_ops->free(), we need
to have it_ops initialized before calling iommu_free_table() so this
moves this initialization in pnv_pci_ioda2_create_table().

This should cause no behavioral change.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/iommu.c               |  4 ++++
 arch/powerpc/platforms/powernv/pci-ioda.c | 10 ++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 685a4767b722..a3689fdedd4a 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -719,6 +719,9 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 	if (!tbl)
 		return;
 
+	if (tbl->it_ops->free)
+		tbl->it_ops->free(tbl);
+
 	if (!tbl->it_map) {
 		kfree(tbl);
 		return;
@@ -745,6 +748,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 	/* free table */
 	kfree(tbl);
 }
+EXPORT_SYMBOL_GPL(iommu_free_table);
 
 /* Creates TCEs for a user provided buffer.  The user buffer must be
  * contiguous real kernel storage (not vmalloc).  The address passed here
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 572e9c9f1ea0..5dae54cb11e3 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1424,7 +1424,6 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 		iommu_group_put(pe->table_group.group);
 		BUG_ON(pe->table_group.group);
 	}
-	pnv_pci_ioda2_table_free_pages(tbl);
 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
 }
 
@@ -2040,7 +2039,6 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
 static void pnv_ioda2_table_free(struct iommu_table *tbl)
 {
 	pnv_pci_ioda2_table_free_pages(tbl);
-	iommu_free_table(tbl, "pnv");
 }
 
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
@@ -2317,6 +2315,8 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
 	if (!tbl)
 		return -ENOMEM;
 
+	tbl->it_ops = &pnv_ioda2_iommu_ops;
+
 	ret = pnv_pci_ioda2_table_alloc_pages(nid,
 			bus_offset, page_shift, window_size,
 			levels, tbl);
@@ -2325,8 +2325,6 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
 		return ret;
 	}
 
-	tbl->it_ops = &pnv_ioda2_iommu_ops;
-
 	*ptbl = tbl;
 
 	return 0;
@@ -2367,7 +2365,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	if (rc) {
 		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
 				rc);
-		pnv_ioda2_table_free(tbl);
+		iommu_free_table(tbl, "");
 		return rc;
 	}
 
@@ -2455,7 +2453,7 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
 	pnv_pci_ioda2_unset_window(&pe->table_group, 0);
 	if (pe->pbus)
 		pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
-	pnv_ioda2_table_free(tbl);
+	iommu_free_table(tbl, "pnv");
 }
 
 static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
-- 
cgit v1.2.1


From e5afdf9dd515a9446c009f44f99f9bc2f91b89a7 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Wed, 22 Mar 2017 15:21:50 +1100
Subject: powerpc/vfio_spapr_tce: Add reference counting to iommu_table

So far iommu_table obejcts were only used in virtual mode and had
a single owner. We are going to change this by implementing in-kernel
acceleration of DMA mapping requests. The proposed acceleration
will handle requests in real mode and KVM will keep references to tables.

This adds a kref to iommu_table and defines new helpers to update it.
This replaces iommu_free_table() with iommu_tce_table_put() and makes
iommu_free_table() static. iommu_tce_table_get() is not used in this patch
but it will be in the following patch.

Since this touches prototypes, this also removes @node_name parameter as
it has never been really useful on powernv and carrying it for
the pseries platform code to iommu_free_table() seems to be quite
useless as well.

This should cause no behavioral change.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/iommu.h          |  5 +++--
 arch/powerpc/kernel/iommu.c               | 27 ++++++++++++++++++++++-----
 arch/powerpc/platforms/powernv/pci-ioda.c | 14 +++++++-------
 arch/powerpc/platforms/powernv/pci.c      |  1 +
 arch/powerpc/platforms/pseries/iommu.c    |  3 ++-
 arch/powerpc/platforms/pseries/vio.c      |  2 +-
 6 files changed, 36 insertions(+), 16 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 4554699aec02..d96142572e6d 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -119,6 +119,7 @@ struct iommu_table {
 	struct list_head it_group_list;/* List of iommu_table_group_link */
 	unsigned long *it_userspace; /* userspace view of the table */
 	struct iommu_table_ops *it_ops;
+	struct kref    it_kref;
 };
 
 #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
@@ -151,8 +152,8 @@ static inline void *get_iommu_table_base(struct device *dev)
 
 extern int dma_iommu_dma_supported(struct device *dev, u64 mask);
 
-/* Frees table for an individual device node */
-extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
+extern struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl);
+extern int iommu_tce_table_put(struct iommu_table *tbl);
 
 /* Initializes an iommu_table based in values set in the passed-in
  * structure
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index a3689fdedd4a..5a3231fedf08 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -711,13 +711,13 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	return tbl;
 }
 
-void iommu_free_table(struct iommu_table *tbl, const char *node_name)
+static void iommu_table_free(struct kref *kref)
 {
 	unsigned long bitmap_sz;
 	unsigned int order;
+	struct iommu_table *tbl;
 
-	if (!tbl)
-		return;
+	tbl = container_of(kref, struct iommu_table, it_kref);
 
 	if (tbl->it_ops->free)
 		tbl->it_ops->free(tbl);
@@ -736,7 +736,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 
 	/* verify that table contains no entries */
 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
-		pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
+		pr_warn("%s: Unexpected TCEs\n", __func__);
 
 	/* calculate bitmap size in bytes */
 	bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
@@ -748,7 +748,24 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 	/* free table */
 	kfree(tbl);
 }
-EXPORT_SYMBOL_GPL(iommu_free_table);
+
+struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
+{
+	if (kref_get_unless_zero(&tbl->it_kref))
+		return tbl;
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_table_get);
+
+int iommu_tce_table_put(struct iommu_table *tbl)
+{
+	if (WARN_ON(!tbl))
+		return 0;
+
+	return kref_put(&tbl->it_kref, iommu_table_free);
+}
+EXPORT_SYMBOL_GPL(iommu_tce_table_put);
 
 /* Creates TCEs for a user provided buffer.  The user buffer must be
  * contiguous real kernel storage (not vmalloc).  The address passed here
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 5dae54cb11e3..ee4cdb5b893f 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1424,7 +1424,7 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 		iommu_group_put(pe->table_group.group);
 		BUG_ON(pe->table_group.group);
 	}
-	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
+	iommu_tce_table_put(tbl);
 }
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
@@ -2225,7 +2225,7 @@ found:
 		__free_pages(tce_mem, get_order(tce32_segsz * segs));
 	if (tbl) {
 		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
-		iommu_free_table(tbl, "pnv");
+		iommu_tce_table_put(tbl);
 	}
 }
 
@@ -2321,7 +2321,7 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
 			bus_offset, page_shift, window_size,
 			levels, tbl);
 	if (ret) {
-		iommu_free_table(tbl, "pnv");
+		iommu_tce_table_put(tbl);
 		return ret;
 	}
 
@@ -2365,7 +2365,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	if (rc) {
 		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
 				rc);
-		iommu_free_table(tbl, "");
+		iommu_tce_table_put(tbl);
 		return rc;
 	}
 
@@ -2453,7 +2453,7 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
 	pnv_pci_ioda2_unset_window(&pe->table_group, 0);
 	if (pe->pbus)
 		pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
-	iommu_free_table(tbl, "pnv");
+	iommu_tce_table_put(tbl);
 }
 
 static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
@@ -3428,7 +3428,7 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
 	}
 
 	free_pages(tbl->it_base, get_order(tbl->it_size << 3));
-	iommu_free_table(tbl, "pnv");
+	iommu_tce_table_put(tbl);
 }
 
 static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
@@ -3455,7 +3455,7 @@ static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
 	}
 
 	pnv_pci_ioda2_table_free_pages(tbl);
-	iommu_free_table(tbl, "pnv");
+	iommu_tce_table_put(tbl);
 }
 
 static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index eb835e977e33..204a829ff506 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -767,6 +767,7 @@ struct iommu_table *pnv_pci_table_alloc(int nid)
 
 	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid);
 	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+	kref_init(&tbl->it_kref);
 
 	return tbl;
 }
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 4d757eaa46bf..7ce5db209abf 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -74,6 +74,7 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 		goto fail_exit;
 
 	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+	kref_init(&tbl->it_kref);
 	tgl->table_group = table_group;
 	list_add_rcu(&tgl->next, &tbl->it_group_list);
 
@@ -115,7 +116,7 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group,
 		BUG_ON(table_group->group);
 	}
 #endif
-	iommu_free_table(tbl, node_name);
+	iommu_tce_table_put(tbl);
 
 	kfree(table_group);
 }
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 720493932486..28b09fd797ec 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -1318,7 +1318,7 @@ static void vio_dev_release(struct device *dev)
 	struct iommu_table *tbl = get_iommu_table_base(dev);
 
 	if (tbl)
-		iommu_free_table(tbl, of_node_full_name(dev->of_node));
+		iommu_tce_table_put(tbl);
 	of_node_put(dev->of_node);
 	kfree(to_vio_dev(dev));
 }
-- 
cgit v1.2.1


From 17bb69515c6dfbe282e8a9df08ffb719ff4dbc73 Mon Sep 17 00:00:00 2001
From: Vipin K Parashar <vipin@linux.vnet.ibm.com>
Date: Fri, 10 Mar 2017 17:27:32 +0530
Subject: powerpc/powernv: Handle OPAL_WRONG_STATE in opal_get_sensor_data()

OPAL returns OPAL_WRONG_STATE upon failing to provide sensor data due to
core sleeping/offline. Add a check in opal_get_sensor_data() for sensor
read failure with OPAL_WRONG_STATE return code and return -EIO.

Signed-off-by: Vipin K Parashar <vipin@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-sensor.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c
index 308efd170c27..aa267f120033 100644
--- a/arch/powerpc/platforms/powernv/opal-sensor.c
+++ b/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -64,6 +64,10 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
 		*sensor_data = be32_to_cpu(data);
 		break;
 
+	case OPAL_WRONG_STATE:
+		ret = -EIO;
+		break;
+
 	default:
 		ret = opal_error_code(ret);
 		break;
-- 
cgit v1.2.1


From f6e6bedb773118713a8eb3736a4c99c71144275e Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 02:35:26 +0530
Subject: powerpc/fadump: Reserve memory at an offset closer to bottom of RAM

Currently, the area to preserve boot memory is reserved at the top of
RAM. This leaves fadump vulnerable to memory hot-remove operations. As
memory for fadump has to be reserved early in the boot process, fadump
can't be registered after a memory hot-remove operation. Though this
problem can't be eleminated completely, the impact can be minimized by
reserving memory at an offset closer to bottom of the RAM. The offset
for fadump memory reservation can be any value greater than fadump boot
memory size.

Signed-off-by: Hari Bathini <hbathini@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/fadump.c | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 8ff0dd4e77a7..33b2da302730 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -319,15 +319,34 @@ int __init fadump_reserve_mem(void)
 		pr_debug("fadumphdr_addr = %p\n",
 				(void *) fw_dump.fadumphdr_addr);
 	} else {
-		/* Reserve the memory at the top of memory. */
 		size = get_fadump_area_size();
-		base = memory_boundary - size;
-		memblock_reserve(base, size);
-		printk(KERN_INFO "Reserved %ldMB of memory at %ldMB "
-				"for firmware-assisted dump\n",
-				(unsigned long)(size >> 20),
-				(unsigned long)(base >> 20));
+
+		/*
+		 * Reserve memory at an offset closer to bottom of the RAM to
+		 * minimize the impact of memory hot-remove operation. We can't
+		 * use memblock_find_in_range() here since it doesn't allocate
+		 * from bottom to top.
+		 */
+		for (base = fw_dump.boot_memory_size;
+		     base <= (memory_boundary - size);
+		     base += size) {
+			if (memblock_is_region_memory(base, size) &&
+			    !memblock_is_region_reserved(base, size))
+				break;
+		}
+		if ((base > (memory_boundary - size)) ||
+		    memblock_reserve(base, size)) {
+			pr_err("Failed to reserve memory\n");
+			return 0;
+		}
+
+		pr_info("Reserved %ldMB of memory at %ldMB for firmware-"
+			"assisted dump (System RAM: %ldMB)\n",
+			(unsigned long)(size >> 20),
+			(unsigned long)(base >> 20),
+			(unsigned long)(memblock_phys_mem_size() >> 20));
 	}
+
 	fw_dump.reserve_dump_area_start = base;
 	fw_dump.reserve_dump_area_size = size;
 	return 1;
-- 
cgit v1.2.1


From adec9a2e7ca3b4193818f9a7b39563851b180711 Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.shi@windriver.com>
Date: Tue, 26 Apr 2016 09:49:38 -0700
Subject: powerpc/4xx: Make sam440ep_setup_rtc() init

sam440ep_setup_rtc() is just called by machine_device_initcall() so make
it __init.

Signed-off-by: Yang Shi <yang.shi@windriver.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/44x/sam440ep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/44x/sam440ep.c b/arch/powerpc/platforms/44x/sam440ep.c
index 688ffeab0699..55fed5e4de14 100644
--- a/arch/powerpc/platforms/44x/sam440ep.c
+++ b/arch/powerpc/platforms/44x/sam440ep.c
@@ -70,7 +70,7 @@ static struct i2c_board_info sam440ep_rtc_info = {
 	.irq = -1,
 };
 
-static int sam440ep_setup_rtc(void)
+static int __init sam440ep_setup_rtc(void)
 {
 	return i2c_register_board_info(0, &sam440ep_rtc_info, 1);
 }
-- 
cgit v1.2.1


From b42279f0165cdfb093718ec368e56dde7e8d80af Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 21 Mar 2017 22:59:51 +0530
Subject: powerpc/mm/nohash: MM_SLICE is only used by book3s 64

BOOKE code is dead code as per the Kconfig details. So make it simpler
by enabling MM_SLICE only for book3s_64. The changes w.r.t nohash is just
removing deadcode. W.r.t ppc64, 4k without hugetlb will now enable MM_SLICE.
But that is good, because we reduce one extra variant which probably is not
getting tested much.

Reviewed-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/mmu-book3e.h        | 5 -----
 arch/powerpc/include/asm/nohash/64/pgtable.h | 5 -----
 arch/powerpc/mm/hugetlbpage-book3e.c         | 7 -------
 arch/powerpc/mm/mmu_context_nohash.c         | 5 -----
 arch/powerpc/platforms/Kconfig.cputype       | 2 +-
 5 files changed, 1 insertion(+), 23 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index b62a8d43a06c..7ca8d8e80ffa 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -229,11 +229,6 @@ typedef struct {
 	unsigned int	id;
 	unsigned int	active;
 	unsigned long	vdso_base;
-#ifdef CONFIG_PPC_MM_SLICES
-	u64 low_slices_psize;   /* SLB page size encodings */
-	u64 high_slices_psize;  /* 4 bits per slice for now */
-	u16 user_psize;         /* page size index */
-#endif
 #ifdef CONFIG_PPC_64K_PAGES
 	/* for 4K PTE fragment support */
 	void *pte_frag;
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index c7f927e67d14..f0ff384d4ca5 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -88,11 +88,6 @@
 #include <asm/nohash/pte-book3e.h>
 #include <asm/pte-common.h>
 
-#ifdef CONFIG_PPC_MM_SLICES
-#define HAVE_ARCH_UNMAPPED_AREA
-#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#endif /* CONFIG_PPC_MM_SLICES */
-
 #ifndef __ASSEMBLY__
 /* pte_clear moved to later in this file */
 
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
index 83a8be791e06..bfe4e8526b2d 100644
--- a/arch/powerpc/mm/hugetlbpage-book3e.c
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -148,16 +148,9 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
 
 	mm = vma->vm_mm;
 
-#ifdef CONFIG_PPC_MM_SLICES
-	psize = get_slice_psize(mm, ea);
-	tsize = mmu_get_tsize(psize);
-	shift = mmu_psize_defs[psize].shift;
-#else
 	psize = vma_mmu_pagesize(vma);
 	shift = __ilog2(psize);
 	tsize = shift - 10;
-#endif
-
 	/*
 	 * We can't be interrupted while we're setting up the MAS
 	 * regusters or after we've confirmed that no tlb exists.
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index c491f2c8f2b9..4554d6527682 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -333,11 +333,6 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm)
 
 	mm->context.id = MMU_NO_CONTEXT;
 	mm->context.active = 0;
-
-#ifdef CONFIG_PPC_MM_SLICES
-	slice_set_user_psize(mm, mmu_virtual_psize);
-#endif
-
 	return 0;
 }
 
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 99b0ae8acb78..a7c0c1fafe68 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -359,7 +359,7 @@ config PPC_BOOK3E_MMU
 
 config PPC_MM_SLICES
 	bool
-	default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES)
+	default y if PPC_STD_MMU_64
 	default n
 
 config PPC_HAVE_PMU_SUPPORT
-- 
cgit v1.2.1


From 98beda74de246520653b42147443292d9814426d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 21 Mar 2017 22:59:52 +0530
Subject: powerpc/mm/slice: Fix off-by-1 error when computing slice mask

For low slice, max addr should be less than 4G. Without limiting this correctly
we will end up with a low slice mask which has 17th bit set. This is not
a problem with the current code because our low slice mask is of type u16. But
in later patch I am switching low slice mask to u64 type and having the 17bit
set result in wrong slice mask which in turn results in mmap failures.

Reviewed-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/slice.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 2b27458902ee..bf150557dba8 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -83,11 +83,10 @@ static struct slice_mask slice_range_to_mask(unsigned long start,
 	struct slice_mask ret = { 0, 0 };
 
 	if (start < SLICE_LOW_TOP) {
-		unsigned long mend = min(end, SLICE_LOW_TOP);
-		unsigned long mstart = min(start, SLICE_LOW_TOP);
+		unsigned long mend = min(end, (SLICE_LOW_TOP - 1));
 
 		ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
-			- (1u << GET_LOW_SLICE_INDEX(mstart));
+			- (1u << GET_LOW_SLICE_INDEX(start));
 	}
 
 	if ((start + len) > SLICE_LOW_TOP)
-- 
cgit v1.2.1


From f5bd0fdc0c74dc4cdc5d81319441c4ef9b47cd65 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 21 Mar 2017 22:59:53 +0530
Subject: powerpc/mm: Cleanup bits definition between hash and radix.

Define everything based on bits present in pgtable.h. This will help in easily
identifying overlapping bits between hash/radix.

No functional change with this patch.

Reviewed-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-64k.h |  4 ++++
 arch/powerpc/include/asm/book3s/64/hash.h     |  9 +++++----
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 10 ++--------
 arch/powerpc/include/asm/book3s/64/radix.h    |  6 ++++++
 4 files changed, 17 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index f3dd21efa2ea..b39f0b86405e 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -6,6 +6,10 @@
 #define H_PUD_INDEX_SIZE  5
 #define H_PGD_INDEX_SIZE  12
 
+/*
+ * 64k aligned address free up few of the lower bits of RPN for us
+ * We steal that here. For more deatils look at pte_pfn/pfn_pte()
+ */
 #define H_PAGE_COMBO	0x00001000 /* this is a combo 4k page */
 #define H_PAGE_4K_PFN	0x00002000 /* PFN is for a single 4k page */
 /*
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index f7b721bbf918..ec2828b1db07 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -13,12 +13,13 @@
  * We could create separate kernel read-only if we used the 3 PP bits
  * combinations that newer processors provide but we currently don't.
  */
-#define H_PAGE_BUSY		0x00800 /* software: PTE & hash are busy */
+#define H_PAGE_BUSY		_RPAGE_SW1 /* software: PTE & hash are busy */
 #define H_PTE_NONE_MASK		_PAGE_HPTEFLAGS
 #define H_PAGE_F_GIX_SHIFT	57
-#define H_PAGE_F_GIX		(7ul << 57)	/* HPTE index within HPTEG */
-#define H_PAGE_F_SECOND		(1ul << 60)	/* HPTE is in 2ndary HPTEG */
-#define H_PAGE_HASHPTE		(1ul << 61)	/* PTE has associated HPTE */
+/* (7ul << 57) HPTE index within HPTEG */
+#define H_PAGE_F_GIX		(_RPAGE_RSV2 | _RPAGE_RSV3 | _RPAGE_RSV4)
+#define H_PAGE_F_SECOND		_RPAGE_RSV1	/* HPTE is in 2ndary HPTEG */
+#define H_PAGE_HASHPTE		_RPAGE_SW0	/* PTE has associated HPTE */
 
 #ifdef CONFIG_PPC_64K_PAGES
 #include <asm/book3s/64/hash-64k.h>
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8f4d41936e5a..c39bc4cb9247 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -44,14 +44,8 @@
 #endif
 #define _PAGE_SPECIAL		_RPAGE_SW2 /* software: special page */
 
-/*
- * For P9 DD1 only, we need to track whether the pte's huge.
- */
-#define _PAGE_LARGE	_RPAGE_RSV1
-
-
-#define _PAGE_PTE		(1ul << 62)	/* distinguishes PTEs from pointers */
-#define _PAGE_PRESENT		(1ul << 63)	/* pte contains a translation */
+#define _PAGE_PTE		0x4000000000000000UL	/* distinguishes PTEs from pointers */
+#define _PAGE_PRESENT		0x8000000000000000UL	/* pte contains a translation */
 /*
  * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
  * Instead of fixing all of them, add an alternate define which
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 9e0bb7cd6e22..2a2ea47a9bd2 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -11,6 +11,12 @@
 #include <asm/book3s/64/radix-4k.h>
 #endif
 
+/*
+ * For P9 DD1 only, we need to track whether the pte's huge.
+ */
+#define _PAGE_LARGE	_RPAGE_RSV1
+
+
 #ifndef __ASSEMBLY__
 #include <asm/book3s/64/tlbflush-radix.h>
 #include <asm/cpu_has_feature.h>
-- 
cgit v1.2.1


From ddb014b68dc620e76b7831b5e09d9b8ff07c570e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 21 Mar 2017 22:59:54 +0530
Subject: powerpc/mm/radix: rename _PAGE_LARGE to R_PAGE_LARGE

This bit is only used by radix and it is nice to follow the naming style of having
bit name start with H_/R_ depending on which translation mode they are used.

No functional change in this patch.

Reviewed-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hugetlb.h | 2 +-
 arch/powerpc/include/asm/book3s/64/radix.h   | 4 ++--
 arch/powerpc/mm/tlb-radix.c                  | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index c62f14d0bec1..6666cd366596 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -46,7 +46,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
 	 */
 	VM_WARN_ON(page_shift == mmu_psize_defs[MMU_PAGE_1G].shift);
 	if (page_shift == mmu_psize_defs[MMU_PAGE_2M].shift)
-		return __pte(pte_val(entry) | _PAGE_LARGE);
+		return __pte(pte_val(entry) | R_PAGE_LARGE);
 	else
 		return entry;
 }
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 2a2ea47a9bd2..ac16d1943022 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -14,7 +14,7 @@
 /*
  * For P9 DD1 only, we need to track whether the pte's huge.
  */
-#define _PAGE_LARGE	_RPAGE_RSV1
+#define R_PAGE_LARGE	_RPAGE_RSV1
 
 
 #ifndef __ASSEMBLY__
@@ -258,7 +258,7 @@ static inline int radix__pmd_trans_huge(pmd_t pmd)
 static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
 {
 	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
-		return __pmd(pmd_val(pmd) | _PAGE_PTE | _PAGE_LARGE);
+		return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE);
 	return __pmd(pmd_val(pmd) | _PAGE_PTE);
 }
 static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma,
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 952713d6cf04..83dc1ccc2fa1 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -437,7 +437,7 @@ void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm,
 		return;
 	}
 
-	if (old_pte & _PAGE_LARGE)
+	if (old_pte & R_PAGE_LARGE)
 		radix__flush_tlb_page_psize(mm, address, MMU_PAGE_2M);
 	else
 		radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize);
-- 
cgit v1.2.1


From 3a4c26011da3ed8df1f4f93af82bd5f2b420751f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 21 Mar 2017 22:59:55 +0530
Subject: powerpc/mm: Add translation mode information in /proc/cpuinfo

With this we have on powernv and pseries /proc/cpuinfo reporting

timebase        : 512000000
platform        : PowerNV
model           : 8247-22L
machine         : PowerNV 8247-22L
firmware        : OPAL
MMU		: Hash

Reviewed-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/setup.c | 4 ++++
 arch/powerpc/platforms/pseries/setup.c | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index d50c7d99baaf..2d937f6d9260 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -95,6 +95,10 @@ static void pnv_show_cpuinfo(struct seq_file *m)
 	else
 		seq_printf(m, "firmware\t: BML\n");
 	of_node_put(root);
+	if (radix_enabled())
+		seq_printf(m, "MMU\t\t: Radix\n");
+	else
+		seq_printf(m, "MMU\t\t: Hash\n");
 }
 
 static void pnv_prepare_going_down(void)
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index b4d362ed03a1..b5d86426e97b 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -87,6 +87,10 @@ static void pSeries_show_cpuinfo(struct seq_file *m)
 		model = of_get_property(root, "model", NULL);
 	seq_printf(m, "machine\t\t: CHRP %s\n", model);
 	of_node_put(root);
+	if (radix_enabled())
+		seq_printf(m, "MMU\t\t: Radix\n");
+	else
+		seq_printf(m, "MMU\t\t: Hash\n");
 }
 
 /* Initialize firmware assisted non-maskable interrupts if
-- 
cgit v1.2.1


From a525108cf1cc14651602d678da38fa627a76a724 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 21 Mar 2017 22:59:56 +0530
Subject: powerpc/mm/hugetlb: Filter out hugepage size not supported by page
 table layout

Without this if firmware reports 1MB page size support we will crash
trying to use 1MB as hugetlb page size.

echo 300 > /sys/kernel/mm/hugepages/hugepages-1024kB/nr_hugepages

kernel BUG at ./arch/powerpc/include/asm/hugetlb.h:19!
.....
....
[c0000000e2c27b30] c00000000029dae8 .hugetlb_fault+0x638/0xda0
[c0000000e2c27c30] c00000000026fb64 .handle_mm_fault+0x844/0x1d70
[c0000000e2c27d70] c00000000004805c .do_page_fault+0x3dc/0x7c0
[c0000000e2c27e30] c00000000000ac98 handle_page_fault+0x10/0x30

With fix, we don't enable 1MB as hugepage size.

bash-4.2# cd /sys/kernel/mm/hugepages/
bash-4.2# ls
hugepages-16384kB  hugepages-16777216kB

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/hugetlbpage.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 8c3389cbcd12..a4f33de4008e 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -753,6 +753,24 @@ static int __init add_huge_page_size(unsigned long long size)
 	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
 		return -EINVAL;
 
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * We need to make sure that for different page sizes reported by
+	 * firmware we only add hugetlb support for page sizes that can be
+	 * supported by linux page table layout.
+	 * For now we have
+	 * Radix: 2M
+	 * Hash: 16M and 16G
+	 */
+	if (radix_enabled()) {
+		if (mmu_psize != MMU_PAGE_2M)
+			return -EINVAL;
+	} else {
+		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
+			return -EINVAL;
+	}
+#endif
+
 	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
 
 	/* Return if huge page size has already been setup */
-- 
cgit v1.2.1


From 54c4025efc0fbc2a145d1b80ddc4875d5037c887 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 21 Mar 2017 22:59:57 +0530
Subject: powerpc/mm: Define _PAGE_SOFT_DIRTY unconditionally

Conditional PTE bit definition is confusing and results in coding error.

Reviewed-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index c39bc4cb9247..4d4ff9a324f0 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -37,11 +37,7 @@
 #define _RPAGE_RSV3		0x0400000000000000UL
 #define _RPAGE_RSV4		0x0200000000000000UL
 
-#ifdef CONFIG_MEM_SOFT_DIRTY
 #define _PAGE_SOFT_DIRTY	_RPAGE_SW3 /* software: software dirty tracking */
-#else
-#define _PAGE_SOFT_DIRTY	0x00000
-#endif
 #define _PAGE_SPECIAL		_RPAGE_SW2 /* software: special page */
 
 #define _PAGE_PTE		0x4000000000000000UL	/* distinguishes PTEs from pointers */
-- 
cgit v1.2.1


From 32789d3821898cf8038394772d41e32902018068 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 21 Mar 2017 22:59:58 +0530
Subject: powerpc/mm: Define all PTE bits based on radix definitions.

Reviewed-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 4 ++--
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index b39f0b86405e..7be54f9590a3 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -10,8 +10,8 @@
  * 64k aligned address free up few of the lower bits of RPN for us
  * We steal that here. For more deatils look at pte_pfn/pfn_pte()
  */
-#define H_PAGE_COMBO	0x00001000 /* this is a combo 4k page */
-#define H_PAGE_4K_PFN	0x00002000 /* PFN is for a single 4k page */
+#define H_PAGE_COMBO	_RPAGE_RPN0 /* this is a combo 4k page */
+#define H_PAGE_4K_PFN	_RPAGE_RPN1 /* PFN is for a single 4k page */
 /*
  * We need to differentiate between explicit huge page and THP huge
  * page, since THP huge page also need to track real subpage details
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 4d4ff9a324f0..96566df547a8 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -36,6 +36,8 @@
 #define _RPAGE_RSV2		0x0800000000000000UL
 #define _RPAGE_RSV3		0x0400000000000000UL
 #define _RPAGE_RSV4		0x0200000000000000UL
+#define _RPAGE_RPN0		0x01000
+#define _RPAGE_RPN1		0x02000
 
 #define _PAGE_SOFT_DIRTY	_RPAGE_SW3 /* software: software dirty tracking */
 #define _PAGE_SPECIAL		_RPAGE_SW2 /* software: special page */
-- 
cgit v1.2.1


From 2f18d533757da3899f4bedab0b2c051b080079dc Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 21 Mar 2017 22:59:59 +0530
Subject: powerpc/mm: Lower the max real address to 53 bits

Max value supported by hardware is 51 bits address. Radix page table define
a slot of 57 bits for future expansion. We restrict the value supported in
linux kernel 53 bits, so that we can use the bits between 57-53 for storing
hash linux page table bits. This is done in the next patch.

This will free up the software page table bits to be used for features
that are needed for both hash and radix. The current hash linux page table
format doesn't have any free software bits. Moving hash linux page table
specific bits to top of RPN field free up the software bits for other purpose.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 31 +++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 96566df547a8..1f9e9848bbd5 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -39,6 +39,30 @@
 #define _RPAGE_RPN0		0x01000
 #define _RPAGE_RPN1		0x02000
 
+/* Max physical address bit as per radix table */
+#define _RPAGE_PA_MAX		57
+
+/*
+ * Max physical address bit we will use for now.
+ *
+ * This is mostly a hardware limitation and for now Power9 has
+ * a 51 bit limit.
+ *
+ * This is different from the number of physical bit required to address
+ * the last byte of memory. That is defined by MAX_PHYSMEM_BITS.
+ * MAX_PHYSMEM_BITS is a linux limitation imposed by the maximum
+ * number of sections we can support (SECTIONS_SHIFT).
+ *
+ * This is different from Radix page table limitation above and
+ * should always be less than that. The limit is done such that
+ * we can overload the bits between _RPAGE_PA_MAX and _PAGE_PA_MAX
+ * for hash linux page table specific bits.
+ *
+ * In order to be compatible with future hardware generations we keep
+ * some offsets and limit this for now to 53
+ */
+#define _PAGE_PA_MAX		53
+
 #define _PAGE_SOFT_DIRTY	_RPAGE_SW3 /* software: software dirty tracking */
 #define _PAGE_SPECIAL		_RPAGE_SW2 /* software: special page */
 
@@ -51,10 +75,11 @@
  */
 #define _PAGE_NO_CACHE		_PAGE_TOLERANT
 /*
- * We support 57 bit real address in pte. Clear everything above 57, and
- * every thing below PAGE_SHIFT;
+ * We support _RPAGE_PA_MAX bit real address in pte. On the linux side
+ * we are limited by _PAGE_PA_MAX. Clear everything above _PAGE_PA_MAX
+ * and every thing below PAGE_SHIFT;
  */
-#define PTE_RPN_MASK	(((1UL << 57) - 1) & (PAGE_MASK))
+#define PTE_RPN_MASK	(((1UL << _PAGE_PA_MAX) - 1) & (PAGE_MASK))
 /*
  * set of bits not changed in pmd_modify. Even though we have hash specific bits
  * in here, on radix we expect them to be zero.
-- 
cgit v1.2.1


From 6aa59f5162fcca09c7dcc84d64e2ebd1e7449884 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 28 Mar 2017 15:21:12 +1100
Subject: powerpc/mm: Move hash specific pte bits to be top bits of RPN

We don't support the full 57 bits of physical address and hence can
overload the top bits of RPN as hash specific pte bits.

Add a BUILD_BUG_ON() to enforce the relationship between H_PAGE_F_SECOND
and H_PAGE_F_GIX.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Paul Mackerras <paulus@ozlabs.org>
[mpe: Move the BUILD_BUG_ON() into hash_utils_64.c and comment it]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash.h    | 17 +++++------------
 arch/powerpc/include/asm/book3s/64/pgtable.h | 16 +++++++++++++---
 arch/powerpc/mm/hash_utils_64.c              | 13 +++++++++++++
 3 files changed, 31 insertions(+), 15 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index ec2828b1db07..4e957b027fe0 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -6,20 +6,13 @@
  * Common bits between 4K and 64K pages in a linux-style PTE.
  * Additional bits may be defined in pgtable-hash64-*.h
  *
- * Note: We only support user read/write permissions. Supervisor always
- * have full read/write to pages above PAGE_OFFSET (pages below that
- * always use the user access permissions).
- *
- * We could create separate kernel read-only if we used the 3 PP bits
- * combinations that newer processors provide but we currently don't.
  */
-#define H_PAGE_BUSY		_RPAGE_SW1 /* software: PTE & hash are busy */
 #define H_PTE_NONE_MASK		_PAGE_HPTEFLAGS
-#define H_PAGE_F_GIX_SHIFT	57
-/* (7ul << 57) HPTE index within HPTEG */
-#define H_PAGE_F_GIX		(_RPAGE_RSV2 | _RPAGE_RSV3 | _RPAGE_RSV4)
-#define H_PAGE_F_SECOND		_RPAGE_RSV1	/* HPTE is in 2ndary HPTEG */
-#define H_PAGE_HASHPTE		_RPAGE_SW0	/* PTE has associated HPTE */
+#define H_PAGE_F_GIX_SHIFT	56
+#define H_PAGE_BUSY		_RPAGE_RSV1 /* software: PTE & hash are busy */
+#define H_PAGE_F_SECOND		_RPAGE_RSV2	/* HPTE is in 2ndary HPTEG */
+#define H_PAGE_F_GIX		(_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
+#define H_PAGE_HASHPTE		_RPAGE_RPN43	/* PTE has associated HPTE */
 
 #ifdef CONFIG_PPC_64K_PAGES
 #include <asm/book3s/64/hash-64k.h>
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 1f9e9848bbd5..fb72ff6b98e6 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -36,8 +36,21 @@
 #define _RPAGE_RSV2		0x0800000000000000UL
 #define _RPAGE_RSV3		0x0400000000000000UL
 #define _RPAGE_RSV4		0x0200000000000000UL
+
+#define _PAGE_PTE		0x4000000000000000UL	/* distinguishes PTEs from pointers */
+#define _PAGE_PRESENT		0x8000000000000000UL	/* pte contains a translation */
+
+/*
+ * Top and bottom bits of RPN which can be used by hash
+ * translation mode, because we expect them to be zero
+ * otherwise.
+ */
 #define _RPAGE_RPN0		0x01000
 #define _RPAGE_RPN1		0x02000
+#define _RPAGE_RPN44		0x0100000000000000UL
+#define _RPAGE_RPN43		0x0080000000000000UL
+#define _RPAGE_RPN42		0x0040000000000000UL
+#define _RPAGE_RPN41		0x0020000000000000UL
 
 /* Max physical address bit as per radix table */
 #define _RPAGE_PA_MAX		57
@@ -65,9 +78,6 @@
 
 #define _PAGE_SOFT_DIRTY	_RPAGE_SW3 /* software: software dirty tracking */
 #define _PAGE_SPECIAL		_RPAGE_SW2 /* software: special page */
-
-#define _PAGE_PTE		0x4000000000000000UL	/* distinguishes PTEs from pointers */
-#define _PAGE_PRESENT		0x8000000000000000UL	/* pte contains a translation */
 /*
  * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
  * Instead of fixing all of them, add an alternate define which
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index c554768b1fa2..d0ee17029e99 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -981,6 +981,19 @@ void __init hash__early_init_devtree(void)
 
 void __init hash__early_init_mmu(void)
 {
+	/*
+	 * We have code in __hash_page_64K() and elsewhere, which assumes it can
+	 * do the following:
+	 *   new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX);
+	 *
+	 * Where the slot number is between 0-15, and values of 8-15 indicate
+	 * the secondary bucket. For that code to work H_PAGE_F_SECOND and
+	 * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and
+	 * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here
+	 * with a BUILD_BUG_ON().
+	 */
+	BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul  << (H_PAGE_F_GIX_SHIFT + 3)));
+
 	htab_init_page_sizes();
 
 	/*
-- 
cgit v1.2.1


From f3207c124e7aa8d4d9cf32cc45b10ceb4defedb9 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 09:06:47 +0530
Subject: powerpc/mm/slice: Convert slice_mask high slice to a bitmap

In followup patch we want to increase the va range which will result
in us requiring high_slices to have more than 64 bits. To enable this
convert high_slices to bitmap. We keep the number bits same in this patch
and later change that to higher value

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
[mpe: Fold in fix to use bitmap_empty()]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/page_64.h |  15 ++---
 arch/powerpc/mm/slice.c            | 112 +++++++++++++++++++++++++------------
 2 files changed, 81 insertions(+), 46 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index 3e83d2a20b6f..bd55ff751938 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -98,19 +98,16 @@ extern u64 ppc64_pft_size;
 #define GET_LOW_SLICE_INDEX(addr)	((addr) >> SLICE_LOW_SHIFT)
 #define GET_HIGH_SLICE_INDEX(addr)	((addr) >> SLICE_HIGH_SHIFT)
 
+#ifndef __ASSEMBLY__
 /*
- * 1 bit per slice and we have one slice per 1TB
- * Right now we support only 64TB.
- * IF we change this we will have to change the type
- * of high_slices
+ * One bit per slice. We have lower slices which cover 256MB segments
+ * upto 4G range. That gets us 16 low slices. For the rest we track slices
+ * in 1TB size.
+ * 64 below is actually SLICE_NUM_HIGH to fixup complie errros
  */
-#define SLICE_MASK_SIZE 8
-
-#ifndef __ASSEMBLY__
-
 struct slice_mask {
 	u16 low_slices;
-	u64 high_slices;
+	DECLARE_BITMAP(high_slices, 64);
 };
 
 struct mm_struct;
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index bf150557dba8..639c7171d174 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -36,11 +36,6 @@
 #include <asm/copro.h>
 #include <asm/hugetlb.h>
 
-/* some sanity checks */
-#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
-#error H_PGTABLE_RANGE exceeds slice_mask high_slices size
-#endif
-
 static DEFINE_SPINLOCK(slice_convert_lock);
 
 
@@ -49,7 +44,7 @@ int _slice_debug = 1;
 
 static void slice_print_mask(const char *label, struct slice_mask mask)
 {
-	char	*p, buf[16 + 3 + 64 + 1];
+	char	*p, buf[SLICE_NUM_LOW + 3 + SLICE_NUM_HIGH + 1];
 	int	i;
 
 	if (!_slice_debug)
@@ -60,8 +55,12 @@ static void slice_print_mask(const char *label, struct slice_mask mask)
 	*(p++) = ' ';
 	*(p++) = '-';
 	*(p++) = ' ';
-	for (i = 0; i < SLICE_NUM_HIGH; i++)
-		*(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0';
+	for (i = 0; i < SLICE_NUM_HIGH; i++) {
+		if (test_bit(i, mask.high_slices))
+			*(p++) = '1';
+		else
+			*(p++) = '0';
+	}
 	*(p++) = 0;
 
 	printk(KERN_DEBUG "%s:%s\n", label, buf);
@@ -80,7 +79,10 @@ static struct slice_mask slice_range_to_mask(unsigned long start,
 					     unsigned long len)
 {
 	unsigned long end = start + len - 1;
-	struct slice_mask ret = { 0, 0 };
+	struct slice_mask ret;
+
+	ret.low_slices = 0;
+	bitmap_zero(ret.high_slices, SLICE_NUM_HIGH);
 
 	if (start < SLICE_LOW_TOP) {
 		unsigned long mend = min(end, (SLICE_LOW_TOP - 1));
@@ -89,10 +91,13 @@ static struct slice_mask slice_range_to_mask(unsigned long start,
 			- (1u << GET_LOW_SLICE_INDEX(start));
 	}
 
-	if ((start + len) > SLICE_LOW_TOP)
-		ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1))
-			- (1ul << GET_HIGH_SLICE_INDEX(start));
+	if ((start + len) > SLICE_LOW_TOP) {
+		unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
+		unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
+		unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
 
+		bitmap_set(ret.high_slices, start_index, count);
+	}
 	return ret;
 }
 
@@ -129,9 +134,12 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
 
 static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
 {
-	struct slice_mask ret = { 0, 0 };
+	struct slice_mask ret;
 	unsigned long i;
 
+	ret.low_slices = 0;
+	bitmap_zero(ret.high_slices, SLICE_NUM_HIGH);
+
 	for (i = 0; i < SLICE_NUM_LOW; i++)
 		if (!slice_low_has_vma(mm, i))
 			ret.low_slices |= 1u << i;
@@ -141,7 +149,7 @@ static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
 
 	for (i = 0; i < SLICE_NUM_HIGH; i++)
 		if (!slice_high_has_vma(mm, i))
-			ret.high_slices |= 1ul << i;
+			__set_bit(i, ret.high_slices);
 
 	return ret;
 }
@@ -150,10 +158,13 @@ static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
 {
 	unsigned char *hpsizes;
 	int index, mask_index;
-	struct slice_mask ret = { 0, 0 };
+	struct slice_mask ret;
 	unsigned long i;
 	u64 lpsizes;
 
+	ret.low_slices = 0;
+	bitmap_zero(ret.high_slices, SLICE_NUM_HIGH);
+
 	lpsizes = mm->context.low_slices_psize;
 	for (i = 0; i < SLICE_NUM_LOW; i++)
 		if (((lpsizes >> (i * 4)) & 0xf) == psize)
@@ -164,7 +175,7 @@ static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
 		mask_index = i & 0x1;
 		index = i >> 1;
 		if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
-			ret.high_slices |= 1ul << i;
+			__set_bit(i, ret.high_slices);
 	}
 
 	return ret;
@@ -172,8 +183,13 @@ static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
 
 static int slice_check_fit(struct slice_mask mask, struct slice_mask available)
 {
+	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
+
+	bitmap_and(result, mask.high_slices,
+		   available.high_slices, SLICE_NUM_HIGH);
+
 	return (mask.low_slices & available.low_slices) == mask.low_slices &&
-		(mask.high_slices & available.high_slices) == mask.high_slices;
+		bitmap_equal(result, mask.high_slices, SLICE_NUM_HIGH);
 }
 
 static void slice_flush_segments(void *parm)
@@ -220,7 +236,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
 	for (i = 0; i < SLICE_NUM_HIGH; i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
-		if (mask.high_slices & (1ul << i))
+		if (test_bit(i, mask.high_slices))
 			hpsizes[index] = (hpsizes[index] &
 					  ~(0xf << (mask_index * 4))) |
 				(((unsigned long)psize) << (mask_index * 4));
@@ -256,7 +272,7 @@ static bool slice_scan_available(unsigned long addr,
 		slice = GET_HIGH_SLICE_INDEX(addr);
 		*boundary_addr = (slice + end) ?
 			((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
-		return !!(available.high_slices & (1ul << slice));
+		return !!test_bit(slice, available.high_slices);
 	}
 }
 
@@ -363,15 +379,24 @@ static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
 		return slice_find_area_bottomup(mm, len, mask, psize);
 }
 
-#define or_mask(dst, src)	do {			\
-	(dst).low_slices |= (src).low_slices;		\
-	(dst).high_slices |= (src).high_slices;		\
-} while (0)
+static inline void slice_or_mask(struct slice_mask *dst, struct slice_mask *src)
+{
+	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
 
-#define andnot_mask(dst, src)	do {			\
-	(dst).low_slices &= ~(src).low_slices;		\
-	(dst).high_slices &= ~(src).high_slices;	\
-} while (0)
+	dst->low_slices |= src->low_slices;
+	bitmap_or(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
+	bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
+}
+
+static inline void slice_andnot_mask(struct slice_mask *dst, struct slice_mask *src)
+{
+	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
+
+	dst->low_slices &= ~src->low_slices;
+
+	bitmap_andnot(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
+	bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
+}
 
 #ifdef CONFIG_PPC_64K_PAGES
 #define MMU_PAGE_BASE	MMU_PAGE_64K
@@ -383,15 +408,28 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 				      unsigned long flags, unsigned int psize,
 				      int topdown)
 {
-	struct slice_mask mask = {0, 0};
+	struct slice_mask mask;
 	struct slice_mask good_mask;
-	struct slice_mask potential_mask = {0,0} /* silence stupid warning */;
-	struct slice_mask compat_mask = {0, 0};
+	struct slice_mask potential_mask;
+	struct slice_mask compat_mask;
 	int fixed = (flags & MAP_FIXED);
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
 	struct mm_struct *mm = current->mm;
 	unsigned long newaddr;
 
+	/*
+	 * init different masks
+	 */
+	mask.low_slices = 0;
+	bitmap_zero(mask.high_slices, SLICE_NUM_HIGH);
+
+	/* silence stupid warning */;
+	potential_mask.low_slices = 0;
+	bitmap_zero(potential_mask.high_slices, SLICE_NUM_HIGH);
+
+	compat_mask.low_slices = 0;
+	bitmap_zero(compat_mask.high_slices, SLICE_NUM_HIGH);
+
 	/* Sanity checks */
 	BUG_ON(mm->task_size == 0);
 	VM_BUG_ON(radix_enabled());
@@ -449,7 +487,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	if (psize == MMU_PAGE_64K) {
 		compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
 		if (fixed)
-			or_mask(good_mask, compat_mask);
+			slice_or_mask(&good_mask, &compat_mask);
 	}
 #endif
 
@@ -484,7 +522,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	 * empty and thus can be converted
 	 */
 	potential_mask = slice_mask_for_free(mm);
-	or_mask(potential_mask, good_mask);
+	slice_or_mask(&potential_mask, &good_mask);
 	slice_print_mask(" potential", potential_mask);
 
 	if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) {
@@ -517,7 +555,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 #ifdef CONFIG_PPC_64K_PAGES
 	if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
 		/* retry the search with 4k-page slices included */
-		or_mask(potential_mask, compat_mask);
+		slice_or_mask(&potential_mask, &compat_mask);
 		addr = slice_find_area(mm, len, potential_mask, psize,
 				       topdown);
 	}
@@ -531,9 +569,9 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	slice_print_mask(" mask", mask);
 
  convert:
-	andnot_mask(mask, good_mask);
-	andnot_mask(mask, compat_mask);
-	if (mask.low_slices || mask.high_slices) {
+	slice_andnot_mask(&mask, &good_mask);
+	slice_andnot_mask(&mask, &compat_mask);
+	if (mask.low_slices || !bitmap_empty(mask.high_slices, SLICE_NUM_HIGH)) {
 		slice_convert(mm, mask, psize);
 		if (psize > MMU_PAGE_BASE)
 			on_each_cpu(slice_flush_segments, mm, 1);
@@ -700,7 +738,7 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 	if (psize == MMU_PAGE_64K) {
 		struct slice_mask compat_mask;
 		compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
-		or_mask(available, compat_mask);
+		slice_or_mask(&available, &compat_mask);
 	}
 #endif
 
-- 
cgit v1.2.1


From a4d3621503290f73b2ca65a6de58f01296c0b85c Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 09:06:48 +0530
Subject: powerpc/mm/slice: Update the function prototype

This avoid copying the slice_mask struct as function return value

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/slice.c | 62 ++++++++++++++++++++++---------------------------
 1 file changed, 28 insertions(+), 34 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 639c7171d174..d1da357583e3 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -75,19 +75,18 @@ static void slice_print_mask(const char *label, struct slice_mask mask) {}
 
 #endif
 
-static struct slice_mask slice_range_to_mask(unsigned long start,
-					     unsigned long len)
+static void slice_range_to_mask(unsigned long start, unsigned long len,
+				struct slice_mask *ret)
 {
 	unsigned long end = start + len - 1;
-	struct slice_mask ret;
 
-	ret.low_slices = 0;
-	bitmap_zero(ret.high_slices, SLICE_NUM_HIGH);
+	ret->low_slices = 0;
+	bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
 
 	if (start < SLICE_LOW_TOP) {
 		unsigned long mend = min(end, (SLICE_LOW_TOP - 1));
 
-		ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
+		ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
 			- (1u << GET_LOW_SLICE_INDEX(start));
 	}
 
@@ -96,9 +95,8 @@ static struct slice_mask slice_range_to_mask(unsigned long start,
 		unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
 		unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
 
-		bitmap_set(ret.high_slices, start_index, count);
+		bitmap_set(ret->high_slices, start_index, count);
 	}
-	return ret;
 }
 
 static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
@@ -132,53 +130,47 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
 	return !slice_area_is_free(mm, start, end - start);
 }
 
-static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
+static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret)
 {
-	struct slice_mask ret;
 	unsigned long i;
 
-	ret.low_slices = 0;
-	bitmap_zero(ret.high_slices, SLICE_NUM_HIGH);
+	ret->low_slices = 0;
+	bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
 
 	for (i = 0; i < SLICE_NUM_LOW; i++)
 		if (!slice_low_has_vma(mm, i))
-			ret.low_slices |= 1u << i;
+			ret->low_slices |= 1u << i;
 
 	if (mm->task_size <= SLICE_LOW_TOP)
-		return ret;
+		return;
 
 	for (i = 0; i < SLICE_NUM_HIGH; i++)
 		if (!slice_high_has_vma(mm, i))
-			__set_bit(i, ret.high_slices);
-
-	return ret;
+			__set_bit(i, ret->high_slices);
 }
 
-static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
+static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret)
 {
 	unsigned char *hpsizes;
 	int index, mask_index;
-	struct slice_mask ret;
 	unsigned long i;
 	u64 lpsizes;
 
-	ret.low_slices = 0;
-	bitmap_zero(ret.high_slices, SLICE_NUM_HIGH);
+	ret->low_slices = 0;
+	bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
 
 	lpsizes = mm->context.low_slices_psize;
 	for (i = 0; i < SLICE_NUM_LOW; i++)
 		if (((lpsizes >> (i * 4)) & 0xf) == psize)
-			ret.low_slices |= 1u << i;
+			ret->low_slices |= 1u << i;
 
 	hpsizes = mm->context.high_slices_psize;
 	for (i = 0; i < SLICE_NUM_HIGH; i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
 		if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
-			__set_bit(i, ret.high_slices);
+			__set_bit(i, ret->high_slices);
 	}
-
-	return ret;
 }
 
 static int slice_check_fit(struct slice_mask mask, struct slice_mask available)
@@ -460,7 +452,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	/* First make up a "good" mask of slices that have the right size
 	 * already
 	 */
-	good_mask = slice_mask_for_size(mm, psize);
+	slice_mask_for_size(mm, psize, &good_mask);
 	slice_print_mask(" good_mask", good_mask);
 
 	/*
@@ -485,7 +477,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 #ifdef CONFIG_PPC_64K_PAGES
 	/* If we support combo pages, we can allow 64k pages in 4k slices */
 	if (psize == MMU_PAGE_64K) {
-		compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
+		slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask);
 		if (fixed)
 			slice_or_mask(&good_mask, &compat_mask);
 	}
@@ -494,7 +486,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	/* First check hint if it's valid or if we have MAP_FIXED */
 	if (addr != 0 || fixed) {
 		/* Build a mask for the requested range */
-		mask = slice_range_to_mask(addr, len);
+		slice_range_to_mask(addr, len, &mask);
 		slice_print_mask(" mask", mask);
 
 		/* Check if we fit in the good mask. If we do, we just return,
@@ -521,7 +513,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	/* We don't fit in the good mask, check what other slices are
 	 * empty and thus can be converted
 	 */
-	potential_mask = slice_mask_for_free(mm);
+	slice_mask_for_free(mm, &potential_mask);
 	slice_or_mask(&potential_mask, &good_mask);
 	slice_print_mask(" potential", potential_mask);
 
@@ -564,7 +556,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	if (addr == -ENOMEM)
 		return -ENOMEM;
 
-	mask = slice_range_to_mask(addr, len);
+	slice_range_to_mask(addr, len, &mask);
 	slice_dbg(" found potential area at 0x%lx\n", addr);
 	slice_print_mask(" mask", mask);
 
@@ -696,9 +688,11 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
 			   unsigned long len, unsigned int psize)
 {
-	struct slice_mask mask = slice_range_to_mask(start, len);
+	struct slice_mask mask;
 
 	VM_BUG_ON(radix_enabled());
+
+	slice_range_to_mask(start, len, &mask);
 	slice_convert(mm, mask, psize);
 }
 
@@ -731,13 +725,13 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 	if (radix_enabled())
 		return 0;
 
-	mask = slice_range_to_mask(addr, len);
-	available = slice_mask_for_size(mm, psize);
+	slice_range_to_mask(addr, len, &mask);
+	slice_mask_for_size(mm, psize, &available);
 #ifdef CONFIG_PPC_64K_PAGES
 	/* We need to account for 4k slices too */
 	if (psize == MMU_PAGE_64K) {
 		struct slice_mask compat_mask;
-		compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
+		slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask);
 		slice_or_mask(&available, &compat_mask);
 	}
 #endif
-- 
cgit v1.2.1


From 52b1e66587b6b4f2e5a09102b3a40eed8ec3675f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 09:06:49 +0530
Subject: powerpc/mm: Move copy_mm_to_paca to paca.c

We also update the function arg to struct mm_struct. Move this so that function
finds the definition of struct mm_struct. No functional change in this patch.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/paca.h | 18 +-----------------
 arch/powerpc/kernel/paca.c      | 19 +++++++++++++++++++
 arch/powerpc/mm/hash_utils_64.c |  4 ++--
 arch/powerpc/mm/slb.c           |  2 +-
 arch/powerpc/mm/slice.c         |  2 +-
 5 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 708c3e592eeb..f48c250339fd 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -206,23 +206,7 @@ struct paca_struct {
 #endif
 };
 
-#ifdef CONFIG_PPC_BOOK3S
-static inline void copy_mm_to_paca(mm_context_t *context)
-{
-	get_paca()->mm_ctx_id = context->id;
-#ifdef CONFIG_PPC_MM_SLICES
-	get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
-	memcpy(&get_paca()->mm_ctx_high_slices_psize,
-	       &context->high_slices_psize, SLICE_ARRAY_SIZE);
-#else
-	get_paca()->mm_ctx_user_psize = context->user_psize;
-	get_paca()->mm_ctx_sllp = context->sllp;
-#endif
-}
-#else
-static inline void copy_mm_to_paca(mm_context_t *context){}
-#endif
-
+extern void copy_mm_to_paca(struct mm_struct *mm);
 extern struct paca_struct *paca;
 extern void initialise_paca(struct paca_struct *new_paca, int cpu);
 extern void setup_paca(struct paca_struct *new_paca);
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index dfc479df9634..e2cf745a4b94 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -245,3 +245,22 @@ void __init free_unused_pacas(void)
 
 	free_lppacas();
 }
+
+void copy_mm_to_paca(struct mm_struct *mm)
+{
+#ifdef CONFIG_PPC_BOOK3S
+	mm_context_t *context = &mm->context;
+
+	get_paca()->mm_ctx_id = context->id;
+#ifdef CONFIG_PPC_MM_SLICES
+	get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
+	memcpy(&get_paca()->mm_ctx_high_slices_psize,
+	       &context->high_slices_psize, SLICE_ARRAY_SIZE);
+#else /* CONFIG_PPC_MM_SLICES */
+	get_paca()->mm_ctx_user_psize = context->user_psize;
+	get_paca()->mm_ctx_sllp = context->sllp;
+#endif
+#else /* CONFIG_PPC_BOOK3S */
+	return;
+#endif
+}
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index d0ee17029e99..716255f88bbd 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1133,7 +1133,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
 	copro_flush_all_slbs(mm);
 	if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
 
-		copy_mm_to_paca(&mm->context);
+		copy_mm_to_paca(mm);
 		slb_flush_and_rebolt();
 	}
 }
@@ -1205,7 +1205,7 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
 {
 	if (user_region) {
 		if (psize != get_paca_psize(ea)) {
-			copy_mm_to_paca(&mm->context);
+			copy_mm_to_paca(mm);
 			slb_flush_and_rebolt();
 		}
 	} else if (get_paca()->vmalloc_sllp !=
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 5e01b2ece1d0..98ae810b8c21 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -229,7 +229,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 		asm volatile("slbie %0" : : "r" (slbie_data));
 
 	get_paca()->slb_cache_ptr = 0;
-	copy_mm_to_paca(&mm->context);
+	copy_mm_to_paca(mm);
 
 	/*
 	 * preload some userspace segments into the SLB.
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index d1da357583e3..175e0dbce89b 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -192,7 +192,7 @@ static void slice_flush_segments(void *parm)
 	if (mm != current->active_mm)
 		return;
 
-	copy_mm_to_paca(&current->active_mm->context);
+	copy_mm_to_paca(current->active_mm);
 
 	local_irq_save(flags);
 	slb_flush_and_rebolt();
-- 
cgit v1.2.1


From 1b2fa5a046b4716cf4b11bb29a581e680cc6e9ef Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 09:06:50 +0530
Subject: powerpc/mm: Remove checks that TASK_SIZE_USER64 is too small

Remove the checks that TASK_SIZE_USER64 is smaller than H_PGTABLE_RANGE
and USER_VSID_RANGE.

In a following patch we will deliberately add support for a TASK_SIZE
smaller than both ranges, so this will no longer be an error condition.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
[mpe: Keep the check in pgtable_64.c that we don't exceed USER_VSID_RANGE]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/init_64.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 9be992083d2a..8f6f2a173e47 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -71,10 +71,6 @@
 #if H_PGTABLE_RANGE > USER_VSID_RANGE
 #warning Limited user VSID range means pagetable space is wasted
 #endif
-
-#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
-#warning TASK_SIZE is smaller than it needs to be.
-#endif
 #endif /* CONFIG_PPC_STD_MMU_64 */
 
 phys_addr_t memstart_addr = ~0;
-- 
cgit v1.2.1


From 82185222ff984edb50fc2adc752a46490843ecee Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 09:06:51 +0530
Subject: powerpc/mm/slice: Move slice_mask struct definition to slice.c

This structure definition need not be in a header since this is used only by
slice.c file. So move it to slice.c. This also allow us to use SLICE_NUM_HIGH
instead of 64.

I also switch the low_slices type to u64 from u16. This doesn't have an impact
on size of struct due to padding added with u16 type. This helps in using
bitmap printing function for printing slice mask.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/page_64.h | 11 -----------
 arch/powerpc/mm/slice.c            | 10 +++++++++-
 2 files changed, 9 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index bd55ff751938..c4d9654bd637 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -99,17 +99,6 @@ extern u64 ppc64_pft_size;
 #define GET_HIGH_SLICE_INDEX(addr)	((addr) >> SLICE_HIGH_SHIFT)
 
 #ifndef __ASSEMBLY__
-/*
- * One bit per slice. We have lower slices which cover 256MB segments
- * upto 4G range. That gets us 16 low slices. For the rest we track slices
- * in 1TB size.
- * 64 below is actually SLICE_NUM_HIGH to fixup complie errros
- */
-struct slice_mask {
-	u16 low_slices;
-	DECLARE_BITMAP(high_slices, 64);
-};
-
 struct mm_struct;
 
 extern unsigned long slice_get_unmapped_area(unsigned long addr,
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 175e0dbce89b..7599c2703520 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -37,7 +37,15 @@
 #include <asm/hugetlb.h>
 
 static DEFINE_SPINLOCK(slice_convert_lock);
-
+/*
+ * One bit per slice. We have lower slices which cover 256MB segments
+ * upto 4G range. That gets us 16 low slices. For the rest we track slices
+ * in 1TB size.
+ */
+struct slice_mask {
+	u64 low_slices;
+	DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH);
+};
 
 #ifdef DEBUG
 int _slice_debug = 1;
-- 
cgit v1.2.1


From 302413cad56820d4cd0d4fb66d01c81b4fcb1a10 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 09:06:52 +0530
Subject: powerpc/mm/slice: Update slice mask printing to use bitmap printing.

We now get output like below which is much better.

[    0.935306]  good_mask low_slice: 0-15
[    0.935360]  good_mask high_slice: 0-511

Compared to

[    0.953414]  good_mask:1111111111111111 - 1111111111111.........

I also fixed an error with slice_dbg printing.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/slice.c | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 7599c2703520..95e5a20b1b6a 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -52,29 +52,13 @@ int _slice_debug = 1;
 
 static void slice_print_mask(const char *label, struct slice_mask mask)
 {
-	char	*p, buf[SLICE_NUM_LOW + 3 + SLICE_NUM_HIGH + 1];
-	int	i;
-
 	if (!_slice_debug)
 		return;
-	p = buf;
-	for (i = 0; i < SLICE_NUM_LOW; i++)
-		*(p++) = (mask.low_slices & (1 << i)) ? '1' : '0';
-	*(p++) = ' ';
-	*(p++) = '-';
-	*(p++) = ' ';
-	for (i = 0; i < SLICE_NUM_HIGH; i++) {
-		if (test_bit(i, mask.high_slices))
-			*(p++) = '1';
-		else
-			*(p++) = '0';
-	}
-	*(p++) = 0;
-
-	printk(KERN_DEBUG "%s:%s\n", label, buf);
+	pr_devel("%s low_slice: %*pbl\n", label, (int)SLICE_NUM_LOW, &mask.low_slices);
+	pr_devel("%s high_slice: %*pbl\n", label, (int)SLICE_NUM_HIGH, mask.high_slices);
 }
 
-#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0)
+#define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0)
 
 #else
 
@@ -243,8 +227,8 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
 	}
 
 	slice_dbg(" lsps=%lx, hsps=%lx\n",
-		  mm->context.low_slices_psize,
-		  mm->context.high_slices_psize);
+		  (unsigned long)mm->context.low_slices_psize,
+		  (unsigned long)mm->context.high_slices_psize);
 
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
 
@@ -686,8 +670,8 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 
 
 	slice_dbg(" lsps=%lx, hsps=%lx\n",
-		  mm->context.low_slices_psize,
-		  mm->context.high_slices_psize);
+		  (unsigned long)mm->context.low_slices_psize,
+		  (unsigned long)mm->context.high_slices_psize);
 
  bail:
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
-- 
cgit v1.2.1


From a336f2f5b05c3c02876a365b8f17b3d10920dbd5 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 29 Mar 2017 22:00:46 +1100
Subject: powerpc/mm/hash: Abstract context id allocation for KVM

KVM wants to be able to allocate an MMU context id, which it does
currently by calling __init_new_context().

We're about to rework that code, so provide a wrapper for KVM so it
can not worry about the details.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/mmu_context.h |  2 +-
 arch/powerpc/kvm/book3s_64_mmu_host.c  |  2 +-
 arch/powerpc/mm/mmu_context_book3s64.c | 10 ++++++++--
 3 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index b9e3f0aca261..7d721101ec78 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -51,7 +51,7 @@ static inline void switch_mmu_context(struct mm_struct *prev,
 	return switch_slb(tsk, next);
 }
 
-extern int __init_new_context(void);
+extern int hash__alloc_context_id(void);
 extern void __destroy_context(int context_id);
 static inline void mmu_context_init(void) { }
 #else
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index a587e8f4fd26..b35f44c98d1f 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -390,7 +390,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
 	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	int err;
 
-	err = __init_new_context();
+	err = hash__alloc_context_id();
 	if (err < 0)
 		return -1;
 	vcpu3s->context_id[0] = err;
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 73bf6e14c3aa..650a498b1de9 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -30,7 +30,7 @@
 static DEFINE_SPINLOCK(mmu_context_lock);
 static DEFINE_IDA(mmu_context_ida);
 
-int __init_new_context(void)
+static int __init_new_context(void)
 {
 	int index;
 	int err;
@@ -57,7 +57,13 @@ again:
 
 	return index;
 }
-EXPORT_SYMBOL_GPL(__init_new_context);
+
+int hash__alloc_context_id(void)
+{
+	return __init_new_context();
+}
+EXPORT_SYMBOL_GPL(hash__alloc_context_id);
+
 static int radix__init_new_context(struct mm_struct *mm, int index)
 {
 	unsigned long rts_field;
-- 
cgit v1.2.1


From c1ff840d21208aff8cea18c0ae052c536d74f53e Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 29 Mar 2017 22:10:45 +1100
Subject: powerpc/mm/hash: Pull hash constants into hash__alloc_context_id()

The min and max context id values used in alloc_context_id() are
currently the right values for use on hash, and happen to also be safe
for use on radix.

But we need to change that in a subsequent patch, so make the min/max
ids parameters and pull the hash values into hsah__alloc_context_id().

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/mmu_context_book3s64.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 650a498b1de9..981c3b02e46a 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -30,17 +30,16 @@
 static DEFINE_SPINLOCK(mmu_context_lock);
 static DEFINE_IDA(mmu_context_ida);
 
-static int __init_new_context(void)
+static int alloc_context_id(int min_id, int max_id)
 {
-	int index;
-	int err;
+	int index, err;
 
 again:
 	if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
 		return -ENOMEM;
 
 	spin_lock(&mmu_context_lock);
-	err = ida_get_new_above(&mmu_context_ida, 1, &index);
+	err = ida_get_new_above(&mmu_context_ida, min_id, &index);
 	spin_unlock(&mmu_context_lock);
 
 	if (err == -EAGAIN)
@@ -48,7 +47,7 @@ again:
 	else if (err)
 		return err;
 
-	if (index > MAX_USER_CONTEXT) {
+	if (index > max_id) {
 		spin_lock(&mmu_context_lock);
 		ida_remove(&mmu_context_ida, index);
 		spin_unlock(&mmu_context_lock);
@@ -60,7 +59,7 @@ again:
 
 int hash__alloc_context_id(void)
 {
-	return __init_new_context();
+	return alloc_context_id(1, MAX_USER_CONTEXT);
 }
 EXPORT_SYMBOL_GPL(hash__alloc_context_id);
 
@@ -80,7 +79,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
 	int index;
 
-	index = __init_new_context();
+	index = hash__alloc_context_id();
 	if (index < 0)
 		return index;
 
-- 
cgit v1.2.1


From 760573c1a98cce20b9306f3922d460e58b4ba656 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 29 Mar 2017 22:36:56 +1100
Subject: powerpc/mm: Split radix vs hash mm context initialisation

Complete the split of the radix vs hash mm context initialisation.

This is mostly code movement, with the exception that we now limit the
context allocation to PRTB_ENTRIES - 1 on radix.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/mmu.h |  2 +
 arch/powerpc/mm/mmu_context_book3s64.c   | 67 ++++++++++++++++++++------------
 2 files changed, 45 insertions(+), 24 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 805d4105e9bb..cce434e7de06 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -65,6 +65,8 @@ extern struct patb_entry *partition_tb;
  * MAX_USER_CONTEXT * 16 bytes of space.
  */
 #define PRTB_SIZE_SHIFT	(CONTEXT_BITS + 4)
+#define PRTB_ENTRIES	(1ul << CONTEXT_BITS)
+
 /*
  * Power9 currently only support 64K partition table size.
  */
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 981c3b02e46a..e5da551edde3 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -63,47 +63,66 @@ int hash__alloc_context_id(void)
 }
 EXPORT_SYMBOL_GPL(hash__alloc_context_id);
 
-static int radix__init_new_context(struct mm_struct *mm, int index)
+static int hash__init_new_context(struct mm_struct *mm)
+{
+	int index;
+
+	index = hash__alloc_context_id();
+	if (index < 0)
+		return index;
+
+	/*
+	 * The old code would re-promote on fork, we don't do that when using
+	 * slices as it could cause problem promoting slices that have been
+	 * forced down to 4K.
+	 *
+	 * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
+	 * explicitly against context.id == 0. This ensures that we properly
+	 * initialize context slice details for newly allocated mm's (which will
+	 * have id == 0) and don't alter context slice inherited via fork (which
+	 * will have id != 0).
+	 *
+	 * We should not be calling init_new_context() on init_mm. Hence a
+	 * check against 0 is OK.
+	 */
+	if (mm->context.id == 0)
+		slice_set_user_psize(mm, mmu_virtual_psize);
+
+	subpage_prot_init_new_context(mm);
+
+	return index;
+}
+
+static int radix__init_new_context(struct mm_struct *mm)
 {
 	unsigned long rts_field;
+	int index;
+
+	index = alloc_context_id(1, PRTB_ENTRIES - 1);
+	if (index < 0)
+		return index;
 
 	/*
 	 * set the process table entry,
 	 */
 	rts_field = radix__get_tree_size();
 	process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);
-	return 0;
+
+	return index;
 }
 
 int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
 	int index;
 
-	index = hash__alloc_context_id();
+	if (radix_enabled())
+		index = radix__init_new_context(mm);
+	else
+		index = hash__init_new_context(mm);
+
 	if (index < 0)
 		return index;
 
-	if (radix_enabled()) {
-		radix__init_new_context(mm, index);
-	} else {
-
-		/* The old code would re-promote on fork, we don't do that
-		 * when using slices as it could cause problem promoting slices
-		 * that have been forced down to 4K
-		 *
-		 * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
-		 * explicitly against context.id == 0. This ensures that we
-		 * properly initialize context slice details for newly allocated
-		 * mm's (which will have id == 0) and don't alter context slice
-		 * inherited via fork (which will have id != 0).
-		 *
-		 * We should not be calling init_new_context() on init_mm. Hence a
-		 * check against 0 is ok.
-		 */
-		if (mm->context.id == 0)
-			slice_set_user_psize(mm, mmu_virtual_psize);
-		subpage_prot_init_new_context(mm);
-	}
 	mm->context.id = index;
 #ifdef CONFIG_PPC_ICSWX
 	mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
-- 
cgit v1.2.1


From 941711a3a049abc012be0eb8856ff6eec40a4570 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 29 Mar 2017 23:10:22 +1100
Subject: powerpc/mm/hash: Use context ids 1-4 for the kernel

Currently we use the top 4 context ids (0x7fffc-0x7ffff) for the kernel.
Kernel VSIDs are built using these top context values and effective the
segement ID. In subsequent patches we want to increase the max effective
address to 512TB. We will achieve that by increasing the effective
segment IDs there by increasing virtual address range.

We will be switching to a 68bit virtual address in the following patch.
But platforms like Power4 and Power5 only support a 65 bit virtual
address. We will handle that by limiting the context bits to 16 instead
of 19 on those platforms. That means the max context id will have a
different value on different platforms.

So that we don't have to deal with the kernel context ids changing
between different platforms, move the kernel context ids down to use
context ids 1-4.

We can't use segment 0 of context-id 0, because that maps to VSID 0,
which we want to keep as invalid, so we avoid context-id 0 entirely.
Similarly we can't use the last segment of the maximum context, so we
avoid it too.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
[mpe: Switch from 0-3 to 1-4 so VSID=0 remains invalid]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 60 ++++++++++++++++-----------
 arch/powerpc/mm/mmu_context_book3s64.c        |  2 +-
 arch/powerpc/mm/slb_low.S                     | 20 +++------
 3 files changed, 41 insertions(+), 41 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 52d8d1e4b772..a5ab6f5b8a7f 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -491,13 +491,14 @@ extern void slb_set_size(u16 size);
  * We first generate a 37-bit "proto-VSID". Proto-VSIDs are generated
  * from mmu context id and effective segment id of the address.
  *
- * For user processes max context id is limited to ((1ul << 19) - 5)
- * for kernel space, we use the top 4 context ids to map address as below
+ * For user processes max context id is limited to MAX_USER_CONTEXT.
+
+ * For kernel space, we use context ids 1-5 to map address as below:
  * NOTE: each context only support 64TB now.
- * 0x7fffc -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
- * 0x7fffd -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
- * 0x7fffe -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
- * 0x7ffff -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
+ * 0x00001 -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
+ * 0x00002 -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
+ * 0x00003 -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
+ * 0x00004 -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
  *
  * The proto-VSIDs are then scrambled into real VSIDs with the
  * multiplicative hash:
@@ -511,15 +512,13 @@ extern void slb_set_size(u16 size);
  * robust scattering in the hash table (at least based on some initial
  * results).
  *
- * We also consider VSID 0 special. We use VSID 0 for slb entries mapping
- * bad address. This enables us to consolidate bad address handling in
- * hash_page.
+ * We use VSID 0 to indicate an invalid VSID. The means we can't use context id
+ * 0, because a context id of 0 and an EA of 0 gives a proto-VSID of 0, which
+ * will produce a VSID of 0.
  *
  * We also need to avoid the last segment of the last context, because that
  * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
- * because of the modulo operation in vsid scramble. But the vmemmap
- * (which is what uses region 0xf) will never be close to 64TB in size
- * (it's 56 bytes per page of system memory).
+ * because of the modulo operation in vsid scramble.
  */
 
 #define CONTEXT_BITS		19
@@ -532,12 +531,19 @@ extern void slb_set_size(u16 size);
 /*
  * 256MB segment
  * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments
- * available for user + kernel mapping. The top 4 contexts are used for
- * kernel mapping. Each segment contains 2^28 bytes. Each
- * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts
- * (19 == 37 + 28 - 46).
+ * available for user + kernel mapping. VSID 0 is reserved as invalid, contexts
+ * 1-4 are used for kernel mapping. Each segment contains 2^28 bytes. Each
+ * context maps 2^46 bytes (64TB).
+ *
+ * We also need to avoid the last segment of the last context, because that
+ * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
+ * because of the modulo operation in vsid scramble.
  */
-#define MAX_USER_CONTEXT	((ASM_CONST(1) << CONTEXT_BITS) - 5)
+#define MAX_USER_CONTEXT	((ASM_CONST(1) << CONTEXT_BITS) - 2)
+#define MIN_USER_CONTEXT	(5)
+
+/* Would be nice to use KERNEL_REGION_ID here */
+#define KERNEL_REGION_CONTEXT_OFFSET	(0xc - 1)
 
 /*
  * This should be computed such that protovosid * vsid_mulitplier
@@ -671,21 +677,25 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea,
 
 /*
  * This is only valid for addresses >= PAGE_OFFSET
- *
- * For kernel space, we use the top 4 context ids to map address as below
- * 0x7fffc -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
- * 0x7fffd -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
- * 0x7fffe -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
- * 0x7ffff -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
  */
 static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
 {
 	unsigned long context;
 
 	/*
-	 * kernel take the top 4 context from the available range
+	 * For kernel space, we use context ids 1-4 to map the address space as
+	 * below:
+	 *
+	 * 0x00001 -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
+	 * 0x00002 -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
+	 * 0x00003 -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
+	 * 0x00004 -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
+	 *
+	 * So we can compute the context from the region (top nibble) by
+	 * subtracting 11, or 0xc - 1.
 	 */
-	context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1;
+	context = (ea >> 60) - KERNEL_REGION_CONTEXT_OFFSET;
+
 	return get_vsid(context, ea, ssize);
 }
 
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index e5da551edde3..a10e972221c4 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -59,7 +59,7 @@ again:
 
 int hash__alloc_context_id(void)
 {
-	return alloc_context_id(1, MAX_USER_CONTEXT);
+	return alloc_context_id(MIN_USER_CONTEXT, MAX_USER_CONTEXT);
 }
 EXPORT_SYMBOL_GPL(hash__alloc_context_id);
 
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index a85e06ea6c20..ba1f8696c338 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -45,13 +45,6 @@ _GLOBAL(slb_allocate_realmode)
 	/* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
 	blt	cr7,0f			/* user or kernel? */
 
-	/* kernel address: proto-VSID = ESID */
-	/* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
-	 * this code will generate the protoVSID 0xfffffffff for the
-	 * top segment.  That's ok, the scramble below will translate
-	 * it to VSID 0, which is reserved as a bad VSID - one which
-	 * will never have any pages in it.  */
-
 	/* Check if hitting the linear mapping or some other kernel space
 	*/
 	bne	cr7,1f
@@ -63,12 +56,10 @@ _GLOBAL(slb_allocate_realmode)
 slb_miss_kernel_load_linear:
 	li	r11,0
 	/*
-	 * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+	 * context = (ea >> 60) - (0xc - 1)
 	 * r9 = region id.
 	 */
-	addis	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
-	addi	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
-
+	subi	r9,r9,KERNEL_REGION_CONTEXT_OFFSET
 
 BEGIN_FTR_SECTION
 	b	.Lslb_finish_load
@@ -77,9 +68,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
 
 1:
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-	/* Check virtual memmap region. To be patches at kernel boot */
 	cmpldi	cr0,r9,0xf
 	bne	1f
+/* Check virtual memmap region. To be patched at kernel boot */
 .globl slb_miss_kernel_load_vmemmap
 slb_miss_kernel_load_vmemmap:
 	li	r11,0
@@ -102,11 +93,10 @@ slb_miss_kernel_load_io:
 	li	r11,0
 6:
 	/*
-	 * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+	 * context = (ea >> 60) - (0xc - 1)
 	 * r9 = region id.
 	 */
-	addis	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
-	addi	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
+	subi	r9,r9,KERNEL_REGION_CONTEXT_OFFSET
 
 BEGIN_FTR_SECTION
 	b	.Lslb_finish_load
-- 
cgit v1.2.1


From 85beb1c486df76a7a851a0e063785282c2608f37 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 29 Mar 2017 23:10:34 +1100
Subject: powerpc/mm/hash: Check for non-kernel address in get_kernel_vsid()

get_kernel_vsid() has a very stern comment saying that it's only valid
for kernel addresses, but there's nothing in the code to enforce that.

Rather than hoping our callers are well behaved, add a check and return
a VSID of 0 (invalid).

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index a5ab6f5b8a7f..10a34282829e 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -682,6 +682,9 @@ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
 {
 	unsigned long context;
 
+	if (!is_kernel_addr(ea))
+		return 0;
+
 	/*
 	 * For kernel space, we use context ids 1-4 to map the address space as
 	 * below:
-- 
cgit v1.2.1


From e6f81a92015b2c1d12fad32b8456f99855da6e13 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 29 Mar 2017 17:21:53 +1100
Subject: powerpc/mm/hash: Support 68 bit VA

Inorder to support large effective address range (512TB), we want to
increase the virtual address bits to 68. But we do have platforms like
p4 and p5 that can only do 65 bit VA. We support those platforms by
limiting context bits on them to 16.

The protovsid -> vsid conversion is verified to work with both 65 and 68
bit va values. I also documented the restrictions in a table format as
part of code comments.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 127 ++++++++++++++++----------
 arch/powerpc/include/asm/mmu.h                |  19 ++--
 arch/powerpc/kvm/book3s_64_mmu_host.c         |   8 +-
 arch/powerpc/mm/mmu_context_book3s64.c        |   9 +-
 arch/powerpc/mm/slb_low.S                     |  54 +++++++++--
 5 files changed, 152 insertions(+), 65 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 10a34282829e..c68102293a19 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -39,6 +39,7 @@
 
 /* Bits in the SLB VSID word */
 #define SLB_VSID_SHIFT		12
+#define SLB_VSID_SHIFT_256M	SLB_VSID_SHIFT
 #define SLB_VSID_SHIFT_1T	24
 #define SLB_VSID_SSIZE_SHIFT	62
 #define SLB_VSID_B		ASM_CONST(0xc000000000000000)
@@ -521,9 +522,19 @@ extern void slb_set_size(u16 size);
  * because of the modulo operation in vsid scramble.
  */
 
+/*
+ * Max Va bits we support as of now is 68 bits. We want 19 bit
+ * context ID.
+ * Restrictions:
+ * GPU has restrictions of not able to access beyond 128TB
+ * (47 bit effective address). We also cannot do more than 20bit PID.
+ * For p4 and p5 which can only do 65 bit VA, we restrict our CONTEXT_BITS
+ * to 16 bits (ie, we can only have 2^16 pids at the same time).
+ */
+#define VA_BITS			68
 #define CONTEXT_BITS		19
-#define ESID_BITS		18
-#define ESID_BITS_1T		6
+#define ESID_BITS		(VA_BITS - (SID_SHIFT + CONTEXT_BITS))
+#define ESID_BITS_1T		(VA_BITS - (SID_SHIFT_1T + CONTEXT_BITS))
 
 #define ESID_BITS_MASK		((1 << ESID_BITS) - 1)
 #define ESID_BITS_1T_MASK	((1 << ESID_BITS_1T) - 1)
@@ -533,7 +544,7 @@ extern void slb_set_size(u16 size);
  * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments
  * available for user + kernel mapping. VSID 0 is reserved as invalid, contexts
  * 1-4 are used for kernel mapping. Each segment contains 2^28 bytes. Each
- * context maps 2^46 bytes (64TB).
+ * context maps 2^49 bytes (512TB).
  *
  * We also need to avoid the last segment of the last context, because that
  * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
@@ -545,54 +556,46 @@ extern void slb_set_size(u16 size);
 /* Would be nice to use KERNEL_REGION_ID here */
 #define KERNEL_REGION_CONTEXT_OFFSET	(0xc - 1)
 
+/*
+ * For platforms that support on 65bit VA we limit the context bits
+ */
+#define MAX_USER_CONTEXT_65BIT_VA ((ASM_CONST(1) << (65 - (SID_SHIFT + ESID_BITS))) - 2)
+
 /*
  * This should be computed such that protovosid * vsid_mulitplier
- * doesn't overflow 64 bits. It should also be co-prime to vsid_modulus
+ * doesn't overflow 64 bits. The vsid_mutliplier should also be
+ * co-prime to vsid_modulus. We also need to make sure that number
+ * of bits in multiplied result (dividend) is less than twice the number of
+ * protovsid bits for our modulus optmization to work.
+ *
+ * The below table shows the current values used.
+ * |-------+------------+----------------------+------------+-------------------|
+ * |       | Prime Bits | proto VSID_BITS_65VA | Total Bits | 2* prot VSID_BITS |
+ * |-------+------------+----------------------+------------+-------------------|
+ * | 1T    |         24 |                   25 |         49 |                50 |
+ * |-------+------------+----------------------+------------+-------------------|
+ * | 256MB |         24 |                   37 |         61 |                74 |
+ * |-------+------------+----------------------+------------+-------------------|
+ *
+ * |-------+------------+----------------------+------------+--------------------|
+ * |       | Prime Bits | proto VSID_BITS_68VA | Total Bits | 2* proto VSID_BITS |
+ * |-------+------------+----------------------+------------+--------------------|
+ * | 1T    |         24 |                   28 |         52 |                 56 |
+ * |-------+------------+----------------------+------------+--------------------|
+ * | 256MB |         24 |                   40 |         64 |                 80 |
+ * |-------+------------+----------------------+------------+--------------------|
+ *
  */
 #define VSID_MULTIPLIER_256M	ASM_CONST(12538073)	/* 24-bit prime */
-#define VSID_BITS_256M		(CONTEXT_BITS + ESID_BITS)
-#define VSID_MODULUS_256M	((1UL<<VSID_BITS_256M)-1)
+#define VSID_BITS_256M		(VA_BITS - SID_SHIFT)
+#define VSID_BITS_65_256M	(65 - SID_SHIFT)
 
 #define VSID_MULTIPLIER_1T	ASM_CONST(12538073)	/* 24-bit prime */
-#define VSID_BITS_1T		(CONTEXT_BITS + ESID_BITS_1T)
-#define VSID_MODULUS_1T		((1UL<<VSID_BITS_1T)-1)
-
+#define VSID_BITS_1T		(VA_BITS - SID_SHIFT_1T)
+#define VSID_BITS_65_1T		(65 - SID_SHIFT_1T)
 
 #define USER_VSID_RANGE	(1UL << (ESID_BITS + SID_SHIFT))
 
-/*
- * This macro generates asm code to compute the VSID scramble
- * function.  Used in slb_allocate() and do_stab_bolted.  The function
- * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS
- *
- *	rt = register containing the proto-VSID and into which the
- *		VSID will be stored
- *	rx = scratch register (clobbered)
- *
- * 	- rt and rx must be different registers
- * 	- The answer will end up in the low VSID_BITS bits of rt.  The higher
- * 	  bits may contain other garbage, so you may need to mask the
- * 	  result.
- */
-#define ASM_VSID_SCRAMBLE(rt, rx, size)					\
-	lis	rx,VSID_MULTIPLIER_##size@h;				\
-	ori	rx,rx,VSID_MULTIPLIER_##size@l;				\
-	mulld	rt,rt,rx;		/* rt = rt * MULTIPLIER */	\
-									\
-	srdi	rx,rt,VSID_BITS_##size;					\
-	clrldi	rt,rt,(64-VSID_BITS_##size);				\
-	add	rt,rt,rx;		/* add high and low bits */	\
-	/* NOTE: explanation based on VSID_BITS_##size = 36		\
-	 * Now, r3 == VSID (mod 2^36-1), and lies between 0 and		\
-	 * 2^36-1+2^28-1.  That in particular means that if r3 >=	\
-	 * 2^36-1, then r3+1 has the 2^36 bit set.  So, if r3+1 has	\
-	 * the bit clear, r3 already has the answer we want, if it	\
-	 * doesn't, the answer is the low 36 bits of r3+1.  So in all	\
-	 * cases the answer is the low 36 bits of (r3 + ((r3+1) >> 36))*/\
-	addi	rx,rt,1;						\
-	srdi	rx,rx,VSID_BITS_##size;	/* extract 2^VSID_BITS bit */	\
-	add	rt,rt,rx
-
 /* 4 bits per slice and we have one slice per 1TB */
 #define SLICE_ARRAY_SIZE  (H_PGTABLE_RANGE >> 41)
 
@@ -640,7 +643,7 @@ static inline void subpage_prot_init_new_context(struct mm_struct *mm) { }
 #define vsid_scramble(protovsid, size) \
 	((((protovsid) * VSID_MULTIPLIER_##size) % VSID_MODULUS_##size))
 
-#else /* 1 */
+/* simplified form avoiding mod operation */
 #define vsid_scramble(protovsid, size) \
 	({								 \
 		unsigned long x;					 \
@@ -648,6 +651,21 @@ static inline void subpage_prot_init_new_context(struct mm_struct *mm) { }
 		x = (x >> VSID_BITS_##size) + (x & VSID_MODULUS_##size); \
 		(x + ((x+1) >> VSID_BITS_##size)) & VSID_MODULUS_##size; \
 	})
+
+#else /* 1 */
+static inline unsigned long vsid_scramble(unsigned long protovsid,
+				  unsigned long vsid_multiplier, int vsid_bits)
+{
+	unsigned long vsid;
+	unsigned long vsid_modulus = ((1UL << vsid_bits) - 1);
+	/*
+	 * We have same multipler for both 256 and 1T segements now
+	 */
+	vsid = protovsid * vsid_multiplier;
+	vsid = (vsid >> vsid_bits) + (vsid & vsid_modulus);
+	return (vsid + ((vsid + 1) >> vsid_bits)) & vsid_modulus;
+}
+
 #endif /* 1 */
 
 /* Returns the segment size indicator for a user address */
@@ -662,17 +680,30 @@ static inline int user_segment_size(unsigned long addr)
 static inline unsigned long get_vsid(unsigned long context, unsigned long ea,
 				     int ssize)
 {
+	unsigned long va_bits = VA_BITS;
+	unsigned long vsid_bits;
+	unsigned long protovsid;
+
 	/*
 	 * Bad address. We return VSID 0 for that
 	 */
 	if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE)
 		return 0;
 
-	if (ssize == MMU_SEGSIZE_256M)
-		return vsid_scramble((context << ESID_BITS)
-				     | ((ea >> SID_SHIFT) & ESID_BITS_MASK), 256M);
-	return vsid_scramble((context << ESID_BITS_1T)
-			     | ((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK), 1T);
+	if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
+		va_bits = 65;
+
+	if (ssize == MMU_SEGSIZE_256M) {
+		vsid_bits = va_bits - SID_SHIFT;
+		protovsid = (context << ESID_BITS) |
+			((ea >> SID_SHIFT) & ESID_BITS_MASK);
+		return vsid_scramble(protovsid, VSID_MULTIPLIER_256M, vsid_bits);
+	}
+	/* 1T segment */
+	vsid_bits = va_bits - SID_SHIFT_1T;
+	protovsid = (context << ESID_BITS_1T) |
+		((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK);
+	return vsid_scramble(protovsid, VSID_MULTIPLIER_1T, vsid_bits);
 }
 
 /*
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 065e762fae85..78260409dc9c 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -28,6 +28,10 @@
  * Individual features below.
  */
 
+/*
+ * Support for 68 bit VA space. We added that from ISA 2.05
+ */
+#define MMU_FTR_68_BIT_VA		ASM_CONST(0x00002000)
 /*
  * Kernel read only support.
  * We added the ppp value 0b110 in ISA 2.04.
@@ -109,10 +113,10 @@
 #define MMU_FTRS_POWER4		MMU_FTRS_DEFAULT_HPTE_ARCH_V2
 #define MMU_FTRS_PPC970		MMU_FTRS_POWER4 | MMU_FTR_TLBIE_CROP_VA
 #define MMU_FTRS_POWER5		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
-#define MMU_FTRS_POWER6		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
-#define MMU_FTRS_POWER7		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
-#define MMU_FTRS_POWER8		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
-#define MMU_FTRS_POWER9		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
+#define MMU_FTRS_POWER6		MMU_FTRS_POWER5 | MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA
+#define MMU_FTRS_POWER7		MMU_FTRS_POWER6
+#define MMU_FTRS_POWER8		MMU_FTRS_POWER6
+#define MMU_FTRS_POWER9		MMU_FTRS_POWER6
 #define MMU_FTRS_CELL		MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
 				MMU_FTR_CI_LARGE_PAGE
 #define MMU_FTRS_PA6T		MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
@@ -136,7 +140,7 @@ enum {
 		MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL |
 		MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE |
 		MMU_FTR_1T_SEGMENT | MMU_FTR_TLBIE_CROP_VA |
-		MMU_FTR_KERNEL_RO |
+		MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA |
 #ifdef CONFIG_PPC_RADIX_MMU
 		MMU_FTR_TYPE_RADIX |
 #endif
@@ -290,7 +294,10 @@ static inline bool early_radix_enabled(void)
 #define MMU_PAGE_16G	14
 #define MMU_PAGE_64G	15
 
-/* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */
+/*
+ * N.B. we need to change the type of hpte_page_sizes if this gets to be > 16
+ * Also we need to change he type of mm_context.low/high_slices_psize.
+ */
 #define MMU_PAGE_COUNT	16
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index b35f44c98d1f..74b0153780e3 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -229,6 +229,7 @@ void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
 
 static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 {
+	unsigned long vsid_bits = VSID_BITS_65_256M;
 	struct kvmppc_sid_map *map;
 	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 	u16 sid_map_mask;
@@ -257,7 +258,12 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 		kvmppc_mmu_pte_flush(vcpu, 0, 0);
 		kvmppc_mmu_flush_segments(vcpu);
 	}
-	map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++, 256M);
+
+	if (mmu_has_feature(MMU_FTR_68_BIT_VA))
+		vsid_bits = VSID_BITS_256M;
+
+	map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++,
+				       VSID_MULTIPLIER_256M, vsid_bits);
 
 	map->guest_vsid = gvsid;
 	map->valid = true;
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index a10e972221c4..e5fde156e11d 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -59,7 +59,14 @@ again:
 
 int hash__alloc_context_id(void)
 {
-	return alloc_context_id(MIN_USER_CONTEXT, MAX_USER_CONTEXT);
+	unsigned long max;
+
+	if (mmu_has_feature(MMU_FTR_68_BIT_VA))
+		max = MAX_USER_CONTEXT;
+	else
+		max = MAX_USER_CONTEXT_65BIT_VA;
+
+	return alloc_context_id(MIN_USER_CONTEXT, max);
 }
 EXPORT_SYMBOL_GPL(hash__alloc_context_id);
 
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index ba1f8696c338..1c503d07e0fb 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -23,6 +23,48 @@
 #include <asm/pgtable.h>
 #include <asm/firmware.h>
 
+/*
+ * This macro generates asm code to compute the VSID scramble
+ * function.  Used in slb_allocate() and do_stab_bolted.  The function
+ * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS
+ *
+ *	rt = register containing the proto-VSID and into which the
+ *		VSID will be stored
+ *	rx = scratch register (clobbered)
+ *	rf = flags
+ *
+ *	- rt and rx must be different registers
+ *	- The answer will end up in the low VSID_BITS bits of rt.  The higher
+ *	  bits may contain other garbage, so you may need to mask the
+ *	  result.
+ */
+#define ASM_VSID_SCRAMBLE(rt, rx, rf, size)				\
+	lis	rx,VSID_MULTIPLIER_##size@h;				\
+	ori	rx,rx,VSID_MULTIPLIER_##size@l;				\
+	mulld	rt,rt,rx;		/* rt = rt * MULTIPLIER */	\
+/*									\
+ * powermac get slb fault before feature fixup, so make 65 bit part     \
+ * the default part of feature fixup					\
+ */									\
+BEGIN_MMU_FTR_SECTION							\
+	srdi	rx,rt,VSID_BITS_65_##size;				\
+	clrldi	rt,rt,(64-VSID_BITS_65_##size);				\
+	add	rt,rt,rx;						\
+	addi	rx,rt,1;						\
+	srdi	rx,rx,VSID_BITS_65_##size;				\
+	add	rt,rt,rx;						\
+	rldimi	rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_65_##size)); \
+MMU_FTR_SECTION_ELSE							\
+	srdi	rx,rt,VSID_BITS_##size;					\
+	clrldi	rt,rt,(64-VSID_BITS_##size);				\
+	add	rt,rt,rx;		/* add high and low bits */	\
+	addi	rx,rt,1;						\
+	srdi	rx,rx,VSID_BITS_##size;	/* extract 2^VSID_BITS bit */	\
+	add	rt,rt,rx;						\
+	rldimi	rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_##size)); \
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA)
+
+
 /* void slb_allocate_realmode(unsigned long ea);
  *
  * Create an SLB entry for the given EA (user or kernel).
@@ -179,13 +221,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
  */
 .Lslb_finish_load:
 	rldimi  r10,r9,ESID_BITS,0
-	ASM_VSID_SCRAMBLE(r10,r9,256M)
-	/*
-	 * bits above VSID_BITS_256M need to be ignored from r10
-	 * also combine VSID and flags
-	 */
-	rldimi	r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M))
-
+	ASM_VSID_SCRAMBLE(r10,r9,r11,256M)
 	/* r3 = EA, r11 = VSID data */
 	/*
 	 * Find a slot, round robin. Previously we tried to find a
@@ -249,12 +285,12 @@ slb_compare_rr_to_size:
 .Lslb_finish_load_1T:
 	srdi	r10,r10,(SID_SHIFT_1T - SID_SHIFT)	/* get 1T ESID */
 	rldimi  r10,r9,ESID_BITS_1T,0
-	ASM_VSID_SCRAMBLE(r10,r9,1T)
+	ASM_VSID_SCRAMBLE(r10,r9,r11,1T)
 	/*
 	 * bits above VSID_BITS_1T need to be ignored from r10
 	 * also combine VSID and flags
 	 */
-	rldimi	r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T))
+
 	li	r10,MMU_SEGSIZE_1T
 	rldimi	r11,r10,SLB_VSID_SSIZE_SHIFT,0	/* insert segment size */
 
-- 
cgit v1.2.1


From 59248aecc4aea80c2c8b444ff8af9b894302a41d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 09:06:56 +0530
Subject: powerpc/mm/hash: Convert mask to unsigned long

This doesn't have any functional change. But helps in avoiding mistakes
in case the shift bit changes

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index c68102293a19..c93450bf306f 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -409,7 +409,7 @@ static inline unsigned long hpt_vpn(unsigned long ea,
 static inline unsigned long hpt_hash(unsigned long vpn,
 				     unsigned int shift, int ssize)
 {
-	int mask;
+	unsigned long mask;
 	unsigned long hash, vsid;
 
 	/* VPN_SHIFT can be atmost 12 */
-- 
cgit v1.2.1


From f6eedbba7a26fdaee9ea8121336dc86236c136c7 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 09:06:57 +0530
Subject: powerpc/mm/hash: Increase VA range to 128TB

We update the hash linux page table layout such that we can support
512TB. But we limit the TASK_SIZE to 128TB. We can switch to 128TB by
default without conditional because that is the max virtual address
supported by other architectures. We will later add a mechanism to
on-demand increase the application's effective address range to 512TB.

Having the page table layout changed to accommodate 512TB makes testing
large memory configuration easier with less code changes to kernel

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |  2 +-
 arch/powerpc/include/asm/book3s/64/hash-64k.h |  2 +-
 arch/powerpc/include/asm/processor.h          | 22 ++++++++++++++++++----
 3 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 0c4e470571ca..b4b5e6b671ca 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -8,7 +8,7 @@
 #define H_PTE_INDEX_SIZE  9
 #define H_PMD_INDEX_SIZE  7
 #define H_PUD_INDEX_SIZE  9
-#define H_PGD_INDEX_SIZE  9
+#define H_PGD_INDEX_SIZE  12
 
 #ifndef __ASSEMBLY__
 #define H_PTE_TABLE_SIZE	(sizeof(pte_t) << H_PTE_INDEX_SIZE)
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 7be54f9590a3..214219dff87c 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -4,7 +4,7 @@
 #define H_PTE_INDEX_SIZE  8
 #define H_PMD_INDEX_SIZE  5
 #define H_PUD_INDEX_SIZE  5
-#define H_PGD_INDEX_SIZE  12
+#define H_PGD_INDEX_SIZE  15
 
 /*
  * 64k aligned address free up few of the lower bits of RPN for us
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index e0fecbcea2a2..ea4b6c696f1f 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -102,11 +102,25 @@ void release_thread(struct task_struct *);
 #endif
 
 #ifdef CONFIG_PPC64
-/* 64-bit user address space is 46-bits (64TB user VM) */
-#define TASK_SIZE_USER64 (0x0000400000000000UL)
+/*
+ * 64-bit user address space can have multiple limits
+ * For now supported values are:
+ */
+#define TASK_SIZE_64TB  (0x0000400000000000UL)
+#define TASK_SIZE_128TB (0x0000800000000000UL)
+#define TASK_SIZE_512TB (0x0002000000000000UL)
 
-/* 
- * 32-bit user address space is 4GB - 1 page 
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * Max value currently used:
+ */
+#define TASK_SIZE_USER64 TASK_SIZE_128TB
+#else
+#define TASK_SIZE_USER64 TASK_SIZE_64TB
+#endif
+
+/*
+ * 32-bit user address space is 4GB - 1 page
  * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT
  */
 #define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE))
-- 
cgit v1.2.1


From 957b778a166e32e242a33fdab693ffb256a19cbd Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 09:06:58 +0530
Subject: powerpc/mm: Add addr_limit to mm_context and use it to derive max
 slice index

In the followup patch, we will increase the slice array size to handle
512TB range, but will limit the max addr to 128TB. Avoid doing
unnecessary computation and avoid doing slice mask related operation
above address limit.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |  3 ++-
 arch/powerpc/include/asm/book3s/64/mmu.h      |  1 +
 arch/powerpc/kernel/paca.c                    |  3 ++-
 arch/powerpc/kernel/setup-common.c            |  9 +++++++++
 arch/powerpc/mm/mmu_context_book3s64.c        |  7 +++++++
 arch/powerpc/mm/slice.c                       | 20 +++++++++++---------
 6 files changed, 32 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index c93450bf306f..5961b0d65a79 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -597,7 +597,8 @@ extern void slb_set_size(u16 size);
 #define USER_VSID_RANGE	(1UL << (ESID_BITS + SID_SHIFT))
 
 /* 4 bits per slice and we have one slice per 1TB */
-#define SLICE_ARRAY_SIZE  (H_PGTABLE_RANGE >> 41)
+#define SLICE_ARRAY_SIZE	(H_PGTABLE_RANGE >> 41)
+#define TASK_SLICE_ARRAY_SZ(x)	((x)->context.addr_limit >> 41)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index cce434e7de06..c4b865112d24 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -82,6 +82,7 @@ typedef struct {
 #ifdef CONFIG_PPC_MM_SLICES
 	u64 low_slices_psize;	/* SLB page size encodings */
 	unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
+	unsigned long addr_limit;
 #else
 	u16 sllp;		/* SLB page size encoding */
 #endif
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index e2cf745a4b94..a2c7a6456ee6 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -253,9 +253,10 @@ void copy_mm_to_paca(struct mm_struct *mm)
 
 	get_paca()->mm_ctx_id = context->id;
 #ifdef CONFIG_PPC_MM_SLICES
+	VM_BUG_ON(!mm->context.addr_limit);
 	get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
 	memcpy(&get_paca()->mm_ctx_high_slices_psize,
-	       &context->high_slices_psize, SLICE_ARRAY_SIZE);
+	       &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm));
 #else /* CONFIG_PPC_MM_SLICES */
 	get_paca()->mm_ctx_user_psize = context->user_psize;
 	get_paca()->mm_ctx_sllp = context->sllp;
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 4697da895133..a79db6b63466 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -920,6 +920,15 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.end_code = (unsigned long) _etext;
 	init_mm.end_data = (unsigned long) _edata;
 	init_mm.brk = klimit;
+
+#ifdef CONFIG_PPC_MM_SLICES
+#ifdef CONFIG_PPC64
+	init_mm.context.addr_limit = TASK_SIZE_USER64;
+#else
+#error	"context.addr_limit not initialized."
+#endif
+#endif
+
 #ifdef CONFIG_PPC_64K_PAGES
 	init_mm.context.pte_frag = NULL;
 #endif
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index e5fde156e11d..fd0bc6db2dcd 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -78,6 +78,13 @@ static int hash__init_new_context(struct mm_struct *mm)
 	if (index < 0)
 		return index;
 
+	/*
+	 * We do switch_slb() early in fork, even before we setup the
+	 * mm->context.addr_limit. Default to max task size so that we copy the
+	 * default values to paca which will help us to handle slb miss early.
+	 */
+	mm->context.addr_limit = TASK_SIZE_USER64;
+
 	/*
 	 * The old code would re-promote on fork, we don't do that when using
 	 * slices as it could cause problem promoting slices that have been
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 95e5a20b1b6a..ded96edac817 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -136,7 +136,7 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret)
 	if (mm->task_size <= SLICE_LOW_TOP)
 		return;
 
-	for (i = 0; i < SLICE_NUM_HIGH; i++)
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++)
 		if (!slice_high_has_vma(mm, i))
 			__set_bit(i, ret->high_slices);
 }
@@ -157,7 +157,7 @@ static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_ma
 			ret->low_slices |= 1u << i;
 
 	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < SLICE_NUM_HIGH; i++) {
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
 		if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
@@ -165,15 +165,17 @@ static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_ma
 	}
 }
 
-static int slice_check_fit(struct slice_mask mask, struct slice_mask available)
+static int slice_check_fit(struct mm_struct *mm,
+			   struct slice_mask mask, struct slice_mask available)
 {
 	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
+	unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.addr_limit);
 
 	bitmap_and(result, mask.high_slices,
-		   available.high_slices, SLICE_NUM_HIGH);
+		   available.high_slices, slice_count);
 
 	return (mask.low_slices & available.low_slices) == mask.low_slices &&
-		bitmap_equal(result, mask.high_slices, SLICE_NUM_HIGH);
+		bitmap_equal(result, mask.high_slices, slice_count);
 }
 
 static void slice_flush_segments(void *parm)
@@ -217,7 +219,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
 	mm->context.low_slices_psize = lpsizes;
 
 	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < SLICE_NUM_HIGH; i++) {
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
 		if (test_bit(i, mask.high_slices))
@@ -484,7 +486,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 		/* Check if we fit in the good mask. If we do, we just return,
 		 * nothing else to do
 		 */
-		if (slice_check_fit(mask, good_mask)) {
+		if (slice_check_fit(mm, mask, good_mask)) {
 			slice_dbg(" fits good !\n");
 			return addr;
 		}
@@ -509,7 +511,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	slice_or_mask(&potential_mask, &good_mask);
 	slice_print_mask(" potential", potential_mask);
 
-	if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) {
+	if ((addr != 0 || fixed) && slice_check_fit(mm, mask, potential_mask)) {
 		slice_dbg(" fits potential !\n");
 		goto convert;
 	}
@@ -734,6 +736,6 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 	slice_print_mask(" mask", mask);
 	slice_print_mask(" available", available);
 #endif
-	return !slice_check_fit(mask, available);
+	return !slice_check_fit(mm, mask, available);
 }
 #endif
-- 
cgit v1.2.1


From bb1832217a859f6dbe4a45ff2ba7fdcab0bb3958 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 09:06:59 +0530
Subject: powerpc/mm/hash: Store addr_limit in PACA

We optmize the slice page size array copy to paca by copying only the
range based on addr_limit. This will require us to not look at page size
array beyond addr_limit in PACA on slb fault. To enable that copy task
size to paca which will be used during slb fault.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
[mpe: Rename from task_size to addr_limit, consolidate #ifdefs]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/paca.h   | 1 +
 arch/powerpc/kernel/asm-offsets.c | 1 +
 arch/powerpc/kernel/paca.c        | 1 +
 arch/powerpc/mm/slb_low.S         | 8 +++++++-
 4 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index f48c250339fd..65d0ffb2aa33 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -139,6 +139,7 @@ struct paca_struct {
 #ifdef CONFIG_PPC_MM_SLICES
 	u64 mm_ctx_low_slices_psize;
 	unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE];
+	unsigned long addr_limit;
 #else
 	u16 mm_ctx_user_psize;
 	u16 mm_ctx_sllp;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 4367e7df51a1..795505e27535 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -185,6 +185,7 @@ int main(void)
 #ifdef CONFIG_PPC_MM_SLICES
 	OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize);
 	OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize);
+	DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit));
 	DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def));
 #endif /* CONFIG_PPC_MM_SLICES */
 #endif
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index a2c7a6456ee6..8d63627e067f 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -254,6 +254,7 @@ void copy_mm_to_paca(struct mm_struct *mm)
 	get_paca()->mm_ctx_id = context->id;
 #ifdef CONFIG_PPC_MM_SLICES
 	VM_BUG_ON(!mm->context.addr_limit);
+	get_paca()->addr_limit = mm->context.addr_limit;
 	get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
 	memcpy(&get_paca()->mm_ctx_high_slices_psize,
 	       &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm));
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index 1c503d07e0fb..1519617aab36 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -149,7 +149,13 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
 	 * For userspace addresses, make sure this is region 0.
 	 */
 	cmpdi	r9, 0
-	bne	8f
+	bne-	8f
+        /*
+         * user space make sure we are within the allowed limit
+	 */
+	ld	r11,PACA_ADDR_LIMIT(r13)
+	cmpld	r3,r11
+	bge-	8f
 
 	/* when using slices, we extract the psize off the slice bitmaps
 	 * and then we need to get the sllp encoding off the mmu_psize_defs
-- 
cgit v1.2.1


From 82228e362f9b7f4b876d0fbb1036c235797c6b1d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 09:07:00 +0530
Subject: powerpc/pseries: Skip using reserved virtual address range

Now that we use all the available virtual address range, we need to make
sure we don't generate VSID such that it overlaps with the reserved vsid
range. Reserved vsid range include the virtual address range used by the
adjunct partition and also the VRMA virtual segment. We find the context
value that can result in generating such a VSID and reserve it early in
boot.

We don't look at the adjunct range, because for now we disable the
adjunct usage in a Linux LPAR via CAS interface.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
[mpe: Rewrite hash__reserve_context_id(), move the rest into pseries]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |  7 +++
 arch/powerpc/include/asm/kvm_book3s_64.h      |  2 -
 arch/powerpc/include/asm/mmu_context.h        |  1 +
 arch/powerpc/mm/hash_utils_64.c               |  1 -
 arch/powerpc/mm/mmu_context_book3s64.c        | 16 +++++++
 arch/powerpc/platforms/pseries/lpar.c         | 61 +++++++++++++++++++++++++++
 6 files changed, 85 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 5961b0d65a79..6d56974adf28 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -589,11 +589,18 @@ extern void slb_set_size(u16 size);
 #define VSID_MULTIPLIER_256M	ASM_CONST(12538073)	/* 24-bit prime */
 #define VSID_BITS_256M		(VA_BITS - SID_SHIFT)
 #define VSID_BITS_65_256M	(65 - SID_SHIFT)
+/*
+ * Modular multiplicative inverse of VSID_MULTIPLIER under modulo VSID_MODULUS
+ */
+#define VSID_MULINV_256M	ASM_CONST(665548017062)
 
 #define VSID_MULTIPLIER_1T	ASM_CONST(12538073)	/* 24-bit prime */
 #define VSID_BITS_1T		(VA_BITS - SID_SHIFT_1T)
 #define VSID_BITS_65_1T		(65 - SID_SHIFT_1T)
+#define VSID_MULINV_1T		ASM_CONST(209034062)
 
+/* 1TB VSID reserved for VRMA */
+#define VRMA_VSID	0x1ffffffUL
 #define USER_VSID_RANGE	(1UL << (ESID_BITS + SID_SHIFT))
 
 /* 4 bits per slice and we have one slice per 1TB */
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index d9b48f5bb606..d55c7f881ce7 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -49,8 +49,6 @@ static inline bool kvm_is_radix(struct kvm *kvm)
 #define KVM_DEFAULT_HPT_ORDER	24	/* 16MB HPT by default */
 #endif
 
-#define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
-
 /*
  * We use a lock bit in HPTE dword 0 to synchronize updates and
  * accesses to each HPTE, and another bit to indicate non-present
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 7d721101ec78..78803a7ebdd9 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -52,6 +52,7 @@ static inline void switch_mmu_context(struct mm_struct *prev,
 }
 
 extern int hash__alloc_context_id(void);
+extern void hash__reserve_context_id(int id);
 extern void __destroy_context(int context_id);
 static inline void mmu_context_init(void) { }
 #else
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 716255f88bbd..8848fec51ce9 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1868,5 +1868,4 @@ static int __init hash64_debugfs(void)
 	return 0;
 }
 machine_device_initcall(pseries, hash64_debugfs);
-
 #endif /* CONFIG_DEBUG_FS */
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index fd0bc6db2dcd..7bc5b63034db 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -57,6 +57,22 @@ again:
 	return index;
 }
 
+void hash__reserve_context_id(int id)
+{
+	int rc, result = 0;
+
+	do {
+		if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
+			break;
+
+		spin_lock(&mmu_context_lock);
+		rc = ida_get_new_above(&mmu_context_ida, id, &result);
+		spin_unlock(&mmu_context_lock);
+	} while (rc == -EAGAIN);
+
+	WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result);
+}
+
 int hash__alloc_context_id(void)
 {
 	unsigned long max;
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 8b1fe895daa3..6541d0b03e4c 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -958,3 +958,64 @@ int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
 
 	return rc;
 }
+
+static unsigned long vsid_unscramble(unsigned long vsid, int ssize)
+{
+	unsigned long protovsid;
+	unsigned long va_bits = VA_BITS;
+	unsigned long modinv, vsid_modulus;
+	unsigned long max_mod_inv, tmp_modinv;
+
+	if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
+		va_bits = 65;
+
+	if (ssize == MMU_SEGSIZE_256M) {
+		modinv = VSID_MULINV_256M;
+		vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1);
+	} else {
+		modinv = VSID_MULINV_1T;
+		vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1);
+	}
+
+	/*
+	 * vsid outside our range.
+	 */
+	if (vsid >= vsid_modulus)
+		return 0;
+
+	/*
+	 * If modinv is the modular multiplicate inverse of (x % vsid_modulus)
+	 * and vsid = (protovsid * x) % vsid_modulus, then we say:
+	 *   protovsid = (vsid * modinv) % vsid_modulus
+	 */
+
+	/* Check if (vsid * modinv) overflow (63 bits) */
+	max_mod_inv = 0x7fffffffffffffffull / vsid;
+	if (modinv < max_mod_inv)
+		return (vsid * modinv) % vsid_modulus;
+
+	tmp_modinv = modinv/max_mod_inv;
+	modinv %= max_mod_inv;
+
+	protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus;
+	protovsid = (protovsid + vsid * modinv) % vsid_modulus;
+
+	return protovsid;
+}
+
+static int __init reserve_vrma_context_id(void)
+{
+	unsigned long protovsid;
+
+	/*
+	 * Reserve context ids which map to reserved virtual addresses. For now
+	 * we only reserve the context id which maps to the VRMA VSID. We ignore
+	 * the addresses in "ibm,adjunct-virtual-addresses" because we don't
+	 * enable adjunct support via the "ibm,client-architecture-support"
+	 * interface.
+	 */
+	protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T);
+	hash__reserve_context_id(protovsid >> ESID_BITS_1T);
+	return 0;
+}
+machine_device_initcall(pseries, reserve_vrma_context_id);
-- 
cgit v1.2.1


From fbfef9027c2a7ad9277755509fdb849dbccfe8c1 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 09:07:01 +0530
Subject: powerpc/mm: Switch some TASK_SIZE checks to use mm_context addr_limit

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/hugetlbpage-radix.c |  4 ++--
 arch/powerpc/mm/mmap.c              | 12 ++++++------
 arch/powerpc/mm/slice.c             |  6 +++---
 arch/powerpc/mm/subpage-prot.c      |  3 ++-
 4 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index 35254a678456..95207c117b82 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -52,7 +52,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 
 	if (len & ~huge_page_mask(h))
 		return -EINVAL;
-	if (len > TASK_SIZE)
+	if (len > mm->context.addr_limit)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED) {
@@ -64,7 +64,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	if (addr) {
 		addr = ALIGN(addr, huge_page_size(h));
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
+		if (mm->context.addr_limit - len >= addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index a5d9ef59debe..6e9fead92969 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -97,7 +97,7 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	struct vm_area_struct *vma;
 	struct vm_unmapped_area_info info;
 
-	if (len > TASK_SIZE - mmap_min_addr)
+	if (len > mm->context.addr_limit - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -106,7 +106,7 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+		if (mm->context.addr_limit - len >= addr && addr >= mmap_min_addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
@@ -114,7 +114,7 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	info.flags = 0;
 	info.length = len;
 	info.low_limit = mm->mmap_base;
-	info.high_limit = TASK_SIZE;
+	info.high_limit = mm->context.addr_limit;
 	info.align_mask = 0;
 	return vm_unmapped_area(&info);
 }
@@ -132,7 +132,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	struct vm_unmapped_area_info info;
 
 	/* requested length too big for entire address space */
-	if (len > TASK_SIZE - mmap_min_addr)
+	if (len > mm->context.addr_limit - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -142,7 +142,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+		if (mm->context.addr_limit - len >= addr && addr >= mmap_min_addr &&
 				(!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
@@ -164,7 +164,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 		VM_BUG_ON(addr != -ENOMEM);
 		info.flags = 0;
 		info.low_limit = TASK_UNMAPPED_BASE;
-		info.high_limit = TASK_SIZE;
+		info.high_limit = mm->context.addr_limit;
 		addr = vm_unmapped_area(&info);
 	}
 
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index ded96edac817..19d8788820e1 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -277,7 +277,7 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 	info.align_offset = 0;
 
 	addr = TASK_UNMAPPED_BASE;
-	while (addr < TASK_SIZE) {
+	while (addr < mm->context.addr_limit) {
 		info.low_limit = addr;
 		if (!slice_scan_available(addr, available, 1, &addr))
 			continue;
@@ -289,8 +289,8 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 		 * Check if we need to reduce the range, or if we can
 		 * extend it to cover the next available slice.
 		 */
-		if (addr >= TASK_SIZE)
-			addr = TASK_SIZE;
+		if (addr >= mm->context.addr_limit)
+			addr = mm->context.addr_limit;
 		else if (slice_scan_available(addr, available, 1, &next_end)) {
 			addr = next_end;
 			goto next_slice;
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 94210940112f..a409f78d206b 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -197,7 +197,8 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
 
 	/* Check parameters */
 	if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
-	    addr >= TASK_SIZE || len >= TASK_SIZE || addr + len > TASK_SIZE)
+	    addr >= mm->context.addr_limit || len >= mm->context.addr_limit ||
+	    addr + len > mm->context.addr_limit)
 		return -EINVAL;
 
 	if (is_hugepage_only_range(mm, addr, len))
-- 
cgit v1.2.1


From f4ea6dcb08ea2c6c996c373573caf74d48d23b84 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 30 Mar 2017 16:35:21 +0530
Subject: powerpc/mm: Enable mappings above 128TB

Not all user space application is ready to handle wide addresses. It's
known that at least some JIT compilers use higher bits in pointers to
encode their information. It collides with valid pointers with 512TB
addresses and leads to crashes.

To mitigate this, we are not going to allocate virtual address space
above 128TB by default.

But userspace can ask for allocation from full address space by
specifying hint address (with or without MAP_FIXED) above 128TB.

If hint address set above 128TB, but MAP_FIXED is not specified, we try
to look for unmapped area by specified address. If it's already
occupied, we look for unmapped area in *full* address space, rather than
from 128TB window.

This approach helps to easily make application's memory allocator aware
about large address space without manually tracking allocated virtual
address space.

This is going to be a per mmap decision. ie, we can have some mmaps with
larger addresses and other that do not.

A sample memory layout looks like:

  10000000-10010000 r-xp 00000000 fc:00 9057045          /home/max_addr_512TB
  10010000-10020000 r--p 00000000 fc:00 9057045          /home/max_addr_512TB
  10020000-10030000 rw-p 00010000 fc:00 9057045          /home/max_addr_512TB
  10029630000-10029660000 rw-p 00000000 00:00 0          [heap]
  7fff834a0000-7fff834b0000 rw-p 00000000 00:00 0
  7fff834b0000-7fff83670000 r-xp 00000000 fc:00 9177190  /lib/powerpc64le-linux-gnu/libc-2.23.so
  7fff83670000-7fff83680000 r--p 001b0000 fc:00 9177190  /lib/powerpc64le-linux-gnu/libc-2.23.so
  7fff83680000-7fff83690000 rw-p 001c0000 fc:00 9177190  /lib/powerpc64le-linux-gnu/libc-2.23.so
  7fff83690000-7fff836a0000 rw-p 00000000 00:00 0
  7fff836a0000-7fff836c0000 r-xp 00000000 00:00 0        [vdso]
  7fff836c0000-7fff83700000 r-xp 00000000 fc:00 9177193  /lib/powerpc64le-linux-gnu/ld-2.23.so
  7fff83700000-7fff83710000 r--p 00030000 fc:00 9177193  /lib/powerpc64le-linux-gnu/ld-2.23.so
  7fff83710000-7fff83720000 rw-p 00040000 fc:00 9177193  /lib/powerpc64le-linux-gnu/ld-2.23.so
  7fffdccf0000-7fffdcd20000 rw-p 00000000 00:00 0        [stack]
  1000000000000-1000000010000 rw-p 00000000 00:00 0
  1ffff83710000-1ffff83720000 rw-p 00000000 00:00 0

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/processor.h   | 23 +++++++++++----
 arch/powerpc/kernel/setup-common.c     |  2 +-
 arch/powerpc/mm/hugetlbpage-radix.c    |  7 +++++
 arch/powerpc/mm/mmap.c                 | 32 +++++++++++++-------
 arch/powerpc/mm/mmu_context_book3s64.c |  2 +-
 arch/powerpc/mm/slice.c                | 54 ++++++++++++++++++++++++++--------
 6 files changed, 89 insertions(+), 31 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index ea4b6c696f1f..a4b1d8d6b793 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -114,9 +114,9 @@ void release_thread(struct task_struct *);
 /*
  * Max value currently used:
  */
-#define TASK_SIZE_USER64 TASK_SIZE_128TB
+#define TASK_SIZE_USER64	TASK_SIZE_512TB
 #else
-#define TASK_SIZE_USER64 TASK_SIZE_64TB
+#define TASK_SIZE_USER64	TASK_SIZE_64TB
 #endif
 
 /*
@@ -128,26 +128,37 @@ void release_thread(struct task_struct *);
 #define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \
 		TASK_SIZE_USER32 : TASK_SIZE_USER64)
 #define TASK_SIZE	  TASK_SIZE_OF(current)
-
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
  */
 #define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4))
-#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_USER64 / 4))
+#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_128TB / 4))
 
 #define TASK_UNMAPPED_BASE ((is_32bit_task()) ? \
 		TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 )
 #endif
 
+/*
+ * Initial task size value for user applications. For book3s 64 we start
+ * with 128TB and conditionally enable upto 512TB
+ */
+#ifdef CONFIG_PPC_BOOK3S_64
+#define DEFAULT_MAP_WINDOW	((is_32bit_task()) ? \
+				 TASK_SIZE_USER32 : TASK_SIZE_128TB)
+#else
+#define DEFAULT_MAP_WINDOW	TASK_SIZE
+#endif
+
 #ifdef __powerpc64__
 
-#define STACK_TOP_USER64 TASK_SIZE_USER64
+/* Limit stack to 128TB */
+#define STACK_TOP_USER64 TASK_SIZE_128TB
 #define STACK_TOP_USER32 TASK_SIZE_USER32
 
 #define STACK_TOP (is_32bit_task() ? \
 		   STACK_TOP_USER32 : STACK_TOP_USER64)
 
-#define STACK_TOP_MAX STACK_TOP_USER64
+#define STACK_TOP_MAX TASK_SIZE_USER64
 
 #else /* __powerpc64__ */
 
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index a79db6b63466..fcdca741f660 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -923,7 +923,7 @@ void __init setup_arch(char **cmdline_p)
 
 #ifdef CONFIG_PPC_MM_SLICES
 #ifdef CONFIG_PPC64
-	init_mm.context.addr_limit = TASK_SIZE_USER64;
+	init_mm.context.addr_limit = TASK_SIZE_128TB;
 #else
 #error	"context.addr_limit not initialized."
 #endif
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index 95207c117b82..0aa9cade422f 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -50,6 +50,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	struct hstate *h = hstate_file(file);
 	struct vm_unmapped_area_info info;
 
+	if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
+		mm->context.addr_limit = TASK_SIZE;
+
 	if (len & ~huge_page_mask(h))
 		return -EINVAL;
 	if (len > mm->context.addr_limit)
@@ -78,5 +81,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	info.high_limit = current->mm->mmap_base;
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
+
+	if (addr > DEFAULT_MAP_WINDOW)
+		info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
+
 	return vm_unmapped_area(&info);
 }
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 6e9fead92969..b2111baa0da6 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -79,7 +79,7 @@ static inline unsigned long mmap_base(unsigned long rnd)
 	else if (gap > MAX_GAP)
 		gap = MAX_GAP;
 
-	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
+	return PAGE_ALIGN(DEFAULT_MAP_WINDOW - gap - rnd);
 }
 
 #ifdef CONFIG_PPC_RADIX_MMU
@@ -97,6 +97,9 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	struct vm_area_struct *vma;
 	struct vm_unmapped_area_info info;
 
+	if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
+		mm->context.addr_limit = TASK_SIZE;
+
 	if (len > mm->context.addr_limit - mmap_min_addr)
 		return -ENOMEM;
 
@@ -114,8 +117,13 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	info.flags = 0;
 	info.length = len;
 	info.low_limit = mm->mmap_base;
-	info.high_limit = mm->context.addr_limit;
 	info.align_mask = 0;
+
+	if (unlikely(addr > DEFAULT_MAP_WINDOW))
+		info.high_limit = mm->context.addr_limit;
+	else
+		info.high_limit = DEFAULT_MAP_WINDOW;
+
 	return vm_unmapped_area(&info);
 }
 
@@ -131,6 +139,9 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	unsigned long addr = addr0;
 	struct vm_unmapped_area_info info;
 
+	if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
+		mm->context.addr_limit = TASK_SIZE;
+
 	/* requested length too big for entire address space */
 	if (len > mm->context.addr_limit - mmap_min_addr)
 		return -ENOMEM;
@@ -152,7 +163,14 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
 	info.high_limit = mm->mmap_base;
 	info.align_mask = 0;
+
+	if (addr > DEFAULT_MAP_WINDOW)
+		info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
+
 	addr = vm_unmapped_area(&info);
+	if (!(addr & ~PAGE_MASK))
+		return addr;
+	VM_BUG_ON(addr != -ENOMEM);
 
 	/*
 	 * A failed mmap() very likely causes application failure,
@@ -160,15 +178,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	 * can happen with large stack limits and large mmap()
 	 * allocations.
 	 */
-	if (addr & ~PAGE_MASK) {
-		VM_BUG_ON(addr != -ENOMEM);
-		info.flags = 0;
-		info.low_limit = TASK_UNMAPPED_BASE;
-		info.high_limit = mm->context.addr_limit;
-		addr = vm_unmapped_area(&info);
-	}
-
-	return addr;
+	return radix__arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 }
 
 static void radix__arch_pick_mmap_layout(struct mm_struct *mm,
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 7bc5b63034db..86719f6b5a7c 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -99,7 +99,7 @@ static int hash__init_new_context(struct mm_struct *mm)
 	 * mm->context.addr_limit. Default to max task size so that we copy the
 	 * default values to paca which will help us to handle slb miss early.
 	 */
-	mm->context.addr_limit = TASK_SIZE_USER64;
+	mm->context.addr_limit = TASK_SIZE_128TB;
 
 	/*
 	 * The old code would re-promote on fork, we don't do that when using
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 19d8788820e1..251b6bae7023 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -265,7 +265,7 @@ static bool slice_scan_available(unsigned long addr,
 static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 					      unsigned long len,
 					      struct slice_mask available,
-					      int psize)
+					      int psize, unsigned long high_limit)
 {
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
 	unsigned long addr, found, next_end;
@@ -277,7 +277,10 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 	info.align_offset = 0;
 
 	addr = TASK_UNMAPPED_BASE;
-	while (addr < mm->context.addr_limit) {
+	/*
+	 * Check till the allow max value for this mmap request
+	 */
+	while (addr < high_limit) {
 		info.low_limit = addr;
 		if (!slice_scan_available(addr, available, 1, &addr))
 			continue;
@@ -308,7 +311,7 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 static unsigned long slice_find_area_topdown(struct mm_struct *mm,
 					     unsigned long len,
 					     struct slice_mask available,
-					     int psize)
+					     int psize, unsigned long high_limit)
 {
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
 	unsigned long addr, found, prev;
@@ -320,6 +323,15 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
 	info.align_offset = 0;
 
 	addr = mm->mmap_base;
+	/*
+	 * If we are trying to allocate above DEFAULT_MAP_WINDOW
+	 * Add the different to the mmap_base.
+	 * Only for that request for which high_limit is above
+	 * DEFAULT_MAP_WINDOW we should apply this.
+	 */
+	if (high_limit  > DEFAULT_MAP_WINDOW)
+		addr += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
+
 	while (addr > PAGE_SIZE) {
 		info.high_limit = addr;
 		if (!slice_scan_available(addr - 1, available, 0, &addr))
@@ -351,18 +363,18 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
 	 * can happen with large stack limits and large mmap()
 	 * allocations.
 	 */
-	return slice_find_area_bottomup(mm, len, available, psize);
+	return slice_find_area_bottomup(mm, len, available, psize, high_limit);
 }
 
 
 static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
 				     struct slice_mask mask, int psize,
-				     int topdown)
+				     int topdown, unsigned long high_limit)
 {
 	if (topdown)
-		return slice_find_area_topdown(mm, len, mask, psize);
+		return slice_find_area_topdown(mm, len, mask, psize, high_limit);
 	else
-		return slice_find_area_bottomup(mm, len, mask, psize);
+		return slice_find_area_bottomup(mm, len, mask, psize, high_limit);
 }
 
 static inline void slice_or_mask(struct slice_mask *dst, struct slice_mask *src)
@@ -402,7 +414,22 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
 	struct mm_struct *mm = current->mm;
 	unsigned long newaddr;
+	unsigned long high_limit;
 
+	/*
+	 * Check if we need to expland slice area.
+	 */
+	if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE)) {
+		mm->context.addr_limit = TASK_SIZE;
+		on_each_cpu(slice_flush_segments, mm, 1);
+	}
+	/*
+	 * This mmap request can allocate upt to 512TB
+	 */
+	if (addr > DEFAULT_MAP_WINDOW)
+		high_limit = mm->context.addr_limit;
+	else
+		high_limit = DEFAULT_MAP_WINDOW;
 	/*
 	 * init different masks
 	 */
@@ -494,7 +521,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 		/* Now let's see if we can find something in the existing
 		 * slices for that size
 		 */
-		newaddr = slice_find_area(mm, len, good_mask, psize, topdown);
+		newaddr = slice_find_area(mm, len, good_mask,
+					  psize, topdown, high_limit);
 		if (newaddr != -ENOMEM) {
 			/* Found within the good mask, we don't have to setup,
 			 * we thus return directly
@@ -526,7 +554,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	 * anywhere in the good area.
 	 */
 	if (addr) {
-		addr = slice_find_area(mm, len, good_mask, psize, topdown);
+		addr = slice_find_area(mm, len, good_mask,
+				       psize, topdown, high_limit);
 		if (addr != -ENOMEM) {
 			slice_dbg(" found area at 0x%lx\n", addr);
 			return addr;
@@ -536,14 +565,15 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	/* Now let's see if we can find something in the existing slices
 	 * for that size plus free slices
 	 */
-	addr = slice_find_area(mm, len, potential_mask, psize, topdown);
+	addr = slice_find_area(mm, len, potential_mask,
+			       psize, topdown, high_limit);
 
 #ifdef CONFIG_PPC_64K_PAGES
 	if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
 		/* retry the search with 4k-page slices included */
 		slice_or_mask(&potential_mask, &compat_mask);
-		addr = slice_find_area(mm, len, potential_mask, psize,
-				       topdown);
+		addr = slice_find_area(mm, len, potential_mask,
+				       psize, topdown, high_limit);
 	}
 #endif
 
-- 
cgit v1.2.1


From 5b1d6fc2d4d927852214f2a7e2a8eb1bdce3b574 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Tue, 28 Mar 2017 19:15:04 +0530
Subject: powerpc/book3s: Print the kernel function name in machine check

For D-side errors we print the load/store address that caused the
machine check as 'Effective address'. But the instruction that may have
caused the machine check can also be helpful, so in addition to printing
the NIP, also print the kernel function name as well.

After this patch the MCE console log would look like:

  Severe Machine check interrupt [Recovered]
    NIP [d00000001bc70194]: init_module+0x194/0x2b0 [bork_kernel]
    Initiator: CPU
    Error type: SLB [Parity]
      Effective address: d000000026de0000

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/mce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 399aeafb7dd4..b960f00f4908 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -311,7 +311,7 @@ void machine_check_print_event_info(struct machine_check_event *evt)
 	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
 	       evt->disposition == MCE_DISPOSITION_RECOVERED ?
 	       "Recovered" : "Not recovered");
-	printk("%s  NIP: %016llx\n", level, evt->srr0);
+	printk("%s  NIP [%016llx]: %pS\n", level, evt->srr0, (void *)evt->srr0);
 	printk("%s  Initiator: %s\n", level,
 	       evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
 	switch (evt->error_type) {
-- 
cgit v1.2.1


From 63f44d65143976af9b28cf70ebe23f2c8b38677a Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 3 Apr 2017 15:29:34 +1000
Subject: powerpc/book3s: Print task info if we take a machine check in user
 mode

For an MCE (Machine Check Exception) that hits while in user mode
MSR(PR=1), print the task info to the console MCE error log. This may
help to identify an application that triggered the MCE.

After this patch the MCE console looks like:

  Severe Machine check interrupt [Recovered]
    NIP: [0000000010039778] PID: 762 Comm: ebizzy
    Initiator: CPU
    Error type: SLB [Multihit]
      Effective address: 0000000010039778

  Severe Machine check interrupt [Not recovered]
    NIP: [0000000010039778] PID: 763 Comm: ebizzy
    Initiator: CPU
    Error type: UE [Page table walk ifetch]
      Effective address: 0000000010039778
  ebizzy[763]: unhandled signal 7 at 0000000010039778 nip 0000000010039778 lr 0000000010001b44 code 30004

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/mce.h        |  3 ++-
 arch/powerpc/kernel/mce.c             | 15 ++++++++++++---
 arch/powerpc/platforms/powernv/opal.c |  2 +-
 3 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index e3498b446788..81eff8631434 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -207,7 +207,8 @@ extern void save_mce_event(struct pt_regs *regs, long handled,
 extern int get_mce_event(struct machine_check_event *mce, bool release);
 extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
-extern void machine_check_print_event_info(struct machine_check_event *evt);
+extern void machine_check_print_event_info(struct machine_check_event *evt,
+					   bool user_mode);
 extern uint64_t get_mce_fault_addr(struct machine_check_event *evt);
 
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b960f00f4908..16eb0b508761 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -228,12 +228,13 @@ static void machine_check_process_queued_event(struct irq_work *work)
 	while (__this_cpu_read(mce_queue_count) > 0) {
 		index = __this_cpu_read(mce_queue_count) - 1;
 		machine_check_print_event_info(
-				this_cpu_ptr(&mce_event_queue[index]));
+				this_cpu_ptr(&mce_event_queue[index]), false);
 		__this_cpu_dec(mce_queue_count);
 	}
 }
 
-void machine_check_print_event_info(struct machine_check_event *evt)
+void machine_check_print_event_info(struct machine_check_event *evt,
+				    bool user_mode)
 {
 	const char *level, *sevstr, *subtype;
 	static const char *mc_ue_types[] = {
@@ -311,7 +312,15 @@ void machine_check_print_event_info(struct machine_check_event *evt)
 	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
 	       evt->disposition == MCE_DISPOSITION_RECOVERED ?
 	       "Recovered" : "Not recovered");
-	printk("%s  NIP [%016llx]: %pS\n", level, evt->srr0, (void *)evt->srr0);
+
+	if (user_mode) {
+		printk("%s  NIP: [%016llx] PID: %d Comm: %s\n", level,
+			evt->srr0, current->pid, current->comm);
+	} else {
+		printk("%s  NIP [%016llx]: %pS\n", level, evt->srr0,
+		       (void *)evt->srr0);
+	}
+
 	printk("%s  Initiator: %s\n", level,
 	       evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
 	switch (evt->error_type) {
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index e0f856bfbfe8..296c9426f72b 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -435,7 +435,7 @@ int opal_machine_check(struct pt_regs *regs)
 		       evt.version);
 		return 0;
 	}
-	machine_check_print_event_info(&evt);
+	machine_check_print_event_info(&evt, user_mode(regs));
 
 	if (opal_recover_mce(regs, &evt))
 		return 1;
-- 
cgit v1.2.1


From a7a9dcd882a67b68568868b988289fce5ffd8419 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Mon, 3 Apr 2017 16:41:02 +1000
Subject: powerpc: Avoid taking a data miss on every userspace instruction miss

Early on in do_page_fault() we call store_updates_sp(), regardless of
the type of exception. For an instruction miss this doesn't make
sense, because we only use this information to detect if a data miss
is the result of a stack expansion instruction or not.

Worse still, it results in a data miss within every userspace
instruction miss handler, because we try and load the very instruction
we are about to install a pte for!

A simple exec microbenchmark runs 6% faster on POWER8 with this fix:

 #include <stdlib.h>
 #include <stdio.h>
 #include <unistd.h>

int main(int argc, char *argv[])
{
	unsigned long left = atol(argv[1]);
	char leftstr[16];

	if (left-- == 0)
		return 0;

	sprintf(leftstr, "%ld", left);
	execlp(argv[0], argv[0], leftstr, NULL);
	perror("exec failed\n");

	return 0;
}

Pass the number of iterations on the command line (eg 10000) and time
how long it takes to execute.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/fault.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index fd6484fc2fa9..3a7d580fdc59 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -287,7 +287,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	 * can result in fault, which will cause a deadlock when called with
 	 * mmap_sem held
 	 */
-	if (user_mode(regs))
+	if (!is_exec && user_mode(regs))
 		store_update_sp = store_updates_sp(regs);
 
 	if (user_mode(regs))
-- 
cgit v1.2.1


From f6f9195ba0b0b1406af384037d9e7eb8215aca98 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Mon, 3 Apr 2017 18:09:06 +1000
Subject: powerpc/mm: Remove stale comment about the DART hole

The code to fix the problem it describes was removed in commit
c40785ad305b ("powerpc/dart: Use a cachable DART"), and it uses the
stupid comment style. Away it goooooooooooooes!

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/hash_utils_64.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 8848fec51ce9..69a05b3e4d3f 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -927,11 +927,6 @@ static void __init htab_initialize(void)
 	}
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
-	/* On U3 based machines, we need to reserve the DART area and
-	 * _NOT_ map it to avoid cache paradoxes as it's remapped non
-	 * cacheable later on
-	 */
-
 	/* create bolted the linear mapping in the hash table */
 	for_each_memblock(memory, reg) {
 		base = (unsigned long)__va(reg->base);
-- 
cgit v1.2.1


From 4c3b89effc281704d5395282c800c45e453235f6 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Mon, 3 Apr 2017 19:51:43 +1000
Subject: powerpc/powernv: Add sanity checks to pnv_pci_get_{gpu|npu}_dev

The pnv_pci_get_{gpu|npu}_dev functions are used to find associations
between nvlink PCIe devices and standard PCIe devices. However they
lacked basic sanity checking which results in NULL pointer
dereferencing if they are incorrect called can be harder to spot than
an explicit WARN_ON.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/npu-dma.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 1c383f38031d..050bd5da7096 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -37,6 +37,12 @@ struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
 	struct device_node *dn;
 	struct pci_dev *gpdev;
 
+	if (WARN_ON(!npdev))
+		return NULL;
+
+	if (WARN_ON(!npdev->dev.of_node))
+		return NULL;
+
 	/* Get assoicated PCI device */
 	dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
 	if (!dn)
@@ -55,6 +61,12 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
 	struct device_node *dn;
 	struct pci_dev *npdev;
 
+	if (WARN_ON(!gpdev))
+		return NULL;
+
+	if (WARN_ON(!gpdev->dev.of_node))
+		return NULL;
+
 	/* Get assoicated PCI device */
 	dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
 	if (!dn)
-- 
cgit v1.2.1


From 1ab66d1fbadad86b1f4a9c7857e193af0ee0022c Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Mon, 3 Apr 2017 19:51:44 +1000
Subject: powerpc/powernv: Introduce address translation services for Nvlink2

Nvlink2 supports address translation services (ATS) allowing devices
to request address translations from an mmu known as the nest MMU
which is setup to walk the CPU page tables.

To access this functionality certain firmware calls are required to
setup and manage hardware context tables in the nvlink processing unit
(NPU). The NPU also manages forwarding of TLB invalidates (known as
address translation shootdowns/ATSDs) to attached devices.

This patch exports several methods to allow device drivers to register
a process id (PASID/PID) in the hardware tables and to receive
notification of when a device should stop issuing address translation
requests (ATRs). It also adds a fault handler to allow device drivers
to demand fault pages in.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
[mpe: Fix up comment formatting, use flush_tlb_mm()]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/mmu.h       |   6 +
 arch/powerpc/include/asm/opal-api.h            |   5 +-
 arch/powerpc/include/asm/opal.h                |   5 +
 arch/powerpc/include/asm/powernv.h             |  22 ++
 arch/powerpc/mm/mmu_context_book3s64.c         |   2 +
 arch/powerpc/platforms/powernv/npu-dma.c       | 450 +++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/opal-wrappers.S |   3 +
 arch/powerpc/platforms/powernv/pci-ioda.c      |   2 +
 arch/powerpc/platforms/powernv/pci.h           |  15 +-
 9 files changed, 508 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index c4b865112d24..77529a3e3811 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -75,10 +75,16 @@ extern struct patb_entry *partition_tb;
 typedef unsigned long mm_context_id_t;
 struct spinlock;
 
+/* Maximum possible number of NPUs in a system. */
+#define NV_MAX_NPUS 8
+
 typedef struct {
 	mm_context_id_t id;
 	u16 user_psize;		/* page size index */
 
+	/* NPU NMMU context */
+	struct npu_context *npu_context;
+
 #ifdef CONFIG_PPC_MM_SLICES
 	u64 low_slices_psize;	/* SLB page size encodings */
 	unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index a0aa285869b5..a599a2c893c3 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -168,7 +168,10 @@
 #define OPAL_INT_SET_MFRR			125
 #define OPAL_PCI_TCE_KILL			126
 #define OPAL_NMMU_SET_PTCR			127
-#define OPAL_LAST				127
+#define OPAL_NPU_INIT_CONTEXT			146
+#define OPAL_NPU_DESTROY_CONTEXT		147
+#define OPAL_NPU_MAP_LPAR			148
+#define OPAL_LAST				148
 
 /* Device tree flags */
 
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 1ff03a6da76e..b3b97c4cd54b 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -29,6 +29,11 @@ extern struct device_node *opal_node;
 
 /* API functions */
 int64_t opal_invalid_call(void);
+int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid, uint64_t bdf);
+int64_t opal_npu_init_context(uint64_t phb_id, int pasid, uint64_t msr,
+			uint64_t bdf);
+int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid,
+			uint64_t lpcr);
 int64_t opal_console_write(int64_t term_number, __be64 *length,
 			   const uint8_t *buffer);
 int64_t opal_console_read(int64_t term_number, __be64 *length,
diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h
index 0e9c2402dd20..f62797702300 100644
--- a/arch/powerpc/include/asm/powernv.h
+++ b/arch/powerpc/include/asm/powernv.h
@@ -11,9 +11,31 @@
 #define _ASM_POWERNV_H
 
 #ifdef CONFIG_PPC_POWERNV
+#define NPU2_WRITE 1
 extern void powernv_set_nmmu_ptcr(unsigned long ptcr);
+extern struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
+			unsigned long flags,
+			struct npu_context *(*cb)(struct npu_context *, void *),
+			void *priv);
+extern void pnv_npu2_destroy_context(struct npu_context *context,
+				struct pci_dev *gpdev);
+extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
+				unsigned long *flags, unsigned long *status,
+				int count);
 #else
 static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { }
+static inline struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
+			unsigned long flags,
+			struct npu_context *(*cb)(struct npu_context *, void *),
+			void *priv) { return ERR_PTR(-ENODEV); }
+static inline void pnv_npu2_destroy_context(struct npu_context *context,
+					struct pci_dev *gpdev) { }
+
+static inline int pnv_npu2_handle_fault(struct npu_context *context,
+					uintptr_t *ea, unsigned long *flags,
+					unsigned long *status, int count) {
+	return -ENODEV;
+}
 #endif
 
 #endif /* _ASM_POWERNV_H */
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 86719f6b5a7c..c6dca2ae78ef 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -138,6 +138,8 @@ static int radix__init_new_context(struct mm_struct *mm)
 	rts_field = radix__get_tree_size();
 	process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);
 
+	mm->context.npu_context = NULL;
+
 	return index;
 }
 
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 050bd5da7096..4c88c3e6ec9e 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -9,11 +9,20 @@
  * License as published by the Free Software Foundation.
  */
 
+#include <linux/slab.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mmu_context.h>
+#include <linux/of.h>
 #include <linux/export.h>
 #include <linux/pci.h>
 #include <linux/memblock.h>
 #include <linux/iommu.h>
 
+#include <asm/tlb.h>
+#include <asm/powernv.h>
+#include <asm/reg.h>
+#include <asm/opal.h>
+#include <asm/io.h>
 #include <asm/iommu.h>
 #include <asm/pnv-pci.h>
 #include <asm/msi_bitmap.h>
@@ -22,6 +31,8 @@
 #include "powernv.h"
 #include "pci.h"
 
+#define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
+
 /*
  * Other types of TCE cache invalidation are not functional in the
  * hardware.
@@ -371,3 +382,442 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
 
 	return gpe;
 }
+
+/* Maximum number of nvlinks per npu */
+#define NV_MAX_LINKS 6
+
+/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
+static int max_npu2_index;
+
+struct npu_context {
+	struct mm_struct *mm;
+	struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
+	struct mmu_notifier mn;
+	struct kref kref;
+
+	/* Callback to stop translation requests on a given GPU */
+	struct npu_context *(*release_cb)(struct npu_context *, void *);
+
+	/*
+	 * Private pointer passed to the above callback for usage by
+	 * device drivers.
+	 */
+	void *priv;
+};
+
+/*
+ * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
+ * if none are available.
+ */
+static int get_mmio_atsd_reg(struct npu *npu)
+{
+	int i;
+
+	for (i = 0; i < npu->mmio_atsd_count; i++) {
+		if (!test_and_set_bit(i, &npu->mmio_atsd_usage))
+			return i;
+	}
+
+	return -ENOSPC;
+}
+
+static void put_mmio_atsd_reg(struct npu *npu, int reg)
+{
+	clear_bit(reg, &npu->mmio_atsd_usage);
+}
+
+/* MMIO ATSD register offsets */
+#define XTS_ATSD_AVA  1
+#define XTS_ATSD_STAT 2
+
+static int mmio_launch_invalidate(struct npu *npu, unsigned long launch,
+				unsigned long va)
+{
+	int mmio_atsd_reg;
+
+	do {
+		mmio_atsd_reg = get_mmio_atsd_reg(npu);
+		cpu_relax();
+	} while (mmio_atsd_reg < 0);
+
+	__raw_writeq(cpu_to_be64(va),
+		npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA);
+	eieio();
+	__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]);
+
+	return mmio_atsd_reg;
+}
+
+static int mmio_invalidate_pid(struct npu *npu, unsigned long pid)
+{
+	unsigned long launch;
+
+	/* IS set to invalidate matching PID */
+	launch = PPC_BIT(12);
+
+	/* PRS set to process-scoped */
+	launch |= PPC_BIT(13);
+
+	/* AP */
+	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+
+	/* PID */
+	launch |= pid << PPC_BITLSHIFT(38);
+
+	/* Invalidating the entire process doesn't use a va */
+	return mmio_launch_invalidate(npu, launch, 0);
+}
+
+static int mmio_invalidate_va(struct npu *npu, unsigned long va,
+			unsigned long pid)
+{
+	unsigned long launch;
+
+	/* IS set to invalidate target VA */
+	launch = 0;
+
+	/* PRS set to process scoped */
+	launch |= PPC_BIT(13);
+
+	/* AP */
+	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+
+	/* PID */
+	launch |= pid << PPC_BITLSHIFT(38);
+
+	return mmio_launch_invalidate(npu, launch, va);
+}
+
+#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
+
+/*
+ * Invalidate either a single address or an entire PID depending on
+ * the value of va.
+ */
+static void mmio_invalidate(struct npu_context *npu_context, int va,
+			unsigned long address)
+{
+	int i, j, reg;
+	struct npu *npu;
+	struct pnv_phb *nphb;
+	struct pci_dev *npdev;
+	struct {
+		struct npu *npu;
+		int reg;
+	} mmio_atsd_reg[NV_MAX_NPUS];
+	unsigned long pid = npu_context->mm->context.id;
+
+	/*
+	 * Loop over all the NPUs this process is active on and launch
+	 * an invalidate.
+	 */
+	for (i = 0; i <= max_npu2_index; i++) {
+		mmio_atsd_reg[i].reg = -1;
+		for (j = 0; j < NV_MAX_LINKS; j++) {
+			npdev = npu_context->npdev[i][j];
+			if (!npdev)
+				continue;
+
+			nphb = pci_bus_to_host(npdev->bus)->private_data;
+			npu = &nphb->npu;
+			mmio_atsd_reg[i].npu = npu;
+
+			if (va)
+				mmio_atsd_reg[i].reg =
+					mmio_invalidate_va(npu, address, pid);
+			else
+				mmio_atsd_reg[i].reg =
+					mmio_invalidate_pid(npu, pid);
+
+			/*
+			 * The NPU hardware forwards the shootdown to all GPUs
+			 * so we only have to launch one shootdown per NPU.
+			 */
+			break;
+		}
+	}
+
+	/*
+	 * Unfortunately the nest mmu does not support flushing specific
+	 * addresses so we have to flush the whole mm.
+	 */
+	flush_tlb_mm(npu_context->mm);
+
+	/* Wait for all invalidations to complete */
+	for (i = 0; i <= max_npu2_index; i++) {
+		if (mmio_atsd_reg[i].reg < 0)
+			continue;
+
+		/* Wait for completion */
+		npu = mmio_atsd_reg[i].npu;
+		reg = mmio_atsd_reg[i].reg;
+		while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
+			cpu_relax();
+		put_mmio_atsd_reg(npu, reg);
+	}
+}
+
+static void pnv_npu2_mn_release(struct mmu_notifier *mn,
+				struct mm_struct *mm)
+{
+	struct npu_context *npu_context = mn_to_npu_context(mn);
+
+	/* Call into device driver to stop requests to the NMMU */
+	if (npu_context->release_cb)
+		npu_context->release_cb(npu_context, npu_context->priv);
+
+	/*
+	 * There should be no more translation requests for this PID, but we
+	 * need to ensure any entries for it are removed from the TLB.
+	 */
+	mmio_invalidate(npu_context, 0, 0);
+}
+
+static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long address,
+				pte_t pte)
+{
+	struct npu_context *npu_context = mn_to_npu_context(mn);
+
+	mmio_invalidate(npu_context, 1, address);
+}
+
+static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn,
+					struct mm_struct *mm,
+					unsigned long address)
+{
+	struct npu_context *npu_context = mn_to_npu_context(mn);
+
+	mmio_invalidate(npu_context, 1, address);
+}
+
+static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
+					struct mm_struct *mm,
+					unsigned long start, unsigned long end)
+{
+	struct npu_context *npu_context = mn_to_npu_context(mn);
+	unsigned long address;
+
+	for (address = start; address <= end; address += PAGE_SIZE)
+		mmio_invalidate(npu_context, 1, address);
+}
+
+static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
+	.release = pnv_npu2_mn_release,
+	.change_pte = pnv_npu2_mn_change_pte,
+	.invalidate_page = pnv_npu2_mn_invalidate_page,
+	.invalidate_range = pnv_npu2_mn_invalidate_range,
+};
+
+/*
+ * Call into OPAL to setup the nmmu context for the current task in
+ * the NPU. This must be called to setup the context tables before the
+ * GPU issues ATRs. pdev should be a pointed to PCIe GPU device.
+ *
+ * A release callback should be registered to allow a device driver to
+ * be notified that it should not launch any new translation requests
+ * as the final TLB invalidate is about to occur.
+ *
+ * Returns an error if there no contexts are currently available or a
+ * npu_context which should be passed to pnv_npu2_handle_fault().
+ *
+ * mmap_sem must be held in write mode.
+ */
+struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
+			unsigned long flags,
+			struct npu_context *(*cb)(struct npu_context *, void *),
+			void *priv)
+{
+	int rc;
+	u32 nvlink_index;
+	struct device_node *nvlink_dn;
+	struct mm_struct *mm = current->mm;
+	struct pnv_phb *nphb;
+	struct npu *npu;
+	struct npu_context *npu_context;
+
+	/*
+	 * At present we don't support GPUs connected to multiple NPUs and I'm
+	 * not sure the hardware does either.
+	 */
+	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return ERR_PTR(-ENODEV);
+
+	if (!npdev)
+		/* No nvlink associated with this GPU device */
+		return ERR_PTR(-ENODEV);
+
+	if (!mm) {
+		/* kernel thread contexts are not supported */
+		return ERR_PTR(-EINVAL);
+	}
+
+	nphb = pci_bus_to_host(npdev->bus)->private_data;
+	npu = &nphb->npu;
+
+	/*
+	 * Setup the NPU context table for a particular GPU. These need to be
+	 * per-GPU as we need the tables to filter ATSDs when there are no
+	 * active contexts on a particular GPU.
+	 */
+	rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
+				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+	if (rc < 0)
+		return ERR_PTR(-ENOSPC);
+
+	/*
+	 * We store the npu pci device so we can more easily get at the
+	 * associated npus.
+	 */
+	npu_context = mm->context.npu_context;
+	if (!npu_context) {
+		npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
+		if (!npu_context)
+			return ERR_PTR(-ENOMEM);
+
+		mm->context.npu_context = npu_context;
+		npu_context->mm = mm;
+		npu_context->mn.ops = &nv_nmmu_notifier_ops;
+		__mmu_notifier_register(&npu_context->mn, mm);
+		kref_init(&npu_context->kref);
+	} else {
+		kref_get(&npu_context->kref);
+	}
+
+	npu_context->release_cb = cb;
+	npu_context->priv = priv;
+	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+							&nvlink_index)))
+		return ERR_PTR(-ENODEV);
+	npu_context->npdev[npu->index][nvlink_index] = npdev;
+
+	return npu_context;
+}
+EXPORT_SYMBOL(pnv_npu2_init_context);
+
+static void pnv_npu2_release_context(struct kref *kref)
+{
+	struct npu_context *npu_context =
+		container_of(kref, struct npu_context, kref);
+
+	npu_context->mm->context.npu_context = NULL;
+	mmu_notifier_unregister(&npu_context->mn,
+				npu_context->mm);
+
+	kfree(npu_context);
+}
+
+void pnv_npu2_destroy_context(struct npu_context *npu_context,
+			struct pci_dev *gpdev)
+{
+	struct pnv_phb *nphb, *phb;
+	struct npu *npu;
+	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+	struct device_node *nvlink_dn;
+	u32 nvlink_index;
+
+	if (WARN_ON(!npdev))
+		return;
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return;
+
+	nphb = pci_bus_to_host(npdev->bus)->private_data;
+	npu = &nphb->npu;
+	phb = pci_bus_to_host(gpdev->bus)->private_data;
+	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+							&nvlink_index)))
+		return;
+	npu_context->npdev[npu->index][nvlink_index] = NULL;
+	opal_npu_destroy_context(phb->opal_id, npu_context->mm->context.id,
+				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+	kref_put(&npu_context->kref, pnv_npu2_release_context);
+}
+EXPORT_SYMBOL(pnv_npu2_destroy_context);
+
+/*
+ * Assumes mmap_sem is held for the contexts associated mm.
+ */
+int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
+			unsigned long *flags, unsigned long *status, int count)
+{
+	u64 rc = 0, result = 0;
+	int i, is_write;
+	struct page *page[1];
+
+	/* mmap_sem should be held so the struct_mm must be present */
+	struct mm_struct *mm = context->mm;
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return -ENODEV;
+
+	WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+	for (i = 0; i < count; i++) {
+		is_write = flags[i] & NPU2_WRITE;
+		rc = get_user_pages_remote(NULL, mm, ea[i], 1,
+					is_write ? FOLL_WRITE : 0,
+					page, NULL, NULL);
+
+		/*
+		 * To support virtualised environments we will have to do an
+		 * access to the page to ensure it gets faulted into the
+		 * hypervisor. For the moment virtualisation is not supported in
+		 * other areas so leave the access out.
+		 */
+		if (rc != 1) {
+			status[i] = rc;
+			result = -EFAULT;
+			continue;
+		}
+
+		status[i] = 0;
+		put_page(page[0]);
+	}
+
+	return result;
+}
+EXPORT_SYMBOL(pnv_npu2_handle_fault);
+
+int pnv_npu2_init(struct pnv_phb *phb)
+{
+	unsigned int i;
+	u64 mmio_atsd;
+	struct device_node *dn;
+	struct pci_dev *gpdev;
+	static int npu_index;
+	uint64_t rc = 0;
+
+	for_each_child_of_node(phb->hose->dn, dn) {
+		gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
+		if (gpdev) {
+			rc = opal_npu_map_lpar(phb->opal_id,
+				PCI_DEVID(gpdev->bus->number, gpdev->devfn),
+				0, 0);
+			if (rc)
+				dev_err(&gpdev->dev,
+					"Error %lld mapping device to LPAR\n",
+					rc);
+		}
+	}
+
+	for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd",
+							i, &mmio_atsd); i++)
+		phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
+
+	pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i);
+	phb->npu.mmio_atsd_count = i;
+	phb->npu.mmio_atsd_usage = 0;
+	npu_index++;
+	if (WARN_ON(npu_index >= NV_MAX_NPUS))
+		return -ENOSPC;
+	max_npu2_index = npu_index;
+	phb->npu.index = npu_index;
+
+	return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index ebf6719d241a..5e1527eea8f7 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -292,3 +292,6 @@ OPAL_CALL(opal_int_eoi,				OPAL_INT_EOI);
 OPAL_CALL(opal_int_set_mfrr,			OPAL_INT_SET_MFRR);
 OPAL_CALL(opal_pci_tce_kill,			OPAL_PCI_TCE_KILL);
 OPAL_CALL(opal_nmmu_set_ptcr,			OPAL_NMMU_SET_PTCR);
+OPAL_CALL(opal_npu_init_context,		OPAL_NPU_INIT_CONTEXT);
+OPAL_CALL(opal_npu_destroy_context,		OPAL_NPU_DESTROY_CONTEXT);
+OPAL_CALL(opal_npu_map_lpar,			OPAL_NPU_MAP_LPAR);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 7461322f0a1a..9c050fe5a6e8 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1262,6 +1262,8 @@ static void pnv_pci_ioda_setup_PEs(void)
 			/* PE#0 is needed for error reporting */
 			pnv_ioda_reserve_pe(phb, 0);
 			pnv_ioda_setup_npu_PEs(hose->bus);
+			if (phb->model == PNV_PHB_MODEL_NPU2)
+				pnv_npu2_init(phb);
 		}
 	}
 }
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index e1d3e5526b54..4eab713136d1 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -7,6 +7,9 @@
 
 struct pci_dn;
 
+/* Maximum possible number of ATSD MMIO registers per NPU */
+#define NV_NMMU_ATSD_REGS 8
+
 enum pnv_phb_type {
 	PNV_PHB_IODA1	= 0,
 	PNV_PHB_IODA2	= 1,
@@ -174,6 +177,16 @@ struct pnv_phb {
 		struct OpalIoP7IOCErrorData 	hub_diag;
 	} diag;
 
+	/* Nvlink2 data */
+	struct npu {
+		int index;
+		__be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
+		unsigned int mmio_atsd_count;
+
+		/* Bitmask for MMIO register usage */
+		unsigned long mmio_atsd_usage;
+	} npu;
+
 #ifdef CONFIG_CXL_BASE
 	struct cxl_afu *cxl_afu;
 #endif
@@ -236,7 +249,7 @@ extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
 extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
 extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
 extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
-
+extern int pnv_npu2_init(struct pnv_phb *phb);
 
 /* cxl functions */
 extern bool pnv_cxl_enable_device_hook(struct pci_dev *dev);
-- 
cgit v1.2.1


From 687da8fce1682c9f1e87530e731189863c588265 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Mon, 27 Mar 2017 19:43:14 -0400
Subject: powerpc/prom: Increase minimum RMA size to 512MB

When booting very large systems with a large initrd, we run out of
space early in boot for either RTAS or the flattened device tree (FDT).
Boot fails with messages like:

	Could not allocate memory for RTAS
or
	No memory for flatten_device_tree (no room)

Increasing the minimum RMA size to 512MB fixes the problem. This
should not have an impact on smaller LPARs (with 256MB memory),
as the firmware will cap the RMA to the memory assigned to the LPAR.

Fix is based on input/discussions with Michael Ellerman. Thanks to
Praveen K. Pandey for testing on a large system.

Reported-by: Praveen K. Pandey <preveen.pandey@in.ibm.com>
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/prom_init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 1c1b44ec7642..dd8a04f3053a 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -815,7 +815,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
 		.virt_base = cpu_to_be32(0xffffffff),
 		.virt_size = cpu_to_be32(0xffffffff),
 		.load_base = cpu_to_be32(0xffffffff),
-		.min_rma = cpu_to_be32(256),		/* 256MB min RMA */
+		.min_rma = cpu_to_be32(512),		/* 512MB min RMA */
 		.min_load = cpu_to_be32(0xffffffff),	/* full client load */
 		.min_rma_percent = 0,	/* min RMA percentage of total RAM */
 		.max_pft_size = 48,	/* max log_2(hash table size) */
-- 
cgit v1.2.1


From 11fe909d236263f62808dc3c73caf798e026d7aa Mon Sep 17 00:00:00 2001
From: Matt Brown <matthew.brown.dev@gmail.com>
Date: Thu, 30 Mar 2017 10:28:01 +1100
Subject: powerpc/powernv: Add OPAL exports attributes to sysfs

New versions of OPAL have a device node /ibm,opal/firmware/exports, each
property of which describes a range of memory in OPAL that Linux might
want to export to userspace for debugging.

This patch adds a sysfs file under 'opal/exports' for each property
found there, and makes it read-only by root.

Signed-off-by: Matt Brown <matthew.brown.dev@gmail.com>
[mpe: Drop counting of props, rename to attr, free on sysfs error, c'log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal.c | 76 +++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 296c9426f72b..76e153fc1f93 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -595,6 +595,79 @@ static void opal_export_symmap(void)
 		pr_warn("Error %d creating OPAL symbols file\n", rc);
 }
 
+static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
+				struct bin_attribute *bin_attr, char *buf,
+				loff_t off, size_t count)
+{
+	return memory_read_from_buffer(buf, count, &off, bin_attr->private,
+				       bin_attr->size);
+}
+
+/*
+ * opal_export_attrs: creates a sysfs node for each property listed in
+ * the device-tree under /ibm,opal/firmware/exports/
+ * All new sysfs nodes are created under /opal/exports/.
+ * This allows for reserved memory regions (e.g. HDAT) to be read.
+ * The new sysfs nodes are only readable by root.
+ */
+static void opal_export_attrs(void)
+{
+	struct bin_attribute *attr;
+	struct device_node *np;
+	struct property *prop;
+	struct kobject *kobj;
+	u64 vals[2];
+	int rc;
+
+	np = of_find_node_by_path("/ibm,opal/firmware/exports");
+	if (!np)
+		return;
+
+	/* Create new 'exports' directory - /sys/firmware/opal/exports */
+	kobj = kobject_create_and_add("exports", opal_kobj);
+	if (!kobj) {
+		pr_warn("kobject_create_and_add() of exports failed\n");
+		return;
+	}
+
+	for_each_property_of_node(np, prop) {
+		if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle"))
+			continue;
+
+		if (of_property_read_u64_array(np, prop->name, &vals[0], 2))
+			continue;
+
+		attr = kmalloc(sizeof(*attr), GFP_KERNEL);
+
+		if (attr == NULL) {
+			pr_warn("Failed kmalloc for bin_attribute!");
+			continue;
+		}
+
+		attr->attr.name = kstrdup(prop->name, GFP_KERNEL);
+		attr->attr.mode = 0400;
+		attr->read = export_attr_read;
+		attr->private = __va(vals[0]);
+		attr->size = vals[1];
+
+		if (attr->attr.name == NULL) {
+			pr_warn("Failed kstrdup for bin_attribute attr.name");
+			kfree(attr);
+			continue;
+		}
+
+		rc = sysfs_create_bin_file(kobj, attr);
+		if (rc) {
+			pr_warn("Error %d creating OPAL sysfs exports/%s file\n",
+				 rc, prop->name);
+			kfree(attr->attr.name);
+			kfree(attr);
+		}
+	}
+
+	of_node_put(np);
+}
+
 static void __init opal_dump_region_init(void)
 {
 	void *addr;
@@ -733,6 +806,9 @@ static int __init opal_init(void)
 		opal_msglog_sysfs_init();
 	}
 
+	/* Export all properties */
+	opal_export_attrs();
+
 	/* Initialize platform devices: IPMI backend, PRD & flash interface */
 	opal_pdev_init("ibm,opal-ipmi");
 	opal_pdev_init("ibm,opal-flash");
-- 
cgit v1.2.1


From eeea1a434ddedbb5aaeac1a8661445b3ae3eb539 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 6 Apr 2017 09:01:33 +1000
Subject: powerpc/powernv: Add XIVE related definitions to opal-api.h

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/opal-api.h            | 74 +++++++++++++++++++++++++-
 arch/powerpc/include/asm/opal.h                | 36 +++++++++++++
 arch/powerpc/platforms/powernv/opal-wrappers.S | 15 ++++++
 3 files changed, 124 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index a0aa285869b5..bc8ac3c0e649 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -40,6 +40,8 @@
 #define OPAL_I2C_ARBT_LOST	-22
 #define OPAL_I2C_NACK_RCVD	-23
 #define OPAL_I2C_STOP_ERR	-24
+#define OPAL_XIVE_PROVISIONING	-31
+#define OPAL_XIVE_FREE_ACTIVE	-32
 
 /* API Tokens (in r0) */
 #define OPAL_INVALID_CALL		       -1
@@ -168,7 +170,24 @@
 #define OPAL_INT_SET_MFRR			125
 #define OPAL_PCI_TCE_KILL			126
 #define OPAL_NMMU_SET_PTCR			127
-#define OPAL_LAST				127
+#define OPAL_XIVE_RESET				128
+#define OPAL_XIVE_GET_IRQ_INFO			129
+#define OPAL_XIVE_GET_IRQ_CONFIG		130
+#define OPAL_XIVE_SET_IRQ_CONFIG		131
+#define OPAL_XIVE_GET_QUEUE_INFO		132
+#define OPAL_XIVE_SET_QUEUE_INFO		133
+#define OPAL_XIVE_DONATE_PAGE			134
+#define OPAL_XIVE_ALLOCATE_VP_BLOCK		135
+#define OPAL_XIVE_FREE_VP_BLOCK			136
+#define OPAL_XIVE_GET_VP_INFO			137
+#define OPAL_XIVE_SET_VP_INFO			138
+#define OPAL_XIVE_ALLOCATE_IRQ			139
+#define OPAL_XIVE_FREE_IRQ			140
+#define OPAL_XIVE_SYNC				141
+#define OPAL_XIVE_DUMP				142
+#define OPAL_XIVE_RESERVED3			143
+#define OPAL_XIVE_RESERVED4			144
+#define OPAL_LAST				144
 
 /* Device tree flags */
 
@@ -928,6 +947,59 @@ enum {
 	OPAL_PCI_TCE_KILL_ALL,
 };
 
+/* The xive operation mode indicates the active "API" and
+ * corresponds to the "mode" parameter of the opal_xive_reset()
+ * call
+ */
+enum {
+	OPAL_XIVE_MODE_EMU	= 0,
+	OPAL_XIVE_MODE_EXPL	= 1,
+};
+
+/* Flags for OPAL_XIVE_GET_IRQ_INFO */
+enum {
+	OPAL_XIVE_IRQ_TRIGGER_PAGE	= 0x00000001,
+	OPAL_XIVE_IRQ_STORE_EOI		= 0x00000002,
+	OPAL_XIVE_IRQ_LSI		= 0x00000004,
+	OPAL_XIVE_IRQ_SHIFT_BUG		= 0x00000008,
+	OPAL_XIVE_IRQ_MASK_VIA_FW	= 0x00000010,
+	OPAL_XIVE_IRQ_EOI_VIA_FW	= 0x00000020,
+};
+
+/* Flags for OPAL_XIVE_GET/SET_QUEUE_INFO */
+enum {
+	OPAL_XIVE_EQ_ENABLED		= 0x00000001,
+	OPAL_XIVE_EQ_ALWAYS_NOTIFY	= 0x00000002,
+	OPAL_XIVE_EQ_ESCALATE		= 0x00000004,
+};
+
+/* Flags for OPAL_XIVE_GET/SET_VP_INFO */
+enum {
+	OPAL_XIVE_VP_ENABLED		= 0x00000001,
+};
+
+/* "Any chip" replacement for chip ID for allocation functions */
+enum {
+	OPAL_XIVE_ANY_CHIP		= 0xffffffff,
+};
+
+/* Xive sync options */
+enum {
+	/* This bits are cumulative, arg is a girq */
+	XIVE_SYNC_EAS			= 0x00000001, /* Sync irq source */
+	XIVE_SYNC_QUEUE			= 0x00000002, /* Sync irq target */
+};
+
+/* Dump options */
+enum {
+	XIVE_DUMP_TM_HYP	= 0,
+	XIVE_DUMP_TM_POOL	= 1,
+	XIVE_DUMP_TM_OS		= 2,
+	XIVE_DUMP_TM_USER	= 3,
+	XIVE_DUMP_VP		= 4,
+	XIVE_DUMP_EMU_STATE	= 5,
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __OPAL_API_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 1ff03a6da76e..cb7d6078b03a 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -226,6 +226,42 @@ int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t kill_type,
 			  uint32_t pe_num, uint32_t tce_size,
 			  uint64_t dma_addr, uint32_t npages);
 int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr);
+int64_t opal_xive_reset(uint64_t version);
+int64_t opal_xive_get_irq_info(uint32_t girq,
+			       __be64 *out_flags,
+			       __be64 *out_eoi_page,
+			       __be64 *out_trig_page,
+			       __be32 *out_esb_shift,
+			       __be32 *out_src_chip);
+int64_t opal_xive_get_irq_config(uint32_t girq, __be64 *out_vp,
+				 uint8_t *out_prio, __be32 *out_lirq);
+int64_t opal_xive_set_irq_config(uint32_t girq, uint64_t vp, uint8_t prio,
+				 uint32_t lirq);
+int64_t opal_xive_get_queue_info(uint64_t vp, uint32_t prio,
+				 __be64 *out_qpage,
+				 __be64 *out_qsize,
+				 __be64 *out_qeoi_page,
+				 __be32 *out_escalate_irq,
+				 __be64 *out_qflags);
+int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio,
+				 uint64_t qpage,
+				 uint64_t qsize,
+				 uint64_t qflags);
+int64_t opal_xive_donate_page(uint32_t chip_id, uint64_t addr);
+int64_t opal_xive_alloc_vp_block(uint32_t alloc_order);
+int64_t opal_xive_free_vp_block(uint64_t vp);
+int64_t opal_xive_get_vp_info(uint64_t vp,
+			      __be64 *out_flags,
+			      __be64 *out_cam_value,
+			      __be64 *out_report_cl_pair,
+			      __be32 *out_chip_id);
+int64_t opal_xive_set_vp_info(uint64_t vp,
+			      uint64_t flags,
+			      uint64_t report_cl_pair);
+int64_t opal_xive_allocate_irq(uint32_t chip_id);
+int64_t opal_xive_free_irq(uint32_t girq);
+int64_t opal_xive_sync(uint32_t type, uint32_t id);
+int64_t opal_xive_dump(uint32_t type, uint32_t id);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index da8a0f7a035c..085605a73168 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -301,3 +301,18 @@ OPAL_CALL(opal_int_eoi,				OPAL_INT_EOI);
 OPAL_CALL(opal_int_set_mfrr,			OPAL_INT_SET_MFRR);
 OPAL_CALL(opal_pci_tce_kill,			OPAL_PCI_TCE_KILL);
 OPAL_CALL(opal_nmmu_set_ptcr,			OPAL_NMMU_SET_PTCR);
+OPAL_CALL(opal_xive_reset,			OPAL_XIVE_RESET);
+OPAL_CALL(opal_xive_get_irq_info,		OPAL_XIVE_GET_IRQ_INFO);
+OPAL_CALL(opal_xive_get_irq_config,		OPAL_XIVE_GET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_set_irq_config,		OPAL_XIVE_SET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_get_queue_info,		OPAL_XIVE_GET_QUEUE_INFO);
+OPAL_CALL(opal_xive_set_queue_info,		OPAL_XIVE_SET_QUEUE_INFO);
+OPAL_CALL(opal_xive_donate_page,		OPAL_XIVE_DONATE_PAGE);
+OPAL_CALL(opal_xive_alloc_vp_block,		OPAL_XIVE_ALLOCATE_VP_BLOCK);
+OPAL_CALL(opal_xive_free_vp_block,		OPAL_XIVE_FREE_VP_BLOCK);
+OPAL_CALL(opal_xive_allocate_irq,		OPAL_XIVE_ALLOCATE_IRQ);
+OPAL_CALL(opal_xive_free_irq,			OPAL_XIVE_FREE_IRQ);
+OPAL_CALL(opal_xive_get_vp_info,		OPAL_XIVE_GET_VP_INFO);
+OPAL_CALL(opal_xive_set_vp_info,		OPAL_XIVE_SET_VP_INFO);
+OPAL_CALL(opal_xive_sync,			OPAL_XIVE_SYNC);
+OPAL_CALL(opal_xive_dump,			OPAL_XIVE_DUMP);
-- 
cgit v1.2.1


From 22bd64a621cc80beeb009abec3d3df98ec0131c5 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 5 Apr 2017 17:54:47 +1000
Subject: powerpc: Add more PPC bit conversion macros

Add 32 and 8 bit variants

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/bitops.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index bc5fdfd22788..33a24fdd7958 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -55,6 +55,14 @@
 #define PPC_BITEXTRACT(bits, ppc_bit, dst_bit)			\
 	((((bits) >> PPC_BITLSHIFT(ppc_bit)) & 1) << (dst_bit))
 
+#define PPC_BITLSHIFT32(be)	(32 - 1 - (be))
+#define PPC_BIT32(bit)		(1UL << PPC_BITLSHIFT32(bit))
+#define PPC_BITMASK32(bs, be)	((PPC_BIT32(bs) - PPC_BIT32(be))|PPC_BIT32(bs))
+
+#define PPC_BITLSHIFT8(be)	(8 - 1 - (be))
+#define PPC_BIT8(bit)		(1UL << PPC_BITLSHIFT8(bit))
+#define PPC_BITMASK8(bs, be)	((PPC_BIT8(bs) - PPC_BIT8(be))|PPC_BIT8(bs))
+
 #include <asm/barrier.h>
 
 /* Macro for generating the ***_bits() functions */
-- 
cgit v1.2.1


From 14d4ae5c4cb89c05262fe41cb7a26f6ba949d8df Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 5 Apr 2017 17:54:48 +1000
Subject: powerpc: Add optional smp_ops->prepare_cpu SMP callback

Some platforms (will) need to perform allocations before bringing
a new CPU online. Doing it from smp_ops->setup_cpu is the wrong
thing to do:

 - It has no useful failure path (too late)
 - Calling any allocator will enable interrupts prematurely
   causing problems with large decrementer among others

Instead, add a new callback that is called from __cpu_up (so from
the context trying to online the new CPU) at a point where we
can safely allocate and handle failures.

This will be used by XIVE support.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/smp.h |  1 +
 arch/powerpc/kernel/smp.c      | 10 ++++++++++
 2 files changed, 11 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 32db16d2e7ad..2f8e36f91acd 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -44,6 +44,7 @@ struct smp_ops_t {
 #endif
 	void  (*probe)(void);
 	int   (*kick_cpu)(int nr);
+	int   (*prepare_cpu)(int nr);
 	void  (*setup_cpu)(int nr);
 	void  (*bringup_done)(void);
 	void  (*take_timebase)(void);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 46f89e66a273..b12f5f0a408f 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -521,6 +521,16 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 
 	cpu_idle_thread_init(cpu, tidle);
 
+	/*
+	 * The platform might need to allocate resources prior to bringing
+	 * up the CPU
+	 */
+	if (smp_ops->prepare_cpu) {
+		rc = smp_ops->prepare_cpu(cpu);
+		if (rc)
+			return rc;
+	}
+
 	/* Make sure callin-map entry is 0 (can be leftover a CPU
 	 * hotplug
 	 */
-- 
cgit v1.2.1


From a978e13965a40ac07163643cc3fa0ddb0d354198 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 5 Apr 2017 17:54:49 +1000
Subject: powerpc/smp: Remove migrate_irq() custom implementation

Some powerpc platforms use this to move IRQs away from a CPU being
unplugged. This function has several bugs such as not taking the right
locks or failing to NULL check pointers.

There's a new generic function doing exactly the same thing without all
the bugs, so let's use it instead.

mpe: The obvious place for the select of GENERIC_IRQ_MIGRATION is on
HOTPLUG_CPU, but that doesn't work. On some configs PM_SLEEP_SMP will
select HOTPLUG_CPU even though its dependencies are not met, which means
the select of GENERIC_IRQ_MIGRATION doesn't happen. That leads to the
build breaking. Fix it by moving the select of GENERIC_IRQ_MIGRATION to
SMP.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/smp.h         |  1 -
 arch/powerpc/kernel/irq.c              | 40 ----------------------------------
 arch/powerpc/kernel/smp.c              |  9 +++++++-
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 4 files changed, 9 insertions(+), 42 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 2f8e36f91acd..63fa780b71a0 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -62,7 +62,6 @@ extern void smp_generic_take_timebase(void);
 DECLARE_PER_CPU(unsigned int, cpu_pvr);
 
 #ifdef CONFIG_HOTPLUG_CPU
-extern void migrate_irqs(void);
 int generic_cpu_disable(void);
 void generic_cpu_die(unsigned int cpu);
 void generic_set_cpu_dead(unsigned int cpu);
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index a018f5cae899..8ee7b44450eb 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -442,46 +442,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	return sum;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-void migrate_irqs(void)
-{
-	struct irq_desc *desc;
-	unsigned int irq;
-	static int warned;
-	cpumask_var_t mask;
-	const struct cpumask *map = cpu_online_mask;
-
-	alloc_cpumask_var(&mask, GFP_KERNEL);
-
-	for_each_irq_desc(irq, desc) {
-		struct irq_data *data;
-		struct irq_chip *chip;
-
-		data = irq_desc_get_irq_data(desc);
-		if (irqd_is_per_cpu(data))
-			continue;
-
-		chip = irq_data_get_irq_chip(data);
-
-		cpumask_and(mask, irq_data_get_affinity_mask(data), map);
-		if (cpumask_any(mask) >= nr_cpu_ids) {
-			pr_warn("Breaking affinity for irq %i\n", irq);
-			cpumask_copy(mask, map);
-		}
-		if (chip->irq_set_affinity)
-			chip->irq_set_affinity(data, mask, true);
-		else if (desc->action && !(warned++))
-			pr_err("Cannot set affinity for irq %i\n", irq);
-	}
-
-	free_cpumask_var(mask);
-
-	local_irq_enable();
-	mdelay(1);
-	local_irq_disable();
-}
-#endif
-
 static inline void check_stack_overflow(void)
 {
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index b12f5f0a408f..6e61cdb89194 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -439,7 +439,14 @@ int generic_cpu_disable(void)
 #ifdef CONFIG_PPC64
 	vdso_data->processorCount--;
 #endif
-	migrate_irqs();
+	/* Update affinity of all IRQs previously aimed at this CPU */
+	irq_migrate_all_off_this_cpu();
+
+	/* Give the CPU time to drain in-flight ones */
+	local_irq_enable();
+	mdelay(1);
+	local_irq_disable();
+
 	return 0;
 }
 
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 99b0ae8acb78..9b25cded03e9 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -373,6 +373,7 @@ config PPC_PERF_CTRS
 
 config SMP
 	depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE || PPC_47x
+	select GENERIC_IRQ_MIGRATION
 	bool "Symmetric multi-processing support"
 	---help---
 	  This enables support for systems with more than one CPU. If you have
-- 
cgit v1.2.1


From 243e25112d06b348f087a6f7aba4bbc288285bdd Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 5 Apr 2017 17:54:50 +1000
Subject: powerpc/xive: Native exploitation of the XIVE interrupt controller

The XIVE interrupt controller is the new interrupt controller
found in POWER9. It supports advanced virtualization capabilities
among other things.

Currently we use a set of firmware calls that simulate the old
"XICS" interrupt controller but this is fairly inefficient.

This adds the framework for using XIVE along with a native
backend which OPAL for configuration. Later, a backend allowing
the use in a KVM or PowerVM guest will also be provided.

This disables some fast path for interrupts in KVM when XIVE is
enabled as these rely on the firmware emulation code which is no
longer available when the XIVE is used natively by Linux.

A latter patch will make KVM also directly exploit the XIVE, thus
recovering the lost performance (and more).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[mpe: Fixup pr_xxx("XIVE:"...), don't split pr_xxx() strings,
 tweak Kconfig so XIVE_NATIVE selects XIVE and depends on POWERNV,
 fix build errors when SMP=n, fold in fixes from Ben:
   Don't call cpu_online() on an invalid CPU number
   Fix irq target selection returning out of bounds cpu#
   Extra sanity checks on cpu numbers
 ]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/xive-regs.h     |   97 +++
 arch/powerpc/include/asm/xive.h          |  163 ++++
 arch/powerpc/include/asm/xmon.h          |    2 +
 arch/powerpc/kvm/book3s_hv_builtin.c     |    8 +
 arch/powerpc/platforms/powernv/Kconfig   |    1 +
 arch/powerpc/platforms/powernv/setup.c   |   15 +-
 arch/powerpc/platforms/powernv/smp.c     |   39 +-
 arch/powerpc/sysdev/Kconfig              |    1 +
 arch/powerpc/sysdev/Makefile             |    1 +
 arch/powerpc/sysdev/xive/Kconfig         |   11 +
 arch/powerpc/sysdev/xive/Makefile        |    4 +
 arch/powerpc/sysdev/xive/common.c        | 1302 ++++++++++++++++++++++++++++++
 arch/powerpc/sysdev/xive/native.c        |  639 +++++++++++++++
 arch/powerpc/sysdev/xive/xive-internal.h |   62 ++
 arch/powerpc/xmon/xmon.c                 |   94 ++-
 15 files changed, 2427 insertions(+), 12 deletions(-)
 create mode 100644 arch/powerpc/include/asm/xive-regs.h
 create mode 100644 arch/powerpc/include/asm/xive.h
 create mode 100644 arch/powerpc/sysdev/xive/Kconfig
 create mode 100644 arch/powerpc/sysdev/xive/Makefile
 create mode 100644 arch/powerpc/sysdev/xive/common.c
 create mode 100644 arch/powerpc/sysdev/xive/native.c
 create mode 100644 arch/powerpc/sysdev/xive/xive-internal.h

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h
new file mode 100644
index 000000000000..1d3f2be5ae39
--- /dev/null
+++ b/arch/powerpc/include/asm/xive-regs.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef _ASM_POWERPC_XIVE_REGS_H
+#define _ASM_POWERPC_XIVE_REGS_H
+
+/*
+ * Thread Management (aka "TM") registers
+ */
+
+/* TM register offsets */
+#define TM_QW0_USER		0x000 /* All rings */
+#define TM_QW1_OS		0x010 /* Ring 0..2 */
+#define TM_QW2_HV_POOL		0x020 /* Ring 0..1 */
+#define TM_QW3_HV_PHYS		0x030 /* Ring 0..1 */
+
+/* Byte offsets inside a QW             QW0 QW1 QW2 QW3 */
+#define TM_NSR			0x0  /*  +   +   -   +  */
+#define TM_CPPR			0x1  /*  -   +   -   +  */
+#define TM_IPB			0x2  /*  -   +   +   +  */
+#define TM_LSMFB		0x3  /*  -   +   +   +  */
+#define TM_ACK_CNT		0x4  /*  -   +   -   -  */
+#define TM_INC			0x5  /*  -   +   -   +  */
+#define TM_AGE			0x6  /*  -   +   -   +  */
+#define TM_PIPR			0x7  /*  -   +   -   +  */
+
+#define TM_WORD0		0x0
+#define TM_WORD1		0x4
+
+/*
+ * QW word 2 contains the valid bit at the top and other fields
+ * depending on the QW.
+ */
+#define TM_WORD2		0x8
+#define   TM_QW0W2_VU		PPC_BIT32(0)
+#define   TM_QW0W2_LOGIC_SERV	PPC_BITMASK32(1,31) // XX 2,31 ?
+#define   TM_QW1W2_VO		PPC_BIT32(0)
+#define   TM_QW1W2_OS_CAM	PPC_BITMASK32(8,31)
+#define   TM_QW2W2_VP		PPC_BIT32(0)
+#define   TM_QW2W2_POOL_CAM	PPC_BITMASK32(8,31)
+#define   TM_QW3W2_VT		PPC_BIT32(0)
+#define   TM_QW3W2_LP		PPC_BIT32(6)
+#define   TM_QW3W2_LE		PPC_BIT32(7)
+#define   TM_QW3W2_T		PPC_BIT32(31)
+
+/*
+ * In addition to normal loads to "peek" and writes (only when invalid)
+ * using 4 and 8 bytes accesses, the above registers support these
+ * "special" byte operations:
+ *
+ *   - Byte load from QW0[NSR] - User level NSR (EBB)
+ *   - Byte store to QW0[NSR] - User level NSR (EBB)
+ *   - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access
+ *   - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0
+ *                                    otherwise VT||0000000
+ *   - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present)
+ *
+ * Then we have all these "special" CI ops at these offset that trigger
+ * all sorts of side effects:
+ */
+#define TM_SPC_ACK_EBB		0x800	/* Load8 ack EBB to reg*/
+#define TM_SPC_ACK_OS_REG	0x810	/* Load16 ack OS irq to reg */
+#define TM_SPC_PUSH_USR_CTX	0x808	/* Store32 Push/Validate user context */
+#define TM_SPC_PULL_USR_CTX	0x808	/* Load32 Pull/Invalidate user context */
+#define TM_SPC_SET_OS_PENDING	0x812	/* Store8 Set OS irq pending bit */
+#define TM_SPC_PULL_OS_CTX	0x818	/* Load32/Load64 Pull/Invalidate OS context to reg */
+#define TM_SPC_PULL_POOL_CTX	0x828	/* Load32/Load64 Pull/Invalidate Pool context to reg*/
+#define TM_SPC_ACK_HV_REG	0x830	/* Load16 ack HV irq to reg */
+#define TM_SPC_PULL_USR_CTX_OL	0xc08	/* Store8 Pull/Inval usr ctx to odd line */
+#define TM_SPC_ACK_OS_EL	0xc10	/* Store8 ack OS irq to even line */
+#define TM_SPC_ACK_HV_POOL_EL	0xc20	/* Store8 ack HV evt pool to even line */
+#define TM_SPC_ACK_HV_EL	0xc30	/* Store8 ack HV irq to even line */
+/* XXX more... */
+
+/* NSR fields for the various QW ack types */
+#define TM_QW0_NSR_EB		PPC_BIT8(0)
+#define TM_QW1_NSR_EO		PPC_BIT8(0)
+#define TM_QW3_NSR_HE		PPC_BITMASK8(0,1)
+#define  TM_QW3_NSR_HE_NONE	0
+#define  TM_QW3_NSR_HE_POOL	1
+#define  TM_QW3_NSR_HE_PHYS	2
+#define  TM_QW3_NSR_HE_LSI	3
+#define TM_QW3_NSR_I		PPC_BIT8(2)
+#define TM_QW3_NSR_GRP_LVL	PPC_BIT8(3,7)
+
+/* Utilities to manipulate these (originaly from OPAL) */
+#define MASK_TO_LSH(m)		(__builtin_ffsl(m) - 1)
+#define GETFIELD(m, v)		(((v) & (m)) >> MASK_TO_LSH(m))
+#define SETFIELD(m, v, val)				\
+	(((v) & ~(m)) |	((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m)))
+
+#endif /* _ASM_POWERPC_XIVE_REGS_H */
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
new file mode 100644
index 000000000000..3cdbeaeac397
--- /dev/null
+++ b/arch/powerpc/include/asm/xive.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef _ASM_POWERPC_XIVE_H
+#define _ASM_POWERPC_XIVE_H
+
+#define XIVE_INVALID_VP	0xffffffff
+
+#ifdef CONFIG_PPC_XIVE
+
+/*
+ * Thread Interrupt Management Area (TIMA)
+ *
+ * This is a global MMIO region divided in 4 pages of varying access
+ * permissions, providing access to per-cpu interrupt management
+ * functions. It always identifies the CPU doing the access based
+ * on the PowerBus initiator ID, thus we always access via the
+ * same offset regardless of where the code is executing
+ */
+extern void __iomem *xive_tima;
+
+/*
+ * Offset in the TM area of our current execution level (provided by
+ * the backend)
+ */
+extern u32 xive_tima_offset;
+
+/*
+ * Per-irq data (irq_get_handler_data for normal IRQs), IPIs
+ * have it stored in the xive_cpu structure. We also cache
+ * for normal interrupts the current target CPU.
+ *
+ * This structure is setup by the backend for each interrupt.
+ */
+struct xive_irq_data {
+	u64 flags;
+	u64 eoi_page;
+	void __iomem *eoi_mmio;
+	u64 trig_page;
+	void __iomem *trig_mmio;
+	u32 esb_shift;
+	int src_chip;
+
+	/* Setup/used by frontend */
+	int target;
+	bool saved_p;
+};
+#define XIVE_IRQ_FLAG_STORE_EOI	0x01
+#define XIVE_IRQ_FLAG_LSI	0x02
+#define XIVE_IRQ_FLAG_SHIFT_BUG	0x04
+#define XIVE_IRQ_FLAG_MASK_FW	0x08
+#define XIVE_IRQ_FLAG_EOI_FW	0x10
+
+#define XIVE_INVALID_CHIP_ID	-1
+
+/* A queue tracking structure in a CPU */
+struct xive_q {
+	__be32 			*qpage;
+	u32			msk;
+	u32			idx;
+	u32			toggle;
+	u64			eoi_phys;
+	u32			esc_irq;
+	atomic_t		count;
+	atomic_t		pending_count;
+};
+
+/*
+ * "magic" Event State Buffer (ESB) MMIO offsets.
+ *
+ * Each interrupt source has a 2-bit state machine called ESB
+ * which can be controlled by MMIO. It's made of 2 bits, P and
+ * Q. P indicates that an interrupt is pending (has been sent
+ * to a queue and is waiting for an EOI). Q indicates that the
+ * interrupt has been triggered while pending.
+ *
+ * This acts as a coalescing mechanism in order to guarantee
+ * that a given interrupt only occurs at most once in a queue.
+ *
+ * When doing an EOI, the Q bit will indicate if the interrupt
+ * needs to be re-triggered.
+ *
+ * The following offsets into the ESB MMIO allow to read or
+ * manipulate the PQ bits. They must be used with an 8-bytes
+ * load instruction. They all return the previous state of the
+ * interrupt (atomically).
+ *
+ * Additionally, some ESB pages support doing an EOI via a
+ * store at 0 and some ESBs support doing a trigger via a
+ * separate trigger page.
+ */
+#define XIVE_ESB_GET		0x800
+#define XIVE_ESB_SET_PQ_00	0xc00
+#define XIVE_ESB_SET_PQ_01	0xd00
+#define XIVE_ESB_SET_PQ_10	0xe00
+#define XIVE_ESB_SET_PQ_11	0xf00
+#define XIVE_ESB_MASK		XIVE_ESB_SET_PQ_01
+
+#define XIVE_ESB_VAL_P		0x2
+#define XIVE_ESB_VAL_Q		0x1
+
+/* Global enable flags for the XIVE support */
+extern bool __xive_enabled;
+
+static inline bool xive_enabled(void) { return __xive_enabled; }
+
+extern bool xive_native_init(void);
+extern void xive_smp_probe(void);
+extern int  xive_smp_prepare_cpu(unsigned int cpu);
+extern void xive_smp_setup_cpu(void);
+extern void xive_smp_disable_cpu(void);
+extern void xive_kexec_teardown_cpu(int secondary);
+extern void xive_shutdown(void);
+extern void xive_flush_interrupt(void);
+
+/* xmon hook */
+extern void xmon_xive_do_dump(int cpu);
+
+/* APIs used by KVM */
+extern u32 xive_native_default_eq_shift(void);
+extern u32 xive_native_alloc_vp_block(u32 max_vcpus);
+extern void xive_native_free_vp_block(u32 vp_base);
+extern int xive_native_populate_irq_data(u32 hw_irq,
+					 struct xive_irq_data *data);
+extern void xive_cleanup_irq_data(struct xive_irq_data *xd);
+extern u32 xive_native_alloc_irq(void);
+extern void xive_native_free_irq(u32 irq);
+extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
+
+extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
+				       __be32 *qpage, u32 order, bool can_escalate);
+extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
+
+extern bool __xive_irq_trigger(struct xive_irq_data *xd);
+extern bool __xive_irq_retrigger(struct xive_irq_data *xd);
+extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd);
+
+extern bool is_xive_irq(struct irq_chip *chip);
+
+#else
+
+static inline bool xive_enabled(void) { return false; }
+
+static inline bool xive_native_init(void) { return false; }
+static inline void xive_smp_probe(void) { }
+extern inline int  xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; }
+static inline void xive_smp_setup_cpu(void) { }
+static inline void xive_smp_disable_cpu(void) { }
+static inline void xive_kexec_teardown_cpu(int secondary) { }
+static inline void xive_shutdown(void) { }
+static inline void xive_flush_interrupt(void) { }
+
+static inline u32 xive_native_alloc_vp_block(u32 max_vcpus) { return XIVE_INVALID_VP; }
+static inline void xive_native_free_vp_block(u32 vp_base) { }
+
+#endif
+
+#endif /* _ASM_POWERPC_XIVE_H */
diff --git a/arch/powerpc/include/asm/xmon.h b/arch/powerpc/include/asm/xmon.h
index 5eb8e599e5cc..eb42a0c6e1d9 100644
--- a/arch/powerpc/include/asm/xmon.h
+++ b/arch/powerpc/include/asm/xmon.h
@@ -29,5 +29,7 @@ static inline void xmon_register_spus(struct list_head *list) { };
 extern int cpus_are_in_xmon(void);
 #endif
 
+extern void xmon_printf(const char *format, ...);
+
 #endif /* __KERNEL __ */
 #endif /* __ASM_POWERPC_XMON_H */
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 4d6c64b3041c..ae55603cf661 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -23,6 +23,7 @@
 #include <asm/kvm_book3s.h>
 #include <asm/archrandom.h>
 #include <asm/xics.h>
+#include <asm/xive.h>
 #include <asm/dbell.h>
 #include <asm/cputhreads.h>
 #include <asm/io.h>
@@ -224,6 +225,10 @@ void kvmhv_rm_send_ipi(int cpu)
 		return;
 	}
 
+	/* We should never reach this */
+	if (WARN_ON_ONCE(xive_enabled()))
+	    return;
+
 	/* Else poke the target with an IPI */
 	xics_phys = paca[cpu].kvm_hstate.xics_phys;
 	if (xics_phys)
@@ -386,6 +391,9 @@ long kvmppc_read_intr(void)
 	long rc;
 	bool again;
 
+	if (xive_enabled())
+		return 1;
+
 	do {
 		again = false;
 		rc = kvmppc_read_one_intr(&again);
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 3a07e4dcf97c..9689a6272995 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -4,6 +4,7 @@ config PPC_POWERNV
 	select PPC_NATIVE
 	select PPC_XICS
 	select PPC_ICP_NATIVE
+	select PPC_XIVE_NATIVE
 	select PPC_P7_NAP
 	select PCI
 	select PCI_MSI
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index d50c7d99baaf..adceac978d18 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -32,6 +32,7 @@
 #include <asm/machdep.h>
 #include <asm/firmware.h>
 #include <asm/xics.h>
+#include <asm/xive.h>
 #include <asm/opal.h>
 #include <asm/kexec.h>
 #include <asm/smp.h>
@@ -76,7 +77,9 @@ static void __init pnv_init(void)
 
 static void __init pnv_init_IRQ(void)
 {
-	xics_init();
+	/* Try using a XIVE if available, otherwise use a XICS */
+	if (!xive_native_init())
+		xics_init();
 
 	WARN_ON(!ppc_md.get_irq);
 }
@@ -218,10 +221,12 @@ static void pnv_kexec_wait_secondaries_down(void)
 
 static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 {
-	xics_kexec_teardown_cpu(secondary);
+	if (xive_enabled())
+		xive_kexec_teardown_cpu(secondary);
+	else
+		xics_kexec_teardown_cpu(secondary);
 
 	/* On OPAL, we return all CPUs to firmware */
-
 	if (!firmware_has_feature(FW_FEATURE_OPAL))
 		return;
 
@@ -237,6 +242,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 		/* Primary waits for the secondaries to have reached OPAL */
 		pnv_kexec_wait_secondaries_down();
 
+		/* Switch XIVE back to emulation mode */
+		if (xive_enabled())
+			xive_shutdown();
+
 		/*
 		 * We might be running as little-endian - now that interrupts
 		 * are disabled, reset the HILE bit to big-endian so we don't
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 8b67e1eefb5c..f57195588c6c 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -29,6 +29,7 @@
 #include <asm/vdso_datapage.h>
 #include <asm/cputhreads.h>
 #include <asm/xics.h>
+#include <asm/xive.h>
 #include <asm/opal.h>
 #include <asm/runlatch.h>
 #include <asm/code-patching.h>
@@ -47,7 +48,9 @@
 
 static void pnv_smp_setup_cpu(int cpu)
 {
-	if (cpu != boot_cpuid)
+	if (xive_enabled())
+		xive_smp_setup_cpu();
+	else if (cpu != boot_cpuid)
 		xics_setup_cpu();
 
 #ifdef CONFIG_PPC_DOORBELL
@@ -132,7 +135,10 @@ static int pnv_smp_cpu_disable(void)
 	vdso_data->processorCount--;
 	if (cpu == boot_cpuid)
 		boot_cpuid = cpumask_any(cpu_online_mask);
-	xics_migrate_irqs_away();
+	if (xive_enabled())
+		xive_smp_disable_cpu();
+	else
+		xics_migrate_irqs_away();
 	return 0;
 }
 
@@ -213,9 +219,12 @@ static void pnv_smp_cpu_kill_self(void)
 		if (((srr1 & wmask) == SRR1_WAKEEE) ||
 		    ((srr1 & wmask) == SRR1_WAKEHVI) ||
 		    (local_paca->irq_happened & PACA_IRQ_EE)) {
-			if (cpu_has_feature(CPU_FTR_ARCH_300))
-				icp_opal_flush_interrupt();
-			else
+			if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+				if (xive_enabled())
+					xive_flush_interrupt();
+				else
+					icp_opal_flush_interrupt();
+			} else
 				icp_native_flush_interrupt();
 		} else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
 			unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
@@ -252,10 +261,26 @@ static int pnv_cpu_bootable(unsigned int nr)
 	return smp_generic_cpu_bootable(nr);
 }
 
+static int pnv_smp_prepare_cpu(int cpu)
+{
+	if (xive_enabled())
+		return xive_smp_prepare_cpu(cpu);
+	return 0;
+}
+
+static void __init pnv_smp_probe(void)
+{
+	if (xive_enabled())
+		xive_smp_probe();
+	else
+		xics_smp_probe();
+}
+
 static struct smp_ops_t pnv_smp_ops = {
 	.message_pass	= smp_muxed_ipi_message_pass,
-	.cause_ipi	= NULL,	/* Filled at runtime by xics_smp_probe() */
-	.probe		= xics_smp_probe,
+	.cause_ipi	= NULL, /* Filled at runtime by xi{cs,ve}_smp_probe() */
+	.probe		= pnv_smp_probe,
+	.prepare_cpu	= pnv_smp_prepare_cpu,
 	.kick_cpu	= pnv_smp_kick_cpu,
 	.setup_cpu	= pnv_smp_setup_cpu,
 	.cpu_bootable	= pnv_cpu_bootable,
diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig
index 52dc165c0efb..caf882e749dc 100644
--- a/arch/powerpc/sysdev/Kconfig
+++ b/arch/powerpc/sysdev/Kconfig
@@ -28,6 +28,7 @@ config PPC_MSI_BITMAP
 	default y if PPC_POWERNV
 
 source "arch/powerpc/sysdev/xics/Kconfig"
+source "arch/powerpc/sysdev/xive/Kconfig"
 
 config PPC_SCOM
 	bool
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index a254824719f1..c0ae11d4f62f 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -71,5 +71,6 @@ obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS)	+= udbg_memcons.o
 subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 obj-$(CONFIG_PPC_XICS)		+= xics/
+obj-$(CONFIG_PPC_XIVE)		+= xive/
 
 obj-$(CONFIG_GE_FPGA)		+= ge/
diff --git a/arch/powerpc/sysdev/xive/Kconfig b/arch/powerpc/sysdev/xive/Kconfig
new file mode 100644
index 000000000000..12ccd7373d2f
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/Kconfig
@@ -0,0 +1,11 @@
+config PPC_XIVE
+	bool
+	default n
+	select PPC_SMP_MUXED_IPI
+	select HARDIRQS_SW_RESEND
+
+config PPC_XIVE_NATIVE
+	bool
+	default n
+	select PPC_XIVE
+	depends on PPC_POWERNV
diff --git a/arch/powerpc/sysdev/xive/Makefile b/arch/powerpc/sysdev/xive/Makefile
new file mode 100644
index 000000000000..3fab303fc169
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/Makefile
@@ -0,0 +1,4 @@
+subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
+
+obj-y				+= common.o
+obj-$(CONFIG_PPC_XIVE_NATIVE)	+= native.o
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
new file mode 100644
index 000000000000..d9cd7f705f21
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -0,0 +1,1302 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "xive: " fmt
+
+#include <linux/types.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/debugfs.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/msi.h>
+
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/irq.h>
+#include <asm/errno.h>
+#include <asm/xive.h>
+#include <asm/xive-regs.h>
+#include <asm/xmon.h>
+
+#include "xive-internal.h"
+
+#undef DEBUG_FLUSH
+#undef DEBUG_ALL
+
+#ifdef DEBUG_ALL
+#define DBG_VERBOSE(fmt...)	pr_devel(fmt)
+#else
+#define DBG_VERBOSE(fmt...)	do { } while(0)
+#endif
+
+bool __xive_enabled;
+bool xive_cmdline_disabled;
+
+/* We use only one priority for now */
+static u8 xive_irq_priority;
+
+/* TIMA */
+void __iomem *xive_tima;
+u32 xive_tima_offset;
+
+/* Backend ops */
+static const struct xive_ops *xive_ops;
+
+/* Our global interrupt domain */
+static struct irq_domain *xive_irq_domain;
+
+#ifdef CONFIG_SMP
+/* The IPIs all use the same logical irq number */
+static u32 xive_ipi_irq;
+#endif
+
+/* Xive state for each CPU */
+static DEFINE_PER_CPU(struct xive_cpu *, xive_cpu);
+
+/*
+ * A "disabled" interrupt should never fire, to catch problems
+ * we set its logical number to this
+ */
+#define XIVE_BAD_IRQ		0x7fffffff
+#define XIVE_MAX_IRQ		(XIVE_BAD_IRQ - 1)
+
+/* An invalid CPU target */
+#define XIVE_INVALID_TARGET	(-1)
+
+/*
+ * Read the next entry in a queue, return its content if it's valid
+ * or 0 if there is no new entry.
+ *
+ * The queue pointer is moved forward unless "just_peek" is set
+ */
+static u32 xive_read_eq(struct xive_q *q, bool just_peek)
+{
+	u32 cur;
+
+	if (!q->qpage)
+		return 0;
+	cur = be32_to_cpup(q->qpage + q->idx);
+
+	/* Check valid bit (31) vs current toggle polarity */
+	if ((cur >> 31) == q->toggle)
+		return 0;
+
+	/* If consuming from the queue ... */
+	if (!just_peek) {
+		/* Next entry */
+		q->idx = (q->idx + 1) & q->msk;
+
+		/* Wrap around: flip valid toggle */
+		if (q->idx == 0)
+			q->toggle ^= 1;
+	}
+	/* Mask out the valid bit (31) */
+	return cur & 0x7fffffff;
+}
+
+/*
+ * Scans all the queue that may have interrupts in them
+ * (based on "pending_prio") in priority order until an
+ * interrupt is found or all the queues are empty.
+ *
+ * Then updates the CPPR (Current Processor Priority
+ * Register) based on the most favored interrupt found
+ * (0xff if none) and return what was found (0 if none).
+ *
+ * If just_peek is set, return the most favored pending
+ * interrupt if any but don't update the queue pointers.
+ *
+ * Note: This function can operate generically on any number
+ * of queues (up to 8). The current implementation of the XIVE
+ * driver only uses a single queue however.
+ *
+ * Note2: This will also "flush" "the pending_count" of a queue
+ * into the "count" when that queue is observed to be empty.
+ * This is used to keep track of the amount of interrupts
+ * targetting a queue. When an interrupt is moved away from
+ * a queue, we only decrement that queue count once the queue
+ * has been observed empty to avoid races.
+ */
+static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
+{
+	u32 irq = 0;
+	u8 prio;
+
+	/* Find highest pending priority */
+	while (xc->pending_prio != 0) {
+		struct xive_q *q;
+
+		prio = ffs(xc->pending_prio) - 1;
+		DBG_VERBOSE("scan_irq: trying prio %d\n", prio);
+
+		/* Try to fetch */
+		irq = xive_read_eq(&xc->queue[prio], just_peek);
+
+		/* Found something ? That's it */
+		if (irq)
+			break;
+
+		/* Clear pending bits */
+		xc->pending_prio &= ~(1 << prio);
+
+		/*
+		 * Check if the queue count needs adjusting due to
+		 * interrupts being moved away. See description of
+		 * xive_dec_target_count()
+		 */
+		q = &xc->queue[prio];
+		if (atomic_read(&q->pending_count)) {
+			int p = atomic_xchg(&q->pending_count, 0);
+			if (p) {
+				WARN_ON(p > atomic_read(&q->count));
+				atomic_sub(p, &q->count);
+			}
+		}
+	}
+
+	/* If nothing was found, set CPPR to 0xff */
+	if (irq == 0)
+		prio = 0xff;
+
+	/* Update HW CPPR to match if necessary */
+	if (prio != xc->cppr) {
+		DBG_VERBOSE("scan_irq: adjusting CPPR to %d\n", prio);
+		xc->cppr = prio;
+		out_8(xive_tima + xive_tima_offset + TM_CPPR, prio);
+	}
+
+	return irq;
+}
+
+/*
+ * This is used to perform the magic loads from an ESB
+ * described in xive.h
+ */
+static u8 xive_poke_esb(struct xive_irq_data *xd, u32 offset)
+{
+	u64 val;
+
+	/* Handle HW errata */
+	if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
+		offset |= offset << 4;
+
+	val = in_be64(xd->eoi_mmio + offset);
+
+	return (u8)val;
+}
+
+#ifdef CONFIG_XMON
+static void xive_dump_eq(const char *name, struct xive_q *q)
+{
+	u32 i0, i1, idx;
+
+	if (!q->qpage)
+		return;
+	idx = q->idx;
+	i0 = be32_to_cpup(q->qpage + idx);
+	idx = (idx + 1) & q->msk;
+	i1 = be32_to_cpup(q->qpage + idx);
+	xmon_printf("  %s Q T=%d %08x %08x ...\n", name,
+		    q->toggle, i0, i1);
+}
+
+void xmon_xive_do_dump(int cpu)
+{
+	struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
+
+	xmon_printf("XIVE state for CPU %d:\n", cpu);
+	xmon_printf("  pp=%02x cppr=%02x\n", xc->pending_prio, xc->cppr);
+	xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]);
+#ifdef CONFIG_SMP
+	{
+		u64 val = xive_poke_esb(&xc->ipi_data, XIVE_ESB_GET);
+		xmon_printf("  IPI state: %x:%c%c\n", xc->hw_ipi,
+			val & XIVE_ESB_VAL_P ? 'P' : 'p',
+			val & XIVE_ESB_VAL_P ? 'Q' : 'q');
+	}
+#endif
+}
+#endif /* CONFIG_XMON */
+
+static unsigned int xive_get_irq(void)
+{
+	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+	u32 irq;
+
+	/*
+	 * This can be called either as a result of a HW interrupt or
+	 * as a "replay" because EOI decided there was still something
+	 * in one of the queues.
+	 *
+	 * First we perform an ACK cycle in order to update our mask
+	 * of pending priorities. This will also have the effect of
+	 * updating the CPPR to the most favored pending interrupts.
+	 *
+	 * In the future, if we have a way to differenciate a first
+	 * entry (on HW interrupt) from a replay triggered by EOI,
+	 * we could skip this on replays unless we soft-mask tells us
+	 * that a new HW interrupt occurred.
+	 */
+	xive_ops->update_pending(xc);
+
+	DBG_VERBOSE("get_irq: pending=%02x\n", xc->pending_prio);
+
+	/* Scan our queue(s) for interrupts */
+	irq = xive_scan_interrupts(xc, false);
+
+	DBG_VERBOSE("get_irq: got irq 0x%x, new pending=0x%02x\n",
+	    irq, xc->pending_prio);
+
+	/* Return pending interrupt if any */
+	if (irq == XIVE_BAD_IRQ)
+		return 0;
+	return irq;
+}
+
+/*
+ * After EOI'ing an interrupt, we need to re-check the queue
+ * to see if another interrupt is pending since multiple
+ * interrupts can coalesce into a single notification to the
+ * CPU.
+ *
+ * If we find that there is indeed more in there, we call
+ * force_external_irq_replay() to make Linux synthetize an
+ * external interrupt on the next call to local_irq_restore().
+ */
+static void xive_do_queue_eoi(struct xive_cpu *xc)
+{
+	if (xive_scan_interrupts(xc, true) != 0) {
+		DBG_VERBOSE("eoi: pending=0x%02x\n", xc->pending_prio);
+		force_external_irq_replay();
+	}
+}
+
+/*
+ * EOI an interrupt at the source. There are several methods
+ * to do this depending on the HW version and source type
+ */
+void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
+{
+	/* If the XIVE supports the new "store EOI facility, use it */
+	if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
+		out_be64(xd->eoi_mmio, 0);
+	else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) {
+		/*
+		 * The FW told us to call it. This happens for some
+		 * interrupt sources that need additional HW whacking
+		 * beyond the ESB manipulation. For example LPC interrupts
+		 * on P9 DD1.0 need a latch to be clared in the LPC bridge
+		 * itself. The Firmware will take care of it.
+		 */
+		if (WARN_ON_ONCE(!xive_ops->eoi))
+			return;
+		xive_ops->eoi(hw_irq);
+	} else {
+		u8 eoi_val;
+
+		/*
+		 * Otherwise for EOI, we use the special MMIO that does
+		 * a clear of both P and Q and returns the old Q,
+		 * except for LSIs where we use the "EOI cycle" special
+		 * load.
+		 *
+		 * This allows us to then do a re-trigger if Q was set
+		 * rather than synthesizing an interrupt in software
+		 *
+		 * For LSIs, using the HW EOI cycle works around a problem
+		 * on P9 DD1 PHBs where the other ESB accesses don't work
+		 * properly.
+		 */
+		if (xd->flags & XIVE_IRQ_FLAG_LSI)
+			in_be64(xd->eoi_mmio);
+		else {
+			eoi_val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_00);
+			DBG_VERBOSE("eoi_val=%x\n", offset, eoi_val);
+
+			/* Re-trigger if needed */
+			if ((eoi_val & XIVE_ESB_VAL_Q) && xd->trig_mmio)
+				out_be64(xd->trig_mmio, 0);
+		}
+	}
+}
+
+/* irq_chip eoi callback */
+static void xive_irq_eoi(struct irq_data *d)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+
+	DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n",
+		    d->irq, irqd_to_hwirq(d), xc->pending_prio);
+
+	/* EOI the source if it hasn't been disabled */
+	if (!irqd_irq_disabled(d))
+		xive_do_source_eoi(irqd_to_hwirq(d), xd);
+
+	/*
+	 * Clear saved_p to indicate that it's no longer occupying
+	 * a queue slot on the target queue
+	 */
+	xd->saved_p = false;
+
+	/* Check for more work in the queue */
+	xive_do_queue_eoi(xc);
+}
+
+/*
+ * Helper used to mask and unmask an interrupt source. This
+ * is only called for normal interrupts that do not require
+ * masking/unmasking via firmware.
+ */
+static void xive_do_source_set_mask(struct xive_irq_data *xd,
+				    bool mask)
+{
+	u64 val;
+
+	/*
+	 * If the interrupt had P set, it may be in a queue.
+	 *
+	 * We need to make sure we don't re-enable it until it
+	 * has been fetched from that queue and EOId. We keep
+	 * a copy of that P state and use it to restore the
+	 * ESB accordingly on unmask.
+	 */
+	if (mask) {
+		val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_01);
+		xd->saved_p = !!(val & XIVE_ESB_VAL_P);
+	} else if (xd->saved_p)
+		xive_poke_esb(xd, XIVE_ESB_SET_PQ_10);
+	else
+		xive_poke_esb(xd, XIVE_ESB_SET_PQ_00);
+}
+
+/*
+ * Try to chose "cpu" as a new interrupt target. Increments
+ * the queue accounting for that target if it's not already
+ * full.
+ */
+static bool xive_try_pick_target(int cpu)
+{
+	struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
+	struct xive_q *q = &xc->queue[xive_irq_priority];
+	int max;
+
+	/*
+	 * Calculate max number of interrupts in that queue.
+	 *
+	 * We leave a gap of 1 just in case...
+	 */
+	max = (q->msk + 1) - 1;
+	return !!atomic_add_unless(&q->count, 1, max);
+}
+
+/*
+ * Un-account an interrupt for a target CPU. We don't directly
+ * decrement q->count since the interrupt might still be present
+ * in the queue.
+ *
+ * Instead increment a separate counter "pending_count" which
+ * will be substracted from "count" later when that CPU observes
+ * the queue to be empty.
+ */
+static void xive_dec_target_count(int cpu)
+{
+	struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
+	struct xive_q *q = &xc->queue[xive_irq_priority];
+
+	if (unlikely(WARN_ON(cpu < 0 || !xc))) {
+		pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc);
+		return;
+	}
+
+	/*
+	 * We increment the "pending count" which will be used
+	 * to decrement the target queue count whenever it's next
+	 * processed and found empty. This ensure that we don't
+	 * decrement while we still have the interrupt there
+	 * occupying a slot.
+	 */
+	atomic_inc(&q->pending_count);
+}
+
+/* Find a tentative CPU target in a CPU mask */
+static int xive_find_target_in_mask(const struct cpumask *mask,
+				    unsigned int fuzz)
+{
+	int cpu, first, num, i;
+
+	/* Pick up a starting point CPU in the mask based on  fuzz */
+	num = cpumask_weight(mask);
+	first = fuzz % num;
+
+	/* Locate it */
+	cpu = cpumask_first(mask);
+	for (i = 0; i < first && cpu < nr_cpu_ids; i++)
+		cpu = cpumask_next(cpu, mask);
+
+	/* Sanity check */
+	if (WARN_ON(cpu >= nr_cpu_ids))
+		cpu = cpumask_first(cpu_online_mask);
+
+	/* Remember first one to handle wrap-around */
+	first = cpu;
+
+	/*
+	 * Now go through the entire mask until we find a valid
+	 * target.
+	 */
+	for (;;) {
+		/*
+		 * We re-check online as the fallback case passes us
+		 * an untested affinity mask
+		 */
+		if (cpu_online(cpu) && xive_try_pick_target(cpu))
+			return cpu;
+		cpu = cpumask_next(cpu, mask);
+		if (cpu == first)
+			break;
+		/* Wrap around */
+		if (cpu >= nr_cpu_ids)
+			cpu = cpumask_first(mask);
+	}
+	return -1;
+}
+
+/*
+ * Pick a target CPU for an interrupt. This is done at
+ * startup or if the affinity is changed in a way that
+ * invalidates the current target.
+ */
+static int xive_pick_irq_target(struct irq_data *d,
+				const struct cpumask *affinity)
+{
+	static unsigned int fuzz;
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	cpumask_var_t mask;
+	int cpu = -1;
+
+	/*
+	 * If we have chip IDs, first we try to build a mask of
+	 * CPUs matching the CPU and find a target in there
+	 */
+	if (xd->src_chip != XIVE_INVALID_CHIP_ID &&
+		zalloc_cpumask_var(&mask, GFP_ATOMIC)) {
+		/* Build a mask of matching chip IDs */
+		for_each_cpu_and(cpu, affinity, cpu_online_mask) {
+			struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
+			if (xc->chip_id == xd->src_chip)
+				cpumask_set_cpu(cpu, mask);
+		}
+		/* Try to find a target */
+		if (cpumask_empty(mask))
+			cpu = -1;
+		else
+			cpu = xive_find_target_in_mask(mask, fuzz++);
+		free_cpumask_var(mask);
+		if (cpu >= 0)
+			return cpu;
+		fuzz--;
+	}
+
+	/* No chip IDs, fallback to using the affinity mask */
+	return xive_find_target_in_mask(affinity, fuzz++);
+}
+
+static unsigned int xive_irq_startup(struct irq_data *d)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+	int target, rc;
+
+	pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n",
+		 d->irq, hw_irq, d);
+
+#ifdef CONFIG_PCI_MSI
+	/*
+	 * The generic MSI code returns with the interrupt disabled on the
+	 * card, using the MSI mask bits. Firmware doesn't appear to unmask
+	 * at that level, so we do it here by hand.
+	 */
+	if (irq_data_get_msi_desc(d))
+		pci_msi_unmask_irq(d);
+#endif
+
+	/* Pick a target */
+	target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d));
+	if (target == XIVE_INVALID_TARGET) {
+		/* Try again breaking affinity */
+		target = xive_pick_irq_target(d, cpu_online_mask);
+		if (target == XIVE_INVALID_TARGET)
+			return -ENXIO;
+		pr_warn("irq %d started with broken affinity\n", d->irq);
+	}
+
+	/* Sanity check */
+	if (WARN_ON(target == XIVE_INVALID_TARGET ||
+		    target >= nr_cpu_ids))
+		target = smp_processor_id();
+
+	xd->target = target;
+
+	/*
+	 * Configure the logical number to be the Linux IRQ number
+	 * and set the target queue
+	 */
+	rc = xive_ops->configure_irq(hw_irq,
+				     get_hard_smp_processor_id(target),
+				     xive_irq_priority, d->irq);
+	if (rc)
+		return rc;
+
+	/* Unmask the ESB */
+	xive_do_source_set_mask(xd, false);
+
+	return 0;
+}
+
+static void xive_irq_shutdown(struct irq_data *d)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+
+	pr_devel("xive_irq_shutdown: irq %d [0x%x] data @%p\n",
+		 d->irq, hw_irq, d);
+
+	if (WARN_ON(xd->target == XIVE_INVALID_TARGET))
+		return;
+
+	/* Mask the interrupt at the source */
+	xive_do_source_set_mask(xd, true);
+
+	/*
+	 * The above may have set saved_p. We clear it otherwise it
+	 * will prevent re-enabling later on. It is ok to forget the
+	 * fact that the interrupt might be in a queue because we are
+	 * accounting that already in xive_dec_target_count() and will
+	 * be re-routing it to a new queue with proper accounting when
+	 * it's started up again
+	 */
+	xd->saved_p = false;
+
+	/*
+	 * Mask the interrupt in HW in the IVT/EAS and set the number
+	 * to be the "bad" IRQ number
+	 */
+	xive_ops->configure_irq(hw_irq,
+				get_hard_smp_processor_id(xd->target),
+				0xff, XIVE_BAD_IRQ);
+
+	xive_dec_target_count(xd->target);
+	xd->target = XIVE_INVALID_TARGET;
+}
+
+static void xive_irq_unmask(struct irq_data *d)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+	pr_devel("xive_irq_unmask: irq %d data @%p\n", d->irq, xd);
+
+	/*
+	 * This is a workaround for PCI LSI problems on P9, for
+	 * these, we call FW to set the mask. The problems might
+	 * be fixed by P9 DD2.0, if that is the case, firmware
+	 * will no longer set that flag.
+	 */
+	if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) {
+		unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+		xive_ops->configure_irq(hw_irq,
+					get_hard_smp_processor_id(xd->target),
+					xive_irq_priority, d->irq);
+		return;
+	}
+
+	xive_do_source_set_mask(xd, false);
+}
+
+static void xive_irq_mask(struct irq_data *d)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+	pr_devel("xive_irq_mask: irq %d data @%p\n", d->irq, xd);
+
+	/*
+	 * This is a workaround for PCI LSI problems on P9, for
+	 * these, we call OPAL to set the mask. The problems might
+	 * be fixed by P9 DD2.0, if that is the case, firmware
+	 * will no longer set that flag.
+	 */
+	if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) {
+		unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+		xive_ops->configure_irq(hw_irq,
+					get_hard_smp_processor_id(xd->target),
+					0xff, d->irq);
+		return;
+	}
+
+	xive_do_source_set_mask(xd, true);
+}
+
+static int xive_irq_set_affinity(struct irq_data *d,
+				 const struct cpumask *cpumask,
+				 bool force)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+	u32 target, old_target;
+	int rc = 0;
+
+	pr_devel("xive_irq_set_affinity: irq %d\n", d->irq);
+
+	/* Is this valid ? */
+	if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids)
+		return -EINVAL;
+
+	/*
+	 * If existing target is already in the new mask, and is
+	 * online then do nothing.
+	 */
+	if (xd->target != XIVE_INVALID_TARGET &&
+	    cpu_online(xd->target) &&
+	    cpumask_test_cpu(xd->target, cpumask))
+		return IRQ_SET_MASK_OK;
+
+	/* Pick a new target */
+	target = xive_pick_irq_target(d, cpumask);
+
+	/* No target found */
+	if (target == XIVE_INVALID_TARGET)
+		return -ENXIO;
+
+	/* Sanity check */
+	if (WARN_ON(target >= nr_cpu_ids))
+		target = smp_processor_id();
+
+	old_target = xd->target;
+
+	rc = xive_ops->configure_irq(hw_irq,
+				     get_hard_smp_processor_id(target),
+				     xive_irq_priority, d->irq);
+	if (rc < 0) {
+		pr_err("Error %d reconfiguring irq %d\n", rc, d->irq);
+		return rc;
+	}
+
+	pr_devel("  target: 0x%x\n", target);
+	xd->target = target;
+
+	/* Give up previous target */
+	if (old_target != XIVE_INVALID_TARGET)
+	    xive_dec_target_count(old_target);
+
+	return IRQ_SET_MASK_OK;
+}
+
+static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+	/*
+	 * We only support these. This has really no effect other than setting
+	 * the corresponding descriptor bits mind you but those will in turn
+	 * affect the resend function when re-enabling an edge interrupt.
+	 *
+	 * Set set the default to edge as explained in map().
+	 */
+	if (flow_type == IRQ_TYPE_DEFAULT || flow_type == IRQ_TYPE_NONE)
+		flow_type = IRQ_TYPE_EDGE_RISING;
+
+	if (flow_type != IRQ_TYPE_EDGE_RISING &&
+	    flow_type != IRQ_TYPE_LEVEL_LOW)
+		return -EINVAL;
+
+	irqd_set_trigger_type(d, flow_type);
+
+	/*
+	 * Double check it matches what the FW thinks
+	 *
+	 * NOTE: We don't know yet if the PAPR interface will provide
+	 * the LSI vs MSI information apart from the device-tree so
+	 * this check might have to move into an optional backend call
+	 * that is specific to the native backend
+	 */
+	if ((flow_type == IRQ_TYPE_LEVEL_LOW) !=
+	    !!(xd->flags & XIVE_IRQ_FLAG_LSI)) {
+		pr_warn("Interrupt %d (HW 0x%x) type mismatch, Linux says %s, FW says %s\n",
+			d->irq, (u32)irqd_to_hwirq(d),
+			(flow_type == IRQ_TYPE_LEVEL_LOW) ? "Level" : "Edge",
+			(xd->flags & XIVE_IRQ_FLAG_LSI) ? "Level" : "Edge");
+	}
+
+	return IRQ_SET_MASK_OK_NOCOPY;
+}
+
+static int xive_irq_retrigger(struct irq_data *d)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+	/* This should be only for MSIs */
+	if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
+		return 0;
+
+	/*
+	 * To perform a retrigger, we first set the PQ bits to
+	 * 11, then perform an EOI.
+	 */
+	xive_poke_esb(xd, XIVE_ESB_SET_PQ_11);
+
+	/*
+	 * Note: We pass "0" to the hw_irq argument in order to
+	 * avoid calling into the backend EOI code which we don't
+	 * want to do in the case of a re-trigger. Backends typically
+	 * only do EOI for LSIs anyway.
+	 */
+	xive_do_source_eoi(0, xd);
+
+	return 1;
+}
+
+static struct irq_chip xive_irq_chip = {
+	.name = "XIVE-IRQ",
+	.irq_startup = xive_irq_startup,
+	.irq_shutdown = xive_irq_shutdown,
+	.irq_eoi = xive_irq_eoi,
+	.irq_mask = xive_irq_mask,
+	.irq_unmask = xive_irq_unmask,
+	.irq_set_affinity = xive_irq_set_affinity,
+	.irq_set_type = xive_irq_set_type,
+	.irq_retrigger = xive_irq_retrigger,
+};
+
+bool is_xive_irq(struct irq_chip *chip)
+{
+	return chip == &xive_irq_chip;
+}
+
+void xive_cleanup_irq_data(struct xive_irq_data *xd)
+{
+	if (xd->eoi_mmio) {
+		iounmap(xd->eoi_mmio);
+		if (xd->eoi_mmio == xd->trig_mmio)
+			xd->trig_mmio = NULL;
+		xd->eoi_mmio = NULL;
+	}
+	if (xd->trig_mmio) {
+		iounmap(xd->trig_mmio);
+		xd->trig_mmio = NULL;
+	}
+}
+
+static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
+{
+	struct xive_irq_data *xd;
+	int rc;
+
+	xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL);
+	if (!xd)
+		return -ENOMEM;
+	rc = xive_ops->populate_irq_data(hw, xd);
+	if (rc) {
+		kfree(xd);
+		return rc;
+	}
+	xd->target = XIVE_INVALID_TARGET;
+	irq_set_handler_data(virq, xd);
+
+	return 0;
+}
+
+static void xive_irq_free_data(unsigned int virq)
+{
+	struct xive_irq_data *xd = irq_get_handler_data(virq);
+
+	if (!xd)
+		return;
+	irq_set_handler_data(virq, NULL);
+	xive_cleanup_irq_data(xd);
+	kfree(xd);
+}
+
+#ifdef CONFIG_SMP
+
+static void xive_cause_ipi(int cpu, unsigned long msg)
+{
+	struct xive_cpu *xc;
+	struct xive_irq_data *xd;
+
+	xc = per_cpu(xive_cpu, cpu);
+
+	DBG_VERBOSE("IPI msg#%ld CPU %d -> %d (HW IRQ 0x%x)\n",
+		    msg, smp_processor_id(), cpu, xc->hw_ipi);
+
+	xd = &xc->ipi_data;
+	if (WARN_ON(!xd->trig_mmio))
+		return;
+	out_be64(xd->trig_mmio, 0);
+}
+
+static irqreturn_t xive_muxed_ipi_action(int irq, void *dev_id)
+{
+	return smp_ipi_demux();
+}
+
+static void xive_ipi_eoi(struct irq_data *d)
+{
+	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+
+	/* Handle possible race with unplug and drop stale IPIs */
+	if (!xc)
+		return;
+	xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data);
+	xive_do_queue_eoi(xc);
+}
+
+static void xive_ipi_do_nothing(struct irq_data *d)
+{
+	/*
+	 * Nothing to do, we never mask/unmask IPIs, but the callback
+	 * has to exist for the struct irq_chip.
+	 */
+}
+
+static struct irq_chip xive_ipi_chip = {
+	.name = "XIVE-IPI",
+	.irq_eoi = xive_ipi_eoi,
+	.irq_mask = xive_ipi_do_nothing,
+	.irq_unmask = xive_ipi_do_nothing,
+};
+
+static void __init xive_request_ipi(void)
+{
+	unsigned int virq;
+
+	/*
+	 * Initialization failed, move on, we might manage to
+	 * reach the point where we display our errors before
+	 * the system falls appart
+	 */
+	if (!xive_irq_domain)
+		return;
+
+	/* Initialize it */
+	virq = irq_create_mapping(xive_irq_domain, 0);
+	xive_ipi_irq = virq;
+
+	WARN_ON(request_irq(virq, xive_muxed_ipi_action,
+			    IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
+}
+
+static int xive_setup_cpu_ipi(unsigned int cpu)
+{
+	struct xive_cpu *xc;
+	int rc;
+
+	pr_debug("Setting up IPI for CPU %d\n", cpu);
+
+	xc = per_cpu(xive_cpu, cpu);
+
+	/* Check if we are already setup */
+	if (xc->hw_ipi != 0)
+		return 0;
+
+	/* Grab an IPI from the backend, this will populate xc->hw_ipi */
+	if (xive_ops->get_ipi(cpu, xc))
+		return -EIO;
+
+	/*
+	 * Populate the IRQ data in the xive_cpu structure and
+	 * configure the HW / enable the IPIs.
+	 */
+	rc = xive_ops->populate_irq_data(xc->hw_ipi, &xc->ipi_data);
+	if (rc) {
+		pr_err("Failed to populate IPI data on CPU %d\n", cpu);
+		return -EIO;
+	}
+	rc = xive_ops->configure_irq(xc->hw_ipi,
+				     get_hard_smp_processor_id(cpu),
+				     xive_irq_priority, xive_ipi_irq);
+	if (rc) {
+		pr_err("Failed to map IPI CPU %d\n", cpu);
+		return -EIO;
+	}
+	pr_devel("CPU %d HW IPI %x, virq %d, trig_mmio=%p\n", cpu,
+	    xc->hw_ipi, xive_ipi_irq, xc->ipi_data.trig_mmio);
+
+	/* Unmask it */
+	xive_do_source_set_mask(&xc->ipi_data, false);
+
+	return 0;
+}
+
+static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc)
+{
+	/* Disable the IPI and free the IRQ data */
+
+	/* Already cleaned up ? */
+	if (xc->hw_ipi == 0)
+		return;
+
+	/* Mask the IPI */
+	xive_do_source_set_mask(&xc->ipi_data, true);
+
+	/*
+	 * Note: We don't call xive_cleanup_irq_data() to free
+	 * the mappings as this is called from an IPI on kexec
+	 * which is not a safe environment to call iounmap()
+	 */
+
+	/* Deconfigure/mask in the backend */
+	xive_ops->configure_irq(xc->hw_ipi, hard_smp_processor_id(),
+				0xff, xive_ipi_irq);
+
+	/* Free the IPIs in the backend */
+	xive_ops->put_ipi(cpu, xc);
+}
+
+void __init xive_smp_probe(void)
+{
+	smp_ops->cause_ipi = xive_cause_ipi;
+
+	/* Register the IPI */
+	xive_request_ipi();
+
+	/* Allocate and setup IPI for the boot CPU */
+	xive_setup_cpu_ipi(smp_processor_id());
+}
+
+#endif /* CONFIG_SMP */
+
+static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq,
+			       irq_hw_number_t hw)
+{
+	int rc;
+
+	/*
+	 * Mark interrupts as edge sensitive by default so that resend
+	 * actually works. Will fix that up below if needed.
+	 */
+	irq_clear_status_flags(virq, IRQ_LEVEL);
+
+#ifdef CONFIG_SMP
+	/* IPIs are special and come up with HW number 0 */
+	if (hw == 0) {
+		/*
+		 * IPIs are marked per-cpu. We use separate HW interrupts under
+		 * the hood but associated with the same "linux" interrupt
+		 */
+		irq_set_chip_and_handler(virq, &xive_ipi_chip,
+					 handle_percpu_irq);
+		return 0;
+	}
+#endif
+
+	rc = xive_irq_alloc_data(virq, hw);
+	if (rc)
+		return rc;
+
+	irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq);
+
+	return 0;
+}
+
+static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq)
+{
+	struct irq_data *data = irq_get_irq_data(virq);
+	unsigned int hw_irq;
+
+	/* XXX Assign BAD number */
+	if (!data)
+		return;
+	hw_irq = (unsigned int)irqd_to_hwirq(data);
+	if (hw_irq)
+		xive_irq_free_data(virq);
+}
+
+static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node *ct,
+				 const u32 *intspec, unsigned int intsize,
+				 irq_hw_number_t *out_hwirq, unsigned int *out_flags)
+
+{
+	*out_hwirq = intspec[0];
+
+	/*
+	 * If intsize is at least 2, we look for the type in the second cell,
+	 * we assume the LSB indicates a level interrupt.
+	 */
+	if (intsize > 1) {
+		if (intspec[1] & 1)
+			*out_flags = IRQ_TYPE_LEVEL_LOW;
+		else
+			*out_flags = IRQ_TYPE_EDGE_RISING;
+	} else
+		*out_flags = IRQ_TYPE_LEVEL_LOW;
+
+	return 0;
+}
+
+static int xive_irq_domain_match(struct irq_domain *h, struct device_node *node,
+				 enum irq_domain_bus_token bus_token)
+{
+	return xive_ops->match(node);
+}
+
+static const struct irq_domain_ops xive_irq_domain_ops = {
+	.match = xive_irq_domain_match,
+	.map = xive_irq_domain_map,
+	.unmap = xive_irq_domain_unmap,
+	.xlate = xive_irq_domain_xlate,
+};
+
+static void __init xive_init_host(void)
+{
+	xive_irq_domain = irq_domain_add_nomap(NULL, XIVE_MAX_IRQ,
+					       &xive_irq_domain_ops, NULL);
+	if (WARN_ON(xive_irq_domain == NULL))
+		return;
+	irq_set_default_host(xive_irq_domain);
+}
+
+static void xive_cleanup_cpu_queues(unsigned int cpu, struct xive_cpu *xc)
+{
+	if (xc->queue[xive_irq_priority].qpage)
+		xive_ops->cleanup_queue(cpu, xc, xive_irq_priority);
+}
+
+static int xive_setup_cpu_queues(unsigned int cpu, struct xive_cpu *xc)
+{
+	int rc = 0;
+
+	/* We setup 1 queues for now with a 64k page */
+	if (!xc->queue[xive_irq_priority].qpage)
+		rc = xive_ops->setup_queue(cpu, xc, xive_irq_priority);
+
+	return rc;
+}
+
+static int xive_prepare_cpu(unsigned int cpu)
+{
+	struct xive_cpu *xc;
+
+	xc = per_cpu(xive_cpu, cpu);
+	if (!xc) {
+		struct device_node *np;
+
+		xc = kzalloc_node(sizeof(struct xive_cpu),
+				  GFP_KERNEL, cpu_to_node(cpu));
+		if (!xc)
+			return -ENOMEM;
+		np = of_get_cpu_node(cpu, NULL);
+		if (np)
+			xc->chip_id = of_get_ibm_chip_id(np);
+		of_node_put(np);
+
+		per_cpu(xive_cpu, cpu) = xc;
+	}
+
+	/* Setup EQs if not already */
+	return xive_setup_cpu_queues(cpu, xc);
+}
+
+static void xive_setup_cpu(void)
+{
+	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+
+	/* Debug: Dump the TM state */
+	pr_devel("CPU %d [HW 0x%02x] VT=%02x\n",
+	    smp_processor_id(), hard_smp_processor_id(),
+	    in_8(xive_tima + xive_tima_offset + TM_WORD2));
+
+	/* The backend might have additional things to do */
+	if (xive_ops->setup_cpu)
+		xive_ops->setup_cpu(smp_processor_id(), xc);
+
+	/* Set CPPR to 0xff to enable flow of interrupts */
+	xc->cppr = 0xff;
+	out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff);
+}
+
+#ifdef CONFIG_SMP
+void xive_smp_setup_cpu(void)
+{
+	pr_devel("SMP setup CPU %d\n", smp_processor_id());
+
+	/* This will have already been done on the boot CPU */
+	if (smp_processor_id() != boot_cpuid)
+		xive_setup_cpu();
+
+}
+
+int xive_smp_prepare_cpu(unsigned int cpu)
+{
+	int rc;
+
+	/* Allocate per-CPU data and queues */
+	rc = xive_prepare_cpu(cpu);
+	if (rc)
+		return rc;
+
+	/* Allocate and setup IPI for the new CPU */
+	return xive_setup_cpu_ipi(cpu);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc)
+{
+	u32 irq;
+
+	/* We assume local irqs are disabled */
+	WARN_ON(!irqs_disabled());
+
+	/* Check what's already in the CPU queue */
+	while ((irq = xive_scan_interrupts(xc, false)) != 0) {
+		/*
+		 * We need to re-route that interrupt to its new destination.
+		 * First get and lock the descriptor
+		 */
+		struct irq_desc *desc = irq_to_desc(irq);
+		struct irq_data *d = irq_desc_get_irq_data(desc);
+		struct xive_irq_data *xd;
+		unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+
+		/*
+		 * Ignore anything that isn't a XIVE irq and ignore
+		 * IPIs, so can just be dropped.
+		 */
+		if (d->domain != xive_irq_domain || hw_irq == 0)
+			continue;
+
+		/*
+		 * The IRQ should have already been re-routed, it's just a
+		 * stale in the old queue, so re-trigger it in order to make
+		 * it reach is new destination.
+		 */
+#ifdef DEBUG_FLUSH
+		pr_info("CPU %d: Got irq %d while offline, re-sending...\n",
+			cpu, irq);
+#endif
+		raw_spin_lock(&desc->lock);
+		xd = irq_desc_get_handler_data(desc);
+
+		/*
+		 * For LSIs, we EOI, this will cause a resend if it's
+		 * still asserted. Otherwise do an MSI retrigger.
+		 */
+		if (xd->flags & XIVE_IRQ_FLAG_LSI)
+			xive_do_source_eoi(irqd_to_hwirq(d), xd);
+		else
+			xive_irq_retrigger(d);
+
+		raw_spin_unlock(&desc->lock);
+	}
+}
+
+void xive_smp_disable_cpu(void)
+{
+	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+	unsigned int cpu = smp_processor_id();
+
+	/* Migrate interrupts away from the CPU */
+	irq_migrate_all_off_this_cpu();
+
+	/* Set CPPR to 0 to disable flow of interrupts */
+	xc->cppr = 0;
+	out_8(xive_tima + xive_tima_offset + TM_CPPR, 0);
+
+	/* Flush everything still in the queue */
+	xive_flush_cpu_queue(cpu, xc);
+
+	/* Re-enable CPPR  */
+	xc->cppr = 0xff;
+	out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff);
+}
+
+void xive_flush_interrupt(void)
+{
+	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+	unsigned int cpu = smp_processor_id();
+
+	/* Called if an interrupt occurs while the CPU is hot unplugged */
+	xive_flush_cpu_queue(cpu, xc);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+#endif /* CONFIG_SMP */
+
+void xive_kexec_teardown_cpu(int secondary)
+{
+	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+	unsigned int cpu = smp_processor_id();
+
+	/* Set CPPR to 0 to disable flow of interrupts */
+	xc->cppr = 0;
+	out_8(xive_tima + xive_tima_offset + TM_CPPR, 0);
+
+	/* Backend cleanup if any */
+	if (xive_ops->teardown_cpu)
+		xive_ops->teardown_cpu(cpu, xc);
+
+#ifdef CONFIG_SMP
+	/* Get rid of IPI */
+	xive_cleanup_cpu_ipi(cpu, xc);
+#endif
+
+	/* Disable and free the queues */
+	xive_cleanup_cpu_queues(cpu, xc);
+}
+
+void xive_shutdown(void)
+{
+	xive_ops->shutdown();
+}
+
+bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset,
+		    u8 max_prio)
+{
+	xive_tima = area;
+	xive_tima_offset = offset;
+	xive_ops = ops;
+	xive_irq_priority = max_prio;
+
+	ppc_md.get_irq = xive_get_irq;
+	__xive_enabled = true;
+
+	pr_devel("Initializing host..\n");
+	xive_init_host();
+
+	pr_devel("Initializing boot CPU..\n");
+
+	/* Allocate per-CPU data and queues */
+	xive_prepare_cpu(smp_processor_id());
+
+	/* Get ready for interrupts */
+	xive_setup_cpu();
+
+	pr_info("Interrupt handling intialized with %s backend\n",
+		xive_ops->name);
+	pr_info("Using priority %d for all interrupts\n", max_prio);
+
+	return true;
+}
+
+static int __init xive_off(char *arg)
+{
+	xive_cmdline_disabled = true;
+	return 0;
+}
+__setup("xive=off", xive_off);
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
new file mode 100644
index 000000000000..5fae59186cb2
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -0,0 +1,639 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "xive: " fmt
+
+#include <linux/types.h>
+#include <linux/irq.h>
+#include <linux/debugfs.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/cpumask.h>
+#include <linux/mm.h>
+
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/irq.h>
+#include <asm/errno.h>
+#include <asm/xive.h>
+#include <asm/xive-regs.h>
+#include <asm/opal.h>
+
+#include "xive-internal.h"
+
+
+static u32 xive_provision_size;
+static u32 *xive_provision_chips;
+static u32 xive_provision_chip_count;
+static u32 xive_queue_shift;
+static u32 xive_pool_vps = XIVE_INVALID_VP;
+static struct kmem_cache *xive_provision_cache;
+
+int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
+{
+	__be64 flags, eoi_page, trig_page;
+	__be32 esb_shift, src_chip;
+	u64 opal_flags;
+	s64 rc;
+
+	memset(data, 0, sizeof(*data));
+
+	rc = opal_xive_get_irq_info(hw_irq, &flags, &eoi_page, &trig_page,
+				    &esb_shift, &src_chip);
+	if (rc) {
+		pr_err("opal_xive_get_irq_info(0x%x) returned %lld\n",
+		       hw_irq, rc);
+		return -EINVAL;
+	}
+
+	opal_flags = be64_to_cpu(flags);
+	if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI)
+		data->flags |= XIVE_IRQ_FLAG_STORE_EOI;
+	if (opal_flags & OPAL_XIVE_IRQ_LSI)
+		data->flags |= XIVE_IRQ_FLAG_LSI;
+	if (opal_flags & OPAL_XIVE_IRQ_SHIFT_BUG)
+		data->flags |= XIVE_IRQ_FLAG_SHIFT_BUG;
+	if (opal_flags & OPAL_XIVE_IRQ_MASK_VIA_FW)
+		data->flags |= XIVE_IRQ_FLAG_MASK_FW;
+	if (opal_flags & OPAL_XIVE_IRQ_EOI_VIA_FW)
+		data->flags |= XIVE_IRQ_FLAG_EOI_FW;
+	data->eoi_page = be64_to_cpu(eoi_page);
+	data->trig_page = be64_to_cpu(trig_page);
+	data->esb_shift = be32_to_cpu(esb_shift);
+	data->src_chip = be32_to_cpu(src_chip);
+
+	data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift);
+	if (!data->eoi_mmio) {
+		pr_err("Failed to map EOI page for irq 0x%x\n", hw_irq);
+		return -ENOMEM;
+	}
+
+	if (!data->trig_page)
+		return 0;
+	if (data->trig_page == data->eoi_page) {
+		data->trig_mmio = data->eoi_mmio;
+		return 0;
+	}
+
+	data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift);
+	if (!data->trig_mmio) {
+		pr_err("Failed to map trigger page for irq 0x%x\n", hw_irq);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
+{
+	s64 rc;
+
+	for (;;) {
+		rc = opal_xive_set_irq_config(hw_irq, target, prio, sw_irq);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+	return rc == 0 ? 0 : -ENXIO;
+}
+
+/* This can be called multiple time to change a queue configuration */
+int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
+				__be32 *qpage, u32 order, bool can_escalate)
+{
+	s64 rc = 0;
+	__be64 qeoi_page_be;
+	__be32 esc_irq_be;
+	u64 flags, qpage_phys;
+
+	/* If there's an actual queue page, clean it */
+	if (order) {
+		if (WARN_ON(!qpage))
+			return -EINVAL;
+		qpage_phys = __pa(qpage);
+	} else
+		qpage_phys = 0;
+
+	/* Initialize the rest of the fields */
+	q->msk = order ? ((1u << (order - 2)) - 1) : 0;
+	q->idx = 0;
+	q->toggle = 0;
+
+	rc = opal_xive_get_queue_info(vp_id, prio, NULL, NULL,
+				      &qeoi_page_be,
+				      &esc_irq_be,
+				      NULL);
+	if (rc) {
+		pr_err("Error %lld getting queue info prio %d\n", rc, prio);
+		rc = -EIO;
+		goto fail;
+	}
+	q->eoi_phys = be64_to_cpu(qeoi_page_be);
+
+	/* Default flags */
+	flags = OPAL_XIVE_EQ_ALWAYS_NOTIFY | OPAL_XIVE_EQ_ENABLED;
+
+	/* Escalation needed ? */
+	if (can_escalate) {
+		q->esc_irq = be32_to_cpu(esc_irq_be);
+		flags |= OPAL_XIVE_EQ_ESCALATE;
+	}
+
+	/* Configure and enable the queue in HW */
+	for (;;) {
+		rc = opal_xive_set_queue_info(vp_id, prio, qpage_phys, order, flags);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+	if (rc) {
+		pr_err("Error %lld setting queue for prio %d\n", rc, prio);
+		rc = -EIO;
+	} else {
+		/*
+		 * KVM code requires all of the above to be visible before
+		 * q->qpage is set due to how it manages IPI EOIs
+		 */
+		wmb();
+		q->qpage = qpage;
+	}
+fail:
+	return rc;
+}
+
+static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
+{
+	s64 rc;
+
+	/* Disable the queue in HW */
+	for (;;) {
+		rc = opal_xive_set_queue_info(vp_id, prio, 0, 0, 0);
+			break;
+		msleep(1);
+	}
+	if (rc)
+		pr_err("Error %lld disabling queue for prio %d\n", rc, prio);
+}
+
+void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
+{
+	__xive_native_disable_queue(vp_id, q, prio);
+}
+
+static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
+{
+	struct xive_q *q = &xc->queue[prio];
+	unsigned int alloc_order;
+	struct page *pages;
+	__be32 *qpage;
+
+	alloc_order = (xive_queue_shift > PAGE_SHIFT) ?
+		(xive_queue_shift - PAGE_SHIFT) : 0;
+	pages = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, alloc_order);
+	if (!pages)
+		return -ENOMEM;
+	qpage = (__be32 *)page_address(pages);
+	memset(qpage, 0, 1 << xive_queue_shift);
+	return xive_native_configure_queue(get_hard_smp_processor_id(cpu),
+					   q, prio, qpage, xive_queue_shift, false);
+}
+
+static void xive_native_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
+{
+	struct xive_q *q = &xc->queue[prio];
+	unsigned int alloc_order;
+
+	/*
+	 * We use the variant with no iounmap as this is called on exec
+	 * from an IPI and iounmap isn't safe
+	 */
+	__xive_native_disable_queue(get_hard_smp_processor_id(cpu), q, prio);
+	alloc_order = (xive_queue_shift > PAGE_SHIFT) ?
+		(xive_queue_shift - PAGE_SHIFT) : 0;
+	free_pages((unsigned long)q->qpage, alloc_order);
+	q->qpage = NULL;
+}
+
+static bool xive_native_match(struct device_node *node)
+{
+	return of_device_is_compatible(node, "ibm,opal-xive-vc");
+}
+
+#ifdef CONFIG_SMP
+static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc)
+{
+	struct device_node *np;
+	unsigned int chip_id;
+	s64 irq;
+
+	/* Find the chip ID */
+	np = of_get_cpu_node(cpu, NULL);
+	if (np) {
+		if (of_property_read_u32(np, "ibm,chip-id", &chip_id) < 0)
+			chip_id = 0;
+	}
+
+	/* Allocate an IPI and populate info about it */
+	for (;;) {
+		irq = opal_xive_allocate_irq(chip_id);
+		if (irq == OPAL_BUSY) {
+			msleep(1);
+			continue;
+		}
+		if (irq < 0) {
+			pr_err("Failed to allocate IPI on CPU %d\n", cpu);
+			return -ENXIO;
+		}
+		xc->hw_ipi = irq;
+		break;
+	}
+	return 0;
+}
+
+u32 xive_native_alloc_irq(void)
+{
+	s64 rc;
+
+	for (;;) {
+		rc = opal_xive_allocate_irq(OPAL_XIVE_ANY_CHIP);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+	if (rc < 0)
+		return 0;
+	return rc;
+}
+
+void xive_native_free_irq(u32 irq)
+{
+	for (;;) {
+		s64 rc = opal_xive_free_irq(irq);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+}
+
+static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc)
+{
+	s64 rc;
+
+	/* Free the IPI */
+	if (!xc->hw_ipi)
+		return;
+	for (;;) {
+		rc = opal_xive_free_irq(xc->hw_ipi);
+		if (rc == OPAL_BUSY) {
+			msleep(1);
+			continue;
+		}
+		xc->hw_ipi = 0;
+		break;
+	}
+}
+#endif /* CONFIG_SMP */
+
+static void xive_native_shutdown(void)
+{
+	/* Switch the XIVE to emulation mode */
+	opal_xive_reset(OPAL_XIVE_MODE_EMU);
+}
+
+/*
+ * Perform an "ack" cycle on the current thread, thus
+ * grabbing the pending active priorities and updating
+ * the CPPR to the most favored one.
+ */
+static void xive_native_update_pending(struct xive_cpu *xc)
+{
+	u8 he, cppr;
+	u16 ack;
+
+	/* Perform the acknowledge hypervisor to register cycle */
+	ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_HV_REG));
+
+	/* Synchronize subsequent queue accesses */
+	mb();
+
+	/*
+	 * Grab the CPPR and the "HE" field which indicates the source
+	 * of the hypervisor interrupt (if any)
+	 */
+	cppr = ack & 0xff;
+	he = GETFIELD(TM_QW3_NSR_HE, (ack >> 8));
+	switch(he) {
+	case TM_QW3_NSR_HE_NONE: /* Nothing to see here */
+		break;
+	case TM_QW3_NSR_HE_PHYS: /* Physical thread interrupt */
+		if (cppr == 0xff)
+			return;
+		/* Mark the priority pending */
+		xc->pending_prio |= 1 << cppr;
+
+		/*
+		 * A new interrupt should never have a CPPR less favored
+		 * than our current one.
+		 */
+		if (cppr >= xc->cppr)
+			pr_err("CPU %d odd ack CPPR, got %d at %d\n",
+			       smp_processor_id(), cppr, xc->cppr);
+
+		/* Update our idea of what the CPPR is */
+		xc->cppr = cppr;
+		break;
+	case TM_QW3_NSR_HE_POOL: /* HV Pool interrupt (unused) */
+	case TM_QW3_NSR_HE_LSI:  /* Legacy FW LSI (unused) */
+		pr_err("CPU %d got unexpected interrupt type HE=%d\n",
+		       smp_processor_id(), he);
+		return;
+	}
+}
+
+static void xive_native_eoi(u32 hw_irq)
+{
+	/*
+	 * Not normally used except if specific interrupts need
+	 * a workaround on EOI.
+	 */
+	opal_int_eoi(hw_irq);
+}
+
+static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc)
+{
+	s64 rc;
+	u32 vp;
+	__be64 vp_cam_be;
+	u64 vp_cam;
+
+	if (xive_pool_vps == XIVE_INVALID_VP)
+		return;
+
+	/* Enable the pool VP */
+	vp = xive_pool_vps + get_hard_smp_processor_id(cpu);
+	pr_debug("CPU %d setting up pool VP 0x%x\n", cpu, vp);
+	for (;;) {
+		rc = opal_xive_set_vp_info(vp, OPAL_XIVE_VP_ENABLED, 0);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+	if (rc) {
+		pr_err("Failed to enable pool VP on CPU %d\n", cpu);
+		return;
+	}
+
+	/* Grab it's CAM value */
+	rc = opal_xive_get_vp_info(vp, NULL, &vp_cam_be, NULL, NULL);
+	if (rc) {
+		pr_err("Failed to get pool VP info CPU %d\n", cpu);
+		return;
+	}
+	vp_cam = be64_to_cpu(vp_cam_be);
+
+	pr_debug("VP CAM = %llx\n", vp_cam);
+
+	/* Push it on the CPU (set LSMFB to 0xff to skip backlog scan) */
+	pr_debug("(Old HW value: %08x)\n",
+		 in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2));
+	out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD0, 0xff);
+	out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2,
+		 TM_QW2W2_VP | vp_cam);
+	pr_debug("(New HW value: %08x)\n",
+		 in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2));
+}
+
+static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc)
+{
+	s64 rc;
+	u32 vp;
+
+	if (xive_pool_vps == XIVE_INVALID_VP)
+		return;
+
+	/* Pull the pool VP from the CPU */
+	in_be64(xive_tima + TM_SPC_PULL_POOL_CTX);
+
+	/* Disable it */
+	vp = xive_pool_vps + get_hard_smp_processor_id(cpu);
+	for (;;) {
+		rc = opal_xive_set_vp_info(vp, 0, 0);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+}
+
+static void xive_native_sync_source(u32 hw_irq)
+{
+	opal_xive_sync(XIVE_SYNC_EAS, hw_irq);
+}
+
+static const struct xive_ops xive_native_ops = {
+	.populate_irq_data	= xive_native_populate_irq_data,
+	.configure_irq		= xive_native_configure_irq,
+	.setup_queue		= xive_native_setup_queue,
+	.cleanup_queue		= xive_native_cleanup_queue,
+	.match			= xive_native_match,
+	.shutdown		= xive_native_shutdown,
+	.update_pending		= xive_native_update_pending,
+	.eoi			= xive_native_eoi,
+	.setup_cpu		= xive_native_setup_cpu,
+	.teardown_cpu		= xive_native_teardown_cpu,
+	.sync_source		= xive_native_sync_source,
+#ifdef CONFIG_SMP
+	.get_ipi		= xive_native_get_ipi,
+	.put_ipi		= xive_native_put_ipi,
+#endif /* CONFIG_SMP */
+	.name			= "native",
+};
+
+static bool xive_parse_provisioning(struct device_node *np)
+{
+	int rc;
+
+	if (of_property_read_u32(np, "ibm,xive-provision-page-size",
+				 &xive_provision_size) < 0)
+		return true;
+	rc = of_property_count_elems_of_size(np, "ibm,xive-provision-chips", 4);
+	if (rc < 0) {
+		pr_err("Error %d getting provision chips array\n", rc);
+		return false;
+	}
+	xive_provision_chip_count = rc;
+	if (rc == 0)
+		return true;
+
+	xive_provision_chips = kzalloc(4 * xive_provision_chip_count,
+				       GFP_KERNEL);
+	if (WARN_ON(!xive_provision_chips))
+		return false;
+
+	rc = of_property_read_u32_array(np, "ibm,xive-provision-chips",
+					xive_provision_chips,
+					xive_provision_chip_count);
+	if (rc < 0) {
+		pr_err("Error %d reading provision chips array\n", rc);
+		return false;
+	}
+
+	xive_provision_cache = kmem_cache_create("xive-provision",
+						 xive_provision_size,
+						 xive_provision_size,
+						 0, NULL);
+	if (!xive_provision_cache) {
+		pr_err("Failed to allocate provision cache\n");
+		return false;
+	}
+	return true;
+}
+
+u32 xive_native_default_eq_shift(void)
+{
+	return xive_queue_shift;
+}
+
+bool xive_native_init(void)
+{
+	struct device_node *np;
+	struct resource r;
+	void __iomem *tima;
+	struct property *prop;
+	u8 max_prio = 7;
+	const __be32 *p;
+	u32 val;
+	s64 rc;
+
+	if (xive_cmdline_disabled)
+		return false;
+
+	pr_devel("xive_native_init()\n");
+	np = of_find_compatible_node(NULL, NULL, "ibm,opal-xive-pe");
+	if (!np) {
+		pr_devel("not found !\n");
+		return false;
+	}
+	pr_devel("Found %s\n", np->full_name);
+
+	/* Resource 1 is HV window */
+	if (of_address_to_resource(np, 1, &r)) {
+		pr_err("Failed to get thread mgmnt area resource\n");
+		return false;
+	}
+	tima = ioremap(r.start, resource_size(&r));
+	if (!tima) {
+		pr_err("Failed to map thread mgmnt area\n");
+		return false;
+	}
+
+	/* Read number of priorities */
+	if (of_property_read_u32(np, "ibm,xive-#priorities", &val) == 0)
+		max_prio = val - 1;
+
+	/* Iterate the EQ sizes and pick one */
+	of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, p, val) {
+		xive_queue_shift = val;
+		if (val == PAGE_SHIFT)
+			break;
+	}
+
+	/* Grab size of provisioning pages */
+	xive_parse_provisioning(np);
+
+	/* Switch the XIVE to exploitation mode */
+	rc = opal_xive_reset(OPAL_XIVE_MODE_EXPL);
+	if (rc) {
+		pr_err("Switch to exploitation mode failed with error %lld\n", rc);
+		return false;
+	}
+
+	/* Initialize XIVE core with our backend */
+	if (!xive_core_init(&xive_native_ops, tima, TM_QW3_HV_PHYS,
+			    max_prio)) {
+		opal_xive_reset(OPAL_XIVE_MODE_EMU);
+		return false;
+	}
+	pr_info("Using %dkB queues\n", 1 << (xive_queue_shift - 10));
+	return true;
+}
+
+static bool xive_native_provision_pages(void)
+{
+	u32 i;
+	void *p;
+
+	for (i = 0; i < xive_provision_chip_count; i++) {
+		u32 chip = xive_provision_chips[i];
+
+		/*
+		 * XXX TODO: Try to make the allocation local to the node where
+		 * the chip resides.
+		 */
+		p = kmem_cache_alloc(xive_provision_cache, GFP_KERNEL);
+		if (!p) {
+			pr_err("Failed to allocate provisioning page\n");
+			return false;
+		}
+		opal_xive_donate_page(chip, __pa(p));
+	}
+	return true;
+}
+
+u32 xive_native_alloc_vp_block(u32 max_vcpus)
+{
+	s64 rc;
+	u32 order;
+
+	order = fls(max_vcpus) - 1;
+	if (max_vcpus > (1 << order))
+		order++;
+
+	pr_info("VP block alloc, for max VCPUs %d use order %d\n",
+		max_vcpus, order);
+
+	for (;;) {
+		rc = opal_xive_alloc_vp_block(order);
+		switch (rc) {
+		case OPAL_BUSY:
+			msleep(1);
+			break;
+		case OPAL_XIVE_PROVISIONING:
+			if (!xive_native_provision_pages())
+				return XIVE_INVALID_VP;
+			break;
+		default:
+			if (rc < 0) {
+				pr_err("OPAL failed to allocate VCPUs order %d, err %lld\n",
+				       order, rc);
+				return XIVE_INVALID_VP;
+			}
+			return rc;
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(xive_native_alloc_vp_block);
+
+void xive_native_free_vp_block(u32 vp_base)
+{
+	s64 rc;
+
+	if (vp_base == XIVE_INVALID_VP)
+		return;
+
+	rc = opal_xive_free_vp_block(vp_base);
+	if (rc < 0)
+		pr_warn("OPAL error %lld freeing VP block\n", rc);
+}
+EXPORT_SYMBOL_GPL(xive_native_free_vp_block);
diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
new file mode 100644
index 000000000000..d07ef2d29caf
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/xive-internal.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef __XIVE_INTERNAL_H
+#define __XIVE_INTERNAL_H
+
+/* Each CPU carry one of these with various per-CPU state */
+struct xive_cpu {
+#ifdef CONFIG_SMP
+	/* HW irq number and data of IPI */
+	u32 hw_ipi;
+	struct xive_irq_data ipi_data;
+#endif /* CONFIG_SMP */
+
+	int chip_id;
+
+	/* Queue datas. Only one is populated */
+#define XIVE_MAX_QUEUES	8
+	struct xive_q queue[XIVE_MAX_QUEUES];
+
+	/*
+	 * Pending mask. Each bit corresponds to a priority that
+	 * potentially has pending interrupts.
+	 */
+	u8 pending_prio;
+
+	/* Cache of HW CPPR */
+	u8 cppr;
+};
+
+/* Backend ops */
+struct xive_ops {
+	int	(*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data);
+	int 	(*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
+	int	(*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
+	void	(*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
+	void	(*setup_cpu)(unsigned int cpu, struct xive_cpu *xc);
+	void	(*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc);
+	bool	(*match)(struct device_node *np);
+	void	(*shutdown)(void);
+
+	void	(*update_pending)(struct xive_cpu *xc);
+	void	(*eoi)(u32 hw_irq);
+	void	(*sync_source)(u32 hw_irq);
+#ifdef CONFIG_SMP
+	int	(*get_ipi)(unsigned int cpu, struct xive_cpu *xc);
+	void	(*put_ipi)(unsigned int cpu, struct xive_cpu *xc);
+#endif
+	const char *name;
+};
+
+bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset,
+		    u8 max_prio);
+
+extern bool xive_cmdline_disabled;
+
+#endif /*  __XIVE_INTERNAL_H */
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 16321ad9e70c..67435b9bf98d 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -30,6 +30,7 @@
 #include <linux/ctype.h>
 
 #include <asm/ptrace.h>
+#include <asm/smp.h>
 #include <asm/string.h>
 #include <asm/prom.h>
 #include <asm/machdep.h>
@@ -48,7 +49,7 @@
 #include <asm/reg.h>
 #include <asm/debug.h>
 #include <asm/hw_breakpoint.h>
-
+#include <asm/xive.h>
 #include <asm/opal.h>
 #include <asm/firmware.h>
 
@@ -232,7 +233,13 @@ Commands:\n\
   "\
   dr	dump stream of raw bytes\n\
   dt	dump the tracing buffers (uses printk)\n\
-  e	print exception information\n\
+"
+#ifdef CONFIG_PPC_POWERNV
+"  dx#   dump xive on CPU #\n\
+  dxi#  dump xive irq state #\n\
+  dxa   dump xive on all CPUs\n"
+#endif
+"  e	print exception information\n\
   f	flush cache\n\
   la	lookup symbol+offset of specified address\n\
   ls	lookup address of specified symbol\n\
@@ -2338,6 +2345,81 @@ static void dump_pacas(void)
 }
 #endif
 
+#ifdef CONFIG_PPC_POWERNV
+static void dump_one_xive(int cpu)
+{
+	unsigned int hwid = get_hard_smp_processor_id(cpu);
+
+	opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
+	opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
+	opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
+	opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
+	opal_xive_dump(XIVE_DUMP_VP, hwid);
+	opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
+
+	if (setjmp(bus_error_jmp) != 0) {
+		catch_memory_errors = 0;
+		printf("*** Error dumping xive on cpu %d\n", cpu);
+		return;
+	}
+
+	catch_memory_errors = 1;
+	sync();
+	xmon_xive_do_dump(cpu);
+	sync();
+	__delay(200);
+	catch_memory_errors = 0;
+}
+
+static void dump_all_xives(void)
+{
+	int cpu;
+
+	if (num_possible_cpus() == 0) {
+		printf("No possible cpus, use 'dx #' to dump individual cpus\n");
+		return;
+	}
+
+	for_each_possible_cpu(cpu)
+		dump_one_xive(cpu);
+}
+
+static void dump_one_xive_irq(u32 num)
+{
+	s64 rc;
+	__be64 vp;
+	u8 prio;
+	__be32 lirq;
+
+	rc = opal_xive_get_irq_config(num, &vp, &prio, &lirq);
+	xmon_printf("IRQ 0x%x config: vp=0x%llx prio=%d lirq=0x%x (rc=%lld)\n",
+		    num, be64_to_cpu(vp), prio, be32_to_cpu(lirq), rc);
+}
+
+static void dump_xives(void)
+{
+	unsigned long num;
+	int c;
+
+	c = inchar();
+	if (c == 'a') {
+		dump_all_xives();
+		return;
+	} else if (c == 'i') {
+		if (scanhex(&num))
+			dump_one_xive_irq(num);
+		return;
+	}
+
+	termch = c;	/* Put c back, it wasn't 'a' */
+
+	if (scanhex(&num))
+		dump_one_xive(num);
+	else
+		dump_one_xive(xmon_owner);
+}
+#endif /* CONFIG_PPC_POWERNV */
+
 static void dump_by_size(unsigned long addr, long count, int size)
 {
 	unsigned char temp[16];
@@ -2386,6 +2468,14 @@ dump(void)
 		return;
 	}
 #endif
+#ifdef CONFIG_PPC_POWERNV
+	if (c == 'x') {
+		xmon_start_pagination();
+		dump_xives();
+		xmon_end_pagination();
+		return;
+	}
+#endif
 
 	if (c == '\n')
 		termch = c;
-- 
cgit v1.2.1


From d3989143d03917ddcf7103d4ad5fc871e8d8954b Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 5 Apr 2017 17:54:51 +1000
Subject: powerpc/kvm: Massage order of #include

We traditionally have linux/ before asm/

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kvm/book3s.c    |  8 ++++----
 arch/powerpc/kvm/book3s_hv.c | 18 +++++++++---------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b6b5c185bd92..aedacefd961d 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -20,6 +20,10 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/miscdevice.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -31,10 +35,6 @@
 #include <asm/kvm_book3s.h>
 #include <asm/mmu_context.h>
 #include <asm/page.h>
-#include <linux/gfp.h>
-#include <linux/sched.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
 
 #include "book3s.h"
 #include "trace.h"
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1ec86d9e2a82..fadb75abfe37 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -35,6 +35,15 @@
 #include <linux/srcu.h>
 #include <linux/miscdevice.h>
 #include <linux/debugfs.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/of.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -58,15 +67,6 @@
 #include <asm/mmu.h>
 #include <asm/opal.h>
 #include <asm/xics.h>
-#include <linux/gfp.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <linux/hugetlb.h>
-#include <linux/kvm_irqfd.h>
-#include <linux/irqbypass.h>
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/of.h>
 
 #include "book3s.h"
 
-- 
cgit v1.2.1


From 936774cd3fc8152e36c18a129fa940c42c177d14 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 5 Apr 2017 17:54:52 +1000
Subject: powerpc/kvm: Make kvmppc_xics_create_icp static

It's only used within the same file it's defined

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kvm_ppc.h | 4 ----
 arch/powerpc/kvm/book3s_xics.c     | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index dd11c4c8c56a..bfef1ae66ffd 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -478,7 +478,6 @@ extern void kvmppc_free_host_rm_ops(void);
 extern void kvmppc_free_pimap(struct kvm *kvm);
 extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
 extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
-extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
 extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
 extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
 extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
@@ -507,9 +506,6 @@ static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 	{ return 0; }
 static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
-static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu,
-					 unsigned long server)
-	{ return -EINVAL; }
 static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm,
 					struct kvm_irq_level *args)
 	{ return -ENOTTY; }
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index e48803e2918d..ef4fd528c193 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -1084,7 +1084,7 @@ static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
 	return xics->ics[icsid];
 }
 
-int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
+static int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
 {
 	struct kvmppc_icp *icp;
 
-- 
cgit v1.2.1


From f50d6bd3442c3c1345b0da0885ac9d81fef2bb8e Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 5 Apr 2017 17:54:53 +1000
Subject: powerpc/kvm: Remove obsolete kvm_vm_ioctl_xics_irq declaration

The function doesn't exist anymore

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kvm_ppc.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index bfef1ae66ffd..0c418655ee1f 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -478,7 +478,6 @@ extern void kvmppc_free_host_rm_ops(void);
 extern void kvmppc_free_pimap(struct kvm *kvm);
 extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
 extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
-extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
 extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
 extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
@@ -506,9 +505,6 @@ static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 	{ return 0; }
 static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
-static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm,
-					struct kvm_irq_level *args)
-	{ return -ENOTTY; }
 static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
 	{ return 0; }
 #endif
-- 
cgit v1.2.1


From d381d7caf812f7aa9f05cfeb858c9004ac654412 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 5 Apr 2017 17:54:54 +1000
Subject: powerpc: Consolidate variants of real-mode MMIOs

We have all sort of variants of MMIO accessors for the real mode
instructions. This creates a clean set of accessors based on
Linux normal naming conventions, replacing all occurrences of
the old ones in the tree.

I have purposefully removed the "out/in" variants in favor of
only including __raw variants. Any code using these is already
pretty much hand tuned to operate in a very specific environment.
I've fixed up the 2 users (only one of them actually needed
a barrier in the first place).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/io.h             | 98 ++++++++++++++++---------------
 arch/powerpc/include/asm/kvm_book3s_asm.h |  2 +-
 arch/powerpc/include/asm/kvm_ppc.h        |  2 +-
 arch/powerpc/kvm/book3s_hv_builtin.c      | 21 +++----
 arch/powerpc/kvm/book3s_hv_rm_xics.c      |  4 +-
 arch/powerpc/platforms/powernv/rng.c      |  2 +-
 arch/powerpc/sysdev/xics/icp-native.c     |  8 +--
 7 files changed, 68 insertions(+), 69 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 5ed292431b5b..45c136a832ce 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -192,24 +192,8 @@ DEF_MMIO_OUT_D(out_le32, 32, stw);
 
 #endif /* __BIG_ENDIAN */
 
-/*
- * Cache inhibitied accessors for use in real mode, you don't want to use these
- * unless you know what you're doing.
- *
- * NB. These use the cpu byte ordering.
- */
-DEF_MMIO_OUT_X(out_rm8,   8, stbcix);
-DEF_MMIO_OUT_X(out_rm16, 16, sthcix);
-DEF_MMIO_OUT_X(out_rm32, 32, stwcix);
-DEF_MMIO_IN_X(in_rm8,   8, lbzcix);
-DEF_MMIO_IN_X(in_rm16, 16, lhzcix);
-DEF_MMIO_IN_X(in_rm32, 32, lwzcix);
-
 #ifdef __powerpc64__
 
-DEF_MMIO_OUT_X(out_rm64, 64, stdcix);
-DEF_MMIO_IN_X(in_rm64, 64, ldcix);
-
 #ifdef __BIG_ENDIAN__
 DEF_MMIO_OUT_D(out_be64, 64, std);
 DEF_MMIO_IN_D(in_be64, 64, ld);
@@ -242,35 +226,6 @@ static inline void out_be64(volatile u64 __iomem *addr, u64 val)
 #endif
 #endif /* __powerpc64__ */
 
-
-/*
- * Simple Cache inhibited accessors
- * Unlike the DEF_MMIO_* macros, these don't include any h/w memory
- * barriers, callers need to manage memory barriers on their own.
- * These can only be used in hypervisor real mode.
- */
-
-static inline u32 _lwzcix(unsigned long addr)
-{
-	u32 ret;
-
-	__asm__ __volatile__("lwzcix %0,0, %1"
-			     : "=r" (ret) : "r" (addr) : "memory");
-	return ret;
-}
-
-static inline void _stbcix(u64 addr, u8 val)
-{
-	__asm__ __volatile__("stbcix %0,0,%1"
-		: : "r" (val), "r" (addr) : "memory");
-}
-
-static inline void _stwcix(u64 addr, u32 val)
-{
-	__asm__ __volatile__("stwcix %0,0,%1"
-		: : "r" (val), "r" (addr) : "memory");
-}
-
 /*
  * Low level IO stream instructions are defined out of line for now
  */
@@ -417,15 +372,64 @@ static inline void __raw_writeq(unsigned long v, volatile void __iomem *addr)
 }
 
 /*
- * Real mode version of the above. stdcix is only supposed to be used
- * in hypervisor real mode as per the architecture spec.
+ * Real mode versions of the above. Those instructions are only supposed
+ * to be used in hypervisor real mode as per the architecture spec.
  */
+static inline void __raw_rm_writeb(u8 val, volatile void __iomem *paddr)
+{
+	__asm__ __volatile__("stbcix %0,0,%1"
+		: : "r" (val), "r" (paddr) : "memory");
+}
+
+static inline void __raw_rm_writew(u16 val, volatile void __iomem *paddr)
+{
+	__asm__ __volatile__("sthcix %0,0,%1"
+		: : "r" (val), "r" (paddr) : "memory");
+}
+
+static inline void __raw_rm_writel(u32 val, volatile void __iomem *paddr)
+{
+	__asm__ __volatile__("stwcix %0,0,%1"
+		: : "r" (val), "r" (paddr) : "memory");
+}
+
 static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr)
 {
 	__asm__ __volatile__("stdcix %0,0,%1"
 		: : "r" (val), "r" (paddr) : "memory");
 }
 
+static inline u8 __raw_rm_readb(volatile void __iomem *paddr)
+{
+	u8 ret;
+	__asm__ __volatile__("lbzcix %0,0, %1"
+			     : "=r" (ret) : "r" (paddr) : "memory");
+	return ret;
+}
+
+static inline u16 __raw_rm_readw(volatile void __iomem *paddr)
+{
+	u16 ret;
+	__asm__ __volatile__("lhzcix %0,0, %1"
+			     : "=r" (ret) : "r" (paddr) : "memory");
+	return ret;
+}
+
+static inline u32 __raw_rm_readl(volatile void __iomem *paddr)
+{
+	u32 ret;
+	__asm__ __volatile__("lwzcix %0,0, %1"
+			     : "=r" (ret) : "r" (paddr) : "memory");
+	return ret;
+}
+
+static inline u64 __raw_rm_readq(volatile void __iomem *paddr)
+{
+	u64 ret;
+	__asm__ __volatile__("ldcix %0,0, %1"
+			     : "=r" (ret) : "r" (paddr) : "memory");
+	return ret;
+}
 #endif /* __powerpc64__ */
 
 /*
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index d318d432caa9..0593d9479f74 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -110,7 +110,7 @@ struct kvmppc_host_state {
 	u8 ptid;
 	struct kvm_vcpu *kvm_vcpu;
 	struct kvmppc_vcore *kvm_vcore;
-	unsigned long xics_phys;
+	void __iomem *xics_phys;
 	u32 saved_xirr;
 	u64 dabr;
 	u64 host_mmcr[7];	/* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0c418655ee1f..c3877992eff9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -409,7 +409,7 @@ struct openpic;
 extern void kvm_cma_reserve(void) __init;
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
-	paca[cpu].kvm_hstate.xics_phys = addr;
+	paca[cpu].kvm_hstate.xics_phys = (void __iomem *)addr;
 }
 
 static inline u32 kvmppc_get_xics_latch(void)
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index ae55603cf661..a752e29977e0 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -194,12 +194,6 @@ long kvmppc_h_random(struct kvm_vcpu *vcpu)
 	return H_HARDWARE;
 }
 
-static inline void rm_writeb(unsigned long paddr, u8 val)
-{
-	__asm__ __volatile__("stbcix %0,0,%1"
-		: : "r" (val), "r" (paddr) : "memory");
-}
-
 /*
  * Send an interrupt or message to another CPU.
  * The caller needs to include any barrier needed to order writes
@@ -207,7 +201,7 @@ static inline void rm_writeb(unsigned long paddr, u8 val)
  */
 void kvmhv_rm_send_ipi(int cpu)
 {
-	unsigned long xics_phys;
+	void __iomem *xics_phys;
 	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
 
 	/* On POWER9 we can use msgsnd for any destination cpu. */
@@ -232,7 +226,7 @@ void kvmhv_rm_send_ipi(int cpu)
 	/* Else poke the target with an IPI */
 	xics_phys = paca[cpu].kvm_hstate.xics_phys;
 	if (xics_phys)
-		rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+		__raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR);
 	else
 		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
 }
@@ -405,7 +399,7 @@ long kvmppc_read_intr(void)
 
 static long kvmppc_read_one_intr(bool *again)
 {
-	unsigned long xics_phys;
+	void __iomem *xics_phys;
 	u32 h_xirr;
 	__be32 xirr;
 	u32 xisr;
@@ -423,7 +417,7 @@ static long kvmppc_read_one_intr(bool *again)
 	if (!xics_phys)
 		rc = opal_int_get_xirr(&xirr, false);
 	else
-		xirr = _lwzcix(xics_phys + XICS_XIRR);
+		xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
 	if (rc < 0)
 		return 1;
 
@@ -453,8 +447,8 @@ static long kvmppc_read_one_intr(bool *again)
 	if (xisr == XICS_IPI) {
 		rc = 0;
 		if (xics_phys) {
-			_stbcix(xics_phys + XICS_MFRR, 0xff);
-			_stwcix(xics_phys + XICS_XIRR, xirr);
+			__raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
+			__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
 		} else {
 			opal_int_set_mfrr(hard_smp_processor_id(), 0xff);
 			rc = opal_int_eoi(h_xirr);
@@ -479,7 +473,8 @@ static long kvmppc_read_one_intr(bool *again)
 			 * we need to resend that IPI, bummer
 			 */
 			if (xics_phys)
-				_stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
+				__raw_rm_writeb(IPI_PRIORITY,
+						xics_phys + XICS_MFRR);
 			else
 				opal_int_set_mfrr(hard_smp_processor_id(),
 						  IPI_PRIORITY);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index e78542d99cd6..3a1a463a039a 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -766,7 +766,7 @@ unsigned long eoi_rc;
 
 static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
 {
-	unsigned long xics_phys;
+	void __iomem *xics_phys;
 	int64_t rc;
 
 	rc = pnv_opal_pci_msi_eoi(c, hwirq);
@@ -779,7 +779,7 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
 	/* EOI it */
 	xics_phys = local_paca->kvm_hstate.xics_phys;
 	if (xics_phys) {
-		_stwcix(xics_phys + XICS_XIRR, xirr);
+		__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
 	} else {
 		rc = opal_int_eoi(be32_to_cpu(xirr));
 		*again = rc > 0;
diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
index 5dcbdea1afac..1a9d84371a4d 100644
--- a/arch/powerpc/platforms/powernv/rng.c
+++ b/arch/powerpc/platforms/powernv/rng.c
@@ -62,7 +62,7 @@ int powernv_get_random_real_mode(unsigned long *v)
 
 	rng = raw_cpu_read(powernv_rng);
 
-	*v = rng_whiten(rng, in_rm64(rng->regs_real));
+	*v = rng_whiten(rng, __raw_rm_readq(rng->regs_real));
 
 	return 1;
 }
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 8a6a043e239b..f0f3f47a3fc9 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -168,15 +168,15 @@ void icp_native_cause_ipi_rm(int cpu)
 	 * Need the physical address of the XICS to be
 	 * previously saved in kvm_hstate in the paca.
 	 */
-	unsigned long xics_phys;
+	void __iomem *xics_phys;
 
 	/*
 	 * Just like the cause_ipi functions, it is required to
-	 * include a full barrier (out8 includes a sync) before
-	 * causing the IPI.
+	 * include a full barrier before causing the IPI.
 	 */
 	xics_phys = paca[cpu].kvm_hstate.xics_phys;
-	out_rm8((u8 *)(xics_phys + XICS_MFRR), IPI_PRIORITY);
+	mb();
+	__raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR);
 }
 #endif
 
-- 
cgit v1.2.1


From 08a1e650cc631ccc8cfe670beb38b2f9c58402cd Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 5 Apr 2017 17:54:55 +1000
Subject: powerpc: Fixup LPCR:PECE and HEIC setting on POWER9

We need to set LPES in order for normal external interrupts (0x500)
to be directed to the guest while running in guest state.

We also need HEIC set to prevent them to be sent to the host while
in host state.

With XIVE the host never gets one of these and wouldn't know how to
handle it. All host external interrupts come in via the new
hypervisor virtualization interrupts vector.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/reg.h        |  1 +
 arch/powerpc/kernel/cpu_setup_power.S | 15 ++++++++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index fc879fd6bdae..d0b332b8afad 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -365,6 +365,7 @@
 #define   LPCR_MER_SH		11
 #define	  LPCR_GTSE		ASM_CONST(0x0000000000000400)  	/* Guest Translation Shootdown Enable */
 #define   LPCR_TC		ASM_CONST(0x0000000000000200)	/* Translation control */
+#define   LPCR_HEIC		ASM_CONST(0x0000000000000010)   /* Hypervisor External Interrupt Control */
 #define   LPCR_LPES		0x0000000c
 #define   LPCR_LPES0		ASM_CONST(0x0000000000000008)      /* LPAR Env selector 0 */
 #define   LPCR_LPES1		ASM_CONST(0x0000000000000004)      /* LPAR Env selector 1 */
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 7fe8c79e6937..7013ae3d1675 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -29,6 +29,7 @@ _GLOBAL(__setup_cpu_power7)
 	li	r0,0
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
+	li	r4,(LPCR_LPES1 >> LPCR_LPES_SH)
 	bl	__init_LPCR
 	bl	__init_tlb_power7
 	mtlr	r11
@@ -42,6 +43,7 @@ _GLOBAL(__restore_cpu_power7)
 	li	r0,0
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
+	li	r4,(LPCR_LPES1 >> LPCR_LPES_SH)
 	bl	__init_LPCR
 	bl	__init_tlb_power7
 	mtlr	r11
@@ -59,6 +61,7 @@ _GLOBAL(__setup_cpu_power8)
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
 	ori	r3, r3, LPCR_PECEDH
+	li	r4,0 /* LPES = 0 */
 	bl	__init_LPCR
 	bl	__init_HFSCR
 	bl	__init_tlb_power8
@@ -80,6 +83,7 @@ _GLOBAL(__restore_cpu_power8)
 	mtspr	SPRN_LPID,r0
 	mfspr   r3,SPRN_LPCR
 	ori	r3, r3, LPCR_PECEDH
+	li	r4,0 /* LPES = 0 */
 	bl	__init_LPCR
 	bl	__init_HFSCR
 	bl	__init_tlb_power8
@@ -99,10 +103,11 @@ _GLOBAL(__setup_cpu_power9)
 	mtspr	SPRN_PSSCR,r0
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
-	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE)
+	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE  | LPCR_HEIC)
 	or	r3, r3, r4
 	LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
 	andc	r3, r3, r4
+	li	r4,(LPCR_LPES0 >> LPCR_LPES_SH)
 	bl	__init_LPCR
 	bl	__init_HFSCR
 	bl	__init_tlb_power9
@@ -122,10 +127,11 @@ _GLOBAL(__restore_cpu_power9)
 	mtspr	SPRN_PSSCR,r0
 	mtspr	SPRN_LPID,r0
 	mfspr   r3,SPRN_LPCR
-	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE)
+	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC)
 	or	r3, r3, r4
 	LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
 	andc	r3, r3, r4
+	li	r4,(LPCR_LPES0 >> LPCR_LPES_SH)
 	bl	__init_LPCR
 	bl	__init_HFSCR
 	bl	__init_tlb_power9
@@ -146,7 +152,7 @@ __init_hvmode_206:
 
 __init_LPCR:
 	/* Setup a sane LPCR:
-	 *   Called with initial LPCR in R3
+	 *   Called with initial LPCR in R3 and desired LPES 2-bit value in R4
 	 *
 	 *   LPES = 0b01 (HSRR0/1 used for 0x500)
 	 *   PECE = 0b111
@@ -157,8 +163,7 @@ __init_LPCR:
 	 *
 	 * Other bits untouched for now
 	 */
-	li	r5,1
-	rldimi	r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
+	rldimi	r3,r4, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
 	ori	r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2)
 	li	r5,4
 	rldimi	r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3
-- 
cgit v1.2.1


From f6b0df55cad252fedd60aa2ba75a0207d0006283 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 1 Apr 2017 20:11:47 +0530
Subject: powerpc/mm/radix: Don't do page walk cache flush when doing full mm
 flush

For fullmm tlb flush, we do a flush with RIC_FLUSH_ALL which will invalidate all
related caches (radix__tlb_flush()). Hence the pwc flush is not needed.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/tlb-radix.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 83dc1ccc2fa1..f3e58bd60d1a 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -129,6 +129,12 @@ void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
 {
 	unsigned long pid;
 	struct mm_struct *mm = tlb->mm;
+	/*
+	 * If we are doing a full mm flush, we will do a tlb flush
+	 * with RIC_FLUSH_ALL later.
+	 */
+	if (tlb->fullmm)
+		return;
 
 	preempt_disable();
 
@@ -195,6 +201,12 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
 	unsigned long pid;
 	struct mm_struct *mm = tlb->mm;
 
+	/*
+	 * If we are doing a full mm flush, we will do a tlb flush
+	 * with RIC_FLUSH_ALL later.
+	 */
+	if (tlb->fullmm)
+		return;
 	preempt_disable();
 
 	pid = mm->context.id;
-- 
cgit v1.2.1


From f7327e0ba3805470cced2acfa053e795362ee41d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 1 Apr 2017 20:11:48 +0530
Subject: powerpc/mm/radix: Remove unnecessary ptesync

For a tlbiel with pid, we need to issue tlbiel with set number encoded. We
don't need to do ptesync for each of those. Instead we need one for the entire
tlbiel pid operation.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/tlb-radix.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index f3e58bd60d1a..b68b5219cf45 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -34,10 +34,8 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
 	prs = 1; /* process scoped */
 	r = 1;   /* raidx format */
 
-	asm volatile("ptesync": : :"memory");
 	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
 		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-	asm volatile("ptesync": : :"memory");
 }
 
 /*
@@ -47,9 +45,11 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
 {
 	int set;
 
+	asm volatile("ptesync": : :"memory");
 	for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
 		__tlbiel_pid(pid, set, ric);
 	}
+	asm volatile("ptesync": : :"memory");
 	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
 }
 
-- 
cgit v1.2.1


From abfe8026b505d66dad7a3e5ebe7235f12c29b67d Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Mon, 10 Apr 2017 15:24:35 +1000
Subject: powerpc/powernv: Require MMU_NOTIFIER to fix NPU build

In the recent commit 1ab66d1fbada ("powerpc/powernv: Introduce address
translation services for Nvlink2") the NPU code gained a dependency on MMU
notifiers.

All our defconfigs have KVM enabled, which selects MMU_NOTIFIER, but if KVM is
not enabled then the build breaks.

Fix it by always selecting MMU_NOTIFIER when we're building powernv.

Fixes: 1ab66d1fbada ("powerpc/powernv: Introduce address translation services for Nvlink2")
Signed-off-by: Alistair Popple <alistair@popple.id.au>
[mpe: Reword change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 3a07e4dcf97c..76b8eca5767a 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -19,6 +19,7 @@ config PPC_POWERNV
 	select CPU_FREQ_GOV_ONDEMAND
 	select CPU_FREQ_GOV_CONSERVATIVE
 	select PPC_DOORBELL
+	select MMU_NOTIFIER
 	default y
 
 config OPAL_PRD
-- 
cgit v1.2.1


From 7644d5819cf8956d799a0a0e5dc75f5a29889bd5 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Fri, 10 Feb 2017 12:04:56 +1100
Subject: powerpc: Create asm/debugfs.h and move powerpc_debugfs_root there

powerpc_debugfs_root is the dentry representing the root of the
"powerpc" directory tree in debugfs.

Currently it sits in asm/debug.h, a long with some other things that
have "debug" in the name, but are otherwise unrelated.

Pull it out into a separate header, which also includes linux/debugfs.h,
and convert all the users to include debugfs.h instead of debug.h.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/debug.h          |  2 --
 arch/powerpc/include/asm/debugfs.h        | 17 +++++++++++++++++
 arch/powerpc/kernel/eeh.c                 |  3 +--
 arch/powerpc/kernel/fadump.c              |  2 +-
 arch/powerpc/kernel/setup-common.c        |  2 +-
 arch/powerpc/kernel/traps.c               |  2 +-
 arch/powerpc/kvm/book3s_xics.c            |  3 +--
 arch/powerpc/mm/hash_utils_64.c           |  3 +--
 arch/powerpc/platforms/cell/axon_msi.c    |  2 +-
 arch/powerpc/platforms/powernv/opal-lpc.c |  3 +--
 arch/powerpc/platforms/powernv/pci-ioda.c |  3 +--
 arch/powerpc/platforms/pseries/dtl.c      |  3 +--
 arch/powerpc/sysdev/scom.c                |  3 +--
 arch/powerpc/xmon/xmon.c                  |  5 +----
 14 files changed, 29 insertions(+), 24 deletions(-)
 create mode 100644 arch/powerpc/include/asm/debugfs.h

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index 86308f177f2d..5d5af3fddfd8 100644
--- a/arch/powerpc/include/asm/debug.h
+++ b/arch/powerpc/include/asm/debug.h
@@ -8,8 +8,6 @@
 
 struct pt_regs;
 
-extern struct dentry *powerpc_debugfs_root;
-
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)
 
 extern int (*__debugger)(struct pt_regs *regs);
diff --git a/arch/powerpc/include/asm/debugfs.h b/arch/powerpc/include/asm/debugfs.h
new file mode 100644
index 000000000000..4f3b39f3e3d2
--- /dev/null
+++ b/arch/powerpc/include/asm/debugfs.h
@@ -0,0 +1,17 @@
+#ifndef _ASM_POWERPC_DEBUGFS_H
+#define _ASM_POWERPC_DEBUGFS_H
+
+/*
+ * Copyright 2017, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/debugfs.h>
+
+extern struct dentry *powerpc_debugfs_root;
+
+#endif /* _ASM_POWERPC_DEBUGFS_H */
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9de7f79e702b..63992b2d8e15 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -22,7 +22,6 @@
  */
 
 #include <linux/delay.h>
-#include <linux/debugfs.h>
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/list.h>
@@ -37,7 +36,7 @@
 #include <linux/of.h>
 
 #include <linux/atomic.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/eeh.h>
 #include <asm/eeh_event.h>
 #include <asm/io.h>
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 33b2da302730..c7acc6651ce7 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -30,12 +30,12 @@
 #include <linux/string.h>
 #include <linux/memblock.h>
 #include <linux/delay.h>
-#include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/crash_dump.h>
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
 
+#include <asm/debugfs.h>
 #include <asm/page.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index fcdca741f660..5c10b5925ac2 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -31,11 +31,11 @@
 #include <linux/unistd.h>
 #include <linux/serial.h>
 #include <linux/serial_8250.h>
-#include <linux/debugfs.h>
 #include <linux/percpu.h>
 #include <linux/memblock.h>
 #include <linux/of_platform.h>
 #include <linux/hugetlb.h>
+#include <asm/debugfs.h>
 #include <asm/io.h>
 #include <asm/paca.h>
 #include <asm/prom.h>
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index ff365f9de27a..354946236c61 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -35,13 +35,13 @@
 #include <linux/backlight.h>
 #include <linux/bug.h>
 #include <linux/kdebug.h>
-#include <linux/debugfs.h>
 #include <linux/ratelimit.h>
 #include <linux/context_tracking.h>
 
 #include <asm/emulated_ops.h>
 #include <asm/pgtable.h>
 #include <linux/uaccess.h>
+#include <asm/debugfs.h>
 #include <asm/io.h>
 #include <asm/machdep.h>
 #include <asm/rtas.h>
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index e48803e2918d..dc4352395887 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -19,10 +19,9 @@
 #include <asm/kvm_ppc.h>
 #include <asm/hvcall.h>
 #include <asm/xics.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/time.h>
 
-#include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
 #include "book3s_xics.h"
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 69a05b3e4d3f..f2095ce9d4b0 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -35,9 +35,8 @@
 #include <linux/memblock.h>
 #include <linux/context_tracking.h>
 #include <linux/libfdt.h>
-#include <linux/debugfs.h>
 
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/processor.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c
index 8b55c5f19d4c..8d3ae2cc52bf 100644
--- a/arch/powerpc/platforms/cell/axon_msi.c
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -15,9 +15,9 @@
 #include <linux/msi.h>
 #include <linux/export.h>
 #include <linux/of_platform.h>
-#include <linux/debugfs.h>
 #include <linux/slab.h>
 
+#include <asm/debugfs.h>
 #include <asm/dcr.h>
 #include <asm/machdep.h>
 #include <asm/prom.h>
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
index a91d7876fae2..6c7ad1d8b32e 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/of.h>
 #include <linux/bug.h>
-#include <linux/debugfs.h>
 #include <linux/io.h>
 #include <linux/slab.h>
 
@@ -21,7 +20,7 @@
 #include <asm/opal.h>
 #include <asm/prom.h>
 #include <linux/uaccess.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/isa-bridge.h>
 
 static int opal_lpc_chip_id = -1;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9c050fe5a6e8..7eebc76721ea 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -14,7 +14,6 @@
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/crash_dump.h>
-#include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/string.h>
 #include <linux/init.h>
@@ -38,7 +37,7 @@
 #include <asm/iommu.h>
 #include <asm/tce.h>
 #include <asm/xics.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/firmware.h>
 #include <asm/pnv-pci.h>
 #include <asm/mmzone.h>
diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c
index 6b04e3f0f982..18014cdeb590 100644
--- a/arch/powerpc/platforms/pseries/dtl.c
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -21,13 +21,12 @@
  */
 
 #include <linux/slab.h>
-#include <linux/debugfs.h>
 #include <linux/spinlock.h>
 #include <asm/smp.h>
 #include <linux/uaccess.h>
 #include <asm/firmware.h>
 #include <asm/lppaca.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/plpar_wrappers.h>
 #include <asm/machdep.h>
 
diff --git a/arch/powerpc/sysdev/scom.c b/arch/powerpc/sysdev/scom.c
index d0e9f178a324..76ea32c1b664 100644
--- a/arch/powerpc/sysdev/scom.c
+++ b/arch/powerpc/sysdev/scom.c
@@ -19,10 +19,9 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/debugfs.h>
 #include <linux/slab.h>
 #include <linux/export.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/prom.h>
 #include <asm/scom.h>
 #include <linux/uaccess.h>
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 0fab9d7349eb..fddc857af772 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -29,10 +29,7 @@
 #include <linux/nmi.h>
 #include <linux/ctype.h>
 
-#ifdef CONFIG_DEBUG_FS
-#include <linux/debugfs.h>
-#endif
-
+#include <asm/debugfs.h>
 #include <asm/ptrace.h>
 #include <asm/string.h>
 #include <asm/prom.h>
-- 
cgit v1.2.1


From 3ae05fb3ccb365894662d6073c851cb466dd2220 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Fri, 10 Feb 2017 12:12:44 +1100
Subject: powerpc: Remove unnecessary includes of asm/debug.h

These files don't seem to have any need for asm/debug.h, now that all it
includes are the debugger hooks and breakpoint definitions.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/fadump.c         | 1 -
 arch/powerpc/kernel/irq.c            | 1 -
 arch/powerpc/kernel/prom.c           | 1 -
 arch/powerpc/kvm/book3s_hv_rm_xics.c | 1 -
 4 files changed, 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index c7acc6651ce7..243dbef7e926 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -40,7 +40,6 @@
 #include <asm/prom.h>
 #include <asm/rtas.h>
 #include <asm/fadump.h>
-#include <asm/debug.h>
 #include <asm/setup.h>
 
 static struct fw_dump fw_dump;
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index a018f5cae899..097f2f9ff85d 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -65,7 +65,6 @@
 #include <asm/machdep.h>
 #include <asm/udbg.h>
 #include <asm/smp.h>
-#include <asm/debug.h>
 #include <asm/livepatch.h>
 #include <asm/asm-prototypes.h>
 
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index f5d399e46193..d2f0afeae5a0 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -55,7 +55,6 @@
 #include <asm/kexec.h>
 #include <asm/opal.h>
 #include <asm/fadump.h>
-#include <asm/debug.h>
 #include <asm/epapr_hcalls.h>
 #include <asm/firmware.h>
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index e78542d99cd6..d9e312f253fa 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -16,7 +16,6 @@
 #include <asm/kvm_ppc.h>
 #include <asm/hvcall.h>
 #include <asm/xics.h>
-#include <asm/debug.h>
 #include <asm/synch.h>
 #include <asm/cputhreads.h>
 #include <asm/pgtable.h>
-- 
cgit v1.2.1


From 4868e3508d1934d28961f940ed6b9f1e347ab52c Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 3 Apr 2017 12:05:55 +1000
Subject: powerpc/nohash: Fix use of mmu_has_feature() in
 setup_initial_memory_limit()

setup_initial_memory_limit() is called from early_init_devtree(), which
runs prior to feature patching. If the kernel is built with CONFIG_JUMP_LABEL=y
and CONFIG_JUMP_LABEL_FEATURE_CHECKS=y then we will potentially get the
wrong value.

If we also have CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG=y we get a warning
and backtrace:

  Warning! mmu_has_feature() used prior to jump label init!
  CPU: 0 PID: 0 Comm: swapper Not tainted 4.11.0-rc4-gccN-next-20170331-g6af2434 #1
  Call Trace:
  [c000000000fc3d50] [c000000000a26c30] .dump_stack+0xa8/0xe8 (unreliable)
  [c000000000fc3de0] [c00000000002e6b8] .setup_initial_memory_limit+0xa4/0x104
  [c000000000fc3e60] [c000000000d5c23c] .early_init_devtree+0xd0/0x2f8
  [c000000000fc3f00] [c000000000d5d3b0] .early_setup+0x90/0x11c
  [c000000000fc3f90] [c000000000000520] start_here_multiplatform+0x68/0x80

Fix it by using early_mmu_has_feature().

Fixes: c12e6f24d413 ("powerpc: Add option to use jump label for mmu_has_feature()")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/tlb_nohash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index ba28fcb98597..bfc4a0869609 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -770,7 +770,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 	 * avoid going over total available memory just in case...
 	 */
 #ifdef CONFIG_PPC_FSL_BOOK3E
-	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
+	if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
 		unsigned long linear_sz;
 		unsigned int num_cams;
 
-- 
cgit v1.2.1


From 7b3912f4223541c5108565d4bad28928e3b628f9 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 5 Apr 2017 16:10:48 +1000
Subject: powerpc: Make sparsemem the default on 64-bit Book3S

Make sparsemem the default on all 64-bit Book3S platforms. It already is
for pseries and ps3, and we need to enable it for powernv because on
POWER9 memory between chips is discontiguous.

For the other platforms sparsemem should work fine, though it might add
a small amount of overhead. We can always force FLATMEM in the
defconfigs if necessary.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 45724923a69c..9ff731f50a29 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -586,7 +586,7 @@ config ARCH_SPARSEMEM_ENABLE
 
 config ARCH_SPARSEMEM_DEFAULT
 	def_bool y
-	depends on (SMP && PPC_PSERIES) || PPC_PS3
+	depends on PPC_BOOK3S_64
 
 config SYS_SUPPORTS_HUGETLBFS
 	bool
-- 
cgit v1.2.1


From ea6145557400b38faa6b3cee946ebceed8999872 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Fri, 7 Apr 2017 12:23:11 +0530
Subject: powerpc/mm: Remove reduntant initmem information from log

Generic core VM already prints these information in the log
buffer, hence there is no need for a second print. This just
removes the second print from arch powerpc NUMA init path.

Before the patch:

  $ dmesg | grep "Initmem"

  numa: Initmem setup node 0 [mem 0x00000000-0xffffffff]
  numa: Initmem setup node 1 [mem 0x100000000-0x1ffffffff]
  numa: Initmem setup node 2 [mem 0x200000000-0x2ffffffff]
  numa: Initmem setup node 3 [mem 0x300000000-0x3ffffffff]
  numa: Initmem setup node 4 [mem 0x400000000-0x4ffffffff]
  numa: Initmem setup node 5 [mem 0x500000000-0x5ffffffff]
  numa: Initmem setup node 6 [mem 0x600000000-0x6ffffffff]
  numa: Initmem setup node 7 [mem 0x700000000-0x7ffffffff]
  Initmem setup node 0 [mem 0x0000000000000000-0x00000000ffffffff]
  Initmem setup node 1 [mem 0x0000000100000000-0x00000001ffffffff]
  Initmem setup node 2 [mem 0x0000000200000000-0x00000002ffffffff]
  Initmem setup node 3 [mem 0x0000000300000000-0x00000003ffffffff]
  Initmem setup node 4 [mem 0x0000000400000000-0x00000004ffffffff]
  Initmem setup node 5 [mem 0x0000000500000000-0x00000005ffffffff]
  Initmem setup node 6 [mem 0x0000000600000000-0x00000006ffffffff]
  Initmem setup node 7 [mem 0x0000000700000000-0x00000007ffffffff]

After the patch just the latter set is printed.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/numa.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 9befaee237d6..371792e4418f 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -875,13 +875,6 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
 	void *nd;
 	int tnid;
 
-	if (spanned_pages)
-		pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
-			nid, start_pfn << PAGE_SHIFT,
-			(end_pfn << PAGE_SHIFT) - 1);
-	else
-		pr_info("Initmem setup node %d\n", nid);
-
 	nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
 	nd = __va(nd_pa);
 
-- 
cgit v1.2.1


From 2c9faa7675fec57f6ac0372688fae2e10fc4c272 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Fri, 7 Apr 2017 09:25:39 +0530
Subject: powerpc/hugetlb: Add ABI defines for supported HugeTLB page sizes

Add user space exported API definitions for 512KB, 1MB, 2MB, 8MB, 16MB,
1GB, 16GB non default huge page sizes to be used with mmap() system
call.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
[mpe: Reword the comment to emphasise that these are only needed to use
 the non-default huge page size, and updated the change log.]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/uapi/asm/mman.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h
index 03c06ba7464f..ab45cc2f3101 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -29,4 +29,20 @@
 #define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
 #define MAP_HUGETLB	0x40000		/* create a huge page mapping */
 
+/*
+ * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2),
+ * encode the log2 of the huge page size. A value of zero indicates that the
+ * default huge page size should be used. To use a non-default huge page size,
+ * one of these defines can be used, or the size can be encoded by hand. Note
+ * that on most systems only a subset, or possibly none, of these sizes will be
+ * available.
+ */
+#define MAP_HUGE_512KB	(19 << MAP_HUGE_SHIFT)	/* 512KB HugeTLB Page */
+#define MAP_HUGE_1MB	(20 << MAP_HUGE_SHIFT)	/* 1MB   HugeTLB Page */
+#define MAP_HUGE_2MB	(21 << MAP_HUGE_SHIFT)	/* 2MB   HugeTLB Page */
+#define MAP_HUGE_8MB	(23 << MAP_HUGE_SHIFT)	/* 8MB   HugeTLB Page */
+#define MAP_HUGE_16MB	(24 << MAP_HUGE_SHIFT)	/* 16MB  HugeTLB Page */
+#define MAP_HUGE_1GB	(30 << MAP_HUGE_SHIFT)	/* 1GB   HugeTLB Page */
+#define MAP_HUGE_16GB	(34 << MAP_HUGE_SHIFT)	/* 16GB  HugeTLB Page */
+
 #endif /* _UAPI_ASM_POWERPC_MMAN_H */
-- 
cgit v1.2.1


From a7cd88da97040513e17cd77ae3e57764e854bae4 Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 20:34:14 +0530
Subject: powerpc/powernv: Move CPU-Offline idle state invocation from smp.c to
 idle.c

Move the piece of code in powernv/smp.c::pnv_smp_cpu_kill_self() which
transitions the CPU to the deepest available platform idle state to a
new function named pnv_cpu_offline() in powernv/idle.c. The rationale
behind this code movement is that the data required to determine the
deepest available platform state resides in powernv/idle.c.

Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cpuidle.h       |  1 +
 arch/powerpc/platforms/powernv/idle.c    | 25 +++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/powernv.h |  2 --
 arch/powerpc/platforms/powernv/smp.c     | 18 ++----------------
 4 files changed, 28 insertions(+), 18 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h
index 155731557c9b..4649ca0d28e3 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -46,6 +46,7 @@ extern u32 pnv_fastsleep_workaround_at_exit[];
 
 extern u64 pnv_first_deep_stop_state;
 
+unsigned long pnv_cpu_offline(unsigned int cpu);
 int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags);
 static inline void report_invalid_psscr_val(u64 psscr_val, int err)
 {
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 4ee837e6391a..419edffec516 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -265,6 +265,31 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
 u64 pnv_deepest_stop_psscr_val;
 u64 pnv_deepest_stop_psscr_mask;
 
+/*
+ * pnv_cpu_offline: A function that puts the CPU into the deepest
+ * available platform idle state on a CPU-Offline.
+ */
+unsigned long pnv_cpu_offline(unsigned int cpu)
+{
+	unsigned long srr1;
+
+	u32 idle_states = pnv_get_supported_cpuidle_states();
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val,
+					pnv_deepest_stop_psscr_mask);
+	} else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
+		srr1 = power7_winkle();
+	} else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
+		   (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
+		srr1 = power7_sleep();
+	} else {
+		srr1 = power7_nap(1);
+	}
+
+	return srr1;
+}
+
 /*
  * Power ISA 3.0 idle initialization.
  *
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index 613052232475..6dbc0a1da1f6 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -18,8 +18,6 @@ static inline void pnv_pci_shutdown(void) { }
 #endif
 
 extern u32 pnv_get_supported_cpuidle_states(void);
-extern u64 pnv_deepest_stop_psscr_val;
-extern u64 pnv_deepest_stop_psscr_mask;
 
 extern void pnv_lpc_init(void);
 
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 8b67e1eefb5c..914b456ef551 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -35,6 +35,7 @@
 #include <asm/dbell.h>
 #include <asm/kvm_ppc.h>
 #include <asm/ppc-opcode.h>
+#include <asm/cpuidle.h>
 
 #include "powernv.h"
 
@@ -140,7 +141,6 @@ static void pnv_smp_cpu_kill_self(void)
 {
 	unsigned int cpu;
 	unsigned long srr1, wmask;
-	u32 idle_states;
 
 	/* Standard hot unplug procedure */
 	local_irq_disable();
@@ -155,8 +155,6 @@ static void pnv_smp_cpu_kill_self(void)
 	if (cpu_has_feature(CPU_FTR_ARCH_207S))
 		wmask = SRR1_WAKEMASK_P8;
 
-	idle_states = pnv_get_supported_cpuidle_states();
-
 	/* We don't want to take decrementer interrupts while we are offline,
 	 * so clear LPCR:PECE1. We keep PECE2 (and LPCR_PECE_HVEE on P9)
 	 * enabled as to let IPIs in.
@@ -184,19 +182,7 @@ static void pnv_smp_cpu_kill_self(void)
 		kvmppc_set_host_ipi(cpu, 0);
 
 		ppc64_runlatch_off();
-
-		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
-			srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val,
-						pnv_deepest_stop_psscr_mask);
-		} else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
-			srr1 = power7_winkle();
-		} else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
-			   (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
-			srr1 = power7_sleep();
-		} else {
-			srr1 = power7_nap(1);
-		}
-
+		srr1 = pnv_cpu_offline(cpu);
 		ppc64_runlatch_on();
 
 		/*
-- 
cgit v1.2.1


From 9006123157886acc47bdc17e26c933d710b8d6cd Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 20:34:15 +0530
Subject: powerpc/powernv/smp: Add busy-wait loop as fall back for CPU-Hotplug

Currently, the powernv cpu-offline function assumes that platform idle
states such as stop on POWER9, winkle/sleep/nap on POWER8 are always
available. On POWER8, it picks nap as the default state if other deep
idle states like sleep/winkle are not available and enabled in the
platform.

On POWER9, nap is not available and all idle states are managed by
STOP instruction.  The parameters to the idle state are passed through
processor stop status control register (PSSCR).  Hence as such
executing STOP would take parameters from current PSSCR. We do not
want to make any assumptions in kernel on what STOP states and PSSCR
features are configured by the platform.

Ideally platform will configure a good set of stop states that can be
used in the kernel.  We would like to start with a clean slate, if the
platform choose to not configure any state or there is an error in
platform firmware that lead to no stop states being configured or
allowed to be requested.

This patch adds a fallback method for CPU-Hotplug that is similar to
snooze loop at idle where the threads are left to spin at low priority
and hence reduce the cycles consumed.

This is a safe fallback mechanism in the case when no stop state would
be requested if the platform firmware did not configure them most
likely due to an error condition.

Requesting a stop state when the platform has not configured them or
enabled them would lead to further error conditions which could be
difficult to debug.

[Changelog written with inputs from svaidy@linux.vnet.ibm.com]
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/idle.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 419edffec516..f335e0f13192 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -283,8 +283,16 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
 	} else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
 		   (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
 		srr1 = power7_sleep();
-	} else {
+	} else if (idle_states & OPAL_PM_NAP_ENABLED) {
 		srr1 = power7_nap(1);
+	} else {
+		/* This is the fallback method. We emulate snooze */
+		while (!generic_check_cpu_restart(cpu)) {
+			HMT_low();
+			HMT_very_low();
+		}
+		srr1 = 0;
+		HMT_medium();
 	}
 
 	return srr1;
-- 
cgit v1.2.1


From f3b3f28493d93232a37d5fbb5cb5ad168ede0e1a Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 20:34:16 +0530
Subject: powerpc/powernv/idle: Don't override default/deepest directly in
 kernel

Currently during idle-init on power9, if we don't find suitable stop
states in the device tree that can be used as the
default_stop/deepest_stop, we set stop0 (ESL=1,EC=1) as the default
stop state psscr to be used by power9_idle and deepest stop state
which is used by CPU-Hotplug.

However, if the platform firmware has not configured or enabled a stop
state, the kernel should not make any assumptions and fallback to a
default choice.

If the kernel uses a stop state that is not configured by the platform
firmware, it may lead to further failures which should be avoided.

In this patch, we modify the init code to ensure that the kernel uses
only the stop states exposed by the firmware through the device
tree. When a suitable default stop state isn't found, we disable
ppc_md.power_save for power9. Similarly, when a suitable
deepest_stop_state is not found in the device tree exported by the
firmware, fall back to the default busy-wait loop in the CPU-Hotplug
code.

[Changelog written with inputs from svaidy@linux.vnet.ibm.com]
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/idle.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index f335e0f13192..63ade787aec4 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -147,7 +147,6 @@ u32 pnv_get_supported_cpuidle_states(void)
 }
 EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
 
-
 static void pnv_fastsleep_workaround_apply(void *info)
 
 {
@@ -241,8 +240,9 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
  * The default stop state that will be used by ppc_md.power_save
  * function on platforms that support stop instruction.
  */
-u64 pnv_default_stop_val;
-u64 pnv_default_stop_mask;
+static u64 pnv_default_stop_val;
+static u64 pnv_default_stop_mask;
+static bool default_stop_found;
 
 /*
  * Used for ppc_md.power_save which needs a function with no parameters
@@ -262,8 +262,9 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
  * psscr value and mask of the deepest stop idle state.
  * Used when a cpu is offlined.
  */
-u64 pnv_deepest_stop_psscr_val;
-u64 pnv_deepest_stop_psscr_mask;
+static u64 pnv_deepest_stop_psscr_val;
+static u64 pnv_deepest_stop_psscr_mask;
+static bool deepest_stop_found;
 
 /*
  * pnv_cpu_offline: A function that puts the CPU into the deepest
@@ -275,7 +276,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
 
 	u32 idle_states = pnv_get_supported_cpuidle_states();
 
-	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+	if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) {
 		srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val,
 					pnv_deepest_stop_psscr_mask);
 	} else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
@@ -385,7 +386,6 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
 	u32 *residency_ns = NULL;
 	u64 max_residency_ns = 0;
 	int rc = 0, i;
-	bool default_stop_found = false, deepest_stop_found = false;
 
 	psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL);
 	psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL);
@@ -465,21 +465,24 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
 		}
 	}
 
-	if (!default_stop_found) {
-		pnv_default_stop_val = PSSCR_HV_DEFAULT_VAL;
-		pnv_default_stop_mask = PSSCR_HV_DEFAULT_MASK;
-		pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n",
+	if (unlikely(!default_stop_found)) {
+		pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n");
+	} else {
+		ppc_md.power_save = power9_idle;
+		pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n",
 			pnv_default_stop_val, pnv_default_stop_mask);
 	}
 
-	if (!deepest_stop_found) {
-		pnv_deepest_stop_psscr_val = PSSCR_HV_DEFAULT_VAL;
-		pnv_deepest_stop_psscr_mask = PSSCR_HV_DEFAULT_MASK;
-		pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n",
+	if (unlikely(!deepest_stop_found)) {
+		pr_warn("cpuidle-powernv: No suitable stop state for CPU-Hotplug. Offlined CPUs will busy wait");
+	} else {
+		pr_info("cpuidle-powernv: Deepest stop: psscr = 0x%016llx,mask=0x%016llx\n",
 			pnv_deepest_stop_psscr_val,
 			pnv_deepest_stop_psscr_mask);
 	}
 
+	pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n",
+		pnv_first_deep_stop_state);
 out:
 	kfree(psscr_val);
 	kfree(psscr_mask);
@@ -559,8 +562,6 @@ static int __init pnv_init_idle_states(void)
 
 	if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED)
 		ppc_md.power_save = power7_idle;
-	else if (supported_cpuidle_states & OPAL_PM_STOP_INST_FAST)
-		ppc_md.power_save = power9_idle;
 
 out:
 	return 0;
-- 
cgit v1.2.1


From 17ed4c8f81da2bf340d33a8c875f4d6b1dfd9398 Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 20:34:17 +0530
Subject: powerpc/powernv: Recover correct PACA on wakeup from a stop on P9 DD1

POWER9 DD1.0 hardware has a bug where the SPRs of a thread waking up
from stop 0,1,2 with ESL=1 can endup being misplaced in the core. Thus
the HSPRG0 of a thread waking up from can contain the paca pointer of
its sibling.

This patch implements a context recovery framework within threads of a
core, by provisioning space in paca_struct for saving every sibling
threads's paca pointers. Basically, we should be able to arrive at the
right paca pointer from any of the thread's existing paca pointer.

At bootup, during powernv idle-init, we save the paca address of every
CPU in each one its siblings paca_struct in the slot corresponding to
this CPU's index in the core.

On wakeup from a stop, the thread will determine its index in the core
from the TIR register and recover its PACA pointer by indexing into
the correct slot in the provisioned space in the current PACA.

Furthermore, ensure that the NVGPRs are restored from the stack on the
way out by setting the NAPSTATELOST in paca.

[Changelog written with inputs from svaidy@linux.vnet.ibm.com]
Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Call it a bug]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/paca.h       |  5 ++++
 arch/powerpc/kernel/asm-offsets.c     |  1 +
 arch/powerpc/kernel/idle_book3s.S     | 48 ++++++++++++++++++++++++++++++++++-
 arch/powerpc/platforms/powernv/idle.c | 30 ++++++++++++++++++++++
 4 files changed, 83 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 65d0ffb2aa33..140ddb9ae5a8 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -173,6 +173,11 @@ struct paca_struct {
 	u8 thread_mask;
 	/* Mask to denote subcore sibling threads */
 	u8 subcore_sibling_mask;
+	/*
+	 * Pointer to an array which contains pointer
+	 * to the sibling threads' paca.
+	 */
+	struct paca_struct **thread_sibling_pacas;
 #endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 795505e27535..e7c8229a8812 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -728,6 +728,7 @@ int main(void)
 	OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state);
 	OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask);
 	OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask);
+	OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas);
 #endif
 
 	DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 995728736677..24717a73b6bb 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -375,6 +375,46 @@ _GLOBAL(power9_idle_stop)
 	li	r4,1
 	b	pnv_powersave_common
 	/* No return */
+
+
+/*
+ * On waking up from stop 0,1,2 with ESL=1 on POWER9 DD1,
+ * HSPRG0 will be set to the HSPRG0 value of one of the
+ * threads in this core. Thus the value we have in r13
+ * may not be this thread's paca pointer.
+ *
+ * Fortunately, the TIR remains invariant. Since this thread's
+ * paca pointer is recorded in all its sibling's paca, we can
+ * correctly recover this thread's paca pointer if we
+ * know the index of this thread in the core.
+ *
+ * This index can be obtained from the TIR.
+ *
+ * i.e, thread's position in the core = TIR.
+ * If this value is i, then this thread's paca is
+ * paca->thread_sibling_pacas[i].
+ */
+power9_dd1_recover_paca:
+	mfspr	r4, SPRN_TIR
+	/*
+	 * Since each entry in thread_sibling_pacas is 8 bytes
+	 * we need to left-shift by 3 bits. Thus r4 = i * 8
+	 */
+	sldi	r4, r4, 3
+	/* Get &paca->thread_sibling_pacas[0] in r5 */
+	ld	r5, PACA_SIBLING_PACA_PTRS(r13)
+	/* Load paca->thread_sibling_pacas[i] into r13 */
+	ldx	r13, r4, r5
+	SET_PACA(r13)
+	ld	r2, PACATOC(r13)
+	/*
+	 * Indicate that we have lost NVGPR state
+	 * which needs to be restored from the stack.
+	 */
+	li	r3, 1
+	stb	r0,PACA_NAPSTATELOST(r13)
+	blr
+
 /*
  * Called from reset vector. Check whether we have woken up with
  * hypervisor state loss. If yes, restore hypervisor state and return
@@ -385,7 +425,13 @@ _GLOBAL(power9_idle_stop)
  */
 _GLOBAL(pnv_restore_hyp_resource)
 BEGIN_FTR_SECTION
-	ld	r2,PACATOC(r13);
+BEGIN_FTR_SECTION_NESTED(70)
+	mflr 	r6
+	bl	power9_dd1_recover_paca
+	mtlr	r6
+FTR_SECTION_ELSE_NESTED(70)
+	ld	r2, PACATOC(r13)
+ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_POWER9_DD1, 70)
 	/*
 	 * POWER ISA 3. Use PSSCR to determine if we
 	 * are waking up from deep idle state
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 63ade787aec4..b369e39aa392 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -122,9 +122,12 @@ static void pnv_alloc_idle_core_states(void)
 	for (i = 0; i < nr_cores; i++) {
 		int first_cpu = i * threads_per_core;
 		int node = cpu_to_node(first_cpu);
+		size_t paca_ptr_array_size;
 
 		core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
 		*core_idle_state = PNV_CORE_IDLE_THREAD_BITS;
+		paca_ptr_array_size = (threads_per_core *
+				       sizeof(struct paca_struct *));
 
 		for (j = 0; j < threads_per_core; j++) {
 			int cpu = first_cpu + j;
@@ -132,6 +135,11 @@ static void pnv_alloc_idle_core_states(void)
 			paca[cpu].core_idle_state_ptr = core_idle_state;
 			paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
 			paca[cpu].thread_mask = 1 << j;
+			if (!cpu_has_feature(CPU_FTR_POWER9_DD1))
+				continue;
+			paca[cpu].thread_sibling_pacas =
+				kmalloc_node(paca_ptr_array_size,
+					     GFP_KERNEL, node);
 		}
 	}
 
@@ -560,6 +568,28 @@ static int __init pnv_init_idle_states(void)
 
 	pnv_alloc_idle_core_states();
 
+	/*
+	 * For each CPU, record its PACA address in each of it's
+	 * sibling thread's PACA at the slot corresponding to this
+	 * CPU's index in the core.
+	 */
+	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+		int cpu;
+
+		pr_info("powernv: idle: Saving PACA pointers of all CPUs in their thread sibling PACA\n");
+		for_each_possible_cpu(cpu) {
+			int base_cpu = cpu_first_thread_sibling(cpu);
+			int idx = cpu_thread_in_core(cpu);
+			int i;
+
+			for (i = 0; i < threads_per_core; i++) {
+				int j = base_cpu + i;
+
+				paca[j].thread_sibling_pacas[idx] = &paca[cpu];
+			}
+		}
+	}
+
 	if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED)
 		ppc_md.power_save = power7_idle;
 
-- 
cgit v1.2.1


From 03dfee6d5f824d14e3ecb742518740de69e603cc Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 12 Apr 2017 14:56:36 +1000
Subject: powerpc/mm: Fix swapper_pg_dir size on 64-bit hash w/64K pages

Recently in commit f6eedbba7a26 ("powerpc/mm/hash: Increase VA range to 128TB"),
we increased H_PGD_INDEX_SIZE to 15 when we're building with 64K pages. This
makes it larger than RADIX_PGD_INDEX_SIZE (13), which means the logic to
calculate MAX_PGD_INDEX_SIZE in book3s/64/pgtable.h is wrong.

The end result is that the PGD (Page Global Directory, ie top level page table)
of the kernel (aka. swapper_pg_dir), is too small.

This generally doesn't lead to a crash, as we don't use the full range in normal
operation. However if we try to dump the kernel pagetables we can trigger a
crash because we walk off the end of the pgd into other memory and eventually
try to dereference something bogus:

  $ cat /sys/kernel/debug/kernel_pagetables
  Unable to handle kernel paging request for data at address 0xe8fece0000000000
  Faulting instruction address: 0xc000000000072314
  cpu 0xc: Vector: 380 (Data SLB Access) at [c0000000daa13890]
      pc: c000000000072314: ptdump_show+0x164/0x430
      lr: c000000000072550: ptdump_show+0x3a0/0x430
     dar: e802cf0000000000
  seq_read+0xf8/0x560
  full_proxy_read+0x84/0xc0
  __vfs_read+0x6c/0x1d0
  vfs_read+0xbc/0x1b0
  SyS_read+0x6c/0x110
  system_call+0x38/0xfc

The root cause is that MAX_PGD_INDEX_SIZE isn't actually computed to be
the max of H_PGD_INDEX_SIZE or RADIX_PGD_INDEX_SIZE. To fix that move
the calculation into asm-offsets.c where we can do it easily using
max().

Fixes: f6eedbba7a26 ("powerpc/mm/hash: Increase VA range to 128TB")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 4 ----
 arch/powerpc/kernel/asm-offsets.c            | 4 ++--
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index fb72ff6b98e6..fb8380a2d8d5 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -232,10 +232,6 @@ extern unsigned long __pte_frag_nr;
 extern unsigned long __pte_frag_size_shift;
 #define PTE_FRAG_SIZE_SHIFT __pte_frag_size_shift
 #define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT)
-/*
- * Pgtable size used by swapper, init in asm code
- */
-#define MAX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE)
 
 #define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
 #define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index e7c8229a8812..8e1163426ccb 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -400,8 +400,8 @@ int main(void)
 	DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry));
 #endif
 
-#ifdef MAX_PGD_TABLE_SIZE
-	DEFINE(PGD_TABLE_SIZE, MAX_PGD_TABLE_SIZE);
+#ifdef CONFIG_PPC_BOOK3S_64
+	DEFINE(PGD_TABLE_SIZE, (sizeof(pgd_t) << max(RADIX_PGD_INDEX_SIZE, H_PGD_INDEX_SIZE)));
 #else
 	DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE);
 #endif
-- 
cgit v1.2.1


From 9c355917fcf006af47ffaa5ae43a1a804764a6f6 Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Wed, 12 Apr 2017 16:35:19 +1000
Subject: powerpc/tracing: Allow tracing of mmap syscalls

Currently sys_mmap() and sys_mmap2() (32-bit only), are not visible to the
syscall tracing machinery. This means users are not able to see the execution of
mmap() syscalls using the syscall tracer.

Fix that by using SYSCALL_DEFINE6 for sys_mmap() and sys_mmap2() so that the
meta-data associated with these syscalls is visible to the syscall tracer.

A side-effect of this change is that the return type has changed from unsigned
long to long. However this should have no effect, the only code in the kernel
which uses the result of these syscalls is in the syscall return path, which is
written in asm and treats the result as unsigned regardless.

Example output:
  cat-3399  [001] ....   196.542410: sys_mmap(addr: 7fff922a0000, len: 20000, prot: 3, flags: 812, fd: 3, offset: 1b0000)
  cat-3399  [001] ....   196.542443: sys_mmap -> 0x7fff922a0000
  cat-3399  [001] ....   196.542668: sys_munmap(addr: 7fff922c0000, len: 6d2c)
  cat-3399  [001] ....   196.542677: sys_munmap -> 0x0

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
[mpe: Massage change log, add detail on return type change]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/syscalls.h |  4 ++--
 arch/powerpc/kernel/syscalls.c      | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h
index 23be8f1e7e64..16fab6898240 100644
--- a/arch/powerpc/include/asm/syscalls.h
+++ b/arch/powerpc/include/asm/syscalls.h
@@ -8,10 +8,10 @@
 
 struct rtas_args;
 
-asmlinkage unsigned long sys_mmap(unsigned long addr, size_t len,
+asmlinkage long sys_mmap(unsigned long addr, size_t len,
 		unsigned long prot, unsigned long flags,
 		unsigned long fd, off_t offset);
-asmlinkage unsigned long sys_mmap2(unsigned long addr, size_t len,
+asmlinkage long sys_mmap2(unsigned long addr, size_t len,
 		unsigned long prot, unsigned long flags,
 		unsigned long fd, unsigned long pgoff);
 asmlinkage long ppc64_personality(unsigned long personality);
diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
index de04c9fbb5cd..a877bf8269fe 100644
--- a/arch/powerpc/kernel/syscalls.c
+++ b/arch/powerpc/kernel/syscalls.c
@@ -42,11 +42,11 @@
 #include <asm/unistd.h>
 #include <asm/asm-prototypes.h>
 
-static inline unsigned long do_mmap2(unsigned long addr, size_t len,
+static inline long do_mmap2(unsigned long addr, size_t len,
 			unsigned long prot, unsigned long flags,
 			unsigned long fd, unsigned long off, int shift)
 {
-	unsigned long ret = -EINVAL;
+	long ret = -EINVAL;
 
 	if (!arch_validate_prot(prot))
 		goto out;
@@ -62,16 +62,16 @@ out:
 	return ret;
 }
 
-unsigned long sys_mmap2(unsigned long addr, size_t len,
-			unsigned long prot, unsigned long flags,
-			unsigned long fd, unsigned long pgoff)
+SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len,
+		unsigned long, prot, unsigned long, flags,
+		unsigned long, fd, unsigned long, pgoff)
 {
 	return do_mmap2(addr, len, prot, flags, fd, pgoff, PAGE_SHIFT-12);
 }
 
-unsigned long sys_mmap(unsigned long addr, size_t len,
-		       unsigned long prot, unsigned long flags,
-		       unsigned long fd, off_t offset)
+SYSCALL_DEFINE6(mmap, unsigned long, addr, size_t, len,
+		unsigned long, prot, unsigned long, flags,
+		unsigned long, fd, off_t, offset)
 {
 	return do_mmap2(addr, len, prot, flags, fd, offset, PAGE_SHIFT);
 }
-- 
cgit v1.2.1


From 70538eaa70c36340b4bb27206cad31839e916f40 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Fri, 31 Mar 2017 12:37:48 +1100
Subject: powerpc/mm: Fix missing _PAGE_NON_IDEMPOTENT in pgtable dump

On Book3s we have two PTE flags used to mark cache-inhibited mappings:
_PAGE_TOLERANT and _PAGE_NON_IDEMPOTENT. Currently the kernel page table dumper
only looks at the generic _PAGE_NO_CACHE which is defined to be _PAGE_TOLERANT.
This patch modifies the dumper so both flags are shown in the dump.

Fixes: 8eb07b187000 ("powerpc/mm: Dump linux pagetables")
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/dump_linuxpagetables.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index 49abaf4dc8e3..e7cbfd5a0940 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -154,11 +154,24 @@ static const struct flag_info flag_array[] = {
 		.clear	= "             ",
 	}, {
 #endif
+#ifndef CONFIG_PPC_BOOK3S_64
 		.mask	= _PAGE_NO_CACHE,
 		.val	= _PAGE_NO_CACHE,
 		.set	= "no cache",
 		.clear	= "        ",
 	}, {
+#else
+		.mask	= _PAGE_NON_IDEMPOTENT,
+		.val	= _PAGE_NON_IDEMPOTENT,
+		.set	= "non-idempotent",
+		.clear	= "              ",
+	}, {
+		.mask	= _PAGE_TOLERANT,
+		.val	= _PAGE_TOLERANT,
+		.set	= "tolerant",
+		.clear	= "        ",
+	}, {
+#endif
 #ifdef CONFIG_PPC_BOOK3S_64
 		.mask	= H_PAGE_BUSY,
 		.val	= H_PAGE_BUSY,
-- 
cgit v1.2.1


From aaa229529244a1135b29353fefb001c430db79f0 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Fri, 31 Mar 2017 12:37:49 +1100
Subject: powerpc/mm: Add physical address to Linux page table dump

The current page table dumper scans the Linux page tables and coalesces mappings
with adjacent virtual addresses and similar PTE flags. This behaviour is
somewhat broken when you consider the IOREMAP space where entirely unrelated
mappings will appear to be virtually contiguous. This patch modifies the range
coalescing so that only ranges that are both physically and virtually contiguous
are combined. This patch also adds to the dump output the physical address at
the start of each range.

Fixes: 8eb07b187000 ("powerpc/mm: Dump linux pagetables")
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
[mpe: Print the physicall address with 0x like the other addresses]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/dump_linuxpagetables.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index e7cbfd5a0940..b4cb7818dfd0 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -56,6 +56,8 @@ struct pg_state {
 	struct seq_file *seq;
 	const struct addr_marker *marker;
 	unsigned long start_address;
+	unsigned long start_pa;
+	unsigned long last_pa;
 	unsigned int level;
 	u64 current_flags;
 };
@@ -265,7 +267,9 @@ static void dump_addr(struct pg_state *st, unsigned long addr)
 	const char *unit = units;
 	unsigned long delta;
 
-	seq_printf(st->seq, "0x%016lx-0x%016lx   ", st->start_address, addr-1);
+	seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1);
+	seq_printf(st->seq, "0x%016lx ", st->start_pa);
+
 	delta = (addr - st->start_address) >> 10;
 	/* Work out what appropriate unit to use */
 	while (!(delta & 1023) && unit[1]) {
@@ -280,11 +284,15 @@ static void note_page(struct pg_state *st, unsigned long addr,
 	       unsigned int level, u64 val)
 {
 	u64 flag = val & pg_level[level].mask;
+	u64 pa = val & PTE_RPN_MASK;
+
 	/* At first no level is set */
 	if (!st->level) {
 		st->level = level;
 		st->current_flags = flag;
 		st->start_address = addr;
+		st->start_pa = pa;
+		st->last_pa = pa;
 		seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
 	/*
 	 * Dump the section of virtual memory when:
@@ -292,9 +300,11 @@ static void note_page(struct pg_state *st, unsigned long addr,
 	 *   - we change levels in the tree.
 	 *   - the address is in a different section of memory and is thus
 	 *   used for a different purpose, regardless of the flags.
+	 *   - the pa of this page is not adjacent to the last inspected page
 	 */
 	} else if (flag != st->current_flags || level != st->level ||
-		   addr >= st->marker[1].start_address) {
+		   addr >= st->marker[1].start_address ||
+		   pa != st->last_pa + PAGE_SIZE) {
 
 		/* Check the PTE flags */
 		if (st->current_flags) {
@@ -318,8 +328,12 @@ static void note_page(struct pg_state *st, unsigned long addr,
 			seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
 		}
 		st->start_address = addr;
+		st->start_pa = pa;
+		st->last_pa = pa;
 		st->current_flags = flag;
 		st->level = level;
+	} else {
+		st->last_pa = pa;
 	}
 }
 
-- 
cgit v1.2.1


From 9e4114b3913cff37be509224116b6f58a2a3918f Mon Sep 17 00:00:00 2001
From: Rashmica Gupta <rashmica.g@gmail.com>
Date: Mon, 10 Apr 2017 12:04:04 +1000
Subject: powerpc/mm: Fix hash table dump when memory is not contiguous

The current behaviour of the hash table dump assumes that memory is contiguous
and iterates from the start of memory to (start + size of memory). When memory
isn't physically contiguous, this doesn't work.

If memory exists at 0-5 GB and 6-10 GB then the current approach will check if
entries exist in the hash table from 0GB to 9GB. This patch changes the
behaviour to iterate over any holes up to the end of memory.

Fixes: 1515ab932156 ("powerpc/mm: Dump hash table")
Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/dump_hashpagetable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c
index d979709a0239..c6b900f54c07 100644
--- a/arch/powerpc/mm/dump_hashpagetable.c
+++ b/arch/powerpc/mm/dump_hashpagetable.c
@@ -468,7 +468,7 @@ static void walk_linearmapping(struct pg_state *st)
 	unsigned long psize = 1 << mmu_psize_defs[mmu_linear_psize].shift;
 
 	for (addr = PAGE_OFFSET; addr < PAGE_OFFSET +
-			memblock_phys_mem_size(); addr += psize)
+			memblock_end_of_DRAM(); addr += psize)
 		hpte_find(st, addr, mmu_linear_psize);
 }
 
-- 
cgit v1.2.1


From 5f8122611b35c563a7c907485aafd8d4682e9184 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 12 Apr 2017 10:10:22 +0530
Subject: powerpc/mm/hash: Don't open code VMALLOC_INDEX

We have a #define for it, so use it.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/slb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 98ae810b8c21..654a0d7ba0e7 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -131,7 +131,7 @@ static void __slb_flush_and_rebolt(void)
 		     "slbmte	%2,%3\n"
 		     "isync"
 		     :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)),
-		        "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, 1)),
+		        "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, VMALLOC_INDEX)),
 		        "r"(ksp_vsid_data),
 		        "r"(ksp_esid_data)
 		     : "memory");
-- 
cgit v1.2.1


From 794464f4dea0b13dacad267c06a01fc9c24f713a Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 7 Apr 2017 11:27:43 +1000
Subject: powerpc/64s: Add msgp facility unavailable log string

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/traps.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 354946236c61..65bd13338722 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1440,6 +1440,7 @@ void facility_unavailable_exception(struct pt_regs *regs)
 		[FSCR_TM_LG] = "TM",
 		[FSCR_EBB_LG] = "EBB",
 		[FSCR_TAR_LG] = "TAR",
+		[FSCR_MSGP_LG] = "MSGP",
 	};
 	char *facility = "unknown";
 	u64 value;
-- 
cgit v1.2.1


From 9b7ff0c6586bc0541ebcd1ff6773b11a49f1a058 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 7 Apr 2017 11:27:44 +1000
Subject: powerpc/64s: Add SCV FSCR bit for ISA v3.0

Add the bit definition and use it in facility_unavailable_exception() so we can
intelligently report the cause if we take a fault for SCV. This doesn't actually
enable SCV.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Drop whitespace changes to the existing entries, flush out change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/reg.h | 2 ++
 arch/powerpc/kernel/traps.c    | 1 +
 2 files changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index d0b332b8afad..4b594ed6180a 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -310,6 +310,7 @@
 #define SPRN_PMCR	0x374	/* Power Management Control Register */
 
 /* HFSCR and FSCR bit numbers are the same */
+#define FSCR_SCV_LG	12	/* Enable System Call Vectored */
 #define FSCR_MSGP_LG	10	/* Enable MSGP */
 #define FSCR_TAR_LG	8	/* Enable Target Address Register */
 #define FSCR_EBB_LG	7	/* Enable Event Based Branching */
@@ -320,6 +321,7 @@
 #define FSCR_VECVSX_LG	1	/* Enable VMX/VSX  */
 #define FSCR_FP_LG	0	/* Enable Floating Point */
 #define SPRN_FSCR	0x099	/* Facility Status & Control Register */
+#define   FSCR_SCV	__MASK(FSCR_SCV_LG)
 #define   FSCR_TAR	__MASK(FSCR_TAR_LG)
 #define   FSCR_EBB	__MASK(FSCR_EBB_LG)
 #define   FSCR_DSCR	__MASK(FSCR_DSCR_LG)
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 65bd13338722..76f6045b021b 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1441,6 +1441,7 @@ void facility_unavailable_exception(struct pt_regs *regs)
 		[FSCR_EBB_LG] = "EBB",
 		[FSCR_TAR_LG] = "TAR",
 		[FSCR_MSGP_LG] = "MSGP",
+		[FSCR_SCV_LG] = "SCV",
 	};
 	char *facility = "unknown";
 	u64 value;
-- 
cgit v1.2.1


From b866cc2199d6a6cdcefe4acfe4cfca3ac3c6d38e Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 13 Apr 2017 20:16:21 +1000
Subject: powerpc: Change the doorbell IPI calling convention

Change the doorbell callers to know about their msgsnd addressing,
rather than have them set a per-cpu target data tag at boot that gets
sent to the cause_ipi functions. The data is only used for doorbell IPI
functions, no other IPI types, so it makes sense to keep that detail
local to doorbell.

Have the platform code understand doorbell IPIs, rather than the
interrupt controller code understand them. Platform code can look at
capabilities it has available and decide which to use.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/dbell.h       |  9 ++----
 arch/powerpc/include/asm/smp.h         |  3 +-
 arch/powerpc/include/asm/xics.h        |  2 +-
 arch/powerpc/kernel/dbell.c            | 52 ++++++++++++++++++++++++++++++----
 arch/powerpc/kernel/smp.c              | 17 ++++-------
 arch/powerpc/platforms/85xx/smp.c      | 11 ++-----
 arch/powerpc/platforms/powermac/smp.c  |  2 +-
 arch/powerpc/platforms/powernv/smp.c   | 23 ++++++++++-----
 arch/powerpc/platforms/pseries/smp.c   | 27 +++++++-----------
 arch/powerpc/sysdev/xics/icp-hv.c      |  2 +-
 arch/powerpc/sysdev/xics/icp-native.c  | 12 +-------
 arch/powerpc/sysdev/xics/icp-opal.c    |  2 +-
 arch/powerpc/sysdev/xics/xics-common.c |  3 --
 arch/powerpc/sysdev/xive/common.c      |  6 ++--
 14 files changed, 92 insertions(+), 79 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
index 378167377065..a66eb596940c 100644
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -35,8 +35,6 @@ enum ppc_dbell {
 #ifdef CONFIG_PPC_BOOK3S
 
 #define PPC_DBELL_MSGTYPE		PPC_DBELL_SERVER
-#define SPRN_DOORBELL_CPUTAG		SPRN_TIR
-#define PPC_DBELL_TAG_MASK		0x7f
 
 static inline void _ppc_msgsnd(u32 msg)
 {
@@ -49,8 +47,6 @@ static inline void _ppc_msgsnd(u32 msg)
 #else /* CONFIG_PPC_BOOK3S */
 
 #define PPC_DBELL_MSGTYPE		PPC_DBELL
-#define SPRN_DOORBELL_CPUTAG		SPRN_PIR
-#define PPC_DBELL_TAG_MASK		0x3fff
 
 static inline void _ppc_msgsnd(u32 msg)
 {
@@ -59,9 +55,10 @@ static inline void _ppc_msgsnd(u32 msg)
 
 #endif /* CONFIG_PPC_BOOK3S */
 
-extern void doorbell_cause_ipi(int cpu, unsigned long data);
+extern void doorbell_global_ipi(int cpu);
+extern void doorbell_core_ipi(int cpu);
+extern int doorbell_try_core_ipi(int cpu);
 extern void doorbell_exception(struct pt_regs *regs);
-extern void doorbell_setup_this_cpu(void);
 
 static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag)
 {
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 63fa780b71a0..d9ed3b02615e 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -40,7 +40,7 @@ extern int cpu_to_chip_id(int cpu);
 struct smp_ops_t {
 	void  (*message_pass)(int cpu, int msg);
 #ifdef CONFIG_PPC_SMP_MUXED_IPI
-	void  (*cause_ipi)(int cpu, unsigned long data);
+	void  (*cause_ipi)(int cpu);
 #endif
 	void  (*probe)(void);
 	int   (*kick_cpu)(int nr);
@@ -125,7 +125,6 @@ extern int smp_request_message_ipi(int virq, int message);
 extern const char *smp_ipi_name[];
 
 /* for irq controllers with only a single ipi */
-extern void smp_muxed_ipi_set_data(int cpu, unsigned long data);
 extern void smp_muxed_ipi_message_pass(int cpu, int msg);
 extern void smp_muxed_ipi_set_message(int cpu, int msg);
 extern irqreturn_t smp_ipi_demux(void);
diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h
index e0b9e576905a..7ce2c3ac2964 100644
--- a/arch/powerpc/include/asm/xics.h
+++ b/arch/powerpc/include/asm/xics.h
@@ -57,7 +57,7 @@ struct icp_ops {
 	void (*teardown_cpu)(void);
 	void (*flush_ipi)(void);
 #ifdef CONFIG_SMP
-	void (*cause_ipi)(int cpu, unsigned long data);
+	void (*cause_ipi)(int cpu);
 	irq_handler_t ipi_action;
 #endif
 };
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index 2128f3a96c32..5869c66adfd0 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -20,18 +20,60 @@
 #include <asm/kvm_ppc.h>
 
 #ifdef CONFIG_SMP
-void doorbell_setup_this_cpu(void)
+
+/*
+ * Doorbells must only be used if CPU_FTR_DBELL is available.
+ * msgsnd is used in HV, and msgsndp is used in !HV.
+ *
+ * These should be used by platform code that is aware of restrictions.
+ * Other arch code should use ->cause_ipi.
+ *
+ * doorbell_global_ipi() sends a dbell to any target CPU.
+ * Must be used only by architectures that address msgsnd target
+ * by PIR/get_hard_smp_processor_id.
+ */
+void doorbell_global_ipi(int cpu)
 {
-	unsigned long tag = mfspr(SPRN_DOORBELL_CPUTAG) & PPC_DBELL_TAG_MASK;
+	u32 tag = get_hard_smp_processor_id(cpu);
 
-	smp_muxed_ipi_set_data(smp_processor_id(), tag);
+	kvmppc_set_host_ipi(cpu, 1);
+	/* Order previous accesses vs. msgsnd, which is treated as a store */
+	mb();
+	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
 }
 
-void doorbell_cause_ipi(int cpu, unsigned long data)
+/*
+ * doorbell_core_ipi() sends a dbell to a target CPU in the same core.
+ * Must be used only by architectures that address msgsnd target
+ * by TIR/cpu_thread_in_core.
+ */
+void doorbell_core_ipi(int cpu)
 {
+	u32 tag = cpu_thread_in_core(cpu);
+
+	kvmppc_set_host_ipi(cpu, 1);
 	/* Order previous accesses vs. msgsnd, which is treated as a store */
 	mb();
-	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, data);
+	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
+}
+
+/*
+ * Attempt to cause a core doorbell if destination is on the same core.
+ * Returns 1 on success, 0 on failure.
+ */
+int doorbell_try_core_ipi(int cpu)
+{
+	int this_cpu = get_cpu();
+	int ret = 0;
+
+	if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) {
+		doorbell_core_ipi(cpu);
+		ret = 1;
+	}
+
+	put_cpu();
+
+	return ret;
 }
 
 void doorbell_exception(struct pt_regs *regs)
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 6e61cdb89194..17de938d41c5 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -39,6 +39,7 @@
 #include <asm/irq.h>
 #include <asm/hw_irq.h>
 #include <asm/kvm_ppc.h>
+#include <asm/dbell.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/prom.h>
@@ -211,17 +212,9 @@ int smp_request_message_ipi(int virq, int msg)
 #ifdef CONFIG_PPC_SMP_MUXED_IPI
 struct cpu_messages {
 	long messages;			/* current messages */
-	unsigned long data;		/* data for cause ipi */
 };
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
 
-void smp_muxed_ipi_set_data(int cpu, unsigned long data)
-{
-	struct cpu_messages *info = &per_cpu(ipi_message, cpu);
-
-	info->data = data;
-}
-
 void smp_muxed_ipi_set_message(int cpu, int msg)
 {
 	struct cpu_messages *info = &per_cpu(ipi_message, cpu);
@@ -236,14 +229,13 @@ void smp_muxed_ipi_set_message(int cpu, int msg)
 
 void smp_muxed_ipi_message_pass(int cpu, int msg)
 {
-	struct cpu_messages *info = &per_cpu(ipi_message, cpu);
-
 	smp_muxed_ipi_set_message(cpu, msg);
+
 	/*
 	 * cause_ipi functions are required to include a full barrier
 	 * before doing whatever causes the IPI.
 	 */
-	smp_ops->cause_ipi(cpu, info->data);
+	smp_ops->cause_ipi(cpu);
 }
 
 #ifdef __BIG_ENDIAN__
@@ -254,11 +246,12 @@ void smp_muxed_ipi_message_pass(int cpu, int msg)
 
 irqreturn_t smp_ipi_demux(void)
 {
-	struct cpu_messages *info = this_cpu_ptr(&ipi_message);
+	struct cpu_messages *info;
 	unsigned long all;
 
 	mb();	/* order any irq clear */
 
+	info = this_cpu_ptr(&ipi_message);
 	do {
 		all = xchg(&info->messages, 0);
 #if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index 078097a0b09d..0975066f76e8 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -461,16 +461,9 @@ static void mpc85xx_smp_machine_kexec(struct kimage *image)
 }
 #endif /* CONFIG_KEXEC_CORE */
 
-static void smp_85xx_basic_setup(int cpu_nr)
-{
-	if (cpu_has_feature(CPU_FTR_DBELL))
-		doorbell_setup_this_cpu();
-}
-
 static void smp_85xx_setup_cpu(int cpu_nr)
 {
 	mpic_setup_this_cpu();
-	smp_85xx_basic_setup(cpu_nr);
 }
 
 void __init mpc85xx_smp_init(void)
@@ -484,7 +477,7 @@ void __init mpc85xx_smp_init(void)
 		smp_85xx_ops.setup_cpu = smp_85xx_setup_cpu;
 		smp_85xx_ops.message_pass = smp_mpic_message_pass;
 	} else
-		smp_85xx_ops.setup_cpu = smp_85xx_basic_setup;
+		smp_85xx_ops.setup_cpu = NULL;
 
 	if (cpu_has_feature(CPU_FTR_DBELL)) {
 		/*
@@ -492,7 +485,7 @@ void __init mpc85xx_smp_init(void)
 		 * smp_muxed_ipi_message_pass
 		 */
 		smp_85xx_ops.message_pass = NULL;
-		smp_85xx_ops.cause_ipi = doorbell_cause_ipi;
+		smp_85xx_ops.cause_ipi = doorbell_global_ipi;
 		smp_85xx_ops.probe = NULL;
 	}
 
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index 746ca7321b03..a3207cea360e 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -172,7 +172,7 @@ static irqreturn_t psurge_ipi_intr(int irq, void *d)
 	return IRQ_HANDLED;
 }
 
-static void smp_psurge_cause_ipi(int cpu, unsigned long data)
+static void smp_psurge_cause_ipi(int cpu)
 {
 	psurge_set_ipi(cpu);
 }
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 39296bf7009e..33bd4dd2cb41 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -53,11 +53,6 @@ static void pnv_smp_setup_cpu(int cpu)
 		xive_smp_setup_cpu();
 	else if (cpu != boot_cpuid)
 		xics_setup_cpu();
-
-#ifdef CONFIG_PPC_DOORBELL
-	if (cpu_has_feature(CPU_FTR_DBELL))
-		doorbell_setup_this_cpu();
-#endif
 }
 
 static int pnv_smp_kick_cpu(int nr)
@@ -254,17 +249,31 @@ static int pnv_smp_prepare_cpu(int cpu)
 	return 0;
 }
 
+static void pnv_cause_ipi(int cpu)
+{
+	if (doorbell_try_core_ipi(cpu))
+		return;
+
+	icp_ops->cause_ipi(cpu);
+}
+
 static void __init pnv_smp_probe(void)
 {
 	if (xive_enabled())
 		xive_smp_probe();
 	else
 		xics_smp_probe();
+
+	if (cpu_has_feature(CPU_FTR_DBELL) && !cpu_has_feature(CPU_FTR_ARCH_300)) {
+		smp_ops->cause_ipi = pnv_cause_ipi;
+	} else {
+		smp_ops->cause_ipi = icp_ops->cause_ipi;
+	}
 }
 
 static struct smp_ops_t pnv_smp_ops = {
-	.message_pass	= smp_muxed_ipi_message_pass,
-	.cause_ipi	= NULL, /* Filled at runtime by xi{cs,ve}_smp_probe() */
+	.message_pass	= NULL, /* Use smp_muxed_ipi_message_pass */
+	.cause_ipi	= NULL,	/* Filled at runtime by pnv_smp_probe() */
 	.probe		= pnv_smp_probe,
 	.prepare_cpu	= pnv_smp_prepare_cpu,
 	.kick_cpu	= pnv_smp_kick_cpu,
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index f6f83aeccaaa..12ff5bef67fd 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -55,11 +55,6 @@
  */
 static cpumask_var_t of_spin_mask;
 
-/*
- * If we multiplex IPI mechanisms, store the appropriate XICS IPI mechanism here
- */
-static void  (*xics_cause_ipi)(int cpu, unsigned long data);
-
 /* Query where a cpu is now.  Return codes #defined in plpar_wrappers.h */
 int smp_query_cpu_stopped(unsigned int pcpu)
 {
@@ -143,8 +138,6 @@ static void smp_setup_cpu(int cpu)
 {
 	if (cpu != boot_cpuid)
 		xics_setup_cpu();
-	if (cpu_has_feature(CPU_FTR_DBELL))
-		doorbell_setup_this_cpu();
 
 	if (firmware_has_feature(FW_FEATURE_SPLPAR))
 		vpa_init(cpu);
@@ -187,23 +180,23 @@ static int smp_pSeries_kick_cpu(int nr)
 	return 0;
 }
 
-/* Only used on systems that support multiple IPI mechanisms */
-static void pSeries_cause_ipi_mux(int cpu, unsigned long data)
+static void smp_pseries_cause_ipi(int cpu)
 {
-	if (cpumask_test_cpu(cpu, cpu_sibling_mask(smp_processor_id())))
-		doorbell_cause_ipi(cpu, data);
-	else
-		xics_cause_ipi(cpu, data);
+	/* POWER9 should not use this handler */
+	if (doorbell_try_core_ipi(cpu))
+		return;
+
+	icp_ops->cause_ipi(cpu);
 }
 
 static __init void pSeries_smp_probe(void)
 {
 	xics_smp_probe();
 
-	if (cpu_has_feature(CPU_FTR_DBELL)) {
-		xics_cause_ipi = smp_ops->cause_ipi;
-		smp_ops->cause_ipi = pSeries_cause_ipi_mux;
-	}
+	if (cpu_has_feature(CPU_FTR_DBELL))
+		smp_ops->cause_ipi = smp_pseries_cause_ipi;
+	else
+		smp_ops->cause_ipi = icp_ops->cause_ipi;
 }
 
 static struct smp_ops_t pseries_smp_ops = {
diff --git a/arch/powerpc/sysdev/xics/icp-hv.c b/arch/powerpc/sysdev/xics/icp-hv.c
index e7fa26c4ff73..bbc839a98c41 100644
--- a/arch/powerpc/sysdev/xics/icp-hv.c
+++ b/arch/powerpc/sysdev/xics/icp-hv.c
@@ -138,7 +138,7 @@ static void icp_hv_set_cpu_priority(unsigned char cppr)
 
 #ifdef CONFIG_SMP
 
-static void icp_hv_cause_ipi(int cpu, unsigned long data)
+static void icp_hv_cause_ipi(int cpu)
 {
 	icp_hv_set_qirr(cpu, IPI_PRIORITY);
 }
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index f0f3f47a3fc9..2bfb9968d562 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -143,19 +143,9 @@ static unsigned int icp_native_get_irq(void)
 
 #ifdef CONFIG_SMP
 
-static void icp_native_cause_ipi(int cpu, unsigned long data)
+static void icp_native_cause_ipi(int cpu)
 {
 	kvmppc_set_host_ipi(cpu, 1);
-#ifdef CONFIG_PPC_DOORBELL
-	if (cpu_has_feature(CPU_FTR_DBELL)) {
-		if (cpumask_test_cpu(cpu, cpu_sibling_mask(get_cpu()))) {
-			doorbell_cause_ipi(cpu, data);
-			put_cpu();
-			return;
-		}
-		put_cpu();
-	}
-#endif
 	icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
diff --git a/arch/powerpc/sysdev/xics/icp-opal.c b/arch/powerpc/sysdev/xics/icp-opal.c
index b53f80f0b4d8..c71d2ea42627 100644
--- a/arch/powerpc/sysdev/xics/icp-opal.c
+++ b/arch/powerpc/sysdev/xics/icp-opal.c
@@ -126,7 +126,7 @@ static void icp_opal_eoi(struct irq_data *d)
 
 #ifdef CONFIG_SMP
 
-static void icp_opal_cause_ipi(int cpu, unsigned long data)
+static void icp_opal_cause_ipi(int cpu)
 {
 	int hw_cpu = get_hard_smp_processor_id(cpu);
 
diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c
index 23efe4e42172..2eb53c12ee6e 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -143,9 +143,6 @@ static void xics_request_ipi(void)
 
 void __init xics_smp_probe(void)
 {
-	/* Setup cause_ipi callback  based on which ICP is used */
-	smp_ops->cause_ipi = icp_ops->cause_ipi;
-
 	/* Register all the IPIs */
 	xics_request_ipi();
 }
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index d9cd7f705f21..6a98efb14264 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -834,15 +834,15 @@ static void xive_irq_free_data(unsigned int virq)
 
 #ifdef CONFIG_SMP
 
-static void xive_cause_ipi(int cpu, unsigned long msg)
+static void xive_cause_ipi(int cpu)
 {
 	struct xive_cpu *xc;
 	struct xive_irq_data *xd;
 
 	xc = per_cpu(xive_cpu, cpu);
 
-	DBG_VERBOSE("IPI msg#%ld CPU %d -> %d (HW IRQ 0x%x)\n",
-		    msg, smp_processor_id(), cpu, xc->hw_ipi);
+	DBG_VERBOSE("IPI CPU %d -> %d (HW IRQ 0x%x)\n",
+		    smp_processor_id(), cpu, xc->hw_ipi);
 
 	xd = &xc->ipi_data;
 	if (WARN_ON(!xd->trig_mmio))
-- 
cgit v1.2.1


From b87ac0218355a83abb899a0022bb2e5252879fc0 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 13 Apr 2017 20:16:22 +1000
Subject: powerpc: Introduce msgsnd/doorbell barrier primitives

POWER9 changes requirements and adds new instructions for
synchronization.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/dbell.h | 22 ++++++++++++++++++++++
 arch/powerpc/include/asm/smp.h   |  1 +
 arch/powerpc/kernel/dbell.c      |  8 +++++---
 arch/powerpc/kernel/smp.c        | 10 ++++++++--
 4 files changed, 36 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
index a66eb596940c..350694a1a6e5 100644
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -44,6 +44,17 @@ static inline void _ppc_msgsnd(u32 msg)
 		__asm__ __volatile__ (PPC_MSGSNDP(%0) : : "r" (msg));
 }
 
+/* sync before sending message */
+static inline void ppc_msgsnd_sync(void)
+{
+	__asm__ __volatile__ ("sync" : : : "memory");
+}
+
+/* sync after taking message interrupt */
+static inline void ppc_msgsync(void)
+{
+}
+
 #else /* CONFIG_PPC_BOOK3S */
 
 #define PPC_DBELL_MSGTYPE		PPC_DBELL
@@ -53,6 +64,17 @@ static inline void _ppc_msgsnd(u32 msg)
 	__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
 }
 
+/* sync before sending message */
+static inline void ppc_msgsnd_sync(void)
+{
+	__asm__ __volatile__ ("sync" : : : "memory");
+}
+
+/* sync after taking message interrupt */
+static inline void ppc_msgsync(void)
+{
+}
+
 #endif /* CONFIG_PPC_BOOK3S */
 
 extern void doorbell_global_ipi(int cpu);
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index d9ed3b02615e..751b2bd944fc 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -128,6 +128,7 @@ extern const char *smp_ipi_name[];
 extern void smp_muxed_ipi_message_pass(int cpu, int msg);
 extern void smp_muxed_ipi_set_message(int cpu, int msg);
 extern irqreturn_t smp_ipi_demux(void);
+extern irqreturn_t smp_ipi_demux_relaxed(void);
 
 void smp_init_pSeries(void);
 void smp_init_cell(void);
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index 5869c66adfd0..b6fe883b1016 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -38,7 +38,7 @@ void doorbell_global_ipi(int cpu)
 
 	kvmppc_set_host_ipi(cpu, 1);
 	/* Order previous accesses vs. msgsnd, which is treated as a store */
-	mb();
+	ppc_msgsnd_sync();
 	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
 }
 
@@ -53,7 +53,7 @@ void doorbell_core_ipi(int cpu)
 
 	kvmppc_set_host_ipi(cpu, 1);
 	/* Order previous accesses vs. msgsnd, which is treated as a store */
-	mb();
+	ppc_msgsnd_sync();
 	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
 }
 
@@ -82,12 +82,14 @@ void doorbell_exception(struct pt_regs *regs)
 
 	irq_enter();
 
+	ppc_msgsync();
+
 	may_hard_irq_enable();
 
 	kvmppc_set_host_ipi(smp_processor_id(), 0);
 	__this_cpu_inc(irq_stat.doorbell_irqs);
 
-	smp_ipi_demux();
+	smp_ipi_demux_relaxed(); /* already performed the barrier */
 
 	irq_exit();
 	set_irq_regs(old_regs);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 17de938d41c5..2eca1e491e2b 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -245,12 +245,18 @@ void smp_muxed_ipi_message_pass(int cpu, int msg)
 #endif
 
 irqreturn_t smp_ipi_demux(void)
+{
+	mb();	/* order any irq clear */
+
+	return smp_ipi_demux_relaxed();
+}
+
+/* sync-free variant. Callers should ensure synchronization */
+irqreturn_t smp_ipi_demux_relaxed(void)
 {
 	struct cpu_messages *info;
 	unsigned long all;
 
-	mb();	/* order any irq clear */
-
 	info = this_cpu_ptr(&ipi_message);
 	do {
 		all = xchg(&info->messages, 0);
-- 
cgit v1.2.1


From a5adf282461fb6048973ca3aec590495bdbc33f1 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 13 Apr 2017 20:16:23 +1000
Subject: powerpc/64s: Avoid a branch for ppc_msgsnd

IPIs are a pretty hot path and we already have the ability to do asm feature
patching, so use it.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Change log detail]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/dbell.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
index 350694a1a6e5..040944659a20 100644
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -38,10 +38,8 @@ enum ppc_dbell {
 
 static inline void _ppc_msgsnd(u32 msg)
 {
-	if (cpu_has_feature(CPU_FTR_HVMODE))
-		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
-	else
-		__asm__ __volatile__ (PPC_MSGSNDP(%0) : : "r" (msg));
+	__asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGSND(%1), PPC_MSGSNDP(%1), %0)
+				: : "i" (CPU_FTR_HVMODE), "r" (msg));
 }
 
 /* sync before sending message */
-- 
cgit v1.2.1


From 6b3edefefa6752df57ad636f26baa1b0a502ddab Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 13 Apr 2017 20:16:24 +1000
Subject: powerpc/powernv: POWER9 support for msgsnd/doorbell IPI

POWER9 requires msgsync for receiver-side synchronization, and a DD1
workaround restricts IPIs to core-local.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Drop no longer needed asm feature macro changes]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/dbell.h      |  3 +++
 arch/powerpc/include/asm/ppc-opcode.h |  2 ++
 arch/powerpc/platforms/powernv/smp.c  | 28 ++++++++++++++++++++++++++--
 3 files changed, 31 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
index 040944659a20..f70cbfe0ec04 100644
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -51,6 +51,9 @@ static inline void ppc_msgsnd_sync(void)
 /* sync after taking message interrupt */
 static inline void ppc_msgsync(void)
 {
+	/* sync is not required when taking messages from the same core */
+	__asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGSYNC " ; lwsync", "", %0)
+				: : "i" (CPU_FTR_HVMODE|CPU_FTR_ARCH_300));
 }
 
 #else /* CONFIG_PPC_BOOK3S */
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index e7d6d86563ee..142d78d645f4 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -161,6 +161,7 @@
 #define PPC_INST_MFTMR			0x7c0002dc
 #define PPC_INST_MSGSND			0x7c00019c
 #define PPC_INST_MSGCLR			0x7c0001dc
+#define PPC_INST_MSGSYNC		0x7c0006ec
 #define PPC_INST_MSGSNDP		0x7c00011c
 #define PPC_INST_MTTMR			0x7c0003dc
 #define PPC_INST_NOP			0x60000000
@@ -345,6 +346,7 @@
 					___PPC_RB(b) | __PPC_EH(eh))
 #define PPC_MSGSND(b)		stringify_in_c(.long PPC_INST_MSGSND | \
 					___PPC_RB(b))
+#define PPC_MSGSYNC		stringify_in_c(.long PPC_INST_MSGSYNC)
 #define PPC_MSGCLR(b)		stringify_in_c(.long PPC_INST_MSGCLR | \
 					___PPC_RB(b))
 #define PPC_MSGSNDP(b)		stringify_in_c(.long PPC_INST_MSGSNDP | \
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 33bd4dd2cb41..951a2e230cfa 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -257,6 +257,23 @@ static void pnv_cause_ipi(int cpu)
 	icp_ops->cause_ipi(cpu);
 }
 
+static void pnv_p9_dd1_cause_ipi(int cpu)
+{
+	int this_cpu = get_cpu();
+
+	/*
+	 * POWER9 DD1 has a global addressed msgsnd, but for now we restrict
+	 * IPIs to same core, because it requires additional synchronization
+	 * for inter-core doorbells which we do not implement.
+	 */
+	if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu)))
+		doorbell_global_ipi(cpu);
+	else
+		icp_ops->cause_ipi(cpu);
+
+	put_cpu();
+}
+
 static void __init pnv_smp_probe(void)
 {
 	if (xive_enabled())
@@ -264,8 +281,15 @@ static void __init pnv_smp_probe(void)
 	else
 		xics_smp_probe();
 
-	if (cpu_has_feature(CPU_FTR_DBELL) && !cpu_has_feature(CPU_FTR_ARCH_300)) {
-		smp_ops->cause_ipi = pnv_cause_ipi;
+	if (cpu_has_feature(CPU_FTR_DBELL)) {
+		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+			if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+				smp_ops->cause_ipi = pnv_p9_dd1_cause_ipi;
+			else
+				smp_ops->cause_ipi = doorbell_global_ipi;
+		} else {
+			smp_ops->cause_ipi = pnv_cause_ipi;
+		}
 	} else {
 		smp_ops->cause_ipi = icp_ops->cause_ipi;
 	}
-- 
cgit v1.2.1


From 590c369e7ecc00be736be39ae0c62d1b5d563a51 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 13 Apr 2017 23:14:36 +1000
Subject: powerpc: Drop include of linux/io.h from asm/io.h

Currently powerpc's asm/io.h includes linux/io.h, and linux/io.h
includes asm/io.h.

This can cause problems because depending on which is included first the
order of definitions between the two files will change.

The include of linux/io.h was added back in 2008 in commit b41e5fffe8b8
("[POWERPC] devres: Add devm_ioremap_prot()"). It's not entirely clear
it was needed then, but devm_ioremap_prot() has since been removed
entirely as unused, in dedd24a12fe6 ("powerpc: Remove unused
devm_ioremap_prot()").

So it seems to be unnecessary and can potentially cause problems, so
remove the include of linux/io.h from asm/io.h

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/io.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 45c136a832ce..c398e86cd1cc 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -25,8 +25,6 @@ extern struct pci_dev *isa_bridge_pcidev;
 #endif
 
 #include <linux/device.h>
-#include <linux/io.h>
-
 #include <linux/compiler.h>
 #include <asm/page.h>
 #include <asm/byteorder.h>
-- 
cgit v1.2.1


From ebbe9d7d3a2ca0d62f1a2c08c7e7a3e0a88cf999 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 5 Apr 2017 12:44:49 +1000
Subject: powerpc: Allow platforms to force-enable CONFIG_SMP

Of the 64-bit Book3S platforms, only powermac supports booting on an
actual non-SMP system. The other platforms can be built with SMP
disabled, but it doesn't make a lot of sense given the CPUs they support
are all multicore or multithreaded.

So give platforms the option of forcing SMP=y.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/Kconfig.cputype | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index ef4c4b8fc547..ee87cb37c580 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -371,10 +371,16 @@ config PPC_PERF_CTRS
        help
          This enables the powerpc-specific perf_event back-end.
 
+config FORCE_SMP
+	# Allow platforms to force SMP=y by selecting this
+	bool
+	default n
+	select SMP
+
 config SMP
 	depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE || PPC_47x
 	select GENERIC_IRQ_MIGRATION
-	bool "Symmetric multi-processing support"
+	bool "Symmetric multi-processing support" if !FORCE_SMP
 	---help---
 	  This enables support for systems with more than one CPU. If you have
 	  a system with only one CPU, say N. If you have a system with more
-- 
cgit v1.2.1


From 40e275653e2cdb5be5aa828d31cc96eb0eef3346 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 5 Apr 2017 12:44:50 +1000
Subject: powerpc/powernv: Always enable SMP when building powernv

The powernv platform supports Power7 and later CPUs, all of which are
multithreaded and multicore.

As such we never build a SMP=n kernel for those machines, other than
possibly for debugging or running in a simulator.

In the debugging case we can get a similar effect by booting with
nr_cpus=1, or there's always the option of building a custom kernel with
SMP hacked out.

For running in simulators the code size reduction from building without
SMP is not particularly important, what matters is the number of
instructions executed. A quick test shows that a SMP=y kernel takes ~6%
more instructions to boot to a shell. Booting with nr_cpus=1 recovers
about half that deficit.

On the flip side, keeping the SMP=n kernel building can be a pain at
times. And although we've mostly kept it building in recent years, no
one is regularly testing that the SMP=n kernel actually boots and works
well on these machines.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 2489805e79f1..6a6f4ef46b9e 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -21,6 +21,7 @@ config PPC_POWERNV
 	select CPU_FREQ_GOV_CONSERVATIVE
 	select PPC_DOORBELL
 	select MMU_NOTIFIER
+	select FORCE_SMP
 	default y
 
 config OPAL_PRD
-- 
cgit v1.2.1


From 270e2dc9b8031dd2ed31f6b86ed80340b1b889c2 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 5 Apr 2017 12:44:51 +1000
Subject: powerpc/pseries: Always enable SMP when building pseries

The pseries platform supports Power4 and later CPUs, all of which are
multithreaded and/or multicore.

In practice no one ever builds a SMP=n kernel for these machines. So as
we did for powernv, have the pseries platform imply SMP=y.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/Kconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 30ec04f1c67c..913c54e23eea 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -17,9 +17,10 @@ config PPC_PSERIES
 	select PPC_UDBG_16550
 	select PPC_NATIVE
 	select PPC_DOORBELL
-	select HOTPLUG_CPU if SMP
+	select HOTPLUG_CPU
 	select ARCH_RANDOM
 	select PPC_DOORBELL
+	select FORCE_SMP
 	default y
 
 config PPC_SPLPAR
-- 
cgit v1.2.1


From 8d1b48ef580097e111c2644e6fc6041b9784d218 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 19 Apr 2017 05:12:16 +1000
Subject: powerpc/64s: Revert setting of LPCR[LPES] on POWER9

The XIVE enablement patches included a change to set the LPES (Logical
Partitioning Environment Selector) bit (bit # 3) in LPCR (Logical Partitioning
Control Register) on POWER9 hosts. This bit sets external interrupts to guest
delivery mode, which uses SRR0/1. The host's EE interrupt handler is written to
expect HSRR0/1 (for earlier CPUs). This should be fine because XIVE is
configured not to deliver EEs to the host (Hypervisor Virtulization Interrupt is
used instead) so the EE handler should never be executed.

However a bug in interrupt controller code, hardware, or odd configuration of a
simulator could result in the host getting an EE incorrectly. Keeping the EE
delivery mode matching the host EE handler prevents strange crashes due to using
the wrong exception registers.

KVM will configure the LPCR to set LPES prior to running a guest so that EEs are
delivered to the guest using SRR0/1.

Fixes: 08a1e650cc ("powerpc: Fixup LPCR:PECE and HEIC setting on POWER9")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Massage change log to avoid referring to LPES0 which is now renamed LPES]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/cpu_setup_power.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 7013ae3d1675..1fce4ddd2e6c 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -107,7 +107,7 @@ _GLOBAL(__setup_cpu_power9)
 	or	r3, r3, r4
 	LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
 	andc	r3, r3, r4
-	li	r4,(LPCR_LPES0 >> LPCR_LPES_SH)
+	li	r4,0 /* LPES = 0 */
 	bl	__init_LPCR
 	bl	__init_HFSCR
 	bl	__init_tlb_power9
@@ -131,7 +131,7 @@ _GLOBAL(__restore_cpu_power9)
 	or	r3, r3, r4
 	LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
 	andc	r3, r3, r4
-	li	r4,(LPCR_LPES0 >> LPCR_LPES_SH)
+	li	r4,0 /* LPES = 0 */
 	bl	__init_LPCR
 	bl	__init_HFSCR
 	bl	__init_tlb_power9
-- 
cgit v1.2.1


From be77e999e3937322b7e15274b8fc7da309a040a0 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 14 Apr 2017 00:48:21 +0530
Subject: powerpc/mm/radix: Use mm->task_size for boundary checking instead of
 addr_limit

We don't init addr_limit correctly for 32 bit applications. So default to using
mm->task_size for boundary condition checking. We use addr_limit to only control
free space search. This makes sure that we do the right thing with 32 bit
applications.

We should consolidate the usage of TASK_SIZE/mm->task_size and
mm->context.addr_limit later.

This partially reverts commit fbfef9027c2a7ad (powerpc/mm: Switch some
TASK_SIZE checks to use mm_context addr_limit).

Fixes: fbfef9027c2a ("powerpc/mm: Switch some TASK_SIZE checks to use mm_context addr_limit")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/hugetlbpage-radix.c | 4 ++--
 arch/powerpc/mm/mmap.c              | 8 ++++----
 arch/powerpc/mm/slice.c             | 4 ++--
 arch/powerpc/mm/subpage-prot.c      | 4 ++--
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index 0aa9cade422f..6575b9aabef4 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -55,7 +55,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 
 	if (len & ~huge_page_mask(h))
 		return -EINVAL;
-	if (len > mm->context.addr_limit)
+	if (len > mm->task_size)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED) {
@@ -67,7 +67,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	if (addr) {
 		addr = ALIGN(addr, huge_page_size(h));
 		vma = find_vma(mm, addr);
-		if (mm->context.addr_limit - len >= addr &&
+		if (mm->task_size - len >= addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index b2111baa0da6..106a86406c77 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -100,7 +100,7 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
 		mm->context.addr_limit = TASK_SIZE;
 
-	if (len > mm->context.addr_limit - mmap_min_addr)
+	if (len > mm->task_size - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -109,7 +109,7 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (mm->context.addr_limit - len >= addr && addr >= mmap_min_addr &&
+		if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
@@ -143,7 +143,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 		mm->context.addr_limit = TASK_SIZE;
 
 	/* requested length too big for entire address space */
-	if (len > mm->context.addr_limit - mmap_min_addr)
+	if (len > mm->task_size - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -153,7 +153,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (mm->context.addr_limit - len >= addr && addr >= mmap_min_addr &&
+		if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
 				(!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 251b6bae7023..ade66c3ecdce 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -292,8 +292,8 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 		 * Check if we need to reduce the range, or if we can
 		 * extend it to cover the next available slice.
 		 */
-		if (addr >= mm->context.addr_limit)
-			addr = mm->context.addr_limit;
+		if (addr >= high_limit)
+			addr = high_limit;
 		else if (slice_scan_available(addr, available, 1, &next_end)) {
 			addr = next_end;
 			goto next_slice;
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index a409f78d206b..e94fbd4c8845 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -197,8 +197,8 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
 
 	/* Check parameters */
 	if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
-	    addr >= mm->context.addr_limit || len >= mm->context.addr_limit ||
-	    addr + len > mm->context.addr_limit)
+	    addr >= mm->task_size || len >= mm->task_size ||
+	    addr + len > mm->task_size)
 		return -EINVAL;
 
 	if (is_hugepage_only_range(mm, addr, len))
-- 
cgit v1.2.1


From 95dbdf4fa09823e35b29eab8f7093984daa6a913 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Mon, 17 Apr 2017 00:21:19 +1000
Subject: powerpc/64s: Minor fix for MCE TLB flush for radix

The TLB flush for radix first flushes TLB for radix configuration,
then flushes for hash configuration. The second flush is unnecessary
but does not affect correctness.

Fixes: 1a472c9dba6b9 ("powerpc/mm/radix: Add tlbflush routines")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/mce_power.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index de242b4bbd20..f913139bb0c2 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -72,10 +72,14 @@ void __flush_tlb_power8(unsigned int action)
 
 void __flush_tlb_power9(unsigned int action)
 {
+	unsigned int num_sets;
+
 	if (radix_enabled())
-		flush_tlb_206(POWER9_TLB_SETS_RADIX, action);
+		num_sets = POWER9_TLB_SETS_RADIX;
+	else
+		num_sets = POWER9_TLB_SETS_HASH;
 
-	flush_tlb_206(POWER9_TLB_SETS_HASH, action);
+	flush_tlb_206(num_sets, action);
 }
 
 
-- 
cgit v1.2.1


From 321f7d29e5163d7f1c9c0b705acc45bd1be34aa6 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 18 Apr 2017 12:31:27 +0530
Subject: powerpc/mmap: Any hint > 128TB searches the full VA space

As part of the new large address space support, processes start out life with a
128TB virtual address space. However when calling mmap() a process can pass a
hint address, and if that hint is > 128TB the kernel will use the full 512TB
address space to try and satisfy the mmap() request.

Currently we have a check that the hint is > 128TB and < 512TB (TASK_SIZE),
which was added as an optimisation to avoid updating addr_limit unnecessarily
and also to avoid calling slice_flush_segments() on all CPUs more than
necessary.

However this has the user-visible side effect that an mmap() hint above 512TB
does not search the full address space unless a preceding mmap() used a hint
value > 128TB && < 512TB.

So fix it to treat any hint above 128TB as a hint to search the full address
space, instead of checking the hint against TASK_SIZE, we instead check if the
addr_limit is already == TASK_SIZE.

This also brings the ABI in-line with what is proposed on x86. ie, that a hint
address above 128TB up to and including (2^64)-1 is an indication to search the
full address space.

Fixes: f4ea6dcb08ea2c (powerpc/mm: Enable mappings above 128TB)
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/mmap.c  | 6 ++++--
 arch/powerpc/mm/slice.c | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 106a86406c77..82fc5762f971 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -97,7 +97,8 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	struct vm_area_struct *vma;
 	struct vm_unmapped_area_info info;
 
-	if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
+	if (unlikely(addr > mm->context.addr_limit &&
+		     mm->context.addr_limit != TASK_SIZE))
 		mm->context.addr_limit = TASK_SIZE;
 
 	if (len > mm->task_size - mmap_min_addr)
@@ -139,7 +140,8 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	unsigned long addr = addr0;
 	struct vm_unmapped_area_info info;
 
-	if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
+	if (unlikely(addr > mm->context.addr_limit &&
+		     mm->context.addr_limit != TASK_SIZE))
 		mm->context.addr_limit = TASK_SIZE;
 
 	/* requested length too big for entire address space */
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index ade66c3ecdce..966b9fccfa66 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -419,7 +419,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	/*
 	 * Check if we need to expland slice area.
 	 */
-	if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE)) {
+	if (unlikely(addr > mm->context.addr_limit &&
+		     mm->context.addr_limit != TASK_SIZE)) {
 		mm->context.addr_limit = TASK_SIZE;
 		on_each_cpu(slice_flush_segments, mm, 1);
 	}
-- 
cgit v1.2.1


From e889e96e98e8da97bd39e46b7253615eabe14397 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Tue, 11 Apr 2017 17:54:57 +1000
Subject: powerpc/iommu: Do not call PageTransHuge() on tail pages

The CMA pages migration code does not support compound pages at
the moment so it performs few tests before proceeding to actual page
migration.

One of the tests - PageTransHuge() - has VM_BUG_ON_PAGE(PageTail()) as
it is designed to be called on head pages only. Since we also test for
PageCompound(), and it contains PageTail() and PageHead(), we can
simplify the check by leaving just PageCompound() and therefore avoid
possible VM_BUG_ON_PAGE.

Fixes: 2e5bbb5461f1 ("KVM: PPC: Book3S HV: Migrate pinned pages out of CMA")
Cc: stable@vger.kernel.org # v4.9+
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/mmu_context_iommu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 497130c5c742..96f835cbf212 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -81,7 +81,7 @@ struct page *new_iommu_non_cma_page(struct page *page, unsigned long private,
 	gfp_t gfp_mask = GFP_USER;
 	struct page *new_page;
 
-	if (PageHuge(page) || PageTransHuge(page) || PageCompound(page))
+	if (PageCompound(page))
 		return NULL;
 
 	if (PageHighMem(page))
@@ -100,7 +100,7 @@ static int mm_iommu_move_page_from_cma(struct page *page)
 	LIST_HEAD(cma_migrate_pages);
 
 	/* Ignore huge pages for now */
-	if (PageHuge(page) || PageTransHuge(page) || PageCompound(page))
+	if (PageCompound(page))
 		return -EBUSY;
 
 	lru_add_drain();
-- 
cgit v1.2.1


From 79e96f8f930d425ab48c511f8a6db16ca7fc68b1 Mon Sep 17 00:00:00 2001
From: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Date: Tue, 11 Apr 2017 07:21:06 +0530
Subject: powerpc/perf: Export memory hierarchy info to user space

The LDST field and DATA_SRC in SIER identifies the memory hierarchy level
(eg: L1, L2 etc), from which a data-cache miss for a marked instruction
was satisfied. Use the 'perf_mem_data_src' object to export this
hierarchy level to user space.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/perf_event_server.h |  2 +
 arch/powerpc/perf/core-book3s.c              |  4 ++
 arch/powerpc/perf/isa207-common.c            | 74 ++++++++++++++++++++++++++++
 arch/powerpc/perf/isa207-common.h            | 16 +++++-
 4 files changed, 95 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index ae0a23091a9b..446cdcd9b7f5 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -38,6 +38,8 @@ struct power_pmu {
 				unsigned long *valp);
 	int		(*get_alternatives)(u64 event_id, unsigned int flags,
 				u64 alt[]);
+	void		(*get_mem_data_src)(union perf_mem_data_src *dsrc,
+				u32 flags, struct pt_regs *regs);
 	u64             (*bhrb_filter_map)(u64 branch_sample_type);
 	void            (*config_bhrb)(u64 pmu_bhrb_filter);
 	void		(*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 2ff13249f87a..e241ebebab6f 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2049,6 +2049,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 			data.br_stack = &cpuhw->bhrb_stack;
 		}
 
+		if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC &&
+						ppmu->get_mem_data_src)
+			ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs);
+
 		if (perf_event_overflow(event, &data, regs))
 			power_pmu_stop(event, 0);
 	}
diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c
index cd951fd231c4..a8b100ef8e6c 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -148,6 +148,80 @@ static bool is_thresh_cmp_valid(u64 event)
 	return true;
 }
 
+static inline u64 isa207_find_source(u64 idx, u32 sub_idx)
+{
+	u64 ret = PERF_MEM_NA;
+
+	switch(idx) {
+	case 0:
+		/* Nothing to do */
+		break;
+	case 1:
+		ret = PH(LVL, L1);
+		break;
+	case 2:
+		ret = PH(LVL, L2);
+		break;
+	case 3:
+		ret = PH(LVL, L3);
+		break;
+	case 4:
+		if (sub_idx <= 1)
+			ret = PH(LVL, LOC_RAM);
+		else if (sub_idx > 1 && sub_idx <= 2)
+			ret = PH(LVL, REM_RAM1);
+		else
+			ret = PH(LVL, REM_RAM2);
+		ret |= P(SNOOP, HIT);
+		break;
+	case 5:
+		ret = PH(LVL, REM_CCE1);
+		if ((sub_idx == 0) || (sub_idx == 2) || (sub_idx == 4))
+			ret |= P(SNOOP, HIT);
+		else if ((sub_idx == 1) || (sub_idx == 3) || (sub_idx == 5))
+			ret |= P(SNOOP, HITM);
+		break;
+	case 6:
+		ret = PH(LVL, REM_CCE2);
+		if ((sub_idx == 0) || (sub_idx == 2))
+			ret |= P(SNOOP, HIT);
+		else if ((sub_idx == 1) || (sub_idx == 3))
+			ret |= P(SNOOP, HITM);
+		break;
+	case 7:
+		ret = PM(LVL, L1);
+		break;
+	}
+
+	return ret;
+}
+
+void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
+							struct pt_regs *regs)
+{
+	u64 idx;
+	u32 sub_idx;
+	u64 sier;
+	u64 val;
+
+	/* Skip if no SIER support */
+	if (!(flags & PPMU_HAS_SIER)) {
+		dsrc->val = 0;
+		return;
+	}
+
+	sier = mfspr(SPRN_SIER);
+	val = (sier & ISA207_SIER_TYPE_MASK) >> ISA207_SIER_TYPE_SHIFT;
+	if (val == 1 || val == 2) {
+		idx = (sier & ISA207_SIER_LDST_MASK) >> ISA207_SIER_LDST_SHIFT;
+		sub_idx = (sier & ISA207_SIER_DATA_SRC_MASK) >> ISA207_SIER_DATA_SRC_SHIFT;
+
+		dsrc->val = isa207_find_source(idx, sub_idx);
+		dsrc->val |= (val == 1) ? P(OP, LOAD) : P(OP, STORE);
+	}
+}
+
+
 int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
 {
 	unsigned int unit, pmc, cache, ebb;
diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h
index 899210f14ee4..f711f337e358 100644
--- a/arch/powerpc/perf/isa207-common.h
+++ b/arch/powerpc/perf/isa207-common.h
@@ -260,6 +260,19 @@
 #define MAX_ALT				2
 #define MAX_PMU_COUNTERS		6
 
+#define ISA207_SIER_TYPE_SHIFT		15
+#define ISA207_SIER_TYPE_MASK		(0x7ull << ISA207_SIER_TYPE_SHIFT)
+
+#define ISA207_SIER_LDST_SHIFT		1
+#define ISA207_SIER_LDST_MASK		(0x7ull << ISA207_SIER_LDST_SHIFT)
+
+#define ISA207_SIER_DATA_SRC_SHIFT	53
+#define ISA207_SIER_DATA_SRC_MASK	(0x7ull << ISA207_SIER_DATA_SRC_SHIFT)
+
+#define P(a, b)				PERF_MEM_S(a, b)
+#define PH(a, b)			(P(LVL, HIT) | P(a, b))
+#define PM(a, b)			(P(LVL, MISS) | P(a, b))
+
 int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp);
 int isa207_compute_mmcr(u64 event[], int n_ev,
 				unsigned int hwc[], unsigned long mmcr[],
@@ -267,6 +280,7 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
 void isa207_disable_pmc(unsigned int pmc, unsigned long mmcr[]);
 int isa207_get_alternatives(u64 event, u64 alt[],
 				const unsigned int ev_alt[][MAX_ALT], int size);
-
+void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
+							struct pt_regs *regs);
 
 #endif
-- 
cgit v1.2.1


From 170a315f41c647ce826e389c64047ee1f4cd2dde Mon Sep 17 00:00:00 2001
From: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Date: Tue, 11 Apr 2017 07:21:07 +0530
Subject: powerpc/perf: Support to export MMCRA[TEC*] field to userspace

Threshold feature when used with MMCRA [Threshold Event Counter Event],
MMCRA[Threshold Start event] and MMCRA[Threshold End event] will update
MMCRA[Threashold Event Counter Exponent] and MMCRA[Threshold Event
Counter Multiplier] with the corresponding threshold event count values.
Patch to export MMCRA[TECX/TECM] to userspace in 'weight' field of
struct perf_sample_data.

Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/perf_event_server.h |  1 +
 arch/powerpc/perf/core-book3s.c              |  4 ++++
 arch/powerpc/perf/isa207-common.c            |  8 ++++++++
 arch/powerpc/perf/isa207-common.h            | 10 ++++++++++
 4 files changed, 23 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 446cdcd9b7f5..723bf48e7494 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -40,6 +40,7 @@ struct power_pmu {
 				u64 alt[]);
 	void		(*get_mem_data_src)(union perf_mem_data_src *dsrc,
 				u32 flags, struct pt_regs *regs);
+	void		(*get_mem_weight)(u64 *weight);
 	u64             (*bhrb_filter_map)(u64 branch_sample_type);
 	void            (*config_bhrb)(u64 pmu_bhrb_filter);
 	void		(*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index e241ebebab6f..6c2d4168daec 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2053,6 +2053,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 						ppmu->get_mem_data_src)
 			ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs);
 
+		if (event->attr.sample_type & PERF_SAMPLE_WEIGHT &&
+						ppmu->get_mem_weight)
+			ppmu->get_mem_weight(&data.weight);
+
 		if (perf_event_overflow(event, &data, regs))
 			power_pmu_stop(event, 0);
 	}
diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c
index a8b100ef8e6c..8125160be7bc 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -221,6 +221,14 @@ void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
 	}
 }
 
+void isa207_get_mem_weight(u64 *weight)
+{
+	u64 mmcra = mfspr(SPRN_MMCRA);
+	u64 exp = MMCRA_THR_CTR_EXP(mmcra);
+	u64 mantissa = MMCRA_THR_CTR_MANT(mmcra);
+
+	*weight = mantissa << (2 * exp);
+}
 
 int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
 {
diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h
index f711f337e358..8acbe6e802c7 100644
--- a/arch/powerpc/perf/isa207-common.h
+++ b/arch/powerpc/perf/isa207-common.h
@@ -248,6 +248,15 @@
 #define MMCRA_SDAR_MODE_TLB		(1ull << MMCRA_SDAR_MODE_SHIFT)
 #define MMCRA_SDAR_MODE_NO_UPDATES	~(0x3ull << MMCRA_SDAR_MODE_SHIFT)
 #define MMCRA_IFM_SHIFT			30
+#define MMCRA_THR_CTR_MANT_SHIFT	19
+#define MMCRA_THR_CTR_MANT_MASK		0x7Ful
+#define MMCRA_THR_CTR_MANT(v)		(((v) >> MMCRA_THR_CTR_MANT_SHIFT) &\
+						MMCRA_THR_CTR_MANT_MASK)
+
+#define MMCRA_THR_CTR_EXP_SHIFT		27
+#define MMCRA_THR_CTR_EXP_MASK		0x7ul
+#define MMCRA_THR_CTR_EXP(v)		(((v) >> MMCRA_THR_CTR_EXP_SHIFT) &\
+						MMCRA_THR_CTR_EXP_MASK)
 
 /* MMCR1 Threshold Compare bit constant for power9 */
 #define p9_MMCRA_THR_CMP_SHIFT	45
@@ -282,5 +291,6 @@ int isa207_get_alternatives(u64 event, u64 alt[],
 				const unsigned int ev_alt[][MAX_ALT], int size);
 void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
 							struct pt_regs *regs);
+void isa207_get_mem_weight(u64 *weight);
 
 #endif
-- 
cgit v1.2.1


From 453ce7a9432ee0bf3199072aa62f4c1895a195b1 Mon Sep 17 00:00:00 2001
From: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Date: Tue, 11 Apr 2017 07:21:08 +0530
Subject: powerpc/perf: Support to export SIERs bit in Power8

Patch to export SIER bits to userspace via
perf_mem_data_src and perf_sample_data struct.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/perf/power8-pmu.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index ce15b19a7962..932d7536f0eb 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -325,6 +325,8 @@ static struct power_pmu power8_pmu = {
 	.bhrb_filter_map	= power8_bhrb_filter_map,
 	.get_constraint		= isa207_get_constraint,
 	.get_alternatives	= power8_get_alternatives,
+	.get_mem_data_src	= isa207_get_mem_data_src,
+	.get_mem_weight		= isa207_get_mem_weight,
 	.disable_pmc		= isa207_disable_pmc,
 	.flags			= PPMU_HAS_SIER | PPMU_ARCH_207S,
 	.n_generic		= ARRAY_SIZE(power8_generic_events),
-- 
cgit v1.2.1


From d148c94c27a87213f995f0a7519b231719cfb919 Mon Sep 17 00:00:00 2001
From: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Date: Tue, 11 Apr 2017 07:21:09 +0530
Subject: powerpc/perf: Support to export SIERs bit in Power9

Patch to export SIER bits to userspace via
perf_mem_data_src and perf_sample_data struct.

Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/perf/power9-pmu.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 7f6582708e06..018f8e90ac35 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -427,6 +427,8 @@ static struct power_pmu power9_pmu = {
 	.bhrb_filter_map	= power9_bhrb_filter_map,
 	.get_constraint		= isa207_get_constraint,
 	.get_alternatives	= power9_get_alternatives,
+	.get_mem_data_src	= isa207_get_mem_data_src,
+	.get_mem_weight		= isa207_get_mem_weight,
 	.disable_pmc		= isa207_disable_pmc,
 	.flags			= PPMU_HAS_SIER | PPMU_ARCH_207S,
 	.n_generic		= ARRAY_SIZE(power9_generic_events),
-- 
cgit v1.2.1


From f2080b9ac3c450c06a380237e6338f1e43468665 Mon Sep 17 00:00:00 2001
From: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Date: Tue, 11 Apr 2017 07:21:10 +0530
Subject: powerpc/perf: Add Power8 mem_access event to sysfs

Patch add "mem_access" event to sysfs. This as-is not a raw event
supported by Power8 pmu. Instead, it is formed based on
raw event encoding specificed in isa207-common.h.

Primary PMU event used here is PM_MRK_INST_CMPL.
This event tracks only the completed marked instructions.

Random sampling mode (MMCRA[SM]) with Random Instruction
Sampling (RIS) is enabled to mark type of instructions.

With Random sampling in RLS mode with PM_MRK_INST_CMPL event,
the LDST /DATA_SRC fields in SIER identifies the memory
hierarchy level (eg: L1, L2 etc) statisfied a data-cache
miss for a marked instruction.

Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/perf/power8-events-list.h | 6 ++++++
 arch/powerpc/perf/power8-pmu.c         | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/perf/power8-events-list.h b/arch/powerpc/perf/power8-events-list.h
index 3a2e6e8ebb92..0f1d184627cc 100644
--- a/arch/powerpc/perf/power8-events-list.h
+++ b/arch/powerpc/perf/power8-events-list.h
@@ -89,3 +89,9 @@ EVENT(PM_MRK_FILT_MATCH,			0x2013c)
 EVENT(PM_MRK_FILT_MATCH_ALT,			0x3012e)
 /* Alternate event code for PM_LD_MISS_L1 */
 EVENT(PM_LD_MISS_L1_ALT,			0x400f0)
+/*
+ * Memory Access Event -- mem_access
+ * Primary PMU event used here is PM_MRK_INST_CMPL, along with
+ * Random Load/Store Facility Sampling (RIS) in Random sampling mode (MMCRA[SM]).
+ */
+EVENT(MEM_ACCESS,				0x10401e0)
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index 932d7536f0eb..5463516e369b 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -90,6 +90,7 @@ GENERIC_EVENT_ATTR(branch-instructions,		PM_BRU_FIN);
 GENERIC_EVENT_ATTR(branch-misses,		PM_BR_MPRED_CMPL);
 GENERIC_EVENT_ATTR(cache-references,		PM_LD_REF_L1);
 GENERIC_EVENT_ATTR(cache-misses,		PM_LD_MISS_L1);
+GENERIC_EVENT_ATTR(mem_access,			MEM_ACCESS);
 
 CACHE_EVENT_ATTR(L1-dcache-load-misses,		PM_LD_MISS_L1);
 CACHE_EVENT_ATTR(L1-dcache-loads,		PM_LD_REF_L1);
@@ -120,6 +121,7 @@ static struct attribute *power8_events_attr[] = {
 	GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL),
 	GENERIC_EVENT_PTR(PM_LD_REF_L1),
 	GENERIC_EVENT_PTR(PM_LD_MISS_L1),
+	GENERIC_EVENT_PTR(MEM_ACCESS),
 
 	CACHE_EVENT_PTR(PM_LD_MISS_L1),
 	CACHE_EVENT_PTR(PM_LD_REF_L1),
-- 
cgit v1.2.1


From 2384d2d7adc41463a69d3eb0720896cc2d5d1f55 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 19 Apr 2017 12:27:37 +1000
Subject: powerpc/64s: Remove ICSWX feature from Power9

Power9 does not implement the icswx instruction. This CPU feature is not visible
to userspace and is only used in the CONFIG_PPC_ICSWX code, which is generally
not enabled, and can only be triggered by other code using icswx, which should
not happen on Power9 systems in the first place. So impact should be minimal.

Fixes: c3ab300ea5 ("powerpc: Add POWER9 cputable entry")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cputable.h    | 2 +-
 arch/powerpc/platforms/Kconfig.cputype | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index ab68d0ee7725..5869025673b4 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -471,7 +471,7 @@ enum {
 	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
 	    CPU_FTR_DSCR | CPU_FTR_SAO  | \
 	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-	    CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
+	    CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
 	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
 	    CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300)
 #define CPU_FTRS_POWER9_DD1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1)
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index ee87cb37c580..684e886eaae4 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -279,7 +279,8 @@ config PPC_ICSWX
 
 	  This option enables kernel support for the PowerPC Initiate
 	  Coprocessor Store Word (icswx) coprocessor instruction on POWER7
-	  or newer processors.
+	  and POWER8 processors. POWER9 uses new copy/paste instructions
+	  to invoke the coprocessor.
 
 	  This option is only useful if you have a processor that supports
 	  the icswx coprocessor instruction. It does not have any effect
-- 
cgit v1.2.1


From ca80d5d0a8175c9be04cfbce24180b8f5e0a744b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 19 Apr 2017 12:27:38 +1000
Subject: powerpc/64s: Remove SAO feature from Power9 DD1

Power9 DD1 does not implement SAO. Although it's not widely used, its presence
or absence is visible to user space via arch_validate_prot() so it's moderately
important that we get the value right.

Fixes: 7dccfbc325bb ("powerpc/book3s: Add a cpu table entry for different POWER9 revs")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cputable.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 5869025673b4..1f6847b107e4 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -474,7 +474,8 @@ enum {
 	    CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
 	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
 	    CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300)
-#define CPU_FTRS_POWER9_DD1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1)
+#define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \
+			     (~CPU_FTR_SAO))
 #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
-- 
cgit v1.2.1


From 686978b15cef8314c3b3f38f0124e070b98eae37 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 20 Apr 2017 14:43:19 +1000
Subject: powerpc/xive: Fix missing check of rc != OPAL_BUSY

Dan Carpenter noticed that the code in __xive_native_disable_queue() has a for
loop with an unconditional break in the middle, which doesn't make a lot of
sense.

What the code's supposed to do is loop as long as OPAL says it's busy, if we get
any other return code, either success or failure, then we should break the loop.

So add the missing check.

Fixes: 243e25112d06 ("powerpc/xive: Native exploitation of the XIVE interrupt controller")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/sysdev/xive/native.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
index 5fae59186cb2..1a726229a427 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -180,6 +180,7 @@ static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
 	/* Disable the queue in HW */
 	for (;;) {
 		rc = opal_xive_set_queue_info(vp_id, prio, 0, 0, 0);
+		if (rc != OPAL_BUSY)
 			break;
 		msleep(1);
 	}
-- 
cgit v1.2.1


From a050d20d024dc3e1b988e715e2d811aaa3d5c39c Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 13 Apr 2017 19:45:48 +1000
Subject: powerpc/64s: Use relon prolog for EXC_VIRT_OOL_MASKABLE_HV handlers

Hypervisor Virtualization and Directed Hypervisor Doorbell interrupt handlers
use the macro EXC_VIRT_OOL_MASKABLE_HV for their relocation-on handlers, which
calls MASKABLE_RELON_EXCEPTION_HV_OOL, which uses the *real mode* interrupt
prolog. This means we needlessly rfid from virtual mode to virtual mode.

For POWER8 it only affects doorbell IPIs. Context switch microbenchmark between
threads with snooze disabled (which causes IPI) gets about 3% faster, about 370
cycles. Should be more important on POWER9 with global doorbells and HVI for
host interrupts.

Use the RELON variant instead to reduce overhead.

Fixes: 1707dd1613 ("powerpc: Save CFAR before branching in interrupt entry paths")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Fold some more detail into the change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 14752eee3d0c..b89c91e27dd0 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -522,7 +522,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 
 #define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label)			\
 	EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec);		\
-	EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV)
+	EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV)
 
 /*
  * Our exception common code can be passed various "additions"
-- 
cgit v1.2.1


From 49e0b4658fe6aab5bf6bfe0738a86c1895930ad1 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 19 Apr 2017 18:21:00 +0530
Subject: kprobes: Convert kprobe_lookup_name() to a function

The macro is now pretty long and ugly on powerpc. In the light of further
changes needed here, convert it to a __weak variant to be over-ridden with a
nicer looking function.

Suggested-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kprobes.h | 53 ----------------------------------
 arch/powerpc/kernel/kprobes.c      | 58 ++++++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/optprobes.c    |  4 +--
 3 files changed, 60 insertions(+), 55 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index 0503c98b2117..a843884aafaf 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -61,59 +61,6 @@ extern kprobe_opcode_t optprobe_template_end[];
 #define MAX_OPTINSN_SIZE	(optprobe_template_end - optprobe_template_entry)
 #define RELATIVEJUMP_SIZE	sizeof(kprobe_opcode_t)	/* 4 bytes */
 
-#ifdef PPC64_ELF_ABI_v2
-/* PPC64 ABIv2 needs local entry point */
-#define kprobe_lookup_name(name, addr)					\
-{									\
-	addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);		\
-	if (addr)							\
-		addr = (kprobe_opcode_t *)ppc_function_entry(addr);	\
-}
-#elif defined(PPC64_ELF_ABI_v1)
-/*
- * 64bit powerpc ABIv1 uses function descriptors:
- * - Check for the dot variant of the symbol first.
- * - If that fails, try looking up the symbol provided.
- *
- * This ensures we always get to the actual symbol and not the descriptor.
- * Also handle <module:symbol> format.
- */
-#define kprobe_lookup_name(name, addr)					\
-{									\
-	char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN];		\
-	const char *modsym;							\
-	bool dot_appended = false;					\
-	if ((modsym = strchr(name, ':')) != NULL) {			\
-		modsym++;						\
-		if (*modsym != '\0' && *modsym != '.') {		\
-			/* Convert to <module:.symbol> */		\
-			strncpy(dot_name, name, modsym - name);		\
-			dot_name[modsym - name] = '.';			\
-			dot_name[modsym - name + 1] = '\0';		\
-			strncat(dot_name, modsym,			\
-				sizeof(dot_name) - (modsym - name) - 2);\
-			dot_appended = true;				\
-		} else {						\
-			dot_name[0] = '\0';				\
-			strncat(dot_name, name, sizeof(dot_name) - 1);	\
-		}							\
-	} else if (name[0] != '.') {					\
-		dot_name[0] = '.';					\
-		dot_name[1] = '\0';					\
-		strncat(dot_name, name, KSYM_NAME_LEN - 2);		\
-		dot_appended = true;					\
-	} else {							\
-		dot_name[0] = '\0';					\
-		strncat(dot_name, name, KSYM_NAME_LEN - 1);		\
-	}								\
-	addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name);	\
-	if (!addr && dot_appended) {					\
-		/* Let's try the original non-dot symbol lookup	*/	\
-		addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);	\
-	}								\
-}
-#endif
-
 #define flush_insn_slot(p)	do { } while (0)
 #define kretprobe_blacklist_size 0
 
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index fce05a38851c..e5f518a962c6 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -42,6 +42,64 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
 struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
 
+kprobe_opcode_t *kprobe_lookup_name(const char *name)
+{
+	kprobe_opcode_t *addr;
+
+#ifdef PPC64_ELF_ABI_v2
+	/* PPC64 ABIv2 needs local entry point */
+	addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+	if (addr)
+		addr = (kprobe_opcode_t *)ppc_function_entry(addr);
+#elif defined(PPC64_ELF_ABI_v1)
+	/*
+	 * 64bit powerpc ABIv1 uses function descriptors:
+	 * - Check for the dot variant of the symbol first.
+	 * - If that fails, try looking up the symbol provided.
+	 *
+	 * This ensures we always get to the actual symbol and not
+	 * the descriptor.
+	 *
+	 * Also handle <module:symbol> format.
+	 */
+	char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN];
+	const char *modsym;
+	bool dot_appended = false;
+	if ((modsym = strchr(name, ':')) != NULL) {
+		modsym++;
+		if (*modsym != '\0' && *modsym != '.') {
+			/* Convert to <module:.symbol> */
+			strncpy(dot_name, name, modsym - name);
+			dot_name[modsym - name] = '.';
+			dot_name[modsym - name + 1] = '\0';
+			strncat(dot_name, modsym,
+				sizeof(dot_name) - (modsym - name) - 2);
+			dot_appended = true;
+		} else {
+			dot_name[0] = '\0';
+			strncat(dot_name, name, sizeof(dot_name) - 1);
+		}
+	} else if (name[0] != '.') {
+		dot_name[0] = '.';
+		dot_name[1] = '\0';
+		strncat(dot_name, name, KSYM_NAME_LEN - 2);
+		dot_appended = true;
+	} else {
+		dot_name[0] = '\0';
+		strncat(dot_name, name, KSYM_NAME_LEN - 1);
+	}
+	addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name);
+	if (!addr && dot_appended) {
+		/* Let's try the original non-dot symbol lookup	*/
+		addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+	}
+#else
+	addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+#endif
+
+	return addr;
+}
+
 int __kprobes arch_prepare_kprobe(struct kprobe *p)
 {
 	int ret = 0;
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 2282bf4e63cd..aefe076d00e0 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -243,8 +243,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
 	/*
 	 * 2. branch to optimized_callback() and emulate_step()
 	 */
-	kprobe_lookup_name("optimized_callback", op_callback_addr);
-	kprobe_lookup_name("emulate_step", emulate_step_addr);
+	op_callback_addr = kprobe_lookup_name("optimized_callback");
+	emulate_step_addr = kprobe_lookup_name("emulate_step");
 	if (!op_callback_addr || !emulate_step_addr) {
 		WARN(1, "kprobe_lookup_name() failed\n");
 		goto error;
-- 
cgit v1.2.1


From 290e3070762ac80e5fc4087d8c4de7e3f1d90aca Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 19 Apr 2017 18:21:01 +0530
Subject: powerpc/kprobes: Fix handling of function offsets on ABIv2

commit 239aeba76409 ("perf powerpc: Fix kprobe and kretprobe handling with
kallsyms on ppc64le") changed how we use the offset field in struct kprobe on
ABIv2. perf now offsets from the global entry point if an offset is specified
and otherwise chooses the local entry point.

Fix the same in kernel for kprobe API users. We do this by extending
kprobe_lookup_name() to accept an additional parameter to indicate the offset
specified with the kprobe registration. If offset is 0, we return the local
function entry and return the global entry point otherwise.

With:
  # cd /sys/kernel/debug/tracing/
  # echo "p _do_fork" >> kprobe_events
  # echo "p _do_fork+0x10" >> kprobe_events

before this patch:
  # cat ../kprobes/list
  c0000000000d0748  k  _do_fork+0x8    [DISABLED]
  c0000000000d0758  k  _do_fork+0x18    [DISABLED]
  c0000000000412b0  k  kretprobe_trampoline+0x0    [OPTIMIZED]

and after:
  # cat ../kprobes/list
  c0000000000d04c8  k  _do_fork+0x8    [DISABLED]
  c0000000000d04d0  k  _do_fork+0x10    [DISABLED]
  c0000000000412b0  k  kretprobe_trampoline+0x0    [OPTIMIZED]

Acked-by: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes.c   | 4 ++--
 arch/powerpc/kernel/optprobes.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index e5f518a962c6..65828fbc410d 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -42,14 +42,14 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
 struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
 
-kprobe_opcode_t *kprobe_lookup_name(const char *name)
+kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset)
 {
 	kprobe_opcode_t *addr;
 
 #ifdef PPC64_ELF_ABI_v2
 	/* PPC64 ABIv2 needs local entry point */
 	addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
-	if (addr)
+	if (addr && !offset)
 		addr = (kprobe_opcode_t *)ppc_function_entry(addr);
 #elif defined(PPC64_ELF_ABI_v1)
 	/*
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index aefe076d00e0..ce81a322251c 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -243,8 +243,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
 	/*
 	 * 2. branch to optimized_callback() and emulate_step()
 	 */
-	op_callback_addr = kprobe_lookup_name("optimized_callback");
-	emulate_step_addr = kprobe_lookup_name("emulate_step");
+	op_callback_addr = kprobe_lookup_name("optimized_callback", 0);
+	emulate_step_addr = kprobe_lookup_name("emulate_step", 0);
 	if (!op_callback_addr || !emulate_step_addr) {
 		WARN(1, "kprobe_lookup_name() failed\n");
 		goto error;
-- 
cgit v1.2.1


From a64e3f35a45f4a84148d0ba30a3c75c4c7076928 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 8 Mar 2017 13:56:07 +0530
Subject: powerpc/kretprobes: Override default function entry offset

With ABIv2, we offset 8 bytes into a function to get at the local entry
point.

mpe: NB this function is currently not called, the change to generic code to
call it is being merged via the tip tree.

Acked-by: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 65828fbc410d..97b5eed1f76d 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -189,6 +189,15 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 	kcb->kprobe_saved_msr = regs->msr;
 }
 
+bool arch_function_offset_within_entry(unsigned long offset)
+{
+#ifdef PPC64_ELF_ABI_v2
+	return offset <= 8;
+#else
+	return !offset;
+#endif
+}
+
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				      struct pt_regs *regs)
 {
-- 
cgit v1.2.1


From 1cabd2f8f720a0ed612139547acb655495b23e8b Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 19 Apr 2017 18:21:04 +0530
Subject: powerpc/kprobes: Factor out code to emulate instruction into a helper

Factor out code to emulate instruction into a try_to_emulate()
helper function. This makes no functional changes.

Acked-by: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes.c | 52 ++++++++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 21 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 97b5eed1f76d..ed5a917c2edd 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -207,6 +207,35 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 	regs->link = (unsigned long)kretprobe_trampoline;
 }
 
+int __kprobes try_to_emulate(struct kprobe *p, struct pt_regs *regs)
+{
+	int ret;
+	unsigned int insn = *p->ainsn.insn;
+
+	/* regs->nip is also adjusted if emulate_step returns 1 */
+	ret = emulate_step(regs, insn);
+	if (ret > 0) {
+		/*
+		 * Once this instruction has been boosted
+		 * successfully, set the boostable flag
+		 */
+		if (unlikely(p->ainsn.boostable == 0))
+			p->ainsn.boostable = 1;
+	} else if (ret < 0) {
+		/*
+		 * We don't allow kprobes on mtmsr(d)/rfi(d), etc.
+		 * So, we should never get here... but, its still
+		 * good to catch them, just in case...
+		 */
+		printk("Can't step on instruction %x\n", insn);
+		BUG();
+	} else if (ret == 0)
+		/* This instruction can't be boosted */
+		p->ainsn.boostable = -1;
+
+	return ret;
+}
+
 int __kprobes kprobe_handler(struct pt_regs *regs)
 {
 	struct kprobe *p;
@@ -302,18 +331,9 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 
 ss_probe:
 	if (p->ainsn.boostable >= 0) {
-		unsigned int insn = *p->ainsn.insn;
+		ret = try_to_emulate(p, regs);
 
-		/* regs->nip is also adjusted if emulate_step returns 1 */
-		ret = emulate_step(regs, insn);
 		if (ret > 0) {
-			/*
-			 * Once this instruction has been boosted
-			 * successfully, set the boostable flag
-			 */
-			if (unlikely(p->ainsn.boostable == 0))
-				p->ainsn.boostable = 1;
-
 			if (p->post_handler)
 				p->post_handler(p, regs, 0);
 
@@ -321,17 +341,7 @@ ss_probe:
 			reset_current_kprobe();
 			preempt_enable_no_resched();
 			return 1;
-		} else if (ret < 0) {
-			/*
-			 * We don't allow kprobes on mtmsr(d)/rfi(d), etc.
-			 * So, we should never get here... but, its still
-			 * good to catch them, just in case...
-			 */
-			printk("Can't step on instruction %x\n", insn);
-			BUG();
-		} else if (ret == 0)
-			/* This instruction can't be boosted */
-			p->ainsn.boostable = -1;
+		}
 	}
 	prepare_singlestep(p, regs);
 	kcb->kprobe_status = KPROBE_HIT_SS;
-- 
cgit v1.2.1


From 22d8b3dec214cd43a773f621f95d254c50d2a092 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 19 Apr 2017 18:21:05 +0530
Subject: powerpc/kprobes: Emulate instructions on kprobe handler re-entry

On kprobe handler re-entry, try to emulate the instruction rather than single
stepping always.

Acked-by: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index ed5a917c2edd..b71922618ed2 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -277,6 +277,14 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 			kprobes_inc_nmissed_count(p);
 			prepare_singlestep(p, regs);
 			kcb->kprobe_status = KPROBE_REENTER;
+			if (p->ainsn.boostable >= 0) {
+				ret = try_to_emulate(p, regs);
+
+				if (ret > 0) {
+					restore_previous_kprobe(kcb);
+					return 1;
+				}
+			}
 			return 1;
 		} else {
 			if (*addr != BREAKPOINT_INSTRUCTION) {
-- 
cgit v1.2.1


From f855b2f544d664cfa3055edb7ffd20e9ae0e2dce Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Wed, 12 Apr 2017 03:42:31 +1000
Subject: powerpc/mm: Wire up ioremap_cache()

The default implementation of ioremap_cache() is aliased to ioremap().
On powerpc ioremap() creates cache-inhibited mappings by default which
is almost certainly not what you wanted.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/io.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index c398e86cd1cc..422f99cf9924 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -759,6 +759,8 @@ extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size,
 extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size);
 #define ioremap_nocache(addr, size)	ioremap((addr), (size))
 #define ioremap_uc(addr, size)		ioremap((addr), (size))
+#define ioremap_cache(addr, size) \
+	ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL))
 
 extern void iounmap(volatile void __iomem *addr);
 
-- 
cgit v1.2.1


From 9fea59bd7ca541e5d0851f0b6dbca83c60ea90cd Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Fri, 21 Apr 2017 00:36:20 +1000
Subject: powerpc/mm: Add support for runtime configuration of ASLR limits

Add powerpc support for mmap_rnd_bits and mmap_rnd_compat_bits, which are two
sysctls that allow a user to configure the number of bits of randomness used for
ASLR.

Because of the way the Kconfig for ARCH_MMAP_RND_BITS is defined, we have to
construct at least the MIN value in Kconfig, vs in a header which would be more
natural. Given that we just go ahead and do it all in Kconfig.

At least according to the code (the documentation makes no mention of it), the
value is defined as the number of bits of randomisation *of the page*, not the
address. This makes some sense, with larger page sizes more of the low bits are
forced to zero, which would reduce the randomisation if we didn't take the
PAGE_SIZE into account. However it does mean the min/max values have to change
depending on the PAGE_SIZE in order to actually limit the amount of address
space consumed by the randomisation.

The result of that is that we have to define the default values based on both
32-bit vs 64-bit, but also the configured PAGE_SIZE. Furthermore now that we
have 128TB address space support on Book3S, we also have to take that into
account.

Finally we can wire up the value in arch_mmap_rnd().

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Bhupesh Sharma <bhsharma@redhat.com>
Tested-by: Bhupesh Sharma <bhsharma@redhat.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/Kconfig   | 44 ++++++++++++++++++++++++++++++++++++++++++++
 arch/powerpc/mm/mmap.c | 11 ++++++-----
 2 files changed, 50 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9ff731f50a29..276a624ec355 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -22,6 +22,48 @@ config MMU
 	bool
 	default y
 
+config ARCH_MMAP_RND_BITS_MAX
+	# On Book3S 64, the default virtual address space for 64-bit processes
+	# is 2^47 (128TB). As a maximum, allow randomisation to consume up to
+	# 32T of address space (2^45), which should ensure a reasonable gap
+	# between bottom-up and top-down allocations for applications that
+	# consume "normal" amounts of address space. Book3S 64 only supports 64K
+	# and 4K page sizes.
+	default 29 if PPC_BOOK3S_64 && PPC_64K_PAGES # 29 = 45 (32T) - 16 (64K)
+	default 33 if PPC_BOOK3S_64		     # 33 = 45 (32T) - 12 (4K)
+	#
+	# On all other 64-bit platforms (currently only Book3E), the virtual
+	# address space is 2^46 (64TB). Allow randomisation to consume up to 16T
+	# of address space (2^44). Only 4K page sizes are supported.
+	default 32 if 64BIT	# 32 = 44 (16T) - 12 (4K)
+	#
+	# For 32-bit, use the compat values, as they're the same.
+	default ARCH_MMAP_RND_COMPAT_BITS_MAX
+
+config ARCH_MMAP_RND_BITS_MIN
+	# Allow randomisation to consume up to 1GB of address space (2^30).
+	default 14 if 64BIT && PPC_64K_PAGES	# 14 = 30 (1GB) - 16 (64K)
+	default 18 if 64BIT			# 18 = 30 (1GB) - 12 (4K)
+	#
+	# For 32-bit, use the compat values, as they're the same.
+	default ARCH_MMAP_RND_COMPAT_BITS_MIN
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+	# Total virtual address space for 32-bit processes is 2^31 (2GB).
+	# Allow randomisation to consume up to 512MB of address space (2^29).
+	default 11 if PPC_256K_PAGES	# 11 = 29 (512MB) - 18 (256K)
+	default 13 if PPC_64K_PAGES	# 13 = 29 (512MB) - 16 (64K)
+	default 15 if PPC_16K_PAGES 	# 15 = 29 (512MB) - 14 (16K)
+	default 17			# 17 = 29 (512MB) - 12 (4K)
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+	# Total virtual address space for 32-bit processes is 2^31 (2GB).
+	# Allow randomisation to consume up to 8MB of address space (2^23).
+	default 5 if PPC_256K_PAGES	#  5 = 23 (8MB) - 18 (256K)
+	default 7 if PPC_64K_PAGES	#  7 = 23 (8MB) - 16 (64K)
+	default 9 if PPC_16K_PAGES	#  9 = 23 (8MB) - 14 (16K)
+	default 11			# 11 = 23 (8MB) - 12 (4K)
+
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool PPC64
 
@@ -120,6 +162,8 @@ config PPC
 	select HAVE_ARCH_HARDENED_USERCOPY
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_KGDB
+	select HAVE_ARCH_MMAP_RND_BITS
+	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if COMPAT
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_CBPF_JIT			if !PPC64
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 82fc5762f971..005aa8a44915 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -59,13 +59,14 @@ static inline int mmap_is_legacy(void)
 
 unsigned long arch_mmap_rnd(void)
 {
-	unsigned long rnd;
+	unsigned long shift, rnd;
 
-	/* 8MB for 32bit, 1GB for 64bit */
+	shift = mmap_rnd_bits;
+#ifdef CONFIG_COMPAT
 	if (is_32bit_task())
-		rnd = get_random_long() % (1<<(23-PAGE_SHIFT));
-	else
-		rnd = get_random_long() % (1UL<<(30-PAGE_SHIFT));
+		shift = mmap_rnd_compat_bits;
+#endif
+	rnd = get_random_long() % (1 << shift);
 
 	return rnd << PAGE_SHIFT;
 }
-- 
cgit v1.2.1


From 2563a70c3b2acc4f60083e2aaf1102f8dba73649 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 19 Apr 2017 23:05:43 +1000
Subject: powerpc/64s: Remove unnecessary relocation branch from idle handler

The system reset idle handler system_reset_idle_common is relocated, so
relocation is not required to branch to kvm_start_guest. The superfluous
relocation does not result in incorrect code, but it does not compile
outside of exception-64s.S (with fixed section definitions).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h | 8 --------
 arch/powerpc/kernel/exceptions-64s.S     | 2 +-
 2 files changed, 1 insertion(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index b89c91e27dd0..437550175b4d 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -256,11 +256,6 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	ld	r9,area+EX_R9(r13);					\
 	bctr
 
-#define BRANCH_TO_KVM(reg, label)					\
-	__LOAD_FAR_HANDLER(reg, label);					\
-	mtctr	reg;							\
-	bctr
-
 #else
 #define BRANCH_TO_COMMON(reg, label)					\
 	b	label
@@ -268,9 +263,6 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 #define BRANCH_LINK_TO_FAR(reg, label)					\
 	bl	label
 
-#define BRANCH_TO_KVM(reg, label)					\
-	b	label
-
 #define __BRANCH_TO_KVM_EXIT(area, label)				\
 	ld	r9,area+EX_R9(r13);					\
 	b	label
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 857bf7c5b946..7d496ee984fe 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -142,7 +142,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
 	cmpwi	r0,0
 	beq	1f
-	BRANCH_TO_KVM(r10, kvm_start_guest)
+	b	kvm_start_guest
 1:
 #endif
 
-- 
cgit v1.2.1


From bf0153c143a694e7bb3b28d92eafd8b3ef6db870 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 19 Apr 2017 23:05:44 +1000
Subject: powerpc/64s: Move remaining system reset idle code into idle_book3s.S

No functional change.

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/exceptions-64s.S | 26 +--------------
 arch/powerpc/kernel/idle_book3s.S    | 64 ++++++++++++++++++++++++------------
 2 files changed, 44 insertions(+), 46 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 7d496ee984fe..2f837a4a78a2 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -126,31 +126,7 @@ EXC_VIRT_NONE(0x4100, 0x100)
 
 #ifdef CONFIG_PPC_P7_NAP
 EXC_COMMON_BEGIN(system_reset_idle_common)
-BEGIN_FTR_SECTION
-	GET_PACA(r13) /* Restore HSPRG0 to get the winkle bit in r13 */
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
-	bl	pnv_restore_hyp_resource
-
-	li	r0,PNV_THREAD_RUNNING
-	stb	r0,PACA_THREAD_IDLE_STATE(r13)	/* Clear thread state */
-
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	li	r0,KVM_HWTHREAD_IN_KERNEL
-	stb	r0,HSTATE_HWTHREAD_STATE(r13)
-	/* Order setting hwthread_state vs. testing hwthread_req */
-	sync
-	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
-	cmpwi	r0,0
-	beq	1f
-	b	kvm_start_guest
-1:
-#endif
-
-	/* Return SRR1 from power7_nap() */
-	mfspr	r3,SPRN_SRR1
-	blt	cr3,2f
-	b	pnv_wakeup_loss
-2:	b	pnv_wakeup_noloss
+	b	pnv_powersave_wakeup
 #endif
 
 EXC_COMMON(system_reset_common, 0x100, system_reset_exception)
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 24717a73b6bb..99de9de7efe7 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -20,6 +20,7 @@
 #include <asm/kvm_book3s_asm.h>
 #include <asm/opal.h>
 #include <asm/cpuidle.h>
+#include <asm/exception-64s.h>
 #include <asm/book3s/64/mmu-hash.h>
 #include <asm/mmu.h>
 
@@ -113,7 +114,7 @@ core_idle_lock_held:
  *
  * Address to 'rfid' to in r5
  */
-_GLOBAL(pnv_powersave_common)
+pnv_powersave_common:
 	/* Use r3 to pass state nap/sleep/winkle */
 	/* NAP is a state loss, we create a regs frame on the
 	 * stack, fill it up with the state we care about and
@@ -188,8 +189,8 @@ pnv_enter_arch207_idle_mode:
 	/* The following store to HSTATE_HWTHREAD_STATE(r13)  */
 	/* MUST occur in real mode, i.e. with the MMU off,    */
 	/* and the MMU must stay off until we clear this flag */
-	/* and test HSTATE_HWTHREAD_REQ(r13) in the system    */
-	/* reset interrupt vector in exceptions-64s.S.        */
+	/* and test HSTATE_HWTHREAD_REQ(r13) in               */
+	/* pnv_powersave_wakeup in this file.                 */
 	/* The reason is that another thread can switch the   */
 	/* MMU to a guest context whenever this flag is set   */
 	/* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on,    */
@@ -376,7 +377,6 @@ _GLOBAL(power9_idle_stop)
 	b	pnv_powersave_common
 	/* No return */
 
-
 /*
  * On waking up from stop 0,1,2 with ESL=1 on POWER9 DD1,
  * HSPRG0 will be set to the HSPRG0 value of one of the
@@ -415,15 +415,41 @@ power9_dd1_recover_paca:
 	stb	r0,PACA_NAPSTATELOST(r13)
 	blr
 
+.global pnv_powersave_wakeup
+pnv_powersave_wakeup:
+BEGIN_FTR_SECTION
+	GET_PACA(r13) /* Restore HSPRG0 to get the winkle bit in r13 */
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+	bl	pnv_restore_hyp_resource
+
+	li	r0,PNV_THREAD_RUNNING
+	stb	r0,PACA_THREAD_IDLE_STATE(r13)	/* Clear thread state */
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	li	r0,KVM_HWTHREAD_IN_KERNEL
+	stb	r0,HSTATE_HWTHREAD_STATE(r13)
+	/* Order setting hwthread_state vs. testing hwthread_req */
+	sync
+	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
+	cmpwi	r0,0
+	beq	1f
+	b	kvm_start_guest
+1:
+#endif
+
+	/* Return SRR1 from power7_nap() */
+	mfspr	r3,SPRN_SRR1
+	blt	cr3,pnv_wakeup_noloss
+	b	pnv_wakeup_loss
+
 /*
- * Called from reset vector. Check whether we have woken up with
- * hypervisor state loss. If yes, restore hypervisor state and return
- * back to reset vector.
+ * Check whether we have woken up with hypervisor state loss.
+ * If yes, restore hypervisor state and return back to link.
  *
  * r13 - Contents of HSPRG0
  * cr3 - set to gt if waking up with partial/complete hypervisor state loss
  */
-_GLOBAL(pnv_restore_hyp_resource)
+pnv_restore_hyp_resource:
 BEGIN_FTR_SECTION
 BEGIN_FTR_SECTION_NESTED(70)
 	mflr 	r6
@@ -446,12 +472,9 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_POWER9_DD1, 70)
 	 */
 	rldicl  r5,r5,4,60
 	cmpd	cr4,r5,r4
-	bge	cr4,pnv_wakeup_tb_loss
-	/*
-	 * Waking up without hypervisor state loss. Return to
-	 * reset vector
-	 */
-	blr
+	bge	cr4,pnv_wakeup_tb_loss /* returns to caller */
+
+	blr	/* Waking up without hypervisor state loss. */
 
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
@@ -479,8 +502,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	 */
 	bgt	cr3,.
 
-	blr	/* Return back to System Reset vector from where
-		   pnv_restore_hyp_resource was invoked */
+	blr	/* Waking up without hypervisor state loss */
 
 /*
  * Called if waking up from idle state which can cause either partial or
@@ -492,7 +514,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
  * cr3 - gt if waking up with partial/complete hypervisor state loss
  * cr4 - gt or eq if waking up from complete hypervisor state loss.
  */
-_GLOBAL(pnv_wakeup_tb_loss)
+pnv_wakeup_tb_loss:
 	ld	r1,PACAR1(r13)
 	/*
 	 * Before entering any idle state, the NVGPRs are saved in the stack
@@ -683,8 +705,7 @@ hypervisor_state_restored:
 
 	mtspr	SPRN_SRR1,r16
 	mtlr	r17
-	blr	/* Return back to System Reset vector from where
-		   pnv_restore_hyp_resource was invoked */
+	blr		/* return to pnv_powersave_wakeup */
 
 fastsleep_workaround_at_exit:
 	li	r3,1
@@ -696,7 +717,8 @@ fastsleep_workaround_at_exit:
  * R3 here contains the value that will be returned to the caller
  * of power7_nap.
  */
-_GLOBAL(pnv_wakeup_loss)
+.global pnv_wakeup_loss
+pnv_wakeup_loss:
 	ld	r1,PACAR1(r13)
 BEGIN_FTR_SECTION
 	CHECK_HMI_INTERRUPT
@@ -716,7 +738,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
  * R3 here contains the value that will be returned to the caller
  * of power7_nap.
  */
-_GLOBAL(pnv_wakeup_noloss)
+pnv_wakeup_noloss:
 	lbz	r0,PACA_NAPSTATELOST(r13)
 	cmpwi	r0,0
 	bne	pnv_wakeup_loss
-- 
cgit v1.2.1


From 544686cae8e4f5bbcd153998dcaf4a3d5b94e6be Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 19 Apr 2017 23:05:45 +1000
Subject: powerpc/64s: Stop using bit in HSPRG0 to test winkle

The POWER8 idle code has a neat trick of programming the power on engine
to restore a low bit into HSPRG0, so idle wakeup code can test and see
if it has been programmed this way and therefore lost all state. Restore
time can be reduced if winkle has not been reached.

However this messes with our r13 PACA pointer, and requires HSPRG0 to be
written to. It also optimizes the slowest and most uncommon case at the
expense of another SPR write in the common nap state wakeup.

Remove this complexity and assume winkle sleeps always require a state
restore. This speedup could be made entirely contained within the winkle
idle code by counting per-core winkles and setting a thread bitmap when
all have gone to winkle.

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h | 13 ++-----------
 arch/powerpc/kernel/exceptions-64s.S     | 21 +++------------------
 arch/powerpc/kernel/idle_book3s.S        | 27 ++++++++++-----------------
 arch/powerpc/platforms/powernv/idle.c    | 13 -------------
 4 files changed, 15 insertions(+), 59 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 437550175b4d..89259817f5ef 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -167,17 +167,14 @@ BEGIN_FTR_SECTION_NESTED(943)						\
 	std	ra,offset(r13);						\
 END_FTR_SECTION_NESTED(ftr,ftr,943)
 
-#define EXCEPTION_PROLOG_0_PACA(area)					\
+#define EXCEPTION_PROLOG_0(area)					\
+	GET_PACA(r13);							\
 	std	r9,area+EX_R9(r13);	/* save r9 */			\
 	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR);			\
 	HMT_MEDIUM;							\
 	std	r10,area+EX_R10(r13);	/* save r10 - r12 */		\
 	OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR)
 
-#define EXCEPTION_PROLOG_0(area)					\
-	GET_PACA(r13);							\
-	EXCEPTION_PROLOG_0_PACA(area)
-
 #define __EXCEPTION_PROLOG_1(area, extra, vec)				\
 	OPT_SAVE_REG_TO_PACA(area+EX_PPR, r9, CPU_FTR_HAS_PPR);		\
 	OPT_SAVE_REG_TO_PACA(area+EX_CFAR, r10, CPU_FTR_CFAR);		\
@@ -208,12 +205,6 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	EXCEPTION_PROLOG_1(area, extra, vec);				\
 	EXCEPTION_PROLOG_PSERIES_1(label, h);
 
-/* Have the PACA in r13 already */
-#define EXCEPTION_PROLOG_PSERIES_PACA(area, label, h, extra, vec)	\
-	EXCEPTION_PROLOG_0_PACA(area);					\
-	EXCEPTION_PROLOG_1(area, extra, vec);				\
-	EXCEPTION_PROLOG_PSERIES_1(label, h);
-
 #define __KVMTEST(h, n)							\
 	lbz	r10,HSTATE_IN_GUEST(r13);				\
 	cmpwi	r10,0;							\
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 2f837a4a78a2..e390fcd04bcb 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -116,9 +116,7 @@ EXC_VIRT_NONE(0x4000, 0x100)
 
 EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
 	SET_SCRATCH0(r13)
-	GET_PACA(r13)
-	clrrdi	r13,r13,1 /* Last bit of HSPRG0 is set if waking from winkle */
-	EXCEPTION_PROLOG_PSERIES_PACA(PACA_EXGEN, system_reset_common, EXC_STD,
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
 				 IDLETEST, 0x100)
 
 EXC_REAL_END(system_reset, 0x100, 0x100)
@@ -148,14 +146,6 @@ EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
 	 * vector
 	 */
 	SET_SCRATCH0(r13)		/* save r13 */
-	/*
-	 * Running native on arch 2.06 or later, we may wakeup from winkle
-	 * inside machine check. If yes, then last bit of HSPRG0 would be set
-	 * to 1. Hence clear it unconditionally.
-	 */
-	GET_PACA(r13)
-	clrrdi	r13,r13,1
-	SET_PACA(r13)
 	EXCEPTION_PROLOG_0(PACA_EXMC)
 BEGIN_FTR_SECTION
 	b	machine_check_powernv_early
@@ -339,7 +329,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 	 * Go back to nap/sleep/winkle mode again if (b) is true.
 	 */
 	rlwinm.	r11,r12,47-31,30,31	/* Was it in power saving mode? */
-	beq	4f			/* No, it wasn;t */
+	beq	4f			/* No, it wasn't */
 	/* Thread was in power saving mode. Go back to nap again. */
 	cmpwi	r11,2
 	blt	3f
@@ -369,13 +359,8 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 	/*
 	 * Go back to winkle. Please note that this thread was woken up in
 	 * machine check from winkle and have not restored the per-subcore
-	 * state. Hence before going back to winkle, set last bit of HSPRG0
-	 * to 1. This will make sure that if this thread gets woken up
-	 * again at reset vector 0x100 then it will get chance to restore
-	 * the subcore state.
+	 * state.
 	 */
-	ori	r13,r13,1
-	SET_PACA(r13)
 	IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
 	/* No return */
 4:
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 99de9de7efe7..a80d76f9e961 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -415,11 +415,12 @@ power9_dd1_recover_paca:
 	stb	r0,PACA_NAPSTATELOST(r13)
 	blr
 
+/*
+ * Called from reset vector for powersave wakeups.
+ * cr3 - set to gt if waking up with partial/complete hypervisor state loss
+ */
 .global pnv_powersave_wakeup
 pnv_powersave_wakeup:
-BEGIN_FTR_SECTION
-	GET_PACA(r13) /* Restore HSPRG0 to get the winkle bit in r13 */
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 	bl	pnv_restore_hyp_resource
 
 	li	r0,PNV_THREAD_RUNNING
@@ -446,18 +447,18 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
  * Check whether we have woken up with hypervisor state loss.
  * If yes, restore hypervisor state and return back to link.
  *
- * r13 - Contents of HSPRG0
  * cr3 - set to gt if waking up with partial/complete hypervisor state loss
  */
 pnv_restore_hyp_resource:
 BEGIN_FTR_SECTION
-BEGIN_FTR_SECTION_NESTED(70)
 	mflr 	r6
 	bl	power9_dd1_recover_paca
 	mtlr	r6
-FTR_SECTION_ELSE_NESTED(70)
+FTR_SECTION_ELSE
 	ld	r2, PACATOC(r13)
-ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_POWER9_DD1, 70)
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_POWER9_DD1)
+
+BEGIN_FTR_SECTION
 	/*
 	 * POWER ISA 3. Use PSSCR to determine if we
 	 * are waking up from deep idle state
@@ -480,19 +481,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
 	/*
 	 * POWER ISA 2.07 or less.
-	 * Check if last bit of HSPGR0 is set. This indicates whether we are
-	 * waking up from winkle.
+	 * Check if we slept with winkle.
 	 */
-	clrldi	r5,r13,63
-	clrrdi	r13,r13,1
-
-	/* Now that we are sure r13 is corrected, load TOC */
-	ld	r2,PACATOC(r13);
-	cmpwi	cr4,r5,1
-	mtspr	SPRN_HSPRG0,r13
-
 	lbz	r0,PACA_THREAD_IDLE_STATE(r13)
 	cmpwi   cr2,r0,PNV_THREAD_NAP
+	cmpwi   cr4,r0,PNV_THREAD_WINKLE
 	bgt     cr2,pnv_wakeup_tb_loss	/* Either sleep or Winkle */
 
 	/*
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index b369e39aa392..445f30a2c5ef 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -53,19 +53,6 @@ static int pnv_save_sprs_for_deep_states(void)
 		uint64_t pir = get_hard_smp_processor_id(cpu);
 		uint64_t hsprg0_val = (uint64_t)&paca[cpu];
 
-		if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
-			/*
-			 * HSPRG0 is used to store the cpu's pointer to paca.
-			 * Hence last 3 bits are guaranteed to be 0. Program
-			 * slw to restore HSPRG0 with 63rd bit set, so that
-			 * when a thread wakes up at 0x100 we can use this bit
-			 * to distinguish between fastsleep and deep winkle.
-			 * This is not necessary with stop/psscr since PLS
-			 * field of psscr indicates which state we are waking
-			 * up from.
-			 */
-			hsprg0_val |= 1;
-		}
 		rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val);
 		if (rc != 0)
 			return rc;
-- 
cgit v1.2.1


From 10101aa9aa9d2e6b403610de196945ecbf93cf06 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 19 Apr 2017 23:05:46 +1000
Subject: powerpc/64s: Use alternative feature patching

This reduces the number of nops for POWER8.

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/idle_book3s.S | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index a80d76f9e961..dd8ca7fdf55a 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -421,7 +421,11 @@ power9_dd1_recover_paca:
  */
 .global pnv_powersave_wakeup
 pnv_powersave_wakeup:
-	bl	pnv_restore_hyp_resource
+BEGIN_FTR_SECTION
+	bl	pnv_restore_hyp_resource_arch300
+FTR_SECTION_ELSE
+	bl	pnv_restore_hyp_resource_arch207
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
 
 	li	r0,PNV_THREAD_RUNNING
 	stb	r0,PACA_THREAD_IDLE_STATE(r13)	/* Clear thread state */
@@ -449,7 +453,11 @@ pnv_powersave_wakeup:
  *
  * cr3 - set to gt if waking up with partial/complete hypervisor state loss
  */
-pnv_restore_hyp_resource:
+pnv_restore_hyp_resource_arch300:
+	/*
+	 * POWER ISA 3. Use PSSCR to determine if we
+	 * are waking up from deep idle state
+	 */
 BEGIN_FTR_SECTION
 	mflr 	r6
 	bl	power9_dd1_recover_paca
@@ -458,11 +466,6 @@ FTR_SECTION_ELSE
 	ld	r2, PACATOC(r13)
 ALT_FTR_SECTION_END_IFSET(CPU_FTR_POWER9_DD1)
 
-BEGIN_FTR_SECTION
-	/*
-	 * POWER ISA 3. Use PSSCR to determine if we
-	 * are waking up from deep idle state
-	 */
 	LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
 	ld	r4,ADDROFF(pnv_first_deep_stop_state)(r5)
 
@@ -477,12 +480,14 @@ BEGIN_FTR_SECTION
 
 	blr	/* Waking up without hypervisor state loss. */
 
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-
+/* Same calling convention as arch300 */
+pnv_restore_hyp_resource_arch207:
 	/*
 	 * POWER ISA 2.07 or less.
 	 * Check if we slept with winkle.
 	 */
+	ld	r2,PACATOC(r13);
+
 	lbz	r0,PACA_THREAD_IDLE_STATE(r13)
 	cmpwi   cr2,r0,PNV_THREAD_NAP
 	cmpwi   cr4,r0,PNV_THREAD_WINKLE
-- 
cgit v1.2.1


From 1945bc4549e5cb1f9aa873ec29191aa54dc851d2 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 19 Apr 2017 23:05:47 +1000
Subject: powerpc/64s: Fix POWER9 machine check handler from stop state

The ISA specifies power save wakeup due to a machine check exception can
cause a machine check interrupt (rather than the usual system reset
interrupt).

The machine check handler copes with this by doing low level machine
check recovery without restoring full state from idle, then queues up a
machine check event for logging, then directly executes the same idle
instruction it woke from. This minimises the work done before recovery
is performed.

The problem is that it requires machine specific instructions and
knowledge of the book3s idle code. Currently it only has code to handle
POWER8 idle, so POWER9 crashes when trying to execute the P8 idle
instructions which don't exist in ISAv3.0B.

cpu 0x0: Vector: e40 (Emulation Assist) at [c0000000008f3810]
    pc: c000000000008380: machine_check_handle_early+0x130/0x2f0
    lr: c00000000053a098: stop_loop+0x68/0xd0
    sp: c0000000008f3a90
   msr: 9000000000081001
  current = 0xc0000000008a1080
  paca    = 0xc00000000ffd0000   softe: 0        irq_happened: 0x01
    pid   = 0, comm = swapper/0

Instead of going to sleep after recovery, do the usual idle wakeup and
state restoration by calling into the normal idle wakeup path. This
reuses the normal idle wakeup paths.

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Reviewed-by: Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/reg.h       |  1 +
 arch/powerpc/kernel/exceptions-64s.S | 79 ++++++++++++++++++++----------------
 arch/powerpc/kernel/idle_book3s.S    | 25 ++++++++++++
 3 files changed, 70 insertions(+), 35 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 4b594ed6180a..d4f653c9259a 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -659,6 +659,7 @@
 #define   SRR1_ISI_PROT		0x08000000 /* ISI: Other protection fault */
 #define   SRR1_WAKEMASK		0x00380000 /* reason for wakeup */
 #define   SRR1_WAKEMASK_P8	0x003c0000 /* reason for wakeup on POWER8 and 9 */
+#define   SRR1_WAKEMCE_RESVD	0x003c0000 /* Unused/reserved value used by MCE wakeup to indicate cause to idle wakeup handler */
 #define   SRR1_WAKESYSERR	0x00300000 /* System error */
 #define   SRR1_WAKEEE		0x00200000 /* External interrupt */
 #define   SRR1_WAKEHVI		0x00240000 /* Hypervisor Virtualization Interrupt (P9) */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index e390fcd04bcb..28f8d7bed6b1 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -178,6 +178,12 @@ BEGIN_FTR_SECTION
 	 * NOTE: We are here with MSR_ME=0 (off), which means we risk a
 	 * checkstop if we get another machine check exception before we do
 	 * rfid with MSR_ME=1.
+	 *
+	 * This interrupt can wake directly from idle. If that is the case,
+	 * the machine check is handled then the idle wakeup code is called
+	 * to restore state. In that case, the POWER9 DD1 idle PACA workaround
+	 * is not applied in the early machine check code, which will cause
+	 * bugs.
 	 */
 	mr	r11,r1			/* Save r1 */
 	lhz	r10,PACA_IN_MCE(r13)
@@ -306,6 +312,37 @@ EXC_COMMON_BEGIN(machine_check_common)
 	/* restore original r1. */			\
 	ld	r1,GPR1(r1)
 
+#ifdef CONFIG_PPC_P7_NAP
+/*
+ * This is an idle wakeup. Low level machine check has already been
+ * done. Queue the event then call the idle code to do the wake up.
+ */
+EXC_COMMON_BEGIN(machine_check_idle_common)
+	bl	machine_check_queue_event
+
+	/*
+	 * We have not used any non-volatile GPRs here, and as a rule
+	 * most exception code including machine check does not.
+	 * Therefore PACA_NAPSTATELOST does not need to be set. Idle
+	 * wakeup will restore volatile registers.
+	 *
+	 * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce.
+	 *
+	 * Then decrement MCE nesting after finishing with the stack.
+	 */
+	ld	r3,_MSR(r1)
+
+	lhz	r11,PACA_IN_MCE(r13)
+	subi	r11,r11,1
+	sth	r11,PACA_IN_MCE(r13)
+
+	/* Turn off the RI bit because SRR1 is used by idle wakeup code. */
+	/* Recoverability could be improved by reducing the use of SRR1. */
+	li	r11,0
+	mtmsrd	r11,1
+
+	b	pnv_powersave_wakeup_mce
+#endif
 	/*
 	 * Handle machine check early in real mode. We come here with
 	 * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
@@ -318,6 +355,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 	bl	machine_check_early
 	std	r3,RESULT(r1)	/* Save result */
 	ld	r12,_MSR(r1)
+
 #ifdef	CONFIG_PPC_P7_NAP
 	/*
 	 * Check if thread was in power saving mode. We come here when any
@@ -328,43 +366,14 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 	 *
 	 * Go back to nap/sleep/winkle mode again if (b) is true.
 	 */
-	rlwinm.	r11,r12,47-31,30,31	/* Was it in power saving mode? */
-	beq	4f			/* No, it wasn't */
-	/* Thread was in power saving mode. Go back to nap again. */
-	cmpwi	r11,2
-	blt	3f
-	/* Supervisor/Hypervisor state loss */
-	li	r0,1
-	stb	r0,PACA_NAPSTATELOST(r13)
-3:	bl	machine_check_queue_event
-	MACHINE_CHECK_HANDLER_WINDUP
-	GET_PACA(r13)
-	ld	r1,PACAR1(r13)
-	/*
-	 * Check what idle state this CPU was in and go back to same mode
-	 * again.
-	 */
-	lbz	r3,PACA_THREAD_IDLE_STATE(r13)
-	cmpwi	r3,PNV_THREAD_NAP
-	bgt	10f
-	IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
-	/* No return */
-10:
-	cmpwi	r3,PNV_THREAD_SLEEP
-	bgt	2f
-	IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
-	/* No return */
-
-2:
-	/*
-	 * Go back to winkle. Please note that this thread was woken up in
-	 * machine check from winkle and have not restored the per-subcore
-	 * state.
-	 */
-	IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
-	/* No return */
+	BEGIN_FTR_SECTION
+	rlwinm.	r11,r12,47-31,30,31
+	beq-	4f
+	BRANCH_TO_COMMON(r10, machine_check_idle_common)
 4:
+	END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif
+
 	/*
 	 * Check if we are coming from hypervisor userspace. If yes then we
 	 * continue in host kernel in V mode to deliver the MC event.
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index dd8ca7fdf55a..4e5a4fed86c5 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -415,6 +415,31 @@ power9_dd1_recover_paca:
 	stb	r0,PACA_NAPSTATELOST(r13)
 	blr
 
+/*
+ * Called from machine check handler for powersave wakeups.
+ * Low level machine check processing has already been done. Now just
+ * go through the wake up path to get everything in order.
+ *
+ * r3 - The original SRR1 value.
+ * Original SRR[01] have been clobbered.
+ * MSR_RI is clear.
+ */
+.global pnv_powersave_wakeup_mce
+pnv_powersave_wakeup_mce:
+	/* Set cr3 for pnv_powersave_wakeup */
+	rlwinm	r11,r3,47-31,30,31
+	cmpwi	cr3,r11,2
+
+	/*
+	 * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake
+	 * reason into SRR1, which allows reuse of the system reset wakeup
+	 * code without being mistaken for another type of wakeup.
+	 */
+	oris	r3,r3,SRR1_WAKEMCE_RESVD@h
+	mtspr	SPRN_SRR1,r3
+
+	b	pnv_powersave_wakeup
+
 /*
  * Called from reset vector for powersave wakeups.
  * cr3 - set to gt if waking up with partial/complete hypervisor state loss
-- 
cgit v1.2.1


From adbcf8d74f4609e142b01520afa173bfe911122b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 19 Apr 2017 23:05:48 +1000
Subject: powerpc/64s: Expand core idle state bits

In preparation for adding more bits to the core idle state word, move
the lock bit up, and unlock by flipping the lock bit rather than masking
off all but the thread bits.

Add branch hints for atomic operations while we're here.

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cpuidle.h |  4 ++--
 arch/powerpc/kernel/idle_book3s.S  | 33 +++++++++++++++++----------------
 2 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h
index 4649ca0d28e3..dd6a63f0a10f 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -7,8 +7,8 @@
 #define PNV_THREAD_NAP                  1
 #define PNV_THREAD_SLEEP                2
 #define PNV_THREAD_WINKLE               3
-#define PNV_CORE_IDLE_LOCK_BIT          0x100
-#define PNV_CORE_IDLE_THREAD_BITS       0x0FF
+#define PNV_CORE_IDLE_LOCK_BIT          0x10000000
+#define PNV_CORE_IDLE_THREAD_BITS       0x000000FF
 
 /*
  * ============================ NOTE =================================
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 4e5a4fed86c5..335d15bfb945 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -95,12 +95,12 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
 core_idle_lock_held:
 	HMT_LOW
 3:	lwz	r15,0(r14)
-	andi.   r15,r15,PNV_CORE_IDLE_LOCK_BIT
+	andis.	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	bne	3b
 	HMT_MEDIUM
 	lwarx	r15,0,r14
-	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
-	bne	core_idle_lock_held
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bne-	core_idle_lock_held
 	blr
 
 /*
@@ -213,8 +213,8 @@ pnv_enter_arch207_idle_mode:
 lwarx_loop1:
 	lwarx	r15,0,r14
 
-	andi.   r9,r15,PNV_CORE_IDLE_LOCK_BIT
-	bnel	core_idle_lock_held
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bnel-	core_idle_lock_held
 
 	andc	r15,r15,r7			/* Clear thread bit */
 
@@ -241,7 +241,7 @@ common_enter: /* common code for all the threads entering sleep or winkle */
 	IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
 
 fastsleep_workaround_at_entry:
-	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
+	oris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	stwcx.	r15,0,r14
 	bne-	lwarx_loop1
 	isync
@@ -251,10 +251,10 @@ fastsleep_workaround_at_entry:
 	li	r4,1
 	bl	opal_config_cpu_idle_state
 
-	/* Clear Lock bit */
-	li	r0,0
+	/* Unlock */
+	xoris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	lwsync
-	stw	r0,0(r14)
+	stw	r15,0(r14)
 	b	common_enter
 
 enter_winkle:
@@ -302,8 +302,8 @@ power_enter_stop:
 
 lwarx_loop_stop:
 	lwarx   r15,0,r14
-	andi.   r9,r15,PNV_CORE_IDLE_LOCK_BIT
-	bnel    core_idle_lock_held
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bnel-	core_idle_lock_held
 	andc    r15,r15,r7                      /* Clear thread bit */
 
 	stwcx.  r15,0,r14
@@ -560,7 +560,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
 lwarx_loop2:
 	lwarx	r15,0,r14
-	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	/*
 	 * Lock bit is set in one of the 2 cases-
 	 * a. In the sleep/winkle enter path, the last thread is executing
@@ -569,9 +569,10 @@ lwarx_loop2:
 	 * workaround undo code or resyncing timebase or restoring context
 	 * In either case loop until the lock bit is cleared.
 	 */
-	bnel	core_idle_lock_held
+	bnel-	core_idle_lock_held
 
-	cmpwi	cr2,r15,0
+	andi.	r9,r15,PNV_CORE_IDLE_THREAD_BITS
+	cmpwi	cr2,r9,0
 
 	/*
 	 * At this stage
@@ -580,7 +581,7 @@ lwarx_loop2:
 	 * cr4 - gt or eq if waking up from complete hypervisor state loss.
 	 */
 
-	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
+	oris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	stwcx.	r15,0,r14
 	bne-	lwarx_loop2
 	isync
@@ -670,7 +671,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	mtspr	SPRN_WORC,r4
 
 clear_lock:
-	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
+	xoris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	lwsync
 	stw	r15,0(r14)
 
-- 
cgit v1.2.1


From e420249d44d02e9fc110087ed8feddc38b13528c Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 19 Apr 2017 23:05:49 +1000
Subject: powerpc/64s: Idle do not hold reservation longer than required

When taking the core idle state lock, grab it immediately like a regular
lock, rather than adding more tests in there. Holding the lock keeps it
stable, so there is no need to do it whole holding the reservation.

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/idle_book3s.S | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 335d15bfb945..de164ba7c5e8 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -556,12 +556,12 @@ BEGIN_FTR_SECTION
 	CHECK_HMI_INTERRUPT
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 
-	lbz	r7,PACA_THREAD_MASK(r13)
 	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
-lwarx_loop2:
-	lwarx	r15,0,r14
-	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	lbz	r7,PACA_THREAD_MASK(r13)
+
 	/*
+	 * Take the core lock to synchronize against other threads.
+	 *
 	 * Lock bit is set in one of the 2 cases-
 	 * a. In the sleep/winkle enter path, the last thread is executing
 	 * fastsleep workaround code.
@@ -569,7 +569,14 @@ lwarx_loop2:
 	 * workaround undo code or resyncing timebase or restoring context
 	 * In either case loop until the lock bit is cleared.
 	 */
+1:
+	lwarx	r15,0,r14
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	bnel-	core_idle_lock_held
+	oris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	stwcx.	r15,0,r14
+	bne-	1b
+	isync
 
 	andi.	r9,r15,PNV_CORE_IDLE_THREAD_BITS
 	cmpwi	cr2,r9,0
@@ -581,11 +588,6 @@ lwarx_loop2:
 	 * cr4 - gt or eq if waking up from complete hypervisor state loss.
 	 */
 
-	oris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
-	stwcx.	r15,0,r14
-	bne-	lwarx_loop2
-	isync
-
 BEGIN_FTR_SECTION
 	lbz	r4,PACA_SUBCORE_SIBLING_MASK(r13)
 	and	r4,r4,r15
-- 
cgit v1.2.1


From 0d7720a24234cefa8b318b1a3cf0b302a51faec9 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 19 Apr 2017 23:05:50 +1000
Subject: powerpc/64s: Idle POWER8 avoid full state loss recovery where
 possible

If not all threads were in winkle, full state loss recovery is not
necessary and can be avoided. A previous patch removed this optimisation
due to some complexity with the implementation. Re-implement it by
counting the number of threads in winkle with the per-core idle state.
Only restore full state loss if all threads were in winkle.

This has a small window of false positives right before threads execute
winkle and just after they wake up, when the winkle count does not
reflect the true number of threads in winkle. This is not a significant
problem in comparison with even the minimum winkle duration. For
correctness, a false positive is not a problem (only false negatives
would be).

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cpuidle.h | 32 ++++++++++++--
 arch/powerpc/kernel/idle_book3s.S  | 89 +++++++++++++++++++++++++++++++++++---
 2 files changed, 112 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h
index dd6a63f0a10f..52586f9956bb 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -2,13 +2,39 @@
 #define _ASM_POWERPC_CPUIDLE_H
 
 #ifdef CONFIG_PPC_POWERNV
-/* Used in powernv idle state management */
+/* Thread state used in powernv idle state management */
 #define PNV_THREAD_RUNNING              0
 #define PNV_THREAD_NAP                  1
 #define PNV_THREAD_SLEEP                2
 #define PNV_THREAD_WINKLE               3
-#define PNV_CORE_IDLE_LOCK_BIT          0x10000000
-#define PNV_CORE_IDLE_THREAD_BITS       0x000000FF
+
+/*
+ * Core state used in powernv idle for POWER8.
+ *
+ * The lock bit synchronizes updates to the state, as well as parts of the
+ * sleep/wake code (see kernel/idle_book3s.S).
+ *
+ * Bottom 8 bits track the idle state of each thread. Bit is cleared before
+ * the thread executes an idle instruction (nap/sleep/winkle).
+ *
+ * Then there is winkle tracking. A core does not lose complete state
+ * until every thread is in winkle. So the winkle count field counts the
+ * number of threads in winkle (small window of false positives is okay
+ * around the sleep/wake, so long as there are no false negatives).
+ *
+ * When the winkle count reaches 8 (the COUNT_ALL_BIT becomes set), then
+ * the THREAD_WINKLE_BITS are set, which indicate which threads have not
+ * yet woken from the winkle state.
+ */
+#define PNV_CORE_IDLE_LOCK_BIT			0x10000000
+
+#define PNV_CORE_IDLE_WINKLE_COUNT		0x00010000
+#define PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT	0x00080000
+#define PNV_CORE_IDLE_WINKLE_COUNT_BITS		0x000F0000
+#define PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT	8
+#define PNV_CORE_IDLE_THREAD_WINKLE_BITS	0x0000FF00
+
+#define PNV_CORE_IDLE_THREAD_BITS       	0x000000FF
 
 /*
  * ============================ NOTE =================================
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index de164ba7c5e8..570163715107 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -210,15 +210,20 @@ pnv_enter_arch207_idle_mode:
 	/* Sleep or winkle */
 	lbz	r7,PACA_THREAD_MASK(r13)
 	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
+	li	r5,0
+	beq	cr3,3f
+	lis	r5,PNV_CORE_IDLE_WINKLE_COUNT@h
+3:
 lwarx_loop1:
 	lwarx	r15,0,r14
 
 	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	bnel-	core_idle_lock_held
 
+	add	r15,r15,r5			/* Add if winkle */
 	andc	r15,r15,r7			/* Clear thread bit */
 
-	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
+	andi.	r9,r15,PNV_CORE_IDLE_THREAD_BITS
 
 /*
  * If cr0 = 0, then current thread is the last thread of the core entering
@@ -509,14 +514,13 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_POWER9_DD1)
 pnv_restore_hyp_resource_arch207:
 	/*
 	 * POWER ISA 2.07 or less.
-	 * Check if we slept with winkle.
+	 * Check if we slept with sleep or winkle.
 	 */
 	ld	r2,PACATOC(r13);
 
-	lbz	r0,PACA_THREAD_IDLE_STATE(r13)
-	cmpwi   cr2,r0,PNV_THREAD_NAP
-	cmpwi   cr4,r0,PNV_THREAD_WINKLE
-	bgt     cr2,pnv_wakeup_tb_loss	/* Either sleep or Winkle */
+	lbz	r4,PACA_THREAD_IDLE_STATE(r13)
+	cmpwi	cr2,r4,PNV_THREAD_NAP
+	bgt	cr2,pnv_wakeup_tb_loss	/* Either sleep or Winkle */
 
 	/*
 	 * We fall through here if PACA_THREAD_IDLE_STATE shows we are waking
@@ -535,7 +539,12 @@ pnv_restore_hyp_resource_arch207:
  *
  * r13 - PACA
  * cr3 - gt if waking up with partial/complete hypervisor state loss
+ *
+ * If ISA300:
  * cr4 - gt or eq if waking up from complete hypervisor state loss.
+ *
+ * If ISA207:
+ * r4 - PACA_THREAD_IDLE_STATE
  */
 pnv_wakeup_tb_loss:
 	ld	r1,PACAR1(r13)
@@ -550,6 +559,7 @@ pnv_wakeup_tb_loss:
 	 * is required to return back to reset vector after hypervisor state
 	 * restore is complete.
 	 */
+	mr	r18,r4
 	mflr	r17
 	mfspr	r16,SPRN_SRR1
 BEGIN_FTR_SECTION
@@ -585,10 +595,77 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	 * At this stage
 	 * cr2 - eq if first thread to wakeup in core
 	 * cr3-  gt if waking up with partial/complete hypervisor state loss
+	 * ISA300:
 	 * cr4 - gt or eq if waking up from complete hypervisor state loss.
 	 */
 
 BEGIN_FTR_SECTION
+	/*
+	 * Were we in winkle?
+	 * If yes, check if all threads were in winkle, decrement our
+	 * winkle count, set all thread winkle bits if all were in winkle.
+	 * Check if our thread has a winkle bit set, and set cr4 accordingly
+	 * (to match ISA300, above). Pseudo-code for core idle state
+	 * transitions for ISA207 is as follows (everything happens atomically
+	 * due to store conditional and/or lock bit):
+	 *
+	 * nap_idle() { }
+	 * nap_wake() { }
+	 *
+	 * sleep_idle()
+	 * {
+	 *	core_idle_state &= ~thread_in_core
+	 * }
+	 *
+	 * sleep_wake()
+	 * {
+	 *     bool first_in_core, first_in_subcore;
+	 *
+	 *     first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
+	 *     first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
+	 *
+	 *     core_idle_state |= thread_in_core;
+	 * }
+	 *
+	 * winkle_idle()
+	 * {
+	 *	core_idle_state &= ~thread_in_core;
+	 *	core_idle_state += 1 << WINKLE_COUNT_SHIFT;
+	 * }
+	 *
+	 * winkle_wake()
+	 * {
+	 *     bool first_in_core, first_in_subcore, winkle_state_lost;
+	 *
+	 *     first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
+	 *     first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
+	 *
+	 *     core_idle_state |= thread_in_core;
+	 *
+	 *     if ((core_idle_state & WINKLE_MASK) == (8 << WINKLE_COUNT_SIHFT))
+	 *         core_idle_state |= THREAD_WINKLE_BITS;
+	 *     core_idle_state -= 1 << WINKLE_COUNT_SHIFT;
+	 *
+	 *     winkle_state_lost = core_idle_state &
+	 *				(thread_in_core << WINKLE_THREAD_SHIFT);
+	 *     core_idle_state &= ~(thread_in_core << WINKLE_THREAD_SHIFT);
+	 * }
+	 *
+	 */
+	cmpwi	r18,PNV_THREAD_WINKLE
+	bne	2f
+	andis.	r9,r15,PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT@h
+	subis	r15,r15,PNV_CORE_IDLE_WINKLE_COUNT@h
+	beq	2f
+	ori	r15,r15,PNV_CORE_IDLE_THREAD_WINKLE_BITS /* all were winkle */
+2:
+	/* Shift thread bit to winkle mask, then test if this thread is set,
+	 * and remove it from the winkle bits */
+	slwi	r8,r7,8
+	and	r8,r8,r15
+	andc	r15,r15,r8
+	cmpwi	cr4,r8,1 /* cr4 will be gt if our bit is set, lt if not */
+
 	lbz	r4,PACA_SUBCORE_SIBLING_MASK(r13)
 	and	r4,r4,r15
 	cmpwi	r4,0	/* Check if first in subcore */
-- 
cgit v1.2.1


From 9cba253df42f6138b48ea88f1fa35ae8a7fd5225 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 19 Apr 2017 23:05:51 +1000
Subject: powerpc/64s: Simplify POWER9 DD1 idle workaround code

The idle workaround does not need to load PACATOC, and it does not
need to be called within a nested function that requires LR to be
saved.

Load the PACATOC at entry to the idle wakeup. It does not matter which
PACA this comes from, so it's okay to call before the workaround. Then
apply the workaround to get the right PACA.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/feature-fixups.h |  3 +++
 arch/powerpc/kernel/idle_book3s.S         | 16 +++++-----------
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h
index ddf54f5bbdd1..2de2319b99e2 100644
--- a/arch/powerpc/include/asm/feature-fixups.h
+++ b/arch/powerpc/include/asm/feature-fixups.h
@@ -66,6 +66,9 @@ label##5:							\
 #define END_FTR_SECTION(msk, val)		\
 	END_FTR_SECTION_NESTED(msk, val, 97)
 
+#define END_FTR_SECTION_NESTED_IFSET(msk, label)	\
+	END_FTR_SECTION_NESTED((msk), (msk), label)
+
 #define END_FTR_SECTION_IFSET(msk)	END_FTR_SECTION((msk), (msk))
 #define END_FTR_SECTION_IFCLR(msk)	END_FTR_SECTION((msk), 0)
 
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 570163715107..0cdee271dd93 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -411,7 +411,6 @@ power9_dd1_recover_paca:
 	/* Load paca->thread_sibling_pacas[i] into r13 */
 	ldx	r13, r4, r5
 	SET_PACA(r13)
-	ld	r2, PACATOC(r13)
 	/*
 	 * Indicate that we have lost NVGPR state
 	 * which needs to be restored from the stack.
@@ -451,7 +450,12 @@ pnv_powersave_wakeup_mce:
  */
 .global pnv_powersave_wakeup
 pnv_powersave_wakeup:
+	ld	r2, PACATOC(r13)
+
 BEGIN_FTR_SECTION
+BEGIN_FTR_SECTION_NESTED(70)
+	bl	power9_dd1_recover_paca
+END_FTR_SECTION_NESTED_IFSET(CPU_FTR_POWER9_DD1, 70)
 	bl	pnv_restore_hyp_resource_arch300
 FTR_SECTION_ELSE
 	bl	pnv_restore_hyp_resource_arch207
@@ -488,14 +492,6 @@ pnv_restore_hyp_resource_arch300:
 	 * POWER ISA 3. Use PSSCR to determine if we
 	 * are waking up from deep idle state
 	 */
-BEGIN_FTR_SECTION
-	mflr 	r6
-	bl	power9_dd1_recover_paca
-	mtlr	r6
-FTR_SECTION_ELSE
-	ld	r2, PACATOC(r13)
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_POWER9_DD1)
-
 	LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
 	ld	r4,ADDROFF(pnv_first_deep_stop_state)(r5)
 
@@ -516,8 +512,6 @@ pnv_restore_hyp_resource_arch207:
 	 * POWER ISA 2.07 or less.
 	 * Check if we slept with sleep or winkle.
 	 */
-	ld	r2,PACATOC(r13);
-
 	lbz	r4,PACA_THREAD_IDLE_STATE(r13)
 	cmpwi	cr2,r4,PNV_THREAD_NAP
 	bgt	cr2,pnv_wakeup_tb_loss	/* Either sleep or Winkle */
-- 
cgit v1.2.1


From d08f8a28bcc8c2004a7186839148fc9aadd5cc6f Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 19 Apr 2017 18:21:06 +0530
Subject: powerpc/kprobes: Remove duplicate saving of MSR

set_current_kprobe() already saves regs->msr into kprobe_saved_msr. Remove the
redundant save.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index fce05a38851c..5a8601c95bbe 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -177,7 +177,6 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 			 */
 			save_previous_kprobe(kcb);
 			set_current_kprobe(p, regs, kcb);
-			kcb->kprobe_saved_msr = regs->msr;
 			kprobes_inc_nmissed_count(p);
 			prepare_singlestep(p, regs);
 			kcb->kprobe_status = KPROBE_REENTER;
-- 
cgit v1.2.1


From 700e64377c2c8e2406e9c4c1632e2eabdb3a95a1 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 19 Apr 2017 18:22:23 +0530
Subject: powerpc/ftrace: Move stack setup and teardown code into
 ftrace_graph_caller()

Move the stack setup and teardown code into ftrace_graph_caller(). This way, we
don't incur the cost of setting it up unless function graph is enabled for this
function.

Also, remove the extraneous LR restore code after the function graph stub. LR
has previously been restored and neither livepatch_handler() nor
ftrace_graph_caller() return back here.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
[mpe: Drop bad change to non-mprofile-kernel version of ftrace_graph_caller]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/entry_64.S | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 6432d4bf08c8..cb3fb98a5785 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -1313,16 +1313,12 @@ ftrace_call:
 #endif
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	stdu	r1, -112(r1)
 .globl ftrace_graph_call
 ftrace_graph_call:
 	b	ftrace_graph_stub
 _GLOBAL(ftrace_graph_stub)
-	addi	r1, r1, 112
 #endif
 
-	ld	r0,LRSAVE(r1)	/* restore callee's lr at _mcount site */
-	mtlr	r0
 	bctr			/* jump after _mcount site */
 #endif /* CC_USING_MPROFILE_KERNEL */
 
@@ -1471,6 +1467,7 @@ _GLOBAL(ftrace_graph_caller)
 
 #else /* CC_USING_MPROFILE_KERNEL */
 _GLOBAL(ftrace_graph_caller)
+	stdu	r1, -112(r1)
 	/* with -mprofile-kernel, parameter regs are still alive at _mcount */
 	std	r10, 104(r1)
 	std	r9, 96(r1)
-- 
cgit v1.2.1


From 71f6e58e5efe09b98a1862d4c25976e8f7763b67 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 12 Apr 2017 16:48:51 +0530
Subject: powerpc/kprobes: Convert __kprobes to NOKPROBE_SYMBOL()

Along similar lines as commit 9326638cbee2 ("kprobes, x86: Use NOKPROBE_SYMBOL()
instead of __kprobes annotation"), convert __kprobes annotation to either
NOKPROBE_SYMBOL() or nokprobe_inline. The latter forces inlining, in which case
the caller needs to be added to NOKPROBE_SYMBOL().

Also:
 - blacklist arch_deref_entry_point(), and
 - convert a few regular inlines to nokprobe_inline in lib/sstep.c

A key benefit is the ability to detect such symbols as being
blacklisted. Before this patch:

  $ cat /sys/kernel/debug/kprobes/blacklist | grep read_mem
  $ perf probe read_mem
  Failed to write event: Invalid argument
    Error: Failed to add events.
  $ dmesg | tail -1
  [ 3736.112815] Could not insert probe at _text+10014968: -22

After patch:
  $ cat /sys/kernel/debug/kprobes/blacklist | grep read_mem
  0xc000000000072b50-0xc000000000072d20	read_mem
  $ perf probe read_mem
  read_mem is blacklisted function, skip it.
  Added new events:
    (null):(null)        (on read_mem)
    probe:read_mem       (on read_mem)

  You can now use it in all perf tools, such as:

	  perf record -e probe:read_mem -aR sleep 1

  $ grep " read_mem" /proc/kallsyms
  c000000000072b50 t read_mem
  c0000000005f3b40 t read_mem
  $ cat /sys/kernel/debug/kprobes/list
  c0000000005f3b48  k  read_mem+0x8    [DISABLED]

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
[mpe: Minor change log formatting, fix up some conflicts]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes.c    | 55 +++++++++++++++++----------
 arch/powerpc/lib/code-patching.c |  4 +-
 arch/powerpc/lib/sstep.c         | 82 +++++++++++++++++++++-------------------
 3 files changed, 81 insertions(+), 60 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 5a8601c95bbe..a69c276dbb9c 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -42,7 +42,7 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
 struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
 
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
+int arch_prepare_kprobe(struct kprobe *p)
 {
 	int ret = 0;
 	kprobe_opcode_t insn = *p->addr;
@@ -74,30 +74,34 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
 	p->ainsn.boostable = 0;
 	return ret;
 }
+NOKPROBE_SYMBOL(arch_prepare_kprobe);
 
-void __kprobes arch_arm_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
 {
 	*p->addr = BREAKPOINT_INSTRUCTION;
 	flush_icache_range((unsigned long) p->addr,
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
+NOKPROBE_SYMBOL(arch_arm_kprobe);
 
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
+void arch_disarm_kprobe(struct kprobe *p)
 {
 	*p->addr = p->opcode;
 	flush_icache_range((unsigned long) p->addr,
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
+NOKPROBE_SYMBOL(arch_disarm_kprobe);
 
-void __kprobes arch_remove_kprobe(struct kprobe *p)
+void arch_remove_kprobe(struct kprobe *p)
 {
 	if (p->ainsn.insn) {
 		free_insn_slot(p->ainsn.insn, 0);
 		p->ainsn.insn = NULL;
 	}
 }
+NOKPROBE_SYMBOL(arch_remove_kprobe);
 
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+static nokprobe_inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 {
 	enable_single_step(regs);
 
@@ -110,37 +114,37 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 	regs->nip = (unsigned long)p->ainsn.insn;
 }
 
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+static nokprobe_inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
 	kcb->prev_kprobe.kp = kprobe_running();
 	kcb->prev_kprobe.status = kcb->kprobe_status;
 	kcb->prev_kprobe.saved_msr = kcb->kprobe_saved_msr;
 }
 
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+static nokprobe_inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
 	__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
 	kcb->kprobe_status = kcb->prev_kprobe.status;
 	kcb->kprobe_saved_msr = kcb->prev_kprobe.saved_msr;
 }
 
-static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 				struct kprobe_ctlblk *kcb)
 {
 	__this_cpu_write(current_kprobe, p);
 	kcb->kprobe_saved_msr = regs->msr;
 }
 
-void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
-				      struct pt_regs *regs)
+void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
 {
 	ri->ret_addr = (kprobe_opcode_t *)regs->link;
 
 	/* Replace the return addr with trampoline addr */
 	regs->link = (unsigned long)kretprobe_trampoline;
 }
+NOKPROBE_SYMBOL(arch_prepare_kretprobe);
 
-int __kprobes kprobe_handler(struct pt_regs *regs)
+int kprobe_handler(struct pt_regs *regs)
 {
 	struct kprobe *p;
 	int ret = 0;
@@ -273,6 +277,7 @@ no_kprobe:
 	preempt_enable_no_resched();
 	return ret;
 }
+NOKPROBE_SYMBOL(kprobe_handler);
 
 /*
  * Function return probe trampoline:
@@ -290,8 +295,7 @@ asm(".global kretprobe_trampoline\n"
 /*
  * Called when the probe at kretprobe trampoline is hit
  */
-static int __kprobes trampoline_probe_handler(struct kprobe *p,
-						struct pt_regs *regs)
+static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kretprobe_instance *ri = NULL;
 	struct hlist_head *head, empty_rp;
@@ -360,6 +364,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
 	 */
 	return 1;
 }
+NOKPROBE_SYMBOL(trampoline_probe_handler);
 
 /*
  * Called after single-stepping.  p->addr is the address of the
@@ -369,7 +374,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
  * single-stepped a copy of the instruction.  The address of this
  * copy is p->ainsn.insn.
  */
-int __kprobes kprobe_post_handler(struct pt_regs *regs)
+int kprobe_post_handler(struct pt_regs *regs)
 {
 	struct kprobe *cur = kprobe_running();
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -409,8 +414,9 @@ out:
 
 	return 1;
 }
+NOKPROBE_SYMBOL(kprobe_post_handler);
 
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 {
 	struct kprobe *cur = kprobe_running();
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -473,13 +479,15 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 	}
 	return 0;
 }
+NOKPROBE_SYMBOL(kprobe_fault_handler);
 
 unsigned long arch_deref_entry_point(void *entry)
 {
 	return ppc_global_function_entry(entry);
 }
+NOKPROBE_SYMBOL(arch_deref_entry_point);
 
-int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct jprobe *jp = container_of(p, struct jprobe, kp);
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -496,17 +504,20 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 
 	return 1;
 }
+NOKPROBE_SYMBOL(setjmp_pre_handler);
 
-void __used __kprobes jprobe_return(void)
+void __used jprobe_return(void)
 {
 	asm volatile("trap" ::: "memory");
 }
+NOKPROBE_SYMBOL(jprobe_return);
 
-static void __used __kprobes jprobe_return_end(void)
+static void __used jprobe_return_end(void)
 {
-};
+}
+NOKPROBE_SYMBOL(jprobe_return_end);
 
-int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 
@@ -519,6 +530,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 	preempt_enable_no_resched();
 	return 1;
 }
+NOKPROBE_SYMBOL(longjmp_break_handler);
 
 static struct kprobe trampoline_p = {
 	.addr = (kprobe_opcode_t *) &kretprobe_trampoline,
@@ -530,10 +542,11 @@ int __init arch_init_kprobes(void)
 	return register_kprobe(&trampoline_p);
 }
 
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+int arch_trampoline_kprobe(struct kprobe *p)
 {
 	if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
 		return 1;
 
 	return 0;
 }
+NOKPROBE_SYMBOL(arch_trampoline_kprobe);
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 0d3002b7e2b4..500b0f6a0b64 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/kprobes.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
 #include <linux/mm.h>
@@ -59,7 +60,7 @@ bool is_offset_in_branch_range(long offset)
  * Helper to check if a given instruction is a conditional branch
  * Derived from the conditional checks in analyse_instr()
  */
-bool __kprobes is_conditional_branch(unsigned int instr)
+bool is_conditional_branch(unsigned int instr)
 {
 	unsigned int opcode = instr >> 26;
 
@@ -75,6 +76,7 @@ bool __kprobes is_conditional_branch(unsigned int instr)
 	}
 	return false;
 }
+NOKPROBE_SYMBOL(is_conditional_branch);
 
 unsigned int create_branch(const unsigned int *addr,
 			   unsigned long target, int flags)
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 9c542ec70c5b..33117f8a0882 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -49,7 +49,8 @@ extern int do_stxvd2x(int rn, unsigned long ea);
 /*
  * Emulate the truncation of 64 bit values in 32-bit mode.
  */
-static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val)
+static nokprobe_inline unsigned long truncate_if_32bit(unsigned long msr,
+							unsigned long val)
 {
 #ifdef __powerpc64__
 	if ((msr & MSR_64BIT) == 0)
@@ -61,7 +62,7 @@ static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val)
 /*
  * Determine whether a conditional branch instruction would branch.
  */
-static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs)
+static nokprobe_inline int branch_taken(unsigned int instr, struct pt_regs *regs)
 {
 	unsigned int bo = (instr >> 21) & 0x1f;
 	unsigned int bi;
@@ -81,8 +82,7 @@ static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs)
 	return 1;
 }
 
-
-static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb)
+static nokprobe_inline long address_ok(struct pt_regs *regs, unsigned long ea, int nb)
 {
 	if (!user_mode(regs))
 		return 1;
@@ -92,7 +92,7 @@ static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb)
 /*
  * Calculate effective address for a D-form instruction
  */
-static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs)
+static nokprobe_inline unsigned long dform_ea(unsigned int instr, struct pt_regs *regs)
 {
 	int ra;
 	unsigned long ea;
@@ -109,7 +109,7 @@ static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs
 /*
  * Calculate effective address for a DS-form instruction
  */
-static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *regs)
+static nokprobe_inline unsigned long dsform_ea(unsigned int instr, struct pt_regs *regs)
 {
 	int ra;
 	unsigned long ea;
@@ -126,8 +126,8 @@ static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *reg
 /*
  * Calculate effective address for an X-form instruction
  */
-static unsigned long __kprobes xform_ea(unsigned int instr,
-					struct pt_regs *regs)
+static nokprobe_inline unsigned long xform_ea(unsigned int instr,
+						struct pt_regs *regs)
 {
 	int ra, rb;
 	unsigned long ea;
@@ -145,33 +145,33 @@ static unsigned long __kprobes xform_ea(unsigned int instr,
  * Return the largest power of 2, not greater than sizeof(unsigned long),
  * such that x is a multiple of it.
  */
-static inline unsigned long max_align(unsigned long x)
+static nokprobe_inline unsigned long max_align(unsigned long x)
 {
 	x |= sizeof(unsigned long);
 	return x & -x;		/* isolates rightmost bit */
 }
 
 
-static inline unsigned long byterev_2(unsigned long x)
+static nokprobe_inline unsigned long byterev_2(unsigned long x)
 {
 	return ((x >> 8) & 0xff) | ((x & 0xff) << 8);
 }
 
-static inline unsigned long byterev_4(unsigned long x)
+static nokprobe_inline unsigned long byterev_4(unsigned long x)
 {
 	return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) |
 		((x & 0xff00) << 8) | ((x & 0xff) << 24);
 }
 
 #ifdef __powerpc64__
-static inline unsigned long byterev_8(unsigned long x)
+static nokprobe_inline unsigned long byterev_8(unsigned long x)
 {
 	return (byterev_4(x) << 32) | byterev_4(x >> 32);
 }
 #endif
 
-static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea,
-				      int nb)
+static nokprobe_inline int read_mem_aligned(unsigned long *dest,
+					unsigned long ea, int nb)
 {
 	int err = 0;
 	unsigned long x = 0;
@@ -197,8 +197,8 @@ static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea,
 	return err;
 }
 
-static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea,
-					int nb, struct pt_regs *regs)
+static nokprobe_inline int read_mem_unaligned(unsigned long *dest,
+				unsigned long ea, int nb, struct pt_regs *regs)
 {
 	int err;
 	unsigned long x, b, c;
@@ -248,7 +248,7 @@ static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea,
  * Read memory at address ea for nb bytes, return 0 for success
  * or -EFAULT if an error occurred.
  */
-static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb,
+static int read_mem(unsigned long *dest, unsigned long ea, int nb,
 			      struct pt_regs *regs)
 {
 	if (!address_ok(regs, ea, nb))
@@ -257,9 +257,10 @@ static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb,
 		return read_mem_aligned(dest, ea, nb);
 	return read_mem_unaligned(dest, ea, nb, regs);
 }
+NOKPROBE_SYMBOL(read_mem);
 
-static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea,
-				       int nb)
+static nokprobe_inline int write_mem_aligned(unsigned long val,
+					unsigned long ea, int nb)
 {
 	int err = 0;
 
@@ -282,8 +283,8 @@ static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea,
 	return err;
 }
 
-static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea,
-					 int nb, struct pt_regs *regs)
+static nokprobe_inline int write_mem_unaligned(unsigned long val,
+				unsigned long ea, int nb, struct pt_regs *regs)
 {
 	int err;
 	unsigned long c;
@@ -325,7 +326,7 @@ static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea,
  * Write memory at address ea for nb bytes, return 0 for success
  * or -EFAULT if an error occurred.
  */
-static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb,
+static int write_mem(unsigned long val, unsigned long ea, int nb,
 			       struct pt_regs *regs)
 {
 	if (!address_ok(regs, ea, nb))
@@ -334,13 +335,14 @@ static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb,
 		return write_mem_aligned(val, ea, nb);
 	return write_mem_unaligned(val, ea, nb, regs);
 }
+NOKPROBE_SYMBOL(write_mem);
 
 #ifdef CONFIG_PPC_FPU
 /*
  * Check the address and alignment, and call func to do the actual
  * load or store.
  */
-static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long),
+static int do_fp_load(int rn, int (*func)(int, unsigned long),
 				unsigned long ea, int nb,
 				struct pt_regs *regs)
 {
@@ -380,8 +382,9 @@ static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long),
 		return err;
 	return (*func)(rn, ptr);
 }
+NOKPROBE_SYMBOL(do_fp_load);
 
-static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long),
+static int do_fp_store(int rn, int (*func)(int, unsigned long),
 				 unsigned long ea, int nb,
 				 struct pt_regs *regs)
 {
@@ -425,11 +428,12 @@ static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long),
 	}
 	return err;
 }
+NOKPROBE_SYMBOL(do_fp_store);
 #endif
 
 #ifdef CONFIG_ALTIVEC
 /* For Altivec/VMX, no need to worry about alignment */
-static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vec_load(int rn, int (*func)(int, unsigned long),
 				 unsigned long ea, struct pt_regs *regs)
 {
 	if (!address_ok(regs, ea & ~0xfUL, 16))
@@ -437,7 +441,7 @@ static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long),
 	return (*func)(rn, ea);
 }
 
-static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vec_store(int rn, int (*func)(int, unsigned long),
 				  unsigned long ea, struct pt_regs *regs)
 {
 	if (!address_ok(regs, ea & ~0xfUL, 16))
@@ -447,7 +451,7 @@ static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long),
 #endif /* CONFIG_ALTIVEC */
 
 #ifdef CONFIG_VSX
-static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vsx_load(int rn, int (*func)(int, unsigned long),
 				 unsigned long ea, struct pt_regs *regs)
 {
 	int err;
@@ -465,7 +469,7 @@ static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long),
 	return err;
 }
 
-static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vsx_store(int rn, int (*func)(int, unsigned long),
 				 unsigned long ea, struct pt_regs *regs)
 {
 	int err;
@@ -522,7 +526,7 @@ static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long),
 		: "=r" (err)				\
 		: "r" (addr), "i" (-EFAULT), "0" (err))
 
-static void __kprobes set_cr0(struct pt_regs *regs, int rd)
+static nokprobe_inline void set_cr0(struct pt_regs *regs, int rd)
 {
 	long val = regs->gpr[rd];
 
@@ -539,7 +543,7 @@ static void __kprobes set_cr0(struct pt_regs *regs, int rd)
 		regs->ccr |= 0x20000000;
 }
 
-static void __kprobes add_with_carry(struct pt_regs *regs, int rd,
+static nokprobe_inline void add_with_carry(struct pt_regs *regs, int rd,
 				     unsigned long val1, unsigned long val2,
 				     unsigned long carry_in)
 {
@@ -560,7 +564,7 @@ static void __kprobes add_with_carry(struct pt_regs *regs, int rd,
 		regs->xer &= ~XER_CA;
 }
 
-static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2,
+static nokprobe_inline void do_cmp_signed(struct pt_regs *regs, long v1, long v2,
 				    int crfld)
 {
 	unsigned int crval, shift;
@@ -576,7 +580,7 @@ static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2,
 	regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift);
 }
 
-static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1,
+static nokprobe_inline void do_cmp_unsigned(struct pt_regs *regs, unsigned long v1,
 				      unsigned long v2, int crfld)
 {
 	unsigned int crval, shift;
@@ -592,7 +596,7 @@ static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1,
 	regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift);
 }
 
-static int __kprobes trap_compare(long v1, long v2)
+static nokprobe_inline int trap_compare(long v1, long v2)
 {
 	int ret = 0;
 
@@ -631,7 +635,7 @@ static int __kprobes trap_compare(long v1, long v2)
  * Returns 1 if the instruction has been executed, or 0 if not.
  * Sets *op to indicate what the instruction does.
  */
-int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs,
+int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
 			    unsigned int instr)
 {
 	unsigned int opcode, ra, rb, rd, spr, u;
@@ -1692,6 +1696,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs,
 #endif
 }
 EXPORT_SYMBOL_GPL(analyse_instr);
+NOKPROBE_SYMBOL(analyse_instr);
 
 /*
  * For PPC32 we always use stwu with r1 to change the stack pointer.
@@ -1701,7 +1706,7 @@ EXPORT_SYMBOL_GPL(analyse_instr);
  * don't emulate the real store operation. We will do real store
  * operation safely in exception return code by checking this flag.
  */
-static __kprobes int handle_stack_update(unsigned long ea, struct pt_regs *regs)
+static nokprobe_inline int handle_stack_update(unsigned long ea, struct pt_regs *regs)
 {
 #ifdef CONFIG_PPC32
 	/*
@@ -1721,7 +1726,7 @@ static __kprobes int handle_stack_update(unsigned long ea, struct pt_regs *regs)
 	return 0;
 }
 
-static __kprobes void do_signext(unsigned long *valp, int size)
+static nokprobe_inline void do_signext(unsigned long *valp, int size)
 {
 	switch (size) {
 	case 2:
@@ -1733,7 +1738,7 @@ static __kprobes void do_signext(unsigned long *valp, int size)
 	}
 }
 
-static __kprobes void do_byterev(unsigned long *valp, int size)
+static nokprobe_inline void do_byterev(unsigned long *valp, int size)
 {
 	switch (size) {
 	case 2:
@@ -1757,7 +1762,7 @@ static __kprobes void do_byterev(unsigned long *valp, int size)
  * or -1 if the instruction is one that should not be stepped,
  * such as an rfid, or a mtmsrd that would clear MSR_RI.
  */
-int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
+int emulate_step(struct pt_regs *regs, unsigned int instr)
 {
 	struct instruction_op op;
 	int r, err, size;
@@ -1988,3 +1993,4 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 	regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4);
 	return 1;
 }
+NOKPROBE_SYMBOL(emulate_step);
-- 
cgit v1.2.1


From 7aa5b018bf36f733345f8814393b48011110b555 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 19 Apr 2017 20:59:51 +0530
Subject: powerpc/kprobes: Blacklist exception handlers

Introduce __head_end to mark end of the early fixed sections and use it to
blacklist all exception handlers from kprobes.

mpe: We do not need to do anything special for relocatable kernels, where the
exception vectors are split from the main kernel, as the split vectors are
already excluded by the check for kernel_text_address().

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
[mpe: Move __head_end outside #ifdef 64-bit to unbreak the 32-bit build]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/sections.h | 2 ++
 arch/powerpc/kernel/kprobes.c       | 9 +++++++++
 arch/powerpc/kernel/vmlinux.lds.S   | 2 ++
 3 files changed, 13 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index 7dc006b58369..7902d6358854 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -6,6 +6,8 @@
 #include <linux/uaccess.h>
 #include <asm-generic/sections.h>
 
+extern char __head_end[];
+
 #ifdef __powerpc64__
 
 extern char __start_interrupts[];
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index a69c276dbb9c..fa3cfd90c83a 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -35,6 +35,7 @@
 #include <asm/code-patching.h>
 #include <asm/cacheflush.h>
 #include <asm/sstep.h>
+#include <asm/sections.h>
 #include <linux/uaccess.h>
 
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
@@ -42,6 +43,14 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
 struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
 
+bool arch_within_kprobe_blacklist(unsigned long addr)
+{
+	return  (addr >= (unsigned long)__kprobes_text_start &&
+		 addr < (unsigned long)__kprobes_text_end) ||
+		(addr >= (unsigned long)_stext &&
+		 addr < (unsigned long)__head_end);
+}
+
 int arch_prepare_kprobe(struct kprobe *p)
 {
 	int ret = 0;
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 7394b770ae1f..f6eee507e0b9 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -77,6 +77,8 @@ SECTIONS
 #endif
 	} :kernel
 
+	__head_end = .;
+
 	/*
 	 * If the build dies here, it's likely code in head_64.S is referencing
 	 * labels it can't reach, and the linker inserting stubs without the
-- 
cgit v1.2.1


From 9a914aa6824ac5d5fd3195ed422f31540a0ab767 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 19 Apr 2017 20:59:52 +0530
Subject: powerpc/kprobes: Blacklist common exception handlers

Blacklist all the exception common/OOL handlers as the kernel stack is not yet
setup, which means we can't take a trap at this point.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/head-64.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h
index 5067048daad4..86eb87382031 100644
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@@ -213,6 +213,7 @@ name:
 	USE_TEXT_SECTION();					\
 	.balign IFETCH_ALIGN_BYTES;				\
 	.global name;						\
+	_ASM_NOKPROBE_SYMBOL(name);				\
 	DEFINE_FIXED_SYMBOL(name);				\
 name:
 
-- 
cgit v1.2.1


From 2f59be5b970b503ca8db1cb723b155e455ebac8e Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 19 Apr 2017 18:22:24 +0530
Subject: powerpc/ftrace: Restore LR from pt_regs

Pass the real LR to the ftrace handler. This is needed for KPROBES_ON_FTRACE for
the pre handlers.

Also, with KPROBES_ON_FTRACE, the link register may be updated by the pre
handlers or by a registed kretprobe. Honor updated LR by restoring it from
pt_regs, rather than from the stack save area.

Live patch and function graph continue to work fine with this change.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/entry_64.S | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 6432d4bf08c8..b846f75d7584 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -1248,9 +1248,10 @@ _GLOBAL(ftrace_caller)
 
 	/* Get the _mcount() call site out of LR */
 	mflr	r7
-	/* Save it as pt_regs->nip & pt_regs->link */
+	/* Save it as pt_regs->nip */
 	std     r7, _NIP(r1)
-	std     r7, _LINK(r1)
+	/* Save the read LR in pt_regs->link */
+	std     r0, _LINK(r1)
 
 	/* Save callee's TOC in the ABI compliant location */
 	std	r2, 24(r1)
@@ -1297,16 +1298,16 @@ ftrace_call:
 	REST_8GPRS(16,r1)
 	REST_8GPRS(24,r1)
 
+	/* Restore possibly modified LR */
+	ld	r0, _LINK(r1)
+	mtlr	r0
+
 	/* Restore callee's TOC */
 	ld	r2, 24(r1)
 
 	/* Pop our stack frame */
 	addi r1, r1, SWITCH_FRAME_SIZE
 
-	/* Restore original LR for return to B */
-	ld	r0, LRSAVE(r1)
-	mtlr	r0
-
 #ifdef CONFIG_LIVEPATCH
         /* Based on the cmpd above, if the NIP was altered handle livepatch */
 	bne-	livepatch_handler
-- 
cgit v1.2.1


From ead514d5fb30a0889d51c0f0e35c3e346165a955 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 19 Apr 2017 18:22:26 +0530
Subject: powerpc/kprobes: Add support for KPROBES_ON_FTRACE

Allow kprobes to be placed on ftrace _mcount() call sites. This optimization
avoids the use of a trap, by riding on ftrace infrastructure.

This depends on HAVE_DYNAMIC_FTRACE_WITH_REGS which depends on MPROFILE_KERNEL,
which is only currently enabled on powerpc64le with newer toolchains.

Based on the x86 code by Masami.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig                 |   1 +
 arch/powerpc/include/asm/kprobes.h   |  10 ++++
 arch/powerpc/kernel/Makefile         |   3 +
 arch/powerpc/kernel/kprobes-ftrace.c | 104 +++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/kprobes.c        |   8 ++-
 5 files changed, 125 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/kprobes-ftrace.c

(limited to 'arch')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 97a8bc8a095c..f1c43d78056c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -142,6 +142,7 @@ config PPC
 	select HAVE_IRQ_EXIT_ON_IRQ_STACK
 	select HAVE_KERNEL_GZIP
 	select HAVE_KPROBES
+	select HAVE_KPROBES_ON_FTRACE
 	select HAVE_KRETPROBES
 	select HAVE_LIVEPATCH			if HAVE_DYNAMIC_FTRACE_WITH_REGS
 	select HAVE_MEMBLOCK
diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index a843884aafaf..a83821f33ea3 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -103,6 +103,16 @@ extern int kprobe_exceptions_notify(struct notifier_block *self,
 extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
 extern int kprobe_handler(struct pt_regs *regs);
 extern int kprobe_post_handler(struct pt_regs *regs);
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+			   struct kprobe_ctlblk *kcb);
+#else
+static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+				  struct kprobe_ctlblk *kcb)
+{
+	return 0;
+}
+#endif
 #else
 static inline int kprobe_handler(struct pt_regs *regs) { return 0; }
 static inline int kprobe_post_handler(struct pt_regs *regs) { return 0; }
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 811f441a125f..3e461637b64d 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -97,6 +97,7 @@ obj-$(CONFIG_BOOTX_TEXT)	+= btext.o
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_OPTPROBES)		+= optprobes.o optprobes_head.o
+obj-$(CONFIG_KPROBES_ON_FTRACE)	+= kprobes-ftrace.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o
 obj-$(CONFIG_PPC_UDBG_16550)	+= legacy_serial.o udbg_16550.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
@@ -150,6 +151,8 @@ GCOV_PROFILE_machine_kexec_32.o := n
 UBSAN_SANITIZE_machine_kexec_32.o := n
 GCOV_PROFILE_kprobes.o := n
 UBSAN_SANITIZE_kprobes.o := n
+GCOV_PROFILE_kprobes-ftrace.o := n
+UBSAN_SANITIZE_kprobes-ftrace.o := n
 UBSAN_SANITIZE_vdso.o := n
 
 extra-$(CONFIG_PPC_FPU)		+= fpu.o
diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c
new file mode 100644
index 000000000000..6c089d9757c9
--- /dev/null
+++ b/arch/powerpc/kernel/kprobes-ftrace.c
@@ -0,0 +1,104 @@
+/*
+ * Dynamic Ftrace based Kprobes Optimization
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) Hitachi Ltd., 2012
+ * Copyright 2016 Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
+ *		  IBM Corporation
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/ftrace.h>
+
+static nokprobe_inline
+int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+		      struct kprobe_ctlblk *kcb, unsigned long orig_nip)
+{
+	/*
+	 * Emulate singlestep (and also recover regs->nip)
+	 * as if there is a nop
+	 */
+	regs->nip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
+	if (unlikely(p->post_handler)) {
+		kcb->kprobe_status = KPROBE_HIT_SSDONE;
+		p->post_handler(p, regs, 0);
+	}
+	__this_cpu_write(current_kprobe, NULL);
+	if (orig_nip)
+		regs->nip = orig_nip;
+	return 1;
+}
+
+int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+		    struct kprobe_ctlblk *kcb)
+{
+	if (kprobe_ftrace(p))
+		return __skip_singlestep(p, regs, kcb, 0);
+	else
+		return 0;
+}
+NOKPROBE_SYMBOL(skip_singlestep);
+
+/* Ftrace callback handler for kprobes */
+void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
+			   struct ftrace_ops *ops, struct pt_regs *regs)
+{
+	struct kprobe *p;
+	struct kprobe_ctlblk *kcb;
+	unsigned long flags;
+
+	/* Disable irq for emulating a breakpoint and avoiding preempt */
+	local_irq_save(flags);
+	hard_irq_disable();
+
+	p = get_kprobe((kprobe_opcode_t *)nip);
+	if (unlikely(!p) || kprobe_disabled(p))
+		goto end;
+
+	kcb = get_kprobe_ctlblk();
+	if (kprobe_running()) {
+		kprobes_inc_nmissed_count(p);
+	} else {
+		unsigned long orig_nip = regs->nip;
+
+		/*
+		 * On powerpc, NIP is *before* this instruction for the
+		 * pre handler
+		 */
+		regs->nip -= MCOUNT_INSN_SIZE;
+
+		__this_cpu_write(current_kprobe, p);
+		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+		if (!p->pre_handler || !p->pre_handler(p, regs))
+			__skip_singlestep(p, regs, kcb, orig_nip);
+		/*
+		 * If pre_handler returns !0, it sets regs->nip and
+		 * resets current kprobe.
+		 */
+	}
+end:
+	local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
+
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+	p->ainsn.insn = NULL;
+	p->ainsn.boostable = -1;
+	return 0;
+}
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index b71922618ed2..50aabf819787 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -192,7 +192,11 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 bool arch_function_offset_within_entry(unsigned long offset)
 {
 #ifdef PPC64_ELF_ABI_v2
+#ifdef CONFIG_KPROBES_ON_FTRACE
+	return offset <= 16;
+#else
 	return offset <= 8;
+#endif
 #else
 	return !offset;
 #endif
@@ -301,7 +305,9 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 			}
 			p = __this_cpu_read(current_kprobe);
 			if (p->break_handler && p->break_handler(p, regs)) {
-				goto ss_probe;
+				if (!skip_singlestep(p, regs, kcb))
+					goto ss_probe;
+				ret = 1;
 			}
 		}
 		goto no_kprobe;
-- 
cgit v1.2.1


From 1b32cd1715378c9a3856df4a80920f8e241f914c Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 19 Apr 2017 18:22:27 +0530
Subject: powerpc: Introduce a new helper to obtain function entry points

kprobe_lookup_name() is specific to the kprobe subsystem and may not always
return the function entry point (in a subsequent patch for KPROBES_ON_FTRACE).
For looking up function entry points, introduce a separate helper and use it
in optprobes.c

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/code-patching.h | 41 ++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/optprobes.c          |  6 ++---
 2 files changed, 44 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index 8ab937771068..abef812de7f8 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -12,6 +12,8 @@
 
 #include <asm/types.h>
 #include <asm/ppc-opcode.h>
+#include <linux/string.h>
+#include <linux/kallsyms.h>
 
 /* Flags for create_branch:
  * "b"   == create_branch(addr, target, 0);
@@ -99,6 +101,45 @@ static inline unsigned long ppc_global_function_entry(void *func)
 #endif
 }
 
+/*
+ * Wrapper around kallsyms_lookup() to return function entry address:
+ * - For ABIv1, we lookup the dot variant.
+ * - For ABIv2, we return the local entry point.
+ */
+static inline unsigned long ppc_kallsyms_lookup_name(const char *name)
+{
+	unsigned long addr;
+#ifdef PPC64_ELF_ABI_v1
+	/* check for dot variant */
+	char dot_name[1 + KSYM_NAME_LEN];
+	bool dot_appended = false;
+
+	if (strnlen(name, KSYM_NAME_LEN) >= KSYM_NAME_LEN)
+		return 0;
+
+	if (name[0] != '.') {
+		dot_name[0] = '.';
+		dot_name[1] = '\0';
+		strlcat(dot_name, name, sizeof(dot_name));
+		dot_appended = true;
+	} else {
+		dot_name[0] = '\0';
+		strlcat(dot_name, name, sizeof(dot_name));
+	}
+	addr = kallsyms_lookup_name(dot_name);
+	if (!addr && dot_appended)
+		/* Let's try the original non-dot symbol lookup	*/
+		addr = kallsyms_lookup_name(name);
+#elif defined(PPC64_ELF_ABI_v2)
+	addr = kallsyms_lookup_name(name);
+	if (addr)
+		addr = ppc_function_entry((void *)addr);
+#else
+	addr = kallsyms_lookup_name(name);
+#endif
+	return addr;
+}
+
 #ifdef CONFIG_PPC64
 /*
  * Some instruction encodings commonly used in dynamic ftracing
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index ce81a322251c..ec60ed0d4aad 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -243,10 +243,10 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
 	/*
 	 * 2. branch to optimized_callback() and emulate_step()
 	 */
-	op_callback_addr = kprobe_lookup_name("optimized_callback", 0);
-	emulate_step_addr = kprobe_lookup_name("emulate_step", 0);
+	op_callback_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("optimized_callback");
+	emulate_step_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("emulate_step");
 	if (!op_callback_addr || !emulate_step_addr) {
-		WARN(1, "kprobe_lookup_name() failed\n");
+		WARN(1, "Unable to lookup optimized_callback()/emulate_step()\n");
 		goto error;
 	}
 
-- 
cgit v1.2.1


From 24bd909e94776ecce95291bff910f14c78ac4a43 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 19 Apr 2017 18:22:28 +0530
Subject: powerpc/kprobes: Prefer ftrace when probing function entry

KPROBES_ON_FTRACE avoids much of the overhead of regular kprobes as it
eliminates the need for a trap, as well as the need to emulate or single-step
instructions.

Though OPTPROBES provides us with similar performance, we have limited
optprobes trampoline slots. As such, when asked to probe at a function
entry, default to using the ftrace infrastructure.

With:
  # cd /sys/kernel/debug/tracing
  # echo 'p _do_fork' > kprobe_events

before patch:
  # cat ../kprobes/list
  c0000000000daf08  k  _do_fork+0x8    [DISABLED]
  c000000000044fc0  k  kretprobe_trampoline+0x0    [OPTIMIZED]

and after patch:
  # cat ../kprobes/list
  c0000000000d074c  k  _do_fork+0xc    [DISABLED][FTRACE]
  c0000000000412b0  k  kretprobe_trampoline+0x0    [OPTIMIZED]

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 50aabf819787..ca040e1be892 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -49,8 +49,21 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset)
 #ifdef PPC64_ELF_ABI_v2
 	/* PPC64 ABIv2 needs local entry point */
 	addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
-	if (addr && !offset)
-		addr = (kprobe_opcode_t *)ppc_function_entry(addr);
+	if (addr && !offset) {
+#ifdef CONFIG_KPROBES_ON_FTRACE
+		unsigned long faddr;
+		/*
+		 * Per livepatch.h, ftrace location is always within the first
+		 * 16 bytes of a function on powerpc with -mprofile-kernel.
+		 */
+		faddr = ftrace_location_range((unsigned long)addr,
+					      (unsigned long)addr + 16);
+		if (faddr)
+			addr = (kprobe_opcode_t *)faddr;
+		else
+#endif
+			addr = (kprobe_opcode_t *)ppc_function_entry(addr);
+	}
 #elif defined(PPC64_ELF_ABI_v1)
 	/*
 	 * 64bit powerpc ABIv1 uses function descriptors:
-- 
cgit v1.2.1


From 856736466806ceb72c7402eb0f06bea8f5b83c5a Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 24 Apr 2017 10:35:14 +1000
Subject: powerpc/xmon: Deindent the SLB dumping logic

Currently the code that dumps SLB entries uses a double-nested if. This
means the actual dumping logic is a bit squashed. Deindent it by using
continue.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Reviewed-by: Rashmica Gupta <rashmica.g@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/xmon/xmon.c | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index f77a104abf9f..2b21e90fff8d 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -3157,23 +3157,28 @@ void dump_segments(void)
 	for (i = 0; i < mmu_slb_size; i++) {
 		asm volatile("slbmfee  %0,%1" : "=r" (esid) : "r" (i));
 		asm volatile("slbmfev  %0,%1" : "=r" (vsid) : "r" (i));
-		if (esid || vsid) {
-			printf("%02d %016lx %016lx", i, esid, vsid);
-			if (esid & SLB_ESID_V) {
-				llp = vsid & SLB_VSID_LLP;
-				if (vsid & SLB_VSID_B_1T) {
-					printf("  1T  ESID=%9lx  VSID=%13lx LLP:%3lx \n",
-						GET_ESID_1T(esid),
-						(vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T,
-						llp);
-				} else {
-					printf(" 256M ESID=%9lx  VSID=%13lx LLP:%3lx \n",
-						GET_ESID(esid),
-						(vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT,
-						llp);
-				}
-			} else
-				printf("\n");
+
+		if (!esid && !vsid)
+			continue;
+
+		printf("%02d %016lx %016lx", i, esid, vsid);
+
+		if (!(esid & SLB_ESID_V)) {
+			printf("\n");
+			continue;
+		}
+
+		llp = vsid & SLB_VSID_LLP;
+		if (vsid & SLB_VSID_B_1T) {
+			printf("  1T  ESID=%9lx  VSID=%13lx LLP:%3lx \n",
+				GET_ESID_1T(esid),
+				(vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T,
+				llp);
+		} else {
+			printf(" 256M ESID=%9lx  VSID=%13lx LLP:%3lx \n",
+				GET_ESID(esid),
+				(vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT,
+				llp);
 		}
 	}
 }
-- 
cgit v1.2.1


From 68baf692c435339e6295cb470ea5545cbc28160e Mon Sep 17 00:00:00 2001
From: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
Date: Mon, 17 Apr 2017 20:21:40 -0400
Subject: powerpc/pseries: Fix of_node_put() underflow during DLPAR remove

Historically struct device_node references were tracked using a kref embedded as
a struct field. Commit 75b57ecf9d1d ("of: Make device nodes kobjects so they
show up in sysfs") (Mar 2014) refactored device_nodes to be kobjects such that
the device tree could by more simply exposed to userspace using sysfs.

Commit 0829f6d1f69e ("of: device_node kobject lifecycle fixes") (Mar 2014)
followed up these changes to better control the kobject lifecycle and in
particular the referecne counting via of_node_get(), of_node_put(), and
of_node_init().

A result of this second commit was that it introduced an of_node_put() call when
a dynamic node is detached, in of_node_remove(), that removes the initial kobj
reference created by of_node_init().

Traditionally as the original dynamic device node user the pseries code had
assumed responsibilty for releasing this final reference in its platform
specific DLPAR detach code.

This patch fixes a refcount underflow introduced by commit 0829f6d1f6, and
recently exposed by the upstreaming of the recount API.

Messages like the following are no longer seen in the kernel log with this
patch following DLPAR remove operations of cpus and pci devices.

  rpadlpar_io: slot PHB 72 removed
  refcount_t: underflow; use-after-free.
  ------------[ cut here ]------------
  WARNING: CPU: 5 PID: 3335 at lib/refcount.c:128 refcount_sub_and_test+0xf4/0x110

Fixes: 0829f6d1f69e ("of: device_node kobject lifecycle fixes")
Cc: stable@vger.kernel.org # v3.15+
Signed-off-by: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
[mpe: Make change log commit references more verbose]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/dlpar.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 193e052fa0dd..bda18d8e1674 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -288,7 +288,6 @@ int dlpar_detach_node(struct device_node *dn)
 	if (rc)
 		return rc;
 
-	of_node_put(dn); /* Must decrement the refcount */
 	return 0;
 }
 
-- 
cgit v1.2.1


From e76ca27790a514590af782f83f6eae49e0ccf8c9 Mon Sep 17 00:00:00 2001
From: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
Date: Mon, 17 Apr 2017 20:24:39 -0400
Subject: powerpc/sysfs: Fix reference leak of cpu device_nodes present at boot

For CPUs present at boot each logical CPU acquires a reference to the
associated device node of the core. This happens in register_cpu() which
is called by topology_init(). The result of this is that we end up with
a reference held by each thread of the core. However, these references
are never freed if the CPU core is DLPAR removed.

This patch fixes the reference leaks by acquiring and releasing the references
in the CPU hotplug callbacks un/register_cpu_online(). With this patch symmetric
reference counting is observed with both CPUs present at boot, and those DLPAR
added after boot.

Fixes: f86e4718f24b ("driver/core: cpu: initialize of_node in cpu's device struture")
Cc: stable@vger.kernel.org # v3.12+
Signed-off-by: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/sysfs.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index c1fb255a60d6..949957b97ead 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -710,6 +710,10 @@ static int register_cpu_online(unsigned int cpu)
 	struct device_attribute *attrs, *pmc_attrs;
 	int i, nattrs;
 
+	/* For cpus present at boot a reference was already grabbed in register_cpu() */
+	if (!s->of_node)
+		s->of_node = of_get_cpu_node(cpu, NULL);
+
 #ifdef CONFIG_PPC64
 	if (cpu_has_feature(CPU_FTR_SMT))
 		device_create_file(s, &dev_attr_smt_snooze_delay);
@@ -864,6 +868,8 @@ static int unregister_cpu_online(unsigned int cpu)
 	}
 #endif
 	cacheinfo_cpu_offline(cpu);
+	of_node_put(s->of_node);
+	s->of_node = NULL;
 #endif /* CONFIG_HOTPLUG_CPU */
 	return 0;
 }
-- 
cgit v1.2.1


From 9765ad134a00a01cbcc69c78ff6defbfad209bc5 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 19 Apr 2017 16:38:26 +1000
Subject: powerpc/mm: Ensure IRQs are off in switch_mm()

powerpc expects IRQs to already be (soft) disabled when switch_mm() is
called, as made clear in the commit message of 9c1e105238c4 ("powerpc: Allow
perf_counters to access user memory at interrupt time").

Aside from any race conditions that might exist between switch_mm() and an IRQ,
there is also an unconditional hard_irq_disable() in switch_slb(). If that isn't
followed at some point by an IRQ enable then interrupts will remain disabled
until we return to userspace.

It is true that when switch_mm() is called from the scheduler IRQs are off, but
not when it's called by use_mm(). Looking closer we see that last year in commit
f98db6013c55 ("sched/core: Add switch_mm_irqs_off() and use it in the scheduler")
this was made more explicit by the addition of switch_mm_irqs_off() which is now
called by the scheduler, vs switch_mm() which is used by use_mm().

Arguably it is a bug in use_mm() to call switch_mm() in a different context than
it expects, but fixing that will take time.

This was discovered recently when vhost started throwing warnings such as:

  BUG: sleeping function called from invalid context at kernel/mutex.c:578
  in_atomic(): 0, irqs_disabled(): 1, pid: 10768, name: vhost-10760
  no locks held by vhost-10760/10768.
  irq event stamp: 10
  hardirqs last  enabled at (9):  _raw_spin_unlock_irq+0x40/0x80
  hardirqs last disabled at (10): switch_slb+0x2e4/0x490
  softirqs last  enabled at (0):  copy_process+0x5e8/0x1260
  softirqs last disabled at (0):  (null)
  Call Trace:
    show_stack+0x88/0x390 (unreliable)
    dump_stack+0x30/0x44
    __might_sleep+0x1c4/0x2d0
    mutex_lock_nested+0x74/0x5c0
    cgroup_attach_task_all+0x5c/0x180
    vhost_attach_cgroups_work+0x58/0x80 [vhost]
    vhost_worker+0x24c/0x3d0 [vhost]
    kthread+0xec/0x100
    ret_from_kernel_thread+0x5c/0xd4

Prior to commit 04b96e5528ca ("vhost: lockless enqueuing") (Aug 2016) the
vhost_worker() would do a spin_unlock_irq() not long after calling use_mm(),
which had the effect of reenabling IRQs. Since that commit removed the locking
in vhost_worker() the body of the vhost_worker() loop now runs with interrupts
off causing the warnings.

This patch addresses the problem by making the powerpc code mirror the x86 code,
ie. we disable interrupts in switch_mm(), and optimise the scheduler case by
defining switch_mm_irqs_off().

Cc: stable@vger.kernel.org # v4.7+
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[mpe: Flesh out/rewrite change log, add stable]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/mmu_context.h | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 78803a7ebdd9..a114248de2ee 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -71,8 +71,9 @@ extern void drop_cop(unsigned long acop, struct mm_struct *mm);
  * switch_mm is the entry point called from the architecture independent
  * code in kernel/sched/core.c
  */
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
-			     struct task_struct *tsk)
+static inline void switch_mm_irqs_off(struct mm_struct *prev,
+				      struct mm_struct *next,
+				      struct task_struct *tsk)
 {
 	/* Mark this context has been used on the new CPU */
 	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next)))
@@ -111,6 +112,18 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	switch_mmu_context(prev, next, tsk);
 }
 
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+			     struct task_struct *tsk)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	switch_mm_irqs_off(prev, next, tsk);
+	local_irq_restore(flags);
+}
+#define switch_mm_irqs_off switch_mm_irqs_off
+
+
 #define deactivate_mm(tsk,mm)	do { } while (0)
 
 /*
-- 
cgit v1.2.1


From b409946b2a3c1ddcde75e5f35a77e03f4d354be0 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 25 Apr 2017 20:49:24 +1000
Subject: powerpc/mm: Fix possible out-of-bounds shift in arch_mmap_rnd()

The recent patch to add runtime configuration of the ASLR limits added a bug in
arch_mmap_rnd() where we may shift an integer (32-bits) by up to 33 bits,
leading to undefined behaviour.

In practice it exhibits as every process seg faulting instantly, presumably
because the rnd value hasn't been restricited by the modulus at all. We didn't
notice because it only happens under certain kernel configurations and if the
number of bits is actually set to a large value.

Fix it by switching to unsigned long.

Fixes: 9fea59bd7ca5 ("powerpc/mm: Add support for runtime configuration of ASLR limits")
Reported-by: Balbir Singh <bsingharora@gmail.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/mmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 005aa8a44915..9dbd2a733d6b 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -66,7 +66,7 @@ unsigned long arch_mmap_rnd(void)
 	if (is_32bit_task())
 		shift = mmap_rnd_compat_bits;
 #endif
-	rnd = get_random_long() % (1 << shift);
+	rnd = get_random_long() % (1ul << shift);
 
 	return rnd << PAGE_SHIFT;
 }
-- 
cgit v1.2.1


From 83c4919058459c32138a1ebe35f72b7013a9ddbc Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 26 Apr 2017 15:57:19 +1000
Subject: powerpc/powernv: Fix missing attr initialisation in
 opal_export_attrs()

In opal_export_attrs() we dynamically allocate some bin_attributes. They're
allocated with kmalloc() and although we initialise most of the fields, we don't
initialise write() or mmap(), and in particular we don't initialise the lockdep
related fields in the embedded struct attribute.

This leads to a lockdep warning at boot:

  BUG: key c0000000f11906d8 not in .data!
  WARNING: CPU: 0 PID: 1 at ../kernel/locking/lockdep.c:3136 lockdep_init_map+0x28c/0x2a0
  ...
  Call Trace:
    lockdep_init_map+0x288/0x2a0 (unreliable)
    __kernfs_create_file+0x8c/0x170
    sysfs_add_file_mode_ns+0xc8/0x240
    __machine_initcall_powernv_opal_init+0x60c/0x684
    do_one_initcall+0x60/0x1c0
    kernel_init_freeable+0x2f4/0x3d4
    kernel_init+0x24/0x160
    ret_from_kernel_thread+0x5c/0xb0

Fix it by kzalloc'ing the attr, which fixes the uninitialised write() and
mmap(), and calling sysfs_bin_attr_init() on it to initialise the lockdep
fields.

Fixes: 11fe909d2362 ("powerpc/powernv: Add OPAL exports attributes to sysfs")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 76e153fc1f93..7925a9d72cca 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -637,13 +637,14 @@ static void opal_export_attrs(void)
 		if (of_property_read_u64_array(np, prop->name, &vals[0], 2))
 			continue;
 
-		attr = kmalloc(sizeof(*attr), GFP_KERNEL);
+		attr = kzalloc(sizeof(*attr), GFP_KERNEL);
 
 		if (attr == NULL) {
 			pr_warn("Failed kmalloc for bin_attribute!");
 			continue;
 		}
 
+		sysfs_bin_attr_init(attr);
 		attr->attr.name = kstrdup(prop->name, GFP_KERNEL);
 		attr->attr.mode = 0400;
 		attr->read = export_attr_read;
-- 
cgit v1.2.1


From 45b21cfeb22087795f0b49397fbe529efeb99baf Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 26 Apr 2017 17:32:38 +1000
Subject: powerpc/powernv: Fix oops on P9 DD1 in cause_ipi()

Recently we merged the native xive support for Power9, and then separately some
reworks for doorbell IPI support. In isolation both series were OK, but the
merged result had a bug in one case.

On P9 DD1 we use pnv_p9_dd1_cause_ipi() which tries to use doorbells, and then
falls back to the interrupt controller. However the fallback is implemented by
calling icp_ops->cause_ipi. But now that xive support is merged we might be
using xive, in which case icp_ops is not initialised, it's a xics specific
structure. This leads to an oops such as:

  Unable to handle kernel paging request for data at address 0x00000028
  Oops: Kernel access of bad area, sig: 11 [#1]
  NIP pnv_p9_dd1_cause_ipi+0x74/0xe0
  LR smp_muxed_ipi_message_pass+0x54/0x70

To fix it, rather than using icp_ops which might be NULL, have both xics and
xive set smp_ops->cause_ipi, and then in the powernv code we save that as
ic_cause_ipi before overriding smp_ops->cause_ipi. For paranoia add a WARN_ON()
to check if somehow smp_ops->cause_ipi is NULL.

Fixes: b866cc2199d6 ("powerpc: Change the doorbell IPI calling convention")
Tested-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/smp.c   | 12 ++++++++----
 arch/powerpc/sysdev/xics/xics-common.c |  3 +++
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 951a2e230cfa..5189445db164 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -249,12 +249,15 @@ static int pnv_smp_prepare_cpu(int cpu)
 	return 0;
 }
 
+/* Cause IPI as setup by the interrupt controller (xics or xive) */
+static void (*ic_cause_ipi)(int cpu);
+
 static void pnv_cause_ipi(int cpu)
 {
 	if (doorbell_try_core_ipi(cpu))
 		return;
 
-	icp_ops->cause_ipi(cpu);
+	ic_cause_ipi(cpu);
 }
 
 static void pnv_p9_dd1_cause_ipi(int cpu)
@@ -269,7 +272,7 @@ static void pnv_p9_dd1_cause_ipi(int cpu)
 	if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu)))
 		doorbell_global_ipi(cpu);
 	else
-		icp_ops->cause_ipi(cpu);
+		ic_cause_ipi(cpu);
 
 	put_cpu();
 }
@@ -282,6 +285,9 @@ static void __init pnv_smp_probe(void)
 		xics_smp_probe();
 
 	if (cpu_has_feature(CPU_FTR_DBELL)) {
+		ic_cause_ipi = smp_ops->cause_ipi;
+		WARN_ON(!ic_cause_ipi);
+
 		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
 			if (cpu_has_feature(CPU_FTR_POWER9_DD1))
 				smp_ops->cause_ipi = pnv_p9_dd1_cause_ipi;
@@ -290,8 +296,6 @@ static void __init pnv_smp_probe(void)
 		} else {
 			smp_ops->cause_ipi = pnv_cause_ipi;
 		}
-	} else {
-		smp_ops->cause_ipi = icp_ops->cause_ipi;
 	}
 }
 
diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c
index 2eb53c12ee6e..ffe138b8b9dc 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -145,6 +145,9 @@ void __init xics_smp_probe(void)
 {
 	/* Register all the IPIs */
 	xics_request_ipi();
+
+	/* Setup cause_ipi callback based on which ICP is used */
+	smp_ops->cause_ipi = icp_ops->cause_ipi;
 }
 
 #endif /* CONFIG_SMP */
-- 
cgit v1.2.1


From cf4f08bed876fbc50e64da92d2e1cfdf7653ddb2 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 26 Apr 2017 21:38:30 +1000
Subject: powerpc/mm/radix: Optimise Page Walk Cache flush

Currently we implement flushing of the page walk cache (PWC) by calling
_tlbiel_pid() with a RIC (Radix Invalidation Control) value of 1 which says to
only flush the PWC.

But _tlbiel_pid() loops over each set (congruence class) of the TLB, which is
not necessary when we're just flushing the PWC.

In fact the set argument is ignored for a PWC flush, so essentially we're just
flushing the PWC 127 extra times for no benefit.

Fix it by adding tlbiel_pwc() which just does a single flush of the PWC.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
[mpe: Split out of combined patch, drop _ in name, rewrite change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/tlb-radix.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index b68b5219cf45..ae2e799822bd 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -53,6 +53,17 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
 	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
 }
 
+static inline void tlbiel_pwc(unsigned long pid)
+{
+	asm volatile("ptesync": : :"memory");
+
+	/* For PWC flush, we don't look at set number */
+	__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+
+	asm volatile("ptesync": : :"memory");
+	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
 static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
@@ -140,7 +151,7 @@ void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
 
 	pid = mm->context.id;
 	if (pid != MMU_NO_CONTEXT)
-		_tlbiel_pid(pid, RIC_FLUSH_PWC);
+		tlbiel_pwc(pid);
 
 	preempt_enable();
 }
@@ -222,7 +233,7 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
 		if (lock_tlbie)
 			raw_spin_unlock(&native_tlbie_lock);
 	} else
-		_tlbiel_pid(pid, RIC_FLUSH_PWC);
+		tlbiel_pwc(pid);
 no_context:
 	preempt_enable();
 }
-- 
cgit v1.2.1


From a5998fcb92552a18713b6aa5c146aa400e4d75ee Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 26 Apr 2017 21:38:17 +1000
Subject: powerpc/mm/radix: Optimise tlbiel flush all case

_tlbiel_pid() is called with a ric (Radix Invalidation Control) argument of
either RIC_FLUSH_TLB or RIC_FLUSH_ALL.

RIC_FLUSH_ALL says to invalidate the entire TLB and the Page Walk Cache (PWC).

To flush the whole TLB, we have to iterate over each set (congruence class) of
the TLB. Currently we do that and pass RIC_FLUSH_ALL each time. That is not
incorrect but it means we flush the PWC 128 times, when once would suffice.

Fix it by doing the first flush with the ric value we're passed, and then if it
was RIC_FLUSH_ALL, we downgrade it to RIC_FLUSH_TLB, because we know we have
just flushed the PWC and don't need to do it again.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
[mpe: Split out of combined patch, tweak logic, rewrite change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/tlb-radix.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index ae2e799822bd..5e17c4e873a5 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -46,9 +46,20 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
 	int set;
 
 	asm volatile("ptesync": : :"memory");
-	for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
+
+	/*
+	 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+	 * also flush the entire Page Walk Cache.
+	 */
+	__tlbiel_pid(pid, 0, ric);
+
+	if (ric == RIC_FLUSH_ALL)
+		/* For the remaining sets, just flush the TLB */
+		ric = RIC_FLUSH_TLB;
+
+	for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
 		__tlbiel_pid(pid, set, ric);
-	}
+
 	asm volatile("ptesync": : :"memory");
 	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
 }
-- 
cgit v1.2.1


From 6c01bbd2cf8cbc1906818b402b10bca2283c4e7e Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 18 Apr 2017 08:20:13 +0200
Subject: powerpc/mm: Fix page table dump build on PPC32

On PPC32 (eg. mpc885_ads_defconfig), page table dump compilation fails as
follows. This is because the memory layout is slightly different on PPC32. This
patch adapts it.

  arch/powerpc/mm/dump_linuxpagetables.c: In function 'walk_pagetables':
  arch/powerpc/mm/dump_linuxpagetables.c:369:10: error: 'KERN_VIRT_START' undeclared (first use in this function)
  ...

Fixes: 8eb07b187000d ("powerpc/mm: Dump linux pagetables")
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/dump_linuxpagetables.c | 59 +++++++++++++++++++++++++++-------
 1 file changed, 48 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index b4cb7818dfd0..bb136ac6ac55 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -26,6 +26,10 @@
 #include <asm/page.h>
 #include <asm/pgalloc.h>
 
+#ifdef CONFIG_PPC32
+#define KERN_VIRT_START	0
+#endif
+
 /*
  * To visualise what is happening,
  *
@@ -71,6 +75,7 @@ static struct addr_marker address_markers[] = {
 	{ 0,	"Start of kernel VM" },
 	{ 0,	"vmalloc() Area" },
 	{ 0,	"vmalloc() End" },
+#ifdef CONFIG_PPC64
 	{ 0,	"isa I/O start" },
 	{ 0,	"isa I/O end" },
 	{ 0,	"phb I/O start" },
@@ -78,6 +83,20 @@ static struct addr_marker address_markers[] = {
 	{ 0,	"I/O remap start" },
 	{ 0,	"I/O remap end" },
 	{ 0,	"vmemmap start" },
+#else
+	{ 0,	"Early I/O remap start" },
+	{ 0,	"Early I/O remap end" },
+#ifdef CONFIG_NOT_COHERENT_CACHE
+	{ 0,	"Consistent mem start" },
+	{ 0,	"Consistent mem end" },
+#endif
+#ifdef CONFIG_HIGHMEM
+	{ 0,	"Highmem PTEs start" },
+	{ 0,	"Highmem PTEs end" },
+#endif
+	{ 0,	"Fixmap start" },
+	{ 0,	"Fixmap end" },
+#endif
 	{ -1,	NULL },
 };
 
@@ -404,20 +423,38 @@ static void walk_pagetables(struct pg_state *st)
 
 static void populate_markers(void)
 {
-	address_markers[0].start_address = PAGE_OFFSET;
-	address_markers[1].start_address = VMALLOC_START;
-	address_markers[2].start_address = VMALLOC_END;
-	address_markers[3].start_address = ISA_IO_BASE;
-	address_markers[4].start_address = ISA_IO_END;
-	address_markers[5].start_address = PHB_IO_BASE;
-	address_markers[6].start_address = PHB_IO_END;
-	address_markers[7].start_address = IOREMAP_BASE;
-	address_markers[8].start_address = IOREMAP_END;
+	int i = 0;
+
+	address_markers[i++].start_address = PAGE_OFFSET;
+	address_markers[i++].start_address = VMALLOC_START;
+	address_markers[i++].start_address = VMALLOC_END;
+#ifdef CONFIG_PPC64
+	address_markers[i++].start_address = ISA_IO_BASE;
+	address_markers[i++].start_address = ISA_IO_END;
+	address_markers[i++].start_address = PHB_IO_BASE;
+	address_markers[i++].start_address = PHB_IO_END;
+	address_markers[i++].start_address = IOREMAP_BASE;
+	address_markers[i++].start_address = IOREMAP_END;
 #ifdef CONFIG_PPC_STD_MMU_64
-	address_markers[9].start_address =  H_VMEMMAP_BASE;
+	address_markers[i++].start_address =  H_VMEMMAP_BASE;
 #else
-	address_markers[9].start_address =  VMEMMAP_BASE;
+	address_markers[i++].start_address =  VMEMMAP_BASE;
+#endif
+#else /* !CONFIG_PPC64 */
+	address_markers[i++].start_address = ioremap_bot;
+	address_markers[i++].start_address = IOREMAP_TOP;
+#ifdef CONFIG_NOT_COHERENT_CACHE
+	address_markers[i++].start_address = IOREMAP_TOP;
+	address_markers[i++].start_address = IOREMAP_TOP +
+					     CONFIG_CONSISTENT_SIZE;
+#endif
+#ifdef CONFIG_HIGHMEM
+	address_markers[i++].start_address = PKMAP_BASE;
+	address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP);
 #endif
+	address_markers[i++].start_address = FIXADDR_START;
+	address_markers[i++].start_address = FIXADDR_TOP;
+#endif /* CONFIG_PPC64 */
 }
 
 static int ptdump_show(struct seq_file *m, void *v)
-- 
cgit v1.2.1


From fd893fe56a1307d348fe5c077eb9b654288ce0c5 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Fri, 14 Apr 2017 07:45:16 +0200
Subject: powerpc/mm: Fix missing page attributes in page table dump

On some targets, _PAGE_RW is 0 and this is _PAGE_RO which is used.
There is also _PAGE_SHARED that is missing.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 1 +
 arch/powerpc/mm/dump_linuxpagetables.c       | 9 +++++++++
 2 files changed, 10 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index fb8380a2d8d5..85bc9875c3be 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -13,6 +13,7 @@
 #define _PAGE_BIT_SWAP_TYPE	0
 
 #define _PAGE_RO		0
+#define _PAGE_SHARED		0
 
 #define _PAGE_EXEC		0x00001 /* execute permission */
 #define _PAGE_WRITE		0x00002 /* write access allowed */
diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index bb136ac6ac55..ec4906994726 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -121,8 +121,13 @@ static const struct flag_info flag_array[] = {
 		.set	= "user",
 		.clear	= "    ",
 	}, {
+#if _PAGE_RO == 0
 		.mask	= _PAGE_RW,
 		.val	= _PAGE_RW,
+#else
+		.mask	= _PAGE_RO,
+		.val	= 0,
+#endif
 		.set	= "rw",
 		.clear	= "ro",
 	}, {
@@ -222,6 +227,10 @@ static const struct flag_info flag_array[] = {
 		.mask	= _PAGE_SPECIAL,
 		.val	= _PAGE_SPECIAL,
 		.set	= "special",
+	}, {
+		.mask	= _PAGE_SHARED,
+		.val	= _PAGE_SHARED,
+		.set	= "shared",
 	}
 };
 
-- 
cgit v1.2.1


From 78a18dbf0104641662d6c28e6832cc24776ba2bc Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Thu, 13 Apr 2017 14:41:40 +0200
Subject: powerpc/mm: On PPC32, display 32 bits addresses in page table dump

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/dump_linuxpagetables.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index ec4906994726..a5b8c16c3d69 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -295,8 +295,13 @@ static void dump_addr(struct pg_state *st, unsigned long addr)
 	const char *unit = units;
 	unsigned long delta;
 
+#ifdef CONFIG_PPC64
 	seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1);
 	seq_printf(st->seq, "0x%016lx ", st->start_pa);
+#else
+	seq_printf(st->seq, "0x%08lx-0x%08lx ", st->start_address, addr - 1);
+	seq_printf(st->seq, "0x%08lx ", st->start_pa);
+#endif
 
 	delta = (addr - st->start_address) >> 10;
 	/* Work out what appropriate unit to use */
-- 
cgit v1.2.1


From 2505820f7ce2da2ef4ca5bd02fc239df0ba87c9f Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 18 Apr 2017 08:20:15 +0200
Subject: powerpc/mm: Rename table dump file name

Page table dump debugfs file is named 'kernel_page_tables' on
all other architectures implementing it, while is is named
'kernel_pagetables' on powerpc. This patch renames it.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/dump_linuxpagetables.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index a5b8c16c3d69..d659345a98d6 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -513,7 +513,7 @@ static int ptdump_init(void)
 
 	populate_markers();
 	build_pgtable_complete_mask();
-	debugfs_file = debugfs_create_file("kernel_pagetables", 0400, NULL,
+	debugfs_file = debugfs_create_file("kernel_page_tables", 0400, NULL,
 			NULL, &ptdump_fops);
 	return debugfs_file ? 0 : -ENOMEM;
 }
-- 
cgit v1.2.1


From 7853f9c029ac9134df42ea9e0d6bc600180f268d Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Tue, 25 Apr 2017 19:25:53 +0530
Subject: powerpc: Split ftrace bits into a separate file

entry_*.S now includes a lot more than just kernel entry/exit code. As a
first step at cleaning this up, let's split out the ftrace bits into
separate files. Also move all related tracing code into a new trace/
subdirectory.

No functional changes.

Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/Makefile            |   9 +-
 arch/powerpc/kernel/entry_32.S          | 107 ------
 arch/powerpc/kernel/entry_64.S          | 378 -------------------
 arch/powerpc/kernel/ftrace.c            | 620 --------------------------------
 arch/powerpc/kernel/trace/Makefile      |  24 ++
 arch/powerpc/kernel/trace/ftrace.c      | 620 ++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/trace/ftrace_32.S   | 118 ++++++
 arch/powerpc/kernel/trace/ftrace_64.S   | 389 ++++++++++++++++++++
 arch/powerpc/kernel/trace/trace_clock.c |  15 +
 arch/powerpc/kernel/trace_clock.c       |  15 -
 10 files changed, 1167 insertions(+), 1128 deletions(-)
 delete mode 100644 arch/powerpc/kernel/ftrace.c
 create mode 100644 arch/powerpc/kernel/trace/Makefile
 create mode 100644 arch/powerpc/kernel/trace/ftrace.c
 create mode 100644 arch/powerpc/kernel/trace/ftrace_32.S
 create mode 100644 arch/powerpc/kernel/trace/ftrace_64.S
 create mode 100644 arch/powerpc/kernel/trace/trace_clock.c
 delete mode 100644 arch/powerpc/kernel/trace_clock.c

(limited to 'arch')

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 3e461637b64d..b9db46ae545b 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -25,8 +25,6 @@ CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
-# do not trace tracer code
-CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
 # timers used by tracing
 CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
 endif
@@ -119,10 +117,7 @@ obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_PPC_IO_WORKAROUNDS)	+= io-workarounds.o
 
-obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
-obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
-obj-$(CONFIG_TRACING)		+= trace_clock.o
+obj-y				+= trace/
 
 ifneq ($(CONFIG_PPC_INDIRECT_PIO),y)
 obj-y				+= iomap.o
@@ -143,8 +138,6 @@ obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
 # Disable GCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
 UBSAN_SANITIZE_prom_init.o := n
-GCOV_PROFILE_ftrace.o := n
-UBSAN_SANITIZE_ftrace.o := n
 GCOV_PROFILE_machine_kexec_64.o := n
 UBSAN_SANITIZE_machine_kexec_64.o := n
 GCOV_PROFILE_machine_kexec_32.o := n
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index a38600949f3a..8587059ad848 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -31,7 +31,6 @@
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/unistd.h>
-#include <asm/ftrace.h>
 #include <asm/ptrace.h>
 #include <asm/export.h>
 
@@ -1315,109 +1314,3 @@ machine_check_in_rtas:
 	/* XXX load up BATs and panic */
 
 #endif /* CONFIG_PPC_RTAS */
-
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-_GLOBAL(mcount)
-_GLOBAL(_mcount)
-	/*
-	 * It is required that _mcount on PPC32 must preserve the
-	 * link register. But we have r0 to play with. We use r0
-	 * to push the return address back to the caller of mcount
-	 * into the ctr register, restore the link register and
-	 * then jump back using the ctr register.
-	 */
-	mflr	r0
-	mtctr	r0
-	lwz	r0, 4(r1)
-	mtlr	r0
-	bctr
-
-_GLOBAL(ftrace_caller)
-	MCOUNT_SAVE_FRAME
-	/* r3 ends up with link register */
-	subi	r3, r3, MCOUNT_INSN_SIZE
-.globl ftrace_call
-ftrace_call:
-	bl	ftrace_stub
-	nop
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
-	b	ftrace_graph_stub
-_GLOBAL(ftrace_graph_stub)
-#endif
-	MCOUNT_RESTORE_FRAME
-	/* old link register ends up in ctr reg */
-	bctr
-#else
-_GLOBAL(mcount)
-_GLOBAL(_mcount)
-
-	MCOUNT_SAVE_FRAME
-
-	subi	r3, r3, MCOUNT_INSN_SIZE
-	LOAD_REG_ADDR(r5, ftrace_trace_function)
-	lwz	r5,0(r5)
-
-	mtctr	r5
-	bctrl
-	nop
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	b	ftrace_graph_caller
-#endif
-	MCOUNT_RESTORE_FRAME
-	bctr
-#endif
-EXPORT_SYMBOL(_mcount)
-
-_GLOBAL(ftrace_stub)
-	blr
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-_GLOBAL(ftrace_graph_caller)
-	/* load r4 with local address */
-	lwz	r4, 44(r1)
-	subi	r4, r4, MCOUNT_INSN_SIZE
-
-	/* Grab the LR out of the caller stack frame */
-	lwz	r3,52(r1)
-
-	bl	prepare_ftrace_return
-	nop
-
-        /*
-         * prepare_ftrace_return gives us the address we divert to.
-         * Change the LR in the callers stack frame to this.
-         */
-	stw	r3,52(r1)
-
-	MCOUNT_RESTORE_FRAME
-	/* old link register ends up in ctr reg */
-	bctr
-
-_GLOBAL(return_to_handler)
-	/* need to save return values */
-	stwu	r1, -32(r1)
-	stw	r3, 20(r1)
-	stw	r4, 16(r1)
-	stw	r31, 12(r1)
-	mr	r31, r1
-
-	bl	ftrace_return_to_handler
-	nop
-
-	/* return value has real return address */
-	mtlr	r3
-
-	lwz	r3, 20(r1)
-	lwz	r4, 16(r1)
-	lwz	r31,12(r1)
-	lwz	r1, 0(r1)
-
-	/* Jump back to real return address */
-	blr
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-
-#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 70a5c1948b05..9b541d22595a 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -20,7 +20,6 @@
 
 #include <linux/errno.h>
 #include <linux/err.h>
-#include <linux/magic.h>
 #include <asm/unistd.h>
 #include <asm/processor.h>
 #include <asm/page.h>
@@ -33,7 +32,6 @@
 #include <asm/bug.h>
 #include <asm/ptrace.h>
 #include <asm/irqflags.h>
-#include <asm/ftrace.h>
 #include <asm/hw_irq.h>
 #include <asm/context_tracking.h>
 #include <asm/tm.h>
@@ -1173,379 +1171,3 @@ _GLOBAL(enter_prom)
 	ld	r0,16(r1)
 	mtlr    r0
         blr
-
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-_GLOBAL(mcount)
-_GLOBAL(_mcount)
-EXPORT_SYMBOL(_mcount)
-	mflr	r12
-	mtctr	r12
-	mtlr	r0
-	bctr
-
-#ifndef CC_USING_MPROFILE_KERNEL
-_GLOBAL_TOC(ftrace_caller)
-	/* Taken from output of objdump from lib64/glibc */
-	mflr	r3
-	ld	r11, 0(r1)
-	stdu	r1, -112(r1)
-	std	r3, 128(r1)
-	ld	r4, 16(r11)
-	subi	r3, r3, MCOUNT_INSN_SIZE
-.globl ftrace_call
-ftrace_call:
-	bl	ftrace_stub
-	nop
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
-	b	ftrace_graph_stub
-_GLOBAL(ftrace_graph_stub)
-#endif
-	ld	r0, 128(r1)
-	mtlr	r0
-	addi	r1, r1, 112
-
-#else /* CC_USING_MPROFILE_KERNEL */
-/*
- *
- * ftrace_caller() is the function that replaces _mcount() when ftrace is
- * active.
- *
- * We arrive here after a function A calls function B, and we are the trace
- * function for B. When we enter r1 points to A's stack frame, B has not yet
- * had a chance to allocate one yet.
- *
- * Additionally r2 may point either to the TOC for A, or B, depending on
- * whether B did a TOC setup sequence before calling us.
- *
- * On entry the LR points back to the _mcount() call site, and r0 holds the
- * saved LR as it was on entry to B, ie. the original return address at the
- * call site in A.
- *
- * Our job is to save the register state into a struct pt_regs (on the stack)
- * and then arrange for the ftrace function to be called.
- */
-_GLOBAL(ftrace_caller)
-	/* Save the original return address in A's stack frame */
-	std	r0,LRSAVE(r1)
-
-	/* Create our stack frame + pt_regs */
-	stdu	r1,-SWITCH_FRAME_SIZE(r1)
-
-	/* Save all gprs to pt_regs */
-	SAVE_8GPRS(0,r1)
-	SAVE_8GPRS(8,r1)
-	SAVE_8GPRS(16,r1)
-	SAVE_8GPRS(24,r1)
-
-	/* Load special regs for save below */
-	mfmsr   r8
-	mfctr   r9
-	mfxer   r10
-	mfcr	r11
-
-	/* Get the _mcount() call site out of LR */
-	mflr	r7
-	/* Save it as pt_regs->nip */
-	std     r7, _NIP(r1)
-	/* Save the read LR in pt_regs->link */
-	std     r0, _LINK(r1)
-
-	/* Save callee's TOC in the ABI compliant location */
-	std	r2, 24(r1)
-	ld	r2,PACATOC(r13)	/* get kernel TOC in r2 */
-
-	addis	r3,r2,function_trace_op@toc@ha
-	addi	r3,r3,function_trace_op@toc@l
-	ld	r5,0(r3)
-
-#ifdef CONFIG_LIVEPATCH
-	mr	r14,r7		/* remember old NIP */
-#endif
-	/* Calculate ip from nip-4 into r3 for call below */
-	subi    r3, r7, MCOUNT_INSN_SIZE
-
-	/* Put the original return address in r4 as parent_ip */
-	mr	r4, r0
-
-	/* Save special regs */
-	std     r8, _MSR(r1)
-	std     r9, _CTR(r1)
-	std     r10, _XER(r1)
-	std     r11, _CCR(r1)
-
-	/* Load &pt_regs in r6 for call below */
-	addi    r6, r1 ,STACK_FRAME_OVERHEAD
-
-	/* ftrace_call(r3, r4, r5, r6) */
-.globl ftrace_call
-ftrace_call:
-	bl	ftrace_stub
-	nop
-
-	/* Load ctr with the possibly modified NIP */
-	ld	r3, _NIP(r1)
-	mtctr	r3
-#ifdef CONFIG_LIVEPATCH
-	cmpd	r14,r3		/* has NIP been altered? */
-#endif
-
-	/* Restore gprs */
-	REST_8GPRS(0,r1)
-	REST_8GPRS(8,r1)
-	REST_8GPRS(16,r1)
-	REST_8GPRS(24,r1)
-
-	/* Restore possibly modified LR */
-	ld	r0, _LINK(r1)
-	mtlr	r0
-
-	/* Restore callee's TOC */
-	ld	r2, 24(r1)
-
-	/* Pop our stack frame */
-	addi r1, r1, SWITCH_FRAME_SIZE
-
-#ifdef CONFIG_LIVEPATCH
-        /* Based on the cmpd above, if the NIP was altered handle livepatch */
-	bne-	livepatch_handler
-#endif
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
-	b	ftrace_graph_stub
-_GLOBAL(ftrace_graph_stub)
-#endif
-
-	bctr			/* jump after _mcount site */
-#endif /* CC_USING_MPROFILE_KERNEL */
-
-_GLOBAL(ftrace_stub)
-	blr
-
-#ifdef CONFIG_LIVEPATCH
-	/*
-	 * This function runs in the mcount context, between two functions. As
-	 * such it can only clobber registers which are volatile and used in
-	 * function linkage.
-	 *
-	 * We get here when a function A, calls another function B, but B has
-	 * been live patched with a new function C.
-	 *
-	 * On entry:
-	 *  - we have no stack frame and can not allocate one
-	 *  - LR points back to the original caller (in A)
-	 *  - CTR holds the new NIP in C
-	 *  - r0 & r12 are free
-	 *
-	 * r0 can't be used as the base register for a DS-form load or store, so
-	 * we temporarily shuffle r1 (stack pointer) into r0 and then put it back.
-	 */
-livepatch_handler:
-	CURRENT_THREAD_INFO(r12, r1)
-
-	/* Save stack pointer into r0 */
-	mr	r0, r1
-
-	/* Allocate 3 x 8 bytes */
-	ld	r1, TI_livepatch_sp(r12)
-	addi	r1, r1, 24
-	std	r1, TI_livepatch_sp(r12)
-
-	/* Save toc & real LR on livepatch stack */
-	std	r2,  -24(r1)
-	mflr	r12
-	std	r12, -16(r1)
-
-	/* Store stack end marker */
-	lis     r12, STACK_END_MAGIC@h
-	ori     r12, r12, STACK_END_MAGIC@l
-	std	r12, -8(r1)
-
-	/* Restore real stack pointer */
-	mr	r1, r0
-
-	/* Put ctr in r12 for global entry and branch there */
-	mfctr	r12
-	bctrl
-
-	/*
-	 * Now we are returning from the patched function to the original
-	 * caller A. We are free to use r0 and r12, and we can use r2 until we
-	 * restore it.
-	 */
-
-	CURRENT_THREAD_INFO(r12, r1)
-
-	/* Save stack pointer into r0 */
-	mr	r0, r1
-
-	ld	r1, TI_livepatch_sp(r12)
-
-	/* Check stack marker hasn't been trashed */
-	lis     r2,  STACK_END_MAGIC@h
-	ori     r2,  r2, STACK_END_MAGIC@l
-	ld	r12, -8(r1)
-1:	tdne	r12, r2
-	EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0
-
-	/* Restore LR & toc from livepatch stack */
-	ld	r12, -16(r1)
-	mtlr	r12
-	ld	r2,  -24(r1)
-
-	/* Pop livepatch stack frame */
-	CURRENT_THREAD_INFO(r12, r0)
-	subi	r1, r1, 24
-	std	r1, TI_livepatch_sp(r12)
-
-	/* Restore real stack pointer */
-	mr	r1, r0
-
-	/* Return to original caller of live patched function */
-	blr
-#endif
-
-
-#else
-_GLOBAL_TOC(_mcount)
-EXPORT_SYMBOL(_mcount)
-	/* Taken from output of objdump from lib64/glibc */
-	mflr	r3
-	ld	r11, 0(r1)
-	stdu	r1, -112(r1)
-	std	r3, 128(r1)
-	ld	r4, 16(r11)
-
-	subi	r3, r3, MCOUNT_INSN_SIZE
-	LOAD_REG_ADDR(r5,ftrace_trace_function)
-	ld	r5,0(r5)
-	ld	r5,0(r5)
-	mtctr	r5
-	bctrl
-	nop
-
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	b	ftrace_graph_caller
-#endif
-	ld	r0, 128(r1)
-	mtlr	r0
-	addi	r1, r1, 112
-_GLOBAL(ftrace_stub)
-	blr
-
-#endif /* CONFIG_DYNAMIC_FTRACE */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-#ifndef CC_USING_MPROFILE_KERNEL
-_GLOBAL(ftrace_graph_caller)
-	/* load r4 with local address */
-	ld	r4, 128(r1)
-	subi	r4, r4, MCOUNT_INSN_SIZE
-
-	/* Grab the LR out of the caller stack frame */
-	ld	r11, 112(r1)
-	ld	r3, 16(r11)
-
-	bl	prepare_ftrace_return
-	nop
-
-	/*
-	 * prepare_ftrace_return gives us the address we divert to.
-	 * Change the LR in the callers stack frame to this.
-	 */
-	ld	r11, 112(r1)
-	std	r3, 16(r11)
-
-	ld	r0, 128(r1)
-	mtlr	r0
-	addi	r1, r1, 112
-	blr
-
-#else /* CC_USING_MPROFILE_KERNEL */
-_GLOBAL(ftrace_graph_caller)
-	stdu	r1, -112(r1)
-	/* with -mprofile-kernel, parameter regs are still alive at _mcount */
-	std	r10, 104(r1)
-	std	r9, 96(r1)
-	std	r8, 88(r1)
-	std	r7, 80(r1)
-	std	r6, 72(r1)
-	std	r5, 64(r1)
-	std	r4, 56(r1)
-	std	r3, 48(r1)
-
-	/* Save callee's TOC in the ABI compliant location */
-	std	r2, 24(r1)
-	ld	r2, PACATOC(r13)	/* get kernel TOC in r2 */
-
-	mfctr	r4		/* ftrace_caller has moved local addr here */
-	std	r4, 40(r1)
-	mflr	r3		/* ftrace_caller has restored LR from stack */
-	subi	r4, r4, MCOUNT_INSN_SIZE
-
-	bl	prepare_ftrace_return
-	nop
-
-	/*
-	 * prepare_ftrace_return gives us the address we divert to.
-	 * Change the LR to this.
-	 */
-	mtlr	r3
-
-	ld	r0, 40(r1)
-	mtctr	r0
-	ld	r10, 104(r1)
-	ld	r9, 96(r1)
-	ld	r8, 88(r1)
-	ld	r7, 80(r1)
-	ld	r6, 72(r1)
-	ld	r5, 64(r1)
-	ld	r4, 56(r1)
-	ld	r3, 48(r1)
-
-	/* Restore callee's TOC */
-	ld	r2, 24(r1)
-
-	addi	r1, r1, 112
-	mflr	r0
-	std	r0, LRSAVE(r1)
-	bctr
-#endif /* CC_USING_MPROFILE_KERNEL */
-
-_GLOBAL(return_to_handler)
-	/* need to save return values */
-	std	r4,  -32(r1)
-	std	r3,  -24(r1)
-	/* save TOC */
-	std	r2,  -16(r1)
-	std	r31, -8(r1)
-	mr	r31, r1
-	stdu	r1, -112(r1)
-
-	/*
-	 * We might be called from a module.
-	 * Switch to our TOC to run inside the core kernel.
-	 */
-	ld	r2, PACATOC(r13)
-
-	bl	ftrace_return_to_handler
-	nop
-
-	/* return value has real return address */
-	mtlr	r3
-
-	ld	r1, 0(r1)
-	ld	r4,  -32(r1)
-	ld	r3,  -24(r1)
-	ld	r2,  -16(r1)
-	ld	r31, -8(r1)
-
-	/* Jump back to real return address */
-	blr
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
deleted file mode 100644
index 32509de6ce4c..000000000000
--- a/arch/powerpc/kernel/ftrace.c
+++ /dev/null
@@ -1,620 +0,0 @@
-/*
- * Code for replacing ftrace calls with jumps.
- *
- * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
- *
- * Thanks goes out to P.A. Semi, Inc for supplying me with a PPC64 box.
- *
- * Added function graph tracer code, taken from x86 that was written
- * by Frederic Weisbecker, and ported to PPC by Steven Rostedt.
- *
- */
-
-#define pr_fmt(fmt) "ftrace-powerpc: " fmt
-
-#include <linux/spinlock.h>
-#include <linux/hardirq.h>
-#include <linux/uaccess.h>
-#include <linux/module.h>
-#include <linux/ftrace.h>
-#include <linux/percpu.h>
-#include <linux/init.h>
-#include <linux/list.h>
-
-#include <asm/asm-prototypes.h>
-#include <asm/cacheflush.h>
-#include <asm/code-patching.h>
-#include <asm/ftrace.h>
-#include <asm/syscall.h>
-
-
-#ifdef CONFIG_DYNAMIC_FTRACE
-static unsigned int
-ftrace_call_replace(unsigned long ip, unsigned long addr, int link)
-{
-	unsigned int op;
-
-	addr = ppc_function_entry((void *)addr);
-
-	/* if (link) set op to 'bl' else 'b' */
-	op = create_branch((unsigned int *)ip, addr, link ? 1 : 0);
-
-	return op;
-}
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned int old, unsigned int new)
-{
-	unsigned int replaced;
-
-	/*
-	 * Note:
-	 * We are paranoid about modifying text, as if a bug was to happen, it
-	 * could cause us to read or write to someplace that could cause harm.
-	 * Carefully read and modify the code with probe_kernel_*(), and make
-	 * sure what we read is what we expected it to be before modifying it.
-	 */
-
-	/* read the text we want to modify */
-	if (probe_kernel_read(&replaced, (void *)ip, MCOUNT_INSN_SIZE))
-		return -EFAULT;
-
-	/* Make sure it is what we expect it to be */
-	if (replaced != old) {
-		pr_err("%p: replaced (%#x) != old (%#x)",
-		(void *)ip, replaced, old);
-		return -EINVAL;
-	}
-
-	/* replace the text with the new text */
-	if (patch_instruction((unsigned int *)ip, new))
-		return -EPERM;
-
-	return 0;
-}
-
-/*
- * Helper functions that are the same for both PPC64 and PPC32.
- */
-static int test_24bit_addr(unsigned long ip, unsigned long addr)
-{
-	addr = ppc_function_entry((void *)addr);
-
-	/* use the create_branch to verify that this offset can be branched */
-	return create_branch((unsigned int *)ip, addr, 0);
-}
-
-#ifdef CONFIG_MODULES
-
-static int is_bl_op(unsigned int op)
-{
-	return (op & 0xfc000003) == 0x48000001;
-}
-
-static unsigned long find_bl_target(unsigned long ip, unsigned int op)
-{
-	static int offset;
-
-	offset = (op & 0x03fffffc);
-	/* make it signed */
-	if (offset & 0x02000000)
-		offset |= 0xfe000000;
-
-	return ip + (long)offset;
-}
-
-#ifdef CONFIG_PPC64
-static int
-__ftrace_make_nop(struct module *mod,
-		  struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned long entry, ptr, tramp;
-	unsigned long ip = rec->ip;
-	unsigned int op, pop;
-
-	/* read where this goes */
-	if (probe_kernel_read(&op, (void *)ip, sizeof(int))) {
-		pr_err("Fetching opcode failed.\n");
-		return -EFAULT;
-	}
-
-	/* Make sure that that this is still a 24bit jump */
-	if (!is_bl_op(op)) {
-		pr_err("Not expected bl: opcode is %x\n", op);
-		return -EINVAL;
-	}
-
-	/* lets find where the pointer goes */
-	tramp = find_bl_target(ip, op);
-
-	pr_devel("ip:%lx jumps to %lx", ip, tramp);
-
-	if (module_trampoline_target(mod, tramp, &ptr)) {
-		pr_err("Failed to get trampoline target\n");
-		return -EFAULT;
-	}
-
-	pr_devel("trampoline target %lx", ptr);
-
-	entry = ppc_global_function_entry((void *)addr);
-	/* This should match what was called */
-	if (ptr != entry) {
-		pr_err("addr %lx does not match expected %lx\n", ptr, entry);
-		return -EINVAL;
-	}
-
-#ifdef CC_USING_MPROFILE_KERNEL
-	/* When using -mkernel_profile there is no load to jump over */
-	pop = PPC_INST_NOP;
-
-	if (probe_kernel_read(&op, (void *)(ip - 4), 4)) {
-		pr_err("Fetching instruction at %lx failed.\n", ip - 4);
-		return -EFAULT;
-	}
-
-	/* We expect either a mflr r0, or a std r0, LRSAVE(r1) */
-	if (op != PPC_INST_MFLR && op != PPC_INST_STD_LR) {
-		pr_err("Unexpected instruction %08x around bl _mcount\n", op);
-		return -EINVAL;
-	}
-#else
-	/*
-	 * Our original call site looks like:
-	 *
-	 * bl <tramp>
-	 * ld r2,XX(r1)
-	 *
-	 * Milton Miller pointed out that we can not simply nop the branch.
-	 * If a task was preempted when calling a trace function, the nops
-	 * will remove the way to restore the TOC in r2 and the r2 TOC will
-	 * get corrupted.
-	 *
-	 * Use a b +8 to jump over the load.
-	 */
-
-	pop = PPC_INST_BRANCH | 8;	/* b +8 */
-
-	/*
-	 * Check what is in the next instruction. We can see ld r2,40(r1), but
-	 * on first pass after boot we will see mflr r0.
-	 */
-	if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE)) {
-		pr_err("Fetching op failed.\n");
-		return -EFAULT;
-	}
-
-	if (op != PPC_INST_LD_TOC) {
-		pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, op);
-		return -EINVAL;
-	}
-#endif /* CC_USING_MPROFILE_KERNEL */
-
-	if (patch_instruction((unsigned int *)ip, pop)) {
-		pr_err("Patching NOP failed.\n");
-		return -EPERM;
-	}
-
-	return 0;
-}
-
-#else /* !PPC64 */
-static int
-__ftrace_make_nop(struct module *mod,
-		  struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned int op;
-	unsigned int jmp[4];
-	unsigned long ip = rec->ip;
-	unsigned long tramp;
-
-	if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
-		return -EFAULT;
-
-	/* Make sure that that this is still a 24bit jump */
-	if (!is_bl_op(op)) {
-		pr_err("Not expected bl: opcode is %x\n", op);
-		return -EINVAL;
-	}
-
-	/* lets find where the pointer goes */
-	tramp = find_bl_target(ip, op);
-
-	/*
-	 * On PPC32 the trampoline looks like:
-	 *  0x3d, 0x80, 0x00, 0x00  lis r12,sym@ha
-	 *  0x39, 0x8c, 0x00, 0x00  addi r12,r12,sym@l
-	 *  0x7d, 0x89, 0x03, 0xa6  mtctr r12
-	 *  0x4e, 0x80, 0x04, 0x20  bctr
-	 */
-
-	pr_devel("ip:%lx jumps to %lx", ip, tramp);
-
-	/* Find where the trampoline jumps to */
-	if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
-		pr_err("Failed to read %lx\n", tramp);
-		return -EFAULT;
-	}
-
-	pr_devel(" %08x %08x ", jmp[0], jmp[1]);
-
-	/* verify that this is what we expect it to be */
-	if (((jmp[0] & 0xffff0000) != 0x3d800000) ||
-	    ((jmp[1] & 0xffff0000) != 0x398c0000) ||
-	    (jmp[2] != 0x7d8903a6) ||
-	    (jmp[3] != 0x4e800420)) {
-		pr_err("Not a trampoline\n");
-		return -EINVAL;
-	}
-
-	tramp = (jmp[1] & 0xffff) |
-		((jmp[0] & 0xffff) << 16);
-	if (tramp & 0x8000)
-		tramp -= 0x10000;
-
-	pr_devel(" %lx ", tramp);
-
-	if (tramp != addr) {
-		pr_err("Trampoline location %08lx does not match addr\n",
-		       tramp);
-		return -EINVAL;
-	}
-
-	op = PPC_INST_NOP;
-
-	if (patch_instruction((unsigned int *)ip, op))
-		return -EPERM;
-
-	return 0;
-}
-#endif /* PPC64 */
-#endif /* CONFIG_MODULES */
-
-int ftrace_make_nop(struct module *mod,
-		    struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned long ip = rec->ip;
-	unsigned int old, new;
-
-	/*
-	 * If the calling address is more that 24 bits away,
-	 * then we had to use a trampoline to make the call.
-	 * Otherwise just update the call site.
-	 */
-	if (test_24bit_addr(ip, addr)) {
-		/* within range */
-		old = ftrace_call_replace(ip, addr, 1);
-		new = PPC_INST_NOP;
-		return ftrace_modify_code(ip, old, new);
-	}
-
-#ifdef CONFIG_MODULES
-	/*
-	 * Out of range jumps are called from modules.
-	 * We should either already have a pointer to the module
-	 * or it has been passed in.
-	 */
-	if (!rec->arch.mod) {
-		if (!mod) {
-			pr_err("No module loaded addr=%lx\n", addr);
-			return -EFAULT;
-		}
-		rec->arch.mod = mod;
-	} else if (mod) {
-		if (mod != rec->arch.mod) {
-			pr_err("Record mod %p not equal to passed in mod %p\n",
-			       rec->arch.mod, mod);
-			return -EINVAL;
-		}
-		/* nothing to do if mod == rec->arch.mod */
-	} else
-		mod = rec->arch.mod;
-
-	return __ftrace_make_nop(mod, rec, addr);
-#else
-	/* We should not get here without modules */
-	return -EINVAL;
-#endif /* CONFIG_MODULES */
-}
-
-#ifdef CONFIG_MODULES
-#ifdef CONFIG_PPC64
-/*
- * Examine the existing instructions for __ftrace_make_call.
- * They should effectively be a NOP, and follow formal constraints,
- * depending on the ABI. Return false if they don't.
- */
-#ifndef CC_USING_MPROFILE_KERNEL
-static int
-expected_nop_sequence(void *ip, unsigned int op0, unsigned int op1)
-{
-	/*
-	 * We expect to see:
-	 *
-	 * b +8
-	 * ld r2,XX(r1)
-	 *
-	 * The load offset is different depending on the ABI. For simplicity
-	 * just mask it out when doing the compare.
-	 */
-	if ((op0 != 0x48000008) || ((op1 & 0xffff0000) != 0xe8410000))
-		return 0;
-	return 1;
-}
-#else
-static int
-expected_nop_sequence(void *ip, unsigned int op0, unsigned int op1)
-{
-	/* look for patched "NOP" on ppc64 with -mprofile-kernel */
-	if (op0 != PPC_INST_NOP)
-		return 0;
-	return 1;
-}
-#endif
-
-static int
-__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned int op[2];
-	void *ip = (void *)rec->ip;
-
-	/* read where this goes */
-	if (probe_kernel_read(op, ip, sizeof(op)))
-		return -EFAULT;
-
-	if (!expected_nop_sequence(ip, op[0], op[1])) {
-		pr_err("Unexpected call sequence at %p: %x %x\n",
-		ip, op[0], op[1]);
-		return -EINVAL;
-	}
-
-	/* If we never set up a trampoline to ftrace_caller, then bail */
-	if (!rec->arch.mod->arch.tramp) {
-		pr_err("No ftrace trampoline\n");
-		return -EINVAL;
-	}
-
-	/* Ensure branch is within 24 bits */
-	if (!create_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) {
-		pr_err("Branch out of range\n");
-		return -EINVAL;
-	}
-
-	if (patch_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) {
-		pr_err("REL24 out of range!\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
-int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
-			unsigned long addr)
-{
-	return ftrace_make_call(rec, addr);
-}
-#endif
-
-#else  /* !CONFIG_PPC64: */
-static int
-__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned int op;
-	unsigned long ip = rec->ip;
-
-	/* read where this goes */
-	if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
-		return -EFAULT;
-
-	/* It should be pointing to a nop */
-	if (op != PPC_INST_NOP) {
-		pr_err("Expected NOP but have %x\n", op);
-		return -EINVAL;
-	}
-
-	/* If we never set up a trampoline to ftrace_caller, then bail */
-	if (!rec->arch.mod->arch.tramp) {
-		pr_err("No ftrace trampoline\n");
-		return -EINVAL;
-	}
-
-	/* create the branch to the trampoline */
-	op = create_branch((unsigned int *)ip,
-			   rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
-	if (!op) {
-		pr_err("REL24 out of range!\n");
-		return -EINVAL;
-	}
-
-	pr_devel("write to %lx\n", rec->ip);
-
-	if (patch_instruction((unsigned int *)ip, op))
-		return -EPERM;
-
-	return 0;
-}
-#endif /* CONFIG_PPC64 */
-#endif /* CONFIG_MODULES */
-
-int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned long ip = rec->ip;
-	unsigned int old, new;
-
-	/*
-	 * If the calling address is more that 24 bits away,
-	 * then we had to use a trampoline to make the call.
-	 * Otherwise just update the call site.
-	 */
-	if (test_24bit_addr(ip, addr)) {
-		/* within range */
-		old = PPC_INST_NOP;
-		new = ftrace_call_replace(ip, addr, 1);
-		return ftrace_modify_code(ip, old, new);
-	}
-
-#ifdef CONFIG_MODULES
-	/*
-	 * Out of range jumps are called from modules.
-	 * Being that we are converting from nop, it had better
-	 * already have a module defined.
-	 */
-	if (!rec->arch.mod) {
-		pr_err("No module loaded\n");
-		return -EINVAL;
-	}
-
-	return __ftrace_make_call(rec, addr);
-#else
-	/* We should not get here without modules */
-	return -EINVAL;
-#endif /* CONFIG_MODULES */
-}
-
-int ftrace_update_ftrace_func(ftrace_func_t func)
-{
-	unsigned long ip = (unsigned long)(&ftrace_call);
-	unsigned int old, new;
-	int ret;
-
-	old = *(unsigned int *)&ftrace_call;
-	new = ftrace_call_replace(ip, (unsigned long)func, 1);
-	ret = ftrace_modify_code(ip, old, new);
-
-	return ret;
-}
-
-static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
-{
-	unsigned long ftrace_addr = (unsigned long)FTRACE_ADDR;
-	int ret;
-
-	ret = ftrace_update_record(rec, enable);
-
-	switch (ret) {
-	case FTRACE_UPDATE_IGNORE:
-		return 0;
-	case FTRACE_UPDATE_MAKE_CALL:
-		return ftrace_make_call(rec, ftrace_addr);
-	case FTRACE_UPDATE_MAKE_NOP:
-		return ftrace_make_nop(NULL, rec, ftrace_addr);
-	}
-
-	return 0;
-}
-
-void ftrace_replace_code(int enable)
-{
-	struct ftrace_rec_iter *iter;
-	struct dyn_ftrace *rec;
-	int ret;
-
-	for (iter = ftrace_rec_iter_start(); iter;
-	     iter = ftrace_rec_iter_next(iter)) {
-		rec = ftrace_rec_iter_record(iter);
-		ret = __ftrace_replace_code(rec, enable);
-		if (ret) {
-			ftrace_bug(ret, rec);
-			return;
-		}
-	}
-}
-
-/*
- * Use the default ftrace_modify_all_code, but without
- * stop_machine().
- */
-void arch_ftrace_update_code(int command)
-{
-	ftrace_modify_all_code(command);
-}
-
-int __init ftrace_dyn_arch_init(void)
-{
-	return 0;
-}
-#endif /* CONFIG_DYNAMIC_FTRACE */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-
-#ifdef CONFIG_DYNAMIC_FTRACE
-extern void ftrace_graph_call(void);
-extern void ftrace_graph_stub(void);
-
-int ftrace_enable_ftrace_graph_caller(void)
-{
-	unsigned long ip = (unsigned long)(&ftrace_graph_call);
-	unsigned long addr = (unsigned long)(&ftrace_graph_caller);
-	unsigned long stub = (unsigned long)(&ftrace_graph_stub);
-	unsigned int old, new;
-
-	old = ftrace_call_replace(ip, stub, 0);
-	new = ftrace_call_replace(ip, addr, 0);
-
-	return ftrace_modify_code(ip, old, new);
-}
-
-int ftrace_disable_ftrace_graph_caller(void)
-{
-	unsigned long ip = (unsigned long)(&ftrace_graph_call);
-	unsigned long addr = (unsigned long)(&ftrace_graph_caller);
-	unsigned long stub = (unsigned long)(&ftrace_graph_stub);
-	unsigned int old, new;
-
-	old = ftrace_call_replace(ip, addr, 0);
-	new = ftrace_call_replace(ip, stub, 0);
-
-	return ftrace_modify_code(ip, old, new);
-}
-#endif /* CONFIG_DYNAMIC_FTRACE */
-
-/*
- * Hook the return address and push it in the stack of return addrs
- * in current thread info. Return the address we want to divert to.
- */
-unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip)
-{
-	struct ftrace_graph_ent trace;
-	unsigned long return_hooker;
-
-	if (unlikely(ftrace_graph_is_dead()))
-		goto out;
-
-	if (unlikely(atomic_read(&current->tracing_graph_pause)))
-		goto out;
-
-	return_hooker = ppc_function_entry(return_to_handler);
-
-	trace.func = ip;
-	trace.depth = current->curr_ret_stack + 1;
-
-	/* Only trace if the calling function expects to */
-	if (!ftrace_graph_entry(&trace))
-		goto out;
-
-	if (ftrace_push_return_trace(parent, ip, &trace.depth, 0,
-				     NULL) == -EBUSY)
-		goto out;
-
-	parent = return_hooker;
-out:
-	return parent;
-}
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-
-#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_PPC64)
-unsigned long __init arch_syscall_addr(int nr)
-{
-	return sys_call_table[nr*2];
-}
-#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */
-
-#ifdef PPC64_ELF_ABI_v1
-char *arch_ftrace_match_adjust(char *str, const char *search)
-{
-	if (str[0] == '.' && search[0] != '.')
-		return str + 1;
-	else
-		return str;
-}
-#endif /* PPC64_ELF_ABI_v1 */
diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile
new file mode 100644
index 000000000000..5f5a35254a9b
--- /dev/null
+++ b/arch/powerpc/kernel/trace/Makefile
@@ -0,0 +1,24 @@
+#
+# Makefile for the powerpc trace subsystem
+#
+
+subdir-ccflags-$(CONFIG_PPC_WERROR)	:= -Werror
+
+ifdef CONFIG_FUNCTION_TRACER
+# do not trace tracer code
+CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
+endif
+
+obj32-$(CONFIG_FUNCTION_TRACER)		+= ftrace_32.o
+obj64-$(CONFIG_FUNCTION_TRACER)		+= ftrace_64.o
+obj-$(CONFIG_DYNAMIC_FTRACE)		+= ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
+obj-$(CONFIG_FTRACE_SYSCALLS)		+= ftrace.o
+obj-$(CONFIG_TRACING)			+= trace_clock.o
+
+obj-$(CONFIG_PPC64)			+= $(obj64-y)
+obj-$(CONFIG_PPC32)			+= $(obj32-y)
+
+# Disable GCOV & sanitizers in odd or sensitive code
+GCOV_PROFILE_ftrace.o := n
+UBSAN_SANITIZE_ftrace.o := n
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
new file mode 100644
index 000000000000..32509de6ce4c
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -0,0 +1,620 @@
+/*
+ * Code for replacing ftrace calls with jumps.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ * Thanks goes out to P.A. Semi, Inc for supplying me with a PPC64 box.
+ *
+ * Added function graph tracer code, taken from x86 that was written
+ * by Frederic Weisbecker, and ported to PPC by Steven Rostedt.
+ *
+ */
+
+#define pr_fmt(fmt) "ftrace-powerpc: " fmt
+
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/list.h>
+
+#include <asm/asm-prototypes.h>
+#include <asm/cacheflush.h>
+#include <asm/code-patching.h>
+#include <asm/ftrace.h>
+#include <asm/syscall.h>
+
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static unsigned int
+ftrace_call_replace(unsigned long ip, unsigned long addr, int link)
+{
+	unsigned int op;
+
+	addr = ppc_function_entry((void *)addr);
+
+	/* if (link) set op to 'bl' else 'b' */
+	op = create_branch((unsigned int *)ip, addr, link ? 1 : 0);
+
+	return op;
+}
+
+static int
+ftrace_modify_code(unsigned long ip, unsigned int old, unsigned int new)
+{
+	unsigned int replaced;
+
+	/*
+	 * Note:
+	 * We are paranoid about modifying text, as if a bug was to happen, it
+	 * could cause us to read or write to someplace that could cause harm.
+	 * Carefully read and modify the code with probe_kernel_*(), and make
+	 * sure what we read is what we expected it to be before modifying it.
+	 */
+
+	/* read the text we want to modify */
+	if (probe_kernel_read(&replaced, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	/* Make sure it is what we expect it to be */
+	if (replaced != old) {
+		pr_err("%p: replaced (%#x) != old (%#x)",
+		(void *)ip, replaced, old);
+		return -EINVAL;
+	}
+
+	/* replace the text with the new text */
+	if (patch_instruction((unsigned int *)ip, new))
+		return -EPERM;
+
+	return 0;
+}
+
+/*
+ * Helper functions that are the same for both PPC64 and PPC32.
+ */
+static int test_24bit_addr(unsigned long ip, unsigned long addr)
+{
+	addr = ppc_function_entry((void *)addr);
+
+	/* use the create_branch to verify that this offset can be branched */
+	return create_branch((unsigned int *)ip, addr, 0);
+}
+
+#ifdef CONFIG_MODULES
+
+static int is_bl_op(unsigned int op)
+{
+	return (op & 0xfc000003) == 0x48000001;
+}
+
+static unsigned long find_bl_target(unsigned long ip, unsigned int op)
+{
+	static int offset;
+
+	offset = (op & 0x03fffffc);
+	/* make it signed */
+	if (offset & 0x02000000)
+		offset |= 0xfe000000;
+
+	return ip + (long)offset;
+}
+
+#ifdef CONFIG_PPC64
+static int
+__ftrace_make_nop(struct module *mod,
+		  struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long entry, ptr, tramp;
+	unsigned long ip = rec->ip;
+	unsigned int op, pop;
+
+	/* read where this goes */
+	if (probe_kernel_read(&op, (void *)ip, sizeof(int))) {
+		pr_err("Fetching opcode failed.\n");
+		return -EFAULT;
+	}
+
+	/* Make sure that that this is still a 24bit jump */
+	if (!is_bl_op(op)) {
+		pr_err("Not expected bl: opcode is %x\n", op);
+		return -EINVAL;
+	}
+
+	/* lets find where the pointer goes */
+	tramp = find_bl_target(ip, op);
+
+	pr_devel("ip:%lx jumps to %lx", ip, tramp);
+
+	if (module_trampoline_target(mod, tramp, &ptr)) {
+		pr_err("Failed to get trampoline target\n");
+		return -EFAULT;
+	}
+
+	pr_devel("trampoline target %lx", ptr);
+
+	entry = ppc_global_function_entry((void *)addr);
+	/* This should match what was called */
+	if (ptr != entry) {
+		pr_err("addr %lx does not match expected %lx\n", ptr, entry);
+		return -EINVAL;
+	}
+
+#ifdef CC_USING_MPROFILE_KERNEL
+	/* When using -mkernel_profile there is no load to jump over */
+	pop = PPC_INST_NOP;
+
+	if (probe_kernel_read(&op, (void *)(ip - 4), 4)) {
+		pr_err("Fetching instruction at %lx failed.\n", ip - 4);
+		return -EFAULT;
+	}
+
+	/* We expect either a mflr r0, or a std r0, LRSAVE(r1) */
+	if (op != PPC_INST_MFLR && op != PPC_INST_STD_LR) {
+		pr_err("Unexpected instruction %08x around bl _mcount\n", op);
+		return -EINVAL;
+	}
+#else
+	/*
+	 * Our original call site looks like:
+	 *
+	 * bl <tramp>
+	 * ld r2,XX(r1)
+	 *
+	 * Milton Miller pointed out that we can not simply nop the branch.
+	 * If a task was preempted when calling a trace function, the nops
+	 * will remove the way to restore the TOC in r2 and the r2 TOC will
+	 * get corrupted.
+	 *
+	 * Use a b +8 to jump over the load.
+	 */
+
+	pop = PPC_INST_BRANCH | 8;	/* b +8 */
+
+	/*
+	 * Check what is in the next instruction. We can see ld r2,40(r1), but
+	 * on first pass after boot we will see mflr r0.
+	 */
+	if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE)) {
+		pr_err("Fetching op failed.\n");
+		return -EFAULT;
+	}
+
+	if (op != PPC_INST_LD_TOC) {
+		pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, op);
+		return -EINVAL;
+	}
+#endif /* CC_USING_MPROFILE_KERNEL */
+
+	if (patch_instruction((unsigned int *)ip, pop)) {
+		pr_err("Patching NOP failed.\n");
+		return -EPERM;
+	}
+
+	return 0;
+}
+
+#else /* !PPC64 */
+static int
+__ftrace_make_nop(struct module *mod,
+		  struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned int op;
+	unsigned int jmp[4];
+	unsigned long ip = rec->ip;
+	unsigned long tramp;
+
+	if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	/* Make sure that that this is still a 24bit jump */
+	if (!is_bl_op(op)) {
+		pr_err("Not expected bl: opcode is %x\n", op);
+		return -EINVAL;
+	}
+
+	/* lets find where the pointer goes */
+	tramp = find_bl_target(ip, op);
+
+	/*
+	 * On PPC32 the trampoline looks like:
+	 *  0x3d, 0x80, 0x00, 0x00  lis r12,sym@ha
+	 *  0x39, 0x8c, 0x00, 0x00  addi r12,r12,sym@l
+	 *  0x7d, 0x89, 0x03, 0xa6  mtctr r12
+	 *  0x4e, 0x80, 0x04, 0x20  bctr
+	 */
+
+	pr_devel("ip:%lx jumps to %lx", ip, tramp);
+
+	/* Find where the trampoline jumps to */
+	if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
+		pr_err("Failed to read %lx\n", tramp);
+		return -EFAULT;
+	}
+
+	pr_devel(" %08x %08x ", jmp[0], jmp[1]);
+
+	/* verify that this is what we expect it to be */
+	if (((jmp[0] & 0xffff0000) != 0x3d800000) ||
+	    ((jmp[1] & 0xffff0000) != 0x398c0000) ||
+	    (jmp[2] != 0x7d8903a6) ||
+	    (jmp[3] != 0x4e800420)) {
+		pr_err("Not a trampoline\n");
+		return -EINVAL;
+	}
+
+	tramp = (jmp[1] & 0xffff) |
+		((jmp[0] & 0xffff) << 16);
+	if (tramp & 0x8000)
+		tramp -= 0x10000;
+
+	pr_devel(" %lx ", tramp);
+
+	if (tramp != addr) {
+		pr_err("Trampoline location %08lx does not match addr\n",
+		       tramp);
+		return -EINVAL;
+	}
+
+	op = PPC_INST_NOP;
+
+	if (patch_instruction((unsigned int *)ip, op))
+		return -EPERM;
+
+	return 0;
+}
+#endif /* PPC64 */
+#endif /* CONFIG_MODULES */
+
+int ftrace_make_nop(struct module *mod,
+		    struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned int old, new;
+
+	/*
+	 * If the calling address is more that 24 bits away,
+	 * then we had to use a trampoline to make the call.
+	 * Otherwise just update the call site.
+	 */
+	if (test_24bit_addr(ip, addr)) {
+		/* within range */
+		old = ftrace_call_replace(ip, addr, 1);
+		new = PPC_INST_NOP;
+		return ftrace_modify_code(ip, old, new);
+	}
+
+#ifdef CONFIG_MODULES
+	/*
+	 * Out of range jumps are called from modules.
+	 * We should either already have a pointer to the module
+	 * or it has been passed in.
+	 */
+	if (!rec->arch.mod) {
+		if (!mod) {
+			pr_err("No module loaded addr=%lx\n", addr);
+			return -EFAULT;
+		}
+		rec->arch.mod = mod;
+	} else if (mod) {
+		if (mod != rec->arch.mod) {
+			pr_err("Record mod %p not equal to passed in mod %p\n",
+			       rec->arch.mod, mod);
+			return -EINVAL;
+		}
+		/* nothing to do if mod == rec->arch.mod */
+	} else
+		mod = rec->arch.mod;
+
+	return __ftrace_make_nop(mod, rec, addr);
+#else
+	/* We should not get here without modules */
+	return -EINVAL;
+#endif /* CONFIG_MODULES */
+}
+
+#ifdef CONFIG_MODULES
+#ifdef CONFIG_PPC64
+/*
+ * Examine the existing instructions for __ftrace_make_call.
+ * They should effectively be a NOP, and follow formal constraints,
+ * depending on the ABI. Return false if they don't.
+ */
+#ifndef CC_USING_MPROFILE_KERNEL
+static int
+expected_nop_sequence(void *ip, unsigned int op0, unsigned int op1)
+{
+	/*
+	 * We expect to see:
+	 *
+	 * b +8
+	 * ld r2,XX(r1)
+	 *
+	 * The load offset is different depending on the ABI. For simplicity
+	 * just mask it out when doing the compare.
+	 */
+	if ((op0 != 0x48000008) || ((op1 & 0xffff0000) != 0xe8410000))
+		return 0;
+	return 1;
+}
+#else
+static int
+expected_nop_sequence(void *ip, unsigned int op0, unsigned int op1)
+{
+	/* look for patched "NOP" on ppc64 with -mprofile-kernel */
+	if (op0 != PPC_INST_NOP)
+		return 0;
+	return 1;
+}
+#endif
+
+static int
+__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned int op[2];
+	void *ip = (void *)rec->ip;
+
+	/* read where this goes */
+	if (probe_kernel_read(op, ip, sizeof(op)))
+		return -EFAULT;
+
+	if (!expected_nop_sequence(ip, op[0], op[1])) {
+		pr_err("Unexpected call sequence at %p: %x %x\n",
+		ip, op[0], op[1]);
+		return -EINVAL;
+	}
+
+	/* If we never set up a trampoline to ftrace_caller, then bail */
+	if (!rec->arch.mod->arch.tramp) {
+		pr_err("No ftrace trampoline\n");
+		return -EINVAL;
+	}
+
+	/* Ensure branch is within 24 bits */
+	if (!create_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) {
+		pr_err("Branch out of range\n");
+		return -EINVAL;
+	}
+
+	if (patch_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) {
+		pr_err("REL24 out of range!\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+			unsigned long addr)
+{
+	return ftrace_make_call(rec, addr);
+}
+#endif
+
+#else  /* !CONFIG_PPC64: */
+static int
+__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned int op;
+	unsigned long ip = rec->ip;
+
+	/* read where this goes */
+	if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	/* It should be pointing to a nop */
+	if (op != PPC_INST_NOP) {
+		pr_err("Expected NOP but have %x\n", op);
+		return -EINVAL;
+	}
+
+	/* If we never set up a trampoline to ftrace_caller, then bail */
+	if (!rec->arch.mod->arch.tramp) {
+		pr_err("No ftrace trampoline\n");
+		return -EINVAL;
+	}
+
+	/* create the branch to the trampoline */
+	op = create_branch((unsigned int *)ip,
+			   rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
+	if (!op) {
+		pr_err("REL24 out of range!\n");
+		return -EINVAL;
+	}
+
+	pr_devel("write to %lx\n", rec->ip);
+
+	if (patch_instruction((unsigned int *)ip, op))
+		return -EPERM;
+
+	return 0;
+}
+#endif /* CONFIG_PPC64 */
+#endif /* CONFIG_MODULES */
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned int old, new;
+
+	/*
+	 * If the calling address is more that 24 bits away,
+	 * then we had to use a trampoline to make the call.
+	 * Otherwise just update the call site.
+	 */
+	if (test_24bit_addr(ip, addr)) {
+		/* within range */
+		old = PPC_INST_NOP;
+		new = ftrace_call_replace(ip, addr, 1);
+		return ftrace_modify_code(ip, old, new);
+	}
+
+#ifdef CONFIG_MODULES
+	/*
+	 * Out of range jumps are called from modules.
+	 * Being that we are converting from nop, it had better
+	 * already have a module defined.
+	 */
+	if (!rec->arch.mod) {
+		pr_err("No module loaded\n");
+		return -EINVAL;
+	}
+
+	return __ftrace_make_call(rec, addr);
+#else
+	/* We should not get here without modules */
+	return -EINVAL;
+#endif /* CONFIG_MODULES */
+}
+
+int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+	unsigned long ip = (unsigned long)(&ftrace_call);
+	unsigned int old, new;
+	int ret;
+
+	old = *(unsigned int *)&ftrace_call;
+	new = ftrace_call_replace(ip, (unsigned long)func, 1);
+	ret = ftrace_modify_code(ip, old, new);
+
+	return ret;
+}
+
+static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+{
+	unsigned long ftrace_addr = (unsigned long)FTRACE_ADDR;
+	int ret;
+
+	ret = ftrace_update_record(rec, enable);
+
+	switch (ret) {
+	case FTRACE_UPDATE_IGNORE:
+		return 0;
+	case FTRACE_UPDATE_MAKE_CALL:
+		return ftrace_make_call(rec, ftrace_addr);
+	case FTRACE_UPDATE_MAKE_NOP:
+		return ftrace_make_nop(NULL, rec, ftrace_addr);
+	}
+
+	return 0;
+}
+
+void ftrace_replace_code(int enable)
+{
+	struct ftrace_rec_iter *iter;
+	struct dyn_ftrace *rec;
+	int ret;
+
+	for (iter = ftrace_rec_iter_start(); iter;
+	     iter = ftrace_rec_iter_next(iter)) {
+		rec = ftrace_rec_iter_record(iter);
+		ret = __ftrace_replace_code(rec, enable);
+		if (ret) {
+			ftrace_bug(ret, rec);
+			return;
+		}
+	}
+}
+
+/*
+ * Use the default ftrace_modify_all_code, but without
+ * stop_machine().
+ */
+void arch_ftrace_update_code(int command)
+{
+	ftrace_modify_all_code(command);
+}
+
+int __init ftrace_dyn_arch_init(void)
+{
+	return 0;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_graph_call(void);
+extern void ftrace_graph_stub(void);
+
+int ftrace_enable_ftrace_graph_caller(void)
+{
+	unsigned long ip = (unsigned long)(&ftrace_graph_call);
+	unsigned long addr = (unsigned long)(&ftrace_graph_caller);
+	unsigned long stub = (unsigned long)(&ftrace_graph_stub);
+	unsigned int old, new;
+
+	old = ftrace_call_replace(ip, stub, 0);
+	new = ftrace_call_replace(ip, addr, 0);
+
+	return ftrace_modify_code(ip, old, new);
+}
+
+int ftrace_disable_ftrace_graph_caller(void)
+{
+	unsigned long ip = (unsigned long)(&ftrace_graph_call);
+	unsigned long addr = (unsigned long)(&ftrace_graph_caller);
+	unsigned long stub = (unsigned long)(&ftrace_graph_stub);
+	unsigned int old, new;
+
+	old = ftrace_call_replace(ip, addr, 0);
+	new = ftrace_call_replace(ip, stub, 0);
+
+	return ftrace_modify_code(ip, old, new);
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info. Return the address we want to divert to.
+ */
+unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip)
+{
+	struct ftrace_graph_ent trace;
+	unsigned long return_hooker;
+
+	if (unlikely(ftrace_graph_is_dead()))
+		goto out;
+
+	if (unlikely(atomic_read(&current->tracing_graph_pause)))
+		goto out;
+
+	return_hooker = ppc_function_entry(return_to_handler);
+
+	trace.func = ip;
+	trace.depth = current->curr_ret_stack + 1;
+
+	/* Only trace if the calling function expects to */
+	if (!ftrace_graph_entry(&trace))
+		goto out;
+
+	if (ftrace_push_return_trace(parent, ip, &trace.depth, 0,
+				     NULL) == -EBUSY)
+		goto out;
+
+	parent = return_hooker;
+out:
+	return parent;
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_PPC64)
+unsigned long __init arch_syscall_addr(int nr)
+{
+	return sys_call_table[nr*2];
+}
+#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */
+
+#ifdef PPC64_ELF_ABI_v1
+char *arch_ftrace_match_adjust(char *str, const char *search)
+{
+	if (str[0] == '.' && search[0] != '.')
+		return str + 1;
+	else
+		return str;
+}
+#endif /* PPC64_ELF_ABI_v1 */
diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S
new file mode 100644
index 000000000000..afef2c076282
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_32.S
@@ -0,0 +1,118 @@
+/*
+ * Split from entry_32.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/reg.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/export.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+	/*
+	 * It is required that _mcount on PPC32 must preserve the
+	 * link register. But we have r0 to play with. We use r0
+	 * to push the return address back to the caller of mcount
+	 * into the ctr register, restore the link register and
+	 * then jump back using the ctr register.
+	 */
+	mflr	r0
+	mtctr	r0
+	lwz	r0, 4(r1)
+	mtlr	r0
+	bctr
+
+_GLOBAL(ftrace_caller)
+	MCOUNT_SAVE_FRAME
+	/* r3 ends up with link register */
+	subi	r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	b	ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+	MCOUNT_RESTORE_FRAME
+	/* old link register ends up in ctr reg */
+	bctr
+#else
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+
+	MCOUNT_SAVE_FRAME
+
+	subi	r3, r3, MCOUNT_INSN_SIZE
+	LOAD_REG_ADDR(r5, ftrace_trace_function)
+	lwz	r5,0(r5)
+
+	mtctr	r5
+	bctrl
+	nop
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	b	ftrace_graph_caller
+#endif
+	MCOUNT_RESTORE_FRAME
+	bctr
+#endif
+EXPORT_SYMBOL(_mcount)
+
+_GLOBAL(ftrace_stub)
+	blr
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+	/* load r4 with local address */
+	lwz	r4, 44(r1)
+	subi	r4, r4, MCOUNT_INSN_SIZE
+
+	/* Grab the LR out of the caller stack frame */
+	lwz	r3,52(r1)
+
+	bl	prepare_ftrace_return
+	nop
+
+        /*
+         * prepare_ftrace_return gives us the address we divert to.
+         * Change the LR in the callers stack frame to this.
+         */
+	stw	r3,52(r1)
+
+	MCOUNT_RESTORE_FRAME
+	/* old link register ends up in ctr reg */
+	bctr
+
+_GLOBAL(return_to_handler)
+	/* need to save return values */
+	stwu	r1, -32(r1)
+	stw	r3, 20(r1)
+	stw	r4, 16(r1)
+	stw	r31, 12(r1)
+	mr	r31, r1
+
+	bl	ftrace_return_to_handler
+	nop
+
+	/* return value has real return address */
+	mtlr	r3
+
+	lwz	r3, 20(r1)
+	lwz	r4, 16(r1)
+	lwz	r31,12(r1)
+	lwz	r1, 0(r1)
+
+	/* Jump back to real return address */
+	blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64.S b/arch/powerpc/kernel/trace/ftrace_64.S
new file mode 100644
index 000000000000..587e7b5c0aff
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64.S
@@ -0,0 +1,389 @@
+/*
+ * Split from entry_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+EXPORT_SYMBOL(_mcount)
+	mflr	r12
+	mtctr	r12
+	mtlr	r0
+	bctr
+
+#ifndef CC_USING_MPROFILE_KERNEL
+_GLOBAL_TOC(ftrace_caller)
+	/* Taken from output of objdump from lib64/glibc */
+	mflr	r3
+	ld	r11, 0(r1)
+	stdu	r1, -112(r1)
+	std	r3, 128(r1)
+	ld	r4, 16(r11)
+	subi	r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	b	ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+
+#else /* CC_USING_MPROFILE_KERNEL */
+/*
+ *
+ * ftrace_caller() is the function that replaces _mcount() when ftrace is
+ * active.
+ *
+ * We arrive here after a function A calls function B, and we are the trace
+ * function for B. When we enter r1 points to A's stack frame, B has not yet
+ * had a chance to allocate one yet.
+ *
+ * Additionally r2 may point either to the TOC for A, or B, depending on
+ * whether B did a TOC setup sequence before calling us.
+ *
+ * On entry the LR points back to the _mcount() call site, and r0 holds the
+ * saved LR as it was on entry to B, ie. the original return address at the
+ * call site in A.
+ *
+ * Our job is to save the register state into a struct pt_regs (on the stack)
+ * and then arrange for the ftrace function to be called.
+ */
+_GLOBAL(ftrace_caller)
+	/* Save the original return address in A's stack frame */
+	std	r0,LRSAVE(r1)
+
+	/* Create our stack frame + pt_regs */
+	stdu	r1,-SWITCH_FRAME_SIZE(r1)
+
+	/* Save all gprs to pt_regs */
+	SAVE_8GPRS(0,r1)
+	SAVE_8GPRS(8,r1)
+	SAVE_8GPRS(16,r1)
+	SAVE_8GPRS(24,r1)
+
+	/* Load special regs for save below */
+	mfmsr   r8
+	mfctr   r9
+	mfxer   r10
+	mfcr	r11
+
+	/* Get the _mcount() call site out of LR */
+	mflr	r7
+	/* Save it as pt_regs->nip */
+	std     r7, _NIP(r1)
+	/* Save the read LR in pt_regs->link */
+	std     r0, _LINK(r1)
+
+	/* Save callee's TOC in the ABI compliant location */
+	std	r2, 24(r1)
+	ld	r2,PACATOC(r13)	/* get kernel TOC in r2 */
+
+	addis	r3,r2,function_trace_op@toc@ha
+	addi	r3,r3,function_trace_op@toc@l
+	ld	r5,0(r3)
+
+#ifdef CONFIG_LIVEPATCH
+	mr	r14,r7		/* remember old NIP */
+#endif
+	/* Calculate ip from nip-4 into r3 for call below */
+	subi    r3, r7, MCOUNT_INSN_SIZE
+
+	/* Put the original return address in r4 as parent_ip */
+	mr	r4, r0
+
+	/* Save special regs */
+	std     r8, _MSR(r1)
+	std     r9, _CTR(r1)
+	std     r10, _XER(r1)
+	std     r11, _CCR(r1)
+
+	/* Load &pt_regs in r6 for call below */
+	addi    r6, r1 ,STACK_FRAME_OVERHEAD
+
+	/* ftrace_call(r3, r4, r5, r6) */
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+
+	/* Load ctr with the possibly modified NIP */
+	ld	r3, _NIP(r1)
+	mtctr	r3
+#ifdef CONFIG_LIVEPATCH
+	cmpd	r14,r3		/* has NIP been altered? */
+#endif
+
+	/* Restore gprs */
+	REST_8GPRS(0,r1)
+	REST_8GPRS(8,r1)
+	REST_8GPRS(16,r1)
+	REST_8GPRS(24,r1)
+
+	/* Restore possibly modified LR */
+	ld	r0, _LINK(r1)
+	mtlr	r0
+
+	/* Restore callee's TOC */
+	ld	r2, 24(r1)
+
+	/* Pop our stack frame */
+	addi r1, r1, SWITCH_FRAME_SIZE
+
+#ifdef CONFIG_LIVEPATCH
+        /* Based on the cmpd above, if the NIP was altered handle livepatch */
+	bne-	livepatch_handler
+#endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	b	ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+
+	bctr			/* jump after _mcount site */
+#endif /* CC_USING_MPROFILE_KERNEL */
+
+_GLOBAL(ftrace_stub)
+	blr
+
+#ifdef CONFIG_LIVEPATCH
+	/*
+	 * This function runs in the mcount context, between two functions. As
+	 * such it can only clobber registers which are volatile and used in
+	 * function linkage.
+	 *
+	 * We get here when a function A, calls another function B, but B has
+	 * been live patched with a new function C.
+	 *
+	 * On entry:
+	 *  - we have no stack frame and can not allocate one
+	 *  - LR points back to the original caller (in A)
+	 *  - CTR holds the new NIP in C
+	 *  - r0 & r12 are free
+	 *
+	 * r0 can't be used as the base register for a DS-form load or store, so
+	 * we temporarily shuffle r1 (stack pointer) into r0 and then put it back.
+	 */
+livepatch_handler:
+	CURRENT_THREAD_INFO(r12, r1)
+
+	/* Save stack pointer into r0 */
+	mr	r0, r1
+
+	/* Allocate 3 x 8 bytes */
+	ld	r1, TI_livepatch_sp(r12)
+	addi	r1, r1, 24
+	std	r1, TI_livepatch_sp(r12)
+
+	/* Save toc & real LR on livepatch stack */
+	std	r2,  -24(r1)
+	mflr	r12
+	std	r12, -16(r1)
+
+	/* Store stack end marker */
+	lis     r12, STACK_END_MAGIC@h
+	ori     r12, r12, STACK_END_MAGIC@l
+	std	r12, -8(r1)
+
+	/* Restore real stack pointer */
+	mr	r1, r0
+
+	/* Put ctr in r12 for global entry and branch there */
+	mfctr	r12
+	bctrl
+
+	/*
+	 * Now we are returning from the patched function to the original
+	 * caller A. We are free to use r0 and r12, and we can use r2 until we
+	 * restore it.
+	 */
+
+	CURRENT_THREAD_INFO(r12, r1)
+
+	/* Save stack pointer into r0 */
+	mr	r0, r1
+
+	ld	r1, TI_livepatch_sp(r12)
+
+	/* Check stack marker hasn't been trashed */
+	lis     r2,  STACK_END_MAGIC@h
+	ori     r2,  r2, STACK_END_MAGIC@l
+	ld	r12, -8(r1)
+1:	tdne	r12, r2
+	EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0
+
+	/* Restore LR & toc from livepatch stack */
+	ld	r12, -16(r1)
+	mtlr	r12
+	ld	r2,  -24(r1)
+
+	/* Pop livepatch stack frame */
+	CURRENT_THREAD_INFO(r12, r0)
+	subi	r1, r1, 24
+	std	r1, TI_livepatch_sp(r12)
+
+	/* Restore real stack pointer */
+	mr	r1, r0
+
+	/* Return to original caller of live patched function */
+	blr
+#endif
+
+
+#else
+_GLOBAL_TOC(_mcount)
+EXPORT_SYMBOL(_mcount)
+	/* Taken from output of objdump from lib64/glibc */
+	mflr	r3
+	ld	r11, 0(r1)
+	stdu	r1, -112(r1)
+	std	r3, 128(r1)
+	ld	r4, 16(r11)
+
+	subi	r3, r3, MCOUNT_INSN_SIZE
+	LOAD_REG_ADDR(r5,ftrace_trace_function)
+	ld	r5,0(r5)
+	ld	r5,0(r5)
+	mtctr	r5
+	bctrl
+	nop
+
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	b	ftrace_graph_caller
+#endif
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+_GLOBAL(ftrace_stub)
+	blr
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+#ifndef CC_USING_MPROFILE_KERNEL
+_GLOBAL(ftrace_graph_caller)
+	/* load r4 with local address */
+	ld	r4, 128(r1)
+	subi	r4, r4, MCOUNT_INSN_SIZE
+
+	/* Grab the LR out of the caller stack frame */
+	ld	r11, 112(r1)
+	ld	r3, 16(r11)
+
+	bl	prepare_ftrace_return
+	nop
+
+	/*
+	 * prepare_ftrace_return gives us the address we divert to.
+	 * Change the LR in the callers stack frame to this.
+	 */
+	ld	r11, 112(r1)
+	std	r3, 16(r11)
+
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+	blr
+
+#else /* CC_USING_MPROFILE_KERNEL */
+_GLOBAL(ftrace_graph_caller)
+	stdu	r1, -112(r1)
+	/* with -mprofile-kernel, parameter regs are still alive at _mcount */
+	std	r10, 104(r1)
+	std	r9, 96(r1)
+	std	r8, 88(r1)
+	std	r7, 80(r1)
+	std	r6, 72(r1)
+	std	r5, 64(r1)
+	std	r4, 56(r1)
+	std	r3, 48(r1)
+
+	/* Save callee's TOC in the ABI compliant location */
+	std	r2, 24(r1)
+	ld	r2, PACATOC(r13)	/* get kernel TOC in r2 */
+
+	mfctr	r4		/* ftrace_caller has moved local addr here */
+	std	r4, 40(r1)
+	mflr	r3		/* ftrace_caller has restored LR from stack */
+	subi	r4, r4, MCOUNT_INSN_SIZE
+
+	bl	prepare_ftrace_return
+	nop
+
+	/*
+	 * prepare_ftrace_return gives us the address we divert to.
+	 * Change the LR to this.
+	 */
+	mtlr	r3
+
+	ld	r0, 40(r1)
+	mtctr	r0
+	ld	r10, 104(r1)
+	ld	r9, 96(r1)
+	ld	r8, 88(r1)
+	ld	r7, 80(r1)
+	ld	r6, 72(r1)
+	ld	r5, 64(r1)
+	ld	r4, 56(r1)
+	ld	r3, 48(r1)
+
+	/* Restore callee's TOC */
+	ld	r2, 24(r1)
+
+	addi	r1, r1, 112
+	mflr	r0
+	std	r0, LRSAVE(r1)
+	bctr
+#endif /* CC_USING_MPROFILE_KERNEL */
+
+_GLOBAL(return_to_handler)
+	/* need to save return values */
+	std	r4,  -32(r1)
+	std	r3,  -24(r1)
+	/* save TOC */
+	std	r2,  -16(r1)
+	std	r31, -8(r1)
+	mr	r31, r1
+	stdu	r1, -112(r1)
+
+	/*
+	 * We might be called from a module.
+	 * Switch to our TOC to run inside the core kernel.
+	 */
+	ld	r2, PACATOC(r13)
+
+	bl	ftrace_return_to_handler
+	nop
+
+	/* return value has real return address */
+	mtlr	r3
+
+	ld	r1, 0(r1)
+	ld	r4,  -32(r1)
+	ld	r3,  -24(r1)
+	ld	r2,  -16(r1)
+	ld	r31, -8(r1)
+
+	/* Jump back to real return address */
+	blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/trace_clock.c b/arch/powerpc/kernel/trace/trace_clock.c
new file mode 100644
index 000000000000..49170690946d
--- /dev/null
+++ b/arch/powerpc/kernel/trace/trace_clock.c
@@ -0,0 +1,15 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright (C) 2015 Naveen N. Rao, IBM Corporation
+ */
+
+#include <asm/trace_clock.h>
+#include <asm/time.h>
+
+u64 notrace trace_clock_ppc_tb(void)
+{
+	return get_tb();
+}
diff --git a/arch/powerpc/kernel/trace_clock.c b/arch/powerpc/kernel/trace_clock.c
deleted file mode 100644
index 49170690946d..000000000000
--- a/arch/powerpc/kernel/trace_clock.c
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * Copyright (C) 2015 Naveen N. Rao, IBM Corporation
- */
-
-#include <asm/trace_clock.h>
-#include <asm/time.h>
-
-u64 notrace trace_clock_ppc_tb(void)
-{
-	return get_tb();
-}
-- 
cgit v1.2.1


From 096ff2ddba83bf022d593a3096d683e57c4befb0 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Tue, 25 Apr 2017 19:25:54 +0530
Subject: powerpc/ftrace/64: Split further based on -mprofile-kernel

Split ftrace_64.S further retaining the core ftrace 64-bit aspects
in ftrace_64.S and moving ftrace_caller() and ftrace_graph_caller() into
separate files based on -mprofile-kernel. The livepatch routines are all
now contained within the mprofile file.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/trace/Makefile             |   5 +
 arch/powerpc/kernel/trace/ftrace_64.S          | 306 +------------------------
 arch/powerpc/kernel/trace/ftrace_64_mprofile.S | 272 ++++++++++++++++++++++
 arch/powerpc/kernel/trace/ftrace_64_pg.S       |  68 ++++++
 4 files changed, 346 insertions(+), 305 deletions(-)
 create mode 100644 arch/powerpc/kernel/trace/ftrace_64_mprofile.S
 create mode 100644 arch/powerpc/kernel/trace/ftrace_64_pg.S

(limited to 'arch')

diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile
index 5f5a35254a9b..729dffc5f7bc 100644
--- a/arch/powerpc/kernel/trace/Makefile
+++ b/arch/powerpc/kernel/trace/Makefile
@@ -11,6 +11,11 @@ endif
 
 obj32-$(CONFIG_FUNCTION_TRACER)		+= ftrace_32.o
 obj64-$(CONFIG_FUNCTION_TRACER)		+= ftrace_64.o
+ifdef CONFIG_MPROFILE_KERNEL
+obj64-$(CONFIG_FUNCTION_TRACER)		+= ftrace_64_mprofile.o
+else
+obj64-$(CONFIG_FUNCTION_TRACER)		+= ftrace_64_pg.o
+endif
 obj-$(CONFIG_DYNAMIC_FTRACE)		+= ftrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS)		+= ftrace.o
diff --git a/arch/powerpc/kernel/trace/ftrace_64.S b/arch/powerpc/kernel/trace/ftrace_64.S
index 587e7b5c0aff..e5ccea19821e 100644
--- a/arch/powerpc/kernel/trace/ftrace_64.S
+++ b/arch/powerpc/kernel/trace/ftrace_64.S
@@ -23,233 +23,7 @@ EXPORT_SYMBOL(_mcount)
 	mtlr	r0
 	bctr
 
-#ifndef CC_USING_MPROFILE_KERNEL
-_GLOBAL_TOC(ftrace_caller)
-	/* Taken from output of objdump from lib64/glibc */
-	mflr	r3
-	ld	r11, 0(r1)
-	stdu	r1, -112(r1)
-	std	r3, 128(r1)
-	ld	r4, 16(r11)
-	subi	r3, r3, MCOUNT_INSN_SIZE
-.globl ftrace_call
-ftrace_call:
-	bl	ftrace_stub
-	nop
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
-	b	ftrace_graph_stub
-_GLOBAL(ftrace_graph_stub)
-#endif
-	ld	r0, 128(r1)
-	mtlr	r0
-	addi	r1, r1, 112
-
-#else /* CC_USING_MPROFILE_KERNEL */
-/*
- *
- * ftrace_caller() is the function that replaces _mcount() when ftrace is
- * active.
- *
- * We arrive here after a function A calls function B, and we are the trace
- * function for B. When we enter r1 points to A's stack frame, B has not yet
- * had a chance to allocate one yet.
- *
- * Additionally r2 may point either to the TOC for A, or B, depending on
- * whether B did a TOC setup sequence before calling us.
- *
- * On entry the LR points back to the _mcount() call site, and r0 holds the
- * saved LR as it was on entry to B, ie. the original return address at the
- * call site in A.
- *
- * Our job is to save the register state into a struct pt_regs (on the stack)
- * and then arrange for the ftrace function to be called.
- */
-_GLOBAL(ftrace_caller)
-	/* Save the original return address in A's stack frame */
-	std	r0,LRSAVE(r1)
-
-	/* Create our stack frame + pt_regs */
-	stdu	r1,-SWITCH_FRAME_SIZE(r1)
-
-	/* Save all gprs to pt_regs */
-	SAVE_8GPRS(0,r1)
-	SAVE_8GPRS(8,r1)
-	SAVE_8GPRS(16,r1)
-	SAVE_8GPRS(24,r1)
-
-	/* Load special regs for save below */
-	mfmsr   r8
-	mfctr   r9
-	mfxer   r10
-	mfcr	r11
-
-	/* Get the _mcount() call site out of LR */
-	mflr	r7
-	/* Save it as pt_regs->nip */
-	std     r7, _NIP(r1)
-	/* Save the read LR in pt_regs->link */
-	std     r0, _LINK(r1)
-
-	/* Save callee's TOC in the ABI compliant location */
-	std	r2, 24(r1)
-	ld	r2,PACATOC(r13)	/* get kernel TOC in r2 */
-
-	addis	r3,r2,function_trace_op@toc@ha
-	addi	r3,r3,function_trace_op@toc@l
-	ld	r5,0(r3)
-
-#ifdef CONFIG_LIVEPATCH
-	mr	r14,r7		/* remember old NIP */
-#endif
-	/* Calculate ip from nip-4 into r3 for call below */
-	subi    r3, r7, MCOUNT_INSN_SIZE
-
-	/* Put the original return address in r4 as parent_ip */
-	mr	r4, r0
-
-	/* Save special regs */
-	std     r8, _MSR(r1)
-	std     r9, _CTR(r1)
-	std     r10, _XER(r1)
-	std     r11, _CCR(r1)
-
-	/* Load &pt_regs in r6 for call below */
-	addi    r6, r1 ,STACK_FRAME_OVERHEAD
-
-	/* ftrace_call(r3, r4, r5, r6) */
-.globl ftrace_call
-ftrace_call:
-	bl	ftrace_stub
-	nop
-
-	/* Load ctr with the possibly modified NIP */
-	ld	r3, _NIP(r1)
-	mtctr	r3
-#ifdef CONFIG_LIVEPATCH
-	cmpd	r14,r3		/* has NIP been altered? */
-#endif
-
-	/* Restore gprs */
-	REST_8GPRS(0,r1)
-	REST_8GPRS(8,r1)
-	REST_8GPRS(16,r1)
-	REST_8GPRS(24,r1)
-
-	/* Restore possibly modified LR */
-	ld	r0, _LINK(r1)
-	mtlr	r0
-
-	/* Restore callee's TOC */
-	ld	r2, 24(r1)
-
-	/* Pop our stack frame */
-	addi r1, r1, SWITCH_FRAME_SIZE
-
-#ifdef CONFIG_LIVEPATCH
-        /* Based on the cmpd above, if the NIP was altered handle livepatch */
-	bne-	livepatch_handler
-#endif
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
-	b	ftrace_graph_stub
-_GLOBAL(ftrace_graph_stub)
-#endif
-
-	bctr			/* jump after _mcount site */
-#endif /* CC_USING_MPROFILE_KERNEL */
-
-_GLOBAL(ftrace_stub)
-	blr
-
-#ifdef CONFIG_LIVEPATCH
-	/*
-	 * This function runs in the mcount context, between two functions. As
-	 * such it can only clobber registers which are volatile and used in
-	 * function linkage.
-	 *
-	 * We get here when a function A, calls another function B, but B has
-	 * been live patched with a new function C.
-	 *
-	 * On entry:
-	 *  - we have no stack frame and can not allocate one
-	 *  - LR points back to the original caller (in A)
-	 *  - CTR holds the new NIP in C
-	 *  - r0 & r12 are free
-	 *
-	 * r0 can't be used as the base register for a DS-form load or store, so
-	 * we temporarily shuffle r1 (stack pointer) into r0 and then put it back.
-	 */
-livepatch_handler:
-	CURRENT_THREAD_INFO(r12, r1)
-
-	/* Save stack pointer into r0 */
-	mr	r0, r1
-
-	/* Allocate 3 x 8 bytes */
-	ld	r1, TI_livepatch_sp(r12)
-	addi	r1, r1, 24
-	std	r1, TI_livepatch_sp(r12)
-
-	/* Save toc & real LR on livepatch stack */
-	std	r2,  -24(r1)
-	mflr	r12
-	std	r12, -16(r1)
-
-	/* Store stack end marker */
-	lis     r12, STACK_END_MAGIC@h
-	ori     r12, r12, STACK_END_MAGIC@l
-	std	r12, -8(r1)
-
-	/* Restore real stack pointer */
-	mr	r1, r0
-
-	/* Put ctr in r12 for global entry and branch there */
-	mfctr	r12
-	bctrl
-
-	/*
-	 * Now we are returning from the patched function to the original
-	 * caller A. We are free to use r0 and r12, and we can use r2 until we
-	 * restore it.
-	 */
-
-	CURRENT_THREAD_INFO(r12, r1)
-
-	/* Save stack pointer into r0 */
-	mr	r0, r1
-
-	ld	r1, TI_livepatch_sp(r12)
-
-	/* Check stack marker hasn't been trashed */
-	lis     r2,  STACK_END_MAGIC@h
-	ori     r2,  r2, STACK_END_MAGIC@l
-	ld	r12, -8(r1)
-1:	tdne	r12, r2
-	EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0
-
-	/* Restore LR & toc from livepatch stack */
-	ld	r12, -16(r1)
-	mtlr	r12
-	ld	r2,  -24(r1)
-
-	/* Pop livepatch stack frame */
-	CURRENT_THREAD_INFO(r12, r0)
-	subi	r1, r1, 24
-	std	r1, TI_livepatch_sp(r12)
-
-	/* Restore real stack pointer */
-	mr	r1, r0
-
-	/* Return to original caller of live patched function */
-	blr
-#endif
-
-
-#else
+#else /* CONFIG_DYNAMIC_FTRACE */
 _GLOBAL_TOC(_mcount)
 EXPORT_SYMBOL(_mcount)
 	/* Taken from output of objdump from lib64/glibc */
@@ -267,7 +41,6 @@ EXPORT_SYMBOL(_mcount)
 	bctrl
 	nop
 
-
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	b	ftrace_graph_caller
 #endif
@@ -276,86 +49,9 @@ EXPORT_SYMBOL(_mcount)
 	addi	r1, r1, 112
 _GLOBAL(ftrace_stub)
 	blr
-
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-#ifndef CC_USING_MPROFILE_KERNEL
-_GLOBAL(ftrace_graph_caller)
-	/* load r4 with local address */
-	ld	r4, 128(r1)
-	subi	r4, r4, MCOUNT_INSN_SIZE
-
-	/* Grab the LR out of the caller stack frame */
-	ld	r11, 112(r1)
-	ld	r3, 16(r11)
-
-	bl	prepare_ftrace_return
-	nop
-
-	/*
-	 * prepare_ftrace_return gives us the address we divert to.
-	 * Change the LR in the callers stack frame to this.
-	 */
-	ld	r11, 112(r1)
-	std	r3, 16(r11)
-
-	ld	r0, 128(r1)
-	mtlr	r0
-	addi	r1, r1, 112
-	blr
-
-#else /* CC_USING_MPROFILE_KERNEL */
-_GLOBAL(ftrace_graph_caller)
-	stdu	r1, -112(r1)
-	/* with -mprofile-kernel, parameter regs are still alive at _mcount */
-	std	r10, 104(r1)
-	std	r9, 96(r1)
-	std	r8, 88(r1)
-	std	r7, 80(r1)
-	std	r6, 72(r1)
-	std	r5, 64(r1)
-	std	r4, 56(r1)
-	std	r3, 48(r1)
-
-	/* Save callee's TOC in the ABI compliant location */
-	std	r2, 24(r1)
-	ld	r2, PACATOC(r13)	/* get kernel TOC in r2 */
-
-	mfctr	r4		/* ftrace_caller has moved local addr here */
-	std	r4, 40(r1)
-	mflr	r3		/* ftrace_caller has restored LR from stack */
-	subi	r4, r4, MCOUNT_INSN_SIZE
-
-	bl	prepare_ftrace_return
-	nop
-
-	/*
-	 * prepare_ftrace_return gives us the address we divert to.
-	 * Change the LR to this.
-	 */
-	mtlr	r3
-
-	ld	r0, 40(r1)
-	mtctr	r0
-	ld	r10, 104(r1)
-	ld	r9, 96(r1)
-	ld	r8, 88(r1)
-	ld	r7, 80(r1)
-	ld	r6, 72(r1)
-	ld	r5, 64(r1)
-	ld	r4, 56(r1)
-	ld	r3, 48(r1)
-
-	/* Restore callee's TOC */
-	ld	r2, 24(r1)
-
-	addi	r1, r1, 112
-	mflr	r0
-	std	r0, LRSAVE(r1)
-	bctr
-#endif /* CC_USING_MPROFILE_KERNEL */
-
 _GLOBAL(return_to_handler)
 	/* need to save return values */
 	std	r4,  -32(r1)
diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
new file mode 100644
index 000000000000..7c933a99f5d5
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
@@ -0,0 +1,272 @@
+/*
+ * Split from ftrace_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+#include <asm/thread_info.h>
+#include <asm/bug.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ *
+ * ftrace_caller() is the function that replaces _mcount() when ftrace is
+ * active.
+ *
+ * We arrive here after a function A calls function B, and we are the trace
+ * function for B. When we enter r1 points to A's stack frame, B has not yet
+ * had a chance to allocate one yet.
+ *
+ * Additionally r2 may point either to the TOC for A, or B, depending on
+ * whether B did a TOC setup sequence before calling us.
+ *
+ * On entry the LR points back to the _mcount() call site, and r0 holds the
+ * saved LR as it was on entry to B, ie. the original return address at the
+ * call site in A.
+ *
+ * Our job is to save the register state into a struct pt_regs (on the stack)
+ * and then arrange for the ftrace function to be called.
+ */
+_GLOBAL(ftrace_caller)
+	/* Save the original return address in A's stack frame */
+	std	r0,LRSAVE(r1)
+
+	/* Create our stack frame + pt_regs */
+	stdu	r1,-SWITCH_FRAME_SIZE(r1)
+
+	/* Save all gprs to pt_regs */
+	SAVE_8GPRS(0,r1)
+	SAVE_8GPRS(8,r1)
+	SAVE_8GPRS(16,r1)
+	SAVE_8GPRS(24,r1)
+
+	/* Load special regs for save below */
+	mfmsr   r8
+	mfctr   r9
+	mfxer   r10
+	mfcr	r11
+
+	/* Get the _mcount() call site out of LR */
+	mflr	r7
+	/* Save it as pt_regs->nip */
+	std     r7, _NIP(r1)
+	/* Save the read LR in pt_regs->link */
+	std     r0, _LINK(r1)
+
+	/* Save callee's TOC in the ABI compliant location */
+	std	r2, 24(r1)
+	ld	r2,PACATOC(r13)	/* get kernel TOC in r2 */
+
+	addis	r3,r2,function_trace_op@toc@ha
+	addi	r3,r3,function_trace_op@toc@l
+	ld	r5,0(r3)
+
+#ifdef CONFIG_LIVEPATCH
+	mr	r14,r7		/* remember old NIP */
+#endif
+	/* Calculate ip from nip-4 into r3 for call below */
+	subi    r3, r7, MCOUNT_INSN_SIZE
+
+	/* Put the original return address in r4 as parent_ip */
+	mr	r4, r0
+
+	/* Save special regs */
+	std     r8, _MSR(r1)
+	std     r9, _CTR(r1)
+	std     r10, _XER(r1)
+	std     r11, _CCR(r1)
+
+	/* Load &pt_regs in r6 for call below */
+	addi    r6, r1 ,STACK_FRAME_OVERHEAD
+
+	/* ftrace_call(r3, r4, r5, r6) */
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+
+	/* Load ctr with the possibly modified NIP */
+	ld	r3, _NIP(r1)
+	mtctr	r3
+#ifdef CONFIG_LIVEPATCH
+	cmpd	r14,r3		/* has NIP been altered? */
+#endif
+
+	/* Restore gprs */
+	REST_8GPRS(0,r1)
+	REST_8GPRS(8,r1)
+	REST_8GPRS(16,r1)
+	REST_8GPRS(24,r1)
+
+	/* Restore possibly modified LR */
+	ld	r0, _LINK(r1)
+	mtlr	r0
+
+	/* Restore callee's TOC */
+	ld	r2, 24(r1)
+
+	/* Pop our stack frame */
+	addi r1, r1, SWITCH_FRAME_SIZE
+
+#ifdef CONFIG_LIVEPATCH
+        /* Based on the cmpd above, if the NIP was altered handle livepatch */
+	bne-	livepatch_handler
+#endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	b	ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+
+	bctr			/* jump after _mcount site */
+
+_GLOBAL(ftrace_stub)
+	blr
+
+#ifdef CONFIG_LIVEPATCH
+	/*
+	 * This function runs in the mcount context, between two functions. As
+	 * such it can only clobber registers which are volatile and used in
+	 * function linkage.
+	 *
+	 * We get here when a function A, calls another function B, but B has
+	 * been live patched with a new function C.
+	 *
+	 * On entry:
+	 *  - we have no stack frame and can not allocate one
+	 *  - LR points back to the original caller (in A)
+	 *  - CTR holds the new NIP in C
+	 *  - r0 & r12 are free
+	 *
+	 * r0 can't be used as the base register for a DS-form load or store, so
+	 * we temporarily shuffle r1 (stack pointer) into r0 and then put it back.
+	 */
+livepatch_handler:
+	CURRENT_THREAD_INFO(r12, r1)
+
+	/* Save stack pointer into r0 */
+	mr	r0, r1
+
+	/* Allocate 3 x 8 bytes */
+	ld	r1, TI_livepatch_sp(r12)
+	addi	r1, r1, 24
+	std	r1, TI_livepatch_sp(r12)
+
+	/* Save toc & real LR on livepatch stack */
+	std	r2,  -24(r1)
+	mflr	r12
+	std	r12, -16(r1)
+
+	/* Store stack end marker */
+	lis     r12, STACK_END_MAGIC@h
+	ori     r12, r12, STACK_END_MAGIC@l
+	std	r12, -8(r1)
+
+	/* Restore real stack pointer */
+	mr	r1, r0
+
+	/* Put ctr in r12 for global entry and branch there */
+	mfctr	r12
+	bctrl
+
+	/*
+	 * Now we are returning from the patched function to the original
+	 * caller A. We are free to use r0 and r12, and we can use r2 until we
+	 * restore it.
+	 */
+
+	CURRENT_THREAD_INFO(r12, r1)
+
+	/* Save stack pointer into r0 */
+	mr	r0, r1
+
+	ld	r1, TI_livepatch_sp(r12)
+
+	/* Check stack marker hasn't been trashed */
+	lis     r2,  STACK_END_MAGIC@h
+	ori     r2,  r2, STACK_END_MAGIC@l
+	ld	r12, -8(r1)
+1:	tdne	r12, r2
+	EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0
+
+	/* Restore LR & toc from livepatch stack */
+	ld	r12, -16(r1)
+	mtlr	r12
+	ld	r2,  -24(r1)
+
+	/* Pop livepatch stack frame */
+	CURRENT_THREAD_INFO(r12, r0)
+	subi	r1, r1, 24
+	std	r1, TI_livepatch_sp(r12)
+
+	/* Restore real stack pointer */
+	mr	r1, r0
+
+	/* Return to original caller of live patched function */
+	blr
+#endif /* CONFIG_LIVEPATCH */
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+	stdu	r1, -112(r1)
+	/* with -mprofile-kernel, parameter regs are still alive at _mcount */
+	std	r10, 104(r1)
+	std	r9, 96(r1)
+	std	r8, 88(r1)
+	std	r7, 80(r1)
+	std	r6, 72(r1)
+	std	r5, 64(r1)
+	std	r4, 56(r1)
+	std	r3, 48(r1)
+
+	/* Save callee's TOC in the ABI compliant location */
+	std	r2, 24(r1)
+	ld	r2, PACATOC(r13)	/* get kernel TOC in r2 */
+
+	mfctr	r4		/* ftrace_caller has moved local addr here */
+	std	r4, 40(r1)
+	mflr	r3		/* ftrace_caller has restored LR from stack */
+	subi	r4, r4, MCOUNT_INSN_SIZE
+
+	bl	prepare_ftrace_return
+	nop
+
+	/*
+	 * prepare_ftrace_return gives us the address we divert to.
+	 * Change the LR to this.
+	 */
+	mtlr	r3
+
+	ld	r0, 40(r1)
+	mtctr	r0
+	ld	r10, 104(r1)
+	ld	r9, 96(r1)
+	ld	r8, 88(r1)
+	ld	r7, 80(r1)
+	ld	r6, 72(r1)
+	ld	r5, 64(r1)
+	ld	r4, 56(r1)
+	ld	r3, 48(r1)
+
+	/* Restore callee's TOC */
+	ld	r2, 24(r1)
+
+	addi	r1, r1, 112
+	mflr	r0
+	std	r0, LRSAVE(r1)
+	bctr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S b/arch/powerpc/kernel/trace/ftrace_64_pg.S
new file mode 100644
index 000000000000..f095358da96e
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.S
@@ -0,0 +1,68 @@
+/*
+ * Split from ftrace_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+_GLOBAL_TOC(ftrace_caller)
+	/* Taken from output of objdump from lib64/glibc */
+	mflr	r3
+	ld	r11, 0(r1)
+	stdu	r1, -112(r1)
+	std	r3, 128(r1)
+	ld	r4, 16(r11)
+	subi	r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	b	ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+
+_GLOBAL(ftrace_stub)
+	blr
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+	/* load r4 with local address */
+	ld	r4, 128(r1)
+	subi	r4, r4, MCOUNT_INSN_SIZE
+
+	/* Grab the LR out of the caller stack frame */
+	ld	r11, 112(r1)
+	ld	r3, 16(r11)
+
+	bl	prepare_ftrace_return
+	nop
+
+	/*
+	 * prepare_ftrace_return gives us the address we divert to.
+	 * Change the LR in the callers stack frame to this.
+	 */
+	ld	r11, 112(r1)
+	std	r3, 16(r11)
+
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+	blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-- 
cgit v1.2.1


From 461e96a3374892d551fe270a975f33836c7f7d72 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Mon, 20 Mar 2017 16:31:48 +1000
Subject: powerpc/pasemi: Do not process external or decrementer interrupts
 from sreset

PA Semi will wake from low power state at the system reset interrupt,
with the event encoded in SRR1, rather than waking at the interrupt
vector that corresponds to that event.

The system reset handler for this platform decodes SRR1 event reason
and calls the interrupt handler to process it directly from the system
reset handlre.

A subsequent change will treat the system reset interrupt as a Linux NMI
with its own per-CPU stack, and this will no longer work. Remove the
external and decrementer handlers from the system reset handler.

- The external exception remains raised and will fire again at the
  EE interrupt vector when system reset returns.

- The decrementer is set to 1 so it will be raised again and fire when
  the system reset returns.

It is possible to branch to an idle handler from the system reset
interrupt (like POWER does), then restore a normal stack and restore
this optimisation. But simplicity wins for now.

Tested-by: Christian Zigotzky <chzigotzky@xenosoft.de>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pasemi/idle.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/pasemi/idle.c b/arch/powerpc/platforms/pasemi/idle.c
index 75b296bc51af..44e0d9226f0a 100644
--- a/arch/powerpc/platforms/pasemi/idle.c
+++ b/arch/powerpc/platforms/pasemi/idle.c
@@ -53,11 +53,14 @@ static int pasemi_system_reset_exception(struct pt_regs *regs)
 		regs->nip = regs->link;
 
 	switch (regs->msr & SRR1_WAKEMASK) {
-	case SRR1_WAKEEE:
-		do_IRQ(regs);
-		break;
 	case SRR1_WAKEDEC:
-		timer_interrupt(regs);
+		set_dec(1);
+	case SRR1_WAKEEE:
+		/*
+		 * Handle these when interrupts get re-enabled and we take
+		 * them as regular exceptions. We are in an NMI context
+		 * and can't handle these here.
+		 */
 		break;
 	default:
 		/* do system reset */
-- 
cgit v1.2.1


From 6e83985b0f6e30b928eee26eb41d276814a698aa Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Mon, 20 Mar 2017 16:31:49 +1000
Subject: powerpc/cbe: Do not process external or decremeter interrupts from
 sreset

Cell will wake from low power state at the system reset interrupt,
with the event encoded in SRR1, rather than waking at the interrupt
vector that corresponds to that event.

The system reset handler for this platform decodes SRR1 event reason
and calls the interrupt handler to process it directly from the system
reset handlre.

A subsequent change will treat the system reset interrupt as a Linux NMI
with its own per-CPU stack, and this will no longer work. Remove the
external and decrementer handlers from the system reset handler.

- The external exception remains raised and will fire again at the
  EE interrupt vector when system reset returns.

- The decrementer is set to 1 so it will be raised again and fire when
  the system reset returns.

It is possible to branch to an idle handler from the system reset
interrupt (like POWER does), then restore a normal stack and restore
this optimisation. But simplicity wins for now.

Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/cell/pervasive.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/cell/pervasive.c b/arch/powerpc/platforms/cell/pervasive.c
index e7d075077cb0..a88944db9fc3 100644
--- a/arch/powerpc/platforms/cell/pervasive.c
+++ b/arch/powerpc/platforms/cell/pervasive.c
@@ -88,11 +88,14 @@ static void cbe_power_save(void)
 static int cbe_system_reset_exception(struct pt_regs *regs)
 {
 	switch (regs->msr & SRR1_WAKEMASK) {
-	case SRR1_WAKEEE:
-		do_IRQ(regs);
-		break;
 	case SRR1_WAKEDEC:
-		timer_interrupt(regs);
+		set_dec(1);
+	case SRR1_WAKEEE:
+		/*
+		 * Handle these when interrupts get re-enabled and we take
+		 * them as regular exceptions. We are in an NMI context
+		 * and can't handle these here.
+		 */
 		break;
 	case SRR1_WAKEMT:
 		return cbe_sysreset_hack();
-- 
cgit v1.2.1


From 83a980f7f4769c0673f0f966350d1db26993a193 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 20 Dec 2016 04:30:02 +1000
Subject: powerpc/64s: Add exception macro that does not enable RI

Subsequent patches will add more non-RI variant exceptions, so
create a macro for it rather than open-code it.

This does not change generated instructions.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h | 15 +++++++++++++++
 arch/powerpc/kernel/exceptions-64s.S     | 17 ++++-------------
 2 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 89259817f5ef..99ed1d811684 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -200,6 +200,21 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 #define EXCEPTION_PROLOG_PSERIES_1(label, h)				\
 	__EXCEPTION_PROLOG_PSERIES_1(label, h)
 
+/* _NORI variant keeps MSR_RI clear */
+#define __EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)			\
+	ld	r10,PACAKMSR(r13);	/* get MSR value for kernel */	\
+	xori	r10,r10,MSR_RI;		/* Clear MSR_RI */		\
+	mfspr	r11,SPRN_##h##SRR0;	/* save SRR0 */			\
+	LOAD_HANDLER(r12,label)						\
+	mtspr	SPRN_##h##SRR0,r12;					\
+	mfspr	r12,SPRN_##h##SRR1;	/* and SRR1 */			\
+	mtspr	SPRN_##h##SRR1,r10;					\
+	h##rfid;							\
+	b	.	/* prevent speculative execution */
+
+#define EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)			\
+	__EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)
+
 #define EXCEPTION_PROLOG_PSERIES(area, label, h, extra, vec)		\
 	EXCEPTION_PROLOG_0(area);					\
 	EXCEPTION_PROLOG_1(area, extra, vec);				\
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 28f8d7bed6b1..aa65e3cd4875 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -240,20 +240,11 @@ machine_check_fwnmi:
 machine_check_pSeries_0:
 	EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
 	/*
-	 * The following is essentially EXCEPTION_PROLOG_PSERIES_1 with the
-	 * difference that MSR_RI is not enabled, because PACA_EXMC is being
-	 * used, so nested machine check corrupts it. machine_check_common
-	 * enables MSR_RI.
+	 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
+	 * nested machine check corrupts it. machine_check_common enables
+	 * MSR_RI.
 	 */
-	ld	r10,PACAKMSR(r13)
-	xori	r10,r10,MSR_RI
-	mfspr	r11,SPRN_SRR0
-	LOAD_HANDLER(r12, machine_check_common)
-	mtspr	SPRN_SRR0,r12
-	mfspr	r12,SPRN_SRR1
-	mtspr	SPRN_SRR1,r10
-	rfid
-	b	.	/* prevent speculative execution */
+	EXCEPTION_PROLOG_PSERIES_1_NORI(machine_check_common, EXC_STD)
 
 TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
 
-- 
cgit v1.2.1


From a4087a4d38981bef3de7dd814493df202919cd2e Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 20 Dec 2016 04:30:03 +1000
Subject: powerpc/64s: Exception macro for stack frame and initial register
 save

This code is common to a few exceptions, and another user will be added.
This causes a trivial change to generated code:

-     604: std     r9,416(r1)
-     608: mfspr   r11,314
-     60c: std     r11,368(r1)
-     610: mfspr   r12,315
+     604: mfspr   r11,314
+     608: mfspr   r12,315
+     60c: std     r9,416(r1)
+     610: std     r11,368(r1)

machine_check_powernv_early could also use this, but that requires non
trivial changes to generated code, so that's for another patch.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h | 16 ++++++++++------
 arch/powerpc/kernel/exceptions-64s.S     | 13 ++++---------
 2 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 99ed1d811684..229f22a2e9b1 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -323,6 +323,15 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 
 #define NOTEST(n)
 
+#define EXCEPTION_PROLOG_COMMON_1()					   \
+	std	r9,_CCR(r1);		/* save CR in stackframe	*/ \
+	std	r11,_NIP(r1);		/* save SRR0 in stackframe	*/ \
+	std	r12,_MSR(r1);		/* save SRR1 in stackframe	*/ \
+	std	r10,0(r1);		/* make stack chain pointer	*/ \
+	std	r0,GPR0(r1);		/* save r0 in stackframe	*/ \
+	std	r10,GPR1(r1);		/* save r1 in stackframe	*/ \
+
+
 /*
  * The common exception prolog is used for all except a few exceptions
  * such as a segment miss on a kernel address.  We have to be prepared
@@ -347,12 +356,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	addi	r3,r13,area;		/* r3 -> where regs are saved*/	   \
 	RESTORE_CTR(r1, area);						   \
 	b	bad_stack;						   \
-3:	std	r9,_CCR(r1);		/* save CR in stackframe	*/ \
-	std	r11,_NIP(r1);		/* save SRR0 in stackframe	*/ \
-	std	r12,_MSR(r1);		/* save SRR1 in stackframe	*/ \
-	std	r10,0(r1);		/* make stack chain pointer	*/ \
-	std	r0,GPR0(r1);		/* save r0 in stackframe	*/ \
-	std	r10,GPR1(r1);		/* save r1 in stackframe	*/ \
+3:	EXCEPTION_PROLOG_COMMON_1();					   \
 	beq	4f;			/* if from kernel mode		*/ \
 	ACCOUNT_CPU_USER_ENTRY(r13, r9, r10);				   \
 	SAVE_PPR(area, r9, r10);					   \
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index aa65e3cd4875..f765531c67b1 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -929,17 +929,12 @@ EXC_VIRT_NONE(0x4e60, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe60)
 TRAMP_REAL_BEGIN(hmi_exception_early)
 	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, 0xe60)
-	mr	r10,r1			/* Save r1			*/
-	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack		*/
+	mr	r10,r1			/* Save r1 */
+	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack for realmode */
 	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
-	std	r9,_CCR(r1)		/* save CR in stackframe	*/
 	mfspr	r11,SPRN_HSRR0		/* Save HSRR0 */
-	std	r11,_NIP(r1)		/* save HSRR0 in stackframe	*/
-	mfspr	r12,SPRN_HSRR1		/* Save SRR1 */
-	std	r12,_MSR(r1)		/* save SRR1 in stackframe	*/
-	std	r10,0(r1)		/* make stack chain pointer	*/
-	std	r0,GPR0(r1)		/* save r0 in stackframe	*/
-	std	r10,GPR1(r1)		/* save r1 in stackframe	*/
+	mfspr	r12,SPRN_HSRR1		/* Save HSRR1 */
+	EXCEPTION_PROLOG_COMMON_1()
 	EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
 	EXCEPTION_PROLOG_COMMON_3(0xe60)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-- 
cgit v1.2.1


From a3d96f70c14773d0928c6a54fd278138f0868572 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 20 Dec 2016 04:30:04 +1000
Subject: powerpc/64s: Fix system reset vs general interrupt reentrancy

The system reset interrupt can occur when MSR_EE=0, and it currently
uses the PACA_EXGEN save area.

Some PACA_EXGEN interrupts have a window where MSR_RI=1 and MSR_EE=0
when the save area is still in use. A system reset interrupt in this
window can lead to undetected corruption when the save area gets
overwritten.

This patch introduces PACA_EXNMI save area for system reset exceptions,
which closes this corruption window. It's also helpful to retain the
EXGEN state for debugging situations, even if not considering the
recoverability aspect.

This patch also moves the PACA_EXMC area down to a less frequently used
part of the paca with the new save area.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h | 14 +++++++-------
 arch/powerpc/include/asm/paca.h          |  6 +++++-
 arch/powerpc/kernel/asm-offsets.c        |  1 +
 arch/powerpc/kernel/exceptions-64s.S     |  9 ++++++---
 4 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 229f22a2e9b1..63a309a432ad 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -549,8 +549,8 @@ BEGIN_FTR_SECTION				\
 	beql	ppc64_runlatch_on_trampoline;	\
 END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
 
-#define EXCEPTION_COMMON(trap, label, hdlr, ret, additions)	\
-	EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN);		\
+#define EXCEPTION_COMMON(area, trap, label, hdlr, ret, additions) \
+	EXCEPTION_PROLOG_COMMON(trap, area);			\
 	/* Volatile regs are potentially clobbered here */	\
 	additions;						\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
@@ -558,17 +558,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
 	b	ret
 
 #define STD_EXCEPTION_COMMON(trap, label, hdlr)			\
-	EXCEPTION_COMMON(trap, label, hdlr, ret_from_except,	\
-			 ADD_NVGPRS;ADD_RECONCILE)
+	EXCEPTION_COMMON(PACA_EXGEN, trap, label, hdlr,		\
+		ret_from_except, ADD_NVGPRS;ADD_RECONCILE)
 
 /*
  * Like STD_EXCEPTION_COMMON, but for exceptions that can occur
  * in the idle task and therefore need the special idle handling
  * (finish nap and runlatch)
  */
-#define STD_EXCEPTION_COMMON_ASYNC(trap, label, hdlr)		  \
-	EXCEPTION_COMMON(trap, label, hdlr, ret_from_except_lite, \
-			 FINISH_NAP;ADD_RECONCILE;RUNLATCH_ON)
+#define STD_EXCEPTION_COMMON_ASYNC(trap, label, hdlr)		\
+	EXCEPTION_COMMON(PACA_EXGEN, trap, label, hdlr,		\
+		ret_from_except_lite, FINISH_NAP;ADD_RECONCILE;RUNLATCH_ON)
 
 /*
  * When the idle code in power4_idle puts the CPU into NAP mode,
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 140ddb9ae5a8..601e2327dd8c 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,7 +99,6 @@ struct paca_struct {
 	 */
 	/* used for most interrupts/exceptions */
 	u64 exgen[13] __attribute__((aligned(0x80)));
-	u64 exmc[13];		/* used for machine checks */
 	u64 exslb[13];		/* used for SLB/segment table misses
  				 * on the linear mapping */
 	/* SLB related definitions */
@@ -180,6 +179,11 @@ struct paca_struct {
 	struct paca_struct **thread_sibling_pacas;
 #endif
 
+#ifdef CONFIG_PPC_STD_MMU_64
+	/* Non-maskable exceptions that are not performance critical */
+	u64 exnmi[13];		/* used for system reset (nmi) */
+	u64 exmc[13];		/* used for machine checks */
+#endif
 #ifdef CONFIG_PPC_BOOK3S_64
 	/* Exclusive emergency stack pointer for machine check exception. */
 	void *mc_emergency_sp;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8e1163426ccb..46732ee70b43 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -220,6 +220,7 @@ int main(void)
 	OFFSET(PACA_EXGEN, paca_struct, exgen);
 	OFFSET(PACA_EXMC, paca_struct, exmc);
 	OFFSET(PACA_EXSLB, paca_struct, exslb);
+	OFFSET(PACA_EXNMI, paca_struct, exnmi);
 	OFFSET(PACALPPACAPTR, paca_struct, lppaca_ptr);
 	OFFSET(PACA_SLBSHADOWPTR, paca_struct, slb_shadow_ptr);
 	OFFSET(SLBSHADOW_STACKVSID, slb_shadow, save_area[SLB_NUM_BOLTED - 1].vsid);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index f765531c67b1..3a654b1f24e9 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -116,7 +116,7 @@ EXC_VIRT_NONE(0x4000, 0x100)
 
 EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
+	EXCEPTION_PROLOG_PSERIES(PACA_EXNMI, system_reset_common, EXC_STD,
 				 IDLETEST, 0x100)
 
 EXC_REAL_END(system_reset, 0x100, 0x100)
@@ -127,7 +127,10 @@ EXC_COMMON_BEGIN(system_reset_idle_common)
 	b	pnv_powersave_wakeup
 #endif
 
-EXC_COMMON(system_reset_common, 0x100, system_reset_exception)
+EXC_COMMON_BEGIN(system_reset_common)
+	EXCEPTION_COMMON(PACA_EXNMI, 0x100,
+			system_reset, system_reset_exception,
+			ret_from_except, ADD_NVGPRS;ADD_RECONCILE)
 
 #ifdef CONFIG_PPC_PSERIES
 /*
@@ -135,7 +138,7 @@ EXC_COMMON(system_reset_common, 0x100, system_reset_exception)
  */
 TRAMP_REAL_BEGIN(system_reset_fwnmi)
 	SET_SCRATCH0(r13)		/* save r13 */
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
+	EXCEPTION_PROLOG_PSERIES(PACA_EXNMI, system_reset_common, EXC_STD,
 				 NOTEST, 0x100)
 #endif /* CONFIG_PPC_PSERIES */
 
-- 
cgit v1.2.1


From c4f3b52ce7b16824befb16ab3d045c891b08b7db Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 20 Dec 2016 04:30:05 +1000
Subject: powerpc/64s: Disallow system reset vs system reset reentrancy

In preparation for using a dedicated stack for system reset interrupts,
prevent a nested system reset from recovering, in order to simplify
code that is called in crash/debug path. This allows a system reset
interrupt to just use the base stack pointer.

Keep an in_nmi nesting counter similarly to the in_mce counter. Consider
the interrrupt non-recoverable if it is taken inside another system
reset.

Interrupt nesting could be allowed similarly to MCE, but system reset
is a special case that's not for normal operation, so simplicity wins
until there is requirement for nested system reset interrupts.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h |  6 ++++++
 arch/powerpc/include/asm/paca.h          |  5 ++++-
 arch/powerpc/kernel/asm-offsets.c        |  1 +
 arch/powerpc/kernel/exceptions-64s.S     | 37 +++++++++++++++++++++++++++-----
 arch/powerpc/kernel/traps.c              |  8 ++++++-
 arch/powerpc/xmon/xmon.c                 |  1 +
 6 files changed, 51 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 63a309a432ad..59134ebd7d49 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -275,6 +275,12 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 
 #endif
 
+/* Do not enable RI */
+#define EXCEPTION_PROLOG_PSERIES_NORI(area, label, h, extra, vec)	\
+	EXCEPTION_PROLOG_0(area);					\
+	EXCEPTION_PROLOG_1(area, extra, vec);				\
+	EXCEPTION_PROLOG_PSERIES_1_NORI(label, h);
+
 
 #define __KVM_HANDLER(area, h, n)					\
 	BEGIN_FTR_SECTION_NESTED(947)					\
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 601e2327dd8c..b1197c1affac 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -187,12 +187,15 @@ struct paca_struct {
 #ifdef CONFIG_PPC_BOOK3S_64
 	/* Exclusive emergency stack pointer for machine check exception. */
 	void *mc_emergency_sp;
+
+	u16 in_nmi;			/* In nmi handler */
+
 	/*
 	 * Flag to check whether we are in machine check early handler
 	 * and already using emergency stack.
 	 */
 	u16 in_mce;
-	u8 hmi_event_available;		 /* HMI event is available */
+	u8 hmi_event_available;		/* HMI event is available */
 #endif
 
 	/* Stuff for accurate time accounting */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 46732ee70b43..1466e96d36cf 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -235,6 +235,7 @@ int main(void)
 #ifdef CONFIG_PPC_BOOK3S_64
 	OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp);
 	OFFSET(PACA_IN_MCE, paca_struct, in_mce);
+	OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
 #endif
 	OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
 	OFFSET(PACAKEXECSTATE, paca_struct, kexec_state);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 3a654b1f24e9..4be62568fbc2 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -116,7 +116,11 @@ EXC_VIRT_NONE(0x4000, 0x100)
 
 EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_PSERIES(PACA_EXNMI, system_reset_common, EXC_STD,
+	/*
+	 * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is
+	 * being used, so a nested NMI exception would corrupt it.
+	 */
+	EXCEPTION_PROLOG_PSERIES_NORI(PACA_EXNMI, system_reset_common, EXC_STD,
 				 IDLETEST, 0x100)
 
 EXC_REAL_END(system_reset, 0x100, 0x100)
@@ -128,9 +132,31 @@ EXC_COMMON_BEGIN(system_reset_idle_common)
 #endif
 
 EXC_COMMON_BEGIN(system_reset_common)
+	/*
+	 * Increment paca->in_nmi then enable MSR_RI. SLB or MCE will be able
+	 * to recover, but nested NMI will notice in_nmi and not recover
+	 * because of the use of the NMI stack. in_nmi reentrancy is tested in
+	 * system_reset_exception.
+	 */
+	lhz	r10,PACA_IN_NMI(r13)
+	addi	r10,r10,1
+	sth	r10,PACA_IN_NMI(r13)
+	li	r10,MSR_RI
+	mtmsrd 	r10,1
+
 	EXCEPTION_COMMON(PACA_EXNMI, 0x100,
-			system_reset, system_reset_exception,
-			ret_from_except, ADD_NVGPRS;ADD_RECONCILE)
+			system_reset, system_reset_exception, 1f,
+			ADD_NVGPRS;ADD_RECONCILE)
+1: /* EXCEPTION_COMMON continues here */
+
+	/*
+	 * The stack is no longer in use, decrement in_nmi.
+	 */
+	lhz	r10,PACA_IN_NMI(r13)
+	subi	r10,r10,1
+	sth	r10,PACA_IN_NMI(r13)
+
+	b	ret_from_except
 
 #ifdef CONFIG_PPC_PSERIES
 /*
@@ -138,8 +164,9 @@ EXC_COMMON_BEGIN(system_reset_common)
  */
 TRAMP_REAL_BEGIN(system_reset_fwnmi)
 	SET_SCRATCH0(r13)		/* save r13 */
-	EXCEPTION_PROLOG_PSERIES(PACA_EXNMI, system_reset_common, EXC_STD,
-				 NOTEST, 0x100)
+	/* See comment at system_reset exception */
+	EXCEPTION_PROLOG_PSERIES_NORI(PACA_EXNMI, system_reset_common,
+						EXC_STD, NOTEST, 0x100)
 #endif /* CONFIG_PPC_PSERIES */
 
 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 76f6045b021b..2393d3c46236 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -282,11 +282,17 @@ void system_reset_exception(struct pt_regs *regs)
 	/* See if any machine dependent calls */
 	if (ppc_md.system_reset_exception) {
 		if (ppc_md.system_reset_exception(regs))
-			return;
+			goto out;
 	}
 
 	die("System Reset", regs, SIGABRT);
 
+out:
+#ifdef CONFIG_PPC_BOOK3S_64
+	BUG_ON(get_paca()->in_nmi == 0);
+	if (get_paca()->in_nmi > 1)
+		panic("Unrecoverable nested System Reset");
+#endif
 	/* Must die if the interrupt is not recoverable */
 	if (!(regs->msr & MSR_RI))
 		panic("Unrecoverable System Reset");
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 2b21e90fff8d..99348a2961b8 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2236,6 +2236,7 @@ static void dump_one_paca(int cpu)
 	DUMP(p, emergency_sp, "p");
 #ifdef CONFIG_PPC_BOOK3S_64
 	DUMP(p, mc_emergency_sp, "p");
+	DUMP(p, in_nmi, "x");
 	DUMP(p, in_mce, "x");
 	DUMP(p, hmi_event_available, "x");
 #endif
-- 
cgit v1.2.1


From b1ee8a3de5790777f325416ad97340428d8ae25f Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 20 Dec 2016 04:30:06 +1000
Subject: powerpc/64s: Dedicated system reset interrupt stack

The system reset interrupt is used for crash/debug situations, so it is
desirable to have as little impact on the normal state of the system as
possible.

Currently it uses the current kernel stack to process the exception.
This stores into the stack which may be involved with the crash. The
stack pointer may be corrupted, or it may have overflowed.

Avoid or minimise these problems by creating a dedicated NMI stack for
the system reset interrupt to use.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h | 13 +++++++++++++
 arch/powerpc/include/asm/paca.h          |  3 ++-
 arch/powerpc/kernel/asm-offsets.c        |  1 +
 arch/powerpc/kernel/exceptions-64s.S     |  8 +++++---
 arch/powerpc/kernel/setup_64.c           |  5 +++++
 arch/powerpc/xmon/xmon.c                 |  1 +
 6 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 59134ebd7d49..38c5c0b33af9 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -563,6 +563,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
 	bl	hdlr;						\
 	b	ret
 
+/*
+ * Exception where stack is already set in r1, r1 is saved in r10, and it
+ * continues rather than returns.
+ */
+#define EXCEPTION_COMMON_NORET_STACK(area, trap, label, hdlr, additions) \
+	EXCEPTION_PROLOG_COMMON_1();				\
+	EXCEPTION_PROLOG_COMMON_2(area);			\
+	EXCEPTION_PROLOG_COMMON_3(trap);			\
+	/* Volatile regs are potentially clobbered here */	\
+	additions;						\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
+	bl	hdlr
+
 #define STD_EXCEPTION_COMMON(trap, label, hdlr)			\
 	EXCEPTION_COMMON(PACA_EXGEN, trap, label, hdlr,		\
 		ret_from_except, ADD_NVGPRS;ADD_RECONCILE)
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index b1197c1affac..1c09f8fe2ee8 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -185,7 +185,8 @@ struct paca_struct {
 	u64 exmc[13];		/* used for machine checks */
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
-	/* Exclusive emergency stack pointer for machine check exception. */
+	/* Exclusive stacks for system reset and machine check exception. */
+	void *nmi_emergency_sp;
 	void *mc_emergency_sp;
 
 	u16 in_nmi;			/* In nmi handler */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 1466e96d36cf..439c257dec4a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -234,6 +234,7 @@ int main(void)
 	OFFSET(PACAEMERGSP, paca_struct, emergency_sp);
 #ifdef CONFIG_PPC_BOOK3S_64
 	OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp);
+	OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp);
 	OFFSET(PACA_IN_MCE, paca_struct, in_mce);
 	OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
 #endif
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 4be62568fbc2..3840a7700285 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -144,10 +144,12 @@ EXC_COMMON_BEGIN(system_reset_common)
 	li	r10,MSR_RI
 	mtmsrd 	r10,1
 
-	EXCEPTION_COMMON(PACA_EXNMI, 0x100,
-			system_reset, system_reset_exception, 1f,
+	mr	r10,r1
+	ld	r1,PACA_NMI_EMERG_SP(r13)
+	subi	r1,r1,INT_FRAME_SIZE
+	EXCEPTION_COMMON_NORET_STACK(PACA_EXNMI, 0x100,
+			system_reset, system_reset_exception,
 			ADD_NVGPRS;ADD_RECONCILE)
-1: /* EXCEPTION_COMMON continues here */
 
 	/*
 	 * The stack is no longer in use, decrement in_nmi.
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 729e990a019d..0f7b15860a06 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -628,6 +628,11 @@ void __init emergency_stack_init(void)
 		paca[i].emergency_sp = (void *)ti + THREAD_SIZE;
 
 #ifdef CONFIG_PPC_BOOK3S_64
+		/* emergency stack for NMI exception handling. */
+		ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
+		klp_init_thread_info(ti);
+		paca[i].nmi_emergency_sp = (void *)ti + THREAD_SIZE;
+
 		/* emergency stack for machine check exception handling. */
 		ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
 		klp_init_thread_info(ti);
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 99348a2961b8..682506821bba 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2235,6 +2235,7 @@ static void dump_one_paca(int cpu)
 	DUMP(p, kernel_msr, "lx");
 	DUMP(p, emergency_sp, "p");
 #ifdef CONFIG_PPC_BOOK3S_64
+	DUMP(p, nmi_emergency_sp, "p");
 	DUMP(p, mc_emergency_sp, "p");
 	DUMP(p, in_nmi, "x");
 	DUMP(p, in_mce, "x");
-- 
cgit v1.2.1


From 2b4f3ac5642672595ef6f71d1572b32475e94863 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 20 Dec 2016 04:30:07 +1000
Subject: powerpc: Mark system reset as an NMI with nmi_enter/exit()

System reset is a non-maskable interrupt from Linux's point of view
(occurs under local_irq_disable()), so it should use nmi_enter/exit.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/traps.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 2393d3c46236..6a81a451d767 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -279,6 +279,14 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
 
 void system_reset_exception(struct pt_regs *regs)
 {
+	/*
+	 * Avoid crashes in case of nested NMI exceptions. Recoverability
+	 * is determined by RI and in_nmi
+	 */
+	bool nested = in_nmi();
+	if (!nested)
+		nmi_enter();
+
 	/* See if any machine dependent calls */
 	if (ppc_md.system_reset_exception) {
 		if (ppc_md.system_reset_exception(regs))
@@ -297,6 +305,9 @@ out:
 	if (!(regs->msr & MSR_RI))
 		panic("Unrecoverable System Reset");
 
+	if (!nested)
+		nmi_exit();
+
 	/* What should we do here? We could issue a shutdown or hard reset. */
 }
 
-- 
cgit v1.2.1


From ddd703ca06ede1b2d01ed1b0cb8d4315ab808099 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 20 Dec 2016 04:30:08 +1000
Subject: powerpc: Add NMI IPI infrastructure

Add a simple NMI IPI system that handles concurrency and reentrancy.

The platform does not have to implement a true non-maskable interrupt,
the default is to simply use the debugger break IPI message. This has
now been co-opted for a general IPI message, and users (debugger and
crash) have been reimplemented on top of the NMI system.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Incorporate incremental fixes from Nick]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig                    |   5 +
 arch/powerpc/include/asm/smp.h          |  14 +-
 arch/powerpc/kernel/smp.c               | 243 +++++++++++++++++++++++++++-----
 arch/powerpc/platforms/cell/interrupt.c |   2 +-
 arch/powerpc/platforms/ps3/smp.c        |   4 +-
 5 files changed, 226 insertions(+), 42 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7401e921aad0..48cc3d8432bb 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -80,6 +80,11 @@ config NR_IRQS
 	  /proc/interrupts. If you configure your system to have too few,
 	  drivers will fail to load or worse - handle with care.
 
+config NMI_IPI
+	bool
+	depends on SMP && (DEBUGGER || KEXEC_CORE)
+	default y
+
 config STACKTRACE_SUPPORT
 	bool
 	default y
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 751b2bd944fc..78c311ee0049 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -112,14 +112,22 @@ extern int cpu_to_core_id(int cpu);
  *
  * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
  * in /proc/interrupts will be wrong!!! --Troy */
-#define PPC_MSG_CALL_FUNCTION   0
-#define PPC_MSG_RESCHEDULE      1
+#define PPC_MSG_CALL_FUNCTION	0
+#define PPC_MSG_RESCHEDULE	1
 #define PPC_MSG_TICK_BROADCAST	2
-#define PPC_MSG_DEBUGGER_BREAK  3
+#define PPC_MSG_NMI_IPI		3
 
 /* This is only used by the powernv kernel */
 #define PPC_MSG_RM_HOST_ACTION	4
 
+#define NMI_IPI_ALL_OTHERS		-2
+
+#ifdef CONFIG_NMI_IPI
+extern int smp_handle_nmi_ipi(struct pt_regs *regs);
+#else
+static inline int smp_handle_nmi_ipi(struct pt_regs *regs) { return 0; }
+#endif
+
 /* for irq controllers that have dedicated ipis per message (4) */
 extern int smp_request_message_ipi(int virq, int message);
 extern const char *smp_ipi_name[];
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 2eca1e491e2b..7461b195b29e 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -87,8 +87,6 @@ volatile unsigned int cpu_callin_map[NR_CPUS];
 
 int smt_enabled_at_boot = 1;
 
-static void (*crash_ipi_function_ptr)(struct pt_regs *) = NULL;
-
 /*
  * Returns 1 if the specified cpu should be brought up during boot.
  * Used to inhibit booting threads if they've been disabled or
@@ -159,32 +157,33 @@ static irqreturn_t tick_broadcast_ipi_action(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static irqreturn_t debug_ipi_action(int irq, void *data)
+#ifdef CONFIG_NMI_IPI
+static irqreturn_t nmi_ipi_action(int irq, void *data)
 {
-	if (crash_ipi_function_ptr) {
-		crash_ipi_function_ptr(get_irq_regs());
-		return IRQ_HANDLED;
-	}
-
-#ifdef CONFIG_DEBUGGER
-	debugger_ipi(get_irq_regs());
-#endif /* CONFIG_DEBUGGER */
-
+	smp_handle_nmi_ipi(get_irq_regs());
 	return IRQ_HANDLED;
 }
+#endif
 
 static irq_handler_t smp_ipi_action[] = {
 	[PPC_MSG_CALL_FUNCTION] =  call_function_action,
 	[PPC_MSG_RESCHEDULE] = reschedule_action,
 	[PPC_MSG_TICK_BROADCAST] = tick_broadcast_ipi_action,
-	[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
+#ifdef CONFIG_NMI_IPI
+	[PPC_MSG_NMI_IPI] = nmi_ipi_action,
+#endif
 };
 
+/*
+ * The NMI IPI is a fallback and not truly non-maskable. It is simpler
+ * than going through the call function infrastructure, and strongly
+ * serialized, so it is more appropriate for debugging.
+ */
 const char *smp_ipi_name[] = {
 	[PPC_MSG_CALL_FUNCTION] =  "ipi call function",
 	[PPC_MSG_RESCHEDULE] = "ipi reschedule",
 	[PPC_MSG_TICK_BROADCAST] = "ipi tick-broadcast",
-	[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
+	[PPC_MSG_NMI_IPI] = "nmi ipi",
 };
 
 /* optional function to request ipi, for controllers with >= 4 ipis */
@@ -192,14 +191,13 @@ int smp_request_message_ipi(int virq, int msg)
 {
 	int err;
 
-	if (msg < 0 || msg > PPC_MSG_DEBUGGER_BREAK) {
+	if (msg < 0 || msg > PPC_MSG_NMI_IPI)
 		return -EINVAL;
-	}
-#if !defined(CONFIG_DEBUGGER) && !defined(CONFIG_KEXEC_CORE)
-	if (msg == PPC_MSG_DEBUGGER_BREAK) {
+#ifndef CONFIG_NMI_IPI
+	if (msg == PPC_MSG_NMI_IPI)
 		return 1;
-	}
 #endif
+
 	err = request_irq(virq, smp_ipi_action[msg],
 			  IRQF_PERCPU | IRQF_NO_THREAD | IRQF_NO_SUSPEND,
 			  smp_ipi_name[msg], NULL);
@@ -277,8 +275,10 @@ irqreturn_t smp_ipi_demux_relaxed(void)
 			scheduler_ipi();
 		if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST))
 			tick_broadcast_ipi_handler();
-		if (all & IPI_MESSAGE(PPC_MSG_DEBUGGER_BREAK))
-			debug_ipi_action(0, NULL);
+#ifdef CONFIG_NMI_IPI
+		if (all & IPI_MESSAGE(PPC_MSG_NMI_IPI))
+			nmi_ipi_action(0, NULL);
+#endif
 	} while (info->messages);
 
 	return IRQ_HANDLED;
@@ -315,6 +315,184 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 		do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
 }
 
+#ifdef CONFIG_NMI_IPI
+
+/*
+ * "NMI IPI" system.
+ *
+ * NMI IPIs may not be recoverable, so should not be used as ongoing part of
+ * a running system. They can be used for crash, debug, halt/reboot, etc.
+ *
+ * NMI IPIs are globally single threaded. No more than one in progress at
+ * any time.
+ *
+ * The IPI call waits with interrupts disabled until all targets enter the
+ * NMI handler, then the call returns.
+ *
+ * No new NMI can be initiated until targets exit the handler.
+ *
+ * The IPI call may time out without all targets entering the NMI handler.
+ * In that case, there is some logic to recover (and ignore subsequent
+ * NMI interrupts that may eventually be raised), but the platform interrupt
+ * handler may not be able to distinguish this from other exception causes,
+ * which may cause a crash.
+ */
+
+static atomic_t __nmi_ipi_lock = ATOMIC_INIT(0);
+static struct cpumask nmi_ipi_pending_mask;
+static int nmi_ipi_busy_count = 0;
+static void (*nmi_ipi_function)(struct pt_regs *) = NULL;
+
+static void nmi_ipi_lock_start(unsigned long *flags)
+{
+	raw_local_irq_save(*flags);
+	hard_irq_disable();
+	while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) {
+		raw_local_irq_restore(*flags);
+		cpu_relax();
+		raw_local_irq_save(*flags);
+		hard_irq_disable();
+	}
+}
+
+static void nmi_ipi_lock(void)
+{
+	while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1)
+		cpu_relax();
+}
+
+static void nmi_ipi_unlock(void)
+{
+	smp_mb();
+	WARN_ON(atomic_read(&__nmi_ipi_lock) != 1);
+	atomic_set(&__nmi_ipi_lock, 0);
+}
+
+static void nmi_ipi_unlock_end(unsigned long *flags)
+{
+	nmi_ipi_unlock();
+	raw_local_irq_restore(*flags);
+}
+
+/*
+ * Platform NMI handler calls this to ack
+ */
+int smp_handle_nmi_ipi(struct pt_regs *regs)
+{
+	void (*fn)(struct pt_regs *);
+	unsigned long flags;
+	int me = raw_smp_processor_id();
+	int ret = 0;
+
+	/*
+	 * Unexpected NMIs are possible here because the interrupt may not
+	 * be able to distinguish NMI IPIs from other types of NMIs, or
+	 * because the caller may have timed out.
+	 */
+	nmi_ipi_lock_start(&flags);
+	if (!nmi_ipi_busy_count)
+		goto out;
+	if (!cpumask_test_cpu(me, &nmi_ipi_pending_mask))
+		goto out;
+
+	fn = nmi_ipi_function;
+	if (!fn)
+		goto out;
+
+	cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
+	nmi_ipi_busy_count++;
+	nmi_ipi_unlock();
+
+	ret = 1;
+
+	fn(regs);
+
+	nmi_ipi_lock();
+	nmi_ipi_busy_count--;
+out:
+	nmi_ipi_unlock_end(&flags);
+
+	return ret;
+}
+
+static void do_smp_send_nmi_ipi(int cpu)
+{
+	if (cpu >= 0) {
+		do_message_pass(cpu, PPC_MSG_NMI_IPI);
+	} else {
+		int c;
+
+		for_each_online_cpu(c) {
+			if (c == raw_smp_processor_id())
+				continue;
+			do_message_pass(c, PPC_MSG_NMI_IPI);
+		}
+	}
+}
+
+/*
+ * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS.
+ * - fn is the target callback function.
+ * - delay_us > 0 is the delay before giving up waiting for targets to
+ *   enter the handler, == 0 specifies indefinite delay.
+ */
+static int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
+{
+	unsigned long flags;
+	int me = raw_smp_processor_id();
+	int ret = 1;
+
+	BUG_ON(cpu == me);
+	BUG_ON(cpu < 0 && cpu != NMI_IPI_ALL_OTHERS);
+
+	if (unlikely(!smp_ops))
+		return 0;
+
+	/* Take the nmi_ipi_busy count/lock with interrupts hard disabled */
+	nmi_ipi_lock_start(&flags);
+	while (nmi_ipi_busy_count) {
+		nmi_ipi_unlock_end(&flags);
+		cpu_relax();
+		nmi_ipi_lock_start(&flags);
+	}
+
+	nmi_ipi_function = fn;
+
+	if (cpu < 0) {
+		/* ALL_OTHERS */
+		cpumask_copy(&nmi_ipi_pending_mask, cpu_online_mask);
+		cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
+	} else {
+		/* cpumask starts clear */
+		cpumask_set_cpu(cpu, &nmi_ipi_pending_mask);
+	}
+	nmi_ipi_busy_count++;
+	nmi_ipi_unlock();
+
+	do_smp_send_nmi_ipi(cpu);
+
+	while (!cpumask_empty(&nmi_ipi_pending_mask)) {
+		udelay(1);
+		if (delay_us) {
+			delay_us--;
+			if (!delay_us)
+				break;
+		}
+	}
+
+	nmi_ipi_lock();
+	if (!cpumask_empty(&nmi_ipi_pending_mask)) {
+		/* Could not gather all CPUs */
+		ret = 0;
+		cpumask_clear(&nmi_ipi_pending_mask);
+	}
+	nmi_ipi_busy_count--;
+	nmi_ipi_unlock_end(&flags);
+
+	return ret;
+}
+#endif /* CONFIG_NMI_IPI */
+
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 void tick_broadcast(const struct cpumask *mask)
 {
@@ -325,29 +503,22 @@ void tick_broadcast(const struct cpumask *mask)
 }
 #endif
 
-#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)
-void smp_send_debugger_break(void)
+#ifdef CONFIG_DEBUGGER
+void debugger_ipi_callback(struct pt_regs *regs)
 {
-	int cpu;
-	int me = raw_smp_processor_id();
-
-	if (unlikely(!smp_ops))
-		return;
+	debugger_ipi(regs);
+}
 
-	for_each_online_cpu(cpu)
-		if (cpu != me)
-			do_message_pass(cpu, PPC_MSG_DEBUGGER_BREAK);
+void smp_send_debugger_break(void)
+{
+	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, debugger_ipi_callback, 1000000);
 }
 #endif
 
 #ifdef CONFIG_KEXEC_CORE
 void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
 {
-	crash_ipi_function_ptr = crash_ipi_callback;
-	if (crash_ipi_callback) {
-		mb();
-		smp_send_debugger_break();
-	}
+	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback, 1000000);
 }
 #endif
 
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index a6bbbaba14a3..871d38479a25 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -211,7 +211,7 @@ void iic_request_IPIs(void)
 	iic_request_ipi(PPC_MSG_CALL_FUNCTION);
 	iic_request_ipi(PPC_MSG_RESCHEDULE);
 	iic_request_ipi(PPC_MSG_TICK_BROADCAST);
-	iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
+	iic_request_ipi(PPC_MSG_NMI_IPI);
 }
 
 #endif /* CONFIG_SMP */
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index 60154d08debf..1d1ad5df106f 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -77,7 +77,7 @@ static void __init ps3_smp_probe(void)
 		BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION    != 0);
 		BUILD_BUG_ON(PPC_MSG_RESCHEDULE       != 1);
 		BUILD_BUG_ON(PPC_MSG_TICK_BROADCAST   != 2);
-		BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK   != 3);
+		BUILD_BUG_ON(PPC_MSG_NMI_IPI          != 3);
 
 		for (i = 0; i < MSG_COUNT; i++) {
 			result = ps3_event_receive_port_setup(cpu, &virqs[i]);
@@ -96,7 +96,7 @@ static void __init ps3_smp_probe(void)
 				ps3_register_ipi_irq(cpu, virqs[i]);
 		}
 
-		ps3_register_ipi_debug_brk(cpu, virqs[PPC_MSG_DEBUGGER_BREAK]);
+		ps3_register_ipi_debug_brk(cpu, virqs[PPC_MSG_NMI_IPI]);
 
 		DBG(" <- %s:%d: (%d)\n", __func__, __LINE__, cpu);
 	}
-- 
cgit v1.2.1


From c64af6458e2e2ddf86aff559837d3925fbf9cbb5 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 20 Dec 2016 04:30:09 +1000
Subject: powerpc: Add struct smp_ops_t.cause_nmi_ipi operation

Have the NMI IPI code use this op when the platform defines it.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/smp.h            | 1 +
 arch/powerpc/kernel/smp.c                 | 3 +++
 arch/powerpc/platforms/85xx/smp.c         | 1 +
 arch/powerpc/platforms/86xx/mpc86xx_smp.c | 1 +
 arch/powerpc/platforms/chrp/smp.c         | 1 +
 arch/powerpc/platforms/powermac/smp.c     | 1 +
 arch/powerpc/platforms/powernv/smp.c      | 1 +
 arch/powerpc/platforms/pseries/smp.c      | 1 +
 8 files changed, 10 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 78c311ee0049..ebddb2111d87 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -42,6 +42,7 @@ struct smp_ops_t {
 #ifdef CONFIG_PPC_SMP_MUXED_IPI
 	void  (*cause_ipi)(int cpu);
 #endif
+	int   (*cause_nmi_ipi)(int cpu);
 	void  (*probe)(void);
 	int   (*kick_cpu)(int nr);
 	int   (*prepare_cpu)(int nr);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 7461b195b29e..b37973f11ce0 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -417,6 +417,9 @@ out:
 
 static void do_smp_send_nmi_ipi(int cpu)
 {
+	if (smp_ops->cause_nmi_ipi && smp_ops->cause_nmi_ipi(cpu))
+		return;
+
 	if (cpu >= 0) {
 		do_message_pass(cpu, PPC_MSG_NMI_IPI);
 	} else {
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index 0975066f76e8..f51fd35f4618 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -344,6 +344,7 @@ done:
 }
 
 struct smp_ops_t smp_85xx_ops = {
+	.cause_nmi_ipi = NULL,
 	.kick_cpu = smp_85xx_kick_cpu,
 	.cpu_bootable = smp_generic_cpu_bootable,
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/powerpc/platforms/86xx/mpc86xx_smp.c b/arch/powerpc/platforms/86xx/mpc86xx_smp.c
index af09baee22cb..020e84a47a32 100644
--- a/arch/powerpc/platforms/86xx/mpc86xx_smp.c
+++ b/arch/powerpc/platforms/86xx/mpc86xx_smp.c
@@ -105,6 +105,7 @@ smp_86xx_setup_cpu(int cpu_nr)
 
 
 struct smp_ops_t smp_86xx_ops = {
+	.cause_nmi_ipi = NULL,
 	.message_pass = smp_mpic_message_pass,
 	.probe = smp_mpic_probe,
 	.kick_cpu = smp_86xx_kick_cpu,
diff --git a/arch/powerpc/platforms/chrp/smp.c b/arch/powerpc/platforms/chrp/smp.c
index b6c9a0dcc924..14515040f7cd 100644
--- a/arch/powerpc/platforms/chrp/smp.c
+++ b/arch/powerpc/platforms/chrp/smp.c
@@ -44,6 +44,7 @@ static void smp_chrp_setup_cpu(int cpu_nr)
 
 /* CHRP with openpic */
 struct smp_ops_t chrp_smp_ops = {
+	.cause_nmi_ipi = NULL,
 	.message_pass = smp_mpic_message_pass,
 	.probe = smp_mpic_probe,
 	.kick_cpu = smp_chrp_kick_cpu,
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index a3207cea360e..2cd99eb30762 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -447,6 +447,7 @@ void __init smp_psurge_give_timebase(void)
 struct smp_ops_t psurge_smp_ops = {
 	.message_pass	= NULL,	/* Use smp_muxed_ipi_message_pass */
 	.cause_ipi	= smp_psurge_cause_ipi,
+	.cause_nmi_ipi	= NULL,
 	.probe		= smp_psurge_probe,
 	.kick_cpu	= smp_psurge_kick_cpu,
 	.setup_cpu	= smp_psurge_setup_cpu,
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 5189445db164..4aff754b6f2c 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -302,6 +302,7 @@ static void __init pnv_smp_probe(void)
 static struct smp_ops_t pnv_smp_ops = {
 	.message_pass	= NULL, /* Use smp_muxed_ipi_message_pass */
 	.cause_ipi	= NULL,	/* Filled at runtime by pnv_smp_probe() */
+	.cause_nmi_ipi	= NULL,
 	.probe		= pnv_smp_probe,
 	.prepare_cpu	= pnv_smp_prepare_cpu,
 	.kick_cpu	= pnv_smp_kick_cpu,
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index 12ff5bef67fd..0925ac396b10 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -202,6 +202,7 @@ static __init void pSeries_smp_probe(void)
 static struct smp_ops_t pseries_smp_ops = {
 	.message_pass	= NULL,	/* Use smp_muxed_ipi_message_pass */
 	.cause_ipi	= NULL,	/* Filled at runtime by pSeries_smp_probe() */
+	.cause_nmi_ipi	= NULL,
 	.probe		= pSeries_smp_probe,
 	.kick_cpu	= smp_pSeries_kick_cpu,
 	.setup_cpu	= smp_setup_cpu,
-- 
cgit v1.2.1


From 102c05e8dc4f75a573437e2ce50b2559f3db8d5e Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 20 Dec 2016 04:30:10 +1000
Subject: powerpc/pseries: Implement NMI IPI with H_SIGNAL_SYS_RESET

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/ras.c |  4 ++++
 arch/powerpc/platforms/pseries/smp.c | 23 ++++++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 904a677208d1..bb70b26334f0 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -386,6 +386,10 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
 		}
 		fwnmi_release_errinfo();
 	}
+
+	if (smp_handle_nmi_ipi(regs))
+		return 1;
+
 	return 0; /* need to perform reset */
 }
 
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index 0925ac396b10..52ca6b311d44 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -189,6 +189,27 @@ static void smp_pseries_cause_ipi(int cpu)
 	icp_ops->cause_ipi(cpu);
 }
 
+static int pseries_cause_nmi_ipi(int cpu)
+{
+	int hwcpu;
+
+	if (cpu == NMI_IPI_ALL_OTHERS) {
+		hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
+	} else {
+		if (cpu < 0) {
+			WARN_ONCE(true, "incorrect cpu parameter %d", cpu);
+			return 0;
+		}
+
+		hwcpu = get_hard_smp_processor_id(cpu);
+	}
+
+	if (plapr_signal_sys_reset(hwcpu) == H_SUCCESS)
+		return 1;
+
+	return 0;
+}
+
 static __init void pSeries_smp_probe(void)
 {
 	xics_smp_probe();
@@ -202,7 +223,7 @@ static __init void pSeries_smp_probe(void)
 static struct smp_ops_t pseries_smp_ops = {
 	.message_pass	= NULL,	/* Use smp_muxed_ipi_message_pass */
 	.cause_ipi	= NULL,	/* Filled at runtime by pSeries_smp_probe() */
-	.cause_nmi_ipi	= NULL,
+	.cause_nmi_ipi	= pseries_cause_nmi_ipi,
 	.probe		= pSeries_smp_probe,
 	.kick_cpu	= smp_pSeries_kick_cpu,
 	.setup_cpu	= smp_setup_cpu,
-- 
cgit v1.2.1


From 1cd6ed7c2e35a716dcf1347c33f58a7ce7d1414f Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 20 Dec 2016 04:30:11 +1000
Subject: powerpc/xmon: Wait for secondaries before IPI'ing on system reset

An externally triggered system reset (e.g., via QEMU nmi command, or pseries
reset button) can cause system reset interrupts on all CPUs. In case this causes
xmon to be entered, it is undesirable for the primary (first) CPU into xmon to
trigger an NMI IPI to others, because this may cause a nested system reset
interrupt.

So spin for a time waiting for secondaries to join xmon before performing the
NMI IPI, similarly to what the crash dump code does.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Only do it when we come in from system reset, not via sysrq etc.]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/xmon/xmon.c | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 682506821bba..da5619847517 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -418,7 +418,22 @@ int cpus_are_in_xmon(void)
 {
 	return !cpumask_empty(&cpus_in_xmon);
 }
-#endif
+
+static bool wait_for_other_cpus(int ncpus)
+{
+	unsigned long timeout;
+
+	/* We wait for 2s, which is a metric "little while" */
+	for (timeout = 20000; timeout != 0; --timeout) {
+		if (cpumask_weight(&cpus_in_xmon) >= ncpus)
+			return true;
+		udelay(100);
+		barrier();
+	}
+
+	return false;
+}
+#endif /* CONFIG_SMP */
 
 static inline int unrecoverable_excp(struct pt_regs *regs)
 {
@@ -440,7 +455,6 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 #ifdef CONFIG_SMP
 	int cpu;
 	int secondary;
-	unsigned long timeout;
 #endif
 
 	local_irq_save(flags);
@@ -527,13 +541,17 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 		xmon_owner = cpu;
 		mb();
 		if (ncpus > 1) {
-			smp_send_debugger_break();
-			/* wait for other cpus to come in */
-			for (timeout = 100000000; timeout != 0; --timeout) {
-				if (cpumask_weight(&cpus_in_xmon) >= ncpus)
-					break;
-				barrier();
-			}
+			/*
+			 * A system reset (trap == 0x100) can be triggered on
+			 * all CPUs, so when we come in via 0x100 try waiting
+			 * for the other CPUs to come in before we send the
+			 * debugger break (IPI). This is similar to
+			 * crash_kexec_secondary().
+			 */
+			if (TRAP(regs) != 0x100 || !wait_for_other_cpus(ncpus))
+				smp_send_debugger_break();
+
+			wait_for_other_cpus(ncpus);
 		}
 		remove_bpts();
 		disable_surveillance();
-- 
cgit v1.2.1


From f188d0524d7e09f7ec801df9761dc1d898b1a1c0 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Sat, 26 Nov 2016 14:26:09 +1100
Subject: powerpc: Use the new post-link pass to check relocations

Currently powerpc has to introduce a dependency on its default build
target zImage in order to run a relocation check pass over the linked
vmlinux. This is deficient because the check is not run if the plain
vmlinux target is built, or if one of the other boot targets is built.

Switch to using the kbuild post-link pass, added in commit fbe6e37dab97
("kbuild: add arch specific post-link Makefile") in order to run this
check. In future powerpc will use this to do more complicated operations,
but initially using it for something simple is a good first step.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Makefile          | 11 -----------
 arch/powerpc/Makefile.postlink | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 11 deletions(-)
 create mode 100644 arch/powerpc/Makefile.postlink

(limited to 'arch')

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 19b0d1a81959..c09600bb2fe2 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -274,17 +274,6 @@ PHONY += $(BOOT_TARGETS1) $(BOOT_TARGETS2)
 
 boot := arch/$(ARCH)/boot
 
-ifeq ($(CONFIG_RELOCATABLE),y)
-quiet_cmd_relocs_check = CALL    $<
-      cmd_relocs_check = $(CONFIG_SHELL) $< "$(OBJDUMP)" "$(obj)/vmlinux"
-
-PHONY += relocs_check
-relocs_check: arch/powerpc/relocs_check.sh vmlinux
-	$(call cmd,relocs_check)
-
-zImage: relocs_check
-endif
-
 $(BOOT_TARGETS1): vmlinux
 	$(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
 $(BOOT_TARGETS2): vmlinux
diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink
new file mode 100644
index 000000000000..8c0170e45069
--- /dev/null
+++ b/arch/powerpc/Makefile.postlink
@@ -0,0 +1,34 @@
+# ===========================================================================
+# Post-link powerpc pass
+# ===========================================================================
+#
+# 1. Check that vmlinux relocations look sane
+
+PHONY := __archpost
+__archpost:
+
+include include/config/auto.conf
+include scripts/Kbuild.include
+
+quiet_cmd_relocs_check = CHKREL  $@
+      cmd_relocs_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/relocs_check.sh "$(OBJDUMP)" "$@"
+
+# `@true` prevents complaint when there is nothing to be done
+
+vmlinux: FORCE
+	@true
+ifdef CONFIG_RELOCATABLE
+	$(call if_changed,relocs_check)
+endif
+
+%.ko: FORCE
+	@true
+
+clean:
+	@true
+
+PHONY += FORCE clean
+
+FORCE:
+
+.PHONY: $(PHONY)
-- 
cgit v1.2.1


From b71c9ffb140556004caf7ba27083f9d90ae8d14b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Sat, 26 Nov 2016 14:26:10 +1100
Subject: powerpc: Add arch/powerpc/tools directory

Move a couple of existing scripts under there. Remove scripts directory:
a script is a tool, a tool is not a script.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Makefile                             |  2 +-
 arch/powerpc/Makefile.postlink                    |  2 +-
 arch/powerpc/relocs_check.sh                      | 61 -----------------------
 arch/powerpc/scripts/gcc-check-mprofile-kernel.sh | 23 ---------
 arch/powerpc/tools/gcc-check-mprofile-kernel.sh   | 23 +++++++++
 arch/powerpc/tools/relocs_check.sh                | 61 +++++++++++++++++++++++
 6 files changed, 86 insertions(+), 86 deletions(-)
 delete mode 100755 arch/powerpc/relocs_check.sh
 delete mode 100755 arch/powerpc/scripts/gcc-check-mprofile-kernel.sh
 create mode 100755 arch/powerpc/tools/gcc-check-mprofile-kernel.sh
 create mode 100755 arch/powerpc/tools/relocs_check.sh

(limited to 'arch')

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index c09600bb2fe2..3e0f0e1fadef 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -136,7 +136,7 @@ CFLAGS-$(CONFIG_GENERIC_CPU) += -mcpu=powerpc64
 endif
 
 ifdef CONFIG_MPROFILE_KERNEL
-    ifeq ($(shell $(srctree)/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh $(CC) -I$(srctree)/include -D__KERNEL__),OK)
+    ifeq ($(shell $(srctree)/arch/powerpc/tools/gcc-check-mprofile-kernel.sh $(CC) -I$(srctree)/include -D__KERNEL__),OK)
         CC_FLAGS_FTRACE := -pg -mprofile-kernel
         KBUILD_CPPFLAGS += -DCC_USING_MPROFILE_KERNEL
     else
diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink
index 8c0170e45069..3c22d64b2de9 100644
--- a/arch/powerpc/Makefile.postlink
+++ b/arch/powerpc/Makefile.postlink
@@ -11,7 +11,7 @@ include include/config/auto.conf
 include scripts/Kbuild.include
 
 quiet_cmd_relocs_check = CHKREL  $@
-      cmd_relocs_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/relocs_check.sh "$(OBJDUMP)" "$@"
+      cmd_relocs_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$@"
 
 # `@true` prevents complaint when there is nothing to be done
 
diff --git a/arch/powerpc/relocs_check.sh b/arch/powerpc/relocs_check.sh
deleted file mode 100755
index ec2d5c835170..000000000000
--- a/arch/powerpc/relocs_check.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/sh
-
-# Copyright © 2015 IBM Corporation
-
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version
-# 2 of the License, or (at your option) any later version.
-
-# This script checks the relocations of a vmlinux for "suspicious"
-# relocations.
-
-# based on relocs_check.pl
-# Copyright © 2009 IBM Corporation
-
-if [ $# -lt 2 ]; then
-	echo "$0 [path to objdump] [path to vmlinux]" 1>&2
-	exit 1
-fi
-
-# Have Kbuild supply the path to objdump so we handle cross compilation.
-objdump="$1"
-vmlinux="$2"
-
-bad_relocs=$(
-"$objdump" -R "$vmlinux" |
-	# Only look at relocation lines.
-	grep -E '\<R_' |
-	# These relocations are okay
-	# On PPC64:
-	#	R_PPC64_RELATIVE, R_PPC64_NONE
-	#	R_PPC64_ADDR64 mach_<name>
-	#	R_PPC64_ADDR64 __crc_<name>
-	# On PPC:
-	#	R_PPC_RELATIVE, R_PPC_ADDR16_HI,
-	#	R_PPC_ADDR16_HA,R_PPC_ADDR16_LO,
-	#	R_PPC_NONE
-	grep -F -w -v 'R_PPC64_RELATIVE
-R_PPC64_NONE
-R_PPC_ADDR16_LO
-R_PPC_ADDR16_HI
-R_PPC_ADDR16_HA
-R_PPC_RELATIVE
-R_PPC_NONE' |
-	grep -E -v '\<R_PPC64_ADDR64[[:space:]]+mach_' |
-	grep -E -v '\<R_PPC64_ADDR64[[:space:]]+__crc_'
-)
-
-if [ -z "$bad_relocs" ]; then
-	exit 0
-fi
-
-num_bad=$(echo "$bad_relocs" | wc -l)
-echo "WARNING: $num_bad bad relocations"
-echo "$bad_relocs"
-
-# If we see this type of relocation it's an idication that
-# we /may/ be using an old version of binutils.
-if echo "$bad_relocs" | grep -q -F -w R_PPC64_UADDR64; then
-	echo "WARNING: You need at least binutils >= 2.19 to build a CONFIG_RELOCATABLE kernel"
-fi
diff --git a/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh b/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh
deleted file mode 100755
index c658d8cf760b..000000000000
--- a/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-set -e
-set -o pipefail
-
-# To debug, uncomment the following line
-# set -x
-
-# Test whether the compile option -mprofile-kernel exists and generates
-# profiling code (ie. a call to _mcount()).
-echo "int func() { return 0; }" | \
-    $* -S -x c -O2 -p -mprofile-kernel - -o - 2> /dev/null | \
-    grep -q "_mcount"
-
-# Test whether the notrace attribute correctly suppresses calls to _mcount().
-
-echo -e "#include <linux/compiler.h>\nnotrace int func() { return 0; }" | \
-    $* -S -x c -O2 -p -mprofile-kernel - -o - 2> /dev/null | \
-    grep -q "_mcount" && \
-    exit 1
-
-echo "OK"
-exit 0
diff --git a/arch/powerpc/tools/gcc-check-mprofile-kernel.sh b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
new file mode 100755
index 000000000000..c658d8cf760b
--- /dev/null
+++ b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+
+# To debug, uncomment the following line
+# set -x
+
+# Test whether the compile option -mprofile-kernel exists and generates
+# profiling code (ie. a call to _mcount()).
+echo "int func() { return 0; }" | \
+    $* -S -x c -O2 -p -mprofile-kernel - -o - 2> /dev/null | \
+    grep -q "_mcount"
+
+# Test whether the notrace attribute correctly suppresses calls to _mcount().
+
+echo -e "#include <linux/compiler.h>\nnotrace int func() { return 0; }" | \
+    $* -S -x c -O2 -p -mprofile-kernel - -o - 2> /dev/null | \
+    grep -q "_mcount" && \
+    exit 1
+
+echo "OK"
+exit 0
diff --git a/arch/powerpc/tools/relocs_check.sh b/arch/powerpc/tools/relocs_check.sh
new file mode 100755
index 000000000000..ec2d5c835170
--- /dev/null
+++ b/arch/powerpc/tools/relocs_check.sh
@@ -0,0 +1,61 @@
+#!/bin/sh
+
+# Copyright © 2015 IBM Corporation
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version
+# 2 of the License, or (at your option) any later version.
+
+# This script checks the relocations of a vmlinux for "suspicious"
+# relocations.
+
+# based on relocs_check.pl
+# Copyright © 2009 IBM Corporation
+
+if [ $# -lt 2 ]; then
+	echo "$0 [path to objdump] [path to vmlinux]" 1>&2
+	exit 1
+fi
+
+# Have Kbuild supply the path to objdump so we handle cross compilation.
+objdump="$1"
+vmlinux="$2"
+
+bad_relocs=$(
+"$objdump" -R "$vmlinux" |
+	# Only look at relocation lines.
+	grep -E '\<R_' |
+	# These relocations are okay
+	# On PPC64:
+	#	R_PPC64_RELATIVE, R_PPC64_NONE
+	#	R_PPC64_ADDR64 mach_<name>
+	#	R_PPC64_ADDR64 __crc_<name>
+	# On PPC:
+	#	R_PPC_RELATIVE, R_PPC_ADDR16_HI,
+	#	R_PPC_ADDR16_HA,R_PPC_ADDR16_LO,
+	#	R_PPC_NONE
+	grep -F -w -v 'R_PPC64_RELATIVE
+R_PPC64_NONE
+R_PPC_ADDR16_LO
+R_PPC_ADDR16_HI
+R_PPC_ADDR16_HA
+R_PPC_RELATIVE
+R_PPC_NONE' |
+	grep -E -v '\<R_PPC64_ADDR64[[:space:]]+mach_' |
+	grep -E -v '\<R_PPC64_ADDR64[[:space:]]+__crc_'
+)
+
+if [ -z "$bad_relocs" ]; then
+	exit 0
+fi
+
+num_bad=$(echo "$bad_relocs" | wc -l)
+echo "WARNING: $num_bad bad relocations"
+echo "$bad_relocs"
+
+# If we see this type of relocation it's an idication that
+# we /may/ be using an old version of binutils.
+if echo "$bad_relocs" | grep -q -F -w R_PPC64_UADDR64; then
+	echo "WARNING: You need at least binutils >= 2.19 to build a CONFIG_RELOCATABLE kernel"
+fi
-- 
cgit v1.2.1


From 82eae1afbbdcaf2d716f88025736dc2d6f7afbf0 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Mon, 27 Mar 2017 19:27:37 +1100
Subject: powerpc/powernv: Check kzalloc() return value in pnv_pci_table_alloc

pnv_pci_table_alloc() ignores possible failure from kzalloc_node(),
this adds a check. There are 2 callers of pnv_pci_table_alloc(),
one already checks for tbl!=NULL, this adds WARN_ON() to the other path
which only happens during boot time in IODA1 and not expected to fail.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 3 +++
 arch/powerpc/platforms/powernv/pci.c      | 3 +++
 2 files changed, 6 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 93ce3f9d2758..bc5466d447e7 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2151,6 +2151,9 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
 
 found:
 	tbl = pnv_pci_table_alloc(phb->hose->node);
+	if (WARN_ON(!tbl))
+		return;
+
 	iommu_register_group(&pe->table_group, phb->hose->global_number,
 			pe->pe_number);
 	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 9b2bdcad51ba..935ccb249a8a 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -766,6 +766,9 @@ struct iommu_table *pnv_pci_table_alloc(int nid)
 	struct iommu_table *tbl;
 
 	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid);
+	if (!tbl)
+		return NULL;
+
 	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
 	kref_init(&tbl->it_kref);
 
-- 
cgit v1.2.1


From e49a6a2173346d316c0e65d054a3cda89d57cb53 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 13 Apr 2017 17:05:27 +1000
Subject: powerpc/powernv: Fix iommu table size calculation hook for small
 tables

When the userspace requests a small TCE table (which takes less than
the system page size) and more than 1 TCE level, the existing code
returns a single page size which is a bug as each additional TCE level
requires at least one page and this is what
pnv_pci_ioda2_table_alloc_pages() does. And we end up seeing
WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size))
in drivers/vfio/vfio_iommu_spapr_tce.c.

This replaces incorrect _ALIGN_UP() (which aligns zero up to zero) with
max_t() to fix the bug.

Besides removing WARN_ON(), there should be no other changes in
behaviour.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index bc5466d447e7..16fd322b2eea 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2440,7 +2440,8 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
 
 		tce_table_size /= direct_table_size;
 		tce_table_size <<= 3;
-		tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size);
+		tce_table_size = max_t(unsigned long,
+				tce_table_size, direct_table_size);
 	}
 
 	return bytes;
-- 
cgit v1.2.1


From b6e1f6adce8068620f728f717f90f4899b5ac83f Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 24 Mar 2017 17:37:21 +1100
Subject: powerpc/pseries: Enable VFIO

This enables VFIO on pseries host in order to allow VFIO in nested guest under
PR KVM or DPDK in a HV guest. This adds support of the VFIO_SPAPR_TCE_IOMMU
type.

This adds exchange() callback to allow TCE updates by the SPAPR TCE IOMMU
driver in VFIO.

This initializes DMA32 window parameters in iommu_table_group as as this does
not implement VFIO_SPAPR_TCE_v2_IOMMU and VFIO_SPAPR_TCE_IOMMU just reuses the
existing DMA32 window.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/iommu.c | 40 ++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 7ce5db209abf..8374adee27e3 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -551,6 +551,7 @@ static void iommu_table_setparms(struct pci_controller *phb,
 static void iommu_table_setparms_lpar(struct pci_controller *phb,
 				      struct device_node *dn,
 				      struct iommu_table *tbl,
+				      struct iommu_table_group *table_group,
 				      const __be32 *dma_window)
 {
 	unsigned long offset, size;
@@ -564,6 +565,9 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb,
 	tbl->it_type = TCE_PCI;
 	tbl->it_offset = offset >> tbl->it_page_shift;
 	tbl->it_size = size >> tbl->it_page_shift;
+
+	table_group->tce32_start = offset;
+	table_group->tce32_size = size;
 }
 
 struct iommu_table_ops iommu_table_pseries_ops = {
@@ -652,8 +656,38 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 	pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
 }
 
+#ifdef CONFIG_IOMMU_API
+static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
+				long *tce, enum dma_data_direction *direction)
+{
+	long rc;
+	unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
+	unsigned long flags, oldtce = 0;
+	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
+	unsigned long newtce = *tce | proto_tce;
+
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+
+	rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);
+	if (!rc)
+		rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);
+
+	if (!rc) {
+		*direction = iommu_tce_direction(oldtce);
+		*tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+	}
+
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
+
+	return rc;
+}
+#endif
+
 struct iommu_table_ops iommu_table_lpar_multi_ops = {
 	.set = tce_buildmulti_pSeriesLP,
+#ifdef CONFIG_IOMMU_API
+	.exchange = tce_exchange_pseries,
+#endif
 	.clear = tce_freemulti_pSeriesLP,
 	.get = tce_get_pSeriesLP
 };
@@ -690,7 +724,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 	if (!ppci->table_group) {
 		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
 		tbl = ppci->table_group->tables[0];
-		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
+		iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
+				ppci->table_group, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
 		iommu_init_table(tbl, ppci->phb->node);
 		iommu_register_group(ppci->table_group,
@@ -1144,7 +1179,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 	if (!pci->table_group) {
 		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
 		tbl = pci->table_group->tables[0];
-		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
+		iommu_table_setparms_lpar(pci->phb, pdn, tbl,
+				pci->table_group, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
 		iommu_init_table(tbl, pci->phb->node);
 		iommu_register_group(pci->table_group,
-- 
cgit v1.2.1


From add2e1e585875b42ac745c05bf0e7f029ee31ea2 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Fri, 28 Apr 2017 21:57:22 +1000
Subject: powerpc/mm/hash: Fix off-by-one in comment about kernel contexts ids
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Michal Suchánek noticed a comment in book3s/64/mmu-hash.h about the context ids
we use for the kernel was inconsistent with the code and other comments in the
same file.

It should read 1-4 not 1-5.

While we're touching it, update "address" to "addresses" which makes more sense
as it's referring to more than one address below.

Reported-by: Michal Suchánek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 6d56974adf28..6981a52b3887 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -494,7 +494,7 @@ extern void slb_set_size(u16 size);
  *
  * For user processes max context id is limited to MAX_USER_CONTEXT.
 
- * For kernel space, we use context ids 1-5 to map address as below:
+ * For kernel space, we use context ids 1-4 to map addresses as below:
  * NOTE: each context only support 64TB now.
  * 0x00001 -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
  * 0x00002 -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
-- 
cgit v1.2.1


From 8915bcd68b2547c819b8a2a33c098121af9accb2 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Thu, 16 Mar 2017 14:04:40 +1100
Subject: powerpc/xmon: Teach xmon oops about radix vectors

Currently if we take an oops caused by an 0x380 or 0x480 exception, we get a
print which assumes SLB problems. With radix, these vectors have different
meanings.

This patch updates the oops message to reflect these different meanings.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/xmon/xmon.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index da5619847517..f11f65634aab 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1369,9 +1369,19 @@ const char *getvecname(unsigned long vec)
 	case 0x100:	ret = "(System Reset)"; break;
 	case 0x200:	ret = "(Machine Check)"; break;
 	case 0x300:	ret = "(Data Access)"; break;
-	case 0x380:	ret = "(Data SLB Access)"; break;
+	case 0x380:
+		if (radix_enabled())
+			ret = "(Data Access Out of Range)";
+		else
+			ret = "(Data SLB Access)";
+		break;
 	case 0x400:	ret = "(Instruction Access)"; break;
-	case 0x480:	ret = "(Instruction SLB Access)"; break;
+	case 0x480:
+		if (radix_enabled())
+			ret = "(Instruction Access Out of Range)";
+		else
+			ret = "(Instruction SLB Access)";
+		break;
 	case 0x500:	ret = "(Hardware Interrupt)"; break;
 	case 0x600:	ret = "(Alignment)"; break;
 	case 0x700:	ret = "(Program Check)"; break;
-- 
cgit v1.2.1


From 084a275e4c9477c432b05e872c3a297eab41638a Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 19 Oct 2016 14:16:00 +1100
Subject: powerpc/64: Allow CONFIG_RELOCATABLE if COMPILE_TEST

This was a hack we added to work around the allmodconfig build breaking, see
commit fb43e8477ed9 ("powerpc: Disable RELOCATABLE for COMPILE_TEST with
PPC64").

Since we merged the thin archives support in commit 43c9127d94d6 ("powerpc: Add
option to use thin archives") this hasn't been necessary, so remove it.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 48cc3d8432bb..67ee6731f4e9 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -540,7 +540,7 @@ config KEXEC_FILE
 
 config RELOCATABLE
 	bool "Build a relocatable kernel"
-	depends on (PPC64 && !COMPILE_TEST) || (FLATMEM && (44x || FSL_BOOKE))
+	depends on PPC64 || (FLATMEM && (44x || FSL_BOOKE))
 	select NONSTATIC_KERNEL
 	select MODULE_REL_CRCS if MODVERSIONS
 	help
@@ -574,7 +574,7 @@ config RELOCATABLE_TEST
 config CRASH_DUMP
 	bool "Build a kdump crash kernel"
 	depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP)
-	select RELOCATABLE if (PPC64 && !COMPILE_TEST) || 44x || FSL_BOOKE
+	select RELOCATABLE if PPC64 || 44x || FSL_BOOKE
 	help
 	  Build a kernel suitable for use as a kdump capture kernel.
 	  The same kernel binary can be used as production kernel and dump
-- 
cgit v1.2.1


From daeba2956f32f91f3493788ff6ee02fb1b2f02fa Mon Sep 17 00:00:00 2001
From: Russell Currey <ruscur@russell.cc>
Date: Wed, 19 Apr 2017 17:39:26 +1000
Subject: powerpc/eeh: Avoid use after free in eeh_handle_special_event()

eeh_handle_special_event() is called when an EEH event is detected but
can't be narrowed down to a specific PE.  This function looks through
every PE to find one in an erroneous state, then calls the regular event
handler eeh_handle_normal_event() once it knows which PE has an error.

However, if eeh_handle_normal_event() found that the PE cannot possibly
be recovered, it will free it, rendering the passed PE stale.
This leads to a use after free in eeh_handle_special_event() as it attempts to
clear the "recovering" state on the PE after eeh_handle_normal_event() returns.

Thus, make sure the PE is valid when attempting to clear state in
eeh_handle_special_event().

Fixes: 8a6b1bc70dbb ("powerpc/eeh: EEH core to handle special event")
Cc: stable@vger.kernel.org # v3.11+
Reported-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Russell Currey <ruscur@russell.cc>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/eeh_driver.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index b94887165a10..e50d1470714f 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -724,7 +724,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
  */
 #define MAX_WAIT_FOR_RECOVERY 300
 
-static void eeh_handle_normal_event(struct eeh_pe *pe)
+static bool eeh_handle_normal_event(struct eeh_pe *pe)
 {
 	struct pci_bus *frozen_bus;
 	struct eeh_dev *edev, *tmp;
@@ -736,7 +736,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 	if (!frozen_bus) {
 		pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
 			__func__, pe->phb->global_number, pe->addr);
-		return;
+		return false;
 	}
 
 	eeh_pe_update_time_stamp(pe);
@@ -870,7 +870,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 	pr_info("EEH: Notify device driver to resume\n");
 	eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
 
-	return;
+	return false;
 
 excess_failures:
 	/*
@@ -915,8 +915,12 @@ perm_error:
 			pci_lock_rescan_remove();
 			pci_hp_remove_devices(frozen_bus);
 			pci_unlock_rescan_remove();
+
+			/* The passed PE should no longer be used */
+			return true;
 		}
 	}
+	return false;
 }
 
 static void eeh_handle_special_event(void)
@@ -982,7 +986,14 @@ static void eeh_handle_special_event(void)
 		 */
 		if (rc == EEH_NEXT_ERR_FROZEN_PE ||
 		    rc == EEH_NEXT_ERR_FENCED_PHB) {
-			eeh_handle_normal_event(pe);
+			/*
+			 * eeh_handle_normal_event() can make the PE stale if it
+			 * determines that the PE cannot possibly be recovered.
+			 * Don't modify the PE state if that's the case.
+			 */
+			if (eeh_handle_normal_event(pe))
+				continue;
+
 			eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
 		} else {
 			pci_lock_rescan_remove();
-- 
cgit v1.2.1


From c0b64978f09195e00d6649ca0ad0242a80e6e99a Mon Sep 17 00:00:00 2001
From: Russell Currey <ruscur@russell.cc>
Date: Wed, 19 Apr 2017 17:39:27 +1000
Subject: powerpc/eeh: Clean up and document event handling functions

Remove unnecessary tags in eeh_handle_normal_event(), and add function
comments for eeh_handle_normal_event() and eeh_handle_special_event().

The only functional difference is that in the case of a PE reaching the
maximum number of failures, rather than one message telling you of this
and suggesting you reseat the device, there are two separate messages.

Suggested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Russell Currey <ruscur@russell.cc>
Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/eeh_driver.c | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index e50d1470714f..c405c79e50cd 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -724,6 +724,15 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
  */
 #define MAX_WAIT_FOR_RECOVERY 300
 
+/**
+ * eeh_handle_normal_event - Handle EEH events on a specific PE
+ * @pe: EEH PE
+ *
+ * Attempts to recover the given PE.  If recovery fails or the PE has failed
+ * too many times, remove the PE.
+ *
+ * Returns true if @pe should no longer be used, else false.
+ */
 static bool eeh_handle_normal_event(struct eeh_pe *pe)
 {
 	struct pci_bus *frozen_bus;
@@ -741,8 +750,13 @@ static bool eeh_handle_normal_event(struct eeh_pe *pe)
 
 	eeh_pe_update_time_stamp(pe);
 	pe->freeze_count++;
-	if (pe->freeze_count > eeh_max_freezes)
-		goto excess_failures;
+	if (pe->freeze_count > eeh_max_freezes) {
+		pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n"
+		       "last hour and has been permanently disabled.\n",
+		       pe->phb->global_number, pe->addr,
+		       pe->freeze_count);
+		goto hard_fail;
+	}
 	pr_warn("EEH: This PCI device has failed %d times in the last hour\n",
 		pe->freeze_count);
 
@@ -872,25 +886,16 @@ static bool eeh_handle_normal_event(struct eeh_pe *pe)
 
 	return false;
 
-excess_failures:
+hard_fail:
 	/*
 	 * About 90% of all real-life EEH failures in the field
 	 * are due to poorly seated PCI cards. Only 10% or so are
 	 * due to actual, failed cards.
 	 */
-	pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n"
-	       "last hour and has been permanently disabled.\n"
-	       "Please try reseating or replacing it.\n",
-		pe->phb->global_number, pe->addr,
-		pe->freeze_count);
-	goto perm_error;
-
-hard_fail:
 	pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
 	       "Please try reseating or replacing it\n",
 		pe->phb->global_number, pe->addr);
 
-perm_error:
 	eeh_slot_error_detail(pe, EEH_LOG_PERM);
 
 	/* Notify all devices that they're about to go down. */
@@ -923,6 +928,13 @@ perm_error:
 	return false;
 }
 
+/**
+ * eeh_handle_special_event - Handle EEH events without a specific failing PE
+ *
+ * Called when an EEH event is detected but can't be narrowed down to a
+ * specific PE.  Iterates through possible failures and handles them as
+ * necessary.
+ */
 static void eeh_handle_special_event(void)
 {
 	struct eeh_pe *pe, *phb_pe;
-- 
cgit v1.2.1


From b7da1230532984fc00962b2c8be94acc87655707 Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Date: Fri, 22 Jul 2016 17:16:35 +1000
Subject: powerpc/powernv: Document cxl dependency on special case in
 pnv_eeh_reset()

pnv_eeh_reset() has special handling for PEs whose primary bus is the
root bus or the bus immediately underneath the root port.

The cxl bi-modal card support added in b0b5e5918ad1 ("cxl: Add
cxl_check_and_switch_mode() API to switch bi-modal cards") relies on
this behaviour when hot-resetting the CAPI adapter following a mode
switch.  Document this in pnv_eeh_reset() so we don't accidentally break
it.

Suggested-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 6fb5522acd70..d2f19821d71d 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1102,6 +1102,13 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
 		return -EIO;
 	}
 
+	/*
+	 * If dealing with the root bus (or the bus underneath the
+	 * root port), we reset the bus underneath the root port.
+	 *
+	 * The cxl driver depends on this behaviour for bi-modal card
+	 * switching.
+	 */
 	if (pci_is_root_bus(bus) ||
 	    pci_is_root_bus(bus->parent))
 		return pnv_eeh_root_reset(hose, option);
-- 
cgit v1.2.1


From 0cc68bfad45b47167c9d99cf560b505983ebd012 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Fri, 17 Feb 2017 17:31:22 +1100
Subject: powerpc/mpc52xx: Don't select user-visible RTAS_PROC

Otherwise we might select it when its dependenices aren't enabled,
leading to a build break.

It's default y anyway, so will be on unless someone disables it
manually.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/52xx/Kconfig | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/52xx/Kconfig b/arch/powerpc/platforms/52xx/Kconfig
index b625a2c6f4f2..e4c745981912 100644
--- a/arch/powerpc/platforms/52xx/Kconfig
+++ b/arch/powerpc/platforms/52xx/Kconfig
@@ -33,7 +33,6 @@ config PPC_EFIKA
 	bool "bPlan Efika 5k2. MPC5200B based computer"
 	depends on PPC_MPC52xx
 	select PPC_RTAS
-	select RTAS_PROC
 	select PPC_NATIVE
 
 config PPC_LITE5200
-- 
cgit v1.2.1


From 687b8f24f14db842c8c3f8cb8b24c9a29b691db8 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 15 Feb 2017 20:49:54 +1100
Subject: powerpc/smp: Document irq enable/disable after migrating IRQs

This code was until recently completely undocumented and even now the comment is
not very verbose.

We've already had one patch sent to remove the IRQ enable/disable because it's
"paradoxical and unnecessary". So document it thoroughly to save anyone else
from puzzling over it.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/smp.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index b37973f11ce0..1eef9e73e6a3 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -615,7 +615,14 @@ int generic_cpu_disable(void)
 	/* Update affinity of all IRQs previously aimed at this CPU */
 	irq_migrate_all_off_this_cpu();
 
-	/* Give the CPU time to drain in-flight ones */
+	/*
+	 * Depending on the details of the interrupt controller, it's possible
+	 * that one of the interrupts we just migrated away from this CPU is
+	 * actually already pending on this CPU. If we leave it in that state
+	 * the interrupt will never be EOI'ed, and will never fire again. So
+	 * temporarily enable interrupts here, to allow any pending interrupt to
+	 * be received (and EOI'ed), before we take this CPU offline.
+	 */
 	local_irq_enable();
 	mdelay(1);
 	local_irq_disable();
-- 
cgit v1.2.1


From 3f2290e1b5192fdfa74f012220a8d90067beb076 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 24 Apr 2017 20:46:59 +1000
Subject: powerpc/sysfs: Move #ifdef CONFIG_HOTPLUG_CPU out of the function
 body

The entire body of unregister_cpu_online() is inside an #ifdef
CONFIG_HOTPLUG_CPU block. This is ugly and means we create an empty function
when hotplug is disabled for no reason.

Instead move the #ifdef out of the function body and define the function to be
NULL in the else case. This means we'll pass NULL to cpuhp_setup_state(), but
that's fine because it accepts NULL to mean there is no teardown callback, which
is exactly what we want.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/sysfs.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 949957b97ead..4437c70c7c2b 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -789,9 +789,9 @@ static int register_cpu_online(unsigned int cpu)
 	return 0;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
 static int unregister_cpu_online(unsigned int cpu)
 {
-#ifdef CONFIG_HOTPLUG_CPU
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
 	struct device *s = &c->dev;
 	struct device_attribute *attrs, *pmc_attrs;
@@ -870,9 +870,11 @@ static int unregister_cpu_online(unsigned int cpu)
 	cacheinfo_cpu_offline(cpu);
 	of_node_put(s->of_node);
 	s->of_node = NULL;
-#endif /* CONFIG_HOTPLUG_CPU */
 	return 0;
 }
+#else /* !CONFIG_HOTPLUG_CPU */
+#define unregister_cpu_online NULL
+#endif
 
 #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
 ssize_t arch_cpu_probe(const char *buf, size_t count)
-- 
cgit v1.2.1


From d93b0ac01a9ce276ec39644be47001873d3d183c Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Tue, 18 Apr 2017 22:08:17 +0530
Subject: powerpc/book3s/mce: Move add_taint() later in virtual mode

machine_check_early() gets called in real mode. The very first time when
add_taint() is called, it prints a warning which ends up calling opal
call (that uses OPAL_CALL wrapper) for writing it to console. If we get a
very first machine check while we are in opal we are doomed. OPAL_CALL
overwrites the PACASAVEDMSR in r13 and in this case when we are done with
MCE handling the original opal call will use this new MSR on it's way
back to opal_return. This usually leads to unexpected behaviour or the
kernel to panic. Instead move the add_taint() call later in the virtual
mode where it is safe to call.

This is broken with current FW level. We got lucky so far for not getting
very first MCE hit while in OPAL. But easily reproducible on Mambo.

Fixes: 27ea2c420cad ("powerpc: Set the correct kernel taint on machine check errors.")
Cc: stable@vger.kernel.org # v4.2+
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/mce.c   | 2 ++
 arch/powerpc/kernel/traps.c | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 16eb0b508761..5f9eada3519b 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -221,6 +221,8 @@ static void machine_check_process_queued_event(struct irq_work *work)
 {
 	int index;
 
+	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
 	/*
 	 * For now just print it to console.
 	 * TODO: log this error event to FSP or nvram.
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 6a81a451d767..d4e545d27ef9 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -323,8 +323,6 @@ long machine_check_early(struct pt_regs *regs)
 
 	__this_cpu_inc(irq_stat.mce_exceptions);
 
-	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
 	if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
 		handled = cur_cpu_spec->machine_check_early(regs);
 	return handled;
@@ -758,6 +756,8 @@ void machine_check_exception(struct pt_regs *regs)
 
 	__this_cpu_inc(irq_stat.mce_exceptions);
 
+	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
 	/* See if any machine dependent calls. In theory, we would want
 	 * to call the CPU first, and call the ppc_md. one if the CPU
 	 * one returns a positive number. However there is existing code
-- 
cgit v1.2.1


From 3c9ac2bcc35453141f82461c71ed109ded152a6a Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 2 May 2017 21:00:14 +1000
Subject: powerpc/mm/radix: Drop support for CPUs without lockless tlbie

Currently the radix TLB code includes support for CPUs that do *not*
have MMU_FTR_LOCKLESS_TLBIE. On those CPUs we are required to take a
global spinlock before issuing a tlbie.

Radix can only be built for 64-bit Book3s CPUs, and of those, only
POWER4, 970, Cell and PA6T do not have MMU_FTR_LOCKLESS_TLBIE. Although
it's possible to build a kernel with Radix support that can also boot on
those CPUs, we happen to know that in reality none of those CPUs support
the Radix MMU, so the code can never actually run on those CPUs.

So remove the native_tlbie_lock in the Radix TLB code.

Note that there is another lock of the same name in the hash code, which
is unaffected by this patch.

Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/tlb-radix.c | 45 +++++++--------------------------------------
 1 file changed, 7 insertions(+), 38 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 5e17c4e873a5..02e71402fdd3 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -17,7 +17,6 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
 
 #define RIC_FLUSH_TLB 0
 #define RIC_FLUSH_PWC 1
@@ -203,15 +202,9 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto no_context;
 
-	if (!mm_is_thread_local(mm)) {
-		int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
-		if (lock_tlbie)
-			raw_spin_lock(&native_tlbie_lock);
+	if (!mm_is_thread_local(mm))
 		_tlbie_pid(pid, RIC_FLUSH_ALL);
-		if (lock_tlbie)
-			raw_spin_unlock(&native_tlbie_lock);
-	} else
+	else
 		_tlbiel_pid(pid, RIC_FLUSH_ALL);
 no_context:
 	preempt_enable();
@@ -235,15 +228,9 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto no_context;
 
-	if (!mm_is_thread_local(mm)) {
-		int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
-		if (lock_tlbie)
-			raw_spin_lock(&native_tlbie_lock);
+	if (!mm_is_thread_local(mm))
 		_tlbie_pid(pid, RIC_FLUSH_PWC);
-		if (lock_tlbie)
-			raw_spin_unlock(&native_tlbie_lock);
-	} else
+	else
 		tlbiel_pwc(pid);
 no_context:
 	preempt_enable();
@@ -260,15 +247,9 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 	pid = mm ? mm->context.id : 0;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto bail;
-	if (!mm_is_thread_local(mm)) {
-		int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
-		if (lock_tlbie)
-			raw_spin_lock(&native_tlbie_lock);
+	if (!mm_is_thread_local(mm))
 		_tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
-		if (lock_tlbie)
-			raw_spin_unlock(&native_tlbie_lock);
-	} else
+	else
 		_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
 bail:
 	preempt_enable();
@@ -289,13 +270,7 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
 
 void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
-	if (lock_tlbie)
-		raw_spin_lock(&native_tlbie_lock);
 	_tlbie_pid(0, RIC_FLUSH_ALL);
-	if (lock_tlbie)
-		raw_spin_unlock(&native_tlbie_lock);
 }
 EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
 
@@ -357,7 +332,6 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 	unsigned long addr;
 	int local = mm_is_thread_local(mm);
 	unsigned long ap = mmu_get_ap(psize);
-	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
 	unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
 
 
@@ -378,13 +352,8 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 
 		if (local)
 			_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-		else {
-			if (lock_tlbie)
-				raw_spin_lock(&native_tlbie_lock);
+		else
 			_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
-			if (lock_tlbie)
-				raw_spin_unlock(&native_tlbie_lock);
-		}
 	}
 err_out:
 	preempt_enable();
-- 
cgit v1.2.1


From 6b3d12a948d27977816a15eb48409a298902a548 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Wed, 3 May 2017 13:24:08 +1000
Subject: powerpc/powernv: Fix TCE kill on NVLink2

Commit 616badd2fb49 ("powerpc/powernv: Use OPAL call for TCE kill on
NVLink2") forced all TCE kills to go via the OPAL call for
NVLink2. However the PHB3 implementation of TCE kill was still being
called directly from some functions which in some circumstances caused
a machine check.

This patch adds an equivalent IODA2 version of the function which uses
the correct invalidation method depending on PHB model and changes all
external callers to use it instead.

Fixes: 616badd2fb49 ("powerpc/powernv: Use OPAL call for TCE kill on NVLink2")
Cc: stable@vger.kernel.org # v4.11+
Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/npu-dma.c  |  8 ++++----
 arch/powerpc/platforms/powernv/pci-ioda.c | 10 +++++++++-
 arch/powerpc/platforms/powernv/pci.h      |  2 +-
 3 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 4c88c3e6ec9e..067defeea691 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -203,7 +203,7 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
 		pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
 		return rc;
 	}
-	pnv_pci_phb3_tce_invalidate_entire(phb, false);
+	pnv_pci_ioda2_tce_invalidate_entire(phb, false);
 
 	/* Add the table to the list so its TCE cache will get invalidated */
 	pnv_pci_link_table_and_group(phb->hose->node, num,
@@ -227,7 +227,7 @@ long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
 		pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
 		return rc;
 	}
-	pnv_pci_phb3_tce_invalidate_entire(phb, false);
+	pnv_pci_ioda2_tce_invalidate_entire(phb, false);
 
 	pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
 			&npe->table_group);
@@ -293,7 +293,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
 			0 /* bypass base */, top);
 
 	if (rc == OPAL_SUCCESS)
-		pnv_pci_phb3_tce_invalidate_entire(phb, false);
+		pnv_pci_ioda2_tce_invalidate_entire(phb, false);
 
 	return rc;
 }
@@ -357,7 +357,7 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
 		pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
 		return;
 	}
-	pnv_pci_phb3_tce_invalidate_entire(npe->phb, false);
+	pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
 }
 
 struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 16fd322b2eea..6fdbd383f676 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1895,7 +1895,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 #define PHB3_TCE_KILL_INVAL_PE		PPC_BIT(1)
 #define PHB3_TCE_KILL_INVAL_ONE		PPC_BIT(2)
 
-void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
 {
 	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
 	const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
@@ -1991,6 +1991,14 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 	}
 }
 
+void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+{
+	if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
+		pnv_pci_phb3_tce_invalidate_entire(phb, rm);
+	else
+		opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
+}
+
 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
 		long npages, unsigned long uaddr,
 		enum dma_data_direction direction,
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 4eab713136d1..18c8a2fa03b8 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -242,7 +242,7 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 
 /* Nvlink functions */
 extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
-extern void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
+extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
 extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
 extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
 		struct iommu_table *tbl);
-- 
cgit v1.2.1


From 700b7eadd5625d22b8235fb21259b3d7d564c000 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 19 Apr 2017 05:12:17 +1000
Subject: powerpc/64s: Power9 has no LPCR[VRMASD] field so don't set it

Power9/ISAv3 has no VRMASD field in LPCR, we shouldn't be setting reserved bits,
so don't set them on Power9.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/cpu_setup_power.S | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 1fce4ddd2e6c..10cb2896b2ae 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -30,7 +30,7 @@ _GLOBAL(__setup_cpu_power7)
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
 	li	r4,(LPCR_LPES1 >> LPCR_LPES_SH)
-	bl	__init_LPCR
+	bl	__init_LPCR_ISA206
 	bl	__init_tlb_power7
 	mtlr	r11
 	blr
@@ -44,7 +44,7 @@ _GLOBAL(__restore_cpu_power7)
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
 	li	r4,(LPCR_LPES1 >> LPCR_LPES_SH)
-	bl	__init_LPCR
+	bl	__init_LPCR_ISA206
 	bl	__init_tlb_power7
 	mtlr	r11
 	blr
@@ -62,7 +62,7 @@ _GLOBAL(__setup_cpu_power8)
 	mfspr	r3,SPRN_LPCR
 	ori	r3, r3, LPCR_PECEDH
 	li	r4,0 /* LPES = 0 */
-	bl	__init_LPCR
+	bl	__init_LPCR_ISA206
 	bl	__init_HFSCR
 	bl	__init_tlb_power8
 	bl	__init_PMU_HV
@@ -84,7 +84,7 @@ _GLOBAL(__restore_cpu_power8)
 	mfspr   r3,SPRN_LPCR
 	ori	r3, r3, LPCR_PECEDH
 	li	r4,0 /* LPES = 0 */
-	bl	__init_LPCR
+	bl	__init_LPCR_ISA206
 	bl	__init_HFSCR
 	bl	__init_tlb_power8
 	bl	__init_PMU_HV
@@ -108,7 +108,7 @@ _GLOBAL(__setup_cpu_power9)
 	LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
 	andc	r3, r3, r4
 	li	r4,0 /* LPES = 0 */
-	bl	__init_LPCR
+	bl	__init_LPCR_ISA300
 	bl	__init_HFSCR
 	bl	__init_tlb_power9
 	bl	__init_PMU_HV
@@ -132,7 +132,7 @@ _GLOBAL(__restore_cpu_power9)
 	LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
 	andc	r3, r3, r4
 	li	r4,0 /* LPES = 0 */
-	bl	__init_LPCR
+	bl	__init_LPCR_ISA300
 	bl	__init_HFSCR
 	bl	__init_tlb_power9
 	bl	__init_PMU_HV
@@ -150,7 +150,7 @@ __init_hvmode_206:
 	std	r5,CPU_SPEC_FEATURES(r4)
 	blr
 
-__init_LPCR:
+__init_LPCR_ISA206:
 	/* Setup a sane LPCR:
 	 *   Called with initial LPCR in R3 and desired LPES 2-bit value in R4
 	 *
@@ -163,6 +163,11 @@ __init_LPCR:
 	 *
 	 * Other bits untouched for now
 	 */
+	li	r5,0x10
+	rldimi	r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
+
+	/* POWER9 has no VRMASD */
+__init_LPCR_ISA300:
 	rldimi	r3,r4, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
 	ori	r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2)
 	li	r5,4
@@ -170,8 +175,6 @@ __init_LPCR:
 	clrrdi	r3,r3,1		/* clear HDICE */
 	li	r5,4
 	rldimi	r3,r5, LPCR_VC_SH, 0
-	li	r5,0x10
-	rldimi	r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
 	mtspr	SPRN_LPCR,r3
 	isync
 	blr
-- 
cgit v1.2.1