From d28c4393a7bf558538e9def269c1caeab6ec056f Mon Sep 17 00:00:00 2001
From: "Prasanna S.P" <prasanna@in.ibm.com>
Date: Tue, 26 Sep 2006 10:52:34 +0200
Subject: [PATCH] x86: error_code is not safe for kprobes

This patch moves entry.S::error_code into the .kprobes.text section:
code that is marked unsafe for kprobes jumps directly to error_code, so
that entry point must be marked unsafe as well.
This patch also changes the ".previous .text" asm directives that close
kprobes sections to plain ".previous".

AK: Following a similar i386 patch from Chuck Ebbert
AK: Also merged Jeremy's fix in.

+From: Jeremy Fitzhardinge <jeremy@goop.org>

KPROBE_ENTRY does a .section .kprobes.text, and expects its users to
do a .previous at the end of the function.

Unfortunately, if any code within the function switches sections, for
example .fixup, then the .previous ends up putting all subsequent code
into .fixup.  Worse, any subsequent .fixup code gets intermingled with
the code it's supposed to be fixing (which is also in .fixup).  It's
surprising this didn't cause more havoc.

The fix is to use .pushsection/.popsection, so this stuff nests
properly.  A further cleanup would be to get rid of all
.section/.previous pairs, since they're inherently fragile.
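
To make the nesting concrete, here is a minimal sketch of what the
paired macros look like after this fix, modelled on the era's
include/linux/linkage.h; treat the exact definitions as an assumption,
not a quote:

    /* Sketch, assuming CONFIG_KPROBES selects the real section. */
    #ifdef CONFIG_KPROBES
    # define KPROBE_ENTRY(name)                 \
            .pushsection .kprobes.text, "ax";   \
            ENTRY(name)
    # define KPROBE_END(name)                   \
            .popsection
    #else
    # define KPROBE_ENTRY(name)  ENTRY(name)
    # define KPROBE_END(name)
    #endif

With .pushsection/.popsection the assembler keeps a proper section
stack, so a .section .fixup/.previous pair inside the function body
nests cleanly instead of leaking all following code into .fixup.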

+From: Chuck Ebbert <76306.1226@compuserve.com>

Because code marked unsafe for kprobes jumps directly to
entry.S::error_code, that must be marked unsafe as well.
The easiest way to do that is to move the page fault entry
point to just before error_code and let it inherit the same
section.

Also moved all the ".previous" asm directives for kprobes
sections to column 1 and removed ".text" from them.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/entry.S | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 87f9f60b803b..ba22ec8fab54 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -591,11 +591,9 @@ ENTRY(name)				\
 /* The include is where all of the SMP etc. interrupts come from */
 #include "entry_arch.h"
 
-ENTRY(divide_error)
-	RING0_INT_FRAME
-	pushl $0			# no error code
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl $do_divide_error
+KPROBE_ENTRY(page_fault)
+	RING0_EC_FRAME
+	pushl $do_page_fault
 	CFI_ADJUST_CFA_OFFSET 4
 	ALIGN
 error_code:
@@ -645,6 +643,7 @@ error_code:
 	call *%edi
 	jmp ret_from_exception
 	CFI_ENDPROC
+KPROBE_END(page_fault)
 
 ENTRY(coprocessor_error)
 	RING0_INT_FRAME
@@ -720,7 +719,8 @@ debug_stack_correct:
 	call do_debug
 	jmp ret_from_exception
 	CFI_ENDPROC
-	.previous .text
+KPROBE_END(debug)
+
 /*
  * NMI is doubly nasty. It can happen _while_ we're handling
  * a debug fault, and the debug fault hasn't yet been able to
@@ -816,7 +816,7 @@ KPROBE_ENTRY(int3)
 	call do_int3
 	jmp ret_from_exception
 	CFI_ENDPROC
-	.previous .text
+KPROBE_END(int3)
 
 ENTRY(overflow)
 	RING0_INT_FRAME
@@ -881,7 +881,7 @@ KPROBE_ENTRY(general_protection)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
-	.previous .text
+KPROBE_END(general_protection)
 
 ENTRY(alignment_check)
 	RING0_EC_FRAME
@@ -890,13 +890,14 @@ ENTRY(alignment_check)
 	jmp error_code
 	CFI_ENDPROC
 
-KPROBE_ENTRY(page_fault)
-	RING0_EC_FRAME
-	pushl $do_page_fault
+ENTRY(divide_error)
+	RING0_INT_FRAME
+	pushl $0			# no error code
+	CFI_ADJUST_CFA_OFFSET 4
+	pushl $do_divide_error
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
-	.previous .text
 
 #ifdef CONFIG_X86_MCE
 ENTRY(machine_check)
-- 


From 02ba1a32dbd3d406530a17a2643a8f0f8cbf3acc Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:35 +0200
Subject: [PATCH] i386: move kernel_thread_helper into entry.S

And add proper CFI annotation to it, which was previously impossible.
This prevents "stuck" messages from the dwarf2 unwinder when it reaches
the top of a kernel stack.
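
For context, kernel_thread_helper is entered with the function to call
in %ebx and its argument in %edx; they are set up by kernel_thread() in
arch/i386/kernel/process.c, roughly as sketched below (field names from
the 2.6-era i386 pt_regs; treat the details as an approximation):

    int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
    {
            struct pt_regs regs;

            memset(&regs, 0, sizeof(regs));

            regs.ebx = (unsigned long) fn;   /* helper does call *%ebx */
            regs.edx = (unsigned long) arg;  /* pushed as fn's argument */

            regs.xds = __USER_DS;
            regs.xes = __USER_DS;
            regs.orig_eax = -1;
            regs.eip = (unsigned long) kernel_thread_helper;
            regs.xcs = __KERNEL_CS;
            regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;

            return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0,
                           &regs, 0, NULL, NULL);
    }

The helper's "pushl $0" fakes a NULL return address, giving the
unwinder a clean stopping point instead of a bogus frame at the top of
the stack.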

Includes feedback from Jan Beulich

Cc: jbeulich@novell.com
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/entry.S | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index ba22ec8fab54..dede506e5bd0 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -950,6 +950,19 @@ ENTRY(arch_unwind_init_running)
 ENDPROC(arch_unwind_init_running)
 #endif
 
+ENTRY(kernel_thread_helper)
+	pushl $0		# fake return address for unwinder
+	CFI_STARTPROC
+	movl %edx,%eax
+	push %edx
+	CFI_ADJUST_CFA_OFFSET 4
+	call *%ebx
+	push %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	call do_exit
+	CFI_ENDPROC
+ENDPROC(kernel_thread_helper)
+
 .section .rodata,"a"
 #include "syscall_table.S"
 
-- 


From 06039754d775d3e48e4a292e4f353321205eff53 Mon Sep 17 00:00:00 2001
From: Fernando Luis Vázquez Cao <fernando@oss.ntt.co.jp>
Date: Tue, 26 Sep 2006 10:52:36 +0200
Subject: [PATCH] i386: Disallow kprobes on NMI handlers

A kprobe hit inside the NMI handler returns with an early IRET, which
unmasks NMIs while the handler is still running; a nested NMI arriving
after that can cause recursion and stack corruption.

Note: This problem was originally spotted and solved by Andi Kleen in the
x86_64 architecture. This patch is an adaptation of his patch for i386.

AK: Merged with current code which was a bit different.
AK: Removed printk in nmi handler that shouldn't be there in the first place
AK: Added missing include.
AK: Added KPROBE_END

Signed-off-by: Fernando Vazquez <fernando@intellilink.co.jp>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/entry.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index dede506e5bd0..0928f70639aa 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -729,7 +729,7 @@ KPROBE_END(debug)
  * check whether we got an NMI on the debug path where the debug
  * fault happened on the sysenter path.
  */
-ENTRY(nmi)
+KPROBE_ENTRY(nmi)
 	RING0_INT_FRAME
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
@@ -805,6 +805,7 @@ nmi_16bit_stack:
 	.align 4
 	.long 1b,iret_exc
 .previous
+KPROBE_END(nmi)
 
 KPROBE_ENTRY(int3)
 	RING0_INT_FRAME
-- 


From a549b86dd0f3cbffcd5f9343f4ae7fcd59f7e756 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Tue, 26 Sep 2006 10:52:37 +0200
Subject: [PATCH] i386: annotate FIX_STACK() and the rest of nmi()

In i386's entry.S, FIX_STACK() needs annotation because it
replaces the stack pointer.  And the rest of nmi() needs
annotation in order to compile with these new annotations.
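
The annotations track exactly what FIX_STACK() does to the stack: after
the movl that replaces %esp, the CFA is simply the new %esp and the old
return address is momentarily unrecoverable, until a fake eflags/CS/EIP
frame is pushed again.  A commented sketch of the annotated macro body
(the authoritative version is in the diff below):

    movl TSS_sysenter_esp0+offset(%esp),%esp  # switch stacks
    CFI_DEF_CFA esp, 0          # the CFA is just the new %esp
    CFI_UNDEFINED eip           # old return address is gone for now
    pushfl
    CFI_ADJUST_CFA_OFFSET 4     # each push moves the CFA down by 4
    pushl $__KERNEL_CS
    CFI_ADJUST_CFA_OFFSET 4
    pushl $sysenter_past_esp
    CFI_ADJUST_CFA_OFFSET 4
    CFI_REL_OFFSET eip, 0       # the re-pushed eip sits at the top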

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/entry.S | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 0928f70639aa..4b0845249222 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -701,9 +701,15 @@ device_not_available_emulate:
 	jne ok;					\
 label:						\
 	movl TSS_sysenter_esp0+offset(%esp),%esp;	\
+	CFI_DEF_CFA esp, 0;			\
+	CFI_UNDEFINED eip;			\
 	pushfl;					\
+	CFI_ADJUST_CFA_OFFSET 4;		\
 	pushl $__KERNEL_CS;			\
-	pushl $sysenter_past_esp
+	CFI_ADJUST_CFA_OFFSET 4;		\
+	pushl $sysenter_past_esp;		\
+	CFI_ADJUST_CFA_OFFSET 4;		\
+	CFI_REL_OFFSET eip, 0
 
 KPROBE_ENTRY(debug)
 	RING0_INT_FRAME
@@ -754,6 +760,7 @@ KPROBE_ENTRY(nmi)
 	cmpl $sysenter_entry,12(%esp)
 	je nmi_debug_stack_check
 nmi_stack_correct:
+	/* We have a RING0_INT_FRAME here */
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
@@ -764,9 +771,12 @@ nmi_stack_correct:
 	CFI_ENDPROC
 
 nmi_stack_fixup:
+	RING0_INT_FRAME
 	FIX_STACK(12,nmi_stack_correct, 1)
 	jmp nmi_stack_correct
+
 nmi_debug_stack_check:
+	/* We have a RING0_INT_FRAME here */
 	cmpw $__KERNEL_CS,16(%esp)
 	jne nmi_stack_correct
 	cmpl $debug,(%esp)
@@ -777,8 +787,10 @@ nmi_debug_stack_check:
 	jmp nmi_stack_correct
 
 nmi_16bit_stack:
-	RING0_INT_FRAME
-	/* create the pointer to lss back */
+	/* We have a RING0_INT_FRAME here.
+	 *
+	 * create the pointer to lss back
+	 */
 	pushl %ss
 	CFI_ADJUST_CFA_OFFSET 4
 	pushl %esp
-- 


From 0da5db313317e3195482d3e660a1074857374a89 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 26 Sep 2006 10:52:39 +0200
Subject: [PATCH] i386: Abstract sensitive instructions

Abstract sensitive instructions in assembler code, replacing them with
macros (which currently are #defined to the native versions).  We use
long names: the assembler is case-insensitive, so if something went
wrong and a macro named after its instruction (e.g. CLI) failed to
expand, it would silently assemble as the native instruction anyway; a
long name fails loudly instead.

Resulting object files are exactly the same as before.
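
As a purely hypothetical illustration of where this abstraction leads,
a paravirtualized build could redefine the same macros to call into a
hypervisor instead of emitting the privileged opcodes.  The names below
are invented for the example; the eventual paravirt interface looks
different:

    #ifdef CONFIG_PARAVIRT
    /* Hypothetical: route interrupt-flag changes through hypervisor
     * hooks instead of executing cli/sti directly. */
    # define DISABLE_INTERRUPTS         call hv_irq_disable
    # define ENABLE_INTERRUPTS          call hv_irq_enable
    #else
    /* Native versions, exactly as in this patch. */
    # define DISABLE_INTERRUPTS         cli
    # define ENABLE_INTERRUPTS          sti
    #endif

Because every call site already goes through the macros, switching the
definitions is the only change entry.S would ever need.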

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/entry.S | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 4b0845249222..3872fca5c74a 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -76,8 +76,15 @@ DF_MASK		= 0x00000400
 NT_MASK		= 0x00004000
 VM_MASK		= 0x00020000
 
+/* These are replacements for paravirtualization */
+#define DISABLE_INTERRUPTS		cli
+#define ENABLE_INTERRUPTS		sti
+#define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
+#define INTERRUPT_RETURN		iret
+#define GET_CR0_INTO_EAX		movl %cr0, %eax
+
 #ifdef CONFIG_PREEMPT
-#define preempt_stop		cli; TRACE_IRQS_OFF
+#define preempt_stop		DISABLE_INTERRUPTS; TRACE_IRQS_OFF
 #else
 #define preempt_stop
 #define resume_kernel		restore_nocheck
@@ -236,7 +243,7 @@ check_userspace:
 	testl $(VM_MASK | 3), %eax
 	jz resume_kernel
 ENTRY(resume_userspace)
- 	cli				# make sure we don't miss an interrupt
+ 	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	movl TI_flags(%ebp), %ecx
@@ -247,7 +254,7 @@ ENTRY(resume_userspace)
 
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
-	cli
+	DISABLE_INTERRUPTS
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
 	jnz restore_nocheck
 need_resched:
@@ -275,7 +282,7 @@ sysenter_past_esp:
 	 * No need to follow this irqs on/off section: the syscall
 	 * disabled irqs and here we enable it straight after entry:
 	 */
-	sti
+	ENABLE_INTERRUPTS
 	pushl $(__USER_DS)
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET ss, 0*/
@@ -320,7 +327,7 @@ sysenter_past_esp:
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
 	movl %eax,EAX(%esp)
-	cli
+	DISABLE_INTERRUPTS
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
@@ -330,8 +337,7 @@ sysenter_past_esp:
 	movl OLDESP(%esp), %ecx
 	xorl %ebp,%ebp
 	TRACE_IRQS_ON
-	sti
-	sysexit
+	ENABLE_INTERRUPTS_SYSEXIT
 	CFI_ENDPROC
 
 
@@ -356,7 +362,7 @@ syscall_call:
 	call *sys_call_table(,%eax,4)
 	movl %eax,EAX(%esp)		# store the return value
 syscall_exit:
-	cli				# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
@@ -381,11 +387,11 @@ restore_nocheck_notrace:
 	RESTORE_REGS
 	addl $4, %esp
 	CFI_ADJUST_CFA_OFFSET -4
-1:	iret
+1:	INTERRUPT_RETURN
 .section .fixup,"ax"
 iret_exc:
 	TRACE_IRQS_ON
-	sti
+	ENABLE_INTERRUPTS
 	pushl $0			# no error code
 	pushl $do_iret_error
 	jmp error_code
@@ -409,7 +415,7 @@ ldt_ss:
 	 * dosemu and wine happy. */
 	subl $8, %esp		# reserve space for switch16 pointer
 	CFI_ADJUST_CFA_OFFSET 8
-	cli
+	DISABLE_INTERRUPTS
 	TRACE_IRQS_OFF
 	movl %esp, %eax
 	/* Set up the 16bit stack frame with switch32 pointer on top,
@@ -419,7 +425,7 @@ ldt_ss:
 	TRACE_IRQS_IRET
 	RESTORE_REGS
 	lss 20+4(%esp), %esp	# switch to 16bit stack
-1:	iret
+1:	INTERRUPT_RETURN
 .section __ex_table,"a"
 	.align 4
 	.long 1b,iret_exc
@@ -434,7 +440,7 @@ work_pending:
 	jz work_notifysig
 work_resched:
 	call schedule
-	cli				# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
@@ -490,7 +496,7 @@ syscall_exit_work:
 	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
 	jz work_pending
 	TRACE_IRQS_ON
-	sti				# could let do_syscall_trace() call
+	ENABLE_INTERRUPTS		# could let do_syscall_trace() call
 					# schedule() instead
 	movl %esp, %eax
 	movl $1, %edx
@@ -668,7 +674,7 @@ ENTRY(device_not_available)
 	pushl $-1			# mark this as an int
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
-	movl %cr0, %eax
+	GET_CR0_INTO_EAX
 	testl $0x4, %eax		# EM (math emulation bit)
 	jne device_not_available_emulate
 	preempt_stop
@@ -811,7 +817,7 @@ nmi_16bit_stack:
 	call do_nmi
 	RESTORE_REGS
 	lss 12+4(%esp), %esp		# back to 16bit stack
-1:	iret
+1:	INTERRUPT_RETURN
 	CFI_ENDPROC
 .section __ex_table,"a"
 	.align 4
-- 


From 78be3706b21a232310590fe00258b224177ac05f Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 26 Sep 2006 10:52:39 +0200
Subject: [PATCH] i386: Allow a kernel not to be in ring 0

We allow for the fact that the guest kernel may not run in ring 0.  This
requires some abstraction in a few places when setting %cs or checking
privilege level (user vs kernel).

This is Chris' [RFC PATCH 15/33] move segment checks to subarch, except rather
than using #define USER_MODE_MASK which depends on a config option, we use
Zach's more flexible approach of assuming ring 3 == userspace.  I also used
"get_kernel_rpl()" over "get_kernel_cs()" because I think it reads better in
the code...

1) Remove the hardcoded 3 and introduce #define SEGMENT_RPL_MASK 3
2) Add a get_kernel_rpl() macro, and don't assume it's zero.

And:

Clean up of the patch for letting the kernel run other than ring 0:

a. Add some comments about the SEGMENT_IS_*_CODE() macros.
b. Add a USER_RPL macro.  (Code was comparing a value to a mask
   in some places and to the magic number 3 in other places.)
c. Add macros for table indicator field and use them.
d. Change the entry.S tests for LDT stack segment to use the macros
   (the relevant definitions are sketched below).
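
The definitions themselves live in include/asm-i386/segment.h, outside
this entry.S-only excerpt, so take the exact forms below as a sketch of
the idea rather than a quote:

    /* Selector layout on x86: bits 0-1 hold the requested privilege
     * level (RPL), bit 2 is the table indicator (0 = GDT, 1 = LDT). */
    #define SEGMENT_RPL_MASK        0x3
    #define SEGMENT_TI_MASK         0x4
    #define SEGMENT_LDT             0x4     /* selector is in the LDT */
    #define SEGMENT_GDT             0x0     /* selector is in the GDT */

    /* Userspace is assumed to run in ring 3... */
    #define USER_RPL                0x3
    /* ...while a native kernel still runs in ring 0. */
    #define get_kernel_rpl()        0

With these, the check "cmpl $USER_RPL, %eax; jb resume_kernel" in the
diff reads naturally: anything below ring 3 (and not v8086 mode) is a
return to the kernel.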

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/entry.S | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 3872fca5c74a..284f2e908ad0 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -240,8 +240,9 @@ ret_from_intr:
 check_userspace:
 	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
 	movb CS(%esp), %al
-	testl $(VM_MASK | 3), %eax
-	jz resume_kernel
+	andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
+	cmpl $USER_RPL, %eax
+	jb resume_kernel		# not returning to v8086 or userspace
 ENTRY(resume_userspace)
  	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
@@ -377,8 +378,8 @@ restore_all:
 	# See comments in process.c:copy_thread() for details.
 	movb OLDSS(%esp), %ah
 	movb CS(%esp), %al
-	andl $(VM_MASK | (4 << 8) | 3), %eax
-	cmpl $((4 << 8) | 3), %eax
+	andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+	cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
 	CFI_REMEMBER_STATE
 	je ldt_ss			# returning to user-space with LDT SS
 restore_nocheck:
-- 


From adf1423698f00d00b267f7dca8231340ce7d65ef Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] i386/x86-64: Work around gcc bug with noreturn functions in
 unwinder

Current gcc generates calls, not jumps, to noreturn functions.  When
that happens the return address can point to the next function, which
confuses the unwinder.

This patch works around it by marking asynchronous exception frames, in
contrast to normal call frames, in the unwind information, and teaching
the unwinder to decode this.

For normal call frames the unwinder now subtracts one from the address,
which avoids this problem.  The standard libgcc unwinder uses the same
trick.
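
Inside the unwinder the adjustment is just a conditional on the frame
type, sketched below with hypothetical helper and field names, since
the unwinder source is not part of this excerpt:

    /* Sketch: choose the address used to look up unwind info.  A call
     * at the very end of a noreturn function leaves a return address
     * pointing at the next function; subtracting one keeps the lookup
     * inside the caller.  Frames marked with .cfi_signal_frame are
     * asynchronous, so their address is exact and used unadjusted. */
    unsigned long lookup_pc = frame->signal ? frame->pc : frame->pc - 1;
    struct unwind_info *info = find_unwind_info(lookup_pc);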

It doesn't include adjustment of the printed address (i.e. for the
original example, it'd still be kernel_math_error+0 that gets
displayed, but the unwinder wouldn't get confused anymore).

Unfortunately this only works with binutils 2.17+ and some versions of
H.J. Lu's 2.16 because earlier binutils don't support .cfi_signal_frame.

[AK: added automatic detection of the new binutils and wrote description]

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/entry.S | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 284f2e908ad0..5a63d6fdb70e 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -183,18 +183,21 @@ VM_MASK		= 0x00020000
 
 #define RING0_INT_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, 3*4;\
 	/*CFI_OFFSET cs, -2*4;*/\
 	CFI_OFFSET eip, -3*4
 
 #define RING0_EC_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, 4*4;\
 	/*CFI_OFFSET cs, -2*4;*/\
 	CFI_OFFSET eip, -3*4
 
 #define RING0_PTREGS_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, OLDESP-EBX;\
 	/*CFI_OFFSET cs, CS-OLDESP;*/\
 	CFI_OFFSET eip, EIP-OLDESP;\
@@ -275,6 +278,7 @@ need_resched:
 	# sysenter call handler stub
 ENTRY(sysenter_entry)
 	CFI_STARTPROC simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA esp, 0
 	CFI_REGISTER esp, ebp
 	movl TSS_sysenter_esp0(%esp),%esp
-- 