summaryrefslogtreecommitdiff
path: root/rts/StgCRun.c
diff options
context:
space:
mode:
Diffstat (limited to 'rts/StgCRun.c')
-rw-r--r--rts/StgCRun.c251
1 files changed, 230 insertions, 21 deletions
diff --git a/rts/StgCRun.c b/rts/StgCRun.c
index 2166249c2a..92b0696c2b 100644
--- a/rts/StgCRun.c
+++ b/rts/StgCRun.c
@@ -59,8 +59,8 @@
#include "StgRun.h"
#include "Capability.h"
-#if defined(DEBUG)
#include "RtsUtils.h"
+#if defined(DEBUG)
#include "Printer.h"
#endif
@@ -90,20 +90,18 @@ StgFunPtr StgReturn(void)
#else /* !USE_MINIINTERPRETER */
-#if defined(LEADING_UNDERSCORE)
-#define STG_RUN "_StgRun"
-#define STG_RETURN "_StgReturn"
-#else
-#define STG_RUN "StgRun"
-#define STG_RETURN "StgReturn"
-#endif
-
#if defined(mingw32_HOST_OS)
-// On windows the stack has to be allocated 4k at a time, otherwise
-// we get a segfault. The C compiler knows how to do this (it calls
-// _alloca()), so we make sure that we can allocate as much stack as
-// we need:
-StgWord8 *win32AllocStack(void)
+/*
+ * Note [Windows Stack allocations]
+ *
+ * On windows the stack has to be allocated 4k at a time, otherwise
+ * we get a segfault. The C compiler knows how to do this (it calls
+ * _alloca()), so we make sure that we can allocate as much stack as
+ * we need. However since we are doing a local stack allocation and the value
+ * isn't valid outside the frame, compilers are free to optimize this allocation
+ * and the corresponding stack check away. So to prevent that we request that
+ * this function never be optimized (See #14669). */
+STG_NO_OPTIMIZE StgWord8 *win32AllocStack(void)
{
StgWord8 stack[RESERVED_C_STACK_BYTES + 16 + 12];
return stack;
@@ -230,7 +228,7 @@ StgRunIsImplementedInAssembler(void)
);
}
-#endif
+#endif // defined(i386_HOST_ARCH)
/* ----------------------------------------------------------------------------
x86-64 is almost the same as plain x86.
@@ -251,6 +249,121 @@ StgRunIsImplementedInAssembler(void)
#define STG_HIDDEN ".hidden "
#endif
+/*
+Note [Unwinding foreign exports on x86-64]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+For foreign exports, that is Haskell functions exported as C functions when
+we unwind we have to unwind from Haskell code into C code. The current story
+is as follows:
+
+ * The Haskell stack always has stg_stop_thread_info frame at the bottom
+ * We annotate stg_stop_thread_info to unwind the instruction pointer to a
+ label inside StgRun called StgRunJmp. It's the last instruction before the
+ code jumps into Haskell.
+ * StgRun - which is implemented in assembler is annotated with some manual
+ unwinding information. It unwinds all the registers that it has saved
+ on the stack. This is important as rsp and rbp are often required for
+ getting to the next frame and the rest of the saved registers are useful
+ when inspecting locals in gdb.
+
+
+ Example x86-64 stack for an FFI call
+ from C into a Haskell function:
+
+
+ HASKELL HEAP
+ "ADDRESS SPACE"
+
+ +--------------------+ <------ rbp
+ | |
+ | |
+ | |
+ | |
+ | Haskell |
+ | evaluation stack |
+ | |
+ | |
+ |--------------------|
+ |stg_catch_frame_info|
+ |--------------------|
+ | stg_forceIO_info |
+ |--------------------|
+ |stg_stop_thread_info| -------
+ +--------------------+ |
+ ... |
+ (other heap objects) |
+ ... |
+ |
+ |
+ |
+ C STACK "ADDRESS SPACE" |
+ v
+ +-----------------------------+ <------ rsp
+ | |
+ | RESERVED_C_STACK_BYTES ~16k |
+ | |
+ |-----------------------------|
+ | rbx ||
+ |-----------------------------| \
+ | rbp | |
+ |-----------------------------| \
+ | r12 | |
+ |-----------------------------| \
+ | r13 | | STG_RUN_STACK_FRAME_SIZE
+ |-----------------------------| /
+ | r14 | |
+ |-----------------------------| /
+ | r15 | |
+ |-----------------------------|/
+ | rip saved by call StgRun |
+ | in schedule() |
+ +-----------------------------+
+ ...
+ schedule() stack frame
+
+
+ Lower addresses on the top
+
+One little snag in this approach is that the annotations accepted by the
+assembler are surprisingly unexpressive. I had to resort to a .cfi_escape
+and hand-assemble a DWARF expression. What made it worse was that big numbers
+are LEB128 encoded, which makes them variable byte length, with length depending
+on the magnitude.
+
+Here's an example stack generated this way:
+
+ Thread 1 "m" hit Breakpoint 1, Fib_zdfstableZZC0ZZCmainZZCFibZZCfib1_info () at Fib.hs:9
+ 9 fib a = return (a + 1)
+ #0 Fib_zdfstableZZC0ZZCmainZZCFibZZCfib1_info () at Fib.hs:9
+ #1 stg_catch_frame_info () at rts/Exception.cmm:372
+ #2 stg_forceIO_info () at rts/StgStartup.cmm:178
+ #3 stg_stop_thread_info () at rts/StgStartup.cmm:42
+ #4 0x00000000007048ab in StgRunIsImplementedInAssembler () at rts/StgCRun.c:255
+ #5 0x00000000006fcf42 in schedule (initialCapability=initialCapability@entry=0x8adac0 <MainCapability>, task=task@entry=0x8cf2a0) at rts/Schedule.c:451
+ #6 0x00000000006fe18e in scheduleWaitThread (tso=0x4200006388, ret=<optimized out>, pcap=0x7fffffffdac0) at rts/Schedule.c:2533
+ #7 0x000000000040a21e in hs_fib ()
+ #8 0x000000000040a083 in main (argc=1, argv=0x7fffffffdc48) at m.cpp:15
+
+(This is from patched gdb. See Note [Info Offset].)
+
+The previous approach was to encode the unwinding information for select
+registers in stg_stop_thread_info with Cmm annotations. The unfortunate thing
+about that approach was that it required introduction of an artificial MachSp
+register that wasn't meaningful outside unwinding. I discovered that to get
+stack unwinding working under -threaded runtime I also needed to unwind rbp
+which would require adding MachRbp. If we wanted to see saved locals in gdb,
+we'd have to add more. The core of the problem is that Cmm is architecture
+independent, while unwinding isn't.
+
+Note [Unwinding foreign imports]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+For unwinding foreign imports, that is C functions exposed as Haskell functions
+no special handling is required. The C function unwinds according to the rip
+saved on the stack by the call instruction. Then we perform regular Haskell
+stack unwinding.
+*/
+
+
static void GNUC3_ATTRIBUTE(used)
StgRunIsImplementedInAssembler(void)
{
@@ -273,10 +386,61 @@ StgRunIsImplementedInAssembler(void)
"movq %%r14,32(%%rax)\n\t"
"movq %%r15,40(%%rax)\n\t"
#if defined(mingw32_HOST_OS)
+ /*
+ * Additional callee saved registers on Win64. This must match
+ * callClobberedRegisters in compiler/nativeGen/X86/Regs.hs as
+ * both represent the Win64 calling convention.
+ */
"movq %%rdi,48(%%rax)\n\t"
"movq %%rsi,56(%%rax)\n\t"
- "movq %%xmm6,64(%%rax)\n\t"
+ "movq %%xmm6, 64(%%rax)\n\t"
+ "movq %%xmm7, 72(%%rax)\n\t"
+ "movq %%xmm8, 80(%%rax)\n\t"
+ "movq %%xmm9, 88(%%rax)\n\t"
+ "movq %%xmm10, 96(%%rax)\n\t"
+ "movq %%xmm11,104(%%rax)\n\t"
+ "movq %%xmm12,112(%%rax)\n\t"
+ "movq %%xmm13,120(%%rax)\n\t"
+ "movq %%xmm14,128(%%rax)\n\t"
+ "movq %%xmm15,136(%%rax)\n\t"
+#endif
+
+ /*
+ * Let the unwinder know where we saved the registers
+ * See Note [Unwinding foreign exports on x86-64].
+ */
+ ".cfi_def_cfa rsp, 0\n\t"
+ ".cfi_offset rbx, %c2\n\t"
+ ".cfi_offset rbp, %c3\n\t"
+ ".cfi_offset r12, %c4\n\t"
+ ".cfi_offset r13, %c5\n\t"
+ ".cfi_offset r14, %c6\n\t"
+ ".cfi_offset r15, %c7\n\t"
+ ".cfi_offset rip, %c8\n\t"
+ ".cfi_escape " // DW_CFA_val_expression is not expressible otherwise
+ "0x16, " // DW_CFA_val_expression
+ "0x07, " // register num 7 - rsp
+ "0x04, " // block length
+ "0x77, " // DW_OP_breg7 - signed LEB128 offset from rsp
+#define RSP_DELTA (RESERVED_C_STACK_BYTES + STG_RUN_STACK_FRAME_SIZE + 8)
+ "%c9" // signed LEB128 encoded delta - byte 1
+#if (RSP_DELTA >> 7) > 0
+ ", %c10" // signed LEB128 encoded delta - byte 2
#endif
+
+#if (RSP_DELTA >> 14) > 0
+ ", %c11" // signed LEB128 encoded delta - byte 3
+#endif
+
+#if (RSP_DELTA >> 21) > 0
+ ", %c12" // signed LEB128 encoded delta - byte 4
+#endif
+
+#if (RSP_DELTA >> 28) > 0
+#error "RSP_DELTA too big"
+#endif
+ "\n\t"
+
/*
* Set BaseReg
*/
@@ -293,6 +457,17 @@ StgRunIsImplementedInAssembler(void)
#else
"movq %%rdi,%%rax\n\t"
#endif
+
+ STG_GLOBAL xstr(STG_RUN_JMP) "\n"
+#if !defined(mingw32_HOST_OS)
+ STG_HIDDEN xstr(STG_RUN_JMP) "\n"
+#endif
+#if HAVE_SUBSECTIONS_VIA_SYMBOLS
+ // If we have deadstripping enabled and a label is detected as unused
+ // the code gets nop'd out.
+ ".no_dead_strip " xstr(STG_RUN_JMP) "\n"
+#endif
+ xstr(STG_RUN_JMP) ":\n\t"
"jmp *%%rax\n\t"
".globl " STG_RETURN "\n"
@@ -311,16 +486,50 @@ StgRunIsImplementedInAssembler(void)
"movq 32(%%rsp),%%r14\n\t"
"movq 40(%%rsp),%%r15\n\t"
#if defined(mingw32_HOST_OS)
- "movq 48(%%rsp),%%rdi\n\t"
- "movq 56(%%rsp),%%rsi\n\t"
- "movq 64(%%rsp),%%xmm6\n\t"
+ "movq 48(%%rsp),%%rdi\n\t"
+ "movq 56(%%rsp),%%rsi\n\t"
+ "movq 64(%%rsp),%%xmm6\n\t"
+ "movq 72(%%rax),%%xmm7\n\t"
+ "movq 80(%%rax),%%xmm8\n\t"
+ "movq 88(%%rax),%%xmm9\n\t"
+ "movq 96(%%rax),%%xmm10\n\t"
+ "movq 104(%%rax),%%xmm11\n\t"
+ "movq 112(%%rax),%%xmm12\n\t"
+ "movq 120(%%rax),%%xmm13\n\t"
+ "movq 128(%%rax),%%xmm14\n\t"
+ "movq 136(%%rax),%%xmm15\n\t"
#endif
"addq %1, %%rsp\n\t"
"retq"
:
: "i"(RESERVED_C_STACK_BYTES),
- "i"(STG_RUN_STACK_FRAME_SIZE /* stack frame size */)
+ "i"(STG_RUN_STACK_FRAME_SIZE /* stack frame size */),
+ "i"(RESERVED_C_STACK_BYTES /* rbx relative to cfa (rsp) */),
+ "i"(RESERVED_C_STACK_BYTES + 8 /* rbp relative to cfa (rsp) */),
+ "i"(RESERVED_C_STACK_BYTES + 16 /* r12 relative to cfa (rsp) */),
+ "i"(RESERVED_C_STACK_BYTES + 24 /* r13 relative to cfa (rsp) */),
+ "i"(RESERVED_C_STACK_BYTES + 32 /* r14 relative to cfa (rsp) */),
+ "i"(RESERVED_C_STACK_BYTES + 40 /* r15 relative to cfa (rsp) */),
+ "i"(RESERVED_C_STACK_BYTES + STG_RUN_STACK_FRAME_SIZE
+ /* rip relative to cfa */),
+ "i"((RSP_DELTA & 127) | (128 * ((RSP_DELTA >> 7) > 0)))
+ /* signed LEB128-encoded delta from rsp - byte 1 */
+#if (RSP_DELTA >> 7) > 0
+ , "i"(((RSP_DELTA >> 7) & 127) | (128 * ((RSP_DELTA >> 14) > 0)))
+ /* signed LEB128-encoded delta from rsp - byte 2 */
+#endif
+
+#if (RSP_DELTA >> 14) > 0
+ , "i"(((RSP_DELTA >> 14) & 127) | (128 * ((RSP_DELTA >> 21) > 0)))
+ /* signed LEB128-encoded delta from rsp - byte 3 */
+#endif
+
+#if (RSP_DELTA >> 21) > 0
+ , "i"(((RSP_DELTA >> 21) & 127) | (128 * ((RSP_DELTA >> 28) > 0)))
+ /* signed LEB128-encoded delta from rsp - byte 4 */
+#endif
+#undef RSP_DELTA
);
/*
* See Note [Stack Alignment on X86]
@@ -766,7 +975,7 @@ StgRun(StgFunPtr f, StgRegTable *basereg) {
/*
* Save callee-saves registers on behalf of the STG code.
* Floating point registers only need the bottom 64 bits preserved.
- * We need to use the the names x16, x17, x29 and x30 instead of ip0
+ * We need to use the names x16, x17, x29 and x30 instead of ip0
* ip1, fp and lp because one of either clang or gcc doesn't understand
* the later names.
*/