diff options
Diffstat (limited to 'rts/StgCRun.c')
-rw-r--r-- | rts/StgCRun.c | 251 |
1 files changed, 230 insertions, 21 deletions
diff --git a/rts/StgCRun.c b/rts/StgCRun.c index 2166249c2a..92b0696c2b 100644 --- a/rts/StgCRun.c +++ b/rts/StgCRun.c @@ -59,8 +59,8 @@ #include "StgRun.h" #include "Capability.h" -#if defined(DEBUG) #include "RtsUtils.h" +#if defined(DEBUG) #include "Printer.h" #endif @@ -90,20 +90,18 @@ StgFunPtr StgReturn(void) #else /* !USE_MINIINTERPRETER */ -#if defined(LEADING_UNDERSCORE) -#define STG_RUN "_StgRun" -#define STG_RETURN "_StgReturn" -#else -#define STG_RUN "StgRun" -#define STG_RETURN "StgReturn" -#endif - #if defined(mingw32_HOST_OS) -// On windows the stack has to be allocated 4k at a time, otherwise -// we get a segfault. The C compiler knows how to do this (it calls -// _alloca()), so we make sure that we can allocate as much stack as -// we need: -StgWord8 *win32AllocStack(void) +/* + * Note [Windows Stack allocations] + * + * On windows the stack has to be allocated 4k at a time, otherwise + * we get a segfault. The C compiler knows how to do this (it calls + * _alloca()), so we make sure that we can allocate as much stack as + * we need. However since we are doing a local stack allocation and the value + * isn't valid outside the frame, compilers are free to optimize this allocation + * and the corresponding stack check away. So to prevent that we request that + * this function never be optimized (See #14669). */ +STG_NO_OPTIMIZE StgWord8 *win32AllocStack(void) { StgWord8 stack[RESERVED_C_STACK_BYTES + 16 + 12]; return stack; @@ -230,7 +228,7 @@ StgRunIsImplementedInAssembler(void) ); } -#endif +#endif // defined(i386_HOST_ARCH) /* ---------------------------------------------------------------------------- x86-64 is almost the same as plain x86. @@ -251,6 +249,121 @@ StgRunIsImplementedInAssembler(void) #define STG_HIDDEN ".hidden " #endif +/* +Note [Unwinding foreign exports on x86-64] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For foreign exports, that is Haskell functions exported as C functions when +we unwind we have to unwind from Haskell code into C code. The current story +is as follows: + + * The Haskell stack always has stg_stop_thread_info frame at the bottom + * We annotate stg_stop_thread_info to unwind the instruction pointer to a + label inside StgRun called StgRunJmp. It's the last instruction before the + code jumps into Haskell. + * StgRun - which is implemented in assembler is annotated with some manual + unwinding information. It unwinds all the registers that it has saved + on the stack. This is important as rsp and rbp are often required for + getting to the next frame and the rest of the saved registers are useful + when inspecting locals in gdb. + + + Example x86-64 stack for an FFI call + from C into a Haskell function: + + + HASKELL HEAP + "ADDRESS SPACE" + + +--------------------+ <------ rbp + | | + | | + | | + | | + | Haskell | + | evaluation stack | + | | + | | + |--------------------| + |stg_catch_frame_info| + |--------------------| + | stg_forceIO_info | + |--------------------| + |stg_stop_thread_info| ------- + +--------------------+ | + ... | + (other heap objects) | + ... | + | + | + | + C STACK "ADDRESS SPACE" | + v + +-----------------------------+ <------ rsp + | | + | RESERVED_C_STACK_BYTES ~16k | + | | + |-----------------------------| + | rbx || + |-----------------------------| \ + | rbp | | + |-----------------------------| \ + | r12 | | + |-----------------------------| \ + | r13 | | STG_RUN_STACK_FRAME_SIZE + |-----------------------------| / + | r14 | | + |-----------------------------| / + | r15 | | + |-----------------------------|/ + | rip saved by call StgRun | + | in schedule() | + +-----------------------------+ + ... + schedule() stack frame + + + Lower addresses on the top + +One little snag in this approach is that the annotations accepted by the +assembler are surprisingly unexpressive. I had to resort to a .cfi_escape +and hand-assemble a DWARF expression. What made it worse was that big numbers +are LEB128 encoded, which makes them variable byte length, with length depending +on the magnitude. + +Here's an example stack generated this way: + + Thread 1 "m" hit Breakpoint 1, Fib_zdfstableZZC0ZZCmainZZCFibZZCfib1_info () at Fib.hs:9 + 9 fib a = return (a + 1) + #0 Fib_zdfstableZZC0ZZCmainZZCFibZZCfib1_info () at Fib.hs:9 + #1 stg_catch_frame_info () at rts/Exception.cmm:372 + #2 stg_forceIO_info () at rts/StgStartup.cmm:178 + #3 stg_stop_thread_info () at rts/StgStartup.cmm:42 + #4 0x00000000007048ab in StgRunIsImplementedInAssembler () at rts/StgCRun.c:255 + #5 0x00000000006fcf42 in schedule (initialCapability=initialCapability@entry=0x8adac0 <MainCapability>, task=task@entry=0x8cf2a0) at rts/Schedule.c:451 + #6 0x00000000006fe18e in scheduleWaitThread (tso=0x4200006388, ret=<optimized out>, pcap=0x7fffffffdac0) at rts/Schedule.c:2533 + #7 0x000000000040a21e in hs_fib () + #8 0x000000000040a083 in main (argc=1, argv=0x7fffffffdc48) at m.cpp:15 + +(This is from patched gdb. See Note [Info Offset].) + +The previous approach was to encode the unwinding information for select +registers in stg_stop_thread_info with Cmm annotations. The unfortunate thing +about that approach was that it required introduction of an artificial MachSp +register that wasn't meaningful outside unwinding. I discovered that to get +stack unwinding working under -threaded runtime I also needed to unwind rbp +which would require adding MachRbp. If we wanted to see saved locals in gdb, +we'd have to add more. The core of the problem is that Cmm is architecture +independent, while unwinding isn't. + +Note [Unwinding foreign imports] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For unwinding foreign imports, that is C functions exposed as Haskell functions +no special handling is required. The C function unwinds according to the rip +saved on the stack by the call instruction. Then we perform regular Haskell +stack unwinding. +*/ + + static void GNUC3_ATTRIBUTE(used) StgRunIsImplementedInAssembler(void) { @@ -273,10 +386,61 @@ StgRunIsImplementedInAssembler(void) "movq %%r14,32(%%rax)\n\t" "movq %%r15,40(%%rax)\n\t" #if defined(mingw32_HOST_OS) + /* + * Additional callee saved registers on Win64. This must match + * callClobberedRegisters in compiler/nativeGen/X86/Regs.hs as + * both represent the Win64 calling convention. + */ "movq %%rdi,48(%%rax)\n\t" "movq %%rsi,56(%%rax)\n\t" - "movq %%xmm6,64(%%rax)\n\t" + "movq %%xmm6, 64(%%rax)\n\t" + "movq %%xmm7, 72(%%rax)\n\t" + "movq %%xmm8, 80(%%rax)\n\t" + "movq %%xmm9, 88(%%rax)\n\t" + "movq %%xmm10, 96(%%rax)\n\t" + "movq %%xmm11,104(%%rax)\n\t" + "movq %%xmm12,112(%%rax)\n\t" + "movq %%xmm13,120(%%rax)\n\t" + "movq %%xmm14,128(%%rax)\n\t" + "movq %%xmm15,136(%%rax)\n\t" +#endif + + /* + * Let the unwinder know where we saved the registers + * See Note [Unwinding foreign exports on x86-64]. + */ + ".cfi_def_cfa rsp, 0\n\t" + ".cfi_offset rbx, %c2\n\t" + ".cfi_offset rbp, %c3\n\t" + ".cfi_offset r12, %c4\n\t" + ".cfi_offset r13, %c5\n\t" + ".cfi_offset r14, %c6\n\t" + ".cfi_offset r15, %c7\n\t" + ".cfi_offset rip, %c8\n\t" + ".cfi_escape " // DW_CFA_val_expression is not expressible otherwise + "0x16, " // DW_CFA_val_expression + "0x07, " // register num 7 - rsp + "0x04, " // block length + "0x77, " // DW_OP_breg7 - signed LEB128 offset from rsp +#define RSP_DELTA (RESERVED_C_STACK_BYTES + STG_RUN_STACK_FRAME_SIZE + 8) + "%c9" // signed LEB128 encoded delta - byte 1 +#if (RSP_DELTA >> 7) > 0 + ", %c10" // signed LEB128 encoded delta - byte 2 #endif + +#if (RSP_DELTA >> 14) > 0 + ", %c11" // signed LEB128 encoded delta - byte 3 +#endif + +#if (RSP_DELTA >> 21) > 0 + ", %c12" // signed LEB128 encoded delta - byte 4 +#endif + +#if (RSP_DELTA >> 28) > 0 +#error "RSP_DELTA too big" +#endif + "\n\t" + /* * Set BaseReg */ @@ -293,6 +457,17 @@ StgRunIsImplementedInAssembler(void) #else "movq %%rdi,%%rax\n\t" #endif + + STG_GLOBAL xstr(STG_RUN_JMP) "\n" +#if !defined(mingw32_HOST_OS) + STG_HIDDEN xstr(STG_RUN_JMP) "\n" +#endif +#if HAVE_SUBSECTIONS_VIA_SYMBOLS + // If we have deadstripping enabled and a label is detected as unused + // the code gets nop'd out. + ".no_dead_strip " xstr(STG_RUN_JMP) "\n" +#endif + xstr(STG_RUN_JMP) ":\n\t" "jmp *%%rax\n\t" ".globl " STG_RETURN "\n" @@ -311,16 +486,50 @@ StgRunIsImplementedInAssembler(void) "movq 32(%%rsp),%%r14\n\t" "movq 40(%%rsp),%%r15\n\t" #if defined(mingw32_HOST_OS) - "movq 48(%%rsp),%%rdi\n\t" - "movq 56(%%rsp),%%rsi\n\t" - "movq 64(%%rsp),%%xmm6\n\t" + "movq 48(%%rsp),%%rdi\n\t" + "movq 56(%%rsp),%%rsi\n\t" + "movq 64(%%rsp),%%xmm6\n\t" + "movq 72(%%rax),%%xmm7\n\t" + "movq 80(%%rax),%%xmm8\n\t" + "movq 88(%%rax),%%xmm9\n\t" + "movq 96(%%rax),%%xmm10\n\t" + "movq 104(%%rax),%%xmm11\n\t" + "movq 112(%%rax),%%xmm12\n\t" + "movq 120(%%rax),%%xmm13\n\t" + "movq 128(%%rax),%%xmm14\n\t" + "movq 136(%%rax),%%xmm15\n\t" #endif "addq %1, %%rsp\n\t" "retq" : : "i"(RESERVED_C_STACK_BYTES), - "i"(STG_RUN_STACK_FRAME_SIZE /* stack frame size */) + "i"(STG_RUN_STACK_FRAME_SIZE /* stack frame size */), + "i"(RESERVED_C_STACK_BYTES /* rbx relative to cfa (rsp) */), + "i"(RESERVED_C_STACK_BYTES + 8 /* rbp relative to cfa (rsp) */), + "i"(RESERVED_C_STACK_BYTES + 16 /* r12 relative to cfa (rsp) */), + "i"(RESERVED_C_STACK_BYTES + 24 /* r13 relative to cfa (rsp) */), + "i"(RESERVED_C_STACK_BYTES + 32 /* r14 relative to cfa (rsp) */), + "i"(RESERVED_C_STACK_BYTES + 40 /* r15 relative to cfa (rsp) */), + "i"(RESERVED_C_STACK_BYTES + STG_RUN_STACK_FRAME_SIZE + /* rip relative to cfa */), + "i"((RSP_DELTA & 127) | (128 * ((RSP_DELTA >> 7) > 0))) + /* signed LEB128-encoded delta from rsp - byte 1 */ +#if (RSP_DELTA >> 7) > 0 + , "i"(((RSP_DELTA >> 7) & 127) | (128 * ((RSP_DELTA >> 14) > 0))) + /* signed LEB128-encoded delta from rsp - byte 2 */ +#endif + +#if (RSP_DELTA >> 14) > 0 + , "i"(((RSP_DELTA >> 14) & 127) | (128 * ((RSP_DELTA >> 21) > 0))) + /* signed LEB128-encoded delta from rsp - byte 3 */ +#endif + +#if (RSP_DELTA >> 21) > 0 + , "i"(((RSP_DELTA >> 21) & 127) | (128 * ((RSP_DELTA >> 28) > 0))) + /* signed LEB128-encoded delta from rsp - byte 4 */ +#endif +#undef RSP_DELTA ); /* * See Note [Stack Alignment on X86] @@ -766,7 +975,7 @@ StgRun(StgFunPtr f, StgRegTable *basereg) { /* * Save callee-saves registers on behalf of the STG code. * Floating point registers only need the bottom 64 bits preserved. - * We need to use the the names x16, x17, x29 and x30 instead of ip0 + * We need to use the names x16, x17, x29 and x30 instead of ip0 * ip1, fp and lp because one of either clang or gcc doesn't understand * the later names. */ |