diff options
Diffstat (limited to 'compiler/ghci/ByteCodeFFI.lhs')
-rw-r--r-- | compiler/ghci/ByteCodeFFI.lhs | 832 |
1 files changed, 832 insertions, 0 deletions
diff --git a/compiler/ghci/ByteCodeFFI.lhs b/compiler/ghci/ByteCodeFFI.lhs new file mode 100644 index 0000000000..61e70d64e4 --- /dev/null +++ b/compiler/ghci/ByteCodeFFI.lhs @@ -0,0 +1,832 @@ +% +% (c) The University of Glasgow 2001 +% +\section[ByteCodeGen]{Generate machine-code sequences for foreign import} + +\begin{code} +module ByteCodeFFI ( mkMarshalCode, moan64 ) where + +#include "HsVersions.h" + +import Outputable +import SMRep ( CgRep(..), cgRepSizeW ) +import ForeignCall ( CCallConv(..) ) +import Panic + +-- DON'T remove apparently unused imports here .. +-- there is ifdeffery below +import Control.Exception ( throwDyn ) +import DATA_BITS ( Bits(..), shiftR, shiftL ) +import Foreign ( newArray ) +import Data.List ( mapAccumL ) + +import DATA_WORD ( Word8, Word32 ) +import Foreign ( Ptr ) +import System.IO.Unsafe ( unsafePerformIO ) +import IO ( hPutStrLn, stderr ) +import Debug.Trace ( trace ) +\end{code} + +%************************************************************************ +%* * +\subsection{The platform-dependent marshall-code-generator.} +%* * +%************************************************************************ + +\begin{code} + +moan64 :: String -> SDoc -> a +moan64 msg pp_rep + = unsafePerformIO ( + hPutStrLn stderr ( + "\nGHCi's bytecode generation machinery can't handle 64-bit\n" ++ + "code properly yet. You can work around this for the time being\n" ++ + "by compiling this module and all those it imports to object code,\n" ++ + "and re-starting your GHCi session. The panic below contains information,\n" ++ + "intended for the GHC implementors, about the exact place where GHC gave up.\n" + ) + ) + `seq` + pprPanic msg pp_rep + + +-- For sparc_TARGET_ARCH, i386_TARGET_ARCH, etc. +#include "nativeGen/NCG.h" + +{- +Make a piece of code which expects to see the Haskell stack +looking like this. It is given a pointer to the lowest word in +the stack -- presumably the tag of the placeholder. + + <arg_n> + ... + <arg_1> + Addr# address_of_C_fn + <placeholder-for-result#> (must be an unboxed type) + +We cope with both ccall and stdcall for the C fn. However, this code +itself expects only to be called using the ccall convention -- that is, +we don't clear our own (single) arg off the C stack. +-} +mkMarshalCode :: CCallConv + -> (Int, CgRep) -> Int -> [(Int, CgRep)] + -> IO (Ptr Word8) +mkMarshalCode cconv (r_offW, r_rep) addr_offW arg_offs_n_reps + = let bytes = mkMarshalCode_wrk cconv (r_offW, r_rep) + addr_offW arg_offs_n_reps + in trace (show bytes) $ Foreign.newArray bytes + + + + +mkMarshalCode_wrk :: CCallConv + -> (Int, CgRep) -> Int -> [(Int, CgRep)] + -> [Word8] + +mkMarshalCode_wrk cconv (r_offW, r_rep) addr_offW arg_offs_n_reps + +#if i386_TARGET_ARCH + + = let -- Don't change this without first consulting Intel Corp :-) + bytes_per_word = 4 + + offsets_to_pushW + = concat + [ -- reversed because x86 is little-endian + reverse [a_offW .. a_offW + cgRepSizeW a_rep - 1] + + -- reversed because args are pushed L -> R onto C stack + | (a_offW, a_rep) <- reverse arg_offs_n_reps + ] + + arguments_size = bytes_per_word * length offsets_to_pushW +#if darwin_TARGET_OS + -- Darwin: align stack frame size to a multiple of 16 bytes + stack_frame_size = (arguments_size + 15) .&. complement 15 + stack_frame_pad = stack_frame_size - arguments_size +#else + stack_frame_size = arguments_size +#endif + + -- some helpers to assemble x86 insns. + movl_offespmem_esi offB -- movl offB(%esp), %esi + = [0x8B, 0xB4, 0x24] ++ lit32 offB + movl_offesimem_ecx offB -- movl offB(%esi), %ecx + = [0x8B, 0x8E] ++ lit32 offB + save_regs -- pushl all intregs except %esp + = [0x50, 0x53, 0x51, 0x52, 0x56, 0x57, 0x55] + restore_regs -- popl ditto + = [0x5D, 0x5F, 0x5E, 0x5A, 0x59, 0x5B, 0x58] + pushl_ecx -- pushl %ecx + = [0x51] + call_star_ecx -- call * %ecx + = [0xFF, 0xD1] + add_lit_esp lit -- addl $lit, %esp + = [0x81, 0xC4] ++ lit32 lit + movl_eax_offesimem offB -- movl %eax, offB(%esi) + = [0x89, 0x86] ++ lit32 offB + movl_edx_offesimem offB -- movl %edx, offB(%esi) + = [0x89, 0x96] ++ lit32 offB + ret -- ret + = [0xC3] + fstpl_offesimem offB -- fstpl offB(%esi) + = [0xDD, 0x9E] ++ lit32 offB + fstps_offesimem offB -- fstps offB(%esi) + = [0xD9, 0x9E] ++ lit32 offB + {- + 2 0000 8BB42478 movl 0x12345678(%esp), %esi + 2 563412 + 3 0007 8B8E7856 movl 0x12345678(%esi), %ecx + 3 3412 + 4 + 5 000d 50535152 pushl %eax ; pushl %ebx ; pushl %ecx ; pushl %edx + 6 0011 565755 pushl %esi ; pushl %edi ; pushl %ebp + 7 + 8 0014 5D5F5E popl %ebp ; popl %edi ; popl %esi + 9 0017 5A595B58 popl %edx ; popl %ecx ; popl %ebx ; popl %eax + 10 + 11 001b 51 pushl %ecx + 12 001c FFD1 call * %ecx + 13 + 14 001e 81C47856 addl $0x12345678, %esp + 14 3412 + 15 0024 89867856 movl %eax, 0x12345678(%esi) + 15 3412 + 16 002a 89967856 movl %edx, 0x12345678(%esi) + 16 3412 + 17 + 18 0030 DD967856 fstl 0x12345678(%esi) + 18 3412 + 19 0036 DD9E7856 fstpl 0x12345678(%esi) + 19 3412 + 20 003c D9967856 fsts 0x12345678(%esi) + 20 3412 + 21 0042 D99E7856 fstps 0x12345678(%esi) + 18 + 19 0030 C3 ret + 20 + + -} + + in + --trace (show (map fst arg_offs_n_reps)) + ( + {- On entry, top of C stack 0(%esp) is the RA and 4(%esp) is + arg passed from the interpreter. + + Push all callee saved regs. Push all of them anyway ... + pushl %eax + pushl %ebx + pushl %ecx + pushl %edx + pushl %esi + pushl %edi + pushl %ebp + -} + save_regs + + {- Now 28+0(%esp) is RA and 28+4(%esp) is the arg (the H stack ptr). + We'll use %esi as a temporary to point at the H stack, and + %ecx as a temporary to copy via. + + movl 28+4(%esp), %esi + -} + ++ movl_offespmem_esi 32 + +#if darwin_TARGET_OS + {- On Darwin, add some padding so that the stack stays aligned. -} + ++ (if stack_frame_pad /= 0 + then add_lit_esp (-stack_frame_pad) + else []) +#endif + + {- For each arg in args_offs_n_reps, examine the associated + CgRep to determine how many words there are. This gives a + bunch of offsets on the H stack to copy to the C stack: + + movl off1(%esi), %ecx + pushl %ecx + -} + ++ concatMap (\offW -> movl_offesimem_ecx (bytes_per_word * offW) + ++ pushl_ecx) + offsets_to_pushW + + {- Get the addr to call into %ecx, bearing in mind that there's + an Addr# tag at the indicated location, and do the call: + + movl 4*(1 /*tag*/ +addr_offW)(%esi), %ecx + call * %ecx + -} + ++ movl_offesimem_ecx (bytes_per_word * addr_offW) + ++ call_star_ecx + + {- Nuke the args just pushed and re-establish %esi at the + H-stack ptr: + + addl $4*number_of_args_pushed, %esp (ccall only) + movl 28+4(%esp), %esi + -} + ++ (if cconv /= StdCallConv + then add_lit_esp stack_frame_size + else []) + ++ movl_offespmem_esi 32 + + {- Depending on what the return type is, get the result + from %eax or %edx:%eax or %st(0). + + movl %eax, 4(%esi) -- assuming tagged result + or + movl %edx, 4(%esi) + movl %eax, 8(%esi) + or + fstpl 4(%esi) + or + fstps 4(%esi) + -} + ++ let i32 = movl_eax_offesimem 0 + i64 = movl_eax_offesimem 0 ++ movl_edx_offesimem 4 + f32 = fstps_offesimem 0 + f64 = fstpl_offesimem 0 + in + case r_rep of + NonPtrArg -> i32 + DoubleArg -> f64 + FloatArg -> f32 + -- LongArg -> i64 + VoidArg -> [] + other -> moan64 "ByteCodeFFI.mkMarshalCode_wrk(x86)" + (ppr r_rep) + + {- Restore all the pushed regs and go home. + + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + pushl %eax + + ret + -} + ++ restore_regs + ++ ret + ) + +#elif x86_64_TARGET_ARCH + + = + -- the address of the H stack is in %rdi. We need to move it out, so + -- we can use %rdi as an arg reg for the following call: + pushq_rbp ++ + movq_rdi_rbp ++ + + -- ####### load / push the args + + let + (stack_args, fregs_unused, reg_loads) = + load_arg_regs arg_offs_n_reps int_loads float_loads [] + + tot_arg_size = bytes_per_word * length stack_args + + -- On entry to the called function, %rsp should be aligned + -- on a 16-byte boundary +8 (i.e. the first stack arg after + -- the return address is 16-byte aligned). In STG land + -- %rsp is kept 16-byte aligned (see StgCRun.c), so we just + -- need to make sure we push a multiple of 16-bytes of args, + -- plus the return address, to get the correct alignment. + (real_size, adjust_rsp) + | tot_arg_size `rem` 16 == 0 = (tot_arg_size, []) + | otherwise = (tot_arg_size + 8, subq_lit_rsp 8) + + (stack_pushes, stack_words) = + push_args stack_args [] 0 + + -- we need to know the number of SSE regs used in the call, see later + n_sse_regs_used = length float_loads - length fregs_unused + in + concat reg_loads + ++ adjust_rsp + ++ concat stack_pushes -- push in reverse order + + -- ####### make the call + + -- use %r10 to make the call, because we don't have to save it. + -- movq 8*addr_offW(%rbp), %r10 + ++ movq_rbpoff_r10 (bytes_per_word * addr_offW) + + -- The x86_64 ABI requires us to set %al to the number of SSE + -- registers that contain arguments, if the called routine + -- is a varargs function. We don't know whether it's a + -- varargs function or not, so we have to assume it is. + -- + -- It's not safe to omit this assignment, even if the number + -- of SSE regs in use is zero. If %al is larger than 8 + -- on entry to a varargs function, seg faults ensue. + ++ movq_lit_rax n_sse_regs_used + ++ call_star_r10 + + -- pop the args from the stack, only in ccall mode + -- (in stdcall the callee does it). + ++ (if cconv /= StdCallConv + then addq_lit_rsp real_size + else []) + + -- ####### place the result in the right place and return + + ++ assign_result + ++ popq_rbp + ++ ret + + where + bytes_per_word = 8 + + -- int arg regs: rdi,rsi,rdx,rcx,r8,r9 + -- flt arg regs: xmm0..xmm7 + int_loads = [ movq_rbpoff_rdi, movq_rbpoff_rsi, movq_rbpoff_rdx, + movq_rbpoff_rcx, movq_rbpoff_r8, movq_rbpoff_r9 ] + float_loads = [ (mov_f32_rbpoff_xmm n, mov_f64_rbpoff_xmm n) | n <- [0..7] ] + + load_arg_regs args [] [] code = (args, [], code) + load_arg_regs [] iregs fregs code = ([], fregs, code) + load_arg_regs ((off,rep):args) iregs fregs code + | FloatArg <- rep, ((mov_f32,_):frest) <- fregs = + load_arg_regs args iregs frest (mov_f32 (bytes_per_word * off) : code) + | DoubleArg <- rep, ((_,mov_f64):frest) <- fregs = + load_arg_regs args iregs frest (mov_f64 (bytes_per_word * off) : code) + | (mov_reg:irest) <- iregs = + load_arg_regs args irest fregs (mov_reg (bytes_per_word * off) : code) + | otherwise = + push_this_arg + where + push_this_arg = ((off,rep):args',fregs', code') + where (args',fregs',code') = load_arg_regs args iregs fregs code + + push_args [] code pushed_words = (code, pushed_words) + push_args ((off,rep):args) code pushed_words + | FloatArg <- rep = + push_args args (push_f32_rbpoff (bytes_per_word * off) : code) + (pushed_words+1) + | DoubleArg <- rep = + push_args args (push_f64_rbpoff (bytes_per_word * off) : code) + (pushed_words+1) + | otherwise = + push_args args (pushq_rbpoff (bytes_per_word * off) : code) + (pushed_words+1) + + + assign_result = + case r_rep of + DoubleArg -> f64 + FloatArg -> f32 + VoidArg -> [] + _other -> i64 + where + i64 = movq_rax_rbpoff 0 + f32 = mov_f32_xmm0_rbpoff 0 + f64 = mov_f64_xmm0_rbpoff 0 + +-- ######### x86_64 machine code: + +-- 0: 48 89 fd mov %rdi,%rbp +-- 3: 48 8b bd 78 56 34 12 mov 0x12345678(%rbp),%rdi +-- a: 48 8b b5 78 56 34 12 mov 0x12345678(%rbp),%rsi +-- 11: 48 8b 95 78 56 34 12 mov 0x12345678(%rbp),%rdx +-- 18: 48 8b 8d 78 56 34 12 mov 0x12345678(%rbp),%rcx +-- 1f: 4c 8b 85 78 56 34 12 mov 0x12345678(%rbp),%r8 +-- 26: 4c 8b 8d 78 56 34 12 mov 0x12345678(%rbp),%r9 +-- 2d: 4c 8b 95 78 56 34 12 mov 0x12345678(%rbp),%r10 +-- 34: 48 c7 c0 78 56 34 12 mov $0x12345678,%rax +-- 3b: 48 89 85 78 56 34 12 mov %rax,0x12345678(%rbp) +-- 42: f3 0f 10 85 78 56 34 12 movss 0x12345678(%rbp),%xmm0 +-- 4a: f2 0f 10 85 78 56 34 12 movsd 0x12345678(%rbp),%xmm0 +-- 52: f3 0f 11 85 78 56 34 12 movss %xmm0,0x12345678(%rbp) +-- 5a: f2 0f 11 85 78 56 34 12 movsd %xmm0,0x12345678(%rbp) +-- 62: ff b5 78 56 34 12 pushq 0x12345678(%rbp) +-- 68: f3 44 0f 11 04 24 movss %xmm8,(%rsp) +-- 6e: f2 44 0f 11 04 24 movsd %xmm8,(%rsp) +-- 74: 48 81 ec 78 56 34 12 sub $0x12345678,%rsp +-- 7b: 48 81 c4 78 56 34 12 add $0x12345678,%rsp +-- 82: 41 ff d2 callq *%r10 +-- 85: c3 retq + + movq_rdi_rbp = [0x48,0x89,0xfd] + movq_rbpoff_rdi off = [0x48, 0x8b, 0xbd] ++ lit32 off + movq_rbpoff_rsi off = [0x48, 0x8b, 0xb5] ++ lit32 off + movq_rbpoff_rdx off = [0x48, 0x8b, 0x95] ++ lit32 off + movq_rbpoff_rcx off = [0x48, 0x8b, 0x8d] ++ lit32 off + movq_rbpoff_r8 off = [0x4c, 0x8b, 0x85] ++ lit32 off + movq_rbpoff_r9 off = [0x4c, 0x8b, 0x8d] ++ lit32 off + movq_rbpoff_r10 off = [0x4c, 0x8b, 0x95] ++ lit32 off + movq_lit_rax lit = [0x48, 0xc7, 0xc0] ++ lit32 lit + movq_rax_rbpoff off = [0x48, 0x89, 0x85] ++ lit32 off + mov_f32_rbpoff_xmm n off = [0xf3, 0x0f, 0x10, 0x85 + n`shiftL`3] ++ lit32 off + mov_f64_rbpoff_xmm n off = [0xf2, 0x0f, 0x10, 0x85 + n`shiftL`3] ++ lit32 off + mov_f32_xmm0_rbpoff off = [0xf3, 0x0f, 0x11, 0x85] ++ lit32 off + mov_f64_xmm0_rbpoff off = [0xf2, 0x0f, 0x11, 0x85] ++ lit32 off + pushq_rbpoff off = [0xff, 0xb5] ++ lit32 off + push_f32_rbpoff off = + mov_f32_rbpoff_xmm 8 off ++ -- movss off(%rbp), %xmm8 + [0xf3, 0x44, 0x0f, 0x11, 0x04, 0x24] ++ -- movss %xmm8, (%rsp) + subq_lit_rsp 8 -- subq $8, %rsp + push_f64_rbpoff off = + mov_f64_rbpoff_xmm 8 off ++ -- movsd off(%rbp), %xmm8 + [0xf2, 0x44, 0x0f, 0x11, 0x04, 0x24] ++ -- movsd %xmm8, (%rsp) + subq_lit_rsp 8 -- subq $8, %rsp + subq_lit_rsp lit = [0x48, 0x81, 0xec] ++ lit32 lit + addq_lit_rsp lit = [0x48, 0x81, 0xc4] ++ lit32 lit + call_star_r10 = [0x41,0xff,0xd2] + ret = [0xc3] + pushq_rbp = [0x55] + popq_rbp = [0x5d] + +#elif sparc_TARGET_ARCH + + = let -- At least for sparc V8 + bytes_per_word = 4 + + -- speaks for itself + w32_to_w8s_bigEndian :: Word32 -> [Word8] + w32_to_w8s_bigEndian w + = [fromIntegral (0xFF .&. (w `shiftR` 24)), + fromIntegral (0xFF .&. (w `shiftR` 16)), + fromIntegral (0xFF .&. (w `shiftR` 8)), + fromIntegral (0xFF .&. w)] + + offsets_to_pushW + = concat + [ [a_offW .. a_offW + cgRepSizeW a_rep - 1] + + | (a_offW, a_rep) <- arg_offs_n_reps + ] + + total_argWs = length offsets_to_pushW + argWs_on_stack = if total_argWs > 6 then total_argWs - 6 + else 0 + + -- The stack pointer must be kept 8-byte aligned, which means + -- we need to calculate this quantity too + argWs_on_stack_ROUNDED_UP + | odd argWs_on_stack = 1 + argWs_on_stack + | otherwise = argWs_on_stack + + -- some helpers to assemble sparc insns. + -- REGS + iReg, oReg, gReg, fReg :: Int -> Word32 + iReg = fromIntegral . (+ 24) + oReg = fromIntegral . (+ 8) + gReg = fromIntegral . (+ 0) + fReg = fromIntegral + + sp = oReg 6 + i0 = iReg 0 + i7 = iReg 7 + o0 = oReg 0 + o1 = oReg 1 + o7 = oReg 7 + g0 = gReg 0 + g1 = gReg 1 + f0 = fReg 0 + f1 = fReg 1 + + -- INSN templates + insn_r_r_i :: Word32 -> Word32 -> Word32 -> Int -> Word32 + insn_r_r_i op3 rs1 rd imm13 + = (3 `shiftL` 30) + .|. (rs1 `shiftL` 25) + .|. (op3 `shiftL` 19) + .|. (rd `shiftL` 14) + .|. (1 `shiftL` 13) + .|. mkSimm13 imm13 + + insn_r_i_r :: Word32 -> Word32 -> Int -> Word32 -> Word32 + insn_r_i_r op3 rs1 imm13 rd + = (2 `shiftL` 30) + .|. (rd `shiftL` 25) + .|. (op3 `shiftL` 19) + .|. (rs1 `shiftL` 14) + .|. (1 `shiftL` 13) + .|. mkSimm13 imm13 + + mkSimm13 :: Int -> Word32 + mkSimm13 imm13 + = let imm13w = (fromIntegral imm13) :: Word32 + in imm13w .&. 0x1FFF + + -- REAL (non-synthetic) insns + -- or %rs1, %rs2, %rd + mkOR :: Word32 -> Word32 -> Word32 -> Word32 + mkOR rs1 rs2 rd + = (2 `shiftL` 30) + .|. (rd `shiftL` 25) + .|. (op3_OR `shiftL` 19) + .|. (rs1 `shiftL` 14) + .|. (0 `shiftL` 13) + .|. rs2 + where op3_OR = 2 :: Word32 + + -- ld(int) [%rs + imm13], %rd + mkLD rs1 imm13 rd = insn_r_r_i 0x00{-op3_LD-} rd rs1 imm13 + + -- st(int) %rs, [%rd + imm13] + mkST = insn_r_r_i 0x04 -- op3_ST + + -- st(float) %rs, [%rd + imm13] + mkSTF = insn_r_r_i 0x24 -- op3_STF + + -- jmpl %rs + imm13, %rd + mkJMPL = insn_r_i_r 0x38 -- op3_JMPL + + -- save %rs + imm13, %rd + mkSAVE = insn_r_i_r 0x3C -- op3_SAVE + + -- restore %rs + imm13, %rd + mkRESTORE = insn_r_i_r 0x3D -- op3_RESTORE + + -- SYNTHETIC insns + mkNOP = mkOR g0 g0 g0 + mkCALL reg = mkJMPL reg 0 o7 + mkRET = mkJMPL i7 8 g0 + mkRESTORE_TRIVIAL = mkRESTORE g0 0 g0 + in + --trace (show (map fst arg_offs_n_reps)) + concatMap w32_to_w8s_bigEndian ( + + {- On entry, %o0 is the arg passed from the interpreter. After + the initial save insn, it will be in %i0. Studying the sparc + docs one would have thought that the minimum frame size is 92 + bytes, but gcc always uses at least 112, and indeed there are + segfaults a-plenty with 92. So I use 112 here as well. I + don't understand why, tho. + -} + [mkSAVE sp (- ({-92-}112 + 4*argWs_on_stack_ROUNDED_UP)) sp] + + {- For each arg in args_offs_n_reps, examine the associated + CgRep to determine how many words there are. This gives a + bunch of offsets on the H stack. Move the first 6 words into + %o0 .. %o5 and the rest on the stack, starting at [%sp+92]. + Use %g1 as a temp. + -} + ++ let doArgW (offW, wordNo) + | wordNo < 6 + = [mkLD i0 (bytes_per_word * offW) (oReg wordNo)] + | otherwise + = [mkLD i0 (bytes_per_word * offW) g1, + mkST g1 sp (92 + bytes_per_word * (wordNo - 6))] + in + concatMap doArgW (zip offsets_to_pushW [0 ..]) + + {- Get the addr to call into %g1, bearing in mind that there's + an Addr# tag at the indicated location, and do the call: + + ld [4*(1 /*tag*/ +addr_offW) + %i0], %g1 + call %g1 + -} + ++ [mkLD i0 (bytes_per_word * addr_offW) g1, + mkCALL g1, + mkNOP] + + {- Depending on what the return type is, get the result + from %o0 or %o1:%o0 or %f0 or %f1:%f0. + + st %o0, [%i0 + 4] -- 32 bit int + or + st %o0, [%i0 + 4] -- 64 bit int + st %o1, [%i0 + 8] -- or the other way round? + or + st %f0, [%i0 + 4] -- 32 bit float + or + st %f0, [%i0 + 4] -- 64 bit float + st %f1, [%i0 + 8] -- or the other way round? + + -} + ++ let i32 = [mkST o0 i0 0] + i64 = [mkST o0 i0 0, mkST o1 i0 4] + f32 = [mkSTF f0 i0 0] + f64 = [mkSTF f0 i0 0, mkSTF f1 i0 4] + in + case r_rep of + NonPtrArg -> i32 + DoubleArg -> f64 + FloatArg -> f32 + VoidArg -> [] + other -> moan64 "ByteCodeFFI.mkMarshalCode_wrk(sparc)" + (ppr r_rep) + + ++ [mkRET, + mkRESTORE_TRIVIAL] -- this is in the delay slot of the RET + ) +#elif powerpc_TARGET_ARCH && darwin_TARGET_OS + + = let + bytes_per_word = 4 + + -- speaks for itself + w32_to_w8s_bigEndian :: Word32 -> [Word8] + w32_to_w8s_bigEndian w + = [fromIntegral (0xFF .&. (w `shiftR` 24)), + fromIntegral (0xFF .&. (w `shiftR` 16)), + fromIntegral (0xFF .&. (w `shiftR` 8)), + fromIntegral (0xFF .&. w)] + + -- addr and result bits offsetsW + a_off = addr_offW * bytes_per_word + result_off = r_offW * bytes_per_word + + linkageArea = 24 + parameterArea = sum [ cgRepSizeW a_rep * bytes_per_word + | (_, a_rep) <- arg_offs_n_reps ] + savedRegisterArea = 4 + frameSize = padTo16 (linkageArea + max parameterArea 32 + savedRegisterArea) + padTo16 x = case x `mod` 16 of + 0 -> x + y -> x - y + 16 + + pass_parameters [] _ _ = [] + pass_parameters ((a_offW, a_rep):args) nextFPR offsetW = + let + haskellArgOffset = a_offW * bytes_per_word + offsetW' = offsetW + cgRepSizeW a_rep + + pass_word w + | offsetW + w < 8 = + [0x801f0000 -- lwz rX, src(r31) + .|. (fromIntegral src .&. 0xFFFF) + .|. (fromIntegral (offsetW+w+3) `shiftL` 21)] + | otherwise = + [0x801f0000 -- lwz r0, src(r31) + .|. (fromIntegral src .&. 0xFFFF), + 0x90010000 -- stw r0, dst(r1) + .|. (fromIntegral dst .&. 0xFFFF)] + where + src = haskellArgOffset + w*bytes_per_word + dst = linkageArea + (offsetW+w) * bytes_per_word + in + case a_rep of + FloatArg | nextFPR < 14 -> + (0xc01f0000 -- lfs fX, haskellArgOffset(r31) + .|. (fromIntegral haskellArgOffset .&. 0xFFFF) + .|. (fromIntegral nextFPR `shiftL` 21)) + : pass_parameters args (nextFPR+1) offsetW' + DoubleArg | nextFPR < 14 -> + (0xc81f0000 -- lfd fX, haskellArgOffset(r31) + .|. (fromIntegral haskellArgOffset .&. 0xFFFF) + .|. (fromIntegral nextFPR `shiftL` 21)) + : pass_parameters args (nextFPR+1) offsetW' + _ -> + concatMap pass_word [0 .. cgRepSizeW a_rep - 1] + ++ pass_parameters args nextFPR offsetW' + + gather_result = case r_rep of + VoidArg -> [] + FloatArg -> + [0xd03f0000 .|. (fromIntegral result_off .&. 0xFFFF)] + -- stfs f1, result_off(r31) + DoubleArg -> + [0xd83f0000 .|. (fromIntegral result_off .&. 0xFFFF)] + -- stfd f1, result_off(r31) + _ | cgRepSizeW r_rep == 2 -> + [0x907f0000 .|. (fromIntegral result_off .&. 0xFFFF), + 0x909f0000 .|. (fromIntegral (result_off+4) .&. 0xFFFF)] + -- stw r3, result_off(r31) + -- stw r4, result_off+4(r31) + _ | cgRepSizeW r_rep == 1 -> + [0x907f0000 .|. (fromIntegral result_off .&. 0xFFFF)] + -- stw r3, result_off(r31) + in + concatMap w32_to_w8s_bigEndian $ [ + 0x7c0802a6, -- mflr r0 + 0x93e1fffc, -- stw r31,-4(r1) + 0x90010008, -- stw r0,8(r1) + 0x94210000 .|. (fromIntegral (-frameSize) .&. 0xFFFF), + -- stwu r1, -frameSize(r1) + 0x7c7f1b78 -- mr r31, r3 + ] ++ pass_parameters arg_offs_n_reps 1 0 ++ [ + 0x819f0000 .|. (fromIntegral a_off .&. 0xFFFF), + -- lwz r12, a_off(r31) + 0x7d8903a6, -- mtctr r12 + 0x4e800421 -- bctrl + ] ++ gather_result ++ [ + 0x80210000, -- lwz r1, 0(r1) + 0x83e1fffc, -- lwz r31, -4(r1) + 0x80010008, -- lwz r0, 8(r1) + 0x7c0803a6, -- mtlr r0 + 0x4e800020 -- blr + ] + +#elif powerpc_TARGET_ARCH && linux_TARGET_OS + + -- All offsets here are measured in Words (not bytes). This includes + -- arguments to the load/store machine code generators, alignment numbers + -- and the final 'framesize' among others. + + = concatMap w32_to_w8s_bigEndian $ [ + 0x7c0802a6, -- mflr r0 + 0x93e1fffc, -- stw r31,-4(r1) + 0x90010008, -- stw r0,8(r1) + 0x94210000 .|. offset (-framesize), -- stwu r1, -frameSize(r1) + 0x7c7f1b78 -- mr r31, r3 + ] ++ pass_parameters ++ -- pass the parameters + loadWord 12 addr_offW ++ [ -- lwz r12, a_off(r31) + 0x7d8903a6, -- mtctr r12 + 0x4e800421 -- bctrl + ] ++ gather_result ++ [ -- save the return value + 0x80210000, -- lwz r1, 0(r1) + 0x83e1fffc, -- lwz r31, -4(r1) + 0x80010008, -- lwz r0, 8(r1) + 0x7c0803a6, -- mtlr r0 + 0x4e800020 -- blr + ] + + where + gather_result :: [Word32] + gather_result = case r_rep of + VoidArg -> [] + FloatArg -> storeFloat 1 r_offW + DoubleArg -> storeDouble 1 r_offW + LongArg -> storeLong 3 r_offW + _ -> storeWord 3 r_offW + + pass_parameters :: [Word32] + pass_parameters = concat params + + -- vector aligned (4 word = 16 bytes) with 8 extra words of buffer space + framesize = alignedTo 4 (argsize + 8) + + ((_,_,argsize), params) = mapAccumL loadparam (3,1,2) arg_offs_n_reps + + -- handle one argument, returning machine code and the updated state + loadparam :: (Int, Int, Int) -> (Int, CgRep) -> + ((Int, Int, Int), [Word32]) + + loadparam (gpr, fpr, stack) (ofs, rep) = case rep of + FloatArg | fpr <= 8 -> ( (gpr, fpr + 1, stack), loadFloat fpr ofs ) + FloatArg -> ( (gpr, fpr, stack + 1), stackWord stack ofs ) + + DoubleArg | fpr <= 8 -> ( (gpr, fpr + 1, stack), loadDouble fpr ofs ) + DoubleArg -> ( (gpr, fpr, astack + 2), stackLong astack ofs ) + + LongArg | even gpr -> loadparam (gpr + 1, fpr, stack) (ofs, rep) + LongArg | gpr <= 9 -> ( (gpr + 2, fpr, stack), loadLong gpr ofs ) + LongArg -> ( (gpr, fpr, astack + 2), stackLong astack ofs ) + + _ | gpr <= 10 -> ( (gpr + 1, fpr, stack), loadWord gpr ofs ) + _ -> ( (gpr, fpr, stack + 1), stackWord stack ofs ) + where astack = alignedTo 2 stack + + alignedTo :: Int -> Int -> Int + alignedTo alignment x = case x `mod` alignment of + 0 -> x + y -> x - y + alignment + + -- convenience macros to do multiple-instruction data moves + stackWord dst src = loadWord 0 src ++ storeWordC 0 dst + stackLong dst src = stackWord dst src ++ stackWord (dst + 1) (src + 1) + loadLong dst src = loadWord dst src ++ loadWord (dst + 1) (src + 1) + storeLong dst src = storeWord dst src ++ storeWord (dst + 1) (src + 1) + + -- load data from the Haskell stack (relative to r31) + loadFloat = loadstoreInstr 0xc01f0000 -- lfs fpr, ofs(r31) + loadDouble = loadstoreInstr 0xc81f0000 -- lfd fpr, ofs(r31) + loadWord = loadstoreInstr 0x801f0000 -- lwz gpr, ofs(r31) + + -- store data to the Haskell stack (relative to r31) + storeFloat = loadstoreInstr 0xd01f0000 -- stfs fpr, ofs(r31) + storeDouble = loadstoreInstr 0xd81f0000 -- stfd fpr, ofs(r31) + storeWord = loadstoreInstr 0x901f0000 -- stw gpr, ofs(r31) + + -- store data to the C stack (relative to r1) + storeWordC = loadstoreInstr 0x90010000 -- stw gpr, ofs(r1) + + -- machine code building blocks + loadstoreInstr :: Word32 -> Int -> Int -> [Word32] + loadstoreInstr code reg ofs = [ code .|. register reg .|. offset ofs ] + + register :: Int -> Word32 + register reg = fromIntegral reg `shiftL` 21 + + offset :: Int -> Word32 + offset ofs = fromIntegral (ofs * 4) .&. 0xFFFF + + -- speaks for itself + w32_to_w8s_bigEndian :: Word32 -> [Word8] + w32_to_w8s_bigEndian w = [fromIntegral (0xFF .&. (w `shiftR` 24)), + fromIntegral (0xFF .&. (w `shiftR` 16)), + fromIntegral (0xFF .&. (w `shiftR` 8)), + fromIntegral (0xFF .&. w)] + +#else + + = throwDyn (InstallationError "foreign import is not implemented for GHCi on this platform.") + +#endif + +#if i386_TARGET_ARCH || x86_64_TARGET_ARCH +lit32 :: Int -> [Word8] +lit32 i = let w32 = (fromIntegral i) :: Word32 + in map (fromIntegral . ( .&. 0xFF)) + [w32, w32 `shiftR` 8, + w32 `shiftR` 16, w32 `shiftR` 24] +#endif +\end{code} + |