author | Sylvain Henry <sylvain@haskus.fr> | 2022-02-11 14:10:17 +0100
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2022-02-23 13:59:23 -0500
commit | f07b13e38a24d73db152f465922d0fcf903e0470 (patch)
tree | e71238b4ee470e6d2c6719b9481ef01690f27739
parent | e60d8df813185cfe3fecdf66d6438611cf2ee4eb (diff)
download | haskell-f07b13e38a24d73db152f465922d0fcf903e0470.tar.gz
NCG: refactor X86 codegen
Preliminary work to make working on #5444 easier.
Mostly to make the control flow easier to follow (see the sketch after this list):
* renamed genCCall into genForeignCall
* split genForeignCall into the part dispatching on PrimTarget (genPrim) and
  the part actually generating code for a C call (cf ForeignTarget and genCCall)
* made genPrim/genSimplePrim only dispatch on MachOp: each MachOp now
has its own code generation function.
* out-of-line primops are not handled in a partial `outOfLineCmmOp`
anymore but in the code generation functions directly. Helper
functions have been introduced (e.g. genLibCCall) for code sharing.
* the previous two points make the code generated for primops that are only
  sometimes out-of-line (e.g. Pdep or Memcpy), and the logic selecting between
  the inline and out-of-line versions, much more localized
* avoided passing is32bit as an argument as we can easily get it from NatM
state when we really need it
* changed genCCall's type so that it is no longer partial (it cannot be
  passed a PrimTarget at all)
* globally removed 12 calls to `panic` thanks to better control flow and
types ("parse, don't validate" ftw!).
-rw-r--r-- | compiler/GHC/CmmToAsm/X86/CodeGen.hs | 1986
1 file changed, 1054 insertions, 932 deletions
diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs index 12fbe181db..2025aa58d7 100644 --- a/compiler/GHC/CmmToAsm/X86/CodeGen.hs +++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs @@ -1,4 +1,5 @@ {-# LANGUAGE BangPatterns #-} +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE NondecreasingIndentation #-} {-# LANGUAGE TupleSections #-} @@ -279,7 +280,7 @@ we construct as a separate data type and the actual control flow graph in the co Instead we now return the new basic block if a statement causes a change in the current block and use the block for all following statements. -For this reason genCCall is also split into two parts. One for calls which +For this reason genForeignCall is also split into two parts. One for calls which *won't* change the basic blocks in which successive instructions will be placed (since they only evaluate CmmExpr, which can only contain MachOps, which cannot introduce basic blocks in their lowerings). A different one for calls @@ -315,7 +316,7 @@ stmtToInstrs bid stmt = do platform <- getPlatform case stmt of CmmUnsafeForeignCall target result_regs args - -> genCCall is32Bit target result_regs args bid + -> genForeignCall target result_regs args bid _ -> (,Nothing) <$> case stmt of CmmComment s -> return (unitOL (COMMENT $ ftext s)) @@ -1339,15 +1340,15 @@ getAmode e = do -- (i.e. no index register). This stops us from running out of -- registers on x86 when using instructions such as cmpxchg, which can -- use up to three virtual registers and one fixed register. -getSimpleAmode :: Bool -> CmmExpr -> NatM Amode -getSimpleAmode is32Bit addr - | is32Bit = do - addr_code <- getAnyReg addr - config <- getConfig - addr_r <- getNewRegNat (intFormat (ncgWordWidth config)) - let amode = AddrBaseIndex (EABaseReg addr_r) EAIndexNone (ImmInt 0) - return $! Amode amode (addr_code addr_r) - | otherwise = getAmode addr +getSimpleAmode :: CmmExpr -> NatM Amode +getSimpleAmode addr = is32BitPlatform >>= \case + False -> getAmode addr + True -> do + addr_code <- getAnyReg addr + config <- getConfig + addr_r <- getNewRegNat (intFormat (ncgWordWidth config)) + let amode = AddrBaseIndex (EABaseReg addr_r) EAIndexNone (ImmInt 0) + return $! Amode amode (addr_code addr_r) x86_complex_amode :: CmmExpr -> CmmExpr -> Integer -> Integer -> NatM Amode x86_complex_amode base index shift offset @@ -2091,739 +2092,142 @@ genCondBranch' _ bid id false bool = do -- See Note [Keeping track of the current block] for information why we need -- to take/return a block id. -genCCall - :: Bool -- 32 bit platform? - -> ForeignTarget -- function to call - -> [CmmFormal] -- where to put the result - -> [CmmActual] -- arguments (of mixed type) - -> BlockId -- The block we are in +genForeignCall + :: ForeignTarget -- ^ function to call + -> [CmmFormal] -- ^ where to put the result + -> [CmmActual] -- ^ arguments (of mixed type) + -> BlockId -- ^ The block we are in -> NatM (InstrBlock, Maybe BlockId) --- First we deal with cases which might introduce new blocks in the stream. 
- -genCCall is32Bit (PrimTarget (MO_AtomicRMW width amop)) - [dst] [addr, n] bid = do - Amode amode addr_code <- - if amop `elem` [AMO_Add, AMO_Sub] - then getAmode addr - else getSimpleAmode is32Bit addr -- See genCCall for MO_Cmpxchg - arg <- getNewRegNat format - arg_code <- getAnyReg n - platform <- ncgPlatform <$> getConfig - - let dst_r = getRegisterReg platform (CmmLocal dst) - (code, lbl) <- op_code dst_r arg amode - return (addr_code `appOL` arg_code arg `appOL` code, Just lbl) - where - -- Code for the operation - op_code :: Reg -- Destination reg - -> Reg -- Register containing argument - -> AddrMode -- Address of location to mutate - -> NatM (OrdList Instr,BlockId) -- TODO: Return Maybe BlockId - op_code dst_r arg amode = case amop of - -- In the common case where dst_r is a virtual register the - -- final move should go away, because it's the last use of arg - -- and the first use of dst_r. - AMO_Add -> return $ (toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode)) - , MOV format (OpReg arg) (OpReg dst_r) - ], bid) - AMO_Sub -> return $ (toOL [ NEGI format (OpReg arg) - , LOCK (XADD format (OpReg arg) (OpAddr amode)) - , MOV format (OpReg arg) (OpReg dst_r) - ], bid) - -- In these cases we need a new block id, and have to return it so - -- that later instruction selection can reference it. - AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst) - AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst - , NOT format dst - ]) - AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst) - AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst) - where - -- Simulate operation that lacks a dedicated instruction using - -- cmpxchg. - cmpxchg_code :: (Operand -> Operand -> OrdList Instr) - -> NatM (OrdList Instr, BlockId) - cmpxchg_code instrs = do - lbl1 <- getBlockIdNat - lbl2 <- getBlockIdNat - tmp <- getNewRegNat format - - --Record inserted blocks - -- We turn A -> B into A -> A' -> A'' -> B - -- with a self loop on A'. - addImmediateSuccessorNat bid lbl1 - addImmediateSuccessorNat lbl1 lbl2 - updateCfgNat (addWeightEdge lbl1 lbl1 0) - - return $ (toOL - [ MOV format (OpAddr amode) (OpReg eax) - , JXX ALWAYS lbl1 - , NEWBLOCK lbl1 - -- Keep old value so we can return it: - , MOV format (OpReg eax) (OpReg dst_r) - , MOV format (OpReg eax) (OpReg tmp) - ] - `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL - [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode)) - , JXX NE lbl1 - -- See Note [Introducing cfg edges inside basic blocks] - -- why this basic block is required. - , JXX ALWAYS lbl2 - , NEWBLOCK lbl2 - ], - lbl2) - format = intFormat width - -genCCall is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid - | is32Bit, width == W64 = do - ChildCode64 vcode rlo <- iselExpr64 src - platform <- ncgPlatform <$> getConfig - let rhi = getHiVRegFromLo rlo - dst_r = getRegisterReg platform (CmmLocal dst) - lbl1 <- getBlockIdNat - lbl2 <- getBlockIdNat - let format = if width == W8 then II16 else intFormat width - tmp_r <- getNewRegNat format - - -- New CFG Edges: - -- bid -> lbl2 - -- bid -> lbl1 -> lbl2 - -- We also changes edges originating at bid to start at lbl2 instead. - weights <- getCfgWeights - updateCfgNat (addWeightEdge bid lbl1 110 . - addWeightEdge lbl1 lbl2 110 . 
- addImmediateSuccessor weights bid lbl2) +genForeignCall target dst args bid = do + case target of + PrimTarget prim -> genPrim bid prim dst args + ForeignTarget addr conv -> (,Nothing) <$> genCCall bid addr conv dst args - -- The following instruction sequence corresponds to the pseudo-code - -- - -- if (src) { - -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32); - -- } else { - -- dst = 64; - -- } - let !instrs = vcode `appOL` toOL - ([ MOV II32 (OpReg rhi) (OpReg tmp_r) - , OR II32 (OpReg rlo) (OpReg tmp_r) - , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r) - , JXX EQQ lbl2 - , JXX ALWAYS lbl1 - - , NEWBLOCK lbl1 - , BSF II32 (OpReg rhi) dst_r - , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r) - , BSF II32 (OpReg rlo) tmp_r - , CMOV NE II32 (OpReg tmp_r) dst_r - , JXX ALWAYS lbl2 - - , NEWBLOCK lbl2 - ]) - return (instrs, Just lbl2) - - | otherwise = do - code_src <- getAnyReg src - config <- getConfig - let platform = ncgPlatform config - let dst_r = getRegisterReg platform (CmmLocal dst) - if ncgBmiVersion config >= Just BMI2 - then do - src_r <- getNewRegNat (intFormat width) - let instrs = appOL (code_src src_r) $ case width of - W8 -> toOL - [ OR II32 (OpImm (ImmInteger 0xFFFFFF00)) (OpReg src_r) - , TZCNT II32 (OpReg src_r) dst_r - ] - W16 -> toOL - [ TZCNT II16 (OpReg src_r) dst_r - , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) - ] - _ -> unitOL $ TZCNT (intFormat width) (OpReg src_r) dst_r - return (instrs, Nothing) - else do - -- The following insn sequence makes sure 'ctz 0' has a defined value. - -- starting with Haswell, one could use the TZCNT insn instead. - let format = if width == W8 then II16 else intFormat width - src_r <- getNewRegNat format - tmp_r <- getNewRegNat format - let !instrs = code_src src_r `appOL` toOL - ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ - [ BSF format (OpReg src_r) tmp_r - , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r) - , CMOV NE format (OpReg tmp_r) dst_r - ]) -- NB: We don't need to zero-extend the result for the - -- W8/W16 cases because the 'MOV' insn already - -- took care of implicitly clearing the upper bits - return (instrs, Nothing) - where - bw = widthInBits width +genPrim + :: BlockId -- ^ The block we are in + -> CallishMachOp -- ^ MachOp + -> [CmmFormal] -- ^ where to put the result + -> [CmmActual] -- ^ arguments (of mixed type) + -> NatM (InstrBlock, Maybe BlockId) -genCCall bits mop dst args bid = do - config <- getConfig - instr <- genCCall' config bits mop dst args bid - return (instr, Nothing) - --- genCCall' handles cases not introducing new code blocks. -genCCall' - :: NCGConfig - -> Bool -- 32 bit platform? - -> ForeignTarget -- function to call - -> [CmmFormal] -- where to put the result - -> [CmmActual] -- arguments (of mixed type) - -> BlockId -- The block we are in +-- First we deal with cases which might introduce new blocks in the stream. +genPrim bid (MO_AtomicRMW width amop) [dst] [addr, n] + = genAtomicRMW bid width amop dst addr n +genPrim bid (MO_Ctz width) [dst] [src] + = genCtz bid width dst src + +-- Then we deal with cases which not introducing new blocks in the stream. +genPrim bid prim dst args + = (,Nothing) <$> genSimplePrim bid prim dst args + +genSimplePrim + :: BlockId -- ^ the block we are in + -> CallishMachOp -- ^ MachOp + -> [CmmFormal] -- ^ where to put the result + -> [CmmActual] -- ^ arguments (of mixed type) -> NatM InstrBlock - --- Unroll memcpy calls if the number of bytes to copy isn't too --- large. Otherwise, call C's memcpy. 
-genCCall' config _ (PrimTarget (MO_Memcpy align)) _ - [dst, src, CmmLit (CmmInt n _)] _ - | fromInteger insns <= ncgInlineThresholdMemcpy config = do - code_dst <- getAnyReg dst - dst_r <- getNewRegNat format - code_src <- getAnyReg src - src_r <- getNewRegNat format - tmp_r <- getNewRegNat format - return $ code_dst dst_r `appOL` code_src src_r `appOL` - go dst_r src_r tmp_r (fromInteger n) - where - platform = ncgPlatform config - -- The number of instructions we will generate (approx). We need 2 - -- instructions per move. - insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes) - - maxAlignment = wordAlignment platform -- only machine word wide MOVs are supported - effectiveAlignment = min (alignmentOf align) maxAlignment - format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment - - -- The size of each move, in bytes. - sizeBytes :: Integer - sizeBytes = fromIntegral (formatInBytes format) - - go :: Reg -> Reg -> Reg -> Integer -> OrdList Instr - go dst src tmp i - | i >= sizeBytes = - unitOL (MOV format (OpAddr src_addr) (OpReg tmp)) `appOL` - unitOL (MOV format (OpReg tmp) (OpAddr dst_addr)) `appOL` - go dst src tmp (i - sizeBytes) - -- Deal with remaining bytes. - | i >= 4 = -- Will never happen on 32-bit - unitOL (MOV II32 (OpAddr src_addr) (OpReg tmp)) `appOL` - unitOL (MOV II32 (OpReg tmp) (OpAddr dst_addr)) `appOL` - go dst src tmp (i - 4) - | i >= 2 = - unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL` - unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL` - go dst src tmp (i - 2) - | i >= 1 = - unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL` - unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL` - go dst src tmp (i - 1) - | otherwise = nilOL - where - src_addr = AddrBaseIndex (EABaseReg src) EAIndexNone - (ImmInteger (n - i)) - dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone - (ImmInteger (n - i)) - -genCCall' config _ (PrimTarget (MO_Memset align)) _ - [dst, - CmmLit (CmmInt c _), - CmmLit (CmmInt n _)] - _ - | fromInteger insns <= ncgInlineThresholdMemset config = do - code_dst <- getAnyReg dst - dst_r <- getNewRegNat format - if format == II64 && n >= 8 then do - code_imm8byte <- getAnyReg (CmmLit (CmmInt c8 W64)) - imm8byte_r <- getNewRegNat II64 - return $ code_dst dst_r `appOL` - code_imm8byte imm8byte_r `appOL` - go8 dst_r imm8byte_r (fromInteger n) - else - return $ code_dst dst_r `appOL` - go4 dst_r (fromInteger n) - where - platform = ncgPlatform config - maxAlignment = wordAlignment platform -- only machine word wide MOVs are supported - effectiveAlignment = min (alignmentOf align) maxAlignment - format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment - c2 = c `shiftL` 8 .|. c - c4 = c2 `shiftL` 16 .|. c2 - c8 = c4 `shiftL` 32 .|. c4 - - -- The number of instructions we will generate (approx). We need 1 - -- instructions per move. - insns = (n + sizeBytes - 1) `div` sizeBytes - - -- The size of each move, in bytes. - sizeBytes :: Integer - sizeBytes = fromIntegral (formatInBytes format) - - -- Depending on size returns the widest MOV instruction and its - -- width. - gen4 :: AddrMode -> Integer -> (InstrBlock, Integer) - gen4 addr size - | size >= 4 = - (unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr addr)), 4) - | size >= 2 = - (unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr addr)), 2) - | size >= 1 = - (unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr addr)), 1) - | otherwise = (nilOL, 0) - - -- Generates a 64-bit wide MOV instruction from REG to MEM. 
- gen8 :: AddrMode -> Reg -> InstrBlock - gen8 addr reg8byte = - unitOL (MOV format (OpReg reg8byte) (OpAddr addr)) - - -- Unrolls memset when the widest MOV is <= 4 bytes. - go4 :: Reg -> Integer -> InstrBlock - go4 dst left = - if left <= 0 then nilOL - else curMov `appOL` go4 dst (left - curWidth) - where - possibleWidth = minimum [left, sizeBytes] - dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left)) - (curMov, curWidth) = gen4 dst_addr possibleWidth - - -- Unrolls memset when the widest MOV is 8 bytes (thus another Reg - -- argument). Falls back to go4 when all 8 byte moves are - -- exhausted. - go8 :: Reg -> Reg -> Integer -> InstrBlock - go8 dst reg8byte left = - if possibleWidth >= 8 then - let curMov = gen8 dst_addr reg8byte - in curMov `appOL` go8 dst reg8byte (left - 8) - else go4 dst left - where - possibleWidth = minimum [left, sizeBytes] - dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left)) - -genCCall' _ _ (PrimTarget MO_ReadBarrier) _ _ _ = return nilOL -genCCall' _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL - -- barriers compile to no code on x86/x86-64; - -- we keep it this long in order to prevent earlier optimisations. - -genCCall' _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL - -genCCall' _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ = - case n of - 0 -> genPrefetch src $ PREFETCH NTA format - 1 -> genPrefetch src $ PREFETCH Lvl2 format - 2 -> genPrefetch src $ PREFETCH Lvl1 format - 3 -> genPrefetch src $ PREFETCH Lvl0 format - l -> panic $ "unexpected prefetch level in genCCall MO_Prefetch_Data: " ++ (show l) - -- the c / llvm prefetch convention is 0, 1, 2, and 3 - -- the x86 corresponding names are : NTA, 2 , 1, and 0 - where - format = archWordFormat is32bit - -- need to know what register width for pointers! 
- genPrefetch inRegSrc prefetchCTor = - do - code_src <- getAnyReg inRegSrc - src_r <- getNewRegNat format - return $ code_src src_r `appOL` - (unitOL (prefetchCTor (OpAddr - ((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) )) - -- prefetch always takes an address - -genCCall' _ is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do - platform <- ncgPlatform <$> getConfig - let dst_r = getRegisterReg platform (CmmLocal dst) - case width of - W64 | is32Bit -> do - ChildCode64 vcode rlo <- iselExpr64 src - let dst_rhi = getHiVRegFromLo dst_r - rhi = getHiVRegFromLo rlo - return $ vcode `appOL` - toOL [ MOV II32 (OpReg rlo) (OpReg dst_rhi), - MOV II32 (OpReg rhi) (OpReg dst_r), - BSWAP II32 dst_rhi, - BSWAP II32 dst_r ] - W16 -> do code_src <- getAnyReg src - return $ code_src dst_r `appOL` - unitOL (BSWAP II32 dst_r) `appOL` - unitOL (SHR II32 (OpImm $ ImmInt 16) (OpReg dst_r)) - _ -> do code_src <- getAnyReg src - return $ code_src dst_r `appOL` unitOL (BSWAP format dst_r) - where - format = intFormat width - -genCCall' config is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst] - args@[src] bid = do - sse4_2 <- sse4_2Enabled - let platform = ncgPlatform config - if sse4_2 - then do code_src <- getAnyReg src - src_r <- getNewRegNat format - let dst_r = getRegisterReg platform (CmmLocal dst) - return $ code_src src_r `appOL` - (if width == W8 then - -- The POPCNT instruction doesn't take a r/m8 - unitOL (MOVZxL II8 (OpReg src_r) (OpReg src_r)) `appOL` - unitOL (POPCNT II16 (OpReg src_r) dst_r) - else - unitOL (POPCNT format (OpReg src_r) dst_r)) `appOL` - (if width == W8 || width == W16 then - -- We used a 16-bit destination register above, - -- so zero-extend - unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r)) - else nilOL) - else do - targetExpr <- cmmMakeDynamicReference config - CallReference lbl - let target = ForeignTarget targetExpr (ForeignConvention CCallConv - [NoHint] [NoHint] - CmmMayReturn) - genCCall' config is32Bit target dest_regs args bid - where - format = intFormat width - lbl = mkCmmCodeLabel primUnitId (popCntLabel width) - -genCCall' config is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst] - args@[src, mask] bid = do - let platform = ncgPlatform config - if ncgBmiVersion config >= Just BMI2 - then do code_src <- getAnyReg src - code_mask <- getAnyReg mask - src_r <- getNewRegNat format - mask_r <- getNewRegNat format - let dst_r = getRegisterReg platform (CmmLocal dst) - return $ code_src src_r `appOL` code_mask mask_r `appOL` - -- PDEP only supports > 32 bit args - ( if width == W8 || width == W16 then - toOL - [ MOVZxL format (OpReg src_r ) (OpReg src_r ) - , MOVZxL format (OpReg mask_r) (OpReg mask_r) - , PDEP II32 (OpReg mask_r) (OpReg src_r ) dst_r - , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width - ] - else - unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r) - ) - else do - targetExpr <- cmmMakeDynamicReference config - CallReference lbl - let target = ForeignTarget targetExpr (ForeignConvention CCallConv - [NoHint] [NoHint] - CmmMayReturn) - genCCall' config is32Bit target dest_regs args bid - where - format = intFormat width - lbl = mkCmmCodeLabel primUnitId (pdepLabel width) - -genCCall' config is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst] - args@[src, mask] bid = do - let platform = ncgPlatform config - if ncgBmiVersion config >= Just BMI2 - then do code_src <- getAnyReg src - code_mask <- getAnyReg mask - src_r <- getNewRegNat format - mask_r <- getNewRegNat format - let dst_r = getRegisterReg platform 
(CmmLocal dst) - return $ code_src src_r `appOL` code_mask mask_r `appOL` - (if width == W8 || width == W16 then - -- The PEXT instruction doesn't take a r/m8 or 16 - toOL - [ MOVZxL format (OpReg src_r ) (OpReg src_r ) - , MOVZxL format (OpReg mask_r) (OpReg mask_r) - , PEXT II32 (OpReg mask_r) (OpReg src_r ) dst_r - , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width - ] - else - unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r) - ) - else do - targetExpr <- cmmMakeDynamicReference config - CallReference lbl - let target = ForeignTarget targetExpr (ForeignConvention CCallConv - [NoHint] [NoHint] - CmmMayReturn) - genCCall' config is32Bit target dest_regs args bid - where - format = intFormat width - lbl = mkCmmCodeLabel primUnitId (pextLabel width) - -genCCall' config is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid - | is32Bit && width == W64 = do - -- Fallback to `hs_clz64` on i386 - targetExpr <- cmmMakeDynamicReference config CallReference lbl - let target = ForeignTarget targetExpr (ForeignConvention CCallConv - [NoHint] [NoHint] - CmmMayReturn) - genCCall' config is32Bit target dest_regs args bid - - | otherwise = do - code_src <- getAnyReg src - config <- getConfig - let platform = ncgPlatform config - let dst_r = getRegisterReg platform (CmmLocal dst) - if ncgBmiVersion config >= Just BMI2 - then do - src_r <- getNewRegNat (intFormat width) - return $ appOL (code_src src_r) $ case width of - W8 -> toOL - [ MOVZxL II8 (OpReg src_r) (OpReg src_r) -- zero-extend to 32 bit - , LZCNT II32 (OpReg src_r) dst_r -- lzcnt with extra 24 zeros - , SUB II32 (OpImm (ImmInt 24)) (OpReg dst_r) -- compensate for extra zeros - ] - W16 -> toOL - [ LZCNT II16 (OpReg src_r) dst_r - , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) -- zero-extend from 16 bit - ] - _ -> unitOL (LZCNT (intFormat width) (OpReg src_r) dst_r) - else do - let format = if width == W8 then II16 else intFormat width - src_r <- getNewRegNat format - tmp_r <- getNewRegNat format - return $ code_src src_r `appOL` toOL - ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ - [ BSR format (OpReg src_r) tmp_r - , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r) - , CMOV NE format (OpReg tmp_r) dst_r - , XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r) - ]) -- NB: We don't need to zero-extend the result for the - -- W8/W16 cases because the 'MOV' insn already - -- took care of implicitly clearing the upper bits - where - bw = widthInBits width - lbl = mkCmmCodeLabel primUnitId (clzLabel width) - -genCCall' config is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do - targetExpr <- cmmMakeDynamicReference config - CallReference lbl - let target = ForeignTarget targetExpr (ForeignConvention CCallConv - [NoHint] [NoHint] - CmmMayReturn) - genCCall' config is32Bit target dest_regs args bid - where - lbl = mkCmmCodeLabel primUnitId (word2FloatLabel width) - -genCCall' _ _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do - load_code <- intLoadCode (MOV (intFormat width)) addr +genSimplePrim bid (MO_Memcpy align) [] [dst,src,n] = genMemCpy bid align dst src n +genSimplePrim bid (MO_Memmove align) [] [dst,src,n] = genMemMove bid align dst src n +genSimplePrim bid (MO_Memcmp align) [res] [dst,src,n] = genMemCmp bid align res dst src n +genSimplePrim bid (MO_Memset align) [] [dst,c,n] = genMemSet bid align dst c n +genSimplePrim _ MO_ReadBarrier [] [] = return nilOL -- barriers compile to no code on x86/x86-64; +genSimplePrim _ MO_WriteBarrier [] [] = return nilOL -- we keep it this long 
in order to prevent earlier optimisations. +genSimplePrim _ MO_Touch [] [_] = return nilOL +genSimplePrim _ (MO_Prefetch_Data n) [] [src] = genPrefetchData n src +genSimplePrim _ (MO_BSwap width) [dst] [src] = genByteSwap width dst src +genSimplePrim bid (MO_BRev width) [dst] [src] = genBitRev bid width dst src +genSimplePrim bid (MO_PopCnt width) [dst] [src] = genPopCnt bid width dst src +genSimplePrim bid (MO_Pdep width) [dst] [src,mask] = genPdep bid width dst src mask +genSimplePrim bid (MO_Pext width) [dst] [src,mask] = genPext bid width dst src mask +genSimplePrim bid (MO_Clz width) [dst] [src] = genClz bid width dst src +genSimplePrim bid (MO_UF_Conv width) [dst] [src] = genWordToFloat bid width dst src +genSimplePrim _ (MO_AtomicRead w) [dst] [addr] = genAtomicRead w dst addr +genSimplePrim _ (MO_AtomicWrite w) [] [addr,val] = genAtomicWrite w addr val +genSimplePrim bid (MO_Cmpxchg width) [dst] [addr,old,new] = genCmpXchg bid width dst addr old new +genSimplePrim _ (MO_Xchg width) [dst] [addr, value] = genXchg width dst addr value +genSimplePrim _ (MO_AddWordC w) [r,c] [x,y] = genAddSubRetCarry w ADD_CC (const Nothing) CARRY r c x y +genSimplePrim _ (MO_SubWordC w) [r,c] [x,y] = genAddSubRetCarry w SUB_CC (const Nothing) CARRY r c x y +genSimplePrim _ (MO_AddIntC w) [r,c] [x,y] = genAddSubRetCarry w ADD_CC (Just . ADD_CC) OFLO r c x y +genSimplePrim _ (MO_SubIntC w) [r,c] [x,y] = genAddSubRetCarry w SUB_CC (const Nothing) OFLO r c x y +genSimplePrim _ (MO_Add2 w) [h,l] [x,y] = genAddWithCarry w h l x y +genSimplePrim _ (MO_U_Mul2 w) [h,l] [x,y] = genUnsignedLargeMul w h l x y +genSimplePrim _ (MO_S_Mul2 w) [c,h,l] [x,y] = genSignedLargeMul w c h l x y +genSimplePrim _ (MO_S_QuotRem w) [q,r] [x,y] = genQuotRem w True q r Nothing x y +genSimplePrim _ (MO_U_QuotRem w) [q,r] [x,y] = genQuotRem w False q r Nothing x y +genSimplePrim _ (MO_U_QuotRem2 w) [q,r] [hx,lx,y] = genQuotRem w False q r (Just hx) lx y +genSimplePrim _ MO_F32_Fabs [dst] [src] = genFloatAbs W32 dst src +genSimplePrim _ MO_F64_Fabs [dst] [src] = genFloatAbs W64 dst src +genSimplePrim _ MO_F32_Sqrt [dst] [src] = genFloatSqrt FF32 dst src +genSimplePrim _ MO_F64_Sqrt [dst] [src] = genFloatSqrt FF64 dst src +genSimplePrim bid MO_F32_Sin [dst] [src] = genLibCCall bid (fsLit "sinf") [dst] [src] +genSimplePrim bid MO_F32_Cos [dst] [src] = genLibCCall bid (fsLit "cosf") [dst] [src] +genSimplePrim bid MO_F32_Tan [dst] [src] = genLibCCall bid (fsLit "tanf") [dst] [src] +genSimplePrim bid MO_F32_Exp [dst] [src] = genLibCCall bid (fsLit "expf") [dst] [src] +genSimplePrim bid MO_F32_ExpM1 [dst] [src] = genLibCCall bid (fsLit "expm1f") [dst] [src] +genSimplePrim bid MO_F32_Log [dst] [src] = genLibCCall bid (fsLit "logf") [dst] [src] +genSimplePrim bid MO_F32_Log1P [dst] [src] = genLibCCall bid (fsLit "log1pf") [dst] [src] +genSimplePrim bid MO_F32_Asin [dst] [src] = genLibCCall bid (fsLit "asinf") [dst] [src] +genSimplePrim bid MO_F32_Acos [dst] [src] = genLibCCall bid (fsLit "acosf") [dst] [src] +genSimplePrim bid MO_F32_Atan [dst] [src] = genLibCCall bid (fsLit "atanf") [dst] [src] +genSimplePrim bid MO_F32_Sinh [dst] [src] = genLibCCall bid (fsLit "sinhf") [dst] [src] +genSimplePrim bid MO_F32_Cosh [dst] [src] = genLibCCall bid (fsLit "coshf") [dst] [src] +genSimplePrim bid MO_F32_Tanh [dst] [src] = genLibCCall bid (fsLit "tanhf") [dst] [src] +genSimplePrim bid MO_F32_Pwr [dst] [x,y] = genLibCCall bid (fsLit "powf") [dst] [x,y] +genSimplePrim bid MO_F32_Asinh [dst] [src] = genLibCCall bid (fsLit "asinhf") [dst] [src] 
+genSimplePrim bid MO_F32_Acosh [dst] [src] = genLibCCall bid (fsLit "acoshf") [dst] [src] +genSimplePrim bid MO_F32_Atanh [dst] [src] = genLibCCall bid (fsLit "atanhf") [dst] [src] +genSimplePrim bid MO_F64_Sin [dst] [src] = genLibCCall bid (fsLit "sin") [dst] [src] +genSimplePrim bid MO_F64_Cos [dst] [src] = genLibCCall bid (fsLit "cos") [dst] [src] +genSimplePrim bid MO_F64_Tan [dst] [src] = genLibCCall bid (fsLit "tan") [dst] [src] +genSimplePrim bid MO_F64_Exp [dst] [src] = genLibCCall bid (fsLit "exp") [dst] [src] +genSimplePrim bid MO_F64_ExpM1 [dst] [src] = genLibCCall bid (fsLit "expm1") [dst] [src] +genSimplePrim bid MO_F64_Log [dst] [src] = genLibCCall bid (fsLit "log") [dst] [src] +genSimplePrim bid MO_F64_Log1P [dst] [src] = genLibCCall bid (fsLit "log1p") [dst] [src] +genSimplePrim bid MO_F64_Asin [dst] [src] = genLibCCall bid (fsLit "asin") [dst] [src] +genSimplePrim bid MO_F64_Acos [dst] [src] = genLibCCall bid (fsLit "acos") [dst] [src] +genSimplePrim bid MO_F64_Atan [dst] [src] = genLibCCall bid (fsLit "atan") [dst] [src] +genSimplePrim bid MO_F64_Sinh [dst] [src] = genLibCCall bid (fsLit "sinh") [dst] [src] +genSimplePrim bid MO_F64_Cosh [dst] [src] = genLibCCall bid (fsLit "cosh") [dst] [src] +genSimplePrim bid MO_F64_Tanh [dst] [src] = genLibCCall bid (fsLit "tanh") [dst] [src] +genSimplePrim bid MO_F64_Pwr [dst] [x,y] = genLibCCall bid (fsLit "pow") [dst] [x,y] +genSimplePrim bid MO_F64_Asinh [dst] [src] = genLibCCall bid (fsLit "asinh") [dst] [src] +genSimplePrim bid MO_F64_Acosh [dst] [src] = genLibCCall bid (fsLit "acosh") [dst] [src] +genSimplePrim bid MO_F64_Atanh [dst] [src] = genLibCCall bid (fsLit "atanh") [dst] [src] +genSimplePrim bid MO_SuspendThread [tok] [rs,i] = genRTSCCall bid (fsLit "suspendThread") [tok] [rs,i] +genSimplePrim bid MO_ResumeThread [rs] [tok] = genRTSCCall bid (fsLit "resumeThread") [rs] [tok] +genSimplePrim bid MO_I64_ToI [dst] [src] = genPrimCCall bid (fsLit "hs_int64ToInt") [dst] [src] +genSimplePrim bid MO_I64_FromI [dst] [src] = genPrimCCall bid (fsLit "hs_intToInt64") [dst] [src] +genSimplePrim bid MO_W64_ToW [dst] [src] = genPrimCCall bid (fsLit "hs_word64ToWord") [dst] [src] +genSimplePrim bid MO_W64_FromW [dst] [src] = genPrimCCall bid (fsLit "hs_wordToWord64") [dst] [src] +genSimplePrim bid MO_x64_Neg [dst] [src] = genPrimCCall bid (fsLit "hs_neg64") [dst] [src] +genSimplePrim bid MO_x64_Add [dst] [x,y] = genPrimCCall bid (fsLit "hs_add64") [dst] [x,y] +genSimplePrim bid MO_x64_Sub [dst] [x,y] = genPrimCCall bid (fsLit "hs_sub64") [dst] [x,y] +genSimplePrim bid MO_x64_Mul [dst] [x,y] = genPrimCCall bid (fsLit "hs_mul64") [dst] [x,y] +genSimplePrim bid MO_I64_Quot [dst] [x,y] = genPrimCCall bid (fsLit "hs_quotInt64") [dst] [x,y] +genSimplePrim bid MO_I64_Rem [dst] [x,y] = genPrimCCall bid (fsLit "hs_remInt64") [dst] [x,y] +genSimplePrim bid MO_W64_Quot [dst] [x,y] = genPrimCCall bid (fsLit "hs_quotWord64") [dst] [x,y] +genSimplePrim bid MO_W64_Rem [dst] [x,y] = genPrimCCall bid (fsLit "hs_remWord64") [dst] [x,y] +genSimplePrim bid MO_x64_And [dst] [x,y] = genPrimCCall bid (fsLit "hs_and64") [dst] [x,y] +genSimplePrim bid MO_x64_Or [dst] [x,y] = genPrimCCall bid (fsLit "hs_or64") [dst] [x,y] +genSimplePrim bid MO_x64_Xor [dst] [x,y] = genPrimCCall bid (fsLit "hs_xor64") [dst] [x,y] +genSimplePrim bid MO_x64_Not [dst] [src] = genPrimCCall bid (fsLit "hs_not64") [dst] [src] +genSimplePrim bid MO_x64_Shl [dst] [x,n] = genPrimCCall bid (fsLit "hs_uncheckedShiftL64") [dst] [x,n] +genSimplePrim bid MO_I64_Shr [dst] [x,n] = genPrimCCall 
bid (fsLit "hs_uncheckedIShiftRA64") [dst] [x,n] +genSimplePrim bid MO_W64_Shr [dst] [x,n] = genPrimCCall bid (fsLit "hs_uncheckedShiftRL64") [dst] [x,n] +genSimplePrim bid MO_x64_Eq [dst] [x,y] = genPrimCCall bid (fsLit "hs_eq64") [dst] [x,y] +genSimplePrim bid MO_x64_Ne [dst] [x,y] = genPrimCCall bid (fsLit "hs_ne64") [dst] [x,y] +genSimplePrim bid MO_I64_Ge [dst] [x,y] = genPrimCCall bid (fsLit "hs_geInt64") [dst] [x,y] +genSimplePrim bid MO_I64_Gt [dst] [x,y] = genPrimCCall bid (fsLit "hs_gtInt64") [dst] [x,y] +genSimplePrim bid MO_I64_Le [dst] [x,y] = genPrimCCall bid (fsLit "hs_leInt64") [dst] [x,y] +genSimplePrim bid MO_I64_Lt [dst] [x,y] = genPrimCCall bid (fsLit "hs_ltInt64") [dst] [x,y] +genSimplePrim bid MO_W64_Ge [dst] [x,y] = genPrimCCall bid (fsLit "hs_geWord64") [dst] [x,y] +genSimplePrim bid MO_W64_Gt [dst] [x,y] = genPrimCCall bid (fsLit "hs_gtWord64") [dst] [x,y] +genSimplePrim bid MO_W64_Le [dst] [x,y] = genPrimCCall bid (fsLit "hs_leWord64") [dst] [x,y] +genSimplePrim bid MO_W64_Lt [dst] [x,y] = genPrimCCall bid (fsLit "hs_ltWord64") [dst] [x,y] +genSimplePrim _ op dst args = do platform <- ncgPlatform <$> getConfig - - return (load_code (getRegisterReg platform (CmmLocal dst))) - -genCCall' _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do - code <- assignMem_IntCode (intFormat width) addr val - return $ code `snocOL` MFENCE - -genCCall' _ is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ - -- On x86 we don't have enough registers to use cmpxchg with a - -- complicated addressing mode, so on that architecture we - -- pre-compute the address first. - | not (is32Bit && width == W64) = do - Amode amode addr_code <- getSimpleAmode is32Bit addr - newval <- getNewRegNat format - newval_code <- getAnyReg new - oldval <- getNewRegNat format - oldval_code <- getAnyReg old - platform <- getPlatform - let dst_r = getRegisterReg platform (CmmLocal dst) - code = toOL - [ MOV format (OpReg oldval) (OpReg eax) - , LOCK (CMPXCHG format (OpReg newval) (OpAddr amode)) - , MOV format (OpReg eax) (OpReg dst_r) - ] - return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval - `appOL` code - where - format = intFormat width - -genCCall' config is32Bit (PrimTarget (MO_Xchg width)) [dst] [addr, value] _ - | (is32Bit && width == W64) = panic "gencCall: 64bit atomic exchange not supported on 32bit platforms" - | otherwise = do - let dst_r = getRegisterReg platform (CmmLocal dst) - Amode amode addr_code <- getSimpleAmode is32Bit addr - (newval, newval_code) <- getSomeReg value - -- Copy the value into the target register, perform the exchange. - let code = toOL - [ MOV format (OpReg newval) (OpReg dst_r) - -- On X86 xchg implies a lock prefix if we use a memory argument. - -- so this is atomic. 
- , XCHG format (OpAddr amode) dst_r - ] - return $ addr_code `appOL` newval_code `appOL` code - where - format = intFormat width - platform = ncgPlatform config - -genCCall' _ is32Bit target dest_regs args bid = do - platform <- ncgPlatform <$> getConfig - case (target, dest_regs) of - -- void return type prim op - (PrimTarget op, []) -> - outOfLineCmmOp bid op Nothing args - -- we only cope with a single result for foreign calls - (PrimTarget op, [r]) -> case op of - MO_F32_Fabs -> case args of - [x] -> sse2FabsCode W32 x - _ -> panic "genCCall: Wrong number of arguments for fabs" - MO_F64_Fabs -> case args of - [x] -> sse2FabsCode W64 x - _ -> panic "genCCall: Wrong number of arguments for fabs" - - MO_F32_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF32 args - MO_F64_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF64 args - _other_op -> outOfLineCmmOp bid op (Just r) args - - where - actuallyInlineSSE2Op = actuallyInlineFloatOp' - - actuallyInlineFloatOp' instr format [x] - = do res <- trivialUFCode format (instr format) x - any <- anyReg res - return (any (getRegisterReg platform (CmmLocal r))) - - actuallyInlineFloatOp' _ _ args - = panic $ "genCCall.actuallyInlineFloatOp': bad number of arguments! (" - ++ show (length args) ++ ")" - - sse2FabsCode :: Width -> CmmExpr -> NatM InstrBlock - sse2FabsCode w x = do - let fmt = floatFormat w - x_code <- getAnyReg x - let - const | FF32 <- fmt = CmmInt 0x7fffffff W32 - | otherwise = CmmInt 0x7fffffffffffffff W64 - Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const - tmp <- getNewRegNat fmt - let - code dst = x_code dst `appOL` amode_code `appOL` toOL [ - MOV fmt (OpAddr amode) (OpReg tmp), - AND fmt (OpReg tmp) (OpReg dst) - ] - - return $ code (getRegisterReg platform (CmmLocal r)) - - (PrimTarget (MO_S_QuotRem width), _) -> divOp1 platform True width dest_regs args - (PrimTarget (MO_U_QuotRem width), _) -> divOp1 platform False width dest_regs args - (PrimTarget (MO_U_QuotRem2 width), _) -> divOp2 platform False width dest_regs args - (PrimTarget (MO_Add2 width), [res_h, res_l]) -> - case args of - [arg_x, arg_y] -> - do hCode <- getAnyReg (CmmLit (CmmInt 0 width)) - let format = intFormat width - lCode <- anyReg =<< trivialCode width (ADD_CC format) - (Just (ADD_CC format)) arg_x arg_y - let reg_l = getRegisterReg platform (CmmLocal res_l) - reg_h = getRegisterReg platform (CmmLocal res_h) - code = hCode reg_h `appOL` - lCode reg_l `snocOL` - ADC format (OpImm (ImmInteger 0)) (OpReg reg_h) - return code - _ -> panic "genCCall: Wrong number of arguments/results for add2" - (PrimTarget (MO_AddWordC width), [res_r, res_c]) -> - addSubIntC platform ADD_CC (const Nothing) CARRY width res_r res_c args - (PrimTarget (MO_SubWordC width), [res_r, res_c]) -> - addSubIntC platform SUB_CC (const Nothing) CARRY width res_r res_c args - (PrimTarget (MO_AddIntC width), [res_r, res_c]) -> - addSubIntC platform ADD_CC (Just . 
ADD_CC) OFLO width res_r res_c args - (PrimTarget (MO_SubIntC width), [res_r, res_c]) -> - addSubIntC platform SUB_CC (const Nothing) OFLO width res_r res_c args - (PrimTarget (MO_U_Mul2 width), [res_h, res_l]) -> - case args of - [arg_x, arg_y] -> - do (y_reg, y_code) <- getRegOrMem arg_y - x_code <- getAnyReg arg_x - let format = intFormat width - reg_h = getRegisterReg platform (CmmLocal res_h) - reg_l = getRegisterReg platform (CmmLocal res_l) - code = y_code `appOL` - x_code rax `appOL` - toOL [MUL2 format y_reg, - MOV format (OpReg rdx) (OpReg reg_h), - MOV format (OpReg rax) (OpReg reg_l)] - return code - _ -> panic "genCCall: Wrong number of arguments/results for mul2" - (PrimTarget (MO_S_Mul2 width), [res_c, res_h, res_l]) -> - case args of - [arg_x, arg_y] -> - do (y_reg, y_code) <- getRegOrMem arg_y - x_code <- getAnyReg arg_x - reg_tmp <- getNewRegNat II8 - let format = intFormat width - reg_h = getRegisterReg platform (CmmLocal res_h) - reg_l = getRegisterReg platform (CmmLocal res_l) - reg_c = getRegisterReg platform (CmmLocal res_c) - code = y_code `appOL` - x_code rax `appOL` - toOL [ IMUL2 format y_reg - , MOV format (OpReg rdx) (OpReg reg_h) - , MOV format (OpReg rax) (OpReg reg_l) - , SETCC CARRY (OpReg reg_tmp) - , MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c) - ] - return code - _ -> panic "genCCall: Wrong number of arguments/results for imul2" - - _ -> do - (instrs0, args') <- evalArgs bid args - instrs1 <- if is32Bit - then genCCall32' target dest_regs args' - else genCCall64' target dest_regs args' - return (instrs0 `appOL` instrs1) - - where divOp1 platform signed width results [arg_x, arg_y] - = divOp platform signed width results Nothing arg_x arg_y - divOp1 _ _ _ _ _ - = panic "genCCall: Wrong number of arguments for divOp1" - divOp2 platform signed width results [arg_x_high, arg_x_low, arg_y] - = divOp platform signed width results (Just arg_x_high) arg_x_low arg_y - divOp2 _ _ _ _ _ - = panic "genCCall: Wrong number of arguments for divOp2" - - -- See Note [DIV/IDIV for bytes] - divOp platform signed W8 [res_q, res_r] m_arg_x_high arg_x_low arg_y = - let widen | signed = MO_SS_Conv W8 W16 - | otherwise = MO_UU_Conv W8 W16 - arg_x_low_16 = CmmMachOp widen [arg_x_low] - arg_y_16 = CmmMachOp widen [arg_y] - m_arg_x_high_16 = (\p -> CmmMachOp widen [p]) <$> m_arg_x_high - in divOp - platform signed W16 [res_q, res_r] - m_arg_x_high_16 arg_x_low_16 arg_y_16 - - divOp platform signed width [res_q, res_r] - m_arg_x_high arg_x_low arg_y - = do let format = intFormat width - reg_q = getRegisterReg platform (CmmLocal res_q) - reg_r = getRegisterReg platform (CmmLocal res_r) - widen | signed = CLTD format - | otherwise = XOR format (OpReg rdx) (OpReg rdx) - instr | signed = IDIV - | otherwise = DIV - (y_reg, y_code) <- getRegOrMem arg_y - x_low_code <- getAnyReg arg_x_low - x_high_code <- case m_arg_x_high of - Just arg_x_high -> - getAnyReg arg_x_high - Nothing -> - return $ const $ unitOL widen - return $ y_code `appOL` - x_low_code rax `appOL` - x_high_code rdx `appOL` - toOL [instr format y_reg, - MOV format (OpReg rax) (OpReg reg_q), - MOV format (OpReg rdx) (OpReg reg_r)] - divOp _ _ _ _ _ _ _ - = panic "genCCall: Wrong number of results for divOp" - - addSubIntC platform instr mrevinstr cond width - res_r res_c [arg_x, arg_y] - = do let format = intFormat width - rCode <- anyReg =<< trivialCode width (instr format) - (mrevinstr format) arg_x arg_y - reg_tmp <- getNewRegNat II8 - let reg_c = getRegisterReg platform (CmmLocal res_c) - reg_r = getRegisterReg platform 
(CmmLocal res_r) - code = rCode reg_r `snocOL` - SETCC cond (OpReg reg_tmp) `snocOL` - MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c) - - return code - addSubIntC _ _ _ _ _ _ _ _ - = panic "genCCall: Wrong number of arguments/results for addSubIntC" + pprPanic "genSimplePrim: unhandled primop" (ppr (pprCallishMachOp op, dst, fmap (pdoc platform) args)) {- Note [Evaluate C-call arguments before placing in destination registers] @@ -2872,9 +2276,8 @@ this condition (only looking at the top-level of CmmExprs) to avoid spending too much effort trying to decide whether we want to take the fast path. Note that this hack *also* applies to calls to out-of-line PrimTargets (which -are lowered via a C call) since outOfLineCmmOp produces the call via -(stmtToInstrs (CmmUnsafeForeignCall ...)), which will ultimately end up -back in genCCall{32,64}. +are lowered via a C call), which will ultimately end up in +genForeignCall{32,64}. -} -- | See Note [Evaluate C-call arguments before placing in destination registers] @@ -2918,11 +2321,76 @@ evalArgs bid actuals -- and get the results from %al, %dl. This is not optimal, but a few -- register moves are probably not a huge deal when doing division. -genCCall32' :: ForeignTarget -- function to call - -> [CmmFormal] -- where to put the result - -> [CmmActual] -- arguments (of mixed type) - -> NatM InstrBlock -genCCall32' target dest_regs args = do + +-- | Generate C call to the given function in ghc-prim +genPrimCCall + :: BlockId + -> FastString + -> [CmmFormal] + -> [CmmActual] + -> NatM InstrBlock +genPrimCCall bid lbl_txt dsts args = do + config <- getConfig + -- FIXME: we should use mkForeignLabel instead of mkCmmCodeLabel + let lbl = mkCmmCodeLabel primUnitId lbl_txt + addr <- cmmMakeDynamicReference config CallReference lbl + let conv = ForeignConvention CCallConv [] [] CmmMayReturn + genCCall bid addr conv dsts args + +-- | Generate C call to the given function in libc +genLibCCall + :: BlockId + -> FastString + -> [CmmFormal] + -> [CmmActual] + -> NatM InstrBlock +genLibCCall bid lbl_txt dsts args = do + config <- getConfig + -- Assume we can call these functions directly, and that they're not in a dynamic library. + -- TODO: Why is this ok? Under linux this code will be in libm.so + -- Is it because they're really implemented as a primitive instruction by the assembler?? -- BL 2009/12/31 + let lbl = mkForeignLabel lbl_txt Nothing ForeignLabelInThisPackage IsFunction + addr <- cmmMakeDynamicReference config CallReference lbl + let conv = ForeignConvention CCallConv [] [] CmmMayReturn + genCCall bid addr conv dsts args + +-- | Generate C call to the given function in the RTS +genRTSCCall + :: BlockId + -> FastString + -> [CmmFormal] + -> [CmmActual] + -> NatM InstrBlock +genRTSCCall bid lbl_txt dsts args = do + config <- getConfig + -- Assume we can call these functions directly, and that they're not in a dynamic library. 
+ let lbl = mkForeignLabel lbl_txt Nothing ForeignLabelInThisPackage IsFunction + addr <- cmmMakeDynamicReference config CallReference lbl + let conv = ForeignConvention CCallConv [] [] CmmMayReturn + genCCall bid addr conv dsts args + +-- | Generate a real C call to the given address with the given convention +genCCall + :: BlockId + -> CmmExpr + -> ForeignConvention + -> [CmmFormal] + -> [CmmActual] + -> NatM InstrBlock +genCCall bid addr conv dest_regs args = do + is32Bit <- is32BitPlatform + (instrs0, args') <- evalArgs bid args + instrs1 <- if is32Bit + then genCCall32 addr conv dest_regs args' + else genCCall64 addr conv dest_regs args' + return (instrs0 `appOL` instrs1) + +genCCall32 :: CmmExpr -- ^ address of the function to call + -> ForeignConvention -- ^ calling convention + -> [CmmFormal] -- ^ where to put the result + -> [CmmActual] -- ^ arguments (of mixed type) + -> NatM InstrBlock +genCCall32 addr conv dest_regs args = do config <- getConfig let platform = ncgPlatform config prom_args = map (maybePromoteCArg platform W32) args @@ -3003,19 +2471,15 @@ genCCall32' target dest_regs args = do -- deal with static vs dynamic call targets (callinsns,cconv) <- - case target of - ForeignTarget (CmmLit (CmmLabel lbl)) conv + case addr of + CmmLit (CmmLabel lbl) -> -- ToDo: stdcall arg sizes return (unitOL (CALL (Left fn_imm) []), conv) where fn_imm = ImmCLbl lbl - ForeignTarget expr conv - -> do { (dyn_r, dyn_c) <- getSomeReg expr - ; massert (isWord32 (cmmExprType platform expr)) + _ + -> do { (dyn_r, dyn_c) <- getSomeReg addr + ; massert (isWord32 (cmmExprType platform addr)) ; return (dyn_c `snocOL` CALL (Right dyn_r) [], conv) } - PrimTarget _ - -> panic $ "genCCall: Can't handle PrimTarget call type here, error " - ++ "probably because too many return values." - let push_code | arg_pad_size /= 0 = toOL [SUB II32 (OpImm (ImmInt arg_pad_size)) (OpReg esp), @@ -3073,17 +2537,18 @@ genCCall32' target dest_regs args = do b = widthInBytes w r_dest_hi = getHiVRegFromLo r_dest r_dest = getRegisterReg platform (CmmLocal dest) - assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many) + assign_code many = pprPanic "genForeignCall.assign_code - too many return values:" (ppr many) return (push_code `appOL` call `appOL` assign_code dest_regs) -genCCall64' :: ForeignTarget -- function to call - -> [CmmFormal] -- where to put the result - -> [CmmActual] -- arguments (of mixed type) - -> NatM InstrBlock -genCCall64' target dest_regs args = do +genCCall64 :: CmmExpr -- ^ address of function to call + -> ForeignConvention -- ^ calling convention + -> [CmmFormal] -- ^ where to put the result + -> [CmmActual] -- ^ arguments (of mixed type) + -> NatM InstrBlock +genCCall64 addr conv dest_regs args = do platform <- getPlatform -- load up the register arguments let prom_args = map (maybePromoteCArg platform W32) args @@ -3266,18 +2731,13 @@ genCCall64' target dest_regs args = do delta <- getDeltaNat -- deal with static vs dynamic call targets - (callinsns,_cconv) <- - case target of - ForeignTarget (CmmLit (CmmLabel lbl)) conv - -> -- ToDo: stdcall arg sizes - return (unitOL (CALL (Left fn_imm) arg_regs), conv) - where fn_imm = ImmCLbl lbl - ForeignTarget expr conv - -> do (dyn_r, dyn_c) <- getSomeReg expr - return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv) - PrimTarget _ - -> panic $ "genCCall: Can't handle PrimTarget call type here, error " - ++ "probably because too many return values." 
+ (callinsns,_cconv) <- case addr of + CmmLit (CmmLabel lbl) -> + -- ToDo: stdcall arg sizes + return (unitOL (CALL (Left (ImmCLbl lbl)) arg_regs), conv) + _ -> do + (dyn_r, dyn_c) <- getSomeReg addr + return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv) let -- The x86_64 ABI requires us to set %al to the number of SSE2 @@ -3317,7 +2777,7 @@ genCCall64' target dest_regs args = do where rep = localRegType dest r_dest = getRegisterReg platform (CmmLocal dest) - assign_code _many = panic "genCCall.assign_code many" + assign_code _many = panic "genForeignCall.assign_code many" return (adjust_rsp `appOL` push_code `appOL` @@ -3336,154 +2796,6 @@ maybePromoteCArg platform wto arg where wfrom = cmmExprWidth platform arg -outOfLineCmmOp :: BlockId -> CallishMachOp -> Maybe CmmFormal -> [CmmActual] - -> NatM InstrBlock -outOfLineCmmOp bid mop res args - = do - config <- getConfig - targetExpr <- cmmMakeDynamicReference config CallReference lbl - let target = ForeignTarget targetExpr - (ForeignConvention CCallConv [] [] CmmMayReturn) - - -- We know foreign calls results in no new basic blocks, so we can ignore - -- the returned block id. - (instrs, _) <- stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args) - return instrs - where - -- Assume we can call these functions directly, and that they're not in a dynamic library. - -- TODO: Why is this ok? Under linux this code will be in libm.so - -- Is it because they're really implemented as a primitive instruction by the assembler?? -- BL 2009/12/31 - lbl = mkForeignLabel fn Nothing ForeignLabelInThisPackage IsFunction - - fn = case mop of - MO_F32_Sqrt -> fsLit "sqrtf" - MO_F32_Fabs -> fsLit "fabsf" - MO_F32_Sin -> fsLit "sinf" - MO_F32_Cos -> fsLit "cosf" - MO_F32_Tan -> fsLit "tanf" - MO_F32_Exp -> fsLit "expf" - MO_F32_ExpM1 -> fsLit "expm1f" - MO_F32_Log -> fsLit "logf" - MO_F32_Log1P -> fsLit "log1pf" - - MO_F32_Asin -> fsLit "asinf" - MO_F32_Acos -> fsLit "acosf" - MO_F32_Atan -> fsLit "atanf" - - MO_F32_Sinh -> fsLit "sinhf" - MO_F32_Cosh -> fsLit "coshf" - MO_F32_Tanh -> fsLit "tanhf" - MO_F32_Pwr -> fsLit "powf" - - MO_F32_Asinh -> fsLit "asinhf" - MO_F32_Acosh -> fsLit "acoshf" - MO_F32_Atanh -> fsLit "atanhf" - - MO_F64_Sqrt -> fsLit "sqrt" - MO_F64_Fabs -> fsLit "fabs" - MO_F64_Sin -> fsLit "sin" - MO_F64_Cos -> fsLit "cos" - MO_F64_Tan -> fsLit "tan" - MO_F64_Exp -> fsLit "exp" - MO_F64_ExpM1 -> fsLit "expm1" - MO_F64_Log -> fsLit "log" - MO_F64_Log1P -> fsLit "log1p" - - MO_F64_Asin -> fsLit "asin" - MO_F64_Acos -> fsLit "acos" - MO_F64_Atan -> fsLit "atan" - - MO_F64_Sinh -> fsLit "sinh" - MO_F64_Cosh -> fsLit "cosh" - MO_F64_Tanh -> fsLit "tanh" - MO_F64_Pwr -> fsLit "pow" - - MO_F64_Asinh -> fsLit "asinh" - MO_F64_Acosh -> fsLit "acosh" - MO_F64_Atanh -> fsLit "atanh" - - MO_I64_ToI -> fsLit "hs_int64ToInt" - MO_I64_FromI -> fsLit "hs_intToInt64" - MO_W64_ToW -> fsLit "hs_word64ToWord" - MO_W64_FromW -> fsLit "hs_wordToWord64" - MO_x64_Neg -> fsLit "hs_neg64" - MO_x64_Add -> fsLit "hs_add64" - MO_x64_Sub -> fsLit "hs_sub64" - MO_x64_Mul -> fsLit "hs_mul64" - MO_I64_Quot -> fsLit "hs_quotInt64" - MO_I64_Rem -> fsLit "hs_remInt64" - MO_W64_Quot -> fsLit "hs_quotWord64" - MO_W64_Rem -> fsLit "hs_remWord64" - MO_x64_And -> fsLit "hs_and64" - MO_x64_Or -> fsLit "hs_or64" - MO_x64_Xor -> fsLit "hs_xor64" - MO_x64_Not -> fsLit "hs_not64" - MO_x64_Shl -> fsLit "hs_uncheckedShiftL64" - MO_I64_Shr -> fsLit "hs_uncheckedIShiftRA64" - MO_W64_Shr -> fsLit "hs_uncheckedShiftRL64" - MO_x64_Eq -> fsLit "hs_eq64" - MO_x64_Ne -> 
fsLit "hs_ne64" - MO_I64_Ge -> fsLit "hs_geInt64" - MO_I64_Gt -> fsLit "hs_gtInt64" - MO_I64_Le -> fsLit "hs_leInt64" - MO_I64_Lt -> fsLit "hs_ltInt64" - MO_W64_Ge -> fsLit "hs_geWord64" - MO_W64_Gt -> fsLit "hs_gtWord64" - MO_W64_Le -> fsLit "hs_leWord64" - MO_W64_Lt -> fsLit "hs_ltWord64" - - MO_Memcpy _ -> fsLit "memcpy" - MO_Memset _ -> fsLit "memset" - MO_Memmove _ -> fsLit "memmove" - MO_Memcmp _ -> fsLit "memcmp" - - MO_SuspendThread -> fsLit "suspendThread" - MO_ResumeThread -> fsLit "resumeThread" - - MO_PopCnt _ -> fsLit "popcnt" - MO_BSwap _ -> fsLit "bswap" - {- Here the C implementation is used as there is no x86 - instruction to reverse a word's bit order. - -} - MO_BRev w -> bRevLabel w - MO_Clz w -> clzLabel w - MO_Ctz _ -> unsupported - - MO_Pdep w -> pdepLabel w - MO_Pext w -> pextLabel w - - MO_AtomicRMW _ _ -> unsupported - MO_AtomicRead _ -> unsupported - MO_AtomicWrite _ -> unsupported - MO_Cmpxchg w -> cmpxchgLabel w -- for W64 on 32-bit - -- TODO: implement - -- cmpxchg8b instr - MO_Xchg _ -> should_be_inline - - MO_UF_Conv _ -> unsupported - - MO_S_Mul2 {} -> unsupported - MO_S_QuotRem {} -> unsupported - MO_U_QuotRem {} -> unsupported - MO_U_QuotRem2 {} -> unsupported - MO_Add2 {} -> unsupported - MO_AddIntC {} -> unsupported - MO_SubIntC {} -> unsupported - MO_AddWordC {} -> unsupported - MO_SubWordC {} -> unsupported - MO_U_Mul2 {} -> unsupported - MO_ReadBarrier -> unsupported - MO_WriteBarrier -> unsupported - MO_Touch -> unsupported - (MO_Prefetch_Data _ ) -> unsupported - unsupported = panic ("outOfLineCmmOp: " ++ show mop - ++ " not supported here") - -- If we generate a call for the given primop - -- something went wrong. - should_be_inline = panic ("outOfLineCmmOp: " ++ show mop - ++ " should be handled inline") - - -- ----------------------------------------------------------------------------- -- Generating a table-branch @@ -3829,16 +3141,6 @@ trivialFCode_sse2 pk instr x y where format = floatFormat pk -trivialUFCode :: Format -> (Reg -> Reg -> Instr) -> CmmExpr -> NatM Register -trivialUFCode format instr x = do - (x_reg, x_code) <- getSomeReg x - let - code dst = - x_code `snocOL` - instr x_reg dst - return (Any format code) - - -------------------------------------------------------------------------------- coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register coerceInt2FP from to x = coerce_sse2 @@ -3987,3 +3289,823 @@ invertCondBranches (Just cfg) keep bs = : invert (b2:bs) invert (b:bs) = b : invert bs invert [] = [] + +genAtomicRMW + :: BlockId + -> Width + -> AtomicMachOp + -> LocalReg + -> CmmExpr + -> CmmExpr + -> NatM (InstrBlock, Maybe BlockId) +genAtomicRMW bid width amop dst addr n = do + Amode amode addr_code <- + if amop `elem` [AMO_Add, AMO_Sub] + then getAmode addr + else getSimpleAmode addr -- See genForeignCall for MO_Cmpxchg + arg <- getNewRegNat format + arg_code <- getAnyReg n + platform <- ncgPlatform <$> getConfig + + let dst_r = getRegisterReg platform (CmmLocal dst) + (code, lbl) <- op_code dst_r arg amode + return (addr_code `appOL` arg_code arg `appOL` code, Just lbl) + where + -- Code for the operation + op_code :: Reg -- Destination reg + -> Reg -- Register containing argument + -> AddrMode -- Address of location to mutate + -> NatM (OrdList Instr,BlockId) -- TODO: Return Maybe BlockId + op_code dst_r arg amode = case amop of + -- In the common case where dst_r is a virtual register the + -- final move should go away, because it's the last use of arg + -- and the first use of dst_r. 
+ AMO_Add -> return $ (toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode)) + , MOV format (OpReg arg) (OpReg dst_r) + ], bid) + AMO_Sub -> return $ (toOL [ NEGI format (OpReg arg) + , LOCK (XADD format (OpReg arg) (OpAddr amode)) + , MOV format (OpReg arg) (OpReg dst_r) + ], bid) + -- In these cases we need a new block id, and have to return it so + -- that later instruction selection can reference it. + AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst) + AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst + , NOT format dst + ]) + AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst) + AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst) + where + -- Simulate operation that lacks a dedicated instruction using + -- cmpxchg. + cmpxchg_code :: (Operand -> Operand -> OrdList Instr) + -> NatM (OrdList Instr, BlockId) + cmpxchg_code instrs = do + lbl1 <- getBlockIdNat + lbl2 <- getBlockIdNat + tmp <- getNewRegNat format + + --Record inserted blocks + -- We turn A -> B into A -> A' -> A'' -> B + -- with a self loop on A'. + addImmediateSuccessorNat bid lbl1 + addImmediateSuccessorNat lbl1 lbl2 + updateCfgNat (addWeightEdge lbl1 lbl1 0) + + return $ (toOL + [ MOV format (OpAddr amode) (OpReg eax) + , JXX ALWAYS lbl1 + , NEWBLOCK lbl1 + -- Keep old value so we can return it: + , MOV format (OpReg eax) (OpReg dst_r) + , MOV format (OpReg eax) (OpReg tmp) + ] + `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL + [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode)) + , JXX NE lbl1 + -- See Note [Introducing cfg edges inside basic blocks] + -- why this basic block is required. + , JXX ALWAYS lbl2 + , NEWBLOCK lbl2 + ], + lbl2) + format = intFormat width + +-- | Count trailing zeroes +genCtz :: BlockId -> Width -> LocalReg -> CmmExpr -> NatM (InstrBlock, Maybe BlockId) +genCtz bid width dst src = do + is32Bit <- is32BitPlatform + if is32Bit && width == W64 + then genCtz64_32 bid dst src + else (,Nothing) <$> genCtzGeneric width dst src + +-- | Count trailing zeroes +-- +-- 64-bit width on 32-bit architecture +genCtz64_32 + :: BlockId + -> LocalReg + -> CmmExpr + -> NatM (InstrBlock, Maybe BlockId) +genCtz64_32 bid dst src = do + ChildCode64 vcode rlo <- iselExpr64 src + platform <- ncgPlatform <$> getConfig + let rhi = getHiVRegFromLo rlo + dst_r = getRegisterReg platform (CmmLocal dst) + lbl1 <- getBlockIdNat + lbl2 <- getBlockIdNat + tmp_r <- getNewRegNat II64 + + -- New CFG Edges: + -- bid -> lbl2 + -- bid -> lbl1 -> lbl2 + -- We also changes edges originating at bid to start at lbl2 instead. + weights <- getCfgWeights + updateCfgNat (addWeightEdge bid lbl1 110 . + addWeightEdge lbl1 lbl2 110 . + addImmediateSuccessor weights bid lbl2) + + -- The following instruction sequence corresponds to the pseudo-code + -- + -- if (src) { + -- dst = src.lo32 ? 
BSF(src.lo32) : (BSF(src.hi32) + 32); + -- } else { + -- dst = 64; + -- } + let instrs = vcode `appOL` toOL + ([ MOV II32 (OpReg rhi) (OpReg tmp_r) + , OR II32 (OpReg rlo) (OpReg tmp_r) + , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r) + , JXX EQQ lbl2 + , JXX ALWAYS lbl1 + + , NEWBLOCK lbl1 + , BSF II32 (OpReg rhi) dst_r + , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r) + , BSF II32 (OpReg rlo) tmp_r + , CMOV NE II32 (OpReg tmp_r) dst_r + , JXX ALWAYS lbl2 + + , NEWBLOCK lbl2 + ]) + return (instrs, Just lbl2) + +-- | Count trailing zeroes +-- +-- Generic case (width <= word size) +genCtzGeneric :: Width -> LocalReg -> CmmExpr -> NatM InstrBlock +genCtzGeneric width dst src = do + code_src <- getAnyReg src + config <- getConfig + let bw = widthInBits width + let platform = ncgPlatform config + let dst_r = getRegisterReg platform (CmmLocal dst) + if ncgBmiVersion config >= Just BMI2 + then do + src_r <- getNewRegNat (intFormat width) + let instrs = appOL (code_src src_r) $ case width of + W8 -> toOL + [ OR II32 (OpImm (ImmInteger 0xFFFFFF00)) (OpReg src_r) + , TZCNT II32 (OpReg src_r) dst_r + ] + W16 -> toOL + [ TZCNT II16 (OpReg src_r) dst_r + , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) + ] + _ -> unitOL $ TZCNT (intFormat width) (OpReg src_r) dst_r + return instrs + else do + -- The following insn sequence makes sure 'ctz 0' has a defined value. + -- starting with Haswell, one could use the TZCNT insn instead. + let format = if width == W8 then II16 else intFormat width + src_r <- getNewRegNat format + tmp_r <- getNewRegNat format + let instrs = code_src src_r `appOL` toOL + ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ + [ BSF format (OpReg src_r) tmp_r + , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r) + , CMOV NE format (OpReg tmp_r) dst_r + ]) -- NB: We don't need to zero-extend the result for the + -- W8/W16 cases because the 'MOV' insn already + -- took care of implicitly clearing the upper bits + return instrs + + + +-- | Copy memory +-- +-- Unroll memcpy calls if the number of bytes to copy isn't too large (cf +-- ncgInlineThresholdMemcpy). Otherwise, call C's memcpy. +genMemCpy + :: BlockId + -> Int + -> CmmExpr + -> CmmExpr + -> CmmExpr + -> NatM InstrBlock +genMemCpy bid align dst src arg_n = do + + let libc_memcpy = genLibCCall bid (fsLit "memcpy") [] [dst,src,arg_n] + + case arg_n of + CmmLit (CmmInt n _) -> do + -- try to inline it + mcode <- genMemCpyInlineMaybe align dst src n + -- if it didn't inline, call the C function + case mcode of + Nothing -> libc_memcpy + Just c -> pure c + + -- not a literal size argument: call the C function + _ -> libc_memcpy + + + +genMemCpyInlineMaybe + :: Int + -> CmmExpr + -> CmmExpr + -> Integer + -> NatM (Maybe InstrBlock) +genMemCpyInlineMaybe align dst src n = do + config <- getConfig + let + platform = ncgPlatform config + maxAlignment = wordAlignment platform + -- only machine word wide MOVs are supported + effectiveAlignment = min (alignmentOf align) maxAlignment + format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment + + + -- The size of each move, in bytes. + let sizeBytes :: Integer + sizeBytes = fromIntegral (formatInBytes format) + + -- The number of instructions we will generate (approx). We need 2 + -- instructions per move. 
+-- | Copy memory
+--
+-- Unroll memcpy calls if the number of bytes to copy isn't too large (cf
+-- ncgInlineThresholdMemcpy). Otherwise, call C's memcpy.
+genMemCpy
+  :: BlockId
+  -> Int
+  -> CmmExpr
+  -> CmmExpr
+  -> CmmExpr
+  -> NatM InstrBlock
+genMemCpy bid align dst src arg_n = do
+
+  let libc_memcpy = genLibCCall bid (fsLit "memcpy") [] [dst,src,arg_n]
+
+  case arg_n of
+    CmmLit (CmmInt n _) -> do
+      -- try to inline it
+      mcode <- genMemCpyInlineMaybe align dst src n
+      -- if it didn't inline, call the C function
+      case mcode of
+        Nothing -> libc_memcpy
+        Just c  -> pure c
+
+    -- not a literal size argument: call the C function
+    _ -> libc_memcpy
+
+
+genMemCpyInlineMaybe
+  :: Int
+  -> CmmExpr
+  -> CmmExpr
+  -> Integer
+  -> NatM (Maybe InstrBlock)
+genMemCpyInlineMaybe align dst src n = do
+  config <- getConfig
+  let
+    platform           = ncgPlatform config
+    maxAlignment       = wordAlignment platform
+    -- only machine word wide MOVs are supported
+    effectiveAlignment = min (alignmentOf align) maxAlignment
+    format             = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
+
+  -- The size of each move, in bytes.
+  let sizeBytes :: Integer
+      sizeBytes = fromIntegral (formatInBytes format)
+
+  -- The number of instructions we will generate (approx). We need 2
+  -- instructions per move.
+  let insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes)
+
+      go :: Reg -> Reg -> Reg -> Integer -> OrdList Instr
+      go dst src tmp i
+        | i >= sizeBytes =
+            unitOL (MOV format (OpAddr src_addr) (OpReg tmp)) `appOL`
+            unitOL (MOV format (OpReg tmp) (OpAddr dst_addr)) `appOL`
+            go dst src tmp (i - sizeBytes)
+        -- Deal with remaining bytes.
+        | i >= 4 = -- Will never happen on 32-bit
+            unitOL (MOV II32 (OpAddr src_addr) (OpReg tmp)) `appOL`
+            unitOL (MOV II32 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+            go dst src tmp (i - 4)
+        | i >= 2 =
+            unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL`
+            unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+            go dst src tmp (i - 2)
+        | i >= 1 =
+            unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL`
+            unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+            go dst src tmp (i - 1)
+        | otherwise = nilOL
+        where
+          src_addr = AddrBaseIndex (EABaseReg src) EAIndexNone
+                     (ImmInteger (n - i))
+
+          dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
+                     (ImmInteger (n - i))
+
+  if insns > fromIntegral (ncgInlineThresholdMemcpy config)
+    then pure Nothing
+    else do
+      code_dst <- getAnyReg dst
+      dst_r    <- getNewRegNat format
+      code_src <- getAnyReg src
+      src_r    <- getNewRegNat format
+      tmp_r    <- getNewRegNat format
+      pure $ Just $ code_dst dst_r `appOL` code_src src_r `appOL`
+                    go dst_r src_r tmp_r (fromInteger n)
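[Note, not part of the patch: the inlining decision above is a small cost model, roughly two instructions (a load and a store) per chunk of effective-alignment bytes, compared against the ncgInlineThresholdMemcpy knob. A sketch:]

    -- Would genMemCpyInlineMaybe inline a copy of n bytes? (illustrative)
    wouldInlineMemcpy
      :: Integer  -- ^ n, bytes to copy
      -> Integer  -- ^ sizeBytes, width of each MOV (effective alignment)
      -> Integer  -- ^ threshold, cf ncgInlineThresholdMemcpy
      -> Bool
    wouldInlineMemcpy n sizeBytes threshold =
      let insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes)  -- 2 MOVs per chunk
      in insns <= threshold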
+-- | Set memory to the given byte
+--
+-- Unroll memset calls if the number of bytes to set isn't too large (cf
+-- ncgInlineThresholdMemset). Otherwise, call C's memset.
+genMemSet
+  :: BlockId
+  -> Int
+  -> CmmExpr
+  -> CmmExpr
+  -> CmmExpr
+  -> NatM InstrBlock
+genMemSet bid align dst arg_c arg_n = do
+
+  let libc_memset = genLibCCall bid (fsLit "memset") [] [dst,arg_c,arg_n]
+
+  case (arg_c,arg_n) of
+    (CmmLit (CmmInt c _), CmmLit (CmmInt n _)) -> do
+      -- try to inline it
+      mcode <- genMemSetInlineMaybe align dst c n
+      -- if it didn't inline, call the C function
+      case mcode of
+        Nothing -> libc_memset
+        Just c  -> pure c
+
+    -- not literal size arguments: call the C function
+    _ -> libc_memset
+
+genMemSetInlineMaybe
+  :: Int
+  -> CmmExpr
+  -> Integer
+  -> Integer
+  -> NatM (Maybe InstrBlock)
+genMemSetInlineMaybe align dst c n = do
+  config <- getConfig
+  let
+    platform           = ncgPlatform config
+    maxAlignment       = wordAlignment platform -- only machine word wide MOVs are supported
+    effectiveAlignment = min (alignmentOf align) maxAlignment
+    format             = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
+    c2 = c  `shiftL` 8  .|. c
+    c4 = c2 `shiftL` 16 .|. c2
+    c8 = c4 `shiftL` 32 .|. c4
+
+    -- The number of instructions we will generate (approx). We need 1
+    -- instruction per move.
+    insns = (n + sizeBytes - 1) `div` sizeBytes
+
+    -- The size of each move, in bytes.
+    sizeBytes :: Integer
+    sizeBytes = fromIntegral (formatInBytes format)
+
+    -- Depending on size returns the widest MOV instruction and its
+    -- width.
+    gen4 :: AddrMode -> Integer -> (InstrBlock, Integer)
+    gen4 addr size
+      | size >= 4 =
+          (unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr addr)), 4)
+      | size >= 2 =
+          (unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr addr)), 2)
+      | size >= 1 =
+          (unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr addr)), 1)
+      | otherwise = (nilOL, 0)
+
+    -- Generates a 64-bit wide MOV instruction from REG to MEM.
+    gen8 :: AddrMode -> Reg -> InstrBlock
+    gen8 addr reg8byte =
+      unitOL (MOV format (OpReg reg8byte) (OpAddr addr))
+
+    -- Unrolls memset when the widest MOV is <= 4 bytes.
+    go4 :: Reg -> Integer -> InstrBlock
+    go4 dst left =
+      if left <= 0 then nilOL
+      else curMov `appOL` go4 dst (left - curWidth)
+      where
+        possibleWidth = minimum [left, sizeBytes]
+        dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
+        (curMov, curWidth) = gen4 dst_addr possibleWidth
+
+    -- Unrolls memset when the widest MOV is 8 bytes (thus another Reg
+    -- argument). Falls back to go4 when all 8 byte moves are
+    -- exhausted.
+    go8 :: Reg -> Reg -> Integer -> InstrBlock
+    go8 dst reg8byte left =
+      if possibleWidth >= 8 then
+        let curMov = gen8 dst_addr reg8byte
+        in  curMov `appOL` go8 dst reg8byte (left - 8)
+      else go4 dst left
+      where
+        possibleWidth = minimum [left, sizeBytes]
+        dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
+
+  if fromInteger insns > ncgInlineThresholdMemset config
+    then pure Nothing
+    else do
+      code_dst <- getAnyReg dst
+      dst_r <- getNewRegNat format
+      if format == II64 && n >= 8
+        then do
+          code_imm8byte <- getAnyReg (CmmLit (CmmInt c8 W64))
+          imm8byte_r <- getNewRegNat II64
+          return $ Just $ code_dst dst_r `appOL`
+                          code_imm8byte imm8byte_r `appOL`
+                          go8 dst_r imm8byte_r (fromInteger n)
+        else
+          return $ Just $ code_dst dst_r `appOL`
+                          go4 dst_r (fromInteger n)
+
+
+genMemMove :: BlockId -> p -> CmmActual -> CmmActual -> CmmActual -> NatM InstrBlock
+genMemMove bid _align dst src n = do
+  -- TODO: generate inline assembly when under a given threshold (similarly to
+  -- memcpy and memset)
+  genLibCCall bid (fsLit "memmove") [] [dst,src,n]
+
+genMemCmp :: BlockId -> p -> CmmFormal -> CmmActual -> CmmActual -> CmmActual -> NatM InstrBlock
+genMemCmp bid _align res dst src n = do
+  -- TODO: generate inline assembly when under a given threshold (similarly to
+  -- memcpy and memset)
+  genLibCCall bid (fsLit "memcmp") [res] [dst,src,n]
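[Note, not part of the patch: the c2/c4/c8 constants in genMemSetInlineMaybe broadcast the fill byte across wider words. An illustrative check, e.g. broadcast 0xAB == (0xABAB, 0xABABABAB, 0xABABABABABABABAB):]

    import Data.Bits (shiftL, (.|.))
    import Data.Word (Word64)

    broadcast :: Word64 -> (Word64, Word64, Word64)
    broadcast c = (c2, c4, c8)
      where
        c2 = c  `shiftL` 8  .|. c   -- fill byte doubled
        c4 = c2 `shiftL` 16 .|. c2  -- quadrupled
        c8 = c4 `shiftL` 32 .|. c4  -- across all 8 bytes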
+genPrefetchData :: Int -> CmmExpr -> NatM (OrdList Instr)
+genPrefetchData n src = do
+  is32Bit <- is32BitPlatform
+  let
+    format = archWordFormat is32Bit
+    -- we need to know the register width for pointers!
+    genPrefetch inRegSrc prefetchCTor = do
+      code_src <- getAnyReg inRegSrc
+      src_r <- getNewRegNat format
+      return $ code_src src_r `appOL`
+        (unitOL (prefetchCTor (OpAddr
+                 (AddrBaseIndex (EABaseReg src_r) EAIndexNone (ImmInt 0)))))
+      -- prefetch always takes an address
+
+  -- The C/LLVM prefetch convention is 0, 1, 2, and 3;
+  -- the corresponding x86 names are NTA, 2, 1, and 0.
+  case n of
+    0 -> genPrefetch src $ PREFETCH NTA  format
+    1 -> genPrefetch src $ PREFETCH Lvl2 format
+    2 -> genPrefetch src $ PREFETCH Lvl1 format
+    3 -> genPrefetch src $ PREFETCH Lvl0 format
+    l -> pprPanic "genPrefetchData: unexpected prefetch level" (ppr l)
+
+genByteSwap :: Width -> LocalReg -> CmmExpr -> NatM InstrBlock
+genByteSwap width dst src = do
+  platform <- ncgPlatform <$> getConfig
+  is32Bit <- is32BitPlatform
+  let dst_r = getRegisterReg platform (CmmLocal dst)
+  let format = intFormat width
+  case width of
+    W64 | is32Bit -> do
+      ChildCode64 vcode rlo <- iselExpr64 src
+      let dst_rhi = getHiVRegFromLo dst_r
+          rhi     = getHiVRegFromLo rlo
+      return $ vcode `appOL`
+               toOL [ MOV II32 (OpReg rlo) (OpReg dst_rhi),
+                      MOV II32 (OpReg rhi) (OpReg dst_r),
+                      BSWAP II32 dst_rhi,
+                      BSWAP II32 dst_r ]
+    W16 -> do
+      code_src <- getAnyReg src
+      return $ code_src dst_r `appOL`
+               unitOL (BSWAP II32 dst_r) `appOL`
+               unitOL (SHR II32 (OpImm $ ImmInt 16) (OpReg dst_r))
+    _ -> do
+      code_src <- getAnyReg src
+      return $ code_src dst_r `appOL` unitOL (BSWAP format dst_r)
+
+genBitRev :: BlockId -> Width -> CmmFormal -> CmmActual -> NatM InstrBlock
+genBitRev bid width dst src = do
+  -- Here the C implementation (hs_bitrevN) is used as there is no x86
+  -- instruction to reverse a word's bit order.
+  genPrimCCall bid (bRevLabel width) [dst] [src]
+
+genPopCnt :: BlockId -> Width -> LocalReg -> CmmExpr -> NatM InstrBlock
+genPopCnt bid width dst src = do
+  config <- getConfig
+  let
+    platform = ncgPlatform config
+    format   = intFormat width
+
+  sse4_2Enabled >>= \case
+
+    True -> do
+      code_src <- getAnyReg src
+      src_r <- getNewRegNat format
+      let dst_r = getRegisterReg platform (CmmLocal dst)
+      return $ code_src src_r `appOL`
+        (if width == W8 then
+          -- The POPCNT instruction doesn't take an r/m8
+          unitOL (MOVZxL II8 (OpReg src_r) (OpReg src_r)) `appOL`
+          unitOL (POPCNT II16 (OpReg src_r) dst_r)
+        else
+          unitOL (POPCNT format (OpReg src_r) dst_r)) `appOL`
+        (if width == W8 || width == W16 then
+          -- We used a 16-bit destination register above,
+          -- so zero-extend
+          unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+        else nilOL)
+
+    False ->
+      -- generate C call to hs_popcntN in ghc-prim
+      -- TODO: we could directly generate the assembly to index popcount_tab
+      -- here instead of doing it by calling a C function
+      genPrimCCall bid (popCntLabel width) [dst] [src]
+
+
+genPdep :: BlockId -> Width -> LocalReg -> CmmExpr -> CmmExpr -> NatM InstrBlock
+genPdep bid width dst src mask = do
+  config <- getConfig
+  let
+    platform = ncgPlatform config
+    format   = intFormat width
+
+  if ncgBmiVersion config >= Just BMI2
+    then do
+      code_src  <- getAnyReg src
+      code_mask <- getAnyReg mask
+      src_r     <- getNewRegNat format
+      mask_r    <- getNewRegNat format
+      let dst_r = getRegisterReg platform (CmmLocal dst)
+      return $ code_src src_r `appOL` code_mask mask_r `appOL`
+        -- PDEP only supports 32-bit and 64-bit arguments
+        ( if width == W8 || width == W16 then
+            toOL
+              [ MOVZxL format (OpReg src_r ) (OpReg src_r )
+              , MOVZxL format (OpReg mask_r) (OpReg mask_r)
+              , PDEP   II32   (OpReg mask_r) (OpReg src_r ) dst_r
+              , MOVZxL format (OpReg dst_r ) (OpReg dst_r ) -- Truncate to op width
+              ]
+          else
+            unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)
+        )
+    else
+      -- generate C call to hs_pdepN in ghc-prim
+      genPrimCCall bid (pdepLabel width) [dst] [src,mask]
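[Note, not part of the patch: PDEP deposits the successive low bits of the source at the set-bit positions of the mask; this is the semantics the hs_pdepN fallback has to match. A plausible reference sketch, not ghc-prim's actual code:]

    import Data.Bits (setBit, testBit)
    import Data.Word (Word64)

    pdepRef :: Word64 -> Word64 -> Word64
    pdepRef src mask = go 0 0 0
      where
        go acc j i
          | i >= 64        = acc
          | testBit mask i = go (if testBit src j then setBit acc i else acc)
                                (j + 1) (i + 1)  -- consume next src bit
          | otherwise      = go acc j (i + 1)    -- mask bit clear: emit 0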
+genPext :: BlockId -> Width -> LocalReg -> CmmExpr -> CmmExpr -> NatM InstrBlock
+genPext bid width dst src mask = do
+  config <- getConfig
+  if ncgBmiVersion config >= Just BMI2
+    then do
+      let format   = intFormat width
+      let platform = ncgPlatform config
+      let dst_r    = getRegisterReg platform (CmmLocal dst)
+      code_src  <- getAnyReg src
+      code_mask <- getAnyReg mask
+      src_r     <- getNewRegNat format
+      mask_r    <- getNewRegNat format
+      return $ code_src src_r `appOL` code_mask mask_r `appOL`
+        (if width == W8 || width == W16 then
+          -- The PEXT instruction doesn't take an r/m8 or r/m16
+          toOL
+            [ MOVZxL format (OpReg src_r ) (OpReg src_r )
+            , MOVZxL format (OpReg mask_r) (OpReg mask_r)
+            , PEXT   II32   (OpReg mask_r) (OpReg src_r ) dst_r
+            , MOVZxL format (OpReg dst_r ) (OpReg dst_r ) -- Truncate to op width
+            ]
+        else
+          unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)
+        )
+    else
+      -- generate C call to hs_pextN in ghc-prim
+      genPrimCCall bid (pextLabel width) [dst] [src,mask]
+
+genClz :: BlockId -> Width -> CmmFormal -> CmmActual -> NatM InstrBlock
+genClz bid width dst src = do
+  is32Bit <- is32BitPlatform
+  config <- getConfig
+  if is32Bit && width == W64
+
+    then
+      -- Fall back to `hs_clz64` on i386
+      genPrimCCall bid (clzLabel width) [dst] [src]
+
+    else do
+      let platform = ncgPlatform config
+      code_src <- getAnyReg src
+      let dst_r = getRegisterReg platform (CmmLocal dst)
+      if ncgBmiVersion config >= Just BMI2
+        then do
+          src_r <- getNewRegNat (intFormat width)
+          return $ appOL (code_src src_r) $ case width of
+            W8 -> toOL
+              [ MOVZxL II8  (OpReg src_r) (OpReg src_r)       -- zero-extend to 32 bit
+              , LZCNT  II32 (OpReg src_r) dst_r               -- lzcnt with extra 24 zeros
+              , SUB    II32 (OpImm (ImmInt 24)) (OpReg dst_r) -- compensate for extra zeros
+              ]
+            W16 -> toOL
+              [ LZCNT  II16 (OpReg src_r) dst_r
+              , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) -- zero-extend from 16 bit
+              ]
+            _ -> unitOL (LZCNT (intFormat width) (OpReg src_r) dst_r)
+        else do
+          let format = if width == W8 then II16 else intFormat width
+          let bw = widthInBits width
+          src_r <- getNewRegNat format
+          tmp_r <- getNewRegNat format
+          return $ code_src src_r `appOL` toOL
+            ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+             [ BSF format (OpReg src_r) tmp_r
+             , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
+             , CMOV NE format (OpReg tmp_r) dst_r
+             , XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
+             ]) -- NB: We don't need to zero-extend the result for the
+                -- W8/W16 cases because the 'MOV' insn already
+                -- took care of implicitly clearing the upper bits
+
+genWordToFloat :: BlockId -> Width -> CmmFormal -> CmmActual -> NatM InstrBlock
+genWordToFloat bid width dst src =
+  -- TODO: generate assembly instead
+  genPrimCCall bid (word2FloatLabel width) [dst] [src]
+
+genAtomicRead :: Width -> LocalReg -> CmmExpr -> NatM InstrBlock
+genAtomicRead width dst addr = do
+  load_code <- intLoadCode (MOV (intFormat width)) addr
+  platform <- ncgPlatform <$> getConfig
+  return (load_code (getRegisterReg platform (CmmLocal dst)))
+
+genAtomicWrite :: Width -> CmmExpr -> CmmExpr -> NatM InstrBlock
+genAtomicWrite width addr val = do
+  code <- assignMem_IntCode (intFormat width) addr val
+  return $ code `snocOL` MFENCE
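[Note, not part of the patch: the BSR fallback in genClz above relies on a small identity. For w-bit operands, xor-ing the BSR result (index of the highest set bit) with w-1 yields the leading-zero count, and seeding the destination with 2w-1 makes clz 0 come out as w, e.g. 63 xor 31 = 32. An illustrative check:]

    import Data.Bits (countLeadingZeros, finiteBitSize, xor)
    import Data.Word (Word32)

    clzViaBsr :: Word32 -> Int
    clzViaBsr x = seed `xor` (w - 1)
      where
        w = finiteBitSize x                             -- 32
        seed | x == 0    = 2 * w - 1                    -- MOV (2*bw-1); CMOV not taken
             | otherwise = w - 1 - countLeadingZeros x  -- what BSR returns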
+genCmpXchg bid width dst addr old new = do
+  is32Bit <- is32BitPlatform
+  -- On x86 we don't have enough registers to use cmpxchg with a
+  -- complicated addressing mode, so on that architecture we
+  -- pre-compute the address first.
+  if not (is32Bit && width == W64)
+    then do
+      let format = intFormat width
+      Amode amode addr_code <- getSimpleAmode addr
+      newval <- getNewRegNat format
+      newval_code <- getAnyReg new
+      oldval <- getNewRegNat format
+      oldval_code <- getAnyReg old
+      platform <- getPlatform
+      let dst_r = getRegisterReg platform (CmmLocal dst)
+          code = toOL
+            [ MOV format (OpReg oldval) (OpReg eax)
+            , LOCK (CMPXCHG format (OpReg newval) (OpAddr amode))
+            , MOV format (OpReg eax) (OpReg dst_r)
+            ]
+      return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval
+               `appOL` code
+    else
+      -- generate C call to hs_cmpxchgN in ghc-prim
+      genPrimCCall bid (cmpxchgLabel width) [dst] [addr,old,new]
+      -- TODO: implement cmpxchg8b instruction
+
+genXchg :: Width -> LocalReg -> CmmExpr -> CmmExpr -> NatM InstrBlock
+genXchg width dst addr value = do
+  is32Bit <- is32BitPlatform
+
+  when (is32Bit && width == W64) $
+    panic "genXchg: 64bit atomic exchange not supported on 32bit platforms"
+
+  Amode amode addr_code <- getSimpleAmode addr
+  (newval, newval_code) <- getSomeReg value
+  config <- getConfig
+  let format = intFormat width
+  let platform = ncgPlatform config
+  let dst_r = getRegisterReg platform (CmmLocal dst)
+  -- Copy the value into the target register, perform the exchange.
+  let code = toOL
+        [ MOV format (OpReg newval) (OpReg dst_r)
+          -- On x86, XCHG with a memory operand implies a LOCK prefix,
+          -- so this is atomic.
+        , XCHG format (OpAddr amode) dst_r
+        ]
+  return $ addr_code `appOL` newval_code `appOL` code
+
+
+genFloatAbs :: Width -> LocalReg -> CmmExpr -> NatM InstrBlock
+genFloatAbs width dst src = do
+  config <- getConfig
+  let
+    platform = ncgPlatform config
+    format   = floatFormat width
+    const = case width of
+      W32 -> CmmInt 0x7fffffff W32
+      W64 -> CmmInt 0x7fffffffffffffff W64
+      _   -> pprPanic "genFloatAbs: invalid width" (ppr width)
+  src_code <- getAnyReg src
+  Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes width) const
+  tmp <- getNewRegNat format
+  let dst_r = getRegisterReg platform (CmmLocal dst)
+  pure $ src_code dst_r `appOL` amode_code `appOL` toOL
+    [ MOV format (OpAddr amode) (OpReg tmp)
+    , AND format (OpReg tmp) (OpReg dst_r)
+    ]
+
+
+genFloatSqrt :: Format -> LocalReg -> CmmExpr -> NatM InstrBlock
+genFloatSqrt format dst src = do
+  platform <- ncgPlatform <$> getConfig
+  let dst_r = getRegisterReg platform (CmmLocal dst)
+  src_code <- getAnyReg src
+  pure $ src_code dst_r `snocOL` SQRT format (OpReg dst_r) dst_r
+
+
+genAddSubRetCarry
+  :: Width
+  -> (Format -> Operand -> Operand -> Instr)
+  -> (Format -> Maybe (Operand -> Operand -> Instr))
+  -> Cond
+  -> LocalReg
+  -> LocalReg
+  -> CmmExpr
+  -> CmmExpr
+  -> NatM InstrBlock
+genAddSubRetCarry width instr mrevinstr cond res_r res_c arg_x arg_y = do
+  platform <- ncgPlatform <$> getConfig
+  let format = intFormat width
+  rCode <- anyReg =<< trivialCode width (instr format)
+                        (mrevinstr format) arg_x arg_y
+  reg_tmp <- getNewRegNat II8
+  let reg_c = getRegisterReg platform (CmmLocal res_c)
+      reg_r = getRegisterReg platform (CmmLocal res_r)
+      code = rCode reg_r `snocOL`
+             SETCC cond (OpReg reg_tmp) `snocOL`
+             MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
+  return code
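[Note, not part of the patch: genFloatAbs above exploits the IEEE-754 layout, where the sign occupies the top bit, so AND-ing the representation with 0x7ff... clears it. The same trick expressed over Haskell doubles, illustrative only:]

    import Data.Bits ((.&.))
    import GHC.Float (castDoubleToWord64, castWord64ToDouble)

    -- Absolute value by masking off the IEEE-754 sign bit.
    fabsViaMask :: Double -> Double
    fabsViaMask d =
      castWord64ToDouble (castDoubleToWord64 d .&. 0x7fffffffffffffff)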
+genAddWithCarry width res_h res_l arg_x arg_y = do
+  platform <- ncgPlatform <$> getConfig
+  hCode <- getAnyReg (CmmLit (CmmInt 0 width))
+  let format = intFormat width
+  lCode <- anyReg =<< trivialCode width (ADD_CC format)
+                        (Just (ADD_CC format)) arg_x arg_y
+  let reg_l = getRegisterReg platform (CmmLocal res_l)
+      reg_h = getRegisterReg platform (CmmLocal res_h)
+      code = hCode reg_h `appOL`
+             lCode reg_l `snocOL`
+             ADC format (OpImm (ImmInteger 0)) (OpReg reg_h)
+  return code
+
+
+genSignedLargeMul
+  :: Width
+  -> LocalReg
+  -> LocalReg
+  -> LocalReg
+  -> CmmExpr
+  -> CmmExpr
+  -> NatM (OrdList Instr)
+genSignedLargeMul width res_c res_h res_l arg_x arg_y = do
+  platform <- ncgPlatform <$> getConfig
+  (y_reg, y_code) <- getRegOrMem arg_y
+  x_code <- getAnyReg arg_x
+  reg_tmp <- getNewRegNat II8
+  let format = intFormat width
+      reg_h  = getRegisterReg platform (CmmLocal res_h)
+      reg_l  = getRegisterReg platform (CmmLocal res_l)
+      reg_c  = getRegisterReg platform (CmmLocal res_c)
+      code   = y_code `appOL`
+               x_code rax `appOL`
+               toOL [ IMUL2 format y_reg
+                    , MOV format (OpReg rdx) (OpReg reg_h)
+                    , MOV format (OpReg rax) (OpReg reg_l)
+                    , SETCC CARRY (OpReg reg_tmp)
+                    , MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
+                    ]
+  return code
+
+genUnsignedLargeMul
+  :: Width
+  -> LocalReg
+  -> LocalReg
+  -> CmmExpr
+  -> CmmExpr
+  -> NatM (OrdList Instr)
+genUnsignedLargeMul width res_h res_l arg_x arg_y = do
+  platform <- ncgPlatform <$> getConfig
+  (y_reg, y_code) <- getRegOrMem arg_y
+  x_code <- getAnyReg arg_x
+  let format = intFormat width
+      reg_h  = getRegisterReg platform (CmmLocal res_h)
+      reg_l  = getRegisterReg platform (CmmLocal res_l)
+      code   = y_code `appOL`
+               x_code rax `appOL`
+               toOL [MUL2 format y_reg,
+                     MOV format (OpReg rdx) (OpReg reg_h),
+                     MOV format (OpReg rax) (OpReg reg_l)]
+  return code
+
+
+genQuotRem
+  :: Width
+  -> Bool
+  -> LocalReg
+  -> LocalReg
+  -> Maybe CmmExpr
+  -> CmmExpr
+  -> CmmExpr
+  -> NatM InstrBlock
+genQuotRem width signed res_q res_r m_arg_x_high arg_x_low arg_y = do
+  case width of
+    W8 -> do
+      -- See Note [DIV/IDIV for bytes]
+      let widen | signed    = MO_SS_Conv W8 W16
+                | otherwise = MO_UU_Conv W8 W16
+          arg_x_low_16    = CmmMachOp widen [arg_x_low]
+          arg_y_16        = CmmMachOp widen [arg_y]
+          m_arg_x_high_16 = (\p -> CmmMachOp widen [p]) <$> m_arg_x_high
+      genQuotRem W16 signed res_q res_r m_arg_x_high_16 arg_x_low_16 arg_y_16
+
+    _ -> do
+      platform <- ncgPlatform <$> getConfig
+      let format = intFormat width
+          reg_q  = getRegisterReg platform (CmmLocal res_q)
+          reg_r  = getRegisterReg platform (CmmLocal res_r)
+          widen | signed    = CLTD format
+                | otherwise = XOR format (OpReg rdx) (OpReg rdx)
+          instr | signed    = IDIV
+                | otherwise = DIV
+      (y_reg, y_code) <- getRegOrMem arg_y
+      x_low_code <- getAnyReg arg_x_low
+      x_high_code <- case m_arg_x_high of
+        Just arg_x_high -> getAnyReg arg_x_high
+        Nothing         -> return $ const $ unitOL widen
+      return $ y_code `appOL`
+               x_low_code rax `appOL`
+               x_high_code rdx `appOL`
+               toOL [instr format y_reg,
+                     MOV format (OpReg rax) (OpReg reg_q),
+                     MOV format (OpReg rdx) (OpReg reg_r)]
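[Note, not part of the patch: the W8 case of genQuotRem recurses at W16 after widening both operands (sign- or zero-extending to match signedness), per Note [DIV/IDIV for bytes]. The observable behaviour matches this illustrative sketch for the signed case:]

    import Data.Int (Int8, Int16)

    quotRem8ViaWiden :: Int8 -> Int8 -> (Int8, Int8)
    quotRem8ViaWiden x y =
      let x16, y16 :: Int16
          x16 = fromIntegral x         -- MO_SS_Conv W8 W16
          y16 = fromIntegral y
          (q, r) = x16 `quotRem` y16   -- the W16 genQuotRem path
      in (fromIntegral q, fromIntegral r)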