diff options
author | Sylvain Henry <sylvain@haskus.fr> | 2019-08-13 17:26:32 +0200 |
---|---|---|
committer | Sylvain Henry <sylvain@haskus.fr> | 2019-09-10 00:04:50 +0200 |
commit | 447864a94a1679b5b079e08bb7208a0005381cef (patch) | |
tree | baa469c52620ce7ae02def3e5e6a6f109cc89f40 /compiler/GHC/StgToCmm/Prim.hs | |
parent | 270fbe8512f04b6107755fa22bdec62205c0a567 (diff) | |
download | haskell-447864a94a1679b5b079e08bb7208a0005381cef.tar.gz |
Module hierarchy: StgToCmm (#13009)
Add StgToCmm module hierarchy. Platform modules that are used in several
other places (NCG, LLVM codegen, Cmm transformations) are put into
GHC.Platform.
Diffstat (limited to 'compiler/GHC/StgToCmm/Prim.hs')
-rw-r--r-- | compiler/GHC/StgToCmm/Prim.hs | 2622 |
1 files changed, 2622 insertions, 0 deletions
diff --git a/compiler/GHC/StgToCmm/Prim.hs b/compiler/GHC/StgToCmm/Prim.hs new file mode 100644 index 0000000000..dc69a51916 --- /dev/null +++ b/compiler/GHC/StgToCmm/Prim.hs @@ -0,0 +1,2622 @@ +{-# LANGUAGE CPP #-} +-- emitPrimOp is quite large +{-# OPTIONS_GHC -fmax-pmcheck-iterations=4000000 #-} + +---------------------------------------------------------------------------- +-- +-- Stg to C--: primitive operations +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Prim ( + cgOpApp, + cgPrimOp, -- internal(ish), used by cgCase to get code for a + -- comparison without also turning it into a Bool. + shouldInlinePrimOp + ) where + +#include "HsVersions.h" + +import GhcPrelude hiding ((<*>)) + +import GHC.StgToCmm.Layout +import GHC.StgToCmm.Foreign +import GHC.StgToCmm.Env +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Ticky +import GHC.StgToCmm.Heap +import GHC.StgToCmm.Prof ( costCentreFrom ) + +import DynFlags +import GHC.Platform +import BasicTypes +import BlockId +import MkGraph +import StgSyn +import Cmm +import Type ( Type, tyConAppTyCon ) +import TyCon +import CLabel +import CmmUtils +import PrimOp +import SMRep +import FastString +import Outputable +import Util +import Data.Maybe + +import Data.Bits ((.&.), bit) +import Control.Monad (liftM, when, unless) + +------------------------------------------------------------------------ +-- Primitive operations and foreign calls +------------------------------------------------------------------------ + +{- Note [Foreign call results] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A foreign call always returns an unboxed tuple of results, one +of which is the state token. This seems to happen even for pure +calls. + +Even if we returned a single result for pure calls, it'd still be +right to wrap it in a singleton unboxed tuple, because the result +might be a Haskell closure pointer, we don't want to evaluate it. -} + +---------------------------------- +cgOpApp :: StgOp -- The op + -> [StgArg] -- Arguments + -> Type -- Result type (always an unboxed tuple) + -> FCode ReturnKind + +-- Foreign calls +cgOpApp (StgFCallOp fcall ty) stg_args res_ty + = cgForeignCall fcall ty stg_args res_ty + -- Note [Foreign call results] + +-- tagToEnum# is special: we need to pull the constructor +-- out of the table, and perform an appropriate return. + +cgOpApp (StgPrimOp TagToEnumOp) [arg] res_ty + = ASSERT(isEnumerationTyCon tycon) + do { dflags <- getDynFlags + ; args' <- getNonVoidArgAmodes [arg] + ; let amode = case args' of [amode] -> amode + _ -> panic "TagToEnumOp had void arg" + ; emitReturn [tagToClosure dflags tycon amode] } + where + -- If you're reading this code in the attempt to figure + -- out why the compiler panic'ed here, it is probably because + -- you used tagToEnum# in a non-monomorphic setting, e.g., + -- intToTg :: Enum a => Int -> a ; intToTg (I# x#) = tagToEnum# x# + -- That won't work. + tycon = tyConAppTyCon res_ty + +cgOpApp (StgPrimOp primop) args res_ty = do + dflags <- getDynFlags + cmm_args <- getNonVoidArgAmodes args + case shouldInlinePrimOp dflags primop cmm_args of + Nothing -> do -- out-of-line + let fun = CmmLit (CmmLabel (mkRtsPrimOpLabel primop)) + emitCall (NativeNodeCall, NativeReturn) fun cmm_args + + Just f -- inline + | ReturnsPrim VoidRep <- result_info + -> do f [] + emitReturn [] + + | ReturnsPrim rep <- result_info + -> do dflags <- getDynFlags + res <- newTemp (primRepCmmType dflags rep) + f [res] + emitReturn [CmmReg (CmmLocal res)] + + | ReturnsAlg tycon <- result_info, isUnboxedTupleTyCon tycon + -> do (regs, _hints) <- newUnboxedTupleRegs res_ty + f regs + emitReturn (map (CmmReg . CmmLocal) regs) + + | otherwise -> panic "cgPrimop" + where + result_info = getPrimOpResultInfo primop + +cgOpApp (StgPrimCallOp primcall) args _res_ty + = do { cmm_args <- getNonVoidArgAmodes args + ; let fun = CmmLit (CmmLabel (mkPrimCallLabel primcall)) + ; emitCall (NativeNodeCall, NativeReturn) fun cmm_args } + +-- | Interpret the argument as an unsigned value, assuming the value +-- is given in two-complement form in the given width. +-- +-- Example: @asUnsigned W64 (-1)@ is 18446744073709551615. +-- +-- This function is used to work around the fact that many array +-- primops take Int# arguments, but we interpret them as unsigned +-- quantities in the code gen. This means that we have to be careful +-- every time we work on e.g. a CmmInt literal that corresponds to the +-- array size, as it might contain a negative Integer value if the +-- user passed a value larger than 2^(wORD_SIZE_IN_BITS-1) as the Int# +-- literal. +asUnsigned :: Width -> Integer -> Integer +asUnsigned w n = n .&. (bit (widthInBits w) - 1) + +-- TODO: Several primop implementations (e.g. 'doNewByteArrayOp') use +-- ByteOff (or some other fixed width signed type) to represent +-- array sizes or indices. This means that these will overflow for +-- large enough sizes. + +-- | Decide whether an out-of-line primop should be replaced by an +-- inline implementation. This might happen e.g. if there's enough +-- static information, such as statically know arguments, to emit a +-- more efficient implementation inline. +-- +-- Returns 'Nothing' if this primop should use its out-of-line +-- implementation (defined elsewhere) and 'Just' together with a code +-- generating function that takes the output regs as arguments +-- otherwise. +shouldInlinePrimOp :: DynFlags + -> PrimOp -- ^ The primop + -> [CmmExpr] -- ^ The primop arguments + -> Maybe ([LocalReg] -> FCode ()) + +shouldInlinePrimOp dflags NewByteArrayOp_Char [(CmmLit (CmmInt n w))] + | asUnsigned w n <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> doNewByteArrayOp res (fromInteger n) + +shouldInlinePrimOp dflags NewArrayOp [(CmmLit (CmmInt n w)), init] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> + doNewArrayOp res (arrPtrsRep dflags (fromInteger n)) mkMAP_DIRTY_infoLabel + [ (mkIntExpr dflags (fromInteger n), + fixedHdrSize dflags + oFFSET_StgMutArrPtrs_ptrs dflags) + , (mkIntExpr dflags (nonHdrSizeW (arrPtrsRep dflags (fromInteger n))), + fixedHdrSize dflags + oFFSET_StgMutArrPtrs_size dflags) + ] + (fromInteger n) init + +shouldInlinePrimOp _ CopyArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopyArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp _ CopyMutableArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopyMutableArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp _ CopyArrayArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopyArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp _ CopyMutableArrayArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopyMutableArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp dflags CloneArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneArray mkMAP_FROZEN_CLEAN_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags CloneMutableArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneArray mkMAP_DIRTY_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags FreezeArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneArray mkMAP_FROZEN_CLEAN_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags ThawArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneArray mkMAP_DIRTY_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags NewSmallArrayOp [(CmmLit (CmmInt n w)), init] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> + doNewArrayOp res (smallArrPtrsRep (fromInteger n)) mkSMAP_DIRTY_infoLabel + [ (mkIntExpr dflags (fromInteger n), + fixedHdrSize dflags + oFFSET_StgSmallMutArrPtrs_ptrs dflags) + ] + (fromInteger n) init + +shouldInlinePrimOp _ CopySmallArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopySmallArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp _ CopySmallMutableArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopySmallMutableArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp dflags CloneSmallArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneSmallArray mkSMAP_FROZEN_CLEAN_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags CloneSmallMutableArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneSmallArray mkSMAP_DIRTY_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags FreezeSmallArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneSmallArray mkSMAP_FROZEN_CLEAN_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags ThawSmallArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneSmallArray mkSMAP_DIRTY_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags primop args + | primOpOutOfLine primop = Nothing + | otherwise = Just $ \ regs -> emitPrimOp dflags regs primop args + +-- TODO: Several primops, such as 'copyArray#', only have an inline +-- implementation (below) but could possibly have both an inline +-- implementation and an out-of-line implementation, just like +-- 'newArray#'. This would lower the amount of code generated, +-- hopefully without a performance impact (needs to be measured). + +--------------------------------------------------- +cgPrimOp :: [LocalReg] -- where to put the results + -> PrimOp -- the op + -> [StgArg] -- arguments + -> FCode () + +cgPrimOp results op args + = do dflags <- getDynFlags + arg_exprs <- getNonVoidArgAmodes args + emitPrimOp dflags results op arg_exprs + + +------------------------------------------------------------------------ +-- Emitting code for a primop +------------------------------------------------------------------------ + +emitPrimOp :: DynFlags + -> [LocalReg] -- where to put the results + -> PrimOp -- the op + -> [CmmExpr] -- arguments + -> FCode () + +-- First we handle various awkward cases specially. The remaining +-- easy cases are then handled by translateOp, defined below. + +emitPrimOp _ [res] ParOp [arg] + = + -- for now, just implement this in a C function + -- later, we might want to inline it. + emitCCall + [(res,NoHint)] + (CmmLit (CmmLabel (mkForeignLabel (fsLit "newSpark") Nothing ForeignLabelInExternalPackage IsFunction))) + [(baseExpr, AddrHint), (arg,AddrHint)] + +emitPrimOp dflags [res] SparkOp [arg] + = do + -- returns the value of arg in res. We're going to therefore + -- refer to arg twice (once to pass to newSpark(), and once to + -- assign to res), so put it in a temporary. + tmp <- assignTemp arg + tmp2 <- newTemp (bWord dflags) + emitCCall + [(tmp2,NoHint)] + (CmmLit (CmmLabel (mkForeignLabel (fsLit "newSpark") Nothing ForeignLabelInExternalPackage IsFunction))) + [(baseExpr, AddrHint), ((CmmReg (CmmLocal tmp)), AddrHint)] + emitAssign (CmmLocal res) (CmmReg (CmmLocal tmp)) + +emitPrimOp dflags [res] GetCCSOfOp [arg] + = emitAssign (CmmLocal res) val + where + val + | gopt Opt_SccProfilingOn dflags = costCentreFrom dflags (cmmUntag dflags arg) + | otherwise = CmmLit (zeroCLit dflags) + +emitPrimOp _ [res] GetCurrentCCSOp [_dummy_arg] + = emitAssign (CmmLocal res) cccsExpr + +emitPrimOp _ [res] MyThreadIdOp [] + = emitAssign (CmmLocal res) currentTSOExpr + +emitPrimOp dflags [res] ReadMutVarOp [mutv] + = emitAssign (CmmLocal res) (cmmLoadIndexW dflags mutv (fixedHdrSizeW dflags) (gcWord dflags)) + +emitPrimOp dflags res@[] WriteMutVarOp [mutv,var] + = do -- Without this write barrier, other CPUs may see this pointer before + -- the writes for the closure it points to have occurred. + emitPrimCall res MO_WriteBarrier [] + emitStore (cmmOffsetW dflags mutv (fixedHdrSizeW dflags)) var + emitCCall + [{-no results-}] + (CmmLit (CmmLabel mkDirty_MUT_VAR_Label)) + [(baseExpr, AddrHint), (mutv,AddrHint)] + +-- #define sizzeofByteArrayzh(r,a) \ +-- r = ((StgArrBytes *)(a))->bytes +emitPrimOp dflags [res] SizeofByteArrayOp [arg] + = emit $ mkAssign (CmmLocal res) (cmmLoadIndexW dflags arg (fixedHdrSizeW dflags) (bWord dflags)) + +-- #define sizzeofMutableByteArrayzh(r,a) \ +-- r = ((StgArrBytes *)(a))->bytes +emitPrimOp dflags [res] SizeofMutableByteArrayOp [arg] + = emitPrimOp dflags [res] SizeofByteArrayOp [arg] + +-- #define getSizzeofMutableByteArrayzh(r,a) \ +-- r = ((StgArrBytes *)(a))->bytes +emitPrimOp dflags [res] GetSizeofMutableByteArrayOp [arg] + = emitAssign (CmmLocal res) (cmmLoadIndexW dflags arg (fixedHdrSizeW dflags) (bWord dflags)) + + +-- #define touchzh(o) /* nothing */ +emitPrimOp _ res@[] TouchOp args@[_arg] + = do emitPrimCall res MO_Touch args + +-- #define byteArrayContentszh(r,a) r = BYTE_ARR_CTS(a) +emitPrimOp dflags [res] ByteArrayContents_Char [arg] + = emitAssign (CmmLocal res) (cmmOffsetB dflags arg (arrWordsHdrSize dflags)) + +-- #define stableNameToIntzh(r,s) (r = ((StgStableName *)s)->sn) +emitPrimOp dflags [res] StableNameToIntOp [arg] + = emitAssign (CmmLocal res) (cmmLoadIndexW dflags arg (fixedHdrSizeW dflags) (bWord dflags)) + +emitPrimOp dflags [res] ReallyUnsafePtrEqualityOp [arg1,arg2] + = emitAssign (CmmLocal res) (CmmMachOp (mo_wordEq dflags) [arg1,arg2]) + +-- #define addrToHValuezh(r,a) r=(P_)a +emitPrimOp _ [res] AddrToAnyOp [arg] + = emitAssign (CmmLocal res) arg + +-- #define hvalueToAddrzh(r, a) r=(W_)a +emitPrimOp _ [res] AnyToAddrOp [arg] + = emitAssign (CmmLocal res) arg + +{- Freezing arrays-of-ptrs requires changing an info table, for the + benefit of the generational collector. It needs to scavenge mutable + objects, even if they are in old space. When they become immutable, + they can be removed from this scavenge list. -} + +-- #define unsafeFreezzeArrayzh(r,a) +-- { +-- SET_INFO((StgClosure *)a,&stg_MUT_ARR_PTRS_FROZEN_DIRTY_info); +-- r = a; +-- } +emitPrimOp _ [res] UnsafeFreezeArrayOp [arg] + = emit $ catAGraphs + [ setInfo arg (CmmLit (CmmLabel mkMAP_FROZEN_DIRTY_infoLabel)), + mkAssign (CmmLocal res) arg ] +emitPrimOp _ [res] UnsafeFreezeArrayArrayOp [arg] + = emit $ catAGraphs + [ setInfo arg (CmmLit (CmmLabel mkMAP_FROZEN_DIRTY_infoLabel)), + mkAssign (CmmLocal res) arg ] +emitPrimOp _ [res] UnsafeFreezeSmallArrayOp [arg] + = emit $ catAGraphs + [ setInfo arg (CmmLit (CmmLabel mkSMAP_FROZEN_DIRTY_infoLabel)), + mkAssign (CmmLocal res) arg ] + +-- #define unsafeFreezzeByteArrayzh(r,a) r=(a) +emitPrimOp _ [res] UnsafeFreezeByteArrayOp [arg] + = emitAssign (CmmLocal res) arg + +-- Reading/writing pointer arrays + +emitPrimOp _ [res] ReadArrayOp [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] IndexArrayOp [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [] WriteArrayOp [obj,ix,v] = doWritePtrArrayOp obj ix v + +emitPrimOp _ [res] IndexArrayArrayOp_ByteArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] IndexArrayArrayOp_ArrayArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] ReadArrayArrayOp_ByteArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] ReadArrayArrayOp_MutableByteArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] ReadArrayArrayOp_ArrayArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] ReadArrayArrayOp_MutableArrayArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [] WriteArrayArrayOp_ByteArray [obj,ix,v] = doWritePtrArrayOp obj ix v +emitPrimOp _ [] WriteArrayArrayOp_MutableByteArray [obj,ix,v] = doWritePtrArrayOp obj ix v +emitPrimOp _ [] WriteArrayArrayOp_ArrayArray [obj,ix,v] = doWritePtrArrayOp obj ix v +emitPrimOp _ [] WriteArrayArrayOp_MutableArrayArray [obj,ix,v] = doWritePtrArrayOp obj ix v + +emitPrimOp _ [res] ReadSmallArrayOp [obj,ix] = doReadSmallPtrArrayOp res obj ix +emitPrimOp _ [res] IndexSmallArrayOp [obj,ix] = doReadSmallPtrArrayOp res obj ix +emitPrimOp _ [] WriteSmallArrayOp [obj,ix,v] = doWriteSmallPtrArrayOp obj ix v + +-- Getting the size of pointer arrays + +emitPrimOp dflags [res] SizeofArrayOp [arg] + = emit $ mkAssign (CmmLocal res) (cmmLoadIndexW dflags arg + (fixedHdrSizeW dflags + bytesToWordsRoundUp dflags (oFFSET_StgMutArrPtrs_ptrs dflags)) + (bWord dflags)) +emitPrimOp dflags [res] SizeofMutableArrayOp [arg] + = emitPrimOp dflags [res] SizeofArrayOp [arg] +emitPrimOp dflags [res] SizeofArrayArrayOp [arg] + = emitPrimOp dflags [res] SizeofArrayOp [arg] +emitPrimOp dflags [res] SizeofMutableArrayArrayOp [arg] + = emitPrimOp dflags [res] SizeofArrayOp [arg] + +emitPrimOp dflags [res] SizeofSmallArrayOp [arg] = + emit $ mkAssign (CmmLocal res) + (cmmLoadIndexW dflags arg + (fixedHdrSizeW dflags + bytesToWordsRoundUp dflags (oFFSET_StgSmallMutArrPtrs_ptrs dflags)) + (bWord dflags)) +emitPrimOp dflags [res] SizeofSmallMutableArrayOp [arg] = + emitPrimOp dflags [res] SizeofSmallArrayOp [arg] + +-- IndexXXXoffAddr + +emitPrimOp dflags res IndexOffAddrOp_Char args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexOffAddrOp_WideChar args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res IndexOffAddrOp_Int args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexOffAddrOp_Word args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexOffAddrOp_Addr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp _ res IndexOffAddrOp_Float args = doIndexOffAddrOp Nothing f32 res args +emitPrimOp _ res IndexOffAddrOp_Double args = doIndexOffAddrOp Nothing f64 res args +emitPrimOp dflags res IndexOffAddrOp_StablePtr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexOffAddrOp_Int8 args = doIndexOffAddrOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexOffAddrOp_Int16 args = doIndexOffAddrOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res IndexOffAddrOp_Int32 args = doIndexOffAddrOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexOffAddrOp_Int64 args = doIndexOffAddrOp Nothing b64 res args +emitPrimOp dflags res IndexOffAddrOp_Word8 args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexOffAddrOp_Word16 args = doIndexOffAddrOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp dflags res IndexOffAddrOp_Word32 args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexOffAddrOp_Word64 args = doIndexOffAddrOp Nothing b64 res args + +-- ReadXXXoffAddr, which are identical, for our purposes, to IndexXXXoffAddr. + +emitPrimOp dflags res ReadOffAddrOp_Char args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadOffAddrOp_WideChar args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res ReadOffAddrOp_Int args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadOffAddrOp_Word args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadOffAddrOp_Addr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp _ res ReadOffAddrOp_Float args = doIndexOffAddrOp Nothing f32 res args +emitPrimOp _ res ReadOffAddrOp_Double args = doIndexOffAddrOp Nothing f64 res args +emitPrimOp dflags res ReadOffAddrOp_StablePtr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadOffAddrOp_Int8 args = doIndexOffAddrOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadOffAddrOp_Int16 args = doIndexOffAddrOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadOffAddrOp_Int32 args = doIndexOffAddrOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res ReadOffAddrOp_Int64 args = doIndexOffAddrOp Nothing b64 res args +emitPrimOp dflags res ReadOffAddrOp_Word8 args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadOffAddrOp_Word16 args = doIndexOffAddrOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadOffAddrOp_Word32 args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res ReadOffAddrOp_Word64 args = doIndexOffAddrOp Nothing b64 res args + +-- IndexXXXArray + +emitPrimOp dflags res IndexByteArrayOp_Char args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexByteArrayOp_WideChar args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res IndexByteArrayOp_Int args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexByteArrayOp_Word args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexByteArrayOp_Addr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp _ res IndexByteArrayOp_Float args = doIndexByteArrayOp Nothing f32 res args +emitPrimOp _ res IndexByteArrayOp_Double args = doIndexByteArrayOp Nothing f64 res args +emitPrimOp dflags res IndexByteArrayOp_StablePtr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexByteArrayOp_Int8 args = doIndexByteArrayOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Int16 args = doIndexByteArrayOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res IndexByteArrayOp_Int32 args = doIndexByteArrayOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexByteArrayOp_Int64 args = doIndexByteArrayOp Nothing b64 res args +emitPrimOp dflags res IndexByteArrayOp_Word8 args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word16 args = doIndexByteArrayOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp dflags res IndexByteArrayOp_Word32 args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexByteArrayOp_Word64 args = doIndexByteArrayOp Nothing b64 res args + +-- ReadXXXArray, identical to IndexXXXArray. + +emitPrimOp dflags res ReadByteArrayOp_Char args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadByteArrayOp_WideChar args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res ReadByteArrayOp_Int args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadByteArrayOp_Word args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadByteArrayOp_Addr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp _ res ReadByteArrayOp_Float args = doIndexByteArrayOp Nothing f32 res args +emitPrimOp _ res ReadByteArrayOp_Double args = doIndexByteArrayOp Nothing f64 res args +emitPrimOp dflags res ReadByteArrayOp_StablePtr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadByteArrayOp_Int8 args = doIndexByteArrayOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Int16 args = doIndexByteArrayOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadByteArrayOp_Int32 args = doIndexByteArrayOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res ReadByteArrayOp_Int64 args = doIndexByteArrayOp Nothing b64 res args +emitPrimOp dflags res ReadByteArrayOp_Word8 args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word16 args = doIndexByteArrayOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadByteArrayOp_Word32 args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res ReadByteArrayOp_Word64 args = doIndexByteArrayOp Nothing b64 res args + +-- IndexWord8ArrayAsXXX + +emitPrimOp dflags res IndexByteArrayOp_Word8AsChar args = doIndexByteArrayOpAs (Just (mo_u_8ToWord dflags)) b8 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsWideChar args = doIndexByteArrayOpAs (Just (mo_u_32ToWord dflags)) b32 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsInt args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsWord args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsAddr args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp _ res IndexByteArrayOp_Word8AsFloat args = doIndexByteArrayOpAs Nothing f32 b8 res args +emitPrimOp _ res IndexByteArrayOp_Word8AsDouble args = doIndexByteArrayOpAs Nothing f64 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsStablePtr args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsInt16 args = doIndexByteArrayOpAs (Just (mo_s_16ToWord dflags)) b16 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsInt32 args = doIndexByteArrayOpAs (Just (mo_s_32ToWord dflags)) b32 b8 res args +emitPrimOp _ res IndexByteArrayOp_Word8AsInt64 args = doIndexByteArrayOpAs Nothing b64 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsWord16 args = doIndexByteArrayOpAs (Just (mo_u_16ToWord dflags)) b16 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsWord32 args = doIndexByteArrayOpAs (Just (mo_u_32ToWord dflags)) b32 b8 res args +emitPrimOp _ res IndexByteArrayOp_Word8AsWord64 args = doIndexByteArrayOpAs Nothing b64 b8 res args + +-- ReadInt8ArrayAsXXX, identical to IndexInt8ArrayAsXXX + +emitPrimOp dflags res ReadByteArrayOp_Word8AsChar args = doIndexByteArrayOpAs (Just (mo_u_8ToWord dflags)) b8 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsWideChar args = doIndexByteArrayOpAs (Just (mo_u_32ToWord dflags)) b32 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsInt args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsWord args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsAddr args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp _ res ReadByteArrayOp_Word8AsFloat args = doIndexByteArrayOpAs Nothing f32 b8 res args +emitPrimOp _ res ReadByteArrayOp_Word8AsDouble args = doIndexByteArrayOpAs Nothing f64 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsStablePtr args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsInt16 args = doIndexByteArrayOpAs (Just (mo_s_16ToWord dflags)) b16 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsInt32 args = doIndexByteArrayOpAs (Just (mo_s_32ToWord dflags)) b32 b8 res args +emitPrimOp _ res ReadByteArrayOp_Word8AsInt64 args = doIndexByteArrayOpAs Nothing b64 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsWord16 args = doIndexByteArrayOpAs (Just (mo_u_16ToWord dflags)) b16 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsWord32 args = doIndexByteArrayOpAs (Just (mo_u_32ToWord dflags)) b32 b8 res args +emitPrimOp _ res ReadByteArrayOp_Word8AsWord64 args = doIndexByteArrayOpAs Nothing b64 b8 res args + +-- WriteXXXoffAddr + +emitPrimOp dflags res WriteOffAddrOp_Char args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteOffAddrOp_WideChar args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp dflags res WriteOffAddrOp_Int args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteOffAddrOp_Word args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteOffAddrOp_Addr args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp _ res WriteOffAddrOp_Float args = doWriteOffAddrOp Nothing f32 res args +emitPrimOp _ res WriteOffAddrOp_Double args = doWriteOffAddrOp Nothing f64 res args +emitPrimOp dflags res WriteOffAddrOp_StablePtr args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteOffAddrOp_Int8 args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteOffAddrOp_Int16 args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res WriteOffAddrOp_Int32 args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteOffAddrOp_Int64 args = doWriteOffAddrOp Nothing b64 res args +emitPrimOp dflags res WriteOffAddrOp_Word8 args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteOffAddrOp_Word16 args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res WriteOffAddrOp_Word32 args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteOffAddrOp_Word64 args = doWriteOffAddrOp Nothing b64 res args + +-- WriteXXXArray + +emitPrimOp dflags res WriteByteArrayOp_Char args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_WideChar args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp dflags res WriteByteArrayOp_Int args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteByteArrayOp_Word args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteByteArrayOp_Addr args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp _ res WriteByteArrayOp_Float args = doWriteByteArrayOp Nothing f32 res args +emitPrimOp _ res WriteByteArrayOp_Double args = doWriteByteArrayOp Nothing f64 res args +emitPrimOp dflags res WriteByteArrayOp_StablePtr args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteByteArrayOp_Int8 args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Int16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res WriteByteArrayOp_Int32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteByteArrayOp_Int64 args = doWriteByteArrayOp Nothing b64 res args +emitPrimOp dflags res WriteByteArrayOp_Word8 args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res WriteByteArrayOp_Word32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteByteArrayOp_Word64 args = doWriteByteArrayOp Nothing b64 res args + +-- WriteInt8ArrayAsXXX + +emitPrimOp dflags res WriteByteArrayOp_Word8AsChar args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsWideChar args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsInt args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsWord args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsAddr args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsFloat args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsDouble args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsStablePtr args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsInt16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsInt32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsInt64 args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsWord16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsWord32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsWord64 args = doWriteByteArrayOp Nothing b8 res args + +-- Copying and setting byte arrays +emitPrimOp _ [] CopyByteArrayOp [src,src_off,dst,dst_off,n] = + doCopyByteArrayOp src src_off dst dst_off n +emitPrimOp _ [] CopyMutableByteArrayOp [src,src_off,dst,dst_off,n] = + doCopyMutableByteArrayOp src src_off dst dst_off n +emitPrimOp _ [] CopyByteArrayToAddrOp [src,src_off,dst,n] = + doCopyByteArrayToAddrOp src src_off dst n +emitPrimOp _ [] CopyMutableByteArrayToAddrOp [src,src_off,dst,n] = + doCopyMutableByteArrayToAddrOp src src_off dst n +emitPrimOp _ [] CopyAddrToByteArrayOp [src,dst,dst_off,n] = + doCopyAddrToByteArrayOp src dst dst_off n +emitPrimOp _ [] SetByteArrayOp [ba,off,len,c] = + doSetByteArrayOp ba off len c + +-- Comparing byte arrays +emitPrimOp _ [res] CompareByteArraysOp [ba1,ba1_off,ba2,ba2_off,n] = + doCompareByteArraysOp res ba1 ba1_off ba2 ba2_off n + +emitPrimOp _ [res] BSwap16Op [w] = emitBSwapCall res w W16 +emitPrimOp _ [res] BSwap32Op [w] = emitBSwapCall res w W32 +emitPrimOp _ [res] BSwap64Op [w] = emitBSwapCall res w W64 +emitPrimOp dflags [res] BSwapOp [w] = emitBSwapCall res w (wordWidth dflags) + +emitPrimOp _ [res] BRev8Op [w] = emitBRevCall res w W8 +emitPrimOp _ [res] BRev16Op [w] = emitBRevCall res w W16 +emitPrimOp _ [res] BRev32Op [w] = emitBRevCall res w W32 +emitPrimOp _ [res] BRev64Op [w] = emitBRevCall res w W64 +emitPrimOp dflags [res] BRevOp [w] = emitBRevCall res w (wordWidth dflags) + +-- Population count +emitPrimOp _ [res] PopCnt8Op [w] = emitPopCntCall res w W8 +emitPrimOp _ [res] PopCnt16Op [w] = emitPopCntCall res w W16 +emitPrimOp _ [res] PopCnt32Op [w] = emitPopCntCall res w W32 +emitPrimOp _ [res] PopCnt64Op [w] = emitPopCntCall res w W64 +emitPrimOp dflags [res] PopCntOp [w] = emitPopCntCall res w (wordWidth dflags) + +-- Parallel bit deposit +emitPrimOp _ [res] Pdep8Op [src, mask] = emitPdepCall res src mask W8 +emitPrimOp _ [res] Pdep16Op [src, mask] = emitPdepCall res src mask W16 +emitPrimOp _ [res] Pdep32Op [src, mask] = emitPdepCall res src mask W32 +emitPrimOp _ [res] Pdep64Op [src, mask] = emitPdepCall res src mask W64 +emitPrimOp dflags [res] PdepOp [src, mask] = emitPdepCall res src mask (wordWidth dflags) + +-- Parallel bit extract +emitPrimOp _ [res] Pext8Op [src, mask] = emitPextCall res src mask W8 +emitPrimOp _ [res] Pext16Op [src, mask] = emitPextCall res src mask W16 +emitPrimOp _ [res] Pext32Op [src, mask] = emitPextCall res src mask W32 +emitPrimOp _ [res] Pext64Op [src, mask] = emitPextCall res src mask W64 +emitPrimOp dflags [res] PextOp [src, mask] = emitPextCall res src mask (wordWidth dflags) + +-- count leading zeros +emitPrimOp _ [res] Clz8Op [w] = emitClzCall res w W8 +emitPrimOp _ [res] Clz16Op [w] = emitClzCall res w W16 +emitPrimOp _ [res] Clz32Op [w] = emitClzCall res w W32 +emitPrimOp _ [res] Clz64Op [w] = emitClzCall res w W64 +emitPrimOp dflags [res] ClzOp [w] = emitClzCall res w (wordWidth dflags) + +-- count trailing zeros +emitPrimOp _ [res] Ctz8Op [w] = emitCtzCall res w W8 +emitPrimOp _ [res] Ctz16Op [w] = emitCtzCall res w W16 +emitPrimOp _ [res] Ctz32Op [w] = emitCtzCall res w W32 +emitPrimOp _ [res] Ctz64Op [w] = emitCtzCall res w W64 +emitPrimOp dflags [res] CtzOp [w] = emitCtzCall res w (wordWidth dflags) + +-- Unsigned int to floating point conversions +emitPrimOp _ [res] Word2FloatOp [w] = emitPrimCall [res] + (MO_UF_Conv W32) [w] +emitPrimOp _ [res] Word2DoubleOp [w] = emitPrimCall [res] + (MO_UF_Conv W64) [w] + +-- SIMD primops +emitPrimOp dflags [res] (VecBroadcastOp vcat n w) [e] = do + checkVecCompatibility dflags vcat n w + doVecPackOp (vecElemInjectCast dflags vcat w) ty zeros (replicate n e) res + where + zeros :: CmmExpr + zeros = CmmLit $ CmmVec (replicate n zero) + + zero :: CmmLit + zero = case vcat of + IntVec -> CmmInt 0 w + WordVec -> CmmInt 0 w + FloatVec -> CmmFloat 0 w + + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags [res] (VecPackOp vcat n w) es = do + checkVecCompatibility dflags vcat n w + when (es `lengthIsNot` n) $ + panic "emitPrimOp: VecPackOp has wrong number of arguments" + doVecPackOp (vecElemInjectCast dflags vcat w) ty zeros es res + where + zeros :: CmmExpr + zeros = CmmLit $ CmmVec (replicate n zero) + + zero :: CmmLit + zero = case vcat of + IntVec -> CmmInt 0 w + WordVec -> CmmInt 0 w + FloatVec -> CmmFloat 0 w + + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecUnpackOp vcat n w) [arg] = do + checkVecCompatibility dflags vcat n w + when (res `lengthIsNot` n) $ + panic "emitPrimOp: VecUnpackOp has wrong number of results" + doVecUnpackOp (vecElemProjectCast dflags vcat w) ty arg res + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags [res] (VecInsertOp vcat n w) [v,e,i] = do + checkVecCompatibility dflags vcat n w + doVecInsertOp (vecElemInjectCast dflags vcat w) ty v e i res + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecIndexByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexByteArrayOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecReadByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexByteArrayOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecWriteByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doWriteByteArrayOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecIndexOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexOffAddrOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecReadOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexOffAddrOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecWriteOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doWriteOffAddrOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecIndexScalarByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexByteArrayOpAs Nothing vecty ty res args + where + vecty :: CmmType + vecty = vecVmmType vcat n w + + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecReadScalarByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexByteArrayOpAs Nothing vecty ty res args + where + vecty :: CmmType + vecty = vecVmmType vcat n w + + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecWriteScalarByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doWriteByteArrayOp Nothing ty res args + where + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecIndexScalarOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexOffAddrOpAs Nothing vecty ty res args + where + vecty :: CmmType + vecty = vecVmmType vcat n w + + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecReadScalarOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexOffAddrOpAs Nothing vecty ty res args + where + vecty :: CmmType + vecty = vecVmmType vcat n w + + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecWriteScalarOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doWriteOffAddrOp Nothing ty res args + where + ty :: CmmType + ty = vecCmmCat vcat w + +-- Prefetch +emitPrimOp _ [] PrefetchByteArrayOp3 args = doPrefetchByteArrayOp 3 args +emitPrimOp _ [] PrefetchMutableByteArrayOp3 args = doPrefetchMutableByteArrayOp 3 args +emitPrimOp _ [] PrefetchAddrOp3 args = doPrefetchAddrOp 3 args +emitPrimOp _ [] PrefetchValueOp3 args = doPrefetchValueOp 3 args + +emitPrimOp _ [] PrefetchByteArrayOp2 args = doPrefetchByteArrayOp 2 args +emitPrimOp _ [] PrefetchMutableByteArrayOp2 args = doPrefetchMutableByteArrayOp 2 args +emitPrimOp _ [] PrefetchAddrOp2 args = doPrefetchAddrOp 2 args +emitPrimOp _ [] PrefetchValueOp2 args = doPrefetchValueOp 2 args + +emitPrimOp _ [] PrefetchByteArrayOp1 args = doPrefetchByteArrayOp 1 args +emitPrimOp _ [] PrefetchMutableByteArrayOp1 args = doPrefetchMutableByteArrayOp 1 args +emitPrimOp _ [] PrefetchAddrOp1 args = doPrefetchAddrOp 1 args +emitPrimOp _ [] PrefetchValueOp1 args = doPrefetchValueOp 1 args + +emitPrimOp _ [] PrefetchByteArrayOp0 args = doPrefetchByteArrayOp 0 args +emitPrimOp _ [] PrefetchMutableByteArrayOp0 args = doPrefetchMutableByteArrayOp 0 args +emitPrimOp _ [] PrefetchAddrOp0 args = doPrefetchAddrOp 0 args +emitPrimOp _ [] PrefetchValueOp0 args = doPrefetchValueOp 0 args + +-- Atomic read-modify-write +emitPrimOp dflags [res] FetchAddByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Add mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchSubByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Sub mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchAndByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_And mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchNandByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Nand mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchOrByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Or mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchXorByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Xor mba ix (bWord dflags) n +emitPrimOp dflags [res] AtomicReadByteArrayOp_Int [mba, ix] = + doAtomicReadByteArray res mba ix (bWord dflags) +emitPrimOp dflags [] AtomicWriteByteArrayOp_Int [mba, ix, val] = + doAtomicWriteByteArray mba ix (bWord dflags) val +emitPrimOp dflags [res] CasByteArrayOp_Int [mba, ix, old, new] = + doCasByteArray res mba ix (bWord dflags) old new + +-- The rest just translate straightforwardly +emitPrimOp dflags [res] op [arg] + | nopOp op + = emitAssign (CmmLocal res) arg + + | Just (mop,rep) <- narrowOp op + = emitAssign (CmmLocal res) $ + CmmMachOp (mop rep (wordWidth dflags)) [CmmMachOp (mop (wordWidth dflags) rep) [arg]] + +emitPrimOp dflags r@[res] op args + | Just prim <- callishOp op + = do emitPrimCall r prim args + + | Just mop <- translateOp dflags op + = let stmt = mkAssign (CmmLocal res) (CmmMachOp mop args) in + emit stmt + +emitPrimOp dflags results op args + = case callishPrimOpSupported dflags op args of + Left op -> emit $ mkUnsafeCall (PrimTarget op) results args + Right gen -> gen results args + +-- Note [QuotRem optimization] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- `quot` and `rem` with constant divisor can be implemented with fast bit-ops +-- (shift, .&.). +-- +-- Currently we only support optimization (performed in CmmOpt) when the +-- constant is a power of 2. #9041 tracks the implementation of the general +-- optimization. +-- +-- `quotRem` can be optimized in the same way. However as it returns two values, +-- it is implemented as a "callish" primop which is harder to match and +-- to transform later on. For simplicity, the current implementation detects cases +-- that can be optimized (see `quotRemCanBeOptimized`) and converts STG quotRem +-- primop into two CMM quot and rem primops. + +type GenericOp = [CmmFormal] -> [CmmActual] -> FCode () + +callishPrimOpSupported :: DynFlags -> PrimOp -> [CmmExpr] -> Either CallishMachOp GenericOp +callishPrimOpSupported dflags op args + = case op of + IntQuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_S_QuotRem (wordWidth dflags)) + | otherwise + -> Right (genericIntQuotRemOp (wordWidth dflags)) + + Int8QuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_S_QuotRem W8) + | otherwise -> Right (genericIntQuotRemOp W8) + + Int16QuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_S_QuotRem W16) + | otherwise -> Right (genericIntQuotRemOp W16) + + + WordQuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_U_QuotRem (wordWidth dflags)) + | otherwise + -> Right (genericWordQuotRemOp (wordWidth dflags)) + + WordQuotRem2Op | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_U_QuotRem2 (wordWidth dflags)) + | otherwise -> Right (genericWordQuotRem2Op dflags) + + Word8QuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_U_QuotRem W8) + | otherwise -> Right (genericWordQuotRemOp W8) + + Word16QuotRemOp| ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_U_QuotRem W16) + | otherwise -> Right (genericWordQuotRemOp W16) + + WordAdd2Op | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_Add2 (wordWidth dflags)) + | otherwise -> Right genericWordAdd2Op + + WordAddCOp | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_AddWordC (wordWidth dflags)) + | otherwise -> Right genericWordAddCOp + + WordSubCOp | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_SubWordC (wordWidth dflags)) + | otherwise -> Right genericWordSubCOp + + IntAddCOp | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_AddIntC (wordWidth dflags)) + | otherwise -> Right genericIntAddCOp + + IntSubCOp | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_SubIntC (wordWidth dflags)) + | otherwise -> Right genericIntSubCOp + + WordMul2Op | ncg && (x86ish || ppc) + || llvm -> Left (MO_U_Mul2 (wordWidth dflags)) + | otherwise -> Right genericWordMul2Op + FloatFabsOp | (ncg && x86ish || ppc) + || llvm -> Left MO_F32_Fabs + | otherwise -> Right $ genericFabsOp W32 + DoubleFabsOp | (ncg && x86ish || ppc) + || llvm -> Left MO_F64_Fabs + | otherwise -> Right $ genericFabsOp W64 + + _ -> pprPanic "emitPrimOp: can't translate PrimOp " (ppr op) + where + -- See Note [QuotRem optimization] + quotRemCanBeOptimized = case args of + [_, CmmLit (CmmInt n _) ] -> isJust (exactLog2 n) + _ -> False + + ncg = case hscTarget dflags of + HscAsm -> True + _ -> False + llvm = case hscTarget dflags of + HscLlvm -> True + _ -> False + x86ish = case platformArch (targetPlatform dflags) of + ArchX86 -> True + ArchX86_64 -> True + _ -> False + ppc = case platformArch (targetPlatform dflags) of + ArchPPC -> True + ArchPPC_64 _ -> True + _ -> False + +genericIntQuotRemOp :: Width -> GenericOp +genericIntQuotRemOp width [res_q, res_r] [arg_x, arg_y] + = emit $ mkAssign (CmmLocal res_q) + (CmmMachOp (MO_S_Quot width) [arg_x, arg_y]) <*> + mkAssign (CmmLocal res_r) + (CmmMachOp (MO_S_Rem width) [arg_x, arg_y]) +genericIntQuotRemOp _ _ _ = panic "genericIntQuotRemOp" + +genericWordQuotRemOp :: Width -> GenericOp +genericWordQuotRemOp width [res_q, res_r] [arg_x, arg_y] + = emit $ mkAssign (CmmLocal res_q) + (CmmMachOp (MO_U_Quot width) [arg_x, arg_y]) <*> + mkAssign (CmmLocal res_r) + (CmmMachOp (MO_U_Rem width) [arg_x, arg_y]) +genericWordQuotRemOp _ _ _ = panic "genericWordQuotRemOp" + +genericWordQuotRem2Op :: DynFlags -> GenericOp +genericWordQuotRem2Op dflags [res_q, res_r] [arg_x_high, arg_x_low, arg_y] + = emit =<< f (widthInBits (wordWidth dflags)) zero arg_x_high arg_x_low + where ty = cmmExprType dflags arg_x_high + shl x i = CmmMachOp (MO_Shl (wordWidth dflags)) [x, i] + shr x i = CmmMachOp (MO_U_Shr (wordWidth dflags)) [x, i] + or x y = CmmMachOp (MO_Or (wordWidth dflags)) [x, y] + ge x y = CmmMachOp (MO_U_Ge (wordWidth dflags)) [x, y] + ne x y = CmmMachOp (MO_Ne (wordWidth dflags)) [x, y] + minus x y = CmmMachOp (MO_Sub (wordWidth dflags)) [x, y] + times x y = CmmMachOp (MO_Mul (wordWidth dflags)) [x, y] + zero = lit 0 + one = lit 1 + negone = lit (fromIntegral (widthInBits (wordWidth dflags)) - 1) + lit i = CmmLit (CmmInt i (wordWidth dflags)) + + f :: Int -> CmmExpr -> CmmExpr -> CmmExpr -> FCode CmmAGraph + f 0 acc high _ = return (mkAssign (CmmLocal res_q) acc <*> + mkAssign (CmmLocal res_r) high) + f i acc high low = + do roverflowedBit <- newTemp ty + rhigh' <- newTemp ty + rhigh'' <- newTemp ty + rlow' <- newTemp ty + risge <- newTemp ty + racc' <- newTemp ty + let high' = CmmReg (CmmLocal rhigh') + isge = CmmReg (CmmLocal risge) + overflowedBit = CmmReg (CmmLocal roverflowedBit) + let this = catAGraphs + [mkAssign (CmmLocal roverflowedBit) + (shr high negone), + mkAssign (CmmLocal rhigh') + (or (shl high one) (shr low negone)), + mkAssign (CmmLocal rlow') + (shl low one), + mkAssign (CmmLocal risge) + (or (overflowedBit `ne` zero) + (high' `ge` arg_y)), + mkAssign (CmmLocal rhigh'') + (high' `minus` (arg_y `times` isge)), + mkAssign (CmmLocal racc') + (or (shl acc one) isge)] + rest <- f (i - 1) (CmmReg (CmmLocal racc')) + (CmmReg (CmmLocal rhigh'')) + (CmmReg (CmmLocal rlow')) + return (this <*> rest) +genericWordQuotRem2Op _ _ _ = panic "genericWordQuotRem2Op" + +genericWordAdd2Op :: GenericOp +genericWordAdd2Op [res_h, res_l] [arg_x, arg_y] + = do dflags <- getDynFlags + r1 <- newTemp (cmmExprType dflags arg_x) + r2 <- newTemp (cmmExprType dflags arg_x) + let topHalf x = CmmMachOp (MO_U_Shr (wordWidth dflags)) [x, hww] + toTopHalf x = CmmMachOp (MO_Shl (wordWidth dflags)) [x, hww] + bottomHalf x = CmmMachOp (MO_And (wordWidth dflags)) [x, hwm] + add x y = CmmMachOp (MO_Add (wordWidth dflags)) [x, y] + or x y = CmmMachOp (MO_Or (wordWidth dflags)) [x, y] + hww = CmmLit (CmmInt (fromIntegral (widthInBits (halfWordWidth dflags))) + (wordWidth dflags)) + hwm = CmmLit (CmmInt (halfWordMask dflags) (wordWidth dflags)) + emit $ catAGraphs + [mkAssign (CmmLocal r1) + (add (bottomHalf arg_x) (bottomHalf arg_y)), + mkAssign (CmmLocal r2) + (add (topHalf (CmmReg (CmmLocal r1))) + (add (topHalf arg_x) (topHalf arg_y))), + mkAssign (CmmLocal res_h) + (topHalf (CmmReg (CmmLocal r2))), + mkAssign (CmmLocal res_l) + (or (toTopHalf (CmmReg (CmmLocal r2))) + (bottomHalf (CmmReg (CmmLocal r1))))] +genericWordAdd2Op _ _ = panic "genericWordAdd2Op" + +-- | Implements branchless recovery of the carry flag @c@ by checking the +-- leftmost bits of both inputs @a@ and @b@ and result @r = a + b@: +-- +-- @ +-- c = a&b | (a|b)&~r +-- @ +-- +-- https://brodowsky.it-sky.net/2015/04/02/how-to-recover-the-carry-bit/ +genericWordAddCOp :: GenericOp +genericWordAddCOp [res_r, res_c] [aa, bb] + = do dflags <- getDynFlags + emit $ catAGraphs [ + mkAssign (CmmLocal res_r) (CmmMachOp (mo_wordAdd dflags) [aa,bb]), + mkAssign (CmmLocal res_c) $ + CmmMachOp (mo_wordUShr dflags) [ + CmmMachOp (mo_wordOr dflags) [ + CmmMachOp (mo_wordAnd dflags) [aa,bb], + CmmMachOp (mo_wordAnd dflags) [ + CmmMachOp (mo_wordOr dflags) [aa,bb], + CmmMachOp (mo_wordNot dflags) [CmmReg (CmmLocal res_r)] + ] + ], + mkIntExpr dflags (wORD_SIZE_IN_BITS dflags - 1) + ] + ] +genericWordAddCOp _ _ = panic "genericWordAddCOp" + +-- | Implements branchless recovery of the carry flag @c@ by checking the +-- leftmost bits of both inputs @a@ and @b@ and result @r = a - b@: +-- +-- @ +-- c = ~a&b | (~a|b)&r +-- @ +-- +-- https://brodowsky.it-sky.net/2015/04/02/how-to-recover-the-carry-bit/ +genericWordSubCOp :: GenericOp +genericWordSubCOp [res_r, res_c] [aa, bb] + = do dflags <- getDynFlags + emit $ catAGraphs [ + mkAssign (CmmLocal res_r) (CmmMachOp (mo_wordSub dflags) [aa,bb]), + mkAssign (CmmLocal res_c) $ + CmmMachOp (mo_wordUShr dflags) [ + CmmMachOp (mo_wordOr dflags) [ + CmmMachOp (mo_wordAnd dflags) [ + CmmMachOp (mo_wordNot dflags) [aa], + bb + ], + CmmMachOp (mo_wordAnd dflags) [ + CmmMachOp (mo_wordOr dflags) [ + CmmMachOp (mo_wordNot dflags) [aa], + bb + ], + CmmReg (CmmLocal res_r) + ] + ], + mkIntExpr dflags (wORD_SIZE_IN_BITS dflags - 1) + ] + ] +genericWordSubCOp _ _ = panic "genericWordSubCOp" + +genericIntAddCOp :: GenericOp +genericIntAddCOp [res_r, res_c] [aa, bb] +{- + With some bit-twiddling, we can define int{Add,Sub}Czh portably in + C, and without needing any comparisons. This may not be the + fastest way to do it - if you have better code, please send it! --SDM + + Return : r = a + b, c = 0 if no overflow, 1 on overflow. + + We currently don't make use of the r value if c is != 0 (i.e. + overflow), we just convert to big integers and try again. This + could be improved by making r and c the correct values for + plugging into a new J#. + + { r = ((I_)(a)) + ((I_)(b)); \ + c = ((StgWord)(~(((I_)(a))^((I_)(b))) & (((I_)(a))^r))) \ + >> (BITS_IN (I_) - 1); \ + } + Wading through the mass of bracketry, it seems to reduce to: + c = ( (~(a^b)) & (a^r) ) >>unsigned (BITS_IN(I_)-1) + +-} + = do dflags <- getDynFlags + emit $ catAGraphs [ + mkAssign (CmmLocal res_r) (CmmMachOp (mo_wordAdd dflags) [aa,bb]), + mkAssign (CmmLocal res_c) $ + CmmMachOp (mo_wordUShr dflags) [ + CmmMachOp (mo_wordAnd dflags) [ + CmmMachOp (mo_wordNot dflags) [CmmMachOp (mo_wordXor dflags) [aa,bb]], + CmmMachOp (mo_wordXor dflags) [aa, CmmReg (CmmLocal res_r)] + ], + mkIntExpr dflags (wORD_SIZE_IN_BITS dflags - 1) + ] + ] +genericIntAddCOp _ _ = panic "genericIntAddCOp" + +genericIntSubCOp :: GenericOp +genericIntSubCOp [res_r, res_c] [aa, bb] +{- Similarly: + #define subIntCzh(r,c,a,b) \ + { r = ((I_)(a)) - ((I_)(b)); \ + c = ((StgWord)((((I_)(a))^((I_)(b))) & (((I_)(a))^r))) \ + >> (BITS_IN (I_) - 1); \ + } + + c = ((a^b) & (a^r)) >>unsigned (BITS_IN(I_)-1) +-} + = do dflags <- getDynFlags + emit $ catAGraphs [ + mkAssign (CmmLocal res_r) (CmmMachOp (mo_wordSub dflags) [aa,bb]), + mkAssign (CmmLocal res_c) $ + CmmMachOp (mo_wordUShr dflags) [ + CmmMachOp (mo_wordAnd dflags) [ + CmmMachOp (mo_wordXor dflags) [aa,bb], + CmmMachOp (mo_wordXor dflags) [aa, CmmReg (CmmLocal res_r)] + ], + mkIntExpr dflags (wORD_SIZE_IN_BITS dflags - 1) + ] + ] +genericIntSubCOp _ _ = panic "genericIntSubCOp" + +genericWordMul2Op :: GenericOp +genericWordMul2Op [res_h, res_l] [arg_x, arg_y] + = do dflags <- getDynFlags + let t = cmmExprType dflags arg_x + xlyl <- liftM CmmLocal $ newTemp t + xlyh <- liftM CmmLocal $ newTemp t + xhyl <- liftM CmmLocal $ newTemp t + r <- liftM CmmLocal $ newTemp t + -- This generic implementation is very simple and slow. We might + -- well be able to do better, but for now this at least works. + let topHalf x = CmmMachOp (MO_U_Shr (wordWidth dflags)) [x, hww] + toTopHalf x = CmmMachOp (MO_Shl (wordWidth dflags)) [x, hww] + bottomHalf x = CmmMachOp (MO_And (wordWidth dflags)) [x, hwm] + add x y = CmmMachOp (MO_Add (wordWidth dflags)) [x, y] + sum = foldl1 add + mul x y = CmmMachOp (MO_Mul (wordWidth dflags)) [x, y] + or x y = CmmMachOp (MO_Or (wordWidth dflags)) [x, y] + hww = CmmLit (CmmInt (fromIntegral (widthInBits (halfWordWidth dflags))) + (wordWidth dflags)) + hwm = CmmLit (CmmInt (halfWordMask dflags) (wordWidth dflags)) + emit $ catAGraphs + [mkAssign xlyl + (mul (bottomHalf arg_x) (bottomHalf arg_y)), + mkAssign xlyh + (mul (bottomHalf arg_x) (topHalf arg_y)), + mkAssign xhyl + (mul (topHalf arg_x) (bottomHalf arg_y)), + mkAssign r + (sum [topHalf (CmmReg xlyl), + bottomHalf (CmmReg xhyl), + bottomHalf (CmmReg xlyh)]), + mkAssign (CmmLocal res_l) + (or (bottomHalf (CmmReg xlyl)) + (toTopHalf (CmmReg r))), + mkAssign (CmmLocal res_h) + (sum [mul (topHalf arg_x) (topHalf arg_y), + topHalf (CmmReg xhyl), + topHalf (CmmReg xlyh), + topHalf (CmmReg r)])] +genericWordMul2Op _ _ = panic "genericWordMul2Op" + +-- This replicates what we had in libraries/base/GHC/Float.hs: +-- +-- abs x | x == 0 = 0 -- handles (-0.0) +-- | x > 0 = x +-- | otherwise = negateFloat x +genericFabsOp :: Width -> GenericOp +genericFabsOp w [res_r] [aa] + = do dflags <- getDynFlags + let zero = CmmLit (CmmFloat 0 w) + + eq x y = CmmMachOp (MO_F_Eq w) [x, y] + gt x y = CmmMachOp (MO_F_Gt w) [x, y] + + neg x = CmmMachOp (MO_F_Neg w) [x] + + g1 = catAGraphs [mkAssign (CmmLocal res_r) zero] + g2 = catAGraphs [mkAssign (CmmLocal res_r) aa] + + res_t <- CmmLocal <$> newTemp (cmmExprType dflags aa) + let g3 = catAGraphs [mkAssign res_t aa, + mkAssign (CmmLocal res_r) (neg (CmmReg res_t))] + + g4 <- mkCmmIfThenElse (gt aa zero) g2 g3 + + emit =<< mkCmmIfThenElse (eq aa zero) g1 g4 + +genericFabsOp _ _ _ = panic "genericFabsOp" + +-- These PrimOps are NOPs in Cmm + +nopOp :: PrimOp -> Bool +nopOp Int2WordOp = True +nopOp Word2IntOp = True +nopOp Int2AddrOp = True +nopOp Addr2IntOp = True +nopOp ChrOp = True -- Int# and Char# are rep'd the same +nopOp OrdOp = True +nopOp _ = False + +-- These PrimOps turn into double casts + +narrowOp :: PrimOp -> Maybe (Width -> Width -> MachOp, Width) +narrowOp Narrow8IntOp = Just (MO_SS_Conv, W8) +narrowOp Narrow16IntOp = Just (MO_SS_Conv, W16) +narrowOp Narrow32IntOp = Just (MO_SS_Conv, W32) +narrowOp Narrow8WordOp = Just (MO_UU_Conv, W8) +narrowOp Narrow16WordOp = Just (MO_UU_Conv, W16) +narrowOp Narrow32WordOp = Just (MO_UU_Conv, W32) +narrowOp _ = Nothing + +-- Native word signless ops + +translateOp :: DynFlags -> PrimOp -> Maybe MachOp +translateOp dflags IntAddOp = Just (mo_wordAdd dflags) +translateOp dflags IntSubOp = Just (mo_wordSub dflags) +translateOp dflags WordAddOp = Just (mo_wordAdd dflags) +translateOp dflags WordSubOp = Just (mo_wordSub dflags) +translateOp dflags AddrAddOp = Just (mo_wordAdd dflags) +translateOp dflags AddrSubOp = Just (mo_wordSub dflags) + +translateOp dflags IntEqOp = Just (mo_wordEq dflags) +translateOp dflags IntNeOp = Just (mo_wordNe dflags) +translateOp dflags WordEqOp = Just (mo_wordEq dflags) +translateOp dflags WordNeOp = Just (mo_wordNe dflags) +translateOp dflags AddrEqOp = Just (mo_wordEq dflags) +translateOp dflags AddrNeOp = Just (mo_wordNe dflags) + +translateOp dflags AndOp = Just (mo_wordAnd dflags) +translateOp dflags OrOp = Just (mo_wordOr dflags) +translateOp dflags XorOp = Just (mo_wordXor dflags) +translateOp dflags NotOp = Just (mo_wordNot dflags) +translateOp dflags SllOp = Just (mo_wordShl dflags) +translateOp dflags SrlOp = Just (mo_wordUShr dflags) + +translateOp dflags AddrRemOp = Just (mo_wordURem dflags) + +-- Native word signed ops + +translateOp dflags IntMulOp = Just (mo_wordMul dflags) +translateOp dflags IntMulMayOfloOp = Just (MO_S_MulMayOflo (wordWidth dflags)) +translateOp dflags IntQuotOp = Just (mo_wordSQuot dflags) +translateOp dflags IntRemOp = Just (mo_wordSRem dflags) +translateOp dflags IntNegOp = Just (mo_wordSNeg dflags) + + +translateOp dflags IntGeOp = Just (mo_wordSGe dflags) +translateOp dflags IntLeOp = Just (mo_wordSLe dflags) +translateOp dflags IntGtOp = Just (mo_wordSGt dflags) +translateOp dflags IntLtOp = Just (mo_wordSLt dflags) + +translateOp dflags AndIOp = Just (mo_wordAnd dflags) +translateOp dflags OrIOp = Just (mo_wordOr dflags) +translateOp dflags XorIOp = Just (mo_wordXor dflags) +translateOp dflags NotIOp = Just (mo_wordNot dflags) +translateOp dflags ISllOp = Just (mo_wordShl dflags) +translateOp dflags ISraOp = Just (mo_wordSShr dflags) +translateOp dflags ISrlOp = Just (mo_wordUShr dflags) + +-- Native word unsigned ops + +translateOp dflags WordGeOp = Just (mo_wordUGe dflags) +translateOp dflags WordLeOp = Just (mo_wordULe dflags) +translateOp dflags WordGtOp = Just (mo_wordUGt dflags) +translateOp dflags WordLtOp = Just (mo_wordULt dflags) + +translateOp dflags WordMulOp = Just (mo_wordMul dflags) +translateOp dflags WordQuotOp = Just (mo_wordUQuot dflags) +translateOp dflags WordRemOp = Just (mo_wordURem dflags) + +translateOp dflags AddrGeOp = Just (mo_wordUGe dflags) +translateOp dflags AddrLeOp = Just (mo_wordULe dflags) +translateOp dflags AddrGtOp = Just (mo_wordUGt dflags) +translateOp dflags AddrLtOp = Just (mo_wordULt dflags) + +-- Int8# signed ops + +translateOp dflags Int8Extend = Just (MO_SS_Conv W8 (wordWidth dflags)) +translateOp dflags Int8Narrow = Just (MO_SS_Conv (wordWidth dflags) W8) +translateOp _ Int8NegOp = Just (MO_S_Neg W8) +translateOp _ Int8AddOp = Just (MO_Add W8) +translateOp _ Int8SubOp = Just (MO_Sub W8) +translateOp _ Int8MulOp = Just (MO_Mul W8) +translateOp _ Int8QuotOp = Just (MO_S_Quot W8) +translateOp _ Int8RemOp = Just (MO_S_Rem W8) + +translateOp _ Int8EqOp = Just (MO_Eq W8) +translateOp _ Int8GeOp = Just (MO_S_Ge W8) +translateOp _ Int8GtOp = Just (MO_S_Gt W8) +translateOp _ Int8LeOp = Just (MO_S_Le W8) +translateOp _ Int8LtOp = Just (MO_S_Lt W8) +translateOp _ Int8NeOp = Just (MO_Ne W8) + +-- Word8# unsigned ops + +translateOp dflags Word8Extend = Just (MO_UU_Conv W8 (wordWidth dflags)) +translateOp dflags Word8Narrow = Just (MO_UU_Conv (wordWidth dflags) W8) +translateOp _ Word8NotOp = Just (MO_Not W8) +translateOp _ Word8AddOp = Just (MO_Add W8) +translateOp _ Word8SubOp = Just (MO_Sub W8) +translateOp _ Word8MulOp = Just (MO_Mul W8) +translateOp _ Word8QuotOp = Just (MO_U_Quot W8) +translateOp _ Word8RemOp = Just (MO_U_Rem W8) + +translateOp _ Word8EqOp = Just (MO_Eq W8) +translateOp _ Word8GeOp = Just (MO_U_Ge W8) +translateOp _ Word8GtOp = Just (MO_U_Gt W8) +translateOp _ Word8LeOp = Just (MO_U_Le W8) +translateOp _ Word8LtOp = Just (MO_U_Lt W8) +translateOp _ Word8NeOp = Just (MO_Ne W8) + +-- Int16# signed ops + +translateOp dflags Int16Extend = Just (MO_SS_Conv W16 (wordWidth dflags)) +translateOp dflags Int16Narrow = Just (MO_SS_Conv (wordWidth dflags) W16) +translateOp _ Int16NegOp = Just (MO_S_Neg W16) +translateOp _ Int16AddOp = Just (MO_Add W16) +translateOp _ Int16SubOp = Just (MO_Sub W16) +translateOp _ Int16MulOp = Just (MO_Mul W16) +translateOp _ Int16QuotOp = Just (MO_S_Quot W16) +translateOp _ Int16RemOp = Just (MO_S_Rem W16) + +translateOp _ Int16EqOp = Just (MO_Eq W16) +translateOp _ Int16GeOp = Just (MO_S_Ge W16) +translateOp _ Int16GtOp = Just (MO_S_Gt W16) +translateOp _ Int16LeOp = Just (MO_S_Le W16) +translateOp _ Int16LtOp = Just (MO_S_Lt W16) +translateOp _ Int16NeOp = Just (MO_Ne W16) + +-- Word16# unsigned ops + +translateOp dflags Word16Extend = Just (MO_UU_Conv W16 (wordWidth dflags)) +translateOp dflags Word16Narrow = Just (MO_UU_Conv (wordWidth dflags) W16) +translateOp _ Word16NotOp = Just (MO_Not W16) +translateOp _ Word16AddOp = Just (MO_Add W16) +translateOp _ Word16SubOp = Just (MO_Sub W16) +translateOp _ Word16MulOp = Just (MO_Mul W16) +translateOp _ Word16QuotOp = Just (MO_U_Quot W16) +translateOp _ Word16RemOp = Just (MO_U_Rem W16) + +translateOp _ Word16EqOp = Just (MO_Eq W16) +translateOp _ Word16GeOp = Just (MO_U_Ge W16) +translateOp _ Word16GtOp = Just (MO_U_Gt W16) +translateOp _ Word16LeOp = Just (MO_U_Le W16) +translateOp _ Word16LtOp = Just (MO_U_Lt W16) +translateOp _ Word16NeOp = Just (MO_Ne W16) + +-- Char# ops + +translateOp dflags CharEqOp = Just (MO_Eq (wordWidth dflags)) +translateOp dflags CharNeOp = Just (MO_Ne (wordWidth dflags)) +translateOp dflags CharGeOp = Just (MO_U_Ge (wordWidth dflags)) +translateOp dflags CharLeOp = Just (MO_U_Le (wordWidth dflags)) +translateOp dflags CharGtOp = Just (MO_U_Gt (wordWidth dflags)) +translateOp dflags CharLtOp = Just (MO_U_Lt (wordWidth dflags)) + +-- Double ops + +translateOp _ DoubleEqOp = Just (MO_F_Eq W64) +translateOp _ DoubleNeOp = Just (MO_F_Ne W64) +translateOp _ DoubleGeOp = Just (MO_F_Ge W64) +translateOp _ DoubleLeOp = Just (MO_F_Le W64) +translateOp _ DoubleGtOp = Just (MO_F_Gt W64) +translateOp _ DoubleLtOp = Just (MO_F_Lt W64) + +translateOp _ DoubleAddOp = Just (MO_F_Add W64) +translateOp _ DoubleSubOp = Just (MO_F_Sub W64) +translateOp _ DoubleMulOp = Just (MO_F_Mul W64) +translateOp _ DoubleDivOp = Just (MO_F_Quot W64) +translateOp _ DoubleNegOp = Just (MO_F_Neg W64) + +-- Float ops + +translateOp _ FloatEqOp = Just (MO_F_Eq W32) +translateOp _ FloatNeOp = Just (MO_F_Ne W32) +translateOp _ FloatGeOp = Just (MO_F_Ge W32) +translateOp _ FloatLeOp = Just (MO_F_Le W32) +translateOp _ FloatGtOp = Just (MO_F_Gt W32) +translateOp _ FloatLtOp = Just (MO_F_Lt W32) + +translateOp _ FloatAddOp = Just (MO_F_Add W32) +translateOp _ FloatSubOp = Just (MO_F_Sub W32) +translateOp _ FloatMulOp = Just (MO_F_Mul W32) +translateOp _ FloatDivOp = Just (MO_F_Quot W32) +translateOp _ FloatNegOp = Just (MO_F_Neg W32) + +-- Vector ops + +translateOp _ (VecAddOp FloatVec n w) = Just (MO_VF_Add n w) +translateOp _ (VecSubOp FloatVec n w) = Just (MO_VF_Sub n w) +translateOp _ (VecMulOp FloatVec n w) = Just (MO_VF_Mul n w) +translateOp _ (VecDivOp FloatVec n w) = Just (MO_VF_Quot n w) +translateOp _ (VecNegOp FloatVec n w) = Just (MO_VF_Neg n w) + +translateOp _ (VecAddOp IntVec n w) = Just (MO_V_Add n w) +translateOp _ (VecSubOp IntVec n w) = Just (MO_V_Sub n w) +translateOp _ (VecMulOp IntVec n w) = Just (MO_V_Mul n w) +translateOp _ (VecQuotOp IntVec n w) = Just (MO_VS_Quot n w) +translateOp _ (VecRemOp IntVec n w) = Just (MO_VS_Rem n w) +translateOp _ (VecNegOp IntVec n w) = Just (MO_VS_Neg n w) + +translateOp _ (VecAddOp WordVec n w) = Just (MO_V_Add n w) +translateOp _ (VecSubOp WordVec n w) = Just (MO_V_Sub n w) +translateOp _ (VecMulOp WordVec n w) = Just (MO_V_Mul n w) +translateOp _ (VecQuotOp WordVec n w) = Just (MO_VU_Quot n w) +translateOp _ (VecRemOp WordVec n w) = Just (MO_VU_Rem n w) + +-- Conversions + +translateOp dflags Int2DoubleOp = Just (MO_SF_Conv (wordWidth dflags) W64) +translateOp dflags Double2IntOp = Just (MO_FS_Conv W64 (wordWidth dflags)) + +translateOp dflags Int2FloatOp = Just (MO_SF_Conv (wordWidth dflags) W32) +translateOp dflags Float2IntOp = Just (MO_FS_Conv W32 (wordWidth dflags)) + +translateOp _ Float2DoubleOp = Just (MO_FF_Conv W32 W64) +translateOp _ Double2FloatOp = Just (MO_FF_Conv W64 W32) + +-- Word comparisons masquerading as more exotic things. + +translateOp dflags SameMutVarOp = Just (mo_wordEq dflags) +translateOp dflags SameMVarOp = Just (mo_wordEq dflags) +translateOp dflags SameMutableArrayOp = Just (mo_wordEq dflags) +translateOp dflags SameMutableByteArrayOp = Just (mo_wordEq dflags) +translateOp dflags SameMutableArrayArrayOp= Just (mo_wordEq dflags) +translateOp dflags SameSmallMutableArrayOp= Just (mo_wordEq dflags) +translateOp dflags SameTVarOp = Just (mo_wordEq dflags) +translateOp dflags EqStablePtrOp = Just (mo_wordEq dflags) +-- See Note [Comparing stable names] +translateOp dflags EqStableNameOp = Just (mo_wordEq dflags) + +translateOp _ _ = Nothing + +-- Note [Comparing stable names] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- A StableName# is actually a pointer to a stable name object (SNO) +-- containing an index into the stable name table (SNT). We +-- used to compare StableName#s by following the pointers to the +-- SNOs and checking whether they held the same SNT indices. However, +-- this is not necessary: there is a one-to-one correspondence +-- between SNOs and entries in the SNT, so simple pointer equality +-- does the trick. + +-- These primops are implemented by CallishMachOps, because they sometimes +-- turn into foreign calls depending on the backend. + +callishOp :: PrimOp -> Maybe CallishMachOp +callishOp DoublePowerOp = Just MO_F64_Pwr +callishOp DoubleSinOp = Just MO_F64_Sin +callishOp DoubleCosOp = Just MO_F64_Cos +callishOp DoubleTanOp = Just MO_F64_Tan +callishOp DoubleSinhOp = Just MO_F64_Sinh +callishOp DoubleCoshOp = Just MO_F64_Cosh +callishOp DoubleTanhOp = Just MO_F64_Tanh +callishOp DoubleAsinOp = Just MO_F64_Asin +callishOp DoubleAcosOp = Just MO_F64_Acos +callishOp DoubleAtanOp = Just MO_F64_Atan +callishOp DoubleAsinhOp = Just MO_F64_Asinh +callishOp DoubleAcoshOp = Just MO_F64_Acosh +callishOp DoubleAtanhOp = Just MO_F64_Atanh +callishOp DoubleLogOp = Just MO_F64_Log +callishOp DoubleLog1POp = Just MO_F64_Log1P +callishOp DoubleExpOp = Just MO_F64_Exp +callishOp DoubleExpM1Op = Just MO_F64_ExpM1 +callishOp DoubleSqrtOp = Just MO_F64_Sqrt + +callishOp FloatPowerOp = Just MO_F32_Pwr +callishOp FloatSinOp = Just MO_F32_Sin +callishOp FloatCosOp = Just MO_F32_Cos +callishOp FloatTanOp = Just MO_F32_Tan +callishOp FloatSinhOp = Just MO_F32_Sinh +callishOp FloatCoshOp = Just MO_F32_Cosh +callishOp FloatTanhOp = Just MO_F32_Tanh +callishOp FloatAsinOp = Just MO_F32_Asin +callishOp FloatAcosOp = Just MO_F32_Acos +callishOp FloatAtanOp = Just MO_F32_Atan +callishOp FloatAsinhOp = Just MO_F32_Asinh +callishOp FloatAcoshOp = Just MO_F32_Acosh +callishOp FloatAtanhOp = Just MO_F32_Atanh +callishOp FloatLogOp = Just MO_F32_Log +callishOp FloatLog1POp = Just MO_F32_Log1P +callishOp FloatExpOp = Just MO_F32_Exp +callishOp FloatExpM1Op = Just MO_F32_ExpM1 +callishOp FloatSqrtOp = Just MO_F32_Sqrt + +callishOp _ = Nothing + +------------------------------------------------------------------------------ +-- Helpers for translating various minor variants of array indexing. + +doIndexOffAddrOp :: Maybe MachOp + -> CmmType + -> [LocalReg] + -> [CmmExpr] + -> FCode () +doIndexOffAddrOp maybe_post_read_cast rep [res] [addr,idx] + = mkBasicIndexedRead 0 maybe_post_read_cast rep res addr rep idx +doIndexOffAddrOp _ _ _ _ + = panic "GHC.StgToCmm.Prim: doIndexOffAddrOp" + +doIndexOffAddrOpAs :: Maybe MachOp + -> CmmType + -> CmmType + -> [LocalReg] + -> [CmmExpr] + -> FCode () +doIndexOffAddrOpAs maybe_post_read_cast rep idx_rep [res] [addr,idx] + = mkBasicIndexedRead 0 maybe_post_read_cast rep res addr idx_rep idx +doIndexOffAddrOpAs _ _ _ _ _ + = panic "GHC.StgToCmm.Prim: doIndexOffAddrOpAs" + +doIndexByteArrayOp :: Maybe MachOp + -> CmmType + -> [LocalReg] + -> [CmmExpr] + -> FCode () +doIndexByteArrayOp maybe_post_read_cast rep [res] [addr,idx] + = do dflags <- getDynFlags + mkBasicIndexedRead (arrWordsHdrSize dflags) maybe_post_read_cast rep res addr rep idx +doIndexByteArrayOp _ _ _ _ + = panic "GHC.StgToCmm.Prim: doIndexByteArrayOp" + +doIndexByteArrayOpAs :: Maybe MachOp + -> CmmType + -> CmmType + -> [LocalReg] + -> [CmmExpr] + -> FCode () +doIndexByteArrayOpAs maybe_post_read_cast rep idx_rep [res] [addr,idx] + = do dflags <- getDynFlags + mkBasicIndexedRead (arrWordsHdrSize dflags) maybe_post_read_cast rep res addr idx_rep idx +doIndexByteArrayOpAs _ _ _ _ _ + = panic "GHC.StgToCmm.Prim: doIndexByteArrayOpAs" + +doReadPtrArrayOp :: LocalReg + -> CmmExpr + -> CmmExpr + -> FCode () +doReadPtrArrayOp res addr idx + = do dflags <- getDynFlags + mkBasicIndexedRead (arrPtrsHdrSize dflags) Nothing (gcWord dflags) res addr (gcWord dflags) idx + +doWriteOffAddrOp :: Maybe MachOp + -> CmmType + -> [LocalReg] + -> [CmmExpr] + -> FCode () +doWriteOffAddrOp maybe_pre_write_cast idx_ty [] [addr,idx,val] + = mkBasicIndexedWrite 0 maybe_pre_write_cast addr idx_ty idx val +doWriteOffAddrOp _ _ _ _ + = panic "GHC.StgToCmm.Prim: doWriteOffAddrOp" + +doWriteByteArrayOp :: Maybe MachOp + -> CmmType + -> [LocalReg] + -> [CmmExpr] + -> FCode () +doWriteByteArrayOp maybe_pre_write_cast idx_ty [] [addr,idx,val] + = do dflags <- getDynFlags + mkBasicIndexedWrite (arrWordsHdrSize dflags) maybe_pre_write_cast addr idx_ty idx val +doWriteByteArrayOp _ _ _ _ + = panic "GHC.StgToCmm.Prim: doWriteByteArrayOp" + +doWritePtrArrayOp :: CmmExpr + -> CmmExpr + -> CmmExpr + -> FCode () +doWritePtrArrayOp addr idx val + = do dflags <- getDynFlags + let ty = cmmExprType dflags val + -- This write barrier is to ensure that the heap writes to the object + -- referred to by val have happened before we write val into the array. + -- See #12469 for details. + emitPrimCall [] MO_WriteBarrier [] + mkBasicIndexedWrite (arrPtrsHdrSize dflags) Nothing addr ty idx val + emit (setInfo addr (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel))) + -- the write barrier. We must write a byte into the mark table: + -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N] + emit $ mkStore ( + cmmOffsetExpr dflags + (cmmOffsetExprW dflags (cmmOffsetB dflags addr (arrPtrsHdrSize dflags)) + (loadArrPtrsSize dflags addr)) + (CmmMachOp (mo_wordUShr dflags) [idx, + mkIntExpr dflags (mUT_ARR_PTRS_CARD_BITS dflags)]) + ) (CmmLit (CmmInt 1 W8)) + +loadArrPtrsSize :: DynFlags -> CmmExpr -> CmmExpr +loadArrPtrsSize dflags addr = CmmLoad (cmmOffsetB dflags addr off) (bWord dflags) + where off = fixedHdrSize dflags + oFFSET_StgMutArrPtrs_ptrs dflags + +mkBasicIndexedRead :: ByteOff -- Initial offset in bytes + -> Maybe MachOp -- Optional result cast + -> CmmType -- Type of element we are accessing + -> LocalReg -- Destination + -> CmmExpr -- Base address + -> CmmType -- Type of element by which we are indexing + -> CmmExpr -- Index + -> FCode () +mkBasicIndexedRead off Nothing ty res base idx_ty idx + = do dflags <- getDynFlags + emitAssign (CmmLocal res) (cmmLoadIndexOffExpr dflags off ty base idx_ty idx) +mkBasicIndexedRead off (Just cast) ty res base idx_ty idx + = do dflags <- getDynFlags + emitAssign (CmmLocal res) (CmmMachOp cast [ + cmmLoadIndexOffExpr dflags off ty base idx_ty idx]) + +mkBasicIndexedWrite :: ByteOff -- Initial offset in bytes + -> Maybe MachOp -- Optional value cast + -> CmmExpr -- Base address + -> CmmType -- Type of element by which we are indexing + -> CmmExpr -- Index + -> CmmExpr -- Value to write + -> FCode () +mkBasicIndexedWrite off Nothing base idx_ty idx val + = do dflags <- getDynFlags + emitStore (cmmIndexOffExpr dflags off (typeWidth idx_ty) base idx) val +mkBasicIndexedWrite off (Just cast) base idx_ty idx val + = mkBasicIndexedWrite off Nothing base idx_ty idx (CmmMachOp cast [val]) + +-- ---------------------------------------------------------------------------- +-- Misc utils + +cmmIndexOffExpr :: DynFlags + -> ByteOff -- Initial offset in bytes + -> Width -- Width of element by which we are indexing + -> CmmExpr -- Base address + -> CmmExpr -- Index + -> CmmExpr +cmmIndexOffExpr dflags off width base idx + = cmmIndexExpr dflags width (cmmOffsetB dflags base off) idx + +cmmLoadIndexOffExpr :: DynFlags + -> ByteOff -- Initial offset in bytes + -> CmmType -- Type of element we are accessing + -> CmmExpr -- Base address + -> CmmType -- Type of element by which we are indexing + -> CmmExpr -- Index + -> CmmExpr +cmmLoadIndexOffExpr dflags off ty base idx_ty idx + = CmmLoad (cmmIndexOffExpr dflags off (typeWidth idx_ty) base idx) ty + +setInfo :: CmmExpr -> CmmExpr -> CmmAGraph +setInfo closure_ptr info_ptr = mkStore closure_ptr info_ptr + +------------------------------------------------------------------------------ +-- Helpers for translating vector primops. + +vecVmmType :: PrimOpVecCat -> Length -> Width -> CmmType +vecVmmType pocat n w = vec n (vecCmmCat pocat w) + +vecCmmCat :: PrimOpVecCat -> Width -> CmmType +vecCmmCat IntVec = cmmBits +vecCmmCat WordVec = cmmBits +vecCmmCat FloatVec = cmmFloat + +vecElemInjectCast :: DynFlags -> PrimOpVecCat -> Width -> Maybe MachOp +vecElemInjectCast _ FloatVec _ = Nothing +vecElemInjectCast dflags IntVec W8 = Just (mo_WordTo8 dflags) +vecElemInjectCast dflags IntVec W16 = Just (mo_WordTo16 dflags) +vecElemInjectCast dflags IntVec W32 = Just (mo_WordTo32 dflags) +vecElemInjectCast _ IntVec W64 = Nothing +vecElemInjectCast dflags WordVec W8 = Just (mo_WordTo8 dflags) +vecElemInjectCast dflags WordVec W16 = Just (mo_WordTo16 dflags) +vecElemInjectCast dflags WordVec W32 = Just (mo_WordTo32 dflags) +vecElemInjectCast _ WordVec W64 = Nothing +vecElemInjectCast _ _ _ = Nothing + +vecElemProjectCast :: DynFlags -> PrimOpVecCat -> Width -> Maybe MachOp +vecElemProjectCast _ FloatVec _ = Nothing +vecElemProjectCast dflags IntVec W8 = Just (mo_s_8ToWord dflags) +vecElemProjectCast dflags IntVec W16 = Just (mo_s_16ToWord dflags) +vecElemProjectCast dflags IntVec W32 = Just (mo_s_32ToWord dflags) +vecElemProjectCast _ IntVec W64 = Nothing +vecElemProjectCast dflags WordVec W8 = Just (mo_u_8ToWord dflags) +vecElemProjectCast dflags WordVec W16 = Just (mo_u_16ToWord dflags) +vecElemProjectCast dflags WordVec W32 = Just (mo_u_32ToWord dflags) +vecElemProjectCast _ WordVec W64 = Nothing +vecElemProjectCast _ _ _ = Nothing + + +-- NOTE [SIMD Design for the future] +-- Check to make sure that we can generate code for the specified vector type +-- given the current set of dynamic flags. +-- Currently these checks are specific to x86 and x86_64 architecture. +-- This should be fixed! +-- In particular, +-- 1) Add better support for other architectures! (this may require a redesign) +-- 2) Decouple design choices from LLVM's pseudo SIMD model! +-- The high level LLVM naive rep makes per CPU family SIMD generation is own +-- optimization problem, and hides important differences in eg ARM vs x86_64 simd +-- 3) Depending on the architecture, the SIMD registers may also support general +-- computations on Float/Double/Word/Int scalars, but currently on +-- for example x86_64, we always put Word/Int (or sized) in GPR +-- (general purpose) registers. Would relaxing that allow for +-- useful optimization opportunities? +-- Phrased differently, it is worth experimenting with supporting +-- different register mapping strategies than we currently have, especially if +-- someday we want SIMD to be a first class denizen in GHC along with scalar +-- values! +-- The current design with respect to register mapping of scalars could +-- very well be the best,but exploring the design space and doing careful +-- measurments is the only only way to validate that. +-- In some next generation CPU ISAs, notably RISC V, the SIMD extension +-- includes support for a sort of run time CPU dependent vectorization parameter, +-- where a loop may act upon a single scalar each iteration OR some 2,4,8 ... +-- element chunk! Time will tell if that direction sees wide adoption, +-- but it is from that context that unifying our handling of simd and scalars +-- may benefit. It is not likely to benefit current architectures, though +-- it may very well be a design perspective that helps guide improving the NCG. + + +checkVecCompatibility :: DynFlags -> PrimOpVecCat -> Length -> Width -> FCode () +checkVecCompatibility dflags vcat l w = do + when (hscTarget dflags /= HscLlvm) $ do + sorry $ unlines ["SIMD vector instructions require the LLVM back-end." + ,"Please use -fllvm."] + check vecWidth vcat l w + where + check :: Width -> PrimOpVecCat -> Length -> Width -> FCode () + check W128 FloatVec 4 W32 | not (isSseEnabled dflags) = + sorry $ "128-bit wide single-precision floating point " ++ + "SIMD vector instructions require at least -msse." + check W128 _ _ _ | not (isSse2Enabled dflags) = + sorry $ "128-bit wide integer and double precision " ++ + "SIMD vector instructions require at least -msse2." + check W256 FloatVec _ _ | not (isAvxEnabled dflags) = + sorry $ "256-bit wide floating point " ++ + "SIMD vector instructions require at least -mavx." + check W256 _ _ _ | not (isAvx2Enabled dflags) = + sorry $ "256-bit wide integer " ++ + "SIMD vector instructions require at least -mavx2." + check W512 _ _ _ | not (isAvx512fEnabled dflags) = + sorry $ "512-bit wide " ++ + "SIMD vector instructions require -mavx512f." + check _ _ _ _ = return () + + vecWidth = typeWidth (vecVmmType vcat l w) + +------------------------------------------------------------------------------ +-- Helpers for translating vector packing and unpacking. + +doVecPackOp :: Maybe MachOp -- Cast from element to vector component + -> CmmType -- Type of vector + -> CmmExpr -- Initial vector + -> [CmmExpr] -- Elements + -> CmmFormal -- Destination for result + -> FCode () +doVecPackOp maybe_pre_write_cast ty z es res = do + dst <- newTemp ty + emitAssign (CmmLocal dst) z + vecPack dst es 0 + where + vecPack :: CmmFormal -> [CmmExpr] -> Int -> FCode () + vecPack src [] _ = + emitAssign (CmmLocal res) (CmmReg (CmmLocal src)) + + vecPack src (e : es) i = do + dst <- newTemp ty + if isFloatType (vecElemType ty) + then emitAssign (CmmLocal dst) (CmmMachOp (MO_VF_Insert len wid) + [CmmReg (CmmLocal src), cast e, iLit]) + else emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid) + [CmmReg (CmmLocal src), cast e, iLit]) + vecPack dst es (i + 1) + where + -- vector indices are always 32-bits + iLit = CmmLit (CmmInt (toInteger i) W32) + + cast :: CmmExpr -> CmmExpr + cast val = case maybe_pre_write_cast of + Nothing -> val + Just cast -> CmmMachOp cast [val] + + len :: Length + len = vecLength ty + + wid :: Width + wid = typeWidth (vecElemType ty) + +doVecUnpackOp :: Maybe MachOp -- Cast from vector component to element result + -> CmmType -- Type of vector + -> CmmExpr -- Vector + -> [CmmFormal] -- Element results + -> FCode () +doVecUnpackOp maybe_post_read_cast ty e res = + vecUnpack res 0 + where + vecUnpack :: [CmmFormal] -> Int -> FCode () + vecUnpack [] _ = + return () + + vecUnpack (r : rs) i = do + if isFloatType (vecElemType ty) + then emitAssign (CmmLocal r) (cast (CmmMachOp (MO_VF_Extract len wid) + [e, iLit])) + else emitAssign (CmmLocal r) (cast (CmmMachOp (MO_V_Extract len wid) + [e, iLit])) + vecUnpack rs (i + 1) + where + -- vector indices are always 32-bits + iLit = CmmLit (CmmInt (toInteger i) W32) + + cast :: CmmExpr -> CmmExpr + cast val = case maybe_post_read_cast of + Nothing -> val + Just cast -> CmmMachOp cast [val] + + len :: Length + len = vecLength ty + + wid :: Width + wid = typeWidth (vecElemType ty) + +doVecInsertOp :: Maybe MachOp -- Cast from element to vector component + -> CmmType -- Vector type + -> CmmExpr -- Source vector + -> CmmExpr -- Element + -> CmmExpr -- Index at which to insert element + -> CmmFormal -- Destination for result + -> FCode () +doVecInsertOp maybe_pre_write_cast ty src e idx res = do + dflags <- getDynFlags + -- vector indices are always 32-bits + let idx' :: CmmExpr + idx' = CmmMachOp (MO_SS_Conv (wordWidth dflags) W32) [idx] + if isFloatType (vecElemType ty) + then emitAssign (CmmLocal res) (CmmMachOp (MO_VF_Insert len wid) [src, cast e, idx']) + else emitAssign (CmmLocal res) (CmmMachOp (MO_V_Insert len wid) [src, cast e, idx']) + where + cast :: CmmExpr -> CmmExpr + cast val = case maybe_pre_write_cast of + Nothing -> val + Just cast -> CmmMachOp cast [val] + + len :: Length + len = vecLength ty + + wid :: Width + wid = typeWidth (vecElemType ty) + +------------------------------------------------------------------------------ +-- Helpers for translating prefetching. + + +-- | Translate byte array prefetch operations into proper primcalls. +doPrefetchByteArrayOp :: Int + -> [CmmExpr] + -> FCode () +doPrefetchByteArrayOp locality [addr,idx] + = do dflags <- getDynFlags + mkBasicPrefetch locality (arrWordsHdrSize dflags) addr idx +doPrefetchByteArrayOp _ _ + = panic "GHC.StgToCmm.Prim: doPrefetchByteArrayOp" + +-- | Translate mutable byte array prefetch operations into proper primcalls. +doPrefetchMutableByteArrayOp :: Int + -> [CmmExpr] + -> FCode () +doPrefetchMutableByteArrayOp locality [addr,idx] + = do dflags <- getDynFlags + mkBasicPrefetch locality (arrWordsHdrSize dflags) addr idx +doPrefetchMutableByteArrayOp _ _ + = panic "GHC.StgToCmm.Prim: doPrefetchByteArrayOp" + +-- | Translate address prefetch operations into proper primcalls. +doPrefetchAddrOp ::Int + -> [CmmExpr] + -> FCode () +doPrefetchAddrOp locality [addr,idx] + = mkBasicPrefetch locality 0 addr idx +doPrefetchAddrOp _ _ + = panic "GHC.StgToCmm.Prim: doPrefetchAddrOp" + +-- | Translate value prefetch operations into proper primcalls. +doPrefetchValueOp :: Int + -> [CmmExpr] + -> FCode () +doPrefetchValueOp locality [addr] + = do dflags <- getDynFlags + mkBasicPrefetch locality 0 addr (CmmLit (CmmInt 0 (wordWidth dflags))) +doPrefetchValueOp _ _ + = panic "GHC.StgToCmm.Prim: doPrefetchValueOp" + +-- | helper to generate prefetch primcalls +mkBasicPrefetch :: Int -- Locality level 0-3 + -> ByteOff -- Initial offset in bytes + -> CmmExpr -- Base address + -> CmmExpr -- Index + -> FCode () +mkBasicPrefetch locality off base idx + = do dflags <- getDynFlags + emitPrimCall [] (MO_Prefetch_Data locality) [cmmIndexExpr dflags W8 (cmmOffsetB dflags base off) idx] + return () + +-- ---------------------------------------------------------------------------- +-- Allocating byte arrays + +-- | Takes a register to return the newly allocated array in and the +-- size of the new array in bytes. Allocates a new +-- 'MutableByteArray#'. +doNewByteArrayOp :: CmmFormal -> ByteOff -> FCode () +doNewByteArrayOp res_r n = do + dflags <- getDynFlags + + let info_ptr = mkLblExpr mkArrWords_infoLabel + rep = arrWordsRep dflags n + + tickyAllocPrim (mkIntExpr dflags (arrWordsHdrSize dflags)) + (mkIntExpr dflags (nonHdrSize dflags rep)) + (zeroExpr dflags) + + let hdr_size = fixedHdrSize dflags + + base <- allocHeapClosure rep info_ptr cccsExpr + [ (mkIntExpr dflags n, + hdr_size + oFFSET_StgArrBytes_bytes dflags) + ] + + emit $ mkAssign (CmmLocal res_r) base + +-- ---------------------------------------------------------------------------- +-- Comparing byte arrays + +doCompareByteArraysOp :: LocalReg -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr + -> FCode () +doCompareByteArraysOp res ba1 ba1_off ba2 ba2_off n = do + dflags <- getDynFlags + ba1_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba1 (arrWordsHdrSize dflags)) ba1_off + ba2_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba2 (arrWordsHdrSize dflags)) ba2_off + + -- short-cut in case of equal pointers avoiding a costly + -- subroutine call to the memcmp(3) routine; the Cmm logic below + -- results in assembly code being generated for + -- + -- cmpPrefix10 :: ByteArray# -> ByteArray# -> Int# + -- cmpPrefix10 ba1 ba2 = compareByteArrays# ba1 0# ba2 0# 10# + -- + -- that looks like + -- + -- leaq 16(%r14),%rax + -- leaq 16(%rsi),%rbx + -- xorl %ecx,%ecx + -- cmpq %rbx,%rax + -- je l_ptr_eq + -- + -- ; NB: the common case (unequal pointers) falls-through + -- ; the conditional jump, and therefore matches the + -- ; usual static branch prediction convention of modern cpus + -- + -- subq $8,%rsp + -- movq %rbx,%rsi + -- movq %rax,%rdi + -- movl $10,%edx + -- xorl %eax,%eax + -- call memcmp + -- addq $8,%rsp + -- movslq %eax,%rax + -- movq %rax,%rcx + -- l_ptr_eq: + -- movq %rcx,%rbx + -- jmp *(%rbp) + + l_ptr_eq <- newBlockId + l_ptr_ne <- newBlockId + + emit (mkAssign (CmmLocal res) (zeroExpr dflags)) + emit (mkCbranch (cmmEqWord dflags ba1_p ba2_p) + l_ptr_eq l_ptr_ne (Just False)) + + emitLabel l_ptr_ne + emitMemcmpCall res ba1_p ba2_p n 1 + + emitLabel l_ptr_eq + +-- ---------------------------------------------------------------------------- +-- Copying byte arrays + +-- | Takes a source 'ByteArray#', an offset in the source array, a +-- destination 'MutableByteArray#', an offset into the destination +-- array, and the number of bytes to copy. Copies the given number of +-- bytes from the source array to the destination array. +doCopyByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr + -> FCode () +doCopyByteArrayOp = emitCopyByteArray copy + where + -- Copy data (we assume the arrays aren't overlapping since + -- they're of different types) + copy _src _dst dst_p src_p bytes align = + emitMemcpyCall dst_p src_p bytes align + +-- | Takes a source 'MutableByteArray#', an offset in the source +-- array, a destination 'MutableByteArray#', an offset into the +-- destination array, and the number of bytes to copy. Copies the +-- given number of bytes from the source array to the destination +-- array. +doCopyMutableByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr + -> FCode () +doCopyMutableByteArrayOp = emitCopyByteArray copy + where + -- The only time the memory might overlap is when the two arrays + -- we were provided are the same array! + -- TODO: Optimize branch for common case of no aliasing. + copy src dst dst_p src_p bytes align = do + dflags <- getDynFlags + (moveCall, cpyCall) <- forkAltPair + (getCode $ emitMemmoveCall dst_p src_p bytes align) + (getCode $ emitMemcpyCall dst_p src_p bytes align) + emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall + +emitCopyByteArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr + -> Alignment -> FCode ()) + -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr + -> FCode () +emitCopyByteArray copy src src_off dst dst_off n = do + dflags <- getDynFlags + let byteArrayAlignment = wordAlignment dflags + srcOffAlignment = cmmExprAlignment src_off + dstOffAlignment = cmmExprAlignment dst_off + align = minimum [byteArrayAlignment, srcOffAlignment, dstOffAlignment] + dst_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags dst (arrWordsHdrSize dflags)) dst_off + src_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags src (arrWordsHdrSize dflags)) src_off + copy src dst dst_p src_p n align + +-- | Takes a source 'ByteArray#', an offset in the source array, a +-- destination 'Addr#', and the number of bytes to copy. Copies the given +-- number of bytes from the source array to the destination memory region. +doCopyByteArrayToAddrOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> FCode () +doCopyByteArrayToAddrOp src src_off dst_p bytes = do + -- Use memcpy (we are allowed to assume the arrays aren't overlapping) + dflags <- getDynFlags + src_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags src (arrWordsHdrSize dflags)) src_off + emitMemcpyCall dst_p src_p bytes (mkAlignment 1) + +-- | Takes a source 'MutableByteArray#', an offset in the source array, a +-- destination 'Addr#', and the number of bytes to copy. Copies the given +-- number of bytes from the source array to the destination memory region. +doCopyMutableByteArrayToAddrOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr + -> FCode () +doCopyMutableByteArrayToAddrOp = doCopyByteArrayToAddrOp + +-- | Takes a source 'Addr#', a destination 'MutableByteArray#', an offset into +-- the destination array, and the number of bytes to copy. Copies the given +-- number of bytes from the source memory region to the destination array. +doCopyAddrToByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> FCode () +doCopyAddrToByteArrayOp src_p dst dst_off bytes = do + -- Use memcpy (we are allowed to assume the arrays aren't overlapping) + dflags <- getDynFlags + dst_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags dst (arrWordsHdrSize dflags)) dst_off + emitMemcpyCall dst_p src_p bytes (mkAlignment 1) + + +-- ---------------------------------------------------------------------------- +-- Setting byte arrays + +-- | Takes a 'MutableByteArray#', an offset into the array, a length, +-- and a byte, and sets each of the selected bytes in the array to the +-- character. +doSetByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr + -> FCode () +doSetByteArrayOp ba off len c = do + dflags <- getDynFlags + + let byteArrayAlignment = wordAlignment dflags -- known since BA is allocated on heap + offsetAlignment = cmmExprAlignment off + align = min byteArrayAlignment offsetAlignment + + p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba (arrWordsHdrSize dflags)) off + emitMemsetCall p c len align + +-- ---------------------------------------------------------------------------- +-- Allocating arrays + +-- | Allocate a new array. +doNewArrayOp :: CmmFormal -- ^ return register + -> SMRep -- ^ representation of the array + -> CLabel -- ^ info pointer + -> [(CmmExpr, ByteOff)] -- ^ header payload + -> WordOff -- ^ array size + -> CmmExpr -- ^ initial element + -> FCode () +doNewArrayOp res_r rep info payload n init = do + dflags <- getDynFlags + + let info_ptr = mkLblExpr info + + tickyAllocPrim (mkIntExpr dflags (hdrSize dflags rep)) + (mkIntExpr dflags (nonHdrSize dflags rep)) + (zeroExpr dflags) + + base <- allocHeapClosure rep info_ptr cccsExpr payload + + arr <- CmmLocal `fmap` newTemp (bWord dflags) + emit $ mkAssign arr base + + -- Initialise all elements of the array + let mkOff off = cmmOffsetW dflags (CmmReg arr) (hdrSizeW dflags rep + off) + initialization = [ mkStore (mkOff off) init | off <- [0.. n - 1] ] + emit (catAGraphs initialization) + + emit $ mkAssign (CmmLocal res_r) (CmmReg arr) + +-- ---------------------------------------------------------------------------- +-- Copying pointer arrays + +-- EZY: This code has an unusually high amount of assignTemp calls, seen +-- nowhere else in the code generator. This is mostly because these +-- "primitive" ops result in a surprisingly large amount of code. It +-- will likely be worthwhile to optimize what is emitted here, so that +-- our optimization passes don't waste time repeatedly optimizing the +-- same bits of code. + +-- More closely imitates 'assignTemp' from the old code generator, which +-- returns a CmmExpr rather than a LocalReg. +assignTempE :: CmmExpr -> FCode CmmExpr +assignTempE e = do + t <- assignTemp e + return (CmmReg (CmmLocal t)) + +-- | Takes a source 'Array#', an offset in the source array, a +-- destination 'MutableArray#', an offset into the destination array, +-- and the number of elements to copy. Copies the given number of +-- elements from the source array to the destination array. +doCopyArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> WordOff + -> FCode () +doCopyArrayOp = emitCopyArray copy + where + -- Copy data (we assume the arrays aren't overlapping since + -- they're of different types) + copy _src _dst dst_p src_p bytes = + do dflags <- getDynFlags + emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes) + (wordAlignment dflags) + + +-- | Takes a source 'MutableArray#', an offset in the source array, a +-- destination 'MutableArray#', an offset into the destination array, +-- and the number of elements to copy. Copies the given number of +-- elements from the source array to the destination array. +doCopyMutableArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> WordOff + -> FCode () +doCopyMutableArrayOp = emitCopyArray copy + where + -- The only time the memory might overlap is when the two arrays + -- we were provided are the same array! + -- TODO: Optimize branch for common case of no aliasing. + copy src dst dst_p src_p bytes = do + dflags <- getDynFlags + (moveCall, cpyCall) <- forkAltPair + (getCode $ emitMemmoveCall dst_p src_p (mkIntExpr dflags bytes) + (wordAlignment dflags)) + (getCode $ emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes) + (wordAlignment dflags)) + emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall + +emitCopyArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> ByteOff + -> FCode ()) -- ^ copy function + -> CmmExpr -- ^ source array + -> CmmExpr -- ^ offset in source array + -> CmmExpr -- ^ destination array + -> CmmExpr -- ^ offset in destination array + -> WordOff -- ^ number of elements to copy + -> FCode () +emitCopyArray copy src0 src_off dst0 dst_off0 n = + when (n /= 0) $ do + dflags <- getDynFlags + + -- Passed as arguments (be careful) + src <- assignTempE src0 + dst <- assignTempE dst0 + dst_off <- assignTempE dst_off0 + + -- Set the dirty bit in the header. + emit (setInfo dst (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel))) + + dst_elems_p <- assignTempE $ cmmOffsetB dflags dst + (arrPtrsHdrSize dflags) + dst_p <- assignTempE $ cmmOffsetExprW dflags dst_elems_p dst_off + src_p <- assignTempE $ cmmOffsetExprW dflags + (cmmOffsetB dflags src (arrPtrsHdrSize dflags)) src_off + let bytes = wordsToBytes dflags n + + copy src dst dst_p src_p bytes + + -- The base address of the destination card table + dst_cards_p <- assignTempE $ cmmOffsetExprW dflags dst_elems_p + (loadArrPtrsSize dflags dst) + + emitSetCards dst_off dst_cards_p n + +doCopySmallArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> WordOff + -> FCode () +doCopySmallArrayOp = emitCopySmallArray copy + where + -- Copy data (we assume the arrays aren't overlapping since + -- they're of different types) + copy _src _dst dst_p src_p bytes = + do dflags <- getDynFlags + emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes) + (wordAlignment dflags) + + +doCopySmallMutableArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> WordOff + -> FCode () +doCopySmallMutableArrayOp = emitCopySmallArray copy + where + -- The only time the memory might overlap is when the two arrays + -- we were provided are the same array! + -- TODO: Optimize branch for common case of no aliasing. + copy src dst dst_p src_p bytes = do + dflags <- getDynFlags + (moveCall, cpyCall) <- forkAltPair + (getCode $ emitMemmoveCall dst_p src_p (mkIntExpr dflags bytes) + (wordAlignment dflags)) + (getCode $ emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes) + (wordAlignment dflags)) + emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall + +emitCopySmallArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> ByteOff + -> FCode ()) -- ^ copy function + -> CmmExpr -- ^ source array + -> CmmExpr -- ^ offset in source array + -> CmmExpr -- ^ destination array + -> CmmExpr -- ^ offset in destination array + -> WordOff -- ^ number of elements to copy + -> FCode () +emitCopySmallArray copy src0 src_off dst0 dst_off n = + when (n /= 0) $ do + dflags <- getDynFlags + + -- Passed as arguments (be careful) + src <- assignTempE src0 + dst <- assignTempE dst0 + + -- Set the dirty bit in the header. + emit (setInfo dst (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel))) + + dst_p <- assignTempE $ cmmOffsetExprW dflags + (cmmOffsetB dflags dst (smallArrPtrsHdrSize dflags)) dst_off + src_p <- assignTempE $ cmmOffsetExprW dflags + (cmmOffsetB dflags src (smallArrPtrsHdrSize dflags)) src_off + let bytes = wordsToBytes dflags n + + copy src dst dst_p src_p bytes + +-- | Takes an info table label, a register to return the newly +-- allocated array in, a source array, an offset in the source array, +-- and the number of elements to copy. Allocates a new array and +-- initializes it from the source array. +emitCloneArray :: CLabel -> CmmFormal -> CmmExpr -> CmmExpr -> WordOff + -> FCode () +emitCloneArray info_p res_r src src_off n = do + dflags <- getDynFlags + + let info_ptr = mkLblExpr info_p + rep = arrPtrsRep dflags n + + tickyAllocPrim (mkIntExpr dflags (arrPtrsHdrSize dflags)) + (mkIntExpr dflags (nonHdrSize dflags rep)) + (zeroExpr dflags) + + let hdr_size = fixedHdrSize dflags + + base <- allocHeapClosure rep info_ptr cccsExpr + [ (mkIntExpr dflags n, + hdr_size + oFFSET_StgMutArrPtrs_ptrs dflags) + , (mkIntExpr dflags (nonHdrSizeW rep), + hdr_size + oFFSET_StgMutArrPtrs_size dflags) + ] + + arr <- CmmLocal `fmap` newTemp (bWord dflags) + emit $ mkAssign arr base + + dst_p <- assignTempE $ cmmOffsetB dflags (CmmReg arr) + (arrPtrsHdrSize dflags) + src_p <- assignTempE $ cmmOffsetExprW dflags src + (cmmAddWord dflags + (mkIntExpr dflags (arrPtrsHdrSizeW dflags)) src_off) + + emitMemcpyCall dst_p src_p (mkIntExpr dflags (wordsToBytes dflags n)) + (wordAlignment dflags) + + emit $ mkAssign (CmmLocal res_r) (CmmReg arr) + +-- | Takes an info table label, a register to return the newly +-- allocated array in, a source array, an offset in the source array, +-- and the number of elements to copy. Allocates a new array and +-- initializes it from the source array. +emitCloneSmallArray :: CLabel -> CmmFormal -> CmmExpr -> CmmExpr -> WordOff + -> FCode () +emitCloneSmallArray info_p res_r src src_off n = do + dflags <- getDynFlags + + let info_ptr = mkLblExpr info_p + rep = smallArrPtrsRep n + + tickyAllocPrim (mkIntExpr dflags (smallArrPtrsHdrSize dflags)) + (mkIntExpr dflags (nonHdrSize dflags rep)) + (zeroExpr dflags) + + let hdr_size = fixedHdrSize dflags + + base <- allocHeapClosure rep info_ptr cccsExpr + [ (mkIntExpr dflags n, + hdr_size + oFFSET_StgSmallMutArrPtrs_ptrs dflags) + ] + + arr <- CmmLocal `fmap` newTemp (bWord dflags) + emit $ mkAssign arr base + + dst_p <- assignTempE $ cmmOffsetB dflags (CmmReg arr) + (smallArrPtrsHdrSize dflags) + src_p <- assignTempE $ cmmOffsetExprW dflags src + (cmmAddWord dflags + (mkIntExpr dflags (smallArrPtrsHdrSizeW dflags)) src_off) + + emitMemcpyCall dst_p src_p (mkIntExpr dflags (wordsToBytes dflags n)) + (wordAlignment dflags) + + emit $ mkAssign (CmmLocal res_r) (CmmReg arr) + +-- | Takes and offset in the destination array, the base address of +-- the card table, and the number of elements affected (*not* the +-- number of cards). The number of elements may not be zero. +-- Marks the relevant cards as dirty. +emitSetCards :: CmmExpr -> CmmExpr -> WordOff -> FCode () +emitSetCards dst_start dst_cards_start n = do + dflags <- getDynFlags + start_card <- assignTempE $ cardCmm dflags dst_start + let end_card = cardCmm dflags + (cmmSubWord dflags + (cmmAddWord dflags dst_start (mkIntExpr dflags n)) + (mkIntExpr dflags 1)) + emitMemsetCall (cmmAddWord dflags dst_cards_start start_card) + (mkIntExpr dflags 1) + (cmmAddWord dflags (cmmSubWord dflags end_card start_card) (mkIntExpr dflags 1)) + (mkAlignment 1) -- no alignment (1 byte) + +-- Convert an element index to a card index +cardCmm :: DynFlags -> CmmExpr -> CmmExpr +cardCmm dflags i = + cmmUShrWord dflags i (mkIntExpr dflags (mUT_ARR_PTRS_CARD_BITS dflags)) + +------------------------------------------------------------------------------ +-- SmallArray PrimOp implementations + +doReadSmallPtrArrayOp :: LocalReg + -> CmmExpr + -> CmmExpr + -> FCode () +doReadSmallPtrArrayOp res addr idx = do + dflags <- getDynFlags + mkBasicIndexedRead (smallArrPtrsHdrSize dflags) Nothing (gcWord dflags) res addr + (gcWord dflags) idx + +doWriteSmallPtrArrayOp :: CmmExpr + -> CmmExpr + -> CmmExpr + -> FCode () +doWriteSmallPtrArrayOp addr idx val = do + dflags <- getDynFlags + let ty = cmmExprType dflags val + emitPrimCall [] MO_WriteBarrier [] -- #12469 + mkBasicIndexedWrite (smallArrPtrsHdrSize dflags) Nothing addr ty idx val + emit (setInfo addr (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel))) + +------------------------------------------------------------------------------ +-- Atomic read-modify-write + +-- | Emit an atomic modification to a byte array element. The result +-- reg contains that previous value of the element. Implies a full +-- memory barrier. +doAtomicRMW :: LocalReg -- ^ Result reg + -> AtomicMachOp -- ^ Atomic op (e.g. add) + -> CmmExpr -- ^ MutableByteArray# + -> CmmExpr -- ^ Index + -> CmmType -- ^ Type of element by which we are indexing + -> CmmExpr -- ^ Op argument (e.g. amount to add) + -> FCode () +doAtomicRMW res amop mba idx idx_ty n = do + dflags <- getDynFlags + let width = typeWidth idx_ty + addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags) + width mba idx + emitPrimCall + [ res ] + (MO_AtomicRMW width amop) + [ addr, n ] + +-- | Emit an atomic read to a byte array that acts as a memory barrier. +doAtomicReadByteArray + :: LocalReg -- ^ Result reg + -> CmmExpr -- ^ MutableByteArray# + -> CmmExpr -- ^ Index + -> CmmType -- ^ Type of element by which we are indexing + -> FCode () +doAtomicReadByteArray res mba idx idx_ty = do + dflags <- getDynFlags + let width = typeWidth idx_ty + addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags) + width mba idx + emitPrimCall + [ res ] + (MO_AtomicRead width) + [ addr ] + +-- | Emit an atomic write to a byte array that acts as a memory barrier. +doAtomicWriteByteArray + :: CmmExpr -- ^ MutableByteArray# + -> CmmExpr -- ^ Index + -> CmmType -- ^ Type of element by which we are indexing + -> CmmExpr -- ^ Value to write + -> FCode () +doAtomicWriteByteArray mba idx idx_ty val = do + dflags <- getDynFlags + let width = typeWidth idx_ty + addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags) + width mba idx + emitPrimCall + [ {- no results -} ] + (MO_AtomicWrite width) + [ addr, val ] + +doCasByteArray + :: LocalReg -- ^ Result reg + -> CmmExpr -- ^ MutableByteArray# + -> CmmExpr -- ^ Index + -> CmmType -- ^ Type of element by which we are indexing + -> CmmExpr -- ^ Old value + -> CmmExpr -- ^ New value + -> FCode () +doCasByteArray res mba idx idx_ty old new = do + dflags <- getDynFlags + let width = (typeWidth idx_ty) + addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags) + width mba idx + emitPrimCall + [ res ] + (MO_Cmpxchg width) + [ addr, old, new ] + +------------------------------------------------------------------------------ +-- Helpers for emitting function calls + +-- | Emit a call to @memcpy@. +emitMemcpyCall :: CmmExpr -> CmmExpr -> CmmExpr -> Alignment -> FCode () +emitMemcpyCall dst src n align = do + emitPrimCall + [ {-no results-} ] + (MO_Memcpy (alignmentBytes align)) + [ dst, src, n ] + +-- | Emit a call to @memmove@. +emitMemmoveCall :: CmmExpr -> CmmExpr -> CmmExpr -> Alignment -> FCode () +emitMemmoveCall dst src n align = do + emitPrimCall + [ {- no results -} ] + (MO_Memmove (alignmentBytes align)) + [ dst, src, n ] + +-- | Emit a call to @memset@. The second argument must fit inside an +-- unsigned char. +emitMemsetCall :: CmmExpr -> CmmExpr -> CmmExpr -> Alignment -> FCode () +emitMemsetCall dst c n align = do + emitPrimCall + [ {- no results -} ] + (MO_Memset (alignmentBytes align)) + [ dst, c, n ] + +emitMemcmpCall :: LocalReg -> CmmExpr -> CmmExpr -> CmmExpr -> Int -> FCode () +emitMemcmpCall res ptr1 ptr2 n align = do + -- 'MO_Memcmp' is assumed to return an 32bit 'CInt' because all + -- code-gens currently call out to the @memcmp(3)@ C function. + -- This was easier than moving the sign-extensions into + -- all the code-gens. + dflags <- getDynFlags + let is32Bit = typeWidth (localRegType res) == W32 + + cres <- if is32Bit + then return res + else newTemp b32 + + emitPrimCall + [ cres ] + (MO_Memcmp align) + [ ptr1, ptr2, n ] + + unless is32Bit $ do + emit $ mkAssign (CmmLocal res) + (CmmMachOp + (mo_s_32ToWord dflags) + [(CmmReg (CmmLocal cres))]) + +emitBSwapCall :: LocalReg -> CmmExpr -> Width -> FCode () +emitBSwapCall res x width = do + emitPrimCall + [ res ] + (MO_BSwap width) + [ x ] + +emitBRevCall :: LocalReg -> CmmExpr -> Width -> FCode () +emitBRevCall res x width = do + emitPrimCall + [ res ] + (MO_BRev width) + [ x ] + +emitPopCntCall :: LocalReg -> CmmExpr -> Width -> FCode () +emitPopCntCall res x width = do + emitPrimCall + [ res ] + (MO_PopCnt width) + [ x ] + +emitPdepCall :: LocalReg -> CmmExpr -> CmmExpr -> Width -> FCode () +emitPdepCall res x y width = do + emitPrimCall + [ res ] + (MO_Pdep width) + [ x, y ] + +emitPextCall :: LocalReg -> CmmExpr -> CmmExpr -> Width -> FCode () +emitPextCall res x y width = do + emitPrimCall + [ res ] + (MO_Pext width) + [ x, y ] + +emitClzCall :: LocalReg -> CmmExpr -> Width -> FCode () +emitClzCall res x width = do + emitPrimCall + [ res ] + (MO_Clz width) + [ x ] + +emitCtzCall :: LocalReg -> CmmExpr -> Width -> FCode () +emitCtzCall res x width = do + emitPrimCall + [ res ] + (MO_Ctz width) + [ x ] |