author    Abhiroop Sarkar <asiamgenius@gmail.com>      2018-09-27 15:28:46 -0400
committer Marge Bot <ben+marge-bot@smart-cactus.org>   2019-07-03 09:33:39 -0400
commit    acd795583625401c5554f8e04ec7efca18814011 (patch)
tree      545e529eed21e78592ff326d4ebf9804095ad2cb
parent    df3e5b744db29c085f5bc05f8b609197bcbf9b0c (diff)
Add support for SIMD operations in the NCG
This adds support for constructing vector types from Float#, Double#, etc. and performing arithmetic operations on them.

Cleaned-Up-By: Ben Gamari <ben@well-typed.com>
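As a usage illustration (not part of the patch): the primops below, exported from GHC.Exts, were previously lowered only by the LLVM backend; with this change they also go through the NCG when the appropriate SSE/AVX flags are passed. A minimal sketch, assuming an x86-64 target compiled with -fasm and -msse4:

    {-# LANGUAGE MagicHash, UnboxedTuples #-}
    import GHC.Exts

    main :: IO ()
    main =
      let v = broadcastFloatX4# 2.0#                    -- lowered via MO_VF_Broadcast
          w = packFloatX4# (# 1.0#, 2.0#, 3.0#, 4.0# #) -- lowered via MO_VF_Insert
          u = plusFloatX4# v w                          -- lowered via MO_VF_Add
      in case unpackFloatX4# u of                       -- lowered via MO_VF_Extract
           (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)  -- (3.0,4.0,5.0,6.0)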
-rw-r--r--  compiler/cmm/CmmCallConv.hs | 40
-rw-r--r--  compiler/cmm/CmmExpr.hs | 83
-rw-r--r--  compiler/cmm/CmmLint.hs | 10
-rw-r--r--  compiler/cmm/CmmMachOp.hs | 25
-rw-r--r--  compiler/cmm/CmmType.hs | 6
-rw-r--r--  compiler/cmm/PprC.hs | 4
-rw-r--r--  compiler/cmm/PprCmmExpr.hs | 6
-rw-r--r--  compiler/codeGen/CgUtils.hs | 42
-rw-r--r--  compiler/codeGen/StgCmmPrim.hs | 55
-rw-r--r--  compiler/llvmGen/LlvmCodeGen/Base.hs | 12
-rw-r--r--  compiler/llvmGen/LlvmCodeGen/CodeGen.hs | 8
-rw-r--r--  compiler/llvmGen/LlvmCodeGen/Regs.hs | 36
-rw-r--r--  compiler/main/DynFlags.hs | 3
-rw-r--r--  compiler/nativeGen/Format.hs | 55
-rw-r--r--  compiler/nativeGen/NCGMonad.hs | 1
-rw-r--r--  compiler/nativeGen/PPC/CodeGen.hs | 6
-rw-r--r--  compiler/nativeGen/PPC/Ppr.hs | 12
-rw-r--r--  compiler/nativeGen/Reg.hs | 14
-rw-r--r--  compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs | 1
-rw-r--r--  compiler/nativeGen/RegAlloc/Linear/FreeRegs.hs | 5
-rw-r--r--  compiler/nativeGen/RegAlloc/Linear/Main.hs | 6
-rw-r--r--  compiler/nativeGen/RegClass.hs | 19
-rw-r--r--  compiler/nativeGen/SPARC/Ppr.hs | 14
-rw-r--r--  compiler/nativeGen/SPARC/Regs.hs | 2
-rw-r--r--  compiler/nativeGen/X86/CodeGen.hs | 526
-rw-r--r--  compiler/nativeGen/X86/Instr.hs | 81
-rw-r--r--  compiler/nativeGen/X86/Ppr.hs | 150
-rw-r--r--  compiler/nativeGen/X86/RegInfo.hs | 3
-rw-r--r--  compiler/nativeGen/X86/Regs.hs | 1
-rw-r--r--  includes/CodeGen.Platform.hs | 72
-rw-r--r--  testsuite/tests/codeGen/should_run/all.T | 10
-rw-r--r--  testsuite/tests/codeGen/should_run/cgrun083.hs | 70
-rw-r--r--  testsuite/tests/codeGen/should_run/cgrun083.stdout | 2
-rw-r--r--  testsuite/tests/codeGen/should_run/simd000.hs | 21
-rw-r--r--  testsuite/tests/codeGen/should_run/simd000.stdout | 4
-rw-r--r--  testsuite/tests/codeGen/should_run/simd001.hs | 49
-rw-r--r--  testsuite/tests/codeGen/should_run/simd001.stdout | 6
-rw-r--r--  testsuite/tests/codeGen/should_run/simd002.hs | 33
-rw-r--r--  testsuite/tests/codeGen/should_run/simd002.stdout | 10
-rw-r--r--  testsuite/tests/codeGen/should_run/simd003.hs | 25
-rw-r--r--  testsuite/tests/codeGen/should_run/simd003.stdout | 2
-rw-r--r--  testsuite/tests/codeGen/should_run/simd004.hs | 95
-rw-r--r--  testsuite/tests/codeGen/should_run/simd004.stdout | 20
-rw-r--r--  testsuite/tests/codeGen/should_run/simd005.hs | 93
-rw-r--r--  testsuite/tests/codeGen/should_run/simd005.stdout | 20
45 files changed, 1512 insertions(+), 246 deletions(-)
diff --git a/compiler/cmm/CmmCallConv.hs b/compiler/cmm/CmmCallConv.hs
index 4e6a9d293a..6df910edfa 100644
--- a/compiler/cmm/CmmCallConv.hs
+++ b/compiler/cmm/CmmCallConv.hs
@@ -64,13 +64,20 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
assign_regs assts (r:rs) regs | isVecType ty = vec
| isFloatType ty = float
| otherwise = int
- where vec = case (w, regs) of
- (W128, (vs, fs, ds, ls, s:ss))
- | passVectorInReg W128 dflags -> k (RegisterParam (XmmReg s), (vs, fs, ds, ls, ss))
- (W256, (vs, fs, ds, ls, s:ss))
- | passVectorInReg W256 dflags -> k (RegisterParam (YmmReg s), (vs, fs, ds, ls, ss))
- (W512, (vs, fs, ds, ls, s:ss))
- | passVectorInReg W512 dflags -> k (RegisterParam (ZmmReg s), (vs, fs, ds, ls, ss))
+ where vec = case regs of
+ (vs, fs, ds, ls, s:ss)
+ | passVectorInReg w dflags
+ -> let elt_ty = vecElemType ty
+ reg_ty = if isFloatType elt_ty
+ then Float else Integer
+ reg_class = case w of
+ W128 -> XmmReg
+ W256 -> YmmReg
+ W512 -> ZmmReg
+ _ -> panic "CmmCallConv.assignArgumentsPos: Invalid vector width"
+ in k (RegisterParam
+ (reg_class s (vecLength ty) (typeWidth elt_ty) reg_ty),
+ (vs, fs, ds, ls, ss))
_ -> (assts, (r:rs))
float = case (w, regs) of
(W32, (vs, fs, ds, ls, s:ss))
@@ -89,6 +96,7 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
(_, (vs, fs, ds, l:ls, ss)) | widthInBits w > widthInBits (wordWidth dflags)
-> k (RegisterParam l, (vs, fs, ds, ls, ss))
_ -> (assts, (r:rs))
+
k (asst, regs') = assign_regs ((r, asst) : assts) rs regs'
ty = arg_ty r
w = typeWidth ty
@@ -202,11 +210,13 @@ nodeOnly = ([VanillaReg 1], [], [], [], [])
-- only use this functionality in hand-written C-- code in the RTS.
realArgRegsCover :: DynFlags -> [GlobalReg]
realArgRegsCover dflags
- | passFloatArgsInXmm dflags = map ($VGcPtr) (realVanillaRegs dflags) ++
- realLongRegs dflags ++
- map XmmReg (realXmmRegNos dflags)
- | otherwise = map ($VGcPtr) (realVanillaRegs dflags) ++
- realFloatRegs dflags ++
- realDoubleRegs dflags ++
- realLongRegs dflags ++
- map XmmReg (realXmmRegNos dflags)
+ | passFloatArgsInXmm dflags
+ = map ($VGcPtr) (realVanillaRegs dflags) ++
+ realLongRegs dflags ++
+ map (\x -> XmmReg x 2 W64 Integer) (realXmmRegNos dflags)
+ | otherwise
+ = map ($VGcPtr) (realVanillaRegs dflags) ++
+ realFloatRegs dflags ++
+ realDoubleRegs dflags ++
+ realLongRegs dflags ++
+ map (\x -> XmmReg x 2 W64 Integer) (realXmmRegNos dflags)
diff --git a/compiler/cmm/CmmExpr.hs b/compiler/cmm/CmmExpr.hs
index 901df5d908..79eaf8f89c 100644
--- a/compiler/cmm/CmmExpr.hs
+++ b/compiler/cmm/CmmExpr.hs
@@ -14,6 +14,7 @@ module CmmExpr
, currentTSOReg, currentNurseryReg, hpAllocReg, cccsReg
, node, baseReg
, VGcPtr(..)
+ , GlobalVecRegTy(..)
, DefinerOfRegs, UserOfRegs
, foldRegsDefd, foldRegsUsed
@@ -41,6 +42,7 @@ import Outputable (panic)
import Unique
import Data.Set (Set)
+import Data.Monoid ((<>))
import qualified Data.Set as Set
import BasicTypes (Alignment, mkAlignment, alignmentOf)
@@ -392,6 +394,7 @@ data VGcPtr = VGcPtr | VNonGcPtr deriving( Eq, Show )
-----------------------------------------------------------------------------
{-
Note [Overlapping global registers]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The backend might not faithfully implement the abstraction of the STG
machine with independent registers for different values of type
@@ -413,6 +416,26 @@ on a particular platform. The instance Eq GlobalReg is syntactic
equality of STG registers and does not take overlap into
account. However it is still used in UserOfRegs/DefinerOfRegs and
there are likely still bugs there, beware!
+
+
+Note [SIMD registers]
+~~~~~~~~~~~~~~~~~~~~~
+
+GHC's treatment of SIMD registers is heavily modelled after the x86_64
+architecture. Namely we have 128- (XMM), 256- (YMM), and 512-bit (ZMM)
+registers. Furthermore, we treat each possible format in these registers as a
+distinct register which overlaps with the others. For instance, XMM1 used as a
+2xI64 register is distinct from but overlaps with (in the sense defined in Note
+[Overlapping global registers]) its use as a 4xI32 register.
+
+This model makes it easier to fit SIMD registers into the NCG, which generally
+expects that each global register has a single, known CmmType.
+
+In the future we could consider further refactoring this to eliminate the
+XMM, YMM, and ZMM register names (which are quite x86-specific) and instead just
+have a set of NxM-bit vector registers (e.g. Vec2x64A, Vec2x64B, ...,
+Vec4x32A, ..., Vec4x64A).
+
-}
data GlobalReg
@@ -432,12 +455,15 @@ data GlobalReg
| XmmReg -- 128-bit SIMD vector register
{-# UNPACK #-} !Int -- its number
+ !Length !Width !GlobalVecRegTy
| YmmReg -- 256-bit SIMD vector register
{-# UNPACK #-} !Int -- its number
+ !Length !Width !GlobalVecRegTy
| ZmmReg -- 512-bit SIMD vector register
{-# UNPACK #-} !Int -- its number
+ !Length !Width !GlobalVecRegTy
-- STG registers
| Sp -- Stack ptr; points to last occupied stack location.
@@ -478,17 +504,17 @@ data GlobalReg
deriving( Show )
+data GlobalVecRegTy = Integer | Float
+ deriving (Show, Eq, Ord)
+
instance Eq GlobalReg where
VanillaReg i _ == VanillaReg j _ = i==j -- Ignore type when seeking clashes
FloatReg i == FloatReg j = i==j
DoubleReg i == DoubleReg j = i==j
LongReg i == LongReg j = i==j
- -- NOTE: XMM, YMM, ZMM registers actually are the same registers
- -- at least with respect to store at YMM i and then read from XMM i
- -- and similarly for ZMM etc.
- XmmReg i == XmmReg j = i==j
- YmmReg i == YmmReg j = i==j
- ZmmReg i == ZmmReg j = i==j
+ XmmReg i l w grt == XmmReg j l' w' grt' = i==j && l == l' && w == w' && grt == grt'
+ YmmReg i l w grt == YmmReg j l' w' grt' = i==j && l == l' && w == w' && grt == grt'
+ ZmmReg i l w grt == ZmmReg j l' w' grt' = i==j && l == l' && w == w' && grt == grt'
Sp == Sp = True
SpLim == SpLim = True
Hp == Hp = True
@@ -512,9 +538,21 @@ instance Ord GlobalReg where
compare (FloatReg i) (FloatReg j) = compare i j
compare (DoubleReg i) (DoubleReg j) = compare i j
compare (LongReg i) (LongReg j) = compare i j
- compare (XmmReg i) (XmmReg j) = compare i j
- compare (YmmReg i) (YmmReg j) = compare i j
- compare (ZmmReg i) (ZmmReg j) = compare i j
+ compare (XmmReg i l w grt)
+ (XmmReg j l' w' grt') = compare i j
+ <> compare l l'
+ <> compare w w'
+ <> compare grt grt'
+ compare (YmmReg i l w grt)
+ (YmmReg j l' w' grt') = compare i j
+ <> compare l l'
+ <> compare w w'
+ <> compare grt grt'
+ compare (ZmmReg i l w grt)
+ (ZmmReg j l' w' grt') = compare i j
+ <> compare l l'
+ <> compare w w'
+ <> compare grt grt'
compare Sp Sp = EQ
compare SpLim SpLim = EQ
compare Hp Hp = EQ
@@ -538,12 +576,12 @@ instance Ord GlobalReg where
compare _ (DoubleReg _) = GT
compare (LongReg _) _ = LT
compare _ (LongReg _) = GT
- compare (XmmReg _) _ = LT
- compare _ (XmmReg _) = GT
- compare (YmmReg _) _ = LT
- compare _ (YmmReg _) = GT
- compare (ZmmReg _) _ = LT
- compare _ (ZmmReg _) = GT
+ compare (XmmReg _ _ _ _) _ = LT
+ compare _ (XmmReg _ _ _ _) = GT
+ compare (YmmReg _ _ _ _) _ = LT
+ compare _ (YmmReg _ _ _ _) = GT
+ compare (ZmmReg _ _ _ _) _ = LT
+ compare _ (ZmmReg _ _ _ _) = GT
compare Sp _ = LT
compare _ Sp = GT
compare SpLim _ = LT
@@ -596,12 +634,15 @@ globalRegType dflags (VanillaReg _ VNonGcPtr) = bWord dflags
globalRegType _ (FloatReg _) = cmmFloat W32
globalRegType _ (DoubleReg _) = cmmFloat W64
globalRegType _ (LongReg _) = cmmBits W64
--- TODO: improve the internal model of SIMD/vectorized registers
--- the right design SHOULd improve handling of float and double code too.
--- see remarks in "NOTE [SIMD Design for the future]"" in StgCmmPrim
-globalRegType _ (XmmReg _) = cmmVec 4 (cmmBits W32)
-globalRegType _ (YmmReg _) = cmmVec 8 (cmmBits W32)
-globalRegType _ (ZmmReg _) = cmmVec 16 (cmmBits W32)
+globalRegType _ (XmmReg _ l w ty) = case ty of
+ Integer -> cmmVec l (cmmBits w)
+ Float -> cmmVec l (cmmFloat w)
+globalRegType _ (YmmReg _ l w ty) = case ty of
+ Integer -> cmmVec l (cmmBits w)
+ Float -> cmmVec l (cmmFloat w)
+globalRegType _ (ZmmReg _ l w ty) = case ty of
+ Integer -> cmmVec l (cmmBits w)
+ Float -> cmmVec l (cmmFloat w)
globalRegType dflags Hp = gcWord dflags
-- The initialiser for all
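A concrete reading of Note [SIMD registers] above, using the constructors as extended by this patch (a sketch; GlobalReg, Width and the cmmVec/cmmFloat builders are compiler-internal, from CmmExpr and CmmType):

    -- The same physical XMM1 viewed as 2 x F64 versus 4 x F32: two
    -- distinct GlobalRegs that overlap in the sense of Note
    -- [Overlapping global registers].
    xmm1_2xF64, xmm1_4xF32 :: GlobalReg
    xmm1_2xF64 = XmmReg 1 2 W64 Float
    xmm1_4xF32 = XmmReg 1 4 W32 Float

    -- The new Eq/Ord instances compare number, length, width and type:
    --   xmm1_2xF64 == xmm1_4xF32            ==>  False
    -- and globalRegType gives each view its own CmmType:
    --   globalRegType dflags xmm1_2xF64     ==>  cmmVec 2 (cmmFloat W64)
    --   globalRegType dflags xmm1_4xF32     ==>  cmmVec 4 (cmmFloat W32)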
diff --git a/compiler/cmm/CmmLint.hs b/compiler/cmm/CmmLint.hs
index d5c3f84443..53dcd70b7b 100644
--- a/compiler/cmm/CmmLint.hs
+++ b/compiler/cmm/CmmLint.hs
@@ -148,9 +148,13 @@ lintCmmMiddle node = case node of
dflags <- getDynFlags
erep <- lintCmmExpr expr
let reg_ty = cmmRegType dflags reg
- if (erep `cmmEqType_ignoring_ptrhood` reg_ty)
- then return ()
- else cmmLintAssignErr (CmmAssign reg expr) erep reg_ty
+ case isVecCatType reg_ty of
+ True -> if ((typeWidth reg_ty) == (typeWidth erep))
+ then return ()
+ else cmmLintAssignErr (CmmAssign reg expr) erep reg_ty
+ _ -> if (erep `cmmEqType_ignoring_ptrhood` reg_ty)
+ then return ()
+ else cmmLintAssignErr (CmmAssign reg expr) erep reg_ty
CmmStore l r -> do
_ <- lintCmmExpr l
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index 9740d21bef..38d9edb480 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -136,8 +136,9 @@ data MachOp
| MO_VU_Rem Length Width
-- Floating point vector element insertion and extraction operations
- | MO_VF_Insert Length Width -- Insert scalar into vector
- | MO_VF_Extract Length Width -- Extract scalar from vector
+ | MO_VF_Broadcast Length Width -- Broadcast a scalar into a vector
+ | MO_VF_Insert Length Width -- Insert scalar into vector
+ | MO_VF_Extract Length Width -- Extract scalar from vector
-- Floating point vector operations
| MO_VF_Add Length Width
@@ -430,6 +431,7 @@ machOpResultType dflags mop tys =
MO_VU_Quot l w -> cmmVec l (cmmBits w)
MO_VU_Rem l w -> cmmVec l (cmmBits w)
+ MO_VF_Broadcast l w -> cmmVec l (cmmFloat w)
MO_VF_Insert l w -> cmmVec l (cmmFloat w)
MO_VF_Extract _ w -> cmmFloat w
@@ -522,16 +524,21 @@ machOpArgReps dflags op =
MO_VU_Quot _ r -> [r,r]
MO_VU_Rem _ r -> [r,r]
- MO_VF_Insert l r -> [typeWidth (vec l (cmmFloat r)),r,wordWidth dflags]
- MO_VF_Extract l r -> [typeWidth (vec l (cmmFloat r)),wordWidth dflags]
+ -- offset is always W32 as mentioned in StgCmmPrim.hs
+ MO_VF_Broadcast l r -> [vecwidth l r, r]
+ MO_VF_Insert l r -> [vecwidth l r, r, W32]
+ MO_VF_Extract l r -> [vecwidth l r, W32]
- MO_VF_Add _ r -> [r,r]
- MO_VF_Sub _ r -> [r,r]
- MO_VF_Mul _ r -> [r,r]
- MO_VF_Quot _ r -> [r,r]
- MO_VF_Neg _ r -> [r]
+ -- NOTE: the widths below reflect the fact that float vectors occupy full SSE registers
+ MO_VF_Add l w -> [vecwidth l w, vecwidth l w]
+ MO_VF_Sub l w -> [vecwidth l w, vecwidth l w]
+ MO_VF_Mul l w -> [vecwidth l w, vecwidth l w]
+ MO_VF_Quot l w -> [vecwidth l w, vecwidth l w]
+ MO_VF_Neg l w -> [vecwidth l w]
MO_AlignmentCheck _ r -> [r]
+ where
+ vecwidth l w = widthFromBytes (l*widthInBytes w)
-----------------------------------------------------------------------------
-- CallishMachOp
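To make the vecwidth arithmetic above concrete, for MO_VF_Add 4 W32 (a 4 x F32 addition) the argument reps work out as:

    -- vecwidth 4 W32 = widthFromBytes (4 * widthInBytes W32)
    --                = widthFromBytes 16
    --                = W128
    -- so machOpArgReps reports both operands of MO_VF_Add 4 W32 as full
    -- 128-bit (XMM-sized) values rather than as W32 scalars.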
diff --git a/compiler/cmm/CmmType.hs b/compiler/cmm/CmmType.hs
index 43d23c7ee7..17b588720f 100644
--- a/compiler/cmm/CmmType.hs
+++ b/compiler/cmm/CmmType.hs
@@ -6,6 +6,7 @@ module CmmType
, typeWidth, cmmEqType, cmmEqType_ignoring_ptrhood
, isFloatType, isGcPtrType, isBitsType
, isWord32, isWord64, isFloat64, isFloat32
+ , isVecCatType
, Width(..)
, widthInBits, widthInBytes, widthInLog, widthFromBytes
@@ -133,7 +134,7 @@ cInt :: DynFlags -> CmmType
cInt dflags = cmmBits (cIntWidth dflags)
------------ Predicates ----------------
-isFloatType, isGcPtrType, isBitsType :: CmmType -> Bool
+isFloatType, isGcPtrType, isBitsType, isVecCatType :: CmmType -> Bool
isFloatType (CmmType FloatCat _) = True
isFloatType _other = False
@@ -143,6 +144,9 @@ isGcPtrType _other = False
isBitsType (CmmType BitsCat _) = True
isBitsType _ = False
+isVecCatType (CmmType (VecCat _ _) _) = True
+isVecCatType _other = False
+
isWord32, isWord64, isFloat32, isFloat64 :: CmmType -> Bool
-- isWord64 is true of 64-bit non-floats (both gc-ptrs and otherwise)
-- isFloat32 and 64 are obvious
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index 7227edd57e..a60a26229b 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -713,6 +713,10 @@ pprMachOp_for_C mop = case mop of
(panic $ "PprC.pprMachOp_for_C: MO_VU_Rem"
++ " should have been handled earlier!")
+ MO_VF_Broadcast {} -> pprTrace "offending mop:"
+ (text "MO_VF_Broadcast")
+ (panic $ "PprC.pprMachOp_for_C: MO_VF_Broadcast"
+ ++ " should have been handled earlier!")
MO_VF_Insert {} -> pprTrace "offending mop:"
(text "MO_VF_Insert")
(panic $ "PprC.pprMachOp_for_C: MO_VF_Insert"
diff --git a/compiler/cmm/PprCmmExpr.hs b/compiler/cmm/PprCmmExpr.hs
index 7bf73f1ca6..2080c1f5d8 100644
--- a/compiler/cmm/PprCmmExpr.hs
+++ b/compiler/cmm/PprCmmExpr.hs
@@ -261,9 +261,9 @@ pprGlobalReg gr
FloatReg n -> char 'F' <> int n
DoubleReg n -> char 'D' <> int n
LongReg n -> char 'L' <> int n
- XmmReg n -> text "XMM" <> int n
- YmmReg n -> text "YMM" <> int n
- ZmmReg n -> text "ZMM" <> int n
+ XmmReg n _ _ _ -> text "XMM" <> int n
+ YmmReg n _ _ _ -> text "YMM" <> int n
+ ZmmReg n _ _ _ -> text "ZMM" <> int n
Sp -> text "Sp"
SpLim -> text "SpLim"
Hp -> text "Hp"
diff --git a/compiler/codeGen/CgUtils.hs b/compiler/codeGen/CgUtils.hs
index 0ff9bd8b56..2cbcfc66a9 100644
--- a/compiler/codeGen/CgUtils.hs
+++ b/compiler/codeGen/CgUtils.hs
@@ -57,27 +57,27 @@ baseRegOffset dflags (DoubleReg 4) = oFFSET_StgRegTable_rD4 dflags
baseRegOffset dflags (DoubleReg 5) = oFFSET_StgRegTable_rD5 dflags
baseRegOffset dflags (DoubleReg 6) = oFFSET_StgRegTable_rD6 dflags
baseRegOffset _ (DoubleReg n) = panic ("Registers above D6 are not supported (tried to use D" ++ show n ++ ")")
-baseRegOffset dflags (XmmReg 1) = oFFSET_StgRegTable_rXMM1 dflags
-baseRegOffset dflags (XmmReg 2) = oFFSET_StgRegTable_rXMM2 dflags
-baseRegOffset dflags (XmmReg 3) = oFFSET_StgRegTable_rXMM3 dflags
-baseRegOffset dflags (XmmReg 4) = oFFSET_StgRegTable_rXMM4 dflags
-baseRegOffset dflags (XmmReg 5) = oFFSET_StgRegTable_rXMM5 dflags
-baseRegOffset dflags (XmmReg 6) = oFFSET_StgRegTable_rXMM6 dflags
-baseRegOffset _ (XmmReg n) = panic ("Registers above XMM6 are not supported (tried to use XMM" ++ show n ++ ")")
-baseRegOffset dflags (YmmReg 1) = oFFSET_StgRegTable_rYMM1 dflags
-baseRegOffset dflags (YmmReg 2) = oFFSET_StgRegTable_rYMM2 dflags
-baseRegOffset dflags (YmmReg 3) = oFFSET_StgRegTable_rYMM3 dflags
-baseRegOffset dflags (YmmReg 4) = oFFSET_StgRegTable_rYMM4 dflags
-baseRegOffset dflags (YmmReg 5) = oFFSET_StgRegTable_rYMM5 dflags
-baseRegOffset dflags (YmmReg 6) = oFFSET_StgRegTable_rYMM6 dflags
-baseRegOffset _ (YmmReg n) = panic ("Registers above YMM6 are not supported (tried to use YMM" ++ show n ++ ")")
-baseRegOffset dflags (ZmmReg 1) = oFFSET_StgRegTable_rZMM1 dflags
-baseRegOffset dflags (ZmmReg 2) = oFFSET_StgRegTable_rZMM2 dflags
-baseRegOffset dflags (ZmmReg 3) = oFFSET_StgRegTable_rZMM3 dflags
-baseRegOffset dflags (ZmmReg 4) = oFFSET_StgRegTable_rZMM4 dflags
-baseRegOffset dflags (ZmmReg 5) = oFFSET_StgRegTable_rZMM5 dflags
-baseRegOffset dflags (ZmmReg 6) = oFFSET_StgRegTable_rZMM6 dflags
-baseRegOffset _ (ZmmReg n) = panic ("Registers above ZMM6 are not supported (tried to use ZMM" ++ show n ++ ")")
+baseRegOffset dflags (XmmReg 1 _ _ _) = oFFSET_StgRegTable_rXMM1 dflags
+baseRegOffset dflags (XmmReg 2 _ _ _) = oFFSET_StgRegTable_rXMM2 dflags
+baseRegOffset dflags (XmmReg 3 _ _ _) = oFFSET_StgRegTable_rXMM3 dflags
+baseRegOffset dflags (XmmReg 4 _ _ _) = oFFSET_StgRegTable_rXMM4 dflags
+baseRegOffset dflags (XmmReg 5 _ _ _) = oFFSET_StgRegTable_rXMM5 dflags
+baseRegOffset dflags (XmmReg 6 _ _ _) = oFFSET_StgRegTable_rXMM6 dflags
+baseRegOffset _ (XmmReg n _ _ _) = panic ("Registers above XMM6 are not supported (tried to use XMM" ++ show n ++ ")")
+baseRegOffset dflags (YmmReg 1 _ _ _) = oFFSET_StgRegTable_rYMM1 dflags
+baseRegOffset dflags (YmmReg 2 _ _ _) = oFFSET_StgRegTable_rYMM2 dflags
+baseRegOffset dflags (YmmReg 3 _ _ _) = oFFSET_StgRegTable_rYMM3 dflags
+baseRegOffset dflags (YmmReg 4 _ _ _) = oFFSET_StgRegTable_rYMM4 dflags
+baseRegOffset dflags (YmmReg 5 _ _ _) = oFFSET_StgRegTable_rYMM5 dflags
+baseRegOffset dflags (YmmReg 6 _ _ _) = oFFSET_StgRegTable_rYMM6 dflags
+baseRegOffset _ (YmmReg n _ _ _) = panic ("Registers above YMM6 are not supported (tried to use YMM" ++ show n ++ ")")
+baseRegOffset dflags (ZmmReg 1 _ _ _) = oFFSET_StgRegTable_rZMM1 dflags
+baseRegOffset dflags (ZmmReg 2 _ _ _) = oFFSET_StgRegTable_rZMM2 dflags
+baseRegOffset dflags (ZmmReg 3 _ _ _) = oFFSET_StgRegTable_rZMM3 dflags
+baseRegOffset dflags (ZmmReg 4 _ _ _) = oFFSET_StgRegTable_rZMM4 dflags
+baseRegOffset dflags (ZmmReg 5 _ _ _) = oFFSET_StgRegTable_rZMM5 dflags
+baseRegOffset dflags (ZmmReg 6 _ _ _) = oFFSET_StgRegTable_rZMM6 dflags
+baseRegOffset _ (ZmmReg n _ _ _) = panic ("Registers above ZMM6 are not supported (tried to use ZMM" ++ show n ++ ")")
baseRegOffset dflags Sp = oFFSET_StgRegTable_rSp dflags
baseRegOffset dflags SpLim = oFFSET_StgRegTable_rSpLim dflags
baseRegOffset dflags (LongReg 1) = oFFSET_StgRegTable_rL1 dflags
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index c3f9d5a279..9a6cf6c2e5 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -669,7 +669,7 @@ emitPrimOp _ [res] Word2DoubleOp [w] = emitPrimCall [res]
-- SIMD primops
emitPrimOp dflags [res] (VecBroadcastOp vcat n w) [e] = do
checkVecCompatibility dflags vcat n w
- doVecPackOp (vecElemInjectCast dflags vcat w) ty zeros (replicate n e) res
+ doVecBroadcastOp (vecElemInjectCast dflags vcat w) ty zeros e res
where
zeros :: CmmExpr
zeros = CmmLit $ CmmVec (replicate n zero)
@@ -1765,9 +1765,8 @@ vecElemProjectCast _ _ _ = Nothing
checkVecCompatibility :: DynFlags -> PrimOpVecCat -> Length -> Width -> FCode ()
checkVecCompatibility dflags vcat l w = do
- when (hscTarget dflags /= HscLlvm) $ do
- sorry $ unlines ["SIMD vector instructions require the LLVM back-end."
- ,"Please use -fllvm."]
+ when (hscTarget dflags /= HscLlvm && hscTarget dflags /= HscAsm) $ do
+ sorry "SIMD vector instructions not supported for the C backend or GHCi"
check vecWidth vcat l w
where
check :: Width -> PrimOpVecCat -> Length -> Width -> FCode ()
@@ -1792,6 +1791,38 @@ checkVecCompatibility dflags vcat l w = do
------------------------------------------------------------------------------
-- Helpers for translating vector packing and unpacking.
+doVecBroadcastOp :: Maybe MachOp -- Cast from element to vector component
+ -> CmmType -- Type of vector
+ -> CmmExpr -- Initial vector
+ -> CmmExpr -- Element
+ -> CmmFormal -- Destination for result
+ -> FCode ()
+doVecBroadcastOp maybe_pre_write_cast ty z es res = do
+ dst <- newTemp ty
+ emitAssign (CmmLocal dst) z
+ vecBroadcast dst es 0
+ where
+ vecBroadcast :: CmmFormal -> CmmExpr -> Int -> FCode ()
+ vecBroadcast src e _ = do
+ dst <- newTemp ty
+ if isFloatType (vecElemType ty)
+ then emitAssign (CmmLocal dst) (CmmMachOp (MO_VF_Broadcast len wid)
+ [CmmReg (CmmLocal src), cast e])
+ -- TODO: Add the MachOp MO_V_Broadcast
+ else emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid)
+ [CmmReg (CmmLocal src), cast e])
+ emitAssign (CmmLocal res) (CmmReg (CmmLocal dst))
+
+ cast :: CmmExpr -> CmmExpr
+ cast val = case maybe_pre_write_cast of
+ Nothing -> val
+ Just cast -> CmmMachOp cast [val]
+
+ len :: Length
+ len = vecLength ty
+
+ wid :: Width
+ wid = typeWidth (vecElemType ty)
doVecPackOp :: Maybe MachOp -- Cast from element to vector component
-> CmmType -- Type of vector
@@ -1809,16 +1840,16 @@ doVecPackOp maybe_pre_write_cast ty z es res = do
emitAssign (CmmLocal res) (CmmReg (CmmLocal src))
vecPack src (e : es) i = do
- dst <- newTemp ty
- if isFloatType (vecElemType ty)
- then emitAssign (CmmLocal dst) (CmmMachOp (MO_VF_Insert len wid)
- [CmmReg (CmmLocal src), cast e, iLit])
- else emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid)
- [CmmReg (CmmLocal src), cast e, iLit])
- vecPack dst es (i + 1)
+ dst <- newTemp ty
+ if isFloatType (vecElemType ty)
+ then emitAssign (CmmLocal dst) (CmmMachOp (MO_VF_Insert len wid)
+ [CmmReg (CmmLocal src), cast e, iLit])
+ else emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid)
+ [CmmReg (CmmLocal src), cast e, iLit])
+ vecPack dst es (i + 1)
where
-- vector indices are always 32-bits; the index is pre-scaled by 16
-- because the x86 lowering encodes the destination lane in bits 4-5
-- of the INSERTPS immediate (0/16 selects the low/high half for doubles)
- iLit = CmmLit (CmmInt (toInteger i) W32)
+ iLit = CmmLit (CmmInt ((toInteger i) * 16) W32)
cast :: CmmExpr -> CmmExpr
cast val = case maybe_pre_write_cast of
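The change to iLit deserves spelling out: on x86, MO_VF_Insert for F32 vectors is lowered to INSERTPS (see X86/CodeGen.hs below), whose 8-bit immediate carries the destination lane in bits 4-5, while for F64 vectors the values 0 and 16 select the low/high half (MOVL/MOVH). A sketch of the encoding, with a hypothetical helper named insertLaneImm for illustration:

    -- INSERTPS imm8 layout (Intel SDM): bits 7-6 source lane,
    -- bits 5-4 destination lane, bits 3-0 zero mask.
    -- Destination lane i is therefore encoded as i * 16:
    --   lane 0 -> 0x00, lane 1 -> 0x10, lane 2 -> 0x20, lane 3 -> 0x30
    insertLaneImm :: Int -> Integer
    insertLaneImm i = toInteger i * 16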
diff --git a/compiler/llvmGen/LlvmCodeGen/Base.hs b/compiler/llvmGen/LlvmCodeGen/Base.hs
index 81f3b9f84c..a5a5683a3e 100644
--- a/compiler/llvmGen/LlvmCodeGen/Base.hs
+++ b/compiler/llvmGen/LlvmCodeGen/Base.hs
@@ -152,12 +152,12 @@ llvmFunArgs dflags live =
where platform = targetPlatform dflags
isLive r = not (isSSE r) || r `elem` alwaysLive || r `elem` live
isPassed r = not (isSSE r) || isLive r
- isSSE (FloatReg _) = True
- isSSE (DoubleReg _) = True
- isSSE (XmmReg _) = True
- isSSE (YmmReg _) = True
- isSSE (ZmmReg _) = True
- isSSE _ = False
+ isSSE (FloatReg _) = True
+ isSSE (DoubleReg _) = True
+ isSSE (XmmReg _ _ _ _ ) = True
+ isSSE (YmmReg _ _ _ _ ) = True
+ isSSE (ZmmReg _ _ _ _ ) = True
+ isSSE _ = False
-- | Llvm standard fun attributes
llvmStdFunAttrs :: [LlvmFuncAttr]
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index 86a59381b2..8fea6e0b17 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -1287,6 +1287,7 @@ genMachOp _ op [x] = case op of
MO_VU_Quot _ _ -> panicOp
MO_VU_Rem _ _ -> panicOp
+ MO_VF_Broadcast _ _ -> panicOp
MO_VF_Insert _ _ -> panicOp
MO_VF_Extract _ _ -> panicOp
@@ -1483,6 +1484,7 @@ genMachOp_slow opt op [x, y] = case op of
MO_VS_Neg {} -> panicOp
+ MO_VF_Broadcast {} -> panicOp
MO_VF_Insert {} -> panicOp
MO_VF_Extract {} -> panicOp
@@ -1844,9 +1846,9 @@ funEpilogue live = do
let liveRegs = alwaysLive ++ live
isSSE (FloatReg _) = True
isSSE (DoubleReg _) = True
- isSSE (XmmReg _) = True
- isSSE (YmmReg _) = True
- isSSE (ZmmReg _) = True
+ isSSE (XmmReg _ _ _ _) = True
+ isSSE (YmmReg _ _ _ _) = True
+ isSSE (ZmmReg _ _ _ _) = True
isSSE _ = False
-- Set to value or "undef" depending on whether the register is
diff --git a/compiler/llvmGen/LlvmCodeGen/Regs.hs b/compiler/llvmGen/LlvmCodeGen/Regs.hs
index 8cdf3c6869..6d188d908f 100644
--- a/compiler/llvmGen/LlvmCodeGen/Regs.hs
+++ b/compiler/llvmGen/LlvmCodeGen/Regs.hs
@@ -60,24 +60,24 @@ lmGlobalReg dflags suf reg
DoubleReg 4 -> doubleGlobal $ "D4" ++ suf
DoubleReg 5 -> doubleGlobal $ "D5" ++ suf
DoubleReg 6 -> doubleGlobal $ "D6" ++ suf
- XmmReg 1 -> xmmGlobal $ "XMM1" ++ suf
- XmmReg 2 -> xmmGlobal $ "XMM2" ++ suf
- XmmReg 3 -> xmmGlobal $ "XMM3" ++ suf
- XmmReg 4 -> xmmGlobal $ "XMM4" ++ suf
- XmmReg 5 -> xmmGlobal $ "XMM5" ++ suf
- XmmReg 6 -> xmmGlobal $ "XMM6" ++ suf
- YmmReg 1 -> ymmGlobal $ "YMM1" ++ suf
- YmmReg 2 -> ymmGlobal $ "YMM2" ++ suf
- YmmReg 3 -> ymmGlobal $ "YMM3" ++ suf
- YmmReg 4 -> ymmGlobal $ "YMM4" ++ suf
- YmmReg 5 -> ymmGlobal $ "YMM5" ++ suf
- YmmReg 6 -> ymmGlobal $ "YMM6" ++ suf
- ZmmReg 1 -> zmmGlobal $ "ZMM1" ++ suf
- ZmmReg 2 -> zmmGlobal $ "ZMM2" ++ suf
- ZmmReg 3 -> zmmGlobal $ "ZMM3" ++ suf
- ZmmReg 4 -> zmmGlobal $ "ZMM4" ++ suf
- ZmmReg 5 -> zmmGlobal $ "ZMM5" ++ suf
- ZmmReg 6 -> zmmGlobal $ "ZMM6" ++ suf
+ XmmReg 1 _ _ _ -> xmmGlobal $ "XMM1" ++ suf
+ XmmReg 2 _ _ _ -> xmmGlobal $ "XMM2" ++ suf
+ XmmReg 3 _ _ _ -> xmmGlobal $ "XMM3" ++ suf
+ XmmReg 4 _ _ _ -> xmmGlobal $ "XMM4" ++ suf
+ XmmReg 5 _ _ _ -> xmmGlobal $ "XMM5" ++ suf
+ XmmReg 6 _ _ _ -> xmmGlobal $ "XMM6" ++ suf
+ YmmReg 1 _ _ _ -> ymmGlobal $ "YMM1" ++ suf
+ YmmReg 2 _ _ _ -> ymmGlobal $ "YMM2" ++ suf
+ YmmReg 3 _ _ _ -> ymmGlobal $ "YMM3" ++ suf
+ YmmReg 4 _ _ _ -> ymmGlobal $ "YMM4" ++ suf
+ YmmReg 5 _ _ _ -> ymmGlobal $ "YMM5" ++ suf
+ YmmReg 6 _ _ _ -> ymmGlobal $ "YMM6" ++ suf
+ ZmmReg 1 _ _ _ -> zmmGlobal $ "ZMM1" ++ suf
+ ZmmReg 2 _ _ _ -> zmmGlobal $ "ZMM2" ++ suf
+ ZmmReg 3 _ _ _ -> zmmGlobal $ "ZMM3" ++ suf
+ ZmmReg 4 _ _ _ -> zmmGlobal $ "ZMM4" ++ suf
+ ZmmReg 5 _ _ _ -> zmmGlobal $ "ZMM5" ++ suf
+ ZmmReg 6 _ _ _ -> zmmGlobal $ "ZMM6" ++ suf
MachSp -> wordGlobal $ "MachSp" ++ suf
_other -> panic $ "LlvmCodeGen.Reg: GlobalReg (" ++ (show reg)
++ ") not supported!"
diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs
index b5fb200802..fcc5cfb1a0 100644
--- a/compiler/main/DynFlags.hs
+++ b/compiler/main/DynFlags.hs
@@ -219,6 +219,7 @@ module DynFlags (
-- * SSE and AVX
isSseEnabled,
isSse2Enabled,
+ isSse4_1Enabled,
isSse4_2Enabled,
isBmiEnabled,
isBmi2Enabled,
@@ -5908,6 +5909,8 @@ isSse2Enabled dflags = case platformArch (targetPlatform dflags) of
ArchX86 -> True
_ -> False
+isSse4_1Enabled :: DynFlags -> Bool
+isSse4_1Enabled dflags = sseVersion dflags >= Just SSE4
isSse4_2Enabled :: DynFlags -> Bool
isSse4_2Enabled dflags = sseVersion dflags >= Just SSE42
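These predicates lean on derived Ord instances: Maybe orders Nothing below every Just, and SseVersion follows declaration order, so a single >= comparison expresses "at least this SSE level". A sketch, assuming the existing declaration data SseVersion = SSE1 | SSE2 | SSE3 | SSE4 | SSE42 deriving (Eq, Ord):

    --   sseVersion dflags = Nothing     =>  isSse4_1Enabled dflags = False
    --   sseVersion dflags = Just SSE2   =>  isSse4_1Enabled dflags = False
    --   sseVersion dflags = Just SSE42  =>  isSse4_1Enabled dflags = True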
diff --git a/compiler/nativeGen/Format.hs b/compiler/nativeGen/Format.hs
index 31472893e7..a0e4e99f80 100644
--- a/compiler/nativeGen/Format.hs
+++ b/compiler/nativeGen/Format.hs
@@ -10,9 +10,11 @@
--
module Format (
Format(..),
+ ScalarFormat(..),
intFormat,
floatFormat,
isFloatFormat,
+ isVecFormat,
cmmTypeFormat,
formatToWidth,
formatInBytes
@@ -25,6 +27,29 @@ import GhcPrelude
import Cmm
import Outputable
+
+-- Note [GHC's data format representations]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- GHC has several types that represent various aspects of data format.
+-- These include:
+--
+-- * 'CmmType.CmmType': The data classification used throughout the C--
+-- pipeline. This is a pair of a CmmCat and a Width.
+--
+-- * 'CmmType.CmmCat': What the bits in a C-- value mean (e.g. a pointer, integer, or floating-point value)
+--
+-- * 'CmmType.Width': The width of a C-- value.
+--
+-- * 'CmmType.Length': The width (measured in number of scalars) of a vector value.
+--
+-- * 'Format.Format': The data format representation used by much of the backend.
+--
+-- * 'Format.ScalarFormat': The format of a 'Format.VecFormat'\'s scalar.
+--
+-- * 'RegClass.RegClass': Whether a register is an integer, floating-point, or vector register
+--
+
-- It looks very like the old MachRep, but it's now of purely local
-- significance, here in the native code generator. You can change it
-- without global consequences.
@@ -47,8 +72,16 @@ data Format
| II64
| FF32
| FF64
+ | VecFormat !Length !ScalarFormat !Width
deriving (Show, Eq)
+data ScalarFormat = FmtInt8
+ | FmtInt16
+ | FmtInt32
+ | FmtInt64
+ | FmtFloat
+ | FmtDouble
+ deriving (Show, Eq)
-- | Get the integer format of this width.
intFormat :: Width -> Format
@@ -81,13 +114,33 @@ isFloatFormat format
FF64 -> True
_ -> False
+-- | Check if a format represents a vector
+isVecFormat :: Format -> Bool
+isVecFormat (VecFormat {}) = True
+isVecFormat _ = False
-- | Convert a Cmm type to a Format.
cmmTypeFormat :: CmmType -> Format
cmmTypeFormat ty
| isFloatType ty = floatFormat (typeWidth ty)
+ | isVecType ty = vecFormat ty
| otherwise = intFormat (typeWidth ty)
+vecFormat :: CmmType -> Format
+vecFormat ty =
+ let l = vecLength ty
+ elemTy = vecElemType ty
+ in if isFloatType elemTy
+ then case typeWidth elemTy of
+ W32 -> VecFormat l FmtFloat W32
+ W64 -> VecFormat l FmtDouble W64
+ _ -> pprPanic "Incorrect vector element width" (ppr elemTy)
+ else case typeWidth elemTy of
+ W8 -> VecFormat l FmtInt8 W8
+ W16 -> VecFormat l FmtInt16 W16
+ W32 -> VecFormat l FmtInt32 W32
+ W64 -> VecFormat l FmtInt64 W64
+ _ -> pprPanic "Incorrect vector element width" (ppr elemTy)
-- | Get the Width of a Format.
formatToWidth :: Format -> Width
@@ -99,7 +152,7 @@ formatToWidth format
II64 -> W64
FF32 -> W32
FF64 -> W64
-
+ VecFormat l _ w -> widthFromBytes (l*widthInBytes w)
formatInBytes :: Format -> Int
formatInBytes = widthInBytes . formatToWidth
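As a worked instance of the mapping described in Note [GHC's data format representations], a 4 x F32 C-- vector type travels through cmmTypeFormat as follows:

    -- cmmVec 4 (cmmFloat W32) is a vector type, so cmmTypeFormat
    -- dispatches to vecFormat, which inspects the element type:
    --   cmmTypeFormat (cmmVec 4 (cmmFloat W32)) = VecFormat 4 FmtFloat W32
    -- and formatToWidth recovers the total width of the vector:
    --   formatToWidth (VecFormat 4 FmtFloat W32)
    --     = widthFromBytes (4 * widthInBytes W32) = W128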
diff --git a/compiler/nativeGen/NCGMonad.hs b/compiler/nativeGen/NCGMonad.hs
index 3680c1c7b0..67730aa59b 100644
--- a/compiler/nativeGen/NCGMonad.hs
+++ b/compiler/nativeGen/NCGMonad.hs
@@ -250,7 +250,6 @@ getNewRegNat rep
dflags <- getDynFlags
return (RegVirtual $ targetMkVirtualReg (targetPlatform dflags) u rep)
-
getNewRegPairNat :: Format -> NatM (Reg,Reg)
getNewRegPairNat rep
= do u <- getUniqueNat
diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs
index a49526c93a..7e5df6a76c 100644
--- a/compiler/nativeGen/PPC/CodeGen.hs
+++ b/compiler/nativeGen/PPC/CodeGen.hs
@@ -1909,6 +1909,8 @@ genCCall' dflags gcp target dest_regs args
FF32 -> (1, 1, 4, fprs)
FF64 -> (2, 1, 8, fprs)
II64 -> panic "genCCall' passArguments II64"
+ VecFormat {}
+ -> panic "genCCall' passArguments vector format"
GCP32ELF ->
case cmmTypeFormat rep of
@@ -1919,6 +1921,8 @@ genCCall' dflags gcp target dest_regs args
FF32 -> (0, 1, 4, fprs)
FF64 -> (0, 1, 8, fprs)
II64 -> panic "genCCall' passArguments II64"
+ VecFormat {}
+ -> panic "genCCall' passArguments vector format"
GCP64ELF _ ->
case cmmTypeFormat rep of
II8 -> (1, 0, 8, gprs)
@@ -1930,6 +1934,8 @@ genCCall' dflags gcp target dest_regs args
-- the FPRs.
FF32 -> (1, 1, 8, fprs)
FF64 -> (1, 1, 8, fprs)
+ VecFormat {}
+ -> panic "genCCall' passArguments vector format"
moveResult reduceToFF32 =
case dest_regs of
diff --git a/compiler/nativeGen/PPC/Ppr.hs b/compiler/nativeGen/PPC/Ppr.hs
index 4254f23122..b7316e6bc6 100644
--- a/compiler/nativeGen/PPC/Ppr.hs
+++ b/compiler/nativeGen/PPC/Ppr.hs
@@ -29,7 +29,7 @@ import BlockId
import CLabel
import PprCmmExpr ()
-import Unique ( pprUniqueAlways, getUnique )
+import Unique ( getUnique )
import GHC.Platform
import FastString
import Outputable
@@ -168,10 +168,7 @@ pprReg r
= case r of
RegReal (RealRegSingle i) -> ppr_reg_no i
RegReal (RealRegPair{}) -> panic "PPC.pprReg: no reg pairs on this arch"
- RegVirtual (VirtualRegI u) -> text "%vI_" <> pprUniqueAlways u
- RegVirtual (VirtualRegHi u) -> text "%vHi_" <> pprUniqueAlways u
- RegVirtual (VirtualRegF u) -> text "%vF_" <> pprUniqueAlways u
- RegVirtual (VirtualRegD u) -> text "%vD_" <> pprUniqueAlways u
+ RegVirtual v -> ppr v
where
ppr_reg_no :: Int -> SDoc
@@ -190,7 +187,8 @@ pprFormat x
II32 -> sLit "w"
II64 -> sLit "d"
FF32 -> sLit "fs"
- FF64 -> sLit "fd")
+ FF64 -> sLit "fd"
+ VecFormat _ _ _ -> panic "PPC.Ppr.pprFormat: VecFormat")
pprCond :: Cond -> SDoc
@@ -375,6 +373,7 @@ pprInstr (LD fmt reg addr) = hcat [
II64 -> sLit "d"
FF32 -> sLit "fs"
FF64 -> sLit "fd"
+ VecFormat _ _ _ -> panic "PPC.Ppr.pprInstr: VecFormat"
),
case addr of AddrRegImm _ _ -> empty
AddrRegReg _ _ -> char 'x',
@@ -414,6 +413,7 @@ pprInstr (LA fmt reg addr) = hcat [
II64 -> sLit "d"
FF32 -> sLit "fs"
FF64 -> sLit "fd"
+ VecFormat _ _ _ -> panic "PPC.Ppr.pprInstr: VecFormat"
),
case addr of AddrRegImm _ _ -> empty
AddrRegReg _ _ -> char 'x',
diff --git a/compiler/nativeGen/Reg.hs b/compiler/nativeGen/Reg.hs
index 7f69ea01a4..dff2f07bf4 100644
--- a/compiler/nativeGen/Reg.hs
+++ b/compiler/nativeGen/Reg.hs
@@ -56,6 +56,7 @@ data VirtualReg
| VirtualRegHi {-# UNPACK #-} !Unique -- High part of 2-word register
| VirtualRegF {-# UNPACK #-} !Unique
| VirtualRegD {-# UNPACK #-} !Unique
+ | VirtualRegVec {-# UNPACK #-} !Unique
deriving (Eq, Show)
@@ -69,6 +70,7 @@ instance Ord VirtualReg where
compare (VirtualRegHi a) (VirtualRegHi b) = nonDetCmpUnique a b
compare (VirtualRegF a) (VirtualRegF b) = nonDetCmpUnique a b
compare (VirtualRegD a) (VirtualRegD b) = nonDetCmpUnique a b
+ compare (VirtualRegVec a) (VirtualRegVec b) = nonDetCmpUnique a b
compare VirtualRegI{} _ = LT
compare _ VirtualRegI{} = GT
@@ -76,7 +78,8 @@ instance Ord VirtualReg where
compare _ VirtualRegHi{} = GT
compare VirtualRegF{} _ = LT
compare _ VirtualRegF{} = GT
-
+ compare VirtualRegVec{} _ = LT
+ compare _ VirtualRegVec{} = GT
instance Uniquable VirtualReg where
@@ -86,6 +89,7 @@ instance Uniquable VirtualReg where
VirtualRegHi u -> u
VirtualRegF u -> u
VirtualRegD u -> u
+ VirtualRegVec u -> u
instance Outputable VirtualReg where
ppr reg
@@ -95,8 +99,9 @@ instance Outputable VirtualReg where
-- this code is kinda wrong on x86
-- because float and double occupy the same register set
-- namely SSE2 register xmm0 .. xmm15
- VirtualRegF u -> text "%vFloat_" <> pprUniqueAlways u
- VirtualRegD u -> text "%vDouble_" <> pprUniqueAlways u
+ VirtualRegF u -> text "%vFloat_" <> pprUniqueAlways u
+ VirtualRegD u -> text "%vDouble_" <> pprUniqueAlways u
+ VirtualRegVec u -> text "%vVec_" <> pprUniqueAlways u
@@ -107,6 +112,7 @@ renameVirtualReg u r
VirtualRegHi _ -> VirtualRegHi u
VirtualRegF _ -> VirtualRegF u
VirtualRegD _ -> VirtualRegD u
+ VirtualRegVec _ -> VirtualRegVec u
classOfVirtualReg :: VirtualReg -> RegClass
@@ -116,6 +122,8 @@ classOfVirtualReg vr
VirtualRegHi{} -> RcInteger
VirtualRegF{} -> RcFloat
VirtualRegD{} -> RcDouble
+ -- Below is an awful, largely x86-specific hack
+ VirtualRegVec{} -> RcDouble
diff --git a/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs b/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs
index 5d4fd418c3..23d7c6b421 100644
--- a/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs
+++ b/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs
@@ -195,7 +195,6 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclu
-
-- Specification Code ----------------------------------------------------------
--
-- The trivColorable function for each particular architecture should
diff --git a/compiler/nativeGen/RegAlloc/Linear/FreeRegs.hs b/compiler/nativeGen/RegAlloc/Linear/FreeRegs.hs
index 5a4f1c65a8..d452edfdc6 100644
--- a/compiler/nativeGen/RegAlloc/Linear/FreeRegs.hs
+++ b/compiler/nativeGen/RegAlloc/Linear/FreeRegs.hs
@@ -2,6 +2,7 @@
module RegAlloc.Linear.FreeRegs (
FR(..),
+ allFreeRegs,
maxSpillSlots
)
@@ -69,6 +70,10 @@ instance FR SPARC.FreeRegs where
frInitFreeRegs = SPARC.initFreeRegs
frReleaseReg = SPARC.releaseReg
+-- | For debugging output.
+allFreeRegs :: FR freeRegs => Platform -> freeRegs -> [RealReg]
+allFreeRegs plat fr = foldMap (\rcls -> frGetFreeRegs plat rcls fr) allRegClasses
+
maxSpillSlots :: DynFlags -> Int
maxSpillSlots dflags
= case platformArch (targetPlatform dflags) of
diff --git a/compiler/nativeGen/RegAlloc/Linear/Main.hs b/compiler/nativeGen/RegAlloc/Linear/Main.hs
index cdaf738d68..b29712e0e0 100644
--- a/compiler/nativeGen/RegAlloc/Linear/Main.hs
+++ b/compiler/nativeGen/RegAlloc/Linear/Main.hs
@@ -884,8 +884,10 @@ allocRegsAndSpill_spill reading keep spills alloc r rs assig spill_loc
$ vcat
[ text "allocating vreg: " <> text (show r)
, text "assignment: " <> ppr assig
- , text "freeRegs: " <> text (show freeRegs)
- , text "initFreeRegs: " <> text (show (frInitFreeRegs platform `asTypeOf` freeRegs)) ]
+ , text "freeRegs: " <> text (showRegs freeRegs)
+ , text "initFreeRegs: " <> text (showRegs (frInitFreeRegs platform `asTypeOf` freeRegs))
+ ]
+ where showRegs = show . map (\reg -> (reg, targetClassOfRealReg platform reg)) . allFreeRegs platform
result
diff --git a/compiler/nativeGen/RegClass.hs b/compiler/nativeGen/RegClass.hs
index fbbb786817..d73a3409ac 100644
--- a/compiler/nativeGen/RegClass.hs
+++ b/compiler/nativeGen/RegClass.hs
@@ -1,15 +1,14 @@
-- | An architecture independent description of a register's class.
module RegClass
- ( RegClass (..) )
-
-where
+ ( RegClass(..)
+ , allRegClasses
+ ) where
import GhcPrelude
import Outputable
import Unique
-
-- | The class of a register.
-- Used in the register allocator.
-- We treat all registers in a class as being interchangable.
@@ -18,7 +17,11 @@ data RegClass
= RcInteger
| RcFloat
| RcDouble
- deriving Eq
+ deriving (Eq, Show)
+
+allRegClasses :: [RegClass]
+allRegClasses =
+ [ RcInteger, RcFloat, RcDouble ]
instance Uniquable RegClass where
@@ -27,6 +30,6 @@ instance Uniquable RegClass where
getUnique RcDouble = mkRegClassUnique 2
instance Outputable RegClass where
- ppr RcInteger = Outputable.text "I"
- ppr RcFloat = Outputable.text "F"
- ppr RcDouble = Outputable.text "D"
+ ppr RcInteger = Outputable.text "I"
+ ppr RcFloat = Outputable.text "F"
+ ppr RcDouble = Outputable.text "D"
diff --git a/compiler/nativeGen/SPARC/Ppr.hs b/compiler/nativeGen/SPARC/Ppr.hs
index fc67f77541..aa355f97cb 100644
--- a/compiler/nativeGen/SPARC/Ppr.hs
+++ b/compiler/nativeGen/SPARC/Ppr.hs
@@ -45,7 +45,6 @@ import CLabel
import Hoopl.Label
import Hoopl.Collections
-import Unique ( pprUniqueAlways )
import Outputable
import GHC.Platform
import FastString
@@ -148,12 +147,7 @@ pprReg :: Reg -> SDoc
pprReg reg
= case reg of
RegVirtual vr
- -> case vr of
- VirtualRegI u -> text "%vI_" <> pprUniqueAlways u
- VirtualRegHi u -> text "%vHi_" <> pprUniqueAlways u
- VirtualRegF u -> text "%vF_" <> pprUniqueAlways u
- VirtualRegD u -> text "%vD_" <> pprUniqueAlways u
-
+ -> ppr vr
RegReal rr
-> case rr of
@@ -221,7 +215,8 @@ pprFormat x
II32 -> sLit ""
II64 -> sLit "d"
FF32 -> sLit ""
- FF64 -> sLit "d")
+ FF64 -> sLit "d"
+ VecFormat _ _ _ -> panic "SPARC.Ppr.pprFormat: VecFormat")
-- | Pretty print a format for an instruction suffix.
@@ -235,7 +230,8 @@ pprStFormat x
II32 -> sLit ""
II64 -> sLit "x"
FF32 -> sLit ""
- FF64 -> sLit "d")
+ FF64 -> sLit "d"
+ VecFormat _ _ _ -> panic "SPARC.Ppr.pprFormat: VecFormat")
diff --git a/compiler/nativeGen/SPARC/Regs.hs b/compiler/nativeGen/SPARC/Regs.hs
index 0d7edc346a..e46dbd0d38 100644
--- a/compiler/nativeGen/SPARC/Regs.hs
+++ b/compiler/nativeGen/SPARC/Regs.hs
@@ -104,7 +104,6 @@ virtualRegSqueeze cls vr
VirtualRegD{} -> 1
_other -> 0
-
{-# INLINE realRegSqueeze #-}
realRegSqueeze :: RegClass -> RealReg -> Int
@@ -134,7 +133,6 @@ realRegSqueeze cls rr
RealRegPair{} -> 1
-
-- | All the allocatable registers in the machine,
-- including register pairs.
allRealRegs :: [RealReg]
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 13662f6807..ed3684e074 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -111,12 +111,25 @@ sse2Enabled = do
ArchX86 -> return True
_ -> panic "trying to generate x86/x86_64 on the wrong platform"
+sse4_1Enabled :: NatM Bool
+sse4_1Enabled = do
+ dflags <- getDynFlags
+ return (isSse4_1Enabled dflags)
sse4_2Enabled :: NatM Bool
sse4_2Enabled = do
dflags <- getDynFlags
return (isSse4_2Enabled dflags)
+sseEnabled :: NatM Bool
+sseEnabled = do
+ dflags <- getDynFlags
+ return (isSseEnabled dflags)
+
+avxEnabled :: NatM Bool
+avxEnabled = do
+ dflags <- getDynFlags
+ return (isAvxEnabled dflags)
cmmTopCodeGen
:: RawCmmDecl
@@ -215,6 +228,7 @@ stmtToInstrs bid stmt = do
CmmAssign reg src
| isFloatType ty -> assignReg_FltCode format reg src
| is32Bit && isWord64 ty -> assignReg_I64Code reg src
+ | isVecType ty -> assignReg_VecCode format reg src
| otherwise -> assignReg_IntCode format reg src
where ty = cmmRegType dflags reg
format = cmmTypeFormat ty
@@ -222,6 +236,7 @@ stmtToInstrs bid stmt = do
CmmStore addr src
| isFloatType ty -> assignMem_FltCode format addr src
| is32Bit && isWord64 ty -> assignMem_I64Code addr src
+ | isVecType ty -> assignMem_VecCode format addr src
| otherwise -> assignMem_IntCode format addr src
where ty = cmmExprType dflags src
format = cmmTypeFormat ty
@@ -308,6 +323,15 @@ getRegisterReg platform (CmmGlobal mid)
-- platform. Hence ...
+getVecRegisterReg :: Platform -> Bool -> Format -> CmmReg -> Reg
+getVecRegisterReg _ use_avx format (CmmLocal (LocalReg u pk))
+ | isVecType pk && use_avx = RegVirtual (mkVirtualReg u format)
+ | otherwise = pprPanic
+ (unlines ["avx flag is not enabled" ,
+ "or this is not a vector register"])
+ (ppr pk)
+getVecRegisterReg platform _use_avx _format c = getRegisterReg platform c
+
-- | Memory addressing modes passed up the tree.
data Amode
= Amode AddrMode InstrBlock
@@ -503,6 +527,13 @@ iselExpr64 expr
--------------------------------------------------------------------------------
+
+-- This is a helper data type used to reduce code duplication when
+-- generating code for arithmetic operations. It is not specific to
+-- any particular element type like Int8, Int32, etc.
+data VectorArithInstns = VA_Add | VA_Sub | VA_Mul | VA_Div
+
+
getRegister :: CmmExpr -> NatM Register
getRegister e = do dflags <- getDynFlags
is32Bit <- is32BitPlatform
@@ -520,16 +551,24 @@ getRegister' dflags is32Bit (CmmReg reg)
do reg' <- getPicBaseNat (archWordFormat is32Bit)
return (Fixed (archWordFormat is32Bit) reg' nilOL)
_ ->
- do
- let
- fmt = cmmTypeFormat (cmmRegType dflags reg)
- format = fmt
- --
- let platform = targetPlatform dflags
- return (Fixed format
- (getRegisterReg platform reg)
- nilOL)
-
+ do use_sse2 <- sse2Enabled
+ use_avx <- avxEnabled
+ let cmmregtype = cmmRegType dflags reg
+ if isVecType cmmregtype
+ then return (vectorRegister cmmregtype use_avx use_sse2)
+ else return (standardRegister cmmregtype)
+ where
+ vectorRegister :: CmmType -> Bool -> Bool -> Register
+ vectorRegister reg_ty use_avx use_sse2
+ | use_avx || use_sse2 =
+ let vecfmt = cmmTypeFormat reg_ty
+ platform = targetPlatform dflags
+ in (Fixed vecfmt (getVecRegisterReg platform True vecfmt reg) nilOL)
+ | otherwise = panic "Please enable the -mavx or -msse2 flag"
+
+ standardRegister crt =
+ let platform = targetPlatform dflags
+ in (Fixed (cmmTypeFormat crt) (getRegisterReg platform reg) nilOL)
getRegister' dflags is32Bit (CmmRegOff r n)
= getRegister' dflags is32Bit $ mangleIndexTree dflags r n
@@ -631,7 +670,69 @@ getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
return $ Any II64 (\dst -> unitOL $
LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
+getRegister' _ _ (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
+ sse4_1 <- sse4_1Enabled
+ sse2 <- sse2Enabled
+ sse <- sseEnabled
+ case mop of
+ MO_VF_Insert l W32 | sse4_1 && sse -> vector_float_pack l W32 x y z
+ | otherwise
+ -> sorry "Please enable the -msse4 and -msse flag"
+ MO_VF_Insert l W64 | sse2 && sse -> vector_float_pack l W64 x y z
+ | otherwise
+ -> sorry "Please enable the -msse2 and -msse flag"
+ _other -> incorrectOperands
+ where
+ vector_float_pack :: Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_pack len W32 expr1 expr2 (CmmLit offset)
+ = do
+ fn <- getAnyReg expr1
+ (r, exp) <- getSomeReg expr2
+ let f = VecFormat len FmtFloat W32
+ imm = litToImm offset
+ code dst = exp `appOL`
+ (fn dst) `snocOL`
+ (INSERTPS f (OpImm imm) (OpReg r) dst)
+ in return $ Any f code
+ vector_float_pack len W64 expr1 expr2 (CmmLit offset)
+ = do
+ Amode addr addr_code <- getAmode expr2
+ (r, exp) <- getSomeReg expr1
+
+ -- fn <- getAnyReg expr1
+ -- (r, exp) <- getSomeReg expr2
+ let f = VecFormat len FmtDouble W64
+ code dst
+ = case offset of
+ CmmInt 0 _ -> exp `appOL` addr_code `snocOL`
+ (MOVL f (OpAddr addr) (OpReg r)) `snocOL`
+ (MOVU f (OpReg r) (OpReg dst))
+ CmmInt 16 _ -> exp `appOL` addr_code `snocOL`
+ (MOVH f (OpAddr addr) (OpReg r)) `snocOL`
+ (MOVU f (OpReg r) (OpReg dst))
+ _ -> panic "Error in offset while packing"
+ -- code dst
+ -- = case offset of
+ -- CmmInt 0 _ -> exp `appOL`
+ -- (fn dst) `snocOL`
+ -- (MOVL f (OpReg r) (OpReg dst))
+ -- CmmInt 16 _ -> exp `appOL`
+ -- (fn dst) `snocOL`
+ -- (MOVH f (OpReg r) (OpReg dst))
+ -- _ -> panic "Error in offset while packing"
+ in return $ Any f code
+ vector_float_pack _ _ _ c _
+ = pprPanic "Pack not supported for : " (ppr c)
+
getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
+ sse2 <- sse2Enabled
+ sse <- sseEnabled
+ avx <- avxEnabled
case mop of
MO_F_Neg w -> sse2NegCode w x
@@ -708,23 +809,28 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
MO_FS_Conv from to -> coerceFP2Int from to x
MO_SF_Conv from to -> coerceInt2FP from to x
- MO_V_Insert {} -> needLlvm
- MO_V_Extract {} -> needLlvm
- MO_V_Add {} -> needLlvm
- MO_V_Sub {} -> needLlvm
- MO_V_Mul {} -> needLlvm
- MO_VS_Quot {} -> needLlvm
- MO_VS_Rem {} -> needLlvm
- MO_VS_Neg {} -> needLlvm
- MO_VU_Quot {} -> needLlvm
- MO_VU_Rem {} -> needLlvm
- MO_VF_Insert {} -> needLlvm
- MO_VF_Extract {} -> needLlvm
- MO_VF_Add {} -> needLlvm
- MO_VF_Sub {} -> needLlvm
- MO_VF_Mul {} -> needLlvm
- MO_VF_Quot {} -> needLlvm
- MO_VF_Neg {} -> needLlvm
+ MO_V_Insert {} -> needLlvm
+ MO_V_Extract {} -> needLlvm
+ MO_V_Add {} -> needLlvm
+ MO_V_Sub {} -> needLlvm
+ MO_V_Mul {} -> needLlvm
+ MO_VS_Quot {} -> needLlvm
+ MO_VS_Rem {} -> needLlvm
+ MO_VS_Neg {} -> needLlvm
+ MO_VU_Quot {} -> needLlvm
+ MO_VU_Rem {} -> needLlvm
+ MO_VF_Broadcast {} -> incorrectOperands
+ MO_VF_Insert {} -> incorrectOperands
+ MO_VF_Extract {} -> incorrectOperands
+ MO_VF_Add {} -> incorrectOperands
+ MO_VF_Sub {} -> incorrectOperands
+ MO_VF_Mul {} -> incorrectOperands
+ MO_VF_Quot {} -> incorrectOperands
+
+ MO_VF_Neg l w | avx -> vector_float_negate_avx l w x
+ | sse && sse2 -> vector_float_negate_sse l w x
+ | otherwise
+ -> sorry "Please enable the -mavx or -msse, -msse2 flag"
_other -> pprPanic "getRegister" (pprMachOp mop)
where
@@ -762,8 +868,45 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
= do e_code <- getRegister' dflags is32Bit expr
return (swizzleRegisterRep e_code new_format)
+ vector_float_negate_avx :: Length -> Width -> CmmExpr -> NatM Register
+ vector_float_negate_avx l w expr = do
+ tmp <- getNewRegNat (VecFormat l FmtFloat w)
+ (reg, exp) <- getSomeReg expr
+ Amode addr addr_code <- memConstant (mkAlignment $ widthInBytes W32) (CmmFloat 0.0 W32)
+ let format = case w of
+ W32 -> VecFormat l FmtFloat w
+ W64 -> VecFormat l FmtDouble w
+ _ -> pprPanic "Cannot negate vector of width" (ppr w)
+ code dst = case w of
+ W32 -> exp `appOL` addr_code `snocOL`
+ (VBROADCAST format addr tmp) `snocOL`
+ (VSUB format (OpReg reg) tmp dst)
+ W64 -> exp `appOL` addr_code `snocOL`
+ (MOVL format (OpAddr addr) (OpReg tmp)) `snocOL`
+ (MOVH format (OpAddr addr) (OpReg tmp)) `snocOL`
+ (VSUB format (OpReg reg) tmp dst)
+ _ -> pprPanic "Cannot negate vector of width" (ppr w)
+ return (Any format code)
+
+ vector_float_negate_sse :: Length -> Width -> CmmExpr -> NatM Register
+ vector_float_negate_sse l w expr = do
+ tmp <- getNewRegNat (VecFormat l FmtFloat w)
+ (reg, exp) <- getSomeReg expr
+ let format = case w of
+ W32 -> VecFormat l FmtFloat w
+ W64 -> VecFormat l FmtDouble w
+ _ -> pprPanic "Cannot negate vector of width" (ppr w)
+ code dst = exp `snocOL`
+ (XOR format (OpReg tmp) (OpReg tmp)) `snocOL`
+ (MOVU format (OpReg tmp) (OpReg dst)) `snocOL`
+ (SUB format (OpReg reg) (OpReg dst))
+ return (Any format code)
getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
+ sse4_1 <- sse4_1Enabled
+ sse2 <- sse2Enabled
+ sse <- sseEnabled
+ avx <- avxEnabled
case mop of
MO_F_Eq _ -> condFltReg is32Bit EQQ x y
MO_F_Ne _ -> condFltReg is32Bit NE x y
@@ -828,13 +971,49 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
MO_VS_Quot {} -> needLlvm
MO_VS_Rem {} -> needLlvm
MO_VS_Neg {} -> needLlvm
- MO_VF_Insert {} -> needLlvm
- MO_VF_Extract {} -> needLlvm
- MO_VF_Add {} -> needLlvm
- MO_VF_Sub {} -> needLlvm
- MO_VF_Mul {} -> needLlvm
- MO_VF_Quot {} -> needLlvm
- MO_VF_Neg {} -> needLlvm
+
+ MO_VF_Broadcast l W32 | avx -> vector_float_broadcast_avx l W32 x y
+ | sse4_1 -> vector_float_broadcast_sse l W32 x y
+ | otherwise
+ -> sorry "Please enable the -mavx or -msse4 flag"
+
+ MO_VF_Broadcast l W64 | sse2 -> vector_float_broadcast_avx l W64 x y
+ | otherwise -> sorry "Please enable the -msse2 flag"
+
+ MO_VF_Extract l W32 | avx -> vector_float_unpack l W32 x y
+ | sse -> vector_float_unpack_sse l W32 x y
+ | otherwise
+ -> sorry "Please enable the -mavx or -msse flag"
+
+ MO_VF_Extract l W64 | sse2 -> vector_float_unpack l W64 x y
+ | otherwise -> sorry "Please enable the -msse2 flag"
+
+ MO_VF_Add l w | avx -> vector_float_op_avx VA_Add l w x y
+ | sse && w == W32 -> vector_float_op_sse VA_Add l w x y
+ | sse2 && w == W64 -> vector_float_op_sse VA_Add l w x y
+ | otherwise
+ -> sorry "Please enable the -mavx or -msse flag"
+
+ MO_VF_Sub l w | avx -> vector_float_op_avx VA_Sub l w x y
+ | sse && w == W32 -> vector_float_op_sse VA_Sub l w x y
+ | sse2 && w == W64 -> vector_float_op_sse VA_Sub l w x y
+ | otherwise
+ -> sorry "Please enable the -mavx or -msse flag"
+
+ MO_VF_Mul l w | avx -> vector_float_op_avx VA_Mul l w x y
+ | sse && w == W32 -> vector_float_op_sse VA_Mul l w x y
+ | sse2 && w == W64 -> vector_float_op_sse VA_Mul l w x y
+ | otherwise
+ -> sorry "Please enable the -mavx or -msse flag"
+
+ MO_VF_Quot l w | avx -> vector_float_op_avx VA_Div l w x y
+ | sse && w == W32 -> vector_float_op_sse VA_Div l w x y
+ | sse2 && w == W64 -> vector_float_op_sse VA_Div l w x y
+ | otherwise
+ -> sorry "Please enable the -mavx or -msse flag"
+
+ MO_VF_Insert {} -> incorrectOperands
+ MO_VF_Neg {} -> incorrectOperands
_other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
where
@@ -930,7 +1109,171 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
-- TODO: There are other interesting patterns we want to replace
-- with a LEA, e.g. `(x + offset) + (y << shift)`.
+ -----------------------
+ -- Vector operations---
+ vector_float_op_avx :: VectorArithInstns
+ -> Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_op_avx op l w expr1 expr2 = do
+ (reg1, exp1) <- getSomeReg expr1
+ (reg2, exp2) <- getSomeReg expr2
+ let format = case w of
+ W32 -> VecFormat l FmtFloat W32
+ W64 -> VecFormat l FmtDouble W64
+ _ -> pprPanic "Operation not supported for width " (ppr w)
+ code dst = case op of
+ VA_Add -> arithInstr VADD
+ VA_Sub -> arithInstr VSUB
+ VA_Mul -> arithInstr VMUL
+ VA_Div -> arithInstr VDIV
+ where
+ -- opcode src2 src1 dst <==> dst = src1 `opcode` src2
+ arithInstr instr = exp1 `appOL` exp2 `snocOL`
+ (instr format (OpReg reg2) reg1 dst)
+ return (Any format code)
+
+ vector_float_op_sse :: VectorArithInstns
+ -> Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_op_sse op l w expr1 expr2 = do
+ (reg1, exp1) <- getSomeReg expr1
+ (reg2, exp2) <- getSomeReg expr2
+ let format = case w of
+ W32 -> VecFormat l FmtFloat W32
+ W64 -> VecFormat l FmtDouble W64
+ _ -> pprPanic "Operation not supported for width " (ppr w)
+ code dst = case op of
+ VA_Add -> arithInstr ADD
+ VA_Sub -> arithInstr SUB
+ VA_Mul -> arithInstr MUL
+ VA_Div -> arithInstr FDIV
+ where
+ -- opcode src2 src1 <==> src1 = src1 `opcode` src2
+ arithInstr instr
+ = exp1 `appOL` exp2 `snocOL`
+ (MOVU format (OpReg reg1) (OpReg dst)) `snocOL`
+ (instr format (OpReg reg2) (OpReg dst))
+ return (Any format code)
--------------------
+ vector_float_unpack :: Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_unpack l W32 expr (CmmLit lit)
+ = do
+ (r, exp) <- getSomeReg expr
+ let format = VecFormat l FmtFloat W32
+ imm = litToImm lit
+ code dst
+ = case lit of
+ CmmInt 0 _ -> exp `snocOL` (VMOVU format (OpReg r) (OpReg dst))
+ CmmInt _ _ -> exp `snocOL` (VPSHUFD format (OpImm imm) (OpReg r) dst)
+ _ -> panic "Error in offset while unpacking"
+ return (Any format code)
+ vector_float_unpack l W64 expr (CmmLit lit)
+ = do
+ dflags <- getDynFlags
+ (r, exp) <- getSomeReg expr
+ let format = VecFormat l FmtDouble W64
+ addr = spRel dflags 0
+ code dst
+ = case lit of
+ CmmInt 0 _ -> exp `snocOL`
+ (MOVL format (OpReg r) (OpAddr addr)) `snocOL`
+ (MOV FF64 (OpAddr addr) (OpReg dst))
+ CmmInt 1 _ -> exp `snocOL`
+ (MOVH format (OpReg r) (OpAddr addr)) `snocOL`
+ (MOV FF64 (OpAddr addr) (OpReg dst))
+ _ -> panic "Error in offset while unpacking"
+ return (Any format code)
+ vector_float_unpack _ w c e
+ = pprPanic "Unpack not supported for : " (ppr c $$ ppr e $$ ppr w)
+ -----------------------
+
+ vector_float_unpack_sse :: Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_unpack_sse l W32 expr (CmmLit lit)
+ = do
+ (r,exp) <- getSomeReg expr
+ let format = VecFormat l FmtFloat W32
+ imm = litToImm lit
+ code dst
+ = case lit of
+ CmmInt 0 _ -> exp `snocOL` (MOVU format (OpReg r) (OpReg dst))
+ CmmInt _ _ -> exp `snocOL` (PSHUFD format (OpImm imm) (OpReg r) dst)
+ _ -> panic "Error in offset while unpacking"
+ return (Any format code)
+ vector_float_unpack_sse _ w c e
+ = pprPanic "Unpack not supported for : " (ppr c $$ ppr e $$ ppr w)
+ -----------------------
+ vector_float_broadcast_avx :: Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_broadcast_avx len W32 expr1 expr2
+ = do
+ dflags <- getDynFlags
+ fn <- getAnyReg expr1
+ (r', exp) <- getSomeReg expr2
+ let f = VecFormat len FmtFloat W32
+ addr = spRel dflags 0
+ in return $ Any f (\r -> exp `appOL`
+ (fn r) `snocOL`
+ (MOVU f (OpReg r') (OpAddr addr)) `snocOL`
+ (VBROADCAST f addr r))
+ vector_float_broadcast_avx len W64 expr1 expr2
+ = do
+ dflags <- getDynFlags
+ fn <- getAnyReg expr1
+ (r', exp) <- getSomeReg expr2
+ let f = VecFormat len FmtDouble W64
+ addr = spRel dflags 0
+ in return $ Any f (\r -> exp `appOL`
+ (fn r) `snocOL`
+ (MOVU f (OpReg r') (OpAddr addr)) `snocOL`
+ (MOVL f (OpAddr addr) (OpReg r)) `snocOL`
+ (MOVH f (OpAddr addr) (OpReg r)))
+ vector_float_broadcast_avx _ _ c _
+ = pprPanic "Broadcast not supported for : " (ppr c)
+ -----------------------
+ vector_float_broadcast_sse :: Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_broadcast_sse len W32 expr1 expr2
+ = do
+ dflags <- getDynFlags
+ fn <- getAnyReg expr1 -- destination
+ (r, exp) <- getSomeReg expr2 -- source
+ let f = VecFormat len FmtFloat W32
+ addr = spRel dflags 0
+ code dst = exp `appOL`
+ (fn dst) `snocOL`
+ (MOVU f (OpReg r) (OpAddr addr)) `snocOL`
+ (insertps 0) `snocOL`
+ (insertps 16) `snocOL`
+ (insertps 32) `snocOL`
+ (insertps 48)
+ where
+ insertps off =
+ INSERTPS f (OpImm $ litToImm $ CmmInt off W32) (OpAddr addr) dst
+
+ in return $ Any f code
+ vector_float_broadcast_sse _ _ c _
+ = pprPanic "Broadcast not supported for : " (ppr c)
+ -----------------------
sub_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
sub_code rep x (CmmLit (CmmInt y _))
| is32BitInteger (-y) = add_int rep x (-y)
@@ -983,6 +1326,21 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
return (Fixed format result code)
+getRegister' _ _ (CmmLoad mem pk)
+ | isVecType pk = do
+ use_avx <- avxEnabled
+ use_sse <- sseEnabled
+ Amode addr mem_code <- getAmode mem
+ let format = cmmTypeFormat pk
+ code dst
+ | use_avx = mem_code `snocOL`
+ VMOVU format (OpAddr addr) (OpReg dst)
+ | use_sse = mem_code `snocOL`
+ MOVU format (OpAddr addr) (OpReg dst)
+ | otherwise = pprPanic (unlines ["avx or sse flag not enabled",
+ "for loading vector type "])
+ (ppr pk)
+ return (Any format code)
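This new CmmLoad case is the one exercised by user-level vector reads. A minimal sketch (assuming the same pragmas and imports as the tests below) of code that should land here, compiling to a single vmovups under -mavx, or movups under the SSE flags:

    {-# LANGUAGE MagicHash, UnboxedTuples #-}
    import GHC.Exts

    -- read four consecutive floats from a byte array into an XMM register
    readVec :: MutableByteArray# RealWorld -> Int# -> State# RealWorld
            -> (# State# RealWorld, FloatX4# #)
    readVec = readFloatArrayAsFloatX4#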
getRegister' _ _ (CmmLoad mem pk)
| isFloatType pk
@@ -1049,10 +1407,24 @@ getRegister' dflags is32Bit (CmmLit lit)
-- small memory model (see gcc docs, -mcmodel=small).
getRegister' dflags _ (CmmLit lit)
- = do let format = cmmTypeFormat (cmmLitType dflags lit)
- imm = litToImm lit
- code dst = unitOL (MOV format (OpImm imm) (OpReg dst))
- return (Any format code)
+ | isVecType cmmtype = vectorRegister cmmtype
+ | otherwise = standardRegister cmmtype
+ where
+ cmmtype = cmmLitType dflags lit
+ vectorRegister ctype
+ = do
+ -- NOTE:
+ -- This operation is only used to zero a register. For loading a
+ -- vector literal there are pack and broadcast operations.
+ let format = cmmTypeFormat ctype
+ code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
+ return (Any format code)
+ standardRegister ctype
+ = do
+ let format = cmmTypeFormat ctype
+ imm = litToImm lit
+ code dst = unitOL (MOV format (OpImm imm) (OpReg dst))
+ return (Any format code)
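The vector-literal case is deliberately narrow: as the note says, it only ever zeroes a register, and xor-ing a register with itself is the standard x86 idiom for that, needing neither a constant-pool entry nor a load. The identity it relies on, as a runnable check:

    import Data.Bits (xor)

    -- x `xor` x == 0 for every x, so XOR dst dst clears all lanes
    xorZeroes :: Int -> Bool
    xorZeroes x = x `xor` x == 0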
getRegister' _ _ other
| isVecExpr other = needLlvm
@@ -1118,8 +1490,14 @@ getNonClobberedReg expr = do
return (reg, code)
reg2reg :: Format -> Reg -> Reg -> Instr
-reg2reg format src dst = MOV format (OpReg src) (OpReg dst)
-
+reg2reg format@(VecFormat _ FmtFloat W32) src dst
+ = VMOVU format (OpReg src) (OpReg dst)
+reg2reg format@(VecFormat _ FmtDouble W64) src dst
+ = VMOVU format (OpReg src) (OpReg dst)
+reg2reg (VecFormat _ _ _) _ _
+ = panic "MOV operation not implemented for vectors"
+reg2reg format src dst
+ = MOV format (OpReg src) (OpReg dst)
--------------------------------------------------------------------------------
getAmode :: CmmExpr -> NatM Amode
@@ -1181,6 +1559,9 @@ getAmode' _ (CmmMachOp (MO_Add _)
getAmode' _ (CmmMachOp (MO_Add _) [x,y])
= x86_complex_amode x y 0 0
+getAmode' _ (CmmLit lit@(CmmFloat _ w))
+ = memConstant (mkAlignment $ widthInBytes w) lit
+
getAmode' is32Bit (CmmLit lit) | is32BitLit is32Bit lit
= return (Amode (ImmAddr (litToImm lit) 0) nilOL)
@@ -1561,7 +1942,8 @@ assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
-
+assignMem_VecCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_VecCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
-- integer assignment to memory
-- specific case of adding/subtracting an integer to a particular address.
@@ -1638,6 +2020,29 @@ assignReg_FltCode _ reg src = do
let platform = targetPlatform dflags
return (src_code (getRegisterReg platform reg))
+assignMem_VecCode pk addr src = do
+ (src_reg, src_code) <- getNonClobberedReg src
+ Amode addr addr_code <- getAmode addr
+ use_avx <- avxEnabled
+ use_sse <- sseEnabled
+ let
+ code | use_avx = src_code `appOL`
+ addr_code `snocOL`
+ (VMOVU pk (OpReg src_reg) (OpAddr addr))
+ | use_sse = src_code `appOL`
+ addr_code `snocOL`
+ (MOVU pk (OpReg src_reg) (OpAddr addr))
+ | otherwise = sorry "Please enable the -mavx or -msse flag"
+ return code
+
+assignReg_VecCode format reg src = do
+ use_avx <- avxEnabled
+ use_sse <- sseEnabled
+ src_code <- getAnyReg src
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ flag = use_avx || use_sse
+ return (src_code (getVecRegisterReg platform flag format reg))
genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
@@ -3362,6 +3767,7 @@ sse2NegCode w x = do
x@II16 -> wrongFmt x
x@II32 -> wrongFmt x
x@II64 -> wrongFmt x
+ x@VecFormat {} -> wrongFmt x
where
wrongFmt x = panic $ "sse2NegCode: " ++ show x
@@ -3376,29 +3782,33 @@ sse2NegCode w x = do
return (Any fmt code)
isVecExpr :: CmmExpr -> Bool
-isVecExpr (CmmMachOp (MO_V_Insert {}) _) = True
-isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True
-isVecExpr (CmmMachOp (MO_V_Add {}) _) = True
-isVecExpr (CmmMachOp (MO_V_Sub {}) _) = True
-isVecExpr (CmmMachOp (MO_V_Mul {}) _) = True
-isVecExpr (CmmMachOp (MO_VS_Quot {}) _) = True
-isVecExpr (CmmMachOp (MO_VS_Rem {}) _) = True
-isVecExpr (CmmMachOp (MO_VS_Neg {}) _) = True
-isVecExpr (CmmMachOp (MO_VF_Insert {}) _) = True
-isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True
-isVecExpr (CmmMachOp (MO_VF_Add {}) _) = True
-isVecExpr (CmmMachOp (MO_VF_Sub {}) _) = True
-isVecExpr (CmmMachOp (MO_VF_Mul {}) _) = True
-isVecExpr (CmmMachOp (MO_VF_Quot {}) _) = True
-isVecExpr (CmmMachOp (MO_VF_Neg {}) _) = True
-isVecExpr (CmmMachOp _ [e]) = isVecExpr e
-isVecExpr _ = False
+isVecExpr (CmmMachOp (MO_V_Insert {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Add {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Sub {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Mul {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Quot {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Rem {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Neg {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Broadcast {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Insert {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Add {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Sub {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Mul {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Quot {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Neg {}) _) = True
+isVecExpr (CmmMachOp _ [e]) = isVecExpr e
+isVecExpr _ = False
needLlvm :: NatM a
needLlvm =
sorry $ unlines ["The native code generator does not support vector"
,"instructions. Please use -fllvm."]
+incorrectOperands :: NatM a
+incorrectOperands = sorry "Incorrect number of operands"
+
-- | This works on the invariant that all jumps in the given blocks are required.
-- Starting from there we try to make a few more jumps redundant by reordering
-- them.
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index 6e5d656beb..47b62e62e7 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -328,6 +328,36 @@ data Instr
| CMPXCHG Format Operand Operand -- src (r), dst (r/m), eax implicit
| MFENCE
+ -- Vector Instructions --
+ -- NOTE: Instructions follow the AT&T syntax
+ -- Constructors and deconstructors
+ | VBROADCAST Format AddrMode Reg
+ | VEXTRACT Format Operand Reg Operand
+ | INSERTPS Format Operand Operand Reg
+
+ -- move operations
+ | VMOVU Format Operand Operand
+ | MOVU Format Operand Operand
+ | MOVL Format Operand Operand
+ | MOVH Format Operand Operand
+
+ -- logic operations
+ | VPXOR Format Reg Reg Reg
+
+ -- Arithmetic
+ | VADD Format Operand Reg Reg
+ | VSUB Format Operand Reg Reg
+ | VMUL Format Operand Reg Reg
+ | VDIV Format Operand Reg Reg
+
+ -- Shuffle
+ | VPSHUFD Format Operand Operand Reg
+ | PSHUFD Format Operand Operand Reg
+
+ -- Shift
+ | PSLLDQ Format Operand Reg
+ | PSRLDQ Format Operand Reg
+
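To make the new constructors concrete, here is a hypothetical value of the AVX add and the AT&T-style text it should render to once the Ppr changes below are in place (assuming xmm from X86.Regs names the usual registers):

    -- three-operand AVX add over 4 x 32-bit floats; with the printers
    -- below this renders roughly as:  vaddps %xmm1, %xmm2, %xmm3
    exampleVAdd :: Instr
    exampleVAdd = VADD (VecFormat 4 FmtFloat W32) (OpReg (xmm 1)) (xmm 2) (xmm 3)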
data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2
@@ -430,6 +460,31 @@ x86_regUsageOfInstr platform instr
CMPXCHG _ src dst -> usageRMM src dst (OpReg eax)
MFENCE -> noUsage
+ -- vector instructions
+ VBROADCAST _ src dst -> mkRU (use_EA src []) [dst]
+ VEXTRACT _ off src dst -> mkRU ((use_R off []) ++ [src]) (use_R dst [])
+ INSERTPS _ off src dst
+ -> mkRU ((use_R off []) ++ (use_R src []) ++ [dst]) [dst]
+
+ VMOVU _ src dst -> mkRU (use_R src []) (use_R dst [])
+ MOVU _ src dst -> mkRU (use_R src []) (use_R dst [])
+ MOVL _ src dst -> mkRU (use_R src []) (use_R dst [])
+ MOVH _ src dst -> mkRU (use_R src []) (use_R dst [])
+ VPXOR _ s1 s2 dst -> mkRU [s1,s2] [dst]
+
+ VADD _ s1 s2 dst -> mkRU ((use_R s1 []) ++ [s2]) [dst]
+ VSUB _ s1 s2 dst -> mkRU ((use_R s1 []) ++ [s2]) [dst]
+ VMUL _ s1 s2 dst -> mkRU ((use_R s1 []) ++ [s2]) [dst]
+ VDIV _ s1 s2 dst -> mkRU ((use_R s1 []) ++ [s2]) [dst]
+
+ VPSHUFD _ off src dst
+ -> mkRU (concatMap (\op -> use_R op []) [off, src]) [dst]
+ PSHUFD _ off src dst
+ -> mkRU (concatMap (\op -> use_R op []) [off, src]) [dst]
+
+ PSLLDQ _ off dst -> mkRU (use_R off []) [dst]
+ PSRLDQ _ off dst -> mkRU (use_R off []) [dst]
+
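Note the discipline these sets encode: the VEX-encoded arithmetic ops are non-destructive three-operand forms, so their destination appears only in the write set, whereas two-operand scalar x86 ops typically read and write their destination. Schematically, for VADD:

    -- (reads, writes) as handed to mkRU above; dst is never read
    vaddUsage :: ([String], [String])
    vaddUsage = (["s1", "s2"], ["dst"])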
_other -> panic "regUsage: unrecognised instr"
where
-- # Definitions
@@ -588,6 +643,32 @@ x86_patchRegsOfInstr instr env
CMPXCHG fmt src dst -> patch2 (CMPXCHG fmt) src dst
MFENCE -> instr
+ -- vector instructions
+ VBROADCAST fmt src dst -> VBROADCAST fmt (lookupAddr src) (env dst)
+ VEXTRACT fmt off src dst
+ -> VEXTRACT fmt (patchOp off) (env src) (patchOp dst)
+ INSERTPS fmt off src dst
+ -> INSERTPS fmt (patchOp off) (patchOp src) (env dst)
+
+ VMOVU fmt src dst -> VMOVU fmt (patchOp src) (patchOp dst)
+ MOVU fmt src dst -> MOVU fmt (patchOp src) (patchOp dst)
+ MOVL fmt src dst -> MOVL fmt (patchOp src) (patchOp dst)
+ MOVH fmt src dst -> MOVH fmt (patchOp src) (patchOp dst)
+ VPXOR fmt s1 s2 dst -> VPXOR fmt (env s1) (env s2) (env dst)
+
+ VADD fmt s1 s2 dst -> VADD fmt (patchOp s1) (env s2) (env dst)
+ VSUB fmt s1 s2 dst -> VSUB fmt (patchOp s1) (env s2) (env dst)
+ VMUL fmt s1 s2 dst -> VMUL fmt (patchOp s1) (env s2) (env dst)
+ VDIV fmt s1 s2 dst -> VDIV fmt (patchOp s1) (env s2) (env dst)
+
+ VPSHUFD fmt off src dst
+ -> VPSHUFD fmt (patchOp off) (patchOp src) (env dst)
+ PSHUFD fmt off src dst
+ -> PSHUFD fmt (patchOp off) (patchOp src) (env dst)
+ PSLLDQ fmt off dst
+ -> PSLLDQ fmt (patchOp off) (env dst)
+ PSRLDQ fmt off dst
+ -> PSRLDQ fmt (patchOp off) (env dst)
_other -> panic "patchRegs: unrecognised instr"
where
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index 095d9eba7c..a3f27ba471 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -41,7 +41,6 @@ import DynFlags
import Cmm hiding (topInfoTable)
import BlockId
import CLabel
-import Unique ( pprUniqueAlways )
import GHC.Platform
import FastString
import Outputable
@@ -280,10 +279,7 @@ pprReg f r
if target32Bit platform then ppr32_reg_no f i
else ppr64_reg_no f i
RegReal (RealRegPair _ _) -> panic "X86.Ppr: no reg pairs on this arch"
- RegVirtual (VirtualRegI u) -> text "%vI_" <> pprUniqueAlways u
- RegVirtual (VirtualRegHi u) -> text "%vHi_" <> pprUniqueAlways u
- RegVirtual (VirtualRegF u) -> text "%vF_" <> pprUniqueAlways u
- RegVirtual (VirtualRegD u) -> text "%vD_" <> pprUniqueAlways u
+ RegVirtual v -> ppr v
where
ppr32_reg_no :: Format -> Int -> SDoc
@@ -395,6 +391,11 @@ pprFormat x
II64 -> sLit "q"
FF32 -> sLit "ss" -- "scalar single-precision float" (SSE2)
FF64 -> sLit "sd" -- "scalar double-precision float" (SSE2)
+
+ VecFormat _ FmtFloat W32 -> sLit "ps"
+ VecFormat _ FmtDouble W64 -> sLit "pd"
+ -- TODO: Add Ints and remove panic
+ VecFormat {} -> panic "Unsupported vector format"
)
pprFormat_x87 :: Format -> SDoc
@@ -783,6 +784,41 @@ pprInstr (IDIV fmt op) = pprFormatOp (sLit "idiv") fmt op
pprInstr (DIV fmt op) = pprFormatOp (sLit "div") fmt op
pprInstr (IMUL2 fmt op) = pprFormatOp (sLit "imul") fmt op
+-- Vector Instructions
+
+pprInstr (VADD format s1 s2 dst)
+ = pprFormatOpRegReg (sLit "vadd") format s1 s2 dst
+pprInstr (VSUB format s1 s2 dst)
+ = pprFormatOpRegReg (sLit "vsub") format s1 s2 dst
+pprInstr (VMUL format s1 s2 dst)
+ = pprFormatOpRegReg (sLit "vmul") format s1 s2 dst
+pprInstr (VDIV format s1 s2 dst)
+ = pprFormatOpRegReg (sLit "vdiv") format s1 s2 dst
+pprInstr (VBROADCAST format from to)
+ = pprBroadcast (sLit "vbroadcast") format from to
+pprInstr (VMOVU format from to)
+ = pprFormatOpOp (sLit "vmovu") format from to
+pprInstr (MOVU format from to)
+ = pprFormatOpOp (sLit "movu") format from to
+pprInstr (MOVL format from to)
+ = pprFormatOpOp (sLit "movl") format from to
+pprInstr (MOVH format from to)
+ = pprFormatOpOp (sLit "movh") format from to
+pprInstr (VPXOR format s1 s2 dst)
+ = pprXor (sLit "vpxor") format s1 s2 dst
+pprInstr (VEXTRACT format offset from to)
+ = pprFormatOpRegOp (sLit "vextract") format offset from to
+pprInstr (INSERTPS format offset addr dst)
+ = pprInsert (sLit "insertps") format offset addr dst
+pprInstr (VPSHUFD format offset src dst)
+ = pprShuf (sLit "vpshufd") format offset src dst
+pprInstr (PSHUFD format offset src dst)
+ = pprShuf (sLit "pshufd") format offset src dst
+pprInstr (PSLLDQ format offset dst)
+ = pprShiftLeft (sLit "pslldq") format offset dst
+pprInstr (PSRLDQ format offset dst)
+ = pprShiftRight (sLit "psrldq") format offset dst
+
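Combined with the new "ps"/"pd" suffixes in pprFormat, these cases yield the expected packed mnemonics. A few illustrative renderings (AT&T operand order, destination last):

    -- expected output shapes, written out by hand rather than generated
    renderedExamples :: [String]
    renderedExamples =
      [ "vaddps %xmm1, %xmm2, %xmm3"   -- VADD with VecFormat 4 FmtFloat W32
      , "vmulpd %xmm1, %xmm2, %xmm3"   -- VMUL with VecFormat 2 FmtDouble W64
      , "vmovups (%rsp), %xmm0"        -- VMOVU loading through a spill slot
      , "vbroadcastss (%rsp), %xmm0"   -- VBROADCAST; FmtFloat selects "ss"
      ]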
-- x86_64 only
pprInstr (MUL format op1 op2) = pprFormatOpOp (sLit "mul") format op1 op2
pprInstr (MUL2 format op) = pprFormatOp (sLit "mul") format op
@@ -875,6 +911,23 @@ pprMnemonic :: PtrString -> Format -> SDoc
pprMnemonic name format =
char '\t' <> ptext name <> pprFormat format <> space
+pprGenMnemonic :: PtrString -> Format -> SDoc
+pprGenMnemonic name _ =
+ char '\t' <> ptext name <> space
+
+pprBroadcastMnemonic :: PtrString -> Format -> SDoc
+pprBroadcastMnemonic name format =
+ char '\t' <> ptext name <> pprBroadcastFormat format <> space
+
+pprBroadcastFormat :: Format -> SDoc
+pprBroadcastFormat x
+ = ptext (case x of
+ VecFormat _ FmtFloat W32 -> sLit "ss"
+ VecFormat _ FmtDouble W64 -> sLit "sd"
+ -- TODO: Add Ints and remove panic
+ VecFormat {} -> panic "Unsupported vector format"
+ _ -> panic "Scalar Format invading vector operation"
+ )
pprFormatImmOp :: PtrString -> Format -> Imm -> Operand -> SDoc
pprFormatImmOp name format imm op1
@@ -921,7 +974,16 @@ pprOpOp name format op1 op2
pprOperand format op2
]
-
+pprFormatOpRegOp :: PtrString -> Format -> Operand -> Reg -> Operand -> SDoc
+pprFormatOpRegOp name format off reg1 op2
+ = hcat [
+ pprMnemonic name format,
+ pprOperand format off,
+ comma,
+ pprReg format reg1,
+ comma,
+ pprOperand format op2
+ ]
pprRegReg :: PtrString -> Reg -> Reg -> SDoc
pprRegReg name reg1 reg2
@@ -944,6 +1006,17 @@ pprFormatOpReg name format op1 reg2
pprReg (archWordFormat (target32Bit platform)) reg2
]
+pprFormatOpRegReg :: PtrString -> Format -> Operand -> Reg -> Reg -> SDoc
+pprFormatOpRegReg name format op1 reg2 reg3
+ = hcat [
+ pprMnemonic name format,
+ pprOperand format op1,
+ comma,
+ pprReg format reg2,
+ comma,
+ pprReg format reg3
+ ]
+
pprCondOpReg :: PtrString -> Format -> Cond -> Operand -> Reg -> SDoc
pprCondOpReg name format cond op1 reg2
= hcat [
@@ -1008,3 +1081,68 @@ pprFormatOpOpCoerce name format1 format2 op1 op2
pprCondInstr :: PtrString -> Cond -> SDoc -> SDoc
pprCondInstr name cond arg
= hcat [ char '\t', ptext name, pprCond cond, space, arg]
+
+
+-- Custom pretty printers
+-- These instructions currently don't follow a uniform suffix pattern
+-- in their names, so we have custom pretty printers for them.
+
+pprBroadcast :: PtrString -> Format -> AddrMode -> Reg -> SDoc
+pprBroadcast name format op dst
+ = hcat [
+ pprBroadcastMnemonic name format,
+ pprAddr op,
+ comma,
+ pprReg format dst
+ ]
+
+pprXor :: PtrString -> Format -> Reg -> Reg -> Reg -> SDoc
+pprXor name format reg1 reg2 reg3
+ = hcat [
+ pprGenMnemonic name format,
+ pprReg format reg1,
+ comma,
+ pprReg format reg2,
+ comma,
+ pprReg format reg3
+ ]
+
+pprInsert :: PtrString -> Format -> Operand -> Operand -> Reg -> SDoc
+pprInsert name format off src dst
+ = hcat [
+ pprGenMnemonic name format,
+ pprOperand format off,
+ comma,
+ pprOperand format src,
+ comma,
+ pprReg format dst
+ ]
+
+pprShuf :: PtrString -> Format -> Operand -> Operand -> Reg -> SDoc
+pprShuf name format op1 op2 reg3
+ = hcat [
+ pprGenMnemonic name format,
+ pprOperand format op1,
+ comma,
+ pprOperand format op2,
+ comma,
+ pprReg format reg3
+ ]
+
+pprShiftLeft :: PtrString -> Format -> Operand -> Reg -> SDoc
+pprShiftLeft name format off reg
+ = hcat [
+ pprGenMnemonic name format,
+ pprOperand format off,
+ comma,
+ pprReg format reg
+ ]
+
+pprShiftRight :: PtrString -> Format -> Operand -> Reg -> SDoc
+pprShiftRight name format off reg
+ = hcat [
+ pprGenMnemonic name format,
+ pprOperand format off,
+ comma,
+ pprReg format reg
+ ]
diff --git a/compiler/nativeGen/X86/RegInfo.hs b/compiler/nativeGen/X86/RegInfo.hs
index 19056be4fa..a7784bacad 100644
--- a/compiler/nativeGen/X86/RegInfo.hs
+++ b/compiler/nativeGen/X86/RegInfo.hs
@@ -22,6 +22,8 @@ import UniqFM
import X86.Regs
+-- TODO:
+-- add VirtualRegAVX, inspect VecFormat, and allocate accordingly
mkVirtualReg :: Unique -> Format -> VirtualReg
mkVirtualReg u format
= case format of
@@ -31,6 +33,7 @@ mkVirtualReg u format
-- For now we map both to being allocated as "Double" Registers
-- on X86/X86_64
FF64 -> VirtualRegD u
+ VecFormat {} -> VirtualRegVec u
_other -> VirtualRegI u
regDotColor :: Platform -> RealReg -> SDoc
diff --git a/compiler/nativeGen/X86/Regs.hs b/compiler/nativeGen/X86/Regs.hs
index 2d9fd88c8e..f0e4c7d5f6 100644
--- a/compiler/nativeGen/X86/Regs.hs
+++ b/compiler/nativeGen/X86/Regs.hs
@@ -84,6 +84,7 @@ virtualRegSqueeze cls vr
-> case vr of
VirtualRegD{} -> 1
VirtualRegF{} -> 0
+ VirtualRegVec{} -> 1
_other -> 0
diff --git a/includes/CodeGen.Platform.hs b/includes/CodeGen.Platform.hs
index 27a9324438..34c943d053 100644
--- a/includes/CodeGen.Platform.hs
+++ b/includes/CodeGen.Platform.hs
@@ -495,13 +495,13 @@ activeStgRegs = [
,DoubleReg 1
#endif
#if defined(REG_XMM1)
- ,XmmReg 1
+ ,XmmReg 1 2 W64 Integer
#endif
#if defined(REG_YMM1)
- ,YmmReg 1
+ ,YmmReg 1 4 W64 Integer
#endif
#if defined(REG_ZMM1)
- ,ZmmReg 1
+ ,ZmmReg 1 8 W64 Integer
#endif
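The Xmm/Ymm/Zmm global registers now take three extra arguments. From their uses in this file they read as element count, element width and element class, so these defaults view each register as W64 integer lanes: 2 filling XMM's 128 bits, 4 for YMM, 8 for ZMM. A hedged sketch of that reading:

    -- an assumption from this diff, not a quoted definition:
    --   XmmReg regNo length width elemClass
    defaultXmm1 :: GlobalReg
    defaultXmm1 = XmmReg 1 2 W64 Integer   -- XMM1 as 2 x 64-bit integers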
#if defined(REG_F2)
,FloatReg 2
@@ -510,13 +510,13 @@ activeStgRegs = [
,DoubleReg 2
#endif
#if defined(REG_XMM2)
- ,XmmReg 2
+ ,XmmReg 2 2 W64 Integer
#endif
#if defined(REG_YMM2)
- ,YmmReg 2
+ ,YmmReg 2 4 W64 Integer
#endif
#if defined(REG_ZMM2)
- ,ZmmReg 2
+ ,ZmmReg 2 8 W64 Integer
#endif
#if defined(REG_F3)
,FloatReg 3
@@ -525,13 +525,13 @@ activeStgRegs = [
,DoubleReg 3
#endif
#if defined(REG_XMM3)
- ,XmmReg 3
+ ,XmmReg 3 2 W64 Integer
#endif
#if defined(REG_YMM3)
- ,YmmReg 3
+ ,YmmReg 3 4 W64 Integer
#endif
#if defined(REG_ZMM3)
- ,ZmmReg 3
+ ,ZmmReg 3 8 W64 Integer
#endif
#if defined(REG_F4)
,FloatReg 4
@@ -540,13 +540,13 @@ activeStgRegs = [
,DoubleReg 4
#endif
#if defined(REG_XMM4)
- ,XmmReg 4
+ ,XmmReg 4 2 W64 Integer
#endif
#if defined(REG_YMM4)
- ,YmmReg 4
+ ,YmmReg 4 4 W64 Integer
#endif
#if defined(REG_ZMM4)
- ,ZmmReg 4
+ ,ZmmReg 4 8 W64 Integer
#endif
#if defined(REG_F5)
,FloatReg 5
@@ -555,13 +555,13 @@ activeStgRegs = [
,DoubleReg 5
#endif
#if defined(REG_XMM5)
- ,XmmReg 5
+ ,XmmReg 5 2 W64 Integer
#endif
#if defined(REG_YMM5)
- ,YmmReg 5
+ ,YmmReg 5 4 W64 Integer
#endif
#if defined(REG_ZMM5)
- ,ZmmReg 5
+ ,ZmmReg 5 8 W64 Integer
#endif
#if defined(REG_F6)
,FloatReg 6
@@ -570,13 +570,13 @@ activeStgRegs = [
,DoubleReg 6
#endif
#if defined(REG_XMM6)
- ,XmmReg 6
+ ,XmmReg 6 2 W64 Integer
#endif
#if defined(REG_YMM6)
- ,YmmReg 6
+ ,YmmReg 6 4 W64 Integer
#endif
#if defined(REG_ZMM6)
- ,ZmmReg 6
+ ,ZmmReg 6 8 W64 Integer
#endif
#else /* MAX_REAL_XMM_REG == 0 */
#if defined(REG_F1)
@@ -733,62 +733,62 @@ globalRegMaybe (DoubleReg 6) =
# endif
# if MAX_REAL_XMM_REG != 0
# if defined(REG_XMM1)
-globalRegMaybe (XmmReg 1) = Just (RealRegSingle REG_XMM1)
+globalRegMaybe (XmmReg 1 _ _ _) = Just (RealRegSingle REG_XMM1)
# endif
# if defined(REG_XMM2)
-globalRegMaybe (XmmReg 2) = Just (RealRegSingle REG_XMM2)
+globalRegMaybe (XmmReg 2 _ _ _) = Just (RealRegSingle REG_XMM2)
# endif
# if defined(REG_XMM3)
-globalRegMaybe (XmmReg 3) = Just (RealRegSingle REG_XMM3)
+globalRegMaybe (XmmReg 3 _ _ _) = Just (RealRegSingle REG_XMM3)
# endif
# if defined(REG_XMM4)
-globalRegMaybe (XmmReg 4) = Just (RealRegSingle REG_XMM4)
+globalRegMaybe (XmmReg 4 _ _ _) = Just (RealRegSingle REG_XMM4)
# endif
# if defined(REG_XMM5)
-globalRegMaybe (XmmReg 5) = Just (RealRegSingle REG_XMM5)
+globalRegMaybe (XmmReg 5 _ _ _) = Just (RealRegSingle REG_XMM5)
# endif
# if defined(REG_XMM6)
-globalRegMaybe (XmmReg 6) = Just (RealRegSingle REG_XMM6)
+globalRegMaybe (XmmReg 6 _ _ _) = Just (RealRegSingle REG_XMM6)
# endif
# endif
# if defined(MAX_REAL_YMM_REG) && MAX_REAL_YMM_REG != 0
# if defined(REG_YMM1)
-globalRegMaybe (YmmReg 1) = Just (RealRegSingle REG_YMM1)
+globalRegMaybe (YmmReg 1 _ _ _) = Just (RealRegSingle REG_YMM1)
# endif
# if defined(REG_YMM2)
-globalRegMaybe (YmmReg 2) = Just (RealRegSingle REG_YMM2)
+globalRegMaybe (YmmReg 2 _ _ _) = Just (RealRegSingle REG_YMM2)
# endif
# if defined(REG_YMM3)
-globalRegMaybe (YmmReg 3) = Just (RealRegSingle REG_YMM3)
+globalRegMaybe (YmmReg 3 _ _ _) = Just (RealRegSingle REG_YMM3)
# endif
# if defined(REG_YMM4)
-globalRegMaybe (YmmReg 4) = Just (RealRegSingle REG_YMM4)
+globalRegMaybe (YmmReg 4 _ _ _) = Just (RealRegSingle REG_YMM4)
# endif
# if defined(REG_YMM5)
-globalRegMaybe (YmmReg 5) = Just (RealRegSingle REG_YMM5)
+globalRegMaybe (YmmReg 5 _ _ _) = Just (RealRegSingle REG_YMM5)
# endif
# if defined(REG_YMM6)
-globalRegMaybe (YmmReg 6) = Just (RealRegSingle REG_YMM6)
+globalRegMaybe (YmmReg 6 _ _ _) = Just (RealRegSingle REG_YMM6)
# endif
# endif
# if defined(MAX_REAL_ZMM_REG) && MAX_REAL_ZMM_REG != 0
# if defined(REG_ZMM1)
-globalRegMaybe (ZmmReg 1) = Just (RealRegSingle REG_ZMM1)
+globalRegMaybe (ZmmReg 1 _ _ _) = Just (RealRegSingle REG_ZMM1)
# endif
# if defined(REG_ZMM2)
-globalRegMaybe (ZmmReg 2) = Just (RealRegSingle REG_ZMM2)
+globalRegMaybe (ZmmReg 2 _ _ _) = Just (RealRegSingle REG_ZMM2)
# endif
# if defined(REG_ZMM3)
-globalRegMaybe (ZmmReg 3) = Just (RealRegSingle REG_ZMM3)
+globalRegMaybe (ZmmReg 3 _ _ _) = Just (RealRegSingle REG_ZMM3)
# endif
# if defined(REG_ZMM4)
-globalRegMaybe (ZmmReg 4) = Just (RealRegSingle REG_ZMM4)
+globalRegMaybe (ZmmReg 4 _ _ _) = Just (RealRegSingle REG_ZMM4)
# endif
# if defined(REG_ZMM5)
-globalRegMaybe (ZmmReg 5) = Just (RealRegSingle REG_ZMM5)
+globalRegMaybe (ZmmReg 5 _ _ _) = Just (RealRegSingle REG_ZMM5)
# endif
# if defined(REG_ZMM6)
-globalRegMaybe (ZmmReg 6) = Just (RealRegSingle REG_ZMM6)
+globalRegMaybe (ZmmReg 6 _ _ _) = Just (RealRegSingle REG_ZMM6)
# endif
# endif
# if defined(REG_Sp)
diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T
index 20ac9cc59e..eb6fee544f 100644
--- a/testsuite/tests/codeGen/should_run/all.T
+++ b/testsuite/tests/codeGen/should_run/all.T
@@ -80,11 +80,20 @@ test('cgrun069',
test('cgrun070', normal, compile_and_run, [''])
test('cgrun071', [when(have_cpu_feature('sse4_2'), extra_hc_opts('-msse4.2'))], compile_and_run, [''])
test('cgrun072', normal, compile_and_run, [''])
+test('cgrun074', normal, compile_and_run, [''])
test('cgrun075', normal, compile_and_run, [''])
test('cgrun076', normal, compile_and_run, [''])
test('cgrun077', [when(have_cpu_feature('bmi2'), extra_hc_opts('-mbmi2'))], compile_and_run, [''])
test('cgrun078', omit_ways(['ghci']), compile_and_run, [''])
+# N.B. Only NCG and LLVM backends support SIMD operations
+test('simd000', when(unregisterised(), skip), compile_and_run, [''])
+test('simd001', when(unregisterised(), skip), compile_and_run, [''])
+test('simd002', when(unregisterised(), skip), compile_and_run, [''])
+test('simd003', when(unregisterised(), skip), compile_and_run, [''])
+test('simd004', when(unregisterised(), skip), compile_and_run, [''])
+test('simd005', when(unregisterised(), skip), compile_and_run, [''])
+
test('T1852', normal, compile_and_run, [''])
test('T1861', extra_run_opts('0'), compile_and_run, [''])
test('T2080', normal, compile_and_run, [''])
@@ -143,7 +152,6 @@ test('T9001', normal, compile_and_run, [''])
test('T9013', omit_ways(['ghci']), # ghci doesn't support unboxed tuples
compile_and_run, [''])
test('T9340', normal, compile_and_run, [''])
-test('cgrun074', normal, compile_and_run, [''])
test('CmmSwitchTest32', unless(wordsize(32), skip), compile_and_run, [''])
test('CmmSwitchTest64', unless(wordsize(64), skip), compile_and_run, [''])
# Skipping WAY=ghci, because it is not broken.
diff --git a/testsuite/tests/codeGen/should_run/cgrun083.hs b/testsuite/tests/codeGen/should_run/cgrun083.hs
new file mode 100644
index 0000000000..cac889ec02
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/cgrun083.hs
@@ -0,0 +1,70 @@
+{-# OPTIONS_GHC -O2 #-}
+{-# OPTIONS_GHC -msse #-}
+{-# OPTIONS_GHC -msse2 #-}
+{-# OPTIONS_GHC -msse4 #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+-- tests for SSE-based vector load/store operations
+
+import GHC.Exts
+import GHC.IO
+
+data ByteArray = BA (MutableByteArray# RealWorld)
+
+data FloatX4 = FX4# FloatX4#
+
+instance Show FloatX4 where
+ show (FX4# f) = case (unpackFloatX4# f) of
+ (# a, b, c, d #) -> show ((F# a), (F# b), (F# c), (F# d))
+
+
+instance Eq FloatX4 where
+ (FX4# a) == (FX4# b)
+ = case (unpackFloatX4# a) of
+ (# a1, a2, a3, a4 #) ->
+ case (unpackFloatX4# b) of
+ (# b1, b2, b3, b4 #) -> (F# a1) == (F# b1) &&
+ (F# a2) == (F# b2) &&
+ (F# a3) == (F# b3) &&
+ (F# a4) == (F# b4)
+
+data DoubleX2 = DX2# DoubleX2#
+
+instance Show DoubleX2 where
+ show (DX2# d) = case (unpackDoubleX2# d) of
+ (# a, b #) -> show ((D# a), (D# b))
+
+
+instance Eq DoubleX2 where
+ (DX2# a) == (DX2# b)
+ = case (unpackDoubleX2# a) of
+ (# a1, a2 #) ->
+ case (unpackDoubleX2# b) of
+ (# b1, b2 #) -> (D# a1) == (D# b1) &&
+ (D# a2) == (D# b2)
+
+writeFloatArray :: ByteArray -> Int -> Float -> IO ()
+writeFloatArray (BA ba) (I# i) (F# n) = IO $ \s ->
+ case writeFloatArray# ba i n s of s' -> (# s', () #)
+
+readFloatX4 :: ByteArray -> Int -> IO FloatX4
+readFloatX4 (BA ba) (I# i) = IO $ \s ->
+ case readFloatArrayAsFloatX4# ba i s of (# s', r #) -> (# s', FX4# r #)
+
+writeDoubleArray :: ByteArray -> Int -> Double -> IO ()
+writeDoubleArray (BA ba) (I# i) (D# n) = IO $ \s ->
+ case writeDoubleArray# ba i n s of s' -> (# s', () #)
+
+readDoubleX2 :: ByteArray -> Int -> IO DoubleX2
+readDoubleX2 (BA ba) (I# i) = IO $ \s ->
+ case readDoubleArrayAsDoubleX2# ba i s of (# s', r #) -> (# s', DX2# r #)
+
+main :: IO ()
+main = do
+ ba <- IO $ \s -> case newAlignedPinnedByteArray# 64# 64# s of (# s', ba #) -> (# s', BA ba #)
+
+ mapM_ (\i -> writeFloatArray ba i (realToFrac i + realToFrac i / 10)) [0..15]
+ print =<< readFloatX4 ba 0
+
+ mapM_ (\i -> writeDoubleArray ba i (realToFrac i + realToFrac i / 10)) [0..7]
+ print =<< readDoubleX2 ba 0
diff --git a/testsuite/tests/codeGen/should_run/cgrun083.stdout b/testsuite/tests/codeGen/should_run/cgrun083.stdout
new file mode 100644
index 0000000000..bc41b3d2d3
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/cgrun083.stdout
@@ -0,0 +1,2 @@
+(0.0,1.1,2.2,3.3)
+(0.0,1.1)
diff --git a/testsuite/tests/codeGen/should_run/simd000.hs b/testsuite/tests/codeGen/should_run/simd000.hs
new file mode 100644
index 0000000000..47d69497c0
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/simd000.hs
@@ -0,0 +1,21 @@
+{-# OPTIONS_GHC -mavx #-}
+{-# OPTIONS_GHC -msse4 #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+-- !!! test broadcasting, packing and unpacking for vector types
+
+import GHC.Exts
+
+main :: IO ()
+main = do
+ -- FloatX4#
+ case unpackFloatX4# (broadcastFloatX4# 1.5#) of
+ (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
+ case unpackFloatX4# (packFloatX4# (# 4.5#,7.8#, 2.3#, 6.5# #)) of
+ (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
+
+ -- DoubleX2#
+ case unpackDoubleX2# (broadcastDoubleX2# 6.5##) of
+ (# a, b #) -> print (D# a, D# b)
+ case unpackDoubleX2# (packDoubleX2# (# 8.9##,7.2## #)) of
+ (# a, b #) -> print (D# a, D# b)
diff --git a/testsuite/tests/codeGen/should_run/simd000.stdout b/testsuite/tests/codeGen/should_run/simd000.stdout
new file mode 100644
index 0000000000..e5f9d383ec
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/simd000.stdout
@@ -0,0 +1,4 @@
+(1.5,1.5,1.5,1.5)
+(4.5,7.8,2.3,6.5)
+(6.5,6.5)
+(8.9,7.2)
diff --git a/testsuite/tests/codeGen/should_run/simd001.hs b/testsuite/tests/codeGen/should_run/simd001.hs
new file mode 100644
index 0000000000..c45e3bf922
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/simd001.hs
@@ -0,0 +1,49 @@
+{-# OPTIONS_GHC -mavx #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+-- !!! test the lifting of unlifted vector types and
+-- defining various typeclass instances for the lifted types
+
+import GHC.Exts
+
+data FloatX4 = FX4# FloatX4#
+
+instance Show FloatX4 where
+ show (FX4# f) = case (unpackFloatX4# f) of
+ (# a, b, c, d #) -> show ((F# a), (F# b), (F# c), (F# d))
+
+
+instance Eq FloatX4 where
+ (FX4# a) == (FX4# b)
+ = case (unpackFloatX4# a) of
+ (# a1, a2, a3, a4 #) ->
+ case (unpackFloatX4# b) of
+ (# b1, b2, b3, b4 #) -> (F# a1) == (F# b1) &&
+ (F# a2) == (F# b2) &&
+ (F# a3) == (F# b3) &&
+ (F# a4) == (F# b4)
+
+data DoubleX2 = DX2# DoubleX2#
+
+instance Show DoubleX2 where
+ show (DX2# d) = case (unpackDoubleX2# d) of
+ (# a, b #) -> show ((D# a), (D# b))
+
+
+instance Eq DoubleX2 where
+ (DX2# a) == (DX2# b)
+ = case (unpackDoubleX2# a) of
+ (# a1, a2 #) ->
+ case (unpackDoubleX2# b) of
+ (# b1, b2 #) -> (D# a1) == (D# b1) &&
+ (D# a2) == (D# b2)
+
+main :: IO ()
+main = do
+ print (FX4# (broadcastFloatX4# 1.5#))
+ print $ (FX4# (broadcastFloatX4# 1.5#)) == (FX4# (broadcastFloatX4# 2.5#))
+ print $ (FX4# (broadcastFloatX4# 3.5#)) == (FX4# (broadcastFloatX4# 3.5#))
+
+ print (DX2# (broadcastDoubleX2# 2.5##))
+ print $ (DX2# (broadcastDoubleX2# 1.5##)) == (DX2# (broadcastDoubleX2# 2.5##))
+ print $ (DX2# (broadcastDoubleX2# 3.5##)) == (DX2# (broadcastDoubleX2# 3.5##))
diff --git a/testsuite/tests/codeGen/should_run/simd001.stdout b/testsuite/tests/codeGen/should_run/simd001.stdout
new file mode 100644
index 0000000000..899f900506
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/simd001.stdout
@@ -0,0 +1,6 @@
+(1.5,1.5,1.5,1.5)
+False
+True
+(2.5,2.5)
+False
+True
diff --git a/testsuite/tests/codeGen/should_run/simd002.hs b/testsuite/tests/codeGen/should_run/simd002.hs
new file mode 100644
index 0000000000..8c61546381
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/simd002.hs
@@ -0,0 +1,33 @@
+{-# OPTIONS_GHC -mavx #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+-- !!! test arithmetic vector operations
+
+import GHC.Exts
+
+data FloatX4 = FX4# FloatX4#
+
+instance Show FloatX4 where
+ show (FX4# f) = case (unpackFloatX4# f) of
+ (# a, b, c, d #) -> show ((F# a), (F# b), (F# c), (F# d))
+
+data DoubleX2 = DX2# DoubleX2#
+
+instance Show DoubleX2 where
+ show (DX2# d) = case (unpackDoubleX2# d) of
+ (# a, b #) -> show ((D# a), (D# b))
+
+
+main :: IO ()
+main = do
+ print (FX4# (plusFloatX4# (broadcastFloatX4# 1.3#) (broadcastFloatX4# 2.2#)))
+ print (FX4# (minusFloatX4# (broadcastFloatX4# 3.5#) (broadcastFloatX4# 2.2#)))
+ print (FX4# (timesFloatX4# (broadcastFloatX4# 2.4#) (broadcastFloatX4# 2.2#)))
+ print (FX4# (divideFloatX4# (broadcastFloatX4# 9.2#) (broadcastFloatX4# 4.0#)))
+ print (FX4# (negateFloatX4# (broadcastFloatX4# 3.5#)))
+
+ print (DX2# (plusDoubleX2# (broadcastDoubleX2# 1.3##) (broadcastDoubleX2# 2.2##)))
+ print (DX2# (minusDoubleX2# (broadcastDoubleX2# 3.5##) (broadcastDoubleX2# 2.2##)))
+ print (DX2# (timesDoubleX2# (broadcastDoubleX2# 2.4##) (broadcastDoubleX2# 2.2##)))
+ print (DX2# (divideDoubleX2# (broadcastDoubleX2# 9.2##) (broadcastDoubleX2# 4.0##)))
+ print (DX2# (negateDoubleX2# (broadcastDoubleX2# 3.5##)))
diff --git a/testsuite/tests/codeGen/should_run/simd002.stdout b/testsuite/tests/codeGen/should_run/simd002.stdout
new file mode 100644
index 0000000000..302d71a13f
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/simd002.stdout
@@ -0,0 +1,10 @@
+(3.5,3.5,3.5,3.5)
+(1.3,1.3,1.3,1.3)
+(5.28,5.28,5.28,5.28)
+(2.3,2.3,2.3,2.3)
+(-3.5,-3.5,-3.5,-3.5)
+(3.5,3.5)
+(1.2999999999999998,1.2999999999999998)
+(5.28,5.28)
+(2.3,2.3)
+(-3.5,-3.5)
diff --git a/testsuite/tests/codeGen/should_run/simd003.hs b/testsuite/tests/codeGen/should_run/simd003.hs
new file mode 100644
index 0000000000..de3ae5aeb4
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/simd003.hs
@@ -0,0 +1,25 @@
+{-# OPTIONS_GHC -msse4 #-}
+{-# OPTIONS_GHC -mavx #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+-- !!! test the packing of floats and doubles into a vector
+
+import GHC.Exts
+
+data FloatX4 = FX4# FloatX4#
+
+instance Show FloatX4 where
+ show (FX4# f) = case (unpackFloatX4# f) of
+ (# a, b, c, d #) -> show ((F# a), (F# b), (F# c), (F# d))
+
+data DoubleX2 = DX2# DoubleX2#
+
+instance Show DoubleX2 where
+ show (DX2# d) = case (unpackDoubleX2# d) of
+ (# a, b #) -> show ((D# a), (D# b))
+
+
+main :: IO ()
+main = do
+ print (FX4# (packFloatX4# (# 9.2#, 8.15#, 7.0#, 6.4# #)))
+ print (DX2# (packDoubleX2# (# 7.2##, 9.3## #)))
diff --git a/testsuite/tests/codeGen/should_run/simd003.stdout b/testsuite/tests/codeGen/should_run/simd003.stdout
new file mode 100644
index 0000000000..230e4658c4
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/simd003.stdout
@@ -0,0 +1,2 @@
+(9.2,8.15,7.0,6.4)
+(7.2,9.3)
diff --git a/testsuite/tests/codeGen/should_run/simd004.hs b/testsuite/tests/codeGen/should_run/simd004.hs
new file mode 100644
index 0000000000..5216822ec4
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/simd004.hs
@@ -0,0 +1,95 @@
+{-# OPTIONS_GHC -O2 #-}
+{-# OPTIONS_GHC -mavx #-}
+{-# OPTIONS_GHC -msse4 #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+-- !!! test that enabling -O2 does not produce wrong results for
+-- packing, broadcasting and unpacking vectors, and for arithmetic
+-- operations, using AVX instructions
+
+import GHC.Exts
+
+data FloatX4 = FX4# FloatX4#
+
+instance Show FloatX4 where
+ show (FX4# f) = case (unpackFloatX4# f) of
+ (# a, b, c, d #) -> show ((F# a), (F# b), (F# c), (F# d))
+
+
+instance Eq FloatX4 where
+ (FX4# a) == (FX4# b)
+ = case (unpackFloatX4# a) of
+ (# a1, a2, a3, a4 #) ->
+ case (unpackFloatX4# b) of
+ (# b1, b2, b3, b4 #) -> (F# a1) == (F# b1) &&
+ (F# a2) == (F# b2) &&
+ (F# a3) == (F# b3) &&
+ (F# a4) == (F# b4)
+
+data DoubleX2 = DX2# DoubleX2#
+
+instance Show DoubleX2 where
+ show (DX2# d) = case (unpackDoubleX2# d) of
+ (# a, b #) -> show ((D# a), (D# b))
+
+
+instance Eq DoubleX2 where
+ (DX2# a) == (DX2# b)
+ = case (unpackDoubleX2# a) of
+ (# a1, a2 #) ->
+ case (unpackDoubleX2# b) of
+ (# b1, b2 #) -> (D# a1) == (D# b1) &&
+ (D# a2) == (D# b2)
+
+
+main :: IO ()
+main = do
+
+ -- !!! test broadcasting, packing and unpacking for vector types
+ -- FloatX4#
+ case unpackFloatX4# (broadcastFloatX4# 1.5#) of
+ (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
+ case unpackFloatX4# (packFloatX4# (# 4.5#,7.8#, 2.3#, 6.5# #)) of
+ (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
+
+ -- DoubleX2#
+ case unpackDoubleX2# (broadcastDoubleX2# 6.5##) of
+ (# a, b #) -> print (D# a, D# b)
+ case unpackDoubleX2# (packDoubleX2# (# 8.9##,7.2## #)) of
+ (# a, b #) -> print (D# a, D# b)
+
+
+ -- !!! test the lifting of unlifted vector types and
+ -- defining various typeclass instances for the lifted types
+
+ print (FX4# (broadcastFloatX4# 1.5#))
+ print $ (FX4# (broadcastFloatX4# 1.5#)) == (FX4# (broadcastFloatX4# 2.5#))
+ print $ (FX4# (broadcastFloatX4# 3.5#)) == (FX4# (broadcastFloatX4# 3.5#))
+
+ print (DX2# (broadcastDoubleX2# 2.5##))
+ print $ (DX2#
+ (broadcastDoubleX2# 1.5##)) == (DX2# (broadcastDoubleX2# 2.5##))
+ print $ (DX2#
+ (broadcastDoubleX2# 3.5##)) == (DX2# (broadcastDoubleX2# 3.5##))
+
+
+ -- !!! test arithmetic vector operations
+ print (FX4# (plusFloatX4# (packFloatX4# (# 4.5#,7.8#, 2.3#, 6.5# #))
+ (packFloatX4# (# 8.2#,6.3#, 4.7#, 9.2# #))))
+ print (FX4# (minusFloatX4# (packFloatX4# (# 4.5#,7.8#, 2.3#, 6.5# #))
+ (packFloatX4# (# 8.2#,6.3#, 4.7#, 9.2# #))))
+ print (FX4# (timesFloatX4# (packFloatX4# (# 4.5#,7.8#, 2.3#, 6.5# #))
+ (packFloatX4# (# 8.2#,6.3#, 4.7#, 9.2# #))))
+ print (FX4# (divideFloatX4# (packFloatX4# (# 4.5#,7.8#, 2.3#, 6.5# #))
+ (packFloatX4# (# 8.2#,6.3#, 4.7#, 9.2# #))))
+ print (FX4# (negateFloatX4# (packFloatX4# (# 4.5#,7.8#, 2.3#, 6.5# #))))
+
+ print (DX2# (plusDoubleX2# (broadcastDoubleX2# 1.3##)
+ (broadcastDoubleX2# 2.2##)))
+ print (DX2# (minusDoubleX2# (broadcastDoubleX2# 3.5##)
+ (broadcastDoubleX2# 2.2##)))
+ print (DX2# (timesDoubleX2# (broadcastDoubleX2# 2.4##)
+ (broadcastDoubleX2# 2.2##)))
+ print (DX2# (divideDoubleX2# (broadcastDoubleX2# 9.2##)
+ (broadcastDoubleX2# 4.0##)))
+ print (DX2# (negateDoubleX2# (broadcastDoubleX2# 3.5##)))
diff --git a/testsuite/tests/codeGen/should_run/simd004.stdout b/testsuite/tests/codeGen/should_run/simd004.stdout
new file mode 100644
index 0000000000..ee90e738ca
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/simd004.stdout
@@ -0,0 +1,20 @@
+(1.5,1.5,1.5,1.5)
+(4.5,7.8,2.3,6.5)
+(6.5,6.5)
+(8.9,7.2)
+(1.5,1.5,1.5,1.5)
+False
+True
+(2.5,2.5)
+False
+True
+(12.7,14.1,7.0,15.7)
+(-3.6999998,1.5,-2.3999999,-2.6999998)
+(36.899998,49.140003,10.809999,59.8)
+(0.5487805,1.2380953,0.4893617,0.70652175)
+(-4.5,-7.8,-2.3,-6.5)
+(3.5,3.5)
+(1.2999999999999998,1.2999999999999998)
+(5.28,5.28)
+(2.3,2.3)
+(-3.5,-3.5) \ No newline at end of file
diff --git a/testsuite/tests/codeGen/should_run/simd005.hs b/testsuite/tests/codeGen/should_run/simd005.hs
new file mode 100644
index 0000000000..b074066d24
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/simd005.hs
@@ -0,0 +1,93 @@
+{-# OPTIONS_GHC -O2 #-}
+{-# OPTIONS_GHC -msse #-}
+{-# OPTIONS_GHC -msse2 #-}
+{-# OPTIONS_GHC -msse4 #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+-- tests for SSE-based vector operations
+
+import GHC.Exts
+
+data FloatX4 = FX4# FloatX4#
+
+instance Show FloatX4 where
+ show (FX4# f) = case (unpackFloatX4# f) of
+ (# a, b, c, d #) -> show ((F# a), (F# b), (F# c), (F# d))
+
+
+instance Eq FloatX4 where
+ (FX4# a) == (FX4# b)
+ = case (unpackFloatX4# a) of
+ (# a1, a2, a3, a4 #) ->
+ case (unpackFloatX4# b) of
+ (# b1, b2, b3, b4 #) -> (F# a1) == (F# b1) &&
+ (F# a2) == (F# b2) &&
+ (F# a3) == (F# b3) &&
+ (F# a4) == (F# b4)
+
+data DoubleX2 = DX2# DoubleX2#
+
+instance Show DoubleX2 where
+ show (DX2# d) = case (unpackDoubleX2# d) of
+ (# a, b #) -> show ((D# a), (D# b))
+
+
+instance Eq DoubleX2 where
+ (DX2# a) == (DX2# b)
+ = case (unpackDoubleX2# a) of
+ (# a1, a2 #) ->
+ case (unpackDoubleX2# b) of
+ (# b1, b2 #) -> (D# a1) == (D# b1) &&
+ (D# a2) == (D# b2)
+
+main :: IO ()
+main = do
+
+ -- !!! test broadcasting, packing and unpacking for vector types
+ -- FloatX4#
+ case unpackFloatX4# (broadcastFloatX4# 1.5#) of
+ (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
+ case unpackFloatX4# (packFloatX4# (# 4.5#,7.8#, 2.3#, 6.5# #)) of
+ (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
+
+ -- DoubleX2#
+ case unpackDoubleX2# (broadcastDoubleX2# 6.5##) of
+ (# a, b #) -> print (D# a, D# b)
+ case unpackDoubleX2# (packDoubleX2# (# 8.9##,7.2## #)) of
+ (# a, b #) -> print (D# a, D# b)
+
+
+ -- !!! test the lifting of unlifted vector types and
+ -- defining various typeclass instances for the lifted types
+
+ print (FX4# (broadcastFloatX4# 1.5#))
+ print $ (FX4# (broadcastFloatX4# 1.5#)) == (FX4# (broadcastFloatX4# 2.5#))
+ print $ (FX4# (broadcastFloatX4# 3.5#)) == (FX4# (broadcastFloatX4# 3.5#))
+
+ print (DX2# (broadcastDoubleX2# 2.5##))
+ print $ (DX2#
+ (broadcastDoubleX2# 1.5##)) == (DX2# (broadcastDoubleX2# 2.5##))
+ print $ (DX2#
+ (broadcastDoubleX2# 3.5##)) == (DX2# (broadcastDoubleX2# 3.5##))
+
+
+ -- !!! test arithmetic vector operations
+ print (FX4# (plusFloatX4# (packFloatX4# (# 4.5#,7.8#, 2.3#, 6.5# #))
+ (packFloatX4# (# 8.2#,6.3#, 4.7#, 9.2# #))))
+ print (FX4# (minusFloatX4# (packFloatX4# (# 4.5#,7.8#, 2.3#, 6.5# #))
+ (packFloatX4# (# 8.2#,6.3#, 4.7#, 9.2# #))))
+ print (FX4# (timesFloatX4# (packFloatX4# (# 4.5#,7.8#, 2.3#, 6.5# #))
+ (packFloatX4# (# 8.2#,6.3#, 4.7#, 9.2# #))))
+ print (FX4# (divideFloatX4# (packFloatX4# (# 4.5#,7.8#, 2.3#, 6.5# #))
+ (packFloatX4# (# 8.2#,6.3#, 4.7#, 9.2# #))))
+ print (FX4# (negateFloatX4# (packFloatX4# (# 4.5#,7.8#, 2.3#, 6.5# #))))
+
+ print (DX2# (plusDoubleX2# (broadcastDoubleX2# 1.3##)
+ (broadcastDoubleX2# 2.2##)))
+ print (DX2# (minusDoubleX2# (broadcastDoubleX2# 3.5##)
+ (broadcastDoubleX2# 2.2##)))
+ print (DX2# (timesDoubleX2# (broadcastDoubleX2# 2.4##)
+ (broadcastDoubleX2# 2.2##)))
+ print (DX2# (divideDoubleX2# (broadcastDoubleX2# 9.2##)
+ (broadcastDoubleX2# 4.0##)))
+ print (DX2# (negateDoubleX2# (broadcastDoubleX2# 3.5##)))
diff --git a/testsuite/tests/codeGen/should_run/simd005.stdout b/testsuite/tests/codeGen/should_run/simd005.stdout
new file mode 100644
index 0000000000..84386823f8
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/simd005.stdout
@@ -0,0 +1,20 @@
+(1.5,1.5,1.5,1.5)
+(4.5,7.8,2.3,6.5)
+(6.5,6.5)
+(8.9,7.2)
+(1.5,1.5,1.5,1.5)
+False
+True
+(2.5,2.5)
+False
+True
+(12.7,14.1,7.0,15.7)
+(-3.6999998,1.5,-2.3999999,-2.6999998)
+(36.899998,49.140003,10.809999,59.8)
+(0.5487805,1.2380953,0.4893617,0.70652175)
+(-4.5,-7.8,-2.3,-6.5)
+(3.5,3.5)
+(1.2999999999999998,1.2999999999999998)
+(5.28,5.28)
+(2.3,2.3)
+(-3.5,-3.5)