Diffstat (limited to 'compiler/nativeGen')
-rw-r--r--  compiler/nativeGen/Reg.hs          | 28
-rw-r--r--  compiler/nativeGen/X86/CodeGen.hs  | 31
-rw-r--r--  compiler/nativeGen/X86/Instr.hs    | 20
-rw-r--r--  compiler/nativeGen/X86/Ppr.hs      |  9
4 files changed, 71 insertions, 17 deletions
diff --git a/compiler/nativeGen/Reg.hs b/compiler/nativeGen/Reg.hs
index 7f69ea01a4..e0c0176a75 100644
--- a/compiler/nativeGen/Reg.hs
+++ b/compiler/nativeGen/Reg.hs
@@ -52,11 +52,11 @@ type RegNo
-- Virtual regs can be of either class, so that info is attached.
--
data VirtualReg
- = VirtualRegI {-# UNPACK #-} !Unique
- | VirtualRegHi {-# UNPACK #-} !Unique -- High part of 2-word register
- | VirtualRegF {-# UNPACK #-} !Unique
- | VirtualRegD {-# UNPACK #-} !Unique
-
+ = VirtualRegI {-# UNPACK #-} !Unique
+ | VirtualRegHi {-# UNPACK #-} !Unique -- High part of 2-word register
+ | VirtualRegF {-# UNPACK #-} !Unique
+ | VirtualRegD {-# UNPACK #-} !Unique
+ | VirtualRegSSE {-# UNPACK #-} !Unique
deriving (Eq, Show)
-- This is laborious, but necessary. We can't derive Ord because
@@ -69,14 +69,15 @@ instance Ord VirtualReg where
compare (VirtualRegHi a) (VirtualRegHi b) = nonDetCmpUnique a b
compare (VirtualRegF a) (VirtualRegF b) = nonDetCmpUnique a b
compare (VirtualRegD a) (VirtualRegD b) = nonDetCmpUnique a b
-
+ compare (VirtualRegSSE a) (VirtualRegSSE b) = nonDetCmpUnique a b
compare VirtualRegI{} _ = LT
compare _ VirtualRegI{} = GT
compare VirtualRegHi{} _ = LT
compare _ VirtualRegHi{} = GT
compare VirtualRegF{} _ = LT
compare _ VirtualRegF{} = GT
-
+ compare VirtualRegD{} _ = LT
+ compare _ VirtualRegD{} = GT
instance Uniquable VirtualReg where
@@ -86,18 +87,16 @@ instance Uniquable VirtualReg where
VirtualRegHi u -> u
VirtualRegF u -> u
VirtualRegD u -> u
+ VirtualRegSSE u -> u
instance Outputable VirtualReg where
ppr reg
= case reg of
VirtualRegI u -> text "%vI_" <> pprUniqueAlways u
VirtualRegHi u -> text "%vHi_" <> pprUniqueAlways u
- -- this code is kinda wrong on x86
- -- because float and double occupy the same register set
- -- namely SSE2 register xmm0 .. xmm15
- VirtualRegF u -> text "%vFloat_" <> pprUniqueAlways u
- VirtualRegD u -> text "%vDouble_" <> pprUniqueAlways u
-
+ VirtualRegF u -> text "%vF_" <> pprUniqueAlways u
+ VirtualRegD u -> text "%vD_" <> pprUniqueAlways u
+ VirtualRegSSE u -> text "%vSSE_" <> pprUniqueAlways u
renameVirtualReg :: Unique -> VirtualReg -> VirtualReg
@@ -107,6 +106,7 @@ renameVirtualReg u r
VirtualRegHi _ -> VirtualRegHi u
VirtualRegF _ -> VirtualRegF u
VirtualRegD _ -> VirtualRegD u
+ VirtualRegSSE _ -> VirtualRegSSE u
classOfVirtualReg :: VirtualReg -> RegClass
@@ -116,7 +116,7 @@ classOfVirtualReg vr
VirtualRegHi{} -> RcInteger
VirtualRegF{} -> RcFloat
VirtualRegD{} -> RcDouble
-
+ VirtualRegSSE{} -> RcDoubleSSE
-- Determine the upper-half vreg for a 64-bit quantity on a 32-bit platform
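
The Reg.hs changes above add a fifth constructor and keep the hand-written Ord instance total. A minimal sketch (not part of the patch) of how the new constructor behaves with renameVirtualReg and classOfVirtualReg, assuming a Unique value u is in scope:

    -- Sketch only: a fresh SSE virtual register and the register class
    -- the allocator will see for it.
    sseVRegDemo :: Unique -> (VirtualReg, RegClass)
    sseVRegDemo u =
      let v = VirtualRegSSE u
      in (renameVirtualReg u v, classOfVirtualReg v)
      -- == (VirtualRegSSE u, RcDoubleSSE)
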
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 73cfb28d46..fe6d4d0499 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -284,11 +284,13 @@ data ChildCode64
data Register
= Fixed Format Reg InstrBlock
| Any Format (Reg -> InstrBlock)
+ | AnyV (Reg -> InstrBlock)
swizzleRegisterRep :: Register -> Format -> Register
swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code
swizzleRegisterRep (Any _ codefn) format = Any format codefn
+swizzleRegisterRep r@(AnyV{}) _ = r
-- | Grab the Reg for a CmmReg
@@ -368,6 +370,9 @@ getSomeReg expr = do
Any rep code -> do
tmp <- getNewRegNat rep
return (tmp, code tmp)
+ AnyV code -> do
+ tmp <- getVectorReg
+ return (tmp, code tmp)
Fixed _ reg code ->
return (reg, code)
@@ -484,6 +489,16 @@ iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
r_dst_lo
)
+iselExpr64 (CmmMachOp (MO_V_Add len width) [expr1, expr2]) = do
+ r_dst <- getVectorReg
+ ChildCode64 code1 r1 <- iselExpr64 expr1
+ ChildCode64 code2 r2 <- iselExpr64 expr2
+ let fmt = VecFormat len width FmtInt
+ return $ ChildCode64 (code1 `appOL` code2 `appOL`
+ toOL [ V_MOV fmt (OpReg r1) (OpReg r_dst)
+ , V_ADD fmt (OpReg r2) (OpReg r_dst)
+ ]) r_dst
+
iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
fn <- getAnyReg expr
r_dst_lo <- getNewRegNat II32
@@ -501,6 +516,8 @@ iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
iselExpr64 expr
= pprPanic "iselExpr64(i386)" (ppr expr)
+getVectorReg :: NatM Reg
+getVectorReg = RegVirtual . VirtualRegSSE <$> getUniqueM
--------------------------------------------------------------------------------
getRegister :: CmmExpr -> NatM Register
@@ -822,7 +839,7 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
MO_V_Insert {} -> needLlvm
MO_V_Extract {} -> needLlvm
- MO_V_Add {} -> needLlvm
+ MO_V_Add {} -> triv_op W512 V_ADD
MO_V_Sub {} -> needLlvm
MO_V_Mul {} -> needLlvm
MO_VS_Quot {} -> needLlvm
@@ -985,6 +1002,12 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
getRegister' _ _ (CmmLoad mem pk)
+ | isVecType pk
+ = do
+ code <- vecLoadCode (V_MOV format) mem
+ return (AnyV code)
+
+getRegister' _ _ (CmmLoad mem pk)
| isFloatType pk
= do
Amode addr mem_code <- getAmode mem
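
The new vector-load case above relies on a vecLoadCode helper and a format value that are not shown in this hunk. A minimal sketch of what vecLoadCode might look like, assuming it mirrors the scalar float-load case directly below it and that format is derived from pk elsewhere:

    -- Hypothetical helper, not part of this diff: build the (Reg -> InstrBlock)
    -- continuation for a vector load from memory.
    vecLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr -> NatM (Reg -> InstrBlock)
    vecLoadCode mkLoad mem = do
      Amode addr mem_code <- getAmode mem
      return (\dst -> mem_code `snocOL` mkLoad (OpAddr addr) (OpReg dst))
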
@@ -1096,6 +1119,9 @@ getByteReg expr = do
-- ToDo: could optimise slightly by checking for
-- byte-addressable real registers, but that will
-- happen very rarely if at all.
+ AnyV code -> do
+ tmp <- getVectorReg
+ return (tmp, code tmp)
else getSomeReg expr -- all regs are byte-addressable on x86_64
-- Another variant: this time we want the result in a register that cannot
@@ -1116,6 +1142,9 @@ getNonClobberedReg expr = do
return (tmp, code `snocOL` reg2reg rep reg tmp)
| otherwise ->
return (reg, code)
+ AnyV code -> do
+ tmp <- getVectorReg
+ return (tmp, code tmp)
reg2reg :: Format -> Reg -> Reg -> Instr
reg2reg format src dst = MOV format (OpReg src) (OpReg dst)
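
Putting the CodeGen.hs pieces together, a usage sketch (not from the patch) of how a caller materialises an AnyV result; this is essentially what the extended getSomeReg above does, spelled out for a single vector expression, with loadVec as a hypothetical name:

    -- Sketch: evaluate a vector expression into a fresh SSE virtual register.
    loadVec :: CmmExpr -> NatM (Reg, InstrBlock)
    loadVec e = do
      r <- getRegister e            -- AnyV for the new vector CmmLoad case
      case r of
        AnyV code -> do
          dst <- getVectorReg       -- fresh VirtualRegSSE
          return (dst, code dst)
        Any fmt code -> do
          dst <- getNewRegNat fmt
          return (dst, code dst)
        Fixed _ reg code ->
          return (reg, code)
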
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index 6e5d656beb..e59594365b 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -8,8 +8,9 @@
--
-----------------------------------------------------------------------------
-module X86.Instr (Instr(..), Operand(..), PrefetchVariant(..), JumpDest(..),
- getJumpDestBlockId, canShortcut, shortcutStatics,
+module X86.Instr (Instr(..), Operand(..), PrefetchVariant(..),
+ VecFormat(..), ScalarFormat(..),
+ JumpDest(..), getJumpDestBlockId, canShortcut, shortcutStatics,
shortcutJump, allocMoreStack,
maxSpillSlots, archWordFormat )
where
@@ -71,6 +72,19 @@ instance Instruction Instr where
mkStackAllocInstr = x86_mkStackAllocInstr
mkStackDeallocInstr = x86_mkStackDeallocInstr
+data VecFormat = VecFormat { vecLength :: !Length
+ , vecWidth :: !Width
+ , vecScalar :: !ScalarFormat
+ }
+
+instance Outputable VecFormat where
+ ppr (VecFormat l w f) = ppr (l,w,f)
+
+data ScalarFormat = FmtFloat | FmtInt
+
+instance Outputable ScalarFormat where
+ ppr FmtFloat = text "float"
+ ppr FmtInt = text "int"
-- -----------------------------------------------------------------------------
-- Intel x86 instructions
@@ -193,6 +207,7 @@ data Instr
-- Moves.
| MOV Format Operand Operand
+ | V_MOV VecFormat Operand Operand
| CMOV Cond Format Operand Reg
| MOVZxL Format Operand Operand -- format is the size of operand 1
| MOVSxL Format Operand Operand -- format is the size of operand 1
@@ -205,6 +220,7 @@ data Instr
-- Int Arithmetic.
| ADD Format Operand Operand
+ | V_ADD VecFormat Operand Operand
| ADC Format Operand Operand
| SUB Format Operand Operand
| SBB Format Operand Operand
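
For orientation, a sketch of how a VecFormat value is built (assuming Length is the Int synonym used for Cmm vector lengths); the pretty-printer cases added in X86/Ppr.hs below dispatch on exactly this shape:

    -- Sketch only: the contents of one 128-bit XMM register viewed as
    -- four 32-bit floats.
    xmmFloat4 :: VecFormat
    xmmFloat4 = VecFormat { vecLength = 4, vecWidth = W32, vecScalar = FmtFloat }
    -- V_ADD xmmFloat4 src dst is printed as "addps" by the Ppr.hs change.
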
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index 095d9eba7c..7a799cfabd 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -833,6 +833,15 @@ pprInstr (XADD format src dst) = pprFormatOpOp (sLit "xadd") format src dst
pprInstr (CMPXCHG format src dst)
= pprFormatOpOp (sLit "cmpxchg") format src dst
+-- SIMD
+pprInstr (V_ADD format src dst)
+ | VecFormat _ W32 FmtFloat <- format = pprVecFormatOpOp (sLit "addps") format src dst
+ | VecFormat _ W64 FmtFloat <- format = pprVecFormatOpOp (sLit "addpd") format src dst
+ | VecFormat _ W64 FmtInt <- format = pprVecFormatOpOp (sLit "paddq") format src dst
+pprInstr (V_MOV format src dst)
+ | VecFormat _ W32 FmtFloat <- format = pprVecFormatOpOp (sLit "movaps") format src dst
+ | VecFormat _ W64 FmtFloat <- format = pprVecFormatOpOp (sLit "movapd") format src dst
+ | VecFormat _ _ FmtInt <- format = pprVecFormatOpOp (sLit "movdqa") format src dst
--------------------------
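
Both new pprInstr cases call pprVecFormatOpOp, which is not part of this diff. A minimal sketch, assuming it follows the existing pprFormatOpOp helper and that the operands can be rendered with the ordinary operand printer:

    -- Hypothetical helper, not shown in the diff: print "\tmnemonic src,dst".
    pprVecFormatOpOp :: PtrString -> VecFormat -> Operand -> Operand -> SDoc
    pprVecFormatOpOp name _format src dst
      = hcat [ char '\t', ptext name, space
             , pprOperand II64 src    -- assumption: reuse the scalar operand
             , comma                  -- printer; a real implementation may
             , pprOperand II64 dst    -- need an xmm-aware variant
             ]
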