Diffstat (limited to 'compiler/nativeGen/X86/CodeGen.hs')
-rw-r--r--  compiler/nativeGen/X86/CodeGen.hs  526
1 file changed, 468 insertions(+), 58 deletions(-)
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 13662f6807..ed3684e074 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -111,12 +111,25 @@ sse2Enabled = do
ArchX86 -> return True
_ -> panic "trying to generate x86/x86_64 on the wrong platform"
+sse4_1Enabled :: NatM Bool
+sse4_1Enabled = do
+ dflags <- getDynFlags
+ return (isSse4_1Enabled dflags)
sse4_2Enabled :: NatM Bool
sse4_2Enabled = do
dflags <- getDynFlags
return (isSse4_2Enabled dflags)
+sseEnabled :: NatM Bool
+sseEnabled = do
+ dflags <- getDynFlags
+ return (isSseEnabled dflags)
+
+avxEnabled :: NatM Bool
+avxEnabled = do
+ dflags <- getDynFlags
+ return (isAvxEnabled dflags)
cmmTopCodeGen
:: RawCmmDecl
@@ -215,6 +228,7 @@ stmtToInstrs bid stmt = do
CmmAssign reg src
| isFloatType ty -> assignReg_FltCode format reg src
| is32Bit && isWord64 ty -> assignReg_I64Code reg src
+ | isVecType ty -> assignReg_VecCode format reg src
| otherwise -> assignReg_IntCode format reg src
where ty = cmmRegType dflags reg
format = cmmTypeFormat ty
@@ -222,6 +236,7 @@ stmtToInstrs bid stmt = do
CmmStore addr src
| isFloatType ty -> assignMem_FltCode format addr src
| is32Bit && isWord64 ty -> assignMem_I64Code addr src
+ | isVecType ty -> assignMem_VecCode format addr src
| otherwise -> assignMem_IntCode format addr src
where ty = cmmExprType dflags src
format = cmmTypeFormat ty
@@ -308,6 +323,15 @@ getRegisterReg platform (CmmGlobal mid)
-- platform. Hence ...
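+-- Like getRegisterReg, but for vector values: a local vector register
+-- becomes a virtual register of the given vector format.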
+getVecRegisterReg :: Platform -> Bool -> Format -> CmmReg -> Reg
+getVecRegisterReg _ use_avx format (CmmLocal (LocalReg u pk))
+ | isVecType pk && use_avx = RegVirtual (mkVirtualReg u format)
+  | otherwise = pprPanic
+                (unlines ["avx flag is not enabled,",
+                          "or this is not a vector register:"])
+                (ppr pk)
+getVecRegisterReg platform _use_avx _format c = getRegisterReg platform c
+
-- | Memory addressing modes passed up the tree.
data Amode
= Amode AddrMode InstrBlock
@@ -503,6 +527,13 @@ iselExpr64 expr
--------------------------------------------------------------------------------
+
+-- A helper data type to reduce code duplication in the code generation
+-- of arithmetic operations. It is not specific to any particular
+-- element type (Int8, Int32, etc.).
+data VectorArithInstns = VA_Add | VA_Sub | VA_Mul | VA_Div
+
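+-- As an illustration (not prescribed by this patch), a dyadic Cmm
+-- expression such as
+--
+--   CmmMachOp (MO_VF_Add 4 W32) [x, y]
+--
+-- is dispatched below as vector_float_op_avx VA_Add 4 W32 x y, which
+-- then picks the concrete instruction (here VADD).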
+
getRegister :: CmmExpr -> NatM Register
getRegister e = do dflags <- getDynFlags
is32Bit <- is32BitPlatform
@@ -520,16 +551,24 @@ getRegister' dflags is32Bit (CmmReg reg)
do reg' <- getPicBaseNat (archWordFormat is32Bit)
return (Fixed (archWordFormat is32Bit) reg' nilOL)
_ ->
- do
- let
- fmt = cmmTypeFormat (cmmRegType dflags reg)
- format = fmt
- --
- let platform = targetPlatform dflags
- return (Fixed format
- (getRegisterReg platform reg)
- nilOL)
-
+ do use_sse2 <- sse2Enabled
+ use_avx <- avxEnabled
+ let cmmregtype = cmmRegType dflags reg
+ if isVecType cmmregtype
+ then return (vectorRegister cmmregtype use_avx use_sse2)
+ else return (standardRegister cmmregtype)
+ where
+ vectorRegister :: CmmType -> Bool -> Bool -> Register
+ vectorRegister reg_ty use_avx use_sse2
+ | use_avx || use_sse2 =
+ let vecfmt = cmmTypeFormat reg_ty
+ platform = targetPlatform dflags
+ in (Fixed vecfmt (getVecRegisterReg platform True vecfmt reg) nilOL)
+        | otherwise = sorry "Please enable the -mavx or -msse2 flag"
+
+ standardRegister crt =
+ let platform = targetPlatform dflags
+ in (Fixed (cmmTypeFormat crt) (getRegisterReg platform reg) nilOL)
getRegister' dflags is32Bit (CmmRegOff r n)
= getRegister' dflags is32Bit $ mangleIndexTree dflags r n
@@ -631,7 +670,69 @@ getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
return $ Any II64 (\dst -> unitOL $
LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
+getRegister' _ _ (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
+ sse4_1 <- sse4_1Enabled
+ sse2 <- sse2Enabled
+ sse <- sseEnabled
+ case mop of
+    MO_VF_Insert l W32 | sse4_1 && sse -> vector_float_pack l W32 x y z
+                       | otherwise
+                         -> sorry "Please enable the -msse4 and -msse flags"
+    MO_VF_Insert l W64 | sse2 && sse -> vector_float_pack l W64 x y z
+                       | otherwise
+                         -> sorry "Please enable the -msse2 and -msse flags"
+ _other -> incorrectOperands
+ where
+ vector_float_pack :: Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_pack len W32 expr1 expr2 (CmmLit offset)
+ = do
+ fn <- getAnyReg expr1
+ (r, exp) <- getSomeReg expr2
+      let f = VecFormat len FmtFloat W32
+          imm = litToImm offset
+          -- insert the scalar in r into the lane of dst selected by imm
+          code dst = exp `appOL`
+                     (fn dst) `snocOL`
+                     (INSERTPS f (OpImm imm) (OpReg r) dst)
+ in return $ Any f code
+    vector_float_pack len W64 expr1 expr2 (CmmLit offset)
+      = do
+      Amode addr addr_code <- getAmode expr2
+      (r, exp) <- getSomeReg expr1
+      let f = VecFormat len FmtDouble W64
+          -- offset 0 inserts into the low 64 bits (MOVL),
+          -- offset 16 into the high 64 bits (MOVH)
+          code dst
+            = case offset of
+                CmmInt 0 _ -> exp `appOL` addr_code `snocOL`
+                              (MOVL f (OpAddr addr) (OpReg r)) `snocOL`
+                              (MOVU f (OpReg r) (OpReg dst))
+                CmmInt 16 _ -> exp `appOL` addr_code `snocOL`
+                               (MOVH f (OpAddr addr) (OpReg r)) `snocOL`
+                               (MOVU f (OpReg r) (OpReg dst))
+                _ -> panic "Error in offset while packing"
+        in return $ Any f code
+    vector_float_pack _ _ _ c _
+      = pprPanic "Pack not supported for: " (ppr c)
+
getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
+ sse2 <- sse2Enabled
+ sse <- sseEnabled
+ avx <- avxEnabled
case mop of
MO_F_Neg w -> sse2NegCode w x
@@ -708,23 +809,28 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
MO_FS_Conv from to -> coerceFP2Int from to x
MO_SF_Conv from to -> coerceInt2FP from to x
- MO_V_Insert {} -> needLlvm
- MO_V_Extract {} -> needLlvm
- MO_V_Add {} -> needLlvm
- MO_V_Sub {} -> needLlvm
- MO_V_Mul {} -> needLlvm
- MO_VS_Quot {} -> needLlvm
- MO_VS_Rem {} -> needLlvm
- MO_VS_Neg {} -> needLlvm
- MO_VU_Quot {} -> needLlvm
- MO_VU_Rem {} -> needLlvm
- MO_VF_Insert {} -> needLlvm
- MO_VF_Extract {} -> needLlvm
- MO_VF_Add {} -> needLlvm
- MO_VF_Sub {} -> needLlvm
- MO_VF_Mul {} -> needLlvm
- MO_VF_Quot {} -> needLlvm
- MO_VF_Neg {} -> needLlvm
+ MO_V_Insert {} -> needLlvm
+ MO_V_Extract {} -> needLlvm
+ MO_V_Add {} -> needLlvm
+ MO_V_Sub {} -> needLlvm
+ MO_V_Mul {} -> needLlvm
+ MO_VS_Quot {} -> needLlvm
+ MO_VS_Rem {} -> needLlvm
+ MO_VS_Neg {} -> needLlvm
+ MO_VU_Quot {} -> needLlvm
+ MO_VU_Rem {} -> needLlvm
+ MO_VF_Broadcast {} -> incorrectOperands
+ MO_VF_Insert {} -> incorrectOperands
+ MO_VF_Extract {} -> incorrectOperands
+ MO_VF_Add {} -> incorrectOperands
+ MO_VF_Sub {} -> incorrectOperands
+ MO_VF_Mul {} -> incorrectOperands
+ MO_VF_Quot {} -> incorrectOperands
+
+      MO_VF_Neg l w | avx -> vector_float_negate_avx l w x
+                    | sse && sse2 -> vector_float_negate_sse l w x
+                    | otherwise
+                      -> sorry "Please enable the -mavx flag, or both -msse and -msse2"
_other -> pprPanic "getRegister" (pprMachOp mop)
where
@@ -762,8 +868,45 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
= do e_code <- getRegister' dflags is32Bit expr
return (swizzleRegisterRep e_code new_format)
+    vector_float_negate_avx :: Length -> Width -> CmmExpr -> NatM Register
+    vector_float_negate_avx l w expr = do
+      let format = case w of
+                     W32 -> VecFormat l FmtFloat w
+                     W64 -> VecFormat l FmtDouble w
+                     _ -> pprPanic "Cannot negate vector of width" (ppr w)
+      tmp <- getNewRegNat format
+      (reg, exp) <- getSomeReg expr
+      -- negate x as 0.0 - x, with the zero taken from a constant in memory
+      Amode addr addr_code <- memConstant (mkAlignment $ widthInBytes w) (CmmFloat 0.0 w)
+      let code dst = case w of
+                       W32 -> exp `appOL` addr_code `snocOL`
+                              (VBROADCAST format addr tmp) `snocOL`
+                              (VSUB format (OpReg reg) tmp dst)
+                       W64 -> exp `appOL` addr_code `snocOL`
+                              (MOVL format (OpAddr addr) (OpReg tmp)) `snocOL`
+                              (MOVH format (OpAddr addr) (OpReg tmp)) `snocOL`
+                              (VSUB format (OpReg reg) tmp dst)
+                       _ -> pprPanic "Cannot negate vector of width" (ppr w)
+      return (Any format code)
+
+    vector_float_negate_sse :: Length -> Width -> CmmExpr -> NatM Register
+    vector_float_negate_sse l w expr = do
+      let format = case w of
+                     W32 -> VecFormat l FmtFloat w
+                     W64 -> VecFormat l FmtDouble w
+                     _ -> pprPanic "Cannot negate vector of width" (ppr w)
+      tmp <- getNewRegNat format
+      (reg, exp) <- getSomeReg expr
+      -- zero tmp, copy the zero into dst, then compute dst = 0.0 - x
+      let code dst = exp `snocOL`
+                     (XOR format (OpReg tmp) (OpReg tmp)) `snocOL`
+                     (MOVU format (OpReg tmp) (OpReg dst)) `snocOL`
+                     (SUB format (OpReg reg) (OpReg dst))
+      return (Any format code)
getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
+ sse4_1 <- sse4_1Enabled
+ sse2 <- sse2Enabled
+ sse <- sseEnabled
+ avx <- avxEnabled
case mop of
MO_F_Eq _ -> condFltReg is32Bit EQQ x y
MO_F_Ne _ -> condFltReg is32Bit NE x y
@@ -828,13 +971,49 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
MO_VS_Quot {} -> needLlvm
MO_VS_Rem {} -> needLlvm
MO_VS_Neg {} -> needLlvm
- MO_VF_Insert {} -> needLlvm
- MO_VF_Extract {} -> needLlvm
- MO_VF_Add {} -> needLlvm
- MO_VF_Sub {} -> needLlvm
- MO_VF_Mul {} -> needLlvm
- MO_VF_Quot {} -> needLlvm
- MO_VF_Neg {} -> needLlvm
+
+ MO_VF_Broadcast l W32 | avx -> vector_float_broadcast_avx l W32 x y
+ | sse4_1 -> vector_float_broadcast_sse l W32 x y
+ | otherwise
+ -> sorry "Please enable the -mavx or -msse4 flag"
+
+      -- NB: the W64 path of vector_float_broadcast_avx only emits
+      -- SSE2 moves (MOVU/MOVL/MOVH), so sse2 suffices here
+      MO_VF_Broadcast l W64 | sse2 -> vector_float_broadcast_avx l W64 x y
+                            | otherwise -> sorry "Please enable the -msse2 flag"
+
+ MO_VF_Extract l W32 | avx -> vector_float_unpack l W32 x y
+ | sse -> vector_float_unpack_sse l W32 x y
+ | otherwise
+ -> sorry "Please enable the -mavx or -msse flag"
+
+ MO_VF_Extract l W64 | sse2 -> vector_float_unpack l W64 x y
+ | otherwise -> sorry "Please enable the -msse2 flag"
+
+      MO_VF_Add l w | avx -> vector_float_op_avx VA_Add l w x y
+                    | sse && w == W32 -> vector_float_op_sse VA_Add l w x y
+                    | sse2 && w == W64 -> vector_float_op_sse VA_Add l w x y
+                    | otherwise
+                      -> sorry "Please enable the -mavx flag, or -msse (W32) / -msse2 (W64)"
+
+      MO_VF_Sub l w | avx -> vector_float_op_avx VA_Sub l w x y
+                    | sse && w == W32 -> vector_float_op_sse VA_Sub l w x y
+                    | sse2 && w == W64 -> vector_float_op_sse VA_Sub l w x y
+                    | otherwise
+                      -> sorry "Please enable the -mavx flag, or -msse (W32) / -msse2 (W64)"
+
+      MO_VF_Mul l w | avx -> vector_float_op_avx VA_Mul l w x y
+                    | sse && w == W32 -> vector_float_op_sse VA_Mul l w x y
+                    | sse2 && w == W64 -> vector_float_op_sse VA_Mul l w x y
+                    | otherwise
+                      -> sorry "Please enable the -mavx flag, or -msse (W32) / -msse2 (W64)"
+
+      MO_VF_Quot l w | avx -> vector_float_op_avx VA_Div l w x y
+                     | sse && w == W32 -> vector_float_op_sse VA_Div l w x y
+                     | sse2 && w == W64 -> vector_float_op_sse VA_Div l w x y
+                     | otherwise
+                       -> sorry "Please enable the -mavx flag, or -msse (W32) / -msse2 (W64)"
+
+ MO_VF_Insert {} -> incorrectOperands
+ MO_VF_Neg {} -> incorrectOperands
_other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
where
@@ -930,7 +1109,171 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
-- TODO: There are other interesting patterns we want to replace
-- with a LEA, e.g. `(x + offset) + (y << shift)`.
+ -----------------------
+    -- Vector operations --
+ vector_float_op_avx :: VectorArithInstns
+ -> Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_op_avx op l w expr1 expr2 = do
+ (reg1, exp1) <- getSomeReg expr1
+ (reg2, exp2) <- getSomeReg expr2
+ let format = case w of
+ W32 -> VecFormat l FmtFloat W32
+ W64 -> VecFormat l FmtDouble W64
+ _ -> pprPanic "Operation not supported for width " (ppr w)
+ code dst = case op of
+ VA_Add -> arithInstr VADD
+ VA_Sub -> arithInstr VSUB
+ VA_Mul -> arithInstr VMUL
+ VA_Div -> arithInstr VDIV
+ where
+ -- opcode src2 src1 dst <==> dst = src1 `opcode` src2
+ arithInstr instr = exp1 `appOL` exp2 `snocOL`
+ (instr format (OpReg reg2) reg1 dst)
+ return (Any format code)
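+    -- For example (illustrative, AT&T syntax): with reg1 = %xmm1,
+    -- reg2 = %xmm2 and dst = %xmm3, VADD here assembles to roughly
+    --   vaddps %xmm2, %xmm1, %xmm3
+    -- i.e. dst = src1 + src2 in non-destructive three-operand form.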
+
+ vector_float_op_sse :: VectorArithInstns
+ -> Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_op_sse op l w expr1 expr2 = do
+ (reg1, exp1) <- getSomeReg expr1
+ (reg2, exp2) <- getSomeReg expr2
+ let format = case w of
+ W32 -> VecFormat l FmtFloat W32
+ W64 -> VecFormat l FmtDouble W64
+ _ -> pprPanic "Operation not supported for width " (ppr w)
+ code dst = case op of
+ VA_Add -> arithInstr ADD
+ VA_Sub -> arithInstr SUB
+ VA_Mul -> arithInstr MUL
+ VA_Div -> arithInstr FDIV
+          where
+            -- SSE arithmetic is two-operand and destructive:
+            -- opcode src2 src1 <==> src1 = src1 `opcode` src2,
+            -- so src1 is first copied into dst to avoid clobbering reg1
+            arithInstr instr
+              = exp1 `appOL` exp2 `snocOL`
+                (MOVU format (OpReg reg1) (OpReg dst)) `snocOL`
+                (instr format (OpReg reg2) (OpReg dst))
+ return (Any format code)
--------------------
+ vector_float_unpack :: Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_unpack l W32 expr (CmmLit lit)
+ = do
+ (r, exp) <- getSomeReg expr
+ let format = VecFormat l FmtFloat W32
+ imm = litToImm lit
+ code dst
+ = case lit of
+ CmmInt 0 _ -> exp `snocOL` (VMOVU format (OpReg r) (OpReg dst))
+ CmmInt _ _ -> exp `snocOL` (VPSHUFD format (OpImm imm) (OpReg r) dst)
+ _ -> panic "Error in offset while unpacking"
+ return (Any format code)
+ vector_float_unpack l W64 expr (CmmLit lit)
+ = do
+ dflags <- getDynFlags
+ (r, exp) <- getSomeReg expr
+ let format = VecFormat l FmtDouble W64
+ addr = spRel dflags 0
+ code dst
+ = case lit of
+ CmmInt 0 _ -> exp `snocOL`
+ (MOVL format (OpReg r) (OpAddr addr)) `snocOL`
+ (MOV FF64 (OpAddr addr) (OpReg dst))
+ CmmInt 1 _ -> exp `snocOL`
+ (MOVH format (OpReg r) (OpAddr addr)) `snocOL`
+ (MOV FF64 (OpAddr addr) (OpReg dst))
+ _ -> panic "Error in offset while unpacking"
+ return (Any format code)
+    vector_float_unpack _ w c e
+      = pprPanic "Unpack not supported for: " (ppr c $$ ppr e $$ ppr w)
+ -----------------------
+
+ vector_float_unpack_sse :: Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_unpack_sse l W32 expr (CmmLit lit)
+ = do
+ (r,exp) <- getSomeReg expr
+ let format = VecFormat l FmtFloat W32
+ imm = litToImm lit
+ code dst
+ = case lit of
+ CmmInt 0 _ -> exp `snocOL` (MOVU format (OpReg r) (OpReg dst))
+ CmmInt _ _ -> exp `snocOL` (PSHUFD format (OpImm imm) (OpReg r) dst)
+ _ -> panic "Error in offset while unpacking"
+ return (Any format code)
+    vector_float_unpack_sse _ w c e
+      = pprPanic "Unpack not supported for: " (ppr c $$ ppr e $$ ppr w)
+ -----------------------
+ vector_float_broadcast_avx :: Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_broadcast_avx len W32 expr1 expr2
+ = do
+ dflags <- getDynFlags
+ fn <- getAnyReg expr1
+ (r', exp) <- getSomeReg expr2
+ let f = VecFormat len FmtFloat W32
+ addr = spRel dflags 0
+ in return $ Any f (\r -> exp `appOL`
+ (fn r) `snocOL`
+ (MOVU f (OpReg r') (OpAddr addr)) `snocOL`
+ (VBROADCAST f addr r))
+ vector_float_broadcast_avx len W64 expr1 expr2
+ = do
+ dflags <- getDynFlags
+ fn <- getAnyReg expr1
+ (r', exp) <- getSomeReg expr2
+ let f = VecFormat len FmtDouble W64
+ addr = spRel dflags 0
+ in return $ Any f (\r -> exp `appOL`
+ (fn r) `snocOL`
+ (MOVU f (OpReg r') (OpAddr addr)) `snocOL`
+ (MOVL f (OpAddr addr) (OpReg r)) `snocOL`
+ (MOVH f (OpAddr addr) (OpReg r)))
+    vector_float_broadcast_avx _ _ c _
+      = pprPanic "Broadcast not supported for: " (ppr c)
+ -----------------------
+ vector_float_broadcast_sse :: Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_broadcast_sse len W32 expr1 expr2
+ = do
+ dflags <- getDynFlags
+ fn <- getAnyReg expr1 -- destination
+ (r, exp) <- getSomeReg expr2 -- source
+ let f = VecFormat len FmtFloat W32
+ addr = spRel dflags 0
+ code dst = exp `appOL`
+ (fn dst) `snocOL`
+ (MOVU f (OpReg r) (OpAddr addr)) `snocOL`
+ (insertps 0) `snocOL`
+ (insertps 16) `snocOL`
+ (insertps 32) `snocOL`
+ (insertps 48)
+            where
+              -- INSERTPS immediate: bits [7:6] select the source element,
+              -- bits [5:4] the destination element; the steps of 16
+              -- therefore walk the destination lane from 0 to 3
+              insertps off =
+                INSERTPS f (OpImm $ litToImm $ CmmInt off W32) (OpAddr addr) dst
+
+ in return $ Any f code
+    vector_float_broadcast_sse _ _ c _
+      = pprPanic "Broadcast not supported for: " (ppr c)
+ -----------------------
sub_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
sub_code rep x (CmmLit (CmmInt y _))
| is32BitInteger (-y) = add_int rep x (-y)
@@ -983,6 +1326,21 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
return (Fixed format result code)
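+-- Vector loads below use unaligned moves (VMOVU/MOVU) throughout,
+-- presumably because 16-byte alignment of Cmm vector data is not
+-- guaranteed at this point.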
+getRegister' _ _ (CmmLoad mem pk)
+ | isVecType pk = do
+ use_avx <- avxEnabled
+ use_sse <- sseEnabled
+ Amode addr mem_code <- getAmode mem
+ let format = cmmTypeFormat pk
+ code dst
+ | use_avx = mem_code `snocOL`
+ VMOVU format (OpAddr addr) (OpReg dst)
+ | use_sse = mem_code `snocOL`
+ MOVU format (OpAddr addr) (OpReg dst)
+          | otherwise = pprPanic "avx or sse flag not enabled for vector load of"
+                                 (ppr pk)
+ return (Any format code)
getRegister' _ _ (CmmLoad mem pk)
| isFloatType pk
@@ -1049,10 +1407,24 @@ getRegister' dflags is32Bit (CmmLit lit)
-- small memory model (see gcc docs, -mcmodel=small).
getRegister' dflags _ (CmmLit lit)
- = do let format = cmmTypeFormat (cmmLitType dflags lit)
- imm = litToImm lit
- code dst = unitOL (MOV format (OpImm imm) (OpReg dst))
- return (Any format code)
+ | isVecType cmmtype = vectorRegister cmmtype
+ | otherwise = standardRegister cmmtype
+ where
+ cmmtype = cmmLitType dflags lit
+ vectorRegister ctype
+ = do
+      -- NOTE: This operation is only used to zero a register (xor
+      -- reg,reg is the idiomatic way to do that). For loading a vector
+      -- literal there are pack and broadcast operations instead.
+ let format = cmmTypeFormat ctype
+ code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
+ return (Any format code)
+ standardRegister ctype
+ = do
+ let format = cmmTypeFormat ctype
+ imm = litToImm lit
+ code dst = unitOL (MOV format (OpImm imm) (OpReg dst))
+ return (Any format code)
getRegister' _ _ other
| isVecExpr other = needLlvm
@@ -1118,8 +1490,14 @@ getNonClobberedReg expr = do
return (reg, code)
reg2reg :: Format -> Reg -> Reg -> Instr
-reg2reg format src dst = MOV format (OpReg src) (OpReg dst)
-
+reg2reg format@(VecFormat _ FmtFloat W32) src dst
+ = VMOVU format (OpReg src) (OpReg dst)
+reg2reg format@(VecFormat _ FmtDouble W64) src dst
+ = VMOVU format (OpReg src) (OpReg dst)
+reg2reg (VecFormat _ _ _) _ _
+ = panic "MOV operation not implemented for vectors"
+reg2reg format src dst
+ = MOV format (OpReg src) (OpReg dst)
--------------------------------------------------------------------------------
getAmode :: CmmExpr -> NatM Amode
@@ -1181,6 +1559,9 @@ getAmode' _ (CmmMachOp (MO_Add _)
getAmode' _ (CmmMachOp (MO_Add _) [x,y])
= x86_complex_amode x y 0 0
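+-- A float literal has no immediate form; memConstant places it in the
+-- data section and returns an amode pointing at it.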
+getAmode' _ (CmmLit lit@(CmmFloat _ w))
+ = memConstant (mkAlignment $ widthInBytes w) lit
+
getAmode' is32Bit (CmmLit lit) | is32BitLit is32Bit lit
= return (Amode (ImmAddr (litToImm lit) 0) nilOL)
@@ -1561,7 +1942,8 @@ assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
-
+assignMem_VecCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_VecCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
-- integer assignment to memory
-- specific case of adding/subtracting an integer to a particular address.
@@ -1638,6 +2020,29 @@ assignReg_FltCode _ reg src = do
let platform = targetPlatform dflags
return (src_code (getRegisterReg platform reg))
+assignMem_VecCode pk addr src = do
+  (src_reg, src_code) <- getNonClobberedReg src
+  Amode addr' addr_code <- getAmode addr
+  use_avx <- avxEnabled
+  use_sse <- sseEnabled
+  let
+    code | use_avx = src_code `appOL`
+                     addr_code `snocOL`
+                     (VMOVU pk (OpReg src_reg) (OpAddr addr'))
+         | use_sse = src_code `appOL`
+                     addr_code `snocOL`
+                     (MOVU pk (OpReg src_reg) (OpAddr addr'))
+         | otherwise = sorry "Please enable the -mavx or -msse flag"
+  return code
+
+assignReg_VecCode format reg src = do
+ use_avx <- avxEnabled
+ use_sse <- sseEnabled
+ src_code <- getAnyReg src
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ flag = use_avx || use_sse
+ return (src_code (getVecRegisterReg platform flag format reg))
genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
@@ -3362,6 +3767,7 @@ sse2NegCode w x = do
x@II16 -> wrongFmt x
x@II32 -> wrongFmt x
x@II64 -> wrongFmt x
+ x@VecFormat {} -> wrongFmt x
where
wrongFmt x = panic $ "sse2NegCode: " ++ show x
@@ -3376,29 +3782,33 @@ sse2NegCode w x = do
return (Any fmt code)
isVecExpr :: CmmExpr -> Bool
-isVecExpr (CmmMachOp (MO_V_Insert {}) _) = True
-isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True
-isVecExpr (CmmMachOp (MO_V_Add {}) _) = True
-isVecExpr (CmmMachOp (MO_V_Sub {}) _) = True
-isVecExpr (CmmMachOp (MO_V_Mul {}) _) = True
-isVecExpr (CmmMachOp (MO_VS_Quot {}) _) = True
-isVecExpr (CmmMachOp (MO_VS_Rem {}) _) = True
-isVecExpr (CmmMachOp (MO_VS_Neg {}) _) = True
-isVecExpr (CmmMachOp (MO_VF_Insert {}) _) = True
-isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True
-isVecExpr (CmmMachOp (MO_VF_Add {}) _) = True
-isVecExpr (CmmMachOp (MO_VF_Sub {}) _) = True
-isVecExpr (CmmMachOp (MO_VF_Mul {}) _) = True
-isVecExpr (CmmMachOp (MO_VF_Quot {}) _) = True
-isVecExpr (CmmMachOp (MO_VF_Neg {}) _) = True
-isVecExpr (CmmMachOp _ [e]) = isVecExpr e
-isVecExpr _ = False
+isVecExpr (CmmMachOp (MO_V_Insert {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Add {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Sub {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Mul {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Quot {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Rem {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Neg {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Broadcast {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Insert {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Add {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Sub {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Mul {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Quot {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Neg {}) _) = True
+isVecExpr (CmmMachOp _ [e]) = isVecExpr e
+isVecExpr _ = False
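+-- The fall-through case only looks through unary MachOps (e.g.
+-- conversions); vector operands of dyadic ops are expected to be
+-- caught by the explicit patterns above.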
needLlvm :: NatM a
needLlvm =
sorry $ unlines ["The native code generator does not support vector"
,"instructions. Please use -fllvm."]
+incorrectOperands :: NatM a
+incorrectOperands = sorry "Incorrect number of operands"
+
-- | This works on the invariant that all jumps in the given blocks are required.
-- Starting from there we try to make a few more jumps redundant by reordering
-- them.