Merge branch 'wip/simd'

This merge revises and extends the current SIMD support in GHC. Notable features: * Support for AVX, AVX2, and AVX-512. Support for AVX-512 is untested. * SIMD primops are currently LLVM-only and documented in compiler/prelude/primops.txt.pp. * By default only 128-bit wide SIMD vectors are passed in registers, and then only on the X86_64 architecture. There is a "hidden" flag, -fllvm-pass-vectors-in-regs, that causes GHC to generate LLVM code that assumes all vectors are passed in registers by LLVM. This can be used with a suitably patched version of LLVM, and if we get LLVM 3.4 patched, we can consider turning it on by default for LLVM 3.4+. This would mean that we couldn't mix LLVM <3.4-compiled object files with LLVM >=3.4-compiled object files, but I don't see that as much of a problem. * utils/genprimcode has been hacked up to allow us to write vector operations once and have them instantiated at multiple vector types. I'm not thrilled with this solution, but after discussing with Simon PJ, what I've implemented seems to be the minimal reasonable solution to the problem of exploding primop boilerplate. The changes are documented in compiler/prelude/primops.txt.pp. * Error handling is sub-optimal. My patch checks to make sure that vector primops can be compiled efficiently based on the current set of dynamic flags. For example, if -mavx is not specified and the user tries to use a primop that adds together two 256-bit wide vectors of double-precision elements, the user will see an error message like: ghc-stage2: sorry! (unimplemented feature or known bug) (GHC version 7.7.20130916 for x86_64-unknown-linux): 256-bit wide floating point SIMD vector instructions require at least -mavx.
author: Geoffrey Mainland <gmainlan@microsoft.com> 2013-09-23 00:22:19 -0400
committer: Geoffrey Mainland <gmainlan@microsoft.com> 2013-09-23 00:22:19 -0400
commit: 680441de191145dd874bd453e09e4ee906d87bbb (patch)
tree: 0246607039802b6270563cfb3c3751cb554cdc74
parent: 6e6e6f5bfdfc3996603064523af6e2be2c5131fa (diff)
parent: 25eeb6782a8f8cfdd3d8e9515863007c609eafc7 (diff)
download: haskell-680441de191145dd874bd453e09e4ee906d87bbb.tar.gz
41 files changed, 1380 insertions, 728 deletions
diff --git a/compiler/cmm/CmmCallConv.hs b/compiler/cmm/CmmCallConv.hs
index 35499333d0..60e2c8c8f7 100644
--- a/compiler/cmm/CmmCallConv.hs
+++ b/compiler/cmm/CmmCallConv.hs
@@ -14,6 +14,7 @@ import Cmm (Convention(..))
 import PprCmm ()
 
 import DynFlags
+import Platform
 import Outputable
 
 -- Calculate the 'GlobalReg' or stack locations for function call
@@ -65,15 +66,22 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
                                     | isFloatType ty = float
                                     | otherwise      = int
         where vec = case (w, regs) of
-                      (W128, (vs, fs, ds, ls, s:ss)) -> k (RegisterParam (XmmReg s), (vs, fs, ds, ls, ss))
+                      (W128, (vs, fs, ds, ls, s:ss))
+                          | passVectorInReg W128 dflags -> k (RegisterParam (XmmReg s), (vs, fs, ds, ls, ss))
+                      (W256, (vs, fs, ds, ls, s:ss))
+                          | passVectorInReg W256 dflags -> k (RegisterParam (YmmReg s), (vs, fs, ds, ls, ss))
+                      (W512, (vs, fs, ds, ls, s:ss))
+                          | passVectorInReg W512 dflags -> k (RegisterParam (ZmmReg s), (vs, fs, ds, ls, ss))
                       _ -> (assts, (r:rs))
               float = case (w, regs) of
-                        (W32, (vs, fs, ds, ls, s:ss)) -> k (RegisterParam (FloatReg s), (vs, fs, ds, ls, ss))
+                        (W32, (vs, fs, ds, ls, s:ss))
+                            | passFloatInXmm          -> k (RegisterParam (FloatReg s), (vs, fs, ds, ls, ss))
                         (W32, (vs, f:fs, ds, ls, ss))
-                            | not hasXmmRegs          -> k (RegisterParam f, (vs, fs, ds, ls, ss))
-                        (W64, (vs, fs, ds, ls, s:ss)) -> k (RegisterParam (DoubleReg s), (vs, fs, ds, ls, ss))
+                            | not passFloatInXmm      -> k (RegisterParam f, (vs, fs, ds, ls, ss))
+                        (W64, (vs, fs, ds, ls, s:ss))
+                            | passFloatInXmm          -> k (RegisterParam (DoubleReg s), (vs, fs, ds, ls, ss))
                         (W64, (vs, fs, d:ds, ls, ss))
-                            | not hasXmmRegs          -> k (RegisterParam d, (vs, fs, ds, ls, ss))
+                            | not passFloatInXmm      -> k (RegisterParam d, (vs, fs, ds, ls, ss))
                         (W80, _) -> panic "F80 unsupported register type"
                         _ -> (assts, (r:rs))
               int = case (w, regs) of
@@ -88,8 +96,26 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
               w  = typeWidth ty
               gcp | isGcPtrType ty = VGcPtr
                   | otherwise      = VNonGcPtr
-              hasXmmRegs = mAX_Real_XMM_REG dflags /= 0
-
+              passFloatInXmm = passFloatArgsInXmm dflags
+
+passFloatArgsInXmm :: DynFlags -> Bool
+passFloatArgsInXmm dflags = case platformArch (targetPlatform dflags) of
+                              ArchX86_64 -> True
+                              _          -> False
+
+-- On X86_64, we always pass 128-bit-wide vectors in registers. On 32-bit X86
+-- and for all larger vector sizes on X86_64, LLVM's GHC calling convention
+-- doesn't currently passing vectors in registers. The patch to update the GHC
+-- calling convention to support passing SIMD vectors in registers is small and
+-- well-contained, so it may make it into LLVM 3.4. The hidden
+-- -fllvm-pass-vectors-in-regs flag will generate LLVM code that attempts to
+-- pass vectors in registers, but it must only be used with a version of LLVM
+-- that has an updated GHC calling convention.
+passVectorInReg :: Width -> DynFlags -> Bool
+passVectorInReg W128 dflags = case platformArch (targetPlatform dflags) of
+                                ArchX86_64 -> True
+                                _          -> gopt Opt_LlvmPassVectorsInRegisters dflags
+passVectorInReg _    dflags = gopt Opt_LlvmPassVectorsInRegisters dflags
 
 assignStack :: DynFlags -> ByteOff -> (a -> CmmType) -> [a]
             -> (
@@ -158,7 +184,10 @@ realVanillaRegs dflags = map VanillaReg $ regList (mAX_Real_Vanilla_REG dflags)
 realFloatRegs   dflags = map FloatReg   $ regList (mAX_Real_Float_REG   dflags)
 realDoubleRegs  dflags = map DoubleReg  $ regList (mAX_Real_Double_REG  dflags)
 realLongRegs    dflags = map LongReg    $ regList (mAX_Real_Long_REG    dflags)
-realXmmRegNos   dflags =                  regList (mAX_Real_XMM_REG     dflags)
+
+realXmmRegNos dflags
+    | isSse2Enabled dflags = regList (mAX_Real_XMM_REG     dflags)
+    | otherwise            = []
 
 regList :: Int -> [Int]
 regList n = [1 .. n]
@@ -180,12 +209,11 @@ nodeOnly = ([VanillaReg 1], [], [], [], [])
 -- only use this functionality in hand-written C-- code in the RTS.
 realArgRegsCover :: DynFlags -> [GlobalReg]
 realArgRegsCover dflags
-    | hasXmmRegs = map ($VGcPtr) (realVanillaRegs dflags) ++
-                   realDoubleRegs dflags ++
-                   realLongRegs dflags
-    | otherwise  = map ($VGcPtr) (realVanillaRegs dflags) ++
-                   realFloatRegs dflags ++
-                   realDoubleRegs dflags ++
-                   realLongRegs dflags
-  where
-    hasXmmRegs = mAX_Real_XMM_REG dflags /= 0
+    | passFloatArgsInXmm dflags = map ($VGcPtr) (realVanillaRegs dflags) ++
+                                  realLongRegs dflags ++
+                                  map XmmReg (realXmmRegNos dflags)
+    | otherwise                 = map ($VGcPtr) (realVanillaRegs dflags) ++
+                                  realFloatRegs dflags ++
+                                  realDoubleRegs dflags ++
+                                  realLongRegs dflags ++
+                                  map XmmReg (realXmmRegNos dflags)
diff --git a/compiler/cmm/CmmExpr.hs b/compiler/cmm/CmmExpr.hs
index d3624dac6b..0f5abda74b 100644
--- a/compiler/cmm/CmmExpr.hs
+++ b/compiler/cmm/CmmExpr.hs
@@ -343,6 +343,12 @@ data GlobalReg
   | XmmReg                      -- 128-bit SIMD vector register
         {-# UNPACK #-} !Int     -- its number
 
+  | YmmReg                      -- 256-bit SIMD vector register 
+        {-# UNPACK #-} !Int     -- its number
+
+  | ZmmReg                      -- 512-bit SIMD vector register 
+        {-# UNPACK #-} !Int     -- its number
+
   -- STG registers
   | Sp                  -- Stack ptr; points to last occupied stack location.
   | SpLim               -- Stack limit
@@ -379,6 +385,8 @@ instance Eq GlobalReg where
    DoubleReg i == DoubleReg j = i==j
    LongReg i == LongReg j = i==j
    XmmReg i == XmmReg j = i==j
+   YmmReg i == YmmReg j = i==j
+   ZmmReg i == ZmmReg j = i==j
    Sp == Sp = True
    SpLim == SpLim = True
    Hp == Hp = True
@@ -401,6 +409,8 @@ instance Ord GlobalReg where
    compare (DoubleReg i) (DoubleReg j) = compare i j
    compare (LongReg i)   (LongReg   j) = compare i j
    compare (XmmReg i)    (XmmReg    j) = compare i j
+   compare (YmmReg i)    (YmmReg    j) = compare i j
+   compare (ZmmReg i)    (ZmmReg    j) = compare i j
    compare Sp Sp = EQ
    compare SpLim SpLim = EQ
    compare Hp Hp = EQ
@@ -424,6 +434,10 @@ instance Ord GlobalReg where
    compare _ (LongReg _)      = GT
    compare (XmmReg _) _       = LT
    compare _ (XmmReg _)       = GT
+   compare (YmmReg _) _       = LT
+   compare _ (YmmReg _)       = GT
+   compare (ZmmReg _) _       = LT
+   compare _ (ZmmReg _)       = GT
    compare Sp _ = LT
    compare _ Sp = GT
    compare SpLim _ = LT
@@ -467,6 +481,8 @@ globalRegType _      (FloatReg _)      = cmmFloat W32
 globalRegType _      (DoubleReg _)     = cmmFloat W64
 globalRegType _      (LongReg _)       = cmmBits W64
 globalRegType _      (XmmReg _)        = cmmVec 4 (cmmBits W32)
+globalRegType _      (YmmReg _)        = cmmVec 8 (cmmBits W32)
+globalRegType _      (ZmmReg _)        = cmmVec 16 (cmmBits W32)
 
 globalRegType dflags Hp                = gcWord dflags
                                             -- The initialiser for all
@@ -479,4 +495,6 @@ isArgReg (FloatReg {})   = True
 isArgReg (DoubleReg {})  = True
 isArgReg (LongReg {})    = True
 isArgReg (XmmReg {})     = True
+isArgReg (YmmReg {})     = True
+isArgReg (ZmmReg {})     = True
 isArgReg _               = False
diff --git a/compiler/cmm/CmmLex.x b/compiler/cmm/CmmLex.x
index 81227eb05f..b9da1e0fa6 100644
--- a/compiler/cmm/CmmLex.x
+++ b/compiler/cmm/CmmLex.x
@@ -164,6 +164,8 @@ data CmmToken
   | CmmT_bits32
   | CmmT_bits64
   | CmmT_bits128
+  | CmmT_bits256
+  | CmmT_bits512
   | CmmT_float32
   | CmmT_float64
   | CmmT_gcptr
@@ -244,6 +246,8 @@ reservedWordsFM = listToUFM $
 	( "bits32",		CmmT_bits32 ),
 	( "bits64",		CmmT_bits64 ),
 	( "bits128",		CmmT_bits128 ),
+	( "bits256",		CmmT_bits256 ),
+	( "bits512",		CmmT_bits512 ),
 	( "float32",		CmmT_float32 ),
 	( "float64",		CmmT_float64 ),
 -- New forms
@@ -252,6 +256,8 @@ reservedWordsFM = listToUFM $
 	( "b32",		CmmT_bits32 ),
 	( "b64",		CmmT_bits64 ),
 	( "b128",		CmmT_bits128 ),
+	( "b256",		CmmT_bits256 ),
+	( "b512",		CmmT_bits512 ),
 	( "f32",		CmmT_float32 ),
 	( "f64",		CmmT_float64 ),
 	( "gcptr",		CmmT_gcptr )
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index 8d42bbd2cb..c009d15e25 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -118,6 +118,10 @@ data MachOp
   | MO_VS_Rem  Length Width
   | MO_VS_Neg  Length Width
 
+  -- Unsigned vector multiply/divide
+  | MO_VU_Quot Length Width
+  | MO_VU_Rem  Length Width
+
   -- Floting point vector element insertion and extraction operations
   | MO_VF_Insert  Length Width   -- Insert scalar into vector
   | MO_VF_Extract Length Width   -- Extract scalar from vector
@@ -375,6 +379,9 @@ machOpResultType dflags mop tys =
     MO_VS_Rem  l w      -> cmmVec l (cmmBits w)
     MO_VS_Neg  l w      -> cmmVec l (cmmBits w)
 
+    MO_VU_Quot l w      -> cmmVec l (cmmBits w)
+    MO_VU_Rem  l w      -> cmmVec l (cmmBits w)
+
     MO_VF_Insert  l w   -> cmmVec l (cmmFloat w)
     MO_VF_Extract _ w   -> cmmFloat w
 
@@ -461,6 +468,9 @@ machOpArgReps dflags op =
     MO_VS_Rem  _ r      -> [r,r]
     MO_VS_Neg  _ r      -> [r]
 
+    MO_VU_Quot _ r      -> [r,r]
+    MO_VU_Rem  _ r      -> [r,r]
+
     MO_VF_Insert  l r   -> [typeWidth (vec l (cmmFloat r)),r,wordWidth dflags]
     MO_VF_Extract l r   -> [typeWidth (vec l (cmmFloat r)),wordWidth dflags]
 
diff --git a/compiler/cmm/CmmParse.y b/compiler/cmm/CmmParse.y
index 8c36deafbb..8367f7abd4 100644
--- a/compiler/cmm/CmmParse.y
+++ b/compiler/cmm/CmmParse.y
@@ -289,6 +289,8 @@ import Data.Maybe
         'bits32'        { L _ (CmmT_bits32) }
         'bits64'        { L _ (CmmT_bits64) }
         'bits128'       { L _ (CmmT_bits128) }
+        'bits256'       { L _ (CmmT_bits256) }
+        'bits512'       { L _ (CmmT_bits512) }
         'float32'       { L _ (CmmT_float32) }
         'float64'       { L _ (CmmT_float64) }
         'gcptr'         { L _ (CmmT_gcptr) }
@@ -777,6 +779,8 @@ typenot8 :: { CmmType }
         | 'bits32'              { b32 }
         | 'bits64'              { b64 }
         | 'bits128'             { b128 }
+        | 'bits256'             { b256 }
+        | 'bits512'             { b512 }
         | 'float32'             { f32 }
         | 'float64'             { f64 }
         | 'gcptr'               {% do dflags <- getDynFlags; return $ gcWord dflags }
diff --git a/compiler/cmm/CmmType.hs b/compiler/cmm/CmmType.hs
index 8eac8c9e85..d03c2dc0b9 100644
--- a/compiler/cmm/CmmType.hs
+++ b/compiler/cmm/CmmType.hs
@@ -1,7 +1,7 @@
 
 module CmmType
     ( CmmType   -- Abstract
-    , b8, b16, b32, b64, b128, f32, f64, bWord, bHalfWord, gcWord
+    , b8, b16, b32, b64, b128, b256, b512, f32, f64, bWord, bHalfWord, gcWord
     , cInt, cLong
     , cmmBits, cmmFloat
     , typeWidth, cmmEqType, cmmEqType_ignoring_ptrhood
@@ -107,12 +107,14 @@ cmmFloat = CmmType FloatCat
 
 -------- Common CmmTypes ------------
 -- Floats and words of specific widths
-b8, b16, b32, b64, b128, f32, f64 :: CmmType
+b8, b16, b32, b64, b128, b256, b512, f32, f64 :: CmmType
 b8     = cmmBits W8
 b16    = cmmBits W16
 b32    = cmmBits W32
 b64    = cmmBits W64
 b128   = cmmBits W128
+b256   = cmmBits W256
+b512   = cmmBits W512
 f32    = cmmFloat W32
 f64    = cmmFloat W64
 
@@ -166,6 +168,8 @@ data Width   = W8 | W16 | W32 | W64
                         -- used in x86 native codegen only.
                         -- (we use Ord, so it'd better be in this order)
              | W128
+             | W256
+             | W512
              deriving (Eq, Ord, Show)
 
 instance Outputable Width where
@@ -177,6 +181,8 @@ mrStr W16  = sLit("W16")
 mrStr W32  = sLit("W32")
 mrStr W64  = sLit("W64")
 mrStr W128 = sLit("W128")
+mrStr W256 = sLit("W256")
+mrStr W512 = sLit("W512")
 mrStr W80  = sLit("W80")
 
 
@@ -216,6 +222,8 @@ widthInBits W16  = 16
 widthInBits W32  = 32
 widthInBits W64  = 64
 widthInBits W128 = 128
+widthInBits W256 = 256
+widthInBits W512 = 512
 widthInBits W80  = 80
 
 widthInBytes :: Width -> Int
@@ -224,6 +232,8 @@ widthInBytes W16  = 2
 widthInBytes W32  = 4
 widthInBytes W64  = 8
 widthInBytes W128 = 16
+widthInBytes W256 = 32
+widthInBytes W512 = 64
 widthInBytes W80  = 10
 
 widthFromBytes :: Int -> Width
@@ -232,6 +242,8 @@ widthFromBytes 2  = W16
 widthFromBytes 4  = W32
 widthFromBytes 8  = W64
 widthFromBytes 16 = W128
+widthFromBytes 32 = W256
+widthFromBytes 64 = W512
 widthFromBytes 10 = W80
 widthFromBytes n  = pprPanic "no width for given number of bytes" (ppr n)
 
@@ -242,6 +254,8 @@ widthInLog W16  = 1
 widthInLog W32  = 2
 widthInLog W64  = 3
 widthInLog W128 = 4
+widthInLog W256 = 5
+widthInLog W512 = 6
 widthInLog W80  = panic "widthInLog: F80"
 
 -- widening / narrowing
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index d45b103954..c468161c73 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -651,6 +651,15 @@ pprMachOp_for_C mop = case mop of
                                 (panic $ "PprC.pprMachOp_for_C: MO_VS_Neg"
                                       ++ " should have been handled earlier!")
 
+        MO_VU_Quot {}     -> pprTrace "offending mop:"
+                                (ptext $ sLit "MO_VU_Quot")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VU_Quot"
+                                      ++ " should have been handled earlier!")
+        MO_VU_Rem {}      -> pprTrace "offending mop:"
+                                (ptext $ sLit "MO_VU_Rem")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VU_Rem"
+                                      ++ " should have been handled earlier!")
+
         MO_VF_Insert {}   -> pprTrace "offending mop:"
                                 (ptext $ sLit "MO_VF_Insert")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VF_Insert"
diff --git a/compiler/cmm/PprCmmExpr.hs b/compiler/cmm/PprCmmExpr.hs
index d1128b07d3..0bb79ac147 100644
--- a/compiler/cmm/PprCmmExpr.hs
+++ b/compiler/cmm/PprCmmExpr.hs
@@ -256,6 +256,8 @@ pprGlobalReg gr
         DoubleReg  n   -> char 'D' <> int n
         LongReg    n   -> char 'L' <> int n
         XmmReg     n   -> ptext (sLit "XMM") <> int n
+        YmmReg     n   -> ptext (sLit "YMM") <> int n
+        ZmmReg     n   -> ptext (sLit "ZMM") <> int n
         Sp             -> ptext (sLit "Sp")
         SpLim          -> ptext (sLit "SpLim")
         Hp             -> ptext (sLit "Hp")
diff --git a/compiler/codeGen/CgUtils.hs b/compiler/codeGen/CgUtils.hs
index c06dd60cb1..6b36ab09cd 100644
--- a/compiler/codeGen/CgUtils.hs
+++ b/compiler/codeGen/CgUtils.hs
@@ -56,6 +56,20 @@ baseRegOffset dflags (XmmReg 4)          = oFFSET_StgRegTable_rXMM4 dflags
 baseRegOffset dflags (XmmReg 5)          = oFFSET_StgRegTable_rXMM5 dflags
 baseRegOffset dflags (XmmReg 6)          = oFFSET_StgRegTable_rXMM6 dflags
 baseRegOffset _      (XmmReg n)          = panic ("Registers above XMM6 are not supported (tried to use XMM" ++ show n ++ ")")
+baseRegOffset dflags (YmmReg 1)          = oFFSET_StgRegTable_rYMM1 dflags
+baseRegOffset dflags (YmmReg 2)          = oFFSET_StgRegTable_rYMM2 dflags
+baseRegOffset dflags (YmmReg 3)          = oFFSET_StgRegTable_rYMM3 dflags
+baseRegOffset dflags (YmmReg 4)          = oFFSET_StgRegTable_rYMM4 dflags
+baseRegOffset dflags (YmmReg 5)          = oFFSET_StgRegTable_rYMM5 dflags
+baseRegOffset dflags (YmmReg 6)          = oFFSET_StgRegTable_rYMM6 dflags
+baseRegOffset _      (YmmReg n)          = panic ("Registers above YMM6 are not supported (tried to use YMM" ++ show n ++ ")")
+baseRegOffset dflags (ZmmReg 1)          = oFFSET_StgRegTable_rZMM1 dflags
+baseRegOffset dflags (ZmmReg 2)          = oFFSET_StgRegTable_rZMM2 dflags
+baseRegOffset dflags (ZmmReg 3)          = oFFSET_StgRegTable_rZMM3 dflags
+baseRegOffset dflags (ZmmReg 4)          = oFFSET_StgRegTable_rZMM4 dflags
+baseRegOffset dflags (ZmmReg 5)          = oFFSET_StgRegTable_rZMM5 dflags
+baseRegOffset dflags (ZmmReg 6)          = oFFSET_StgRegTable_rZMM6 dflags
+baseRegOffset _      (ZmmReg n)          = panic ("Registers above ZMM6 are not supported (tried to use ZMM" ++ show n ++ ")")
 baseRegOffset dflags Sp                  = oFFSET_StgRegTable_rSp dflags
 baseRegOffset dflags SpLim               = oFFSET_StgRegTable_rSpLim dflags
 baseRegOffset dflags (LongReg 1)         = oFFSET_StgRegTable_rL1 dflags
diff --git a/compiler/codeGen/StgCmmArgRep.hs b/compiler/codeGen/StgCmmArgRep.hs
index bd228d4617..1e68105aac 100644
--- a/compiler/codeGen/StgCmmArgRep.hs
+++ b/compiler/codeGen/StgCmmArgRep.hs
@@ -47,6 +47,8 @@ data ArgRep = P   -- GC Ptr
             | F   -- Float
             | D   -- Double
             | V16 -- 16-byte (128-bit) vectors of Float/Double/Int8/Word32/etc.
+            | V32 -- 32-byte (256-bit) vectors of Float/Double/Int8/Word32/etc.
+            | V64 -- 64-byte (512-bit) vectors of Float/Double/Int8/Word32/etc.
 instance Outputable ArgRep where ppr = text . argRepString
 
 argRepString :: ArgRep -> String
@@ -57,6 +59,8 @@ argRepString V = "V"
 argRepString F = "F"
 argRepString D = "D"
 argRepString V16 = "V16"
+argRepString V32 = "V32"
+argRepString V64 = "V64"
 
 toArgRep :: PrimRep -> ArgRep
 toArgRep VoidRep           = V
@@ -68,9 +72,11 @@ toArgRep Int64Rep          = L
 toArgRep Word64Rep         = L
 toArgRep FloatRep          = F
 toArgRep DoubleRep         = D
-toArgRep (VecRep len elem)
-    | len*primElemRepSizeB elem == 16 = V16
-    | otherwise                       = error "toArgRep: bad vector primrep"
+toArgRep (VecRep len elem) = case len*primElemRepSizeB elem of
+                               16 -> V16
+                               32 -> V32
+                               64 -> V64
+                               _  -> error "toArgRep: bad vector primrep"
 
 isNonV :: ArgRep -> Bool
 isNonV V = False
@@ -84,6 +90,8 @@ argRepSizeW dflags L   = wORD64_SIZE        `quot` wORD_SIZE dflags
 argRepSizeW dflags D   = dOUBLE_SIZE dflags `quot` wORD_SIZE dflags
 argRepSizeW _      V   = 0
 argRepSizeW dflags V16 = 16                 `quot` wORD_SIZE dflags
+argRepSizeW dflags V32 = 32                 `quot` wORD_SIZE dflags
+argRepSizeW dflags V64 = 64                 `quot` wORD_SIZE dflags
 
 idArgRep :: Id -> ArgRep
 idArgRep = toArgRep . idPrimRep
@@ -132,4 +140,6 @@ slowCallPattern (F: _)                = (fsLit "stg_ap_f", 1)
 slowCallPattern (D: _)                = (fsLit "stg_ap_d", 1)
 slowCallPattern (L: _)                = (fsLit "stg_ap_l", 1)
 slowCallPattern (V16: _)              = (fsLit "stg_ap_v16", 1)
+slowCallPattern (V32: _)              = (fsLit "stg_ap_v32", 1)
+slowCallPattern (V64: _)              = (fsLit "stg_ap_v64", 1)
 slowCallPattern []                    = (fsLit "stg_ap_0", 0)
diff --git a/compiler/codeGen/StgCmmLayout.hs b/compiler/codeGen/StgCmmLayout.hs
index b52d4e57df..84ff21b3d0 100644
--- a/compiler/codeGen/StgCmmLayout.hs
+++ b/compiler/codeGen/StgCmmLayout.hs
@@ -385,6 +385,8 @@ stdPattern reps
         [D]   -> Just ARG_D
         [L]   -> Just ARG_L
         [V16] -> Just ARG_V16
+        [V32] -> Just ARG_V32
+        [V64] -> Just ARG_V64
 
         [N,N] -> Just ARG_NN
         [N,P] -> Just ARG_NP
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index 8560f7cf1c..523fcb21f9 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -40,7 +40,7 @@ import FastString
 import Outputable
 import Util
 
-import Control.Monad (liftM)
+import Control.Monad (liftM, when)
 import Data.Bits
 
 ------------------------------------------------------------------------
@@ -380,14 +380,6 @@ emitPrimOp dflags res IndexOffAddrOp_Word8            args = doIndexOffAddrOp
 emitPrimOp dflags res IndexOffAddrOp_Word16           args = doIndexOffAddrOp   (Just (mo_u_16ToWord dflags)) b16 res args
 emitPrimOp dflags res IndexOffAddrOp_Word32           args = doIndexOffAddrOp   (Just (mo_u_32ToWord dflags)) b32 res args
 emitPrimOp _      res IndexOffAddrOp_Word64           args = doIndexOffAddrOp   Nothing b64 res args
-emitPrimOp _      res IndexOffAddrOp_FloatX4          args = doIndexOffAddrOp   Nothing vec4f32 res args
-emitPrimOp _      res IndexOffAddrOp_FloatAsFloatX4   args = doIndexOffAddrOpAs Nothing vec4f32 f32 res args
-emitPrimOp _      res IndexOffAddrOp_DoubleX2         args = doIndexOffAddrOp   Nothing vec2f64 res args
-emitPrimOp _      res IndexOffAddrOp_DoubleAsDoubleX2 args = doIndexOffAddrOpAs Nothing vec2f64 f64 res args
-emitPrimOp _      res IndexOffAddrOp_Int32X4          args = doIndexOffAddrOp   Nothing vec4b32 res args
-emitPrimOp _      res IndexOffAddrOp_Int32AsInt32X4   args = doIndexOffAddrOpAs Nothing vec4b32 b32 res args
-emitPrimOp _      res IndexOffAddrOp_Int64X2          args = doIndexOffAddrOp   Nothing vec2b64 res args
-emitPrimOp _      res IndexOffAddrOp_Int64AsInt64X2   args = doIndexOffAddrOpAs Nothing vec2b64 b64 res args
 
 -- ReadXXXoffAddr, which are identical, for our purposes, to IndexXXXoffAddr.
 
@@ -407,14 +399,6 @@ emitPrimOp dflags res ReadOffAddrOp_Word8            args = doIndexOffAddrOp   (
 emitPrimOp dflags res ReadOffAddrOp_Word16           args = doIndexOffAddrOp   (Just (mo_u_16ToWord dflags)) b16 res args
 emitPrimOp dflags res ReadOffAddrOp_Word32           args = doIndexOffAddrOp   (Just (mo_u_32ToWord dflags)) b32 res args
 emitPrimOp _      res ReadOffAddrOp_Word64           args = doIndexOffAddrOp   Nothing b64 res args
-emitPrimOp _      res ReadOffAddrOp_FloatX4          args = doIndexOffAddrOp   Nothing vec4f32 res args
-emitPrimOp _      res ReadOffAddrOp_FloatAsFloatX4   args = doIndexOffAddrOpAs Nothing vec4f32 b32 res args
-emitPrimOp _      res ReadOffAddrOp_DoubleX2         args = doIndexOffAddrOp   Nothing vec2f64 res args
-emitPrimOp _      res ReadOffAddrOp_DoubleAsDoubleX2 args = doIndexOffAddrOpAs Nothing vec2f64 b64 res args
-emitPrimOp _      res ReadOffAddrOp_Int32X4          args = doIndexOffAddrOp   Nothing vec4b32 res args
-emitPrimOp _      res ReadOffAddrOp_Int32AsInt32X4   args = doIndexOffAddrOpAs Nothing vec4b32 b32 res args
-emitPrimOp _      res ReadOffAddrOp_Int64X2          args = doIndexOffAddrOp   Nothing vec2b64 res args
-emitPrimOp _      res ReadOffAddrOp_Int64AsInt64X2   args = doIndexOffAddrOpAs Nothing vec2b64 b64 res args
 
 -- IndexXXXArray
 
@@ -434,14 +418,6 @@ emitPrimOp dflags res IndexByteArrayOp_Word8            args = doIndexByteArrayO
 emitPrimOp dflags res IndexByteArrayOp_Word16           args = doIndexByteArrayOp   (Just (mo_u_16ToWord dflags)) b16  res args
 emitPrimOp dflags res IndexByteArrayOp_Word32           args = doIndexByteArrayOp   (Just (mo_u_32ToWord dflags)) b32  res args
 emitPrimOp _      res IndexByteArrayOp_Word64           args = doIndexByteArrayOp   Nothing b64  res args
-emitPrimOp _      res IndexByteArrayOp_FloatX4          args = doIndexByteArrayOp   Nothing vec4f32 res args
-emitPrimOp _      res IndexByteArrayOp_FloatAsFloatX4   args = doIndexByteArrayOpAs Nothing vec4f32 f32 res args
-emitPrimOp _      res IndexByteArrayOp_DoubleX2         args = doIndexByteArrayOp   Nothing vec2f64 res args
-emitPrimOp _      res IndexByteArrayOp_DoubleAsDoubleX2 args = doIndexByteArrayOpAs Nothing vec2f64 f64 res args
-emitPrimOp _      res IndexByteArrayOp_Int32X4          args = doIndexByteArrayOp   Nothing vec4b32 res args
-emitPrimOp _      res IndexByteArrayOp_Int32AsInt32X4   args = doIndexByteArrayOpAs Nothing vec4b32 b32 res args
-emitPrimOp _      res IndexByteArrayOp_Int64X2          args = doIndexByteArrayOp   Nothing vec2b64 res args
-emitPrimOp _      res IndexByteArrayOp_Int64AsInt64X2   args = doIndexByteArrayOpAs Nothing vec2b64 b64 res args
 
 -- ReadXXXArray, identical to IndexXXXArray.
 
@@ -461,14 +437,6 @@ emitPrimOp dflags res ReadByteArrayOp_Word8            args = doIndexByteArrayOp
 emitPrimOp dflags res ReadByteArrayOp_Word16           args = doIndexByteArrayOp   (Just (mo_u_16ToWord dflags)) b16  res args
 emitPrimOp dflags res ReadByteArrayOp_Word32           args = doIndexByteArrayOp   (Just (mo_u_32ToWord dflags)) b32  res args
 emitPrimOp _      res ReadByteArrayOp_Word64           args = doIndexByteArrayOp   Nothing b64  res args
-emitPrimOp _      res ReadByteArrayOp_FloatX4          args = doIndexByteArrayOp   Nothing vec4f32 res args
-emitPrimOp _      res ReadByteArrayOp_FloatAsFloatX4   args = doIndexByteArrayOpAs Nothing vec4f32 f32 res args
-emitPrimOp _      res ReadByteArrayOp_DoubleX2         args = doIndexByteArrayOp   Nothing vec2f64 res args
-emitPrimOp _      res ReadByteArrayOp_DoubleAsDoubleX2 args = doIndexByteArrayOpAs Nothing vec2f64 f64 res args
-emitPrimOp _      res ReadByteArrayOp_Int32X4          args = doIndexByteArrayOp   Nothing vec4b32 res args
-emitPrimOp _      res ReadByteArrayOp_Int32AsInt32X4   args = doIndexByteArrayOpAs Nothing vec4b32 b32 res args
-emitPrimOp _      res ReadByteArrayOp_Int64X2          args = doIndexByteArrayOp   Nothing vec2b64 res args
-emitPrimOp _      res ReadByteArrayOp_Int64AsInt64X2   args = doIndexByteArrayOpAs Nothing vec2b64 b64 res args
 
 -- WriteXXXoffAddr
 
@@ -488,14 +456,6 @@ emitPrimOp dflags res WriteOffAddrOp_Word8            args = doWriteOffAddrOp (J
 emitPrimOp dflags res WriteOffAddrOp_Word16           args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) b16 res args
 emitPrimOp dflags res WriteOffAddrOp_Word32           args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args
 emitPrimOp _      res WriteOffAddrOp_Word64           args = doWriteOffAddrOp Nothing b64 res args
-emitPrimOp _      res WriteOffAddrOp_FloatX4          args = doWriteOffAddrOp Nothing vec4f32 res args
-emitPrimOp _      res WriteOffAddrOp_FloatAsFloatX4   args = doWriteOffAddrOp Nothing f32 res args
-emitPrimOp _      res WriteOffAddrOp_DoubleX2         args = doWriteOffAddrOp Nothing vec2f64 res args
-emitPrimOp _      res WriteOffAddrOp_DoubleAsDoubleX2 args = doWriteOffAddrOp Nothing f64 res args
-emitPrimOp _      res WriteOffAddrOp_Int32X4          args = doWriteOffAddrOp Nothing vec4b32 res args
-emitPrimOp _      res WriteOffAddrOp_Int32AsInt32X4   args = doWriteOffAddrOp Nothing b32 res args
-emitPrimOp _      res WriteOffAddrOp_Int64X2          args = doWriteOffAddrOp Nothing vec2b64 res args
-emitPrimOp _      res WriteOffAddrOp_Int64AsInt64X2   args = doWriteOffAddrOp Nothing b64 res args
 
 -- WriteXXXArray
 
@@ -515,14 +475,6 @@ emitPrimOp dflags res WriteByteArrayOp_Word8            args = doWriteByteArrayO
 emitPrimOp dflags res WriteByteArrayOp_Word16           args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b16 res args
 emitPrimOp dflags res WriteByteArrayOp_Word32           args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args
 emitPrimOp _      res WriteByteArrayOp_Word64           args = doWriteByteArrayOp Nothing b64 res args
-emitPrimOp _      res WriteByteArrayOp_FloatX4          args = doWriteByteArrayOp Nothing vec4f32 res args
-emitPrimOp _      res WriteByteArrayOp_FloatAsFloatX4   args = doWriteByteArrayOp Nothing f32 res args
-emitPrimOp _      res WriteByteArrayOp_DoubleX2         args = doWriteByteArrayOp Nothing vec2f64 res args
-emitPrimOp _      res WriteByteArrayOp_DoubleAsDoubleX2 args = doWriteByteArrayOp Nothing f64 res args
-emitPrimOp _      res WriteByteArrayOp_Int32X4          args = doWriteByteArrayOp Nothing vec4b32 res args
-emitPrimOp _      res WriteByteArrayOp_Int32AsInt32X4   args = doWriteByteArrayOp Nothing b32 res args
-emitPrimOp _      res WriteByteArrayOp_Int64X2          args = doWriteByteArrayOp Nothing vec2b64 res args
-emitPrimOp _      res WriteByteArrayOp_Int64AsInt64X2   args = doWriteByteArrayOp Nothing b64 res args
 
 -- Copying and setting byte arrays
 emitPrimOp _      [] CopyByteArrayOp [src,src_off,dst,dst_off,n] =
@@ -556,78 +508,152 @@ emitPrimOp _      [res] Word2FloatOp  [w] = emitPrimCall [res]
 emitPrimOp _      [res] Word2DoubleOp [w] = emitPrimCall [res]
                                             (MO_UF_Conv W64) [w]
 
--- SIMD vector packing and unpacking
-emitPrimOp _ [res] FloatToFloatX4Op [e] =
-    doVecPackOp Nothing vec4f32 zero [e,e,e,e] res
+-- SIMD primops
+emitPrimOp dflags [res] (VecBroadcastOp vcat n w) [e] = do
+    checkVecCompatibility dflags vcat n w
+    doVecPackOp (vecElemInjectCast dflags vcat w) ty zeros (replicate n e) res
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 4 (CmmFloat 0 W32))
+    zeros :: CmmExpr
+    zeros = CmmLit $ CmmVec (replicate n zero)
+
+    zero :: CmmLit
+    zero = case vcat of
+             IntVec   -> CmmInt 0 w
+             WordVec  -> CmmInt 0 w
+             FloatVec -> CmmFloat 0 w
+
+    ty :: CmmType
+    ty = vecVmmType vcat n w
+
+emitPrimOp dflags [res] (VecPackOp vcat n w) es = do
+    checkVecCompatibility dflags vcat n w
+    when (length es /= n) $
+        panic "emitPrimOp: VecPackOp has wrong number of arguments"
+    doVecPackOp (vecElemInjectCast dflags vcat w) ty zeros es res
+  where
+    zeros :: CmmExpr
+    zeros = CmmLit $ CmmVec (replicate n zero)
+
+    zero :: CmmLit
+    zero = case vcat of
+             IntVec   -> CmmInt 0 w
+             WordVec  -> CmmInt 0 w
+             FloatVec -> CmmFloat 0 w
+
+    ty :: CmmType
+    ty = vecVmmType vcat n w
+
+emitPrimOp dflags res (VecUnpackOp vcat n w) [arg] = do
+    checkVecCompatibility dflags vcat n w
+    when (length res /= n) $
+        panic "emitPrimOp: VecUnpackOp has wrong number of results"
+    doVecUnpackOp (vecElemProjectCast dflags vcat w) ty arg res
+  where
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp _ [res] FloatX4PackOp es@[_,_,_,_] =
-    doVecPackOp Nothing vec4f32 zero es res
+emitPrimOp dflags [res] (VecInsertOp vcat n w) [v,e,i] = do
+    checkVecCompatibility dflags vcat n w
+    doVecInsertOp (vecElemInjectCast dflags vcat w) ty v e i res
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 4 (CmmFloat 0 W32))
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp _ res@[_,_,_,_] FloatX4UnpackOp [arg] =
-    doVecUnpackOp Nothing vec4f32 arg res
+emitPrimOp dflags res (VecIndexByteArrayOp vcat n w) args = do
+    checkVecCompatibility dflags vcat n w
+    doIndexByteArrayOp Nothing ty res args
+  where
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp _ [res] FloatX4InsertOp [v,e,i] =
-    doVecInsertOp Nothing vec4f32 v e i res
+emitPrimOp dflags res (VecReadByteArrayOp vcat n w) args = do
+    checkVecCompatibility dflags vcat n w
+    doIndexByteArrayOp Nothing ty res args
+  where
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp _ [res] DoubleToDoubleX2Op [e] =
-    doVecPackOp Nothing vec2f64 zero [e,e] res
+emitPrimOp dflags res (VecWriteByteArrayOp vcat n w) args = do
+    checkVecCompatibility dflags vcat n w
+    doWriteByteArrayOp Nothing ty res args
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 2 (CmmFloat 0 W64))
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp _ [res] DoubleX2PackOp es@[_,_] =
-    doVecPackOp Nothing vec2f64 zero es res
+emitPrimOp dflags res (VecIndexOffAddrOp vcat n w) args = do
+    checkVecCompatibility dflags vcat n w
+    doIndexOffAddrOp Nothing ty res args
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 2 (CmmFloat 0 W64))
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp _ res@[_,_] DoubleX2UnpackOp [arg] =
-    doVecUnpackOp Nothing vec2f64 arg res
+emitPrimOp dflags res (VecReadOffAddrOp vcat n w) args = do
+    checkVecCompatibility dflags vcat n w
+    doIndexOffAddrOp Nothing ty res args
+  where
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp _ [res] DoubleX2InsertOp [v,e,i] =
-    doVecInsertOp Nothing vec2f64 v e i res
+emitPrimOp dflags res (VecWriteOffAddrOp vcat n w) args = do
+    checkVecCompatibility dflags vcat n w
+    doWriteOffAddrOp Nothing ty res args
+  where
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp dflags [res] Int32ToInt32X4Op [e] =
-    doVecPackOp (Just (mo_WordTo32 dflags)) vec4b32 zero [e,e,e,e] res
+emitPrimOp dflags res (VecIndexScalarByteArrayOp vcat n w) args = do
+    checkVecCompatibility dflags vcat n w
+    doIndexByteArrayOpAs Nothing vecty ty res args
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 4 (CmmInt 0 W32))
+    vecty :: CmmType
+    vecty = vecVmmType vcat n w
 
-emitPrimOp dflags [res] Int32X4PackOp es@[_,_,_,_] =
-    doVecPackOp (Just (mo_WordTo32 dflags)) vec4b32 zero es res
+    ty :: CmmType
+    ty = vecCmmCat vcat w
+
+emitPrimOp dflags res (VecReadScalarByteArrayOp vcat n w) args = do
+    checkVecCompatibility dflags vcat n w
+    doIndexByteArrayOpAs Nothing vecty ty res args
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 4 (CmmInt 0 W32))
+    vecty :: CmmType
+    vecty = vecVmmType vcat n w
 
-emitPrimOp dflags res@[_,_,_,_] Int32X4UnpackOp [arg] =
-    doVecUnpackOp (Just (mo_s_32ToWord dflags)) vec4b32 arg res
+    ty :: CmmType
+    ty = vecCmmCat vcat w
 
-emitPrimOp dflags [res] Int32X4InsertOp [v,e,i] =
-    doVecInsertOp (Just (mo_WordTo32 dflags)) vec4b32 v e i res
+emitPrimOp dflags res (VecWriteScalarByteArrayOp vcat n w) args = do
+    checkVecCompatibility dflags vcat n w
+    doWriteByteArrayOp Nothing ty res args
+  where
+    ty :: CmmType
+    ty = vecCmmCat vcat w
 
-emitPrimOp _ [res] Int64ToInt64X2Op [e] =
-    doVecPackOp Nothing vec2b64 zero [e,e] res
+emitPrimOp dflags res (VecIndexScalarOffAddrOp vcat n w) args = do
+    checkVecCompatibility dflags vcat n w
+    doIndexOffAddrOpAs Nothing vecty ty res args
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 2 (CmmInt 0 W64))
+    vecty :: CmmType
+    vecty = vecVmmType vcat n w
 
-emitPrimOp _ [res] Int64X2PackOp es@[_,_] =
-    doVecPackOp Nothing vec2b64 zero es res
+    ty :: CmmType
+    ty = vecCmmCat vcat w
+
+emitPrimOp dflags res (VecReadScalarOffAddrOp vcat n w) args = do
+    checkVecCompatibility dflags vcat n w
+    doIndexOffAddrOpAs Nothing vecty ty res args
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 2 (CmmInt 0 W64))
+    vecty :: CmmType
+    vecty = vecVmmType vcat n w
 
-emitPrimOp _ res@[_,_] Int64X2UnpackOp [arg] =
-    doVecUnpackOp Nothing vec2b64 arg res
+    ty :: CmmType
+    ty = vecCmmCat vcat w
 
-emitPrimOp _ [res] Int64X2InsertOp [v,e,i] =
-    doVecInsertOp Nothing vec2b64 v e i res
+emitPrimOp dflags res (VecWriteScalarOffAddrOp vcat n w) args = do
+    checkVecCompatibility dflags vcat n w
+    doWriteOffAddrOp Nothing ty res args
+  where
+    ty :: CmmType
+    ty = vecCmmCat vcat w
 
 -- Prefetch
 emitPrimOp _ res PrefetchByteArrayOp        args = doPrefetchByteArrayOp res args
@@ -944,33 +970,26 @@ translateOp _      FloatMulOp    = Just (MO_F_Mul  W32)
 translateOp _      FloatDivOp    = Just (MO_F_Quot W32)
 translateOp _      FloatNegOp    = Just (MO_F_Neg  W32)
 
--- Floating point vector ops
-
-translateOp _ FloatX4AddOp  = Just (MO_VF_Add  4 W32)
-translateOp _ FloatX4SubOp  = Just (MO_VF_Sub  4 W32)
-translateOp _ FloatX4MulOp  = Just (MO_VF_Mul  4 W32)
-translateOp _ FloatX4DivOp  = Just (MO_VF_Quot 4 W32)
-translateOp _ FloatX4NegOp  = Just (MO_VF_Neg  4 W32)
-
-translateOp _ DoubleX2AddOp  = Just (MO_VF_Add  2 W64)
-translateOp _ DoubleX2SubOp  = Just (MO_VF_Sub  2 W64)
-translateOp _ DoubleX2MulOp  = Just (MO_VF_Mul  2 W64)
-translateOp _ DoubleX2DivOp  = Just (MO_VF_Quot 2 W64)
-translateOp _ DoubleX2NegOp  = Just (MO_VF_Neg  2 W64)
-
-translateOp _ Int32X4AddOp   = Just (MO_V_Add   4 W32)
-translateOp _ Int32X4SubOp   = Just (MO_V_Sub   4 W32)
-translateOp _ Int32X4MulOp   = Just (MO_V_Mul   4 W32)
-translateOp _ Int32X4QuotOp  = Just (MO_VS_Quot 4 W32)
-translateOp _ Int32X4RemOp   = Just (MO_VS_Rem  4 W32)
-translateOp _ Int32X4NegOp   = Just (MO_VS_Neg  4 W32)
-
-translateOp _ Int64X2AddOp   = Just (MO_V_Add   2 W64)
-translateOp _ Int64X2SubOp   = Just (MO_V_Sub   2 W64)
-translateOp _ Int64X2MulOp   = Just (MO_V_Mul   2 W64)
-translateOp _ Int64X2QuotOp  = Just (MO_VS_Quot 2 W64)
-translateOp _ Int64X2RemOp   = Just (MO_VS_Rem  2 W64)
-translateOp _ Int64X2NegOp   = Just (MO_VS_Neg  2 W64)
+-- Vector ops
+
+translateOp _ (VecAddOp FloatVec n w) = Just (MO_VF_Add  n w)
+translateOp _ (VecSubOp FloatVec n w) = Just (MO_VF_Sub  n w)
+translateOp _ (VecMulOp FloatVec n w) = Just (MO_VF_Mul  n w)
+translateOp _ (VecDivOp FloatVec n w) = Just (MO_VF_Quot n w)
+translateOp _ (VecNegOp FloatVec n w) = Just (MO_VF_Neg  n w)
+
+translateOp _ (VecAddOp  IntVec n w) = Just (MO_V_Add   n w)
+translateOp _ (VecSubOp  IntVec n w) = Just (MO_V_Sub   n w)
+translateOp _ (VecMulOp  IntVec n w) = Just (MO_V_Mul   n w)
+translateOp _ (VecQuotOp IntVec n w) = Just (MO_VS_Quot n w)
+translateOp _ (VecRemOp  IntVec n w) = Just (MO_VS_Rem  n w)
+translateOp _ (VecNegOp  IntVec n w) = Just (MO_VS_Neg  n w)
+
+translateOp _ (VecAddOp  WordVec n w) = Just (MO_V_Add   n w)
+translateOp _ (VecSubOp  WordVec n w) = Just (MO_V_Sub   n w)
+translateOp _ (VecMulOp  WordVec n w) = Just (MO_V_Mul   n w)
+translateOp _ (VecQuotOp WordVec n w) = Just (MO_VU_Quot n w)
+translateOp _ (VecRemOp  WordVec n w) = Just (MO_VU_Rem  n w)
 
 -- Conversions
 
@@ -1183,6 +1202,70 @@ setInfo :: CmmExpr -> CmmExpr -> CmmAGraph
 setInfo closure_ptr info_ptr = mkStore closure_ptr info_ptr
 
 ------------------------------------------------------------------------------
+-- Helpers for translating vector primops.
+
+vecVmmType :: PrimOpVecCat -> Length -> Width -> CmmType
+vecVmmType pocat n w = vec n (vecCmmCat pocat w)
+
+vecCmmCat :: PrimOpVecCat -> Width -> CmmType
+vecCmmCat IntVec   = cmmBits
+vecCmmCat WordVec  = cmmBits
+vecCmmCat FloatVec = cmmFloat
+
+vecElemInjectCast :: DynFlags -> PrimOpVecCat -> Width -> Maybe MachOp
+vecElemInjectCast _      FloatVec _   =  Nothing
+vecElemInjectCast dflags IntVec   W8  =  Just (mo_WordTo8  dflags)
+vecElemInjectCast dflags IntVec   W16 =  Just (mo_WordTo16 dflags)
+vecElemInjectCast dflags IntVec   W32 =  Just (mo_WordTo32 dflags)
+vecElemInjectCast _      IntVec   W64 =  Nothing
+vecElemInjectCast dflags WordVec  W8  =  Just (mo_WordTo8  dflags)
+vecElemInjectCast dflags WordVec  W16 =  Just (mo_WordTo16 dflags)
+vecElemInjectCast dflags WordVec  W32 =  Just (mo_WordTo32 dflags)
+vecElemInjectCast _      WordVec  W64 =  Nothing
+vecElemInjectCast _      _        _   =  Nothing
+
+vecElemProjectCast :: DynFlags -> PrimOpVecCat -> Width -> Maybe MachOp
+vecElemProjectCast _      FloatVec _   =  Nothing
+vecElemProjectCast dflags IntVec   W8  =  Just (mo_s_8ToWord  dflags)
+vecElemProjectCast dflags IntVec   W16 =  Just (mo_s_16ToWord dflags)
+vecElemProjectCast dflags IntVec   W32 =  Just (mo_s_32ToWord dflags)
+vecElemProjectCast _      IntVec   W64 =  Nothing
+vecElemProjectCast dflags WordVec  W8  =  Just (mo_u_8ToWord  dflags)
+vecElemProjectCast dflags WordVec  W16 =  Just (mo_u_16ToWord dflags)
+vecElemProjectCast dflags WordVec  W32 =  Just (mo_u_32ToWord dflags)
+vecElemProjectCast _      WordVec  W64 =  Nothing
+vecElemProjectCast _      _        _   =  Nothing
+
+-- Check to make sure that we can generate code for the specified vector type
+-- given the current set of dynamic flags.
+checkVecCompatibility :: DynFlags -> PrimOpVecCat -> Length -> Width -> FCode ()
+checkVecCompatibility dflags vcat l w = do
+    when (hscTarget dflags /= HscLlvm) $ do
+        sorry $ unlines ["SIMD vector instructions require the LLVM back-end."
+                         ,"Please use -fllvm."]
+    check vecWidth vcat l w
+  where
+    check :: Width -> PrimOpVecCat -> Length -> Width -> FCode ()
+    check W128 FloatVec 4 W32 | not (isSseEnabled dflags) =
+        sorry $ "128-bit wide single-precision floating point " ++
+                "SIMD vector instructions require at least -msse."
+    check W128 _ _ _ | not (isSse2Enabled dflags) =
+        sorry $ "128-bit wide integer and double precision " ++
+                "SIMD vector instructions require at least -msse2."
+    check W256 FloatVec _ _ | not (isAvxEnabled dflags) =
+        sorry $ "256-bit wide floating point " ++
+                "SIMD vector instructions require at least -mavx."
+    check W256 _ _ _ | not (isAvx2Enabled dflags) =
+        sorry $ "256-bit wide integer " ++
+                "SIMD vector instructions require at least -mavx2."
+    check W512 _ _ _ | not (isAvx512fEnabled dflags) =
+        sorry $ "512-bit wide " ++
+                "SIMD vector instructions require -mavx512f."
+    check _ _ _ _ = return ()
+
+    vecWidth = typeWidth (vecVmmType vcat l w)
+
+------------------------------------------------------------------------------
 -- Helpers for translating vector packing and unpacking.
 
 doVecPackOp :: Maybe MachOp  -- Cast from element to vector component
diff --git a/compiler/ghc.mk b/compiler/ghc.mk
index 4fdadd7c30..5b9610103b 100644
--- a/compiler/ghc.mk
+++ b/compiler/ghc.mk
@@ -250,8 +250,12 @@ PRIMOP_BITS_NAMES = primop-data-decl.hs-incl        \
                     primop-code-size.hs-incl        \
                     primop-can-fail.hs-incl         \
                     primop-strictness.hs-incl       \
-                    primop-fixity.hs-incl       \
-                    primop-primop-info.hs-incl
+                    primop-fixity.hs-incl           \
+                    primop-primop-info.hs-incl      \
+                    primop-vector-uniques.hs-incl   \
+                    primop-vector-tys.hs-incl       \
+                    primop-vector-tys-exports.hs-incl \
+                    primop-vector-tycons.hs-incl
 
 PRIMOP_BITS_STAGE1 = $(addprefix compiler/stage1/build/,$(PRIMOP_BITS_NAMES))
 PRIMOP_BITS_STAGE2 = $(addprefix compiler/stage2/build/,$(PRIMOP_BITS_NAMES))
@@ -290,6 +294,14 @@ compiler/stage$1/build/primop-fixity.hs-incl: compiler/stage$1/build/primops.txt
 	"$$(genprimopcode_INPLACE)" --fixity             < $$< > $$@
 compiler/stage$1/build/primop-primop-info.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
 	"$$(genprimopcode_INPLACE)" --primop-primop-info < $$< > $$@
+compiler/stage$1/build/primop-vector-uniques.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
+	"$$(genprimopcode_INPLACE)" --primop-vector-uniques     < $$< > $$@
+compiler/stage$1/build/primop-vector-tys.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
+	"$$(genprimopcode_INPLACE)" --primop-vector-tys         < $$< > $$@
+compiler/stage$1/build/primop-vector-tys-exports.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
+	"$$(genprimopcode_INPLACE)" --primop-vector-tys-exports < $$< > $$@
+compiler/stage$1/build/primop-vector-tycons.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
+	"$$(genprimopcode_INPLACE)" --primop-vector-tycons      < $$< > $$@
 
 # Usages aren't used any more; but the generator
 # can still generate them if we want them back
diff --git a/compiler/ghci/ByteCodeAsm.lhs b/compiler/ghci/ByteCodeAsm.lhs
index dd8bbe4c83..7e5ef354a9 100644
--- a/compiler/ghci/ByteCodeAsm.lhs
+++ b/compiler/ghci/ByteCodeAsm.lhs
@@ -462,6 +462,8 @@ push_alts L   = bci_PUSH_ALTS_L
 push_alts F   = bci_PUSH_ALTS_F
 push_alts D   = bci_PUSH_ALTS_D
 push_alts V16 = error "push_alts: vector"
+push_alts V32 = error "push_alts: vector"
+push_alts V64 = error "push_alts: vector"
 
 return_ubx :: ArgRep -> Word16
 return_ubx V   = bci_RETURN_V
@@ -471,6 +473,8 @@ return_ubx L   = bci_RETURN_L
 return_ubx F   = bci_RETURN_F
 return_ubx D   = bci_RETURN_D
 return_ubx V16 = error "return_ubx: vector"
+return_ubx V32 = error "return_ubx: vector"
+return_ubx V64 = error "return_ubx: vector"
 
 -- Make lists of host-sized words for literals, so that when the
 -- words are placed in memory at increasing addresses, the
diff --git a/compiler/llvmGen/LlvmCodeGen/Base.hs b/compiler/llvmGen/LlvmCodeGen/Base.hs
index 6ae3c4252d..fb9668b5ee 100644
--- a/compiler/llvmGen/LlvmCodeGen/Base.hs
+++ b/compiler/llvmGen/LlvmCodeGen/Base.hs
@@ -155,6 +155,8 @@ llvmFunArgs dflags live =
           isSSE (FloatReg _)  = True
           isSSE (DoubleReg _) = True
           isSSE (XmmReg _)    = True
+          isSSE (YmmReg _)    = True
+          isSSE (ZmmReg _)    = True
           isSSE _             = False
 
 -- | Llvm standard fun attributes
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index 33107c0b68..5002b89b72 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -967,6 +967,9 @@ genMachOp _ op [x] = case op of
 
     MO_VS_Quot    _ _ -> panicOp
     MO_VS_Rem     _ _ -> panicOp
+
+    MO_VU_Quot    _ _ -> panicOp
+    MO_VU_Rem     _ _ -> panicOp
  
     MO_VF_Insert  _ _ -> panicOp
     MO_VF_Extract _ _ -> panicOp
@@ -1140,6 +1143,9 @@ genMachOp_slow opt op [x, y] = case op of
 
     MO_VS_Quot l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_SDiv
     MO_VS_Rem  l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_SRem
+
+    MO_VU_Quot l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_UDiv
+    MO_VU_Rem  l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_URem
  
     MO_VF_Add  l w -> genCastBinMach (LMVector l (widthToLlvmFloat w)) LM_MO_FAdd
     MO_VF_Sub  l w -> genCastBinMach (LMVector l (widthToLlvmFloat w)) LM_MO_FSub
@@ -1527,6 +1533,8 @@ funEpilogue live = do
         isSSE (FloatReg _)  = True
         isSSE (DoubleReg _) = True
         isSSE (XmmReg _)    = True
+        isSSE (YmmReg _)    = True
+        isSSE (ZmmReg _)    = True
         isSSE _             = False
 
     -- Set to value or "undef" depending on whether the register is
diff --git a/compiler/llvmGen/LlvmCodeGen/Regs.hs b/compiler/llvmGen/LlvmCodeGen/Regs.hs
index 1b87929499..9f20aa5de5 100644
--- a/compiler/llvmGen/LlvmCodeGen/Regs.hs
+++ b/compiler/llvmGen/LlvmCodeGen/Regs.hs
@@ -62,6 +62,18 @@ lmGlobalReg dflags suf reg
         XmmReg 4       -> xmmGlobal $ "XMM4" ++ suf
         XmmReg 5       -> xmmGlobal $ "XMM5" ++ suf
         XmmReg 6       -> xmmGlobal $ "XMM6" ++ suf
+        YmmReg 1       -> ymmGlobal $ "YMM1" ++ suf
+        YmmReg 2       -> ymmGlobal $ "YMM2" ++ suf
+        YmmReg 3       -> ymmGlobal $ "YMM3" ++ suf
+        YmmReg 4       -> ymmGlobal $ "YMM4" ++ suf
+        YmmReg 5       -> ymmGlobal $ "YMM5" ++ suf
+        YmmReg 6       -> ymmGlobal $ "YMM6" ++ suf
+        ZmmReg 1       -> zmmGlobal $ "ZMM1" ++ suf
+        ZmmReg 2       -> zmmGlobal $ "ZMM2" ++ suf
+        ZmmReg 3       -> zmmGlobal $ "ZMM3" ++ suf
+        ZmmReg 4       -> zmmGlobal $ "ZMM4" ++ suf
+        ZmmReg 5       -> zmmGlobal $ "ZMM5" ++ suf
+        ZmmReg 6       -> zmmGlobal $ "ZMM6" ++ suf
         _other         -> panic $ "LlvmCodeGen.Reg: GlobalReg (" ++ (show reg)
                                 ++ ") not supported!"
         -- LongReg, HpLim, CCSS, CurrentTSO, CurrentNusery, HpAlloc
@@ -72,6 +84,8 @@ lmGlobalReg dflags suf reg
         floatGlobal  name = LMNLocalVar (fsLit name) LMFloat
         doubleGlobal name = LMNLocalVar (fsLit name) LMDouble
         xmmGlobal    name = LMNLocalVar (fsLit name) (LMVector 4 (LMInt 32))
+        ymmGlobal    name = LMNLocalVar (fsLit name) (LMVector 8 (LMInt 32))
+        zmmGlobal    name = LMNLocalVar (fsLit name) (LMVector 16 (LMInt 32))
 
 -- | A list of STG Registers that should always be considered alive
 alwaysLive :: [GlobalReg]
diff --git a/compiler/llvmGen/LlvmMangler.hs b/compiler/llvmGen/LlvmMangler.hs
index 83a2be7f8d..5f74dc4564 100644
--- a/compiler/llvmGen/LlvmMangler.hs
+++ b/compiler/llvmGen/LlvmMangler.hs
@@ -20,6 +20,10 @@ import System.IO
 import Data.List ( sortBy )
 import Data.Function ( on )
 
+#if x86_64_TARGET_ARCH
+#define REWRITE_AVX
+#endif
+
 -- Magic Strings
 secStmt, infoSec, newLine, textStmt, dataStmt, syntaxUnified :: B.ByteString
 secStmt       = B.pack "\t.section\t"
@@ -47,7 +51,7 @@ llvmFixupAsm dflags f1 f2 = {-# SCC "llvm_mangler" #-} do
     w <- openBinaryFile f2 WriteMode
     ss <- readSections r w
     hClose r
-    let fixed = fixTables ss
+    let fixed = (map rewriteAVX . fixTables) ss
     mapM_ (writeSection w) fixed
     hClose w
     return ()
@@ -90,6 +94,39 @@ writeSection w (hdr, cts) = do
     B.hPutStrLn w hdr
   B.hPutStrLn w cts
 
+#if REWRITE_AVX
+rewriteAVX :: Section -> Section
+rewriteAVX = rewriteVmovaps . rewriteVmovdqa
+
+rewriteVmovdqa :: Section -> Section
+rewriteVmovdqa = rewriteInstructions vmovdqa vmovdqu
+  where
+    vmovdqa, vmovdqu :: B.ByteString
+    vmovdqa = B.pack "vmovdqa"
+    vmovdqu = B.pack "vmovdqu"
+
+rewriteVmovap :: Section -> Section
+rewriteVmovap = rewriteInstructions vmovap vmovup
+  where
+    vmovap, vmovup :: B.ByteString
+    vmovap = B.pack "vmovap"
+    vmovup = B.pack "vmovup"
+
+rewriteInstructions :: B.ByteString -> B.ByteString -> Section -> Section
+rewriteInstructions matchBS replaceBS (hdr, cts) =
+    (hdr, loop cts)
+  where
+    loop :: B.ByteString -> B.ByteString
+    loop cts =
+        case B.breakSubstring cts matchBS of
+          (hd,tl) | B.null tl -> hd
+                  | otherwise -> hd `B.append` replaceBS `B.append`
+                                 loop (B.drop (B.length matchBS) tl)
+#else /* !REWRITE_AVX */
+rewriteAVX :: Section -> Section
+rewriteAVX = id
+#endif /* !REWRITE_SSE */
+
 -- | Reorder and convert sections so info tables end up next to the
 -- code. Also does stack fixups.
 fixTables :: [Section] -> [Section]
diff --git a/compiler/main/CmdLineParser.hs b/compiler/main/CmdLineParser.hs
index 6681186246..fef2701bcd 100644
--- a/compiler/main/CmdLineParser.hs
+++ b/compiler/main/CmdLineParser.hs
@@ -220,7 +220,7 @@ processOneArg opt_kind rest arg args
                           Just min <- parseInt min_s -> Right (f maj min, args)
                         | [maj_s] <- split '.' rest_no_eq,
                           Just maj <- parseInt maj_s -> Right (f maj 0, args)
-                        | otherwise -> Left ("malformed version argument in " ++ dash_arg)
+                        | otherwise -> Right (f 1 0, args)
 
 
 findArg :: [Flag m] -> String -> Maybe (String, OptKind m)
diff --git a/compiler/main/DriverPipeline.hs b/compiler/main/DriverPipeline.hs
index 035d5778d6..44a6fa57a1 100644
--- a/compiler/main/DriverPipeline.hs
+++ b/compiler/main/DriverPipeline.hs
@@ -1380,7 +1380,10 @@ runPhase (RealPhase LlvmLlc) input_fn dflags
                 ++ [SysTools.Option tbaa]
                 ++ map SysTools.Option fpOpts
                 ++ map SysTools.Option abiOpts
-                ++ map SysTools.Option sseOpts)
+                ++ map SysTools.Option sseOpts
+                ++ map SysTools.Option avxOpts
+                ++ map SysTools.Option avx512Opts
+                ++ map SysTools.Option stackAlignOpts)
 
     return (RealPhase next_phase, output_fn)
   where
@@ -1411,8 +1414,24 @@ runPhase (RealPhase LlvmLlc) input_fn dflags
 
         sseOpts | isSse4_2Enabled dflags = ["-mattr=+sse42"]
                 | isSse2Enabled dflags   = ["-mattr=+sse2"]
+                | isSseEnabled dflags    = ["-mattr=+sse"]
                 | otherwise              = []
 
+        avxOpts | isAvx512fEnabled dflags = ["-mattr=+avx512f"]
+                | isAvx2Enabled dflags    = ["-mattr=+avx2"]
+                | isAvxEnabled dflags     = ["-mattr=+avx"]
+                | otherwise               = []
+
+        avx512Opts =
+          [ "-mattr=+avx512cd" | isAvx512cdEnabled dflags ] ++
+          [ "-mattr=+avx512er" | isAvx512erEnabled dflags ] ++
+          [ "-mattr=+avx512pf" | isAvx512pfEnabled dflags ]
+
+        stackAlignOpts =
+            case platformArch (targetPlatform dflags) of
+              ArchX86_64 | isAvxEnabled dflags -> ["-stack-alignment=32"]
+              _                                -> []
+
 -----------------------------------------------------------------------------
 -- LlvmMangle phase
 
@@ -2015,12 +2034,18 @@ doCpp dflags raw input_fn output_fn = do
         -- remember, in code we *compile*, the HOST is the same our TARGET,
         -- and BUILD is the same as our HOST.
 
-    let sse2 = isSse2Enabled dflags
-        sse4_2 = isSse4_2Enabled dflags
-        sse_defs =
-          [ "-D__SSE__=1" | sse2 || sse4_2 ] ++
-          [ "-D__SSE2__=1" | sse2 || sse4_2 ] ++
-          [ "-D__SSE4_2__=1" | sse4_2 ]
+    let sse_defs =
+          [ "-D__SSE__=1"    | isSseEnabled    dflags ] ++
+          [ "-D__SSE2__=1"   | isSse2Enabled   dflags ] ++
+          [ "-D__SSE4_2__=1" | isSse4_2Enabled dflags ]
+
+    let avx_defs =
+          [ "-D__AVX__=1"  | isAvxEnabled  dflags ] ++
+          [ "-D__AVX2__=1" | isAvx2Enabled dflags ] ++
+          [ "-D__AVX512CD__=1" | isAvx512cdEnabled dflags ] ++
+          [ "-D__AVX512ER__=1" | isAvx512erEnabled dflags ] ++
+          [ "-D__AVX512F__=1"  | isAvx512fEnabled  dflags ] ++
+          [ "-D__AVX512PF__=1" | isAvx512pfEnabled dflags ]
 
     backend_defs <- getBackendDefs dflags
 
@@ -2031,6 +2056,7 @@ doCpp dflags raw input_fn output_fn = do
                     ++ map SysTools.Option backend_defs
                     ++ map SysTools.Option hscpp_opts
                     ++ map SysTools.Option sse_defs
+                    ++ map SysTools.Option avx_defs
         -- Set the language mode to assembler-with-cpp when preprocessing. This
         -- alleviates some of the C99 macro rules relating to whitespace and the hash
         -- operator, which we tend to abuse. Clang in particular is not very happy
diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs
index 63da5d6100..d6b386a475 100644
--- a/compiler/main/DynFlags.hs
+++ b/compiler/main/DynFlags.hs
@@ -128,9 +128,16 @@ module DynFlags (
 
         unsafeGlobalDynFlags, setUnsafeGlobalDynFlags,
 
-        -- * SSE
+        -- * SSE and AVX
+        isSseEnabled,
         isSse2Enabled,
         isSse4_2Enabled,
+        isAvxEnabled,
+        isAvx2Enabled,
+        isAvx512cdEnabled,
+        isAvx512erEnabled,
+        isAvx512fEnabled,
+        isAvx512pfEnabled,
 
         -- * Linker information
         LinkerInfo(..),
@@ -303,6 +310,7 @@ data GeneralFlag
    | Opt_RegsIterative                  -- do iterative coalescing graph coloring register allocation
    | Opt_PedanticBottoms                -- Be picky about how we treat bottom
    | Opt_LlvmTBAA                       -- Use LLVM TBAA infastructure for improving AA (hidden flag)
+   | Opt_LlvmPassVectorsInRegisters     -- Pass SIMD vectors in registers (requires a patched LLVM) (hidden flag)
    | Opt_IrrefutableTuples
    | Opt_CmmSink
    | Opt_CmmElimCommonBlocks
@@ -770,6 +778,12 @@ data DynFlags = DynFlags {
 
   -- | Machine dependant flags (-m<blah> stuff)
   sseVersion            :: Maybe (Int, Int),  -- (major, minor)
+  avx                   :: Bool,
+  avx2                  :: Bool,
+  avx512cd              :: Bool, -- Enable AVX-512 Conflict Detection Instructions.
+  avx512er              :: Bool, -- Enable AVX-512 Exponential and Reciprocal Instructions.
+  avx512f               :: Bool, -- Enable AVX-512 instructions.
+  avx512pf              :: Bool, -- Enable AVX-512 PreFetch Instructions.
 
   -- | Run-time linker information (what options we need, etc.)
   rtldFlags             :: IORef (Maybe LinkerInfo)
@@ -1401,6 +1415,12 @@ defaultDynFlags mySettings =
         interactivePrint = Nothing,
         nextWrapperNum = panic "defaultDynFlags: No nextWrapperNum",
         sseVersion = Nothing,
+        avx = False,
+        avx2 = False,
+        avx512cd = False,
+        avx512er = False,
+        avx512f = False,
+        avx512pf = False,
         rtldFlags = panic "defaultDynFlags: no rtldFlags"
       }
 
@@ -2305,6 +2325,12 @@ dynamic_flags = [
   , Flag "monly-3-regs" (NoArg (addWarn "The -monly-3-regs flag does nothing; it will be removed in a future GHC release"))
   , Flag "monly-4-regs" (NoArg (addWarn "The -monly-4-regs flag does nothing; it will be removed in a future GHC release"))
   , Flag "msse"         (versionSuffix (\maj min d -> d{ sseVersion = Just (maj, min) }))
+  , Flag "mavx"         (noArg (\d -> d{ avx = True }))
+  , Flag "mavx2"        (noArg (\d -> d{ avx2 = True }))
+  , Flag "mavx512cd"    (noArg (\d -> d{ avx512cd = True }))
+  , Flag "mavx512er"    (noArg (\d -> d{ avx512er = True }))
+  , Flag "mavx512f"     (noArg (\d -> d{ avx512f = True }))
+  , Flag "mavx512pf"    (noArg (\d -> d{ avx512pf = True }))
 
      ------ Warning opts -------------------------------------------------
   , Flag "W"      (NoArg (mapM_ setWarningFlag minusWOpts))
@@ -2587,6 +2613,7 @@ fFlags = [
   ( "regs-graph",                       Opt_RegsGraph, nop ),
   ( "regs-iterative",                   Opt_RegsIterative, nop ),
   ( "llvm-tbaa",                        Opt_LlvmTBAA, nop), -- hidden flag
+  ( "llvm-pass-vectors-in-regs",        Opt_LlvmPassVectorsInRegisters, nop), -- hidden flag
   ( "irrefutable-tuples",               Opt_IrrefutableTuples, nop ),
   ( "cmm-sink",                         Opt_CmmSink, nop ),
   ( "cmm-elim-common-blocks",           Opt_CmmElimCommonBlocks, nop ),
@@ -3585,12 +3612,18 @@ setUnsafeGlobalDynFlags :: DynFlags -> IO ()
 setUnsafeGlobalDynFlags = writeIORef v_unsafeGlobalDynFlags
 
 -- -----------------------------------------------------------------------------
--- SSE
+-- SSE and AVX
 
 -- TODO: Instead of using a separate predicate (i.e. isSse2Enabled) to
 -- check if SSE is enabled, we might have x86-64 imply the -msse2
 -- flag.
 
+isSseEnabled :: DynFlags -> Bool
+isSseEnabled dflags = case platformArch (targetPlatform dflags) of
+    ArchX86_64 -> True
+    ArchX86    -> sseVersion dflags >= Just (1,0)
+    _          -> False
+
 isSse2Enabled :: DynFlags -> Bool
 isSse2Enabled dflags = case platformArch (targetPlatform dflags) of
     ArchX86_64 -> -- SSE2 is fixed on for x86_64.  It would be
@@ -3605,6 +3638,24 @@ isSse2Enabled dflags = case platformArch (targetPlatform dflags) of
 isSse4_2Enabled :: DynFlags -> Bool
 isSse4_2Enabled dflags = sseVersion dflags >= Just (4,2)
 
+isAvxEnabled :: DynFlags -> Bool
+isAvxEnabled dflags = avx dflags || avx2 dflags || avx512f dflags
+
+isAvx2Enabled :: DynFlags -> Bool
+isAvx2Enabled dflags = avx2 dflags || avx512f dflags
+
+isAvx512cdEnabled :: DynFlags -> Bool
+isAvx512cdEnabled dflags = avx512cd dflags
+
+isAvx512erEnabled :: DynFlags -> Bool
+isAvx512erEnabled dflags = avx512er dflags
+
+isAvx512fEnabled :: DynFlags -> Bool
+isAvx512fEnabled dflags = avx512f dflags
+
+isAvx512pfEnabled :: DynFlags -> Bool
+isAvx512pfEnabled dflags = avx512pf dflags
+
 -- -----------------------------------------------------------------------------
 -- Linker information
 
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index f6143d3fb9..e18da25347 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -610,6 +610,8 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       MO_VS_Quot {}    -> needLlvm
       MO_VS_Rem {}     -> needLlvm
       MO_VS_Neg {}     -> needLlvm
+      MO_VU_Quot {}    -> needLlvm
+      MO_VU_Rem {}     -> needLlvm
       MO_VF_Insert {}  -> needLlvm
       MO_VF_Extract {} -> needLlvm
       MO_VF_Add {}     -> needLlvm
diff --git a/compiler/prelude/PrelNames.lhs b/compiler/prelude/PrelNames.lhs
index 81fb9be52a..07730e653d 100644
--- a/compiler/prelude/PrelNames.lhs
+++ b/compiler/prelude/PrelNames.lhs
@@ -1474,15 +1474,6 @@ typeNatMulTyFamNameKey    = mkPreludeTyConUnique 163
 typeNatExpTyFamNameKey    = mkPreludeTyConUnique 164
 typeNatLeqTyFamNameKey    = mkPreludeTyConUnique 165
 
--- SIMD vector types (Unique keys)
-floatX4PrimTyConKey, doubleX2PrimTyConKey, int32X4PrimTyConKey,
-  int64X2PrimTyConKey :: Unique
-
-floatX4PrimTyConKey  = mkPreludeTyConUnique 170
-doubleX2PrimTyConKey = mkPreludeTyConUnique 171
-int32X4PrimTyConKey  = mkPreludeTyConUnique 172
-int64X2PrimTyConKey  = mkPreludeTyConUnique 173
-
 ntTyConKey:: Unique
 ntTyConKey = mkPreludeTyConUnique 174
 coercibleTyConKey :: Unique
@@ -1492,6 +1483,12 @@ coercibleTyConKey = mkPreludeTyConUnique 175
 --      USES TyConUniques 200-299
 -----------------------------------------------------
 
+----------------------- SIMD ------------------------
+--      USES TyConUniques 300-399
+-----------------------------------------------------
+
+#include "primop-vector-uniques.hs-incl"
+
 unitTyConKey :: Unique
 unitTyConKey = mkTupleTyConUnique BoxedTuple 0
 \end{code}
diff --git a/compiler/prelude/PrimOp.lhs b/compiler/prelude/PrimOp.lhs
index 8b1970c37f..22753ee3ea 100644
--- a/compiler/prelude/PrimOp.lhs
+++ b/compiler/prelude/PrimOp.lhs
@@ -5,7 +5,7 @@
 
 \begin{code}
 module PrimOp (
-        PrimOp(..), allThePrimOps,
+        PrimOp(..), PrimOpVecCat(..), allThePrimOps,
         primOpType, primOpSig,
         primOpTag, maxPrimOpTag, primOpOcc,
 
@@ -25,6 +25,7 @@ module PrimOp (
 import TysPrim
 import TysWiredIn
 
+import CmmType
 import Demand
 import Var              ( TyVar )
 import OccName          ( OccName, pprOccName, mkVarOccFS )
@@ -64,6 +65,7 @@ primOpTag op = iBox (tagOf_PrimOp op)
 -- supplies
 -- tagOf_PrimOp :: PrimOp -> FastInt
 #include "primop-tag.hs-incl"
+tagOf_PrimOp _ = error "tagOf_PrimOp: unknown primop"
 
 
 instance Eq PrimOp where
@@ -82,6 +84,12 @@ instance Outputable PrimOp where
     ppr op = pprPrimOp op
 \end{code}
 
+\begin{code}
+data PrimOpVecCat = IntVec
+                  | WordVec
+                  | FloatVec
+\end{code}
+
 An @Enum@-derived list would be better; meanwhile... (ToDo)
 
 \begin{code}
@@ -173,6 +181,7 @@ else, notably a type, can be constructed) for each @PrimOp@.
 \begin{code}
 primOpInfo :: PrimOp -> PrimOpInfo
 #include "primop-primop-info.hs-incl"
+primOpInfo _ = error "primOpInfo: unknown primop"
 \end{code}
 
 Here are a load of comments from the old primOp info:
diff --git a/compiler/prelude/TysPrim.lhs b/compiler/prelude/TysPrim.lhs
index f166065b22..b17f1a6f9a 100644
--- a/compiler/prelude/TysPrim.lhs
+++ b/compiler/prelude/TysPrim.lhs
@@ -76,11 +76,8 @@ module TysPrim(
 	-- * Any
 	anyTy, anyTyCon, anyTypeOfKind,
 
-        -- * SIMD
-	floatX4PrimTyCon,		floatX4PrimTy,
-	doubleX2PrimTyCon,		doubleX2PrimTy,
-	int32X4PrimTyCon,		int32X4PrimTy,
-	int64X2PrimTyCon,		int64X2PrimTy
+	-- * SIMD
+#include "primop-vector-tys-exports.hs-incl"
   ) where
 
 #include "HsVersions.h"
@@ -144,10 +141,7 @@ primTyCons
     , superKindTyCon
     , anyKindTyCon
 
-    , floatX4PrimTyCon
-    , doubleX2PrimTyCon
-    , int32X4PrimTyCon
-    , int64X2PrimTyCon
+#include "primop-vector-tycons.hs-incl"
     ]
 
 mkPrimTc :: FastString -> Unique -> TyCon -> Name
@@ -157,7 +151,7 @@ mkPrimTc fs unique tycon
 		  (ATyCon tycon)	-- Relevant TyCon
 		  UserSyntax		-- None are built-in syntax
 
-charPrimTyConName, intPrimTyConName, int32PrimTyConName, int64PrimTyConName, wordPrimTyConName, word32PrimTyConName, word64PrimTyConName, addrPrimTyConName, floatPrimTyConName, doublePrimTyConName, statePrimTyConName, realWorldTyConName, arrayPrimTyConName, arrayArrayPrimTyConName, byteArrayPrimTyConName, mutableArrayPrimTyConName, mutableByteArrayPrimTyConName, mutableArrayArrayPrimTyConName, mutVarPrimTyConName, mVarPrimTyConName, tVarPrimTyConName, stablePtrPrimTyConName, stableNamePrimTyConName, bcoPrimTyConName, weakPrimTyConName, threadIdPrimTyConName, eqPrimTyConName, eqReprPrimTyConName, floatX4PrimTyConName, doubleX2PrimTyConName, int32X4PrimTyConName, int64X2PrimTyConName :: Name
+charPrimTyConName, intPrimTyConName, int32PrimTyConName, int64PrimTyConName, wordPrimTyConName, word32PrimTyConName, word64PrimTyConName, addrPrimTyConName, floatPrimTyConName, doublePrimTyConName, statePrimTyConName, realWorldTyConName, arrayPrimTyConName, arrayArrayPrimTyConName, byteArrayPrimTyConName, mutableArrayPrimTyConName, mutableByteArrayPrimTyConName, mutableArrayArrayPrimTyConName, mutVarPrimTyConName, mVarPrimTyConName, tVarPrimTyConName, stablePtrPrimTyConName, stableNamePrimTyConName, bcoPrimTyConName, weakPrimTyConName, threadIdPrimTyConName, eqPrimTyConName, eqReprPrimTyConName :: Name
 charPrimTyConName    	      = mkPrimTc (fsLit "Char#") charPrimTyConKey charPrimTyCon
 intPrimTyConName     	      = mkPrimTc (fsLit "Int#") intPrimTyConKey  intPrimTyCon
 int32PrimTyConName	      = mkPrimTc (fsLit "Int32#") int32PrimTyConKey int32PrimTyCon
@@ -186,10 +180,6 @@ stableNamePrimTyConName       = mkPrimTc (fsLit "StableName#") stableNamePrimTyC
 bcoPrimTyConName 	      = mkPrimTc (fsLit "BCO#") bcoPrimTyConKey bcoPrimTyCon
 weakPrimTyConName  	      = mkPrimTc (fsLit "Weak#") weakPrimTyConKey weakPrimTyCon
 threadIdPrimTyConName  	      = mkPrimTc (fsLit "ThreadId#") threadIdPrimTyConKey threadIdPrimTyCon
-floatX4PrimTyConName          = mkPrimTc (fsLit "FloatX4#") floatX4PrimTyConKey floatX4PrimTyCon
-doubleX2PrimTyConName         = mkPrimTc (fsLit "DoubleX2#") doubleX2PrimTyConKey doubleX2PrimTyCon
-int32X4PrimTyConName          = mkPrimTc (fsLit "Int32X4#") int32X4PrimTyConKey int32X4PrimTyCon
-int64X2PrimTyConName          = mkPrimTc (fsLit "Int64X2#") int64X2PrimTyConKey int64X2PrimTyCon
 \end{code}
 
 %************************************************************************
@@ -766,28 +756,10 @@ anyTypeOfKind kind = TyConApp anyTyCon [kind]
 
 %************************************************************************
 %*									*
-\subsection{SIMD vector type}
+\subsection{SIMD vector types}
 %*									*
 %************************************************************************
 
 \begin{code}
-floatX4PrimTy :: Type
-floatX4PrimTy = mkTyConTy floatX4PrimTyCon
-floatX4PrimTyCon :: TyCon
-floatX4PrimTyCon = pcPrimTyCon0 floatX4PrimTyConName (VecRep 4 FloatElemRep)
-
-doubleX2PrimTy :: Type
-doubleX2PrimTy = mkTyConTy doubleX2PrimTyCon
-doubleX2PrimTyCon :: TyCon
-doubleX2PrimTyCon = pcPrimTyCon0 doubleX2PrimTyConName (VecRep 2 DoubleElemRep)
-
-int32X4PrimTy :: Type
-int32X4PrimTy = mkTyConTy int32X4PrimTyCon
-int32X4PrimTyCon :: TyCon
-int32X4PrimTyCon = pcPrimTyCon0 int32X4PrimTyConName (VecRep 4 Int32ElemRep)
-
-int64X2PrimTy :: Type
-int64X2PrimTy = mkTyConTy int64X2PrimTyCon
-int64X2PrimTyCon :: TyCon
-int64X2PrimTyCon = pcPrimTyCon0 int64X2PrimTyConName (VecRep 2 Int64ElemRep)
+#include "primop-vector-tys.hs-incl"
 \end{code}
diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp
index cfd6afa4c6..78d8925079 100644
--- a/compiler/prelude/primops.txt.pp
+++ b/compiler/prelude/primops.txt.pp
@@ -39,6 +39,22 @@
 -- (eg, out_of_line), whilst avoiding parsing complex expressions
 -- needed for strictness info.
 
+-- The vector attribute is rather special. It takes a list of 3-tuples, each of
+-- which is of the form <ELEM_TYPE,SCALAR_TYPE,LENGTH>. ELEM_TYPE is the type of
+-- the elements in the vector; LENGTH is the length of the vector; and
+-- SCALAR_TYPE is the scalar type used to inject to/project from vector
+-- element. Note that ELEM_TYPE and SCALAR_TYPE are not the same; for example,
+-- to broadcast a scalar value to a vector whose elements are of type Int8, we
+-- use an Int#.
+
+-- When a primtype or primop has a vector attribute, it is instantiated at each
+-- 3-tuple in the list of 3-tuples. That is, the vector attribute allows us to
+-- define a family of types or primops. Vector support also adds three new
+-- keywords: VECTOR, SCALAR, and VECTUPLE. These keywords are expanded to types
+-- derived from the 3-tuple. For the 3-tuple <Int64,INT64,2>, VECTOR expands to
+-- Int64X2#, SCALAR expands to INT64, and VECTUPLE expands to (# INT64, INT64
+-- #).
+
 defaults
    has_side_effects = False
    out_of_line      = False   -- See Note Note [PrimOp can_fail and has_side_effects] in PrimOp
@@ -48,6 +64,7 @@ defaults
    strictness       = { \ arity -> mkStrictSig (mkTopDmdType (replicate arity topDmd) topRes) }
    fixity           = Nothing
    llvm_only        = False
+   vector           = []
 
 -- Currently, documentation is produced using latex, so contents of
 -- description fields should be legal latex. Descriptions can contain
@@ -2373,479 +2390,210 @@ primclass Coercible a b
    }
 
 ------------------------------------------------------------------------
-section "Float SIMD Vectors"
-	{Operations on SIMD vectors of 4 single-precision (32-bit)
-         floating-point numbers.}
-------------------------------------------------------------------------
-
-primtype FloatX4#
+section "SIMD Vectors" 
+	{Operations on SIMD vectors.}
+------------------------------------------------------------------------
+
+#define ALL_VECTOR_TYPES \
+  [<Int8,Int#,16>,<Int16,Int#,8>,<Int32,INT32,4>,<Int64,INT64,2> \
+  ,<Int8,Int#,32>,<Int16,Int#,16>,<Int32,INT32,8>,<Int64,INT64,4> \
+  ,<Int8,Int#,64>,<Int16,Int#,32>,<Int32,INT32,16>,<Int64,INT64,8> \
+  ,<Word8,Word#,16>,<Word16,Word#,8>,<Word32,WORD32,4>,<Word64,WORD64,2> \
+  ,<Word8,Word#,32>,<Word16,Word#,16>,<Word32,WORD32,8>,<Word64,WORD64,4> \
+  ,<Word8,Word#,64>,<Word16,Word#,32>,<Word32,WORD32,16>,<Word64,WORD64,8> \
+  ,<Float,Float#,4>,<Double,Double#,2> \
+  ,<Float,Float#,8>,<Double,Double#,4> \
+  ,<Float,Float#,16>,<Double,Double#,8>]
+
+#define SIGNED_VECTOR_TYPES \
+  [<Int8,Int#,16>,<Int16,Int#,8>,<Int32,INT32,4>,<Int64,INT64,2> \
+  ,<Int8,Int#,32>,<Int16,Int#,16>,<Int32,INT32,8>,<Int64,INT64,4> \
+  ,<Int8,Int#,64>,<Int16,Int#,32>,<Int32,INT32,16>,<Int64,INT64,8> \
+  ,<Float,Float#,4>,<Double,Double#,2> \
+  ,<Float,Float#,8>,<Double,Double#,4> \
+  ,<Float,Float#,16>,<Double,Double#,8>]
+
+#define FLOAT_VECTOR_TYPES \
+  [<Float,Float#,4>,<Double,Double#,2> \
+  ,<Float,Float#,8>,<Double,Double#,4> \
+  ,<Float,Float#,16>,<Double,Double#,8>]
+
+#define INT_VECTOR_TYPES \
+  [<Int8,Int#,16>,<Int16,Int#,8>,<Int32,INT32,4>,<Int64,INT64,2> \
+  ,<Int8,Int#,32>,<Int16,Int#,16>,<Int32,INT32,8>,<Int64,INT64,4> \
+  ,<Int8,Int#,64>,<Int16,Int#,32>,<Int32,INT32,16>,<Int64,INT64,8> \
+  ,<Word8,Word#,16>,<Word16,Word#,8>,<Word32,WORD32,4>,<Word64,WORD64,2> \
+  ,<Word8,Word#,32>,<Word16,Word#,16>,<Word32,WORD32,8>,<Word64,WORD64,4> \
+  ,<Word8,Word#,64>,<Word16,Word#,32>,<Word32,WORD32,16>,<Word64,WORD64,8>]
+
+primtype VECTOR
    with llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop FloatToFloatX4Op "floatToFloatX4#" GenPrimOp
-   Float# -> FloatX4#
+primop VecBroadcastOp "broadcast#" GenPrimOp
+   SCALAR -> VECTOR
+   { Broadcast a scalar to all elements of a vector. }
    with llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop FloatX4PackOp "packFloatX4#" GenPrimOp
-   Float# -> Float# -> Float# -> Float# -> FloatX4#
+primop VecPackOp "pack#" GenPrimOp
+   VECTUPLE -> VECTOR
+   { Pack the elements of an unboxed tuple into a vector. }
    with llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop FloatX4UnpackOp "unpackFloatX4#" GenPrimOp
-   FloatX4# -> (# Float#, Float#, Float#, Float# #)
+primop VecUnpackOp "unpack#" GenPrimOp
+   VECTOR -> VECTUPLE
+   { Unpack the elements of a vector into an unboxed tuple. #}
    with llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop FloatX4InsertOp "insertFloatX4#" GenPrimOp
-   FloatX4# -> Float# -> Int# -> FloatX4#
+primop VecInsertOp "insert#" GenPrimOp
+   VECTOR -> SCALAR -> Int# -> VECTOR
+   { Insert a scalar at the given position in a vector. }
    with can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop FloatX4AddOp "plusFloatX4#" Dyadic
-   FloatX4# -> FloatX4# -> FloatX4#
+primop VecAddOp "plus#" Dyadic
+   VECTOR -> VECTOR -> VECTOR
+   { Add two vectors element-wise. }
    with commutable = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop FloatX4SubOp "minusFloatX4#" Dyadic
-  FloatX4# -> FloatX4# -> FloatX4#
+primop VecSubOp "minus#" Dyadic
+   VECTOR -> VECTOR -> VECTOR
+   { Subtract two vectors element-wise. }
    with llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop FloatX4MulOp "timesFloatX4#" Dyadic
-   FloatX4# -> FloatX4# -> FloatX4#
+primop VecMulOp "times#" Dyadic
+   VECTOR -> VECTOR -> VECTOR
+   { Multiply two vectors element-wise. }
    with commutable = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop FloatX4DivOp "divideFloatX4#" Dyadic
-   FloatX4# -> FloatX4# -> FloatX4#
-   with can_fail = True
-        llvm_only = True
-
-primop FloatX4NegOp "negateFloatX4#" Monadic
-   FloatX4# -> FloatX4#
-   with llvm_only = True
-
-primop IndexByteArrayOp_FloatX4 "indexFloatX4Array#" GenPrimOp
-   ByteArray# -> Int# -> FloatX4#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadByteArrayOp_FloatX4 "readFloatX4Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, FloatX4# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop WriteByteArrayOp_FloatX4 "writeFloatX4Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> FloatX4# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexOffAddrOp_FloatX4 "indexFloatX4OffAddr#" GenPrimOp
-   Addr# -> Int# -> FloatX4#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadOffAddrOp_FloatX4 "readFloatX4OffAddr#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, FloatX4# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop  WriteOffAddrOp_FloatX4 "writeFloatX4OffAddr#" GenPrimOp
-   Addr# -> Int# -> FloatX4# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexByteArrayOp_FloatAsFloatX4 "indexFloatArrayAsFloatX4#" GenPrimOp
-   ByteArray# -> Int# -> FloatX4#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadByteArrayOp_FloatAsFloatX4 "readFloatArrayAsFloatX4#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, FloatX4# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop WriteByteArrayOp_FloatAsFloatX4 "writeFloatArrayAsFloatX4#" GenPrimOp
-   MutableByteArray# s -> Int# -> FloatX4# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexOffAddrOp_FloatAsFloatX4 "indexFloatOffAddrAsFloatX4#" GenPrimOp
-   Addr# -> Int# -> FloatX4#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadOffAddrOp_FloatAsFloatX4 "readFloatOffAddrAsFloatX4#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, FloatX4# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop  WriteOffAddrOp_FloatAsFloatX4 "writeFloatOffAddrAsFloatX4#" GenPrimOp
-   Addr# -> Int# -> FloatX4# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-------------------------------------------------------------------------
-section "Double SIMD Vectors"
-	{Operations on SIMD vectors of 2 double-precision (64-bit)
-         floating-point numbers.}
-------------------------------------------------------------------------
-
-primtype DoubleX2#
-   with llvm_only = True
-
-primop DoubleToDoubleX2Op "doubleToDoubleX2#" GenPrimOp
-   Double# -> DoubleX2#
-   with llvm_only = True
-
-primop DoubleX2InsertOp "insertDoubleX2#" GenPrimOp
-   DoubleX2# -> Double# -> Int# -> DoubleX2#
-   with can_fail = True
-        llvm_only = True
-
-primop DoubleX2PackOp "packDoubleX2#" GenPrimOp
-   Double# -> Double# -> DoubleX2#
-   with llvm_only = True
-
-primop DoubleX2UnpackOp "unpackDoubleX2#" GenPrimOp
-   DoubleX2# -> (# Double#, Double# #)
-   with llvm_only = True
-
-primop DoubleX2AddOp "plusDoubleX2#" Dyadic
-   DoubleX2# -> DoubleX2# -> DoubleX2#
-   with commutable = True
-        llvm_only = True
-
-primop DoubleX2SubOp "minusDoubleX2#" Dyadic
-  DoubleX2# -> DoubleX2# -> DoubleX2#
-   with llvm_only = True
-
-primop DoubleX2MulOp "timesDoubleX2#" Dyadic
-   DoubleX2# -> DoubleX2# -> DoubleX2#
-   with commutable = True
-        llvm_only = True
-
-primop DoubleX2DivOp "divideDoubleX2#" Dyadic
-   DoubleX2# -> DoubleX2# -> DoubleX2#
-   with can_fail = True
-        llvm_only = True
-
-primop DoubleX2NegOp "negateDoubleX2#" Monadic
-   DoubleX2# -> DoubleX2#
-   with llvm_only = True
-
-primop IndexByteArrayOp_DoubleX2 "indexDoubleX2Array#" GenPrimOp
-   ByteArray# -> Int# -> DoubleX2#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadByteArrayOp_DoubleX2 "readDoubleX2Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, DoubleX2# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop WriteByteArrayOp_DoubleX2 "writeDoubleX2Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> DoubleX2# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexOffAddrOp_DoubleX2 "indexDoubleX2OffAddr#" GenPrimOp
-   Addr# -> Int# -> DoubleX2#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadOffAddrOp_DoubleX2 "readDoubleX2OffAddr#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, DoubleX2# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop  WriteOffAddrOp_DoubleX2 "writeDoubleX2OffAddr#" GenPrimOp
-   Addr# -> Int# -> DoubleX2# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexByteArrayOp_DoubleAsDoubleX2 "indexDoubleArrayAsDoubleX2#" GenPrimOp
-   ByteArray# -> Int# -> DoubleX2#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadByteArrayOp_DoubleAsDoubleX2 "readDoubleArrayAsDoubleX2#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, DoubleX2# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop WriteByteArrayOp_DoubleAsDoubleX2 "writeDoubleArrayAsDoubleX2#" GenPrimOp
-   MutableByteArray# s -> Int# -> DoubleX2# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexOffAddrOp_DoubleAsDoubleX2 "indexDoubleOffAddrAsDoubleX2#" GenPrimOp
-   Addr# -> Int# -> DoubleX2#
+primop VecDivOp "divide#" Dyadic
+   VECTOR -> VECTOR -> VECTOR
+   { Divide two vectors element-wise. }
    with can_fail = True
         llvm_only = True
+   	vector = FLOAT_VECTOR_TYPES
 
-primop ReadOffAddrOp_DoubleAsDoubleX2 "readDoubleOffAddrAsDoubleX2#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, DoubleX2# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop  WriteOffAddrOp_DoubleAsDoubleX2 "writeDoubleOffAddrAsDoubleX2#" GenPrimOp
-   Addr# -> Int# -> DoubleX2# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-------------------------------------------------------------------------
-section "Int32 SIMD Vectors"
-	{Operations on SIMD vectors of 4 32-bit signed integers.}
-------------------------------------------------------------------------
-
-primtype Int32X4#
-   with llvm_only = True
-
-primop Int32ToInt32X4Op "int32ToInt32X4#" GenPrimOp
-   INT32 -> Int32X4#
-   with llvm_only = True
-
-primop Int32X4InsertOp "insertInt32X4#" GenPrimOp
-   Int32X4# -> INT32 -> Int# -> Int32X4#
-   with can_fail = True
-        llvm_only = True
-
-primop Int32X4PackOp "packInt32X4#" GenPrimOp
-   INT32 -> INT32 -> INT32 -> INT32 -> Int32X4#
-   with llvm_only = True
-
-primop Int32X4UnpackOp "unpackInt32X4#" GenPrimOp
-   Int32X4# -> (# INT32, INT32, INT32, INT32 #)
-   with llvm_only = True
-
-primop Int32X4AddOp "plusInt32X4#" Dyadic
-   Int32X4# -> Int32X4# -> Int32X4#
-   with commutable = True
-        llvm_only = True
-
-primop Int32X4SubOp "minusInt32X4#" Dyadic
-  Int32X4# -> Int32X4# -> Int32X4#
-   with llvm_only = True
-
-primop Int32X4MulOp "timesInt32X4#" Dyadic
-   Int32X4# -> Int32X4# -> Int32X4#
-   with commutable = True
-        llvm_only = True
-
-primop Int32X4QuotOp "quotInt32X4#" Dyadic
-   Int32X4# -> Int32X4# -> Int32X4#
+primop VecQuotOp "quot#" Dyadic
+   VECTOR -> VECTOR -> VECTOR
+   { Rounds towards zero element-wise. }
    with can_fail = True
         llvm_only = True
+   	vector = INT_VECTOR_TYPES
 
-primop Int32X4RemOp "remInt32X4#" Dyadic
-   Int32X4# -> Int32X4# -> Int32X4#
+primop VecRemOp "rem#" Dyadic
+   VECTOR -> VECTOR -> VECTOR
+   { Satisfies \texttt{(quot\# x y) times\# y plus\# (rem\# x y) == x}. }
    with can_fail = True
         llvm_only = True
+   	vector = INT_VECTOR_TYPES
 
-primop Int32X4NegOp "negateInt32X4#" Monadic
-   Int32X4# -> Int32X4#
+primop VecNegOp "negate#" Monadic
+   VECTOR -> VECTOR
+   { Negate element-wise. }
    with llvm_only = True
+   	vector = SIGNED_VECTOR_TYPES
 
-primop IndexByteArrayOp_Int32X4 "indexInt32X4Array#" GenPrimOp
-   ByteArray# -> Int# -> Int32X4#
+primop VecIndexByteArrayOp "indexArray#" GenPrimOp
+   ByteArray# -> Int# -> VECTOR
+   { Read a vector from specified index of immutable array. }
    with can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop ReadByteArrayOp_Int32X4 "readInt32X4Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, Int32X4# #)
+primop VecReadByteArrayOp "readArray#" GenPrimOp
+   MutableByteArray# s -> Int# -> State# s -> (# State# s, VECTOR #)
+   { Read a vector from specified index of mutable array. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop WriteByteArrayOp_Int32X4 "writeInt32X4Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> Int32X4# -> State# s -> State# s
+primop VecWriteByteArrayOp "writeArray#" GenPrimOp
+   MutableByteArray# s -> Int# -> VECTOR -> State# s -> State# s
+   { Write a vector to specified index of mutable array. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop IndexOffAddrOp_Int32X4 "indexInt32X4OffAddr#" GenPrimOp
-   Addr# -> Int# -> Int32X4#
+primop VecIndexOffAddrOp "indexOffAddr#" GenPrimOp
+   Addr# -> Int# -> VECTOR
+   { Reads vector; offset in bytes. }
    with can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop ReadOffAddrOp_Int32X4 "readInt32X4OffAddr#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, Int32X4# #)
+primop VecReadOffAddrOp "readOffAddr#" GenPrimOp
+   Addr# -> Int# -> State# s -> (# State# s, VECTOR #)
+   { Reads vector; offset in bytes. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop  WriteOffAddrOp_Int32X4 "writeInt32X4OffAddr#" GenPrimOp
-   Addr# -> Int# -> Int32X4# -> State# s -> State# s
+primop VecWriteOffAddrOp "writeOffAddr#" GenPrimOp
+   Addr# -> Int# -> VECTOR -> State# s -> State# s
+   { Write vector; offset in bytes. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop IndexByteArrayOp_Int32AsInt32X4 "indexInt32ArrayAsInt32X4#" GenPrimOp
-   ByteArray# -> Int# -> Int32X4#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadByteArrayOp_Int32AsInt32X4 "readInt32ArrayAsInt32X4#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, Int32X4# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop WriteByteArrayOp_Int32AsInt32X4 "writeInt32ArrayAsInt32X4#" GenPrimOp
-   MutableByteArray# s -> Int# -> Int32X4# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexOffAddrOp_Int32AsInt32X4 "indexInt32OffAddrAsInt32X4#" GenPrimOp
-   Addr# -> Int# -> Int32X4#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadOffAddrOp_Int32AsInt32X4 "readInt32OffAddrAsInt32X4#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, Int32X4# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop  WriteOffAddrOp_Int32AsInt32X4 "writeInt32OffAddrAsInt32X4#" GenPrimOp
-   Addr# -> Int# -> Int32X4# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-------------------------------------------------------------------------
-section "Int64 SIMD Vectors"
-	{Operations on SIMD vectors of 2 64-bit signed integers.}
-------------------------------------------------------------------------
-
-primtype Int64X2#
-   with llvm_only = True
-
-primop Int64ToInt64X2Op "int64ToInt64X2#" GenPrimOp
-   INT64 -> Int64X2#
-   with llvm_only = True
-
-primop Int64X2InsertOp "insertInt64X2#" GenPrimOp
-   Int64X2# -> INT64 -> Int# -> Int64X2#
-   with can_fail = True
-        llvm_only = True
-
-primop Int64X2PackOp "packInt64X2#" GenPrimOp
-   INT64 -> INT64 -> Int64X2#
-   with llvm_only = True
-
-primop Int64X2UnpackOp "unpackInt64X2#" GenPrimOp
-   Int64X2# -> (# INT64, INT64 #)
-   with llvm_only = True
-
-primop Int64X2AddOp "plusInt64X2#" Dyadic
-   Int64X2# -> Int64X2# -> Int64X2#
-   with commutable = True
-        llvm_only = True
-
-primop Int64X2SubOp "minusInt64X2#" Dyadic
-  Int64X2# -> Int64X2# -> Int64X2#
-   with llvm_only = True
-
-primop Int64X2MulOp "timesInt64X2#" Dyadic
-   Int64X2# -> Int64X2# -> Int64X2#
-   with commutable = True
-        llvm_only = True
-
-primop Int64X2QuotOp "quotInt64X2#" Dyadic
-   Int64X2# -> Int64X2# -> Int64X2#
-   with can_fail = True
-        llvm_only = True
-
-primop Int64X2RemOp "remInt64X2#" Dyadic
-   Int64X2# -> Int64X2# -> Int64X2#
-   with can_fail = True
-        llvm_only = True
-
-primop Int64X2NegOp "negateInt64X2#" Monadic
-   Int64X2# -> Int64X2#
-   with llvm_only = True
-
-primop IndexByteArrayOp_Int64X2 "indexInt64X2Array#" GenPrimOp
-   ByteArray# -> Int# -> Int64X2#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadByteArrayOp_Int64X2 "readInt64X2Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, Int64X2# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop WriteByteArrayOp_Int64X2 "writeInt64X2Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> Int64X2# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexOffAddrOp_Int64X2 "indexInt64X2OffAddr#" GenPrimOp
-   Addr# -> Int# -> Int64X2#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadOffAddrOp_Int64X2 "readInt64X2OffAddr#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, Int64X2# #)
-   with has_side_effects = True
-        llvm_only = True
-
-primop  WriteOffAddrOp_Int64X2 "writeInt64X2OffAddr#" GenPrimOp
-   Addr# -> Int# -> Int64X2# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
 
-primop IndexByteArrayOp_Int64AsInt64X2 "indexInt64ArrayAsInt64X2#" GenPrimOp
-   ByteArray# -> Int# -> Int64X2#
+primop VecIndexScalarByteArrayOp "indexArrayAs#" GenPrimOp
+   ByteArray# -> Int# -> VECTOR
+   { Read a vector from specified index of immutable array of scalars; offset is in scalar elements. }
    with can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop ReadByteArrayOp_Int64AsInt64X2 "readInt64ArrayAsInt64X2#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, Int64X2# #)
+primop VecReadScalarByteArrayOp "readArrayAs#" GenPrimOp
+   MutableByteArray# s -> Int# -> State# s -> (# State# s, VECTOR #)
+   { Read a vector from specified index of mutable array of scalars; offset is in scalar elements. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop WriteByteArrayOp_Int64AsInt64X2 "writeInt64ArrayAsInt64X2#" GenPrimOp
-   MutableByteArray# s -> Int# -> Int64X2# -> State# s -> State# s
+primop VecWriteScalarByteArrayOp "writeArrayAs#" GenPrimOp
+   MutableByteArray# s -> Int# -> VECTOR -> State# s -> State# s
+   { Write a vector to specified index of mutable array of scalars; offset is in scalar elements. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop IndexOffAddrOp_Int64AsInt64X2 "indexInt64OffAddrAsInt64X2#" GenPrimOp
-   Addr# -> Int# -> Int64X2#
+primop VecIndexScalarOffAddrOp "indexOffAddrAs#" GenPrimOp
+   Addr# -> Int# -> VECTOR
+   { Reads vector; offset in scalar elements. }
    with can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop ReadOffAddrOp_Int64AsInt64X2 "readInt64OffAddrAsInt64X2#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, Int64X2# #)
+primop VecReadScalarOffAddrOp "readOffAddrAs#" GenPrimOp
+   Addr# -> Int# -> State# s -> (# State# s, VECTOR #)
+   { Reads vector; offset in scalar elements. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop  WriteOffAddrOp_Int64AsInt64X2 "writeInt64OffAddrAsInt64X2#" GenPrimOp
-   Addr# -> Int# -> Int64X2# -> State# s -> State# s
+primop VecWriteScalarOffAddrOp "writeOffAddrAs#" GenPrimOp
+   Addr# -> Int# -> VECTOR -> State# s -> State# s
+   { Write vector; offset in scalar elements. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
 ------------------------------------------------------------------------
 section "Prefetch"
diff --git a/includes/Cmm.h b/includes/Cmm.h
index 89baaa0987..e4898b48d7 100644
--- a/includes/Cmm.h
+++ b/includes/Cmm.h
@@ -99,6 +99,8 @@
 #define D_   float64
 #define L_   bits64
 #define V16_ bits128
+#define V32_ bits256
+#define V64_ bits512
 
 #define SIZEOF_StgDouble 8
 #define SIZEOF_StgWord64 8
diff --git a/includes/CodeGen.Platform.hs b/includes/CodeGen.Platform.hs
index 8007574b10..3d6dd41ae4 100644
--- a/includes/CodeGen.Platform.hs
+++ b/includes/CodeGen.Platform.hs
@@ -65,6 +65,40 @@ import Reg
 # define xmm14 38
 # define xmm15 39
 
+# define ymm0  40
+# define ymm1  41
+# define ymm2  42
+# define ymm3  43
+# define ymm4  44
+# define ymm5  45
+# define ymm6  46
+# define ymm7  47
+# define ymm8  48
+# define ymm9  49
+# define ymm10 50
+# define ymm11 51
+# define ymm12 52
+# define ymm13 53
+# define ymm14 54
+# define ymm15 55
+
+# define zmm0  56
+# define zmm1  57
+# define zmm2  58
+# define zmm3  59
+# define zmm4  60
+# define zmm5  61
+# define zmm6  62
+# define zmm7  63
+# define zmm8  64
+# define zmm9  65
+# define zmm10 66
+# define zmm11 67
+# define zmm12 68
+# define zmm13 69
+# define zmm14 70
+# define zmm15 71
+
 #elif MACHREGS_powerpc
 
 # define r0 0
@@ -391,6 +425,12 @@ activeStgRegs = [
 #ifdef REG_XMM1
     ,XmmReg 1
 #endif
+#ifdef REG_YMM1
+    ,YmmReg 1
+#endif
+#ifdef REG_ZMM1
+    ,ZmmReg 1
+#endif
 #ifdef REG_F2
     ,FloatReg 2
 #endif
@@ -400,6 +440,12 @@ activeStgRegs = [
 #ifdef REG_XMM2
     ,XmmReg 2
 #endif
+#ifdef REG_YMM2
+    ,YmmReg 2
+#endif
+#ifdef REG_ZMM2
+    ,ZmmReg 2
+#endif
 #ifdef REG_F3
     ,FloatReg 3
 #endif
@@ -409,6 +455,12 @@ activeStgRegs = [
 #ifdef REG_XMM3
     ,XmmReg 3
 #endif
+#ifdef REG_YMM3
+    ,YmmReg 3
+#endif
+#ifdef REG_ZMM3
+    ,ZmmReg 3
+#endif
 #ifdef REG_F4
     ,FloatReg 4
 #endif
@@ -418,6 +470,12 @@ activeStgRegs = [
 #ifdef REG_XMM4
     ,XmmReg 4
 #endif
+#ifdef REG_YMM4
+    ,YmmReg 4
+#endif
+#ifdef REG_ZMM4
+    ,ZmmReg 4
+#endif
 #ifdef REG_F5
     ,FloatReg 5
 #endif
@@ -427,6 +485,12 @@ activeStgRegs = [
 #ifdef REG_XMM5
     ,XmmReg 5
 #endif
+#ifdef REG_YMM5
+    ,YmmReg 5
+#endif
+#ifdef REG_ZMM5
+    ,ZmmReg 5
+#endif
 #ifdef REG_F6
     ,FloatReg 6
 #endif
@@ -436,6 +500,12 @@ activeStgRegs = [
 #ifdef REG_XMM6
     ,XmmReg 6
 #endif
+#ifdef REG_YMM6
+    ,YmmReg 6
+#endif
+#ifdef REG_ZMM6
+    ,ZmmReg 6
+#endif
 #else /* MAX_REAL_XMM_REG == 0 */
 #ifdef REG_F1
     ,FloatReg 1
@@ -587,13 +657,65 @@ globalRegMaybe (DoubleReg 6)            =
                                           Just (RealRegSingle REG_D6)
 #  endif
 # endif
-#if MAX_REAL_XMM_REG != 0
+# if MAX_REAL_XMM_REG != 0
+#  ifdef REG_XMM1
 globalRegMaybe (XmmReg 1)               = Just (RealRegSingle REG_XMM1)
+#  endif
+#  ifdef REG_XMM2
 globalRegMaybe (XmmReg 2)               = Just (RealRegSingle REG_XMM2)
+#  endif
+#  ifdef REG_XMM3
 globalRegMaybe (XmmReg 3)               = Just (RealRegSingle REG_XMM3)
+#  endif
+#  ifdef REG_XMM4
 globalRegMaybe (XmmReg 4)               = Just (RealRegSingle REG_XMM4)
+#  endif
+#  ifdef REG_XMM5
 globalRegMaybe (XmmReg 5)               = Just (RealRegSingle REG_XMM5)
+#  endif
+#  ifdef REG_XMM6
 globalRegMaybe (XmmReg 6)               = Just (RealRegSingle REG_XMM6)
+#  endif
+# endif
+# if MAX_REAL_YMM_REG != 0
+#  ifdef REG_YMM1
+globalRegMaybe (YmmReg 1)               = Just (RealRegSingle REG_YMM1)
+#  endif
+#  ifdef REG_YMM2
+globalRegMaybe (YmmReg 2)               = Just (RealRegSingle REG_YMM2)
+#  endif
+#  ifdef REG_YMM3
+globalRegMaybe (YmmReg 3)               = Just (RealRegSingle REG_YMM3)
+#  endif
+#  ifdef REG_YMM4
+globalRegMaybe (YmmReg 4)               = Just (RealRegSingle REG_YMM4)
+#  endif
+#  ifdef REG_YMM5
+globalRegMaybe (YmmReg 5)               = Just (RealRegSingle REG_YMM5)
+#  endif
+#  ifdef REG_YMM6
+globalRegMaybe (YmmReg 6)               = Just (RealRegSingle REG_YMM6)
+#  endif
+# endif
+# if MAX_REAL_ZMM_REG != 0
+#  ifdef REG_ZMM1
+globalRegMaybe (ZmmReg 1)               = Just (RealRegSingle REG_ZMM1)
+#  endif
+#  ifdef REG_ZMM2
+globalRegMaybe (ZmmReg 2)               = Just (RealRegSingle REG_ZMM2)
+#  endif
+#  ifdef REG_ZMM3
+globalRegMaybe (ZmmReg 3)               = Just (RealRegSingle REG_ZMM3)
+#  endif
+#  ifdef REG_ZMM4
+globalRegMaybe (ZmmReg 4)               = Just (RealRegSingle REG_ZMM4)
+#  endif
+#  ifdef REG_ZMM5
+globalRegMaybe (ZmmReg 5)               = Just (RealRegSingle REG_ZMM5)
+#  endif
+#  ifdef REG_ZMM6
+globalRegMaybe (ZmmReg 6)               = Just (RealRegSingle REG_ZMM6)
+#  endif
 # endif
 # ifdef REG_Sp
 globalRegMaybe Sp                       = Just (RealRegSingle REG_Sp)
diff --git a/includes/rts/storage/FunTypes.h b/includes/rts/storage/FunTypes.h
index 0ba65bb79d..094bd87d01 100644
--- a/includes/rts/storage/FunTypes.h
+++ b/includes/rts/storage/FunTypes.h
@@ -34,22 +34,24 @@
 #define ARG_D        7 
 #define ARG_L        8 
 #define ARG_V16      9 
-#define ARG_NN       10 
-#define ARG_NP       11
-#define ARG_PN       12
-#define ARG_PP       13
-#define ARG_NNN      14
-#define ARG_NNP      15
-#define ARG_NPN      16
-#define ARG_NPP      17
-#define ARG_PNN      18
-#define ARG_PNP      19
-#define ARG_PPN      20
-#define ARG_PPP      21
-#define ARG_PPPP     22
-#define ARG_PPPPP    23
-#define ARG_PPPPPP   24
-#define ARG_PPPPPPP  25
-#define ARG_PPPPPPPP 26
+#define ARG_V32      10
+#define ARG_V64      11
+#define ARG_NN       12 
+#define ARG_NP       13
+#define ARG_PN       14
+#define ARG_PP       15
+#define ARG_NNN      16
+#define ARG_NNP      17
+#define ARG_NPN      18
+#define ARG_NPP      19
+#define ARG_PNN      20
+#define ARG_PNP      21
+#define ARG_PPN      22
+#define ARG_PPP      23
+#define ARG_PPPP     24
+#define ARG_PPPPP    25
+#define ARG_PPPPPP   26
+#define ARG_PPPPPPP  27
+#define ARG_PPPPPPPP 28
 
 #endif /* RTS_STORAGE_FUNTYPES_H */
diff --git a/includes/stg/MachRegs.h b/includes/stg/MachRegs.h
index a8f2215578..81e48cc9b4 100644
--- a/includes/stg/MachRegs.h
+++ b/includes/stg/MachRegs.h
@@ -103,11 +103,28 @@
 # define REG_Hp     edi
 #endif
 
+#define REG_XMM1    xmm0
+#define REG_XMM2    xmm1
+#define REG_XMM3    xmm2
+#define REG_XMM4    xmm3
+
+#define REG_YMM1    ymm0
+#define REG_YMM2    ymm1
+#define REG_YMM3    ymm2
+#define REG_YMM4    ymm3
+
+#define REG_ZMM1    zmm0
+#define REG_ZMM2    zmm1
+#define REG_ZMM3    zmm2
+#define REG_ZMM4    zmm3
+
 #define MAX_REAL_VANILLA_REG 1  /* always, since it defines the entry conv */
 #define MAX_REAL_FLOAT_REG   0
 #define MAX_REAL_DOUBLE_REG  0
 #define MAX_REAL_LONG_REG    0
-#define MAX_REAL_XMM_REG     0
+#define MAX_REAL_XMM_REG     4
+#define MAX_REAL_YMM_REG     4
+#define MAX_REAL_ZMM_REG     4
 
 /* -----------------------------------------------------------------------------
   The x86-64 register mapping
@@ -174,6 +191,20 @@
 #define REG_XMM5    xmm5
 #define REG_XMM6    xmm6
 
+#define REG_YMM1    ymm1
+#define REG_YMM2    ymm2
+#define REG_YMM3    ymm3
+#define REG_YMM4    ymm4
+#define REG_YMM5    ymm5
+#define REG_YMM6    ymm6
+
+#define REG_ZMM1    zmm1
+#define REG_ZMM2    zmm2
+#define REG_ZMM3    zmm3
+#define REG_ZMM4    zmm4
+#define REG_ZMM5    zmm5
+#define REG_ZMM6    zmm6
+
 #if !defined(mingw32_HOST_OS)
 #define CALLER_SAVES_R3
 #define CALLER_SAVES_R4
@@ -208,11 +239,31 @@
 #define CALLER_SAVES_XMM6
 #endif
 
+#define CALLER_SAVES_YMM1
+#define CALLER_SAVES_YMM2
+#define CALLER_SAVES_YMM3
+#define CALLER_SAVES_YMM4
+#define CALLER_SAVES_YMM5
+#if !defined(mingw32_HOST_OS)
+#define CALLER_SAVES_YMM6
+#endif
+
+#define CALLER_SAVES_ZMM1
+#define CALLER_SAVES_ZMM2
+#define CALLER_SAVES_ZMM3
+#define CALLER_SAVES_ZMM4
+#define CALLER_SAVES_ZMM5
+#if !defined(mingw32_HOST_OS)
+#define CALLER_SAVES_ZMM6
+#endif
+
 #define MAX_REAL_VANILLA_REG 6
 #define MAX_REAL_FLOAT_REG   6
 #define MAX_REAL_DOUBLE_REG  6
 #define MAX_REAL_LONG_REG    0
 #define MAX_REAL_XMM_REG     6
+#define MAX_REAL_YMM_REG     6
+#define MAX_REAL_ZMM_REG     6
 
 /* -----------------------------------------------------------------------------
    The PowerPC register mapping
diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h
index 876f39a02c..3e4f3d150e 100644
--- a/includes/stg/MiscClosures.h
+++ b/includes/stg/MiscClosures.h
@@ -225,6 +225,8 @@ RTS_RET(stg_ap_f);
 RTS_RET(stg_ap_d);
 RTS_RET(stg_ap_l);
 RTS_RET(stg_ap_v16);
+RTS_RET(stg_ap_v32);
+RTS_RET(stg_ap_v64);
 RTS_RET(stg_ap_n);
 RTS_RET(stg_ap_p);
 RTS_RET(stg_ap_pv);
@@ -242,6 +244,8 @@ RTS_FUN_DECL(stg_ap_f_fast);
 RTS_FUN_DECL(stg_ap_d_fast);
 RTS_FUN_DECL(stg_ap_l_fast);
 RTS_FUN_DECL(stg_ap_v16_fast);
+RTS_FUN_DECL(stg_ap_v32_fast);
+RTS_FUN_DECL(stg_ap_v64_fast);
 RTS_FUN_DECL(stg_ap_n_fast);
 RTS_FUN_DECL(stg_ap_p_fast);
 RTS_FUN_DECL(stg_ap_pv_fast);
diff --git a/includes/stg/Regs.h b/includes/stg/Regs.h
index 10ae2851ac..1abf9da439 100644
--- a/includes/stg/Regs.h
+++ b/includes/stg/Regs.h
@@ -87,6 +87,18 @@ typedef struct {
   StgWord128 	  rXMM4;
   StgWord128 	  rXMM5;
   StgWord128 	  rXMM6;
+  StgWord256 	  rYMM1;
+  StgWord256 	  rYMM2;
+  StgWord256 	  rYMM3;
+  StgWord256 	  rYMM4;
+  StgWord256 	  rYMM5;
+  StgWord256 	  rYMM6;
+  StgWord512 	  rZMM1;
+  StgWord512 	  rZMM2;
+  StgWord512 	  rZMM3;
+  StgWord512 	  rZMM4;
+  StgWord512 	  rZMM5;
+  StgWord512 	  rZMM6;
   StgWord64       rL1;
   StgPtr 	  rSp;
   StgPtr 	  rSpLim;
@@ -312,6 +324,78 @@ GLOBAL_REG_DECL(StgWord128,XMM6,REG_XMM6)
 #define XMM6 (BaseReg->rXMM6)
 #endif
 
+#if defined(REG_YMM1) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord256,YMM1,REG_YMM1)
+#else
+#define YMM1 (BaseReg->rYMM1)
+#endif
+
+#if defined(REG_YMM2) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord256,YMM2,REG_YMM2)
+#else
+#define YMM2 (BaseReg->rYMM2)
+#endif
+
+#if defined(REG_YMM3) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord256,YMM3,REG_YMM3)
+#else
+#define YMM3 (BaseReg->rYMM3)
+#endif
+
+#if defined(REG_YMM4) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord256,YMM4,REG_YMM4)
+#else
+#define YMM4 (BaseReg->rYMM4)
+#endif
+
+#if defined(REG_YMM5) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord256,YMM5,REG_YMM5)
+#else
+#define YMM5 (BaseReg->rYMM5)
+#endif
+
+#if defined(REG_YMM6) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord256,YMM6,REG_YMM6)
+#else
+#define YMM6 (BaseReg->rYMM6)
+#endif
+
+#if defined(REG_ZMM1) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord512,ZMM1,REG_ZMM1)
+#else
+#define ZMM1 (BaseReg->rZMM1)
+#endif
+
+#if defined(REG_ZMM2) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord512,ZMM2,REG_ZMM2)
+#else
+#define ZMM2 (BaseReg->rZMM2)
+#endif
+
+#if defined(REG_ZMM3) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord512,ZMM3,REG_ZMM3)
+#else
+#define ZMM3 (BaseReg->rZMM3)
+#endif
+
+#if defined(REG_ZMM4) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord512,ZMM4,REG_ZMM4)
+#else
+#define ZMM4 (BaseReg->rZMM4)
+#endif
+
+#if defined(REG_ZMM5) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord512,ZMM5,REG_ZMM5)
+#else
+#define ZMM5 (BaseReg->rZMM5)
+#endif
+
+#if defined(REG_ZMM6) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord512,ZMM6,REG_ZMM6)
+#else
+#define ZMM6 (BaseReg->rZMM6)
+#endif
+
 #if defined(REG_L1) && !defined(NO_GLOBAL_REG_DECLS)
 GLOBAL_REG_DECL(StgWord64,L1,REG_L1)
 #else
diff --git a/includes/stg/Types.h b/includes/stg/Types.h
index ccc06a175b..6138a8f8a2 100644
--- a/includes/stg/Types.h
+++ b/includes/stg/Types.h
@@ -85,6 +85,10 @@ typedef unsigned long long int StgWord64;
 
 typedef struct { StgWord64 h; StgWord64 l; } StgWord128;
 
+typedef struct { StgWord128 h; StgWord128 l; } StgWord256;
+
+typedef struct { StgWord256 h; StgWord256 l; } StgWord512;
+
 /*
  * Define the standard word size we'll use on this machine: make it
  * big enough to hold a pointer.
diff --git a/rts/Linker.c b/rts/Linker.c
index 58cfca684a..31f60c09ac 100644
--- a/rts/Linker.c
+++ b/rts/Linker.c
@@ -883,6 +883,8 @@ typedef struct _RtsSymbolVal {
       SymI_HasProto(stg_ap_d_ret)                       \
       SymI_HasProto(stg_ap_l_ret)                       \
       SymI_HasProto(stg_ap_v16_ret)                     \
+      SymI_HasProto(stg_ap_v32_ret)                     \
+      SymI_HasProto(stg_ap_v64_ret)                     \
       SymI_HasProto(stg_ap_n_ret)                       \
       SymI_HasProto(stg_ap_p_ret)                       \
       SymI_HasProto(stg_ap_pv_ret)                      \
@@ -1252,6 +1254,8 @@ typedef struct _RtsSymbolVal {
       SymI_HasProto(stg_ap_d_info)                                      \
       SymI_HasProto(stg_ap_l_info)                                      \
       SymI_HasProto(stg_ap_v16_info)                                    \
+      SymI_HasProto(stg_ap_v32_info)                                    \
+      SymI_HasProto(stg_ap_v64_info)                                    \
       SymI_HasProto(stg_ap_n_info)                                      \
       SymI_HasProto(stg_ap_p_info)                                      \
       SymI_HasProto(stg_ap_pv_info)                                     \
@@ -1268,6 +1272,8 @@ typedef struct _RtsSymbolVal {
       SymI_HasProto(stg_ap_d_fast)                                      \
       SymI_HasProto(stg_ap_l_fast)                                      \
       SymI_HasProto(stg_ap_v16_fast)                                    \
+      SymI_HasProto(stg_ap_v32_fast)                                    \
+      SymI_HasProto(stg_ap_v64_fast)                                    \
       SymI_HasProto(stg_ap_n_fast)                                      \
       SymI_HasProto(stg_ap_p_fast)                                      \
       SymI_HasProto(stg_ap_pv_fast)                                     \
diff --git a/utils/deriveConstants/DeriveConstants.hs b/utils/deriveConstants/DeriveConstants.hs
index 48990061cc..5b9b7c0bd9 100644
--- a/utils/deriveConstants/DeriveConstants.hs
+++ b/utils/deriveConstants/DeriveConstants.hs
@@ -313,6 +313,18 @@ wanteds = concat
           ,fieldOffset Both "StgRegTable" "rXMM4"
           ,fieldOffset Both "StgRegTable" "rXMM5"
           ,fieldOffset Both "StgRegTable" "rXMM6"
+          ,fieldOffset Both "StgRegTable" "rYMM1"
+          ,fieldOffset Both "StgRegTable" "rYMM2"
+          ,fieldOffset Both "StgRegTable" "rYMM3"
+          ,fieldOffset Both "StgRegTable" "rYMM4"
+          ,fieldOffset Both "StgRegTable" "rYMM5"
+          ,fieldOffset Both "StgRegTable" "rYMM6"
+          ,fieldOffset Both "StgRegTable" "rZMM1"
+          ,fieldOffset Both "StgRegTable" "rZMM2"
+          ,fieldOffset Both "StgRegTable" "rZMM3"
+          ,fieldOffset Both "StgRegTable" "rZMM4"
+          ,fieldOffset Both "StgRegTable" "rZMM5"
+          ,fieldOffset Both "StgRegTable" "rZMM6"
           ,fieldOffset Both "StgRegTable" "rL1"
           ,fieldOffset Both "StgRegTable" "rSp"
           ,fieldOffset Both "StgRegTable" "rSpLim"
diff --git a/utils/genapply/GenApply.hs b/utils/genapply/GenApply.hs
index 2baf85896a..188211df81 100644
--- a/utils/genapply/GenApply.hs
+++ b/utils/genapply/GenApply.hs
@@ -33,6 +33,8 @@ data ArgRep
   | D   -- double
   | L   -- long (64-bit)
   | V16 -- 16-byte (128-bit) vectors
+  | V32 -- 32-byte (256-bit) vectors
+  | V64 -- 64-byte (512-bit) vectors
 
 -- size of a value in *words*
 argSize :: ArgRep -> Int
@@ -43,6 +45,8 @@ argSize F   = 1
 argSize D   = (SIZEOF_DOUBLE `quot` SIZEOF_VOID_P :: Int)
 argSize L   = (8 `quot` SIZEOF_VOID_P :: Int)
 argSize V16 = (16 `quot` SIZEOF_VOID_P :: Int)
+argSize V32 = (32 `quot` SIZEOF_VOID_P :: Int)
+argSize V64 = (64 `quot` SIZEOF_VOID_P :: Int)
 
 showArg :: ArgRep -> String
 showArg N   = "n"
@@ -52,6 +56,8 @@ showArg F   = "f"
 showArg D   = "d"
 showArg L   = "l"
 showArg V16 = "v16"
+showArg V32 = "v32"
+showArg V64 = "v64"
 
 -- is a value a pointer?
 isPtr :: ArgRep -> Bool
@@ -504,6 +510,8 @@ argRep D   = text "D_"
 argRep L   = text "L_"
 argRep P   = text "gcptr"
 argRep V16 = text "V16_"
+argRep V32 = text "V32_"
+argRep V64 = text "V64_"
 argRep _   = text "W_"
 
 genApply regstatus args =
@@ -854,6 +862,8 @@ applyTypes = [
         [D],
         [L],
         [V16],
+        [V32],
+        [V64],
         [N],
         [P],
         [P,V],
@@ -882,6 +892,8 @@ stackApplyTypes = [
         [D],
         [L],
         [V16],
+        [V32],
+        [V64],
         [N,N],
         [N,P],
         [P,N],
diff --git a/utils/genprimopcode/Lexer.x b/utils/genprimopcode/Lexer.x
index ff18e17373..d29d8a17f0 100644
--- a/utils/genprimopcode/Lexer.x
+++ b/utils/genprimopcode/Lexer.x
@@ -40,6 +40,10 @@ words :-
     <0>         ")"                 { mkT TCloseParen }
     <0>         "(#"                { mkT TOpenParenHash }
     <0>         "#)"                { mkT THashCloseParen }
+    <0>         "["                 { mkT TOpenBracket }
+    <0>         "]"                 { mkT TCloseBracket }
+    <0>         "<"                 { mkT TOpenAngle }
+    <0>         ">"                 { mkT TCloseAngle }
     <0>         "section"           { mkT TSection }
     <0>         "primop"            { mkT TPrimop }
     <0>         "pseudoop"          { mkT TPseudoop }
@@ -58,7 +62,11 @@ words :-
     <0>         "infixl"            { mkT TInfixL }
     <0>         "infixr"            { mkT TInfixR }
     <0>         "Nothing"           { mkT TNothing }
+    <0>         "vector"            { mkT TVector }
     <0>         "thats_all_folks"   { mkT TThatsAllFolks }
+    <0>         "SCALAR"            { mkT TSCALAR }
+    <0>         "VECTOR"            { mkT TVECTOR }
+    <0>         "VECTUPLE"          { mkT TVECTUPLE }
     <0>         [a-z][a-zA-Z0-9\#_]* { mkTv TLowerName }
     <0>         [A-Z][a-zA-Z0-9\#_]* { mkTv TUpperName }
     <0>         [0-9][0-9]*         { mkTv (TInteger . read) }
diff --git a/utils/genprimopcode/Main.hs b/utils/genprimopcode/Main.hs
index 5e1c9ab84b..8b97ca169c 100644
--- a/utils/genprimopcode/Main.hs
+++ b/utils/genprimopcode/Main.hs
@@ -13,6 +13,100 @@ import Data.List
 import Data.Maybe ( catMaybes )
 import System.Environment ( getArgs )
 
+vecOptions :: Entry -> [(String,String,Int)]
+vecOptions i =
+    concat [vecs | OptionVector vecs <- opts i]
+
+desugarVectorSpec :: Entry -> [Entry]
+desugarVectorSpec i@(Section {}) = [i]
+desugarVectorSpec i              = case vecOptions i of
+                                     []  -> [i]
+                                     vos -> map genVecEntry vos
+  where
+    genVecEntry :: (String,String,Int) -> Entry
+    genVecEntry (con,repCon,n) =
+        case i of
+          PrimOpSpec {} ->
+              PrimVecOpSpec { cons    = "(" ++ concat (intersperse " " [cons i, vecCat, show n, vecWidth]) ++ ")"
+                            , name    = name'
+                            , prefix  = pfx
+                            , veclen  = n
+                            , elemrep = con ++ "ElemRep"
+                            , ty      = desugarTy (ty i)
+                            , cat     = cat i
+                            , desc    = desc i
+                            , opts    = opts i
+                            }
+          PrimTypeSpec {} ->
+              PrimVecTypeSpec { ty      = desugarTy (ty i)
+                              , prefix  = pfx
+                              , veclen  = n
+                              , elemrep = con ++ "ElemRep"
+                              , desc    = desc i
+                              , opts    = opts i
+                              }
+          _ ->
+              error "vector options can only be given for primops and primtypes"
+      where
+        vecCons       = con++"X"++show n++"#"
+        vecCat        = conCat con
+        vecWidth      = conWidth con
+        pfx           = lowerHead con++"X"++show n
+        vecTyName     = pfx++"PrimTy"
+
+        name' | Just pre <- splitSuffix (name i) "Array#"     = pre++vec++"Array#"
+              | Just pre <- splitSuffix (name i) "OffAddr#"   = pre++vec++"OffAddr#"
+              | Just pre <- splitSuffix (name i) "ArrayAs#"   = pre++con++"ArrayAs"++vec++"#"
+              | Just pre <- splitSuffix (name i) "OffAddrAs#" = pre++con++"OffAddrAs"++vec++"#"
+              | otherwise                                     = init (name i)++vec ++"#"
+          where
+            vec = con++"X"++show n
+
+        splitSuffix :: Eq a => [a] -> [a] -> Maybe [a]
+        splitSuffix s suf
+            | drop len s == suf = Just (take len s)
+            | otherwise         = Nothing
+          where
+            len = length s - length suf                            
+
+        lowerHead s = toLower (head s) : tail s
+
+        desugarTy :: Ty -> Ty
+        desugarTy (TyF s d)           = TyF (desugarTy s) (desugarTy d)
+        desugarTy (TyC s d)           = TyC (desugarTy s) (desugarTy d)
+        desugarTy (TyApp SCALAR [])   = TyApp (TyCon repCon) []
+        desugarTy (TyApp VECTOR [])   = TyApp (VecTyCon vecCons vecTyName) []
+        desugarTy (TyApp VECTUPLE []) = TyUTup (replicate n (TyApp (TyCon repCon) []))
+        desugarTy (TyApp tycon ts)    = TyApp tycon (map desugarTy ts)
+        desugarTy t@(TyVar {})        = t
+        desugarTy (TyUTup ts)         = TyUTup (map desugarTy ts)
+
+    conCat :: String -> String
+    conCat "Int8"   = "IntVec"
+    conCat "Int16"  = "IntVec"
+    conCat "Int32"  = "IntVec"
+    conCat "Int64"  = "IntVec"
+    conCat "Word8"  = "WordVec"
+    conCat "Word16" = "WordVec"
+    conCat "Word32" = "WordVec"
+    conCat "Word64" = "WordVec"
+    conCat "Float"  = "FloatVec"
+    conCat "Double" = "FloatVec"
+    conCat con      = error $ "conCat: unknown type constructor " ++ con ++ "\n"
+
+    conWidth :: String -> String
+    conWidth "Int8"   = "W8"
+    conWidth "Int16"  = "W16"
+    conWidth "Int32"  = "W32"
+    conWidth "Int64"  = "W64"
+    conWidth "Word8"  = "W8"
+    conWidth "Word16" = "W16"
+    conWidth "Word32" = "W32"
+    conWidth "Word64" = "W64"
+    conWidth "Float"  = "W32"
+    conWidth "Double" = "W64"
+    conWidth con      = error $ "conWidth: unknown type constructor " ++ con ++ "\n"
+
 main :: IO ()
 main = getArgs >>= \args ->
        if length args /= 1 || head args `notElem` known_args
@@ -75,6 +169,18 @@ main = getArgs >>= \args ->
                       "--primop-list" 
                          -> putStr (gen_primop_list p_o_specs)
 
+                      "--primop-vector-uniques" 
+                         -> putStr (gen_primop_vector_uniques p_o_specs)
+
+                      "--primop-vector-tys" 
+                         -> putStr (gen_primop_vector_tys p_o_specs)
+
+                      "--primop-vector-tys-exports" 
+                         -> putStr (gen_primop_vector_tys_exports p_o_specs)
+
+                      "--primop-vector-tycons" 
+                         -> putStr (gen_primop_vector_tycons p_o_specs)
+
                       "--make-haskell-wrappers" 
                          -> putStr (gen_wrappers p_o_specs)
                         
@@ -103,6 +209,10 @@ known_args
        "--primop-primop-info",
        "--primop-tag",
        "--primop-list",
+       "--primop-vector-uniques",
+       "--primop-vector-tys",
+       "--primop-vector-tys-exports",
+       "--primop-vector-tycons",
        "--make-haskell-wrappers",
        "--make-haskell-source",
        "--make-ext-core-source",
@@ -136,32 +246,40 @@ gen_hs_source (Info defaults entries) =
         ++ "-----------------------------------------------------------------------------\n"
         ++ "{-# LANGUAGE MultiParamTypeClasses #-}\n"
         ++ "module GHC.Prim (\n"
-        ++ unlines (map (("\t" ++) . hdr) entries)
+        ++ unlines (map (("\t" ++) . hdr) entries')
         ++ ") where\n"
     ++ "\n"
     ++ "{-\n"
         ++ unlines (map opt defaults)
     ++ "-}\n"
-        ++ unlines (concatMap ent entries) ++ "\n\n\n"
-     where opt (OptionFalse n)    = n ++ " = False"
+        ++ unlines (concatMap ent entries') ++ "\n\n\n"
+     where entries' = concatMap desugarVectorSpec entries
+
+           opt (OptionFalse n)    = n ++ " = False"
            opt (OptionTrue n)     = n ++ " = True"
            opt (OptionString n v) = n ++ " = { " ++ v ++ "}"
            opt (OptionInteger n v) = n ++ " = " ++ show v
+           opt (OptionVector _)    = ""
            opt (OptionFixity mf) = "fixity" ++ " = " ++ show mf
 
-           hdr s@(Section {})                      = sec s
-           hdr (PrimOpSpec { name = n })           = wrapOp n ++ ","
-           hdr (PseudoOpSpec { name = n })         = wrapOp n ++ ","
-           hdr (PrimTypeSpec { ty = TyApp n _ })   = wrapTy n ++ ","
-           hdr (PrimTypeSpec {})                   = error "Illegal type spec"
-           hdr (PrimClassSpec { cls = TyApp n _ }) = wrapTy n ++ ","
-           hdr (PrimClassSpec {})                  = error "Illegal class spec"
-
-           ent   (Section {})       = []
-           ent o@(PrimOpSpec {})    = spec o
-           ent o@(PrimTypeSpec {})  = spec o
-           ent o@(PrimClassSpec {}) = spec o
-           ent o@(PseudoOpSpec {})  = spec o
+           hdr s@(Section {})                                    = sec s
+           hdr (PrimOpSpec { name = n })                         = wrapOp n ++ ","
+           hdr (PrimVecOpSpec { name = n })                      = wrapOp n ++ ","
+           hdr (PseudoOpSpec { name = n })                       = wrapOp n ++ ","
+           hdr (PrimTypeSpec { ty = TyApp (TyCon n) _ })         = wrapTy n ++ ","
+           hdr (PrimTypeSpec {})                                 = error $ "Illegal type spec"
+           hdr (PrimClassSpec { cls = TyApp (TyCon n) _ })       = wrapTy n ++ ","
+           hdr (PrimClassSpec {})                                = error "Illegal class spec"
+           hdr (PrimVecTypeSpec { ty = TyApp (VecTyCon n _) _ }) = wrapTy n ++ ","
+           hdr (PrimVecTypeSpec {})                              = error $ "Illegal type spec"
+
+           ent   (Section {})         = []
+           ent o@(PrimOpSpec {})      = spec o
+           ent o@(PrimVecOpSpec {})   = spec o
+           ent o@(PrimTypeSpec {})    = spec o
+           ent o@(PrimClassSpec {})   = spec o
+           ent o@(PrimVecTypeSpec {}) = spec o
+           ent o@(PseudoOpSpec {})    = spec o
 
            sec s = "\n-- * " ++ escape (title s) ++ "\n"
                         ++ (unlines $ map ("-- " ++ ) $ lines $ unlatex $ escape $ "|" ++ desc s) ++ "\n"
@@ -173,6 +291,11 @@ gen_hs_source (Info defaults entries) =
                             ++
                             [ wrapOp n ++ " :: " ++ pprTy t,
                               wrapOp n ++ " = let x = x in x" ]
+                        PrimVecOpSpec { name = n, ty = t, opts = options } ->
+                            [ pprFixity fixity n | OptionFixity (Just fixity) <- options ]
+                            ++
+                            [ wrapOp n ++ " :: " ++ pprTy t,
+                              wrapOp n ++ " = let x = x in x" ]
                         PseudoOpSpec { name = n, ty = t } ->
                             [ wrapOp n ++ " :: " ++ pprTy t,
                               wrapOp n ++ " = let x = x in x" ]
@@ -180,6 +303,8 @@ gen_hs_source (Info defaults entries) =
                             [ "data " ++ pprTy t ]
                         PrimClassSpec { cls = t }   ->
                             [ "class " ++ pprTy t ]
+                        PrimVecTypeSpec { ty = t }   ->
+                            [ "data " ++ pprTy t ]
                         Section { } -> []
 
                    comm = case (desc o) of
@@ -212,7 +337,7 @@ pprTy = pty
           pty (TyF t1 t2) = pbty t1 ++ " -> " ++ pty t2
           pty (TyC t1 t2) = pbty t1 ++ " => " ++ pty t2
           pty t      = pbty t
-          pbty (TyApp tc ts) = tc ++ concat (map (' ' :) (map paty ts))
+          pbty (TyApp tc ts) = show tc ++ concat (map (' ' :) (map paty ts))
           pbty (TyUTup ts)   = "(# "
                             ++ concat (intersperse "," (map pty ts))
                             ++ " #)"
@@ -259,7 +384,7 @@ gen_ext_core_source entries =
   where printList f = concat . intersperse ",\n" . filter (not . null) . map f   
         tcEnt  (PrimTypeSpec {ty=t}) = 
            case t of
-            TyApp tc args -> parens tc (tcKind tc args)
+            TyApp tc args -> parens (show tc) (tcKind tc args)
             _             -> error ("tcEnt: type in PrimTypeSpec is not a type"
                               ++ " constructor: " ++ show t)  
         tcEnt  _                = ""
@@ -270,12 +395,12 @@ gen_ext_core_source entries =
         -- alternative would be to refer to things indirectly and hard-wire
         -- certain things (e.g., the kind of the Any constructor, here) into
         -- ext-core's Prims module again.
-        tcKind "Any" _                = "Klifted"
-        tcKind tc [] | last tc == '#' = "Kunlifted"
-        tcKind _  [] | otherwise      = "Klifted"
+        tcKind (TyCon "Any") _               = "Klifted"
+        tcKind tc [] | last (show tc) == '#' = "Kunlifted"
+        tcKind _  [] | otherwise             = "Klifted"
         -- assumes that all type arguments are lifted (are they?)
-        tcKind tc (_v:as)              = "(Karrow Klifted " ++ tcKind tc as
-                                         ++ ")"
+        tcKind tc (_v:as)                    = "(Karrow Klifted " ++ tcKind tc as
+                                               ++ ")"
         valEnt (PseudoOpSpec {name=n, ty=t}) = valEntry n t
         valEnt (PrimOpSpec {name=n, ty=t})   = valEntry n t
         valEnt _                             = ""
@@ -290,7 +415,7 @@ gen_ext_core_source entries =
                                                ++ " " ++ paren s1))
                                           ++ " " ++ paren s2
                   mkTconApp tc args = foldl tapp tc args
-                  mkTcon tc = paren $ "Tcon " ++ paren (qualify True tc)
+                  mkTcon tc = paren $ "Tcon " ++ paren (qualify True (show tc))
                   mkUtupleTy args = foldl tapp (tcUTuple (length args)) args   
                   mkForallTy [] t = t
                   mkForallTy vs t = foldr 
@@ -314,7 +439,7 @@ gen_ext_core_source entries =
                                                           ++ show n ++ "H")
 
         tyEnt (PrimTypeSpec {ty=(TyApp tc _args)}) = "   " ++ paren ("Tcon " ++
-                                                       (paren (qualify True tc)))
+                                                       (paren (qualify True (show tc))))
         tyEnt _ = ""
 
         -- more hacks. might be better to do this on the ext-core side,
@@ -334,7 +459,7 @@ gen_ext_core_source entries =
         prefixes ps = filter (\ t ->
                         case t of
                           (PrimTypeSpec {ty=(TyApp tc _args)}) ->
-                            any (\ p -> p `isPrefixOf` tc) ps
+                            any (\ p -> p `isPrefixOf` show tc) ps
                           _ -> False)
 
         parens n ty' = "      (zEncodeString \"" ++ n ++ "\", " ++ ty' ++ ")"
@@ -358,6 +483,8 @@ gen_latex_doc (Info defaults entries)
                  ++ d ++ "}{"
                  ++ mk_options o
                  ++ "}\n"
+           mk_entry (PrimVecOpSpec {}) =
+                 ""
            mk_entry (Section {title=ti,desc=d}) =
                  "\\primopsection{" 
                  ++ latex_encode ti ++ "}{"
@@ -376,6 +503,8 @@ gen_latex_doc (Info defaults entries)
                  ++ d ++ "}{"
                  ++ mk_options o
                  ++ "}\n"
+           mk_entry (PrimVecTypeSpec {}) =
+                 ""
            mk_entry (PseudoOpSpec {name=n,ty=t,desc=d,opts=o}) =
                  "\\pseudoopspec{"
                  ++ latex_encode (zencode n) ++ "}{"
@@ -388,7 +517,7 @@ gen_latex_doc (Info defaults entries)
              where pty (TyF t1 t2) = pbty t1 ++ " -> " ++ pty t2
                    pty (TyC t1 t2) = pbty t1 ++ " => " ++ pty t2
                    pty t = pbty t
-                   pbty (TyApp tc ts) = tc ++ (concat (map (' ':) (map paty ts)))
+                   pbty (TyApp tc ts) = show tc ++ (concat (map (' ':) (map paty ts)))
                    pbty (TyUTup ts) = "(# " ++ (concat (intersperse "," (map pty ts))) ++ " #)"
                    pbty t = paty t
                    paty (TyVar tv) = tv
@@ -398,11 +527,11 @@ gen_latex_doc (Info defaults entries)
              where pty (TyF t1 t2) = pbty t1 ++ " -> " ++ pty t2
                    pty (TyC t1 t2) = pbty t1 ++ " => " ++ pty t2
                    pty t = pbty t
-                   pbty (TyApp tc ts) = (zencode tc) ++ (concat (map (' ':) (map paty ts)))
+                   pbty (TyApp tc ts) = (zencode (show tc)) ++ (concat (map (' ':) (map paty ts)))
                    pbty (TyUTup ts) = (zencode (utuplenm (length ts))) ++ (concat ((map (' ':) (map paty ts))))
                    pbty t = paty t
                    paty (TyVar tv) = zencode tv
-                   paty (TyApp tc []) = zencode tc
+                   paty (TyApp tc []) = zencode (show tc)
                    paty t = "(" ++ pty t ++ ")"
                    utuplenm 1 = "(# #)"
                    utuplenm n = "(#" ++ (replicate (n-1) ',') ++ "#)"
@@ -441,6 +570,7 @@ gen_latex_doc (Info defaults entries)
                Just (OptionString _ _) -> error "String value for boolean option"
                Just (OptionInteger _ _) -> error "Integer value for boolean option"
                Just (OptionFixity _) -> error "Fixity value for boolean option"
+               Just (OptionVector _) -> error "vector template for boolean option"
                Nothing -> ""
            
            mk_strictness o = 
@@ -525,20 +655,15 @@ gen_wrappers (Info _ entries)
      ++ "module GHC.PrimopWrappers where\n" 
      ++ "import qualified GHC.Prim\n" 
      ++ "import GHC.Tuple ()\n"
-     ++ "import GHC.Prim (" ++ concat (intersperse ", " othertycons) ++ ")\n"
-     ++ "#if defined (__GLASGOW_HASKELL_LLVM__)\n"
-     ++ "import GHC.Prim (" ++ concat (intersperse ", " vectycons) ++ ")\n"
-     ++ "#endif /* defined (__GLASGOW_HASKELL_LLVM__) */\n"
-     ++ unlines (concatMap f otherspecs)
-     ++ "#if defined (__GLASGOW_HASKELL_LLVM__)\n"
-     ++ unlines (concatMap f vecspecs)
-     ++ "#endif /* defined (__GLASGOW_HASKELL_LLVM__) */\n"
+     ++ "import GHC.Prim (" ++ types ++ ")\n"
+     ++ unlines (concatMap f specs)
      where
-        specs = filter (not.dodgy) (filter is_primop entries)
-        (vecspecs, otherspecs) = partition is_llvm_only specs
+        specs = filter (not.dodgy) $
+                filter (not.is_llvm_only) $
+                filter is_primop entries
         tycons = foldr union [] $ map (tyconsIn . ty) specs
-        (vectycons, othertycons) =
-            (partition llvmOnlyTyCon . filter (`notElem` ["()", "Bool"])) tycons
+        tycons' = filter (`notElem` [TyCon "()", TyCon "Bool"]) tycons
+        types = concat $ intersperse ", " $ map show tycons'
         f spec = let args = map (\n -> "a" ++ show n) [1 .. arity (ty spec)]
                      src_name = wrap (name spec)
                      lhs = src_name ++ " " ++ unwords args
@@ -565,14 +690,6 @@ gen_wrappers (Info _ entries)
               Just (OptionTrue _) -> True
               _                   -> False
 
-        llvmOnlyTyCon :: TyCon -> Bool
-        llvmOnlyTyCon "Int32#"    = True
-        llvmOnlyTyCon "FloatX4#"  = True
-        llvmOnlyTyCon "DoubleX2#" = True
-        llvmOnlyTyCon "Int32X4#"  = True
-        llvmOnlyTyCon "Int64X2#"  = True
-        llvmOnlyTyCon _           = False
-
 gen_primop_list :: Info -> String
 gen_primop_list (Info _ entries)
    = unlines (
@@ -581,24 +698,99 @@ gen_primop_list (Info _ entries)
         map (\p -> "   , " ++ cons p) rest
         ++ 
         [     "   ]"     ]
-     ) where (first:rest) = filter is_primop entries
+     ) where (first:rest) = concatMap desugarVectorSpec (filter is_primop entries)
+
+mIN_VECTOR_UNIQUE :: Int
+mIN_VECTOR_UNIQUE = 300
+
+gen_primop_vector_uniques :: Info -> String
+gen_primop_vector_uniques (Info _ entries)
+   = unlines $
+     concatMap mkVecUnique (specs `zip` [mIN_VECTOR_UNIQUE..])
+  where
+    specs = concatMap desugarVectorSpec (filter is_vector (filter is_primtype entries))
+
+    mkVecUnique :: (Entry, Int) -> [String]
+    mkVecUnique (i, unique) =
+        [ key_id ++ " :: Unique"
+        , key_id ++ " = mkPreludeTyConUnique " ++ show unique
+        ]
+      where
+        key_id = prefix i ++ "PrimTyConKey"
+
+gen_primop_vector_tys :: Info -> String
+gen_primop_vector_tys (Info _ entries)
+   = unlines $
+     concatMap mkVecTypes specs
+  where
+    specs = concatMap desugarVectorSpec (filter is_vector (filter is_primtype entries))
+
+    mkVecTypes :: Entry -> [String]
+    mkVecTypes i =
+        [ name_id ++ " :: Name"
+        , name_id ++ " = mkPrimTc (fsLit \"" ++ pprTy (ty i) ++ "\") " ++ key_id ++ " " ++ tycon_id
+        , ty_id ++ " :: Type"
+        , ty_id ++ " = mkTyConTy " ++ tycon_id
+        , tycon_id ++ " :: TyCon"
+        , tycon_id ++ " = pcPrimTyCon0 " ++ name_id ++
+                      " (VecRep " ++ show (veclen i) ++ " " ++ elemrep i ++ ")"
+        ]
+      where
+        key_id   = prefix i ++ "PrimTyConKey"
+        name_id  = prefix i ++ "PrimTyConName"
+        ty_id    = prefix i ++ "PrimTy"
+        tycon_id = prefix i ++ "PrimTyCon"
+
+gen_primop_vector_tys_exports :: Info -> String
+gen_primop_vector_tys_exports (Info _ entries)
+   = unlines $
+    map mkVecTypes specs
+  where
+    specs = concatMap desugarVectorSpec (filter is_vector (filter is_primtype entries))
+
+    mkVecTypes :: Entry -> String
+    mkVecTypes i =
+        "\t" ++ ty_id ++ ", " ++ tycon_id ++ ","
+      where
+        ty_id    = prefix i ++ "PrimTy"
+        tycon_id = prefix i ++ "PrimTyCon"
+
+gen_primop_vector_tycons :: Info -> String
+gen_primop_vector_tycons (Info _ entries)
+   = unlines $
+     map mkVecTypes specs
+  where
+    specs = concatMap desugarVectorSpec (filter is_vector (filter is_primtype entries))
+
+    mkVecTypes :: Entry -> String
+    mkVecTypes i =
+        "    , " ++ tycon_id
+      where
+        tycon_id = prefix i ++ "PrimTyCon"
 
 gen_primop_tag :: Info -> String
 gen_primop_tag (Info _ entries)
    = unlines (max_def_type : max_def :
               tagOf_type : zipWith f primop_entries [1 :: Int ..])
      where
-        primop_entries = filter is_primop entries
+        primop_entries = concatMap desugarVectorSpec $ filter is_primop entries
         tagOf_type = "tagOf_PrimOp :: PrimOp -> FastInt"
         f i n = "tagOf_PrimOp " ++ cons i ++ " = _ILIT(" ++ show n ++ ")"
         max_def_type = "maxPrimOpTag :: Int"
         max_def      = "maxPrimOpTag = " ++ show (length primop_entries)
 
 gen_data_decl :: Info -> String
-gen_data_decl (Info _ entries)
-   = let conss = map cons (filter is_primop entries)
-     in  "data PrimOp\n   = " ++ head conss ++ "\n"
-         ++ unlines (map ("   | "++) (tail conss))
+gen_data_decl (Info _ entries) =
+    "data PrimOp\n   = " ++ head conss ++ "\n"
+     ++ unlines (map ("   | "++) (tail conss))
+  where
+    conss = map genCons (filter is_primop entries)
+            
+    genCons :: Entry -> String
+    genCons entry =
+        case vecOptions entry of
+          [] -> cons entry
+          _  -> cons entry ++ " PrimOpVecCat Length Width"
 
 gen_switch_from_attribs :: String -> String -> Info -> String
 gen_switch_from_attribs attrib_name fn_name (Info defaults entries)
@@ -609,12 +801,15 @@ gen_switch_from_attribs attrib_name fn_name (Info defaults entries)
          getAltRhs (OptionTrue _)     = "True"
          getAltRhs (OptionInteger _ i) = show i
          getAltRhs (OptionString _ s) = s
+         getAltRhs (OptionVector _) = "True"
          getAltRhs (OptionFixity mf) = show mf
 
          mkAlt po
             = case lookup_attrib attrib_name (opts po) of
                  Nothing -> Nothing
-                 Just xx -> Just (fn_name ++ " " ++ cons po ++ " = " ++ getAltRhs xx)
+                 Just xx -> case vecOptions po of
+                              [] -> Just (fn_name ++ " " ++ cons po ++ " = " ++ getAltRhs xx)
+                              _  -> Just (fn_name ++ " (" ++ cons po ++ " _ _ _) = " ++ getAltRhs xx)
 
      in
          case defv of
@@ -629,7 +824,7 @@ gen_switch_from_attribs attrib_name fn_name (Info defaults entries)
 
 gen_primop_info :: Info -> String
 gen_primop_info (Info _ entries)
-   = unlines (map mkPOItext (filter is_primop entries))
+   = unlines (map mkPOItext (concatMap desugarVectorSpec (filter is_primop entries)))
 
 mkPOItext :: Entry -> String
 mkPOItext i = mkPOI_LHS_text i ++ mkPOI_RHS_text i
@@ -677,29 +872,25 @@ ppTyVar "o" = "openAlphaTyVar"
 ppTyVar _   = error "Unknown type var"
 
 ppType :: Ty -> String
-ppType (TyApp "Any"         []) = "anyTy"
-ppType (TyApp "Bool"        []) = "boolTy"
-
-ppType (TyApp "Int#"        []) = "intPrimTy"
-ppType (TyApp "Int32#"      []) = "int32PrimTy"
-ppType (TyApp "Int64#"      []) = "int64PrimTy"
-ppType (TyApp "Char#"       []) = "charPrimTy"
-ppType (TyApp "Word#"       []) = "wordPrimTy"
-ppType (TyApp "Word32#"     []) = "word32PrimTy"
-ppType (TyApp "Word64#"     []) = "word64PrimTy"
-ppType (TyApp "Addr#"       []) = "addrPrimTy"
-ppType (TyApp "Float#"      []) = "floatPrimTy"
-ppType (TyApp "Double#"     []) = "doublePrimTy"
-ppType (TyApp "FloatX4#"    []) = "floatX4PrimTy"
-ppType (TyApp "DoubleX2#"   []) = "doubleX2PrimTy"
-ppType (TyApp "Int32X4#"    []) = "int32X4PrimTy"
-ppType (TyApp "Int64X2#"    []) = "int64X2PrimTy"
-ppType (TyApp "ByteArray#"  []) = "byteArrayPrimTy"
-ppType (TyApp "RealWorld"   []) = "realWorldTy"
-ppType (TyApp "ThreadId#"   []) = "threadIdPrimTy"
-ppType (TyApp "ForeignObj#" []) = "foreignObjPrimTy"
-ppType (TyApp "BCO#"        []) = "bcoPrimTy"
-ppType (TyApp "()"          []) = "unitTy"      -- unitTy is TysWiredIn's name for ()
+ppType (TyApp (TyCon "Any")         []) = "anyTy"
+ppType (TyApp (TyCon "Bool")        []) = "boolTy"
+
+ppType (TyApp (TyCon "Int#")        []) = "intPrimTy"
+ppType (TyApp (TyCon "Int32#")      []) = "int32PrimTy"
+ppType (TyApp (TyCon "Int64#")      []) = "int64PrimTy"
+ppType (TyApp (TyCon "Char#")       []) = "charPrimTy"
+ppType (TyApp (TyCon "Word#")       []) = "wordPrimTy"
+ppType (TyApp (TyCon "Word32#")     []) = "word32PrimTy"
+ppType (TyApp (TyCon "Word64#")     []) = "word64PrimTy"
+ppType (TyApp (TyCon "Addr#")       []) = "addrPrimTy"
+ppType (TyApp (TyCon "Float#")      []) = "floatPrimTy"
+ppType (TyApp (TyCon "Double#")     []) = "doublePrimTy"
+ppType (TyApp (TyCon "ByteArray#")  []) = "byteArrayPrimTy"
+ppType (TyApp (TyCon "RealWorld")   []) = "realWorldTy"
+ppType (TyApp (TyCon "ThreadId#")   []) = "threadIdPrimTy"
+ppType (TyApp (TyCon "ForeignObj#") []) = "foreignObjPrimTy"
+ppType (TyApp (TyCon "BCO#")        []) = "bcoPrimTy"
+ppType (TyApp (TyCon "()")          []) = "unitTy"      -- unitTy is TysWiredIn's name for ()
 
 ppType (TyVar "a")                      = "alphaTy"
 ppType (TyVar "b")                      = "betaTy"
@@ -707,28 +898,31 @@ ppType (TyVar "c")                      = "gammaTy"
 ppType (TyVar "s")                      = "deltaTy"
 ppType (TyVar "o")                      = "openAlphaTy"
 
-ppType (TyApp "State#" [x])             = "mkStatePrimTy " ++ ppType x
-ppType (TyApp "MutVar#" [x,y])          = "mkMutVarPrimTy " ++ ppType x 
-                                          ++ " " ++ ppType y
-ppType (TyApp "MutableArray#" [x,y])    = "mkMutableArrayPrimTy " ++ ppType x
+ppType (TyApp (TyCon "State#") [x])             = "mkStatePrimTy " ++ ppType x
+ppType (TyApp (TyCon "MutVar#") [x,y])          = "mkMutVarPrimTy " ++ ppType x 
+                                                   ++ " " ++ ppType y
+ppType (TyApp (TyCon "MutableArray#") [x,y])    = "mkMutableArrayPrimTy " ++ ppType x
+                                                   ++ " " ++ ppType y
+ppType (TyApp (TyCon "MutableArrayArray#") [x]) = "mkMutableArrayArrayPrimTy " ++ ppType x
+ppType (TyApp (TyCon "MutableByteArray#") [x])  = "mkMutableByteArrayPrimTy " 
+                                                   ++ ppType x
+ppType (TyApp (TyCon "Array#") [x])             = "mkArrayPrimTy " ++ ppType x
+ppType (TyApp (TyCon "ArrayArray#") [])         = "mkArrayArrayPrimTy"
+
+
+ppType (TyApp (TyCon "Weak#")       [x]) = "mkWeakPrimTy " ++ ppType x
+ppType (TyApp (TyCon "StablePtr#")  [x]) = "mkStablePtrPrimTy " ++ ppType x
+ppType (TyApp (TyCon "StableName#") [x]) = "mkStableNamePrimTy " ++ ppType x
+
+ppType (TyApp (TyCon "MVar#") [x,y])     = "mkMVarPrimTy " ++ ppType x 
+                                           ++ " " ++ ppType y
+ppType (TyApp (TyCon "TVar#") [x,y])     = "mkTVarPrimTy " ++ ppType x 
                                            ++ " " ++ ppType y
-ppType (TyApp "MutableArrayArray#" [x]) = "mkMutableArrayArrayPrimTy " ++ ppType x
-ppType (TyApp "MutableByteArray#" [x])  = "mkMutableByteArrayPrimTy " 
-                                          ++ ppType x
-ppType (TyApp "Array#" [x])             = "mkArrayPrimTy " ++ ppType x
-ppType (TyApp "ArrayArray#" [])         = "mkArrayArrayPrimTy"
-
-
-ppType (TyApp "Weak#"  [x])      = "mkWeakPrimTy " ++ ppType x
-ppType (TyApp "StablePtr#"  [x])      = "mkStablePtrPrimTy " ++ ppType x
-ppType (TyApp "StableName#"  [x])      = "mkStableNamePrimTy " ++ ppType x
-
-ppType (TyApp "MVar#" [x,y])     = "mkMVarPrimTy " ++ ppType x 
-                                   ++ " " ++ ppType y
-ppType (TyApp "TVar#" [x,y])     = "mkTVarPrimTy " ++ ppType x 
-                                   ++ " " ++ ppType y
-ppType (TyUTup ts)               = "(mkTupleTy UnboxedTuple " 
-                                   ++ listify (map ppType ts) ++ ")"
+
+ppType (TyApp (VecTyCon _ pptc) [])      = pptc
+
+ppType (TyUTup ts) = "(mkTupleTy UnboxedTuple " 
+                     ++ listify (map ppType ts) ++ ")"
 
 ppType (TyF s d) = "(mkFunTy (" ++ ppType s ++ ") (" ++ ppType d ++ "))"
 ppType (TyC s d) = "(mkFunTy (" ++ ppType s ++ ") (" ++ ppType d ++ "))"
diff --git a/utils/genprimopcode/Parser.y b/utils/genprimopcode/Parser.y
index eb76cb0407..07ef03b986 100644
--- a/utils/genprimopcode/Parser.y
+++ b/utils/genprimopcode/Parser.y
@@ -32,6 +32,10 @@ import Syntax
     '#)'            { THashCloseParen }
     '{'             { TOpenBrace }
     '}'             { TCloseBrace }
+    '['             { TOpenBracket }
+    ']'             { TCloseBracket }
+    '<'             { TOpenAngle }
+    '>'             { TCloseAngle }
     section         { TSection }
     primop          { TPrimop }
     pseudoop        { TPseudoop }
@@ -50,6 +54,10 @@ import Syntax
     infixl          { TInfixL }
     infixr          { TInfixR }
     nothing         { TNothing }
+    vector          { TVector }
+    SCALAR          { TSCALAR }
+    VECTOR          { TVECTOR }
+    VECTUPLE        { TVECTUPLE }
     thats_all_folks { TThatsAllFolks }
     lowerName       { TLowerName $$ }
     upperName       { TUpperName $$ }
@@ -74,6 +82,7 @@ pOption : lowerName '=' false               { OptionFalse  $1 }
         | lowerName '=' true                { OptionTrue   $1 }
         | lowerName '=' pStuffBetweenBraces { OptionString $1 $3 }
         | lowerName '=' integer             { OptionInteger $1 $3 }
+        | vector    '=' pVectorTemplate     { OptionVector $3 }
         | fixity    '=' pInfix              { OptionFixity $3 }
 
 pInfix :: { Maybe Fixity }
@@ -147,6 +156,17 @@ pInside :: { String }
 pInside : '{' pInsides '}' { "{" ++ $2 ++ "}" }
         | noBraces         { $1 }
 
+pVectorTemplate :: { [(String, String, Int)] }
+pVectorTemplate : '[' pVectors ']' { $2 }
+
+pVectors :: { [(String, String, Int)] }
+pVectors : pVector ',' pVectors { [$1] ++ $3 }
+         | pVector              { [$1] }
+         | {- empty -}          { [] }
+
+pVector :: { (String, String, Int) }
+pVector : '<' upperName ',' upperName ',' integer '>' { ($2, $4, $6) }
+ 
 pType :: { Ty }
 pType : paT '->' pType { TyF $1 $3 }
       | paT '=>' pType { TyC $1 $3 }
@@ -175,9 +195,12 @@ ppT :: { Ty }
 ppT : lowerName { TyVar $1 }
     | pTycon    { TyApp $1 [] }
 
-pTycon :: { String }
-pTycon : upperName { $1 }
-       | '(' ')'   { "()" }
+pTycon :: { TyCon }
+pTycon : upperName { TyCon $1 }
+       | '(' ')'   { TyCon "()" }
+       | SCALAR    { SCALAR }
+       | VECTOR    { VECTOR }
+       | VECTUPLE  { VECTUPLE }
 
 {
 parse :: String -> Either String Info
diff --git a/utils/genprimopcode/ParserM.hs b/utils/genprimopcode/ParserM.hs
index 8093675651..aaaf6ac66f 100644
--- a/utils/genprimopcode/ParserM.hs
+++ b/utils/genprimopcode/ParserM.hs
@@ -67,6 +67,10 @@ data Token = TEOF
            | THashCloseParen
            | TOpenBrace
            | TCloseBrace
+           | TOpenBracket
+           | TCloseBracket
+           | TOpenAngle
+           | TCloseAngle
            | TSection
            | TPrimop
            | TPseudoop
@@ -91,6 +95,10 @@ data Token = TEOF
            | TInfixL
            | TInfixR
            | TNothing
+           | TVector
+           | TSCALAR
+           | TVECTOR
+           | TVECTUPLE
     deriving Show
 
 -- Actions
diff --git a/utils/genprimopcode/Syntax.hs b/utils/genprimopcode/Syntax.hs
index 333ea2c4c7..d0c380cf59 100644
--- a/utils/genprimopcode/Syntax.hs
+++ b/utils/genprimopcode/Syntax.hs
@@ -19,6 +19,15 @@ data Entry
                    cat   :: Category,    -- category
                    desc  :: String,      -- description
                    opts  :: [Option] }   -- default overrides
+    | PrimVecOpSpec { cons    :: String,    -- PrimOp name
+                      name    :: String,    -- name in prog text
+                      prefix  :: String,    -- prefix for generated names
+                      veclen  :: Int,       -- vector length
+                      elemrep :: String,    -- vector ElemRep
+                      ty      :: Ty,        -- type
+                      cat     :: Category,  -- category
+                      desc    :: String,    -- description
+                      opts    :: [Option] } -- default overrides
     | PseudoOpSpec { name  :: String,      -- name in prog text
                      ty    :: Ty,          -- type
                      desc  :: String,      -- description
@@ -29,6 +38,12 @@ data Entry
     | PrimClassSpec { cls   :: Ty,      -- name in prog text
                       desc  :: String,      -- description
                       opts  :: [Option] }   -- default overrides
+    | PrimVecTypeSpec { ty    :: Ty,      -- name in prog text
+                        prefix  :: String,    -- prefix for generated names
+                        veclen  :: Int,       -- vector length
+                        elemrep :: String,    -- vector ElemRep
+                        desc  :: String,      -- description
+                        opts  :: [Option] }   -- default overrides
     | Section { title :: String,         -- section title
                 desc  :: String }        -- description
     deriving Show
@@ -37,12 +52,17 @@ is_primop :: Entry -> Bool
 is_primop (PrimOpSpec _ _ _ _ _ _) = True
 is_primop _ = False
 
+is_primtype :: Entry -> Bool
+is_primtype (PrimTypeSpec {}) = True
+is_primtype _ = False
+
 -- a binding of property to value
 data Option
    = OptionFalse  String          -- name = False
    | OptionTrue   String          -- name = True
    | OptionString String String   -- name = { ... unparsed stuff ... }
    | OptionInteger String Int     -- name = <int>
+   | OptionVector [(String,String,Int)]  -- name = [(,...),...]
    | OptionFixity (Maybe Fixity)  -- fixity = infix{,l,r} <int> | Nothing
      deriving Show
 
@@ -62,7 +82,20 @@ data Ty
    deriving (Eq,Show)
 
 type TyVar = String
-type TyCon = String
+
+data TyCon = TyCon String
+           | SCALAR
+           | VECTOR
+           | VECTUPLE
+           | VecTyCon String String
+  deriving (Eq, Ord)
+
+instance Show TyCon where
+    show (TyCon tc)      = tc
+    show SCALAR          = "SCALAR"
+    show VECTOR          = "VECTOR"
+    show VECTUPLE        = "VECTUPLE"
+    show (VecTyCon tc _) = tc
 
 -- Follow definitions of Fixity and FixityDirection in GHC
 
@@ -118,7 +151,7 @@ sanityPrimOp def_names p
 
 sane_ty :: Category -> Ty -> Bool
 sane_ty Compare (TyF t1 (TyF t2 td)) 
-   | t1 == t2 && td == TyApp "Int#" []  = True
+   | t1 == t2 && td == TyApp (TyCon "Int#") []  = True
 sane_ty Monadic (TyF t1 td) 
    | t1 == td  = True
 sane_ty Dyadic (TyF t1 (TyF t2 td))
@@ -133,6 +166,7 @@ get_attrib_name (OptionFalse nm) = nm
 get_attrib_name (OptionTrue nm)  = nm
 get_attrib_name (OptionString nm _) = nm
 get_attrib_name (OptionInteger nm _) = nm
+get_attrib_name (OptionVector _) = "vector"
 get_attrib_name (OptionFixity _) = "fixity"
 
 lookup_attrib :: String -> [Option] -> Maybe Option
@@ -140,3 +174,7 @@ lookup_attrib _ [] = Nothing
 lookup_attrib nm (a:as) 
     = if get_attrib_name a == nm then Just a else lookup_attrib nm as
 
+is_vector :: Entry -> Bool
+is_vector i = case lookup_attrib "vector" (opts i) of
+                Nothing -> False
+                _       -> True
author	Geoffrey Mainland <gmainlan@microsoft.com>	2013-09-23 00:22:19 -0400
committer	Geoffrey Mainland <gmainlan@microsoft.com>	2013-09-23 00:22:19 -0400
commit	680441de191145dd874bd453e09e4ee906d87bbb (patch)
tree	0246607039802b6270563cfb3c3751cb554cdc74
parent	6e6e6f5bfdfc3996603064523af6e2be2c5131fa (diff)
parent	25eeb6782a8f8cfdd3d8e9515863007c609eafc7 (diff)
download	haskell-680441de191145dd874bd453e09e4ee906d87bbb.tar.gz