Remove SIMD conversions

This patch makes it so that packing/unpacking SIMD vectors always uses the correctly sized types, e.g. unpacking a Word16X8# will give a tuple of Word16#s. As a result, we can get rid of the conversion instructions that were previously required. Fixes #22296. (cherry picked from commit 6d7d91817795d7ee7f45557411368a1738daa488)
author: sheaf <sam.derbyshire@gmail.com> 2022-10-14 14:31:15 +0200
committer: Matthew Pickering <matthewtpickering@gmail.com> 2022-12-13 09:30:47 +0000
commit: 19d7a5fc8f5c9f421987935bc4e9bc9f79f3c445 (patch)
tree: cf179a42546f107e63fabfc58b0a3c28984aad12
parent: 7a40261fd963c2b8895afd34bf6a7969d5e474cf (diff)
download: haskell-19d7a5fc8f5c9f421987935bc4e9bc9f79f3c445.tar.gz
6 files changed, 76 insertions, 72 deletions
diff --git a/compiler/GHC/Builtin/primops.txt.pp b/compiler/GHC/Builtin/primops.txt.pp
index 625c512cc6..a3dcd50f06 100644
--- a/compiler/GHC/Builtin/primops.txt.pp
+++ b/compiler/GHC/Builtin/primops.txt.pp
@@ -3585,9 +3585,9 @@ section "SIMD Vectors"
   [<Int8,Int8#,16>,<Int16,Int16#,8>,<Int32,Int32#,4>,<Int64,Int64#,2> \
   ,<Int8,Int8#,32>,<Int16,Int16#,16>,<Int32,Int32#,8>,<Int64,Int64#,4> \
   ,<Int8,Int8#,64>,<Int16,Int16#,32>,<Int32,Int32#,16>,<Int64,Int64#,8> \
-  ,<Word8,Word#,16>,<Word16,Word#,8>,<Word32,Word32#,4>,<Word64,Word64#,2> \
-  ,<Word8,Word#,32>,<Word16,Word#,16>,<Word32,Word32#,8>,<Word64,Word64#,4> \
-  ,<Word8,Word#,64>,<Word16,Word#,32>,<Word32,Word32#,16>,<Word64,Word64#,8> \
+  ,<Word8,Word8#,16>,<Word16,Word16#,8>,<Word32,Word32#,4>,<Word64,Word64#,2> \
+  ,<Word8,Word8#,32>,<Word16,Word16#,16>,<Word32,Word32#,8>,<Word64,Word64#,4> \
+  ,<Word8,Word8#,64>,<Word16,Word16#,32>,<Word32,Word32#,16>,<Word64,Word64#,8> \
   ,<Float,Float#,4>,<Double,Double#,2> \
   ,<Float,Float#,8>,<Double,Double#,4> \
   ,<Float,Float#,16>,<Double,Double#,8>]
@@ -3609,9 +3609,9 @@ section "SIMD Vectors"
   [<Int8,Int8#,16>,<Int16,Int16#,8>,<Int32,Int32#,4>,<Int64,Int64#,2> \
   ,<Int8,Int8#,32>,<Int16,Int16#,16>,<Int32,Int32#,8>,<Int64,Int64#,4> \
   ,<Int8,Int8#,64>,<Int16,Int16#,32>,<Int32,Int32#,16>,<Int64,Int64#,8> \
-  ,<Word8,Word#,16>,<Word16,Word#,8>,<Word32,Word32#,4>,<Word64,Word64#,2> \
-  ,<Word8,Word#,32>,<Word16,Word#,16>,<Word32,Word32#,8>,<Word64,Word64#,4> \
-  ,<Word8,Word#,64>,<Word16,Word#,32>,<Word32,Word32#,16>,<Word64,Word64#,8>]
+  ,<Word8,Word8#,16>,<Word16,Word16#,8>,<Word32,Word32#,4>,<Word64,Word64#,2> \
+  ,<Word8,Word8#,32>,<Word16,Word16#,16>,<Word32,Word32#,8>,<Word64,Word64#,4> \
+  ,<Word8,Word8#,64>,<Word16,Word16#,32>,<Word32,Word32#,16>,<Word64,Word64#,8>]
 
 primtype VECTOR
    with llvm_only = True
diff --git a/compiler/GHC/Cmm/MachOp.hs b/compiler/GHC/Cmm/MachOp.hs
index 0bd3ac1111..632165b6b2 100644
--- a/compiler/GHC/Cmm/MachOp.hs
+++ b/compiler/GHC/Cmm/MachOp.hs
@@ -514,8 +514,11 @@ machOpArgReps platform op =
     MO_FS_Conv from _   -> [from]
     MO_FF_Conv from _   -> [from]
 
-    MO_V_Insert  l r    -> [typeWidth (vec l (cmmBits r)),r,wordWidth platform]
-    MO_V_Extract l r    -> [typeWidth (vec l (cmmBits r)),wordWidth platform]
+    MO_V_Insert   l r   -> [typeWidth (vec l (cmmBits r)),r, W32]
+    MO_V_Extract  l r   -> [typeWidth (vec l (cmmBits r)), W32]
+    MO_VF_Insert  l r   -> [typeWidth (vec l (cmmFloat r)),r,W32]
+    MO_VF_Extract l r   -> [typeWidth (vec l (cmmFloat r)),W32]
+      -- SIMD vector indices are always 32 bit
 
     MO_V_Add _ r        -> [r,r]
     MO_V_Sub _ r        -> [r,r]
@@ -528,9 +531,6 @@ machOpArgReps platform op =
     MO_VU_Quot _ r      -> [r,r]
     MO_VU_Rem  _ r      -> [r,r]
 
-    MO_VF_Insert  l r   -> [typeWidth (vec l (cmmFloat r)),r,wordWidth platform]
-    MO_VF_Extract l r   -> [typeWidth (vec l (cmmFloat r)),wordWidth platform]
-
     MO_VF_Add  _ r      -> [r,r]
     MO_VF_Sub  _ r      -> [r,r]
     MO_VF_Mul  _ r      -> [r,r]
diff --git a/compiler/GHC/StgToCmm/Prim.hs b/compiler/GHC/StgToCmm/Prim.hs
index a0e6ecf871..2e646ef4fb 100644
--- a/compiler/GHC/StgToCmm/Prim.hs
+++ b/compiler/GHC/StgToCmm/Prim.hs
@@ -853,7 +853,7 @@ emitPrimOp cfg primop =
 -- SIMD primops
   (VecBroadcastOp vcat n w) -> \[e] -> opIntoRegs $ \[res] -> do
     checkVecCompatibility cfg vcat n w
-    doVecPackOp (vecElemInjectCast platform vcat w) ty zeros (replicate n e) res
+    doVecPackOp ty zeros (replicate n e) res
    where
     zeros :: CmmExpr
     zeros = CmmLit $ CmmVec (replicate n zero)
@@ -871,7 +871,7 @@ emitPrimOp cfg primop =
     checkVecCompatibility cfg vcat n w
     when (es `lengthIsNot` n) $
         panic "emitPrimOp: VecPackOp has wrong number of arguments"
-    doVecPackOp (vecElemInjectCast platform vcat w) ty zeros es res
+    doVecPackOp ty zeros es res
    where
     zeros :: CmmExpr
     zeros = CmmLit $ CmmVec (replicate n zero)
@@ -889,14 +889,14 @@ emitPrimOp cfg primop =
     checkVecCompatibility cfg vcat n w
     when (res `lengthIsNot` n) $
         panic "emitPrimOp: VecUnpackOp has wrong number of results"
-    doVecUnpackOp (vecElemProjectCast platform vcat w) ty arg res
+    doVecUnpackOp ty arg res
    where
     ty :: CmmType
     ty = vecVmmType vcat n w
 
   (VecInsertOp vcat n w) -> \[v,e,i] -> opIntoRegs $ \[res] -> do
     checkVecCompatibility cfg vcat n w
-    doVecInsertOp (vecElemInjectCast platform vcat w) ty v e i res
+    doVecInsertOp ty v e i res
    where
     ty :: CmmType
     ty = vecVmmType vcat n w
@@ -2247,32 +2247,8 @@ vecCmmCat IntVec   = cmmBits
 vecCmmCat WordVec  = cmmBits
 vecCmmCat FloatVec = cmmFloat
 
-vecElemInjectCast :: Platform -> PrimOpVecCat -> Width -> Maybe MachOp
-vecElemInjectCast _        FloatVec _   =  Nothing
-vecElemInjectCast platform   IntVec   W8  =  Just (mo_WordTo8  platform)
-vecElemInjectCast platform   IntVec   W16 =  Just (mo_WordTo16 platform)
-vecElemInjectCast platform   IntVec   W32 =  Just (mo_WordTo32 platform)
-vecElemInjectCast _        IntVec   W64 =  Nothing
-vecElemInjectCast platform   WordVec  W8  =  Just (mo_WordTo8  platform)
-vecElemInjectCast platform   WordVec  W16 =  Just (mo_WordTo16 platform)
-vecElemInjectCast platform   WordVec  W32 =  Just (mo_WordTo32 platform)
-vecElemInjectCast _        WordVec  W64 =  Nothing
-vecElemInjectCast _        _        _   =  Nothing
-
-vecElemProjectCast :: Platform -> PrimOpVecCat -> Width -> Maybe MachOp
-vecElemProjectCast _        FloatVec _   =  Nothing
-vecElemProjectCast platform   IntVec   W8  =  Just (mo_s_8ToWord  platform)
-vecElemProjectCast platform   IntVec   W16 =  Just (mo_s_16ToWord platform)
-vecElemProjectCast platform   IntVec   W32 =  Just (mo_s_32ToWord platform)
-vecElemProjectCast _        IntVec   W64 =  Nothing
-vecElemProjectCast platform   WordVec  W8  =  Just (mo_u_8ToWord  platform)
-vecElemProjectCast platform   WordVec  W16 =  Just (mo_u_16ToWord platform)
-vecElemProjectCast platform   WordVec  W32 =  Just (mo_u_32ToWord platform)
-vecElemProjectCast _        WordVec  W64 =  Nothing
-vecElemProjectCast _        _        _   =  Nothing
-
-
--- NOTE [SIMD Design for the future]
+-- Note [SIMD Design for the future]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 -- Check to make sure that we can generate code for the specified vector type
 -- given the current set of dynamic flags.
 -- Currently these checks are specific to x86 and x86_64 architecture.
@@ -2333,13 +2309,12 @@ checkVecCompatibility cfg vcat l w =
 ------------------------------------------------------------------------------
 -- Helpers for translating vector packing and unpacking.
 
-doVecPackOp :: Maybe MachOp  -- Cast from element to vector component
-            -> CmmType       -- Type of vector
+doVecPackOp :: CmmType       -- Type of vector
             -> CmmExpr       -- Initial vector
             -> [CmmExpr]     -- Elements
             -> CmmFormal     -- Destination for result
             -> FCode ()
-doVecPackOp maybe_pre_write_cast ty z es res = do
+doVecPackOp ty z es res = do
     dst <- newTemp ty
     emitAssign (CmmLocal dst) z
     vecPack dst es 0
@@ -2352,31 +2327,25 @@ doVecPackOp maybe_pre_write_cast ty z es res = do
         dst <- newTemp ty
         if isFloatType (vecElemType ty)
           then emitAssign (CmmLocal dst) (CmmMachOp (MO_VF_Insert len wid)
-                                                    [CmmReg (CmmLocal src), cast e, iLit])
+                                                    [CmmReg (CmmLocal src), e, iLit])
           else emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid)
-                                                    [CmmReg (CmmLocal src), cast e, iLit])
+                                                    [CmmReg (CmmLocal src), e, iLit])
         vecPack dst es (i + 1)
       where
         -- vector indices are always 32-bits
         iLit = CmmLit (CmmInt (toInteger i) W32)
 
-    cast :: CmmExpr -> CmmExpr
-    cast val = case maybe_pre_write_cast of
-                 Nothing   -> val
-                 Just cast -> CmmMachOp cast [val]
-
     len :: Length
     len = vecLength ty
 
     wid :: Width
     wid = typeWidth (vecElemType ty)
 
-doVecUnpackOp :: Maybe MachOp  -- Cast from vector component to element result
-              -> CmmType       -- Type of vector
+doVecUnpackOp :: CmmType       -- Type of vector
               -> CmmExpr       -- Vector
               -> [CmmFormal]   -- Element results
               -> FCode ()
-doVecUnpackOp maybe_post_read_cast ty e res =
+doVecUnpackOp ty e res =
     vecUnpack res 0
   where
     vecUnpack :: [CmmFormal] -> Int -> FCode ()
@@ -2385,46 +2354,36 @@ doVecUnpackOp maybe_post_read_cast ty e res =
 
     vecUnpack (r : rs) i = do
         if isFloatType (vecElemType ty)
-          then emitAssign (CmmLocal r) (cast (CmmMachOp (MO_VF_Extract len wid)
-                                             [e, iLit]))
-          else emitAssign (CmmLocal r) (cast (CmmMachOp (MO_V_Extract len wid)
-                                             [e, iLit]))
+          then emitAssign (CmmLocal r) (CmmMachOp (MO_VF_Extract len wid)
+                                             [e, iLit])
+          else emitAssign (CmmLocal r) (CmmMachOp (MO_V_Extract len wid)
+                                             [e, iLit])
         vecUnpack rs (i + 1)
       where
         -- vector indices are always 32-bits
         iLit = CmmLit (CmmInt (toInteger i) W32)
 
-    cast :: CmmExpr -> CmmExpr
-    cast val = case maybe_post_read_cast of
-                 Nothing   -> val
-                 Just cast -> CmmMachOp cast [val]
-
     len :: Length
     len = vecLength ty
 
     wid :: Width
     wid = typeWidth (vecElemType ty)
 
-doVecInsertOp :: Maybe MachOp  -- Cast from element to vector component
-              -> CmmType       -- Vector type
+doVecInsertOp :: CmmType       -- Vector type
               -> CmmExpr       -- Source vector
               -> CmmExpr       -- Element
               -> CmmExpr       -- Index at which to insert element
               -> CmmFormal     -- Destination for result
               -> FCode ()
-doVecInsertOp maybe_pre_write_cast ty src e idx res = do
+doVecInsertOp ty src e idx res = do
     platform <- getPlatform
     -- vector indices are always 32-bits
     let idx' :: CmmExpr
         idx' = CmmMachOp (MO_SS_Conv (wordWidth platform) W32) [idx]
     if isFloatType (vecElemType ty)
-      then emitAssign (CmmLocal res) (CmmMachOp (MO_VF_Insert len wid) [src, cast e, idx'])
-      else emitAssign (CmmLocal res) (CmmMachOp (MO_V_Insert len wid) [src, cast e, idx'])
+      then emitAssign (CmmLocal res) (CmmMachOp (MO_VF_Insert len wid) [src, e, idx'])
+      else emitAssign (CmmLocal res) (CmmMachOp (MO_V_Insert len wid) [src, e, idx'])
   where
-    cast :: CmmExpr -> CmmExpr
-    cast val = case maybe_pre_write_cast of
-                 Nothing   -> val
-                 Just cast -> CmmMachOp cast [val]
 
     len :: Length
     len = vecLength ty
diff --git a/testsuite/tests/codeGen/should_run/T22296.hs b/testsuite/tests/codeGen/should_run/T22296.hs
new file mode 100644
index 0000000000..d5ea23afda
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/T22296.hs
@@ -0,0 +1,41 @@
+{-# language MagicHash, UnboxedTuples, UnboxedSums #-}
+
+module Main ( main ) where
+
+import GHC.Exts
+import GHC.Int
+import GHC.Word
+
+foo :: Word16X8# -> Integer
+foo w16x8 =
+  case unpackWord16X8# w16x8 of
+    (# w1, w2, w3, w4, w5, w6, w7, w8 #) ->
+      let
+        s = sum $ map fromIntegral
+             [ W16# w1, W16# w2, W16# w3, W16# w4
+             , W16# w5, W16# w6, W16# w7, W16# w8 ]
+      in s
+
+bar :: Int32X4# -> Integer
+bar i32x4 =
+  case unpackInt32X4# i32x4 of
+    (# i1, i2, i3, i4 #) ->
+      let
+        s = sum $ map fromIntegral
+             [ I32# i1, I32# i2, I32# i3, I32# i4 ]
+      in s
+
+baz :: FloatX4# -> Float
+baz fx4 =
+  case unpackFloatX4# fx4 of
+    (# f1, f2, f3, f4 #) ->
+      let
+        s = sum
+             [ F# f1, F# f2, F# f3, F# f4 ]
+      in s
+
+main :: IO ()
+main = do
+  print ( foo ( broadcastWord16X8# ( wordToWord16# 1## ) ) )
+  print ( bar ( broadcastInt32X4#  ( intToInt32# 1# ) ) )
+  print ( baz ( broadcastFloatX4#  ( 1.0# ) ) )
diff --git a/testsuite/tests/codeGen/should_run/T22296.stdout b/testsuite/tests/codeGen/should_run/T22296.stdout
new file mode 100644
index 0000000000..52b2242af3
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/T22296.stdout
@@ -0,0 +1,3 @@
+8
+4
+4.0
diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T
index af3a91d026..7f1e9f6e18 100644
--- a/testsuite/tests/codeGen/should_run/all.T
+++ b/testsuite/tests/codeGen/should_run/all.T
@@ -220,3 +220,4 @@ test('T21141', normal, compile_and_run, [''])
 test('T21186', normal, compile_and_run, [''])
 test('T20640a', normal, compile_and_run, [''])
 test('T20640b', normal, compile_and_run, [''])
+test('T22296',[only_ways(llvm_ways)],compile_and_run,[''])
author	sheaf <sam.derbyshire@gmail.com>	2022-10-14 14:31:15 +0200
committer	Matthew Pickering <matthewtpickering@gmail.com>	2022-12-13 09:30:47 +0000
commit	19d7a5fc8f5c9f421987935bc4e9bc9f79f3c445 (patch)
tree	cf179a42546f107e63fabfc58b0a3c28984aad12
parent	7a40261fd963c2b8895afd34bf6a7969d5e474cf (diff)
download	haskell-19d7a5fc8f5c9f421987935bc4e9bc9f79f3c445.tar.gz