From f792079cf981cdcbc3278532d423ac95eed32bf4 Mon Sep 17 00:00:00 2001
From: Geoffrey Mainland <gmainlan@microsoft.com>
Date: Thu, 14 Feb 2013 18:52:00 +0000
Subject: Add support for 256-bit-wide vectors.

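---
Review note (placed below the `---`, so not part of the commit message):
defining ARG_V32 as 10 in includes/rts/storage/FunTypes.h renumbers every
later ARG_* constant (ARG_NN through ARG_PPPPPPPP each move up by one). The
new [V32] entry in utils/genapply/GenApply.hs stackApplyTypes is inserted at
the matching position, between [V16] and [N,N]. Anything compiled against
the old numeric values presumably has to be rebuilt together with this
change (TODO confirm).
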
 compiler/cmm/CmmType.hs          |  5 +++--
 compiler/codeGen/StgCmmArgRep.hs | 11 ++++++++---
 compiler/codeGen/StgCmmLayout.hs |  1 +
 compiler/ghci/ByteCodeAsm.lhs    |  2 ++
 includes/Cmm.h                   |  1 +
 includes/rts/storage/FunTypes.h  | 35 ++++++++++++++++++-----------------
 includes/stg/MiscClosures.h      |  2 ++
 rts/Linker.c                     |  3 +++
 utils/genapply/GenApply.hs       |  6 ++++++
 9 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/compiler/cmm/CmmType.hs b/compiler/cmm/CmmType.hs
index 98e40534f8..76de02bdac 100644
--- a/compiler/cmm/CmmType.hs
+++ b/compiler/cmm/CmmType.hs
@@ -21,7 +21,7 @@ module CmmType
 
     , Length
     , vec, vec2, vec4, vec8, vec16
-    , vec2f64, vec2b64, vec4f32, vec4b32, vec8b16, vec16b8
+    , vec2f64, vec4f64, vec2b64, vec4f32, vec4b32, vec8b16, vec16b8
     , cmmVec
     , vecLength, vecElemType
     , isVecType
@@ -285,9 +285,10 @@ vec4  = vec 4
 vec8  = vec 8
 vec16 = vec 16
 
-vec2f64, vec2b64, vec4f32, vec4b32, vec8b16, vec16b8 :: CmmType
+vec2f64, vec4f64, vec2b64, vec4f32, vec4b32, vec8b16, vec16b8 :: CmmType
 vec2f64 = vec 2 f64
 vec2b64 = vec 2 b64
+vec4f64 = vec 4 f64
 vec4f32 = vec 4 f32
 vec4b32 = vec 4 b32
 vec8b16 = vec 8 b16
diff --git a/compiler/codeGen/StgCmmArgRep.hs b/compiler/codeGen/StgCmmArgRep.hs
index bd228d4617..26569cffdd 100644
--- a/compiler/codeGen/StgCmmArgRep.hs
+++ b/compiler/codeGen/StgCmmArgRep.hs
@@ -47,6 +47,7 @@ data ArgRep = P   -- GC Ptr
             | F   -- Float
             | D   -- Double
             | V16 -- 16-byte (128-bit) vectors of Float/Double/Int8/Word32/etc.
+            | V32 -- 32-byte (256-bit) vectors of Float/Double/Int8/Word32/etc.
 instance Outputable ArgRep where ppr = text . argRepString
 
 argRepString :: ArgRep -> String
@@ -57,6 +58,7 @@ argRepString V = "V"
 argRepString F = "F"
 argRepString D = "D"
 argRepString V16 = "V16"
+argRepString V32 = "V32"
 
 toArgRep :: PrimRep -> ArgRep
 toArgRep VoidRep           = V
@@ -68,9 +70,10 @@ toArgRep Int64Rep          = L
 toArgRep Word64Rep         = L
 toArgRep FloatRep          = F
 toArgRep DoubleRep         = D
-toArgRep (VecRep len elem)
-    | len*primElemRepSizeB elem == 16 = V16
-    | otherwise                       = error "toArgRep: bad vector primrep"
+toArgRep (VecRep len elem) = case len*primElemRepSizeB elem of
+                               16 -> V16
+                               32 -> V32
+                               _  -> error "toArgRep: bad vector primrep"
 
 isNonV :: ArgRep -> Bool
 isNonV V = False
@@ -84,6 +87,7 @@ argRepSizeW dflags L   = wORD64_SIZE        `quot` wORD_SIZE dflags
 argRepSizeW dflags D   = dOUBLE_SIZE dflags `quot` wORD_SIZE dflags
 argRepSizeW _      V   = 0
 argRepSizeW dflags V16 = 16                 `quot` wORD_SIZE dflags
+argRepSizeW dflags V32 = 32                 `quot` wORD_SIZE dflags
 
 idArgRep :: Id -> ArgRep
 idArgRep = toArgRep . idPrimRep
@@ -132,4 +136,5 @@ slowCallPattern (F: _)                = (fsLit "stg_ap_f", 1)
 slowCallPattern (D: _)                = (fsLit "stg_ap_d", 1)
 slowCallPattern (L: _)                = (fsLit "stg_ap_l", 1)
 slowCallPattern (V16: _)              = (fsLit "stg_ap_v16", 1)
+slowCallPattern (V32: _)              = (fsLit "stg_ap_v32", 1)
 slowCallPattern []                    = (fsLit "stg_ap_0", 0)
diff --git a/compiler/codeGen/StgCmmLayout.hs b/compiler/codeGen/StgCmmLayout.hs
index 06a47c151b..a74b62ce82 100644
--- a/compiler/codeGen/StgCmmLayout.hs
+++ b/compiler/codeGen/StgCmmLayout.hs
@@ -394,6 +394,7 @@ stdPattern reps
 	[D]   -> Just ARG_D
 	[L]   -> Just ARG_L
 	[V16] -> Just ARG_V16
+	[V32] -> Just ARG_V32
 
 	[N,N] -> Just ARG_NN
 	[N,P] -> Just ARG_NP
diff --git a/compiler/ghci/ByteCodeAsm.lhs b/compiler/ghci/ByteCodeAsm.lhs
index 9906467186..7579d7a064 100644
--- a/compiler/ghci/ByteCodeAsm.lhs
+++ b/compiler/ghci/ByteCodeAsm.lhs
@@ -446,6 +446,7 @@ push_alts L   = bci_PUSH_ALTS_L
 push_alts F   = bci_PUSH_ALTS_F
 push_alts D   = bci_PUSH_ALTS_D
 push_alts V16 = error "push_alts: vector"
+push_alts V32 = error "push_alts: vector"
 
 return_ubx :: ArgRep -> Word16
 return_ubx V   = bci_RETURN_V
@@ -455,6 +456,7 @@ return_ubx L   = bci_RETURN_L
 return_ubx F   = bci_RETURN_F
 return_ubx D   = bci_RETURN_D
 return_ubx V16 = error "return_ubx: vector"
+return_ubx V32 = error "return_ubx: vector"
 
 -- Make lists of host-sized words for literals, so that when the
 -- words are placed in memory at increasing addresses, the
diff --git a/includes/Cmm.h b/includes/Cmm.h
index 89baaa0987..ae45fd4ded 100644
--- a/includes/Cmm.h
+++ b/includes/Cmm.h
@@ -99,6 +99,7 @@
 #define D_   float64
 #define L_   bits64
 #define V16_ bits128
+#define V32_ bits256
 
 #define SIZEOF_StgDouble 8
 #define SIZEOF_StgWord64 8
diff --git a/includes/rts/storage/FunTypes.h b/includes/rts/storage/FunTypes.h
index 0ba65bb79d..744e8241f2 100644
--- a/includes/rts/storage/FunTypes.h
+++ b/includes/rts/storage/FunTypes.h
@@ -34,22 +34,23 @@
 #define ARG_D        7 
 #define ARG_L        8 
 #define ARG_V16      9 
-#define ARG_NN       10 
-#define ARG_NP       11
-#define ARG_PN       12
-#define ARG_PP       13
-#define ARG_NNN      14
-#define ARG_NNP      15
-#define ARG_NPN      16
-#define ARG_NPP      17
-#define ARG_PNN      18
-#define ARG_PNP      19
-#define ARG_PPN      20
-#define ARG_PPP      21
-#define ARG_PPPP     22
-#define ARG_PPPPP    23
-#define ARG_PPPPPP   24
-#define ARG_PPPPPPP  25
-#define ARG_PPPPPPPP 26
+#define ARG_V32      10
+#define ARG_NN       11 
+#define ARG_NP       12
+#define ARG_PN       13
+#define ARG_PP       14
+#define ARG_NNN      15
+#define ARG_NNP      16
+#define ARG_NPN      17
+#define ARG_NPP      18
+#define ARG_PNN      19
+#define ARG_PNP      20
+#define ARG_PPN      21
+#define ARG_PPP      22
+#define ARG_PPPP     23
+#define ARG_PPPPP    24
+#define ARG_PPPPPP   25
+#define ARG_PPPPPPP  26
+#define ARG_PPPPPPPP 27
 
 #endif /* RTS_STORAGE_FUNTYPES_H */
diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h
index 8717687f3e..3ce8683a3e 100644
--- a/includes/stg/MiscClosures.h
+++ b/includes/stg/MiscClosures.h
@@ -225,6 +225,7 @@ RTS_RET(stg_ap_f);
 RTS_RET(stg_ap_d);
 RTS_RET(stg_ap_l);
 RTS_RET(stg_ap_v16);
+RTS_RET(stg_ap_v32);
 RTS_RET(stg_ap_n);
 RTS_RET(stg_ap_p);
 RTS_RET(stg_ap_pv);
@@ -242,6 +243,7 @@ RTS_FUN_DECL(stg_ap_f_fast);
 RTS_FUN_DECL(stg_ap_d_fast);
 RTS_FUN_DECL(stg_ap_l_fast);
 RTS_FUN_DECL(stg_ap_v16_fast);
+RTS_FUN_DECL(stg_ap_v32_fast);
 RTS_FUN_DECL(stg_ap_n_fast);
 RTS_FUN_DECL(stg_ap_p_fast);
 RTS_FUN_DECL(stg_ap_pv_fast);
diff --git a/rts/Linker.c b/rts/Linker.c
index 43edde23f8..aa1d3d6ccb 100644
--- a/rts/Linker.c
+++ b/rts/Linker.c
@@ -882,6 +882,7 @@ typedef struct _RtsSymbolVal {
       SymI_HasProto(stg_ap_d_ret)                       \
       SymI_HasProto(stg_ap_l_ret)                       \
       SymI_HasProto(stg_ap_v16_ret)                     \
+      SymI_HasProto(stg_ap_v32_ret)                     \
       SymI_HasProto(stg_ap_n_ret)                       \
       SymI_HasProto(stg_ap_p_ret)                       \
       SymI_HasProto(stg_ap_pv_ret)                      \
@@ -1244,6 +1245,7 @@ typedef struct _RtsSymbolVal {
       SymI_HasProto(stg_ap_d_info)                                      \
       SymI_HasProto(stg_ap_l_info)                                      \
       SymI_HasProto(stg_ap_v16_info)                                    \
+      SymI_HasProto(stg_ap_v32_info)                                    \
       SymI_HasProto(stg_ap_n_info)                                      \
       SymI_HasProto(stg_ap_p_info)                                      \
       SymI_HasProto(stg_ap_pv_info)                                     \
@@ -1260,6 +1262,7 @@ typedef struct _RtsSymbolVal {
       SymI_HasProto(stg_ap_d_fast)                                      \
       SymI_HasProto(stg_ap_l_fast)                                      \
       SymI_HasProto(stg_ap_v16_fast)                                    \
+      SymI_HasProto(stg_ap_v32_fast)                                    \
       SymI_HasProto(stg_ap_n_fast)                                      \
       SymI_HasProto(stg_ap_p_fast)                                      \
       SymI_HasProto(stg_ap_pv_fast)                                     \
diff --git a/utils/genapply/GenApply.hs b/utils/genapply/GenApply.hs
index 2baf85896a..036a8479a4 100644
--- a/utils/genapply/GenApply.hs
+++ b/utils/genapply/GenApply.hs
@@ -33,6 +33,7 @@ data ArgRep
   | D   -- double
   | L   -- long (64-bit)
   | V16 -- 16-byte (128-bit) vectors
+  | V32 -- 32-byte (256-bit) vectors
 
 -- size of a value in *words*
 argSize :: ArgRep -> Int
@@ -43,6 +44,7 @@ argSize F   = 1
 argSize D   = (SIZEOF_DOUBLE `quot` SIZEOF_VOID_P :: Int)
 argSize L   = (8 `quot` SIZEOF_VOID_P :: Int)
 argSize V16 = (16 `quot` SIZEOF_VOID_P :: Int)
+argSize V32 = (32 `quot` SIZEOF_VOID_P :: Int)
 
 showArg :: ArgRep -> String
 showArg N   = "n"
@@ -52,6 +54,7 @@ showArg F   = "f"
 showArg D   = "d"
 showArg L   = "l"
 showArg V16 = "v16"
+showArg V32 = "v32"
 
 -- is a value a pointer?
 isPtr :: ArgRep -> Bool
@@ -504,6 +507,7 @@ argRep D   = text "D_"
 argRep L   = text "L_"
 argRep P   = text "gcptr"
 argRep V16 = text "V16_"
+argRep V32 = text "V32_"
 argRep _   = text "W_"
 
 genApply regstatus args =
@@ -854,6 +858,7 @@ applyTypes = [
         [D],
         [L],
         [V16],
+        [V32],
         [N],
         [P],
         [P,V],
@@ -882,6 +887,7 @@ stackApplyTypes = [
         [D],
         [L],
         [V16],
+        [V32],
         [N,N],
         [N,P],
         [P,N],
-- 
cgit v1.2.1