author    | Artem Pyanykh <artem.pyanykh@gmail.com> | 2019-04-04 13:43:38 +0300
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2019-04-09 10:30:13 -0400
commit    | af4cea7f1411e5b99e2417d7c2d3d0e697093103 (patch)
tree      | ec9ef85347e5c8915e864573997c15aaa8cc5a73 /compiler
parent    | 36d380475d9056fdf93305985be3def00aaf6cf7 (diff)
download  | haskell-af4cea7f1411e5b99e2417d7c2d3d0e697093103.tar.gz
codegen: fix memset unroll for small bytearrays, add 64-bit sets
Fixes #16052
When the offset in `setByteArray#` is statically known, we can provide
better alignment guarantees than just 1 byte.

Also, memset can now do 64-bit wide sets.

The memset intrinsic is still not optimal, however, and could be improved
further for the case where we know that we deal with
(baseAddress at known alignment) + offset.

For instance, on 64-bit,

`setByteArray# s 1# 23# 0#`

given that the byte array is 8-byte aligned, could be unrolled into
`movb, movw, movl, movq, movq`; but currently it is a sequence of 23
`movb`s, since an alignment of 1 is all we can encode in the
`MO_Memset` op.
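To make the intended unroll schedule concrete, here is a small self-contained Haskell sketch (an editorial illustration, not part of the patch; `alignOf`, `largestChunk`, and `unrollWidths` are made-up names) that computes the store widths such a schedule would use, picking the widest store the current alignment allows at each step:

```haskell
import Data.Bits ((.&.))

-- Widest power-of-two alignment (up to 8 bytes) implied by an offset
-- from an 8-byte-aligned base; mirrors the patch's byteAlignment.
alignOf :: Integer -> Integer
alignOf x = case x .&. 7 of
    0 -> 8
    4 -> 4
    2 -> 2
    _ -> 1

-- Widest store that does not overshoot the remaining length.
largestChunk :: Integer -> Integer
largestChunk len
    | len >= 8  = 8
    | len >= 4  = 4
    | len >= 2  = 2
    | otherwise = 1

-- Store widths for setting `len` bytes at offset `off` from an
-- 8-byte-aligned base: each store improves the alignment, so later
-- stores can be wider.
unrollWidths :: Integer -> Integer -> [Integer]
unrollWidths off len
    | len <= 0  = []
    | otherwise = w : unrollWidths (off + w) (len - w)
  where
    w = min (alignOf off) (largestChunk len)

-- unrollWidths 1 23 == [1,2,4,8,8], i.e. movb, movw, movl, movq, movq,
-- exactly the sequence described above.
main :: IO ()
main = print (unrollWidths 1 23)
```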
Diffstat (limited to 'compiler')
-rw-r--r-- | compiler/codeGen/StgCmmPrim.hs    | 16
-rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs | 80
-rw-r--r-- | compiler/utils/Util.hs            | 10

3 files changed, 77 insertions, 29 deletions
```diff
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index 4a07c7893e..1abef3a90a 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -2073,10 +2073,18 @@ doCopyAddrToByteArrayOp src_p dst dst_off bytes = do
 -- character.
 doSetByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> FCode ()
-doSetByteArrayOp ba off len c
-    = do dflags <- getDynFlags
-         p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba (arrWordsHdrSize dflags)) off
-         emitMemsetCall p c len 1
+doSetByteArrayOp ba off len c = do
+    dflags <- getDynFlags
+    let maxAlign = wORD_SIZE dflags
+        align = minimum [maxAlign, possibleAlign]
+
+    p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba (arrWordsHdrSize dflags)) off
+
+    emitMemsetCall p c len align
+  where
+    possibleAlign = case off of
+        CmmLit (CmmInt intOff _) -> fromIntegral $ byteAlignment (fromIntegral intOff)
+        _                        -> 1

 -- ----------------------------------------------------------------------------
 -- Allocating arrays
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 0424b1b84f..06ebd2adb5 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -1843,22 +1843,32 @@ genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _
         dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
                    (ImmInteger (n - i))

-genCCall dflags _ (PrimTarget (MO_Memset align)) _
+genCCall dflags is32Bit (PrimTarget (MO_Memset align)) _
          [dst,
           CmmLit (CmmInt c _),
           CmmLit (CmmInt n _)]
          _
-    | fromInteger insns <= maxInlineMemsetInsns dflags && align .&. 3 == 0 = do
+    | fromInteger insns <= maxInlineMemsetInsns dflags = do
         code_dst <- getAnyReg dst
         dst_r <- getNewRegNat format
-        return $ code_dst dst_r `appOL` go dst_r (fromInteger n)
+        if format == II64 && n >= 8 then do
+          code_imm8byte <- getAnyReg (CmmLit (CmmInt c8 W64))
+          imm8byte_r <- getNewRegNat II64
+          return $ code_dst dst_r `appOL`
+                   code_imm8byte imm8byte_r `appOL`
+                   go8 dst_r imm8byte_r (fromInteger n)
+        else
+          return $ code_dst dst_r `appOL`
+                   go4 dst_r (fromInteger n)
   where
-    (format, val) = case align .&. 3 of
-        2 -> (II16, c2)
-        0 -> (II32, c4)
-        _ -> (II8, c)
+    format = case byteAlignment (fromIntegral align) of
+        8 -> if is32Bit then II32 else II64
+        4 -> II32
+        2 -> II16
+        _ -> II8
     c2 = c `shiftL` 8 .|. c
     c4 = c2 `shiftL` 16 .|. c2
+    c8 = c4 `shiftL` 32 .|. c4

     -- The number of instructions we will generate (approx). We need 1
     -- instructions per move.
@@ -1868,25 +1878,45 @@ genCCall dflags _ (PrimTarget (MO_Memset align)) _
     sizeBytes :: Integer
     sizeBytes = fromIntegral (formatInBytes format)

-    go :: Reg -> Integer -> OrdList Instr
-    go dst i
-        -- TODO: Add movabs instruction and support 64-bit sets.
-        | i >= sizeBytes = -- This might be smaller than the below sizes
-            unitOL (MOV format (OpImm (ImmInteger val)) (OpAddr dst_addr)) `appOL`
-            go dst (i - sizeBytes)
-        | i >= 4 = -- Will never happen on 32-bit
-            unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr dst_addr)) `appOL`
-            go dst (i - 4)
-        | i >= 2 =
-            unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr dst_addr)) `appOL`
-            go dst (i - 2)
-        | i >= 1 =
-            unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr dst_addr)) `appOL`
-            go dst (i - 1)
-        | otherwise = nilOL
+    -- Depending on size returns the widest MOV instruction and its
+    -- width.
+    gen4 :: AddrMode -> Integer -> (InstrBlock, Integer)
+    gen4 addr size
+        | size >= 4 =
+            (unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr addr)), 4)
+        | size >= 2 =
+            (unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr addr)), 2)
+        | size >= 1 =
+            (unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr addr)), 1)
+        | otherwise = (nilOL, 0)
+
+    -- Generates a 64-bit wide MOV instruction from REG to MEM.
+    gen8 :: AddrMode -> Reg -> InstrBlock
+    gen8 addr reg8byte =
+        unitOL (MOV format (OpReg reg8byte) (OpAddr addr))
+
+    -- Unrolls memset when the widest MOV is <= 4 bytes.
+    go4 :: Reg -> Integer -> InstrBlock
+    go4 dst left =
+        if left <= 0 then nilOL
+        else curMov `appOL` go4 dst (left - curWidth)
       where
-        dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
-                   (ImmInteger (n - i))
+        possibleWidth = minimum [left, sizeBytes]
+        dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
+        (curMov, curWidth) = gen4 dst_addr possibleWidth
+
+    -- Unrolls memset when the widest MOV is 8 bytes (thus another Reg
+    -- argument). Falls back to go4 when all 8 byte moves are
+    -- exhausted.
+    go8 :: Reg -> Reg -> Integer -> InstrBlock
+    go8 dst reg8byte left =
+        if possibleWidth >= 8 then
+            let curMov = gen8 dst_addr reg8byte
+            in curMov `appOL` go8 dst reg8byte (left - 8)
+        else go4 dst left
+      where
+        possibleWidth = minimum [left, sizeBytes]
+        dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))

 genCCall _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL
 -- write barrier compiles to no code on x86/x86-64;
diff --git a/compiler/utils/Util.hs b/compiler/utils/Util.hs
index 9e67a43bf5..6f7a9e5d07 100644
--- a/compiler/utils/Util.hs
+++ b/compiler/utils/Util.hs
@@ -87,6 +87,7 @@ module Util (

         -- * Integers
         exactLog2,
+        byteAlignment,

         -- * Floating point
         readRational,
@@ -1149,6 +1150,15 @@ exactLog2 x
     pow2 x | x == 1    = 0
            | otherwise = 1 + pow2 (x `shiftR` 1)

+-- x is aligned at N bytes means the remainder from x / N is zero.
+-- Currently, interested in N <= 8, but can be expanded to N <= 16 or
+-- N <= 32 if used within SSE or AVX context.
+byteAlignment :: Integer -> Integer
+byteAlignment x = case x .&. 7 of
+    0 -> 8
+    4 -> 4
+    2 -> 2
+    _ -> 1

 {-
 -- -----------------------------------------------------------------------------
```
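As a quick sanity check of the new `byteAlignment` helper, the following standalone snippet (an editorial illustration, not part of the patch) reproduces its definition and prints a few results. Note the helper is deliberately conservative: an offset of 6 reports 1-byte alignment even though 6 is divisible by 2; under-reporting alignment is safe, since it only forces narrower moves.

```haskell
import Data.Bits ((.&.))

-- Copy of the helper added to compiler/utils/Util.hs: the widest
-- power-of-two alignment (up to 8 bytes) implied by the low three
-- bits of x.
byteAlignment :: Integer -> Integer
byteAlignment x = case x .&. 7 of
    0 -> 8
    4 -> 4
    2 -> 2
    _ -> 1

main :: IO ()
main = mapM_ (print . byteAlignment) [0, 1, 2, 4, 6, 8, 12]
-- prints 8, 1, 2, 4, 1, 8, 4
```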