summary refs log tree commit diff
diff options
context:
space:
mode:
author Johan Tibell <johan.tibell@gmail.com> 2014-03-26 17:08:29 +0100
committer Johan Tibell <johan.tibell@gmail.com> 2014-03-26 17:08:29 +0100
commit 11b31c3c9bbe05486e6532ec6217c8cf2587adbb (patch)
tree b22cc07f0caf6c24557c339260ba4e319c3517cb
parent ce335cee31de24d817246a87935bb9ffd21168f9 (diff)
download haskell-11b31c3c9bbe05486e6532ec6217c8cf2587adbb.tar.gz
Add flags to control memcpy and memset inlining
This adds -fmax-inline-memcpy-insns and -fmax-inline-memset-insns. These flags control when we inline calls to memcpy/memset with statically known arguments. The flag naming style is taken from GCC and the same limit is used by both GCC and LLVM.
-rw-r--r-- compiler/main/DynFlags.hs 20
-rw-r--r-- compiler/nativeGen/X86/CodeGen.hs 56
-rw-r--r-- docs/users_guide/flags.xml 24
-rw-r--r-- testsuite/.gitignore 1
-rw-r--r-- testsuite/tests/codeGen/should_gen_asm/all.T 2
-rw-r--r-- testsuite/tests/codeGen/should_gen_asm/memset-unroll.asm 14
-rw-r--r-- testsuite/tests/codeGen/should_gen_asm/memset-unroll.cmm 8
7 files changed, 95 insertions, 30 deletions
diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs
index 1662c9f82e..6702b73826 100644
--- a/compiler/main/DynFlags.hs
+++ b/compiler/main/DynFlags.hs
@@ -803,9 +803,17 @@ data DynFlags = DynFlags {
-- Constants used to control the amount of optimization done.
- -- ^ Max size, in bytes, of inline array allocations.
- maxInlineAllocSize :: Int
- }
+ -- | Max size, in bytes, of inline array allocations.
+ maxInlineAllocSize :: Int,
+
+ -- | Only inline memcpy if it generates no more than this many
+ -- pseudo (roughly: Cmm) instructions.
+ maxInlineMemcpyInsns :: Int,
+
+ -- | Only inline memset if it generates no more than this many
+ -- pseudo (roughly: Cmm) instructions.
+ maxInlineMemsetInsns :: Int
+}
class HasDynFlags m where
getDynFlags :: m DynFlags
@@ -1455,7 +1463,9 @@ defaultDynFlags mySettings =
rtldInfo = panic "defaultDynFlags: no rtldInfo",
rtccInfo = panic "defaultDynFlags: no rtccInfo",
- maxInlineAllocSize = 128
+ maxInlineAllocSize = 128,
+ maxInlineMemcpyInsns = 32,
+ maxInlineMemsetInsns = 32
}
defaultWays :: Settings -> [Way]
@@ -2439,6 +2449,8 @@ dynamic_flags = [
, Flag "fghci-hist-size" (intSuffix (\n d -> d {ghciHistSize = n}))
, Flag "fmax-inline-alloc-size" (intSuffix (\n d -> d{ maxInlineAllocSize = n }))
+ , Flag "fmax-inline-memcpy-insns" (intSuffix (\n d -> d{ maxInlineMemcpyInsns = n }))
+ , Flag "fmax-inline-memset-insns" (intSuffix (\n d -> d{ maxInlineMemsetInsns = n }))
------ Profiling ----------------------------------------------------
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 2456688744..e659488fe0 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -159,7 +159,7 @@ stmtToInstrs stmt = do
size = cmmTypeSize ty
CmmUnsafeForeignCall target result_regs args
- -> genCCall is32Bit target result_regs args
+ -> genCCall dflags is32Bit target result_regs args
CmmBranch id -> genBranch id
CmmCondBranch arg true false -> do b1 <- genCondJump true arg
@@ -1559,7 +1559,8 @@ genCondJump id bool = do
-- register allocator.
genCCall
- :: Bool -- 32 bit platform?
+ :: DynFlags
+ -> Bool -- 32 bit platform?
-> ForeignTarget -- function to call
-> [CmmFormal] -- where to put the result
-> [CmmActual] -- arguments (of mixed type)
@@ -1570,21 +1571,27 @@ genCCall
-- Unroll memcpy calls if the source and destination pointers are at
-- least DWORD aligned and the number of bytes to copy isn't too
-- large. Otherwise, call C's memcpy.
-genCCall is32Bit (PrimTarget MO_Memcpy) _
+genCCall dflags is32Bit (PrimTarget MO_Memcpy) _
[dst, src,
(CmmLit (CmmInt n _)),
(CmmLit (CmmInt align _))]
- | n <= maxInlineSizeThreshold && align .&. 3 == 0 = do
+ | fromInteger insns <= maxInlineMemcpyInsns dflags && align .&. 3 == 0 = do
code_dst <- getAnyReg dst
dst_r <- getNewRegNat size
code_src <- getAnyReg src
src_r <- getNewRegNat size
tmp_r <- getNewRegNat size
return $ code_dst dst_r `appOL` code_src src_r `appOL`
- go dst_r src_r tmp_r n
+ go dst_r src_r tmp_r (fromInteger n)
where
+ -- The number of instructions we will generate (approx). We need 2
+ -- instructions per move.
+ insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes)
+
size = if align .&. 4 /= 0 then II32 else (archWordSize is32Bit)
+ -- The size of each move, in bytes.
+ sizeBytes :: Integer
sizeBytes = fromIntegral (sizeInBytes size)
go :: Reg -> Reg -> Reg -> Integer -> OrdList Instr
@@ -1613,15 +1620,15 @@ genCCall is32Bit (PrimTarget MO_Memcpy) _
dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
(ImmInteger (n - i))
-genCCall _ (PrimTarget MO_Memset) _
+genCCall dflags _ (PrimTarget MO_Memset) _
[dst,
CmmLit (CmmInt c _),
CmmLit (CmmInt n _),
CmmLit (CmmInt align _)]
- | n <= maxInlineSizeThreshold && align .&. 3 == 0 = do
+ | fromInteger insns <= maxInlineMemsetInsns dflags && align .&. 3 == 0 = do
code_dst <- getAnyReg dst
dst_r <- getNewRegNat size
- return $ code_dst dst_r `appOL` go dst_r n
+ return $ code_dst dst_r `appOL` go dst_r (fromInteger n)
where
(size, val) = case align .&. 3 of
2 -> (II16, c2)
@@ -1630,6 +1637,12 @@ genCCall _ (PrimTarget MO_Memset) _
c2 = c `shiftL` 8 .|. c
c4 = c2 `shiftL` 16 .|. c2
+ -- The number of instructions we will generate (approx). We need 1
+ -- instruction per move.
+ insns = (n + sizeBytes - 1) `div` sizeBytes
+
+ -- The size of each move, in bytes.
+ sizeBytes :: Integer
sizeBytes = fromIntegral (sizeInBytes size)
go :: Reg -> Integer -> OrdList Instr
@@ -1652,13 +1665,13 @@ genCCall _ (PrimTarget MO_Memset) _
dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
(ImmInteger (n - i))
-genCCall _ (PrimTarget MO_WriteBarrier) _ _ = return nilOL
+genCCall _ _ (PrimTarget MO_WriteBarrier) _ _ = return nilOL
-- write barrier compiles to no code on x86/x86-64;
-- we keep it this long in order to prevent earlier optimisations.
-genCCall _ (PrimTarget MO_Touch) _ _ = return nilOL
+genCCall _ _ (PrimTarget MO_Touch) _ _ = return nilOL
-genCCall is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] =
+genCCall _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] =
case n of
0 -> genPrefetch src $ PREFETCH NTA size
1 -> genPrefetch src $ PREFETCH Lvl2 size
@@ -1679,8 +1692,7 @@ genCCall is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] =
((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) ))
-- prefetch always takes an address
-genCCall is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] = do
- dflags <- getDynFlags
+genCCall dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] = do
let platform = targetPlatform dflags
let dst_r = getRegisterReg platform False (CmmLocal dst)
case width of
@@ -1702,10 +1714,9 @@ genCCall is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] = do
where
size = intSize width
-genCCall is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
+genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
args@[src] = do
sse4_2 <- sse4_2Enabled
- dflags <- getDynFlags
let platform = targetPlatform dflags
if sse4_2
then do code_src <- getAnyReg src
@@ -1725,23 +1736,22 @@ genCCall is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
let target = ForeignTarget targetExpr (ForeignConvention CCallConv
[NoHint] [NoHint]
CmmMayReturn)
- genCCall is32Bit target dest_regs args
+ genCCall dflags is32Bit target dest_regs args
where
size = intSize width
lbl = mkCmmCodeLabel primPackageId (fsLit (popCntLabel width))
-genCCall is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do
- dflags <- getDynFlags
+genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do
targetExpr <- cmmMakeDynamicReference dflags
CallReference lbl
let target = ForeignTarget targetExpr (ForeignConvention CCallConv
[NoHint] [NoHint]
CmmMayReturn)
- genCCall is32Bit target dest_regs args
+ genCCall dflags is32Bit target dest_regs args
where
lbl = mkCmmCodeLabel primPackageId (fsLit (word2FloatLabel width))
-genCCall is32Bit target dest_regs args
+genCCall _ is32Bit target dest_regs args
| is32Bit = genCCall32 target dest_regs args
| otherwise = genCCall64 target dest_regs args
@@ -2304,12 +2314,6 @@ maybePromoteCArg dflags wto arg
where
wfrom = cmmExprWidth dflags arg
--- | We're willing to inline and unroll memcpy/memset calls that touch
--- at most these many bytes. This threshold is the same as the one
--- used by GCC and LLVM.
-maxInlineSizeThreshold :: Integer
-maxInlineSizeThreshold = 128
-
outOfLineCmmOp :: CallishMachOp -> Maybe CmmFormal -> [CmmActual] -> NatM InstrBlock
outOfLineCmmOp mop res args
= do
diff --git a/docs/users_guide/flags.xml b/docs/users_guide/flags.xml
index b4febf587b..6acd28dc0a 100644
--- a/docs/users_guide/flags.xml
+++ b/docs/users_guide/flags.xml
@@ -1900,6 +1900,30 @@
<entry>-</entry>
</row>
+ <row>
+ <entry>
+ <option>-fmax-inline-memcpy-insns</option>=<replaceable>n</replaceable>
+ </entry>
+ <entry>Inline memcpy calls if they would generate no more
+ than <replaceable>n</replaceable> pseudo instructions
+ (default: 32).
+ </entry>
+ <entry>dynamic</entry>
+ <entry>-</entry>
+ </row>
+
+ <row>
+ <entry>
+ <option>-fmax-inline-memset-insns</option>=<replaceable>n</replaceable>
+ </entry>
+ <entry>Inline memset calls if they would generate no more
+ than <replaceable>n</replaceable> pseudo instructions
+ (default: 32).
+ </entry>
+ <entry>dynamic</entry>
+ <entry>-</entry>
+ </row>
+
</tbody>
</tgroup>
</informaltable>
diff --git a/testsuite/.gitignore b/testsuite/.gitignore
index 1e14dc151f..519d432273 100644
--- a/testsuite/.gitignore
+++ b/testsuite/.gitignore
@@ -139,6 +139,7 @@ tests/codeGen/should_compile/T2578
tests/codeGen/should_gen_asm/memcpy-unroll-conprop.s
tests/codeGen/should_gen_asm/memcpy-unroll.s
tests/codeGen/should_gen_asm/memcpy.s
+tests/codeGen/should_gen_asm/memset-unroll.s
tests/codeGen/should_run/1852
tests/codeGen/should_run/1861
tests/codeGen/should_run/2080
diff --git a/testsuite/tests/codeGen/should_gen_asm/all.T b/testsuite/tests/codeGen/should_gen_asm/all.T
index be30d5fe10..9cd3b45771 100644
--- a/testsuite/tests/codeGen/should_gen_asm/all.T
+++ b/testsuite/tests/codeGen/should_gen_asm/all.T
@@ -4,3 +4,5 @@ test('memcpy-unroll',
unless(platform('x86_64-unknown-linux'),skip), compile_cmp_asm, [''])
test('memcpy-unroll-conprop',
unless(platform('x86_64-unknown-linux'),skip), compile_cmp_asm, [''])
+test('memset-unroll',
+ unless(platform('x86_64-unknown-linux'),skip), compile_cmp_asm, [''])
diff --git a/testsuite/tests/codeGen/should_gen_asm/memset-unroll.asm b/testsuite/tests/codeGen/should_gen_asm/memset-unroll.asm
new file mode 100644
index 0000000000..4c5c20bfdf
--- /dev/null
+++ b/testsuite/tests/codeGen/should_gen_asm/memset-unroll.asm
@@ -0,0 +1,14 @@
+.text
+ .align 8
+.globl callMemset
+.type callMemset, @object
+callMemset:
+.Lc5:
+ movl $16843009,0(%rbx)
+ movl $16843009,4(%rbx)
+ movl $16843009,8(%rbx)
+ movl $16843009,12(%rbx)
+ jmp *(%rbp)
+ .size callMemset, .-callMemset
+.section .note.GNU-stack,"",@progbits
+.ident "GHC 7.9.20140311"
diff --git a/testsuite/tests/codeGen/should_gen_asm/memset-unroll.cmm b/testsuite/tests/codeGen/should_gen_asm/memset-unroll.cmm
new file mode 100644
index 0000000000..825e7ead90
--- /dev/null
+++ b/testsuite/tests/codeGen/should_gen_asm/memset-unroll.cmm
@@ -0,0 +1,8 @@
+#include "Cmm.h"
+
+// Small memsets should unroll
+callMemset (W_ dst)
+{
+ prim %memset(dst, 1, 16, 4);
+ return ();
+}