diff options
author | Johan Tibell <johan.tibell@gmail.com> | 2014-03-26 17:08:29 +0100 |
---|---|---|
committer | Johan Tibell <johan.tibell@gmail.com> | 2014-03-26 17:08:29 +0100 |
commit | 11b31c3c9bbe05486e6532ec6217c8cf2587adbb (patch) | |
tree | b22cc07f0caf6c24557c339260ba4e319c3517cb | |
parent | ce335cee31de24d817246a87935bb9ffd21168f9 (diff) | |
download | haskell-11b31c3c9bbe05486e6532ec6217c8cf2587adbb.tar.gz |
Add flags to control memcpy and memset inlining
This adds -fmax-inline-memcpy-insns and -fmax-inline-memset-insns.
These flags control when we inline calls to memcpy/memset with
statically known arguments. The flag naming style is taken from GCC,
and the default limit matches the one used by both GCC and LLVM.
-rw-r--r-- | compiler/main/DynFlags.hs | 20 | ||||
-rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs | 56 | ||||
-rw-r--r-- | docs/users_guide/flags.xml | 24 | ||||
-rw-r--r-- | testsuite/.gitignore | 1 | ||||
-rw-r--r-- | testsuite/tests/codeGen/should_gen_asm/all.T | 2 | ||||
-rw-r--r-- | testsuite/tests/codeGen/should_gen_asm/memset-unroll.asm | 14 | ||||
-rw-r--r-- | testsuite/tests/codeGen/should_gen_asm/memset-unroll.cmm | 8 |
7 files changed, 95 insertions, 30 deletions
diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs index 1662c9f82e..6702b73826 100644 --- a/compiler/main/DynFlags.hs +++ b/compiler/main/DynFlags.hs @@ -803,9 +803,17 @@ data DynFlags = DynFlags { -- Constants used to control the amount of optimization done. - -- ^ Max size, in bytes, of inline array allocations. - maxInlineAllocSize :: Int - } + -- | Max size, in bytes, of inline array allocations. + maxInlineAllocSize :: Int, + + -- | Only inline memcpy if it generates no more than this many + -- pseudo (roughly: Cmm) instructions. + maxInlineMemcpyInsns :: Int, + + -- | Only inline memset if it generates no more than this many + -- pseudo (roughly: Cmm) instructions. + maxInlineMemsetInsns :: Int +} class HasDynFlags m where getDynFlags :: m DynFlags @@ -1455,7 +1463,9 @@ defaultDynFlags mySettings = rtldInfo = panic "defaultDynFlags: no rtldInfo", rtccInfo = panic "defaultDynFlags: no rtccInfo", - maxInlineAllocSize = 128 + maxInlineAllocSize = 128, + maxInlineMemcpyInsns = 32, + maxInlineMemsetInsns = 32 } defaultWays :: Settings -> [Way] @@ -2439,6 +2449,8 @@ dynamic_flags = [ , Flag "fghci-hist-size" (intSuffix (\n d -> d {ghciHistSize = n})) , Flag "fmax-inline-alloc-size" (intSuffix (\n d -> d{ maxInlineAllocSize = n })) + , Flag "fmax-inline-memcpy-insns" (intSuffix (\n d -> d{ maxInlineMemcpyInsns = n })) + , Flag "fmax-inline-memset-insns" (intSuffix (\n d -> d{ maxInlineMemsetInsns = n })) ------ Profiling ---------------------------------------------------- diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index 2456688744..e659488fe0 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -159,7 +159,7 @@ stmtToInstrs stmt = do size = cmmTypeSize ty CmmUnsafeForeignCall target result_regs args - -> genCCall is32Bit target result_regs args + -> genCCall dflags is32Bit target result_regs args CmmBranch id -> genBranch id CmmCondBranch arg true false -> do b1 <- 
genCondJump true arg @@ -1559,7 +1559,8 @@ genCondJump id bool = do -- register allocator. genCCall - :: Bool -- 32 bit platform? + :: DynFlags + -> Bool -- 32 bit platform? -> ForeignTarget -- function to call -> [CmmFormal] -- where to put the result -> [CmmActual] -- arguments (of mixed type) @@ -1570,21 +1571,27 @@ genCCall -- Unroll memcpy calls if the source and destination pointers are at -- least DWORD aligned and the number of bytes to copy isn't too -- large. Otherwise, call C's memcpy. -genCCall is32Bit (PrimTarget MO_Memcpy) _ +genCCall dflags is32Bit (PrimTarget MO_Memcpy) _ [dst, src, (CmmLit (CmmInt n _)), (CmmLit (CmmInt align _))] - | n <= maxInlineSizeThreshold && align .&. 3 == 0 = do + | fromInteger insns <= maxInlineMemcpyInsns dflags && align .&. 3 == 0 = do code_dst <- getAnyReg dst dst_r <- getNewRegNat size code_src <- getAnyReg src src_r <- getNewRegNat size tmp_r <- getNewRegNat size return $ code_dst dst_r `appOL` code_src src_r `appOL` - go dst_r src_r tmp_r n + go dst_r src_r tmp_r (fromInteger n) where + -- The number of instructions we will generate (approx). We need 2 + -- instructions per move. + insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes) + size = if align .&. 4 /= 0 then II32 else (archWordSize is32Bit) + -- The size of each move, in bytes. + sizeBytes :: Integer sizeBytes = fromIntegral (sizeInBytes size) go :: Reg -> Reg -> Reg -> Integer -> OrdList Instr @@ -1613,15 +1620,15 @@ genCCall is32Bit (PrimTarget MO_Memcpy) _ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - i)) -genCCall _ (PrimTarget MO_Memset) _ +genCCall dflags _ (PrimTarget MO_Memset) _ [dst, CmmLit (CmmInt c _), CmmLit (CmmInt n _), CmmLit (CmmInt align _)] - | n <= maxInlineSizeThreshold && align .&. 3 == 0 = do + | fromInteger insns <= maxInlineMemsetInsns dflags && align .&. 
3 == 0 = do code_dst <- getAnyReg dst dst_r <- getNewRegNat size - return $ code_dst dst_r `appOL` go dst_r n + return $ code_dst dst_r `appOL` go dst_r (fromInteger n) where (size, val) = case align .&. 3 of 2 -> (II16, c2) @@ -1630,6 +1637,12 @@ genCCall _ (PrimTarget MO_Memset) _ c2 = c `shiftL` 8 .|. c c4 = c2 `shiftL` 16 .|. c2 + -- The number of instructions we will generate (approx). We need 1 + -- instructions per move. + insns = (n + sizeBytes - 1) `div` sizeBytes + + -- The size of each move, in bytes. + sizeBytes :: Integer sizeBytes = fromIntegral (sizeInBytes size) go :: Reg -> Integer -> OrdList Instr @@ -1652,13 +1665,13 @@ genCCall _ (PrimTarget MO_Memset) _ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - i)) -genCCall _ (PrimTarget MO_WriteBarrier) _ _ = return nilOL +genCCall _ _ (PrimTarget MO_WriteBarrier) _ _ = return nilOL -- write barrier compiles to no code on x86/x86-64; -- we keep it this long in order to prevent earlier optimisations. -genCCall _ (PrimTarget MO_Touch) _ _ = return nilOL +genCCall _ _ (PrimTarget MO_Touch) _ _ = return nilOL -genCCall is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] = +genCCall _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] = case n of 0 -> genPrefetch src $ PREFETCH NTA size 1 -> genPrefetch src $ PREFETCH Lvl2 size @@ -1679,8 +1692,7 @@ genCCall is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] = ((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) )) -- prefetch always takes an address -genCCall is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] = do - dflags <- getDynFlags +genCCall dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] = do let platform = targetPlatform dflags let dst_r = getRegisterReg platform False (CmmLocal dst) case width of @@ -1702,10 +1714,9 @@ genCCall is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] = do where size = intSize width -genCCall is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst] +genCCall dflags is32Bit (PrimTarget 
(MO_PopCnt width)) dest_regs@[dst] args@[src] = do sse4_2 <- sse4_2Enabled - dflags <- getDynFlags let platform = targetPlatform dflags if sse4_2 then do code_src <- getAnyReg src @@ -1725,23 +1736,22 @@ genCCall is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst] let target = ForeignTarget targetExpr (ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn) - genCCall is32Bit target dest_regs args + genCCall dflags is32Bit target dest_regs args where size = intSize width lbl = mkCmmCodeLabel primPackageId (fsLit (popCntLabel width)) -genCCall is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do - dflags <- getDynFlags +genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do targetExpr <- cmmMakeDynamicReference dflags CallReference lbl let target = ForeignTarget targetExpr (ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn) - genCCall is32Bit target dest_regs args + genCCall dflags is32Bit target dest_regs args where lbl = mkCmmCodeLabel primPackageId (fsLit (word2FloatLabel width)) -genCCall is32Bit target dest_regs args +genCCall _ is32Bit target dest_regs args | is32Bit = genCCall32 target dest_regs args | otherwise = genCCall64 target dest_regs args @@ -2304,12 +2314,6 @@ maybePromoteCArg dflags wto arg where wfrom = cmmExprWidth dflags arg --- | We're willing to inline and unroll memcpy/memset calls that touch --- at most these many bytes. This threshold is the same as the one --- used by GCC and LLVM. 
-maxInlineSizeThreshold :: Integer -maxInlineSizeThreshold = 128 - outOfLineCmmOp :: CallishMachOp -> Maybe CmmFormal -> [CmmActual] -> NatM InstrBlock outOfLineCmmOp mop res args = do diff --git a/docs/users_guide/flags.xml b/docs/users_guide/flags.xml index b4febf587b..6acd28dc0a 100644 --- a/docs/users_guide/flags.xml +++ b/docs/users_guide/flags.xml @@ -1900,6 +1900,30 @@ <entry>-</entry> </row> + <row> + <entry> + <option>-fmax-inline-memcpy-insns</option>=<replaceable>n</replaceable> + </entry> + <entry>Inline memcpy calls if they would generate no more + than <replaceable>n</replaceable> pseudo instructions + (default: 32). + </entry> + <entry>dynamic</entry> + <entry>-</entry> + </row> + + <row> + <entry> + <option>-fmax-inline-memset-insns</option>=<replaceable>n</replaceable> + </entry> + <entry>Inline memset calls if they would generate no more + than <replaceable>n</replaceable> pseudo instructions + (default: 32). + </entry> + <entry>dynamic</entry> + <entry>-</entry> + </row> + </tbody> </tgroup> </informaltable> diff --git a/testsuite/.gitignore b/testsuite/.gitignore index 1e14dc151f..519d432273 100644 --- a/testsuite/.gitignore +++ b/testsuite/.gitignore @@ -139,6 +139,7 @@ tests/codeGen/should_compile/T2578 tests/codeGen/should_gen_asm/memcpy-unroll-conprop.s tests/codeGen/should_gen_asm/memcpy-unroll.s tests/codeGen/should_gen_asm/memcpy.s +tests/codeGen/should_gen_asm/memset-unroll.s tests/codeGen/should_run/1852 tests/codeGen/should_run/1861 tests/codeGen/should_run/2080 diff --git a/testsuite/tests/codeGen/should_gen_asm/all.T b/testsuite/tests/codeGen/should_gen_asm/all.T index be30d5fe10..9cd3b45771 100644 --- a/testsuite/tests/codeGen/should_gen_asm/all.T +++ b/testsuite/tests/codeGen/should_gen_asm/all.T @@ -4,3 +4,5 @@ test('memcpy-unroll', unless(platform('x86_64-unknown-linux'),skip), compile_cmp_asm, ['']) test('memcpy-unroll-conprop', unless(platform('x86_64-unknown-linux'),skip), compile_cmp_asm, ['']) +test('memset-unroll', + 
unless(platform('x86_64-unknown-linux'),skip), compile_cmp_asm, ['']) diff --git a/testsuite/tests/codeGen/should_gen_asm/memset-unroll.asm b/testsuite/tests/codeGen/should_gen_asm/memset-unroll.asm new file mode 100644 index 0000000000..4c5c20bfdf --- /dev/null +++ b/testsuite/tests/codeGen/should_gen_asm/memset-unroll.asm @@ -0,0 +1,14 @@ +.text + .align 8 +.globl callMemset +.type callMemset, @object +callMemset: +.Lc5: + movl $16843009,0(%rbx) + movl $16843009,4(%rbx) + movl $16843009,8(%rbx) + movl $16843009,12(%rbx) + jmp *(%rbp) + .size callMemset, .-callMemset +.section .note.GNU-stack,"",@progbits +.ident "GHC 7.9.20140311" diff --git a/testsuite/tests/codeGen/should_gen_asm/memset-unroll.cmm b/testsuite/tests/codeGen/should_gen_asm/memset-unroll.cmm new file mode 100644 index 0000000000..825e7ead90 --- /dev/null +++ b/testsuite/tests/codeGen/should_gen_asm/memset-unroll.cmm @@ -0,0 +1,8 @@ +#include "Cmm.h" + +// Small memsets should unroll +callMemset (W_ dst) +{ + prim %memset(dst, 1, 16, 4); + return (); +} |