-rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs | 33
-rw-r--r-- | compiler/nativeGen/X86/Instr.hs   |  8
-rw-r--r-- | compiler/nativeGen/X86/Ppr.hs     |  2
-rw-r--r-- | compiler/utils/Platform.hs        |  1
4 files changed, 44 insertions, 0 deletions
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index e659488fe0..e90667de42 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -1568,6 +1568,39 @@ genCCall
 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
+-- Do NOT unroll memcpy calls if the compiler has -mcpu=ivybridge -
+-- this can be done even better using 'enhanced rep movsb', which
+-- is nearly as fast as an AVX-based memcpy.
+--
+-- Note: this is implied with *both* -mcpu and -march. Why? -mcpu
+-- traditionally controls tuning schedules etc for the particular
+-- platform. -march controls *code generation* for that platform,
+-- including what instructions can be emitted.
+--
+-- In this case, the *instruction* does not change, it is still
+-- backwards compatible. But the actual *performance* impact and
+-- schedule of the code will change, hence why we check mcpu as well.
+genCCall dflags is32Bit (PrimTarget MO_Memcpy) _
+         [dst, src,
+          (CmmLit (CmmInt n _)),
+          (CmmLit (CmmInt _ _))]
+    | supportsERMSB dflags && not is32Bit = do
+        code_dst <- getAnyReg dst
+        dst_r    <- getNewRegNat II64
+        code_src <- getAnyReg src
+        src_r    <- getNewRegNat II64
+        return $ code_dst dst_r `appOL` code_src src_r `appOL`
+                 unitOL (MOV II64 (OpReg src_r) (OpReg rsi)) `appOL`
+                 unitOL (MOV II64 (OpReg dst_r) (OpReg rdi)) `appOL`
+                 unitOL (MOV II64 (OpImm (ImmInteger n)) (OpReg rcx)) `appOL`
+                 unitOL REPMOVSB `appOL` nilOL
+
+  where
+    supportsERMSB dflags
+        | Intel x <- march dflags = any (== ERMSB) (intelCPUFeatures x)
+        | Intel x <- mcpu dflags  = any (== ERMSB) (intelCPUFeatures x)
+        | otherwise               = False
+
 -- Unroll memcpy calls if the source and destination pointers are at
 -- least DWORD aligned and the number of bytes to copy isn't too
 -- large.  Otherwise, call C's memcpy.
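For reference, a minimal standalone sketch of the feature check introduced above, assuming hypothetical CPUDesc/IntelCPU/intelCPUFeatures definitions that only mirror Platform.hs; the module, constructor names, and feature table here are illustrative, not GHC's actual API:

-- Sketch only: the types below approximate what Platform.hs provides;
-- they are not the real GHC definitions.
module ErmsbCheckSketch where

data IntelFeature = SSE1 | SSE2 | AVX1 | ERMSB | AVX2
  deriving (Eq, Show)

data IntelCPU = SandyBridge | IvyBridge | Haswell
  deriving (Eq, Show)

data CPUDesc = Generic | Intel IntelCPU
  deriving (Eq, Show)

-- Hypothetical feature table: ERMSB first appears with Ivy Bridge.
intelCPUFeatures :: IntelCPU -> [IntelFeature]
intelCPUFeatures SandyBridge = [SSE1, SSE2, AVX1]
intelCPUFeatures IvyBridge   = [SSE1, SSE2, AVX1, ERMSB]
intelCPUFeatures Haswell     = [SSE1, SSE2, AVX1, ERMSB, AVX2]

-- The guard in the patch accepts ERMSB if either -march (the code
-- generation target) or -mcpu (the tuning target) advertises it.
supportsERMSB :: CPUDesc -> CPUDesc -> Bool
supportsERMSB march mcpu = hasERMSB march || hasERMSB mcpu
  where
    hasERMSB (Intel x) = ERMSB `elem` intelCPUFeatures x
    hasERMSB _         = False

main :: IO ()
main = print (supportsERMSB (Intel IvyBridge) Generic)   -- prints True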
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index 75e5b9e737..99731fb276 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -183,6 +183,10 @@ data Instr
         | MOV         Size Operand Operand
         | MOVZxL      Size Operand Operand -- size is the size of operand 1
         | MOVSxL      Size Operand Operand -- size is the size of operand 1
+
+        -- Special case move for Ivy Bridge processors
+        | REPMOVSB
+
         -- x86_64 note: plain mov into a 32-bit register always zero-extends
         -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
         -- don't affect the high bits of the register.
@@ -425,6 +429,8 @@ x86_regUsageOfInstr platform instr
     POPCNT _ src dst -> mkRU (use_R src []) [dst]
 
+    REPMOVSB -> mkRU [] []
+
     -- note: might be a better way to do this
     PREFETCH _ _ src -> mkRU (use_R src []) []
 
@@ -570,6 +576,8 @@ x86_patchRegsOfInstr instr env
     PREFETCH lvl size src -> PREFETCH lvl size (patchOp src)
 
+    REPMOVSB -> REPMOVSB
+
     _other -> panic "patchRegs: unrecognised instr"
     where
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index f38a04d069..9af038c7e5 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -582,6 +582,8 @@ pprInstr (PREFETCH Lvl0 size src) = pprSizeOp_ (sLit "prefetcht0") size src
 pprInstr (PREFETCH Lvl1 size src) = pprSizeOp_ (sLit "prefetcht1") size src
 pprInstr (PREFETCH Lvl2 size src) = pprSizeOp_ (sLit "prefetcht2") size src
 
+pprInstr REPMOVSB = ptext (sLit "\trep movsb")
+
 pprInstr (NOT size op) = pprSizeOp (sLit "not") size op
 pprInstr (BSWAP size op) = pprSizeOp (sLit "bswap") size (OpReg op)
 pprInstr (NEGI size op) = pprSizeOp (sLit "neg") size op
diff --git a/compiler/utils/Platform.hs b/compiler/utils/Platform.hs
index 14ce7bdbb1..24891abf0f 100644
--- a/compiler/utils/Platform.hs
+++ b/compiler/utils/Platform.hs
@@ -212,6 +212,7 @@ data IntelFeature
   | AVX1
   | ERMSB -- "Extended rep-movsb"
   | AVX2
+  deriving Eq
 
 descToCPU :: String -> Maybe CPUDesc
 descToCPU "generic" = Just Generic
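Taken together, the new code path is three register moves followed by the string instruction: source pointer into %rsi, destination pointer into %rdi, byte count into %rcx, then rep movsb (which, with the direction flag clear as the ABI assumes, copies %rcx bytes forward from (%rsi) to (%rdi)). The small sketch below is not GHC code; the register names src_r/dst_r and the 4096-byte count are placeholders, and it only illustrates the AT&T-style text one would expect the pretty-printer to produce for this sequence:

-- Illustrative only: models the four instructions built by the new
-- genCCall case and the text each should render to.
module RepMovsbSequence where

emitted :: Integer -> [String]
emitted n =
  [ "\tmovq %src_r, %rsi"              -- MOV II64 (OpReg src_r) (OpReg rsi)
  , "\tmovq %dst_r, %rdi"              -- MOV II64 (OpReg dst_r) (OpReg rdi)
  , "\tmovq $" ++ show n ++ ", %rcx"   -- MOV II64 (OpImm (ImmInteger n)) (OpReg rcx)
  , "\trep movsb"                      -- REPMOVSB: copy %rcx bytes from (%rsi) to (%rdi)
  ]

main :: IO ()
main = mapM_ putStrLn (emitted 4096)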