summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAustin Seipp <austin@well-typed.com>2014-04-21 22:05:59 -0500
committerAustin Seipp <austin@well-typed.com>2014-04-28 04:17:48 -0500
commite2cc1c5135c77736bfc8a188e7b4cc29fa50d39a (patch)
treeb1aaac05872712f2bfad153d9b16bc0cee6d9ce6
parent41ddcd7d7909224ac891c7b1ced8f2c59cb07dfc (diff)
downloadhaskell-wip/ermsb.tar.gz
When using Ivy Bridge, emit 'rep movsb' for copieswip/ermsb
Signed-off-by: Austin Seipp <austin@well-typed.com>
-rw-r--r--compiler/nativeGen/X86/CodeGen.hs33
-rw-r--r--compiler/nativeGen/X86/Instr.hs8
-rw-r--r--compiler/nativeGen/X86/Ppr.hs2
-rw-r--r--compiler/utils/Platform.hs1
4 files changed, 44 insertions, 0 deletions
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index e659488fe0..e90667de42 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -1568,6 +1568,39 @@ genCCall
-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+-- Do NOT unroll memcpy calls if the compiler has -mcpu=ivybridge -
+-- this can be done even better using 'enhanced rep movsb', which
+-- is nearly as fast as an AVX-based memcpy.
+--
+-- Note: this is implied with *both* -mcpu and -march. Why? -mcpu
+-- traditionally controls tuning schedules etc for the particular
+-- platform. -march controls *code generation* for that platform,
+-- including what instructions can be emitted.
+--
+-- In this case, the *instruction* does not change, it is still
+-- backwards compatible. But the actual *performance* impact and
+-- schedule of the code will change, which is why we check mcpu as well.
+genCCall dflags is32Bit (PrimTarget MO_Memcpy) _
+ [dst, src,
+ (CmmLit (CmmInt n _)),
+ (CmmLit (CmmInt _ _))]
+ | supportsERMSB dflags && not is32Bit = do
+ code_dst <- getAnyReg dst
+ dst_r <- getNewRegNat II64
+ code_src <- getAnyReg src
+ src_r <- getNewRegNat II64
+ return $ code_dst dst_r `appOL` code_src src_r `appOL`
+ unitOL (MOV II64 (OpReg src_r) (OpReg rsi)) `appOL`
+ unitOL (MOV II64 (OpReg dst_r) (OpReg rdi)) `appOL`
+ unitOL (MOV II64 (OpImm (ImmInteger n)) (OpReg rcx)) `appOL`
+ unitOL REPMOVSB `appOL` nilOL
+
+ where
+ supportsERMSB dflags
+ | Intel x <- march dflags = any (== ERMSB) (intelCPUFeatures x)
+ | Intel x <- mcpu dflags = any (== ERMSB) (intelCPUFeatures x)
+ | otherwise = False
+
-- Unroll memcpy calls if the source and destination pointers are at
-- least DWORD aligned and the number of bytes to copy isn't too
-- large. Otherwise, call C's memcpy.
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index 75e5b9e737..99731fb276 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -183,6 +183,10 @@ data Instr
| MOV Size Operand Operand
| MOVZxL Size Operand Operand -- size is the size of operand 1
| MOVSxL Size Operand Operand -- size is the size of operand 1
+
+ -- Special-case block move ('rep movsb', operating on rsi/rdi/rcx) for Ivy Bridge processors
+ | REPMOVSB
+
-- x86_64 note: plain mov into a 32-bit register always zero-extends
-- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
-- don't affect the high bits of the register.
@@ -425,6 +429,8 @@ x86_regUsageOfInstr platform instr
POPCNT _ src dst -> mkRU (use_R src []) [dst]
+ REPMOVSB -> mkRU [] []
+
-- note: might be a better way to do this
PREFETCH _ _ src -> mkRU (use_R src []) []
@@ -570,6 +576,8 @@ x86_patchRegsOfInstr instr env
PREFETCH lvl size src -> PREFETCH lvl size (patchOp src)
+ REPMOVSB -> REPMOVSB
+
_other -> panic "patchRegs: unrecognised instr"
where
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index f38a04d069..9af038c7e5 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -582,6 +582,8 @@ pprInstr (PREFETCH Lvl0 size src) = pprSizeOp_ (sLit "prefetcht0") size src
pprInstr (PREFETCH Lvl1 size src) = pprSizeOp_ (sLit "prefetcht1") size src
pprInstr (PREFETCH Lvl2 size src) = pprSizeOp_ (sLit "prefetcht2") size src
+pprInstr REPMOVSB = ptext (sLit "\trep movsb")
+
pprInstr (NOT size op) = pprSizeOp (sLit "not") size op
pprInstr (BSWAP size op) = pprSizeOp (sLit "bswap") size (OpReg op)
pprInstr (NEGI size op) = pprSizeOp (sLit "neg") size op
diff --git a/compiler/utils/Platform.hs b/compiler/utils/Platform.hs
index 14ce7bdbb1..24891abf0f 100644
--- a/compiler/utils/Platform.hs
+++ b/compiler/utils/Platform.hs
@@ -212,6 +212,7 @@ data IntelFeature
| AVX1
| ERMSB -- "Enhanced REP MOVSB"
| AVX2
+ deriving Eq
descToCPU :: String -> Maybe CPUDesc
descToCPU "generic" = Just Generic