diff options
author | Herbert Valerio Riedel <hvr@gnu.org> | 2014-10-18 14:32:33 +0200 |
---|---|---|
committer | Herbert Valerio Riedel <hvr@gnu.org> | 2014-10-18 14:32:56 +0200 |
commit | 612f3d120c65a461a4ad7f212d67bdae005f4975 (patch) | |
tree | 5045f1a4ae209ad3201c479bc8a86d47a6cc4685 /compiler | |
parent | 1c35f9f1cb7a293da85d649904ce731a65824cfe (diff) | |
download | haskell-612f3d120c65a461a4ad7f212d67bdae005f4975.tar.gz |
Implement optimized NCG `MO_Ctz W64` op for i386 (#9340)
Summary:
This is an optimization to the CTZ primops introduced for #9340
Previously we called out to `hs_ctz64`, but we can actually generate
better hand-tuned code while avoiding the FFI ccall.
With this patch, the code
{-# LANGUAGE MagicHash #-}
module TestClz0 where
import GHC.Prim
ctz64 :: Word64# -> Word#
ctz64 x = ctz64# x
results in the following assembler generated by NCG on i386:
TestClz.ctz64_info:
movl (%ebp),%eax
movl 4(%ebp),%ecx
movl %ecx,%edx
orl %eax,%edx
movl $64,%edx
je _nAO
bsf %ecx,%ecx
addl $32,%ecx
bsf %eax,%eax
cmovne %eax,%ecx
movl %ecx,%edx
_nAO:
movl %edx,%esi
addl $8,%ebp
jmp *(%ebp)
For comparision, here's what LLVM 3.4 currently generates:
000000fc <TestClzz_ctzz64_info>:
fc: 0f bc 45 04 bsf 0x4(%ebp),%eax
100: b9 20 00 00 00 mov $0x20,%ecx
105: 0f 45 c8 cmovne %eax,%ecx
108: 83 c1 20 add $0x20,%ecx
10b: 8b 45 00 mov 0x0(%ebp),%eax
10e: 8b 55 08 mov 0x8(%ebp),%edx
111: 0f bc f0 bsf %eax,%esi
114: 85 c0 test %eax,%eax
116: 0f 44 f1 cmove %ecx,%esi
119: 83 c5 08 add $0x8,%ebp
11c: ff e2 jmp *%edx
Reviewed By: austin
Auditors: simonmar
Differential Revision: https://phabricator.haskell.org/D163
Diffstat (limited to 'compiler')
-rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs | 41 |
1 files changed, 32 insertions, 9 deletions
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index 9d7cb78a6c..abd87ed087 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -1799,14 +1799,38 @@ genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] size = if width == W8 then II16 else intSize width lbl = mkCmmCodeLabel primPackageKey (fsLit (clzLabel width)) -genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) dest_regs@[dst] args@[src] +genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] | is32Bit, width == W64 = do - -- Fallback to `hs_ctz64` on i386 - targetExpr <- cmmMakeDynamicReference dflags CallReference lbl - let target = ForeignTarget targetExpr (ForeignConvention CCallConv - [NoHint] [NoHint] - CmmMayReturn) - genCCall dflags is32Bit target dest_regs args + ChildCode64 vcode rlo <- iselExpr64 src + let rhi = getHiVRegFromLo rlo + dst_r = getRegisterReg platform False (CmmLocal dst) + lbl1 <- getBlockIdNat + lbl2 <- getBlockIdNat + tmp_r <- getNewRegNat size + + -- The following instruction sequence corresponds to the pseudo-code + -- + -- if (src) { + -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32); + -- } else { + -- dst = 64; + -- } + return $ vcode `appOL` toOL + ([ MOV II32 (OpReg rhi) (OpReg tmp_r) + , OR II32 (OpReg rlo) (OpReg tmp_r) + , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r) + , JXX EQQ lbl2 + , JXX ALWAYS lbl1 + + , NEWBLOCK lbl1 + , BSF II32 (OpReg rhi) dst_r + , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r) + , BSF II32 (OpReg rlo) tmp_r + , CMOV NE II32 (OpReg tmp_r) dst_r + , JXX ALWAYS lbl2 + + , NEWBLOCK lbl2 + ]) | otherwise = do code_src <- getAnyReg src @@ -1828,7 +1852,6 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) dest_regs@[dst] args@[src] bw = widthInBits width platform = targetPlatform dflags size = if width == W8 then II16 else intSize width - lbl = mkCmmCodeLabel primPackageKey (fsLit (ctzLabel width)) genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do targetExpr <- cmmMakeDynamicReference dflags @@ -2485,7 +2508,7 @@ outOfLineCmmOp mop res args MO_PopCnt _ -> fsLit "popcnt" MO_BSwap _ -> fsLit "bswap" MO_Clz w -> fsLit $ clzLabel w - MO_Ctz w -> fsLit $ ctzLabel w + MO_Ctz _ -> unsupported MO_AtomicRMW _ _ -> fsLit "atomicrmw" MO_AtomicRead _ -> fsLit "atomicread" |