summaryrefslogtreecommitdiff
path: root/compiler
diff options
context:
space:
mode:
authorHerbert Valerio Riedel <hvr@gnu.org>2014-10-18 14:32:33 +0200
committerHerbert Valerio Riedel <hvr@gnu.org>2014-10-18 14:32:56 +0200
commit612f3d120c65a461a4ad7f212d67bdae005f4975 (patch)
tree5045f1a4ae209ad3201c479bc8a86d47a6cc4685 /compiler
parent1c35f9f1cb7a293da85d649904ce731a65824cfe (diff)
downloadhaskell-612f3d120c65a461a4ad7f212d67bdae005f4975.tar.gz
Implement optimized NCG `MO_Ctz W64` op for i386 (#9340)
Summary: This is an optimization to the CTZ primops introduced for #9340 Previously we called out to `hs_ctz64`, but we can actually generate better hand-tuned code while avoiding the FFI ccall. With this patch, the code {-# LANGUAGE MagicHash #-} module TestClz0 where import GHC.Prim ctz64 :: Word64# -> Word# ctz64 x = ctz64# x results in the following assembler generated by NCG on i386: TestClz.ctz64_info: movl (%ebp),%eax movl 4(%ebp),%ecx movl %ecx,%edx orl %eax,%edx movl $64,%edx je _nAO bsf %ecx,%ecx addl $32,%ecx bsf %eax,%eax cmovne %eax,%ecx movl %ecx,%edx _nAO: movl %edx,%esi addl $8,%ebp jmp *(%ebp) For comparision, here's what LLVM 3.4 currently generates: 000000fc <TestClzz_ctzz64_info>: fc: 0f bc 45 04 bsf 0x4(%ebp),%eax 100: b9 20 00 00 00 mov $0x20,%ecx 105: 0f 45 c8 cmovne %eax,%ecx 108: 83 c1 20 add $0x20,%ecx 10b: 8b 45 00 mov 0x0(%ebp),%eax 10e: 8b 55 08 mov 0x8(%ebp),%edx 111: 0f bc f0 bsf %eax,%esi 114: 85 c0 test %eax,%eax 116: 0f 44 f1 cmove %ecx,%esi 119: 83 c5 08 add $0x8,%ebp 11c: ff e2 jmp *%edx Reviewed By: austin Auditors: simonmar Differential Revision: https://phabricator.haskell.org/D163
Diffstat (limited to 'compiler')
-rw-r--r--compiler/nativeGen/X86/CodeGen.hs41
1 files changed, 32 insertions, 9 deletions
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 9d7cb78a6c..abd87ed087 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -1799,14 +1799,38 @@ genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src]
size = if width == W8 then II16 else intSize width
lbl = mkCmmCodeLabel primPackageKey (fsLit (clzLabel width))
-genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) dest_regs@[dst] args@[src]
+genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src]
| is32Bit, width == W64 = do
- -- Fallback to `hs_ctz64` on i386
- targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
- let target = ForeignTarget targetExpr (ForeignConvention CCallConv
- [NoHint] [NoHint]
- CmmMayReturn)
- genCCall dflags is32Bit target dest_regs args
+ ChildCode64 vcode rlo <- iselExpr64 src
+ let rhi = getHiVRegFromLo rlo
+ dst_r = getRegisterReg platform False (CmmLocal dst)
+ lbl1 <- getBlockIdNat
+ lbl2 <- getBlockIdNat
+ tmp_r <- getNewRegNat size
+
+ -- The following instruction sequence corresponds to the pseudo-code
+ --
+ -- if (src) {
+ -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32);
+ -- } else {
+ -- dst = 64;
+ -- }
+ return $ vcode `appOL` toOL
+ ([ MOV II32 (OpReg rhi) (OpReg tmp_r)
+ , OR II32 (OpReg rlo) (OpReg tmp_r)
+ , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r)
+ , JXX EQQ lbl2
+ , JXX ALWAYS lbl1
+
+ , NEWBLOCK lbl1
+ , BSF II32 (OpReg rhi) dst_r
+ , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r)
+ , BSF II32 (OpReg rlo) tmp_r
+ , CMOV NE II32 (OpReg tmp_r) dst_r
+ , JXX ALWAYS lbl2
+
+ , NEWBLOCK lbl2
+ ])
| otherwise = do
code_src <- getAnyReg src
@@ -1828,7 +1852,6 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) dest_regs@[dst] args@[src]
bw = widthInBits width
platform = targetPlatform dflags
size = if width == W8 then II16 else intSize width
- lbl = mkCmmCodeLabel primPackageKey (fsLit (ctzLabel width))
genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do
targetExpr <- cmmMakeDynamicReference dflags
@@ -2485,7 +2508,7 @@ outOfLineCmmOp mop res args
MO_PopCnt _ -> fsLit "popcnt"
MO_BSwap _ -> fsLit "bswap"
MO_Clz w -> fsLit $ clzLabel w
- MO_Ctz w -> fsLit $ ctzLabel w
+ MO_Ctz _ -> unsupported
MO_AtomicRMW _ _ -> fsLit "atomicrmw"
MO_AtomicRead _ -> fsLit "atomicread"