NCG(x86): Compile add+shift as lea if possible.wip/andreask/add_mul_lea

author: Andreas Klebinger <klebinger.andreas@gmx.at> 2022-03-04 17:21:09 +0100
committer: Andreas Klebinger <klebinger.andreas@gmx.at> 2022-08-08 14:42:26 +0200
commit: 20457d775885d6c3df020d204da9a7acfb3c2e5a (patch)
tree: 46be70cbecee6e4568c9d24a641dc733bc9b87d2
parent: 78d04cfadfd728bb088b08b1e88905b43cc0360c (diff)
download: haskell-20457d775885d6c3df020d204da9a7acfb3c2e5a.tar.gz
4 files changed, 95 insertions, 0 deletions
diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
index de0a7c56e3..295cd9f555 100644
--- a/compiler/GHC/CmmToAsm/X86/CodeGen.hs
+++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
@@ -1048,10 +1048,29 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = -- dyadic MachOps
 
     --------------------
     add_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
+    -- x + imm
     add_code rep x (CmmLit (CmmInt y _))
         | is32BitInteger y
         , rep /= W8 -- LEA doesn't support byte size (#18614)
         = add_int rep x y
+    -- x + (y << imm)
+    add_code rep x y
+        -- Byte size is not supported and 16bit size is slow when computed via LEA
+        | rep /= W8 && rep /= W16
+        -- 2^3 = 8 is the highest multiplicator supported by LEA.
+        , Just (x,y,shift_bits) <- get_shift x y
+        = add_shiftL rep x y (fromIntegral shift_bits)
+        where
+          -- x + (y << imm)
+          get_shift x (CmmMachOp (MO_Shl _w) [y, CmmLit (CmmInt shift_bits _)])
+            | shift_bits <= 3
+            = Just (x, y, shift_bits)
+          -- (y << imm) + x
+          get_shift (CmmMachOp (MO_Shl _w) [y, CmmLit (CmmInt shift_bits _)]) x
+            | shift_bits <= 3
+            = Just (x, y, shift_bits)
+          get_shift _ _
+            = Nothing
     add_code rep x y = trivialCode rep (ADD format) (Just (ADD format)) x y
       where format = intFormat rep
     -- TODO: There are other interesting patterns we want to replace
@@ -1066,6 +1085,7 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = -- dyadic MachOps
     sub_code rep x y = trivialCode rep (SUB (intFormat rep)) Nothing x y
 
     -- our three-operand add instruction:
+    add_int :: (Width -> CmmExpr -> Integer -> NatM Register)
     add_int width x y = do
         (x_reg, x_code) <- getSomeReg x
         let
@@ -1079,6 +1099,22 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = -- dyadic MachOps
         --
         return (Any format code)
 
+    -- x + (y << shift_bits) using LEA
+    add_shiftL :: (Width -> CmmExpr -> CmmExpr -> Int -> NatM Register)
+    add_shiftL width x y shift_bits = do
+        (x_reg, x_code) <- getSomeReg x
+        (y_reg, y_code) <- getSomeReg y
+        let
+            format = intFormat width
+            imm = ImmInt 0
+            code dst
+               = (x_code `appOL` y_code) `snocOL`
+                 LEA format
+                        (OpAddr (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg (2 ^ shift_bits)) imm))
+                        (OpReg dst)
+        --
+        return (Any format code)
+
     ----------------------
 
     -- See Note [DIV/IDIV for bytes]
diff --git a/testsuite/tests/codeGen/should_gen_asm/AddMulX86.asm b/testsuite/tests/codeGen/should_gen_asm/AddMulX86.asm
new file mode 100644
index 0000000000..d11e6809fc
--- /dev/null
+++ b/testsuite/tests/codeGen/should_gen_asm/AddMulX86.asm
@@ -0,0 +1,46 @@
+.section .text
+.align 8
+.align 8
+	.quad	8589934604
+	.quad	0
+	.long	14
+	.long	0
+.globl AddMulX86_f_info
+.type AddMulX86_f_info, @function
+AddMulX86_f_info:
+.LcAx:
+	leaq (%r14,%rsi,8),%rbx
+	jmp *(%rbp)
+	.size AddMulX86_f_info, .-AddMulX86_f_info
+.section .data
+.align 8
+.align 1
+.globl AddMulX86_f_closure
+.type AddMulX86_f_closure, @object
+AddMulX86_f_closure:
+	.quad	AddMulX86_f_info
+.section .text
+.align 8
+.align 8
+	.quad	8589934604
+	.quad	0
+	.long	14
+	.long	0
+.globl AddMulX86_g_info
+.type AddMulX86_g_info, @function
+AddMulX86_g_info:
+.LcAL:
+	leaq (%r14,%rsi,8),%rbx
+	jmp *(%rbp)
+	.size AddMulX86_g_info, .-AddMulX86_g_info
+.section .data
+.align 8
+.align 1
+.globl AddMulX86_g_closure
+.type AddMulX86_g_closure, @object
+AddMulX86_g_closure:
+	.quad	AddMulX86_g_info
+.section .note.GNU-stack,"",@progbits
+.ident "GHC 9.3.20220228"
+
+
diff --git a/testsuite/tests/codeGen/should_gen_asm/AddMulX86.hs b/testsuite/tests/codeGen/should_gen_asm/AddMulX86.hs
new file mode 100644
index 0000000000..552940956e
--- /dev/null
+++ b/testsuite/tests/codeGen/should_gen_asm/AddMulX86.hs
@@ -0,0 +1,12 @@
+{-# LANGUAGE MagicHash #-}
+
+module AddMulX86 where
+
+import GHC.Exts
+
+f :: Int# -> Int# -> Int#
+f x y =
+    x +# (y *# 8#) -- Should result in a lea instruction, which we grep the assembly output for.
+
+g x y =
+    (y *# 8#) +# x  -- Should result in a lea instruction, which we grep the assembly output for.
diff --git a/testsuite/tests/codeGen/should_gen_asm/all.T b/testsuite/tests/codeGen/should_gen_asm/all.T
index fa3ed1ccf5..1faa7d62ab 100644
--- a/testsuite/tests/codeGen/should_gen_asm/all.T
+++ b/testsuite/tests/codeGen/should_gen_asm/all.T
@@ -10,3 +10,4 @@ test('memset-unroll', is_amd64_codegen, compile_cmp_asm, ['cmm', ''])
 test('bytearray-memset-unroll', is_amd64_codegen, compile_grep_asm, ['hs', True, ''])
 test('bytearray-memcpy-unroll', is_amd64_codegen, compile_grep_asm, ['hs', True, ''])
 test('T18137', [when(opsys('darwin'), skip), only_ways(llvm_ways)], compile_grep_asm, ['hs', False, '-fllvm -split-sections'])
+test('AddMulX86', is_amd64_codegen, compile_cmp_asm, ['hs', '-dno-typeable-binds'])
author	Andreas Klebinger <klebinger.andreas@gmx.at>	2022-03-04 17:21:09 +0100
committer	Andreas Klebinger <klebinger.andreas@gmx.at>	2022-08-08 14:42:26 +0200
commit	20457d775885d6c3df020d204da9a7acfb3c2e5a (patch)
tree	46be70cbecee6e4568c9d24a641dc733bc9b87d2
parent	78d04cfadfd728bb088b08b1e88905b43cc0360c (diff)
download	haskell-20457d775885d6c3df020d204da9a7acfb3c2e5a.tar.gz