Fix #19889 - Invalid BMI2 instructions generated. (branch: wip/andreask/bim-fix)

When arguments are 8 *or 16* bits wide, truncate the operands before and the result after, and use the 32-bit operation.
author: Andreas Klebinger <klebinger.andreas@gmx.at> 2021-06-09 19:25:34 +0200
committer: Andreas Klebinger <klebinger.andreas@gmx.at> 2021-07-06 21:17:37 +0000
commit: 6618008b5338ae43d8a362c31c5d5e820ff2d61c (patch)
tree: 4193a9635b3a259898dcb92cae2624c37c2d137b
parent: 1709111472f966f6e571227b035e749f953535a2 (diff)
download: haskell-6618008b5338ae43d8a362c31c5d5e820ff2d61c.tar.gz
2 files changed, 26 insertions, 24 deletions
diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
index 8da259e73b..2fbe91dc34 100644
--- a/compiler/GHC/CmmToAsm/X86/CodeGen.hs
+++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
@@ -2476,18 +2476,17 @@ genCCall' config is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
                 mask_r    <- getNewRegNat format
                 let dst_r = getRegisterReg platform  (CmmLocal dst)
                 return $ code_src src_r `appOL` code_mask mask_r `appOL`
-                    (if width == W8 then
-                         -- The PDEP instruction doesn't take a r/m8
-                         unitOL (MOVZxL II8  (OpReg src_r ) (OpReg src_r )) `appOL`
-                         unitOL (MOVZxL II8  (OpReg mask_r) (OpReg mask_r)) `appOL`
-                         unitOL (PDEP   II16 (OpReg mask_r) (OpReg src_r ) dst_r)
-                     else
-                         unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
-                    (if width == W8 || width == W16 then
-                         -- We used a 16-bit destination register above,
-                         -- so zero-extend
-                         unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
-                     else nilOL)
+                    -- PDEP only supports 32- and 64-bit operands (no r/m8 or r/m16 forms)
+                    ( if width == W8 || width == W16 then
+                        toOL
+                          [ MOVZxL format (OpReg src_r ) (OpReg src_r )
+                          , MOVZxL format (OpReg mask_r) (OpReg mask_r)
+                          , PDEP   II32 (OpReg mask_r) (OpReg src_r ) dst_r
+                          , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width
+                          ]
+                      else
+                        unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)
+                    )
         else do
             targetExpr <- cmmMakeDynamicReference config
                           CallReference lbl
@@ -2509,18 +2508,17 @@ genCCall' config is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
                 mask_r    <- getNewRegNat format
                 let dst_r = getRegisterReg platform  (CmmLocal dst)
                 return $ code_src src_r `appOL` code_mask mask_r `appOL`
-                    (if width == W8 then
-                         -- The PEXT instruction doesn't take a r/m8
-                         unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
-                         unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
-                         unitOL (PEXT II16 (OpReg mask_r) (OpReg src_r) dst_r)
-                     else
-                         unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
                     (if width == W8 || width == W16 then
-                         -- We used a 16-bit destination register above,
-                         -- so zero-extend
-                         unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
-                     else nilOL)
+                         -- The PEXT instruction doesn't take an r/m8 or r/m16 operand
+                        toOL
+                          [ MOVZxL format (OpReg src_r ) (OpReg src_r )
+                          , MOVZxL format (OpReg mask_r) (OpReg mask_r)
+                          , PEXT   II32 (OpReg mask_r) (OpReg src_r ) dst_r
+                          , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width
+                          ]
+                      else
+                        unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)
+                    )
         else do
             targetExpr <- cmmMakeDynamicReference config
                           CallReference lbl
diff --git a/compiler/GHC/CmmToAsm/X86/Instr.hs b/compiler/GHC/CmmToAsm/X86/Instr.hs
index 9410537ed8..1a9226ec41 100644
--- a/compiler/GHC/CmmToAsm/X86/Instr.hs
+++ b/compiler/GHC/CmmToAsm/X86/Instr.hs
@@ -199,7 +199,11 @@ data Instr
         -- Moves.
         | MOV         Format Operand Operand
         | CMOV   Cond Format Operand Reg
-        | MOVZxL      Format Operand Operand -- format is the size of operand 1
+        | MOVZxL      Format Operand Operand
+              -- ^ The format argument is the size of operand 1 (the number of bits we keep).
+              -- We always zero *all* high bits, even though this isn't how the actual instruction
+              -- works. The code generator also seems to rely on this behaviour, and it's faster
+              -- to execute on many CPUs as well, so for now I'm just documenting the fact.
         | MOVSxL      Format Operand Operand -- format is the size of operand 1
         -- x86_64 note: plain mov into a 32-bit register always zero-extends
         -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
author	Andreas Klebinger <klebinger.andreas@gmx.at>	2021-06-09 19:25:34 +0200
committer	Andreas Klebinger <klebinger.andreas@gmx.at>	2021-07-06 21:17:37 +0000
commit	6618008b5338ae43d8a362c31c5d5e820ff2d61c (patch)
tree	4193a9635b3a259898dcb92cae2624c37c2d137b
parent	1709111472f966f6e571227b035e749f953535a2 (diff)
download	haskell-6618008b5338ae43d8a362c31c5d5e820ff2d61c.tar.gz