author    Andreas Klebinger <klebinger.andreas@gmx.at>  2021-06-09 19:25:34 +0200
committer Andreas Klebinger <klebinger.andreas@gmx.at>  2021-07-06 21:17:37 +0000
commit    6618008b5338ae43d8a362c31c5d5e820ff2d61c (patch)
tree      4193a9635b3a259898dcb92cae2624c37c2d137b
parent    1709111472f966f6e571227b035e749f953535a2 (diff)
download  haskell-6618008b5338ae43d8a362c31c5d5e820ff2d61c.tar.gz
Fix #19889 - Invalid BMI2 instructions generated. (wip/andreask/bim-fix)
When arguments are 8 *or 16* bits wide, zero-extend them beforehand, use the 32-bit operation, and truncate the result back to the operand width afterwards.
-rw-r--r--  compiler/GHC/CmmToAsm/X86/CodeGen.hs  44
-rw-r--r--  compiler/GHC/CmmToAsm/X86/Instr.hs     6
2 files changed, 26 insertions, 24 deletions
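As a rough illustration of the approach in the patch below (not part of the patch): the sketch models what PDEP computes and why zero-extending 8-bit operands, running the 32-bit operation, and truncating the result is sound. The names pdepModel and pdep8ViaPdep32 are made up for illustration; the real work is done by the x86 PDEP instruction emitted by the code in the diff.

import Data.Bits (setBit, testBit)
import Data.Word (Word32, Word8)

-- Reference semantics of PDEP: scatter the low bits of 'src' into the
-- bit positions selected by 'mask'.
pdepModel :: Word32 -> Word32 -> Word32
pdepModel src mask = go 0 0 0
  where
    go acc srcIdx i
      | i >= 32        = acc
      | testBit mask i = go (if testBit src srcIdx then setBit acc i else acc)
                            (srcIdx + 1) (i + 1)
      | otherwise      = go acc srcIdx (i + 1)

-- The 8-bit case via the 32-bit operation: zero-extend the operands
-- (fromIntegral from Word8 widens with zeroes, like MOVZxL), run the wide
-- PDEP, then truncate the result back to 8 bits (the final MOVZxL).
pdep8ViaPdep32 :: Word8 -> Word8 -> Word8
pdep8ViaPdep32 src mask =
  fromIntegral (pdepModel (fromIntegral src) (fromIntegral mask))

The same shape applies to W16, and to PEXT (sketched after the first file's diff below).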
diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
index 8da259e73b..2fbe91dc34 100644
--- a/compiler/GHC/CmmToAsm/X86/CodeGen.hs
+++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
@@ -2476,18 +2476,17 @@ genCCall' config is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
mask_r <- getNewRegNat format
let dst_r = getRegisterReg platform (CmmLocal dst)
return $ code_src src_r `appOL` code_mask mask_r `appOL`
- (if width == W8 then
- -- The PDEP instruction doesn't take a r/m8
- unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
- unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
- unitOL (PDEP II16 (OpReg mask_r) (OpReg src_r ) dst_r)
- else
- unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
- (if width == W8 || width == W16 then
- -- We used a 16-bit destination register above,
- -- so zero-extend
- unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
- else nilOL)
+ -- PDEP only supports 32 bit and 64 bit operands
+ ( if width == W8 || width == W16 then
+ toOL
+ [ MOVZxL format (OpReg src_r ) (OpReg src_r )
+ , MOVZxL format (OpReg mask_r) (OpReg mask_r)
+ , PDEP II32 (OpReg mask_r) (OpReg src_r ) dst_r
+ , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width
+ ]
+ else
+ unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)
+ )
else do
targetExpr <- cmmMakeDynamicReference config
CallReference lbl
@@ -2509,18 +2508,17 @@ genCCall' config is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
mask_r <- getNewRegNat format
let dst_r = getRegisterReg platform (CmmLocal dst)
return $ code_src src_r `appOL` code_mask mask_r `appOL`
- (if width == W8 then
- -- The PEXT instruction doesn't take a r/m8
- unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
- unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
- unitOL (PEXT II16 (OpReg mask_r) (OpReg src_r) dst_r)
- else
- unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
(if width == W8 || width == W16 then
- -- We used a 16-bit destination register above,
- -- so zero-extend
- unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
- else nilOL)
+ -- The PEXT instruction doesn't take an r/m8 or r/m16 operand
+ toOL
+ [ MOVZxL format (OpReg src_r ) (OpReg src_r )
+ , MOVZxL format (OpReg mask_r) (OpReg mask_r)
+ , PEXT II32 (OpReg mask_r) (OpReg src_r ) dst_r
+ , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width
+ ]
+ else
+ unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)
+ )
else do
targetExpr <- cmmMakeDynamicReference config
CallReference lbl
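For completeness, the same kind of sketch for PEXT, the gather direction; pextModel is again an illustrative name, not GHC code, and the widen-then-truncate treatment of W8/W16 operands is identical to the PDEP case above.

import Data.Bits (setBit, testBit)
import Data.Word (Word32)

-- Reference semantics of PEXT: collect the bits of 'src' at the positions
-- selected by 'mask' into the low bits of the result.
pextModel :: Word32 -> Word32 -> Word32
pextModel src mask = go 0 0 0
  where
    go acc outIdx i
      | i >= 32        = acc
      | testBit mask i = go (if testBit src i then setBit acc outIdx else acc)
                            (outIdx + 1) (i + 1)
      | otherwise      = go acc outIdx (i + 1)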
diff --git a/compiler/GHC/CmmToAsm/X86/Instr.hs b/compiler/GHC/CmmToAsm/X86/Instr.hs
index 9410537ed8..1a9226ec41 100644
--- a/compiler/GHC/CmmToAsm/X86/Instr.hs
+++ b/compiler/GHC/CmmToAsm/X86/Instr.hs
@@ -199,7 +199,11 @@ data Instr
-- Moves.
| MOV Format Operand Operand
| CMOV Cond Format Operand Reg
- | MOVZxL Format Operand Operand -- format is the size of operand 1
+ | MOVZxL Format Operand Operand
+ -- ^ The format argument is the size of operand 1 (the number of bits we keep).
+ -- We always zero *all* high bits, even though that is not how the actual
+ -- instruction behaves. The code generator also seems to rely on this behaviour,
+ -- and it is faster to execute on many CPUs as well, so for now I'm just
+ -- documenting the fact.
| MOVSxL Format Operand Operand -- format is the size of operand 1
-- x86_64 note: plain mov into a 32-bit register always zero-extends
-- into the 64-bit reg, in contrast to the 8 and 16-bit movs which