diff options
author | Andreas Klebinger <klebinger.andreas@gmx.at> | 2021-06-09 19:25:34 +0200 |
---|---|---|
committer | Andreas Klebinger <klebinger.andreas@gmx.at> | 2021-07-06 21:17:37 +0000 |
commit | 6618008b5338ae43d8a362c31c5d5e820ff2d61c (patch) | |
tree | 4193a9635b3a259898dcb92cae2624c37c2d137b | |
parent | 1709111472f966f6e571227b035e749f953535a2 (diff) | |
download | haskell-6618008b5338ae43d8a362c31c5d5e820ff2d61c.tar.gz |
Fix #19889 - Invalid BMI2 instructions generated. (branch: wip/andreask/bim-fix)
When the arguments are 8 *or 16* bits wide, truncate the inputs before and
the result after, and use the 32-bit operation.
-rw-r--r-- | compiler/GHC/CmmToAsm/X86/CodeGen.hs | 44 | ||||
-rw-r--r-- | compiler/GHC/CmmToAsm/X86/Instr.hs | 6 |
2 files changed, 26 insertions, 24 deletions
diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs index 8da259e73b..2fbe91dc34 100644 --- a/compiler/GHC/CmmToAsm/X86/CodeGen.hs +++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs @@ -2476,18 +2476,17 @@ genCCall' config is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst] mask_r <- getNewRegNat format let dst_r = getRegisterReg platform (CmmLocal dst) return $ code_src src_r `appOL` code_mask mask_r `appOL` - (if width == W8 then - -- The PDEP instruction doesn't take a r/m8 - unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL` - unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL` - unitOL (PDEP II16 (OpReg mask_r) (OpReg src_r ) dst_r) - else - unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL` - (if width == W8 || width == W16 then - -- We used a 16-bit destination register above, - -- so zero-extend - unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r)) - else nilOL) + -- PDEP only supports > 32 bit args + ( if width == W8 || width == W16 then + toOL + [ MOVZxL format (OpReg src_r ) (OpReg src_r ) + , MOVZxL format (OpReg mask_r) (OpReg mask_r) + , PDEP II32 (OpReg mask_r) (OpReg src_r ) dst_r + , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width + ] + else + unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r) + ) else do targetExpr <- cmmMakeDynamicReference config CallReference lbl @@ -2509,18 +2508,17 @@ genCCall' config is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst] mask_r <- getNewRegNat format let dst_r = getRegisterReg platform (CmmLocal dst) return $ code_src src_r `appOL` code_mask mask_r `appOL` - (if width == W8 then - -- The PEXT instruction doesn't take a r/m8 - unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL` - unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL` - unitOL (PEXT II16 (OpReg mask_r) (OpReg src_r) dst_r) - else - unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL` (if width == W8 || width == W16 then - -- We used a 16-bit 
destination register above, - -- so zero-extend - unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r)) - else nilOL) + -- The PEXT instruction doesn't take a r/m8 or 16 + toOL + [ MOVZxL format (OpReg src_r ) (OpReg src_r ) + , MOVZxL format (OpReg mask_r) (OpReg mask_r) + , PEXT II32 (OpReg mask_r) (OpReg src_r ) dst_r + , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width + ] + else + unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r) + ) else do targetExpr <- cmmMakeDynamicReference config CallReference lbl diff --git a/compiler/GHC/CmmToAsm/X86/Instr.hs b/compiler/GHC/CmmToAsm/X86/Instr.hs index 9410537ed8..1a9226ec41 100644 --- a/compiler/GHC/CmmToAsm/X86/Instr.hs +++ b/compiler/GHC/CmmToAsm/X86/Instr.hs @@ -199,7 +199,11 @@ data Instr -- Moves. | MOV Format Operand Operand | CMOV Cond Format Operand Reg - | MOVZxL Format Operand Operand -- format is the size of operand 1 + | MOVZxL Format Operand Operand + -- ^ The format argument is the size of operand 1 (the number of bits we keep) + -- We always zero *all* high bits, even though this isn't how the actual instruction + -- works. The code generator also seems to rely on this behaviour and it's faster + -- to execute on many cpus as well so for now I'm just documenting the fact. | MOVSxL Format Operand Operand -- format is the size of operand 1 -- x86_64 note: plain mov into a 32-bit register always zero-extends -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which |