diff options
author | Michal Terepeta <michal.terepeta@gmail.com> | 2018-10-04 13:56:59 -0400 |
---|---|---|
committer | Ben Gamari <ben@smart-cactus.org> | 2018-10-07 18:36:07 -0400 |
commit | 5d5307f943d7581d7013ffe20af22233273fba06 (patch) | |
tree | ec9ae993cfa44d2cfe797e0422eb388933277100 /compiler/nativeGen | |
parent | e4bec29cb475b7e1431dad41fb8d4438814641c9 (diff) | |
download | haskell-5d5307f943d7581d7013ffe20af22233273fba06.tar.gz |
Add Int8# and Word8#
This is the first step of implementing:
https://github.com/ghc-proposals/ghc-proposals/pull/74
The main highlights/changes:
- `primops.txt.pp` gets two new sections for two new primitive types
for signed and unsigned 8-bit integers (`Int8#` and `Word8#`
respectively) along with basic arithmetic and comparison
operations. `PrimRep`/`RuntimeRep` get two new constructors for
them. All of the primops translate into the existing `MachOp`s.
- For `CmmCall`s the codegen will now zero-extend the values at call
site (so that they can be moved to the right register) and then
truncate them back to their original width.
- x86 native codegen needed some updates, since it wasn't able to deal
with the new widths, but all the changes are quite localized. LLVM
backend seems to just work.
Bumps binary submodule.
Signed-off-by: Michal Terepeta <michal.terepeta@gmail.com>
Test Plan: ./validate with new tests
Reviewers: hvr, goldfire, bgamari, simonmar
Subscribers: Abhiroop, dfeuer, rwbarton, thomie, carter
Differential Revision: https://phabricator.haskell.org/D4475
Diffstat (limited to 'compiler/nativeGen')
-rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs | 90 | ||||
-rw-r--r-- | compiler/nativeGen/X86/Instr.hs | 8 | ||||
-rw-r--r-- | compiler/nativeGen/X86/Ppr.hs | 10 |
3 files changed, 97 insertions, 11 deletions
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index a2e26bd68b..66f959a86b 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -644,20 +644,27 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps -- Nop conversions MO_UU_Conv W32 W8 -> toI8Reg W32 x MO_SS_Conv W32 W8 -> toI8Reg W32 x + MO_XX_Conv W32 W8 -> toI8Reg W32 x MO_UU_Conv W16 W8 -> toI8Reg W16 x MO_SS_Conv W16 W8 -> toI8Reg W16 x + MO_XX_Conv W16 W8 -> toI8Reg W16 x MO_UU_Conv W32 W16 -> toI16Reg W32 x MO_SS_Conv W32 W16 -> toI16Reg W32 x + MO_XX_Conv W32 W16 -> toI16Reg W32 x MO_UU_Conv W64 W32 | not is32Bit -> conversionNop II64 x MO_SS_Conv W64 W32 | not is32Bit -> conversionNop II64 x + MO_XX_Conv W64 W32 | not is32Bit -> conversionNop II64 x MO_UU_Conv W64 W16 | not is32Bit -> toI16Reg W64 x MO_SS_Conv W64 W16 | not is32Bit -> toI16Reg W64 x + MO_XX_Conv W64 W16 | not is32Bit -> toI16Reg W64 x MO_UU_Conv W64 W8 | not is32Bit -> toI8Reg W64 x MO_SS_Conv W64 W8 | not is32Bit -> toI8Reg W64 x + MO_XX_Conv W64 W8 | not is32Bit -> toI8Reg W64 x MO_UU_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x MO_SS_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x + MO_XX_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x -- widenings MO_UU_Conv W8 W32 -> integerExtend W8 W32 MOVZxL x @@ -668,16 +675,26 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps MO_SS_Conv W16 W32 -> integerExtend W16 W32 MOVSxL x MO_SS_Conv W8 W16 -> integerExtend W8 W16 MOVSxL x + -- We don't care about the upper bits for MO_XX_Conv, so MOV is enough. 
+ MO_XX_Conv W8 W32 -> integerExtend W8 W32 MOV x + MO_XX_Conv W16 W32 -> integerExtend W16 W32 MOV x + MO_XX_Conv W8 W16 -> integerExtend W8 W16 MOV x + MO_UU_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVZxL x MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x MO_UU_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVZxL x MO_SS_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVSxL x MO_SS_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVSxL x MO_SS_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVSxL x - -- for 32-to-64 bit zero extension, amd64 uses an ordinary movl. - -- However, we don't want the register allocator to throw it - -- away as an unnecessary reg-to-reg move, so we keep it in - -- the form of a movzl and print it as a movl later. + -- For 32-to-64 bit zero extension, amd64 uses an ordinary movl. + -- However, we don't want the register allocator to throw it + -- away as an unnecessary reg-to-reg move, so we keep it in + -- the form of a movzl and print it as a movl later. + -- This doesn't apply to MO_XX_Conv since in this case we don't care about + -- the upper bits. So we can just use MOV. 
+ MO_XX_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOV x + MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x + MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x MO_FF_Conv W32 W64 | sse2 -> coerceFP2FP W64 x @@ -787,6 +804,7 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps MO_S_MulMayOflo rep -> imulMayOflo rep x y + MO_Mul W8 -> imulW8 x y MO_Mul rep -> triv_op rep IMUL MO_And rep -> triv_op rep AND MO_Or rep -> triv_op rep OR @@ -822,6 +840,21 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps triv_op width instr = trivialCode width op (Just op) x y where op = instr (intFormat width) + -- Special case for IMUL for bytes, since the result of IMULB will be in + -- %ax, the split to %dx/%edx/%rdx and %ax/%eax/%rax happens only for wider + -- values. + imulW8 :: CmmExpr -> CmmExpr -> NatM Register + imulW8 arg_a arg_b = do + (a_reg, a_code) <- getNonClobberedReg arg_a + b_code <- getAnyReg arg_b + + let code = a_code `appOL` b_code eax `appOL` + toOL [ IMUL2 format (OpReg a_reg) ] + format = intFormat W8 + + return (Fixed format eax code) + + imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register imulMayOflo rep a b = do (a_reg, a_code) <- getNonClobberedReg a @@ -916,6 +949,18 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps return (Any format code) ---------------------- + + -- See Note [DIV/IDIV for bytes] + div_code W8 signed quotient x y = do + let widen | signed = MO_SS_Conv W8 W16 + | otherwise = MO_UU_Conv W8 W16 + div_code + W16 + signed + quotient + (CmmMachOp widen [x]) + (CmmMachOp widen [y]) + div_code width signed quotient x y = do (y_op, y_code) <- getRegOrMem y -- cannot be clobbered x_code <- getAnyReg x @@ -2277,6 +2322,18 @@ genCCall _ is32Bit target dest_regs args = do = divOp platform signed width results (Just arg_x_high) arg_x_low arg_y divOp2 _ _ _ _ _ = panic "genCCall: Wrong number of arguments for divOp2" + + -- See Note [DIV/IDIV for 
bytes] + divOp platform signed W8 [res_q, res_r] m_arg_x_high arg_x_low arg_y = + let widen | signed = MO_SS_Conv W8 W16 + | otherwise = MO_UU_Conv W8 W16 + arg_x_low_16 = CmmMachOp widen [arg_x_low] + arg_y_16 = CmmMachOp widen [arg_y] + m_arg_x_high_16 = (\p -> CmmMachOp widen [p]) <$> m_arg_x_high + in divOp + platform signed W16 [res_q, res_r] + m_arg_x_high_16 arg_x_low_16 arg_y_16 + divOp platform signed width [res_q, res_r] m_arg_x_high arg_x_low arg_y = do let format = intFormat width @@ -2318,6 +2375,22 @@ genCCall _ is32Bit target dest_regs args = do addSubIntC _ _ _ _ _ _ _ _ = panic "genCCall: Wrong number of arguments/results for addSubIntC" +-- Note [DIV/IDIV for bytes] +-- +-- IDIV reminder: +-- Size Dividend Divisor Quotient Remainder +-- byte %ax r/m8 %al %ah +-- word %dx:%ax r/m16 %ax %dx +-- dword %edx:%eax r/m32 %eax %edx +-- qword %rdx:%rax r/m64 %rax %rdx +-- +-- We do a special case for the byte division because the current +-- codegen doesn't deal well with accessing %ah register (also, +-- accessing %ah in 64-bit mode is complicated because it cannot be an +-- operand of many instructions). So we just widen operands to 16 bits +-- and get the results from %al, %dl. This is not optimal, but a few +-- register moves are probably not a huge deal when doing division. + genCCall32' :: DynFlags -> ForeignTarget -- function to call -> [CmmFormal] -- where to put the result @@ -2461,6 +2534,10 @@ genCCall32' dflags target dest_regs args = do ) | otherwise = do + -- Arguments can be smaller than 32-bit, but we still use @PUSH + -- II32@ - the usual calling conventions expect integers to be + -- 4-byte aligned. 
+ ASSERT((typeWidth arg_ty) <= W32) return () (operand, code) <- getOperand arg delta <- getDeltaNat setDeltaNat (delta-size) @@ -2700,7 +2777,10 @@ genCCall64' dflags target dest_regs args = do push_args rest code' | otherwise = do - ASSERT(width == W64) return () + -- Arguments can be smaller than 64-bit, but we still use @PUSH + -- II64@ - the usual calling conventions expect integers to be + -- 8-byte aligned. + ASSERT(width <= W64) return () (arg_op, arg_code) <- getOperand arg delta <- getDeltaNat setDeltaNat (delta-arg_size) diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs index c7000c9f4b..8cc61ed789 100644 --- a/compiler/nativeGen/X86/Instr.hs +++ b/compiler/nativeGen/X86/Instr.hs @@ -383,7 +383,13 @@ x86_regUsageOfInstr platform instr SUB _ src dst -> usageRM src dst SBB _ src dst -> usageRM src dst IMUL _ src dst -> usageRM src dst - IMUL2 _ src -> mkRU (eax:use_R src []) [eax,edx] + + -- Result of IMULB will be in just in %ax + IMUL2 II8 src -> mkRU (eax:use_R src []) [eax] + -- Result of IMUL for wider values, will be split between %dx/%edx/%rdx and + -- %ax/%eax/%rax. 
+ IMUL2 _ src -> mkRU (eax:use_R src []) [eax,edx] + MUL _ src dst -> usageRM src dst MUL2 _ src -> mkRU (eax:use_R src []) [eax,edx] DIV _ op -> mkRU (eax:edx:use_R op []) [eax,edx] diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs index 03d4fce794..d4c92df753 100644 --- a/compiler/nativeGen/X86/Ppr.hs +++ b/compiler/nativeGen/X86/Ppr.hs @@ -327,7 +327,7 @@ pprReg f r (case i of { 0 -> sLit "%al"; 1 -> sLit "%bl"; 2 -> sLit "%cl"; 3 -> sLit "%dl"; - _ -> sLit "very naughty I386 byte register" + _ -> sLit $ "very naughty I386 byte register: " ++ show i }) ppr32_reg_word i = ptext @@ -364,7 +364,7 @@ pprReg f r 10 -> sLit "%r10b"; 11 -> sLit "%r11b"; 12 -> sLit "%r12b"; 13 -> sLit "%r13b"; 14 -> sLit "%r14b"; 15 -> sLit "%r15b"; - _ -> sLit "very naughty x86_64 byte register" + _ -> sLit $ "very naughty x86_64 byte register: " ++ show i }) ppr64_reg_word i = ptext @@ -789,8 +789,11 @@ pprInstr (POP format op) = pprFormatOp (sLit "pop") format op -- pprInstr POPA = text "\tpopal" pprInstr NOP = text "\tnop" +pprInstr (CLTD II8) = text "\tcbtw" +pprInstr (CLTD II16) = text "\tcwtd" pprInstr (CLTD II32) = text "\tcltd" pprInstr (CLTD II64) = text "\tcqto" +pprInstr (CLTD x) = panic $ "pprInstr: " ++ show x pprInstr (SETCC cond op) = pprCondInstr (sLit "set") cond (pprOperand II8 op) @@ -1076,9 +1079,6 @@ pprInstr (XADD format src dst) = pprFormatOpOp (sLit "xadd") format src dst pprInstr (CMPXCHG format src dst) = pprFormatOpOp (sLit "cmpxchg") format src dst -pprInstr _ - = panic "X86.Ppr.pprInstr: no match" - pprTrigOp :: String -> Bool -> CLabel -> CLabel -> Reg -> Reg -> Format -> SDoc |