-----------------------------------------------------------------------------
--
-- Machine-dependent assembly language
--
-- (c) The University of Glasgow 1993-2004
--
-----------------------------------------------------------------------------

#include "nativeGen/NCG.h"

module MachInstrs (
        -- * Cmm instantiations
        NatCmm, NatCmmTop, NatBasicBlock,

        -- * Machine instructions
        Instr(..),
        Cond(..), condUnsigned, condToSigned, condToUnsigned,

#if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH && !x86_64_TARGET_ARCH
        Size(..), machRepSize,
#endif
        RI(..),

#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
        Operand(..),
#endif
#if i386_TARGET_ARCH
        i386_insert_ffrees,
#endif
#if sparc_TARGET_ARCH
        riZero, fpRelEA, moveSp, fPair,
#endif
    ) where

#include "HsVersions.h"

import MachRegs
import Cmm
import MachOp           ( MachRep(..) )
import CLabel           ( CLabel, pprCLabel )
import Panic            ( panic )
import Outputable
import FastString
import Constants        ( wORD_SIZE )

import GHC.Exts

-- -----------------------------------------------------------------------------
-- Our flavours of the Cmm types

-- Type synonyms for Cmm populated with native code
type NatCmm        = GenCmm CmmStatic Instr
type NatCmmTop     = GenCmmTop CmmStatic Instr
type NatBasicBlock = GenBasicBlock Instr

-- -----------------------------------------------------------------------------
-- Conditions on this architecture

-- Exactly one of the constructor lists below is selected by CPP, depending
-- on the target architecture.
data Cond
#if alpha_TARGET_ARCH
  = ALWAYS      -- For BI (same as BR)
  | EQQ         -- For CMP and BI (NB: "EQ" is a 1.3 Prelude name)
  | GE          -- For BI only
  | GTT         -- For BI only (NB: "GT" is a 1.3 Prelude name)
  | LE          -- For CMP and BI
  | LTT         -- For CMP and BI (NB: "LT" is a 1.3 Prelude name)
  | NE          -- For BI only
  | NEVER       -- For BI (null instruction)
  | ULE         -- For CMP only
  | ULT         -- For CMP only
#endif
#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  = ALWAYS      -- What's really used? ToDo
  | EQQ
  | GE
  | GEU
  | GTT
  | GU
  | LE
  | LEU
  | LTT
  | LU
  | NE
  | NEG
  | POS
  | CARRY
  | OFLO
  | PARITY
  | NOTPARITY
#endif
#if sparc_TARGET_ARCH
  = ALWAYS      -- What's really used? ToDo
  | EQQ
  | GE
  | GEU
  | GTT
  | GU
  | LE
  | LEU
  | LTT
  | LU
  | NE
  | NEG
  | NEVER
  | POS
  | VC
  | VS
#endif
#if powerpc_TARGET_ARCH
  = ALWAYS
  | EQQ
  | GE
  | GEU
  | GTT
  | GU
  | LE
  | LEU
  | LTT
  | LU
  | NE
#endif
    deriving Eq  -- to make an assertion work

-- NOTE(review): the three functions below match on GU, LU, GEU and LEU,
-- which the alpha flavour of Cond above does not define -- presumably the
-- alpha port no longer builds; confirm before relying on it.

-- | True exactly for the conditions GU, LU, GEU and LEU; False for every
-- other condition.
condUnsigned :: Cond -> Bool
condUnsigned GU  = True
condUnsigned LU  = True
condUnsigned GEU = True
condUnsigned LEU = True
condUnsigned _   = False

-- | Map GU, LU, GEU, LEU to GTT, LTT, GE, LE respectively; any other
-- condition is returned unchanged.
condToSigned :: Cond -> Cond
condToSigned GU  = GTT
condToSigned LU  = LTT
condToSigned GEU = GE
condToSigned LEU = LE
condToSigned x   = x

-- | Inverse direction of 'condToSigned': map GTT, LTT, GE, LE to
-- GU, LU, GEU, LEU respectively; any other condition is returned unchanged.
condToUnsigned :: Cond -> Cond
condToUnsigned GTT = GU
condToUnsigned LTT = LU
condToUnsigned GE  = GEU
condToUnsigned LE  = LEU
condToUnsigned x   = x

-- -----------------------------------------------------------------------------
-- Sizes on this architecture

-- ToDo: it's not clear to me that we need separate signed-vs-unsigned sizes
-- here.  I've removed them from the x86 version, we'll see what happens --SDM

#if !powerpc_TARGET_ARCH && !i386_TARGET_ARCH && !x86_64_TARGET_ARCH
data Size
#if alpha_TARGET_ARCH
    = B     -- byte
    | Bu
--  | W     -- word (2 bytes): UNUSED
--  | Wu    -- : UNUSED
    | L     -- longword (4 bytes)
    | Q     -- quadword (8 bytes)
--  | FF    -- VAX F-style floating pt: UNUSED
--  | GF    -- VAX G-style floating pt: UNUSED
--  | DF    -- VAX D-style floating pt: UNUSED
--  | SF    -- IEEE single-precision floating pt: UNUSED
    | TF    -- IEEE double-precision floating pt
#endif
-- NOTE(review): the powerpc half of this condition can never hold here,
-- because the enclosing #if already excludes powerpc.
#if sparc_TARGET_ARCH || powerpc_TARGET_ARCH
    = B     -- byte (signed)
    | Bu    -- byte (unsigned)
    | H     -- halfword (signed, 2 bytes)
    | Hu    -- halfword (unsigned, 2 bytes)
    | W     -- word (4 bytes)
    | F     -- IEEE single-precision floating pt
    | DF    -- IEEE double-precision floating pt
#endif
    deriving Eq

-- | Translate a 'MachRep' into the 'Size' used by this architecture's
-- instructions.  Panics for I64 and I128.
--
-- NOTE(review): the alpha case for I16 refers to @err@, which is not defined
-- anywhere in the visible part of this module -- confirm where it is meant to
-- come from.
machRepSize :: MachRep -> Size
machRepSize I8    = IF_ARCH_alpha(Bu, IF_ARCH_sparc(Bu, ))
machRepSize I16   = IF_ARCH_alpha(err,IF_ARCH_sparc(Hu, ))
machRepSize I32   = IF_ARCH_alpha(L,  IF_ARCH_sparc(W,  ))
machRepSize I64   = panic "machRepSize: I64"
machRepSize I128  = panic "machRepSize: I128"
machRepSize F32   = IF_ARCH_alpha(TF, IF_ARCH_sparc(F, ))
machRepSize F64   = IF_ARCH_alpha(TF, IF_ARCH_sparc(DF,))
#endif

-- -----------------------------------------------------------------------------
-- Register or immediate (a handy type on some platforms)

data RI = RIReg Reg
        | RIImm Imm


-- -----------------------------------------------------------------------------
-- Machine's assembly language

-- We have a few common "instructions" (nearly all the pseudo-ops) but
-- mostly all of 'Instr' is machine-specific.
--
-- The declaration below is stitched together by CPP: the four common
-- pseudo-ops come first, followed by the constructor list of whichever
-- target architecture is selected.

data Instr
  = COMMENT FastString          -- comment pseudo-op

  | LDATA   Section [CmmStatic] -- some static data spat out during code
                                -- generation.  Will be extracted before
                                -- pretty-printing.

  | NEWBLOCK BlockId            -- start a new basic block.  Useful during
                                -- codegen, removed later.  Preceding
                                -- instruction should be a jump, as per the
                                -- invariants for a BasicBlock (see Cmm).

  | DELTA   Int                 -- specify current stack offset for
                                -- benefit of subsequent passes

-- -----------------------------------------------------------------------------
-- Alpha instructions

#if alpha_TARGET_ARCH

-- data Instr continues...

-- Loads and stores.
              | LD            Size Reg AddrMode -- size, dst, src
              | LDA           Reg AddrMode      -- dst, src
              | LDAH          Reg AddrMode      -- dst, src
              | LDGP          Reg AddrMode      -- dst, src
              | LDI           Size Reg Imm      -- size, dst, src
              | ST            Size Reg AddrMode -- size, src, dst

-- Int Arithmetic.
              | CLR           Reg                   -- dst
              | ABS           Size RI Reg           -- size, src, dst
              | NEG           Size Bool RI Reg      -- size, overflow, src, dst
              | ADD           Size Bool Reg RI Reg  -- size, overflow, src, src, dst
              | SADD          Size Size Reg RI Reg  -- size, scale, src, src, dst
              | SUB           Size Bool Reg RI Reg  -- size, overflow, src, src, dst
              | SSUB          Size Size Reg RI Reg  -- size, scale, src, src, dst
              | MUL           Size Bool Reg RI Reg  -- size, overflow, src, src, dst
              | DIV           Size Bool Reg RI Reg  -- size, unsigned, src, src, dst
              | REM           Size Bool Reg RI Reg  -- size, unsigned, src, src, dst

-- Simple bit-twiddling.
              | NOT           RI Reg
              | AND           Reg RI Reg
              | ANDNOT        Reg RI Reg
              | OR            Reg RI Reg
              | ORNOT         Reg RI Reg
              | XOR           Reg RI Reg
              | XORNOT        Reg RI Reg
              | SLL           Reg RI Reg
              | SRL           Reg RI Reg
              | SRA           Reg RI Reg

              | ZAP           Reg RI Reg
              | ZAPNOT        Reg RI Reg

              | NOP

-- Comparison
              | CMP           Cond Reg RI Reg

-- Float Arithmetic.
              | FCLR          Reg
              | FABS          Reg Reg
              | FNEG          Size Reg Reg
              | FADD          Size Reg Reg Reg
              | FDIV          Size Reg Reg Reg
              | FMUL          Size Reg Reg Reg
              | FSUB          Size Reg Reg Reg
              | CVTxy         Size Size Reg Reg
              | FCMP          Size Cond Reg Reg Reg
              | FMOV          Reg Reg

-- Jumping around.
              | BI            Cond Reg Imm
              | BF            Cond Reg Imm
              | BR            Imm
              | JMP           Reg AddrMode Int
              | BSR           Imm Int
              | JSR           Reg AddrMode Int

-- Alpha-specific pseudo-ops.
              | FUNBEGIN CLabel
              | FUNEND CLabel

-- The RI type used by the constructors above is the one declared
-- unconditionally near the top of this section; a second, identical
-- declaration that used to live here has been dropped because it would be a
-- duplicate definition whenever alpha_TARGET_ARCH is set.

#endif /* alpha_TARGET_ARCH */


-- -----------------------------------------------------------------------------
-- Intel x86 instructions

{-
Intel, in their infinite wisdom, selected a stack model for floating
point registers on x86.  That might have made sense back in 1979 --
nowadays we can see it for the nonsense it really is.  A stack model
fits poorly with the existing nativeGen infrastructure, which assumes
flat integer and FP register sets.  Prior to this commit, nativeGen
could not generate correct x86 FP code -- to do so would have meant
somehow working the register-stack paradigm into the register
allocator and spiller, which sounds very difficult.

We have decided to cheat, and go for a simple fix which requires no
infrastructure modifications, at the expense of generating ropey but
correct FP code.  All notions of the x86 FP stack and its insns have
been removed.  Instead, we pretend (to the instruction selector and
register allocator) that x86 has six floating point registers, %fake0
.. %fake5, which can be used in the usual flat manner.  We further
claim that x86 has floating point instructions very similar to SPARC
and Alpha, that is, a simple 3-operand register-register arrangement.
Code generation and register allocation proceed on this basis.

When we come to print out the final assembly, our convenient fiction
is converted to dismal reality.  Each fake instruction is
independently converted to a series of real x86 instructions.
%fake0 .. %fake5 are mapped to %st(0) .. %st(5).  To do reg-reg
arithmetic operations, the two operands are pushed onto the top of the
FP stack, the operation done, and the result copied back into the
relevant register.  There are only six %fake registers because 2 are
needed for the translation, and x86 has 8 in total.

The translation is inefficient but is simple and it works.  A cleverer
translation would handle a sequence of insns, simulating the FP stack
contents, would not impose a fixed mapping from %fake to %st regs, and
hopefully could avoid most of the redundant reg-reg moves of the
current translation.

We might as well make use of whatever unique FP facilities Intel have
chosen to bless us with (let's not be churlish, after all).
Hence GLDZ and GLD1.  Bwahahahahahahaha!
-}

{-
MORE FLOATING POINT MUSINGS...

Intel's internal floating point registers are by default 80 bit
extended precision.  This means that all operations done on values in
registers are done at 80 bits, and unless the intermediate values are
truncated to the appropriate size (32 or 64 bits) by storing in
memory, calculations in registers will give different results from
calculations which pass intermediate values in memory (eg. via
function calls).

One solution is to set the FPU into 64 bit precision mode.  Some OSs
do this (eg. FreeBSD) and some don't (eg. Linux).  The problem here is
that this will only affect 64-bit precision arithmetic; 32-bit
calculations will still be done at 64-bit precision in registers.  So
it doesn't solve the whole problem.

There's also the issue of what the C library is expecting in terms of
precision.  It seems to be the case that glibc on Linux expects the
FPU to be set to 80 bit precision, so setting it to 64 bit could have
unexpected effects.  Changing the default could have undesirable
effects on other 3rd-party library code too, so the right thing would
be to save/restore the FPU control word across Haskell code if we were
to do this.

gcc's -ffloat-store gives consistent results by always storing the
results of floating-point calculations in memory, which works for both
32 and 64-bit precision.  However, it only affects the values of
user-declared floating point variables in C, not intermediate results.
GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
flag).

Another problem is how to spill floating point registers in the
register allocator.  Should we spill the whole 80 bits, or just 64?
On an OS which is set to 64 bit precision, spilling 64 is fine.  On
Linux, spilling 64 bits will round the results of some operations.
This is what gcc does.  Spilling at 80 bits requires taking up a full
128 bit slot (so we get alignment).  We spill at 80-bits and ignore
the alignment problems.

In the future, we'll use the SSE registers for floating point.  This
requires a CPU that supports SSE2 (ordinary SSE only supports 32 bit
precision float ops), which means P4 or Xeon and above.  Using SSE
will solve all these problems, because the SSE registers use fixed 32
bit or 64 bit precision.

--SDM 1/2003
-}

#if i386_TARGET_ARCH || x86_64_TARGET_ARCH

-- data Instr continues...

-- Moves.
        | MOV         MachRep Operand Operand
        | MOVZxL      MachRep Operand Operand -- size is the size of operand 1
        | MOVSxL      MachRep Operand Operand -- size is the size of operand 1
        -- x86_64 note: plain mov into a 32-bit register always zero-extends
        -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
        -- don't affect the high bits of the register.

-- Load effective address (also a very useful three-operand add instruction :-)
        | LEA         MachRep Operand Operand

-- Int Arithmetic.
        | ADD         MachRep Operand Operand
        | ADC         MachRep Operand Operand
        | SUB         MachRep Operand Operand

        | MUL         MachRep Operand Operand
        | IMUL        MachRep Operand Operand   -- signed int mul
        | IMUL2       MachRep Operand           -- %edx:%eax = operand * %eax

        | DIV         MachRep Operand           -- eax := eax:edx/op, edx := eax:edx%op
        | IDIV        MachRep Operand           -- ditto, but signed

-- Simple bit-twiddling.
        | AND         MachRep Operand Operand
        | OR          MachRep Operand Operand
        | XOR         MachRep Operand Operand
        | NOT         MachRep Operand
        | NEGI        MachRep Operand -- NEG instruction (name clash with Cond)

-- Shifts (amount may be immediate or %cl only)
        | SHL         MachRep Operand{-amount-} Operand
        | SAR         MachRep Operand{-amount-} Operand
        | SHR         MachRep Operand{-amount-} Operand

        | BT          MachRep Imm Operand
        | NOP

#if i386_TARGET_ARCH
-- Float Arithmetic.

-- Note that we cheat by treating G{ABS,MOV,NEG} of doubles
-- as single instructions right up until we spit them out.
        -- all the 3-operand fake fp insns are src1 src2 dst
        -- and furthermore are constrained to be fp regs only.
        -- IMPORTANT: keep is_G_instr up to date with any changes here
        | GMOV        Reg Reg -- src(fpreg), dst(fpreg)
        | GLD         MachRep AddrMode Reg -- src, dst(fpreg)
        | GST         MachRep Reg AddrMode -- src(fpreg), dst

        | GLDZ        Reg -- dst(fpreg)
        | GLD1        Reg -- dst(fpreg)

        | GFTOI       Reg Reg -- src(fpreg), dst(intreg)
        | GDTOI       Reg Reg -- src(fpreg), dst(intreg)

        | GITOF       Reg Reg -- src(intreg), dst(fpreg)
        | GITOD       Reg Reg -- src(intreg), dst(fpreg)

        | GADD        MachRep Reg Reg Reg -- src1, src2, dst
        | GDIV        MachRep Reg Reg Reg -- src1, src2, dst
        | GSUB        MachRep Reg Reg Reg -- src1, src2, dst
        | GMUL        MachRep Reg Reg Reg -- src1, src2, dst

                -- FP compare.  Cond must be `elem` [EQQ, NE, LE, LTT, GE, GTT]
                -- Compare src1 with src2; set the Zero flag iff the numbers are
                -- comparable and the comparison is True.  Subsequent code must
                -- test the %eflags zero flag regardless of the supplied Cond.
        | GCMP        Cond Reg Reg -- src1, src2

        | GABS        MachRep Reg Reg -- src, dst
        | GNEG        MachRep Reg Reg -- src, dst
        | GSQRT       MachRep Reg Reg -- src, dst
        | GSIN        MachRep Reg Reg -- src, dst
        | GCOS        MachRep Reg Reg -- src, dst
        | GTAN        MachRep Reg Reg -- src, dst

        | GFREE         -- do ffree on all x86 regs; an ugly hack
#endif

#if x86_64_TARGET_ARCH
-- SSE2 floating point: we use a restricted set of the available SSE2
-- instructions for floating-point.

        -- use MOV for moving (either movss or movsd (movlpd better?))

        | CVTSS2SD    Reg Reg         -- F32 to F64
        | CVTSD2SS    Reg Reg         -- F64 to F32
        | CVTSS2SI    Operand Reg     -- F32 to I32/I64 (with rounding)
        | CVTSD2SI    Operand Reg     -- F64 to I32/I64 (with rounding)
        | CVTSI2SS    Operand Reg     -- I32/I64 to F32
        | CVTSI2SD    Operand Reg     -- I32/I64 to F64

        -- use ADD & SUB for arithmetic.  In both cases, operands
        -- are  Operand Reg.

        -- SSE2 floating-point division:
        | FDIV        MachRep Operand Operand   -- divisor, dividend(dst)

        -- use CMP for comparisons.  ucomiss and ucomisd instructions
        -- compare single/double prec floating point respectively.

        | SQRT        MachRep Operand Reg       -- src, dst
#endif

-- Comparison
        | TEST          MachRep Operand Operand
        | CMP           MachRep Operand Operand
        | SETCC         Cond Operand

-- Stack Operations.
        | PUSH          MachRep Operand
        | POP           MachRep Operand
        -- both unused (SDM):
        --  | PUSHA
        --  | POPA

-- Jumping around.
        | JMP         Operand
        | JXX         Cond BlockId  -- includes unconditional branches
        | JMP_TBL     Operand [BlockId]  -- table jump
        | CALL        (Either Imm Reg) [Reg]

-- Other things.
        | CLTD MachRep   -- sign extend %eax into %edx:%eax

        | FETCHGOT    Reg  -- pseudo-insn for ELF position-independent code
                           -- pretty-prints as
                           --       call 1f
                           -- 1:    popl %reg
                           --       addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
        | FETCHPC     Reg  -- pseudo-insn for Darwin position-independent code
                           -- pretty-prints as
                           --       call 1f
                           -- 1:    popl %reg


-- Operand of an x86/x86_64 instruction.
data Operand
  = OpReg  Reg          -- register
  | OpImm  Imm          -- immediate value
  | OpAddr AddrMode     -- memory reference

#endif /* i386 or x86_64 */

#if i386_TARGET_ARCH
-- | If the instruction list contains at least one fake FP instruction (as
-- judged by 'is_G_instr'), put a GFREE in front of every CALL and JMP;
-- otherwise return the list untouched.
i386_insert_ffrees :: [Instr] -> [Instr]
i386_insert_ffrees insns
   | any is_G_instr insns
   = concatMap ffree_before_nonlocal_transfers insns
   | otherwise
   = insns

-- | Expand a single instruction: CALL and JMP are preceded by a GFREE,
-- everything else is passed through as a singleton list.
ffree_before_nonlocal_transfers :: Instr -> [Instr]
ffree_before_nonlocal_transfers insn
   = case insn of
        CALL _ _ -> [GFREE, insn]
        JMP _    -> [GFREE, insn]
        _        -> [insn]


-- if you ever add a new FP insn to the fake x86 FP insn set,
-- you must update this too
--
-- | True for the fake x86 FP instructions, False for all other instructions.
-- Panics when handed a GFREE.
is_G_instr :: Instr -> Bool
is_G_instr instr
   = case instr of
        GMOV _ _      -> True
        GLD _ _ _     -> True
        GST _ _ _     -> True
        GLDZ _        -> True
        GLD1 _        -> True
        GFTOI _ _     -> True
        GDTOI _ _     -> True
        GITOF _ _     -> True
        GITOD _ _     -> True
        GADD _ _ _ _  -> True
        GDIV _ _ _ _  -> True
        GSUB _ _ _ _  -> True
        GMUL _ _ _ _  -> True
        GCMP _ _ _    -> True
        GABS _ _ _    -> True
        GNEG _ _ _    -> True
        GSQRT _ _ _   -> True
        GSIN _ _ _    -> True
        GCOS _ _ _    -> True
        GTAN _ _ _    -> True
        GFREE         -> panic "is_G_instr: GFREE (!)"
        _             -> False
#endif /* i386_TARGET_ARCH */


-- -----------------------------------------------------------------------------
-- Sparc instructions

#if sparc_TARGET_ARCH

-- data Instr continues...

-- Loads and stores.
              | LD            MachRep AddrMode Reg -- size, src, dst
              | ST            MachRep Reg AddrMode -- size, src, dst

-- Int Arithmetic.
              | ADD           Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
              | SUB           Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
              | UMUL               Bool Reg RI Reg --     cc?, src1, src2, dst
              | SMUL               Bool Reg RI Reg --     cc?, src1, src2, dst
              | RDY           Reg       -- move contents of Y register to reg

-- Simple bit-twiddling.
              | AND           Bool Reg RI Reg -- cc?, src1, src2, dst
              | ANDN          Bool Reg RI Reg -- cc?, src1, src2, dst
              | OR            Bool Reg RI Reg -- cc?, src1, src2, dst
              | ORN           Bool Reg RI Reg -- cc?, src1, src2, dst
              | XOR           Bool Reg RI Reg -- cc?, src1, src2, dst
              | XNOR          Bool Reg RI Reg -- cc?, src1, src2, dst
              | SLL           Reg RI Reg -- src1, src2, dst
              | SRL           Reg RI Reg -- src1, src2, dst
              | SRA           Reg RI Reg -- src1, src2, dst
              | SETHI         Imm Reg -- src, dst
              | NOP                   -- Really SETHI 0, %g0, but worth an alias

-- Float Arithmetic.

-- Note that we cheat by treating F{ABS,MOV,NEG} of doubles as single
-- instructions right up until we spit them out.
              | FABS          MachRep Reg Reg      -- src dst
              | FADD          MachRep Reg Reg Reg  -- src1, src2, dst
              | FCMP          Bool MachRep Reg Reg -- exception?, size, src1, src2
              | FDIV          MachRep Reg Reg Reg -- src1, src2, dst
              | FMOV          MachRep Reg Reg     -- src, dst
              | FMUL          MachRep Reg Reg Reg -- src1, src2, dst
              | FNEG          MachRep Reg Reg     -- src, dst
              | FSQRT         MachRep Reg Reg     -- src, dst
              | FSUB          MachRep Reg Reg Reg -- src1, src2, dst
              | FxTOy         MachRep MachRep Reg Reg -- src, dst

-- Jumping around.
              | BI            Cond Bool Imm -- cond, annul?, target
              | BF            Cond Bool Imm -- cond, annul?, target

              | JMP           AddrMode     -- target
              | CALL          (Either Imm Reg) Int Bool -- target, args, terminal

-- | True for an immediate zero (ImmInt 0 or ImmInteger 0) and for
-- RealReg 0; False for anything else.
riZero :: RI -> Bool
riZero (RIImm (ImmInt 0))       = True
riZero (RIImm (ImmInteger 0))   = True
riZero (RIReg (RealReg 0))      = True
riZero _                        = False

-- Calculate the effective address which would be used by the
-- corresponding fpRel sequence.  fpRel is in MachRegs.lhs,
-- alas -- can't have fpRelEA here because of module dependencies.
--
-- Concretely: an ADD instruction computing fp + n * wORD_SIZE into dst.
fpRelEA :: Int -> Reg -> Instr
fpRelEA n dst
   = ADD False False fp (RIImm (ImmInt (n * wORD_SIZE))) dst

-- Code to shift the stack pointer by n words.
moveSp :: Int -> Instr
moveSp n
   = ADD False False sp (RIImm (ImmInt (n * wORD_SIZE))) sp

-- Produce the second-half-of-a-double register given the first half.
-- Only an even-numbered RealReg with number >= 32 is accepted; any other
-- register is a panic.
fPair :: Reg -> Reg
fPair (RealReg n) | n >= 32 && n `mod` 2 == 0  = RealReg (n+1)
fPair other = pprPanic "fPair(sparc NCG)" (ppr other)
#endif /* sparc_TARGET_ARCH */


-- -----------------------------------------------------------------------------
-- PowerPC instructions

#if powerpc_TARGET_ARCH
-- data Instr continues...

-- Loads and stores.
              | LD      MachRep Reg AddrMode -- Load size, dst, src
              | LA      MachRep Reg AddrMode -- Load arithmetic size, dst, src
              | ST      MachRep Reg AddrMode -- Store size, src, dst
              | STU     MachRep Reg AddrMode -- Store with Update size, src, dst
              | LIS     Reg Imm -- Load Immediate Shifted dst, src
              | LI      Reg Imm -- Load Immediate dst, src
              | MR      Reg Reg -- Move Register dst, src -- also for fmr

              | CMP     MachRep Reg RI -- size, src1, src2
              | CMPL    MachRep Reg RI -- size, src1, src2

              | BCC     Cond BlockId
              | JMP     CLabel          -- same as branch,
                                        -- but with CLabel instead of block ID
              | MTCTR   Reg
              | BCTR    [BlockId]       -- with list of local destinations

              | BL      CLabel [Reg]    -- with list of argument regs
              | BCTRL   [Reg]

              | ADD     Reg Reg RI -- dst, src1, src2
              | ADDC    Reg Reg Reg -- (carrying) dst, src1, src2
              | ADDE    Reg Reg Reg -- (extend) dst, src1, src2
              | ADDIS   Reg Reg Imm -- Add Immediate Shifted dst, src1, src2
              | SUBF    Reg Reg Reg -- dst, src1, src2 ; dst = src2 - src1
              | MULLW   Reg Reg RI
              | DIVW    Reg Reg Reg
              | DIVWU   Reg Reg Reg

              | MULLW_MayOflo Reg Reg Reg
                                        -- dst = 1 if src1 * src2 overflows
                                        -- pseudo-instruction; pretty-printed as:
                                        -- mullwo. dst, src1, src2
                                        -- mfxer dst
                                        -- rlwinm dst, dst, 2, 31,31

              | AND     Reg Reg RI -- dst, src1, src2
              | OR      Reg Reg RI -- dst, src1, src2
              | XOR     Reg Reg RI -- dst, src1, src2
              | XORIS   Reg Reg Imm -- XOR Immediate Shifted dst, src1, src2

              | EXTS    MachRep Reg Reg

              | NEG     Reg Reg
              | NOT     Reg Reg

              | SLW     Reg Reg RI      -- shift left word
              | SRW     Reg Reg RI      -- shift right word
              | SRAW    Reg Reg RI      -- shift right arithmetic word

                        -- Rotate Left Word Immediate then AND with Mask
              | RLWINM  Reg Reg Int Int Int

              | FADD    MachRep Reg Reg Reg
              | FSUB    MachRep Reg Reg Reg
              | FMUL    MachRep Reg Reg Reg
              | FDIV    MachRep Reg Reg Reg
              | FNEG    Reg Reg  -- negate is the same for single and double prec.

              | FCMP    Reg Reg

              | FCTIWZ  Reg Reg         -- convert to integer word
              | FRSP    Reg Reg         -- reduce to single precision
                                        -- (but destination is a FP register)

              | CRNOR   Int Int Int     -- condition register nor
              | MFCR    Reg             -- move from condition register

              | MFLR    Reg             -- move from link register
              | FETCHPC Reg             -- pseudo-instruction:
                                        -- bcl to next insn, mflr reg

              | LWSYNC -- memory barrier
#endif /* powerpc_TARGET_ARCH */