1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
|
module GHC.CmmToAsm.Reg.Linear.AArch64 where
import GHC.Prelude
import GHC.CmmToAsm.AArch64.Regs
import GHC.Platform.Reg.Class
import GHC.Platform.Reg
import GHC.Utils.Outputable
import GHC.Utils.Panic
import GHC.Platform
import Data.Word
import GHC.Stack
-- AArch64 has 32 64-bit general purpose registers: r0..r30, and zr/sp
-- AArch64 has 32 128bit floating point registers v0..v31 as part of the NEON
-- extension in Armv8-A.
--
-- Armv8-A is a fundamental change to the Arm architecture. It supports the
-- 64-bit Execution state called “AArch64”, and a new 64-bit instruction set
-- “A64”. To provide compatibility with the Armv7-A (32-bit architecture)
-- instruction set, a 32-bit variant of Armv8-A “AArch32” is provided. Most of
-- existing Armv7-A code can be run in the AArch32 execution state of Armv8-A.
--
-- these can be addressed as q/d/s/h/b 0..31, or v.f<size>[idx]
-- where size is 64, 32, 16, 8, ... and the index idx allows us
-- to access the given part.
--
-- History of Arm Adv SIMD
-- .---------------------------------------------------------------------------.
-- | Armv6 | Armv7-A | Armv8-A AArch64 |
-- | SIMD extension | NEON | NEON |
-- |===========================================================================|
-- | - Operates on 32-bit | - Separate reg. bank, | - Separate reg. bank, |
-- | GP ARM registers | 32x64-bit NEON regs | 32x128-bit NEON regs |
-- | - 8-bit/16-bit integer | - 8/16/32/64-bit int | - 8/16/32/64-bit int |
-- | | - Single precision fp | - Single precision fp |
-- | | | - Double precision fp |
-- | | | - Single/Double fp are |
-- | | | IEEE compliant |
-- | - 2x16-bit/4x8-bit ops | - Up to 16x8-bit ops | - Up to 16x8-bit ops |
-- | per instruction | per instruction | per instruction |
-- '---------------------------------------------------------------------------'
-- | Availability bitmaps for the two register banks tracked by this module.
--
-- A set bit marks the register with that index as available; a clear bit
-- marks it as taken.
data FreeRegs
  = FreeRegs
      !Word32 -- ^ general purpose bank (printed as @GPR@ by the 'Outputable' instance)
      !Word32 -- ^ floating point bank (printed as @FPR@ by the 'Outputable' instance)
-- | Debug rendering: both bitmaps as 32-character bit strings (see
-- 'showBits'), general purpose bank first, then the floating point bank.
instance Show FreeRegs where
  show (FreeRegs gprs fprs) =
    concat ["FreeRegs: ", showBits gprs, "; ", showBits fprs]
-- | Renders the register state as a three-row table: a header row with the
-- register indices 0..31, a @GPR@ row and an @FPR@ row.  A taken register is
-- marked with an @x@, an available one is left blank.
--
-- Every cell is two characters wide and the header label is padded to the
-- width of the @GPR@/@FPR@ labels, so that the marks line up underneath the
-- index they belong to.
instance Outputable FreeRegs where
  ppr (FreeRegs g f) =
       row "   " pad_int
    $$ row "GPR" (show_bit g)
    $$ row "FPR" (show_bit f)
    where
      -- One table row: a three-character label followed by 32 cells.
      row label cell = text label <+> foldr (\i rest -> cell i <+> rest) (text "") [0..31]

      -- Right-align single-digit indices so every header cell is two wide.
      pad_int i | i < 10 = char ' ' <> int i
      pad_int i = int i

      -- remember bit = 1 means it's available.
      show_bit bits bit | testBit bits bit = text "  "
      show_bit _ _ = text " x"
-- | The state in which no register of either bank is available.
noFreeRegs :: FreeRegs
noFreeRegs = FreeRegs allTaken allTaken
  where
    -- A clear bit means "not available", so an all-zero bitmap leaves
    -- nothing to hand out.
    allTaken = 0
-- | Render a 32-bit word as a string of @0@/@1@ characters, one per bit.
--
-- Bit 0 comes first, so the string reads in register-index order rather than
-- in the usual most-significant-bit-first order.
showBits :: Word32 -> String
showBits w = [ bitChar (testBit w ix) | ix <- [0..31] ]
  where
    bitChar isSet = if isSet then '1' else '0'
-- FR instance implementation (See Linear.FreeRegs)
-- | Mark a register as taken by clearing its availability bit.
--
-- Register numbers above 31 address the floating point bank (at index
-- @r - 32@); all other numbers index the general purpose bank directly.
--
-- Panics if the register is not currently available (a double allocation),
-- or if the 'RealReg' is not a 'RealRegSingle'.
allocateReg :: HasCallStack => RealReg -> FreeRegs -> FreeRegs
allocateReg (RealRegSingle regNo) (FreeRegs gprs fprs)
  | regNo > 31 =
      if testBit fprs fprIx
        then FreeRegs gprs (clearBit fprs fprIx)
        else panic $ "Linear.AArch64.allocReg: double allocation of float reg v" ++ show fprIx ++ "; " ++ showBits fprs
  | testBit gprs regNo = FreeRegs (clearBit gprs regNo) fprs
  | otherwise =
      pprPanic "Linear.AArch64.allocReg" $
        text ("double allocation of gp reg x" ++ show regNo ++ "; " ++ showBits gprs)
  where
    -- Position of the register inside the floating point bitmap.
    fprIx = regNo - 32
allocateReg _ _ = panic "Linear.AArch64.allocReg: bad reg"
-- For integer registers we start from 18 downwards (x19..x28 are taken by the
-- STG registers listed below)... the logic is similar to the ppc logic.
-- 31 is Stack Pointer
-- 30 is Link Register
-- 29 is Stack Frame (by convention)
-- 19-28 are callee save
-- the lower ones are all caller save
-- For this reason someone decided to give aarch64 only 6 regs for
-- STG:
-- 19: Base
-- 20: Sp
-- 21: Hp
-- 22-27: R1-R6
-- 28: SpLim
-- For LLVM code gen interop:
-- See https://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20150119/253722.html
-- and the current ghccc implementation here:
-- https://github.com/llvm/llvm-project/blob/161ae1f39816edf667aaa190bce702a86879c7bd/llvm/lib/Target/AArch64/AArch64CallingConvention.td#L324-L363
-- and https://gitlab.haskell.org/ghc/ghc/-/wikis/commentary/compiler/generated-code
-- for the STG discussion.
{- For reference the ghccc from the link above:
let Entry = 1 in
def CC_AArch64_GHC : CallingConv<[
CCIfType<[iPTR], CCBitConvertToType<i64>>,
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, f128], CCBitConvertToType<v2f64>>,
CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
CCIfType<[f32], CCAssignToReg<[S8, S9, S10, S11]>>,
CCIfType<[f64], CCAssignToReg<[D12, D13, D14, D15]>>,
// Promote i8/i16/i32 arguments to i64.
CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
// Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>>
]>;
-}
-- | List the registers of the given class that are currently available.
--
-- Registers are reported from the highest candidate index down to 0: the
-- floating point bank is scanned from index 31 (register numbers are offset
-- by 32), the general purpose bank from index 18.
getFreeRegs :: RegClass -> FreeRegs -> [RealReg]
getFreeRegs cls (FreeRegs gprs fprs) = case cls of
  -- For now we only support double and integer registers, floats will need to be promoted.
  RcFloat   -> []
  RcDouble  -> availableIn 32 fprs 31
  RcInteger -> availableIn 0 gprs 18
  where
    -- Every register whose bit is set in @bank@, walking from index @top@
    -- down to 0; @base@ converts a bank index into a register number.
    availableIn base bank top =
      [ RealRegSingle (base + ix) | ix <- [top, top - 1 .. 0], testBit bank ix ]
-- | The initial register state for a platform: starting from 'noFreeRegs',
-- every register returned by 'allocatableRegs' is released, i.e. marked as
-- available.
initFreeRegs :: Platform -> FreeRegs
initFreeRegs platform = foldl' markFree noFreeRegs (allocatableRegs platform)
  where
    markFree regs reg = releaseReg reg regs
-- | Mark a register as available again by setting its availability bit.
--
-- Register numbers above 31 address the floating point bank (at index
-- @r - 32@); all other numbers index the general purpose bank directly.
--
-- Panics if the register is already marked as available, or if the
-- 'RealReg' is not a 'RealRegSingle'.
releaseReg :: HasCallStack => RealReg -> FreeRegs -> FreeRegs
releaseReg (RealRegSingle regNo) (FreeRegs gprs fprs)
  | regNo > 31 =
      if testBit fprs fprIx
        then pprPanic "Linear.AArch64.releaseReg" (text "can't release non-allocated reg v" <> int fprIx)
        else FreeRegs gprs (setBit fprs fprIx)
  | testBit gprs regNo =
      pprPanic "Linear.AArch64.releaseReg" (text "can't release non-allocated reg x" <> int regNo)
  | otherwise = FreeRegs (setBit gprs regNo) fprs
  where
    -- Position of the register inside the floating point bitmap.
    fprIx = regNo - 32
releaseReg _ _ = pprPanic "Linear.AArch64.releaseReg" (text "bad reg")
|