1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
|
/* -----------------------------------------------------------------------------
*
* (c) The GHC Team, 1998-2008
*
* MegaBlock Allocator interface.
*
* See wiki commentary at
* http://hackage.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
*
* ---------------------------------------------------------------------------*/
#ifndef RTS_STORAGE_MBLOCK_H
#define RTS_STORAGE_MBLOCK_H
extern lnat mblocks_allocated;
extern void initMBlocks(void);
extern void * getMBlock(void);
extern void * getMBlocks(nat n);
extern void freeAllMBlocks(void);
#ifdef DEBUG
extern void *getFirstMBlock(void);
extern void *getNextMBlock(void *mblock);
#endif
/* -----------------------------------------------------------------------------
The HEAP_ALLOCED() test.
HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
It needs to be FAST.
See wiki commentary at
http://hackage.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
Implementation of HEAP_ALLOCED
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Since heap is allocated in chunks of megablocks (MBLOCK_SIZE), we
can just use a table to record which megablocks in the address
space belong to the heap. On a 32-bit machine, with 1Mb
megablocks, using 8 bits for each entry in the table, the table
requires 4k. Lookups during GC will be fast, because the table
will be quickly cached (indeed, performance measurements showed no
measurable difference between doing the table lookup and using a
constant comparison).
On 64-bit machines, we cache one 12-bit block map that describes
4096 megablocks or 4GB of memory. If HEAP_ALLOCED is called for
an address that is not in the cache, it calls slowIsHeapAlloced
(see MBlock.c) which will find the block map for the 4GB block in
question.
-------------------------------------------------------------------------- */
#if SIZEOF_VOID_P == 4
extern StgWord8 mblock_map[];
/* On a 32-bit machine a 4KB table is always sufficient */
# define MBLOCK_MAP_SIZE 4096
# define MBLOCK_MAP_ENTRY(p) ((StgWord)(p) >> MBLOCK_SHIFT)
# define HEAP_ALLOCED(p) mblock_map[MBLOCK_MAP_ENTRY(p)]
# define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)
/* -----------------------------------------------------------------------------
HEAP_ALLOCED for 64-bit machines.
Here are some cache layout options:
[1]
16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
mblock size = 20 bits
entries = 8192 13 bits
line size = 0 bits (1 bit of value)
tag size = 15 bits
= 48 bits
[2]
32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
mblock size = 20 bits
entries = 16384 14 bits
line size = 2 bits (4 bits of value)
tag size = 12 bits
= 48 bits
[3]
16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
mblock size = 20 bits
entries = 8192 13 bits
line size = 1 bits (2 bits of value)
tag size = 14 bits
= 48 bits
[4]
4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
mblock size = 20 bits
entries = 1024 10 bits
line size = 4 bits (16 bits of value)
tag size = 14 bits
= 48 bits
[5]
4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
mblock size = 20 bits
entries = 512 9 bits
line size = 5 bits (32 bits of value)
tag size = 14 bits
= 48 bits
We actually use none of the above. After much experimentation it was
found that optimising the lookup is the most important factor,
followed by reducing the number of misses. To that end, we use a
variant of [1] in which each cache entry is ((mblock << 1) + value)
where value is 0 for non-heap and 1 for heap. The cache entries can
be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
1 bit for the value. The cache can be as big as we like, but
currently we use 8k entries, giving us 8GB capacity.
---------------------------------------------------------------------------- */
#elif SIZEOF_VOID_P == 8
#define MBC_LINE_BITS 0
#define MBC_TAG_BITS 15
typedef StgWord32 MbcCacheLine; // could use 16, but 32 was faster
typedef StgWord8 MBlockMapLine;
#define MBLOCK_MAP_LINE(p) (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))
#define MBC_LINE_SIZE (1<<MBC_LINE_BITS)
#define MBC_SHIFT (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
#define MBC_ENTRIES (1<<MBC_SHIFT)
extern MbcCacheLine mblock_cache[];
#define MBC_LINE(p) ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))
#define MBLOCK_MAP_ENTRIES (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))
typedef struct {
StgWord32 addrHigh32;
MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
} MBlockMap;
extern lnat mpc_misses;
StgBool HEAP_ALLOCED_miss(StgWord mblock, void *p);
INLINE_HEADER
StgBool HEAP_ALLOCED(void *p)
{
StgWord mblock;
nat entry_no;
MbcCacheLine entry, value;
mblock = (StgWord)p >> MBLOCK_SHIFT;
entry_no = mblock & (MBC_ENTRIES-1);
entry = mblock_cache[entry_no];
value = entry ^ (mblock << 1);
// this formulation coaxes gcc into prioritising the value==1
// case, which we expect to be the most common.
// __builtin_expect() didn't have any useful effect (gcc-4.3.0).
if (value == 1) {
return 1;
} else if (value == 0) {
return 0;
} else {
// putting the rest out of line turned out to be a slight
// performance improvement:
return HEAP_ALLOCED_miss(mblock,p);
}
}
// In the parallel GC, the cache itself is safe to *read*, and can be
// updated atomically, but we need to place a lock around operations
// that touch the MBlock map.
INLINE_HEADER
StgBool HEAP_ALLOCED_GC(void *p)
{
StgWord mblock;
nat entry_no;
MbcCacheLine entry, value;
StgBool b;
mblock = (StgWord)p >> MBLOCK_SHIFT;
entry_no = mblock & (MBC_ENTRIES-1);
entry = mblock_cache[entry_no];
value = entry ^ (mblock << 1);
if (value == 1) {
return 1;
} else if (value == 0) {
return 0;
} else {
// putting the rest out of line turned out to be a slight
// performance improvement:
ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
b = HEAP_ALLOCED_miss(mblock,p);
RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
return b;
}
}
#else
# error HEAP_ALLOCED not defined
#endif
#endif /* RTS_STORAGE_MBLOCK_H */
|