/* rts/sm/GCThread.h */

/* -----------------------------------------------------------------------------
 *
 * (c) The GHC Team 1998-2008
 *
 * Generational garbage collector
 *
 * Documentation on the architecture of the Garbage Collector can be
 * found in the online commentary:
 *
 *   https://gitlab.haskell.org/ghc/ghc/wikis/commentary/rts/storage/gc
 *
 * ---------------------------------------------------------------------------*/

#pragma once

#include "WSDeque.h"
#include "GetTime.h" // for Ticks

#include "BeginPrivate.h"

/* -----------------------------------------------------------------------------
   General scheme

   ToDo: move this to the wiki when the implementation is done.

   We're only going to try to parallelise the copying GC for now.  The
   Plan is as follows.

   Each thread has a gc_thread structure (see below) which holds its
   thread-local data.  We'll keep a pointer to this in a thread-local
   variable, or possibly in a register.

   In the gc_thread structure is a gen_workspace for each generation.  The
   primary purpose of the gen_workspace is to hold evacuated objects;
   when an object is evacuated, it is copied to the "todo" block in
   the thread's workspace for the appropriate generation.  When the todo
   block is full, it is pushed to the global gen->todos list, which
   is protected by a lock.  (In fact, we interpose a one-place buffer
   here to reduce contention.)

   A thread repeatedly grabs a block of work from one of the
   gen->todos lists, scavenges it, and keeps the scavenged block on
   its own ws->scavd_list (this is to avoid unnecessary contention
   returning the completed buffers back to the generation: we can just
   collect them all later).

   When there is no global work to do, we start scavenging the todo
   blocks in the workspaces.  This is where the scan_bd field comes
   in: we can scan the contents of the todo block.  When we have
   scavenged the contents of the todo block (up to todo_bd->free), we
   don't want to move this block immediately to the scavd_list,
   because it is probably only partially full.  So we remember that we
   have scanned up to this point by saving the block in ws->scan_bd,
   with the current scan pointer in ws->scan.  Later, when more
   objects have been copied to this block, we can come back and scan
   the rest.  When we visit this workspace again in the future,
   scan_bd may still be the same as todo_bd, or it might be different:
   if enough objects were copied into this block that it filled up,
   then we will have allocated a new todo block, but *not* pushed the
   old one to the generation, because it is partially scanned.

   The reason to leave scanning the todo blocks until last is that we
   want to deal with full blocks as far as possible.
   ------------------------------------------------------------------------- */


/* -----------------------------------------------------------------------------
   Generation Workspace

   A generation workspace exists for each generation for each GC
   thread. The GC thread takes a block from the todos list of the
   generation into scan_bd and then scans it.  Objects referred to
   by those in the scan block are copied into the todo or scavd blocks
   of the relevant generation.

   ------------------------------------------------------------------------- */

// Per-GC-thread, per-generation workspace.  One of these exists for every
// (gc_thread, generation) pair; they live in the gens[] array at the end of
// gc_thread.  The field layout matters: the struct is aligned to 64 bytes
// (see the note after the definition), so do not reorder or resize fields
// without re-checking the padding.
typedef struct gen_workspace_ {
    generation * gen;           // the gen for this workspace
    struct gc_thread_ * my_gct; // the gc_thread that contains this workspace

    // where objects to be scavenged go
    bdescr *     todo_bd;
    StgPtr       todo_free;            // free ptr for todo_bd
    StgPtr       todo_lim;             // lim for todo_bd

    // NOTE(review): todo_q is a WSDeque (declared in WSDeque.h); presumably
    // it queues full todo blocks, with todo_overflow / n_todo_overflow
    // holding (and counting) blocks that could not be queued -- confirm
    // against the code that pushes to these fields.
    WSDeque *    todo_q;
    bdescr *     todo_overflow;
    uint32_t     n_todo_overflow;

    // where large objects to be scavenged go
    bdescr *     todo_large_objects;

    // Objects that have already been scavenged.
    bdescr *     scavd_list;
    StgWord      n_scavd_blocks;     // count of blocks in this list
    StgWord      n_scavd_words;      // presumably the words held in scavd_list -- TODO confirm

    // Partially-full, scavenged, blocks
    bdescr *     part_list;
    StgWord      n_part_blocks;      // count of above
    StgWord      n_part_words;       // presumably the words held in part_list -- TODO confirm

    // Explicit padding; see the alignment note following this struct.
    StgWord pad[1];

} gen_workspace ATTRIBUTE_ALIGNED(64);
// Aligned so that computing gct->gens[n] is a shift, not a multiply.
// This fails if the size is <64, which is why we need the pad above.

/* ----------------------------------------------------------------------------
   GC thread object

   Every GC thread has one of these. It contains all the
   generation-specific workspaces and other GC-thread-local information.
   At some later point it may be useful to move this other information
   into the TLS store of the GC threads.
   ------------------------------------------------------------------------- */

/* Values for the wakeup field of gc_thread (that field only exists when
 * THREADED_RTS is defined).
 * NOTE(review): the precise meaning of each state and the legal transitions
 * between them are not visible in this header; the names suggest a
 * inactive -> standing-by -> running -> waiting-to-continue life cycle, but
 * confirm against the code that reads and writes wakeup. */
#define GC_THREAD_INACTIVE             0
#define GC_THREAD_STANDING_BY          1
#define GC_THREAD_RUNNING              2
#define GC_THREAD_WAITING_TO_CONTINUE  3

// Per-GC-thread state: identification, thread-local free-block buffer,
// static-object lists, evacuation flags, statistics and, at the very end,
// one gen_workspace per generation.  The trailing gens[] member is a
// flexible array, so it must remain the last field and the struct must be
// allocated with room for the workspaces.
typedef struct gc_thread_ {
    // NOTE(review): presumably the Capability this GC thread corresponds to
    // (mut_lists below is described as shared with "the corresponding
    // Capability") -- confirm.
    Capability *cap;

#if defined(THREADED_RTS)
    OSThreadId id;                 // The OS thread that this struct belongs to
    // NOTE(review): the locking protocol for gc_spin / mut_spin is not
    // visible in this header; see their users before changing them.
    SpinLock   gc_spin;
    SpinLock   mut_spin;
    // Holds one of the GC_THREAD_* values defined above this struct.
    volatile StgWord wakeup;       // NB not StgWord8; only StgWord is guaranteed atomic
#endif
    uint32_t thread_index;         // a zero based index identifying the thread

    bdescr * free_blocks;          // a buffer of free blocks for this thread,
                                   // usable during GC without accessing the
                                   // block allocator's spin lock.

    // These two lists are chained through the STATIC_LINK() fields of static
    // objects.  Pointers are tagged with the current static_flag, so before
    // following a pointer, untag it with UNTAG_STATIC_LIST_PTR().
    StgClosure* static_objects;            // live static objects
    StgClosure* scavenged_static_objects;  // static objects scavenged so far

    W_ gc_count;                   // number of GCs this thread has done

    // block that is currently being scanned
    bdescr *     scan_bd;

    // Remembered sets on this CPU.  Each GC thread has its own
    // private per-generation remembered sets, so it can add an item
    // to the remembered set without taking a lock.  The mut_lists
    // array on a gc_thread is the same as the one on the
    // corresponding Capability; we stash it here too for easy access
    // during GC; see recordMutableGen_GC().
    bdescr **    mut_lists;

    // --------------------
    // evacuate flags

    uint32_t evac_gen_no;          // Youngest generation that objects
                                   // should be evacuated to in
                                   // evacuate().  (Logically an
                                   // argument to evacuate, but it's
                                   // static a lot of the time so we
                                   // optimise it into a per-thread
                                   // variable).

    bool failed_to_evac;           // failure to evacuate an object typically
                                   // causes it to be recorded in the mutable
                                   // object list

    bool eager_promotion;          // forces promotion to the evac gen
                                   // instead of the to-space
                                   // corresponding to the object

    W_ thunk_selector_depth;       // used to avoid unbounded recursion in
                                   // evacuate() for THUNK_SELECTOR

    // -------------------
    // stats
    //
    // NOTE(review): the exact units and update points of the counters below
    // are not visible in this header (copied/scanned are presumably word
    // counts) -- confirm against the code that increments them.

    W_ copied;
    W_ scanned;
    W_ any_work;
    W_ no_work;
    W_ scav_find_work;

    Time gc_start_cpu;   // process CPU time
    Time gc_sync_start_elapsed;  // start of GC sync
    Time gc_start_elapsed;  // process elapsed time
    W_ gc_start_faults;

    // -------------------
    // workspaces

    // array of workspaces, indexed by gen->abs_no.  This is placed
    // directly at the end of the gc_thread structure so that we can get from
    // the gc_thread pointer to a workspace using only pointer
    // arithmetic, no memory access.  This happens in the inner loop
    // of the GC, see Evac.c:alloc_for_copy().
    gen_workspace gens[];
} gc_thread;


// NOTE(review): presumably the number of GC threads, and hence the number
// of entries in gc_threads -- confirm at the definition.
extern uint32_t n_gc_threads;

// Table of pointers to gc_thread structures (defined elsewhere).
extern gc_thread **gc_threads;

// Thread-local key, declared only for a threaded RTS built with the LLVM
// backend.  NOTE(review): presumably used to look up the current thread's
// gc_thread when a register/TLS variable is unavailable -- confirm.
#if defined(THREADED_RTS) && defined(CC_LLVM_BACKEND)
extern ThreadLocalKey gctKey;
#endif

#include "EndPrivate.h"