1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
|
/* -----------------------------------------------------------------------------
* (c) The GHC Team 2006
*
* Initialization and use of the PAPI performance monitoring library
*
*
* To add events, or counters specific to your processor, modify
*
* init_countable_events
* papi_report
*
* ---------------------------------------------------------------------------*/
#ifdef USE_PAPI /* ugly */
#include <papi.h>
#include "Papi.h"
#include "Rts.h"
#include "RtsUtils.h"
#include "Stats.h"
#include "RtsFlags.h"
#include "OSThreads.h"
// Protects the aggregated counters: the start/stop functions in this file
// take it around their PAPI calls and their updates to the counter arrays
// and cycle totals. Only declared in the threaded RTS, where papi_init()
// initialises it.
#ifdef THREADED_RTS
static Mutex papi_counter_mutex;
#endif
/* One event to count: a PAPI event code paired with the name used when
 * the counter is reported.
 */
struct _papi_events {
int event_code; /* code handed to PAPI_add_event() */
const char * event_name; /* label printed next to the counter value */
};
/* Raw event codes, some of which are registered by the predefined event
 * groups in init_countable_events().
 *
 * Beware, these counters are Opteron specific.
 * The numbers were obtained using the papi_avail
 * and papi_native_avail utilities.
 * This is certainly not the official PAPI way
 * of doing things.
 */
#define FR_BR 0x40000040
#define FR_BR_MIS 0x40000041
#define FR_BR_MISCOMPARE 0x40000048
#define DC_ACCESS 0x40000019
#define DC_MISS 0x4000001a
#define FR_DISPATCH_STALLS 0x40000054
#define FR_DISPATCH_STALLS_BR 0x40000055
#define FR_DISPATCH_STALLS_FULL_REORDER 0x40000058
#define FR_DISPATCH_STALLS_FULL_RESERVATION 0x40000059
#define FR_DISPATCH_STALLS_FULL_LS 0x4000005b
#define DC_L2_REFILL_MOES 0x40001e1b
#define DC_SYS_REFILL_MOES 0x40001e1c
/* Size of the scratch buffer used when formatting counter values for the
 * report.
 * This is bad, it should be in a header */
#define BIG_STRING_LEN 512
/* Evaluate CALL, store its result in the global papi_error and, if the
 * result is not PAPI_OK, print a debug message giving the file, line and
 * error code. Execution continues after a failure; callers that care must
 * inspect papi_error themselves.
 *
 * NOTE(review): this expands to a bare `if` statement rather than a
 * do { } while (0) block, so it must not be used directly in front of an
 * `else`.
 */
#define PAPI_CHECK(CALL) \
if((papi_error=(CALL)) != PAPI_OK) { \
debugBelch("PAPI function failed in module %s at line %d with error code %d\n", \
__FILE__,__LINE__,papi_error); \
}
/* While PAPI reporting is going on this flag is on.
 * NOTE(review): nothing in this file sets it; presumably it is maintained
 * by other RTS code - confirm. */
int papi_is_reporting;
/* Event sets and counter arrays for GC and mutator */
int MutatorEvents = PAPI_NULL;
int GCEvents = PAPI_NULL;
/* Result of the most recent PAPI call made through PAPI_CHECK or by
 * papi_add_events(). */
int papi_error;
/* Arbitrary, to avoid using malloc */
#define MAX_PAPI_EVENTS 10
/* Number of valid entries at the front of papi_events. */
static nat n_papi_events = 0;
/* Events counted during GC and Mutator execution */
/* Filled in by papi_add_event(); only the first n_papi_events entries
 * are meaningful. */
static struct _papi_events papi_events[MAX_PAPI_EVENTS];
/* Accumulated counter values. Entry i of each array belongs to the event
 * described by papi_events[i]. */
long_long MutatorCounters[MAX_PAPI_EVENTS];
long_long GC0Counters[MAX_PAPI_EVENTS];
long_long GC1Counters[MAX_PAPI_EVENTS];
/* Cycle bookkeeping: the start_* values are sampled when counting starts,
 * the others accumulate the cycles measured between start and stop. */
long_long start_mutator_cycles;
long_long mutator_cycles;
long_long start_gc_cycles;
long_long gc0_cycles;
long_long gc1_cycles;
/* Forward declarations for helpers defined further down. */
static long_long papi_counter(long_long values[],int event);
static void papi_add_events(int EventSet);
/* Upper bound on the number of events papi_add_event() accepts. The
 * initial value is replaced by the result of PAPI_num_counters() in
 * init_countable_events(). */
static nat max_hardware_counters = 2;
/* If you want to add events to count, extend the
* init_countable_events and the papi_report function.
* Be aware that your processor can count a limited number
* of events simultaneously, you can turn on multiplexing
* to increase that number, though.
*/
static void papi_add_event(const char *name, int code)
{
if (n_papi_events >= max_hardware_counters) {
errorBelch("too many PAPI events for this CPU (max: %d)",
max_hardware_counters);
stg_exit(EXIT_FAILURE);
}
papi_events[n_papi_events].event_code = code;
papi_events[n_papi_events].event_name = name;
n_papi_events++;
}
static void
init_countable_events(void)
{
max_hardware_counters = PAPI_num_counters();
#define PAPI_ADD_EVENT(EVENT) papi_add_event(#EVENT,EVENT)
if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_BRANCH) {
PAPI_ADD_EVENT(FR_BR);
PAPI_ADD_EVENT(FR_BR_MIS);
/* Docs are wrong? Opteron does not count indirect branch misses exclusively */
PAPI_ADD_EVENT(FR_BR_MISCOMPARE);
} else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_STALLS) {
PAPI_ADD_EVENT(FR_DISPATCH_STALLS);
PAPI_ADD_EVENT(FR_DISPATCH_STALLS_BR);
PAPI_ADD_EVENT(FR_DISPATCH_STALLS_FULL_LS);
} else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L1) {
PAPI_ADD_EVENT(PAPI_L1_DCA);
PAPI_ADD_EVENT(PAPI_L1_DCM);
} else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L2) {
PAPI_ADD_EVENT(PAPI_L2_DCA);
PAPI_ADD_EVENT(PAPI_L2_DCM);
} else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CB_EVENTS) {
PAPI_ADD_EVENT(DC_L2_REFILL_MOES);
PAPI_ADD_EVENT(DC_SYS_REFILL_MOES);
PAPI_ADD_EVENT(FR_BR_MIS);
} else if (RtsFlags.PapiFlags.eventType==PAPI_USER_EVENTS) {
nat i;
char *name;
int code;
for (i = 0; i < RtsFlags.PapiFlags.numUserEvents; i++) {
name = RtsFlags.PapiFlags.userEvents[i];
PAPI_CHECK(PAPI_event_name_to_code(name, &code))
papi_add_event(name, code);
}
} else {
// PAPI_ADD_EVENT(PAPI_L1_DCA); // L1 data cache accesses
// PAPI_ADD_EVENT(PAPI_L1_ICR); // L1 instruction cache reads
// PAPI_ADD_EVENT(PAPI_L1_ICM); // L1 instruction cache misses
// PAPI_ADD_EVENT(PAPI_L1_STM); // L1 store misses
// PAPI_ADD_EVENT(PAPI_L1_DCM); // L1 data cache misses
// PAPI_ADD_EVENT(PAPI_L1_LDM); // L1 load misses
// PAPI_ADD_EVENT(PAPI_L2_TCM); // L2 cache misses
// PAPI_ADD_EVENT(PAPI_L2_STM); // L2 store misses
// PAPI_ADD_EVENT(PAPI_L2_DCW); // L2 data cache writes
// PAPI_ADD_EVENT(PAPI_L2_DCR); // L2 data cache reads
// PAPI_ADD_EVENT(PAPI_L2_TCW); // L2 cache writes
// PAPI_ADD_EVENT(PAPI_L2_TCR); // L2 cache reads
// PAPI_ADD_EVENT(PAPI_CA_CLN); // exclusive access to clean cache line
// PAPI_ADD_EVENT(PAPI_TLB_DM); // TLB misses
PAPI_ADD_EVENT(PAPI_TOT_INS); // Total instructions
PAPI_ADD_EVENT(PAPI_TOT_CYC); // Total instructions
// PAPI_ADD_EVENT(PAPI_CA_SHR); // exclusive access to shared cache line
// PAPI_ADD_EVENT(PAPI_RES_STL); // Cycles stalled on any resource
}
// We might also consider:
// PAPI_BR_MSP Conditional branch instructions mispredicted
// PAPI_RES_STL Cycles stalled on any resource
};
/* Print one line of the counter report: the event name followed by its
 * value, formatted with commas.
 */
static void
papi_report_event(const char *name, ullong value)
{
    /* Static scratch buffer that receives the formatted number. */
    static char formatted[BIG_STRING_LEN];

    ullong_format_string(value, formatted, rtsTrue /* commas */);
    statsPrintf(" %15s %15s\n", name, formatted);
}
/* Print the counter of `event` as a percentage of the counter of
 * `total_event`, both looked up in `counters`.
 *
 * When the total is zero (for instance because that phase never ran) the
 * ratio is undefined; "n/a" is printed instead of dividing by zero.
 */
static void
papi_report_pct(long_long counters[],
                const char *event_name, int event,
                const char *total_name, int total_event)
{
    long_long total = papi_counter(counters, total_event);

    if (total == 0) {
        statsPrintf(" %s %% of %s : n/a\n", event_name, total_name);
        return;
    }

    statsPrintf(" %s %% of %s : %.1f%%\n", event_name, total_name,
                papi_counter(counters, event) * 100.0 / total);
}

/* Report the value of a counter as a percentage of another counter.
 * The macro only exists to turn the event identifiers into their names. */
#define PAPI_REPORT_PCT(EVENTSET,EVENT,EVENTTOT) \
    papi_report_pct(EVENTSET, #EVENT, EVENT, #EVENTTOT, EVENTTOT)

/* Report one set of counters (mutator, GC(0) or GC(1)).
 *
 * counters: accumulated values, indexed like papi_events.
 *
 * Every registered event is printed with its value. For the branch, L1
 * cache and L2 cache event groups the derived miss percentages are
 * printed as well.
 */
static void
papi_report(long_long counters[])
{
    nat i;

    for (i = 0; i < n_papi_events; i++) {
        papi_report_event(papi_events[i].event_name, counters[i]);
    }

    if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_BRANCH) {
        PAPI_REPORT_PCT(counters,FR_BR_MIS,FR_BR);
        PAPI_REPORT_PCT(counters,FR_BR_MISCOMPARE,FR_BR);
    }
    else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L1) {
        PAPI_REPORT_PCT(counters,PAPI_L1_DCM,PAPI_L1_DCA);
    }
    else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L2) {
        PAPI_REPORT_PCT(counters,PAPI_L2_DCM,PAPI_L2_DCA);
    }
}
/* Print the complete PAPI report: one section each for the mutator,
 * GC(0) and GC(1), in that order.
 */
void
papi_stats_report (void)
{
/* One report section: heading, cycle total, then every event counter. */
#define PAPI_REPORT_SECTION(HEADING, CYCLES, COUNTERS) \
    do {                                               \
        statsPrintf(HEADING);                          \
        papi_report_event("CYCLES", CYCLES);           \
        papi_report(COUNTERS);                         \
    } while (0)

    PAPI_REPORT_SECTION(" Mutator CPU counters\n", mutator_cycles, MutatorCounters);
    PAPI_REPORT_SECTION("\n GC(0) CPU counters\n", gc0_cycles, GC0Counters);
    PAPI_REPORT_SECTION("\n GC(1) CPU counters\n", gc1_cycles, GC1Counters);

#undef PAPI_REPORT_SECTION
}
/* Register the calling thread with PAPI, create a new event set and add
 * every event in papi_events to it.
 *
 * event_set: receives the handle of the newly created event set.
 *
 * Failures of the PAPI calls are reported through PAPI_CHECK (a debug
 * message, with the error code left in papi_error); they do not abort.
 */
void
papi_init_eventset (int *event_set)
{
    /* Check the registration result like every other PAPI call in this
     * file instead of dropping it silently. */
    PAPI_CHECK( PAPI_register_thread());
    PAPI_CHECK( PAPI_create_eventset(event_set));
    papi_add_events(*event_set);
}
/* Initialise the performance tracking library.
 *
 * Initialises PAPI itself, enables PAPI thread support and the counter
 * mutex in the threaded RTS, selects the events to count and creates the
 * mutator and GC event sets.
 *
 * Exits the RTS if PAPI cannot be initialised: a positive result other
 * than PAPI_VER_CURRENT is reported as a version mismatch, anything else
 * through sysErrorBelch().
 */
void
papi_init (void)
{
    int ver = PAPI_library_init(PAPI_VER_CURRENT);

    if (ver != PAPI_VER_CURRENT) {
        if (ver > 0) {
            errorBelch("PAPI_library_init: wrong version: %x", ver);
        } else {
            sysErrorBelch("PAPI_library_init");
        }
        stg_exit(EXIT_FAILURE);
    }

#ifdef THREADED_RTS
    {
        int err = PAPI_thread_init(osThreadId);

        if (err < 0) {
            barf("PAPI_thread_init: %d",err);
        }
        initMutex(&papi_counter_mutex);
    }
#endif

    init_countable_events();

    papi_init_eventset(&MutatorEvents);
    papi_init_eventset(&GCEvents);
}
/* Look up the value recorded for `event` in `values`.
 *
 * values: counter array indexed like papi_events.
 * event:  PAPI event code to search for.
 *
 * Returns the value at the position of the first registered event with
 * that code. If the event is not registered, a debug message is printed
 * and 0 is returned.
 */
static long_long
papi_counter(long_long values[],int event)
{
    nat idx = 0;

    /* Advance to the first entry with a matching code, if any. */
    while (idx < n_papi_events && papi_events[idx].event_code != event) {
        idx++;
    }

    if (idx < n_papi_events) {
        return values[idx];
    }

    /* Passed a wrong event? */
    debugBelch("Event %d is not part of event set\n",event);
    return 0;
}
/* Add every event registered in papi_events to the given event set.
 *
 * A failure to add an event is reported with a debug message and does not
 * stop the remaining events from being added. The result of each
 * PAPI_add_event() call is left in papi_error.
 */
static void
papi_add_events(int EventSet)
{
    nat idx;

    for (idx = 0; idx < n_papi_events; idx++) {
        papi_error = PAPI_add_event(EventSet, papi_events[idx].event_code);
        if (papi_error == PAPI_OK) {
            continue;
        }
        debugBelch("Failed adding %s to event set with error code %d\n",
                   papi_events[idx].event_name,papi_error);
    }
}
/* Source of the cycle counts accumulated in mutator_cycles, gc0_cycles
 * and gc1_cycles.
 *
 * We should be using elapsed cycles
 * to be consistent with time metric chosen in Stats.c (Elapsed time).
 * This is an approximation to the cycles that the program spends.
 * Note that the counters, in contrast, are virtual and user space.
 */
#define PAPI_cycles PAPI_get_virt_cyc
/* Start counting the mutator events and remember the current cycle count
 * as the start of this mutator interval. Runs under papi_counter_mutex.
 */
void
papi_start_mutator_count(void)
{
ACQUIRE_LOCK(&papi_counter_mutex);
PAPI_CHECK( PAPI_start(MutatorEvents));
/* Baseline for the cycle delta computed in papi_stop_mutator_count(). */
start_mutator_cycles = PAPI_cycles();
RELEASE_LOCK(&papi_counter_mutex);
}
/* Stop counting the mutator events: add the cycles elapsed since
 * papi_start_mutator_count() to mutator_cycles, accumulate the event
 * counts into MutatorCounters via PAPI_accum() and stop the event set.
 * Runs under papi_counter_mutex.
 */
void
papi_stop_mutator_count(void)
{
ACQUIRE_LOCK(&papi_counter_mutex);
/* Unlike the GC variants, the cycles are sampled before the counters are
 * read and stopped. */
mutator_cycles += PAPI_cycles() - start_mutator_cycles;
PAPI_CHECK( PAPI_accum(MutatorEvents,MutatorCounters));
/* The values were already collected by PAPI_accum(), hence NULL here. */
PAPI_CHECK( PAPI_stop(MutatorEvents,NULL));
RELEASE_LOCK(&papi_counter_mutex);
}
/* Start counting the GC events and remember the current cycle count as
 * the start of this GC interval. The interval is closed by either
 * papi_stop_gc0_count() or papi_stop_gc1_count(). Runs under
 * papi_counter_mutex.
 */
void
papi_start_gc_count(void)
{
ACQUIRE_LOCK(&papi_counter_mutex);
PAPI_CHECK( PAPI_start(GCEvents));
/* Baseline for the cycle delta computed by the stop functions. */
start_gc_cycles = PAPI_cycles();
RELEASE_LOCK(&papi_counter_mutex);
}
/* Stop counting the GC events and attribute the interval to GC(0):
 * accumulate the event counts into GC0Counters via PAPI_accum(), stop the
 * GC event set and add the cycles elapsed since papi_start_gc_count() to
 * gc0_cycles. Runs under papi_counter_mutex.
 */
void
papi_stop_gc0_count(void)
{
ACQUIRE_LOCK(&papi_counter_mutex);
PAPI_CHECK( PAPI_accum(GCEvents,GC0Counters));
/* The values were already collected by PAPI_accum(), hence NULL here. */
PAPI_CHECK( PAPI_stop(GCEvents,NULL));
gc0_cycles += PAPI_cycles() - start_gc_cycles;
RELEASE_LOCK(&papi_counter_mutex);
}
/* Stop counting the GC events and attribute the interval to GC(1):
 * accumulate the event counts into GC1Counters via PAPI_accum(), stop the
 * GC event set and add the cycles elapsed since papi_start_gc_count() to
 * gc1_cycles. Runs under papi_counter_mutex.
 */
void
papi_stop_gc1_count(void)
{
ACQUIRE_LOCK(&papi_counter_mutex);
PAPI_CHECK( PAPI_accum(GCEvents,GC1Counters));
/* The values were already collected by PAPI_accum(), hence NULL here. */
PAPI_CHECK( PAPI_stop(GCEvents,NULL));
gc1_cycles += PAPI_cycles() - start_gc_cycles;
RELEASE_LOCK(&papi_counter_mutex);
}
/* Start counting on a caller-supplied event set. Unlike
 * papi_start_gc_count() this does not record a starting cycle count.
 * Runs under papi_counter_mutex.
 *
 * event_set: PAPI event set to start.
 */
void
papi_thread_start_gc1_count(int event_set)
{
ACQUIRE_LOCK(&papi_counter_mutex);
PAPI_CHECK( PAPI_start(event_set));
RELEASE_LOCK(&papi_counter_mutex);
}
/* Stop counting on a caller-supplied event set and accumulate its counts
 * into the shared GC1Counters array via PAPI_accum(). No cycle total is
 * updated. Runs under papi_counter_mutex.
 *
 * event_set: PAPI event set to read and stop.
 */
void
papi_thread_stop_gc1_count(int event_set)
{
ACQUIRE_LOCK(&papi_counter_mutex);
PAPI_CHECK( PAPI_accum(event_set,GC1Counters));
/* The values were already collected by PAPI_accum(), hence NULL here. */
PAPI_CHECK( PAPI_stop(event_set,NULL));
RELEASE_LOCK(&papi_counter_mutex);
}
#endif /* USE_PAPI */
|