summaryrefslogtreecommitdiff
path: root/rts/Profiling.c
diff options
context:
space:
mode:
authorSimon Marlow <marlowsd@gmail.com>2011-11-28 16:48:43 +0000
committerSimon Marlow <marlowsd@gmail.com>2011-11-29 12:21:18 +0000
commit50de6034343abc93a7b01daccff34121042c0e7c (patch)
tree24496a5fc6bc39c6baaa574608e53c5d76c169f6 /rts/Profiling.c
parent1c2b838131134d44004dfdff18c302131478390d (diff)
downloadhaskell-50de6034343abc93a7b01daccff34121042c0e7c.tar.gz
Make profiling work with multiple capabilities (+RTS -N)
This means that both time and heap profiling work for parallel programs. Main internal changes: - CCCS is no longer a global variable; it is now another pseudo-register in the StgRegTable struct. Thus every Capability has its own CCCS. - There is a new built-in CCS called "IDLE", which records ticks for Capabilities in the idle state. If you profile a single-threaded program with +RTS -N2, you'll see about 50% of time in "IDLE". - There is appropriate locking in rts/Profiling.c to protect the shared cost-centre-stack data structures. This patch does enough to get it working, I have cut one big corner: the cost-centre-stack data structure is still shared amongst all Capabilities, which means that multiple Capabilities will race when updating the "allocations" and "entries" fields of a CCS. Not only does this give unpredictable results, but it runs very slowly due to cache line bouncing. It is strongly recommended that you use -fno-prof-count-entries to disable the "entries" count when profiling parallel programs. (I shall add a note to this effect to the docs).
Diffstat (limited to 'rts/Profiling.c')
-rw-r--r--rts/Profiling.c136
1 files changed, 91 insertions, 45 deletions
diff --git a/rts/Profiling.c b/rts/Profiling.c
index c393c8fa83..ac2708eda1 100644
--- a/rts/Profiling.c
+++ b/rts/Profiling.c
@@ -18,6 +18,7 @@
#include "Arena.h"
#include "RetainerProfile.h"
#include "Printer.h"
+#include "Capability.h"
#include <string.h>
@@ -51,16 +52,16 @@ FILE *prof_file;
static char *hp_filename; /* heap profile (hp2ps style) log file */
FILE *hp_file;
-/* The Current Cost Centre Stack (for attributing costs)
- */
-CostCentreStack *CCCS;
-
/* Linked lists to keep track of CCs and CCSs that haven't
* been declared in the log file yet
*/
CostCentre *CC_LIST = NULL;
CostCentreStack *CCS_LIST = NULL;
+#ifdef THREADED_RTS
+Mutex ccs_mutex;
+#endif
+
/*
* Built-in cost centres and cost-centre stacks:
*
@@ -92,6 +93,7 @@ CC_DECLARE(CC_GC, "GC", "GC", CC_NOT_CAF, );
CC_DECLARE(CC_OVERHEAD, "OVERHEAD_of", "PROFILING", CC_NOT_CAF, );
CC_DECLARE(CC_DONT_CARE, "DONT_CARE", "MAIN", CC_NOT_CAF, );
CC_DECLARE(CC_PINNED, "PINNED", "SYSTEM", CC_NOT_CAF, );
+CC_DECLARE(CC_IDLE, "IDLE", "IDLE", CC_NOT_CAF, );
CCS_DECLARE(CCS_MAIN, CC_MAIN, );
CCS_DECLARE(CCS_SYSTEM, CC_SYSTEM, );
@@ -99,6 +101,7 @@ CCS_DECLARE(CCS_GC, CC_GC, );
CCS_DECLARE(CCS_OVERHEAD, CC_OVERHEAD, );
CCS_DECLARE(CCS_DONT_CARE, CC_DONT_CARE, );
CCS_DECLARE(CCS_PINNED, CC_PINNED, );
+CCS_DECLARE(CCS_IDLE, CC_IDLE, );
/*
* Static Functions
@@ -143,7 +146,12 @@ initProfiling1 (void)
prof_arena = newArena();
/* for the benefit of allocate()... */
- CCCS = CCS_SYSTEM;
+ {
+ nat n;
+ for (n=0; n < n_capabilities; n++) {
+ capabilities[n].r.rCCCS = CCS_SYSTEM;
+ }
+ }
}
void
@@ -157,8 +165,6 @@ initProfiling2 (void)
{
CostCentreStack *ccs, *next;
- CCCS = CCS_SYSTEM;
-
/* Set up the log file, and dump the header and cost centre
* information into it.
*/
@@ -173,12 +179,14 @@ initProfiling2 (void)
REGISTER_CC(CC_OVERHEAD);
REGISTER_CC(CC_DONT_CARE);
REGISTER_CC(CC_PINNED);
+ REGISTER_CC(CC_IDLE);
REGISTER_CCS(CCS_SYSTEM);
REGISTER_CCS(CCS_GC);
REGISTER_CCS(CCS_OVERHEAD);
REGISTER_CCS(CCS_DONT_CARE);
REGISTER_CCS(CCS_PINNED);
+ REGISTER_CCS(CCS_IDLE);
REGISTER_CCS(CCS_MAIN);
/* find all the registered cost centre stacks, and make them
@@ -310,12 +318,17 @@ endProfiling ( void )
// implements c1 ++> c2, where c1 and c2 are equal depth
//
-static void enterFunEqualStacks (CostCentreStack *ccs, CostCentreStack *ccsfn)
+static CostCentreStack *
+enterFunEqualStacks (CostCentreStack *ccs0,
+ CostCentreStack *ccsapp,
+ CostCentreStack *ccsfn)
{
- ASSERT(ccs->depth == ccsfn->depth);
- if (ccs == ccsfn) return;
- enterFunEqualStacks(ccs->prevStack, ccsfn->prevStack);
- CCCS = pushCostCentre(CCCS, ccsfn->cc);
+ ASSERT(ccsapp->depth == ccsfn->depth);
+ if (ccsapp == ccsfn) return ccs0;
+ return pushCostCentre(enterFunEqualStacks(ccs0,
+ ccsapp->prevStack,
+ ccsfn->prevStack),
+ ccsfn->cc);
}
// implements c1 ++> c2, where c2 is deeper than c1.
@@ -323,21 +336,25 @@ static void enterFunEqualStacks (CostCentreStack *ccs, CostCentreStack *ccsfn)
// enterFunEqualStacks(), and then push on the elements that we
// dropped in reverse order.
//
-static void enterFunCurShorter (CostCentreStack *ccsfn, StgWord n)
+static CostCentreStack *
+enterFunCurShorter (CostCentreStack *ccsapp, CostCentreStack *ccsfn, StgWord n)
{
if (n == 0) {
- ASSERT(ccsfn->depth == CCCS->depth);
- enterFunEqualStacks(CCCS,ccsfn);
- return;
+ ASSERT(ccsfn->depth == ccsapp->depth);
+ return enterFunEqualStacks(ccsapp,ccsapp,ccsfn);;
+ } else {
+ ASSERT(ccsfn->depth > ccsapp->depth);
+ return pushCostCentre(enterFunCurShorter(ccsapp, ccsfn->prevStack, n-1),
+ ccsfn->cc);
}
- enterFunCurShorter(ccsfn->prevStack, n-1);
- CCCS = pushCostCentre(CCCS, ccsfn->cc);
}
-void enterFunCCS ( CostCentreStack *ccsfn )
+void enterFunCCS (StgRegTable *reg, CostCentreStack *ccsfn)
{
+ CostCentreStack *ccsapp;
+
// common case 1: both stacks are the same
- if (ccsfn == CCCS) {
+ if (ccsfn == reg->rCCCS) {
return;
}
@@ -346,34 +363,38 @@ void enterFunCCS ( CostCentreStack *ccsfn )
return;
}
+ ccsapp = reg->rCCCS;
+ reg->rCCCS = CCS_OVERHEAD;
+
// common case 3: the stacks are completely different (e.g. one is a
// descendent of MAIN and the other of a CAF): we append the whole
// of the function stack to the current CCS.
- if (ccsfn->root != CCCS->root) {
- CCCS = appendCCS(CCCS,ccsfn);
+ if (ccsfn->root != ccsapp->root) {
+ reg->rCCCS = appendCCS(ccsapp,ccsfn);
return;
}
- // uncommon case 4: CCCS is deeper than ccsfn
- if (CCCS->depth > ccsfn->depth) {
+ // uncommon case 4: ccsapp is deeper than ccsfn
+ if (ccsapp->depth > ccsfn->depth) {
nat i, n;
- CostCentreStack *tmp = CCCS;
- n = CCCS->depth - ccsfn->depth;
+ CostCentreStack *tmp = ccsapp;
+ n = ccsapp->depth - ccsfn->depth;
for (i = 0; i < n; i++) {
tmp = tmp->prevStack;
}
- enterFunEqualStacks(tmp,ccsfn);
+ reg->rCCCS = enterFunEqualStacks(ccsapp,tmp,ccsfn);
return;
}
// uncommon case 5: ccsfn is deeper than CCCS
- if (ccsfn->depth > CCCS->depth) {
- enterFunCurShorter(ccsfn, ccsfn->depth - CCCS->depth);
+ if (ccsfn->depth > ccsapp->depth) {
+ reg->rCCCS = enterFunCurShorter(ccsapp, ccsfn,
+ ccsfn->depth - ccsapp->depth);
return;
}
// uncommon case 6: stacks are equal depth, but different
- enterFunEqualStacks(CCCS,ccsfn);
+ reg->rCCCS = enterFunEqualStacks(ccsapp,ccsapp,ccsfn);
}
/* -----------------------------------------------------------------------------
@@ -477,20 +498,41 @@ appendCCS ( CostCentreStack *ccs1, CostCentreStack *ccs2 )
CostCentreStack *
pushCostCentre (CostCentreStack *ccs, CostCentre *cc)
{
- CostCentreStack *temp_ccs;
-
- if (ccs == EMPTY_STACK)
- return actualPush(ccs,cc);
- else {
- if (ccs->cc == cc)
+ CostCentreStack *temp_ccs, *ret;
+ IndexTable *ixtable;
+
+ if (ccs == EMPTY_STACK) {
+ ACQUIRE_LOCK(&ccs_mutex);
+ ret = actualPush(ccs,cc);
+ }
+ else
+ {
+ if (ccs->cc == cc) {
return ccs;
- else {
+ } else {
// check if we've already memoized this stack
- temp_ccs = isInIndexTable(ccs->indexTable,cc);
+ ixtable = ccs->indexTable;
+ temp_ccs = isInIndexTable(ixtable,cc);
- if (temp_ccs != EMPTY_STACK)
+ if (temp_ccs != EMPTY_STACK) {
return temp_ccs;
- else {
+ } else {
+
+ // not in the IndexTable, now we take the lock:
+ ACQUIRE_LOCK(&ccs_mutex);
+
+ if (ccs->indexTable != ixtable)
+ {
+ // someone modified ccs->indexTable while
+ // we did not hold the lock, so we must
+ // check it again:
+ temp_ccs = isInIndexTable(ixtable,cc);
+ if (temp_ccs != EMPTY_STACK)
+ {
+ RELEASE_LOCK(&ccs_mutex);
+ return temp_ccs;
+ }
+ }
temp_ccs = checkLoop(ccs,cc);
if (temp_ccs != NULL) {
// This CC is already in the stack somewhere.
@@ -510,13 +552,16 @@ pushCostCentre (CostCentreStack *ccs, CostCentre *cc)
#endif
ccs->indexTable = addToIndexTable (ccs->indexTable,
new_ccs, cc, 1);
- return new_ccs;
+ ret = new_ccs;
} else {
- return actualPush (ccs,cc);
+ ret = actualPush (ccs,cc);
}
}
}
}
+
+ RELEASE_LOCK(&ccs_mutex);
+ return ret;
}
static CostCentreStack *
@@ -801,11 +846,12 @@ reportCCSProfiling( void )
fprintf(prof_file, " %s", prog_argv[count]);
fprintf(prof_file, "\n\n");
- fprintf(prof_file, "\ttotal time = %11.2f secs (%lu ticks @ %d us)\n",
+ fprintf(prof_file, "\ttotal time = %11.2f secs (%lu ticks @ %d us, %d processor%s)\n",
((double) total_prof_ticks *
- (double) RtsFlags.MiscFlags.tickInterval) / TIME_RESOLUTION,
+ (double) RtsFlags.MiscFlags.tickInterval) / (TIME_RESOLUTION * n_capabilities),
(unsigned long) total_prof_ticks,
- (int) TimeToUS(RtsFlags.MiscFlags.tickInterval));
+ (int) TimeToUS(RtsFlags.MiscFlags.tickInterval),
+ n_capabilities, n_capabilities > 1 ? "s" : "");
fprintf(prof_file, "\ttotal alloc = %11s bytes",
showStgWord64(total_alloc * sizeof(W_),