diff options
-rw-r--r-- | includes/stg/MiscClosures.h | 1 | ||||
-rw-r--r-- | rts/Apply.cmm | 27 | ||||
-rw-r--r-- | rts/Interpreter.c | 72 | ||||
-rw-r--r-- | rts/Printer.c | 5 | ||||
-rw-r--r-- | rts/Profiling.c | 6 | ||||
-rw-r--r-- | rts/StgMiscClosures.cmm | 10 | ||||
-rw-r--r-- | testsuite/tests/codeGen/should_run/cgrun057.stderr | 2 | ||||
-rw-r--r-- | testsuite/tests/profiling/should_run/T680.prof.sample | 65 | ||||
-rw-r--r-- | testsuite/tests/profiling/should_run/all.T | 3 | ||||
-rw-r--r-- | testsuite/tests/profiling/should_run/toplevel_scc_1.prof.sample | 41 |
10 files changed, 170 insertions, 62 deletions
diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h index b3f9a69472..994b369c46 100644 --- a/includes/stg/MiscClosures.h +++ b/includes/stg/MiscClosures.h @@ -63,6 +63,7 @@ RTS_RET(stg_maskUninterruptiblezh_ret); RTS_RET(stg_maskAsyncExceptionszh_ret); RTS_RET(stg_stack_underflow_frame); RTS_RET(stg_restore_cccs); +RTS_RET(stg_restore_cccs_eval); // RTS_FUN(stg_interp_constr1_entry); // RTS_FUN(stg_interp_constr2_entry); diff --git a/rts/Apply.cmm b/rts/Apply.cmm index b18c347d40..b3a04ca58c 100644 --- a/rts/Apply.cmm +++ b/rts/Apply.cmm @@ -52,6 +52,10 @@ stg_ap_0_fast ( P_ fun ) The mechanism we use to wrap the function is to create a zero-argument PAP as a proxy object to hold the new CCS, and return that. + + If the closure we evaluated is itself a PAP, we cannot make a nested + PAP, so we copy the original PAP and set the CCS in the new PAP to + enterFunCCS(pap->header.prof.ccs). */ again: @@ -122,6 +126,8 @@ again: CCS_ALLOC(BYTES_TO_WDS(SIZEOF_StgPAP), CCS_OVERHEAD); P_ pap; pap = Hp - size + WDS(1); + // We'll lose the original PAP, so we should enter its CCS + ccall enterFunCCS(BaseReg "ptr", StgHeader_ccs(fun) "ptr"); SET_HDR(pap, stg_PAP_info, CCCS); StgPAP_arity(pap) = StgPAP_arity(fun); StgPAP_n_args(pap) = StgPAP_n_args(fun); @@ -137,6 +143,27 @@ again: goto loop; } } + case AP, + AP_STACK, + BLACKHOLE, + WHITEHOLE, + THUNK, + THUNK_1_0, + THUNK_0_1, + THUNK_2_0, + THUNK_1_1, + THUNK_0_2, + THUNK_STATIC, + THUNK_SELECTOR: + { + // The thunk might evaluate to a function, so we have to come + // back here again to adjust its CCS if necessary. The + // stg_restore_ccs_eval stack frame does that. + STK_CHK_GEN(); + jump %ENTRY_CODE(info) + (stg_restore_cccs_eval_info, CCCS) + (UNTAG(fun)); + } default: { jump %ENTRY_CODE(info) (UNTAG(fun)); diff --git a/rts/Interpreter.c b/rts/Interpreter.c index 7221ff64f9..5a395670b7 100644 --- a/rts/Interpreter.c +++ b/rts/Interpreter.c @@ -214,6 +214,48 @@ void interp_shutdown ( void ) #endif +#ifdef PROFILING + +// +// Build a zero-argument PAP with the current CCS +// See Note [Evaluating functions with profiling] in Apply.cmm +// +STATIC_INLINE +StgClosure * newEmptyPAP (Capability *cap, + StgClosure *tagged_obj, // a FUN or a BCO + uint32_t arity) +{ + StgPAP *pap = (StgPAP *)allocate(cap, sizeofW(StgPAP)); + SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS); + pap->arity = arity; + pap->n_args = 0; + pap->fun = tagged_obj; + return (StgClosure *)pap; +} + +// +// Make an exact copy of a PAP, except that we combine the current CCS with the +// CCS in the PAP. See Note [Evaluating functions with profiling] in Apply.cmm +// +STATIC_INLINE +StgClosure * copyPAP (Capability *cap, StgPAP *oldpap) +{ + uint32_t size = PAP_sizeW(oldpap->n_args); + StgPAP *pap = (StgPAP *)allocate(cap, size); + enterFunCCS(&cap->r, oldpap->header.prof.ccs); + SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS); + pap->arity = oldpap->arity; + pap->n_args = oldpap->n_args; + pap->fun = oldpap->fun; + uint32_t i; + for (i = 0; i < ((StgPAP *)pap)->n_args; i++) { + pap->payload[i] = oldpap->payload[i]; + } + return (StgClosure *)pap; +} + +#endif + static StgWord app_ptrs_itbl[] = { (W_)&stg_ap_p_info, (W_)&stg_ap_pp_info, @@ -343,6 +385,8 @@ eval_obj: case CONSTR_1_1: case CONSTR_0_2: case CONSTR_NOCAF: + break; + case FUN: case FUN_1_0: case FUN_0_1: @@ -350,15 +394,30 @@ eval_obj: case FUN_1_1: case FUN_0_2: case FUN_STATIC: +#ifdef PROFILING + if (cap->r.rCCCS != obj->header.prof.ccs) { + tagged_obj = + newEmptyPAP(cap, tagged_obj, get_fun_itbl(obj)->f.arity); + } +#endif + break; + case PAP: - // already in WHNF +#ifdef PROFILING + if (cap->r.rCCCS != obj->header.prof.ccs) { + tagged_obj = copyPAP(cap, (StgPAP *)obj); + } +#endif break; case BCO: - { ASSERT(((StgBCO *)obj)->arity > 0); +#ifdef PROFILING + if (cap->r.rCCCS != obj->header.prof.ccs) { + tagged_obj = newEmptyPAP(cap, tagged_obj, ((StgBCO *)obj)->arity); + } +#endif break; - } case AP: /* Copied from stg_AP_entry. */ { @@ -380,7 +439,7 @@ eval_obj: // restore the CCCS after evaluating the AP Sp -= 2; Sp[1] = (W_)cap->r.rCCCS; - Sp[0] = (W_)&stg_restore_cccs_info; + Sp[0] = (W_)&stg_restore_cccs_eval_info; #endif Sp -= sizeofW(StgUpdateFrame); @@ -425,7 +484,7 @@ eval_obj: // restore the CCCS after evaluating the closure Sp -= 2; Sp[1] = (W_)cap->r.rCCCS; - Sp[0] = (W_)&stg_restore_cccs_info; + Sp[0] = (W_)&stg_restore_cccs_eval_info; #endif Sp -= 2; Sp[1] = (W_)tagged_obj; @@ -465,7 +524,8 @@ do_return: // NOTE: not using get_itbl(). info = ((StgClosure *)Sp)->header.info; - if (info == (StgInfoTable *)&stg_restore_cccs_info) { + if (info == (StgInfoTable *)&stg_restore_cccs_info || + info == (StgInfoTable *)&stg_restore_cccs_eval_info) { cap->r.rCCCS = (CostCentreStack*)Sp[1]; Sp += 2; goto do_return; diff --git a/rts/Printer.c b/rts/Printer.c index f23e0b0636..87b11e80d8 100644 --- a/rts/Printer.c +++ b/rts/Printer.c @@ -526,6 +526,11 @@ printStackChunk( StgPtr sp, StgPtr spBottom ) fprintCCS(stderr, (CostCentreStack*)sp[1]); debugBelch("\n" ); continue; + } else if (c == (StgWord)&stg_restore_cccs_eval_info) { + debugBelch("stg_restore_cccs_eval_info\n" ); + fprintCCS(stderr, (CostCentreStack*)sp[1]); + debugBelch("\n" ); + continue; #endif } else { debugBelch("RET_SMALL (%p)\n", info); diff --git a/rts/Profiling.c b/rts/Profiling.c index 952785be18..94ec55582b 100644 --- a/rts/Profiling.c +++ b/rts/Profiling.c @@ -236,6 +236,10 @@ CostCentre *mkCostCentre (char *label, char *module, char *srcloc) cc->label = label; cc->module = module; cc->srcloc = srcloc; + cc->is_caf = 0; + cc->mem_alloc = 0; + cc->time_ticks = 0; + cc->link = NULL; return cc; } @@ -379,7 +383,7 @@ void enterFunCCS (StgRegTable *reg, CostCentreStack *ccsfn) } // common case 2: the function stack is empty, or just CAF - if (ccsfn->prevStack == CCS_MAIN) { + if (ccsfn->cc->is_caf) { return; } diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm index aa22c99be4..e8a5b8fed5 100644 --- a/rts/StgMiscClosures.cmm +++ b/rts/StgMiscClosures.cmm @@ -55,6 +55,16 @@ INFO_TABLE_RET (stg_restore_cccs, RET_SMALL, W_ info_ptr, W_ cccs) jump %ENTRY_CODE(Sp(0)) [*]; // NB. all registers live! } + +INFO_TABLE_RET (stg_restore_cccs_eval, RET_SMALL, W_ info_ptr, W_ cccs) + return (P_ ret) +{ +#if defined(PROFILING) + CCCS = cccs; +#endif + jump stg_ap_0_fast(ret); +} + /* ---------------------------------------------------------------------------- Support for the bytecode interpreter. ------------------------------------------------------------------------- */ diff --git a/testsuite/tests/codeGen/should_run/cgrun057.stderr b/testsuite/tests/codeGen/should_run/cgrun057.stderr index 262d74912d..5d1656d25d 100644 --- a/testsuite/tests/codeGen/should_run/cgrun057.stderr +++ b/testsuite/tests/codeGen/should_run/cgrun057.stderr @@ -1,4 +1,4 @@ -*** Exception (reporting due to +RTS -xc): (THUNK_2_0), stack trace: +*** Exception (reporting due to +RTS -xc): (THUNK_STATIC), stack trace: Main.g, called from Main.f, called from Main.main, diff --git a/testsuite/tests/profiling/should_run/T680.prof.sample b/testsuite/tests/profiling/should_run/T680.prof.sample index 1d52d7157d..8875bbd376 100644 --- a/testsuite/tests/profiling/should_run/T680.prof.sample +++ b/testsuite/tests/profiling/should_run/T680.prof.sample @@ -1,42 +1,41 @@ - Thu Dec 8 15:23 2016 Time and Allocation Profiling Report (Final) + Tue Dec 20 13:18 2016 Time and Allocation Profiling Report (Final) - T680 +RTS -hc -p -RTS + T680 +RTS -p -RTS - total time = 0.00 secs (0 ticks @ 1000 us, 1 processor) - total alloc = 753,032 bytes (excludes profiling overheads) + total time = 0.20 secs (204 ticks @ 1000 us, 1 processor) + total alloc = 449,729,208 bytes (excludes profiling overheads) -COST CENTRE MODULE SRC %time %alloc +COST CENTRE MODULE SRC %time %alloc -CAF GHC.IO.Handle.FD <entire-module> 0.0 4.6 -main Main T680.hs:20:1-14 0.0 1.2 -foo.\ Main T680.hs:3:12-40 0.0 25.5 -foo.bar Main T680.hs:(5,3)-(9,38) 0.0 29.8 -foo.bar.\ Main T680.hs:(8,11)-(9,38) 0.0 38.2 +foo.\ Main T680.hs:3:12-40 98.5 99.8 individual inherited COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc -MAIN MAIN <built-in> 104 0 0.0 0.1 0.0 100.0 - CAF Main <entire-module> 207 0 0.0 0.0 0.0 93.5 - k Main T680.hs:12:1-17 211 1 0.0 0.0 0.0 0.0 - foo Main T680.hs:(2,1)-(9,38) 212 1 0.0 0.0 0.0 0.0 - foo.bar Main T680.hs:(5,3)-(9,38) 216 1 0.0 0.0 0.0 0.0 - foo.bar.k' Main T680.hs:6:9-34 220 1 0.0 0.0 0.0 0.0 - k.\ Main T680.hs:12:16 221 1 0.0 0.0 0.0 0.0 - main Main T680.hs:20:1-14 208 1 0.0 0.0 0.0 0.0 - r Main T680.hs:18:1-26 210 1 0.0 0.0 0.0 93.5 - k Main T680.hs:12:1-17 213 0 0.0 0.0 0.0 93.5 - foo Main T680.hs:(2,1)-(9,38) 214 0 0.0 0.0 0.0 93.5 - foo.\ Main T680.hs:3:12-40 215 4001 0.0 25.5 0.0 25.5 - foo.bar Main T680.hs:(5,3)-(9,38) 217 0 0.0 29.8 0.0 68.0 - foo.bar.\ Main T680.hs:(8,11)-(9,38) 218 4001 0.0 38.2 0.0 38.2 - foo.bar.\.k'' Main T680.hs:8:15-27 219 4000 0.0 0.0 0.0 0.0 - foo.bar.k' Main T680.hs:6:9-34 222 0 0.0 0.0 0.0 0.0 - CAF GHC.Conc.Signal <entire-module> 201 0 0.0 0.1 0.0 0.1 - CAF GHC.IO.Encoding <entire-module> 191 0 0.0 0.4 0.0 0.4 - CAF GHC.IO.Encoding.Iconv <entire-module> 189 0 0.0 0.0 0.0 0.0 - CAF GHC.IO.Handle.FD <entire-module> 181 0 0.0 4.6 0.0 4.6 - CAF GHC.IO.Handle.Text <entire-module> 179 0 0.0 0.0 0.0 0.0 - CAF GHC.Show <entire-module> 165 0 0.0 0.0 0.0 0.0 - main Main T680.hs:20:1-14 209 0 0.0 1.2 0.0 1.2 +MAIN MAIN <built-in> 108 0 0.0 0.0 100.0 100.0 + CAF Main <entire-module> 215 0 0.0 0.0 100.0 100.0 + k Main T680.hs:12:1-17 219 1 0.0 0.0 0.0 0.0 + foo Main T680.hs:(2,1)-(9,38) 220 1 0.0 0.0 0.0 0.0 + foo.bar Main T680.hs:(5,3)-(9,38) 224 1 0.0 0.0 0.0 0.0 + foo.bar.k' Main T680.hs:6:9-34 228 1 0.0 0.0 0.0 0.0 + k.\ Main T680.hs:12:16 229 1 0.0 0.0 0.0 0.0 + main Main T680.hs:20:1-14 216 1 0.0 0.0 0.0 0.0 + r Main T680.hs:18:1-26 218 1 0.0 0.0 100.0 100.0 + k Main T680.hs:12:1-17 221 0 0.0 0.0 100.0 100.0 + foo Main T680.hs:(2,1)-(9,38) 222 0 0.0 0.0 100.0 100.0 + foo.\ Main T680.hs:3:12-40 223 4001 98.5 99.8 98.5 99.8 + foo.bar Main T680.hs:(5,3)-(9,38) 225 0 0.5 0.0 1.5 0.2 + foo.bar.\ Main T680.hs:(8,11)-(9,38) 226 4001 1.0 0.1 1.0 0.1 + foo.bar.\.k'' Main T680.hs:8:15-27 227 4000 0.0 0.0 0.0 0.0 + foo.bar.k' Main T680.hs:6:9-34 232 0 0.0 0.0 0.0 0.0 + k.\ Main T680.hs:12:16 233 0 0.0 0.0 0.0 0.0 + foo.bar.k' Main T680.hs:6:9-34 230 0 0.0 0.0 0.0 0.0 + k.\ Main T680.hs:12:16 231 0 0.0 0.0 0.0 0.0 + CAF GHC.Conc.Signal <entire-module> 206 0 0.0 0.0 0.0 0.0 + CAF GHC.IO.Encoding <entire-module> 194 0 0.0 0.0 0.0 0.0 + CAF GHC.IO.Encoding.Iconv <entire-module> 192 0 0.0 0.0 0.0 0.0 + CAF GHC.IO.Handle.FD <entire-module> 184 0 0.0 0.0 0.0 0.0 + CAF GHC.IO.Handle.Text <entire-module> 182 0 0.0 0.0 0.0 0.0 + CAF GHC.Show <entire-module> 168 0 0.0 0.0 0.0 0.0 + main Main T680.hs:20:1-14 217 0 0.0 0.0 0.0 0.0 diff --git a/testsuite/tests/profiling/should_run/all.T b/testsuite/tests/profiling/should_run/all.T index 875a98e2dd..d85a2b8e6a 100644 --- a/testsuite/tests/profiling/should_run/all.T +++ b/testsuite/tests/profiling/should_run/all.T @@ -97,8 +97,7 @@ test('T5559', [], compile_and_run, ['']) test('callstack001', # unoptimised results are different w.r.t. CAF attribution - [ expect_broken_for_10037, - omit_ways(['ghci-ext-prof']), # produces a different stack + [ omit_ways(['ghci-ext-prof']), # produces a different stack ], compile_and_run, ['-fprof-auto-calls -fno-full-laziness -fno-state-hack']) diff --git a/testsuite/tests/profiling/should_run/toplevel_scc_1.prof.sample b/testsuite/tests/profiling/should_run/toplevel_scc_1.prof.sample index 4eec28da28..3464946624 100644 --- a/testsuite/tests/profiling/should_run/toplevel_scc_1.prof.sample +++ b/testsuite/tests/profiling/should_run/toplevel_scc_1.prof.sample @@ -1,30 +1,33 @@ - Tue Jul 19 08:36 2016 Time and Allocation Profiling Report (Final) + Tue Dec 20 14:22 2016 Time and Allocation Profiling Report (Final) toplevel_scc_1 +RTS -p -RTS total time = 0.00 secs (0 ticks @ 1000 us, 1 processor) - total alloc = 79,792 bytes (excludes profiling overheads) + total alloc = 75,880 bytes (excludes profiling overheads) COST CENTRE MODULE SRC %time %alloc +MAIN MAIN <built-in> 0.0 24.0 CAF GHC.Read <entire-module> 0.0 1.2 -CAF GHC.IO.Handle.FD <entire-module> 0.0 64.9 -CAF GHC.IO.Encoding <entire-module> 0.0 3.5 -CAF Main <entire-module> 0.0 27.6 +CAF GHC.IO.Handle.FD <entire-module> 0.0 68.3 +CAF GHC.IO.Encoding <entire-module> 0.0 3.6 - individual inherited -COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc + individual inherited +COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc -MAIN MAIN <built-in> 105 0 0.0 0.4 0.0 100.0 - CAF Main <entire-module> 209 0 0.0 27.6 0.0 27.9 - Main.f1 Main toplevel_scc_1.hs:4:1-2 212 1 0.0 0.0 0.0 0.0 - Main.foo Main toplevel_scc_1.hs:7:1-2 210 1 0.0 0.2 0.0 0.3 - Main.bar Main toplevel_scc_1.hs:10:5-6 211 1 0.0 0.1 0.0 0.1 - CAF GHC.Conc.Signal <entire-module> 203 0 0.0 0.8 0.0 0.8 - CAF GHC.IO.Encoding <entire-module> 193 0 0.0 3.5 0.0 3.5 - CAF GHC.IO.Encoding.Iconv <entire-module> 191 0 0.0 0.3 0.0 0.3 - CAF GHC.IO.Handle.FD <entire-module> 183 0 0.0 64.9 0.0 64.9 - CAF GHC.IO.Handle.Text <entire-module> 181 0 0.0 0.1 0.0 0.1 - CAF GHC.Read <entire-module> 171 0 0.0 1.2 0.0 1.2 - CAF Text.Read.Lex <entire-module> 154 0 0.0 0.8 0.0 0.8 +MAIN MAIN <built-in> 105 0 0.0 24.0 0.0 100.0 + CAF Main <entire-module> 209 0 0.0 0.3 0.0 0.5 + Main.f1 Main toplevel_scc_1.hs:4:1-2 214 1 0.0 0.0 0.0 0.0 + Main.foo Main toplevel_scc_1.hs:7:1-2 210 1 0.0 0.2 0.0 0.2 + Main.bar Main toplevel_scc_1.hs:10:5-6 212 1 0.0 0.0 0.0 0.0 + CAF GHC.Conc.Signal <entire-module> 203 0 0.0 0.8 0.0 0.8 + CAF GHC.IO.Encoding <entire-module> 193 0 0.0 3.6 0.0 3.6 + CAF GHC.IO.Encoding.Iconv <entire-module> 191 0 0.0 0.3 0.0 0.3 + CAF GHC.IO.Handle.FD <entire-module> 183 0 0.0 68.3 0.0 68.3 + CAF GHC.IO.Handle.Text <entire-module> 181 0 0.0 0.1 0.0 0.1 + CAF GHC.Read <entire-module> 171 0 0.0 1.2 0.0 1.2 + CAF Text.Read.Lex <entire-module> 154 0 0.0 0.9 0.0 0.9 + Main.f1 Main toplevel_scc_1.hs:4:1-2 215 0 0.0 0.0 0.0 0.0 + Main.foo Main toplevel_scc_1.hs:7:1-2 211 0 0.0 0.1 0.0 0.1 + Main.bar Main toplevel_scc_1.hs:10:5-6 213 0 0.0 0.1 0.0 0.1 |