summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSimon Marlow <marlowsd@gmail.com>2016-12-15 11:17:19 -0500
committerBen Gamari <ben@smart-cactus.org>2016-12-15 11:17:29 -0500
commit394231b301efb6b56654b0a480ab794fe3b7e4db (patch)
treeaae8fd1ee76ed8a06b9c87beb5dd2cb1aa8187e7
parent5c76f834b5b7f2ee9712d0888a8b1b186b77dee5 (diff)
downloadhaskell-394231b301efb6b56654b0a480ab794fe3b7e4db.tar.gz
Fix cost-centre-stacks bug (#5654)
This fixes some cases of wrong stacks being generated by the profiler. For background and details on the fix see `Note [Evaluating functions with profiling]` in `rts/Apply.cmm`. This does have an impact on allocations for some programs when profiling. nofib results: ``` k-nucleotide +0.0% +8.8% +11.0% +11.0% 0.0% puzzle +0.0% +12.5% 0.244 0.246 0.0% typecheck 0.0% +8.7% +16.1% +16.2% 0.0% ------------------------------------------------------------------------ -------- Min -0.0% -0.0% -34.4% -35.5% -25.0% Max +0.0% +12.5% +48.9% +49.4% +10.6% Geometric Mean +0.0% +0.6% +2.0% +1.8% -0.3% ``` But runtimes don't seem to be affected much, and the examples I looked at were completely legitimate. For example, in puzzle we have this: ``` position :: ItemType -> StateType -> BankType position Bono = bonoPos position Edge = edgePos position Larry = larryPos position Adam = adamPos ``` where the identifiers on the rhs are all record selectors. Previously the profiler gave a stack that looked like ``` position bonoPos ... ``` i.e. `bonoPos` was at the same level of the call stack as `position`, but now it looks like ``` position bonoPos ... ``` I used the normaliser from the testsuite to diff the profiling output from other nofib programs and they all looked better. Test Plan: * the broken test passes * validate * compiled and ran all of nofib, measured perf, diff'd several .prof files Reviewers: niteria, erikd, austin, scpmw, bgamari Reviewed By: bgamari Subscribers: thomie Differential Revision: https://phabricator.haskell.org/D2804 GHC Trac Issues: #5654, #10007
-rw-r--r--compiler/codeGen/StgCmmClosure.hs6
-rw-r--r--includes/Cmm.h6
-rw-r--r--rts/Apply.cmm107
-rw-r--r--testsuite/tests/profiling/should_run/T5654-O0.hs (renamed from testsuite/tests/profiling/should_run/T5654.hs)0
-rw-r--r--testsuite/tests/profiling/should_run/T5654-O0.prof.sample28
-rw-r--r--testsuite/tests/profiling/should_run/T5654-O1.hs14
-rw-r--r--testsuite/tests/profiling/should_run/T5654-O1.prof.sample27
-rw-r--r--testsuite/tests/profiling/should_run/T5654.prof.sample28
-rw-r--r--testsuite/tests/profiling/should_run/T680.prof.sample50
-rw-r--r--testsuite/tests/profiling/should_run/all.T4
10 files changed, 214 insertions, 56 deletions
diff --git a/compiler/codeGen/StgCmmClosure.hs b/compiler/codeGen/StgCmmClosure.hs
index 0ce119b0bb..1943dc4830 100644
--- a/compiler/codeGen/StgCmmClosure.hs
+++ b/compiler/codeGen/StgCmmClosure.hs
@@ -565,8 +565,10 @@ getCallMethod dflags _ id _ n_args v_args _cg_loc
getCallMethod dflags name id (LFReEntrant _ _ arity _ _) n_args _v_args _cg_loc
_self_loop_info
- | n_args == 0 = ASSERT( arity /= 0 )
- ReturnIt -- No args at all
+ | n_args == 0 -- No args at all
+ && not (gopt Opt_SccProfilingOn dflags)
+ -- See Note [Evaluating functions with profiling] in rts/Apply.cmm
+ = ASSERT( arity /= 0 ) ReturnIt
| n_args < arity = SlowCall -- Not enough args
| otherwise = DirectEntry (enterIdLabel dflags name (idCafInfo id)) arity
diff --git a/includes/Cmm.h b/includes/Cmm.h
index 0623c3eeb6..276348a6c9 100644
--- a/includes/Cmm.h
+++ b/includes/Cmm.h
@@ -288,7 +288,13 @@
// explicit jumps, for use when we are doing the stack management
// ourselves.
+#if defined(PROFILING)
+// See Note [Evaluating functions with profiling] in rts/Apply.cmm
+#define ENTER(x) jump stg_ap_0_fast(x);
+#else
#define ENTER(x) ENTER_(return,x)
+#endif
+
#define ENTER_R1() ENTER_(RET_R1,R1)
#define RET_R1(x) jump %ENTRY_CODE(Sp(0)) [R1]
diff --git a/rts/Apply.cmm b/rts/Apply.cmm
index 149a320f25..3a73ce09a4 100644
--- a/rts/Apply.cmm
+++ b/rts/Apply.cmm
@@ -30,7 +30,114 @@ stg_ap_0_fast ( P_ fun )
IF_DEBUG(sanity,
ccall checkStackFrame(Sp "ptr"));
+#if !defined(PROFILING)
+
ENTER(fun);
+
+#else
+
+/*
+ Note [Evaluating functions with profiling]
+
+ If we evaluate something like
+
+ let f = {-# SCC "f" #-} g
+
+ where g is a function, then updating the thunk for f to point to g
+ would be incorrect: we've lost the SCC annotation. In general, when
+ we evaluate a function and the current CCS is different from the one
+ stored in the function, we need to return a function with the
+ correct CCS in it.
+
+ The mechanism we use to wrap the function is to create a
+ zero-argument PAP as a proxy object to hold the new CCS, and return
+ that.
+*/
+
+again:
+ W_ info;
+ W_ untaggedfun;
+ untaggedfun = UNTAG(fun);
+ info = %INFO_PTR(untaggedfun);
+ switch [INVALID_OBJECT .. N_CLOSURE_TYPES]
+ (TO_W_( %INFO_TYPE(%STD_INFO(info)) )) {
+ case
+ IND,
+ IND_STATIC:
+ {
+ fun = StgInd_indirectee(fun);
+ goto again;
+ }
+ case
+ FUN,
+ FUN_1_0,
+ FUN_0_1,
+ FUN_2_0,
+ FUN_1_1,
+ FUN_0_2,
+ FUN_STATIC,
+ BCO:
+ {
+ if (CCCS == StgHeader_ccs(untaggedfun)) {
+ return (fun);
+ } else {
+ // We're going to build a new PAP, with zero extra
+ // arguments and therefore the same arity as the
+ // original function. In other words, we're using a
+ // zero-argument PAP as an indirection to the
+ // function, so that we can attach a different CCS to
+ // it.
+ HP_CHK_GEN(SIZEOF_StgPAP);
+ TICK_ALLOC_PAP(SIZEOF_StgPAP, 0);
+ // attribute this allocation to the "overhead of profiling"
+ CCS_ALLOC(BYTES_TO_WDS(SIZEOF_StgPAP), CCS_OVERHEAD);
+ P_ pap;
+ W_ arity;
+ pap = Hp - SIZEOF_StgPAP + WDS(1);
+ SET_HDR(pap, stg_PAP_info, CCCS);
+ arity = TO_W_(StgFunInfoExtra_arity(%FUN_INFO(info)));
+ StgPAP_arity(pap) = arity;
+ StgPAP_fun(pap) = fun;
+ StgPAP_n_args(pap) = 0;
+ return (pap);
+ }
+ }
+ case PAP:
+ {
+ if (CCCS == StgHeader_ccs(untaggedfun)) {
+ return (fun);
+ } else {
+ // We're going to copy this PAP, and put the new CCS in it
+ fun = untaggedfun;
+ W_ size;
+ size = SIZEOF_StgPAP + WDS(TO_W_(StgPAP_n_args(fun)));
+ HP_CHK_GEN(size);
+ TICK_ALLOC_PAP(size, 0);
+ // attribute this allocation to the "overhead of profiling"
+ CCS_ALLOC(BYTES_TO_WDS(SIZEOF_StgPAP), CCS_OVERHEAD);
+ P_ pap;
+ pap = Hp - size + WDS(1);
+ SET_HDR(pap, stg_PAP_info, CCCS);
+ StgPAP_arity(pap) = StgPAP_arity(fun);
+ StgPAP_n_args(pap) = StgPAP_n_args(fun);
+ StgPAP_fun(pap) = StgPAP_fun(fun);
+ W_ i;
+ i = TO_W_(StgPAP_n_args(fun));
+ loop:
+ if (i == 0) {
+ return (pap);
+ }
+ i = i - 1;
+ StgPAP_payload(pap,i) = StgPAP_payload(fun,i);
+ goto loop;
+ }
+ }
+ default:
+ {
+ jump %ENTRY_CODE(info) (UNTAG(fun));
+ }
+ }
+#endif
}
/* -----------------------------------------------------------------------------
diff --git a/testsuite/tests/profiling/should_run/T5654.hs b/testsuite/tests/profiling/should_run/T5654-O0.hs
index d7f83bf9fa..d7f83bf9fa 100644
--- a/testsuite/tests/profiling/should_run/T5654.hs
+++ b/testsuite/tests/profiling/should_run/T5654-O0.hs
diff --git a/testsuite/tests/profiling/should_run/T5654-O0.prof.sample b/testsuite/tests/profiling/should_run/T5654-O0.prof.sample
new file mode 100644
index 0000000000..2344d260f5
--- /dev/null
+++ b/testsuite/tests/profiling/should_run/T5654-O0.prof.sample
@@ -0,0 +1,28 @@
+ Thu Dec 8 11:35 2016 Time and Allocation Profiling Report (Final)
+
+ T5654-O0 +RTS -p -RTS
+
+ total time = 0.00 secs (0 ticks @ 1000 us, 1 processor)
+ total alloc = 39,184 bytes (excludes profiling overheads)
+
+COST CENTRE MODULE SRC %time %alloc
+
+MAIN MAIN <built-in> 0.0 1.0
+CAF GHC.IO.Handle.FD <entire-module> 0.0 88.3
+CAF GHC.IO.Encoding <entire-module> 0.0 7.1
+CAF GHC.Conc.Signal <entire-module> 0.0 1.6
+
+
+ individual inherited
+COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc
+
+MAIN MAIN <built-in> 104 0 0.0 1.0 0.0 100.0
+ CAF Main <entire-module> 207 0 0.0 0.9 0.0 1.4
+ f Main T5654-O0.hs:7:1-5 209 1 0.0 0.1 0.0 0.1
+ main Main T5654-O0.hs:13:1-21 208 1 0.0 0.3 0.0 0.4
+ f Main T5654-O0.hs:7:1-5 210 0 0.0 0.0 0.0 0.1
+ g Main T5654-O0.hs:11:1-11 211 1 0.0 0.1 0.0 0.1
+ CAF GHC.Conc.Signal <entire-module> 201 0 0.0 1.6 0.0 1.6
+ CAF GHC.IO.Encoding <entire-module> 191 0 0.0 7.1 0.0 7.1
+ CAF GHC.IO.Encoding.Iconv <entire-module> 189 0 0.0 0.6 0.0 0.6
+ CAF GHC.IO.Handle.FD <entire-module> 181 0 0.0 88.3 0.0 88.3
diff --git a/testsuite/tests/profiling/should_run/T5654-O1.hs b/testsuite/tests/profiling/should_run/T5654-O1.hs
new file mode 100644
index 0000000000..d7f83bf9fa
--- /dev/null
+++ b/testsuite/tests/profiling/should_run/T5654-O1.hs
@@ -0,0 +1,14 @@
+-- Tests for a bug in the handling of cost-centre stacks in the
+-- runtime, where we lose the current cost-centre stack when
+-- evaluating a function.
+
+{-# NOINLINE f #-}
+f :: Int -> Int
+f = g -- here we should remember the stack under which g was evaluated
+
+{-# NOINLINE g #-}
+g :: Int -> Int
+g x = x + 1
+
+main = return $! f 3
+
diff --git a/testsuite/tests/profiling/should_run/T5654-O1.prof.sample b/testsuite/tests/profiling/should_run/T5654-O1.prof.sample
new file mode 100644
index 0000000000..0e65631521
--- /dev/null
+++ b/testsuite/tests/profiling/should_run/T5654-O1.prof.sample
@@ -0,0 +1,27 @@
+ Thu Dec 8 11:34 2016 Time and Allocation Profiling Report (Final)
+
+ T5654-O1 +RTS -p -RTS
+
+ total time = 0.00 secs (0 ticks @ 1000 us, 1 processor)
+ total alloc = 39,064 bytes (excludes profiling overheads)
+
+COST CENTRE MODULE SRC %time %alloc
+
+MAIN MAIN <built-in> 0.0 1.9
+CAF GHC.IO.Handle.FD <entire-module> 0.0 88.6
+CAF GHC.IO.Encoding <entire-module> 0.0 7.1
+CAF GHC.Conc.Signal <entire-module> 0.0 1.6
+
+
+ individual inherited
+COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc
+
+MAIN MAIN <built-in> 104 0 0.0 1.9 0.0 100.0
+ CAF Main <entire-module> 207 0 0.0 0.0 0.0 0.2
+ main Main T5654-O1.hs:13:1-21 208 1 0.0 0.1 0.0 0.2
+ f Main T5654-O1.hs:7:1-5 209 1 0.0 0.0 0.0 0.0
+ g Main T5654-O1.hs:11:1-11 210 1 0.0 0.0 0.0 0.0
+ CAF GHC.Conc.Signal <entire-module> 201 0 0.0 1.6 0.0 1.6
+ CAF GHC.IO.Encoding <entire-module> 191 0 0.0 7.1 0.0 7.1
+ CAF GHC.IO.Encoding.Iconv <entire-module> 189 0 0.0 0.6 0.0 0.6
+ CAF GHC.IO.Handle.FD <entire-module> 181 0 0.0 88.6 0.0 88.6
diff --git a/testsuite/tests/profiling/should_run/T5654.prof.sample b/testsuite/tests/profiling/should_run/T5654.prof.sample
deleted file mode 100644
index 7d12acdaab..0000000000
--- a/testsuite/tests/profiling/should_run/T5654.prof.sample
+++ /dev/null
@@ -1,28 +0,0 @@
- Wed Nov 23 11:52 2011 Time and Allocation Profiling Report (Final)
-
- scc004 +RTS -p -RTS
-
- total time = 0.00 secs (0 ticks @ 20 ms)
- total alloc = 39,400 bytes (excludes profiling overheads)
-
-COST CENTRE MODULE %time %alloc
-
-MAIN MAIN 0.0 1.7
-CAF GHC.Conc.Signal 0.0 1.7
-CAF GHC.IO.Encoding 0.0 7.1
-CAF GHC.IO.Handle.FD 0.0 88.7
-
-
- individual inherited
-COST CENTRE MODULE no. entries %time %alloc %time %alloc
-
-MAIN MAIN 98 0 0.0 1.7 0.0 100.0
- CAF GHC.IO.Handle.FD 124 0 0.0 88.7 0.0 88.7
- CAF GHC.IO.Encoding.Iconv 116 0 0.0 0.6 0.0 0.6
- CAF GHC.IO.Encoding 114 0 0.0 7.1 0.0 7.1
- CAF GHC.Conc.Signal 106 0 0.0 1.7 0.0 1.7
- CAF Main 104 0 0.0 0.1 0.0 0.2
- f Main 197 1 0.0 0.0 0.0 0.0
- main Main 196 1 0.0 0.0 0.0 0.1
- f Main 198 0 0.0 0.0 0.0 0.0
- g Main 199 1 0.0 0.0 0.0 0.0
diff --git a/testsuite/tests/profiling/should_run/T680.prof.sample b/testsuite/tests/profiling/should_run/T680.prof.sample
index d14bad9bdd..1d52d7157d 100644
--- a/testsuite/tests/profiling/should_run/T680.prof.sample
+++ b/testsuite/tests/profiling/should_run/T680.prof.sample
@@ -1,9 +1,9 @@
- Sat Jun 4 11:59 2016 Time and Allocation Profiling Report (Final)
+ Thu Dec 8 15:23 2016 Time and Allocation Profiling Report (Final)
T680 +RTS -hc -p -RTS
total time = 0.00 secs (0 ticks @ 1000 us, 1 processor)
- total alloc = 752,952 bytes (excludes profiling overheads)
+ total alloc = 753,032 bytes (excludes profiling overheads)
COST CENTRE MODULE SRC %time %alloc
@@ -17,26 +17,26 @@ foo.bar.\ Main T680.hs:(8,11)-(9,38) 0.0 38.2
individual inherited
COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc
-MAIN MAIN <built-in> 46 0 0.0 0.1 0.0 100.0
- CAF Main <entire-module> 91 0 0.0 0.0 0.0 93.6
- k Main T680.hs:12:1-17 95 1 0.0 0.0 0.0 0.0
- foo Main T680.hs:(2,1)-(9,38) 96 1 0.0 0.0 0.0 0.0
- foo.bar Main T680.hs:(5,3)-(9,38) 100 1 0.0 0.0 0.0 0.0
- foo.bar.k' Main T680.hs:6:9-34 104 1 0.0 0.0 0.0 0.0
- k.\ Main T680.hs:12:16 105 1 0.0 0.0 0.0 0.0
- main Main T680.hs:20:1-14 92 1 0.0 0.0 0.0 0.0
- r Main T680.hs:18:1-26 94 1 0.0 0.0 0.0 93.5
- k Main T680.hs:12:1-17 97 0 0.0 0.0 0.0 93.5
- foo Main T680.hs:(2,1)-(9,38) 98 0 0.0 0.0 0.0 93.5
- foo.\ Main T680.hs:3:12-40 99 4001 0.0 25.5 0.0 25.5
- foo.bar Main T680.hs:(5,3)-(9,38) 101 0 0.0 29.8 0.0 68.0
- foo.bar.\ Main T680.hs:(8,11)-(9,38) 102 4001 0.0 38.2 0.0 38.2
- foo.bar.\.k'' Main T680.hs:8:15-27 103 4000 0.0 0.0 0.0 0.0
- foo.bar.k' Main T680.hs:6:9-34 106 0 0.0 0.0 0.0 0.0
- CAF GHC.IO.Handle.FD <entire-module> 84 0 0.0 4.6 0.0 4.6
- CAF GHC.IO.Handle.Text <entire-module> 83 0 0.0 0.0 0.0 0.0
- CAF GHC.Conc.Signal <entire-module> 80 0 0.0 0.1 0.0 0.1
- CAF GHC.IO.Encoding <entire-module> 78 0 0.0 0.4 0.0 0.4
- CAF GHC.Show <entire-module> 76 0 0.0 0.0 0.0 0.0
- CAF GHC.IO.Encoding.Iconv <entire-module> 62 0 0.0 0.0 0.0 0.0
- main Main T680.hs:20:1-14 93 0 0.0 1.2 0.0 1.2
+MAIN MAIN <built-in> 104 0 0.0 0.1 0.0 100.0
+ CAF Main <entire-module> 207 0 0.0 0.0 0.0 93.5
+ k Main T680.hs:12:1-17 211 1 0.0 0.0 0.0 0.0
+ foo Main T680.hs:(2,1)-(9,38) 212 1 0.0 0.0 0.0 0.0
+ foo.bar Main T680.hs:(5,3)-(9,38) 216 1 0.0 0.0 0.0 0.0
+ foo.bar.k' Main T680.hs:6:9-34 220 1 0.0 0.0 0.0 0.0
+ k.\ Main T680.hs:12:16 221 1 0.0 0.0 0.0 0.0
+ main Main T680.hs:20:1-14 208 1 0.0 0.0 0.0 0.0
+ r Main T680.hs:18:1-26 210 1 0.0 0.0 0.0 93.5
+ k Main T680.hs:12:1-17 213 0 0.0 0.0 0.0 93.5
+ foo Main T680.hs:(2,1)-(9,38) 214 0 0.0 0.0 0.0 93.5
+ foo.\ Main T680.hs:3:12-40 215 4001 0.0 25.5 0.0 25.5
+ foo.bar Main T680.hs:(5,3)-(9,38) 217 0 0.0 29.8 0.0 68.0
+ foo.bar.\ Main T680.hs:(8,11)-(9,38) 218 4001 0.0 38.2 0.0 38.2
+ foo.bar.\.k'' Main T680.hs:8:15-27 219 4000 0.0 0.0 0.0 0.0
+ foo.bar.k' Main T680.hs:6:9-34 222 0 0.0 0.0 0.0 0.0
+ CAF GHC.Conc.Signal <entire-module> 201 0 0.0 0.1 0.0 0.1
+ CAF GHC.IO.Encoding <entire-module> 191 0 0.0 0.4 0.0 0.4
+ CAF GHC.IO.Encoding.Iconv <entire-module> 189 0 0.0 0.0 0.0 0.0
+ CAF GHC.IO.Handle.FD <entire-module> 181 0 0.0 4.6 0.0 4.6
+ CAF GHC.IO.Handle.Text <entire-module> 179 0 0.0 0.0 0.0 0.0
+ CAF GHC.Show <entire-module> 165 0 0.0 0.0 0.0 0.0
+ main Main T680.hs:20:1-14 209 0 0.0 1.2 0.0 1.2
diff --git a/testsuite/tests/profiling/should_run/all.T b/testsuite/tests/profiling/should_run/all.T
index 151b75b858..5faca297b6 100644
--- a/testsuite/tests/profiling/should_run/all.T
+++ b/testsuite/tests/profiling/should_run/all.T
@@ -47,7 +47,9 @@ test('scc002', [], compile_and_run, [''])
test('scc003', [], compile_and_run,
['-fno-state-hack']) # Note [consistent stacks]
-test('T5654', [expect_broken(5654)], compile_and_run, [''])
+test('T5654-O0', [only_ways(['prof'])], compile_and_run, [''])
+
+test('T5654-O1', [only_ways(['profasm'])], compile_and_run, [''])
test('T5654b-O0', [only_ways(['prof'])], compile_and_run, [''])