summaryrefslogtreecommitdiff
path: root/rts
diff options
context:
space:
mode:
authorSimon Marlow <marlowsd@gmail.com>2016-10-09 18:20:53 -0400
committerBen Gamari <ben@smart-cactus.org>2016-10-09 18:21:16 -0400
commit6c47f2efa3f8f4639f375d34f54c01a60c9a1a82 (patch)
treee92395e5ef2530f9ea274e0f384426e70414b9db /rts
parent8dc72f3c33b0e724ddb690c9d494969980c10afd (diff)
downloadhaskell-6c47f2efa3f8f4639f375d34f54c01a60c9a1a82.tar.gz
Default +RTS -qn to the number of cores
Setting a -N value that is too large has a dramatic negative effect on performance, but the new -qn flag can mitigate the worst of the effects by limiting the number of GC threads.

So now, if you don't explicitly set +RTS -qn, and you set -N larger than the number of cores (or use setNumCapabilities to do the same), we'll default -qn to the number of cores.

These are the results from nofib/parallel on my 4-core (2 cores x 2 threads) i7 laptop, comparing -N8 before and after this change.

```
------------------------------------------------------------------------
        Program           Size    Allocs   Runtime   Elapsed  TotalMem
------------------------------------------------------------------------
   blackscholes          +0.0%     +0.0%    -72.5%    -72.0%     +9.5%
          coins          +0.0%     -0.0%    -73.7%    -72.2%     -0.8%
         mandel          +0.0%     +0.0%    -76.4%    -75.4%     +3.3%
        matmult          +0.0%    +15.5%    -26.8%    -33.4%     +1.0%
          nbody          +0.0%     +2.4%     +0.7%     0.076      0.0%
         parfib          +0.0%     -8.5%    -33.2%    -31.5%     +2.0%
        partree          +0.0%     -0.0%    -60.4%    -56.8%     +5.7%
           prsa          +0.0%     -0.0%    -65.4%    -60.4%      0.0%
         queens          +0.0%     +0.2%    -58.8%    -58.8%     -1.5%
            ray          +0.0%     -1.5%    -88.7%    -85.6%     -3.6%
       sumeuler          +0.0%     -0.0%    -47.8%    -46.9%      0.0%
------------------------------------------------------------------------
            Min          +0.0%     -8.5%    -88.7%    -85.6%     -3.6%
            Max          +0.0%    +15.5%     +0.7%    -31.5%     +9.5%
 Geometric Mean          +0.0%     +0.6%    -61.4%    -63.1%     +1.4%
```

Test Plan: validate, nofib/parallel benchmarks
Reviewers: niteria, ezyang, nh2, austin, erikd, trofi, bgamari
Reviewed By: trofi, bgamari
Subscribers: thomie
Differential Revision: https://phabricator.haskell.org/D2580
GHC Trac Issues: #9221
Diffstat (limited to 'rts')
-rw-r--r--rts/Schedule.c15
1 files changed, 12 insertions, 3 deletions
diff --git a/rts/Schedule.c b/rts/Schedule.c
index 611d70411f..3cbfc0eba0 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -1531,6 +1531,7 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
uint32_t gc_type;
uint32_t i;
uint32_t need_idle;
+ uint32_t n_gc_threads;
uint32_t n_idle_caps = 0, n_failed_trygrab_idles = 0;
StgTSO *tso;
rtsBool *idle_cap;
@@ -1561,9 +1562,17 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
gc_type = SYNC_GC_SEQ;
}
- if (gc_type == SYNC_GC_PAR && RtsFlags.ParFlags.parGcThreads > 0) {
- need_idle = stg_max(0, enabled_capabilities -
- RtsFlags.ParFlags.parGcThreads);
+ // If -qn is not set and we have more capabilities than cores, set the
+ // number of GC threads to #cores. We do this here rather than in
+ // normaliseRtsOpts() because here it will work if the program calls
+ // setNumCapabilities.
+ n_gc_threads = RtsFlags.ParFlags.parGcThreads;
+ if (n_gc_threads == 0 && enabled_capabilities > getNumberOfProcessors()) {
+ n_gc_threads = getNumberOfProcessors();
+ }
+
+ if (gc_type == SYNC_GC_PAR && n_gc_threads > 0) {
+ need_idle = stg_max(0, enabled_capabilities - n_gc_threads);
} else {
need_idle = 0;
}