From 6c47f2efa3f8f4639f375d34f54c01a60c9a1a82 Mon Sep 17 00:00:00 2001 From: Simon Marlow Date: Sun, 9 Oct 2016 18:20:53 -0400 Subject: Default +RTS -qn to the number of cores Setting a -N value that is too large has a dramatic negative effect on performance, but the new -qn flag can mitigate the worst of the effects by limiting the number of GC threads. So now, if you don't explicitly set +RTS -qn, and you set -N larger than the number of cores (or use setNumCapabilities to do the same), we'll default -qn to the number of cores. These are the results from nofib/parallel on my 4-core (2 cores x 2 threads) i7 laptop, comparing -N8 before and after this change. ``` ------------------------------------------------------------------------ Program Size Allocs Runtime Elapsed TotalMem ------------------------------------------------------------------------ blackscholes +0.0% +0.0% -72.5% -72.0% +9.5% coins +0.0% -0.0% -73.7% -72.2% -0.8% mandel +0.0% +0.0% -76.4% -75.4% +3.3% matmult +0.0% +15.5% -26.8% -33.4% +1.0% nbody +0.0% +2.4% +0.7% 0.076 0.0% parfib +0.0% -8.5% -33.2% -31.5% +2.0% partree +0.0% -0.0% -60.4% -56.8% +5.7% prsa +0.0% -0.0% -65.4% -60.4% 0.0% queens +0.0% +0.2% -58.8% -58.8% -1.5% ray +0.0% -1.5% -88.7% -85.6% -3.6% sumeuler +0.0% -0.0% -47.8% -46.9% 0.0% ------------------------------------------------------------------------ Min +0.0% -8.5% -88.7% -85.6% -3.6% Max +0.0% +15.5% +0.7% -31.5% +9.5% Geometric Mean +0.0% +0.6% -61.4% -63.1% +1.4% ``` Test Plan: validate, nofib/parallel benchmarks Reviewers: niteria, ezyang, nh2, austin, erikd, trofi, bgamari Reviewed By: trofi, bgamari Subscribers: thomie Differential Revision: https://phabricator.haskell.org/D2580 GHC Trac Issues: #9221 --- rts/Schedule.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'rts') diff --git a/rts/Schedule.c b/rts/Schedule.c index 611d70411f..3cbfc0eba0 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -1531,6 +1531,7 @@ scheduleDoGC
(Capability **pcap, Task *task USED_IF_THREADS, uint32_t gc_type; uint32_t i; uint32_t need_idle; + uint32_t n_gc_threads; uint32_t n_idle_caps = 0, n_failed_trygrab_idles = 0; StgTSO *tso; rtsBool *idle_cap; @@ -1561,9 +1562,17 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS, gc_type = SYNC_GC_SEQ; } - if (gc_type == SYNC_GC_PAR && RtsFlags.ParFlags.parGcThreads > 0) { - need_idle = stg_max(0, enabled_capabilities - - RtsFlags.ParFlags.parGcThreads); + // If -qn is not set and we have more capabilities than cores, set the + // number of GC threads to #cores. We do this here rather than in + // normaliseRtsOpts() because here it will work if the program calls + // setNumCapabilities. + n_gc_threads = RtsFlags.ParFlags.parGcThreads; + if (n_gc_threads == 0 && enabled_capabilities > getNumberOfProcessors()) { + n_gc_threads = getNumberOfProcessors(); + } + + if (gc_type == SYNC_GC_PAR && n_gc_threads > 0) { + need_idle = stg_max(0, enabled_capabilities - n_gc_threads); } else { need_idle = 0; } -- cgit v1.2.1