author     Sergei Trofimovich <slyfox@gentoo.org>    2016-08-30 12:10:54 +0100
committer  Sergei Trofimovich <siarheit@google.com>  2016-08-30 12:11:40 +0100
commit     a5d26f26d33bc04f31eaff50b7d633444192b4cb (patch)
tree       7c796946c0fb6081904e05b7fbd03ffa5bd55afc
parent     9d175605e52fd0d85f2548896358d96ee441c7e4 (diff)
download   haskell-a5d26f26d33bc04f31eaff50b7d633444192b4cb.tar.gz
rts: enable parallel GC scan of large (32M+) allocation area
Parallel GC does not scan a large allocation area (-A) effectively, because it
does not steal work from the nursery by default. That leads to a large
imbalance when only one of the threads overflows its allocation area: most GC
threads finish quickly (as there is not much to collect) and sit idle while a
single GC thread finishes scanning that one oversized allocation area on its
own.

The patch enables work stealing (the equivalent of -qb0) for allocation areas
of -A32M or larger.

Tested on the highlighting-kate package from Trac #9221. On an 8-core machine
wall-clock time improves by around 5%; on a 24-core VM the speedup is 20%.

Signed-off-by: Sergei Trofimovich <siarheit@google.com>

Test Plan: measured wall time and GC parallelism on a highlighting-kate build

Reviewers: austin, bgamari, erikd, simonmar

Reviewed By: bgamari, simonmar

Subscribers: thomie

Differential Revision: https://phabricator.haskell.org/D2483

GHC Trac Issues: #9221
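To make the new default concrete, here is a minimal standalone sketch (not the
RTS source) of how the "auto" value for -qb is resolved once -A is known; it
mirrors the check added to normaliseRtsOpts() in the diff below. The helper
name sketch_default_qb and the 4096-byte block size are assumptions for
illustration; the real RTS works in terms of minAllocAreaSize blocks and its
own BLOCK_SIZE constant.

    /* Sketch: pick the load-balancing generation from the nursery size. */
    #include <stdint.h>

    #define SKETCH_BLOCK_SIZE 4096u   /* assumed block size in bytes */

    static uint32_t sketch_default_qb(uint32_t minAllocAreaBlocks)
    {
        uint64_t alloc_area_bytes =
            (uint64_t)minAllocAreaBlocks * SKETCH_BLOCK_SIZE;

        /* A nursery of 32M or more no longer fits in the L2 cache of
         * typical CPUs, so the extra synchronisation of work stealing
         * pays off: balance every generation (-qb0). Otherwise keep the
         * old default of balancing only generation 1 and up (-qb1). */
        return (alloc_area_bytes >= 32u * 1024 * 1024) ? 0 : 1;
    }

With the assumed 4 KiB blocks, -A32m corresponds to 8192 blocks, so the sketch
returns 0 (load-balance every generation), while the stock, much smaller -A
keeps the old -qb1 behaviour. An explicit -qb<n> on the command line bypasses
the auto-selection entirely, since parGcLoadBalancingGen then no longer holds
the ~0u sentinel.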
-rw-r--r--  docs/users_guide/runtime_control.rst    2
-rw-r--r--  rts/RtsFlags.c                          21
2 files changed, 20 insertions, 3 deletions
diff --git a/docs/users_guide/runtime_control.rst b/docs/users_guide/runtime_control.rst
index 1ae51ddc49..3968065865 100644
--- a/docs/users_guide/runtime_control.rst
+++ b/docs/users_guide/runtime_control.rst
@@ -449,7 +449,7 @@ performance.
.. rts-flag:: -qb <gen>
- :default: 1
+ :default: 1 for ``-A`` < 32M, 0 otherwise
:since: 6.12.1
Use load-balancing in the parallel GC in generation ⟨gen⟩ and higher.
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index e23f760f43..7e06d84d93 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -227,7 +227,7 @@ void initRtsFlagsDefaults(void)
RtsFlags.ParFlags.parGcEnabled = 1;
RtsFlags.ParFlags.parGcGen = 0;
RtsFlags.ParFlags.parGcLoadBalancingEnabled = rtsTrue;
- RtsFlags.ParFlags.parGcLoadBalancingGen = 1;
+ RtsFlags.ParFlags.parGcLoadBalancingGen = ~0u; /* auto, based on -A */
RtsFlags.ParFlags.parGcNoSyncWithIdle = 0;
RtsFlags.ParFlags.parGcThreads = 0; /* defaults to -N */
RtsFlags.ParFlags.setAffinity = 0;
@@ -393,7 +393,8 @@ usage_text[] = {
" -qg[<n>] Use parallel GC only for generations >= <n>",
" (default: 0, -qg alone turns off parallel GC)",
" -qb[<n>] Use load-balancing in the parallel GC only for generations >= <n>",
-" (default: 1, -qb alone turns off load-balancing)",
+" (default: 1 for -A < 32M, 0 otherwise;"
+" -qb alone turns off load-balancing)",
" -qn<n> Use <n> threads for parallel GC (defaults to value of -N)",
" -qa Use the OS to set thread affinity (experimental)",
" -qm Don't automatically migrate threads between CPUs",
@@ -1450,6 +1451,22 @@ static void normaliseRtsOpts (void)
errorUsage();
}
+ if (RtsFlags.ParFlags.parGcLoadBalancingGen == ~0u) {
+ StgWord alloc_area_bytes
+ = RtsFlags.GcFlags.minAllocAreaSize * BLOCK_SIZE;
+
+ // If the allocation area is larger than the CPU cache,
+ // we can finish scanning more quickly with a
+ // work-stealing scan. See Trac #9221.
+ // 32M looks big enough not to fit into the L2 cache
+ // of popular modern CPUs.
+ if (alloc_area_bytes >= 32 * 1024 * 1024) {
+ RtsFlags.ParFlags.parGcLoadBalancingGen = 0;
+ } else {
+ RtsFlags.ParFlags.parGcLoadBalancingGen = 1;
+ }
+ }
+
#ifdef THREADED_RTS
if (RtsFlags.ParFlags.parGcThreads > RtsFlags.ParFlags.nCapabilities) {
errorBelch("GC threads (-qn) must be between 1 and the value of -N");