-rw-r--r--  docs/users_guide/9.2.1-notes.rst       7
-rw-r--r--  docs/users_guide/runtime_control.rst  19
-rw-r--r--  includes/rts/Flags.h                   1
-rw-r--r--  libraries/base/GHC/RTS/Flags.hsc       2
-rw-r--r--  rts/RtsFlags.c                        22
-rw-r--r--  rts/Schedule.c                        20
-rw-r--r--  rts/sm/GC.c                           79
-rw-r--r--  rts/sm/GC.h                            1
-rw-r--r--  testsuite/tests/rts/T19381.hs         40
-rw-r--r--  testsuite/tests/rts/all.T              1
10 files changed, 172 insertions, 20 deletions
diff --git a/docs/users_guide/9.2.1-notes.rst b/docs/users_guide/9.2.1-notes.rst
index de4a983001..3b0022fb8a 100644
--- a/docs/users_guide/9.2.1-notes.rst
+++ b/docs/users_guide/9.2.1-notes.rst
@@ -150,8 +150,6 @@ Runtime system
Moreover, we now correctly account for the size of the array, meaning that
space lost to fragmentation is no longer counted as live data.
-
-
- The ``-xt`` RTS flag has been removed. Now STACK and TSO closures are always
included in heap profiles. Tooling can choose to filter out these closure types
if necessary.
@@ -162,6 +160,11 @@ Runtime system
be consumed with ``eventlog2html``. This profiling mode does not require a
profiling build.
+- The RTS will now gradually return unused memory to the OS rather than
+ retaining a large amount (up to 4 * live) indefinitely. The rate at which
+ memory is returned is controlled by the new :rts-flag:`-Fd ⟨factor⟩` flag.
+ The memory return is triggered by consecutive idle (non-overflow) collections.
+
``ghc-prim`` library
~~~~~~~~~~~~~~~~~~~~
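The behaviour described in this release note can be observed directly. Below is a minimal sketch, not part of this patch and in the spirit of the T19381 test added later in this diff; it must be run with the `+RTS -T` option so that `getRTSStats` is enabled:

    -- Sketch: watch the RTS hand memory back across idle major collections.
    -- Run with: ./Main +RTS -T -RTS
    import GHC.Stats (RTSStats (..), GCDetails (..), getRTSStats)
    import System.Mem (performMajorGC)
    import Control.Monad (forM_)

    main :: IO ()
    main = forM_ [1 :: Int .. 10] $ \i -> do
        performMajorGC  -- an idle collection, i.e. not caused by heap overflow
        s <- getRTSStats
        putStrLn $ show i ++ ": in use = "
                ++ show (gcdetails_mem_in_use_bytes (gc s))
                ++ " bytes, live = "
                ++ show (gcdetails_live_bytes (gc s)) ++ " bytes"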
diff --git a/docs/users_guide/runtime_control.rst b/docs/users_guide/runtime_control.rst
index b8da4aee01..25b27fdd1b 100644
--- a/docs/users_guide/runtime_control.rst
+++ b/docs/users_guide/runtime_control.rst
@@ -577,6 +577,25 @@ performance.
The :rts-flag:`-F ⟨factor⟩` setting will be automatically reduced by the garbage
collector when the maximum heap size (the :rts-flag:`-M ⟨size⟩` setting) is approaching.
+.. rts-flag:: -Fd ⟨factor⟩
+
+ :default: 4
+
+ .. index::
+ single: heap size, factor
+
+ The inverse rate at which unused memory is returned to the OS when it is no
+ longer needed. After a spike in allocation the RTS starts by retaining a
+ large number of allocated blocks in case it will need them again shortly,
+ but it then gradually releases them at a rate controlled by this factor. On
+ each subsequent major collection which is not caused by a heap overflow, the
+ RTS attempts to return a little more memory until the amount retained is
+ similar to the amount of live data.
+
+ Increasing this factor makes memory be returned more slowly; decreasing it
+ makes memory be returned more eagerly. Setting it to 0 disables the memory
+ return entirely, which emulates the behaviour of releases prior to 9.2.
+
.. rts-flag:: -G ⟨generations⟩
:default: 2
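The `-Fd` decay schedule documented above can be modelled in a few lines of Haskell. This is a sketch assuming the formula from Note [Scaling retained memory] in rts/sm/GC.c; `retainedFactor` is a hypothetical name:

    -- Excess factor retained on top of the unavoidable need after n
    -- consecutive idle major collections.
    retainedFactor :: Double -> Double -> Int -> Double
    retainedFactor oldGenFactor fd n
      | fd <= 0   = oldGenFactor  -- -Fd0: no decay, as in releases prior to 9.2
      | otherwise = oldGenFactor / 2 ** (fromIntegral n / fd)

    main :: IO ()
    main = mapM_ (\n -> print (n, retainedFactor 2 4 n)) [0, 4, 8, 12]
    -- With the defaults (-F2 -Fd4): (0,2.0), (4,1.0), (8,0.5), (12,0.25)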
diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h
index eda961656d..ff05426e8a 100644
--- a/includes/rts/Flags.h
+++ b/includes/rts/Flags.h
@@ -50,6 +50,7 @@ typedef struct _GC_FLAGS {
uint32_t heapSizeSuggestion; /* in *blocks* */
bool heapSizeSuggestionAuto;
double oldGenFactor;
+ double returnDecayFactor;
double pcFreeHeap;
bool useNonmoving; // default = false
diff --git a/libraries/base/GHC/RTS/Flags.hsc b/libraries/base/GHC/RTS/Flags.hsc
index 2abe5d7d85..138033758b 100644
--- a/libraries/base/GHC/RTS/Flags.hsc
+++ b/libraries/base/GHC/RTS/Flags.hsc
@@ -131,6 +131,7 @@ data GCFlags = GCFlags
, heapSizeSuggestion :: Word32
, heapSizeSuggestionAuto :: Bool
, oldGenFactor :: Double
+ , returnDecayFactor :: Double
, pcFreeHeap :: Double
, generations :: Word32
, squeezeUpdFrames :: Bool
@@ -435,6 +436,7 @@ getGCFlags = do
<*> (toBool <$>
(#{peek GC_FLAGS, heapSizeSuggestionAuto} ptr :: IO CBool))
<*> #{peek GC_FLAGS, oldGenFactor} ptr
+ <*> #{peek GC_FLAGS, returnDecayFactor} ptr
<*> #{peek GC_FLAGS, pcFreeHeap} ptr
<*> #{peek GC_FLAGS, generations} ptr
<*> (toBool <$>
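With this patch applied, the new field is readable from Haskell through the existing GHC.RTS.Flags API, for example:

    import GHC.RTS.Flags (GCFlags (..), getGCFlags)

    main :: IO ()
    main = do
        gcflags <- getGCFlags
        putStrLn $ "-F  = " ++ show (oldGenFactor gcflags)
        putStrLn $ "-Fd = " ++ show (returnDecayFactor gcflags)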
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index fa4af7f7a6..9bf3f692ab 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -164,6 +164,7 @@ void initRtsFlagsDefaults(void)
RtsFlags.GcFlags.heapSizeSuggestionAuto = false;
RtsFlags.GcFlags.pcFreeHeap = 3; /* 3% */
RtsFlags.GcFlags.oldGenFactor = 2;
+ RtsFlags.GcFlags.returnDecayFactor = 4;
RtsFlags.GcFlags.useNonmoving = false;
RtsFlags.GcFlags.nonmovingSelectorOpt = false;
RtsFlags.GcFlags.generations = 2;
@@ -324,6 +325,12 @@ usage_text[] = {
" -F<n> Sets the collecting threshold for old generations as a factor of",
" the live data in that generation the last time it was collected",
" (default: 2.0)",
+" -Fd<n> Sets the inverse rate which memory is returned to the OS after being",
+" optimistically retained after being allocated. Subsequent major",
+" collections not caused by heap overflow will return an amount of",
+" memory controlled by this factor (higher is slower). Setting the factor",
+" to 0 means memory is not returned.",
+" (default 4.0)",
" -n<size> Allocation area chunk size (0 = disabled, default: 0)",
" -O<size> Sets the minimum size of the old generation (default 1M)",
" -M<size> Sets the maximum heap size (default unlimited) Egs: -M256k -M1G",
@@ -1153,10 +1160,19 @@ error = true;
case 'F':
OPTION_UNSAFE;
- RtsFlags.GcFlags.oldGenFactor = atof(rts_argv[arg]+2);
+ switch(rts_argv[arg][2]) {
+ case 'd':
+ RtsFlags.GcFlags.returnDecayFactor = atof(rts_argv[arg]+3);
+ if (RtsFlags.GcFlags.returnDecayFactor < 0)
+ bad_option( rts_argv[arg] );
+ break;
+ default:
+ RtsFlags.GcFlags.oldGenFactor = atof(rts_argv[arg]+2);
- if (RtsFlags.GcFlags.oldGenFactor < 0)
- bad_option( rts_argv[arg] );
+ if (RtsFlags.GcFlags.oldGenFactor < 0)
+ bad_option( rts_argv[arg] );
+ break;
+ };
break;
case 'D':
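For illustration only, the `-F`/`-Fd` dispatch in the hunk above reads roughly as follows in Haskell (hypothetical helpers, not the RTS parser, which uses atof and bad_option):

    -- After "-F", a 'd' selects the decay factor, otherwise the
    -- old-generation factor; negative values are rejected as bad options.
    data FOption = OldGenFactor Double | ReturnDecayFactor Double
      deriving Show

    parseFOption :: String -> Maybe FOption
    parseFOption ('-':'F':'d':rest) = ReturnDecayFactor <$> nonNegative rest
    parseFOption ('-':'F':rest)     = OldGenFactor      <$> nonNegative rest
    parseFOption _                  = Nothing

    nonNegative :: String -> Maybe Double
    nonNegative s = case reads s of
        [(x, "")] | x >= 0 -> Just x
        _                  -> Nothing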
diff --git a/rts/Schedule.c b/rts/Schedule.c
index d9d5c9a74a..e0631482c9 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -166,7 +166,7 @@ static bool scheduleHandleThreadFinished( Capability *cap, Task *task,
StgTSO *t );
static bool scheduleNeedHeapProfile(bool ready_to_gc);
static void scheduleDoGC( Capability **pcap, Task *task,
- bool force_major, bool deadlock_detect );
+ bool force_major, bool is_overflow_gc, bool deadlock_detect );
static void deleteThread (StgTSO *tso);
static void deleteAllThreads (void);
@@ -267,7 +267,7 @@ schedule (Capability *initialCapability, Task *task)
case SCHED_INTERRUPTING:
debugTrace(DEBUG_sched, "SCHED_INTERRUPTING");
/* scheduleDoGC() deletes all the threads */
- scheduleDoGC(&cap,task,true,false);
+ scheduleDoGC(&cap,task,true,false,false);
// after scheduleDoGC(), we must be shutting down. Either some
// other Capability did the final GC, or we did it above,
@@ -576,7 +576,7 @@ run_thread:
}
if (ready_to_gc || scheduleNeedHeapProfile(ready_to_gc)) {
- scheduleDoGC(&cap,task,false,false);
+ scheduleDoGC(&cap,task,false,ready_to_gc,false);
}
} /* end of while() */
}
@@ -951,7 +951,7 @@ scheduleDetectDeadlock (Capability **pcap, Task *task)
// they are unreachable and will therefore be sent an
// exception. Any threads thus released will be immediately
// runnable.
- scheduleDoGC (pcap, task, true/*force major GC*/, true/*deadlock detection*/);
+ scheduleDoGC (pcap, task, true/*force major GC*/, false /* Whether it is an overflow GC */, true/*deadlock detection*/);
cap = *pcap;
// when force_major == true. scheduleDoGC sets
// recent_activity to ACTIVITY_DONE_GC and turns off the timer
@@ -1025,7 +1025,7 @@ scheduleProcessInbox (Capability **pcap USED_IF_THREADS)
while (!emptyInbox(cap)) {
// Executing messages might use heap, so we should check for GC.
if (doYouWantToGC(cap)) {
- scheduleDoGC(pcap, cap->running_task, false, false);
+ scheduleDoGC(pcap, cap->running_task, false, false, false);
cap = *pcap;
}
@@ -1590,7 +1590,7 @@ void releaseAllCapabilities(uint32_t n, Capability *keep_cap, Task *task)
// behind deadlock_detect argument.
static void
scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
- bool force_major, bool deadlock_detect)
+ bool force_major, bool is_overflow_gc, bool deadlock_detect)
{
Capability *cap = *pcap;
bool heap_census;
@@ -1883,9 +1883,9 @@ delete_threads_and_gc:
// emerge they don't immediately re-enter the GC.
pending_sync = 0;
signalCondition(&sync_finished_cond);
- GarbageCollect(collect_gen, heap_census, deadlock_detect, gc_type, cap, idle_cap);
+ GarbageCollect(collect_gen, heap_census, is_overflow_gc, deadlock_detect, gc_type, cap, idle_cap);
#else
- GarbageCollect(collect_gen, heap_census, deadlock_detect, 0, cap, NULL);
+ GarbageCollect(collect_gen, heap_census, is_overflow_gc, deadlock_detect, 0, cap, NULL);
#endif
// If we're shutting down, don't leave any idle GC work to do.
@@ -2773,7 +2773,7 @@ exitScheduler (bool wait_foreign USED_IF_THREADS)
nonmovingStop();
Capability *cap = task->cap;
waitForCapability(&cap,task);
- scheduleDoGC(&cap,task,true,false);
+ scheduleDoGC(&cap,task,true,false,false);
ASSERT(task->incall->tso == NULL);
releaseCapability(cap);
}
@@ -2841,7 +2841,7 @@ performGC_(bool force_major)
// TODO: do we need to traceTask*() here?
waitForCapability(&cap,task);
- scheduleDoGC(&cap,task,force_major,false);
+ scheduleDoGC(&cap,task,force_major,false,false);
releaseCapability(cap);
exitMyTask();
}
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 2911aad7a0..1a71bd7bf0 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -120,6 +120,8 @@ bool unload_mark_needed;
*/
static W_ g0_pcnt_kept = 30; // percentage of g0 live at last minor GC
+static int consec_idle_gcs = 0;
+
/* Mut-list stats */
#if defined(DEBUG)
// For lack of a better option we protect mutlist_scav_stats with oldest_gen->sync
@@ -261,6 +263,7 @@ addMutListScavStats(const MutListScavStats *src,
void
GarbageCollect (uint32_t collect_gen,
const bool do_heap_census,
+ const bool is_overflow_gc,
const bool deadlock_detect,
uint32_t gc_type USED_IF_THREADS,
Capability *cap,
@@ -981,11 +984,26 @@ GarbageCollect (uint32_t collect_gen,
}
#endif
- /* If the amount of data remains constant, next major GC we'll
- * require (F+1)*live + prealloc. We leave (F+2)*live + prealloc
- * in order to reduce repeated deallocation and reallocation. #14702
- */
- need = need_prealloc + (RtsFlags.GcFlags.oldGenFactor + 2) * need_live;
+ // Reset the counter if the major GC was caused by a heap overflow
+ consec_idle_gcs = is_overflow_gc ? 0 : consec_idle_gcs + 1;
+
+ // See Note [Scaling retained memory]
+ double scaled_factor =
+ RtsFlags.GcFlags.returnDecayFactor > 0
+ ? RtsFlags.GcFlags.oldGenFactor / pow(2, (float) consec_idle_gcs / RtsFlags.GcFlags.returnDecayFactor)
+ : RtsFlags.GcFlags.oldGenFactor;
+
+ debugTrace(DEBUG_gc, "factors: %f %d %f", RtsFlags.GcFlags.oldGenFactor, consec_idle_gcs, scaled_factor );
+
+ // Unavoidable need depends on the GC strategy:
+ // * Copying needs 2 * live
+ // * Compacting needs 1.x * live (we choose 1.2)
+ // * Nonmoving needs ~ 1.x * live
+ double unavoidable_need_factor = (oldest_gen->compact || RtsFlags.GcFlags.useNonmoving)
+ ? 1.2 : 2;
+ W_ scaled_needed = (scaled_factor + unavoidable_need_factor) * need_live;
+ debugTrace(DEBUG_gc, "factors_2: %f %d", unavoidable_need_factor, scaled_needed);
+ need = need_prealloc + scaled_needed;
/* Also, if user set heap size, do not drop below it.
*/
@@ -1003,6 +1021,7 @@ GarbageCollect (uint32_t collect_gen,
need = BLOCKS_TO_MBLOCKS(need);
got = mblocks_allocated;
+ debugTrace(DEBUG_gc,"Returning: %d %d", got, need);
uint32_t returned = 0;
if (got > need) {
@@ -2208,3 +2227,53 @@ bool doIdleGCWork(Capability *cap STG_UNUSED, bool all)
* work_stealing is "mostly immutable". We set it to false when we begin the
* final sequential collections, for the benefit of notifyTodoBlock.
* */
+
+/* Note [Scaling retained memory]
+ * Tickets: #19381 #19359 #14702
+ *
+ * After a spike in memory usage we have been conservative about returning
+ * allocated blocks to the OS in case we are still allocating a lot and would
+ * end up just reallocating them. The result of this was that up to 4 * live_bytes
+ * of blocks would be retained once they were allocated even if memory usage ended up
+ * a lot lower.
+ *
+ * For a heap of size ~1.5G, this would result in OS memory reporting 6G which is
+ * both misleading and worrying for users.
+ * In long-lived server applications this results in consistently high memory
+ * usage even when the live data size is much more reasonable (for example ghcide).
+ *
+ * Therefore we have a new (2021) strategy which starts by retaining up to 4 * live_bytes
+ * of blocks before gradually returning unneeded memory back to the OS on subsequent
+ * major GCs which are NOT caused by a heap overflow.
+ *
+ * Each major GC which is NOT caused by heap overflow increases the consec_idle_gcs
+ * counter and the amount of memory which is retained is inversely proportional to this number.
+ * By default the excess memory retained is
+ *  oldGenFactor (controlled by -F) / 2 ^ (consec_idle_gcs / returnDecayFactor)
+ *
+ * On a major GC caused by a heap overflow, the `consec_idle_gcs` variable is reset to 0
+ * (as we could continue to allocate more, so retaining all the memory might make sense).
+ *
+ * Therefore setting bigger values for `-Fd` makes the rate at which memory is
+ * returned slower, and smaller values make it be returned faster. Setting `-Fd0`
+ * disables the memory return completely, emulating the behaviour of releases prior to 9.2.
+ *
+ * The default is `-Fd4` which results in the following scaling:
+ *
+ * > mapM print [(x, 1/ (2**(x / 4))) | x <- [1 :: Double ..20]]
+ * (1.0,0.8408964152537146)
+ * ...
+ * (4.0,0.5)
+ * ...
+ * (12.0,0.125)
+ * ...
+ * (20.0,3.125e-2)
+ *
+ * So after 12 consecutive idle GCs only 0.125 of the excess memory (the -F factor) will still be retained.
+ *
+ * Further to this decay factor, the amount of memory we attempt to retain is
+ * also influenced by the GC strategy for the oldest generation. If we are using
+ * a copying strategy then we will need at least 2 * live_bytes for copying to take
+ * place, so we always keep that much. If using compacting or nonmoving then we need a lower number,
+ * so we just retain at least `1.2 * live_bytes` for some protection.
+ */
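The full retention calculation from the hunk above, combined with the decay in this Note, can be transcribed into Haskell as a reference sketch of the same arithmetic (blocksToRetain is a hypothetical name; the RTS works in blocks):

    blocksToRetain
        :: Double  -- ^ oldGenFactor, the -F setting
        -> Double  -- ^ returnDecayFactor, the -Fd setting
        -> Int     -- ^ consecutive idle (non-overflow) major GCs
        -> Bool    -- ^ is the oldest generation compacting or nonmoving?
        -> Double  -- ^ live data (need_live, in blocks)
        -> Double  -- ^ preallocated blocks (need_prealloc)
        -> Double
    blocksToRetain f fd consecIdleGcs compactOrNonmoving needLive needPrealloc =
        needPrealloc + (scaledFactor + unavoidableNeedFactor) * needLive
      where
        -- the -Fd decay: the excess halves every fd idle collections
        scaledFactor
          | fd > 0    = f / 2 ** (fromIntegral consecIdleGcs / fd)
          | otherwise = f
        -- copying GC needs 2 * live to collect; compacting/nonmoving ~1.2 * live
        unavoidableNeedFactor
          | compactOrNonmoving = 1.2
          | otherwise          = 2

With the defaults (-F2 -Fd4, copying GC) and no preallocation this starts at 4 * live right after an overflow GC and decays towards 2 * live over subsequent idle collections.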
diff --git a/rts/sm/GC.h b/rts/sm/GC.h
index 239f281910..da90c61302 100644
--- a/rts/sm/GC.h
+++ b/rts/sm/GC.h
@@ -19,6 +19,7 @@
void GarbageCollect (uint32_t collect_gen,
bool do_heap_census,
+ bool is_overflow_gc,
bool deadlock_detect,
uint32_t gc_type,
Capability *cap,
diff --git a/testsuite/tests/rts/T19381.hs b/testsuite/tests/rts/T19381.hs
new file mode 100644
index 0000000000..1b65e06ac9
--- /dev/null
+++ b/testsuite/tests/rts/T19381.hs
@@ -0,0 +1,40 @@
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+{-# LANGUAGE BangPatterns #-}
+module Main where
+
+import GHC.Exts
+import GHC.IO
+import GHC.Stats
+import System.Mem
+import Control.Monad
+
+data BA = BA ByteArray#
+
+mblockSize = 2 ^ 20
+
+main = do
+ -- Allocate 1000 byte arrays, to get a high watermark before only keeping
+ -- 100 of them.
+ ba <- take 100 <$> replicateM 1000 mkBA
+ let !n = (length ba)
+ -- Each major GC should free some amount of memory, 100 is just a large
+ -- number
+ replicateM_ 100 performMajorGC
+ s <- getRTSStats
+ let mblocks = (gcdetails_mem_in_use_bytes (gc s) `div` mblockSize)
+ live = (gcdetails_live_bytes (gc s) `div` mblockSize)
+ if fromIntegral mblocks < (2.2 * fromIntegral live)
+ then return ()
+ else error ("Additional memory is retained: "
+ ++ show live ++ "/"
+ ++ show mblocks)
+ -- Here to retain the ba
+ (length ba) `seq` return ()
+
+mkBA =
+ let (I# siz) = 2^19 -- ~0.5MB
+ in IO $ \s0 ->
+ case newByteArray# siz s0 of
+ (# s1, mba #) -> case unsafeFreezeByteArray# mba s1 of
+ (# s2, ba #) -> (# s2, BA ba #)
diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T
index 9f2a54cd0f..7100aaf3d7 100644
--- a/testsuite/tests/rts/all.T
+++ b/testsuite/tests/rts/all.T
@@ -425,3 +425,4 @@ test('T17088',
test('T15427', normal, compile_and_run, [''])
test('T19481', extra_run_opts('+RTS -T -RTS'), compile_and_run, [''])
+test('T19381', extra_run_opts('+RTS -T -RTS'), compile_and_run, [''])