From 9ca51f9e84abc41ba590203d8bc8df8d6af86db2 Mon Sep 17 00:00:00 2001 From: Cheng Shao Date: Sat, 11 Feb 2023 17:24:02 +0000 Subject: rts: add the rts_clearMemory function This patch adds the rts_clearMemory function that does its best to zero out unused RTS memory for a wasm backend use case. See the comment above rts_clearMemory() prototype declaration for more detailed explanation. Closes #22920. --- rts/RtsSymbols.c | 1 + rts/include/RtsAPI.h | 45 +++++++++++++++++++++++++++++++ rts/sm/BlockAlloc.c | 14 ++++++++++ rts/sm/BlockAlloc.h | 2 ++ rts/sm/NonMoving.h | 4 +++ rts/sm/NonMovingSweep.c | 8 +++--- rts/sm/Storage.c | 43 +++++++++++++++++++++++++++++ rts/sm/Storage.h | 4 +++ testsuite/tests/ffi/should_run/ffi023_c.c | 1 + 9 files changed, 118 insertions(+), 4 deletions(-) diff --git a/rts/RtsSymbols.c b/rts/RtsSymbols.c index a95a1b1231..974f2dbd40 100644 --- a/rts/RtsSymbols.c +++ b/rts/RtsSymbols.c @@ -925,6 +925,7 @@ extern char **environ; SymI_HasProto(newArena) \ SymI_HasProto(arenaAlloc) \ SymI_HasProto(arenaFree) \ + SymI_HasProto(rts_clearMemory) \ RTS_USER_SIGNALS_SYMBOLS \ RTS_INTCHAR_SYMBOLS diff --git a/rts/include/RtsAPI.h b/rts/include/RtsAPI.h index 3bede10069..63a0f99fee 100644 --- a/rts/include/RtsAPI.h +++ b/rts/include/RtsAPI.h @@ -599,6 +599,51 @@ extern StgWord base_GHCziTopHandler_runNonIO_closure[]; /* ------------------------------------------------------------------------ */ +// This is a public RTS API function that does its best to zero out +// unused RTS memory. rts_clearMemory() takes the storage manager +// lock. It's only safe to call rts_clearMemory() when all mutators +// have stopped and either minor/major garbage collection has just +// been run. +// +// rts_clearMemory() works for all RTS ways on all platforms, though +// the main intended use case is the pre-initialization of a +// wasm32-wasi reactor module (#22920). A reactor module is like +// shared library on other platforms, with foreign exported Haskell +// functions as entrypoints. At run-time, the user calls hs_init_ghc() +// to initialize the RTS, after that they can invoke Haskell +// computation by calling the exported Haskell functions, persisting +// the memory state across these invocations. +// +// Besides hs_init_ghc(), the user may want to invoke some Haskell +// function to initialize some global state in the user code, this +// global state is used by subsequent invocations. Now, it's possible +// to run hs_init_ghc() & custom init logic in Haskell, then snapshot +// the entire memory into a new wasm module! And the user can call the +// new wasm module's exports directly, thus eliminating the +// initialization overhead at run-time entirely. +// +// There's one problem though. After the custom init logic runs, the +// RTS memory contains a lot of garbage data in various places. These +// garbage data will be snapshotted into the new wasm module, causing +// a significant size bloat. Therefore, we need an RTS API function +// that zeros out unused RTS memory. +// +// At the end of the day, the custom init function will be a small C +// function that first calls hs_init_ghc(), then calls a foreign +// exported Haskell function to initialize whatever global state the +// other Haskell functions need, followed by a hs_perform_gc() call to +// do a major GC, and finally an rts_clearMemory() call to zero out +// the unused RTS memory. +// +// Why add rts_clearMemory(), where there's the -DZ RTS flag that +// zeros freed memory on GC? The -DZ flag actually fills freed memory +// with a garbage byte like 0xAA, and the flag only works in debug +// RTS. Why not add a new RTS flag that zeros freed memory on the go? +// Because it only makes sense to do the zeroing once before +// snapshotting the memory, but there's no point to pay for the +// zeroing overhead at the new module's run-time. +void rts_clearMemory(void); + #if defined(__cplusplus) } #endif diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c index 257dc253ed..158698b08f 100644 --- a/rts/sm/BlockAlloc.c +++ b/rts/sm/BlockAlloc.c @@ -1395,3 +1395,17 @@ reportUnmarkedBlocks (void) } #endif + +void clear_free_list(void) { + for (uint32_t node = 0; node < n_numa_nodes; ++node) { + for (bdescr *bd = free_mblock_list[node]; bd != NULL; bd = bd->link) { + clear_blocks(bd); + } + + for (int ln = 0; ln < NUM_FREE_LISTS; ++ln) { + for (bdescr *bd = free_list[node][ln]; bd != NULL; bd = bd->link) { + clear_blocks(bd); + } + } + } +} diff --git a/rts/sm/BlockAlloc.h b/rts/sm/BlockAlloc.h index addee6cb83..5adde6c2c9 100644 --- a/rts/sm/BlockAlloc.h +++ b/rts/sm/BlockAlloc.h @@ -32,4 +32,6 @@ void reportUnmarkedBlocks (void); extern W_ n_alloc_blocks; // currently allocated blocks extern W_ hw_alloc_blocks; // high-water allocated blocks +RTS_PRIVATE void clear_free_list(void); + #include "EndPrivate.h" diff --git a/rts/sm/NonMoving.h b/rts/sm/NonMoving.h index d88a203dcb..37d3581882 100644 --- a/rts/sm/NonMoving.h +++ b/rts/sm/NonMoving.h @@ -356,6 +356,10 @@ void print_thread_list(StgTSO* tso); #endif +RTS_PRIVATE void clear_segment(struct NonmovingSegment*); + +RTS_PRIVATE void clear_segment_free_blocks(struct NonmovingSegment*); + #include "EndPrivate.h" #endif // CMINUSMINUS diff --git a/rts/sm/NonMovingSweep.c b/rts/sm/NonMovingSweep.c index ad2b422307..2b0cd44a53 100644 --- a/rts/sm/NonMovingSweep.c +++ b/rts/sm/NonMovingSweep.c @@ -106,14 +106,16 @@ void nonmovingGcCafs() debug_caf_list_snapshot = (StgIndStatic*)END_OF_CAF_LIST; } -static void +#endif + +void clear_segment(struct NonmovingSegment* seg) { size_t end = ((size_t)seg) + NONMOVING_SEGMENT_SIZE; memset(&seg->bitmap, 0, end - (size_t)&seg->bitmap); } -static void +void clear_segment_free_blocks(struct NonmovingSegment* seg) { unsigned int block_size = nonmovingSegmentBlockSize(seg); @@ -125,8 +127,6 @@ clear_segment_free_blocks(struct NonmovingSegment* seg) } } -#endif - GNUC_ATTR_HOT void nonmovingSweep(void) { while (nonmovingHeap.sweep_list) { diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c index 40d8a45806..986f6dd15a 100644 --- a/rts/sm/Storage.c +++ b/rts/sm/Storage.c @@ -1924,3 +1924,46 @@ The compacting collector does nothing to improve megablock level fragmentation. The role of the compacting GC is to remove object level fragmentation and to use less memory when collecting. - see #19248 */ + +void rts_clearMemory(void) { + ACQUIRE_SM_LOCK; + + clear_free_list(); + + for (uint32_t i = 0; i < n_nurseries; ++i) { + for (bdescr *bd = nurseries[i].blocks; bd; bd = bd->link) { + clear_blocks(bd); + } + } + + for (unsigned int i = 0; i < getNumCapabilities(); ++i) { + for (bdescr *bd = getCapability(i)->pinned_object_empty; bd; bd = bd->link) { + clear_blocks(bd); + } + + for (bdescr *bd = gc_threads[i]->free_blocks; bd; bd = bd->link) { + clear_blocks(bd); + } + } + + if (RtsFlags.GcFlags.useNonmoving) + { + for (struct NonmovingSegment *seg = nonmovingHeap.free; seg; seg = seg->link) { + clear_segment(seg); + } + + for (int i = 0; i < NONMOVING_ALLOCA_CNT; ++i) { + struct NonmovingAllocator *alloc = nonmovingHeap.allocators[i]; + + for (struct NonmovingSegment *seg = alloc->active; seg; seg = seg->link) { + clear_segment_free_blocks(seg); + } + + for (unsigned int j = 0; j < getNumCapabilities(); ++j) { + clear_segment_free_blocks(alloc->current[j]); + } + } + } + + RELEASE_SM_LOCK; +} diff --git a/rts/sm/Storage.h b/rts/sm/Storage.h index 0fecc50208..faec383c8f 100644 --- a/rts/sm/Storage.h +++ b/rts/sm/Storage.h @@ -206,4 +206,8 @@ extern StgIndStatic * dyn_caf_list; extern StgIndStatic * debug_caf_list; extern StgIndStatic * revertible_caf_list; +STATIC_INLINE void clear_blocks(bdescr *bd) { + memset(bd->start, 0, BLOCK_SIZE * bd->blocks); +} + #include "EndPrivate.h" diff --git a/testsuite/tests/ffi/should_run/ffi023_c.c b/testsuite/tests/ffi/should_run/ffi023_c.c index a8a5a15447..979c378b7d 100644 --- a/testsuite/tests/ffi/should_run/ffi023_c.c +++ b/testsuite/tests/ffi/should_run/ffi023_c.c @@ -5,5 +5,6 @@ HsInt out (HsInt x) { performMajorGC(); + rts_clearMemory(); return incall(x); } -- cgit v1.2.1