diff options
author | Simon Marlow <marlowsd@gmail.com> | 2011-11-28 16:48:43 +0000 |
---|---|---|
committer | Simon Marlow <marlowsd@gmail.com> | 2011-11-29 12:21:18 +0000 |
commit | 50de6034343abc93a7b01daccff34121042c0e7c (patch) | |
tree | 24496a5fc6bc39c6baaa574608e53c5d76c169f6 /rts/PrimOps.cmm | |
parent | 1c2b838131134d44004dfdff18c302131478390d (diff) | |
download | haskell-50de6034343abc93a7b01daccff34121042c0e7c.tar.gz |
Make profiling work with multiple capabilities (+RTS -N)
This means that both time and heap profiling work for parallel
programs. Main internal changes:
- CCCS is no longer a global variable; it is now another
pseudo-register in the StgRegTable struct. Thus every
Capability has its own CCCS.
- There is a new built-in CCS called "IDLE", which records ticks for
Capabilities in the idle state. If you profile a single-threaded
program with +RTS -N2, you'll see about 50% of time in "IDLE".
- There is appropriate locking in rts/Profiling.c to protect the
shared cost-centre-stack data structures.
This patch does enough to get it working, but I have cut one big corner:
the cost-centre-stack data structure is still shared amongst all
Capabilities, which means that multiple Capabilities will race when
updating the "allocations" and "entries" fields of a CCS. Not only
does this give unpredictable results, but the program also runs very slowly
due to cache-line bouncing.
It is strongly recommended that you use -fno-prof-count-entries to
disable the "entries" count when profiling parallel programs. (I shall
add a note to this effect to the docs).
Diffstat (limited to 'rts/PrimOps.cmm')
-rw-r--r-- | rts/PrimOps.cmm | 40 |
1 files changed, 20 insertions, 20 deletions
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm index 8836d3bfe6..2ca347e803 100644 --- a/rts/PrimOps.cmm +++ b/rts/PrimOps.cmm @@ -63,7 +63,7 @@ stg_newByteArrayzh words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words; ("ptr" p) = foreign "C" allocate(MyCapability() "ptr",words) []; TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0); - SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]); + SET_HDR(p, stg_ARR_WORDS_info, CCCS); StgArrWords_bytes(p) = n; RET_P(p); } @@ -96,7 +96,7 @@ stg_newPinnedByteArrayzh to BA_ALIGN bytes: */ p = p + ((-p - SIZEOF_StgArrWords) & BA_MASK); - SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]); + SET_HDR(p, stg_ARR_WORDS_info, CCCS); StgArrWords_bytes(p) = n; RET_P(p); } @@ -136,7 +136,7 @@ stg_newAlignedPinnedByteArrayzh <alignment> is a power of 2, which is technically not guaranteed */ p = p + ((-p - SIZEOF_StgArrWords) & (alignment - 1)); - SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]); + SET_HDR(p, stg_ARR_WORDS_info, CCCS); StgArrWords_bytes(p) = n; RET_P(p); } @@ -157,7 +157,7 @@ stg_newArrayzh ("ptr" arr) = foreign "C" allocate(MyCapability() "ptr",words) [R2]; TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0); - SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]); + SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, CCCS); StgMutArrPtrs_ptrs(arr) = n; StgMutArrPtrs_size(arr) = size; @@ -225,7 +225,7 @@ stg_newMutVarzh ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, stg_newMutVarzh); mv = Hp - SIZEOF_StgMutVar + WDS(1); - SET_HDR(mv,stg_MUT_VAR_DIRTY_info,W_[CCCS]); + SET_HDR(mv,stg_MUT_VAR_DIRTY_info,CCCS); StgMutVar_var(mv) = R1; RET_P(mv); @@ -297,21 +297,21 @@ stg_atomicModifyMutVarzh TICK_ALLOC_THUNK_2(); CCCS_ALLOC(THUNK_2_SIZE); z = Hp - THUNK_2_SIZE + WDS(1); - SET_HDR(z, stg_ap_2_upd_info, W_[CCCS]); + SET_HDR(z, stg_ap_2_upd_info, CCCS); LDV_RECORD_CREATE(z); StgThunk_payload(z,0) = f; TICK_ALLOC_THUNK_1(); CCCS_ALLOC(THUNK_1_SIZE); y = z - THUNK_1_SIZE; - SET_HDR(y, stg_sel_0_upd_info, W_[CCCS]); + SET_HDR(y, stg_sel_0_upd_info, CCCS); 
LDV_RECORD_CREATE(y); StgThunk_payload(y,0) = z; TICK_ALLOC_THUNK_1(); CCCS_ALLOC(THUNK_1_SIZE); r = y - THUNK_1_SIZE; - SET_HDR(r, stg_sel_1_upd_info, W_[CCCS]); + SET_HDR(r, stg_sel_1_upd_info, CCCS); LDV_RECORD_CREATE(r); StgThunk_payload(r,0) = z; @@ -353,7 +353,7 @@ stg_mkWeakzh ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, stg_mkWeakzh ); w = Hp - SIZEOF_StgWeak + WDS(1); - SET_HDR(w, stg_WEAK_info, W_[CCCS]); + SET_HDR(w, stg_WEAK_info, CCCS); // We don't care about cfinalizer here. // Should StgWeak_cfinalizer(w) be stg_NO_FINALIZER_closure or @@ -397,14 +397,14 @@ stg_mkWeakForeignEnvzh ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR, stg_mkWeakForeignEnvzh ); w = Hp - SIZEOF_StgWeak + WDS(1); - SET_HDR(w, stg_WEAK_info, W_[CCCS]); + SET_HDR(w, stg_WEAK_info, CCCS); payload_words = 4; words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words; ("ptr" p) = foreign "C" allocate(MyCapability() "ptr", words) []; TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0); - SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]); + SET_HDR(p, stg_ARR_WORDS_info, CCCS); StgArrWords_bytes(p) = WDS(payload_words); StgArrWords_payload(p,0) = fptr; @@ -877,7 +877,7 @@ stg_atomicallyzh Sp = Sp - SIZEOF_StgAtomicallyFrame; frame = Sp; - SET_HDR(frame,stg_atomically_frame_info, W_[CCCS]); + SET_HDR(frame,stg_atomically_frame_info, CCCS); StgAtomicallyFrame_code(frame) = R1; StgAtomicallyFrame_result(frame) = NO_TREC; StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE; @@ -903,7 +903,7 @@ stg_catchSTMzh Sp = Sp - SIZEOF_StgCatchSTMFrame; frame = Sp; - SET_HDR(frame, stg_catch_stm_frame_info, W_[CCCS]); + SET_HDR(frame, stg_catch_stm_frame_info, CCCS); StgCatchSTMFrame_handler(frame) = R2; StgCatchSTMFrame_code(frame) = R1; @@ -941,7 +941,7 @@ stg_catchRetryzh Sp = Sp - SIZEOF_StgCatchRetryFrame; frame = Sp; - SET_HDR(frame, stg_catch_retry_frame_info, W_[CCCS]); + SET_HDR(frame, stg_catch_retry_frame_info, CCCS); StgCatchRetryFrame_running_alt_code(frame) = 
0 :: CInt; // false; StgCatchRetryFrame_first_code(frame) = R1; StgCatchRetryFrame_alt_code(frame) = R2; @@ -1153,7 +1153,7 @@ stg_newMVarzh ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, stg_newMVarzh ); mvar = Hp - SIZEOF_StgMVar + WDS(1); - SET_HDR(mvar,stg_MVAR_DIRTY_info,W_[CCCS]); + SET_HDR(mvar,stg_MVAR_DIRTY_info,CCCS); // MVARs start dirty: generation 0 has no mutable list StgMVar_head(mvar) = stg_END_TSO_QUEUE_closure; StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure; @@ -1527,7 +1527,7 @@ stg_makeStableNamezh */ if ( snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) == NULL ) { sn_obj = Hp - SIZEOF_StgStableName + WDS(1); - SET_HDR(sn_obj, stg_STABLE_NAME_info, W_[CCCS]); + SET_HDR(sn_obj, stg_STABLE_NAME_info, CCCS); StgStableName_sn(sn_obj) = index; snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) = sn_obj; } else { @@ -1578,7 +1578,7 @@ stg_newBCOzh ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R5_PTR, stg_newBCOzh ); bco = Hp - bytes + WDS(1); - SET_HDR(bco, stg_BCO_info, W_[CCCS]); + SET_HDR(bco, stg_BCO_info, CCCS); StgBCO_instrs(bco) = R1; StgBCO_literals(bco) = R2; @@ -1617,7 +1617,7 @@ stg_mkApUpd0zh CCCS_ALLOC(SIZEOF_StgAP); ap = Hp - SIZEOF_StgAP + WDS(1); - SET_HDR(ap, stg_AP_info, W_[CCCS]); + SET_HDR(ap, stg_AP_info, CCCS); StgAP_n_args(ap) = HALF_W_(0); StgAP_fun(ap) = R1; @@ -1668,7 +1668,7 @@ out: ptrs_arr = Hp - nptrs_arr_sz - ptrs_arr_sz + WDS(1); nptrs_arr = Hp - nptrs_arr_sz + WDS(1); - SET_HDR(ptrs_arr, stg_MUT_ARR_PTRS_FROZEN_info, W_[CCCS]); + SET_HDR(ptrs_arr, stg_MUT_ARR_PTRS_FROZEN_info, CCCS); StgMutArrPtrs_ptrs(ptrs_arr) = ptrs; StgMutArrPtrs_size(ptrs_arr) = ptrs + ptrs_arr_cards; @@ -1683,7 +1683,7 @@ for: allocated in the nursery. The GC will fill it in if/when the array is promoted. */ - SET_HDR(nptrs_arr, stg_ARR_WORDS_info, W_[CCCS]); + SET_HDR(nptrs_arr, stg_ARR_WORDS_info, CCCS); StgArrWords_bytes(nptrs_arr) = WDS(nptrs); p = 0; for2: |