diff options
author | Michael Cahill <michael.cahill@wiredtiger.com> | 2013-05-27 17:35:36 +1000 |
---|---|---|
committer | Michael Cahill <michael.cahill@wiredtiger.com> | 2013-05-27 17:35:36 +1000 |
commit | 2b2ca70375063a536b9e3102571358644c8e5314 (patch) | |
tree | 48df8b37f6ff3a4378aeb8e63cad9334d046ed29 | |
parent | 36566e297d1a1064be90fe2d5c8361b657d9224b (diff) | |
download | mongo-2b2ca70375063a536b9e3102571358644c8e5314.tar.gz |
Simplify scans through the global table of transaction IDs.
Stop caching a separate copy of the oldest transaction ID required to stay in
memory in each transaction; track a single value in the global transaction
state instead. In the process, fix a race that could invalidate the
calculation of the oldest ID.
refs #552
-rw-r--r-- | src/btree/bt_evict.c | 16 | ||||
-rw-r--r-- | src/btree/bt_ovfl.c | 12 | ||||
-rw-r--r-- | src/include/extern.h | 7 | ||||
-rw-r--r-- | src/include/txn.h | 14 | ||||
-rw-r--r-- | src/include/txn.i | 10 | ||||
-rw-r--r-- | src/txn/txn.c | 260 | ||||
-rw-r--r-- | test/suite/test_perf001.py | 4 |
7 files changed, 123 insertions, 200 deletions
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c index c6b3eb17588..91a102c8d0a 100644 --- a/src/btree/bt_evict.c +++ b/src/btree/bt_evict.c @@ -361,9 +361,6 @@ __wt_evict_page(WT_SESSION_IMPL *session, WT_PAGE *page) } else __wt_txn_release_snapshot(session); - /* If the oldest transaction was updated, keep the newer value. */ - saved_txn.oldest_snap_min = txn->oldest_snap_min; - *txn = saved_txn; return (ret); } @@ -527,8 +524,8 @@ __wt_sync_file(WT_SESSION_IMPL *session, int syncop) /* Write dirty pages if nobody beat us to it. */ if (__wt_page_is_modified(page)) { if (txn->isolation == TXN_ISO_READ_COMMITTED) - __wt_txn_get_snapshot(session, - WT_TXN_NONE, WT_TXN_NONE, 0); + __wt_txn_refresh( + session, WT_TXN_NONE, 0, 1); ret = __wt_rec_write(session, page, NULL, 0); if (txn->isolation == TXN_ISO_READ_COMMITTED) __wt_txn_release_snapshot(session); @@ -708,9 +705,6 @@ __evict_walk(WT_SESSION_IMPL *session, u_int *entriesp, int clean) cache = S2C(session)->cache; retries = 0; - /* Update the oldest transaction ID -- we use it to filter pages. */ - __wt_txn_get_oldest(session); - /* * NOTE: we don't hold the schema lock, so we have to take care * that the handles we see are open and valid. @@ -804,7 +798,6 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean) WT_DECL_RET; WT_EVICT_ENTRY *end, *evict, *start; WT_PAGE *page; - wt_txnid_t oldest_txn; int modified, restarts, levels; btree = S2BT(session); @@ -813,7 +806,6 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean) end = start + WT_EVICT_WALK_PER_FILE; if (end > cache->evict + cache->evict_slots) end = cache->evict + cache->evict_slots; - oldest_txn = session->txn.oldest_snap_min; WT_ASSERT(session, btree->evict_page == NULL || WT_PAGE_IS_ROOT(btree->evict_page) || @@ -924,8 +916,8 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean) * transaction that were running last time we wrote the * page has since rolled back. 
*/ - if (modified && - TXNID_LE(oldest_txn, page->modify->disk_txn) && + if (modified && !__wt_txn_visible_all(session, + page->modify->disk_txn) && !F_ISSET(cache, WT_EVICT_STUCK)) continue; } diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c index dee629aea0b..9e08762a5c8 100644 --- a/src/btree/bt_ovfl.c +++ b/src/btree/bt_ovfl.c @@ -165,19 +165,11 @@ __ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip) first = WT_ROW_UPDATE(page, rip); WT_ASSERT(session, first != NULL); - /* - * Check to see if there's a globally visible update. If there's no - * globally visible update using our cached copy of the oldest ID - * required in the system, refresh that ID and rescan, it's better - * than doing I/O and caching copies of an overflow record. - */ - for (upd = first; upd != NULL; upd = upd->next) - if (__wt_txn_visible_all(session, upd->txnid)) - return (1); - __wt_txn_get_oldest(session); + /* Check to see if there's a globally visible update. */ for (upd = first; upd != NULL; upd = upd->next) if (__wt_txn_visible_all(session, upd->txnid)) return (1); + return (0); } diff --git a/src/include/extern.h b/src/include/extern.h index aa7cf2d694a..99253ed1060 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -1246,11 +1246,10 @@ extern void __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats); extern void __wt_stat_clear_connection_stats(void *stats_arg); extern int __wt_txnid_cmp(const void *v1, const void *v2); extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session); -extern void __wt_txn_get_oldest(WT_SESSION_IMPL *session); -extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session, - wt_txnid_t my_id, +extern void __wt_txn_refresh( WT_SESSION_IMPL *session, wt_txnid_t max_id, - int committing); + int alloc_id, + int get_snapshot); extern void __wt_txn_get_evict_snapshot(WT_SESSION_IMPL *session); extern int __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]); extern void 
__wt_txn_release(WT_SESSION_IMPL *session); diff --git a/src/include/txn.h b/src/include/txn.h index 6d022c2d078..4d32fccbbd0 100644 --- a/src/include/txn.h +++ b/src/include/txn.h @@ -49,7 +49,15 @@ struct __wt_txn_state { struct __wt_txn_global { volatile wt_txnid_t current; /* Current transaction ID. */ + + /* + * The oldest transaction ID that is not yet visible to some + * transaction in the system. + */ + volatile wt_txnid_t oldest_id; + volatile uint32_t gen; /* Completed transaction generation */ + WT_TXN_STATE *states; /* Per-session transaction states */ }; @@ -74,12 +82,6 @@ struct __wt_txn { wt_txnid_t *snapshot; uint32_t snapshot_count; - /* - * When this transaction started, the oldest transaction ID that was - * not yet visible to some transaction in the system. - */ - wt_txnid_t oldest_snap_min; - /* Saved global state, to avoid repeating scans. */ wt_txnid_t last_id; uint32_t last_gen; diff --git a/src/include/txn.i b/src/include/txn.i index 8d003184e7b..4e597853694 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -140,10 +140,12 @@ __wt_txn_visible(WT_SESSION_IMPL *session, wt_txnid_t id) static inline int __wt_txn_visible_all(WT_SESSION_IMPL *session, wt_txnid_t id) { - WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + wt_txnid_t oldest_id; - txn = &session->txn; - return (TXNID_LT(id, txn->oldest_snap_min)); + txn_global = &S2C(session)->txn_global; + oldest_id = txn_global->oldest_id; + return (TXNID_LT(id, oldest_id)); } /* @@ -276,7 +278,7 @@ __wt_txn_read_first(WT_SESSION_IMPL *session) if (txn->isolation == TXN_ISO_READ_COMMITTED || (!F_ISSET(txn, TXN_RUNNING) && txn->isolation == TXN_ISO_SNAPSHOT)) - __wt_txn_get_snapshot(session, WT_TXN_NONE, WT_TXN_NONE, 0); + __wt_txn_refresh(session, WT_TXN_NONE, 0, 1); else if (!F_ISSET(txn, TXN_RUNNING)) txn_state->snap_min = txn_global->current; } diff --git a/src/txn/txn.c b/src/txn/txn.c index 3c7373446c5..ca454f8266a 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -28,11 +28,14 @@ 
__wt_txnid_cmp(const void *v1, const void *v2) */ static void __txn_sort_snapshot(WT_SESSION_IMPL *session, - uint32_t n, wt_txnid_t id, wt_txnid_t oldest_snap_min) + uint32_t n, wt_txnid_t id, wt_txnid_t oldest_id) { WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + wt_txnid_t id_copy; txn = &session->txn; + txn_global = &S2C(session)->txn_global; if (n > 1) qsort(txn->snapshot, n, sizeof(wt_txnid_t), __wt_txnid_cmp); @@ -40,11 +43,18 @@ __txn_sort_snapshot(WT_SESSION_IMPL *session, txn->snap_min = (n == 0) ? id : txn->snapshot[0]; txn->snap_max = id; WT_ASSERT(session, n == 0 || txn->snap_min != WT_TXN_NONE); - if (TXNID_LT(txn->snap_min, oldest_snap_min)) - oldest_snap_min = txn->snap_min; + if (TXNID_LT(txn->snap_min, oldest_id)) + oldest_id = txn->snap_min; - if (TXNID_LT(txn->oldest_snap_min, oldest_snap_min)) - txn->oldest_snap_min = oldest_snap_min; + /* + * Update the oldest snapshot if we have a newer value. We're not + * holding a lock, so copy then swap into place if our oldest version + * is newer. + */ + do { + id_copy = txn_global->oldest_id; + } while (TXNID_LT(id_copy, oldest_id) && + !WT_ATOMIC_CAS(txn_global->oldest_id, id_copy, oldest_id)); } /* @@ -61,131 +71,117 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session) } /* - * __wt_txn_get_oldest -- - * Update the current transaction's cached copy of the oldest possible - * snap_min value. + * __wt_txn_refresh -- + * Allocate a transaction ID and/or a snapshot. 
*/ void -__wt_txn_get_oldest(WT_SESSION_IMPL *session) +__wt_txn_refresh( + WT_SESSION_IMPL *session, wt_txnid_t max_id, int alloc_id, int get_snapshot) { WT_CONNECTION_IMPL *conn; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *s; - wt_txnid_t current_id, id, oldest_snap_min; - uint32_t i, session_cnt; + WT_TXN_STATE *s, *txn_state; + wt_txnid_t current_id, id, oldest_id; + uint32_t i, n, session_cnt; conn = S2C(session); txn = &session->txn; txn_global = &conn->txn_global; + txn_state = &txn_global->states[session->id]; + oldest_id = txn_global->oldest_id; + + /* If nothing has changed since last time, we're done. */ + if (!alloc_id && + txn->id == max_id && + txn->last_id == txn_global->current && + txn->last_gen == txn_global->gen && + TXNID_LE(oldest_id, txn->snap_min)) + goto done; do { - current_id = txn_global->current; - oldest_snap_min = - (txn->id != WT_TXN_NONE) ? txn->id : current_id + 1; + /* Take a copy of the current transaction generation. */ + txn->last_gen = txn_global->gen; - WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = 0, s = txn_global->states; - i < session_cnt; - i++, s++) { - if ((id = s->snap_min) != WT_TXN_NONE && - TXNID_LT(id, oldest_snap_min)) - oldest_snap_min = id; + if (alloc_id) { /* - * It is possible that there is no snapshot active, - * even though there are transactions running (at - * isolation levels lower than snapshot isolation). If - * a new snapshot is taken, it will have a snap_min - * value of the lowest running transaction. + * Allocate a transaction ID. + * + * We use an atomic compare and swap to ensure that we + * get a unique ID that is published before the global + * counter is updated. * - * We need to make sure that the oldest snap_min we - * calculate won't be made invalid in that case, so - * make sure it is at least as old as the oldest - * running transaction. + * If two threads race to allocate an ID, only the + * latest ID will proceed. 
The winning thread can be + * sure its snapshot contains all of the earlier active + * IDs. Threads that race and get an earlier ID may + * not appear in the snapshot, but they will loop and + * allocate a new ID before proceeding to make any + * updates. + * + * This potentially wastes transaction IDs when threads + * race to begin transactions: that is the price we pay + * to keep this path latch free. */ - if ((id = s->id) != WT_TXN_NONE && - TXNID_LT(id, oldest_snap_min)) - oldest_snap_min = id; + do { + current_id = txn_global->current; + txn_state->id = txn->id = current_id + 1; + } while (!WT_ATOMIC_CAS(txn_global->current, + current_id, txn->id) || + txn->id == WT_TXN_NONE || + txn->id == WT_TXN_ABORTED); + txn->last_id = oldest_id = max_id = txn->id; + } else { + txn->last_id = current_id = txn_global->current; + oldest_id = + (max_id != WT_TXN_NONE) ? max_id : current_id + 1; } - } while (current_id != txn_global->current); - - if (TXNID_LT(txn->oldest_snap_min, oldest_snap_min)) - txn->oldest_snap_min = oldest_snap_min; -} - -/* - * __wt_txn_get_snapshot -- - * Set up a snapshot in the current transaction, without allocating an ID. - */ -void -__wt_txn_get_snapshot(WT_SESSION_IMPL *session, - wt_txnid_t my_id, wt_txnid_t max_id, int committing) -{ - WT_CONNECTION_IMPL *conn; - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *s, *txn_state; - wt_txnid_t current_id, id, oldest_snap_min; - uint32_t i, n, session_cnt; - - conn = S2C(session); - txn = &session->txn; - txn_global = &conn->txn_global; - txn_state = &txn_global->states[session->id]; - /* If nothing has changed since last time, we're done. */ - if (!committing && txn->last_id == txn_global->current && - txn->last_gen == txn_global->gen) { - WT_ASSERT(session, - TXNID_LE(txn->oldest_snap_min, txn->snap_min)); - txn_state->snap_min = txn->snap_min; - return; - } + if (!get_snapshot) + return; - do { - /* Take a copy of the current session ID. 
*/ - txn->last_gen = txn_global->gen; - txn->last_id = current_id = txn_global->current; - oldest_snap_min = current_id + 1; + /* + * Publish a new snap_min value that we refine below. This + * prevents the global oldest value from moving forward + * underneath us. + */ + txn_state->snap_min = txn_global->oldest_id; /* Copy the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) { - if (!committing && (id = s->id) != WT_TXN_NONE && - TXNID_LT(id, oldest_snap_min)) - oldest_snap_min = id; /* - * Ignore everything else about the session's own + * Ignore everything about the session's own * transaction: we are in the process of updating it. */ - if (i == session->id) + if (s == txn_state) continue; - if (id != WT_TXN_NONE && - (max_id == WT_TXN_NONE || TXNID_LT(id, max_id))) + if ((id = s->id) != WT_TXN_NONE) { txn->snapshot[n++] = id; - /* Ignore the session's own transaction. */ + if (TXNID_LT(id, oldest_id)) + oldest_id = id; + } if ((id = s->snap_min) != WT_TXN_NONE && - TXNID_LT(id, oldest_snap_min)) - oldest_snap_min = id; + TXNID_LT(id, oldest_id)) + oldest_id = id; } + __txn_sort_snapshot(session, n, current_id + 1, oldest_id); + /* * Ensure the snapshot reads are scheduled before re-checking * the global current ID. */ WT_READ_BARRIER(); - } while (current_id != txn_global->current || - txn->last_gen != txn_global->gen); - - __txn_sort_snapshot(session, n, - (max_id != WT_TXN_NONE) ? max_id : current_id + 1, - oldest_snap_min); - id = (my_id == WT_TXN_NONE || TXNID_LT(txn->snap_min, my_id)) ? 
- txn->snap_min : my_id; - WT_ASSERT(session, committing || TXNID_LE(oldest_snap_min, id)); - txn_state->snap_min = id; + } while (txn->last_id != txn_global->current || + txn->last_gen != txn_global->gen || + TXNID_LT(txn->snap_min, txn_global->oldest_id)); + +done: WT_ASSERT(session, + TXNID_LE(txn_global->oldest_id, txn->snap_min)); + txn_state->snap_min = txn->snap_min; } /* @@ -196,17 +192,17 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session, void __wt_txn_get_evict_snapshot(WT_SESSION_IMPL *session) { - WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + wt_txnid_t oldest_id; - txn = &session->txn; + txn_global = &S2C(session)->txn_global; /* * The oldest active snapshot ID in the system should *not* be visible * to eviction. Create a snapshot containing that ID. */ - __wt_txn_get_oldest(session); - __txn_sort_snapshot( - session, 0, txn->oldest_snap_min, txn->oldest_snap_min); + oldest_id = txn_global->oldest_id; + __txn_sort_snapshot(session, 0, oldest_id, oldest_id); /* * Note that we carefully don't update the global table with this @@ -226,9 +222,7 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) WT_CONNECTION_IMPL *conn; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *s, *txn_state; - wt_txnid_t id, oldest_snap_min; - uint32_t i, n, session_cnt; + WT_TXN_STATE *txn_state; conn = S2C(session); txn = &session->txn; @@ -248,66 +242,8 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) TXN_ISO_READ_COMMITTED : TXN_ISO_READ_UNCOMMITTED; F_SET(txn, TXN_RUNNING); - - do { - /* - * Allocate a transaction ID. - * - * We use an atomic increment to ensure that we get a unique - * ID, then publish that to the global state table. - * - * If two threads race to allocate an ID, only the latest ID - * will proceed. The winning thread can be sure its snapshot - * contains all of the earlier active IDs. 
Threads that race - * and get an earlier ID may not appear in the snapshot, - * but they will loop and allocate a new ID before proceeding - * to make any updates. - * - * This potentially wastes transaction IDs when threads race to - * begin transactions, but that is the price we pay to keep - * this path latch free. - */ - do { - txn->id = WT_ATOMIC_ADD(txn_global->current, 1); - } while (txn->id == WT_TXN_NONE || txn->id == WT_TXN_ABORTED); - WT_PUBLISH(txn_state->id, txn->id); - - /* - * If we are starting a snapshot isolation transaction, get - * a snapshot of the running transactions. - * - * If we already have a snapshot (e.g., for an auto-commit - * operation), update it so that the newly-allocated ID is - * visible. - */ - if (txn->isolation == TXN_ISO_SNAPSHOT) { - txn->last_gen = txn_global->gen; - oldest_snap_min = txn->id; - - /* Copy the array of concurrent transactions. */ - WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = n = 0, s = txn_global->states; - i < session_cnt; - i++, s++) { - if ((id = s->snap_min) != WT_TXN_NONE && - TXNID_LT(id, oldest_snap_min)) - oldest_snap_min = id; - if ((id = s->id) != WT_TXN_NONE) - txn->snapshot[n++] = id; - } - - __txn_sort_snapshot( - session, n, txn->id, oldest_snap_min); - txn_state->snap_min = txn->snap_min; - } - - /* - * Ensure the snapshot reads are complete before re-checking - * the global current ID. - */ - WT_READ_BARRIER(); - } while (txn->id != txn_global->current); - + __wt_txn_refresh( + session, WT_TXN_NONE, 1, txn->isolation == TXN_ISO_SNAPSHOT); return (0); } @@ -373,8 +309,8 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) * the cursor. Get the new snapshot before releasing the ID for the * commit. 
*/ - if (session->ncursors > 0) - __wt_txn_get_snapshot(session, txn->id + 1, WT_TXN_NONE, 1); + if (session->ncursors > 0 && txn->isolation != TXN_ISO_READ_UNCOMMITTED) + __wt_txn_refresh(session, txn->id + 1, 0, 1); __wt_txn_release(session); return (0); } diff --git a/test/suite/test_perf001.py b/test/suite/test_perf001.py index 7df5e3fab29..a97ff5cf5d9 100644 --- a/test/suite/test_perf001.py +++ b/test/suite/test_perf001.py @@ -38,8 +38,8 @@ class test_perf001(wttest.WiredTigerTestCase): scenarios = [ #('file-file', dict(tabletype='file',indextype='file')), - #('file-lsm', dict(tabletype='file',indextype='lsm')), - ('lsm-file', dict(tabletype='lsm',indextype='file')), + ('file-lsm', dict(tabletype='file',indextype='lsm')), + #('lsm-file', dict(tabletype='lsm',indextype='file')), #('lsm-lsm', dict(tabletype='lsm',indextype='lsm')), ] |