diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2016-06-01 16:09:51 +1000 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2016-06-01 16:09:55 +1000 |
commit | ee0fe8abde8354fffb4b03c0a018a54978690046 (patch) | |
tree | 07a880d5875e7342740da839dd400f271a5fe02d | |
parent | d0c19b82b88f3424dd40b7fd1bc7d4bbcf90414f (diff) | |
download | mongo-ee0fe8abde8354fffb4b03c0a018a54978690046.tar.gz |
Import wiredtiger-wiredtiger-2.8.0-209-g234b68b.tar.gz from wiredtiger branch mongodb-3.2 (r3.2.7-rc0)
ref: 88b898e..234b68b
SERVER-24306 40-second journaling stall from "log files prepared" to checkpoint
WT-2559 Windows segfault in logging code
WT-2560 Stuck trying to update oldest transaction ID
WT-2613 Windows build failing with a C4100 error
WT-2629 Introduction of ppc64le crc32c assembly file has made the stack executable
SERVER-23954 WiredTiger changes for MongoDB 3.2.7
19 files changed, 192 insertions, 147 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 1f3ac443495..018eb6ed73a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -325,7 +325,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) valid = false; if (F_ISSET(cbt, WT_CBT_ACTIVE) && cbt->ref->page->read_gen != WT_READGEN_OLDEST) { - __wt_txn_cursor_op(session); + WT_ERR(__wt_txn_cursor_op(session)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, cbt->ref, false) : @@ -405,7 +405,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) if (btree->type == BTREE_ROW && F_ISSET(cbt, WT_CBT_ACTIVE) && cbt->ref->page->read_gen != WT_READGEN_OLDEST) { - __wt_txn_cursor_op(session); + WT_ERR(__wt_txn_cursor_op(session)); WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true)); diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 5cf6a9bf2bc..6a1203628a9 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -326,7 +326,7 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) __wt_page_evict_soon(page); /* Bump the oldest ID, we're about to do some visibility checks. */ - __wt_txn_update_oldest(session, false); + WT_RET(__wt_txn_update_oldest(session, false)); /* If eviction cannot succeed, don't try. 
*/ return (__wt_page_can_evict(session, ref, NULL)); diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 826589f8bdd..5d60c436a08 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -81,7 +81,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) if (__wt_page_is_modified(page) && WT_TXNID_LT(page->modify->update_txn, oldest_id)) { if (txn->isolation == WT_ISO_READ_COMMITTED) - __wt_txn_get_snapshot(session); + WT_ERR(__wt_txn_get_snapshot(session)); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, 0)); @@ -100,7 +100,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * the metadata shouldn't be that big, and (b) if we do ever */ if (txn->isolation == WT_ISO_READ_COMMITTED) - __wt_txn_get_snapshot(session); + WT_ERR(__wt_txn_get_snapshot(session)); /* * We cannot check the tree modified flag in the case of a diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index 38c3288209e..f5722d343f7 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -93,7 +93,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * transaction ID will catch up with the current ID. 
*/ for (;;) { - __wt_txn_update_oldest(session, true); + WT_TRET(__wt_txn_update_oldest(session, true)); if (txn_global->oldest_id == txn_global->current) break; __wt_yield(); diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c index 804c24a3d2e..d2b8d81ab37 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_ds.c +++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c @@ -16,7 +16,7 @@ static int __curds_txn_enter(WT_SESSION_IMPL *session) { session->ncursors++; /* XXX */ - __wt_txn_cursor_op(session); + WT_RET(__wt_txn_cursor_op(session)); return (0); } diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c index ca98b1bd62a..1da2e959b6a 100644 --- a/src/third_party/wiredtiger/src/evict/evict_file.c +++ b/src/third_party/wiredtiger/src/evict/evict_file.c @@ -26,7 +26,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) WT_RET(__wt_evict_file_exclusive_on(session)); /* Make sure the oldest transaction ID is up-to-date. */ - __wt_txn_update_oldest(session, true); + WT_RET(__wt_txn_update_oldest(session, true)); /* Walk the tree, discarding pages. */ next_ref = NULL; diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index d3e32d7fc23..be8cc1df956 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -594,7 +594,7 @@ __evict_pass(WT_SESSION_IMPL *session) * of whether the cache is full, to prevent the oldest ID * falling too far behind. 
*/ - __wt_txn_update_oldest(session, true); + WT_RET(__wt_txn_update_oldest(session, loop > 0)); if (!__evict_update_work(session)) break; diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index f0d4752cc83..2d20f53e9ae 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -420,7 +420,7 @@ __evict_review( * fallen behind current. */ if (modified) - __wt_txn_update_oldest(session, true); + WT_RET(__wt_txn_update_oldest(session, false)); if (!__wt_page_can_evict(session, ref, inmem_splitp)) return (EBUSY); diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index 8ab96c0a69d..553dd03f958 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -270,7 +270,7 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter) * to read. */ if (!F_ISSET(cbt, WT_CBT_NO_TXN)) - __wt_txn_cursor_op(session); + WT_RET(__wt_txn_cursor_op(session)); return (0); } diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 292bcfb1c7c..792700555dd 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -676,8 +676,8 @@ extern void __wt_stat_join_clear_single(WT_JOIN_STATS *stats); extern void __wt_stat_join_clear_all(WT_JOIN_STATS **stats); extern void __wt_stat_join_aggregate( WT_JOIN_STATS **from, WT_JOIN_STATS *to); extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session); -extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session); -extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force); +extern int __wt_txn_get_snapshot(WT_SESSION_IMPL *session); +extern int __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force); extern int __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]); 
extern void __wt_txn_release(WT_SESSION_IMPL *session); extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]); diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i index fa920de7e37..c0cd9c85ee9 100644 --- a/src/third_party/wiredtiger/src/include/serial.i +++ b/src/third_party/wiredtiger/src/include/serial.i @@ -306,7 +306,7 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, if ((txn = page->modify->obsolete_check_txn) != WT_TXN_NONE) { if (!__wt_txn_visible_all(session, txn)) { /* Try to move the oldest ID forward and re-check. */ - __wt_txn_update_oldest(session, false); + WT_RET(__wt_txn_update_oldest(session, false)); if (!__wt_txn_visible_all(session, txn)) return (0); diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 1e82e2d982a..d10738cc670 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -74,7 +74,7 @@ struct __wt_txn_global { volatile uint64_t current; /* Current transaction ID. */ /* The oldest running transaction ID (may race). */ - uint64_t last_running; + volatile uint64_t last_running; /* * The oldest transaction ID that is not yet visible to some @@ -82,8 +82,11 @@ struct __wt_txn_global { */ volatile uint64_t oldest_id; - /* Count of scanning threads, or -1 for exclusive access. */ - volatile int32_t scan_count; + /* + * Prevents the oldest ID moving forwards while threads are scanning + * the global transaction state. + */ + WT_RWLOCK *scan_rwlock; /* * Track information about the running checkpoint. 
The transaction diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index f5ca44c2ada..96f7426e421 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -261,14 +261,14 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) * eviction, it's better to do it beforehand. */ WT_RET(__wt_cache_eviction_check(session, false, NULL)); - - __wt_txn_get_snapshot(session); + WT_RET(__wt_txn_get_snapshot(session)); } F_SET(txn, WT_TXN_RUNNING); if (F_ISSET(S2C(session), WT_CONN_READONLY)) F_SET(txn, WT_TXN_READONLY); - return (false); + + return (0); } /* @@ -450,7 +450,7 @@ __wt_txn_read_last(WT_SESSION_IMPL *session) * __wt_txn_cursor_op -- * Called for each cursor operation. */ -static inline void +static inline int __wt_txn_cursor_op(WT_SESSION_IMPL *session) { WT_TXN *txn; @@ -482,7 +482,9 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) if (txn_state->snap_min == WT_TXN_NONE) txn_state->snap_min = txn_global->last_running; } else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) - __wt_txn_get_snapshot(session); + WT_RET(__wt_txn_get_snapshot(session)); + + return (0); } /* diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 1132b54f335..b11cd55844d 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -8,6 +8,8 @@ #include "wt_internal.h" +static int __log_openfile( + WT_SESSION_IMPL *, bool, WT_FH **, const char *, uint32_t); static int __log_write_internal( WT_SESSION_IMPL *, WT_ITEM *, WT_LSN *, uint32_t); @@ -93,8 +95,9 @@ __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn) int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) { - WT_LOG *log; WT_DECL_RET; + WT_FH *log_fh; + WT_LOG *log; log = S2C(session)->log; @@ -129,12 +132,21 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) * Sync the log file if needed. 
*/ if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) { + /* + * Get our own file handle to the log file. It is possible + * for the file handle in the log structure to change out + * from under us and either be NULL or point to a different + * file than we want. + */ + WT_ERR(__log_openfile(session, + false, &log_fh, WT_LOG_FILENAME, min_lsn->l.file)); WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_force_sync: sync %s to LSN %" PRIu32 "/%" PRIu32, - log->log_fh->name, min_lsn->l.file, min_lsn->l.offset)); - WT_ERR(__wt_fsync(session, log->log_fh, true)); + log_fh->name, min_lsn->l.file, min_lsn->l.offset)); + WT_ERR(__wt_fsync(session, log_fh, true)); log->sync_lsn = *min_lsn; WT_STAT_FAST_CONN_INCR(session, log_sync); + WT_ERR(__wt_close(session, &log_fh)); WT_ERR(__wt_cond_signal(session, log->log_sync_cond)); } err: @@ -2128,10 +2140,19 @@ __wt_log_flush(WT_SESSION_IMPL *session, uint32_t flags) * We need to flush out the current slot first to get the real * end of log LSN in log->alloc_lsn. */ - WT_RET(__wt_log_flush_lsn(session, &lsn, 0)); + WT_RET(__wt_log_flush_lsn(session, &lsn, false)); last_lsn = log->alloc_lsn; /* + * If the last write caused a switch to a new log file, we should only + * wait for the last write to be flushed. Otherwise, if the workload + * is single-threaded we could wait here forever because the write LSN + * doesn't switch into the new file until it contains a record. + */ + if (last_lsn.l.offset == WT_LOG_FIRST_RECORD) + last_lsn = log->log_close_lsn; + + /* * Wait until all current outstanding writes have been written * to the file system. 
*/ diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index e023b2b407e..9ca850da9f1 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -210,7 +210,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) goto open; if (txn->isolation == WT_ISO_SNAPSHOT) - __wt_txn_cursor_op(session); + WT_RET(__wt_txn_cursor_op(session)); /* * Figure out how many updates are required for diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index 87771e2cb6c..51cf2e981de 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -289,7 +289,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, } /* Stop if a running transaction needs the chunk. */ - __wt_txn_update_oldest(session, true); + WT_RET(__wt_txn_update_oldest(session, true)); if (chunk->switch_txn == WT_TXN_NONE || !__wt_txn_visible_all(session, chunk->switch_txn)) { WT_RET(__wt_verbose(session, WT_VERB_LSM, diff --git a/src/third_party/wiredtiger/src/support/power8/crc32.S b/src/third_party/wiredtiger/src/support/power8/crc32.S index c0b81143f07..f990acb7b12 100644 --- a/src/third_party/wiredtiger/src/support/power8/crc32.S +++ b/src/third_party/wiredtiger/src/support/power8/crc32.S @@ -769,3 +769,10 @@ FUNC_START(__crc32_vpmsum) FUNC_END(__crc32_vpmsum) #endif + +/* + * Make sure the stack isn't executable with GCC (regardless of platform). + */ +#ifndef __clang__ +.section .note.GNU-stack,"",@progbits +#endif diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 7a768a8fe20..ab1cd622057 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -108,17 +108,17 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session) * __wt_txn_get_snapshot -- * Allocate a snapshot. 
*/ -void +int __wt_txn_get_snapshot(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; + WT_DECL_RET; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s, *txn_state; uint64_t current_id, id; uint64_t prev_oldest_id, snap_min; uint32_t i, n, session_cnt; - int32_t count; conn = S2C(session); txn = &session->txn; @@ -126,15 +126,13 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) txn_state = WT_SESSION_TXN_STATE(session); /* - * We're going to scan. Increment the count of scanners to prevent the - * oldest ID from moving forwards. Spin if the count is negative, - * which indicates that some thread is moving the oldest ID forwards. + * Spin waiting for the lock: the sleeps in our blocking readlock + * implementation are too slow for scanning the transaction table. */ - do { - if ((count = txn_global->scan_count) < 0) - WT_PAUSE(); - } while (count < 0 || - !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1)); + while ((ret = + __wt_try_readlock(session, txn_global->scan_rwlock)) == EBUSY) + WT_PAUSE(); + WT_RET(ret); current_id = snap_min = txn_global->current; prev_oldest_id = txn_global->oldest_id; @@ -145,11 +143,9 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) __txn_sort_snapshot(session, 0, current_id); /* Check that the oldest ID has not moved in the meantime. */ - if (prev_oldest_id == txn_global->oldest_id) { - WT_ASSERT(session, txn_global->scan_count > 0); - (void)__wt_atomic_subiv32(&txn_global->scan_count, 1); - return; - } + WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); + WT_RET(__wt_readunlock(session, txn_global->scan_rwlock)); + return (0); } /* Walk the array of concurrent transactions. 
*/ @@ -182,67 +178,35 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); txn_state->snap_min = snap_min; - WT_ASSERT(session, txn_global->scan_count > 0); - (void)__wt_atomic_subiv32(&txn_global->scan_count, 1); + WT_RET(__wt_readunlock(session, txn_global->scan_rwlock)); __txn_sort_snapshot(session, n, current_id); + return (0); } /* - * __wt_txn_update_oldest -- - * Sweep the running transactions to update the oldest ID required. - * !!! - * If a data-source is calling the WT_EXTENSION_API.transaction_oldest - * method (for the oldest transaction ID not yet visible to a running - * transaction), and then comparing that oldest ID against committed - * transactions to see if updates for a committed transaction are still - * visible to running transactions, the oldest transaction ID may be - * the same as the last committed transaction ID, if the transaction - * state wasn't refreshed after the last transaction committed. Push - * past the last committed transaction. -*/ -void -__wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force) + * __txn_oldest_scan -- + * Sweep the running transactions to calculate the oldest ID required. + */ +static void +__txn_oldest_scan(WT_SESSION_IMPL *session, + uint64_t *oldest_idp, uint64_t *last_runningp, + WT_SESSION_IMPL **oldest_sessionp) { WT_CONNECTION_IMPL *conn; WT_SESSION_IMPL *oldest_session; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s; - uint64_t current_id, id, last_running, oldest_id, prev_oldest_id; + uint64_t id, last_running, oldest_id, prev_oldest_id; uint32_t i, session_cnt; - int32_t count; - bool last_running_moved; conn = S2C(session); txn_global = &conn->txn_global; - -retry: - current_id = last_running = txn_global->current; oldest_session = NULL; - prev_oldest_id = txn_global->oldest_id; - /* - * For pure read-only workloads, or if the update isn't forced and the - * oldest ID isn't too far behind, avoid scanning. 
- */ - if (prev_oldest_id == current_id || - (!force && WT_TXNID_LT(current_id, prev_oldest_id + 100))) - return; - - /* - * We're going to scan. Increment the count of scanners to prevent the - * oldest ID from moving forwards. Spin if the count is negative, - * which indicates that some thread is moving the oldest ID forwards. - */ - do { - if ((count = txn_global->scan_count) < 0) - WT_PAUSE(); - } while (count < 0 || - !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1)); - - /* The oldest ID cannot change until the scan count goes to zero. */ + /* The oldest ID cannot change while we are holding the scan lock. */ prev_oldest_id = txn_global->oldest_id; - current_id = oldest_id = last_running = txn_global->current; + oldest_id = last_running = txn_global->current; /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); @@ -264,7 +228,7 @@ retry: * !!! * Note: Don't ignore snap_min values older than the previous * oldest ID. Read-uncommitted operations publish snap_min - * values without incrementing scan_count to protect the global + * values without acquiring the scan lock to protect the global * table. See the comment in __wt_txn_cursor_op for * more details. */ @@ -283,76 +247,118 @@ retry: WT_TXNID_LT(id, oldest_id)) oldest_id = id; - /* Update the last running ID. */ - last_running_moved = - WT_TXNID_LT(txn_global->last_running, last_running); + *oldest_idp = oldest_id; + *oldest_sessionp = oldest_session; + *last_runningp = last_running; +} - /* Update the oldest ID. */ - if (WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) { - /* - * We know we want to update. Check if we're racing. 
- */ - if (__wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) { - WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = 0, s = txn_global->states; - i < session_cnt; i++, s++) { - if ((id = s->id) != WT_TXN_NONE && - WT_TXNID_LT(id, last_running)) - last_running = id; - if ((id = s->snap_min) != WT_TXN_NONE && - WT_TXNID_LT(id, oldest_id)) - oldest_id = id; - } - - if (WT_TXNID_LT(last_running, oldest_id)) - oldest_id = last_running; +/* + * __wt_txn_update_oldest -- + * Sweep the running transactions to update the oldest ID required. + */ +int +__wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION_IMPL *oldest_session; + WT_TXN_GLOBAL *txn_global; + uint64_t current_id, last_running, oldest_id; + uint64_t prev_last_running, prev_oldest_id; + + conn = S2C(session); + txn_global = &conn->txn_global; + + current_id = last_running = txn_global->current; + prev_last_running = txn_global->last_running; + prev_oldest_id = txn_global->oldest_id; + + /* + * For pure read-only workloads, or if the update isn't forced and the + * oldest ID isn't too far behind, avoid scanning. + */ + if (prev_oldest_id == current_id || + (!force && WT_TXNID_LT(current_id, prev_oldest_id + 100))) + return (0); + + /* First do a read-only scan. */ + if (force) + WT_RET(__wt_readlock(session, txn_global->scan_rwlock)); + else if ((ret = + __wt_try_readlock(session, txn_global->scan_rwlock)) != 0) + return (ret == EBUSY ? 0 : ret); + __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session); + WT_RET(__wt_readunlock(session, txn_global->scan_rwlock)); + + /* + * If the state hasn't changed (or hasn't moved far enough for + * non-forced updates), give up. 
+ */ + if ((oldest_id == prev_oldest_id || + (!force && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) && + ((last_running == prev_last_running) || + (!force && WT_TXNID_LT(last_running, prev_last_running + 100)))) + return (0); + + /* It looks like an update is necessary, wait for exclusive access. */ + if (force) + WT_RET(__wt_writelock(session, txn_global->scan_rwlock)); + else if ((ret = + __wt_try_writelock(session, txn_global->scan_rwlock)) != 0) + return (ret == EBUSY ? 0 : ret); + + /* + * If the oldest ID has been updated while we waited, don't bother + * scanning. + */ + if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) && + WT_TXNID_LE(last_running, txn_global->last_running)) + goto done; + + /* + * Re-scan now that we have exclusive access. This is necessary because + * threads get transaction snapshots with read locks, and we have to be + * sure that there isn't a thread that has got a snapshot locally but + * not yet published its snap_min. + */ + __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session); #ifdef HAVE_DIAGNOSTIC - /* - * Make sure the ID doesn't move past any named - * snapshots. - * - * Don't include the read/assignment in the assert - * statement. Coverity complains if there are - * assignments only done in diagnostic builds, and - * when the read is from a volatile. - */ - id = txn_global->nsnap_oldest_id; - WT_ASSERT(session, - id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); + { + /* + * Make sure the ID doesn't move past any named snapshots. + * + * Don't include the read/assignment in the assert statement. Coverity + * complains if there are assignments only done in diagnostic builds, + * and when the read is from a volatile. 
+ */ + uint64_t id = txn_global->nsnap_oldest_id; + WT_ASSERT(session, + id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); + } #endif - if (WT_TXNID_LT(txn_global->last_running, last_running)) - txn_global->last_running = last_running; - if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) - txn_global->oldest_id = oldest_id; - WT_ASSERT(session, txn_global->scan_count == -1); - txn_global->scan_count = 0; - } else { - /* - * We wanted to update the oldest ID but we're racing - * another thread. Retry if this is a forced update. - */ - WT_ASSERT(session, txn_global->scan_count > 0); - (void)__wt_atomic_subiv32(&txn_global->scan_count, 1); - if (force) { - __wt_yield(); - goto retry; - } - } - } else { + /* Update the oldest ID. */ + if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) + txn_global->oldest_id = oldest_id; + if (WT_TXNID_LT(txn_global->last_running, last_running)) { + txn_global->last_running = last_running; + + /* Output a verbose message about long-running transactions, + * but only when some progress is being made. 
*/ if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && current_id - oldest_id > 10000 && oldest_session != NULL) { - (void)__wt_verbose(session, WT_VERB_TRANSACTION, + WT_TRET(__wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 " pinned in session %" PRIu32 " [%s]" " with snap_min %" PRIu64 "\n", oldest_id, oldest_session->id, oldest_session->lastop, - oldest_session->txn.snap_min); + oldest_session->txn.snap_min)); } - WT_ASSERT(session, txn_global->scan_count > 0); - (void)__wt_atomic_subiv32(&txn_global->scan_count, 1); } + +done: WT_TRET(__wt_writeunlock(session, txn_global->scan_rwlock)); + return (ret); } /* @@ -736,6 +742,8 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_spin_init(session, &txn_global->id_lock, "transaction id lock")); WT_RET(__wt_rwlock_alloc(session, + &txn_global->scan_rwlock, "transaction scan lock")); + WT_RET(__wt_rwlock_alloc(session, &txn_global->nsnap_rwlock, "named snapshot lock")); txn_global->nsnap_oldest_id = WT_TXN_NONE; TAILQ_INIT(&txn_global->nsnaph); @@ -768,6 +776,7 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session) return (0); __wt_spin_destroy(session, &txn_global->id_lock); + WT_TRET(__wt_rwlock_destroy(session, &txn_global->scan_rwlock)); WT_TRET(__wt_rwlock_destroy(session, &txn_global->nsnap_rwlock)); __wt_free(session, txn_global->states); diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 27e18b254b8..c1b435d9897 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -404,7 +404,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * This is particularly important for compact, so that all dirty pages * can be fully written. */ - __wt_txn_update_oldest(session, true); + WT_ERR(__wt_txn_update_oldest(session, true)); /* Flush data-sources before we start the checkpoint. 
*/ WT_ERR(__checkpoint_data_source(session, cfg)); @@ -792,6 +792,9 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session, hot_backup_locked = false; name_alloc = NULL; + /* Only referenced in diagnostic builds. */ + WT_UNUSED(is_checkpoint); + /* * Only referenced in diagnostic builds and gcc 5.1 isn't satisfied * with wrapping the entire assert condition in the unused macro. @@ -1281,7 +1284,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) * for active readers. */ if (!btree->modified && !bulk) { - __wt_txn_update_oldest(session, true); + WT_RET(__wt_txn_update_oldest(session, true)); return (__wt_txn_visible_all(session, btree->rec_max_txn) ? __wt_cache_op(session, WT_SYNC_DISCARD) : EBUSY); } |