summary | refs | log | tree | commit | diff
path: root/src/third_party/wiredtiger
diff options
context:
space:
mode:
author: Alex Gorrod <alexander.gorrod@mongodb.com> 2017-12-07 17:01:19 +1100
committer: Alex Gorrod <alexander.gorrod@mongodb.com> 2017-12-07 17:01:19 +1100
commit: a36bc9925fdb5e9b3e146870b081074708e7aba5 (patch)
tree: cd5cac285e93a83f05ded2e31c969d3039312117 /src/third_party/wiredtiger
parent: 682b4585b8510f5235382257edd55f2125c52a20 (diff)
download: mongo-a36bc9925fdb5e9b3e146870b081074708e7aba5.tar.gz
Import wiredtiger: 596a3c7c0169cbda0475bfbd4b177fdbf3258058 from branch mongodb-3.8
ref: 6dcff54e40..596a3c7c01
for: 3.7.1

WT-3079 Make sure eviction visits all trees
WT-3776 Cursor remove operation unpins page too early
WT-3786 Transactions with timestamps should read their writes
Diffstat (limited to 'src/third_party/wiredtiger')
-rw-r--r-- src/third_party/wiredtiger/import.data | 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_cursor.c | 38
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_dhandle.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_lru.c | 123
-rw-r--r-- src/third_party/wiredtiger/src/include/cache.h | 3
-rw-r--r-- src/third_party/wiredtiger/src/include/txn.i | 4
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_timestamp02.py | 18
7 files changed, 131 insertions, 59 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 74b1e8caef3..7375a572f39 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "6dcff54e40ce18729d14a9e96b1cbcb4fbc331fe",
+ "commit": "596a3c7c0169cbda0475bfbd4b177fdbf3258058",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-3.8"
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index ee800ca80ee..e5d5e64194f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -52,15 +52,18 @@ __cursor_state_restore(WT_CURSOR *cursor, WT_CURFILE_STATE *state)
/*
* __cursor_page_pinned --
- * Return if we have a page pinned and it's not been flagged for forced
- * eviction (the forced eviction test is so we periodically release pages
- * grown too large).
+ * Return if we have a page pinned.
*/
static inline bool
-__cursor_page_pinned(WT_CURSOR_BTREE *cbt)
+__cursor_page_pinned(WT_CURSOR_BTREE *cbt, bool eviction_ok)
{
+ /*
+ * Optionally fail the page-pinned test when the page is flagged for
+ * forced eviction (so we periodically release pages grown too large).
+ * The test is optional as not all callers can release pinned pages.
+ */
return (F_ISSET(cbt, WT_CBT_ACTIVE) &&
- cbt->ref->page->read_gen != WT_READGEN_OLDEST);
+ (!eviction_ok || cbt->ref->page->read_gen != WT_READGEN_OLDEST));
}
/*
@@ -465,7 +468,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
* from the root.
*/
valid = false;
- if (__cursor_page_pinned(cbt)) {
+ if (__cursor_page_pinned(cbt, true)) {
__wt_txn_cursor_op(session);
WT_ERR(btree->type == BTREE_ROW ?
@@ -562,7 +565,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
* existing record.
*/
valid = false;
- if (btree->type == BTREE_ROW && __cursor_page_pinned(cbt)) {
+ if (btree->type == BTREE_ROW && __cursor_page_pinned(cbt, true)) {
__wt_txn_cursor_op(session);
WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true));
@@ -693,7 +696,7 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt)
* configured for append aren't included, regardless of whether or not
* they meet all other criteria.
*/
- if (__cursor_page_pinned(cbt) &&
+ if (__cursor_page_pinned(cbt, true) &&
F_ISSET_ALL(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_OVERWRITE) &&
!append_key) {
WT_ERR(__wt_txn_autocommit_check(session));
@@ -912,8 +915,22 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt)
* removed, and the record must exist with a positioned cursor. The
* cursor won't be positioned on a page with an external key set, but
* be sure.
+ *
+ * There's trickiness in the page-pinned check. By definition a remove
+ * operation leaves a cursor positioned if it's initially positioned.
+ * However, if every item on the page is deleted and we unpin the page,
+ * eviction might delete the page and our search will re-instantiate an
+ * empty page for us. Cursor remove returns not-found whether or not
+ * that eviction/deletion happens and it's OK unless cursor-overwrite
+ * is configured (which means we return success even if there's no item
+ * to delete). In that case, we'll fail when we try to point the cursor
+ * at the key on the page to satisfy the positioned requirement. It's
+ * arguably safe to simply leave the key initialized in the cursor (as
+ * that's all a positioned cursor implies), but it's probably safer to
+ * avoid page eviction entirely in the positioned case.
*/
- if (__cursor_page_pinned(cbt) && F_ISSET(cursor, WT_CURSTD_KEY_INT)) {
+ if (__cursor_page_pinned(cbt, !positioned) &&
+ F_ISSET(cursor, WT_CURSTD_KEY_INT)) {
WT_ERR(__wt_txn_autocommit_check(session));
/*
@@ -1055,7 +1072,8 @@ __btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
* cursor won't be positioned on a page with an external key set, but
* be sure.
*/
- if (__cursor_page_pinned(cbt) && F_ISSET(cursor, WT_CURSTD_KEY_INT)) {
+ if (__cursor_page_pinned(cbt, true) &&
+ F_ISSET(cursor, WT_CURSTD_KEY_INT)) {
WT_ERR(__wt_txn_autocommit_check(session));
/*
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index 7f55b1cc4fd..3cf075e52f8 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -689,7 +689,7 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, bool final)
WT_ASSERT(session,
F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
- WT_ASSERT(session, dhandle != conn->cache->evict_file_next);
+ WT_ASSERT(session, dhandle != conn->cache->walk_tree);
/* Check if the handle was reacquired by a session while we waited. */
if (!final &&
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index fe389b65e4d..343d29d47cf 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -17,7 +17,7 @@ static int __evict_pass(WT_SESSION_IMPL *);
static int __evict_server(WT_SESSION_IMPL *, bool *);
static void __evict_tune_workers(WT_SESSION_IMPL *session);
static int __evict_walk(WT_SESSION_IMPL *, WT_EVICT_QUEUE *);
-static int __evict_walk_file(
+static int __evict_walk_tree(
WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *);
#define WT_EVICT_HAS_WORKERS(s) \
@@ -805,8 +805,10 @@ __evict_clear_walk(WT_SESSION_IMPL *session)
cache = S2C(session)->cache;
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_PASS));
- if (session->dhandle == cache->evict_file_next)
- cache->evict_file_next = NULL;
+ if (session->dhandle == cache->walk_tree) {
+ cache->walk_tree = NULL;
+ cache->walk_target = 0;
+ }
if ((ref = btree->evict_ref) == NULL)
return (0);
@@ -1391,19 +1393,22 @@ retry: while (slot < max_entries) {
* scan last time through. If we don't have a saved
* handle, start from the beginning of the list.
*/
- if ((dhandle = cache->evict_file_next) != NULL)
- cache->evict_file_next = NULL;
- else
+ if ((dhandle = cache->walk_tree) != NULL)
+ cache->walk_tree = NULL;
+ else {
dhandle = TAILQ_FIRST(&conn->dhqh);
+ cache->walk_target = 0;
+ }
} else {
if (incr) {
WT_ASSERT(session, dhandle->session_inuse > 0);
(void)__wt_atomic_subi32(
&dhandle->session_inuse, 1);
incr = false;
- cache->evict_file_next = NULL;
+ cache->walk_tree = NULL;
}
dhandle = TAILQ_NEXT(dhandle, q);
+ cache->walk_target = 0;
}
/* If we reach the end of the list, we're done. */
@@ -1482,9 +1487,9 @@ retry: while (slot < max_entries) {
/*
* Remember the file to visit first, next loop.
*/
- cache->evict_file_next = dhandle;
+ cache->walk_tree = dhandle;
WT_WITH_DHANDLE(session, dhandle,
- ret = __evict_walk_file(
+ ret = __evict_walk_tree(
session, queue, max_entries, &slot));
WT_ASSERT(session, __wt_session_gen(
@@ -1572,42 +1577,21 @@ __evict_push_candidate(WT_SESSION_IMPL *session,
}
/*
- * __evict_walk_file --
- * Get a few page eviction candidates from a single underlying file.
+ * __evict_walk_target --
+ * Calculate how many pages to queue for a given tree.
*/
-static int
-__evict_walk_file(WT_SESSION_IMPL *session,
- WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp)
+static uint32_t
+__evict_walk_target(
+ WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, u_int max_entries)
{
- WT_BTREE *btree;
WT_CACHE *cache;
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_EVICT_ENTRY *end, *evict, *start;
- WT_PAGE *last_parent, *page;
- WT_REF *ref;
- uint64_t btree_inuse, bytes_per_slot, cache_inuse, min_pages;
- uint64_t pages_seen, pages_queued, refs_walked;
- uint32_t remaining_slots, total_slots, walk_flags;
+ uint64_t btree_inuse, bytes_per_slot, cache_inuse;
uint32_t target_pages_clean, target_pages_dirty, target_pages;
- int restarts;
- bool give_up, modified, urgent_queued;
-
- conn = S2C(session);
- btree = S2BT(session);
- cache = conn->cache;
- last_parent = NULL;
- restarts = 0;
- give_up = urgent_queued = false;
+ uint32_t total_slots;
- /*
- * Figure out how many slots to fill from this tree.
- * Note that some care is taken in the calculation to avoid overflow.
- */
- start = queue->evict_queue + *slotp;
- remaining_slots = max_entries - *slotp;
- total_slots = max_entries - queue->evict_entries;
+ cache = S2C(session)->cache;
target_pages_clean = target_pages_dirty = 0;
+ total_slots = max_entries - queue->evict_entries;
/*
* The number of times we should fill the queue by the end of
@@ -1653,13 +1637,6 @@ __evict_walk_file(WT_SESSION_IMPL *session,
QUEUE_FILLS_PER_PASS;
/*
- * If the tree is dead or we're near the end of the queue, fill the
- * remaining slots.
- */
- if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
- target_pages = remaining_slots;
-
- /*
* Walk trees with a small fraction of the cache in case there are so
* many trees that none of them use enough of the cache to be allocated
* slots. Only skip a tree if it has no bytes of interest.
@@ -1680,9 +1657,62 @@ __evict_walk_file(WT_SESSION_IMPL *session,
if (target_pages < MIN_PAGES_PER_TREE)
target_pages = MIN_PAGES_PER_TREE;
+ /* If the tree is dead, take a lot of pages. */
+ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
+ target_pages *= 10;
+
+ return (target_pages);
+}
+
+/*
+ * __evict_walk_tree --
+ * Get a few page eviction candidates from a single underlying file.
+ */
+static int
+__evict_walk_tree(WT_SESSION_IMPL *session,
+ WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_EVICT_ENTRY *end, *evict, *start;
+ WT_PAGE *last_parent, *page;
+ WT_REF *ref;
+ uint64_t min_pages, pages_seen, pages_queued, refs_walked;
+ uint32_t remaining_slots, target_pages, walk_flags;
+ int restarts;
+ bool give_up, modified, urgent_queued;
+
+ conn = S2C(session);
+ btree = S2BT(session);
+ cache = conn->cache;
+ last_parent = NULL;
+ restarts = 0;
+ give_up = urgent_queued = false;
+
+ /*
+ * Figure out how many slots to fill from this tree.
+ * Note that some care is taken in the calculation to avoid overflow.
+ */
+ start = queue->evict_queue + *slotp;
+ remaining_slots = max_entries - *slotp;
+ if (cache->walk_target != 0) {
+ WT_ASSERT(session, cache->walk_progress <= cache->walk_target);
+ target_pages = cache->walk_target - cache->walk_progress;
+ } else {
+ target_pages = cache->walk_target =
+ __evict_walk_target(session, queue, max_entries);
+ cache->walk_progress = 0;
+ }
+
if (target_pages > remaining_slots)
target_pages = remaining_slots;
+ /* If we don't want any pages from this tree, move on. */
+ if (target_pages == 0)
+ return (0);
+
/*
* These statistics generate a histogram of the number of pages targeted
* for eviction each round. The range of values here start at
@@ -1967,6 +1997,7 @@ fast: /* If the page can't be evicted, give up. */
continue;
++evict;
++pages_queued;
+ ++cache->walk_progress;
__wt_verbose(session, WT_VERB_EVICTSERVER,
"select: %p, size %" WT_SIZET_FMT,
diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h
index a8873bff394..a2c0e95293c 100644
--- a/src/third_party/wiredtiger/src/include/cache.h
+++ b/src/third_party/wiredtiger/src/include/cache.h
@@ -142,7 +142,8 @@ struct __wt_cache {
*/
WT_SPINLOCK evict_pass_lock; /* Eviction pass lock */
WT_SESSION_IMPL *walk_session; /* Eviction pass session */
- WT_DATA_HANDLE *evict_file_next;/* LRU next file to search */
+ WT_DATA_HANDLE *walk_tree; /* LRU walk current tree */
+ uint32_t walk_progress, walk_target;/* Progress in current tree */
WT_SPINLOCK evict_queue_lock; /* Eviction current queue lock */
WT_EVICT_QUEUE evict_queues[WT_EVICT_QUEUE_MAX];
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 2ecbd5dc440..1683ce8fbe2 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -456,6 +456,10 @@ __wt_txn_visible(
if (!__txn_visible_id(session, id))
return (false);
+ /* Transactions read their writes, regardless of timestamps. */
+ if (F_ISSET(&session->txn, WT_TXN_HAS_ID) && id == session->txn.id)
+ return (true);
+
#ifdef HAVE_TIMESTAMPS
{
WT_TXN *txn = &session->txn;
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp02.py b/src/third_party/wiredtiger/test/suite/test_timestamp02.py
index f928dbc184f..60a6eef3a55 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp02.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp02.py
@@ -135,5 +135,23 @@ class test_timestamp02(wttest.WiredTigerTestCase, suite_subprocess):
self.check(self.session, 'read_timestamp=' + timestamp_str(t + 200),
dict((k, 2) for k in orig_keys[i+1:]))
+ def test_read_your_writes(self):
+ if not wiredtiger.timestamp_build():
+ self.skipTest('requires a timestamp build')
+
+ self.session.create(self.uri,
+ 'key_format=i,value_format=i' + self.extra_config)
+ c = self.session.open_cursor(self.uri)
+
+ k = 10
+ c[k] = 0
+
+ self.session.begin_transaction('read_timestamp=10')
+ self.session.timestamp_transaction('commit_timestamp=20')
+ c[k] = 1
+ # We should see the value we just inserted
+ self.assertEqual(c[k], 1)
+ self.session.commit_transaction()
+
if __name__ == '__main__':
wttest.run()