summary | refs | log | tree | commit | diff
path: root/src/btree
diff options
context:
space:
mode:
Diffstat (limited to 'src/btree')
-rw-r--r-- src/btree/bt_compact.c 13
-rw-r--r-- src/btree/bt_cursor.c 2
-rw-r--r-- src/btree/bt_debug.c 22
-rw-r--r-- src/btree/bt_discard.c 50
-rw-r--r-- src/btree/bt_misc.c 1
-rw-r--r-- src/btree/bt_read.c 2
-rw-r--r-- src/btree/bt_slvg.c 4
-rw-r--r-- src/btree/bt_split.c 4
-rw-r--r-- src/btree/bt_sync.c 6
-rw-r--r-- src/btree/bt_walk.c 98
10 files changed, 108 insertions, 94 deletions
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index e005674762d..2edcac76d0b 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -90,16 +90,15 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
* Compact a file.
*/
int
-__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
+__wt_compact(WT_SESSION_IMPL *session)
{
WT_BM *bm;
WT_BTREE *btree;
WT_DECL_RET;
WT_REF *ref;
+ u_int i;
bool skip;
- WT_UNUSED(cfg);
-
btree = S2BT(session);
bm = btree->bm;
ref = NULL;
@@ -129,7 +128,13 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
__wt_spin_lock(session, &btree->flush_lock);
/* Walk the tree reviewing pages to see if they should be re-written. */
- for (;;) {
+ for (i = 0;;) {
+ /* Periodically check if we've run out of time. */
+ if (++i > 100) {
+ WT_ERR(__wt_session_compact_check_timeout(session));
+ i = 0;
+ }
+
/*
* Pages read for compaction aren't "useful"; don't update the
* read generation of pages already in memory, and if a page is
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index a996b21f7ce..650289f2cd8 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -41,7 +41,7 @@ __cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv)
if (kv->size > WT_BTREE_MAX_OBJECT_SIZE)
WT_RET_MSG(session, EINVAL,
"item size of %" WT_SIZET_FMT " exceeds the maximum "
- "supported WiredTiger size of %d",
+ "supported WiredTiger size of %" PRIu32,
kv->size, WT_BTREE_MAX_OBJECT_SIZE);
/* Check what the block manager can actually write. */
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index bcc7d27a569..3352b797fa9 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -77,7 +77,7 @@ static inline int
__debug_hex_byte(WT_DBG *ds, uint8_t v)
{
return (ds->f(
- ds, "#%c%c", __wt_hex[(v & 0xf0) >> 4], __wt_hex[v & 0x0f]));
+ ds, "#%c%c", __wt_hex((v & 0xf0) >> 4), __wt_hex(v & 0x0f)));
}
/*
@@ -1003,37 +1003,37 @@ __debug_ref(WT_DBG *ds, WT_REF *ref)
WT_SESSION_IMPL *session;
size_t addr_size;
const uint8_t *addr;
+ const char *state;
session = ds->session;
- WT_RET(ds->f(ds, "\t"));
switch (ref->state) {
case WT_REF_DISK:
- WT_RET(ds->f(ds, "disk"));
+ state = "disk";
break;
case WT_REF_DELETED:
- WT_RET(ds->f(ds, "deleted"));
+ state = "deleted";
break;
case WT_REF_LOCKED:
- WT_RET(ds->f(ds, "locked %p", (void *)ref->page));
+ state = "locked";
break;
case WT_REF_MEM:
- WT_RET(ds->f(ds, "memory %p", (void *)ref->page));
+ state = "memory";
break;
case WT_REF_READING:
- WT_RET(ds->f(ds, "reading"));
+ state = "reading";
break;
case WT_REF_SPLIT:
- WT_RET(ds->f(ds, "split"));
+ state = "split";
break;
default:
- WT_RET(ds->f(ds, "INVALID"));
+ state = "INVALID";
break;
}
__wt_ref_info(ref, &addr, &addr_size, NULL);
- return (ds->f(ds, " %s\n",
- __wt_addr_string(session, addr, addr_size, ds->tmp)));
+ return (ds->f(ds, "\t" "%p %s %s\n", (void *)ref,
+ state, __wt_addr_string(session, addr, addr_size, ds->tmp)));
}
/*
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index 162bc07a1c2..7858d2cb14e 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -27,9 +27,35 @@ __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref)
/*
* A version of the page-out function that allows us to make additional
* diagnostic checks.
+ *
+ * The WT_REF cannot be the eviction thread's location.
*/
WT_ASSERT(session, S2BT(session)->evict_ref != ref);
+#ifdef HAVE_DIAGNOSTIC
+ {
+ WT_HAZARD *hp;
+ int i;
+ /*
+ * Make sure no other thread has a hazard pointer on the page we are
+ * about to discard. This is complicated by the fact that readers
+ * publish their hazard pointer before re-checking the page state, so
+ * our check can race with readers without indicating a real problem.
+ * Wait for up to a second for hazard pointers to be cleared.
+ */
+ for (hp = NULL, i = 0; i < 100; i++) {
+ if ((hp = __wt_page_hazard_check(session, ref)) == NULL)
+ break;
+ __wt_sleep(0, 10000);
+ }
+ if (hp != NULL)
+ __wt_errx(session,
+ "discarded page has hazard pointer: (%p: %s, line %d)",
+ (void *)hp->ref, hp->file, hp->line);
+ WT_ASSERT(session, hp == NULL);
+ }
+#endif
+
__wt_page_out(session, &ref->page);
}
@@ -63,30 +89,6 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
WT_ASSERT(session, !__wt_rwlock_islocked(session, &page->page_lock));
-#ifdef HAVE_DIAGNOSTIC
- {
- WT_HAZARD *hp;
- int i;
- /*
- * Make sure no other thread has a hazard pointer on the page we are
- * about to discard. This is complicated by the fact that readers
- * publish their hazard pointer before re-checking the page state, so
- * our check can race with readers without indicating a real problem.
- * Wait for up to a second for hazard pointers to be cleared.
- */
- for (hp = NULL, i = 0; i < 100; i++) {
- if ((hp = __wt_page_hazard_check(session, page)) == NULL)
- break;
- __wt_sleep(0, 10000);
- }
- if (hp != NULL)
- __wt_errx(session,
- "discarded page has hazard pointer: (%p: %s, line %d)",
- (void *)hp->page, hp->file, hp->line);
- WT_ASSERT(session, hp == NULL);
- }
-#endif
-
/*
* If a root page split, there may be one or more pages linked from the
* page; walk the list, discarding pages.
diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c
index b6e2cc07f5a..3bec65c2567 100644
--- a/src/btree/bt_misc.c
+++ b/src/btree/bt_misc.c
@@ -14,6 +14,7 @@
*/
const char *
__wt_page_type_string(u_int type)
+ WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
switch (type) {
case WT_PAGE_INVALID:
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index 90188498535..39f9e1159cb 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -324,7 +324,7 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref)
* If this session has more than one hazard pointer, eviction will fail
* and there is no point trying.
*/
- if (__wt_hazard_count(session, page) > 1)
+ if (__wt_hazard_count(session, ref) > 1)
return (false);
/* If we can do an in-memory split, do it. */
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index f269c2d7f43..fde4d4fb9de 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -1300,7 +1300,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
- WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR));
+ WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL));
/* Reset the page. */
page->pg_var_d = save_col_var;
@@ -2011,7 +2011,7 @@ __slvg_row_build_leaf(
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
- WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR));
+ WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL));
/* Reset the page. */
page->pg_row_entries += skip_stop;
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 017c820ea29..fe49f937719 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -513,7 +513,7 @@ __split_ref_step2(
__split_verify_intl_key_order(session, child));
#endif
- WT_ERR(__wt_hazard_clear(session, child));
+ WT_ERR(__wt_hazard_clear(session, ref));
}
return (0);
@@ -1331,7 +1331,7 @@ __split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
WT_DECL_RET;
if (hazard)
- ret = __wt_hazard_clear(session, parent);
+ ret = __wt_hazard_clear(session, parent->pg_intl_parent_ref);
__wt_writeunlock(session, &parent->page_lock);
return (ret);
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 129d7fec05f..7bf15baa67f 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -136,8 +136,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
__wt_txn_get_snapshot(session);
leaf_bytes += page->memory_footprint;
++leaf_pages;
- WT_ERR(__wt_reconcile(
- session, walk, NULL, WT_CHECKPOINTING));
+ WT_ERR(__wt_reconcile(session,
+ walk, NULL, WT_CHECKPOINTING, NULL));
}
}
break;
@@ -233,7 +233,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
++leaf_pages;
}
WT_ERR(__wt_reconcile(
- session, walk, NULL, WT_CHECKPOINTING));
+ session, walk, NULL, WT_CHECKPOINTING, NULL));
}
break;
case WT_SYNC_CLOSE:
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index fb0d2296823..049700952ee 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -17,54 +17,60 @@ __ref_index_slot(WT_SESSION_IMPL *session,
WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
{
WT_PAGE_INDEX *pindex;
- uint32_t i;
+ WT_REF **start, **stop, **p, **t;
+ uint32_t entries, slot;
- /*
- * Copy the parent page's index value: the page can split at any time,
- * but the index's value is always valid, even if it's not up-to-date.
- */
-retry: WT_INTL_INDEX_GET(session, ref->home, pindex);
+ for (;;) {
+ /*
+ * Copy the parent page's index value: the page can split at
+ * any time, but the index's value is always valid, even if
+ * it's not up-to-date.
+ */
+ WT_INTL_INDEX_GET(session, ref->home, pindex);
+ entries = pindex->entries;
- /*
- * Use the page's reference hint: it should be correct unless the page
- * split before our slot. If the page splits after our slot, the hint
- * will point earlier in the array than our actual slot, so the first
- * loop is from the hint to the end of the list, and the second loop
- * is from the start of the list to the end of the list. (The second
- * loop overlaps the first, but that only happen in cases where we've
- * split the tree and aren't going to find our slot at all, that's not
- * worth optimizing.)
- *
- * It's not an error for the reference hint to be wrong, it just means
- * the first retrieval (which sets the hint for subsequent retrievals),
- * is slower.
- */
- i = ref->pindex_hint;
- if (i < pindex->entries && pindex->index[i] == ref) {
- *pindexp = pindex;
- *slotp = i;
- return;
- }
- while (++i < pindex->entries)
- if (pindex->index[i] == ref) {
- *pindexp = pindex;
- *slotp = ref->pindex_hint = i;
- return;
- }
- for (i = 0; i < pindex->entries; ++i)
- if (pindex->index[i] == ref) {
- *pindexp = pindex;
- *slotp = ref->pindex_hint = i;
- return;
+ /*
+ * Use the page's reference hint: it should be correct unless
+ * there was a split or delete in the parent before our slot.
+ * If the hint is wrong, it can be either too big or too small,
+ * but often only by a small amount. Search up and down the
+ * index starting from the hint.
+ *
+ * It's not an error for the reference hint to be wrong, it
+ * just means the first retrieval (which sets the hint for
+ * subsequent retrievals), is slower.
+ */
+ slot = ref->pindex_hint;
+ if (slot >= entries)
+ slot = entries - 1;
+ if (pindex->index[slot] == ref)
+ goto found;
+ for (start = &pindex->index[0],
+ stop = &pindex->index[entries - 1],
+ p = t = &pindex->index[slot];
+ p > start || t < stop;) {
+ if (p > start && *--p == ref) {
+ slot = (uint32_t)(p - start);
+ goto found;
+ }
+ if (t < stop && *++t == ref) {
+ slot = (uint32_t)(t - start);
+ goto found;
+ }
}
- /*
- * If we don't find our reference, the page split and our home pointer
- * references the wrong page. When internal pages split, their WT_REF
- * structure home values are updated; yield and wait for that to happen.
- */
- __wt_yield();
- goto retry;
+ /*
+ * If we don't find our reference, the page split and our home
+ * pointer references the wrong page. When internal pages
+ * split, their WT_REF structure home values are updated; yield
+ * and wait for that to happen.
+ */
+ __wt_yield();
+ }
+
+found: WT_ASSERT(session, pindex->index[slot] == ref);
+ *pindexp = pindex;
+ *slotp = slot;
}
/*
@@ -431,8 +437,8 @@ restart: /*
/*
* Move to the next slot, and set the reference hint if
* it's wrong (used when we continue the walk). We don't
- * update those hints when splitting, so it's common for
- * them to be incorrect in some workloads.
+ * always update the hints when splitting, it's expected
+ * for them to be incorrect in some workloads.
*/
ref = pindex->index[slot];
if (ref->pindex_hint != slot)