diff options
Diffstat (limited to 'src/btree')
-rw-r--r-- | src/btree/bt_compact.c | 13 | ||||
-rw-r--r-- | src/btree/bt_cursor.c | 2 | ||||
-rw-r--r-- | src/btree/bt_debug.c | 22 | ||||
-rw-r--r-- | src/btree/bt_discard.c | 50 | ||||
-rw-r--r-- | src/btree/bt_misc.c | 1 | ||||
-rw-r--r-- | src/btree/bt_read.c | 2 | ||||
-rw-r--r-- | src/btree/bt_slvg.c | 4 | ||||
-rw-r--r-- | src/btree/bt_split.c | 4 | ||||
-rw-r--r-- | src/btree/bt_sync.c | 6 | ||||
-rw-r--r-- | src/btree/bt_walk.c | 98 |
10 files changed, 108 insertions, 94 deletions
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index e005674762d..2edcac76d0b 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -90,16 +90,15 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) * Compact a file. */ int -__wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) +__wt_compact(WT_SESSION_IMPL *session) { WT_BM *bm; WT_BTREE *btree; WT_DECL_RET; WT_REF *ref; + u_int i; bool skip; - WT_UNUSED(cfg); - btree = S2BT(session); bm = btree->bm; ref = NULL; @@ -129,7 +128,13 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) __wt_spin_lock(session, &btree->flush_lock); /* Walk the tree reviewing pages to see if they should be re-written. */ - for (;;) { + for (i = 0;;) { + /* Periodically check if we've run out of time. */ + if (++i > 100) { + WT_ERR(__wt_session_compact_check_timeout(session)); + i = 0; + } + /* * Pages read for compaction aren't "useful"; don't update the * read generation of pages already in memory, and if a page is diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index a996b21f7ce..650289f2cd8 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -41,7 +41,7 @@ __cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv) if (kv->size > WT_BTREE_MAX_OBJECT_SIZE) WT_RET_MSG(session, EINVAL, "item size of %" WT_SIZET_FMT " exceeds the maximum " - "supported WiredTiger size of %d", + "supported WiredTiger size of %" PRIu32, kv->size, WT_BTREE_MAX_OBJECT_SIZE); /* Check what the block manager can actually write. */ diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index bcc7d27a569..3352b797fa9 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -77,7 +77,7 @@ static inline int __debug_hex_byte(WT_DBG *ds, uint8_t v) { return (ds->f( - ds, "#%c%c", __wt_hex[(v & 0xf0) >> 4], __wt_hex[v & 0x0f])); + ds, "#%c%c", __wt_hex((v & 0xf0) >> 4), __wt_hex(v & 0x0f))); } /* @@ -1003,37 +1003,37 @@ __debug_ref(WT_DBG *ds, WT_REF *ref) WT_SESSION_IMPL *session; size_t addr_size; const uint8_t *addr; + const char *state; session = ds->session; - WT_RET(ds->f(ds, "\t")); switch (ref->state) { case WT_REF_DISK: - WT_RET(ds->f(ds, "disk")); + state = "disk"; break; case WT_REF_DELETED: - WT_RET(ds->f(ds, "deleted")); + state = "deleted"; break; case WT_REF_LOCKED: - WT_RET(ds->f(ds, "locked %p", (void *)ref->page)); + state = "locked"; break; case WT_REF_MEM: - WT_RET(ds->f(ds, "memory %p", (void *)ref->page)); + state = "memory"; break; case WT_REF_READING: - WT_RET(ds->f(ds, "reading")); + state = "reading"; break; case WT_REF_SPLIT: - WT_RET(ds->f(ds, "split")); + state = "split"; break; default: - WT_RET(ds->f(ds, "INVALID")); + state = "INVALID"; break; } __wt_ref_info(ref, &addr, &addr_size, NULL); - return (ds->f(ds, " %s\n", - __wt_addr_string(session, addr, addr_size, ds->tmp))); + return (ds->f(ds, "\t" "%p %s %s\n", (void *)ref, + state, __wt_addr_string(session, addr, addr_size, ds->tmp))); } /* diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index 162bc07a1c2..7858d2cb14e 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -27,9 +27,35 @@ __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref) /* * A version of the page-out function that allows us to make additional * diagnostic checks. + * + * The WT_REF cannot be the eviction thread's location. */ WT_ASSERT(session, S2BT(session)->evict_ref != ref); +#ifdef HAVE_DIAGNOSTIC + { + WT_HAZARD *hp; + int i; + /* + * Make sure no other thread has a hazard pointer on the page we are + * about to discard. This is complicated by the fact that readers + * publish their hazard pointer before re-checking the page state, so + * our check can race with readers without indicating a real problem. + * Wait for up to a second for hazard pointers to be cleared. + */ + for (hp = NULL, i = 0; i < 100; i++) { + if ((hp = __wt_page_hazard_check(session, ref)) == NULL) + break; + __wt_sleep(0, 10000); + } + if (hp != NULL) + __wt_errx(session, + "discarded page has hazard pointer: (%p: %s, line %d)", + (void *)hp->ref, hp->file, hp->line); + WT_ASSERT(session, hp == NULL); + } +#endif + __wt_page_out(session, &ref->page); } @@ -63,30 +89,6 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); WT_ASSERT(session, !__wt_rwlock_islocked(session, &page->page_lock)); -#ifdef HAVE_DIAGNOSTIC - { - WT_HAZARD *hp; - int i; - /* - * Make sure no other thread has a hazard pointer on the page we are - * about to discard. This is complicated by the fact that readers - * publish their hazard pointer before re-checking the page state, so - * our check can race with readers without indicating a real problem. - * Wait for up to a second for hazard pointers to be cleared. - */ - for (hp = NULL, i = 0; i < 100; i++) { - if ((hp = __wt_page_hazard_check(session, page)) == NULL) - break; - __wt_sleep(0, 10000); - } - if (hp != NULL) - __wt_errx(session, - "discarded page has hazard pointer: (%p: %s, line %d)", - (void *)hp->page, hp->file, hp->line); - WT_ASSERT(session, hp == NULL); - } -#endif - /* * If a root page split, there may be one or more pages linked from the * page; walk the list, discarding pages. diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c index b6e2cc07f5a..3bec65c2567 100644 --- a/src/btree/bt_misc.c +++ b/src/btree/bt_misc.c @@ -14,6 +14,7 @@ */ const char * __wt_page_type_string(u_int type) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { switch (type) { case WT_PAGE_INVALID: diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index 90188498535..39f9e1159cb 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -324,7 +324,7 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) * If this session has more than one hazard pointer, eviction will fail * and there is no point trying. */ - if (__wt_hazard_count(session, page) > 1) + if (__wt_hazard_count(session, ref) > 1) return (false); /* If we can do an in-memory split, do it. */ diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index f269c2d7f43..fde4d4fb9de 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -1300,7 +1300,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR)); + WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL)); /* Reset the page. */ page->pg_var_d = save_col_var; @@ -2011,7 +2011,7 @@ __slvg_row_build_leaf( /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR)); + WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL)); /* Reset the page. */ page->pg_row_entries += skip_stop; diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 017c820ea29..fe49f937719 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -513,7 +513,7 @@ __split_ref_step2( __split_verify_intl_key_order(session, child)); #endif - WT_ERR(__wt_hazard_clear(session, child)); + WT_ERR(__wt_hazard_clear(session, ref)); } return (0); @@ -1331,7 +1331,7 @@ __split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard) WT_DECL_RET; if (hazard) - ret = __wt_hazard_clear(session, parent); + ret = __wt_hazard_clear(session, parent->pg_intl_parent_ref); __wt_writeunlock(session, &parent->page_lock); return (ret); diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 129d7fec05f..7bf15baa67f 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -136,8 +136,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; - WT_ERR(__wt_reconcile( - session, walk, NULL, WT_CHECKPOINTING)); + WT_ERR(__wt_reconcile(session, + walk, NULL, WT_CHECKPOINTING, NULL)); } } break; @@ -233,7 +233,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) ++leaf_pages; } WT_ERR(__wt_reconcile( - session, walk, NULL, WT_CHECKPOINTING)); + session, walk, NULL, WT_CHECKPOINTING, NULL)); } break; case WT_SYNC_CLOSE: diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index fb0d2296823..049700952ee 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -17,54 +17,60 @@ __ref_index_slot(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp) { WT_PAGE_INDEX *pindex; - uint32_t i; + WT_REF **start, **stop, **p, **t; + uint32_t entries, slot; - /* - * Copy the parent page's index value: the page can split at any time, - * but the index's value is always valid, even if it's not up-to-date. - */ -retry: WT_INTL_INDEX_GET(session, ref->home, pindex); + for (;;) { + /* + * Copy the parent page's index value: the page can split at + * any time, but the index's value is always valid, even if + * it's not up-to-date. + */ + WT_INTL_INDEX_GET(session, ref->home, pindex); + entries = pindex->entries; - /* - * Use the page's reference hint: it should be correct unless the page - * split before our slot. If the page splits after our slot, the hint - * will point earlier in the array than our actual slot, so the first - * loop is from the hint to the end of the list, and the second loop - * is from the start of the list to the end of the list. (The second - * loop overlaps the first, but that only happen in cases where we've - * split the tree and aren't going to find our slot at all, that's not - * worth optimizing.) - * - * It's not an error for the reference hint to be wrong, it just means - * the first retrieval (which sets the hint for subsequent retrievals), - * is slower. - */ - i = ref->pindex_hint; - if (i < pindex->entries && pindex->index[i] == ref) { - *pindexp = pindex; - *slotp = i; - return; - } - while (++i < pindex->entries) - if (pindex->index[i] == ref) { - *pindexp = pindex; - *slotp = ref->pindex_hint = i; - return; - } - for (i = 0; i < pindex->entries; ++i) - if (pindex->index[i] == ref) { - *pindexp = pindex; - *slotp = ref->pindex_hint = i; - return; + /* + * Use the page's reference hint: it should be correct unless + * there was a split or delete in the parent before our slot. + * If the hint is wrong, it can be either too big or too small, + * but often only by a small amount. Search up and down the + * index starting from the hint. + * + * It's not an error for the reference hint to be wrong, it + * just means the first retrieval (which sets the hint for + * subsequent retrievals), is slower. + */ + slot = ref->pindex_hint; + if (slot >= entries) + slot = entries - 1; + if (pindex->index[slot] == ref) + goto found; + for (start = &pindex->index[0], + stop = &pindex->index[entries - 1], + p = t = &pindex->index[slot]; + p > start || t < stop;) { + if (p > start && *--p == ref) { + slot = (uint32_t)(p - start); + goto found; + } + if (t < stop && *++t == ref) { + slot = (uint32_t)(t - start); + goto found; + } } - /* - * If we don't find our reference, the page split and our home pointer - * references the wrong page. When internal pages split, their WT_REF - * structure home values are updated; yield and wait for that to happen. - */ - __wt_yield(); - goto retry; + /* + * If we don't find our reference, the page split and our home + * pointer references the wrong page. When internal pages + * split, their WT_REF structure home values are updated; yield + * and wait for that to happen. + */ + __wt_yield(); + } + +found: WT_ASSERT(session, pindex->index[slot] == ref); + *pindexp = pindex; + *slotp = slot; } /* @@ -431,8 +437,8 @@ restart: /* /* * Move to the next slot, and set the reference hint if * it's wrong (used when we continue the walk). We don't - * update those hints when splitting, so it's common for - * them to be incorrect in some workloads. + * always update the hints when splitting, it's expected + * for them to be incorrect in some workloads. */ ref = pindex->index[slot]; if (ref->pindex_hint != slot) |