diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/bt_read.c')
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_read.c | 557 |
1 files changed, 534 insertions, 23 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index a3ce39b7758..d26b44e04c0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -9,12 +9,320 @@ #include "wt_internal.h" /* - * __wt_cache_read -- - * Read a page from the file. + * __wt_las_remove_block -- + * Remove all records matching a key prefix from the lookaside store. */ int -__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) +__wt_las_remove_block(WT_SESSION_IMPL *session, + WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) { + WT_DECL_ITEM(las_addr); + WT_DECL_ITEM(las_key); + WT_DECL_RET; + uint64_t las_counter, las_txnid; + uint32_t las_id; + int exact; + + WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); + WT_ERR(__wt_scr_alloc(session, 0, &las_key)); + + /* + * Search for the block's unique prefix and step through all matching + * records, removing them. + */ + las_addr->data = addr; + las_addr->size = addr_size; + las_key->size = 0; + cursor->set_key( + cursor, btree_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); + if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) + ret = cursor->next(cursor); + for (; ret == 0; ret = cursor->next(cursor)) { + WT_ERR(cursor->get_key(cursor, + &las_id, las_addr, &las_counter, &las_txnid, las_key)); + + /* + * Confirm the search using the unique prefix; if not a match, + * we're done searching for records for this page. + */ + if (las_id != btree_id || + las_addr->size != addr_size || + memcmp(las_addr->data, addr, addr_size) != 0) + break; + + /* + * Cursor opened overwrite=true: won't return WT_NOTFOUND should + * another thread remove the record before we do, and the cursor + * remains positioned in that case. 
+ */ + WT_ERR(cursor->remove(cursor)); + } + WT_ERR_NOTFOUND_OK(ret); + +err: __wt_scr_free(session, &las_addr); + __wt_scr_free(session, &las_key); + return (ret); +} + +/* + * __col_instantiate -- + * Update a column-store page entry based on a lookaside table update list. + */ +static int +__col_instantiate(WT_SESSION_IMPL *session, + uint64_t recno, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + /* Search the page and add updates. */ + WT_RET(__wt_col_search(session, recno, ref, cbt)); + WT_RET(__wt_col_modify(session, cbt, recno, NULL, upd, 0)); + return (0); +} + +/* + * __row_instantiate -- + * Update a row-store page entry based on a lookaside table update list. + */ +static int +__row_instantiate(WT_SESSION_IMPL *session, + WT_ITEM *key, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + /* Search the page and add updates. */ + WT_RET(__wt_row_search(session, key, ref, cbt, 1)); + WT_RET(__wt_row_modify(session, cbt, key, NULL, upd, 0)); + return (0); +} + +/* + * __las_page_instantiate -- + * Instantiate lookaside update records in a recently read page. 
+ */ +static int +__las_page_instantiate(WT_SESSION_IMPL *session, + WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size) +{ + WT_CURSOR *cursor; + WT_CURSOR_BTREE cbt; + WT_DECL_ITEM(current_key); + WT_DECL_ITEM(las_addr); + WT_DECL_ITEM(las_key); + WT_DECL_ITEM(las_value); + WT_DECL_RET; + WT_PAGE *page; + WT_UPDATE *first_upd, *last_upd, *upd; + size_t incr, total_incr; + uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid; + uint32_t las_id, upd_size, session_flags; + int exact; + const uint8_t *p; + + cursor = NULL; + page = ref->page; + first_upd = last_upd = upd = NULL; + total_incr = 0; + current_recno = recno = WT_RECNO_OOB; + session_flags = 0; /* [-Werror=maybe-uninitialized] */ + + __wt_btcur_init(session, &cbt); + __wt_btcur_open(&cbt); + + WT_ERR(__wt_scr_alloc(session, 0, &current_key)); + WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); + WT_ERR(__wt_scr_alloc(session, 0, &las_key)); + WT_ERR(__wt_scr_alloc(session, 0, &las_value)); + + /* Open a lookaside table cursor. */ + WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); + + /* + * The lookaside records are in key and update order, that is, there + * will be a set of in-order updates for a key, then another set of + * in-order updates for a subsequent key. We process all of the updates + * for a key and then insert those updates into the page, then all the + * updates for the next key, and so on. + * + * Search for the block's unique prefix, stepping through any matching + * records. 
+ */ + las_addr->data = addr; + las_addr->size = addr_size; + las_key->size = 0; + cursor->set_key( + cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); + if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) + ret = cursor->next(cursor); + for (; ret == 0; ret = cursor->next(cursor)) { + WT_ERR(cursor->get_key(cursor, + &las_id, las_addr, &las_counter, &las_txnid, las_key)); + + /* + * Confirm the search using the unique prefix; if not a match, + * we're done searching for records for this page. + */ + if (las_id != read_id || + las_addr->size != addr_size || + memcmp(las_addr->data, addr, addr_size) != 0) + break; + + /* + * If the on-page value has become globally visible, this record + * is no longer needed. + */ + if (__wt_txn_visible_all(session, las_txnid)) + continue; + + /* Allocate the WT_UPDATE structure. */ + WT_ERR(cursor->get_value( + cursor, &upd_txnid, &upd_size, las_value)); + WT_ERR(__wt_update_alloc(session, + (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value, + &upd, &incr)); + total_incr += incr; + upd->txnid = upd_txnid; + + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + p = las_key->data; + WT_ERR(__wt_vunpack_uint(&p, 0, &recno)); + if (current_recno == recno) + break; + WT_ASSERT(session, current_recno < recno); + + if (first_upd != NULL) { + WT_ERR(__col_instantiate(session, + current_recno, ref, &cbt, first_upd)); + first_upd = NULL; + } + current_recno = recno; + break; + case WT_PAGE_ROW_LEAF: + if (current_key->size == las_key->size && + memcmp(current_key->data, + las_key->data, las_key->size) == 0) + break; + + if (first_upd != NULL) { + WT_ERR(__row_instantiate(session, + current_key, ref, &cbt, first_upd)); + first_upd = NULL; + } + WT_ERR(__wt_buf_set(session, + current_key, las_key->data, las_key->size)); + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* Append the latest update to the list. 
*/ + if (first_upd == NULL) + first_upd = last_upd = upd; + else { + last_upd->next = upd; + last_upd = upd; + } + upd = NULL; + } + WT_ERR_NOTFOUND_OK(ret); + + /* Insert the last set of updates, if any. */ + if (first_upd != NULL) + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + WT_ERR(__col_instantiate(session, + current_recno, ref, &cbt, first_upd)); + first_upd = NULL; + break; + case WT_PAGE_ROW_LEAF: + WT_ERR(__row_instantiate(session, + current_key, ref, &cbt, first_upd)); + first_upd = NULL; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* Discard the cursor. */ + WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags)); + + if (total_incr != 0) { + __wt_cache_page_inmem_incr(session, page, total_incr); + + /* + * We've modified/dirtied the page, but that's not necessary and + * if we keep the page clean, it's easier to evict. We leave the + * lookaside table updates in place, so if we evict this page + * without dirtying it, any future instantiation of it will find + * the records it needs. If the page is dirtied before eviction, + * then we'll write any needed lookaside table records for the + * new location of the page. + */ + __wt_page_modify_clear(session, page); + } + +err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); + WT_TRET(__wt_btcur_close(&cbt, 1)); + + /* + * On error, upd points to a single unlinked WT_UPDATE structure, + * first_upd points to a list. + */ + if (upd != NULL) + __wt_free(session, upd); + if (first_upd != NULL) + __wt_free_update_list(session, first_upd); + + __wt_scr_free(session, &current_key); + __wt_scr_free(session, &las_addr); + __wt_scr_free(session, &las_key); + __wt_scr_free(session, &las_value); + + return (ret); +} + +/* + * __evict_force_check -- + * Check if a page matches the criteria for forced eviction. 
+ */ +static int +__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BTREE *btree; + + btree = S2BT(session); + + /* Pages are usually small enough, check that first. */ + if (page->memory_footprint < btree->maxmempage) + return (0); + + /* Leaf pages only. */ + if (WT_PAGE_IS_INTERNAL(page)) + return (0); + + /* + * It's hard to imagine a page with a huge memory footprint that has + * never been modified, but check to be sure. + */ + if (page->modify == NULL) + return (0); + + /* Trigger eviction on the next page release. */ + __wt_page_evict_soon(page); + + /* Bump the oldest ID, we're about to do some visibility checks. */ + __wt_txn_update_oldest(session, 0); + + /* If eviction cannot succeed, don't try. */ + return (__wt_page_can_evict(session, page, 1, NULL)); +} + +/* + * __page_read -- + * Read a page from the file. + */ +static int +__page_read(WT_SESSION_IMPL *session, WT_REF *ref) +{ + const WT_PAGE_HEADER *dsk; + WT_BTREE *btree; WT_DECL_RET; WT_ITEM tmp; WT_PAGE *page; @@ -22,6 +330,7 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) uint32_t previous_state; const uint8_t *addr; + btree = S2BT(session); page = NULL; /* @@ -45,8 +354,6 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) /* * Get the address: if there is no address, the page was deleted, but a * subsequent search or insert is forcing re-creation of the name space. - * Otherwise, there's an address, read the backing disk page and build - * an in-memory version of the page. */ WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); if (addr == NULL) { @@ -54,27 +361,51 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) WT_ERR(__wt_btree_new_leaf_page(session, &page)); ref->page = page; - } else { - /* - * Read the page, then build the in-memory version of the page. - * Clear any local reference to an allocated copy of the disk - * image on return, the page steals it. 
- */ - WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size)); - WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize, - WT_DATA_IN_ITEM(&tmp) ? - WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); - tmp.mem = NULL; - - /* If the page was deleted, instantiate that information. */ - if (previous_state == WT_REF_DELETED) - WT_ERR(__wt_delete_page_instantiate(session, ref)); + goto done; } - WT_ERR(__wt_verbose(session, WT_VERB_READ, - "page %p: %s", page, __wt_page_type_string(page->type))); + /* + * There's an address, read or map the backing disk page and build an + * in-memory version of the page. + */ + WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size)); + WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize, + WT_DATA_IN_ITEM(&tmp) ? + WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); + + /* + * Clear the local reference to an allocated copy of the disk image on + * return; the page steals it, errors in this code should not free it. + */ + tmp.mem = NULL; - WT_PUBLISH(ref->state, WT_REF_MEM); + /* + * If reading for a checkpoint, there's no additional work to do, the + * page on disk is correct as written. + */ + if (session->dhandle->checkpoint != NULL) + goto done; + + /* If the page was deleted, instantiate that information. */ + if (previous_state == WT_REF_DELETED) + WT_ERR(__wt_delete_page_instantiate(session, ref)); + + /* + * Instantiate updates from the database's lookaside table. The page + * flag was set when the page was written, potentially a long time ago. + * We only care if the lookaside table is currently active, check that + * before doing any work. 
+ */ + dsk = tmp.data; + if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) { + WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside); + WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside); + + WT_ERR(__las_page_instantiate( + session, ref, btree->id, addr, addr_size)); + } + +done: WT_PUBLISH(ref->state, WT_REF_MEM); return (0); err: /* @@ -90,3 +421,183 @@ err: /* return (ret); } + +/* + * __wt_page_in_func -- + * Acquire a hazard pointer to a page; if the page is not in-memory, + * read it from the disk and build an in-memory version. + */ +int +__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags +#ifdef HAVE_DIAGNOSTIC + , const char *file, int line +#endif + ) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + u_int sleep_cnt, wait_cnt; + int busy, cache_work, force_attempts, oldgen, stalled; + + btree = S2BT(session); + stalled = 0; + + for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) { + switch (ref->state) { + case WT_REF_DISK: + case WT_REF_DELETED: + if (LF_ISSET(WT_READ_CACHE)) + return (WT_NOTFOUND); + + /* + * The page isn't in memory, read it. If this thread is + * allowed to do eviction work, check for space in the + * cache. + */ + if (!LF_ISSET(WT_READ_NO_EVICT)) + WT_RET(__wt_cache_eviction_check( + session, 1, NULL)); + WT_RET(__page_read(session, ref)); + oldgen = LF_ISSET(WT_READ_WONT_NEED) || + F_ISSET(session, WT_SESSION_NO_CACHE); + continue; + case WT_REF_READING: + if (LF_ISSET(WT_READ_CACHE)) + return (WT_NOTFOUND); + if (LF_ISSET(WT_READ_NO_WAIT)) + return (WT_NOTFOUND); + + /* Waiting on another thread's read, stall. */ + WT_STAT_FAST_CONN_INCR(session, page_read_blocked); + stalled = 1; + break; + case WT_REF_LOCKED: + if (LF_ISSET(WT_READ_NO_WAIT)) + return (WT_NOTFOUND); + + /* Waiting on eviction, stall. 
*/ + WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); + stalled = 1; + break; + case WT_REF_SPLIT: + return (WT_RESTART); + case WT_REF_MEM: + /* + * The page is in memory. + * + * Get a hazard pointer if one is required. We cannot + * be evicting if no hazard pointer is required, we're + * done. + */ + if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) + goto skip_evict; + + /* + * The expected reason we can't get a hazard pointer is + * because the page is being evicted, yield, try again. + */ +#ifdef HAVE_DIAGNOSTIC + WT_RET( + __wt_hazard_set(session, ref, &busy, file, line)); +#else + WT_RET(__wt_hazard_set(session, ref, &busy)); +#endif + if (busy) { + WT_STAT_FAST_CONN_INCR( + session, page_busy_blocked); + break; + } + + /* + * If eviction is configured for this file, check to see + * if the page qualifies for forced eviction and update + * the page's generation number. If eviction isn't being + * done on this file, we're done. + */ + if (LF_ISSET(WT_READ_NO_EVICT) || + F_ISSET(session, WT_SESSION_NO_EVICTION) || + F_ISSET(btree, WT_BTREE_NO_EVICTION)) + goto skip_evict; + + /* + * Forcibly evict pages that are too big. + */ + page = ref->page; + if (force_attempts < 10 && + __evict_force_check(session, page)) { + ++force_attempts; + ret = __wt_page_release_evict(session, ref); + /* If forced eviction fails, stall. */ + if (ret == EBUSY) { + ret = 0; + WT_STAT_FAST_CONN_INCR(session, + page_forcible_evict_blocked); + stalled = 1; + break; + } + WT_RET(ret); + + /* + * The result of a successful forced eviction + * is a page-state transition (potentially to + * an in-memory page we can use, or a restart + * return for our caller), continue the outer + * page-acquisition loop. + */ + continue; + } + + /* + * If we read the page and we are configured to not + * trash the cache, set the oldest read generation so + * the page is forcibly evicted as soon as possible. + * + * Otherwise, update the page's read generation. 
+ */ + if (oldgen && page->read_gen == WT_READGEN_NOTSET) + __wt_page_evict_soon(page); + else if (!LF_ISSET(WT_READ_NO_GEN) && + page->read_gen != WT_READGEN_OLDEST && + page->read_gen < __wt_cache_read_gen(session)) + page->read_gen = + __wt_cache_read_gen_bump(session); +skip_evict: + /* + * Check if we need an autocommit transaction. + * Starting a transaction can trigger eviction, so skip + * it if eviction isn't permitted. + */ + return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : + __wt_txn_autocommit_check(session)); + WT_ILLEGAL_VALUE(session); + } + + /* + * We failed to get the page -- yield before retrying, and if + * we've yielded enough times, start sleeping so we don't burn + * CPU to no purpose. + */ + if (stalled) + wait_cnt += 1000; + else if (++wait_cnt < 1000) { + __wt_yield(); + continue; + } + + /* + * If stalling and this thread is allowed to do eviction work, + * check if the cache needs help. If we do work for the cache, + * substitute that for a sleep. + */ + if (!LF_ISSET(WT_READ_NO_EVICT)) { + WT_RET( + __wt_cache_eviction_check(session, 1, &cache_work)); + if (cache_work) + continue; + } + sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000); + WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); + __wt_sleep(0, sleep_cnt); + } +} |