/*- * Copyright (c) 2014-2019 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * * See the file LICENSE for redistribution information. */ /* * __wt_ref_is_root -- * Return if the page reference is for the root page. */ static inline bool __wt_ref_is_root(WT_REF *ref) { return (ref->home == NULL); } /* * __wt_page_is_empty -- * Return if the page is empty. */ static inline bool __wt_page_is_empty(WT_PAGE *page) { return (page->modify != NULL && page->modify->rec_result == WT_PM_REC_EMPTY); } /* * __wt_page_evict_clean -- * Return if the page can be evicted without dirtying the tree. */ static inline bool __wt_page_evict_clean(WT_PAGE *page) { return (page->modify == NULL || (page->modify->page_state == WT_PAGE_CLEAN && page->modify->rec_result == 0)); } /* * __wt_page_is_modified -- * Return if the page is dirty. */ static inline bool __wt_page_is_modified(WT_PAGE *page) { return (page->modify != NULL && page->modify->page_state != WT_PAGE_CLEAN); } /* * __wt_btree_block_free -- * Helper function to free a block from the current tree. */ static inline int __wt_btree_block_free(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) { WT_BM *bm; WT_BTREE *btree; btree = S2BT(session); bm = btree->bm; return (bm->free(bm, session, addr, addr_size)); } /* * __wt_btree_bytes_inuse -- * Return the number of bytes in use. */ static inline uint64_t __wt_btree_bytes_inuse(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; btree = S2BT(session); cache = S2C(session)->cache; return (__wt_cache_bytes_plus_overhead(cache, btree->bytes_inmem)); } /* * __wt_btree_bytes_evictable -- * Return the number of bytes that can be evicted (i.e. bytes apart from the pinned root page). */ static inline uint64_t __wt_btree_bytes_evictable(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; WT_PAGE *root_page; uint64_t bytes_inmem, bytes_root; btree = S2BT(session); cache = S2C(session)->cache; root_page = btree->root.page; bytes_inmem = btree->bytes_inmem; bytes_root = root_page == NULL ? 0 : root_page->memory_footprint; return (bytes_inmem <= bytes_root ? 0 : __wt_cache_bytes_plus_overhead( cache, bytes_inmem - bytes_root)); } /* * __wt_btree_dirty_inuse -- * Return the number of dirty bytes in use. */ static inline uint64_t __wt_btree_dirty_inuse(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; btree = S2BT(session); cache = S2C(session)->cache; return ( __wt_cache_bytes_plus_overhead(cache, btree->bytes_dirty_intl + btree->bytes_dirty_leaf)); } /* * __wt_btree_dirty_leaf_inuse -- * Return the number of bytes in use by dirty leaf pages. */ static inline uint64_t __wt_btree_dirty_leaf_inuse(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; btree = S2BT(session); cache = S2C(session)->cache; return (__wt_cache_bytes_plus_overhead(cache, btree->bytes_dirty_leaf)); } /* * __wt_cache_page_inmem_incr -- * Increment a page's memory footprint in the cache. 
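 *     The same byte count is added to the per-btree and per-cache in-memory
 *     totals and to the page's own memory footprint; the dirty-byte counters
 *     are only incremented if the page is already marked dirty at the time of
 *     the call, which is why the decrement path has to guard against
 *     mismatched increments (see __wt_cache_page_byte_dirty_decr).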
*/ static inline void __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) { WT_BTREE *btree; WT_CACHE *cache; WT_ASSERT(session, size < WT_EXABYTE); btree = S2BT(session); cache = S2C(session)->cache; (void)__wt_atomic_add64(&btree->bytes_inmem, size); (void)__wt_atomic_add64(&cache->bytes_inmem, size); (void)__wt_atomic_addsize(&page->memory_footprint, size); if (__wt_page_is_modified(page)) { (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size); if (WT_PAGE_IS_INTERNAL(page)) { (void)__wt_atomic_add64(&btree->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size); } else if (!btree->lsm_primary) { (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); } } /* Track internal size in cache. */ if (WT_PAGE_IS_INTERNAL(page)) (void)__wt_atomic_add64(&cache->bytes_internal, size); } /* * __wt_cache_decr_check_size -- * Decrement a size_t cache value and check for underflow. */ static inline void __wt_cache_decr_check_size(WT_SESSION_IMPL *session, size_t *vp, size_t v, const char *fld) { if (v == 0 || __wt_atomic_subsize(vp, v) < WT_EXABYTE) return; /* * It's a bug if this accounting underflowed but allow the application to proceed - the * consequence is we use more cache than configured. */ *vp = 0; __wt_errx(session, "%s went negative with decrement of %" WT_SIZET_FMT, fld, v); #ifdef HAVE_DIAGNOSTIC __wt_abort(session); #endif } /* * __wt_cache_decr_check_uint64 -- * Decrement a uint64_t cache value and check for underflow. */ static inline void __wt_cache_decr_check_uint64(WT_SESSION_IMPL *session, uint64_t *vp, uint64_t v, const char *fld) { uint64_t orig = *vp; if (v == 0 || __wt_atomic_sub64(vp, v) < WT_EXABYTE) return; /* * It's a bug if this accounting underflowed but allow the application to proceed - the * consequence is we use more cache than configured. */ *vp = 0; __wt_errx( session, "%s was %" PRIu64 ", went negative with decrement of %" PRIu64, fld, orig, v); #ifdef HAVE_DIAGNOSTIC __wt_abort(session); #endif } /* * __wt_cache_page_byte_dirty_decr -- * Decrement the page's dirty byte count, guarding from underflow. */ static inline void __wt_cache_page_byte_dirty_decr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) { WT_BTREE *btree; WT_CACHE *cache; size_t decr, orig; int i; btree = S2BT(session); cache = S2C(session)->cache; decr = 0; /* [-Wconditional-uninitialized] */ /* * We don't have exclusive access and there are ways of decrementing the * page's dirty byte count by a too-large value. For example: * T1: __wt_cache_page_inmem_incr(page, size) * page is clean, don't increment dirty byte count * T2: mark page dirty * T1: __wt_cache_page_inmem_decr(page, size) * page is dirty, decrement dirty byte count * and, of course, the reverse where the page is dirty at the increment * and clean at the decrement. * * The page's dirty-byte value always reflects bytes represented in the * cache's dirty-byte count, decrement the page/cache as much as we can * without underflow. If we can't decrement the dirty byte counts after * few tries, give up: the cache's value will be wrong, but consistent, * and we'll fix it the next time this page is marked clean, or evicted. */ for (i = 0; i < 5; ++i) { /* * Take care to read the dirty-byte count only once in case we're racing with updates. 
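 * For example (illustrative numbers only): if bytes_dirty is read as 100 and
 * the caller asks to decrement 150, decr is clamped to 100 and the
 * compare-and-swap tries to move bytes_dirty from 100 to 0; if another thread
 * changed bytes_dirty in the meantime, the swap fails and the loop re-reads
 * the current value and tries again.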
*/ WT_ORDERED_READ(orig, page->modify->bytes_dirty); decr = WT_MIN(size, orig); if (__wt_atomic_cassize(&page->modify->bytes_dirty, orig, orig - decr)) break; } if (i == 5) return; if (WT_PAGE_IS_INTERNAL(page)) { __wt_cache_decr_check_uint64( session, &btree->bytes_dirty_intl, decr, "WT_BTREE.bytes_dirty_intl"); __wt_cache_decr_check_uint64( session, &cache->bytes_dirty_intl, decr, "WT_CACHE.bytes_dirty_intl"); } else if (!btree->lsm_primary) { __wt_cache_decr_check_uint64( session, &btree->bytes_dirty_leaf, decr, "WT_BTREE.bytes_dirty_leaf"); __wt_cache_decr_check_uint64( session, &cache->bytes_dirty_leaf, decr, "WT_CACHE.bytes_dirty_leaf"); } } /* * __wt_cache_page_inmem_decr -- * Decrement a page's memory footprint in the cache. */ static inline void __wt_cache_page_inmem_decr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) { WT_CACHE *cache; cache = S2C(session)->cache; WT_ASSERT(session, size < WT_EXABYTE); __wt_cache_decr_check_uint64( session, &S2BT(session)->bytes_inmem, size, "WT_BTREE.bytes_inmem"); __wt_cache_decr_check_uint64(session, &cache->bytes_inmem, size, "WT_CACHE.bytes_inmem"); __wt_cache_decr_check_size(session, &page->memory_footprint, size, "WT_PAGE.memory_footprint"); if (__wt_page_is_modified(page)) __wt_cache_page_byte_dirty_decr(session, page, size); /* Track internal size in cache. */ if (WT_PAGE_IS_INTERNAL(page)) __wt_cache_decr_check_uint64( session, &cache->bytes_internal, size, "WT_CACHE.bytes_internal"); } /* * __wt_cache_dirty_incr -- * Page switch from clean to dirty: increment the cache dirty page/byte counts. */ static inline void __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; WT_CACHE *cache; size_t size; btree = S2BT(session); cache = S2C(session)->cache; /* * Take care to read the memory_footprint once in case we are racing with updates. */ size = page->memory_footprint; if (WT_PAGE_IS_INTERNAL(page)) { (void)__wt_atomic_add64(&btree->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->pages_dirty_intl, 1); } else { if (!btree->lsm_primary) { (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); } (void)__wt_atomic_add64(&cache->pages_dirty_leaf, 1); } (void)__wt_atomic_add64(&btree->bytes_dirty_total, size); (void)__wt_atomic_add64(&cache->bytes_dirty_total, size); (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size); } /* * __wt_cache_dirty_decr -- * Page switch from dirty to clean: decrement the cache dirty page/byte counts. */ static inline void __wt_cache_dirty_decr(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_CACHE *cache; WT_PAGE_MODIFY *modify; cache = S2C(session)->cache; if (WT_PAGE_IS_INTERNAL(page)) __wt_cache_decr_check_uint64( session, &cache->pages_dirty_intl, 1, "dirty internal page count"); else __wt_cache_decr_check_uint64(session, &cache->pages_dirty_leaf, 1, "dirty leaf page count"); modify = page->modify; if (modify != NULL && modify->bytes_dirty != 0) __wt_cache_page_byte_dirty_decr(session, page, modify->bytes_dirty); } /* * __wt_cache_page_image_decr -- * Decrement a page image's size to the cache. */ static inline void __wt_cache_page_image_decr(WT_SESSION_IMPL *session, uint32_t size) { WT_CACHE *cache; cache = S2C(session)->cache; __wt_cache_decr_check_uint64(session, &cache->bytes_image, size, "WT_CACHE.image_inmem"); } /* * __wt_cache_page_image_incr -- * Increment a page image's size to the cache. 
*/ static inline void __wt_cache_page_image_incr(WT_SESSION_IMPL *session, uint32_t size) { WT_CACHE *cache; cache = S2C(session)->cache; (void)__wt_atomic_add64(&cache->bytes_image, size); } /* * __wt_cache_page_evict -- * Evict pages from the cache. */ static inline void __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; WT_CACHE *cache; WT_PAGE_MODIFY *modify; btree = S2BT(session); cache = S2C(session)->cache; modify = page->modify; /* Update the bytes in-memory to reflect the eviction. */ __wt_cache_decr_check_uint64( session, &btree->bytes_inmem, page->memory_footprint, "WT_BTREE.bytes_inmem"); __wt_cache_decr_check_uint64( session, &cache->bytes_inmem, page->memory_footprint, "WT_CACHE.bytes_inmem"); /* Update the bytes_internal value to reflect the eviction */ if (WT_PAGE_IS_INTERNAL(page)) __wt_cache_decr_check_uint64( session, &cache->bytes_internal, page->memory_footprint, "WT_CACHE.bytes_internal"); /* Update the cache's dirty-byte count. */ if (modify != NULL && modify->bytes_dirty != 0) { if (WT_PAGE_IS_INTERNAL(page)) { __wt_cache_decr_check_uint64( session, &btree->bytes_dirty_intl, modify->bytes_dirty, "WT_BTREE.bytes_dirty_intl"); __wt_cache_decr_check_uint64( session, &cache->bytes_dirty_intl, modify->bytes_dirty, "WT_CACHE.bytes_dirty_intl"); } else if (!btree->lsm_primary) { __wt_cache_decr_check_uint64( session, &btree->bytes_dirty_leaf, modify->bytes_dirty, "WT_BTREE.bytes_dirty_leaf"); __wt_cache_decr_check_uint64( session, &cache->bytes_dirty_leaf, modify->bytes_dirty, "WT_CACHE.bytes_dirty_leaf"); } } /* Update bytes and pages evicted. */ (void)__wt_atomic_add64(&cache->bytes_evict, page->memory_footprint); (void)__wt_atomic_addv64(&cache->pages_evicted, 1); /* * Track if eviction makes progress. This is used in various places to determine whether * eviction is stuck. */ if (!F_ISSET_ATOMIC(page, WT_PAGE_EVICT_NO_PROGRESS)) (void)__wt_atomic_addv64(&cache->eviction_progress, 1); } /* * __wt_update_list_memsize -- * The size in memory of a list of updates. */ static inline size_t __wt_update_list_memsize(WT_UPDATE *upd) { size_t upd_size; for (upd_size = 0; upd != NULL; upd = upd->next) upd_size += WT_UPDATE_MEMSIZE(upd); return (upd_size); } /* * __wt_page_modify_init -- * A page is about to be modified, allocate the modification structure. */ static inline int __wt_page_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page) { return (page->modify == NULL ? __wt_page_modify_alloc(session, page) : 0); } /* * __wt_page_only_modify_set -- * Mark the page (but only the page) dirty. */ static inline void __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) { uint64_t last_running; WT_ASSERT(session, !F_ISSET(session->dhandle, WT_DHANDLE_DEAD)); last_running = 0; if (page->modify->page_state == WT_PAGE_CLEAN) last_running = S2C(session)->txn_global.last_running; /* * We depend on the atomic operation being a write barrier, that is, a barrier to ensure all * changes to the page are flushed before updating the page state and/or marking the tree dirty, * otherwise checkpoints and/or page reconciliation might be looking at a clean page/tree. * * Every time the page transitions from clean to dirty, update the cache and transactional * information. * * The page state can only ever be incremented above dirty by the number of concurrently running * threads, so the counter will never approach the point where it would wrap. 
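 * For example, if two threads race to dirty a clean page, both can pass the
 * page_state < WT_PAGE_DIRTY test below and both will increment page_state,
 * but only the increment that returns exactly WT_PAGE_DIRTY_FIRST performs
 * the cache dirty accounting and, when applicable, the read-generation and
 * first_dirty_txn updates; the other increment simply leaves the page state
 * at a larger dirty value.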
*/ if (page->modify->page_state < WT_PAGE_DIRTY && __wt_atomic_add32(&page->modify->page_state, 1) == WT_PAGE_DIRTY_FIRST) { __wt_cache_dirty_incr(session, page); /* * In the event we dirty a page which is flagged for eviction soon, we update its read * generation to avoid evicting a dirty page prematurely. */ if (page->read_gen == WT_READGEN_WONT_NEED) __wt_cache_read_gen_new(session, page); /* * We won the race to dirty the page, but another thread could have committed in the * meantime, and the last_running field been updated past it. That is all very unlikely, but * not impossible, so we take care to read the global state before the atomic increment. * * If the page was dirty on entry, then last_running == 0. The page could have become clean * since then, if reconciliation completed. In that case, we leave the previous value for * first_dirty_txn rather than potentially racing to update it, at worst, we'll * unnecessarily write a page in a checkpoint. */ if (last_running != 0) page->modify->first_dirty_txn = last_running; } /* Check if this is the largest transaction ID to update the page. */ if (WT_TXNID_LT(page->modify->update_txn, session->txn.id)) page->modify->update_txn = session->txn.id; } /* * __wt_tree_modify_set -- * Mark the tree dirty. */ static inline void __wt_tree_modify_set(WT_SESSION_IMPL *session) { /* * Test before setting the dirty flag, it's a hot cache line. * * The tree's modified flag is cleared by the checkpoint thread: set it and insert a barrier * before dirtying the page. (I don't think it's a problem if the tree is marked dirty with all * the pages clean, it might result in an extra checkpoint that doesn't do any work but it * shouldn't cause problems; regardless, let's play it safe.) */ if (!S2BT(session)->modified) { /* Assert we never dirty a checkpoint handle. */ WT_ASSERT(session, session->dhandle->checkpoint == NULL); S2BT(session)->modified = true; WT_FULL_BARRIER(); } /* * The btree may already be marked dirty while the connection is still clean; mark the * connection dirty outside the test of the btree state. */ if (!S2C(session)->modified) S2C(session)->modified = true; } /* * __wt_page_modify_clear -- * Clean a modified page. */ static inline void __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page) { /* * The page must be held exclusive when this call is made, this call can only be used when the * page is owned by a single thread. * * Allow the call to be made on clean pages. */ if (__wt_page_is_modified(page)) { /* * The only part where ordering matters is during reconciliation where updates on other * threads are performing writes to the page state that need to be visible to the * reconciliation thread. * * Since clearing of the page state is not going to be happening during reconciliation on a * separate thread, there's no write barrier needed here. */ page->modify->page_state = WT_PAGE_CLEAN; __wt_cache_dirty_decr(session, page); } } /* * __wt_page_modify_set -- * Mark the page and tree dirty. */ static inline void __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) { /* * Mark the tree dirty (even if the page is already marked dirty), newly created pages to * support "empty" files are dirty, but the file isn't marked dirty until there's a real change * needing to be written. */ __wt_tree_modify_set(session); __wt_page_only_modify_set(session, page); } /* * __wt_page_parent_modify_set -- * Mark the parent page, and optionally the tree, dirty. 
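 *     With page_only set, only the parent page itself is dirtied (via
 *     __wt_page_only_modify_set); otherwise the enclosing tree is marked
 *     dirty as well (via __wt_page_modify_set).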
*/ static inline int __wt_page_parent_modify_set(WT_SESSION_IMPL *session, WT_REF *ref, bool page_only) { WT_PAGE *parent; /* * This function exists as a place to stash this comment. There are a few places where we need * to dirty a page's parent. The trick is the page's parent might split at any point, and the * page parent might be the wrong parent at any particular time. We ignore this and dirty * whatever page the page's reference structure points to. This is safe because if we're * pointing to the wrong parent, that parent must have split, deepening the tree, which implies * marking the original parent and all of the newly-created children as dirty. In other words, * if we have the wrong parent page, everything was marked dirty already. */ parent = ref->home; WT_RET(__wt_page_modify_init(session, parent)); if (page_only) __wt_page_only_modify_set(session, parent); else __wt_page_modify_set(session, parent); return (0); } /* * __wt_off_page -- * Return if a pointer references off-page data. */ static inline bool __wt_off_page(WT_PAGE *page, const void *p) { /* * There may be no underlying page, in which case the reference is off-page by definition. */ return (page->dsk == NULL || p < (void *)page->dsk || p >= (void *)((uint8_t *)page->dsk + page->dsk->mem_size)); } /* * __wt_ref_addr_free -- * Free the address in a reference, if necessary. */ static inline void __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref) { if (ref->addr == NULL) return; if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) { __wt_free(session, ((WT_ADDR *)ref->addr)->addr); __wt_free(session, ref->addr); } ref->addr = NULL; } /* * __wt_ref_key -- * Return a reference to a row-store internal page key as cheaply as possible. */ static inline void __wt_ref_key(WT_PAGE *page, WT_REF *ref, void *keyp, size_t *sizep) { uintptr_t v; /* * An internal page key is in one of two places: if we instantiated the * key (for example, when reading the page), WT_REF.ref_ikey references * a WT_IKEY structure, otherwise WT_REF.ref_ikey references an on-page * key offset/length pair. * * Now the magic: allocated memory must be aligned to store any standard * type, and we expect some standard type to require at least quad-byte * alignment, so allocated memory should have some clear low-order bits. * On-page objects consist of an offset/length pair: the maximum page * size currently fits into 29 bits, so we use the low-order bits of the * pointer to mark the other bits of the pointer as encoding the key's * location and length. This breaks if allocated memory isn't aligned, * of course. * * In this specific case, we use bit 0x01 to mark an on-page key, else * it's a WT_IKEY reference. The bit pattern for internal row-store * on-page keys is: * 32 bits key length * 31 bits page offset of the key's bytes, * 1 bits flags */ #define WT_IK_FLAG 0x01 #define WT_IK_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 32) #define WT_IK_DECODE_KEY_LEN(v) ((v) >> 32) #define WT_IK_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 1) #define WT_IK_DECODE_KEY_OFFSET(v) (((v)&0xFFFFFFFF) >> 1) v = (uintptr_t)ref->ref_ikey; if (v & WT_IK_FLAG) { *(void **)keyp = WT_PAGE_REF_OFFSET(page, WT_IK_DECODE_KEY_OFFSET(v)); *sizep = WT_IK_DECODE_KEY_LEN(v); } else { *(void **)keyp = WT_IKEY_DATA(ref->ref_ikey); *sizep = ((WT_IKEY *)ref->ref_ikey)->size; } } /* * __wt_ref_key_onpage_set -- * Set a WT_REF to reference an on-page key. 
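 *     For example (illustrative numbers only), a 7-byte key whose bytes start
 *     at page offset 64 is encoded as
 *         WT_IK_ENCODE_KEY_LEN(7) | WT_IK_ENCODE_KEY_OFFSET(64) | WT_IK_FLAG
 *     that is, (7 << 32) | (64 << 1) | 0x01; __wt_ref_key reverses this with
 *     the WT_IK_DECODE_KEY_LEN and WT_IK_DECODE_KEY_OFFSET macros.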
*/ static inline void __wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK *unpack) { uintptr_t v; /* * See the comment in __wt_ref_key for an explanation of the magic. */ v = WT_IK_ENCODE_KEY_LEN(unpack->size) | WT_IK_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) | WT_IK_FLAG; ref->ref_ikey = (void *)v; } /* * __wt_ref_key_instantiated -- * Return if a WT_REF key is instantiated. */ static inline WT_IKEY * __wt_ref_key_instantiated(WT_REF *ref) { uintptr_t v; /* * See the comment in __wt_ref_key for an explanation of the magic. */ v = (uintptr_t)ref->ref_ikey; return (v & WT_IK_FLAG ? NULL : ref->ref_ikey); } /* * __wt_ref_key_clear -- * Clear a WT_REF key. */ static inline void __wt_ref_key_clear(WT_REF *ref) { /* * The key union has 2 8B fields; this is equivalent to: * * ref->ref_recno = WT_RECNO_OOB; * ref->ref_ikey = NULL; */ ref->ref_recno = 0; } /* * __wt_row_leaf_key_info -- * Return a row-store leaf page key referenced by a WT_ROW if it can be had without unpacking a * cell, and information about the cell, if the key isn't cheaply available. */ static inline bool __wt_row_leaf_key_info( WT_PAGE *page, void *copy, WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep) { WT_IKEY *ikey; uintptr_t v; v = (uintptr_t)copy; /* * A row-store leaf page key is in one of two places: if instantiated, * the WT_ROW pointer references a WT_IKEY structure, otherwise, it * references an on-page offset. Further, on-page keys are in one of * two states: if the key is a simple key (not an overflow key, prefix * compressed or Huffman encoded, all of which are likely), the key's * offset/size is encoded in the pointer. Otherwise, the offset is to * the key's on-page cell. * * Now the magic: allocated memory must be aligned to store any standard * type, and we expect some standard type to require at least quad-byte * alignment, so allocated memory should have some clear low-order bits. * On-page objects consist of an offset/length pair: the maximum page * size currently fits into 29 bits, so we use the low-order bits of the * pointer to mark the other bits of the pointer as encoding the key's * location and length. This breaks if allocated memory isn't aligned, * of course. * * In this specific case, we use bit 0x01 to mark an on-page cell, bit * 0x02 to mark an on-page key, 0x03 to mark an on-page key/value pair, * otherwise it's a WT_IKEY reference. The bit pattern for on-page cells * is: * 29 bits page offset of the key's cell, * 2 bits flags * * The bit pattern for on-page keys is: * 32 bits key length, * 29 bits page offset of the key's bytes, * 2 bits flags * * But, while that allows us to skip decoding simple key cells, we also * want to skip decoding the value cell in the case where the value cell * is also simple/short. We use bit 0x03 to mark an encoded on-page key * and value pair. The bit pattern for on-page key/value pairs is: * 9 bits key length, * 13 bits value length, * 20 bits page offset of the key's bytes, * 20 bits page offset of the value's bytes, * 2 bits flags * * These bit patterns are in-memory only, of course, so can be modified * (we could even tune for specific workloads). Generally, the fields * are larger than the anticipated values being stored (512B keys, 8KB * values, 1MB pages), hopefully that won't be necessary. * * This function returns a list of things about the key (instantiation * reference, cell reference and key/length pair). 
Our callers know * the order in which we look things up and the information returned; * for example, the cell will never be returned if we are working with * an on-page key. */ #define WT_CELL_FLAG 0x01 #define WT_CELL_ENCODE_OFFSET(v) ((uintptr_t)(v) << 2) #define WT_CELL_DECODE_OFFSET(v) (((v)&0xFFFFFFFF) >> 2) #define WT_K_FLAG 0x02 #define WT_K_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 32) #define WT_K_DECODE_KEY_LEN(v) ((v) >> 32) #define WT_K_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 2) #define WT_K_DECODE_KEY_OFFSET(v) (((v)&0xFFFFFFFF) >> 2) #define WT_KV_FLAG 0x03 #define WT_KV_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 55) #define WT_KV_DECODE_KEY_LEN(v) ((v) >> 55) #define WT_KV_MAX_KEY_LEN (0x200 - 1) #define WT_KV_ENCODE_VALUE_LEN(v) ((uintptr_t)(v) << 42) #define WT_KV_DECODE_VALUE_LEN(v) (((v)&0x007FFC0000000000) >> 42) #define WT_KV_MAX_VALUE_LEN (0x2000 - 1) #define WT_KV_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 22) #define WT_KV_DECODE_KEY_OFFSET(v) (((v)&0x000003FFFFC00000) >> 22) #define WT_KV_MAX_KEY_OFFSET (0x100000 - 1) #define WT_KV_ENCODE_VALUE_OFFSET(v) ((uintptr_t)(v) << 2) #define WT_KV_DECODE_VALUE_OFFSET(v) (((v)&0x00000000003FFFFC) >> 2) #define WT_KV_MAX_VALUE_OFFSET (0x100000 - 1) switch (v & 0x03) { case WT_CELL_FLAG: /* On-page cell: no instantiated key. */ if (ikeyp != NULL) *ikeyp = NULL; if (cellp != NULL) *cellp = WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v)); return (false); case WT_K_FLAG: /* Encoded key: no instantiated key, no cell. */ if (cellp != NULL) *cellp = NULL; if (ikeyp != NULL) *ikeyp = NULL; if (datap != NULL) { *(void **)datap = WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v)); *sizep = WT_K_DECODE_KEY_LEN(v); return (true); } return (false); case WT_KV_FLAG: /* Encoded key/value pair: no instantiated key, no cell. */ if (cellp != NULL) *cellp = NULL; if (ikeyp != NULL) *ikeyp = NULL; if (datap != NULL) { *(void **)datap = WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_KEY_OFFSET(v)); *sizep = WT_KV_DECODE_KEY_LEN(v); return (true); } return (false); } /* Instantiated key. */ ikey = copy; if (ikeyp != NULL) *ikeyp = copy; if (cellp != NULL) *cellp = WT_PAGE_REF_OFFSET(page, ikey->cell_offset); if (datap != NULL) { *(void **)datap = WT_IKEY_DATA(ikey); *sizep = ikey->size; return (true); } return (false); } /* * __wt_row_leaf_key_set_cell -- * Set a WT_ROW to reference an on-page row-store leaf cell. */ static inline void __wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell) { uintptr_t v; /* * See the comment in __wt_row_leaf_key_info for an explanation of the magic. */ v = WT_CELL_ENCODE_OFFSET(WT_PAGE_DISK_OFFSET(page, cell)) | WT_CELL_FLAG; WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries); WT_ROW_KEY_SET(rip, v); } /* * __wt_row_leaf_key_set -- * Set a WT_ROW to reference an on-page row-store leaf key. */ static inline void __wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack) { uintptr_t v; /* * See the comment in __wt_row_leaf_key_info for an explanation of the magic. */ v = WT_K_ENCODE_KEY_LEN(unpack->size) | WT_K_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) | WT_K_FLAG; WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries); WT_ROW_KEY_SET(rip, v); } /* * __wt_row_leaf_value_set -- * Set a WT_ROW to reference an on-page row-store leaf value. 
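 *     For example (illustrative numbers only), a 5-byte key at page offset
 *     100 with a 20-byte value at page offset 107 is encoded as
 *         WT_KV_ENCODE_KEY_LEN(5) | WT_KV_ENCODE_VALUE_LEN(20) |
 *         WT_KV_ENCODE_KEY_OFFSET(100) | WT_KV_ENCODE_VALUE_OFFSET(107) |
 *         WT_KV_FLAG
 *     that is, (5 << 55) | (20 << 42) | (100 << 22) | (107 << 2) | 0x03.
 *     Any field that exceeds its WT_KV_MAX_* limit (511-byte keys, 8191-byte
 *     values, roughly 1MB page offsets) causes the function to return without
 *     changing the existing key encoding.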
*/ static inline void __wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack) { uintptr_t key_len, key_offset, value_offset, v; v = (uintptr_t)WT_ROW_KEY_COPY(rip); /* * See the comment in __wt_row_leaf_key_info for an explanation of the magic. */ if (!(v & WT_K_FLAG)) /* Already an encoded key */ return; key_len = WT_K_DECODE_KEY_LEN(v); /* Key length */ if (key_len > WT_KV_MAX_KEY_LEN) return; if (unpack->size > WT_KV_MAX_VALUE_LEN) /* Value length */ return; key_offset = WT_K_DECODE_KEY_OFFSET(v); /* Page offsets */ if (key_offset > WT_KV_MAX_KEY_OFFSET) return; value_offset = WT_PAGE_DISK_OFFSET(page, unpack->data); if (value_offset > WT_KV_MAX_VALUE_OFFSET) return; v = WT_KV_ENCODE_KEY_LEN(key_len) | WT_KV_ENCODE_VALUE_LEN(unpack->size) | WT_KV_ENCODE_KEY_OFFSET(key_offset) | WT_KV_ENCODE_VALUE_OFFSET(value_offset) | WT_KV_FLAG; WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries); WT_ROW_KEY_SET(rip, v); } /* * __wt_row_leaf_key -- * Set a buffer to reference a row-store leaf page key as cheaply as possible. */ static inline int __wt_row_leaf_key( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key, bool instantiate) { void *copy; /* * A front-end for __wt_row_leaf_key_work, here to inline fast paths. * * The row-store key can change underfoot; explicitly take a copy. */ copy = WT_ROW_KEY_COPY(rip); /* * All we handle here are on-page keys (which should be a common case), and instantiated keys * (which start out rare, but become more common as a leaf page is searched, instantiating * prefix-compressed keys). */ if (__wt_row_leaf_key_info(page, copy, NULL, NULL, &key->data, &key->size)) return (0); /* * The alternative is an on-page cell with some kind of compressed or overflow key that's never * been instantiated. Call the underlying worker function to figure it out. */ return (__wt_row_leaf_key_work(session, page, rip, key, instantiate)); } /* * __wt_row_leaf_value_cell -- * Return the unpacked value for a row-store leaf page key. */ static inline void __wt_row_leaf_value_cell(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack, WT_CELL_UNPACK *vpack) { WT_CELL *kcell, *vcell; WT_CELL_UNPACK unpack; size_t size; void *copy, *key; size = 0; /* -Werror=maybe-uninitialized */ key = NULL; /* -Werror=maybe-uninitialized */ /* If we already have an unpacked key cell, use it. */ if (kpack != NULL) vcell = (WT_CELL *)((uint8_t *)kpack->cell + __wt_cell_total_len(kpack)); else { /* * The row-store key can change underfoot; explicitly take a copy. */ copy = WT_ROW_KEY_COPY(rip); /* * Figure out where the key is, step past it to the value cell. The test for a cell not * being set tells us that we have an on-page key, otherwise we're looking at an * instantiated key or on-page cell, both of which require an unpack of the key's cell to * find the value cell that follows. */ if (__wt_row_leaf_key_info(page, copy, NULL, &kcell, &key, &size) && kcell == NULL) vcell = (WT_CELL *)((uint8_t *)key + size); else { __wt_cell_unpack(session, page, kcell, &unpack); vcell = (WT_CELL *)((uint8_t *)unpack.cell + __wt_cell_total_len(&unpack)); } } __wt_cell_unpack(session, page, __wt_cell_leaf_value_parse(page, vcell), vpack); } /* * __wt_row_leaf_value -- * Return the value for a row-store leaf page encoded key/value pair. */ static inline bool __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value) { uintptr_t v; /* The row-store key can change underfoot; explicitly take a copy. 
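 * Only the WT_KV_FLAG (0x03) encoding lets the value's offset and length be
 * returned directly from the copy; any other encoding makes this function
 * return false, in which case the value cell has to be unpacked instead (as
 * __wt_row_leaf_value_cell does).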
*/ v = (uintptr_t)WT_ROW_KEY_COPY(rip); /* * See the comment in __wt_row_leaf_key_info for an explanation of the magic. */ if ((v & 0x03) == WT_KV_FLAG) { value->data = WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v)); value->size = WT_KV_DECODE_VALUE_LEN(v); return (true); } return (false); } /* * __wt_ref_info -- * Return the addr/size and type triplet for a reference. */ static inline void __wt_ref_info( WT_SESSION_IMPL *session, WT_REF *ref, const uint8_t **addrp, size_t *sizep, bool *is_leafp) { WT_ADDR *addr; WT_CELL_UNPACK *unpack, _unpack; WT_PAGE *page; addr = ref->addr; unpack = &_unpack; page = ref->home; /* * If NULL, there is no location. If off-page, the pointer references a WT_ADDR structure. If * on-page, the pointer references a cell. * * The type is of a limited set: internal, leaf or no-overflow leaf. */ if (addr == NULL) { *addrp = NULL; *sizep = 0; if (is_leafp != NULL) *is_leafp = false; } else if (__wt_off_page(page, addr)) { *addrp = addr->addr; *sizep = addr->size; if (is_leafp != NULL) *is_leafp = addr->type != WT_ADDR_INT; } else { __wt_cell_unpack(session, page, (WT_CELL *)addr, unpack); *addrp = unpack->data; *sizep = unpack->size; if (is_leafp != NULL) *is_leafp = unpack->type != WT_ADDR_INT; } } /* * __wt_ref_info_lock -- * Lock the WT_REF and return the addr/size and type triplet for a reference. */ static inline void __wt_ref_info_lock( WT_SESSION_IMPL *session, WT_REF *ref, uint8_t *addr_buf, size_t *sizep, bool *is_leafp) { size_t size; uint32_t previous_state; const uint8_t *addr; bool is_leaf; /* * The WT_REF address references either an on-page cell or in-memory structure, and eviction * frees both. If our caller is already blocking eviction (either because the WT_REF is locked * or there's a hazard pointer on the page), no locking is required, and the caller should call * the underlying function directly. Otherwise, our caller is not blocking eviction and we lock * here, and copy out the address instead of returning a reference. */ for (;; __wt_yield()) { previous_state = ref->state; if (previous_state != WT_REF_LOCKED && previous_state != WT_REF_READING && WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED)) break; } __wt_ref_info(session, ref, &addr, &size, &is_leaf); if (addr_buf != NULL) { if (addr != NULL) memcpy(addr_buf, addr, size); *sizep = size; } if (is_leafp != NULL) *is_leafp = is_leaf; WT_REF_SET_STATE(ref, previous_state); } /* * __wt_ref_block_free -- * Free the on-disk block for a reference and clear the address. */ static inline int __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref) { size_t addr_size; const uint8_t *addr; if (ref->addr == NULL) return (0); __wt_ref_info(session, ref, &addr, &addr_size, NULL); WT_RET(__wt_btree_block_free(session, addr, addr_size)); /* Clear the address (so we don't free it twice). */ __wt_ref_addr_free(session, ref); return (0); } /* * __wt_page_del_active -- * Return if a truncate operation is active. */ static inline bool __wt_page_del_active(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) { WT_PAGE_DELETED *page_del; uint8_t prepare_state; if ((page_del = ref->page_del) == NULL) return (false); if (page_del->txnid == WT_TXN_ABORTED) return (false); WT_ORDERED_READ(prepare_state, page_del->prepare_state); if (prepare_state == WT_PREPARE_INPROGRESS || prepare_state == WT_PREPARE_LOCKED) return (true); return (visible_all ? 
!__wt_txn_visible_all(session, page_del->txnid, page_del->timestamp) : !__wt_txn_visible(session, page_del->txnid, page_del->timestamp)); } /* * __wt_page_las_active -- * Return if lookaside data for a page is still required. */ static inline bool __wt_page_las_active(WT_SESSION_IMPL *session, WT_REF *ref) { WT_PAGE_LOOKASIDE *page_las; if ((page_las = ref->page_las) == NULL) return (false); if (page_las->resolved) return (false); if (page_las->min_skipped_ts != WT_TS_MAX || page_las->has_prepares) return (true); return (!__wt_txn_visible_all(session, page_las->max_txn, page_las->max_ondisk_ts)); } /* * __wt_btree_can_evict_dirty -- * Check whether eviction of dirty pages or splits are permitted in the current tree. We cannot * evict dirty pages or split while a checkpoint is in progress, unless the checkpoint thread is * doing the work. Also, during connection close, if we take a checkpoint as of a timestamp, * eviction should not write dirty pages to avoid updates newer than the checkpoint timestamp * leaking to disk. */ static inline bool __wt_btree_can_evict_dirty(WT_SESSION_IMPL *session) { WT_BTREE *btree; btree = S2BT(session); return ((!WT_BTREE_SYNCING(btree) || WT_SESSION_BTREE_SYNC(session)) && !F_ISSET(S2C(session), WT_CONN_CLOSING_TIMESTAMP)); } /* * __wt_leaf_page_can_split -- * Check whether a page can be split in memory. */ static inline bool __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; WT_INSERT *ins; WT_INSERT_HEAD *ins_head; size_t size; int count; btree = S2BT(session); /* * Checkpoints can't do in-memory splits in the tree they are walking: that can lead to * corruption when the parent internal page is updated. */ if (WT_SESSION_BTREE_SYNC(session)) return (false); /* * Only split a page once, otherwise workloads that update in the middle of the page could * continually split without benefit. */ if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) return (false); /* * Check for pages with append-only workloads. A common application pattern is to have multiple * threads frantically appending to the tree. We want to reconcile and evict this page, but we'd * like to do it without making the appending threads wait. See if it's worth doing a split to * let the threads continue before doing eviction. * * Ignore anything other than large, dirty leaf pages. We depend on the page being dirty for * correctness (the page must be reconciled again before being evicted after the split, * information from a previous reconciliation will be wrong, so we can't evict immediately). */ if (page->memory_footprint < btree->splitmempage) return (false); if (WT_PAGE_IS_INTERNAL(page)) return (false); if (!__wt_page_is_modified(page)) return (false); /* * There is no point doing an in-memory split unless there is a lot of data in the last skiplist * on the page. Split if there are enough items and the skiplist does not fit within a single * disk page. */ ins_head = page->type == WT_PAGE_ROW_LEAF ? (page->entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : WT_ROW_INSERT_SLOT(page, page->entries - 1)) : WT_COL_APPEND(page); if (ins_head == NULL) return (false); /* * In the extreme case, where the page is much larger than the maximum size, split as soon as there * are 5 items on the page. 
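 * ("Much larger" here means more than twice btree->maxleafpage.) In the
 * ordinary case, the sampling walk further down visits level
 * WT_MIN_SPLIT_DEPTH of the skiplist, so each node inspected stands in for
 * WT_MIN_SPLIT_MULTIPLIER (16) estimated entries: exceeding the 30-item
 * threshold takes only two sampled nodes, and the estimated data must also
 * exceed a single maximum-size leaf page.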
*/ #define WT_MAX_SPLIT_COUNT 5 if (page->memory_footprint > (size_t)btree->maxleafpage * 2) { for (count = 0, ins = ins_head->head[0]; ins != NULL; ins = ins->next[0]) { if (++count < WT_MAX_SPLIT_COUNT) continue; WT_STAT_CONN_INCR(session, cache_inmem_splittable); WT_STAT_DATA_INCR(session, cache_inmem_splittable); return (true); } return (false); } /* * Rather than scanning the whole list, walk a higher level, which gives a sample of the items -- at * level 0 we have all the items, at level 1 we have 1/4 and at level 2 we have 1/16th. If we see * more than 30 items and more data than would fit in a disk page, split. */ #define WT_MIN_SPLIT_DEPTH 2 #define WT_MIN_SPLIT_COUNT 30 #define WT_MIN_SPLIT_MULTIPLIER 16 /* At level 2, we see 1/16th entries */ for (count = 0, size = 0, ins = ins_head->head[WT_MIN_SPLIT_DEPTH]; ins != NULL; ins = ins->next[WT_MIN_SPLIT_DEPTH]) { count += WT_MIN_SPLIT_MULTIPLIER; size += WT_MIN_SPLIT_MULTIPLIER * (WT_INSERT_KEY_SIZE(ins) + WT_UPDATE_MEMSIZE(ins->upd)); if (count > WT_MIN_SPLIT_COUNT && size > (size_t)btree->maxleafpage) { WT_STAT_CONN_INCR(session, cache_inmem_splittable); WT_STAT_DATA_INCR(session, cache_inmem_splittable); return (true); } } return (false); } /* * __wt_page_evict_retry -- * Avoid busy-spinning attempting to evict the same page all the time. */ static inline bool __wt_page_evict_retry(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_PAGE_MODIFY *mod; WT_TXN_GLOBAL *txn_global; wt_timestamp_t pinned_ts; txn_global = &S2C(session)->txn_global; /* * If the page hasn't been through one round of update/restore, give it a try. */ if ((mod = page->modify) == NULL || !FLD_ISSET(mod->restore_state, WT_PAGE_RS_RESTORED)) return (true); /* * Retry if a reasonable amount of eviction time has passed, the choice of 5 eviction passes as * a reasonable amount of time is currently pretty arbitrary. */ if (__wt_cache_aggressive(session) || mod->last_evict_pass_gen + 5 < S2C(session)->cache->evict_pass_gen) return (true); /* Retry if the global transaction state has moved forward. */ if (txn_global->current == txn_global->oldest_id || mod->last_eviction_id != __wt_txn_oldest_id(session)) return (true); if (mod->last_eviction_timestamp == WT_TS_NONE) return (true); __wt_txn_pinned_timestamp(session, &pinned_ts); if (pinned_ts > mod->last_eviction_timestamp) return (true); return (false); } /* * __wt_page_can_evict -- * Check whether a page can be evicted. */ static inline bool __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) { WT_PAGE *page; WT_PAGE_MODIFY *mod; bool modified; if (inmem_splitp != NULL) *inmem_splitp = false; page = ref->page; mod = page->modify; /* A truncated page can't be evicted until the truncate completes. */ if (__wt_page_del_active(session, ref, true)) return (false); /* Otherwise, never modified pages can always be evicted. */ if (mod == NULL) return (true); /* * We can't split or evict multiblock row-store pages where the parent's key for the page is an * overflow item, because the split into the parent frees the backing blocks for any * no-longer-used overflow keys, which will corrupt the checkpoint's block management. */ if (!__wt_btree_can_evict_dirty(session) && F_ISSET_ATOMIC(ref->home, WT_PAGE_OVERFLOW_KEYS)) { WT_STAT_CONN_INCR(session, cache_eviction_fail_parent_has_overflow_items); return (false); } /* * Check for in-memory splits before other eviction tests. If the page should split in-memory, * return success immediately and skip more detailed eviction tests. 
We don't need further tests * since the page won't be written or discarded from the cache. */ if (__wt_leaf_page_can_split(session, page)) { if (inmem_splitp != NULL) *inmem_splitp = true; return (true); } modified = __wt_page_is_modified(page); /* * If the file is being checkpointed, other threads can't evict dirty pages: if a page is * written and the previous version freed, that previous version might be referenced by an * internal page already written in the checkpoint, leaving the checkpoint inconsistent. */ if (modified && !__wt_btree_can_evict_dirty(session)) { WT_STAT_CONN_INCR(session, cache_eviction_checkpoint); WT_STAT_DATA_INCR(session, cache_eviction_checkpoint); return (false); } /* * If a split created new internal pages, those newly created internal pages cannot be evicted * until all threads are known to have exited the original parent page's index, because evicting * an internal page discards its WT_REF array, and a thread traversing the original parent page * index might see a freed WT_REF. * * One special case where we know this is safe is if the handle is locked exclusive (e.g., when * the whole tree is being evicted). In that case, no readers can be looking at an old index. */ if (WT_PAGE_IS_INTERNAL(page) && !F_ISSET(session->dhandle, WT_DHANDLE_EXCLUSIVE) && __wt_gen_active(session, WT_GEN_SPLIT, page->pg_intl_split_gen)) return (false); /* * If the page is clean but has modifications that appear too new to evict, skip it. */ if (!modified && !__wt_txn_visible_all(session, mod->rec_max_txn, mod->rec_max_timestamp)) { WT_STAT_CONN_INCR(session, cache_eviction_fail_with_newer_modifications_on_a_clean_page); return (false); } return (true); } /* * __wt_page_release -- * Release a reference to a page. */ static inline int __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) { WT_BTREE *btree; WT_PAGE *page; bool inmem_split; btree = S2BT(session); /* * Discard our hazard pointer. Ignore pages we don't have and the root page, which sticks in * memory, regardless. */ if (ref == NULL || ref->page == NULL || __wt_ref_is_root(ref)) return (0); /* * If hazard pointers aren't necessary for this file, we can't be evicting, we're done. */ if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) return (0); /* * Attempt to evict pages with the special "oldest" read generation. This is set for pages that * grow larger than the configured memory_page_max setting, when we see many deleted items, and * when we are attempting to scan without trashing the cache. * * Checkpoint should not queue pages for urgent eviction if they require dirty eviction: there * is a special exemption that allows checkpoint to evict dirty pages in a tree that is being * checkpointed, and no other thread can help with that. Checkpoints don't rely on this code for * dirty eviction: that is handled explicitly in __wt_sync_file. * * If the operation has disabled eviction or splitting, or the session is preventing from * reconciling, then just queue the page for urgent eviction. Otherwise, attempt to release and * evict it. */ page = ref->page; if (WT_READGEN_EVICT_SOON(page->read_gen) && btree->evict_disabled == 0 && __wt_page_can_evict(session, ref, &inmem_split) && (!WT_SESSION_IS_CHECKPOINT(session) || __wt_page_evict_clean(page))) { if (LF_ISSET(WT_READ_NO_EVICT) || (inmem_split ? 
LF_ISSET(WT_READ_NO_SPLIT) : F_ISSET(session, WT_SESSION_NO_RECONCILE))) WT_IGNORE_RET_BOOL(__wt_page_evict_urgent(session, ref)); else { WT_RET_BUSY_OK(__wt_page_release_evict(session, ref, flags)); return (0); } } return (__wt_hazard_clear(session, ref)); } /* * __wt_skip_choose_depth -- * Randomly choose a depth for a skiplist insert. */ static inline u_int __wt_skip_choose_depth(WT_SESSION_IMPL *session) { u_int d; for (d = 1; d < WT_SKIP_MAXDEPTH && __wt_random(&session->rnd) < WT_SKIP_PROBABILITY; d++) ; return (d); } /* * __wt_btree_lsm_over_size -- * Return if the size of an in-memory tree with a single leaf page is over a specified maximum. * If called on anything other than a simple tree with a single leaf page, returns true so our * LSM caller will switch to a new tree. */ static inline bool __wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize) { WT_BTREE *btree; WT_PAGE *child, *root; WT_PAGE_INDEX *pindex; WT_REF *first; btree = S2BT(session); root = btree->root.page; /* Check for a non-existent tree. */ if (root == NULL) return (false); /* A tree that can be evicted always requires a switch. */ if (btree->evict_disabled == 0) return (true); /* Check for a tree with a single leaf page. */ WT_INTL_INDEX_GET(session, root, pindex); if (pindex->entries != 1) /* > 1 child page, switch */ return (true); first = pindex->index[0]; if (first->state != WT_REF_MEM) /* no child page, ignore */ return (false); /* * We're reaching down into the page without a hazard pointer, but that's OK because we know * that no-eviction is set and so the page cannot disappear. */ child = first->page; if (child->type != WT_PAGE_ROW_LEAF) /* not a single leaf page */ return (true); return (child->memory_footprint > maxsize); } /* * __wt_split_descent_race -- * Return if we raced with an internal page split when descending the tree. */ static inline bool __wt_split_descent_race(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX *saved_pindex) { WT_PAGE_INDEX *pindex; /* No test when starting the descent (there's no home to check). */ if (__wt_ref_is_root(ref)) return (false); /* * A place to hang this comment... * * There's a page-split race when we walk the tree: if we're splitting * an internal page into its parent, we update the parent's page index * before updating the split page's page index, and it's not an atomic * update. A thread can read the parent page's original page index and * then read the split page's replacement index. * * For example, imagine a search descending the tree. * * Because internal page splits work by truncating the original page to * the initial part of the original page, the result of this race is we * will have a search key that points past the end of the current page. * This is only an issue when we search past the end of the page, if we * find a WT_REF in the page with the namespace we're searching for, we * don't care if the WT_REF moved or not while we were searching, we * have the correct page. * * For example, imagine an internal page with 3 child pages, with the * namespaces a-f, g-h and i-j; the first child page splits. The parent * starts out with the following page-index: * * | ... | a | g | i | ... | * * which changes to this: * * | ... | a | c | e | g | i | ... 
| * * The child starts out with the following page-index: * * | a | b | c | d | e | f | * * which changes to this: * * | a | b | * * The thread searches the original parent page index for the key "cat", * it couples to the "a" child page; if it uses the replacement child * page index, it will search past the end of the page and couple to the * "b" page, which is wrong. * * To detect the problem, we remember the parent page's page index used * to descend the tree. Whenever we search past the end of a page, we * check to see if the parent's page index has changed since our use of * it during descent. As the problem only appears if we read the split * page's replacement index, the parent page's index must already have * changed, ensuring we detect the problem. * * It's possible for the opposite race to happen (a thread could read * the parent page's replacement page index and then read the split * page's original index). This isn't a problem because internal splits * work by truncating the split page, so the split page search is for * content the split page retains after the split, and we ignore this * race. * * This code is a general purpose check for a descent race and we call * it in other cases, for example, a cursor traversing backwards through * the tree. * * Presumably we acquired a page index on the child page before calling * this code, don't re-order that acquisition with this check. */ WT_BARRIER(); WT_INTL_INDEX_GET(session, ref->home, pindex); return (pindex != saved_pindex); } /* * __wt_page_swap_func -- * Swap one page's hazard pointer for another one when hazard pointer coupling up/down the tree. */ static inline int __wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held, WT_REF *want, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *func, int line #endif ) { WT_DECL_RET; bool acquired; /* * This function is here to simplify the error handling during hazard * pointer coupling so we never leave a hazard pointer dangling. The * assumption is we're holding a hazard pointer on "held", and want to * acquire a hazard pointer on "want", releasing the hazard pointer on * "held" when we're done. * * When walking the tree, we sometimes swap to the same page. Fast-path * that to avoid thinking about error handling. */ if (held == want) return (0); /* Get the wanted page. */ ret = __wt_page_in_func(session, want, flags #ifdef HAVE_DIAGNOSTIC , func, line #endif ); /* * Expected failures: page not found or restart. Our callers list the errors they're expecting * to handle. */ if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND) return (WT_NOTFOUND); if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART) return (WT_RESTART); /* Discard the original held page on either success or error. */ acquired = ret == 0; WT_TRET(__wt_page_release(session, held, flags)); /* Fast-path expected success. */ if (ret == 0) return (0); /* * If there was an error at any point that our caller isn't prepared to handle, discard any page * we acquired. */ if (acquired) WT_TRET(__wt_page_release(session, want, flags)); /* * If we're returning an error, don't let it be one our caller expects to handle as returned by * page-in: the expectation includes the held page not having been released, and that's not the * case. */ if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND) WT_RET_MSG(session, EINVAL, "page-release WT_NOTFOUND error mapped to EINVAL"); if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART) WT_RET_MSG(session, EINVAL, "page-release WT_RESTART error mapped to EINVAL"); return (ret); }
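
/*
 * __example_descend_leftmost --
 *     Illustrative sketch only (not part of the original source): how a caller
 *     might couple hazard pointers down the left-hand edge of the tree with
 *     __wt_page_swap_func, which releases the previously held page on both
 *     success and unexpected error as described above. The function name and
 *     the WT_BTREE_I_EXAMPLES guard are assumptions for illustration and the
 *     sketch assumes the tree is open with its root page in memory; real
 *     descent loops additionally handle WT_RESTART/WT_NOTFOUND (by passing
 *     the WT_READ_*_OK flags) and detect split races with
 *     __wt_split_descent_race.
 */
#ifdef WT_BTREE_I_EXAMPLES
static inline int
__example_descend_leftmost(WT_SESSION_IMPL *session, WT_REF **refp)
{
    WT_DECL_RET;
    WT_PAGE_INDEX *pindex;
    WT_REF *current, *descent;

    *refp = NULL;

    /* The root page is pinned in memory, no hazard pointer is required. */
    current = &S2BT(session)->root;
    for (;;) {
        /* Stop once a leaf page is reached. */
        if (!WT_PAGE_IS_INTERNAL(current->page))
            break;

        /* Pick the leftmost child from the internal page's index. */
        WT_INTL_INDEX_GET(session, current->page, pindex);
        descent = pindex->index[0];

        /*
         * Swap the hazard pointer on the current page for one on the child.
         * No WT_READ_*_OK flags are passed, so there are no "expected" errors
         * handed back to us: on failure, the held hazard pointer has already
         * been released and we can simply return.
         */
        ret = __wt_page_swap_func(session, current, descent, 0
#ifdef HAVE_DIAGNOSTIC
          ,
          __func__, __LINE__
#endif
          );
        WT_RET(ret);
        current = descent;
    }

    /* The caller owns the hazard pointer and must call __wt_page_release. */
    *refp = current;
    return (0);
}
#endif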