/*-
 * Copyright (c) 2014-2016 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *     All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

/*
 * __wt_ref_is_root --
 *     Return if the page reference is for the root page.
 */
static inline bool
__wt_ref_is_root(WT_REF *ref)
{
    return (ref->home == NULL);
}

/*
 * __wt_page_is_empty --
 *     Return if the page is empty.
 */
static inline bool
__wt_page_is_empty(WT_PAGE *page)
{
    return (page->modify != NULL &&
        page->modify->rec_result == WT_PM_REC_EMPTY);
}

/*
 * __wt_page_is_modified --
 *     Return if the page is dirty.
 */
static inline bool
__wt_page_is_modified(WT_PAGE *page)
{
    return (page->modify != NULL && page->modify->write_gen != 0);
}

/*
 * __wt_btree_block_free --
 *     Helper function to free a block from the current tree.
 */
static inline int
__wt_btree_block_free(
    WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
    WT_BM *bm;
    WT_BTREE *btree;

    btree = S2BT(session);
    bm = btree->bm;

    return (bm->free(bm, session, addr, addr_size));
}

/*
 * __wt_btree_bytes_inuse --
 *     Return the number of bytes in use.
 */
static inline uint64_t
__wt_btree_bytes_inuse(WT_SESSION_IMPL *session)
{
    WT_BTREE *btree;
    WT_CACHE *cache;

    btree = S2BT(session);
    cache = S2C(session)->cache;

    return (__wt_cache_bytes_plus_overhead(cache, btree->bytes_inmem));
}

/*
 * __wt_btree_dirty_leaf_inuse --
 *     Return the number of bytes in use by dirty leaf pages.
 */
static inline uint64_t
__wt_btree_dirty_leaf_inuse(WT_SESSION_IMPL *session)
{
    WT_BTREE *btree;
    WT_CACHE *cache;

    btree = S2BT(session);
    cache = S2C(session)->cache;

    return (__wt_cache_bytes_plus_overhead(cache, btree->bytes_dirty_leaf));
}

/*
 * __wt_cache_page_inmem_incr --
 *     Increment a page's memory footprint in the cache.
 */
static inline void
__wt_cache_page_inmem_incr(
    WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
{
    WT_BTREE *btree;
    WT_CACHE *cache;

    WT_ASSERT(session, size < WT_EXABYTE);

    btree = S2BT(session);
    cache = S2C(session)->cache;
    (void)__wt_atomic_add64(&btree->bytes_inmem, size);
    (void)__wt_atomic_add64(&cache->bytes_inmem, size);
    (void)__wt_atomic_addsize(&page->memory_footprint, size);
    if (__wt_page_is_modified(page)) {
        (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size);
        if (WT_PAGE_IS_INTERNAL(page))
            (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size);
        else {
            (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size);
            (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size);
        }
    }
    /* Track internal size in cache. */
    if (WT_PAGE_IS_INTERNAL(page))
        (void)__wt_atomic_add64(&cache->bytes_internal, size);
}

/*
 * __wt_cache_decr_check_size --
 *     Decrement a size_t cache value and check for underflow.
 */
static inline void
__wt_cache_decr_check_size(
    WT_SESSION_IMPL *session, size_t *vp, size_t v, const char *fld)
{
    if (__wt_atomic_subsize(vp, v) < WT_EXABYTE)
        return;

#ifdef HAVE_DIAGNOSTIC
    (void)__wt_atomic_addsize(vp, v);

    {
    static bool first = true;

    if (!first)
        return;
    __wt_errx(session, "%s underflow: decrementing %" WT_SIZET_FMT, fld, v);
    first = false;
    }
#else
    WT_UNUSED(fld);
    WT_UNUSED(session);
#endif
}
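/*
 * Illustrative sketch (not part of WiredTiger): the underflow check the
 * __wt_cache_decr_check_* functions above rely on. The cache counters are
 * unsigned and shared, so a too-large decrement wraps to a huge value rather
 * than going negative; comparing the result against WT_EXABYTE catches the
 * wrap. The __example_ name and the plain (non-atomic) arithmetic standing
 * in for the __wt_atomic_* helpers are assumptions for illustration only.
 */
static inline uint64_t
__example_decr_check(uint64_t *vp, uint64_t v)
{
    *vp -= v;                   /* Stand-in for __wt_atomic_sub64. */
    if (*vp >= WT_EXABYTE)      /* Wrapped around: undo the decrement. */
        *vp += v;
    return (*vp);
}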
/*
 * __wt_cache_decr_check_uint64 --
 *     Decrement a uint64_t cache value and check for underflow.
 */
static inline void
__wt_cache_decr_check_uint64(
    WT_SESSION_IMPL *session, uint64_t *vp, size_t v, const char *fld)
{
    if (__wt_atomic_sub64(vp, v) < WT_EXABYTE)
        return;

#ifdef HAVE_DIAGNOSTIC
    (void)__wt_atomic_add64(vp, v);

    {
    static bool first = true;

    if (!first)
        return;
    __wt_errx(session, "%s underflow: decrementing %" WT_SIZET_FMT, fld, v);
    first = false;
    }
#else
    WT_UNUSED(fld);
    WT_UNUSED(session);
#endif
}

/*
 * __wt_cache_decr_zero_uint64 --
 *     Decrement a uint64_t cache value and zero it on underflow.
 */
static inline void
__wt_cache_decr_zero_uint64(
    WT_SESSION_IMPL *session, uint64_t *vp, size_t v, const char *fld)
{
    if (__wt_atomic_sub64(vp, v) < WT_EXABYTE)
        return;

    __wt_errx(
        session, "%s went negative: decrementing %" WT_SIZET_FMT, fld, v);
    *vp = 0;
}

/*
 * __wt_cache_page_byte_dirty_decr --
 *     Decrement the page's dirty byte count, guarding from underflow.
 */
static inline void
__wt_cache_page_byte_dirty_decr(
    WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
{
    WT_BTREE *btree;
    WT_CACHE *cache;
    size_t decr, orig;
    int i;

    btree = S2BT(session);
    cache = S2C(session)->cache;
    decr = 0;           /* [-Wconditional-uninitialized] */

    /*
     * We don't have exclusive access and there are ways of decrementing the
     * page's dirty byte count by a too-large value. For example:
     *     T1: __wt_cache_page_inmem_incr(page, size)
     *         page is clean, don't increment dirty byte count
     *     T2: mark page dirty
     *     T1: __wt_cache_page_inmem_decr(page, size)
     *         page is dirty, decrement dirty byte count
     * and, of course, the reverse where the page is dirty at the increment
     * and clean at the decrement.
     *
     * The page's dirty-byte value always reflects bytes represented in the
     * cache's dirty-byte count, decrement the page/cache as much as we can
     * without underflow. If we can't decrement the dirty byte counts after
     * a few tries, give up: the cache's value will be wrong, but consistent,
     * and we'll fix it the next time this page is marked clean, or evicted.
     */
    for (i = 0; i < 5; ++i) {
        /*
         * Take care to read the dirty-byte count only once in case
         * we're racing with updates.
         */
        orig = page->modify->bytes_dirty;
        decr = WT_MIN(size, orig);
        if (__wt_atomic_cassize(
            &page->modify->bytes_dirty, orig, orig - decr))
            break;
    }

    if (i == 5)
        return;

    if (WT_PAGE_IS_INTERNAL(page))
        __wt_cache_decr_check_uint64(session,
            &cache->bytes_dirty_intl, decr, "WT_CACHE.bytes_dirty_intl");
    else {
        __wt_cache_decr_check_uint64(session,
            &btree->bytes_dirty_leaf, decr, "WT_BTREE.bytes_dirty_leaf");
        __wt_cache_decr_check_uint64(session,
            &cache->bytes_dirty_leaf, decr, "WT_CACHE.bytes_dirty_leaf");
    }
}

/*
 * __wt_cache_page_inmem_decr --
 *     Decrement a page's memory footprint in the cache.
 */
static inline void
__wt_cache_page_inmem_decr(
    WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
{
    WT_CACHE *cache;

    cache = S2C(session)->cache;

    WT_ASSERT(session, size < WT_EXABYTE);

    __wt_cache_decr_check_uint64(
        session, &S2BT(session)->bytes_inmem, size, "WT_BTREE.bytes_inmem");
    __wt_cache_decr_check_uint64(
        session, &cache->bytes_inmem, size, "WT_CACHE.bytes_inmem");
    __wt_cache_decr_check_size(
        session, &page->memory_footprint, size, "WT_PAGE.memory_footprint");
    if (__wt_page_is_modified(page))
        __wt_cache_page_byte_dirty_decr(session, page, size);
    /* Track internal size in cache. */
    if (WT_PAGE_IS_INTERNAL(page))
        __wt_cache_decr_check_uint64(session,
            &cache->bytes_internal, size, "WT_CACHE.bytes_internal");
}
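/*
 * Illustrative sketch (not part of WiredTiger): the bounded compare-and-swap
 * retry loop used by __wt_cache_page_byte_dirty_decr above. The shared value
 * is read once, the decrement is clamped so it cannot underflow, and the
 * swap is retried a handful of times before giving up so the counters stay
 * consistent even if they drift. The __example_ name and the caller-supplied
 * CAS function (standing in for __wt_atomic_cassize) are assumptions for
 * illustration only.
 */
static inline bool
__example_bounded_decr(uint64_t *vp, uint64_t size,
    bool (*example_cas)(uint64_t *, uint64_t, uint64_t))
{
    uint64_t decr, orig;
    int i;

    for (i = 0; i < 5; ++i) {
        orig = *vp;                     /* Read the value only once. */
        decr = WT_MIN(size, orig);      /* Clamp to avoid underflow. */
        if (example_cas(vp, orig, orig - decr))
            return (true);
    }
    return (false);                     /* Give up, stay consistent. */
}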
/*
 * __wt_cache_dirty_incr --
 *     Page switch from clean to dirty: increment the cache dirty page/byte
 * counts.
 */
static inline void
__wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;
    WT_CACHE *cache;
    size_t size;

    btree = S2BT(session);
    cache = S2C(session)->cache;

    /*
     * Take care to read the memory_footprint once in case we are racing
     * with updates.
     */
    size = page->memory_footprint;
    if (WT_PAGE_IS_INTERNAL(page)) {
        (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size);
        (void)__wt_atomic_add64(&cache->pages_dirty_intl, 1);
    } else {
        (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size);
        (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size);
        (void)__wt_atomic_add64(&cache->pages_dirty_leaf, 1);
    }
    (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size);
}

/*
 * __wt_cache_dirty_decr --
 *     Page switch from dirty to clean: decrement the cache dirty page/byte
 * counts.
 */
static inline void
__wt_cache_dirty_decr(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_CACHE *cache;
    WT_PAGE_MODIFY *modify;

    cache = S2C(session)->cache;

    if (WT_PAGE_IS_INTERNAL(page))
        __wt_cache_decr_zero_uint64(session,
            &cache->pages_dirty_intl, 1, "dirty internal page count");
    else
        __wt_cache_decr_zero_uint64(session,
            &cache->pages_dirty_leaf, 1, "dirty leaf page count");

    modify = page->modify;
    if (modify != NULL && modify->bytes_dirty != 0)
        __wt_cache_page_byte_dirty_decr(
            session, page, modify->bytes_dirty);
}

/*
 * __wt_cache_page_image_decr --
 *     Decrement a page image's size in the cache.
 */
static inline void
__wt_cache_page_image_decr(WT_SESSION_IMPL *session, uint32_t size)
{
    WT_CACHE *cache;

    cache = S2C(session)->cache;

    __wt_cache_decr_check_uint64(
        session, &cache->bytes_image, size, "WT_CACHE.image_inmem");
}

/*
 * __wt_cache_page_image_incr --
 *     Increment a page image's size in the cache.
 */
static inline void
__wt_cache_page_image_incr(WT_SESSION_IMPL *session, uint32_t size)
{
    WT_CACHE *cache;

    cache = S2C(session)->cache;
    (void)__wt_atomic_add64(&cache->bytes_image, size);
}

/*
 * __wt_cache_page_evict --
 *     Evict pages from the cache.
 */
static inline void
__wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;
    WT_CACHE *cache;
    WT_PAGE_MODIFY *modify;

    btree = S2BT(session);
    cache = S2C(session)->cache;
    modify = page->modify;

    /* Update the bytes in-memory to reflect the eviction. */
    __wt_cache_decr_check_uint64(session, &S2BT(session)->bytes_inmem,
        page->memory_footprint, "WT_BTREE.bytes_inmem");
    __wt_cache_decr_check_uint64(session, &cache->bytes_inmem,
        page->memory_footprint, "WT_CACHE.bytes_inmem");

    /* Update the bytes_internal value to reflect the eviction. */
    if (WT_PAGE_IS_INTERNAL(page))
        __wt_cache_decr_check_uint64(session, &cache->bytes_internal,
            page->memory_footprint, "WT_CACHE.bytes_internal");

    /* Update the cache's dirty-byte count. */
    if (modify != NULL && modify->bytes_dirty != 0) {
        if (WT_PAGE_IS_INTERNAL(page))
            __wt_cache_decr_zero_uint64(session,
                &cache->bytes_dirty_intl,
                modify->bytes_dirty, "WT_CACHE.bytes_dirty_intl");
        else {
            __wt_cache_decr_zero_uint64(session,
                &cache->bytes_dirty_leaf,
                modify->bytes_dirty, "WT_CACHE.bytes_dirty_leaf");
            __wt_cache_decr_zero_uint64(session,
                &btree->bytes_dirty_leaf,
                modify->bytes_dirty, "WT_BTREE.bytes_dirty_leaf");
        }
    }

    /* Update pages and bytes evicted. */
    (void)__wt_atomic_add64(&cache->bytes_evict, page->memory_footprint);
    (void)__wt_atomic_add64(&cache->pages_evict, 1);
}

/*
 * __wt_update_list_memsize --
 *     The size in memory of a list of updates.
 */
static inline size_t
__wt_update_list_memsize(WT_UPDATE *upd)
{
    size_t upd_size;

    for (upd_size = 0; upd != NULL; upd = upd->next)
        upd_size += WT_UPDATE_MEMSIZE(upd);

    return (upd_size);
}

/*
 * __wt_page_modify_init --
 *     A page is about to be modified, allocate the modification structure.
 */
static inline int
__wt_page_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    return (page->modify == NULL ?
        __wt_page_modify_alloc(session, page) : 0);
}

/*
 * __wt_page_only_modify_set --
 *     Mark the page (but only the page) dirty.
 */
static inline void
__wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    uint64_t last_running;

    WT_ASSERT(session, !F_ISSET(session->dhandle, WT_DHANDLE_DEAD));

    last_running = 0;
    if (page->modify->write_gen == 0)
        last_running = S2C(session)->txn_global.last_running;

    /*
     * We depend on atomic-add being a write barrier, that is, a barrier to
     * ensure all changes to the page are flushed before updating the page
     * write generation and/or marking the tree dirty, otherwise checkpoints
     * and/or page reconciliation might be looking at a clean page/tree.
     *
     * Every time the page transitions from clean to dirty, update the cache
     * and transactional information.
     */
    if (__wt_atomic_add32(&page->modify->write_gen, 1) == 1) {
        __wt_cache_dirty_incr(session, page);

        /*
         * We won the race to dirty the page, but another thread could
         * have committed in the meantime, and the last_running field
         * been updated past it. That is all very unlikely, but not
         * impossible, so we take care to read the global state before
         * the atomic increment.
         *
         * If the page was dirty on entry, then last_running == 0. The
         * page could have become clean since then, if reconciliation
         * completed. In that case, we leave the previous value for
         * first_dirty_txn rather than potentially racing to update it,
         * at worst, we'll unnecessarily write a page in a checkpoint.
         */
        if (last_running != 0)
            page->modify->first_dirty_txn = last_running;
    }

    /* Check if this is the largest transaction ID to update the page. */
    if (WT_TXNID_LT(page->modify->update_txn, session->txn.id))
        page->modify->update_txn = session->txn.id;
}

/*
 * __wt_page_modify_clear --
 *     Clean a modified page.
 */
static inline void
__wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    /*
     * The page must be held exclusive when this call is made, this call
     * can only be used when the page is owned by a single thread.
     *
     * Allow the call to be made on clean pages.
     */
    if (__wt_page_is_modified(page)) {
        page->modify->write_gen = 0;
        __wt_cache_dirty_decr(session, page);
    }
}
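/*
 * Illustrative sketch (not part of WiredTiger): how the write generation used
 * by __wt_page_only_modify_set and __wt_page_modify_clear above distinguishes
 * the clean-to-dirty transition from later modifications. Only the thread
 * whose increment moves the counter from 0 to 1 does the one-time dirty
 * accounting; clearing the page sets the counter back to 0. The __example_
 * name and the plain increment standing in for __wt_atomic_add32 are
 * assumptions for illustration only.
 */
static inline bool
__example_mark_dirty(uint32_t *write_genp)
{
    /* True only for the single thread that dirtied a clean page. */
    return (++*write_genp == 1);    /* Stand-in for __wt_atomic_add32. */
}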
/*
 * __wt_page_modify_set --
 *     Mark the page and tree dirty.
 */
static inline void
__wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    /*
     * Mark the tree dirty (even if the page is already marked dirty), newly
     * created pages to support "empty" files are dirty, but the file isn't
     * marked dirty until there's a real change needing to be written. Test
     * before setting the dirty flag, it's a hot cache line.
     *
     * The tree's modified flag is cleared by the checkpoint thread: set it
     * and insert a barrier before dirtying the page. (I don't think it's
     * a problem if the tree is marked dirty with all the pages clean, it
     * might result in an extra checkpoint that doesn't do any work but it
     * shouldn't cause problems; regardless, let's play it safe.)
     */
    if (S2BT(session)->modified == 0) {
        /* Assert we never dirty a checkpoint handle. */
        WT_ASSERT(session, session->dhandle->checkpoint == NULL);

        S2BT(session)->modified = 1;
        WT_FULL_BARRIER();
    }

    __wt_page_only_modify_set(session, page);
}

/*
 * __wt_page_parent_modify_set --
 *     Mark the parent page, and optionally the tree, dirty.
 */
static inline int
__wt_page_parent_modify_set(
    WT_SESSION_IMPL *session, WT_REF *ref, bool page_only)
{
    WT_PAGE *parent;

    /*
     * This function exists as a place to stash this comment. There are a
     * few places where we need to dirty a page's parent. The trick is the
     * page's parent might split at any point, and the page parent might be
     * the wrong parent at any particular time. We ignore this and dirty
     * whatever page the page's reference structure points to. This is safe
     * because if we're pointing to the wrong parent, that parent must have
     * split, deepening the tree, which implies marking the original parent
     * and all of the newly-created children as dirty. In other words, if
     * we have the wrong parent page, everything was marked dirty already.
     */
    parent = ref->home;
    WT_RET(__wt_page_modify_init(session, parent));
    if (page_only)
        __wt_page_only_modify_set(session, parent);
    else
        __wt_page_modify_set(session, parent);
    return (0);
}

/*
 * __wt_off_page --
 *     Return if a pointer references off-page data.
 */
static inline bool
__wt_off_page(WT_PAGE *page, const void *p)
{
    /*
     * There may be no underlying page, in which case the reference is
     * off-page by definition.
     */
    return (page->dsk == NULL ||
        p < (void *)page->dsk ||
        p >= (void *)((uint8_t *)page->dsk + page->dsk->mem_size));
}

/*
 * __wt_ref_addr_free --
 *     Free the address in a reference, if necessary.
 */
static inline void
__wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref)
{
    if (ref->addr == NULL)
        return;

    if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) {
        __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
        __wt_free(session, ref->addr);
    }
    ref->addr = NULL;
}

/*
 * __wt_ref_key --
 *     Return a reference to a row-store internal page key as cheaply as
 * possible.
 */
static inline void
__wt_ref_key(WT_PAGE *page, WT_REF *ref, void *keyp, size_t *sizep)
{
    uintptr_t v;

    /*
     * An internal page key is in one of two places: if we instantiated the
     * key (for example, when reading the page), WT_REF.ref_ikey references
     * a WT_IKEY structure, otherwise WT_REF.ref_ikey references an on-page
     * key offset/length pair.
     *
     * Now the magic: allocated memory must be aligned to store any standard
     * type, and we expect some standard type to require at least quad-byte
     * alignment, so allocated memory should have some clear low-order bits.
     * On-page objects consist of an offset/length pair: the maximum page
     * size currently fits into 29 bits, so we use the low-order bits of the
     * pointer to mark the other bits of the pointer as encoding the key's
     * location and length. This breaks if allocated memory isn't aligned,
     * of course.
     *
     * In this specific case, we use bit 0x01 to mark an on-page key, else
     * it's a WT_IKEY reference.
     * The bit pattern for internal row-store on-page keys is:
     *     32 bits    key length
     *     31 bits    page offset of the key's bytes,
     *      1 bit     flags
     */
#define WT_IK_FLAG                      0x01
#define WT_IK_ENCODE_KEY_LEN(v)         ((uintptr_t)(v) << 32)
#define WT_IK_DECODE_KEY_LEN(v)         ((v) >> 32)
#define WT_IK_ENCODE_KEY_OFFSET(v)      ((uintptr_t)(v) << 1)
#define WT_IK_DECODE_KEY_OFFSET(v)      (((v) & 0xFFFFFFFF) >> 1)
    v = (uintptr_t)ref->ref_ikey;
    if (v & WT_IK_FLAG) {
        *(void **)keyp =
            WT_PAGE_REF_OFFSET(page, WT_IK_DECODE_KEY_OFFSET(v));
        *sizep = WT_IK_DECODE_KEY_LEN(v);
    } else {
        *(void **)keyp = WT_IKEY_DATA(ref->ref_ikey);
        *sizep = ((WT_IKEY *)ref->ref_ikey)->size;
    }
}

/*
 * __wt_ref_key_onpage_set --
 *     Set a WT_REF to reference an on-page key.
 */
static inline void
__wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK *unpack)
{
    uintptr_t v;

    /*
     * See the comment in __wt_ref_key for an explanation of the magic.
     */
    v = WT_IK_ENCODE_KEY_LEN(unpack->size) |
        WT_IK_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) |
        WT_IK_FLAG;
    ref->ref_ikey = (void *)v;
}

/*
 * __wt_ref_key_instantiated --
 *     Return if a WT_REF key is instantiated.
 */
static inline WT_IKEY *
__wt_ref_key_instantiated(WT_REF *ref)
{
    uintptr_t v;

    /*
     * See the comment in __wt_ref_key for an explanation of the magic.
     */
    v = (uintptr_t)ref->ref_ikey;
    return (v & WT_IK_FLAG ? NULL : ref->ref_ikey);
}

/*
 * __wt_ref_key_clear --
 *     Clear a WT_REF key.
 */
static inline void
__wt_ref_key_clear(WT_REF *ref)
{
    /*
     * The key union has 2 8B fields; this is equivalent to:
     *
     *     ref->ref_recno = WT_RECNO_OOB;
     *     ref->ref_ikey = NULL;
     */
    ref->ref_recno = 0;
}
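/*
 * Illustrative sketch (not part of WiredTiger): round-tripping an on-page key
 * through the WT_IK_* encoding defined in __wt_ref_key above. Because an
 * allocated WT_IKEY pointer is aligned, its low-order bit is clear, so a
 * value with bit 0x01 set can be safely distinguished from a real pointer.
 * The __example_ name and the sample length/offset values are assumptions
 * for illustration only; like the encoding itself, the sketch assumes 64-bit
 * pointers.
 */
static inline bool
__example_ik_roundtrip(void)
{
    uintptr_t v;
    uint32_t key_len, key_offset;

    key_len = 17;                   /* Arbitrary example values. */
    key_offset = 96;

    /* Pack the length, offset and flag bit into one word. */
    v = WT_IK_ENCODE_KEY_LEN(key_len) |
        WT_IK_ENCODE_KEY_OFFSET(key_offset) | WT_IK_FLAG;

    /* Unpack and confirm nothing was lost. */
    return ((v & WT_IK_FLAG) != 0 &&
        WT_IK_DECODE_KEY_LEN(v) == key_len &&
        WT_IK_DECODE_KEY_OFFSET(v) == key_offset);
}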
/*
 * __wt_row_leaf_key_info --
 *     Return a row-store leaf page key referenced by a WT_ROW if it can be
 * had without unpacking a cell, and information about the cell, if the key
 * isn't cheaply available.
 */
static inline bool
__wt_row_leaf_key_info(WT_PAGE *page, void *copy,
    WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep)
{
    WT_IKEY *ikey;
    uintptr_t v;

    v = (uintptr_t)copy;

    /*
     * A row-store leaf page key is in one of two places: if instantiated,
     * the WT_ROW pointer references a WT_IKEY structure, otherwise, it
     * references an on-page offset. Further, on-page keys are in one of
     * two states: if the key is a simple key (not an overflow key, prefix
     * compressed or Huffman encoded, all of which are likely), the key's
     * offset/size is encoded in the pointer. Otherwise, the offset is to
     * the key's on-page cell.
     *
     * Now the magic: allocated memory must be aligned to store any standard
     * type, and we expect some standard type to require at least quad-byte
     * alignment, so allocated memory should have some clear low-order bits.
     * On-page objects consist of an offset/length pair: the maximum page
     * size currently fits into 29 bits, so we use the low-order bits of the
     * pointer to mark the other bits of the pointer as encoding the key's
     * location and length. This breaks if allocated memory isn't aligned,
     * of course.
     *
     * In this specific case, we use bit 0x01 to mark an on-page cell, bit
     * 0x02 to mark an on-page key, 0x03 to mark an on-page key/value pair,
     * otherwise it's a WT_IKEY reference. The bit pattern for on-page cells
     * is:
     *     29 bits    page offset of the key's cell,
     *      2 bits    flags
     *
     * The bit pattern for on-page keys is:
     *     32 bits    key length,
     *     29 bits    page offset of the key's bytes,
     *      2 bits    flags
     *
     * But, while that allows us to skip decoding simple key cells, we also
     * want to skip decoding the value cell in the case where the value cell
     * is also simple/short. We use bit 0x03 to mark an encoded on-page key
     * and value pair. The bit pattern for on-page key/value pairs is:
     *      9 bits    key length,
     *     13 bits    value length,
     *     20 bits    page offset of the key's bytes,
     *     20 bits    page offset of the value's bytes,
     *      2 bits    flags
     *
     * These bit patterns are in-memory only, of course, so can be modified
     * (we could even tune for specific workloads). Generally, the fields
     * are larger than the anticipated values being stored (512B keys, 8KB
     * values, 1MB pages), hopefully that won't be necessary.
     *
     * This function returns a list of things about the key (instantiation
     * reference, cell reference and key/length pair). Our callers know
     * the order in which we look things up and the information returned;
     * for example, the cell will never be returned if we are working with
     * an on-page key.
     */
#define WT_CELL_FLAG                    0x01
#define WT_CELL_ENCODE_OFFSET(v)        ((uintptr_t)(v) << 2)
#define WT_CELL_DECODE_OFFSET(v)        (((v) & 0xFFFFFFFF) >> 2)

#define WT_K_FLAG                       0x02
#define WT_K_ENCODE_KEY_LEN(v)          ((uintptr_t)(v) << 32)
#define WT_K_DECODE_KEY_LEN(v)          ((v) >> 32)
#define WT_K_ENCODE_KEY_OFFSET(v)       ((uintptr_t)(v) << 2)
#define WT_K_DECODE_KEY_OFFSET(v)       (((v) & 0xFFFFFFFF) >> 2)

#define WT_KV_FLAG                      0x03
#define WT_KV_ENCODE_KEY_LEN(v)         ((uintptr_t)(v) << 55)
#define WT_KV_DECODE_KEY_LEN(v)         ((v) >> 55)
#define WT_KV_MAX_KEY_LEN               (0x200 - 1)
#define WT_KV_ENCODE_VALUE_LEN(v)       ((uintptr_t)(v) << 42)
#define WT_KV_DECODE_VALUE_LEN(v)       (((v) & 0x007FFC0000000000) >> 42)
#define WT_KV_MAX_VALUE_LEN             (0x2000 - 1)
#define WT_KV_ENCODE_KEY_OFFSET(v)      ((uintptr_t)(v) << 22)
#define WT_KV_DECODE_KEY_OFFSET(v)      (((v) & 0x000003FFFFC00000) >> 22)
#define WT_KV_MAX_KEY_OFFSET            (0x100000 - 1)
#define WT_KV_ENCODE_VALUE_OFFSET(v)    ((uintptr_t)(v) << 2)
#define WT_KV_DECODE_VALUE_OFFSET(v)    (((v) & 0x00000000003FFFFC) >> 2)
#define WT_KV_MAX_VALUE_OFFSET          (0x100000 - 1)
    switch (v & 0x03) {
    case WT_CELL_FLAG:
        /* On-page cell: no instantiated key. */
        if (ikeyp != NULL)
            *ikeyp = NULL;
        if (cellp != NULL)
            *cellp =
                WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v));
        return (false);
    case WT_K_FLAG:
        /* Encoded key: no instantiated key, no cell. */
        if (cellp != NULL)
            *cellp = NULL;
        if (ikeyp != NULL)
            *ikeyp = NULL;
        if (datap != NULL) {
            *(void **)datap =
                WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v));
            *sizep = WT_K_DECODE_KEY_LEN(v);
            return (true);
        }
        return (false);
    case WT_KV_FLAG:
        /* Encoded key/value pair: no instantiated key, no cell. */
        if (cellp != NULL)
            *cellp = NULL;
        if (ikeyp != NULL)
            *ikeyp = NULL;
        if (datap != NULL) {
            *(void **)datap = WT_PAGE_REF_OFFSET(
                page, WT_KV_DECODE_KEY_OFFSET(v));
            *sizep = WT_KV_DECODE_KEY_LEN(v);
            return (true);
        }
        return (false);
    }

    /* Instantiated key. */
    ikey = copy;
    if (ikeyp != NULL)
        *ikeyp = copy;
    if (cellp != NULL)
        *cellp = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
    if (datap != NULL) {
        *(void **)datap = WT_IKEY_DATA(ikey);
        *sizep = ikey->size;
        return (true);
    }
    return (false);
}

/*
 * __wt_row_leaf_key_set_cell --
 *     Set a WT_ROW to reference an on-page row-store leaf cell.
 */
static inline void
__wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell)
{
    uintptr_t v;

    /*
     * See the comment in __wt_row_leaf_key_info for an explanation of the
     * magic.
     */
    v = WT_CELL_ENCODE_OFFSET(WT_PAGE_DISK_OFFSET(page, cell)) |
        WT_CELL_FLAG;
    WT_ROW_KEY_SET(rip, v);
}
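/*
 * Illustrative sketch (not part of WiredTiger): packing a short key/value
 * pair into a single tagged word with the WT_KV_* macros defined in
 * __wt_row_leaf_key_info above (9-bit key length, 13-bit value length, two
 * 20-bit page offsets and 2 flag bits, 64 bits in total). The __example_
 * name and the sample lengths/offsets are assumptions for illustration only.
 */
static inline bool
__example_kv_roundtrip(void)
{
    uintptr_t v;

    /* Pack a 21-byte key at offset 64 and a 100-byte value at offset 256. */
    v = WT_KV_ENCODE_KEY_LEN(21) | WT_KV_ENCODE_VALUE_LEN(100) |
        WT_KV_ENCODE_KEY_OFFSET(64) | WT_KV_ENCODE_VALUE_OFFSET(256) |
        WT_KV_FLAG;

    /* Unpack and confirm every field survives the round trip. */
    return ((v & 0x03) == WT_KV_FLAG &&
        WT_KV_DECODE_KEY_LEN(v) == 21 &&
        WT_KV_DECODE_VALUE_LEN(v) == 100 &&
        WT_KV_DECODE_KEY_OFFSET(v) == 64 &&
        WT_KV_DECODE_VALUE_OFFSET(v) == 256);
}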
/*
 * __wt_row_leaf_key_set --
 *     Set a WT_ROW to reference an on-page row-store leaf key.
 */
static inline void
__wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
{
    uintptr_t v;

    /*
     * See the comment in __wt_row_leaf_key_info for an explanation of the
     * magic.
     */
    v = WT_K_ENCODE_KEY_LEN(unpack->size) |
        WT_K_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) |
        WT_K_FLAG;
    WT_ROW_KEY_SET(rip, v);
}

/*
 * __wt_row_leaf_value_set --
 *     Set a WT_ROW to reference an on-page row-store leaf value.
 */
static inline void
__wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
{
    uintptr_t key_len, key_offset, value_offset, v;

    v = (uintptr_t)WT_ROW_KEY_COPY(rip);

    /*
     * See the comment in __wt_row_leaf_key_info for an explanation of the
     * magic.
     */
    if (!(v & WT_K_FLAG))                   /* Already an encoded key */
        return;

    key_len = WT_K_DECODE_KEY_LEN(v);       /* Key length */
    if (key_len > WT_KV_MAX_KEY_LEN)
        return;
    if (unpack->size > WT_KV_MAX_VALUE_LEN) /* Value length */
        return;

    key_offset = WT_K_DECODE_KEY_OFFSET(v); /* Page offsets */
    if (key_offset > WT_KV_MAX_KEY_OFFSET)
        return;
    value_offset = WT_PAGE_DISK_OFFSET(page, unpack->data);
    if (value_offset > WT_KV_MAX_VALUE_OFFSET)
        return;

    v = WT_KV_ENCODE_KEY_LEN(key_len) |
        WT_KV_ENCODE_VALUE_LEN(unpack->size) |
        WT_KV_ENCODE_KEY_OFFSET(key_offset) |
        WT_KV_ENCODE_VALUE_OFFSET(value_offset) | WT_KV_FLAG;
    WT_ROW_KEY_SET(rip, v);
}

/*
 * __wt_row_leaf_key --
 *     Set a buffer to reference a row-store leaf page key as cheaply as
 * possible.
 */
static inline int
__wt_row_leaf_key(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_ROW *rip, WT_ITEM *key, bool instantiate)
{
    void *copy;

    /*
     * A front-end for __wt_row_leaf_key_work, here to inline fast paths.
     *
     * The row-store key can change underfoot; explicitly take a copy.
     */
    copy = WT_ROW_KEY_COPY(rip);

    /*
     * All we handle here are on-page keys (which should be a common case),
     * and instantiated keys (which start out rare, but become more common
     * as a leaf page is searched, instantiating prefix-compressed keys).
     */
    if (__wt_row_leaf_key_info(
        page, copy, NULL, NULL, &key->data, &key->size))
        return (0);

    /*
     * The alternative is an on-page cell with some kind of compressed or
     * overflow key that's never been instantiated. Call the underlying
     * worker function to figure it out.
     */
    return (__wt_row_leaf_key_work(session, page, rip, key, instantiate));
}

/*
 * __wt_cursor_row_leaf_key --
 *     Set a buffer to reference a cursor-referenced row-store leaf page key.
 */
static inline int
__wt_cursor_row_leaf_key(WT_CURSOR_BTREE *cbt, WT_ITEM *key)
{
    WT_PAGE *page;
    WT_ROW *rip;
    WT_SESSION_IMPL *session;

    /*
     * If the cursor references a WT_INSERT item, take the key from there,
     * else take the key from the original page.
     */
    if (cbt->ins == NULL) {
        session = (WT_SESSION_IMPL *)cbt->iface.session;
        page = cbt->ref->page;
        rip = &page->u.row.d[cbt->slot];
        WT_RET(__wt_row_leaf_key(session, page, rip, key, false));
    } else {
        key->data = WT_INSERT_KEY(cbt->ins);
        key->size = WT_INSERT_KEY_SIZE(cbt->ins);
    }
    return (0);
}

/*
 * __wt_row_leaf_value_cell --
 *     Return a pointer to the value cell for a row-store leaf page key, or
 * NULL if there isn't one.
 */
static inline WT_CELL *
__wt_row_leaf_value_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack)
{
    WT_CELL *kcell, *vcell;
    WT_CELL_UNPACK unpack;
    void *copy, *key;
    size_t size;

    /* If we already have an unpacked key cell, use it. */
    if (kpack != NULL)
        vcell = (WT_CELL *)
            ((uint8_t *)kpack->cell + __wt_cell_total_len(kpack));
    else {
        /*
         * The row-store key can change underfoot; explicitly take a
         * copy.
         */
        copy = WT_ROW_KEY_COPY(rip);

        /*
         * Figure out where the key is, step past it to the value cell.
         * The test for a cell not being set tells us that we have an
         * on-page key, otherwise we're looking at an instantiated key
         * or on-page cell, both of which require an unpack of the key's
         * cell to find the value cell that follows.
         */
        if (__wt_row_leaf_key_info(
            page, copy, NULL, &kcell, &key, &size) && kcell == NULL)
            vcell = (WT_CELL *)((uint8_t *)key + size);
        else {
            __wt_cell_unpack(kcell, &unpack);
            vcell = (WT_CELL *)((uint8_t *)
                unpack.cell + __wt_cell_total_len(&unpack));
        }
    }

    return (__wt_cell_leaf_value_parse(page, vcell));
}

/*
 * __wt_row_leaf_value --
 *     Return the value for a row-store leaf page encoded key/value pair.
 */
static inline bool
__wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
{
    uintptr_t v;

    /* The row-store key can change underfoot; explicitly take a copy. */
    v = (uintptr_t)WT_ROW_KEY_COPY(rip);

    /*
     * See the comment in __wt_row_leaf_key_info for an explanation of the
     * magic.
     */
    if ((v & 0x03) == WT_KV_FLAG) {
        value->data =
            WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v));
        value->size = WT_KV_DECODE_VALUE_LEN(v);
        return (true);
    }
    return (false);
}

/*
 * __wt_ref_info --
 *     Return the addr/size and type triplet for a reference.
 */
static inline void
__wt_ref_info(WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep)
{
    WT_ADDR *addr;
    WT_CELL_UNPACK *unpack, _unpack;

    addr = ref->addr;
    unpack = &_unpack;

    /*
     * If NULL, there is no location.
     * If off-page, the pointer references a WT_ADDR structure.
     * If on-page, the pointer references a cell.
     *
     * The type is of a limited set: internal, leaf or no-overflow leaf.
     */
    if (addr == NULL) {
        *addrp = NULL;
        *sizep = 0;
        if (typep != NULL)
            *typep = 0;
    } else if (__wt_off_page(ref->home, addr)) {
        *addrp = addr->addr;
        *sizep = addr->size;
        if (typep != NULL)
            switch (addr->type) {
            case WT_ADDR_INT:
                *typep = WT_CELL_ADDR_INT;
                break;
            case WT_ADDR_LEAF:
                *typep = WT_CELL_ADDR_LEAF;
                break;
            case WT_ADDR_LEAF_NO:
                *typep = WT_CELL_ADDR_LEAF_NO;
                break;
            default:
                *typep = 0;
                break;
            }
    } else {
        __wt_cell_unpack((WT_CELL *)addr, unpack);
        *addrp = unpack->data;
        *sizep = unpack->size;
        if (typep != NULL)
            *typep = unpack->type;
    }
}

/*
 * __wt_ref_block_free --
 *     Free the on-disk block for a reference and clear the address.
 */
static inline int
__wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref)
{
    const uint8_t *addr;
    size_t addr_size;

    if (ref->addr == NULL)
        return (0);

    __wt_ref_info(ref, &addr, &addr_size, NULL);
    WT_RET(__wt_btree_block_free(session, addr, addr_size));

    /* Clear the address (so we don't free it twice). */
    __wt_ref_addr_free(session, ref);
    return (0);
}

/*
 * __wt_leaf_page_can_split --
 *     Check whether a page can be split in memory.
 */
static inline bool
__wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;
    WT_INSERT_HEAD *ins_head;
    WT_INSERT *ins;
    size_t size;
    int count;

    btree = S2BT(session);

    /*
     * Only split a page once, otherwise workloads that update in the middle
     * of the page could continually split without benefit.
     */
    if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
        return (false);

    /*
     * Check for pages with append-only workloads. A common application
     * pattern is to have multiple threads frantically appending to the
     * tree. We want to reconcile and evict this page, but we'd like to
     * do it without making the appending threads wait. See if it's worth
     * doing a split to let the threads continue before doing eviction.
     *
     * Ignore anything other than large, dirty leaf pages.
     * We depend on the page being dirty for correctness (the page must be
     * reconciled again before being evicted after the split, information
     * from a previous reconciliation will be wrong, so we can't evict
     * immediately).
     */
    if (page->memory_footprint < btree->splitmempage)
        return (false);
    if (WT_PAGE_IS_INTERNAL(page))
        return (false);
    if (!__wt_page_is_modified(page))
        return (false);

    /*
     * There is no point doing an in-memory split unless there is a lot of
     * data in the last skiplist on the page. Split if there are enough
     * items and the skiplist does not fit within a single disk page.
     *
     * Rather than scanning the whole list, walk a higher level, which
     * gives a sample of the items -- at level 0 we have all the items, at
     * level 1 we have 1/4 and at level 2 we have 1/16th. If we see more
     * than 30 items and more data than would fit in a disk page, split.
     */
#define WT_MIN_SPLIT_DEPTH      2
#define WT_MIN_SPLIT_COUNT      30
#define WT_MIN_SPLIT_MULTIPLIER 16  /* At level 2, we see 1/16th entries */

    ins_head = page->type == WT_PAGE_ROW_LEAF ?
        (page->pg_row_entries == 0 ?
        WT_ROW_INSERT_SMALLEST(page) :
        WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1)) :
        WT_COL_APPEND(page);
    if (ins_head == NULL)
        return (false);
    for (count = 0, size = 0, ins = ins_head->head[WT_MIN_SPLIT_DEPTH];
        ins != NULL; ins = ins->next[WT_MIN_SPLIT_DEPTH]) {
        count += WT_MIN_SPLIT_MULTIPLIER;
        size += WT_MIN_SPLIT_MULTIPLIER *
            (WT_INSERT_KEY_SIZE(ins) + WT_UPDATE_MEMSIZE(ins->upd));
        if (count > WT_MIN_SPLIT_COUNT &&
            size > (size_t)btree->maxleafpage) {
            WT_STAT_FAST_CONN_INCR(session, cache_inmem_splittable);
            WT_STAT_FAST_DATA_INCR(session, cache_inmem_splittable);
            return (true);
        }
    }
    return (false);
}
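/*
 * Illustrative sketch (not part of WiredTiger): the estimate behind the
 * skiplist sampling in __wt_leaf_page_can_split above. Walking level
 * WT_MIN_SPLIT_DEPTH visits roughly one node in sixteen, so each sampled
 * node stands for WT_MIN_SPLIT_MULTIPLIER items and bytes when deciding
 * whether the insert list is large enough to justify an in-memory split.
 * The __example_ name and its arguments are assumptions for illustration
 * only.
 */
static inline bool
__example_split_worthwhile(
    uint64_t sampled_count, uint64_t sampled_bytes, uint64_t maxleafpage)
{
    /* Scale the level-2 sample back up to an estimate of the full list. */
    return (sampled_count * WT_MIN_SPLIT_MULTIPLIER > WT_MIN_SPLIT_COUNT &&
        sampled_bytes * WT_MIN_SPLIT_MULTIPLIER > maxleafpage);
}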
/*
 * __wt_page_can_evict --
 *     Check whether a page can be evicted.
 */
static inline bool
__wt_page_can_evict(
    WT_SESSION_IMPL *session, WT_REF *ref, uint32_t *evict_flagsp)
{
    WT_BTREE *btree;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    bool modified;

    btree = S2BT(session);
    page = ref->page;
    mod = page->modify;

    /* Pages that have never been modified can always be evicted. */
    if (mod == NULL)
        return (true);

    /*
     * Check for in-memory splits before other eviction tests. If the page
     * should split in-memory, return success immediately and skip more
     * detailed eviction tests. We don't need further tests since the page
     * won't be written or discarded from the cache.
     */
    if (__wt_leaf_page_can_split(session, page)) {
        if (evict_flagsp != NULL)
            FLD_SET(*evict_flagsp, WT_EVICT_INMEM_SPLIT);
        return (true);
    }

    modified = __wt_page_is_modified(page);

    /*
     * If the file is being checkpointed, we can't evict dirty pages:
     * if we write a page and free the previous version of the page, that
     * previous version might be referenced by an internal page that has
     * already been written in the checkpoint, leaving the checkpoint
     * inconsistent.
     */
    if (modified && btree->checkpointing != WT_CKPT_OFF) {
        WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
        WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint);
        return (false);
    }

    /*
     * We can't evict clean, multiblock row-store pages where the parent's
     * key for the page is an overflow item, because the split into the
     * parent frees the backing blocks for any no-longer-used overflow keys,
     * which will corrupt the checkpoint's block management.
     */
    if (btree->checkpointing != WT_CKPT_OFF &&
        F_ISSET_ATOMIC(ref->home, WT_PAGE_OVERFLOW_KEYS))
        return (false);

    /*
     * If a split created new internal pages, those newly created internal
     * pages cannot be evicted until all threads are known to have exited
     * the original parent page's index, because evicting an internal page
     * discards its WT_REF array, and a thread traversing the original
     * parent page index might see a freed WT_REF.
     */
    if (WT_PAGE_IS_INTERNAL(page) &&
        F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK))
        return (false);

    /*
     * If the page is clean but has modifications that appear too new to
     * evict, skip it.
     */
    if (!modified && !__wt_txn_visible_all(session, mod->rec_max_txn))
        return (false);

    return (true);
}

/*
 * __wt_page_release --
 *     Release a reference to a page.
 */
static inline int
__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
    WT_BTREE *btree;
    WT_PAGE *page;

    btree = S2BT(session);

    /*
     * Discard our hazard pointer. Ignore pages we don't have and the root
     * page, which sticks in memory, regardless.
     */
    if (ref == NULL || ref->page == NULL || __wt_ref_is_root(ref))
        return (0);

    /*
     * If hazard pointers aren't necessary for this file, we can't be
     * evicting, we're done.
     */
    if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
        return (0);

    /*
     * Attempt to evict pages with the special "oldest" read generation.
     * This is set for pages that grow larger than the configured
     * memory_page_max setting, when we see many deleted items, and when we
     * are attempting to scan without trashing the cache.
     *
     * Fast checks if eviction is disabled for this handle, operation or
     * tree, then perform a general check if eviction will be possible.
     */
    page = ref->page;
    if (page->read_gen != WT_READGEN_OLDEST ||
        LF_ISSET(WT_READ_NO_EVICT) ||
        F_ISSET(session, WT_SESSION_NO_EVICTION) ||
        F_ISSET(btree, WT_BTREE_NO_EVICTION) ||
        !__wt_page_can_evict(session, ref, NULL))
        return (__wt_hazard_clear(session, page));

    WT_RET_BUSY_OK(__wt_page_release_evict(session, ref));
    return (0);
}

/*
 * __wt_page_swap_func --
 *     Swap one page's hazard pointer for another one when hazard pointer
 * coupling up/down the tree.
 */
static inline int
__wt_page_swap_func(
    WT_SESSION_IMPL *session, WT_REF *held, WT_REF *want, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
    WT_DECL_RET;
    bool acquired;

    /*
     * This function is here to simplify the error handling during hazard
     * pointer coupling so we never leave a hazard pointer dangling. The
     * assumption is we're holding a hazard pointer on "held", and want to
     * acquire a hazard pointer on "want", releasing the hazard pointer on
     * "held" when we're done.
     *
     * When walking the tree, we sometimes swap to the same page. Fast-path
     * that to avoid thinking about error handling.
     */
    if (held == want)
        return (0);

    /* Get the wanted page. */
    ret = __wt_page_in_func(session, want, flags
#ifdef HAVE_DIAGNOSTIC
        , file, line
#endif
        );

    /*
     * Expected failures: page not found or restart. Our callers list the
     * errors they're expecting to handle.
     */
    if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND)
        return (WT_NOTFOUND);
    if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART)
        return (WT_RESTART);

    /* Discard the original held page on either success or error. */
    acquired = ret == 0;
    WT_TRET(__wt_page_release(session, held, flags));

    /* Fast-path expected success. */
    if (ret == 0)
        return (0);

    /*
     * If there was an error at any point that our caller isn't prepared to
     * handle, discard any page we acquired.
     */
    if (acquired)
        WT_TRET(__wt_page_release(session, want, flags));

    /*
     * If we're returning an error, don't let it be one our caller expects
     * to handle as returned by page-in: the expectation includes the held
     * page not having been released, and that's not the case.
     */
    if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND)
        return (EINVAL);
    if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART)
        return (EINVAL);

    return (ret);
}

/*
 * __wt_page_hazard_check --
 *     Return if there's a hazard pointer to the page in the system.
 */
static inline WT_HAZARD *
__wt_page_hazard_check(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_CONNECTION_IMPL *conn;
    WT_HAZARD *hp;
    WT_SESSION_IMPL *s;
    uint32_t i, j, hazard_size, max, session_cnt;

    conn = S2C(session);

    /*
     * No lock is required because the session array is fixed size, but it
     * may contain inactive entries. We must review any active session
     * that might contain a hazard pointer, so insert a barrier before
     * reading the active session count. That way, no matter what sessions
     * come or go, we'll check the slots for all of the sessions that could
     * have been active when we started our check.
     */
    WT_STAT_FAST_CONN_INCR(session, cache_hazard_checks);
    WT_ORDERED_READ(session_cnt, conn->session_cnt);
    for (s = conn->sessions, i = 0, j = 0, max = 0;
        i < session_cnt; ++s, ++i) {
        if (!s->active)
            continue;
        WT_ORDERED_READ(hazard_size, s->hazard_size);
        if (s->hazard_size > max) {
            max = s->hazard_size;
            WT_STAT_FAST_CONN_SET(session, cache_hazard_max, max);
        }
        for (hp = s->hazard; hp < s->hazard + hazard_size; ++hp) {
            ++j;
            if (hp->page == page) {
                WT_STAT_FAST_CONN_INCRV(session,
                    cache_hazard_walks, j);
                return (hp);
            }
        }
    }
    WT_STAT_FAST_CONN_INCRV(session, cache_hazard_walks, j);
    return (NULL);
}

/*
 * __wt_skip_choose_depth --
 *     Randomly choose a depth for a skiplist insert.
 */
static inline u_int
__wt_skip_choose_depth(WT_SESSION_IMPL *session)
{
    u_int d;

    for (d = 1; d < WT_SKIP_MAXDEPTH &&
        __wt_random(&session->rnd) < WT_SKIP_PROBABILITY; d++)
        ;
    return (d);
}
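/*
 * Illustrative sketch (not part of WiredTiger): the level population implied
 * by the promotion probability __wt_skip_choose_depth above uses. Per the
 * comment in __wt_leaf_page_can_split, each skiplist level holds about a
 * quarter of the entries of the level below it, so an n-entry list has
 * roughly n / 4^d entries at level d, which is why sampling level 2 sees
 * about 1/16th of the items. The __example_ name is an assumption for
 * illustration only.
 */
static inline uint64_t
__example_skip_level_population(uint64_t n, u_int level)
{
    u_int i;

    /* Divide by four once per level above the bottom of the skiplist. */
    for (i = 0; i < level; ++i)
        n /= 4;
    return (n);
}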
/*
 * __wt_btree_lsm_over_size --
 *     Return if the size of an in-memory tree with a single leaf page is over
 * a specified maximum. If called on anything other than a simple tree with a
 * single leaf page, returns true so our LSM caller will switch to a new tree.
 */
static inline bool
__wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize)
{
    WT_BTREE *btree;
    WT_PAGE *child, *root;
    WT_PAGE_INDEX *pindex;
    WT_REF *first;

    btree = S2BT(session);
    root = btree->root.page;

    /* Check for a non-existent tree. */
    if (root == NULL)
        return (false);

    /* A tree that can be evicted always requires a switch. */
    if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
        return (true);

    /* Check for a tree with a single leaf page. */
    WT_INTL_INDEX_GET(session, root, pindex);
    if (pindex->entries != 1)               /* > 1 child page, switch */
        return (true);
    first = pindex->index[0];
    if (first->state != WT_REF_MEM)         /* no child page, ignore */
        return (false);

    /*
     * We're reaching down into the page without a hazard pointer, but
     * that's OK because we know that no-eviction is set and so the page
     * cannot disappear.
     */
    child = first->page;
    if (child->type != WT_PAGE_ROW_LEAF)    /* not a single leaf page */
        return (true);

    return (child->memory_footprint > maxsize);
}

/*
 * __wt_split_descent_race --
 *     Return if we raced with an internal page split when descending the
 * tree.
 */
static inline bool
__wt_split_descent_race(
    WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX *saved_pindex)
{
    WT_PAGE_INDEX *pindex;

    /* No test when starting the descent (there's no home to check). */
    if (__wt_ref_is_root(ref))
        return (false);

    /*
     * A place to hang this comment...
     *
     * There's a page-split race when we walk the tree: if we're splitting
     * an internal page into its parent, we update the parent's page index
     * before updating the split page's page index, and it's not an atomic
     * update. A thread can read the parent page's original page index and
     * then read the split page's replacement index.
     *
     * Because internal page splits work by truncating the original page to
     * the initial part of the original page, the result of this race is we
     * will have a search key that points past the end of the current page.
     * This is only an issue when we search past the end of the page, if we
     * find a WT_REF in the page with the namespace we're searching for, we
     * don't care if the WT_REF moved or not while we were searching, we
     * have the correct page.
     *
     * For example, imagine an internal page with 3 child pages, with the
     * namespaces a-f, g-h and i-j; the first child page splits. The parent
     * starts out with the following page-index:
     *
     *     | ... | a | g | i | ... |
     *
     * which changes to this:
     *
     *     | ... | a | c | e | g | i | ... |
     *
     * The child starts out with the following page-index:
     *
     *     | a | b | c | d | e | f |
     *
     * which changes to this:
     *
     *     | a | b |
     *
     * The thread searches the original parent page index for the key "cat",
     * it couples to the "a" child page; if it uses the replacement child
     * page index, it will search past the end of the page and couple to the
     * "b" page, which is wrong.
     *
     * To detect the problem, we remember the parent page's page index used
     * to descend the tree. Whenever we search past the end of a page, we
     * check to see if the parent's page index has changed since our use of
     * it during descent. As the problem only appears if we read the split
     * page's replacement index, the parent page's index must already have
     * changed, ensuring we detect the problem.
     *
     * It's possible for the opposite race to happen (a thread could read
     * the parent page's replacement page index and then read the split
     * page's original index). This isn't a problem because internal splits
     * work by truncating the split page, so the split page search is for
     * content the split page retains after the split, and we ignore this
     * race.
     */
    WT_INTL_INDEX_GET(session, ref->home, pindex);
    return (pindex != saved_pindex);
}