/*- * Copyright (c) 2014-present MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * * See the file LICENSE for redistribution information. */ #define WT_RECNO_OOB 0 /* Illegal record number */ /* AUTOMATIC FLAG VALUE GENERATION START 0 */ #define WT_READ_CACHE 0x0001u #define WT_READ_IGNORE_CACHE_SIZE 0x0002u #define WT_READ_NOTFOUND_OK 0x0004u #define WT_READ_NO_GEN 0x0008u #define WT_READ_NO_SPLIT 0x0010u #define WT_READ_NO_WAIT 0x0020u #define WT_READ_PREV 0x0040u #define WT_READ_RESTART_OK 0x0080u #define WT_READ_SKIP_DELETED 0x0100u #define WT_READ_SKIP_INTL 0x0200u #define WT_READ_TRUNCATE 0x0400u #define WT_READ_VISIBLE_ALL 0x0800u #define WT_READ_WONT_NEED 0x1000u /* AUTOMATIC FLAG VALUE GENERATION STOP 32 */ /* AUTOMATIC FLAG VALUE GENERATION START 0 */ #define WT_REC_APP_EVICTION_SNAPSHOT 0x001u #define WT_REC_CALL_URGENT 0x002u #define WT_REC_CHECKPOINT 0x004u #define WT_REC_CHECKPOINT_RUNNING 0x008u #define WT_REC_CLEAN_AFTER_REC 0x010u #define WT_REC_EVICT 0x020u #define WT_REC_HS 0x040u #define WT_REC_IN_MEMORY 0x080u #define WT_REC_SCRUB 0x100u #define WT_REC_VISIBILITY_ERR 0x200u #define WT_REC_VISIBLE_ALL 0x400u /* AUTOMATIC FLAG VALUE GENERATION STOP 32 */ /* * WT_PAGE_HEADER -- * Blocks have a common header, a WT_PAGE_HEADER structure followed by a * block-manager specific structure. */ struct __wt_page_header { /* * The record number of the first record of the page is stored on disk so we can figure out * where the column-store leaf page fits into the key space during salvage. */ uint64_t recno; /* 00-07: column-store starting recno */ /* * We maintain page write-generations in the non-transactional case as that's how salvage can * determine the most recent page between pages overlapping the same key range. */ uint64_t write_gen; /* 08-15: write generation */ /* * The page's in-memory size isn't rounded or aligned, it's the actual number of bytes the * disk-image consumes when instantiated in memory. */ uint32_t mem_size; /* 16-19: in-memory page size */ union { uint32_t entries; /* 20-23: number of cells on page */ uint32_t datalen; /* 20-23: overflow data length */ } u; uint8_t type; /* 24: page type */ /* * No automatic generation: flag values cannot change, they're written to disk. */ #define WT_PAGE_COMPRESSED 0x01u /* Page is compressed on disk */ #define WT_PAGE_EMPTY_V_ALL 0x02u /* Page has all zero-length values */ #define WT_PAGE_EMPTY_V_NONE 0x04u /* Page has no zero-length values */ #define WT_PAGE_ENCRYPTED 0x08u /* Page is encrypted on disk */ #define WT_PAGE_UNUSED 0x10u /* Historic lookaside store page updates, no longer used */ #define WT_PAGE_FT_UPDATE 0x20u /* Page contains updated fast-truncate information */ uint8_t flags; /* 25: flags */ /* A byte of padding, positioned to be added to the flags. */ uint8_t unused; /* 26: unused padding */ #define WT_PAGE_VERSION_ORIG 0 /* Original version */ #define WT_PAGE_VERSION_TS 1 /* Timestamps added */ uint8_t version; /* 27: version */ }; /* * WT_PAGE_HEADER_SIZE is the number of bytes we allocate for the structure: if the compiler inserts * padding it will break the world. */ #define WT_PAGE_HEADER_SIZE 28 /* * __wt_page_header_byteswap -- * Handle big- and little-endian transformation of a page header. 
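 *
 * For illustration only (a sketch, not the real read path, assuming a WT_ITEM holding the raw
 * block), a reader converts the header in place before using any of its fields; on little-endian
 * hosts the call compiles away:
 *
 *	WT_PAGE_HEADER *dsk = (WT_PAGE_HEADER *)buf->mem;
 *	__wt_page_header_byteswap(dsk);
 *	if (dsk->mem_size > buf->memsize)
 *	    return (EINVAL);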
 */
static inline void
__wt_page_header_byteswap(WT_PAGE_HEADER *dsk)
{
#ifdef WORDS_BIGENDIAN
    dsk->recno = __wt_bswap64(dsk->recno);
    dsk->write_gen = __wt_bswap64(dsk->write_gen);
    dsk->mem_size = __wt_bswap32(dsk->mem_size);
    dsk->u.entries = __wt_bswap32(dsk->u.entries);
#else
    WT_UNUSED(dsk);
#endif
}

/*
 * The block-manager specific information immediately follows the WT_PAGE_HEADER structure.
 */
#define WT_BLOCK_HEADER_REF(dsk) ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_SIZE))

/*
 * WT_PAGE_HEADER_BYTE --
 * WT_PAGE_HEADER_BYTE_SIZE --
 *     The first usable data byte on the block (past the combined headers).
 */
#define WT_PAGE_HEADER_BYTE_SIZE(btree) ((u_int)(WT_PAGE_HEADER_SIZE + (btree)->block_header))
#define WT_PAGE_HEADER_BYTE(btree, dsk) \
    ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_BYTE_SIZE(btree)))

/*
 * WT_ADDR --
 *     An in-memory structure to hold a block's location.
 */
struct __wt_addr {
    WT_TIME_AGGREGATE ta;

    uint8_t *addr; /* Block-manager's cookie */
    uint8_t size;  /* Block-manager's cookie length */

#define WT_ADDR_INT 1     /* Internal page */
#define WT_ADDR_LEAF 2    /* Leaf page */
#define WT_ADDR_LEAF_NO 3 /* Leaf page, no overflow */
    uint8_t type;

    /*
     * If an address is used as an address for both the previous and the current multi-block
     * reconciliations, that is, a block we're writing matches the block written the last time, it
     * will appear in both the current boundary points as well as the page modification's list of
     * previous blocks. The reuse flag is how we know that's happening so the block is treated
     * correctly (not freed on error, for example).
     */
    uint8_t reuse;
};

/*
 * Overflow tracking for reuse: When a page is reconciled, we write new K/V overflow items. If pages
 * are reconciled multiple times, we need to know if we've already written a particular overflow
 * record (so we don't write it again), as well as if we've modified an overflow record previously
 * written (in which case we want to write a new record and discard blocks used by the previously
 * written record). Track overflow records written for the page, storing the values in a skiplist
 * with the record's value as the "key".
 */
struct __wt_ovfl_reuse {
    uint32_t value_offset; /* Overflow value offset */
    uint32_t value_size;   /* Overflow value size */
    uint8_t addr_offset;   /* Overflow addr offset */
    uint8_t addr_size;     /* Overflow addr size */

    /*
     * On each page reconciliation, we clear the entry's in-use flag, and reset it as the overflow
     * record is re-used. After reconciliation completes, unused skiplist entries are discarded,
     * along with their underlying blocks.
     *
     * On each page reconciliation, set the just-added flag for each new skiplist entry; if
     * reconciliation fails for any reason, discard the newly added skiplist entries, along with
     * their underlying blocks.
     */
/* AUTOMATIC FLAG VALUE GENERATION START 0 */
#define WT_OVFL_REUSE_INUSE 0x1u
#define WT_OVFL_REUSE_JUST_ADDED 0x2u
/* AUTOMATIC FLAG VALUE GENERATION STOP 8 */
    uint8_t flags;

    /*
     * The untyped address immediately follows the WT_OVFL_REUSE structure, the untyped value
     * immediately follows the address.
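     *
     * For illustration, the allocation is laid out as follows (a sketch of the intent):
     *
     *	| WT_OVFL_REUSE | next[] skiplist pointers | addr bytes | value bytes |
     *
     * so WT_OVFL_REUSE_ADDR(p) resolves to (uint8_t *)p + p->addr_offset and
     * WT_OVFL_REUSE_VALUE(p) to (uint8_t *)p + p->value_offset.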
     */
#define WT_OVFL_REUSE_ADDR(p) ((void *)((uint8_t *)(p) + (p)->addr_offset))
#define WT_OVFL_REUSE_VALUE(p) ((void *)((uint8_t *)(p) + (p)->value_offset))

    WT_OVFL_REUSE *next[0]; /* Forward-linked skip list */
};

/*
 * History store table support: when a page is being reconciled for eviction and has updates that
 * might be required by earlier readers in the system, the updates are written into the history
 * store table, and restored as necessary if the page is read.
 *
 * The first part of the key comprises a file ID, record key (byte-string for row-store, record
 * number for column-store) and timestamp. This allows us to search efficiently for a given record
 * key and read timestamp combination. The last part of the key is a monotonically increasing
 * counter to keep the key unique in the case where we have multiple transactions committing at the
 * same timestamp.
 * The value is the WT_UPDATE structure's:
 * - stop timestamp
 * - durable timestamp
 * - update type
 * - value.
 *
 * As the key for the history store table is different for row- and column-store, we store both key
 * types in a WT_ITEM, building/parsing them in the code, because otherwise we'd need two history
 * store files with different key formats. We could make the history store table's key standard by
 * moving the source key into the history store table value, but that doesn't make the coding any
 * simpler, and it makes the history store table's value more likely to overflow the page size when
 * the row-store key is relatively large.
 *
 * Note that we deliberately store the update type as larger than necessary (8 bytes vs 1 byte).
 * We've done this to leave room in case we need to store extra bit flags in this value at a later
 * point. If we need to store more information, we can potentially tack extra information at the end
 * of the "value" buffer and then use bit flags within the update type to determine how to interpret
 * it.
 *
 * We also configure a larger-than-default internal page size to accommodate larger history store
 * keys. We do that to reduce the chances of having to create overflow keys on the page.
 */
#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY
#define WT_HS_COMPRESSOR "snappy"
#else
#define WT_HS_COMPRESSOR "none"
#endif
#define WT_HS_KEY_FORMAT WT_UNCHECKED_STRING(IuQQ)
#define WT_HS_VALUE_FORMAT WT_UNCHECKED_STRING(QQQu)
#define WT_HS_CONFIG                                                   \
    "key_format=" WT_HS_KEY_FORMAT ",value_format=" WT_HS_VALUE_FORMAT \
    ",block_compressor=" WT_HS_COMPRESSOR                              \
    ",internal_page_max=16KB"                                          \
    ",leaf_value_max=64MB"                                             \
    ",prefix_compression=false"

/*
 * WT_SAVE_UPD --
 *     Unresolved updates found during reconciliation.
 */
struct __wt_save_upd {
    WT_INSERT *ins; /* Insert list reference */
    WT_ROW *rip;    /* Original on-page reference */
    WT_UPDATE *onpage_upd;
    WT_UPDATE *onpage_tombstone;
    bool restore; /* Whether to restore this saved update chain */
};

/*
 * WT_MULTI --
 *     Replacement block information used during reconciliation.
 */
struct __wt_multi {
    /*
     * Block's key: either a column-store record number or a row-store variable length byte string.
     */
    union {
        uint64_t recno;
        WT_IKEY *ikey;
    } key;

    /*
     * A disk image that may or may not have been written, used to re-instantiate the page in
     * memory.
     */
    void *disk_image;

    /*
     * List of unresolved updates. Updates are either a row-store insert or update list, or
     * column-store insert list. When creating history store records, there is an additional value,
     * the committed item's transaction information.
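     *
     * (For illustration only -- a sketch, not library code, with restore_update_chain a
     * hypothetical helper -- a consumer of a populated WT_MULTI might walk the saved updates as:
     *
     *	for (i = 0; i < multi->supd_entries; ++i)
     *	    if (multi->supd[i].restore)
     *	        restore_update_chain(&multi->supd[i]);
     * )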
* * If there are unresolved updates, the block wasn't written and there will always be a disk * image. */ WT_SAVE_UPD *supd; uint32_t supd_entries; bool supd_restore; /* Whether to restore saved update chains to this page */ /* * Disk image was written: address, size and checksum. On subsequent reconciliations of this * page, we avoid writing the block if it's unchanged by comparing size and checksum; the reuse * flag is set when the block is unchanged and we're reusing a previous address. */ WT_ADDR addr; uint32_t size; uint32_t checksum; }; /* * WT_OVFL_TRACK -- * Overflow record tracking for reconciliation. We assume overflow records are relatively rare, * so we don't allocate the structures to track them until we actually see them in the data. */ struct __wt_ovfl_track { /* * Overflow key/value address/byte-string pairs we potentially reuse each time we reconcile the * page. */ WT_OVFL_REUSE *ovfl_reuse[WT_SKIP_MAXDEPTH]; /* * Overflow key/value addresses to be discarded from the block manager after reconciliation * completes successfully. */ WT_CELL **discard; size_t discard_entries; size_t discard_allocated; }; /* * WT_PAGE_MODIFY -- * When a page is modified, there's additional information to maintain. */ struct __wt_page_modify { /* The first unwritten transaction ID (approximate). */ uint64_t first_dirty_txn; /* The transaction state last time eviction was attempted. */ uint64_t last_evict_pass_gen; uint64_t last_eviction_id; wt_timestamp_t last_eviction_timestamp; #ifdef HAVE_DIAGNOSTIC /* Check that transaction time moves forward. */ uint64_t last_oldest_id; #endif /* Avoid checking for obsolete updates during checkpoints. */ uint64_t obsolete_check_txn; wt_timestamp_t obsolete_check_timestamp; /* The largest transaction seen on the page by reconciliation. */ uint64_t rec_max_txn; wt_timestamp_t rec_max_timestamp; /* The largest update transaction ID (approximate). */ uint64_t update_txn; /* Dirty bytes added to the cache. */ size_t bytes_dirty; size_t bytes_updates; /* * When pages are reconciled, the result is one or more replacement blocks. A replacement block * can be in one of two states: it was written to disk, and so we have a block address, or it * contained unresolved modifications and we have a disk image for it with a list of those * unresolved modifications. The former is the common case: we only build lists of unresolved * modifications when we're evicting a page, and we only expect to see unresolved modifications * on a page being evicted in the case of a hot page that's too large to keep in memory as it * is. In other words, checkpoints will skip unresolved modifications, and will write the blocks * rather than build lists of unresolved modifications. * * Ugly union/struct layout to conserve memory, we never have both a replace address and * multiple replacement blocks. */ union { struct { /* Single, written replacement block */ WT_ADDR replace; /* * A disk image that may or may not have been written, used to re-instantiate the page * in memory. */ void *disk_image; } r; #undef mod_replace #define mod_replace u1.r.replace #undef mod_disk_image #define mod_disk_image u1.r.disk_image struct { WT_MULTI *multi; /* Multiple replacement blocks */ uint32_t multi_entries; /* Multiple blocks element count */ } m; #undef mod_multi #define mod_multi u1.m.multi #undef mod_multi_entries #define mod_multi_entries u1.m.multi_entries } u1; /* * Internal pages need to be able to chain root-page splits and have a special transactional * eviction requirement. 
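     * (For example, if the root splits repeatedly during a single operation, each intermediate
     * root page is linked onto the root-split list below so it can be discarded once it's no
     * longer needed.)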
Column-store leaf pages need update and append lists. * * Ugly union/struct layout to conserve memory, a page is either a leaf page or an internal * page. */ union { struct { /* * When a root page splits, we create a new page and write it; the new page can also * split and so on, and we continue this process until we write a single replacement * root page. We use the root split field to track the list of created pages so they can * be discarded when no longer needed. */ WT_PAGE *root_split; /* Linked list of root split pages */ } intl; #undef mod_root_split #define mod_root_split u2.intl.root_split struct { /* * Appended items to column-stores. Actual appends to the tree only happen on the last * page, but gaps created in the namespace by truncate operations can result in the * append lists of other pages becoming populated. */ WT_INSERT_HEAD **append; /* * Updated items in column-stores: variable-length RLE entries can expand to multiple * entries which requires some kind of list we can expand on demand. Updated items in * fixed-length files could be done based on an WT_UPDATE array as in row-stores, but * there can be a very large number of bits on a single page, and the cost of the * WT_UPDATE array would be huge. */ WT_INSERT_HEAD **update; /* * Split-saved last column-store page record. If a fixed-length column-store page is * split, we save the first record number moved so that during reconciliation we know * the page's last record and can write any implicitly created deleted records for the * page. No longer used by VLCS. */ uint64_t split_recno; } column_leaf; #undef mod_col_append #define mod_col_append u2.column_leaf.append #undef mod_col_update #define mod_col_update u2.column_leaf.update #undef mod_col_split_recno #define mod_col_split_recno u2.column_leaf.split_recno struct { /* Inserted items for row-store. */ WT_INSERT_HEAD **insert; /* Updated items for row-stores. */ WT_UPDATE **update; } row_leaf; #undef mod_row_insert #define mod_row_insert u2.row_leaf.insert #undef mod_row_update #define mod_row_update u2.row_leaf.update } u2; /* Overflow record tracking for reconciliation. */ WT_OVFL_TRACK *ovfl_track; /* * Page-delete information for newly instantiated deleted pages. The instantiated flag remains * set until the page is reconciled successfully; this indicates that the page_del information * in the ref remains valid. The update list remains set (if set at all) until the transaction * that deleted the page is resolved. These transitions are independent; that is, the first * reconciliation can happen either before or after the delete transaction resolves. */ bool instantiated; /* True if this is a newly instantiated page. */ WT_UPDATE **inst_updates; /* Update list for instantiated page with unresolved truncate. */ #define WT_PAGE_LOCK(s, p) __wt_spin_lock((s), &(p)->modify->page_lock) #define WT_PAGE_TRYLOCK(s, p) __wt_spin_trylock((s), &(p)->modify->page_lock) #define WT_PAGE_UNLOCK(s, p) __wt_spin_unlock((s), &(p)->modify->page_lock) WT_SPINLOCK page_lock; /* Page's spinlock */ /* * The page state is incremented when a page is modified. * * WT_PAGE_CLEAN -- * The page is clean. * WT_PAGE_DIRTY_FIRST -- * The page is in this state after the first operation that marks a * page dirty, or when reconciliation is checking to see if it has * done enough work to be able to mark the page clean. * WT_PAGE_DIRTY -- * Two or more updates have been added to the page. 
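 *
 * As an illustrative sketch (not the actual implementation), marking a page dirty amounts to a
 * bounded atomic increment of the state:
 *
 *	do {
 *	    state = mod->page_state;
 *	} while (state < WT_PAGE_DIRTY &&
 *	    !__wt_atomic_cas32(&mod->page_state, state, state + 1));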
*/ #define WT_PAGE_CLEAN 0 #define WT_PAGE_DIRTY_FIRST 1 #define WT_PAGE_DIRTY 2 uint32_t page_state; #define WT_PM_REC_EMPTY 1 /* Reconciliation: no replacement */ #define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */ #define WT_PM_REC_REPLACE 3 /* Reconciliation: single block */ uint8_t rec_result; /* Reconciliation state */ #define WT_PAGE_RS_RESTORED 0x1 uint8_t restore_state; /* Created by restoring updates */ /* Additional diagnostics fields to catch invalid updates to page_state, even in release builds. */ /* AUTOMATIC FLAG VALUE GENERATION START 0 */ #define WT_PAGE_MODIFY_EXCLUSIVE 0x1u #define WT_PAGE_MODIFY_RECONCILING 0x2u /* AUTOMATIC FLAG VALUE GENERATION STOP 8 */ uint8_t flags; }; /* * WT_COL_RLE -- * Variable-length column-store pages have an array of page entries with * RLE counts greater than 1 when reading the page, so it's not necessary * to walk the page counting records to find a specific entry. We can do a * binary search in this array, then an offset calculation to find the * cell. */ WT_PACKED_STRUCT_BEGIN(__wt_col_rle) uint64_t recno; /* Record number of first repeat. */ uint64_t rle; /* Repeat count. */ uint32_t indx; /* Slot of entry in col_var. */ WT_PACKED_STRUCT_END /* * WT_PAGE_INDEX -- * The page index held by each internal page. */ struct __wt_page_index { uint32_t entries; uint32_t deleted_entries; WT_REF **index; }; /* * WT_COL_VAR_REPEAT -- * Variable-length column-store pages have an array of page entries with RLE counts * greater than 1 when reading the page, so it's not necessary to walk the page counting * records to find a specific entry. We can do a binary search in this array, then an * offset calculation to find the cell. * * It's a separate structure to keep the page structure as small as possible. */ struct __wt_col_var_repeat { uint32_t nrepeats; /* repeat slots */ WT_COL_RLE repeats[0]; /* lookup RLE array */ }; /* * WT_COL_FIX_TW_ENTRY -- * This is a single entry in the WT_COL_FIX_TW array. It stores the offset from the page's * starting recno and the offset into the page to find the value cell containing the time window. */ struct __wt_col_fix_tw_entry { uint32_t recno_offset; uint32_t cell_offset; }; /* * WT_COL_FIX_TW -- * Fixed-length column-store pages carry an array of page entries that have time windows. This * is built when reading the page to avoid the need to walk the page to find a specific entry. We * can do a binary search in this array instead. */ struct __wt_col_fix_tw { uint32_t numtws; /* number of time window slots */ WT_COL_FIX_TW_ENTRY tws[0]; /* lookup array */ }; /* WT_COL_FIX_TW_CELL gets the cell pointer from a WT_COL_FIX_TW_ENTRY. */ #define WT_COL_FIX_TW_CELL(page, entry) ((WT_CELL *)((uint8_t *)(page)->dsk + (entry)->cell_offset)) /* * WT_PAGE -- * The WT_PAGE structure describes the in-memory page information. */ struct __wt_page { /* Per page-type information. */ union { /* * Internal pages (both column- and row-store). * * In-memory internal pages have an array of pointers to child * structures, maintained in collated order. * * Multiple threads of control may be searching the in-memory * internal page and a child page of the internal page may * cause a split at any time. When a page splits, a new array * is allocated and atomically swapped into place. Threads in * the old array continue without interruption (the old array is * still valid), but have to avoid racing. 
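 * (A split installs a complete, new WT_PAGE_INDEX; readers holding the
 * old pointer see a consistent, if stale, snapshot of the children.)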
No barrier is needed * because the array reference is updated atomically, but code * reading the fields multiple times would be a very bad idea. * Specifically, do not do this: * WT_REF **refp = page->u.intl__index->index; * uint32_t entries = page->u.intl__index->entries; * * The field is declared volatile (so the compiler knows not to * read it multiple times), and we obscure the field name and * use a copy macro in all references to the field (so the code * doesn't read it multiple times). */ struct { WT_REF *parent_ref; /* Parent reference */ uint64_t split_gen; /* Generation of last split */ WT_PAGE_INDEX *volatile __index; /* Collated children */ } intl; #undef pg_intl_parent_ref #define pg_intl_parent_ref u.intl.parent_ref #undef pg_intl_split_gen #define pg_intl_split_gen u.intl.split_gen /* * Macros to copy/set the index because the name is obscured to ensure the field isn't read multiple * times. * * There are two versions of WT_INTL_INDEX_GET because the session split generation is usually set, * but it's not always required: for example, if a page is locked for splitting, or being created or * destroyed. */ #define WT_INTL_INDEX_GET_SAFE(page) ((page)->u.intl.__index) #define WT_INTL_INDEX_GET(session, page, pindex) \ do { \ WT_ASSERT(session, __wt_session_gen(session, WT_GEN_SPLIT) != 0); \ (pindex) = WT_INTL_INDEX_GET_SAFE(page); \ } while (0) #define WT_INTL_INDEX_SET(page, v) \ do { \ WT_WRITE_BARRIER(); \ ((page)->u.intl.__index) = (v); \ } while (0) /* * Macro to walk the list of references in an internal page. */ #define WT_INTL_FOREACH_BEGIN(session, page, ref) \ do { \ WT_PAGE_INDEX *__pindex; \ WT_REF **__refp; \ uint32_t __entries; \ WT_INTL_INDEX_GET(session, page, __pindex); \ for (__refp = __pindex->index, __entries = __pindex->entries; __entries > 0; \ --__entries) { \ (ref) = *__refp++; #define WT_INTL_FOREACH_REVERSE_BEGIN(session, page, ref) \ do { \ WT_PAGE_INDEX *__pindex; \ WT_REF **__refp; \ uint32_t __entries; \ WT_INTL_INDEX_GET(session, page, __pindex); \ for (__refp = __pindex->index + __pindex->entries, __entries = __pindex->entries; \ __entries > 0; --__entries) { \ (ref) = *--__refp; #define WT_INTL_FOREACH_END \ } \ } \ while (0) /* Row-store leaf page. */ WT_ROW *row; /* Key/value pairs */ #undef pg_row #define pg_row u.row /* Fixed-length column-store leaf page. */ struct { uint8_t *fix_bitf; /* Values */ WT_COL_FIX_TW *fix_tw; /* Time window index */ #define WT_COL_FIX_TWS_SET(page) ((page)->u.col_fix.fix_tw != NULL) } col_fix; #undef pg_fix_bitf #define pg_fix_bitf u.col_fix.fix_bitf #undef pg_fix_numtws #define pg_fix_numtws u.col_fix.fix_tw->numtws #undef pg_fix_tws #define pg_fix_tws u.col_fix.fix_tw->tws /* Variable-length column-store leaf page. */ struct { WT_COL *col_var; /* Values */ WT_COL_VAR_REPEAT *repeats; /* Repeats array */ #define WT_COL_VAR_REPEAT_SET(page) ((page)->u.col_var.repeats != NULL) } col_var; #undef pg_var #define pg_var u.col_var.col_var #undef pg_var_repeats #define pg_var_repeats u.col_var.repeats->repeats #undef pg_var_nrepeats #define pg_var_nrepeats u.col_var.repeats->nrepeats } u; /* * Page entry count, page-wide prefix information, type and flags are positioned at the end of * the WT_PAGE union to reduce cache misses when searching row-store pages. * * The entries field only applies to leaf pages, internal pages use the page-index entries * instead. 
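     *
     * For example (an illustrative sketch), code counting the entries of an arbitrary page must
     * switch on the page type, using the WT_PAGE_IS_INTERNAL macro defined below:
     *
     *	if (WT_PAGE_IS_INTERNAL(page)) {
     *	    WT_INTL_INDEX_GET(session, page, pindex);
     *	    entries = pindex->entries;
     *	} else
     *	    entries = page->entries;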
     */
    uint32_t entries; /* Leaf page entries */

    uint32_t prefix_start; /* Best page prefix starting slot */
    uint32_t prefix_stop;  /* Maximum slot to which the best page prefix applies */

#define WT_PAGE_IS_INTERNAL(page) \
    ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT)
#define WT_PAGE_INVALID 0       /* Invalid page */
#define WT_PAGE_BLOCK_MANAGER 1 /* Block-manager page */
#define WT_PAGE_COL_FIX 2       /* Col-store fixed-len leaf */
#define WT_PAGE_COL_INT 3       /* Col-store internal page */
#define WT_PAGE_COL_VAR 4       /* Col-store var-length leaf page */
#define WT_PAGE_OVFL 5          /* Overflow page */
#define WT_PAGE_ROW_INT 6       /* Row-store internal page */
#define WT_PAGE_ROW_LEAF 7      /* Row-store leaf page */
    uint8_t type;               /* Page type */

/* AUTOMATIC FLAG VALUE GENERATION START 0 */
#define WT_PAGE_BUILD_KEYS 0x001u         /* Keys have been built in memory */
#define WT_PAGE_COMPACTION_WRITE 0x002u   /* Writing the page for compaction */
#define WT_PAGE_DISK_ALLOC 0x004u         /* Disk image in allocated memory */
#define WT_PAGE_DISK_MAPPED 0x008u        /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x010u          /* Page is on the LRU queue */
#define WT_PAGE_EVICT_LRU_URGENT 0x020u   /* Page is in the urgent queue */
#define WT_PAGE_EVICT_NO_PROGRESS 0x040u  /* Eviction doesn't count as progress */
#define WT_PAGE_INTL_OVERFLOW_KEYS 0x080u /* Internal page has overflow keys (historic only) */
#define WT_PAGE_SPLIT_INSERT 0x100u       /* A leaf page was split for append */
#define WT_PAGE_UPDATE_IGNORE 0x200u      /* Ignore updates on page discard */
/* AUTOMATIC FLAG VALUE GENERATION STOP 16 */
    uint16_t flags_atomic; /* Atomic flags, use F_*_ATOMIC_16 */

    uint8_t unused; /* Unused padding */

    size_t memory_footprint; /* Memory attached to the page */

    /* Page's on-disk representation: NULL for pages created in memory. */
    const WT_PAGE_HEADER *dsk;

    /* If/when the page is modified, we need lots more information. */
    WT_PAGE_MODIFY *modify;

    /*
     * !!!
     * This is the 64 byte boundary, try to keep hot fields above here.
     */

    /*
     * The page's read generation acts as an LRU value for each page in the
     * tree; it is used by the eviction server thread to select pages to be
     * discarded from the in-memory tree.
     *
     * The read generation is a 64-bit value; if incremented frequently, a
     * 32-bit value could overflow.
     *
     * The read generation is a piece of shared memory potentially read
     * by many threads. We don't want to update page read generations for
     * in-cache workloads and suffer the cache misses, so we don't simply
     * increment the read generation value on every access. Instead, the
     * read generation is incremented by the eviction server each time it
     * becomes active. To avoid incrementing a page's read generation too
     * frequently, it is set to a future point.
     *
     * Because low read generation values have special meaning, and there
     * are places where we manipulate the value, use an initial value well
     * outside of the special range.
     */
#define WT_READGEN_NOTSET 0
#define WT_READGEN_OLDEST 1
#define WT_READGEN_WONT_NEED 2
#define WT_READGEN_EVICT_SOON(readgen) \
    ((readgen) != WT_READGEN_NOTSET && (readgen) < WT_READGEN_START_VALUE)
#define WT_READGEN_START_VALUE 100
#define WT_READGEN_STEP 100
    uint64_t read_gen;

    uint64_t cache_create_gen; /* Page create timestamp */
    uint64_t evict_pass_gen;   /* Eviction pass generation */
};

/*
 * WT_PAGE_DISK_OFFSET, WT_PAGE_REF_OFFSET --
 *     Return the offset/pointer of a pointer/offset in a page disk image.
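 *
 * For example (an illustrative round trip):
 *
 *	uint32_t offset = WT_PAGE_DISK_OFFSET(page, cell);
 *	WT_CELL *copy = WT_PAGE_REF_OFFSET(page, offset);
 *
 * where copy compares equal to the original cell pointer.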
*/ #define WT_PAGE_DISK_OFFSET(page, p) WT_PTRDIFF32(p, (page)->dsk) #define WT_PAGE_REF_OFFSET(page, o) ((void *)((uint8_t *)((page)->dsk) + (o))) /* * Prepare update states. * * Prepare update synchronization is based on the state field, which has the * following possible states: * * WT_PREPARE_INIT: * The initial prepare state of either an update or a page_del structure, * indicating a prepare phase has not started yet. * This state has no impact on the visibility of the update's data. * * WT_PREPARE_INPROGRESS: * Update is in prepared phase. * * WT_PREPARE_LOCKED: * State is locked as state transition is in progress from INPROGRESS to * RESOLVED. Any reader of the state needs to wait for state transition to * complete. * * WT_PREPARE_RESOLVED: * Represents the commit state of the prepared update. * * State Transition: * From uncommitted -> prepare -> commit: * INIT --> INPROGRESS --> LOCKED --> RESOLVED * LOCKED will be a momentary phase during timestamp update. * * From uncommitted -> prepare -> rollback: * INIT --> INPROGRESS * Prepare state will not be updated during rollback and will continue to * have the state as INPROGRESS. */ /* Must be 0, as structures will be default initialized with 0. */ #define WT_PREPARE_INIT 0 #define WT_PREPARE_INPROGRESS 1 #define WT_PREPARE_LOCKED 2 #define WT_PREPARE_RESOLVED 3 /* * Page state. * * Synchronization is based on the WT_REF->state field, which has a number of * possible states: * * WT_REF_DISK: * The initial setting before a page is brought into memory, and set as a * result of page eviction; the page is on disk, and must be read into * memory before use. WT_REF_DISK has a value of 0 (the default state * after allocating cleared memory). * * WT_REF_DELETED: * The page is on disk, but has been deleted from the tree; we can delete * row-store and VLCS leaf pages without reading them if they don't * reference overflow items. * * WT_REF_LOCKED: * Locked for exclusive access. In eviction, this page or a parent has * been selected for eviction; once hazard pointers are checked, the page * will be evicted. When reading a page that was previously deleted, it * is locked until the page is in memory and the deletion has been * instantiated with tombstone updates. The thread that set the page to * WT_REF_LOCKED has exclusive access; no other thread may use the WT_REF * until the state is changed. * * WT_REF_MEM: * Set by a reading thread once the page has been read from disk; the page * is in the cache and the page reference is OK. * * WT_REF_SPLIT: * Set when the page is split; the WT_REF is dead and can no longer be * used. * * The life cycle of a typical page goes like this: pages are read into memory * from disk and their state set to WT_REF_MEM. When the page is selected for * eviction, the page state is set to WT_REF_LOCKED. In all cases, evicting * threads reset the page's state when finished with the page: if eviction was * successful (a clean page was discarded, and a dirty page was written to disk * and then discarded), the page state is set to WT_REF_DISK; if eviction failed * because the page was busy, page state is reset to WT_REF_MEM. * * Readers check the state field and if it's WT_REF_MEM, they set a hazard * pointer to the page, flush memory and re-confirm the page state. If the * page state is unchanged, the reader has a valid reference and can proceed. * * When an evicting thread wants to discard a page from the tree, it sets the * WT_REF_LOCKED state, flushes memory, then checks hazard pointers. 
If a hazard
 * pointer is found, state is reset to WT_REF_MEM, restoring the page
 * to the readers. If the evicting thread does not find a hazard pointer,
 * the page is evicted.
 */

/*
 * WT_PAGE_DELETED --
 *     Information about how a deleted page was deleted. This structure records the transaction
 * that deleted the page, plus the state the ref was in when the deletion happened. This structure
 * is akin to an update but applies to a whole page.
 */
struct __wt_page_deleted {
    /*
     * Transaction IDs are set when updates are created (before they become visible) and only
     * change when marked with WT_TXN_ABORTED. Transaction ID readers expect to copy a transaction
     * ID into a local variable and see a stable value. In case a compiler might re-read the
     * transaction ID from memory rather than using the local variable, mark the shared transaction
     * IDs volatile to prevent unexpected repeated/reordered reads.
     */
    volatile uint64_t txnid; /* Transaction ID */

    wt_timestamp_t timestamp; /* Timestamps */
    wt_timestamp_t durable_timestamp;

    /*
     * The prepare state is used for transaction prepare to manage visibility and propagating the
     * prepare state to the updates generated at instantiation time.
     */
    volatile uint8_t prepare_state;

    /*
     * The previous state of the WT_REF; if the fast-truncate transaction is rolled back without
     * the page first being instantiated, this is the state to which the WT_REF returns.
     */
    uint8_t previous_ref_state;

    /*
     * If the fast-truncate transaction has committed. If we're forced to instantiate the page, and
     * the committed flag isn't set, we have to create an update structure list for the transaction
     * to resolve in a subsequent commit. (This is tricky: if the transaction is rolled back, the
     * entire structure is discarded, that is, the flag is set only on commit and not on rollback.)
     */
    bool committed;

    /* Flag to indicate fast-truncate is written to disk. */
    bool selected_for_write;
};

/*
 * WT_REF_HIST --
 *     State information of a ref at a single point in time.
 */
struct __wt_ref_hist {
    WT_SESSION_IMPL *session;
    const char *name;
    const char *func;
    uint32_t time_sec;
    uint16_t line;
    uint16_t state;
};

/*
 * WT_REF --
 *     A single in-memory page and state information.
 */
struct __wt_ref {
    WT_PAGE *page; /* Page */

    /*
     * When the tree deepens as a result of a split, the home page value changes. Don't cache it,
     * we need to see that change when looking up our slot in the page's index structure.
     */
    WT_PAGE *volatile home;        /* Reference page */
    volatile uint32_t pindex_hint; /* Reference page index hint */

    uint8_t unused[2]; /* Padding: before the flags field so flags can be easily expanded. */

/*
 * Define both internal- and leaf-page flags for now: we only need one, but it provides an easy way
 * to assert a page-type flag is always set (we allocate WT_REFs in lots of places and it's easy to
 * miss one). If we run out of bits in the flags field, remove the internal flag and rewrite tests
 * depending on it to be "!leaf" instead.
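 *
 * For example (an illustrative sketch), page-type checks are written against these flags rather
 * than the underlying page structure:
 *
 *	if (F_ISSET(ref, WT_REF_FLAG_LEAF))
 *	    leaf_specific_work(ref);
 *
 * where leaf_specific_work is a hypothetical helper.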
*/ /* AUTOMATIC FLAG VALUE GENERATION START 0 */ #define WT_REF_FLAG_INTERNAL 0x1u /* Page is an internal page */ #define WT_REF_FLAG_LEAF 0x2u /* Page is a leaf page */ #define WT_REF_FLAG_READING 0x4u /* Page is being read in */ /* AUTOMATIC FLAG VALUE GENERATION STOP 8 */ uint8_t flags; #define WT_REF_DISK 0 /* Page is on disk */ #define WT_REF_DELETED 1 /* Page is on disk, but deleted */ #define WT_REF_LOCKED 2 /* Page locked for exclusive access */ #define WT_REF_MEM 3 /* Page is in cache and valid */ #define WT_REF_SPLIT 4 /* Parent page split (WT_REF dead) */ volatile uint8_t state; /* Page state */ /* * Address: on-page cell if read from backing block, off-page WT_ADDR if instantiated in-memory, * or NULL if page created in-memory. */ void *addr; /* * The child page's key. Do NOT change this union without reviewing * __wt_ref_key. */ union { uint64_t recno; /* Column-store: starting recno */ void *ikey; /* Row-store: key */ } key; #undef ref_recno #define ref_recno key.recno #undef ref_ikey #define ref_ikey key.ikey /* * Page deletion information, written-to/read-from disk as necessary in the internal page's * address cell. (Deleted-address cells are also referred to as "proxy cells".) When a WT_REF * first becomes part of a fast-truncate operation, the page_del field is allocated and * initialized; it is similar to an update and holds information about the transaction that * performed the truncate. It can be discarded and set to NULL when that transaction reaches * global visibility. * * Operations other than truncate that produce deleted pages (checkpoint cleanup, reconciliation * as empty, etc.) leave the page_del field NULL as in these cases the deletion is already * globally visible. * * Once the deletion is globally visible, the original on-disk page is no longer needed and can * be discarded; this happens the next time the parent page is reconciled, either by eviction or * by a checkpoint. The ref remains, however, and still occupies the same key space in the table * that it always did. * * Deleted refs (and thus chunks of the tree namespace) are only discarded at two points: when * the parent page is discarded after being evicted, or in the course of internal page splits * and reverse splits. Until this happens, the "same" page can be brought back to life by * writing to its portion of the key space. * * A deleted page needs to be "instantiated" (read in from disk and converted to an in-memory * page where every item on the page has been individually deleted) if we need to position a * cursor on the page, or if we need to visit it for other reasons. Logic exists to avoid that * in various common cases (see: __wt_btcur_skip_page, __wt_delete_page_skip) but in many less * common situations we proceed with instantiation anyway to avoid multiplying the number of * special cases in the system. * * Common triggers for instantiation include: another thread reading from the page before a * truncate commits; an older reader visiting a page after a truncate commits; a thread reading * the page via a checkpoint cursor if the truncation wasn't yet globally visible at checkpoint * time; a thread reading the page after shutdown and restart under similar circumstances; RTS * needing to roll back a committed but unstable truncation (and possibly also updates that * occurred before the truncation); and a thread writing to the truncated portion of the table * space after the truncation but before the page is completely discarded. 
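     *
     * (As a concrete example, a cursor positioned by a search that lands in a truncated range
     * whose fast-truncate transaction is not visible to it must instantiate the page in order to
     * see the pre-truncate values.)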
* * If the page must be instantiated for any reason: (1) for each entry on the page a WT_UPDATE * is created; (2) the transaction information from page_del is copied to those WT_UPDATE * structures (making them a match for the truncate operation), and (3) the WT_REF state * switches to WT_REF_MEM. * * If the fast-truncate operation has not yet committed, an array of references to the WT_UPDATE * structures is placed in modify->inst_updates. This is used to find the updates when the * operation subsequently resolves. (The page can split, so there needs to be some way to find * all of the update structures.) * * After instantiation, the page_del structure is kept until the instantiated page is next * reconciled. This is because in some cases reconciliation of the parent internal page may need * to write out a reference to the pre-instantiated on-disk page, at which point the page_del * information is needed to build the correct reference. * * If the ref is in WT_REF_DELETED state, all actions besides checking whether page_del is NULL * require that the WT_REF be locked. There are two reasons for this: first, the page might be * instantiated at any time, and it is important to not see a partly-completed instantiation; * and second, the page_del structure is discarded opportunistically if its transaction is found * to be globally visible, so accessing it without locking the ref is unsafe. * * If the ref is in WT_REF_MEM state because it has been instantiated, the safety requirements * are somewhat looser. Checking for an instantiated page by examining modify->instantiated does * not require locking. Checking if modify->inst_updates is non-NULL (which means that the * truncation isn't committed) also doesn't require locking. In general the page_del structure * should not be used after instantiation; exceptions are (a) it is still updated by transaction * prepare, commit, and rollback (so that it remains correct) and (b) it is used by internal * page reconciliation if that occurs before the instantiated child is itself reconciled. (The * latter can only happen if the child is evicted in a fairly narrow time window during a * checkpoint.) This still requires locking the ref. * * It is vital to consider all the possible cases when touching a deleted or instantiated page. * * There are two major groups of states: * * 1. The WT_REF state is WT_REF_DELETED. This means the page is deleted and not in memory. * - If the page has no disk address, the ref is a placeholder in the key space and may in * general be discarded at the next opportunity. (Some restrictions apply in VLCS.) * - If the page has a disk address, page_del may be NULL. In this case, the deletion of the * page is globally visible and the on-disk page can be discarded at the next opportunity. * - If the page has a disk address and page_del is not NULL, page_del contains information * about the transaction that deleted the page. It is necessary to lock the ref to read * page_del; at that point (if the state hasn't changed while getting the lock) * page_del->committed can be used to check if the transaction is committed or not. * * 2. The WT_REF state is WT_REF_MEM. The page is either an ordinary page or an instantiated * deleted page. * - If ref->page->modify is NULL, the page is ordinary. * - If ref->page->modify->instantiated is false and ref->page->modify->inst_updates is NULL, * the page is ordinary. * - If ref->page->modify->instantiated is true, the page is instantiated and has not yet * been reconciled. 
ref->page_del is either NULL (meaning the deletion is globally visible) * or contains information about the transaction that deleted the page. This information is * only meaningful either (a) in relation to the existing on-disk page rather than the in- * memory page (this can be needed to reconcile the parent internal page) or (b) if the * page is clean. * - If ref->page->modify->inst_updates is not NULL, the page is instantiated and the * transaction that deleted it has not resolved yet. The update list is used during commit * or rollback to find the updates created during instantiation. * * The last two points of group (2) are orthogonal; that is, after instantiation the * instantiated flag and page_del structure (on the one hand) and the update list (on the other) * are used and discarded independently. The former persists only until the page is first * successfully reconciled; the latter persists until the transaction resolves. These events may * occur in either order. * * As described above, in any state in group (1) an access to the page may require it be read * into memory, at which point it moves into group (2). Instantiation always sets the * instantiated flag to true; the updates list is only created if the transaction has not yet * resolved at the point instantiation happens. (The ref is locked in both transaction * resolution and instantiation to make sure these events happen in a well-defined order.) * * Because internal pages with uncommitted (including prepared) deletions are not written to * disk, a page instantiated after its parent was read from disk will always have inst_updates * set to NULL. */ WT_PAGE_DELETED *page_del; /* Page-delete information for a deleted page. */ #ifdef HAVE_REF_TRACK /* * In DIAGNOSTIC mode we overwrite the WT_REF on free to force failures, but we want to retain ref * state history. Don't overwrite these fields. */ #define WT_REF_CLEAR_SIZE (offsetof(WT_REF, hist)) #define WT_REF_SAVE_STATE_MAX 3 /* Capture history of ref state changes. */ WT_REF_HIST hist[WT_REF_SAVE_STATE_MAX]; uint64_t histoff; #define WT_REF_SAVE_STATE(ref, s, f, l) \ do { \ (ref)->hist[(ref)->histoff].session = session; \ (ref)->hist[(ref)->histoff].name = session->name; \ __wt_seconds32(session, &(ref)->hist[(ref)->histoff].time_sec); \ (ref)->hist[(ref)->histoff].func = (f); \ (ref)->hist[(ref)->histoff].line = (uint16_t)(l); \ (ref)->hist[(ref)->histoff].state = (uint16_t)(s); \ (ref)->histoff = ((ref)->histoff + 1) % WT_ELEMENTS((ref)->hist); \ } while (0) #define WT_REF_SET_STATE(ref, s) \ do { \ WT_REF_SAVE_STATE(ref, s, __PRETTY_FUNCTION__, __LINE__); \ WT_PUBLISH((ref)->state, s); \ } while (0) #else #define WT_REF_CLEAR_SIZE (sizeof(WT_REF)) #define WT_REF_SET_STATE(ref, s) WT_PUBLISH((ref)->state, s) #endif }; /* * WT_REF_SIZE is the expected structure size -- we verify the build to ensure the compiler hasn't * inserted padding which would break the world. 
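 *
 * An equivalent compile-time check would be the following sketch (illustrative only; the actual
 * verification is done elsewhere in the build):
 *
 *	static_assert(sizeof(WT_REF) == WT_REF_SIZE, "WT_REF grew unexpectedly");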
 */
#ifdef HAVE_REF_TRACK
#define WT_REF_SIZE (48 + WT_REF_SAVE_STATE_MAX * sizeof(WT_REF_HIST) + 8)
#else
#define WT_REF_SIZE 48
#endif

/* A macro wrapper allowing us to remember the caller's code location */
#define WT_REF_CAS_STATE(session, ref, old_state, new_state) \
    __wt_ref_cas_state_int(session, ref, old_state, new_state, __PRETTY_FUNCTION__, __LINE__)

#define WT_REF_LOCK(session, ref, previous_statep)                             \
    do {                                                                       \
        uint8_t __previous_state;                                              \
        for (;; __wt_yield()) {                                                \
            __previous_state = (ref)->state;                                   \
            if (__previous_state != WT_REF_LOCKED &&                           \
              WT_REF_CAS_STATE(session, ref, __previous_state, WT_REF_LOCKED)) \
                break;                                                         \
        }                                                                      \
        *(previous_statep) = __previous_state;                                 \
    } while (0)

#define WT_REF_UNLOCK(ref, state) WT_REF_SET_STATE(ref, state)

/*
 * WT_ROW --
 *	Each in-memory row-store leaf page has an array of WT_ROW structures:
 * this is created from on-page data when a page is read from the file. It's
 * sorted by key, fixed in size, and starts with a reference to on-page data.
 *
 * Multiple threads of control may be searching the in-memory row-store pages,
 * and the key may be instantiated at any time. Code must be able to handle
 * both when the key has not been instantiated (the key field points into the
 * page's disk image), and when the key has been instantiated (the key field
 * points outside the page's disk image). We don't need barriers because the
 * key is updated atomically, but code that reads the key field multiple times
 * is a very, very bad idea. Specifically, do not do this:
 *
 *	key = rip->key;
 *	if (key_is_on_page(key)) {
 *		cell = rip->key;
 *	}
 *
 * The field is declared volatile (so the compiler knows it shouldn't read it
 * multiple times), and we obscure the field name and use a copy macro in all
 * references to the field (so the code doesn't read it multiple times), all
 * to make sure we don't introduce this bug (again).
 */
struct __wt_row {
    /* On-page key, on-page cell, or off-page WT_IKEY */
    void *volatile __key;
};
#define WT_ROW_KEY_COPY(rip) ((rip)->__key)
#define WT_ROW_KEY_SET(rip, v) ((rip)->__key) = (void *)(v)

/*
 * WT_ROW_FOREACH --
 *	Walk the entries of an in-memory row-store leaf page.
 */
#define WT_ROW_FOREACH(page, rip, i) \
    for ((i) = (page)->entries, (rip) = (page)->pg_row; (i) > 0; ++(rip), --(i))
#define WT_ROW_FOREACH_REVERSE(page, rip, i)                                             \
    for ((i) = (page)->entries, (rip) = (page)->pg_row + ((page)->entries - 1); (i) > 0; \
         --(rip), --(i))

/*
 * WT_ROW_SLOT --
 *	Return the 0-based array offset based on a WT_ROW reference.
 */
#define WT_ROW_SLOT(page, rip) ((uint32_t)((rip) - (page)->pg_row))

/*
 * WT_COL -- Each in-memory variable-length column-store leaf page has an array of WT_COL
 * structures: this is created from on-page data when a page is read from the file. It's fixed in
 * size, and references data on the page.
 */
struct __wt_col {
    /*
     * Variable-length column-store data references are page offsets, not pointers (we boldly
     * re-invent short pointers). The trade-off is 4B per K/V pair on a 64-bit machine vs. a single
     * cycle for the addition of a base pointer. The on-page data is a WT_CELL (same as row-store
     * pages).
     *
     * Obscure the field name, code shouldn't use WT_COL->__col_value, the public interface is
     * WT_COL_PTR and WT_COL_PTR_SET.
     */
    uint32_t __col_value;
};

/*
 * WT_COL_PTR, WT_COL_PTR_SET --
 *	Return/Set a pointer corresponding to the data offset. (If the item does
 * not exist on the page, return a NULL.)
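 *
 * For example (an illustrative sketch using the WT_COL_FOREACH macro defined below), walking the
 * value cells of a variable-length column-store page:
 *
 *	WT_COL_FOREACH (page, cip, i)
 *	    if ((cell = WT_COL_PTR(page, cip)) != NULL)
 *	        process_cell(cell);
 *
 * where process_cell is a hypothetical helper.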
*/ #define WT_COL_PTR(page, cip) WT_PAGE_REF_OFFSET(page, (cip)->__col_value) #define WT_COL_PTR_SET(cip, value) (cip)->__col_value = (value) /* * WT_COL_FOREACH -- * Walk the entries of variable-length column-store leaf page. */ #define WT_COL_FOREACH(page, cip, i) \ for ((i) = (page)->entries, (cip) = (page)->pg_var; (i) > 0; ++(cip), --(i)) /* * WT_COL_SLOT -- * Return the 0-based array offset based on a WT_COL reference. */ #define WT_COL_SLOT(page, cip) ((uint32_t)((cip) - (page)->pg_var)) /* * WT_IKEY -- * Instantiated key: row-store keys are usually prefix compressed or overflow objects. * Normally, a row-store page in-memory key points to the on-page WT_CELL, but in some * cases, we instantiate the key in memory, in which case the row-store page in-memory * key points to a WT_IKEY structure. */ struct __wt_ikey { uint32_t size; /* Key length */ /* * If we no longer point to the key's on-page WT_CELL, we can't find its * related value. Save the offset of the key cell in the page. * * Row-store cell references are page offsets, not pointers (we boldly * re-invent short pointers). The trade-off is 4B per K/V pair on a * 64-bit machine vs. a single cycle for the addition of a base pointer. */ uint32_t cell_offset; /* The key bytes immediately follow the WT_IKEY structure. */ #define WT_IKEY_DATA(ikey) ((void *)((uint8_t *)(ikey) + sizeof(WT_IKEY))) }; /* * WT_UPDATE -- * * Entries on leaf pages can be updated, either modified or deleted. Updates to entries in the * WT_ROW and WT_COL arrays are stored in the page's WT_UPDATE array. When the first element on a * page is updated, the WT_UPDATE array is allocated, with one slot for every existing element in * the page. A slot points to a WT_UPDATE structure; if more than one update is done for an entry, * WT_UPDATE structures are formed into a forward-linked list. */ struct __wt_update { /* * Transaction IDs are set when updates are created (before they become visible) and only change * when marked with WT_TXN_ABORTED. Transaction ID readers expect to copy a transaction ID into * a local variable and see a stable value. In case a compiler might re-read the transaction ID * from memory rather than using the local variable, mark the shared transaction IDs volatile to * prevent unexpected repeated/reordered reads. */ volatile uint64_t txnid; /* transaction ID */ wt_timestamp_t durable_ts; /* timestamps */ wt_timestamp_t start_ts; /* * The durable timestamp of the previous update in the update chain. This timestamp is used for * diagnostic checks only, and could be removed to reduce the size of the structure should that * be necessary. */ wt_timestamp_t prev_durable_ts; WT_UPDATE *next; /* forward-linked list */ uint32_t size; /* data length */ #define WT_UPDATE_INVALID 0 /* diagnostic check */ #define WT_UPDATE_MODIFY 1 /* partial-update modify value */ #define WT_UPDATE_RESERVE 2 /* reserved */ #define WT_UPDATE_STANDARD 3 /* complete value */ #define WT_UPDATE_TOMBSTONE 4 /* deleted */ uint8_t type; /* type (one byte to conserve memory) */ /* If the update includes a complete value. */ #define WT_UPDATE_DATA_VALUE(upd) \ ((upd)->type == WT_UPDATE_STANDARD || (upd)->type == WT_UPDATE_TOMBSTONE) /* * The update state is used for transaction prepare to manage visibility and transitioning * update structure state safely. */ volatile uint8_t prepare_state; /* prepare state */ /* AUTOMATIC FLAG VALUE GENERATION START 0 */ #define WT_UPDATE_DS 0x01u /* Update has been written to the data store. 
 */
#define WT_UPDATE_HS 0x02u                       /* Update has been written to history store. */
#define WT_UPDATE_PREPARE_RESTORED_FROM_DS 0x04u /* Prepared update restored from data store. */
#define WT_UPDATE_RESTORED_FAST_TRUNCATE 0x08u   /* Fast truncate instantiation */
#define WT_UPDATE_RESTORED_FROM_DS 0x10u         /* Update restored from data store. */
#define WT_UPDATE_RESTORED_FROM_HS 0x20u         /* Update restored from history store. */
#define WT_UPDATE_TO_DELETE_FROM_HS 0x40u        /* Update needs to be deleted from history store */
/* AUTOMATIC FLAG VALUE GENERATION STOP 8 */
    uint8_t flags;

    /*
     * Zero or more bytes of value (the payload) immediately follows the WT_UPDATE structure. We
     * use a C99 flexible array member which has the semantics we want.
     */
    uint8_t data[]; /* start of the data */
};

/*
 * WT_UPDATE_SIZE is the expected structure size excluding the payload data -- we verify the build
 * to ensure the compiler hasn't inserted padding.
 */
#define WT_UPDATE_SIZE 47

/*
 * The memory size of an update: include some padding because this is such a common case that
 * overhead of tiny allocations can swamp our cache overhead calculation.
 */
#define WT_UPDATE_MEMSIZE(upd) WT_ALIGN(WT_UPDATE_SIZE + (upd)->size, 32)

/*
 * WT_UPDATE_VALUE --
 *
 * A generic representation of an update's value regardless of where it exists. This structure is
 * used to represent both in-memory updates and updates that don't exist in an update list such as
 * reconstructed modify updates, updates in the history store and onpage values.
 *
 * The skip buffer flag is an optimization for callers of various read functions to communicate
 * that they just want to check that an update exists and not read its underlying value. This means
 * that the read functions can avoid the performance penalty of reconstructing modifies.
 */
struct __wt_update_value {
    WT_ITEM buf;
    WT_TIME_WINDOW tw;
    uint8_t type;
    bool skip_buf;
};

/*
 * WT_WITH_UPDATE_VALUE_SKIP_BUF --
 *
 * A helper macro to use for calling read functions when we're checking for the existence of a
 * given key. This means that read functions can avoid the performance penalty of reconstructing
 * modifies.
 */
#define WT_WITH_UPDATE_VALUE_SKIP_BUF(op) \
    do {                                  \
        cbt->upd_value->skip_buf = true;  \
        op;                               \
        cbt->upd_value->skip_buf = false; \
    } while (0)

/*
 * WT_MODIFY_UPDATE_MIN/MAX, WT_UPDATE_VECTOR_STACK_SIZE
 *	Limit the length of update chains to avoid penalizing reads and to permit truncation.
 * Having a smaller value will penalize the cases when history has to be maintained, resulting in
 * multiplying cache pressure.
 *
 * When threads race modifying a record, we can end up with more than the usual maximum number of
 * modifications in an update list. We use small vectors of modify updates in a couple of places to
 * avoid heap allocation, so add a few additional slots to that array.
 */
#define WT_MODIFY_UPDATE_MIN 10  /* Update count before we bother checking anything else */
#define WT_MODIFY_UPDATE_MAX 200 /* Update count hard limit */
#define WT_UPDATE_VECTOR_STACK_SIZE (WT_MODIFY_UPDATE_MIN + 10)

/*
 * WT_UPDATE_VECTOR --
 *     A resizable array for storing updates. The allocation strategy is similar to that of
 * llvm::SmallVector where we keep space on the stack for the regular case but fall back to dynamic
 * allocation as needed.
 */
struct __wt_update_vector {
    WT_SESSION_IMPL *session;
    WT_UPDATE *list[WT_UPDATE_VECTOR_STACK_SIZE];
    WT_UPDATE **listp;
    size_t allocated_bytes;
    size_t size;
};

/*
 * WT_MODIFY_MEM_FRACTION
 *	Limit update chains to a fraction of the base document size.
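 *
 * For example, with a fraction of 10 a chain of modify updates against a 10KB value is collapsed
 * into a full standard update once the chain consumes roughly 1KB (an illustration of the intent,
 * not a precise accounting).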
*/ #define WT_MODIFY_MEM_FRACTION 10 /* * WT_INSERT -- * * Row-store leaf pages support inserts of new K/V pairs. When the first K/V pair is inserted, the * WT_INSERT_HEAD array is allocated, with one slot for every existing element in the page, plus one * additional slot. A slot points to a WT_INSERT_HEAD structure for the items which sort after the * WT_ROW element that references it and before the subsequent WT_ROW element; the skiplist * structure has a randomly chosen depth of next pointers in each inserted node. * * The additional slot is because it's possible to insert items smaller than any existing key on the * page: for that reason, the first slot of the insert array holds keys smaller than any other key * on the page. * * In column-store variable-length run-length encoded pages, a single indx entry may reference a * large number of records, because there's a single on-page entry representing many identical * records. (We don't expand those entries when the page comes into memory, as that would require * resources as pages are moved to/from the cache, including read-only files.) Instead, a single * indx entry represents all of the identical records originally found on the page. * * Modifying (or deleting) run-length encoded column-store records is hard because the page's entry * no longer references a set of identical items. We handle this by "inserting" a new entry into the * insert array, with its own record number. (This is the only case where it's possible to insert * into a column-store: only appends are allowed, as insert requires re-numbering subsequent * records. Berkeley DB did support mutable records, but it won't scale and it isn't useful enough * to re-implement, IMNSHO.) */ struct __wt_insert { WT_UPDATE *upd; /* value */ union { uint64_t recno; /* column-store record number */ struct { uint32_t offset; /* row-store key data start */ uint32_t size; /* row-store key data size */ } key; } u; #define WT_INSERT_KEY_SIZE(ins) (((WT_INSERT *)(ins))->u.key.size) #define WT_INSERT_KEY(ins) ((void *)((uint8_t *)(ins) + ((WT_INSERT *)(ins))->u.key.offset)) #define WT_INSERT_RECNO(ins) (((WT_INSERT *)(ins))->u.recno) WT_INSERT *next[0]; /* forward-linked skip list */ }; /* * Skiplist helper macros. */ #define WT_SKIP_FIRST(ins_head) \ (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)(ins_head))->head[0]) #define WT_SKIP_LAST(ins_head) \ (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)(ins_head))->tail[0]) #define WT_SKIP_NEXT(ins) ((ins)->next[0]) #define WT_SKIP_FOREACH(ins, ins_head) \ for ((ins) = WT_SKIP_FIRST(ins_head); (ins) != NULL; (ins) = WT_SKIP_NEXT(ins)) /* * Atomically allocate and swap a structure or array into place. */ #define WT_PAGE_ALLOC_AND_SWAP(s, page, dest, v, count) \ do { \ if (((v) = (dest)) == NULL) { \ WT_ERR(__wt_calloc_def(s, count, &(v))); \ if (__wt_atomic_cas_ptr(&(dest), NULL, v)) \ __wt_cache_page_inmem_incr(s, page, (count) * sizeof(*(v))); \ else \ __wt_free(s, v); \ } \ } while (0) /* * WT_INSERT_HEAD -- * The head of a skiplist of WT_INSERT items. */ struct __wt_insert_head { WT_INSERT *head[WT_SKIP_MAXDEPTH]; /* first item on skiplists */ WT_INSERT *tail[WT_SKIP_MAXDEPTH]; /* last item on skiplists */ }; /* * The row-store leaf page insert lists are arrays of pointers to structures, and may not exist. The * following macros return an array entry if the array of pointers and the specific structure exist, * else NULL. */ #define WT_ROW_INSERT_SLOT(page, slot) \ ((page)->modify == NULL || (page)->modify->mod_row_insert == NULL ? 
/*
 * The row-store leaf page insert lists are arrays of pointers to structures, and may not exist.
 * The following macros return an array entry if the array of pointers and the specific structure
 * exist, else NULL.
 */
#define WT_ROW_INSERT_SLOT(page, slot)                                  \
    ((page)->modify == NULL || (page)->modify->mod_row_insert == NULL ? \
        NULL :                                                          \
        (page)->modify->mod_row_insert[slot])
#define WT_ROW_INSERT(page, ip) WT_ROW_INSERT_SLOT(page, WT_ROW_SLOT(page, ip))
#define WT_ROW_UPDATE(page, ip)                                         \
    ((page)->modify == NULL || (page)->modify->mod_row_update == NULL ? \
        NULL :                                                          \
        (page)->modify->mod_row_update[WT_ROW_SLOT(page, ip)])

/*
 * WT_ROW_INSERT_SMALLEST references an additional slot past the end of the "one per WT_ROW slot"
 * insert array. That's because the insert array requires an extra slot to hold keys that sort
 * before any key found on the original page.
 */
#define WT_ROW_INSERT_SMALLEST(page)                                    \
    ((page)->modify == NULL || (page)->modify->mod_row_insert == NULL ? \
        NULL :                                                          \
        (page)->modify->mod_row_insert[(page)->entries])

/*
 * The column-store leaf page update lists are arrays of pointers to structures, and may not
 * exist. The following macros return an array entry if the array of pointers and the specific
 * structure exist, else NULL.
 */
#define WT_COL_UPDATE_SLOT(page, slot)                                  \
    ((page)->modify == NULL || (page)->modify->mod_col_update == NULL ? \
        NULL :                                                          \
        (page)->modify->mod_col_update[slot])
#define WT_COL_UPDATE(page, ip) WT_COL_UPDATE_SLOT(page, WT_COL_SLOT(page, ip))

/*
 * WT_COL_UPDATE_SINGLE is a single WT_INSERT list, used for any fixed-length column-store updates
 * for a page.
 */
#define WT_COL_UPDATE_SINGLE(page) WT_COL_UPDATE_SLOT(page, 0)

/*
 * WT_COL_APPEND is a WT_INSERT list, used for fixed- and variable-length appends.
 */
#define WT_COL_APPEND(page)                                             \
    ((page)->modify == NULL || (page)->modify->mod_col_append == NULL ? \
        NULL :                                                          \
        (page)->modify->mod_col_append[0])

/* WT_COL_FIX_FOREACH_BITS walks fixed-length bit-fields on a disk page. */
#define WT_COL_FIX_FOREACH_BITS(btree, dsk, v, i)                            \
    for ((i) = 0,                                                            \
        (v) = (i) < (dsk)->u.entries ?                                       \
           __bit_getv(WT_PAGE_HEADER_BYTE(btree, dsk), 0, (btree)->bitcnt) : \
           0;                                                                \
         (i) < (dsk)->u.entries; ++(i),                                      \
        (v) = (i) < (dsk)->u.entries ?                                       \
           __bit_getv(WT_PAGE_HEADER_BYTE(btree, dsk), i, (btree)->bitcnt) : \
           0)

/*
 * FLCS pages with time information have a small additional header after the main page data that
 * holds a version number and cell count, plus the byte offset to the start of the cell data. The
 * latter values are limited by the page size, so need only be 32 bits. One hopes we'll never need
 * 2^32 versions.
 *
 * This struct is the in-memory representation. The number of entries is the number of time
 * windows (there are twice as many cells), and the offsets are from the beginning of the page.
 * The space between the empty offset and the data offset is not used and is expected to be
 * zeroed.
 *
 * This structure is only used when handling on-disk pages; once the page is read in, one should
 * instead use the time window index in the page structure, which is a different type found above.
 */
struct __wt_col_fix_auxiliary_header {
    uint32_t version;
    uint32_t entries;
    uint32_t emptyoffset;
    uint32_t dataoffset;
};
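/*
 * Illustrative sketch (hypothetical function name): using WT_COL_FIX_FOREACH_BITS to walk the
 * fixed-length bit-fields of an FLCS disk image, counting the non-zero entries. Assumes "btree"
 * and "dsk" reference a valid tree and a fixed-length column-store disk image.
 */
static uint32_t
example_count_nonzero_bits(WT_BTREE *btree, WT_PAGE_HEADER *dsk)
{
    uint8_t v;
    uint32_t i, nonzero;

    nonzero = 0;
    WT_COL_FIX_FOREACH_BITS(btree, dsk, v, i)
        if (v != 0)
            ++nonzero;
    return (nonzero);
}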
/*
 * The on-disk auxiliary header uses a 1-byte version (the header must always begin with a nonzero
 * byte) and packed integers for the entry count and offset. To make the size of the offset entry
 * predictable (rather than dependent on the total page size) and also as small as possible, we
 * store the distance from the start of the auxiliary header to the auxiliary data. To avoid
 * complications computing the offset, we include the offset's own storage space in the offset,
 * and to make things simpler all around, we include the whole auxiliary header in the offset;
 * that is, the position of the auxiliary data is computed as the position of the start of the
 * auxiliary header plus the decoded stored offset.
 *
 * Both the entry count and the offset are limited to 32 bits because pages may not exceed 4GB, so
 * their maximum encoded lengths are 5 bytes each, and the maximum size of the on-disk header is
 * therefore 11 bytes. It can be as small as 3 bytes, though.
 *
 * We reserve 7 bytes for the header on a full page (not 11) because on a full page the encoded
 * offset is the reservation size, and 7 encodes in one byte. This is enough for all smaller pages
 * too: if there are at least 4 extra bytes in the bitmap space, any header will fit (4 + 7 = 11),
 * and if there are fewer, the encoded offset is less than 11, which still encodes to one byte.
 */
#define WT_COL_FIX_AUXHEADER_RESERVATION 7
#define WT_COL_FIX_AUXHEADER_SIZE_MAX 11

/* Values for ->version. Version 0 never appears in an on-disk header. */
#define WT_COL_FIX_VERSION_NIL 0 /* Original page format with no timestamp data */
#define WT_COL_FIX_VERSION_TS 1  /* Upgraded format with cells carrying timestamp info */

/*
 * Manage split generation numbers. Splits walk the list of sessions to check when it is safe to
 * free structures that have been replaced. We also check that list periodically (e.g., when
 * wrapping up a transaction) to free any memory we can.
 *
 * Before a thread enters code that will examine page indexes (which are swapped out by splits),
 * it publishes a copy of the current split generation into its session. Don't assume that threads
 * never re-enter this code: if we already have a split generation, leave it alone. If our caller
 * is examining an index, we don't want the oldest split generation to move forward and
 * potentially free it.
 */
#define WT_ENTER_PAGE_INDEX(session) WT_ENTER_GENERATION((session), WT_GEN_SPLIT);
#define WT_LEAVE_PAGE_INDEX(session) WT_LEAVE_GENERATION((session), WT_GEN_SPLIT);
#define WT_WITH_PAGE_INDEX(session, e) \
    WT_ENTER_PAGE_INDEX(session);      \
    (e);                               \
    WT_LEAVE_PAGE_INDEX(session)

/*
 * Manage the given generation number with support for re-entry. Re-entry is allowed: any
 * generation the session has already published must be at least as old as (that is, no higher
 * than) the current generation, so it is safe to leave it in place.
 */
#define WT_ENTER_GENERATION(session, generation)              \
    do {                                                      \
        bool __entered_##generation = false;                  \
        if (__wt_session_gen((session), (generation)) == 0) { \
            __wt_session_gen_enter((session), (generation));  \
            __entered_##generation = true;                    \
        }

#define WT_LEAVE_GENERATION(session, generation)             \
    if (__entered_##generation)                              \
        __wt_session_gen_leave((session), (generation));     \
    }                                                        \
    while (0)

/*
 * WT_VERIFY_INFO -- A structure to hold all the information related to a verify operation.
 */
struct __wt_verify_info {
    WT_SESSION_IMPL *session;

    const char *tag;           /* Identifier included in error messages */
    const WT_PAGE_HEADER *dsk; /* The disk header for the page being verified */
    WT_ADDR *page_addr;        /* An item representing a page entry being verified */
    size_t page_size;
    uint32_t cell_num; /* The current cell number being verified */
    uint64_t recno;    /* The current record number in a column-store page */

/* AUTOMATIC FLAG VALUE GENERATION START 0 */
#define WT_VRFY_DISK_CONTINUE_ON_FAILURE 0x1u
#define WT_VRFY_DISK_EMPTY_PAGE_OK 0x2u
/* AUTOMATIC FLAG VALUE GENERATION STOP 32 */
    uint32_t flags;
};
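/*
 * Illustrative sketch (hypothetical function names): how the generation wrappers above are meant
 * to be used. Code that examines a page index runs inside the enter/leave pair so a concurrent
 * split cannot free the index out from under it; WT_WITH_PAGE_INDEX wraps a single expression in
 * that pair. example_visit_page_index() stands in for whatever work actually walks the index.
 */
static int example_visit_page_index(WT_SESSION_IMPL *, WT_REF *);

static int
example_walk_with_page_index(WT_SESSION_IMPL *session, WT_REF *ref)
{
    int ret;

    /* The split generation is published on entry and released on exit. */
    WT_WITH_PAGE_INDEX(session, ret = example_visit_page_index(session, ref));
    return (ret);
}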