Diffstat (limited to 'src/include/btmem.h')
-rw-r--r--	src/include/btmem.h	661
1 file changed, 661 insertions(+), 0 deletions(-)
diff --git a/src/include/btmem.h b/src/include/btmem.h
new file mode 100644
index 00000000000..8e230ff8c2f
--- /dev/null
+++ b/src/include/btmem.h
@@ -0,0 +1,661 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_PAGE_HEADER --
+ * Blocks have a common header, a WT_PAGE_HEADER structure followed by a
+ * block-manager specific structure.
+ */
+struct __wt_page_header {
+ /*
+ * The record number of the first record of the page is stored on disk
+ * so we can figure out where the column-store leaf page fits into the
+ * key space during salvage.
+ */
+ uint64_t recno; /* 00-07: column-store starting recno */
+
+ uint32_t size; /* 08-11: page size */
+
+ union {
+ uint32_t entries; /* 12-15: number of cells on page */
+ uint32_t datalen; /* 12-15: overflow data length */
+ } u;
+
+ uint8_t type; /* 16: page type */
+
+ /*
+ * End the WT_PAGE_HEADER structure with 3 bytes of padding: it wastes
+ * space, but it leaves the WT_PAGE_HEADER structure 32-bit aligned and
+ * having a small amount of space to play with in the future can't hurt.
+ */
+ uint8_t unused[3]; /* 17-19: unused padding */
+};
+/*
+ * WT_PAGE_HEADER_SIZE is the number of bytes we allocate for the structure: if
+ * the compiler inserts padding it will break the world.
+ */
+#define WT_PAGE_HEADER_SIZE 20
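+
+/*
+ * Illustration only (not part of the original change): a compile-time
+ * check of the size assumption above. WT_EXAMPLE_STATIC_ASSERT is a
+ * hypothetical helper using the classic negative-array-size trick.
+ */
+#define WT_EXAMPLE_STATIC_ASSERT(cond) \
+	typedef char wt_example_assert[(cond) ? 1 : -1]
+WT_EXAMPLE_STATIC_ASSERT(
+    sizeof(struct __wt_page_header) == WT_PAGE_HEADER_SIZE);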
+
+/*
+ * The block-manager specific information immediately follows the
+ * WT_PAGE_HEADER structure.
+ */
+#define WT_BLOCK_HEADER_REF(dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_SIZE))
+
+/*
+ * WT_PAGE_HEADER_BYTE --
+ * WT_PAGE_HEADER_BYTE_SIZE --
+ * The first usable data byte on the block (past the combined headers).
+ */
+#define WT_PAGE_HEADER_BYTE_SIZE(btree) \
+ ((u_int)(WT_PAGE_HEADER_SIZE + (btree)->block_header))
+#define WT_PAGE_HEADER_BYTE(btree, dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_BYTE_SIZE(btree)))
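+
+/*
+ * Illustration only: a sketch of how the header macros compose on a
+ * disk image; the function name is hypothetical and the WT_BTREE
+ * definition is assumed to be in scope.
+ */
+static inline void *
+__wt_example_first_data(WT_BTREE *btree, WT_PAGE_HEADER *dsk)
+{
+	/* Skip the shared page header, then the block manager's header. */
+	return (WT_PAGE_HEADER_BYTE(btree, dsk));
+}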
+
+/*
+ * WT_ADDR --
+ * A block location.
+ */
+#define WT_NOADDR "[NoAddr]" /* No address */
+struct __wt_addr {
+ uint8_t *addr; /* Cookie */
+ uint32_t size; /* Cookie length */
+};
+
+/*
+ * WT_PAGE_MODIFY --
+ * When a page is modified, there's additional information maintained as it
+ * is written to disk.
+ */
+typedef enum {
+ WT_PT_EMPTY=0, /* Unused slot */
+ WT_PT_BLOCK, /* Block: inactive */
+ WT_PT_BLOCK_EVICT, /* Block: inactive on eviction */
+ WT_PT_OVFL, /* Overflow: active */
+ WT_PT_OVFL_DISCARD /* Overflow: inactive */
+} __wt_pt_type_t;
+
+struct __wt_page_modify {
+ /*
+ * The write generation is incremented after a page is modified. That
+ * is, it tracks page versions.
+ *
+ * The write generation value is used to detect changes scheduled based
+ * on out-of-date information. Two threads of control updating the same
+ * page could both search the page in state A. When the updates are
+ * performed serially, one of the changes will happen after the page is
+ * modified, and the search state for the other thread might no longer
+ * be applicable. To avoid this race, page write generations are copied
+ * into the search stack whenever a page is read, and checked when a
+ * modification is serialized. The serialized function compares each
+ * page's current write generation to the generation copied in the
+ * read/search; if the two values match, the search occurred on a
+ * current version of the page and the modification can proceed. If the
+ * two generations differ, the serialized call returns an error and the
+ * operation must be restarted.
+ *
+ * The write-generation value could be stored on a per-entry basis if
+ * there's sufficient contention for the page as a whole.
+ *
+ * The write-generation is not declared volatile: write-generation is
+ * written by a serialized function when modifying a page, and must be
+ * flushed in order as the serialized updates are flushed.
+ *
+ * !!!
+ * 32-bit values are probably more than is needed: at some point we may
+ * need to clean up pages once there have been sufficient modifications
+ * to make our linked lists of inserted cells too slow to search, or as
+ * soon as enough memory is allocated in service of page modifications
+ * (although we should be able to release memory from the MVCC list as
+ * soon as there's no running thread/txn which might want that version
+ * of the data). I've used 32-bit types instead of 16-bit types as I
+ * am less confident a 16-bit write to memory will invariably be atomic.
+ */
+ uint32_t write_gen;
+
+ /*
+ * The disk generation tracks page versions written to disk. When a
+ * page is reconciled and written to disk, the thread doing that work
+ * is just another reader of the page, and other readers and writers
+ * can access the page at the same time. For this reason, the thread
+ * reconciling the page logs the write generation of the page it read.
+ */
+ uint32_t disk_gen;
+
+ union {
+ WT_PAGE *split; /* Resulting split */
+ WT_ADDR replace; /* Resulting replacement */
+ } u;
+
+ /*
+ * Updated items in column-stores: variable-length RLE entries can
+ * expand to multiple entries which requires some kind of list we can
+ * expand on demand. Updated items in fixed-length files could be done
+ * based on an WT_UPDATE array as in row-stores, but there can be a
+ * very large number of bits on a single page, and the cost of the
+ * WT_UPDATE array would be huge.
+ */
+ WT_INSERT_HEAD **update; /* Updated items */
+
+ /*
+ * Track pages, blocks to discard: as pages are reconciled, overflow
+ * K/V items are discarded along with their underlying blocks, and as
+ * pages are evicted, split and emptied pages are merged into their
+ * parents and discarded. If an overflow item was discarded and page
+ * reconciliation then failed, the in-memory tree would be corrupted.
+ * To keep the tree correct until we're sure page reconciliation has
+ * succeeded, we track the objects we'll discard when the reconciled
+ * page is evicted.
+ *
+ * Track overflow objects: if pages are reconciled more than once, an
+ * overflow item might be written repeatedly. Instead, when overflow
+ * items are written we save a copy and resulting location so we only
+ * write them once.
+ */
+ struct __wt_page_track {
+ __wt_pt_type_t type; /* Type */
+
+ WT_ADDR addr; /* Overflow or block location */
+
+ uint8_t *data; /* Overflow data reference */
+ uint32_t size; /* Overflow data length */
+ } *track; /* Array of tracked objects */
+ uint32_t track_entries; /* Total track slots */
+};
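+
+/*
+ * Illustration only: a sketch of the write-generation check described
+ * above. A serialized modification carries the write generation copied
+ * at search time; WT_RESTART stands in for the real restart return.
+ */
+static inline int
+__wt_example_write_gen_check(WT_PAGE_MODIFY *mod, uint32_t search_gen)
+{
+	/* The page changed after our search: fail so the caller restarts. */
+	return (mod->write_gen == search_gen ? 0 : WT_RESTART);
+}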
+
+/*
+ * WT_PAGE --
+ * The WT_PAGE structure describes the in-memory page information.
+ */
+struct __wt_page {
+ /*
+ * Two links to the parent: the physical parent page, and the internal
+ * page's reference structure used to find this page.
+ */
+#define WT_PAGE_IS_ROOT(page) \
+ ((page)->parent == NULL)
+ WT_PAGE *parent; /* Page's parent */
+ WT_REF *ref; /* Parent reference */
+
+ /* Per page-type information. */
+ union {
+ /*
+ * Column- and row-store internal page. The recno is only used
+ * by column-store, but having the WT_REF array in the same page
+ * location makes some things simpler, and it doesn't cost us
+ * any memory, as other structures in this union are still larger.
+ */
+ struct {
+ uint64_t recno; /* Starting recno */
+ WT_REF *t; /* Subtree */
+ } intl;
+
+ /* Row-store leaf page. */
+ struct {
+ WT_ROW *d; /* K/V object pairs */
+
+ /*
+ * The column-store leaf page modification structures
+ * live in the WT_PAGE_MODIFY structure to keep the
+ * WT_PAGE structure as small as possible for read-only
+ * pages. For consistency, we could move the row-store
+ * modification structures into WT_PAGE_MODIFY too, but
+ * that wouldn't shrink WT_PAGE any further, and the naming
+ * contortions required to avoid growing WT_PAGE_MODIFY
+ * wouldn't be pretty. So far, avoiding ugly naming has
+ * won out over consistency.
+ */
+ WT_INSERT_HEAD **ins; /* Inserts */
+ WT_UPDATE **upd; /* Updates */
+ } row;
+
+ /* Fixed-length column-store leaf page. */
+ struct {
+ uint64_t recno; /* Starting recno */
+ uint8_t *bitf; /* COL_FIX items */
+ } col_fix;
+
+ /* Variable-length column-store leaf page. */
+ struct {
+ uint64_t recno; /* Starting recno */
+ WT_COL *d; /* COL_VAR items */
+
+ /*
+ * Variable-length column-store files maintain a list of
+ * RLE entries on the page so it's unnecessary to walk
+ * the page counting records to find a specific entry.
+ */
+ WT_COL_RLE *repeats; /* RLE array for lookups */
+ uint32_t nrepeats; /* Number of repeat slots. */
+ } col_var;
+ } u;
+
+ /* Page's on-disk representation: NULL for pages created in memory. */
+ WT_PAGE_HEADER *dsk;
+
+ /* If/when the page is modified, we need lots more information. */
+ WT_PAGE_MODIFY *modify;
+
+ /*
+ * The read generation is incremented each time the page is searched,
+ * and acts as an LRU value for each page in the tree; it is read by
+ * the eviction server thread to select pages to be discarded from the
+ * in-memory tree.
+ *
+ * The read generation is a 64-bit value: because it is incremented on
+ * every page search, a 32-bit value could overflow.
+ *
+ * The read-generation is not declared volatile: read-generation is set
+ * a lot (on every access), and we don't want to write it that much.
+ */
+ uint64_t read_gen;
+
+ /*
+ * In-memory pages optionally reference a number of entries originally
+ * read from disk; this count sizes the allocated arrays describing the page.
+ */
+ uint32_t entries;
+
+ /*
+ * The memory attached to the page (an estimate, neither exact nor
+ * complete), used to force eviction of a page tying down too much memory.
+ */
+ uint32_t memory_footprint;
+
+#define WT_PAGE_INVALID 0 /* Invalid page */
+#define WT_PAGE_COL_FIX 1 /* Col-store fixed-len leaf */
+#define WT_PAGE_COL_INT 2 /* Col-store internal page */
+#define WT_PAGE_COL_VAR 3 /* Col-store var-length leaf page */
+#define WT_PAGE_OVFL 4 /* Overflow page */
+#define WT_PAGE_ROW_INT 5 /* Row-store internal page */
+#define WT_PAGE_ROW_LEAF 6 /* Row-store leaf page */
+#define WT_PAGE_FREELIST 7 /* Free-list page */
+ uint8_t type; /* Page type */
+
+ /*
+ * The flags are divided into two sets: flags set initially, before more
+ * than a single thread accesses the page, and the reconciliation flags.
+ * The alternative would be to move the WT_PAGE_REC_XXX flags into the
+ * WT_PAGE_MODIFY structure, but that costs more memory. Obviously, it
+ * is important not to add other flags that can be set at run-time, else
+ * the threads could race.
+ */
+#define WT_PAGE_BUILD_KEYS 0x001 /* Keys have been built in memory */
+#define WT_PAGE_FORCE_EVICT 0x002 /* Waiting for forced eviction */
+#define WT_PAGE_LAST_PAGE 0x004 /* Last page in the tree */
+#define WT_PAGE_PINNED 0x008 /* Page is pinned */
+#define WT_PAGE_REC_EMPTY 0x010 /* Reconciliation: page empty */
+#define WT_PAGE_REC_REPLACE 0x020 /* Reconciliation: page replaced */
+#define WT_PAGE_REC_SPLIT 0x040 /* Reconciliation: page split */
+#define WT_PAGE_REC_SPLIT_MERGE 0x080 /* Reconciliation: page split merge */
+ uint8_t flags; /* Page flags */
+};
+
+#define WT_PAGE_REC_MASK \
+ (WT_PAGE_REC_EMPTY | \
+ WT_PAGE_REC_REPLACE | WT_PAGE_REC_SPLIT | WT_PAGE_REC_SPLIT_MERGE)
+
+/*
+ * WT_PAGE_DISK_OFFSET, WT_PAGE_REF_OFFSET --
+ * Return the offset/pointer of a pointer/offset in a page disk image.
+ */
+#define WT_PAGE_DISK_OFFSET(page, p) \
+ WT_PTRDIFF32(p, (page)->dsk)
+#define WT_PAGE_REF_OFFSET(page, o) \
+ ((void *)((uint8_t *)((page)->dsk) + (o)))
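+
+/*
+ * Illustration only: the two macros are inverses; converting a pointer
+ * into a page's disk image to an offset and back yields the pointer.
+ */
+static inline void *
+__wt_example_offset_roundtrip(WT_PAGE *page, void *p)
+{
+	uint32_t offset;
+
+	offset = WT_PAGE_DISK_OFFSET(page, p);		/* pointer to offset */
+	return (WT_PAGE_REF_OFFSET(page, offset));	/* offset to pointer */
+}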
+
+/*
+ * WT_REF --
+ * A single in-memory page and the state information used to determine if
+ * it's OK to dereference the pointer to the page.
+ */
+struct __wt_ref {
+ WT_PAGE *page; /* In-memory page */
+
+ void *addr; /* On-page cell or off-page WT_ADDR */
+
+ union {
+ uint64_t recno; /* Column-store: starting recno */
+ void *key; /* Row-store: on-page cell or WT_IKEY */
+ } u;
+
+ /*
+ * Page state.
+ *
+ * WT_REF_DISK has a value of 0, the default state after allocating
+ * cleared memory.
+ *
+ * Synchronization is based on the WT_REF->state field, which has 5
+ * possible states:
+ *
+ * WT_REF_DISK:
+ * The initial setting before a page is brought into memory, and
+ * set as a result of page eviction; the page is on disk, and must be
+ * read into memory before use.
+ *
+ * WT_REF_EVICTING:
+ * Set by eviction when a page is about to be locked; prevents a
+ * page from being evicted multiple times concurrently.
+ *
+ * WT_REF_LOCKED:
+ * Set by eviction; an eviction thread has selected this page or
+ * a parent for eviction; once hazard references are checked, the page
+ * will be evicted.
+ *
+ * WT_REF_MEM:
+ * Set by a reading thread once the page has been read from disk;
+ * the page is in the cache and the page reference is OK.
+ *
+ * WT_REF_READING:
+ * Set by a reading thread before reading a page from disk; other
+ * readers of the page wait until the read completes.
+ *
+ * The life cycle of a typical page goes like this: pages are read into
+ * memory from disk and their state set to WT_REF_MEM. When the page is
+ * selected for eviction, the page state is set to WT_REF_LOCKED. In
+ * all cases, evicting threads reset the page's state when finished with
+ * the page: if eviction was successful (a clean page was discarded, and
+ * a dirty page was written to disk and then discarded), the page state
+ * is set to WT_REF_DISK; if eviction failed because the page was busy,
+ * page state is reset to WT_REF_MEM.
+ *
+ * Readers check the state field and if it's WT_REF_MEM, they set a
+ * hazard reference to the page, flush memory and re-confirm the page
+ * state. If the page state is unchanged, the reader has a valid
+ * reference and can proceed.
+ *
+ * When an evicting thread wants to discard a page from the tree, it
+ * sets the WT_REF_LOCKED state, flushes memory, then checks hazard
+ * references. If a hazard reference is found, state is reset to
+ * WT_REF_MEM, restoring the page to the readers. If the evicting
+ * thread does not find a hazard reference, the page is evicted.
+ */
+ volatile enum {
+ WT_REF_DISK=0, /* Page is on disk */
+ WT_REF_EVICTING, /* Page being evaluated for eviction */
+ WT_REF_LOCKED, /* Page being evicted */
+ WT_REF_MEM, /* Page is in cache and valid */
+ WT_REF_READING /* Page being read */
+ } state;
+};
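+
+/*
+ * Illustration only: a sketch of the reader-side protocol described
+ * above. The hazard-reference functions and WT_FULL_BARRIER are
+ * hypothetical stand-ins for the real primitives.
+ */
+extern void __wt_example_hazard_set(WT_PAGE *);
+extern void __wt_example_hazard_clear(WT_PAGE *);
+
+static inline int
+__wt_example_page_available(WT_REF *ref)
+{
+	if (ref->state != WT_REF_MEM)
+		return (0);			/* not in cache */
+	__wt_example_hazard_set(ref->page);	/* publish hazard reference */
+	WT_FULL_BARRIER();			/* flush memory */
+	if (ref->state == WT_REF_MEM)
+		return (1);			/* reference is valid */
+	__wt_example_hazard_clear(ref->page);	/* lost a race: back off */
+	return (0);
+}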
+
+/*
+ * WT_REF_FOREACH --
+ * Walk the subtree array of an in-memory internal page.
+ */
+#define WT_REF_FOREACH(page, ref, i) \
+ for ((i) = (page)->entries, \
+ (ref) = (page)->u.intl.t; (i) > 0; ++(ref), --(i))
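+
+/*
+ * Illustration only: typical use of WT_REF_FOREACH, counting the
+ * children of an internal page currently in memory.
+ */
+static inline uint32_t
+__wt_example_children_in_memory(WT_PAGE *page)
+{
+	WT_REF *ref;
+	uint32_t cnt, i;
+
+	cnt = 0;
+	WT_REF_FOREACH(page, ref, i)
+		if (ref->state == WT_REF_MEM)
+			++cnt;
+	return (cnt);
+}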
+
+/*
+ * WT_ROW --
+ * Each in-memory row-store leaf page has an array of WT_ROW structures:
+ * this is created from on-page data when a page is read from the file. It's
+ * sorted by key, fixed in size, and references data on the page.
+ */
+struct __wt_row {
+ void *key; /* On-page cell or off-page WT_IKEY */
+};
+
+/*
+ * WT_ROW_FOREACH --
+ * Walk the entries of an in-memory row-store leaf page.
+ */
+#define WT_ROW_FOREACH(page, rip, i) \
+ for ((i) = (page)->entries, \
+ (rip) = (page)->u.row.d; (i) > 0; ++(rip), --(i))
+#define WT_ROW_FOREACH_REVERSE(page, rip, i) \
+ for ((i) = (page)->entries, \
+ (rip) = (page)->u.row.d + ((page)->entries - 1); \
+ (i) > 0; --(rip), --(i))
+
+/*
+ * WT_ROW_SLOT --
+ * Return the 0-based array offset based on a WT_ROW reference.
+ */
+#define WT_ROW_SLOT(page, rip) \
+ ((uint32_t)(((WT_ROW *)rip) - (page)->u.row.d))
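+
+/*
+ * Illustration only: walking a row-store leaf page; WT_ROW_SLOT recovers
+ * the array offset of each WT_ROW reference during the walk.
+ */
+static inline void
+__wt_example_row_walk(WT_PAGE *page)
+{
+	WT_ROW *rip;
+	uint32_t i, slot;
+
+	WT_ROW_FOREACH(page, rip, i) {
+		slot = WT_ROW_SLOT(page, rip);	/* 0-based array offset */
+		(void)slot;
+	}
+}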
+
+/*
+ * WT_COL --
+ * Each in-memory variable-length column-store leaf page has an array of WT_COL
+ * structures: this is created from on-page data when a page is read from the
+ * file. It's fixed in size, and references data on the page.
+ */
+struct __wt_col {
+ /*
+ * Variable-length column-store data references are page offsets, not
+ * pointers (we boldly re-invent short pointers). The trade-off is 4B
+ * per K/V pair on a 64-bit machine vs. a single cycle for the addition
+ * of a base pointer. The on-page data is a WT_CELL (same as row-store
+ * pages).
+ *
+ * If the value is 0, it's a single, deleted record.
+ *
+ * Obscure the field name, code shouldn't use WT_COL->value, the public
+ * interface is WT_COL_PTR.
+ */
+ uint32_t __value;
+};
+
+/*
+ * WT_COL_RLE --
+ * In variable-length column store leaf pages, we build an array of entries
+ * with RLE counts greater than 1 when reading the page. We can do a binary
+ * search in this array, then an offset calculation to find the cell.
+ */
+struct __wt_col_rle {
+ uint64_t recno; /* Record number of first repeat. */
+ uint64_t rle; /* Repeat count. */
+ uint32_t indx; /* Slot of entry in col_var.d */
+} WT_GCC_ATTRIBUTE((packed));
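+
+/*
+ * Illustration only: a sketch of the binary search described above,
+ * returning the col_var.d slot of the RLE entry containing a record,
+ * or UINT32_MAX if the record isn't covered by any repeat entry.
+ */
+static inline uint32_t
+__wt_example_repeat_slot(WT_PAGE *page, uint64_t recno)
+{
+	WT_COL_RLE *repeat;
+	uint32_t base, indx, limit;
+
+	for (base = 0,
+	    limit = page->u.col_var.nrepeats; limit != 0; limit >>= 1) {
+		indx = base + (limit >> 1);
+		repeat = page->u.col_var.repeats + indx;
+		if (recno >= repeat->recno &&
+		    recno - repeat->recno < repeat->rle)
+			return (repeat->indx);	/* inside this repeat */
+		if (recno >= repeat->recno) {	/* search upper half */
+			base = indx + 1;
+			--limit;
+		}
+	}
+	return (UINT32_MAX);
+}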
+
+/*
+ * WT_COL_PTR --
+ * Return a pointer corresponding to the data offset -- if the item doesn't
+ * exist on the page, return a NULL.
+ */
+#define WT_COL_PTR(page, cip) \
+ ((cip)->__value == 0 ? NULL : WT_PAGE_REF_OFFSET(page, (cip)->__value))
+
+/*
+ * WT_COL_FOREACH --
+ * Walk the entries of a variable-length column-store leaf page.
+ */
+#define WT_COL_FOREACH(page, cip, i) \
+ for ((i) = (page)->entries, \
+ (cip) = (page)->u.col_var.d; (i) > 0; ++(cip), --(i))
+
+/*
+ * WT_COL_SLOT --
+ * Return the 0-based array offset based on a WT_COL reference.
+ */
+#define WT_COL_SLOT(page, cip) \
+ ((uint32_t)(((WT_COL *)cip) - (page)->u.col_var.d))
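+
+/*
+ * Illustration only: walking a variable-length column-store leaf page;
+ * a NULL from WT_COL_PTR means a single, deleted record.
+ */
+static inline uint32_t
+__wt_example_col_count_deleted(WT_PAGE *page)
+{
+	WT_COL *cip;
+	uint32_t cnt, i;
+
+	cnt = 0;
+	WT_COL_FOREACH(page, cip, i)
+		if (WT_COL_PTR(page, cip) == NULL)
+			++cnt;
+	return (cnt);
+}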
+
+/*
+ * WT_IKEY --
+ * Instantiated key: row-store keys are usually prefix compressed and sometimes
+ * Huffman encoded or overflow objects. Normally, a row-store page's in-memory
+ * key points to the on-page WT_CELL, but in some cases we instantiate the key
+ * in memory, in which case it points to a WT_IKEY structure instead.
+ */
+struct __wt_ikey {
+ uint32_t size; /* Key length */
+
+ /*
+ * If we no longer point to the key's on-page WT_CELL, we can't find its
+ * related value. Save the offset of the key cell in the page.
+ *
+ * Row-store cell references are page offsets, not pointers (we boldly
+ * re-invent short pointers). The trade-off is 4B per K/V pair on a
+ * 64-bit machine vs. a single cycle for the addition of a base pointer.
+ */
+ uint32_t cell_offset;
+
+ /* The key bytes immediately follow the WT_IKEY structure. */
+#define WT_IKEY_DATA(ikey) \
+ ((void *)((uint8_t *)(ikey) + sizeof(WT_IKEY)))
+};
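+
+/*
+ * Illustration only: an instantiated key's bytes immediately follow the
+ * structure, so (WT_IKEY_DATA(ikey), ikey->size) describes the key.
+ */
+static inline void
+__wt_example_ikey_bytes(WT_IKEY *ikey, const void **datap, uint32_t *sizep)
+{
+	*datap = WT_IKEY_DATA(ikey);		/* key bytes */
+	*sizep = ikey->size;			/* key length */
+}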
+
+/*
+ * WT_UPDATE --
+ * Entries on leaf pages can be updated, either modified or deleted. Updates
+ * to entries referenced from the WT_ROW and WT_COL arrays are stored in the
+ * page's WT_UPDATE array. When the first element on a page is updated, the
+ * WT_UPDATE array is allocated, with one slot for every existing element in
+ * the page. A slot points to a WT_UPDATE structure; if more than one update
+ * is done for an entry, WT_UPDATE structures are formed into a forward-linked
+ * list.
+ */
+struct __wt_update {
+ WT_UPDATE *next; /* forward-linked list */
+
+ /*
+ * We use the maximum size as an is-deleted flag, which means we can't
+ * store 4GB objects; I'd rather do that than increase the size of this
+ * structure for a flag bit.
+ */
+#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == UINT32_MAX)
+#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = UINT32_MAX)
+ uint32_t size; /* update length */
+
+ /* The untyped value immediately follows the WT_UPDATE structure. */
+#define WT_UPDATE_DATA(upd) \
+ ((void *)((uint8_t *)(upd) + sizeof(WT_UPDATE)))
+};
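+
+/*
+ * Illustration only: the most recent update sits at the head of the
+ * forward-linked list; visibility rules are omitted from this sketch.
+ */
+static inline void *
+__wt_example_update_value(WT_UPDATE *upd)
+{
+	if (upd == NULL || WT_UPDATE_DELETED_ISSET(upd))
+		return (NULL);		/* no update, or a deleted entry */
+	return (WT_UPDATE_DATA(upd));	/* bytes of the newest value */
+}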
+
+/*
+ * WT_INSERT --
+ *
+ * Row-store leaf pages support inserts of new K/V pairs. When the first K/V
+ * pair is inserted, the WT_INSERT_HEAD array is allocated, with one slot for
+ * every existing element in the page, plus one additional slot. A slot points
+ * to a WT_INSERT_HEAD structure for the items which sort after the WT_ROW
+ * element that references it and before the subsequent WT_ROW element; the
+ * skiplist structure has a randomly chosen depth of next pointers in each
+ * inserted node.
+ *
+ * The additional slot is because it's possible to insert items smaller than any
+ * existing key on the page: for that reason, the additional slot at the end of
+ * the insert array holds keys smaller than any other key on the page.
+ *
+ * In column-store variable-length run-length encoded pages, a single indx
+ * entry may reference a large number of records, because there's a single
+ * on-page entry representing many identical records. (We don't expand those
+ * entries when the page comes into memory, as that would consume resources
+ * whenever pages move to/from the cache, even for read-only files.) Instead, a
+ * single indx entry represents all of the identical records originally found
+ * on the page.
+ *
+ * Modifying (or deleting) run-length encoded column-store records is hard
+ * because the page's entry no longer references a set of identical items. We
+ * handle this by "inserting" a new entry into the insert array, with its own
+ * record number. (This is the only case where it's possible to insert into a
+ * column-store: only appends are allowed, as inserting requires re-numbering
+ * subsequent records. Berkeley DB did support mutable records, but it won't
+ * scale and it isn't useful enough to re-implement, IMNSHO.)
+ */
+struct __wt_insert {
+ WT_UPDATE *upd; /* value */
+
+ union {
+ uint64_t recno; /* column-store record number */
+ struct {
+ uint32_t offset; /* row-store key data start */
+ uint32_t size; /* row-store key data size */
+ } key;
+ } u;
+
+#define WT_INSERT_KEY_SIZE(ins) ((ins)->u.key.size)
+#define WT_INSERT_KEY(ins) \
+ ((void *)((uint8_t *)(ins) + (ins)->u.key.offset))
+#define WT_INSERT_RECNO(ins) ((ins)->u.recno)
+
+ WT_INSERT *next[0]; /* forward-linked skip list */
+};
+
+/*
+ * Skiplist helper macros.
+ */
+#define WT_SKIP_FIRST(ins_head) \
+ (((ins_head) == NULL) ? NULL : (ins_head)->head[0])
+#define WT_SKIP_LAST(ins_head) \
+ (((ins_head) == NULL) ? NULL : (ins_head)->tail[0])
+#define WT_SKIP_NEXT(ins) ((ins)->next[0])
+#define WT_SKIP_FOREACH(ins, ins_head) \
+ for ((ins) = WT_SKIP_FIRST(ins_head); \
+ (ins) != NULL; \
+ (ins) = WT_SKIP_NEXT(ins))
+
+/*
+ * WT_INSERT_HEAD --
+ * The head of a skiplist of WT_INSERT items.
+ */
+struct __wt_insert_head {
+ WT_INSERT *head[WT_SKIP_MAXDEPTH]; /* first item on skiplists */
+ WT_INSERT *tail[WT_SKIP_MAXDEPTH]; /* last item on skiplists */
+};
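+
+/*
+ * Illustration only: level 0 of the skiplist links every node, so
+ * WT_SKIP_FOREACH visits inserted items in sorted order.
+ */
+static inline uint32_t
+__wt_example_skip_count(WT_INSERT_HEAD *ins_head)
+{
+	WT_INSERT *ins;
+	uint32_t cnt;
+
+	cnt = 0;
+	WT_SKIP_FOREACH(ins, ins_head)
+		++cnt;
+	return (cnt);
+}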
+
+/*
+ * The row-store leaf page insert lists are arrays of pointers to structures,
+ * and may not exist. The following macros return an array entry if the array
+ * of pointers and the specific structure exist, else NULL.
+ */
+#define WT_ROW_INSERT_SLOT(page, slot) \
+ ((page)->u.row.ins == NULL ? NULL : (page)->u.row.ins[slot])
+#define WT_ROW_INSERT(page, ip) \
+ WT_ROW_INSERT_SLOT(page, WT_ROW_SLOT(page, ip))
+#define WT_ROW_UPDATE(page, ip) \
+ ((page)->u.row.upd == NULL ? \
+ NULL : (page)->u.row.upd[WT_ROW_SLOT(page, ip)])
+/*
+ * WT_ROW_INSERT_SMALLEST references an additional slot past the end of the
+ * "one per WT_ROW slot" insert array. That's because the insert array
+ * requires an extra slot to hold keys that sort before any key found on the
+ * original page.
+ */
+#define WT_ROW_INSERT_SMALLEST(page) \
+ ((page)->u.row.ins == NULL ? NULL : (page)->u.row.ins[(page)->entries])
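+
+/*
+ * Illustration only: a sketch mapping a WT_ROW entry to the insert list
+ * holding keys that sort immediately before it; slot 0 is preceded by
+ * the "smallest" list, every other slot by the previous entry's list.
+ */
+static inline WT_INSERT_HEAD *
+__wt_example_insert_before(WT_PAGE *page, WT_ROW *rip)
+{
+	uint32_t slot;
+
+	slot = WT_ROW_SLOT(page, rip);
+	return (slot == 0 ?
+	    WT_ROW_INSERT_SMALLEST(page) :
+	    WT_ROW_INSERT_SLOT(page, slot - 1));
+}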
+
+/*
+ * The column-store leaf page update lists are arrays of pointers to structures,
+ * and may not exist. The following macros return an array entry if the array
+ * of pointers and the specific structure exist, else NULL.
+ */
+#define WT_COL_UPDATE_SLOT(page, slot) \
+ ((page)->modify == NULL || (page)->modify->update == NULL ? \
+ NULL : (page)->modify->update[slot])
+#define WT_COL_UPDATE(page, ip) \
+ WT_COL_UPDATE_SLOT(page, WT_COL_SLOT(page, ip))
+
+/*
+ * WT_COL_UPDATE_SINGLE is a single WT_INSERT list, used for any fixed-length
+ * column-store updates for a page.
+ */
+#define WT_COL_UPDATE_SINGLE(page) \
+ WT_COL_UPDATE_SLOT(page, 0)
+
+/*
+ * WT_COL_APPEND is a single WT_INSERT list, used for fixed- and variable-length
+ * appends.
+ */
+#define WT_COL_APPEND(btree, page) \
+ (F_ISSET((page), WT_PAGE_LAST_PAGE) && \
+ (btree)->append != NULL ? (btree)->append[0] : NULL)
+
+/* WT_FIX_FOREACH walks fixed-length bit-fields on a disk page. */
+#define WT_FIX_FOREACH(btree, dsk, v, i) \
+ for ((i) = 0, \
+ (v) = (i) < (dsk)->u.entries ? \
+ __bit_getv( \
+ WT_PAGE_HEADER_BYTE(btree, dsk), 0, (btree)->bitcnt) : 0; \
+ (i) < (dsk)->u.entries; ++(i), \
+ (v) = __bit_getv( \
+ WT_PAGE_HEADER_BYTE(btree, dsk), i, (btree)->bitcnt))
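+
+/*
+ * Illustration only: typical use of WT_FIX_FOREACH, counting the
+ * non-zero bit-field values on a fixed-length column-store disk page;
+ * the WT_BTREE definition is assumed to be in scope.
+ */
+static inline uint32_t
+__wt_example_fix_count_set(WT_BTREE *btree, WT_PAGE_HEADER *dsk)
+{
+	uint8_t v;
+	uint32_t cnt, i;
+
+	cnt = 0;
+	WT_FIX_FOREACH(btree, dsk, v, i)
+		if (v != 0)
+			++cnt;
+	return (cnt);
+}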