diff options
Diffstat (limited to 'src/include/btmem.h')
-rw-r--r-- | src/include/btmem.h | 66 |
1 files changed, 46 insertions, 20 deletions
diff --git a/src/include/btmem.h b/src/include/btmem.h index f13504d66ca..f214ddb1dc3 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -6,6 +6,8 @@ * See the file LICENSE for redistribution information. */ +#define WT_RECNO_OOB 0 /* Illegal record number */ + /* * WT_PAGE_HEADER -- * Blocks have a common header, a WT_PAGE_HEADER structure followed by a @@ -43,6 +45,7 @@ struct __wt_page_header { #define WT_PAGE_EMPTY_V_ALL 0x02 /* Page has all zero-length values */ #define WT_PAGE_EMPTY_V_NONE 0x04 /* Page has no zero-length values */ #define WT_PAGE_ENCRYPTED 0x08 /* Page is encrypted on disk */ +#define WT_PAGE_LAS_UPDATE 0x10 /* Page updates in lookaside store */ uint8_t flags; /* 25: flags */ /* @@ -168,6 +171,29 @@ struct __wt_ovfl_txnc { }; /* + * Lookaside table support: when a page is being reconciled for eviction and has + * updates that might be required by earlier readers in the system, the updates + * are written into a lookaside table, and restored as necessary if the page is + * read. The key is a unique marker for the page (a file ID plus an address), + * a counter (used to ensure the update records remain in the original order), + * the on-page item's transaction ID (so we can discard any update records from + * the lookaside table once the on-page item's transaction is globally visible), + * and the page key (byte-string for row-store, record number for column-store). + * The value is the WT_UPDATE structure's transaction ID, update size and value. + * + * As the key for the lookaside table is different for row- and column-store, we + * store both key types in a WT_ITEM, building/parsing them in the code, because + * otherwise we'd need two lookaside files with different key formats. We could + * make the lookaside table's key standard by moving the source key into the + * lookaside table value, but that doesn't make the coding any simpler, and it + * makes the lookaside table's value more likely to overflow the page size when + * the row-store key is relatively large. + */ +#define WT_LAS_FORMAT \ + "key_format=" WT_UNCHECKED_STRING(IuQQu) \ + ",value_format=" WT_UNCHECKED_STRING(QIu) + +/* * WT_PAGE_MODIFY -- * When a page is modified, there's additional information to maintain. */ @@ -238,15 +264,17 @@ struct __wt_page_modify { * Eviction, but block wasn't written: unresolved updates and * associated disk image. * - * Skipped updates are either a WT_INSERT, or a row-store leaf - * page entry. + * Saved updates are either a WT_INSERT, or a row-store leaf + * page entry; in the case of creating lookaside records, there + * is an additional value, the committed item's transaction ID. */ - struct __wt_upd_skipped { + struct __wt_save_upd { WT_INSERT *ins; WT_ROW *rip; - } *skip; - uint32_t skip_entries; - void *skip_dsk; + uint64_t onpage_txn; + } *supd; + uint32_t supd_entries; + void *supd_dsk; /* * Block was written: address, size and checksum. @@ -556,9 +584,8 @@ struct __wt_page { #define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */ #define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ -#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */ +#define WT_PAGE_RECONCILIATION 0x10 /* Page reconciliation lock */ #define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ -#define WT_PAGE_SPLIT_LOCKED 0x40 /* An internal page is growing */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ /* @@ -656,14 +683,6 @@ struct __wt_page { * to the readers. If the evicting thread does not find a hazard pointer, * the page is evicted. */ -typedef enum __wt_page_state { - WT_REF_DISK=0, /* Page is on disk */ - WT_REF_DELETED, /* Page is on disk, but deleted */ - WT_REF_LOCKED, /* Page locked for exclusive access */ - WT_REF_MEM, /* Page is in cache and valid */ - WT_REF_READING, /* Page being read */ - WT_REF_SPLIT /* Parent page split (WT_REF dead) */ -} WT_PAGE_STATE; /* * WT_PAGE_DELETED -- @@ -691,7 +710,13 @@ struct __wt_ref { WT_PAGE * volatile home; /* Reference page */ uint32_t pindex_hint; /* Reference page index hint */ - volatile WT_PAGE_STATE state; /* Page state */ +#define WT_REF_DISK 0 /* Page is on disk */ +#define WT_REF_DELETED 1 /* Page is on disk, but deleted */ +#define WT_REF_LOCKED 2 /* Page locked for exclusive access */ +#define WT_REF_MEM 3 /* Page is in cache and valid */ +#define WT_REF_READING 4 /* Page being read */ +#define WT_REF_SPLIT 5 /* Parent page split (WT_REF dead) */ + volatile uint32_t state; /* Page state */ /* * Address: on-page cell if read from backing block, off-page WT_ADDR @@ -871,8 +896,9 @@ WT_PACKED_STRUCT_BEGIN(__wt_update) * store 4GB objects; I'd rather do that than increase the size of this * structure for a flag bit. */ -#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == UINT32_MAX) -#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = UINT32_MAX) +#define WT_UPDATE_DELETED_VALUE UINT32_MAX +#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = WT_UPDATE_DELETED_VALUE) +#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == WT_UPDATE_DELETED_VALUE) uint32_t size; /* update length */ /* The untyped value immediately follows the WT_UPDATE structure. */ @@ -958,7 +984,7 @@ struct __wt_insert { #define WT_PAGE_ALLOC_AND_SWAP(s, page, dest, v, count) do { \ if (((v) = (dest)) == NULL) { \ WT_ERR(__wt_calloc_def(s, count, &(v))); \ - if (WT_ATOMIC_CAS8(dest, NULL, v)) \ + if (__wt_atomic_cas_ptr(&dest, NULL, v)) \ __wt_cache_page_inmem_incr( \ s, page, (count) * sizeof(*(v))); \ else \ |