summary | refs | log | tree | commit | diff
path: root/src/include/btmem.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/include/btmem.h')
-rw-r--r-- src/include/btmem.h 66
1 files changed, 46 insertions, 20 deletions
diff --git a/src/include/btmem.h b/src/include/btmem.h
index f13504d66ca..f214ddb1dc3 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -6,6 +6,8 @@
* See the file LICENSE for redistribution information.
*/
+#define WT_RECNO_OOB 0 /* Illegal record number */
+
/*
* WT_PAGE_HEADER --
* Blocks have a common header, a WT_PAGE_HEADER structure followed by a
@@ -43,6 +45,7 @@ struct __wt_page_header {
#define WT_PAGE_EMPTY_V_ALL 0x02 /* Page has all zero-length values */
#define WT_PAGE_EMPTY_V_NONE 0x04 /* Page has no zero-length values */
#define WT_PAGE_ENCRYPTED 0x08 /* Page is encrypted on disk */
+#define WT_PAGE_LAS_UPDATE 0x10 /* Page updates in lookaside store */
uint8_t flags; /* 25: flags */
/*
@@ -168,6 +171,29 @@ struct __wt_ovfl_txnc {
};
/*
+ * Lookaside table support: when a page is being reconciled for eviction and has
+ * updates that might be required by earlier readers in the system, the updates
+ * are written into a lookaside table, and restored as necessary if the page is
+ * read. The key is a unique marker for the page (a file ID plus an address),
+ * a counter (used to ensure the update records remain in the original order),
+ * the on-page item's transaction ID (so we can discard any update records from
+ * the lookaside table once the on-page item's transaction is globally visible),
+ * and the page key (byte-string for row-store, record number for column-store).
+ * The value is the WT_UPDATE structure's transaction ID, update size and value.
+ *
+ * As the key for the lookaside table is different for row- and column-store, we
+ * store both key types in a WT_ITEM, building/parsing them in the code, because
+ * otherwise we'd need two lookaside files with different key formats. We could
+ * make the lookaside table's key standard by moving the source key into the
+ * lookaside table value, but that doesn't make the coding any simpler, and it
+ * makes the lookaside table's value more likely to overflow the page size when
+ * the row-store key is relatively large.
+ */
+#define WT_LAS_FORMAT \
+ "key_format=" WT_UNCHECKED_STRING(IuQQu) \
+ ",value_format=" WT_UNCHECKED_STRING(QIu)
+
+/*
* WT_PAGE_MODIFY --
* When a page is modified, there's additional information to maintain.
*/
@@ -238,15 +264,17 @@ struct __wt_page_modify {
* Eviction, but block wasn't written: unresolved updates and
* associated disk image.
*
- * Skipped updates are either a WT_INSERT, or a row-store leaf
- * page entry.
+ * Saved updates are either a WT_INSERT, or a row-store leaf
+ * page entry; in the case of creating lookaside records, there
+ * is an additional value, the committed item's transaction ID.
*/
- struct __wt_upd_skipped {
+ struct __wt_save_upd {
WT_INSERT *ins;
WT_ROW *rip;
- } *skip;
- uint32_t skip_entries;
- void *skip_dsk;
+ uint64_t onpage_txn;
+ } *supd;
+ uint32_t supd_entries;
+ void *supd_dsk;
/*
* Block was written: address, size and checksum.
@@ -556,9 +584,8 @@ struct __wt_page {
#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
-#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */
+#define WT_PAGE_RECONCILIATION 0x10 /* Page reconciliation lock */
#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
-#define WT_PAGE_SPLIT_LOCKED 0x40 /* An internal page is growing */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
/*
@@ -656,14 +683,6 @@ struct __wt_page {
* to the readers. If the evicting thread does not find a hazard pointer,
* the page is evicted.
*/
-typedef enum __wt_page_state {
- WT_REF_DISK=0, /* Page is on disk */
- WT_REF_DELETED, /* Page is on disk, but deleted */
- WT_REF_LOCKED, /* Page locked for exclusive access */
- WT_REF_MEM, /* Page is in cache and valid */
- WT_REF_READING, /* Page being read */
- WT_REF_SPLIT /* Parent page split (WT_REF dead) */
-} WT_PAGE_STATE;
/*
* WT_PAGE_DELETED --
@@ -691,7 +710,13 @@ struct __wt_ref {
WT_PAGE * volatile home; /* Reference page */
uint32_t pindex_hint; /* Reference page index hint */
- volatile WT_PAGE_STATE state; /* Page state */
+#define WT_REF_DISK 0 /* Page is on disk */
+#define WT_REF_DELETED 1 /* Page is on disk, but deleted */
+#define WT_REF_LOCKED 2 /* Page locked for exclusive access */
+#define WT_REF_MEM 3 /* Page is in cache and valid */
+#define WT_REF_READING 4 /* Page being read */
+#define WT_REF_SPLIT 5 /* Parent page split (WT_REF dead) */
+ volatile uint32_t state; /* Page state */
/*
* Address: on-page cell if read from backing block, off-page WT_ADDR
@@ -871,8 +896,9 @@ WT_PACKED_STRUCT_BEGIN(__wt_update)
* store 4GB objects; I'd rather do that than increase the size of this
* structure for a flag bit.
*/
-#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == UINT32_MAX)
-#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = UINT32_MAX)
+#define WT_UPDATE_DELETED_VALUE UINT32_MAX
+#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = WT_UPDATE_DELETED_VALUE)
+#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == WT_UPDATE_DELETED_VALUE)
uint32_t size; /* update length */
/* The untyped value immediately follows the WT_UPDATE structure. */
@@ -958,7 +984,7 @@ struct __wt_insert {
#define WT_PAGE_ALLOC_AND_SWAP(s, page, dest, v, count) do { \
if (((v) = (dest)) == NULL) { \
WT_ERR(__wt_calloc_def(s, count, &(v))); \
- if (WT_ATOMIC_CAS8(dest, NULL, v)) \
+ if (__wt_atomic_cas_ptr(&dest, NULL, v)) \
__wt_cache_page_inmem_incr( \
s, page, (count) * sizeof(*(v))); \
else \