summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/include/block.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/include/block.h')
-rw-r--r--src/third_party/wiredtiger/src/include/block.h337
1 files changed, 337 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h
new file mode 100644
index 00000000000..10fa51243ac
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/block.h
@@ -0,0 +1,337 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WiredTiger's block manager interface.
+ */
+
+/*
+ * The file's description is written into the first block of the file, which
+ * means we can use an offset of 0 as an invalid offset.
+ */
+#define WT_BLOCK_INVALID_OFFSET 0
+
+/*
+ * The block manager maintains three per-checkpoint extent lists:
+ * alloc: the extents allocated in this checkpoint
+ * avail: the extents available for allocation
+ * discard: the extents freed in this checkpoint
+ *
+ * An extent list is based on two skiplists: first, a by-offset list linking
+ * WT_EXT elements and sorted by file offset (low-to-high), second, a by-size
+ * list linking WT_SIZE elements and sorted by chunk size (low-to-high).
+ *
+ * Additionally, each WT_SIZE element on the by-size has a skiplist of its own,
+ * linking WT_EXT elements and sorted by file offset (low-to-high). This list
+ * has an entry for extents of a particular size.
+ *
+ * The trickiness is each individual WT_EXT element appears on two skiplists.
+ * In order to minimize allocation calls, we allocate a single array of WT_EXT
+ * pointers at the end of the WT_EXT structure, for both skiplists, and store
+ * the depth of the skiplist in the WT_EXT structure. The skiplist entries for
+ * the offset skiplist start at WT_EXT.next[0] and the entries for the size
+ * skiplist start at WT_EXT.next[WT_EXT.depth].
+ *
+ * One final complication: we only maintain the per-size skiplist for the avail
+ * list, the alloc and discard extent lists are not searched based on size.
+ */
+
+/*
+ * WT_EXTLIST --
+ * An extent list.
+ */
+struct __wt_extlist {
+ char *name; /* Name */
+
+ uint64_t bytes; /* Byte count */
+ uint32_t entries; /* Entry count */
+
+ wt_off_t offset; /* Written extent offset */
+ uint32_t cksum, size; /* Written extent cksum, size */
+
+ int track_size; /* Maintain per-size skiplist */
+
+ WT_EXT *last; /* Cached last element */
+
+ WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Size/offset skiplists */
+ WT_SIZE *sz[WT_SKIP_MAXDEPTH];
+};
+
+/*
+ * WT_EXT --
+ * Encapsulation of an extent, either allocated or freed within the
+ * checkpoint.
+ */
+struct __wt_ext {
+ wt_off_t off; /* Extent's file offset */
+ wt_off_t size; /* Extent's Size */
+
+ uint8_t depth; /* Skip list depth */
+
+ /*
+ * Variable-length array, sized by the number of skiplist elements.
+ * The first depth array entries are the address skiplist elements,
+ * the second depth array entries are the size skiplist.
+ */
+ WT_EXT *next[0]; /* Offset, size skiplists */
+};
+
+/*
+ * WT_SIZE --
+ * Encapsulation of a block size skiplist entry.
+ */
+struct __wt_size {
+ wt_off_t size; /* Size */
+
+ uint8_t depth; /* Skip list depth */
+
+ WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Per-size offset skiplist */
+
+ /*
+ * We don't use a variable-length array for the size skiplist, we want
+ * to be able to use any cached WT_SIZE structure as the head of a list,
+ * and we don't know the related WT_EXT structure's depth.
+ */
+ WT_SIZE *next[WT_SKIP_MAXDEPTH]; /* Size skiplist */
+};
+
+/*
+ * WT_EXT_FOREACH --
+ * Walk a block manager skiplist.
+ * WT_EXT_FOREACH_OFF --
+ * Walk a block manager skiplist where the WT_EXT.next entries are offset
+ * by the depth.
+ */
+#define WT_EXT_FOREACH(skip, head) \
+ for ((skip) = (head)[0]; \
+ (skip) != NULL; (skip) = (skip)->next[0])
+#define WT_EXT_FOREACH_OFF(skip, head) \
+ for ((skip) = (head)[0]; \
+ (skip) != NULL; (skip) = (skip)->next[(skip)->depth])
+
+/*
+ * Checkpoint cookie: carries a version number as I don't want to rev the schema
+ * file version should the default block manager checkpoint format change.
+ *
+ * Version #1 checkpoint cookie format:
+ * [1] [root addr] [alloc addr] [avail addr] [discard addr]
+ * [file size] [checkpoint size] [write generation]
+ */
+#define WT_BM_CHECKPOINT_VERSION 1 /* Checkpoint format version */
+#define WT_BLOCK_EXTLIST_MAGIC 71002 /* Identify a list */
+struct __wt_block_ckpt {
+ uint8_t version; /* Version */
+
+ wt_off_t root_offset; /* The root */
+ uint32_t root_cksum, root_size;
+
+ WT_EXTLIST alloc; /* Extents allocated */
+ WT_EXTLIST avail; /* Extents available */
+ WT_EXTLIST discard; /* Extents discarded */
+
+ wt_off_t file_size; /* Checkpoint file size */
+ uint64_t ckpt_size; /* Checkpoint byte count */
+
+ WT_EXTLIST ckpt_avail; /* Checkpoint free'd extents */
+
+ /*
+ * Checkpoint archive: the block manager may potentially free a lot of
+ * memory from the allocation and discard extent lists when checkpoint
+ * completes. Put it off until the checkpoint resolves, that lets the
+ * upper btree layer continue eviction sooner.
+ */
+ WT_EXTLIST ckpt_alloc; /* Checkpoint archive */
+ WT_EXTLIST ckpt_discard; /* Checkpoint archive */
+};
+
+/*
+ * WT_BM --
+ * Block manager handle, references a single checkpoint in a file.
+ */
+struct __wt_bm {
+ /* Methods */
+ int (*addr_string)
+ (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
+ int (*addr_valid)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ u_int (*block_header)(WT_BM *);
+ int (*checkpoint)
+ (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, WT_CKPT *, int);
+ int (*checkpoint_load)(WT_BM *, WT_SESSION_IMPL *,
+ const uint8_t *, size_t, uint8_t *, size_t *, int);
+ int (*checkpoint_resolve)(WT_BM *, WT_SESSION_IMPL *);
+ int (*checkpoint_unload)(WT_BM *, WT_SESSION_IMPL *);
+ int (*close)(WT_BM *, WT_SESSION_IMPL *);
+ int (*compact_end)(WT_BM *, WT_SESSION_IMPL *);
+ int (*compact_page_skip)
+ (WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t, int *);
+ int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, int *);
+ int (*compact_start)(WT_BM *, WT_SESSION_IMPL *);
+ int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ int (*read)
+ (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
+ int (*salvage_end)(WT_BM *, WT_SESSION_IMPL *);
+ int (*salvage_next)
+ (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t *, int *);
+ int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *);
+ int (*salvage_valid)
+ (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, int);
+ int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats);
+ int (*sync)(WT_BM *, WT_SESSION_IMPL *, int);
+ int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ int (*verify_end)(WT_BM *, WT_SESSION_IMPL *);
+ int (*verify_start)(WT_BM *, WT_SESSION_IMPL *, WT_CKPT *);
+ int (*write) (WT_BM *,
+ WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, int);
+ int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *);
+
+ WT_BLOCK *block; /* Underlying file */
+
+ void *map; /* Mapped region */
+ size_t maplen;
+ void *mappingcookie;
+
+ /*
+ * There's only a single block manager handle that can be written, all
+ * others are checkpoints.
+ */
+ int is_live; /* The live system */
+};
+
+/*
+ * WT_BLOCK --
+ * Block manager handle, references a single file.
+ */
+struct __wt_block {
+ const char *name; /* Name */
+
+ /* A list of block manager handles, sharing a file descriptor. */
+ uint32_t ref; /* References */
+ WT_FH *fh; /* Backing file handle */
+ TAILQ_ENTRY(__wt_block) q; /* Linked list of handles */
+
+ /* Configuration information, set when the file is opened. */
+ int allocfirst; /* Allocation is first-fit */
+ int allocfirst_save; /* Allocation is first-fit, saved */
+ uint32_t allocsize; /* Allocation size */
+ size_t os_cache; /* System buffer cache flush max */
+ size_t os_cache_max;
+ size_t os_cache_dirty; /* System buffer cache write max */
+ size_t os_cache_dirty_max;
+
+ u_int block_header; /* Header length */
+
+ /*
+ * There is only a single checkpoint in a file that can be written. The
+ * information could logically live in the WT_BM structure, but then we
+ * would be re-creating it every time we opened a new checkpoint and I'd
+ * rather not do that. So, it's stored here, only accessed by one WT_BM
+ * handle.
+ */
+ WT_SPINLOCK live_lock; /* Live checkpoint lock */
+ WT_BLOCK_CKPT live; /* Live checkpoint */
+ int ckpt_inprogress;/* Live checkpoint in progress */
+
+ /* Salvage support */
+ wt_off_t slvg_off; /* Salvage file offset */
+
+ /* Verification support */
+ int verify; /* If performing verification */
+ wt_off_t verify_size; /* Checkpoint's file size */
+ WT_EXTLIST verify_alloc; /* Verification allocation list */
+ uint64_t frags; /* Maximum frags in the file */
+ uint8_t *fragfile; /* Per-file frag tracking list */
+ uint8_t *fragckpt; /* Per-checkpoint frag tracking list */
+};
+
+/*
+ * WT_BLOCK_DESC --
+ * The file's description.
+ */
+struct __wt_block_desc {
+#define WT_BLOCK_MAGIC 120897
+ uint32_t magic; /* 00-03: Magic number */
+#define WT_BLOCK_MAJOR_VERSION 1
+ uint16_t majorv; /* 04-05: Major version */
+#define WT_BLOCK_MINOR_VERSION 0
+ uint16_t minorv; /* 06-07: Minor version */
+
+ uint32_t cksum; /* 08-11: Description block checksum */
+
+ uint32_t unused; /* 12-15: Padding */
+};
+/*
+ * WT_BLOCK_DESC_SIZE is the expected structure size -- we verify the build to
+ * ensure the compiler hasn't inserted padding (padding won't cause failure,
+ * we reserve the first allocation-size block of the file for this information,
+ * but it would be worth investigation, regardless).
+ */
+#define WT_BLOCK_DESC_SIZE 16
+
+/*
+ * WT_BLOCK_HEADER --
+ * Blocks have a common header, a WT_PAGE_HEADER structure followed by a
+ * block-manager specific structure: WT_BLOCK_HEADER is WiredTiger's default.
+ */
+struct __wt_block_header {
+ /*
+ * We write the page size in the on-disk page header because it makes
+ * salvage easier. (If we don't know the expected page length, we'd
+ * have to read increasingly larger chunks from the file until we find
+ * one that checksums, and that's going to be harsh given WiredTiger's
+ * potentially large page sizes.)
+ */
+ uint32_t disk_size; /* 00-03: on-disk page size */
+
+ /*
+ * Page checksums are stored in two places. First, the page checksum
+ * is written within the internal page that references it as part of
+ * the address cookie. This is done to improve the chances of detecting
+ * not only disk corruption but other bugs (for example, overwriting a
+ * page with another valid page image). Second, a page's checksum is
+ * stored in the disk header. This is for salvage, so salvage knows it
+ * has found a page that may be useful.
+ */
+ uint32_t cksum; /* 04-07: checksum */
+
+#define WT_BLOCK_DATA_CKSUM 0x01 /* Block data is part of the checksum */
+ uint8_t flags; /* 08: flags */
+
+ /*
+ * End the structure with 3 bytes of padding: it wastes space, but it
+ * leaves the structure 32-bit aligned and having a few bytes to play
+ * with in the future can't hurt.
+ */
+ uint8_t unused[3]; /* 09-11: unused padding */
+};
+/*
+ * WT_BLOCK_HEADER_SIZE is the number of bytes we allocate for the structure: if
+ * the compiler inserts padding it will break the world.
+ */
+#define WT_BLOCK_HEADER_SIZE 12
+
+/*
+ * WT_BLOCK_HEADER_BYTE
+ * WT_BLOCK_HEADER_BYTE_SIZE --
+ * The first usable data byte on the block (past the combined headers).
+ */
+#define WT_BLOCK_HEADER_BYTE_SIZE \
+ (WT_PAGE_HEADER_SIZE + WT_BLOCK_HEADER_SIZE)
+#define WT_BLOCK_HEADER_BYTE(dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_BLOCK_HEADER_BYTE_SIZE))
+
+/*
+ * Don't compress the block's WT_PAGE_HEADER and WT_BLOCK_HEADER structures.
+ * We need the WT_PAGE_HEADER in-memory size, and the WT_BLOCK_HEADER checksum
+ * and on-disk size to be immediately available without decompression. We use
+ * the on-disk size and checksum during salvage to figure out where the blocks
+ * are, and the in-memory size tells us how large a buffer we need to decompress
+ * the block. We could skip less than 64B, but a 64B boundary may offer better
+ * alignment for the underlying compression engine, and skipping 64B won't make
+ * a difference in terms of compression efficiency.
+ */
+#define WT_BLOCK_COMPRESS_SKIP 64