diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/include/block.h')
-rw-r--r-- | src/third_party/wiredtiger/src/include/block.h | 337 |
1 files changed, 337 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h new file mode 100644 index 00000000000..10fa51243ac --- /dev/null +++ b/src/third_party/wiredtiger/src/include/block.h @@ -0,0 +1,337 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * WiredTiger's block manager interface. + */ + +/* + * The file's description is written into the first block of the file, which + * means we can use an offset of 0 as an invalid offset. + */ +#define WT_BLOCK_INVALID_OFFSET 0 + +/* + * The block manager maintains three per-checkpoint extent lists: + * alloc: the extents allocated in this checkpoint + * avail: the extents available for allocation + * discard: the extents freed in this checkpoint + * + * An extent list is based on two skiplists: first, a by-offset list linking + * WT_EXT elements and sorted by file offset (low-to-high), second, a by-size + * list linking WT_SIZE elements and sorted by chunk size (low-to-high). + * + * Additionally, each WT_SIZE element on the by-size has a skiplist of its own, + * linking WT_EXT elements and sorted by file offset (low-to-high). This list + * has an entry for extents of a particular size. + * + * The trickiness is each individual WT_EXT element appears on two skiplists. + * In order to minimize allocation calls, we allocate a single array of WT_EXT + * pointers at the end of the WT_EXT structure, for both skiplists, and store + * the depth of the skiplist in the WT_EXT structure. The skiplist entries for + * the offset skiplist start at WT_EXT.next[0] and the entries for the size + * skiplist start at WT_EXT.next[WT_EXT.depth]. + * + * One final complication: we only maintain the per-size skiplist for the avail + * list, the alloc and discard extent lists are not searched based on size. + */ + +/* + * WT_EXTLIST -- + * An extent list. + */ +struct __wt_extlist { + char *name; /* Name */ + + uint64_t bytes; /* Byte count */ + uint32_t entries; /* Entry count */ + + wt_off_t offset; /* Written extent offset */ + uint32_t cksum, size; /* Written extent cksum, size */ + + int track_size; /* Maintain per-size skiplist */ + + WT_EXT *last; /* Cached last element */ + + WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Size/offset skiplists */ + WT_SIZE *sz[WT_SKIP_MAXDEPTH]; +}; + +/* + * WT_EXT -- + * Encapsulation of an extent, either allocated or freed within the + * checkpoint. + */ +struct __wt_ext { + wt_off_t off; /* Extent's file offset */ + wt_off_t size; /* Extent's Size */ + + uint8_t depth; /* Skip list depth */ + + /* + * Variable-length array, sized by the number of skiplist elements. + * The first depth array entries are the address skiplist elements, + * the second depth array entries are the size skiplist. + */ + WT_EXT *next[0]; /* Offset, size skiplists */ +}; + +/* + * WT_SIZE -- + * Encapsulation of a block size skiplist entry. + */ +struct __wt_size { + wt_off_t size; /* Size */ + + uint8_t depth; /* Skip list depth */ + + WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Per-size offset skiplist */ + + /* + * We don't use a variable-length array for the size skiplist, we want + * to be able to use any cached WT_SIZE structure as the head of a list, + * and we don't know the related WT_EXT structure's depth. + */ + WT_SIZE *next[WT_SKIP_MAXDEPTH]; /* Size skiplist */ +}; + +/* + * WT_EXT_FOREACH -- + * Walk a block manager skiplist. + * WT_EXT_FOREACH_OFF -- + * Walk a block manager skiplist where the WT_EXT.next entries are offset + * by the depth. + */ +#define WT_EXT_FOREACH(skip, head) \ + for ((skip) = (head)[0]; \ + (skip) != NULL; (skip) = (skip)->next[0]) +#define WT_EXT_FOREACH_OFF(skip, head) \ + for ((skip) = (head)[0]; \ + (skip) != NULL; (skip) = (skip)->next[(skip)->depth]) + +/* + * Checkpoint cookie: carries a version number as I don't want to rev the schema + * file version should the default block manager checkpoint format change. + * + * Version #1 checkpoint cookie format: + * [1] [root addr] [alloc addr] [avail addr] [discard addr] + * [file size] [checkpoint size] [write generation] + */ +#define WT_BM_CHECKPOINT_VERSION 1 /* Checkpoint format version */ +#define WT_BLOCK_EXTLIST_MAGIC 71002 /* Identify a list */ +struct __wt_block_ckpt { + uint8_t version; /* Version */ + + wt_off_t root_offset; /* The root */ + uint32_t root_cksum, root_size; + + WT_EXTLIST alloc; /* Extents allocated */ + WT_EXTLIST avail; /* Extents available */ + WT_EXTLIST discard; /* Extents discarded */ + + wt_off_t file_size; /* Checkpoint file size */ + uint64_t ckpt_size; /* Checkpoint byte count */ + + WT_EXTLIST ckpt_avail; /* Checkpoint free'd extents */ + + /* + * Checkpoint archive: the block manager may potentially free a lot of + * memory from the allocation and discard extent lists when checkpoint + * completes. Put it off until the checkpoint resolves, that lets the + * upper btree layer continue eviction sooner. + */ + WT_EXTLIST ckpt_alloc; /* Checkpoint archive */ + WT_EXTLIST ckpt_discard; /* Checkpoint archive */ +}; + +/* + * WT_BM -- + * Block manager handle, references a single checkpoint in a file. + */ +struct __wt_bm { + /* Methods */ + int (*addr_string) + (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t); + int (*addr_valid)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); + u_int (*block_header)(WT_BM *); + int (*checkpoint) + (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, WT_CKPT *, int); + int (*checkpoint_load)(WT_BM *, WT_SESSION_IMPL *, + const uint8_t *, size_t, uint8_t *, size_t *, int); + int (*checkpoint_resolve)(WT_BM *, WT_SESSION_IMPL *); + int (*checkpoint_unload)(WT_BM *, WT_SESSION_IMPL *); + int (*close)(WT_BM *, WT_SESSION_IMPL *); + int (*compact_end)(WT_BM *, WT_SESSION_IMPL *); + int (*compact_page_skip) + (WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t, int *); + int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, int *); + int (*compact_start)(WT_BM *, WT_SESSION_IMPL *); + int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); + int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); + int (*read) + (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t); + int (*salvage_end)(WT_BM *, WT_SESSION_IMPL *); + int (*salvage_next) + (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t *, int *); + int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *); + int (*salvage_valid) + (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, int); + int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats); + int (*sync)(WT_BM *, WT_SESSION_IMPL *, int); + int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); + int (*verify_end)(WT_BM *, WT_SESSION_IMPL *); + int (*verify_start)(WT_BM *, WT_SESSION_IMPL *, WT_CKPT *); + int (*write) (WT_BM *, + WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, int); + int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *); + + WT_BLOCK *block; /* Underlying file */ + + void *map; /* Mapped region */ + size_t maplen; + void *mappingcookie; + + /* + * There's only a single block manager handle that can be written, all + * others are checkpoints. + */ + int is_live; /* The live system */ +}; + +/* + * WT_BLOCK -- + * Block manager handle, references a single file. + */ +struct __wt_block { + const char *name; /* Name */ + + /* A list of block manager handles, sharing a file descriptor. */ + uint32_t ref; /* References */ + WT_FH *fh; /* Backing file handle */ + TAILQ_ENTRY(__wt_block) q; /* Linked list of handles */ + + /* Configuration information, set when the file is opened. */ + int allocfirst; /* Allocation is first-fit */ + int allocfirst_save; /* Allocation is first-fit, saved */ + uint32_t allocsize; /* Allocation size */ + size_t os_cache; /* System buffer cache flush max */ + size_t os_cache_max; + size_t os_cache_dirty; /* System buffer cache write max */ + size_t os_cache_dirty_max; + + u_int block_header; /* Header length */ + + /* + * There is only a single checkpoint in a file that can be written. The + * information could logically live in the WT_BM structure, but then we + * would be re-creating it every time we opened a new checkpoint and I'd + * rather not do that. So, it's stored here, only accessed by one WT_BM + * handle. + */ + WT_SPINLOCK live_lock; /* Live checkpoint lock */ + WT_BLOCK_CKPT live; /* Live checkpoint */ + int ckpt_inprogress;/* Live checkpoint in progress */ + + /* Salvage support */ + wt_off_t slvg_off; /* Salvage file offset */ + + /* Verification support */ + int verify; /* If performing verification */ + wt_off_t verify_size; /* Checkpoint's file size */ + WT_EXTLIST verify_alloc; /* Verification allocation list */ + uint64_t frags; /* Maximum frags in the file */ + uint8_t *fragfile; /* Per-file frag tracking list */ + uint8_t *fragckpt; /* Per-checkpoint frag tracking list */ +}; + +/* + * WT_BLOCK_DESC -- + * The file's description. + */ +struct __wt_block_desc { +#define WT_BLOCK_MAGIC 120897 + uint32_t magic; /* 00-03: Magic number */ +#define WT_BLOCK_MAJOR_VERSION 1 + uint16_t majorv; /* 04-05: Major version */ +#define WT_BLOCK_MINOR_VERSION 0 + uint16_t minorv; /* 06-07: Minor version */ + + uint32_t cksum; /* 08-11: Description block checksum */ + + uint32_t unused; /* 12-15: Padding */ +}; +/* + * WT_BLOCK_DESC_SIZE is the expected structure size -- we verify the build to + * ensure the compiler hasn't inserted padding (padding won't cause failure, + * we reserve the first allocation-size block of the file for this information, + * but it would be worth investigation, regardless). + */ +#define WT_BLOCK_DESC_SIZE 16 + +/* + * WT_BLOCK_HEADER -- + * Blocks have a common header, a WT_PAGE_HEADER structure followed by a + * block-manager specific structure: WT_BLOCK_HEADER is WiredTiger's default. + */ +struct __wt_block_header { + /* + * We write the page size in the on-disk page header because it makes + * salvage easier. (If we don't know the expected page length, we'd + * have to read increasingly larger chunks from the file until we find + * one that checksums, and that's going to be harsh given WiredTiger's + * potentially large page sizes.) + */ + uint32_t disk_size; /* 00-03: on-disk page size */ + + /* + * Page checksums are stored in two places. First, the page checksum + * is written within the internal page that references it as part of + * the address cookie. This is done to improve the chances of detecting + * not only disk corruption but other bugs (for example, overwriting a + * page with another valid page image). Second, a page's checksum is + * stored in the disk header. This is for salvage, so salvage knows it + * has found a page that may be useful. + */ + uint32_t cksum; /* 04-07: checksum */ + +#define WT_BLOCK_DATA_CKSUM 0x01 /* Block data is part of the checksum */ + uint8_t flags; /* 08: flags */ + + /* + * End the structure with 3 bytes of padding: it wastes space, but it + * leaves the structure 32-bit aligned and having a few bytes to play + * with in the future can't hurt. + */ + uint8_t unused[3]; /* 09-11: unused padding */ +}; +/* + * WT_BLOCK_HEADER_SIZE is the number of bytes we allocate for the structure: if + * the compiler inserts padding it will break the world. + */ +#define WT_BLOCK_HEADER_SIZE 12 + +/* + * WT_BLOCK_HEADER_BYTE + * WT_BLOCK_HEADER_BYTE_SIZE -- + * The first usable data byte on the block (past the combined headers). + */ +#define WT_BLOCK_HEADER_BYTE_SIZE \ + (WT_PAGE_HEADER_SIZE + WT_BLOCK_HEADER_SIZE) +#define WT_BLOCK_HEADER_BYTE(dsk) \ + ((void *)((uint8_t *)(dsk) + WT_BLOCK_HEADER_BYTE_SIZE)) + +/* + * Don't compress the block's WT_PAGE_HEADER and WT_BLOCK_HEADER structures. + * We need the WT_PAGE_HEADER in-memory size, and the WT_BLOCK_HEADER checksum + * and on-disk size to be immediately available without decompression. We use + * the on-disk size and checksum during salvage to figure out where the blocks + * are, and the in-memory size tells us how large a buffer we need to decompress + * the block. We could skip less than 64B, but a 64B boundary may offer better + * alignment for the underlying compression engine, and skipping 64B won't make + * a difference in terms of compression efficiency. + */ +#define WT_BLOCK_COMPRESS_SKIP 64 |