/*-
 * Copyright (c) 2014-2019 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *  All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

/*
 * WiredTiger's block manager interface.
 */

/*
 * The file's description is written into the first block of the file, which
 * means we can use an offset of 0 as an invalid offset.
 */
#define WT_BLOCK_INVALID_OFFSET     0

/*
 * The block manager maintains three per-checkpoint extent lists:
 *  alloc:   the extents allocated in this checkpoint
 *  avail:   the extents available for allocation
 *  discard: the extents freed in this checkpoint
 *
 * An extent list is based on two skiplists: first, a by-offset list linking
 * WT_EXT elements and sorted by file offset (low-to-high); second, a by-size
 * list linking WT_SIZE elements and sorted by chunk size (low-to-high).
 *
 * Additionally, each WT_SIZE element on the by-size list has a skiplist of
 * its own, linking WT_EXT elements and sorted by file offset (low-to-high).
 * This list has an entry for extents of a particular size.
 *
 * The trick is that each individual WT_EXT element appears on two skiplists.
 * In order to minimize allocation calls, we allocate a single array of WT_EXT
 * pointers at the end of the WT_EXT structure, for both skiplists, and store
 * the depth of the skiplist in the WT_EXT structure. The skiplist entries for
 * the offset skiplist start at WT_EXT.next[0] and the entries for the size
 * skiplist start at WT_EXT.next[WT_EXT.depth].
 *
 * One final complication: we only maintain the per-size skiplist for the
 * avail list; the alloc and discard extent lists are not searched based on
 * size.
 */

/*
 * WT_EXTLIST --
 *  An extent list.
 */
struct __wt_extlist {
    char *name;                     /* Name */

    uint64_t bytes;                 /* Byte count */
    uint32_t entries;               /* Entry count */

    wt_off_t offset;                /* Written extent offset */
    uint32_t checksum;              /* Written extent checksum */
    uint32_t size;                  /* Written extent size */

    bool track_size;                /* Maintain per-size skiplist */

    WT_EXT *last;                   /* Cached last element */

    WT_EXT *off[WT_SKIP_MAXDEPTH];  /* Size/offset skiplists */
    WT_SIZE *sz[WT_SKIP_MAXDEPTH];
};

/*
 * WT_EXT --
 *  Encapsulation of an extent, either allocated or freed within the
 * checkpoint.
 */
struct __wt_ext {
    wt_off_t off;                   /* Extent's file offset */
    wt_off_t size;                  /* Extent's size */

    uint8_t depth;                  /* Skip list depth */

    /*
     * Variable-length array, sized by the number of skiplist elements.
     * The first depth array entries are the address skiplist elements,
     * the second depth array entries are the size skiplist.
     */
    WT_EXT *next[0];                /* Offset, size skiplists */
};

/*
 * WT_SIZE --
 *  Encapsulation of a block size skiplist entry.
 */
struct __wt_size {
    wt_off_t size;                  /* Size */

    uint8_t depth;                  /* Skip list depth */

    WT_EXT *off[WT_SKIP_MAXDEPTH];  /* Per-size offset skiplist */

    /*
     * We don't use a variable-length array for the size skiplist: we want to
     * be able to use any cached WT_SIZE structure as the head of a list, and
     * we don't know the related WT_EXT structure's depth.
     */
    WT_SIZE *next[WT_SKIP_MAXDEPTH];    /* Size skiplist */
};
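/*
 * Example (illustrative sketch, not part of this header): one way the shared
 * WT_EXT.next array can be allocated and addressed. The helper names are
 * hypothetical, and the sketch assumes the __wt_calloc/WT_RET helpers used
 * elsewhere in the tree. A WT_EXT of skiplist depth "depth" carries
 * 2 * depth pointers: the first depth entries serve the by-offset skiplist,
 * the second depth entries serve the per-size offset skiplist.
 */
static inline int
example_ext_alloc(WT_SESSION_IMPL *session, u_int depth, WT_EXT **extp)
{
    WT_EXT *ext;

    /* One allocation covers the structure plus both pointer arrays. */
    WT_RET(__wt_calloc(session, 1,
        sizeof(WT_EXT) + 2 * (size_t)depth * sizeof(WT_EXT *), &ext));
    ext->depth = (uint8_t)depth;
    *extp = ext;
    return (0);
}

/* The size-skiplist entries start immediately after the offset entries. */
static inline WT_EXT **
example_ext_size_skip(WT_EXT *ext)
{
    return (&ext->next[ext->depth]);
}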
/*
 * WT_EXT_FOREACH --
 *  Walk a block manager skiplist.
 * WT_EXT_FOREACH_OFF --
 *  Walk a block manager skiplist where the WT_EXT.next entries are offset by
 * the depth.
 */
#define WT_EXT_FOREACH(skip, head)                                      \
    for ((skip) = (head)[0];                                            \
        (skip) != NULL; (skip) = (skip)->next[0])
#define WT_EXT_FOREACH_OFF(skip, head)                                  \
    for ((skip) = (head)[0];                                            \
        (skip) != NULL; (skip) = (skip)->next[(skip)->depth])

/*
 * Checkpoint cookie: carries a version number so we don't have to rev the
 * schema file version should the default block manager checkpoint format
 * change.
 *
 * Version #1 checkpoint cookie format:
 *  [1] [root addr] [alloc addr] [avail addr] [discard addr]
 *      [file size] [checkpoint size] [write generation]
 */
#define WT_BM_CHECKPOINT_VERSION    1       /* Checkpoint format version */
#define WT_BLOCK_EXTLIST_MAGIC      71002   /* Identify a list */
struct __wt_block_ckpt {
    uint8_t version;                /* Version */

    wt_off_t root_offset;           /* The root */
    uint32_t root_checksum, root_size;

    WT_EXTLIST alloc;               /* Extents allocated */
    WT_EXTLIST avail;               /* Extents available */
    WT_EXTLIST discard;             /* Extents discarded */

    wt_off_t file_size;             /* Checkpoint file size */
    uint64_t ckpt_size;             /* Checkpoint byte count */

    WT_EXTLIST ckpt_avail;          /* Checkpoint free'd extents */

    /*
     * Checkpoint archive: the block manager may potentially free a lot of
     * memory from the allocation and discard extent lists when a checkpoint
     * completes. Put it off until the checkpoint resolves; that lets the
     * upper btree layer continue eviction sooner.
     */
    WT_EXTLIST ckpt_alloc;          /* Checkpoint archive */
    WT_EXTLIST ckpt_discard;        /* Checkpoint archive */
};

/*
 * WT_BM --
 *  Block manager handle, references a single checkpoint in a file.
 */
struct __wt_bm {
                                    /* Methods */
    int (*addr_invalid)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
    int (*addr_string)
        (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
    u_int (*block_header)(WT_BM *);
    int (*checkpoint)
        (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, WT_CKPT *, bool);
    int (*checkpoint_load)(WT_BM *, WT_SESSION_IMPL *,
        const uint8_t *, size_t, uint8_t *, size_t *, bool);
    int (*checkpoint_resolve)(WT_BM *, WT_SESSION_IMPL *, bool);
    int (*checkpoint_start)(WT_BM *, WT_SESSION_IMPL *);
    int (*checkpoint_unload)(WT_BM *, WT_SESSION_IMPL *);
    int (*close)(WT_BM *, WT_SESSION_IMPL *);
    int (*compact_end)(WT_BM *, WT_SESSION_IMPL *);
    int (*compact_page_skip)
        (WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t, bool *);
    int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, bool *);
    int (*compact_start)(WT_BM *, WT_SESSION_IMPL *);
    int (*corrupt)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
    int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
    bool (*is_mapped)(WT_BM *, WT_SESSION_IMPL *);
    int (*map_discard)(WT_BM *, WT_SESSION_IMPL *, void *, size_t);
    int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
    int (*read)
        (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
    int (*salvage_end)(WT_BM *, WT_SESSION_IMPL *);
    int (*salvage_next)
        (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t *, bool *);
    int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *);
    int (*salvage_valid)
        (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, bool);
    int (*size)(WT_BM *, WT_SESSION_IMPL *, wt_off_t *);
    int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats);
    int (*sync)(WT_BM *, WT_SESSION_IMPL *, bool);
    int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
    int (*verify_end)(WT_BM *, WT_SESSION_IMPL *);
    int (*verify_start)
        (WT_BM *, WT_SESSION_IMPL *, WT_CKPT *, const char *[]);
    int (*write)(WT_BM *,
        WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, bool, bool);
    int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *);

    WT_BLOCK *block;                /* Underlying file */

    void *map;                      /* Mapped region */
    size_t maplen;
    void *mapped_cookie;

    /*
     * There's only a single block manager handle that can be written; all
     * others are checkpoints.
     */
    bool is_live;                   /* The live system */
};
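/*
 * Example (illustrative sketch, not part of the interface): callers drive the
 * block manager through the WT_BM method table rather than calling block
 * functions directly. The helper name is hypothetical; "addr"/"addr_size"
 * stand for an address cookie taken from a parent page, and the sketch
 * assumes the usual WT_RET error-handling macro.
 */
static inline int
example_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
    wt_off_t size;

    /* Query the file size, then read the addressed block into buf. */
    WT_RET(bm->size(bm, session, &size));
    return (bm->read(bm, session, buf, addr, addr_size));
}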
/*
 * WT_BLOCK --
 *  Block manager handle, references a single file.
 */
struct __wt_block {
    const char *name;               /* Name */
    uint64_t name_hash;             /* Hash of name */

    /* A list of block manager handles, sharing a file descriptor. */
    uint32_t ref;                   /* References */
    TAILQ_ENTRY(__wt_block) q;      /* Linked list of handles */
    TAILQ_ENTRY(__wt_block) hashq;  /* Hashed list of handles */

    WT_FH *fh;                      /* Backing file handle */
    wt_off_t size;                  /* File size */
    wt_off_t extend_size;           /* File extended size */
    wt_off_t extend_len;            /* File extend chunk size */

    /* Configuration information, set when the file is opened. */
    uint32_t allocfirst;            /* Allocation is first-fit */
    uint32_t allocsize;             /* Allocation size */
    size_t os_cache;                /* System buffer cache flush max */
    size_t os_cache_max;
    size_t os_cache_dirty;          /* System buffer cache write max */
    size_t os_cache_dirty_max;

    u_int block_header;             /* Header length */

    /*
     * There is only a single checkpoint in a file that can be written. The
     * information could logically live in the WT_BM structure, but then we
     * would be re-creating it every time we opened a new checkpoint and I'd
     * rather not do that. So, it's stored here, only accessed by one WT_BM
     * handle.
     */
    WT_SPINLOCK live_lock;          /* Live checkpoint lock */
    WT_BLOCK_CKPT live;             /* Live checkpoint */
#ifdef HAVE_DIAGNOSTIC
    bool live_open;                 /* Live system is open */
#endif
                                    /* Live checkpoint status */
    enum {
        WT_CKPT_NONE=0, WT_CKPT_INPROGRESS,
        WT_CKPT_PANIC_ON_FAILURE, WT_CKPT_SALVAGE
    } ckpt_state;

                                    /* Compaction support */
    int compact_pct_tenths;         /* Percent to compact */
    uint64_t compact_pages_reviewed;/* Pages reviewed */
    uint64_t compact_pages_skipped; /* Pages skipped */
    uint64_t compact_pages_written; /* Pages rewritten */

                                    /* Salvage support */
    wt_off_t slvg_off;              /* Salvage file offset */

                                    /* Verification support */
    bool verify;                    /* If performing verification */
    bool verify_layout;             /* Print out file layout information */
    bool verify_strict;             /* Fail hard on any error */
    wt_off_t verify_size;           /* Checkpoint's file size */
    WT_EXTLIST verify_alloc;        /* Verification allocation list */
    uint64_t frags;                 /* Maximum frags in the file */
    uint8_t *fragfile;              /* Per-file frag tracking list */
    uint8_t *fragckpt;              /* Per-checkpoint frag tracking list */
};

/*
 * WT_BLOCK_DESC --
 *  The file's description.
 */
struct __wt_block_desc {
#define WT_BLOCK_MAGIC              120897
    uint32_t magic;                 /* 00-03: Magic number */
#define WT_BLOCK_MAJOR_VERSION      1
    uint16_t majorv;                /* 04-05: Major version */
#define WT_BLOCK_MINOR_VERSION      0
    uint16_t minorv;                /* 06-07: Minor version */

    uint32_t checksum;              /* 08-11: Description block checksum */

    uint32_t unused;                /* 12-15: Padding */
};

/*
 * WT_BLOCK_DESC_SIZE is the expected structure size -- we verify the build to
 * ensure the compiler hasn't inserted padding (padding won't cause failure,
 * since we reserve the first allocation-size block of the file for this
 * information, but it would be worth investigating, regardless).
 */
#define WT_BLOCK_DESC_SIZE          16
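/*
 * Example (illustrative sketch): one way the build-time size check described
 * above could be expressed. WiredTiger's actual verification lives elsewhere
 * in the tree; this only shows the idea, using the negative-array-size idiom
 * so a padded WT_BLOCK_DESC fails to compile.
 */
static inline void
example_block_desc_size_check(void)
{
    char size_check[sizeof(WT_BLOCK_DESC) == WT_BLOCK_DESC_SIZE ? 1 : -1];

    (void)size_check;
}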
/*
 * __wt_block_desc_byteswap --
 *  Handle big- and little-endian transformation of a description block.
 */
static inline void
__wt_block_desc_byteswap(WT_BLOCK_DESC *desc)
{
#ifdef WORDS_BIGENDIAN
    desc->magic = __wt_bswap32(desc->magic);
    desc->majorv = __wt_bswap16(desc->majorv);
    desc->minorv = __wt_bswap16(desc->minorv);
    desc->checksum = __wt_bswap32(desc->checksum);
#else
    WT_UNUSED(desc);
#endif
}

/*
 * WT_BLOCK_HEADER --
 *  Blocks have a common header, a WT_PAGE_HEADER structure followed by a
 * block-manager specific structure: WT_BLOCK_HEADER is WiredTiger's default.
 */
struct __wt_block_header {
    /*
     * We write the page size in the on-disk page header because it makes
     * salvage easier. (If we don't know the expected page length, we'd have
     * to read increasingly larger chunks from the file until we find one
     * that checksums, and that's going to be harsh given WiredTiger's
     * potentially large page sizes.)
     */
    uint32_t disk_size;             /* 00-03: on-disk page size */

    /*
     * Page checksums are stored in two places. First, the page checksum is
     * written within the internal page that references it as part of the
     * address cookie. This is done to improve the chances of detecting not
     * only disk corruption but other bugs (for example, overwriting a page
     * with another valid page image). Second, a page's checksum is stored in
     * the disk header. This is for salvage, so salvage knows it has found a
     * page that may be useful.
     */
    uint32_t checksum;              /* 04-07: checksum */

    /*
     * No automatic generation: flag values cannot change, they're written to
     * disk.
     */
#define WT_BLOCK_DATA_CKSUM 0x1u    /* Block data is part of the checksum */
    uint8_t flags;                  /* 08: flags */

    /*
     * End the structure with 3 bytes of padding: it wastes space, but it
     * leaves the structure 32-bit aligned and having a few bytes to play
     * with in the future can't hurt.
     */
    uint8_t unused[3];              /* 09-11: unused padding */
};

/*
 * WT_BLOCK_HEADER_SIZE is the number of bytes we allocate for the structure:
 * if the compiler inserts padding it will break the world.
 */
#define WT_BLOCK_HEADER_SIZE        12

/*
 * __wt_block_header_byteswap_copy --
 *  Handle big- and little-endian transformation of a header block, copying
 * from a source to a target.
 */
static inline void
__wt_block_header_byteswap_copy(WT_BLOCK_HEADER *from, WT_BLOCK_HEADER *to)
{
    *to = *from;
#ifdef WORDS_BIGENDIAN
    to->disk_size = __wt_bswap32(from->disk_size);
    to->checksum = __wt_bswap32(from->checksum);
#endif
}

/*
 * __wt_block_header_byteswap --
 *  Handle big- and little-endian transformation of a header block.
 */
static inline void
__wt_block_header_byteswap(WT_BLOCK_HEADER *blk)
{
#ifdef WORDS_BIGENDIAN
    __wt_block_header_byteswap_copy(blk, blk);
#else
    WT_UNUSED(blk);
#endif
}

/*
 * WT_BLOCK_HEADER_BYTE
 * WT_BLOCK_HEADER_BYTE_SIZE --
 *  The first usable data byte on the block (past the combined headers).
 */
#define WT_BLOCK_HEADER_BYTE_SIZE                                       \
    (WT_PAGE_HEADER_SIZE + WT_BLOCK_HEADER_SIZE)
#define WT_BLOCK_HEADER_BYTE(dsk)                                       \
    ((void *)((uint8_t *)(dsk) + WT_BLOCK_HEADER_BYTE_SIZE))
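/*
 * Example (illustrative sketch): how the combined headers lay out at the
 * start of a disk image. The helper names and the "dsk" parameter are
 * hypothetical; only the macros defined above and WT_PAGE_HEADER_SIZE are
 * assumed.
 */
static inline WT_BLOCK_HEADER *
example_block_header_ref(void *dsk)
{
    /* The block manager's header immediately follows the page header. */
    return ((WT_BLOCK_HEADER *)((uint8_t *)dsk + WT_PAGE_HEADER_SIZE));
}

static inline void *
example_payload_ref(void *dsk)
{
    /* The first usable payload byte follows both headers. */
    return (WT_BLOCK_HEADER_BYTE(dsk));
}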
/*
 * We don't compress or encrypt the block's WT_PAGE_HEADER or WT_BLOCK_HEADER
 * structures because we need both available with decompression or decryption.
 * We use the WT_BLOCK_HEADER checksum and on-disk size during salvage to
 * figure out where the blocks are, and we use the WT_PAGE_HEADER in-memory
 * size during decompression and decryption to know how large a target buffer
 * to allocate. We can only skip the header information when doing encryption,
 * but we skip the first 64B when doing compression; a 64B boundary may offer
 * better alignment for the underlying compression engine, and skipping 64B
 * shouldn't make any difference in terms of compression efficiency.
 */
#define WT_BLOCK_COMPRESS_SKIP  64
#define WT_BLOCK_ENCRYPT_SKIP   WT_BLOCK_HEADER_BYTE_SIZE

/*
 * __wt_block_header --
 *  Return the size of the block-specific header.
 */
static inline u_int
__wt_block_header(WT_BLOCK *block)
{
    WT_UNUSED(block);

    return ((u_int)WT_BLOCK_HEADER_SIZE);
}
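/*
 * Example (illustrative sketch): how the skip constants above divide a buffer
 * when handing it to a compressor. The helper and the compressor callback
 * type are hypothetical, no particular compression API is assumed, and the
 * sketch requires <string.h> for memcpy. The point is only that the first
 * WT_BLOCK_COMPRESS_SKIP bytes are stored as-is and everything after them is
 * compressed.
 */
typedef int (*example_compress_fn)(uint8_t *dst, size_t dst_len,
    const uint8_t *src, size_t src_len, size_t *result_lenp);

static inline int
example_compress_block(example_compress_fn compress,
    uint8_t *dst, size_t dst_len,
    const uint8_t *src, size_t src_len, size_t *result_lenp)
{
    size_t compressed_len;

    /* The headers (the first 64B of the image) are copied unmodified. */
    memcpy(dst, src, WT_BLOCK_COMPRESS_SKIP);

    /* Everything past the 64B boundary goes through the compressor. */
    if (compress(dst + WT_BLOCK_COMPRESS_SKIP, dst_len - WT_BLOCK_COMPRESS_SKIP,
        src + WT_BLOCK_COMPRESS_SKIP, src_len - WT_BLOCK_COMPRESS_SKIP,
        &compressed_len) != 0)
        return (-1);

    *result_lenp = WT_BLOCK_COMPRESS_SKIP + compressed_len;
    return (0);
}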