diff options
author | Keith Bostic <keith.bostic@wiredtiger.com> | 2012-01-01 11:38:39 +0000 |
---|---|---|
committer | Keith Bostic <keith.bostic@wiredtiger.com> | 2012-01-01 11:38:39 +0000 |
commit | b15486d66c545620088416a244d9fb9692f53cdb (patch) | |
tree | 64660fda17b6d823a7b34edf4eb1deb809a8c084 /src/include | |
parent | 8d63543bdabd2db901ffab84a3258c797cc4702f (diff) | |
download | mongo-b15486d66c545620088416a244d9fb9692f53cdb.tar.gz |
Split the WT_PAGE_DISK structure into two parts: the btree page header
(WT_PAGE_HEADER) and the block-manager's header (WT_BLOCK_HEADER).
Diffstat (limited to 'src/include')
-rw-r--r-- | src/include/block.h | 132 | ||||
-rw-r--r-- | src/include/btmem.h | 81 | ||||
-rw-r--r-- | src/include/btree.h | 21 | ||||
-rw-r--r-- | src/include/cell.i | 5 | ||||
-rw-r--r-- | src/include/extern.h | 20 | ||||
-rw-r--r-- | src/include/verify_build.h | 16 | ||||
-rw-r--r-- | src/include/wt_internal.in | 6 |
7 files changed, 152 insertions, 129 deletions
diff --git a/src/include/block.h b/src/include/block.h index 0fca1e014da..2fa3c75ec77 100644 --- a/src/include/block.h +++ b/src/include/block.h @@ -38,7 +38,7 @@ struct __wt_block { WT_FH *fh; /* Backing file handle */ - uint64_t lsn; /* LSN file/offset pair */ + uint64_t write_gen; /* Write generation */ uint32_t allocsize; /* Allocation size */ int checksum; /* If checksums configured */ @@ -90,13 +90,13 @@ struct __wt_block_desc { uint32_t free_cksum; /* 28-31: Free list page checksum */ /* - * We maintain page LSN's for the file in the non-transactional case - * (where, instead of a log reference, the LSN is simply a counter), + * We maintain page write-generations in the non-transactional case + * (where, instead of a transactional LSN, the value is a counter), * as that's how salvage can determine the most recent page between - * pages overlapping the same key range. This non-transactional LSN - * has to be persistent, and so it's included in the file's metadata. + * pages overlapping the same key range. The value has to persist, + * so it's included in the file's metadata. */ - uint64_t lsn; /* 32-39: Non-transactional page LSN */ + uint64_t write_gen; /* 32-39: Write generation */ }; /* * WT_BLOCK_DESC_SIZE is the expected structure size -- we verify the build to @@ -107,103 +107,69 @@ struct __wt_block_desc { #define WT_BLOCK_DESC_SIZE 40 /* - * Don't compress the first 32B of the block (almost all of the WT_PAGE_DISK - * structure) because we need the block's checksum and on-disk and in-memory - * sizes to be immediately available without decompression (the checksum and - * the on-disk block sizes are used during salvage to figure out where the - * blocks are, and the in-memory page size tells us how large a buffer we need - * to decompress the file block. We could take less than 32B, but a 32B - * boundary is probably better alignment for the underlying compression engine, - * and skipping 32B won't matter in terms of compression efficiency. 
+ * WT_BLOCK_HEADER -- + * Blocks have a common header, a WT_PAGE_HEADER structure followed by a + * block-manager specific structure: WT_BLOCK_HEADER is WiredTiger's default. */ -#define WT_BLOCK_COMPRESS_SKIP 32 - -/* - * WT_PAGE_DISK -- - * - * All on-disk pages have a common header, defined by the WT_PAGE_DISK - * structure. The header has no version number or mode bits, and the page type - * and/or flags value will have to be modified when changes are made to the page - * layout. (The page type appears early in the header to make this simpler.) - * In other words, the page type declares the contents of the page and how to - * read it. - */ -struct __wt_page_disk { +struct __wt_block_header { /* - * The record number of the first record of the page is stored on disk - * so we can figure out where the column-store leaf page fits into the - * key space during salvage. + * We maintain page write-generations in the non-transactional case + * (where, instead of a transactional LSN, the value is a counter), + * as that's how salvage can determine the most recent page between + * pages overlapping the same key range. + * + * !!! + * The write-generation is "owned" by the btree layer, but it's easier + * to set it (when physically writing blocks), to persist it (in the + * WT_BLOCK_DESC structure, rather than the schema file), and restore + * it during salvage, in the block-manager layer. */ - uint64_t recno; /* 00-07: column-store starting recno */ + uint64_t write_gen; /* 00-07: write generation */ /* - * The LSN is a 64-bit chunk to make assignment and comparisons easier, - * but it's 2 32-bit values underneath: a file number and a file offset. + * We write the page size in the on-disk page header because it makes + * salvage easier. (If we don't know the expected page length, we'd + * have to read increasingly larger chunks from the file until we find + * one that checksums, and that's going to be harsh given WiredTiger's + * potentially large page sizes.) 
*/ -#define WT_LSN_FILE(lsn) \ - ((uint32_t)(((lsn) & 0xffffffff00000000ULL) >> 32)) -#define WT_LSN_OFFSET(lsn) \ - ((uint32_t)((lsn) & 0xffffffff)) -#define WT_LSN_INCR(lsn) \ - (++(lsn)) - uint64_t lsn; /* 08-15: LSN file/offset pair */ + uint32_t size; /* 08-11: on-disk page size */ /* * Page checksums are stored in two places. First, a page's checksum is - * stored in the tree page that references a page as part of the address + * stored in the internal page that references a page as part of the address * cookie. This is done to ensure we detect corruption (as storing the * checksum in the on-disk page implies a 1 in 2^32 chance corruption of * the page will result in a valid checksum). Second, a page's checksum * is stored in the disk header. This is for salvage, so that salvage - * knows when it's found a page that has some chance of being useful. - * This isn't risky because the complete address cookie in the reference - * page is compared before we connect the two pages back together. - */ - uint32_t cksum; /* 16-19: checksum */ - - /* - * We write the page size in the on-disk page header because it makes - * salvage easier. (If we don't know the expected page length, we'd - * have to read increasingly larger chunks from the file until we find - * one that checksums, and that's going to be harsh given WiredTiger's - * large page sizes.) - * - * We also store an in-memory size because otherwise we'd have no idea - * how much memory to allocate in order to expand a compressed page. 
- */ - uint32_t size; /* 20-23: on-disk page size */ - uint32_t memsize; /* 24-27: in-memory page size */ - - union { - uint32_t entries; /* 28-31: number of cells on page */ - uint32_t datalen; /* 28-31: overflow data length */ - } u; - - uint8_t type; /* 32: page type */ - - /* - * End the the WT_PAGE_DISK structure with 3 bytes of padding: it wastes - * space, but it leaves the WT_PAGE_DISK structure 32-bit aligned and - * having a small amount of space to play with in the future can't hurt. + * knows when it's found a page that may be useful. */ - uint8_t unused[3]; /* 33-35: unused padding */ + uint32_t cksum; /* 12-15: checksum */ }; /* - * WT_PAGE_DISK_SIZE is the expected structure size -- we verify the build to - * ensure the compiler hasn't inserted padding (which would break the world). + * WT_BLOCK_HEADER_SIZE is the number of bytes we allocate for the structure: if + * the compiler inserts padding it will break the world. */ -#define WT_PAGE_DISK_SIZE 36 +#define WT_BLOCK_HEADER_SIZE 16 /* - * WT_DISK_REQUIRED-- - * Return bytes needed for byte length, rounded to an allocation unit. + * WT_BLOCK_HEADER_BYTE + * WT_BLOCK_HEADER_BYTE_SIZE -- + * The first usable data byte on the block (past the combined headers). */ -#define WT_DISK_REQUIRED(block, size) \ - (WT_ALIGN((size) + WT_PAGE_DISK_SIZE, ((WT_BLOCK *)(block))->allocsize)) +#define WT_BLOCK_HEADER_BYTE_SIZE \ + (WT_PAGE_HEADER_SIZE + WT_BLOCK_HEADER_SIZE) +#define WT_BLOCK_HEADER_BYTE(dsk) \ + ((void *)((uint8_t *)(dsk) + WT_BLOCK_HEADER_BYTE_SIZE)) /* - * WT_PAGE_DISK_BYTE -- - * The first usable data byte on the page (past the header). + * Don't compress the block's WT_PAGE_HEADER and WT_BLOCK_HEADER structures. + * We need the WT_PAGE_HEADER in-memory size, and the WT_BLOCK_HEADER checksum + * and on-disk size to be immediately available without decompression. 
We use + * the on-disk size and checksum during salvage to figure out where the blocks + * are, and the in-memory size tells us how large a buffer we need to decompress + * the block. We could skip less than 64B, but a 64B boundary may offer better + * alignment for the underlying compression engine, and skipping 64B won't make + * a difference in terms of compression efficiency. */ -#define WT_PAGE_DISK_BYTE(dsk) \ - ((void *)((uint8_t *)(dsk) + WT_PAGE_DISK_SIZE)) +#define WT_BLOCK_COMPRESS_SKIP 64 diff --git a/src/include/btmem.h b/src/include/btmem.h index 421d5ed0732..f19cad86330 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -6,6 +6,59 @@ */ /* + * WT_PAGE_HEADER -- + * Blocks have a common header, a WT_PAGE_HEADER structure followed by a + * block-manager specific structure. + */ +struct __wt_page_header { + /* + * The record number of the first record of the page is stored on disk + * so we can figure out where the column-store leaf page fits into the + * key space during salvage. + */ + uint64_t recno; /* 00-07: column-store starting recno */ + + /* The in-memory size of the block. */ + uint32_t memsize; /* 08-11: in-memory page size */ + + union { + uint32_t entries; /* 12-15: number of cells on page */ + uint32_t datalen; /* 12-15: overflow data length */ + } u; + + uint8_t type; /* 16: page type */ + + /* + * End the WT_PAGE_HEADER structure with 3 bytes of padding: it wastes + * space, but it leaves the WT_PAGE_HEADER structure 32-bit aligned and + * having a small amount of space to play with in the future can't hurt. + */ + uint8_t unused[3]; /* 17-19: unused padding */ +}; +/* + * WT_PAGE_HEADER_SIZE is the number of bytes we allocate for the structure: if + * the compiler inserts padding it will break the world. + */ +#define WT_PAGE_HEADER_SIZE 20 + +/* + * The block-manager specific information immediately follows the WT_PAGE_HEADER + * structure. 
+ */ +#define WT_BLOCK_HEADER_REF(dsk) \ + ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_SIZE)) + +/* + * WT_PAGE_HEADER_BYTE -- + * WT_PAGE_HEADER_BYTE_SIZE -- + * The first usable data byte on the block (past the combined headers). + */ +#define WT_PAGE_HEADER_BYTE_SIZE(btree) \ + ((u_int)(WT_PAGE_HEADER_SIZE + (btree)->block_header)) +#define WT_PAGE_HEADER_BYTE(btree, dsk) \ + ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_BYTE_SIZE(btree))) + +/* * WT_ADDR -- * A block location. */ @@ -190,7 +243,7 @@ struct __wt_page { } u; /* Page's on-disk representation: NULL for pages created in memory. */ - WT_PAGE_DISK *dsk; + WT_PAGE_HEADER *dsk; /* If/when the page is modified, we need lots more information. */ WT_PAGE_MODIFY *modify; @@ -256,6 +309,15 @@ struct __wt_page { WT_PAGE_REC_REPLACE | WT_PAGE_REC_SPLIT | WT_PAGE_REC_SPLIT_MERGE) /* + * WT_PAGE_DISK_OFFSET, WT_PAGE_REF_OFFSET -- + * Return the offset/pointer of a pointer/offset in a page disk image. + */ +#define WT_PAGE_DISK_OFFSET(page, p) \ + WT_PTRDIFF32(p, (page)->dsk) +#define WT_PAGE_REF_OFFSET(page, o) \ + ((void *)((uint8_t *)((page)->dsk) + (o))) + +/* * WT_REF -- * A single in-memory page and the state information used to determine if it's * OK to dereference the pointer to the page. @@ -435,7 +497,7 @@ struct __wt_col_rle { * exist on the page, return a NULL. */ #define WT_COL_PTR(page, cip) \ - ((cip)->__value == 0 ? NULL : WT_REF_OFFSET(page, (cip)->__value)) + ((cip)->__value == 0 ? NULL : WT_PAGE_REF_OFFSET(page, (cip)->__value)) /* * WT_COL_FOREACH -- @@ -642,15 +704,8 @@ struct __wt_insert_head { #define WT_FIX_FOREACH(btree, dsk, v, i) \ for ((i) = 0, \ (v) = (i) < (dsk)->u.entries ? 
\ - __bit_getv(WT_PAGE_DISK_BYTE(dsk), 0, (btree)->bitcnt) : 0; \ + __bit_getv( \ + WT_PAGE_HEADER_BYTE(btree, dsk), 0, (btree)->bitcnt) : 0; \ (i) < (dsk)->u.entries; ++(i), \ - (v) = __bit_getv(WT_PAGE_DISK_BYTE(dsk), i, (btree)->bitcnt)) - -/* - * WT_DISK_OFFSET, WT_REF_OFFSET -- - * Return the offset/pointer of a pointer/offset in a page disk image. - */ -#define WT_DISK_OFFSET(dsk, p) \ - WT_PTRDIFF32(p, dsk) -#define WT_REF_OFFSET(page, o) \ - ((void *)((uint8_t *)((page)->dsk) + (o))) + (v) = __bit_getv( \ + WT_PAGE_HEADER_BYTE(btree, dsk), i, (btree)->bitcnt)) diff --git a/src/include/btree.h b/src/include/btree.h index 105833f8035..9fa57432041 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -44,16 +44,6 @@ struct __wt_btree { void *huffman_key; /* Key huffman encoding */ void *huffman_value; /* Value huffman encoding */ - void *block; /* Block manager */ - - WT_PAGE *root_page; /* Root page */ - WT_ADDR root_addr; /* Replacement root address */ - int root_update; /* 0: free original root blocks - 1: free saved root blocks and - update on close */ - - WT_PAGE *evict_page; /* Eviction thread's location */ - /* * Column-store: track the last record in the file, and keep the last * page pinned in memory for fast appends, to a skiplist of appended @@ -63,6 +53,17 @@ struct __wt_btree { uint64_t last_recno; /* Col-store append, last recno */ WT_INSERT_HEAD **append; /* Appended items */ + WT_PAGE *root_page; /* Root page */ + WT_ADDR root_addr; /* Replacement root address */ + int root_update; /* 0: free original root blocks + 1: free saved root blocks and + update on close */ + + void *block; /* Block manager */ + u_int block_header; /* Block manager header length */ + + WT_PAGE *evict_page; /* Eviction thread's location */ + WT_BTREE_STATS *stats; /* Btree statistics */ #define WT_BTREE_BULK 0x01 /* Bulk-load handle */ diff --git a/src/include/cell.i b/src/include/cell.i index 3789ebb117d..df29e195540 100644 --- a/src/include/cell.i +++ 
b/src/include/cell.i @@ -111,8 +111,9 @@ struct __wt_cell_unpack { * WT_CELL_FOREACH -- * Walk the cells on a page. */ -#define WT_CELL_FOREACH(dsk, cell, unpack, i) \ - for ((cell) = WT_PAGE_DISK_BYTE(dsk), (i) = (dsk)->u.entries; \ +#define WT_CELL_FOREACH(btree, dsk, cell, unpack, i) \ + for ((cell) = \ + WT_PAGE_HEADER_BYTE(btree, dsk), (i) = (dsk)->u.entries; \ (i) > 0; \ (cell) = (WT_CELL *)((uint8_t *)(cell) + (unpack)->len), --(i)) diff --git a/src/include/extern.h b/src/include/extern.h index d278945763e..1328a88ffb7 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -64,6 +64,8 @@ extern int __wt_bm_read(WT_SESSION_IMPL *session, WT_BUF *buf, const uint8_t *addr, uint32_t addr_size); +extern int __wt_bm_block_header(WT_SESSION_IMPL *session, uint32_t *headerp); +extern int __wt_bm_write_size(WT_SESSION_IMPL *session, uint32_t *sizep); extern int __wt_bm_write( WT_SESSION_IMPL *session, WT_BUF *buf, uint8_t *addr, @@ -74,6 +76,7 @@ extern int __wt_bm_salvage_next(WT_SESSION_IMPL *session, WT_BUF *buf, uint8_t *addr, uint32_t *addr_sizep, + uint64_t *write_genp, int *eofp); extern int __wt_bm_salvage_end(WT_SESSION_IMPL *session, int success); extern int __wt_bm_verify_start(WT_SESSION_IMPL *session, int *emptyp); @@ -105,11 +108,12 @@ extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block); extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block, int success); -extern int __wt_block_salvage_next(WT_SESSION_IMPL *session, +extern int __wt_block_salvage_next( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BUF *buf, uint8_t *addr, uint32_t *addr_sizep, + uint64_t *write_genp, int *eofp); extern int __wt_block_verify_start(WT_SESSION_IMPL *session, WT_BLOCK *block, @@ -119,6 +123,12 @@ extern int __wt_block_verify_addr(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, uint32_t addr_size); +extern int __wt_block_header(WT_SESSION_IMPL *session, + WT_BLOCK *block, + uint32_t *headerp); +extern 
int __wt_block_write_size( WT_SESSION_IMPL *session, + WT_BLOCK *block, + uint32_t *sizep); extern int __wt_block_write_buf(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BUF *buf, @@ -157,8 +167,8 @@ extern int __wt_debug_addr( WT_SESSION_IMPL *session, uint32_t addr, uint32_t size, const char *ofile); -extern int __wt_debug_disk(WT_SESSION_IMPL *session, - WT_PAGE_DISK *dsk, +extern int __wt_debug_disk( WT_SESSION_IMPL *session, + WT_PAGE_HEADER *dsk, const char *ofile); extern int __wt_debug_tree_all(WT_SESSION_IMPL *session, WT_PAGE *page, @@ -214,7 +224,7 @@ __wt_page_in_func( extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *parent_ref, - WT_PAGE_DISK *dsk, + WT_PAGE_HEADER *dsk, WT_PAGE **pagep); extern int __wt_page_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_cache_read(WT_SESSION_IMPL *session, @@ -235,7 +245,7 @@ extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_dumpfile(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, - WT_PAGE_DISK *dsk, + WT_PAGE_HEADER *dsk, uint32_t size); extern int __wt_tree_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, diff --git a/src/include/verify_build.h b/src/include/verify_build.h index d0c57c10f3e..9df9b4c7b23 100644 --- a/src/include/verify_build.h +++ b/src/include/verify_build.h @@ -43,22 +43,10 @@ static inline void __wt_verify_build(void) { - /* - * The compiler had better not have padded our structures -- make sure - * the page header structure is exactly what we expect. - */ + /* On-disk structures should not be padded. */ SIZE_CHECK(WT_BLOCK_DESC, WT_BLOCK_DESC_SIZE); - /* - * The page header is special: the compiler will pad it to a multiple - * of 8 bytes because it has 64-bit fields that need alignment. We - * use WT_PAGE_DISK_SIZE everywhere instead of sizeof to avoid writing - * 4 extra bytes to the file. 
- */ - SIZE_CHECK(WT_PAGE_DISK, WT_ALIGN(WT_PAGE_DISK_SIZE, sizeof(void *))); - - /* There are also structures that must be aligned correctly. */ - ALIGN_CHECK(WT_PAGE_DISK, sizeof(uint32_t)); + /* Some structures must be aligned correctly. */ ALIGN_CHECK(WT_SESSION_BUFFER, sizeof(uint32_t)); /* diff --git a/src/include/wt_internal.in b/src/include/wt_internal.in index 407e40c922c..df9b817660c 100644 --- a/src/include/wt_internal.in +++ b/src/include/wt_internal.in @@ -51,6 +51,8 @@ struct __wt_block; typedef struct __wt_block WT_BLOCK; struct __wt_block_desc; typedef struct __wt_block_desc WT_BLOCK_DESC; +struct __wt_block_header; + typedef struct __wt_block_header WT_BLOCK_HEADER; struct __wt_btree; typedef struct __wt_btree WT_BTREE; struct __wt_btree_session; @@ -115,8 +117,8 @@ struct __wt_named_compressor; typedef struct __wt_named_compressor WT_NAMED_COMPRESSOR; struct __wt_page; typedef struct __wt_page WT_PAGE; -struct __wt_page_disk; - typedef struct __wt_page_disk WT_PAGE_DISK; +struct __wt_page_header; + typedef struct __wt_page_header WT_PAGE_HEADER; struct __wt_page_modify; typedef struct __wt_page_modify WT_PAGE_MODIFY; struct __wt_page_track; |