summaryrefslogtreecommitdiff
path: root/src/include
diff options
context:
space:
mode:
authorKeith Bostic <keith.bostic@wiredtiger.com>2012-01-01 11:38:39 +0000
committerKeith Bostic <keith.bostic@wiredtiger.com>2012-01-01 11:38:39 +0000
commitb15486d66c545620088416a244d9fb9692f53cdb (patch)
tree64660fda17b6d823a7b34edf4eb1deb809a8c084 /src/include
parent8d63543bdabd2db901ffab84a3258c797cc4702f (diff)
downloadmongo-b15486d66c545620088416a244d9fb9692f53cdb.tar.gz
Split the WT_PAGE_DISK structure into two parts: the btree page header
(WT_PAGE_HEADER) and the block-manager's header (WT_BLOCK_HEADER).
Diffstat (limited to 'src/include')
-rw-r--r--src/include/block.h132
-rw-r--r--src/include/btmem.h81
-rw-r--r--src/include/btree.h21
-rw-r--r--src/include/cell.i5
-rw-r--r--src/include/extern.h20
-rw-r--r--src/include/verify_build.h16
-rw-r--r--src/include/wt_internal.in6
7 files changed, 152 insertions, 129 deletions
diff --git a/src/include/block.h b/src/include/block.h
index 0fca1e014da..2fa3c75ec77 100644
--- a/src/include/block.h
+++ b/src/include/block.h
@@ -38,7 +38,7 @@ struct __wt_block {
WT_FH *fh; /* Backing file handle */
- uint64_t lsn; /* LSN file/offset pair */
+ uint64_t write_gen; /* Write generation */
uint32_t allocsize; /* Allocation size */
int checksum; /* If checksums configured */
@@ -90,13 +90,13 @@ struct __wt_block_desc {
uint32_t free_cksum; /* 28-31: Free list page checksum */
/*
- * We maintain page LSN's for the file in the non-transactional case
- * (where, instead of a log reference, the LSN is simply a counter),
+ * We maintain page write-generations in the non-transactional case
+ * (where, instead of a transactional LSN, the value is a counter),
* as that's how salvage can determine the most recent page between
- * pages overlapping the same key range. This non-transactional LSN
- * has to be persistent, and so it's included in the file's metadata.
+ * pages overlapping the same key range. The value has to persist,
+ * so it's included in the file's metadata.
*/
- uint64_t lsn; /* 32-39: Non-transactional page LSN */
+ uint64_t write_gen; /* 32-39: Write generation */
};
/*
* WT_BLOCK_DESC_SIZE is the expected structure size -- we verify the build to
@@ -107,103 +107,69 @@ struct __wt_block_desc {
#define WT_BLOCK_DESC_SIZE 40
/*
- * Don't compress the first 32B of the block (almost all of the WT_PAGE_DISK
- * structure) because we need the block's checksum and on-disk and in-memory
- * sizes to be immediately available without decompression (the checksum and
- * the on-disk block sizes are used during salvage to figure out where the
- * blocks are, and the in-memory page size tells us how large a buffer we need
- * to decompress the file block. We could take less than 32B, but a 32B
- * boundary is probably better alignment for the underlying compression engine,
- * and skipping 32B won't matter in terms of compression efficiency.
+ * WT_BLOCK_HEADER --
+ * Blocks have a common header, a WT_PAGE_HEADER structure followed by a
+ * block-manager specific structure: WT_BLOCK_HEADER is WiredTiger's default.
*/
-#define WT_BLOCK_COMPRESS_SKIP 32
-
-/*
- * WT_PAGE_DISK --
- *
- * All on-disk pages have a common header, defined by the WT_PAGE_DISK
- * structure. The header has no version number or mode bits, and the page type
- * and/or flags value will have to be modified when changes are made to the page
- * layout. (The page type appears early in the header to make this simpler.)
- * In other words, the page type declares the contents of the page and how to
- * read it.
- */
-struct __wt_page_disk {
+struct __wt_block_header {
/*
- * The record number of the first record of the page is stored on disk
- * so we can figure out where the column-store leaf page fits into the
- * key space during salvage.
+ * We maintain page write-generations in the non-transactional case
+ * (where, instead of a transactional LSN, the value is a counter),
+ * as that's how salvage can determine the most recent page between
+ * pages overlapping the same key range.
+ *
+ * !!!
+ * The write-generation is "owned" by the btree layer, but it's easier
+ * to set it (when physically writing blocks), to persist it (in the
+ * WT_BLOCK_DESC structure, rather than the schema file), and restore
+ * it during salvage, in the block-manager layer.
*/
- uint64_t recno; /* 00-07: column-store starting recno */
+ uint64_t write_gen; /* 00-07: write generation */
/*
- * The LSN is a 64-bit chunk to make assignment and comparisons easier,
- * but it's 2 32-bit values underneath: a file number and a file offset.
+ * We write the page size in the on-disk page header because it makes
+ * salvage easier. (If we don't know the expected page length, we'd
+ * have to read increasingly larger chunks from the file until we find
+ * one that checksums, and that's going to be harsh given WiredTiger's
+ * potentially large page sizes.)
*/
-#define WT_LSN_FILE(lsn) \
- ((uint32_t)(((lsn) & 0xffffffff00000000ULL) >> 32))
-#define WT_LSN_OFFSET(lsn) \
- ((uint32_t)((lsn) & 0xffffffff))
-#define WT_LSN_INCR(lsn) \
- (++(lsn))
- uint64_t lsn; /* 08-15: LSN file/offset pair */
+ uint32_t size; /* 08-11: on-disk page size */
/*
* Page checksums are stored in two places. First, a page's checksum is
- * stored in the tree page that references a page as part of the address
+ * in the internal page that references a page as part of the address
* cookie. This is done to ensure we detect corruption, as storing the
* checksum in the on-disk page implies a 1 in 2^32 chance corruption of
* the page will result in a valid checksum). Second, a page's checksum
* is stored in the disk header. This is for salvage, so that salvage
- * knows when it's found a page that has some chance of being useful.
- * This isn't risky because the complete address cookie in the reference
- * page is compared before we connect the two pages back together.
- */
- uint32_t cksum; /* 16-19: checksum */
-
- /*
- * We write the page size in the on-disk page header because it makes
- * salvage easier. (If we don't know the expected page length, we'd
- * have to read increasingly larger chunks from the file until we find
- * one that checksums, and that's going to be harsh given WiredTiger's
- * large page sizes.)
- *
- * We also store an in-memory size because otherwise we'd have no idea
- * how much memory to allocate in order to expand a compressed page.
- */
- uint32_t size; /* 20-23: on-disk page size */
- uint32_t memsize; /* 24-27: in-memory page size */
-
- union {
- uint32_t entries; /* 28-31: number of cells on page */
- uint32_t datalen; /* 28-31: overflow data length */
- } u;
-
- uint8_t type; /* 32: page type */
-
- /*
- * End the the WT_PAGE_DISK structure with 3 bytes of padding: it wastes
- * space, but it leaves the WT_PAGE_DISK structure 32-bit aligned and
- * having a small amount of space to play with in the future can't hurt.
+ * knows when it's found a page that may be useful.
*/
- uint8_t unused[3]; /* 33-35: unused padding */
+ uint32_t cksum; /* 12-15: checksum */
};
/*
- * WT_PAGE_DISK_SIZE is the expected structure size -- we verify the build to
- * ensure the compiler hasn't inserted padding (which would break the world).
+ * WT_BLOCK_HEADER_SIZE is the number of bytes we allocate for the structure: if
+ * the compiler inserts padding it will break the world.
*/
-#define WT_PAGE_DISK_SIZE 36
+#define WT_BLOCK_HEADER_SIZE 16
/*
- * WT_DISK_REQUIRED--
- * Return bytes needed for byte length, rounded to an allocation unit.
+ * WT_BLOCK_HEADER_BYTE --
+ * WT_BLOCK_HEADER_BYTE_SIZE --
+ * The first usable data byte on the block (past the combined headers).
*/
-#define WT_DISK_REQUIRED(block, size) \
- (WT_ALIGN((size) + WT_PAGE_DISK_SIZE, ((WT_BLOCK *)(block))->allocsize))
+#define WT_BLOCK_HEADER_BYTE_SIZE \
+ (WT_PAGE_HEADER_SIZE + WT_BLOCK_HEADER_SIZE)
+#define WT_BLOCK_HEADER_BYTE(dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_BLOCK_HEADER_BYTE_SIZE))
/*
- * WT_PAGE_DISK_BYTE --
- * The first usable data byte on the page (past the header).
+ * Don't compress the block's WT_PAGE_HEADER and WT_BLOCK_HEADER structures.
+ * We need the WT_PAGE_HEADER in-memory size, and the WT_BLOCK_HEADER checksum
+ * and on-disk size to be immediately available without decompression. We use
+ * the on-disk size and checksum during salvage to figure out where the blocks
+ * are, and the in-memory size tells us how large a buffer we need to decompress
+ * the block. We could skip less than 64B, but a 64B boundary may offer better
+ * alignment for the underlying compression engine, and skipping 64B won't make
+ * a difference in terms of compression efficiency.
*/
-#define WT_PAGE_DISK_BYTE(dsk) \
- ((void *)((uint8_t *)(dsk) + WT_PAGE_DISK_SIZE))
+#define WT_BLOCK_COMPRESS_SKIP 64
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 421d5ed0732..f19cad86330 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -6,6 +6,59 @@
*/
/*
+ * WT_PAGE_HEADER --
+ * Blocks have a common header, a WT_PAGE_HEADER structure followed by a
+ * block-manager specific structure.
+ */
+struct __wt_page_header {
+ /*
+ * The record number of the first record of the page is stored on disk
+ * so we can figure out where the column-store leaf page fits into the
+ * key space during salvage.
+ */
+ uint64_t recno; /* 00-07: column-store starting recno */
+
+ /* The in-memory size of the block. */
+ uint32_t memsize; /* 08-11: in-memory page size */
+
+ union {
+ uint32_t entries; /* 12-15: number of cells on page */
+ uint32_t datalen; /* 12-15: overflow data length */
+ } u;
+
+ uint8_t type; /* 16: page type */
+
+ /*
+ * End the WT_PAGE_HEADER structure with 3 bytes of padding: it wastes
+ * space, but it leaves the WT_PAGE_HEADER structure 32-bit aligned and
+ * having a small amount of space to play with in the future can't hurt.
+ */
+ uint8_t unused[3]; /* 17-19: unused padding */
+};
+/*
+ * WT_PAGE_HEADER_SIZE is the number of bytes we allocate for the structure: if
+ * the compiler inserts padding it will break the world.
+ */
+#define WT_PAGE_HEADER_SIZE 20
+
+/*
+ * The block-manager specific information immediately follows the
+ * WT_PAGE_HEADER structure.
+ */
+#define WT_BLOCK_HEADER_REF(dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_SIZE))
+
+/*
+ * WT_PAGE_HEADER_BYTE --
+ * WT_PAGE_HEADER_BYTE_SIZE --
+ * The first usable data byte on the block (past the combined headers).
+ */
+#define WT_PAGE_HEADER_BYTE_SIZE(btree) \
+ ((u_int)(WT_PAGE_HEADER_SIZE + (btree)->block_header))
+#define WT_PAGE_HEADER_BYTE(btree, dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_BYTE_SIZE(btree)))
+
+/*
* WT_ADDR --
* A block location.
*/
@@ -190,7 +243,7 @@ struct __wt_page {
} u;
/* Page's on-disk representation: NULL for pages created in memory. */
- WT_PAGE_DISK *dsk;
+ WT_PAGE_HEADER *dsk;
/* If/when the page is modified, we need lots more information. */
WT_PAGE_MODIFY *modify;
@@ -256,6 +309,15 @@ struct __wt_page {
WT_PAGE_REC_REPLACE | WT_PAGE_REC_SPLIT | WT_PAGE_REC_SPLIT_MERGE)
/*
+ * WT_PAGE_DISK_OFFSET, WT_PAGE_REF_OFFSET --
+ * Return the offset/pointer of a pointer/offset in a page disk image.
+ */
+#define WT_PAGE_DISK_OFFSET(page, p) \
+ WT_PTRDIFF32(p, (page)->dsk)
+#define WT_PAGE_REF_OFFSET(page, o) \
+ ((void *)((uint8_t *)((page)->dsk) + (o)))
+
+/*
* WT_REF --
* A single in-memory page and the state information used to determine if it's
* OK to dereference the pointer to the page.
@@ -435,7 +497,7 @@ struct __wt_col_rle {
* exist on the page, return a NULL.
*/
#define WT_COL_PTR(page, cip) \
- ((cip)->__value == 0 ? NULL : WT_REF_OFFSET(page, (cip)->__value))
+ ((cip)->__value == 0 ? NULL : WT_PAGE_REF_OFFSET(page, (cip)->__value))
/*
* WT_COL_FOREACH --
@@ -642,15 +704,8 @@ struct __wt_insert_head {
#define WT_FIX_FOREACH(btree, dsk, v, i) \
for ((i) = 0, \
(v) = (i) < (dsk)->u.entries ? \
- __bit_getv(WT_PAGE_DISK_BYTE(dsk), 0, (btree)->bitcnt) : 0; \
+ __bit_getv( \
+ WT_PAGE_HEADER_BYTE(btree, dsk), 0, (btree)->bitcnt) : 0; \
(i) < (dsk)->u.entries; ++(i), \
- (v) = __bit_getv(WT_PAGE_DISK_BYTE(dsk), i, (btree)->bitcnt))
-
-/*
- * WT_DISK_OFFSET, WT_REF_OFFSET --
- * Return the offset/pointer of a pointer/offset in a page disk image.
- */
-#define WT_DISK_OFFSET(dsk, p) \
- WT_PTRDIFF32(p, dsk)
-#define WT_REF_OFFSET(page, o) \
- ((void *)((uint8_t *)((page)->dsk) + (o)))
+ (v) = __bit_getv( \
+ WT_PAGE_HEADER_BYTE(btree, dsk), i, (btree)->bitcnt))
diff --git a/src/include/btree.h b/src/include/btree.h
index 105833f8035..9fa57432041 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -44,16 +44,6 @@ struct __wt_btree {
void *huffman_key; /* Key huffman encoding */
void *huffman_value; /* Value huffman encoding */
- void *block; /* Block manager */
-
- WT_PAGE *root_page; /* Root page */
- WT_ADDR root_addr; /* Replacement root address */
- int root_update; /* 0: free original root blocks
- 1: free saved root blocks and
- update on close */
-
- WT_PAGE *evict_page; /* Eviction thread's location */
-
/*
* Column-store: track the last record in the file, and keep the last
* page pinned in memory for fast appends, to a skiplist of appended
@@ -63,6 +53,17 @@ struct __wt_btree {
uint64_t last_recno; /* Col-store append, last recno */
WT_INSERT_HEAD **append; /* Appended items */
+ WT_PAGE *root_page; /* Root page */
+ WT_ADDR root_addr; /* Replacement root address */
+ int root_update; /* 0: free original root blocks
+ 1: free saved root blocks and
+ update on close */
+
+ void *block; /* Block manager */
+ u_int block_header; /* Block manager header length */
+
+ WT_PAGE *evict_page; /* Eviction thread's location */
+
WT_BTREE_STATS *stats; /* Btree statistics */
#define WT_BTREE_BULK 0x01 /* Bulk-load handle */
diff --git a/src/include/cell.i b/src/include/cell.i
index 3789ebb117d..df29e195540 100644
--- a/src/include/cell.i
+++ b/src/include/cell.i
@@ -111,8 +111,9 @@ struct __wt_cell_unpack {
* WT_CELL_FOREACH --
* Walk the cells on a page.
*/
-#define WT_CELL_FOREACH(dsk, cell, unpack, i) \
- for ((cell) = WT_PAGE_DISK_BYTE(dsk), (i) = (dsk)->u.entries; \
+#define WT_CELL_FOREACH(btree, dsk, cell, unpack, i) \
+ for ((cell) = \
+ WT_PAGE_HEADER_BYTE(btree, dsk), (i) = (dsk)->u.entries; \
(i) > 0; \
(cell) = (WT_CELL *)((uint8_t *)(cell) + (unpack)->len), --(i))
diff --git a/src/include/extern.h b/src/include/extern.h
index d278945763e..1328a88ffb7 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -64,6 +64,8 @@ extern int __wt_bm_read(WT_SESSION_IMPL *session,
WT_BUF *buf,
const uint8_t *addr,
uint32_t addr_size);
+extern int __wt_bm_block_header(WT_SESSION_IMPL *session, uint32_t *headerp);
+extern int __wt_bm_write_size(WT_SESSION_IMPL *session, uint32_t *sizep);
extern int __wt_bm_write( WT_SESSION_IMPL *session,
WT_BUF *buf,
uint8_t *addr,
@@ -74,6 +76,7 @@ extern int __wt_bm_salvage_next(WT_SESSION_IMPL *session,
WT_BUF *buf,
uint8_t *addr,
uint32_t *addr_sizep,
+ uint64_t *write_genp,
int *eofp);
extern int __wt_bm_salvage_end(WT_SESSION_IMPL *session, int success);
extern int __wt_bm_verify_start(WT_SESSION_IMPL *session, int *emptyp);
@@ -105,11 +108,12 @@ extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_block_salvage_end(WT_SESSION_IMPL *session,
WT_BLOCK *block,
int success);
-extern int __wt_block_salvage_next(WT_SESSION_IMPL *session,
+extern int __wt_block_salvage_next( WT_SESSION_IMPL *session,
WT_BLOCK *block,
WT_BUF *buf,
uint8_t *addr,
uint32_t *addr_sizep,
+ uint64_t *write_genp,
int *eofp);
extern int __wt_block_verify_start(WT_SESSION_IMPL *session,
WT_BLOCK *block,
@@ -119,6 +123,12 @@ extern int __wt_block_verify_addr(WT_SESSION_IMPL *session,
WT_BLOCK *block,
const uint8_t *addr,
uint32_t addr_size);
+extern int __wt_block_header(WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
+ uint32_t *headerp);
+extern int __wt_block_write_size( WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
+ uint32_t *sizep);
extern int __wt_block_write_buf(WT_SESSION_IMPL *session,
WT_BLOCK *block,
WT_BUF *buf,
@@ -157,8 +167,8 @@ extern int __wt_debug_addr( WT_SESSION_IMPL *session,
uint32_t addr,
uint32_t size,
const char *ofile);
-extern int __wt_debug_disk(WT_SESSION_IMPL *session,
- WT_PAGE_DISK *dsk,
+extern int __wt_debug_disk( WT_SESSION_IMPL *session,
+ WT_PAGE_HEADER *dsk,
const char *ofile);
extern int __wt_debug_tree_all(WT_SESSION_IMPL *session,
WT_PAGE *page,
@@ -214,7 +224,7 @@ __wt_page_in_func(
extern int __wt_page_inmem(WT_SESSION_IMPL *session,
WT_PAGE *parent,
WT_REF *parent_ref,
- WT_PAGE_DISK *dsk,
+ WT_PAGE_HEADER *dsk,
WT_PAGE **pagep);
extern int __wt_page_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_cache_read(WT_SESSION_IMPL *session,
@@ -235,7 +245,7 @@ extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_dumpfile(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_verify_dsk(WT_SESSION_IMPL *session,
const char *addr,
- WT_PAGE_DISK *dsk,
+ WT_PAGE_HEADER *dsk,
uint32_t size);
extern int __wt_tree_np(WT_SESSION_IMPL *session,
WT_PAGE **pagep,
diff --git a/src/include/verify_build.h b/src/include/verify_build.h
index d0c57c10f3e..9df9b4c7b23 100644
--- a/src/include/verify_build.h
+++ b/src/include/verify_build.h
@@ -43,22 +43,10 @@
static inline void
__wt_verify_build(void)
{
- /*
- * The compiler had better not have padded our structures -- make sure
- * the page header structure is exactly what we expect.
- */
+ /* On-disk structures should not be padded. */
SIZE_CHECK(WT_BLOCK_DESC, WT_BLOCK_DESC_SIZE);
- /*
- * The page header is special: the compiler will pad it to a multiple
- * of 8 bytes because it has 64-bit fields that need alignment. We
- * use WT_PAGE_DISK_SIZE everywhere instead of sizeof to avoid writing
- * 4 extra bytes to the file.
- */
- SIZE_CHECK(WT_PAGE_DISK, WT_ALIGN(WT_PAGE_DISK_SIZE, sizeof(void *)));
-
- /* There are also structures that must be aligned correctly. */
- ALIGN_CHECK(WT_PAGE_DISK, sizeof(uint32_t));
+ /* Some structures must be aligned correctly. */
ALIGN_CHECK(WT_SESSION_BUFFER, sizeof(uint32_t));
/*
diff --git a/src/include/wt_internal.in b/src/include/wt_internal.in
index 407e40c922c..df9b817660c 100644
--- a/src/include/wt_internal.in
+++ b/src/include/wt_internal.in
@@ -51,6 +51,8 @@ struct __wt_block;
typedef struct __wt_block WT_BLOCK;
struct __wt_block_desc;
typedef struct __wt_block_desc WT_BLOCK_DESC;
+struct __wt_block_header;
+ typedef struct __wt_block_header WT_BLOCK_HEADER;
struct __wt_btree;
typedef struct __wt_btree WT_BTREE;
struct __wt_btree_session;
@@ -115,8 +117,8 @@ struct __wt_named_compressor;
typedef struct __wt_named_compressor WT_NAMED_COMPRESSOR;
struct __wt_page;
typedef struct __wt_page WT_PAGE;
-struct __wt_page_disk;
- typedef struct __wt_page_disk WT_PAGE_DISK;
+struct __wt_page_header;
+ typedef struct __wt_page_header WT_PAGE_HEADER;
struct __wt_page_modify;
typedef struct __wt_page_modify WT_PAGE_MODIFY;
struct __wt_page_track;