diff options
author | Michael Cahill <michael.cahill@wiredtiger.com> | 2013-05-09 21:23:47 +1000 |
---|---|---|
committer | Michael Cahill <michael.cahill@wiredtiger.com> | 2013-05-09 21:23:47 +1000 |
commit | 7e073cfa08380fca6096911fba8ed34514133e2e (patch) | |
tree | 2b483d880f62773f276623181a83a93ff9addf51 | |
parent | ae50550dde2adae738c626b764dd2e37a39e145b (diff) | |
download | mongo-7e073cfa08380fca6096911fba8ed34514133e2e.tar.gz |
Use the allocation size for the file "desc" block. This is important for direct I/O, so that reads and writes are aligned as expected.
-rw-r--r-- | src/block/block_addr.c | 11 | ||||
-rw-r--r-- | src/block/block_ckpt.c | 16 | ||||
-rw-r--r-- | src/block/block_ext.c | 6 | ||||
-rw-r--r-- | src/block/block_open.c | 26 | ||||
-rw-r--r-- | src/block/block_slvg.c | 19 | ||||
-rw-r--r-- | src/block/block_vrfy.c | 12 | ||||
-rw-r--r-- | src/include/block.h | 3 | ||||
-rw-r--r-- | src/include/extern.h | 19 | ||||
-rw-r--r-- | src/schema/schema_create.c | 14 | ||||
-rw-r--r-- | src/schema/schema_truncate.c | 12 | ||||
-rw-r--r-- | test/salvage/salvage.c | 6 |
11 files changed, 80 insertions, 64 deletions
diff --git a/src/block/block_addr.c b/src/block/block_addr.c index 359d229bf10..196bacc6f80 100644 --- a/src/block/block_addr.c +++ b/src/block/block_addr.c @@ -24,9 +24,9 @@ __block_buffer_to_addr(WT_BLOCK *block, /* * To avoid storing large offsets, we minimize the value by subtracting - * 512B (the size of the description sector), and then storing a count - * of block allocation units. That implies there is no such thing as - * an "invalid" offset though, they could all be valid (other than very + * a block for the description sector, then storing a count of block + * allocation units. That implies there is no such thing as an + * "invalid" offset though, they could all be valid (other than very * large numbers), which is what we didn't want to store in the first * place. Use the size: writing a block of size 0 makes no sense, so * that's the out-of-band value. Once we're out of this function and @@ -38,7 +38,7 @@ __block_buffer_to_addr(WT_BLOCK *block, *offsetp = 0; *sizep = *cksump = 0; } else { - *offsetp = (off_t)o * block->allocsize + WT_BLOCK_DESC_SECTOR; + *offsetp = (off_t)(o + 1) * block->allocsize; *sizep = (uint32_t)s * block->allocsize; *cksump = (uint32_t)c; } @@ -60,8 +60,7 @@ __wt_block_addr_to_buffer(WT_BLOCK *block, o = WT_BLOCK_INVALID_OFFSET; s = c = 0; } else { - o = (uint64_t) - (offset - WT_BLOCK_DESC_SECTOR) / block->allocsize; + o = (uint64_t)offset / block->allocsize - 1; s = size / block->allocsize; c = cksum; } diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c index 61728cfad15..f83144ce8e7 100644 --- a/src/block/block_ckpt.c +++ b/src/block/block_ckpt.c @@ -18,8 +18,8 @@ static int __ckpt_update(WT_SESSION_IMPL *, * Initialize a checkpoint structure. */ int -__wt_block_ckpt_init( - WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name) +__wt_block_ckpt_init(WT_SESSION_IMPL *session, + WT_BLOCK_CKPT *ci, const char *name, uint32_t allocsize) { memset(ci, 0, sizeof(*ci)); @@ -29,7 +29,7 @@ __wt_block_ckpt_init( WT_RET(__wt_block_extlist_init(session, &ci->avail, name, "avail")); WT_RET(__wt_block_extlist_init(session, &ci->discard, name, "discard")); - ci->file_size = WT_BLOCK_DESC_SECTOR; + ci->file_size = allocsize; WT_RET(__wt_block_extlist_init( session, &ci->ckpt_avail, name, "ckpt_avail")); @@ -77,7 +77,8 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, */ if (checkpoint) { ci = &_ci; - WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint")); + WT_ERR(__wt_block_ckpt_init( + session, ci, "checkpoint", block->allocsize)); } else { /* * We depend on the btree level for locking: things will go @@ -86,7 +87,8 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, * file, for that matter. */ ci = &block->live; - WT_ERR(__wt_block_ckpt_init(session, ci, "live")); + WT_ERR(__wt_block_ckpt_init( + session, ci, "live", block->allocsize)); } /* If the checkpoint has an on-disk root page, load it. */ @@ -236,7 +238,7 @@ __ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt) WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv)); ci = ckpt->bpriv; - WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name)); + WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name, block->allocsize)); WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci)); WT_RET(__wt_block_extlist_read( session, block, &ci->alloc, ci->file_size)); @@ -691,7 +693,7 @@ __ckpt_string(WT_SESSION_IMPL *session, /* Initialize the checkpoint, crack the cookie. */ ci = &_ci; - WT_RET(__wt_block_ckpt_init(session, ci, "string")); + WT_RET(__wt_block_ckpt_init(session, ci, "string", block->allocsize)); WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci)); WT_RET(__wt_buf_fmt(session, buf, diff --git a/src/block/block_ext.c b/src/block/block_ext.c index 204fd418c81..c89dd3a85ef 100644 --- a/src/block/block_ext.c +++ b/src/block/block_ext.c @@ -459,7 +459,7 @@ __block_extend( * * We should never be allocating from an empty file. */ - if (fh->file_size < WT_BLOCK_DESC_SECTOR) + if (fh->file_size < block->allocsize) WT_RET_MSG(session, EINVAL, "cannot allocate from a file with no description " "information"); @@ -1076,8 +1076,8 @@ __wt_block_extlist_read( * a cheap test to do here and we'd have to do the check as part * of file verification, regardless. */ - if (off < WT_BLOCK_DESC_SECTOR || - (off - WT_BLOCK_DESC_SECTOR) % block->allocsize != 0 || + if (off < block->allocsize || + off % block->allocsize != 0 || size % block->allocsize != 0 || off + size > ckpt_size) corrupted: WT_ERR_MSG(session, WT_ERROR, diff --git a/src/block/block_open.c b/src/block/block_open.c index 750d29eaa72..5e7338a9f58 100644 --- a/src/block/block_open.c +++ b/src/block/block_open.c @@ -14,7 +14,8 @@ static int __desc_read(WT_SESSION_IMPL *, WT_BLOCK *); * Truncate a file. */ int -__wt_block_manager_truncate(WT_SESSION_IMPL *session, const char *filename) +__wt_block_manager_truncate( + WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize) { WT_DECL_RET; WT_FH *fh; @@ -26,7 +27,7 @@ __wt_block_manager_truncate(WT_SESSION_IMPL *session, const char *filename) WT_ERR(__wt_ftruncate(session, fh, (off_t)0)); /* Write out the file's meta-data. */ - ret = __wt_desc_init(session, fh); + ret = __wt_desc_init(session, fh, allocsize); /* Close the file handle. */ err: WT_TRET(__wt_close(session, fh)); @@ -39,7 +40,8 @@ err: WT_TRET(__wt_close(session, fh)); * Create a file. */ int -__wt_block_manager_create(WT_SESSION_IMPL *session, const char *filename) +__wt_block_manager_create( + WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize) { WT_DECL_RET; WT_FH *fh; @@ -48,7 +50,7 @@ __wt_block_manager_create(WT_SESSION_IMPL *session, const char *filename) WT_RET(__wt_open(session, filename, 1, 1, 1, &fh)); /* Write out the file's meta-data. */ - ret = __wt_desc_init(session, fh); + ret = __wt_desc_init(session, fh, allocsize); /* Close the file handle. */ WT_TRET(__wt_close(session, fh)); @@ -214,15 +216,15 @@ __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block) * Write a file's initial descriptor structure. */ int -__wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh) +__wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize) { WT_BLOCK_DESC *desc; WT_DECL_ITEM(buf); WT_DECL_RET; /* Use a scratch buffer to get correct alignment for direct I/O. */ - WT_RET(__wt_scr_alloc(session, WT_BLOCK_DESC_SECTOR, &buf)); - memset(buf->mem, 0, WT_BLOCK_DESC_SECTOR); + WT_RET(__wt_scr_alloc(session, allocsize, &buf)); + memset(buf->mem, 0, allocsize); desc = buf->mem; desc->magic = WT_BLOCK_MAGIC; @@ -231,9 +233,9 @@ __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh) /* Update the checksum. */ desc->cksum = 0; - desc->cksum = __wt_cksum(desc, WT_BLOCK_DESC_SECTOR); + desc->cksum = __wt_cksum(desc, allocsize); - ret = __wt_write(session, fh, (off_t)0, WT_BLOCK_DESC_SECTOR, desc); + ret = __wt_write(session, fh, (off_t)0, allocsize, desc); __wt_scr_free(&buf); return (ret); @@ -252,11 +254,11 @@ __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block) uint32_t cksum; /* Use a scratch buffer to get correct alignment for direct I/O. */ - WT_RET(__wt_scr_alloc(session, WT_BLOCK_DESC_SECTOR, &buf)); + WT_RET(__wt_scr_alloc(session, block->allocsize, &buf)); /* Read the first sector and verify the file's format. */ WT_ERR(__wt_read( - session, block->fh, (off_t)0, WT_BLOCK_DESC_SECTOR, buf->mem)); + session, block->fh, (off_t)0, block->allocsize, buf->mem)); desc = buf->mem; WT_VERBOSE_ERR(session, block, @@ -279,7 +281,7 @@ __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block) cksum = desc->cksum; desc->cksum = 0; if (desc->magic != WT_BLOCK_MAGIC || - cksum != __wt_cksum(desc, WT_BLOCK_DESC_SECTOR)) + cksum != __wt_cksum(desc, block->allocsize)) WT_ERR_MSG(session, WT_ERROR, "%s does not appear to be a WiredTiger file", block->name); diff --git a/src/block/block_slvg.c b/src/block/block_slvg.c index 488278fd41a..c264268af31 100644 --- a/src/block/block_slvg.c +++ b/src/block/block_slvg.c @@ -17,35 +17,34 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block) off_t len; uint32_t allocsize; + allocsize = block->allocsize; + /* Reset the description sector. */ - WT_RET(__wt_desc_init(session, block->fh)); + WT_RET(__wt_desc_init(session, block->fh, allocsize)); /* * Salvage creates a new checkpoint when it's finished, set up for * rolling an empty file forward. */ - WT_RET(__wt_block_ckpt_init(session, &block->live, "live")); + WT_RET(__wt_block_ckpt_init(session, &block->live, "live", allocsize)); /* * Truncate the file to an initial sector plus N allocation size * units (bytes trailing the last multiple of an allocation size * unit must be garbage, by definition). */ - if (block->fh->file_size > WT_BLOCK_DESC_SECTOR) { - allocsize = block->allocsize; - len = block->fh->file_size - WT_BLOCK_DESC_SECTOR; - len = (len / allocsize) * allocsize; - len += WT_BLOCK_DESC_SECTOR; + if (block->fh->file_size > allocsize) { + len = (block->fh->file_size / allocsize) * allocsize; if (len != block->fh->file_size) WT_RET(__wt_ftruncate(session, block->fh, len)); } else - len = WT_BLOCK_DESC_SECTOR; + len = allocsize; /* * The first sector of the file is the description record, skip it as * we read the file. */ - block->slvg_off = WT_BLOCK_DESC_SECTOR; + block->slvg_off = allocsize; /* * The only checkpoint extent we care about is the allocation list. @@ -53,7 +52,7 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block) * any blocks we don't want as we process the file. */ WT_RET(__wt_block_insert_ext(session, block, &block->live.alloc, - WT_BLOCK_DESC_SECTOR, len - WT_BLOCK_DESC_SECTOR)); + allocsize, len - allocsize)); return (0); } diff --git a/src/block/block_vrfy.c b/src/block/block_vrfy.c index 7c06bb8b193..15b15e92396 100644 --- a/src/block/block_vrfy.c +++ b/src/block/block_vrfy.c @@ -17,9 +17,9 @@ static int __verify_last_truncate(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *); /* The bit list ignores the first sector: convert to/from a frag/offset. */ #define WT_OFF_TO_FRAG(block, off) \ - (((off) - WT_BLOCK_DESC_SECTOR) / (block)->allocsize) + ((off) / (block)->allocsize - 1) #define WT_FRAG_TO_OFF(block, frag) \ - (((off_t)(frag)) * (block)->allocsize + WT_BLOCK_DESC_SECTOR) + (((off_t)(frag + 1)) * (block)->allocsize) /* * __wt_block_verify_start -- @@ -37,7 +37,7 @@ __wt_block_verify_start( * sense if we don't have a checkpoint. */ fh = block->fh; - if (fh->file_size == WT_BLOCK_DESC_SECTOR) + if (fh->file_size == block->allocsize) return (0); if (ckptbase[0].name == NULL) WT_RET_MSG(session, WT_ERROR, @@ -50,7 +50,7 @@ __wt_block_verify_start( * The file size should be a multiple of the allocsize, offset by the * size of the descriptor sector, the first 512B of the file. */ - if ((fh->file_size - WT_BLOCK_DESC_SECTOR) % block->allocsize != 0) + if (fh->file_size % block->allocsize != 0) WT_RET_MSG(session, WT_ERROR, "the file size is not a multiple of the allocation size"); @@ -114,7 +114,7 @@ __verify_last_avail( --ckpt; ci = &_ci; - WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name)); + WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name, block->allocsize)); WT_ERR(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci)); el = &ci->avail; @@ -151,7 +151,7 @@ __verify_last_truncate( --ckpt; ci = &_ci; - WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name)); + WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name, block->allocsize)); WT_ERR(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci)); WT_ERR(__wt_ftruncate(session, block->fh, ci->file_size)); diff --git a/src/include/block.h b/src/include/block.h index 0d4f3275a4c..dd488dc4fea 100644 --- a/src/include/block.h +++ b/src/include/block.h @@ -10,10 +10,9 @@ */ /* - * The file's description is written into the first 512B of the file, which + * The file's description is written into the first block of the file, which * means we can use an offset of 0 as an invalid offset. */ -#define WT_BLOCK_DESC_SECTOR 512 #define WT_BLOCK_INVALID_OFFSET 0 /* diff --git a/src/include/extern.h b/src/include/extern.h index 9c369043e05..c3284729f7f 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -27,9 +27,10 @@ extern int __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci); -extern int __wt_block_ckpt_init( WT_SESSION_IMPL *session, +extern int __wt_block_ckpt_init(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, - const char *name); + const char *name, + uint32_t allocsize); extern int __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, @@ -129,17 +130,21 @@ extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *cfg[], int forced_salvage, WT_BM **bmp); -extern int __wt_block_manager_truncate(WT_SESSION_IMPL *session, - const char *filename); -extern int __wt_block_manager_create(WT_SESSION_IMPL *session, - const char *filename); +extern int __wt_block_manager_truncate( WT_SESSION_IMPL *session, + const char *filename, + uint32_t allocsize); +extern int __wt_block_manager_create( WT_SESSION_IMPL *session, + const char *filename, + uint32_t allocsize); extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], int forced_salvage, WT_BLOCK **blockp); extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block); -extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh); +extern int __wt_desc_init(WT_SESSION_IMPL *session, + WT_FH *fh, + uint32_t allocsize); extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats); diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c index 9d93bb1e636..4d5510c71c4 100644 --- a/src/schema/schema_create.c +++ b/src/schema/schema_create.c @@ -11,10 +11,14 @@ static int __create_file(WT_SESSION_IMPL *session, const char *uri, int exclusive, const char *config) { + WT_CONFIG_ITEM cval; WT_DECL_ITEM(val); WT_DECL_RET; + uint32_t allocsize; int is_metadata; - const char *filecfg[4], *fileconf, *filename; + const char *fileconf, *filename; + const char *filecfg[] = + { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL }; fileconf = NULL; @@ -32,8 +36,11 @@ __create_file(WT_SESSION_IMPL *session, goto err; } + WT_RET(__wt_config_gets(session, filecfg, "allocation_size", &cval)); + allocsize = (uint32_t)cval.val; + /* Create the file. */ - WT_ERR(__wt_block_manager_create(session, filename)); + WT_ERR(__wt_block_manager_create(session, filename, allocsize)); if (WT_META_TRACKING(session)) WT_ERR(__wt_meta_track_fileop(session, NULL, uri)); @@ -46,10 +53,7 @@ __create_file(WT_SESSION_IMPL *session, WT_ERR(__wt_scr_alloc(session, 0, &val)); WT_ERR(__wt_buf_fmt(session, val, "version=(major=%d,minor=%d)", WT_BTREE_MAJOR_VERSION, WT_BTREE_MINOR_VERSION)); - filecfg[0] = WT_CONFIG_BASE(session, file_meta); - filecfg[1] = config; filecfg[2] = val->data; - filecfg[3] = NULL; WT_ERR(__wt_config_collapse(session, filecfg, &fileconf)); if ((ret = __wt_metadata_insert(session, uri, fileconf)) != 0) { if (ret == WT_DUPLICATE_KEY) diff --git a/src/schema/schema_truncate.c b/src/schema/schema_truncate.c index 3ead2afa1fc..7be6eb941a6 100644 --- a/src/schema/schema_truncate.c +++ b/src/schema/schema_truncate.c @@ -15,17 +15,21 @@ static int __truncate_file(WT_SESSION_IMPL *session, const char *name) { const char *filename; + uint32_t allocsize; filename = name; if (!WT_PREFIX_SKIP(filename, "file:")) return (EINVAL); + /* Get the allocation size. */ + allocsize = S2BT(session)->allocsize; + /* Close any btree handles in the file. */ WT_RET(__wt_conn_dhandle_close_all(session, name)); /* Delete the root address and truncate the file. */ WT_RET(__wt_meta_checkpoint_clear(session, name)); - WT_RET(__wt_block_manager_truncate(session, filename)); + WT_RET(__wt_block_manager_truncate(session, filename, allocsize)); return (0); } @@ -120,9 +124,11 @@ __wt_schema_truncate( WT_UNUSED(cfg); tablename = uri; - if (WT_PREFIX_MATCH(uri, "file:")) + if (WT_PREFIX_MATCH(uri, "file:")) { + WT_RET(__wt_session_get_btree( + session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE)); ret = __truncate_file(session, uri); - else if (WT_PREFIX_MATCH(uri, "lsm:")) + } else if (WT_PREFIX_MATCH(uri, "lsm:")) ret = __wt_lsm_tree_truncate(session, uri, cfg); else if (WT_PREFIX_SKIP(tablename, "table:")) ret = __truncate_table(session, tablename); diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c index e8a473c7d89..2486d18e439 100644 --- a/test/salvage/salvage.c +++ b/test/salvage/salvage.c @@ -560,8 +560,8 @@ copy(u_int gen, u_int recno) */ if (access(SLVG, F_OK)) { assert((ofp = fopen(SLVG, "w")) != NULL); - assert(fread(buf, 1, 512, ifp) == 512); - assert(fwrite(buf, 1, 512, ofp) == 512); + assert(fread(buf, 1, PSIZE, ifp) == PSIZE); + assert(fwrite(buf, 1, PSIZE, ofp) == PSIZE); } else assert((ofp = fopen(SLVG, "a")) != NULL); @@ -569,7 +569,7 @@ copy(u_int gen, u_int recno) * If there's data, copy/update the first formatted page. */ if (gen != 0) { - assert(fseek(ifp, (long)512, SEEK_SET) == 0); + assert(fseek(ifp, (long)PSIZE, SEEK_SET) == 0); assert(fread(buf, 1, PSIZE, ifp) == PSIZE); dsk = (void *)buf; if (page_type != WT_PAGE_ROW_LEAF) |