summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Michael Cahill <michael.cahill@wiredtiger.com> 2013-05-09 21:23:47 +1000
committer: Michael Cahill <michael.cahill@wiredtiger.com> 2013-05-09 21:23:47 +1000
commit: 7e073cfa08380fca6096911fba8ed34514133e2e (patch)
tree: 2b483d880f62773f276623181a83a93ff9addf51
parent: ae50550dde2adae738c626b764dd2e37a39e145b (diff)
download: mongo-7e073cfa08380fca6096911fba8ed34514133e2e.tar.gz
Use the allocation size for the file "desc" block. This is important for direct I/O, so that reads and writes are aligned as expected.
-rw-r--r-- src/block/block_addr.c 11
-rw-r--r-- src/block/block_ckpt.c 16
-rw-r--r-- src/block/block_ext.c 6
-rw-r--r-- src/block/block_open.c 26
-rw-r--r-- src/block/block_slvg.c 19
-rw-r--r-- src/block/block_vrfy.c 12
-rw-r--r-- src/include/block.h 3
-rw-r--r-- src/include/extern.h 19
-rw-r--r-- src/schema/schema_create.c 14
-rw-r--r-- src/schema/schema_truncate.c 12
-rw-r--r-- test/salvage/salvage.c 6
11 files changed, 80 insertions, 64 deletions
diff --git a/src/block/block_addr.c b/src/block/block_addr.c
index 359d229bf10..196bacc6f80 100644
--- a/src/block/block_addr.c
+++ b/src/block/block_addr.c
@@ -24,9 +24,9 @@ __block_buffer_to_addr(WT_BLOCK *block,
/*
* To avoid storing large offsets, we minimize the value by subtracting
- * 512B (the size of the description sector), and then storing a count
- * of block allocation units. That implies there is no such thing as
- * an "invalid" offset though, they could all be valid (other than very
+ * a block for the description sector, then storing a count of block
+ * allocation units. That implies there is no such thing as an
+ * "invalid" offset though, they could all be valid (other than very
* large numbers), which is what we didn't want to store in the first
* place. Use the size: writing a block of size 0 makes no sense, so
* that's the out-of-band value. Once we're out of this function and
@@ -38,7 +38,7 @@ __block_buffer_to_addr(WT_BLOCK *block,
*offsetp = 0;
*sizep = *cksump = 0;
} else {
- *offsetp = (off_t)o * block->allocsize + WT_BLOCK_DESC_SECTOR;
+ *offsetp = (off_t)(o + 1) * block->allocsize;
*sizep = (uint32_t)s * block->allocsize;
*cksump = (uint32_t)c;
}
@@ -60,8 +60,7 @@ __wt_block_addr_to_buffer(WT_BLOCK *block,
o = WT_BLOCK_INVALID_OFFSET;
s = c = 0;
} else {
- o = (uint64_t)
- (offset - WT_BLOCK_DESC_SECTOR) / block->allocsize;
+ o = (uint64_t)offset / block->allocsize - 1;
s = size / block->allocsize;
c = cksum;
}
diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c
index 61728cfad15..f83144ce8e7 100644
--- a/src/block/block_ckpt.c
+++ b/src/block/block_ckpt.c
@@ -18,8 +18,8 @@ static int __ckpt_update(WT_SESSION_IMPL *,
* Initialize a checkpoint structure.
*/
int
-__wt_block_ckpt_init(
- WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
+__wt_block_ckpt_init(WT_SESSION_IMPL *session,
+ WT_BLOCK_CKPT *ci, const char *name, uint32_t allocsize)
{
memset(ci, 0, sizeof(*ci));
@@ -29,7 +29,7 @@ __wt_block_ckpt_init(
WT_RET(__wt_block_extlist_init(session, &ci->avail, name, "avail"));
WT_RET(__wt_block_extlist_init(session, &ci->discard, name, "discard"));
- ci->file_size = WT_BLOCK_DESC_SECTOR;
+ ci->file_size = allocsize;
WT_RET(__wt_block_extlist_init(
session, &ci->ckpt_avail, name, "ckpt_avail"));
@@ -77,7 +77,8 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
*/
if (checkpoint) {
ci = &_ci;
- WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
+ WT_ERR(__wt_block_ckpt_init(
+ session, ci, "checkpoint", block->allocsize));
} else {
/*
* We depend on the btree level for locking: things will go
@@ -86,7 +87,8 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
* file, for that matter.
*/
ci = &block->live;
- WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
+ WT_ERR(__wt_block_ckpt_init(
+ session, ci, "live", block->allocsize));
}
/* If the checkpoint has an on-disk root page, load it. */
@@ -236,7 +238,7 @@ __ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
ci = ckpt->bpriv;
- WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
+ WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name, block->allocsize));
WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
WT_RET(__wt_block_extlist_read(
session, block, &ci->alloc, ci->file_size));
@@ -691,7 +693,7 @@ __ckpt_string(WT_SESSION_IMPL *session,
/* Initialize the checkpoint, crack the cookie. */
ci = &_ci;
- WT_RET(__wt_block_ckpt_init(session, ci, "string"));
+ WT_RET(__wt_block_ckpt_init(session, ci, "string", block->allocsize));
WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci));
WT_RET(__wt_buf_fmt(session, buf,
diff --git a/src/block/block_ext.c b/src/block/block_ext.c
index 204fd418c81..c89dd3a85ef 100644
--- a/src/block/block_ext.c
+++ b/src/block/block_ext.c
@@ -459,7 +459,7 @@ __block_extend(
*
* We should never be allocating from an empty file.
*/
- if (fh->file_size < WT_BLOCK_DESC_SECTOR)
+ if (fh->file_size < block->allocsize)
WT_RET_MSG(session, EINVAL,
"cannot allocate from a file with no description "
"information");
@@ -1076,8 +1076,8 @@ __wt_block_extlist_read(
* a cheap test to do here and we'd have to do the check as part
* of file verification, regardless.
*/
- if (off < WT_BLOCK_DESC_SECTOR ||
- (off - WT_BLOCK_DESC_SECTOR) % block->allocsize != 0 ||
+ if (off < block->allocsize ||
+ off % block->allocsize != 0 ||
size % block->allocsize != 0 ||
off + size > ckpt_size)
corrupted: WT_ERR_MSG(session, WT_ERROR,
diff --git a/src/block/block_open.c b/src/block/block_open.c
index 750d29eaa72..5e7338a9f58 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -14,7 +14,8 @@ static int __desc_read(WT_SESSION_IMPL *, WT_BLOCK *);
* Truncate a file.
*/
int
-__wt_block_manager_truncate(WT_SESSION_IMPL *session, const char *filename)
+__wt_block_manager_truncate(
+ WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize)
{
WT_DECL_RET;
WT_FH *fh;
@@ -26,7 +27,7 @@ __wt_block_manager_truncate(WT_SESSION_IMPL *session, const char *filename)
WT_ERR(__wt_ftruncate(session, fh, (off_t)0));
/* Write out the file's meta-data. */
- ret = __wt_desc_init(session, fh);
+ ret = __wt_desc_init(session, fh, allocsize);
/* Close the file handle. */
err: WT_TRET(__wt_close(session, fh));
@@ -39,7 +40,8 @@ err: WT_TRET(__wt_close(session, fh));
* Create a file.
*/
int
-__wt_block_manager_create(WT_SESSION_IMPL *session, const char *filename)
+__wt_block_manager_create(
+ WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize)
{
WT_DECL_RET;
WT_FH *fh;
@@ -48,7 +50,7 @@ __wt_block_manager_create(WT_SESSION_IMPL *session, const char *filename)
WT_RET(__wt_open(session, filename, 1, 1, 1, &fh));
/* Write out the file's meta-data. */
- ret = __wt_desc_init(session, fh);
+ ret = __wt_desc_init(session, fh, allocsize);
/* Close the file handle. */
WT_TRET(__wt_close(session, fh));
@@ -214,15 +216,15 @@ __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
* Write a file's initial descriptor structure.
*/
int
-__wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh)
+__wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize)
{
WT_BLOCK_DESC *desc;
WT_DECL_ITEM(buf);
WT_DECL_RET;
/* Use a scratch buffer to get correct alignment for direct I/O. */
- WT_RET(__wt_scr_alloc(session, WT_BLOCK_DESC_SECTOR, &buf));
- memset(buf->mem, 0, WT_BLOCK_DESC_SECTOR);
+ WT_RET(__wt_scr_alloc(session, allocsize, &buf));
+ memset(buf->mem, 0, allocsize);
desc = buf->mem;
desc->magic = WT_BLOCK_MAGIC;
@@ -231,9 +233,9 @@ __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh)
/* Update the checksum. */
desc->cksum = 0;
- desc->cksum = __wt_cksum(desc, WT_BLOCK_DESC_SECTOR);
+ desc->cksum = __wt_cksum(desc, allocsize);
- ret = __wt_write(session, fh, (off_t)0, WT_BLOCK_DESC_SECTOR, desc);
+ ret = __wt_write(session, fh, (off_t)0, allocsize, desc);
__wt_scr_free(&buf);
return (ret);
@@ -252,11 +254,11 @@ __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block)
uint32_t cksum;
/* Use a scratch buffer to get correct alignment for direct I/O. */
- WT_RET(__wt_scr_alloc(session, WT_BLOCK_DESC_SECTOR, &buf));
+ WT_RET(__wt_scr_alloc(session, block->allocsize, &buf));
/* Read the first sector and verify the file's format. */
WT_ERR(__wt_read(
- session, block->fh, (off_t)0, WT_BLOCK_DESC_SECTOR, buf->mem));
+ session, block->fh, (off_t)0, block->allocsize, buf->mem));
desc = buf->mem;
WT_VERBOSE_ERR(session, block,
@@ -279,7 +281,7 @@ __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block)
cksum = desc->cksum;
desc->cksum = 0;
if (desc->magic != WT_BLOCK_MAGIC ||
- cksum != __wt_cksum(desc, WT_BLOCK_DESC_SECTOR))
+ cksum != __wt_cksum(desc, block->allocsize))
WT_ERR_MSG(session, WT_ERROR,
"%s does not appear to be a WiredTiger file", block->name);
diff --git a/src/block/block_slvg.c b/src/block/block_slvg.c
index 488278fd41a..c264268af31 100644
--- a/src/block/block_slvg.c
+++ b/src/block/block_slvg.c
@@ -17,35 +17,34 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
off_t len;
uint32_t allocsize;
+ allocsize = block->allocsize;
+
/* Reset the description sector. */
- WT_RET(__wt_desc_init(session, block->fh));
+ WT_RET(__wt_desc_init(session, block->fh, allocsize));
/*
* Salvage creates a new checkpoint when it's finished, set up for
* rolling an empty file forward.
*/
- WT_RET(__wt_block_ckpt_init(session, &block->live, "live"));
+ WT_RET(__wt_block_ckpt_init(session, &block->live, "live", allocsize));
/*
* Truncate the file to an initial sector plus N allocation size
* units (bytes trailing the last multiple of an allocation size
* unit must be garbage, by definition).
*/
- if (block->fh->file_size > WT_BLOCK_DESC_SECTOR) {
- allocsize = block->allocsize;
- len = block->fh->file_size - WT_BLOCK_DESC_SECTOR;
- len = (len / allocsize) * allocsize;
- len += WT_BLOCK_DESC_SECTOR;
+ if (block->fh->file_size > allocsize) {
+ len = (block->fh->file_size / allocsize) * allocsize;
if (len != block->fh->file_size)
WT_RET(__wt_ftruncate(session, block->fh, len));
} else
- len = WT_BLOCK_DESC_SECTOR;
+ len = allocsize;
/*
* The first sector of the file is the description record, skip it as
* we read the file.
*/
- block->slvg_off = WT_BLOCK_DESC_SECTOR;
+ block->slvg_off = allocsize;
/*
* The only checkpoint extent we care about is the allocation list.
@@ -53,7 +52,7 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
* any blocks we don't want as we process the file.
*/
WT_RET(__wt_block_insert_ext(session, block, &block->live.alloc,
- WT_BLOCK_DESC_SECTOR, len - WT_BLOCK_DESC_SECTOR));
+ allocsize, len - allocsize));
return (0);
}
diff --git a/src/block/block_vrfy.c b/src/block/block_vrfy.c
index 7c06bb8b193..15b15e92396 100644
--- a/src/block/block_vrfy.c
+++ b/src/block/block_vrfy.c
@@ -17,9 +17,9 @@ static int __verify_last_truncate(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
/* The bit list ignores the first sector: convert to/from a frag/offset. */
#define WT_OFF_TO_FRAG(block, off) \
- (((off) - WT_BLOCK_DESC_SECTOR) / (block)->allocsize)
+ ((off) / (block)->allocsize - 1)
#define WT_FRAG_TO_OFF(block, frag) \
- (((off_t)(frag)) * (block)->allocsize + WT_BLOCK_DESC_SECTOR)
+ (((off_t)(frag + 1)) * (block)->allocsize)
/*
* __wt_block_verify_start --
@@ -37,7 +37,7 @@ __wt_block_verify_start(
* sense if we don't have a checkpoint.
*/
fh = block->fh;
- if (fh->file_size == WT_BLOCK_DESC_SECTOR)
+ if (fh->file_size == block->allocsize)
return (0);
if (ckptbase[0].name == NULL)
WT_RET_MSG(session, WT_ERROR,
@@ -50,7 +50,7 @@ __wt_block_verify_start(
* The file size should be a multiple of the allocsize, offset by the
* size of the descriptor sector, the first 512B of the file.
*/
- if ((fh->file_size - WT_BLOCK_DESC_SECTOR) % block->allocsize != 0)
+ if (fh->file_size % block->allocsize != 0)
WT_RET_MSG(session, WT_ERROR,
"the file size is not a multiple of the allocation size");
@@ -114,7 +114,7 @@ __verify_last_avail(
--ckpt;
ci = &_ci;
- WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
+ WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name, block->allocsize));
WT_ERR(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
el = &ci->avail;
@@ -151,7 +151,7 @@ __verify_last_truncate(
--ckpt;
ci = &_ci;
- WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
+ WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name, block->allocsize));
WT_ERR(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
WT_ERR(__wt_ftruncate(session, block->fh, ci->file_size));
diff --git a/src/include/block.h b/src/include/block.h
index 0d4f3275a4c..dd488dc4fea 100644
--- a/src/include/block.h
+++ b/src/include/block.h
@@ -10,10 +10,9 @@
*/
/*
- * The file's description is written into the first 512B of the file, which
+ * The file's description is written into the first block of the file, which
* means we can use an offset of 0 as an invalid offset.
*/
-#define WT_BLOCK_DESC_SECTOR 512
#define WT_BLOCK_INVALID_OFFSET 0
/*
diff --git a/src/include/extern.h b/src/include/extern.h
index 9c369043e05..c3284729f7f 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -27,9 +27,10 @@ extern int __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session,
WT_BLOCK *block,
uint8_t **pp,
WT_BLOCK_CKPT *ci);
-extern int __wt_block_ckpt_init( WT_SESSION_IMPL *session,
+extern int __wt_block_ckpt_init(WT_SESSION_IMPL *session,
WT_BLOCK_CKPT *ci,
- const char *name);
+ const char *name,
+ uint32_t allocsize);
extern int __wt_block_checkpoint_load(WT_SESSION_IMPL *session,
WT_BLOCK *block,
const uint8_t *addr,
@@ -129,17 +130,21 @@ extern int __wt_block_manager_open(WT_SESSION_IMPL *session,
const char *cfg[],
int forced_salvage,
WT_BM **bmp);
-extern int __wt_block_manager_truncate(WT_SESSION_IMPL *session,
- const char *filename);
-extern int __wt_block_manager_create(WT_SESSION_IMPL *session,
- const char *filename);
+extern int __wt_block_manager_truncate( WT_SESSION_IMPL *session,
+ const char *filename,
+ uint32_t allocsize);
+extern int __wt_block_manager_create( WT_SESSION_IMPL *session,
+ const char *filename,
+ uint32_t allocsize);
extern int __wt_block_open(WT_SESSION_IMPL *session,
const char *filename,
const char *cfg[],
int forced_salvage,
WT_BLOCK **blockp);
extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block);
-extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh);
+extern int __wt_desc_init(WT_SESSION_IMPL *session,
+ WT_FH *fh,
+ uint32_t allocsize);
extern void __wt_block_stat(WT_SESSION_IMPL *session,
WT_BLOCK *block,
WT_DSRC_STATS *stats);
diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c
index 9d93bb1e636..4d5510c71c4 100644
--- a/src/schema/schema_create.c
+++ b/src/schema/schema_create.c
@@ -11,10 +11,14 @@ static int
__create_file(WT_SESSION_IMPL *session,
const char *uri, int exclusive, const char *config)
{
+ WT_CONFIG_ITEM cval;
WT_DECL_ITEM(val);
WT_DECL_RET;
+ uint32_t allocsize;
int is_metadata;
- const char *filecfg[4], *fileconf, *filename;
+ const char *fileconf, *filename;
+ const char *filecfg[] =
+ { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL };
fileconf = NULL;
@@ -32,8 +36,11 @@ __create_file(WT_SESSION_IMPL *session,
goto err;
}
+ WT_RET(__wt_config_gets(session, filecfg, "allocation_size", &cval));
+ allocsize = (uint32_t)cval.val;
+
/* Create the file. */
- WT_ERR(__wt_block_manager_create(session, filename));
+ WT_ERR(__wt_block_manager_create(session, filename, allocsize));
if (WT_META_TRACKING(session))
WT_ERR(__wt_meta_track_fileop(session, NULL, uri));
@@ -46,10 +53,7 @@ __create_file(WT_SESSION_IMPL *session,
WT_ERR(__wt_scr_alloc(session, 0, &val));
WT_ERR(__wt_buf_fmt(session, val, "version=(major=%d,minor=%d)",
WT_BTREE_MAJOR_VERSION, WT_BTREE_MINOR_VERSION));
- filecfg[0] = WT_CONFIG_BASE(session, file_meta);
- filecfg[1] = config;
filecfg[2] = val->data;
- filecfg[3] = NULL;
WT_ERR(__wt_config_collapse(session, filecfg, &fileconf));
if ((ret = __wt_metadata_insert(session, uri, fileconf)) != 0) {
if (ret == WT_DUPLICATE_KEY)
diff --git a/src/schema/schema_truncate.c b/src/schema/schema_truncate.c
index 3ead2afa1fc..7be6eb941a6 100644
--- a/src/schema/schema_truncate.c
+++ b/src/schema/schema_truncate.c
@@ -15,17 +15,21 @@ static int
__truncate_file(WT_SESSION_IMPL *session, const char *name)
{
const char *filename;
+ uint32_t allocsize;
filename = name;
if (!WT_PREFIX_SKIP(filename, "file:"))
return (EINVAL);
+ /* Get the allocation size. */
+ allocsize = S2BT(session)->allocsize;
+
/* Close any btree handles in the file. */
WT_RET(__wt_conn_dhandle_close_all(session, name));
/* Delete the root address and truncate the file. */
WT_RET(__wt_meta_checkpoint_clear(session, name));
- WT_RET(__wt_block_manager_truncate(session, filename));
+ WT_RET(__wt_block_manager_truncate(session, filename, allocsize));
return (0);
}
@@ -120,9 +124,11 @@ __wt_schema_truncate(
WT_UNUSED(cfg);
tablename = uri;
- if (WT_PREFIX_MATCH(uri, "file:"))
+ if (WT_PREFIX_MATCH(uri, "file:")) {
+ WT_RET(__wt_session_get_btree(
+ session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
ret = __truncate_file(session, uri);
- else if (WT_PREFIX_MATCH(uri, "lsm:"))
+ } else if (WT_PREFIX_MATCH(uri, "lsm:"))
ret = __wt_lsm_tree_truncate(session, uri, cfg);
else if (WT_PREFIX_SKIP(tablename, "table:"))
ret = __truncate_table(session, tablename);
diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c
index e8a473c7d89..2486d18e439 100644
--- a/test/salvage/salvage.c
+++ b/test/salvage/salvage.c
@@ -560,8 +560,8 @@ copy(u_int gen, u_int recno)
*/
if (access(SLVG, F_OK)) {
assert((ofp = fopen(SLVG, "w")) != NULL);
- assert(fread(buf, 1, 512, ifp) == 512);
- assert(fwrite(buf, 1, 512, ofp) == 512);
+ assert(fread(buf, 1, PSIZE, ifp) == PSIZE);
+ assert(fwrite(buf, 1, PSIZE, ofp) == PSIZE);
} else
assert((ofp = fopen(SLVG, "a")) != NULL);
@@ -569,7 +569,7 @@ copy(u_int gen, u_int recno)
* If there's data, copy/update the first formatted page.
*/
if (gen != 0) {
- assert(fseek(ifp, (long)512, SEEK_SET) == 0);
+ assert(fseek(ifp, (long)PSIZE, SEEK_SET) == 0);
assert(fread(buf, 1, PSIZE, ifp) == PSIZE);
dsk = (void *)buf;
if (page_type != WT_PAGE_ROW_LEAF)