summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKeith Bostic <keith@wiredtiger.com>2013-05-14 05:50:48 -0700
committerKeith Bostic <keith@wiredtiger.com>2013-05-14 05:50:48 -0700
commit3e85a5919f8fef654ea13a62f697dce55e959a13 (patch)
tree3488de99afa1fa9748b7645ee566effb18fd360a
parentdded2972d38359db5ce403d47ea841da90856586 (diff)
parent73a0c8a509188e97cf1eaf0155448091716499de (diff)
downloadmongo-3e85a5919f8fef654ea13a62f697dce55e959a13.tar.gz
Merge pull request #534 from wiredtiger/file-extend
file extension
-rw-r--r--build_posix/configure.ac.in5
-rw-r--r--dist/api_data.py6
-rw-r--r--dist/flags.py6
-rw-r--r--dist/s_string.ok1
-rw-r--r--examples/c/ex_all.c7
-rw-r--r--src/block/block_ckpt.c36
-rw-r--r--src/block/block_write.c37
-rw-r--r--src/config/config_def.c5
-rw-r--r--src/conn/conn_api.c27
-rw-r--r--src/docs/tuning.dox9
-rw-r--r--src/include/connection.h5
-rw-r--r--src/include/flags.h4
-rw-r--r--src/include/os.h2
-rw-r--r--src/include/wiredtiger.in5
-rw-r--r--src/os_posix/os_ftruncate.c2
-rw-r--r--src/os_posix/os_open.c7
-rw-r--r--test/format/config.h4
-rw-r--r--test/format/format.h1
-rw-r--r--test/format/wts.c5
19 files changed, 134 insertions, 40 deletions
diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in
index 1b4afeb30ec..cab117b5c56 100644
--- a/build_posix/configure.ac.in
+++ b/build_posix/configure.ac.in
@@ -83,9 +83,8 @@ AC_CHECK_LIB(pthread, pthread_create)
AC_CHECK_LIB(dl, dlopen)
AC_CHECK_LIB(rt, sched_yield)
AC_CHECK_FUNCS([\
- clock_gettime fcntl gettimeofday \
- posix_fadvise posix_madvise posix_memalign\
- strtouq sync_file_range])
+ clock_gettime fcntl ftruncate gettimeofday posix_fadvise\
+ posix_fallocate posix_madvise posix_memalign strtouq sync_file_range])
AC_SYS_LARGEFILE
AC_MSG_CHECKING([if the adaptive mutex type is available])
diff --git a/dist/api_data.py b/dist/api_data.py
index c835faa52ee..b87ba07719d 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -560,6 +560,12 @@ methods = {
WT_CONNECTION::load_extension. For example,
<code>extensions=(/path/ext.so={entry=my_entry})</code>''',
type='list'),
+ Config('file_extend', '', r'''
+ file extension configuration. If set, extend files of the set
+ type in allocations of the set size, instead of a block at a
+ time as each new block is written. For example,
+ <code>file_extend=(data=16MB)</code>''',
+ type='list', choices=['data', 'log']),
Config('hazard_max', '1000', r'''
maximum number of simultaneous hazard pointers per session
handle''',
diff --git a/dist/flags.py b/dist/flags.py
index 607c7f13aa9..1c1dbc3564d 100644
--- a/dist/flags.py
+++ b/dist/flags.py
@@ -15,9 +15,9 @@ flags = {
'SYNC_DISCARD_NOWRITE',
'SYNC_WRITE_LEAVES',
],
- 'direct_io' : [
- 'DIRECTIO_DATA',
- 'DIRECTIO_LOG'
+ 'file_types' : [
+ 'FILE_TYPE_DATA',
+ 'FILE_TYPE_LOG'
],
'rec_write' : [
'EVICTION_SERVER_LOCKED',
diff --git a/dist/s_string.ok b/dist/s_string.ok
index 39537b8f597..57a7e09f1cb 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -402,6 +402,7 @@ extern
extlist
extlists
fadvise
+fallocate
fblocks
fclose
fcntl
diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c
index 4eb3430f79e..edf2437d167 100644
--- a/examples/c/ex_all.c
+++ b/examples/c/ex_all.c
@@ -1013,6 +1013,13 @@ main(void)
(void)conn->close(conn, NULL);
#endif
+ /*! [Configure file_extend] */
+ ret = wiredtiger_open(home, NULL,
+ "create,file_extend=(type=[data],size=16MB)", &conn);
+ /*! [Configure file_extend] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
/*! [Statistics configuration] */
ret = wiredtiger_open(home, NULL, "create,statistics=true", &conn);
/*! [Statistics configuration] */
diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c
index b6e70408564..8fa36075887 100644
--- a/src/block/block_ckpt.c
+++ b/src/block/block_ckpt.c
@@ -117,21 +117,20 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
WT_ERR(__wt_block_extlist_read_avail(
session, block, &ci->avail, ci->file_size));
- /*
- * If the checkpoint can be written, that means anything written
- * after the checkpoint is no longer interesting, truncate the
- * file. Don't bother checking the avail list for a block at
- * the end of the file, that was done when the checkpoint was
- * first written (re-writing the checkpoint might possibly make
- * it relevant here, but it's unlikely enough I don't bother).
- */
- if (!checkpoint) {
- WT_VERBOSE_ERR(session, ckpt,
- "truncate file to %" PRIuMAX,
- (uintmax_t)ci->file_size);
- WT_ERR(
- __wt_ftruncate(session, block->fh, ci->file_size));
- }
+ }
+
+ /*
+ * If the checkpoint can be written, that means anything written after
+ * the checkpoint is no longer interesting, truncate the file. Don't
+ * bother checking the avail list for a block at the end of the file,
+ * that was done when the checkpoint was first written (re-writing the
+ * checkpoint might possibly make it relevant here, but it's unlikely
+ * enough I don't bother).
+ */
+ if (!checkpoint) {
+ WT_VERBOSE_ERR(session, ckpt,
+ "truncate file to %" PRIuMAX, (uintmax_t)ci->file_size);
+ WT_ERR(__wt_ftruncate(session, block->fh, ci->file_size));
}
if (0) {
@@ -161,6 +160,11 @@ __wt_block_checkpoint_unload(
if (block->verify)
WT_TRET(__wt_verify_ckpt_unload(session, block));
+ /* If it's the live system, truncate to discard any extended blocks. */
+ if (!checkpoint)
+ WT_TRET(__wt_ftruncate(session, block->fh, block->fh->size));
+
+ /* If it's the live system, discard the active extent lists. */
if (!checkpoint)
__wt_block_ckpt_destroy(session, &block->live);
@@ -615,7 +619,7 @@ __ckpt_update(
* if there ever is, this will need to be fixed.
*/
if (is_live)
- WT_RET(__wt_filesize(session, block->fh, &ci->file_size));
+ ci->file_size = block->fh->size;
/* Set the checkpoint size for the live system. */
if (is_live)
diff --git a/src/block/block_write.c b/src/block/block_write.c
index 13cb0f25f0e..9cc4eaa289d 100644
--- a/src/block/block_write.c
+++ b/src/block/block_write.c
@@ -69,10 +69,12 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
{
WT_BLOCK_HEADER *blk;
WT_DECL_RET;
+ WT_FH *fh;
off_t offset;
uint32_t align_size;
blk = WT_BLOCK_HEADER_REF(buf->mem);
+ fh = block->fh;
/* Buffers should be aligned for writing. */
if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
@@ -131,8 +133,35 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
__wt_spin_unlock(session, &block->live_lock);
WT_RET(ret);
- if ((ret = __wt_write(
- session, block->fh, offset, align_size, buf->mem)) != 0) {
+#if defined(HAVE_POSIX_FALLOCATE) || defined(HAVE_FTRUNCATE)
+ /*
+ * Extend the file in chunks. We aren't holding a lock and we'd prefer
+ * to limit the number of threads extending the file at the same time,
+ * so choose the one thread that's crossing the extended boundary. We
+ * don't extend newly created files, and it's theoretically possible we
+ * might wait so long our extension of the file is passed by another
+ * thread writing single blocks; that's why there's a check in case the
+ * extended file size becomes too small: if the file size catches up,
+ * every thread will try to extend it.
+ */
+ if (fh->extend_len != 0 &&
+ (fh->extend_size <= fh->size ||
+ (offset + fh->extend_len <= fh->extend_size &&
+ offset + fh->extend_len + align_size >= fh->extend_size))) {
+ fh->extend_size = offset + fh->extend_len * 2;
+#if defined(HAVE_POSIX_FALLOCATE)
+ if ((ret =
+ posix_fallocate(fh->fd, offset, fh->extend_len * 2)) != 0)
+ WT_RET_MSG(
+ session, ret, "%s: posix_fallocate", fh->name);
+#elif defined(HAVE_FTRUNCATE)
+ if ((ret = ftruncate(fh->fd, fh->extend_size)) != 0)
+ WT_RET_MSG(session, ret, "%s: ftruncate", fh->name);
+#endif
+ }
+#endif
+ if ((ret =
+ __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
if (!locked)
__wt_spin_lock(session, &block->live_lock);
WT_TRET(
@@ -150,7 +179,7 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
if (block->os_cache_dirty_max != 0 &&
(block->os_cache_dirty += align_size) > block->os_cache_dirty_max) {
block->os_cache_dirty = 0;
- if ((ret = sync_file_range(block->fh->fd,
+ if ((ret = sync_file_range(fh->fd,
(off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) != 0)
WT_RET_MSG(
session, ret, "%s: sync_file_range", block->name);
@@ -161,7 +190,7 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
if (block->os_cache_max != 0 &&
(block->os_cache += align_size) > block->os_cache_max) {
block->os_cache = 0;
- if ((ret = posix_fadvise(block->fh->fd,
+ if ((ret = posix_fadvise(fh->fd,
(off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0)
WT_RET_MSG(
session, ret, "%s: posix_fadvise", block->name);
diff --git a/src/config/config_def.c b/src/config/config_def.c
index d77c76ff56e..c01894400c4 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -229,6 +229,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "eviction_target", "int", "min=10,max=99", NULL},
{ "eviction_trigger", "int", "min=10,max=99", NULL},
{ "extensions", "list", NULL, NULL},
+ { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL},
{ "hazard_max", "int", "min=15", NULL},
{ "logging", "boolean", NULL, NULL},
{ "lsm_merge", "boolean", NULL, NULL},
@@ -391,8 +392,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"buffer_alignment=-1,cache_size=100MB,"
"checkpoint=(name=\"WiredTigerCheckpoint\",wait=0),create=0,"
"direct_io=,error_prefix=,eviction_dirty_target=80,eviction_target=80"
- ",eviction_trigger=95,extensions=,hazard_max=1000,logging=0,"
- "lsm_merge=,mmap=,multiprocess=0,session_max=50,"
+ ",eviction_trigger=95,extensions=,file_extend=,hazard_max=1000,"
+ "logging=0,lsm_merge=,mmap=,multiprocess=0,session_max=50,"
"shared_cache=(chunk=10MB,name=pool,reserve=0,size=500MB),"
"statistics=0,statistics_log=(clear=,path=\"WiredTigerStat.%H\","
"sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),sync=,transactional=,"
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index 25d6903374c..5ec60c6a524 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -850,9 +850,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
static const struct {
const char *name;
uint32_t flag;
- } *ft, directio_types[] = {
- { "data", WT_DIRECTIO_DATA },
- { "log", WT_DIRECTIO_LOG },
+ } *ft, file_types[] = {
+ { "data", WT_FILE_TYPE_DATA },
+ { "log", WT_FILE_TYPE_LOG },
{ NULL, 0 }
};
WT_CONFIG subconfig;
@@ -949,10 +949,10 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
#endif
/*
- * Configuration: direct_io, mmap, statistics.
+ * Configuration: direct_io, file_extend, mmap, statistics.
*/
WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval));
- for (ft = directio_types; ft->name != NULL; ft++) {
+ for (ft = file_types; ft->name != NULL; ft++) {
ret = __wt_config_subgets(session, &cval, ft->name, &sval);
if (ret == 0) {
if (sval.val)
@@ -960,6 +960,23 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
} else if (ret != WT_NOTFOUND)
goto err;
}
+
+ WT_ERR(__wt_config_gets(session, cfg, "file_extend", &cval));
+ for (ft = file_types; ft->name != NULL; ft++) {
+ ret = __wt_config_subgets(session, &cval, ft->name, &sval);
+ if (ret == 0) {
+ switch (ft->flag) {
+ case WT_FILE_TYPE_DATA:
+ conn->data_extend_len = sval.val;
+ break;
+ case WT_FILE_TYPE_LOG:
+ conn->log_extend_len = sval.val;
+ break;
+ }
+ } else if (ret != WT_NOTFOUND)
+ goto err;
+ }
+
WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval));
conn->mmap = cval.val == 0 ? 0 : 1;
WT_ERR(__wt_config_gets(session, cfg, "statistics", &cval));
diff --git a/src/docs/tuning.dox b/src/docs/tuning.dox
index 788fe7b5945..7b89316a808 100644
--- a/src/docs/tuning.dox
+++ b/src/docs/tuning.dox
@@ -161,6 +161,15 @@ An example of configuring page sizes:
@snippet ex_file.c file create
+@section tuning_system_file_growth File growth
+
+It's faster on some filesystems to grow a file in chunks rather than to
+extend it a block at a time as new blocks are written. By configuring
+the wiredtiger_open function's \c file_extend value, applications can
+grow files ahead of the blocks being written.
+
+@snippet ex_all.c Configure file_extend
+
@section tuning_system_buffer_cache System buffer cache
@subsection tuning_system_buffer_cache_direct_io Direct I/O
diff --git a/src/include/connection.h b/src/include/connection.h
index 52f34cc105e..78d9431fbbe 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -185,7 +185,10 @@ struct __wt_connection_impl {
uint32_t schema_gen; /* Schema generation number */
- uint32_t direct_io; /* O_DIRECT configuration */
+ off_t data_extend_len; /* file_extend data length */
+ off_t log_extend_len; /* file_extend log length */
+
+ uint32_t direct_io; /* O_DIRECT file type flags */
int mmap; /* mmap configuration */
uint32_t verbose;
diff --git a/src/include/flags.h b/src/include/flags.h
index 3f4658b56b1..e4a7bceda20 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -10,9 +10,9 @@
#define WT_CONN_SERVER_RUN 0x00000004
#define WT_CONN_SYNC 0x00000002
#define WT_CONN_TRANSACTIONAL 0x00000001
-#define WT_DIRECTIO_DATA 0x00000002
-#define WT_DIRECTIO_LOG 0x00000001
#define WT_EVICTION_SERVER_LOCKED 0x00000004
+#define WT_FILE_TYPE_DATA 0x00000002
+#define WT_FILE_TYPE_LOG 0x00000001
#define WT_SESSION_INTERNAL 0x00000010
#define WT_SESSION_NO_CACHE 0x00000008
#define WT_SESSION_NO_CACHE_CHECK 0x00000004
diff --git a/src/include/os.h b/src/include/os.h
index 46f20e0b1e2..d4035575916 100644
--- a/src/include/os.h
+++ b/src/include/os.h
@@ -41,6 +41,8 @@ struct __wt_fh {
int fd; /* POSIX file handle */
off_t size; /* File size */
+ off_t extend_size; /* File extended size */
+ off_t extend_len; /* File extend chunk size */
int direct_io; /* O_DIRECT configured */
};
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 7ece5458d7c..412e08f1d9a 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -1357,6 +1357,11 @@ struct __wt_connection {
* WT_CONNECTION::load_extension. For example\,
* <code>extensions=(/path/ext.so={entry=my_entry})</code>., a list of strings;
* default empty.}
+ * @config{file_extend, file extension configuration. If set\, extend files of
+ * the set type in allocations of the set size\, instead of a block at a time as
+ * each new block is written. For example\,
+ * <code>file_extend=(data=16MB)</code>., a list\, with values chosen from the
+ * following options: \c "data"\, \c "log"; default empty.}
* @config{hazard_max, maximum number of simultaneous hazard pointers per
* session handle., an integer greater than or equal to 15; default \c 1000.}
* @config{logging, enable logging., a boolean flag; default \c false.}
diff --git a/src/os_posix/os_ftruncate.c b/src/os_posix/os_ftruncate.c
index 376dd388fe4..e1610bf2969 100644
--- a/src/os_posix/os_ftruncate.c
+++ b/src/os_posix/os_ftruncate.c
@@ -18,7 +18,7 @@ __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, off_t len)
WT_SYSCALL_RETRY(ftruncate(fh->fd, len), ret);
if (ret == 0) {
- fh->size = len;
+ fh->size = fh->extend_size = len;
return (0);
}
diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c
index e73c4cde600..c55cba4340a 100644
--- a/src/os_posix/os_open.c
+++ b/src/os_posix/os_open.c
@@ -114,12 +114,11 @@ __wt_open(WT_SESSION_IMPL *session,
mode = 0;
#ifdef O_DIRECT
- if (is_tree && FLD_ISSET(conn->direct_io, WT_DIRECTIO_DATA)) {
+ if (is_tree && FLD_ISSET(conn->direct_io, WT_FILE_TYPE_DATA)) {
f |= O_DIRECT;
direct_io = 1;
}
#endif
-
WT_SYSCALL_RETRY(((fd = open(path, f, mode)) == -1 ? 1 : 0), ret);
if (ret != 0)
WT_ERR_MSG(session, ret,
@@ -161,6 +160,10 @@ __wt_open(WT_SESSION_IMPL *session,
/* Set the file's size. */
WT_ERR(__wt_filesize(session, fh, &fh->size));
+ /* Configure file extension. */
+ if (is_tree)
+ fh->extend_len = conn->data_extend_len;
+
/* Link onto the environment's list of files. */
__wt_spin_lock(session, &conn->fh_lock);
TAILQ_INSERT_TAIL(&conn->fhqh, fh, q);
diff --git a/test/format/config.h b/test/format/config.h
index 8a282642cb0..ef5b04363a5 100644
--- a/test/format/config.h
+++ b/test/format/config.h
@@ -82,6 +82,10 @@ static CONFIG c[] = {
"type of compression (none | bzip | lzo | raw | snappy)",
0, C_IGNORE|C_STRING, 1, 5, NULL, &g.c_compression },
+ { "data_extend",
+ "if data files are extended", /* 5% */
+ 0, C_BOOL, 5, 0, &g.c_data_extend, NULL },
+
{ "data_source",
"type of data source to create (file | kvs | lsm | table)",
0, C_IGNORE | C_STRING, 0, 0, NULL, &g.c_data_source },
diff --git a/test/format/format.h b/test/format/format.h
index 331b856ee3d..1871e425497 100644
--- a/test/format/format.h
+++ b/test/format/format.h
@@ -122,6 +122,7 @@ typedef struct {
u_int c_cache;
char *c_compression;
char *c_config_open;
+ u_int c_data_extend;
char *c_data_source;
u_int c_delete_pct;
u_int c_dictionary;
diff --git a/test/format/wts.c b/test/format/wts.c
index 78a83b0738e..03ea7de4190 100644
--- a/test/format/wts.c
+++ b/test/format/wts.c
@@ -75,13 +75,16 @@ wts_open(void)
* override the standard configuration.
*/
snprintf(config, sizeof(config),
- "create,sync=false,cache_size=%" PRIu32 "MB,"
+ "create,"
+ "sync=false,cache_size=%" PRIu32 "MB,"
"error_prefix=\"%s\","
+ "%s,"
"extensions="
"[\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"],"
"%s,%s",
g.c_cache,
g.progname,
+ g.c_data_extend ? "file_extend=(data=8MB)," : "",
REVERSE_PATH,
access(BZIP_PATH, R_OK) == 0 ? BZIP_PATH : "",
access(LZO_PATH, R_OK) == 0 ? LZO_PATH : "",