diff options
author | Keith Bostic <keith@wiredtiger.com> | 2013-05-14 05:50:48 -0700 |
---|---|---|
committer | Keith Bostic <keith@wiredtiger.com> | 2013-05-14 05:50:48 -0700 |
commit | 3e85a5919f8fef654ea13a62f697dce55e959a13 (patch) | |
tree | 3488de99afa1fa9748b7645ee566effb18fd360a | |
parent | dded2972d38359db5ce403d47ea841da90856586 (diff) | |
parent | 73a0c8a509188e97cf1eaf0155448091716499de (diff) | |
download | mongo-3e85a5919f8fef654ea13a62f697dce55e959a13.tar.gz |
Merge pull request #534 from wiredtiger/file-extend
file extension
-rw-r--r-- | build_posix/configure.ac.in | 5 | ||||
-rw-r--r-- | dist/api_data.py | 6 | ||||
-rw-r--r-- | dist/flags.py | 6 | ||||
-rw-r--r-- | dist/s_string.ok | 1 | ||||
-rw-r--r-- | examples/c/ex_all.c | 7 | ||||
-rw-r--r-- | src/block/block_ckpt.c | 36 | ||||
-rw-r--r-- | src/block/block_write.c | 37 | ||||
-rw-r--r-- | src/config/config_def.c | 5 | ||||
-rw-r--r-- | src/conn/conn_api.c | 27 | ||||
-rw-r--r-- | src/docs/tuning.dox | 9 | ||||
-rw-r--r-- | src/include/connection.h | 5 | ||||
-rw-r--r-- | src/include/flags.h | 4 | ||||
-rw-r--r-- | src/include/os.h | 2 | ||||
-rw-r--r-- | src/include/wiredtiger.in | 5 | ||||
-rw-r--r-- | src/os_posix/os_ftruncate.c | 2 | ||||
-rw-r--r-- | src/os_posix/os_open.c | 7 | ||||
-rw-r--r-- | test/format/config.h | 4 | ||||
-rw-r--r-- | test/format/format.h | 1 | ||||
-rw-r--r-- | test/format/wts.c | 5 |
19 files changed, 134 insertions, 40 deletions
diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in index 1b4afeb30ec..cab117b5c56 100644 --- a/build_posix/configure.ac.in +++ b/build_posix/configure.ac.in @@ -83,9 +83,8 @@ AC_CHECK_LIB(pthread, pthread_create) AC_CHECK_LIB(dl, dlopen) AC_CHECK_LIB(rt, sched_yield) AC_CHECK_FUNCS([\ - clock_gettime fcntl gettimeofday \ - posix_fadvise posix_madvise posix_memalign\ - strtouq sync_file_range]) + clock_gettime fcntl ftruncate gettimeofday posix_fadvise\ + posix_fallocate posix_madvise posix_memalign strtouq sync_file_range]) AC_SYS_LARGEFILE AC_MSG_CHECKING([if the adaptive mutex type is available]) diff --git a/dist/api_data.py b/dist/api_data.py index c835faa52ee..b87ba07719d 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -560,6 +560,12 @@ methods = { WT_CONNECTION::load_extension. For example, <code>extensions=(/path/ext.so={entry=my_entry})</code>''', type='list'), + Config('file_extend', '', r''' + file extension configuration. If set, extend files of the set + type in allocations of the set size, instead of a block at a + time as each new block is written. For example, + <code>file_extend=(data=16MB)</code>''', + type='list', choices=['data', 'log']), Config('hazard_max', '1000', r''' maximum number of simultaneous hazard pointers per session handle''', diff --git a/dist/flags.py b/dist/flags.py index 607c7f13aa9..1c1dbc3564d 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -15,9 +15,9 @@ flags = { 'SYNC_DISCARD_NOWRITE', 'SYNC_WRITE_LEAVES', ], - 'direct_io' : [ - 'DIRECTIO_DATA', - 'DIRECTIO_LOG' + 'file_types' : [ + 'FILE_TYPE_DATA', + 'FILE_TYPE_LOG' ], 'rec_write' : [ 'EVICTION_SERVER_LOCKED', diff --git a/dist/s_string.ok b/dist/s_string.ok index 39537b8f597..57a7e09f1cb 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -402,6 +402,7 @@ extern extlist extlists fadvise +fallocate fblocks fclose fcntl diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c index 4eb3430f79e..edf2437d167 100644 --- a/examples/c/ex_all.c +++ b/examples/c/ex_all.c @@ -1013,6 +1013,13 @@ main(void) (void)conn->close(conn, NULL); #endif + /*! [Configure file_extend] */ + ret = wiredtiger_open(home, NULL, + "create,file_extend=(type=[data],size=16MB)", &conn); + /*! [Configure file_extend] */ + if (ret == 0) + (void)conn->close(conn, NULL); + /*! [Statistics configuration] */ ret = wiredtiger_open(home, NULL, "create,statistics=true", &conn); /*! [Statistics configuration] */ diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c index b6e70408564..8fa36075887 100644 --- a/src/block/block_ckpt.c +++ b/src/block/block_ckpt.c @@ -117,21 +117,20 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ERR(__wt_block_extlist_read_avail( session, block, &ci->avail, ci->file_size)); - /* - * If the checkpoint can be written, that means anything written - * after the checkpoint is no longer interesting, truncate the - * file. Don't bother checking the avail list for a block at - * the end of the file, that was done when the checkpoint was - * first written (re-writing the checkpoint might possibly make - * it relevant here, but it's unlikely enough I don't bother). - */ - if (!checkpoint) { - WT_VERBOSE_ERR(session, ckpt, - "truncate file to %" PRIuMAX, - (uintmax_t)ci->file_size); - WT_ERR( - __wt_ftruncate(session, block->fh, ci->file_size)); - } + } + + /* + * If the checkpoint can be written, that means anything written after + * the checkpoint is no longer interesting, truncate the file. Don't + * bother checking the avail list for a block at the end of the file, + * that was done when the checkpoint was first written (re-writing the + * checkpoint might possibly make it relevant here, but it's unlikely + * enough I don't bother). + */ + if (!checkpoint) { + WT_VERBOSE_ERR(session, ckpt, + "truncate file to %" PRIuMAX, (uintmax_t)ci->file_size); + WT_ERR(__wt_ftruncate(session, block->fh, ci->file_size)); } if (0) { @@ -161,6 +160,11 @@ __wt_block_checkpoint_unload( if (block->verify) WT_TRET(__wt_verify_ckpt_unload(session, block)); + /* If it's the live system, truncate to discard any extended blocks. */ + if (!checkpoint) + WT_TRET(__wt_ftruncate(session, block->fh, block->fh->size)); + + /* If it's the live system, discard the active extent lists. */ if (!checkpoint) __wt_block_ckpt_destroy(session, &block->live); @@ -615,7 +619,7 @@ __ckpt_update( * if there ever is, this will need to be fixed. */ if (is_live) - WT_RET(__wt_filesize(session, block->fh, &ci->file_size)); + ci->file_size = block->fh->size; /* Set the checkpoint size for the live system. */ if (is_live) diff --git a/src/block/block_write.c b/src/block/block_write.c index 13cb0f25f0e..9cc4eaa289d 100644 --- a/src/block/block_write.c +++ b/src/block/block_write.c @@ -69,10 +69,12 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, { WT_BLOCK_HEADER *blk; WT_DECL_RET; + WT_FH *fh; off_t offset; uint32_t align_size; blk = WT_BLOCK_HEADER_REF(buf->mem); + fh = block->fh; /* Buffers should be aligned for writing. */ if (!F_ISSET(buf, WT_ITEM_ALIGNED)) { @@ -131,8 +133,35 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); - if ((ret = __wt_write( - session, block->fh, offset, align_size, buf->mem)) != 0) { +#if defined(HAVE_POSIX_FALLOCATE) || defined(HAVE_FTRUNCATE) + /* + * Extend the file in chunks. We aren't holding a lock and we'd prefer + * to limit the number of threads extending the file at the same time, + * so choose the one thread that's crossing the extended boundary. We + * don't extend newly created files, and it's theoretically possible we + * might wait so long our extension of the file is passed by another + * thread writing single blocks, that's why there's a check in case the + * extended file size becomes too small: if the file size catches up, + * every thread will try to extend it. + */ + if (fh->extend_len != 0 && + (fh->extend_size <= fh->size || + (offset + fh->extend_len <= fh->extend_size && + offset + fh->extend_len + align_size >= fh->extend_size))) { + fh->extend_size = offset + fh->extend_len * 2; +#if defined(HAVE_POSIX_FALLOCATE) + if ((ret = + posix_fallocate(fh->fd, offset, fh->extend_len * 2)) != 0) + WT_RET_MSG( + session, ret, "%s: posix_fallocate", fh->name); +#elif defined(HAVE_FTRUNCATE) + if ((ret = ftruncate(fh->fd, fh->extend_size)) != 0) + WT_RET_MSG(session, ret, "%s: ftruncate", fh->name); +#endif + } +#endif + if ((ret = + __wt_write(session, fh, offset, align_size, buf->mem)) != 0) { if (!locked) __wt_spin_lock(session, &block->live_lock); WT_TRET( @@ -150,7 +179,7 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, if (block->os_cache_dirty_max != 0 && (block->os_cache_dirty += align_size) > block->os_cache_dirty_max) { block->os_cache_dirty = 0; - if ((ret = sync_file_range(block->fh->fd, + if ((ret = sync_file_range(fh->fd, (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) != 0) WT_RET_MSG( session, ret, "%s: sync_file_range", block->name); @@ -161,7 +190,7 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, if (block->os_cache_max != 0 && (block->os_cache += align_size) > block->os_cache_max) { block->os_cache = 0; - if ((ret = posix_fadvise(block->fh->fd, + if ((ret = posix_fadvise(fh->fd, (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0) WT_RET_MSG( session, ret, "%s: posix_fadvise", block->name); diff --git a/src/config/config_def.c b/src/config/config_def.c index d77c76ff56e..c01894400c4 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -229,6 +229,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "eviction_target", "int", "min=10,max=99", NULL}, { "eviction_trigger", "int", "min=10,max=99", NULL}, { "extensions", "list", NULL, NULL}, + { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL}, { "hazard_max", "int", "min=15", NULL}, { "logging", "boolean", NULL, NULL}, { "lsm_merge", "boolean", NULL, NULL}, @@ -391,8 +392,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { "buffer_alignment=-1,cache_size=100MB," "checkpoint=(name=\"WiredTigerCheckpoint\",wait=0),create=0," "direct_io=,error_prefix=,eviction_dirty_target=80,eviction_target=80" - ",eviction_trigger=95,extensions=,hazard_max=1000,logging=0," - "lsm_merge=,mmap=,multiprocess=0,session_max=50," + ",eviction_trigger=95,extensions=,file_extend=,hazard_max=1000," + "logging=0,lsm_merge=,mmap=,multiprocess=0,session_max=50," "shared_cache=(chunk=10MB,name=pool,reserve=0,size=500MB)," "statistics=0,statistics_log=(clear=,path=\"WiredTigerStat.%H\"," "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),sync=,transactional=," diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 25d6903374c..5ec60c6a524 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -850,9 +850,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, static const struct { const char *name; uint32_t flag; - } *ft, directio_types[] = { - { "data", WT_DIRECTIO_DATA }, - { "log", WT_DIRECTIO_LOG }, + } *ft, file_types[] = { + { "data", WT_FILE_TYPE_DATA }, + { "log", WT_FILE_TYPE_LOG }, { NULL, 0 } }; WT_CONFIG subconfig; @@ -949,10 +949,10 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, #endif /* - * Configuration: direct_io, mmap, statistics. + * Configuration: direct_io, file_extend, mmap, statistics. */ WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval)); - for (ft = directio_types; ft->name != NULL; ft++) { + for (ft = file_types; ft->name != NULL; ft++) { ret = __wt_config_subgets(session, &cval, ft->name, &sval); if (ret == 0) { if (sval.val) @@ -960,6 +960,23 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, } else if (ret != WT_NOTFOUND) goto err; } + + WT_ERR(__wt_config_gets(session, cfg, "file_extend", &cval)); + for (ft = file_types; ft->name != NULL; ft++) { + ret = __wt_config_subgets(session, &cval, ft->name, &sval); + if (ret == 0) { + switch (ft->flag) { + case WT_FILE_TYPE_DATA: + conn->data_extend_len = sval.val; + break; + case WT_FILE_TYPE_LOG: + conn->log_extend_len = sval.val; + break; + } + } else if (ret != WT_NOTFOUND) + goto err; + } + WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval)); conn->mmap = cval.val == 0 ? 0 : 1; WT_ERR(__wt_config_gets(session, cfg, "statistics", &cval)); diff --git a/src/docs/tuning.dox b/src/docs/tuning.dox index 788fe7b5945..7b89316a808 100644 --- a/src/docs/tuning.dox +++ b/src/docs/tuning.dox @@ -161,6 +161,15 @@ An example of configuring page sizes: @snippet ex_file.c file create +@section tuning_system_file_growth File growth + +It's faster on some filesystems to grow a file in chunks rather than to +extend it a block at a time as new blocks are written. By configuring +the wiredtiger_open functions \c file_extend value, applications can +grow files ahead of the blocks being written. + +@snippet ex_all.c Configure file_extend + @section tuning_system_buffer_cache System buffer cache @subsection tuning_system_buffer_cache_direct_io Direct I/O diff --git a/src/include/connection.h b/src/include/connection.h index 52f34cc105e..78d9431fbbe 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -185,7 +185,10 @@ struct __wt_connection_impl { uint32_t schema_gen; /* Schema generation number */ - uint32_t direct_io; /* O_DIRECT configuration */ + off_t data_extend_len; /* file_extend data length */ + off_t log_extend_len; /* file_extend log length */ + + uint32_t direct_io; /* O_DIRECT file type flags */ int mmap; /* mmap configuration */ uint32_t verbose; diff --git a/src/include/flags.h b/src/include/flags.h index 3f4658b56b1..e4a7bceda20 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -10,9 +10,9 @@ #define WT_CONN_SERVER_RUN 0x00000004 #define WT_CONN_SYNC 0x00000002 #define WT_CONN_TRANSACTIONAL 0x00000001 -#define WT_DIRECTIO_DATA 0x00000002 -#define WT_DIRECTIO_LOG 0x00000001 #define WT_EVICTION_SERVER_LOCKED 0x00000004 +#define WT_FILE_TYPE_DATA 0x00000002 +#define WT_FILE_TYPE_LOG 0x00000001 #define WT_SESSION_INTERNAL 0x00000010 #define WT_SESSION_NO_CACHE 0x00000008 #define WT_SESSION_NO_CACHE_CHECK 0x00000004 diff --git a/src/include/os.h b/src/include/os.h index 46f20e0b1e2..d4035575916 100644 --- a/src/include/os.h +++ b/src/include/os.h @@ -41,6 +41,8 @@ struct __wt_fh { int fd; /* POSIX file handle */ off_t size; /* File size */ + off_t extend_size; /* File extended size */ + off_t extend_len; /* File extend chunk size */ int direct_io; /* O_DIRECT configured */ }; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 7ece5458d7c..412e08f1d9a 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -1357,6 +1357,11 @@ struct __wt_connection { * WT_CONNECTION::load_extension. For example\, * <code>extensions=(/path/ext.so={entry=my_entry})</code>., a list of strings; * default empty.} + * @config{file_extend, file extension configuration. If set\, extend files of + * the set type in allocations of the set size\, instead of a block at a time as + * each new block is written. For example\, + * <code>file_extend=(data=16MB)</code>., a list\, with values chosen from the + * following options: \c "data"\, \c "log"; default empty.} * @config{hazard_max, maximum number of simultaneous hazard pointers per * session handle., an integer greater than or equal to 15; default \c 1000.} * @config{logging, enable logging., a boolean flag; default \c false.} diff --git a/src/os_posix/os_ftruncate.c b/src/os_posix/os_ftruncate.c index 376dd388fe4..e1610bf2969 100644 --- a/src/os_posix/os_ftruncate.c +++ b/src/os_posix/os_ftruncate.c @@ -18,7 +18,7 @@ __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, off_t len) WT_SYSCALL_RETRY(ftruncate(fh->fd, len), ret); if (ret == 0) { - fh->size = len; + fh->size = fh->extend_size = len; return (0); } diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c index e73c4cde600..c55cba4340a 100644 --- a/src/os_posix/os_open.c +++ b/src/os_posix/os_open.c @@ -114,12 +114,11 @@ __wt_open(WT_SESSION_IMPL *session, mode = 0; #ifdef O_DIRECT - if (is_tree && FLD_ISSET(conn->direct_io, WT_DIRECTIO_DATA)) { + if (is_tree && FLD_ISSET(conn->direct_io, WT_FILE_TYPE_DATA)) { f |= O_DIRECT; direct_io = 1; } #endif - WT_SYSCALL_RETRY(((fd = open(path, f, mode)) == -1 ? 1 : 0), ret); if (ret != 0) WT_ERR_MSG(session, ret, @@ -161,6 +160,10 @@ __wt_open(WT_SESSION_IMPL *session, /* Set the file's size. */ WT_ERR(__wt_filesize(session, fh, &fh->size)); + /* Configure file extension. */ + if (is_tree) + fh->extend_len = conn->data_extend_len; + /* Link onto the environment's list of files. */ __wt_spin_lock(session, &conn->fh_lock); TAILQ_INSERT_TAIL(&conn->fhqh, fh, q); diff --git a/test/format/config.h b/test/format/config.h index 8a282642cb0..ef5b04363a5 100644 --- a/test/format/config.h +++ b/test/format/config.h @@ -82,6 +82,10 @@ static CONFIG c[] = { "type of compression (none | bzip | lzo | raw | snappy)", 0, C_IGNORE|C_STRING, 1, 5, NULL, &g.c_compression }, + { "data_extend", + "if data files are extended", /* 5% */ + 0, C_BOOL, 5, 0, &g.c_data_extend, NULL }, + { "data_source", "type of data source to create (file | kvs | lsm | table)", 0, C_IGNORE | C_STRING, 0, 0, NULL, &g.c_data_source }, diff --git a/test/format/format.h b/test/format/format.h index 331b856ee3d..1871e425497 100644 --- a/test/format/format.h +++ b/test/format/format.h @@ -122,6 +122,7 @@ typedef struct { u_int c_cache; char *c_compression; char *c_config_open; + u_int c_data_extend; char *c_data_source; u_int c_delete_pct; u_int c_dictionary; diff --git a/test/format/wts.c b/test/format/wts.c index 78a83b0738e..03ea7de4190 100644 --- a/test/format/wts.c +++ b/test/format/wts.c @@ -75,13 +75,16 @@ wts_open(void) * override the standard configuration. */ snprintf(config, sizeof(config), - "create,sync=false,cache_size=%" PRIu32 "MB," + "create," + "sync=false,cache_size=%" PRIu32 "MB," "error_prefix=\"%s\"," + "%s," "extensions=" "[\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"]," "%s,%s", g.c_cache, g.progname, + g.c_data_extend ? "file_extend=(data=8MB)," : "", REVERSE_PATH, access(BZIP_PATH, R_OK) == 0 ? BZIP_PATH : "", access(LZO_PATH, R_OK) == 0 ? LZO_PATH : "", |