diff options
author | Michael Cahill <mjc@wiredtiger.com> | 2013-03-07 16:05:29 -0800 |
---|---|---|
committer | Michael Cahill <mjc@wiredtiger.com> | 2013-03-07 16:05:29 -0800 |
commit | 36a5d1b4f0fde2f69e51bd9ed0e90976823b175f (patch) | |
tree | 6caca3d118e7c65f4433af47b696adc7d35a5469 /src | |
parent | bfb576d22de55fdad4890a7da6a31a86f2aa9156 (diff) | |
parent | 3fae7e5e6a54d591ef510152e8f1681419625be8 (diff) | |
download | mongo-36a5d1b4f0fde2f69e51bd9ed0e90976823b175f.tar.gz |
Merge pull request #472 from wiredtiger/fadvise
Add configurations to minimize the impact of reads and writes on the filesystem cache.
Diffstat (limited to 'src')
-rw-r--r-- | src/block/block_map.c | 63 | ||||
-rw-r--r-- | src/block/block_mgr.c | 17 | ||||
-rw-r--r-- | src/block/block_open.c | 36 | ||||
-rw-r--r-- | src/block/block_read.c | 17 | ||||
-rw-r--r-- | src/block/block_write.c | 25 | ||||
-rw-r--r-- | src/config/config_def.c | 13 | ||||
-rw-r--r-- | src/include/block.h | 7 | ||||
-rw-r--r-- | src/include/extern.h | 8 | ||||
-rw-r--r-- | src/include/os.h | 8 | ||||
-rw-r--r-- | src/include/wiredtiger.in | 13 | ||||
-rw-r--r-- | src/os_posix/os_fsync.c | 7 | ||||
-rw-r--r-- | src/os_posix/os_open.c | 5 | ||||
-rw-r--r-- | src/os_posix/os_rw.c | 12 |
13 files changed, 180 insertions, 51 deletions
diff --git a/src/block/block_map.c b/src/block/block_map.c new file mode 100644 index 00000000000..93dcc4bec6c --- /dev/null +++ b/src/block/block_map.c @@ -0,0 +1,63 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_block_map -- + * Map a segment of the file in, if possible. + */ +int +__wt_block_map( + WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp) +{ + *(void **)mapp = NULL; + *maplenp = 0; + + /* + * Turn off mapping when verifying the file, because we can't perform + * checksum validation of mapped segments, and verify has to checksum + * pages. + */ + if (block->verify) + return (0); + + /* + * Turn off mapping when direct I/O is configured for the file, the + * Linux open(2) documentation says applications should avoid mixing + * mmap(2) of files with direct I/O to the same files. + */ + if (block->fh->direct_io) + return (0); + + /* + * Turn off mapping if the application configured a cache size maximum, + * we can't control how much of the cache size we use in that case. + */ + if (block->os_cache_max != 0) + return (0); + + /* + * Map the file into memory. + * Ignore errors, we'll read the file through the cache if map fails. + */ + (void)__wt_mmap(session, block->fh, mapp, maplenp); + + return (0); +} + +/* + * __wt_block_unmap -- + * Unmap any mapped-in segment of the file. + */ +int +__wt_block_unmap( + WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen) +{ + /* Unmap the file from memory. */ + return (__wt_munmap(session, block->fh, map, maplen)); +} diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c index 46535f501af..4ad65247825 100644 --- a/src/block/block_mgr.c +++ b/src/block/block_mgr.c @@ -96,17 +96,12 @@ __bm_checkpoint_load(WT_BM *bm, WT_SESSION_IMPL *session, if (checkpoint) { /* - * Read-only objects are mapped into memory instead of being - * read into cache buffers. Ignore errors, with no mapping - * we'll read into the cache. - * - * Turn off mapping when verifying the file, because we can't - * perform checksum validation of mapped segments, and verify - * has to checksum pages. + * Read-only objects are optionally mapped into memory instead + * of being read into cache buffers. */ - if (conn->mmap && !bm->block->verify) - (void)__wt_mmap( - session, bm->block->fh, &bm->map, &bm->maplen); + if (conn->mmap) + WT_RET(__wt_block_map( + session, bm->block, &bm->map, &bm->maplen)); /* * If this handle is for a checkpoint, that is, read-only, there @@ -142,7 +137,7 @@ __bm_checkpoint_unload(WT_BM *bm, WT_SESSION_IMPL *session) /* Unmap any mapped segment. */ if (bm->map != NULL) WT_TRET( - __wt_munmap(session, bm->block->fh, bm->map, bm->maplen)); + __wt_block_unmap(session, bm->block, bm->map, bm->maplen)); /* Unload the checkpoint. */ WT_TRET(__wt_block_checkpoint_unload(session, bm->block, !bm->is_live)); diff --git a/src/block/block_open.c b/src/block/block_open.c index 4b5601c75a6..d47a3962ca3 100644 --- a/src/block/block_open.c +++ b/src/block/block_open.c @@ -123,17 +123,43 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, WT_ERR(__wt_strdup(session, filename, &block->name)); - /* Get the allocation size. */ + /* Configuration: allocation size. */ WT_ERR(__wt_config_getones(session, config, "allocation_size", &cval)); block->allocsize = (uint32_t)cval.val; + /* Configuration: optional OS buffer cache maximum size. */ + WT_ERR(__wt_config_getones(session, config, "os_cache_max", &cval)); + block->os_cache_max = cval.val; +#ifdef HAVE_POSIX_FADVISE + if (conn->direct_io && block->os_cache_max) + WT_ERR_MSG(session, EINVAL, + "os_cache_max not supported in combination with direct_io"); +#else + if (block->os_cache_max) + WT_ERR_MSG(session, EINVAL, + "os_cache_max not supported if posix_fadvise not " + "available"); +#endif + + /* Configuration: optional immediate write scheduling flag. */ + WT_ERR( + __wt_config_getones(session, config, "os_cache_dirty_max", &cval)); + block->os_cache_dirty_max = cval.val; +#ifdef HAVE_SYNC_FILE_RANGE + if (conn->direct_io && block->os_cache_dirty_max) + WT_ERR_MSG(session, EINVAL, + "os_cache_dirty_max not supported in combination with " + "direct_io"); +#else + if (block->os_cache_dirty_max) + WT_ERR_MSG(session, EINVAL, + "os_cache_dirty_max not supported if sync_file_range not " + "available"); +#endif + /* Open the underlying file handle. */ WT_ERR(__wt_open(session, filename, 0, 0, 1, &block->fh)); - /* Get the OS buffer cache maximum size. */ - WT_ERR(__wt_config_getones(session, config, "os_cache_max", &cval)); - block->fh->os_cache_max = cval.val; - /* Initialize the live checkpoint's lock. */ __wt_spin_init(session, &block->live_lock); diff --git a/src/block/block_read.c b/src/block/block_read.c index 4a5ba4c4478..20bd7c17b31 100644 --- a/src/block/block_read.c +++ b/src/block/block_read.c @@ -55,7 +55,22 @@ __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, } /* Read the block. */ - return (__wt_block_read_off(session, block, buf, offset, size, cksum)); + WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum)); + +#ifdef HAVE_POSIX_FADVISE + /* Optionally discard blocks from the system's buffer cache. */ + if (block->os_cache_max != 0 && + (block->os_cache += size) > block->os_cache_max) { + WT_DECL_RET; + + block->os_cache = 0; + if ((ret = posix_fadvise(block->fh->fd, + (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0) + WT_RET_MSG( + session, ret, "%s: posix_fadvise", block->name); + } +#endif + return (0); } /* diff --git a/src/block/block_write.c b/src/block/block_write.c index ce07bd6ae57..82d8cb490b5 100644 --- a/src/block/block_write.c +++ b/src/block/block_write.c @@ -141,6 +141,31 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_RET(ret); } +#ifdef HAVE_SYNC_FILE_RANGE + /* + * Optionally schedule writes for dirty pages in the system buffer + * cache. + */ + if (block->os_cache_dirty_max != 0 && + (block->os_cache_dirty += align_size) > block->os_cache_dirty_max) { + block->os_cache_dirty = 0; + if ((ret = sync_file_range(block->fh->fd, + (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) != 0) + WT_RET_MSG( + session, ret, "%s: sync_file_range", block->name); + } +#endif +#ifdef HAVE_POSIX_FADVISE + /* Optionally discard blocks from the system buffer cache. */ + if (block->os_cache_max != 0 && + (block->os_cache += align_size) > block->os_cache_max) { + block->os_cache = 0; + if ((ret = posix_fadvise(block->fh->fd, + (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0) + WT_RET_MSG( + session, ret, "%s: posix_fadvise", block->name); + } +#endif WT_CSTAT_INCR(session, block_write); WT_CSTAT_INCRV(session, block_byte_write, align_size); diff --git a/src/config/config_def.c b/src/config/config_def.c index 2a99402d0e7..9eb023e9928 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -133,9 +133,9 @@ __wt_confdfl_file_meta = ",leaf_item_max=0,leaf_page_max=1MB,lsm_bloom=,lsm_bloom_bit_count=8," "lsm_bloom_config=,lsm_bloom_hash_count=4,lsm_bloom_newest=0," "lsm_bloom_oldest=0,lsm_chunk_size=2MB,lsm_merge_max=15," - "lsm_merge_threads=1,memory_page_max=5MB,os_cache_max=0," - "prefix_compression=,split_pct=75,value_format=u,version=(major=0," - "minor=0)"; + "lsm_merge_threads=1,memory_page_max=5MB,os_cache_dirty_max=0," + "os_cache_max=0,prefix_compression=,split_pct=75,value_format=u," + "version=(major=0,minor=0)"; WT_CONFIG_CHECK __wt_confchk_file_meta[] = { @@ -169,6 +169,7 @@ __wt_confchk_file_meta[] = { { "lsm_merge_max", "int", "min=2,max=100", NULL}, { "lsm_merge_threads", "int", "min=1,max=10", NULL}, { "memory_page_max", "int", "min=512B,max=10TB", NULL}, + { "os_cache_dirty_max", "int", "min=0", NULL}, { "os_cache_max", "int", "min=0", NULL}, { "prefix_compression", "boolean", NULL, NULL}, { "split_pct", "int", "min=25,max=100", NULL}, @@ -258,8 +259,9 @@ __wt_confdfl_session_create = ",leaf_item_max=0,leaf_page_max=1MB,lsm_bloom=,lsm_bloom_bit_count=8," "lsm_bloom_config=,lsm_bloom_hash_count=4,lsm_bloom_newest=0," "lsm_bloom_oldest=0,lsm_chunk_size=2MB,lsm_merge_max=15," - "lsm_merge_threads=1,memory_page_max=5MB,os_cache_max=0," - "prefix_compression=,source=,split_pct=75,type=file,value_format=u"; + "lsm_merge_threads=1,memory_page_max=5MB,os_cache_dirty_max=0," + "os_cache_max=0,prefix_compression=,source=,split_pct=75,type=file," + "value_format=u"; WT_CONFIG_CHECK __wt_confchk_session_create[] = { @@ -294,6 +296,7 @@ __wt_confchk_session_create[] = { { "lsm_merge_max", "int", "min=2,max=100", NULL}, { "lsm_merge_threads", "int", "min=1,max=10", NULL}, { "memory_page_max", "int", "min=512B,max=10TB", NULL}, + { "os_cache_dirty_max", "int", "min=0", NULL}, { "os_cache_max", "int", "min=0", NULL}, { "prefix_compression", "boolean", NULL, NULL}, { "source", "string", NULL, NULL}, diff --git a/src/include/block.h b/src/include/block.h index 743b4685e43..c234e980bbe 100644 --- a/src/include/block.h +++ b/src/include/block.h @@ -195,7 +195,12 @@ struct __wt_block { /* Configuration information, set when the file is opened. */ uint32_t allocsize; /* Allocation size */ - u_int block_header; /* Header length */ + u_int block_header; /* Header length */ + + int64_t os_cache; /* System buffer cache flush max */ + int64_t os_cache_max; + int64_t os_cache_dirty; /* System buffer cache write max */ + int64_t os_cache_dirty_max; /* * There is only a single checkpoint in a file that can be written. The diff --git a/src/include/extern.h b/src/include/extern.h index 111a5152318..bed015e316e 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -112,6 +112,14 @@ extern int __wt_block_extlist_init(WT_SESSION_IMPL *session, const char *name, const char *extname); extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el); +extern int __wt_block_map( WT_SESSION_IMPL *session, + WT_BLOCK *block, + void *mapp, + size_t *maplenp); +extern int __wt_block_unmap( WT_SESSION_IMPL *session, + WT_BLOCK *block, + void *map, + size_t maplen); extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, const char *config, diff --git a/src/include/os.h b/src/include/os.h index 7d4af7ea091..ad2932cb403 100644 --- a/src/include/os.h +++ b/src/include/os.h @@ -30,14 +30,12 @@ } while (0) struct __wt_fh { + u_int refcnt; /* Reference count */ TAILQ_ENTRY(__wt_fh) q; /* List of open handles */ - off_t file_size; /* File size */ - off_t io_size; /* Current amount of I/O */ - off_t os_cache_max; /* Max I/O before flushing */ - char *name; /* File name */ + off_t file_size; /* File size */ int fd; /* POSIX file handle */ - u_int refcnt; /* Reference count */ + int direct_io; /* O_DIRECT configured */ }; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 7aa4e3d80c3..2310d78dc1e 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -719,10 +719,15 @@ struct __wt_session { * adjusted to a lower bound of <code>50 * leaf_page_max</code>. This * limit is soft - it is possible for pages to be temporarily larger * than this value.,an integer between 512B and 10TB; default \c 5MB.} - * @config{os_cache_max, maximum filesystem cache. If non-zero\, - * WiredTiger will attempt to flush the operating system buffer cache - * whenever this amount of I/O is performed.,an integer greater than or - * equal to 0; default \c 0.} + * @config{os_cache_dirty_max, maximum dirty system buffer cache usage\, + * in bytes. If non-zero\, schedule writes for dirty blocks belonging + * to this object in the system buffer cache after that many bytes from + * this object are written into the buffer cache.,an integer greater + * than or equal to 0; default \c 0.} + * @config{os_cache_max, maximum system buffer cache usage\, in bytes. + * If non-zero\, evict object blocks from the system buffer cache after + * that many bytes from this object are read or written into the buffer + * cache.,an integer greater than or equal to 0; default \c 0.} * @config{prefix_compression, configure row-store format key prefix * compression.,a boolean flag; default \c true.} * @config{source, override the default data source URI derived from the diff --git a/src/os_posix/os_fsync.c b/src/os_posix/os_fsync.c index fb0010705bb..2871aa9a21f 100644 --- a/src/os_posix/os_fsync.c +++ b/src/os_posix/os_fsync.c @@ -22,12 +22,5 @@ __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh) if (ret != 0) WT_RET_MSG(session, ret, "%s fsync error", fh->name); -#ifdef HAVE_POSIX_FADVISE - if (fh->os_cache_max > 0 && fh->io_size > fh->os_cache_max) { - fh->io_size = 0; - WT_RET(posix_fadvise(fh->fd, 0, 0, POSIX_FADV_DONTNEED)); - } -#endif - return (0); } diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c index fd321b7759c..b8d1d8abfe9 100644 --- a/src/os_posix/os_open.c +++ b/src/os_posix/os_open.c @@ -147,6 +147,11 @@ __wt_open(WT_SESSION_IMPL *session, fh->fd = fd; fh->refcnt = 1; +#ifdef O_DIRECT + if (f & O_DIRECT) + fh->direct_io = 1; +#endif + /* Set the file's size. */ WT_ERR(__wt_filesize(session, fh, &fh->file_size)); diff --git a/src/os_posix/os_rw.c b/src/os_posix/os_rw.c index 5e051d701c2..2b83d961592 100644 --- a/src/os_posix/os_rw.c +++ b/src/os_posix/os_rw.c @@ -27,14 +27,6 @@ __wt_read(WT_SESSION_IMPL *session, " bytes at offset %" PRIuMAX, fh->name, bytes, (uintmax_t)offset); -#ifdef HAVE_POSIX_FADVISE - if (fh->os_cache_max > 0 && - (fh->io_size += (off_t)bytes) > fh->os_cache_max) { - fh->io_size = 0; - WT_RET(posix_fadvise(fh->fd, 0, 0, POSIX_FADV_DONTNEED)); - } -#endif - return (0); } @@ -58,9 +50,5 @@ __wt_write(WT_SESSION_IMPL *session, " bytes at offset %" PRIuMAX, fh->name, bytes, (uintmax_t)offset); -#ifdef HAVE_POSIX_FADVISE - if (fh->os_cache_max > 0) - fh->io_size += (off_t)bytes; -#endif return (0); } |