summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Michael Cahill <mjc@wiredtiger.com> 2013-03-07 16:05:29 -0800
committer Michael Cahill <mjc@wiredtiger.com> 2013-03-07 16:05:29 -0800
commit 36a5d1b4f0fde2f69e51bd9ed0e90976823b175f (patch)
tree 6caca3d118e7c65f4433af47b696adc7d35a5469 /src
parent bfb576d22de55fdad4890a7da6a31a86f2aa9156 (diff)
parent 3fae7e5e6a54d591ef510152e8f1681419625be8 (diff)
download mongo-36a5d1b4f0fde2f69e51bd9ed0e90976823b175f.tar.gz
Merge pull request #472 from wiredtiger/fadvise
Add configurations to minimize the impact of reads and writes on the filesystem cache.
Diffstat (limited to 'src')
-rw-r--r-- src/block/block_map.c 63
-rw-r--r-- src/block/block_mgr.c 17
-rw-r--r-- src/block/block_open.c 36
-rw-r--r-- src/block/block_read.c 17
-rw-r--r-- src/block/block_write.c 25
-rw-r--r-- src/config/config_def.c 13
-rw-r--r-- src/include/block.h 7
-rw-r--r-- src/include/extern.h 8
-rw-r--r-- src/include/os.h 8
-rw-r--r-- src/include/wiredtiger.in 13
-rw-r--r-- src/os_posix/os_fsync.c 7
-rw-r--r-- src/os_posix/os_open.c 5
-rw-r--r-- src/os_posix/os_rw.c 12
13 files changed, 180 insertions, 51 deletions
diff --git a/src/block/block_map.c b/src/block/block_map.c
new file mode 100644
index 00000000000..93dcc4bec6c
--- /dev/null
+++ b/src/block/block_map.c
@@ -0,0 +1,63 @@
+/*-
+ * Copyright (c) 2008-2013 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_block_map --
+ * Map the file in unless mapping is disabled for it; always returns 0.
+ */
+int
+__wt_block_map(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp)
+{
+ *(void **)mapp = NULL; /* mapp is used as a void **; default: no map */
+ *maplenp = 0; /* ...and a zero-length mapping */
+
+ /*
+ * Turn off mapping when verifying the file, because we can't perform
+ * checksum validation of mapped segments, and verify has to checksum
+ * pages.
+ */
+ if (block->verify)
+ return (0);
+
+ /*
+ * Turn off mapping when direct I/O is configured for the file, the
+ * Linux open(2) documentation says applications should avoid mixing
+ * mmap(2) of files with direct I/O to the same files.
+ */
+ if (block->fh->direct_io)
+ return (0);
+
+ /*
+ * Turn off mapping if the application configured a cache size maximum
+ * (os_cache_max), we can't control how much cache we use in that case.
+ */
+ if (block->os_cache_max != 0)
+ return (0);
+
+ /*
+ * Map the file into memory, passing the caller's mapp/maplenp through.
+ * Ignore errors, we'll read the file through the cache if map fails.
+ */
+ (void)__wt_mmap(session, block->fh, mapp, maplenp);
+
+ return (0);
+}
+
+/*
+ * __wt_block_unmap --
+ * Unmap a mapped-in segment of the file, returning __wt_munmap's result.
+ */
+int
+__wt_block_unmap(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen)
+{
+ /* No NULL check here: map/maplen go straight to __wt_munmap. */
+ return (__wt_munmap(session, block->fh, map, maplen));
+}
diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c
index 46535f501af..4ad65247825 100644
--- a/src/block/block_mgr.c
+++ b/src/block/block_mgr.c
@@ -96,17 +96,12 @@ __bm_checkpoint_load(WT_BM *bm, WT_SESSION_IMPL *session,
if (checkpoint) {
/*
- * Read-only objects are mapped into memory instead of being
- * read into cache buffers. Ignore errors, with no mapping
- * we'll read into the cache.
- *
- * Turn off mapping when verifying the file, because we can't
- * perform checksum validation of mapped segments, and verify
- * has to checksum pages.
+ * Read-only objects are optionally mapped into memory instead
+ * of being read into cache buffers.
*/
- if (conn->mmap && !bm->block->verify)
- (void)__wt_mmap(
- session, bm->block->fh, &bm->map, &bm->maplen);
+ if (conn->mmap)
+ WT_RET(__wt_block_map(
+ session, bm->block, &bm->map, &bm->maplen));
/*
* If this handle is for a checkpoint, that is, read-only, there
@@ -142,7 +137,7 @@ __bm_checkpoint_unload(WT_BM *bm, WT_SESSION_IMPL *session)
/* Unmap any mapped segment. */
if (bm->map != NULL)
WT_TRET(
- __wt_munmap(session, bm->block->fh, bm->map, bm->maplen));
+ __wt_block_unmap(session, bm->block, bm->map, bm->maplen));
/* Unload the checkpoint. */
WT_TRET(__wt_block_checkpoint_unload(session, bm->block, !bm->is_live));
diff --git a/src/block/block_open.c b/src/block/block_open.c
index 4b5601c75a6..d47a3962ca3 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -123,17 +123,43 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename,
WT_ERR(__wt_strdup(session, filename, &block->name));
- /* Get the allocation size. */
+ /* Configuration: allocation size. */
WT_ERR(__wt_config_getones(session, config, "allocation_size", &cval));
block->allocsize = (uint32_t)cval.val;
+ /* Configuration: optional OS buffer cache maximum size. */
+ WT_ERR(__wt_config_getones(session, config, "os_cache_max", &cval));
+ block->os_cache_max = cval.val;
+#ifdef HAVE_POSIX_FADVISE
+ if (conn->direct_io && block->os_cache_max)
+ WT_ERR_MSG(session, EINVAL,
+ "os_cache_max not supported in combination with direct_io");
+#else
+ if (block->os_cache_max)
+ WT_ERR_MSG(session, EINVAL,
+ "os_cache_max not supported if posix_fadvise not "
+ "available");
+#endif
+
+ /* Configuration: optional immediate write scheduling flag. */
+ WT_ERR(
+ __wt_config_getones(session, config, "os_cache_dirty_max", &cval));
+ block->os_cache_dirty_max = cval.val;
+#ifdef HAVE_SYNC_FILE_RANGE
+ if (conn->direct_io && block->os_cache_dirty_max)
+ WT_ERR_MSG(session, EINVAL,
+ "os_cache_dirty_max not supported in combination with "
+ "direct_io");
+#else
+ if (block->os_cache_dirty_max)
+ WT_ERR_MSG(session, EINVAL,
+ "os_cache_dirty_max not supported if sync_file_range not "
+ "available");
+#endif
+
/* Open the underlying file handle. */
WT_ERR(__wt_open(session, filename, 0, 0, 1, &block->fh));
- /* Get the OS buffer cache maximum size. */
- WT_ERR(__wt_config_getones(session, config, "os_cache_max", &cval));
- block->fh->os_cache_max = cval.val;
-
/* Initialize the live checkpoint's lock. */
__wt_spin_init(session, &block->live_lock);
diff --git a/src/block/block_read.c b/src/block/block_read.c
index 4a5ba4c4478..20bd7c17b31 100644
--- a/src/block/block_read.c
+++ b/src/block/block_read.c
@@ -55,7 +55,22 @@ __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
}
/* Read the block. */
- return (__wt_block_read_off(session, block, buf, offset, size, cksum));
+ WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum));
+
+#ifdef HAVE_POSIX_FADVISE
+ /* Optionally discard blocks from the system's buffer cache. */
+ if (block->os_cache_max != 0 &&
+ (block->os_cache += size) > block->os_cache_max) {
+ WT_DECL_RET;
+
+ block->os_cache = 0;
+ if ((ret = posix_fadvise(block->fh->fd,
+ (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0)
+ WT_RET_MSG(
+ session, ret, "%s: posix_fadvise", block->name);
+ }
+#endif
+ return (0);
}
/*
diff --git a/src/block/block_write.c b/src/block/block_write.c
index ce07bd6ae57..82d8cb490b5 100644
--- a/src/block/block_write.c
+++ b/src/block/block_write.c
@@ -141,6 +141,31 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
WT_RET(ret);
}
+#ifdef HAVE_SYNC_FILE_RANGE
+ /*
+ * Optionally schedule writes for dirty pages in the system buffer
+ * cache.
+ */
+ if (block->os_cache_dirty_max != 0 &&
+ (block->os_cache_dirty += align_size) > block->os_cache_dirty_max) {
+ block->os_cache_dirty = 0;
+ if ((ret = sync_file_range(block->fh->fd,
+ (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) != 0)
+ WT_RET_MSG(
+ session, ret, "%s: sync_file_range", block->name);
+ }
+#endif
+#ifdef HAVE_POSIX_FADVISE
+ /* Optionally discard blocks from the system buffer cache. */
+ if (block->os_cache_max != 0 &&
+ (block->os_cache += align_size) > block->os_cache_max) {
+ block->os_cache = 0;
+ if ((ret = posix_fadvise(block->fh->fd,
+ (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0)
+ WT_RET_MSG(
+ session, ret, "%s: posix_fadvise", block->name);
+ }
+#endif
WT_CSTAT_INCR(session, block_write);
WT_CSTAT_INCRV(session, block_byte_write, align_size);
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 2a99402d0e7..9eb023e9928 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -133,9 +133,9 @@ __wt_confdfl_file_meta =
",leaf_item_max=0,leaf_page_max=1MB,lsm_bloom=,lsm_bloom_bit_count=8,"
"lsm_bloom_config=,lsm_bloom_hash_count=4,lsm_bloom_newest=0,"
"lsm_bloom_oldest=0,lsm_chunk_size=2MB,lsm_merge_max=15,"
- "lsm_merge_threads=1,memory_page_max=5MB,os_cache_max=0,"
- "prefix_compression=,split_pct=75,value_format=u,version=(major=0,"
- "minor=0)";
+ "lsm_merge_threads=1,memory_page_max=5MB,os_cache_dirty_max=0,"
+ "os_cache_max=0,prefix_compression=,split_pct=75,value_format=u,"
+ "version=(major=0,minor=0)";
WT_CONFIG_CHECK
__wt_confchk_file_meta[] = {
@@ -169,6 +169,7 @@ __wt_confchk_file_meta[] = {
{ "lsm_merge_max", "int", "min=2,max=100", NULL},
{ "lsm_merge_threads", "int", "min=1,max=10", NULL},
{ "memory_page_max", "int", "min=512B,max=10TB", NULL},
+ { "os_cache_dirty_max", "int", "min=0", NULL},
{ "os_cache_max", "int", "min=0", NULL},
{ "prefix_compression", "boolean", NULL, NULL},
{ "split_pct", "int", "min=25,max=100", NULL},
@@ -258,8 +259,9 @@ __wt_confdfl_session_create =
",leaf_item_max=0,leaf_page_max=1MB,lsm_bloom=,lsm_bloom_bit_count=8,"
"lsm_bloom_config=,lsm_bloom_hash_count=4,lsm_bloom_newest=0,"
"lsm_bloom_oldest=0,lsm_chunk_size=2MB,lsm_merge_max=15,"
- "lsm_merge_threads=1,memory_page_max=5MB,os_cache_max=0,"
- "prefix_compression=,source=,split_pct=75,type=file,value_format=u";
+ "lsm_merge_threads=1,memory_page_max=5MB,os_cache_dirty_max=0,"
+ "os_cache_max=0,prefix_compression=,source=,split_pct=75,type=file,"
+ "value_format=u";
WT_CONFIG_CHECK
__wt_confchk_session_create[] = {
@@ -294,6 +296,7 @@ __wt_confchk_session_create[] = {
{ "lsm_merge_max", "int", "min=2,max=100", NULL},
{ "lsm_merge_threads", "int", "min=1,max=10", NULL},
{ "memory_page_max", "int", "min=512B,max=10TB", NULL},
+ { "os_cache_dirty_max", "int", "min=0", NULL},
{ "os_cache_max", "int", "min=0", NULL},
{ "prefix_compression", "boolean", NULL, NULL},
{ "source", "string", NULL, NULL},
diff --git a/src/include/block.h b/src/include/block.h
index 743b4685e43..c234e980bbe 100644
--- a/src/include/block.h
+++ b/src/include/block.h
@@ -195,7 +195,12 @@ struct __wt_block {
/* Configuration information, set when the file is opened. */
uint32_t allocsize; /* Allocation size */
- u_int block_header; /* Header length */
+ u_int block_header; /* Header length */
+
+ int64_t os_cache; /* System buffer cache flush max */
+ int64_t os_cache_max;
+ int64_t os_cache_dirty; /* System buffer cache write max */
+ int64_t os_cache_dirty_max;
/*
* There is only a single checkpoint in a file that can be written. The
diff --git a/src/include/extern.h b/src/include/extern.h
index 111a5152318..bed015e316e 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -112,6 +112,14 @@ extern int __wt_block_extlist_init(WT_SESSION_IMPL *session,
const char *name,
const char *extname);
extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el);
+extern int __wt_block_map( WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
+ void *mapp,
+ size_t *maplenp);
+extern int __wt_block_unmap( WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
+ void *map,
+ size_t maplen);
extern int __wt_block_manager_open(WT_SESSION_IMPL *session,
const char *filename,
const char *config,
diff --git a/src/include/os.h b/src/include/os.h
index 7d4af7ea091..ad2932cb403 100644
--- a/src/include/os.h
+++ b/src/include/os.h
@@ -30,14 +30,12 @@
} while (0)
struct __wt_fh {
+ u_int refcnt; /* Reference count */
TAILQ_ENTRY(__wt_fh) q; /* List of open handles */
- off_t file_size; /* File size */
- off_t io_size; /* Current amount of I/O */
- off_t os_cache_max; /* Max I/O before flushing */
-
char *name; /* File name */
+ off_t file_size; /* File size */
int fd; /* POSIX file handle */
- u_int refcnt; /* Reference count */
+ int direct_io; /* O_DIRECT configured */
};
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 7aa4e3d80c3..2310d78dc1e 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -719,10 +719,15 @@ struct __wt_session {
* adjusted to a lower bound of <code>50 * leaf_page_max</code>. This
* limit is soft - it is possible for pages to be temporarily larger
* than this value.,an integer between 512B and 10TB; default \c 5MB.}
- * @config{os_cache_max, maximum filesystem cache. If non-zero\,
- * WiredTiger will attempt to flush the operating system buffer cache
- * whenever this amount of I/O is performed.,an integer greater than or
- * equal to 0; default \c 0.}
+ * @config{os_cache_dirty_max, maximum dirty system buffer cache usage\,
+ * in bytes. If non-zero\, schedule writes for dirty blocks belonging
+ * to this object in the system buffer cache after that many bytes from
+ * this object are written into the buffer cache.,an integer greater
+ * than or equal to 0; default \c 0.}
+ * @config{os_cache_max, maximum system buffer cache usage\, in bytes.
+ * If non-zero\, evict object blocks from the system buffer cache after
+ * that many bytes from this object are read or written into the buffer
+ * cache.,an integer greater than or equal to 0; default \c 0.}
* @config{prefix_compression, configure row-store format key prefix
* compression.,a boolean flag; default \c true.}
* @config{source, override the default data source URI derived from the
diff --git a/src/os_posix/os_fsync.c b/src/os_posix/os_fsync.c
index fb0010705bb..2871aa9a21f 100644
--- a/src/os_posix/os_fsync.c
+++ b/src/os_posix/os_fsync.c
@@ -22,12 +22,5 @@ __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh)
if (ret != 0)
WT_RET_MSG(session, ret, "%s fsync error", fh->name);
-#ifdef HAVE_POSIX_FADVISE
- if (fh->os_cache_max > 0 && fh->io_size > fh->os_cache_max) {
- fh->io_size = 0;
- WT_RET(posix_fadvise(fh->fd, 0, 0, POSIX_FADV_DONTNEED));
- }
-#endif
-
return (0);
}
diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c
index fd321b7759c..b8d1d8abfe9 100644
--- a/src/os_posix/os_open.c
+++ b/src/os_posix/os_open.c
@@ -147,6 +147,11 @@ __wt_open(WT_SESSION_IMPL *session,
fh->fd = fd;
fh->refcnt = 1;
+#ifdef O_DIRECT
+ if (f & O_DIRECT)
+ fh->direct_io = 1;
+#endif
+
/* Set the file's size. */
WT_ERR(__wt_filesize(session, fh, &fh->file_size));
diff --git a/src/os_posix/os_rw.c b/src/os_posix/os_rw.c
index 5e051d701c2..2b83d961592 100644
--- a/src/os_posix/os_rw.c
+++ b/src/os_posix/os_rw.c
@@ -27,14 +27,6 @@ __wt_read(WT_SESSION_IMPL *session,
" bytes at offset %" PRIuMAX,
fh->name, bytes, (uintmax_t)offset);
-#ifdef HAVE_POSIX_FADVISE
- if (fh->os_cache_max > 0 &&
- (fh->io_size += (off_t)bytes) > fh->os_cache_max) {
- fh->io_size = 0;
- WT_RET(posix_fadvise(fh->fd, 0, 0, POSIX_FADV_DONTNEED));
- }
-#endif
-
return (0);
}
@@ -58,9 +50,5 @@ __wt_write(WT_SESSION_IMPL *session,
" bytes at offset %" PRIuMAX,
fh->name, bytes, (uintmax_t)offset);
-#ifdef HAVE_POSIX_FADVISE
- if (fh->os_cache_max > 0)
- fh->io_size += (off_t)bytes;
-#endif
return (0);
}