summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorsueloverso <sue@mongodb.com>2016-12-04 23:22:58 -0500
committerMichael Cahill <michael.cahill@mongodb.com>2016-12-05 15:22:58 +1100
commit853430ea86b8e29cdfa9de34606405d52384d2db (patch)
tree60b562016008db77e3f610ad0067e3c40883fdd5
parentd2dfe81c95cb2cd2c04cf7e7711c8ad247d63ce1 (diff)
downloadmongo-853430ea86b8e29cdfa9de34606405d52384d2db.tar.gz
WT-2670 Add access_pattern_hint configuration for tables (#3155)
-rw-r--r--dist/api_config.py3
-rw-r--r--dist/api_data.py8
-rw-r--r--src/block/block_open.c6
-rw-r--r--src/config/config_def.c105
-rw-r--r--src/docs/upgrading.dox9
-rw-r--r--src/include/wiredtiger.in23
-rw-r--r--src/os_posix/os_fs.c16
-rw-r--r--src/os_win/os_fs.c10
8 files changed, 121 insertions, 59 deletions
diff --git a/dist/api_config.py b/dist/api_config.py
index 1069a7037ce..d83a632321e 100644
--- a/dist/api_config.py
+++ b/dist/api_config.py
@@ -198,7 +198,8 @@ def get_default(c):
return 'false'
elif c.default == 'true':
return 'true'
- elif t == 'string' and c.default == 'none':
+ elif t == 'string' and c.default == 'none' and \
+ not c.flags.get('choices', []):
return ''
elif t == 'category':
return '(%s)' % (','.join('%s=%s' % (subc.name, get_default(subc))
diff --git a/dist/api_data.py b/dist/api_data.py
index aa76ff45d1a..2d4371923e1 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -120,6 +120,14 @@ lsm_config = [
# Per-file configuration
file_config = format_meta + [
+ Config('access_pattern_hint', 'none', r'''
+ It is recommended that workloads that consist primarily of
+ updates and/or point queries specify \c random. Workloads that
+ do many cursor scans through large ranges of data specify
+ \c sequential and other workloads specify \c none. The
+ option leads to an advisory call to an appropriate operating
+ system API where available''',
+ choices=['none', 'random', 'sequential']),
Config('block_allocation', 'best', r'''
configure block allocation. Permitted values are \c "first" or
\c "best"; the \c "first" configuration uses a first-available
diff --git a/src/block/block_open.c b/src/block/block_open.c
index 4d15942709a..07ceb4c8159 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -201,6 +201,12 @@ __wt_block_open(WT_SESSION_IMPL *session,
* "direct_io=checkpoint" configures direct I/O for readonly data files.
*/
flags = 0;
+ WT_ERR(__wt_config_gets(session, cfg, "access_pattern_hint", &cval));
+ if (WT_STRING_MATCH("random", cval.str, cval.len))
+ LF_SET(WT_FS_OPEN_ACCESS_RAND);
+ else if (WT_STRING_MATCH("sequential", cval.str, cval.len))
+ LF_SET(WT_FS_OPEN_ACCESS_SEQ);
+
if (readonly && FLD_ISSET(conn->direct_io, WT_DIRECT_IO_CHECKPOINT))
LF_SET(WT_FS_OPEN_DIRECTIO);
if (!readonly && FLD_ISSET(conn->direct_io, WT_DIRECT_IO_DATA))
diff --git a/src/config/config_def.c b/src/config/config_def.c
index b3b900f8c42..9d886cbf0bd 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -224,6 +224,9 @@ static const WT_CONFIG_CHECK
};
static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = {
+ { "access_pattern_hint", "string",
+ NULL, "choices=[\"none\",\"random\",\"sequential\"]",
+ NULL, 0 },
{ "allocation_size", "int",
NULL, "min=512B,max=128MB",
NULL, 0 },
@@ -401,6 +404,9 @@ static const WT_CONFIG_CHECK confchk_colgroup_meta[] = {
};
static const WT_CONFIG_CHECK confchk_file_config[] = {
+ { "access_pattern_hint", "string",
+ NULL, "choices=[\"none\",\"random\",\"sequential\"]",
+ NULL, 0 },
{ "allocation_size", "int",
NULL, "min=512B,max=128MB",
NULL, 0 },
@@ -459,6 +465,9 @@ static const WT_CONFIG_CHECK confchk_file_config[] = {
};
static const WT_CONFIG_CHECK confchk_file_meta[] = {
+ { "access_pattern_hint", "string",
+ NULL, "choices=[\"none\",\"random\",\"sequential\"]",
+ NULL, 0 },
{ "allocation_size", "int",
NULL, "min=512B,max=128MB",
NULL, 0 },
@@ -537,6 +546,9 @@ static const WT_CONFIG_CHECK confchk_index_meta[] = {
};
static const WT_CONFIG_CHECK confchk_lsm_meta[] = {
+ { "access_pattern_hint", "string",
+ NULL, "choices=[\"none\",\"random\",\"sequential\"]",
+ NULL, 0 },
{ "allocation_size", "int",
NULL, "min=512B,max=128MB",
NULL, 0 },
@@ -1075,12 +1087,13 @@ static const WT_CONFIG_ENTRY config_entries[] = {
confchk_WT_SESSION_compact, 1
},
{ "WT_SESSION.create",
- "allocation_size=4KB,app_metadata=,block_allocation=best,"
- "block_compressor=,cache_resident=false,checksum=uncompressed,"
- "colgroups=,collator=,columns=,dictionary=0,encryption=(keyid=,"
- "name=),exclusive=false,extractor=,format=btree,huffman_key=,"
- "huffman_value=,ignore_in_memory_cache_size=false,immutable=false"
- ",internal_item_max=0,internal_key_max=0,"
+ "access_pattern_hint=none,allocation_size=4KB,app_metadata=,"
+ "block_allocation=best,block_compressor=,cache_resident=false,"
+ "checksum=uncompressed,colgroups=,collator=,columns=,dictionary=0"
+ ",encryption=(keyid=,name=),exclusive=false,extractor=,"
+ "format=btree,huffman_key=,huffman_value=,"
+ "ignore_in_memory_cache_size=false,immutable=false,"
+ "internal_item_max=0,internal_key_max=0,"
"internal_key_truncate=true,internal_page_max=4KB,key_format=u,"
"key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,"
"leaf_value_max=0,log=(enabled=true),lsm=(auto_throttle=true,"
@@ -1090,7 +1103,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
"prefix_compression_min=4,source=,split_deepen_min_child=0,"
"split_deepen_per_child=0,split_pct=75,type=file,value_format=u",
- confchk_WT_SESSION_create, 41
+ confchk_WT_SESSION_create, 42
},
{ "WT_SESSION.drop",
"checkpoint_wait=true,force=false,lock_wait=true,"
@@ -1172,10 +1185,26 @@ static const WT_CONFIG_ENTRY config_entries[] = {
confchk_colgroup_meta, 5
},
{ "file.config",
- "allocation_size=4KB,app_metadata=,block_allocation=best,"
- "block_compressor=,cache_resident=false,checksum=uncompressed,"
- "collator=,columns=,dictionary=0,encryption=(keyid=,name=),"
- "format=btree,huffman_key=,huffman_value=,"
+ "access_pattern_hint=none,allocation_size=4KB,app_metadata=,"
+ "block_allocation=best,block_compressor=,cache_resident=false,"
+ "checksum=uncompressed,collator=,columns=,dictionary=0,"
+ "encryption=(keyid=,name=),format=btree,huffman_key=,"
+ "huffman_value=,ignore_in_memory_cache_size=false,"
+ "internal_item_max=0,internal_key_max=0,"
+ "internal_key_truncate=true,internal_page_max=4KB,key_format=u,"
+ "key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,"
+ "leaf_value_max=0,log=(enabled=true),memory_page_max=5MB,"
+ "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
+ "prefix_compression_min=4,split_deepen_min_child=0,"
+ "split_deepen_per_child=0,split_pct=75,value_format=u",
+ confchk_file_config, 35
+ },
+ { "file.meta",
+ "access_pattern_hint=none,allocation_size=4KB,app_metadata=,"
+ "block_allocation=best,block_compressor=,cache_resident=false,"
+ "checkpoint=,checkpoint_lsn=,checksum=uncompressed,collator=,"
+ "columns=,dictionary=0,encryption=(keyid=,name=),format=btree,"
+ "huffman_key=,huffman_value=,id=,"
"ignore_in_memory_cache_size=false,internal_item_max=0,"
"internal_key_max=0,internal_key_truncate=true,"
"internal_page_max=4KB,key_format=u,key_gap=10,leaf_item_max=0,"
@@ -1183,24 +1212,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"log=(enabled=true),memory_page_max=5MB,os_cache_dirty_max=0,"
"os_cache_max=0,prefix_compression=false,prefix_compression_min=4"
",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75,"
- "value_format=u",
- confchk_file_config, 34
- },
- { "file.meta",
- "allocation_size=4KB,app_metadata=,block_allocation=best,"
- "block_compressor=,cache_resident=false,checkpoint=,"
- "checkpoint_lsn=,checksum=uncompressed,collator=,columns=,"
- "dictionary=0,encryption=(keyid=,name=),format=btree,huffman_key="
- ",huffman_value=,id=,ignore_in_memory_cache_size=false,"
- "internal_item_max=0,internal_key_max=0,"
- "internal_key_truncate=true,internal_page_max=4KB,key_format=u,"
- "key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,"
- "leaf_value_max=0,log=(enabled=true),memory_page_max=5MB,"
- "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
- "prefix_compression_min=4,split_deepen_min_child=0,"
- "split_deepen_per_child=0,split_pct=75,value_format=u,"
- "version=(major=0,minor=0)",
- confchk_file_meta, 38
+ "value_format=u,version=(major=0,minor=0)",
+ confchk_file_meta, 39
},
{ "index.meta",
"app_metadata=,collator=,columns=,extractor=,immutable=false,"
@@ -1208,23 +1221,23 @@ static const WT_CONFIG_ENTRY config_entries[] = {
confchk_index_meta, 10
},
{ "lsm.meta",
- "allocation_size=4KB,app_metadata=,block_allocation=best,"
- "block_compressor=,cache_resident=false,checksum=uncompressed,"
- "chunks=,collator=,columns=,dictionary=0,encryption=(keyid=,"
- "name=),format=btree,huffman_key=,huffman_value=,"
- "ignore_in_memory_cache_size=false,internal_item_max=0,"
- "internal_key_max=0,internal_key_truncate=true,"
- "internal_page_max=4KB,key_format=u,key_gap=10,last=,"
- "leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,"
- "leaf_value_max=0,log=(enabled=true),lsm=(auto_throttle=true,"
- "bloom=true,bloom_bit_count=16,bloom_config=,bloom_hash_count=8,"
- "bloom_oldest=false,chunk_count_limit=0,chunk_max=5GB,"
- "chunk_size=10MB,merge_max=15,merge_min=0),memory_page_max=5MB,"
- "old_chunks=,os_cache_dirty_max=0,os_cache_max=0,"
- "prefix_compression=false,prefix_compression_min=4,"
- "split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75,"
- "value_format=u",
- confchk_lsm_meta, 38
+ "access_pattern_hint=none,allocation_size=4KB,app_metadata=,"
+ "block_allocation=best,block_compressor=,cache_resident=false,"
+ "checksum=uncompressed,chunks=,collator=,columns=,dictionary=0,"
+ "encryption=(keyid=,name=),format=btree,huffman_key=,"
+ "huffman_value=,ignore_in_memory_cache_size=false,"
+ "internal_item_max=0,internal_key_max=0,"
+ "internal_key_truncate=true,internal_page_max=4KB,key_format=u,"
+ "key_gap=10,last=,leaf_item_max=0,leaf_key_max=0,"
+ "leaf_page_max=32KB,leaf_value_max=0,log=(enabled=true),"
+ "lsm=(auto_throttle=true,bloom=true,bloom_bit_count=16,"
+ "bloom_config=,bloom_hash_count=8,bloom_oldest=false,"
+ "chunk_count_limit=0,chunk_max=5GB,chunk_size=10MB,merge_max=15,"
+ "merge_min=0),memory_page_max=5MB,old_chunks=,"
+ "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
+ "prefix_compression_min=4,split_deepen_min_child=0,"
+ "split_deepen_per_child=0,split_pct=75,value_format=u",
+ confchk_lsm_meta, 39
},
{ "table.meta",
"app_metadata=,colgroups=,collator=,columns=,key_format=u,"
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index caa6b036892..c173bebdda6 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -17,6 +17,15 @@ how much work is done at the beginning of a checkpoint to make the critical
section of checkpoints complete more quickly.
</dd>
+<dt>Change to default fadvise setting for data files</dt>
+<dd>
+Previously, data files were opened with a hint advising the file system to
+optimize for random access on POSIX and Windows platforms. The default is now
+to give no advice about access patterns. To restore the old default behavior,
+set the new \c access_pattern_hint configuration string to \c random in the
+configuration passed to WT_SESSION::create.
+</dd>
+
<dt>Checkpoint server created checkpoint names</dt>
<dd>
The ::wiredtiger_open checkpoint configuration no longer supports the
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 37788096f39..2365135e08d 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -995,6 +995,13 @@ struct __wt_session {
* \c "table:stock". For a description of URI formats
* see @ref data_sources.
* @configstart{WT_SESSION.create, see dist/api_data.py}
+ * @config{access_pattern_hint, It is recommended that workloads that
+ * consist primarily of updates and/or point queries specify \c random.
+ * Workloads that do many cursor scans through large ranges of data
+ * specify \c sequential and other workloads specify \c none. The
+ * option leads to an advisory call to an appropriate operating system
+ * API where available., a string\, chosen from the following options:
+ * \c "none"\, \c "random"\, \c "sequential"; default \c none.}
* @config{allocation_size, the file unit allocation size\, in bytes\,
* must a power-of-two; smaller values decrease the file space required
* by overflow items\, and the default value of 4KB is a good choice
@@ -3734,21 +3741,25 @@ typedef enum {
WT_FS_OPEN_FILE_TYPE_REGULAR /*!< open a regular file */
} WT_FS_OPEN_FILE_TYPE;
+/*! WT_FILE_SYSTEM::open_file flags: random access pattern */
+#define WT_FS_OPEN_ACCESS_RAND 0x001
+/*! WT_FILE_SYSTEM::open_file flags: sequential access pattern */
+#define WT_FS_OPEN_ACCESS_SEQ 0x002
/*! WT_FILE_SYSTEM::open_file flags: create if does not exist */
-#define WT_FS_OPEN_CREATE 0x001
+#define WT_FS_OPEN_CREATE 0x004
/*! WT_FILE_SYSTEM::open_file flags: direct I/O requested */
-#define WT_FS_OPEN_DIRECTIO 0x002
+#define WT_FS_OPEN_DIRECTIO 0x008
/*! WT_FILE_SYSTEM::open_file flags: file creation must be durable */
-#define WT_FS_OPEN_DURABLE 0x004
+#define WT_FS_OPEN_DURABLE 0x010
/*!
* WT_FILE_SYSTEM::open_file flags: return EBUSY if exclusive use not available
*/
-#define WT_FS_OPEN_EXCLUSIVE 0x008
+#define WT_FS_OPEN_EXCLUSIVE 0x020
#ifndef DOXYGEN
-#define WT_FS_OPEN_FIXED 0x010 /* Path not home relative (internal) */
+#define WT_FS_OPEN_FIXED 0x040 /* Path not home relative (internal) */
#endif
/*! WT_FILE_SYSTEM::open_file flags: open is read-only */
-#define WT_FS_OPEN_READONLY 0x020
+#define WT_FS_OPEN_READONLY 0x080
/*!
* WT_FILE_SYSTEM::remove or WT_FILE_SYSTEM::rename flags: the remove or rename
diff --git a/src/os_posix/os_fs.c b/src/os_posix/os_fs.c
index 5f06892ce6e..26be0f049cc 100644
--- a/src/os_posix/os_fs.c
+++ b/src/os_posix/os_fs.c
@@ -575,7 +575,7 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
WT_FILE_HANDLE_POSIX *pfh;
WT_SESSION_IMPL *session;
mode_t mode;
- int f;
+ int advise_flag, f;
WT_UNUSED(file_system);
@@ -676,17 +676,25 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
#if defined(HAVE_POSIX_FADVISE)
/*
- * Disable read-ahead on trees: it slows down random read workloads.
+ * If the user set an access pattern hint, call fadvise now.
* Ignore fadvise when doing direct I/O, the kernel cache isn't
* interesting.
*/
- if (!pfh->direct_io && file_type == WT_FS_OPEN_FILE_TYPE_DATA) {
+ if (!pfh->direct_io && file_type == WT_FS_OPEN_FILE_TYPE_DATA &&
+ LF_ISSET(WT_FS_OPEN_ACCESS_RAND | WT_FS_OPEN_ACCESS_SEQ)) {
+ advise_flag = 0;
+ if (LF_ISSET(WT_FS_OPEN_ACCESS_RAND))
+ advise_flag = POSIX_FADV_RANDOM;
+ if (LF_ISSET(WT_FS_OPEN_ACCESS_SEQ))
+ advise_flag = POSIX_FADV_SEQUENTIAL;
WT_SYSCALL(
- posix_fadvise(pfh->fd, 0, 0, POSIX_FADV_RANDOM), ret);
+ posix_fadvise(pfh->fd, 0, 0, advise_flag), ret);
if (ret != 0)
WT_ERR_MSG(session, ret,
"%s: handle-open: posix_fadvise", name);
}
+#else
+ WT_UNUSED(advise_flag);
#endif
directory_open:
diff --git a/src/os_win/os_fs.c b/src/os_win/os_fs.c
index 7ab7178114b..6c74f2f411f 100644
--- a/src/os_win/os_fs.c
+++ b/src/os_win/os_fs.c
@@ -521,10 +521,16 @@ __win_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC))
f |= FILE_FLAG_WRITE_THROUGH;
- /* Disable read-ahead on trees: it slows down random read workloads. */
- if (file_type == WT_FS_OPEN_FILE_TYPE_DATA)
+ /* If the user indicated a random workload, disable read-ahead. */
+ if (file_type == WT_FS_OPEN_FILE_TYPE_DATA &&
+ LF_ISSET(WT_FS_OPEN_ACCESS_RAND))
f |= FILE_FLAG_RANDOM_ACCESS;
+ /* If the user indicated a sequential workload, set that. */
+ if (file_type == WT_FS_OPEN_FILE_TYPE_DATA &&
+ LF_ISSET(WT_FS_OPEN_ACCESS_SEQ))
+ f |= FILE_FLAG_SEQUENTIAL_SCAN;
+
win_fh->filehandle = CreateFileW(name_wide->data, desired_access,
FILE_SHARE_READ | FILE_SHARE_WRITE,
NULL, dwCreationDisposition, f, NULL);