summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/btree/rec_split.c 9
-rw-r--r-- src/btree/row_srch.c 187
-rw-r--r-- src/docs/dump-formats.dox 49
-rw-r--r-- src/docs/transactions.dox 18
-rw-r--r-- src/docs/upgrading.dox 75
-rw-r--r-- src/include/api.h 1
-rw-r--r-- src/include/extern.h 1
-rw-r--r-- src/lsm/lsm_merge.c 9
-rw-r--r-- src/lsm/lsm_tree.c 57
-rw-r--r-- src/session/session_api.c 16
10 files changed, 233 insertions, 189 deletions
diff --git a/src/btree/rec_split.c b/src/btree/rec_split.c
index c9e61582c14..f0ee6a5d77e 100644
--- a/src/btree/rec_split.c
+++ b/src/btree/rec_split.c
@@ -572,6 +572,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp)
{
WT_ADDR *addr;
+ WT_IKEY *ikey;
WT_REF *ref;
size_t incr;
@@ -614,10 +615,10 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
switch (page->type) {
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
- WT_RET(__wt_strndup(session,
- multi->key.ikey, multi->key.ikey->size + sizeof(WT_IKEY),
- &ref->key.ikey));
- incr += sizeof(WT_IKEY) + multi->key.ikey->size;
+ ikey = multi->key.ikey;
+ WT_RET(__wt_row_ikey(session, 0,
+ WT_IKEY_DATA(ikey), ikey->size, &ref->key.ikey));
+ incr += sizeof(WT_IKEY) + ikey->size;
break;
default:
ref->key.recno = multi->key.recno;
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index a3899156e13..16d01af059c 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -89,19 +89,18 @@ __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
btree->collator, srch_key, &key, cmp, &match));
}
- if (cmp > 0) { /* Keep going at this level */
+ if (cmp > 0) { /* Keep going at this level */
insp = &ret_ins->next[i];
skiplow = match;
- } else if (cmp == 0)
+ } else if (cmp < 0) { /* Drop down a level */
+ cbt->next_stack[i] = ret_ins;
+ cbt->ins_stack[i--] = insp--;
+ skiphigh = match;
+ } else
for (; i >= 0; i--) {
cbt->next_stack[i] = ret_ins->next[i];
cbt->ins_stack[i] = &ret_ins->next[i];
}
- else { /* Drop down a level */
- cbt->next_stack[i] = ret_ins;
- cbt->ins_stack[i--] = insp--;
- skiphigh = match;
- }
}
/*
@@ -167,18 +166,19 @@ restart: page = parent->page;
break;
pindex = WT_INTL_INDEX_COPY(page);
- base = pindex->entries;
- child = pindex->index[base - 1];
/*
* Fast-path internal pages with one child, a common case for
* the root page in new trees.
*/
- if (base == 1)
+ if (pindex->entries == 1) {
+ child = pindex->index[0];
goto descend;
+ }
/* Fast-path appends. */
if (insert && btree->appending) {
+ child = pindex->index[pindex->entries - 1];
__wt_ref_key(page, child, &item->data, &item->size);
WT_ERR(WT_LEX_CMP(
session, btree->collator, srch_key, item, cmp));
@@ -191,88 +191,79 @@ restart: page = parent->page;
/*
* Two versions of the binary search of internal pages: with and
* without application-specified collation.
+ *
+ * The 0th key on an internal page is a problem for a couple of
+ * reasons. First, we have to force the 0th key to sort less
+ * than any application key, so internal pages don't have to be
+ * updated if the application stores a new, "smallest" key in
+ * the tree. Second, reconciliation is aware of this and will
+ * store a byte of garbage in the 0th key, so the comparison of
+ * an application key and a 0th key is meaningless (but doing
+ * the comparison could still incorrectly modify our tracking
+ * of the leading bytes in each key that we can skip during the
+ * comparison).
+ *
+ * The only way the binary search loop can reach the 0th key is
+ * when base is 0 and limit is 1; in that case the loop would
+ * terminate after that comparison anyway, so we skip the
+ * comparison, exit the loop, and descend down the left-hand
+ * side of the tree.
*/
base = 0;
+ indx = 0; /* -Werror=maybe-uninitialized */
limit = pindex->entries;
- if (btree->collator == NULL) {
+ if (btree->collator == NULL)
for (; limit != 0; limit >>= 1) {
- indx = base + (limit >> 1);
+ /* If index is 0, skip the comparison. */
+ if ((indx = base + (limit >> 1)) == 0)
+ break;
+
child = pindex->index[indx];
+ __wt_ref_key(
+ page, child, &item->data, &item->size);
- /*
- * If about to compare an application key with
- * the 0th index on an internal page, pretend
- * the 0th index sorts less than any application
- * key. This test is so we don't have to update
- * internal pages if the application stores a
- * new, "smallest" key in the tree.
- */
- if (indx != 0) {
- __wt_ref_key(page,
- child, &item->data, &item->size);
- match = WT_MIN(skiplow, skiphigh);
- cmp = __wt_lex_compare_skip(
- srch_key, item, &match);
- if (cmp == 0)
- goto descend;
- if (cmp < 0) {
- skiphigh = match;
- continue;
- }
+ match = WT_MIN(skiplow, skiphigh);
+ cmp = __wt_lex_compare_skip(
+ srch_key, item, &match);
+ if (cmp > 0) {
skiplow = match;
- }
- base = indx + 1;
- --limit;
+ base = indx + 1;
+ --limit;
+ } else if (cmp < 0)
+ skiphigh = match;
+ else
+ break;
}
- /*
- * Reference the slot used for next step down the tree.
- *
- * Base is the smallest index greater than key and may
- * be the (last + 1) index. (Base cannot be the 0th
- * index as the 0th index always sorts less than any
- * application key). The slot for descent is the one
- * before base.
- */
- if (cmp != 0)
- child = pindex->index[base - 1];
- } else {
+ else
for (; limit != 0; limit >>= 1) {
- indx = base + (limit >> 1);
+ /* If index is 0, skip the comparison. */
+ if ((indx = base + (limit >> 1)) == 0)
+ break;
+
child = pindex->index[indx];
- /*
- * If about to compare an application key with
- * the 0th index on an internal page, pretend
- * the 0th index sorts less than any application
- * key. This test is so we don't have to update
- * internal pages if the application stores a
- * new, "smallest" key in the tree.
- */
- if (indx != 0) {
- __wt_ref_key(page,
- child, &item->data, &item->size);
- WT_ERR(WT_LEX_CMP_SKIP(
- session, btree->collator,
- srch_key, item, cmp, &match));
- if (cmp == 0)
- goto descend;
- if (cmp < 0)
- continue;
- }
- base = indx + 1;
- --limit;
+ __wt_ref_key(
+ page, child, &item->data, &item->size);
+
+ WT_ERR(WT_LEX_CMP(session,
+ btree->collator, srch_key, item, cmp));
+ if (cmp > 0) {
+ base = indx + 1;
+ --limit;
+ } else if (cmp == 0)
+ break;
}
- /*
- * Reference the slot used for next step down the tree.
- *
- * Base is the smallest index greater than key and may
- * be the (last + 1) index. (Base cannot be the 0th
- * index as the 0th index always sorts less than any
- * application key). The slot for descent is the one
- * before base.
- */
- if (cmp != 0)
- child = pindex->index[base - 1];
- }
+
+ /*
+ * Find the slot used to descend the tree. If indx is 0, it's
+ * a left-side descent. Otherwise, on an exact match child is
+ * already set; without one, base is the smallest index greater
+ * than key, possibly (last + 1), so descend via slot base - 1.
+ */
+ if (indx == 0)
+ child = pindex->index[0];
+ else if (cmp != 0)
+ child = pindex->index[base - 1];
+
descend: WT_ASSERT(session, child != NULL);
/*
@@ -314,36 +305,32 @@ leaf_only:
for (; limit != 0; limit >>= 1) {
indx = base + (limit >> 1);
rip = page->pg_row_d + indx;
-
WT_ERR(__wt_row_leaf_key(session, page, rip, item, 1));
+
match = WT_MIN(skiplow, skiphigh);
cmp = __wt_lex_compare_skip(srch_key, item, &match);
- if (cmp == 0)
- break;
- if (cmp < 0) {
+ if (cmp > 0) {
+ skiplow = match;
+ base = indx + 1;
+ --limit;
+ } else if (cmp < 0)
skiphigh = match;
- continue;
- }
- skiplow = match;
-
- base = indx + 1;
- --limit;
+ else
+ break;
}
else
for (; limit != 0; limit >>= 1) {
indx = base + (limit >> 1);
rip = page->pg_row_d + indx;
-
WT_ERR(__wt_row_leaf_key(session, page, rip, item, 1));
- WT_ERR(WT_LEX_CMP_SKIP(session,
- btree->collator, srch_key, item, cmp, &match));
- if (cmp == 0)
- break;
- if (cmp < 0)
- continue;
- base = indx + 1;
- --limit;
+ WT_ERR(WT_LEX_CMP(
+ session, btree->collator, srch_key, item, cmp));
+ if (cmp > 0) {
+ base = indx + 1;
+ --limit;
+ } else if (cmp == 0)
+ break;
}
/*
diff --git a/src/docs/dump-formats.dox b/src/docs/dump-formats.dox
index bbc89d43d20..2a903f9c790 100644
--- a/src/docs/dump-formats.dox
+++ b/src/docs/dump-formats.dox
@@ -4,13 +4,58 @@ The @ref util_dump command produces a flat-text representation of a
table that can be loaded by @ref util_load. This page describes the
output formats of the @ref util_dump command.
-@section dump_formats_json JSON format
+@section dump_formats_json JSON dump format
JSON (<a href="http://www.json.org">JavaScript Object Notation</a>)
dump files use the standard JSON data-interchange format to specify
the objects, and may be interpreted by any JSON reader.
-@section dump_formats_text Text format
+The format is a JSON object where each key is the URI passed to
+WT_SESSION::create and the corresponding value is a JSON array of two
+entries. The first entry in this array is a JSON object composed of
+configuration information: the "config" key has the configuration
+string used with WT_SESSION::create, the "colgroups" and "indices"
+keys have values that are arrays of objects that are in turn composed
+of configuration information. The second entry is a JSON array, with
+each entry an object representing a row of data. If the columns were
+named in the configuration string used with WT_SESSION::create, those
+names are used for keys, otherwise predictable names (for example,
+"key0", "value0", "value1") are generated. The values in this object
+are the values for each column in the record.
+
+Here is some sample output:
+
+@code
+{
+    "table:planets" : [
+        {
+            "config" : "columns=(id,name,distance),key_format=i,value_format=Si",
+            "colgroups" : [],
+            "indices" : [
+                {
+                    "uri" : "index:planets:names",
+                    "config" : "columns=(name),key_format=Si,source=\"file:astronomy.wt\",type=file"
+                }
+            ]
+        },
+        [
+            {
+                "id" : 1,
+                "name" : "Mercury",
+                "distance" : 57910000
+            },
+            {
+                "id" : 2,
+                "name" : "Venus",
+                "distance" : 108200000
+            },
+            ...
+        ]
+    ]
+}
+@endcode
+
+@section dump_formats_text Text dump format
Text dump files have three parts, a prefix, a header and a body.
diff --git a/src/docs/transactions.dox b/src/docs/transactions.dox
index 865ec85b21f..7acdaf619a2 100644
--- a/src/docs/transactions.dox
+++ b/src/docs/transactions.dox
@@ -42,9 +42,9 @@ fail with the ::WT_DEADLOCK error. If this error occurs, transactions
should be rolled back with WT_SESSION::rollback_transaction and retried.
The WT_SESSION::begin_transaction, WT_SESSION::commit_transaction and
-WT_SESSION::rollback_transaction methods all implicitly reset open
-cursors, as if WT_CURSOR::reset were called, discarding any position or
-key/value information they may have.
+WT_SESSION::rollback_transaction methods each implicitly reset all open
+cursors in the WT_SESSION, as if WT_CURSOR::reset were called, discarding
+any position or key/value information they may have.
@snippet ex_all.c transaction commit/rollback
@@ -53,11 +53,15 @@ key/value information they may have.
If a cursor is used when no explicit transaction is active in a session,
reads are performed at the isolation level of the session, set with the
\c isolation key to WT_CONNECTION::open_session, and successful updates
-are automatically committed before the update operation completes.
+are automatically committed before the update operation returns.
-Any operation that consists of multiple related updates should be
-enclosed in an explicit transaction to ensure that the updates are
-applied atomically.
+Any operation consisting of multiple related updates should be enclosed
+in an explicit transaction to ensure the updates are applied atomically.
+
+If an implicit transaction successfully commits, the open cursors in the
+WT_SESSION remain open and positioned. If an implicit transaction fails,
+all open cursors in the WT_SESSION are reset, as if WT_CURSOR::reset were
+called, discarding any position or key/value information they may have.
See @ref cursors_transactions for more information.
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index 9ac63ae4e31..908da65188d 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -1,8 +1,8 @@
/*! @page upgrading Upgrading WiredTiger applications
@section version_221 Upgrading to Version 2.2.1
-<dl>
+<dl>
<dt>::wiredtiger_open configuration parsing order changed</dt>
<dd>
In the 2.2.1 release, the order that configuration strings are
@@ -24,7 +24,16 @@ variable may need to change. The old order:
<li> user configuration file \c Wiredtiger.config</li>
<li> user environment variable \c WIREDTIGER_CONFIG</li>
</ol>
+
+</dd>
+<dt>\c os_cache_dirty_max off for LSM</dt>
+<dd>
+In some earlier versions of WiredTiger, creating an LSM table automatically
+configured \c os_cache_dirty_max, causing additional system calls that slowed
+some workloads. Applications that benefit from this setting should set it
+explicitly in WT_SESSION::create.
</dd>
+</dl>
@section version_220 Upgrading to Version 2.2.0
<dl>
@@ -41,6 +50,7 @@ that benefit from prefix compression will need to explicitly set
In the 2.2.0 release it is now necessary to include \c --enable-verbose
in the configure command to be able to use verbose messages.
</dd>
+</dl>
@section version_212 Upgrading to Version 2.1.2
<dl>
@@ -54,6 +64,7 @@ the pool being shared.
We are now also enforcing that only one of \c cache_size and \c shared_cache
are specified in the ::wiredtiger_open configuration string.
</dd>
+</dl>
@section version_211 Upgrading to Version 2.1.1
<dl>
@@ -69,6 +80,7 @@ WT_EXTENSION_API::config_parser_open method, which can be used to parse
configuration strings. See the WT_CONFIG_PARSER documentation for
examples on how to use the updated API.
</dd>
+</dl>
@section version_21 Upgrading to Version 2.1
<dl>
@@ -99,6 +111,7 @@ explicit "fsync" calls than by enabling "dsync" on a file handle.
Applications that don't execute concurrent transactions may see better
throughput with transaction_sync set to "dsync".
</dd>
+</dl>
@section version_20 Upgrading to Version 2.0
<dl>
@@ -119,6 +132,7 @@ details of the updated syntax: lsm_auto_throttle, lsm_bloom, lsm_bloom_config,
lsm_bloom_bit_count, lsm_bloom_hash_count, lsm_bloom_oldest, lsm_chunk_max,
lsm_chunk_size, lsm_merge_max and lsm_merge_threads.
</dd>
+</dl>
@section version_166 Upgrading to Version 1.6.6
<dl>
@@ -200,6 +214,7 @@ Additionally add a WT_SESSION parameter into the existing
WT_EVENT_HANDLER::handle_error, WT_EVENT_HANDLER::handle_message and
WT_EVENT_HANDLER::handle_progress callback functions.
</dd>
+</dl>
@section version_165 Upgrading to Version 1.6.5
<dl>
@@ -226,9 +241,8 @@ correct behavior.
<dd>
The \c sync configuration key to ::wiredtiger_open has been renamed \c checkpoint_sync.
</dd>
-
</dl>
-<hr>
+
@section version_164 Upgrading to Version 1.6.4
<dl>
@@ -249,9 +263,8 @@ add the \c -n option to their command line configuration; applications
previously using the \c -o option on their command line configurations
should remove it.
</dd>
-
</dl>
-<hr>
+
@section version_163 Upgrading to Version 1.6.3
<dl>
@@ -288,12 +301,11 @@ The \c transactional configuration key has been removed from
::wiredtiger_open. Any application setting it should simply remove it,
no change in application behavior is needed.
</dd>
-
</dl>
-<hr>
+
@section version_162 Upgrading to Version 1.6.2
-<dl>
+<dl>
<dt>Table of WiredTiger extension methods</dt>
<dd>
New functionality was added to the list of WiredTiger extension methods;
@@ -315,12 +327,11 @@ checksum, by default. Applications using compression insufficient for
the purposes of corrupted block identification should change their file
checksum configuration to \c on.
</dd>
-
</dl>
-<hr>
+
@section version_161 Upgrading to Version 1.6.1
-<dl>
+<dl>
<dt>Default page sizes</dt>
<dd>
In the 1.6.1 release, the default for the WT_SESSION::create configuration
@@ -344,23 +355,21 @@ In the 1.6.1 release, the \c split_pct argument to the
WT_COMPRESSOR::compress_raw function changed type from \c u_int to \c int,
applications may require modification to avoid compiler warnings.
</dd>
-
</dl>
-<hr>
+
@section version_160 Upgrading to Version 1.6.0
-<dl>
+<dl>
<dt>File format changes</dt>
<dd>
The underlying file formats changed in the 1.6.0 release; tables and files
should be dumped and re-loaded into a new database.
</dd>
-
</dl>
-<hr>
+
@section version_153 Upgrading to Version 1.5.3
-<dl>
+<dl>
<dt>Configuration strings</dt>
<dd>
An undocumented feature where configuration string case was ignored has
@@ -426,24 +435,22 @@ The \c exclusive argument to the WT_DATA_SOURCE::create method has been
removed; applications may require modifications to resolve compile errors.
</ul>
</dd>
-
</dl>
-<hr>
+
@section version_143 Upgrading to Version 1.4.3
-<dl>
+<dl>
<dt>Statistics</dt>
<dd>
WiredTiger statistics are no longer maintained by default; to configure
statistics, use the \c statistics configuration string to the
::wiredtiger_open function.
</dd>
-
</dl>
-<hr>
+
@section version_139 Upgrading to Version 1.3.9
-<dl>
+<dl>
<dt>Compression</dt>
<dd>
A new member, WT_COMPRESSOR::compress_raw, was added to the WT_COMPRESSOR
@@ -465,24 +472,22 @@ string \c uncompressed.
The underlying file formats changed in the 1.3.9 release; tables and files
should be dumped and re-loaded into a new database.
</dd>
-
</dl>
-<hr>
+
@section version_138 Upgrading to Version 1.3.8
-<dl>
+<dl>
<dt>Statistics keys</dt>
<dd>
The @ref statistics_keys "statistics key constants" have been renamed to use
all capitals, and use consistent prefixes to distinguish between connection
statistics and statistics for data sources.
</dd>
-
</dl>
-<hr>
+
@section version_136 Upgrading to Version 1.3.6
-<dl>
+<dl>
<dt>Installed library names</dt>
<dd>
The installed WiredTiger extension library names changed to limit
@@ -516,23 +521,21 @@ The built-in compression name arguments to the WT_SESSION:create
@row{Snappy compression, "snappy_compress", "snappy"}
</table>
</dd>
-
</dl>
-<hr>
+
@section version_135 Upgrading to Version 1.3.5
-<dl>
+<dl>
<dt>File format changes</dt>
<dd>
The underlying file formats changed in the 1.3.5 release; tables and files
should be dumped and re-loaded into a new database.
</dd>
-
</dl>
-<hr>
+
@section version_13 Upgrading to Version 1.3
-<dl>
+<dl>
<dt>Checkpoint and Snapshot</dt>
<dd>
The checkpoint functionality supported by WT_SESSION::checkpoint and the
@@ -623,8 +626,6 @@ returns a cursor comparison status (less than 0, equal to 0, or greater than
The underlying file formats changed in the 1.3 release; tables and files
should be dumped and re-loaded into a new database.
</dd>
-
</dl>
-<hr>
*/
diff --git a/src/include/api.h b/src/include/api.h
index 2fc0590f718..7d0a9a0925d 100644
--- a/src/include/api.h
+++ b/src/include/api.h
@@ -71,6 +71,7 @@
ret = 0; \
continue; \
} \
+ WT_TRET(__wt_session_reset_cursors(s)); \
} \
} \
break; \
diff --git a/src/include/extern.h b/src/include/extern.h
index 3008729d158..64007373764 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -1345,6 +1345,7 @@ extern int __wt_schema_worker(WT_SESSION_IMPL *session,
int *),
const char *cfg[],
uint32_t open_flags);
+extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session);
extern int __wt_open_cursor(WT_SESSION_IMPL *session,
const char *uri,
WT_CURSOR *owner,
diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c
index b755a9bda00..ebc44cdb297 100644
--- a/src/lsm/lsm_merge.c
+++ b/src/lsm/lsm_merge.c
@@ -294,6 +294,15 @@ __wt_lsm_merge(
if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING))
WT_ERR(EINTR);
+ /*
+ * Help out with switching chunks in case the
+ * checkpoint worker is busy.
+ */
+ if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_lsm_tree_switch(session, lsm_tree));
+ WT_ERR(ret);
+ }
WT_STAT_FAST_CONN_INCRV(session,
lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
++lsm_tree->merge_progressing;
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 4a790a68dc3..1ab36ba937e 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -415,23 +415,16 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
WT_ASSERT(session, lsm_tree->merge_threads <= WT_LSM_MAX_WORKERS);
/*
- * Set up the config for each chunk. If possible, avoid high latencies
- * from fsync by flushing the cache every 8MB (will be overridden by
- * any application setting).
+ * Set up the config for each chunk.
*
- * Also make the memory_page_max double the chunk size, so application
+ * Make the memory_page_max double the chunk size, so application
* threads don't immediately try to force evict the chunk when the
* worker thread clears the NO_EVICTION flag.
*/
- tmpconfig = "";
-#ifdef HAVE_SYNC_FILE_RANGE
- if (!S2C(session)->direct_io)
- tmpconfig = "os_cache_dirty_max=8MB,";
-#endif
WT_ERR(__wt_scr_alloc(session, 0, &buf));
WT_ERR(__wt_buf_fmt(session, buf,
- "%s%s,key_format=u,value_format=u,memory_page_max=%" PRIu64,
- tmpconfig, config, 2 * lsm_tree->chunk_max));
+ "%s,key_format=u,value_format=u,memory_page_max=%" PRIu64,
+ config, 2 * lsm_tree->chunk_max));
WT_ERR(__wt_strndup(
session, buf->data, buf->size, &lsm_tree->file_config));
@@ -606,9 +599,9 @@ void
__wt_lsm_tree_throttle(
WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only)
{
- WT_LSM_CHUNK *chunk, **cp, *ondisk, *prev_chunk;
+ WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk;
uint64_t cache_sz, cache_used, oldtime, record_count, timediff;
- uint32_t i, in_memory, gen0_chunks;
+ uint32_t in_memory, gen0_chunks;
/* Never throttle in small trees. */
if (lsm_tree->nchunks < 3) {
@@ -634,34 +627,36 @@ __wt_lsm_tree_throttle(
record_count = 1;
gen0_chunks = in_memory = 0;
ondisk = NULL;
- for (i = 0, cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
- i < lsm_tree->nchunks;
- ++i, --cp)
+ for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
+ cp >= lsm_tree->chunk;
+ --cp)
if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) {
record_count += (*cp)->count;
++in_memory;
} else {
+ /*
+ * Assign ondisk to the last chunk that has been
+ * flushed since the tree was last opened (i.e., it's on
+ * disk and stable is not set).
+ */
if (ondisk == NULL &&
- ((*cp)->generation == 0 ||
- F_ISSET(*cp, WT_LSM_CHUNK_STABLE)))
+ ((*cp)->generation == 0 &&
+ !F_ISSET(*cp, WT_LSM_CHUNK_STABLE)))
ondisk = *cp;
if ((*cp)->generation == 0 &&
!F_ISSET(*cp, WT_LSM_CHUNK_MERGING))
++gen0_chunks;
- else if (ondisk != NULL)
- break;
}
- chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
+ last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
/* Checkpoint throttling, based on the number of in-memory chunks. */
if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
lsm_tree->ckpt_throttle = 0;
else if (decrease_only)
; /* Nothing to do */
- else if (i == lsm_tree->nchunks ||
- F_ISSET(ondisk, WT_LSM_CHUNK_STABLE)) {
+ else if (ondisk == NULL) {
/*
* No checkpoint has completed this run. Keep slowing down
* inserts until one does.
@@ -670,8 +665,9 @@ __wt_lsm_tree_throttle(
WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle);
} else {
WT_ASSERT(session,
- WT_TIMECMP(chunk->create_ts, ondisk->create_ts) >= 0);
- timediff = WT_TIMEDIFF(chunk->create_ts, ondisk->create_ts);
+ WT_TIMECMP(last_chunk->create_ts, ondisk->create_ts) >= 0);
+ timediff =
+ WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts);
lsm_tree->ckpt_throttle =
(long)((in_memory - 2) * timediff / (20 * record_count));
@@ -715,14 +711,13 @@ __wt_lsm_tree_throttle(
* check that the new value is sane: otherwise, after a long idle
* period, we can calculate a crazy value.
*/
- if (in_memory > 1 &&
- i != lsm_tree->nchunks &&
- !F_ISSET(ondisk, WT_LSM_CHUNK_STABLE)) {
+ if (in_memory > 1 && ondisk != NULL) {
prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
WT_ASSERT(session, prev_chunk->generation == 0);
- WT_ASSERT(session,
- WT_TIMECMP(chunk->create_ts, prev_chunk->create_ts) >= 0);
- timediff = WT_TIMEDIFF(chunk->create_ts, prev_chunk->create_ts);
+ WT_ASSERT(session, WT_TIMECMP(
+ last_chunk->create_ts, prev_chunk->create_ts) >= 0);
+ timediff =
+ WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts);
WT_ASSERT(session,
WT_TIMECMP(prev_chunk->create_ts, ondisk->create_ts) >= 0);
oldtime = WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts);
diff --git a/src/session/session_api.c b/src/session/session_api.c
index 9cf107bd1af..70ce84b2a8e 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -11,11 +11,11 @@ static int __session_checkpoint(WT_SESSION *, const char *);
static int __session_rollback_transaction(WT_SESSION *, const char *);
/*
- * __session_reset_cursors --
+ * __wt_session_reset_cursors --
* Reset all open cursors.
*/
-static int
-__session_reset_cursors(WT_SESSION_IMPL *session)
+int
+__wt_session_reset_cursors(WT_SESSION_IMPL *session)
{
WT_CURSOR *cursor;
WT_DECL_RET;
@@ -174,7 +174,7 @@ __session_reconfigure(WT_SESSION *wt_session, const char *config)
if (F_ISSET(&session->txn, TXN_RUNNING))
WT_ERR_MSG(session, EINVAL, "transaction in progress");
- WT_TRET(__session_reset_cursors(session));
+ WT_TRET(__wt_session_reset_cursors(session));
WT_ERR(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
if (cval.len != 0)
@@ -618,7 +618,7 @@ __session_begin_transaction(WT_SESSION *wt_session, const char *config)
if (F_ISSET(&session->txn, TXN_RUNNING))
WT_ERR_MSG(session, EINVAL, "Transaction already running");
- WT_ERR(__session_reset_cursors(session));
+ WT_ERR(__wt_session_reset_cursors(session));
/*
* Now there are no cursors open and no transaction active in this
@@ -654,7 +654,7 @@ __session_commit_transaction(WT_SESSION *wt_session, const char *config)
ret = EINVAL;
}
- WT_TRET(__session_reset_cursors(session));
+ WT_TRET(__wt_session_reset_cursors(session));
if (ret == 0)
ret = __wt_txn_commit(session, cfg);
@@ -679,7 +679,7 @@ __session_rollback_transaction(WT_SESSION *wt_session, const char *config)
SESSION_API_CALL(session, rollback_transaction, config, cfg);
WT_STAT_FAST_CONN_INCR(session, txn_rollback);
- WT_TRET(__session_reset_cursors(session));
+ WT_TRET(__wt_session_reset_cursors(session));
WT_TRET(__wt_txn_rollback(session, cfg));
@@ -729,7 +729,7 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config)
* the call to begin_transaction for the checkpoint, in case some
* implementation of WT_CURSOR::reset needs the schema lock.
*/
- WT_ERR(__session_reset_cursors(session));
+ WT_ERR(__wt_session_reset_cursors(session));
WT_WITH_SCHEMA_LOCK(session,
ret = __wt_txn_checkpoint(session, cfg));