From 2fe68fa71b2c4316c6d409cbb6d9f5af13a2342a Mon Sep 17 00:00:00 2001 From: Mark Benvenuto Date: Tue, 13 Jan 2015 17:15:28 -0500 Subject: Import wiredtiger-wiredtiger-2.5.0-92-gd56476d.tar.gz from wiredtiger branch mongodb-2.8 --- src/third_party/wiredtiger/dist/api_err.py | 84 +++++-- src/third_party/wiredtiger/dist/s_string.ok | 2 + src/third_party/wiredtiger/dist/s_symbols.list | 1 + src/third_party/wiredtiger/dist/s_tags | 13 +- src/third_party/wiredtiger/dist/stat_data.py | 4 + .../com/wiredtiger/db/PackFormatInputStream.java | 1 - .../src/com/wiredtiger/db/PackInputStream.java | 8 +- .../src/com/wiredtiger/db/PackOutputStream.java | 7 +- src/third_party/wiredtiger/lang/java/wiredtiger.i | 93 +++++++- .../wiredtiger/lang/python/wiredtiger.i | 15 +- src/third_party/wiredtiger/src/btree/bt_debug.c | 21 +- src/third_party/wiredtiger/src/btree/bt_delete.c | 10 +- src/third_party/wiredtiger/src/btree/bt_page.c | 15 +- src/third_party/wiredtiger/src/btree/bt_split.c | 175 ++++++++++----- src/third_party/wiredtiger/src/btree/bt_walk.c | 9 +- src/third_party/wiredtiger/src/conn/api_strerror.c | 77 +++++-- src/third_party/wiredtiger/src/conn/conn_cache.c | 2 + src/third_party/wiredtiger/src/cursor/cur_ds.c | 1 - src/third_party/wiredtiger/src/cursor/cur_log.c | 37 +++- src/third_party/wiredtiger/src/evict/evict_lru.c | 13 +- src/third_party/wiredtiger/src/evict/evict_page.c | 17 +- src/third_party/wiredtiger/src/include/btmem.h | 5 +- src/third_party/wiredtiger/src/include/btree.i | 97 +++------ src/third_party/wiredtiger/src/include/cache.h | 2 + src/third_party/wiredtiger/src/include/cache.i | 59 +++++ src/third_party/wiredtiger/src/include/cursor.h | 2 + src/third_party/wiredtiger/src/include/cursor.i | 12 + src/third_party/wiredtiger/src/include/extern.h | 2 + src/third_party/wiredtiger/src/include/stat.h | 2 + src/third_party/wiredtiger/src/include/txn.h | 1 + src/third_party/wiredtiger/src/include/txn.i | 10 + .../wiredtiger/src/include/wiredtiger.in | 242 
+++++++++++---------- .../wiredtiger/src/include/wt_internal.h | 2 +- src/third_party/wiredtiger/src/log/log.c | 8 +- src/third_party/wiredtiger/src/lsm/lsm_cursor.c | 7 +- src/third_party/wiredtiger/src/os_posix/os_errno.c | 46 ++++ src/third_party/wiredtiger/src/os_win/os_errno.c | 98 ++++++++- .../wiredtiger/src/os_win/os_ftruncate.c | 7 +- .../wiredtiger/src/os_win/os_mtx_cond.c | 4 +- src/third_party/wiredtiger/src/os_win/os_rename.c | 4 +- .../wiredtiger/src/schema/schema_open.c | 7 + .../wiredtiger/src/session/session_api.c | 7 - src/third_party/wiredtiger/src/support/stat.c | 6 + src/third_party/wiredtiger/src/txn/txn.c | 14 +- src/third_party/wiredtiger/src/txn/txn_log.c | 10 +- src/third_party/wiredtiger/tools/stat_data.py | 1 + 46 files changed, 902 insertions(+), 358 deletions(-) diff --git a/src/third_party/wiredtiger/dist/api_err.py b/src/third_party/wiredtiger/dist/api_err.py index cb2c8cc588e..6c893c9af82 100644 --- a/src/third_party/wiredtiger/dist/api_err.py +++ b/src/third_party/wiredtiger/dist/api_err.py @@ -78,7 +78,7 @@ for line in open('../src/include/wiredtiger.in', 'r'): tfile.close() compare_srcfile(tmp_file, '../src/include/wiredtiger.in') -# Output the wiredtiger_strerror code. +# Output the wiredtiger_strerror and wiredtiger_sterror_r code. tmp_file = '__tmp' tfile = open(tmp_file, 'w') tfile.write('''/* DO NOT EDIT: automatically built by dist/api_err.py. */ @@ -86,18 +86,22 @@ tfile.write('''/* DO NOT EDIT: automatically built by dist/api_err.py. */ #include "wt_internal.h" /* - * wiredtiger_strerror -- - *\tReturn a string for any error value. + * Historically, there was only the wiredtiger_strerror call because the POSIX + * port didn't need anything more complex; Windows requires memory allocation + * of error strings, so we added the wiredtiger_strerror_r call. 
Because we + * want wiredtiger_strerror to continue to be as thread-safe as possible, errors + * are split into three categories: WiredTiger constant strings, system constant + * strings and Everything Else, and we check constant strings before Everything + * Else. */ -const char * -wiredtiger_strerror(int error) -{ -\tstatic char errbuf[64]; -\tchar *p; - -\tif (error == 0) -\t\treturn ("Successful return: 0"); +/* + * __wiredtiger_error -- + *\tReturn a constant string for the WiredTiger errors. + */ +static const char * +__wiredtiger_error(int error) +{ \tswitch (error) { ''') @@ -105,19 +109,51 @@ for err in errors: tfile.write('\tcase ' + err.name + ':\n') tfile.write('\t\treturn ("' + err.name + ': ' + err.desc + '");\n') -tfile.write('''\ -\tdefault: -\t\tif (error > 0 && (p = strerror(error)) != NULL) -\t\t\treturn (p); -\t\tbreak; -\t} - -\t/* -\t * !!! -\t * Not thread-safe, but this is never supposed to happen. -\t */ -\t(void)snprintf(errbuf, sizeof(errbuf), "Unknown error: %d", error); -\treturn (errbuf); +tfile.write('''\t} +\treturn (NULL); +} + +/* + * wiredtiger_strerror -- + *\tReturn a string for any error value, non-thread-safe version. + */ +const char * +wiredtiger_strerror(int error) +{ +\tstatic char buf[128]; +\tconst char *p; + +\t/* Check for a constant string. */ +\tif ((p = __wiredtiger_error(error)) != NULL || +\t (p = __wt_strerror(error)) != NULL) +\t\treturn (p); + +\t/* Else, fill in the non-thread-safe static buffer. */ +\tif (wiredtiger_strerror_r(error, buf, sizeof(buf)) != 0) +\t\t(void)snprintf(buf, sizeof(buf), "error return: %d", error); + +\treturn (buf); +} + +/* + * wiredtiger_strerror_r -- + *\tReturn a string for any error value, thread-safe version. + */ +int +wiredtiger_strerror_r(int error, char *buf, size_t buflen) +{ +\tconst char *p; + +\t/* Require at least 2 bytes, printable character and trailing nul. */ +\tif (buflen < 2) +\t\treturn (ENOMEM); + +\t/* Check for a constant string. 
*/ +\tif ((p = __wiredtiger_error(error)) != NULL || +\t (p = __wt_strerror(error)) != NULL) +\t\treturn (snprintf(buf, buflen, "%s", p) > 0 ? 0 : ENOMEM); + +\treturn (__wt_strerror_r(error, buf, buflen)); } ''') tfile.close() diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index db1114b77de..d3717d27331 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -181,6 +181,7 @@ KV KVS Kanowski's Kounavis +LANGID LEX LF LIBBZ @@ -437,6 +438,7 @@ btmem btree btrees buf +buflen bufs bufsz builtin diff --git a/src/third_party/wiredtiger/dist/s_symbols.list b/src/third_party/wiredtiger/dist/s_symbols.list index d3803bc3afa..8f469e94433 100644 --- a/src/third_party/wiredtiger/dist/s_symbols.list +++ b/src/third_party/wiredtiger/dist/s_symbols.list @@ -8,6 +8,7 @@ wiredtiger_pack_start wiredtiger_pack_str wiredtiger_pack_uint wiredtiger_strerror +wiredtiger_strerror_r wiredtiger_struct_pack wiredtiger_struct_size wiredtiger_struct_unpack diff --git a/src/third_party/wiredtiger/dist/s_tags b/src/third_party/wiredtiger/dist/s_tags index 908b5eb7e0d..faed132d05b 100644 --- a/src/third_party/wiredtiger/dist/s_tags +++ b/src/third_party/wiredtiger/dist/s_tags @@ -35,10 +35,19 @@ ctags $flags ../src/include/*.in ../src/*/*.[chi] 2>/dev/null) rm -f tags ctags $flags ../include/*.in ../*/*.[chi] 2>/dev/null) +# Link the tags file into place if we're at the right level. +link_tag() +{ + if test -e ../include/tags; then + rm -f tags && ln -s ../include/tags . + fi +} + # Link to the tags file from standard build and source directories. dirs="`python -c 'import dist; dist.print_source_dirs()'` ../src/os_win" for i in $dirs; do - if ! expr "$i" : ".*/include" > /dev/null; then - (cd $i && rm -f tags && ln -s ../include/tags .) 
+ if expr "$i" : ".*/include" > /dev/null; then + continue fi + (cd $i && link_tag) done diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index a6a047fd10e..69e8d2ed21e 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -162,10 +162,14 @@ connection_stats = [ 'pages selected for eviction unable to be evicted'), CacheStat('cache_eviction_force', 'pages evicted because they exceeded the in-memory maximum'), + CacheStat('cache_eviction_force_delete', + 'pages evicted because they had chains of deleted items'), CacheStat('cache_eviction_force_fail', 'failed eviction of pages that exceeded the in-memory maximum'), CacheStat('cache_eviction_hazard', 'hazard pointer blocked page eviction'), CacheStat('cache_eviction_internal', 'internal pages evicted'), + CacheStat('cache_eviction_maximum_page_size', + 'maximum page size at eviction', 'max_aggregate,no_scale'), CacheStat('cache_eviction_queue_empty', 'eviction server candidate queue empty when topping up'), CacheStat('cache_eviction_queue_not_empty', diff --git a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java index c9d1c43d32d..c53938d0a58 100644 --- a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java +++ b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java @@ -85,7 +85,6 @@ public class PackFormatInputStream { protected char getType() throws WiredTigerPackingException { if (formatOff >= format.length()) { - System.err.println("Raw format is: " + format); throw new WiredTigerPackingException( "No more fields in format."); } diff --git a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackInputStream.java b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackInputStream.java index 
75bdb3119a9..a49b2e01f17 100644 --- a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackInputStream.java +++ b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackInputStream.java @@ -225,6 +225,7 @@ public class PackInputStream { public String getString() throws WiredTigerPackingException { int stringLength = 0; + int skipnull = 0; format.checkType('S', false); // Get the length for a fixed length string if (format.getType() != 'S') { @@ -235,10 +236,11 @@ public class PackInputStream { // string length. for (; valueOff + stringLength < value.length && value[valueOff + stringLength] != 0; stringLength++) {} + skipnull = 1; } format.consume(); String result = new String(value, valueOff, stringLength); - valueOff += stringLength + 1; + valueOff += stringLength + skipnull; return result; } @@ -250,7 +252,7 @@ public class PackInputStream { private short unpackShort(boolean signed) throws WiredTigerPackingException { long ret = unpackLong(true); - if ((signed && (ret > Short.MAX_VALUE || ret > Short.MIN_VALUE)) || + if ((signed && (ret > Short.MAX_VALUE || ret < Short.MIN_VALUE)) || (!signed && (short)ret < 0)) { throw new WiredTigerPackingException("Overflow unpacking short."); } @@ -265,7 +267,7 @@ public class PackInputStream { private int unpackInt(boolean signed) throws WiredTigerPackingException { long ret = unpackLong(true); - if ((signed && (ret > Integer.MAX_VALUE || ret > Integer.MIN_VALUE)) || + if ((signed && (ret > Integer.MAX_VALUE || ret < Integer.MIN_VALUE)) || (!signed && (int)ret < 0)) { throw new WiredTigerPackingException("Overflow unpacking integer."); } diff --git a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackOutputStream.java b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackOutputStream.java index 60f40564afd..e79b4c63498 100644 --- a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackOutputStream.java +++ 
b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackOutputStream.java @@ -174,13 +174,16 @@ public class PackOutputStream { char fieldFormat = format.getType(); int stringLen = 0; int padBytes = 0; + int valLen = 0; // Strings have two possible encodings. A lower case 's' is not null // terminated, and has a length define in the format (default 1). An // upper case 'S' is variable length and has a null terminator. if (fieldFormat == 's') { stringLen = format.getLengthFromFormat(true); - if (stringLen > value.length()) { - padBytes = stringLen - value.length(); + valLen = value.length(); + if (stringLen > valLen) { + padBytes = stringLen - valLen; + stringLen = valLen; } } else { stringLen = value.length(); diff --git a/src/third_party/wiredtiger/lang/java/wiredtiger.i b/src/third_party/wiredtiger/lang/java/wiredtiger.i index a922a7a6b2e..09290a70c67 100644 --- a/src/third_party/wiredtiger/lang/java/wiredtiger.i +++ b/src/third_party/wiredtiger/lang/java/wiredtiger.i @@ -651,6 +651,19 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler}; return this; } + /** + * Append a record number to the async_op's key. + * + * \param value The value to append + * \return This async_op object, so put calls can be chained. + */ + public AsyncOp putKeyRecord(long value) + throws WiredTigerPackingException { + keyUnpacker = null; + keyPacker.addRecord(value); + return this; + } + /** * Append a short integer to the async_op's key. * @@ -743,6 +756,19 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler}; return this; } + /** + * Append a record number to the async_op's value. + * + * \param value The value to append + * \return This async_op object, so put calls can be chained. + */ + public AsyncOp putValueRecord(long value) + throws WiredTigerPackingException { + valueUnpacker = null; + valuePacker.addRecord(value); + return this; + } + /** * Append a short integer to the async_op's value. 
* @@ -834,6 +860,16 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler}; return getKeyUnpacker().getLong(); } + /** + * Retrieve a record number from the async_op's key. + * + * \return The requested value. + */ + public long getKeyRecord() + throws WiredTigerPackingException { + return getKeyUnpacker().getRecord(); + } + /** * Retrieve a short integer from the async_op's key. * @@ -919,6 +955,16 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler}; return getValueUnpacker().getLong(); } + /** + * Retrieve a record number from the async_op's value. + * + * \return The requested value. + */ + public long getValueRecord() + throws WiredTigerPackingException { + return getValueUnpacker().getRecord(); + } + /** * Retrieve a short integer from the async_op's value. * @@ -1201,6 +1247,18 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler}; return this; } + /** + * Append a record number to the cursor's key. + * + * \param value The value to append + * \return This cursor object, so put calls can be chained. + */ + public Cursor putKeyRecord(long value) + throws WiredTigerPackingException { + keyPacker.addRecord(value); + return this; + } + /** * Append a short integer to the cursor's key. * @@ -1287,6 +1345,18 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler}; return this; } + /** + * Append a record number to the cursor's value. + * + * \param value The value to append + * \return This cursor object, so put calls can be chained. + */ + public Cursor putValueRecord(long value) + throws WiredTigerPackingException { + valuePacker.addRecord(value); + return this; + } + /** * Append a short integer to the cursor's value. * @@ -1376,6 +1446,16 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler}; return keyUnpacker.getLong(); } + /** + * Retrieve a record number from the cursor's key. + * + * \return The requested value. 
+ */ + public long getKeyRecord() + throws WiredTigerPackingException { + return keyUnpacker.getRecord(); + } + /** * Retrieve a short integer from the cursor's key. * @@ -1461,6 +1541,16 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler}; return valueUnpacker.getLong(); } + /** + * Retrieve a record number from the cursor's value. + * + * \return The requested value. + */ + public long getValueRecord() + throws WiredTigerPackingException { + return valueUnpacker.getRecord(); + } + /** * Retrieve a short integer from the cursor's value. * @@ -1801,7 +1891,8 @@ err: if (ret != 0) if ((ret = $self->open_cursor($self, uri, to_dup, config, &cursor)) != 0) goto err; - cursor->flags |= WT_CURSTD_RAW; + if ((cursor->flags & WT_CURSTD_DUMP_JSON) == 0) + cursor->flags |= WT_CURSTD_RAW; if ((ret = __wt_calloc_def((WT_SESSION_IMPL *)cursor->session, 1, &jcb)) != 0) diff --git a/src/third_party/wiredtiger/lang/python/wiredtiger.i b/src/third_party/wiredtiger/lang/python/wiredtiger.i index 974118d0f61..de5afb0a0fa 100644 --- a/src/third_party/wiredtiger/lang/python/wiredtiger.i +++ b/src/third_party/wiredtiger/lang/python/wiredtiger.i @@ -339,7 +339,9 @@ retry: if (result != 0 && result != EBUSY) SWIG_ERROR_IF_NOT_SET(result); else if (result == EBUSY) { + SWIG_PYTHON_THREAD_BEGIN_ALLOW; __wt_sleep(0, 10000); + SWIG_PYTHON_THREAD_END_ALLOW; goto retry; } } @@ -361,8 +363,17 @@ retry: } %enddef -/* Cursor compare can return any of -1, 0, 1 or WT_NOTFOUND. */ +/* Cursor compare can return any of -1, 0, 1. */ %define COMPARE_OK(m) +%exception m { + $action + if (result < -1 || result > 1) + SWIG_ERROR_IF_NOT_SET(result); +} +%enddef + +/* Cursor compare can return any of -1, 0, 1 or WT_NOTFOUND. 
*/ +%define COMPARE_NOTFOUND_OK(m) %exception m { $action if ((result < -1 || result > 1) && result != WT_NOTFOUND) @@ -379,7 +390,7 @@ NOTFOUND_OK(__wt_cursor::search) NOTFOUND_OK(__wt_cursor::update) COMPARE_OK(__wt_cursor::compare) -COMPARE_OK(__wt_cursor::search_near) +COMPARE_NOTFOUND_OK(__wt_cursor::search_near) /* Lastly, some methods need no (additional) error checking. */ %exception __wt_connection::get_home; diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 4de94277364..af9f6a669f2 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -408,11 +408,13 @@ __debug_tree_shape_info(WT_PAGE *page) v = page->memory_footprint; if (v >= WT_GIGABYTE) - snprintf(buf, sizeof(buf), "(%" PRIu64 "G)", v / WT_GIGABYTE); + snprintf(buf, sizeof(buf), + "(%p %" PRIu64 "G)", page, v / WT_GIGABYTE); else if (v >= WT_MEGABYTE) - snprintf(buf, sizeof(buf), "(%" PRIu64 "M)", v / WT_MEGABYTE); + snprintf(buf, sizeof(buf), + "(%p %" PRIu64 "M)", page, v / WT_MEGABYTE); else - snprintf(buf, sizeof(buf), "(%" PRIu64 ")", v); + snprintf(buf, sizeof(buf), "(%p %" PRIu64 ")", page, v); return (buf); } @@ -429,16 +431,16 @@ __debug_tree_shape_worker(WT_DBG *ds, WT_PAGE *page, int level) session = ds->session; if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) { - __dmsg(ds, "%*s" "I" "%s\n", - level, " ", __debug_tree_shape_info(page)); + __dmsg(ds, "%*s" "I" "%d %s\n", + level * 3, " ", level, __debug_tree_shape_info(page)); WT_INTL_FOREACH_BEGIN(session, page, ref) { if (ref->state == WT_REF_MEM) __debug_tree_shape_worker( - ds, ref->page, level + 3); + ds, ref->page, level + 1); } WT_INTL_FOREACH_END; } else - __dmsg(ds, "%*s" "L" "%s\n", - level, " ", __debug_tree_shape_info(page)); + __dmsg(ds, "%*s" "L" " %s\n", + level * 3, " ", __debug_tree_shape_info(page)); } /* @@ -458,8 +460,7 @@ __wt_debug_tree_shape( if (page == NULL) page = 
S2BT(session)->root.page; - WT_WITH_PAGE_INDEX(session, - __debug_tree_shape_worker(ds, page, 0)); + WT_WITH_PAGE_INDEX(session, __debug_tree_shape_worker(ds, page, 1)); __dmsg_wrapup(ds); return (0); diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index c97ea176c97..622dfb1b294 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -207,6 +207,9 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) { int skip; + if (ref->state != WT_REF_DELETED) + return (0); + /* * Deleted pages come from two sources: either it's a fast-delete as * described above, or the page has been emptied by other operations @@ -225,11 +228,14 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) * the page could switch to an in-memory state at any time. Lock down * the structure, just to be safe. */ + if (ref->page_del == NULL) + return (1); + if (!WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED)) return (0); - skip = ref->page_del == NULL || - __wt_txn_visible(session, ref->page_del->txnid) ? 1 : 0; + skip = (ref->page_del == NULL || + __wt_txn_visible(session, ref->page_del->txnid)); WT_PUBLISH(ref->state, WT_REF_DELETED); return (skip); diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 181ffdb3736..561e1c19218 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -37,8 +37,11 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page) page->type != WT_PAGE_ROW_LEAF) return (0); - /* Eviction may be turned off, although that's rare. */ - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) + /* + * Eviction may be turned off (although that's rare), or we may be in + * the middle of a checkpoint. 
+ */ + if (F_ISSET(btree, WT_BTREE_NO_EVICTION) || btree->checkpointing) return (0); /* @@ -128,7 +131,13 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags force_attempts < 10 && __evict_force_check(session, page)) { ++force_attempts; - WT_RET(__wt_page_release(session, ref, flags)); + if ((ret = __wt_page_release_busy( + session, ref, flags)) == EBUSY) { + /* If forced eviction fails, stall. */ + ret = 0; + wait_cnt += 1000; + } else + WT_RET(ret); WT_STAT_FAST_CONN_INCR( session, page_forcible_evict_blocked); break; diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 1c62391f722..70d0758dede 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -8,15 +8,6 @@ #include "wt_internal.h" -/* - * Tuning; global variables to allow the binary to be patched, we don't yet have - * any real understanding of what might be useful to surface to applications. - */ -static u_int __split_deepen_max_internal_image = 100; -static u_int __split_deepen_min_child = 10; -static u_int __split_deepen_per_child = 100; -static u_int __split_deepen_split_child = 100; - /* * Track allocation increments, matching the cache calculations, which add an * estimate of allocation overhead to every object. @@ -176,46 +167,58 @@ __split_safe_free(WT_SESSION_IMPL *session, int exclusive, void *p, size_t s) return (__split_stash_add(session, p, s)); } +/* + * Tuning; global variables to allow the binary to be patched, we don't yet have + * any real understanding of what might be useful to surface to applications. + */ +static u_int __split_deepen_min_child = 10000; +static u_int __split_deepen_per_child = 100; + /* * __split_should_deepen -- * Return if we should deepen the tree. 
*/ static int -__split_should_deepen(WT_SESSION_IMPL *session, WT_PAGE *page) +__split_should_deepen( + WT_SESSION_IMPL *session, WT_REF *ref, uint32_t *childrenp) { WT_PAGE_INDEX *pindex; + WT_PAGE *page; - /* - * Splits are based on either the number of child pages that will be - * created by the split (splitting an internal page that will be slow - * to search), or by the memory footprint of the parent page (avoiding - * an internal page that will eat up all of the cache and put eviction - * pressure on the system). - */ + *childrenp = 0; + + page = ref->page; pindex = WT_INTL_INDEX_COPY(page); /* * Deepen the tree if the page's memory footprint is larger than the - * maximum size for a page in memory. We need an absolute minimum - * number of entries in order to split the page: if there is a single - * huge key, splitting won't help. + * maximum size for a page in memory (presumably putting eviction + * pressure on the cache). */ - if (page->memory_footprint > S2BT(session)->maxmempage && - pindex->entries >= __split_deepen_min_child) - return (1); + if (page->memory_footprint < S2BT(session)->maxmempage) + return (0); /* - * Deepen the tree if the page's memory footprint is at least N - * times the maximum internal page size chunk in the backing file and - * the split will result in at least N children in the newly created - * intermediate layer. + * Ensure the page has enough entries to make it worth splitting and + * we get a significant payback (in the case of a set of large keys, + * splitting won't help). 
*/ - if (page->memory_footprint > - __split_deepen_max_internal_image * S2BT(session)->maxintlpage && - pindex->entries >= - (__split_deepen_per_child * __split_deepen_split_child)) + if (pindex->entries > __split_deepen_min_child) { + *childrenp = pindex->entries / __split_deepen_per_child; return (1); + } + /* + * The root is a special-case: if it's putting cache pressure on the + * system, split it even if there are only a few entries, we can't + * push it out of memory. Sanity check: if the root page is too big + * with less than 100 keys, there are huge keys and/or a too-small + * cache, there's not much to do. + */ + if (__wt_ref_is_root(ref) && pindex->entries > 100) { + *childrenp = pindex->entries / 10; + return (1); + } return (0); } @@ -383,7 +386,7 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) * Split an internal page in-memory, deepening the tree. */ static int -__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) +__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) { WT_DECL_RET; WT_PAGE *child; @@ -391,7 +394,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) WT_REF **alloc_refp; WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref; size_t child_incr, parent_decr, parent_incr, size; - uint32_t children, chunk, i, j, remain, slots; + uint32_t chunk, i, j, remain, slots; int panic; void *p; @@ -401,13 +404,6 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) pindex = WT_INTL_INDEX_COPY(parent); - /* - * Create N children, unless we are dealing with a large page without - * many entries, in which case split into the minimum number of pages. 
- */ - children = WT_MAX(pindex->entries / __split_deepen_per_child, - __split_deepen_min_child); - WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children", @@ -717,10 +713,11 @@ __split_multi_inmem( /* * We modified the page above, which will have set the first dirty * transaction to the last transaction current running. However, the - * updates we installed may be older than that. Inherit the first - * dirty transaction from the original page. + * updates we installed may be older than that. Set the first dirty + * transaction to an impossibly old value so this page is never skipped + * in a checkpoint. */ - page->modify->first_dirty_txn = orig->modify->first_dirty_txn; + page->modify->first_dirty_txn = WT_TXN_FIRST; err: /* Free any resources that may have been cached in the cursor. */ WT_TRET(__wt_btcur_close(&cbt)); @@ -813,17 +810,20 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, int exclusive, int ref_discard) { WT_DECL_RET; + WT_IKEY *ikey; WT_PAGE *parent; WT_PAGE_INDEX *alloc_index, *pindex; - WT_REF **alloc_refp, *parent_ref; + WT_REF **alloc_refp, *next_ref, *parent_ref; size_t size; - uint32_t i, j, parent_entries, result_entries; + uint32_t children, i, j; + uint32_t deleted_entries, parent_entries, result_entries; int complete, hazard, locked; parent = NULL; /* -Wconditional-uninitialized */ - alloc_index = NULL; + alloc_index = pindex = NULL; parent_ref = NULL; complete = hazard = locked = 0; + parent_entries = 0; /* * Get a page-level lock on the parent to single-thread splits into the @@ -864,7 +864,29 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, pindex = WT_INTL_INDEX_COPY(parent); parent_entries = pindex->entries; - result_entries = (parent_entries - 1) + new_entries; + + /* + * Remove any refs to deleted pages while we are splitting, we have + * the internal page locked down, and 
are copying the refs into a new + * array anyway. Switch them to the special split state, so that any + * reading thread will restart. + */ + for (i = 0, deleted_entries = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); + if (next_ref->state == WT_REF_DELETED && + next_ref->page_del == NULL && + WT_ATOMIC_CAS4(next_ref->state, + WT_REF_DELETED, WT_REF_SPLIT)) + deleted_entries++; + } + + /* + * The final entry count consists of: The original count, plus any + * new pages, less any refs we are removing because they only + * contained deleted items, less 1 for the page being replaced. + */ + result_entries = (parent_entries + new_entries) - (deleted_entries + 1); /* * Allocate and initialize a new page index array for the parent, then @@ -876,8 +898,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_MEMSIZE_ADD(parent_incr, size); alloc_index->index = (WT_REF **)(alloc_index + 1); alloc_index->entries = result_entries; - for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) - if (pindex->index[i] == ref) + for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + if (next_ref == ref) for (j = 0; j < new_entries; ++j) { ref_new[j]->home = parent; *alloc_refp++ = ref_new[j]; @@ -889,8 +912,26 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, */ ref_new[j] = NULL; } - else - *alloc_refp++ = pindex->index[i]; + else if (next_ref->state == WT_REF_SPLIT) { + /* + * We're discarding a deleted reference. + * Free any resources it holds. 
+ */ + if (parent->type == WT_PAGE_ROW_INT) { + WT_TRET(__split_ovfl_key_cleanup( + session, parent, next_ref)); + ikey = __wt_ref_key_instantiated(next_ref); + if (ikey != NULL) + WT_TRET(__split_safe_free(session, 0, + ikey, + sizeof(WT_IKEY) + ikey->size)); + } + + WT_TRET(__split_safe_free( + session, 0, next_ref, sizeof(WT_REF))); + } else + *alloc_refp++ = next_ref; + } /* * Update the parent page's index: this update makes the split visible @@ -977,11 +1018,30 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * Do the check here because we've just grown the parent page and * are holding it locked. */ - if (ret == 0 && !exclusive && __split_should_deepen(session, parent)) + if (ret == 0 && !exclusive && + !F_ISSET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN) && + __split_should_deepen(session, parent_ref, &children)) { + /* + * XXX + * Temporary hack to avoid a bug where the root page is split + * even when it's no longer doing any good. + */ + uint64_t __a, __b; + __a = parent->memory_footprint; WT_WITH_PAGE_INDEX(session, - ret = __split_deepen(session, parent)); + ret = __split_deepen(session, parent, children)); + __b = parent->memory_footprint; + if (__b * 2 >= __a) + F_SET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN); + } -err: if (locked) +err: if (!complete) + for (i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + if (next_ref->state == WT_REF_SPLIT) + next_ref->state = WT_REF_DELETED; + } + if (locked) F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING); if (hazard) @@ -1137,10 +1197,11 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) /* * We modified the page above, which will have set the first dirty * transaction to the last transaction current running. However, the - * updates we installed may be older than that. Inherit the first - * dirty transaction from the original page. + * updates we installed may be older than that. 
Set the first dirty + * transaction to an impossibly old value so this page is never skipped + * in a checkpoint. */ - right->modify->first_dirty_txn = page->modify->first_dirty_txn; + right->modify->first_dirty_txn = WT_TXN_FIRST; /* * Calculate how much memory we're moving: figure out how deep the skip diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index c74a7177401..a2b2a6bb7c8 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -207,6 +207,12 @@ restart: /* ref->state != WT_REF_MEM) break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { + /* + * Avoid pulling a deleted page back in to try + * to delete it again. + */ + if (__wt_delete_page_skip(session, ref)) + break; /* * If deleting a range, try to delete the page * without instantiating it. @@ -242,8 +248,7 @@ restart: /* * If iterating a cursor, try to skip deleted * pages that are visible to us. */ - if (ref->state == WT_REF_DELETED && - __wt_delete_page_skip(session, ref)) + if (__wt_delete_page_skip(session, ref)) break; } diff --git a/src/third_party/wiredtiger/src/conn/api_strerror.c b/src/third_party/wiredtiger/src/conn/api_strerror.c index caf536b24f7..396ae7a3e0f 100644 --- a/src/third_party/wiredtiger/src/conn/api_strerror.c +++ b/src/third_party/wiredtiger/src/conn/api_strerror.c @@ -3,18 +3,22 @@ #include "wt_internal.h" /* - * wiredtiger_strerror -- - * Return a string for any error value. + * Historically, there was only the wiredtiger_strerror call because the POSIX + * port didn't need anything more complex; Windows requires memory allocation + * of error strings, so we added the wiredtiger_strerror_r call. Because we + * want wiredtiger_strerror to continue to be as thread-safe as possible, errors + * are split into three categories: WiredTiger constant strings, system constant + * strings and Everything Else, and we check constant strings before Everything + * Else. 
*/ -const char * -wiredtiger_strerror(int error) -{ - static char errbuf[64]; - char *p; - - if (error == 0) - return ("Successful return: 0"); +/* + * __wiredtiger_error -- + * Return a constant string for the WiredTiger errors. + */ +static const char * +__wiredtiger_error(int error) +{ switch (error) { case WT_ROLLBACK: return ("WT_ROLLBACK: conflict between concurrent operations"); @@ -28,16 +32,49 @@ wiredtiger_strerror(int error) return ("WT_PANIC: WiredTiger library panic"); case WT_RESTART: return ("WT_RESTART: restart the operation (internal)"); - default: - if (error > 0 && (p = strerror(error)) != NULL) - return (p); - break; } + return (NULL); +} + +/* + * wiredtiger_strerror -- + * Return a string for any error value, non-thread-safe version. + */ +const char * +wiredtiger_strerror(int error) +{ + static char buf[128]; + const char *p; + + /* Check for a constant string. */ + if ((p = __wiredtiger_error(error)) != NULL || + (p = __wt_strerror(error)) != NULL) + return (p); + + /* Else, fill in the non-thread-safe static buffer. */ + if (wiredtiger_strerror_r(error, buf, sizeof(buf)) != 0) + (void)snprintf(buf, sizeof(buf), "error return: %d", error); + + return (buf); +} + +/* + * wiredtiger_strerror_r -- + * Return a string for any error value, thread-safe version. + */ +int +wiredtiger_strerror_r(int error, char *buf, size_t buflen) +{ + const char *p; + + /* Require at least 2 bytes, printable character and trailing nul. */ + if (buflen < 2) + return (ENOMEM); + + /* Check for a constant string. */ + if ((p = __wiredtiger_error(error)) != NULL || + (p = __wt_strerror(error)) != NULL) + return (snprintf(buf, buflen, "%s", p) > 0 ? 0 : ENOMEM); - /* - * !!! - * Not thread-safe, but this is never supposed to happen. 
- */ - (void)snprintf(errbuf, sizeof(errbuf), "Unknown error: %d", error); - return (errbuf); + return (__wt_strerror_r(error, buf, buflen)); } diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c index cf129531dd4..91f82a5105b 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache.c @@ -144,6 +144,8 @@ __wt_cache_stats_update(WT_SESSION_IMPL *session) WT_STAT_SET(stats, cache_bytes_inuse, __wt_cache_bytes_inuse(cache)); WT_STAT_SET(stats, cache_pages_inuse, __wt_cache_pages_inuse(cache)); WT_STAT_SET(stats, cache_bytes_dirty, cache->bytes_dirty); + WT_STAT_SET(stats, + cache_eviction_maximum_page_size, cache->evict_max_page_size); WT_STAT_SET(stats, cache_pages_dirty, cache->pages_dirty); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c index c8b8f6c4547..2cb791de85d 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_ds.c +++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c @@ -478,7 +478,6 @@ __wt_curds_open( cursor = &data_source->iface; *cursor = iface; cursor->session = &session->iface; - F_SET(cursor, WT_CURSTD_DATA_SOURCE); /* * XXX diff --git a/src/third_party/wiredtiger/src/cursor/cur_log.c b/src/third_party/wiredtiger/src/cursor/cur_log.c index 04ab1e2a14a..e3089e9fb83 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_log.c +++ b/src/third_party/wiredtiger/src/cursor/cur_log.c @@ -150,6 +150,7 @@ static int __curlog_kv(WT_SESSION_IMPL *session, WT_CURSOR *cursor) { WT_CURSOR_LOG *cl; + WT_ITEM item; uint32_t fileid, key_count, opsize, optype; cl = (WT_CURSOR_LOG *)cursor; @@ -180,11 +181,37 @@ __curlog_kv(WT_SESSION_IMPL *session, WT_CURSOR *cursor) * The log cursor sets the LSN and step count as the cursor key and * and log record related data in the value. The data in the value * contains any operation key/value that was in the log record. 
+ * For the special case that the caller needs the result in raw form, + * we create packed versions of the key/value. */ - __wt_cursor_set_key(cursor, cl->cur_lsn->file, cl->cur_lsn->offset, - key_count); - __wt_cursor_set_value(cursor, cl->txnid, cl->rectype, optype, - fileid, cl->opkey, cl->opvalue); + if (FLD_ISSET(cursor->flags, WT_CURSTD_RAW)) { + memset(&item, 0, sizeof(item)); + WT_RET(wiredtiger_struct_size((WT_SESSION *)session, + &item.size, LOGC_KEY_FORMAT, cl->cur_lsn->file, + cl->cur_lsn->offset, key_count)); + WT_RET(__wt_realloc(session, NULL, item.size, &cl->packed_key)); + item.data = cl->packed_key; + WT_RET(wiredtiger_struct_pack((WT_SESSION *)session, + cl->packed_key, item.size, LOGC_KEY_FORMAT, + cl->cur_lsn->file, cl->cur_lsn->offset, key_count)); + __wt_cursor_set_key(cursor, &item); + + WT_RET(wiredtiger_struct_size((WT_SESSION *)session, + &item.size, LOGC_VALUE_FORMAT, cl->txnid, cl->rectype, + optype, fileid, cl->opkey, cl->opvalue)); + WT_RET(__wt_realloc(session, NULL, item.size, + &cl->packed_value)); + item.data = cl->packed_value; + WT_RET(wiredtiger_struct_pack((WT_SESSION *)session, + cl->packed_value, item.size, LOGC_VALUE_FORMAT, cl->txnid, + cl->rectype, optype, fileid, cl->opkey, cl->opvalue)); + __wt_cursor_set_value(cursor, &item); + } else { + __wt_cursor_set_key(cursor, cl->cur_lsn->file, + cl->cur_lsn->offset, key_count); + __wt_cursor_set_value(cursor, cl->txnid, cl->rectype, optype, + fileid, cl->opkey, cl->opvalue); + } return (0); } @@ -295,6 +322,8 @@ __curlog_close(WT_CURSOR *cursor) __wt_scr_free(session, &cl->logrec); __wt_scr_free(session, &cl->opkey); __wt_scr_free(session, &cl->opvalue); + __wt_free(session, cl->packed_key); + __wt_free(session, cl->packed_value); WT_TRET(__wt_cursor_close(cursor)); err: API_END_RET(session, ret); diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 60a5f82f233..a4ae0aaf55b 100644 --- 
a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -437,7 +437,7 @@ __evict_pass(WT_SESSION_IMPL *session) WT_EVICT_WORKER *worker; int loop; uint32_t flags; - uint64_t bytes_inuse, pages_evicted; + uint64_t bytes_inuse, dirty_target_size, pages_evicted, target_size; conn = S2C(session); cache = conn->cache; @@ -465,9 +465,16 @@ __evict_pass(WT_SESSION_IMPL *session) if (loop > 10) LF_SET(WT_EVICT_PASS_AGGRESSIVE); - /* Start a worker if we have capacity and the cache is full. */ + /* + * Start a worker if we have capacity and we haven't reached + * the eviction targets. + */ bytes_inuse = __wt_cache_bytes_inuse(cache); - if (bytes_inuse > conn->cache_size && + target_size = (conn->cache_size * cache->eviction_target) / 100; + dirty_target_size = + (conn->cache_size * cache->eviction_dirty_target) / 100; + if ((bytes_inuse > target_size || + cache->bytes_dirty > dirty_target_size) && conn->evict_workers < conn->evict_workers_max) { WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, "Starting evict worker: %"PRIu32"\n", diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index aca3dc11ee9..b3a6f718ca2 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -21,12 +21,15 @@ static void __evict_excl_clear(WT_SESSION_IMPL *); int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) { + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_TXN_STATE *txn_state; int forced_eviction, inmem_split, istree; + conn = S2C(session); + page = ref->page; forced_eviction = (page->read_gen == WT_READGEN_OLDEST); inmem_split = istree = 0; @@ -40,7 +43,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) */ txn_state = WT_SESSION_TXN_STATE(session); if (txn_state->snap_min == WT_TXN_NONE) - txn_state->snap_min = S2C(session)->txn_global.oldest_id; + 
txn_state->snap_min = conn->txn_global.oldest_id; else txn_state = NULL; @@ -75,6 +78,14 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) WT_STAT_FAST_DATA_INCR(session, cache_eviction_internal); } + /* + * Track the largest page size seen at eviction, it tells us something + * about our ability to force pages out before they're larger than the + * cache. + */ + if (page->memory_footprint > conn->cache->evict_max_page_size) + conn->cache->evict_max_page_size = page->memory_footprint; + /* Discard any subtree rooted in this page. */ if (istree) WT_WITH_PAGE_INDEX(session, @@ -119,8 +130,8 @@ done: session->excl_next = 0; txn_state->snap_min = WT_TXN_NONE; if ((inmem_split || (forced_eviction && ret == EBUSY)) && - !F_ISSET(S2C(session)->cache, WT_EVICT_WOULD_BLOCK)) { - F_SET(S2C(session)->cache, WT_EVICT_WOULD_BLOCK); + !F_ISSET(conn->cache, WT_EVICT_WOULD_BLOCK)) { + F_SET(conn->cache, WT_EVICT_WOULD_BLOCK); WT_TRET(__wt_evict_server_wake(session)); } diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index e1fc72677c5..dd10e522412 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -550,9 +550,10 @@ struct __wt_page { #define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */ #define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ -#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */ -#define WT_PAGE_SPLITTING 0x20 /* An internal page is growing */ +#define WT_PAGE_REFUSE_DEEPEN 0x10 /* Don't deepen the tree at this page */ +#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */ #define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */ +#define WT_PAGE_SPLITTING 0x80 /* An internal page is growing */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ }; diff --git 
a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index a333e4af565..d30ee46486a 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -164,65 +164,6 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) (void)WT_ATOMIC_ADD8(cache->pages_evict, 1); } -/* - * __wt_cache_read_gen -- - * Get the current read generation number. - */ -static inline uint64_t -__wt_cache_read_gen(WT_SESSION_IMPL *session) -{ - return (S2C(session)->cache->read_gen); -} - -/* - * __wt_cache_read_gen_incr -- - * Increment the current read generation number. - */ -static inline void -__wt_cache_read_gen_incr(WT_SESSION_IMPL *session) -{ - ++S2C(session)->cache->read_gen; -} - -/* - * __wt_cache_read_gen_set -- - * Get the read generation to store in a page. - */ -static inline uint64_t -__wt_cache_read_gen_set(WT_SESSION_IMPL *session) -{ - /* - * We return read-generations from the future (where "the future" is - * measured by increments of the global read generation). The reason - * is because when acquiring a new hazard pointer for a page, we can - * check its read generation, and if the read generation isn't less - * than the current global generation, we don't bother updating the - * page. In other words, the goal is to avoid some number of updates - * immediately after each update we have to make. - */ - return (__wt_cache_read_gen(session) + WT_READGEN_STEP); -} - -/* - * __wt_cache_pages_inuse -- - * Return the number of pages in use. - */ -static inline uint64_t -__wt_cache_pages_inuse(WT_CACHE *cache) -{ - return (cache->pages_inmem - cache->pages_evict); -} - -/* - * __wt_cache_bytes_inuse -- - * Return the number of bytes in use. - */ -static inline uint64_t -__wt_cache_bytes_inuse(WT_CACHE *cache) -{ - return (cache->bytes_inmem - cache->bytes_evict); -} - /* * __wt_page_evict_soon -- * Set a page to be evicted as soon as possible. 
@@ -917,16 +858,16 @@ __wt_ref_info(WT_SESSION_IMPL *session, } /* - * __wt_page_release -- - * Release a reference to a page. + * __wt_page_release_busy -- + * Release a reference to a page, fail if busy during forced eviction. */ static inline int -__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) +__wt_page_release_busy(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; - int locked; + int locked, too_big; btree = S2BT(session); @@ -938,6 +879,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) return (0); page = ref->page; + too_big = (page->memory_footprint < btree->maxmempage) ? 0 : 1; + /* * Attempt to evict pages with the special "oldest" read generation. * @@ -970,18 +913,36 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) return (ret); (void)WT_ATOMIC_ADD4(btree->evict_busy, 1); - if ((ret = __wt_evict_page(session, ref)) == 0) - WT_STAT_FAST_CONN_INCR(session, cache_eviction_force); - else { + if ((ret = __wt_evict_page(session, ref)) == 0) { + if (too_big) + WT_STAT_FAST_CONN_INCR(session, cache_eviction_force); + else + /* + * If the page isn't too big, we are evicting it because + * it had a chain of deleted entries that make traversal + * expensive. + */ + WT_STAT_FAST_CONN_INCR( + session, cache_eviction_force_delete); + } else { WT_STAT_FAST_CONN_INCR(session, cache_eviction_force_fail); - if (ret == EBUSY) - ret = 0; } (void)WT_ATOMIC_SUB4(btree->evict_busy, 1); return (ret); } +/* + * __wt_page_release -- + * Release a reference to a page. 
+ */ +static inline int +__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) +{ + WT_RET_BUSY_OK(__wt_page_release_busy(session, ref, flags)); + return (0); +} + /* * __wt_page_swap_func -- * Swap one page's hazard pointer for another one when hazard pointer diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index 75219e5b413..deccd676e26 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -61,6 +61,8 @@ struct __wt_cache { uint64_t bytes_dirty; /* Bytes/pages currently dirty */ uint64_t pages_dirty; + uint64_t evict_max_page_size; /* Largest page seen at eviction */ + /* * Read information. */ diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i index b997781272a..ee969255241 100644 --- a/src/third_party/wiredtiger/src/include/cache.i +++ b/src/third_party/wiredtiger/src/include/cache.i @@ -6,6 +6,65 @@ * See the file LICENSE for redistribution information. */ +/* + * __wt_cache_read_gen -- + * Get the current read generation number. + */ +static inline uint64_t +__wt_cache_read_gen(WT_SESSION_IMPL *session) +{ + return (S2C(session)->cache->read_gen); +} + +/* + * __wt_cache_read_gen_incr -- + * Increment the current read generation number. + */ +static inline void +__wt_cache_read_gen_incr(WT_SESSION_IMPL *session) +{ + ++S2C(session)->cache->read_gen; +} + +/* + * __wt_cache_read_gen_set -- + * Get the read generation to store in a page. + */ +static inline uint64_t +__wt_cache_read_gen_set(WT_SESSION_IMPL *session) +{ + /* + * We return read-generations from the future (where "the future" is + * measured by increments of the global read generation). 
The reason + * is because when acquiring a new hazard pointer for a page, we can + * check its read generation, and if the read generation isn't less + * than the current global generation, we don't bother updating the + * page. In other words, the goal is to avoid some number of updates + * immediately after each update we have to make. + */ + return (__wt_cache_read_gen(session) + WT_READGEN_STEP); +} + +/* + * __wt_cache_pages_inuse -- + * Return the number of pages in use. + */ +static inline uint64_t +__wt_cache_pages_inuse(WT_CACHE *cache) +{ + return (cache->pages_inmem - cache->pages_evict); +} + +/* + * __wt_cache_bytes_inuse -- + * Return the number of bytes in use. + */ +static inline uint64_t +__wt_cache_bytes_inuse(WT_CACHE *cache) +{ + return (cache->bytes_inmem - cache->bytes_evict); +} + /* * __wt_eviction_check -- * Wake the eviction server if necessary. diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index 0fc4b883a16..e46c1f7de1b 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -264,6 +264,8 @@ struct __wt_cursor_log { WT_ITEM *logrec; /* Copy of record for cursor */ WT_ITEM *opkey, *opvalue; /* Op key/value copy */ const uint8_t *stepp, *stepp_end; /* Pointer within record */ + uint8_t *packed_key; /* Packed key for 'raw' interface */ + uint8_t *packed_value; /* Packed value for 'raw' interface */ uint32_t step_count; /* Intra-record count */ uint32_t rectype; /* Record type */ uint64_t txnid; /* Record txnid */ diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index ae6aafdd638..8fa9790e096 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -176,11 +176,23 @@ static inline int __cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter) { WT_SESSION_IMPL *session; + WT_TXN *txn; session = 
(WT_SESSION_IMPL *)cbt->iface.session; + txn = &session->txn; if (reenter) WT_RET(__curfile_leave(cbt)); + + /* + * If there is no transaction active in this thread and we haven't + * checked if the cache is full, do it now. If we have to block for + * eviction, this is the best time to do it. + */ + if (F_ISSET(txn, TXN_RUNNING) && + !F_ISSET(txn, TXN_HAS_ID) && !F_ISSET(txn, TXN_HAS_SNAPSHOT)) + WT_RET(__wt_cache_full_check(session)); + if (!F_ISSET(cbt, WT_CBT_ACTIVE)) WT_RET(__curfile_enter(cbt)); __wt_txn_cursor_op(session); diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index b80719de7c0..d8ed3f5cef1 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -432,6 +432,8 @@ extern int __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp extern int __wt_dlsym(WT_SESSION_IMPL *session, WT_DLH *dlh, const char *name, int fail, void *sym_ret); extern int __wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh); extern int __wt_errno(void); +extern const char *__wt_strerror(int error); +extern int __wt_strerror_r(int error, char *buf, size_t buflen); extern int __wt_exist(WT_SESSION_IMPL *session, const char *filename, int *existp); extern void __wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh); extern int __wt_fallocate( WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len); diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 53a4ce3af4a..6efb9970065 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -164,9 +164,11 @@ struct __wt_connection_stats { WT_STATS cache_eviction_dirty; WT_STATS cache_eviction_fail; WT_STATS cache_eviction_force; + WT_STATS cache_eviction_force_delete; WT_STATS cache_eviction_force_fail; WT_STATS cache_eviction_hazard; WT_STATS cache_eviction_internal; + 
WT_STATS cache_eviction_maximum_page_size; WT_STATS cache_eviction_queue_empty; WT_STATS cache_eviction_queue_not_empty; WT_STATS cache_eviction_server_evicting; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 36cb10c30d0..8380e55effb 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -7,6 +7,7 @@ */ #define WT_TXN_NONE 0 /* No txn running in a session. */ +#define WT_TXN_FIRST 1 /* First transaction to run. */ #define WT_TXN_ABORTED UINT64_MAX /* Update rolled back, ignore. */ /* diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 745a8f75a99..656181790ed 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -227,6 +227,16 @@ __wt_txn_id_check(WT_SESSION_IMPL *session) txn = &session->txn; WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING)); + + /* + * If there is no transaction active in this thread and we haven't + * checked if the cache is full, do it now. If we have to block for + * eviction, this is the best time to do it. 
+ */ + if (F_ISSET(txn, TXN_RUNNING) && + !F_ISSET(txn, TXN_HAS_ID) && !F_ISSET(txn, TXN_HAS_SNAPSHOT)) + WT_RET(__wt_cache_full_check(session)); + if (!F_ISSET(txn, TXN_HAS_ID)) { conn = S2C(session); txn_global = &conn->txn_global; diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index ee9c58e4278..91eb41af4f3 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -525,18 +525,17 @@ struct __wt_cursor { #define WT_CURSTD_APPEND 0x0001 #define WT_CURSTD_BULK 0x0002 -#define WT_CURSTD_DATA_SOURCE 0x0004 -#define WT_CURSTD_DUMP_HEX 0x0008 -#define WT_CURSTD_DUMP_JSON 0x0010 -#define WT_CURSTD_DUMP_PRINT 0x0020 -#define WT_CURSTD_KEY_EXT 0x0040 /* Key points out of the tree. */ -#define WT_CURSTD_KEY_INT 0x0080 /* Key points into the tree. */ +#define WT_CURSTD_DUMP_HEX 0x0004 +#define WT_CURSTD_DUMP_JSON 0x0008 +#define WT_CURSTD_DUMP_PRINT 0x0010 +#define WT_CURSTD_KEY_EXT 0x0020 /* Key points out of the tree. */ +#define WT_CURSTD_KEY_INT 0x0040 /* Key points into the tree. */ #define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT) -#define WT_CURSTD_OPEN 0x0100 -#define WT_CURSTD_OVERWRITE 0x0200 -#define WT_CURSTD_RAW 0x0400 -#define WT_CURSTD_VALUE_EXT 0x0800 /* Value points out of the tree. */ -#define WT_CURSTD_VALUE_INT 0x1000 /* Value points into the tree. */ +#define WT_CURSTD_OPEN 0x0080 +#define WT_CURSTD_OVERWRITE 0x0100 +#define WT_CURSTD_RAW 0x0200 +#define WT_CURSTD_VALUE_EXT 0x0400 /* Value points out of the tree. */ +#define WT_CURSTD_VALUE_INT 0x0800 /* Value points into the tree. */ #define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT) uint32_t flags; #endif @@ -2020,15 +2019,26 @@ int wiredtiger_open(const char *home, WT_CONNECTION **connectionp); /*! 
- * Return information about an error as a string; wiredtiger_strerror is a - * superset of the ISO C99/POSIX 1003.1-2001 function strerror. + * Return information about a WiredTiger error as a string, not thread-safe. * * @snippet ex_all.c Display an error * - * @param err a return value from a WiredTiger, C library or POSIX function + * @param error a return value from a WiredTiger call * @returns a string representation of the error */ -const char *wiredtiger_strerror(int err); +const char *wiredtiger_strerror(int error); + +/*! + * Return information about a WiredTiger error as a string, thread-safe version. + * + * @snippet ex_all.c Display an error thread safe + * + * @param error a return value from a WiredTiger call + * @param buf a buffer of at least \c buflen bytes + * @param buflen the length of the buffer + * @returns zero for success, non-zero to indicate an error. + */ +int wiredtiger_strerror_r(int error, char *buf, size_t buflen); #if !defined(SWIG) /*! @@ -3155,204 +3165,208 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_EVICTION_FAIL 1030 /*! cache: pages evicted because they exceeded the in-memory maximum */ #define WT_STAT_CONN_CACHE_EVICTION_FORCE 1031 +/*! cache: pages evicted because they had chains of deleted items */ +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1032 /*! cache: failed eviction of pages that exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1032 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1033 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1033 +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1034 /*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1034 +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1035 +/*! cache: maximum page size at eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1036 /*! 
cache: eviction server candidate queue empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1035 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1037 /*! cache: eviction server candidate queue not empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1036 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1038 /*! cache: eviction server evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1037 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1039 /*! cache: eviction server populating queue, but not evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1038 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1040 /*! cache: eviction server unable to reach eviction goal */ -#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1039 +#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1041 /*! cache: pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1040 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1042 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1041 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1043 /*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1042 +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1044 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1043 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1045 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1044 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1046 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1045 +#define WT_STAT_CONN_CACHE_READ 1047 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1046 +#define WT_STAT_CONN_CACHE_WRITE 1048 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1047 +#define WT_STAT_CONN_COND_WAIT 1049 /*! 
cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1048 +#define WT_STAT_CONN_CURSOR_CREATE 1050 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1049 +#define WT_STAT_CONN_CURSOR_INSERT 1051 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1050 +#define WT_STAT_CONN_CURSOR_NEXT 1052 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1051 +#define WT_STAT_CONN_CURSOR_PREV 1053 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1052 +#define WT_STAT_CONN_CURSOR_REMOVE 1054 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1053 +#define WT_STAT_CONN_CURSOR_RESET 1055 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1054 +#define WT_STAT_CONN_CURSOR_SEARCH 1056 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1055 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1057 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1056 +#define WT_STAT_CONN_CURSOR_UPDATE 1058 /*! data-handle: connection dhandles swept */ -#define WT_STAT_CONN_DH_CONN_HANDLES 1057 +#define WT_STAT_CONN_DH_CONN_HANDLES 1059 /*! data-handle: connection candidate referenced */ -#define WT_STAT_CONN_DH_CONN_REF 1058 +#define WT_STAT_CONN_DH_CONN_REF 1060 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_CONN_SWEEPS 1059 +#define WT_STAT_CONN_DH_CONN_SWEEPS 1061 /*! data-handle: connection time-of-death sets */ -#define WT_STAT_CONN_DH_CONN_TOD 1060 +#define WT_STAT_CONN_DH_CONN_TOD 1062 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1061 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1063 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1062 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1064 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1063 +#define WT_STAT_CONN_FILE_OPEN 1065 /*! 
log: log buffer size increases */ -#define WT_STAT_CONN_LOG_BUFFER_GROW 1064 +#define WT_STAT_CONN_LOG_BUFFER_GROW 1066 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1065 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1067 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1066 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1068 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1067 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1069 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1068 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1070 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1069 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1071 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1070 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1072 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1071 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1073 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1072 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1074 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1073 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1075 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1074 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1076 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1075 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1077 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1076 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1078 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1077 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1079 /*! log: log read operations */ -#define WT_STAT_CONN_LOG_READS 1078 +#define WT_STAT_CONN_LOG_READS 1080 /*! 
log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1079 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1081 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1080 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1082 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1081 +#define WT_STAT_CONN_LOG_SCANS 1083 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1082 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1084 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1083 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1085 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1084 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1086 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1085 +#define WT_STAT_CONN_LOG_SLOT_RACES 1087 /*! log: slots selected for switching that were unavailable */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1086 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1088 /*! log: record size exceeded maximum */ -#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1087 +#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1089 /*! log: failed to find a slot large enough for record */ -#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1088 +#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1090 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1089 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1091 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1090 +#define WT_STAT_CONN_LOG_SYNC 1092 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1091 +#define WT_STAT_CONN_LOG_WRITES 1093 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1092 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1094 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1093 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1095 /*! 
LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1094 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1096 /*! LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1095 +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1097 /*! LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1096 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1098 /*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1097 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1099 /*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1098 +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1100 /*! LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1099 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1101 /*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1100 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1102 /*! LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1101 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1103 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1102 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1104 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1103 +#define WT_STAT_CONN_MEMORY_FREE 1105 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1104 +#define WT_STAT_CONN_MEMORY_GROW 1106 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1105 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1107 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1106 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1108 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1107 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1109 /*! 
thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1108 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1110 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1109 +#define WT_STAT_CONN_PAGE_SLEEP 1111 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1110 +#define WT_STAT_CONN_READ_IO 1112 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1111 +#define WT_STAT_CONN_REC_PAGES 1113 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1112 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1114 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1113 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1115 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1114 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1116 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1115 +#define WT_STAT_CONN_RWLOCK_READ 1117 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1116 +#define WT_STAT_CONN_RWLOCK_WRITE 1118 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1117 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1119 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1118 +#define WT_STAT_CONN_SESSION_OPEN 1120 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1119 +#define WT_STAT_CONN_TXN_BEGIN 1121 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1120 +#define WT_STAT_CONN_TXN_CHECKPOINT 1122 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1121 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1123 /*! 
transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1122 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1124 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1123 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1125 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1124 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1126 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1125 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1127 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1126 +#define WT_STAT_CONN_TXN_COMMIT 1128 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1127 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1129 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1128 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1130 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1129 +#define WT_STAT_CONN_TXN_ROLLBACK 1131 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1130 +#define WT_STAT_CONN_WRITE_IO 1132 /*! 
* @} diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index 138b64a6e27..1b3a9b62626 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -322,13 +322,13 @@ struct __wt_update; #include "misc.i" #include "intpack.i" /* required by cell.i, packing.i */ #include "packing.i" +#include "cache.i" /* required by txn.i */ #include "cell.i" /* required by btree.i */ #include "mutex.i" /* required by btree.i */ #include "txn.i" /* required by btree.i */ #include "btree.i" /* required by cursor.i */ -#include "cache.i" /* required by cursor.i */ #include "cursor.i" #include "bitstring.i" diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index a3abb336f3d..944e748a6a8 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -957,10 +957,14 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) &slot->slot_buf, slot->slot_buf.memsize * 2)); } /* - * If we have a file to close, close it now. + * If we have a file to close, close it now. First fsync so + * that a later sync will be assured all earlier transactions + * in earlier log files are also on disk. 
*/ - if (close_fh) + if (close_fh) { + WT_ERR(__wt_fsync(session, close_fh)); WT_ERR(__wt_close(session, close_fh)); + } err: if (locked) __wt_spin_unlock(session, &log->log_sync_lock); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index 2dfaea1ec3a..0d44b16d85c 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -171,8 +171,6 @@ __clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update) lsm_tree->nchunks != 0) goto open; - WT_RET(__wt_cache_full_check(session)); - if (clsm->dsk_gen != lsm_tree->dsk_gen && lsm_tree->nchunks != 0) goto open; @@ -1484,11 +1482,8 @@ __wt_clsm_open(WT_SESSION_IMPL *session, WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp)); if (0) { -err: __wt_lsm_tree_release(session, lsm_tree); - if (clsm != NULL) { - clsm->lsm_tree = NULL; +err: if (clsm != NULL) WT_TRET(__clsm_close(cursor)); - } } return (ret); diff --git a/src/third_party/wiredtiger/src/os_posix/os_errno.c b/src/third_party/wiredtiger/src/os_posix/os_errno.c index ed3451a9c1c..a58f13583ce 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_errno.c +++ b/src/third_party/wiredtiger/src/os_posix/os_errno.c @@ -21,3 +21,49 @@ __wt_errno(void) */ return (errno == 0 ? WT_ERROR : errno); } + +/* + * __wt_strerror -- + * POSIX implementation of wiredtiger_strerror. + */ +const char * +__wt_strerror(int error) +{ + const char *p; + + /* + * POSIX errors are non-negative integers; check for 0 explicitly + * in-case the underlying strerror doesn't handle 0, some don't. + */ + if (error == 0) + return ("Successful return: 0"); + if (error > 0 && (p = strerror(error)) != NULL) + return (p); + return (NULL); +} + +/* + * __wt_strerror_r -- + * POSIX implementation of wiredtiger_strerror_r. + */ +int +__wt_strerror_r(int error, char *buf, size_t buflen) +{ + const char *p; + + /* Require at least 2 bytes, printable character and trailing nul. 
*/ + if (buflen < 2) + return (ENOMEM); + + /* + * Check for POSIX errors then fallback to something generic. Copy the + * string into the user's buffer, return success if anything printed. + */ + p = __wt_strerror(error); + if (p != NULL && snprintf(buf, buflen, "%s", p) > 0) + return (0); + + /* Fallback to a generic message, then guess it's a memory problem. */ + return ( + snprintf(buf, buflen, "error return: %d", error) > 0 ? 0 : ENOMEM); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_errno.c b/src/third_party/wiredtiger/src/os_win/os_errno.c index e321912d829..00ee638fbe3 100644 --- a/src/third_party/wiredtiger/src/os_win/os_errno.c +++ b/src/third_party/wiredtiger/src/os_win/os_errno.c @@ -8,6 +8,34 @@ #include "wt_internal.h" +static const int windows_error_offset = -29000; + +/* + * __wt_map_error_to_windows_error -- + * Return a positive integer, a decoded Windows error + * Standard C errors are positive integers from 0 - ~200 + * Windows errors are from 0 - 15999 according to the documentation + */ +static DWORD +__wt_map_error_to_windows_error(int error) { + /* Ensure we do not exceed the error range + Also validate we do not get any COM errors + (which are negative integers) + */ + WT_ASSERT(NULL, error > 0 && error > -(windows_error_offset)); + + return (error + -(windows_error_offset)); +} + +/* + * __wt_map_windows_error_to_error -- + * Return a negative integer, an encoded Windows error + */ +static int +__wt_map_windows_error_to_error(DWORD winerr) { + return (winerr + windows_error_offset); +} + /* * __wt_errno -- * Return errno, or WT_ERROR if errno not set. @@ -24,5 +52,73 @@ __wt_errno(void) /* GetLastError should only be called if we hit an actual error */ WT_ASSERT(NULL, err != ERROR_SUCCESS); - return (err == ERROR_SUCCESS ? WT_ERROR : err); + return (err == ERROR_SUCCESS ? + WT_ERROR : __wt_map_windows_error_to_error(err)); +} + +/* + * __wt_strerror -- + * Windows implementation of wiredtiger_strerror.
+ */ +const char * +__wt_strerror(int error) +{ + const char *p; + + /* + * POSIX errors are non-negative integers; check for 0 explicitly + * in-case the underlying strerror doesn't handle 0, some don't. + */ + if (error == 0) + return ("Successful return: 0"); + if (error > 0 && (p = strerror(error)) != NULL) + return (p); + return (NULL); +} + +/* + * __wt_strerror_r -- + * Windows implementation of wiredtiger_strerror_r. + */ +int +__wt_strerror_r(int error, char *buf, size_t buflen) +{ + DWORD lasterror; + const char *p; + + /* Require at least 2 bytes, printable character and trailing nul. */ + if (buflen < 2) + return (ENOMEM); + + /* + * Check for POSIX errors, Windows errors, then fallback to something + * generic. Copy the string into the user's buffer, return success if + * anything printed. + */ + p = __wt_strerror(error); + if (p != NULL && snprintf(buf, buflen, "%s", p) > 0) + return (0); + + if (error < 0) { + error = __wt_map_error_to_windows_error(error); + + lasterror = FormatMessageA( + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, + error, + 0, /* let system choose the correct LANGID */ + buf, + buflen, + NULL); + + if (lasterror != 0) + return (0); + + /* Fall through to the fallback error code */ + } + + /* Fallback to a generic message, then guess it's a memory problem. */ + return ( + snprintf(buf, buflen, "error return: %d", error) > 0 ? 
0 : ENOMEM); } diff --git a/src/third_party/wiredtiger/src/os_win/os_ftruncate.c b/src/third_party/wiredtiger/src/os_win/os_ftruncate.c index e80308536f1..d9b43e4596f 100644 --- a/src/third_party/wiredtiger/src/os_win/os_ftruncate.c +++ b/src/third_party/wiredtiger/src/os_win/os_ftruncate.c @@ -17,7 +17,6 @@ __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len) { WT_DECL_RET; LARGE_INTEGER largeint; - uint32_t lasterror; largeint.QuadPart = len; @@ -32,10 +31,8 @@ __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len) return (0); } - lasterror = GetLastError(); - - if (lasterror = ERROR_USER_MAPPED_FILE) + if (GetLastError() == ERROR_USER_MAPPED_FILE) return (EBUSY); - WT_RET_MSG(session, lasterror, "%s SetEndOfFile error", fh->name); + WT_RET_MSG(session, __wt_errno(), "%s SetEndOfFile error", fh->name); } diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c index 71ea8ed49a2..36de49d1aae 100644 --- a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c +++ b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c @@ -45,7 +45,6 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs) { WT_DECL_RET; int locked; - int lasterror; int milliseconds; locked = 0; WT_ASSERT(session, usecs >= 0); @@ -82,8 +81,7 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs) &cond->cond, &cond->mtx, INFINITE); if (ret == 0) { - lasterror = GetLastError(); - if (lasterror == ERROR_TIMEOUT) { + if (GetLastError() == ERROR_TIMEOUT) { ret = 1; } } diff --git a/src/third_party/wiredtiger/src/os_win/os_rename.c b/src/third_party/wiredtiger/src/os_win/os_rename.c index 8c2784457c4..a0f33843218 100644 --- a/src/third_party/wiredtiger/src/os_win/os_rename.c +++ b/src/third_party/wiredtiger/src/os_win/os_rename.c @@ -33,13 +33,13 @@ __wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to) */ if ((ret = GetFileAttributesA(to_path)) != INVALID_FILE_ATTRIBUTES) 
{ if ((ret = DeleteFileA(to_path)) == FALSE) { - lasterror = GetLastError(); + lasterror = __wt_errno(); goto err; } } if ((MoveFileA(from_path, to_path)) == FALSE) - lasterror = GetLastError(); + lasterror = __wt_errno(); err: __wt_free(session, from_path); diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c index 81447b173ae..d613ced00aa 100644 --- a/src/third_party/wiredtiger/src/schema/schema_open.c +++ b/src/third_party/wiredtiger/src/schema/schema_open.c @@ -327,6 +327,13 @@ __wt_schema_open_index(WT_SESSION_IMPL *session, table->indices[i] = idx; idx = NULL; + + /* + * If the slot is bigger than anything else we've seen, + * bump the number of indices. + */ + if (i >= table->nindices) + table->nindices = i + 1; } /* If we were looking for a single index, we're done. */ diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 3ab5e0acab1..8ee143133ae 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -736,13 +736,6 @@ __session_begin_transaction(WT_SESSION *wt_session, const char *config) if (F_ISSET(&session->txn, TXN_RUNNING)) WT_ERR_MSG(session, EINVAL, "Transaction already running"); - /* - * There is no transaction active in this thread; check if the cache is - * full, if we have to block for eviction, this is the best time to do - * it. 
- */ - WT_ERR(__wt_cache_full_check(session)); - ret = __wt_txn_begin(session, cfg); err: API_END_RET(session, ret); diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index c93168cd9a1..223d62d0559 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -367,6 +367,8 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) stats->cache_inmem_split.desc = "cache: in-memory page splits"; stats->cache_eviction_internal.desc = "cache: internal pages evicted"; stats->cache_bytes_max.desc = "cache: maximum bytes configured"; + stats->cache_eviction_maximum_page_size.desc = + "cache: maximum page size at eviction"; stats->cache_eviction_dirty.desc = "cache: modified pages evicted"; stats->cache_eviction_deepen.desc = "cache: page split during eviction deepened the tree"; @@ -374,6 +376,8 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) "cache: pages currently held in the cache"; stats->cache_eviction_force.desc = "cache: pages evicted because they exceeded the in-memory maximum"; + stats->cache_eviction_force_delete.desc = + "cache: pages evicted because they had chains of deleted items"; stats->cache_eviction_app.desc = "cache: pages evicted by application threads"; stats->cache_read.desc = "cache: pages read into cache"; @@ -548,9 +552,11 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->cache_eviction_hazard.v = 0; stats->cache_inmem_split.v = 0; stats->cache_eviction_internal.v = 0; + stats->cache_eviction_maximum_page_size.v = 0; stats->cache_eviction_dirty.v = 0; stats->cache_eviction_deepen.v = 0; stats->cache_eviction_force.v = 0; + stats->cache_eviction_force_delete.v = 0; stats->cache_eviction_app.v = 0; stats->cache_read.v = 0; stats->cache_eviction_fail.v = 0; diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index cd130002c81..5b8f11a88a5 100644 --- 
a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -361,8 +361,15 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) /* If we are logging, write a commit log record. */ if (ret == 0 && txn->mod_count > 0 && FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) && - !F_ISSET(session, WT_SESSION_NO_LOGGING)) + !F_ISSET(session, WT_SESSION_NO_LOGGING)) { + /* + * We are about to block on I/O writing the log. + * Release our snapshot in case it is keeping data pinned. + * This is particularly important for checkpoints. + */ + __wt_txn_release_snapshot(session); ret = __wt_txn_log_commit(session, cfg); + } /* * If anything went wrong, roll back. @@ -531,9 +538,8 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); txn_global = &conn->txn_global; - txn_global->current = 1; - txn_global->oldest_id = 1; - txn_global->last_running = 1; + txn_global->current = txn_global->last_running = + txn_global->oldest_id = WT_TXN_FIRST; WT_RET(__wt_calloc_def( session, conn->session_size, &txn_global->states)); diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index f66bd7e09c8..f706efa8a70 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -270,6 +270,7 @@ __wt_txn_checkpoint_log( { WT_DECL_ITEM(logrec); WT_DECL_RET; + WT_ITEM *ckpt_snapshot, empty; WT_LSN *ckpt_lsn; WT_TXN *txn; uint8_t *end, *p; @@ -319,19 +320,22 @@ __wt_txn_checkpoint_log( */ if (!txn->full_ckpt) { txn->ckpt_nsnapshot = 0; + WT_CLEAR(empty); + ckpt_snapshot = &empty; *ckpt_lsn = S2C(session)->log->alloc_lsn; - } + } else + ckpt_snapshot = txn->ckpt_snapshot; /* Write the checkpoint log record.
*/ WT_ERR(__wt_struct_size(session, &recsize, fmt, rectype, ckpt_lsn->file, ckpt_lsn->offset, - txn->ckpt_nsnapshot, &txn->ckpt_snapshot)); + txn->ckpt_nsnapshot, ckpt_snapshot)); WT_ERR(__wt_logrec_alloc(session, recsize, &logrec)); WT_ERR(__wt_struct_pack(session, (uint8_t *)logrec->data + logrec->size, recsize, fmt, rectype, ckpt_lsn->file, ckpt_lsn->offset, - txn->ckpt_nsnapshot, &txn->ckpt_snapshot)); + txn->ckpt_nsnapshot, ckpt_snapshot)); logrec->size += (uint32_t)recsize; WT_ERR(__wt_log_write(session, logrec, lsnp, 0)); diff --git a/src/third_party/wiredtiger/tools/stat_data.py b/src/third_party/wiredtiger/tools/stat_data.py index 7c00f6a70a8..89e06dbbf90 100644 --- a/src/third_party/wiredtiger/tools/stat_data.py +++ b/src/third_party/wiredtiger/tools/stat_data.py @@ -4,6 +4,7 @@ no_scale_per_second_list = [ 'async: maximum work queue length', 'cache: bytes currently in the cache', 'cache: maximum bytes configured', + 'cache: maximum page size at eviction', 'cache: pages currently held in the cache', 'cache: tracked dirty bytes in the cache', 'cache: tracked dirty pages in the cache', -- cgit v1.2.1