author | Michael Cahill <michael.cahill@mongodb.com> | 2015-09-11 16:23:54 +1000
---|---|---
committer | Michael Cahill <michael.cahill@mongodb.com> | 2015-09-11 16:23:54 +1000
commit | 58c7ad85c90619d4fa0e7e4df3b9f4d643b9b73b (patch) |
tree | 63cfbe95d22f14a3d3366d68976df0d739318e9c /src/third_party/wiredtiger |
parent | 8b205afd0ae74fd7351bc183e39b8931044f3987 (diff) |
download | mongo-58c7ad85c90619d4fa0e7e4df3b9f4d643b9b73b.tar.gz |
Import wiredtiger-wiredtiger-2.6.1-1056-g5205bb1.tar.gz from wiredtiger branch mongodb-3.2
Diffstat (limited to 'src/third_party/wiredtiger')
101 files changed, 4347 insertions, 2349 deletions
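The centerpiece of this import is the new lookaside table (`src/cache/cache_las.c`, plus the `LAS`/`lookaside` changes below): when a page cannot be written with all its updates because some are not yet globally visible, those updates are saved in the lookaside table and re-instantiated the next time the page is read (`__las_page_instantiate` in `bt_read.c`). All lookaside records for a block share a unique key prefix (btree id, block address), and the new code repeatedly uses one cursor idiom to visit them: position with `search_near`, step forward if positioned before the prefix, then iterate `next` until the key stops matching. Below is a minimal sketch of that idiom against the public cursor API, assuming a hypothetical string-keyed table; the in-tree code applies the same pattern to the raw lookaside key format.

```c
/*
 * Sketch only: the prefix-scan idiom used by __wt_las_remove_block and
 * __las_page_instantiate in this diff, shown with WiredTiger's public
 * cursor API. The table URI and "S"-format keys are assumptions for
 * illustration.
 */
#include <string.h>
#include <wiredtiger.h>

static int
scan_prefix(WT_SESSION *session, const char *uri, const char *prefix)
{
    WT_CURSOR *cursor;
    const char *key;
    size_t prefix_len;
    int exact, ret;

    if ((ret = session->open_cursor(
        session, uri, NULL, NULL, &cursor)) != 0)
        return (ret);

    prefix_len = strlen(prefix);
    cursor->set_key(cursor, prefix);
    ret = cursor->search_near(cursor, &exact);
    if (ret == 0 && exact < 0)          /* positioned before the prefix */
        ret = cursor->next(cursor);
    for (; ret == 0; ret = cursor->next(cursor)) {
        if ((ret = cursor->get_key(cursor, &key)) != 0)
            break;
        if (strncmp(key, prefix, prefix_len) != 0)
            break;                      /* past the matching range */
        /* ... process the record ... */
    }
    if (ret == WT_NOTFOUND)             /* ran off the end of the table */
        ret = 0;
    (void)cursor->close(cursor);
    return (ret);
}
```

In `__wt_las_remove_block` the body of this loop is a `cursor->remove` call; in `__las_page_instantiate` it accumulates `WT_UPDATE` lists and replays them into the newly read page.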
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-large-oplog.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-large-oplog.wtperf new file mode 100644 index 00000000000..1e203a34cc3 --- /dev/null +++ b/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-large-oplog.wtperf @@ -0,0 +1,13 @@ +# wtperf options file to simulate populating a MongoDB oplog +# This creates a test database of 7.8GB +conn_config="cache_size=2GB,checkpoint=(wait=60)" +table_config="type=file" +# Start with a small set of inserts in the populate phase. +icount=300000 +report_interval=5 +run_time=3600 +populate_threads=1 +key_sz=8192 +# Setup three threads to insert into the oplog +# Setup one thread to be doing truncates from the oplog +threads=((count=3,inserts=1,throttle=2000),(count=1,truncate=1,truncate_pct=10,truncate_count=300000)) diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-small-oplog.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-small-oplog.wtperf new file mode 100644 index 00000000000..4f2ae5359cd --- /dev/null +++ b/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-small-oplog.wtperf @@ -0,0 +1,13 @@ +# wtperf options file to simulate populating a MongoDB oplog +# This creates an oplog of 6.1GB +conn_config="cache_size=2GB,checkpoint=(wait=60)" +table_config="type=file" +# Start with a small set of inserts in the populate phase. +icount=750000 +report_interval=5 +run_time=3600 +populate_threads=1 +key_sz=512 +# Setup three threads to insert into the oplog +# Setup one thread to be doing truncates from the oplog +threads=((count=3,inserts=1,throttle=2000),(count=1,truncate=1,truncate_pct=10,truncate_count=750000)) diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_run.sh b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_run.sh index d5de7c4abdb..ac31c2a2e78 100755 --- a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_run.sh +++ b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_run.sh @@ -24,18 +24,18 @@ outfile=./wtperf.out rm -f $outfile # Each of these has an entry for each op in ops below. -avg=(0 0 0) -max=(0 0 0) -min=(0 0 0) -sum=(0 0 0) +avg=(0 0 0 0) +max=(0 0 0 0) +min=(0 0 0 0) +sum=(0 0 0 0) # Load needs floating point and bc, handle separately. 
-loadindex=4 +loadindex=5 avg[$loadindex]=0 max[$loadindex]=0 min[$loadindex]=0 sum[$loadindex]=0 -ops=(read insert update) -outp=("Read count:" "Insert count:" "Update count:") +ops=(read insert update truncate) +outp=("Read count:" "Insert count:" "Update count:" "Truncate count:") outp[$loadindex]="Load time:" # getval min/max val cur diff --git a/src/third_party/wiredtiger/build_win/filelist.win b/src/third_party/wiredtiger/build_win/filelist.win index 099451e418d..9d0ee10d305 100644 --- a/src/third_party/wiredtiger/build_win/filelist.win +++ b/src/third_party/wiredtiger/build_win/filelist.win @@ -45,6 +45,7 @@ src/btree/col_srch.c src/btree/row_key.c src/btree/row_modify.c src/btree/row_srch.c +src/cache/cache_las.c src/config/config.c src/config/config_api.c src/config/config_check.c diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index d9830191d94..3a700cf886b 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -449,13 +449,17 @@ connection_runtime_config = [ Config('chunk', '10MB', r''' the granularity that a shared cache is redistributed''', min='1MB', max='10TB'), + Config('name', 'none', r''' + the name of a cache that is shared between databases or + \c "none" when no shared cache is configured'''), + Config('quota', '0', r''' + maximum size of cache this database can be allocated from the + shared cache. Defaults to the entire shared cache size''', + type='int'), Config('reserve', '0', r''' amount of cache this database is guaranteed to have available from the shared cache. This setting is per database. Defaults to the chunk size''', type='int'), - Config('name', 'none', r''' - the name of a cache that is shared between databases or - \c "none" when no shared cache is configured'''), Config('size', '500MB', r''' maximum memory to allocate for the shared cache. 
Setting this will update the value if one is already set''', diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index c3321cf845d..f33f0e9a962 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -45,6 +45,7 @@ src/btree/col_srch.c src/btree/row_key.c src/btree/row_modify.c src/btree/row_srch.c +src/cache/cache_las.c src/config/config.c src/config/config_api.c src/config/config_check.c diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py index d861eabc7ff..d98f249335e 100644 --- a/src/third_party/wiredtiger/dist/flags.py +++ b/src/third_party/wiredtiger/dist/flags.py @@ -45,9 +45,10 @@ flags = { 'READ_WONT_NEED', ], 'rec_write' : [ + 'EVICT_LOOKASIDE', 'EVICTING', - 'SKIP_UPDATE_ERR', - 'SKIP_UPDATE_RESTORE', + 'EVICT_UPDATE_RESTORE', + 'VISIBILITY_ERR', ], 'txn_log_checkpoint' : [ 'TXN_LOG_CKPT_CLEANUP', @@ -106,15 +107,17 @@ flags = { 'session' : [ 'SESSION_CAN_WAIT', 'SESSION_CLEAR_EVICT_WALK', + 'SESSION_INTERNAL', 'SESSION_LOCKED_CHECKPOINT', 'SESSION_LOCKED_HANDLE_LIST', 'SESSION_LOCKED_SCHEMA', + 'SESSION_LOCKED_SLOT', 'SESSION_LOCKED_TABLE', - 'SESSION_INTERNAL', 'SESSION_LOGGING_INMEM', + 'SESSION_LOOKASIDE_CURSOR', 'SESSION_NO_CACHE', - 'SESSION_NO_CACHE_CHECK', 'SESSION_NO_DATA_HANDLES', + 'SESSION_NO_EVICTION', 'SESSION_NO_LOGGING', 'SESSION_NO_SCHEMA_LOCK', 'SESSION_QUIET_CORRUPT_FILE', diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index f3858da477e..aaf365a7376 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -28,6 +28,13 @@ WT_DEADLOCK WT_DEBUG_BYTE WT_HANDLE_CLOSED WT_HANDLE_NULLABLE +WT_LOG_SLOT_ACTIVE +WT_LOG_SLOT_BITS +WT_LOG_SLOT_JOIN_MASK +WT_LOG_SLOT_MASK_OFF +WT_LOG_SLOT_MASK_ON +WT_LOG_SLOT_MAXBITS +WT_LOG_SLOT_UNBUFFERED_ISSET WT_PACKED_STRUCT_BEGIN WT_PACKED_STRUCT_END WT_READ_BARRIER diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index fc706226c0a..4419662b9c4 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -32,6 +32,7 @@ BIGENDIAN BOOL BSR BTREE +BUF BZ Barack Bitfield @@ -156,11 +157,13 @@ KVS Kanowski's Kounavis LANGID +LAS LF LLLLLL LLLLLLL LOGREC LOGSCAN +LOOKASIDE LRU LRVv LSB @@ -176,6 +179,7 @@ Levyx Llqr Llqrt LockFile +Lookaside Lookup MALLOC MEM @@ -210,6 +214,7 @@ NetBSD NoAddr Noll Nul +OOB OPTYPE OUTBUFF OVFL @@ -231,6 +236,7 @@ Preload Prepend Qsort RCS +RECNO REF's REFs RET @@ -291,6 +297,7 @@ ULINE URI URIs UTF +Unbuffered UnixLib Unmap UnmapViewOfFile @@ -319,6 +326,7 @@ WiredTiger's WiredTigerCheckpoint WiredTigerException WiredTigerInit +WiredTigerLAS WiredTigerLog WiredTigerPreplog WiredTigerTmplog @@ -504,6 +512,7 @@ dlh dll dlopen dlsym +dmalloc dmsg doxgen doxygen @@ -513,6 +522,7 @@ dsk dsrc dst dstlen +dstrdup dsync dumpcmp dumpfile @@ -649,6 +659,7 @@ kvraw kvs kvsbdb lang +las latencies lbrace lbracket @@ -676,6 +687,7 @@ logread logrec logsize logtest +lookaside lookup lookups lossy @@ -940,6 +952,7 @@ uS uint uintmax unbare +unbuffered uncompressing uncompresssed undef @@ -948,6 +961,7 @@ unesc unescaped uninstantiated unistd +unlinked unmap unmarshall unmarshalled diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 70e1d32843c..c91fc921380 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py 
+++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -127,9 +127,9 @@ connection_stats = [ AsyncStat('async_alloc_race', 'number of allocation state races'), AsyncStat('async_alloc_view', 'number of operation slots viewed for allocation'), + AsyncStat('async_cur_queue', 'current work queue length'), AsyncStat('async_flush', 'number of flush calls'), AsyncStat('async_full', 'number of times operation allocation failed'), - AsyncStat('async_cur_queue', 'current work queue length'), AsyncStat('async_max_queue', 'maximum work queue length', 'no_clear,no_scale'), AsyncStat('async_nowork', 'number of times worker found no work'), @@ -156,11 +156,11 @@ connection_stats = [ ########################################## CacheStat('cache_bytes_dirty', 'tracked dirty bytes in the cache', 'no_clear,no_scale'), - CacheStat('cache_bytes_inuse', - 'bytes currently in the cache', 'no_clear,no_scale'), CacheStat('cache_bytes_internal', 'tracked bytes belonging to internal pages in the cache', 'no_clear,no_scale'), + CacheStat('cache_bytes_inuse', + 'bytes currently in the cache', 'no_clear,no_scale'), CacheStat('cache_bytes_leaf', 'tracked bytes belonging to leaf pages in the cache', 'no_clear,no_scale'), @@ -172,11 +172,11 @@ connection_stats = [ CacheStat('cache_bytes_read', 'bytes read into cache'), CacheStat('cache_bytes_write', 'bytes written from cache'), CacheStat('cache_eviction_app', 'pages evicted by application threads'), + CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'), CacheStat('cache_eviction_clean', 'unmodified pages evicted'), CacheStat('cache_eviction_deepen', 'page split during eviction deepened the tree'), CacheStat('cache_eviction_dirty', 'modified pages evicted'), - CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'), CacheStat('cache_eviction_fail', 'pages selected for eviction unable to be evicted'), CacheStat('cache_eviction_force', @@ -204,13 +204,23 @@ connection_stats = [ CacheStat('cache_eviction_worker_evicting', 'eviction worker thread evicting pages'), CacheStat('cache_inmem_split', 'in-memory page splits'), + CacheStat('cache_inmem_splittable', + 'in-memory page passed criteria to be split'), + CacheStat('cache_lookaside_insert', 'lookaside table insert calls'), + CacheStat('cache_lookaside_remove', 'lookaside table remove calls'), CacheStat('cache_overhead', 'percentage overhead', 'no_clear,no_scale'), CacheStat('cache_pages_dirty', 'tracked dirty pages in the cache', 'no_clear,no_scale'), CacheStat('cache_pages_inuse', 'pages currently held in the cache', 'no_clear,no_scale'), CacheStat('cache_read', 'pages read into cache'), + CacheStat('cache_read_lookaside', + 'pages read into cache requiring lookaside entries'), CacheStat('cache_write', 'pages written from cache'), + CacheStat('cache_write_lookaside', + 'page written requiring lookaside records'), + CacheStat('cache_write_restore', + 'pages written requiring in-memory restoration'), ########################################## # Dhandle statistics @@ -236,8 +246,8 @@ connection_stats = [ LogStat('log_compress_len', 'total size of compressed records'), LogStat('log_compress_mem', 'total in-memory size of compressed records'), LogStat('log_compress_small', 'log records too small to compress'), - LogStat('log_compress_writes', 'log records compressed'), LogStat('log_compress_write_fails', 'log records not compressed'), + LogStat('log_compress_writes', 'log records compressed'), LogStat('log_max_filesize', 'maximum log file size', 'no_clear,no_scale'), 
LogStat('log_prealloc_files', 'pre-allocated log files prepared'), LogStat('log_prealloc_max', @@ -247,20 +257,18 @@ connection_stats = [ LogStat('log_scan_records', 'records processed by log scan'), LogStat('log_scan_rereads', 'log scan records requiring two reads'), LogStat('log_scans', 'log scan operations'), - LogStat('log_sync', 'log sync operations'), - LogStat('log_sync_dir', 'log sync_dir operations'), - LogStat('log_writes', 'log write operations'), - LogStat('log_write_lsn', 'log server thread advances write LSN'), - + LogStat('log_slot_closes', 'consolidated slot closures'), LogStat('log_slot_coalesced', 'written slots coalesced'), LogStat('log_slot_consolidated', 'logging bytes consolidated'), - LogStat('log_slot_closes', 'consolidated slot closures'), LogStat('log_slot_joins', 'consolidated slot joins'), LogStat('log_slot_races', 'consolidated slot join races'), - LogStat('log_slot_toobig', 'record size exceeded maximum'), - LogStat('log_slot_toosmall', - 'failed to find a slot large enough for record'), + LogStat('log_slot_switch_busy', 'busy returns attempting to switch slots'), LogStat('log_slot_transitions', 'consolidated slot join transitions'), + LogStat('log_slot_unbuffered', 'consolidated slot unbuffered writes'), + LogStat('log_sync', 'log sync operations'), + LogStat('log_sync_dir', 'log sync_dir operations'), + LogStat('log_write_lsn', 'log server thread advances write LSN'), + LogStat('log_writes', 'log write operations'), ########################################## # Reconciliation statistics @@ -279,6 +287,8 @@ connection_stats = [ TxnStat('txn_checkpoint', 'transaction checkpoints'), TxnStat('txn_checkpoint_generation', 'transaction checkpoint generation', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_running', + 'transaction checkpoint currently running', 'no_clear,no_scale'), TxnStat('txn_checkpoint_time_max', 'transaction checkpoint max time (msecs)', 'no_clear,no_scale'), TxnStat('txn_checkpoint_time_min', @@ -287,17 +297,16 @@ connection_stats = [ 'transaction checkpoint most recent time (msecs)', 'no_clear,no_scale'), TxnStat('txn_checkpoint_time_total', 'transaction checkpoint total time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_running', - 'transaction checkpoint currently running', 'no_clear,no_scale'), + TxnStat('txn_commit', 'transactions committed'), + TxnStat('txn_fail_cache', + 'transaction failures due to cache overflow'), TxnStat('txn_pinned_checkpoint_range', 'transaction range of IDs currently pinned by a checkpoint', - 'no_clear,no_scale'), + 'no_clear,no_scale'), TxnStat('txn_pinned_range', 'transaction range of IDs currently pinned', 'no_clear,no_scale'), - TxnStat('txn_sync', 'transaction sync calls'), - TxnStat('txn_commit', 'transactions committed'), - TxnStat('txn_fail_cache', 'transaction failures due to cache overflow'), TxnStat('txn_rollback', 'transactions rolled back'), + TxnStat('txn_sync', 'transaction sync calls'), ########################################## # LSM statistics @@ -391,6 +400,8 @@ dsrc_stats = [ 'column-store fixed-size leaf pages', 'no_scale'), BtreeStat('btree_column_internal', 'column-store internal pages', 'no_scale'), + BtreeStat('btree_column_rle', + 'column-store variable-size RLE encoded values', 'no_scale'), BtreeStat('btree_column_variable', 'column-store variable-size leaf pages', 'no_scale'), BtreeStat('btree_compact_rewrite', 'pages rewritten by compaction'), @@ -434,9 +445,9 @@ dsrc_stats = [ ########################################## # Block manager statistics 
########################################## - BlockStat('block_alloc', 'blocks allocated'), BlockStat('allocation_size', 'file allocation unit size', 'no_aggregate,no_scale'), + BlockStat('block_alloc', 'blocks allocated'), BlockStat('block_checkpoint_size', 'checkpoint size', 'no_scale'), BlockStat('block_extension', 'allocations requiring file extension'), BlockStat('block_free', 'blocks freed'), @@ -463,20 +474,28 @@ dsrc_stats = [ CacheStat('cache_eviction_internal', 'internal pages evicted'), CacheStat('cache_eviction_split', 'pages split during eviction'), CacheStat('cache_inmem_split', 'in-memory page splits'), + CacheStat('cache_inmem_splittable', + 'in-memory page passed criteria to be split'), CacheStat('cache_overflow_value', 'overflow values cached in memory', 'no_scale'), CacheStat('cache_read', 'pages read into cache'), + CacheStat('cache_read_lookaside', + 'pages read into cache requiring lookaside entries'), CacheStat('cache_read_overflow', 'overflow pages read into cache'), CacheStat('cache_write', 'pages written from cache'), + CacheStat('cache_write_lookaside', + 'page written requiring lookaside records'), + CacheStat('cache_write_restore', + 'pages written requiring in-memory restoration'), ########################################## # Compression statistics ########################################## - CompressStat('compress_raw_ok', 'raw compression call succeeded'), CompressStat('compress_raw_fail', 'raw compression call failed, no additional data available'), CompressStat('compress_raw_fail_temporary', 'raw compression call failed, additional data available'), + CompressStat('compress_raw_ok', 'raw compression call succeeded'), CompressStat('compress_read', 'compressed pages read'), CompressStat('compress_write', 'compressed pages written'), CompressStat('compress_write_fail', 'page written failed to compress'), @@ -487,21 +506,21 @@ dsrc_stats = [ # Reconciliation statistics ########################################## RecStat('rec_dictionary', 'dictionary matches'), + RecStat('rec_multiblock_internal', 'internal page multi-block writes'), + RecStat('rec_multiblock_leaf', 'leaf page multi-block writes'), + RecStat('rec_multiblock_max', + 'maximum blocks required for a page', 'max_aggregate,no_scale'), RecStat('rec_overflow_key_internal', 'internal-page overflow keys'), RecStat('rec_overflow_key_leaf', 'leaf-page overflow keys'), RecStat('rec_overflow_value', 'overflow values written'), - RecStat('rec_page_match', 'page checksum matches'), RecStat('rec_page_delete', 'pages deleted'), + RecStat('rec_page_match', 'page checksum matches'), RecStat('rec_pages', 'page reconciliation calls'), RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'), RecStat('rec_prefix_compression', 'leaf page key bytes discarded using prefix compression'), RecStat('rec_suffix_compression', 'internal page key bytes discarded using suffix compression'), - RecStat('rec_multiblock_internal', 'internal page multi-block writes'), - RecStat('rec_multiblock_leaf', 'leaf page multi-block writes'), - RecStat('rec_multiblock_max', - 'maximum blocks required for a page', 'max_aggregate,no_scale'), ########################################## # Transaction statistics diff --git a/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c b/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c index 7d68717e3ca..5b29e66c503 100644 --- a/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c +++ b/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c @@ -493,9 +493,10 
@@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) rotn_encryptor->encryptor.terminate = rotn_terminate; rotn_encryptor->wtext = connection->get_extension_api(connection); - if ((ret = rotn_configure(rotn_encryptor, config)) != 0) + if ((ret = rotn_configure(rotn_encryptor, config)) != 0) { + free(rotn_encryptor); return (ret); - + } /* Load the encryptor */ return (connection->add_encryptor( connection, "rotn", (WT_ENCRYPTOR *)rotn_encryptor, NULL)); diff --git a/src/third_party/wiredtiger/src/async/async_op.c b/src/third_party/wiredtiger/src/async/async_op.c index 7e1920933c2..469dbc8e615 100644 --- a/src/third_party/wiredtiger/src/async/async_op.c +++ b/src/third_party/wiredtiger/src/async/async_op.c @@ -237,7 +237,7 @@ __async_op_init(WT_CONNECTION_IMPL *conn, WT_ASYNC_OP_IMPL *op, uint32_t id) asyncop->c.set_key = __wt_cursor_set_key; asyncop->c.get_value = __wt_cursor_get_value; asyncop->c.set_value = __wt_cursor_set_value; - asyncop->c.recno = 0; + asyncop->c.recno = WT_RECNO_OOB; memset(asyncop->c.raw_recno_buf, 0, sizeof(asyncop->c.raw_recno_buf)); memset(&asyncop->c.key, 0, sizeof(asyncop->c.key)); memset(&asyncop->c.value, 0, sizeof(asyncop->c.value)); diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c index cdef1682faf..018f6a20164 100644 --- a/src/third_party/wiredtiger/src/block/block_ext.c +++ b/src/third_party/wiredtiger/src/block/block_ext.c @@ -86,7 +86,7 @@ __block_off_srch(WT_EXT **head, wt_off_t off, WT_EXT ***stack, int skip_off) * __block_first_srch -- * Search the skiplist for the first available slot. */ -static inline int +static inline bool __block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack) { WT_EXT *ext; @@ -99,11 +99,11 @@ __block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack) if (ext->size >= size) break; if (ext == NULL) - return (0); + return (false); /* Build a stack for the offset we want. */ __block_off_srch(head, ext->off, stack, 0); - return (1); + return (true); } /* @@ -251,7 +251,7 @@ __block_off_insert( * Return if any part of a specified range appears on a specified extent * list. */ -static int +static bool __block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size) { WT_EXT *before, *after; @@ -261,10 +261,10 @@ __block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size) /* If "before" or "after" overlaps, we have a winner. */ if (before != NULL && before->off + before->size > off) - return (1); + return (true); if (after != NULL && off + size > after->off) - return (1); - return (0); + return (true); + return (false); } /* diff --git a/src/third_party/wiredtiger/src/block/block_slvg.c b/src/third_party/wiredtiger/src/block/block_slvg.c index c78a6c39942..641bb8a42f7 100644 --- a/src/third_party/wiredtiger/src/block/block_slvg.c +++ b/src/third_party/wiredtiger/src/block/block_slvg.c @@ -73,19 +73,19 @@ __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block) * __wt_block_offset_invalid -- * Return if the block offset is insane. 
*/ -int +bool __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size) { if (size == 0) /* < minimum page size */ - return (1); + return (true); if (size % block->allocsize != 0) /* not allocation-size units */ - return (1); + return (true); if (size > WT_BTREE_PAGE_SIZE_MAX) /* > maximum page size */ - return (1); + return (true); /* past end-of-file */ if (offset + (wt_off_t)size > block->fh->size) - return (1); - return (0); + return (true); + return (false); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c index 18f8ca54601..79a52dbcaa3 100644 --- a/src/third_party/wiredtiger/src/btree/bt_compact.c +++ b/src/third_party/wiredtiger/src/btree/bt_compact.c @@ -53,12 +53,12 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp) } else if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_REPLACE) { /* * The page's modification information can change underfoot if - * the page is being reconciled, lock the page down. + * the page is being reconciled, serialize with reconciliation. */ - WT_PAGE_LOCK(session, page); + F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION); ret = bm->compact_page_skip(bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp); - WT_PAGE_UNLOCK(session, page); + F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); WT_RET(ret); } return (0); @@ -73,14 +73,12 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_BTREE *btree; - WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_REF *ref; - int block_manager_begin, evict_reset, skip; + int block_manager_begin, skip; WT_UNUSED(cfg); - conn = S2C(session); btree = S2BT(session); bm = btree->bm; ref = NULL; @@ -118,25 +116,6 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) */ __wt_spin_lock(session, &btree->flush_lock); - /* - * That leaves eviction, we don't want to block eviction. Set a flag - * so reconciliation knows compaction is running. If reconciliation - * sees the flag it locks the page it's writing, we acquire the same - * lock when reading the page's modify information, serializing access. - * The same page lock blocks work on the page, but compaction is an - * uncommon, heavy-weight operation. If it's ever a problem, there's - * no reason we couldn't use an entirely separate lock than the page - * lock. - * - * We also need to ensure we don't race with an on-going reconciliation. - * After we set the flag, wait for eviction of this file to drain, and - * then let eviction continue; - */ - conn->compact_in_memory_pass = 1; - WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); - if (evict_reset) - __wt_evict_file_exclusive_off(session); - /* Start compaction. */ WT_ERR(bm->compact_start(bm, session)); block_manager_begin = 1; @@ -172,11 +151,7 @@ err: if (ref != NULL) if (block_manager_begin) WT_TRET(bm->compact_end(bm, session)); - /* - * Unlock will be a release barrier, use it to update the compaction - * status for reconciliation. - */ - conn->compact_in_memory_pass = 0; + /* Unblock threads writing leaf pages. 
*/ __wt_spin_unlock(session, &btree->flush_lock); return (ret); diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 9f41e3ae684..458a1985e28 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -70,7 +70,7 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt) * __cursor_valid -- * Return if the cursor references an valid key/value pair. */ -static inline int +static inline bool __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) { WT_BTREE *btree; @@ -133,10 +133,10 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) if (cbt->ins != NULL && (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) { if (WT_UPDATE_DELETED_ISSET(upd)) - return (0); + return (false); if (updp != NULL) *updp = upd; - return (1); + return (true); } /* @@ -155,7 +155,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * keys, check for retrieval past the end of the page. */ if (cbt->recno >= page->pg_fix_recno + page->pg_fix_entries) - return (0); + return (false); /* * Updates aren't stored on the page, an update would have @@ -170,7 +170,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * "slots", check if search returned a valid slot. */ if (cbt->slot >= page->pg_var_entries) - return (0); + return (false); /* * Updates aren't stored on the page, an update would have @@ -181,7 +181,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) cip = &page->pg_var_d[cbt->slot]; if ((cell = WT_COL_PTR(page, cip)) == NULL || __wt_cell_type(cell) == WT_CELL_DEL) - return (0); + return (false); break; case BTREE_ROW: /* @@ -189,7 +189,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * key as an on-page object, we're done. */ if (cbt->ins != NULL) - return (0); + return (false); /* * Check if searched returned a valid slot (the failure mode is @@ -198,19 +198,19 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * mirrors the column-store test). */ if (cbt->slot >= page->pg_row_entries) - return (0); + return (false); /* Updates are stored on the page, check for a delete. */ if (page->pg_row_upd != NULL && (upd = __wt_txn_read( session, page->pg_row_upd[cbt->slot])) != NULL) { if (WT_UPDATE_DELETED_ISSET(upd)) - return (0); + return (false); if (updp != NULL) *updp = upd; } break; } - return (1); + return (true); } /* @@ -517,7 +517,7 @@ retry: WT_RET(__cursor_func_init(cbt, 1)); WT_ERR(__cursor_col_search(session, cbt, NULL)); if (F_ISSET(cursor, WT_CURSTD_APPEND)) - cbt->iface.recno = 0; + cbt->iface.recno = WT_RECNO_OOB; /* * If not overwriting, fail if the key exists. Creating a @@ -911,7 +911,7 @@ __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp) * __cursor_equals -- * Return if two cursors reference the same row. */ -static inline int +static inline bool __cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b) { switch (a->btree->type) { @@ -923,21 +923,21 @@ __cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b) * one being returned to the application. 
*/ if (((WT_CURSOR *)a)->recno == ((WT_CURSOR *)b)->recno) - return (1); + return (true); break; case BTREE_ROW: if (a->ref != b->ref) - return (0); + return (false); if (a->ins != NULL || b->ins != NULL) { if (a->ins == b->ins) - return (1); + return (true); break; } if (a->slot == b->slot) - return (1); + return (true); break; } - return (0); + return (false); } /* @@ -1153,6 +1153,19 @@ err: if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED)) } /* + * __wt_btcur_init -- + * Initialize an cursor used for internal purposes. + */ +void +__wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + memset(cbt, 0, sizeof(WT_CURSOR_BTREE)); + + cbt->iface.session = &session->iface; + cbt->btree = S2BT(session); +} + +/* * __wt_btcur_open -- * Open a btree cursor. */ @@ -1168,14 +1181,22 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt) * Close a btree cursor. */ int -__wt_btcur_close(WT_CURSOR_BTREE *cbt) +__wt_btcur_close(WT_CURSOR_BTREE *cbt, int lowlevel) { WT_DECL_RET; WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)cbt->iface.session; - ret = __curfile_leave(cbt); + /* + * The in-memory split and lookaside table code creates low-level btree + * cursors to search/modify leaf pages. Those cursors don't hold hazard + * pointers, nor are they counted in the session handle's cursor count. + * Skip the usual cursor tear-down in that case. + */ + if (!lowlevel) + ret = __curfile_leave(cbt); + __wt_buf_free(session, &cbt->_row_key); __wt_buf_free(session, &cbt->_tmp); diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 77d80cdb3a2..38ef407e160 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -340,6 +340,8 @@ __wt_debug_disk( __dmsg(ds, ", empty-all"); if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE)) __dmsg(ds, ", empty-none"); + if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE)) + __dmsg(ds, ", LAS-update"); __dmsg(ds, ", generation %" PRIu64 "\n", dsk->write_gen); @@ -643,12 +645,10 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) __dmsg(ds, ", disk-mapped"); if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) __dmsg(ds, ", evict-lru"); - if (F_ISSET_ATOMIC(page, WT_PAGE_SCANNING)) - __dmsg(ds, ", scanning"); + if (F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION)) + __dmsg(ds, ", reconciliation"); if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) __dmsg(ds, ", split-insert"); - if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED)) - __dmsg(ds, ", split-locked"); if (mod != NULL) switch (F_ISSET(mod, WT_PM_REC_MASK)) { diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index cddfa0ef801..0d512b13c5e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -77,7 +77,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp) } (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1); - ret = __wt_evict_page(session, ref); + ret = __wt_evict(session, ref, 0); (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1); WT_RET_BUSY_OK(ret); } @@ -216,10 +216,10 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) * __wt_delete_page_skip -- * If iterating a cursor, skip deleted pages that are visible to us. 
*/ -int +bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) { - int skip; + bool skip; /* * Deleted pages come from two sources: either it's a fast-delete as @@ -240,10 +240,10 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) * the structure, just to be safe. */ if (ref->page_del == NULL) - return (1); + return (true); if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED)) - return (0); + return (false); skip = (ref->page_del == NULL || __wt_txn_visible(session, ref->page_del->txnid)); diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index 060a93f543f..73e6affccd3 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -15,7 +15,6 @@ static void __free_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *); static void __free_skip_array(WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t); static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *); static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t); -static void __free_update_list(WT_SESSION_IMPL *, WT_UPDATE *); /* * __wt_ref_out -- @@ -56,7 +55,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) */ WT_ASSERT(session, !__wt_page_is_modified(page)); WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); - WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED)); + WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION)); #ifdef HAVE_DIAGNOSTIC { @@ -160,8 +159,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_free(session, multi->key.ikey); break; } - __wt_free(session, multi->skip); - __wt_free(session, multi->skip_dsk); + __wt_free(session, multi->supd); + __wt_free(session, multi->supd_dsk); __wt_free(session, multi->addr.addr); } __wt_free(session, mod->mod_multi); @@ -235,10 +234,7 @@ __wt_free_ref( * it clean explicitly.) */ if (free_pages && ref->page != NULL) { - if (ref->page->modify != NULL) { - ref->page->modify->write_gen = 0; - __wt_cache_dirty_decr(session, ref->page); - } + __wt_page_modify_clear(session, ref->page); __wt_page_out(session, &ref->page); } @@ -373,7 +369,7 @@ __free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins) WT_INSERT *next; for (; ins != NULL; ins = next) { - __free_update_list(session, ins->upd); + __wt_free_update_list(session, ins->upd); next = WT_SKIP_NEXT(ins); __wt_free(session, ins); } @@ -395,29 +391,23 @@ __free_update( */ for (updp = update_head; entries > 0; --entries, ++updp) if (*updp != NULL) - __free_update_list(session, *updp); + __wt_free_update_list(session, *updp); /* Free the update array. */ __wt_free(session, update_head); } /* - * __free_update_list -- + * __wt_free_update_list -- * Walk a WT_UPDATE forward-linked list and free the per-thread combination * of a WT_UPDATE structure and its associated data. */ -static void -__free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd) +void +__wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd) { WT_UPDATE *next; for (; upd != NULL; upd = next) { - /* Everything we free should be visible to everyone. 
*/ - WT_ASSERT(session, - F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || - upd->txnid == WT_TXN_ABORTED || - __wt_txn_visible_all(session, upd->txnid)); - next = upd->next; __wt_free(session, upd); } diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index 0cc6b6eb25f..6a4243a0fc7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -255,27 +255,17 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) /* Page sizes */ WT_RET(__btree_page_sizes(session)); - /* - * Set special flags for the metadata file. - * Eviction; the metadata file is never evicted. - * Logging; the metadata file is always logged if possible. - */ - if (WT_IS_METADATA(btree->dhandle)) { + WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval)); + if (cval.val) F_SET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); + else + F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); + + WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); + if (cval.val) F_CLR(btree, WT_BTREE_NO_LOGGING); - } else { - WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval)); - if (cval.val) - F_SET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); - else - F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); - - WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); - if (cval.val) - F_CLR(btree, WT_BTREE_NO_LOGGING); - else - F_SET(btree, WT_BTREE_NO_LOGGING); - } + else + F_SET(btree, WT_BTREE_NO_LOGGING); /* Checksums */ WT_RET(__wt_config_gets(session, cfg, "checksum", &cval)); @@ -370,7 +360,7 @@ __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno) root_ref->page = root; root_ref->state = WT_REF_MEM; - root_ref->key.recno = is_recno ? 1 : 0; + root_ref->key.recno = is_recno ? 1 : WT_RECNO_OOB; root->pg_intl_parent_ref = root_ref; } @@ -697,9 +687,11 @@ __btree_page_sizes(WT_SESSION_IMPL *session) WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval)); btree->maxmempage = WT_MAX((uint64_t)cval.val, 50 * (uint64_t)btree->maxleafpage); - cache_size = S2C(session)->cache_size; - if (cache_size > 0) - btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 4); + if (!F_ISSET(S2C(session), WT_CONN_CACHE_POOL)) { + if ((cache_size = S2C(session)->cache_size) > 0) + btree->maxmempage = + WT_MIN(btree->maxmempage, cache_size / 4); + } /* * Get the split percentage (reconciliation splits pages into smaller diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c index d8456c5b61f..7104e702418 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c +++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c @@ -79,7 +79,7 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, * __ovfl_cache_col_visible -- * column-store: check for a globally visible update. */ -static int +static bool __ovfl_cache_col_visible( WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack) { @@ -99,15 +99,15 @@ __ovfl_cache_col_visible( if (__wt_cell_rle(unpack) == 1 && upd != NULL && /* Sanity: upd should always be set. */ __wt_txn_visible_all(session, upd->txnid)) - return (1); - return (0); + return (true); + return (false); } /* * __ovfl_cache_row_visible -- * row-store: check for a globally visible update. 
*/ -static int +static bool __ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip) { WT_UPDATE *upd; @@ -115,9 +115,9 @@ __ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip) /* Check to see if there's a globally visible update. */ for (upd = WT_ROW_UPDATE(page, rip); upd != NULL; upd = upd->next) if (__wt_txn_visible_all(session, upd->txnid)) - return (1); + return (true); - return (0); + return (false); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 922dc2892b8..ba218fc332c 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -17,219 +17,6 @@ static int __inmem_row_leaf_entries( WT_SESSION_IMPL *, const WT_PAGE_HEADER *, uint32_t *); /* - * __evict_force_check -- - * Check if a page matches the criteria for forced eviction. - */ -static int -__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page) -{ - WT_BTREE *btree; - - btree = S2BT(session); - - /* Pages are usually small enough, check that first. */ - if (page->memory_footprint < btree->maxmempage) - return (0); - - /* Leaf pages only. */ - if (WT_PAGE_IS_INTERNAL(page)) - return (0); - - /* - * It's hard to imagine a page with a huge memory footprint that has - * never been modified, but check to be sure. - */ - if (page->modify == NULL) - return (0); - - /* Trigger eviction on the next page release. */ - __wt_page_evict_soon(page); - - /* Bump the oldest ID, we're about to do some visibility checks. */ - __wt_txn_update_oldest(session, 0); - - /* If eviction cannot succeed, don't try. */ - return (__wt_page_can_evict(session, page, 1, NULL)); -} - -/* - * __wt_page_in_func -- - * Acquire a hazard pointer to a page; if the page is not in-memory, - * read it from the disk and build an in-memory version. - */ -int -__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags -#ifdef HAVE_DIAGNOSTIC - , const char *file, int line -#endif - ) -{ - WT_BTREE *btree; - WT_DECL_RET; - WT_PAGE *page; - u_int sleep_cnt, wait_cnt; - int busy, cache_work, force_attempts, oldgen; - - btree = S2BT(session); - - for (force_attempts = oldgen = 0, wait_cnt = 0;;) { - switch (ref->state) { - case WT_REF_DISK: - case WT_REF_DELETED: - if (LF_ISSET(WT_READ_CACHE)) - return (WT_NOTFOUND); - - /* - * The page isn't in memory, read it. If this thread is - * allowed to do eviction work, check for space in the - * cache. - */ - if (!LF_ISSET(WT_READ_NO_EVICT)) - WT_RET(__wt_cache_eviction_check( - session, 1, NULL)); - WT_RET(__wt_cache_read(session, ref)); - oldgen = LF_ISSET(WT_READ_WONT_NEED) || - F_ISSET(session, WT_SESSION_NO_CACHE); - continue; - case WT_REF_READING: - if (LF_ISSET(WT_READ_CACHE)) - return (WT_NOTFOUND); - if (LF_ISSET(WT_READ_NO_WAIT)) - return (WT_NOTFOUND); - - /* Waiting on another thread's read, stall. */ - WT_STAT_FAST_CONN_INCR(session, page_read_blocked); - goto stall; - case WT_REF_LOCKED: - if (LF_ISSET(WT_READ_NO_WAIT)) - return (WT_NOTFOUND); - - /* Waiting on eviction, stall. */ - WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); - goto stall; - case WT_REF_SPLIT: - return (WT_RESTART); - case WT_REF_MEM: - /* - * The page is in memory. - * - * Get a hazard pointer if one is required. We cannot - * be evicting if no hazard pointer is required, we're - * done. 
- */ - if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) - goto skip_evict; - - /* - * The expected reason we can't get a hazard pointer is - * because the page is being evicted, yield, try again. - */ -#ifdef HAVE_DIAGNOSTIC - WT_RET( - __wt_hazard_set(session, ref, &busy, file, line)); -#else - WT_RET(__wt_hazard_set(session, ref, &busy)); -#endif - if (busy) { - WT_STAT_FAST_CONN_INCR( - session, page_busy_blocked); - break; - } - - /* - * If eviction is configured for this file, check to see - * if the page qualifies for forced eviction and update - * the page's generation number. If eviction isn't being - * done on this file, we're done. - */ - if (LF_ISSET(WT_READ_NO_EVICT) || - F_ISSET(btree, WT_BTREE_NO_EVICTION)) - goto skip_evict; - - /* - * Forcibly evict pages that are too big. - */ - page = ref->page; - if (force_attempts < 10 && - __evict_force_check(session, page)) { - ++force_attempts; - ret = __wt_page_release_evict(session, ref); - /* If forced eviction fails, stall. */ - if (ret == EBUSY) { - ret = 0; - WT_STAT_FAST_CONN_INCR(session, - page_forcible_evict_blocked); - goto stall; - } - WT_RET(ret); - - /* - * The result of a successful forced eviction - * is a page-state transition (potentially to - * an in-memory page we can use, or a restart - * return for our caller), continue the outer - * page-acquisition loop. - */ - continue; - } - - /* - * If we read the page and we are configured to not - * trash the cache, set the oldest read generation so - * the page is forcibly evicted as soon as possible. - * - * Otherwise, update the page's read generation. - */ - if (oldgen && page->read_gen == WT_READGEN_NOTSET) - __wt_page_evict_soon(page); - else if (!LF_ISSET(WT_READ_NO_GEN) && - page->read_gen != WT_READGEN_OLDEST && - page->read_gen < __wt_cache_read_gen(session)) - page->read_gen = - __wt_cache_read_gen_bump(session); -skip_evict: - /* - * Check if we need an autocommit transaction. - * Starting a transaction can trigger eviction, so skip - * it if eviction isn't permitted. - */ - return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : - __wt_txn_autocommit_check(session)); - WT_ILLEGAL_VALUE(session); - } - - /* - * We failed to get the page -- yield before retrying, and if - * we've yielded enough times, start sleeping so we don't burn - * CPU to no purpose. - */ - if (++wait_cnt < 1000) - __wt_yield(); - else { - if (0) { -stall: wait_cnt += 1000; - } - - /* - * If stalling and this thread is allowed to do eviction - * work, check if the cache needs help. If we do work - * for the cache, substitute that for a sleep. - */ - if (!LF_ISSET(WT_READ_NO_EVICT)) { - WT_RET(__wt_cache_eviction_check( - session, 1, &cache_work)); - if (cache_work) - continue; - } - sleep_cnt = WT_MIN(wait_cnt, 10000); - wait_cnt *= 2; - WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); - __wt_sleep(0, sleep_cnt); - } - } -} - -/* * __wt_page_alloc -- * Create or read a page into the cache. */ diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index a3ce39b7758..d26b44e04c0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -9,12 +9,320 @@ #include "wt_internal.h" /* - * __wt_cache_read -- - * Read a page from the file. + * __wt_las_remove_block -- + * Remove all records matching a key prefix from the lookaside store. 
*/ int -__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) +__wt_las_remove_block(WT_SESSION_IMPL *session, + WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) { + WT_DECL_ITEM(las_addr); + WT_DECL_ITEM(las_key); + WT_DECL_RET; + uint64_t las_counter, las_txnid; + uint32_t las_id; + int exact; + + WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); + WT_ERR(__wt_scr_alloc(session, 0, &las_key)); + + /* + * Search for the block's unique prefix and step through all matching + * records, removing them. + */ + las_addr->data = addr; + las_addr->size = addr_size; + las_key->size = 0; + cursor->set_key( + cursor, btree_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); + if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) + ret = cursor->next(cursor); + for (; ret == 0; ret = cursor->next(cursor)) { + WT_ERR(cursor->get_key(cursor, + &las_id, las_addr, &las_counter, &las_txnid, las_key)); + + /* + * Confirm the search using the unique prefix; if not a match, + * we're done searching for records for this page. + */ + if (las_id != btree_id || + las_addr->size != addr_size || + memcmp(las_addr->data, addr, addr_size) != 0) + break; + + /* + * Cursor opened overwrite=true: won't return WT_NOTFOUND should + * another thread remove the record before we do, and the cursor + * remains positioned in that case. + */ + WT_ERR(cursor->remove(cursor)); + } + WT_ERR_NOTFOUND_OK(ret); + +err: __wt_scr_free(session, &las_addr); + __wt_scr_free(session, &las_key); + return (ret); +} + +/* + * __col_instantiate -- + * Update a column-store page entry based on a lookaside table update list. + */ +static int +__col_instantiate(WT_SESSION_IMPL *session, + uint64_t recno, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + /* Search the page and add updates. */ + WT_RET(__wt_col_search(session, recno, ref, cbt)); + WT_RET(__wt_col_modify(session, cbt, recno, NULL, upd, 0)); + return (0); +} + +/* + * __row_instantiate -- + * Update a row-store page entry based on a lookaside table update list. + */ +static int +__row_instantiate(WT_SESSION_IMPL *session, + WT_ITEM *key, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + /* Search the page and add updates. */ + WT_RET(__wt_row_search(session, key, ref, cbt, 1)); + WT_RET(__wt_row_modify(session, cbt, key, NULL, upd, 0)); + return (0); +} + +/* + * __las_page_instantiate -- + * Instantiate lookaside update records in a recently read page. + */ +static int +__las_page_instantiate(WT_SESSION_IMPL *session, + WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size) +{ + WT_CURSOR *cursor; + WT_CURSOR_BTREE cbt; + WT_DECL_ITEM(current_key); + WT_DECL_ITEM(las_addr); + WT_DECL_ITEM(las_key); + WT_DECL_ITEM(las_value); + WT_DECL_RET; + WT_PAGE *page; + WT_UPDATE *first_upd, *last_upd, *upd; + size_t incr, total_incr; + uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid; + uint32_t las_id, upd_size, session_flags; + int exact; + const uint8_t *p; + + cursor = NULL; + page = ref->page; + first_upd = last_upd = upd = NULL; + total_incr = 0; + current_recno = recno = WT_RECNO_OOB; + session_flags = 0; /* [-Werror=maybe-uninitialized] */ + + __wt_btcur_init(session, &cbt); + __wt_btcur_open(&cbt); + + WT_ERR(__wt_scr_alloc(session, 0, ¤t_key)); + WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); + WT_ERR(__wt_scr_alloc(session, 0, &las_key)); + WT_ERR(__wt_scr_alloc(session, 0, &las_value)); + + /* Open a lookaside table cursor. 
*/ + WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); + + /* + * The lookaside records are in key and update order, that is, there + * will be a set of in-order updates for a key, then another set of + * in-order updates for a subsequent key. We process all of the updates + * for a key and then insert those updates into the page, then all the + * updates for the next key, and so on. + * + * Search for the block's unique prefix, stepping through any matching + * records. + */ + las_addr->data = addr; + las_addr->size = addr_size; + las_key->size = 0; + cursor->set_key( + cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); + if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) + ret = cursor->next(cursor); + for (; ret == 0; ret = cursor->next(cursor)) { + WT_ERR(cursor->get_key(cursor, + &las_id, las_addr, &las_counter, &las_txnid, las_key)); + + /* + * Confirm the search using the unique prefix; if not a match, + * we're done searching for records for this page. + */ + if (las_id != read_id || + las_addr->size != addr_size || + memcmp(las_addr->data, addr, addr_size) != 0) + break; + + /* + * If the on-page value has become globally visible, this record + * is no longer needed. + */ + if (__wt_txn_visible_all(session, las_txnid)) + continue; + + /* Allocate the WT_UPDATE structure. */ + WT_ERR(cursor->get_value( + cursor, &upd_txnid, &upd_size, las_value)); + WT_ERR(__wt_update_alloc(session, + (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value, + &upd, &incr)); + total_incr += incr; + upd->txnid = upd_txnid; + + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + p = las_key->data; + WT_ERR(__wt_vunpack_uint(&p, 0, &recno)); + if (current_recno == recno) + break; + WT_ASSERT(session, current_recno < recno); + + if (first_upd != NULL) { + WT_ERR(__col_instantiate(session, + current_recno, ref, &cbt, first_upd)); + first_upd = NULL; + } + current_recno = recno; + break; + case WT_PAGE_ROW_LEAF: + if (current_key->size == las_key->size && + memcmp(current_key->data, + las_key->data, las_key->size) == 0) + break; + + if (first_upd != NULL) { + WT_ERR(__row_instantiate(session, + current_key, ref, &cbt, first_upd)); + first_upd = NULL; + } + WT_ERR(__wt_buf_set(session, + current_key, las_key->data, las_key->size)); + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* Append the latest update to the list. */ + if (first_upd == NULL) + first_upd = last_upd = upd; + else { + last_upd->next = upd; + last_upd = upd; + } + upd = NULL; + } + WT_ERR_NOTFOUND_OK(ret); + + /* Insert the last set of updates, if any. */ + if (first_upd != NULL) + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + WT_ERR(__col_instantiate(session, + current_recno, ref, &cbt, first_upd)); + first_upd = NULL; + break; + case WT_PAGE_ROW_LEAF: + WT_ERR(__row_instantiate(session, + current_key, ref, &cbt, first_upd)); + first_upd = NULL; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* Discard the cursor. */ + WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags)); + + if (total_incr != 0) { + __wt_cache_page_inmem_incr(session, page, total_incr); + + /* + * We've modified/dirtied the page, but that's not necessary and + * if we keep the page clean, it's easier to evict. We leave the + * lookaside table updates in place, so if we evict this page + * without dirtying it, any future instantiation of it will find + * the records it needs. 
If the page is dirtied before eviction, + * then we'll write any needed lookaside table records for the + * new location of the page. + */ + __wt_page_modify_clear(session, page); + } + +err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); + WT_TRET(__wt_btcur_close(&cbt, 1)); + + /* + * On error, upd points to a single unlinked WT_UPDATE structure, + * first_upd points to a list. + */ + if (upd != NULL) + __wt_free(session, upd); + if (first_upd != NULL) + __wt_free_update_list(session, first_upd); + + __wt_scr_free(session, ¤t_key); + __wt_scr_free(session, &las_addr); + __wt_scr_free(session, &las_key); + __wt_scr_free(session, &las_value); + + return (ret); +} + +/* + * __evict_force_check -- + * Check if a page matches the criteria for forced eviction. + */ +static int +__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BTREE *btree; + + btree = S2BT(session); + + /* Pages are usually small enough, check that first. */ + if (page->memory_footprint < btree->maxmempage) + return (0); + + /* Leaf pages only. */ + if (WT_PAGE_IS_INTERNAL(page)) + return (0); + + /* + * It's hard to imagine a page with a huge memory footprint that has + * never been modified, but check to be sure. + */ + if (page->modify == NULL) + return (0); + + /* Trigger eviction on the next page release. */ + __wt_page_evict_soon(page); + + /* Bump the oldest ID, we're about to do some visibility checks. */ + __wt_txn_update_oldest(session, 0); + + /* If eviction cannot succeed, don't try. */ + return (__wt_page_can_evict(session, page, 1, NULL)); +} + +/* + * __page_read -- + * Read a page from the file. + */ +static int +__page_read(WT_SESSION_IMPL *session, WT_REF *ref) +{ + const WT_PAGE_HEADER *dsk; + WT_BTREE *btree; WT_DECL_RET; WT_ITEM tmp; WT_PAGE *page; @@ -22,6 +330,7 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) uint32_t previous_state; const uint8_t *addr; + btree = S2BT(session); page = NULL; /* @@ -45,8 +354,6 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) /* * Get the address: if there is no address, the page was deleted, but a * subsequent search or insert is forcing re-creation of the name space. - * Otherwise, there's an address, read the backing disk page and build - * an in-memory version of the page. */ WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); if (addr == NULL) { @@ -54,27 +361,51 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) WT_ERR(__wt_btree_new_leaf_page(session, &page)); ref->page = page; - } else { - /* - * Read the page, then build the in-memory version of the page. - * Clear any local reference to an allocated copy of the disk - * image on return, the page steals it. - */ - WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size)); - WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize, - WT_DATA_IN_ITEM(&tmp) ? - WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); - tmp.mem = NULL; - - /* If the page was deleted, instantiate that information. */ - if (previous_state == WT_REF_DELETED) - WT_ERR(__wt_delete_page_instantiate(session, ref)); + goto done; } - WT_ERR(__wt_verbose(session, WT_VERB_READ, - "page %p: %s", page, __wt_page_type_string(page->type))); + /* + * There's an address, read or map the backing disk page and build an + * in-memory version of the page. + */ + WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size)); + WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize, + WT_DATA_IN_ITEM(&tmp) ? 
+ WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); + + /* + * Clear the local reference to an allocated copy of the disk image on + * return; the page steals it, errors in this code should not free it. + */ + tmp.mem = NULL; - WT_PUBLISH(ref->state, WT_REF_MEM); + /* + * If reading for a checkpoint, there's no additional work to do, the + * page on disk is correct as written. + */ + if (session->dhandle->checkpoint != NULL) + goto done; + + /* If the page was deleted, instantiate that information. */ + if (previous_state == WT_REF_DELETED) + WT_ERR(__wt_delete_page_instantiate(session, ref)); + + /* + * Instantiate updates from the database's lookaside table. The page + * flag was set when the page was written, potentially a long time ago. + * We only care if the lookaside table is currently active, check that + * before doing any work. + */ + dsk = tmp.data; + if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) { + WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside); + WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside); + + WT_ERR(__las_page_instantiate( + session, ref, btree->id, addr, addr_size)); + } + +done: WT_PUBLISH(ref->state, WT_REF_MEM); return (0); err: /* @@ -90,3 +421,183 @@ err: /* return (ret); } + +/* + * __wt_page_in_func -- + * Acquire a hazard pointer to a page; if the page is not in-memory, + * read it from the disk and build an in-memory version. + */ +int +__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags +#ifdef HAVE_DIAGNOSTIC + , const char *file, int line +#endif + ) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + u_int sleep_cnt, wait_cnt; + int busy, cache_work, force_attempts, oldgen, stalled; + + btree = S2BT(session); + stalled = 0; + + for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) { + switch (ref->state) { + case WT_REF_DISK: + case WT_REF_DELETED: + if (LF_ISSET(WT_READ_CACHE)) + return (WT_NOTFOUND); + + /* + * The page isn't in memory, read it. If this thread is + * allowed to do eviction work, check for space in the + * cache. + */ + if (!LF_ISSET(WT_READ_NO_EVICT)) + WT_RET(__wt_cache_eviction_check( + session, 1, NULL)); + WT_RET(__page_read(session, ref)); + oldgen = LF_ISSET(WT_READ_WONT_NEED) || + F_ISSET(session, WT_SESSION_NO_CACHE); + continue; + case WT_REF_READING: + if (LF_ISSET(WT_READ_CACHE)) + return (WT_NOTFOUND); + if (LF_ISSET(WT_READ_NO_WAIT)) + return (WT_NOTFOUND); + + /* Waiting on another thread's read, stall. */ + WT_STAT_FAST_CONN_INCR(session, page_read_blocked); + stalled = 1; + break; + case WT_REF_LOCKED: + if (LF_ISSET(WT_READ_NO_WAIT)) + return (WT_NOTFOUND); + + /* Waiting on eviction, stall. */ + WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); + stalled = 1; + break; + case WT_REF_SPLIT: + return (WT_RESTART); + case WT_REF_MEM: + /* + * The page is in memory. + * + * Get a hazard pointer if one is required. We cannot + * be evicting if no hazard pointer is required, we're + * done. + */ + if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) + goto skip_evict; + + /* + * The expected reason we can't get a hazard pointer is + * because the page is being evicted, yield, try again. 
+ */ +#ifdef HAVE_DIAGNOSTIC + WT_RET( + __wt_hazard_set(session, ref, &busy, file, line)); +#else + WT_RET(__wt_hazard_set(session, ref, &busy)); +#endif + if (busy) { + WT_STAT_FAST_CONN_INCR( + session, page_busy_blocked); + break; + } + + /* + * If eviction is configured for this file, check to see + * if the page qualifies for forced eviction and update + * the page's generation number. If eviction isn't being + * done on this file, we're done. + */ + if (LF_ISSET(WT_READ_NO_EVICT) || + F_ISSET(session, WT_SESSION_NO_EVICTION) || + F_ISSET(btree, WT_BTREE_NO_EVICTION)) + goto skip_evict; + + /* + * Forcibly evict pages that are too big. + */ + page = ref->page; + if (force_attempts < 10 && + __evict_force_check(session, page)) { + ++force_attempts; + ret = __wt_page_release_evict(session, ref); + /* If forced eviction fails, stall. */ + if (ret == EBUSY) { + ret = 0; + WT_STAT_FAST_CONN_INCR(session, + page_forcible_evict_blocked); + stalled = 1; + break; + } + WT_RET(ret); + + /* + * The result of a successful forced eviction + * is a page-state transition (potentially to + * an in-memory page we can use, or a restart + * return for our caller), continue the outer + * page-acquisition loop. + */ + continue; + } + + /* + * If we read the page and we are configured to not + * trash the cache, set the oldest read generation so + * the page is forcibly evicted as soon as possible. + * + * Otherwise, update the page's read generation. + */ + if (oldgen && page->read_gen == WT_READGEN_NOTSET) + __wt_page_evict_soon(page); + else if (!LF_ISSET(WT_READ_NO_GEN) && + page->read_gen != WT_READGEN_OLDEST && + page->read_gen < __wt_cache_read_gen(session)) + page->read_gen = + __wt_cache_read_gen_bump(session); +skip_evict: + /* + * Check if we need an autocommit transaction. + * Starting a transaction can trigger eviction, so skip + * it if eviction isn't permitted. + */ + return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : + __wt_txn_autocommit_check(session)); + WT_ILLEGAL_VALUE(session); + } + + /* + * We failed to get the page -- yield before retrying, and if + * we've yielded enough times, start sleeping so we don't burn + * CPU to no purpose. + */ + if (stalled) + wait_cnt += 1000; + else if (++wait_cnt < 1000) { + __wt_yield(); + continue; + } + + /* + * If stalling and this thread is allowed to do eviction work, + * check if the cache needs help. If we do work for the cache, + * substitute that for a sleep. + */ + if (!LF_ISSET(WT_READ_NO_EVICT)) { + WT_RET( + __wt_cache_eviction_check(session, 1, &cache_work)); + if (cache_work) + continue; + } + sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000); + WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); + __wt_sleep(0, sleep_cnt); + } +} diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index 22d4948e07d..c2a211bdd2d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -349,9 +349,6 @@ err: WT_TRET(bm->salvage_end(bm, session)); __wt_scr_free(session, &ss->tmp1); __wt_scr_free(session, &ss->tmp2); - /* Wrap up reporting. */ - WT_TRET(__wt_progress(session, NULL, ss->fcnt)); - return (ret); } @@ -381,8 +378,9 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss) if (eof) break; - /* Report progress every 10 chunks. */ - if (++ss->fcnt % 10 == 0) + /* Report progress occasionally. 
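The tail of __wt_page_in_func separates cheap and expensive waiting: the first thousand retries only yield the processor, and once the thread has stalled (or yielded enough) it sleeps, adding a millisecond per attempt up to a 10ms cap so a blocked page is still re-checked regularly. The shape of that escalation, as a stand-alone sketch around a caller-supplied predicate:

    #include <sched.h>
    #include <stdint.h>
    #include <unistd.h>

    /*
     * Yield for the first thousand attempts, then sleep with an
     * escalating, capped interval: a long-blocked caller stops
     * burning CPU but still polls often enough to make progress.
     */
    void
    acquire_with_backoff(int (*try_acquire)(void *), void *arg)
    {
        uint64_t sleep_us, wait_cnt;

        for (sleep_us = wait_cnt = 0;;) {
            if (try_acquire(arg))
                return;

            if (++wait_cnt < 1000) {
                sched_yield();
                continue;
            }

            /* Escalate by 1ms per attempt, capped at 10ms. */
            sleep_us = sleep_us + 1000 > 10000 ? 10000 : sleep_us + 1000;
            usleep((useconds_t)sleep_us);
        }
    }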
*/ +#define WT_SALVAGE_PROGRESS_INTERVAL 100 + if (++ss->fcnt % WT_SALVAGE_PROGRESS_INTERVAL == 0) WT_ERR(__wt_progress(session, NULL, ss->fcnt)); /* @@ -1305,7 +1303,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_reconcile(session, ref, cookie, WT_SKIP_UPDATE_ERR)); + WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR)); /* Reset the page. */ page->pg_var_d = save_col_var; @@ -2011,7 +2009,7 @@ __slvg_row_build_leaf( /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_reconcile(session, ref, cookie, WT_SKIP_UPDATE_ERR)); + WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR)); /* Reset the page. */ page->pg_row_entries += skip_stop; diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index a63eadcaeab..4b9ab45c678 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -173,7 +173,7 @@ __split_safe_free(WT_SESSION_IMPL *session, * __split_should_deepen -- * Return if we should deepen the tree. */ -static int +static bool __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; @@ -196,7 +196,7 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref) * pressure on the cache). */ if (page->memory_footprint < btree->maxmempage) - return (0); + return (false); /* * Ensure the page has enough entries to make it worth splitting and @@ -204,7 +204,7 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref) * splitting won't help). */ if (pindex->entries > btree->split_deepen_min_child) - return (1); + return (true); /* * Don't allow a single page to put pressure on cache usage. The root @@ -216,9 +216,9 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref) if (pindex->entries >= 100 && (__wt_ref_is_root(ref) || page->memory_footprint >= S2C(session)->cache_size / 4)) - return (1); + return (true); - return (0); + return (false); } /* @@ -343,7 +343,7 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) switch (page->type) { case WT_PAGE_COL_INT: - recno = 0; + recno = 0; /* Less than any valid record number. */ WT_INTL_FOREACH_BEGIN(session, page, ref) { WT_ASSERT(session, ref->key.recno > recno); recno = ref->key.recno; @@ -684,13 +684,11 @@ __split_multi_inmem( WT_DECL_RET; WT_PAGE *page; WT_UPDATE *upd; - WT_UPD_SKIPPED *skip; + WT_SAVE_UPD *supd; uint64_t recno; uint32_t i, slot; - WT_CLEAR(cbt); - cbt.iface.session = &session->iface; - cbt.btree = S2BT(session); + __wt_btcur_init(session, &cbt); __wt_btcur_open(&cbt); /* @@ -704,22 +702,22 @@ __split_multi_inmem( * allocated page on error, when discarding the allocated WT_REF. */ WT_RET(__wt_page_inmem(session, ref, - multi->skip_dsk, ((WT_PAGE_HEADER *)multi->skip_dsk)->mem_size, + multi->supd_dsk, ((WT_PAGE_HEADER *)multi->supd_dsk)->mem_size, WT_PAGE_DISK_ALLOC, &page)); - multi->skip_dsk = NULL; + multi->supd_dsk = NULL; if (orig->type == WT_PAGE_ROW_LEAF) WT_RET(__wt_scr_alloc(session, 0, &key)); /* Re-create each modification we couldn't write. */ - for (i = 0, skip = multi->skip; i < multi->skip_entries; ++i, ++skip) + for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) switch (orig->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: /* Build a key. 
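The int-to-bool conversion of __split_should_deepen is safe because the function is a pure predicate over the page's footprint and fan-out. Its decision structure, reduced to a sketch with illustrative field names and thresholds in place of the btree's real tuning values:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    struct ipage {
        size_t   memory_footprint;  /* bytes of cache the page uses */
        uint32_t entries;           /* child references */
        bool     is_root;
    };

    /*
     * Deepen only when the page is big enough to matter, has enough
     * children for a split to help, and is either the root or is
     * putting real pressure on the cache.
     */
    static bool
    should_deepen(const struct ipage *p, size_t maxmempage,
        uint32_t min_children, size_t cache_size)
    {
        if (p->memory_footprint < maxmempage)
            return (false);
        if (p->entries > min_children)
            return (true);
        return (p->entries >= 100 &&
            (p->is_root || p->memory_footprint >= cache_size / 4));
    }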
*/ - upd = skip->ins->upd; - skip->ins->upd = NULL; - recno = WT_INSERT_RECNO(skip->ins); + upd = supd->ins->upd; + supd->ins->upd = NULL; + recno = WT_INSERT_RECNO(supd->ins); /* Search the page. */ WT_ERR(__wt_col_search(session, recno, ref, &cbt)); @@ -730,19 +728,19 @@ __split_multi_inmem( break; case WT_PAGE_ROW_LEAF: /* Build a key. */ - if (skip->ins == NULL) { - slot = WT_ROW_SLOT(orig, skip->rip); + if (supd->ins == NULL) { + slot = WT_ROW_SLOT(orig, supd->rip); upd = orig->pg_row_upd[slot]; orig->pg_row_upd[slot] = NULL; WT_ERR(__wt_row_leaf_key( - session, orig, skip->rip, key, 0)); + session, orig, supd->rip, key, 0)); } else { - upd = skip->ins->upd; - skip->ins->upd = NULL; + upd = supd->ins->upd; + supd->ins->upd = NULL; - key->data = WT_INSERT_KEY(skip->ins); - key->size = WT_INSERT_KEY_SIZE(skip->ins); + key->data = WT_INSERT_KEY(supd->ins); + key->size = WT_INSERT_KEY_SIZE(supd->ins); } /* Search the page. */ @@ -765,7 +763,7 @@ __split_multi_inmem( page->modify->first_dirty_txn = WT_TXN_FIRST; err: /* Free any resources that may have been cached in the cursor. */ - WT_TRET(__wt_btcur_close(&cbt)); + WT_TRET(__wt_btcur_close(&cbt, 1)); __wt_scr_free(session, &key); return (ret); @@ -801,7 +799,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, */ ref->home = NULL; - if (multi->skip == NULL) { + if (multi->supd == NULL) { /* * Copy the address: we could simply take the buffer, but that * would complicate error handling, freeing the reference array @@ -830,7 +828,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, break; } - ref->state = multi->skip == NULL ? WT_REF_DISK : WT_REF_MEM; + ref->state = multi->supd == NULL ? WT_REF_DISK : WT_REF_MEM; /* * If our caller wants to track the memory allocations, we have a return @@ -841,16 +839,13 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, return (0); } -#define WT_SPLIT_EXCLUSIVE 0x01 /* Page held exclusively */ -#define WT_SPLIT_INMEM 0x02 /* In-memory split */ - /* * __split_parent -- * Resolve a multi-page split, inserting new information into the parent. */ static int __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, - WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, uint32_t flags) + WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, int exclusive) { WT_DECL_RET; WT_IKEY *ikey; @@ -878,27 +873,39 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * memory inside of the lock and may want to invest effort in making the * locked period shorter. * - * We could race with another thread deepening our parent. To deal - * with that, read the parent pointer each time we try to lock it, and - * check that it's still correct after it is locked. + * We use the reconciliation lock here because not only do we have to + * single-thread the split, we have to lock out reconciliation of the + * parent because reconciliation of the parent can't deal with finding + * a split child during internal page traversal. Basically, there's no + * reason to use a different lock if we have to block reconciliation + * anyway. */ for (;;) { parent = ref->home; - F_CAS_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED, ret); + F_CAS_ATOMIC(parent, WT_PAGE_RECONCILIATION, ret); if (ret == 0) { + /* + * We can race with another thread deepening our parent. + * To deal with that, read the parent pointer each time + * we try to lock it, and check it's still correct after + * it's locked. 
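The race described here is resolved by a read-lock-revalidate loop, visible in the code just below: read the parent pointer without the lock, try to lock that page, and only trust the lock if the pointer still matches afterwards. A compact C11 rendering of the idiom, with a hypothetical per-page flag lock (initialize the flag with ATOMIC_FLAG_INIT before first use):

    #include <sched.h>
    #include <stdatomic.h>

    struct page {
        atomic_flag lock;
    };

    struct ref {
        _Atomic(struct page *) home;    /* parent page, may be swapped */
    };

    static struct page *
    lock_parent(struct ref *ref)
    {
        struct page *parent;

        for (;;) {
            parent = atomic_load(&ref->home);
            if (!atomic_flag_test_and_set(&parent->lock)) {
                /* Locked: confirm the parent didn't move. */
                if (parent == atomic_load(&ref->home))
                    return (parent);
                atomic_flag_clear(&parent->lock);
            }
            sched_yield();
        }
    }

Release with atomic_flag_clear on the returned page once the split is resolved.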
+ */ if (parent == ref->home) break; - F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED); + F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION); continue; } /* - * If we're attempting an in-memory split and we can't lock the - * parent while there is a checkpoint in progress, give up. - * This avoids an infinite loop where we are trying to split a - * page while its parent is being checkpointed. + * A checkpoint reconciling this parent page can deadlock with + * our split. We have an exclusive page lock on the child before + * we acquire the page's reconciliation lock, and reconciliation + * acquires the page's reconciliation lock before it encounters + * the child's exclusive lock (which causes reconciliation to + * loop until the exclusive lock is resolved). If we can't lock + * the parent, give up to avoid that deadlock. */ - if (LF_ISSET(WT_SPLIT_INMEM) && S2BT(session)->checkpointing) + if (S2BT(session)->checkpointing) return (EBUSY); __wt_yield(); } @@ -1095,8 +1102,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * Add it to the session discard list, to be freed when it's safe. */ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); - WT_TRET(__split_safe_free(session, - split_gen, LF_ISSET(WT_SPLIT_EXCLUSIVE) ? 1 : 0, pindex, size)); + WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size)); parent_decr += size; /* @@ -1121,7 +1127,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * Do the check here because we've just grown the parent page and * are holding it locked. */ - if (ret == 0 && !LF_ISSET(WT_SPLIT_EXCLUSIVE) && + if (ret == 0 && !exclusive && __split_should_deepen(session, parent_ref)) ret = __split_deepen(session, parent); @@ -1131,7 +1137,7 @@ err: if (!complete) if (next_ref->state == WT_REF_SPLIT) next_ref->state = WT_REF_DELETED; } - F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED); + F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION); if (hazard) WT_TRET(__wt_hazard_clear(session, parent)); @@ -1170,7 +1176,13 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) right = NULL; page_decr = parent_incr = right_incr = 0; + /* + * Assert splitting makes sense; specifically assert the page is dirty, + * we depend on that, otherwise the page might be evicted based on its + * last reconciliation which no longer matches reality after the split. + */ WT_ASSERT(session, __wt_page_can_split(session, page)); + WT_ASSERT(session, __wt_page_is_modified(page)); /* Find the last item on the page. */ ins_head = page->pg_row_entries == 0 ? @@ -1198,7 +1210,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * The key-instantiation code checks for races, clear the key fields so * we don't trigger them. */ - child->key.recno = 0; + child->key.recno = WT_RECNO_OOB; child->key.ikey = NULL; child->state = WT_REF_MEM; @@ -1373,7 +1385,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) */ page = NULL; if ((ret = __split_parent( - session, ref, split_ref, 2, parent_incr, WT_SPLIT_INMEM)) != 0) { + session, ref, split_ref, 2, parent_incr, 0)) != 0) { /* * Move the insert list element back to the original page list. * For simplicity, the previous skip list pointers originally @@ -1390,8 +1402,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * We marked the new page dirty; we're going to discard it, but * first mark it clean and fix up the cache statistics. 
*/ - right->modify->write_gen = 0; - __wt_cache_dirty_decr(session, right); + __wt_page_modify_clear(session, right); WT_ERR(ret); } @@ -1448,8 +1459,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) * Pages with unresolved changes are not marked clean during * reconciliation, do it now. */ - mod->write_gen = 0; - __wt_cache_dirty_decr(session, page); + __wt_page_modify_clear(session, page); __wt_ref_out(session, ref); /* Swap the new page into place. */ @@ -1492,8 +1502,8 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) * Split into the parent; if we're closing the file, we hold it * exclusively. */ - WT_ERR(__split_parent( session, ref, ref_new, - new_entries, parent_incr, closing ? WT_SPLIT_EXCLUSIVE : 0)); + WT_ERR(__split_parent( + session, ref, ref_new, new_entries, parent_incr, closing)); WT_STAT_FAST_CONN_INCR(session, cache_eviction_split); WT_STAT_FAST_DATA_INCR(session, cache_eviction_split); @@ -1506,10 +1516,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) * Pages with unresolved changes are not marked clean during * reconciliation, do it now. */ - if (__wt_page_is_modified(page)) { - mod->write_gen = 0; - __wt_cache_dirty_decr(session, page); - } + __wt_page_modify_clear(session, page); __wt_page_out(session, &page); return (0); diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c index 9a0584d3217..b379712f6e7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_stat.c +++ b/src/third_party/wiredtiger/src/btree/bt_stat.c @@ -51,6 +51,7 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) WT_STAT_SET(session, stats, btree_column_deleted, 0); WT_STAT_SET(session, stats, btree_column_fix, 0); WT_STAT_SET(session, stats, btree_column_internal, 0); + WT_STAT_SET(session, stats, btree_column_rle, 0); WT_STAT_SET(session, stats, btree_column_variable, 0); WT_STAT_SET(session, stats, btree_entries, 0); WT_STAT_SET(session, stats, btree_overflow, 0); @@ -114,12 +115,12 @@ __stat_page_col_var( WT_COL *cip; WT_INSERT *ins; WT_UPDATE *upd; - uint64_t deleted_cnt, entry_cnt, ovfl_cnt; + uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt; uint32_t i; int orig_deleted; unpack = &_unpack; - deleted_cnt = entry_cnt = ovfl_cnt = 0; + deleted_cnt = entry_cnt = ovfl_cnt = rle_cnt = 0; WT_STAT_INCR(session, stats, btree_column_variable); @@ -140,8 +141,10 @@ __stat_page_col_var( __wt_cell_unpack(cell, unpack); if (unpack->type == WT_CELL_ADDR_DEL) orig_deleted = 1; - else + else { entry_cnt += __wt_cell_rle(unpack); + rle_cnt += __wt_cell_rle(unpack) - 1; + } if (unpack->ovfl) ++ovfl_cnt; } @@ -173,6 +176,7 @@ __stat_page_col_var( ++entry_cnt; WT_STAT_INCRV(session, stats, btree_column_deleted, deleted_cnt); + WT_STAT_INCRV(session, stats, btree_column_rle, rle_cnt); WT_STAT_INCRV(session, stats, btree_entries, entry_cnt); WT_STAT_INCRV(session, stats, btree_overflow, ovfl_cnt); } diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index 3f615babb07..1fd660d4cd4 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -245,9 +245,6 @@ err: /* Inform the underlying block manager we're done. */ if (ckptbase != NULL) __wt_meta_ckptlist_free(session, ckptbase); - /* Wrap up reporting. */ - WT_TRET(__wt_progress(session, NULL, vs->fcnt)); - /* Free allocated memory. 
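The new btree_column_rle statistic counts the logical entries that run-length encoding absorbed: each variable-length cell contributes its full run length to the entry count, but rle - 1 to the new counter. Worked through on a toy page:

    #include <stdint.h>
    #include <stdio.h>

    /* A variable-length column cell: one value repeated rle times. */
    struct cell {
        uint64_t rle;
    };

    int
    main(void)
    {
        struct cell cells[] = { {1}, {500}, {3}, {1} };
        uint64_t entry_cnt = 0, rle_cnt = 0;
        size_t i;

        for (i = 0; i < sizeof(cells) / sizeof(cells[0]); ++i) {
            entry_cnt += cells[i].rle;      /* logical entries */
            rle_cnt += cells[i].rle - 1;    /* entries RLE saved */
        }

        /* 505 logical entries stored in 4 physical cells. */
        printf("entries=%llu rle-compressed=%llu\n",
            (unsigned long long)entry_cnt, (unsigned long long)rle_cnt);
        return (0);
    }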
*/ __wt_scr_free(session, &vs->max_key); __wt_scr_free(session, &vs->max_addr); @@ -343,9 +340,10 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs) * of the page to be built, and then a subsequent logical verification * which happens here. * - * Report progress every 10 pages. + * Report progress occasionally. */ - if (++vs->fcnt % 10 == 0) +#define WT_VERIFY_PROGRESS_INTERVAL 100 + if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0) WT_RET(__wt_progress(session, NULL, vs->fcnt)); #ifdef HAVE_DIAGNOSTIC diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c index 095e439786c..38396facc3d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c @@ -71,19 +71,20 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_COL_VAR: - if (dsk->recno != 0) + if (dsk->recno != WT_RECNO_OOB) break; WT_RET_VRFY(session, - "%s page at %s has a record number of zero", - __wt_page_type_string(dsk->type), tag); + "%s page at %s has an invalid record number of %d", + __wt_page_type_string(dsk->type), tag, WT_RECNO_OOB); case WT_PAGE_BLOCK_MANAGER: case WT_PAGE_OVFL: case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: - if (dsk->recno == 0) + if (dsk->recno == WT_RECNO_OOB) break; WT_RET_VRFY(session, - "%s page at %s has a non-zero record number", + "%s page at %s has a record number, which is illegal for " + "this page type", __wt_page_type_string(dsk->type), tag); } @@ -91,8 +92,6 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, flags = dsk->flags; if (LF_ISSET(WT_PAGE_COMPRESSED)) LF_CLR(WT_PAGE_COMPRESSED); - if (LF_ISSET(WT_PAGE_ENCRYPTED)) - LF_CLR(WT_PAGE_ENCRYPTED); if (dsk->type == WT_PAGE_ROW_LEAF) { if (LF_ISSET(WT_PAGE_EMPTY_V_ALL) && LF_ISSET(WT_PAGE_EMPTY_V_NONE)) @@ -105,6 +104,10 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, if (LF_ISSET(WT_PAGE_EMPTY_V_NONE)) LF_CLR(WT_PAGE_EMPTY_V_NONE); } + if (LF_ISSET(WT_PAGE_ENCRYPTED)) + LF_CLR(WT_PAGE_ENCRYPTED); + if (LF_ISSET(WT_PAGE_LAS_UPDATE)) + LF_CLR(WT_PAGE_LAS_UPDATE); if (flags != 0) WT_RET_VRFY(session, "page at %s has invalid flags set: 0x%" PRIx8, diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c index fb7c9a1ce90..cbc5143698b 100644 --- a/src/third_party/wiredtiger/src/btree/col_modify.c +++ b/src/third_party/wiredtiger/src/btree/col_modify.c @@ -17,7 +17,7 @@ static int __col_insert_alloc( */ int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, - uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove) + uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, int is_remove) { WT_BTREE *btree; WT_DECL_RET; @@ -25,7 +25,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, **ins_headp; WT_ITEM _value; WT_PAGE *page; - WT_UPDATE *old_upd; + WT_UPDATE *old_upd, *upd; size_t ins_size, upd_size; u_int i, skipdepth; int append, logged; @@ -33,6 +33,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, btree = cbt->btree; ins = NULL; page = cbt->ref->page; + upd = upd_arg; append = logged = 0; /* This code expects a remove to have a NULL value. */ @@ -48,10 +49,10 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * There's some chance the application specified a record past * the last record on the page. 
If that's the case, and we're * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the - * append list, not the update list. In addition, a recno of 0 + * append list, not the update list. Also, an out-of-band recno * implies an append operation, we're allocating a new row. */ - if (recno == 0 || + if (recno == WT_RECNO_OOB || recno > (btree->type == BTREE_COL_VAR ? __col_var_last_recno(page) : __col_fix_last_recno(page))) append = 1; @@ -76,7 +77,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * If we are restoring updates that couldn't be evicted, the * key must not exist on the new page. */ - WT_ASSERT(session, upd == NULL); + WT_ASSERT(session, upd_arg == NULL); /* Make sure the update can proceed. */ WT_ERR(__wt_txn_update_check( @@ -134,7 +135,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, cbt->ins_head = ins_head; cbt->ins = ins; - if (upd == NULL) { + if (upd_arg == NULL) { WT_ERR( __wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, upd)); @@ -160,7 +161,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * The serial mutex acts as our memory barrier to flush these * writes before inserting them into the list. */ - if (cbt->ins_stack[0] == NULL || recno == 0) + if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB) for (i = 0; i < skipdepth; i++) { cbt->ins_stack[i] = &ins_head->head[i]; ins->next[i] = cbt->next_stack[i] = NULL; @@ -192,7 +193,8 @@ err: /* if (logged) __wt_txn_unmodify(session); __wt_free(session, ins); - __wt_free(session, upd); + if (upd_arg == NULL) + __wt_free(session, upd); } return (ret); diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index 49a749b8a02..888c54d1ec9 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -112,6 +112,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * there should only be one update list per key. */ WT_ASSERT(session, *upd_entry == NULL); + /* * Set the "old" entry to the second update in the list * so that the serialization function succeeds in diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c new file mode 100644 index 00000000000..e269e8702e1 --- /dev/null +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -0,0 +1,391 @@ +/*- + * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_las_stats_update -- + * Update the lookaside table statistics for return to the application. + */ +void +__wt_las_stats_update(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_CONNECTION_STATS **cstats; + WT_DSRC_STATS **dstats; + + conn = S2C(session); + + /* + * Lookaside table statistics are copied from the underlying lookaside + * table data-source statistics. If there's no lookaside table, values + * remain 0. In the current system, there's always a lookaside table, + * but there's no reason not to be cautious. + */ + if (conn->las_cursor == NULL) + return; + + /* + * We have a cursor, and we need the underlying data handle; we can get + * to it by way of the underlying btree handle, but it's a little ugly. 
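The upd_arg/upd split introduced in __wt_col_modify encodes an ownership rule: the error path may free an update only if this function allocated it, since a caller-supplied update remains the caller's to manage. The same discipline in miniature, with hypothetical types and a stand-in insert step:

    #include <stdlib.h>

    struct update { int value; };

    static int
    insert_update(struct update *upd)
    {
        (void)upd;
        return (0);     /* stand-in for the real, fallible insert */
    }

    /*
     * If upd_arg is NULL, allocate the update locally; on error,
     * free only what this function allocated.
     */
    static int
    do_modify(struct update *upd_arg, int value)
    {
        struct update *upd = upd_arg;
        int ret;

        if (upd_arg == NULL) {
            if ((upd = calloc(1, sizeof(*upd))) == NULL)
                return (-1);
            upd->value = value;
        }

        if ((ret = insert_update(upd)) != 0 && upd_arg == NULL)
            free(upd);  /* ours to clean up; upd_arg stays the caller's */
        return (ret);
    }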
+ */ + cstats = conn->stats; + dstats = ((WT_CURSOR_BTREE *)conn->las_cursor)->btree->dhandle->stats; + + WT_STAT_SET(session, cstats, + cache_lookaside_insert, WT_STAT_READ(dstats, cursor_insert)); + WT_STAT_SET(session, cstats, + cache_lookaside_remove, WT_STAT_READ(dstats, cursor_remove)); +} + +/* + * __las_cursor_create -- + * Open a new lookaside table cursor. + */ +static int +__las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) +{ + WT_BTREE *btree; + const char *open_cursor_cfg[] = { + WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL }; + + WT_RET(__wt_open_cursor( + session, WT_LAS_URI, NULL, open_cursor_cfg, cursorp)); + + /* + * Set special flags for the lookaside table: the lookaside flag (used, + * for example, to avoid writing records during reconciliation), also + * turn off checkpoints and logging. + * + * Test flags before setting them so updates can't race in subsequent + * opens (the first update is safe because it's single-threaded from + * wiredtiger_open). + */ + btree = S2BT(session); + if (!F_ISSET(btree, WT_BTREE_LOOKASIDE)) + F_SET(btree, WT_BTREE_LOOKASIDE); + if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) + F_SET(btree, WT_BTREE_NO_CHECKPOINT); + if (!F_ISSET(btree, WT_BTREE_NO_LOGGING)) + F_SET(btree, WT_BTREE_NO_LOGGING); + + return (0); +} + +/* + * __wt_las_create -- + * Initialize the database's lookaside store. + */ +int +__wt_las_create(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + const char *drop_cfg[] = { + WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL }; + + conn = S2C(session); + + /* + * Done at startup: we cannot do it on demand because we require the + * schema lock to create and drop the file, and it may not always be + * available. + * + * Open an internal session, used for the shared lookaside cursor. + * + * Sessions associated with a lookaside cursor should never be tapped + * for eviction. + */ + WT_RET(__wt_open_internal_session( + conn, "lookaside table", 1, 1, &conn->las_session)); + session = conn->las_session; + F_SET(session, WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION); + + /* Discard any previous incarnation of the file. */ + WT_RET(__wt_session_drop(session, WT_LAS_URI, drop_cfg)); + + /* Re-create the file. */ + WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT)); + + /* Open the shared cursor. */ + WT_WITHOUT_DHANDLE(session, + ret = __las_cursor_create(session, &conn->las_cursor)); + + return (ret); +} + +/* + * __wt_las_destroy -- + * Destroy the database's lookaside store. + */ +int +__wt_las_destroy(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION *wt_session; + + conn = S2C(session); + + if (conn->las_session == NULL) + return (0); + + wt_session = &conn->las_session->iface; + ret = wt_session->close(wt_session, NULL); + + conn->las_cursor = NULL; + conn->las_session = NULL; + + return (ret); +} + +/* + * __wt_las_set_written -- + * Flag that the lookaside table has been written. + */ +void +__wt_las_set_written(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + if (!conn->las_written) { + conn->las_written = true; + + /* + * Push the flag: unnecessary, but from now page reads must deal + * with lookaside table records, and we only do the write once. + */ + WT_FULL_BARRIER(); + } +} + +/* + * __wt_las_is_written -- + * Return if the lookaside table has been written. 
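__wt_las_set_written publishes a write-once, monotonic flag: the test-before-set keeps the common path read-only, and the full barrier only makes the one transition promptly visible, since the flag never reverts to false. A C11 sketch of the same publish/observe pair (not WiredTiger's actual macros):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool las_written;

    /* Called when the first lookaside record is written. */
    void
    las_set_written(void)
    {
        /* Test first: once set, this path never writes again. */
        if (!atomic_load_explicit(&las_written, memory_order_relaxed))
            atomic_store(&las_written, true);   /* seq_cst publish */
    }

    /* Cheap gate readers check before doing any lookaside work. */
    bool
    las_is_written(void)
    {
        return (atomic_load_explicit(&las_written, memory_order_relaxed));
    }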
+ */ +bool +__wt_las_is_written(WT_SESSION_IMPL *session) +{ + return (S2C(session)->las_written); +} + +/* + * __wt_las_cursor -- + * Return a lookaside cursor. + */ +int +__wt_las_cursor( + WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + *cursorp = NULL; + + /* + * We don't want to get tapped for eviction after we start using the + * lookaside cursor; save a copy of the current eviction state, we'll + * turn eviction off before we return. + * + * Don't cache lookaside table pages, we're here because of eviction + * problems and there's no reason to believe lookaside pages will be + * useful more than once. + */ + *session_flags = + F_ISSET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + + conn = S2C(session); + + /* Eviction and sweep threads have their own lookaside table cursors. */ + if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) { + if (session->las_cursor == NULL) { + WT_WITHOUT_DHANDLE(session, ret = + __las_cursor_create(session, &session->las_cursor)); + WT_RET(ret); + } + + *cursorp = session->las_cursor; + } else { + /* Lock the shared lookaside cursor. */ + __wt_spin_lock(session, &conn->las_lock); + + *cursorp = conn->las_cursor; + } + + /* Turn caching and eviction off. */ + F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + + return (0); +} + +/* + * __wt_las_cursor_close -- + * Discard a lookaside cursor. + */ +int +__wt_las_cursor_close( + WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) +{ + WT_CONNECTION_IMPL *conn; + WT_CURSOR *cursor; + WT_DECL_RET; + + conn = S2C(session); + + if ((cursor = *cursorp) == NULL) + return (0); + *cursorp = NULL; + + /* Reset the cursor. */ + ret = cursor->reset(cursor); + + /* + * We turned off caching and eviction while the lookaside cursor was in + * use, restore the session's flags. + */ + F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + F_SET(session, session_flags); + + /* + * Eviction and sweep threads have their own lookaside table cursors; + * else, unlock the shared lookaside cursor. + */ + if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) + __wt_spin_unlock(session, &conn->las_lock); + + return (ret); +} + +/* + * __wt_las_sweep -- + * Sweep the lookaside table. + */ +int +__wt_las_sweep(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_CURSOR *cursor; + WT_DECL_ITEM(las_addr); + WT_DECL_ITEM(las_key); + WT_DECL_RET; + WT_ITEM *key; + uint64_t cnt, las_counter, las_txnid; + uint32_t las_id, session_flags; + int notused; + + conn = S2C(session); + cursor = NULL; + key = &conn->las_sweep_key; + session_flags = 0; /* [-Werror=maybe-uninitialized] */ + + WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); + WT_ERR(__wt_scr_alloc(session, 0, &las_key)); + + WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); + + /* + * If we're not starting a new sweep, position the cursor using the key + * from the last call (we don't care if we're before or after the key, + * just roughly in the same spot is fine). + */ + if (conn->las_sweep_call != 0 && key->data != NULL) { + __wt_cursor_set_raw_key(cursor, key); + if ((ret = cursor->search_near(cursor, &notused)) != 0) + goto srch_notfound; + } + + /* + * The sweep server wakes up every 10 seconds (by default), it's a slow + * moving thread. Try to review the entire lookaside table once every 5 + * minutes, or every 30 calls.
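The resume logic above leans on search_near being allowed to land on either side of the saved key. The pattern, using the public WiredTiger cursor API against an assumed string-keyed table, with error handling trimmed:

    #include <wiredtiger.h>

    /*
     * Resume a scan near a previously saved key and keep walking;
     * it doesn't matter whether we land before or after the key,
     * roughly the same spot is good enough for a sweep.
     */
    int
    resume_scan(WT_CURSOR *cursor, const char *saved_key)
    {
        int exact, ret;

        if (saved_key != NULL) {
            cursor->set_key(cursor, saved_key);
            if ((ret = cursor->search_near(cursor, &exact)) != 0)
                return (ret);   /* WT_NOTFOUND: table emptied */
        }
        while ((ret = cursor->next(cursor)) == 0) {
            /* ... examine and possibly remove the record ... */
        }
        return (ret == WT_NOTFOUND ? 0 : ret);
    }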
+ * + * The reason is because the lookaside table exists because we're seeing + * cache/eviction pressure (it allows us to trade performance and disk + * space for cache space), and it's likely lookaside blocks are being + * evicted, and reading them back in doesn't help things. A trickier, + * but possibly better, alternative might be to review all lookaside + * blocks in the cache in order to get rid of them, and slowly review + * lookaside blocks that have already been evicted. + * + * We can't know for sure how many records are in the lookaside table, + * the cursor insert and remove statistics aren't updated atomically. + * Start with reviewing 100 rows, and if it takes more than the target + * number of calls to finish, increase the number of rows checked on + * each call; if it takes less than the target calls to finish, then + * decrease the number of rows reviewed on each call (but never less + * than 100). + */ +#define WT_SWEEP_LOOKASIDE_MIN_CNT 100 +#define WT_SWEEP_LOOKASIDE_PASS_TARGET 30 + ++conn->las_sweep_call; + if ((cnt = conn->las_sweep_cnt) < WT_SWEEP_LOOKASIDE_MIN_CNT) + cnt = conn->las_sweep_cnt = WT_SWEEP_LOOKASIDE_MIN_CNT; + + /* Walk the file. */ + for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) { + /* + * If the loop terminates after completing a work unit, we will + * continue the table sweep next time. Get a local copy of the + * sweep key, we're going to reset the cursor; do so before + * calling cursor.remove, cursor.remove can discard our hazard + * pointer and the page could be evicted from underneath us. + */ + if (cnt == 1) { + WT_ERR(__wt_cursor_get_raw_key(cursor, key)); + if (!WT_DATA_IN_ITEM(key)) + WT_ERR(__wt_buf_set( + session, key, key->data, key->size)); + } + + WT_ERR(cursor->get_key(cursor, + &las_id, las_addr, &las_counter, &las_txnid, las_key)); + + /* + * If the on-page record transaction ID associated with the + * record is globally visible, the record can be discarded. + * + * Cursor opened overwrite=true: won't return WT_NOTFOUND should + * another thread remove the record before we do, and the cursor + * remains positioned in that case. + */ + if (__wt_txn_visible_all(session, las_txnid)) + WT_ERR(cursor->remove(cursor)); + } + + /* + * When reaching the lookaside table end or the target number of calls, + * adjust the row count. Decrease/increase the row count depending on + * if the number of calls is less/more than the target. 
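This comment describes a simple feedback controller, and the code that follows implements it: the per-call row budget grows when a full pass over the table takes more calls than the target, and shrinks, never below the floor, when a pass finishes early. Isolated, with the same illustrative constants:

    #include <stdint.h>

    #define SWEEP_MIN_CNT       100 /* floor on rows per call */
    #define SWEEP_PASS_TARGET   30  /* finish a pass in ~30 calls */

    /*
     * Adjust how many rows the next sweep call should visit, based
     * on how many calls the pass has taken so far and whether the
     * pass just completed.
     */
    uint64_t
    next_batch(uint64_t cur, uint64_t calls_this_pass, int pass_done)
    {
        if (cur < SWEEP_MIN_CNT)
            cur = SWEEP_MIN_CNT;
        if (pass_done || calls_this_pass > SWEEP_PASS_TARGET) {
            if (calls_this_pass < SWEEP_PASS_TARGET &&
                cur > SWEEP_MIN_CNT)
                cur -= SWEEP_MIN_CNT;
            if (calls_this_pass > SWEEP_PASS_TARGET)
                cur += SWEEP_MIN_CNT;
        }
        return (cur);
    }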
+ */ + if (ret == WT_NOTFOUND || + conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) { + if (conn->las_sweep_call < WT_SWEEP_LOOKASIDE_PASS_TARGET && + conn->las_sweep_cnt > WT_SWEEP_LOOKASIDE_MIN_CNT) + conn->las_sweep_cnt -= WT_SWEEP_LOOKASIDE_MIN_CNT; + if (conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) + conn->las_sweep_cnt += WT_SWEEP_LOOKASIDE_MIN_CNT; + } + +srch_notfound: + if (ret == WT_NOTFOUND) + conn->las_sweep_call = 0; + + WT_ERR_NOTFOUND_OK(ret); + + if (0) { +err: __wt_buf_free(session, key); + } + + WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); + + __wt_scr_free(session, &las_addr); + __wt_scr_free(session, &las_key); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index 73837c46ee8..91cfcedfcaf 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -76,6 +76,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_shared_cache_subconfigs[] = { { "chunk", "int", NULL, "min=1MB,max=10TB", NULL, 0 }, { "name", "string", NULL, NULL, NULL, 0 }, + { "quota", "int", NULL, NULL, NULL, 0 }, { "reserve", "int", NULL, NULL, NULL, 0 }, { "size", "int", NULL, "min=1MB,max=10TB", NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } @@ -121,7 +122,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { { "lsm_merge", "boolean", NULL, NULL, NULL, 0 }, { "shared_cache", "category", NULL, NULL, - confchk_wiredtiger_open_shared_cache_subconfigs, 4 }, + confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", NULL, 0 }, @@ -520,7 +521,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "session_scratch_max", "int", NULL, NULL, NULL, 0 }, { "shared_cache", "category", NULL, NULL, - confchk_wiredtiger_open_shared_cache_subconfigs, 4 }, + confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", NULL, 0 }, @@ -595,7 +596,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { { "session_scratch_max", "int", NULL, NULL, NULL, 0 }, { "shared_cache", "category", NULL, NULL, - confchk_wiredtiger_open_shared_cache_subconfigs, 4 }, + confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", NULL, 0 }, @@ -668,7 +669,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { { "session_scratch_max", "int", NULL, NULL, NULL, 0 }, { "shared_cache", "category", NULL, NULL, - confchk_wiredtiger_open_shared_cache_subconfigs, 4 }, + confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", NULL, 0 }, @@ -740,7 +741,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { { "session_scratch_max", "int", NULL, NULL, NULL, 0 }, { "shared_cache", "category", NULL, NULL, - confchk_wiredtiger_open_shared_cache_subconfigs, 4 }, + confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", NULL, 0 }, @@ -807,8 +808,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { "eviction_dirty_trigger=95,eviction_target=80,eviction_trigger=95" ",file_manager=(close_handle_minimum=250,close_idle_time=30," "close_scan_interval=10),lsm_manager=(merge=,worker_thread_max=4)" - 
",lsm_merge=,shared_cache=(chunk=10MB,name=,reserve=0,size=500MB)" - ",statistics=none,statistics_log=(on_close=0," + ",lsm_merge=,shared_cache=(chunk=10MB,name=,quota=0,reserve=0," + "size=500MB),statistics=none,statistics_log=(on_close=0," "path=\"WiredTigerStat.%d.%H\",sources=," "timestamp=\"%b %d %H:%M:%S\",wait=0),verbose=", confchk_WT_CONNECTION_reconfigure, 17 @@ -959,9 +960,9 @@ static const WT_CONFIG_ENTRY config_entries[] = { "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4)," "lsm_merge=,mmap=,multiprocess=0,session_max=100," - "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0" - ",size=500MB),statistics=none,statistics_log=(on_close=0," - "path=\"WiredTigerStat.%d.%H\",sources=," + "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0," + "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0" + ",path=\"WiredTigerStat.%d.%H\",sources=," "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" ",method=fsync),use_environment_priv=0,verbose=", confchk_wiredtiger_open, 34 @@ -979,9 +980,9 @@ static const WT_CONFIG_ENTRY config_entries[] = { "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4)," "lsm_merge=,mmap=,multiprocess=0,session_max=100," - "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0" - ",size=500MB),statistics=none,statistics_log=(on_close=0," - "path=\"WiredTigerStat.%d.%H\",sources=," + "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0," + "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0" + ",path=\"WiredTigerStat.%d.%H\",sources=," "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" ",method=fsync),use_environment_priv=0,verbose=,version=(major=0," "minor=0)", @@ -999,9 +1000,9 @@ static const WT_CONFIG_ENTRY config_entries[] = { "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4)," "lsm_merge=,mmap=,multiprocess=0,session_max=100," - "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0" - ",size=500MB),statistics=none,statistics_log=(on_close=0," - "path=\"WiredTigerStat.%d.%H\",sources=," + "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0," + "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0" + ",path=\"WiredTigerStat.%d.%H\",sources=," "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" ",method=fsync),verbose=,version=(major=0,minor=0)", confchk_wiredtiger_open_basecfg, 31 @@ -1018,9 +1019,9 @@ static const WT_CONFIG_ENTRY config_entries[] = { "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4)," "lsm_merge=,mmap=,multiprocess=0,session_max=100," - "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0" - ",size=500MB),statistics=none,statistics_log=(on_close=0," - "path=\"WiredTigerStat.%d.%H\",sources=," + "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0," + "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0" + ",path=\"WiredTigerStat.%d.%H\",sources=," "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" ",method=fsync),verbose=", confchk_wiredtiger_open_usercfg, 30 diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index b28fca3a71b..b1155d06826 100644 --- 
a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -2031,11 +2031,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_turtle_init(session)); WT_ERR(__wt_metadata_open(session)); - /* - * Start the worker threads last. - */ + /* Start the worker threads and run recovery. */ WT_ERR(__wt_connection_workers(session, cfg)); + /* Create the lookaside table. */ + WT_ERR(__wt_las_create(session)); + WT_STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0); *wt_connp = &conn->iface; diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c index fdc95a32387..aaae58ef168 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c @@ -22,21 +22,22 @@ */ #define WT_CACHE_POOL_REDUCE_THRESHOLD 20 /* Balancing passes after a bump before a connection is a candidate. */ -#define WT_CACHE_POOL_BUMP_SKIPS 10 +#define WT_CACHE_POOL_BUMP_SKIPS 5 /* Balancing passes after a reduction before a connection is a candidate. */ -#define WT_CACHE_POOL_REDUCE_SKIPS 5 +#define WT_CACHE_POOL_REDUCE_SKIPS 10 /* * Constants that control how much influence different metrics have on * the pressure calculation. */ -#define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 10 -#define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 50 +#define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 3 +#define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 6 #define WT_CACHE_POOL_READ_MULTIPLIER 1 -static int __cache_pool_adjust(WT_SESSION_IMPL *, uint64_t, uint64_t, int *); +static int __cache_pool_adjust( + WT_SESSION_IMPL *, uint64_t, uint64_t, int, int *); static int __cache_pool_assess(WT_SESSION_IMPL *, uint64_t *); -static int __cache_pool_balance(WT_SESSION_IMPL *); +static int __cache_pool_balance(WT_SESSION_IMPL *, int); /* * __wt_cache_pool_config -- @@ -51,7 +52,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) WT_DECL_RET; char *pool_name; int created, updating; - uint64_t chunk, reserve, size, used_cache; + uint64_t chunk, quota, reserve, size, used_cache; conn = S2C(session); created = updating = 0; @@ -142,6 +143,11 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) chunk = (uint64_t)cval.val; else chunk = cp->chunk; + if (__wt_config_gets(session, &cfg[1], + "shared_cache.quota", &cval) == 0 && cval.val != 0) + quota = (uint64_t)cval.val; + else + quota = cp->quota; } else { /* * The only time shared cache configuration uses default @@ -155,6 +161,9 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) session, cfg, "shared_cache.chunk", &cval)); WT_ASSERT(session, cval.val != 0); chunk = (uint64_t)cval.val; + WT_ERR(__wt_config_gets( + session, cfg, "shared_cache.quota", &cval)); + quota = (uint64_t)cval.val; } /* @@ -197,8 +206,10 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) /* The configuration is verified - it's safe to update the pool. */ cp->size = size; cp->chunk = chunk; + cp->quota = quota; conn->cache->cp_reserved = reserve; + conn->cache->cp_quota = quota; /* Wake up the cache pool server so any changes are noticed. */ if (updating) @@ -402,7 +413,7 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session) * effectively used. 
*/ static int -__cache_pool_balance(WT_SESSION_IMPL *session) +__cache_pool_balance(WT_SESSION_IMPL *session, int forward) { WT_CACHE_POOL *cp; WT_DECL_RET; @@ -421,16 +432,16 @@ __cache_pool_balance(WT_SESSION_IMPL *session) WT_ERR(__cache_pool_assess(session, &highest)); bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD; + /* * Actively attempt to: * - Reduce the amount allocated, if we are over the budget * - Increase the amount used if there is capacity and any pressure. */ - for (bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD; - F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) && - F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN);) { + while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) && + F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN)) { WT_ERR(__cache_pool_adjust( - session, highest, bump_threshold, &adjusted)); + session, highest, bump_threshold, forward, &adjusted)); /* * Stop if the amount of cache being used is stable, and we * aren't over capacity. @@ -456,30 +467,39 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest) WT_CACHE *cache; WT_CONNECTION_IMPL *entry; uint64_t app_evicts, app_waits, reads; - uint64_t entries, highest, tmp; + uint64_t balanced_size, entries, highest, tmp; cp = __wt_process.cache_pool; - entries = 0; + balanced_size = entries = 0; highest = 1; /* Avoid divide by zero */ + TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) { + if (entry->cache_size == 0 || entry->cache == NULL) + continue; + ++entries; + } + + if (entries > 0) + balanced_size = cp->currently_used / entries; + /* Generate read pressure information. */ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) { - if (entry->cache_size == 0 || - entry->cache == NULL) + if (entry->cache_size == 0 || entry->cache == NULL) continue; cache = entry->cache; - ++entries; /* * Figure out a delta since the last time we did an assessment * for each metric we are tracking. Watch out for wrapping * of values. + * + * Count pages read, assuming pages are 4KB. */ - tmp = cache->bytes_read; + tmp = cache->bytes_read >> 12; if (tmp >= cache->cp_saved_read) reads = tmp - cache->cp_saved_read; else - reads = (UINT64_MAX - cache->cp_saved_read) + tmp; + reads = tmp; cache->cp_saved_read = tmp; /* Update the application eviction count information */ @@ -500,12 +520,19 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest) (UINT64_MAX - cache->cp_saved_app_waits) + tmp; cache->cp_saved_app_waits = tmp; - /* Calculate the weighted pressure for this member */ - cache->cp_pass_pressure = - (app_evicts * WT_CACHE_POOL_APP_EVICT_MULTIPLIER) + + /* Calculate the weighted pressure for this member. */ + tmp = (app_evicts * WT_CACHE_POOL_APP_EVICT_MULTIPLIER) + (app_waits * WT_CACHE_POOL_APP_WAIT_MULTIPLIER) + (reads * WT_CACHE_POOL_READ_MULTIPLIER); + /* Weight smaller caches higher. */ + tmp = (uint64_t)(tmp * + ((double)balanced_size / entry->cache_size)); + + /* Smooth over history. */ + cache->cp_pass_pressure = + (9 * cache->cp_pass_pressure + tmp) / 10; + if (cache->cp_pass_pressure > highest) highest = cache->cp_pass_pressure; @@ -524,24 +551,25 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest) /* * __cache_pool_adjust -- - * Adjust the allocation of cache to each connection. If force is set + * Adjust the allocation of cache to each connection. If full is set * ignore cache load information, and reduce the allocation for every * connection allocated more than their reserved size. 
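The reworked assessment turns raw deltas into a pressure score in three steps: weight each metric, scale by cache size so small caches are not drowned out by large ones, then smooth with a 90/10 exponential moving average so one noisy pass cannot whipsaw the balancer. The arithmetic, with hypothetical counts:

    #include <stdint.h>
    #include <stdio.h>

    #define EVICT_MULT  3   /* application-thread evictions */
    #define WAIT_MULT   6   /* application-thread waits */
    #define READ_MULT   1   /* 4KB pages read */

    int
    main(void)
    {
        uint64_t evicts = 10, waits = 4, reads = 2000;
        uint64_t balanced_size = 256 << 20, cache_size = 128 << 20;
        uint64_t raw, weighted, pressure = 90;  /* last pass's value */

        raw = evicts * EVICT_MULT + waits * WAIT_MULT + reads * READ_MULT;
        /* A cache at half the balanced size doubles its pressure. */
        weighted = (uint64_t)(raw * ((double)balanced_size / cache_size));
        /* Smooth over history: 90% old, 10% new. */
        pressure = (9 * pressure + weighted) / 10;

        printf("raw=%llu weighted=%llu smoothed=%llu\n",
            (unsigned long long)raw, (unsigned long long)weighted,
            (unsigned long long)pressure);
        return (0);
    }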
*/ static int __cache_pool_adjust(WT_SESSION_IMPL *session, - uint64_t highest, uint64_t bump_threshold, int *adjustedp) + uint64_t highest, uint64_t bump_threshold, int forward, int *adjustedp) { WT_CACHE_POOL *cp; WT_CACHE *cache; WT_CONNECTION_IMPL *entry; - uint64_t adjusted, highest_percentile, pressure, reserved; - int force, grew; + uint64_t adjustment, highest_percentile, pressure, reserved, smallest; + int busy, pool_full, grow; + u_int pct_full; *adjustedp = 0; cp = __wt_process.cache_pool; - force = (cp->currently_used > cp->size); - grew = 0; + grow = 0; + pool_full = (cp->currently_used >= cp->size); /* Highest as a percentage, avoid 0 */ highest_percentile = (highest / 100) + 1; @@ -549,13 +577,17 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, "Cache pool distribution: ")); WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, - "\t" "cache_size, pressure, skips: ")); + "\t" "cache (MB), pressure, skips, busy, %% full:")); } - TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) { + for (entry = forward ? TAILQ_FIRST(&cp->cache_pool_qh) : + TAILQ_LAST(&cp->cache_pool_qh, __wt_cache_pool_qh); + entry != NULL; + entry = forward ? TAILQ_NEXT(entry, cpq) : + TAILQ_PREV(entry, __wt_cache_pool_qh, cpq)) { cache = entry->cache; reserved = cache->cp_reserved; - adjusted = 0; + adjustment = 0; /* * The read pressure is calculated as a percentage of how @@ -565,84 +597,109 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, * assigned. */ pressure = cache->cp_pass_pressure / highest_percentile; + busy = __wt_eviction_needed(entry->default_session, &pct_full); + WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, - "\t%" PRIu64 ", %" PRIu64 ", %" PRIu32, - entry->cache_size, pressure, cache->cp_skip_count)); + "\t%5" PRIu64 ", %3" PRIu64 ", %2" PRIu32 ", %d, %2u", + entry->cache_size >> 20, pressure, cache->cp_skip_count, + busy, pct_full)); /* Allow to stabilize after changes. */ if (cache->cp_skip_count > 0 && --cache->cp_skip_count > 0) continue; + /* * If the entry is currently allocated less than the reserved - * size, increase it's allocation. This should only happen if: - * - It's the first time we've seen this member - * - The reserved size has been adjusted + * size, increase its allocation. This should only happen if: + * - it's the first time we've seen this member, or + * - the reserved size has been adjusted */ if (entry->cache_size < reserved) { - grew = 1; - adjusted = reserved - entry->cache_size; - + grow = 1; + adjustment = reserved - entry->cache_size; /* * Conditions for reducing the amount of resources for an * entry: - * - If we are forcing and this entry has more than the - * minimum amount of space in use. - * - If the read pressure in this entry is below the - * threshold, other entries need more cache, the entry has - * more than the minimum space and there is no available - * space in the pool. + * - the pool is full, + * - application threads are not busy doing eviction already, + * - this entry has more than the minimum amount of space in + * use, + * - the read pressure in this entry is below the threshold, + * other entries need more cache, the entry has more than + * the minimum space and there is no available space in the + * pool. 
*/ - } else if ((force && entry->cache_size > reserved) || - (pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && - highest > 1 && entry->cache_size > reserved && - cp->currently_used >= cp->size)) { - grew = 0; + } else if (pool_full && !busy && + entry->cache_size > reserved && + pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && highest > 1) { + grow = 0; /* - * Shrink by a chunk size if that doesn't drop us - * below the reserved size. + * Don't drop the size down too much - or it can + * trigger aggressive eviction in the connection, + * which is likely to lead to lower throughput and + * potentially a negative feedback loop in the + * balance algorithm. */ - if (entry->cache_size > cp->chunk + reserved) - adjusted = cp->chunk; - else - adjusted = entry->cache_size - reserved; + smallest = (100 * __wt_cache_bytes_inuse(cache)) / + cache->eviction_trigger; + if (entry->cache_size > smallest) + adjustment = WT_MIN(cp->chunk, + (entry->cache_size - smallest) / 2); + adjustment = + WT_MIN(adjustment, entry->cache_size - reserved); /* * Conditions for increasing the amount of resources for an * entry: - * - There was some activity across the pool - * - This entry is using less than the entire cache pool - * - The connection is using enough cache to require eviction - * - There is space available in the pool - * - Additional cache would benefit the connection OR - * - The pool is less than half distributed + * - there is space available in the pool + * - the connection isn't over quota + * - the connection is using enough cache to require eviction + * - there was some activity across the pool + * - this entry is using less than the entire cache pool + * - additional cache would benefit the connection OR + * - the pool is less than half distributed */ - } else if (entry->cache_size < cp->size && + } else if (!pool_full && + (cache->cp_quota == 0 || + entry->cache_size < cache->cp_quota) && __wt_cache_bytes_inuse(cache) >= (entry->cache_size * cache->eviction_target) / 100 && - ((cp->currently_used < cp->size && - pressure > bump_threshold) || + (pressure > bump_threshold || cp->currently_used < cp->size * 0.5)) { - grew = 1; - adjusted = WT_MIN(cp->chunk, - cp->size - cp->currently_used); + grow = 1; + adjustment = WT_MIN(WT_MIN(cp->chunk, + cp->size - cp->currently_used), + cache->cp_quota - entry->cache_size); } - if (adjusted > 0) { + /* + * Bounds checking: don't go over the pool size or under the + * reserved size for this cache. + * + * Shrink by a chunk size if that doesn't drop us + * below the reserved size. + * + * Limit the reduction to half of the free space in the + * connection's cache. This should reduce cache sizes + * gradually without stalling application threads. + */ + if (adjustment > 0) { *adjustedp = 1; - if (grew > 0) { + if (grow) { cache->cp_skip_count = WT_CACHE_POOL_BUMP_SKIPS; - entry->cache_size += adjusted; - cp->currently_used += adjusted; + entry->cache_size += adjustment; + cp->currently_used += adjustment; } else { cache->cp_skip_count = WT_CACHE_POOL_REDUCE_SKIPS; WT_ASSERT(session, - entry->cache_size >= adjusted && - cp->currently_used >= adjusted); - entry->cache_size -= adjusted; - cp->currently_used -= adjusted; + entry->cache_size >= adjustment && + cp->currently_used >= adjustment); + entry->cache_size -= adjustment; + cp->currently_used -= adjustment; } WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, "Allocated %s%" PRId64 " to %s", - grew ? "" : "-", adjusted, entry->home)); + grow ? 
"" : "-", adjustment, entry->home)); + /* * TODO: Add a loop waiting for connection to give up * cache. @@ -663,11 +720,13 @@ __wt_cache_pool_server(void *arg) WT_CACHE_POOL *cp; WT_DECL_RET; WT_SESSION_IMPL *session; + int forward; session = (WT_SESSION_IMPL *)arg; cp = __wt_process.cache_pool; cache = S2C(session)->cache; + forward = 1; while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) && F_ISSET(cache, WT_CACHE_POOL_RUN)) { @@ -695,8 +754,10 @@ __wt_cache_pool_server(void *arg) * Continue even if there was an error. Details of errors are * reported in the balance function. */ - if (F_ISSET(cache, WT_CACHE_POOL_MANAGER)) - (void)__cache_pool_balance(session); + if (F_ISSET(cache, WT_CACHE_POOL_MANAGER)) { + (void)__cache_pool_balance(session, forward); + forward = !forward; + } } if (0) { diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c index 1c4a631cc59..7a8a6cba838 100644 --- a/src/third_party/wiredtiger/src/conn/conn_handle.c +++ b/src/third_party/wiredtiger/src/conn/conn_handle.c @@ -55,6 +55,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list")); WT_RET(__wt_rwlock_alloc(session, &conn->hot_backup_lock, "hot backup")); + WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table")); WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema")); WT_RET(__wt_spin_init(session, &conn->table_lock, "table creation")); @@ -140,6 +141,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->encryptor_lock); __wt_spin_destroy(session, &conn->fh_lock); WT_TRET(__wt_rwlock_destroy(session, &conn->hot_backup_lock)); + __wt_spin_destroy(session, &conn->las_lock); __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); __wt_spin_destroy(session, &conn->table_lock); diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index dae0293d790..2b115190b06 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -287,8 +287,9 @@ __log_file_server(void *arg) WT_DECL_RET; WT_FH *close_fh; WT_LOG *log; - WT_LSN close_end_lsn, close_lsn, min_lsn; + WT_LSN close_end_lsn, min_lsn; WT_SESSION_IMPL *session; + uint32_t filenum; int locked; session = arg; @@ -300,66 +301,97 @@ __log_file_server(void *arg) * If there is a log file to close, make sure any outstanding * write operations have completed, then fsync and close it. */ - if ((close_fh = log->log_close_fh) != NULL && - (ret = __wt_log_extract_lognum(session, close_fh->name, - &close_lsn.file)) == 0 && - close_lsn.file < log->write_lsn.file) { + if ((close_fh = log->log_close_fh) != NULL) { + WT_ERR(__wt_log_extract_lognum(session, close_fh->name, + &filenum)); /* - * We've copied the file handle, clear out the one in - * log structure to allow it to be set again. + * We update the close file handle before updating the + * close LSN when changing files. It is possible we + * could see mismatched settings. If we do, yield + * until it is set. This should rarely happen. */ - log->log_close_fh = NULL; - /* - * Set the close_end_lsn to the LSN immediately after - * ours. That is, the beginning of the next log file. 
- * We need to know the LSN file number of our own close - * in case earlier calls are still in progress and the - * next one to move the sync_lsn into the next file for - * later syncs. - */ - close_lsn.offset = 0; - close_end_lsn = close_lsn; - close_end_lsn.file++; - WT_ERR(__wt_fsync(session, close_fh)); - __wt_spin_lock(session, &log->log_sync_lock); - locked = 1; - WT_ERR(__wt_close(session, &close_fh)); - WT_ASSERT(session, - WT_LOG_CMP(&close_end_lsn, &log->sync_lsn) >= 0); - log->sync_lsn = close_end_lsn; - WT_ERR(__wt_cond_signal(session, log->log_sync_cond)); - locked = 0; - __wt_spin_unlock(session, &log->log_sync_lock); + while (log->log_close_lsn.file < filenum) + __wt_yield(); + + if (__wt_log_cmp( + &log->write_lsn, &log->log_close_lsn) >= 0) { + /* + * We've copied the file handle, clear out the + * one in the log structure to allow it to be + * set again. Copy the LSN before clearing + * the file handle. + * Use a barrier to make sure the compiler does + * not reorder the following two statements. + */ + close_end_lsn = log->log_close_lsn; + WT_FULL_BARRIER(); + log->log_close_fh = NULL; + /* + * Set the close_end_lsn to the LSN immediately + * after ours. That is, the beginning of the + * next log file. We need to know the LSN + * file number of our own close in case earlier + * calls are still in progress and the next one + * to move the sync_lsn into the next file for + * later syncs. + */ + close_end_lsn.file++; + close_end_lsn.offset = 0; + WT_ERR(__wt_fsync(session, close_fh)); + __wt_spin_lock(session, &log->log_sync_lock); + locked = 1; + WT_ERR(__wt_close(session, &close_fh)); + WT_ASSERT(session, __wt_log_cmp( + &close_end_lsn, &log->sync_lsn) >= 0); + log->sync_lsn = close_end_lsn; + WT_ERR(__wt_cond_signal( + session, log->log_sync_cond)); + locked = 0; + __wt_spin_unlock(session, &log->log_sync_lock); + } } /* * If a later thread asked for a background sync, do it now. */ - if (WT_LOG_CMP(&log->bg_sync_lsn, &log->sync_lsn) > 0) { + if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) { /* * Save the latest write LSN which is the minimum * we will have written to disk. */ min_lsn = log->write_lsn; /* - * The sync LSN we asked for better be smaller than - * the current written LSN. + * We have to wait until the LSN we asked for is + * written. If it isn't signal the wrlsn thread + * to get it written. */ - WT_ASSERT(session, - WT_LOG_CMP(&log->bg_sync_lsn, &min_lsn) <= 0); - WT_ERR(__wt_fsync(session, log->log_fh)); - __wt_spin_lock(session, &log->log_sync_lock); - locked = 1; - /* - * The sync LSN could have advanced while we were - * writing to disk. - */ - if (WT_LOG_CMP(&log->sync_lsn, &min_lsn) <= 0) { - log->sync_lsn = min_lsn; + if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) { + WT_ERR(__wt_fsync(session, log->log_fh)); + __wt_spin_lock(session, &log->log_sync_lock); + locked = 1; + /* + * The sync LSN could have advanced while we + * were writing to disk. + */ + if (__wt_log_cmp( + &log->sync_lsn, &min_lsn) <= 0) { + log->sync_lsn = min_lsn; + WT_ERR(__wt_cond_signal( + session, log->log_sync_cond)); + } + locked = 0; + __wt_spin_unlock(session, &log->log_sync_lock); + } else { WT_ERR(__wt_cond_signal( - session, log->log_sync_cond)); + session, conn->log_wrlsn_cond)); + /* + * We do not want to wait potentially a second + * to process this. Yield to give the wrlsn + * thread a chance to run and try again in + * this case. 
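
The rewritten file server above repeatedly orders LSNs with __wt_log_cmp (replacing the old WT_LOG_CMP macro): log file number first, then byte offset within the file. A minimal standalone sketch of that ordering; the struct and names here are illustrative, not WiredTiger's WT_LSN:

    #include <stdint.h>
    #include <stdio.h>

    struct lsn {
        uint32_t file;      /* log file number */
        uint32_t offset;    /* byte offset within the file */
    };

    static int
    lsn_cmp(const struct lsn *a, const struct lsn *b)
    {
        /* Order by file number first, then by offset within the file. */
        if (a->file != b->file)
            return (a->file < b->file ? -1 : 1);
        if (a->offset != b->offset)
            return (a->offset < b->offset ? -1 : 1);
        return (0);
    }

    int
    main(void)
    {
        struct lsn sync_lsn = { 3, 512 }, close_end = { 4, 0 };

        /* The start of log file 4 sorts after any offset in file 3. */
        printf("%d\n", lsn_cmp(&close_end, &sync_lsn)); /* prints 1 */
        return (0);
    }
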
+ */ + __wt_yield(); + continue; } - locked = 0; - __wt_spin_unlock(session, &log->log_sync_lock); } /* Wait until the next event. */ WT_ERR(__wt_cond_wait( @@ -394,26 +426,29 @@ typedef struct { /* * __wt_log_wrlsn -- * Process written log slots and attempt to coalesce them if the LSNs - * are contiguous. Returns 1 if slots were freed, 0 if no slots were - * freed in the progress arg. Must be called with the log slot lock held. + * are contiguous. The purpose of this function is to advance the + * write_lsn in LSN order after the buffer is written to the log file. */ int -__wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) +__wt_log_wrlsn(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; + WT_DECL_RET; WT_LOG *log; WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL]; WT_LOGSLOT *coalescing, *slot; + WT_LSN save_lsn; size_t written_i; uint32_t i, save_i; conn = S2C(session); log = conn->log; + __wt_spin_lock(session, &log->log_writelsn_lock); +restart: coalescing = NULL; + WT_INIT_LSN(&save_lsn); written_i = 0; i = 0; - if (free_i != NULL) - *free_i = WT_SLOT_POOL; /* * Walk the array once saving any slots that are in the @@ -422,9 +457,14 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) while (i < WT_SLOT_POOL) { save_i = i; slot = &log->slot_pool[i++]; - if (free_i != NULL && *free_i == WT_SLOT_POOL && - slot->slot_state == WT_LOG_SLOT_FREE) - *free_i = save_i; + /* + * XXX - During debugging I saw slot 0 become orphaned. + * I believe it is fixed, but check for now. + * This assertion should catch that. + */ + if (slot->slot_state == 0) + WT_ASSERT(session, + slot->slot_release_lsn.file >= log->write_lsn.file); if (slot->slot_state != WT_LOG_SLOT_WRITTEN) continue; written[written_i].slot_index = save_i; @@ -435,15 +475,8 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) * based on the release LSN, and then look for them in order. */ if (written_i > 0) { - /* - * If wanted, reset the yield variable to indicate that we - * have found written slots. - */ - if (yield != NULL) - *yield = 0; WT_INSERTION_SORT(written, written_i, WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT); - /* * We know the written array is sorted by LSN. Go * through them either advancing write_lsn or coalesce @@ -451,8 +484,28 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) */ for (i = 0; i < written_i; i++) { slot = &log->slot_pool[written[i].slot_index]; + /* + * The log server thread pushes out slots periodically. + * Sometimes they are empty slots. If we find an + * empty slot, where empty means the start and end LSN + * are the same, free it and continue. + */ + if (__wt_log_cmp(&slot->slot_start_lsn, + &slot->slot_release_lsn) == 0 && + __wt_log_cmp(&slot->slot_start_lsn, + &slot->slot_end_lsn) == 0) { + __wt_log_slot_free(session, slot); + continue; + } if (coalescing != NULL) { - if (WT_LOG_CMP(&coalescing->slot_end_lsn, + /* + * If the write_lsn changed, we may be able to + * process slots. Try again. + */ + if (__wt_log_cmp( + &log->write_lsn, &save_lsn) != 0) + goto restart; + if (__wt_log_cmp(&coalescing->slot_end_lsn, &written[i].lsn) != 0) { coalescing = slot; continue; @@ -461,6 +514,8 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) * If we get here we have a slot to coalesce * and free. 
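
Conceptually, __wt_log_wrlsn sorts the written slots by release LSN and then advances write_lsn only across contiguous ranges, holding back anything after a gap for a later pass. A toy model of that walk, using plain byte offsets in place of LSNs (all names here are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    struct slot {
        uint64_t start, end;    /* written byte range */
    };

    static uint64_t
    advance_write_lsn(uint64_t write_lsn, struct slot *sorted, int n)
    {
        int i;

        for (i = 0; i < n; i++) {
            if (sorted[i].start != write_lsn)
                break;  /* gap: an earlier write is still in flight */
            write_lsn = sorted[i].end;
        }
        return (write_lsn);
    }

    int
    main(void)
    {
        struct slot done[] = { { 0, 100 }, { 100, 250 }, { 400, 500 } };

        /* Advances to 250; the 400-500 slot waits for 250-400. */
        printf("%llu\n",
            (unsigned long long)advance_write_lsn(0, done, 3));
        return (0);
    }
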
*/ + coalescing->slot_last_offset = + slot->slot_last_offset; coalescing->slot_end_lsn = slot->slot_end_lsn; WT_STAT_FAST_CONN_INCR( session, log_slot_coalesced); @@ -473,8 +528,12 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) /* * If this written slot is not the next LSN, * try to start coalescing with later slots. + * A synchronous write may update write_lsn + * so save the last one we saw to check when + * coalescing slots. */ - if (WT_LOG_CMP( + save_lsn = log->write_lsn; + if (__wt_log_cmp( &log->write_lsn, &written[i].lsn) != 0) { coalescing = slot; continue; @@ -483,27 +542,29 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) * If we get here we have a slot to process. * Advance the LSN and process the slot. */ - WT_ASSERT(session, WT_LOG_CMP(&written[i].lsn, + WT_ASSERT(session, __wt_log_cmp(&written[i].lsn, &slot->slot_release_lsn) == 0); + if (slot->slot_start_lsn.offset != + slot->slot_last_offset) + slot->slot_start_lsn.offset = + slot->slot_last_offset; log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; - WT_RET(__wt_cond_signal( + WT_ERR(__wt_cond_signal( session, log->log_write_cond)); WT_STAT_FAST_CONN_INCR(session, log_write_lsn); /* * Signal the close thread if needed. */ if (F_ISSET(slot, WT_SLOT_CLOSEFH)) - WT_RET(__wt_cond_signal( + WT_ERR(__wt_cond_signal( session, conn->log_file_cond)); } - WT_RET(__wt_log_slot_free(session, slot)); - if (free_i != NULL && *free_i == WT_SLOT_POOL && - slot->slot_state == WT_LOG_SLOT_FREE) - *free_i = written[i].slot_index; + __wt_log_slot_free(session, slot); } } - return (0); +err: __wt_spin_unlock(session, &log->log_writelsn_lock); + return (ret); } /* @@ -515,31 +576,26 @@ __log_wrlsn_server(void *arg) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; - WT_LOG *log; WT_SESSION_IMPL *session; - int locked, yield; session = arg; conn = S2C(session); - log = conn->log; - locked = yield = 0; while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { - __wt_spin_lock(session, &log->log_slot_lock); - locked = 1; - WT_ERR(__wt_log_wrlsn(session, NULL, &yield)); - locked = 0; - __wt_spin_unlock(session, &log->log_slot_lock); - if (++yield < 1000) - __wt_yield(); - else - WT_ERR(__wt_cond_wait(session, - conn->log_wrlsn_cond, 100000)); + /* + * Write out any log record buffers. + */ + WT_ERR(__wt_log_wrlsn(session)); + WT_ERR(__wt_cond_wait(session, conn->log_wrlsn_cond, 10000)); } + /* + * On close we need to do this one more time because there could + * be straggling log writes that need to be written. + */ + WT_ERR(__wt_log_force_write(session, 1)); + WT_ERR(__wt_log_wrlsn(session)); if (0) { err: __wt_err(session, ret, "log wrlsn server error"); } - if (locked) - __wt_spin_unlock(session, &log->log_slot_lock); return (WT_THREAD_RET_VALUE); } @@ -554,44 +610,81 @@ __log_server(void *arg) WT_DECL_RET; WT_LOG *log; WT_SESSION_IMPL *session; - u_int locked; + int freq_per_sec, signalled; session = arg; conn = S2C(session); log = conn->log; - locked = 0; + signalled = 0; + + /* + * Set this to the number of times per second we want to force out the + * log slot buffer. + */ +#define WT_FORCE_PER_SECOND 20 + freq_per_sec = WT_FORCE_PER_SECOND; + + /* + * The log server thread does a variety of work. It forces out any + * buffered log writes. It pre-allocates log files and it performs + * log archiving. The reason the wrlsn thread does not force out + * the buffered writes is because we want to process and move the + * write_lsn forward as quickly as possible. 
The same reason applies + * to why the log file server thread does not force out the writes. + * That thread does fsync calls which can take a long time and we + * don't want log records sitting in the buffer over the time it + * takes to sync out an earlier file. + */ while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { /* - * Perform log pre-allocation. + * Slots depend on future activity. Force out buffered + * writes in case we are idle. This cannot be part of the + * wrlsn thread because of interaction advancing the write_lsn + * and a buffer may need to wait for the write_lsn to advance + * in the case of a synchronous buffer. We end up with a hang. */ - if (conn->log_prealloc > 0) - WT_ERR(__log_prealloc_once(session)); + WT_ERR_BUSY_OK(__wt_log_force_write(session, 0)); /* - * Perform the archive. + * We don't want to archive or pre-allocate files as often as + * we want to force out log buffers. Only do it once per second + * or if the condition was signalled. */ - if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) { - if (__wt_try_writelock( - session, log->log_archive_lock) == 0) { - locked = 1; - WT_ERR(__log_archive_once(session, 0)); - WT_ERR( __wt_writeunlock( - session, log->log_archive_lock)); - locked = 0; - } else - WT_ERR(__wt_verbose(session, WT_VERB_LOG, - "log_archive: Blocked due to open log " - "cursor holding archive lock")); + if (--freq_per_sec <= 0 || signalled != 0) { + freq_per_sec = WT_FORCE_PER_SECOND; + + /* + * Perform log pre-allocation. + */ + if (conn->log_prealloc > 0) + WT_ERR(__log_prealloc_once(session)); + + /* + * Perform the archive. + */ + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) { + if (__wt_try_writelock( + session, log->log_archive_lock) == 0) { + ret = __log_archive_once(session, 0); + WT_TRET(__wt_writeunlock( + session, log->log_archive_lock)); + WT_ERR(ret); + } else + WT_ERR( + __wt_verbose(session, WT_VERB_LOG, + "log_archive: Blocked due to open " + "log cursor holding archive lock")); + } } + /* Wait until the next event. 
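
The loop below implements this with a simple frequency divider: wake WT_FORCE_PER_SECOND times per second, force the log buffers on every wakeup, and only do the once-per-second archive/pre-allocation work when the counter runs out or the thread was signalled. A condensed sketch of that shape; the helper functions are hypothetical stand-ins for the WiredTiger calls:

    #include <stdbool.h>
    #include <stdio.h>

    #define FORCE_PER_SECOND 20

    static void force_write(void) { /* push buffered log records */ }
    static void once_per_second_work(void) { puts("archive/prealloc"); }
    static bool wait_usecs(unsigned usecs) { (void)usecs; return (false); }

    static void
    log_server_loop(int iterations)
    {
        int countdown = FORCE_PER_SECOND;
        bool signalled = false;

        while (iterations-- > 0) {
            force_write();      /* every wakeup: push log buffers */
            if (--countdown <= 0 || signalled) {
                countdown = FORCE_PER_SECOND;
                once_per_second_work(); /* archive, pre-allocate */
            }
            signalled = wait_usecs(1000000 / FORCE_PER_SECOND);
        }
    }

    int
    main(void)
    {
        log_server_loop(40);    /* ~2 simulated seconds: prints twice */
        return (0);
    }
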
*/ - WT_ERR(__wt_cond_wait(session, conn->log_cond, WT_MILLION)); + WT_ERR(__wt_cond_wait_signal(session, conn->log_cond, + WT_MILLION / WT_FORCE_PER_SECOND, &signalled)); } if (0) { err: __wt_err(session, ret, "log server error"); } - if (locked) - (void)__wt_writeunlock(session, log->log_archive_lock); return (WT_THREAD_RET_VALUE); } @@ -624,6 +717,8 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_spin_init(session, &log->log_lock, "log")); WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot")); WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync")); + WT_RET(__wt_spin_init(session, &log->log_writelsn_lock, + "log write LSN")); WT_RET(__wt_rwlock_alloc(session, &log->log_archive_lock, "log archive lock")); if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG)) @@ -755,13 +850,11 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) WT_TRET(__wt_thread_join(session, conn->log_tid)); conn->log_tid_set = 0; } - WT_TRET(__wt_cond_destroy(session, &conn->log_cond)); if (conn->log_file_tid_set) { WT_TRET(__wt_cond_signal(session, conn->log_file_cond)); WT_TRET(__wt_thread_join(session, conn->log_file_tid)); conn->log_file_tid_set = 0; } - WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond)); if (conn->log_file_session != NULL) { wt_session = &conn->log_file_session->iface; WT_TRET(wt_session->close(wt_session, NULL)); @@ -772,13 +865,13 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid)); conn->log_wrlsn_tid_set = 0; } - WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond)); if (conn->log_wrlsn_session != NULL) { wt_session = &conn->log_wrlsn_session->iface; WT_TRET(wt_session->close(wt_session, NULL)); conn->log_wrlsn_session = NULL; } + WT_TRET(__wt_log_slot_destroy(session)); WT_TRET(__wt_log_close(session)); /* Close the server thread's session. */ @@ -788,13 +881,18 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn->log_session = NULL; } - WT_TRET(__wt_log_slot_destroy(session)); + /* Destroy the condition variables now that all threads are stopped */ + WT_TRET(__wt_cond_destroy(session, &conn->log_cond)); + WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond)); + WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond)); + WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond)); WT_TRET(__wt_rwlock_destroy(session, &conn->log->log_archive_lock)); __wt_spin_destroy(session, &conn->log->log_lock); __wt_spin_destroy(session, &conn->log->log_slot_lock); __wt_spin_destroy(session, &conn->log->log_sync_lock); + __wt_spin_destroy(session, &conn->log->log_writelsn_lock); __wt_free(session, conn->log_path); __wt_free(session, conn->log); return (ret); diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index 397f3ff8c38..8bc69bb3e80 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -111,14 +111,17 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) F_CLR(conn, WT_CONN_SERVER_RUN); WT_TRET(__wt_async_destroy(session)); WT_TRET(__wt_lsm_manager_destroy(session)); + WT_TRET(__wt_sweep_destroy(session)); F_SET(conn, WT_CONN_CLOSING); WT_TRET(__wt_checkpoint_server_destroy(session)); WT_TRET(__wt_statlog_destroy(session, 1)); - WT_TRET(__wt_sweep_destroy(session)); WT_TRET(__wt_evict_destroy(session)); + /* Shut down the lookaside table, after all eviction is complete. 
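
The reordering in __wt_logmgr_destroy above follows a standard shutdown rule: wake and join every server thread before destroying the condition variables those threads wait on. The same pattern in a condensed pthread sketch (not WiredTiger's wrappers):

    #include <pthread.h>
    #include <stdio.h>

    struct server {
        pthread_t tid;
        pthread_mutex_t lock;
        pthread_cond_t cond;
        int running;
    };

    static void *
    server_main(void *arg)
    {
        struct server *s = arg;

        pthread_mutex_lock(&s->lock);
        while (s->running)      /* wait for the shutdown signal */
            pthread_cond_wait(&s->cond, &s->lock);
        pthread_mutex_unlock(&s->lock);
        return (NULL);
    }

    int
    main(void)
    {
        struct server s = { .running = 1 };

        pthread_mutex_init(&s.lock, NULL);
        pthread_cond_init(&s.cond, NULL);
        pthread_create(&s.tid, NULL, server_main, &s);

        /* Tell the thread to exit and wake it if it is waiting. */
        pthread_mutex_lock(&s.lock);
        s.running = 0;
        pthread_cond_signal(&s.cond);
        pthread_mutex_unlock(&s.lock);

        /* Join first: after this, no thread can wait on the condvar. */
        pthread_join(s.tid, NULL);

        /* Only now is it safe to destroy the synchronization objects. */
        pthread_cond_destroy(&s.cond);
        pthread_mutex_destroy(&s.lock);
        puts("clean shutdown");
        return (0);
    }
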
*/ + WT_TRET(__wt_las_destroy(session)); + /* Close open data handles. */ WT_TRET(__wt_conn_dhandle_discard(session)); @@ -238,9 +241,7 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) /* Run recovery. */ WT_RET(__wt_txn_recover(session)); - /* - * Start the handle sweep thread. - */ + /* Start the handle sweep thread. */ WT_RET(__wt_sweep_create(session)); /* Start the optional async threads. */ diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c index 80698c536cd..3b188bfd22a 100644 --- a/src/third_party/wiredtiger/src/conn/conn_stat.c +++ b/src/third_party/wiredtiger/src/conn/conn_stat.c @@ -50,6 +50,7 @@ __wt_conn_stat_init(WT_SESSION_IMPL *session) __wt_async_stats_update(session); __wt_cache_stats_update(session); + __wt_las_stats_update(session); __wt_txn_stats_update(session); WT_STAT_SET(session, stats, file_open, conn->open_file_count); diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index 3de9347f38f..8da32416242 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -283,6 +283,13 @@ __sweep_server(void *arg) WT_STAT_FAST_CONN_INCR(session, dh_sweeps); /* + * Sweep the lookaside table. If the lookaside table hasn't yet + * been written, there's no work to do. + */ + if (__wt_las_is_written(session)) + WT_ERR(__wt_las_sweep(session)); + + /* * Mark handles with a time of death, and report whether any * handles are marked dead. If sweep_idle_time is 0, handles * never become idle. @@ -359,8 +366,14 @@ __wt_sweep_create(WT_SESSION_IMPL *session) /* * Handle sweep does enough I/O it may be called upon to perform slow * operations for the block manager. + * + * The sweep thread sweeps the lookaside table for outdated records, + * it gets its own cursor for that purpose. + * + * Don't tap the sweep thread for eviction. */ - F_SET(session, WT_SESSION_CAN_WAIT); + F_SET(session, WT_SESSION_CAN_WAIT | + WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION); WT_RET(__wt_cond_alloc( session, "handle sweep server", 0, &conn->sweep_cond)); @@ -399,5 +412,9 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session) conn->sweep_session = NULL; } + + /* Discard any saved lookaside key. */ + __wt_buf_free(session, &conn->las_sweep_key); + return (ret); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c index 60d94697189..3d9e5e405e8 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_backup.c +++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c @@ -514,17 +514,23 @@ static int __backup_list_all_append(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CURSOR_BACKUP *cb; + const char *name; WT_UNUSED(cfg); cb = session->bkp_cursor; + name = session->dhandle->name; /* Ignore files in the process of being bulk-loaded. */ if (F_ISSET(S2BT(session), WT_BTREE_BULK)) return (0); + /* Ignore the lookaside table. */ + if (strcmp(name, WT_LAS_URI) == 0) + return (0); + /* Add the file to the list of files to be copied. 
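
The backup filter above is a plain string comparison against the lookaside table's URI, since the lookaside file holds only transient update records that a backup should never carry. A toy version; the URI literal is assumed here for illustration, the real code compares against WT_LAS_URI:

    #include <stdio.h>
    #include <string.h>

    #define LAS_URI "file:WiredTigerLAS.wt"  /* assumed spelling */

    static int
    backup_should_copy(const char *name)
    {
        /* The lookaside file holds only transient data; skip it. */
        return (strcmp(name, LAS_URI) != 0);
    }

    int
    main(void)
    {
        printf("%d\n", backup_should_copy("file:test.wt")); /* 1 */
        printf("%d\n", backup_should_copy(LAS_URI));        /* 0 */
        return (0);
    }
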
*/ - return (__backup_list_append(session, cb, session->dhandle->name)); + return (__backup_list_append(session, cb, name)); } /* diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c index c58d6899150..8ee57d24413 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_ds.c +++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c @@ -510,7 +510,7 @@ __wt_curds_open( source = data_source->source; source->session = (WT_SESSION *)session; memset(&source->q, 0, sizeof(source->q)); - source->recno = 0; + source->recno = WT_RECNO_OOB; memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf)); memset(&source->key, 0, sizeof(source->key)); memset(&source->value, 0, sizeof(source->value)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c index a9f3124149e..c998565eb75 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_file.c +++ b/src/third_party/wiredtiger/src/cursor/cur_file.c @@ -369,15 +369,20 @@ __curfile_close(WT_CURSOR *cursor) __wt_buf_free(session, &cbulk->last); } - WT_TRET(__wt_btcur_close(cbt)); - if (cbt->btree != NULL) { + WT_TRET(__wt_btcur_close(cbt, 0)); + /* The URI is owned by the btree handle. */ + cursor->internal_uri = NULL; + WT_TRET(__wt_cursor_close(cursor)); + + /* + * Note: release the data handle last so that cursor statistics are + * updated correctly. + */ + if (session->dhandle != NULL) { /* Increment the data-source's in-use counter. */ __wt_cursor_dhandle_decr_use(session); WT_TRET(__wt_session_release_btree(session)); } - /* The URI is owned by the btree handle. */ - cursor->internal_uri = NULL; - WT_TRET(__wt_cursor_close(cursor)); err: API_END_RET(session, ret); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c index 7dad85e9d38..045663b3614 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_index.c +++ b/src/third_party/wiredtiger/src/cursor/cur_index.c @@ -130,7 +130,8 @@ __curindex_move(WT_CURSOR_INDEX *cindex) (*cp)->recno = first->recno; } F_SET(*cp, WT_CURSTD_KEY_EXT); - WT_RET((*cp)->search(*cp)); + if (cindex->cg_needvalue[i]) + WT_RET((*cp)->search(*cp)); } F_SET(&cindex->iface, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); @@ -320,6 +321,7 @@ __curindex_close(WT_CURSOR *cursor) *cp = NULL; } + __wt_free(session, cindex->cg_needvalue); __wt_free(session, cindex->cg_cursors); if (cindex->key_plan != idx->key_plan) __wt_free(session, cindex->key_plan); @@ -353,14 +355,19 @@ __curindex_open_colgroups( /* Child cursors are opened with dump disabled. */ const char *cfg[] = { cfg_arg[0], cfg_arg[1], "dump=\"\"", NULL }; char *proj; + size_t cgcnt; table = cindex->table; - WT_RET(__wt_calloc_def(session, WT_COLGROUPS(table), &cp)); + cgcnt = WT_COLGROUPS(table); + WT_RET(__wt_calloc_def(session, cgcnt, &cindex->cg_needvalue)); + WT_RET(__wt_calloc_def(session, cgcnt, &cp)); cindex->cg_cursors = cp; /* Work out which column groups we need. 
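
The scan below walks the index's value plan, a string of <column-group number><operation> pairs, and records which groups actually supply values so __curindex_move can skip searching the rest. A simplified parse of that idea, with 'k'/'v' standing in for WT_PROJ_KEY/WT_PROJ_VALUE:

    #include <stdio.h>
    #include <stdlib.h>

    static void
    mark_needed(const char *plan, int needvalue[], int ngroups)
    {
        char *p = (char *)plan;
        unsigned long cg;

        while (*p != '\0') {
            cg = strtoul(p, &p, 10);    /* column-group number */
            if (*p == 'v' && cg < (unsigned long)ngroups)
                needvalue[cg] = 1;      /* value comes from this group */
            if (*p != '\0')
                p++;                    /* skip the operation tag */
        }
    }

    int
    main(void)
    {
        int need[3] = { 0, 0, 0 };

        mark_needed("0k2v", need, 3);
        printf("%d %d %d\n", need[0], need[1], need[2]); /* 0 0 1 */
        return (0);
    }
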
*/ for (proj = (char *)cindex->value_plan; *proj != '\0'; proj++) { arg = strtoul(proj, &proj, 10); + if (*proj == WT_PROJ_VALUE) + cindex->cg_needvalue[arg] = 1; if ((*proj != WT_PROJ_KEY && *proj != WT_PROJ_VALUE) || cp[arg] != NULL) continue; diff --git a/src/third_party/wiredtiger/src/cursor/cur_log.c b/src/third_party/wiredtiger/src/cursor/cur_log.c index 3376f2a3166..ade9fd18962 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_log.c +++ b/src/third_party/wiredtiger/src/cursor/cur_log.c @@ -74,7 +74,7 @@ __curlog_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) acl = (WT_CURSOR_LOG *)a; bcl = (WT_CURSOR_LOG *)b; WT_ASSERT(session, cmpp != NULL); - *cmpp = WT_LOG_CMP(acl->cur_lsn, bcl->cur_lsn); + *cmpp = __wt_log_cmp(acl->cur_lsn, bcl->cur_lsn); /* * If both are on the same LSN, compare step counter. */ @@ -392,6 +392,12 @@ __wt_curlog_open(WT_SESSION_IMPL *session, WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp)); + /* + * The user may be trying to read a log record they just wrote. + * Log records may be buffered, so force out any now. + */ + WT_ERR(__wt_log_force_write(session, 1)); + /* Log cursors block archiving. */ WT_ERR(__wt_readlock(session, log->log_archive_lock)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c index 2f844baaa00..2216a1d969d 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_stat.c +++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c @@ -497,7 +497,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session, conn = S2C(session); - WT_ERR(__wt_calloc_one(session, &cst)); + WT_RET(__wt_calloc_one(session, &cst)); cursor = &cst->iface; *cursor = iface; cursor->session = &session->iface; diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c index b7d8be14e5c..701bd845ae9 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_std.c +++ b/src/third_party/wiredtiger/src/cursor/cur_std.c @@ -258,9 +258,9 @@ __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap) item->data, item->size, "q", &cursor->recno)); } else cursor->recno = va_arg(ap, uint64_t); - if (cursor->recno == 0) + if (cursor->recno == WT_RECNO_OOB) WT_ERR_MSG(session, EINVAL, - "Record numbers must be greater than zero"); + "%d is an invalid record number", WT_RECNO_OOB); buf->data = &cursor->recno; sz = sizeof(cursor->recno); } else { diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c index 35ff0e4329e..66fabe48fb2 100644 --- a/src/third_party/wiredtiger/src/evict/evict_file.c +++ b/src/third_party/wiredtiger/src/evict/evict_file.c @@ -80,16 +80,13 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop) break; case WT_SYNC_DISCARD: /* - * If we see a dirty page in a dead handle, clean the + * Dead handles may reference dirty pages; clean the * page, both to keep statistics correct, and to let * the page-discard function assert no dirty page is * ever discarded. 
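
The cursor set-key change above rejects WT_RECNO_OOB (zero) as a record number and reports the offending value. A reduced sketch of that validation, with a plain errno-style return in place of WT_ERR_MSG:

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    #define RECNO_OOB 0 /* illegal record number, as WT_RECNO_OOB */

    static int
    set_recno(uint64_t *slot, uint64_t recno)
    {
        if (recno == RECNO_OOB) {
            fprintf(stderr,
                "%d is an invalid record number\n", RECNO_OOB);
            return (EINVAL);    /* reject the out-of-band value */
        }
        *slot = recno;
        return (0);
    }

    int
    main(void)
    {
        uint64_t r;

        printf("%d\n", set_recno(&r, 42));  /* 0: accepted */
        printf("%d\n", set_recno(&r, 0));   /* EINVAL: rejected */
        return (0);
    }
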
*/ - if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) && - __wt_page_is_modified(page)) { - page->modify->write_gen = 0; - __wt_cache_dirty_decr(session, page); - } + if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) + __wt_page_modify_clear(session, page); WT_ASSERT(session, F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index d442a34de71..b16621d1e6f 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -10,14 +10,13 @@ static int __evict_clear_all_walks(WT_SESSION_IMPL *); static int __evict_clear_walks(WT_SESSION_IMPL *); -static int __evict_has_work(WT_SESSION_IMPL *, uint32_t *); static int WT_CDECL __evict_lru_cmp(const void *, const void *); static int __evict_lru_pages(WT_SESSION_IMPL *, int); -static int __evict_lru_walk(WT_SESSION_IMPL *, uint32_t); +static int __evict_lru_walk(WT_SESSION_IMPL *); static int __evict_page(WT_SESSION_IMPL *, int); static int __evict_pass(WT_SESSION_IMPL *); -static int __evict_walk(WT_SESSION_IMPL *, uint32_t); -static int __evict_walk_file(WT_SESSION_IMPL *, u_int *, uint32_t); +static int __evict_walk(WT_SESSION_IMPL *); +static int __evict_walk_file(WT_SESSION_IMPL *, u_int *); static WT_THREAD_RET __evict_worker(void *); static int __evict_server_work(WT_SESSION_IMPL *); @@ -248,9 +247,16 @@ __evict_workers_resize(WT_SESSION_IMPL *session) for (i = conn->evict_workers_alloc; i < conn->evict_workers_max; i++) { WT_ERR(__wt_open_internal_session(conn, - "eviction-worker", 0, 0, &workers[i].session)); + "eviction-worker", 1, 0, &workers[i].session)); workers[i].id = i; - F_SET(workers[i].session, WT_SESSION_CAN_WAIT); + + /* + * Eviction worker threads get their own lookaside table cursor. + * Eviction worker threads may be called upon to perform slow + * operations for the block manager. + */ + F_SET(workers[i].session, + WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_CAN_WAIT); if (i < conn->evict_workers_min) { ++conn->evict_workers; @@ -280,7 +286,7 @@ __wt_evict_create(WT_SESSION_IMPL *session) /* We need a session handle because we're reading/writing pages. */ WT_RET(__wt_open_internal_session( - conn, "eviction-server", 0, 0, &conn->evict_session)); + conn, "eviction-server", 1, 0, &conn->evict_session)); session = conn->evict_session; /* @@ -297,6 +303,9 @@ __wt_evict_create(WT_SESSION_IMPL *session) else F_SET(session, WT_SESSION_CAN_WAIT); + /* The eviction server gets its own lookaside table cursor. */ + F_SET(session, WT_SESSION_LOOKASIDE_CURSOR); + /* * Start the primary eviction server thread after the worker threads * have started to avoid it starting additional worker threads before @@ -406,47 +415,62 @@ err: WT_PANIC_MSG(session, ret, "cache eviction worker error"); } /* - * __evict_has_work -- - * Find out if there is eviction work to be done. + * __evict_update_work -- + * Configure eviction work state. */ -static int -__evict_has_work(WT_SESSION_IMPL *session, uint32_t *flagsp) +static bool +__evict_update_work(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; - uint32_t flags; - int evict, dirty; + uint64_t bytes_inuse, bytes_max, dirty_inuse; conn = S2C(session); cache = conn->cache; - *flagsp = flags = 0; + + /* Clear previous state. */ + cache->state = 0; if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) - return (0); + return (false); - /* Check to see if the eviction server should run. 
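
The replacement __evict_update_work boils down to two percentage comparisons, sketched below as a pure function; the +1 on the cache size mirrors the guard against dividing by zero before a shared cache has been sized. Targets are percentages of the cache size, as in the real configuration:

    #include <stdint.h>
    #include <stdio.h>

    enum pass { PASS_NONE, PASS_ALL, PASS_DIRTY };

    static enum pass
    update_work(uint64_t cache_size, uint64_t inuse, uint64_t dirty,
        unsigned target_pct, unsigned dirty_target_pct)
    {
        uint64_t bytes_max = cache_size + 1;    /* avoid division by zero */

        if (inuse > (target_pct * bytes_max) / 100)
            return (PASS_ALL);      /* evicting any page helps */
        if (dirty > (dirty_target_pct * bytes_max) / 100)
            return (PASS_DIRTY);    /* only dirty pages help */
        return (PASS_NONE);
    }

    int
    main(void)
    {
        /* 100MB cache, 90MB in use, 10MB dirty, targets 80%/5%. */
        printf("%d\n",
            update_work(100 << 20, 90 << 20, 10 << 20, 80, 5)); /* 1 */
        return (0);
    }
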
*/ - __wt_cache_status(session, &evict, &dirty); - if (evict) - /* The cache is too small. */ - LF_SET(WT_EVICT_PASS_ALL); - else if (dirty) - /* Too many dirty pages, ignore clean pages. */ - LF_SET(WT_EVICT_PASS_DIRTY); - else if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) { - /* - * Evict pages with oldest generation (which would otherwise - * block application threads) set regardless of whether we have - * reached the eviction trigger. - */ - LF_SET(WT_EVICT_PASS_WOULD_BLOCK); - F_CLR(cache, WT_CACHE_WOULD_BLOCK); + /* + * Page eviction overrides the dirty target and other types of eviction, + * that is, we don't care where we are with respect to the dirty target + * if page eviction is configured. + * + * Avoid division by zero if the cache size has not yet been set in a + * shared cache. + */ + bytes_max = conn->cache_size + 1; + bytes_inuse = __wt_cache_bytes_inuse(cache); + if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) { + FLD_SET(cache->state, WT_EVICT_PASS_ALL); + goto done; } - if (F_ISSET(cache, WT_CACHE_STUCK)) - LF_SET(WT_EVICT_PASS_AGGRESSIVE); + dirty_inuse = __wt_cache_dirty_inuse(cache); + if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) { + FLD_SET(cache->state, WT_EVICT_PASS_DIRTY); + goto done; + } - *flagsp = flags; - return (0); + /* + * Evict pages with oldest generation (which would otherwise block + * application threads), set regardless of whether we have reached + * the eviction trigger. + */ + if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) { + FLD_SET(cache->state, WT_EVICT_PASS_WOULD_BLOCK); + + F_CLR(cache, WT_CACHE_WOULD_BLOCK); + goto done; + } + return (false); + +done: if (F_ISSET(cache, WT_CACHE_STUCK)) + FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE); + return (true); } /* @@ -460,7 +484,6 @@ __evict_pass(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_EVICT_WORKER *worker; uint64_t pages_evicted; - uint32_t flags; int loop; conn = S2C(session); @@ -483,10 +506,10 @@ __evict_pass(WT_SESSION_IMPL *session) } /* - * Increment the shared read generation. We do this - * occasionally even if eviction is not currently required, so - * that pages have some relative read generation when the - * eviction server does need to do some work. + * Increment the shared read generation. Do this occasionally + * even if eviction is not currently required, so that pages + * have some relative read generation when the eviction server + * does need to do some work. */ __wt_cache_read_gen_incr(session); @@ -502,18 +525,17 @@ __evict_pass(WT_SESSION_IMPL *session) */ __wt_txn_update_oldest(session, 1); - WT_RET(__evict_has_work(session, &flags)); - if (flags == 0) + if (!__evict_update_work(session)) break; if (loop > 10) - LF_SET(WT_EVICT_PASS_AGGRESSIVE); + FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE); /* * Start a worker if we have capacity and we haven't reached * the eviction targets. */ - if (LF_ISSET(WT_EVICT_PASS_ALL | + if (FLD_ISSET(cache->state, WT_EVICT_PASS_ALL | WT_EVICT_PASS_DIRTY | WT_EVICT_PASS_WOULD_BLOCK) && conn->evict_workers < conn->evict_workers_max) { WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, @@ -532,7 +554,7 @@ __evict_pass(WT_SESSION_IMPL *session) " In use: %" PRIu64 " Dirty: %" PRIu64, conn->cache_size, cache->bytes_inmem, cache->bytes_dirty)); - WT_RET(__evict_lru_walk(session, flags)); + WT_RET(__evict_lru_walk(session)); WT_RET(__evict_server_work(session)); /* @@ -553,7 +575,8 @@ __evict_pass(WT_SESSION_IMPL *session) * Mark the cache as stuck if we need space * and aren't evicting any pages. 
*/ - if (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK)) { + if (!FLD_ISSET(cache->state, + WT_EVICT_PASS_WOULD_BLOCK)) { F_SET(cache, WT_CACHE_STUCK); WT_STAT_FAST_CONN_INCR( session, cache_eviction_slow); @@ -673,44 +696,6 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session) } /* - * __wt_evict_page -- - * Evict a given page. - */ -int -__wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref) -{ - WT_DECL_RET; - WT_TXN *txn; - WT_TXN_ISOLATION saved_iso; - - /* - * We have to take care when evicting pages not to write a change that: - * (a) is not yet committed; or - * (b) is committed more recently than an in-progress checkpoint. - * - * We handle both of these cases by setting up the transaction context - * before evicting, using a special "eviction" isolation level, where - * only globally visible updates can be evicted. - */ - __wt_txn_update_oldest(session, 1); - txn = &session->txn; - saved_iso = txn->isolation; - txn->isolation = WT_ISO_EVICTION; - - /* - * Sanity check: if a transaction has updates, its updates should not - * be visible to eviction. - */ - WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_ID) || - !__wt_txn_visible(session, txn->id)); - - ret = __wt_evict(session, ref, 0); - txn->isolation = saved_iso; - - return (ret); -} - -/* * __wt_evict_file_exclusive_on -- * Get exclusive eviction access to a file and discard any of the file's * blocks queued for eviction. @@ -808,7 +793,7 @@ __evict_lru_pages(WT_SESSION_IMPL *session, int is_server) * Add pages to the LRU queue to be evicted from cache. */ static int -__evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags) +__evict_lru_walk(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_DECL_RET; @@ -819,7 +804,7 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags) cache = S2C(session)->cache; /* Get some more pages to consider for eviction. */ - if ((ret = __evict_walk(session, flags)) != 0) + if ((ret = __evict_walk(session)) != 0) return (ret == EBUSY ? 0 : ret); /* Sort the list into LRU order and restart. */ @@ -851,7 +836,8 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags) /* Track the oldest read generation we have in the queue. */ cache->read_gen_oldest = cache->evict[0].ref->page->read_gen; - if (LF_ISSET(WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) + if (FLD_ISSET(cache->state, + WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) /* * Take all candidates if we only gathered pages with an oldest * read generation set. @@ -929,7 +915,7 @@ __evict_server_work(WT_SESSION_IMPL *session) * Fill in the array by walking the next set of pages. */ static int -__evict_walk(WT_SESSION_IMPL *session, uint32_t flags) +__evict_walk(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; @@ -1023,7 +1009,7 @@ retry: while (slot < max_entries && ret == 0) { * stick in cache until we get aggressive. */ if ((btree->checkpointing || btree->evict_priority != 0) && - !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE)) + !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE)) continue; /* Skip files if we have used all available hazard pointers. 
*/ @@ -1055,7 +1041,7 @@ retry: while (slot < max_entries && ret == 0) { */ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { WT_WITH_DHANDLE(session, dhandle, - ret = __evict_walk_file(session, &slot, flags)); + ret = __evict_walk_file(session, &slot)); WT_ASSERT(session, session->split_gen == 0); } @@ -1093,7 +1079,8 @@ retry: while (slot < max_entries && ret == 0) { */ if (!F_ISSET(cache, WT_CACHE_CLEAR_WALKS) && ret == 0 && slot < max_entries && (retries < 2 || - (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) && retries < 10 && + (retries < 10 && + !FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) && (slot == cache->evict_entries || slot > start_slot)))) { start_slot = slot; ++retries; @@ -1136,10 +1123,11 @@ __evict_init_candidate( * Get a few page eviction candidates from a single underlying file. */ static int -__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) +__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) { WT_BTREE *btree; WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_EVICT_ENTRY *end, *evict, *start; WT_PAGE *page; @@ -1149,8 +1137,9 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) uint32_t walk_flags; int enough, internal_pages, modified, restarts; + conn = S2C(session); btree = S2BT(session); - cache = S2C(session)->cache; + cache = conn->cache; start = cache->evict + *slotp; end = WT_MIN(start + WT_EVICT_WALK_PER_FILE, cache->evict + cache->evict_slots); @@ -1204,21 +1193,21 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) goto fast; /* Optionally ignore clean pages. */ - if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY)) + if (!modified && FLD_ISSET(cache->state, WT_EVICT_PASS_DIRTY)) continue; /* * If we are only trickling out pages marked for definite * eviction, skip anything that isn't marked. */ - if (LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) && + if (FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) && page->read_gen != WT_READGEN_OLDEST) continue; /* Limit internal pages to 50% unless we get aggressive. */ if (WT_PAGE_IS_INTERNAL(page) && ++internal_pages > WT_EVICT_WALK_PER_FILE / 2 && - !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE)) + !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE)) continue; /* @@ -1233,36 +1222,44 @@ fast: /* If the page can't be evicted, give up. */ continue; /* - * If the page is clean but has modifications that appear too - * new to evict, skip it. + * Additional tests if eviction is likely to succeed. * - * Note: take care with ordering: if we detected that the page - * is modified above, we expect mod != NULL. + * If eviction is stuck or we are helping with forced eviction, + * try anyway: maybe a transaction that was running last time + * we wrote the page has since rolled back, or we can help the + * checkpoint complete sooner. Additionally, being stuck will + * configure lookaside table writes in reconciliation, allowing + * us to evict pages we can't usually evict. */ - mod = page->modify; - if (!modified && mod != NULL && !LF_ISSET( - WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) && - !__wt_txn_visible_all(session, mod->rec_max_txn)) - continue; + if (!FLD_ISSET(cache->state, + WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) { + /* + * Note: take care with ordering: if we detected that + * the page is modified above, we expect mod != NULL. + */ + mod = page->modify; - /* - * If the oldest transaction hasn't changed since the last time - * this page was written, it's unlikely that we can make - * progress. 
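
One of the filters in this walk caps internal pages at half the per-file batch unless eviction is aggressive, keeping the candidate queue leaf-heavy. Reduced to a toy predicate (constants and names are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    #define WALK_PER_FILE 10

    static bool
    accept_internal(int *internal_pages, bool aggressive)
    {
        /* Past half the budget, only an aggressive pass takes more. */
        if (++*internal_pages > WALK_PER_FILE / 2 && !aggressive)
            return (false);
        return (true);
    }

    int
    main(void)
    {
        int count = 0, taken = 0, i;

        for (i = 0; i < WALK_PER_FILE; i++)
            if (accept_internal(&count, false))
                taken++;
        printf("%d\n", taken);  /* 5 of 10 internal pages accepted */
        return (0);
    }
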
Similarly, if the most recent update on the page - * is not yet globally visible, eviction will fail. These - * heuristics attempt to avoid repeated attempts to evict the - * same page. - * - * That said, if eviction is stuck, or we are helping with - * forced eviction, try anyway: maybe a transaction that was - * running last time we wrote the page has since rolled back, - * or we can help get the checkpoint completed sooner. - */ - if (modified && !LF_ISSET( - WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) && - (mod->disk_snap_min == S2C(session)->txn_global.oldest_id || - !__wt_txn_visible_all(session, mod->update_txn))) - continue; + /* + * If the page is clean but has modifications that + * appear too new to evict, skip it. + */ + if (!modified && mod != NULL && + !__wt_txn_visible_all(session, mod->rec_max_txn)) + continue; + + /* + * If the oldest transaction hasn't changed since the + * last time this page was written, it's unlikely we + * can make progress. Similarly, if the most recent + * update on the page is not yet globally visible, + * eviction will fail. These heuristics attempt to + * avoid repeated attempts to evict the same page. + */ + if (modified && + (mod->disk_snap_min == conn->txn_global.oldest_id || + !__wt_txn_visible_all(session, mod->update_txn))) + continue; + } WT_ASSERT(session, evict->ref == NULL); __evict_init_candidate(session, evict, ref); @@ -1428,13 +1425,10 @@ __evict_page(WT_SESSION_IMPL *session, int is_server) * page-discard function assert that no dirty pages are ever * discarded. */ - if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD) && - __wt_page_is_modified(page)) { - page->modify->write_gen = 0; - __wt_cache_dirty_decr(session, page); - } + if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD)) + __wt_page_modify_clear(session, page); - WT_WITH_BTREE(session, btree, ret = __wt_evict_page(session, ref)); + WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, 0)); (void)__wt_atomic_subv32(&btree->evict_busy, 1); @@ -1453,7 +1447,7 @@ __evict_page(WT_SESSION_IMPL *session, int is_server) * crosses its boundaries. */ int -__wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full) +__wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; @@ -1570,29 +1564,31 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full) * NOTE: this function is not called anywhere, it is intended to be called * from a debugger. 
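
The __wt_cache_dump rewrite below follows a common debugger-helper pattern: accept an optional output file and fall back to stdout, closing only streams the function itself opened. The minimal form of that pattern, using stdio directly rather than the __wt_fopen/__wt_fclose wrappers:

    #include <stdio.h>

    static int
    dump(const char *ofile)
    {
        FILE *fp;

        /* NULL means write to standard output. */
        if (ofile == NULL)
            fp = stdout;
        else if ((fp = fopen(ofile, "w")) == NULL)
            return (1);

        fprintf(fp, "cache dump: ...\n");

        /* Only close streams this function opened. */
        if (fp != stdout && fclose(fp) != 0)
            return (1);
        return (0);
    }

    int
    main(void)
    {
        return (dump(NULL));
    }
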
*/ -void -__wt_cache_dump(WT_SESSION_IMPL *session) +int +__wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) { - WT_BTREE *btree; + FILE *fp; WT_CONNECTION_IMPL *conn; - WT_DATA_HANDLE *dhandle; - WT_REF *next_walk; + WT_DATA_HANDLE *dhandle, *saved_dhandle; WT_PAGE *page; + WT_REF *next_walk; uint64_t file_intl_pages, file_leaf_pages; uint64_t file_bytes, file_dirty, total_bytes; conn = S2C(session); total_bytes = 0; + if (ofile == NULL) + fp = stdout; + else + WT_RET(__wt_fopen(session, ofile, WT_FHANDLE_WRITE, 0, &fp)); + + saved_dhandle = session->dhandle; TAILQ_FOREACH(dhandle, &conn->dhqh, q) { if (!WT_PREFIX_MATCH(dhandle->name, "file:") || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; - btree = dhandle->handle; - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) - continue; - file_bytes = file_dirty = file_intl_pages = file_leaf_pages = 0; next_walk = NULL; session->dhandle = dhandle; @@ -1607,12 +1603,14 @@ __wt_cache_dump(WT_SESSION_IMPL *session) file_bytes += page->memory_footprint; if (__wt_page_is_modified(page)) file_dirty += page->memory_footprint; + (void)__wt_fprintf(fp, + "%" WT_SIZET_FMT ", ", page->memory_footprint); } session->dhandle = NULL; - printf("cache dump: %s%s%s%s:" - " %" PRIu64 " intl pages, %" PRIu64 " leaf pages," - " %" PRIu64 "MB, %" PRIu64 "MB dirty\n", + (void)__wt_fprintf(fp, "\n" "cache dump: %s%s%s%s\n\t" + " %" PRIu64 " internal pages, %" PRIu64 " leaf pages," + " %" PRIu64 "MB, %" PRIu64 "MB dirty\n==============\n", dhandle->name, dhandle->checkpoint == NULL ? "" : " [", dhandle->checkpoint == NULL ? "" : dhandle->checkpoint, @@ -1622,9 +1620,13 @@ __wt_cache_dump(WT_SESSION_IMPL *session) total_bytes += file_bytes; } - printf("cache dump: total found = %" PRIu64 "MB" + session->dhandle = saved_dhandle; + + (void)__wt_fprintf(fp, "cache dump: total found = %" PRIu64 "MB" " vs tracked inuse %" PRIu64 "MB\n", total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20); - fflush(stdout); + if (fp != stdout) + WT_RET(__wt_fclose(&fp, WT_FHANDLE_WRITE)); + return (0); } #endif diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 1e5faf45de2..11284ce7b21 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -150,17 +150,12 @@ done: if (((inmem_split && ret == 0) || (forced_eviction && ret == EBUSY)) && int __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing) { - int evict; - /* * If doing normal system eviction, but only in the service of reducing * the number of dirty pages, leave the clean page in cache. */ - if (!closing) { - __wt_cache_status(session, &evict, NULL); - if (!evict) - return (EBUSY); - } + if (!closing && __wt_eviction_dirty_target(session)) + return (EBUSY); /* * Discard the page and update the reference structure; if the page has @@ -184,7 +179,6 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing) WT_ADDR *addr; WT_PAGE *parent; WT_PAGE_MODIFY *mod; - int evict; parent = ref->home; mod = ref->page->modify; @@ -229,11 +223,8 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing) * push it out of cache (and read it back in, when needed), we * would rather have more, smaller pages than fewer large pages. */ - if (!closing) { - __wt_cache_status(session, &evict, NULL); - if (!evict) - return (EBUSY); - } + if (!closing && __wt_eviction_dirty_target(session)) + return (EBUSY); /* Discard the parent's address. 
*/ if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) { @@ -309,8 +300,7 @@ __evict_review( { WT_DECL_RET; WT_PAGE *page; - WT_PAGE_MODIFY *mod; - uint32_t reconcile_flags; + uint32_t flags; /* * Get exclusive access to the page if our caller doesn't have the tree @@ -331,7 +321,6 @@ __evict_review( /* Now that we have exclusive access, review the page. */ page = ref->page; - mod = page->modify; /* * Fail if an internal has active children, the children must be evicted @@ -347,6 +336,13 @@ __evict_review( /* Check if the page can be evicted. */ if (!closing) { + /* + * Update the oldest ID to avoid wasted effort should it have + * fallen behind current. + */ + if (__wt_page_is_modified(page)) + __wt_txn_update_oldest(session, 1); + if (!__wt_page_can_evict(session, page, 0, inmem_splitp)) return (EBUSY); @@ -361,9 +357,12 @@ __evict_review( return (__wt_split_insert(session, ref)); } + /* If the page is clean, we're done and we can evict. */ + if (!__wt_page_is_modified(page)) + return (0); + /* - * If the page is dirty and can possibly change state, reconcile it to - * determine the final state. + * If the page is dirty, reconcile it to decide if we can evict it. * * If we have an exclusive lock (we're discarding the tree), assert * there are no updates we cannot read. @@ -377,30 +376,38 @@ __evict_review( * in-memory pages, (restoring the updates that stopped us from writing * the block), and inserting the whole mess into the page's parent. * - * Don't set the update-restore flag for internal pages, they don't have - * updates that can be saved and restored. + * Otherwise, if eviction is getting pressed, configure reconciliation + * to write not-yet-globally-visible updates to the lookaside table, + * allowing the eviction of pages we'd otherwise have to retain in cache + * to support older readers. + * + * Don't set the update-restore or lookaside table flags for internal + * pages, they don't have update lists that can be saved and restored. */ - reconcile_flags = WT_EVICTING; - if (__wt_page_is_modified(page)) { - if (closing) - FLD_SET(reconcile_flags, WT_SKIP_UPDATE_ERR); - else if (!WT_PAGE_IS_INTERNAL(page) && - page->read_gen == WT_READGEN_OLDEST) - FLD_SET(reconcile_flags, WT_SKIP_UPDATE_RESTORE); - WT_RET(__wt_reconcile(session, ref, NULL, reconcile_flags)); - WT_ASSERT(session, - !__wt_page_is_modified(page) || - FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE)); + flags = WT_EVICTING; + if (closing) + LF_SET(WT_VISIBILITY_ERR); + else if (!WT_PAGE_IS_INTERNAL(page)) { + if (page->read_gen == WT_READGEN_OLDEST) + LF_SET(WT_EVICT_UPDATE_RESTORE); + else if (__wt_eviction_aggressive(session)) + LF_SET(WT_EVICT_LOOKASIDE); } + WT_RET(__wt_reconcile(session, ref, NULL, flags)); + /* - * If the page was ever modified, make sure all of the updates - * on the page are old enough they can be discarded from cache. + * Success: assert the page is clean or reconciliation was configured + * for an update/restore split, and if the page is clean, reconciliation + * was configured for a lookaside table or all updates on the page are + * globally visible. 
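
The flag selection described above reduces to a small decision table: closing demands every update be visible, forced eviction of a leaf page allows update/restore splits, and an aggressive (stuck) cache enables lookaside writes. As a pure-function sketch with placeholder flag values, not WiredTiger's:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define EVICTING       0x01u    /* placeholder flag values */
    #define VISIBILITY_ERR 0x02u
    #define UPDATE_RESTORE 0x04u
    #define LOOKASIDE      0x08u

    static uint32_t
    evict_rec_flags(bool closing, bool internal_page,
        bool read_gen_oldest, bool aggressive)
    {
        uint32_t flags = EVICTING;

        if (closing)
            flags |= VISIBILITY_ERR;    /* unseen updates are fatal */
        else if (!internal_page) {      /* leaf pages only */
            if (read_gen_oldest)
                flags |= UPDATE_RESTORE;    /* forced eviction */
            else if (aggressive)
                flags |= LOOKASIDE;     /* spill updates to lookaside */
        }
        return (flags);
    }

    int
    main(void)
    {
        /* Stuck cache, ordinary leaf page: evicting + lookaside. */
        printf("0x%x\n", evict_rec_flags(false, false, false, true));
        return (0);
    }
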
 */
-	if (!closing && mod != NULL &&
-	    !__wt_txn_visible_all(session, mod->rec_max_txn) &&
-	    !FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE))
-		return (EBUSY);
+	WT_ASSERT(session,
+	    LF_ISSET(WT_EVICT_UPDATE_RESTORE) || !__wt_page_is_modified(page));
+	WT_ASSERT(session,
+	    LF_ISSET(WT_EVICT_LOOKASIDE) ||
+	    __wt_page_is_modified(page) ||
+	    __wt_txn_visible_all(session, page->modify->rec_max_txn));
 
 	return (0);
 }
diff --git a/src/third_party/wiredtiger/src/include/bitstring.i b/src/third_party/wiredtiger/src/include/bitstring.i
index c548c12761d..5449ffe6209 100644
--- a/src/third_party/wiredtiger/src/include/bitstring.i
+++ b/src/third_party/wiredtiger/src/include/bitstring.i
@@ -84,10 +84,10 @@ __bit_alloc(WT_SESSION_IMPL *session, uint64_t nbits, void *retp)
  * __bit_test --
  *	Test one bit in name.
  */
-static inline int
+static inline bool
 __bit_test(uint8_t *bitf, uint64_t bit)
 {
-	return (bitf[__bit_byte(bit)] & __bit_mask(bit) ? 1 : 0);
+	return ((bitf[__bit_byte(bit)] & __bit_mask(bit)) != 0);
 }
 
 /*
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index 4aa2b1c7a7d..f214ddb1dc3 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -6,6 +6,8 @@
  * See the file LICENSE for redistribution information.
  */
 
+#define	WT_RECNO_OOB	0	/* Illegal record number */
+
 /*
  * WT_PAGE_HEADER --
  *	Blocks have a common header, a WT_PAGE_HEADER structure followed by a
@@ -43,6 +45,7 @@ struct __wt_page_header {
 #define	WT_PAGE_EMPTY_V_ALL	0x02	/* Page has all zero-length values */
 #define	WT_PAGE_EMPTY_V_NONE	0x04	/* Page has no zero-length values */
 #define	WT_PAGE_ENCRYPTED	0x08	/* Page is encrypted on disk */
+#define	WT_PAGE_LAS_UPDATE	0x10	/* Page updates in lookaside store */
 	uint8_t flags;			/* 25: flags */
 
 	/*
@@ -168,6 +171,29 @@ struct __wt_ovfl_txnc {
 };
 
 /*
+ * Lookaside table support: when a page is being reconciled for eviction and has
+ * updates that might be required by earlier readers in the system, the updates
+ * are written into a lookaside table, and restored as necessary if the page is
+ * read. The key is a unique marker for the page (a file ID plus an address),
+ * a counter (used to ensure the update records remain in the original order),
+ * the on-page item's transaction ID (so we can discard any update records from
+ * the lookaside table once the on-page item's transaction is globally visible),
+ * and the page key (byte-string for row-store, record number for column-store).
+ * The value is the WT_UPDATE structure's transaction ID, update size and value.
+ *
+ * As the key for the lookaside table is different for row- and column-store, we
+ * store both key types in a WT_ITEM, building/parsing them in the code, because
+ * otherwise we'd need two lookaside files with different key formats. We could
+ * make the lookaside table's key standard by moving the source key into the
+ * lookaside table value, but that doesn't make the coding any simpler, and it
+ * makes the lookaside table's value more likely to overflow the page size when
+ * the row-store key is relatively large.
+ */
+#define	WT_LAS_FORMAT							\
+    "key_format=" WT_UNCHECKED_STRING(IuQQu)				\
+    ",value_format=" WT_UNCHECKED_STRING(QIu)
+
+/*
  * WT_PAGE_MODIFY --
  *	When a page is modified, there's additional information to maintain.
  */
@@ -238,15 +264,17 @@ struct __wt_page_modify {
 	 * Eviction, but block wasn't written: unresolved updates and
 	 * associated disk image.
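
Mapping the WT_LAS_FORMAT packing above ("IuQQu" key, "QIu" value) onto named fields makes the column order easier to see. The structs below are only a model of what the packed columns mean, not actual WiredTiger types:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct las_key {            /* key_format=IuQQu */
        uint32_t btree_id;      /* I: file ID, page marker part 1 */
        const void *addr;       /* u: page address cookie, part 2 */
        size_t addr_size;
        uint64_t counter;       /* Q: keeps updates in original order */
        uint64_t onpage_txn;    /* Q: on-page txn ID, used by sweep */
        const void *key;        /* u: row key or packed record number */
        size_t key_size;
    };

    struct las_value {          /* value_format=QIu */
        uint64_t txnid;         /* Q: the update's transaction ID */
        uint32_t upd_size;      /* I: update size */
        const void *upd;        /* u: update bytes */
    };

    int
    main(void)
    {
        printf("%zu %zu\n",
            sizeof(struct las_key), sizeof(struct las_value));
        return (0);
    }
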
* - * Skipped updates are either a WT_INSERT, or a row-store leaf - * page entry. + * Saved updates are either a WT_INSERT, or a row-store leaf + * page entry; in the case of creating lookaside records, there + * is an additional value, the committed item's transaction ID. */ - struct __wt_upd_skipped { + struct __wt_save_upd { WT_INSERT *ins; WT_ROW *rip; - } *skip; - uint32_t skip_entries; - void *skip_dsk; + uint64_t onpage_txn; + } *supd; + uint32_t supd_entries; + void *supd_dsk; /* * Block was written: address, size and checksum. @@ -556,9 +584,8 @@ struct __wt_page { #define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */ #define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ -#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */ +#define WT_PAGE_RECONCILIATION 0x10 /* Page reconciliation lock */ #define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ -#define WT_PAGE_SPLIT_LOCKED 0x40 /* An internal page is growing */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ /* @@ -869,8 +896,9 @@ WT_PACKED_STRUCT_BEGIN(__wt_update) * store 4GB objects; I'd rather do that than increase the size of this * structure for a flag bit. */ -#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == UINT32_MAX) -#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = UINT32_MAX) +#define WT_UPDATE_DELETED_VALUE UINT32_MAX +#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = WT_UPDATE_DELETED_VALUE) +#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == WT_UPDATE_DELETED_VALUE) uint32_t size; /* update length */ /* The untyped value immediately follows the WT_UPDATE structure. */ diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h index deecd8f6d88..98ce4c22c10 100644 --- a/src/third_party/wiredtiger/src/include/btree.h +++ b/src/third_party/wiredtiger/src/include/btree.h @@ -146,12 +146,14 @@ struct __wt_btree { /* Flags values up to 0xff are reserved for WT_DHANDLE_* */ #define WT_BTREE_BULK 0x00100 /* Bulk-load handle */ #define WT_BTREE_IN_MEMORY 0x00200 /* Cache-resident object */ -#define WT_BTREE_NO_EVICTION 0x00400 /* Disable eviction */ -#define WT_BTREE_NO_LOGGING 0x00800 /* Disable logging */ -#define WT_BTREE_SALVAGE 0x01000 /* Handle is for salvage */ -#define WT_BTREE_SKIP_CKPT 0x02000 /* Handle skipped checkpoint */ -#define WT_BTREE_UPGRADE 0x04000 /* Handle is for upgrade */ -#define WT_BTREE_VERIFY 0x08000 /* Handle is for verify */ +#define WT_BTREE_LOOKASIDE 0x00400 /* Look-aside table */ +#define WT_BTREE_NO_CHECKPOINT 0x00800 /* Disable checkpoints */ +#define WT_BTREE_NO_EVICTION 0x01000 /* Disable eviction */ +#define WT_BTREE_NO_LOGGING 0x02000 /* Disable logging */ +#define WT_BTREE_SALVAGE 0x04000 /* Handle is for salvage */ +#define WT_BTREE_SKIP_CKPT 0x08000 /* Handle skipped checkpoint */ +#define WT_BTREE_UPGRADE 0x10000 /* Handle is for upgrade */ +#define WT_BTREE_VERIFY 0x20000 /* Handle is for verify */ uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 058a00d5a78..b54cecb6ce0 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -10,17 +10,17 @@ * __wt_ref_is_root -- * Return if the page reference is for the root page. */ -static inline int +static inline bool __wt_ref_is_root(WT_REF *ref) { - return (ref->home == NULL ? 
1 : 0); + return (ref->home == NULL); } /* * __wt_page_is_empty -- * Return if the page is empty. */ -static inline int +static inline bool __wt_page_is_empty(WT_PAGE *page) { return (page->modify != NULL && @@ -31,10 +31,10 @@ __wt_page_is_empty(WT_PAGE *page) * __wt_page_is_modified -- * Return if the page is dirty. */ -static inline int +static inline bool __wt_page_is_modified(WT_PAGE *page) { - return (page->modify != NULL && page->modify->write_gen != 0 ? 1 : 0); + return (page->modify != NULL && page->modify->write_gen != 0); } /* @@ -84,6 +84,9 @@ __wt_cache_decr_check_size( __wt_errx(session, "%s underflow: decrementing %" WT_SIZET_FMT, fld, v); first = 0; } +#else + WT_UNUSED(fld); + WT_UNUSED(session); #endif } @@ -109,6 +112,9 @@ __wt_cache_decr_check_uint64( __wt_errx(session, "%s underflow: decrementing %" WT_SIZET_FMT, fld, v); first = 0; } +#else + WT_UNUSED(fld); + WT_UNUSED(session); #endif } @@ -352,9 +358,13 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) * have committed in the meantime, and the last_running field * been updated past it. That is all very unlikely, but not * impossible, so we take care to read the global state before - * the atomic increment. If we raced with reconciliation, just - * leave the previous value here: at worst, we will write a - * page in a checkpoint when not absolutely necessary. + * the atomic increment. + * + * If the page was dirty on entry, then last_running == 0. The + * page could have become clean since then, if reconciliation + * completed. In that case, we leave the previous value for + * first_dirty_txn rather than potentially racing to update it, + * at worst, we'll unnecessarily write a page in a checkpoint. */ if (last_running != 0) page->modify->first_dirty_txn = last_running; @@ -366,6 +376,25 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) } /* + * __wt_page_modify_clear -- + * Clean a modified page. + */ +static inline void +__wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + /* + * The page must be held exclusive when this call is made, this call + * can only be used when the page is owned by a single thread. + * + * Allow the call to be made on clean pages. + */ + if (__wt_page_is_modified(page)) { + page->modify->write_gen = 0; + __wt_cache_dirty_decr(session, page); + } +} + +/* * __wt_page_modify_set -- * Mark the page and tree dirty. */ @@ -385,6 +414,9 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) * shouldn't cause problems; regardless, let's play it safe.) */ if (S2BT(session)->modified == 0) { + /* Assert we never dirty a checkpoint handle. */ + WT_ASSERT(session, session->dhandle->checkpoint == NULL); + S2BT(session)->modified = 1; WT_FULL_BARRIER(); } @@ -426,7 +458,7 @@ __wt_page_parent_modify_set( * __wt_off_page -- * Return if a pointer references off-page data. */ -static inline int +static inline bool __wt_off_page(WT_PAGE *page, const void *p) { /* @@ -527,7 +559,12 @@ __wt_ref_key_instantiated(WT_REF *ref) static inline void __wt_ref_key_clear(WT_REF *ref) { - /* The key union has 2 fields, both of which are 8B. */ + /* + * The key union has 2 8B fields; this is equivalent to: + * + * ref->key.recno = WT_RECNO_OOB; + * ref->key.ikey = NULL; + */ ref->key.recno = 0; } @@ -537,7 +574,7 @@ __wt_ref_key_clear(WT_REF *ref) * had without unpacking a cell, and information about the cell, if the key * isn't cheaply available. 
*/ -static inline int +static inline bool __wt_row_leaf_key_info(WT_PAGE *page, void *copy, WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep) { @@ -628,7 +665,7 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy, if (cellp != NULL) *cellp = WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v)); - return (0); + return (false); case WT_K_FLAG: /* Encoded key: no instantiated key, no cell. */ if (cellp != NULL) @@ -639,9 +676,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy, *(void **)datap = WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v)); *sizep = WT_K_DECODE_KEY_LEN(v); - return (1); + return (true); } - return (0); + return (false); case WT_KV_FLAG: /* Encoded key/value pair: no instantiated key, no cell. */ if (cellp != NULL) @@ -652,9 +689,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy, *(void **)datap = WT_PAGE_REF_OFFSET( page, WT_KV_DECODE_KEY_OFFSET(v)); *sizep = WT_KV_DECODE_KEY_LEN(v); - return (1); + return (true); } - return (0); + return (false); } @@ -667,9 +704,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy, if (datap != NULL) { *(void **)datap = WT_IKEY_DATA(ikey); *sizep = ikey->size; - return (1); + return (true); } - return (0); + return (false); } /* @@ -857,7 +894,7 @@ __wt_row_leaf_value_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack) * __wt_row_leaf_value -- * Return the value for a row-store leaf page encoded key/value pair. */ -static inline int +static inline bool __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value) { uintptr_t v; @@ -873,9 +910,9 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value) value->data = WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v)); value->size = WT_KV_DECODE_VALUE_LEN(v); - return (1); + return (true); } - return (0); + return (false); } /* @@ -934,11 +971,13 @@ __wt_ref_info(WT_SESSION_IMPL *session, * __wt_page_can_split -- * Check whether a page can be split in memory. */ -static inline int +static inline bool __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; WT_INSERT_HEAD *ins_head; + WT_INSERT *ins; + int i; btree = S2BT(session); @@ -947,58 +986,54 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) * of the page could continually split without benefit. */ if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) - return (0); + return (false); /* * Check for pages with append-only workloads. A common application * pattern is to have multiple threads frantically appending to the * tree. We want to reconcile and evict this page, but we'd like to - * do it without making the appending threads wait. If we're not - * discarding the tree, check and see if it's worth doing a split to - * let the threads continue before doing eviction. - * - * Ignore anything other than large, dirty row-store leaf pages. + * do it without making the appending threads wait. See if it's worth + * doing a split to let the threads continue before doing eviction. * - * XXX KEITH - * Need a better test for append-only workloads. + * Ignore anything other than large, dirty row-store leaf pages. The + * split code only supports row-store pages, and we depend on the page + * being dirty for correctness (the page must be reconciled again + * before being evicted after the split, information from a previous + * reconciliation will be wrong, so we can't evict immediately). 
*/ if (page->type != WT_PAGE_ROW_LEAF || page->memory_footprint < btree->maxmempage || !__wt_page_is_modified(page)) - return (0); - - /* Don't split a page that is pending a multi-block split. */ - if (F_ISSET(page->modify, WT_PM_REC_MULTIBLOCK)) - return (0); + return (false); /* * There is no point splitting if the list is small, no deep items is - * our heuristic for that. (A 1/4 probability of adding a new skiplist - * level means there will be a new 6th level for roughly each 4KB of - * entries in the list. If we have at least two 6th level entries, the - * list is at least large enough to work with.) - * - * The following code requires at least two items on the insert list, - * this test serves the additional purpose of confirming that. + * our heuristic for that. A 1/4 probability of adding a new skiplist + * level, with level-0 always created, means there will be a 5th level + * entry for roughly every 1024 entries in the list. If there are at + * least 4 5th level entries (4K items), the list is large enough. */ -#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(6, WT_SKIP_MAXDEPTH - 1) +#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(5, WT_SKIP_MAXDEPTH - 1) ins_head = page->pg_row_entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); - if (ins_head == NULL || - ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == NULL || - ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == - ins_head->tail[WT_MIN_SPLIT_SKIPLIST_DEPTH]) - return (0); - - return (1); + if (ins_head == NULL) + return (false); + for (i = 0, ins = ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH]; + ins != NULL; ins = ins->next[WT_MIN_SPLIT_SKIPLIST_DEPTH]) + if (++i == 4) { + WT_STAT_FAST_CONN_INCR(session, cache_inmem_splittable); + WT_STAT_FAST_DATA_INCR(session, cache_inmem_splittable); + return (true); + } + return (false); } /* * __wt_page_can_evict -- * Check whether a page can be evicted. */ -static inline int +static inline bool __wt_page_can_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int check_splits, int *inmem_splitp) { @@ -1011,11 +1046,22 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, btree = S2BT(session); mod = page->modify; - txn_global = &S2C(session)->txn_global; /* Pages that have never been modified can always be evicted. */ if (mod == NULL) - return (1); + return (true); + + /* + * Check for in-memory splits before other eviction tests. If the page + * should split in-memory, return success immediately and skip more + * detailed eviction tests. We don't need further tests since the page + * won't be written or discarded from the cache. + */ + if (__wt_page_can_split(session, page)) { + if (inmem_splitp != NULL) + *inmem_splitp = 1; + return (true); + } /* * If the tree was deepened, there's a requirement that newly created @@ -1028,20 +1074,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, */ if (check_splits && WT_PAGE_IS_INTERNAL(page) && !__wt_txn_visible_all(session, mod->mod_split_txn)) - return (0); - - /* - * Allow for the splitting of pages when a checkpoint is underway only - * if the allow_splits flag has been passed, we know we are performing - * a checkpoint, the page is larger than the stated maximum and there - * has not already been a split on this page as the WT_PM_REC_MULTIBLOCK - * flag is unset. 
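The revised heuristic's numbers can be checked directly: with level 0 always created and each further skiplist level added with probability 1/4, an entry reaches level 5 with probability 4^-5 = 1/1024, so the four level-5 entries the loop above looks for correspond to roughly 4096 items. A throwaway C check of that arithmetic (not WiredTiger code):

#include <stdio.h>

int
main(void)
{
	double p = 1.0;
	int level;

	for (level = 1; level <= 5; ++level)
		p *= 0.25;		/* 1/4 chance of adding each level */

	printf("P(entry reaches level 5) = 1/%.0f\n", 1.0 / p);		/* 1/1024 */
	printf("items needed for ~4 such entries = %.0f\n", 4.0 / p);	/* ~4096 */
	return (0);
}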
- */ - if (__wt_page_can_split(session, page)) { - if (inmem_splitp != NULL) - *inmem_splitp = 1; - return (1); - } + return (false); /* * If the file is being checkpointed, we can't evict dirty pages: @@ -1049,25 +1082,27 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, * previous version might be referenced by an internal page already * been written in the checkpoint, leaving the checkpoint inconsistent. */ - if (btree->checkpointing && - (__wt_page_is_modified(page) || - F_ISSET(mod, WT_PM_REC_MULTIBLOCK))) { + if (btree->checkpointing && __wt_page_is_modified(page)) { WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint); WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint); - return (0); + return (false); } /* - * If the page was recently split in-memory, don't force it out: we - * hope an eviction thread will find it first. The check here is - * similar to __wt_txn_visible_all, but ignores the checkpoint's - * transaction. + * If the page was recently split in-memory, don't evict it immediately: + * we want to give application threads that are appending a chance to + * move to the new leaf page created by the split. + * + * Note the check here is similar to __wt_txn_visible_all, but ignores + * the checkpoint's transaction. */ - if (check_splits && - WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn)) - return (0); + if (check_splits) { + txn_global = &S2C(session)->txn_global; + if (WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn)) + return (false); + } - return (1); + return (true); } /* @@ -1100,7 +1135,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) (void)__wt_atomic_addv32(&btree->evict_busy, 1); too_big = (page->memory_footprint > btree->maxmempage) ? 1 : 0; - if ((ret = __wt_evict_page(session, ref)) == 0) { + if ((ret = __wt_evict(session, ref, 0)) == 0) { if (too_big) WT_STAT_FAST_CONN_INCR(session, cache_eviction_force); else @@ -1151,12 +1186,13 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) * memory_page_max setting, when we see many deleted items, and when we * are attempting to scan without trashing the cache. * - * Fast checks if eviction is disabled for this operation or this tree, - * then perform a general check if eviction will be possible. + * Fast checks if eviction is disabled for this handle, operation or + * tree, then perform a general check if eviction will be possible. */ page = ref->page; if (page->read_gen != WT_READGEN_OLDEST || LF_ISSET(WT_READ_NO_EVICT) || + F_ISSET(session, WT_SESSION_NO_EVICTION) || F_ISSET(btree, WT_BTREE_NO_EVICTION) || !__wt_page_can_evict(session, page, 1, NULL)) return (__wt_hazard_clear(session, page)); @@ -1272,13 +1308,13 @@ __wt_skip_choose_depth(WT_SESSION_IMPL *session) } /* - * __wt_btree_lsm_size -- + * __wt_btree_lsm_over_size -- * Return if the size of an in-memory tree with a single leaf page is over * a specified maximum. If called on anything other than a simple tree with a * single leaf page, returns true so our LSM caller will switch to a new tree. */ -static inline int -__wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize) +static inline bool +__wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize) { WT_BTREE *btree; WT_PAGE *child, *root; @@ -1290,20 +1326,20 @@ __wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize) /* Check for a non-existent tree. */ if (root == NULL) - return (0); + return (false); /* A tree that can be evicted always requires a switch. 
*/ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) - return (1); + return (true); /* Check for a tree with a single leaf page. */ WT_INTL_INDEX_GET(session, root, pindex); if (pindex->entries != 1) /* > 1 child page, switch */ - return (1); + return (true); first = pindex->index[0]; if (first->state != WT_REF_MEM) /* no child page, ignore */ - return (0); + return (false); /* * We're reaching down into the page without a hazard pointer, but @@ -1312,7 +1348,7 @@ __wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize) */ child = first->page; if (child->type != WT_PAGE_ROW_LEAF) /* not a single leaf page */ - return (1); + return (true); return (child->memory_footprint > maxsize); } diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index ed93f82538c..f98483a215f 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -18,11 +18,6 @@ #define WT_EVICT_WALK_BASE 300 /* Pages tracked across file visits */ #define WT_EVICT_WALK_INCR 100 /* Pages added each walk */ -#define WT_EVICT_PASS_AGGRESSIVE 0x01 -#define WT_EVICT_PASS_ALL 0x02 -#define WT_EVICT_PASS_DIRTY 0x04 -#define WT_EVICT_PASS_WOULD_BLOCK 0x08 - /* * WT_EVICT_ENTRY -- * Encapsulation of an eviction candidate. @@ -109,6 +104,7 @@ struct __wt_cache { * Cache pool information. */ uint64_t cp_pass_pressure; /* Calculated pressure from this pass */ + uint64_t cp_quota; /* Maximum size for this cache */ uint64_t cp_reserved; /* Base size for this cache */ WT_SESSION_IMPL *cp_session; /* May be used for cache management */ uint32_t cp_skip_count; /* Post change stabilization */ @@ -119,6 +115,15 @@ struct __wt_cache { uint64_t cp_saved_read; /* Read count at last review */ /* + * Work state. + */ +#define WT_EVICT_PASS_AGGRESSIVE 0x01 +#define WT_EVICT_PASS_ALL 0x02 +#define WT_EVICT_PASS_DIRTY 0x04 +#define WT_EVICT_PASS_WOULD_BLOCK 0x08 + uint32_t state; + + /* * Flags. */ #define WT_CACHE_POOL_MANAGER 0x01 /* The active cache pool manager */ @@ -140,6 +145,7 @@ struct __wt_cache_pool { const char *name; uint64_t size; uint64_t chunk; + uint64_t quota; uint64_t currently_used; uint32_t refs; /* Reference count for structure. */ /* Locked: List of connections participating in the cache pool. */ diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i index 87f8c5543d1..bc33f82d927 100644 --- a/src/third_party/wiredtiger/src/include/cache.i +++ b/src/third_party/wiredtiger/src/include/cache.i @@ -104,48 +104,6 @@ __wt_cache_dirty_inuse(WT_CACHE *cache) } /* - * __wt_cache_status -- - * Return if the cache usage exceeds the eviction or dirty targets. - */ -static inline void -__wt_cache_status(WT_SESSION_IMPL *session, int *evictp, int *dirtyp) -{ - WT_CONNECTION_IMPL *conn; - WT_CACHE *cache; - uint64_t bytes_inuse, bytes_max, dirty_inuse; - - conn = S2C(session); - cache = conn->cache; - - /* - * There's an assumption "evict" overrides "dirty", that is, if eviction - * is required, we no longer care where we are with respect to the dirty - * target. - * - * Avoid division by zero if the cache size has not yet been set in a - * shared cache. 
- */ - bytes_max = conn->cache_size + 1; - if (evictp != NULL) { - bytes_inuse = __wt_cache_bytes_inuse(cache); - if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) { - *evictp = 1; - return; - } - *evictp = 0; - } - if (dirtyp != NULL) { - dirty_inuse = __wt_cache_dirty_inuse(cache); - if (dirty_inuse > - (cache->eviction_dirty_target * bytes_max) / 100) { - *dirtyp = 1; - return; - } - *dirtyp = 0; - } -} - -/* * __wt_session_can_wait -- * Return if a session available for a potentially slow operation. */ @@ -161,29 +119,52 @@ __wt_session_can_wait(WT_SESSION_IMPL *session) return (0); /* - * LSM sets the no-cache-check flag when holding the LSM tree lock, + * LSM sets the no-eviction flag when holding the LSM tree lock, * in that case, or when holding the schema lock, we don't want to * highjack the thread for eviction. */ if (F_ISSET(session, - WT_SESSION_NO_CACHE_CHECK | WT_SESSION_LOCKED_SCHEMA)) + WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA)) return (0); return (1); } /* + * __wt_eviction_aggressive -- + * Return if the eviction server is running in aggressive mode. + */ +static inline int +__wt_eviction_aggressive(WT_SESSION_IMPL *session) +{ + return (FLD_ISSET( + S2C(session)->cache->state, WT_EVICT_PASS_AGGRESSIVE) ? 1 : 0); +} + +/* + * __wt_eviction_dirty_target -- + * Return if the eviction server is running to reduce the number of dirty + * pages (versus running to discard pages from the cache). + */ +static inline int +__wt_eviction_dirty_target(WT_SESSION_IMPL *session) +{ + return (FLD_ISSET( + S2C(session)->cache->state, WT_EVICT_PASS_DIRTY) ? 1 : 0); +} + +/* * __wt_eviction_needed -- * Return if an application thread should do eviction, and the cache full * percentage as a side-effect. */ -static inline int -__wt_eviction_needed(WT_SESSION_IMPL *session, int *pct_fullp) +static inline bool +__wt_eviction_needed(WT_SESSION_IMPL *session, u_int *pct_fullp) { WT_CONNECTION_IMPL *conn; WT_CACHE *cache; uint64_t bytes_inuse, bytes_max; - int pct_full; + u_int pct_full; conn = S2C(session); cache = conn->cache; @@ -196,25 +177,20 @@ __wt_eviction_needed(WT_SESSION_IMPL *session, int *pct_fullp) bytes_max = conn->cache_size + 1; /* - * Return the cache full percentage; anything over 95% means we involve - * the application thread. + * Calculate the cache full percentage; anything over the trigger means + * we involve the application thread. */ - pct_full = (int)((100 * bytes_inuse) / bytes_max); + pct_full = (u_int)((100 * bytes_inuse) / bytes_max); if (pct_fullp != NULL) *pct_fullp = pct_full; - if (pct_full >= 95) - return (1); + if (pct_full > cache->eviction_trigger) + return (true); - /* - * Return if we're over the trigger cache size or there are too many - * dirty pages. - */ - if (bytes_inuse > (cache->eviction_trigger * bytes_max) / 100) - return (1); + /* Return if there are too many dirty bytes in cache. */ if (__wt_cache_dirty_inuse(cache) > (cache->eviction_dirty_trigger * bytes_max) / 100) - return (1); - return (0); + return (true); + return (false); } /* @@ -225,7 +201,7 @@ static inline int __wt_cache_eviction_check(WT_SESSION_IMPL *session, int busy, int *didworkp) { WT_BTREE *btree; - int pct_full; + u_int pct_full; if (didworkp != NULL) *didworkp = 0; @@ -235,7 +211,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, int busy, int *didworkp) * that case, or when holding the schema or handle list locks (which * block eviction), we don't want to highjack the thread for eviction. 
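Restated as a standalone sketch, the cache-full test in __wt_eviction_needed above computes a percentage of the configured cache and compares it against the eviction trigger; the +1 mirrors the diff's divide-by-zero guard for shared caches that have not yet set a size. Names follow the diff; the harness values are invented:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
eviction_needed(uint64_t bytes_inuse, uint64_t cache_size,
    unsigned eviction_trigger, unsigned *pct_fullp)
{
	uint64_t bytes_max = cache_size + 1;	/* avoid division by zero */
	unsigned pct_full = (unsigned)((100 * bytes_inuse) / bytes_max);

	if (pct_fullp != NULL)
		*pct_fullp = pct_full;
	return (pct_full > eviction_trigger);
}

int
main(void)
{
	unsigned pct;
	bool need = eviction_needed(1950000000, 2000000000, 95, &pct);

	printf("pct_full=%u, need=%d\n", pct, need);	/* pct_full=97, need=1 */
	return (0);
}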
*/ - if (F_ISSET(session, WT_SESSION_NO_CACHE_CHECK | + if (F_ISSET(session, WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA)) return (0); diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i index 20a4d214015..d7ecfd3bda4 100644 --- a/src/third_party/wiredtiger/src/include/cell.i +++ b/src/third_party/wiredtiger/src/include/cell.i @@ -182,7 +182,7 @@ __wt_cell_pack_addr(WT_CELL *cell, u_int cell_type, uint64_t recno, size_t size) p = cell->__chunk + 1; - if (recno == 0) + if (recno == WT_RECNO_OOB) cell->__chunk[0] = cell_type; /* Type */ else { cell->__chunk[0] = cell_type | WT_CELL_64V; diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index 64043035e76..d8ff261cd82 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -270,7 +270,9 @@ struct __wt_connection_impl { uint32_t hazard_max; /* Hazard array size */ WT_CACHE *cache; /* Page cache */ - uint64_t cache_size; /* Configured cache size */ + volatile uint64_t cache_size; /* Cache size (either statically + configured or the current size + within a cache pool). */ WT_TXN_GLOBAL txn_global; /* Global transaction state */ @@ -292,8 +294,6 @@ struct __wt_connection_impl { uint64_t ckpt_time_recent; /* Checkpoint time recent/total */ uint64_t ckpt_time_total; - int compact_in_memory_pass; /* Compaction serialization */ - #define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */ #define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */ #define WT_CONN_STAT_FAST 0x04 /* "fast" statistics configured */ @@ -370,6 +370,20 @@ struct __wt_connection_impl { time_t sweep_interval;/* Handle sweep interval */ u_int sweep_handles_min;/* Handle sweep minimum open */ + /* + * Shared lookaside lock, session and cursor, used by threads accessing + * the lookaside table (other than eviction server and worker threads + * and the sweep thread, all of which have their own lookaside cursors). + */ + WT_SPINLOCK las_lock; /* Lookaside table spinlock */ + WT_SESSION_IMPL *las_session; /* Lookaside table session */ + WT_CURSOR *las_cursor; /* Lookaside table cursor */ + bool las_written; /* Lookaside table has been written */ + + WT_ITEM las_sweep_key; /* Sweep server's saved key */ + int las_sweep_call;/* Sweep server's call count */ + uint64_t las_sweep_cnt; /* Sweep server's per-call row count */ + /* Locked: collator list */ TAILQ_HEAD(__wt_coll_qh, __wt_named_collator) collqh; diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index 2b3a3221004..2f55dfc8186 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -261,6 +261,7 @@ struct __wt_cursor_index { WT_CURSOR *child; WT_CURSOR **cg_cursors; + uint8_t *cg_needvalue; }; struct __wt_cursor_json { diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index 484af0b4a58..e7fed250251 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -32,7 +32,7 @@ __cursor_pos_clear(WT_CURSOR_BTREE *cbt) * and it's a minimal set of things we need to clear. It would be a * lot simpler to clear everything, but we call this function a lot. 
*/ - cbt->recno = 0; + cbt->recno = WT_RECNO_OOB; cbt->ins = NULL; cbt->ins_head = NULL; diff --git a/src/third_party/wiredtiger/src/include/error.h b/src/third_party/wiredtiger/src/include/error.h index fcb96b16361..abffc02945e 100644 --- a/src/third_party/wiredtiger/src/include/error.h +++ b/src/third_party/wiredtiger/src/include/error.h @@ -92,7 +92,8 @@ return (__wt_illegal_value(session, NULL)) #define WT_ILLEGAL_VALUE_ERR(session) \ default: \ - WT_ERR(__wt_illegal_value(session, NULL)) + ret = __wt_illegal_value(session, NULL); \ + goto err #define WT_ILLEGAL_VALUE_SET(session) \ default: \ ret = __wt_illegal_value(session, NULL); \ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index a7b02ec4a75..e5c5a72fe02 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -63,7 +63,7 @@ extern int __wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max); extern int __wt_block_ext_discard(WT_SESSION_IMPL *session, u_int max); extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block); extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block); -extern int __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size); +extern bool __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size); extern int __wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp); extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid); extern int __wt_block_verify_start(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, const char *cfg[]); @@ -101,8 +101,9 @@ extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt); extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp); extern int __wt_btcur_equals( WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp); extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop); +extern void __wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); extern void __wt_btcur_open(WT_CURSOR_BTREE *cbt); -extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt); +extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt, int lowlevel); extern int __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v); extern int __wt_debug_addr_print( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size); extern int __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, const char *ofile); @@ -115,12 +116,13 @@ extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char * extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile); extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp); extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref); -extern int __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref); +extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep); extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, int free_pages); extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_INDEX *pindex, 
int free_pages); +extern void __wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd); extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]); extern int __wt_btree_close(WT_SESSION_IMPL *session); extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno); @@ -138,15 +140,15 @@ extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *add extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store); extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack); extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell); +extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep); +extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep); +extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size); extern int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ); -extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep); -extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep); -extern int __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd); extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]); extern void __wt_split_stash_discard(WT_SESSION_IMPL *session); @@ -162,7 +164,7 @@ extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok); extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf); extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags); -extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove); +extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, int is_remove); extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt); extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key); @@ -179,6 +181,14 @@ extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert); extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); +extern void __wt_las_stats_update(WT_SESSION_IMPL *session); +extern int __wt_las_create(WT_SESSION_IMPL *session); +extern int __wt_las_destroy(WT_SESSION_IMPL *session); +extern void __wt_las_set_written(WT_SESSION_IMPL *session); +extern bool __wt_las_is_written(WT_SESSION_IMPL *session); +extern int __wt_las_cursor( 
WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags); +extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags); +extern int __wt_las_sweep(WT_SESSION_IMPL *session); extern int __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len); extern int __wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str); extern int __wt_config_subinit( WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item); @@ -237,7 +247,7 @@ extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session); extern int __wt_connection_init(WT_CONNECTION_IMPL *conn); extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn); extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]); -extern int __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield); +extern int __wt_log_wrlsn(WT_SESSION_IMPL *session); extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_logmgr_open(WT_SESSION_IMPL *session); extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session); @@ -308,14 +318,14 @@ extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_evict_server_wake(WT_SESSION_IMPL *session); extern int __wt_evict_create(WT_SESSION_IMPL *session); extern int __wt_evict_destroy(WT_SESSION_IMPL *session); -extern int __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, int *evict_resetp); extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session); -extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full); -extern void __wt_cache_dump(WT_SESSION_IMPL *session); +extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full); +extern int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile); extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int closing); extern int __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing); extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); +extern int __wt_log_ckpt_lsn(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); extern int __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn); extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn); extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, int *rec); @@ -323,12 +333,13 @@ extern void __wt_log_written_reset(WT_SESSION_IMPL *session); extern int __wt_log_get_all_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, int active_only); extern void __wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count); extern int __wt_log_extract_lognum( WT_SESSION_IMPL *session, const char *name, uint32_t *id); +extern int __wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot); extern int __wt_log_allocfile( WT_SESSION_IMPL *session, uint32_t lognum, const char *dest, int prealloc); extern int __wt_log_remove(WT_SESSION_IMPL *session, const char *file_prefix, uint32_t lognum); extern int __wt_log_open(WT_SESSION_IMPL *session); extern int __wt_log_close(WT_SESSION_IMPL *session); -extern int __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create, int *created); extern int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LSN 
*next_lsnp, void *cookie, int firstrecord), void *cookie); +extern int __wt_log_force_write(WT_SESSION_IMPL *session, int retry); extern int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags); extern int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap); extern int __wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp); @@ -354,14 +365,16 @@ extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logr extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep); extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); +extern int __wt_log_slot_close( WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *releasep, int forced); +extern int __wt_log_slot_switch_internal(WT_SESSION_IMPL *session, WT_MYSLOT *myslot); +extern int __wt_log_slot_switch(WT_SESSION_IMPL *session, WT_MYSLOT *myslot); +extern int __wt_log_slot_new(WT_SESSION_IMPL *session); extern int __wt_log_slot_init(WT_SESSION_IMPL *session); extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session); -extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslotp); -extern int __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); -extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); -extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); -extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size); -extern int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); +extern void __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot); +extern int64_t __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size); +extern void __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm); extern int __wt_clsm_await_switch(WT_CURSOR_LSM *clsm); extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks); @@ -474,7 +487,7 @@ extern int __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t siz extern int __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size); extern int __wt_munmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len, void **mappingcookie); extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, int is_signalled, WT_CONDVAR **condp); -extern int __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs); +extern int __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled); extern int __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond); extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp); extern int __wt_rwlock_alloc( WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name); @@ -488,7 +501,7 @@ extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp); extern int __wt_once(void (*init_routine)(void)); extern int __wt_open(WT_SESSION_IMPL *session, const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp); 
extern int __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp); -extern int __wt_absolute_path(const char *path); +extern bool __wt_absolute_path(const char *path); extern const char *__wt_path_separator(void); extern int __wt_has_priv(void); extern int __wt_remove(WT_SESSION_IMPL *session, const char *name); @@ -576,6 +589,8 @@ extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*f extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session, int free_buffers); extern int __wt_session_copy_values(WT_SESSION_IMPL *session); extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_session_create( WT_SESSION_IMPL *session, const char *uri, const char *config); +extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]); extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp); extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, int open_metadata, WT_SESSION_IMPL **sessionp); extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip); @@ -638,7 +653,7 @@ extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, cons extern uint32_t __wt_nlpo2_round(uint32_t v); extern uint32_t __wt_nlpo2(uint32_t v); extern uint32_t __wt_log2_int(uint32_t n); -extern int __wt_ispo2(uint32_t v); +extern bool __wt_ispo2(uint32_t v); extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2); extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state); extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h index 031be7e7c59..ca3c3c38245 100644 --- a/src/third_party/wiredtiger/src/include/flags.h +++ b/src/third_party/wiredtiger/src/include/flags.h @@ -18,6 +18,8 @@ #define WT_CONN_SERVER_SWEEP 0x00002000 #define WT_CONN_WAS_BACKUP 0x00004000 #define WT_EVICTING 0x00000001 +#define WT_EVICT_LOOKASIDE 0x00000002 +#define WT_EVICT_UPDATE_RESTORE 0x00000004 #define WT_FILE_TYPE_CHECKPOINT 0x00000001 #define WT_FILE_TYPE_DATA 0x00000002 #define WT_FILE_TYPE_DIRECTORY 0x00000004 @@ -46,17 +48,17 @@ #define WT_SESSION_LOCKED_CHECKPOINT 0x00000008 #define WT_SESSION_LOCKED_HANDLE_LIST 0x00000010 #define WT_SESSION_LOCKED_SCHEMA 0x00000020 -#define WT_SESSION_LOCKED_TABLE 0x00000040 -#define WT_SESSION_LOGGING_INMEM 0x00000080 -#define WT_SESSION_NO_CACHE 0x00000100 -#define WT_SESSION_NO_CACHE_CHECK 0x00000200 -#define WT_SESSION_NO_DATA_HANDLES 0x00000400 -#define WT_SESSION_NO_LOGGING 0x00000800 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00001000 -#define WT_SESSION_QUIET_CORRUPT_FILE 0x00002000 -#define WT_SESSION_SERVER_ASYNC 0x00004000 -#define WT_SKIP_UPDATE_ERR 0x00000002 -#define WT_SKIP_UPDATE_RESTORE 0x00000004 +#define WT_SESSION_LOCKED_SLOT 0x00000040 +#define WT_SESSION_LOCKED_TABLE 0x00000080 +#define WT_SESSION_LOGGING_INMEM 0x00000100 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00000200 +#define WT_SESSION_NO_CACHE 0x00000400 +#define WT_SESSION_NO_DATA_HANDLES 0x00000800 +#define WT_SESSION_NO_EVICTION 0x00001000 +#define WT_SESSION_NO_LOGGING 0x00002000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00004000 +#define WT_SESSION_QUIET_CORRUPT_FILE 0x00008000 +#define WT_SESSION_SERVER_ASYNC 0x00010000 #define WT_SYNC_CHECKPOINT 0x00000001 #define WT_SYNC_CLOSE 0x00000002 
#define WT_SYNC_DISCARD 0x00000004 @@ -90,6 +92,7 @@ #define WT_VERB_VERIFY 0x00200000 #define WT_VERB_VERSION 0x00400000 #define WT_VERB_WRITE 0x00800000 +#define WT_VISIBILITY_ERR 0x00000008 /* * flags section: END * DO NOT EDIT: automatically built by dist/flags.py. diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h index 3472985745e..01e33792d73 100644 --- a/src/third_party/wiredtiger/src/include/gcc.h +++ b/src/third_party/wiredtiger/src/include/gcc.h @@ -123,7 +123,7 @@ __wt_atomic_sub##name(type *vp, type v) \ { \ return (__sync_sub_and_fetch(vp, v)); \ } \ -static inline int \ +static inline bool \ __wt_atomic_cas##name(type *vp, type old, type new) \ { \ return (WT_ATOMIC_CAS(vp, old, new)); \ @@ -145,7 +145,7 @@ WT_ATOMIC_FUNC(size, size_t, size_t) * __wt_atomic_cas_ptr -- * Pointer compare and swap. */ -static inline int +static inline bool __wt_atomic_cas_ptr(void *vp, void *old, void *new) { return (WT_ATOMIC_CAS((void **)vp, old, new)); diff --git a/src/third_party/wiredtiger/src/include/hardware.h b/src/third_party/wiredtiger/src/include/hardware.h index c9b72f8a609..32353072c5b 100644 --- a/src/third_party/wiredtiger/src/include/hardware.h +++ b/src/third_party/wiredtiger/src/include/hardware.h @@ -50,6 +50,16 @@ &(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \ } while (0) +#define F_CAS_ATOMIC_WAIT(p, mask) do { \ + int __ret; \ + for (;;) { \ + F_CAS_ATOMIC(p, mask, __ret); \ + if (__ret == 0) \ + break; \ + __wt_yield(); \ + } \ +} while (0) + #define F_CLR_ATOMIC(p, mask) do { \ uint8_t __orig; \ do { \ diff --git a/src/third_party/wiredtiger/src/include/lint.h b/src/third_party/wiredtiger/src/include/lint.h index eba4a1c3b3f..f288fb98683 100644 --- a/src/third_party/wiredtiger/src/include/lint.h +++ b/src/third_party/wiredtiger/src/include/lint.h @@ -49,14 +49,14 @@ __wt_atomic_sub##name(type *vp, type v) \ *vp -= v; \ return (*vp); \ } \ -static inline int \ +static inline bool \ __wt_atomic_cas##name(type *vp, type old, type new) \ { \ if (*vp == old) { \ *vp = new; \ - return (1); \ + return (true); \ } \ - return (0); \ + return (false); \ } WT_ATOMIC_FUNC(8, uint8_t, uint8_t) @@ -75,13 +75,13 @@ WT_ATOMIC_FUNC(size, size_t, size_t) * __wt_atomic_cas_ptr -- * Pointer compare and swap. */ -static inline int +static inline bool __wt_atomic_cas_ptr(void *vp, void *old, void *new) { if (*(void **)vp == old) { *(void **)vp = new; - return (1); + return (true); } - return (0); + return (false); } static inline void WT_BARRIER(void) { return; } diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h index 949eb09ca30..06be95697c7 100644 --- a/src/third_party/wiredtiger/src/include/log.h +++ b/src/third_party/wiredtiger/src/include/log.h @@ -12,7 +12,6 @@ /* Logging subsystem declarations. */ #define WT_LOG_ALIGN 128 -#define WT_LOG_SLOT_BUF_SIZE 256 * 1024 #define WT_INIT_LSN(l) do { \ (l)->file = 1; \ @@ -48,63 +47,133 @@ ((size) - offsetof(WT_LOG_RECORD, record)) /* - * Compare 2 LSNs, return -1 if lsn0 < lsn1, 0 if lsn0 == lsn1 - * and 1 if lsn0 > lsn1. - */ -#define WT_LOG_CMP(lsn1, lsn2) \ - ((lsn1)->file != (lsn2)->file ? \ - ((lsn1)->file < (lsn2)->file ? -1 : 1) : \ - ((lsn1)->offset != (lsn2)->offset ? \ - ((lsn1)->offset < (lsn2)->offset ? -1 : 1) : 0)) - -/* * Possible values for the consolidation array slot states: - * (NOTE: Any new states must be > WT_LOG_SLOT_DONE and < WT_LOG_SLOT_READY.) 
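The F_CAS_ATOMIC_WAIT macro added to hardware.h above spins until a flag byte is acquired, yielding between attempts. A standalone sketch of that pattern, assuming the single-try fail-if-set semantics implied by F_CAS_ATOMIC's use in serial.i; GCC builtins and sched_yield stand in for the WiredTiger macros:

#include <sched.h>
#include <stdint.h>
#include <stdio.h>

/* Single attempt: fail if the bit is already set or the CAS races. */
static int
flag_cas(volatile uint8_t *flagsp, uint8_t mask)
{
	uint8_t orig = *flagsp;

	if (orig & mask)
		return (-1);
	return (__sync_bool_compare_and_swap(flagsp, orig, orig | mask) ?
	    0 : -1);
}

/* The _WAIT form: retry until the flag is acquired, yielding between tries. */
static void
flag_cas_wait(volatile uint8_t *flagsp, uint8_t mask)
{
	while (flag_cas(flagsp, mask) != 0)
		sched_yield();
}

int
main(void)
{
	volatile uint8_t flags = 0;

	flag_cas_wait(&flags, 0x10);		/* e.g. a page-lock bit */
	printf("flags=0x%02x\n", flags);	/* 0x10 */
	return (0);
}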
* - * < WT_LOG_SLOT_DONE - threads are actively writing to the log. - * WT_LOG_SLOT_DONE - all activity on this slot is complete. + * WT_LOG_SLOT_CLOSE - slot is in use but closed to new joins. * WT_LOG_SLOT_FREE - slot is available for allocation. - * WT_LOG_SLOT_PENDING - slot is transitioning from ready to active. * WT_LOG_SLOT_WRITTEN - slot is written and should be processed by worker. - * WT_LOG_SLOT_READY - slot is ready for threads to join. - * > WT_LOG_SLOT_READY - threads are actively consolidating on this slot. * * The slot state must be volatile: threads loop checking the state and can't * cache the first value they see. + * + * The slot state is divided into two 32 bit sizes. One half is the + * amount joined and the other is the amount released. Since we use + * a few special states, reserve the top few bits for state. That makes + * the maximum size less than 32 bits for both joined and released. + */ + +/* + * The high bit is reserved for the special states. If the high bit is + * set (WT_LOG_SLOT_RESERVED) then we are guaranteed to be in a special state. + */ +#define WT_LOG_SLOT_FREE -1 /* Not in use */ +#define WT_LOG_SLOT_WRITTEN -2 /* Slot data written, not processed */ + +/* + * We allocate the buffer size, but trigger a slot switch when we cross + * the maximum size of half the buffer. If a record is more than the buffer + * maximum then we trigger a slot switch and write that record unbuffered. + * We use a larger buffer to provide overflow space so that we can switch + * once we cross the threshold. + */ +#define WT_LOG_SLOT_BUF_SIZE (256 * 1024) /* Must be power of 2 */ +#define WT_LOG_SLOT_BUF_MAX ((uint32_t)log->slot_buf_size / 2) +#define WT_LOG_SLOT_UNBUFFERED (WT_LOG_SLOT_BUF_SIZE << 1) + +/* + * If new slot states are added, adjust WT_LOG_SLOT_BITS and + * WT_LOG_SLOT_MASK_OFF accordingly for how much of the top 32 + * bits we are using. More slot states here will reduce the maximum + * size that a slot can hold unbuffered by half. If a record is + * larger than the maximum we can account for in the slot state we fall + * back to direct writes. + */ +#define WT_LOG_SLOT_BITS 2 +#define WT_LOG_SLOT_MAXBITS (32 - WT_LOG_SLOT_BITS) +#define WT_LOG_SLOT_CLOSE 0x4000000000000000LL /* Force slot close */ +#define WT_LOG_SLOT_RESERVED 0x8000000000000000LL /* Reserved states */ + +/* + * Check if the unbuffered flag is set in the joined portion of + * the slot state. */ -#define WT_LOG_SLOT_DONE 0 -#define WT_LOG_SLOT_FREE 1 -#define WT_LOG_SLOT_PENDING 2 -#define WT_LOG_SLOT_WRITTEN 3 -#define WT_LOG_SLOT_READY 4 +#define WT_LOG_SLOT_UNBUFFERED_ISSET(state) \ + ((state) & ((int64_t)WT_LOG_SLOT_UNBUFFERED << 32)) + +#define WT_LOG_SLOT_MASK_OFF 0x3fffffffffffffffLL +#define WT_LOG_SLOT_MASK_ON ~(WT_LOG_SLOT_MASK_OFF) +#define WT_LOG_SLOT_JOIN_MASK (WT_LOG_SLOT_MASK_OFF >> 32) + +/* + * These macros manipulate the slot state and its component parts. 
+ */ +#define WT_LOG_SLOT_FLAGS(state) ((state) & WT_LOG_SLOT_MASK_ON) +#define WT_LOG_SLOT_JOINED(state) (((state) & WT_LOG_SLOT_MASK_OFF) >> 32) +#define WT_LOG_SLOT_JOINED_BUFFERED(state) \ + (WT_LOG_SLOT_JOINED(state) & \ + (WT_LOG_SLOT_UNBUFFERED - 1)) +#define WT_LOG_SLOT_JOIN_REL(j, r, s) (((j) << 32) + (r) + (s)) +#define WT_LOG_SLOT_RELEASED(state) ((int64_t)(int32_t)(state)) +#define WT_LOG_SLOT_RELEASED_BUFFERED(state) \ + ((int64_t)((int32_t)WT_LOG_SLOT_RELEASED(state) & \ + (WT_LOG_SLOT_UNBUFFERED - 1))) + +/* Slot is in use */ +#define WT_LOG_SLOT_ACTIVE(state) \ + (WT_LOG_SLOT_JOINED(state) != WT_LOG_SLOT_JOIN_MASK) +/* Slot is in use, but closed to new joins */ +#define WT_LOG_SLOT_CLOSED(state) \ + (WT_LOG_SLOT_ACTIVE(state) && \ + (FLD64_ISSET((uint64_t)state, WT_LOG_SLOT_CLOSE) && \ + !FLD64_ISSET((uint64_t)state, WT_LOG_SLOT_RESERVED))) +/* Slot is in use, all data copied into buffer */ +#define WT_LOG_SLOT_INPROGRESS(state) \ + (WT_LOG_SLOT_RELEASED(state) != WT_LOG_SLOT_JOINED(state)) +#define WT_LOG_SLOT_DONE(state) \ + (WT_LOG_SLOT_CLOSED(state) && \ + !WT_LOG_SLOT_INPROGRESS(state)) +/* Slot is in use, more threads may join this slot */ +#define WT_LOG_SLOT_OPEN(state) \ + (WT_LOG_SLOT_ACTIVE(state) && \ + !WT_LOG_SLOT_UNBUFFERED_ISSET(state) && \ + !FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_CLOSE) && \ + WT_LOG_SLOT_JOINED(state) < WT_LOG_SLOT_BUF_MAX) + struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_logslot { volatile int64_t slot_state; /* Slot state */ - uint64_t slot_group_size; /* Group size */ + int64_t slot_unbuffered; /* Unbuffered data in this slot */ int32_t slot_error; /* Error value */ -#define WT_SLOT_INVALID_INDEX 0xffffffff - uint32_t slot_index; /* Active slot index */ wt_off_t slot_start_offset; /* Starting file offset */ - WT_LSN slot_release_lsn; /* Slot release LSN */ - WT_LSN slot_start_lsn; /* Slot starting LSN */ - WT_LSN slot_end_lsn; /* Slot ending LSN */ + wt_off_t slot_last_offset; /* Last record offset */ + WT_LSN slot_release_lsn; /* Slot release LSN */ + WT_LSN slot_start_lsn; /* Slot starting LSN */ + WT_LSN slot_end_lsn; /* Slot ending LSN */ WT_FH *slot_fh; /* File handle for this group */ - WT_ITEM slot_buf; /* Buffer for grouped writes */ - int32_t slot_churn; /* Active slots are scarce. 
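To make the joined/released encoding concrete, here is a standalone demo using simplified copies of the macros above; the close/reserved flag bits and the unbuffered state are ignored, and the byte counts are invented:

#include <inttypes.h>
#include <stdio.h>

#define MASK_OFF	0x3fffffffffffffffLL
#define JOINED(state)	(((state) & MASK_OFF) >> 32)
#define RELEASED(state)	((int64_t)(int32_t)(state))
#define JOIN_REL(j, r)	(((int64_t)(j) << 32) + (r))

int
main(void)
{
	int64_t state = 0;

	state += JOIN_REL(100, 0);	/* writer A joins with a 100-byte record */
	state += JOIN_REL(200, 0);	/* writer B joins with a 200-byte record */
	state += JOIN_REL(0, 100);	/* writer A finishes copying its data */

	printf("joined=%" PRId64 " released=%" PRId64 " in-progress=%d\n",
	    JOINED(state), RELEASED(state),
	    RELEASED(state) != JOINED(state));
	/* prints: joined=300 released=100 in-progress=1 */
	return (0);
}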
*/ + WT_ITEM slot_buf; /* Buffer for grouped writes */ -#define WT_SLOT_BUFFERED 0x01 /* Buffer writes */ -#define WT_SLOT_CLOSEFH 0x02 /* Close old fh on release */ -#define WT_SLOT_SYNC 0x04 /* Needs sync on release */ -#define WT_SLOT_SYNC_DIR 0x08 /* Directory sync on release */ +#define WT_SLOT_CLOSEFH 0x01 /* Close old fh on release */ +#define WT_SLOT_SYNC 0x02 /* Needs sync on release */ +#define WT_SLOT_SYNC_DIR 0x04 /* Directory sync on release */ uint32_t flags; /* Flags */ }; -#define WT_SLOT_INIT_FLAGS (WT_SLOT_BUFFERED) +#define WT_SLOT_INIT_FLAGS 0 + +#define WT_WITH_SLOT_LOCK(session, log, op) do { \ + WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); \ + WT_WITH_LOCK(session, \ + &log->log_slot_lock, WT_SESSION_LOCKED_SLOT, op); \ +} while (0) struct __wt_myslot { - WT_LOGSLOT *slot; - wt_off_t offset; + WT_LOGSLOT *slot; /* Slot I'm using */ + wt_off_t end_offset; /* My end offset in buffer */ + wt_off_t offset; /* Slot buffer offset */ +#define WT_MYSLOT_CLOSE 0x01 /* This thread is closing the slot */ +#define WT_MYSLOT_UNBUFFERED 0x02 /* Write directly */ + uint32_t flags; /* Flags */ }; - /* Offset of first record */ + #define WT_LOG_FIRST_RECORD log->allocsize struct __wt_log { @@ -118,8 +187,9 @@ struct __wt_log { uint32_t tmp_fileid; /* Temporary file number */ uint32_t prep_missed; /* Pre-allocated file misses */ WT_FH *log_fh; /* Logging file handle */ - WT_FH *log_close_fh; /* Logging file handle to close */ WT_FH *log_dir_fh; /* Log directory file handle */ + WT_FH *log_close_fh; /* Logging file handle to close */ + WT_LSN log_close_lsn; /* LSN needed to close */ /* * System LSNs @@ -140,8 +210,9 @@ struct __wt_log { WT_SPINLOCK log_lock; /* Locked: Logging fields */ WT_SPINLOCK log_slot_lock; /* Locked: Consolidation array */ WT_SPINLOCK log_sync_lock; /* Locked: Single-thread fsync */ + WT_SPINLOCK log_writelsn_lock; /* Locked: write LSN */ - WT_RWLOCK *log_archive_lock; /* Archive and log cursors */ + WT_RWLOCK *log_archive_lock; /* Archive and log cursors */ /* Notify any waiting threads when sync_lsn is updated. */ WT_CONDVAR *log_sync_cond; @@ -150,7 +221,6 @@ struct __wt_log { /* * Consolidation array information - * WT_SLOT_ACTIVE must be less than WT_SLOT_POOL. * Our testing shows that the more consolidation we generate the * better the performance we see which equates to an active slot * slot count of one. @@ -158,13 +228,14 @@ struct __wt_log { * Note: this can't be an array, we impose cache-line alignment and * gcc doesn't support that for arrays. */ -#define WT_SLOT_ACTIVE 1 #define WT_SLOT_POOL 128 - WT_LOGSLOT *slot_array[WT_SLOT_ACTIVE]; /* Active slots */ - WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */ - size_t slot_buf_size; /* Buffer size for slots */ + WT_LOGSLOT *active_slot; /* Active slot */ + WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */ + size_t slot_buf_size; /* Buffer size for slots */ +#ifdef HAVE_DIAGNOSTIC + uint64_t write_calls; /* Calls to log_write */ +#endif -#define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */ uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/log.i b/src/third_party/wiredtiger/src/include/log.i new file mode 100644 index 00000000000..ff309c31265 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/log.i @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +static inline int __wt_log_cmp(WT_LSN *lsn1, WT_LSN *lsn2); + +/* + * __wt_log_cmp -- + * Compare 2 LSNs, return -1 if lsn1 < lsn2, 0 if lsn1 == lsn2 + * and 1 if lsn1 > lsn2. + */ +static inline int +__wt_log_cmp(WT_LSN *lsn1, WT_LSN *lsn2) +{ + WT_LSN l1, l2; + + /* + * Read LSNs into local variables so that we only read each field + * once and all comparisons are on the same values. + */ + l1 = *(volatile WT_LSN *)lsn1; + l2 = *(volatile WT_LSN *)lsn2; + + /* + * If the file numbers are different we don't need to compare the + * offset. + */ + if (l1.file != l2.file) + return (l1.file < l2.file ? -1 : 1); + /* + * If the file numbers are the same, compare the offset. + */ + if (l1.offset != l2.offset) + return (l1.offset < l2.offset ? -1 : 1); + return (0); +} diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h index 66547262417..a5a303f1630 100644 --- a/src/third_party/wiredtiger/src/include/meta.h +++ b/src/third_party/wiredtiger/src/include/meta.h @@ -21,7 +21,9 @@ #define WT_METADATA_TURTLE_SET "WiredTiger.turtle.set" /* Turtle temp file */ #define WT_METADATA_URI "metadata:" /* Metadata alias */ -#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata file URI */ +#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata table URI */ + +#define WT_LAS_URI "file:WiredTigerLAS.wt" /* Lookaside table URI */ /* * Pre computed hash for the metadata file. Used to optimize comparisons diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index 7fb6ae13d38..1b2cbf11fc2 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -130,6 +130,7 @@ #define FLD_CLR(field, mask) ((field) &= ~((uint32_t)(mask))) #define FLD_ISSET(field, mask) ((field) & ((uint32_t)(mask))) +#define FLD64_ISSET(field, mask) ((field) & ((uint64_t)(mask))) #define FLD_SET(field, mask) ((field) |= ((uint32_t)(mask))) /* diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i index 98facff02b9..6b502c4c1d1 100644 --- a/src/third_party/wiredtiger/src/include/misc.i +++ b/src/third_party/wiredtiger/src/include/misc.i @@ -7,6 +7,18 @@ */ /* + * __wt_cond_wait -- + * Wait on a mutex, optionally timing out. + */ +static inline int +__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) +{ + int notused; + + return (__wt_cond_wait_signal(session, cond, usecs, &notused)); +} + +/* * __wt_strdup -- * ANSI strdup function. */ diff --git a/src/third_party/wiredtiger/src/include/msvc.h b/src/third_party/wiredtiger/src/include/msvc.h index f4d8ba52fc1..8f5aa9abde8 100644 --- a/src/third_party/wiredtiger/src/include/msvc.h +++ b/src/third_party/wiredtiger/src/include/msvc.h @@ -52,7 +52,7 @@ __wt_atomic_sub##name(type *vp, type v) \ { \ return (_InterlockedExchangeAdd ## s((t *)(vp), - (t)v) - (v)); \ } \ -static inline int \ +static inline bool \ __wt_atomic_cas##name(type *vp, type old, type new) \ { \ return (_InterlockedCompareExchange ## s \ @@ -75,7 +75,7 @@ WT_ATOMIC_FUNC(size, size_t, size_t, 64, __int64) * __wt_atomic_cas_ptr -- * Pointer compare and swap. 
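__wt_log_cmp in the new log.i above snapshots each LSN through a volatile pointer so both fields are read once, then orders by file number and breaks ties on offset. A self-contained version of that ordering, assuming the two-field WT_LSN layout implied by the log.h macros:

#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t file; uint64_t offset; } lsn_t;

static int
lsn_cmp(lsn_t *lsn1, lsn_t *lsn2)
{
	/* Snapshot both fields once, as __wt_log_cmp does. */
	lsn_t l1 = *(volatile lsn_t *)lsn1, l2 = *(volatile lsn_t *)lsn2;

	if (l1.file != l2.file)
		return (l1.file < l2.file ? -1 : 1);
	if (l1.offset != l2.offset)
		return (l1.offset < l2.offset ? -1 : 1);
	return (0);
}

int
main(void)
{
	lsn_t a = { 3, 128 }, b = { 3, 4096 }, c = { 4, 0 };

	printf("%d %d %d\n", lsn_cmp(&a, &b), lsn_cmp(&a, &a), lsn_cmp(&c, &a));
	/* -1 0 1: file number dominates, offset breaks ties */
	return (0);
}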
*/ -static inline int +static inline bool __wt_atomic_cas_ptr(void *vp, void *old, void *new) { return (_InterlockedCompareExchange64( diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i index 7b62e66eccb..d90b29c2133 100644 --- a/src/third_party/wiredtiger/src/include/serial.i +++ b/src/third_party/wiredtiger/src/include/serial.i @@ -123,7 +123,7 @@ __col_append_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, * If the application didn't specify a record number, allocate a new one * and set up for an append. */ - if ((recno = WT_INSERT_RECNO(new_ins)) == 0) { + if ((recno = WT_INSERT_RECNO(new_ins)) == WT_RECNO_OOB) { recno = WT_INSERT_RECNO(new_ins) = btree->last_recno + 1; WT_ASSERT(session, WT_SKIP_LAST(ins_head) == NULL || recno > WT_INSERT_RECNO(WT_SKIP_LAST(ins_head))); @@ -292,25 +292,37 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, __wt_page_modify_set(session, page); /* - * If there are subsequent WT_UPDATE structures, we're evicting pages - * and the page-scanning mutex isn't held, discard obsolete WT_UPDATE - * structures. Serialization is needed so only one thread does the - * obsolete check at a time, and to protect updates from disappearing - * under reconciliation. + * If there are no subsequent WT_UPDATE structures we are done here. */ - if (upd->next != NULL && - __wt_txn_visible_all(session, page->modify->obsolete_check_txn)) { - F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret); - /* If we can't lock it, don't scan, that's okay. */ - if (ret != 0) - return (0); - obsolete = __wt_update_obsolete_check(session, page, upd->next); - F_CLR_ATOMIC(page, WT_PAGE_SCANNING); - if (obsolete != NULL) { + if (upd->next == NULL) + return (0); + /* + * We would like to call __wt_txn_update_oldest only in the event that + * there are further updates to this page, the check against WT_TXN_NONE + * is used as an indicator of there being further updates on this page. + */ + if (page->modify->obsolete_check_txn != WT_TXN_NONE) { + if (!__wt_txn_visible_all(session, + page->modify->obsolete_check_txn)) { + /* Try to move the oldest ID forward and re-check */ + __wt_txn_update_oldest(session,0); + } + if (!__wt_txn_visible_all(session, + page->modify->obsolete_check_txn)) { page->modify->obsolete_check_txn = WT_TXN_NONE; - __wt_update_obsolete_free(session, page, obsolete); + return (0); } } + F_CAS_ATOMIC(page, WT_PAGE_RECONCILIATION, ret); + + /* If we can't lock it, don't scan, that's okay. */ + if (ret != 0) + return (0); + obsolete = __wt_update_obsolete_check(session, page, upd->next); + F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); + if (obsolete != NULL) { + __wt_update_obsolete_free(session, page, obsolete); + } return (0); } diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index c6c246954f7..a691794fd46 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -76,6 +76,11 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */ WT_COMPACT *compact; /* Compact state */ + /* + * Lookaside table cursor, sweep and eviction worker threads only. 
+ */ + WT_CURSOR *las_cursor; /* Lookaside table cursor */ + WT_DATA_HANDLE *meta_dhandle; /* Metadata file */ void *meta_track; /* Metadata operation tracking */ void *meta_track_next; /* Current position */ diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 6ecb6b3a3c7..cd2c149bc94 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -276,11 +276,17 @@ struct __wt_connection_stats { int64_t cache_eviction_walk; int64_t cache_eviction_worker_evicting; int64_t cache_inmem_split; + int64_t cache_inmem_splittable; + int64_t cache_lookaside_insert; + int64_t cache_lookaside_remove; int64_t cache_overhead; int64_t cache_pages_dirty; int64_t cache_pages_inuse; int64_t cache_read; + int64_t cache_read_lookaside; int64_t cache_write; + int64_t cache_write_lookaside; + int64_t cache_write_restore; int64_t cond_wait; int64_t cursor_create; int64_t cursor_insert; @@ -323,9 +329,9 @@ struct __wt_connection_stats { int64_t log_slot_consolidated; int64_t log_slot_joins; int64_t log_slot_races; - int64_t log_slot_toobig; - int64_t log_slot_toosmall; + int64_t log_slot_switch_busy; int64_t log_slot_transitions; + int64_t log_slot_unbuffered; int64_t log_sync; int64_t log_sync_dir; int64_t log_write_lsn; @@ -400,6 +406,7 @@ struct __wt_dsrc_stats { int64_t btree_column_deleted; int64_t btree_column_fix; int64_t btree_column_internal; + int64_t btree_column_rle; int64_t btree_column_variable; int64_t btree_compact_rewrite; int64_t btree_entries; @@ -424,10 +431,14 @@ struct __wt_dsrc_stats { int64_t cache_eviction_internal; int64_t cache_eviction_split; int64_t cache_inmem_split; + int64_t cache_inmem_splittable; int64_t cache_overflow_value; int64_t cache_read; + int64_t cache_read_lookaside; int64_t cache_read_overflow; int64_t cache_write; + int64_t cache_write_lookaside; + int64_t cache_write_restore; int64_t compress_raw_fail; int64_t compress_raw_fail_temporary; int64_t compress_raw_ok; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 0e7be1be6bc..4a325c70a95 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -78,9 +78,8 @@ struct __wt_txn_global { }; typedef enum __wt_txn_isolation { - WT_ISO_EVICTION, /* Internal: eviction context */ - WT_ISO_READ_UNCOMMITTED, WT_ISO_READ_COMMITTED, + WT_ISO_READ_UNCOMMITTED, WT_ISO_SNAPSHOT } WT_TXN_ISOLATION; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 1228893871f..2b42990f5e5 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -140,12 +140,22 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) } /* + * __wt_txn_committed -- + * Return if a transaction has been committed. + */ +static inline bool +__wt_txn_committed(WT_SESSION_IMPL *session, uint64_t id) +{ + return (WT_TXNID_LT(id, S2C(session)->txn_global.last_running)); +} + +/* * __wt_txn_visible_all -- * Check if a given transaction ID is "globally visible". This is, if * all sessions in the system will see the transaction ID including the * ID that belongs to a running checkpoint. 
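The hunk that follows converts the transaction-visibility predicates to bool and drops the WT_ISO_EVICTION special case without changing the snapshot rule itself. A standalone restatement of that rule, with plain integer comparison standing in for the wraparound-safe WT_TXNID macros and a linear scan replacing the binary search:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
snap_visible(uint64_t id, uint64_t snap_min, uint64_t snap_max,
    const uint64_t *snapshot, unsigned count)
{
	unsigned i;

	if (id >= snap_max)			/* started after the snapshot */
		return (false);
	if (count == 0 || id < snap_min)	/* committed before it */
		return (true);
	for (i = 0; i < count; i++)		/* concurrent at snapshot time */
		if (snapshot[i] == id)
			return (false);
	return (true);
}

int
main(void)
{
	uint64_t snapshot[] = { 7, 9 };		/* IDs running at snapshot time */

	printf("%d %d %d\n",
	    snap_visible(5, 7, 10, snapshot, 2),	/* 1: before snap_min */
	    snap_visible(9, 7, 10, snapshot, 2),	/* 0: was still running */
	    snap_visible(8, 7, 10, snapshot, 2));	/* 1: already committed */
	return (0);
}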
*/ -static inline int +static inline bool __wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id) { uint64_t oldest_id; @@ -159,28 +169,21 @@ __wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id) * __wt_txn_visible -- * Can the current transaction see the given ID? */ -static inline int +static inline bool __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) { WT_TXN *txn; - int found; + bool found; txn = &session->txn; /* Changes with no associated transaction are always visible. */ if (id == WT_TXN_NONE) - return (1); + return (true); /* Nobody sees the results of aborted transactions. */ if (id == WT_TXN_ABORTED) - return (0); - - /* - * Eviction only sees globally visible updates, or if there is a - * checkpoint transaction running, use its transaction. - */ - if (txn->isolation == WT_ISO_EVICTION) - return (__wt_txn_visible_all(session, id)); + return (false); /* * Read-uncommitted transactions see all other changes. @@ -194,11 +197,11 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) */ if (txn->isolation == WT_ISO_READ_UNCOMMITTED || session->dhandle == session->meta_dhandle) - return (1); + return (true); /* Transactions see their own changes. */ if (id == txn->id) - return (1); + return (true); /* * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is @@ -210,9 +213,9 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) * snapshot is empty. */ if (WT_TXNID_LE(txn->snap_max, id)) - return (0); + return (false); if (txn->snapshot_count == 0 || WT_TXNID_LT(id, txn->snap_min)) - return (1); + return (true); WT_BINARY_SEARCH(id, txn->snapshot, txn->snapshot_count, found); return (!found); @@ -266,7 +269,7 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) } F_SET(txn, WT_TXN_RUNNING); - return (0); + return (false); } /* @@ -477,7 +480,7 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) * __wt_txn_am_oldest -- * Am I the oldest transaction in the system? */ -static inline int +static inline bool __wt_txn_am_oldest(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; @@ -492,12 +495,12 @@ __wt_txn_am_oldest(WT_SESSION_IMPL *session) txn_global = &conn->txn_global; if (txn->id == WT_TXN_NONE) - return (0); + return (false); WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LT(id, txn->id)) - return (0); + return (false); - return (1); + return (true); } diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index ddcbf19b847..71ba3f41a44 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -1750,6 +1750,9 @@ struct __wt_connection { * @config{ name, the name of a cache that * is shared between databases or \c "none" when no shared cache is * configured., a string; default \c none.} + * @config{ quota, maximum size of cache this + * database can be allocated from the shared cache. Defaults to the + * entire shared cache size., an integer; default \c 0.} * @config{ reserve, amount of cache this * database is guaranteed to have available from the shared cache. This * setting is per database. 
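The visibility rules spelled out above reduce to a few comparisons plus a binary search of the transaction's snapshot. A condensed, self-contained model in C; the struct and constant names are hypothetical, and the plain integer comparisons assume 64-bit IDs that never wrap, where the real code goes through its WT_TXNID_* macros and also special-cases read-uncommitted and metadata:

#include <stdbool.h>
#include <stdint.h>

#define TXN_NONE    0           /* No transaction: always visible. */
#define TXN_ABORTED UINT64_MAX  /* Aborted: never visible. */

struct snapshot {
    uint64_t own_id;    /* This transaction's ID. */
    uint64_t snap_min;  /* Everything below this had finished. */
    uint64_t snap_max;  /* Everything at/above this started later. */
    uint64_t *ids;      /* Sorted IDs still running at snapshot time. */
    uint32_t count;
};

static bool
txn_visible(const struct snapshot *s, uint64_t id)
{
    uint32_t hi, lo, mid;

    if (id == TXN_NONE)
        return (true);
    if (id == TXN_ABORTED)
        return (false);
    if (id == s->own_id)        /* Transactions see their own changes. */
        return (true);
    if (id >= s->snap_max)      /* Started after our snapshot. */
        return (false);
    if (s->count == 0 || id < s->snap_min)
        return (true);

    /* Concurrent transactions, found in the snapshot, are invisible. */
    for (lo = 0, hi = s->count; lo < hi;) {
        mid = (lo + hi) / 2;
        if (s->ids[mid] < id)
            lo = mid + 1;
        else
            hi = mid;
    }
    return (!(lo < s->count && s->ids[lo] == id));
}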
Defaults to the chunk size., an integer; @@ -2216,10 +2219,12 @@ struct __wt_connection { * @config{ name, the name of a cache that is shared * between databases or \c "none" when no shared cache is configured., a string; * default \c none.} - * @config{ reserve, amount of cache - * this database is guaranteed to have available from the shared cache. This - * setting is per database. Defaults to the chunk size., an integer; default \c - * 0.} + * @config{ quota, maximum size of + * cache this database can be allocated from the shared cache. Defaults to the + * entire shared cache size., an integer; default \c 0.} + * @config{ reserve, amount of cache this database is + * guaranteed to have available from the shared cache. This setting is per + * database. Defaults to the chunk size., an integer; default \c 0.} * @config{ size, maximum memory to allocate for the * shared cache. Setting this will update the value if one is already set., an * integer between 1MB and 10TB; default \c 500MB.} @@ -3642,198 +3647,210 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1047 /*! cache: in-memory page splits */ #define WT_STAT_CONN_CACHE_INMEM_SPLIT 1048 +/*! cache: in-memory page passed criteria to be split */ +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1049 +/*! cache: lookaside table insert calls */ +#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1050 +/*! cache: lookaside table remove calls */ +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1051 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1049 +#define WT_STAT_CONN_CACHE_OVERHEAD 1052 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1050 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1053 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1051 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1054 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1052 +#define WT_STAT_CONN_CACHE_READ 1055 +/*! cache: pages read into cache requiring lookaside entries */ +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1056 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1053 +#define WT_STAT_CONN_CACHE_WRITE 1057 +/*! cache: page written requiring lookaside records */ +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1058 +/*! cache: pages written requiring in-memory restoration */ +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1059 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1054 +#define WT_STAT_CONN_COND_WAIT 1060 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1055 +#define WT_STAT_CONN_CURSOR_CREATE 1061 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1056 +#define WT_STAT_CONN_CURSOR_INSERT 1062 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1057 +#define WT_STAT_CONN_CURSOR_NEXT 1063 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1058 +#define WT_STAT_CONN_CURSOR_PREV 1064 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1059 +#define WT_STAT_CONN_CURSOR_REMOVE 1065 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1060 +#define WT_STAT_CONN_CURSOR_RESET 1066 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1061 +#define WT_STAT_CONN_CURSOR_RESTART 1067 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1062 +#define WT_STAT_CONN_CURSOR_SEARCH 1068 /*! 
cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1063 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1069 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1064 +#define WT_STAT_CONN_CURSOR_UPDATE 1070 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1065 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1071 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1066 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1072 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1067 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1073 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1068 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1074 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1069 +#define WT_STAT_CONN_DH_SWEEP_REF 1075 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1070 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1076 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1071 +#define WT_STAT_CONN_DH_SWEEP_TOD 1077 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1072 +#define WT_STAT_CONN_DH_SWEEPS 1078 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1073 +#define WT_STAT_CONN_FILE_OPEN 1079 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1074 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1080 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1075 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1081 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1076 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1082 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1077 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1083 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1078 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1084 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1079 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1085 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1080 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1086 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1081 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1087 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1082 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1088 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1083 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1089 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1084 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1090 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1085 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1091 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1086 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1092 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1087 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1093 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1088 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1094 /*! 
log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1089 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1095 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1090 +#define WT_STAT_CONN_LOG_SCANS 1096 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1091 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1097 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1092 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1098 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1093 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1099 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1094 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1100 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1095 -/*! log: record size exceeded maximum */ -#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1096 -/*! log: failed to find a slot large enough for record */ -#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1097 +#define WT_STAT_CONN_LOG_SLOT_RACES 1101 +/*! log: busy returns attempting to switch slots */ +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1102 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1098 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1103 +/*! log: consolidated slot unbuffered writes */ +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1104 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1099 +#define WT_STAT_CONN_LOG_SYNC 1105 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1100 +#define WT_STAT_CONN_LOG_SYNC_DIR 1106 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1101 +#define WT_STAT_CONN_LOG_WRITE_LSN 1107 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1102 +#define WT_STAT_CONN_LOG_WRITES 1108 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1103 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1109 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1104 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1110 /*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1105 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1111 /*! LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1106 +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1112 /*! LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1107 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1113 /*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1108 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1114 /*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1109 +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1115 /*! LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1110 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1116 /*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1111 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1117 /*! LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1112 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1118 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1113 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1119 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1114 +#define WT_STAT_CONN_MEMORY_FREE 1120 /*! 
connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1115 +#define WT_STAT_CONN_MEMORY_GROW 1121 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1116 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1122 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1117 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1123 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1118 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1124 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1119 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1125 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1120 +#define WT_STAT_CONN_PAGE_SLEEP 1126 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1121 +#define WT_STAT_CONN_READ_IO 1127 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1122 +#define WT_STAT_CONN_REC_PAGES 1128 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1123 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1129 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1124 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1130 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1125 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1131 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1126 +#define WT_STAT_CONN_RWLOCK_READ 1132 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1127 +#define WT_STAT_CONN_RWLOCK_WRITE 1133 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1128 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1134 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1129 +#define WT_STAT_CONN_SESSION_OPEN 1135 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1130 +#define WT_STAT_CONN_TXN_BEGIN 1136 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1131 +#define WT_STAT_CONN_TXN_CHECKPOINT 1137 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1132 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1138 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1133 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1139 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1134 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1140 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1135 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1141 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1136 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1142 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1137 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1143 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1138 +#define WT_STAT_CONN_TXN_COMMIT 1144 /*! 
transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1139 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1145 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1140 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1146 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1141 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1147 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1142 +#define WT_STAT_CONN_TXN_ROLLBACK 1148 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1143 +#define WT_STAT_CONN_TXN_SYNC 1149 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1144 +#define WT_STAT_CONN_WRITE_IO 1150 /*! * @} @@ -3883,148 +3900,158 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_BTREE_COLUMN_FIX 2019 /*! btree: column-store internal pages */ #define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2020 +/*! btree: column-store variable-size RLE encoded values */ +#define WT_STAT_DSRC_BTREE_COLUMN_RLE 2021 /*! btree: column-store variable-size leaf pages */ -#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2021 +#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2022 /*! btree: pages rewritten by compaction */ -#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2022 +#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2023 /*! btree: number of key/value pairs */ -#define WT_STAT_DSRC_BTREE_ENTRIES 2023 +#define WT_STAT_DSRC_BTREE_ENTRIES 2024 /*! btree: fixed-record size */ -#define WT_STAT_DSRC_BTREE_FIXED_LEN 2024 +#define WT_STAT_DSRC_BTREE_FIXED_LEN 2025 /*! btree: maximum tree depth */ -#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2025 +#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2026 /*! btree: maximum internal page key size */ -#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2026 +#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2027 /*! btree: maximum internal page size */ -#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2027 +#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2028 /*! btree: maximum leaf page key size */ -#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2028 +#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2029 /*! btree: maximum leaf page size */ -#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2029 +#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2030 /*! btree: maximum leaf page value size */ -#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2030 +#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2031 /*! btree: overflow pages */ -#define WT_STAT_DSRC_BTREE_OVERFLOW 2031 +#define WT_STAT_DSRC_BTREE_OVERFLOW 2032 /*! btree: row-store internal pages */ -#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2032 +#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2033 /*! btree: row-store leaf pages */ -#define WT_STAT_DSRC_BTREE_ROW_LEAF 2033 +#define WT_STAT_DSRC_BTREE_ROW_LEAF 2034 /*! cache: bytes read into cache */ -#define WT_STAT_DSRC_CACHE_BYTES_READ 2034 +#define WT_STAT_DSRC_CACHE_BYTES_READ 2035 /*! cache: bytes written from cache */ -#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2035 +#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2036 /*! cache: checkpoint blocked page eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2036 +#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2037 /*! cache: unmodified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2037 +#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2038 /*! cache: page split during eviction deepened the tree */ -#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2038 +#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2039 /*! 
cache: modified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2039 +#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2040 /*! cache: data source pages selected for eviction unable to be evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2040 +#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2041 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2041 +#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2042 /*! cache: internal pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2042 +#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2043 /*! cache: pages split during eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT 2043 +#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT 2044 /*! cache: in-memory page splits */ -#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2044 +#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2045 +/*! cache: in-memory page passed criteria to be split */ +#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2046 /*! cache: overflow values cached in memory */ -#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2045 +#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2047 /*! cache: pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ 2046 +#define WT_STAT_DSRC_CACHE_READ 2048 +/*! cache: pages read into cache requiring lookaside entries */ +#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2049 /*! cache: overflow pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2047 +#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2050 /*! cache: pages written from cache */ -#define WT_STAT_DSRC_CACHE_WRITE 2048 +#define WT_STAT_DSRC_CACHE_WRITE 2051 +/*! cache: page written requiring lookaside records */ +#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2052 +/*! cache: pages written requiring in-memory restoration */ +#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2053 /*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2049 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2054 /*! compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2050 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2055 /*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2051 +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2056 /*! compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2052 +#define WT_STAT_DSRC_COMPRESS_READ 2057 /*! compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2053 +#define WT_STAT_DSRC_COMPRESS_WRITE 2058 /*! compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2054 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2059 /*! compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2055 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2060 /*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2056 +#define WT_STAT_DSRC_CURSOR_CREATE 2061 /*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2057 +#define WT_STAT_DSRC_CURSOR_INSERT 2062 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2058 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2063 /*! cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2059 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2064 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2060 +#define WT_STAT_DSRC_CURSOR_NEXT 2065 /*! 
cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2061 +#define WT_STAT_DSRC_CURSOR_PREV 2066 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2062 +#define WT_STAT_DSRC_CURSOR_REMOVE 2067 /*! cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2063 +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2068 /*! cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2064 +#define WT_STAT_DSRC_CURSOR_RESET 2069 /*! cursor: restarted searches */ -#define WT_STAT_DSRC_CURSOR_RESTART 2065 +#define WT_STAT_DSRC_CURSOR_RESTART 2070 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2066 +#define WT_STAT_DSRC_CURSOR_SEARCH 2071 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2067 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2072 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2068 +#define WT_STAT_DSRC_CURSOR_UPDATE 2073 /*! cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2069 +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2074 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2070 +#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2075 /*! LSM: chunks in the LSM tree */ -#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2071 +#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2076 /*! LSM: highest merge generation in the LSM tree */ -#define WT_STAT_DSRC_LSM_GENERATION_MAX 2072 +#define WT_STAT_DSRC_LSM_GENERATION_MAX 2077 /*! LSM: queries that could have benefited from a Bloom filter that did * not exist */ -#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2073 +#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2078 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2074 +#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2079 /*! reconciliation: dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2075 +#define WT_STAT_DSRC_REC_DICTIONARY 2080 /*! reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2076 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2081 /*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2077 +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2082 /*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2078 +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2083 /*! reconciliation: internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2079 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2084 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2080 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2085 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2081 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2086 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2082 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2087 /*! reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2083 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2088 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2084 +#define WT_STAT_DSRC_REC_PAGES 2089 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2085 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2090 /*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2086 +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2091 /*! 
reconciliation: internal page key bytes discarded using suffix * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2087 +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2092 /*! session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2088 +#define WT_STAT_DSRC_SESSION_COMPACT 2093 /*! session: open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2089 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2094 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2090 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2095 /*! @} */ /* * Statistics section: END diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index 9cc2ce2135a..4d46a25b63c 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -41,6 +41,7 @@ extern "C" { #else #include <pthread.h> #endif +#include <stdbool.h> #include <stddef.h> #include <stdio.h> #include <stdint.h> @@ -245,6 +246,8 @@ struct __wt_rwlock; typedef struct __wt_rwlock WT_RWLOCK; struct __wt_salvage_cookie; typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE; +struct __wt_save_upd; + typedef struct __wt_save_upd WT_SAVE_UPD; struct __wt_scratch_track; typedef struct __wt_scratch_track WT_SCRATCH_TRACK; struct __wt_session_impl; @@ -265,8 +268,6 @@ struct __wt_txn_op; typedef struct __wt_txn_op WT_TXN_OP; struct __wt_txn_state; typedef struct __wt_txn_state WT_TXN_STATE; -struct __wt_upd_skipped; - typedef struct __wt_upd_skipped WT_UPD_SKIPPED; struct __wt_update; typedef struct __wt_update WT_UPDATE; union __wt_rand_state; @@ -335,6 +336,7 @@ union __wt_rand_state; #include "cache.i" /* required by txn.i */ #include "cell.i" /* required by btree.i */ +#include "log.i" #include "mutex.i" /* required by btree.i */ #include "txn.i" /* required by btree.i */ diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 26ba34c7f93..574442f645c 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -34,6 +34,24 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn) } /* + * __wt_log_ckpt_lsn -- + * Force out buffered records and return an LSN for checkpoint. + */ +int +__wt_log_ckpt_lsn(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + + conn = S2C(session); + log = conn->log; + WT_RET(__wt_log_force_write(session, 1)); + WT_RET(__wt_log_wrlsn(session)); + *ckp_lsn = log->write_start_lsn; + return (0); +} + +/* * __wt_log_background -- * Record the given LSN as the background LSN and signal the * thread as needed. @@ -53,7 +71,7 @@ __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn) * needed. */ __wt_spin_lock(session, &log->log_sync_lock); - if (WT_LOG_CMP(lsn, &log->bg_sync_lsn) > 0) + if (__wt_log_cmp(lsn, &log->bg_sync_lsn) > 0) log->bg_sync_lsn = *lsn; __wt_spin_unlock(session, &log->log_sync_lock); return (__wt_cond_signal(session, conn->log_file_cond)); @@ -100,7 +118,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) /* * Sync the log file if needed. */ - if (WT_LOG_CMP(&log->sync_lsn, min_lsn) < 0) { + if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_force_sync: sync to LSN %d/%lu", min_lsn->file, min_lsn->offset)); @@ -241,6 +259,11 @@ __wt_log_get_all_files(WT_SESSION_IMPL *session, log = S2C(session)->log; *maxid = 0; + /* + * These may be files needed by backup. 
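Applications consume the WT_STAT_CONN_* keys renumbered above through a statistics cursor. A minimal sketch against the public API; it assumes an already-open WT_SESSION and elides most error handling:

#include <inttypes.h>
#include <stdio.h>
#include <wiredtiger.h>

/* Look up and print a single connection statistic by key. */
static int
print_stat(WT_SESSION *session, int key)
{
    WT_CURSOR *cursor;
    const char *desc, *pvalue;
    uint64_t value;
    int ret;

    if ((ret = session->open_cursor(
        session, "statistics:", NULL, NULL, &cursor)) != 0)
        return (ret);
    cursor->set_key(cursor, key);
    if ((ret = cursor->search(cursor)) == 0 &&
        (ret = cursor->get_value(cursor, &desc, &pvalue, &value)) == 0)
        printf("%s: %" PRIu64 "\n", desc, value);
    (void)cursor->close(cursor);
    return (ret);
}

For example, print_stat(session, WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY) reports the new busy-switch counter; because the numeric values shift between releases, as this very diff shows, use the WT_STAT_* names rather than hard-coded integers.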
Force the current slot + * to get written to the file. + */ + WT_RET(__wt_log_force_write(session, 1)); WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count)); /* Filter out any files that are below the checkpoint LSN. */ @@ -354,70 +377,12 @@ static int __log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize) { WT_CONNECTION_IMPL *conn; - - conn = S2C(session); - return (lsn->offset + (wt_off_t)recsize < conn->log_file_max); -} - -/* - * __log_acquire -- - * Called with the log slot lock held. Can be called recursively - * from __wt_log_newfile when we change log files. - */ -static int -__log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) -{ - WT_CONNECTION_IMPL *conn; WT_LOG *log; - int created_log; conn = S2C(session); log = conn->log; - created_log = 1; - /* - * Called locked. Add recsize to alloc_lsn. Save our starting LSN - * where the previous allocation finished for the release LSN. - * That way when log files switch, we're waiting for the correct LSN - * from outstanding writes. - */ - slot->slot_release_lsn = log->alloc_lsn; - if (!__log_size_fit(session, &log->alloc_lsn, recsize)) { - WT_RET(__wt_log_newfile(session, 0, &created_log)); - if (log->log_close_fh != NULL) - F_SET(slot, WT_SLOT_CLOSEFH); - } - - /* - * Checkpoints can be configured based on amount of log written. - * Add in this log record to the sum and if needed, signal the - * checkpoint condition. The logging subsystem manages the - * accumulated field. There is a bit of layering violation - * here checking the connection ckpt field and using its - * condition. - */ - if (WT_CKPT_LOGSIZE(conn)) { - log->log_written += (wt_off_t)recsize; - WT_RET(__wt_checkpoint_signal(session, log->log_written)); - } - - /* - * Need to minimally fill in slot info here. Our slot start LSN - * comes after any potential new log file creations. - */ - slot->slot_start_lsn = log->alloc_lsn; - slot->slot_start_offset = log->alloc_lsn.offset; - /* - * Pre-allocate on the first real write into the log file, if it - * was just created (i.e. not pre-allocated). - */ - if (log->alloc_lsn.offset == WT_LOG_FIRST_RECORD && created_log) - WT_RET(__log_prealloc(session, log->log_fh)); - - log->alloc_lsn.offset += (wt_off_t)recsize; - slot->slot_end_lsn = log->alloc_lsn; - slot->slot_error = 0; - slot->slot_fh = log->log_fh; - return (0); + return (lsn->offset == WT_LOG_FIRST_RECORD || + lsn->offset + (wt_off_t)recsize < conn->log_file_max); } /* @@ -490,24 +455,32 @@ __log_decrypt(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out) */ static int __log_fill(WT_SESSION_IMPL *session, - WT_MYSLOT *myslot, int direct, WT_ITEM *record, WT_LSN *lsnp) + WT_MYSLOT *myslot, int force, WT_ITEM *record, WT_LSN *lsnp) { WT_DECL_RET; WT_LOG_RECORD *logrec; + /* + * The WT_LOG_SLOT_BUF_MAX macro uses log. + */ logrec = (WT_LOG_RECORD *)record->mem; /* - * Call __wt_write. For now the offset is the real byte offset. If the - * offset becomes a unit of WT_LOG_ALIGN this is where we would multiply - * by WT_LOG_ALIGN to get the real file byte offset for write(). + * Call __wt_write or copy into the buffer. For now the offset is the + * real byte offset. If the offset becomes a unit of WT_LOG_ALIGN this + * is where we would multiply by WT_LOG_ALIGN to get the real file byte + * offset for write(). 
*/ - if (direct) + if (!force && !F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) + memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset, + logrec, logrec->len); + else + /* + * If this is a force or unbuffered write, write it now. + * A forced write sends in a temporary, local slot. + */ WT_ERR(__wt_write(session, myslot->slot->slot_fh, myslot->offset + myslot->slot->slot_start_offset, (size_t)logrec->len, (void *)logrec)); - else - memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset, - logrec, logrec->len); WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, logrec->len); if (lsnp != NULL) { @@ -563,12 +536,12 @@ __log_file_header( logrec->checksum = 0; logrec->checksum = __wt_cksum(logrec, log->allocsize); WT_CLEAR(tmp); + memset(&myslot, 0, sizeof(myslot)); myslot.slot = &tmp; - myslot.offset = 0; /* - * We may recursively call __log_acquire to allocate log space for the - * log descriptor record. Call __log_fill to write it, but we + * We may recursively call __wt_log_acquire to allocate log space for + * the log descriptor record. Call __log_fill to write it, but we * do not need to call __log_release because we're not waiting for * any earlier operations to complete. */ @@ -577,7 +550,7 @@ __log_file_header( tmp.slot_fh = fh; } else { WT_ASSERT(session, fh == NULL); - WT_ERR(__log_acquire(session, logrec->len, &tmp)); + WT_ERR(__wt_log_acquire(session, logrec->len, &tmp)); } WT_ERR(__log_fill(session, &myslot, 1, buf, NULL)); /* @@ -697,6 +670,146 @@ err: __wt_scr_free(session, &from_path); } /* + * __log_newfile -- + * Create the next log file and write the file header record into it. + */ +static int +__log_newfile(WT_SESSION_IMPL *session, int conn_open, int *created) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + WT_LSN end_lsn; + int create_log, yield_cnt; + + conn = S2C(session); + log = conn->log; + + create_log = 1; + yield_cnt = 0; + /* + * Set aside the log file handle to be closed later. Other threads + * may still be using it to write to the log. If the log file size + * is small we could fill a log file before the previous one is closed. + * Wait for that to close. + */ + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + while (log->log_close_fh != NULL) { + WT_STAT_FAST_CONN_INCR(session, log_close_yields); + WT_RET(__wt_log_wrlsn(session)); + if (++yield_cnt > 10000) + return (EBUSY); + __wt_yield(); + } + log->log_close_fh = log->log_fh; + if (log->log_close_fh != NULL) + log->log_close_lsn = log->alloc_lsn; + log->fileid++; + /* + * Make sure everything we set above is visible. + */ + WT_FULL_BARRIER(); + /* + * If we're pre-allocating log files, look for one. If there aren't any + * or we're not pre-allocating, then create one. + */ + if (conn->log_prealloc) { + ret = __log_alloc_prealloc(session, log->fileid); + /* + * If ret is 0 it means we found a pre-allocated file. + * If ret is non-zero but not WT_NOTFOUND, we return the error. + * If ret is WT_NOTFOUND, we leave create_log set and create + * the new log file. + */ + if (ret == 0) + create_log = 0; + /* + * If we get any error other than WT_NOTFOUND, return it. + */ + if (ret != 0 && ret != WT_NOTFOUND) + return (ret); + ret = 0; + } + /* + * If we need to create the log file, do so now. + */ + if (create_log) { + log->prep_missed++; + WT_RET(__wt_log_allocfile( + session, log->fileid, WT_LOG_FILENAME, 1)); + } + WT_RET(__log_openfile(session, + 0, &log->log_fh, WT_LOG_FILENAME, log->fileid)); + /* + * We need to setup the LSNs. 
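Reduced to essentials, the rewritten __log_fill above has two paths: the common case copies the record into the slot's shared buffer at the offset the writer was assigned when joining, while forced or unbuffered records go straight to the file. A schematic version, with POSIX pwrite standing in for __wt_write:

#include <string.h>
#include <sys/types.h>
#include <unistd.h>

static int
log_fill(int fd, char *slot_buf, off_t slot_start, off_t my_offset,
    const void *rec, size_t len, int unbuffered)
{
    if (!unbuffered) {
        /* Common case: stage the record in the slot's buffer. */
        memcpy(slot_buf + my_offset, rec, len);
        return (0);
    }
    /* Forced/unbuffered records bypass the buffer entirely. */
    return (pwrite(fd, rec, len,
        slot_start + my_offset) == (ssize_t)len ? 0 : -1);
}

Buffering is what lets many small writers share a single write() per slot; records too large to stage are written unbuffered, which is what the new log_slot_unbuffered statistic counts.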
Set the end LSN and alloc LSN to + * the end of the header. + */ + log->alloc_lsn.file = log->fileid; + log->alloc_lsn.offset = WT_LOG_FIRST_RECORD; + end_lsn = log->alloc_lsn; + + /* + * If we're called from connection creation code, we need to update + * the LSNs since we're the only write in progress. + */ + if (conn_open) { + WT_RET(__wt_fsync(session, log->log_fh)); + log->sync_lsn = end_lsn; + log->write_lsn = end_lsn; + log->write_start_lsn = end_lsn; + } + if (created != NULL) + *created = create_log; + return (0); +} + +/* + * __wt_log_acquire -- + * Called serially when switching slots. Can be called recursively + * from __log_newfile when we change log files. + */ +int +__wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + int created_log; + + conn = S2C(session); + log = conn->log; + created_log = 1; + /* + * Add recsize to alloc_lsn. Save our starting LSN + * where the previous allocation finished for the release LSN. + * That way when log files switch, we're waiting for the correct LSN + * from outstanding writes. + */ + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + /* + * We need to set the release LSN earlier, before a log file change. + */ + slot->slot_release_lsn = log->alloc_lsn; + if (!__log_size_fit(session, &log->alloc_lsn, recsize)) { + WT_RET(__log_newfile(session, 0, &created_log)); + if (log->log_close_fh != NULL) + F_SET(slot, WT_SLOT_CLOSEFH); + } + + /* + * Pre-allocate on the first real write into the log file, if it + * was just created (i.e. not pre-allocated). + */ + if (log->alloc_lsn.offset == WT_LOG_FIRST_RECORD && created_log) + WT_RET(__log_prealloc(session, log->log_fh)); + /* + * Initialize the slot for activation. + */ + __wt_log_slot_activate(session, slot); + + return (0); +} + +/* * __log_truncate -- * Truncate the log to the given LSN. If this_log is set, it will only * truncate the log file indicated in the given LSN. If not set, @@ -842,7 +955,7 @@ err: __wt_scr_free(session, &path); * __wt_log_open -- * Open the appropriate log file for the connection. The purpose is * to find the last log file that exists, open it and set our initial - * LSNs to the end of that file. If none exist, call __wt_log_newfile + * LSNs to the end of that file. If none exist, call __log_newfile * to create it. */ int @@ -917,7 +1030,9 @@ __wt_log_open(WT_SESSION_IMPL *session) * Start logging at the beginning of the next log file, no matter * where the previous log file ends. */ - WT_ERR(__wt_log_newfile(session, 1, NULL)); + WT_WITH_SLOT_LOCK(session, log, + ret = __log_newfile(session, 1, NULL)); + WT_ERR(ret); /* If we found log files, save the new state. 
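__wt_log_acquire, shown above, is the single point that hands out log space: it saves where the previous allocation ended as the release LSN, switches files when the record will not fit, and advances the allocation LSN. A toy version under the same called-with-the-lock-held assumption; the names and the flat offset arithmetic are simplifications:

#include <stdint.h>

struct lsn {
    uint32_t file;    /* Log file number. */
    uint64_t offset;  /* Byte offset within the file. */
};

static void
log_acquire(struct lsn *alloc_lsn, struct lsn *release_lsn,
    uint64_t recsize, uint64_t file_max, uint64_t first_record)
{
    /* Later, the writer waits for all writes up to release_lsn. */
    *release_lsn = *alloc_lsn;
    if (alloc_lsn->offset != first_record &&
        alloc_lsn->offset + recsize >= file_max) {
        /* Record will not fit: switch to a new log file. */
        alloc_lsn->file++;
        alloc_lsn->offset = first_record;
    }
    alloc_lsn->offset += recsize;
}

The first_record escape mirrors the new __log_size_fit: a record larger than the file maximum is still allowed as the first record of a fresh file, so an oversized record cannot force an endless file switch.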
*/ if (logcount > 0) { @@ -1065,38 +1180,57 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) WT_DECL_RET; WT_LOG *log; WT_LSN sync_lsn; - size_t write_size; - int locked, yield_count; + int locked, need_relock, yield_count; + int64_t release_buffered, release_bytes; conn = S2C(session); log = conn->log; - locked = yield_count = 0; - *freep = 1; + locked = need_relock = yield_count = 0; + if (freep != NULL) + *freep = 1; + release_buffered = + WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state); + release_bytes = release_buffered + slot->slot_unbuffered; /* Write the buffered records */ - if (F_ISSET(slot, WT_SLOT_BUFFERED)) { - write_size = (size_t) - (slot->slot_end_lsn.offset - slot->slot_start_offset); - WT_ERR(__wt_write(session, slot->slot_fh, - slot->slot_start_offset, write_size, slot->slot_buf.mem)); + /* + * Checkpoints can be configured based on amount of log written. + * Add in this log record to the sum and if needed, signal the + * checkpoint condition. The logging subsystem manages the + * accumulated field. There is a bit of layering violation + * here checking the connection ckpt field and using its + * condition. + */ + if (WT_CKPT_LOGSIZE(conn)) { + log->log_written += (wt_off_t)release_bytes; + WT_RET(__wt_checkpoint_signal(session, log->log_written)); } + if (release_buffered != 0) + WT_ERR(__wt_write(session, + slot->slot_fh, slot->slot_start_offset, + (size_t)release_buffered, slot->slot_buf.mem)); + /* - * If this is not a buffered write, meaning the slot we have is a - * dummy constructed slot, not from the slot pool, or we have to wait - * for a synchronous operation, we do not pass handling of this slot - * off to the worker thread. The caller is responsible for freeing - * the slot in that case. Otherwise the worker thread will free it. + * If we have to wait for a synchronous operation, we do not pass + * handling of this slot off to the worker thread. The caller is + * responsible for freeing the slot in that case. Otherwise the + * worker thread will free it. */ - if (F_ISSET(slot, WT_SLOT_BUFFERED) && - !F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { - *freep = 0; + if (!F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { + if (freep != NULL) + *freep = 0; slot->slot_state = WT_LOG_SLOT_WRITTEN; /* * After this point the worker thread owns the slot. There * is nothing more to do but return. */ - WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond)); + /* + * !!! Signalling the wrlsn_cond condition here results in + * worse performance because it causes more scheduling churn + * and more walking of the slot pool for a very small number + * of slots to process. Don't signal here. + */ goto done; } @@ -1105,15 +1239,31 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) * be holes in the log file. */ WT_STAT_FAST_CONN_INCR(session, log_release_write_lsn); - while (WT_LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) { + while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) { + /* + * If we're on a locked path and the write LSN is not advancing, + * unlock in case an earlier thread is trying to switch its + * slot and complete its operation. 
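The hazard the comment above guards against is easy to model: writers must publish in LSN order, so a waiter that holds the slot lock can block the very predecessor it is waiting on. A schematic of the unlock-wait-relock discipline; volatile and sched_yield() stand in for the real memory barriers and timed condition waits:

#include <pthread.h>
#include <sched.h>
#include <stdint.h>

/*
 * Wait until every earlier write has been published, then publish
 * ours.  Holding slot_lock while spinning could deadlock: the
 * predecessor may need that lock to finish its own slot switch.
 */
static void
release_in_order(pthread_mutex_t *slot_lock, int *locked,
    volatile uint64_t *write_seq, uint64_t my_turn)
{
    while (*write_seq != my_turn) {
        if (*locked) {
            pthread_mutex_unlock(slot_lock);
            *locked = 0;  /* Caller relocks once done. */
        }
        sched_yield();
    }
    *write_seq = my_turn + 1;
}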
+ */ + if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) { + __wt_spin_unlock(session, &log->log_slot_lock); + need_relock = 1; + } if (++yield_count < 1000) __wt_yield(); else WT_ERR(__wt_cond_wait( session, log->log_write_cond, 200)); + if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) { + __wt_spin_lock(session, &log->log_slot_lock); + need_relock = 0; + } } + log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; + + WT_ASSERT(session, slot != log->active_slot); WT_ERR(__wt_cond_signal(session, log->log_write_cond)); /* @@ -1168,7 +1318,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) * Sync the log file if needed. */ if (F_ISSET(slot, WT_SLOT_SYNC) && - WT_LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) { + __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_release: sync log %s", log->log_fh->name)); WT_STAT_FAST_CONN_INCR(session, log_sync); @@ -1186,6 +1336,8 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) } err: if (locked) __wt_spin_unlock(session, &log->log_sync_lock); + if (need_relock) + __wt_spin_lock(session, &log->log_slot_lock); if (ret != 0 && slot->slot_error == 0) slot->slot_error = ret; done: @@ -1193,93 +1345,6 @@ done: } /* - * __wt_log_newfile -- - * Create the next log file and write the file header record into it. - */ -int -__wt_log_newfile(WT_SESSION_IMPL *session, int conn_create, int *created) -{ - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_LOG *log; - WT_LSN end_lsn; - int create_log; - - conn = S2C(session); - log = conn->log; - - create_log = 1; - /* - * Set aside the log file handle to be closed later. Other threads - * may still be using it to write to the log. If the log file size - * is small we could fill a log file before the previous one is closed. - * Wait for that to close. - */ - while (log->log_close_fh != NULL) { - WT_STAT_FAST_CONN_INCR(session, log_close_yields); - WT_RET(__wt_log_wrlsn(session, NULL, NULL)); - __wt_yield(); - } - log->log_close_fh = log->log_fh; - log->fileid++; - - /* - * If we're pre-allocating log files, look for one. If there aren't any - * or we're not pre-allocating, then create one. - */ - ret = 0; - if (conn->log_prealloc) { - ret = __log_alloc_prealloc(session, log->fileid); - /* - * If ret is 0 it means we found a pre-allocated file. - * If ret is non-zero but not WT_NOTFOUND, we return the error. - * If ret is WT_NOTFOUND, we leave create_log set and create - * the new log file. - */ - if (ret == 0) - create_log = 0; - /* - * If we get any error other than WT_NOTFOUND, return it. - */ - if (ret != 0 && ret != WT_NOTFOUND) - return (ret); - ret = 0; - } - /* - * If we need to create the log file, do so now. - */ - if (create_log) { - log->prep_missed++; - if ((ret = __wt_log_allocfile( - session, log->fileid, WT_LOG_FILENAME, 0)) != 0) - return (ret); - } - WT_RET(__log_openfile(session, - 0, &log->log_fh, WT_LOG_FILENAME, log->fileid)); - /* - * We need to setup the LSNs. Set the end LSN and alloc LSN to - * the end of the header. - */ - log->alloc_lsn.file = log->fileid; - log->alloc_lsn.offset = WT_LOG_FIRST_RECORD; - end_lsn = log->alloc_lsn; - - /* - * If we're called from connection creation code, we need to update - * the LSNs since we're the only write in progress. 
- */ - if (conn_create) { - WT_RET(__wt_fsync(session, log->log_fh)); - log->sync_lsn = end_lsn; - log->write_lsn = end_lsn; - log->write_start_lsn = end_lsn; - } - if (created != NULL) - *created = create_log; - return (0); -} - -/* * __wt_log_scan -- * Scan the logs, calling a function on each record found. */ @@ -1535,7 +1600,7 @@ advance: /* Truncate if we're in recovery. */ if (LF_ISSET(WT_LOGSCAN_RECOVER) && - WT_LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0) + __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0) WT_ERR(__log_truncate(session, &rd_lsn, WT_LOG_FILENAME, 0)); @@ -1559,42 +1624,54 @@ err: WT_STAT_FAST_CONN_INCR(session, log_scans); } /* - * __log_direct_write -- - * Write a log record without using the consolidation arrays. + * __log_force_write_internal -- + * Force a switch and release and write of the current slot. + * Must be called with the slot lock held. */ static int -__log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, - uint32_t flags) +__log_force_write_internal(WT_SESSION_IMPL *session) { WT_DECL_RET; WT_LOG *log; - WT_LOGSLOT tmp; - WT_MYSLOT myslot; - int dummy, locked; + WT_LOGSLOT *slot; + int free_slot, release; log = S2C(session)->log; - myslot.slot = &tmp; - myslot.offset = 0; - dummy = 0; - WT_CLEAR(tmp); + slot = log->active_slot; + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + /* + * If closing the slot returns WT_NOTFOUND, it means that someone else + * is processing the slot change: we're done. If we get EBUSY (or any + * other error), return that so the caller can decide what to do. + */ + ret = __wt_log_slot_close(session, slot, &release, 1); + if (ret == WT_NOTFOUND) + return (0); + WT_RET(ret); + if (release) { + WT_RET(__log_release(session, slot, &free_slot)); + if (free_slot) + __wt_log_slot_free(session, slot); + } + WT_RET(__wt_log_slot_new(session)); + return (0); +} - /* Fast path the contended case. */ - if (__wt_spin_trylock(session, &log->log_slot_lock) != 0) - return (EAGAIN); - locked = 1; - - if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) - F_SET(&tmp, WT_SLOT_SYNC_DIR); - if (LF_ISSET(WT_LOG_FSYNC)) - F_SET(&tmp, WT_SLOT_SYNC); - WT_ERR(__log_acquire(session, record->size, &tmp)); - __wt_spin_unlock(session, &log->log_slot_lock); - locked = 0; - WT_ERR(__log_fill(session, &myslot, 1, record, lsnp)); - WT_ERR(__log_release(session, &tmp, &dummy)); +/* + * __wt_log_force_write -- + * Force a switch and release and write of the current slot. + * Wrapper function that takes the lock. + */ +int +__wt_log_force_write(WT_SESSION_IMPL *session, int retry) +{ + WT_DECL_RET; + + do { + WT_WITH_SLOT_LOCK(session, S2C(session)->log, + ret = __log_force_write_internal(session)); + } while (retry && ret == EBUSY); -err: if (locked) - __wt_spin_unlock(session, &log->log_slot_lock); return (ret); } @@ -1741,14 +1818,16 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LOG_RECORD *logrec; WT_LSN lsn; WT_MYSLOT myslot; - uint32_t rdup_len; - int free_slot, locked; + int64_t release_size; + uint32_t force, rdup_len; + int free_slot; conn = S2C(session); log = conn->log; - free_slot = locked = 0; + free_slot = 0; WT_INIT_LSN(&lsn); myslot.slot = NULL; + memset(&myslot, 0, sizeof(myslot)); /* * Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a * header at the beginning for us to fill in. 
@@ -1778,87 +1857,67 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_STAT_FAST_CONN_INCR(session, log_writes); - if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) { - ret = __log_direct_write(session, record, &lsn, flags); - if (ret == 0 && lsnp != NULL) - *lsnp = lsn; - /* - * All needed syncing will be handled directly except - * a background sync. Handle that here. - */ - if (ret == 0) { - if (LF_ISSET(WT_LOG_BACKGROUND)) - goto bg; - else - return (0); - } - if (ret != EAGAIN) - WT_ERR(ret); - /* - * An EAGAIN return means we failed to get the try lock - - * fall through to the consolidation code in that case. - */ - } - + __wt_log_slot_join(session, rdup_len, flags, &myslot); /* - * As soon as we see contention for the log slot, disable direct - * log writes. We get better performance by forcing writes through - * the consolidation code. This is because individual writes flood - * the I/O system faster than they contend on the log slot lock. + * If the addition of this record crosses the buffer boundary, + * switch in a new slot. */ - F_SET(log, WT_LOG_FORCE_CONSOLIDATE); - if ((ret = __wt_log_slot_join( - session, rdup_len, flags, &myslot)) == ENOMEM) { + force = LF_ISSET(WT_LOG_FLUSH | WT_LOG_FSYNC); + ret = 0; + if (myslot.end_offset >= WT_LOG_SLOT_BUF_MAX || + F_ISSET(&myslot, WT_MYSLOT_UNBUFFERED) || force) + ret = __wt_log_slot_switch(session, &myslot); + if (ret == 0) + ret = __log_fill(session, &myslot, 0, record, &lsn); + release_size = __wt_log_slot_release( + session, &myslot, (int64_t)rdup_len); + /* + * If we get an error we still need to do proper accounting in + * the slot fields. + * XXX On error we may still need to call release and free. + */ + if (ret != 0) + myslot.slot->slot_error = ret; + WT_ASSERT(session, ret == 0); + if (WT_LOG_SLOT_DONE(release_size)) { + WT_ERR(__log_release(session, myslot.slot, &free_slot)); + if (free_slot) + __wt_log_slot_free(session, myslot.slot); + } else if (force) { /* - * If we couldn't find a consolidated slot for this record - * write the record directly. + * If we are going to wait for this slot to get written, + * signal the wrlsn thread. + * + * XXX I've seen times when conditions are NULL. 
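The tail of __log_write_internal above implements two durability levels: WT_LOG_FLUSH waits until the record has at least been handed to the operating system (write_lsn), while WT_LOG_FSYNC waits until it is on stable storage (sync_lsn). With LSNs flattened to plain sequence numbers and busy-waits standing in for the timed condition waits, the two loops look like:

#include <sched.h>
#include <stdint.h>

/* FLUSH-style wait: the record has reached the OS. */
static void
wait_os(volatile uint64_t *write_lsn, uint64_t mine)
{
    while (*write_lsn <= mine)
        sched_yield();
}

/* FSYNC-style wait: the record has reached stable storage. */
static void
wait_disk(volatile uint64_t *sync_lsn, uint64_t mine)
{
    while (*sync_lsn <= mine)
        sched_yield();
}

Note the diff also reorders the checks so WT_LOG_FLUSH is tested first, and both real loops consult the slot's error field so a failed slot does not wait forever.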
*/ - while ((ret = __log_direct_write( - session, record, lsnp, flags)) == EAGAIN) - ; - WT_ERR(ret); - return (0); + if (conn->log_cond != NULL) { + WT_ERR(__wt_cond_signal(session, conn->log_cond)); + __wt_yield(); + } else + WT_ERR(__wt_log_force_write(session, 1)); } - WT_ERR(ret); - if (myslot.offset == 0) { - __wt_spin_lock(session, &log->log_slot_lock); - locked = 1; - WT_ERR(__wt_log_slot_close(session, myslot.slot)); - WT_ERR(__log_acquire( - session, myslot.slot->slot_group_size, myslot.slot)); - __wt_spin_unlock(session, &log->log_slot_lock); - locked = 0; - WT_ERR(__wt_log_slot_notify(session, myslot.slot)); - } else - WT_ERR(__wt_log_slot_wait(session, myslot.slot)); - WT_ERR(__log_fill(session, &myslot, 0, record, &lsn)); - if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) { - WT_ERR(__log_release(session, myslot.slot, &free_slot)); - if (free_slot) - WT_ERR(__wt_log_slot_free(session, myslot.slot)); + if (LF_ISSET(WT_LOG_FLUSH)) { + /* Wait for our writes to reach the OS */ + while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 && + myslot.slot->slot_error == 0) + (void)__wt_cond_wait( + session, log->log_write_cond, 10000); } else if (LF_ISSET(WT_LOG_FSYNC)) { /* Wait for our writes to reach disk */ - while (WT_LOG_CMP(&log->sync_lsn, &lsn) <= 0 && + while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0) (void)__wt_cond_wait( session, log->log_sync_cond, 10000); - } else if (LF_ISSET(WT_LOG_FLUSH)) { - /* Wait for our writes to reach the OS */ - while (WT_LOG_CMP(&log->write_lsn, &lsn) <= 0 && - myslot.slot->slot_error == 0) - (void)__wt_cond_wait( - session, log->log_write_cond, 10000); } /* * Advance the background sync LSN if needed. */ -bg: if (LF_ISSET(WT_LOG_BACKGROUND) && - WT_LOG_CMP(&session->bg_sync_lsn, &lsn) <= 0) + if (LF_ISSET(WT_LOG_BACKGROUND) && + __wt_log_cmp(&session->bg_sync_lsn, &lsn) <= 0) WT_ERR(__wt_log_background(session, &lsn)); -err: if (locked) - __wt_spin_unlock(session, &log->log_slot_lock); +err: if (ret == 0 && lsnp != NULL) *lsnp = lsn; /* diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c index 07878d1ae1e..a1a68557f93 100644 --- a/src/third_party/wiredtiger/src/log/log_slot.c +++ b/src/third_party/wiredtiger/src/log/log_slot.c @@ -9,325 +9,493 @@ #include "wt_internal.h" /* - * This file implements the consolidated array algorithm as described in - * the paper: - * Scalability of write-ahead logging on multicore and multisocket hardware - * by Ryan Johnson, Ippokratis Pandis, Radu Stoica, Manos Athanassoulis - * and Anastasia Ailamaki. - * - * It appeared in The VLDB Journal, DOI 10.1007/s00778-011-0260-8 and can - * be found at: - * http://infoscience.epfl.ch/record/170505/files/aether-smpfulltext.pdf + * __wt_log_slot_activate -- + * Initialize a slot to become active. */ - -/* - * __wt_log_slot_init -- - * Initialize the slot array. - */ -int -__wt_log_slot_init(WT_SESSION_IMPL *session) +void +__wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { WT_CONNECTION_IMPL *conn; - WT_DECL_RET; WT_LOG *log; - WT_LOGSLOT *slot; - int32_t i; conn = S2C(session); log = conn->log; - WT_CACHE_LINE_ALIGNMENT_VERIFY(session, log->slot_pool); - for (i = 0; i < WT_SLOT_POOL; i++) { - log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE; - log->slot_pool[i].slot_index = WT_SLOT_INVALID_INDEX; - } - - /* - * Set up the available slots from the pool the first time. 
- */ - for (i = 0; i < WT_SLOT_ACTIVE; i++) { - slot = &log->slot_pool[i]; - slot->slot_index = (uint32_t)i; - slot->slot_state = WT_LOG_SLOT_READY; - log->slot_array[i] = slot; - } - - /* - * Allocate memory for buffers now that the arrays are setup. Split - * this out to make error handling simpler. - * - * Cap the slot buffer to the log file size. - */ - log->slot_buf_size = - WT_MIN((size_t)conn->log_file_max, WT_LOG_SLOT_BUF_SIZE); - for (i = 0; i < WT_SLOT_POOL; i++) { - WT_ERR(__wt_buf_init(session, - &log->slot_pool[i].slot_buf, log->slot_buf_size)); - F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS); - } - WT_STAT_FAST_CONN_INCRV(session, - log_buffer_size, log->slot_buf_size * WT_SLOT_POOL); - if (0) { -err: while (--i >= 0) - __wt_buf_free(session, &log->slot_pool[i].slot_buf); - } - return (ret); + slot->slot_state = 0; + slot->slot_start_lsn = slot->slot_end_lsn = log->alloc_lsn; + slot->slot_start_offset = log->alloc_lsn.offset; + slot->slot_last_offset = log->alloc_lsn.offset; + slot->slot_fh = log->log_fh; + slot->slot_error = 0; + slot->slot_unbuffered = 0; } /* - * __wt_log_slot_destroy -- - * Clean up the slot array on shutdown. + * __wt_log_slot_close -- + * Close out the slot the caller is using. The slot may already be + * closed or freed by another thread. */ int -__wt_log_slot_destroy(WT_SESSION_IMPL *session) +__wt_log_slot_close( + WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *releasep, int forced) { WT_CONNECTION_IMPL *conn; WT_LOG *log; - int i; + int64_t end_offset, new_state, old_state; + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); conn = S2C(session); log = conn->log; - - for (i = 0; i < WT_SLOT_POOL; i++) - __wt_buf_free(session, &log->slot_pool[i].slot_buf); + if (releasep != NULL) + *releasep = 0; + if (slot == NULL) + return (WT_NOTFOUND); +retry: + old_state = slot->slot_state; + /* + * If this close is coming from a forced close and a thread is in + * the middle of using the slot, return EBUSY. The caller can + * decide if retrying is necessary or not. + */ + if (forced && WT_LOG_SLOT_INPROGRESS(old_state)) + return (EBUSY); + /* + * If someone else is switching out this slot we lost. Nothing to + * do but return. Return WT_NOTFOUND anytime the given slot was + * processed by another closing thread. Only return 0 when we + * actually closed the slot. + */ + if (WT_LOG_SLOT_CLOSED(old_state)) + return (WT_NOTFOUND); + /* + * If someone completely processed this slot, we're done. + */ + if (FLD64_ISSET((uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED)) + return (WT_NOTFOUND); + new_state = (old_state | WT_LOG_SLOT_CLOSE); + /* + * Close this slot. If we lose the race retry. + */ + if (!__wt_atomic_casiv64(&slot->slot_state, old_state, new_state)) + goto retry; + /* + * We own the slot now. No one else can join. + * Set the end LSN. + */ + WT_STAT_FAST_CONN_INCR(session, log_slot_closes); + if (WT_LOG_SLOT_DONE(new_state) && releasep != NULL) + *releasep = 1; + slot->slot_end_lsn = slot->slot_start_lsn; + end_offset = + WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered; + slot->slot_end_lsn.offset += (wt_off_t)end_offset; + WT_STAT_FAST_CONN_INCRV(session, + log_slot_consolidated, end_offset); + /* + * XXX Would like to change so one piece of code advances the LSN. + */ + log->alloc_lsn = slot->slot_end_lsn; + WT_ASSERT(session, log->alloc_lsn.file >= log->write_lsn.file); return (0); } /* - * __wt_log_slot_join -- - * Join a consolidated logging slot. 
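__wt_log_slot_close, just above, closes a slot by OR-ing a close bit into the shared state word with a compare-and-swap loop: joiners race with it by atomically adding their sizes to the same word, so a lost CAS just means the state moved and the close must retry with the fresh value. A freestanding model of that loop (the bit position is illustrative, not WiredTiger's actual encoding):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define SLOT_CLOSE ((int64_t)1 << 62)  /* Illustrative close bit. */

static bool
slot_close(_Atomic int64_t *state)
{
    int64_t old_state;

    do {
        old_state = atomic_load(state);
        if (old_state & SLOT_CLOSE)
            return (false);  /* Another thread closed it first. */
    } while (!atomic_compare_exchange_weak(
        state, &old_state, old_state | SLOT_CLOSE));
    return (true);  /* We own the close; no one else can join. */
}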
Callers should be prepared to deal - * with a ENOMEM return - which indicates no slots could accommodate - * the log record. + * __wt_log_slot_switch_internal -- + * Switch out the current slot and set up a new one. */ int -__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, - uint32_t flags, WT_MYSLOT *myslotp) +__wt_log_slot_switch_internal(WT_SESSION_IMPL *session, WT_MYSLOT *myslot) { - WT_CONNECTION_IMPL *conn; + WT_DECL_RET; WT_LOG *log; - WT_LOGSLOT *slot; - int64_t new_state, old_state; - uint32_t allocated_slot, slot_attempts; + int release; +#ifdef HAVE_DIAGNOSTIC + int64_t r, state; + int32_t j; +#endif - conn = S2C(session); - log = conn->log; - slot_attempts = 0; + log = S2C(session)->log; + release = 0; + + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); - if (mysize >= (uint64_t)log->slot_buf_size) { - WT_STAT_FAST_CONN_INCR(session, log_slot_toobig); - return (ENOMEM); - } -find_slot: -#if WT_SLOT_ACTIVE == 1 - allocated_slot = 0; -#else - allocated_slot = __wt_random(&session->rnd) % WT_SLOT_ACTIVE; -#endif - /* - * Get the selected slot. Use a barrier to prevent the compiler from - * caching this read. - */ - WT_BARRIER(); - slot = log->slot_array[allocated_slot]; -join_slot: /* - * Read the current slot state. Use a barrier to prevent the compiler - * from caching this read. + * If someone else raced us to closing this specific slot, we're + * done here. */ - WT_BARRIER(); - old_state = slot->slot_state; + if (myslot->slot != log->active_slot) + return (0); + /* - * WT_LOG_SLOT_READY and higher means the slot is available for - * joining. Any other state means it is in use and transitioning - * from the active array. + * If close returns WT_NOTFOUND, it means that someone else is + * processing the slot change. However, we could have retried + * from a busy time creating a new slot. If so, we are that + * someone else and we need to try setting up a new slot again. */ - if (old_state < WT_LOG_SLOT_READY) { - WT_STAT_FAST_CONN_INCR(session, log_slot_transitions); - goto find_slot; + if (!F_ISSET(myslot, WT_MYSLOT_CLOSE)) { + ret = __wt_log_slot_close(session, myslot->slot, &release, 0); + if (ret == WT_NOTFOUND) + return (0); } + /* - * Add in our size to the state and then atomically swap that - * into place if it is still the same value. + * Only mainline callers use switch. Our size should be in join + * and we have not yet released, so we should never think release + * should be done now. */ - new_state = old_state + (int64_t)mysize; - if (new_state < old_state) { - /* Our size doesn't fit here. */ - WT_STAT_FAST_CONN_INCR(session, log_slot_toobig); - goto find_slot; - } + WT_ASSERT(session, release == 0); + WT_ASSERT(session, ret == 0); + /* - * If the slot buffer isn't big enough to hold this update, try - * to find another slot. + * Set that we have closed this slot because we may call in here + * multiple times if we retry creating a new slot. */ - if (new_state > (int64_t)slot->slot_buf.memsize) { - if (++slot_attempts > 5) { - WT_STAT_FAST_CONN_INCR(session, log_slot_toosmall); - return (ENOMEM); - } - goto find_slot; - } + F_SET(myslot, WT_MYSLOT_CLOSE); +#ifdef HAVE_DIAGNOSTIC + state = myslot->slot->slot_state; + j = WT_LOG_SLOT_JOINED(state); + r = WT_LOG_SLOT_RELEASED(state); + WT_ASSERT(session, j > r); +#endif + WT_RET(__wt_log_slot_new(session)); + return (0); +} + +/* + * __wt_log_slot_switch -- + * Switch out the current slot and set up a new one. 
+ */ +int +__wt_log_slot_switch(WT_SESSION_IMPL *session, WT_MYSLOT *myslot) +{ + WT_DECL_RET; + WT_LOG *log; + + log = S2C(session)->log; /* - * We lost a race to add our size into this slot. Check the state - * and try again. + * !!! Since the WT_WITH_SLOT_LOCK macro is a do-while loop, the + * compiler does not like it combined directly with the while loop + * here. */ - if (!__wt_atomic_casiv64(&slot->slot_state, old_state, new_state)) { - WT_STAT_FAST_CONN_INCR(session, log_slot_races); - goto join_slot; + WT_WITH_SLOT_LOCK(session, log, + ret = __wt_log_slot_switch_internal(session, myslot)); + while (ret == EBUSY) { + WT_STAT_FAST_CONN_INCR(session, log_slot_switch_busy); + __wt_yield(); + WT_WITH_SLOT_LOCK(session, log, + ret = __wt_log_slot_switch_internal(session, myslot)); } - WT_ASSERT(session, myslotp != NULL); - /* - * We joined this slot. Fill in our information to return to - * the caller. - */ - WT_STAT_FAST_CONN_INCR(session, log_slot_joins); - if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) - F_SET(slot, WT_SLOT_SYNC_DIR); - if (LF_ISSET(WT_LOG_FSYNC)) - F_SET(slot, WT_SLOT_SYNC); - myslotp->slot = slot; - myslotp->offset = (wt_off_t)old_state - WT_LOG_SLOT_READY; - return (0); + WT_ASSERT(session, ret == 0); + return (ret); } /* - * __log_slot_find_free -- - * Find and return a free log slot. + * __wt_log_slot_new -- + * Find a free slot and switch it as the new active slot. + * Must be called holding the slot lock. */ -static int -__log_slot_find_free(WT_SESSION_IMPL *session, WT_LOGSLOT **slot) +int +__wt_log_slot_new(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_LOG *log; - uint32_t pool_i; + WT_LOGSLOT *slot; + int32_t i; + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); conn = S2C(session); log = conn->log; - WT_ASSERT(session, slot != NULL); /* - * Encourage processing and moving the write LSN forward. - * That process has to walk the slots anyway, so do that - * work and let it give us the index of a free slot along - * the way. + * Although this function is single threaded, multiple threads could + * be trying to set a new active slot sequentially. If we find an + * active slot that is valid, return. */ - WT_RET(__wt_log_wrlsn(session, &pool_i, NULL)); - while (pool_i == WT_SLOT_POOL) { + if ((slot = log->active_slot) != NULL && + WT_LOG_SLOT_OPEN(slot->slot_state)) + return (0); + + /* + * Keep trying until we can find a free slot. + */ + for (;;) { + /* + * For now just restart at 0. We could use log->pool_index + * if that is inefficient. + */ + for (i = 0; i < WT_SLOT_POOL; i++) { + slot = &log->slot_pool[i]; + if (slot->slot_state == WT_LOG_SLOT_FREE) { + /* + * Make sure that the next buffer size can + * fit in the file. Proactively switch if + * it cannot. This reduces, but does not + * eliminate, log files that exceed the + * maximum file size. + * + * We want to minimize the risk of an + * error due to no space. + */ + WT_RET(__wt_log_acquire(session, + log->slot_buf_size, slot)); + /* + * We have a new, free slot to use. + * Set it as the active slot. + */ + WT_STAT_FAST_CONN_INCR(session, + log_slot_transitions); + log->active_slot = slot; + return (0); + } + } + /* + * If we didn't find any free slots signal the worker thread. 
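Finding the next active slot above is a plain linear scan over the pool, with a fallback that wakes the write-LSN worker (which releases slots) and yields before rescanning. A sketch of that loop with stand-in types; signal_worker and the pool layout are assumptions, and because the real function runs under the slot lock, plain loads and stores suffice here:

#include <sched.h>

#define POOL_SIZE	128		/* assumed pool size */

enum slot_state { SLOT_FREE, SLOT_ACTIVE, SLOT_CLOSED };

struct slot { enum slot_state state; };
struct pool { struct slot slots[POOL_SIZE]; };

extern void signal_worker(void);	/* assumed: wakes the wrlsn thread */

static struct slot *
slot_new(struct pool *pool)
{
	int i;

	for (;;) {
		for (i = 0; i < POOL_SIZE; i++)
			if (pool->slots[i].state == SLOT_FREE) {
				/* Caller holds the slot lock: plain store. */
				pool->slots[i].state = SLOT_ACTIVE;
				return (&pool->slots[i]);
			}
		signal_worker();	/* ask for slots to be released */
		sched_yield();
	}
	/* NOTREACHED */
}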
+ */ + (void)__wt_cond_signal(session, conn->log_wrlsn_cond); __wt_yield(); - WT_RET(__wt_log_wrlsn(session, &pool_i, NULL)); } - *slot = &log->slot_pool[pool_i]; - WT_ASSERT(session, (*slot)->slot_state == WT_LOG_SLOT_FREE); - return (0); + /* NOTREACHED */ } /* - * __wt_log_slot_close -- - * Close a slot and do not allow any other threads to join this slot. - * Remove this from the active slot array and move a new slot from - * the pool into its place. Set up the size of this group; - * Must be called with the logging spinlock held. + * __wt_log_slot_init -- + * Initialize the slot array. */ int -__wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) +__wt_log_slot_init(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; + WT_DECL_RET; WT_LOG *log; - WT_LOGSLOT *newslot; - int64_t old_state; + WT_LOGSLOT *slot; + int32_t i; conn = S2C(session); log = conn->log; - /* - * Find an unused slot in the pool. - */ - WT_RET(__log_slot_find_free(session, &newslot)); + WT_CACHE_LINE_ALIGNMENT_VERIFY(session, log->slot_pool); + for (i = 0; i < WT_SLOT_POOL; i++) + log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE; /* - * Swap out the slot we're going to use and put a free one in the - * slot array in its place so that threads can use it right away. + * Allocate memory for buffers now that the arrays are setup. Split + * this out to make error handling simpler. */ - WT_STAT_FAST_CONN_INCR(session, log_slot_closes); - newslot->slot_state = WT_LOG_SLOT_READY; - newslot->slot_index = slot->slot_index; - log->slot_array[newslot->slot_index] = newslot; - old_state = - __wt_atomic_storeiv64(&slot->slot_state, WT_LOG_SLOT_PENDING); - slot->slot_group_size = (uint64_t)(old_state - WT_LOG_SLOT_READY); /* - * Note that this statistic may be much bigger than in reality, - * especially when compared with the total bytes written in - * __log_fill. The reason is that this size reflects any - * rounding up that is needed and the total bytes in __log_fill - * is the amount of user bytes. + * Cap the slot buffer to the log file size times two if needed. + * That means we try to fill to half the buffer but allow some + * extra space. + * + * !!! If the buffer size is too close to the log file size, we will + * switch log files very aggressively. Scale back the buffer for + * small log file sizes. */ + log->slot_buf_size = (uint32_t)WT_MIN( + (size_t)conn->log_file_max/10, WT_LOG_SLOT_BUF_SIZE); + for (i = 0; i < WT_SLOT_POOL; i++) { + WT_ERR(__wt_buf_init(session, + &log->slot_pool[i].slot_buf, log->slot_buf_size)); + F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS); + } WT_STAT_FAST_CONN_INCRV(session, - log_slot_consolidated, (uint64_t)slot->slot_group_size); - return (0); + log_buffer_size, log->slot_buf_size * WT_SLOT_POOL); + /* + * Set up the available slot from the pool the first time. + */ + slot = &log->slot_pool[0]; + /* + * We cannot initialize the release LSN in the activate function + * because that is called after a log file switch. + */ + slot->slot_release_lsn = log->alloc_lsn; + __wt_log_slot_activate(session, slot); + log->active_slot = slot; + + if (0) { +err: while (--i >= 0) + __wt_buf_free(session, &log->slot_pool[i].slot_buf); + } + return (ret); } /* - * __wt_log_slot_notify -- - * Notify all threads waiting for the state to be < WT_LOG_SLOT_DONE. + * __wt_log_slot_destroy -- + * Clean up the slot array on shutdown. 
*/ int -__wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) +__wt_log_slot_destroy(WT_SESSION_IMPL *session) { - WT_UNUSED(session); + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + WT_LOGSLOT *slot; + int64_t rel; + int i; + + conn = S2C(session); + log = conn->log; - slot->slot_state = - (int64_t)WT_LOG_SLOT_DONE - (int64_t)slot->slot_group_size; + /* + * Write out any remaining buffers. Free the buffer. + */ + for (i = 0; i < WT_SLOT_POOL; i++) { + slot = &log->slot_pool[i]; + if (!FLD64_ISSET( + (uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED)) { + rel = WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state); + if (rel != 0) + WT_RET(__wt_write(session, slot->slot_fh, + slot->slot_start_offset, (size_t)rel, + slot->slot_buf.mem)); + } + __wt_buf_free(session, &log->slot_pool[i].slot_buf); + } return (0); } /* - * __wt_log_slot_wait -- - * Wait for slot leader to allocate log area and tell us our log offset. + * __wt_log_slot_join -- + * Join a consolidated logging slot. Must be called with + * the read lock held. */ -int -__wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) +void +__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, + uint32_t flags, WT_MYSLOT *myslot) { - int yield_count; + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + WT_LOGSLOT *slot; + int64_t flag_state, new_state, old_state, released; + int32_t join_offset, new_join; +#ifdef HAVE_DIAGNOSTIC + int unbuf_force; +#endif - yield_count = 0; - WT_UNUSED(session); + conn = S2C(session); + log = conn->log; - while (slot->slot_state > WT_LOG_SLOT_DONE) - if (++yield_count < 1000) - __wt_yield(); - else - __wt_sleep(0, 200); - return (0); + /* + * Make sure the length cannot overflow. The caller should not + * even call this function if it doesn't fit but use direct + * writes. + */ + WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + + /* + * There should almost always be a slot open. + */ +#ifdef HAVE_DIAGNOSTIC + unbuf_force = ((++log->write_calls % 1000) == 0); +#endif + for (;;) { + WT_BARRIER(); + slot = log->active_slot; + old_state = slot->slot_state; + /* + * Try to join our size into the existing size and + * atomically write it back into the state. + */ + flag_state = WT_LOG_SLOT_FLAGS(old_state); + released = WT_LOG_SLOT_RELEASED(old_state); + join_offset = WT_LOG_SLOT_JOINED(old_state); +#ifdef HAVE_DIAGNOSTIC + if (unbuf_force || mysize > WT_LOG_SLOT_BUF_MAX) { +#else + if (mysize > WT_LOG_SLOT_BUF_MAX) { +#endif + new_join = join_offset + WT_LOG_SLOT_UNBUFFERED; + F_SET(myslot, WT_MYSLOT_UNBUFFERED); + myslot->slot = slot; + } else + new_join = join_offset + (int32_t)mysize; + new_state = (int64_t)WT_LOG_SLOT_JOIN_REL( + (int64_t)new_join, (int64_t)released, (int64_t)flag_state); + + /* + * Check if the slot is open for joining and we are able to + * swap in our size into the state. + */ + if (WT_LOG_SLOT_OPEN(old_state) && + __wt_atomic_casiv64( + &slot->slot_state, old_state, new_state)) + break; + /* + * The slot is no longer open or we lost the race to + * update it. Yield and try again. + */ + WT_STAT_FAST_CONN_INCR(session, log_slot_races); + __wt_yield(); + } + /* + * We joined this slot. Fill in our information to return to + * the caller. 
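The join loop above packs the joined and released byte counts into a single 64-bit state word, so a writer can claim buffer space with one compare-and-swap and no lock. A reduced sketch of that reservation step; the 32/32 bit split is an illustrative assumption standing in for the WT_LOG_SLOT_JOINED/WT_LOG_SLOT_RELEASED macros:

#include <stdatomic.h>
#include <stdint.h>

#define JOINED(s)	((int32_t)((s) >> 32))
#define RELEASED(s)	((int32_t)((s) & 0xffffffff))
#define MAKE(j, r)	(((int64_t)(j) << 32) | (uint32_t)(r))

/* Reserve "size" bytes in the slot; returns our write offset. */
static int32_t
slot_join(_Atomic int64_t *state, int32_t size)
{
	int64_t new_state, old_state;
	int32_t offset;

	for (;;) {
		old_state = atomic_load(state);
		offset = JOINED(old_state);
		new_state = MAKE(offset + size, RELEASED(old_state));
		if (atomic_compare_exchange_strong(state,
		    &old_state, new_state))
			return (offset);
		/* Lost the race to another joiner: re-read and retry. */
	}
}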
+ */ + if (mysize != 0) + WT_STAT_FAST_CONN_INCR(session, log_slot_joins); + if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) + F_SET(slot, WT_SLOT_SYNC_DIR); + if (LF_ISSET(WT_LOG_FSYNC)) + F_SET(slot, WT_SLOT_SYNC); + if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) { + WT_ASSERT(session, slot->slot_unbuffered == 0); + WT_STAT_FAST_CONN_INCR(session, log_slot_unbuffered); + slot->slot_unbuffered = (int64_t)mysize; + } + myslot->slot = slot; + myslot->offset = join_offset; + myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize); } /* * __wt_log_slot_release -- * Each thread in a consolidated group releases its portion to - * signal it has completed writing its piece of the log. + * signal it has completed copying its piece of the log into + * the memory buffer. */ int64_t -__wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size) +__wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size) { + WT_LOGSLOT *slot; + wt_off_t cur_offset, my_start; + int64_t my_size, rel_size; + + WT_UNUSED(session); + slot = myslot->slot; + my_start = slot->slot_start_offset + myslot->offset; + while ((cur_offset = slot->slot_last_offset) < my_start) { + /* + * Set our offset if we are larger. + */ + if (__wt_atomic_casiv64( + &slot->slot_last_offset, cur_offset, my_start)) + break; + /* + * If we raced another thread updating this, try again. + */ + WT_BARRIER(); + } /* - * Add my size into the state. When it reaches WT_LOG_SLOT_DONE - * all participatory threads have completed copying their piece. + * Add my size into the state and return the new size. */ - return (__wt_atomic_addiv64(&slot->slot_state, (int64_t)size)); + rel_size = size; + if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) + rel_size = WT_LOG_SLOT_UNBUFFERED; + my_size = (int64_t)WT_LOG_SLOT_JOIN_REL((int64_t)0, rel_size, 0); + return (__wt_atomic_addiv64(&slot->slot_state, my_size)); } /* * __wt_log_slot_free -- * Free a slot back into the pool. */ -int +void __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { - WT_UNUSED(session); /* * Make sure flags don't get retained between uses. * We have to reset them here because multiple threads may * change the flags when joining the slot. */ + WT_UNUSED(session); slot->flags = WT_SLOT_INIT_FLAGS; + slot->slot_error = 0; slot->slot_state = WT_LOG_SLOT_FREE; - return (0); } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index 674b9e6d3a8..6068bb6c559 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -134,7 +134,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) if (have_primary) { WT_ENTER_PAGE_INDEX(session); WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree, - ovfl = __wt_btree_lsm_size(session, hard_limit ? + ovfl = __wt_btree_lsm_over_size(session, hard_limit ? 2 * lsm_tree->chunk_size : lsm_tree->chunk_size)); WT_LEAVE_PAGE_INDEX(session); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_merge.c b/src/third_party/wiredtiger/src/lsm/lsm_merge.c index 40991f845e4..01a61359949 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_merge.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_merge.c @@ -512,7 +512,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) * Don't block if the cache is full: our next unit of work may be to * discard some trees to free space.
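__wt_log_slot_release above advances slot_last_offset with a lock-free maximum: retry the compare-and-swap while the shared value is still smaller than our offset, and stop as soon as it is not. The same idiom in isolation, assuming C11 atomics in place of __wt_atomic_casiv64:

#include <stdatomic.h>
#include <stdint.h>

/* Raise *shared to at least "mine"; safe against concurrent callers. */
static void
atomic_store_max(_Atomic int64_t *shared, int64_t mine)
{
	int64_t cur;

	while ((cur = atomic_load(shared)) < mine)
		if (atomic_compare_exchange_strong(shared, &cur, mine))
			break;	/* raced and lost: re-read and retry */
}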
*/ - F_SET(session, WT_SESSION_NO_CACHE_CHECK); + F_SET(session, WT_SESSION_NO_EVICTION); if (create_bloom) { if (ret == 0) @@ -632,6 +632,6 @@ err: if (locked) "Merge failed with %s", __wt_strerror(session, ret, NULL, 0))); } - F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK); + F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); return (ret); } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c index f34f0598261..46db76e099c 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -1144,7 +1144,7 @@ __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * Diagnostic: avoid deadlocks with the schema lock: if we need it for * an operation, we should already have it. */ - F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); return (0); } @@ -1157,7 +1157,7 @@ __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_DECL_RET; - F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); if ((ret = __wt_readunlock(session, lsm_tree->rwlock)) != 0) WT_PANIC_RET(session, ret, "Unlocking an LSM tree"); @@ -1177,7 +1177,7 @@ __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * Diagnostic: avoid deadlocks with the schema lock: if we need it for * an operation, we should already have it. */ - F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); return (0); } @@ -1190,7 +1190,7 @@ __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_DECL_RET; - F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); if ((ret = __wt_writeunlock(session, lsm_tree->rwlock)) != 0) WT_PANIC_RET(session, ret, "Unlocking an LSM tree"); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index 0c36c68e9f5..8eba0127b8b 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -301,17 +301,19 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, * Flush the file before checkpointing: this is the expensive part in * terms of I/O. * - * Use the special eviction isolation level to avoid interfering with - * an application checkpoint: we have already checked that all of the - * updates in this chunk are globally visible. - * - * !!! We can wait here for checkpoints and fsyncs to complete, which - * can be a long time. + * !!! + * We can wait here for checkpoints and fsyncs to complete, which can + * take a long time. */ if ((ret = __wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)) == 0) { + /* + * Set read-uncommitted: we have already checked that all of the + * updates in this chunk are globally visible, use the cheapest + * possible check in reconciliation. 
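The chunk-flush code that follows swaps the session's isolation level around the cache operation and always restores it afterwards; the cheapest visibility check is safe because the chunk's updates were already verified globally visible. A minimal sketch of that save/override/restore pattern; the enum and struct below are stand-ins, not the WT_ISO_* constants:

typedef enum { ISO_READ_UNCOMMITTED, ISO_READ_COMMITTED, ISO_SNAPSHOT } iso_t;

struct txn { iso_t isolation; };

static int
flush_with_cheap_isolation(struct txn *txn, int (*cache_op)(void))
{
	iso_t saved;
	int ret;

	saved = txn->isolation;			/* remember caller's level */
	txn->isolation = ISO_READ_UNCOMMITTED;	/* cheapest possible check */
	ret = cache_op();			/* write the leaf pages */
	txn->isolation = saved;			/* always restore */
	return (ret);
}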
+ */ saved_isolation = session->txn.isolation; - session->txn.isolation = WT_ISO_EVICTION; + session->txn.isolation = WT_ISO_READ_UNCOMMITTED; ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; WT_TRET(__wt_session_release_btree(session)); @@ -412,7 +414,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, * ourselves to get stuck creating bloom filters, the entire tree * can stall since there may be no worker threads available to flush. */ - F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK); + F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { WT_ERR(src->get_key(src, &key)); WT_ERR(__wt_bloom_insert(bloom, &key)); @@ -446,7 +448,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, err: if (bloom != NULL) WT_TRET(__wt_bloom_close(bloom)); - F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK); + F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); return (ret); } diff --git a/src/third_party/wiredtiger/src/meta/meta_apply.c b/src/third_party/wiredtiger/src/meta/meta_apply.c index 6d08ce3aa6a..315621f2ae9 100644 --- a/src/third_party/wiredtiger/src/meta/meta_apply.c +++ b/src/third_party/wiredtiger/src/meta/meta_apply.c @@ -32,7 +32,7 @@ __wt_meta_btree_apply(WT_SESSION_IMPL *session, WT_ERR(cursor->get_key(cursor, &uri)); if (!WT_PREFIX_MATCH(uri, "file:")) break; - else if (strcmp(uri, WT_METAFILE_URI) == 0) + if (strcmp(uri, WT_METAFILE_URI) == 0) continue; /* diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c index 227d0fa9a6c..8255f004dab 100644 --- a/src/third_party/wiredtiger/src/meta/meta_table.c +++ b/src/third_party/wiredtiger/src/meta/meta_table.c @@ -12,22 +12,22 @@ * __metadata_turtle -- * Return if a key's value should be taken from the turtle file. */ -static int +static bool __metadata_turtle(const char *key) { switch (key[0]) { case 'f': if (strcmp(key, WT_METAFILE_URI) == 0) - return (1); + return (true); break; case 'W': if (strcmp(key, "WiredTiger version") == 0) - return (1); + return (true); if (strcmp(key, "WiredTiger version string") == 0) - return (1); + return (true); break; } - return (0); + return (false); } /* @@ -37,6 +37,8 @@ __metadata_turtle(const char *key) int __wt_metadata_open(WT_SESSION_IMPL *session) { + WT_BTREE *btree; + if (session->meta_dhandle != NULL) return (0); @@ -45,7 +47,24 @@ __wt_metadata_open(WT_SESSION_IMPL *session) session->meta_dhandle = session->dhandle; WT_ASSERT(session, session->meta_dhandle != NULL); - /* The meta_dhandle doesn't need to stay locked -- release it. */ + /* + * Set special flags for the metadata file: eviction (the metadata file + * is in-memory and never evicted), logging (the metadata file is always + * logged if possible). + * + * Test flags before setting them so updates can't race in subsequent + * opens (the first update is safe because it's single-threaded from + * wiredtiger_open). + */ + btree = S2BT(session); + if (!F_ISSET(btree, WT_BTREE_IN_MEMORY)) + F_SET(btree, WT_BTREE_IN_MEMORY); + if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) + F_SET(btree, WT_BTREE_NO_EVICTION); + if (F_ISSET(btree, WT_BTREE_NO_LOGGING)) + F_CLR(btree, WT_BTREE_NO_LOGGING); + + /* The metadata handle doesn't need to stay locked -- release it. 
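The metadata-open change above deliberately tests each flag before setting it: after the first, single-threaded open, later openers only read the flag word and never dirty its cache line, so subsequent opens can't race on the update. The idiom in isolation, assuming a plain flags word:

#include <stdint.h>

/*
 * Set a flag only if it isn't already set; once the initial open has
 * stored the bits, every later caller takes the read-only path.
 */
static void
set_flag_once(uint32_t *flagsp, uint32_t flag)
{
	if ((*flagsp & flag) == 0)
		*flagsp |= flag;
}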
*/ return (__wt_session_release_btree(session)); } @@ -59,9 +78,9 @@ __wt_metadata_cursor( { WT_DATA_HANDLE *saved_dhandle; WT_DECL_RET; + int is_dead; const char *cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_open_cursor), config, NULL }; - int is_dead; saved_dhandle = session->dhandle; WT_ERR(__wt_metadata_open(session)); diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c index baf9b475777..7946b4ab0cc 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c @@ -41,11 +41,13 @@ err: __wt_free(session, cond); } /* - * __wt_cond_wait -- - * Wait on a mutex, optionally timing out. + * __wt_cond_wait_signal -- + * Wait on a mutex, optionally timing out. If we get it + * before the time out period expires, let the caller know. */ int -__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) +__wt_cond_wait_signal( + WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled) { struct timespec ts; WT_DECL_RET; @@ -54,6 +56,7 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) locked = 0; /* Fast path if already signalled. */ + *signalled = 1; if (__wt_atomic_addi32(&cond->waiters, 1) == 0) return (0); @@ -88,8 +91,10 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) #ifdef ETIME ret == ETIME || #endif - ret == ETIMEDOUT) + ret == ETIMEDOUT) { + *signalled = 0; ret = 0; + } (void)__wt_atomic_subi32(&cond->waiters, 1); diff --git a/src/third_party/wiredtiger/src/os_posix/os_open.c b/src/third_party/wiredtiger/src/os_posix/os_open.c index 8622bb5b4ca..ef4662aa369 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_open.c +++ b/src/third_party/wiredtiger/src/os_posix/os_open.c @@ -213,6 +213,8 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp) fh = *fhp; *fhp = NULL; + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: close", fh->name)); + __wt_spin_lock(session, &conn->fh_lock); if (fh == NULL || fh->ref == 0 || --fh->ref > 0) { __wt_spin_unlock(session, &conn->fh_lock); diff --git a/src/third_party/wiredtiger/src/os_posix/os_path.c b/src/third_party/wiredtiger/src/os_posix/os_path.c index 07b14b55b44..af28e1b3b56 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_path.c +++ b/src/third_party/wiredtiger/src/os_posix/os_path.c @@ -12,10 +12,10 @@ * __wt_absolute_path -- * Return if a filename is an absolute path. */ -int +bool __wt_absolute_path(const char *path) { - return (path[0] == '/' ? 
1 : 0); + return (path[0] == '/'); } /* diff --git a/src/third_party/wiredtiger/src/os_win/os_errno.c b/src/third_party/wiredtiger/src/os_win/os_errno.c index 097c73b5731..a9d3d521052 100644 --- a/src/third_party/wiredtiger/src/os_win/os_errno.c +++ b/src/third_party/wiredtiger/src/os_win/os_errno.c @@ -22,7 +22,7 @@ __wt_map_error_to_windows_error(int error) { Also validate we do not get any COM errors (which are negative integers) */ - WT_ASSERT(NULL, error > 0 && error > -(windows_error_offset)); + WT_ASSERT(NULL, error < 0); return (error + -(windows_error_offset)); } @@ -96,7 +96,7 @@ __wt_strerror(WT_SESSION_IMPL *session, int error, char *errbuf, size_t errlen) snprintf(errbuf, errlen, "%s", buf) > 0) return (errbuf); if (lasterror != 0 && session != NULL && - __wt_buf_set(session, &session->err, buf, strlen(buf)) == 0) + __wt_buf_fmt(session, &session->err, "%s", buf) == 0) return (session->err.data); } diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c index 565928cb863..14ca5d61282 100644 --- a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c +++ b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c @@ -37,13 +37,15 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, } /* - * __wt_cond_wait -- - * Wait on a mutex, optionally timing out. + * __wt_cond_wait_signal -- + * Wait on a mutex, optionally timing out. If we get it + * before the time out period expires, let the caller know. */ int -__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) +__wt_cond_wait_signal( + WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled) { - DWORD milliseconds; + DWORD err, milliseconds; WT_DECL_RET; uint64_t milliseconds64; int locked; @@ -51,6 +53,7 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) locked = 0; /* Fast path if already signalled. */ + *signalled = 1; if (__wt_atomic_addi32(&cond->waiters, 1) == 0) return (0); @@ -91,17 +94,25 @@ ret = SleepConditionVariableCS( &cond->cond, &cond->mtx, INFINITE); + /* + * SleepConditionVariableCS returns non-zero on success, 0 on timeout + * or failure. Check for timeout, else convert to a WiredTiger error + * value and fail. + */ if (ret == 0) { - if (GetLastError() == ERROR_TIMEOUT) { - ret = 1; - } - } + if ((err = GetLastError()) == ERROR_TIMEOUT) + *signalled = 0; + else + ret = __wt_errno(); + } else + ret = 0; (void)__wt_atomic_subi32(&cond->waiters, 1); if (locked) LeaveCriticalSection(&cond->mtx); - if (ret != 0) + + if (ret == 0) return (0); WT_RET_MSG(session, ret, "SleepConditionVariableCS"); } diff --git a/src/third_party/wiredtiger/src/os_win/os_path.c b/src/third_party/wiredtiger/src/os_win/os_path.c index 89f05e238c4..9d001e50571 100644 --- a/src/third_party/wiredtiger/src/os_win/os_path.c +++ b/src/third_party/wiredtiger/src/os_win/os_path.c @@ -12,7 +12,7 @@ * __wt_absolute_path -- * Return if a filename is an absolute path. */ -int +bool __wt_absolute_path(const char *path) { /* @@ -21,7 +21,7 @@ __wt_absolute_path(const char *path) */ if (strlen(path) >= 3 && isalpha(path[0]) && path[1] == ':') path += 2; - return (path[0] == '/' || path[0] == '\\' ?
1 : 0); + return (path[0] == '/' || path[0] == '\\'); } /* diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 6b0ca54065e..10daa8b717c 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -27,18 +27,30 @@ typedef struct { WT_ITEM dsk; /* Temporary disk-image buffer */ - /* Track whether all changes to the page are written. */ + /* + * Track start/stop write generation to decide if all changes to the + * page are written. + */ + uint32_t orig_write_gen; + + /* + * Track start/stop checkpoint generations to decide if lookaside table + * records are correct. + */ + uint64_t orig_btree_checkpoint_gen; + uint64_t orig_txn_checkpoint_gen; + + /* + * Track maximum transaction ID seen and first unwritten transaction ID. + */ uint64_t max_txn; uint64_t first_dirty_txn; - uint32_t orig_write_gen; /* - * If page updates are skipped because they are as yet unresolved, or - * the page has updates we cannot discard, the page is left "dirty": - * the page cannot be discarded and a subsequent reconciliation will - * be necessary to discard the page. + * When we can't mark the page clean (for example, checkpoint found some + * uncommitted updates), there's a leave-dirty flag. */ - int leave_dirty; + int leave_dirty; /* * Raw compression (don't get me started, as if normal reconciliation @@ -153,18 +165,12 @@ typedef struct { void *dsk; /* Split's disk image */ /* - * When busy pages get large, we need to be able to evict them - * even when they contain unresolved updates, or updates which - * cannot be evicted because of running transactions. In such - * cases, break the page into multiple blocks, write the blocks - * that can be evicted, saving lists of updates for blocks that - * cannot be evicted, then re-instantiate the blocks that cannot - * be evicted as new, in-memory pages, restoring the updates on - * those pages. + * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and + * WT_EVICT_LOOKASIDE configurations. */ - WT_UPD_SKIPPED *skip; /* Skipped updates */ - uint32_t skip_next; - size_t skip_allocated; + WT_SAVE_UPD *supd; /* Saved updates */ + uint32_t supd_next; + size_t supd_allocated; /* * The key for a row-store page; no column-store key is needed @@ -220,12 +226,14 @@ typedef struct { size_t space_avail; /* Remaining space in this chunk */ /* - * While reviewing updates for each page, we store skipped updates here, - * and then move them to per-block areas as the blocks are defined. + * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and + * WT_EVICT_LOOKASIDE configurations. While reviewing updates for each + * page, we save WT_UPDATE lists here, and then move them to per-block + * areas as the blocks are defined. 
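The supd lists above are append-only arrays grown on demand; __wt_realloc_def tracks the allocated byte count and expands it ahead of the next index. A plain-stdlib sketch of the same append idiom; the field names mirror WT_SAVE_UPD but every type here is a stand-in:

#include <stdint.h>
#include <stdlib.h>

struct save_upd {
	void *ins;			/* insert-list entry, if any */
	void *rip;			/* on-page row, if any */
	uint64_t onpage_txn;		/* transaction ID written to the page */
};

struct supd_list {
	struct save_upd *supd;
	uint32_t next;			/* next unused entry */
	size_t allocated;		/* bytes currently allocated */
};

static int
supd_save(struct supd_list *l, void *ins, void *rip, uint64_t txnid)
{
	struct save_upd *p;
	size_t bytes, need;

	need = ((size_t)l->next + 1) * sizeof(struct save_upd);
	if (need > l->allocated) {
		/* Grow geometrically so appends stay amortized O(1). */
		bytes = l->allocated == 0 ?
		    16 * sizeof(struct save_upd) : l->allocated * 2;
		if (bytes < need)
			bytes = need;
		if ((p = realloc(l->supd, bytes)) == NULL)
			return (-1);	/* ENOMEM in the real code */
		l->supd = p;
		l->allocated = bytes;
	}
	l->supd[l->next].ins = ins;
	l->supd[l->next].rip = rip;
	l->supd[l->next].onpage_txn = txnid;
	++l->next;
	return (0);
}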
*/ - WT_UPD_SKIPPED *skip; /* Skipped updates */ - uint32_t skip_next; - size_t skip_allocated; + WT_SAVE_UPD *supd; /* Saved updates */ + uint32_t supd_next; + size_t supd_allocated; /* * We don't need to keep the 0th key around on internal pages, the @@ -277,6 +285,9 @@ typedef struct { WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */ + int cache_write_lookaside; /* Used the lookaside table */ + int cache_write_restore; /* Used update/restoration */ + uint32_t tested_ref_state; /* Debugging information */ } WT_RECONCILE; @@ -318,8 +329,11 @@ static int __rec_split_row_promote( WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t); static int __rec_split_write(WT_SESSION_IMPL *, WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, int); +static int __rec_update_las( + WT_SESSION_IMPL *, WT_RECONCILE *, uint32_t, WT_BOUNDARY *); static int __rec_write_init(WT_SESSION_IMPL *, WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *); +static int __rec_write_status(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_write_wrapup_err( WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); @@ -338,31 +352,19 @@ int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags) { - WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_RECONCILE *r; - int page_lock, scan_lock, split_lock; - conn = S2C(session); page = ref->page; mod = page->modify; - page_lock = scan_lock = split_lock = 0; - - /* We're shouldn't get called with a clean page, that's an error. */ - if (!__wt_page_is_modified(page)) - WT_RET_MSG(session, WT_ERROR, - "Attempt to reconcile a clean page."); WT_RET(__wt_verbose(session, WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type))); - WT_STAT_FAST_CONN_INCR(session, rec_pages); - WT_STAT_FAST_DATA_INCR(session, rec_pages); - if (LF_ISSET(WT_EVICTING)) { - WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction); - WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction); - } + + /* We shouldn't get called with a clean page, that's an error. */ + WT_ASSERT(session, __wt_page_is_modified(page)); #ifdef HAVE_DIAGNOSTIC { @@ -386,39 +388,15 @@ __wt_reconcile(WT_SESSION_IMPL *session, r = session->reconcile; /* - * The compaction process looks at the page's modification information; - * if compaction is running, acquire the page's lock. - */ - if (conn->compact_in_memory_pass) { - WT_PAGE_LOCK(session, page); - page_lock = 1; - } - - /* - * Reconciliation reads the lists of updates, so obsolete updates cannot - * be discarded while reconciliation is in progress. - */ - for (;;) { - F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret); - if (ret == 0) - break; - __wt_yield(); - } - scan_lock = 1; - - /* - * Mark internal pages as splitting to ensure we don't deadlock when - * performing an in-memory split during a checkpoint. + * Reconciliation locks the page for three reasons: + * Reconciliation reads the lists of page updates, obsolete updates + * cannot be discarded while reconciliation is in progress; + * The compaction process reads page modification information, which + * reconciliation modifies; + * In-memory splits: reconciliation of an internal page cannot handle + * a child page splitting during the reconciliation. */ - if (WT_PAGE_IS_INTERNAL(page)) { - for (;;) { - F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret); - if (ret == 0) - break; - __wt_yield(); - } - split_lock = 1; - } + F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION); /* Reconcile the page. 
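The three old locks (page lock, scan flag, split flag) collapse above into one page-level flag taken by F_CAS_ATOMIC_WAIT: compare-and-swap the bit in, yielding between attempts. A sketch of that flag-lock, assuming C11 atomics and sched_yield in place of the WiredTiger macros:

#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>

#define PAGE_RECONCILIATION	0x01u	/* stand-in for WT_PAGE_RECONCILIATION */

static void
page_lock(_Atomic uint32_t *flags)
{
	uint32_t old;

	for (;;) {
		/* Expect the bit clear; CAS fails if someone holds it. */
		old = atomic_load(flags) & ~PAGE_RECONCILIATION;
		if (atomic_compare_exchange_weak(flags, &old,
		    old | PAGE_RECONCILIATION))
			return;
		sched_yield();		/* equivalent of __wt_yield() */
	}
}

static void
page_unlock(_Atomic uint32_t *flags)
{
	atomic_fetch_and(flags, ~PAGE_RECONCILIATION);
}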
*/ switch (page->type) { @@ -445,19 +423,34 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_ILLEGAL_VALUE_SET(session); } + /* Get the final status for the reconciliation. */ + if (ret == 0) + ret = __rec_write_status(session, r, page); + /* Wrap up the page reconciliation. */ if (ret == 0) ret = __rec_write_wrapup(session, r, page); else WT_TRET(__rec_write_wrapup_err(session, r, page)); - /* Release the locks we're holding. */ - if (split_lock) - F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED); - if (scan_lock) - F_CLR_ATOMIC(page, WT_PAGE_SCANNING); - if (page_lock) - WT_PAGE_UNLOCK(session, page); + /* Release the reconciliation lock. */ + F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); + + /* Update statistics. */ + WT_STAT_FAST_CONN_INCR(session, rec_pages); + WT_STAT_FAST_DATA_INCR(session, rec_pages); + if (LF_ISSET(WT_EVICTING)) { + WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction); + WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction); + } + if (r->cache_write_lookaside) { + WT_STAT_FAST_CONN_INCR(session, cache_write_lookaside); + WT_STAT_FAST_DATA_INCR(session, cache_write_lookaside); + } + if (r->cache_write_restore) { + WT_STAT_FAST_CONN_INCR(session, cache_write_restore); + WT_STAT_FAST_DATA_INCR(session, cache_write_restore); + } /* * Clean up the boundary structures: some workloads result in millions @@ -489,6 +482,125 @@ __wt_reconcile(WT_SESSION_IMPL *session, } /* + * __rec_las_checkpoint_test -- + * Return if the lookaside table is going to collide with a checkpoint. + */ +static inline bool +__rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + WT_CONNECTION_IMPL *conn; + WT_BTREE *btree; + + conn = S2C(session); + btree = S2BT(session); + + /* + * Running checkpoints can collide with the lookaside table because + * reconciliation using the lookaside table writes the key's last + * committed value, which might not be the value checkpoint would write. + * If reconciliation was configured for lookaside table eviction, this + * file participates in checkpoints, and any of the tree or system + * transactional generation numbers don't match, there's a possible + * collision. + * + * It's a complicated test, but the alternative is to have checkpoint + * drain lookaside table reconciliations, and this isn't a problem for + * most workloads. + */ + if (!F_ISSET(r, WT_EVICT_LOOKASIDE)) + return (false); + if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) + return (false); + if (r->orig_btree_checkpoint_gen == btree->checkpoint_gen && + r->orig_txn_checkpoint_gen == conn->txn_global.checkpoint_gen && + r->orig_btree_checkpoint_gen == r->orig_txn_checkpoint_gen) + return (false); + return (true); +} + +/* + * __rec_write_status -- + * Return the final status for reconciliation. + */ +static int +__rec_write_status(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_PAGE_MODIFY *mod; + + btree = S2BT(session); + mod = page->modify; + + /* Check for a lookaside table and checkpoint collision. */ + if (__rec_las_checkpoint_test(session, r)) + return (EBUSY); + + /* + * Set the page's status based on whether or not we cleaned the page. + */ + if (r->leave_dirty) { + /* + * Update the page's first unwritten transaction ID. + */ + mod->first_dirty_txn = r->first_dirty_txn; + + /* + * The page remains dirty. + * + * Any checkpoint call cleared the tree's modified flag before + * writing pages, so we must explicitly reset it. 
We insert a + * barrier after the change for clarity (the requirement is the + * flag be set before a subsequent checkpoint reads it, and + * as the current checkpoint is waiting on this reconciliation + * to complete, there's no risk of that happening) + */ + btree->modified = 1; + WT_FULL_BARRIER(); + + /* + * Eviction should only be here if following the save/restore + * eviction path. + */ + WT_ASSERT(session, + !F_ISSET(r, WT_EVICTING) || + F_ISSET(r, WT_EVICT_UPDATE_RESTORE)); + } else { + /* + * Track the page's maximum transaction ID (used to decide if + * we're likely to be able to evict this page in the future). + */ + mod->rec_max_txn = r->max_txn; + + /* + * Track the tree's maximum transaction ID (used to decide if + * it's safe to discard the tree). Reconciliation for eviction + * is multi-threaded, only update the tree's maximum transaction + * ID when doing a checkpoint. That's sufficient, we only care + * about the maximum transaction ID of current updates in the + * tree, and checkpoint visits every dirty page in the tree. + */ + if (!F_ISSET(r, WT_EVICTING) && + WT_TXNID_LT(btree->rec_max_txn, r->max_txn)) + btree->rec_max_txn = r->max_txn; + + /* + * The page only might be clean; if the write generation is + * unchanged since reconciliation started, it's clean. + * + * If the write generation changed, the page has been written + * since reconciliation started and remains dirty (that can't + * happen when evicting, the page is exclusively locked). + */ + if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0)) + __wt_cache_dirty_decr(session, page); + else + WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + } + + return (0); +} + +/* * __rec_root_write -- * Handle the write of a root page. */ @@ -577,7 +689,7 @@ err: __wt_page_out(session, &next); * __rec_raw_compression_config -- * Configure raw compression. */ -static inline int +static inline bool __rec_raw_compression_config( WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) { @@ -588,11 +700,11 @@ __rec_raw_compression_config( /* Check if raw compression configured. */ if (btree->compressor == NULL || btree->compressor->compress_raw == NULL) - return (0); + return (false); /* Only for row-store and variable-length column-store objects. */ if (page->type == WT_PAGE_COL_FIX) - return (0); + return (false); /* * Raw compression cannot support dictionary compression. (Technically, @@ -602,11 +714,11 @@ __rec_raw_compression_config( * that seems an unlikely use case.) */ if (btree->dictionary != 0) - return (0); + return (false); /* Raw compression cannot support prefix compression. */ if (btree->prefix_compression != 0) - return (0); + return (false); /* * Raw compression is also turned off during salvage: we can't allow @@ -614,9 +726,9 @@ __rec_raw_compression_config( * can't manipulate the page size. */ if (salvage != NULL) - return (0); + return (false); - return (1); + return (true); } /* @@ -628,10 +740,12 @@ __rec_write_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep) { WT_BTREE *btree; + WT_CONNECTION_IMPL *conn; WT_PAGE *page; WT_RECONCILE *r; btree = S2BT(session); + conn = S2C(session); page = ref->page; if ((r = *(WT_RECONCILE **)reconcilep) == NULL) { @@ -648,9 +762,59 @@ __rec_write_init(WT_SESSION_IMPL *session, F_SET(&r->dsk, WT_ITEM_ALIGNED); } + /* Reconciliation is not re-entrant, make sure that doesn't happen. */ + WT_ASSERT(session, r->ref == NULL); + /* Remember the configuration. 
*/ r->ref = ref; r->page = page; + + /* + * Save the page's write generation before reading the page. + * Save the transaction generations before reading the page. + * These are all ordered reads, but we only need one. + */ + r->orig_btree_checkpoint_gen = btree->checkpoint_gen; + r->orig_txn_checkpoint_gen = conn->txn_global.checkpoint_gen; + WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen); + + /* + * Lookaside table eviction is configured when eviction gets aggressive, + * adjust the flags for cases we don't support. + */ + if (LF_ISSET(WT_EVICT_LOOKASIDE)) { + /* + * Saving lookaside table updates into the lookaside table won't + * work. + */ + if (F_ISSET(btree, WT_BTREE_LOOKASIDE)) + LF_CLR(WT_EVICT_LOOKASIDE); + + /* + * We don't yet support fixed-length column-store combined with + * the lookaside table. It's not hard to do, but the underlying + * function that reviews which updates can be written to the + * evicted page and which updates need to be written to the + * lookaside table needs access to the original value from the + * page being evicted, and there's no code path for that in the + * case of fixed-length column-store objects. (Row-store and + * variable-width column-store objects provide a reference to + * the unpacked on-page cell for this purpose, but there isn't + * an on-page cell for fixed-length column-store objects.) For + * now, turn it off. + */ + if (page->type == WT_PAGE_COL_FIX) + LF_CLR(WT_EVICT_LOOKASIDE); + + /* + * Check for a lookaside table and checkpoint collision, and if + * we find one, turn off the lookaside file (we've gone to all + * the effort of getting exclusive access to the page, might as + * well try and evict it). + */ + if (__rec_las_checkpoint_test(session, r)) + LF_CLR(WT_EVICT_LOOKASIDE); + } r->flags = flags; /* Track if the page can be marked clean. */ @@ -668,8 +832,8 @@ __rec_write_init(WT_SESSION_IMPL *session, r->all_empty_value = 1; r->any_empty_value = 0; - /* The list of cached, skipped updates. */ - r->skip_next = 0; + /* The list of saved updates. */ + r->supd_next = 0; /* * Dictionary compression only writes repeated values once. We grow @@ -714,14 +878,11 @@ __rec_write_init(WT_SESSION_IMPL *session, r->salvage = salvage; - /* Save the page's write generation before reading the page. */ - WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen); - /* * Running transactions may update the page after we write it, so * this is the highest ID we can be confident we will see. */ - r->first_dirty_txn = S2C(session)->txn_global.last_running; + r->first_dirty_txn = conn->txn_global.last_running; return (0); } @@ -748,7 +909,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) __rec_bnd_cleanup(session, r, 1); - __wt_free(session, r->skip); + __wt_free(session, r->supd); __wt_buf_free(session, &r->k.buf); __wt_buf_free(session, &r->v.buf); @@ -784,6 +945,9 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy) if (r->bnd == NULL) return; + /* Reconciliation is not re-entrant, make sure that doesn't happen. */ + r->ref = NULL; + /* * Free the boundary structures' memory. 
In the case of normal cleanup, * discard any memory we won't reuse in the next reconciliation; in the @@ -799,7 +963,7 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy) for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) { __wt_free(session, bnd->addr.addr); __wt_free(session, bnd->dsk); - __wt_free(session, bnd->skip); + __wt_free(session, bnd->supd); __wt_buf_free(session, &bnd->key); } __wt_free(session, r->bnd); @@ -820,66 +984,84 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy) for (bnd = r->bnd, i = 0; i < last_used; ++bnd, ++i) { __wt_free(session, bnd->addr.addr); __wt_free(session, bnd->dsk); - __wt_free(session, bnd->skip); + __wt_free(session, bnd->supd); } } } /* - * __rec_skip_update_save -- - * Save a skipped WT_UPDATE list for later restoration. + * __rec_block_free -- + * Helper function to free a block. */ static int -__rec_skip_update_save( - WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip) +__rec_block_free( + WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) +{ + WT_BM *bm; + WT_BTREE *btree; + + btree = S2BT(session); + bm = btree->bm; + + return (bm->free(bm, session, addr, addr_size)); +} + +/* + * __rec_update_save -- + * Save a WT_UPDATE list for later restoration. + */ +static int +__rec_update_save(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, uint64_t txnid) { WT_RET(__wt_realloc_def( - session, &r->skip_allocated, r->skip_next + 1, &r->skip)); - r->skip[r->skip_next].ins = ins; - r->skip[r->skip_next].rip = rip; - ++r->skip_next; + session, &r->supd_allocated, r->supd_next + 1, &r->supd)); + r->supd[r->supd_next].ins = ins; + r->supd[r->supd_next].rip = rip; + r->supd[r->supd_next].onpage_txn = txnid; + ++r->supd_next; return (0); } /* - * __rec_skip_update_move -- - * Move a skipped WT_UPDATE list from the per-page cache to a specific + * __rec_update_move -- + * Move a saved WT_UPDATE list from the per-page cache to a specific * block's list. */ static int -__rec_skip_update_move( - WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_UPD_SKIPPED *skip) +__rec_update_move(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_SAVE_UPD *supd) { WT_RET(__wt_realloc_def( - session, &bnd->skip_allocated, bnd->skip_next + 1, &bnd->skip)); - bnd->skip[bnd->skip_next] = *skip; - ++bnd->skip_next; + session, &bnd->supd_allocated, bnd->supd_next + 1, &bnd->supd)); + bnd->supd[bnd->supd_next] = *supd; + ++bnd->supd_next; - skip->ins = NULL; - skip->rip = NULL; + supd->ins = NULL; + supd->rip = NULL; return (0); } /* * __rec_txn_read -- - * Return the first visible update in a list (or NULL if none are visible), - * set a flag if any updates were skipped, track the maximum transaction ID on - * the page. + * Return the update in a list that should be written (or NULL if none can + * be written). 
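The body that follows boils down to one update-selection rule: eviction may write any committed update, checkpoint may write only an update visible to its snapshot, and anything passed over on the way forces the page to stay dirty. A reduced sketch of that selection; the visibility predicates are assumed stand-ins for __wt_txn_committed and __wt_txn_visible:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define TXN_ABORTED	UINT64_MAX	/* stand-in for WT_TXN_ABORTED */

struct update { uint64_t txnid; struct update *next; };

extern bool txn_committed(uint64_t txnid);	/* assumed predicates */
extern bool txn_visible(uint64_t txnid);

static struct update *
pick_update(struct update *list, bool evicting, bool *skippedp)
{
	struct update *chosen, *upd;

	chosen = NULL;
	*skippedp = false;
	for (upd = list; upd != NULL; upd = upd->next) {
		if (upd->txnid == TXN_ABORTED)
			continue;
		if (evicting) {
			/* Eviction: any committed update will do. */
			if (txn_committed(upd->txnid)) {
				if (chosen == NULL)
					chosen = upd;
			} else
				*skippedp = true;
		} else if (chosen == NULL) {
			/* Checkpoint: must be visible to our snapshot. */
			if (txn_visible(upd->txnid))
				chosen = upd;
			else
				*skippedp = true;
		}
	}
	return (chosen);
}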
*/ -static inline int +static int __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) { + WT_BTREE *btree; WT_DECL_RET; - WT_ITEM ovfl; + WT_DECL_ITEM(tmp); WT_PAGE *page; - WT_UPDATE *upd, *upd_list, *upd_ovfl; + WT_UPDATE *append, *upd, *upd_list; size_t notused; uint64_t max_txn, min_txn, txnid; - int skipped; + int append_origv, skipped; *updp = NULL; + btree = S2BT(session); page = r->page; /* @@ -893,13 +1075,16 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, } else upd_list = ins->upd; - skipped = 0; - for (max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list; - upd != NULL; upd = upd->next) { + for (skipped = 0, + max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, + upd = upd_list; upd != NULL; upd = upd->next) { if ((txnid = upd->txnid) == WT_TXN_ABORTED) continue; - /* Track the largest/smallest transaction IDs on the list. */ + /* + * Track the largest/smallest transaction IDs on the list and + * the smallest not-globally-visible transaction on the page. + */ if (WT_TXNID_LT(max_txn, txnid)) max_txn = txnid; if (WT_TXNID_LT(txnid, min_txn)) @@ -909,132 +1094,231 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, r->first_dirty_txn = txnid; /* - * Record whether any updates were skipped on the way to finding - * the first visible update. - * - * If updates were skipped before the one being written, future - * reads without intervening modifications to the page could - * see a different value; if no updates were skipped, the page - * can safely be marked clean and does not need to be - * reconciled until modified again. + * Find the first update we can use. */ - if (*updp == NULL) { - if (__wt_txn_visible(session, txnid)) - *updp = upd; - else + if (F_ISSET(r, WT_EVICTING)) { + /* + * Eviction can write any committed update. + * + * When reconciling for eviction, track whether any + * uncommitted updates are found. + */ + if (__wt_txn_committed(session, txnid)) { + if (*updp == NULL) + *updp = upd; + } else skipped = 1; + } else { + /* + * Checkpoint can only write updates visible as of its + * snapshot. + * + * When reconciling for a checkpoint, track whether any + * updates were skipped on the way to finding the first + * visible update. + */ + if (*updp == NULL) { + if (__wt_txn_visible(session, txnid)) + *updp = upd; + else + skipped = 1; + } } } /* + * If all of the updates were aborted, quit. This test is not strictly + * necessary because the above loop exits with skipped not set and the + * maximum transaction left at its initial value of WT_TXN_NONE, so + * the test below will be branch true and return, but it's cheap and a + * little more explicit, and makes Coverity happy. + */ + if (max_txn == WT_TXN_NONE) + return (0); + + /* * Track the maximum transaction ID in the page. We store this in the - * page at the end of reconciliation if no updates are skipped, it's - * used to avoid evicting clean pages from memory with changes required - * to satisfy a snapshot read. + * tree at the end of reconciliation in the service of checkpoints, it + * is used to avoid discarding trees from memory when they have changes + * required to satisfy a snapshot read. */ if (WT_TXNID_LT(r->max_txn, max_txn)) r->max_txn = max_txn; /* - * If no updates were skipped and all updates are globally visible, the - * page can be marked clean and we're done, regardless of whether we're - * evicting or checkpointing. 
+ * If there are no skipped updates and all updates are globally visible, + * the page can be marked clean and we're done, regardless if evicting + * or checkpointing. * * We have to check both: the oldest transaction ID may have moved while - * we were scanning the update list, so it is possible to skip an update - * but then find that by the end of the scan, all updates are stable. + * we were scanning the update list, so it is possible to find a skipped + * update, but then find all updates are stable at the end of the scan. + * + * Skip the visibility check for the lookaside table as a special-case, + * we know there are no older readers of that table. */ - if (!skipped && __wt_txn_visible_all(session, max_txn)) + if (!skipped && + (F_ISSET(btree, WT_BTREE_LOOKASIDE) || + __wt_txn_visible_all(session, max_txn))) return (0); /* - * If some updates are not globally visible, or were skipped, the page - * cannot be marked clean. + * In some cases, there had better not be skipped updates or updates not + * yet globally visible. */ - r->leave_dirty = 1; - - /* If we're not evicting, we're done, we know what we'll write. */ - if (!F_ISSET(r, WT_EVICTING)) - return (0); - - /* In some cases, there had better not be any updates we can't write. */ - if (F_ISSET(r, WT_SKIP_UPDATE_ERR)) + if (F_ISSET(r, WT_VISIBILITY_ERR)) WT_PANIC_RET(session, EINVAL, - "reconciliation illegally skipped an update"); + "reconciliation error, uncommitted update or update not " + "globally visible"); /* - * If evicting and we aren't able to save/restore the not-yet-visible - * updates, the page can't be evicted. + * If not trying to evict the page, we know what we'll write and we're + * done. Because some updates were skipped or are not globally visible, + * the page can't be marked clean. */ - if (!F_ISSET(r, WT_SKIP_UPDATE_RESTORE)) - return (EBUSY); + if (!F_ISSET(r, WT_EVICTING)) { + r->leave_dirty = 1; + return (0); + } /* - * Evicting a page with not-yet-visible updates: save and restore the - * list of updates on a newly instantiated page. - * - * The order of the updates on the list matters so we can't move only - * the unresolved updates, we have to move the entire update list. + * Evicting with either uncommitted changes or not-yet-globally-visible + * changes. There are two ways to continue, the save/restore eviction + * path or the lookaside table eviction path. Both cannot be configured + * because the paths track different information. The save/restore path + * can handle both uncommitted and not-yet-globally-visible changes, by + * evicting most of the page and then creating a new, smaller page into + * which we re-instantiate those changes. The lookaside table path can + * only handle not-yet-globally-visible changes by writing those changes + * into the lookaside table and restoring them on demand if and when the + * page is read back into memory. * - * Clear the returned update so our caller ignores the key/value pair - * in the case of an insert/append entry (everything we need is in the - * update list), and otherwise writes the original on-page key/value - * pair to which the update list applies. + * Both paths are configured outside of reconciliation: the save/restore + * path is the WT_EVICT_UPDATE_RESTORE flag, the lookaside table path is + * the WT_EVICT_LOOKASIDE flag. 
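The choice between the two eviction strategies above distills to a small decision table: if neither path is configured, or only the lookaside path is configured and an uncommitted update was skipped, eviction must give up with EBUSY. A sketch under assumed boolean parameters mirroring the WT_EVICT_UPDATE_RESTORE and WT_EVICT_LOOKASIDE flags:

#include <errno.h>
#include <stdbool.h>

static int
eviction_path_check(bool uncommitted_skipped, bool lookaside, bool restore)
{
	if (!lookaside && !restore)
		return (EBUSY);		/* neither path configured */
	if (uncommitted_skipped && !restore)
		return (EBUSY);		/* lookaside can't hold uncommitted */
	return (0);			/* eviction can proceed */
}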
*/ - *updp = NULL; + if (!F_ISSET(r, WT_EVICT_LOOKASIDE | WT_EVICT_UPDATE_RESTORE)) + return (EBUSY); + if (skipped && !F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) + return (EBUSY); + + append_origv = 0; + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) { + /* + * The save/restore eviction path. + * + * Clear the returned update so our caller ignores the key/value + * pair in the case of an insert/append list entry (everything + * we need is in the update list), and otherwise writes the + * original on-page key/value pair to which the update list + * applies. + */ + *updp = NULL; + + /* The page can't be marked clean. */ + r->leave_dirty = 1; + + /* + * A special-case for overflow values, where we can't write the + * original on-page value item to disk because it's been updated + * or removed. + * + * What happens is that an overflow value is updated or removed + * and its backing blocks freed. If any reader in the system + * might still want the value, a copy was cached in the page + * reconciliation tracking memory, and the page cell set to + * WT_CELL_VALUE_OVFL_RM. Eviction then chose the page and + * we're splitting it up in order to push parts of it out of + * memory. + * + * We could write the original on-page value item to disk... if + * we had a copy. The cache may not have a copy (a globally + * visible update would have kept a value from being cached), or + * an update that subsequently became globally visible could + * cause a cached value to be discarded. Either way, once there + * is a globally visible update, we may not have the original + * value. + * + * Fortunately, if there's a globally visible update we don't + * care about the original version, so we simply ignore it, no + * transaction can ever try and read it. If there isn't a + * globally visible update, there had better be a cached value. + * + * In the latter case, we could write the value out to disk, but + * (1) we are planning on re-instantiating this page in memory, + * it isn't going to disk, and (2) the value item is eventually + * going to be discarded, that seems like a waste of a write. + * Instead, find the cached value and append it to the update + * list we're saving for later restoration. + */ + if (vpack != NULL && + vpack->raw == WT_CELL_VALUE_OVFL_RM && + !__wt_txn_visible_all(session, min_txn)) + append_origv = 1; + } else { + /* + * The lookaside table eviction path. + * + * If at least one update is globally visible, copy the update + * list and ignore the current on-page value. If no update is + * globally visible, readers require the page's original value. + */ + if (!__wt_txn_visible_all(session, min_txn)) + append_origv = 1; + } /* - * Handle the case were we don't want to write an original on-page value - * item to disk because it's been updated or removed. - * - * Here's the deal: an overflow value was updated or removed and its - * backing blocks freed. If any transaction in the system might still - * read the value, a copy was cached in page reconciliation tracking - * memory, and the page cell set to WT_CELL_VALUE_OVFL_RM. Eviction - * then chose the page and we're splitting it up in order to push parts - * of it out of memory. - * - * We could write the original on-page value item to disk... if we had - * a copy. The cache may not have a copy (a globally visible update - * would have kept a value from ever being cached), or an update that - * subsequent became globally visible could cause a cached value to be - * discarded. 
Either way, once there's a globally visible update, we - * may not have the value. - * - * Fortunately, if there's a globally visible update we don't care about - * the original version, so we simply ignore it, no transaction can ever - * try and read it. If there isn't a globally visible update, there had - * better be a cached value. - * - * In the latter case, we could write the value out to disk, but (1) we - * are planning on re-instantiating this page in memory, it isn't going - * to disk, and (2) the value item is eventually going to be discarded, - * that seems like a waste of a write. Instead, find the cached value - * and append it to the update list we're saving for later restoration. - */ - if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM && - !__wt_txn_visible_all(session, min_txn)) { - if ((ret = __wt_ovfl_txnc_search( - page, vpack->data, vpack->size, &ovfl)) != 0) - WT_PANIC_RET(session, ret, - "cached overflow item discarded early"); + * We need the original on-page value for some reason: get a copy and + * append it to the end of the update list with a transaction ID that + * guarantees its visibility. + */ + if (append_origv) { + /* + * If we don't have a value cell, it's an insert/append list + * key/value pair which simply doesn't exist for some reader; + * place a deleted record at the end of the update list. + */ + if (vpack == NULL || vpack->type == WT_CELL_DEL) + WT_RET(__wt_update_alloc( + session, NULL, &append, &notused)); + else { + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + if ((ret = __wt_page_cell_data_ref( + session, page, vpack, tmp)) == 0) + ret = __wt_update_alloc( + session, tmp, &append, &notused); + __wt_scr_free(session, &tmp); + WT_RET(ret); + } /* - * Create an update structure with an impossibly low transaction - * ID and append it to the update list we're about to save. - * Restoring that update list when this page is re-instantiated - * creates an update for the key/value pair visible to every - * running transaction in the system, ensuring the on-page value - * will be ignored. + * Give the entry an impossibly low transaction ID to ensure its + * global visibility, and append it to the update list. + * + * Note the change to the actual reader-accessible update list: + * from now on, the original on-page value appears at the end + * of the update list, even if this reconciliation subsequently + * fails. */ - WT_RET(__wt_update_alloc(session, &ovfl, &upd_ovfl, &notused)); - upd_ovfl->txnid = WT_TXN_NONE; + append->txnid = WT_TXN_NONE; for (upd = upd_list; upd->next != NULL; upd = upd->next) ; - upd->next = upd_ovfl; + upd->next = append; } - return (__rec_skip_update_save(session, r, ins, rip)); + /* + * The order of the updates on the list matters: we can't move only the + * unresolved updates, we have to move the entire update list. + * + * If we skipped updates, the transaction value is never used. If we + * didn't skip updates, the list of updates is eventually written to + * the lookaside table, and associated with each update record is the + * transaction ID of the update we wrote in the reconciled page; once + * that transaction ID is globally visible, we know we no longer need + * the lookaside table records, allowing them to be discarded. + */ + return (__rec_update_save(session, + r, ins, rip, (*updp == NULL) ? WT_TXN_NONE : (*updp)->txnid)); } /* @@ -1155,10 +1439,10 @@ __rec_child_modify(WT_SESSION_IMPL *session, * If called during checkpoint, acquire a hazard pointer * so the child isn't evicted, it's an in-memory case.
* - * This call cannot return split/restart, dirty page - * eviction is shutout during checkpoint, all splits in - * process will have completed before we walk any pages - * for checkpoint. + * This call cannot return split/restart, eviction of + * pages that split into their parent is shut out during + * checkpoint; all splits in process will have completed + * before we walk any pages for checkpoint. */ ret = __wt_page_in(session, ref, WT_READ_CACHE | WT_READ_NO_EVICT | @@ -1215,7 +1499,7 @@ in_memory: * reason to write the cell. */ mod = ref->page->modify; - if (mod != NULL && mod->flags != 0) + if (mod != NULL && F_ISSET(mod, WT_PM_REC_MASK)) *statep = WT_CHILD_MODIFIED; else if (ref->addr == NULL) { *statep = WT_CHILD_IGNORE; @@ -1234,37 +1518,32 @@ static int __rec_child_deleted( WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, int *statep) { - WT_BM *bm; WT_PAGE_DELETED *page_del; size_t addr_size; const uint8_t *addr; - bm = S2BT(session)->bm; page_del = ref->page_del; /* * Internal pages with child leaf pages in the WT_REF_DELETED state are * a special case during reconciliation. First, if the deletion was a * result of a session truncate call, the deletion may not be visible to - * us. In that case, we proceed as with any change that's not visible - * during reconciliation by setting the skipped flag and ignoring the - * change for the purposes of writing the internal page. + * us. In that case, we proceed as with any change not visible during + * reconciliation by ignoring the change for the purposes of writing the + * internal page. * * In this case, there must be an associated page-deleted structure, and * it holds the transaction ID we care about. + * + * In some cases, there had better not be any updates we can't see. */ - if (page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) { - /* - * In some cases, there had better not be any updates we can't - * write. - */ - if (F_ISSET(r, WT_SKIP_UPDATE_ERR)) - WT_PANIC_RET(session, EINVAL, - "reconciliation illegally skipped an update"); - } + if (F_ISSET(r, WT_VISIBILITY_ERR) && + page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) + WT_PANIC_RET(session, EINVAL, + "reconciliation illegally skipped an update"); /* - * The deletion is visible to us, deal with any underlying disk blocks. + * Deal with any underlying disk blocks. * * First, check to see if there is an address associated with this leaf: * if there isn't, we're done, the underlying page is already gone.
If @@ -1291,7 +1570,7 @@ __rec_child_deleted( (page_del == NULL || __wt_txn_visible_all(session, page_del->txnid))) { WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); - WT_RET(bm->free(bm, session, addr, addr_size)); + WT_RET(__rec_block_free(session, addr, addr_size)); if (__wt_off_page(ref->home, ref->addr)) { __wt_free(session, ((WT_ADDR *)ref->addr)->addr); @@ -1562,7 +1841,7 @@ static void __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) { bnd->offset = 0; - bnd->recno = 0; + bnd->recno = WT_RECNO_OOB; bnd->entries = 0; __wt_free(session, bnd->addr.addr); @@ -1571,9 +1850,9 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) bnd->cksum = 0; __wt_free(session, bnd->dsk); - __wt_free(session, bnd->skip); - bnd->skip_next = 0; - bnd->skip_allocated = 0; + __wt_free(session, bnd->supd); + bnd->supd_next = 0; + bnd->supd_allocated = 0; /* * Don't touch the key, we re-use that memory in each new @@ -1775,9 +2054,13 @@ __rec_split_init(WT_SESSION_IMPL *session, * __rec_is_checkpoint -- * Return if we're writing a checkpoint. */ -static int -__rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd) +static bool +__rec_is_checkpoint(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_BOUNDARY *bnd) { + WT_BTREE *btree; + + btree = S2BT(session); + /* * Check to see if we're going to create a checkpoint. * @@ -1792,13 +2075,14 @@ __rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd) * we don't do checkpoint writes here; clear the boundary information as * a reminder and create the checkpoint during wrapup. */ - if (bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) { + if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT) && + bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) { bnd->addr.addr = NULL; bnd->addr.size = 0; bnd->addr.type = 0; - return (1); + return (true); } - return (0); + return (false); } /* @@ -1841,7 +2125,7 @@ __rec_split_row_promote( WT_DECL_ITEM(update); WT_DECL_RET; WT_ITEM *max; - WT_UPD_SKIPPED *skip; + WT_SAVE_UPD *supd; size_t cnt, len, size; uint32_t i; const uint8_t *pa, *pb; @@ -1892,36 +2176,37 @@ __rec_split_row_promote( * the last key and smaller than the current key. */ max = r->last; - for (i = r->skip_next; i > 0; --i) { - skip = &r->skip[i - 1]; - if (skip->ins == NULL) - WT_ERR(__wt_row_leaf_key( - session, r->page, skip->rip, update, 0)); - else { - update->data = WT_INSERT_KEY(skip->ins); - update->size = WT_INSERT_KEY_SIZE(skip->ins); - } + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) + for (i = r->supd_next; i > 0; --i) { + supd = &r->supd[i - 1]; + if (supd->ins == NULL) + WT_ERR(__wt_row_leaf_key( + session, r->page, supd->rip, update, 0)); + else { + update->data = WT_INSERT_KEY(supd->ins); + update->size = WT_INSERT_KEY_SIZE(supd->ins); + } - /* Compare against the current key, it must be less. */ - WT_ERR(__wt_compare( - session, btree->collator, update, r->cur, &cmp)); - if (cmp >= 0) - continue; + /* Compare against the current key, it must be less. */ + WT_ERR(__wt_compare( + session, btree->collator, update, r->cur, &cmp)); + if (cmp >= 0) + continue; - /* Compare against the last key, it must be greater. */ - WT_ERR(__wt_compare( - session, btree->collator, update, r->last, &cmp)); - if (cmp >= 0) - max = update; + /* Compare against the last key, it must be greater. */ + WT_ERR(__wt_compare( + session, btree->collator, update, r->last, &cmp)); + if (cmp >= 0) + max = update; - /* - * The skipped updates are in key-sort order so the entry we're - * looking for is either the last one or the next-to-last one - * in the list. 
Once we've compared an entry against the last - * key on the page, we're done. - */ - break; - } + /* + * The saved updates are in key-sort order so the entry + * we're looking for is either the last or the next-to- + * last one in the list. Once we've compared an entry + * against the last key on the page, we're done. + */ + break; + } /* * The largest key on the last block must sort before the current key, @@ -2228,7 +2513,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, * We track the record number at each column-store split point, set an * initial value. */ - recno = 0; + recno = WT_RECNO_OOB; if (dsk->type == WT_PAGE_COL_VAR) recno = last->recno; @@ -2326,10 +2611,8 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RET(compressor->pre_size(compressor, wt_session, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP, (size_t)r->raw_offsets[slots], &result_len)); - extra_skip = 0; - if (btree->kencryptor != NULL) - extra_skip = btree->kencryptor->size_const + - WT_ENCRYPT_LEN_SIZE; + extra_skip = btree->kencryptor == NULL ? 0 : + btree->kencryptor->size_const + WT_ENCRYPT_LEN_SIZE; corrected_page_size = result_len + WT_BLOCK_COMPRESS_SKIP; WT_RET(bm->write_size(bm, session, &corrected_page_size)); @@ -2477,7 +2760,7 @@ no_slots: break; case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: - next->recno = 0; + next->recno = WT_RECNO_OOB; if (!last_block) { /* * Confirm there was uncompressed data remaining @@ -2530,7 +2813,8 @@ no_slots: * * If it's not a checkpoint, write the block. */ - if (r->bnd_next == 1 && last_block && __rec_is_checkpoint(r, last)) { + if (r->bnd_next == 1 && + last_block && __rec_is_checkpoint(session, r, last)) { if (write_ref == dst) WT_RET(__wt_buf_set( session, &r->dsk, dst->mem, dst->size)); @@ -2647,13 +2931,29 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * We only arrive here with no entries to write if the page was entirely - * empty, and if the page is empty, we merge it into its parent during - * the parent's reconciliation. A page with skipped updates isn't truly - * empty, continue on. + * We may arrive here with no entries to write if the page was entirely + * empty or if nothing on the page was visible to us. */ - if (r->entries == 0 && r->skip_next == 0) - return (0); + if (r->entries == 0) { + /* + * Pages with skipped or not-yet-globally visible updates aren't + * really empty; otherwise, the page is truly empty and we will + * merge it into its parent during the parent's reconciliation. + */ + if (r->supd_next == 0) + return (0); + + /* + * If using the save/restore eviction path, continue with the + * write, the page will be restored after we finish. + * + * If using the lookaside table eviction path, we can't continue + * (we need a page to be written, otherwise we won't ever find + * the updates for future reads). + */ + if (F_ISSET(r, WT_EVICT_LOOKASIDE)) + return (EBUSY); + } /* Set the boundary reference and increment the count. */ bnd = &r->bnd[r->bnd_next++]; @@ -2666,9 +2966,8 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk); /* If this is a checkpoint, we're done, otherwise write the page. */ - return ( - __rec_is_checkpoint(r, bnd) ? 0 : - __rec_split_write(session, r, bnd, &r->dsk, 1)); + return (__rec_is_checkpoint(session, r, bnd) ? 
+ 0 : __rec_split_write(session, r, bnd, &r->dsk, 1)); } /* @@ -2794,7 +3093,7 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_PAGE *page; WT_PAGE_HEADER *dsk; WT_PAGE_MODIFY *mod; - WT_UPD_SKIPPED *skip; + WT_SAVE_UPD *supd; size_t addr_size; uint32_t bnd_slot, i, j; int cmp; @@ -2837,23 +3136,23 @@ __rec_split_write(WT_SESSION_IMPL *session, bnd->cksum = 0; /* - * Check if we've skipped updates that belong to this block, and move - * any to the per-block structure. Quit as soon as we find a skipped + * Check if we've saved updates that belong to this block, and move + * any to the per-block structure. Quit as soon as we find a saved * update that doesn't belong to the block, they're in sorted order. * * This code requires a key be filled in for the next block (or the * last block flag be set, if there's no next block). */ - for (i = 0, skip = r->skip; i < r->skip_next; ++i, ++skip) { - /* The last block gets all remaining skipped updates. */ + for (i = 0, supd = r->supd; i < r->supd_next; ++i, ++supd) { + /* The last block gets all remaining saved updates. */ if (last_block) { - WT_ERR(__rec_skip_update_move(session, bnd, skip)); + WT_ERR(__rec_update_move(session, bnd, supd)); continue; } /* - * Get the skipped update's key and compare it with this block's - * key range. If the skipped update list belongs with the block + * Get the saved update's key and compare it with this block's + * key range. If the saved update list belongs with the block * we're about to write, move it to the per-block memory. Check * only to the first update that doesn't go with the block, they * must be in sorted order. @@ -2861,43 +3160,56 @@ __rec_split_write(WT_SESSION_IMPL *session, switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: - if (WT_INSERT_RECNO(skip->ins) >= (bnd + 1)->recno) - goto skip_check_complete; + if (WT_INSERT_RECNO(supd->ins) >= (bnd + 1)->recno) + goto supd_check_complete; break; case WT_PAGE_ROW_LEAF: - if (skip->ins == NULL) + if (supd->ins == NULL) WT_ERR(__wt_row_leaf_key( - session, page, skip->rip, key, 0)); + session, page, supd->rip, key, 0)); else { - key->data = WT_INSERT_KEY(skip->ins); - key->size = WT_INSERT_KEY_SIZE(skip->ins); + key->data = WT_INSERT_KEY(supd->ins); + key->size = WT_INSERT_KEY_SIZE(supd->ins); } WT_ERR(__wt_compare(session, btree->collator, key, &(bnd + 1)->key, &cmp)); if (cmp >= 0) - goto skip_check_complete; + goto supd_check_complete; break; WT_ILLEGAL_VALUE_ERR(session); } - WT_ERR(__rec_skip_update_move(session, bnd, skip)); + WT_ERR(__rec_update_move(session, bnd, supd)); } -skip_check_complete: +supd_check_complete: /* * If there are updates that weren't moved to the block, shuffle them to - * the beginning of the cached list (we maintain the skipped updates in - * sorted order, new skipped updates must be appended to the list). + * the beginning of the cached list (we maintain the saved updates in + * sorted order, new saved updates must be appended to the list). + */ + for (j = 0; i < r->supd_next; ++j, ++i) + r->supd[j] = r->supd[i]; + r->supd_next = j; + + /* + * If using the lookaside table eviction path and we found updates that + * weren't globally visible when reconciling this page, note that in the + * page header. 
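The WT_PAGE_LAS_UPDATE stamp set just below is what ties this write path to the read path; a minimal sketch of the read-side contract, with the instantiation helper name invented for illustration (the real read-path code is not part of this hunk):

    /*
     * Read-side sketch: only disk images stamped WT_PAGE_LAS_UPDATE ever
     * consult the lookaside table, so a block rewritten without the flag
     * can never pick up stale lookaside records.
     */
    if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE))
            WT_RET(las_page_instantiate(session, ref));	/* hypothetical */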
*/ - for (j = 0; i < r->skip_next; ++j, ++i) - r->skip[j] = r->skip[i]; - r->skip_next = j; + if (F_ISSET(r, WT_EVICT_LOOKASIDE) && bnd->supd != NULL) { + F_SET(dsk, WT_PAGE_LAS_UPDATE); + r->cache_write_lookaside = 1; + } /* - * If we had to skip updates in order to build this disk image, we can't - * actually write it. Instead, we will re-instantiate the page using the - * disk image and the list of updates we skipped. + * If using the save/restore eviction path and we had to skip updates in + * order to build this disk image, we can't actually write it. Instead, + * we will re-instantiate the page using the disk image and the list of + * updates we skipped. */ - if (bnd->skip != NULL) { + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) { + r->cache_write_restore = 1; + /* * If the buffer is compressed (raw compression was configured), * we have to decompress it so we can instantiate it later. It's @@ -2963,12 +3275,148 @@ skip_check_complete: WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr)); bnd->addr.size = (uint8_t)addr_size; + /* + * If using the lookaside table eviction path and we found updates that + * weren't globally visible when reconciling this page, copy them into + * the database's lookaside store. + */ + if (F_ISSET(r, WT_EVICT_LOOKASIDE) && bnd->supd != NULL) + ret = __rec_update_las(session, r, btree->id, bnd); + done: err: __wt_scr_free(session, &key); return (ret); } /* + * __rec_update_las -- + * Copy a set of updates into the database's lookaside buffer. + */ +static int +__rec_update_las(WT_SESSION_IMPL *session, + WT_RECONCILE *r, uint32_t btree_id, WT_BOUNDARY *bnd) +{ + WT_CURSOR *cursor; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_ITEM las_addr, las_value; + WT_PAGE *page; + WT_SAVE_UPD *list; + WT_UPDATE *upd; + uint64_t las_counter; + uint32_t i, session_flags, slot; + uint8_t *p; + + cursor = NULL; + WT_CLEAR(las_addr); + WT_CLEAR(las_value); + page = r->page; + + /* + * We're writing lookaside records: start instantiating them on pages + * we read (with the right flag set), and start sweeping the file. + */ + __wt_las_set_written(session); + + WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); + + /* Ensure enough room for a column-store key without checking. */ + WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key)); + + /* + * Each key in the lookaside table is associated with a block, and those + * blocks are freed and reallocated to other pages as pages in the tree + * are modified and reconciled. We want to be sure we don't add records + * to the lookaside table, then discard the block to which they apply, + * then write a new block to the same address, and then apply the old + * records to the new block when it's read. We don't want to clean old + * records out of the lookaside table every time we free a block because + * that happens a lot and would be costly; instead, we clean out the old + * records when adding new records into the lookaside table. This works + * because we only read from the lookaside table for pages marked with + * the WT_PAGE_LAS_UPDATE flag: that flag won't be set if we rewrite a + * block with no lookaside records, so the lookaside table won't be + * checked when the block is read, even if there are lookaside table + * records matching that block. If we rewrite a block that has lookaside + * records, we'll run this code, discarding any old records that might + * exist. 
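Summarizing the records this function builds, each lookaside entry has the following shape; this is a condensation of the set_key/set_value calls appearing below, with field meanings as the surrounding comments describe them:

    /*
     * Lookaside entry layout (summary):
     *	key   = btree ID, block address, per-block counter,
     *	        on-page transaction ID, source key
     *	value = update transaction ID, update size, update value
     * A zero-length value encodes a deleted (removed) record.
     */
    cursor->set_key(cursor,
        btree_id, &las_addr, ++las_counter, list->onpage_txn, key);
    cursor->set_value(cursor, upd->txnid, upd->size, &las_value);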
+ */ + WT_ERR(__wt_las_remove_block( + session, cursor, btree_id, bnd->addr.addr, bnd->addr.size)); + + /* Lookaside table key component: block address. */ + las_addr.data = bnd->addr.addr; + las_addr.size = bnd->addr.size; + + /* Enter each update in the boundary's list into the lookaside store. */ + for (las_counter = 0, i = 0, + list = bnd->supd; i < bnd->supd_next; ++i, ++list) { + /* Lookaside table key component: source key. */ + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + p = key->mem; + WT_ERR( + __wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins))); + key->size = WT_PTRDIFF(p, key->data); + + break; + case WT_PAGE_ROW_LEAF: + if (list->ins == NULL) + WT_ERR(__wt_row_leaf_key( + session, page, list->rip, key, 0)); + else { + key->data = WT_INSERT_KEY(list->ins); + key->size = WT_INSERT_KEY_SIZE(list->ins); + } + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* Lookaside table value component: update reference. */ + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + upd = list->ins->upd; + break; + case WT_PAGE_ROW_LEAF: + if (list->ins == NULL) { + slot = WT_ROW_SLOT(page, list->rip); + upd = page->pg_row_upd[slot]; + } else + upd = list->ins->upd; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* + * Walk the list of updates, storing each key/value pair into + * the lookaside table. + */ + do { + cursor->set_key(cursor, btree_id, + &las_addr, ++las_counter, list->onpage_txn, key); + + if (WT_UPDATE_DELETED_ISSET(upd)) + las_value.size = 0; + else { + las_value.data = WT_UPDATE_DATA(upd); + las_value.size = upd->size; + } + cursor->set_value( + cursor, upd->txnid, upd->size, &las_value); + + WT_ERR(cursor->insert(cursor)); + } while ((upd = upd->next) != NULL); + } + +err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); + + __wt_scr_free(session, &key); + return (ret); +} + +/* * __wt_bulk_init -- * Bulk insert initialization. */ @@ -3008,7 +3456,7 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) recno = 1; break; case BTREE_ROW: - recno = 0; + recno = WT_RECNO_OOB; break; WT_ILLEGAL_VALUE(session); } @@ -3049,6 +3497,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET(__rec_split_finish(session, r)); WT_RET(__rec_write_wrapup(session, r, r->page)); + WT_RET(__rec_write_status(session, r, r->page)); /* Mark the page's parent and the tree dirty. */ parent = r->ref->home; @@ -3824,7 +4273,7 @@ record_loop: /* * Write a placeholder. */ WT_ASSERT(session, - F_ISSET(r, WT_SKIP_UPDATE_RESTORE)); + F_ISSET(r, WT_EVICT_UPDATE_RESTORE)); data = "@"; size = 1; @@ -4207,7 +4656,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) vtype = state == WT_CHILD_PROXY ? WT_CELL_ADDR_DEL : (u_int)vpack->raw; } - __rec_cell_build_addr(r, p, size, vtype, 0); + __rec_cell_build_addr(r, p, size, vtype, WT_RECNO_OOB); CHILD_RELEASE_ERR(session, hazard, ref); /* @@ -4294,7 +4743,7 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr = &multi->addr; __rec_cell_build_addr( - r, addr->addr, addr->size, __rec_vtype(addr), 0); + r, addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB); /* Boundary: split or write the page. */ if (key->len + val->len > r->space_avail) @@ -4450,7 +4899,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, * Assert the case. 
*/ WT_ASSERT(session, - F_ISSET(r, WT_SKIP_UPDATE_RESTORE)); + F_ISSET(r, WT_EVICT_UPDATE_RESTORE)); /* * If the key is also a removed overflow item, @@ -4777,13 +5226,11 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) static int __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page) { - WT_BM *bm; WT_DECL_RET; WT_PAGE_MODIFY *mod; WT_MULTI *multi; uint32_t i; - bm = S2BT(session)->bm; mod = page->modify; /* @@ -4799,17 +5246,17 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_free(session, multi->key.ikey); break; } - if (multi->skip == NULL) { + if (multi->supd == NULL) { if (multi->addr.reuse) multi->addr.addr = NULL; else { - WT_RET(bm->free(bm, session, + WT_RET(__rec_block_free(session, multi->addr.addr, multi->addr.size)); __wt_free(session, multi->addr.addr); } } else { - __wt_free(session, multi->skip); - __wt_free(session, multi->skip_dsk); + __wt_free(session, multi->supd); + __wt_free(session, multi->supd_dsk); } } __wt_free(session, mod->mod_multi); @@ -4882,7 +5329,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) */ WT_RET(__wt_ref_info( session, ref, &addr, &addr_size, NULL)); - WT_RET(bm->free(bm, session, addr, addr_size)); + WT_RET(__rec_block_free(session, addr, addr_size)); if (__wt_off_page(ref->home, ref->addr)) { __wt_free( session, ((WT_ADDR *)ref->addr)->addr); @@ -4908,7 +5355,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * are checkpoints, and must be explicitly dropped. */ if (!__wt_ref_is_root(ref)) - WT_RET(bm->free(bm, session, + WT_RET(__rec_block_free(session, mod->mod_replace.addr, mod->mod_replace.size)); /* Discard the replacement page's address. */ @@ -4962,14 +5409,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * nothing to write. Allocate, then initialize the array of * replacement blocks. */ - if (bnd->skip != NULL) { + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) { WT_RET(__wt_calloc_def( session, r->bnd_next, &mod->mod_multi)); multi = mod->mod_multi; - multi->skip = bnd->skip; - multi->skip_entries = bnd->skip_next; - bnd->skip = NULL; - multi->skip_dsk = bnd->dsk; + multi->supd = bnd->supd; + multi->supd_entries = bnd->supd_next; + bnd->supd = NULL; + multi->supd_dsk = bnd->dsk; bnd->dsk = NULL; mod->mod_multi_entries = 1; @@ -5068,50 +5515,6 @@ err: __wt_scr_free(session, &tkey); F_SET(mod, WT_PM_REC_MULTIBLOCK); break; } - - /* - * If updates were skipped, the tree isn't clean. The checkpoint call - * cleared the tree's modified value before calling the eviction thread, - * so we must explicitly reset the tree's modified flag. We insert a - * barrier after the change for clarity (the requirement is the value - * be set before a subsequent checkpoint reads it, and because the - * current checkpoint is waiting on this reconciliation to complete, - * there's no risk of that happening). - */ - if (r->leave_dirty) { - mod->first_dirty_txn = r->first_dirty_txn; - - btree->modified = 1; - WT_FULL_BARRIER(); - } else { - /* - * If no updates were skipped, we have a new maximum transaction - * written for the page (used to decide if a clean page can be - * evicted). Set the highest transaction ID for the page. - * - * Track the highest transaction ID for the tree (used to decide - * if it's safe to discard all of the pages in the tree without - * further checking). 
Reconciliation in the service of eviction - * is multi-threaded, only update the tree's maximum transaction - * ID when doing a checkpoint. That's sufficient, we only care - * about the highest transaction ID of any update currently in - * the tree, and checkpoint visits every dirty page in the tree. - */ - mod->rec_max_txn = r->max_txn; - if (!F_ISSET(r, WT_EVICTING) && - WT_TXNID_LT(btree->rec_max_txn, r->max_txn)) - btree->rec_max_txn = r->max_txn; - - /* - * The page only might be clean; if the write generation is - * unchanged since reconciliation started, it's clean. If the - * write generation changed, the page has been written since - * we started reconciliation and remains dirty. - */ - if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0)) - __wt_cache_dirty_decr(session, page); - } - return (0); } @@ -5122,14 +5525,12 @@ err: __wt_scr_free(session, &tkey); static int __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) { - WT_BM *bm; WT_BOUNDARY *bnd; WT_DECL_RET; WT_MULTI *multi; WT_PAGE_MODIFY *mod; uint32_t i; - bm = S2BT(session)->bm; mod = page->modify; /* @@ -5160,7 +5561,7 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) if (bnd->addr.reuse) bnd->addr.addr = NULL; else { - WT_TRET(bm->free(bm, session, + WT_TRET(__rec_block_free(session, bnd->addr.addr, bnd->addr.size)); __wt_free(session, bnd->addr.addr); } @@ -5203,18 +5604,18 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_RET(__wt_row_ikey_alloc(session, 0, bnd->key.data, bnd->key.size, &multi->key.ikey)); - if (bnd->skip == NULL) { + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) { + multi->supd = bnd->supd; + multi->supd_entries = bnd->supd_next; + bnd->supd = NULL; + multi->supd_dsk = bnd->dsk; + bnd->dsk = NULL; + } else { multi->addr = bnd->addr; multi->addr.reuse = 0; multi->size = bnd->size; multi->cksum = bnd->cksum; bnd->addr.addr = NULL; - } else { - multi->skip = bnd->skip; - multi->skip_entries = bnd->skip_next; - bnd->skip = NULL; - multi->skip_dsk = bnd->dsk; - bnd->dsk = NULL; } } mod->mod_multi_entries = r->bnd_next; @@ -5243,18 +5644,18 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { multi->key.recno = bnd->recno; - if (bnd->skip == NULL) { + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) { + multi->supd = bnd->supd; + multi->supd_entries = bnd->supd_next; + bnd->supd = NULL; + multi->supd_dsk = bnd->dsk; + bnd->dsk = NULL; + } else { multi->addr = bnd->addr; multi->addr.reuse = 0; multi->size = bnd->size; multi->cksum = bnd->cksum; bnd->addr.addr = NULL; - } else { - multi->skip = bnd->skip; - multi->skip_entries = bnd->skip_next; - bnd->skip = NULL; - multi->skip_dsk = bnd->dsk; - bnd->dsk = NULL; } } mod->mod_multi_entries = r->bnd_next; diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 06786db2f6d..a1f5618a317 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -383,6 +383,22 @@ err: if (cursor != NULL) } /* + * __wt_session_create -- + * Internal version of WT_SESSION::create. 
+ */ +int +__wt_session_create( + WT_SESSION_IMPL *session, const char *uri, const char *config) +{ + WT_DECL_RET; + + WT_WITH_SCHEMA_LOCK(session, + WT_WITH_TABLE_LOCK(session, + ret = __wt_schema_create(session, uri, config))); + return (ret); +} + +/* * __session_create -- * WT_SESSION->create method. */ @@ -423,9 +439,7 @@ __session_create(WT_SESSION *wt_session, const char *uri, const char *config) WT_ERR_NOTFOUND_OK(ret); } - WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, - ret = __wt_schema_create(session, uri, config))); + ret = __wt_session_create(session, uri, config); err: API_END_RET_NOTFOUND_MAP(session, ret); } @@ -529,6 +543,21 @@ __session_compact(WT_SESSION *wt_session, const char *uri, const char *config) } /* + * __wt_session_drop -- + * Internal version of WT_SESSION::drop. + */ +int +__wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) +{ + WT_DECL_RET; + + WT_WITH_SCHEMA_LOCK(session, + WT_WITH_TABLE_LOCK(session, + ret = __wt_schema_drop(session, uri, cfg))); + return (ret); +} + +/* * __session_drop -- * WT_SESSION->drop method. */ @@ -544,9 +573,7 @@ __session_drop(WT_SESSION *wt_session, const char *uri, const char *config) /* Disallow objects in the WiredTiger name space. */ WT_ERR(__wt_str_name_check(session, uri)); - WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, - ret = __wt_schema_drop(session, uri, cfg))); + ret = __wt_session_drop(session, uri, cfg); err: /* Note: drop operations cannot be unrolled (yet?). */ API_END_RET_NOTFOUND_MAP(session, ret); @@ -915,7 +942,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) * If our LSN is smaller than the current sync LSN then our * transaction is stable. We're done. */ - if (WT_LOG_CMP(&session->bg_sync_lsn, &log->sync_lsn) <= 0) + if (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) <= 0) goto err; /* @@ -937,7 +964,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) * Keep checking the LSNs until we find it is stable or we reach * our timeout. */ - while (WT_LOG_CMP(&session->bg_sync_lsn, &log->sync_lsn) > 0) { + while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) { WT_ERR(__wt_cond_signal(session, conn->log_file_cond)); WT_ERR(__wt_epoch(session, &now)); waited_ms = WT_TIMEDIFF(now, start) / WT_MILLION; @@ -1001,7 +1028,7 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config) * operations, but checkpoint does enough I/O it may be called upon to * perform slow operations for the block manager. */ - F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK); + F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION); /* * Only one checkpoint can be active at a time, and checkpoints must run @@ -1016,7 +1043,7 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config) WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0); -err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK); +err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION); API_END_RET_NOTFOUND_MAP(session, ret); } diff --git a/src/third_party/wiredtiger/src/support/pow.c b/src/third_party/wiredtiger/src/support/pow.c index 8e42113a2ee..0f50bfe56a1 100644 --- a/src/third_party/wiredtiger/src/support/pow.c +++ b/src/third_party/wiredtiger/src/support/pow.c @@ -100,7 +100,7 @@ __wt_log2_int(uint32_t n) * __wt_ispo2 -- * Return if a number is a power-of-two. 
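Only the signature changes in the pow.c hunk below; the body the diff elides is presumably the usual two's-complement test, sketched here as an assumption (note the sketch, like most variants of this test, treats zero as a power of two):

    /* A power of two has exactly one bit set, so v & (v - 1) clears it. */
    static bool
    ispo2_sketch(uint32_t v)
    {
            return ((v & (v - 1)) == 0);
    }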
*/ -int +bool __wt_ispo2(uint32_t v) { /* diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 6af357202be..79248b0652c 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -24,6 +24,7 @@ static const char * const __stats_dsrc_desc[] = { "btree: column-store variable-size deleted values", "btree: column-store fixed-size leaf pages", "btree: column-store internal pages", + "btree: column-store variable-size RLE encoded values", "btree: column-store variable-size leaf pages", "btree: pages rewritten by compaction", "btree: number of key/value pairs", @@ -48,10 +49,14 @@ static const char * const __stats_dsrc_desc[] = { "cache: internal pages evicted", "cache: pages split during eviction", "cache: in-memory page splits", + "cache: in-memory page passed criteria to be split", "cache: overflow values cached in memory", "cache: pages read into cache", + "cache: pages read into cache requiring lookaside entries", "cache: overflow pages read into cache", "cache: pages written from cache", + "cache: page written requiring lookaside records", + "cache: pages written requiring in-memory restoration", "compression: raw compression call failed, no additional data available", "compression: raw compression call failed, additional data available", "compression: raw compression call succeeded", @@ -137,6 +142,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->btree_column_internal = 0; stats->btree_column_deleted = 0; stats->btree_column_variable = 0; + stats->btree_column_rle = 0; stats->btree_fixed_len = 0; stats->btree_maxintlkey = 0; stats->btree_maxintlpage = 0; @@ -154,15 +160,19 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cache_eviction_checkpoint = 0; stats->cache_eviction_fail = 0; stats->cache_eviction_hazard = 0; + stats->cache_inmem_splittable = 0; stats->cache_inmem_split = 0; stats->cache_eviction_internal = 0; stats->cache_eviction_dirty = 0; stats->cache_read_overflow = 0; stats->cache_overflow_value = 0; stats->cache_eviction_deepen = 0; + stats->cache_write_lookaside = 0; stats->cache_read = 0; + stats->cache_read_lookaside = 0; stats->cache_eviction_split = 0; stats->cache_write = 0; + stats->cache_write_restore = 0; stats->cache_eviction_clean = 0; stats->compress_read = 0; stats->compress_write = 0; @@ -243,6 +253,7 @@ __wt_stat_dsrc_aggregate_single( to->btree_column_internal += from->btree_column_internal; to->btree_column_deleted += from->btree_column_deleted; to->btree_column_variable += from->btree_column_variable; + to->btree_column_rle += from->btree_column_rle; to->btree_fixed_len = from->btree_fixed_len; if (from->btree_maxintlkey > to->btree_maxintlkey) to->btree_maxintlkey = from->btree_maxintlkey; @@ -266,15 +277,19 @@ __wt_stat_dsrc_aggregate_single( to->cache_eviction_checkpoint += from->cache_eviction_checkpoint; to->cache_eviction_fail += from->cache_eviction_fail; to->cache_eviction_hazard += from->cache_eviction_hazard; + to->cache_inmem_splittable += from->cache_inmem_splittable; to->cache_inmem_split += from->cache_inmem_split; to->cache_eviction_internal += from->cache_eviction_internal; to->cache_eviction_dirty += from->cache_eviction_dirty; to->cache_read_overflow += from->cache_read_overflow; to->cache_overflow_value += from->cache_overflow_value; to->cache_eviction_deepen += from->cache_eviction_deepen; + to->cache_write_lookaside += from->cache_write_lookaside; to->cache_read += from->cache_read; + 
to->cache_read_lookaside += from->cache_read_lookaside; to->cache_eviction_split += from->cache_eviction_split; to->cache_write += from->cache_write; + to->cache_write_restore += from->cache_write_restore; to->cache_eviction_clean += from->cache_eviction_clean; to->compress_read += from->compress_read; to->compress_write += from->compress_write; @@ -354,6 +369,7 @@ __wt_stat_dsrc_aggregate( to->btree_column_deleted += WT_STAT_READ(from, btree_column_deleted); to->btree_column_variable += WT_STAT_READ(from, btree_column_variable); + to->btree_column_rle += WT_STAT_READ(from, btree_column_rle); to->btree_fixed_len = from[0]->btree_fixed_len; if ((v = WT_STAT_READ(from, btree_maxintlkey)) > to->btree_maxintlkey) @@ -386,6 +402,8 @@ __wt_stat_dsrc_aggregate( to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail); to->cache_eviction_hazard += WT_STAT_READ(from, cache_eviction_hazard); + to->cache_inmem_splittable += + WT_STAT_READ(from, cache_inmem_splittable); to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split); to->cache_eviction_internal += WT_STAT_READ(from, cache_eviction_internal); @@ -394,9 +412,13 @@ __wt_stat_dsrc_aggregate( to->cache_overflow_value += WT_STAT_READ(from, cache_overflow_value); to->cache_eviction_deepen += WT_STAT_READ(from, cache_eviction_deepen); + to->cache_write_lookaside += + WT_STAT_READ(from, cache_write_lookaside); to->cache_read += WT_STAT_READ(from, cache_read); + to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside); to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split); to->cache_write += WT_STAT_READ(from, cache_write); + to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean); to->compress_read += WT_STAT_READ(from, compress_read); to->compress_write += WT_STAT_READ(from, compress_write); @@ -511,11 +533,17 @@ static const char * const __stats_connection_desc[] = { "cache: pages walked for eviction", "cache: eviction worker thread evicting pages", "cache: in-memory page splits", + "cache: in-memory page passed criteria to be split", + "cache: lookaside table insert calls", + "cache: lookaside table remove calls", "cache: percentage overhead", "cache: tracked dirty pages in the cache", "cache: pages currently held in the cache", "cache: pages read into cache", + "cache: pages read into cache requiring lookaside entries", "cache: pages written from cache", + "cache: page written requiring lookaside records", + "cache: pages written requiring in-memory restoration", "connection: pthread mutex condition wait calls", "cursor: cursor create calls", "cursor: cursor insert calls", @@ -558,9 +586,9 @@ static const char * const __stats_connection_desc[] = { "log: logging bytes consolidated", "log: consolidated slot joins", "log: consolidated slot join races", - "log: record size exceeded maximum", - "log: failed to find a slot large enough for record", + "log: busy returns attempting to switch slots", "log: consolidated slot join transitions", + "log: consolidated slot unbuffered writes", "log: log sync operations", "log: log sync_dir operations", "log: log server thread advances write LSN", @@ -667,21 +695,27 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_eviction_worker_evicting = 0; stats->cache_eviction_force_fail = 0; stats->cache_eviction_hazard = 0; + stats->cache_inmem_splittable = 0; stats->cache_inmem_split = 0; stats->cache_eviction_internal = 0; + stats->cache_lookaside_insert = 0; + 
stats->cache_lookaside_remove = 0; /* not clearing cache_bytes_max */ /* not clearing cache_eviction_maximum_page_size */ stats->cache_eviction_dirty = 0; stats->cache_eviction_deepen = 0; + stats->cache_write_lookaside = 0; /* not clearing cache_pages_inuse */ stats->cache_eviction_force = 0; stats->cache_eviction_force_delete = 0; stats->cache_eviction_app = 0; stats->cache_read = 0; + stats->cache_read_lookaside = 0; stats->cache_eviction_fail = 0; stats->cache_eviction_split = 0; stats->cache_eviction_walk = 0; stats->cache_write = 0; + stats->cache_write_restore = 0; /* not clearing cache_overhead */ /* not clearing cache_bytes_internal */ /* not clearing cache_bytes_leaf */ @@ -716,11 +750,12 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->dh_sweeps = 0; stats->dh_session_handles = 0; stats->dh_session_sweeps = 0; + stats->log_slot_switch_busy = 0; stats->log_slot_closes = 0; stats->log_slot_races = 0; stats->log_slot_transitions = 0; stats->log_slot_joins = 0; - stats->log_slot_toosmall = 0; + stats->log_slot_unbuffered = 0; stats->log_bytes_payload = 0; stats->log_bytes_written = 0; stats->log_compress_writes = 0; @@ -738,7 +773,6 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing log_prealloc_max */ stats->log_prealloc_files = 0; stats->log_prealloc_used = 0; - stats->log_slot_toobig = 0; stats->log_scan_records = 0; stats->log_compress_mem = 0; /* not clearing log_buffer_size */ @@ -835,25 +869,35 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, cache_eviction_force_fail); to->cache_eviction_hazard += WT_STAT_READ(from, cache_eviction_hazard); + to->cache_inmem_splittable += + WT_STAT_READ(from, cache_inmem_splittable); to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split); to->cache_eviction_internal += WT_STAT_READ(from, cache_eviction_internal); + to->cache_lookaside_insert += + WT_STAT_READ(from, cache_lookaside_insert); + to->cache_lookaside_remove += + WT_STAT_READ(from, cache_lookaside_remove); to->cache_bytes_max += WT_STAT_READ(from, cache_bytes_max); to->cache_eviction_maximum_page_size += WT_STAT_READ(from, cache_eviction_maximum_page_size); to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty); to->cache_eviction_deepen += WT_STAT_READ(from, cache_eviction_deepen); + to->cache_write_lookaside += + WT_STAT_READ(from, cache_write_lookaside); to->cache_pages_inuse += WT_STAT_READ(from, cache_pages_inuse); to->cache_eviction_force += WT_STAT_READ(from, cache_eviction_force); to->cache_eviction_force_delete += WT_STAT_READ(from, cache_eviction_force_delete); to->cache_eviction_app += WT_STAT_READ(from, cache_eviction_app); to->cache_read += WT_STAT_READ(from, cache_read); + to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside); to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail); to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split); to->cache_eviction_walk += WT_STAT_READ(from, cache_eviction_walk); to->cache_write += WT_STAT_READ(from, cache_write); + to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); to->cache_overhead += WT_STAT_READ(from, cache_overhead); to->cache_bytes_internal += WT_STAT_READ(from, cache_bytes_internal); to->cache_bytes_leaf += WT_STAT_READ(from, cache_bytes_leaf); @@ -888,11 +932,12 @@ __wt_stat_connection_aggregate( to->dh_sweeps += WT_STAT_READ(from, dh_sweeps); to->dh_session_handles += WT_STAT_READ(from, dh_session_handles); to->dh_session_sweeps += WT_STAT_READ(from, dh_session_sweeps); + 
to->log_slot_switch_busy += WT_STAT_READ(from, log_slot_switch_busy); to->log_slot_closes += WT_STAT_READ(from, log_slot_closes); to->log_slot_races += WT_STAT_READ(from, log_slot_races); to->log_slot_transitions += WT_STAT_READ(from, log_slot_transitions); to->log_slot_joins += WT_STAT_READ(from, log_slot_joins); - to->log_slot_toosmall += WT_STAT_READ(from, log_slot_toosmall); + to->log_slot_unbuffered += WT_STAT_READ(from, log_slot_unbuffered); to->log_bytes_payload += WT_STAT_READ(from, log_bytes_payload); to->log_bytes_written += WT_STAT_READ(from, log_bytes_written); to->log_compress_writes += WT_STAT_READ(from, log_compress_writes); @@ -913,7 +958,6 @@ __wt_stat_connection_aggregate( to->log_prealloc_max += WT_STAT_READ(from, log_prealloc_max); to->log_prealloc_files += WT_STAT_READ(from, log_prealloc_files); to->log_prealloc_used += WT_STAT_READ(from, log_prealloc_used); - to->log_slot_toobig += WT_STAT_READ(from, log_slot_toobig); to->log_scan_records += WT_STAT_READ(from, log_scan_records); to->log_compress_mem += WT_STAT_READ(from, log_compress_mem); to->log_buffer_size += WT_STAT_READ(from, log_buffer_size); diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index e671ce28ffb..9f59c53314e 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -246,6 +246,10 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]) WT_ASSERT(session, session->dhandle->checkpoint == NULL); WT_ASSERT(session, WT_PREFIX_MATCH(session->dhandle->name, "file:")); + /* Skip files that are never involved in a checkpoint. */ + if (F_ISSET(S2BT(session), WT_BTREE_NO_CHECKPOINT)) + return (0); + /* Make sure there is space for the next entry. */ WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated, session->ckpt_handle_next + 1, &session->ckpt_handle)); @@ -1164,7 +1168,15 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, int final) btree = S2BT(session); bulk = F_ISSET(btree, WT_BTREE_BULK) ? 1 : 0; - /* If the handle is already dead, discard it. */ + /* + * If the handle is already dead or the file isn't durable, force the + * discard. + * + * If the file isn't durable, mark the handle dead, there are asserts + * later on that only dead handles can have modified pages. + */ + if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) + F_SET(session->dhandle, WT_DHANDLE_DEAD); if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD)); diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index 0d66eccd7dc..a63720d736f 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -33,18 +33,7 @@ __txn_op_log(WT_SESSION_IMPL *session, * 3) row store remove; or * 4) row store insert/update. 
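The rewritten dispatch below also continues the sweep replacing literal zero record numbers with WT_RECNO_OOB; the sentinel works because column-store record numbers are 1-based. A small sketch, with the macro's value taken as an assumption from the public header rather than from this patch:

    /* Assumed definition: record numbers start at 1, zero is out-of-band. */
    #define WT_RECNO_OOB	0

    /* A column-store insert must carry a real record number. */
    WT_ASSERT(session, WT_INSERT_RECNO(cbt->ins) != WT_RECNO_OOB);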
*/ - if (cbt->btree->type != BTREE_ROW) { - WT_ASSERT(session, cbt->ins != NULL); - recno = WT_INSERT_RECNO(cbt->ins); - WT_ASSERT(session, recno != 0); - - if (WT_UPDATE_DELETED_ISSET(upd)) - WT_ERR(__wt_logop_col_remove_pack(session, logrec, - op->fileid, recno)); - else - WT_ERR(__wt_logop_col_put_pack(session, logrec, - op->fileid, recno, &value)); - } else { + if (cbt->btree->type == BTREE_ROW) { WT_ERR(__wt_cursor_row_leaf_key(cbt, &key)); if (WT_UPDATE_DELETED_ISSET(upd)) @@ -53,6 +42,16 @@ else WT_ERR(__wt_logop_row_put_pack(session, logrec, op->fileid, &key, &value)); + } else { + recno = WT_INSERT_RECNO(cbt->ins); + WT_ASSERT(session, recno != WT_RECNO_OOB); + + if (WT_UPDATE_DELETED_ISSET(upd)) + WT_ERR(__wt_logop_col_remove_pack(session, logrec, + op->fileid, recno)); + else + WT_ERR(__wt_logop_col_put_pack(session, logrec, + op->fileid, recno, &value)); } err: __wt_buf_free(session, &key); @@ -308,7 +307,7 @@ __wt_txn_checkpoint_log( switch (flags) { case WT_TXN_LOG_CKPT_PREPARE: txn->full_ckpt = 1; - *ckpt_lsn = S2C(session)->log->write_start_lsn; + WT_ERR(__wt_log_ckpt_lsn(session, ckpt_lsn)); /* * We need to make sure that the log records in the checkpoint * LSN are on disk. In particular to make sure that the @@ -337,7 +336,7 @@ __wt_txn_checkpoint_log( txn->ckpt_nsnapshot = 0; WT_CLEAR(empty); ckpt_snapshot = &empty; - *ckpt_lsn = S2C(session)->log->write_start_lsn; + WT_ERR(__wt_log_ckpt_lsn(session, ckpt_lsn)); } else ckpt_snapshot = txn->ckpt_snapshot; @@ -419,9 +418,9 @@ __wt_txn_truncate_log( } else { op->type = WT_TXN_OP_TRUNCATE_COL; op->u.truncate_col.start = - (start == NULL) ? 0 : start->recno; + (start == NULL) ? WT_RECNO_OOB : start->recno; op->u.truncate_col.stop = - (stop == NULL) ? 0 : stop->recno; + (stop == NULL) ? WT_RECNO_OOB : stop->recno; } /* Write that operation into the in-memory log. */ diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index f321da303d7..240d0a5ffd3 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -65,7 +65,7 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r, "No file found with ID %u (max %u)", id, r->nfiles)); r->missing = 1; - } else if (WT_LOG_CMP(lsnp, &r->files[id].ckpt_lsn) >= 0) { + } else if (__wt_log_cmp(lsnp, &r->files[id].ckpt_lsn) >= 0) { /* * We're going to apply the operation. Get the cursor, opening * one if none is cached. @@ -144,10 +144,10 @@ __txn_op_apply( GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); /* Set up the cursors. */ - if (start_recno == 0) { + if (start_recno == WT_RECNO_OOB) { start = NULL; stop = cursor; - } else if (stop_recno == 0) { + } else if (stop_recno == WT_RECNO_OOB) { start = cursor; stop = NULL; } else { diff --git a/src/third_party/wiredtiger/src/utilities/util_list.c b/src/third_party/wiredtiger/src/utilities/util_list.c index 1888c7d967b..1d35f2efc72 100644 --- a/src/third_party/wiredtiger/src/utilities/util_list.c +++ b/src/third_party/wiredtiger/src/utilities/util_list.c @@ -97,12 +97,15 @@ list_print(WT_SESSION *session, const char *name, int cflag, int vflag) } /* - * XXX - * We don't normally say anything about the WiredTiger - * metadata, it's not a normal "object" in the database. I'm - * making an exception for the checkpoint and verbose options. + * !!!
+ * We don't normally say anything about the WiredTiger metadata + * and lookaside tables, they're not application/user "objects" + * in the database. I'm making an exception for the checkpoint + * and verbose options. */ - if (strcmp(key, WT_METADATA_URI) != 0 || cflag || vflag) + if (cflag || vflag || + (strcmp(key, WT_METADATA_URI) != 0 && + strcmp(key, WT_LAS_URI) != 0)) printf("%s\n", key); if (!cflag && !vflag) diff --git a/src/third_party/wiredtiger/tools/wtstats/stat_data.py b/src/third_party/wiredtiger/tools/wtstats/stat_data.py index 3fbc634385a..f2f193c0860 100644 --- a/src/third_party/wiredtiger/tools/wtstats/stat_data.py +++ b/src/third_party/wiredtiger/tools/wtstats/stat_data.py @@ -43,6 +43,7 @@ no_scale_per_second_list = [ 'btree: column-store internal pages', 'btree: column-store variable-size deleted values', 'btree: column-store variable-size leaf pages', + 'btree: column-store variable-size RLE encoded values', 'btree: fixed-record size', 'btree: maximum internal page key size', 'btree: maximum internal page size',
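A note on the wtstats registration above: entries in no_scale_per_second_list are plotted as absolute values rather than converted to per-second rates, which suits a count of RLE-encoded values. The distinction, as an illustrative sketch (function and variable names invented):

    /* Sketch: per-second scaling vs. no-scale reporting of a counter. */
    static double
    stat_plot_value(uint64_t now, uint64_t prev, double secs, bool no_scale)
    {
            /* No-scale statistics are graphed as-is; others as a rate. */
            return (no_scale ? (double)now : (double)(now - prev) / secs);
    }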