summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--dist/api_config.py12
-rw-r--r--dist/api_data.py898
-rw-r--r--dist/api_err.py7
-rw-r--r--dist/dist.py53
-rw-r--r--dist/filelist4
-rw-r--r--dist/flags.py18
-rw-r--r--dist/java_doc.py3
-rw-r--r--dist/log.py252
-rw-r--r--dist/log_data.py62
-rw-r--r--dist/s_all3
-rw-r--r--dist/s_copyright5
-rw-r--r--dist/s_copyright.list17
-rw-r--r--dist/s_define.list5
-rw-r--r--dist/s_funcs.list1
-rw-r--r--dist/s_longlines1
-rw-r--r--dist/s_string.ok12
-rw-r--r--dist/s_tags4
-rw-r--r--dist/stat.py1
-rw-r--r--dist/stat_data.py24
-rwxr-xr-xdist/style.py18
-rw-r--r--lang/java/Makefile.am1
-rw-r--r--lang/java/wiredtiger.i266
-rw-r--r--lang/python/wiredtiger.i14
-rw-r--r--src/block/block_ext.c12
-rw-r--r--src/block/block_mgr.c2
-rw-r--r--src/btree/bt_cursor.c27
-rw-r--r--src/btree/bt_evict.c9
-rw-r--r--src/btree/bt_handle.c14
-rw-r--r--src/btree/bt_page.c2
-rw-r--r--src/btree/bt_ret.c40
-rw-r--r--src/btree/bt_slvg.c2
-rw-r--r--src/btree/bt_walk.c2
-rw-r--r--src/btree/col_modify.c6
-rw-r--r--src/btree/rec_evict.c4
-rw-r--r--src/btree/rec_write.c10
-rw-r--r--src/btree/row_modify.c9
-rw-r--r--src/config/config.c2
-rw-r--r--src/config/config_check.c4
-rw-r--r--src/config/config_def.c27
-rw-r--r--src/conn/conn_api.c1
-rw-r--r--src/conn/conn_cache_pool.c13
-rw-r--r--src/conn/conn_ckpt.c6
-rw-r--r--src/conn/conn_dhandle.c6
-rw-r--r--src/conn/conn_log.c12
-rw-r--r--src/conn/conn_open.c12
-rw-r--r--src/conn/conn_stat.c4
-rw-r--r--src/cursor/cur_backup.c7
-rw-r--r--src/cursor/cur_ds.c6
-rw-r--r--src/cursor/cur_dump.c8
-rw-r--r--src/cursor/cur_index.c4
-rw-r--r--src/cursor/cur_std.c40
-rw-r--r--src/cursor/cur_table.c9
-rw-r--r--src/docs/Doxyfile4
-rw-r--r--src/include/api.h3
-rw-r--r--src/include/btree.h2
-rw-r--r--src/include/connection.h4
-rw-r--r--src/include/cursor.h1
-rw-r--r--src/include/error.h13
-rw-r--r--src/include/extern.h147
-rw-r--r--src/include/flags.h38
-rw-r--r--src/include/log.h128
-rw-r--r--src/include/log.i1
-rw-r--r--src/include/misc.h6
-rw-r--r--src/include/session.h1
-rw-r--r--src/include/stat.h6
-rw-r--r--src/include/txn.h62
-rw-r--r--src/include/txn.i74
-rw-r--r--src/include/wiredtiger.in120
-rw-r--r--src/include/wt_internal.h7
-rw-r--r--src/log/log.c523
-rw-r--r--src/log/log_auto.c431
-rw-r--r--src/log/log_desc.c1
-rw-r--r--src/log/log_slot.c179
-rw-r--r--src/lsm/lsm_tree.c4
-rw-r--r--src/lsm/lsm_worker.c4
-rw-r--r--src/meta/meta_ckpt.c8
-rw-r--r--src/meta/meta_track.c12
-rw-r--r--src/meta/meta_turtle.c2
-rw-r--r--src/os_posix/os_dir.c11
-rw-r--r--src/os_posix/os_mtx.c8
-rw-r--r--src/os_posix/os_open.c2
-rw-r--r--src/os_posix/os_strtouq.c3
-rw-r--r--src/schema/schema_create.c42
-rw-r--r--src/schema/schema_drop.c4
-rw-r--r--src/schema/schema_open.c6
-rw-r--r--src/schema/schema_plan.c8
-rw-r--r--src/session/session_api.c4
-rw-r--r--src/session/session_compact.c172
-rw-r--r--src/session/session_dhandle.c10
-rw-r--r--src/session/session_salvage.c4
-rw-r--r--src/support/global.c8
-rw-r--r--src/support/hash_fnv.c3
-rw-r--r--src/support/hex.c4
-rw-r--r--src/support/mutex.c2
-rw-r--r--src/support/pow.c5
-rw-r--r--src/support/scratch.c18
-rw-r--r--src/support/stat.c20
-rw-r--r--src/txn/txn.c62
-rw-r--r--src/txn/txn_ckpt.c49
-rw-r--r--src/txn/txn_log.c440
-rw-r--r--src/txn/txn_recover.c451
-rw-r--r--src/utilities/util_printlog.c29
-rw-r--r--test/java/com/wiredtiger/test/AutoCloseTest.java284
-rw-r--r--test/java/com/wiredtiger/test/WiredTigerSuite.java1
-rw-r--r--test/salvage/salvage.c18
-rw-r--r--test/suite/test_txn02.py27
-rw-r--r--test/thread/t.c2
-rw-r--r--tools/statlog.py3
108 files changed, 4373 insertions, 1089 deletions
diff --git a/dist/api_config.py b/dist/api_config.py
index ab76c80961f..27bc06a32aa 100644
--- a/dist/api_config.py
+++ b/dist/api_config.py
@@ -206,8 +206,10 @@ static const WT_CONFIG_CHECK confchk_%(name)s_subconfigs[] = {
};
''' % {
'name' : c.name,
- 'check' : '\n\t'.join('"\n\t "'.join(w.wrap('{ "%s", "%s", %s, NULL },' %
- (subc.name, gettype(subc), checkstr(subc)))) for subc in sorted(c.subconfig)),
+ 'check' : '\n\t'.join('"\n\t "'.join(
+ w.wrap('{ "%s", "%s", %s, NULL },' %
+ (subc.name, gettype(subc), checkstr(subc))))
+ for subc in sorted(c.subconfig)),
})
def getsubconfigstr(c):
@@ -248,7 +250,7 @@ for name in sorted(api_data.methods.keys()):
# #defines are used to avoid a list search where we know the correct slot).
config_defines +=\
'#define\tWT_CONFIG_ENTRY_' + name.replace('.', '_') + '\t' * \
- max(1, 6 - int ((len('WT_CONFIG_ENTRY_' + name)) / 8)) + \
+ max(1, 6 - (len('WT_CONFIG_ENTRY_' + name) / 8)) + \
"%2s" % str(slot) + '\n'
# Write the method name and base.
@@ -318,12 +320,12 @@ tfile = open(tmp_file, 'w')
skip = 0
for line in open('../src/include/config.h', 'r'):
if skip:
- if line.count('configuration section: END'):
+ if 'configuration section: END' in line:
tfile.write('/*\n' + line)
skip = 0
else:
tfile.write(line)
- if line.count('configuration section: BEGIN'):
+ if 'configuration section: BEGIN' in line:
skip = 1
tfile.write(' */\n')
tfile.write(config_defines)
diff --git a/dist/api_data.py b/dist/api_data.py
index 09215470f00..c8f5bb8c397 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -9,25 +9,25 @@ class Error:
errors = [
Error('WT_DEADLOCK', 'conflict between concurrent operations', '''
- This error is generated when an operation cannot be completed
- due to a conflict with concurrent operations. The operation
- may be retried; if a transaction is in progress, it should be
- rolled back and the operation retried in a new transaction.'''),
+ This error is generated when an operation cannot be completed
+ due to a conflict with concurrent operations. The operation
+ may be retried; if a transaction is in progress, it should be
+ rolled back and the operation retried in a new transaction.'''),
Error('WT_DUPLICATE_KEY', 'attempt to insert an existing key', '''
- This error is generated when the application attempts to insert
- a record with the same key as an existing record without the
- 'overwrite' configuration to WT_SESSION::open_cursor.'''),
+ This error is generated when the application attempts to insert
+ a record with the same key as an existing record without the
+ 'overwrite' configuration to WT_SESSION::open_cursor.'''),
Error('WT_ERROR', 'non-specific WiredTiger error', '''
- This error is returned when an error is not covered by a
- specific error return.'''),
+ This error is returned when an error is not covered by a
+ specific error return.'''),
Error('WT_NOTFOUND', 'item not found', '''
- This error indicates an operation did not find a value to
- return. This includes cursor search and other operations
- where no record matched the cursor's search key such as
- WT_CURSOR::update or WT_CURSOR::remove.'''),
+ This error indicates an operation did not find a value to
+ return. This includes cursor search and other operations
+ where no record matched the cursor's search key such as
+ WT_CURSOR::update or WT_CURSOR::remove.'''),
Error('WT_PANIC', 'WiredTiger library panic', '''
- This error indicates an underlying problem that requires the
- application exit and restart.'''),
+ This error indicates an underlying problem that requires the
+ application exit and restart.'''),
Error('WT_RESTART', 'restart the operation (internal)', undoc=True),
]
@@ -50,79 +50,79 @@ class Config:
# All schema objects can have column names (optional for simple tables).
column_meta = [
Config('columns', '', r'''
- list of the column names. Comma-separated list of the form
- <code>(column[,...])</code>. For tables, the number of entries
- must match the total number of values in \c key_format and \c
- value_format. For colgroups and indices, all column names must
- appear in the list of columns for the table''',
- type='list'),
+ list of the column names. Comma-separated list of the form
+ <code>(column[,...])</code>. For tables, the number of entries
+ must match the total number of values in \c key_format and \c
+ value_format. For colgroups and indices, all column names must
+ appear in the list of columns for the table''',
+ type='list'),
]
source_meta = [
Config('source', '', r'''
- set a custom data source URI for a column group, index or simple
- table. By default, the data source URI is derived from the \c
- type and the column group or index name. Applications can
- create tables from existing data sources by supplying a \c
- source configuration''', undoc=True),
+ set a custom data source URI for a column group, index or simple
+ table. By default, the data source URI is derived from the \c
+ type and the column group or index name. Applications can
+ create tables from existing data sources by supplying a \c
+ source configuration''', undoc=True),
Config('type', 'file', r'''
- set the type of data source used to store a column group, index
- or simple table. By default, a \c "file:" URI is derived from
- the object name. The \c type configuration can be used to
- switch to a different data source, such as LSM or an extension
- configured by the application.'''),
+ set the type of data source used to store a column group, index
+ or simple table. By default, a \c "file:" URI is derived from
+ the object name. The \c type configuration can be used to
+ switch to a different data source, such as LSM or an extension
+ configured by the application.'''),
]
format_meta = column_meta + [
Config('key_format', 'u', r'''
- the format of the data packed into key items. See @ref
- schema_format_types for details. By default, the key_format is
- \c 'u' and applications use WT_ITEM structures to manipulate
- raw byte arrays. By default, records are stored in row-store
- files: keys of type \c 'r' are record numbers and records
- referenced by record number are stored in column-store files''',
- type='format'),
+ the format of the data packed into key items. See @ref
+ schema_format_types for details. By default, the key_format is
+ \c 'u' and applications use WT_ITEM structures to manipulate
+ raw byte arrays. By default, records are stored in row-store
+ files: keys of type \c 'r' are record numbers and records
+ referenced by record number are stored in column-store files''',
+ type='format'),
Config('value_format', 'u', r'''
- the format of the data packed into value items. See @ref
- schema_format_types for details. By default, the value_format
- is \c 'u' and applications use a WT_ITEM structure to
- manipulate raw byte arrays. Value items of type 't' are
- bitfields, and when configured with record number type keys,
- will be stored using a fixed-length store''',
- type='format'),
+ the format of the data packed into value items. See @ref
+ schema_format_types for details. By default, the value_format
+ is \c 'u' and applications use a WT_ITEM structure to
+ manipulate raw byte arrays. Value items of type 't' are
+ bitfields, and when configured with record number type keys,
+ will be stored using a fixed-length store''',
+ type='format'),
]
lsm_config = [
Config('lsm_auto_throttle', 'true', r'''
- Throttle inserts into LSM trees if flushing to disk isn't
- keeping up''',
- type='boolean'),
+ Throttle inserts into LSM trees if flushing to disk isn't
+ keeping up''',
+ type='boolean'),
Config('lsm_bloom', 'true', r'''
- create bloom filters on LSM tree chunks as they are merged''',
- type='boolean'),
+ create bloom filters on LSM tree chunks as they are merged''',
+ type='boolean'),
Config('lsm_bloom_config', '', r'''
- config string used when creating Bloom filter files, passed
- to WT_SESSION::create'''),
+ config string used when creating Bloom filter files, passed
+ to WT_SESSION::create'''),
Config('lsm_bloom_bit_count', '8', r'''
- the number of bits used per item for LSM bloom filters''',
- min='2', max='1000'),
+ the number of bits used per item for LSM bloom filters''',
+ min='2', max='1000'),
Config('lsm_bloom_hash_count', '4', r'''
- the number of hash values per item used for LSM bloom
- filters''',
- min='2', max='100'),
+ the number of hash values per item used for LSM bloom
+ filters''',
+ min='2', max='100'),
Config('lsm_bloom_oldest', 'false', r'''
- create a bloom filter on the oldest LSM tree chunk. Only
- supported if bloom filters are enabled''',
- type='boolean'),
+ create a bloom filter on the oldest LSM tree chunk. Only
+ supported if bloom filters are enabled''',
+ type='boolean'),
Config('lsm_chunk_size', '2MB', r'''
- the maximum size of the in-memory chunk of an LSM tree''',
- min='512K', max='500MB'),
+ the maximum size of the in-memory chunk of an LSM tree''',
+ min='512K', max='500MB'),
Config('lsm_merge_max', '15', r'''
- the maximum number of chunks to include in a merge operation''',
- min='2', max='100'),
+ the maximum number of chunks to include in a merge operation''',
+ min='2', max='100'),
Config('lsm_merge_threads', '1', r'''
- the number of thread to perform merge operations''',
- min='1', max='10'), # !!! max must match WT_LSM_MAX_WORKERS
+ the number of thread to perform merge operations''',
+ min='1', max='10'), # !!! max must match WT_LSM_MAX_WORKERS
]
# Per-file configuration
@@ -134,137 +134,141 @@ file_config = format_meta + [
uses a best-fit algorithm''',
choices=['first', 'best',]),
Config('allocation_size', '4KB', r'''
- the file unit allocation size, in bytes, must a power-of-two;
- smaller values decrease the file space required by overflow
- items, and the default value of 4KB is a good choice absent
- requirements from the operating system or storage device''',
- min='512B', max='128MB'),
+ the file unit allocation size, in bytes, must a power-of-two;
+ smaller values decrease the file space required by overflow
+ items, and the default value of 4KB is a good choice absent
+ requirements from the operating system or storage device''',
+ min='512B', max='128MB'),
Config('block_compressor', '', r'''
- configure a compressor for file blocks. Permitted values are
- empty (off) or \c "bzip2", \c "snappy" or custom compression
- engine \c "name" created with WT_CONNECTION::add_compressor.
- See @ref compression for more information'''),
+ configure a compressor for file blocks. Permitted values are
+ empty (off) or \c "bzip2", \c "snappy" or custom compression
+ engine \c "name" created with WT_CONNECTION::add_compressor.
+ See @ref compression for more information'''),
Config('cache_resident', 'false', r'''
- do not ever evict the object's pages; see @ref
- tuning_cache_resident for more information''',
- type='boolean'),
+ do not ever evict the object's pages; see @ref
+ tuning_cache_resident for more information''',
+ type='boolean'),
Config('checksum', 'uncompressed', r'''
- configure block checksums; permitted values are <code>on</code>
- (checksum all blocks), <code>off</code> (checksum no blocks) and
- <code>uncompresssed</code> (checksum only blocks which are not
- compressed for any reason). The \c uncompressed setting is for
- applications which can rely on decompression to fail if a block
- has been corrupted''',
- choices=['on', 'off', 'uncompressed']),
+ configure block checksums; permitted values are <code>on</code>
+ (checksum all blocks), <code>off</code> (checksum no blocks) and
+ <code>uncompresssed</code> (checksum only blocks which are not
+ compressed for any reason). The \c uncompressed setting is for
+ applications which can rely on decompression to fail if a block
+ has been corrupted''',
+ choices=['on', 'off', 'uncompressed']),
Config('collator', '', r'''
- configure custom collation for keys. Value must be a collator
- name created with WT_CONNECTION::add_collator'''),
+ configure custom collation for keys. Value must be a collator
+ name created with WT_CONNECTION::add_collator'''),
Config('dictionary', '0', r'''
- the maximum number of unique values remembered in the Btree
- row-store leaf page value dictionary; see
- @ref file_formats_compression for more information''',
- min='0'),
+ the maximum number of unique values remembered in the Btree
+ row-store leaf page value dictionary; see
+ @ref file_formats_compression for more information''',
+ min='0'),
Config('format', 'btree', r'''
- the file format''',
- choices=['btree']),
+ the file format''',
+ choices=['btree']),
Config('huffman_key', '', r'''
- configure Huffman encoding for keys. Permitted values
- are empty (off), \c "english", \c "utf8<file>" or \c
- "utf16<file>". See @ref huffman for more information'''),
+ configure Huffman encoding for keys. Permitted values
+ are empty (off), \c "english", \c "utf8<file>" or \c
+ "utf16<file>". See @ref huffman for more information'''),
Config('huffman_value', '', r'''
- configure Huffman encoding for values. Permitted values
- are empty (off), \c "english", \c "utf8<file>" or \c
- "utf16<file>". See @ref huffman for more information'''),
+ configure Huffman encoding for values. Permitted values
+ are empty (off), \c "english", \c "utf8<file>" or \c
+ "utf16<file>". See @ref huffman for more information'''),
Config('internal_key_truncate', 'true', r'''
- configure internal key truncation, discarding unnecessary
- trailing bytes on internal keys (ignored for custom
- collators)''',
- type='boolean'),
+ configure internal key truncation, discarding unnecessary
+ trailing bytes on internal keys (ignored for custom
+ collators)''',
+ type='boolean'),
Config('internal_page_max', '4KB', r'''
- the maximum page size for internal nodes, in bytes; the size
- must be a multiple of the allocation size and is significant
- for applications wanting to avoid excessive L2 cache misses
- while searching the tree. The page maximum is the bytes of
- uncompressed data, that is, the limit is applied before any
- block compression is done''',
- min='512B', max='512MB'),
+ the maximum page size for internal nodes, in bytes; the size
+ must be a multiple of the allocation size and is significant
+ for applications wanting to avoid excessive L2 cache misses
+ while searching the tree. The page maximum is the bytes of
+ uncompressed data, that is, the limit is applied before any
+ block compression is done''',
+ min='512B', max='512MB'),
Config('internal_item_max', '0', r'''
- the largest key stored within an internal node, in bytes. If
- non-zero, any key larger than the specified size will be
- stored as an overflow item (which may require additional I/O
- to access). If zero, a default size is chosen that permits at
- least 8 keys per internal page''',
- min=0),
+ the largest key stored within an internal node, in bytes. If
+ non-zero, any key larger than the specified size will be
+ stored as an overflow item (which may require additional I/O
+ to access). If zero, a default size is chosen that permits at
+ least 8 keys per internal page''',
+ min=0),
Config('key_gap', '10', r'''
- the maximum gap between instantiated keys in a Btree leaf page,
- constraining the number of keys processed to instantiate a
- random Btree leaf page key''',
- min='0', undoc=True),
+ the maximum gap between instantiated keys in a Btree leaf page,
+ constraining the number of keys processed to instantiate a
+ random Btree leaf page key''',
+ min='0', undoc=True),
Config('leaf_page_max', '1MB', r'''
- the maximum page size for leaf nodes, in bytes; the size must
- be a multiple of the allocation size, and is significant for
- applications wanting to maximize sequential data transfer from
- a storage device. The page maximum is the bytes of uncompressed
- data, that is, the limit is applied before any block compression
- is done''',
- min='512B', max='512MB'),
+ the maximum page size for leaf nodes, in bytes; the size must
+ be a multiple of the allocation size, and is significant for
+ applications wanting to maximize sequential data transfer from
+ a storage device. The page maximum is the bytes of uncompressed
+ data, that is, the limit is applied before any block compression
+ is done''',
+ min='512B', max='512MB'),
Config('leaf_item_max', '0', r'''
- the largest key or value stored within a leaf node, in bytes.
- If non-zero, any key or value larger than the specified size
- will be stored as an overflow item (which may require additional
- I/O to access). If zero, a default size is chosen that permits
- at least 4 key and value pairs per leaf page''',
- min=0),
+ the largest key or value stored within a leaf node, in bytes.
+ If non-zero, any key or value larger than the specified size
+ will be stored as an overflow item (which may require additional
+ I/O to access). If zero, a default size is chosen that permits
+ at least 4 key and value pairs per leaf page''',
+ min=0),
Config('memory_page_max', '5MB', r'''
- the maximum size a page can grow to in memory before being
- reconciled to disk. The specified size will be adjusted to a
- lower bound of <code>50 * leaf_page_max</code>. This limit is
- soft - it is possible for pages to be temporarily larger than
- this value''',
- min='512B', max='10TB'),
+ the maximum size a page can grow to in memory before being
+ reconciled to disk. The specified size will be adjusted to a
+ lower bound of <code>50 * leaf_page_max</code>. This limit is
+ soft - it is possible for pages to be temporarily larger than
+ this value''',
+ min='512B', max='10TB'),
Config('os_cache_max', '0', r'''
- maximum system buffer cache usage, in bytes. If non-zero, evict
- object blocks from the system buffer cache after that many bytes
- from this object are read or written into the buffer cache''',
- min=0),
+ maximum system buffer cache usage, in bytes. If non-zero, evict
+ object blocks from the system buffer cache after that many bytes
+ from this object are read or written into the buffer cache''',
+ min=0),
Config('os_cache_dirty_max', '0', r'''
- maximum dirty system buffer cache usage, in bytes. If non-zero,
- schedule writes for dirty blocks belonging to this object in the
- system buffer cache after that many bytes from this object are
- written into the buffer cache''',
- min=0),
+ maximum dirty system buffer cache usage, in bytes. If non-zero,
+ schedule writes for dirty blocks belonging to this object in the
+ system buffer cache after that many bytes from this object are
+ written into the buffer cache''',
+ min=0),
Config('prefix_compression', 'true', r'''
- configure prefix compression on row-store leaf pages''',
- type='boolean'),
+ configure prefix compression on row-store leaf pages''',
+ type='boolean'),
Config('prefix_compression_min', '4', r'''
- minimum gain before prefix compression will be used on row-store
- leaf pages''',
- min=0),
+ minimum gain before prefix compression will be used on row-store
+ leaf pages''',
+ min=0),
Config('split_pct', '75', r'''
- the Btree page split size as a percentage of the maximum Btree
- page size, that is, when a Btree page is split, it will be
- split into smaller pages, where each page is the specified
- percentage of the maximum Btree page size''',
- min='25', max='100'),
+ the Btree page split size as a percentage of the maximum Btree
+ page size, that is, when a Btree page is split, it will be
+ split into smaller pages, where each page is the specified
+ percentage of the maximum Btree page size''',
+ min='25', max='100'),
]
# File metadata, including both configurable and non-configurable (internal)
file_meta = file_config + [
Config('checkpoint', '', r'''
- the file checkpoint entries'''),
+ the file checkpoint entries'''),
+ Config('checkpoint_lsn', '', r'''
+ LSN of the last checkpoint'''),
+ Config('id', '', r'''
+ the file's ID number'''),
Config('version', '(major=0,minor=0)', r'''
- the file version'''),
+ the file version'''),
]
table_only_meta = [
Config('colgroups', '', r'''
- comma-separated list of names of column groups. Each column
- group is stored separately, keyed by the primary key of the
- table. If no column groups are specified, all columns are
- stored together in a single file. All value columns in the
- table must appear in at least one column group. Each column
- group must be created with a separate call to
- WT_SESSION::create''', type='list'),
+ comma-separated list of names of column groups. Each column
+ group is stored separately, keyed by the primary key of the
+ table. If no column groups are specified, all columns are
+ stored together in a single file. All value columns in the
+ table must appear in at least one column group. Each column
+ group must be created with a separate call to
+ WT_SESSION::create''', type='list'),
]
colgroup_meta = column_meta + source_meta
@@ -276,88 +280,89 @@ table_meta = format_meta + table_only_meta
# Connection runtime config, shared by conn.reconfigure and wiredtiger_open
connection_runtime_config = [
Config('shared_cache', '', r'''
- shared cache configuration options. A database should configure
- either a cache_size or a shared_cache not both''',
- type='category', subconfig=[
- Config('enable', 'false', r'''
- whether the connection is using a shared cache''',
- type='boolean'),
- Config('chunk', '10MB', r'''
- the granularity that a shared cache is redistributed''',
- min='1MB', max='10TB'),
- Config('reserve', '0', r'''
- amount of cache this database is guaranteed to have
- available from the shared cache. This setting is per
- database. Defaults to the chunk size''', type='int'),
- Config('name', 'pool', r'''
- name of a cache that is shared between databases'''),
- Config('size', '500MB', r'''
- maximum memory to allocate for the shared cache. Setting
- this will update the value if one is already set''',
- min='1MB', max='10TB')
- ]),
+ shared cache configuration options. A database should configure
+ either a cache_size or a shared_cache not both''',
+ type='category', subconfig=[
+ Config('enable', 'false', r'''
+ whether the connection is using a shared cache''',
+ type='boolean'),
+ Config('chunk', '10MB', r'''
+ the granularity that a shared cache is redistributed''',
+ min='1MB', max='10TB'),
+ Config('reserve', '0', r'''
+ amount of cache this database is guaranteed to have
+ available from the shared cache. This setting is per
+ database. Defaults to the chunk size''', type='int'),
+ Config('name', 'pool', r'''
+ name of a cache that is shared between databases'''),
+ Config('size', '500MB', r'''
+ maximum memory to allocate for the shared cache. Setting
+ this will update the value if one is already set''',
+ min='1MB', max='10TB')
+ ]),
Config('cache_size', '100MB', r'''
- maximum heap memory to allocate for the cache. A database should
- configure either a cache_size or a shared_cache not both''',
- min='1MB', max='10TB'),
+ maximum heap memory to allocate for the cache. A database should
+ configure either a cache_size or a shared_cache not both''',
+ min='1MB', max='10TB'),
Config('error_prefix', '', r'''
- prefix string for error messages'''),
+ prefix string for error messages'''),
Config('eviction_dirty_target', '80', r'''
- continue evicting until the cache has less dirty pages than this
- (as a percentage). Dirty pages will only be evicted if the cache
- is full enough to trigger eviction''',
- min=10, max=99),
+ continue evicting until the cache has less dirty pages than this
+ (as a percentage). Dirty pages will only be evicted if the cache
+ is full enough to trigger eviction''',
+ min=10, max=99),
Config('eviction_target', '80', r'''
- continue evicting until the cache becomes less full than this
- (as a percentage). Must be less than \c eviction_trigger''',
- min=10, max=99),
+ continue evicting until the cache becomes less full than this
+ (as a percentage). Must be less than \c eviction_trigger''',
+ min=10, max=99),
Config('eviction_trigger', '95', r'''
- trigger eviction when the cache becomes this full (as a
- percentage)''',
- min=10, max=99),
+ trigger eviction when the cache becomes this full (as a
+ percentage)''',
+ min=10, max=99),
Config('statistics', 'none', r'''
- Maintain database statistics, which may impact performance.
- Choosing "all" maintains all statistics regardless of cost,
- "fast" maintains a subset of statistics that are relatively
- inexpensive, "none" turns off all statistics. The "clear"
- configuration resets statistics after they are gathered,
- where appropriate (for example, a cache size statistic is
- not cleared, while the count of cursor insert operations will
- be cleared). When "clear" is configured for the database,
- gathered statistics are reset each time a statistics cursor
- is used to gather statistics, as well as each time statistics
- are logged using the \c statistics_log configuration. See
- @ref statistics for more information''',
- type='list', choices=['all', 'fast', 'none', 'clear']),
+ Maintain database statistics, which may impact performance.
+ Choosing "all" maintains all statistics regardless of cost,
+ "fast" maintains a subset of statistics that are relatively
+ inexpensive, "none" turns off all statistics. The "clear"
+ configuration resets statistics after they are gathered,
+ where appropriate (for example, a cache size statistic is
+ not cleared, while the count of cursor insert operations will
+ be cleared). When "clear" is configured for the database,
+ gathered statistics are reset each time a statistics cursor
+ is used to gather statistics, as well as each time statistics
+ are logged using the \c statistics_log configuration. See
+ @ref statistics for more information''',
+ type='list', choices=['all', 'fast', 'none', 'clear']),
Config('verbose', '', r'''
- enable messages for various events. Options are given as a
- list, such as <code>"verbose=[evictserver,read]"</code>''',
- type='list', choices=[
- 'block',
- 'ckpt',
- 'compact',
- 'evict',
- 'evictserver',
- 'fileops',
- 'hazard',
- 'log',
- 'lsm',
- 'mutex',
- 'overflow',
- 'read',
- 'readserver',
- 'reconcile',
- 'salvage',
- 'shared_cache',
- 'verify',
- 'version',
- 'write']),
+ enable messages for various events. Options are given as a
+ list, such as <code>"verbose=[evictserver,read]"</code>''',
+ type='list', choices=[
+ 'block',
+ 'ckpt',
+ 'compact',
+ 'evict',
+ 'evictserver',
+ 'fileops',
+ 'hazard',
+ 'log',
+ 'lsm',
+ 'mutex',
+ 'overflow',
+ 'read',
+ 'readserver',
+ 'reconcile',
+ 'recovery',
+ 'salvage',
+ 'shared_cache',
+ 'verify',
+ 'version',
+ 'write']),
]
session_config = [
Config('isolation', 'read-committed', r'''
- the default isolation level for operations in this session''',
- choices=['read-uncommitted', 'read-committed', 'snapshot']),
+ the default isolation level for operations in this session''',
+ choices=['read-uncommitted', 'read-committed', 'snapshot']),
]
methods = {
@@ -375,135 +380,136 @@ methods = {
'session.compact' : Method([]),
-'session.create' : Method(table_only_meta + file_config + lsm_config + source_meta + [
+'session.create' :
+ Method(table_only_meta + file_config + lsm_config + source_meta + [
Config('exclusive', 'false', r'''
- fail if the object exists. When false (the default), if the
- object exists, check that its settings match the specified
- configuration''',
- type='boolean'),
+ fail if the object exists. When false (the default), if the
+ object exists, check that its settings match the specified
+ configuration''',
+ type='boolean'),
]),
'session.drop' : Method([
Config('force', 'false', r'''
- return success if the object does not exist''',
- type='boolean'),
+ return success if the object does not exist''',
+ type='boolean'),
Config('remove_files', 'true', r'''
- should the underlying files be removed?''',
- type='boolean'),
- ]),
+ should the underlying files be removed?''',
+ type='boolean'),
+]),
'session.log_printf' : Method([]),
'session.open_cursor' : Method([
Config('append', 'false', r'''
- append the value as a new record, creating a new record
- number key; valid only for cursors with record number keys''',
- type='boolean'),
+ append the value as a new record, creating a new record
+ number key; valid only for cursors with record number keys''',
+ type='boolean'),
Config('bulk', 'false', r'''
- configure the cursor for bulk-loading, a fast, initial load
- path (see @ref bulk_load for more information). Bulk-load
- may only be used for newly created objects and cursors
- configured for bulk-load only support the WT_CURSOR::insert
- and WT_CURSOR::close methods. When bulk-loading row-store
- objects, keys must be loaded in sorted order. The value is
- usually a true/false flag; when bulk-loading fixed-length
- column store objects, the special value \c bitmap allows
- chunks of a memory resident bitmap to be loaded directly into
- a file by passing a \c WT_ITEM to WT_CURSOR::set_value where
- the \c size field indicates the number of records in the
- bitmap (as specified by the object's \c value_format
- configuration). Bulk-loaded bitmap values must end on a byte
- boundary relative to the bit count (except for the last set
- of values loaded)'''),
+ configure the cursor for bulk-loading, a fast, initial load
+ path (see @ref bulk_load for more information). Bulk-load
+ may only be used for newly created objects and cursors
+ configured for bulk-load only support the WT_CURSOR::insert
+ and WT_CURSOR::close methods. When bulk-loading row-store
+ objects, keys must be loaded in sorted order. The value is
+ usually a true/false flag; when bulk-loading fixed-length
+ column store objects, the special value \c bitmap allows
+ chunks of a memory resident bitmap to be loaded directly into
+ a file by passing a \c WT_ITEM to WT_CURSOR::set_value where
+ the \c size field indicates the number of records in the
+ bitmap (as specified by the object's \c value_format
+ configuration). Bulk-loaded bitmap values must end on a byte
+ boundary relative to the bit count (except for the last set
+ of values loaded)'''),
Config('checkpoint', '', r'''
- the name of a checkpoint to open (the reserved name
- "WiredTigerCheckpoint" opens the most recent internal
- checkpoint taken for the object). The cursor does not
- support data modification'''),
+ the name of a checkpoint to open (the reserved name
+ "WiredTigerCheckpoint" opens the most recent internal
+ checkpoint taken for the object). The cursor does not
+ support data modification'''),
Config('dump', '', r'''
- configure the cursor for dump format inputs and outputs:
- "hex" selects a simple hexadecimal format, "print"
- selects a format where only non-printing characters are
- hexadecimal encoded. The cursor dump format is compatible
- with the @ref util_dump and @ref util_load commands''',
- choices=['hex', 'print']),
+ configure the cursor for dump format inputs and outputs:
+ "hex" selects a simple hexadecimal format, "print"
+ selects a format where only non-printing characters are
+ hexadecimal encoded. The cursor dump format is compatible
+ with the @ref util_dump and @ref util_load commands''',
+ choices=['hex', 'print']),
Config('next_random', 'false', r'''
- configure the cursor to return a pseudo-random record from
- the object; valid only for row-store cursors. Cursors
- configured with \c next_random=true only support the
- WT_CURSOR::next and WT_CURSOR::close methods. See @ref
- cursor_random for details''',
- type='boolean'),
+ configure the cursor to return a pseudo-random record from
+ the object; valid only for row-store cursors. Cursors
+ configured with \c next_random=true only support the
+ WT_CURSOR::next and WT_CURSOR::close methods. See @ref
+ cursor_random for details''',
+ type='boolean'),
Config('overwrite', 'true', r'''
- configures whether the cursor's insert, update and remove
- methods check the existing state of the record. If \c overwrite
- is \c false, WT_CURSOR::insert fails with ::WT_DUPLICATE_KEY
- if the record exists, WT_CURSOR::update and WT_CURSOR::remove
- fail with ::WT_NOTFOUND if the record does not exist''',
- type='boolean'),
+ configures whether the cursor's insert, update and remove
+ methods check the existing state of the record. If \c overwrite
+ is \c false, WT_CURSOR::insert fails with ::WT_DUPLICATE_KEY
+ if the record exists, WT_CURSOR::update and WT_CURSOR::remove
+ fail with ::WT_NOTFOUND if the record does not exist''',
+ type='boolean'),
Config('raw', 'false', r'''
- ignore the encodings for the key and value, manage data as if
- the formats were \c "u". See @ref cursor_raw for details''',
- type='boolean'),
+ ignore the encodings for the key and value, manage data as if
+ the formats were \c "u". See @ref cursor_raw for details''',
+ type='boolean'),
Config('statistics', '', r'''
- Specify the statistics to be gathered. Choosing "all" gathers
- statistics regardless of cost and may include traversing
- on-disk files; "fast" gathers a subset of relatively
- inexpensive statistics. The selection must agree with the
- database \c statistics configuration specified to
- ::wiredtiger_open or WT_CONNECTION::reconfigure. For example,
- "all" or "fast" can be configured when the database is
- configured with "all", but the cursor open will fail if "all"
- is specified when the database is configured with "fast",
- and the cursor open will fail in all cases when the database
- is configured with "none". If \c statistics is not configured,
- the default configuration is the database configuration.
- The "clear" configuration resets statistics after gathering
- them, where appropriate (for example, a cache size statistic
- is not cleared, while the count of cursor insert operations
- will be cleared). See @ref statistics for more information''',
- type='list', choices=['all', 'fast', 'clear']),
+ Specify the statistics to be gathered. Choosing "all" gathers
+ statistics regardless of cost and may include traversing
+ on-disk files; "fast" gathers a subset of relatively
+ inexpensive statistics. The selection must agree with the
+ database \c statistics configuration specified to
+ ::wiredtiger_open or WT_CONNECTION::reconfigure. For example,
+ "all" or "fast" can be configured when the database is
+ configured with "all", but the cursor open will fail if "all"
+ is specified when the database is configured with "fast",
+ and the cursor open will fail in all cases when the database
+ is configured with "none". If \c statistics is not configured,
+ the default configuration is the database configuration.
+ The "clear" configuration resets statistics after gathering
+ them, where appropriate (for example, a cache size statistic
+ is not cleared, while the count of cursor insert operations
+ will be cleared). See @ref statistics for more information''',
+ type='list', choices=['all', 'fast', 'clear']),
Config('target', '', r'''
- if non-empty, backup the list of objects; valid only for a
- backup data source''',
- type='list'),
+ if non-empty, backup the list of objects; valid only for a
+ backup data source''',
+ type='list'),
]),
'session.rename' : Method([]),
'session.salvage' : Method([
Config('force', 'false', r'''
- force salvage even of files that do not appear to be WiredTiger
- files''',
- type='boolean'),
+ force salvage even of files that do not appear to be WiredTiger
+ files''',
+ type='boolean'),
]),
'session.truncate' : Method([]),
'session.upgrade' : Method([]),
'session.verify' : Method([
Config('dump_address', 'false', r'''
- Display addresses and page types as pages are verified, using
- the application's message handler, intended for debugging''',
- type='boolean'),
+ Display addresses and page types as pages are verified, using
+ the application's message handler, intended for debugging''',
+ type='boolean'),
Config('dump_blocks', 'false', r'''
- Display the contents of on-disk blocks as they are verified, using
- the application's message handler, intended for debugging''',
- type='boolean'),
+ Display the contents of on-disk blocks as they are verified, using
+ the application's message handler, intended for debugging''',
+ type='boolean'),
Config('dump_pages', 'false', r'''
- Display the contents of in-memory pages as they are verified, using
- the application's message handler, intended for debugging''',
- type='boolean')
+ Display the contents of in-memory pages as they are verified, using
+ the application's message handler, intended for debugging''',
+ type='boolean')
]),
'session.begin_transaction' : Method([
Config('isolation', '', r'''
- the isolation level for this transaction; defaults to the
- session's isolation level''',
- choices=['read-uncommitted', 'read-committed', 'snapshot']),
+ the isolation level for this transaction; defaults to the
+ session's isolation level''',
+ choices=['read-uncommitted', 'read-committed', 'snapshot']),
Config('name', '', r'''
- name of the transaction for tracing and debugging'''),
+ name of the transaction for tracing and debugging'''),
Config('priority', 0, r'''
- priority of the transaction for resolving conflicts.
- Transactions with higher values are less likely to abort''',
- min='-100', max='100'),
+ priority of the transaction for resolving conflicts.
+ Transactions with higher values are less likely to abort''',
+ min='-100', max='100'),
]),
'session.commit_transaction' : Method([]),
@@ -511,24 +517,24 @@ methods = {
'session.checkpoint' : Method([
Config('drop', '', r'''
- specify a list of checkpoints to drop.
- The list may additionally contain one of the following keys:
- \c "from=all" to drop all checkpoints,
- \c "from=<checkpoint>" to drop all checkpoints after and
- including the named checkpoint, or
- \c "to=<checkpoint>" to drop all checkpoints before and
- including the named checkpoint. Checkpoints cannot be
- dropped while a hot backup is in progress or if open in
- a cursor''', type='list'),
+ specify a list of checkpoints to drop.
+ The list may additionally contain one of the following keys:
+ \c "from=all" to drop all checkpoints,
+ \c "from=<checkpoint>" to drop all checkpoints after and
+ including the named checkpoint, or
+ \c "to=<checkpoint>" to drop all checkpoints before and
+ including the named checkpoint. Checkpoints cannot be
+ dropped while a hot backup is in progress or if open in
+ a cursor''', type='list'),
Config('force', 'false', r'''
- by default, checkpoints may be skipped if the underlying object
- has not been modified, this option forces the checkpoint''',
- type='boolean'),
+ by default, checkpoints may be skipped if the underlying object
+ has not been modified, this option forces the checkpoint''',
+ type='boolean'),
Config('name', '', r'''
- if non-empty, specify a name for the checkpoint (note that
- checkpoints including LSM trees may not be named)'''),
+ if non-empty, specify a name for the checkpoint (note that
+ checkpoints including LSM trees may not be named)'''),
Config('target', '', r'''
- if non-empty, checkpoint the list of objects''', type='list'),
+ if non-empty, checkpoint the list of objects''', type='list'),
]),
'connection.add_collator' : Method([]),
@@ -540,17 +546,17 @@ methods = {
'connection.load_extension' : Method([
Config('config', '', r'''
- configuration string passed to the entry point of the
- extension as its WT_CONFIG_ARG argument'''),
+ configuration string passed to the entry point of the
+ extension as its WT_CONFIG_ARG argument'''),
Config('entry', 'wiredtiger_extension_init', r'''
- the entry point of the extension, called to initialize the
- extension when it is loaded. The signature of the function
- must match ::wiredtiger_extension_init'''),
+ the entry point of the extension, called to initialize the
+ extension when it is loaded. The signature of the function
+ must match ::wiredtiger_extension_init'''),
Config('terminate', 'wiredtiger_extension_terminate', r'''
- an optional function in the extension that is called before
- the extension is unloaded during WT_CONNECTION::close. The
- signature of the function must match
- ::wiredtiger_extension_terminate'''),
+ an optional function in the extension that is called before
+ the extension is unloaded during WT_CONNECTION::close. The
+ signature of the function must match
+ ::wiredtiger_extension_terminate'''),
]),
'connection.open_session' : Method(session_config),
@@ -559,112 +565,112 @@ methods = {
'wiredtiger_open' : Method(connection_runtime_config + [
Config('buffer_alignment', '-1', r'''
- in-memory alignment (in bytes) for buffers used for I/O. The
- default value of -1 indicates a platform-specific alignment
- value should be used (4KB on Linux systems, zero elsewhere)''',
- min='-1', max='1MB'),
+ in-memory alignment (in bytes) for buffers used for I/O. The
+ default value of -1 indicates a platform-specific alignment
+ value should be used (4KB on Linux systems, zero elsewhere)''',
+ min='-1', max='1MB'),
Config('checkpoint', '', r'''
- periodically checkpoint the database''',
- type='category', subconfig=[
- Config('name', '"WiredTigerCheckpoint"', r'''
- the checkpoint name'''),
- Config('wait', '0', r'''
- seconds to wait between each checkpoint; setting this value
- configures periodic checkpoints''',
- min='1', max='100000'),
- ]),
+ periodically checkpoint the database''',
+ type='category', subconfig=[
+ Config('name', '"WiredTigerCheckpoint"', r'''
+ the checkpoint name'''),
+ Config('wait', '0', r'''
+ seconds to wait between each checkpoint; setting this value
+ configures periodic checkpoints''',
+ min='1', max='100000'),
+ ]),
Config('checkpoint_sync', 'true', r'''
- flush files to stable storage when closing or writing
- checkpoints''',
- type='boolean'),
+ flush files to stable storage when closing or writing
+ checkpoints''',
+ type='boolean'),
Config('create', 'false', r'''
- create the database if it does not exist''',
- type='boolean'),
+ create the database if it does not exist''',
+ type='boolean'),
Config('direct_io', '', r'''
- Use \c O_DIRECT to access files. Options are given as a list,
- such as <code>"direct_io=[data]"</code>. Configuring
- \c direct_io requires care, see @ref
- tuning_system_buffer_cache_direct_io for important warnings''',
- type='list', choices=['data', 'log']),
+ Use \c O_DIRECT to access files. Options are given as a list,
+ such as <code>"direct_io=[data]"</code>. Configuring
+ \c direct_io requires care, see @ref
+ tuning_system_buffer_cache_direct_io for important warnings''',
+ type='list', choices=['data', 'log']),
Config('extensions', '', r'''
- list of shared library extensions to load (using dlopen).
- Any values specified to an library extension are passed to
- WT_CONNECTION::load_extension as the \c config parameter
- (for example,
- <code>extensions=(/path/ext.so={entry=my_entry})</code>)''',
- type='list'),
+ list of shared library extensions to load (using dlopen).
+ Any values specified to an library extension are passed to
+ WT_CONNECTION::load_extension as the \c config parameter
+ (for example,
+ <code>extensions=(/path/ext.so={entry=my_entry})</code>)''',
+ type='list'),
Config('file_extend', '', r'''
- file extension configuration. If set, extend files of the set
- type in allocations of the set size, instead of a block at a
- time as each new block is written. For example,
- <code>file_extend=(data=16MB)</code>''',
- type='list', choices=['data', 'log']),
+ file extension configuration. If set, extend files of the set
+ type in allocations of the set size, instead of a block at a
+ time as each new block is written. For example,
+ <code>file_extend=(data=16MB)</code>''',
+ type='list', choices=['data', 'log']),
Config('hazard_max', '1000', r'''
- maximum number of simultaneous hazard pointers per session
- handle''',
- min='15'),
+ maximum number of simultaneous hazard pointers per session
+ handle''',
+ min='15'),
Config('log', '', r'''
- enable logging''',
- type='category', subconfig=[
- Config('archive', 'true', r'''
- automatically archive unneeded log files''',
- type='boolean'),
- Config('enabled', 'false', r'''
- enable logging subsystem''',
- type='boolean'),
- Config('file_max', '100MB', r'''
- the maximum size of the log file''',
- min='1MB', max='2GB'),
- Config('path', '""', r'''
- the path to a directory into which the log files are written.
- If the value is not an absolute path name, the files are created
- relative to the database home'''),
- ]),
+ enable logging''',
+ type='category', subconfig=[
+ Config('archive', 'true', r'''
+ automatically archive unneeded log files''',
+ type='boolean'),
+ Config('enabled', 'false', r'''
+ enable logging subsystem''',
+ type='boolean'),
+ Config('file_max', '100MB', r'''
+ the maximum size of log files''',
+ min='100KB', max='2GB'),
+ Config('path', '""', r'''
+ the path to a directory into which the log files are written.
+ If the value is not an absolute path name, the files are created
+ relative to the database home'''),
+ ]),
Config('lsm_merge', 'true', r'''
- merge LSM chunks where possible''',
- type='boolean'),
+ merge LSM chunks where possible''',
+ type='boolean'),
Config('mmap', 'true', r'''
- Use memory mapping to access files when possible''',
- type='boolean'),
+ Use memory mapping to access files when possible''',
+ type='boolean'),
Config('multiprocess', 'false', r'''
- permit sharing between processes (will automatically start an
- RPC server for primary processes and use RPC for secondary
- processes). <b>Not yet supported in WiredTiger</b>''',
- type='boolean'),
+ permit sharing between processes (will automatically start an
+ RPC server for primary processes and use RPC for secondary
+ processes). <b>Not yet supported in WiredTiger</b>''',
+ type='boolean'),
Config('session_max', '50', r'''
- maximum expected number of sessions (including server
- threads)''',
- min='1'),
+ maximum expected number of sessions (including server
+ threads)''',
+ min='1'),
Config('statistics_log', '', r'''
- log any statistics the database is configured to maintain,
- to a file. See @ref statistics for more information''',
- type='category', subconfig=[
- Config('path', '"WiredTigerStat.%H"', r'''
- the pathname to a file into which the log records are written,
- may contain ISO C standard strftime conversion specifications.
- If the value is not an absolute path name, the file is created
- relative to the database home'''),
- Config('sources', '', r'''
- if non-empty, include statistics for the list of data source
- URIs, if they are open at the time of the statistics logging.
- The list may include URIs matching a single data source
- ("table:mytable"), or a URI matching all data sources of a
- particular type ("table:")''',
- type='list'),
- Config('timestamp', '"%b %d %H:%M:%S"', r'''
- a timestamp prepended to each log record, may contain strftime
- conversion specifications'''),
- Config('wait', '0', r'''
- seconds to wait between each write of the log records''',
- min='1', max='100000'),
- ]),
+ log any statistics the database is configured to maintain,
+ to a file. See @ref statistics for more information''',
+ type='category', subconfig=[
+ Config('path', '"WiredTigerStat.%H"', r'''
+ the pathname to a file into which the log records are written,
+ may contain ISO C standard strftime conversion specifications.
+ If the value is not an absolute path name, the file is created
+ relative to the database home'''),
+ Config('sources', '', r'''
+ if non-empty, include statistics for the list of data source
+ URIs, if they are open at the time of the statistics logging.
+ The list may include URIs matching a single data source
+ ("table:mytable"), or a URI matching all data sources of a
+ particular type ("table:")''',
+ type='list'),
+ Config('timestamp', '"%b %d %H:%M:%S"', r'''
+ a timestamp prepended to each log record, may contain strftime
+ conversion specifications'''),
+ Config('wait', '0', r'''
+ seconds to wait between each write of the log records''',
+ min='1', max='100000'),
+ ]),
Config('transaction_sync', 'dsync', r'''
- how to sync log records when the transaction commits''',
- choices=['dsync', 'fsync', 'none']),
+ how to sync log records when the transaction commits''',
+ choices=['dsync', 'fsync', 'none']),
Config('use_environment_priv', 'false', r'''
- use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment
- variables regardless of whether or not the process is running
- with special privileges. See @ref home for more information''',
- type='boolean'),
+ use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment
+ variables regardless of whether or not the process is running
+ with special privileges. See @ref home for more information''',
+ type='boolean'),
]),
}
diff --git a/dist/api_err.py b/dist/api_err.py
index 7ad645d108c..158a7e7ff56 100644
--- a/dist/api_err.py
+++ b/dist/api_err.py
@@ -28,8 +28,11 @@ for line in open('../src/include/wiredtiger.in', 'r'):
if 'undoc' in err.flags:
tfile.write('/*! @cond internal */\n')
tfile.write('/*!%s.%s */\n' %
- (('\n * ' if err.long_desc else ' ') + err.desc[0].upper() + err.desc[1:],
- ''.join('\n * ' + l for l in textwrap.wrap(textwrap.dedent(err.long_desc).strip(), 77)) + '\n' if err.long_desc else ''))
+ (('\n * ' if err.long_desc else ' ') +
+ err.desc[0].upper() + err.desc[1:],
+ ''.join('\n * ' + l for l in textwrap.wrap(
+ textwrap.dedent(err.long_desc).strip(), 77)) +
+ '\n' if err.long_desc else ''))
tfile.write('#define\t%s\t%d\n' % (err.name, v))
v -= 1
if 'undoc' in err.flags:
diff --git a/dist/dist.py b/dist/dist.py
index 8dbe75cba09..a51ce31b97b 100644
--- a/dist/dist.py
+++ b/dist/dist.py
@@ -1,58 +1,29 @@
import filecmp, os, re, shutil
-# source_files_list --
-# Return a list of the source file names in filelist.
-def source_files_list():
- list=[]
- file_re = re.compile(r'^(\w|/)+/((\w|.)+)')
- for line in open('filelist', 'r'):
- if file_re.match(line):
- list.append(file_re.match(line).group(2))
- return sorted(list)
-
# source_files --
-# Print a list of the source file names in filelist.
+# Return a list of the source file names in filelist.
def source_files():
- for line in source_files_list():
- print(line)
-
-# source_paths_list --
-# Return a list of the source file paths in filelist.
-def source_paths_list():
- list=[]
file_re = re.compile(r'^\w')
for line in open('filelist', 'r'):
if file_re.match(line):
- list.append(line.rstrip())
- return sorted(list)
+ yield os.path.join('..', line.rstrip())
-# source_paths --
-# Print a list of the source file paths in filelist.
-def source_paths():
- for line in source_paths_list():
- print(line)
-
-# directory_files_list --
+# source_dirs --
# Return a list of the directories in filelist.
-def directory_files_list():
- dirs = {}
- dir_re = re.compile(r'^((\w|/)+/)')
- for line in open('filelist', 'r'):
- if dir_re.match(line):
- dirs[dir_re.match(line).group(1)] = 1
- return sorted(dirs)
+def source_dirs():
+ dirs = set()
+ for f in source_files():
+ dirs.add(os.path.dirname(f))
+ return dirs
-# directory_files --
-# Print a list of the directories in filelist.
-def directory_files():
- for entry in directory_files_list():
- print(entry)
+def print_source_dirs():
+ for d in source_dirs():
+ print d
# compare_srcfile --
# Compare two files, and if they differ, update the source file.
def compare_srcfile(tmp, src):
- if not os.path.isfile(src) or \
- not filecmp.cmp(tmp, src, False):
+ if not os.path.isfile(src) or not filecmp.cmp(tmp, src, shallow=False):
print('Updating ' + src)
shutil.copyfile(tmp, src)
os.remove(tmp)
diff --git a/dist/filelist b/dist/filelist
index f251731af48..87b75d0b7cf 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -75,7 +75,7 @@ src/cursor/cur_stat.c
src/cursor/cur_std.c
src/cursor/cur_table.c
src/log/log.c
-src/log/log_desc.c
+src/log/log_auto.c
src/log/log_slot.c
src/lsm/lsm_cursor.c
src/lsm/lsm_merge.c
@@ -149,3 +149,5 @@ src/support/stat.c
src/txn/txn.c
src/txn/txn_ckpt.c
src/txn/txn_ext.c
+src/txn/txn_log.c
+src/txn/txn_recover.c
diff --git a/dist/flags.py b/dist/flags.py
index c5cf3c213ea..c7ba71e04b0 100644
--- a/dist/flags.py
+++ b/dist/flags.py
@@ -18,17 +18,16 @@ flags = {
'FILE_TYPE_DATA',
'FILE_TYPE_LOG'
],
- 'log_scan' : [
+ 'log_scan' : [
'LOGSCAN_FIRST',
'LOGSCAN_FROM_CKP',
'LOGSCAN_ONE',
'LOGSCAN_RECOVER'
- ],
- 'log_write' : [
- 'LOG_CKPT',
+ ],
+ 'log_write' : [
'LOG_DSYNC',
'LOG_FSYNC'
- ],
+ ],
'rec_write' : [
'EVICTION_SERVER_LOCKED',
'SKIP_UPDATE_ERR',
@@ -47,6 +46,12 @@ flags = {
'TREE_SKIP_LEAF',
'TREE_WAIT',
],
+ 'txn_log_checkpoint' : [
+ 'TXN_LOG_CKPT_FAIL',
+ 'TXN_LOG_CKPT_PREPARE',
+ 'TXN_LOG_CKPT_START',
+ 'TXN_LOG_CKPT_STOP',
+ ],
'verbose' : [
'VERB_block',
'VERB_ckpt',
@@ -61,6 +66,7 @@ flags = {
'VERB_overflow',
'VERB_read',
'VERB_reconcile',
+ 'VERB_recovery',
'VERB_salvage',
'VERB_shared_cache',
'VERB_verify',
@@ -82,6 +88,8 @@ flags = {
'session' : [
'SESSION_CACHE_BUSY',
'SESSION_INTERNAL',
+ 'SESSION_LOGGING_DISABLED',
+ 'SESSION_LOGGING_INMEM',
'SESSION_NO_CACHE',
'SESSION_NO_CACHE_CHECK',
'SESSION_NO_SCHEMA_LOCK',
diff --git a/dist/java_doc.py b/dist/java_doc.py
index ce42a53c118..d44ccb12160 100644
--- a/dist/java_doc.py
+++ b/dist/java_doc.py
@@ -36,7 +36,8 @@ for line in open(f, 'r'):
m = cfunc_re.match(line)
if m:
- tfile.write('COPYDOC(__' + curr_class.lower() + ', ' + curr_class.upper() + ', ' + m.group(1) + ')\n')
+ tfile.write('COPYDOC(__' + curr_class.lower() + ', ' +
+ curr_class.upper() + ', ' + m.group(1) + ')\n')
tfile.close()
compare_srcfile(tmp_file, o)
diff --git a/dist/log.py b/dist/log.py
index aeed3ee5fe6..73752d4c7d3 100644
--- a/dist/log.py
+++ b/dist/log.py
@@ -1,40 +1,260 @@
#!/usr/bin/env python
import os, re, sys, textwrap
-import log_data
from dist import compare_srcfile
+import log_data
# Temporary file.
tmp_file = '__tmp'
-# Map log record types to C
-c_types = {
- 'string' : 'const char *',
+# Map log record types to:
+# (C type, pack type, printf format, printf arg(s))
+field_types = {
+ 'string' : ('const char *', 'S', '%s', 'arg'),
+ 'item' : ('WT_ITEM *', 'u', '%.*s',
+ '(int)arg.size, (const char *)arg.data'),
+ 'recno' : ('uint64_t', 'r', '%" PRIu64 "', 'arg'),
+ 'uint32' : ('uint32_t', 'I', '%" PRIu32 "', 'arg'),
+ 'uint64' : ('uint64_t', 'Q', '%" PRIu64 "', 'arg'),
}
-# Map log record types to format strings
-fmt_types = {
- 'string' : 'S',
-}
+def cintype(f):
+ return field_types[f[0]][0]
+
+def couttype(f):
+ type = cintype(f)
+ # We already have a pointer to a WT_ITEM
+ if f[0] == 'item':
+ return type
+ if type[-1] != '*':
+ type += ' '
+ return type + '*'
+
+def clocaltype(f):
+ type = cintype(f)
+ # Allocate a WT_ITEM struct on the stack
+ if f[0] == 'item':
+ return type[:-2]
+ return type
+
+def pack_fmt(fields):
+ return ''.join(field_types[f[0]][1] for f in fields)
+
+def op_pack_fmt(r):
+ return 'II' + pack_fmt(r.fields)
+
+def rec_pack_fmt(r):
+ return 'I' + pack_fmt(r.fields)
+
+def printf_fmt(f):
+ return field_types[f[0]][2]
+
+def printf_arg(f):
+ arg = field_types[f[0]][3].replace('arg', f[1])
+ return '\n\t ' + arg if f[0] == 'item' else ' ' + arg
#####################################################################
-# Create log.i with inline functions for each log record type.
+# Update log.h with #defines for types
#####################################################################
-f='../src/include/log.i'
-tfile = open(tmp_file, 'w')
-
-tfile.write('/* DO NOT EDIT: automatically built by dist/log.py. */\n')
+log_defines = (
+ ''.join('#define\t%s\t%d\n' % (r.macro_name(), i)
+ for i, r in enumerate(log_data.rectypes)) +
+ ''.join('#define\t%s\t%d\n' % (r.macro_name(), i)
+ for i, r in enumerate(log_data.optypes))
+)
+tfile = open(tmp_file, 'w')
+skip = 0
+for line in open('../src/include/log.h', 'r'):
+ if skip:
+ if 'Log record declarations: END' in line:
+ tfile.write('/*\n' + line)
+ skip = 0
+ else:
+ tfile.write(line)
+ if 'Log record declarations: BEGIN' in line:
+ skip = 1
+ tfile.write(' */\n')
+ tfile.write(log_defines)
tfile.close()
-compare_srcfile(tmp_file, f)
+compare_srcfile(tmp_file, '../src/include/log.h')
#####################################################################
-# Create log_desc.c with descriptors for each log record type.
+# Create log_auto.c with handlers for each record / operation type.
#####################################################################
-f='../src/log/log_desc.c'
+f='../src/log/log_auto.c'
tfile = open(tmp_file, 'w')
tfile.write('/* DO NOT EDIT: automatically built by dist/log.py. */\n')
+tfile.write('''
+#include "wt_internal.h"
+
+int
+__wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp)
+{
+ WT_ITEM *logrec;
+
+ WT_RET(__wt_scr_alloc(session, WT_ALIGN(size + 1, LOG_ALIGN), &logrec));
+ WT_CLEAR(*(WT_LOG_RECORD *)logrec->data);
+ logrec->size = offsetof(WT_LOG_RECORD, record);
+
+ *logrecp = logrec;
+ return (0);
+}
+
+void
+__wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp)
+{
+ WT_UNUSED(session);
+ __wt_scr_free(logrecp);
+}
+
+int
+__wt_logrec_read(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end, uint32_t *rectypep)
+{
+ uint64_t rectype;
+
+ WT_UNUSED(session);
+ WT_RET(__wt_vunpack_uint(pp, WT_PTRDIFF(end, *pp), &rectype));
+ *rectypep = (uint32_t)rectype;
+ return (0);
+}
+
+int
+__wt_logop_read(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end,
+ uint32_t *optypep, uint32_t *opsizep)
+{
+ return (__wt_struct_unpack(
+ session, *pp, WT_PTRDIFF(end, *pp), "II", optypep, opsizep));
+}
+''')
+
+# Emit code to read, write and print log operations (within a log record)
+for optype in log_data.optypes:
+ if not optype.fields:
+ continue
+
+ tfile.write('''
+int
+__wt_logop_%(name)s_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ %(arg_decls)s)
+{
+ const char *fmt = WT_UNCHECKED_STRING(%(fmt)s);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = %(macro)s;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0%(arg_names)s));
+
+ size += __wt_vsize_uint(size) - 1;
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize%(arg_names)s));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+''' % {
+ 'name' : optype.name,
+ 'macro' : optype.macro_name(),
+ 'arg_decls' : ', '.join(
+ '%s%s%s' % (cintype(f), '' if cintype(f)[-1] == '*' else ' ', f[1])
+ for f in optype.fields),
+ 'arg_names' : ''.join(', %s' % f[1] for f in optype.fields),
+ 'fmt' : op_pack_fmt(optype)
+})
+
+ tfile.write('''
+int
+__wt_logop_%(name)s_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ %(arg_decls)s)
+{
+ const char *fmt = WT_UNCHECKED_STRING(%(fmt)s);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size%(arg_names)s));
+ WT_ASSERT(session, optype == %(macro)s);
+
+ *pp += size;
+ return (0);
+}
+''' % {
+ 'name' : optype.name,
+ 'macro' : optype.macro_name(),
+ 'arg_decls' : ', '.join(
+ '%s%sp' % (couttype(f), f[1]) for f in optype.fields),
+ 'arg_names' : ''.join(', %sp' % f[1] for f in optype.fields),
+ 'fmt' : op_pack_fmt(optype)
+})
+
+ tfile.write('''
+int
+__wt_logop_%(name)s_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ %(arg_decls)s
+
+ WT_RET(__wt_logop_%(name)s_unpack(
+ session, pp, end%(arg_addrs)s));
+
+ %(print_args)s
+ return (0);
+}
+''' % {
+ 'name' : optype.name,
+ 'arg_decls' : '\n\t'.join('%s%s%s;' %
+ (clocaltype(f), '' if clocaltype(f)[-1] == '*' else ' ', f[1])
+ for f in optype.fields),
+ 'arg_addrs' : ''.join(', &%s' % f[1] for f in optype.fields),
+ 'print_args' : '\n\t'.join(
+ 'fprintf(out, " \\"%s\\": \\"%s\\",\\n",%s);' %
+ (f[1], printf_fmt(f), printf_arg(f))
+ for f in optype.fields),
+})
+
+# Emit the printlog entry point
+tfile.write('''
+int
+__wt_txn_op_printlog(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t optype, opsize;
+
+ /* Peek at the size and the type. */
+ WT_RET(__wt_logop_read(session, pp, end, &optype, &opsize));
+ end = *pp + opsize;
+
+ switch (optype) {''')
+
+for optype in log_data.optypes:
+ if not optype.fields:
+ continue
+
+ tfile.write('''
+ case %(macro)s:
+ WT_RET(%(print_func)s(session, pp, end, out));
+ break;
+''' % {
+ 'macro' : optype.macro_name(),
+ 'print_func' : '__wt_logop_' + optype.name + '_print',
+})
+
+tfile.write('''
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+''')
+
tfile.close()
compare_srcfile(tmp_file, f)
diff --git a/dist/log_data.py b/dist/log_data.py
index 1e274e48991..64fe73a88a2 100644
--- a/dist/log_data.py
+++ b/dist/log_data.py
@@ -1,10 +1,60 @@
-# Data for config.py, describes all configuration key / value pairs
+# Data for log.py, describes the format of log records
+
+# There are a small number of main log record types.
+#
+# Some log record types, such as transaction commit, also include a list of
+# "log operations" within the same log record. Both log record types and log
+# operations are described here.
class LogRecordType:
- def __init__(self, name, fields):
- self.name = name
- self.fields = fields
+ def __init__(self, name, fields):
+ self.name = name
+ self.fields = fields
+
+ def macro_name(self):
+ return 'WT_LOGREC_%s' % self.name.upper()
+
+ def prname(self):
+ return '__logrec_print_' + self.name
+
+rectypes = [
+ # A database-wide checkpoint.
+ LogRecordType('checkpoint', [
+ ('WT_LSN', 'ckpt_lsn'), ('uint32', 'nsnapshot'), ('item', 'snapshot')]),
+
+ # Common case: a transaction commit
+ LogRecordType('commit', [('uint64', 'txnid')]),
+
+ # Mark the start / end of a file sync operation (usually when a file is
+ # closed). These log records aren't required during recovery, but we use
+ # the allocated LSN to reduce the amount of work recovery has to do, and
+ # they are useful for debugging recovery.
+ LogRecordType('file_sync', [('uint32', 'fileid'), ('int', 'start')]),
+
+ # Debugging message in the log
+ LogRecordType('message', [('string', 'message')]),
+]
+
+class LogOperationType:
+ def __init__(self, name, fields):
+ self.name = name
+ self.fields = fields
+
+ def macro_name(self):
+ return 'WT_LOGOP_%s' % self.name.upper()
-types = [
- LogRecordType('debug', [('string', 'message')])
+optypes = [
+ LogOperationType('col_put',
+ [('uint32', 'fileid'), ('recno', 'recno'), ('item', 'value')]),
+ LogOperationType('col_remove',
+ [('uint32', 'fileid'), ('recno', 'recno')]),
+ LogOperationType('col_truncate',
+ [('uint32', 'fileid'), ('recno', 'start'), ('recno', 'stop')]),
+ LogOperationType('row_put',
+ [('uint32', 'fileid'), ('item', 'key'), ('item', 'value')]),
+ LogOperationType('row_remove',
+ [('uint32', 'fileid'), ('item', 'key')]),
+ LogOperationType('row_truncate',
+ [('uint32', 'fileid'), ('item', 'start'), ('item', 'stop'),
+ ('uint32', 'mode')]),
]
diff --git a/dist/s_all b/dist/s_all
index 11506fd76fd..c39ea945b3a 100644
--- a/dist/s_all
+++ b/dist/s_all
@@ -72,7 +72,8 @@ run "sh ./s_stat" "checking for unused statistics fields"
run "sh ./s_getopt" "checking for incorrect getopt usage"
run "sh ./s_longlines" "checking for long lines"
run "sh ./s_string" "checking string spelling"
-run "sh ./s_style" "checking style"
+run "python style.py" "checking style (pass 1)"
+run "sh ./s_style" "checking style (pass 2)"
run "sh ./s_symbols" "checking external symbol names"
run "sh ./s_typedef -c" "checking for unused typedefs"
run "sh ./s_whitespace" "checking whitespace"
diff --git a/dist/s_copyright b/dist/s_copyright
index ce8f9970ae3..966cddcfff4 100644
--- a/dist/s_copyright
+++ b/dist/s_copyright
@@ -72,7 +72,10 @@ check()
for i in `cd .. &&
find [a-z]* -name '*.[chi]' \
-o -name '*.cxx' -o -name '*.java' -o -name '*.py' |
- sed -e '/test\/3rdparty\//d' -e 's/^\.\///'`; do
+ sed -e '/test\/3rdparty\//d' \
+ -e '/^build/d' \
+ -e 's/^\.\///'`
+do
check $i
done
diff --git a/dist/s_copyright.list b/dist/s_copyright.list
index 3213ace3b5b..ce990c43474 100644
--- a/dist/s_copyright.list
+++ b/dist/s_copyright.list
@@ -7,12 +7,21 @@ skip dist/db.py
skip dist/dist.py
skip dist/flags.py
skip dist/java_doc.py
-skip dist/log.py
skip dist/log_data.py
+skip dist/log.py
skip dist/serial.py
-skip dist/stat.py
skip dist/stat_data.py
+skip dist/stat.py
+skip dist/style.py
skip lang/java/java_doc.i
+skip lang/java/src/com/wiredtiger/db/Connection.java
+skip lang/java/src/com/wiredtiger/db/Cursor.java
+skip lang/java/src/com/wiredtiger/db/SearchStatus.java
+skip lang/java/src/com/wiredtiger/db/Session.java
+skip lang/java/src/com/wiredtiger/db/wiredtigerConstants.java
+skip lang/java/src/com/wiredtiger/db/wiredtiger.java
+skip lang/java/src/com/wiredtiger/db/wiredtigerJNI.java
+skip lang/java/wiredtiger_wrap.c
skip lang/python/setup.py
skip lang/python/wiredtiger/__init__.py
skip lang/python/wiredtiger_wrap.c
@@ -24,8 +33,8 @@ skip src/include/flags.h
skip src/include/log.i
skip src/include/queue.h
skip src/include/serial_funcs.i
-skip src/log/log_desc.c
+skip src/log/log_auto.c
skip src/support/stat.c
-skip test/packing/intpack-test.c
skip test/packing/intpack-test2.c
+skip test/packing/intpack-test.c
skip test/packing/packing-test.c
diff --git a/dist/s_define.list b/dist/s_define.list
index a30378ea899..e3d1d2593e1 100644
--- a/dist/s_define.list
+++ b/dist/s_define.list
@@ -5,10 +5,12 @@ API_CALL_NOCONF
API_SESSION_INIT
FLD_CLR
HAVE_ATOMICS
+IS_INIT_LSN
LF_CLR
LF_SET
LLONG_MAX
LLONG_MIN
+MAX_LSN
SIZE_CHECK
TXNID_LE
TXN_API_CALL
@@ -18,8 +20,11 @@ WT_BARRIER
WT_BLOCK_DESC_SIZE
WT_CACHE_LINE_ALIGNMENT
WT_DEBUG_BYTE
+WT_HANDLE_CLOSED
+WT_HANDLE_NULLABLE
WT_READ_BARRIER
WT_REF_SIZE
+WT_RET_TIMEDOUT_OK
WT_SPINLOCK_MAX
WT_STAT_ATOMIC_DECR
WT_STAT_ATOMIC_INCR
diff --git a/dist/s_funcs.list b/dist/s_funcs.list
index 64ac90752f3..b34564adda7 100644
--- a/dist/s_funcs.list
+++ b/dist/s_funcs.list
@@ -9,6 +9,7 @@ __wt_bloom_drop
__wt_bloom_get
__wt_cache_dump
__wt_config_getone
+__wt_cursor_get_raw_value
__wt_debug_addr
__wt_debug_offset
__wt_debug_tree
diff --git a/dist/s_longlines b/dist/s_longlines
index d0b32ce341f..2aa7e9eaaff 100644
--- a/dist/s_longlines
+++ b/dist/s_longlines
@@ -6,6 +6,7 @@ trap 'rm -f $t; exit 0' 0 1 2 3 13 15
l=`(cd .. &&
find bench/wtperf examples ext src test -name '*.[chisy]' &&
+ find dist -name '*.py' &&
find src -name '*.in') |
grep -v 'support/stat\.c'`
diff --git a/dist/s_string.ok b/dist/s_string.ok
index d230c987ab2..a66518d57d5 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -70,6 +70,7 @@ DUPLICATEV
DbCursor
DbEnv
Decrement
+EAGAIN
EB
EBUSY
EINTR
@@ -115,6 +116,7 @@ IMPL's
INDX
INIT
INITIALIZER
+INMEM
INSERT's
INUSE
ISSET
@@ -131,6 +133,7 @@ Kounavis
LEX
LF
LNO
+LOGSCAN
LRU
LSB
LSM
@@ -441,6 +444,7 @@ errv
errx
esc
eventv
+evictable
evictserver
exactp
extern
@@ -458,8 +462,10 @@ ffs
fgetln
fh
filefrag
+fileid
filename
filenames
+fileop
fileops
filesize
filesystem
@@ -467,6 +473,7 @@ firstfit
fixup
flcs
fmt
+fmterr
fnv
foc
fopen
@@ -548,6 +555,7 @@ keyv
kv
kvs
kvsbdb
+lang
latencies
ld
len
@@ -568,6 +576,7 @@ logf
logmgr
lognum
logput
+logread
lookup
lookups
lr
@@ -682,6 +691,7 @@ printf
printlog
priv
ps
+pse
psp
pthread
putK
@@ -832,6 +842,7 @@ treplacement
trk
trk's
troot
+trunc
trywrlock
tsalvage
tsplit
@@ -894,6 +905,7 @@ versa
vlcs
vmsg
vpack
+vprintf
vrfy
vsize
vslot
diff --git a/dist/s_tags b/dist/s_tags
index 65b8d917a05..e189587ca82 100644
--- a/dist/s_tags
+++ b/dist/s_tags
@@ -36,7 +36,7 @@ rm -f tags
ctags $flags ../include/*.in ../*/*.[chi] 2>/dev/null)
# Link to the tags file from standard build and source directories.
-dirs="`python -c 'from dist import directory_files; directory_files();'`"
+dirs="`python -c 'import dist; dist.print_source_dirs()'`"
for i in $dirs; do
- (cd ../$i && rm -f tags && ln -s ../include/tags .)
+ (cd $i && rm -f tags && ln -s ../include/tags .)
done
diff --git a/dist/stat.py b/dist/stat.py
index 783f2f191f2..69dcb87e8b3 100644
--- a/dist/stat.py
+++ b/dist/stat.py
@@ -3,7 +3,6 @@
import re, string, sys, textwrap
from dist import compare_srcfile
-from dist import source_paths_list
# Read the source files.
from stat_data import dsrc_stats, connection_stats
diff --git a/dist/stat_data.py b/dist/stat_data.py
index acc3358012f..1074b7d99cf 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -69,9 +69,10 @@ connection_stats = [
Stat('cache_eviction_fail',
'cache: pages selected for eviction unable to be evicted'),
Stat('cache_eviction_force',
- 'cache: pages evicted because they exceeded the in memory maximum'),
+ 'cache: pages evicted because they exceeded the in memory maximum'),
Stat('cache_eviction_force_fail',
- 'cache: failed eviction of pages that exceeded the in memory maximum'),
+ 'cache: failed eviction of pages that exceeded the ' +
+ 'in memory maximum'),
Stat('cache_eviction_hazard',
'cache: hazard pointer blocked page eviction'),
Stat('cache_eviction_internal', 'cache: internal pages evicted'),
@@ -98,11 +99,15 @@ connection_stats = [
Stat('dh_conn_handles', 'dhandle: connection dhandles swept'),
Stat('dh_session_handles', 'dhandle: session dhandles swept'),
Stat('dh_sweep_evict', 'dhandle: sweeps conflicting with evict'),
- Stat('dh_sweeps', 'dhandle: number of sweep attempts'),
+ Stat('dh_sweeps', 'dhandle: sweep attempts'),
##########################################
# Logging statistics
##########################################
+ Stat('log_buffer_grow',
+ 'log: log buffer size increases'),
+ Stat('log_buffer_size',
+ 'log: total log buffer size', 'no_clear,no_scale'),
Stat('log_bytes_user', 'log: user provided log bytes written'),
Stat('log_bytes_written', 'log: log bytes written'),
Stat('log_max_filesize', 'log: maximum log file size', 'no_clear'),
@@ -115,9 +120,17 @@ connection_stats = [
Stat('log_slot_consolidated', 'log: logging bytes consolidated'),
Stat('log_slot_closes', 'log: consolidated slot closures'),
+ Stat('log_slot_ready_wait_timeout',
+ 'log: log slot ready wait timeouts'),
Stat('log_slot_joins', 'log: consolidated slot joins'),
Stat('log_slot_races', 'log: consolidated slot join races'),
+ Stat('log_slot_release_wait_timeout',
+ 'log: log slot release wait timeouts'),
+ Stat('log_slot_switch_fails',
+ 'log: slots selected for switching that were unavailable'),
Stat('log_slot_toobig', 'log: record size exceeded maximum'),
+ Stat('log_slot_toosmall',
+ 'log: failed to find a slot large enough for record'),
Stat('log_slot_transitions', 'log: consolidated slot join transitions'),
##########################################
@@ -245,7 +258,8 @@ dsrc_stats = [
##########################################
Stat('block_alloc', 'block manager: blocks allocated'),
Stat('allocation_size',
- 'block manager: file allocation unit size', 'no_aggregate,no_scale'),
+ 'block manager: file allocation unit size',
+ 'no_aggregate,no_scale'),
Stat('block_checkpoint_size',
'block manager: checkpoint size', 'no_scale'),
Stat('block_extension',
@@ -320,7 +334,7 @@ dsrc_stats = [
Stat('rec_split_leaf', 'reconciliation leaf pages split'),
Stat('rec_split_max',
- 'reconciliation maximum number of splits created for a page',
+ 'reconciliation maximum splits for a page',
'max_aggregate,no_scale'),
##########################################
diff --git a/dist/style.py b/dist/style.py
new file mode 100755
index 00000000000..0c5083eb521
--- /dev/null
+++ b/dist/style.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+
+# Check the style of WiredTiger C code.
+from dist import source_files
+import re
+
+skip_re = re.compile(r'DO NOT EDIT: automatically built')
+func_re = re.compile(r'(/\*(?:[^\*]|\*[^/])*\*/)?\n\w[\w ]+\n(\w+)', re.DOTALL)
+
+for f in source_files():
+ s = open(f, 'r').read()
+ if skip_re.search(s):
+ continue
+ for m in func_re.finditer(s):
+ if not m.group(1) or \
+ not m.group(1).startswith('/*\n * %s --\n' % m.group(2)):
+ print "%s:%d: missing comment for %s" % \
+ (f, s[:m.start(2)].count('\n'), m.group(2))
diff --git a/lang/java/Makefile.am b/lang/java/Makefile.am
index 6c9346af2e1..4fbf9bef5dd 100644
--- a/lang/java/Makefile.am
+++ b/lang/java/Makefile.am
@@ -30,6 +30,7 @@ JAVA_SRC = \
$(JAVAEXAMPLES)/ex_access.java
JAVA_JUNIT = \
+ $(JAVATEST)/AutoCloseTest.java \
$(JAVATEST)/CursorTest.java \
$(JAVATEST)/CursorTest02.java \
$(JAVATEST)/PackTest.java \
diff --git a/lang/java/wiredtiger.i b/lang/java/wiredtiger.i
index 1ad1a09768c..3c1cc1d347e 100644
--- a/lang/java/wiredtiger.i
+++ b/lang/java/wiredtiger.i
@@ -25,13 +25,47 @@
%}
%{
-typedef int bool;
+#include "../src/include/wt_internal.h"
+
+/*
+ * Closed handle checking:
+ *
+ * The typedef WT_CURSOR_NULLABLE used in wiredtiger.h is only made
+ * visible to the SWIG parser and is used to identify arguments of
+ * Cursor type that are permitted to be null. Likewise, typedefs
+ * WT_{CURSOR,SESSION,CONNECTION}_CLOSED identify 'close' calls that
+ * need explicit nulling of the swigCPtr. These typedefs permit
+ * special casing in typemaps for input args.
+ *
+ * We want SWIG to see these 'fake' typenames, but not the compiler.
+ */
+#define WT_CURSOR_NULLABLE WT_CURSOR
+#define WT_CURSOR_CLOSED WT_CURSOR
+#define WT_SESSION_CLOSED WT_SESSION
+#define WT_CONNECTION_CLOSED WT_CONNECTION
+
+/*
+ * For Connections, Sessions and Cursors created in Java, each of
+ * WT_CONNECTION_IMPL, WT_SESSION_IMPL and WT_CURSOR have a
+ * lang_private field that store a pointer to a JAVA_CALLBACK, alloced
+ * during the various open calls. {conn,session,cursor}CloseHandler()
+ * functions reach into the associated java object, set the swigCPtr
+ * to 0, and free the JAVA_CALLBACK. Typemaps matching Connection,
+ * Session, Cursor args use the NULL_CHECK macro, which checks if
+ * swigCPtr is 0.
+ */
+typedef struct {
+ JNIEnv *jnienv; /* jni env that created the Session/Cursor */
+ jobject jobj; /* the java Session/Cursor object */
+ jfieldID fid; /* cached Cursor.swigCPtr field id in session */
+} JAVA_CALLBACK;
static void throwWiredTigerException(JNIEnv *jenv, const char *msg) {
jclass excep = (*jenv)->FindClass(jenv, "com/wiredtiger/db/WiredTigerException");
if (excep)
(*jenv)->ThrowNew(jenv, excep, msg);
}
+
%}
/* No finalizers */
@@ -71,7 +105,7 @@ static void throwWiredTigerException(JNIEnv *jenv, const char *msg) {
%}
%typemap(argout) WT_ITEM * %{
- (*jenv)->ReleaseByteArrayElements(jenv, $input, $1->data, 0);
+ (*jenv)->ReleaseByteArrayElements(jenv, $input, (void *)$1->data, 0);
%}
%typemap(out) WT_ITEM %{
@@ -94,6 +128,15 @@ static void throwWiredTigerException(JNIEnv *jenv, const char *msg) {
$result = $1;
%}
+%define NULL_CHECK(val, name)
+ if (!val) {
+ SWIG_JavaThrowException(jenv, SWIG_JavaNullPointerException,
+ #name " is null");
+ return 0;
+ }
+%enddef
+
+%define WT_CLASS(type, class, name, closeHandler)
/*
* Extra 'self' elimination.
* The methods we're wrapping look like this:
@@ -105,8 +148,26 @@ static void throwWiredTigerException(JNIEnv *jenv, const char *msg) {
* and we use consecutive argument matching of typemaps to convert two args to
* one.
*/
-%define WT_CLASS(type, class, name)
-%typemap(in, numinputs=0) type *name "$1 = *(type **)&jarg1;"
+%typemap(in, numinputs=0) type *name {
+ $1 = *(type **)&jarg1;
+ NULL_CHECK($1, $1_name)
+}
+
+%typemap(in, numinputs=0) class ## _CLOSED *name {
+ $1 = *(type **)&jarg1;
+ NULL_CHECK($1, $1_name)
+ closeHandler;
+}
+
+%typemap(in) class ## _NULLABLE * {
+ $1 = *(type **)&$input;
+}
+
+%typemap(in) type * {
+ $1 = *(type **)&$input;
+ NULL_CHECK($1, $1_name)
+}
+
%typemap(javaimports) type "
/**
* @copydoc class
@@ -126,9 +187,9 @@ static void throwWiredTigerException(JNIEnv *jenv, const char *msg) {
*/
%}
-WT_CLASS(struct __wt_connection, WT_CONNECTION, connection)
-WT_CLASS(struct __wt_session, WT_SESSION, session)
-WT_CLASS(struct __wt_cursor, WT_CURSOR, cursor)
+WT_CLASS(struct __wt_connection, WT_CONNECTION, connection, connCloseHandler($1))
+WT_CLASS(struct __wt_session, WT_SESSION, session, sessionCloseHandler($1))
+WT_CLASS(struct __wt_cursor, WT_CURSOR, cursor, cursorCloseHandler($1))
%define COPYDOC(SIGNATURE_CLASS, CLASS, METHOD)
%javamethodmodifiers SIGNATURE_CLASS::METHOD "
@@ -170,6 +231,103 @@ WT_CLASS(struct __wt_cursor, WT_CURSOR, cursor)
enum SearchStatus { FOUND, NOTFOUND, SMALLER, LARGER };
%}
+%wrapper %{
+/* Zero out SWIG's pointer to the C object,
+ * equivalent to 'jobj.swigCPtr = 0;' in java.
+ */
+static int
+javaClose(JAVA_CALLBACK *jcb, jfieldID *pfid)
+{
+ jclass cls;
+ jfieldID fid;
+ JNIEnv *env;
+
+ env = jcb->jnienv;
+
+ if (pfid == NULL || *pfid == NULL) {
+ cls = (*env)->GetObjectClass(env, jcb->jobj);
+ *pfid = (*env)->GetFieldID(env, cls, "swigCPtr", "J");
+ if (pfid != NULL)
+ *pfid = fid;
+ }
+ (*env)->SetLongField(env, jcb->jobj, *pfid, 0L);
+ (*env)->DeleteGlobalRef(env, jcb->jobj);
+ return (0);
+}
+
+/* Connection specific close handler. */
+static int
+connCloseHandler(WT_CONNECTION *conn_arg)
+{
+ int ret;
+ JAVA_CALLBACK *jcb;
+ WT_CONNECTION_IMPL *conn;
+
+ conn = (WT_CONNECTION_IMPL *)conn_arg;
+ jcb = (JAVA_CALLBACK *)conn->lang_private;
+ conn->lang_private = NULL;
+ ret = javaClose(jcb, NULL);
+ __wt_free(conn->default_session, jcb);
+
+ return (0);
+}
+
+/* Session specific close handler. */
+static int
+sessionCloseHandler(WT_SESSION *session_arg)
+{
+ int ret;
+ JAVA_CALLBACK *jcb;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)session_arg;
+ jcb = (JAVA_CALLBACK *)session->lang_private;
+ session->lang_private = NULL;
+ ret = javaClose(jcb, NULL);
+ __wt_free(session, jcb);
+
+ return (ret);
+}
+
+/* Cursor specific close handler. */
+static int
+cursorCloseHandler(WT_CURSOR *cursor)
+{
+ int ret;
+ JAVA_CALLBACK *jcb;
+ JAVA_CALLBACK *sess_jcb;
+
+ static jfieldID ccp_fid = NULL; /* cursor cptr fid, computed once */
+
+ jcb = (JAVA_CALLBACK *)cursor->lang_private;
+ sess_jcb = (JAVA_CALLBACK *)
+ ((WT_SESSION_IMPL *)cursor->session)->lang_private;
+ cursor->lang_private = NULL;
+ ret = javaClose(jcb, &sess_jcb->fid);
+ __wt_free((WT_SESSION_IMPL *)cursor->session, jcb);
+
+ return (ret);
+}
+
+/* Add event handler support. */
+static int
+javaCloseHandler(WT_EVENT_HANDLER *handler, WT_SESSION *session,
+ WT_CURSOR *cursor)
+{
+ int ret;
+
+ WT_UNUSED(handler);
+
+ if (cursor != NULL)
+ ret = cursorCloseHandler(cursor);
+ else
+ ret = sessionCloseHandler(session);
+ return (ret);
+}
+
+WT_EVENT_HANDLER javaApiEventHandler = {NULL, NULL, NULL, javaCloseHandler};
+%}
+
%extend __wt_cursor {
%javamethodmodifiers get_key_wrap "protected";
@@ -237,6 +395,14 @@ enum SearchStatus { FOUND, NOTFOUND, SMALLER, LARGER };
throwWiredTigerException(jenv, wiredtiger_strerror(ret));
return cmp;
}
+
+ %javamethodmodifiers java_init "protected";
+ int java_init(jobject jcursor) {
+ JAVA_CALLBACK *jcb = (JAVA_CALLBACK *)$self->lang_private;
+ jcb->jobj = JCALL1(NewGlobalRef, jcb->jnienv, jcursor);
+ JCALL1(DeleteLocalRef, jcb->jnienv, jcursor);
+ return (0);
+ }
}
/* Cache key/value formats in Cursor */
@@ -257,6 +423,7 @@ enum SearchStatus { FOUND, NOTFOUND, SMALLER, LARGER };
valueFormat = getValue_format();
keyPacker = new PackOutputStream(keyFormat);
valuePacker = new PackOutputStream(valueFormat);
+ wiredtigerJNI.Cursor_java_init(swigCPtr, this, this);
}
protected static long getCPtr($javaclassname obj) {
@@ -784,14 +951,63 @@ enum SearchStatus { FOUND, NOTFOUND, SMALLER, LARGER };
%rename(Session) __wt_session;
%rename(Connection) __wt_connection;
+%define TRACKED_CLASS(jclassname, ctypename, java_init_fcn, implclass)
+%ignore jclassname::jclassname();
+
+%typemap(javabody) struct ctypename %{
+ private long swigCPtr;
+ protected boolean swigCMemOwn;
+
+ protected $javaclassname(long cPtr, boolean cMemoryOwn) {
+ swigCMemOwn = cMemoryOwn;
+ swigCPtr = cPtr;
+ java_init_fcn(swigCPtr, this, this);
+ }
+
+ protected static long getCPtr($javaclassname obj) {
+ return (obj == null) ? 0 : obj.swigCPtr;
+ }
+%}
+
+%extend ctypename {
+ %javamethodmodifiers java_init "protected";
+ int java_init(jobject jsess) {
+ implclass *session = (implclass *)$self;
+ JAVA_CALLBACK *jcb = (JAVA_CALLBACK *)session->lang_private;
+ jcb->jobj = JCALL1(NewGlobalRef, jcb->jnienv, jsess);
+ JCALL1(DeleteLocalRef, jcb->jnienv, jsess);
+ return (0);
+ }
+}
+%enddef
+
+TRACKED_CLASS(Session, __wt_session, wiredtigerJNI.Session_java_init, WT_SESSION_IMPL)
+TRACKED_CLASS(Connection, __wt_connection, wiredtigerJNI.Connection_java_init, WT_CONNECTION_IMPL)
+/* Note: Cursor incorporates the elements of TRACKED_CLASS into its
+ * custom constructor and %extend clause.
+ */
+
%include "wiredtiger.h"
/* Return new connections, sessions and cursors. */
%inline {
WT_CONNECTION *wiredtiger_open_wrap(JNIEnv *jenv, const char *home, const char *config) {
+ extern WT_EVENT_HANDLER javaApiEventHandler;
WT_CONNECTION *conn = NULL;
+ WT_CONNECTION_IMPL *connimpl;
+ JAVA_CALLBACK *jcb;
int ret;
- if ((ret = wiredtiger_open(home, NULL, config, &conn)) != 0)
+ if ((ret = wiredtiger_open(home, &javaApiEventHandler, config, &conn)) != 0)
+ goto err;
+
+ connimpl = (WT_CONNECTION_IMPL *)conn;
+ if ((ret = __wt_calloc_def(connimpl->default_session, 1, &jcb)) != 0)
+ goto err;
+
+ jcb->jnienv = jenv;
+ connimpl->lang_private = jcb;
+
+err: if (ret != 0)
throwWiredTigerException(jenv, wiredtiger_strerror(ret));
return conn;
}
@@ -799,22 +1015,48 @@ WT_CONNECTION *wiredtiger_open_wrap(JNIEnv *jenv, const char *home, const char *
%extend __wt_connection {
WT_SESSION *open_session_wrap(JNIEnv *jenv, const char *config) {
+ extern WT_EVENT_HANDLER javaApiEventHandler;
WT_SESSION *session = NULL;
+ WT_SESSION_IMPL *sessionimpl;
+ JAVA_CALLBACK *jcb;
int ret;
- if ((ret = $self->open_session($self, NULL, config, &session)) != 0)
+
+ if ((ret = $self->open_session($self, &javaApiEventHandler, config, &session)) != 0)
+ goto err;
+
+ sessionimpl = (WT_SESSION_IMPL *)session;
+ if ((ret = __wt_calloc_def(sessionimpl, 1, &jcb)) != 0)
+ goto err;
+
+ jcb->jnienv = jenv;
+ sessionimpl->lang_private = jcb;
+
+err: if (ret != 0)
throwWiredTigerException(jenv, wiredtiger_strerror(ret));
return session;
}
}
%extend __wt_session {
- WT_CURSOR *open_cursor_wrap(JNIEnv *jenv, const char *uri, WT_CURSOR *to_dup, const char *config) {
+ WT_CURSOR *open_cursor_wrap(JNIEnv *jenv, const char *uri, WT_CURSOR_NULLABLE *to_dup, const char *config) {
WT_CURSOR *cursor = NULL;
+ JAVA_CALLBACK *jcb;
int ret;
+
if ((ret = $self->open_cursor($self, uri, to_dup, config, &cursor)) != 0)
+ goto err;
+
+ cursor->flags |= WT_CURSTD_RAW;
+
+ if ((ret = __wt_calloc_def((WT_SESSION_IMPL *)cursor->session,
+ 1, &jcb)) != 0)
+ goto err;
+
+ jcb->jnienv = jenv;
+ cursor->lang_private = jcb;
+
+err: if (ret != 0)
throwWiredTigerException(jenv, wiredtiger_strerror(ret));
- else
- cursor->flags |= WT_CURSTD_RAW;
return cursor;
}
}
diff --git a/lang/python/wiredtiger.i b/lang/python/wiredtiger.i
index bd92c246bd3..14effa18ee4 100644
--- a/lang/python/wiredtiger.i
+++ b/lang/python/wiredtiger.i
@@ -405,6 +405,12 @@ typedef int int_void;
%}
};
+%extend __wt_session {
+ int log_printf(const char *msg) {
+ return self->log_printf(self, "%s", msg);
+ }
+};
+
/* Remove / rename parts of the C API that we don't want in Python. */
%immutable __wt_cursor::session;
%immutable __wt_cursor::uri;
@@ -412,20 +418,20 @@ typedef int int_void;
%ignore __wt_cursor::value_format;
%immutable __wt_session::connection;
-%ignore __wt_buf;
+%ignore __wt_collator;
+%ignore __wt_compressor;
%ignore __wt_config_item;
+%ignore __wt_data_source;
%ignore __wt_event_handler;
%ignore __wt_extractor;
%ignore __wt_item;
-%ignore __wt_collator;
%ignore __wt_connection::add_collator;
-%ignore __wt_compressor;
%ignore __wt_connection::add_compressor;
-%ignore __wt_data_source;
%ignore __wt_connection::add_data_source;
%ignore __wt_connection::add_extractor;
%ignore __wt_connection::get_extension_api;
+%ignore __wt_session::log_printf;
%ignore wiredtiger_struct_pack;
%ignore wiredtiger_struct_size;
diff --git a/src/block/block_ext.c b/src/block/block_ext.c
index 306fad1f265..ba704cf19c3 100644
--- a/src/block/block_ext.c
+++ b/src/block/block_ext.c
@@ -867,7 +867,7 @@ __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b)
}
/*
- * __wt_block_insert_ext, __block_merge --
+ * __wt_block_insert_ext --
* Insert an extent into an extent list, merging if possible.
*/
int
@@ -887,6 +887,12 @@ __wt_block_insert_ext(
*/
return (__block_merge(session, el, off, size));
}
+
+/*
+ * __block_merge --
+ * Insert an extent into an extent list, merging if possible (internal
+ * version).
+ */
static int
__block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
{
@@ -1239,6 +1245,10 @@ __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el)
memset(el, 0, sizeof(*el));
}
+/*
+ * __block_extlist_dump --
+ * Dump an extent list as verbose messages.
+ */
static int
__block_extlist_dump(
WT_SESSION_IMPL *session, const char *tag, WT_EXTLIST *el, int show_size)
diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c
index b688e994293..200bbf101df 100644
--- a/src/block/block_mgr.c
+++ b/src/block/block_mgr.c
@@ -22,7 +22,7 @@ __bm_readonly(WT_BM *bm, WT_SESSION_IMPL *session)
}
/*
- * __bm_addr_string
+ * __bm_addr_string --
* Return a printable string representation of an address cookie.
*/
static int
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 44eed6ba0c5..936a112d07d 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -683,19 +683,32 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
{
WT_BTREE *btree;
WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
WT_SESSION_IMPL *session;
cbt = (start != NULL) ? start : stop;
session = (WT_SESSION_IMPL *)cbt->iface.session;
btree = cbt->btree;
+ /*
+ * For recovery, we log the start and stop keys for a truncate
+ * operation, not the individual records removed. On the other hand,
+ * for rollback we need to keep track of all the in-memory operations.
+ *
+ * We deal with this here by logging the truncate range first, then (in
+ * the logging code) disabling writing of the in-memory remove records
+ * to disk.
+ */
+ if (S2C(session)->logging)
+ WT_RET(__wt_txn_truncate_log(session, start, stop));
+
switch (btree->type) {
case BTREE_COL_FIX:
- WT_RET(__cursor_truncate_fix(
+ WT_ERR(__cursor_truncate_fix(
session, start, stop, __wt_col_modify));
break;
case BTREE_COL_VAR:
- WT_RET(__cursor_truncate(
+ WT_ERR(__cursor_truncate(
session, start, stop, __wt_col_modify));
break;
case BTREE_ROW:
@@ -710,15 +723,17 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
* are positioned in the tree.
*/
if (start != NULL)
- WT_RET(__wt_btcur_search(start));
+ WT_ERR(__wt_btcur_search(start));
if (stop != NULL)
- WT_RET(__wt_btcur_search(stop));
- WT_RET(__cursor_truncate(
+ WT_ERR(__wt_btcur_search(stop));
+ WT_ERR(__cursor_truncate(
session, start, stop, __wt_row_modify));
break;
}
- return (0);
+err: if (S2C(session)->logging)
+ WT_TRET(__wt_txn_truncate_end(session));
+ return (ret);
}
/*
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
index 404aaa5a5bb..1c7f0e205b8 100644
--- a/src/btree/bt_evict.c
+++ b/src/btree/bt_evict.c
@@ -169,7 +169,8 @@ __wt_cache_evict_server(void *arg)
F_CLR(cache, WT_EVICT_ACTIVE);
WT_VERBOSE_ERR(session, evictserver, "sleeping");
/* Don't rely on signals: check periodically. */
- WT_ERR(__wt_cond_wait(session, cache->evict_cond, 100000));
+ WT_ERR_TIMEDOUT_OK(
+ __wt_cond_wait(session, cache->evict_cond, 100000));
WT_VERBOSE_ERR(session, evictserver, "waking");
}
@@ -350,9 +351,9 @@ __wt_evict_page(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
- * __wt_evict_file_exclusive_on
+ * __wt_evict_file_exclusive_on --
* Get exclusive eviction access to a file and discard any of the file's
- * blocks queued for eviction.
+ * blocks queued for eviction.
*/
void
__wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
@@ -398,7 +399,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
}
/*
- * __wt_evict_file_exclusive_off
+ * __wt_evict_file_exclusive_off --
* Release exclusive eviction access to a file.
*/
void
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 6db8534aca4..1e4d073a290 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -197,6 +197,10 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
"%" PRIu64 ".%" PRIu64, maj_version, min_version);
}
+ /* Get the file ID. */
+ WT_RET(__wt_config_gets(session, cfg, "id", &cval));
+ btree->id = (uint32_t)cval.val;
+
/* Validate file types and check the data format plan. */
WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL));
@@ -513,7 +517,7 @@ __wt_btree_new_leaf_page(
}
/*
- * __wt_btree_no_eviction --
+ * __wt_btree_evictable --
* Setup or release a cache-resident tree.
*/
void
@@ -701,6 +705,10 @@ __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
return (split_size);
}
+/*
+ * pse1 --
+ * Page size error message 1.
+ */
static int
pse1(WT_SESSION_IMPL *session, const char *type, uint32_t max, uint32_t ovfl)
{
@@ -710,6 +718,10 @@ pse1(WT_SESSION_IMPL *session, const char *type, uint32_t max, uint32_t ovfl)
type, max, ovfl);
}
+/*
+ * pse2 --
+ * Page size error message 2.
+ */
static int
pse2(WT_SESSION_IMPL *session,
const char *type, uint32_t max, uint32_t ovfl, int pct)
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index fff5eadb9b9..40596a889d5 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -16,7 +16,7 @@ static int __inmem_row_leaf_entries(
WT_SESSION_IMPL *, WT_PAGE_HEADER *, uint32_t *);
/*
- * __wt_page_in --
+ * __wt_page_in_func --
* Acquire a hazard pointer to a page; if the page is not in-memory,
* read it from the disk and build an in-memory version.
*/
diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c
index b502d8d3409..a43ff27fa84 100644
--- a/src/btree/bt_ret.c
+++ b/src/btree/bt_ret.c
@@ -8,6 +8,42 @@
#include "wt_internal.h"
/*
+ * __wt_row_key_get --
+ * Get a reference to the current key.
+ */
+int
+__wt_row_key_get(WT_CURSOR_BTREE *cbt, WT_ITEM *key)
+{
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->page;
+
+ switch (page->type) {
+ case WT_PAGE_ROW_LEAF:
+ rip = &page->u.row.d[cbt->slot];
+
+ /*
+ * If the cursor references a WT_INSERT item, take the key from
+ * there. Otherwise, take the key from the original page, and
+ * the value from any related WT_UPDATE item, or the page if
+ * the key was never updated.
+ */
+ if (cbt->ins != NULL) {
+ key->data = WT_INSERT_KEY(cbt->ins);
+ key->size = WT_INSERT_KEY_SIZE(cbt->ins);
+ } else
+ WT_RET(__wt_row_leaf_key(session, page, rip, key, 1));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
* __wt_kv_return --
* Return a page referenced key/value pair to the application.
*/
@@ -78,10 +114,10 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
* original page, and the value from any related WT_UPDATE item,
* or the page if the key was never updated.
*/
- if (cbt->ins != NULL &&
- (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) {
+ if (cbt->ins != NULL) {
cursor->key.data = WT_INSERT_KEY(cbt->ins);
cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
+ upd = __wt_txn_read(session, cbt->ins->upd);
} else {
WT_RET(__wt_row_leaf_key(
session, page, rip, &cursor->key, 0));
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index 1ce32d86cb7..07a5b0c827c 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -2218,7 +2218,7 @@ __slvg_ovfl_discard(WT_SESSION_IMPL *session, WT_STUFF *ss)
}
/*
- * __slvg_free --
+ * __slvg_cleanup --
* Discard memory allocated to the page and overflow arrays.
*/
static int
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index 98c3cc9c3ef..69d4583b4db 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -8,7 +8,7 @@
#include "wt_internal.h"
/*
- * __tree_walk_delete_rollback --
+ * __wt_tree_walk_delete_rollback --
* Abort pages that were deleted without being instantiated.
*/
void
diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c
index 0e157e80bf8..e88b3d23a47 100644
--- a/src/btree/col_modify.c
+++ b/src/btree/col_modify.c
@@ -78,7 +78,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
/* Allocate the WT_UPDATE structure and transaction ID. */
WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
- WT_ERR(__wt_txn_modify(session, &upd->txnid));
+ WT_ERR(__wt_txn_modify(session, cbt, upd));
logged = 1;
/*
@@ -121,8 +121,6 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
WT_ERR(__col_insert_alloc(
session, recno, skipdepth, &ins, &ins_size));
WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
- WT_ERR(__wt_txn_modify(session, &upd->txnid));
- logged = 1;
ins->upd = upd;
ins_size += upd_size;
@@ -132,6 +130,8 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
*/
cbt->ins_head = ins_head;
cbt->ins = ins;
+ WT_ERR(__wt_txn_modify(session, cbt, upd));
+ logged = 1;
/*
* If there was no insert list during the search, or there was
diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c
index 54409493a30..e616faa1e7f 100644
--- a/src/btree/rec_evict.c
+++ b/src/btree/rec_evict.c
@@ -137,7 +137,7 @@ done: session->excl_next = 0;
}
/*
- * __rec_root_update --
+ * __rec_root_update --
* Update a root page's reference on eviction (clean or dirty).
*/
static void
@@ -147,7 +147,7 @@ __rec_root_update(WT_SESSION_IMPL *session)
}
/*
- * __rec_page_clean_update --
+ * __rec_page_clean_update --
* Update a clean page's reference on eviction.
*/
static void
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index 5be4899f1c6..1e04d726588 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -2136,8 +2136,12 @@ __wt_rec_row_bulk_insert(WT_CURSOR_BULK *cbulk)
#define WT_FIX_ENTRIES(btree, bytes) (((bytes) * 8) / (btree)->bitcnt)
+/*
+ * __rec_col_fix_bulk_insert_split_check --
+ * Check if a bulk-loaded fixed-length column store page needs to split.
+ */
static inline int
-__rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk)
+__rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk)
{
WT_BTREE *btree;
WT_RECONCILE *r;
@@ -3778,7 +3782,7 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
- * __rec_write_wrapup --
+ * __rec_write_wrapup --
* Finish the reconciliation.
*/
static int
@@ -4025,7 +4029,7 @@ err: __wt_scr_free(&tkey);
}
/*
- * __rec_write_wrapup_err --
+ * __rec_write_wrapup_err --
* Finish the reconciliation on error.
*/
static int
diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c
index 3d05d7c7a49..8e791fe59f2 100644
--- a/src/btree/row_modify.c
+++ b/src/btree/row_modify.c
@@ -62,7 +62,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
/* Allocate the WT_UPDATE structure and transaction ID. */
WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
- WT_ERR(__wt_txn_modify(session, &upd->txnid));
+ WT_ERR(__wt_txn_modify(session, cbt, upd));
logged = 1;
/*
@@ -108,8 +108,6 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
WT_ERR(__wt_row_insert_alloc(
session, key, skipdepth, &ins, &ins_size));
WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
- WT_ERR(__wt_txn_modify(session, &upd->txnid));
- logged = 1;
ins->upd = upd;
ins_size += upd_size;
@@ -119,6 +117,8 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
*/
cbt->ins_head = ins_head;
cbt->ins = ins;
+ WT_ERR(__wt_txn_modify(session, cbt, upd));
+ logged = 1;
/*
* If there was no insert list during the search, the cursor's
@@ -155,6 +155,7 @@ err: /*
if (logged)
__wt_txn_unmodify(session);
__wt_free(session, ins);
+ cbt->ins = NULL;
__wt_free(session, upd);
}
@@ -282,7 +283,7 @@ __wt_update_obsolete_free(
}
/*
- * __wt_page_obsolete --
+ * __wt_row_leaf_obsolete --
* Discard all obsolete updates on a row-store leaf page.
*/
void
diff --git a/src/config/config.c b/src/config/config.c
index 5bfb1646288..acff9c29e28 100644
--- a/src/config/config.c
+++ b/src/config/config.c
@@ -486,7 +486,7 @@ __config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
} while (0)
/*
- * __config_process_value
+ * __config_process_value --
* Deal with special config values like true / false.
*/
static int
diff --git a/src/config/config_check.c b/src/config/config_check.c
index 08d65e72d18..d613e3c734e 100644
--- a/src/config/config_check.c
+++ b/src/config/config_check.c
@@ -210,9 +210,9 @@ err: if (entry != NULL) {
}
/*
- * __wt_config_check--
+ * __wt_config_check --
* Check the keys in an application-supplied config string match what is
- * specified in an array of check strings.
+ * specified in an array of check strings.
*/
int
__wt_config_check(WT_SESSION_IMPL *session,
diff --git a/src/config/config_def.c b/src/config/config_def.c
index f089d4592da..79dce2fa7b7 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -46,8 +46,8 @@ static const WT_CONFIG_CHECK confchk_connection_reconfigure[] = {
{ "verbose", "list",
"choices=[\"block\",\"ckpt\",\"compact\",\"evict\","
"\"evictserver\",\"fileops\",\"hazard\",\"log\",\"lsm\",\"mutex\""
- ",\"overflow\",\"read\",\"readserver\",\"reconcile\",\"salvage\","
- "\"shared_cache\",\"verify\",\"version\",\"write\"]",
+ ",\"overflow\",\"read\",\"readserver\",\"reconcile\",\"recovery\""
+ ",\"salvage\",\"shared_cache\",\"verify\",\"version\",\"write\"]",
NULL},
{ NULL, NULL, NULL, NULL }
};
@@ -60,6 +60,7 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = {
{ "block_compressor", "string", NULL, NULL},
{ "cache_resident", "boolean", NULL, NULL},
{ "checkpoint", "string", NULL, NULL},
+ { "checkpoint_lsn", "string", NULL, NULL},
{ "checksum", "string",
"choices=[\"on\",\"off\",\"uncompressed\"]",
NULL},
@@ -69,6 +70,7 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = {
{ "format", "string", "choices=[\"btree\"]", NULL},
{ "huffman_key", "string", NULL, NULL},
{ "huffman_value", "string", NULL, NULL},
+ { "id", "string", NULL, NULL},
{ "internal_item_max", "int", "min=0", NULL},
{ "internal_key_truncate", "boolean", NULL, NULL},
{ "internal_page_max", "int", "min=512B,max=512MB", NULL},
@@ -216,7 +218,7 @@ static const WT_CONFIG_CHECK confchk_checkpoint_subconfigs[] = {
static const WT_CONFIG_CHECK confchk_log_subconfigs[] = {
{ "archive", "boolean", NULL, NULL },
{ "enabled", "boolean", NULL, NULL },
- { "file_max", "int", "min=1MB,max=2GB", NULL },
+ { "file_max", "int", "min=100KB,max=2GB", NULL },
{ "path", "string", NULL, NULL },
{ NULL, NULL, NULL, NULL }
};
@@ -263,8 +265,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "verbose", "list",
"choices=[\"block\",\"ckpt\",\"compact\",\"evict\","
"\"evictserver\",\"fileops\",\"hazard\",\"log\",\"lsm\",\"mutex\""
- ",\"overflow\",\"read\",\"readserver\",\"reconcile\",\"salvage\","
- "\"shared_cache\",\"verify\",\"version\",\"write\"]",
+ ",\"overflow\",\"read\",\"readserver\",\"reconcile\",\"recovery\""
+ ",\"salvage\",\"shared_cache\",\"verify\",\"version\",\"write\"]",
NULL},
{ NULL, NULL, NULL, NULL }
};
@@ -316,13 +318,14 @@ static const WT_CONFIG_ENTRY config_entries[] = {
},
{ "file.meta",
"allocation_size=4KB,block_allocation=best,block_compressor=,"
- "cache_resident=0,checkpoint=,checksum=uncompressed,collator=,"
- "columns=,dictionary=0,format=btree,huffman_key=,huffman_value=,"
- "internal_item_max=0,internal_key_truncate=,internal_page_max=4KB"
- ",key_format=u,key_gap=10,leaf_item_max=0,leaf_page_max=1MB,"
- "memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0,"
- "prefix_compression=,prefix_compression_min=4,split_pct=75,"
- "value_format=u,version=(major=0,minor=0)",
+ "cache_resident=0,checkpoint=,checkpoint_lsn=,"
+ "checksum=uncompressed,collator=,columns=,dictionary=0,"
+ "format=btree,huffman_key=,huffman_value=,id=,internal_item_max=0"
+ ",internal_key_truncate=,internal_page_max=4KB,key_format=u,"
+ "key_gap=10,leaf_item_max=0,leaf_page_max=1MB,memory_page_max=5MB"
+ ",os_cache_dirty_max=0,os_cache_max=0,prefix_compression=,"
+ "prefix_compression_min=4,split_pct=75,value_format=u,"
+ "version=(major=0,minor=0)",
confchk_file_meta
},
{ "index.meta",
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index c7db3aea0f8..97eef10ff6b 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -962,6 +962,7 @@ __conn_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
{ "overflow", WT_VERB_overflow },
{ "read", WT_VERB_read },
{ "reconcile", WT_VERB_reconcile },
+ { "recovery", WT_VERB_recovery },
{ "salvage", WT_VERB_salvage },
{ "shared_cache", WT_VERB_shared_cache },
{ "verify", WT_VERB_verify },
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index dfb290f64c4..262cea40d05 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -361,6 +361,10 @@ err: __wt_spin_unlock(NULL, &cp->cache_pool_lock);
return (ret);
}
+/*
+ * __cache_pool_assess --
+ * Assess the usage of the cache pool.
+ */
static int
__cache_pool_assess(uint64_t *phighest)
{
@@ -403,9 +407,10 @@ __cache_pool_assess(uint64_t *phighest)
}
/*
- * Adjust the allocation of cache to each connection. If force is set ignore
- * cache load information, and reduce the allocation for every connection
- * allocated more than their reserved size.
+ * __cache_pool_adjust --
+ * Adjust the allocation of cache to each connection. If force is set
+ * ignore cache load information, and reduce the allocation for every
+ * connection allocated more than their reserved size.
*/
static int
__cache_pool_adjust(uint64_t highest, uint64_t bump_threshold)
@@ -512,7 +517,7 @@ __wt_cache_pool_server(void *arg)
while (F_ISSET(cp, WT_CACHE_POOL_RUN)) {
if (cp->currently_used <= cp->size)
- WT_ERR(__wt_cond_wait(
+ WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
session, cp->cache_pool_cond, 1000000));
/*
* Re-check pool run flag - since we want to avoid getting the
diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c
index 511904a4565..ba8eed03e47 100644
--- a/src/conn/conn_ckpt.c
+++ b/src/conn/conn_ckpt.c
@@ -73,7 +73,7 @@ __ckpt_server(void *arg)
WT_ERR(wt_session->checkpoint(wt_session, conn->ckpt_config));
/* Wait... */
- WT_ERR(
+ WT_ERR_TIMEDOUT_OK(
__wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs));
}
@@ -84,7 +84,7 @@ err: __wt_err(session, ret, "checkpoint server error");
}
/*
- * __wt_checkpoint_create -
+ * __wt_checkpoint_create --
* Start the checkpoint server thread.
*/
int
@@ -120,7 +120,7 @@ __wt_checkpoint_create(WT_CONNECTION_IMPL *conn, const char *cfg[])
}
/*
- * __wt_checkpoint_destroy -
+ * __wt_checkpoint_destroy --
* Destroy the checkpoint server thread.
*/
int
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index 55f6f997a40..6bdd93f7a6b 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -577,11 +577,13 @@ __wt_conn_btree_close(WT_SESSION_IMPL *session, int locked)
if (!inuse) {
/*
* We should only close the metadata file when closing the
- * last session (i.e., the default session for the connection).
+ * last session (i.e., the default session for the connection)
+ * or at the end of recovery.
*/
WT_ASSERT(session,
S2BT(session) != session->metafile ||
- session == S2C(session)->default_session);
+ session == S2C(session)->default_session ||
+ F_ISSET(session, WT_SESSION_LOGGING_DISABLED));
if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
WT_TRET(__wt_conn_btree_sync_and_close(session));
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index 00e7e14aaca..3cd02e6143a 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -90,7 +90,8 @@ __log_archive_server(void *arg)
* it gets turned back on and check again.
*/
if (conn->archive == 0) {
- WT_ERR(__wt_cond_wait(session, conn->arch_cond, 0));
+ WT_ERR_TIMEDOUT_OK(
+ __wt_cond_wait(session, conn->arch_cond, 0));
continue;
}
@@ -132,7 +133,8 @@ __log_archive_server(void *arg)
log->first_lsn.offset = 0;
/* Wait until the next event. */
- WT_ERR(__wt_cond_wait(session, conn->arch_cond, 0));
+ WT_ERR_TIMEDOUT_OK(
+ __wt_cond_wait(session, conn->arch_cond, 0));
}
if (0) {
@@ -180,8 +182,11 @@ __wt_logmgr_create(WT_CONNECTION_IMPL *conn, const char *cfg[])
INIT_LSN(&log->ckpt_lsn);
INIT_LSN(&log->first_lsn);
INIT_LSN(&log->sync_lsn);
+ INIT_LSN(&log->trunc_lsn);
INIT_LSN(&log->write_lsn);
log->fileid = 0;
+ WT_RET(__wt_cond_alloc(session,
+ "log release", 0, &log->log_release_cond));
WT_RET(__wt_log_open(session));
WT_RET(__wt_log_slot_init(session));
@@ -249,6 +254,9 @@ __wt_logmgr_destroy(WT_CONNECTION_IMPL *conn)
WT_TRET(wt_session->close(wt_session, NULL));
conn->arch_session = NULL;
}
+
+ WT_TRET(__wt_log_slot_destroy(session));
+ WT_TRET(__wt_cond_destroy(session, &conn->log->log_release_cond));
__wt_spin_destroy(session, &conn->log->log_lock);
__wt_spin_destroy(session, &conn->log->log_slot_lock);
__wt_free(session, conn->log);
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index 81f334143a8..220dea45edc 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -77,7 +77,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
* exit before files are closed.
*/
F_CLR(conn, WT_CONN_SERVER_RUN);
- WT_TRET(__wt_logmgr_destroy(conn));
WT_TRET(__wt_checkpoint_destroy(conn));
WT_TRET(__wt_statlog_destroy(conn));
@@ -87,6 +86,17 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
/* Close open data handles. */
WT_TRET(__wt_conn_dhandle_discard(conn));
+ /*
+ * Now that all data handles are closed, tell logging that a checkpoint
+ * has completed then shut down the log manager (only after closing
+ * data handles).
+ */
+ if (conn->logging) {
+ WT_TRET(__wt_txn_checkpoint_log(
+ session, 1, WT_TXN_LOG_CKPT_STOP, NULL));
+ WT_TRET(__wt_logmgr_destroy(conn));
+ }
+
/* Free memory for collators */
while ((ncoll = TAILQ_FIRST(&conn->collqh)) != NULL)
WT_TRET(__wt_conn_remove_collator(conn, ncoll));
diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c
index f26d1514e2b..35fddcfeaf0 100644
--- a/src/conn/conn_stat.c
+++ b/src/conn/conn_stat.c
@@ -272,7 +272,7 @@ __statlog_server(void *arg)
* statistics and check again.
*/
if (conn->stat_all == 0 && conn->stat_fast == 0) {
- WT_ERR(__wt_cond_wait(
+ WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
session, conn->stat_cond, conn->stat_usecs));
continue;
}
@@ -341,7 +341,7 @@ __statlog_server(void *arg)
WT_ERR(fflush(fp) == 0 ? 0 : __wt_errno());
/* Wait until the next event. */
- WT_ERR(
+ WT_ERR_TIMEDOUT_OK(
__wt_cond_wait(session, conn->stat_cond, conn->stat_usecs));
}
diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c
index 56691d88ba3..22ae7bf91ae 100644
--- a/src/cursor/cur_backup.c
+++ b/src/cursor/cur_backup.c
@@ -214,9 +214,10 @@ __backup_start(
WT_ERR(__backup_list_append(session, cb, WT_METADATA_BACKUP));
WT_ERR(__backup_list_append(session, cb, WT_SINGLETHREAD));
- /* Add log files if logging is on. */
- if (conn->log) {
- WT_ERR(__wt_log_getfiles(session, &logfiles, &logcount));
+ /* Add log files if logging is on and we're doing a full backup. */
+ if (!target_list && conn->log) {
+ WT_ERR(
+ __wt_log_get_active_files(session, &logfiles, &logcount));
for (i = 0; i < logcount; i++)
WT_ERR(__backup_list_append(session, cb, logfiles[i]));
}
diff --git a/src/cursor/cur_ds.c b/src/cursor/cur_ds.c
index 9e849e664e9..dc8e1618ccb 100644
--- a/src/cursor/cur_ds.c
+++ b/src/cursor/cur_ds.c
@@ -36,7 +36,7 @@ __curds_txn_leave(WT_SESSION_IMPL *session)
}
/*
- * __curds_key_set -
+ * __curds_key_set --
* Set the key for the data-source.
*/
static int
@@ -57,7 +57,7 @@ err: return (ret);
}
/*
- * __curds_value_set -
+ * __curds_value_set --
* Set the value for the data-source.
*/
static int
@@ -77,7 +77,7 @@ err: return (ret);
}
/*
- * __curds_cursor_resolve -
+ * __curds_cursor_resolve --
* Resolve cursor operation.
*/
static int
diff --git a/src/cursor/cur_dump.c b/src/cursor/cur_dump.c
index 7883996f7cb..f89cad2ffd9 100644
--- a/src/cursor/cur_dump.c
+++ b/src/cursor/cur_dump.c
@@ -247,6 +247,10 @@ WT_CURDUMP_PASS(prev)
WT_CURDUMP_PASS(reset)
WT_CURDUMP_PASS(search)
+/*
+ * __curdump_search_near --
+ * WT_CURSOR::search_near for dump cursors.
+ */
static int
__curdump_search_near(WT_CURSOR *cursor, int *exact)
{
@@ -260,6 +264,10 @@ WT_CURDUMP_PASS(insert)
WT_CURDUMP_PASS(update)
WT_CURDUMP_PASS(remove)
+/*
+ * __curdump_close --
+ * WT_CURSOR::close for dump cursors.
+ */
static int
__curdump_close(WT_CURSOR *cursor)
{
diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c
index 34987898ccf..b6b597816a2 100644
--- a/src/cursor/cur_index.c
+++ b/src/cursor/cur_index.c
@@ -303,6 +303,10 @@ err: API_END(session);
return (ret);
}
+/*
+ * __curindex_open_colgroups --
+ * Open cursors on the column groups required for an index cursor.
+ */
static int
__curindex_open_colgroups(
WT_SESSION_IMPL *session, WT_CURSOR_INDEX *cindex, const char *cfg_arg[])
diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c
index efb22e58174..3f85e65b353 100644
--- a/src/cursor/cur_std.c
+++ b/src/cursor/cur_std.c
@@ -19,7 +19,7 @@ __wt_cursor_notsup(WT_CURSOR *cursor)
return (ENOTSUP);
}
-/*
+/*
* __wt_cursor_noop --
* Cursor noop.
*/
@@ -142,6 +142,44 @@ __wt_cursor_set_raw_key(WT_CURSOR *cursor, WT_ITEM *key)
}
/*
+ * __wt_cursor_get_raw_value --
+ * Temporarily force raw mode in a cursor to get a canonical copy of
+ * the value.
+ */
+int
+__wt_cursor_get_raw_value(WT_CURSOR *cursor, WT_ITEM *value)
+{
+ WT_DECL_RET;
+ int raw_set;
+
+ raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0;
+ if (!raw_set)
+ F_SET(cursor, WT_CURSTD_RAW);
+ ret = cursor->get_value(cursor, value);
+ if (!raw_set)
+ F_CLR(cursor, WT_CURSTD_RAW);
+ return (ret);
+}
+
+/*
+ * __wt_cursor_set_raw_value --
+ * Temporarily force raw mode in a cursor to set a canonical copy of
+ * the value.
+ */
+void
+__wt_cursor_set_raw_value(WT_CURSOR *cursor, WT_ITEM *value)
+{
+ int raw_set;
+
+ raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0;
+ if (!raw_set)
+ F_SET(cursor, WT_CURSTD_RAW);
+ cursor->set_value(cursor, value);
+ if (!raw_set)
+ F_CLR(cursor, WT_CURSTD_RAW);
+}
+
+/*
* __wt_cursor_get_keyv --
* WT_CURSOR->get_key worker function.
*/
diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c
index 33eb7df53d7..86337a568b7 100644
--- a/src/cursor/cur_table.c
+++ b/src/cursor/cur_table.c
@@ -36,6 +36,7 @@ static int __curtable_update(WT_CURSOR *cursor);
F_SET(*__cp, WT_CURSTD_KEY_EXT | \
WT_CURSTD_VALUE_EXT); \
WT_ERR((*__cp)->f(*__cp)); \
+ WT_ERR((*__cp)->reset(*__cp)); \
} \
} while (0)
@@ -602,6 +603,10 @@ err: API_END(session);
return (ret);
}
+/*
+ * __curtable_open_colgroups --
+ * Open cursors on column groups for a table cursor.
+ */
static int
__curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[])
{
@@ -638,6 +643,10 @@ __curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[])
return (0);
}
+/*
+ * __curtable_open_indices --
+ * Open cursors on indices for a table cursor.
+ */
static int
__curtable_open_indices(WT_CURSOR_TABLE *ctable)
{
diff --git a/src/docs/Doxyfile b/src/docs/Doxyfile
index fefc925922a..5f7e5016892 100644
--- a/src/docs/Doxyfile
+++ b/src/docs/Doxyfile
@@ -1584,7 +1584,9 @@ PREDEFINED = DOXYGEN \
__wt_extension_api:=WT_EXTENSION_API \
__wt_extractor:=WT_EXTRACTOR \
__wt_item:=WT_ITEM \
- __wt_session:=WT_SESSION
+ __wt_session:=WT_SESSION \
+ WT_HANDLE_CLOSED(x):=x \
+ WT_HANDLE_NULLABLE(x):=x
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
# this tag can be used to specify a list of macro names that should be expanded.
diff --git a/src/include/api.h b/src/include/api.h
index 17624c60244..dacb583e596 100644
--- a/src/include/api.h
+++ b/src/include/api.h
@@ -95,6 +95,9 @@
#define SESSION_API_CALL(s, n, config, cfg) \
API_CALL(s, session, n, NULL, NULL, config, cfg)
+#define SESSION_API_CALL_NO_CONF(s, n) \
+ API_CALL_NOCONF(s, session, n, NULL, NULL)
+
#define SESSION_TXN_API_CALL(s, n, config, cfg) \
TXN_API_CALL(s, session, n, NULL, NULL, config, cfg)
diff --git a/src/include/btree.h b/src/include/btree.h
index 686bbd14d3d..4958c88716a 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -66,6 +66,8 @@ struct __wt_btree {
/* Row-store comparison function */
WT_COLLATOR *collator; /* Comparison function */
+ uint32_t id; /* File ID, for logging */
+
uint32_t key_gap; /* Row-store prefix key gap */
uint32_t allocsize; /* Allocation size */
diff --git a/src/include/connection.h b/src/include/connection.h
index b81f3f470ab..d94144ddb05 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -129,7 +129,7 @@ struct __wt_connection_impl {
TAILQ_HEAD(__wt_block_qh, __wt_block) blockqh;
u_int open_btree_count; /* Locked: open writable btree count */
- u_int next_file_id; /* Locked: file ID counter */
+ uint32_t next_file_id; /* Locked: file ID counter */
/*
* WiredTiger allocates space for 50 simultaneous sessions (threads of
@@ -225,6 +225,8 @@ struct __wt_connection_impl {
/* Locked: data source list */
TAILQ_HEAD(__wt_dsrc_qh, __wt_named_data_source) dsrcqh;
+ void *lang_private; /* Language specific private storage */
+
/* If non-zero, all buffers used for I/O will be aligned to this. */
size_t buffer_alignment;
diff --git a/src/include/cursor.h b/src/include/cursor.h
index 0fe5bbeed4e..64b7451a9fc 100644
--- a/src/include/cursor.h
+++ b/src/include/cursor.h
@@ -45,6 +45,7 @@
{ NULL, NULL }, /* TAILQ_ENTRY q */ \
0, /* recno key */ \
{ 0 }, /* recno raw buffer */ \
+ NULL, /* lang_private */ \
{ NULL, 0, 0, NULL, 0 }, /* WT_ITEM key */ \
{ NULL, 0, 0, NULL, 0 }, /* WT_ITEM value */ \
0, /* int saved_err */ \
diff --git a/src/include/error.h b/src/include/error.h
index 6497a92d331..d1251d0fa1a 100644
--- a/src/include/error.h
+++ b/src/include/error.h
@@ -48,6 +48,14 @@
goto err; \
} \
} while (0)
+#define WT_ERR_TIMEDOUT_OK(a) do { \
+ if ((ret = (a)) != 0) { \
+ if (ret == ETIMEDOUT) \
+ ret = 0; \
+ else \
+ goto err; \
+ } \
+} while (0)
#define WT_ERR_TEST(a, v) do { \
if (a) { \
ret = (v); \
@@ -75,6 +83,11 @@
if ((__ret = (a)) != 0 && __ret != WT_NOTFOUND) \
return (__ret); \
} while (0)
+#define WT_RET_TIMEDOUT_OK(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && __ret != ETIMEDOUT) \
+ return (__ret); \
+} while (0)
/* Set "ret" if not already set. */
#define WT_TRET(a) do { \
diff --git a/src/include/extern.h b/src/include/extern.h
index 101be45994a..6923661d3cb 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -351,6 +351,7 @@ extern int __wt_page_inmem( WT_SESSION_IMPL *session,
extern int __wt_cache_read(WT_SESSION_IMPL *session,
WT_PAGE *parent,
WT_REF *ref);
+extern int __wt_row_key_get(WT_CURSOR_BTREE *cbt, WT_ITEM *key);
extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
extern int __wt_bt_salvage(WT_SESSION_IMPL *session,
WT_CKPT *ckptbase,
@@ -653,6 +654,8 @@ extern int __wt_cursor_get_key(WT_CURSOR *cursor, ...);
extern void __wt_cursor_set_key(WT_CURSOR *cursor, ...);
extern int __wt_cursor_get_raw_key(WT_CURSOR *cursor, WT_ITEM *key);
extern void __wt_cursor_set_raw_key(WT_CURSOR *cursor, WT_ITEM *key);
+extern int __wt_cursor_get_raw_value(WT_CURSOR *cursor, WT_ITEM *value);
+extern void __wt_cursor_set_raw_value(WT_CURSOR *cursor, WT_ITEM *value);
extern int __wt_cursor_get_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap);
extern void __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap);
extern int __wt_cursor_get_value(WT_CURSOR *cursor, ...);
@@ -674,9 +677,13 @@ extern int __wt_curtable_open(WT_SESSION_IMPL *session,
const char *uri,
const char *cfg[],
WT_CURSOR **cursorp);
-extern int __wt_log_getfiles(WT_SESSION_IMPL *session,
- char ***files,
- u_int *count);
+extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
+extern int __wt_log_get_files(WT_SESSION_IMPL *session,
+ char ***filesp,
+ u_int *countp);
+extern int __wt_log_get_active_files( WT_SESSION_IMPL *session,
+ char ***filesp,
+ u_int *countp);
extern void __wt_log_files_free(WT_SESSION_IMPL *session,
char **files,
u_int count);
@@ -709,16 +716,124 @@ extern int __wt_log_write(WT_SESSION_IMPL *session,
extern int __wt_log_vprintf(WT_SESSION_IMPL *session,
const char *fmt,
va_list ap);
+extern int __wt_logrec_alloc(WT_SESSION_IMPL *session,
+ size_t size,
+ WT_ITEM **logrecp);
+extern void __wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp);
+extern int __wt_logrec_read(WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ uint32_t *rectypep);
+extern int __wt_logop_read(WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ uint32_t *optypep,
+ uint32_t *opsizep);
+extern int __wt_logop_col_put_pack( WT_SESSION_IMPL *session,
+ WT_ITEM *logrec,
+ uint32_t fileid,
+ uint64_t recno,
+ WT_ITEM *value);
+extern int __wt_logop_col_put_unpack( WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ uint32_t *fileidp,
+ uint64_t *recnop,
+ WT_ITEM *valuep);
+extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ FILE *out);
+extern int __wt_logop_col_remove_pack( WT_SESSION_IMPL *session,
+ WT_ITEM *logrec,
+ uint32_t fileid,
+ uint64_t recno);
+extern int __wt_logop_col_remove_unpack( WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ uint32_t *fileidp,
+ uint64_t *recnop);
+extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ FILE *out);
+extern int __wt_logop_col_truncate_pack( WT_SESSION_IMPL *session,
+ WT_ITEM *logrec,
+ uint32_t fileid,
+ uint64_t start,
+ uint64_t stop);
+extern int __wt_logop_col_truncate_unpack( WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ uint32_t *fileidp,
+ uint64_t *startp,
+ uint64_t *stopp);
+extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ FILE *out);
+extern int __wt_logop_row_put_pack( WT_SESSION_IMPL *session,
+ WT_ITEM *logrec,
+ uint32_t fileid,
+ WT_ITEM *key,
+ WT_ITEM *value);
+extern int __wt_logop_row_put_unpack( WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ uint32_t *fileidp,
+ WT_ITEM *keyp,
+ WT_ITEM *valuep);
+extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ FILE *out);
+extern int __wt_logop_row_remove_pack( WT_SESSION_IMPL *session,
+ WT_ITEM *logrec,
+ uint32_t fileid,
+ WT_ITEM *key);
+extern int __wt_logop_row_remove_unpack( WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ uint32_t *fileidp,
+ WT_ITEM *keyp);
+extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ FILE *out);
+extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session,
+ WT_ITEM *logrec,
+ uint32_t fileid,
+ WT_ITEM *start,
+ WT_ITEM *stop,
+ uint32_t mode);
+extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ uint32_t *fileidp,
+ WT_ITEM *startp,
+ WT_ITEM *stopp,
+ uint32_t *modep);
+extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ FILE *out);
+extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ FILE *out);
extern int __wt_log_slot_init(WT_SESSION_IMPL *session);
+extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session);
extern int __wt_log_slot_join(WT_SESSION_IMPL *session,
uint64_t mysize,
uint32_t flags,
WT_MYSLOT *myslotp);
extern int __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
-extern int __wt_log_slot_notify(WT_LOGSLOT *slot);
-extern int __wt_log_slot_wait(WT_LOGSLOT *slot);
+extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size);
extern int __wt_log_slot_free(WT_LOGSLOT *slot);
+extern int __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session,
+ int64_t newsize);
extern int __wt_clsm_init_merge( WT_CURSOR *cursor,
u_int start_chunk,
uint32_t start_id,
@@ -809,9 +924,10 @@ extern int __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session,
extern int __wt_meta_ckptlist_get( WT_SESSION_IMPL *session,
const char *fname,
WT_CKPT **ckptbasep);
-extern int __wt_meta_ckptlist_set( WT_SESSION_IMPL *session,
+extern int __wt_meta_ckptlist_set(WT_SESSION_IMPL *session,
const char *fname,
- WT_CKPT *ckptbase);
+ WT_CKPT *ckptbase,
+ WT_LSN *ckptlsn);
extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session,
WT_CKPT *ckptbase);
extern void __wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt);
@@ -1304,6 +1420,7 @@ extern int __wt_ispo2(uint32_t v);
extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2);
extern uint32_t __wt_random(void);
extern int __wt_buf_grow(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size);
+extern int __wt_buf_extend(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size);
extern int __wt_buf_init(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size);
extern int __wt_buf_initsize(WT_SESSION_IMPL *session,
WT_ITEM *buf,
@@ -1386,3 +1503,19 @@ extern uint64_t __wt_ext_transaction_oldest(WT_EXTENSION_API *wt_api);
extern int __wt_ext_transaction_visible( WT_EXTENSION_API *wt_api,
WT_SESSION *wt_session,
uint64_t transaction_id);
+extern void __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op);
+extern int __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_checkpoint_logread( WT_SESSION_IMPL *session,
+ const uint8_t **pp,
+ const uint8_t *end,
+ WT_LSN *ckpt_lsn);
+extern int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session,
+ int full,
+ uint32_t flags,
+ WT_LSN *lsnp);
+extern int __wt_txn_truncate_log( WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *start,
+ WT_CURSOR_BTREE *stop);
+extern int __wt_txn_truncate_end(WT_SESSION_IMPL *session);
+extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out);
+extern int __wt_txn_recover(WT_SESSION_IMPL *default_session);
diff --git a/src/include/flags.h b/src/include/flags.h
index 91c0a6a959e..cd8de157cbf 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -16,11 +16,12 @@
#define WT_LOGSCAN_FROM_CKP 0x00000004
#define WT_LOGSCAN_ONE 0x00000002
#define WT_LOGSCAN_RECOVER 0x00000001
-#define WT_LOG_CKPT 0x00000004
#define WT_LOG_DSYNC 0x00000002
#define WT_LOG_FSYNC 0x00000001
-#define WT_SESSION_CACHE_BUSY 0x00000040
-#define WT_SESSION_INTERNAL 0x00000020
+#define WT_SESSION_CACHE_BUSY 0x00000100
+#define WT_SESSION_INTERNAL 0x00000080
+#define WT_SESSION_LOGGING_DISABLED 0x00000040
+#define WT_SESSION_LOGGING_INMEM 0x00000020
#define WT_SESSION_NO_CACHE 0x00000010
#define WT_SESSION_NO_CACHE_CHECK 0x00000008
#define WT_SESSION_NO_SCHEMA_LOCK 0x00000004
@@ -40,19 +41,24 @@
#define WT_TREE_SKIP_INTL 0x00000004
#define WT_TREE_SKIP_LEAF 0x00000002
#define WT_TREE_WAIT 0x00000001
-#define WT_VERB_block 0x00020000
-#define WT_VERB_ckpt 0x00010000
-#define WT_VERB_compact 0x00008000
-#define WT_VERB_evict 0x00004000
-#define WT_VERB_evictserver 0x00002000
-#define WT_VERB_fileops 0x00001000
-#define WT_VERB_hazard 0x00000800
-#define WT_VERB_log 0x00000400
-#define WT_VERB_lsm 0x00000200
-#define WT_VERB_mutex 0x00000100
-#define WT_VERB_overflow 0x00000080
-#define WT_VERB_read 0x00000040
-#define WT_VERB_reconcile 0x00000020
+#define WT_TXN_LOG_CKPT_FAIL 0x00000008
+#define WT_TXN_LOG_CKPT_PREPARE 0x00000004
+#define WT_TXN_LOG_CKPT_START 0x00000002
+#define WT_TXN_LOG_CKPT_STOP 0x00000001
+#define WT_VERB_block 0x00040000
+#define WT_VERB_ckpt 0x00020000
+#define WT_VERB_compact 0x00010000
+#define WT_VERB_evict 0x00008000
+#define WT_VERB_evictserver 0x00004000
+#define WT_VERB_fileops 0x00002000
+#define WT_VERB_hazard 0x00001000
+#define WT_VERB_log 0x00000800
+#define WT_VERB_lsm 0x00000400
+#define WT_VERB_mutex 0x00000200
+#define WT_VERB_overflow 0x00000100
+#define WT_VERB_read 0x00000080
+#define WT_VERB_reconcile 0x00000040
+#define WT_VERB_recovery 0x00000020
#define WT_VERB_salvage 0x00000010
#define WT_VERB_shared_cache 0x00000008
#define WT_VERB_verify 0x00000004
diff --git a/src/include/log.h b/src/include/log.h
index 40ebb82958a..bb53be4484c 100644
--- a/src/include/log.h
+++ b/src/include/log.h
@@ -9,6 +9,7 @@
/* Logging subsystem declarations. */
#define LOG_ALIGN 128
+#define WT_LOG_SLOT_BUF_INIT_SIZE 64 * 1024
/*
* We rely on this structure being aligned at 64 bits by the compiler,
@@ -20,26 +21,18 @@ struct __wt_lsn {
off_t offset; /* Log file offset */
};
-typedef enum {
- WT_LOGREC_INT16,
- WT_LOGREC_UINT16,
- WT_LOGREC_INT32,
- WT_LOGREC_UINT32,
- WT_LOGREC_INT64,
- WT_LOGREC_UINT64,
- WT_LOGREC_STRING,
-} WT_LOGREC_FIELDTYPE;
-
-typedef struct {
- const char *fmt;
- const char *fields[];
-} WT_LOGREC_DESC;
-
#define INIT_LSN(l) do { \
(l)->file = 1; \
(l)->offset = 0; \
} while (0)
+#define IS_INIT_LSN(l) ((l)->file == 1 && (l)->offset == 0)
+
+#define MAX_LSN(l) do { \
+ (l)->file = UINT32_MAX; \
+ (l)->offset = INT64_MAX; \
+} while (0)
+
/*
* Compare 2 LSNs, return -1 if lsn0 < lsn1, 0 if lsn0 == lsn1
* and 1 if lsn0 > lsn1.
@@ -64,45 +57,25 @@ typedef struct {
#define WT_LOG_SLOT_PENDING 2
#define WT_LOG_SLOT_READY 3
typedef struct {
- union {
- struct {
-#undef slot_state
-#define slot_state u.slot.state
- int64_t state; /* Slot state */
-#undef slot_group_size
-#define slot_group_size u.slot.group_size
- uint64_t group_size; /* Group size */
-#undef slot_error
-#define slot_error u.slot.error
- int32_t error; /* Error value */
-#undef slot_index
-#define slot_index u.slot.index
+ int64_t slot_state; /* Slot state */
+ uint64_t slot_group_size; /* Group size */
+ int32_t slot_error; /* Error value */
#define SLOT_INVALID_INDEX 0xffffffff
- uint32_t index; /* Active slot index */
-#undef slot_start_offset
-#define slot_start_offset u.slot.start_offset
- off_t start_offset; /* Starting file offset */
-#undef slot_release_lsn
-#define slot_release_lsn u.slot.release_lsn
- WT_LSN release_lsn; /* Slot release LSN */
-#undef slot_start_lsn
-#define slot_start_lsn u.slot.start_lsn
- WT_LSN start_lsn; /* Slot starting LSN */
-#undef slot_end_lsn
-#define slot_end_lsn u.slot.end_lsn
- WT_LSN end_lsn; /* Slot ending LSN */
-#undef slot_fh
-#define slot_fh u.slot.fh
- WT_FH *fh; /* File handle for this group */
-#undef slot_flags
-#define slot_flags u.slot.flags
-#define SLOT_CLOSEFH 0x01 /* Close old fh on release */
-#define SLOT_SYNC 0x02 /* Needs sync on release */
- uint32_t flags; /* Flags */
- } slot;
- uint8_t align[LOG_ALIGN];
- } u;
-} WT_LOGSLOT;
+ uint32_t slot_index; /* Active slot index */
+ off_t slot_start_offset; /* Starting file offset */
+ WT_LSN slot_release_lsn; /* Slot release LSN */
+ WT_LSN slot_start_lsn; /* Slot starting LSN */
+ WT_LSN slot_end_lsn; /* Slot ending LSN */
+ WT_FH *slot_fh; /* File handle for this group */
+ WT_ITEM slot_buf; /* Buffer for grouped writes */
+ WT_CONDVAR *slot_done_cond; /* Signalled when write done */
+ int32_t slot_churn; /* Active slots are scarce. */
+#define SLOT_BUF_GROW 0x01 /* Grow buffer on release */
+#define SLOT_BUFFERED 0x02 /* Buffer writes */
+#define SLOT_CLOSEFH 0x04 /* Close old fh on release */
+#define SLOT_SYNC 0x08 /* Needs sync on release */
+ uint32_t flags; /* Flags */
+} WT_LOGSLOT WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
typedef struct {
WT_LOGSLOT *slot;
@@ -127,6 +100,7 @@ typedef struct {
WT_LSN ckpt_lsn; /* Last checkpoint LSN */
WT_LSN first_lsn; /* First LSN */
WT_LSN sync_lsn; /* LSN of the last sync */
+ WT_LSN trunc_lsn; /* End LSN for recovery truncation */
WT_LSN write_lsn; /* Last LSN written to log file */
/*
@@ -135,17 +109,24 @@ typedef struct {
WT_SPINLOCK log_lock; /* Locked: Logging fields */
WT_SPINLOCK log_slot_lock; /* Locked: Consolidation array */
+ /* Notify any waiting slots when write_lsn is updated. */
+ WT_CONDVAR *log_release_cond;
+
/*
* Consolidation array information
* SLOT_ACTIVE must be less than SLOT_POOL.
+ * Our testing shows that the more consolidation we generate the
+ * better the performance we see which equates to an active slot
+ * slot count of one.
*/
-#define SLOT_ACTIVE 4
+#define SLOT_ACTIVE 1
#define SLOT_POOL 16
uint32_t pool_index; /* Global pool index */
WT_LOGSLOT *slot_array[SLOT_ACTIVE]; /* Active slots */
WT_LOGSLOT slot_pool[SLOT_POOL]; /* Pool of all slots */
- uint32_t flags; /* Currently unused */
+#define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */
+ uint32_t flags;
} WT_LOG;
typedef struct {
@@ -168,3 +149,40 @@ struct __wt_log_desc {
uint16_t minorv; /* 06-07: Minor version */
uint64_t log_size; /* 08-15: Log file size */
};
+
+/*
+ * WT_LOG_REC_DESC --
+ * A descriptor for a log record type.
+ */
+struct __wt_log_rec_desc {
+ const char *fmt;
+ int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end);
+};
+
+/*
+ * WT_LOG_OP_DESC --
+ * A descriptor for a log operation type.
+ */
+struct __wt_log_op_desc {
+ const char *fmt;
+ int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end);
+};
+
+/*
+ * DO NOT EDIT: automatically built by dist/log.py.
+ * Log record declarations: BEGIN
+ */
+#define WT_LOGREC_CHECKPOINT 0
+#define WT_LOGREC_COMMIT 1
+#define WT_LOGREC_FILE_SYNC 2
+#define WT_LOGREC_MESSAGE 3
+#define WT_LOGOP_COL_PUT 0
+#define WT_LOGOP_COL_REMOVE 1
+#define WT_LOGOP_COL_TRUNCATE 2
+#define WT_LOGOP_ROW_PUT 3
+#define WT_LOGOP_ROW_REMOVE 4
+#define WT_LOGOP_ROW_TRUNCATE 5
+/*
+ * Log record declarations: END
+ * DO NOT EDIT: automatically built by dist/log.py.
+ */
diff --git a/src/include/log.i b/src/include/log.i
deleted file mode 100644
index 6cfe18fc441..00000000000
--- a/src/include/log.i
+++ /dev/null
@@ -1 +0,0 @@
-/* DO NOT EDIT: automatically built by dist/log.py. */
diff --git a/src/include/misc.h b/src/include/misc.h
index 3cb5c3e4979..dc22b4da7ba 100644
--- a/src/include/misc.h
+++ b/src/include/misc.h
@@ -157,6 +157,12 @@
#define WT_STRING_MATCH(str, bytes, len) \
(strncmp(str, bytes, len) == 0 && (str)[(len)] == '\0')
+/*
+ * Macro that produces a string literal that isn't wrapped in quotes, to avoid
+ * tripping up spell checkers.
+ */
+#define WT_UNCHECKED_STRING(str) #str
+
/* Function return value and scratch buffer declaration and initialization. */
#define WT_DECL_ITEM(i) WT_ITEM *i = NULL
#define WT_DECL_RET int ret = 0
diff --git a/src/include/session.h b/src/include/session.h
index 4f0477193f1..7ceb3fdda5f 100644
--- a/src/include/session.h
+++ b/src/include/session.h
@@ -96,6 +96,7 @@ struct __wt_session_impl {
WT_TXN_ISOLATION isolation;
WT_TXN txn; /* Transaction state */
u_int ncursors; /* Count of active file cursors. */
+ void *lang_private; /* Language specific private storage */
WT_REF **excl; /* Eviction exclusive list */
u_int excl_next; /* Next empty slot */
diff --git a/src/include/stat.h b/src/include/stat.h
index 94a6e869be9..fa3109cc56d 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -162,6 +162,8 @@ struct __wt_connection_stats {
WT_STATS dh_sweep_evict;
WT_STATS dh_sweeps;
WT_STATS file_open;
+ WT_STATS log_buffer_grow;
+ WT_STATS log_buffer_size;
WT_STATS log_bytes_user;
WT_STATS log_bytes_written;
WT_STATS log_max_filesize;
@@ -173,7 +175,11 @@ struct __wt_connection_stats {
WT_STATS log_slot_consolidated;
WT_STATS log_slot_joins;
WT_STATS log_slot_races;
+ WT_STATS log_slot_ready_wait_timeout;
+ WT_STATS log_slot_release_wait_timeout;
+ WT_STATS log_slot_switch_fails;
WT_STATS log_slot_toobig;
+ WT_STATS log_slot_toosmall;
WT_STATS log_slot_transitions;
WT_STATS log_sync;
WT_STATS log_writes;
diff --git a/src/include/txn.h b/src/include/txn.h
index 28842f772f6..6a9dcd4abd5 100644
--- a/src/include/txn.h
+++ b/src/include/txn.h
@@ -51,6 +51,51 @@ enum __wt_txn_isolation {
TXN_ISO_SNAPSHOT
};
+/*
+ * WT_TXN_OP --
+ * A transactional operation. Each transaction builds an in-memory array
+ * of these operations as it runs, then uses the array to either write log
+ * records during commit or undo the operations during rollback.
+ */
+struct __wt_txn_op {
+ uint32_t fileid;
+ enum {
+ TXN_OP_BASIC,
+ TXN_OP_INMEM,
+ TXN_OP_REF,
+ TXN_OP_TRUNCATE_COL,
+ TXN_OP_TRUNCATE_ROW
+ } type;
+ union {
+ /* TXN_OP_BASIC, TXN_OP_INMEM */
+ struct {
+ WT_INSERT *ins;
+ WT_UPDATE *upd;
+ WT_ITEM key;
+ } op;
+ /* TXN_OP_REF */
+ WT_REF *ref;
+ /* TXN_OP_TRUNCATE_COL */
+ struct {
+ uint64_t start, stop;
+ } truncate_col;
+ /* TXN_OP_TRUNCATE_ROW */
+ struct {
+ WT_ITEM start, stop;
+ enum {
+ TXN_TRUNC_ALL,
+ TXN_TRUNC_BOTH,
+ TXN_TRUNC_START,
+ TXN_TRUNC_STOP
+ } mode;
+ } truncate_row;
+ } u;
+};
+
+/*
+ * WT_TXN --
+ * Per-session transaction context.
+ */
struct __wt_txn {
uint64_t id;
@@ -66,21 +111,20 @@ struct __wt_txn {
uint64_t *snapshot;
uint32_t snapshot_count;
- /*
- * Arrays of txn IDs in WT_UPDATE or WT_REF structures created or
- * modified by this transaction.
- */
- uint64_t **mod;
+ /* Array of modifications by this transaction. */
+ WT_TXN_OP *mod;
size_t mod_alloc;
u_int mod_count;
- WT_REF **modref;
- size_t modref_alloc;
- u_int modref_count;
-
/* Requested notification when transactions are resolved. */
WT_TXN_NOTIFY *notify;
+ /* Checkpoint status. */
+ WT_LSN ckpt_lsn;
+ int full_ckpt;
+ uint32_t ckpt_nsnapshot;
+ WT_ITEM *ckpt_snapshot;
+
#define TXN_AUTOCOMMIT 0x01
#define TXN_ERROR 0x02
#define TXN_OLDEST 0x04
diff --git a/src/include/txn.i b/src/include/txn.i
index ed02760e24d..f8ace3c1dd7 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -5,41 +5,25 @@
* See the file LICENSE for redistribution information.
*/
+static inline void __wt_txn_read_first(WT_SESSION_IMPL *session);
+static inline void __wt_txn_read_last(WT_SESSION_IMPL *session);
+
/*
* __wt_txn_modify --
* Mark a WT_UPDATE object modified by the current transaction.
*/
static inline int
-__wt_txn_modify(WT_SESSION_IMPL *session, uint64_t *id)
-{
- WT_TXN *txn;
-
- txn = &session->txn;
- WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING));
- WT_RET(__wt_realloc_def(
- session, &txn->mod_alloc, txn->mod_count + 1, &txn->mod));
-
- txn->mod[txn->mod_count++] = id;
- *id = txn->id;
- return (0);
-}
-
-/*
- * __wt_txn_modify_ref --
- * Mark a WT_REF object modified by the current transaction.
- */
-static inline int
-__wt_txn_modify_ref(WT_SESSION_IMPL *session, WT_REF *ref)
+__txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp)
{
WT_TXN *txn;
txn = &session->txn;
WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING));
- WT_RET(__wt_realloc_def(
- session, &txn->modref_alloc, txn->modref_count + 1, &txn->modref));
+ WT_RET(__wt_realloc_def(session, &txn->mod_alloc,
+ txn->mod_count + 1, &txn->mod));
- txn->modref[txn->modref_count++] = ref;
- ref->txnid = txn->id;
+ *opp = &txn->mod[txn->mod_count++];
+ WT_CLEAR(**opp);
return (0);
}
@@ -62,6 +46,48 @@ __wt_txn_unmodify(WT_SESSION_IMPL *session)
}
/*
+ * __wt_txn_modify --
+ * Mark a WT_UPDATE object modified by the current transaction.
+ */
+static inline int
+__wt_txn_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ WT_DECL_RET;
+ WT_TXN_OP *op;
+
+ WT_RET(__txn_next_op(session, &op));
+ op->type = F_ISSET(session, WT_SESSION_LOGGING_INMEM) ?
+ TXN_OP_INMEM : TXN_OP_BASIC;
+ /* If we are logging, we need a reference to the key. */
+ if (cbt->btree->type == BTREE_ROW && S2C(session)->logging)
+ WT_ERR(__wt_row_key_get(cbt, &op->u.op.key));
+ op->u.op.ins = cbt->ins;
+ op->u.op.upd = upd;
+ op->fileid = S2BT(session)->id;
+ upd->txnid = session->txn.id;
+ if (0) {
+err: __wt_txn_unmodify(session);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_txn_modify_ref --
+ * Mark a WT_REF object modified by the current transaction.
+ */
+static inline int
+__wt_txn_modify_ref(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_TXN_OP *op;
+
+ WT_RET(__txn_next_op(session, &op));
+ op->type = TXN_OP_REF;
+ op->u.ref = ref;
+ ref->txnid = session->txn.id;
+ return (0);
+}
+
+/*
* __wt_txn_visible_all --
* Check if a given transaction ID is "globally visible". This is, if
* all sessions in the system will see the transaction ID.
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index e64d0bcab68..3ee79c5f125 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -80,6 +80,18 @@ struct __wt_extractor; typedef struct __wt_extractor WT_EXTRACTOR;
struct __wt_item; typedef struct __wt_item WT_ITEM;
struct __wt_session; typedef struct __wt_session WT_SESSION;
+#if defined(SWIGJAVA)
+#define WT_HANDLE_NULLABLE(typename) typename##_NULLABLE
+#define WT_HANDLE_CLOSED(typename) typename##_CLOSED
+typedef WT_CURSOR WT_CURSOR_NULLABLE;
+typedef WT_CURSOR WT_CURSOR_CLOSED;
+typedef WT_SESSION WT_SESSION_CLOSED;
+typedef WT_CONNECTION WT_CONNECTION_CLOSED;
+#elif !defined(DOXYGEN)
+#define WT_HANDLE_NULLABLE(typename) typename
+#define WT_HANDLE_CLOSED(typename) typename
+#endif
+
/*!
* A raw item of data to be managed. Data items have a pointer to the data and
* a length (limited to 4GB for items stored in tables). WT_ITEM structures do
@@ -459,7 +471,7 @@ struct __wt_cursor {
* @param cursor the cursor handle
* @errors
*/
- int __F(close)(WT_CURSOR *cursor);
+ int __F(close)(WT_HANDLE_CLOSED(WT_CURSOR) *cursor);
/*
* Protected fields, only to be used by cursor implementations.
@@ -478,6 +490,8 @@ struct __wt_cursor {
uint64_t recno; /* Record number, normal and raw mode */
uint8_t raw_recno_buf[WT_INTPACK64_MAXSIZE];
+ void *lang_private; /* Language specific private storage */
+
WT_ITEM key, value;
int saved_err; /* Saved error in set_{key,value}. */
@@ -523,7 +537,8 @@ struct __wt_session {
* @configempty{session.close, see dist/api_data.py}
* @errors
*/
- int __F(close)(WT_SESSION *session, const char *config);
+ int __F(close)(WT_HANDLE_CLOSED(WT_SESSION) *session,
+ const char *config);
/*!
* Reconfigure a session handle.
@@ -689,7 +704,7 @@ struct __wt_session {
* @errors
*/
int __F(open_cursor)(WT_SESSION *session,
- const char *uri, WT_CURSOR *to_dup,
+ const char *uri, WT_HANDLE_NULLABLE(WT_CURSOR) *to_dup,
const char *config, WT_CURSOR **cursorp);
/*! @} */
@@ -975,7 +990,9 @@ struct __wt_session {
*/
int __F(truncate)(WT_SESSION *session,
const char *name,
- WT_CURSOR *start, WT_CURSOR *stop, const char *config);
+ WT_HANDLE_NULLABLE(WT_CURSOR) *start,
+ WT_HANDLE_NULLABLE(WT_CURSOR) *stop,
+ const char *config);
/*!
* Upgrade a file or table.
@@ -1148,7 +1165,8 @@ struct __wt_connection {
* @configempty{connection.close, see dist/api_data.py}
* @errors
*/
- int __F(close)(WT_CONNECTION *connection, const char *config);
+ int __F(close)(WT_HANDLE_CLOSED(WT_CONNECTION) *connection,
+ const char *config);
/*!
* Reconfigure a connection handle.
@@ -1209,9 +1227,9 @@ struct __wt_connection {
* chosen from the following options: \c "block"\, \c "ckpt"\, \c
* "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\, \c
* "hazard"\, \c "log"\, \c "lsm"\, \c "mutex"\, \c "overflow"\, \c
- * "read"\, \c "readserver"\, \c "reconcile"\, \c "salvage"\, \c
- * "shared_cache"\, \c "verify"\, \c "version"\, \c "write"; default
- * empty.}
+ * "read"\, \c "readserver"\, \c "reconcile"\, \c "recovery"\, \c
+ * "salvage"\, \c "shared_cache"\, \c "verify"\, \c "version"\, \c
+ * "write"; default empty.}
* @configend
* @errors
*/
@@ -1464,7 +1482,7 @@ struct __wt_connection {
* @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, enable logging subsystem., a boolean
* flag; default \c false.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;file_max, the
- * maximum size of the log file., an integer between 1MB and 2GB; default \c
+ * maximum size of log files., an integer between 100KB and 2GB; default \c
* 100MB.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;path, the path to a directory into
* which the log files are written. If the value is not an absolute path name\,
@@ -1542,8 +1560,8 @@ struct __wt_connection {
* values chosen from the following options: \c "block"\, \c "ckpt"\, \c
* "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\, \c "hazard"\, \c
* "log"\, \c "lsm"\, \c "mutex"\, \c "overflow"\, \c "read"\, \c "readserver"\,
- * \c "reconcile"\, \c "salvage"\, \c "shared_cache"\, \c "verify"\, \c
- * "version"\, \c "write"; default empty.}
+ * \c "reconcile"\, \c "recovery"\, \c "salvage"\, \c "shared_cache"\, \c
+ * "verify"\, \c "version"\, \c "write"; default empty.}
* @configend
* Additionally, if a file named \c WiredTiger.config appears in the WiredTiger
* home directory, it is read for configuration values (see @ref config_file
@@ -2481,74 +2499,86 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_DH_SESSION_HANDLES 1041
/*! dhandle: sweeps conflicting with evict */
#define WT_STAT_CONN_DH_SWEEP_EVICT 1042
-/*! dhandle: number of sweep attempts */
+/*! dhandle: sweep attempts */
#define WT_STAT_CONN_DH_SWEEPS 1043
/*! files currently open */
#define WT_STAT_CONN_FILE_OPEN 1044
+/*! log: log buffer size increases */
+#define WT_STAT_CONN_LOG_BUFFER_GROW 1045
+/*! log: total log buffer size */
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1046
/*! log: user provided log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_USER 1045
+#define WT_STAT_CONN_LOG_BYTES_USER 1047
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1046
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1048
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1047
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1049
/*! log: log read operations */
-#define WT_STAT_CONN_LOG_READS 1048
+#define WT_STAT_CONN_LOG_READS 1050
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1049
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1051
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1050
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1052
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1051
+#define WT_STAT_CONN_LOG_SCANS 1053
/*! log: consolidated slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1052
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1054
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1053
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1055
/*! log: consolidated slot joins */
-#define WT_STAT_CONN_LOG_SLOT_JOINS 1054
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1056
/*! log: consolidated slot join races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1055
+#define WT_STAT_CONN_LOG_SLOT_RACES 1057
+/*! log: log slot ready wait timeouts */
+#define WT_STAT_CONN_LOG_SLOT_READY_WAIT_TIMEOUT 1058
+/*! log: log slot release wait timeouts */
+#define WT_STAT_CONN_LOG_SLOT_RELEASE_WAIT_TIMEOUT 1059
+/*! log: slots selected for switching that were unavailable */
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1060
/*! log: record size exceeded maximum */
-#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1056
+#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1061
+/*! log: failed to find a slot large enough for record */
+#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1062
/*! log: consolidated slot join transitions */
-#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1057
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1063
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1058
+#define WT_STAT_CONN_LOG_SYNC 1064
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1059
+#define WT_STAT_CONN_LOG_WRITES 1065
/*! rows merged in an LSM tree */
-#define WT_STAT_CONN_LSM_ROWS_MERGED 1060
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1066
/*! memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1061
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1067
/*! memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1062
+#define WT_STAT_CONN_MEMORY_FREE 1068
/*! memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1063
+#define WT_STAT_CONN_MEMORY_GROW 1069
/*! total read I/Os */
-#define WT_STAT_CONN_READ_IO 1064
+#define WT_STAT_CONN_READ_IO 1070
/*! page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1065
+#define WT_STAT_CONN_REC_PAGES 1071
/*! page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1066
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1072
/*! reconciliation failed because an update could not be included */
-#define WT_STAT_CONN_REC_SKIPPED_UPDATE 1067
+#define WT_STAT_CONN_REC_SKIPPED_UPDATE 1073
/*! pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1068
+#define WT_STAT_CONN_RWLOCK_READ 1074
/*! pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1069
+#define WT_STAT_CONN_RWLOCK_WRITE 1075
/*! open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1070
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1076
/*! transactions */
-#define WT_STAT_CONN_TXN_BEGIN 1071
+#define WT_STAT_CONN_TXN_BEGIN 1077
/*! transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1072
+#define WT_STAT_CONN_TXN_CHECKPOINT 1078
/*! transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1073
+#define WT_STAT_CONN_TXN_COMMIT 1079
/*! transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1074
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1080
/*! transactions rolled-back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1075
+#define WT_STAT_CONN_TXN_ROLLBACK 1081
/*! total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1076
+#define WT_STAT_CONN_WRITE_IO 1082
/*!
* @}
@@ -2721,7 +2751,7 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_DSRC_REC_SPLIT_INTERNAL 2080
/*! reconciliation leaf pages split */
#define WT_STAT_DSRC_REC_SPLIT_LEAF 2081
-/*! reconciliation maximum number of splits created for a page */
+/*! reconciliation maximum splits for a page */
#define WT_STAT_DSRC_REC_SPLIT_MAX 2082
/*! object compaction */
#define WT_STAT_DSRC_SESSION_COMPACT 2083
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index b8b55640296..d36656be947 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -147,6 +147,10 @@ struct __wt_insert_head;
typedef struct __wt_insert_head WT_INSERT_HEAD;
struct __wt_log_desc;
typedef struct __wt_log_desc WT_LOG_DESC;
+struct __wt_log_op_desc;
+ typedef struct __wt_log_op_desc WT_LOG_OP_DESC;
+struct __wt_log_rec_desc;
+ typedef struct __wt_log_rec_desc WT_LOG_REC_DESC;
struct __wt_lsm_chunk;
typedef struct __wt_lsm_chunk WT_LSM_CHUNK;
struct __wt_lsm_data_source;
@@ -203,6 +207,8 @@ struct __wt_txn;
typedef struct __wt_txn WT_TXN;
struct __wt_txn_global;
typedef struct __wt_txn_global WT_TXN_GLOBAL;
+struct __wt_txn_op;
+ typedef struct __wt_txn_op WT_TXN_OP;
struct __wt_txn_state;
typedef struct __wt_txn_state WT_TXN_STATE;
struct __wt_update;
@@ -261,7 +267,6 @@ struct __wt_update;
#include "bitstring.i"
#include "column.i"
-#include "log.i"
#include "serial.i"
#if defined(__cplusplus)
diff --git a/src/log/log.c b/src/log/log.c
index 7f5d2d7f139..c3b61108422 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -8,17 +8,82 @@
#include "wt_internal.h"
/*
- * __wt_log_getfiles --
+ * __wt_log_ckpt --
+ * Record the given LSN as the checkpoint LSN and signal the archive
+ * thread as needed.
+ */
+int
+__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+ log->ckpt_lsn = *ckp_lsn;
+ if (conn->arch_cond != NULL)
+ WT_RET(__wt_cond_signal(session, conn->arch_cond));
+ return (0);
+}
+
+/*
+ * __wt_log_get_files --
* Retrieve the list of all existing log files.
*/
int
-__wt_log_getfiles(WT_SESSION_IMPL *session, char ***files, u_int *count)
+__wt_log_get_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp)
{
WT_CONNECTION_IMPL *conn;
+ const char *log_path;
+
+ *countp = 0;
+ *filesp = NULL;
conn = S2C(session);
- return (__wt_dirlist(session, conn->log_path, WT_LOG_FILENAME,
- WT_DIRLIST_INCLUDE, files, count));
+ log_path = conn->log_path;
+ if (log_path == NULL)
+ log_path = "";
+ return (__wt_dirlist(session, log_path, WT_LOG_FILENAME,
+ WT_DIRLIST_INCLUDE, filesp, countp));
+}
+
+/*
+ * __wt_log_get_active_files --
+ * Retrieve the list of active log files (those that are not candidates
+ * for archiving).
+ */
+int
+__wt_log_get_active_files(
+ WT_SESSION_IMPL *session, char ***filesp, u_int *countp)
+{
+ WT_DECL_RET;
+ WT_LOG *log;
+ char **files;
+ uint32_t id;
+ u_int count, i;
+
+ log = S2C(session)->log;
+
+ WT_RET(__wt_log_get_files(session, &files, &count));
+
+ /* Filter out any files that are below the checkpoint LSN. */
+ for (i = 0; i < count; ) {
+ WT_ERR(__wt_log_extract_lognum(session, files[i], &id));
+ if (id < log->ckpt_lsn.file) {
+ __wt_free(session, files[i]);
+ files[i] = files[count - 1];
+ files[--count] = NULL;
+ } else
+ i++;
+ }
+
+ *filesp = files;
+ *countp = count;
+
+ if (0) {
+err: __wt_log_files_free(session, files, count);
+ }
+ return (ret);
}
/*
@@ -70,11 +135,9 @@ __wt_log_extract_lognum(
if (id == NULL || name == NULL)
return (0);
- if ((p = strrchr(name, '.')) == NULL)
- return (WT_ERROR);
- ++p;
- if ((sscanf(++p, "%" PRIu32, id)) != 1)
- return (WT_ERROR);
+ if ((p = strrchr(name, '.')) == NULL ||
+ sscanf(++p, "%" PRIu32, id) != 1)
+ WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name);
return (0);
}
@@ -138,7 +201,8 @@ __wt_log_open(WT_SESSION_IMPL *session)
log = conn->log;
lastlog = 0;
firstlog = UINT32_MAX;
- WT_RET(__wt_log_getfiles(session, &logfiles, &logcount));
+
+ WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
for (i = 0; i < logcount; i++) {
WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
lastlog = WT_MAX(lastlog, lognum);
@@ -149,13 +213,23 @@ __wt_log_open(WT_SESSION_IMPL *session)
firstlog, lastlog);
log->first_lsn.file = firstlog;
log->first_lsn.offset = 0;
+
/*
* Start logging at the beginning of the next log file, no matter
* where the previous log file ends.
*/
WT_ERR(__wt_log_newfile(session, 1));
-err:
- __wt_log_files_free(session, logfiles, logcount);
+
+ /*
+ * If there were log files, run recovery.
+ * XXX belongs at a higher level than this.
+ */
+ if (logcount > 0) {
+ log->trunc_lsn = log->alloc_lsn;
+ WT_ERR(__wt_txn_recover(session));
+ }
+
+err: __wt_log_files_free(session, logfiles, logcount);
return (ret);
}
@@ -186,9 +260,13 @@ __wt_log_close(WT_SESSION_IMPL *session)
return (0);
}
+/*
+ * __log_fill --
+ * Copy a thread's log records into the assigned slot.
+ */
static int
__log_fill(WT_SESSION_IMPL *session,
- WT_MYSLOT *myslot, WT_ITEM *record, WT_LSN *lsnp)
+ WT_MYSLOT *myslot, int direct, WT_ITEM *record, WT_LSN *lsnp)
{
WT_DECL_RET;
WT_LOG_RECORD *logrec;
@@ -199,9 +277,14 @@ __log_fill(WT_SESSION_IMPL *session,
* If the offset becomes a unit of LOG_ALIGN this is where we would
* multiply by LOG_ALIGN to get the real file byte offset for write().
*/
- WT_ERR(__wt_write(session, myslot->slot->slot_fh,
- myslot->offset + myslot->slot->slot_start_offset,
- logrec->len, (void *)logrec));
+ if (direct)
+ WT_ERR(__wt_write(session, myslot->slot->slot_fh,
+ myslot->offset + myslot->slot->slot_start_offset,
+ logrec->len, (void *)logrec));
+ else
+ memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
+ logrec, logrec->len);
+
WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, logrec->len);
if (lsnp != NULL) {
*lsnp = myslot->slot->slot_start_lsn;
@@ -228,27 +311,66 @@ __log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize)
/*
* __log_truncate --
- * Truncate the log to the given LSN. In addition to setting the
- * given LSN as the new end of log file, it will remove any later
- * log files that may exist. This function assumes we are in recovery
- * or other dedicated time and not during live running.
+ * Truncate the log to the given LSN. If this_log is set, it will only
+ * truncate the log file indicated in the given LSN. If not set,
+ * it will truncate between the given LSN and the trunc_lsn. That is,
+ * since we pre-allocate log files, it will free that space and allow the
+ * log to be traversed. We use the trunc_lsn because logging has already
+ * opened the new/next log file before recovery ran. This function assumes
+ * we are in recovery or other dedicated time and not during live running.
*/
static int
-__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn)
+__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, uint32_t this_log)
{
+ WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
+ WT_FH *log_fh, *tmp_fh;
+ WT_LOG *log;
uint32_t lognum;
u_int i, logcount;
char **logfiles;
- WT_RET(__wt_log_getfiles(session, &logfiles, &logcount));
+ conn = S2C(session);
+ log = conn->log;
+ log_fh = NULL;
+ logcount = 0;
+ logfiles = NULL;
+
+ /*
+ * Truncate the log file to the given LSN.
+ */
+ WT_ERR(__log_openfile(session, 0, &log_fh, lsn->file));
+ WT_ERR(__wt_ftruncate(session, log_fh, lsn->offset));
+ tmp_fh = log_fh;
+ log_fh = NULL;
+ WT_ERR(__wt_close(session, tmp_fh));
+
+ /*
+ * If we just want to truncate the current log, return and skip
+ * looking for intervening logs.
+ */
+ if (this_log)
+ goto err;
+ WT_ERR(__wt_log_get_files(session, &logfiles, &logcount));
for (i = 0; i < logcount; i++) {
WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
- if (lognum > lsn->file)
- WT_ERR(__wt_log_remove(session, lognum));
+ if (lognum > lsn->file && lognum < log->trunc_lsn.file) {
+ WT_ERR(__log_openfile(session, 0, &log_fh, lognum));
+ /*
+ * If there are intervening files pre-allocated,
+ * truncate them to the end of the log file header.
+ */
+ WT_ERR(__wt_ftruncate(session,
+ log_fh, LOG_FIRST_RECORD));
+ tmp_fh = log_fh;
+ log_fh = NULL;
+ WT_ERR(__wt_close(session, tmp_fh));
+ }
}
-
-err: __wt_log_files_free(session, logfiles, logcount);
+err: if (log_fh != NULL)
+ WT_TRET(__wt_close(session, log_fh));
+ if (logfiles != NULL)
+ __wt_log_files_free(session, logfiles, logcount);
return (ret);
}
@@ -263,6 +385,7 @@ __log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, off_t *eof)
WT_DECL_RET;
WT_LOG *log;
uint64_t rec;
+ uint32_t allocsize;
off_t log_size, off;
conn = S2C(session);
@@ -271,6 +394,10 @@ __log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, off_t *eof)
return (0);
*eof = 0;
WT_ERR(__wt_filesize(session, fh, &log_size));
+ if (log == NULL)
+ allocsize = LOG_ALIGN;
+ else
+ allocsize = log->allocsize;
/*
* We know all log records are aligned at log->allocsize. The first
* item in a log record is always the length. Look for any non-zero
@@ -278,9 +405,9 @@ __log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, off_t *eof)
* it could be the middle of a large record. But we know no log record
* starts after it. Return an estimate of the log file size.
*/
- for (off = log_size - (off_t)log->allocsize;
+ for (off = log_size - (off_t)allocsize;
off > 0;
- off -= (off_t)log->allocsize) {
+ off -= (off_t)allocsize) {
WT_ERR(__wt_read(session, fh, off, sizeof(uint64_t), &rec));
if (rec != 0)
break;
@@ -288,7 +415,7 @@ __log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, off_t *eof)
/*
* Set EOF to the last zero-filled record we saw.
*/
- *eof = off + (off_t)log->allocsize;
+ *eof = off + (off_t)allocsize;
err:
return (ret);
}
@@ -316,7 +443,7 @@ __log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
if (!__log_size_fit(session, &log->alloc_lsn, recsize)) {
WT_RET(__wt_log_newfile(session, 0));
if (log->log_close_fh != NULL)
- FLD_SET(slot->slot_flags, SLOT_CLOSEFH);
+ F_SET(slot, SLOT_CLOSEFH);
}
/*
* Need to minimally fill in slot info here. Our slot start LSN
@@ -337,6 +464,10 @@ __log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
return (0);
}
+/*
+ * __log_release --
+ * Release a log slot.
+ */
static int
__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
@@ -344,6 +475,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
WT_DECL_RET;
WT_FH *close_fh;
WT_LOG *log;
+ uint32_t write_size;
conn = S2C(session);
log = conn->log;
@@ -352,23 +484,53 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
* of the file handle structure.
*/
close_fh = NULL;
- if (FLD_ISSET(slot->slot_flags, SLOT_CLOSEFH)) {
+ if (F_ISSET(slot, SLOT_CLOSEFH)) {
close_fh = log->log_close_fh;
log->log_close_fh = NULL;
- FLD_CLR(slot->slot_flags, SLOT_CLOSEFH);
+ F_CLR(slot, SLOT_CLOSEFH);
+ }
+
+ /* Write the buffered records */
+ if (F_ISSET(slot, SLOT_BUFFERED)) {
+ write_size = (uint32_t)
+ (slot->slot_end_lsn.offset - slot->slot_start_offset);
+ WT_ERR(__wt_write(session, slot->slot_fh,
+ slot->slot_start_offset, write_size, slot->slot_buf.mem));
}
+
/*
- * Wait for earlier groups to finish.
+ * Wait for earlier groups to finish, otherwise there could be holes
+ * in the log file.
*/
- while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0)
- __wt_yield();
- if (FLD_ISSET(slot->slot_flags, SLOT_SYNC)) {
+ while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) {
+ /*
+ * Workloads with fast commits (no-sync is a reasonable
+ * approximation) benefit from yielding rather than using the
+ * more heavy weight condition wait.
+ */
+ if (S2C(session)->txn_logsync == 0)
+ __wt_yield();
+ else if (__wt_cond_wait(session,
+ log->log_release_cond, 10000) == ETIMEDOUT)
+ WT_STAT_FAST_CONN_INCR(session,
+ log_slot_release_wait_timeout);
+ }
+ if (F_ISSET(slot, SLOT_SYNC)) {
WT_STAT_FAST_CONN_INCR(session, log_sync);
WT_ERR(__wt_fsync(session, log->log_fh));
- FLD_CLR(slot->slot_flags, SLOT_SYNC);
+ F_CLR(slot, SLOT_SYNC);
log->sync_lsn = slot->slot_end_lsn;
}
log->write_lsn = slot->slot_end_lsn;
+ WT_ERR(__wt_cond_signal(session, log->log_release_cond));
+ if (F_ISSET(slot, SLOT_BUF_GROW)) {
+ WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
+ F_CLR(slot, SLOT_BUF_GROW);
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_buffer_size, slot->slot_buf.memsize);
+ WT_ERR(__wt_buf_grow(session,
+ &slot->slot_buf, slot->slot_buf.memsize * 2));
+ }
/*
* If we have a file to close, close it now.
*/
@@ -433,6 +595,7 @@ __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create)
memset(&tmp, 0, sizeof(tmp));
myslot.slot = &tmp;
myslot.offset = 0;
+
/*
* Recursively call __log_acquire to allocate log space for the
* log descriptor record. Call __log_fill to write it, but we
@@ -440,7 +603,8 @@ __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create)
* earlier operations to complete.
*/
WT_ERR(__log_acquire(session, logrec->len, &tmp));
- WT_ERR(__log_fill(session, &myslot, buf, NULL));
+ WT_ERR(__log_fill(session, &myslot, 1, buf, NULL));
+
/*
* If we're called from connection creation code, we need to update
* the LSNs since we're the only write in progress.
@@ -517,11 +681,8 @@ __wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
cksum = logrec->checksum;
logrec->checksum = 0;
logrec->checksum = __wt_cksum(logrec, logrec->len);
- if (logrec->checksum != cksum) {
- WT_VERBOSE_ERR(session, log, "log_read: Bad checksum");
- ret = WT_ERROR;
- goto err;
- }
+ if (logrec->checksum != cksum)
+ WT_ERR_MSG(session, WT_ERROR, "log_read: Bad checksum");
record->size = logrec->len;
WT_STAT_FAST_CONN_INCR(session, log_reads);
err:
@@ -529,6 +690,10 @@ err:
return (ret);
}
+/*
+ * __wt_log_scan --
+ * Scan the logs, calling a function on each record found.
+ */
int
__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
int (*func)(WT_SESSION_IMPL *session,
@@ -542,11 +707,16 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
WT_LOG_RECORD *logrec;
WT_LSN end_lsn, rd_lsn, start_lsn;
off_t log_size;
- uint32_t cksum, rdup_len, reclen;
+ uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen;
+ u_int i, logcount;
int done;
+ char **logfiles;
conn = S2C(session);
log = conn->log;
+ log_fh = NULL;
+ logcount = 0;
+ logfiles = NULL;
WT_CLEAR(buf);
/*
@@ -556,27 +726,68 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
if (func == NULL)
return (0);
- if (lsnp == NULL) {
- if (LF_ISSET(WT_LOGSCAN_FIRST))
- start_lsn = log->first_lsn;
- else if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
- start_lsn = log->ckpt_lsn;
- else
- return (WT_ERROR); /* Illegal usage */
- } else {
- if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
- return (WT_ERROR); /* Illegal usage */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER))
+ WT_VERBOSE_RET(session, log,
+ "__wt_log_scan truncating to %u/%" PRIuMAX,
+ log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset);
+
+ if (log != NULL) {
+ allocsize = log->allocsize;
+
+ if (lsnp == NULL) {
+ if (LF_ISSET(WT_LOGSCAN_FIRST))
+ start_lsn = log->first_lsn;
+ else if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
+ start_lsn = log->ckpt_lsn;
+ else
+ return (WT_ERROR); /* Illegal usage */
+ } else {
+ if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
+ WT_RET_MSG(session, WT_ERROR,
+ "choose either a start LSN or a start flag");
- /* Offsets must be on allocation boundaries. */
- if (lsnp->offset % log->allocsize != 0 ||
- lsnp->file > log->fileid)
- return (WT_NOTFOUND);
+ /* Offsets must be on allocation boundaries. */
+ if (lsnp->offset % allocsize != 0 ||
+ lsnp->file > log->fileid)
+ return (WT_NOTFOUND);
- start_lsn = *lsnp;
+ start_lsn = *lsnp;
+ }
+
+ end_lsn = log->alloc_lsn;
+ } else {
+ /*
+ * If logging is not configured, we can still print out the log
+ * if log files exist. We just need to set the LSNs from what
+ * is in the files versus what is in the live connection.
+ */
+ /*
+ * Set allocsize to the minimum alignment it could be. Larger
+ * records and larger allocation boundaries should always be
+ * a multiple of this.
+ */
+ allocsize = LOG_ALIGN;
+ lastlog = 0;
+ firstlog = UINT32_MAX;
+ WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
+ if (logcount == 0)
+ /*
+ * Return it is not supported if none don't exist.
+ */
+ return (ENOTSUP);
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i],
+ &lognum));
+ lastlog = WT_MAX(lastlog, lognum);
+ firstlog = WT_MIN(firstlog, lognum);
+ }
+ start_lsn.file = firstlog;
+ end_lsn.file = lastlog;
+ start_lsn.offset = end_lsn.offset = 0;
+ __wt_log_files_free(session, logfiles, logcount);
+ logfiles = NULL;
}
- end_lsn = log->alloc_lsn;
- log_fh = NULL;
- WT_RET(__log_openfile(session, 0, &log_fh, start_lsn.file));
+ WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file));
WT_ERR(__log_filesize(session, log_fh, &log_size));
if (LF_ISSET(WT_LOGSCAN_ONE))
done = 1;
@@ -591,6 +802,11 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
*/
WT_ERR(__wt_close(session, log_fh));
log_fh = NULL;
+ /*
+ * Truncate this log file before we move to the next.
+ */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER))
+ WT_ERR(__log_truncate(session, &rd_lsn, 1));
rd_lsn.file++;
rd_lsn.offset = 0;
/*
@@ -607,9 +823,9 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
/*
* Read the minimum allocation size a record could be.
*/
- WT_ASSERT(session, buf.memsize >= log->allocsize);
+ WT_ASSERT(session, buf.memsize >= allocsize);
WT_ERR(__wt_read(
- session, log_fh, rd_lsn.offset, log->allocsize, buf.mem));
+ session, log_fh, rd_lsn.offset, allocsize, buf.mem));
/*
* First 8 bytes is the real record length. See if we
* need to read more than the allocation size. We expect
@@ -625,16 +841,11 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
* that may exist.
*/
if (reclen == 0) {
- /*
- * This LSN is the end. Truncate if we're in recovery
- * otherwise, just stop.
- */
- if (LF_ISSET(WT_LOGSCAN_RECOVER))
- WT_ERR(__log_truncate(session, &rd_lsn));
+ /* This LSN is the end. */
break;
}
- rdup_len = __wt_rduppo2(reclen, log->allocsize);
- if (reclen > log->allocsize) {
+ rdup_len = __wt_rduppo2(reclen, allocsize);
+ if (reclen > allocsize) {
WT_ERR(__wt_buf_grow(session, &buf, rdup_len));
WT_ERR(__wt_read(
session, log_fh, rd_lsn.offset, reclen, buf.mem));
@@ -649,8 +860,16 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
logrec->checksum = 0;
logrec->checksum = __wt_cksum(logrec, logrec->len);
if (logrec->checksum != cksum) {
- ret = WT_ERROR;
- goto err;
+ /*
+ * A checksum mismatch means we have reached the end of
+ * the useful part of the log. This should be found on
+ * the first pass through recovery. In the second pass
+ * where we truncate the log, this is where it should
+ * end.
+ */
+ if (log != NULL)
+ log->trunc_lsn = rd_lsn;
+ break;
}
/*
@@ -664,7 +883,14 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
rd_lsn.offset += (off_t)rdup_len;
} while (!done);
+ /* Truncate if we're in recovery. */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
+ LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
+ WT_ERR(__log_truncate(session, &rd_lsn, 0));
+
err: WT_STAT_FAST_CONN_INCR(session, log_scans);
+ if (logfiles != NULL)
+ __wt_log_files_free(session, logfiles, logcount);
__wt_buf_free(session, &buf);
if (ret == ENOENT)
ret = 0;
@@ -673,6 +899,48 @@ err: WT_STAT_FAST_CONN_INCR(session, log_scans);
return (ret);
}
+/*
+ * __log_direct_write --
+ * Write a log record without using the consolidation arrays.
+ */
+static int
+__log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+ uint32_t flags)
+{
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOGSLOT tmp;
+ WT_MYSLOT myslot;
+ int locked;
+
+ log = S2C(session)->log;
+ locked = 0;
+ myslot.slot = &tmp;
+ myslot.offset = 0;
+
+ /* Fast path the contended case. */
+ if (__wt_spin_trylock(session, &log->log_slot_lock) != 0)
+ return (EAGAIN);
+
+ memset(&tmp, 0, sizeof(tmp));
+ locked = 1;
+ if (LF_ISSET(WT_LOG_FSYNC))
+ F_SET(&tmp, SLOT_SYNC);
+ WT_ERR(__log_acquire(session, record->size, &tmp));
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ locked = 0;
+ WT_ERR(__log_fill(session, &myslot, 1, record, lsnp));
+ WT_ERR(__log_release(session, &tmp));
+
+err: if (locked)
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ return (ret);
+}
+
+/*
+ * __wt_log_write --
+ * Write a record into the log.
+ */
int
__wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
uint32_t flags)
@@ -681,7 +949,6 @@ __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
WT_DECL_RET;
WT_LOG *log;
WT_LOG_RECORD *logrec;
- WT_LOGSLOT tmp;
WT_LSN tmp_lsn;
WT_MYSLOT myslot;
uint32_t rdup_len;
@@ -691,8 +958,6 @@ __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
log = conn->log;
locked = 0;
INIT_LSN(&tmp_lsn);
- myslot.slot = &tmp;
- myslot.offset = 0;
/*
* Assume the WT_ITEM the user passed is a WT_LOG_RECORD, which has
* a header at the beginning for us to fill in.
@@ -720,24 +985,45 @@ __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
logrec->checksum = 0;
logrec->checksum = __wt_cksum(logrec, record->size);
- memset(&tmp, 0, sizeof(tmp));
WT_STAT_FAST_CONN_INCR(session, log_writes);
- if (__wt_spin_trylock(session, &log->log_slot_lock) == 0) {
+
+ if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) {
+ ret = __log_direct_write(session, record, lsnp, flags);
+ if (ret == 0)
+ return (0);
+ if (ret != EAGAIN)
+ WT_ERR(ret);
/*
- * No contention, just write our record. We're not using
- * the consolidation arrays, so send in the tmp slot.
+ * An EAGAIN return means we failed to get the try lock -
+ * fall through to the consolidation code in that case.
*/
- locked = 1;
- if (LF_ISSET(WT_LOG_FSYNC))
- FLD_SET(tmp.slot_flags, SLOT_SYNC);
- WT_ERR(__log_acquire(session, rdup_len, &tmp));
- __wt_spin_unlock(session, &log->log_slot_lock);
- locked = 0;
- WT_ERR(__log_fill(session, &myslot, record, lsnp));
- WT_ERR(__log_release(session, &tmp));
+ }
+
+ /*
+ * As soon as we see contention for the log slot, disable direct
+ * log writes. We get better performance by forcing writes through
+ * the consolidation code. This is because individual writes flood
+ * the I/O system faster than they contend on the log slot lock.
+ */
+ F_SET(log, WT_LOG_FORCE_CONSOLIDATE);
+ if ((ret = __wt_log_slot_join(
+ session, rdup_len, flags, &myslot)) == ENOMEM) {
+ /*
+ * If we couldn't find a consolidated slot for this record
+ * write the record directly.
+ */
+ while ((ret = __log_direct_write(
+ session, record, lsnp, flags)) == EAGAIN)
+ ;
+ WT_ERR(ret);
+ /*
+ * Increase the buffer size of any slots we can get access
+ * to, so future consolidations are likely to succeed.
+ */
+ WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len));
return (0);
}
- WT_ERR(__wt_log_slot_join(session, rdup_len, flags, &myslot));
+ WT_ERR(ret);
if (myslot.offset == 0) {
__wt_spin_lock(session, &log->log_slot_lock);
locked = 1;
@@ -746,15 +1032,16 @@ __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
session, myslot.slot->slot_group_size, myslot.slot));
__wt_spin_unlock(session, &log->log_slot_lock);
locked = 0;
- WT_ERR(__wt_log_slot_notify(myslot.slot));
+ WT_ERR(__wt_log_slot_notify(session, myslot.slot));
} else
- WT_ERR(__wt_log_slot_wait(myslot.slot));
- WT_ERR(__log_fill(session, &myslot, record, &tmp_lsn));
+ WT_ERR(__wt_log_slot_wait(session, myslot.slot));
+ WT_ERR(__log_fill(session, &myslot, 0, record, &tmp_lsn));
if (__wt_log_slot_release(myslot.slot, rdup_len) ==
WT_LOG_SLOT_DONE) {
WT_ERR(__log_release(session, myslot.slot));
WT_ERR(__wt_log_slot_free(myslot.slot));
} else if (LF_ISSET(WT_LOG_FSYNC)) {
+ /* Wait for our slot to be finalized */
while (LOG_CMP(&log->sync_lsn, &tmp_lsn) <= 0 &&
myslot.slot->slot_error == 0)
__wt_yield();
@@ -773,22 +1060,23 @@ err:
*/
if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0)
ret = myslot.slot->slot_error;
- if (LF_ISSET(WT_LOG_CKPT) && ret == 0) {
- log->ckpt_lsn = tmp_lsn;
- if (conn->arch_cond != NULL)
- WT_ERR(__wt_cond_signal(session, conn->arch_cond));
- }
return (ret);
}
+/*
+ * __wt_log_vprintf --
+ * Write a message into the log.
+ */
int
__wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
{
WT_CONNECTION_IMPL *conn;
- WT_DECL_ITEM(buf);
- WT_LOG_RECORD *logrec;
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
va_list ap_copy;
- size_t len;
+ const char *rec_fmt = WT_UNCHECKED_STRING(I);
+ uint32_t rectype = WT_LOGREC_MESSAGE;
+ size_t header_size, len;
conn = S2C(session);
@@ -796,18 +1084,31 @@ __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
return (0);
va_copy(ap_copy, ap);
- len = (size_t)vsnprintf(NULL, 0, fmt, ap_copy) + sizeof(WT_LOG_RECORD);
+ len = (size_t)vsnprintf(NULL, 0, fmt, ap_copy) + 1;
va_end(ap_copy);
- WT_RET(__wt_scr_alloc(session, 0, &buf));
- WT_RET(__wt_buf_initsize(session, buf, len));
+ WT_RET(
+ __wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec));
- logrec = (WT_LOG_RECORD *)buf->mem;
- (void)vsnprintf((char *)logrec->record, len, fmt, ap);
+ /*
+ * We're writing a record with the type (an integer) followed by a
+ * string (NUL-terminated data). To avoid writing the string into
+ * a buffer before copying it, we write the header first, then the
+ * raw bytes of the string.
+ */
+ WT_ERR(__wt_struct_size(session, &header_size, rec_fmt, rectype));
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, header_size,
+ rec_fmt, rectype));
+ logrec->size += (uint32_t)header_size;
- WT_VERBOSE_RET(session, log,
- "log_printf: %s", (char *)logrec->record);
- WT_RET(__wt_log_write(session, buf, NULL, 0));
- __wt_scr_free(&buf);
- return (0);
+ (void)vsnprintf((char *)logrec->data + logrec->size, len, fmt, ap);
+
+ WT_VERBOSE_ERR(session, log,
+ "log_printf: %s", (char *)logrec->data + logrec->size);
+
+ logrec->size += len;
+ WT_ERR(__wt_log_write(session, logrec, NULL, 0));
+err: __wt_scr_free(&logrec);
+ return (ret);
}
diff --git a/src/log/log_auto.c b/src/log/log_auto.c
new file mode 100644
index 00000000000..50d80d67434
--- /dev/null
+++ b/src/log/log_auto.c
@@ -0,0 +1,431 @@
+/* DO NOT EDIT: automatically built by dist/log.py. */
+
+#include "wt_internal.h"
+
+int
+__wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp)
+{
+ WT_ITEM *logrec;
+
+ WT_RET(__wt_scr_alloc(session, WT_ALIGN(size + 1, LOG_ALIGN), &logrec));
+ WT_CLEAR(*(WT_LOG_RECORD *)logrec->data);
+ logrec->size = offsetof(WT_LOG_RECORD, record);
+
+ *logrecp = logrec;
+ return (0);
+}
+
+void
+__wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp)
+{
+ WT_UNUSED(session);
+ __wt_scr_free(logrecp);
+}
+
+int
+__wt_logrec_read(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end, uint32_t *rectypep)
+{
+ uint64_t rectype;
+
+ WT_UNUSED(session);
+ WT_RET(__wt_vunpack_uint(pp, WT_PTRDIFF(end, *pp), &rectype));
+ *rectypep = (uint32_t)rectype;
+ return (0);
+}
+
+int
+__wt_logop_read(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end,
+ uint32_t *optypep, uint32_t *opsizep)
+{
+ return (__wt_struct_unpack(
+ session, *pp, WT_PTRDIFF(end, *pp), "II", optypep, opsizep));
+}
+
+int
+__wt_logop_col_put_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, uint64_t recno, WT_ITEM *value)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIru);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_COL_PUT;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, recno, value));
+
+ size += __wt_vsize_uint(size) - 1;
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, recno, value));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_col_put_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIru);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, recnop, valuep));
+ WT_ASSERT(session, optype == WT_LOGOP_COL_PUT);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_col_put_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ uint64_t recno;
+ WT_ITEM value;
+
+ WT_RET(__wt_logop_col_put_unpack(
+ session, pp, end, &fileid, &recno, &value));
+
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno);
+ fprintf(out, " \"value\": \"%.*s\",\n",
+ (int)value.size, (const char *)value.data);
+ return (0);
+}
+
+int
+__wt_logop_col_remove_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, uint64_t recno)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIr);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_COL_REMOVE;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, recno));
+
+ size += __wt_vsize_uint(size) - 1;
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, recno));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_col_remove_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, uint64_t *recnop)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIr);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, recnop));
+ WT_ASSERT(session, optype == WT_LOGOP_COL_REMOVE);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_col_remove_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ uint64_t recno;
+
+ WT_RET(__wt_logop_col_remove_unpack(
+ session, pp, end, &fileid, &recno));
+
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno);
+ return (0);
+}
+
+int
+__wt_logop_col_truncate_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, uint64_t start, uint64_t stop)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIrr);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_COL_TRUNCATE;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, start, stop));
+
+ size += __wt_vsize_uint(size) - 1;
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, start, stop));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_col_truncate_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, uint64_t *startp, uint64_t *stopp)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIrr);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, startp, stopp));
+ WT_ASSERT(session, optype == WT_LOGOP_COL_TRUNCATE);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_col_truncate_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ uint64_t start;
+ uint64_t stop;
+
+ WT_RET(__wt_logop_col_truncate_unpack(
+ session, pp, end, &fileid, &start, &stop));
+
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"start\": \"%" PRIu64 "\",\n", start);
+ fprintf(out, " \"stop\": \"%" PRIu64 "\",\n", stop);
+ return (0);
+}
+
+int
+__wt_logop_row_put_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, WT_ITEM *key, WT_ITEM *value)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIuu);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_ROW_PUT;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, key, value));
+
+ size += __wt_vsize_uint(size) - 1;
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, key, value));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_row_put_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIuu);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, keyp, valuep));
+ WT_ASSERT(session, optype == WT_LOGOP_ROW_PUT);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_row_put_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ WT_ITEM key;
+ WT_ITEM value;
+
+ WT_RET(__wt_logop_row_put_unpack(
+ session, pp, end, &fileid, &key, &value));
+
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"key\": \"%.*s\",\n",
+ (int)key.size, (const char *)key.data);
+ fprintf(out, " \"value\": \"%.*s\",\n",
+ (int)value.size, (const char *)value.data);
+ return (0);
+}
+
+int
+__wt_logop_row_remove_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, WT_ITEM *key)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIu);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_ROW_REMOVE;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, key));
+
+ size += __wt_vsize_uint(size) - 1;
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, key));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_row_remove_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, WT_ITEM *keyp)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIu);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, keyp));
+ WT_ASSERT(session, optype == WT_LOGOP_ROW_REMOVE);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_row_remove_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ WT_ITEM key;
+
+ WT_RET(__wt_logop_row_remove_unpack(
+ session, pp, end, &fileid, &key));
+
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"key\": \"%.*s\",\n",
+ (int)key.size, (const char *)key.data);
+ return (0);
+}
+
+int
+__wt_logop_row_truncate_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIuuI);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_ROW_TRUNCATE;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, start, stop, mode));
+
+ size += __wt_vsize_uint(size) - 1;
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, start, stop, mode));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_row_truncate_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIuuI);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, startp, stopp, modep));
+ WT_ASSERT(session, optype == WT_LOGOP_ROW_TRUNCATE);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_row_truncate_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ WT_ITEM start;
+ WT_ITEM stop;
+ uint32_t mode;
+
+ WT_RET(__wt_logop_row_truncate_unpack(
+ session, pp, end, &fileid, &start, &stop, &mode));
+
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"start\": \"%.*s\",\n",
+ (int)start.size, (const char *)start.data);
+ fprintf(out, " \"stop\": \"%.*s\",\n",
+ (int)stop.size, (const char *)stop.data);
+ fprintf(out, " \"mode\": \"%" PRIu32 "\",\n", mode);
+ return (0);
+}
+
+int
+__wt_txn_op_printlog(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t optype, opsize;
+
+ /* Peek at the size and the type. */
+ WT_RET(__wt_logop_read(session, pp, end, &optype, &opsize));
+ end = *pp + opsize;
+
+ switch (optype) {
+ case WT_LOGOP_COL_PUT:
+ WT_RET(__wt_logop_col_put_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_COL_REMOVE:
+ WT_RET(__wt_logop_col_remove_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_COL_TRUNCATE:
+ WT_RET(__wt_logop_col_truncate_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_ROW_PUT:
+ WT_RET(__wt_logop_row_put_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_ROW_REMOVE:
+ WT_RET(__wt_logop_row_remove_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_ROW_TRUNCATE:
+ WT_RET(__wt_logop_row_truncate_print(session, pp, end, out));
+ break;
+
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
diff --git a/src/log/log_desc.c b/src/log/log_desc.c
deleted file mode 100644
index 6cfe18fc441..00000000000
--- a/src/log/log_desc.c
+++ /dev/null
@@ -1 +0,0 @@
-/* DO NOT EDIT: automatically built by dist/log.py. */
diff --git a/src/log/log_slot.c b/src/log/log_slot.c
index de0f7b2bd73..2e46698c431 100644
--- a/src/log/log_slot.c
+++ b/src/log/log_slot.c
@@ -27,9 +27,10 @@ int
__wt_log_slot_init(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
WT_LOG *log;
WT_LOGSLOT *slot;
- uint32_t i;
+ int32_t i;
conn = S2C(session);
log = conn->log;
@@ -43,16 +44,59 @@ __wt_log_slot_init(WT_SESSION_IMPL *session)
*/
for (i = 0; i < SLOT_ACTIVE; i++) {
slot = &log->slot_pool[i];
- slot->slot_index = i;
+ slot->slot_index = (uint32_t)i;
slot->slot_state = WT_LOG_SLOT_READY;
log->slot_array[i] = slot;
}
+
+ /*
+ * Allocate memory for buffers now that the arrays are setup. Split
+ * this out to make error handling simpler.
+ */
+ for (i = 0; i < SLOT_POOL; i++) {
+ WT_ERR(__wt_buf_init(session,
+ &log->slot_pool[i].slot_buf, WT_LOG_SLOT_BUF_INIT_SIZE));
+ F_SET(&log->slot_pool[i], SLOT_BUFFERED);
+ WT_ERR(__wt_cond_alloc(session,
+ "slot pool done", 0, &log->slot_pool[i].slot_done_cond));
+ }
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_buffer_size, WT_LOG_SLOT_BUF_INIT_SIZE * SLOT_POOL);
+ if (0) {
+err: while (--i >= 0)
+ __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_log_slot_destroy --
+ * Clean up the slot array on shutdown.
+ */
+int
+__wt_log_slot_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ int i;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ for (i = 0; i < SLOT_POOL; i++) {
+ __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ WT_TRET(__wt_cond_destroy(
+ session, &log->slot_pool[i].slot_done_cond));
+ }
return (0);
}
/*
* __wt_log_slot_join --
- * Join a consolidated logging slot.
+ * Join a consolidated logging slot. Callers should be prepared to deal
+ * with a ENOMEM return - which indicates no slots could accommodate
+ * the log record.
*/
int
__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
@@ -62,13 +106,14 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
WT_LOG *log;
WT_LOGSLOT *slot;
int64_t cur_state, new_state, old_state;
- uint32_t i;
+ uint32_t allocated_slot, slot_grow_attempts;
conn = S2C(session);
log = conn->log;
+ slot_grow_attempts = 0;
find_slot:
- i = __wt_random() % SLOT_ACTIVE;
- slot = log->slot_array[i];
+ allocated_slot = __wt_random() % SLOT_ACTIVE;
+ slot = log->slot_array[allocated_slot];
old_state = slot->slot_state;
join_slot:
/*
@@ -90,6 +135,18 @@ join_slot:
WT_STAT_FAST_CONN_INCR(session, log_slot_toobig);
goto find_slot;
}
+ /*
+ * If the slot buffer isn't big enough to hold this update, mark
+ * the slot for a buffer size increase and find another slot.
+ */
+ if (new_state > (int64_t)slot->slot_buf.memsize) {
+ F_SET(slot, SLOT_BUF_GROW);
+ if (++slot_grow_attempts > 5) {
+ WT_STAT_FAST_CONN_INCR(session, log_slot_toosmall);
+ return (ENOMEM);
+ }
+ goto find_slot;
+ }
cur_state = WT_ATOMIC_CAS_VAL(slot->slot_state, old_state, new_state);
/*
* We lost a race to add our size into this slot. Check the state
@@ -107,7 +164,7 @@ join_slot:
*/
WT_STAT_FAST_CONN_INCR(session, log_slot_joins);
if (LF_ISSET(WT_LOG_FSYNC))
- FLD_SET(slot->slot_flags, SLOT_SYNC);
+ F_SET(slot, SLOT_SYNC);
myslotp->slot = slot;
myslotp->offset = (off_t)old_state - WT_LOG_SLOT_READY;
return (0);
@@ -127,10 +184,12 @@ __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
WT_LOG *log;
WT_LOGSLOT *newslot;
int64_t old_state;
- uint32_t pool_i;
+ int32_t yields;
+ uint32_t pool_i, switch_fails;
conn = S2C(session);
log = conn->log;
+ switch_fails = 0;
retry:
/*
* Find an unused slot in the pool.
@@ -139,8 +198,28 @@ retry:
newslot = &log->slot_pool[pool_i];
if (++log->pool_index >= SLOT_POOL)
log->pool_index = 0;
- if (newslot->slot_state != WT_LOG_SLOT_FREE)
+ if (newslot->slot_state != WT_LOG_SLOT_FREE) {
+ WT_STAT_FAST_CONN_INCR(session, log_slot_switch_fails);
+ /*
+ * If it takes a number of attempts to find an available slot
+ * it's likely all slots are waiting to be released. This
+ * churn is used to change how long we pause before closing
+ * the slot - which leads to more consolidation and less churn.
+ */
+ if (++switch_fails % SLOT_POOL == 0 &&
+ switch_fails != 0 && slot->slot_churn < 5)
+ ++slot->slot_churn;
+ __wt_yield();
goto retry;
+ } else if (slot->slot_churn > 0) {
+ --slot->slot_churn;
+ WT_ASSERT(session, slot->slot_churn >= 0);
+ }
+
+ /* Pause to allow other threads a chance to consolidate. */
+ for (yields = slot->slot_churn; yields >= 0; yields--)
+ __wt_yield();
+
/*
* Swap out the slot we're going to use and put a free one in the
* slot array in its place so that threads can use it right away.
@@ -168,10 +247,11 @@ retry:
* Notify all threads waiting for the state to be < WT_LOG_SLOT_DONE.
*/
int
-__wt_log_slot_notify(WT_LOGSLOT *slot)
+__wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
slot->slot_state =
(int64_t)WT_LOG_SLOT_DONE - (int64_t)slot->slot_group_size;
+ WT_RET(__wt_cond_signal(session, slot->slot_done_cond));
return (0);
}
@@ -180,15 +260,26 @@ __wt_log_slot_notify(WT_LOGSLOT *slot)
* Wait for slot leader to allocate log area and tell us our log offset.
*/
int
-__wt_log_slot_wait(WT_LOGSLOT *slot)
+__wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
- while (slot->slot_state > WT_LOG_SLOT_DONE)
- __wt_yield();
+ while (slot->slot_state > WT_LOG_SLOT_DONE) {
+ /*
+ * Workloads with fast commits (no-sync is a reasonable
+ * approximation) benefit from yielding rather than using the
+ * more heavy weight condition wait.
+ */
+ if (S2C(session)->txn_logsync == 0)
+ __wt_yield();
+ else if (__wt_cond_wait(session,
+ slot->slot_done_cond, 10000) == ETIMEDOUT)
+ WT_STAT_FAST_CONN_INCR(session,
+ log_slot_ready_wait_timeout);
+ }
return (0);
}
/*
- * _wt_log_slot_release --
+ * __wt_log_slot_release --
* Each thread in a consolidated group releases its portion to
* signal it has completed writing its piece of the log.
*/
@@ -215,3 +306,63 @@ __wt_log_slot_free(WT_LOGSLOT *slot)
slot->slot_state = WT_LOG_SLOT_FREE;
return (0);
}
+
+/*
+ * __wt_log_slot_grow_buffers --
+ * Increase the buffer size of all available slots in the buffer pool.
+ * Go to some lengths to include active (but unused) slots to handle
+ * the case where all log write record sizes exceed the size of the
+ * active buffer.
+ */
+int
+__wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, int64_t newsize)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int64_t orig_state;
+ uint64_t old_size, total_growth;
+ int i;
+
+ conn = S2C(session);
+ log = conn->log;
+ total_growth = 0;
+ WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
+ /*
+ * Take the log slot lock to prevent other threads growing buffers
+ * at the same time. Could tighten the scope of this lock, or have
+ * a separate lock if there is contention.
+ */
+ __wt_spin_lock(session, &log->log_slot_lock);
+ for (i = 0; i < SLOT_POOL; i++) {
+ slot = &log->slot_pool[i];
+ /* Avoid atomic operations if they won't succeed. */
+ if (slot->slot_state != WT_LOG_SLOT_FREE &&
+ slot->slot_state != WT_LOG_SLOT_READY)
+ continue;
+ /* Don't keep growing unrelated buffers. */
+ if (slot->slot_buf.memsize > (size_t)(10 * newsize) &&
+ !F_ISSET(slot, SLOT_BUF_GROW))
+ continue;
+ orig_state = WT_ATOMIC_CAS_VAL(
+ slot->slot_state, WT_LOG_SLOT_FREE, WT_LOG_SLOT_PENDING);
+ if (orig_state != WT_LOG_SLOT_FREE) {
+ orig_state = WT_ATOMIC_CAS_VAL(slot->slot_state,
+ WT_LOG_SLOT_READY, WT_LOG_SLOT_PENDING);
+ if (orig_state != WT_LOG_SLOT_READY)
+ continue;
+ }
+
+ /* We have a slot - now go ahead and grow the buffer. */
+ old_size = slot->slot_buf.memsize;
+ F_CLR(slot, SLOT_BUF_GROW);
+ WT_ERR(__wt_buf_grow(session, &slot->slot_buf,
+ WT_MAX(slot->slot_buf.memsize * 2, (size_t)newsize)));
+ slot->slot_state = orig_state;
+ total_growth += slot->slot_buf.memsize - old_size;
+ }
+err: __wt_spin_unlock(session, &log->log_slot_lock);
+ WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, total_growth);
+ return (ret);
+}
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 68fbfa4c9d0..40d382e0e72 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -223,7 +223,7 @@ __wt_lsm_tree_setup_chunk(
}
/*
- * __wt_lsm_start_worker --
+ * __lsm_tree_start_worker --
* Start the worker thread for an LSM tree.
*/
static int
@@ -526,7 +526,7 @@ __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
}
/*
- * __lsm_tree_throttle --
+ * __wt_lsm_tree_throttle --
* Calculate whether LSM updates need to be throttled.
*/
void
diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c
index ca1759f106b..22e6df5767b 100644
--- a/src/lsm/lsm_worker.c
+++ b/src/lsm/lsm_worker.c
@@ -156,7 +156,7 @@ __wt_lsm_merge_worker(void *vargs)
* The "main" thread polls 10 times per second,
* secondary threads once per second.
*/
- WT_ERR(__wt_cond_wait(
+ WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
session, lsm_tree->work_cond,
id == 0 ? 100000 : 1000000));
stallms += (id == 0) ? 100 : 1000;
@@ -409,7 +409,7 @@ __wt_lsm_checkpoint_worker(void *arg)
__lsm_unpin_chunks(session, &cookie);
if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
!F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
- WT_ERR(__wt_cond_wait(
+ WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
session, lsm_tree->work_cond, 100000));
}
err: __lsm_unpin_chunks(session, &cookie);
diff --git a/src/meta/meta_ckpt.c b/src/meta/meta_ckpt.c
index 344275876d6..273b93ef0e6 100644
--- a/src/meta/meta_ckpt.c
+++ b/src/meta/meta_ckpt.c
@@ -365,8 +365,8 @@ format:
* Set a file's checkpoint value from the WT_CKPT list.
*/
int
-__wt_meta_ckptlist_set(
- WT_SESSION_IMPL *session, const char *fname, WT_CKPT *ckptbase)
+__wt_meta_ckptlist_set(WT_SESSION_IMPL *session,
+ const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn)
{
struct timespec ts;
WT_CKPT *ckpt;
@@ -443,6 +443,10 @@ __wt_meta_ckptlist_set(
sep = ",";
}
WT_ERR(__wt_buf_catfmt(session, buf, ")"));
+ if (ckptlsn != NULL)
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ ",checkpoint_lsn=(%" PRIu32 ",%" PRIuMAX ")",
+ ckptlsn->file, (uintmax_t)ckptlsn->offset));
WT_ERR(__ckpt_set(session, fname, buf->mem));
err: __wt_scr_free(&buf);
diff --git a/src/meta/meta_track.c b/src/meta/meta_track.c
index d9828ee6cb8..3992687f559 100644
--- a/src/meta/meta_track.c
+++ b/src/meta/meta_track.c
@@ -89,6 +89,10 @@ __wt_meta_track_on(WT_SESSION_IMPL *session)
return (0);
}
+/*
+ * __meta_track_apply --
+ * Apply the changes in a metadata tracking record.
+ */
static int
__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll)
{
@@ -151,7 +155,7 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll)
case WT_ST_REMOVE: /* Remove trk.a */
if ((tret = __wt_metadata_remove(
session, trk->a)) != 0) {
- __wt_err(session, ret,
+ __wt_err(session, tret,
"metadata unroll remove: %s",
trk->a);
WT_TRET(tret);
@@ -160,7 +164,7 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll)
case WT_ST_SET: /* Set trk.a to trk.b */
if ((tret = __wt_metadata_update(
session, trk->a, trk->b)) != 0) {
- __wt_err(session, ret,
+ __wt_err(session, tret,
"metadata unroll update %s to %s",
trk->a, trk->b);
WT_TRET(tret);
@@ -320,8 +324,8 @@ __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key)
}
/*
- * __wt_meta_track_fs_rename --
- * Track a filesystem rename operation.
+ * __wt_meta_track_fileop --
+ * Track a filesystem operation.
*/
int
__wt_meta_track_fileop(
diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c
index 0292de522ed..166975d58fb 100644
--- a/src/meta/meta_turtle.c
+++ b/src/meta/meta_turtle.c
@@ -27,7 +27,7 @@ __metadata_config(WT_SESSION_IMPL *session, const char **metaconfp)
/* Create a turtle file with default values. */
WT_RET(__wt_scr_alloc(session, 0, &buf));
WT_ERR(__wt_buf_fmt(session, buf,
- "key_format=S,value_format=S,version=(major=%d,minor=%d)",
+ "key_format=S,value_format=S,id=0,version=(major=%d,minor=%d)",
WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
cfg[1] = buf->data;
WT_ERR(__wt_config_collapse(session, cfg, &metaconf));
diff --git a/src/os_posix/os_dir.c b/src/os_posix/os_dir.c
index fab75184971..b0b7c799181 100644
--- a/src/os_posix/os_dir.c
+++ b/src/os_posix/os_dir.c
@@ -18,21 +18,22 @@ int
__wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix,
uint32_t flags, char ***dirlist, u_int *countp)
{
- WT_DECL_RET;
struct dirent *dp;
DIR *dirp;
+ WT_DECL_RET;
const char *path;
- char **entries;
size_t dirallocsz;
u_int count, dirsz;
int match;
+ char **entries;
+
+ *dirlist = NULL;
+ *countp = 0;
WT_RET(__wt_filename(session, dir, &path));
- *countp = 0;
- dirallocsz = 0;
- *dirlist = NULL;
dirp = NULL;
+ dirallocsz = 0;
dirsz = 0;
entries = NULL;
if (flags == 0)
diff --git a/src/os_posix/os_mtx.c b/src/os_posix/os_mtx.c
index e6afa5c653b..762668b1672 100644
--- a/src/os_posix/os_mtx.c
+++ b/src/os_posix/os_mtx.c
@@ -52,7 +52,7 @@ err: __wt_free(session, cond);
}
/*
- * __wt_cond_wait
+ * __wt_cond_wait --
* Wait on a mutex, optionally timing out.
*/
int
@@ -192,7 +192,7 @@ err: __wt_free(session, rwlock);
}
/*
- * __wt_readlock
+ * __wt_readlock --
* Get a shared lock.
*/
int
@@ -210,7 +210,7 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
}
/*
- * __wt_try_writelock
+ * __wt_try_writelock --
* Try to get an exclusive lock, or fail immediately if unavailable.
*/
int
@@ -229,7 +229,7 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
}
/*
- * __wt_writelock
+ * __wt_writelock --
* Wait to get an exclusive lock.
*/
int
diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c
index dddf9a91485..f6b141c4da2 100644
--- a/src/os_posix/os_open.c
+++ b/src/os_posix/os_open.c
@@ -8,7 +8,7 @@
#include "wt_internal.h"
/*
- * __open_directory_sync:
+ * __open_directory_sync --
* Fsync the directory in which we created the file.
*/
static int
diff --git a/src/os_posix/os_strtouq.c b/src/os_posix/os_strtouq.c
index cb445261ce5..b532918e241 100644
--- a/src/os_posix/os_strtouq.c
+++ b/src/os_posix/os_strtouq.c
@@ -8,7 +8,8 @@
#include "wt_internal.h"
/*
- * Convert a string to an unsigned quad integer.
+ * __wt_strtouq --
+ * Convert a string to an unsigned quad integer.
*/
uint64_t
__wt_strtouq(const char *nptr, char **endptr, int base)
diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c
index 52e29ba7ed1..fe1ede05393 100644
--- a/src/schema/schema_create.c
+++ b/src/schema/schema_create.c
@@ -46,6 +46,10 @@ __wt_direct_io_size_check(WT_SESSION_IMPL *session,
return (0);
}
+/*
+ * __create_file --
+ * Create a new 'file:' object.
+ */
static int
__create_file(WT_SESSION_IMPL *session,
const char *uri, int exclusive, const char *config)
@@ -84,13 +88,15 @@ __create_file(WT_SESSION_IMPL *session,
WT_ERR(__wt_meta_track_fileop(session, NULL, uri));
/*
- * If creating an ordinary file, append the current version numbers to
- * the passed-in configuration and insert the resulting configuration
- * into the metadata.
+ * If creating an ordinary file, append the file ID and current version
+ * numbers to the passed-in configuration and insert the resulting
+ * configuration into the metadata.
*/
if (!is_metadata) {
WT_ERR(__wt_scr_alloc(session, 0, &val));
- WT_ERR(__wt_buf_fmt(session, val, "version=(major=%d,minor=%d)",
+ WT_ERR(__wt_buf_fmt(session, val,
+ "id=%" PRIu32 ",version=(major=%d,minor=%d)",
+ ++S2C(session)->next_file_id,
WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
for (p = filecfg; *p != NULL; ++p)
;
@@ -120,6 +126,10 @@ err: __wt_scr_free(&val);
return (ret);
}
+/*
+ * __wt_schema_colgroup_source --
+ * Get the URI of the data source for a column group.
+ */
int
__wt_schema_colgroup_source(WT_SESSION_IMPL *session,
WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf)
@@ -152,6 +162,10 @@ __wt_schema_colgroup_source(WT_SESSION_IMPL *session,
return (0);
}
+/*
+ * __create_colgroup --
+ * Create a column group.
+ */
static int
__create_colgroup(WT_SESSION_IMPL *session,
const char *name, int exclusive, const char *config)
@@ -257,6 +271,10 @@ err: __wt_free(session, cgconf);
return (ret);
}
+/*
+ * __wt_schema_index_source --
+ * Get the URI of the data source for an index.
+ */
int
__wt_schema_index_source(WT_SESSION_IMPL *session,
WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf)
@@ -285,6 +303,10 @@ __wt_schema_index_source(WT_SESSION_IMPL *session,
return (0);
}
+/*
+ * __create_index --
+ * Create an index.
+ */
static int
__create_index(WT_SESSION_IMPL *session,
const char *name, int exclusive, const char *config)
@@ -415,6 +437,10 @@ err: __wt_free(session, idxconf);
return (ret);
}
+/*
+ * __create_table --
+ * Create a table.
+ */
static int
__create_table(WT_SESSION_IMPL *session,
const char *name, int exclusive, const char *config)
@@ -488,6 +514,10 @@ err: if (table != NULL) {
return (ret);
}
+/*
+ * __create_data_source --
+ * Create a custom data source.
+ */
static int
__create_data_source(WT_SESSION_IMPL *session,
const char *uri, const char *config, WT_DATA_SOURCE *dsrc)
@@ -517,6 +547,10 @@ __create_data_source(WT_SESSION_IMPL *session,
return (dsrc->create(dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg));
}
+/*
+ * __wt_schema_create --
+ * Process a WT_SESSION::create operation for all supported types.
+ */
int
__wt_schema_create(
WT_SESSION_IMPL *session, const char *uri, const char *config)
diff --git a/src/schema/schema_drop.c b/src/schema/schema_drop.c
index 91a15f477d6..c4cb864a5de 100644
--- a/src/schema/schema_drop.c
+++ b/src/schema/schema_drop.c
@@ -157,6 +157,10 @@ err: if (force && ret == WT_NOTFOUND)
return (ret);
}
+/*
+ * __wt_schema_drop --
+ * Process a WT_SESSION::drop operation for all supported types.
+ */
int
__wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
{
diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c
index 372968ea221..75fb425f37d 100644
--- a/src/schema/schema_open.c
+++ b/src/schema/schema_open.c
@@ -114,7 +114,7 @@ err: __wt_scr_free(&buf);
}
/*
- * ___open_index --
+ * __open_index --
* Open an index.
*/
static int
@@ -208,8 +208,8 @@ err: __wt_scr_free(&buf);
}
/*
- * __wt_schema_open_indices --
- * Open the indices for a table.
+ * __wt_schema_open_index --
+ * Open one or more indices for a table.
*/
int
__wt_schema_open_index(WT_SESSION_IMPL *session,
diff --git a/src/schema/schema_plan.c b/src/schema/schema_plan.c
index acde7a11183..68c2eb0f29f 100644
--- a/src/schema/schema_plan.c
+++ b/src/schema/schema_plan.c
@@ -7,6 +7,10 @@
#include "wt_internal.h"
+/*
+ * __find_next_col --
+ * Find the next column to use for a plan.
+ */
static int
__find_next_col(WT_SESSION_IMPL *session, WT_TABLE *table,
WT_CONFIG_ITEM *colname, u_int *cgnump, u_int *colnump, char *coltype)
@@ -253,6 +257,10 @@ __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table,
return (0);
}
+/*
+ * __find_column_format --
+ * Find the format of the named column.
+ */
static int
__find_column_format(WT_SESSION_IMPL *session,
WT_TABLE *table, WT_CONFIG_ITEM *colname, int value_only, WT_PACK_VALUE *pv)
diff --git a/src/session/session_api.c b/src/session/session_api.c
index 04c32b794ba..133348f2a01 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -340,11 +340,13 @@ __session_log_printf(WT_SESSION *wt_session, const char *fmt, ...)
va_list ap;
session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL_NO_CONF(session, log_printf);
va_start(ap, fmt);
- ret =__wt_log_vprintf(session, fmt, ap);
+ ret = __wt_log_vprintf(session, fmt, ap);
va_end(ap);
+err: API_END(session);
return (ret);
}
diff --git a/src/session/session_compact.c b/src/session/session_compact.c
index 2337ee16050..502c35f6775 100644
--- a/src/session/session_compact.c
+++ b/src/session/session_compact.c
@@ -8,90 +8,94 @@
#include "wt_internal.h"
/*
-Compaction is the place where the underlying block manager becomes visible in
-the higher engine btree and API layers. As there is currently only one block
-manager, this code is written with it in mind: other block managers may need
-changes to support compaction, and a smart block manager might need far less
-support from the engine.
-
-First, the default block manager cannot entirely own compaction because it has
-no way to find a block after it moves other than a request from the btree layer
-with the new address. In other words, if internal page X points to leaf page Y,
-and page Y moves, the address of page Y has to be updated in page X. Generally,
-this is solved by building a translation layer in the block manager so internal
-pages don't require updates to relocate blocks: however, the translation table
-must be durable, has its own garbage collection issues and might be slower, all
-of which have their own problems.
-
-Second, the btree layer cannot entirely own compaction because page addresses
-are opaque, it cannot know where a page is in the file from the address cookie.
-
-For these reasons, compaction is a cooperative process between the btree layer
-and the block manager. The btree layer walks files, and asks the block manager
-if rewriting a particular block would reduce the file footprint: if writing the
-page will help, the page is marked dirty so it will eventually be written. As
-pages are written, the original page potentially becomes available for reuse
-and if enough pages at the end of the file are available for reuse, the file can
-be truncated, and compaction succeeds.
-
-However, writing a page is not by itself sufficient to make a page available
-for reuse. The original version of the page is still referenced by at least
-the most recent checkpoint in the file. To make a page available for reuse,
-we have to checkpoint the file so we can discard the checkpoint referencing
-the original version of the block; once no checkpoint references a block, it
-becomes available for reuse.
-
-Compaction is not necessarily possible in WiredTiger, even in a file with lots
-of available space. If a block at the end of the file is referenced by a named
-checkpoint, there is nothing we can do to compact the file, no matter how many
-times we rewrite the block, the named checkpoint can't be discarded and so the
-reference count on the original block will never go to zero. What's worse,
-because the block manager doesn't reference count blocks, it can't easily know
-this is the case, and so we'll waste a lot of effort trying to compact files
-that can't be compacted.
-
-Now, to the actual process. First, we checkpoint the high-level object (which
-is potentially composed of multiple files): there are potentially many dirty
-blocks in the cache, and we want to write them out and then discard previous
-checkpoints so we have as many blocks as possible on the file's "available for
-reuse" list when we start compaction.
-
-Then, we compact the high-level object.
-
-Compacting the object is done 10% at a time, that is, we try and move blocks
-from the last 10% of the file into the beginning of the file (the 10% is hard
-coded in the block manager). The reason for this is because we are walking
-the file in logical order, not block offset order, and we can fail to compact
-a file if we write the wrong blocks first.
-
-For example, imagine a file with 10 blocks in the first 10% of a file, 1,000
-blocks in the 3rd quartile of the file, and 10 blocks in the last 10% of the
-file. If we were to rewrite blocks from more than the last 10% of the file,
-and found the 1,000 blocks in the 3rd quartile of the file first, we'd copy
-10 of them without ever rewriting the blocks from the end of the file which
-would allow us to compact the file. So, we compact the last 10% of the file,
-and if that works, we compact the last 10% of the file again, and so on. Note
-the block manager uses a first-fit block selection algorithm during compaction
-to maximize block movement.
-
-After each 10% compaction, we checkpoint two more times (seriously, twice).
-The second and third checkpoints are because the block manager checkpoints in
-two steps: blocks made available for reuse during a checkpoint are put on a
-special checkpoint-available list and only moved to the real available list
-after the metadata has been updated with the new checkpoint's information.
-(Otherwise it is possible to allocate a rewritten block, crash before the
-metadata is updated, and see corruption.) For this reason, blocks allocated
-to write the checkpoint itself cannot be taken from the blocks made available
-by the checkpoint.
-
-To say it another way, the second checkpoint puts the blocks from the end of
-the file that were made available by compaction onto the checkpoint-available
-list, but then potentially writes the checkpoint itself at the end of the
-file, which would prevent any file truncation. When the metadata is updated
-for the second checkpoint, the blocks freed by compaction become available
-for the third checkpoint, so the third checkpoint's blocks are written towards
-the beginning of the file, and then the file can be truncated.
-*/
+ * Compaction is the place where the underlying block manager becomes visible
+ * in the higher engine btree and API layers. As there is currently only one
+ * block manager, this code is written with it in mind: other block managers
+ * may need changes to support compaction, and a smart block manager might need
+ * far less support from the engine.
+ *
+ * First, the default block manager cannot entirely own compaction because it
+ * has no way to find a block after it moves other than a request from the
+ * btree layer with the new address. In other words, if internal page X points
+ * to leaf page Y, and page Y moves, the address of page Y has to be updated in
+ * page X. Generally, this is solved by building a translation layer in the
+ * block manager so internal pages don't require updates to relocate blocks:
+ * however, the translation table must be durable, has its own garbage
+ * collection issues and might be slower, all of which have their own problems.
+ *
+ * Second, the btree layer cannot entirely own compaction because page
+ * addresses are opaque, it cannot know where a page is in the file from the
+ * address cookie.
+ *
+ * For these reasons, compaction is a cooperative process between the btree
+ * layer and the block manager. The btree layer walks files, and asks the
+ * block manager if rewriting a particular block would reduce the file
+ * footprint: if writing the page will help, the page is marked dirty so it
+ * will eventually be written. As pages are written, the original page
+ * potentially becomes available for reuse and if enough pages at the end of
+ * the file are available for reuse, the file can be truncated, and compaction
+ * succeeds.
+ *
+ * However, writing a page is not by itself sufficient to make a page available
+ * for reuse. The original version of the page is still referenced by at least
+ * the most recent checkpoint in the file. To make a page available for reuse,
+ * we have to checkpoint the file so we can discard the checkpoint referencing
+ * the original version of the block; once no checkpoint references a block, it
+ * becomes available for reuse.
+ *
+ * Compaction is not necessarily possible in WiredTiger, even in a file with
+ * lots of available space. If a block at the end of the file is referenced by
+ * a named checkpoint, there is nothing we can do to compact the file, no
+ * matter how many times we rewrite the block, the named checkpoint can't be
+ * discarded and so the reference count on the original block will never go to
+ * zero. What's worse, because the block manager doesn't reference count
+ * blocks, it can't easily know this is the case, and so we'll waste a lot of
+ * effort trying to compact files that can't be compacted.
+ *
+ * Now, to the actual process. First, we checkpoint the high-level object
+ * (which is potentially composed of multiple files): there are potentially
+ * many dirty blocks in the cache, and we want to write them out and then
+ * discard previous checkpoints so we have as many blocks as possible on the
+ * file's "available for reuse" list when we start compaction.
+ *
+ * Then, we compact the high-level object.
+ *
+ * Compacting the object is done 10% at a time, that is, we try and move blocks
+ * from the last 10% of the file into the beginning of the file (the 10% is
+ * hard coded in the block manager). The reason for this is because we are
+ * walking the file in logical order, not block offset order, and we can fail
+ * to compact a file if we write the wrong blocks first.
+ *
+ * For example, imagine a file with 10 blocks in the first 10% of a file, 1,000
+ * blocks in the 3rd quartile of the file, and 10 blocks in the last 10% of the
+ * file. If we were to rewrite blocks from more than the last 10% of the file,
+ * and found the 1,000 blocks in the 3rd quartile of the file first, we'd copy
+ * 10 of them without ever rewriting the blocks from the end of the file which
+ * would allow us to compact the file. So, we compact the last 10% of the
+ * file, and if that works, we compact the last 10% of the file again, and so
+ * on. Note the block manager uses a first-fit block selection algorithm
+ * during compaction to maximize block movement.
+ *
+ * After each 10% compaction, we checkpoint two more times (seriously, twice).
+ * The second and third checkpoints are because the block manager checkpoints
+ * in two steps: blocks made available for reuse during a checkpoint are put on
+ *
+ * a special checkpoint-available list and only moved to the real available
+ * list after the metadata has been updated with the new checkpoint's
+ * information. (Otherwise it is possible to allocate a rewritten block, crash
+ * before the metadata is updated, and see corruption.) For this reason,
+ * blocks allocated to write the checkpoint itself cannot be taken from the
+ * blocks made available by the checkpoint.
+ *
+ * To say it another way, the second checkpoint puts the blocks from the end of
+ * the file that were made available by compaction onto the
+ * checkpoint-available list, but then potentially writes the checkpoint itself
+ * at the end of the file, which would prevent any file truncation. When the
+ * metadata is updated for the second checkpoint, the blocks freed by
+ * compaction become available for the third checkpoint, so the third
+ * checkpoint's blocks are written towards the beginning of the file, and then
+ * the file can be truncated.
+ */
/*
* __session_compact_worker --
diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c
index d95a17c66db..d776144d880 100644
--- a/src/session/session_dhandle.c
+++ b/src/session/session_dhandle.c
@@ -211,7 +211,8 @@ retry: WT_RET(__wt_meta_checkpoint_last_name(
}
/*
- * Discard any session dhandles that are not open.
+ * __session_dhandle_sweep --
+ * Discard any session dhandles that are not open.
*/
static int
__session_dhandle_sweep(WT_SESSION_IMPL *session)
@@ -235,9 +236,10 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session)
}
/*
- * Wrapper function to first sweep the session and then get the btree.
- * Sweeping is only called when a session notices it has dead dhandles
- * on its session dhandle list. Must be called with schema lock.
+ * __session_open_btree --
+ * Wrapper function to first sweep the session and then get the btree.
+ * Sweeping is only called when a session notices it has dead dhandles on
+ * its session dhandle list. Must be called with schema lock.
*/
static int
__session_open_btree(WT_SESSION_IMPL *session,
diff --git a/src/session/session_salvage.c b/src/session/session_salvage.c
index 3a00ec6562d..09b8a7c5bbe 100644
--- a/src/session/session_salvage.c
+++ b/src/session/session_salvage.c
@@ -50,8 +50,8 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
if (ckptbase[0].raw.data == NULL)
WT_ERR(__wt_meta_checkpoint_clear(session, dhandle->name));
else
- WT_ERR(
- __wt_meta_ckptlist_set(session, dhandle->name, ckptbase));
+ WT_ERR(__wt_meta_ckptlist_set(
+ session, dhandle->name, ckptbase, NULL));
err: __wt_meta_ckptlist_free(session, ckptbase);
return (ret);
diff --git a/src/support/global.c b/src/support/global.c
index 807675bddf2..b365b9473eb 100644
--- a/src/support/global.c
+++ b/src/support/global.c
@@ -10,6 +10,10 @@
WT_PROCESS __wt_process; /* Per-process structure */
static int __wt_pthread_once_failed; /* If initialization failed */
+/*
+ * __system_is_little_endian --
+ * Check if the system is little endian.
+ */
static int
__system_is_little_endian(void)
{
@@ -28,6 +32,10 @@ __system_is_little_endian(void)
return (EINVAL);
}
+/*
+ * __wt_pthread_once --
+ * Global initialization, run once.
+ */
static void
__wt_pthread_once(void)
{
diff --git a/src/support/hash_fnv.c b/src/support/hash_fnv.c
index 111556b62a6..099dba1ec73 100644
--- a/src/support/hash_fnv.c
+++ b/src/support/hash_fnv.c
@@ -109,7 +109,8 @@
#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
/*
- * fnv_64a_buf - perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer
+ * fnv_64a_buf --
+ * Perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer
*
* input:
* buf - start of buffer to hash
diff --git a/src/support/hex.c b/src/support/hex.c
index 48463e86c51..bb736932e9a 100644
--- a/src/support/hex.c
+++ b/src/support/hex.c
@@ -155,6 +155,10 @@ hex2byte(const u_char *from, u_char *to)
return (0);
}
+/*
+ * __hex_fmterr --
+ * Hex format error message.
+ */
static int
__hex_fmterr(WT_SESSION_IMPL *session)
{
diff --git a/src/support/mutex.c b/src/support/mutex.c
index b684fcf0299..8ed8f50322f 100644
--- a/src/support/mutex.c
+++ b/src/support/mutex.c
@@ -66,7 +66,7 @@ __wt_spin_lock_register(WT_SESSION_IMPL *session,
}
/*
- * __wt_statlog_spinlock_dump --
+ * __wt_statlog_dump_spinlock --
* Log the spin-lock statistics.
*/
int
diff --git a/src/support/pow.c b/src/support/pow.c
index 9ba22a53f8d..c01268245bb 100644
--- a/src/support/pow.c
+++ b/src/support/pow.c
@@ -28,6 +28,7 @@
#include "wt_internal.h"
#ifdef __WIREDTIGER_UNUSED__
+
/*
* __wt_nlpo2_round --
* Round up to the next-largest power-of-two for a 32-bit unsigned value.
@@ -63,6 +64,10 @@ __wt_nlpo2_round(uint32_t v)
return (v + 1);
}
+/*
+ * __wt_nlpo2 --
+ * Return the next largest power-of-two.
+ */
uint32_t
__wt_nlpo2(uint32_t v)
{
diff --git a/src/support/scratch.c b/src/support/scratch.c
index aa6a56e9d50..5f4cab931d6 100644
--- a/src/support/scratch.c
+++ b/src/support/scratch.c
@@ -76,6 +76,22 @@ __wt_buf_grow(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
}
/*
+ * __wt_buf_extend --
+ * Extend a buffer that's currently in-use. The difference from
+ * __wt_buf_grow is that extend is expected to be called repeatedly for
+ * the same buffer, and so grows the buffer exponentially to avoid
+ * repeated costly calls to realloc.
+ */
+int
+__wt_buf_extend(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ if (size <= buf->memsize)
+ return (0);
+
+ return (__wt_buf_grow(session, buf, WT_MAX(size, 2 * buf->memsize)));
+}
+
+/*
* __wt_buf_init --
* Initialize a buffer at a specific size.
*/
@@ -259,7 +275,7 @@ __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
}
/*
- * __wt_scr_alloc --
+ * __wt_scr_alloc_func --
* Scratch buffer allocation function.
*/
int
diff --git a/src/support/stat.c b/src/support/stat.c
index a48597ae869..4d90409db08 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -114,8 +114,7 @@ __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats)
"reconciliation failed because an update could not be included";
stats->rec_split_internal.desc = "reconciliation internal pages split";
stats->rec_split_leaf.desc = "reconciliation leaf pages split";
- stats->rec_split_max.desc =
- "reconciliation maximum number of splits created for a page";
+ stats->rec_split_max.desc = "reconciliation maximum splits for a page";
stats->session_compact.desc = "object compaction";
stats->session_cursor_open.desc = "open cursor count";
stats->txn_update_conflict.desc = "update conflicts";
@@ -364,8 +363,10 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
stats->dh_conn_handles.desc = "dhandle: connection dhandles swept";
stats->dh_session_handles.desc = "dhandle: session dhandles swept";
stats->dh_sweep_evict.desc = "dhandle: sweeps conflicting with evict";
- stats->dh_sweeps.desc = "dhandle: number of sweep attempts";
+ stats->dh_sweeps.desc = "dhandle: sweep attempts";
stats->file_open.desc = "files currently open";
+ stats->log_buffer_grow.desc = "log: log buffer size increases";
+ stats->log_buffer_size.desc = "log: total log buffer size";
stats->log_bytes_user.desc = "log: user provided log bytes written";
stats->log_bytes_written.desc = "log: log bytes written";
stats->log_max_filesize.desc = "log: maximum log file size";
@@ -378,7 +379,15 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
stats->log_slot_consolidated.desc = "log: logging bytes consolidated";
stats->log_slot_joins.desc = "log: consolidated slot joins";
stats->log_slot_races.desc = "log: consolidated slot join races";
+ stats->log_slot_ready_wait_timeout.desc =
+ "log: log slot ready wait timeouts";
+ stats->log_slot_release_wait_timeout.desc =
+ "log: log slot release wait timeouts";
+ stats->log_slot_switch_fails.desc =
+ "log: slots selected for switching that were unavailable";
stats->log_slot_toobig.desc = "log: record size exceeded maximum";
+ stats->log_slot_toosmall.desc =
+ "log: failed to find a slot large enough for record";
stats->log_slot_transitions.desc =
"log: consolidated slot join transitions";
stats->log_sync.desc = "log: log sync operations";
@@ -453,6 +462,7 @@ __wt_stat_refresh_connection_stats(void *stats_arg)
stats->dh_session_handles.v = 0;
stats->dh_sweep_evict.v = 0;
stats->dh_sweeps.v = 0;
+ stats->log_buffer_grow.v = 0;
stats->log_bytes_user.v = 0;
stats->log_bytes_written.v = 0;
stats->log_reads.v = 0;
@@ -463,7 +473,11 @@ __wt_stat_refresh_connection_stats(void *stats_arg)
stats->log_slot_consolidated.v = 0;
stats->log_slot_joins.v = 0;
stats->log_slot_races.v = 0;
+ stats->log_slot_ready_wait_timeout.v = 0;
+ stats->log_slot_release_wait_timeout.v = 0;
+ stats->log_slot_switch_fails.v = 0;
stats->log_slot_toobig.v = 0;
+ stats->log_slot_toosmall.v = 0;
stats->log_slot_transitions.v = 0;
stats->log_sync.v = 0;
stats->log_writes.v = 0;
diff --git a/src/txn/txn.c b/src/txn/txn.c
index 0faab9fff82..f7a3f2004c4 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -296,7 +296,7 @@ __wt_txn_release(WT_SESSION_IMPL *session)
WT_TXN_STATE *txn_state;
txn = &session->txn;
- txn->mod_count = txn->modref_count = 0;
+ txn->mod_count = 0;
txn->notify = NULL;
txn_global = &S2C(session)->txn_global;
@@ -327,7 +327,10 @@ __wt_txn_release(WT_SESSION_IMPL *session)
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
+ WT_DECL_RET;
WT_TXN *txn;
+ WT_TXN_OP *op;
+ u_int i;
WT_UNUSED(cfg);
@@ -339,8 +342,29 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
/* Commit notification. */
if (txn->notify != NULL)
- WT_RET(txn->notify->notify(txn->notify, (WT_SESSION *)session,
- txn->id, 1));
+ WT_TRET(txn->notify->notify(txn->notify,
+ (WT_SESSION *)session, txn->id, 1));
+
+ /* If we are logging, write a commit log record. */
+ if (ret == 0 &&
+ txn->mod_count > 0 && S2C(session)->logging &&
+ !F_ISSET(session, WT_SESSION_LOGGING_DISABLED))
+ ret = __wt_txn_log_commit(session, cfg);
+
+ /*
+ * If anything went wrong, roll back.
+ *
+ * !!!
+ * Nothing can fail after this point.
+ */
+ if (ret != 0) {
+ WT_TRET(__wt_txn_rollback(session, cfg));
+ return (ret);
+ }
+
+ /* Free memory associated with updates. */
+ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
+ __wt_txn_op_free(session, op);
/*
* Auto-commit transactions need a new transaction snapshot so that the
@@ -364,9 +388,8 @@ int
__wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_DECL_RET;
- WT_REF **rp;
WT_TXN *txn;
- uint64_t **m;
+ WT_TXN_OP *op;
u_int i;
WT_UNUSED(cfg);
@@ -381,12 +404,29 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
txn->id, 0));
/* Rollback updates. */
- for (i = 0, m = txn->mod; i < txn->mod_count; i++, m++)
- **m = WT_TXN_ABORTED;
+ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
+ switch (op->type) {
+ case TXN_OP_BASIC:
+ case TXN_OP_INMEM:
+ op->u.op.upd->txnid = WT_TXN_ABORTED;
+ break;
+ case TXN_OP_REF:
+ __wt_tree_walk_delete_rollback(op->u.ref);
+ break;
+ case TXN_OP_TRUNCATE_COL:
+ case TXN_OP_TRUNCATE_ROW:
+ /*
+ * Nothing to do: these operations are only logged for
+ * recovery. The in-memory changes will be rolled back
+ * with a combination of TXN_OP_REF and TXN_OP_INMEM
+ * operations.
+ */
+ break;
+ }
- /* Rollback fast deletes. */
- for (i = 0, rp = txn->modref; i < txn->modref_count; i++, rp++)
- __wt_tree_walk_delete_rollback(*rp);
+ /* Free any memory allocated for the operation. */
+ __wt_txn_op_free(session, op);
+ }
__wt_txn_release(session);
return (ret);
@@ -412,7 +452,6 @@ __wt_txn_init(WT_SESSION_IMPL *session)
* for eviction.
*/
txn->mod = NULL;
- txn->modref = NULL;
txn->isolation = session->isolation;
return (0);
@@ -429,7 +468,6 @@ __wt_txn_destroy(WT_SESSION_IMPL *session)
txn = &session->txn;
__wt_free(session, txn->mod);
- __wt_free(session, txn->modref);
__wt_free(session, txn->snapshot);
}
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 368caa978f1..6ad8a6aeb2b 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -65,7 +65,7 @@ err: if (cursor != NULL)
*/
static int
__checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[],
- int (*op)(WT_SESSION_IMPL *, const char *[]))
+ int (*op)(WT_SESSION_IMPL *, const char *[]), int *fullp)
{
WT_CONFIG targetconf;
WT_CONFIG_ITEM cval, k, v;
@@ -132,6 +132,8 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[],
}
err: __wt_scr_free(&tmp);
+ if (ret == 0 && fullp != NULL)
+ *fullp = !target_list;
return (ret);
}
@@ -186,7 +188,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN *txn;
WT_TXN_ISOLATION saved_isolation;
void *saved_meta_next;
- int tracking;
+ int full, tracking;
conn = S2C(session);
saved_isolation = session->isolation;
@@ -214,16 +216,27 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
/* Flush dirty leaf pages before we start the checkpoint. */
session->isolation = txn->isolation = TXN_ISO_READ_COMMITTED;
- WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_write_leaves));
+ WT_ERR(__checkpoint_apply(
+ session, cfg, __wt_checkpoint_write_leaves, &full));
WT_ERR(__wt_meta_track_on(session));
tracking = 1;
+ /* Tell logging that we are about to start a database checkpoint. */
+ if (S2C(session)->logging && full)
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, full, WT_TXN_LOG_CKPT_PREPARE, NULL));
+
/* Start a snapshot transaction for the checkpoint. */
wt_session = &session->iface;
WT_ERR(wt_session->begin_transaction(wt_session, "isolation=snapshot"));
- WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint));
+ /* Tell logging that we have started a database checkpoint. */
+ if (S2C(session)->logging && full)
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, full, WT_TXN_LOG_CKPT_START, NULL));
+
+ WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint, NULL));
/* Release the snapshot transaction, before syncing the file(s). */
__wt_txn_release(session);
@@ -233,7 +246,8 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
* lazy checkpoints, but we don't support them yet).
*/
if (F_ISSET(conn, WT_CONN_CKPT_SYNC))
- WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync));
+ WT_ERR(__checkpoint_apply(
+ session, cfg, __wt_checkpoint_sync, NULL));
/* Checkpoint the metadata file. */
SLIST_FOREACH(dhandle, &conn->dhlh, l) {
@@ -280,6 +294,13 @@ err: /*
__wt_txn_release(session);
else
__wt_txn_release_snapshot(session);
+
+ /* Tell logging that we have finished a database checkpoint. */
+ if (S2C(session)->logging && full)
+ WT_TRET(__wt_txn_checkpoint_log(session, full,
+ (ret == 0) ? WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_FAIL,
+ NULL));
+
__wt_spin_unlock(session, &conn->checkpoint_lock);
__wt_scr_free(&tmp);
@@ -415,6 +436,7 @@ __checkpoint_worker(
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
+ WT_LSN ckptlsn;
const char *name;
int deleted, force, hot_backup_locked, track_ckpt;
char *name_alloc;
@@ -425,6 +447,9 @@ __checkpoint_worker(
bm = btree->bm;
ckpt = ckptbase = NULL;
+ INIT_LSN(&ckptlsn);
+ dhandle = session->dhandle;
+ name_alloc = NULL;
hot_backup_locked = 0;
name_alloc = NULL;
track_ckpt = 1;
@@ -695,6 +720,11 @@ __checkpoint_worker(
*/
WT_ERR(__wt_bt_cache_force_write(session));
+ /* Tell logging that a file checkpoint is starting. */
+ if (S2C(session)->logging)
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, 0, WT_TXN_LOG_CKPT_START, &ckptlsn));
+
/*
* Clear the tree's modified flag; any changes before we clear the flag
* are guaranteed to be part of this checkpoint (unless reconciliation
@@ -723,8 +753,8 @@ __checkpoint_worker(
ckpt->write_gen = btree->write_gen;
fake: /* Update the object's metadata. */
- ret = __wt_meta_ckptlist_set(session, dhandle->name, ckptbase);
- WT_ERR(ret);
+ WT_ERR(__wt_meta_ckptlist_set(
+ session, dhandle->name, ckptbase, &ckptlsn));
/*
* If we wrote a checkpoint (rather than faking one), pages may be
@@ -740,6 +770,11 @@ fake: /* Update the object's metadata. */
WT_ERR(bm->checkpoint_resolve(bm, session));
}
+ /* Tell logging that the checkpoint is complete. */
+ if (S2C(session)->logging)
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, 0, WT_TXN_LOG_CKPT_STOP, NULL));
+
err: if (hot_backup_locked)
__wt_spin_unlock(session, &conn->hot_backup_lock);
skip: __wt_meta_ckptlist_free(session, ckptbase);
diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c
new file mode 100644
index 00000000000..ca8812aae09
--- /dev/null
+++ b/src/txn/txn_log.c
@@ -0,0 +1,440 @@
+/*-
+ * Copyright (c) 2008-2013 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __txn_op_log --
+ * Log an operation for the current transaction.
+ */
+static int
+__txn_op_log(WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_TXN_OP *op)
+{
+ WT_ITEM value;
+ uint64_t recno;
+
+ value.data = WT_UPDATE_DATA(op->u.op.upd);
+ value.size = op->u.op.upd->size;
+
+ /*
+ * Log the operation. It must be one of the following:
+ * 1) column store remove;
+ * 2) column store insert/update;
+ * 3) row store remove; or
+ * 4) row store insert/update.
+ */
+ if (op->u.op.key.data == NULL) {
+ WT_ASSERT(session, op->u.op.ins != NULL);
+ recno = op->u.op.ins->u.recno;
+
+ if (WT_UPDATE_DELETED_ISSET(op->u.op.upd))
+ WT_RET(__wt_logop_col_remove_pack(session, logrec,
+ op->fileid, recno));
+ else
+ WT_RET(__wt_logop_col_put_pack(session, logrec,
+ op->fileid, recno, &value));
+ } else {
+ if (WT_UPDATE_DELETED_ISSET(op->u.op.upd))
+ WT_RET(__wt_logop_row_remove_pack(session, logrec,
+ op->fileid, &op->u.op.key));
+ else
+ WT_RET(__wt_logop_row_put_pack(session, logrec,
+ op->fileid, &op->u.op.key, &value));
+ }
+
+ return (0);
+}
+
+/*
+ * __txn_commit_printlog --
+ * Print a commit log record.
+ */
+static int
+__txn_commit_printlog(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ /* The logging subsystem zero-pads records. */
+ while (*pp < end && **pp)
+ WT_RET(__wt_txn_op_printlog(session, pp, end, out));
+ return (0);
+}
+
+/*
+ * __wt_txn_op_free --
+ * Free memory associated with a transactional operation.
+ */
+void
+__wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op)
+{
+ if (op->type == TXN_OP_TRUNCATE_ROW) {
+ __wt_buf_free(session, &op->u.truncate_row.start);
+ __wt_buf_free(session, &op->u.truncate_row.stop);
+ }
+}
+
+/*
+ * __wt_txn_log_commit --
+ * Write the operations of a transaction to the log at commit time.
+ */
+int
+__wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_DECL_ITEM(logrec);
+ WT_TXN *txn;
+ WT_TXN_OP *op;
+ const char *fmt = WT_UNCHECKED_STRING(Iq);
+ size_t header_size;
+ uint32_t rectype = WT_LOGREC_COMMIT;
+ u_int i;
+
+ WT_UNUSED(cfg);
+ txn = &session->txn;
+
+ WT_RET(__wt_struct_size(session, &header_size, fmt, rectype, txn->id));
+ WT_RET(__wt_logrec_alloc(session, header_size, &logrec));
+
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, header_size,
+ fmt, rectype, txn->id));
+ logrec->size += (uint32_t)header_size;
+
+ /* Write updates to the log. */
+ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
+ switch (op->type) {
+ case TXN_OP_BASIC:
+ WT_ERR(__txn_op_log(session, logrec, op));
+ break;
+ case TXN_OP_INMEM:
+ case TXN_OP_REF:
+ /* Nothing to log, we're done. */
+ break;
+ case TXN_OP_TRUNCATE_COL:
+ WT_ERR(__wt_logop_col_truncate_pack(session, logrec,
+ op->fileid,
+ op->u.truncate_col.start, op->u.truncate_col.stop));
+ break;
+ case TXN_OP_TRUNCATE_ROW:
+ WT_ERR(__wt_logop_row_truncate_pack(session, logrec,
+ op->fileid,
+ &op->u.truncate_row.start, &op->u.truncate_row.stop,
+ (uint32_t)op->u.truncate_row.mode));
+ break;
+ }
+
+ WT_ERR(__wt_log_write(session,
+ logrec, NULL, S2C(session)->txn_logsync));
+
+err: __wt_logrec_free(session, &logrec);
+ return (ret);
+}
+
+/*
+ * __txn_log_file_sync --
+ * Write a log record for a file sync.
+ */
+static int
+__txn_log_file_sync(WT_SESSION_IMPL *session, uint32_t flags, WT_LSN *lsnp)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ WT_DECL_ITEM(logrec);
+ const char *fmt = WT_UNCHECKED_STRING(ISI);
+ size_t header_size;
+ uint32_t rectype = WT_LOGREC_FILE_SYNC;
+ int start;
+
+ dhandle = session->dhandle;
+ start = LF_ISSET(WT_TXN_LOG_CKPT_START);
+
+ WT_RET(__wt_struct_size(
+ session, &header_size, fmt, rectype, dhandle->name, start));
+ WT_RET(__wt_logrec_alloc(session, header_size, &logrec));
+
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, header_size,
+ fmt, rectype, dhandle->name, start));
+ logrec->size += (uint32_t)header_size;
+
+ WT_ERR(__wt_log_write(session, logrec, lsnp, 0));
+err: __wt_logrec_free(session, &logrec);
+ return (ret);
+}
+
+/*
+ * __wt_txn_checkpoint_logread --
+ * Read a log record for a checkpoint operation.
+ */
+int
+__wt_txn_checkpoint_logread(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ WT_LSN *ckpt_lsn)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IQIU);
+ WT_ITEM ckpt_snapshot;
+ u_int ckpt_nsnapshot;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &ckpt_lsn->file, &ckpt_lsn->offset,
+ &ckpt_nsnapshot, &ckpt_snapshot));
+ WT_UNUSED(ckpt_nsnapshot);
+ WT_UNUSED(ckpt_snapshot);
+ *pp = end;
+ return (0);
+}
+
+/*
+ * __wt_txn_checkpoint_log --
+ * Write a log record for a checkpoint operation.
+ */
+int
+__wt_txn_checkpoint_log(
+ WT_SESSION_IMPL *session, int full, uint32_t flags, WT_LSN *lsnp)
+{
+ WT_DECL_RET;
+ WT_DECL_ITEM(logrec);
+ WT_LSN *ckpt_lsn;
+ WT_TXN *txn;
+ const char *fmt = WT_UNCHECKED_STRING(IIQIU);
+ uint8_t *end, *p;
+ size_t recsize;
+ uint32_t i, rectype = WT_LOGREC_CHECKPOINT;
+
+ txn = &session->txn;
+ ckpt_lsn = &txn->ckpt_lsn;
+
+ /*
+ * If this is a file sync, log it unless there is a full checkpoint in
+ * progress.
+ */
+ if (!full) {
+ if (txn->full_ckpt) {
+ if (lsnp != NULL)
+ *lsnp = *ckpt_lsn;
+ return (0);
+ } else
+ return (__txn_log_file_sync(session, flags, lsnp));
+ }
+
+ switch (flags) {
+ case WT_TXN_LOG_CKPT_PREPARE:
+ txn->full_ckpt = 1;
+ *ckpt_lsn = S2C(session)->log->alloc_lsn;
+ break;
+
+ case WT_TXN_LOG_CKPT_START:
+ /* Take a copy of the transaction snapshot. */
+ txn->ckpt_nsnapshot = txn->snapshot_count;
+ recsize = txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
+ WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
+ p = txn->ckpt_snapshot->mem;
+ end = p + recsize;
+ for (i = 0; i < txn->snapshot_count; i++)
+ WT_ERR(__wt_vpack_uint(
+ &p, WT_PTRDIFF(end, p), txn->snapshot[i]));
+ break;
+
+ case WT_TXN_LOG_CKPT_STOP:
+ /*
+ * During a clean connection close, we get here without the
+ * prepare or start steps. In that case, log the current LSN
+ * as the checkpoint LSN.
+ */
+ if (!txn->full_ckpt) {
+ txn->ckpt_nsnapshot = 0;
+ *ckpt_lsn = S2C(session)->log->alloc_lsn;
+ }
+
+ /* Write the checkpoint log record. */
+ WT_ERR(__wt_struct_size(session, &recsize, fmt,
+ rectype, ckpt_lsn->file, ckpt_lsn->offset,
+ txn->ckpt_nsnapshot, &txn->ckpt_snapshot));
+ WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
+
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, recsize, fmt,
+ rectype, ckpt_lsn->file, ckpt_lsn->offset,
+ txn->ckpt_nsnapshot, &txn->ckpt_snapshot));
+ logrec->size += (uint32_t)recsize;
+ WT_ERR(__wt_log_write(session, logrec, lsnp, 0));
+
+ /*
+ * If this full checkpoint completed successfully and there is
+ * no hot backup in progress, tell the logging subsystem the
+ * checkpoint LSN so that it can archive.
+ */
+ if (!S2C(session)->hot_backup)
+ WT_ERR(__wt_log_ckpt(session, ckpt_lsn));
+
+ /* FALLTHROUGH */
+ case WT_TXN_LOG_CKPT_FAIL:
+ /* Cleanup any allocated resources */
+ INIT_LSN(ckpt_lsn);
+ txn->ckpt_nsnapshot = 0;
+ __wt_scr_free(&txn->ckpt_snapshot);
+ txn->full_ckpt = 0;
+ break;
+ }
+
+err: __wt_logrec_free(session, &logrec);
+ return (ret);
+}
+
+/*
+ * __wt_txn_truncate_log --
+ * Begin truncating a range of a file.
+ */
+int
+__wt_txn_truncate_log(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
+{
+ WT_BTREE *btree;
+ WT_ITEM *item;
+ WT_TXN_OP *op;
+
+ btree = S2BT(session);
+
+ WT_RET(__txn_next_op(session, &op));
+ op->fileid = btree->id;
+
+ if (btree->type == BTREE_ROW) {
+ op->type = TXN_OP_TRUNCATE_ROW;
+ op->u.truncate_row.mode = TXN_TRUNC_ALL;
+ WT_CLEAR(op->u.truncate_row.start);
+ WT_CLEAR(op->u.truncate_row.stop);
+ if (start != NULL) {
+ op->u.truncate_row.mode = TXN_TRUNC_START;
+ item = &op->u.truncate_row.start;
+ WT_RET(__wt_cursor_get_raw_key(&start->iface, item));
+ WT_RET(__wt_buf_set(
+ session, item, item->data, item->size));
+ }
+ if (stop != NULL) {
+ op->u.truncate_row.mode =
+ (op->u.truncate_row.mode == TXN_TRUNC_ALL) ?
+ TXN_TRUNC_STOP : TXN_TRUNC_BOTH;
+ item = &op->u.truncate_row.stop;
+ WT_RET(__wt_cursor_get_raw_key(&stop->iface, item));
+ WT_RET(__wt_buf_set(
+ session, item, item->data, item->size));
+ }
+ } else {
+ op->type = TXN_OP_TRUNCATE_COL;
+ op->u.truncate_col.start =
+ (start == NULL) ? 0 : start->recno;
+ op->u.truncate_col.stop =
+ (stop == NULL) ? 0 : stop->recno;
+ }
+
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOGGING_INMEM));
+ F_SET(session, WT_SESSION_LOGGING_INMEM);
+ return (0);
+}
+
+/*
+ * __wt_txn_truncate_end --
+ * Finish truncating a range of a file.
+ */
+int
+__wt_txn_truncate_end(WT_SESSION_IMPL *session)
+{
+ F_CLR(session, WT_SESSION_LOGGING_INMEM);
+ return (0);
+}
+
+/*
+ * __txn_printlog --
+ * Print a log record in a human-readable format.
+ */
+static int
+__txn_printlog(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie)
+{
+ FILE *out;
+ WT_LSN ckpt_lsn;
+ const uint8_t *end, *p;
+ const char *msg;
+ uint64_t txnid;
+ uint32_t fileid, rectype;
+ int32_t start;
+
+ out = cookie;
+
+ p = (const uint8_t *)logrec->data + offsetof(WT_LOG_RECORD, record);
+ end = (const uint8_t *)logrec->data + logrec->size;
+
+ /* First, peek at the log record type. */
+ WT_RET(__wt_logrec_read(session, &p, end, &rectype));
+
+ if (fprintf(out, " { \"lsn\" : [%" PRIu32 ",%" PRId64 "],\n",
+ lsnp->file, lsnp->offset) < 0)
+ return (errno);
+
+ switch (rectype) {
+ case WT_LOGREC_CHECKPOINT:
+ WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
+ WT_UNCHECKED_STRING(IQ), &ckpt_lsn.file, &ckpt_lsn.offset));
+ if (fprintf(out, " \"type\" : \"checkpoint\"\n") < 0 ||
+ fprintf(
+ out, " \"ckpt_lsn\" : [%" PRIu32 ",%" PRId64 "],\n",
+ ckpt_lsn.file, ckpt_lsn.offset) < 0)
+ return (errno);
+ break;
+
+ case WT_LOGREC_COMMIT:
+ WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid));
+ if (fprintf(out, " \"type\" : \"commit\"\n") < 0 ||
+ fprintf(out, " \"txnid\" : %" PRIu64 ",\n", txnid) < 0)
+ return (errno);
+ WT_RET(__txn_commit_printlog(session, &p, end, out));
+ break;
+
+ case WT_LOGREC_FILE_SYNC:
+ WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
+ WT_UNCHECKED_STRING(Ii), &fileid, &start));
+ if (fprintf(out, " \"type\" : \"file_sync\"\n") < 0 ||
+ fprintf(out, " \"fileid\" : %" PRIu32 "\n",
+ fileid) < 0 ||
+ fprintf(out, " \"start\" : %" PRId32 "\n", start) < 0)
+ return (errno);
+ break;
+
+ case WT_LOGREC_MESSAGE:
+ WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
+ WT_UNCHECKED_STRING(S), &msg));
+ if (fprintf(out, " \"type\" : \"message\"\n") < 0 ||
+ fprintf(out, " \"message\" : \"%s\"\n", msg) < 0)
+ return (errno);
+ break;
+ }
+
+ if (fprintf(out, " },\n") < 0)
+ return (errno);
+
+ return (0);
+}
+
+/*
+ * __wt_txn_printlog --
+ * Print the log in a human-readable format.
+ */
+int
+__wt_txn_printlog(WT_SESSION *wt_session, FILE *out)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ if (fprintf(out, "[\n") < 0)
+ return (errno);
+ WT_RET(__wt_log_scan(
+ session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, out));
+ if (fprintf(out, "]\n") < 0)
+ return (errno);
+
+ return (0);
+}
diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c
new file mode 100644
index 00000000000..fb1d05c2565
--- /dev/null
+++ b/src/txn/txn_recover.c
@@ -0,0 +1,451 @@
+/*-
+ * Copyright (c) 2008-2013 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/* State maintained during recovery. */
+typedef struct {
+ WT_SESSION_IMPL *session;
+
+ /* Files from the metadata, indexed by file ID. */
+ struct {
+ const char *uri; /* File URI. */
+ WT_CURSOR *c; /* Cursor used for recovery. */
+ WT_LSN ckpt_lsn; /* File's checkpoint LSN. */
+ } *files;
+ size_t file_alloc; /* Allocated size of files array. */
+ u_int max_fileid; /* Maximum file ID seen. */
+ u_int nfiles; /* Number of files in the metadata. */
+
+ WT_LSN ckpt_lsn; /* Start LSN for main recovery loop. */
+
+ int modified; /* Did recovery make any changes? */
+ int metadata_only; /*
+ * Set during the first recovery pass,
+ * when only the metadata is recovered.
+ */
+} WT_RECOVERY;
+
+/*
+ * __recovery_cursor --
+ * Get a cursor for a recovery operation.
+ */
+static int
+__recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r,
+ WT_LSN *lsnp, u_int id, int duplicate, WT_CURSOR **cp)
+{
+ WT_CURSOR *c;
+ const char *cfg[] = { WT_CONFIG_BASE(session, session_open_cursor),
+ "overwrite", NULL };
+
+ c = NULL;
+
+ /*
+ * Only apply operations in the correct metadata phase, and if the LSN
+ * is more recent than the last checkpoint. If there is no entry for a
+ * file, assume it was dropped.
+ */
+ if (r->metadata_only != (id == 0) ||
+ LOG_CMP(lsnp, &r->files[id].ckpt_lsn) < 0)
+ ;
+ else if (id > r->max_fileid)
+ r->max_fileid = id;
+ else if (id >= r->nfiles || r->files[id].uri == NULL)
+ WT_VERBOSE_RET(session, recovery,
+ "No file found with ID %u (max %u)", id, r->nfiles);
+ else if ((c = r->files[id].c) == NULL) {
+ WT_RET(__wt_open_cursor(
+ session, r->files[id].uri, NULL, cfg, &c));
+ r->files[id].c = c;
+ }
+
+ if (duplicate && c != NULL)
+ WT_RET(__wt_open_cursor(
+ session, r->files[id].uri, NULL, cfg, &c));
+
+ *cp = c;
+ return (0);
+}
+
+/*
+ * Helper to a cursor if this operation is to be applied during recovery.
+ */
+#define GET_RECOVERY_CURSOR(s, r, lsnp, fileid, cp) \
+ WT_ERR(__recovery_cursor((s), (r), (lsnp), (fileid), 0, (cp))); \
+ WT_VERBOSE_ERR(session, recovery, \
+ "%s op %d to file %d at LSN %u/%" PRIuMAX, \
+ (cursor == NULL) ? "Skipping" : "Applying", \
+ optype, fileid, lsnp->file, (uintmax_t)lsnp->offset); \
+ if (cursor == NULL) \
+ break
+
+/*
+ * __txn_op_apply --
+ * Apply a transactional operation during recovery.
+ */
+static int
+__txn_op_apply(
+ WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
+{
+ WT_CURSOR *cursor, *start, *stop;
+ WT_DECL_RET;
+ WT_ITEM key, start_key, stop_key, value;
+ WT_SESSION_IMPL *session;
+ uint64_t recno, start_recno, stop_recno;
+ uint32_t fileid, mode, optype, opsize;
+
+ session = r->session;
+
+ /* Peek at the size and the type. */
+ WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize));
+ end = *pp + opsize;
+
+ switch (optype) {
+ case WT_LOGOP_COL_PUT:
+ WT_ERR(__wt_logop_col_put_unpack(session, pp, end,
+ &fileid, &recno, &value));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ cursor->set_key(cursor, recno);
+ __wt_cursor_set_raw_value(cursor, &value);
+ WT_ERR(cursor->insert(cursor));
+ break;
+
+ case WT_LOGOP_COL_REMOVE:
+ WT_ERR(__wt_logop_col_remove_unpack(session, pp, end,
+ &fileid, &recno));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ cursor->set_key(cursor, recno);
+ WT_ERR(cursor->remove(cursor));
+ break;
+
+ case WT_LOGOP_COL_TRUNCATE:
+ WT_ERR(__wt_logop_col_truncate_unpack(session, pp, end,
+ &fileid, &start_recno, &stop_recno));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+
+ /* Set up the cursors. */
+ if (start_recno == 0) {
+ start = NULL;
+ stop = cursor;
+ } else if (stop_recno == 0) {
+ start = cursor;
+ stop = NULL;
+ } else {
+ start = cursor;
+ WT_ERR(__recovery_cursor(
+ session, r, lsnp, fileid, 1, &stop));
+ }
+
+ /* Set the keys. */
+ if (start != NULL)
+ start->set_key(stop, start_recno);
+ if (stop != NULL)
+ stop->set_key(stop, stop_recno);
+
+ WT_TRET(session->iface.truncate(&session->iface, NULL,
+ start, stop, NULL));
+ /* If we opened a duplicate cursor, close it now. */
+ if (stop != NULL && stop != cursor)
+ WT_TRET(stop->close(stop));
+ WT_ERR(ret);
+ break;
+
+ case WT_LOGOP_ROW_PUT:
+ WT_ERR(__wt_logop_row_put_unpack(session, pp, end,
+ &fileid, &key, &value));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ __wt_cursor_set_raw_key(cursor, &key);
+ __wt_cursor_set_raw_value(cursor, &value);
+ WT_ERR(cursor->insert(cursor));
+ break;
+
+ case WT_LOGOP_ROW_REMOVE:
+ WT_ERR(__wt_logop_row_remove_unpack(session, pp, end,
+ &fileid, &key));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ __wt_cursor_set_raw_key(cursor, &key);
+ WT_ERR(cursor->remove(cursor));
+ break;
+
+ case WT_LOGOP_ROW_TRUNCATE:
+ WT_ERR(__wt_logop_row_truncate_unpack(session, pp, end,
+ &fileid, &start_key, &stop_key, &mode));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ /* Set up the cursors. */
+ start = stop = NULL;
+ switch (mode) {
+ case TXN_TRUNC_ALL:
+ /* Both cursors stay NULL. */
+ break;
+ case TXN_TRUNC_BOTH:
+ start = cursor;
+ WT_ERR(__recovery_cursor(
+ session, r, lsnp, fileid, 1, &stop));
+ break;
+ case TXN_TRUNC_START:
+ start = cursor;
+ break;
+ case TXN_TRUNC_STOP:
+ stop = cursor;
+ break;
+
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Set the keys. */
+ if (start != NULL)
+ __wt_cursor_set_raw_key(start, &start_key);
+ if (stop != NULL)
+ __wt_cursor_set_raw_key(stop, &stop_key);
+
+ WT_TRET(session->iface.truncate(&session->iface, NULL,
+ start, stop, NULL));
+ /* If we opened a duplicate cursor, close it now. */
+ if (stop != NULL && stop != cursor)
+ WT_TRET(stop->close(stop));
+ WT_ERR(ret);
+ break;
+
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ r->modified = 1;
+
+err: if (ret != 0)
+ __wt_err(session, ret,
+ "Operation failed during recovery");
+ return (ret);
+}
+
+/*
+ * __txn_commit_apply --
+ * Apply a commit record during recovery.
+ */
+static int
+__txn_commit_apply(
+ WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
+{
+ WT_UNUSED(lsnp);
+
+ /* The logging subsystem zero-pads records. */
+ while (*pp < end && **pp)
+ WT_RET(__txn_op_apply(r, lsnp, pp, end));
+
+ return (0);
+}
+
+/*
+ * __txn_log_recover --
+ * Roll the log forward to recover committed changes.
+ */
+static int
+__txn_log_recover(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie)
+{
+ WT_RECOVERY *r;
+ const uint8_t *end, *p;
+ uint64_t txnid;
+ uint32_t rectype;
+
+ r = cookie;
+ p = (const uint8_t *)logrec->data + offsetof(WT_LOG_RECORD, record);
+ end = (const uint8_t *)logrec->data + logrec->size;
+
+ /* First, peek at the log record type. */
+ WT_RET(__wt_logrec_read(session, &p, end, &rectype));
+
+ switch (rectype) {
+ case WT_LOGREC_CHECKPOINT:
+ if (r->metadata_only)
+ WT_RET(__wt_txn_checkpoint_logread(
+ session, &p, end, &r->ckpt_lsn));
+ break;
+
+ case WT_LOGREC_COMMIT:
+ WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid));
+ WT_UNUSED(txnid);
+ WT_RET(__txn_commit_apply(r, lsnp, &p, end));
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * __recovery_setup_file --
+ * Set up the recovery slot for a file.
+ */
+static int
+__recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_LSN lsn;
+ uint32_t fileid;
+
+ WT_RET(__wt_config_getones(r->session, config, "id", &cval));
+ fileid = (uint32_t)cval.val;
+
+ if (r->nfiles <= fileid) {
+ WT_RET(__wt_realloc_def(
+ r->session, &r->file_alloc, fileid + 1, &r->files));
+ r->nfiles = fileid + 1;
+ }
+
+ WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri));
+ WT_RET(
+ __wt_config_getones(r->session, config, "checkpoint_lsn", &cval));
+ /* If there is checkpoint logged for the file, apply everything. */
+ if (cval.type != WT_CONFIG_ITEM_STRUCT)
+ INIT_LSN(&lsn);
+ else if (sscanf(cval.str, "(%" PRIu32 ",%" PRIdMAX ")",
+ &lsn.file, (intmax_t*)&lsn.offset) != 2)
+ WT_RET_MSG(r->session, EINVAL,
+ "Failed to parse checkpoint LSN '%.*s'",
+ (int)cval.len, cval.str);
+ r->files[fileid].ckpt_lsn = lsn;
+
+ WT_VERBOSE_RET(r->session, recovery,
+ "Recovering %s with id %u @ (%" PRIu32 ", %" PRIu64 ")",
+ uri, fileid, lsn.file, lsn.offset);
+
+ return (0);
+
+}
+
+/*
+ * __recovery_free --
+ * Free the recovery state.
+ */
+static int
+__recovery_free(WT_RECOVERY *r)
+{
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ session = r->session;
+ for (i = 0; i < r->nfiles; i++) {
+ __wt_free(session, r->files[i].uri);
+ if ((c = r->files[i].c) != NULL)
+ WT_TRET(c->close(c));
+ }
+
+ __wt_free(session, r->files);
+ return (ret);
+}
+
+/*
+ * __recovery_file_scan --
+ * Scan the files referenced from the metadata and gather information
+ * about them for recovery.
+ */
+static int
+__recovery_file_scan(WT_RECOVERY *r)
+{
+ WT_DECL_RET;
+ WT_CURSOR *c;
+ const char *uri, *config;
+ int cmp;
+
+ /* Scan through all files in the metadata. */
+ c = r->files[0].c;
+ c->set_key(c, "file:");
+ if ((ret = c->search_near(c, &cmp)) != 0) {
+ /* Is the metadata empty? */
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+ if (cmp < 0)
+ WT_ERR_NOTFOUND_OK(c->next(c));
+ for (; ret == 0; ret = c->next(c)) {
+ WT_ERR(c->get_key(c, &uri));
+ if (!WT_PREFIX_MATCH(uri, "file:"))
+ break;
+ WT_ERR(c->get_value(c, &config));
+ WT_ERR(__recovery_setup_file(r, uri, config));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: r->max_fileid = r->nfiles;
+ return (ret);
+}
+
+/*
+ * __wt_txn_recover --
+ * Run recovery.
+ */
+int
+__wt_txn_recover(WT_SESSION_IMPL *default_session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_RECOVERY r;
+ WT_SESSION_IMPL *session;
+ const char *config;
+ int modified;
+
+ conn = S2C(default_session);
+ WT_CLEAR(r);
+ INIT_LSN(&r.ckpt_lsn);
+
+ /* We need a real session for recovery. */
+ WT_RET(__wt_open_session(conn, 0, NULL, NULL, &session));
+ F_SET(session, WT_SESSION_LOGGING_DISABLED);
+ r.session = session;
+
+ WT_ERR(__wt_metadata_search(session, WT_METADATA_URI, &config));
+ WT_ERR(__recovery_setup_file(&r, WT_METADATA_URI, config));
+ WT_ERR(__wt_metadata_cursor(session, NULL, &r.files[0].c));
+
+ /*
+ * First, do a full pass through the log to recover the metadata,
+ * and establish the last checkpoint LSN.
+ */
+ r.metadata_only = 1;
+ WT_ERR(__wt_log_scan(
+ session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r));
+
+ WT_ASSERT(session, LOG_CMP(&r.ckpt_lsn, &conn->log->first_lsn) >= 0);
+
+ WT_ERR(__recovery_file_scan(&r));
+
+ /*
+ * Now, recover all the files apart from the metadata.
+ * Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
+ */
+ r.metadata_only = 0;
+ WT_VERBOSE_ERR(session, recovery,
+ "Main recovery loop: starting at %u/%" PRIuMAX,
+ r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset);
+ if (IS_INIT_LSN(&r.ckpt_lsn))
+ WT_ERR(__wt_log_scan(session, NULL,
+ WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
+ __txn_log_recover, &r));
+ else
+ WT_ERR(__wt_log_scan(session, &r.ckpt_lsn,
+ WT_LOGSCAN_RECOVER,
+ __txn_log_recover, &r));
+
+ conn->next_file_id = r.max_fileid;
+
+err: modified = r.modified;
+ WT_TRET(__recovery_free(&r));
+ __wt_free(session, config);
+ WT_TRET(session->iface.close(&session->iface, NULL));
+
+ /*
+ * If recovery ran successfully and modified something, log a
+ * checkpoint.
+ */
+ if (ret == 0 && modified)
+ ret = __wt_txn_checkpoint_log(
+ default_session, 1, WT_TXN_LOG_CKPT_STOP, NULL);
+
+ return (ret);
+}
diff --git a/src/utilities/util_printlog.c b/src/utilities/util_printlog.c
index b62954a6b51..810dfac5fe2 100644
--- a/src/utilities/util_printlog.c
+++ b/src/utilities/util_printlog.c
@@ -12,9 +12,7 @@ static int usage(void);
int
util_printlog(WT_SESSION *session, int argc, char *argv[])
{
- WT_CURSOR *cursor;
WT_DECL_RET;
- WT_ITEM key, value;
int ch, printable;
printable = 0;
@@ -41,30 +39,11 @@ util_printlog(WT_SESSION *session, int argc, char *argv[])
if (argc != 0)
return (usage());
- if ((ret = session->open_cursor(session, "log",
- NULL, printable ? "printable" : "raw", &cursor)) != 0) {
- fprintf(stderr, "%s: cursor open(log) failed: %s\n",
- progname, wiredtiger_strerror(ret));
- goto err;
- }
+ WT_UNUSED(printable);
+ ret = __wt_txn_printlog(session, stdout);
- while ((ret = cursor->next(cursor)) == 0) {
- if ((ret = cursor->get_key(cursor, &key)) != 0)
- break;
- if ((ret = cursor->get_value(cursor, &value)) != 0)
- break;
- if (fwrite(key.data, 1, key.size, stdout) != key.size ||
- fwrite("\n", 1, 1, stdout) != 1 ||
- fwrite(value.data, 1, value.size, stdout) != value.size ||
- fwrite("\n", 1, 1, stdout) != 1) {
- ret = errno;
- break;
- }
- }
- if (ret == WT_NOTFOUND)
- ret = 0;
- else {
- fprintf(stderr, "%s: cursor get(log) failed: %s\n",
+ if (ret != 0) {
+ fprintf(stderr, "%s: printlog failed: %s\n",
progname, wiredtiger_strerror(ret));
goto err;
}
diff --git a/test/java/com/wiredtiger/test/AutoCloseTest.java b/test/java/com/wiredtiger/test/AutoCloseTest.java
new file mode 100644
index 00000000000..11c29dc23c6
--- /dev/null
+++ b/test/java/com/wiredtiger/test/AutoCloseTest.java
@@ -0,0 +1,284 @@
+/*-
+ * Public Domain 2008-2013 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.wiredtiger.test;
+
+import com.wiredtiger.db.Connection;
+import com.wiredtiger.db.Cursor;
+import com.wiredtiger.db.Session;
+import com.wiredtiger.db.WiredTigerPackingException;
+import com.wiredtiger.db.wiredtiger;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.After;
+import org.junit.Test;
+import org.junit.Assert;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+public class AutoCloseTest {
+
+ /*
+ * Connvalid tells us that we really closed the connection.
+ * That allows teardown to reliably clean up so that
+ * a single failure in one test does not cascade.
+ */
+ Connection conn;
+ boolean connvalid = false;
+
+ private Session sessionSetup() {
+ String keyFormat = "S";
+ String valueFormat = "u";
+
+ conn = wiredtiger.open("WT_HOME", "create");
+ connvalid = true;
+ Session s = conn.open_session(null);
+ s.create("table:t",
+ "key_format=" + keyFormat + ",value_format=" + valueFormat);
+ return s;
+ }
+
+ private Cursor populate(Session s)
+ throws WiredTigerPackingException {
+ Cursor c = s.open_cursor("table:t", null, null);
+ c.putKeyString("bar");
+ c.putValueByteArray("foo".getBytes());
+ c.insert();
+ return c;
+ }
+
+ @Test
+ public void autoCloseCursor01()
+ throws WiredTigerPackingException {
+ Session s = sessionSetup();
+ Cursor c = populate(s);
+
+ c.close();
+
+ boolean caught = false;
+ try {
+ // Error: cursor used after close
+ c.next();
+ }
+ catch (NullPointerException iae) {
+ assertEquals(iae.toString().contains("cursor is null"), true);
+ caught = true;
+ }
+ assertEquals(caught, true);
+
+ s.close("");
+ conn.close("");
+ connvalid = false;
+ }
+
+ @Test
+ public void autoCloseCursor02()
+ throws WiredTigerPackingException {
+ Session s = sessionSetup();
+ Cursor c = populate(s);
+
+ s.close("");
+
+ boolean caught = false;
+ try {
+ // Error: cursor used after session closed
+ c.next();
+ }
+ catch (NullPointerException iae) {
+ assertEquals(iae.toString().contains("cursor is null"), true);
+ caught = true;
+ }
+ assertEquals(caught, true);
+
+ conn.close("");
+ connvalid = false;
+ }
+
+ @Test
+ public void autoCloseCursor03()
+ throws WiredTigerPackingException {
+ Session s = sessionSetup();
+ Cursor c = populate(s);
+
+ conn.close("");
+ connvalid = false;
+
+ boolean caught = false;
+ try {
+ // Error: cursor used after connection close
+ c.close();
+ }
+ catch (NullPointerException iae) {
+ assertEquals(iae.toString().contains("cursor is null"), true);
+ caught = true;
+ }
+ assertEquals(caught, true);
+ }
+
+ @Test
+ public void autoCloseCursor04()
+ throws WiredTigerPackingException {
+ Session s = sessionSetup();
+
+ // The truncate call allows both of its cursor args
+ // to be null, so we don't expect null checking.
+
+ Cursor cbegin = s.open_cursor("table:t", null, null);
+ cbegin.putKeyString("bar");
+ cbegin.search();
+ Cursor cend = s.open_cursor(null, cbegin, null);
+ s.truncate(null, cbegin, cend, "");
+ cbegin.close();
+ cend.close();
+
+ s.truncate("table:t", null, null, null);
+
+ conn.close("");
+ connvalid = false;
+ }
+
+ @Test
+ public void autoCloseCursor05()
+ throws WiredTigerPackingException {
+ Session s = sessionSetup();
+ Cursor c = populate(s);
+
+ // Allowable compare call
+ c.putKeyString("bar");
+ c.search();
+ Cursor c2 = s.open_cursor(null, c, null);
+ c.compare(c2);
+
+ boolean caught = false;
+ try {
+ // Error: cursor arg should not be null
+ c.compare(null);
+ }
+ catch (NullPointerException iae) {
+ assertEquals(iae.toString().contains("other is null"), true);
+ caught = true;
+ }
+ assertEquals(caught, true);
+ conn.close("");
+ connvalid = false;
+ }
+
+ @Test
+ public void autoCloseSession01()
+ throws WiredTigerPackingException {
+ Session s = sessionSetup();
+ Cursor c = populate(s);
+
+ s.close("");
+
+ boolean caught = false;
+ try {
+ // Error: session used after close
+ s.drop("table:t", "");
+ }
+ catch (NullPointerException iae) {
+ assertEquals(iae.toString().contains("session is null"), true);
+ caught = true;
+ }
+ assertEquals(caught, true);
+
+ conn.close("");
+ connvalid = false;
+ }
+
+ @Test
+ public void autoCloseSession02()
+ throws WiredTigerPackingException {
+ Session s = sessionSetup();
+ Cursor c = populate(s);
+
+ conn.close("");
+ connvalid = false;
+
+ boolean caught = false;
+ try {
+ // Error: session used after connection close
+ s.close("");
+ }
+ catch (NullPointerException iae) {
+ assertEquals(iae.toString().contains("session is null"), true);
+ caught = true;
+ }
+ assertEquals(caught, true);
+ }
+
+ public void autoCloseSession03()
+ throws WiredTigerPackingException {
+ Session s = sessionSetup();
+ Cursor c = populate(s);
+
+ s.close("");
+
+ boolean caught = false;
+ try {
+ // Error: session used after close, using open call
+ s.open_cursor("table:t", null, null);
+ }
+ catch (NullPointerException iae) {
+ assertEquals(iae.toString().contains("session is null"), true);
+ caught = true;
+ }
+ assertEquals(caught, true);
+
+ conn.close("");
+ connvalid = false;
+ }
+
+ @Test
+ public void autoCloseConnection01()
+ throws WiredTigerPackingException {
+ Session s = sessionSetup();
+ Cursor c = populate(s);
+
+ conn.close("");
+ connvalid = false;
+
+ boolean caught = false;
+ try {
+ // Error: connection used after close
+ conn.close("");
+ }
+ catch (NullPointerException iae) {
+ assertEquals(iae.toString().contains("connection is null"), true);
+ caught = true;
+ }
+ assertEquals(caught, true);
+ }
+
+ @After
+ public void teardown() {
+ if (connvalid) {
+ conn.close("");
+ }
+ }
+
+}
diff --git a/test/java/com/wiredtiger/test/WiredTigerSuite.java b/test/java/com/wiredtiger/test/WiredTigerSuite.java
index cbcc1b4323f..24847711cc7 100644
--- a/test/java/com/wiredtiger/test/WiredTigerSuite.java
+++ b/test/java/com/wiredtiger/test/WiredTigerSuite.java
@@ -31,6 +31,7 @@ import org.junit.runners.Suite;
@RunWith(Suite.class)
@Suite.SuiteClasses( {
+ AutoCloseTest.class,
CursorTest.class,
CursorTest02.class,
PackTest.class
diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c
index 9d6eee1c59f..5b7a2c2db88 100644
--- a/test/salvage/salvage.c
+++ b/test/salvage/salvage.c
@@ -158,7 +158,7 @@ run(int r)
printf("\t%s: run %d\n", __wt_page_type_string(page_type), r);
WT_UNUSED_RET(system(
- "rm -f WiredTiger WiredTiger.* __slvg.* __schema.*"));
+ "rm -f WiredTiger* __slvg.* __schema.*"));
assert((res_fp = fopen(RSLT, "w")) != NULL);
/*
@@ -457,7 +457,12 @@ build(int ikey, int ivalue, int cnt)
char config[256], kbuf[64], vbuf[64];
int new_slvg;
- assert(wiredtiger_open(NULL, NULL, "create", &conn) == 0);
+ /*
+ * Disable logging: we're modifying files directly, we don't want to
+ * run recovery.
+ */
+ assert(wiredtiger_open(
+ NULL, NULL, "create,log=(enabled=false)", &conn) == 0);
assert(conn->open_session(conn, NULL, NULL, &session) == 0);
assert(session->drop(session, "file:" LOAD, "force") == 0);
@@ -604,24 +609,25 @@ process(void)
config[0] = '\0';
if (verbose)
snprintf(config, sizeof(config),
- "error_prefix=\"%s\",verbose=[salvage,verify]",
+ "error_prefix=\"%s\",verbose=[salvage,verify],",
progname);
+ strcat(config, "log=(enabled=false),");
+
assert(wiredtiger_open(NULL, NULL, config, &conn) == 0);
assert(conn->open_session(conn, NULL, NULL, &session) == 0);
assert(session->salvage(session, "file:" SLVG, 0) == 0);
assert(conn->close(conn, 0) == 0);
/* Verify. */
- assert(wiredtiger_open(NULL, NULL, "", &conn) == 0);
+ assert(wiredtiger_open(NULL, NULL, config, &conn) == 0);
assert(conn->open_session(conn, NULL, NULL, &session) == 0);
assert(session->verify(session, "file:" SLVG, 0) == 0);
assert(conn->close(conn, 0) == 0);
/* Dump. */
assert((fp = fopen(DUMP, "w")) != NULL);
- assert(wiredtiger_open(NULL, NULL, "", &conn) == 0);
+ assert(wiredtiger_open(NULL, NULL, config, &conn) == 0);
assert(conn->open_session(conn, NULL, NULL, &session) == 0);
- assert(session->create(session, "file:" SLVG, NULL) == 0);
assert(session->open_cursor(
session, "file:" SLVG, NULL, "dump=print", &cursor) == 0);
while (cursor->next(cursor) == 0) {
diff --git a/test/suite/test_txn02.py b/test/suite/test_txn02.py
index ec0e2240f3d..bc024dfa531 100644
--- a/test/suite/test_txn02.py
+++ b/test/suite/test_txn02.py
@@ -29,8 +29,10 @@
# Transactions: commits and rollbacks
#
-import wiredtiger, wttest
+import os, shutil
+from wiredtiger import wiredtiger_open
from wtscenario import multiply_scenarios, number_scenarios
+import wttest
class test_txn02(wttest.WiredTigerTestCase):
tablename = 'test_txn02'
@@ -70,11 +72,14 @@ class test_txn02(wttest.WiredTigerTestCase):
txn4s = [('t4c', dict(txn4='commit')), ('t4r', dict(txn4='rollback'))]
scenarios = number_scenarios(multiply_scenarios('.', types,
- op1s, txn1s, op2s, txn2s, op3s, txn3s, op4s, txn4s)) # [:1]
+ op1s, txn1s, op2s, txn2s, op3s, txn3s, op4s, txn4s))
# Overrides WiredTigerTestCase
def setUpConnectionOpen(self, dir):
- conn = wiredtiger.wiredtiger_open(dir, 'create,' +
+ self.home = dir
+ self.backup_dir = os.path.join(self.home, "WT_BACKUP")
+ conn = wiredtiger_open(dir,
+ 'create,log=(enabled,file_max=100K),' +
('error_prefix="%s: ",' % self.shortid()))
self.pr(`conn`)
self.session2 = conn.open_session()
@@ -89,9 +94,7 @@ class test_txn02(wttest.WiredTigerTestCase):
actual = dict((k, v) for k, v in c if v != 0)
# Search for the expected items as well as iterating
for k, v in expected.iteritems():
- c.set_key(k)
- c.search()
- self.assertEqual(c.get_value(), v)
+ self.assertEqual(c[k], v)
c.close()
if txn_config:
session.commit_transaction()
@@ -108,7 +111,19 @@ class test_txn02(wttest.WiredTigerTestCase):
self.check(self.session2, "isolation=read-committed", committed)
self.check(self.session2, "isolation=read-uncommitted", current)
+ # Opening a clone of the database home directory should see the
+ # committed results.
+ wttest.removeAll(self.backup_dir)
+ shutil.copytree(self.home, self.backup_dir)
+ backup_conn = wiredtiger_open(self.backup_dir, 'log=(enabled)')
+ try:
+ self.check(backup_conn.open_session(), None, committed)
+ #self.check(backup_conn.open_session(), None, {})
+ finally:
+ backup_conn.close()
+
def test_ops(self):
+ # print "Creating %s with config '%s'" % (self.uri, self.create_params)
self.session.create(self.uri, self.create_params)
# Set up the table with entries for 1 and 10
# We use the overwrite config so insert can update as needed.
diff --git a/test/thread/t.c b/test/thread/t.c
index 5d9d10df0f3..ba37522bd04 100644
--- a/test/thread/t.c
+++ b/test/thread/t.c
@@ -203,7 +203,7 @@ wt_shutdown(void)
static void
shutdown(void)
{
- WT_UNUSED_RET(system("rm -f WiredTiger.* __wt*"));
+ WT_UNUSED_RET(system("rm -f WiredTiger* __wt*"));
}
static int
diff --git a/tools/statlog.py b/tools/statlog.py
index de7f6139c95..f46c834dc5d 100644
--- a/tools/statlog.py
+++ b/tools/statlog.py
@@ -42,6 +42,7 @@ no_scale_per_second_list = [
'cache: tracked dirty pages in the cache',
'cache: pages currently held in the cache',
'files currently open',
+ 'log: total log buffer size',
'open cursor count',
'block manager: file allocation unit size',
'block manager: checkpoint size',
@@ -68,7 +69,7 @@ no_scale_per_second_list = [
'overflow values cached in memory',
'chunks in the LSM tree',
'highest merge generation in the LSM tree',
- 'reconciliation maximum number of splits created for a page',
+ 'reconciliation maximum splits for a page',
'open cursor count',
]
# no scale-per-second list section: END