diff options
108 files changed, 4373 insertions, 1089 deletions
diff --git a/dist/api_config.py b/dist/api_config.py index ab76c80961f..27bc06a32aa 100644 --- a/dist/api_config.py +++ b/dist/api_config.py @@ -206,8 +206,10 @@ static const WT_CONFIG_CHECK confchk_%(name)s_subconfigs[] = { }; ''' % { 'name' : c.name, - 'check' : '\n\t'.join('"\n\t "'.join(w.wrap('{ "%s", "%s", %s, NULL },' % - (subc.name, gettype(subc), checkstr(subc)))) for subc in sorted(c.subconfig)), + 'check' : '\n\t'.join('"\n\t "'.join( + w.wrap('{ "%s", "%s", %s, NULL },' % + (subc.name, gettype(subc), checkstr(subc)))) + for subc in sorted(c.subconfig)), }) def getsubconfigstr(c): @@ -248,7 +250,7 @@ for name in sorted(api_data.methods.keys()): # #defines are used to avoid a list search where we know the correct slot). config_defines +=\ '#define\tWT_CONFIG_ENTRY_' + name.replace('.', '_') + '\t' * \ - max(1, 6 - int ((len('WT_CONFIG_ENTRY_' + name)) / 8)) + \ + max(1, 6 - (len('WT_CONFIG_ENTRY_' + name) / 8)) + \ "%2s" % str(slot) + '\n' # Write the method name and base. @@ -318,12 +320,12 @@ tfile = open(tmp_file, 'w') skip = 0 for line in open('../src/include/config.h', 'r'): if skip: - if line.count('configuration section: END'): + if 'configuration section: END' in line: tfile.write('/*\n' + line) skip = 0 else: tfile.write(line) - if line.count('configuration section: BEGIN'): + if 'configuration section: BEGIN' in line: skip = 1 tfile.write(' */\n') tfile.write(config_defines) diff --git a/dist/api_data.py b/dist/api_data.py index 09215470f00..c8f5bb8c397 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -9,25 +9,25 @@ class Error: errors = [ Error('WT_DEADLOCK', 'conflict between concurrent operations', ''' - This error is generated when an operation cannot be completed - due to a conflict with concurrent operations. The operation - may be retried; if a transaction is in progress, it should be - rolled back and the operation retried in a new transaction.'''), + This error is generated when an operation cannot be completed + due to a conflict with concurrent operations. The operation + may be retried; if a transaction is in progress, it should be + rolled back and the operation retried in a new transaction.'''), Error('WT_DUPLICATE_KEY', 'attempt to insert an existing key', ''' - This error is generated when the application attempts to insert - a record with the same key as an existing record without the - 'overwrite' configuration to WT_SESSION::open_cursor.'''), + This error is generated when the application attempts to insert + a record with the same key as an existing record without the + 'overwrite' configuration to WT_SESSION::open_cursor.'''), Error('WT_ERROR', 'non-specific WiredTiger error', ''' - This error is returned when an error is not covered by a - specific error return.'''), + This error is returned when an error is not covered by a + specific error return.'''), Error('WT_NOTFOUND', 'item not found', ''' - This error indicates an operation did not find a value to - return. This includes cursor search and other operations - where no record matched the cursor's search key such as - WT_CURSOR::update or WT_CURSOR::remove.'''), + This error indicates an operation did not find a value to + return. This includes cursor search and other operations + where no record matched the cursor's search key such as + WT_CURSOR::update or WT_CURSOR::remove.'''), Error('WT_PANIC', 'WiredTiger library panic', ''' - This error indicates an underlying problem that requires the - application exit and restart.'''), + This error indicates an underlying problem that requires the + application exit and restart.'''), Error('WT_RESTART', 'restart the operation (internal)', undoc=True), ] @@ -50,79 +50,79 @@ class Config: # All schema objects can have column names (optional for simple tables). column_meta = [ Config('columns', '', r''' - list of the column names. Comma-separated list of the form - <code>(column[,...])</code>. For tables, the number of entries - must match the total number of values in \c key_format and \c - value_format. For colgroups and indices, all column names must - appear in the list of columns for the table''', - type='list'), + list of the column names. Comma-separated list of the form + <code>(column[,...])</code>. For tables, the number of entries + must match the total number of values in \c key_format and \c + value_format. For colgroups and indices, all column names must + appear in the list of columns for the table''', + type='list'), ] source_meta = [ Config('source', '', r''' - set a custom data source URI for a column group, index or simple - table. By default, the data source URI is derived from the \c - type and the column group or index name. Applications can - create tables from existing data sources by supplying a \c - source configuration''', undoc=True), + set a custom data source URI for a column group, index or simple + table. By default, the data source URI is derived from the \c + type and the column group or index name. Applications can + create tables from existing data sources by supplying a \c + source configuration''', undoc=True), Config('type', 'file', r''' - set the type of data source used to store a column group, index - or simple table. By default, a \c "file:" URI is derived from - the object name. The \c type configuration can be used to - switch to a different data source, such as LSM or an extension - configured by the application.'''), + set the type of data source used to store a column group, index + or simple table. By default, a \c "file:" URI is derived from + the object name. The \c type configuration can be used to + switch to a different data source, such as LSM or an extension + configured by the application.'''), ] format_meta = column_meta + [ Config('key_format', 'u', r''' - the format of the data packed into key items. See @ref - schema_format_types for details. By default, the key_format is - \c 'u' and applications use WT_ITEM structures to manipulate - raw byte arrays. By default, records are stored in row-store - files: keys of type \c 'r' are record numbers and records - referenced by record number are stored in column-store files''', - type='format'), + the format of the data packed into key items. See @ref + schema_format_types for details. By default, the key_format is + \c 'u' and applications use WT_ITEM structures to manipulate + raw byte arrays. By default, records are stored in row-store + files: keys of type \c 'r' are record numbers and records + referenced by record number are stored in column-store files''', + type='format'), Config('value_format', 'u', r''' - the format of the data packed into value items. See @ref - schema_format_types for details. By default, the value_format - is \c 'u' and applications use a WT_ITEM structure to - manipulate raw byte arrays. Value items of type 't' are - bitfields, and when configured with record number type keys, - will be stored using a fixed-length store''', - type='format'), + the format of the data packed into value items. See @ref + schema_format_types for details. By default, the value_format + is \c 'u' and applications use a WT_ITEM structure to + manipulate raw byte arrays. Value items of type 't' are + bitfields, and when configured with record number type keys, + will be stored using a fixed-length store''', + type='format'), ] lsm_config = [ Config('lsm_auto_throttle', 'true', r''' - Throttle inserts into LSM trees if flushing to disk isn't - keeping up''', - type='boolean'), + Throttle inserts into LSM trees if flushing to disk isn't + keeping up''', + type='boolean'), Config('lsm_bloom', 'true', r''' - create bloom filters on LSM tree chunks as they are merged''', - type='boolean'), + create bloom filters on LSM tree chunks as they are merged''', + type='boolean'), Config('lsm_bloom_config', '', r''' - config string used when creating Bloom filter files, passed - to WT_SESSION::create'''), + config string used when creating Bloom filter files, passed + to WT_SESSION::create'''), Config('lsm_bloom_bit_count', '8', r''' - the number of bits used per item for LSM bloom filters''', - min='2', max='1000'), + the number of bits used per item for LSM bloom filters''', + min='2', max='1000'), Config('lsm_bloom_hash_count', '4', r''' - the number of hash values per item used for LSM bloom - filters''', - min='2', max='100'), + the number of hash values per item used for LSM bloom + filters''', + min='2', max='100'), Config('lsm_bloom_oldest', 'false', r''' - create a bloom filter on the oldest LSM tree chunk. Only - supported if bloom filters are enabled''', - type='boolean'), + create a bloom filter on the oldest LSM tree chunk. Only + supported if bloom filters are enabled''', + type='boolean'), Config('lsm_chunk_size', '2MB', r''' - the maximum size of the in-memory chunk of an LSM tree''', - min='512K', max='500MB'), + the maximum size of the in-memory chunk of an LSM tree''', + min='512K', max='500MB'), Config('lsm_merge_max', '15', r''' - the maximum number of chunks to include in a merge operation''', - min='2', max='100'), + the maximum number of chunks to include in a merge operation''', + min='2', max='100'), Config('lsm_merge_threads', '1', r''' - the number of thread to perform merge operations''', - min='1', max='10'), # !!! max must match WT_LSM_MAX_WORKERS + the number of thread to perform merge operations''', + min='1', max='10'), # !!! max must match WT_LSM_MAX_WORKERS ] # Per-file configuration @@ -134,137 +134,141 @@ file_config = format_meta + [ uses a best-fit algorithm''', choices=['first', 'best',]), Config('allocation_size', '4KB', r''' - the file unit allocation size, in bytes, must a power-of-two; - smaller values decrease the file space required by overflow - items, and the default value of 4KB is a good choice absent - requirements from the operating system or storage device''', - min='512B', max='128MB'), + the file unit allocation size, in bytes, must a power-of-two; + smaller values decrease the file space required by overflow + items, and the default value of 4KB is a good choice absent + requirements from the operating system or storage device''', + min='512B', max='128MB'), Config('block_compressor', '', r''' - configure a compressor for file blocks. Permitted values are - empty (off) or \c "bzip2", \c "snappy" or custom compression - engine \c "name" created with WT_CONNECTION::add_compressor. - See @ref compression for more information'''), + configure a compressor for file blocks. Permitted values are + empty (off) or \c "bzip2", \c "snappy" or custom compression + engine \c "name" created with WT_CONNECTION::add_compressor. + See @ref compression for more information'''), Config('cache_resident', 'false', r''' - do not ever evict the object's pages; see @ref - tuning_cache_resident for more information''', - type='boolean'), + do not ever evict the object's pages; see @ref + tuning_cache_resident for more information''', + type='boolean'), Config('checksum', 'uncompressed', r''' - configure block checksums; permitted values are <code>on</code> - (checksum all blocks), <code>off</code> (checksum no blocks) and - <code>uncompresssed</code> (checksum only blocks which are not - compressed for any reason). The \c uncompressed setting is for - applications which can rely on decompression to fail if a block - has been corrupted''', - choices=['on', 'off', 'uncompressed']), + configure block checksums; permitted values are <code>on</code> + (checksum all blocks), <code>off</code> (checksum no blocks) and + <code>uncompresssed</code> (checksum only blocks which are not + compressed for any reason). The \c uncompressed setting is for + applications which can rely on decompression to fail if a block + has been corrupted''', + choices=['on', 'off', 'uncompressed']), Config('collator', '', r''' - configure custom collation for keys. Value must be a collator - name created with WT_CONNECTION::add_collator'''), + configure custom collation for keys. Value must be a collator + name created with WT_CONNECTION::add_collator'''), Config('dictionary', '0', r''' - the maximum number of unique values remembered in the Btree - row-store leaf page value dictionary; see - @ref file_formats_compression for more information''', - min='0'), + the maximum number of unique values remembered in the Btree + row-store leaf page value dictionary; see + @ref file_formats_compression for more information''', + min='0'), Config('format', 'btree', r''' - the file format''', - choices=['btree']), + the file format''', + choices=['btree']), Config('huffman_key', '', r''' - configure Huffman encoding for keys. Permitted values - are empty (off), \c "english", \c "utf8<file>" or \c - "utf16<file>". See @ref huffman for more information'''), + configure Huffman encoding for keys. Permitted values + are empty (off), \c "english", \c "utf8<file>" or \c + "utf16<file>". See @ref huffman for more information'''), Config('huffman_value', '', r''' - configure Huffman encoding for values. Permitted values - are empty (off), \c "english", \c "utf8<file>" or \c - "utf16<file>". See @ref huffman for more information'''), + configure Huffman encoding for values. Permitted values + are empty (off), \c "english", \c "utf8<file>" or \c + "utf16<file>". See @ref huffman for more information'''), Config('internal_key_truncate', 'true', r''' - configure internal key truncation, discarding unnecessary - trailing bytes on internal keys (ignored for custom - collators)''', - type='boolean'), + configure internal key truncation, discarding unnecessary + trailing bytes on internal keys (ignored for custom + collators)''', + type='boolean'), Config('internal_page_max', '4KB', r''' - the maximum page size for internal nodes, in bytes; the size - must be a multiple of the allocation size and is significant - for applications wanting to avoid excessive L2 cache misses - while searching the tree. The page maximum is the bytes of - uncompressed data, that is, the limit is applied before any - block compression is done''', - min='512B', max='512MB'), + the maximum page size for internal nodes, in bytes; the size + must be a multiple of the allocation size and is significant + for applications wanting to avoid excessive L2 cache misses + while searching the tree. The page maximum is the bytes of + uncompressed data, that is, the limit is applied before any + block compression is done''', + min='512B', max='512MB'), Config('internal_item_max', '0', r''' - the largest key stored within an internal node, in bytes. If - non-zero, any key larger than the specified size will be - stored as an overflow item (which may require additional I/O - to access). If zero, a default size is chosen that permits at - least 8 keys per internal page''', - min=0), + the largest key stored within an internal node, in bytes. If + non-zero, any key larger than the specified size will be + stored as an overflow item (which may require additional I/O + to access). If zero, a default size is chosen that permits at + least 8 keys per internal page''', + min=0), Config('key_gap', '10', r''' - the maximum gap between instantiated keys in a Btree leaf page, - constraining the number of keys processed to instantiate a - random Btree leaf page key''', - min='0', undoc=True), + the maximum gap between instantiated keys in a Btree leaf page, + constraining the number of keys processed to instantiate a + random Btree leaf page key''', + min='0', undoc=True), Config('leaf_page_max', '1MB', r''' - the maximum page size for leaf nodes, in bytes; the size must - be a multiple of the allocation size, and is significant for - applications wanting to maximize sequential data transfer from - a storage device. The page maximum is the bytes of uncompressed - data, that is, the limit is applied before any block compression - is done''', - min='512B', max='512MB'), + the maximum page size for leaf nodes, in bytes; the size must + be a multiple of the allocation size, and is significant for + applications wanting to maximize sequential data transfer from + a storage device. The page maximum is the bytes of uncompressed + data, that is, the limit is applied before any block compression + is done''', + min='512B', max='512MB'), Config('leaf_item_max', '0', r''' - the largest key or value stored within a leaf node, in bytes. - If non-zero, any key or value larger than the specified size - will be stored as an overflow item (which may require additional - I/O to access). If zero, a default size is chosen that permits - at least 4 key and value pairs per leaf page''', - min=0), + the largest key or value stored within a leaf node, in bytes. + If non-zero, any key or value larger than the specified size + will be stored as an overflow item (which may require additional + I/O to access). If zero, a default size is chosen that permits + at least 4 key and value pairs per leaf page''', + min=0), Config('memory_page_max', '5MB', r''' - the maximum size a page can grow to in memory before being - reconciled to disk. The specified size will be adjusted to a - lower bound of <code>50 * leaf_page_max</code>. This limit is - soft - it is possible for pages to be temporarily larger than - this value''', - min='512B', max='10TB'), + the maximum size a page can grow to in memory before being + reconciled to disk. The specified size will be adjusted to a + lower bound of <code>50 * leaf_page_max</code>. This limit is + soft - it is possible for pages to be temporarily larger than + this value''', + min='512B', max='10TB'), Config('os_cache_max', '0', r''' - maximum system buffer cache usage, in bytes. If non-zero, evict - object blocks from the system buffer cache after that many bytes - from this object are read or written into the buffer cache''', - min=0), + maximum system buffer cache usage, in bytes. If non-zero, evict + object blocks from the system buffer cache after that many bytes + from this object are read or written into the buffer cache''', + min=0), Config('os_cache_dirty_max', '0', r''' - maximum dirty system buffer cache usage, in bytes. If non-zero, - schedule writes for dirty blocks belonging to this object in the - system buffer cache after that many bytes from this object are - written into the buffer cache''', - min=0), + maximum dirty system buffer cache usage, in bytes. If non-zero, + schedule writes for dirty blocks belonging to this object in the + system buffer cache after that many bytes from this object are + written into the buffer cache''', + min=0), Config('prefix_compression', 'true', r''' - configure prefix compression on row-store leaf pages''', - type='boolean'), + configure prefix compression on row-store leaf pages''', + type='boolean'), Config('prefix_compression_min', '4', r''' - minimum gain before prefix compression will be used on row-store - leaf pages''', - min=0), + minimum gain before prefix compression will be used on row-store + leaf pages''', + min=0), Config('split_pct', '75', r''' - the Btree page split size as a percentage of the maximum Btree - page size, that is, when a Btree page is split, it will be - split into smaller pages, where each page is the specified - percentage of the maximum Btree page size''', - min='25', max='100'), + the Btree page split size as a percentage of the maximum Btree + page size, that is, when a Btree page is split, it will be + split into smaller pages, where each page is the specified + percentage of the maximum Btree page size''', + min='25', max='100'), ] # File metadata, including both configurable and non-configurable (internal) file_meta = file_config + [ Config('checkpoint', '', r''' - the file checkpoint entries'''), + the file checkpoint entries'''), + Config('checkpoint_lsn', '', r''' + LSN of the last checkpoint'''), + Config('id', '', r''' + the file's ID number'''), Config('version', '(major=0,minor=0)', r''' - the file version'''), + the file version'''), ] table_only_meta = [ Config('colgroups', '', r''' - comma-separated list of names of column groups. Each column - group is stored separately, keyed by the primary key of the - table. If no column groups are specified, all columns are - stored together in a single file. All value columns in the - table must appear in at least one column group. Each column - group must be created with a separate call to - WT_SESSION::create''', type='list'), + comma-separated list of names of column groups. Each column + group is stored separately, keyed by the primary key of the + table. If no column groups are specified, all columns are + stored together in a single file. All value columns in the + table must appear in at least one column group. Each column + group must be created with a separate call to + WT_SESSION::create''', type='list'), ] colgroup_meta = column_meta + source_meta @@ -276,88 +280,89 @@ table_meta = format_meta + table_only_meta # Connection runtime config, shared by conn.reconfigure and wiredtiger_open connection_runtime_config = [ Config('shared_cache', '', r''' - shared cache configuration options. A database should configure - either a cache_size or a shared_cache not both''', - type='category', subconfig=[ - Config('enable', 'false', r''' - whether the connection is using a shared cache''', - type='boolean'), - Config('chunk', '10MB', r''' - the granularity that a shared cache is redistributed''', - min='1MB', max='10TB'), - Config('reserve', '0', r''' - amount of cache this database is guaranteed to have - available from the shared cache. This setting is per - database. Defaults to the chunk size''', type='int'), - Config('name', 'pool', r''' - name of a cache that is shared between databases'''), - Config('size', '500MB', r''' - maximum memory to allocate for the shared cache. Setting - this will update the value if one is already set''', - min='1MB', max='10TB') - ]), + shared cache configuration options. A database should configure + either a cache_size or a shared_cache not both''', + type='category', subconfig=[ + Config('enable', 'false', r''' + whether the connection is using a shared cache''', + type='boolean'), + Config('chunk', '10MB', r''' + the granularity that a shared cache is redistributed''', + min='1MB', max='10TB'), + Config('reserve', '0', r''' + amount of cache this database is guaranteed to have + available from the shared cache. This setting is per + database. Defaults to the chunk size''', type='int'), + Config('name', 'pool', r''' + name of a cache that is shared between databases'''), + Config('size', '500MB', r''' + maximum memory to allocate for the shared cache. Setting + this will update the value if one is already set''', + min='1MB', max='10TB') + ]), Config('cache_size', '100MB', r''' - maximum heap memory to allocate for the cache. A database should - configure either a cache_size or a shared_cache not both''', - min='1MB', max='10TB'), + maximum heap memory to allocate for the cache. A database should + configure either a cache_size or a shared_cache not both''', + min='1MB', max='10TB'), Config('error_prefix', '', r''' - prefix string for error messages'''), + prefix string for error messages'''), Config('eviction_dirty_target', '80', r''' - continue evicting until the cache has less dirty pages than this - (as a percentage). Dirty pages will only be evicted if the cache - is full enough to trigger eviction''', - min=10, max=99), + continue evicting until the cache has less dirty pages than this + (as a percentage). Dirty pages will only be evicted if the cache + is full enough to trigger eviction''', + min=10, max=99), Config('eviction_target', '80', r''' - continue evicting until the cache becomes less full than this - (as a percentage). Must be less than \c eviction_trigger''', - min=10, max=99), + continue evicting until the cache becomes less full than this + (as a percentage). Must be less than \c eviction_trigger''', + min=10, max=99), Config('eviction_trigger', '95', r''' - trigger eviction when the cache becomes this full (as a - percentage)''', - min=10, max=99), + trigger eviction when the cache becomes this full (as a + percentage)''', + min=10, max=99), Config('statistics', 'none', r''' - Maintain database statistics, which may impact performance. - Choosing "all" maintains all statistics regardless of cost, - "fast" maintains a subset of statistics that are relatively - inexpensive, "none" turns off all statistics. The "clear" - configuration resets statistics after they are gathered, - where appropriate (for example, a cache size statistic is - not cleared, while the count of cursor insert operations will - be cleared). When "clear" is configured for the database, - gathered statistics are reset each time a statistics cursor - is used to gather statistics, as well as each time statistics - are logged using the \c statistics_log configuration. See - @ref statistics for more information''', - type='list', choices=['all', 'fast', 'none', 'clear']), + Maintain database statistics, which may impact performance. + Choosing "all" maintains all statistics regardless of cost, + "fast" maintains a subset of statistics that are relatively + inexpensive, "none" turns off all statistics. The "clear" + configuration resets statistics after they are gathered, + where appropriate (for example, a cache size statistic is + not cleared, while the count of cursor insert operations will + be cleared). When "clear" is configured for the database, + gathered statistics are reset each time a statistics cursor + is used to gather statistics, as well as each time statistics + are logged using the \c statistics_log configuration. See + @ref statistics for more information''', + type='list', choices=['all', 'fast', 'none', 'clear']), Config('verbose', '', r''' - enable messages for various events. Options are given as a - list, such as <code>"verbose=[evictserver,read]"</code>''', - type='list', choices=[ - 'block', - 'ckpt', - 'compact', - 'evict', - 'evictserver', - 'fileops', - 'hazard', - 'log', - 'lsm', - 'mutex', - 'overflow', - 'read', - 'readserver', - 'reconcile', - 'salvage', - 'shared_cache', - 'verify', - 'version', - 'write']), + enable messages for various events. Options are given as a + list, such as <code>"verbose=[evictserver,read]"</code>''', + type='list', choices=[ + 'block', + 'ckpt', + 'compact', + 'evict', + 'evictserver', + 'fileops', + 'hazard', + 'log', + 'lsm', + 'mutex', + 'overflow', + 'read', + 'readserver', + 'reconcile', + 'recovery', + 'salvage', + 'shared_cache', + 'verify', + 'version', + 'write']), ] session_config = [ Config('isolation', 'read-committed', r''' - the default isolation level for operations in this session''', - choices=['read-uncommitted', 'read-committed', 'snapshot']), + the default isolation level for operations in this session''', + choices=['read-uncommitted', 'read-committed', 'snapshot']), ] methods = { @@ -375,135 +380,136 @@ methods = { 'session.compact' : Method([]), -'session.create' : Method(table_only_meta + file_config + lsm_config + source_meta + [ +'session.create' : + Method(table_only_meta + file_config + lsm_config + source_meta + [ Config('exclusive', 'false', r''' - fail if the object exists. When false (the default), if the - object exists, check that its settings match the specified - configuration''', - type='boolean'), + fail if the object exists. When false (the default), if the + object exists, check that its settings match the specified + configuration''', + type='boolean'), ]), 'session.drop' : Method([ Config('force', 'false', r''' - return success if the object does not exist''', - type='boolean'), + return success if the object does not exist''', + type='boolean'), Config('remove_files', 'true', r''' - should the underlying files be removed?''', - type='boolean'), - ]), + should the underlying files be removed?''', + type='boolean'), +]), 'session.log_printf' : Method([]), 'session.open_cursor' : Method([ Config('append', 'false', r''' - append the value as a new record, creating a new record - number key; valid only for cursors with record number keys''', - type='boolean'), + append the value as a new record, creating a new record + number key; valid only for cursors with record number keys''', + type='boolean'), Config('bulk', 'false', r''' - configure the cursor for bulk-loading, a fast, initial load - path (see @ref bulk_load for more information). Bulk-load - may only be used for newly created objects and cursors - configured for bulk-load only support the WT_CURSOR::insert - and WT_CURSOR::close methods. When bulk-loading row-store - objects, keys must be loaded in sorted order. The value is - usually a true/false flag; when bulk-loading fixed-length - column store objects, the special value \c bitmap allows - chunks of a memory resident bitmap to be loaded directly into - a file by passing a \c WT_ITEM to WT_CURSOR::set_value where - the \c size field indicates the number of records in the - bitmap (as specified by the object's \c value_format - configuration). Bulk-loaded bitmap values must end on a byte - boundary relative to the bit count (except for the last set - of values loaded)'''), + configure the cursor for bulk-loading, a fast, initial load + path (see @ref bulk_load for more information). Bulk-load + may only be used for newly created objects and cursors + configured for bulk-load only support the WT_CURSOR::insert + and WT_CURSOR::close methods. When bulk-loading row-store + objects, keys must be loaded in sorted order. The value is + usually a true/false flag; when bulk-loading fixed-length + column store objects, the special value \c bitmap allows + chunks of a memory resident bitmap to be loaded directly into + a file by passing a \c WT_ITEM to WT_CURSOR::set_value where + the \c size field indicates the number of records in the + bitmap (as specified by the object's \c value_format + configuration). Bulk-loaded bitmap values must end on a byte + boundary relative to the bit count (except for the last set + of values loaded)'''), Config('checkpoint', '', r''' - the name of a checkpoint to open (the reserved name - "WiredTigerCheckpoint" opens the most recent internal - checkpoint taken for the object). The cursor does not - support data modification'''), + the name of a checkpoint to open (the reserved name + "WiredTigerCheckpoint" opens the most recent internal + checkpoint taken for the object). The cursor does not + support data modification'''), Config('dump', '', r''' - configure the cursor for dump format inputs and outputs: - "hex" selects a simple hexadecimal format, "print" - selects a format where only non-printing characters are - hexadecimal encoded. The cursor dump format is compatible - with the @ref util_dump and @ref util_load commands''', - choices=['hex', 'print']), + configure the cursor for dump format inputs and outputs: + "hex" selects a simple hexadecimal format, "print" + selects a format where only non-printing characters are + hexadecimal encoded. The cursor dump format is compatible + with the @ref util_dump and @ref util_load commands''', + choices=['hex', 'print']), Config('next_random', 'false', r''' - configure the cursor to return a pseudo-random record from - the object; valid only for row-store cursors. Cursors - configured with \c next_random=true only support the - WT_CURSOR::next and WT_CURSOR::close methods. See @ref - cursor_random for details''', - type='boolean'), + configure the cursor to return a pseudo-random record from + the object; valid only for row-store cursors. Cursors + configured with \c next_random=true only support the + WT_CURSOR::next and WT_CURSOR::close methods. See @ref + cursor_random for details''', + type='boolean'), Config('overwrite', 'true', r''' - configures whether the cursor's insert, update and remove - methods check the existing state of the record. If \c overwrite - is \c false, WT_CURSOR::insert fails with ::WT_DUPLICATE_KEY - if the record exists, WT_CURSOR::update and WT_CURSOR::remove - fail with ::WT_NOTFOUND if the record does not exist''', - type='boolean'), + configures whether the cursor's insert, update and remove + methods check the existing state of the record. If \c overwrite + is \c false, WT_CURSOR::insert fails with ::WT_DUPLICATE_KEY + if the record exists, WT_CURSOR::update and WT_CURSOR::remove + fail with ::WT_NOTFOUND if the record does not exist''', + type='boolean'), Config('raw', 'false', r''' - ignore the encodings for the key and value, manage data as if - the formats were \c "u". See @ref cursor_raw for details''', - type='boolean'), + ignore the encodings for the key and value, manage data as if + the formats were \c "u". See @ref cursor_raw for details''', + type='boolean'), Config('statistics', '', r''' - Specify the statistics to be gathered. Choosing "all" gathers - statistics regardless of cost and may include traversing - on-disk files; "fast" gathers a subset of relatively - inexpensive statistics. The selection must agree with the - database \c statistics configuration specified to - ::wiredtiger_open or WT_CONNECTION::reconfigure. For example, - "all" or "fast" can be configured when the database is - configured with "all", but the cursor open will fail if "all" - is specified when the database is configured with "fast", - and the cursor open will fail in all cases when the database - is configured with "none". If \c statistics is not configured, - the default configuration is the database configuration. - The "clear" configuration resets statistics after gathering - them, where appropriate (for example, a cache size statistic - is not cleared, while the count of cursor insert operations - will be cleared). See @ref statistics for more information''', - type='list', choices=['all', 'fast', 'clear']), + Specify the statistics to be gathered. Choosing "all" gathers + statistics regardless of cost and may include traversing + on-disk files; "fast" gathers a subset of relatively + inexpensive statistics. The selection must agree with the + database \c statistics configuration specified to + ::wiredtiger_open or WT_CONNECTION::reconfigure. For example, + "all" or "fast" can be configured when the database is + configured with "all", but the cursor open will fail if "all" + is specified when the database is configured with "fast", + and the cursor open will fail in all cases when the database + is configured with "none". If \c statistics is not configured, + the default configuration is the database configuration. + The "clear" configuration resets statistics after gathering + them, where appropriate (for example, a cache size statistic + is not cleared, while the count of cursor insert operations + will be cleared). See @ref statistics for more information''', + type='list', choices=['all', 'fast', 'clear']), Config('target', '', r''' - if non-empty, backup the list of objects; valid only for a - backup data source''', - type='list'), + if non-empty, backup the list of objects; valid only for a + backup data source''', + type='list'), ]), 'session.rename' : Method([]), 'session.salvage' : Method([ Config('force', 'false', r''' - force salvage even of files that do not appear to be WiredTiger - files''', - type='boolean'), + force salvage even of files that do not appear to be WiredTiger + files''', + type='boolean'), ]), 'session.truncate' : Method([]), 'session.upgrade' : Method([]), 'session.verify' : Method([ Config('dump_address', 'false', r''' - Display addresses and page types as pages are verified, using - the application's message handler, intended for debugging''', - type='boolean'), + Display addresses and page types as pages are verified, using + the application's message handler, intended for debugging''', + type='boolean'), Config('dump_blocks', 'false', r''' - Display the contents of on-disk blocks as they are verified, using - the application's message handler, intended for debugging''', - type='boolean'), + Display the contents of on-disk blocks as they are verified, using + the application's message handler, intended for debugging''', + type='boolean'), Config('dump_pages', 'false', r''' - Display the contents of in-memory pages as they are verified, using - the application's message handler, intended for debugging''', - type='boolean') + Display the contents of in-memory pages as they are verified, using + the application's message handler, intended for debugging''', + type='boolean') ]), 'session.begin_transaction' : Method([ Config('isolation', '', r''' - the isolation level for this transaction; defaults to the - session's isolation level''', - choices=['read-uncommitted', 'read-committed', 'snapshot']), + the isolation level for this transaction; defaults to the + session's isolation level''', + choices=['read-uncommitted', 'read-committed', 'snapshot']), Config('name', '', r''' - name of the transaction for tracing and debugging'''), + name of the transaction for tracing and debugging'''), Config('priority', 0, r''' - priority of the transaction for resolving conflicts. - Transactions with higher values are less likely to abort''', - min='-100', max='100'), + priority of the transaction for resolving conflicts. + Transactions with higher values are less likely to abort''', + min='-100', max='100'), ]), 'session.commit_transaction' : Method([]), @@ -511,24 +517,24 @@ methods = { 'session.checkpoint' : Method([ Config('drop', '', r''' - specify a list of checkpoints to drop. - The list may additionally contain one of the following keys: - \c "from=all" to drop all checkpoints, - \c "from=<checkpoint>" to drop all checkpoints after and - including the named checkpoint, or - \c "to=<checkpoint>" to drop all checkpoints before and - including the named checkpoint. Checkpoints cannot be - dropped while a hot backup is in progress or if open in - a cursor''', type='list'), + specify a list of checkpoints to drop. + The list may additionally contain one of the following keys: + \c "from=all" to drop all checkpoints, + \c "from=<checkpoint>" to drop all checkpoints after and + including the named checkpoint, or + \c "to=<checkpoint>" to drop all checkpoints before and + including the named checkpoint. Checkpoints cannot be + dropped while a hot backup is in progress or if open in + a cursor''', type='list'), Config('force', 'false', r''' - by default, checkpoints may be skipped if the underlying object - has not been modified, this option forces the checkpoint''', - type='boolean'), + by default, checkpoints may be skipped if the underlying object + has not been modified, this option forces the checkpoint''', + type='boolean'), Config('name', '', r''' - if non-empty, specify a name for the checkpoint (note that - checkpoints including LSM trees may not be named)'''), + if non-empty, specify a name for the checkpoint (note that + checkpoints including LSM trees may not be named)'''), Config('target', '', r''' - if non-empty, checkpoint the list of objects''', type='list'), + if non-empty, checkpoint the list of objects''', type='list'), ]), 'connection.add_collator' : Method([]), @@ -540,17 +546,17 @@ methods = { 'connection.load_extension' : Method([ Config('config', '', r''' - configuration string passed to the entry point of the - extension as its WT_CONFIG_ARG argument'''), + configuration string passed to the entry point of the + extension as its WT_CONFIG_ARG argument'''), Config('entry', 'wiredtiger_extension_init', r''' - the entry point of the extension, called to initialize the - extension when it is loaded. The signature of the function - must match ::wiredtiger_extension_init'''), + the entry point of the extension, called to initialize the + extension when it is loaded. The signature of the function + must match ::wiredtiger_extension_init'''), Config('terminate', 'wiredtiger_extension_terminate', r''' - an optional function in the extension that is called before - the extension is unloaded during WT_CONNECTION::close. The - signature of the function must match - ::wiredtiger_extension_terminate'''), + an optional function in the extension that is called before + the extension is unloaded during WT_CONNECTION::close. The + signature of the function must match + ::wiredtiger_extension_terminate'''), ]), 'connection.open_session' : Method(session_config), @@ -559,112 +565,112 @@ methods = { 'wiredtiger_open' : Method(connection_runtime_config + [ Config('buffer_alignment', '-1', r''' - in-memory alignment (in bytes) for buffers used for I/O. The - default value of -1 indicates a platform-specific alignment - value should be used (4KB on Linux systems, zero elsewhere)''', - min='-1', max='1MB'), + in-memory alignment (in bytes) for buffers used for I/O. The + default value of -1 indicates a platform-specific alignment + value should be used (4KB on Linux systems, zero elsewhere)''', + min='-1', max='1MB'), Config('checkpoint', '', r''' - periodically checkpoint the database''', - type='category', subconfig=[ - Config('name', '"WiredTigerCheckpoint"', r''' - the checkpoint name'''), - Config('wait', '0', r''' - seconds to wait between each checkpoint; setting this value - configures periodic checkpoints''', - min='1', max='100000'), - ]), + periodically checkpoint the database''', + type='category', subconfig=[ + Config('name', '"WiredTigerCheckpoint"', r''' + the checkpoint name'''), + Config('wait', '0', r''' + seconds to wait between each checkpoint; setting this value + configures periodic checkpoints''', + min='1', max='100000'), + ]), Config('checkpoint_sync', 'true', r''' - flush files to stable storage when closing or writing - checkpoints''', - type='boolean'), + flush files to stable storage when closing or writing + checkpoints''', + type='boolean'), Config('create', 'false', r''' - create the database if it does not exist''', - type='boolean'), + create the database if it does not exist''', + type='boolean'), Config('direct_io', '', r''' - Use \c O_DIRECT to access files. Options are given as a list, - such as <code>"direct_io=[data]"</code>. Configuring - \c direct_io requires care, see @ref - tuning_system_buffer_cache_direct_io for important warnings''', - type='list', choices=['data', 'log']), + Use \c O_DIRECT to access files. Options are given as a list, + such as <code>"direct_io=[data]"</code>. Configuring + \c direct_io requires care, see @ref + tuning_system_buffer_cache_direct_io for important warnings''', + type='list', choices=['data', 'log']), Config('extensions', '', r''' - list of shared library extensions to load (using dlopen). - Any values specified to an library extension are passed to - WT_CONNECTION::load_extension as the \c config parameter - (for example, - <code>extensions=(/path/ext.so={entry=my_entry})</code>)''', - type='list'), + list of shared library extensions to load (using dlopen). + Any values specified to an library extension are passed to + WT_CONNECTION::load_extension as the \c config parameter + (for example, + <code>extensions=(/path/ext.so={entry=my_entry})</code>)''', + type='list'), Config('file_extend', '', r''' - file extension configuration. If set, extend files of the set - type in allocations of the set size, instead of a block at a - time as each new block is written. For example, - <code>file_extend=(data=16MB)</code>''', - type='list', choices=['data', 'log']), + file extension configuration. If set, extend files of the set + type in allocations of the set size, instead of a block at a + time as each new block is written. For example, + <code>file_extend=(data=16MB)</code>''', + type='list', choices=['data', 'log']), Config('hazard_max', '1000', r''' - maximum number of simultaneous hazard pointers per session - handle''', - min='15'), + maximum number of simultaneous hazard pointers per session + handle''', + min='15'), Config('log', '', r''' - enable logging''', - type='category', subconfig=[ - Config('archive', 'true', r''' - automatically archive unneeded log files''', - type='boolean'), - Config('enabled', 'false', r''' - enable logging subsystem''', - type='boolean'), - Config('file_max', '100MB', r''' - the maximum size of the log file''', - min='1MB', max='2GB'), - Config('path', '""', r''' - the path to a directory into which the log files are written. - If the value is not an absolute path name, the files are created - relative to the database home'''), - ]), + enable logging''', + type='category', subconfig=[ + Config('archive', 'true', r''' + automatically archive unneeded log files''', + type='boolean'), + Config('enabled', 'false', r''' + enable logging subsystem''', + type='boolean'), + Config('file_max', '100MB', r''' + the maximum size of log files''', + min='100KB', max='2GB'), + Config('path', '""', r''' + the path to a directory into which the log files are written. + If the value is not an absolute path name, the files are created + relative to the database home'''), + ]), Config('lsm_merge', 'true', r''' - merge LSM chunks where possible''', - type='boolean'), + merge LSM chunks where possible''', + type='boolean'), Config('mmap', 'true', r''' - Use memory mapping to access files when possible''', - type='boolean'), + Use memory mapping to access files when possible''', + type='boolean'), Config('multiprocess', 'false', r''' - permit sharing between processes (will automatically start an - RPC server for primary processes and use RPC for secondary - processes). <b>Not yet supported in WiredTiger</b>''', - type='boolean'), + permit sharing between processes (will automatically start an + RPC server for primary processes and use RPC for secondary + processes). <b>Not yet supported in WiredTiger</b>''', + type='boolean'), Config('session_max', '50', r''' - maximum expected number of sessions (including server - threads)''', - min='1'), + maximum expected number of sessions (including server + threads)''', + min='1'), Config('statistics_log', '', r''' - log any statistics the database is configured to maintain, - to a file. See @ref statistics for more information''', - type='category', subconfig=[ - Config('path', '"WiredTigerStat.%H"', r''' - the pathname to a file into which the log records are written, - may contain ISO C standard strftime conversion specifications. - If the value is not an absolute path name, the file is created - relative to the database home'''), - Config('sources', '', r''' - if non-empty, include statistics for the list of data source - URIs, if they are open at the time of the statistics logging. - The list may include URIs matching a single data source - ("table:mytable"), or a URI matching all data sources of a - particular type ("table:")''', - type='list'), - Config('timestamp', '"%b %d %H:%M:%S"', r''' - a timestamp prepended to each log record, may contain strftime - conversion specifications'''), - Config('wait', '0', r''' - seconds to wait between each write of the log records''', - min='1', max='100000'), - ]), + log any statistics the database is configured to maintain, + to a file. See @ref statistics for more information''', + type='category', subconfig=[ + Config('path', '"WiredTigerStat.%H"', r''' + the pathname to a file into which the log records are written, + may contain ISO C standard strftime conversion specifications. + If the value is not an absolute path name, the file is created + relative to the database home'''), + Config('sources', '', r''' + if non-empty, include statistics for the list of data source + URIs, if they are open at the time of the statistics logging. + The list may include URIs matching a single data source + ("table:mytable"), or a URI matching all data sources of a + particular type ("table:")''', + type='list'), + Config('timestamp', '"%b %d %H:%M:%S"', r''' + a timestamp prepended to each log record, may contain strftime + conversion specifications'''), + Config('wait', '0', r''' + seconds to wait between each write of the log records''', + min='1', max='100000'), + ]), Config('transaction_sync', 'dsync', r''' - how to sync log records when the transaction commits''', - choices=['dsync', 'fsync', 'none']), + how to sync log records when the transaction commits''', + choices=['dsync', 'fsync', 'none']), Config('use_environment_priv', 'false', r''' - use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment - variables regardless of whether or not the process is running - with special privileges. See @ref home for more information''', - type='boolean'), + use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment + variables regardless of whether or not the process is running + with special privileges. See @ref home for more information''', + type='boolean'), ]), } diff --git a/dist/api_err.py b/dist/api_err.py index 7ad645d108c..158a7e7ff56 100644 --- a/dist/api_err.py +++ b/dist/api_err.py @@ -28,8 +28,11 @@ for line in open('../src/include/wiredtiger.in', 'r'): if 'undoc' in err.flags: tfile.write('/*! @cond internal */\n') tfile.write('/*!%s.%s */\n' % - (('\n * ' if err.long_desc else ' ') + err.desc[0].upper() + err.desc[1:], - ''.join('\n * ' + l for l in textwrap.wrap(textwrap.dedent(err.long_desc).strip(), 77)) + '\n' if err.long_desc else '')) + (('\n * ' if err.long_desc else ' ') + + err.desc[0].upper() + err.desc[1:], + ''.join('\n * ' + l for l in textwrap.wrap( + textwrap.dedent(err.long_desc).strip(), 77)) + + '\n' if err.long_desc else '')) tfile.write('#define\t%s\t%d\n' % (err.name, v)) v -= 1 if 'undoc' in err.flags: diff --git a/dist/dist.py b/dist/dist.py index 8dbe75cba09..a51ce31b97b 100644 --- a/dist/dist.py +++ b/dist/dist.py @@ -1,58 +1,29 @@ import filecmp, os, re, shutil -# source_files_list -- -# Return a list of the source file names in filelist. -def source_files_list(): - list=[] - file_re = re.compile(r'^(\w|/)+/((\w|.)+)') - for line in open('filelist', 'r'): - if file_re.match(line): - list.append(file_re.match(line).group(2)) - return sorted(list) - # source_files -- -# Print a list of the source file names in filelist. +# Return a list of the source file names in filelist. def source_files(): - for line in source_files_list(): - print(line) - -# source_paths_list -- -# Return a list of the source file paths in filelist. -def source_paths_list(): - list=[] file_re = re.compile(r'^\w') for line in open('filelist', 'r'): if file_re.match(line): - list.append(line.rstrip()) - return sorted(list) + yield os.path.join('..', line.rstrip()) -# source_paths -- -# Print a list of the source file paths in filelist. -def source_paths(): - for line in source_paths_list(): - print(line) - -# directory_files_list -- +# source_dirs -- # Return a list of the directories in filelist. -def directory_files_list(): - dirs = {} - dir_re = re.compile(r'^((\w|/)+/)') - for line in open('filelist', 'r'): - if dir_re.match(line): - dirs[dir_re.match(line).group(1)] = 1 - return sorted(dirs) +def source_dirs(): + dirs = set() + for f in source_files(): + dirs.add(os.path.dirname(f)) + return dirs -# directory_files -- -# Print a list of the directories in filelist. -def directory_files(): - for entry in directory_files_list(): - print(entry) +def print_source_dirs(): + for d in source_dirs(): + print d # compare_srcfile -- # Compare two files, and if they differ, update the source file. def compare_srcfile(tmp, src): - if not os.path.isfile(src) or \ - not filecmp.cmp(tmp, src, False): + if not os.path.isfile(src) or not filecmp.cmp(tmp, src, shallow=False): print('Updating ' + src) shutil.copyfile(tmp, src) os.remove(tmp) diff --git a/dist/filelist b/dist/filelist index f251731af48..87b75d0b7cf 100644 --- a/dist/filelist +++ b/dist/filelist @@ -75,7 +75,7 @@ src/cursor/cur_stat.c src/cursor/cur_std.c src/cursor/cur_table.c src/log/log.c -src/log/log_desc.c +src/log/log_auto.c src/log/log_slot.c src/lsm/lsm_cursor.c src/lsm/lsm_merge.c @@ -149,3 +149,5 @@ src/support/stat.c src/txn/txn.c src/txn/txn_ckpt.c src/txn/txn_ext.c +src/txn/txn_log.c +src/txn/txn_recover.c diff --git a/dist/flags.py b/dist/flags.py index c5cf3c213ea..c7ba71e04b0 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -18,17 +18,16 @@ flags = { 'FILE_TYPE_DATA', 'FILE_TYPE_LOG' ], - 'log_scan' : [ + 'log_scan' : [ 'LOGSCAN_FIRST', 'LOGSCAN_FROM_CKP', 'LOGSCAN_ONE', 'LOGSCAN_RECOVER' - ], - 'log_write' : [ - 'LOG_CKPT', + ], + 'log_write' : [ 'LOG_DSYNC', 'LOG_FSYNC' - ], + ], 'rec_write' : [ 'EVICTION_SERVER_LOCKED', 'SKIP_UPDATE_ERR', @@ -47,6 +46,12 @@ flags = { 'TREE_SKIP_LEAF', 'TREE_WAIT', ], + 'txn_log_checkpoint' : [ + 'TXN_LOG_CKPT_FAIL', + 'TXN_LOG_CKPT_PREPARE', + 'TXN_LOG_CKPT_START', + 'TXN_LOG_CKPT_STOP', + ], 'verbose' : [ 'VERB_block', 'VERB_ckpt', @@ -61,6 +66,7 @@ flags = { 'VERB_overflow', 'VERB_read', 'VERB_reconcile', + 'VERB_recovery', 'VERB_salvage', 'VERB_shared_cache', 'VERB_verify', @@ -82,6 +88,8 @@ flags = { 'session' : [ 'SESSION_CACHE_BUSY', 'SESSION_INTERNAL', + 'SESSION_LOGGING_DISABLED', + 'SESSION_LOGGING_INMEM', 'SESSION_NO_CACHE', 'SESSION_NO_CACHE_CHECK', 'SESSION_NO_SCHEMA_LOCK', diff --git a/dist/java_doc.py b/dist/java_doc.py index ce42a53c118..d44ccb12160 100644 --- a/dist/java_doc.py +++ b/dist/java_doc.py @@ -36,7 +36,8 @@ for line in open(f, 'r'): m = cfunc_re.match(line) if m: - tfile.write('COPYDOC(__' + curr_class.lower() + ', ' + curr_class.upper() + ', ' + m.group(1) + ')\n') + tfile.write('COPYDOC(__' + curr_class.lower() + ', ' + + curr_class.upper() + ', ' + m.group(1) + ')\n') tfile.close() compare_srcfile(tmp_file, o) diff --git a/dist/log.py b/dist/log.py index aeed3ee5fe6..73752d4c7d3 100644 --- a/dist/log.py +++ b/dist/log.py @@ -1,40 +1,260 @@ #!/usr/bin/env python import os, re, sys, textwrap -import log_data from dist import compare_srcfile +import log_data # Temporary file. tmp_file = '__tmp' -# Map log record types to C -c_types = { - 'string' : 'const char *', +# Map log record types to: +# (C type, pack type, printf format, printf arg(s)) +field_types = { + 'string' : ('const char *', 'S', '%s', 'arg'), + 'item' : ('WT_ITEM *', 'u', '%.*s', + '(int)arg.size, (const char *)arg.data'), + 'recno' : ('uint64_t', 'r', '%" PRIu64 "', 'arg'), + 'uint32' : ('uint32_t', 'I', '%" PRIu32 "', 'arg'), + 'uint64' : ('uint64_t', 'Q', '%" PRIu64 "', 'arg'), } -# Map log record types to format strings -fmt_types = { - 'string' : 'S', -} +def cintype(f): + return field_types[f[0]][0] + +def couttype(f): + type = cintype(f) + # We already have a pointer to a WT_ITEM + if f[0] == 'item': + return type + if type[-1] != '*': + type += ' ' + return type + '*' + +def clocaltype(f): + type = cintype(f) + # Allocate a WT_ITEM struct on the stack + if f[0] == 'item': + return type[:-2] + return type + +def pack_fmt(fields): + return ''.join(field_types[f[0]][1] for f in fields) + +def op_pack_fmt(r): + return 'II' + pack_fmt(r.fields) + +def rec_pack_fmt(r): + return 'I' + pack_fmt(r.fields) + +def printf_fmt(f): + return field_types[f[0]][2] + +def printf_arg(f): + arg = field_types[f[0]][3].replace('arg', f[1]) + return '\n\t ' + arg if f[0] == 'item' else ' ' + arg ##################################################################### -# Create log.i with inline functions for each log record type. +# Update log.h with #defines for types ##################################################################### -f='../src/include/log.i' -tfile = open(tmp_file, 'w') - -tfile.write('/* DO NOT EDIT: automatically built by dist/log.py. */\n') +log_defines = ( + ''.join('#define\t%s\t%d\n' % (r.macro_name(), i) + for i, r in enumerate(log_data.rectypes)) + + ''.join('#define\t%s\t%d\n' % (r.macro_name(), i) + for i, r in enumerate(log_data.optypes)) +) +tfile = open(tmp_file, 'w') +skip = 0 +for line in open('../src/include/log.h', 'r'): + if skip: + if 'Log record declarations: END' in line: + tfile.write('/*\n' + line) + skip = 0 + else: + tfile.write(line) + if 'Log record declarations: BEGIN' in line: + skip = 1 + tfile.write(' */\n') + tfile.write(log_defines) tfile.close() -compare_srcfile(tmp_file, f) +compare_srcfile(tmp_file, '../src/include/log.h') ##################################################################### -# Create log_desc.c with descriptors for each log record type. +# Create log_auto.c with handlers for each record / operation type. ##################################################################### -f='../src/log/log_desc.c' +f='../src/log/log_auto.c' tfile = open(tmp_file, 'w') tfile.write('/* DO NOT EDIT: automatically built by dist/log.py. */\n') +tfile.write(''' +#include "wt_internal.h" + +int +__wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp) +{ + WT_ITEM *logrec; + + WT_RET(__wt_scr_alloc(session, WT_ALIGN(size + 1, LOG_ALIGN), &logrec)); + WT_CLEAR(*(WT_LOG_RECORD *)logrec->data); + logrec->size = offsetof(WT_LOG_RECORD, record); + + *logrecp = logrec; + return (0); +} + +void +__wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp) +{ + WT_UNUSED(session); + __wt_scr_free(logrecp); +} + +int +__wt_logrec_read(WT_SESSION_IMPL *session, + const uint8_t **pp, const uint8_t *end, uint32_t *rectypep) +{ + uint64_t rectype; + + WT_UNUSED(session); + WT_RET(__wt_vunpack_uint(pp, WT_PTRDIFF(end, *pp), &rectype)); + *rectypep = (uint32_t)rectype; + return (0); +} + +int +__wt_logop_read(WT_SESSION_IMPL *session, + const uint8_t **pp, const uint8_t *end, + uint32_t *optypep, uint32_t *opsizep) +{ + return (__wt_struct_unpack( + session, *pp, WT_PTRDIFF(end, *pp), "II", optypep, opsizep)); +} +''') + +# Emit code to read, write and print log operations (within a log record) +for optype in log_data.optypes: + if not optype.fields: + continue + + tfile.write(''' +int +__wt_logop_%(name)s_pack( + WT_SESSION_IMPL *session, WT_ITEM *logrec, + %(arg_decls)s) +{ + const char *fmt = WT_UNCHECKED_STRING(%(fmt)s); + size_t size; + uint32_t optype, recsize; + + optype = %(macro)s; + WT_RET(__wt_struct_size(session, &size, fmt, + optype, 0%(arg_names)s)); + + size += __wt_vsize_uint(size) - 1; + WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); + recsize = (uint32_t)size; + WT_RET(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, size, fmt, + optype, recsize%(arg_names)s)); + + logrec->size += (uint32_t)size; + return (0); +} +''' % { + 'name' : optype.name, + 'macro' : optype.macro_name(), + 'arg_decls' : ', '.join( + '%s%s%s' % (cintype(f), '' if cintype(f)[-1] == '*' else ' ', f[1]) + for f in optype.fields), + 'arg_names' : ''.join(', %s' % f[1] for f in optype.fields), + 'fmt' : op_pack_fmt(optype) +}) + + tfile.write(''' +int +__wt_logop_%(name)s_unpack( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + %(arg_decls)s) +{ + const char *fmt = WT_UNCHECKED_STRING(%(fmt)s); + uint32_t optype, size; + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &optype, &size%(arg_names)s)); + WT_ASSERT(session, optype == %(macro)s); + + *pp += size; + return (0); +} +''' % { + 'name' : optype.name, + 'macro' : optype.macro_name(), + 'arg_decls' : ', '.join( + '%s%sp' % (couttype(f), f[1]) for f in optype.fields), + 'arg_names' : ''.join(', %sp' % f[1] for f in optype.fields), + 'fmt' : op_pack_fmt(optype) +}) + + tfile.write(''' +int +__wt_logop_%(name)s_print( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + %(arg_decls)s + + WT_RET(__wt_logop_%(name)s_unpack( + session, pp, end%(arg_addrs)s)); + + %(print_args)s + return (0); +} +''' % { + 'name' : optype.name, + 'arg_decls' : '\n\t'.join('%s%s%s;' % + (clocaltype(f), '' if clocaltype(f)[-1] == '*' else ' ', f[1]) + for f in optype.fields), + 'arg_addrs' : ''.join(', &%s' % f[1] for f in optype.fields), + 'print_args' : '\n\t'.join( + 'fprintf(out, " \\"%s\\": \\"%s\\",\\n",%s);' % + (f[1], printf_fmt(f), printf_arg(f)) + for f in optype.fields), +}) + +# Emit the printlog entry point +tfile.write(''' +int +__wt_txn_op_printlog( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t optype, opsize; + + /* Peek at the size and the type. */ + WT_RET(__wt_logop_read(session, pp, end, &optype, &opsize)); + end = *pp + opsize; + + switch (optype) {''') + +for optype in log_data.optypes: + if not optype.fields: + continue + + tfile.write(''' + case %(macro)s: + WT_RET(%(print_func)s(session, pp, end, out)); + break; +''' % { + 'macro' : optype.macro_name(), + 'print_func' : '__wt_logop_' + optype.name + '_print', +}) + +tfile.write(''' + WT_ILLEGAL_VALUE(session); + } + + return (0); +} +''') + tfile.close() compare_srcfile(tmp_file, f) diff --git a/dist/log_data.py b/dist/log_data.py index 1e274e48991..64fe73a88a2 100644 --- a/dist/log_data.py +++ b/dist/log_data.py @@ -1,10 +1,60 @@ -# Data for config.py, describes all configuration key / value pairs +# Data for log.py, describes the format of log records + +# There are a small number of main log record types. +# +# Some log record types, such as transaction commit, also include a list of +# "log operations" within the same log record. Both log record types and log +# operations are described here. class LogRecordType: - def __init__(self, name, fields): - self.name = name - self.fields = fields + def __init__(self, name, fields): + self.name = name + self.fields = fields + + def macro_name(self): + return 'WT_LOGREC_%s' % self.name.upper() + + def prname(self): + return '__logrec_print_' + self.name + +rectypes = [ + # A database-wide checkpoint. + LogRecordType('checkpoint', [ + ('WT_LSN', 'ckpt_lsn'), ('uint32', 'nsnapshot'), ('item', 'snapshot')]), + + # Common case: a transaction commit + LogRecordType('commit', [('uint64', 'txnid')]), + + # Mark the start / end of a file sync operation (usually when a file is + # closed). These log records aren't required during recovery, but we use + # the allocated LSN to reduce the amount of work recovery has to do, and + # they are useful for debugging recovery. + LogRecordType('file_sync', [('uint32', 'fileid'), ('int', 'start')]), + + # Debugging message in the log + LogRecordType('message', [('string', 'message')]), +] + +class LogOperationType: + def __init__(self, name, fields): + self.name = name + self.fields = fields + + def macro_name(self): + return 'WT_LOGOP_%s' % self.name.upper() -types = [ - LogRecordType('debug', [('string', 'message')]) +optypes = [ + LogOperationType('col_put', + [('uint32', 'fileid'), ('recno', 'recno'), ('item', 'value')]), + LogOperationType('col_remove', + [('uint32', 'fileid'), ('recno', 'recno')]), + LogOperationType('col_truncate', + [('uint32', 'fileid'), ('recno', 'start'), ('recno', 'stop')]), + LogOperationType('row_put', + [('uint32', 'fileid'), ('item', 'key'), ('item', 'value')]), + LogOperationType('row_remove', + [('uint32', 'fileid'), ('item', 'key')]), + LogOperationType('row_truncate', + [('uint32', 'fileid'), ('item', 'start'), ('item', 'stop'), + ('uint32', 'mode')]), ] diff --git a/dist/s_all b/dist/s_all index 11506fd76fd..c39ea945b3a 100644 --- a/dist/s_all +++ b/dist/s_all @@ -72,7 +72,8 @@ run "sh ./s_stat" "checking for unused statistics fields" run "sh ./s_getopt" "checking for incorrect getopt usage" run "sh ./s_longlines" "checking for long lines" run "sh ./s_string" "checking string spelling" -run "sh ./s_style" "checking style" +run "python style.py" "checking style (pass 1)" +run "sh ./s_style" "checking style (pass 2)" run "sh ./s_symbols" "checking external symbol names" run "sh ./s_typedef -c" "checking for unused typedefs" run "sh ./s_whitespace" "checking whitespace" diff --git a/dist/s_copyright b/dist/s_copyright index ce8f9970ae3..966cddcfff4 100644 --- a/dist/s_copyright +++ b/dist/s_copyright @@ -72,7 +72,10 @@ check() for i in `cd .. && find [a-z]* -name '*.[chi]' \ -o -name '*.cxx' -o -name '*.java' -o -name '*.py' | - sed -e '/test\/3rdparty\//d' -e 's/^\.\///'`; do + sed -e '/test\/3rdparty\//d' \ + -e '/^build/d' \ + -e 's/^\.\///'` +do check $i done diff --git a/dist/s_copyright.list b/dist/s_copyright.list index 3213ace3b5b..ce990c43474 100644 --- a/dist/s_copyright.list +++ b/dist/s_copyright.list @@ -7,12 +7,21 @@ skip dist/db.py skip dist/dist.py skip dist/flags.py skip dist/java_doc.py -skip dist/log.py skip dist/log_data.py +skip dist/log.py skip dist/serial.py -skip dist/stat.py skip dist/stat_data.py +skip dist/stat.py +skip dist/style.py skip lang/java/java_doc.i +skip lang/java/src/com/wiredtiger/db/Connection.java +skip lang/java/src/com/wiredtiger/db/Cursor.java +skip lang/java/src/com/wiredtiger/db/SearchStatus.java +skip lang/java/src/com/wiredtiger/db/Session.java +skip lang/java/src/com/wiredtiger/db/wiredtigerConstants.java +skip lang/java/src/com/wiredtiger/db/wiredtiger.java +skip lang/java/src/com/wiredtiger/db/wiredtigerJNI.java +skip lang/java/wiredtiger_wrap.c skip lang/python/setup.py skip lang/python/wiredtiger/__init__.py skip lang/python/wiredtiger_wrap.c @@ -24,8 +33,8 @@ skip src/include/flags.h skip src/include/log.i skip src/include/queue.h skip src/include/serial_funcs.i -skip src/log/log_desc.c +skip src/log/log_auto.c skip src/support/stat.c -skip test/packing/intpack-test.c skip test/packing/intpack-test2.c +skip test/packing/intpack-test.c skip test/packing/packing-test.c diff --git a/dist/s_define.list b/dist/s_define.list index a30378ea899..e3d1d2593e1 100644 --- a/dist/s_define.list +++ b/dist/s_define.list @@ -5,10 +5,12 @@ API_CALL_NOCONF API_SESSION_INIT FLD_CLR HAVE_ATOMICS +IS_INIT_LSN LF_CLR LF_SET LLONG_MAX LLONG_MIN +MAX_LSN SIZE_CHECK TXNID_LE TXN_API_CALL @@ -18,8 +20,11 @@ WT_BARRIER WT_BLOCK_DESC_SIZE WT_CACHE_LINE_ALIGNMENT WT_DEBUG_BYTE +WT_HANDLE_CLOSED +WT_HANDLE_NULLABLE WT_READ_BARRIER WT_REF_SIZE +WT_RET_TIMEDOUT_OK WT_SPINLOCK_MAX WT_STAT_ATOMIC_DECR WT_STAT_ATOMIC_INCR diff --git a/dist/s_funcs.list b/dist/s_funcs.list index 64ac90752f3..b34564adda7 100644 --- a/dist/s_funcs.list +++ b/dist/s_funcs.list @@ -9,6 +9,7 @@ __wt_bloom_drop __wt_bloom_get __wt_cache_dump __wt_config_getone +__wt_cursor_get_raw_value __wt_debug_addr __wt_debug_offset __wt_debug_tree diff --git a/dist/s_longlines b/dist/s_longlines index d0b32ce341f..2aa7e9eaaff 100644 --- a/dist/s_longlines +++ b/dist/s_longlines @@ -6,6 +6,7 @@ trap 'rm -f $t; exit 0' 0 1 2 3 13 15 l=`(cd .. && find bench/wtperf examples ext src test -name '*.[chisy]' && + find dist -name '*.py' && find src -name '*.in') | grep -v 'support/stat\.c'` diff --git a/dist/s_string.ok b/dist/s_string.ok index d230c987ab2..a66518d57d5 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -70,6 +70,7 @@ DUPLICATEV DbCursor DbEnv Decrement +EAGAIN EB EBUSY EINTR @@ -115,6 +116,7 @@ IMPL's INDX INIT INITIALIZER +INMEM INSERT's INUSE ISSET @@ -131,6 +133,7 @@ Kounavis LEX LF LNO +LOGSCAN LRU LSB LSM @@ -441,6 +444,7 @@ errv errx esc eventv +evictable evictserver exactp extern @@ -458,8 +462,10 @@ ffs fgetln fh filefrag +fileid filename filenames +fileop fileops filesize filesystem @@ -467,6 +473,7 @@ firstfit fixup flcs fmt +fmterr fnv foc fopen @@ -548,6 +555,7 @@ keyv kv kvs kvsbdb +lang latencies ld len @@ -568,6 +576,7 @@ logf logmgr lognum logput +logread lookup lookups lr @@ -682,6 +691,7 @@ printf printlog priv ps +pse psp pthread putK @@ -832,6 +842,7 @@ treplacement trk trk's troot +trunc trywrlock tsalvage tsplit @@ -894,6 +905,7 @@ versa vlcs vmsg vpack +vprintf vrfy vsize vslot diff --git a/dist/s_tags b/dist/s_tags index 65b8d917a05..e189587ca82 100644 --- a/dist/s_tags +++ b/dist/s_tags @@ -36,7 +36,7 @@ rm -f tags ctags $flags ../include/*.in ../*/*.[chi] 2>/dev/null) # Link to the tags file from standard build and source directories. -dirs="`python -c 'from dist import directory_files; directory_files();'`" +dirs="`python -c 'import dist; dist.print_source_dirs()'`" for i in $dirs; do - (cd ../$i && rm -f tags && ln -s ../include/tags .) + (cd $i && rm -f tags && ln -s ../include/tags .) done diff --git a/dist/stat.py b/dist/stat.py index 783f2f191f2..69dcb87e8b3 100644 --- a/dist/stat.py +++ b/dist/stat.py @@ -3,7 +3,6 @@ import re, string, sys, textwrap from dist import compare_srcfile -from dist import source_paths_list # Read the source files. from stat_data import dsrc_stats, connection_stats diff --git a/dist/stat_data.py b/dist/stat_data.py index acc3358012f..1074b7d99cf 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -69,9 +69,10 @@ connection_stats = [ Stat('cache_eviction_fail', 'cache: pages selected for eviction unable to be evicted'), Stat('cache_eviction_force', - 'cache: pages evicted because they exceeded the in memory maximum'), + 'cache: pages evicted because they exceeded the in memory maximum'), Stat('cache_eviction_force_fail', - 'cache: failed eviction of pages that exceeded the in memory maximum'), + 'cache: failed eviction of pages that exceeded the ' + + 'in memory maximum'), Stat('cache_eviction_hazard', 'cache: hazard pointer blocked page eviction'), Stat('cache_eviction_internal', 'cache: internal pages evicted'), @@ -98,11 +99,15 @@ connection_stats = [ Stat('dh_conn_handles', 'dhandle: connection dhandles swept'), Stat('dh_session_handles', 'dhandle: session dhandles swept'), Stat('dh_sweep_evict', 'dhandle: sweeps conflicting with evict'), - Stat('dh_sweeps', 'dhandle: number of sweep attempts'), + Stat('dh_sweeps', 'dhandle: sweep attempts'), ########################################## # Logging statistics ########################################## + Stat('log_buffer_grow', + 'log: log buffer size increases'), + Stat('log_buffer_size', + 'log: total log buffer size', 'no_clear,no_scale'), Stat('log_bytes_user', 'log: user provided log bytes written'), Stat('log_bytes_written', 'log: log bytes written'), Stat('log_max_filesize', 'log: maximum log file size', 'no_clear'), @@ -115,9 +120,17 @@ connection_stats = [ Stat('log_slot_consolidated', 'log: logging bytes consolidated'), Stat('log_slot_closes', 'log: consolidated slot closures'), + Stat('log_slot_ready_wait_timeout', + 'log: log slot ready wait timeouts'), Stat('log_slot_joins', 'log: consolidated slot joins'), Stat('log_slot_races', 'log: consolidated slot join races'), + Stat('log_slot_release_wait_timeout', + 'log: log slot release wait timeouts'), + Stat('log_slot_switch_fails', + 'log: slots selected for switching that were unavailable'), Stat('log_slot_toobig', 'log: record size exceeded maximum'), + Stat('log_slot_toosmall', + 'log: failed to find a slot large enough for record'), Stat('log_slot_transitions', 'log: consolidated slot join transitions'), ########################################## @@ -245,7 +258,8 @@ dsrc_stats = [ ########################################## Stat('block_alloc', 'block manager: blocks allocated'), Stat('allocation_size', - 'block manager: file allocation unit size', 'no_aggregate,no_scale'), + 'block manager: file allocation unit size', + 'no_aggregate,no_scale'), Stat('block_checkpoint_size', 'block manager: checkpoint size', 'no_scale'), Stat('block_extension', @@ -320,7 +334,7 @@ dsrc_stats = [ Stat('rec_split_leaf', 'reconciliation leaf pages split'), Stat('rec_split_max', - 'reconciliation maximum number of splits created for a page', + 'reconciliation maximum splits for a page', 'max_aggregate,no_scale'), ########################################## diff --git a/dist/style.py b/dist/style.py new file mode 100755 index 00000000000..0c5083eb521 --- /dev/null +++ b/dist/style.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python + +# Check the style of WiredTiger C code. +from dist import source_files +import re + +skip_re = re.compile(r'DO NOT EDIT: automatically built') +func_re = re.compile(r'(/\*(?:[^\*]|\*[^/])*\*/)?\n\w[\w ]+\n(\w+)', re.DOTALL) + +for f in source_files(): + s = open(f, 'r').read() + if skip_re.search(s): + continue + for m in func_re.finditer(s): + if not m.group(1) or \ + not m.group(1).startswith('/*\n * %s --\n' % m.group(2)): + print "%s:%d: missing comment for %s" % \ + (f, s[:m.start(2)].count('\n'), m.group(2)) diff --git a/lang/java/Makefile.am b/lang/java/Makefile.am index 6c9346af2e1..4fbf9bef5dd 100644 --- a/lang/java/Makefile.am +++ b/lang/java/Makefile.am @@ -30,6 +30,7 @@ JAVA_SRC = \ $(JAVAEXAMPLES)/ex_access.java JAVA_JUNIT = \ + $(JAVATEST)/AutoCloseTest.java \ $(JAVATEST)/CursorTest.java \ $(JAVATEST)/CursorTest02.java \ $(JAVATEST)/PackTest.java \ diff --git a/lang/java/wiredtiger.i b/lang/java/wiredtiger.i index 1ad1a09768c..3c1cc1d347e 100644 --- a/lang/java/wiredtiger.i +++ b/lang/java/wiredtiger.i @@ -25,13 +25,47 @@ %} %{ -typedef int bool; +#include "../src/include/wt_internal.h" + +/* + * Closed handle checking: + * + * The typedef WT_CURSOR_NULLABLE used in wiredtiger.h is only made + * visible to the SWIG parser and is used to identify arguments of + * Cursor type that are permitted to be null. Likewise, typedefs + * WT_{CURSOR,SESSION,CONNECTION}_CLOSED identify 'close' calls that + * need explicit nulling of the swigCPtr. These typedefs permit + * special casing in typemaps for input args. + * + * We want SWIG to see these 'fake' typenames, but not the compiler. + */ +#define WT_CURSOR_NULLABLE WT_CURSOR +#define WT_CURSOR_CLOSED WT_CURSOR +#define WT_SESSION_CLOSED WT_SESSION +#define WT_CONNECTION_CLOSED WT_CONNECTION + +/* + * For Connections, Sessions and Cursors created in Java, each of + * WT_CONNECTION_IMPL, WT_SESSION_IMPL and WT_CURSOR have a + * lang_private field that store a pointer to a JAVA_CALLBACK, alloced + * during the various open calls. {conn,session,cursor}CloseHandler() + * functions reach into the associated java object, set the swigCPtr + * to 0, and free the JAVA_CALLBACK. Typemaps matching Connection, + * Session, Cursor args use the NULL_CHECK macro, which checks if + * swigCPtr is 0. + */ +typedef struct { + JNIEnv *jnienv; /* jni env that created the Session/Cursor */ + jobject jobj; /* the java Session/Cursor object */ + jfieldID fid; /* cached Cursor.swigCPtr field id in session */ +} JAVA_CALLBACK; static void throwWiredTigerException(JNIEnv *jenv, const char *msg) { jclass excep = (*jenv)->FindClass(jenv, "com/wiredtiger/db/WiredTigerException"); if (excep) (*jenv)->ThrowNew(jenv, excep, msg); } + %} /* No finalizers */ @@ -71,7 +105,7 @@ static void throwWiredTigerException(JNIEnv *jenv, const char *msg) { %} %typemap(argout) WT_ITEM * %{ - (*jenv)->ReleaseByteArrayElements(jenv, $input, $1->data, 0); + (*jenv)->ReleaseByteArrayElements(jenv, $input, (void *)$1->data, 0); %} %typemap(out) WT_ITEM %{ @@ -94,6 +128,15 @@ static void throwWiredTigerException(JNIEnv *jenv, const char *msg) { $result = $1; %} +%define NULL_CHECK(val, name) + if (!val) { + SWIG_JavaThrowException(jenv, SWIG_JavaNullPointerException, + #name " is null"); + return 0; + } +%enddef + +%define WT_CLASS(type, class, name, closeHandler) /* * Extra 'self' elimination. * The methods we're wrapping look like this: @@ -105,8 +148,26 @@ static void throwWiredTigerException(JNIEnv *jenv, const char *msg) { * and we use consecutive argument matching of typemaps to convert two args to * one. */ -%define WT_CLASS(type, class, name) -%typemap(in, numinputs=0) type *name "$1 = *(type **)&jarg1;" +%typemap(in, numinputs=0) type *name { + $1 = *(type **)&jarg1; + NULL_CHECK($1, $1_name) +} + +%typemap(in, numinputs=0) class ## _CLOSED *name { + $1 = *(type **)&jarg1; + NULL_CHECK($1, $1_name) + closeHandler; +} + +%typemap(in) class ## _NULLABLE * { + $1 = *(type **)&$input; +} + +%typemap(in) type * { + $1 = *(type **)&$input; + NULL_CHECK($1, $1_name) +} + %typemap(javaimports) type " /** * @copydoc class @@ -126,9 +187,9 @@ static void throwWiredTigerException(JNIEnv *jenv, const char *msg) { */ %} -WT_CLASS(struct __wt_connection, WT_CONNECTION, connection) -WT_CLASS(struct __wt_session, WT_SESSION, session) -WT_CLASS(struct __wt_cursor, WT_CURSOR, cursor) +WT_CLASS(struct __wt_connection, WT_CONNECTION, connection, connCloseHandler($1)) +WT_CLASS(struct __wt_session, WT_SESSION, session, sessionCloseHandler($1)) +WT_CLASS(struct __wt_cursor, WT_CURSOR, cursor, cursorCloseHandler($1)) %define COPYDOC(SIGNATURE_CLASS, CLASS, METHOD) %javamethodmodifiers SIGNATURE_CLASS::METHOD " @@ -170,6 +231,103 @@ WT_CLASS(struct __wt_cursor, WT_CURSOR, cursor) enum SearchStatus { FOUND, NOTFOUND, SMALLER, LARGER }; %} +%wrapper %{ +/* Zero out SWIG's pointer to the C object, + * equivalent to 'jobj.swigCPtr = 0;' in java. + */ +static int +javaClose(JAVA_CALLBACK *jcb, jfieldID *pfid) +{ + jclass cls; + jfieldID fid; + JNIEnv *env; + + env = jcb->jnienv; + + if (pfid == NULL || *pfid == NULL) { + cls = (*env)->GetObjectClass(env, jcb->jobj); + *pfid = (*env)->GetFieldID(env, cls, "swigCPtr", "J"); + if (pfid != NULL) + *pfid = fid; + } + (*env)->SetLongField(env, jcb->jobj, *pfid, 0L); + (*env)->DeleteGlobalRef(env, jcb->jobj); + return (0); +} + +/* Connection specific close handler. */ +static int +connCloseHandler(WT_CONNECTION *conn_arg) +{ + int ret; + JAVA_CALLBACK *jcb; + WT_CONNECTION_IMPL *conn; + + conn = (WT_CONNECTION_IMPL *)conn_arg; + jcb = (JAVA_CALLBACK *)conn->lang_private; + conn->lang_private = NULL; + ret = javaClose(jcb, NULL); + __wt_free(conn->default_session, jcb); + + return (0); +} + +/* Session specific close handler. */ +static int +sessionCloseHandler(WT_SESSION *session_arg) +{ + int ret; + JAVA_CALLBACK *jcb; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)session_arg; + jcb = (JAVA_CALLBACK *)session->lang_private; + session->lang_private = NULL; + ret = javaClose(jcb, NULL); + __wt_free(session, jcb); + + return (ret); +} + +/* Cursor specific close handler. */ +static int +cursorCloseHandler(WT_CURSOR *cursor) +{ + int ret; + JAVA_CALLBACK *jcb; + JAVA_CALLBACK *sess_jcb; + + static jfieldID ccp_fid = NULL; /* cursor cptr fid, computed once */ + + jcb = (JAVA_CALLBACK *)cursor->lang_private; + sess_jcb = (JAVA_CALLBACK *) + ((WT_SESSION_IMPL *)cursor->session)->lang_private; + cursor->lang_private = NULL; + ret = javaClose(jcb, &sess_jcb->fid); + __wt_free((WT_SESSION_IMPL *)cursor->session, jcb); + + return (ret); +} + +/* Add event handler support. */ +static int +javaCloseHandler(WT_EVENT_HANDLER *handler, WT_SESSION *session, + WT_CURSOR *cursor) +{ + int ret; + + WT_UNUSED(handler); + + if (cursor != NULL) + ret = cursorCloseHandler(cursor); + else + ret = sessionCloseHandler(session); + return (ret); +} + +WT_EVENT_HANDLER javaApiEventHandler = {NULL, NULL, NULL, javaCloseHandler}; +%} + %extend __wt_cursor { %javamethodmodifiers get_key_wrap "protected"; @@ -237,6 +395,14 @@ enum SearchStatus { FOUND, NOTFOUND, SMALLER, LARGER }; throwWiredTigerException(jenv, wiredtiger_strerror(ret)); return cmp; } + + %javamethodmodifiers java_init "protected"; + int java_init(jobject jcursor) { + JAVA_CALLBACK *jcb = (JAVA_CALLBACK *)$self->lang_private; + jcb->jobj = JCALL1(NewGlobalRef, jcb->jnienv, jcursor); + JCALL1(DeleteLocalRef, jcb->jnienv, jcursor); + return (0); + } } /* Cache key/value formats in Cursor */ @@ -257,6 +423,7 @@ enum SearchStatus { FOUND, NOTFOUND, SMALLER, LARGER }; valueFormat = getValue_format(); keyPacker = new PackOutputStream(keyFormat); valuePacker = new PackOutputStream(valueFormat); + wiredtigerJNI.Cursor_java_init(swigCPtr, this, this); } protected static long getCPtr($javaclassname obj) { @@ -784,14 +951,63 @@ enum SearchStatus { FOUND, NOTFOUND, SMALLER, LARGER }; %rename(Session) __wt_session; %rename(Connection) __wt_connection; +%define TRACKED_CLASS(jclassname, ctypename, java_init_fcn, implclass) +%ignore jclassname::jclassname(); + +%typemap(javabody) struct ctypename %{ + private long swigCPtr; + protected boolean swigCMemOwn; + + protected $javaclassname(long cPtr, boolean cMemoryOwn) { + swigCMemOwn = cMemoryOwn; + swigCPtr = cPtr; + java_init_fcn(swigCPtr, this, this); + } + + protected static long getCPtr($javaclassname obj) { + return (obj == null) ? 0 : obj.swigCPtr; + } +%} + +%extend ctypename { + %javamethodmodifiers java_init "protected"; + int java_init(jobject jsess) { + implclass *session = (implclass *)$self; + JAVA_CALLBACK *jcb = (JAVA_CALLBACK *)session->lang_private; + jcb->jobj = JCALL1(NewGlobalRef, jcb->jnienv, jsess); + JCALL1(DeleteLocalRef, jcb->jnienv, jsess); + return (0); + } +} +%enddef + +TRACKED_CLASS(Session, __wt_session, wiredtigerJNI.Session_java_init, WT_SESSION_IMPL) +TRACKED_CLASS(Connection, __wt_connection, wiredtigerJNI.Connection_java_init, WT_CONNECTION_IMPL) +/* Note: Cursor incorporates the elements of TRACKED_CLASS into its + * custom constructor and %extend clause. + */ + %include "wiredtiger.h" /* Return new connections, sessions and cursors. */ %inline { WT_CONNECTION *wiredtiger_open_wrap(JNIEnv *jenv, const char *home, const char *config) { + extern WT_EVENT_HANDLER javaApiEventHandler; WT_CONNECTION *conn = NULL; + WT_CONNECTION_IMPL *connimpl; + JAVA_CALLBACK *jcb; int ret; - if ((ret = wiredtiger_open(home, NULL, config, &conn)) != 0) + if ((ret = wiredtiger_open(home, &javaApiEventHandler, config, &conn)) != 0) + goto err; + + connimpl = (WT_CONNECTION_IMPL *)conn; + if ((ret = __wt_calloc_def(connimpl->default_session, 1, &jcb)) != 0) + goto err; + + jcb->jnienv = jenv; + connimpl->lang_private = jcb; + +err: if (ret != 0) throwWiredTigerException(jenv, wiredtiger_strerror(ret)); return conn; } @@ -799,22 +1015,48 @@ WT_CONNECTION *wiredtiger_open_wrap(JNIEnv *jenv, const char *home, const char * %extend __wt_connection { WT_SESSION *open_session_wrap(JNIEnv *jenv, const char *config) { + extern WT_EVENT_HANDLER javaApiEventHandler; WT_SESSION *session = NULL; + WT_SESSION_IMPL *sessionimpl; + JAVA_CALLBACK *jcb; int ret; - if ((ret = $self->open_session($self, NULL, config, &session)) != 0) + + if ((ret = $self->open_session($self, &javaApiEventHandler, config, &session)) != 0) + goto err; + + sessionimpl = (WT_SESSION_IMPL *)session; + if ((ret = __wt_calloc_def(sessionimpl, 1, &jcb)) != 0) + goto err; + + jcb->jnienv = jenv; + sessionimpl->lang_private = jcb; + +err: if (ret != 0) throwWiredTigerException(jenv, wiredtiger_strerror(ret)); return session; } } %extend __wt_session { - WT_CURSOR *open_cursor_wrap(JNIEnv *jenv, const char *uri, WT_CURSOR *to_dup, const char *config) { + WT_CURSOR *open_cursor_wrap(JNIEnv *jenv, const char *uri, WT_CURSOR_NULLABLE *to_dup, const char *config) { WT_CURSOR *cursor = NULL; + JAVA_CALLBACK *jcb; int ret; + if ((ret = $self->open_cursor($self, uri, to_dup, config, &cursor)) != 0) + goto err; + + cursor->flags |= WT_CURSTD_RAW; + + if ((ret = __wt_calloc_def((WT_SESSION_IMPL *)cursor->session, + 1, &jcb)) != 0) + goto err; + + jcb->jnienv = jenv; + cursor->lang_private = jcb; + +err: if (ret != 0) throwWiredTigerException(jenv, wiredtiger_strerror(ret)); - else - cursor->flags |= WT_CURSTD_RAW; return cursor; } } diff --git a/lang/python/wiredtiger.i b/lang/python/wiredtiger.i index bd92c246bd3..14effa18ee4 100644 --- a/lang/python/wiredtiger.i +++ b/lang/python/wiredtiger.i @@ -405,6 +405,12 @@ typedef int int_void; %} }; +%extend __wt_session { + int log_printf(const char *msg) { + return self->log_printf(self, "%s", msg); + } +}; + /* Remove / rename parts of the C API that we don't want in Python. */ %immutable __wt_cursor::session; %immutable __wt_cursor::uri; @@ -412,20 +418,20 @@ typedef int int_void; %ignore __wt_cursor::value_format; %immutable __wt_session::connection; -%ignore __wt_buf; +%ignore __wt_collator; +%ignore __wt_compressor; %ignore __wt_config_item; +%ignore __wt_data_source; %ignore __wt_event_handler; %ignore __wt_extractor; %ignore __wt_item; -%ignore __wt_collator; %ignore __wt_connection::add_collator; -%ignore __wt_compressor; %ignore __wt_connection::add_compressor; -%ignore __wt_data_source; %ignore __wt_connection::add_data_source; %ignore __wt_connection::add_extractor; %ignore __wt_connection::get_extension_api; +%ignore __wt_session::log_printf; %ignore wiredtiger_struct_pack; %ignore wiredtiger_struct_size; diff --git a/src/block/block_ext.c b/src/block/block_ext.c index 306fad1f265..ba704cf19c3 100644 --- a/src/block/block_ext.c +++ b/src/block/block_ext.c @@ -867,7 +867,7 @@ __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b) } /* - * __wt_block_insert_ext, __block_merge -- + * __wt_block_insert_ext -- * Insert an extent into an extent list, merging if possible. */ int @@ -887,6 +887,12 @@ __wt_block_insert_ext( */ return (__block_merge(session, el, off, size)); } + +/* + * __block_merge -- + * Insert an extent into an extent list, merging if possible (internal + * version). + */ static int __block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size) { @@ -1239,6 +1245,10 @@ __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el) memset(el, 0, sizeof(*el)); } +/* + * __block_extlist_dump -- + * Dump an extent list as verbose messages. + */ static int __block_extlist_dump( WT_SESSION_IMPL *session, const char *tag, WT_EXTLIST *el, int show_size) diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c index b688e994293..200bbf101df 100644 --- a/src/block/block_mgr.c +++ b/src/block/block_mgr.c @@ -22,7 +22,7 @@ __bm_readonly(WT_BM *bm, WT_SESSION_IMPL *session) } /* - * __bm_addr_string + * __bm_addr_string -- * Return a printable string representation of an address cookie. */ static int diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 44eed6ba0c5..936a112d07d 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -683,19 +683,32 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) { WT_BTREE *btree; WT_CURSOR_BTREE *cbt; + WT_DECL_RET; WT_SESSION_IMPL *session; cbt = (start != NULL) ? start : stop; session = (WT_SESSION_IMPL *)cbt->iface.session; btree = cbt->btree; + /* + * For recovery, we log the start and stop keys for a truncate + * operation, not the individual records removed. On the other hand, + * for rollback we need to keep track of all the in-memory operations. + * + * We deal with this here by logging the truncate range first, then (in + * the logging code) disabling writing of the in-memory remove records + * to disk. + */ + if (S2C(session)->logging) + WT_RET(__wt_txn_truncate_log(session, start, stop)); + switch (btree->type) { case BTREE_COL_FIX: - WT_RET(__cursor_truncate_fix( + WT_ERR(__cursor_truncate_fix( session, start, stop, __wt_col_modify)); break; case BTREE_COL_VAR: - WT_RET(__cursor_truncate( + WT_ERR(__cursor_truncate( session, start, stop, __wt_col_modify)); break; case BTREE_ROW: @@ -710,15 +723,17 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) * are positioned in the tree. */ if (start != NULL) - WT_RET(__wt_btcur_search(start)); + WT_ERR(__wt_btcur_search(start)); if (stop != NULL) - WT_RET(__wt_btcur_search(stop)); - WT_RET(__cursor_truncate( + WT_ERR(__wt_btcur_search(stop)); + WT_ERR(__cursor_truncate( session, start, stop, __wt_row_modify)); break; } - return (0); +err: if (S2C(session)->logging) + WT_TRET(__wt_txn_truncate_end(session)); + return (ret); } /* diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c index 404aaa5a5bb..1c7f0e205b8 100644 --- a/src/btree/bt_evict.c +++ b/src/btree/bt_evict.c @@ -169,7 +169,8 @@ __wt_cache_evict_server(void *arg) F_CLR(cache, WT_EVICT_ACTIVE); WT_VERBOSE_ERR(session, evictserver, "sleeping"); /* Don't rely on signals: check periodically. */ - WT_ERR(__wt_cond_wait(session, cache->evict_cond, 100000)); + WT_ERR_TIMEDOUT_OK( + __wt_cond_wait(session, cache->evict_cond, 100000)); WT_VERBOSE_ERR(session, evictserver, "waking"); } @@ -350,9 +351,9 @@ __wt_evict_page(WT_SESSION_IMPL *session, WT_PAGE *page) } /* - * __wt_evict_file_exclusive_on + * __wt_evict_file_exclusive_on -- * Get exclusive eviction access to a file and discard any of the file's - * blocks queued for eviction. + * blocks queued for eviction. */ void __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) @@ -398,7 +399,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) } /* - * __wt_evict_file_exclusive_off + * __wt_evict_file_exclusive_off -- * Release exclusive eviction access to a file. */ void diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 6db8534aca4..1e4d073a290 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -197,6 +197,10 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) "%" PRIu64 ".%" PRIu64, maj_version, min_version); } + /* Get the file ID. */ + WT_RET(__wt_config_gets(session, cfg, "id", &cval)); + btree->id = (uint32_t)cval.val; + /* Validate file types and check the data format plan. */ WT_RET(__wt_config_gets(session, cfg, "key_format", &cval)); WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL)); @@ -513,7 +517,7 @@ __wt_btree_new_leaf_page( } /* - * __wt_btree_no_eviction -- + * __wt_btree_evictable -- * Setup or release a cache-resident tree. */ void @@ -701,6 +705,10 @@ __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) return (split_size); } +/* + * pse1 -- + * Page size error message 1. + */ static int pse1(WT_SESSION_IMPL *session, const char *type, uint32_t max, uint32_t ovfl) { @@ -710,6 +718,10 @@ pse1(WT_SESSION_IMPL *session, const char *type, uint32_t max, uint32_t ovfl) type, max, ovfl); } +/* + * pse2 -- + * Page size error message 2. + */ static int pse2(WT_SESSION_IMPL *session, const char *type, uint32_t max, uint32_t ovfl, int pct) diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index fff5eadb9b9..40596a889d5 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -16,7 +16,7 @@ static int __inmem_row_leaf_entries( WT_SESSION_IMPL *, WT_PAGE_HEADER *, uint32_t *); /* - * __wt_page_in -- + * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c index b502d8d3409..a43ff27fa84 100644 --- a/src/btree/bt_ret.c +++ b/src/btree/bt_ret.c @@ -8,6 +8,42 @@ #include "wt_internal.h" /* + * __wt_row_key_get -- + * Get a reference to the current key. + */ +int +__wt_row_key_get(WT_CURSOR_BTREE *cbt, WT_ITEM *key) +{ + WT_PAGE *page; + WT_ROW *rip; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + page = cbt->page; + + switch (page->type) { + case WT_PAGE_ROW_LEAF: + rip = &page->u.row.d[cbt->slot]; + + /* + * If the cursor references a WT_INSERT item, take the key from + * there. Otherwise, take the key from the original page, and + * the value from any related WT_UPDATE item, or the page if + * the key was never updated. + */ + if (cbt->ins != NULL) { + key->data = WT_INSERT_KEY(cbt->ins); + key->size = WT_INSERT_KEY_SIZE(cbt->ins); + } else + WT_RET(__wt_row_leaf_key(session, page, rip, key, 1)); + break; + WT_ILLEGAL_VALUE(session); + } + + return (0); +} + +/* * __wt_kv_return -- * Return a page referenced key/value pair to the application. */ @@ -78,10 +114,10 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) * original page, and the value from any related WT_UPDATE item, * or the page if the key was never updated. */ - if (cbt->ins != NULL && - (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) { + if (cbt->ins != NULL) { cursor->key.data = WT_INSERT_KEY(cbt->ins); cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins); + upd = __wt_txn_read(session, cbt->ins->upd); } else { WT_RET(__wt_row_leaf_key( session, page, rip, &cursor->key, 0)); diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 1ce32d86cb7..07a5b0c827c 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -2218,7 +2218,7 @@ __slvg_ovfl_discard(WT_SESSION_IMPL *session, WT_STUFF *ss) } /* - * __slvg_free -- + * __slvg_cleanup -- * Discard memory allocated to the page and overflow arrays. */ static int diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index 98c3cc9c3ef..69d4583b4db 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -8,7 +8,7 @@ #include "wt_internal.h" /* - * __tree_walk_delete_rollback -- + * __wt_tree_walk_delete_rollback -- * Abort pages that were deleted without being instantiated. */ void diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c index 0e157e80bf8..e88b3d23a47 100644 --- a/src/btree/col_modify.c +++ b/src/btree/col_modify.c @@ -78,7 +78,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove) /* Allocate the WT_UPDATE structure and transaction ID. */ WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); - WT_ERR(__wt_txn_modify(session, &upd->txnid)); + WT_ERR(__wt_txn_modify(session, cbt, upd)); logged = 1; /* @@ -121,8 +121,6 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove) WT_ERR(__col_insert_alloc( session, recno, skipdepth, &ins, &ins_size)); WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); - WT_ERR(__wt_txn_modify(session, &upd->txnid)); - logged = 1; ins->upd = upd; ins_size += upd_size; @@ -132,6 +130,8 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove) */ cbt->ins_head = ins_head; cbt->ins = ins; + WT_ERR(__wt_txn_modify(session, cbt, upd)); + logged = 1; /* * If there was no insert list during the search, or there was diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c index 54409493a30..e616faa1e7f 100644 --- a/src/btree/rec_evict.c +++ b/src/btree/rec_evict.c @@ -137,7 +137,7 @@ done: session->excl_next = 0; } /* - * __rec_root_update -- + * __rec_root_update -- * Update a root page's reference on eviction (clean or dirty). */ static void @@ -147,7 +147,7 @@ __rec_root_update(WT_SESSION_IMPL *session) } /* - * __rec_page_clean_update -- + * __rec_page_clean_update -- * Update a clean page's reference on eviction. */ static void diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c index 5be4899f1c6..1e04d726588 100644 --- a/src/btree/rec_write.c +++ b/src/btree/rec_write.c @@ -2136,8 +2136,12 @@ __wt_rec_row_bulk_insert(WT_CURSOR_BULK *cbulk) #define WT_FIX_ENTRIES(btree, bytes) (((bytes) * 8) / (btree)->bitcnt) +/* + * __rec_col_fix_bulk_insert_split_check -- + * Check if a bulk-loaded fixed-length column store page needs to split. + */ static inline int -__rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) +__rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) { WT_BTREE *btree; WT_RECONCILE *r; @@ -3778,7 +3782,7 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page) } /* - * __rec_write_wrapup -- + * __rec_write_wrapup -- * Finish the reconciliation. */ static int @@ -4025,7 +4029,7 @@ err: __wt_scr_free(&tkey); } /* - * __rec_write_wrapup_err -- + * __rec_write_wrapup_err -- * Finish the reconciliation on error. */ static int diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c index 3d05d7c7a49..8e791fe59f2 100644 --- a/src/btree/row_modify.c +++ b/src/btree/row_modify.c @@ -62,7 +62,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove) /* Allocate the WT_UPDATE structure and transaction ID. */ WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); - WT_ERR(__wt_txn_modify(session, &upd->txnid)); + WT_ERR(__wt_txn_modify(session, cbt, upd)); logged = 1; /* @@ -108,8 +108,6 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove) WT_ERR(__wt_row_insert_alloc( session, key, skipdepth, &ins, &ins_size)); WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); - WT_ERR(__wt_txn_modify(session, &upd->txnid)); - logged = 1; ins->upd = upd; ins_size += upd_size; @@ -119,6 +117,8 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove) */ cbt->ins_head = ins_head; cbt->ins = ins; + WT_ERR(__wt_txn_modify(session, cbt, upd)); + logged = 1; /* * If there was no insert list during the search, the cursor's @@ -155,6 +155,7 @@ err: /* if (logged) __wt_txn_unmodify(session); __wt_free(session, ins); + cbt->ins = NULL; __wt_free(session, upd); } @@ -282,7 +283,7 @@ __wt_update_obsolete_free( } /* - * __wt_page_obsolete -- + * __wt_row_leaf_obsolete -- * Discard all obsolete updates on a row-store leaf page. */ void diff --git a/src/config/config.c b/src/config/config.c index 5bfb1646288..acff9c29e28 100644 --- a/src/config/config.c +++ b/src/config/config.c @@ -486,7 +486,7 @@ __config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value) } while (0) /* - * __config_process_value + * __config_process_value -- * Deal with special config values like true / false. */ static int diff --git a/src/config/config_check.c b/src/config/config_check.c index 08d65e72d18..d613e3c734e 100644 --- a/src/config/config_check.c +++ b/src/config/config_check.c @@ -210,9 +210,9 @@ err: if (entry != NULL) { } /* - * __wt_config_check-- + * __wt_config_check -- * Check the keys in an application-supplied config string match what is - * specified in an array of check strings. + * specified in an array of check strings. */ int __wt_config_check(WT_SESSION_IMPL *session, diff --git a/src/config/config_def.c b/src/config/config_def.c index f089d4592da..79dce2fa7b7 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -46,8 +46,8 @@ static const WT_CONFIG_CHECK confchk_connection_reconfigure[] = { { "verbose", "list", "choices=[\"block\",\"ckpt\",\"compact\",\"evict\"," "\"evictserver\",\"fileops\",\"hazard\",\"log\",\"lsm\",\"mutex\"" - ",\"overflow\",\"read\",\"readserver\",\"reconcile\",\"salvage\"," - "\"shared_cache\",\"verify\",\"version\",\"write\"]", + ",\"overflow\",\"read\",\"readserver\",\"reconcile\",\"recovery\"" + ",\"salvage\",\"shared_cache\",\"verify\",\"version\",\"write\"]", NULL}, { NULL, NULL, NULL, NULL } }; @@ -60,6 +60,7 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = { { "block_compressor", "string", NULL, NULL}, { "cache_resident", "boolean", NULL, NULL}, { "checkpoint", "string", NULL, NULL}, + { "checkpoint_lsn", "string", NULL, NULL}, { "checksum", "string", "choices=[\"on\",\"off\",\"uncompressed\"]", NULL}, @@ -69,6 +70,7 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = { { "format", "string", "choices=[\"btree\"]", NULL}, { "huffman_key", "string", NULL, NULL}, { "huffman_value", "string", NULL, NULL}, + { "id", "string", NULL, NULL}, { "internal_item_max", "int", "min=0", NULL}, { "internal_key_truncate", "boolean", NULL, NULL}, { "internal_page_max", "int", "min=512B,max=512MB", NULL}, @@ -216,7 +218,7 @@ static const WT_CONFIG_CHECK confchk_checkpoint_subconfigs[] = { static const WT_CONFIG_CHECK confchk_log_subconfigs[] = { { "archive", "boolean", NULL, NULL }, { "enabled", "boolean", NULL, NULL }, - { "file_max", "int", "min=1MB,max=2GB", NULL }, + { "file_max", "int", "min=100KB,max=2GB", NULL }, { "path", "string", NULL, NULL }, { NULL, NULL, NULL, NULL } }; @@ -263,8 +265,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "verbose", "list", "choices=[\"block\",\"ckpt\",\"compact\",\"evict\"," "\"evictserver\",\"fileops\",\"hazard\",\"log\",\"lsm\",\"mutex\"" - ",\"overflow\",\"read\",\"readserver\",\"reconcile\",\"salvage\"," - "\"shared_cache\",\"verify\",\"version\",\"write\"]", + ",\"overflow\",\"read\",\"readserver\",\"reconcile\",\"recovery\"" + ",\"salvage\",\"shared_cache\",\"verify\",\"version\",\"write\"]", NULL}, { NULL, NULL, NULL, NULL } }; @@ -316,13 +318,14 @@ static const WT_CONFIG_ENTRY config_entries[] = { }, { "file.meta", "allocation_size=4KB,block_allocation=best,block_compressor=," - "cache_resident=0,checkpoint=,checksum=uncompressed,collator=," - "columns=,dictionary=0,format=btree,huffman_key=,huffman_value=," - "internal_item_max=0,internal_key_truncate=,internal_page_max=4KB" - ",key_format=u,key_gap=10,leaf_item_max=0,leaf_page_max=1MB," - "memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0," - "prefix_compression=,prefix_compression_min=4,split_pct=75," - "value_format=u,version=(major=0,minor=0)", + "cache_resident=0,checkpoint=,checkpoint_lsn=," + "checksum=uncompressed,collator=,columns=,dictionary=0," + "format=btree,huffman_key=,huffman_value=,id=,internal_item_max=0" + ",internal_key_truncate=,internal_page_max=4KB,key_format=u," + "key_gap=10,leaf_item_max=0,leaf_page_max=1MB,memory_page_max=5MB" + ",os_cache_dirty_max=0,os_cache_max=0,prefix_compression=," + "prefix_compression_min=4,split_pct=75,value_format=u," + "version=(major=0,minor=0)", confchk_file_meta }, { "index.meta", diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index c7db3aea0f8..97eef10ff6b 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -962,6 +962,7 @@ __conn_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "overflow", WT_VERB_overflow }, { "read", WT_VERB_read }, { "reconcile", WT_VERB_reconcile }, + { "recovery", WT_VERB_recovery }, { "salvage", WT_VERB_salvage }, { "shared_cache", WT_VERB_shared_cache }, { "verify", WT_VERB_verify }, diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index dfb290f64c4..262cea40d05 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -361,6 +361,10 @@ err: __wt_spin_unlock(NULL, &cp->cache_pool_lock); return (ret); } +/* + * __cache_pool_assess -- + * Assess the usage of the cache pool. + */ static int __cache_pool_assess(uint64_t *phighest) { @@ -403,9 +407,10 @@ __cache_pool_assess(uint64_t *phighest) } /* - * Adjust the allocation of cache to each connection. If force is set ignore - * cache load information, and reduce the allocation for every connection - * allocated more than their reserved size. + * __cache_pool_adjust -- + * Adjust the allocation of cache to each connection. If force is set + * ignore cache load information, and reduce the allocation for every + * connection allocated more than their reserved size. */ static int __cache_pool_adjust(uint64_t highest, uint64_t bump_threshold) @@ -512,7 +517,7 @@ __wt_cache_pool_server(void *arg) while (F_ISSET(cp, WT_CACHE_POOL_RUN)) { if (cp->currently_used <= cp->size) - WT_ERR(__wt_cond_wait( + WT_ERR_TIMEDOUT_OK(__wt_cond_wait( session, cp->cache_pool_cond, 1000000)); /* * Re-check pool run flag - since we want to avoid getting the diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c index 511904a4565..ba8eed03e47 100644 --- a/src/conn/conn_ckpt.c +++ b/src/conn/conn_ckpt.c @@ -73,7 +73,7 @@ __ckpt_server(void *arg) WT_ERR(wt_session->checkpoint(wt_session, conn->ckpt_config)); /* Wait... */ - WT_ERR( + WT_ERR_TIMEDOUT_OK( __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs)); } @@ -84,7 +84,7 @@ err: __wt_err(session, ret, "checkpoint server error"); } /* - * __wt_checkpoint_create - + * __wt_checkpoint_create -- * Start the checkpoint server thread. */ int @@ -120,7 +120,7 @@ __wt_checkpoint_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) } /* - * __wt_checkpoint_destroy - + * __wt_checkpoint_destroy -- * Destroy the checkpoint server thread. */ int diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 55f6f997a40..6bdd93f7a6b 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -577,11 +577,13 @@ __wt_conn_btree_close(WT_SESSION_IMPL *session, int locked) if (!inuse) { /* * We should only close the metadata file when closing the - * last session (i.e., the default session for the connection). + * last session (i.e., the default session for the connection) + * or at the end of recovery. */ WT_ASSERT(session, S2BT(session) != session->metafile || - session == S2C(session)->default_session); + session == S2C(session)->default_session || + F_ISSET(session, WT_SESSION_LOGGING_DISABLED)); if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) { WT_TRET(__wt_conn_btree_sync_and_close(session)); diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 00e7e14aaca..3cd02e6143a 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -90,7 +90,8 @@ __log_archive_server(void *arg) * it gets turned back on and check again. */ if (conn->archive == 0) { - WT_ERR(__wt_cond_wait(session, conn->arch_cond, 0)); + WT_ERR_TIMEDOUT_OK( + __wt_cond_wait(session, conn->arch_cond, 0)); continue; } @@ -132,7 +133,8 @@ __log_archive_server(void *arg) log->first_lsn.offset = 0; /* Wait until the next event. */ - WT_ERR(__wt_cond_wait(session, conn->arch_cond, 0)); + WT_ERR_TIMEDOUT_OK( + __wt_cond_wait(session, conn->arch_cond, 0)); } if (0) { @@ -180,8 +182,11 @@ __wt_logmgr_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) INIT_LSN(&log->ckpt_lsn); INIT_LSN(&log->first_lsn); INIT_LSN(&log->sync_lsn); + INIT_LSN(&log->trunc_lsn); INIT_LSN(&log->write_lsn); log->fileid = 0; + WT_RET(__wt_cond_alloc(session, + "log release", 0, &log->log_release_cond)); WT_RET(__wt_log_open(session)); WT_RET(__wt_log_slot_init(session)); @@ -249,6 +254,9 @@ __wt_logmgr_destroy(WT_CONNECTION_IMPL *conn) WT_TRET(wt_session->close(wt_session, NULL)); conn->arch_session = NULL; } + + WT_TRET(__wt_log_slot_destroy(session)); + WT_TRET(__wt_cond_destroy(session, &conn->log->log_release_cond)); __wt_spin_destroy(session, &conn->log->log_lock); __wt_spin_destroy(session, &conn->log->log_slot_lock); __wt_free(session, conn->log); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index 81f334143a8..220dea45edc 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -77,7 +77,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * exit before files are closed. */ F_CLR(conn, WT_CONN_SERVER_RUN); - WT_TRET(__wt_logmgr_destroy(conn)); WT_TRET(__wt_checkpoint_destroy(conn)); WT_TRET(__wt_statlog_destroy(conn)); @@ -87,6 +86,17 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) /* Close open data handles. */ WT_TRET(__wt_conn_dhandle_discard(conn)); + /* + * Now that all data handles are closed, tell logging that a checkpoint + * has completed then shut down the log manager (only after closing + * data handles). + */ + if (conn->logging) { + WT_TRET(__wt_txn_checkpoint_log( + session, 1, WT_TXN_LOG_CKPT_STOP, NULL)); + WT_TRET(__wt_logmgr_destroy(conn)); + } + /* Free memory for collators */ while ((ncoll = TAILQ_FIRST(&conn->collqh)) != NULL) WT_TRET(__wt_conn_remove_collator(conn, ncoll)); diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index f26d1514e2b..35fddcfeaf0 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -272,7 +272,7 @@ __statlog_server(void *arg) * statistics and check again. */ if (conn->stat_all == 0 && conn->stat_fast == 0) { - WT_ERR(__wt_cond_wait( + WT_ERR_TIMEDOUT_OK(__wt_cond_wait( session, conn->stat_cond, conn->stat_usecs)); continue; } @@ -341,7 +341,7 @@ __statlog_server(void *arg) WT_ERR(fflush(fp) == 0 ? 0 : __wt_errno()); /* Wait until the next event. */ - WT_ERR( + WT_ERR_TIMEDOUT_OK( __wt_cond_wait(session, conn->stat_cond, conn->stat_usecs)); } diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c index 56691d88ba3..22ae7bf91ae 100644 --- a/src/cursor/cur_backup.c +++ b/src/cursor/cur_backup.c @@ -214,9 +214,10 @@ __backup_start( WT_ERR(__backup_list_append(session, cb, WT_METADATA_BACKUP)); WT_ERR(__backup_list_append(session, cb, WT_SINGLETHREAD)); - /* Add log files if logging is on. */ - if (conn->log) { - WT_ERR(__wt_log_getfiles(session, &logfiles, &logcount)); + /* Add log files if logging is on and we're doing a full backup. */ + if (!target_list && conn->log) { + WT_ERR( + __wt_log_get_active_files(session, &logfiles, &logcount)); for (i = 0; i < logcount; i++) WT_ERR(__backup_list_append(session, cb, logfiles[i])); } diff --git a/src/cursor/cur_ds.c b/src/cursor/cur_ds.c index 9e849e664e9..dc8e1618ccb 100644 --- a/src/cursor/cur_ds.c +++ b/src/cursor/cur_ds.c @@ -36,7 +36,7 @@ __curds_txn_leave(WT_SESSION_IMPL *session) } /* - * __curds_key_set - + * __curds_key_set -- * Set the key for the data-source. */ static int @@ -57,7 +57,7 @@ err: return (ret); } /* - * __curds_value_set - + * __curds_value_set -- * Set the value for the data-source. */ static int @@ -77,7 +77,7 @@ err: return (ret); } /* - * __curds_cursor_resolve - + * __curds_cursor_resolve -- * Resolve cursor operation. */ static int diff --git a/src/cursor/cur_dump.c b/src/cursor/cur_dump.c index 7883996f7cb..f89cad2ffd9 100644 --- a/src/cursor/cur_dump.c +++ b/src/cursor/cur_dump.c @@ -247,6 +247,10 @@ WT_CURDUMP_PASS(prev) WT_CURDUMP_PASS(reset) WT_CURDUMP_PASS(search) +/* + * __curdump_search_near -- + * WT_CURSOR::search_near for dump cursors. + */ static int __curdump_search_near(WT_CURSOR *cursor, int *exact) { @@ -260,6 +264,10 @@ WT_CURDUMP_PASS(insert) WT_CURDUMP_PASS(update) WT_CURDUMP_PASS(remove) +/* + * __curdump_close -- + * WT_CURSOR::close for dump cursors. + */ static int __curdump_close(WT_CURSOR *cursor) { diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index 34987898ccf..b6b597816a2 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -303,6 +303,10 @@ err: API_END(session); return (ret); } +/* + * __curindex_open_colgroups -- + * Open cursors on the column groups required for an index cursor. + */ static int __curindex_open_colgroups( WT_SESSION_IMPL *session, WT_CURSOR_INDEX *cindex, const char *cfg_arg[]) diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c index efb22e58174..3f85e65b353 100644 --- a/src/cursor/cur_std.c +++ b/src/cursor/cur_std.c @@ -19,7 +19,7 @@ __wt_cursor_notsup(WT_CURSOR *cursor) return (ENOTSUP); } -/* +/* * __wt_cursor_noop -- * Cursor noop. */ @@ -142,6 +142,44 @@ __wt_cursor_set_raw_key(WT_CURSOR *cursor, WT_ITEM *key) } /* + * __wt_cursor_get_raw_value -- + * Temporarily force raw mode in a cursor to get a canonical copy of + * the value. + */ +int +__wt_cursor_get_raw_value(WT_CURSOR *cursor, WT_ITEM *value) +{ + WT_DECL_RET; + int raw_set; + + raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0; + if (!raw_set) + F_SET(cursor, WT_CURSTD_RAW); + ret = cursor->get_value(cursor, value); + if (!raw_set) + F_CLR(cursor, WT_CURSTD_RAW); + return (ret); +} + +/* + * __wt_cursor_set_raw_value -- + * Temporarily force raw mode in a cursor to set a canonical copy of + * the value. + */ +void +__wt_cursor_set_raw_value(WT_CURSOR *cursor, WT_ITEM *value) +{ + int raw_set; + + raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0; + if (!raw_set) + F_SET(cursor, WT_CURSTD_RAW); + cursor->set_value(cursor, value); + if (!raw_set) + F_CLR(cursor, WT_CURSTD_RAW); +} + +/* * __wt_cursor_get_keyv -- * WT_CURSOR->get_key worker function. */ diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index 33eb7df53d7..86337a568b7 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -36,6 +36,7 @@ static int __curtable_update(WT_CURSOR *cursor); F_SET(*__cp, WT_CURSTD_KEY_EXT | \ WT_CURSTD_VALUE_EXT); \ WT_ERR((*__cp)->f(*__cp)); \ + WT_ERR((*__cp)->reset(*__cp)); \ } \ } while (0) @@ -602,6 +603,10 @@ err: API_END(session); return (ret); } +/* + * __curtable_open_colgroups -- + * Open cursors on column groups for a table cursor. + */ static int __curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[]) { @@ -638,6 +643,10 @@ __curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[]) return (0); } +/* + * __curtable_open_indices -- + * Open cursors on indices for a table cursor. + */ static int __curtable_open_indices(WT_CURSOR_TABLE *ctable) { diff --git a/src/docs/Doxyfile b/src/docs/Doxyfile index fefc925922a..5f7e5016892 100644 --- a/src/docs/Doxyfile +++ b/src/docs/Doxyfile @@ -1584,7 +1584,9 @@ PREDEFINED = DOXYGEN \ __wt_extension_api:=WT_EXTENSION_API \ __wt_extractor:=WT_EXTRACTOR \ __wt_item:=WT_ITEM \ - __wt_session:=WT_SESSION + __wt_session:=WT_SESSION \ + WT_HANDLE_CLOSED(x):=x \ + WT_HANDLE_NULLABLE(x):=x # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then # this tag can be used to specify a list of macro names that should be expanded. diff --git a/src/include/api.h b/src/include/api.h index 17624c60244..dacb583e596 100644 --- a/src/include/api.h +++ b/src/include/api.h @@ -95,6 +95,9 @@ #define SESSION_API_CALL(s, n, config, cfg) \ API_CALL(s, session, n, NULL, NULL, config, cfg) +#define SESSION_API_CALL_NO_CONF(s, n) \ + API_CALL_NOCONF(s, session, n, NULL, NULL) + #define SESSION_TXN_API_CALL(s, n, config, cfg) \ TXN_API_CALL(s, session, n, NULL, NULL, config, cfg) diff --git a/src/include/btree.h b/src/include/btree.h index 686bbd14d3d..4958c88716a 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -66,6 +66,8 @@ struct __wt_btree { /* Row-store comparison function */ WT_COLLATOR *collator; /* Comparison function */ + uint32_t id; /* File ID, for logging */ + uint32_t key_gap; /* Row-store prefix key gap */ uint32_t allocsize; /* Allocation size */ diff --git a/src/include/connection.h b/src/include/connection.h index b81f3f470ab..d94144ddb05 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -129,7 +129,7 @@ struct __wt_connection_impl { TAILQ_HEAD(__wt_block_qh, __wt_block) blockqh; u_int open_btree_count; /* Locked: open writable btree count */ - u_int next_file_id; /* Locked: file ID counter */ + uint32_t next_file_id; /* Locked: file ID counter */ /* * WiredTiger allocates space for 50 simultaneous sessions (threads of @@ -225,6 +225,8 @@ struct __wt_connection_impl { /* Locked: data source list */ TAILQ_HEAD(__wt_dsrc_qh, __wt_named_data_source) dsrcqh; + void *lang_private; /* Language specific private storage */ + /* If non-zero, all buffers used for I/O will be aligned to this. */ size_t buffer_alignment; diff --git a/src/include/cursor.h b/src/include/cursor.h index 0fe5bbeed4e..64b7451a9fc 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -45,6 +45,7 @@ { NULL, NULL }, /* TAILQ_ENTRY q */ \ 0, /* recno key */ \ { 0 }, /* recno raw buffer */ \ + NULL, /* lang_private */ \ { NULL, 0, 0, NULL, 0 }, /* WT_ITEM key */ \ { NULL, 0, 0, NULL, 0 }, /* WT_ITEM value */ \ 0, /* int saved_err */ \ diff --git a/src/include/error.h b/src/include/error.h index 6497a92d331..d1251d0fa1a 100644 --- a/src/include/error.h +++ b/src/include/error.h @@ -48,6 +48,14 @@ goto err; \ } \ } while (0) +#define WT_ERR_TIMEDOUT_OK(a) do { \ + if ((ret = (a)) != 0) { \ + if (ret == ETIMEDOUT) \ + ret = 0; \ + else \ + goto err; \ + } \ +} while (0) #define WT_ERR_TEST(a, v) do { \ if (a) { \ ret = (v); \ @@ -75,6 +83,11 @@ if ((__ret = (a)) != 0 && __ret != WT_NOTFOUND) \ return (__ret); \ } while (0) +#define WT_RET_TIMEDOUT_OK(a) do { \ + int __ret; \ + if ((__ret = (a)) != 0 && __ret != ETIMEDOUT) \ + return (__ret); \ +} while (0) /* Set "ret" if not already set. */ #define WT_TRET(a) do { \ diff --git a/src/include/extern.h b/src/include/extern.h index 101be45994a..6923661d3cb 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -351,6 +351,7 @@ extern int __wt_page_inmem( WT_SESSION_IMPL *session, extern int __wt_cache_read(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref); +extern int __wt_row_key_get(WT_CURSOR_BTREE *cbt, WT_ITEM *key); extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, @@ -653,6 +654,8 @@ extern int __wt_cursor_get_key(WT_CURSOR *cursor, ...); extern void __wt_cursor_set_key(WT_CURSOR *cursor, ...); extern int __wt_cursor_get_raw_key(WT_CURSOR *cursor, WT_ITEM *key); extern void __wt_cursor_set_raw_key(WT_CURSOR *cursor, WT_ITEM *key); +extern int __wt_cursor_get_raw_value(WT_CURSOR *cursor, WT_ITEM *value); +extern void __wt_cursor_set_raw_value(WT_CURSOR *cursor, WT_ITEM *value); extern int __wt_cursor_get_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap); extern void __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap); extern int __wt_cursor_get_value(WT_CURSOR *cursor, ...); @@ -674,9 +677,13 @@ extern int __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); -extern int __wt_log_getfiles(WT_SESSION_IMPL *session, - char ***files, - u_int *count); +extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); +extern int __wt_log_get_files(WT_SESSION_IMPL *session, + char ***filesp, + u_int *countp); +extern int __wt_log_get_active_files( WT_SESSION_IMPL *session, + char ***filesp, + u_int *countp); extern void __wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count); @@ -709,16 +716,124 @@ extern int __wt_log_write(WT_SESSION_IMPL *session, extern int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap); +extern int __wt_logrec_alloc(WT_SESSION_IMPL *session, + size_t size, + WT_ITEM **logrecp); +extern void __wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp); +extern int __wt_logrec_read(WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + uint32_t *rectypep); +extern int __wt_logop_read(WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + uint32_t *optypep, + uint32_t *opsizep); +extern int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, + WT_ITEM *logrec, + uint32_t fileid, + uint64_t recno, + WT_ITEM *value); +extern int __wt_logop_col_put_unpack( WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + uint32_t *fileidp, + uint64_t *recnop, + WT_ITEM *valuep); +extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + FILE *out); +extern int __wt_logop_col_remove_pack( WT_SESSION_IMPL *session, + WT_ITEM *logrec, + uint32_t fileid, + uint64_t recno); +extern int __wt_logop_col_remove_unpack( WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + uint32_t *fileidp, + uint64_t *recnop); +extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + FILE *out); +extern int __wt_logop_col_truncate_pack( WT_SESSION_IMPL *session, + WT_ITEM *logrec, + uint32_t fileid, + uint64_t start, + uint64_t stop); +extern int __wt_logop_col_truncate_unpack( WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + uint32_t *fileidp, + uint64_t *startp, + uint64_t *stopp); +extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + FILE *out); +extern int __wt_logop_row_put_pack( WT_SESSION_IMPL *session, + WT_ITEM *logrec, + uint32_t fileid, + WT_ITEM *key, + WT_ITEM *value); +extern int __wt_logop_row_put_unpack( WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + uint32_t *fileidp, + WT_ITEM *keyp, + WT_ITEM *valuep); +extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + FILE *out); +extern int __wt_logop_row_remove_pack( WT_SESSION_IMPL *session, + WT_ITEM *logrec, + uint32_t fileid, + WT_ITEM *key); +extern int __wt_logop_row_remove_unpack( WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + uint32_t *fileidp, + WT_ITEM *keyp); +extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + FILE *out); +extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, + WT_ITEM *logrec, + uint32_t fileid, + WT_ITEM *start, + WT_ITEM *stop, + uint32_t mode); +extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + uint32_t *fileidp, + WT_ITEM *startp, + WT_ITEM *stopp, + uint32_t *modep); +extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + FILE *out); +extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + FILE *out); extern int __wt_log_slot_init(WT_SESSION_IMPL *session); +extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session); extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslotp); extern int __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); -extern int __wt_log_slot_notify(WT_LOGSLOT *slot); -extern int __wt_log_slot_wait(WT_LOGSLOT *slot); +extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); +extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size); extern int __wt_log_slot_free(WT_LOGSLOT *slot); +extern int __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, + int64_t newsize); extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, @@ -809,9 +924,10 @@ extern int __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, extern int __wt_meta_ckptlist_get( WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep); -extern int __wt_meta_ckptlist_set( WT_SESSION_IMPL *session, +extern int __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, const char *fname, - WT_CKPT *ckptbase); + WT_CKPT *ckptbase, + WT_LSN *ckptlsn); extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase); extern void __wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt); @@ -1304,6 +1420,7 @@ extern int __wt_ispo2(uint32_t v); extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2); extern uint32_t __wt_random(void); extern int __wt_buf_grow(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size); +extern int __wt_buf_extend(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size); extern int __wt_buf_init(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size); extern int __wt_buf_initsize(WT_SESSION_IMPL *session, WT_ITEM *buf, @@ -1386,3 +1503,19 @@ extern uint64_t __wt_ext_transaction_oldest(WT_EXTENSION_API *wt_api); extern int __wt_ext_transaction_visible( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uint64_t transaction_id); +extern void __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op); +extern int __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_txn_checkpoint_logread( WT_SESSION_IMPL *session, + const uint8_t **pp, + const uint8_t *end, + WT_LSN *ckpt_lsn); +extern int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, + int full, + uint32_t flags, + WT_LSN *lsnp); +extern int __wt_txn_truncate_log( WT_SESSION_IMPL *session, + WT_CURSOR_BTREE *start, + WT_CURSOR_BTREE *stop); +extern int __wt_txn_truncate_end(WT_SESSION_IMPL *session); +extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out); +extern int __wt_txn_recover(WT_SESSION_IMPL *default_session); diff --git a/src/include/flags.h b/src/include/flags.h index 91c0a6a959e..cd8de157cbf 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -16,11 +16,12 @@ #define WT_LOGSCAN_FROM_CKP 0x00000004 #define WT_LOGSCAN_ONE 0x00000002 #define WT_LOGSCAN_RECOVER 0x00000001 -#define WT_LOG_CKPT 0x00000004 #define WT_LOG_DSYNC 0x00000002 #define WT_LOG_FSYNC 0x00000001 -#define WT_SESSION_CACHE_BUSY 0x00000040 -#define WT_SESSION_INTERNAL 0x00000020 +#define WT_SESSION_CACHE_BUSY 0x00000100 +#define WT_SESSION_INTERNAL 0x00000080 +#define WT_SESSION_LOGGING_DISABLED 0x00000040 +#define WT_SESSION_LOGGING_INMEM 0x00000020 #define WT_SESSION_NO_CACHE 0x00000010 #define WT_SESSION_NO_CACHE_CHECK 0x00000008 #define WT_SESSION_NO_SCHEMA_LOCK 0x00000004 @@ -40,19 +41,24 @@ #define WT_TREE_SKIP_INTL 0x00000004 #define WT_TREE_SKIP_LEAF 0x00000002 #define WT_TREE_WAIT 0x00000001 -#define WT_VERB_block 0x00020000 -#define WT_VERB_ckpt 0x00010000 -#define WT_VERB_compact 0x00008000 -#define WT_VERB_evict 0x00004000 -#define WT_VERB_evictserver 0x00002000 -#define WT_VERB_fileops 0x00001000 -#define WT_VERB_hazard 0x00000800 -#define WT_VERB_log 0x00000400 -#define WT_VERB_lsm 0x00000200 -#define WT_VERB_mutex 0x00000100 -#define WT_VERB_overflow 0x00000080 -#define WT_VERB_read 0x00000040 -#define WT_VERB_reconcile 0x00000020 +#define WT_TXN_LOG_CKPT_FAIL 0x00000008 +#define WT_TXN_LOG_CKPT_PREPARE 0x00000004 +#define WT_TXN_LOG_CKPT_START 0x00000002 +#define WT_TXN_LOG_CKPT_STOP 0x00000001 +#define WT_VERB_block 0x00040000 +#define WT_VERB_ckpt 0x00020000 +#define WT_VERB_compact 0x00010000 +#define WT_VERB_evict 0x00008000 +#define WT_VERB_evictserver 0x00004000 +#define WT_VERB_fileops 0x00002000 +#define WT_VERB_hazard 0x00001000 +#define WT_VERB_log 0x00000800 +#define WT_VERB_lsm 0x00000400 +#define WT_VERB_mutex 0x00000200 +#define WT_VERB_overflow 0x00000100 +#define WT_VERB_read 0x00000080 +#define WT_VERB_reconcile 0x00000040 +#define WT_VERB_recovery 0x00000020 #define WT_VERB_salvage 0x00000010 #define WT_VERB_shared_cache 0x00000008 #define WT_VERB_verify 0x00000004 diff --git a/src/include/log.h b/src/include/log.h index 40ebb82958a..bb53be4484c 100644 --- a/src/include/log.h +++ b/src/include/log.h @@ -9,6 +9,7 @@ /* Logging subsystem declarations. */ #define LOG_ALIGN 128 +#define WT_LOG_SLOT_BUF_INIT_SIZE 64 * 1024 /* * We rely on this structure being aligned at 64 bits by the compiler, @@ -20,26 +21,18 @@ struct __wt_lsn { off_t offset; /* Log file offset */ }; -typedef enum { - WT_LOGREC_INT16, - WT_LOGREC_UINT16, - WT_LOGREC_INT32, - WT_LOGREC_UINT32, - WT_LOGREC_INT64, - WT_LOGREC_UINT64, - WT_LOGREC_STRING, -} WT_LOGREC_FIELDTYPE; - -typedef struct { - const char *fmt; - const char *fields[]; -} WT_LOGREC_DESC; - #define INIT_LSN(l) do { \ (l)->file = 1; \ (l)->offset = 0; \ } while (0) +#define IS_INIT_LSN(l) ((l)->file == 1 && (l)->offset == 0) + +#define MAX_LSN(l) do { \ + (l)->file = UINT32_MAX; \ + (l)->offset = INT64_MAX; \ +} while (0) + /* * Compare 2 LSNs, return -1 if lsn0 < lsn1, 0 if lsn0 == lsn1 * and 1 if lsn0 > lsn1. @@ -64,45 +57,25 @@ typedef struct { #define WT_LOG_SLOT_PENDING 2 #define WT_LOG_SLOT_READY 3 typedef struct { - union { - struct { -#undef slot_state -#define slot_state u.slot.state - int64_t state; /* Slot state */ -#undef slot_group_size -#define slot_group_size u.slot.group_size - uint64_t group_size; /* Group size */ -#undef slot_error -#define slot_error u.slot.error - int32_t error; /* Error value */ -#undef slot_index -#define slot_index u.slot.index + int64_t slot_state; /* Slot state */ + uint64_t slot_group_size; /* Group size */ + int32_t slot_error; /* Error value */ #define SLOT_INVALID_INDEX 0xffffffff - uint32_t index; /* Active slot index */ -#undef slot_start_offset -#define slot_start_offset u.slot.start_offset - off_t start_offset; /* Starting file offset */ -#undef slot_release_lsn -#define slot_release_lsn u.slot.release_lsn - WT_LSN release_lsn; /* Slot release LSN */ -#undef slot_start_lsn -#define slot_start_lsn u.slot.start_lsn - WT_LSN start_lsn; /* Slot starting LSN */ -#undef slot_end_lsn -#define slot_end_lsn u.slot.end_lsn - WT_LSN end_lsn; /* Slot ending LSN */ -#undef slot_fh -#define slot_fh u.slot.fh - WT_FH *fh; /* File handle for this group */ -#undef slot_flags -#define slot_flags u.slot.flags -#define SLOT_CLOSEFH 0x01 /* Close old fh on release */ -#define SLOT_SYNC 0x02 /* Needs sync on release */ - uint32_t flags; /* Flags */ - } slot; - uint8_t align[LOG_ALIGN]; - } u; -} WT_LOGSLOT; + uint32_t slot_index; /* Active slot index */ + off_t slot_start_offset; /* Starting file offset */ + WT_LSN slot_release_lsn; /* Slot release LSN */ + WT_LSN slot_start_lsn; /* Slot starting LSN */ + WT_LSN slot_end_lsn; /* Slot ending LSN */ + WT_FH *slot_fh; /* File handle for this group */ + WT_ITEM slot_buf; /* Buffer for grouped writes */ + WT_CONDVAR *slot_done_cond; /* Signalled when write done */ + int32_t slot_churn; /* Active slots are scarce. */ +#define SLOT_BUF_GROW 0x01 /* Grow buffer on release */ +#define SLOT_BUFFERED 0x02 /* Buffer writes */ +#define SLOT_CLOSEFH 0x04 /* Close old fh on release */ +#define SLOT_SYNC 0x08 /* Needs sync on release */ + uint32_t flags; /* Flags */ +} WT_LOGSLOT WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT))); typedef struct { WT_LOGSLOT *slot; @@ -127,6 +100,7 @@ typedef struct { WT_LSN ckpt_lsn; /* Last checkpoint LSN */ WT_LSN first_lsn; /* First LSN */ WT_LSN sync_lsn; /* LSN of the last sync */ + WT_LSN trunc_lsn; /* End LSN for recovery truncation */ WT_LSN write_lsn; /* Last LSN written to log file */ /* @@ -135,17 +109,24 @@ typedef struct { WT_SPINLOCK log_lock; /* Locked: Logging fields */ WT_SPINLOCK log_slot_lock; /* Locked: Consolidation array */ + /* Notify any waiting slots when write_lsn is updated. */ + WT_CONDVAR *log_release_cond; + /* * Consolidation array information * SLOT_ACTIVE must be less than SLOT_POOL. + * Our testing shows that the more consolidation we generate the + * better the performance we see which equates to an active slot + * slot count of one. */ -#define SLOT_ACTIVE 4 +#define SLOT_ACTIVE 1 #define SLOT_POOL 16 uint32_t pool_index; /* Global pool index */ WT_LOGSLOT *slot_array[SLOT_ACTIVE]; /* Active slots */ WT_LOGSLOT slot_pool[SLOT_POOL]; /* Pool of all slots */ - uint32_t flags; /* Currently unused */ +#define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */ + uint32_t flags; } WT_LOG; typedef struct { @@ -168,3 +149,40 @@ struct __wt_log_desc { uint16_t minorv; /* 06-07: Minor version */ uint64_t log_size; /* 08-15: Log file size */ }; + +/* + * WT_LOG_REC_DESC -- + * A descriptor for a log record type. + */ +struct __wt_log_rec_desc { + const char *fmt; + int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end); +}; + +/* + * WT_LOG_OP_DESC -- + * A descriptor for a log operation type. + */ +struct __wt_log_op_desc { + const char *fmt; + int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end); +}; + +/* + * DO NOT EDIT: automatically built by dist/log.py. + * Log record declarations: BEGIN + */ +#define WT_LOGREC_CHECKPOINT 0 +#define WT_LOGREC_COMMIT 1 +#define WT_LOGREC_FILE_SYNC 2 +#define WT_LOGREC_MESSAGE 3 +#define WT_LOGOP_COL_PUT 0 +#define WT_LOGOP_COL_REMOVE 1 +#define WT_LOGOP_COL_TRUNCATE 2 +#define WT_LOGOP_ROW_PUT 3 +#define WT_LOGOP_ROW_REMOVE 4 +#define WT_LOGOP_ROW_TRUNCATE 5 +/* + * Log record declarations: END + * DO NOT EDIT: automatically built by dist/log.py. + */ diff --git a/src/include/log.i b/src/include/log.i deleted file mode 100644 index 6cfe18fc441..00000000000 --- a/src/include/log.i +++ /dev/null @@ -1 +0,0 @@ -/* DO NOT EDIT: automatically built by dist/log.py. */ diff --git a/src/include/misc.h b/src/include/misc.h index 3cb5c3e4979..dc22b4da7ba 100644 --- a/src/include/misc.h +++ b/src/include/misc.h @@ -157,6 +157,12 @@ #define WT_STRING_MATCH(str, bytes, len) \ (strncmp(str, bytes, len) == 0 && (str)[(len)] == '\0') +/* + * Macro that produces a string literal that isn't wrapped in quotes, to avoid + * tripping up spell checkers. + */ +#define WT_UNCHECKED_STRING(str) #str + /* Function return value and scratch buffer declaration and initialization. */ #define WT_DECL_ITEM(i) WT_ITEM *i = NULL #define WT_DECL_RET int ret = 0 diff --git a/src/include/session.h b/src/include/session.h index 4f0477193f1..7ceb3fdda5f 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -96,6 +96,7 @@ struct __wt_session_impl { WT_TXN_ISOLATION isolation; WT_TXN txn; /* Transaction state */ u_int ncursors; /* Count of active file cursors. */ + void *lang_private; /* Language specific private storage */ WT_REF **excl; /* Eviction exclusive list */ u_int excl_next; /* Next empty slot */ diff --git a/src/include/stat.h b/src/include/stat.h index 94a6e869be9..fa3109cc56d 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -162,6 +162,8 @@ struct __wt_connection_stats { WT_STATS dh_sweep_evict; WT_STATS dh_sweeps; WT_STATS file_open; + WT_STATS log_buffer_grow; + WT_STATS log_buffer_size; WT_STATS log_bytes_user; WT_STATS log_bytes_written; WT_STATS log_max_filesize; @@ -173,7 +175,11 @@ struct __wt_connection_stats { WT_STATS log_slot_consolidated; WT_STATS log_slot_joins; WT_STATS log_slot_races; + WT_STATS log_slot_ready_wait_timeout; + WT_STATS log_slot_release_wait_timeout; + WT_STATS log_slot_switch_fails; WT_STATS log_slot_toobig; + WT_STATS log_slot_toosmall; WT_STATS log_slot_transitions; WT_STATS log_sync; WT_STATS log_writes; diff --git a/src/include/txn.h b/src/include/txn.h index 28842f772f6..6a9dcd4abd5 100644 --- a/src/include/txn.h +++ b/src/include/txn.h @@ -51,6 +51,51 @@ enum __wt_txn_isolation { TXN_ISO_SNAPSHOT }; +/* + * WT_TXN_OP -- + * A transactional operation. Each transaction builds an in-memory array + * of these operations as it runs, then uses the array to either write log + * records during commit or undo the operations during rollback. + */ +struct __wt_txn_op { + uint32_t fileid; + enum { + TXN_OP_BASIC, + TXN_OP_INMEM, + TXN_OP_REF, + TXN_OP_TRUNCATE_COL, + TXN_OP_TRUNCATE_ROW + } type; + union { + /* TXN_OP_BASIC, TXN_OP_INMEM */ + struct { + WT_INSERT *ins; + WT_UPDATE *upd; + WT_ITEM key; + } op; + /* TXN_OP_REF */ + WT_REF *ref; + /* TXN_OP_TRUNCATE_COL */ + struct { + uint64_t start, stop; + } truncate_col; + /* TXN_OP_TRUNCATE_ROW */ + struct { + WT_ITEM start, stop; + enum { + TXN_TRUNC_ALL, + TXN_TRUNC_BOTH, + TXN_TRUNC_START, + TXN_TRUNC_STOP + } mode; + } truncate_row; + } u; +}; + +/* + * WT_TXN -- + * Per-session transaction context. + */ struct __wt_txn { uint64_t id; @@ -66,21 +111,20 @@ struct __wt_txn { uint64_t *snapshot; uint32_t snapshot_count; - /* - * Arrays of txn IDs in WT_UPDATE or WT_REF structures created or - * modified by this transaction. - */ - uint64_t **mod; + /* Array of modifications by this transaction. */ + WT_TXN_OP *mod; size_t mod_alloc; u_int mod_count; - WT_REF **modref; - size_t modref_alloc; - u_int modref_count; - /* Requested notification when transactions are resolved. */ WT_TXN_NOTIFY *notify; + /* Checkpoint status. */ + WT_LSN ckpt_lsn; + int full_ckpt; + uint32_t ckpt_nsnapshot; + WT_ITEM *ckpt_snapshot; + #define TXN_AUTOCOMMIT 0x01 #define TXN_ERROR 0x02 #define TXN_OLDEST 0x04 diff --git a/src/include/txn.i b/src/include/txn.i index ed02760e24d..f8ace3c1dd7 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -5,41 +5,25 @@ * See the file LICENSE for redistribution information. */ +static inline void __wt_txn_read_first(WT_SESSION_IMPL *session); +static inline void __wt_txn_read_last(WT_SESSION_IMPL *session); + /* * __wt_txn_modify -- * Mark a WT_UPDATE object modified by the current transaction. */ static inline int -__wt_txn_modify(WT_SESSION_IMPL *session, uint64_t *id) -{ - WT_TXN *txn; - - txn = &session->txn; - WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING)); - WT_RET(__wt_realloc_def( - session, &txn->mod_alloc, txn->mod_count + 1, &txn->mod)); - - txn->mod[txn->mod_count++] = id; - *id = txn->id; - return (0); -} - -/* - * __wt_txn_modify_ref -- - * Mark a WT_REF object modified by the current transaction. - */ -static inline int -__wt_txn_modify_ref(WT_SESSION_IMPL *session, WT_REF *ref) +__txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp) { WT_TXN *txn; txn = &session->txn; WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING)); - WT_RET(__wt_realloc_def( - session, &txn->modref_alloc, txn->modref_count + 1, &txn->modref)); + WT_RET(__wt_realloc_def(session, &txn->mod_alloc, + txn->mod_count + 1, &txn->mod)); - txn->modref[txn->modref_count++] = ref; - ref->txnid = txn->id; + *opp = &txn->mod[txn->mod_count++]; + WT_CLEAR(**opp); return (0); } @@ -62,6 +46,48 @@ __wt_txn_unmodify(WT_SESSION_IMPL *session) } /* + * __wt_txn_modify -- + * Mark a WT_UPDATE object modified by the current transaction. + */ +static inline int +__wt_txn_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + WT_DECL_RET; + WT_TXN_OP *op; + + WT_RET(__txn_next_op(session, &op)); + op->type = F_ISSET(session, WT_SESSION_LOGGING_INMEM) ? + TXN_OP_INMEM : TXN_OP_BASIC; + /* If we are logging, we need a reference to the key. */ + if (cbt->btree->type == BTREE_ROW && S2C(session)->logging) + WT_ERR(__wt_row_key_get(cbt, &op->u.op.key)); + op->u.op.ins = cbt->ins; + op->u.op.upd = upd; + op->fileid = S2BT(session)->id; + upd->txnid = session->txn.id; + if (0) { +err: __wt_txn_unmodify(session); + } + return (ret); +} + +/* + * __wt_txn_modify_ref -- + * Mark a WT_REF object modified by the current transaction. + */ +static inline int +__wt_txn_modify_ref(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_TXN_OP *op; + + WT_RET(__txn_next_op(session, &op)); + op->type = TXN_OP_REF; + op->u.ref = ref; + ref->txnid = session->txn.id; + return (0); +} + +/* * __wt_txn_visible_all -- * Check if a given transaction ID is "globally visible". This is, if * all sessions in the system will see the transaction ID. diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index e64d0bcab68..3ee79c5f125 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -80,6 +80,18 @@ struct __wt_extractor; typedef struct __wt_extractor WT_EXTRACTOR; struct __wt_item; typedef struct __wt_item WT_ITEM; struct __wt_session; typedef struct __wt_session WT_SESSION; +#if defined(SWIGJAVA) +#define WT_HANDLE_NULLABLE(typename) typename##_NULLABLE +#define WT_HANDLE_CLOSED(typename) typename##_CLOSED +typedef WT_CURSOR WT_CURSOR_NULLABLE; +typedef WT_CURSOR WT_CURSOR_CLOSED; +typedef WT_SESSION WT_SESSION_CLOSED; +typedef WT_CONNECTION WT_CONNECTION_CLOSED; +#elif !defined(DOXYGEN) +#define WT_HANDLE_NULLABLE(typename) typename +#define WT_HANDLE_CLOSED(typename) typename +#endif + /*! * A raw item of data to be managed. Data items have a pointer to the data and * a length (limited to 4GB for items stored in tables). WT_ITEM structures do @@ -459,7 +471,7 @@ struct __wt_cursor { * @param cursor the cursor handle * @errors */ - int __F(close)(WT_CURSOR *cursor); + int __F(close)(WT_HANDLE_CLOSED(WT_CURSOR) *cursor); /* * Protected fields, only to be used by cursor implementations. @@ -478,6 +490,8 @@ struct __wt_cursor { uint64_t recno; /* Record number, normal and raw mode */ uint8_t raw_recno_buf[WT_INTPACK64_MAXSIZE]; + void *lang_private; /* Language specific private storage */ + WT_ITEM key, value; int saved_err; /* Saved error in set_{key,value}. */ @@ -523,7 +537,8 @@ struct __wt_session { * @configempty{session.close, see dist/api_data.py} * @errors */ - int __F(close)(WT_SESSION *session, const char *config); + int __F(close)(WT_HANDLE_CLOSED(WT_SESSION) *session, + const char *config); /*! * Reconfigure a session handle. @@ -689,7 +704,7 @@ struct __wt_session { * @errors */ int __F(open_cursor)(WT_SESSION *session, - const char *uri, WT_CURSOR *to_dup, + const char *uri, WT_HANDLE_NULLABLE(WT_CURSOR) *to_dup, const char *config, WT_CURSOR **cursorp); /*! @} */ @@ -975,7 +990,9 @@ struct __wt_session { */ int __F(truncate)(WT_SESSION *session, const char *name, - WT_CURSOR *start, WT_CURSOR *stop, const char *config); + WT_HANDLE_NULLABLE(WT_CURSOR) *start, + WT_HANDLE_NULLABLE(WT_CURSOR) *stop, + const char *config); /*! * Upgrade a file or table. @@ -1148,7 +1165,8 @@ struct __wt_connection { * @configempty{connection.close, see dist/api_data.py} * @errors */ - int __F(close)(WT_CONNECTION *connection, const char *config); + int __F(close)(WT_HANDLE_CLOSED(WT_CONNECTION) *connection, + const char *config); /*! * Reconfigure a connection handle. @@ -1209,9 +1227,9 @@ struct __wt_connection { * chosen from the following options: \c "block"\, \c "ckpt"\, \c * "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\, \c * "hazard"\, \c "log"\, \c "lsm"\, \c "mutex"\, \c "overflow"\, \c - * "read"\, \c "readserver"\, \c "reconcile"\, \c "salvage"\, \c - * "shared_cache"\, \c "verify"\, \c "version"\, \c "write"; default - * empty.} + * "read"\, \c "readserver"\, \c "reconcile"\, \c "recovery"\, \c + * "salvage"\, \c "shared_cache"\, \c "verify"\, \c "version"\, \c + * "write"; default empty.} * @configend * @errors */ @@ -1464,7 +1482,7 @@ struct __wt_connection { * @config{ enabled, enable logging subsystem., a boolean * flag; default \c false.} * @config{ file_max, the - * maximum size of the log file., an integer between 1MB and 2GB; default \c + * maximum size of log files., an integer between 100KB and 2GB; default \c * 100MB.} * @config{ path, the path to a directory into * which the log files are written. If the value is not an absolute path name\, @@ -1542,8 +1560,8 @@ struct __wt_connection { * values chosen from the following options: \c "block"\, \c "ckpt"\, \c * "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\, \c "hazard"\, \c * "log"\, \c "lsm"\, \c "mutex"\, \c "overflow"\, \c "read"\, \c "readserver"\, - * \c "reconcile"\, \c "salvage"\, \c "shared_cache"\, \c "verify"\, \c - * "version"\, \c "write"; default empty.} + * \c "reconcile"\, \c "recovery"\, \c "salvage"\, \c "shared_cache"\, \c + * "verify"\, \c "version"\, \c "write"; default empty.} * @configend * Additionally, if a file named \c WiredTiger.config appears in the WiredTiger * home directory, it is read for configuration values (see @ref config_file @@ -2481,74 +2499,86 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_DH_SESSION_HANDLES 1041 /*! dhandle: sweeps conflicting with evict */ #define WT_STAT_CONN_DH_SWEEP_EVICT 1042 -/*! dhandle: number of sweep attempts */ +/*! dhandle: sweep attempts */ #define WT_STAT_CONN_DH_SWEEPS 1043 /*! files currently open */ #define WT_STAT_CONN_FILE_OPEN 1044 +/*! log: log buffer size increases */ +#define WT_STAT_CONN_LOG_BUFFER_GROW 1045 +/*! log: total log buffer size */ +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1046 /*! log: user provided log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_USER 1045 +#define WT_STAT_CONN_LOG_BYTES_USER 1047 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1046 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1048 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1047 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1049 /*! log: log read operations */ -#define WT_STAT_CONN_LOG_READS 1048 +#define WT_STAT_CONN_LOG_READS 1050 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1049 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1051 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1050 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1052 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1051 +#define WT_STAT_CONN_LOG_SCANS 1053 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1052 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1054 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1053 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1055 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1054 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1056 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1055 +#define WT_STAT_CONN_LOG_SLOT_RACES 1057 +/*! log: log slot ready wait timeouts */ +#define WT_STAT_CONN_LOG_SLOT_READY_WAIT_TIMEOUT 1058 +/*! log: log slot release wait timeouts */ +#define WT_STAT_CONN_LOG_SLOT_RELEASE_WAIT_TIMEOUT 1059 +/*! log: slots selected for switching that were unavailable */ +#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1060 /*! log: record size exceeded maximum */ -#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1056 +#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1061 +/*! log: failed to find a slot large enough for record */ +#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1062 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1057 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1063 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1058 +#define WT_STAT_CONN_LOG_SYNC 1064 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1059 +#define WT_STAT_CONN_LOG_WRITES 1065 /*! rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1060 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1066 /*! memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1061 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1067 /*! memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1062 +#define WT_STAT_CONN_MEMORY_FREE 1068 /*! memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1063 +#define WT_STAT_CONN_MEMORY_GROW 1069 /*! total read I/Os */ -#define WT_STAT_CONN_READ_IO 1064 +#define WT_STAT_CONN_READ_IO 1070 /*! page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1065 +#define WT_STAT_CONN_REC_PAGES 1071 /*! page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1066 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1072 /*! reconciliation failed because an update could not be included */ -#define WT_STAT_CONN_REC_SKIPPED_UPDATE 1067 +#define WT_STAT_CONN_REC_SKIPPED_UPDATE 1073 /*! pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1068 +#define WT_STAT_CONN_RWLOCK_READ 1074 /*! pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1069 +#define WT_STAT_CONN_RWLOCK_WRITE 1075 /*! open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1070 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1076 /*! transactions */ -#define WT_STAT_CONN_TXN_BEGIN 1071 +#define WT_STAT_CONN_TXN_BEGIN 1077 /*! transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1072 +#define WT_STAT_CONN_TXN_CHECKPOINT 1078 /*! transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1073 +#define WT_STAT_CONN_TXN_COMMIT 1079 /*! transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1074 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1080 /*! transactions rolled-back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1075 +#define WT_STAT_CONN_TXN_ROLLBACK 1081 /*! total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1076 +#define WT_STAT_CONN_WRITE_IO 1082 /*! * @} @@ -2721,7 +2751,7 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_REC_SPLIT_INTERNAL 2080 /*! reconciliation leaf pages split */ #define WT_STAT_DSRC_REC_SPLIT_LEAF 2081 -/*! reconciliation maximum number of splits created for a page */ +/*! reconciliation maximum splits for a page */ #define WT_STAT_DSRC_REC_SPLIT_MAX 2082 /*! object compaction */ #define WT_STAT_DSRC_SESSION_COMPACT 2083 diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index b8b55640296..d36656be947 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -147,6 +147,10 @@ struct __wt_insert_head; typedef struct __wt_insert_head WT_INSERT_HEAD; struct __wt_log_desc; typedef struct __wt_log_desc WT_LOG_DESC; +struct __wt_log_op_desc; + typedef struct __wt_log_op_desc WT_LOG_OP_DESC; +struct __wt_log_rec_desc; + typedef struct __wt_log_rec_desc WT_LOG_REC_DESC; struct __wt_lsm_chunk; typedef struct __wt_lsm_chunk WT_LSM_CHUNK; struct __wt_lsm_data_source; @@ -203,6 +207,8 @@ struct __wt_txn; typedef struct __wt_txn WT_TXN; struct __wt_txn_global; typedef struct __wt_txn_global WT_TXN_GLOBAL; +struct __wt_txn_op; + typedef struct __wt_txn_op WT_TXN_OP; struct __wt_txn_state; typedef struct __wt_txn_state WT_TXN_STATE; struct __wt_update; @@ -261,7 +267,6 @@ struct __wt_update; #include "bitstring.i" #include "column.i" -#include "log.i" #include "serial.i" #if defined(__cplusplus) diff --git a/src/log/log.c b/src/log/log.c index 7f5d2d7f139..c3b61108422 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -8,17 +8,82 @@ #include "wt_internal.h" /* - * __wt_log_getfiles -- + * __wt_log_ckpt -- + * Record the given LSN as the checkpoint LSN and signal the archive + * thread as needed. + */ +int +__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + + conn = S2C(session); + log = conn->log; + log->ckpt_lsn = *ckp_lsn; + if (conn->arch_cond != NULL) + WT_RET(__wt_cond_signal(session, conn->arch_cond)); + return (0); +} + +/* + * __wt_log_get_files -- * Retrieve the list of all existing log files. */ int -__wt_log_getfiles(WT_SESSION_IMPL *session, char ***files, u_int *count) +__wt_log_get_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp) { WT_CONNECTION_IMPL *conn; + const char *log_path; + + *countp = 0; + *filesp = NULL; conn = S2C(session); - return (__wt_dirlist(session, conn->log_path, WT_LOG_FILENAME, - WT_DIRLIST_INCLUDE, files, count)); + log_path = conn->log_path; + if (log_path == NULL) + log_path = ""; + return (__wt_dirlist(session, log_path, WT_LOG_FILENAME, + WT_DIRLIST_INCLUDE, filesp, countp)); +} + +/* + * __wt_log_get_active_files -- + * Retrieve the list of active log files (those that are not candidates + * for archiving). + */ +int +__wt_log_get_active_files( + WT_SESSION_IMPL *session, char ***filesp, u_int *countp) +{ + WT_DECL_RET; + WT_LOG *log; + char **files; + uint32_t id; + u_int count, i; + + log = S2C(session)->log; + + WT_RET(__wt_log_get_files(session, &files, &count)); + + /* Filter out any files that are below the checkpoint LSN. */ + for (i = 0; i < count; ) { + WT_ERR(__wt_log_extract_lognum(session, files[i], &id)); + if (id < log->ckpt_lsn.file) { + __wt_free(session, files[i]); + files[i] = files[count - 1]; + files[--count] = NULL; + } else + i++; + } + + *filesp = files; + *countp = count; + + if (0) { +err: __wt_log_files_free(session, files, count); + } + return (ret); } /* @@ -70,11 +135,9 @@ __wt_log_extract_lognum( if (id == NULL || name == NULL) return (0); - if ((p = strrchr(name, '.')) == NULL) - return (WT_ERROR); - ++p; - if ((sscanf(++p, "%" PRIu32, id)) != 1) - return (WT_ERROR); + if ((p = strrchr(name, '.')) == NULL || + sscanf(++p, "%" PRIu32, id) != 1) + WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name); return (0); } @@ -138,7 +201,8 @@ __wt_log_open(WT_SESSION_IMPL *session) log = conn->log; lastlog = 0; firstlog = UINT32_MAX; - WT_RET(__wt_log_getfiles(session, &logfiles, &logcount)); + + WT_RET(__wt_log_get_files(session, &logfiles, &logcount)); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); lastlog = WT_MAX(lastlog, lognum); @@ -149,13 +213,23 @@ __wt_log_open(WT_SESSION_IMPL *session) firstlog, lastlog); log->first_lsn.file = firstlog; log->first_lsn.offset = 0; + /* * Start logging at the beginning of the next log file, no matter * where the previous log file ends. */ WT_ERR(__wt_log_newfile(session, 1)); -err: - __wt_log_files_free(session, logfiles, logcount); + + /* + * If there were log files, run recovery. + * XXX belongs at a higher level than this. + */ + if (logcount > 0) { + log->trunc_lsn = log->alloc_lsn; + WT_ERR(__wt_txn_recover(session)); + } + +err: __wt_log_files_free(session, logfiles, logcount); return (ret); } @@ -186,9 +260,13 @@ __wt_log_close(WT_SESSION_IMPL *session) return (0); } +/* + * __log_fill -- + * Copy a thread's log records into the assigned slot. + */ static int __log_fill(WT_SESSION_IMPL *session, - WT_MYSLOT *myslot, WT_ITEM *record, WT_LSN *lsnp) + WT_MYSLOT *myslot, int direct, WT_ITEM *record, WT_LSN *lsnp) { WT_DECL_RET; WT_LOG_RECORD *logrec; @@ -199,9 +277,14 @@ __log_fill(WT_SESSION_IMPL *session, * If the offset becomes a unit of LOG_ALIGN this is where we would * multiply by LOG_ALIGN to get the real file byte offset for write(). */ - WT_ERR(__wt_write(session, myslot->slot->slot_fh, - myslot->offset + myslot->slot->slot_start_offset, - logrec->len, (void *)logrec)); + if (direct) + WT_ERR(__wt_write(session, myslot->slot->slot_fh, + myslot->offset + myslot->slot->slot_start_offset, + logrec->len, (void *)logrec)); + else + memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset, + logrec, logrec->len); + WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, logrec->len); if (lsnp != NULL) { *lsnp = myslot->slot->slot_start_lsn; @@ -228,27 +311,66 @@ __log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize) /* * __log_truncate -- - * Truncate the log to the given LSN. In addition to setting the - * given LSN as the new end of log file, it will remove any later - * log files that may exist. This function assumes we are in recovery - * or other dedicated time and not during live running. + * Truncate the log to the given LSN. If this_log is set, it will only + * truncate the log file indicated in the given LSN. If not set, + * it will truncate between the given LSN and the trunc_lsn. That is, + * since we pre-allocate log files, it will free that space and allow the + * log to be traversed. We use the trunc_lsn because logging has already + * opened the new/next log file before recovery ran. This function assumes + * we are in recovery or other dedicated time and not during live running. */ static int -__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn) +__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, uint32_t this_log) { + WT_CONNECTION_IMPL *conn; WT_DECL_RET; + WT_FH *log_fh, *tmp_fh; + WT_LOG *log; uint32_t lognum; u_int i, logcount; char **logfiles; - WT_RET(__wt_log_getfiles(session, &logfiles, &logcount)); + conn = S2C(session); + log = conn->log; + log_fh = NULL; + logcount = 0; + logfiles = NULL; + + /* + * Truncate the log file to the given LSN. + */ + WT_ERR(__log_openfile(session, 0, &log_fh, lsn->file)); + WT_ERR(__wt_ftruncate(session, log_fh, lsn->offset)); + tmp_fh = log_fh; + log_fh = NULL; + WT_ERR(__wt_close(session, tmp_fh)); + + /* + * If we just want to truncate the current log, return and skip + * looking for intervening logs. + */ + if (this_log) + goto err; + WT_ERR(__wt_log_get_files(session, &logfiles, &logcount)); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); - if (lognum > lsn->file) - WT_ERR(__wt_log_remove(session, lognum)); + if (lognum > lsn->file && lognum < log->trunc_lsn.file) { + WT_ERR(__log_openfile(session, 0, &log_fh, lognum)); + /* + * If there are intervening files pre-allocated, + * truncate them to the end of the log file header. + */ + WT_ERR(__wt_ftruncate(session, + log_fh, LOG_FIRST_RECORD)); + tmp_fh = log_fh; + log_fh = NULL; + WT_ERR(__wt_close(session, tmp_fh)); + } } - -err: __wt_log_files_free(session, logfiles, logcount); +err: if (log_fh != NULL) + WT_TRET(__wt_close(session, log_fh)); + if (logfiles != NULL) + __wt_log_files_free(session, logfiles, logcount); return (ret); } @@ -263,6 +385,7 @@ __log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, off_t *eof) WT_DECL_RET; WT_LOG *log; uint64_t rec; + uint32_t allocsize; off_t log_size, off; conn = S2C(session); @@ -271,6 +394,10 @@ __log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, off_t *eof) return (0); *eof = 0; WT_ERR(__wt_filesize(session, fh, &log_size)); + if (log == NULL) + allocsize = LOG_ALIGN; + else + allocsize = log->allocsize; /* * We know all log records are aligned at log->allocsize. The first * item in a log record is always the length. Look for any non-zero @@ -278,9 +405,9 @@ __log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, off_t *eof) * it could be the middle of a large record. But we know no log record * starts after it. Return an estimate of the log file size. */ - for (off = log_size - (off_t)log->allocsize; + for (off = log_size - (off_t)allocsize; off > 0; - off -= (off_t)log->allocsize) { + off -= (off_t)allocsize) { WT_ERR(__wt_read(session, fh, off, sizeof(uint64_t), &rec)); if (rec != 0) break; @@ -288,7 +415,7 @@ __log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, off_t *eof) /* * Set EOF to the last zero-filled record we saw. */ - *eof = off + (off_t)log->allocsize; + *eof = off + (off_t)allocsize; err: return (ret); } @@ -316,7 +443,7 @@ __log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) if (!__log_size_fit(session, &log->alloc_lsn, recsize)) { WT_RET(__wt_log_newfile(session, 0)); if (log->log_close_fh != NULL) - FLD_SET(slot->slot_flags, SLOT_CLOSEFH); + F_SET(slot, SLOT_CLOSEFH); } /* * Need to minimally fill in slot info here. Our slot start LSN @@ -337,6 +464,10 @@ __log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) return (0); } +/* + * __log_release -- + * Release a log slot. + */ static int __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { @@ -344,6 +475,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_DECL_RET; WT_FH *close_fh; WT_LOG *log; + uint32_t write_size; conn = S2C(session); log = conn->log; @@ -352,23 +484,53 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) * of the file handle structure. */ close_fh = NULL; - if (FLD_ISSET(slot->slot_flags, SLOT_CLOSEFH)) { + if (F_ISSET(slot, SLOT_CLOSEFH)) { close_fh = log->log_close_fh; log->log_close_fh = NULL; - FLD_CLR(slot->slot_flags, SLOT_CLOSEFH); + F_CLR(slot, SLOT_CLOSEFH); + } + + /* Write the buffered records */ + if (F_ISSET(slot, SLOT_BUFFERED)) { + write_size = (uint32_t) + (slot->slot_end_lsn.offset - slot->slot_start_offset); + WT_ERR(__wt_write(session, slot->slot_fh, + slot->slot_start_offset, write_size, slot->slot_buf.mem)); } + /* - * Wait for earlier groups to finish. + * Wait for earlier groups to finish, otherwise there could be holes + * in the log file. */ - while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) - __wt_yield(); - if (FLD_ISSET(slot->slot_flags, SLOT_SYNC)) { + while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) { + /* + * Workloads with fast commits (no-sync is a reasonable + * approximation) benefit from yielding rather than using the + * more heavy weight condition wait. + */ + if (S2C(session)->txn_logsync == 0) + __wt_yield(); + else if (__wt_cond_wait(session, + log->log_release_cond, 10000) == ETIMEDOUT) + WT_STAT_FAST_CONN_INCR(session, + log_slot_release_wait_timeout); + } + if (F_ISSET(slot, SLOT_SYNC)) { WT_STAT_FAST_CONN_INCR(session, log_sync); WT_ERR(__wt_fsync(session, log->log_fh)); - FLD_CLR(slot->slot_flags, SLOT_SYNC); + F_CLR(slot, SLOT_SYNC); log->sync_lsn = slot->slot_end_lsn; } log->write_lsn = slot->slot_end_lsn; + WT_ERR(__wt_cond_signal(session, log->log_release_cond)); + if (F_ISSET(slot, SLOT_BUF_GROW)) { + WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); + F_CLR(slot, SLOT_BUF_GROW); + WT_STAT_FAST_CONN_INCRV(session, + log_buffer_size, slot->slot_buf.memsize); + WT_ERR(__wt_buf_grow(session, + &slot->slot_buf, slot->slot_buf.memsize * 2)); + } /* * If we have a file to close, close it now. */ @@ -433,6 +595,7 @@ __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create) memset(&tmp, 0, sizeof(tmp)); myslot.slot = &tmp; myslot.offset = 0; + /* * Recursively call __log_acquire to allocate log space for the * log descriptor record. Call __log_fill to write it, but we @@ -440,7 +603,8 @@ __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create) * earlier operations to complete. */ WT_ERR(__log_acquire(session, logrec->len, &tmp)); - WT_ERR(__log_fill(session, &myslot, buf, NULL)); + WT_ERR(__log_fill(session, &myslot, 1, buf, NULL)); + /* * If we're called from connection creation code, we need to update * the LSNs since we're the only write in progress. @@ -517,11 +681,8 @@ __wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, cksum = logrec->checksum; logrec->checksum = 0; logrec->checksum = __wt_cksum(logrec, logrec->len); - if (logrec->checksum != cksum) { - WT_VERBOSE_ERR(session, log, "log_read: Bad checksum"); - ret = WT_ERROR; - goto err; - } + if (logrec->checksum != cksum) + WT_ERR_MSG(session, WT_ERROR, "log_read: Bad checksum"); record->size = logrec->len; WT_STAT_FAST_CONN_INCR(session, log_reads); err: @@ -529,6 +690,10 @@ err: return (ret); } +/* + * __wt_log_scan -- + * Scan the logs, calling a function on each record found. + */ int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, @@ -542,11 +707,16 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, WT_LOG_RECORD *logrec; WT_LSN end_lsn, rd_lsn, start_lsn; off_t log_size; - uint32_t cksum, rdup_len, reclen; + uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen; + u_int i, logcount; int done; + char **logfiles; conn = S2C(session); log = conn->log; + log_fh = NULL; + logcount = 0; + logfiles = NULL; WT_CLEAR(buf); /* @@ -556,27 +726,68 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, if (func == NULL) return (0); - if (lsnp == NULL) { - if (LF_ISSET(WT_LOGSCAN_FIRST)) - start_lsn = log->first_lsn; - else if (LF_ISSET(WT_LOGSCAN_FROM_CKP)) - start_lsn = log->ckpt_lsn; - else - return (WT_ERROR); /* Illegal usage */ - } else { - if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP)) - return (WT_ERROR); /* Illegal usage */ + if (LF_ISSET(WT_LOGSCAN_RECOVER)) + WT_VERBOSE_RET(session, log, + "__wt_log_scan truncating to %u/%" PRIuMAX, + log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset); + + if (log != NULL) { + allocsize = log->allocsize; + + if (lsnp == NULL) { + if (LF_ISSET(WT_LOGSCAN_FIRST)) + start_lsn = log->first_lsn; + else if (LF_ISSET(WT_LOGSCAN_FROM_CKP)) + start_lsn = log->ckpt_lsn; + else + return (WT_ERROR); /* Illegal usage */ + } else { + if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP)) + WT_RET_MSG(session, WT_ERROR, + "choose either a start LSN or a start flag"); - /* Offsets must be on allocation boundaries. */ - if (lsnp->offset % log->allocsize != 0 || - lsnp->file > log->fileid) - return (WT_NOTFOUND); + /* Offsets must be on allocation boundaries. */ + if (lsnp->offset % allocsize != 0 || + lsnp->file > log->fileid) + return (WT_NOTFOUND); - start_lsn = *lsnp; + start_lsn = *lsnp; + } + + end_lsn = log->alloc_lsn; + } else { + /* + * If logging is not configured, we can still print out the log + * if log files exist. We just need to set the LSNs from what + * is in the files versus what is in the live connection. + */ + /* + * Set allocsize to the minimum alignment it could be. Larger + * records and larger allocation boundaries should always be + * a multiple of this. + */ + allocsize = LOG_ALIGN; + lastlog = 0; + firstlog = UINT32_MAX; + WT_RET(__wt_log_get_files(session, &logfiles, &logcount)); + if (logcount == 0) + /* + * Return it is not supported if none don't exist. + */ + return (ENOTSUP); + for (i = 0; i < logcount; i++) { + WT_ERR(__wt_log_extract_lognum(session, logfiles[i], + &lognum)); + lastlog = WT_MAX(lastlog, lognum); + firstlog = WT_MIN(firstlog, lognum); + } + start_lsn.file = firstlog; + end_lsn.file = lastlog; + start_lsn.offset = end_lsn.offset = 0; + __wt_log_files_free(session, logfiles, logcount); + logfiles = NULL; } - end_lsn = log->alloc_lsn; - log_fh = NULL; - WT_RET(__log_openfile(session, 0, &log_fh, start_lsn.file)); + WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file)); WT_ERR(__log_filesize(session, log_fh, &log_size)); if (LF_ISSET(WT_LOGSCAN_ONE)) done = 1; @@ -591,6 +802,11 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, */ WT_ERR(__wt_close(session, log_fh)); log_fh = NULL; + /* + * Truncate this log file before we move to the next. + */ + if (LF_ISSET(WT_LOGSCAN_RECOVER)) + WT_ERR(__log_truncate(session, &rd_lsn, 1)); rd_lsn.file++; rd_lsn.offset = 0; /* @@ -607,9 +823,9 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, /* * Read the minimum allocation size a record could be. */ - WT_ASSERT(session, buf.memsize >= log->allocsize); + WT_ASSERT(session, buf.memsize >= allocsize); WT_ERR(__wt_read( - session, log_fh, rd_lsn.offset, log->allocsize, buf.mem)); + session, log_fh, rd_lsn.offset, allocsize, buf.mem)); /* * First 8 bytes is the real record length. See if we * need to read more than the allocation size. We expect @@ -625,16 +841,11 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, * that may exist. */ if (reclen == 0) { - /* - * This LSN is the end. Truncate if we're in recovery - * otherwise, just stop. - */ - if (LF_ISSET(WT_LOGSCAN_RECOVER)) - WT_ERR(__log_truncate(session, &rd_lsn)); + /* This LSN is the end. */ break; } - rdup_len = __wt_rduppo2(reclen, log->allocsize); - if (reclen > log->allocsize) { + rdup_len = __wt_rduppo2(reclen, allocsize); + if (reclen > allocsize) { WT_ERR(__wt_buf_grow(session, &buf, rdup_len)); WT_ERR(__wt_read( session, log_fh, rd_lsn.offset, reclen, buf.mem)); @@ -649,8 +860,16 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, logrec->checksum = 0; logrec->checksum = __wt_cksum(logrec, logrec->len); if (logrec->checksum != cksum) { - ret = WT_ERROR; - goto err; + /* + * A checksum mismatch means we have reached the end of + * the useful part of the log. This should be found on + * the first pass through recovery. In the second pass + * where we truncate the log, this is where it should + * end. + */ + if (log != NULL) + log->trunc_lsn = rd_lsn; + break; } /* @@ -664,7 +883,14 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, rd_lsn.offset += (off_t)rdup_len; } while (!done); + /* Truncate if we're in recovery. */ + if (LF_ISSET(WT_LOGSCAN_RECOVER) && + LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0) + WT_ERR(__log_truncate(session, &rd_lsn, 0)); + err: WT_STAT_FAST_CONN_INCR(session, log_scans); + if (logfiles != NULL) + __wt_log_files_free(session, logfiles, logcount); __wt_buf_free(session, &buf); if (ret == ENOENT) ret = 0; @@ -673,6 +899,48 @@ err: WT_STAT_FAST_CONN_INCR(session, log_scans); return (ret); } +/* + * __log_direct_write -- + * Write a log record without using the consolidation arrays. + */ +static int +__log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, + uint32_t flags) +{ + WT_DECL_RET; + WT_LOG *log; + WT_LOGSLOT tmp; + WT_MYSLOT myslot; + int locked; + + log = S2C(session)->log; + locked = 0; + myslot.slot = &tmp; + myslot.offset = 0; + + /* Fast path the contended case. */ + if (__wt_spin_trylock(session, &log->log_slot_lock) != 0) + return (EAGAIN); + + memset(&tmp, 0, sizeof(tmp)); + locked = 1; + if (LF_ISSET(WT_LOG_FSYNC)) + F_SET(&tmp, SLOT_SYNC); + WT_ERR(__log_acquire(session, record->size, &tmp)); + __wt_spin_unlock(session, &log->log_slot_lock); + locked = 0; + WT_ERR(__log_fill(session, &myslot, 1, record, lsnp)); + WT_ERR(__log_release(session, &tmp)); + +err: if (locked) + __wt_spin_unlock(session, &log->log_slot_lock); + return (ret); +} + +/* + * __wt_log_write -- + * Write a record into the log. + */ int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags) @@ -681,7 +949,6 @@ __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_DECL_RET; WT_LOG *log; WT_LOG_RECORD *logrec; - WT_LOGSLOT tmp; WT_LSN tmp_lsn; WT_MYSLOT myslot; uint32_t rdup_len; @@ -691,8 +958,6 @@ __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, log = conn->log; locked = 0; INIT_LSN(&tmp_lsn); - myslot.slot = &tmp; - myslot.offset = 0; /* * Assume the WT_ITEM the user passed is a WT_LOG_RECORD, which has * a header at the beginning for us to fill in. @@ -720,24 +985,45 @@ __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, logrec->checksum = 0; logrec->checksum = __wt_cksum(logrec, record->size); - memset(&tmp, 0, sizeof(tmp)); WT_STAT_FAST_CONN_INCR(session, log_writes); - if (__wt_spin_trylock(session, &log->log_slot_lock) == 0) { + + if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) { + ret = __log_direct_write(session, record, lsnp, flags); + if (ret == 0) + return (0); + if (ret != EAGAIN) + WT_ERR(ret); /* - * No contention, just write our record. We're not using - * the consolidation arrays, so send in the tmp slot. + * An EAGAIN return means we failed to get the try lock - + * fall through to the consolidation code in that case. */ - locked = 1; - if (LF_ISSET(WT_LOG_FSYNC)) - FLD_SET(tmp.slot_flags, SLOT_SYNC); - WT_ERR(__log_acquire(session, rdup_len, &tmp)); - __wt_spin_unlock(session, &log->log_slot_lock); - locked = 0; - WT_ERR(__log_fill(session, &myslot, record, lsnp)); - WT_ERR(__log_release(session, &tmp)); + } + + /* + * As soon as we see contention for the log slot, disable direct + * log writes. We get better performance by forcing writes through + * the consolidation code. This is because individual writes flood + * the I/O system faster than they contend on the log slot lock. + */ + F_SET(log, WT_LOG_FORCE_CONSOLIDATE); + if ((ret = __wt_log_slot_join( + session, rdup_len, flags, &myslot)) == ENOMEM) { + /* + * If we couldn't find a consolidated slot for this record + * write the record directly. + */ + while ((ret = __log_direct_write( + session, record, lsnp, flags)) == EAGAIN) + ; + WT_ERR(ret); + /* + * Increase the buffer size of any slots we can get access + * to, so future consolidations are likely to succeed. + */ + WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len)); return (0); } - WT_ERR(__wt_log_slot_join(session, rdup_len, flags, &myslot)); + WT_ERR(ret); if (myslot.offset == 0) { __wt_spin_lock(session, &log->log_slot_lock); locked = 1; @@ -746,15 +1032,16 @@ __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, session, myslot.slot->slot_group_size, myslot.slot)); __wt_spin_unlock(session, &log->log_slot_lock); locked = 0; - WT_ERR(__wt_log_slot_notify(myslot.slot)); + WT_ERR(__wt_log_slot_notify(session, myslot.slot)); } else - WT_ERR(__wt_log_slot_wait(myslot.slot)); - WT_ERR(__log_fill(session, &myslot, record, &tmp_lsn)); + WT_ERR(__wt_log_slot_wait(session, myslot.slot)); + WT_ERR(__log_fill(session, &myslot, 0, record, &tmp_lsn)); if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) { WT_ERR(__log_release(session, myslot.slot)); WT_ERR(__wt_log_slot_free(myslot.slot)); } else if (LF_ISSET(WT_LOG_FSYNC)) { + /* Wait for our slot to be finalized */ while (LOG_CMP(&log->sync_lsn, &tmp_lsn) <= 0 && myslot.slot->slot_error == 0) __wt_yield(); @@ -773,22 +1060,23 @@ err: */ if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0) ret = myslot.slot->slot_error; - if (LF_ISSET(WT_LOG_CKPT) && ret == 0) { - log->ckpt_lsn = tmp_lsn; - if (conn->arch_cond != NULL) - WT_ERR(__wt_cond_signal(session, conn->arch_cond)); - } return (ret); } +/* + * __wt_log_vprintf -- + * Write a message into the log. + */ int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap) { WT_CONNECTION_IMPL *conn; - WT_DECL_ITEM(buf); - WT_LOG_RECORD *logrec; + WT_DECL_ITEM(logrec); + WT_DECL_RET; va_list ap_copy; - size_t len; + const char *rec_fmt = WT_UNCHECKED_STRING(I); + uint32_t rectype = WT_LOGREC_MESSAGE; + size_t header_size, len; conn = S2C(session); @@ -796,18 +1084,31 @@ __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap) return (0); va_copy(ap_copy, ap); - len = (size_t)vsnprintf(NULL, 0, fmt, ap_copy) + sizeof(WT_LOG_RECORD); + len = (size_t)vsnprintf(NULL, 0, fmt, ap_copy) + 1; va_end(ap_copy); - WT_RET(__wt_scr_alloc(session, 0, &buf)); - WT_RET(__wt_buf_initsize(session, buf, len)); + WT_RET( + __wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec)); - logrec = (WT_LOG_RECORD *)buf->mem; - (void)vsnprintf((char *)logrec->record, len, fmt, ap); + /* + * We're writing a record with the type (an integer) followed by a + * string (NUL-terminated data). To avoid writing the string into + * a buffer before copying it, we write the header first, then the + * raw bytes of the string. + */ + WT_ERR(__wt_struct_size(session, &header_size, rec_fmt, rectype)); + WT_ERR(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, header_size, + rec_fmt, rectype)); + logrec->size += (uint32_t)header_size; - WT_VERBOSE_RET(session, log, - "log_printf: %s", (char *)logrec->record); - WT_RET(__wt_log_write(session, buf, NULL, 0)); - __wt_scr_free(&buf); - return (0); + (void)vsnprintf((char *)logrec->data + logrec->size, len, fmt, ap); + + WT_VERBOSE_ERR(session, log, + "log_printf: %s", (char *)logrec->data + logrec->size); + + logrec->size += len; + WT_ERR(__wt_log_write(session, logrec, NULL, 0)); +err: __wt_scr_free(&logrec); + return (ret); } diff --git a/src/log/log_auto.c b/src/log/log_auto.c new file mode 100644 index 00000000000..50d80d67434 --- /dev/null +++ b/src/log/log_auto.c @@ -0,0 +1,431 @@ +/* DO NOT EDIT: automatically built by dist/log.py. */ + +#include "wt_internal.h" + +int +__wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp) +{ + WT_ITEM *logrec; + + WT_RET(__wt_scr_alloc(session, WT_ALIGN(size + 1, LOG_ALIGN), &logrec)); + WT_CLEAR(*(WT_LOG_RECORD *)logrec->data); + logrec->size = offsetof(WT_LOG_RECORD, record); + + *logrecp = logrec; + return (0); +} + +void +__wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp) +{ + WT_UNUSED(session); + __wt_scr_free(logrecp); +} + +int +__wt_logrec_read(WT_SESSION_IMPL *session, + const uint8_t **pp, const uint8_t *end, uint32_t *rectypep) +{ + uint64_t rectype; + + WT_UNUSED(session); + WT_RET(__wt_vunpack_uint(pp, WT_PTRDIFF(end, *pp), &rectype)); + *rectypep = (uint32_t)rectype; + return (0); +} + +int +__wt_logop_read(WT_SESSION_IMPL *session, + const uint8_t **pp, const uint8_t *end, + uint32_t *optypep, uint32_t *opsizep) +{ + return (__wt_struct_unpack( + session, *pp, WT_PTRDIFF(end, *pp), "II", optypep, opsizep)); +} + +int +__wt_logop_col_put_pack( + WT_SESSION_IMPL *session, WT_ITEM *logrec, + uint32_t fileid, uint64_t recno, WT_ITEM *value) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIru); + size_t size; + uint32_t optype, recsize; + + optype = WT_LOGOP_COL_PUT; + WT_RET(__wt_struct_size(session, &size, fmt, + optype, 0, fileid, recno, value)); + + size += __wt_vsize_uint(size) - 1; + WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); + recsize = (uint32_t)size; + WT_RET(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, size, fmt, + optype, recsize, fileid, recno, value)); + + logrec->size += (uint32_t)size; + return (0); +} + +int +__wt_logop_col_put_unpack( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIru); + uint32_t optype, size; + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &optype, &size, fileidp, recnop, valuep)); + WT_ASSERT(session, optype == WT_LOGOP_COL_PUT); + + *pp += size; + return (0); +} + +int +__wt_logop_col_put_print( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t fileid; + uint64_t recno; + WT_ITEM value; + + WT_RET(__wt_logop_col_put_unpack( + session, pp, end, &fileid, &recno, &value)); + + fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); + fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno); + fprintf(out, " \"value\": \"%.*s\",\n", + (int)value.size, (const char *)value.data); + return (0); +} + +int +__wt_logop_col_remove_pack( + WT_SESSION_IMPL *session, WT_ITEM *logrec, + uint32_t fileid, uint64_t recno) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIr); + size_t size; + uint32_t optype, recsize; + + optype = WT_LOGOP_COL_REMOVE; + WT_RET(__wt_struct_size(session, &size, fmt, + optype, 0, fileid, recno)); + + size += __wt_vsize_uint(size) - 1; + WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); + recsize = (uint32_t)size; + WT_RET(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, size, fmt, + optype, recsize, fileid, recno)); + + logrec->size += (uint32_t)size; + return (0); +} + +int +__wt_logop_col_remove_unpack( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + uint32_t *fileidp, uint64_t *recnop) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIr); + uint32_t optype, size; + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &optype, &size, fileidp, recnop)); + WT_ASSERT(session, optype == WT_LOGOP_COL_REMOVE); + + *pp += size; + return (0); +} + +int +__wt_logop_col_remove_print( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t fileid; + uint64_t recno; + + WT_RET(__wt_logop_col_remove_unpack( + session, pp, end, &fileid, &recno)); + + fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); + fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno); + return (0); +} + +int +__wt_logop_col_truncate_pack( + WT_SESSION_IMPL *session, WT_ITEM *logrec, + uint32_t fileid, uint64_t start, uint64_t stop) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIrr); + size_t size; + uint32_t optype, recsize; + + optype = WT_LOGOP_COL_TRUNCATE; + WT_RET(__wt_struct_size(session, &size, fmt, + optype, 0, fileid, start, stop)); + + size += __wt_vsize_uint(size) - 1; + WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); + recsize = (uint32_t)size; + WT_RET(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, size, fmt, + optype, recsize, fileid, start, stop)); + + logrec->size += (uint32_t)size; + return (0); +} + +int +__wt_logop_col_truncate_unpack( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + uint32_t *fileidp, uint64_t *startp, uint64_t *stopp) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIrr); + uint32_t optype, size; + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &optype, &size, fileidp, startp, stopp)); + WT_ASSERT(session, optype == WT_LOGOP_COL_TRUNCATE); + + *pp += size; + return (0); +} + +int +__wt_logop_col_truncate_print( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t fileid; + uint64_t start; + uint64_t stop; + + WT_RET(__wt_logop_col_truncate_unpack( + session, pp, end, &fileid, &start, &stop)); + + fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); + fprintf(out, " \"start\": \"%" PRIu64 "\",\n", start); + fprintf(out, " \"stop\": \"%" PRIu64 "\",\n", stop); + return (0); +} + +int +__wt_logop_row_put_pack( + WT_SESSION_IMPL *session, WT_ITEM *logrec, + uint32_t fileid, WT_ITEM *key, WT_ITEM *value) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIuu); + size_t size; + uint32_t optype, recsize; + + optype = WT_LOGOP_ROW_PUT; + WT_RET(__wt_struct_size(session, &size, fmt, + optype, 0, fileid, key, value)); + + size += __wt_vsize_uint(size) - 1; + WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); + recsize = (uint32_t)size; + WT_RET(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, size, fmt, + optype, recsize, fileid, key, value)); + + logrec->size += (uint32_t)size; + return (0); +} + +int +__wt_logop_row_put_unpack( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIuu); + uint32_t optype, size; + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &optype, &size, fileidp, keyp, valuep)); + WT_ASSERT(session, optype == WT_LOGOP_ROW_PUT); + + *pp += size; + return (0); +} + +int +__wt_logop_row_put_print( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t fileid; + WT_ITEM key; + WT_ITEM value; + + WT_RET(__wt_logop_row_put_unpack( + session, pp, end, &fileid, &key, &value)); + + fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); + fprintf(out, " \"key\": \"%.*s\",\n", + (int)key.size, (const char *)key.data); + fprintf(out, " \"value\": \"%.*s\",\n", + (int)value.size, (const char *)value.data); + return (0); +} + +int +__wt_logop_row_remove_pack( + WT_SESSION_IMPL *session, WT_ITEM *logrec, + uint32_t fileid, WT_ITEM *key) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIu); + size_t size; + uint32_t optype, recsize; + + optype = WT_LOGOP_ROW_REMOVE; + WT_RET(__wt_struct_size(session, &size, fmt, + optype, 0, fileid, key)); + + size += __wt_vsize_uint(size) - 1; + WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); + recsize = (uint32_t)size; + WT_RET(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, size, fmt, + optype, recsize, fileid, key)); + + logrec->size += (uint32_t)size; + return (0); +} + +int +__wt_logop_row_remove_unpack( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + uint32_t *fileidp, WT_ITEM *keyp) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIu); + uint32_t optype, size; + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &optype, &size, fileidp, keyp)); + WT_ASSERT(session, optype == WT_LOGOP_ROW_REMOVE); + + *pp += size; + return (0); +} + +int +__wt_logop_row_remove_print( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t fileid; + WT_ITEM key; + + WT_RET(__wt_logop_row_remove_unpack( + session, pp, end, &fileid, &key)); + + fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); + fprintf(out, " \"key\": \"%.*s\",\n", + (int)key.size, (const char *)key.data); + return (0); +} + +int +__wt_logop_row_truncate_pack( + WT_SESSION_IMPL *session, WT_ITEM *logrec, + uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIuuI); + size_t size; + uint32_t optype, recsize; + + optype = WT_LOGOP_ROW_TRUNCATE; + WT_RET(__wt_struct_size(session, &size, fmt, + optype, 0, fileid, start, stop, mode)); + + size += __wt_vsize_uint(size) - 1; + WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); + recsize = (uint32_t)size; + WT_RET(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, size, fmt, + optype, recsize, fileid, start, stop, mode)); + + logrec->size += (uint32_t)size; + return (0); +} + +int +__wt_logop_row_truncate_unpack( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep) +{ + const char *fmt = WT_UNCHECKED_STRING(IIIuuI); + uint32_t optype, size; + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &optype, &size, fileidp, startp, stopp, modep)); + WT_ASSERT(session, optype == WT_LOGOP_ROW_TRUNCATE); + + *pp += size; + return (0); +} + +int +__wt_logop_row_truncate_print( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t fileid; + WT_ITEM start; + WT_ITEM stop; + uint32_t mode; + + WT_RET(__wt_logop_row_truncate_unpack( + session, pp, end, &fileid, &start, &stop, &mode)); + + fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); + fprintf(out, " \"start\": \"%.*s\",\n", + (int)start.size, (const char *)start.data); + fprintf(out, " \"stop\": \"%.*s\",\n", + (int)stop.size, (const char *)stop.data); + fprintf(out, " \"mode\": \"%" PRIu32 "\",\n", mode); + return (0); +} + +int +__wt_txn_op_printlog( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + uint32_t optype, opsize; + + /* Peek at the size and the type. */ + WT_RET(__wt_logop_read(session, pp, end, &optype, &opsize)); + end = *pp + opsize; + + switch (optype) { + case WT_LOGOP_COL_PUT: + WT_RET(__wt_logop_col_put_print(session, pp, end, out)); + break; + + case WT_LOGOP_COL_REMOVE: + WT_RET(__wt_logop_col_remove_print(session, pp, end, out)); + break; + + case WT_LOGOP_COL_TRUNCATE: + WT_RET(__wt_logop_col_truncate_print(session, pp, end, out)); + break; + + case WT_LOGOP_ROW_PUT: + WT_RET(__wt_logop_row_put_print(session, pp, end, out)); + break; + + case WT_LOGOP_ROW_REMOVE: + WT_RET(__wt_logop_row_remove_print(session, pp, end, out)); + break; + + case WT_LOGOP_ROW_TRUNCATE: + WT_RET(__wt_logop_row_truncate_print(session, pp, end, out)); + break; + + WT_ILLEGAL_VALUE(session); + } + + return (0); +} diff --git a/src/log/log_desc.c b/src/log/log_desc.c deleted file mode 100644 index 6cfe18fc441..00000000000 --- a/src/log/log_desc.c +++ /dev/null @@ -1 +0,0 @@ -/* DO NOT EDIT: automatically built by dist/log.py. */ diff --git a/src/log/log_slot.c b/src/log/log_slot.c index de0f7b2bd73..2e46698c431 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -27,9 +27,10 @@ int __wt_log_slot_init(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; + WT_DECL_RET; WT_LOG *log; WT_LOGSLOT *slot; - uint32_t i; + int32_t i; conn = S2C(session); log = conn->log; @@ -43,16 +44,59 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) */ for (i = 0; i < SLOT_ACTIVE; i++) { slot = &log->slot_pool[i]; - slot->slot_index = i; + slot->slot_index = (uint32_t)i; slot->slot_state = WT_LOG_SLOT_READY; log->slot_array[i] = slot; } + + /* + * Allocate memory for buffers now that the arrays are setup. Split + * this out to make error handling simpler. + */ + for (i = 0; i < SLOT_POOL; i++) { + WT_ERR(__wt_buf_init(session, + &log->slot_pool[i].slot_buf, WT_LOG_SLOT_BUF_INIT_SIZE)); + F_SET(&log->slot_pool[i], SLOT_BUFFERED); + WT_ERR(__wt_cond_alloc(session, + "slot pool done", 0, &log->slot_pool[i].slot_done_cond)); + } + WT_STAT_FAST_CONN_INCRV(session, + log_buffer_size, WT_LOG_SLOT_BUF_INIT_SIZE * SLOT_POOL); + if (0) { +err: while (--i >= 0) + __wt_buf_free(session, &log->slot_pool[i].slot_buf); + } + return (ret); +} + +/* + * __wt_log_slot_destroy -- + * Clean up the slot array on shutdown. + */ +int +__wt_log_slot_destroy(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + int i; + + conn = S2C(session); + log = conn->log; + + for (i = 0; i < SLOT_POOL; i++) { + __wt_buf_free(session, &log->slot_pool[i].slot_buf); + WT_TRET(__wt_cond_destroy( + session, &log->slot_pool[i].slot_done_cond)); + } return (0); } /* * __wt_log_slot_join -- - * Join a consolidated logging slot. + * Join a consolidated logging slot. Callers should be prepared to deal + * with a ENOMEM return - which indicates no slots could accommodate + * the log record. */ int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, @@ -62,13 +106,14 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, WT_LOG *log; WT_LOGSLOT *slot; int64_t cur_state, new_state, old_state; - uint32_t i; + uint32_t allocated_slot, slot_grow_attempts; conn = S2C(session); log = conn->log; + slot_grow_attempts = 0; find_slot: - i = __wt_random() % SLOT_ACTIVE; - slot = log->slot_array[i]; + allocated_slot = __wt_random() % SLOT_ACTIVE; + slot = log->slot_array[allocated_slot]; old_state = slot->slot_state; join_slot: /* @@ -90,6 +135,18 @@ join_slot: WT_STAT_FAST_CONN_INCR(session, log_slot_toobig); goto find_slot; } + /* + * If the slot buffer isn't big enough to hold this update, mark + * the slot for a buffer size increase and find another slot. + */ + if (new_state > (int64_t)slot->slot_buf.memsize) { + F_SET(slot, SLOT_BUF_GROW); + if (++slot_grow_attempts > 5) { + WT_STAT_FAST_CONN_INCR(session, log_slot_toosmall); + return (ENOMEM); + } + goto find_slot; + } cur_state = WT_ATOMIC_CAS_VAL(slot->slot_state, old_state, new_state); /* * We lost a race to add our size into this slot. Check the state @@ -107,7 +164,7 @@ join_slot: */ WT_STAT_FAST_CONN_INCR(session, log_slot_joins); if (LF_ISSET(WT_LOG_FSYNC)) - FLD_SET(slot->slot_flags, SLOT_SYNC); + F_SET(slot, SLOT_SYNC); myslotp->slot = slot; myslotp->offset = (off_t)old_state - WT_LOG_SLOT_READY; return (0); @@ -127,10 +184,12 @@ __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_LOG *log; WT_LOGSLOT *newslot; int64_t old_state; - uint32_t pool_i; + int32_t yields; + uint32_t pool_i, switch_fails; conn = S2C(session); log = conn->log; + switch_fails = 0; retry: /* * Find an unused slot in the pool. @@ -139,8 +198,28 @@ retry: newslot = &log->slot_pool[pool_i]; if (++log->pool_index >= SLOT_POOL) log->pool_index = 0; - if (newslot->slot_state != WT_LOG_SLOT_FREE) + if (newslot->slot_state != WT_LOG_SLOT_FREE) { + WT_STAT_FAST_CONN_INCR(session, log_slot_switch_fails); + /* + * If it takes a number of attempts to find an available slot + * it's likely all slots are waiting to be released. This + * churn is used to change how long we pause before closing + * the slot - which leads to more consolidation and less churn. + */ + if (++switch_fails % SLOT_POOL == 0 && + switch_fails != 0 && slot->slot_churn < 5) + ++slot->slot_churn; + __wt_yield(); goto retry; + } else if (slot->slot_churn > 0) { + --slot->slot_churn; + WT_ASSERT(session, slot->slot_churn >= 0); + } + + /* Pause to allow other threads a chance to consolidate. */ + for (yields = slot->slot_churn; yields >= 0; yields--) + __wt_yield(); + /* * Swap out the slot we're going to use and put a free one in the * slot array in its place so that threads can use it right away. @@ -168,10 +247,11 @@ retry: * Notify all threads waiting for the state to be < WT_LOG_SLOT_DONE. */ int -__wt_log_slot_notify(WT_LOGSLOT *slot) +__wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { slot->slot_state = (int64_t)WT_LOG_SLOT_DONE - (int64_t)slot->slot_group_size; + WT_RET(__wt_cond_signal(session, slot->slot_done_cond)); return (0); } @@ -180,15 +260,26 @@ __wt_log_slot_notify(WT_LOGSLOT *slot) * Wait for slot leader to allocate log area and tell us our log offset. */ int -__wt_log_slot_wait(WT_LOGSLOT *slot) +__wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { - while (slot->slot_state > WT_LOG_SLOT_DONE) - __wt_yield(); + while (slot->slot_state > WT_LOG_SLOT_DONE) { + /* + * Workloads with fast commits (no-sync is a reasonable + * approximation) benefit from yielding rather than using the + * more heavy weight condition wait. + */ + if (S2C(session)->txn_logsync == 0) + __wt_yield(); + else if (__wt_cond_wait(session, + slot->slot_done_cond, 10000) == ETIMEDOUT) + WT_STAT_FAST_CONN_INCR(session, + log_slot_ready_wait_timeout); + } return (0); } /* - * _wt_log_slot_release -- + * __wt_log_slot_release -- * Each thread in a consolidated group releases its portion to * signal it has completed writing its piece of the log. */ @@ -215,3 +306,63 @@ __wt_log_slot_free(WT_LOGSLOT *slot) slot->slot_state = WT_LOG_SLOT_FREE; return (0); } + +/* + * __wt_log_slot_grow_buffers -- + * Increase the buffer size of all available slots in the buffer pool. + * Go to some lengths to include active (but unused) slots to handle + * the case where all log write record sizes exceed the size of the + * active buffer. + */ +int +__wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, int64_t newsize) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + WT_LOGSLOT *slot; + int64_t orig_state; + uint64_t old_size, total_growth; + int i; + + conn = S2C(session); + log = conn->log; + total_growth = 0; + WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); + /* + * Take the log slot lock to prevent other threads growing buffers + * at the same time. Could tighten the scope of this lock, or have + * a separate lock if there is contention. + */ + __wt_spin_lock(session, &log->log_slot_lock); + for (i = 0; i < SLOT_POOL; i++) { + slot = &log->slot_pool[i]; + /* Avoid atomic operations if they won't succeed. */ + if (slot->slot_state != WT_LOG_SLOT_FREE && + slot->slot_state != WT_LOG_SLOT_READY) + continue; + /* Don't keep growing unrelated buffers. */ + if (slot->slot_buf.memsize > (size_t)(10 * newsize) && + !F_ISSET(slot, SLOT_BUF_GROW)) + continue; + orig_state = WT_ATOMIC_CAS_VAL( + slot->slot_state, WT_LOG_SLOT_FREE, WT_LOG_SLOT_PENDING); + if (orig_state != WT_LOG_SLOT_FREE) { + orig_state = WT_ATOMIC_CAS_VAL(slot->slot_state, + WT_LOG_SLOT_READY, WT_LOG_SLOT_PENDING); + if (orig_state != WT_LOG_SLOT_READY) + continue; + } + + /* We have a slot - now go ahead and grow the buffer. */ + old_size = slot->slot_buf.memsize; + F_CLR(slot, SLOT_BUF_GROW); + WT_ERR(__wt_buf_grow(session, &slot->slot_buf, + WT_MAX(slot->slot_buf.memsize * 2, (size_t)newsize))); + slot->slot_state = orig_state; + total_growth += slot->slot_buf.memsize - old_size; + } +err: __wt_spin_unlock(session, &log->log_slot_lock); + WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, total_growth); + return (ret); +} diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index 68fbfa4c9d0..40d382e0e72 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -223,7 +223,7 @@ __wt_lsm_tree_setup_chunk( } /* - * __wt_lsm_start_worker -- + * __lsm_tree_start_worker -- * Start the worker thread for an LSM tree. */ static int @@ -526,7 +526,7 @@ __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) } /* - * __lsm_tree_throttle -- + * __wt_lsm_tree_throttle -- * Calculate whether LSM updates need to be throttled. */ void diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c index ca1759f106b..22e6df5767b 100644 --- a/src/lsm/lsm_worker.c +++ b/src/lsm/lsm_worker.c @@ -156,7 +156,7 @@ __wt_lsm_merge_worker(void *vargs) * The "main" thread polls 10 times per second, * secondary threads once per second. */ - WT_ERR(__wt_cond_wait( + WT_ERR_TIMEDOUT_OK(__wt_cond_wait( session, lsm_tree->work_cond, id == 0 ? 100000 : 1000000)); stallms += (id == 0) ? 100 : 1000; @@ -409,7 +409,7 @@ __wt_lsm_checkpoint_worker(void *arg) __lsm_unpin_chunks(session, &cookie); if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) - WT_ERR(__wt_cond_wait( + WT_ERR_TIMEDOUT_OK(__wt_cond_wait( session, lsm_tree->work_cond, 100000)); } err: __lsm_unpin_chunks(session, &cookie); diff --git a/src/meta/meta_ckpt.c b/src/meta/meta_ckpt.c index 344275876d6..273b93ef0e6 100644 --- a/src/meta/meta_ckpt.c +++ b/src/meta/meta_ckpt.c @@ -365,8 +365,8 @@ format: * Set a file's checkpoint value from the WT_CKPT list. */ int -__wt_meta_ckptlist_set( - WT_SESSION_IMPL *session, const char *fname, WT_CKPT *ckptbase) +__wt_meta_ckptlist_set(WT_SESSION_IMPL *session, + const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn) { struct timespec ts; WT_CKPT *ckpt; @@ -443,6 +443,10 @@ __wt_meta_ckptlist_set( sep = ","; } WT_ERR(__wt_buf_catfmt(session, buf, ")")); + if (ckptlsn != NULL) + WT_ERR(__wt_buf_catfmt(session, buf, + ",checkpoint_lsn=(%" PRIu32 ",%" PRIuMAX ")", + ckptlsn->file, (uintmax_t)ckptlsn->offset)); WT_ERR(__ckpt_set(session, fname, buf->mem)); err: __wt_scr_free(&buf); diff --git a/src/meta/meta_track.c b/src/meta/meta_track.c index d9828ee6cb8..3992687f559 100644 --- a/src/meta/meta_track.c +++ b/src/meta/meta_track.c @@ -89,6 +89,10 @@ __wt_meta_track_on(WT_SESSION_IMPL *session) return (0); } +/* + * __meta_track_apply -- + * Apply the changes in a metadata tracking record. + */ static int __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) { @@ -151,7 +155,7 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) case WT_ST_REMOVE: /* Remove trk.a */ if ((tret = __wt_metadata_remove( session, trk->a)) != 0) { - __wt_err(session, ret, + __wt_err(session, tret, "metadata unroll remove: %s", trk->a); WT_TRET(tret); @@ -160,7 +164,7 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) case WT_ST_SET: /* Set trk.a to trk.b */ if ((tret = __wt_metadata_update( session, trk->a, trk->b)) != 0) { - __wt_err(session, ret, + __wt_err(session, tret, "metadata unroll update %s to %s", trk->a, trk->b); WT_TRET(tret); @@ -320,8 +324,8 @@ __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key) } /* - * __wt_meta_track_fs_rename -- - * Track a filesystem rename operation. + * __wt_meta_track_fileop -- + * Track a filesystem operation. */ int __wt_meta_track_fileop( diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c index 0292de522ed..166975d58fb 100644 --- a/src/meta/meta_turtle.c +++ b/src/meta/meta_turtle.c @@ -27,7 +27,7 @@ __metadata_config(WT_SESSION_IMPL *session, const char **metaconfp) /* Create a turtle file with default values. */ WT_RET(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_buf_fmt(session, buf, - "key_format=S,value_format=S,version=(major=%d,minor=%d)", + "key_format=S,value_format=S,id=0,version=(major=%d,minor=%d)", WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX)); cfg[1] = buf->data; WT_ERR(__wt_config_collapse(session, cfg, &metaconf)); diff --git a/src/os_posix/os_dir.c b/src/os_posix/os_dir.c index fab75184971..b0b7c799181 100644 --- a/src/os_posix/os_dir.c +++ b/src/os_posix/os_dir.c @@ -18,21 +18,22 @@ int __wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix, uint32_t flags, char ***dirlist, u_int *countp) { - WT_DECL_RET; struct dirent *dp; DIR *dirp; + WT_DECL_RET; const char *path; - char **entries; size_t dirallocsz; u_int count, dirsz; int match; + char **entries; + + *dirlist = NULL; + *countp = 0; WT_RET(__wt_filename(session, dir, &path)); - *countp = 0; - dirallocsz = 0; - *dirlist = NULL; dirp = NULL; + dirallocsz = 0; dirsz = 0; entries = NULL; if (flags == 0) diff --git a/src/os_posix/os_mtx.c b/src/os_posix/os_mtx.c index e6afa5c653b..762668b1672 100644 --- a/src/os_posix/os_mtx.c +++ b/src/os_posix/os_mtx.c @@ -52,7 +52,7 @@ err: __wt_free(session, cond); } /* - * __wt_cond_wait + * __wt_cond_wait -- * Wait on a mutex, optionally timing out. */ int @@ -192,7 +192,7 @@ err: __wt_free(session, rwlock); } /* - * __wt_readlock + * __wt_readlock -- * Get a shared lock. */ int @@ -210,7 +210,7 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) } /* - * __wt_try_writelock + * __wt_try_writelock -- * Try to get an exclusive lock, or fail immediately if unavailable. */ int @@ -229,7 +229,7 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) } /* - * __wt_writelock + * __wt_writelock -- * Wait to get an exclusive lock. */ int diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c index dddf9a91485..f6b141c4da2 100644 --- a/src/os_posix/os_open.c +++ b/src/os_posix/os_open.c @@ -8,7 +8,7 @@ #include "wt_internal.h" /* - * __open_directory_sync: + * __open_directory_sync -- * Fsync the directory in which we created the file. */ static int diff --git a/src/os_posix/os_strtouq.c b/src/os_posix/os_strtouq.c index cb445261ce5..b532918e241 100644 --- a/src/os_posix/os_strtouq.c +++ b/src/os_posix/os_strtouq.c @@ -8,7 +8,8 @@ #include "wt_internal.h" /* - * Convert a string to an unsigned quad integer. + * __wt_strtouq -- + * Convert a string to an unsigned quad integer. */ uint64_t __wt_strtouq(const char *nptr, char **endptr, int base) diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c index 52e29ba7ed1..fe1ede05393 100644 --- a/src/schema/schema_create.c +++ b/src/schema/schema_create.c @@ -46,6 +46,10 @@ __wt_direct_io_size_check(WT_SESSION_IMPL *session, return (0); } +/* + * __create_file -- + * Create a new 'file:' object. + */ static int __create_file(WT_SESSION_IMPL *session, const char *uri, int exclusive, const char *config) @@ -84,13 +88,15 @@ __create_file(WT_SESSION_IMPL *session, WT_ERR(__wt_meta_track_fileop(session, NULL, uri)); /* - * If creating an ordinary file, append the current version numbers to - * the passed-in configuration and insert the resulting configuration - * into the metadata. + * If creating an ordinary file, append the file ID and current version + * numbers to the passed-in configuration and insert the resulting + * configuration into the metadata. */ if (!is_metadata) { WT_ERR(__wt_scr_alloc(session, 0, &val)); - WT_ERR(__wt_buf_fmt(session, val, "version=(major=%d,minor=%d)", + WT_ERR(__wt_buf_fmt(session, val, + "id=%" PRIu32 ",version=(major=%d,minor=%d)", + ++S2C(session)->next_file_id, WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX)); for (p = filecfg; *p != NULL; ++p) ; @@ -120,6 +126,10 @@ err: __wt_scr_free(&val); return (ret); } +/* + * __wt_schema_colgroup_source -- + * Get the URI of the data source for a column group. + */ int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf) @@ -152,6 +162,10 @@ __wt_schema_colgroup_source(WT_SESSION_IMPL *session, return (0); } +/* + * __create_colgroup -- + * Create a column group. + */ static int __create_colgroup(WT_SESSION_IMPL *session, const char *name, int exclusive, const char *config) @@ -257,6 +271,10 @@ err: __wt_free(session, cgconf); return (ret); } +/* + * __wt_schema_index_source -- + * Get the URI of the data source for an index. + */ int __wt_schema_index_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf) @@ -285,6 +303,10 @@ __wt_schema_index_source(WT_SESSION_IMPL *session, return (0); } +/* + * __create_index -- + * Create an index. + */ static int __create_index(WT_SESSION_IMPL *session, const char *name, int exclusive, const char *config) @@ -415,6 +437,10 @@ err: __wt_free(session, idxconf); return (ret); } +/* + * __create_table -- + * Create a table. + */ static int __create_table(WT_SESSION_IMPL *session, const char *name, int exclusive, const char *config) @@ -488,6 +514,10 @@ err: if (table != NULL) { return (ret); } +/* + * __create_data_source -- + * Create a custom data source. + */ static int __create_data_source(WT_SESSION_IMPL *session, const char *uri, const char *config, WT_DATA_SOURCE *dsrc) @@ -517,6 +547,10 @@ __create_data_source(WT_SESSION_IMPL *session, return (dsrc->create(dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg)); } +/* + * __wt_schema_create -- + * Process a WT_SESSION::create operation for all supported types. + */ int __wt_schema_create( WT_SESSION_IMPL *session, const char *uri, const char *config) diff --git a/src/schema/schema_drop.c b/src/schema/schema_drop.c index 91a15f477d6..c4cb864a5de 100644 --- a/src/schema/schema_drop.c +++ b/src/schema/schema_drop.c @@ -157,6 +157,10 @@ err: if (force && ret == WT_NOTFOUND) return (ret); } +/* + * __wt_schema_drop -- + * Process a WT_SESSION::drop operation for all supported types. + */ int __wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) { diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c index 372968ea221..75fb425f37d 100644 --- a/src/schema/schema_open.c +++ b/src/schema/schema_open.c @@ -114,7 +114,7 @@ err: __wt_scr_free(&buf); } /* - * ___open_index -- + * __open_index -- * Open an index. */ static int @@ -208,8 +208,8 @@ err: __wt_scr_free(&buf); } /* - * __wt_schema_open_indices -- - * Open the indices for a table. + * __wt_schema_open_index -- + * Open one or more indices for a table. */ int __wt_schema_open_index(WT_SESSION_IMPL *session, diff --git a/src/schema/schema_plan.c b/src/schema/schema_plan.c index acde7a11183..68c2eb0f29f 100644 --- a/src/schema/schema_plan.c +++ b/src/schema/schema_plan.c @@ -7,6 +7,10 @@ #include "wt_internal.h" +/* + * __find_next_col -- + * Find the next column to use for a plan. + */ static int __find_next_col(WT_SESSION_IMPL *session, WT_TABLE *table, WT_CONFIG_ITEM *colname, u_int *cgnump, u_int *colnump, char *coltype) @@ -253,6 +257,10 @@ __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, return (0); } +/* + * __find_column_format -- + * Find the format of the named column. + */ static int __find_column_format(WT_SESSION_IMPL *session, WT_TABLE *table, WT_CONFIG_ITEM *colname, int value_only, WT_PACK_VALUE *pv) diff --git a/src/session/session_api.c b/src/session/session_api.c index 04c32b794ba..133348f2a01 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -340,11 +340,13 @@ __session_log_printf(WT_SESSION *wt_session, const char *fmt, ...) va_list ap; session = (WT_SESSION_IMPL *)wt_session; + SESSION_API_CALL_NO_CONF(session, log_printf); va_start(ap, fmt); - ret =__wt_log_vprintf(session, fmt, ap); + ret = __wt_log_vprintf(session, fmt, ap); va_end(ap); +err: API_END(session); return (ret); } diff --git a/src/session/session_compact.c b/src/session/session_compact.c index 2337ee16050..502c35f6775 100644 --- a/src/session/session_compact.c +++ b/src/session/session_compact.c @@ -8,90 +8,94 @@ #include "wt_internal.h" /* -Compaction is the place where the underlying block manager becomes visible in -the higher engine btree and API layers. As there is currently only one block -manager, this code is written with it in mind: other block managers may need -changes to support compaction, and a smart block manager might need far less -support from the engine. - -First, the default block manager cannot entirely own compaction because it has -no way to find a block after it moves other than a request from the btree layer -with the new address. In other words, if internal page X points to leaf page Y, -and page Y moves, the address of page Y has to be updated in page X. Generally, -this is solved by building a translation layer in the block manager so internal -pages don't require updates to relocate blocks: however, the translation table -must be durable, has its own garbage collection issues and might be slower, all -of which have their own problems. - -Second, the btree layer cannot entirely own compaction because page addresses -are opaque, it cannot know where a page is in the file from the address cookie. - -For these reasons, compaction is a cooperative process between the btree layer -and the block manager. The btree layer walks files, and asks the block manager -if rewriting a particular block would reduce the file footprint: if writing the -page will help, the page is marked dirty so it will eventually be written. As -pages are written, the original page potentially becomes available for reuse -and if enough pages at the end of the file are available for reuse, the file can -be truncated, and compaction succeeds. - -However, writing a page is not by itself sufficient to make a page available -for reuse. The original version of the page is still referenced by at least -the most recent checkpoint in the file. To make a page available for reuse, -we have to checkpoint the file so we can discard the checkpoint referencing -the original version of the block; once no checkpoint references a block, it -becomes available for reuse. - -Compaction is not necessarily possible in WiredTiger, even in a file with lots -of available space. If a block at the end of the file is referenced by a named -checkpoint, there is nothing we can do to compact the file, no matter how many -times we rewrite the block, the named checkpoint can't be discarded and so the -reference count on the original block will never go to zero. What's worse, -because the block manager doesn't reference count blocks, it can't easily know -this is the case, and so we'll waste a lot of effort trying to compact files -that can't be compacted. - -Now, to the actual process. First, we checkpoint the high-level object (which -is potentially composed of multiple files): there are potentially many dirty -blocks in the cache, and we want to write them out and then discard previous -checkpoints so we have as many blocks as possible on the file's "available for -reuse" list when we start compaction. - -Then, we compact the high-level object. - -Compacting the object is done 10% at a time, that is, we try and move blocks -from the last 10% of the file into the beginning of the file (the 10% is hard -coded in the block manager). The reason for this is because we are walking -the file in logical order, not block offset order, and we can fail to compact -a file if we write the wrong blocks first. - -For example, imagine a file with 10 blocks in the first 10% of a file, 1,000 -blocks in the 3rd quartile of the file, and 10 blocks in the last 10% of the -file. If we were to rewrite blocks from more than the last 10% of the file, -and found the 1,000 blocks in the 3rd quartile of the file first, we'd copy -10 of them without ever rewriting the blocks from the end of the file which -would allow us to compact the file. So, we compact the last 10% of the file, -and if that works, we compact the last 10% of the file again, and so on. Note -the block manager uses a first-fit block selection algorithm during compaction -to maximize block movement. - -After each 10% compaction, we checkpoint two more times (seriously, twice). -The second and third checkpoints are because the block manager checkpoints in -two steps: blocks made available for reuse during a checkpoint are put on a -special checkpoint-available list and only moved to the real available list -after the metadata has been updated with the new checkpoint's information. -(Otherwise it is possible to allocate a rewritten block, crash before the -metadata is updated, and see corruption.) For this reason, blocks allocated -to write the checkpoint itself cannot be taken from the blocks made available -by the checkpoint. - -To say it another way, the second checkpoint puts the blocks from the end of -the file that were made available by compaction onto the checkpoint-available -list, but then potentially writes the checkpoint itself at the end of the -file, which would prevent any file truncation. When the metadata is updated -for the second checkpoint, the blocks freed by compaction become available -for the third checkpoint, so the third checkpoint's blocks are written towards -the beginning of the file, and then the file can be truncated. -*/ + * Compaction is the place where the underlying block manager becomes visible + * in the higher engine btree and API layers. As there is currently only one + * block manager, this code is written with it in mind: other block managers + * may need changes to support compaction, and a smart block manager might need + * far less support from the engine. + * + * First, the default block manager cannot entirely own compaction because it + * has no way to find a block after it moves other than a request from the + * btree layer with the new address. In other words, if internal page X points + * to leaf page Y, and page Y moves, the address of page Y has to be updated in + * page X. Generally, this is solved by building a translation layer in the + * block manager so internal pages don't require updates to relocate blocks: + * however, the translation table must be durable, has its own garbage + * collection issues and might be slower, all of which have their own problems. + * + * Second, the btree layer cannot entirely own compaction because page + * addresses are opaque, it cannot know where a page is in the file from the + * address cookie. + * + * For these reasons, compaction is a cooperative process between the btree + * layer and the block manager. The btree layer walks files, and asks the + * block manager if rewriting a particular block would reduce the file + * footprint: if writing the page will help, the page is marked dirty so it + * will eventually be written. As pages are written, the original page + * potentially becomes available for reuse and if enough pages at the end of + * the file are available for reuse, the file can be truncated, and compaction + * succeeds. + * + * However, writing a page is not by itself sufficient to make a page available + * for reuse. The original version of the page is still referenced by at least + * the most recent checkpoint in the file. To make a page available for reuse, + * we have to checkpoint the file so we can discard the checkpoint referencing + * the original version of the block; once no checkpoint references a block, it + * becomes available for reuse. + * + * Compaction is not necessarily possible in WiredTiger, even in a file with + * lots of available space. If a block at the end of the file is referenced by + * a named checkpoint, there is nothing we can do to compact the file, no + * matter how many times we rewrite the block, the named checkpoint can't be + * discarded and so the reference count on the original block will never go to + * zero. What's worse, because the block manager doesn't reference count + * blocks, it can't easily know this is the case, and so we'll waste a lot of + * effort trying to compact files that can't be compacted. + * + * Now, to the actual process. First, we checkpoint the high-level object + * (which is potentially composed of multiple files): there are potentially + * many dirty blocks in the cache, and we want to write them out and then + * discard previous checkpoints so we have as many blocks as possible on the + * file's "available for reuse" list when we start compaction. + * + * Then, we compact the high-level object. + * + * Compacting the object is done 10% at a time, that is, we try and move blocks + * from the last 10% of the file into the beginning of the file (the 10% is + * hard coded in the block manager). The reason for this is because we are + * walking the file in logical order, not block offset order, and we can fail + * to compact a file if we write the wrong blocks first. + * + * For example, imagine a file with 10 blocks in the first 10% of a file, 1,000 + * blocks in the 3rd quartile of the file, and 10 blocks in the last 10% of the + * file. If we were to rewrite blocks from more than the last 10% of the file, + * and found the 1,000 blocks in the 3rd quartile of the file first, we'd copy + * 10 of them without ever rewriting the blocks from the end of the file which + * would allow us to compact the file. So, we compact the last 10% of the + * file, and if that works, we compact the last 10% of the file again, and so + * on. Note the block manager uses a first-fit block selection algorithm + * during compaction to maximize block movement. + * + * After each 10% compaction, we checkpoint two more times (seriously, twice). + * The second and third checkpoints are because the block manager checkpoints + * in two steps: blocks made available for reuse during a checkpoint are put on + * + * a special checkpoint-available list and only moved to the real available + * list after the metadata has been updated with the new checkpoint's + * information. (Otherwise it is possible to allocate a rewritten block, crash + * before the metadata is updated, and see corruption.) For this reason, + * blocks allocated to write the checkpoint itself cannot be taken from the + * blocks made available by the checkpoint. + * + * To say it another way, the second checkpoint puts the blocks from the end of + * the file that were made available by compaction onto the + * checkpoint-available list, but then potentially writes the checkpoint itself + * at the end of the file, which would prevent any file truncation. When the + * metadata is updated for the second checkpoint, the blocks freed by + * compaction become available for the third checkpoint, so the third + * checkpoint's blocks are written towards the beginning of the file, and then + * the file can be truncated. + */ /* * __session_compact_worker -- diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index d95a17c66db..d776144d880 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -211,7 +211,8 @@ retry: WT_RET(__wt_meta_checkpoint_last_name( } /* - * Discard any session dhandles that are not open. + * __session_dhandle_sweep -- + * Discard any session dhandles that are not open. */ static int __session_dhandle_sweep(WT_SESSION_IMPL *session) @@ -235,9 +236,10 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) } /* - * Wrapper function to first sweep the session and then get the btree. - * Sweeping is only called when a session notices it has dead dhandles - * on its session dhandle list. Must be called with schema lock. + * __session_open_btree -- + * Wrapper function to first sweep the session and then get the btree. + * Sweeping is only called when a session notices it has dead dhandles on + * its session dhandle list. Must be called with schema lock. */ static int __session_open_btree(WT_SESSION_IMPL *session, diff --git a/src/session/session_salvage.c b/src/session/session_salvage.c index 3a00ec6562d..09b8a7c5bbe 100644 --- a/src/session/session_salvage.c +++ b/src/session/session_salvage.c @@ -50,8 +50,8 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) if (ckptbase[0].raw.data == NULL) WT_ERR(__wt_meta_checkpoint_clear(session, dhandle->name)); else - WT_ERR( - __wt_meta_ckptlist_set(session, dhandle->name, ckptbase)); + WT_ERR(__wt_meta_ckptlist_set( + session, dhandle->name, ckptbase, NULL)); err: __wt_meta_ckptlist_free(session, ckptbase); return (ret); diff --git a/src/support/global.c b/src/support/global.c index 807675bddf2..b365b9473eb 100644 --- a/src/support/global.c +++ b/src/support/global.c @@ -10,6 +10,10 @@ WT_PROCESS __wt_process; /* Per-process structure */ static int __wt_pthread_once_failed; /* If initialization failed */ +/* + * __system_is_little_endian -- + * Check if the system is little endian. + */ static int __system_is_little_endian(void) { @@ -28,6 +32,10 @@ __system_is_little_endian(void) return (EINVAL); } +/* + * __wt_pthread_once -- + * Global initialization, run once. + */ static void __wt_pthread_once(void) { diff --git a/src/support/hash_fnv.c b/src/support/hash_fnv.c index 111556b62a6..099dba1ec73 100644 --- a/src/support/hash_fnv.c +++ b/src/support/hash_fnv.c @@ -109,7 +109,8 @@ #define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL) /* - * fnv_64a_buf - perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer + * fnv_64a_buf -- + * Perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer * * input: * buf - start of buffer to hash diff --git a/src/support/hex.c b/src/support/hex.c index 48463e86c51..bb736932e9a 100644 --- a/src/support/hex.c +++ b/src/support/hex.c @@ -155,6 +155,10 @@ hex2byte(const u_char *from, u_char *to) return (0); } +/* + * __hex_fmterr -- + * Hex format error message. + */ static int __hex_fmterr(WT_SESSION_IMPL *session) { diff --git a/src/support/mutex.c b/src/support/mutex.c index b684fcf0299..8ed8f50322f 100644 --- a/src/support/mutex.c +++ b/src/support/mutex.c @@ -66,7 +66,7 @@ __wt_spin_lock_register(WT_SESSION_IMPL *session, } /* - * __wt_statlog_spinlock_dump -- + * __wt_statlog_dump_spinlock -- * Log the spin-lock statistics. */ int diff --git a/src/support/pow.c b/src/support/pow.c index 9ba22a53f8d..c01268245bb 100644 --- a/src/support/pow.c +++ b/src/support/pow.c @@ -28,6 +28,7 @@ #include "wt_internal.h" #ifdef __WIREDTIGER_UNUSED__ + /* * __wt_nlpo2_round -- * Round up to the next-largest power-of-two for a 32-bit unsigned value. @@ -63,6 +64,10 @@ __wt_nlpo2_round(uint32_t v) return (v + 1); } +/* + * __wt_nlpo2 -- + * Return the next largest power-of-two. + */ uint32_t __wt_nlpo2(uint32_t v) { diff --git a/src/support/scratch.c b/src/support/scratch.c index aa6a56e9d50..5f4cab931d6 100644 --- a/src/support/scratch.c +++ b/src/support/scratch.c @@ -76,6 +76,22 @@ __wt_buf_grow(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) } /* + * __wt_buf_extend -- + * Extend a buffer that's currently in-use. The difference from + * __wt_buf_grow is that extend is expected to be called repeatedly for + * the same buffer, and so grows the buffer exponentially to avoid + * repeated costly calls to realloc. + */ +int +__wt_buf_extend(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) +{ + if (size <= buf->memsize) + return (0); + + return (__wt_buf_grow(session, buf, WT_MAX(size, 2 * buf->memsize))); +} + +/* * __wt_buf_init -- * Initialize a buffer at a specific size. */ @@ -259,7 +275,7 @@ __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) } /* - * __wt_scr_alloc -- + * __wt_scr_alloc_func -- * Scratch buffer allocation function. */ int diff --git a/src/support/stat.c b/src/support/stat.c index a48597ae869..4d90409db08 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -114,8 +114,7 @@ __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats) "reconciliation failed because an update could not be included"; stats->rec_split_internal.desc = "reconciliation internal pages split"; stats->rec_split_leaf.desc = "reconciliation leaf pages split"; - stats->rec_split_max.desc = - "reconciliation maximum number of splits created for a page"; + stats->rec_split_max.desc = "reconciliation maximum splits for a page"; stats->session_compact.desc = "object compaction"; stats->session_cursor_open.desc = "open cursor count"; stats->txn_update_conflict.desc = "update conflicts"; @@ -364,8 +363,10 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) stats->dh_conn_handles.desc = "dhandle: connection dhandles swept"; stats->dh_session_handles.desc = "dhandle: session dhandles swept"; stats->dh_sweep_evict.desc = "dhandle: sweeps conflicting with evict"; - stats->dh_sweeps.desc = "dhandle: number of sweep attempts"; + stats->dh_sweeps.desc = "dhandle: sweep attempts"; stats->file_open.desc = "files currently open"; + stats->log_buffer_grow.desc = "log: log buffer size increases"; + stats->log_buffer_size.desc = "log: total log buffer size"; stats->log_bytes_user.desc = "log: user provided log bytes written"; stats->log_bytes_written.desc = "log: log bytes written"; stats->log_max_filesize.desc = "log: maximum log file size"; @@ -378,7 +379,15 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) stats->log_slot_consolidated.desc = "log: logging bytes consolidated"; stats->log_slot_joins.desc = "log: consolidated slot joins"; stats->log_slot_races.desc = "log: consolidated slot join races"; + stats->log_slot_ready_wait_timeout.desc = + "log: log slot ready wait timeouts"; + stats->log_slot_release_wait_timeout.desc = + "log: log slot release wait timeouts"; + stats->log_slot_switch_fails.desc = + "log: slots selected for switching that were unavailable"; stats->log_slot_toobig.desc = "log: record size exceeded maximum"; + stats->log_slot_toosmall.desc = + "log: failed to find a slot large enough for record"; stats->log_slot_transitions.desc = "log: consolidated slot join transitions"; stats->log_sync.desc = "log: log sync operations"; @@ -453,6 +462,7 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->dh_session_handles.v = 0; stats->dh_sweep_evict.v = 0; stats->dh_sweeps.v = 0; + stats->log_buffer_grow.v = 0; stats->log_bytes_user.v = 0; stats->log_bytes_written.v = 0; stats->log_reads.v = 0; @@ -463,7 +473,11 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->log_slot_consolidated.v = 0; stats->log_slot_joins.v = 0; stats->log_slot_races.v = 0; + stats->log_slot_ready_wait_timeout.v = 0; + stats->log_slot_release_wait_timeout.v = 0; + stats->log_slot_switch_fails.v = 0; stats->log_slot_toobig.v = 0; + stats->log_slot_toosmall.v = 0; stats->log_slot_transitions.v = 0; stats->log_sync.v = 0; stats->log_writes.v = 0; diff --git a/src/txn/txn.c b/src/txn/txn.c index 0faab9fff82..f7a3f2004c4 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -296,7 +296,7 @@ __wt_txn_release(WT_SESSION_IMPL *session) WT_TXN_STATE *txn_state; txn = &session->txn; - txn->mod_count = txn->modref_count = 0; + txn->mod_count = 0; txn->notify = NULL; txn_global = &S2C(session)->txn_global; @@ -327,7 +327,10 @@ __wt_txn_release(WT_SESSION_IMPL *session) int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) { + WT_DECL_RET; WT_TXN *txn; + WT_TXN_OP *op; + u_int i; WT_UNUSED(cfg); @@ -339,8 +342,29 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) /* Commit notification. */ if (txn->notify != NULL) - WT_RET(txn->notify->notify(txn->notify, (WT_SESSION *)session, - txn->id, 1)); + WT_TRET(txn->notify->notify(txn->notify, + (WT_SESSION *)session, txn->id, 1)); + + /* If we are logging, write a commit log record. */ + if (ret == 0 && + txn->mod_count > 0 && S2C(session)->logging && + !F_ISSET(session, WT_SESSION_LOGGING_DISABLED)) + ret = __wt_txn_log_commit(session, cfg); + + /* + * If anything went wrong, roll back. + * + * !!! + * Nothing can fail after this point. + */ + if (ret != 0) { + WT_TRET(__wt_txn_rollback(session, cfg)); + return (ret); + } + + /* Free memory associated with updates. */ + for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) + __wt_txn_op_free(session, op); /* * Auto-commit transactions need a new transaction snapshot so that the @@ -364,9 +388,8 @@ int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) { WT_DECL_RET; - WT_REF **rp; WT_TXN *txn; - uint64_t **m; + WT_TXN_OP *op; u_int i; WT_UNUSED(cfg); @@ -381,12 +404,29 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) txn->id, 0)); /* Rollback updates. */ - for (i = 0, m = txn->mod; i < txn->mod_count; i++, m++) - **m = WT_TXN_ABORTED; + for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) { + switch (op->type) { + case TXN_OP_BASIC: + case TXN_OP_INMEM: + op->u.op.upd->txnid = WT_TXN_ABORTED; + break; + case TXN_OP_REF: + __wt_tree_walk_delete_rollback(op->u.ref); + break; + case TXN_OP_TRUNCATE_COL: + case TXN_OP_TRUNCATE_ROW: + /* + * Nothing to do: these operations are only logged for + * recovery. The in-memory changes will be rolled back + * with a combination of TXN_OP_REF and TXN_OP_INMEM + * operations. + */ + break; + } - /* Rollback fast deletes. */ - for (i = 0, rp = txn->modref; i < txn->modref_count; i++, rp++) - __wt_tree_walk_delete_rollback(*rp); + /* Free any memory allocated for the operation. */ + __wt_txn_op_free(session, op); + } __wt_txn_release(session); return (ret); @@ -412,7 +452,6 @@ __wt_txn_init(WT_SESSION_IMPL *session) * for eviction. */ txn->mod = NULL; - txn->modref = NULL; txn->isolation = session->isolation; return (0); @@ -429,7 +468,6 @@ __wt_txn_destroy(WT_SESSION_IMPL *session) txn = &session->txn; __wt_free(session, txn->mod); - __wt_free(session, txn->modref); __wt_free(session, txn->snapshot); } diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 368caa978f1..6ad8a6aeb2b 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -65,7 +65,7 @@ err: if (cursor != NULL) */ static int __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], - int (*op)(WT_SESSION_IMPL *, const char *[])) + int (*op)(WT_SESSION_IMPL *, const char *[]), int *fullp) { WT_CONFIG targetconf; WT_CONFIG_ITEM cval, k, v; @@ -132,6 +132,8 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], } err: __wt_scr_free(&tmp); + if (ret == 0 && fullp != NULL) + *fullp = !target_list; return (ret); } @@ -186,7 +188,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN *txn; WT_TXN_ISOLATION saved_isolation; void *saved_meta_next; - int tracking; + int full, tracking; conn = S2C(session); saved_isolation = session->isolation; @@ -214,16 +216,27 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) /* Flush dirty leaf pages before we start the checkpoint. */ session->isolation = txn->isolation = TXN_ISO_READ_COMMITTED; - WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_write_leaves)); + WT_ERR(__checkpoint_apply( + session, cfg, __wt_checkpoint_write_leaves, &full)); WT_ERR(__wt_meta_track_on(session)); tracking = 1; + /* Tell logging that we are about to start a database checkpoint. */ + if (S2C(session)->logging && full) + WT_ERR(__wt_txn_checkpoint_log( + session, full, WT_TXN_LOG_CKPT_PREPARE, NULL)); + /* Start a snapshot transaction for the checkpoint. */ wt_session = &session->iface; WT_ERR(wt_session->begin_transaction(wt_session, "isolation=snapshot")); - WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint)); + /* Tell logging that we have started a database checkpoint. */ + if (S2C(session)->logging && full) + WT_ERR(__wt_txn_checkpoint_log( + session, full, WT_TXN_LOG_CKPT_START, NULL)); + + WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint, NULL)); /* Release the snapshot transaction, before syncing the file(s). */ __wt_txn_release(session); @@ -233,7 +246,8 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * lazy checkpoints, but we don't support them yet). */ if (F_ISSET(conn, WT_CONN_CKPT_SYNC)) - WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); + WT_ERR(__checkpoint_apply( + session, cfg, __wt_checkpoint_sync, NULL)); /* Checkpoint the metadata file. */ SLIST_FOREACH(dhandle, &conn->dhlh, l) { @@ -280,6 +294,13 @@ err: /* __wt_txn_release(session); else __wt_txn_release_snapshot(session); + + /* Tell logging that we have finished a database checkpoint. */ + if (S2C(session)->logging && full) + WT_TRET(__wt_txn_checkpoint_log(session, full, + (ret == 0) ? WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_FAIL, + NULL)); + __wt_spin_unlock(session, &conn->checkpoint_lock); __wt_scr_free(&tmp); @@ -415,6 +436,7 @@ __checkpoint_worker( WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; + WT_LSN ckptlsn; const char *name; int deleted, force, hot_backup_locked, track_ckpt; char *name_alloc; @@ -425,6 +447,9 @@ __checkpoint_worker( bm = btree->bm; ckpt = ckptbase = NULL; + INIT_LSN(&ckptlsn); + dhandle = session->dhandle; + name_alloc = NULL; hot_backup_locked = 0; name_alloc = NULL; track_ckpt = 1; @@ -695,6 +720,11 @@ __checkpoint_worker( */ WT_ERR(__wt_bt_cache_force_write(session)); + /* Tell logging that a file checkpoint is starting. */ + if (S2C(session)->logging) + WT_ERR(__wt_txn_checkpoint_log( + session, 0, WT_TXN_LOG_CKPT_START, &ckptlsn)); + /* * Clear the tree's modified flag; any changes before we clear the flag * are guaranteed to be part of this checkpoint (unless reconciliation @@ -723,8 +753,8 @@ __checkpoint_worker( ckpt->write_gen = btree->write_gen; fake: /* Update the object's metadata. */ - ret = __wt_meta_ckptlist_set(session, dhandle->name, ckptbase); - WT_ERR(ret); + WT_ERR(__wt_meta_ckptlist_set( + session, dhandle->name, ckptbase, &ckptlsn)); /* * If we wrote a checkpoint (rather than faking one), pages may be @@ -740,6 +770,11 @@ fake: /* Update the object's metadata. */ WT_ERR(bm->checkpoint_resolve(bm, session)); } + /* Tell logging that the checkpoint is complete. */ + if (S2C(session)->logging) + WT_ERR(__wt_txn_checkpoint_log( + session, 0, WT_TXN_LOG_CKPT_STOP, NULL)); + err: if (hot_backup_locked) __wt_spin_unlock(session, &conn->hot_backup_lock); skip: __wt_meta_ckptlist_free(session, ckptbase); diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c new file mode 100644 index 00000000000..ca8812aae09 --- /dev/null +++ b/src/txn/txn_log.c @@ -0,0 +1,440 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __txn_op_log -- + * Log an operation for the current transaction. + */ +static int +__txn_op_log(WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_TXN_OP *op) +{ + WT_ITEM value; + uint64_t recno; + + value.data = WT_UPDATE_DATA(op->u.op.upd); + value.size = op->u.op.upd->size; + + /* + * Log the operation. It must be one of the following: + * 1) column store remove; + * 2) column store insert/update; + * 3) row store remove; or + * 4) row store insert/update. + */ + if (op->u.op.key.data == NULL) { + WT_ASSERT(session, op->u.op.ins != NULL); + recno = op->u.op.ins->u.recno; + + if (WT_UPDATE_DELETED_ISSET(op->u.op.upd)) + WT_RET(__wt_logop_col_remove_pack(session, logrec, + op->fileid, recno)); + else + WT_RET(__wt_logop_col_put_pack(session, logrec, + op->fileid, recno, &value)); + } else { + if (WT_UPDATE_DELETED_ISSET(op->u.op.upd)) + WT_RET(__wt_logop_row_remove_pack(session, logrec, + op->fileid, &op->u.op.key)); + else + WT_RET(__wt_logop_row_put_pack(session, logrec, + op->fileid, &op->u.op.key, &value)); + } + + return (0); +} + +/* + * __txn_commit_printlog -- + * Print a commit log record. + */ +static int +__txn_commit_printlog( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) +{ + /* The logging subsystem zero-pads records. */ + while (*pp < end && **pp) + WT_RET(__wt_txn_op_printlog(session, pp, end, out)); + return (0); +} + +/* + * __wt_txn_op_free -- + * Free memory associated with a transactional operation. + */ +void +__wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op) +{ + if (op->type == TXN_OP_TRUNCATE_ROW) { + __wt_buf_free(session, &op->u.truncate_row.start); + __wt_buf_free(session, &op->u.truncate_row.stop); + } +} + +/* + * __wt_txn_log_commit -- + * Write the operations of a transaction to the log at commit time. + */ +int +__wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_DECL_RET; + WT_DECL_ITEM(logrec); + WT_TXN *txn; + WT_TXN_OP *op; + const char *fmt = WT_UNCHECKED_STRING(Iq); + size_t header_size; + uint32_t rectype = WT_LOGREC_COMMIT; + u_int i; + + WT_UNUSED(cfg); + txn = &session->txn; + + WT_RET(__wt_struct_size(session, &header_size, fmt, rectype, txn->id)); + WT_RET(__wt_logrec_alloc(session, header_size, &logrec)); + + WT_ERR(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, header_size, + fmt, rectype, txn->id)); + logrec->size += (uint32_t)header_size; + + /* Write updates to the log. */ + for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) + switch (op->type) { + case TXN_OP_BASIC: + WT_ERR(__txn_op_log(session, logrec, op)); + break; + case TXN_OP_INMEM: + case TXN_OP_REF: + /* Nothing to log, we're done. */ + break; + case TXN_OP_TRUNCATE_COL: + WT_ERR(__wt_logop_col_truncate_pack(session, logrec, + op->fileid, + op->u.truncate_col.start, op->u.truncate_col.stop)); + break; + case TXN_OP_TRUNCATE_ROW: + WT_ERR(__wt_logop_row_truncate_pack(session, logrec, + op->fileid, + &op->u.truncate_row.start, &op->u.truncate_row.stop, + (uint32_t)op->u.truncate_row.mode)); + break; + } + + WT_ERR(__wt_log_write(session, + logrec, NULL, S2C(session)->txn_logsync)); + +err: __wt_logrec_free(session, &logrec); + return (ret); +} + +/* + * __txn_log_file_sync -- + * Write a log record for a file sync. + */ +static int +__txn_log_file_sync(WT_SESSION_IMPL *session, uint32_t flags, WT_LSN *lsnp) +{ + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; + WT_DECL_ITEM(logrec); + const char *fmt = WT_UNCHECKED_STRING(ISI); + size_t header_size; + uint32_t rectype = WT_LOGREC_FILE_SYNC; + int start; + + dhandle = session->dhandle; + start = LF_ISSET(WT_TXN_LOG_CKPT_START); + + WT_RET(__wt_struct_size( + session, &header_size, fmt, rectype, dhandle->name, start)); + WT_RET(__wt_logrec_alloc(session, header_size, &logrec)); + + WT_ERR(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, header_size, + fmt, rectype, dhandle->name, start)); + logrec->size += (uint32_t)header_size; + + WT_ERR(__wt_log_write(session, logrec, lsnp, 0)); +err: __wt_logrec_free(session, &logrec); + return (ret); +} + +/* + * __wt_txn_checkpoint_logread -- + * Read a log record for a checkpoint operation. + */ +int +__wt_txn_checkpoint_logread( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + WT_LSN *ckpt_lsn) +{ + const char *fmt = WT_UNCHECKED_STRING(IQIU); + WT_ITEM ckpt_snapshot; + u_int ckpt_nsnapshot; + + WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &ckpt_lsn->file, &ckpt_lsn->offset, + &ckpt_nsnapshot, &ckpt_snapshot)); + WT_UNUSED(ckpt_nsnapshot); + WT_UNUSED(ckpt_snapshot); + *pp = end; + return (0); +} + +/* + * __wt_txn_checkpoint_log -- + * Write a log record for a checkpoint operation. + */ +int +__wt_txn_checkpoint_log( + WT_SESSION_IMPL *session, int full, uint32_t flags, WT_LSN *lsnp) +{ + WT_DECL_RET; + WT_DECL_ITEM(logrec); + WT_LSN *ckpt_lsn; + WT_TXN *txn; + const char *fmt = WT_UNCHECKED_STRING(IIQIU); + uint8_t *end, *p; + size_t recsize; + uint32_t i, rectype = WT_LOGREC_CHECKPOINT; + + txn = &session->txn; + ckpt_lsn = &txn->ckpt_lsn; + + /* + * If this is a file sync, log it unless there is a full checkpoint in + * progress. + */ + if (!full) { + if (txn->full_ckpt) { + if (lsnp != NULL) + *lsnp = *ckpt_lsn; + return (0); + } else + return (__txn_log_file_sync(session, flags, lsnp)); + } + + switch (flags) { + case WT_TXN_LOG_CKPT_PREPARE: + txn->full_ckpt = 1; + *ckpt_lsn = S2C(session)->log->alloc_lsn; + break; + + case WT_TXN_LOG_CKPT_START: + /* Take a copy of the transaction snapshot. */ + txn->ckpt_nsnapshot = txn->snapshot_count; + recsize = txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE; + WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot)); + p = txn->ckpt_snapshot->mem; + end = p + recsize; + for (i = 0; i < txn->snapshot_count; i++) + WT_ERR(__wt_vpack_uint( + &p, WT_PTRDIFF(end, p), txn->snapshot[i])); + break; + + case WT_TXN_LOG_CKPT_STOP: + /* + * During a clean connection close, we get here without the + * prepare or start steps. In that case, log the current LSN + * as the checkpoint LSN. + */ + if (!txn->full_ckpt) { + txn->ckpt_nsnapshot = 0; + *ckpt_lsn = S2C(session)->log->alloc_lsn; + } + + /* Write the checkpoint log record. */ + WT_ERR(__wt_struct_size(session, &recsize, fmt, + rectype, ckpt_lsn->file, ckpt_lsn->offset, + txn->ckpt_nsnapshot, &txn->ckpt_snapshot)); + WT_ERR(__wt_logrec_alloc(session, recsize, &logrec)); + + WT_ERR(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, recsize, fmt, + rectype, ckpt_lsn->file, ckpt_lsn->offset, + txn->ckpt_nsnapshot, &txn->ckpt_snapshot)); + logrec->size += (uint32_t)recsize; + WT_ERR(__wt_log_write(session, logrec, lsnp, 0)); + + /* + * If this full checkpoint completed successfully and there is + * no hot backup in progress, tell the logging subsystem the + * checkpoint LSN so that it can archive. + */ + if (!S2C(session)->hot_backup) + WT_ERR(__wt_log_ckpt(session, ckpt_lsn)); + + /* FALLTHROUGH */ + case WT_TXN_LOG_CKPT_FAIL: + /* Cleanup any allocated resources */ + INIT_LSN(ckpt_lsn); + txn->ckpt_nsnapshot = 0; + __wt_scr_free(&txn->ckpt_snapshot); + txn->full_ckpt = 0; + break; + } + +err: __wt_logrec_free(session, &logrec); + return (ret); +} + +/* + * __wt_txn_truncate_log -- + * Begin truncating a range of a file. + */ +int +__wt_txn_truncate_log( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) +{ + WT_BTREE *btree; + WT_ITEM *item; + WT_TXN_OP *op; + + btree = S2BT(session); + + WT_RET(__txn_next_op(session, &op)); + op->fileid = btree->id; + + if (btree->type == BTREE_ROW) { + op->type = TXN_OP_TRUNCATE_ROW; + op->u.truncate_row.mode = TXN_TRUNC_ALL; + WT_CLEAR(op->u.truncate_row.start); + WT_CLEAR(op->u.truncate_row.stop); + if (start != NULL) { + op->u.truncate_row.mode = TXN_TRUNC_START; + item = &op->u.truncate_row.start; + WT_RET(__wt_cursor_get_raw_key(&start->iface, item)); + WT_RET(__wt_buf_set( + session, item, item->data, item->size)); + } + if (stop != NULL) { + op->u.truncate_row.mode = + (op->u.truncate_row.mode == TXN_TRUNC_ALL) ? + TXN_TRUNC_STOP : TXN_TRUNC_BOTH; + item = &op->u.truncate_row.stop; + WT_RET(__wt_cursor_get_raw_key(&stop->iface, item)); + WT_RET(__wt_buf_set( + session, item, item->data, item->size)); + } + } else { + op->type = TXN_OP_TRUNCATE_COL; + op->u.truncate_col.start = + (start == NULL) ? 0 : start->recno; + op->u.truncate_col.stop = + (stop == NULL) ? 0 : stop->recno; + } + + WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOGGING_INMEM)); + F_SET(session, WT_SESSION_LOGGING_INMEM); + return (0); +} + +/* + * __wt_txn_truncate_end -- + * Finish truncating a range of a file. + */ +int +__wt_txn_truncate_end(WT_SESSION_IMPL *session) +{ + F_CLR(session, WT_SESSION_LOGGING_INMEM); + return (0); +} + +/* + * __txn_printlog -- + * Print a log record in a human-readable format. + */ +static int +__txn_printlog( + WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie) +{ + FILE *out; + WT_LSN ckpt_lsn; + const uint8_t *end, *p; + const char *msg; + uint64_t txnid; + uint32_t fileid, rectype; + int32_t start; + + out = cookie; + + p = (const uint8_t *)logrec->data + offsetof(WT_LOG_RECORD, record); + end = (const uint8_t *)logrec->data + logrec->size; + + /* First, peek at the log record type. */ + WT_RET(__wt_logrec_read(session, &p, end, &rectype)); + + if (fprintf(out, " { \"lsn\" : [%" PRIu32 ",%" PRId64 "],\n", + lsnp->file, lsnp->offset) < 0) + return (errno); + + switch (rectype) { + case WT_LOGREC_CHECKPOINT: + WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), + WT_UNCHECKED_STRING(IQ), &ckpt_lsn.file, &ckpt_lsn.offset)); + if (fprintf(out, " \"type\" : \"checkpoint\"\n") < 0 || + fprintf( + out, " \"ckpt_lsn\" : [%" PRIu32 ",%" PRId64 "],\n", + ckpt_lsn.file, ckpt_lsn.offset) < 0) + return (errno); + break; + + case WT_LOGREC_COMMIT: + WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid)); + if (fprintf(out, " \"type\" : \"commit\"\n") < 0 || + fprintf(out, " \"txnid\" : %" PRIu64 ",\n", txnid) < 0) + return (errno); + WT_RET(__txn_commit_printlog(session, &p, end, out)); + break; + + case WT_LOGREC_FILE_SYNC: + WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), + WT_UNCHECKED_STRING(Ii), &fileid, &start)); + if (fprintf(out, " \"type\" : \"file_sync\"\n") < 0 || + fprintf(out, " \"fileid\" : %" PRIu32 "\n", + fileid) < 0 || + fprintf(out, " \"start\" : %" PRId32 "\n", start) < 0) + return (errno); + break; + + case WT_LOGREC_MESSAGE: + WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), + WT_UNCHECKED_STRING(S), &msg)); + if (fprintf(out, " \"type\" : \"message\"\n") < 0 || + fprintf(out, " \"message\" : \"%s\"\n", msg) < 0) + return (errno); + break; + } + + if (fprintf(out, " },\n") < 0) + return (errno); + + return (0); +} + +/* + * __wt_txn_printlog -- + * Print the log in a human-readable format. + */ +int +__wt_txn_printlog(WT_SESSION *wt_session, FILE *out) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + + if (fprintf(out, "[\n") < 0) + return (errno); + WT_RET(__wt_log_scan( + session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, out)); + if (fprintf(out, "]\n") < 0) + return (errno); + + return (0); +} diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c new file mode 100644 index 00000000000..fb1d05c2565 --- /dev/null +++ b/src/txn/txn_recover.c @@ -0,0 +1,451 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* State maintained during recovery. */ +typedef struct { + WT_SESSION_IMPL *session; + + /* Files from the metadata, indexed by file ID. */ + struct { + const char *uri; /* File URI. */ + WT_CURSOR *c; /* Cursor used for recovery. */ + WT_LSN ckpt_lsn; /* File's checkpoint LSN. */ + } *files; + size_t file_alloc; /* Allocated size of files array. */ + u_int max_fileid; /* Maximum file ID seen. */ + u_int nfiles; /* Number of files in the metadata. */ + + WT_LSN ckpt_lsn; /* Start LSN for main recovery loop. */ + + int modified; /* Did recovery make any changes? */ + int metadata_only; /* + * Set during the first recovery pass, + * when only the metadata is recovered. + */ +} WT_RECOVERY; + +/* + * __recovery_cursor -- + * Get a cursor for a recovery operation. + */ +static int +__recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r, + WT_LSN *lsnp, u_int id, int duplicate, WT_CURSOR **cp) +{ + WT_CURSOR *c; + const char *cfg[] = { WT_CONFIG_BASE(session, session_open_cursor), + "overwrite", NULL }; + + c = NULL; + + /* + * Only apply operations in the correct metadata phase, and if the LSN + * is more recent than the last checkpoint. If there is no entry for a + * file, assume it was dropped. + */ + if (r->metadata_only != (id == 0) || + LOG_CMP(lsnp, &r->files[id].ckpt_lsn) < 0) + ; + else if (id > r->max_fileid) + r->max_fileid = id; + else if (id >= r->nfiles || r->files[id].uri == NULL) + WT_VERBOSE_RET(session, recovery, + "No file found with ID %u (max %u)", id, r->nfiles); + else if ((c = r->files[id].c) == NULL) { + WT_RET(__wt_open_cursor( + session, r->files[id].uri, NULL, cfg, &c)); + r->files[id].c = c; + } + + if (duplicate && c != NULL) + WT_RET(__wt_open_cursor( + session, r->files[id].uri, NULL, cfg, &c)); + + *cp = c; + return (0); +} + +/* + * Helper to a cursor if this operation is to be applied during recovery. + */ +#define GET_RECOVERY_CURSOR(s, r, lsnp, fileid, cp) \ + WT_ERR(__recovery_cursor((s), (r), (lsnp), (fileid), 0, (cp))); \ + WT_VERBOSE_ERR(session, recovery, \ + "%s op %d to file %d at LSN %u/%" PRIuMAX, \ + (cursor == NULL) ? "Skipping" : "Applying", \ + optype, fileid, lsnp->file, (uintmax_t)lsnp->offset); \ + if (cursor == NULL) \ + break + +/* + * __txn_op_apply -- + * Apply a transactional operation during recovery. + */ +static int +__txn_op_apply( + WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end) +{ + WT_CURSOR *cursor, *start, *stop; + WT_DECL_RET; + WT_ITEM key, start_key, stop_key, value; + WT_SESSION_IMPL *session; + uint64_t recno, start_recno, stop_recno; + uint32_t fileid, mode, optype, opsize; + + session = r->session; + + /* Peek at the size and the type. */ + WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize)); + end = *pp + opsize; + + switch (optype) { + case WT_LOGOP_COL_PUT: + WT_ERR(__wt_logop_col_put_unpack(session, pp, end, + &fileid, &recno, &value)); + GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); + cursor->set_key(cursor, recno); + __wt_cursor_set_raw_value(cursor, &value); + WT_ERR(cursor->insert(cursor)); + break; + + case WT_LOGOP_COL_REMOVE: + WT_ERR(__wt_logop_col_remove_unpack(session, pp, end, + &fileid, &recno)); + GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); + cursor->set_key(cursor, recno); + WT_ERR(cursor->remove(cursor)); + break; + + case WT_LOGOP_COL_TRUNCATE: + WT_ERR(__wt_logop_col_truncate_unpack(session, pp, end, + &fileid, &start_recno, &stop_recno)); + GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); + + /* Set up the cursors. */ + if (start_recno == 0) { + start = NULL; + stop = cursor; + } else if (stop_recno == 0) { + start = cursor; + stop = NULL; + } else { + start = cursor; + WT_ERR(__recovery_cursor( + session, r, lsnp, fileid, 1, &stop)); + } + + /* Set the keys. */ + if (start != NULL) + start->set_key(stop, start_recno); + if (stop != NULL) + stop->set_key(stop, stop_recno); + + WT_TRET(session->iface.truncate(&session->iface, NULL, + start, stop, NULL)); + /* If we opened a duplicate cursor, close it now. */ + if (stop != NULL && stop != cursor) + WT_TRET(stop->close(stop)); + WT_ERR(ret); + break; + + case WT_LOGOP_ROW_PUT: + WT_ERR(__wt_logop_row_put_unpack(session, pp, end, + &fileid, &key, &value)); + GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); + __wt_cursor_set_raw_key(cursor, &key); + __wt_cursor_set_raw_value(cursor, &value); + WT_ERR(cursor->insert(cursor)); + break; + + case WT_LOGOP_ROW_REMOVE: + WT_ERR(__wt_logop_row_remove_unpack(session, pp, end, + &fileid, &key)); + GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); + __wt_cursor_set_raw_key(cursor, &key); + WT_ERR(cursor->remove(cursor)); + break; + + case WT_LOGOP_ROW_TRUNCATE: + WT_ERR(__wt_logop_row_truncate_unpack(session, pp, end, + &fileid, &start_key, &stop_key, &mode)); + GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); + /* Set up the cursors. */ + start = stop = NULL; + switch (mode) { + case TXN_TRUNC_ALL: + /* Both cursors stay NULL. */ + break; + case TXN_TRUNC_BOTH: + start = cursor; + WT_ERR(__recovery_cursor( + session, r, lsnp, fileid, 1, &stop)); + break; + case TXN_TRUNC_START: + start = cursor; + break; + case TXN_TRUNC_STOP: + stop = cursor; + break; + + WT_ILLEGAL_VALUE_ERR(session); + } + + /* Set the keys. */ + if (start != NULL) + __wt_cursor_set_raw_key(start, &start_key); + if (stop != NULL) + __wt_cursor_set_raw_key(stop, &stop_key); + + WT_TRET(session->iface.truncate(&session->iface, NULL, + start, stop, NULL)); + /* If we opened a duplicate cursor, close it now. */ + if (stop != NULL && stop != cursor) + WT_TRET(stop->close(stop)); + WT_ERR(ret); + break; + + WT_ILLEGAL_VALUE_ERR(session); + } + + r->modified = 1; + +err: if (ret != 0) + __wt_err(session, ret, + "Operation failed during recovery"); + return (ret); +} + +/* + * __txn_commit_apply -- + * Apply a commit record during recovery. + */ +static int +__txn_commit_apply( + WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end) +{ + WT_UNUSED(lsnp); + + /* The logging subsystem zero-pads records. */ + while (*pp < end && **pp) + WT_RET(__txn_op_apply(r, lsnp, pp, end)); + + return (0); +} + +/* + * __txn_log_recover -- + * Roll the log forward to recover committed changes. + */ +static int +__txn_log_recover( + WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie) +{ + WT_RECOVERY *r; + const uint8_t *end, *p; + uint64_t txnid; + uint32_t rectype; + + r = cookie; + p = (const uint8_t *)logrec->data + offsetof(WT_LOG_RECORD, record); + end = (const uint8_t *)logrec->data + logrec->size; + + /* First, peek at the log record type. */ + WT_RET(__wt_logrec_read(session, &p, end, &rectype)); + + switch (rectype) { + case WT_LOGREC_CHECKPOINT: + if (r->metadata_only) + WT_RET(__wt_txn_checkpoint_logread( + session, &p, end, &r->ckpt_lsn)); + break; + + case WT_LOGREC_COMMIT: + WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid)); + WT_UNUSED(txnid); + WT_RET(__txn_commit_apply(r, lsnp, &p, end)); + break; + } + + return (0); +} + +/* + * __recovery_setup_file -- + * Set up the recovery slot for a file. + */ +static int +__recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config) +{ + WT_CONFIG_ITEM cval; + WT_LSN lsn; + uint32_t fileid; + + WT_RET(__wt_config_getones(r->session, config, "id", &cval)); + fileid = (uint32_t)cval.val; + + if (r->nfiles <= fileid) { + WT_RET(__wt_realloc_def( + r->session, &r->file_alloc, fileid + 1, &r->files)); + r->nfiles = fileid + 1; + } + + WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri)); + WT_RET( + __wt_config_getones(r->session, config, "checkpoint_lsn", &cval)); + /* If there is checkpoint logged for the file, apply everything. */ + if (cval.type != WT_CONFIG_ITEM_STRUCT) + INIT_LSN(&lsn); + else if (sscanf(cval.str, "(%" PRIu32 ",%" PRIdMAX ")", + &lsn.file, (intmax_t*)&lsn.offset) != 2) + WT_RET_MSG(r->session, EINVAL, + "Failed to parse checkpoint LSN '%.*s'", + (int)cval.len, cval.str); + r->files[fileid].ckpt_lsn = lsn; + + WT_VERBOSE_RET(r->session, recovery, + "Recovering %s with id %u @ (%" PRIu32 ", %" PRIu64 ")", + uri, fileid, lsn.file, lsn.offset); + + return (0); + +} + +/* + * __recovery_free -- + * Free the recovery state. + */ +static int +__recovery_free(WT_RECOVERY *r) +{ + WT_CURSOR *c; + WT_DECL_RET; + WT_SESSION_IMPL *session; + u_int i; + + session = r->session; + for (i = 0; i < r->nfiles; i++) { + __wt_free(session, r->files[i].uri); + if ((c = r->files[i].c) != NULL) + WT_TRET(c->close(c)); + } + + __wt_free(session, r->files); + return (ret); +} + +/* + * __recovery_file_scan -- + * Scan the files referenced from the metadata and gather information + * about them for recovery. + */ +static int +__recovery_file_scan(WT_RECOVERY *r) +{ + WT_DECL_RET; + WT_CURSOR *c; + const char *uri, *config; + int cmp; + + /* Scan through all files in the metadata. */ + c = r->files[0].c; + c->set_key(c, "file:"); + if ((ret = c->search_near(c, &cmp)) != 0) { + /* Is the metadata empty? */ + if (ret == WT_NOTFOUND) + ret = 0; + goto err; + } + if (cmp < 0) + WT_ERR_NOTFOUND_OK(c->next(c)); + for (; ret == 0; ret = c->next(c)) { + WT_ERR(c->get_key(c, &uri)); + if (!WT_PREFIX_MATCH(uri, "file:")) + break; + WT_ERR(c->get_value(c, &config)); + WT_ERR(__recovery_setup_file(r, uri, config)); + } + WT_ERR_NOTFOUND_OK(ret); + +err: r->max_fileid = r->nfiles; + return (ret); +} + +/* + * __wt_txn_recover -- + * Run recovery. + */ +int +__wt_txn_recover(WT_SESSION_IMPL *default_session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_RECOVERY r; + WT_SESSION_IMPL *session; + const char *config; + int modified; + + conn = S2C(default_session); + WT_CLEAR(r); + INIT_LSN(&r.ckpt_lsn); + + /* We need a real session for recovery. */ + WT_RET(__wt_open_session(conn, 0, NULL, NULL, &session)); + F_SET(session, WT_SESSION_LOGGING_DISABLED); + r.session = session; + + WT_ERR(__wt_metadata_search(session, WT_METADATA_URI, &config)); + WT_ERR(__recovery_setup_file(&r, WT_METADATA_URI, config)); + WT_ERR(__wt_metadata_cursor(session, NULL, &r.files[0].c)); + + /* + * First, do a full pass through the log to recover the metadata, + * and establish the last checkpoint LSN. + */ + r.metadata_only = 1; + WT_ERR(__wt_log_scan( + session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r)); + + WT_ASSERT(session, LOG_CMP(&r.ckpt_lsn, &conn->log->first_lsn) >= 0); + + WT_ERR(__recovery_file_scan(&r)); + + /* + * Now, recover all the files apart from the metadata. + * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. + */ + r.metadata_only = 0; + WT_VERBOSE_ERR(session, recovery, + "Main recovery loop: starting at %u/%" PRIuMAX, + r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset); + if (IS_INIT_LSN(&r.ckpt_lsn)) + WT_ERR(__wt_log_scan(session, NULL, + WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, + __txn_log_recover, &r)); + else + WT_ERR(__wt_log_scan(session, &r.ckpt_lsn, + WT_LOGSCAN_RECOVER, + __txn_log_recover, &r)); + + conn->next_file_id = r.max_fileid; + +err: modified = r.modified; + WT_TRET(__recovery_free(&r)); + __wt_free(session, config); + WT_TRET(session->iface.close(&session->iface, NULL)); + + /* + * If recovery ran successfully and modified something, log a + * checkpoint. + */ + if (ret == 0 && modified) + ret = __wt_txn_checkpoint_log( + default_session, 1, WT_TXN_LOG_CKPT_STOP, NULL); + + return (ret); +} diff --git a/src/utilities/util_printlog.c b/src/utilities/util_printlog.c index b62954a6b51..810dfac5fe2 100644 --- a/src/utilities/util_printlog.c +++ b/src/utilities/util_printlog.c @@ -12,9 +12,7 @@ static int usage(void); int util_printlog(WT_SESSION *session, int argc, char *argv[]) { - WT_CURSOR *cursor; WT_DECL_RET; - WT_ITEM key, value; int ch, printable; printable = 0; @@ -41,30 +39,11 @@ util_printlog(WT_SESSION *session, int argc, char *argv[]) if (argc != 0) return (usage()); - if ((ret = session->open_cursor(session, "log", - NULL, printable ? "printable" : "raw", &cursor)) != 0) { - fprintf(stderr, "%s: cursor open(log) failed: %s\n", - progname, wiredtiger_strerror(ret)); - goto err; - } + WT_UNUSED(printable); + ret = __wt_txn_printlog(session, stdout); - while ((ret = cursor->next(cursor)) == 0) { - if ((ret = cursor->get_key(cursor, &key)) != 0) - break; - if ((ret = cursor->get_value(cursor, &value)) != 0) - break; - if (fwrite(key.data, 1, key.size, stdout) != key.size || - fwrite("\n", 1, 1, stdout) != 1 || - fwrite(value.data, 1, value.size, stdout) != value.size || - fwrite("\n", 1, 1, stdout) != 1) { - ret = errno; - break; - } - } - if (ret == WT_NOTFOUND) - ret = 0; - else { - fprintf(stderr, "%s: cursor get(log) failed: %s\n", + if (ret != 0) { + fprintf(stderr, "%s: printlog failed: %s\n", progname, wiredtiger_strerror(ret)); goto err; } diff --git a/test/java/com/wiredtiger/test/AutoCloseTest.java b/test/java/com/wiredtiger/test/AutoCloseTest.java new file mode 100644 index 00000000000..11c29dc23c6 --- /dev/null +++ b/test/java/com/wiredtiger/test/AutoCloseTest.java @@ -0,0 +1,284 @@ +/*- + * Public Domain 2008-2013 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package com.wiredtiger.test; + +import com.wiredtiger.db.Connection; +import com.wiredtiger.db.Cursor; +import com.wiredtiger.db.Session; +import com.wiredtiger.db.WiredTigerPackingException; +import com.wiredtiger.db.wiredtiger; + +import static org.junit.Assert.assertEquals; + +import org.junit.After; +import org.junit.Test; +import org.junit.Assert; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +public class AutoCloseTest { + + /* + * Connvalid tells us that we really closed the connection. + * That allows teardown to reliably clean up so that + * a single failure in one test does not cascade. + */ + Connection conn; + boolean connvalid = false; + + private Session sessionSetup() { + String keyFormat = "S"; + String valueFormat = "u"; + + conn = wiredtiger.open("WT_HOME", "create"); + connvalid = true; + Session s = conn.open_session(null); + s.create("table:t", + "key_format=" + keyFormat + ",value_format=" + valueFormat); + return s; + } + + private Cursor populate(Session s) + throws WiredTigerPackingException { + Cursor c = s.open_cursor("table:t", null, null); + c.putKeyString("bar"); + c.putValueByteArray("foo".getBytes()); + c.insert(); + return c; + } + + @Test + public void autoCloseCursor01() + throws WiredTigerPackingException { + Session s = sessionSetup(); + Cursor c = populate(s); + + c.close(); + + boolean caught = false; + try { + // Error: cursor used after close + c.next(); + } + catch (NullPointerException iae) { + assertEquals(iae.toString().contains("cursor is null"), true); + caught = true; + } + assertEquals(caught, true); + + s.close(""); + conn.close(""); + connvalid = false; + } + + @Test + public void autoCloseCursor02() + throws WiredTigerPackingException { + Session s = sessionSetup(); + Cursor c = populate(s); + + s.close(""); + + boolean caught = false; + try { + // Error: cursor used after session closed + c.next(); + } + catch (NullPointerException iae) { + assertEquals(iae.toString().contains("cursor is null"), true); + caught = true; + } + assertEquals(caught, true); + + conn.close(""); + connvalid = false; + } + + @Test + public void autoCloseCursor03() + throws WiredTigerPackingException { + Session s = sessionSetup(); + Cursor c = populate(s); + + conn.close(""); + connvalid = false; + + boolean caught = false; + try { + // Error: cursor used after connection close + c.close(); + } + catch (NullPointerException iae) { + assertEquals(iae.toString().contains("cursor is null"), true); + caught = true; + } + assertEquals(caught, true); + } + + @Test + public void autoCloseCursor04() + throws WiredTigerPackingException { + Session s = sessionSetup(); + + // The truncate call allows both of its cursor args + // to be null, so we don't expect null checking. + + Cursor cbegin = s.open_cursor("table:t", null, null); + cbegin.putKeyString("bar"); + cbegin.search(); + Cursor cend = s.open_cursor(null, cbegin, null); + s.truncate(null, cbegin, cend, ""); + cbegin.close(); + cend.close(); + + s.truncate("table:t", null, null, null); + + conn.close(""); + connvalid = false; + } + + @Test + public void autoCloseCursor05() + throws WiredTigerPackingException { + Session s = sessionSetup(); + Cursor c = populate(s); + + // Allowable compare call + c.putKeyString("bar"); + c.search(); + Cursor c2 = s.open_cursor(null, c, null); + c.compare(c2); + + boolean caught = false; + try { + // Error: cursor arg should not be null + c.compare(null); + } + catch (NullPointerException iae) { + assertEquals(iae.toString().contains("other is null"), true); + caught = true; + } + assertEquals(caught, true); + conn.close(""); + connvalid = false; + } + + @Test + public void autoCloseSession01() + throws WiredTigerPackingException { + Session s = sessionSetup(); + Cursor c = populate(s); + + s.close(""); + + boolean caught = false; + try { + // Error: session used after close + s.drop("table:t", ""); + } + catch (NullPointerException iae) { + assertEquals(iae.toString().contains("session is null"), true); + caught = true; + } + assertEquals(caught, true); + + conn.close(""); + connvalid = false; + } + + @Test + public void autoCloseSession02() + throws WiredTigerPackingException { + Session s = sessionSetup(); + Cursor c = populate(s); + + conn.close(""); + connvalid = false; + + boolean caught = false; + try { + // Error: session used after connection close + s.close(""); + } + catch (NullPointerException iae) { + assertEquals(iae.toString().contains("session is null"), true); + caught = true; + } + assertEquals(caught, true); + } + + public void autoCloseSession03() + throws WiredTigerPackingException { + Session s = sessionSetup(); + Cursor c = populate(s); + + s.close(""); + + boolean caught = false; + try { + // Error: session used after close, using open call + s.open_cursor("table:t", null, null); + } + catch (NullPointerException iae) { + assertEquals(iae.toString().contains("session is null"), true); + caught = true; + } + assertEquals(caught, true); + + conn.close(""); + connvalid = false; + } + + @Test + public void autoCloseConnection01() + throws WiredTigerPackingException { + Session s = sessionSetup(); + Cursor c = populate(s); + + conn.close(""); + connvalid = false; + + boolean caught = false; + try { + // Error: connection used after close + conn.close(""); + } + catch (NullPointerException iae) { + assertEquals(iae.toString().contains("connection is null"), true); + caught = true; + } + assertEquals(caught, true); + } + + @After + public void teardown() { + if (connvalid) { + conn.close(""); + } + } + +} diff --git a/test/java/com/wiredtiger/test/WiredTigerSuite.java b/test/java/com/wiredtiger/test/WiredTigerSuite.java index cbcc1b4323f..24847711cc7 100644 --- a/test/java/com/wiredtiger/test/WiredTigerSuite.java +++ b/test/java/com/wiredtiger/test/WiredTigerSuite.java @@ -31,6 +31,7 @@ import org.junit.runners.Suite; @RunWith(Suite.class) @Suite.SuiteClasses( { + AutoCloseTest.class, CursorTest.class, CursorTest02.class, PackTest.class diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c index 9d6eee1c59f..5b7a2c2db88 100644 --- a/test/salvage/salvage.c +++ b/test/salvage/salvage.c @@ -158,7 +158,7 @@ run(int r) printf("\t%s: run %d\n", __wt_page_type_string(page_type), r); WT_UNUSED_RET(system( - "rm -f WiredTiger WiredTiger.* __slvg.* __schema.*")); + "rm -f WiredTiger* __slvg.* __schema.*")); assert((res_fp = fopen(RSLT, "w")) != NULL); /* @@ -457,7 +457,12 @@ build(int ikey, int ivalue, int cnt) char config[256], kbuf[64], vbuf[64]; int new_slvg; - assert(wiredtiger_open(NULL, NULL, "create", &conn) == 0); + /* + * Disable logging: we're modifying files directly, we don't want to + * run recovery. + */ + assert(wiredtiger_open( + NULL, NULL, "create,log=(enabled=false)", &conn) == 0); assert(conn->open_session(conn, NULL, NULL, &session) == 0); assert(session->drop(session, "file:" LOAD, "force") == 0); @@ -604,24 +609,25 @@ process(void) config[0] = '\0'; if (verbose) snprintf(config, sizeof(config), - "error_prefix=\"%s\",verbose=[salvage,verify]", + "error_prefix=\"%s\",verbose=[salvage,verify],", progname); + strcat(config, "log=(enabled=false),"); + assert(wiredtiger_open(NULL, NULL, config, &conn) == 0); assert(conn->open_session(conn, NULL, NULL, &session) == 0); assert(session->salvage(session, "file:" SLVG, 0) == 0); assert(conn->close(conn, 0) == 0); /* Verify. */ - assert(wiredtiger_open(NULL, NULL, "", &conn) == 0); + assert(wiredtiger_open(NULL, NULL, config, &conn) == 0); assert(conn->open_session(conn, NULL, NULL, &session) == 0); assert(session->verify(session, "file:" SLVG, 0) == 0); assert(conn->close(conn, 0) == 0); /* Dump. */ assert((fp = fopen(DUMP, "w")) != NULL); - assert(wiredtiger_open(NULL, NULL, "", &conn) == 0); + assert(wiredtiger_open(NULL, NULL, config, &conn) == 0); assert(conn->open_session(conn, NULL, NULL, &session) == 0); - assert(session->create(session, "file:" SLVG, NULL) == 0); assert(session->open_cursor( session, "file:" SLVG, NULL, "dump=print", &cursor) == 0); while (cursor->next(cursor) == 0) { diff --git a/test/suite/test_txn02.py b/test/suite/test_txn02.py index ec0e2240f3d..bc024dfa531 100644 --- a/test/suite/test_txn02.py +++ b/test/suite/test_txn02.py @@ -29,8 +29,10 @@ # Transactions: commits and rollbacks # -import wiredtiger, wttest +import os, shutil +from wiredtiger import wiredtiger_open from wtscenario import multiply_scenarios, number_scenarios +import wttest class test_txn02(wttest.WiredTigerTestCase): tablename = 'test_txn02' @@ -70,11 +72,14 @@ class test_txn02(wttest.WiredTigerTestCase): txn4s = [('t4c', dict(txn4='commit')), ('t4r', dict(txn4='rollback'))] scenarios = number_scenarios(multiply_scenarios('.', types, - op1s, txn1s, op2s, txn2s, op3s, txn3s, op4s, txn4s)) # [:1] + op1s, txn1s, op2s, txn2s, op3s, txn3s, op4s, txn4s)) # Overrides WiredTigerTestCase def setUpConnectionOpen(self, dir): - conn = wiredtiger.wiredtiger_open(dir, 'create,' + + self.home = dir + self.backup_dir = os.path.join(self.home, "WT_BACKUP") + conn = wiredtiger_open(dir, + 'create,log=(enabled,file_max=100K),' + ('error_prefix="%s: ",' % self.shortid())) self.pr(`conn`) self.session2 = conn.open_session() @@ -89,9 +94,7 @@ class test_txn02(wttest.WiredTigerTestCase): actual = dict((k, v) for k, v in c if v != 0) # Search for the expected items as well as iterating for k, v in expected.iteritems(): - c.set_key(k) - c.search() - self.assertEqual(c.get_value(), v) + self.assertEqual(c[k], v) c.close() if txn_config: session.commit_transaction() @@ -108,7 +111,19 @@ class test_txn02(wttest.WiredTigerTestCase): self.check(self.session2, "isolation=read-committed", committed) self.check(self.session2, "isolation=read-uncommitted", current) + # Opening a clone of the database home directory should see the + # committed results. + wttest.removeAll(self.backup_dir) + shutil.copytree(self.home, self.backup_dir) + backup_conn = wiredtiger_open(self.backup_dir, 'log=(enabled)') + try: + self.check(backup_conn.open_session(), None, committed) + #self.check(backup_conn.open_session(), None, {}) + finally: + backup_conn.close() + def test_ops(self): + # print "Creating %s with config '%s'" % (self.uri, self.create_params) self.session.create(self.uri, self.create_params) # Set up the table with entries for 1 and 10 # We use the overwrite config so insert can update as needed. diff --git a/test/thread/t.c b/test/thread/t.c index 5d9d10df0f3..ba37522bd04 100644 --- a/test/thread/t.c +++ b/test/thread/t.c @@ -203,7 +203,7 @@ wt_shutdown(void) static void shutdown(void) { - WT_UNUSED_RET(system("rm -f WiredTiger.* __wt*")); + WT_UNUSED_RET(system("rm -f WiredTiger* __wt*")); } static int diff --git a/tools/statlog.py b/tools/statlog.py index de7f6139c95..f46c834dc5d 100644 --- a/tools/statlog.py +++ b/tools/statlog.py @@ -42,6 +42,7 @@ no_scale_per_second_list = [ 'cache: tracked dirty pages in the cache', 'cache: pages currently held in the cache', 'files currently open', + 'log: total log buffer size', 'open cursor count', 'block manager: file allocation unit size', 'block manager: checkpoint size', @@ -68,7 +69,7 @@ no_scale_per_second_list = [ 'overflow values cached in memory', 'chunks in the LSM tree', 'highest merge generation in the LSM tree', - 'reconciliation maximum number of splits created for a page', + 'reconciliation maximum splits for a page', 'open cursor count', ] # no scale-per-second list section: END |