Diffstat (limited to 'src')
171 files changed, 3031 insertions, 4014 deletions
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
index b659d83cbc7..697d59c8dcd 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
@@ -226,9 +226,9 @@ cb_asyncop(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op, int ret, uint32_t flags)
     /*
      * Either we have success and we track it, or failure and panic.
      *
-     * Reads and updates can fail with WT_NOTFOUND: we may be searching
-     * in a random range, or an insert op might have updated the
-     * last record in the table but not yet finished the actual insert.
+     * Reads and updates can fail with WT_NOTFOUND: we may be searching in a random range, or an
+     * insert op might have updated the last record in the table but not yet finished the actual
+     * insert.
      */
     if (type == WT_AOP_COMPACT)
         return (0);
diff --git a/src/third_party/wiredtiger/dist/s_comment.py b/src/third_party/wiredtiger/dist/s_comment.py
index 556862f0fcc..f30de0e4794 100644
--- a/src/third_party/wiredtiger/dist/s_comment.py
+++ b/src/third_party/wiredtiger/dist/s_comment.py
@@ -63,14 +63,20 @@ for line in sys.stdin:
         indent_ws = ' ' * indentation
         sys.stdout.write('{}/*\n'.format(indent_ws))
         current_line = indent_ws + ' *'
-        for word in words:
+        for i in range(len(words)):
+            word = words[i]
             if word == '--' and function_desc:
                 sys.stdout.write(current_line + ' ' + word + '\n')
                 current_line = indent_ws + ' *' + ' ' * 4
                 continue
             if word == '\n':
-                sys.stdout.write(current_line + '\n')
-                sys.stdout.write(indent_ws + ' *' + '\n')
+                # If we already have partially built a line, write it out.
+                if current_line != indent_ws + ' *':
+                    sys.stdout.write(current_line + '\n')
+                # If there are more words in this comment after this
+                # newline, add another line break.
+                if i < (len(words) - 1):
+                    sys.stdout.write(indent_ws + ' *' + '\n')
                 current_line = indent_ws + ' *'
                 continue
             if len(current_line) + len(word) >= line_length:
@@ -89,6 +95,10 @@ for line in sys.stdin:
         function_desc = False
     elif multiline:
         comment += line
+        # We want to preserve newlines for block comments that have multiple paragraphs.
+        if sline == '*':
+            words.append('\n')
+            continue
     # Function names begin with either a lowercase char or an underscore.
     if (len(sline) >= 3 and sline.startswith('*') and sline[1] == ' ' and
         (sline[2].islower() or sline[2] == '_') and sline.endswith('--')):
diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c
index ffc3c469881..53c2ac9d95d 100644
--- a/src/third_party/wiredtiger/examples/c/ex_all.c
+++ b/src/third_party/wiredtiger/examples/c/ex_all.c
@@ -1047,12 +1047,12 @@ connection_ops(WT_CONNECTION *conn)
     /*! [Configure method configuration] */
     /*
-     * Applications opening a cursor for the data-source object "my_data"
-     * have an additional configuration option "entries", which is an
-     * integer type, defaults to 5, and must be an integer between 1 and 10.
+     * Applications opening a cursor for the data-source object "my_data" have an additional
+     * configuration option "entries", which is an integer type, defaults to 5, and must be an
+     * integer between 1 and 10.
      *
-     * The method being configured is specified using a concatenation of the
-     * handle name, a period and the method name.
+     * The method being configured is specified using a concatenation of the handle name, a period
+     * and the method name.
      */
     error_check(conn->configure_method(
       conn, "WT_SESSION.open_cursor", "my_data:", "entries=5", "int", "min=1,max=10"));
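The configure_method call above registers an "entries" option for WT_SESSION.open_cursor on the "my_data:" data source. For orientation, a hedged sketch of how an application might then pass that option when opening a cursor; the helper name is illustrative, not part of the example:

#include <wiredtiger.h>

/* Hedged sketch: use the "entries" option registered by configure_method.
 * The value must be an integer in [1, 10]; 5 is the configured default. */
static int
open_my_data_cursor(WT_SESSION *session, WT_CURSOR **cursorp)
{
    return (session->open_cursor(session, "my_data:", NULL, "entries=7", cursorp));
}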
diff --git a/src/third_party/wiredtiger/examples/c/ex_call_center.c b/src/third_party/wiredtiger/examples/c/ex_call_center.c
index 2c404046ee8..3a7430300c4 100644
--- a/src/third_party/wiredtiger/examples/c/ex_call_center.c
+++ b/src/third_party/wiredtiger/examples/c/ex_call_center.c
@@ -141,17 +141,16 @@ main(int argc, char *argv[])
     error_check(cursor->close(cursor));
 
     /*
-     * First query: a call arrives.  In SQL:
+     * First query: a call arrives. In SQL:
      *
      * SELECT id, name FROM Customers WHERE phone=?
      *
-     * Use the cust_phone index, lookup by phone number to fill the
-     * customer record. The cursor will have a key format of "S" for a
-     * string because the cust_phone index has a single column ("phone"),
-     * which is of type "S".
+     * Use the cust_phone index, lookup by phone number to fill the customer record. The cursor will
+     * have a key format of "S" for a string because the cust_phone index has a single column
+     * ("phone"), which is of type "S".
      *
-     * Specify the columns we want: the customer ID and the name. This
-     * means the cursor's value format will be "rS".
+     * Specify the columns we want: the customer ID and the name. This means the cursor's value
+     * format will be "rS".
      */
     error_check(
       session->open_cursor(session, "index:customers:phone(id,name)", NULL, NULL, &cursor));
@@ -162,17 +161,16 @@ main(int argc, char *argv[])
     error_check(cursor->close(cursor));
 
     /*
-     * Next query: get the recent order history.  In SQL:
+     * Next query: get the recent order history. In SQL:
      *
      * SELECT * FROM Calls WHERE cust_id=? ORDER BY call_date DESC LIMIT 3
      *
-     * Use the call_cust_date index to find the matching calls. Since it is
-     * is in increasing order by date for a given customer, we want to start
-     * with the last record for the customer and work backwards.
+     * Use the call_cust_date index to find the matching calls. Since it is in increasing order by
+     * date for a given customer, we want to start with the last record for the customer and work
+     * backwards.
      *
-     * Specify a subset of columns to be returned. (Note that if these were
-     * all covered by the index, the primary would not have to be accessed.)
-     * Stop after getting 3 records.
+     * Specify a subset of columns to be returned. (Note that if these were all covered by the
+     * index, the primary would not have to be accessed.) Stop after getting 3 records.
      */
     error_check(session->open_cursor(
       session, "index:calls:cust_date(cust_id,call_type,notes)", NULL, NULL, &cursor));
diff --git a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_call_center.java b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_call_center.java
index 97e5c3d7d34..abcbc395170 100644
--- a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_call_center.java
+++ b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_call_center.java
@@ -238,7 +238,7 @@ public class ex_call_center {
          * SELECT * FROM Calls WHERE cust_id=? ORDER BY call_date DESC LIMIT 3
          *
          * Use the call_cust_date index to find the matching calls. Since it is
-         * is in increasing order by date for a given customer, we want to start
+         * in increasing order by date for a given customer, we want to start
          * with the last record for the customer and work backwards.
          *
          * Specify a subset of columns to be returned. (Note that if these were
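The two ex_call_center hunks above describe positioning an index cursor and projecting columns. A hedged sketch of the first query's lookup pattern in plain WiredTiger cursor calls; the helper is illustrative, while the URI and the "S"/"rS" formats come from the example:

#include <inttypes.h>
#include <stdio.h>
#include <wiredtiger.h>

/* Hedged sketch: look up a customer by phone number via the cust_phone index;
 * key format "S" (the phone string), value format "rS" (id, name). */
static int
find_customer(WT_SESSION *session, const char *phone)
{
    WT_CURSOR *cursor;
    const char *name;
    uint64_t id;
    int ret;

    if ((ret = session->open_cursor(
           session, "index:customers:phone(id,name)", NULL, NULL, &cursor)) != 0)
        return (ret);
    cursor->set_key(cursor, phone);
    if ((ret = cursor->search(cursor)) == 0 &&
      (ret = cursor->get_value(cursor, &id, &name)) == 0)
        printf("customer %" PRIu64 ": %s\n", id, name);
    (void)cursor->close(cursor); /* Close error ignored for brevity. */
    return (ret);
}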
diff --git a/src/third_party/wiredtiger/ext/collators/revint/revint_collator.c b/src/third_party/wiredtiger/ext/collators/revint/revint_collator.c
index d3dbaa4f4c7..52cc8c958f5 100644
--- a/src/third_party/wiredtiger/ext/collators/revint/revint_collator.c
+++ b/src/third_party/wiredtiger/ext/collators/revint/revint_collator.c
@@ -61,22 +61,18 @@ revint_compare(
     wt_api = revint_collator->wt_api;
 
     /*
-     * All indices using this collator have an integer key, and the
-     * primary key is also an integer. A collator is usually passed the
-     * concatenation of index key and primary key (when available),
+     * All indices using this collator have an integer key, and the primary key is also an integer.
+     * A collator is usually passed the concatenation of index key and primary key (when available),
      * hence we initially unpack using "ii".
      *
-     * A collator may also be called with an item that includes a index
-     * key and no primary key. Among items having the same index key,
-     * an item with no primary key should sort before an item with a
-     * primary key. The reason is that if the application calls
-     * WT_CURSOR::search on a index key for which there are more than
-     * one value, the search key will not yet have a primary key. We
-     * want to position the cursor at the 'first' matching index key so
-     * that repeated calls to WT_CURSOR::next will see them all.
+     * A collator may also be called with an item that includes a index key and no primary key.
+     * Among items having the same index key, an item with no primary key should sort before an item
+     * with a primary key. The reason is that if the application calls WT_CURSOR::search on a index
+     * key for which there are more than one value, the search key will not yet have a primary key.
+     * We want to position the cursor at the 'first' matching index key so that repeated calls to
+     * WT_CURSOR::next will see them all.
      *
-     * To keep this code simple, we do not reverse the ordering
-     * when comparing primary keys.
+     * To keep this code simple, we do not reverse the ordering when comparing primary keys.
      */
     if ((ret = wt_api->unpack_start(wt_api, session, "ii", k1->data, k1->size, &pstream)) != 0)
         return (ret);
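To make the revint ordering rules concrete, a hedged sketch of the comparison the comment describes: index keys compare in reverse, an item with no primary key sorts before one with a primary key, and primary keys compare in forward order. The helper and its argument encoding are illustrative, not the extension's signature:

#include <stdbool.h>
#include <stdint.h>

static int
revint_order(int64_t ikey1, bool has_pkey1, int64_t pkey1, int64_t ikey2, bool has_pkey2,
  int64_t pkey2)
{
    if (ikey1 != ikey2) /* Reverse order on the index key. */
        return (ikey1 > ikey2 ? -1 : 1);
    if (!has_pkey1 || !has_pkey2) /* A missing primary key sorts first... */
        return (has_pkey1 == has_pkey2 ? 0 : (!has_pkey1 ? -1 : 1));
    /* ...so WT_CURSOR::search lands on the first matching index entry. */
    return (pkey1 == pkey2 ? 0 : (pkey1 < pkey2 ? -1 : 1)); /* Not reversed. */
}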
diff --git a/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c b/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c
index 2204f4942fa..700bb84216d 100644
--- a/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c
+++ b/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c
@@ -190,20 +190,18 @@ lz4_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src, siz
     }
 
     /*
-     * Decompress, starting after the prefix bytes. Use safe decompression:
-     * we rely on decompression to detect corruption.
+     * Decompress, starting after the prefix bytes. Use safe decompression: we rely on decompression
+     * to detect corruption.
      *
-     * Two code paths, one with and one without a bounce buffer. When doing
-     * raw compression, we compress to a target size irrespective of row
-     * boundaries, and return to our caller a "useful" compression length
-     * based on the last complete row that was compressed. Our caller stores
-     * that length, not the length of bytes actually compressed by LZ4. In
-     * other words, our caller doesn't know how many bytes will result from
-     * decompression, likely hasn't provided us a large enough buffer, and
-     * we have to allocate a scratch buffer.
+     * Two code paths, one with and one without a bounce buffer. When doing raw compression, we
+     * compress to a target size irrespective of row boundaries, and return to our caller a "useful"
+     * compression length based on the last complete row that was compressed. Our caller stores that
+     * length, not the length of bytes actually compressed by LZ4. In other words, our caller
+     * doesn't know how many bytes will result from decompression, likely hasn't provided us a large
+     * enough buffer, and we have to allocate a scratch buffer.
      *
-     * Even though raw compression has been removed from WiredTiger, this
-     * code remains for backward compatibility with existing objects.
+     * Even though raw compression has been removed from WiredTiger, this code remains for backward
+     * compatibility with existing objects.
      */
     if (dst_len < prefix.uncompressed_len) {
         if ((dst_tmp = wt_api->scr_alloc(wt_api, session, (size_t)prefix.uncompressed_len)) == NULL)
diff --git a/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c b/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c
index 8bf60e5f25f..1f0a15997b9 100644
--- a/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c
+++ b/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c
@@ -157,9 +157,8 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
         return (errno);
 
     /*
-     * Allocate a local compressor structure, with a WT_COMPRESSOR structure
-     * as the first field, allowing us to treat references to either type of
-     * structure as a reference to the other type.
+     * Allocate a local compressor structure, with a WT_COMPRESSOR structure as the first field,
+     * allowing us to treat references to either type of structure as a reference to the other type.
      *
      * Heap memory (not static), because it can support multiple databases.
      */
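The "first field" allocation in the nop compressor (and, below, the encryptor extensions) is the standard C embedding idiom: a pointer to a structure is also a pointer to its first member, so the same allocation can be viewed as either type. A hedged sketch with illustrative names:

#include <stdlib.h>
#include <wiredtiger.h>

typedef struct {
    WT_COMPRESSOR iface; /* Must come first to make the casts below legal. */
    int private_state;   /* Extension-private fields follow. */
} LOCAL_COMPRESSOR;

static LOCAL_COMPRESSOR *
local_compressor_alloc(void)
{
    /* (WT_COMPRESSOR *)local == &local->iface, so WiredTiger can hold the
     * WT_COMPRESSOR pointer and the extension can cast it back. */
    return (calloc(1, sizeof(LOCAL_COMPRESSOR)));
}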
diff --git a/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c b/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c
index ce63e89334e..d1febdd63dd 100644
--- a/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c
+++ b/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c
@@ -128,9 +128,8 @@ snappy_compression(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src,
     *compression_failed = 0;
 
     /*
-     * On decompression, snappy requires an exact compressed byte
-     * count (the current value of snaplen). WiredTiger does not
-     * preserve that value, so save snaplen at the beginning of
+     * On decompression, snappy requires an exact compressed byte count (the current value of
+     * snaplen). WiredTiger does not preserve that value, so save snaplen at the beginning of
      * the destination buffer.
      *
      * Store the value in little-endian format.
diff --git a/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c b/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c
index 23087fa87f4..a6abf95e558 100644
--- a/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c
+++ b/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c
@@ -116,9 +116,8 @@ zstd_compress(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src, size
     *compression_failed = 0;
 
     /*
-     * On decompression, Zstd requires an exact compressed byte
-     * count (the current value of zstd_ret). WiredTiger does not
-     * preserve that value, so save zstd_ret at the beginning of
+     * On decompression, Zstd requires an exact compressed byte count (the current value of
+     * zstd_ret). WiredTiger does not preserve that value, so save zstd_ret at the beginning of
      * the destination buffer.
      *
      * Store the value in little-endian format.
diff --git a/src/third_party/wiredtiger/ext/encryptors/nop/nop_encrypt.c b/src/third_party/wiredtiger/ext/encryptors/nop/nop_encrypt.c
index 7192381befe..be12a6b19ea 100644
--- a/src/third_party/wiredtiger/ext/encryptors/nop/nop_encrypt.c
+++ b/src/third_party/wiredtiger/ext/encryptors/nop/nop_encrypt.c
@@ -168,9 +168,8 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
         return (errno);
 
     /*
-     * Allocate a local encryptor structure, with a WT_ENCRYPTOR structure
-     * as the first field, allowing us to treat references to either type of
-     * structure as a reference to the other type.
+     * Allocate a local encryptor structure, with a WT_ENCRYPTOR structure as the first field,
+     * allowing us to treat references to either type of structure as a reference to the other type.
      *
      * Heap memory (not static), because it can support multiple databases.
      */
diff --git a/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c b/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c
index 8fc355c9d6c..df252beefbd 100644
--- a/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c
+++ b/src/third_party/wiredtiger/ext/encryptors/rotn/rotn_encrypt.c
@@ -145,8 +145,8 @@ do_rotate(char *buf, size_t len, int rotn)
     /*
      * Now rotate.
      *
-     * Avoid ctype functions because they behave in unexpected ways,
-     * particularly when the locale is not "C".
+     * Avoid ctype functions because they behave in unexpected ways, particularly when the locale is
+     * not "C".
      */
     for (i = 0; i < len; i++) {
        if ('a' <= buf[i] && buf[i] <= 'z')
@@ -439,9 +439,8 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
         return (errno);
 
     /*
-     * Allocate a local encryptor structure, with a WT_ENCRYPTOR structure
-     * as the first field, allowing us to treat references to either type of
-     * structure as a reference to the other type.
+     * Allocate a local encryptor structure, with a WT_ENCRYPTOR structure as the first field,
+     * allowing us to treat references to either type of structure as a reference to the other type.
      *
      * Heap memory (not static), because it can support multiple databases.
      */
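The snappy and Zstd hunks above describe the same convention: the exact compressed byte count is saved little-endian at the start of the destination buffer so decompression can recover it on any platform. A hedged sketch of that byte-wise encoding; function names are illustrative:

#include <stdint.h>

static void
put_len_le(uint8_t *dst, uint64_t len) /* Store 8 bytes, least-significant first. */
{
    for (int i = 0; i < 8; i++)
        dst[i] = (uint8_t)(len >> (8 * i));
}

static uint64_t
get_len_le(const uint8_t *src) /* Recover the count regardless of host byte order. */
{
    uint64_t len = 0;

    for (int i = 0; i < 8; i++)
        len |= (uint64_t)src[i] << (8 * i);
    return (len);
}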
diff --git a/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c b/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c
index a715a1056d9..9f9f510c8cf 100644
--- a/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c
+++ b/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c
@@ -47,17 +47,15 @@
 #define FAIL_FS_ENV_READ_ALLOW "WT_FAIL_FS_READ_ALLOW"
 
 /*
- * A "fail file system", that is, a file system extension that fails when we
- * want it to. This is only used in test frameworks, this fact allows us to
- * simplify some error paths. This code is not portable to Windows, as it has
- * direct knowledge of file descriptors, environment variables and stack
- * traces.
+ * A "fail file system", that is, a file system extension that fails when we want it to. This is
+ * only used in test frameworks, this fact allows us to simplify some error paths. This code is not
+ * portable to Windows, as it has direct knowledge of file descriptors, environment variables and
+ * stack traces.
  *
- * When the filesystem extension is configured, parameters can set how many
- * reads or writes can be allowed before failure. If this is not fine-grained
- * enough, an 'environment' configuration parameter can be specified. If that
- * is used, then on every file system read or write, environment variables are
- * checked that control when reading or writing should fail.
+ * When the filesystem extension is configured, parameters can set how many reads or writes can be
+ * allowed before failure. If this is not fine-grained enough, an 'environment' configuration
+ * parameter can be specified. If that is used, then on every file system read or write, environment
+ * variables are checked that control when reading or writing should fail.
  */
 typedef struct {
     WT_FILE_SYSTEM iface;
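The fail_fs comment describes failure injection driven by environment variables such as WT_FAIL_FS_READ_ALLOW. A hedged sketch of that pattern; only the variable name comes from the diff, the helper and its policy are illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Allow a read only while the number of reads so far is below the limit the
 * environment variable sets; an unset variable means no limit. */
static bool
fail_fs_read_allowed(uint64_t reads_so_far)
{
    const char *allow;

    if ((allow = getenv("WT_FAIL_FS_READ_ALLOW")) == NULL)
        return (true);
    return (reads_so_far < strtoull(allow, NULL, 10));
}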
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 32cdae47322..a92e90e2f91 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
 {
-    "commit": "3818fb14bd816a5fa2b9e8d50b0c2f93a3bfb9ca",
+    "commit": "af2cb8f052184a94635c3bfc358620dd36df5828",
     "github": "wiredtiger/wiredtiger.git",
     "vendor": "wiredtiger",
     "branch": "mongodb-4.4"
diff --git a/src/third_party/wiredtiger/src/async/async_api.c b/src/third_party/wiredtiger/src/async/async_api.c
index 81b23b238e7..ec12d8d02ab 100644
--- a/src/third_party/wiredtiger/src/async/async_api.c
+++ b/src/third_party/wiredtiger/src/async/async_api.c
@@ -176,8 +176,8 @@ __async_config(WT_SESSION_IMPL *session, WT_CONNECTION_IMPL *conn, const char **
     *runp = cval.val != 0;
 
     /*
-     * Even if async is turned off, we want to parse and store the default
-     * values so that reconfigure can just enable them.
+     * Even if async is turned off, we want to parse and store the default values so that
+     * reconfigure can just enable them.
      *
      * Bound the minimum maximum operations at 10.
      */
@@ -308,18 +308,15 @@ __wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[])
     WT_RET(__async_config(session, &tmp_conn, cfg, &run));
 
     /*
-     * There are some restrictions on the live reconfiguration of async.
-     * Unlike other subsystems where we simply destroy anything existing
-     * and restart with the new configuration, async is not so easy.
-     * If the user is just changing the number of workers, we want to
-     * allow the existing op handles and other information to remain in
-     * existence. So we must handle various combinations of changes
-     * individually.
+     * There are some restrictions on the live reconfiguration of async. Unlike other subsystems
+     * where we simply destroy anything existing and restart with the new configuration, async is
+     * not so easy. If the user is just changing the number of workers, we want to allow the
+     * existing op handles and other information to remain in existence. So we must handle various
+     * combinations of changes individually.
      *
-     * One restriction is that if async is currently on, the user cannot
-     * change the number of async op handles available. The user can try
-     * but we do nothing with it. However we must allow the ops_max config
-     * string so that a user can completely start async via reconfigure.
+     * One restriction is that if async is currently on, the user cannot change the number of async
+     * op handles available. The user can try but we do nothing with it. However we must allow the
+     * ops_max config string so that a user can completely start async via reconfigure.
      */
 
     /*
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c
index 5e2f261a424..158fc919820 100644
--- a/src/third_party/wiredtiger/src/block/block_ckpt.c
+++ b/src/third_party/wiredtiger/src/block/block_ckpt.c
@@ -272,16 +272,13 @@ __ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
     WT_BLOCK_CKPT *ci;
 
     /*
-     * Allocate a checkpoint structure, crack the cookie and read the
-     * checkpoint's extent lists.
+     * Allocate a checkpoint structure, crack the cookie and read the checkpoint's extent lists.
      *
-     * Ignore the avail list: checkpoint avail lists are only useful if we
-     * are rolling forward from the particular checkpoint and they represent
-     * our best understanding of what blocks can be allocated. If we are
-     * not operating on the live checkpoint, subsequent checkpoints might
-     * have allocated those blocks, and the avail list is useless. We don't
-     * discard it, because it is useful as part of verification, but we
-     * don't re-write it either.
+     * Ignore the avail list: checkpoint avail lists are only useful if we are rolling forward from
+     * the particular checkpoint and they represent our best understanding of what blocks can be
+     * allocated. If we are not operating on the live checkpoint, subsequent checkpoints might have
+     * allocated those blocks, and the avail list is useless. We don't discard it, because it is
+     * useful as part of verification, but we don't re-write it either.
      */
     WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
@@ -366,30 +363,24 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
 #endif
 
     /*
-     * Checkpoints are a two-step process: first, write a new checkpoint to
-     * disk (including all the new extent lists for modified checkpoints
-     * and the live system). As part of this, create a list of file blocks
-     * newly available for reallocation, based on checkpoints being deleted.
-     * We then return the locations of the new checkpoint information to our
-     * caller. Our caller has to write that information into some kind of
-     * stable storage, and once that's done, we can actually allocate from
-     * that list of newly available file blocks. (We can't allocate from
-     * that list immediately because the allocation might happen before our
-     * caller saves the new checkpoint information, and if we crashed before
-     * the new checkpoint location was saved, we'd have overwritten blocks
-     * still referenced by checkpoints in the system.) In summary, there is
-     * a second step: after our caller saves the checkpoint information, we
-     * are called to add the newly available blocks into the live system's
-     * available list.
+     * Checkpoints are a two-step process: first, write a new checkpoint to disk (including all the
+     * new extent lists for modified checkpoints and the live system). As part of this, create a
+     * list of file blocks newly available for reallocation, based on checkpoints being deleted. We
+     * then return the locations of the new checkpoint information to our caller. Our caller has to
+     * write that information into some kind of stable storage, and once that's done, we can
+     * actually allocate from that list of newly available file blocks. (We can't allocate from that
+     * list immediately because the allocation might happen before our caller saves the new
+     * checkpoint information, and if we crashed before the new checkpoint location was saved, we'd
+     * have overwritten blocks still referenced by checkpoints in the system.) In summary, there is
+     * a second step: after our caller saves the checkpoint information, we are called to add the
+     * newly available blocks into the live system's available list.
      *
-     * This function is the first step, the second step is in the resolve
-     * function.
+     * This function is the first step, the second step is in the resolve function.
      *
-     * If we're called to checkpoint the same file twice (without the second
-     * resolution step), or re-entered for any reason, it's an error in our
-     * caller, and our choices are all bad: leak blocks or potentially crash
-     * with our caller not yet having saved previous checkpoint information
-     * to stable storage.
+     * If we're called to checkpoint the same file twice (without the second resolution step), or
+     * re-entered for any reason, it's an error in our caller, and our choices are all bad: leak
+     * blocks or potentially crash with our caller not yet having saved previous checkpoint
+     * information to stable storage.
      */
     __wt_spin_lock(session, &block->live_lock);
     switch (block->ckpt_state) {
@@ -412,18 +403,16 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
     WT_RET(ret);
 
     /*
-     * Extents newly available as a result of deleting previous checkpoints
-     * are added to a list of extents. The list should be empty, but as
-     * described above, there is no "free the checkpoint information" call
-     * into the block manager; if there was an error in an upper level that
-     * resulted in some previous checkpoint never being resolved, the list
-     * may not be empty. We should have caught that with the "checkpoint
-     * in progress" test, but it doesn't cost us anything to be cautious.
+     * Extents newly available as a result of deleting previous checkpoints are added to a list of
+     * extents. The list should be empty, but as described above, there is no "free the checkpoint
+     * information" call into the block manager; if there was an error in an upper level that
+     * resulted in some previous checkpoint never being resolved, the list may not be empty. We
+     * should have caught that with the "checkpoint in progress" test, but it doesn't cost us
+     * anything to be cautious.
      *
-     * We free the checkpoint's allocation and discard extent lists as part
-     * of the resolution step, not because they're needed at that time, but
-     * because it's potentially a lot of work, and waiting allows the btree
-     * layer to continue eviction sooner. As for the checkpoint-available
+     * We free the checkpoint's allocation and discard extent lists as part of the resolution step,
+     * not because they're needed at that time, but because it's potentially a lot of work, and
+     * waiting allows the btree layer to continue eviction sooner. As for the checkpoint-available
      * list, make sure they get cleaned out.
      */
     __wt_block_extlist_free(session, &ci->ckpt_avail);
@@ -566,11 +555,11 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
             continue;
 
         /*
-         * We have to write the "to" checkpoint's extent lists out in
-         * new blocks, and update its cookie.
+         * We have to write the "to" checkpoint's extent lists out in new blocks, and update its
+         * cookie.
          *
-         * Free the blocks used to hold the "to" checkpoint's extent
-         * lists; don't include the avail list, it's not changing.
+         * Free the blocks used to hold the "to" checkpoint's extent lists; don't include the avail
+         * list, it's not changing.
          */
         WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
         WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));
@@ -717,16 +706,14 @@ __ckpt_update(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, WT_C
     }
 
     /*
-     * We only write an avail list for the live system, other checkpoint's
-     * avail lists are static and never change.
+     * We only write an avail list for the live system, other checkpoint's avail lists are static
+     * and never change.
      *
-     * Write the avail list last so it reflects changes due to allocating
-     * blocks for the alloc and discard lists. Second, when we write the
-     * live system's avail list, it's two lists: the current avail list
-     * plus the list of blocks to be made available when the new checkpoint
-     * completes. We can't merge that second list into the real list yet,
-     * it's not truly available until the new checkpoint locations have been
-     * saved to the metadata.
+     * Write the avail list last so it reflects changes due to allocating blocks for the alloc and
+     * discard lists. Second, when we write the live system's avail list, it's two lists: the
+     * current avail list plus the list of blocks to be made available when the new checkpoint
+     * completes. We can't merge that second list into the real list yet, it's not truly available
+     * until the new checkpoint locations have been saved to the metadata.
      */
     if (is_live) {
         block->final_ckpt = ckpt;
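The two-step protocol the __ckpt_process comment describes can be modeled as a pair of operations over a pending list: blocks freed by deleted checkpoints are parked until the caller persists the new checkpoint locations, and only then become allocatable. A hedged model, not WiredTiger's actual code:

#include <stddef.h>

struct free_lists {
    size_t avail;   /* Blocks safe to reallocate now. */
    size_t pending; /* Freed by this checkpoint; unsafe until resolve. */
};

/* Step 1: write the checkpoint; newly freed blocks only reach pending. */
static void
ckpt_write(struct free_lists *fl, size_t newly_freed)
{
    fl->pending += newly_freed;
}

/* Step 2: after the caller saves the checkpoint locations to stable storage,
 * resolve by merging pending into avail. Reusing the blocks any earlier could
 * overwrite data still referenced if we crashed before the save. */
static void
ckpt_resolve(struct free_lists *fl)
{
    fl->avail += fl->pending;
    fl->pending = 0;
}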
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
index b7fda0d73b2..9af0221a81f 100644
--- a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
+++ b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c
@@ -239,16 +239,14 @@ __wt_block_checkpoint_last(WT_SESSION_IMPL *session, WT_BLOCK *block, char **met
     WT_RET(__wt_buf_init(session, checkpoint, WT_BLOCK_CHECKPOINT_BUFFER));
 
     /*
-     * Initialize a pair of structures that track the best and current
-     * checkpoints found so far. This is a little trickier than normal
-     * because we don't want to start saving a checkpoint only to find
-     * out it's not one we can use. I doubt that can happen and it
-     * suggests corruption, but half-a-checkpoint isn't a good place to
-     * be. Only swap to a new "best" checkpoint if we read the whole
-     * thing successfully.
+     * Initialize a pair of structures that track the best and current checkpoints found so far.
+     * This is a little trickier than normal because we don't want to start saving a checkpoint only
+     * to find out it's not one we can use. I doubt that can happen and it suggests corruption, but
+     * half-a-checkpoint isn't a good place to be. Only swap to a new "best" checkpoint if we read
+     * the whole thing successfully.
      *
-     * Don't re-order these lines: it's done this way so the WT_ITEMs
-     * are always initialized and error handling works.
+     * Don't re-order these lines: it's done this way so the WT_ITEMs are always initialized and
+     * error handling works.
      */
     memset((best = &_best), 0, sizeof(_best));
     memset((current = &_current), 0, sizeof(_current));
diff --git a/src/third_party/wiredtiger/src/block/block_compact.c b/src/third_party/wiredtiger/src/block/block_compact.c
index 6fe4d879e23..ea20bb80ef9 100644
--- a/src/third_party/wiredtiger/src/block/block_compact.c
+++ b/src/third_party/wiredtiger/src/block/block_compact.c
@@ -93,14 +93,12 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp)
     /*
      * Skip files where we can't recover at least 1MB.
      *
-     * If at least 20% of the total file is available and in the first 80%
-     * of the file, we'll try compaction on the last 20% of the file; else,
-     * if at least 10% of the total file is available and in the first 90%
-     * of the file, we'll try compaction on the last 10% of the file.
+     * If at least 20% of the total file is available and in the first 80% of the file, we'll try
+     * compaction on the last 20% of the file; else, if at least 10% of the total file is available
+     * and in the first 90% of the file, we'll try compaction on the last 10% of the file.
      *
-     * We could push this further, but there's diminishing returns, a mostly
-     * empty file can be processed quickly, so more aggressive compaction is
-     * less useful.
+     * We could push this further, but there's diminishing returns, a mostly empty file can be
+     * processed quickly, so more aggressive compaction is less useful.
      */
     if (avail_eighty > WT_MEGABYTE && avail_eighty >= ((block->size / 10) * 2)) {
         *skipp = false;
diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c
index ac8ef950868..71aeea0714f 100644
--- a/src/third_party/wiredtiger/src/block/block_ext.c
+++ b/src/third_party/wiredtiger/src/block/block_ext.c
@@ -63,14 +63,14 @@ __block_off_srch(WT_EXT **head, wt_off_t off, WT_EXT ***stack, bool skip_off)
     int i;
 
     /*
-     * Start at the highest skip level, then go as far as possible at each
-     * level before stepping down to the next.
+     * Start at the highest skip level, then go as far as possible at each level before stepping
+     * down to the next.
      *
      * Return a stack for an exact match or the next-largest item.
      *
-     * The WT_EXT structure contains two skiplists, the primary one and the
-     * per-size bucket one: if the skip_off flag is set, offset the skiplist
-     * array by the depth specified in this particular structure.
+     * The WT_EXT structure contains two skiplists, the primary one and the per-size bucket one: if
+     * the skip_off flag is set, offset the skiplist array by the depth specified in this particular
+     * structure.
      */
     for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
         if (*extp != NULL && (*extp)->off < off)
@@ -113,8 +113,8 @@ __block_size_srch(WT_SIZE **head, wt_off_t size, WT_SIZE ***stack)
     int i;
 
     /*
-     * Start at the highest skip level, then go as far as possible at each
-     * level before stepping down to the next.
+     * Start at the highest skip level, then go as far as possible at each level before stepping
+     * down to the next.
      *
      * Return a stack for an exact match or the next-largest item.
      */
@@ -451,8 +451,8 @@ static inline int
 __block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size)
 {
     /*
-     * Callers of this function are expected to have already acquired any
-     * locks required to extend the file.
+     * Callers of this function are expected to have already acquired any locks required to extend
+     * the file.
      *
      * We should never be allocating from an empty file.
      */
@@ -502,14 +502,12 @@ __wt_block_alloc(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_o
       (intmax_t)size, block->allocsize);
 
     /*
-     * Allocation is either first-fit (lowest offset), or best-fit (best
-     * size). If it's first-fit, walk the offset list linearly until we
-     * find an entry that will work.
+     * Allocation is either first-fit (lowest offset), or best-fit (best size). If it's first-fit,
+     * walk the offset list linearly until we find an entry that will work.
      *
-     * If it's best-fit by size, search the by-size skiplist for the size
-     * and take the first entry on the by-size offset list. This means we
-     * prefer best-fit over lower offset, but within a size we'll prefer an
-     * offset appearing earlier in the file.
+     * If it's best-fit by size, search the by-size skiplist for the size and take the first entry
+     * on the by-size offset list. This means we prefer best-fit over lower offset, but within a
+     * size we'll prefer an offset appearing earlier in the file.
      *
      * If we don't have anything big enough, extend the file.
      */
@@ -603,13 +601,12 @@ __wt_block_off_free(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset,
     WT_ASSERT(session, WT_SESSION_BTREE_SYNC_SAFE(session, S2BT(session)));
 
     /*
-     * Callers of this function are expected to have already acquired any
-     * locks required to manipulate the extent lists.
+     * Callers of this function are expected to have already acquired any locks required to
+     * manipulate the extent lists.
      *
-     * We can reuse this extent immediately if it was allocated during this
-     * checkpoint, merge it into the avail list (which slows file growth in
-     * workloads including repeated overflow record modification). If this
-     * extent is referenced in a previous checkpoint, merge into the discard
+     * We can reuse this extent immediately if it was allocated during this checkpoint, merge it
+     * into the avail list (which slows file growth in workloads including repeated overflow record
+     * modification). If this extent is referenced in a previous checkpoint, merge into the discard
      * list.
      */
     if ((ret = __wt_block_off_remove_overlap(session, block, &block->live.alloc, offset, size)) ==
@@ -914,13 +911,12 @@ __block_append(
     WT_ASSERT(session, el->track_size == 0);
 
     /*
-     * Identical to __block_merge, when we know the file is being extended,
-     * that is, the information is either going to be used to extend the
-     * last object on the list, or become a new object ending the list.
+     * Identical to __block_merge, when we know the file is being extended, that is, the information
+     * is either going to be used to extend the last object on the list, or become a new object
+     * ending the list.
      *
-     * The terminating element of the list is cached, check it; otherwise,
-     * get a stack for the last object in the skiplist, check for a simple
-     * extension, and otherwise append a new structure.
+     * The terminating element of the list is cached, check it; otherwise, get a stack for the last
+     * object in the skiplist, check for a simple extension, and otherwise append a new structure.
      */
     if ((ext = el->last) != NULL && ext->off + ext->size == off)
         ext->size += size;
@@ -955,15 +951,13 @@ __wt_block_insert_ext(
     WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
 {
     /*
-     * There are currently two copies of this function (this code is a one-
-     * liner that calls the internal version of the function, which means
-     * the compiler should compress out the function call). It's that way
-     * because the interface is still fluid, I'm not convinced there won't
-     * be a need for a functional split between the internal and external
-     * versions in the future.
+     * There are currently two copies of this function (this code is a one- liner that calls the
+     * internal version of the function, which means the compiler should compress out the function
+     * call). It's that way because the interface is still fluid, I'm not convinced there won't be a
+     * need for a functional split between the internal and external versions in the future.
      *
-     * Callers of this function are expected to have already acquired any
-     * locks required to manipulate the extent list.
+     * Callers of this function are expected to have already acquired any locks required to
+     * manipulate the extent list.
      */
     return (__block_merge(session, block, el, off, size));
 }
@@ -1180,12 +1174,10 @@ __wt_block_extlist_write(
     }
 
     /*
-     * Get a scratch buffer, clear the page's header and data, initialize
-     * the header.
+     * Get a scratch buffer, clear the page's header and data, initialize the header.
      *
-     * Allocate memory for the extent list entries plus two additional
-     * entries: the initial WT_BLOCK_EXTLIST_MAGIC/0 pair and the list-
-     * terminating WT_BLOCK_INVALID_OFFSET/0 pair.
+     * Allocate memory for the extent list entries plus two additional entries: the initial
+     * WT_BLOCK_EXTLIST_MAGIC/0 pair and the list- terminating WT_BLOCK_INVALID_OFFSET/0 pair.
      */
     size = ((size_t)entries + 2) * 2 * WT_INTPACK64_MAXSIZE;
     WT_RET(__wt_block_write_size(session, block, &size));
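The descent the __block_off_srch comment describes is the classic skiplist search: start at the highest level, move right while the next node sorts before the target, record the insert position, then step down. A hedged, simplified sketch with illustrative types; like WiredTiger's loop, it decrements the pointer to drop a level, relying on the next[] array layout:

#include <stddef.h>

#define SKIP_MAXDEPTH 10 /* Illustrative stand-in for WT_SKIP_MAXDEPTH. */

struct node {
    long off; /* Sort key, e.g. a file offset. */
    struct node *next[SKIP_MAXDEPTH];
};

/* Fill stack[] with, per level, the position where a node with the given
 * offset belongs: an exact match or the next-largest item. */
static void
off_srch(struct node **head, long off, struct node ***stack)
{
    struct node **np;
    int i;

    for (i = SKIP_MAXDEPTH - 1, np = &head[i]; i >= 0;)
        if (*np != NULL && (*np)->off < off)
            np = &(*np)->next[i]; /* Keep moving right on this level. */
        else
            stack[i--] = np--; /* Record the slot; np-- lands on level i-1. */
}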
diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c
index b29fb939663..ba32bce74bc 100644
--- a/src/third_party/wiredtiger/src/block/block_open.c
+++ b/src/third_party/wiredtiger/src/block/block_open.c
@@ -36,10 +36,9 @@ __wt_block_manager_create(WT_SESSION_IMPL *session, const char *filename, uint32
     /*
      * Create the underlying file and open a handle.
      *
-     * Since WiredTiger schema operations are (currently) non-transactional,
-     * it's possible to see a partially-created file left from a previous
-     * create. Further, there's nothing to prevent users from creating files
-     * in our space. Move any existing files out of the way and complain.
+     * Since WiredTiger schema operations are (currently) non-transactional, it's possible to see a
+     * partially-created file left from a previous create. Further, there's nothing to prevent users
+     * from creating files in our space. Move any existing files out of the way and complain.
      */
     for (;;) {
         if ((ret = __wt_open(session, filename, WT_FS_OPEN_FILE_TYPE_DATA,
@@ -162,9 +161,9 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[
     /*
      * Basic structure allocation, initialization.
      *
-     * Note: set the block's name-hash value before any work that can fail
-     * because cleanup calls the block destroy code which uses that hash
-     * value to remove the block from the underlying linked lists.
+     * Note: set the block's name-hash value before any work that can fail because cleanup calls the
+     * block destroy code which uses that hash value to remove the block from the underlying linked
+     * lists.
      */
     WT_ERR(__wt_calloc_one(session, &block));
     block->ref = 1;
@@ -215,8 +214,8 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[
     /*
      * Read the description information from the first block.
      *
-     * Salvage is a special case: if we're forcing the salvage, we don't
-     * look at anything, including the description information.
+     * Salvage is a special case: if we're forcing the salvage, we don't look at anything, including
+     * the description information.
      */
     if (!forced_salvage)
         WT_ERR(__desc_read(session, allocsize, block));
diff --git a/src/third_party/wiredtiger/src/block/block_vrfy.c b/src/third_party/wiredtiger/src/block/block_vrfy.c
index bc3109fe570..e0930aadec0 100644
--- a/src/third_party/wiredtiger/src/block/block_vrfy.c
+++ b/src/third_party/wiredtiger/src/block/block_vrfy.c
@@ -327,17 +327,14 @@ __wt_block_verify_addr(
     WT_RET(__verify_filefrag_add(session, block, NULL, offset, size, false));
 
     /*
-     * It's tempting to try and flag a page as "verified" when we read it.
-     * That doesn't work because we may visit a page multiple times when
-     * verifying a single checkpoint (for example, when verifying the
-     * physical image of a row-store leaf page with overflow keys, the
-     * overflow keys are read when checking for key sort issues, and read
-     * again when more general overflow item checking is done). This
-     * function is called by the btree verification code, once per logical
-     * visit in a checkpoint, so we can detect if a page is referenced
-     * multiple times within a single checkpoint. This doesn't apply to
-     * the per-file list, because it is expected for the same btree blocks
-     * to appear in multiple checkpoints.
+     * It's tempting to try and flag a page as "verified" when we read it. That doesn't work because
+     * we may visit a page multiple times when verifying a single checkpoint (for example, when
+     * verifying the physical image of a row-store leaf page with overflow keys, the overflow keys
+     * are read when checking for key sort issues, and read again when more general overflow item
+     * checking is done). This function is called by the btree verification code, once per logical
+     * visit in a checkpoint, so we can detect if a page is referenced multiple times within a
+     * single checkpoint. This doesn't apply to the per-file list, because it is expected for the
+     * same btree blocks to appear in multiple checkpoints.
      *
      * Add the block to the per-checkpoint list.
      */
@@ -401,14 +398,12 @@ __verify_filefrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block)
         return (0);
 
     /*
-     * It's OK if we have not verified blocks at the end of the file: that
-     * happens if the file is truncated during a checkpoint or load or was
-     * extended after writing a checkpoint. We should never see unverified
-     * blocks anywhere else, though.
+     * It's OK if we have not verified blocks at the end of the file: that happens if the file is
+     * truncated during a checkpoint or load or was extended after writing a checkpoint. We should
+     * never see unverified blocks anywhere else, though.
      *
-     * I'm deliberately testing for a last fragment of 0, it makes no sense
-     * there would be no fragments verified, complain if the first fragment
-     * in the file wasn't verified.
+     * I'm deliberately testing for a last fragment of 0, it makes no sense there would be no
+     * fragments verified, complain if the first fragment in the file wasn't verified.
      */
     for (last = block->frags - 1; last != 0; --last) {
         if (__bit_test(block->fragfile, last))
diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c
index 31e000032d6..476d94af582 100644
--- a/src/third_party/wiredtiger/src/block/block_write.c
+++ b/src/third_party/wiredtiger/src/block/block_write.c
@@ -25,21 +25,19 @@ __wt_block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t len)
     /*
      * Truncate requires serialization, we depend on our caller for that.
      *
-     * Truncation isn't a requirement of the block manager, it's only used
-     * to conserve disk space. Regardless of the underlying file system
-     * call's result, the in-memory understanding of the file size changes.
+     * Truncation isn't a requirement of the block manager, it's only used to conserve disk space.
+     * Regardless of the underlying file system call's result, the in-memory understanding of the
+     * file size changes.
      */
     block->size = block->extend_size = len;
 
     /*
-     * Backups are done by copying files outside of WiredTiger, potentially
-     * by system utilities. We cannot truncate the file during the backup
-     * window, we might surprise an application.
+     * Backups are done by copying files outside of WiredTiger, potentially by system utilities. We
+     * cannot truncate the file during the backup window, we might surprise an application.
      *
-     * This affects files that aren't involved in the backup (for example,
-     * doing incremental backups, which only copies log files, or targeted
-     * backups, stops all block truncation unnecessarily). We may want a
-     * more targeted solution at some point.
+     * This affects files that aren't involved in the backup (for example, doing incremental
+     * backups, which only copies log files, or targeted backups, stops all block truncation
+     * unnecessarily). We may want a more targeted solution at some point.
      */
     if (!conn->hot_backup) {
         WT_WITH_HOTBACKUP_READ_LOCK(session, ret = __wt_ftruncate(session, block->fh, len), NULL);
@@ -97,13 +95,11 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_FH *fh, wt_off_t
     WT_FILE_HANDLE *handle;
 
     /*
-     * The locking in this function is messy: by definition, the live system
-     * is locked when we're called, but that lock may have been acquired by
-     * our caller or our caller's caller. If our caller's lock, release_lock
-     * comes in set and this function can unlock it before returning (so it
-     * isn't held while extending the file). If it is our caller's caller,
-     * then release_lock comes in not set, indicating it cannot be released
-     * here.
+     * The locking in this function is messy: by definition, the live system is locked when we're
+     * called, but that lock may have been acquired by our caller or our caller's caller. If our
+     * caller's lock, release_lock comes in set and this function can unlock it before returning (so
+     * it isn't held while extending the file). If it is our caller's caller, then release_lock
+     * comes in not set, indicating it cannot be released here.
      *
     * If we unlock here, we clear release_lock.
      */
@@ -135,13 +131,12 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_FH *fh, wt_off_t
         return (0);
 
     /*
-     * Set the extend_size before releasing the lock, I don't want to read
-     * and manipulate multiple values without holding a lock.
+     * Set the extend_size before releasing the lock, I don't want to read and manipulate multiple
+     * values without holding a lock.
      *
-     * There's a race between the calculation and doing the extension, but
-     * it should err on the side of extend_size being smaller than the
-     * actual file size, and that's OK, we simply may do another extension
-     * sooner than otherwise.
+     * There's a race between the calculation and doing the extension, but it should err on the side
+     * of extend_size being smaller than the actual file size, and that's OK, we simply may do
+     * another extension sooner than otherwise.
      */
     block->extend_size = block->size + block->extend_len * 2;
@@ -245,9 +240,9 @@ __block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_of
     /*
      * Align the size to an allocation unit.
      *
-     * The buffer must be big enough for us to zero to the next allocsize
-     * boundary, this is one of the reasons the btree layer must find out
-     * from the block-manager layer the maximum size of the eventual write.
+     * The buffer must be big enough for us to zero to the next allocsize boundary, this is one of
+     * the reasons the btree layer must find out from the block-manager layer the maximum size of
+     * the eventual write.
      */
     align_size = WT_ALIGN(buf->size, block->allocsize);
     if (align_size > buf->memsize) {
@@ -301,21 +296,19 @@ __block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_of
     blk->disk_size = WT_STORE_SIZE(align_size);
 
     /*
-     * Update the block's checksum: if our caller specifies, checksum the
-     * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
-     * bytes. The assumption is applications with good compression support
-     * turn off checksums and assume corrupted blocks won't decompress
-     * correctly. However, if compression failed to shrink the block, the
-     * block wasn't compressed, in which case our caller will tell us to
-     * checksum the data to detect corruption. If compression succeeded,
-     * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
-     * because they're not compressed, both to give salvage a quick test
-     * of whether a block is useful and to give us a test so we don't lose
-     * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
+     * Update the block's checksum: if our caller specifies, checksum the complete data, otherwise
+     * checksum the leading WT_BLOCK_COMPRESS_SKIP bytes. The assumption is applications with good
+     * compression support turn off checksums and assume corrupted blocks won't decompress
+     * correctly. However, if compression failed to shrink the block, the block wasn't compressed,
+     * in which case our caller will tell us to checksum the data to detect corruption. If
+     * compression succeeded, we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
+     * because they're not compressed, both to give salvage a quick test of whether a block is
+     * useful and to give us a test so we don't lose the first WT_BLOCK_COMPRESS_SKIP bytes without
+     * noticing.
      *
-     * Checksum a little-endian version of the header, and write everything
-     * in little-endian format. The checksum is (potentially) returned in a
-     * big-endian format, swap it into place in a separate step.
+     * Checksum a little-endian version of the header, and write everything in little-endian format.
+     * The checksum is (potentially) returned in a big-endian format, swap it into place in a
+     * separate step.
      */
     blk->flags = 0;
     if (data_checksum)
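The alignment step in __block_write_off rounds the buffer size up to the next allocation-unit boundary before zero-padding and writing. A hedged sketch of the standard power-of-two rounding idiom behind macros like WT_ALIGN (not copied from WiredTiger):

#include <assert.h>
#include <stdint.h>

static uintmax_t
align_up(uintmax_t size, uintmax_t unit) /* unit must be a power of two. */
{
    return ((size + unit - 1) & ~(unit - 1));
}

static void
align_example(void)
{
    /* With a 4KB allocation unit, a 6000-byte image pads to an 8192-byte write. */
    assert(align_up(6000, 4096) == 8192);
}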
diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c
index e4d8a6abb10..d396f87ab49 100644
--- a/src/third_party/wiredtiger/src/btree/bt_compact.c
+++ b/src/third_party/wiredtiger/src/btree/bt_compact.c
@@ -37,13 +37,12 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
     }
 
     /*
-     * If the page is a replacement, test the replacement addresses.
-     * Ignore empty pages, they get merged into the parent.
+     * If the page is a replacement, test the replacement addresses. Ignore empty pages, they get
+     * merged into the parent.
      *
-     * Page-modify variable initialization done here because the page could
-     * be modified while we're looking at it, so the page modified structure
-     * may appear at any time (but cannot disappear). We've confirmed there
-     * is a page modify structure, it's OK to look at it.
+     * Page-modify variable initialization done here because the page could be modified while we're
+     * looking at it, so the page modified structure may appear at any time (but cannot disappear).
+     * We've confirmed there is a page modify structure, it's OK to look at it.
      */
     mod = page->modify;
     if (mod->rec_result == WT_PM_REC_REPLACE)
@@ -77,18 +76,15 @@ __compact_rewrite_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
     btree = S2BT(session);
 
     /*
-     * Reviewing in-memory pages requires looking at page reconciliation
-     * results, because we care about where the page is stored now, not
-     * where the page was stored when we first read it into the cache.
-     * We need to ensure we don't race with page reconciliation as it's
-     * writing the page modify information.
+     * Reviewing in-memory pages requires looking at page reconciliation results, because we care
+     * about where the page is stored now, not where the page was stored when we first read it into
+     * the cache. We need to ensure we don't race with page reconciliation as it's writing the page
+     * modify information.
      *
-     * There are two ways we call reconciliation: checkpoints and eviction.
-     * Get the tree's flush lock which blocks threads writing pages for
-     * checkpoints. If checkpoint is holding the lock, quit working this
-     * file, we'll visit it again in our next pass. We don't have to worry
-     * about eviction, we're holding a hazard pointer on the WT_REF, it's
-     * not going anywhere.
+     * There are two ways we call reconciliation: checkpoints and eviction. Get the tree's flush
+     * lock which blocks threads writing pages for checkpoints. If checkpoint is holding the lock,
+     * quit working this file, we'll visit it again in our next pass. We don't have to worry about
+     * eviction, we're holding a hazard pointer on the WT_REF, it's not going anywhere.
      */
     WT_RET(__wt_spin_trylock(session, &btree->flush_lock));
@@ -192,8 +188,8 @@ __wt_compact(WT_SESSION_IMPL *session)
         /*
         * Cheap checks that don't require locking.
         *
-         * Ignore the root: it may not have a replacement address, and
-         * besides, if anything else gets written, so will it.
+         * Ignore the root: it may not have a replacement address, and besides, if anything else
+         * gets written, so will it.
          *
         * Ignore dirty pages, checkpoint writes them regardless.
         */
@@ -247,12 +243,12 @@ __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, boo
     }
 
     /*
-     * If the page is in-memory, we want to look at it (it may have been
-     * modified and written, and the current location is the interesting
-     * one in terms of compaction, not the original location).
+     * If the page is in-memory, we want to look at it (it may have been modified and written, and
+     * the current location is the interesting one in terms of compaction, not the original
+     * location).
      *
-     * This test could be combined with the next one, but this is a cheap
-     * test and the next one is expensive.
+     * This test could be combined with the next one, but this is a cheap test and the next one is
+     * expensive.
      */
     if (ref->state != WT_REF_DISK)
         return (0);
@@ -266,12 +262,11 @@ __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, boo
         return (0);
 
     /*
-     * The page is on disk, so there had better be an address; assert that
-     * fact, test at run-time to avoid the core dump.
+     * The page is on disk, so there had better be an address; assert that fact, test at run-time to
+     * avoid the core dump.
      *
-     * Internal pages must be read to walk the tree; ask the block-manager
-     * if it's useful to rewrite leaf pages, don't do the I/O if a rewrite
-     * won't help.
+     * Internal pages must be read to walk the tree; ask the block-manager if it's useful to rewrite
+     * leaf pages, don't do the I/O if a rewrite won't help.
      */
     __wt_ref_info(session, ref, &addr, &addr_size, &type);
     WT_ASSERT(session, addr != NULL);
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index d4ebd5322f4..b7f2e0db4b3 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -41,20 +41,17 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
     __cursor_set_recno(cbt, cbt->recno + 1);
 
     /*
-     * Fixed-width column store appends are inherently non-transactional.
-     * Even a non-visible update by a concurrent or aborted transaction
-     * changes the effective end of the data. The effect is subtle because
-     * of the blurring between deleted and empty values, but ideally we
-     * would skip all uncommitted changes at the end of the data. This
-     * doesn't apply to variable-width column stores because the implicitly
-     * created records written by reconciliation are deleted and so can be
-     * never seen by a read.
+     * Fixed-width column store appends are inherently non-transactional. Even a non-visible update
+     * by a concurrent or aborted transaction changes the effective end of the data. The effect is
+     * subtle because of the blurring between deleted and empty values, but ideally we would skip
+     * all uncommitted changes at the end of the data. This doesn't apply to variable-width column
+     * stores because the implicitly created records written by reconciliation are deleted and so
+     * can be never seen by a read.
      *
-     * The problem is that we don't know at this point whether there may be
-     * multiple uncommitted changes at the end of the data, and it would be
-     * expensive to check every time we hit an aborted update. If an
-     * insert is aborted, we simply return zero (empty), regardless of
-     * whether we are at the end of the data.
+     * The problem is that we don't know at this point whether there may be multiple uncommitted
+     * changes at the end of the data, and it would be expensive to check every time we hit an
+     * aborted update. If an insert is aborted, we simply return zero (empty), regardless of whether
+     * we are at the end of the data.
      */
     if (cbt->recno < WT_INSERT_RECNO(cbt->ins)) {
         cbt->v = 0;
@@ -249,14 +246,12 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
                 continue;

             /*
-             * There can be huge gaps in the variable-length
-             * column-store name space appearing as deleted
-             * records. If more than one deleted record, do
-             * the work of finding the next record to return
-             * instead of looping through the records.
+             * There can be huge gaps in the variable-length column-store name space appearing
+             * as deleted records. If more than one deleted record, do the work of finding the
+             * next record to return instead of looping through the records.
              *
-             * First, find the smallest record in the update
-             * list that's larger than the current record.
+             * First, find the smallest record in the update list that's larger than the current
+             * record.
              */
             ins = __col_insert_search_gt(cbt->ins_head, cbt->recno);
@@ -313,13 +308,11 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
         cbt->iter_retry = WT_CBT_RETRY_NOTSET;
 
     /*
-     * For row-store pages, we need a single item that tells us the part
-     * of the page we're walking (otherwise switching from next to prev
-     * and vice-versa is just too complicated), so we map the WT_ROW and
-     * WT_INSERT_HEAD insert array slots into a single name space: slot 1
-     * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
-     * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are
-     * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
+     * For row-store pages, we need a single item that tells us the part of the page we're walking
+     * (otherwise switching from next to prev and vice-versa is just too complicated), so we map the
+     * WT_ROW and WT_INSERT_HEAD insert array slots into a single name space: slot 1 is the
+     * "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is WT_INSERT_HEAD[0], and so on. This
+     * means WT_INSERT lists are odd-numbered slots, and WT_ROW array slots are even-numbered slots.
      *
     * Initialize for each new page.
      */
@@ -696,17 +689,13 @@ err:
         F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
 #ifdef HAVE_DIAGNOSTIC
     /*
-     * Skip key order check, if prev is called after a next returned
-     * a prepare conflict error, i.e cursor has changed direction
-     * at a prepared update, hence current key returned could be
-     * same as earlier returned key.
+     * Skip key order check, if prev is called after a next returned a prepare conflict error,
+     * i.e cursor has changed direction at a prepared update, hence current key returned could
+     * be same as earlier returned key.
      *
-     * eg: Initial data set : (1,2,3,...10)
-     * insert key 11 in a prepare transaction.
-     * loop on next will return 1,2,3...10 and subsequent call to
-     * next will return a prepare conflict. Now if we call prev
-     * key 10 will be returned which will be same as earlier
-     * returned key.
+     * eg: Initial data set : (1,2,3,...10) insert key 11 in a prepare transaction. loop on next
+     * will return 1,2,3...10 and subsequent call to next will return a prepare conflict. Now if
+     * we call prev key 10 will be returned which will be same as earlier returned key.
      */
     if (!F_ISSET(cbt, WT_CBT_ITERATE_RETRY_PREV))
         ret = __wt_cursor_key_order_check(session, cbt, true);
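The even/odd name space the row-store comments describe interleaves N+1 insert lists with N WT_ROW entries: slot 1 is the smallest-key insert list, even slot 2k is WT_ROW[k-1], and odd slot 2k+1 is WT_INSERT_HEAD[k-1]. A hedged sketch of the arithmetic; the helper names are illustrative:

#include <stdbool.h>

static bool
slot_is_insert_list(unsigned slot) /* Odd slots are WT_INSERT lists. */
{
    return (slot % 2 == 1);
}

static unsigned
slot_to_row_index(unsigned slot) /* Even slots: slot 2 -> WT_ROW[0]. */
{
    return (slot / 2 - 1);
}

static unsigned
slot_to_insert_index(unsigned slot) /* Odd slots >= 3: slot 3 -> WT_INSERT_HEAD[0]. */
{
    return ((slot - 1) / 2 - 1);
}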
*/ ins = __col_insert_search_gt(cbt->ins_head, cbt->recno); @@ -313,13 +308,11 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) cbt->iter_retry = WT_CBT_RETRY_NOTSET; /* - * For row-store pages, we need a single item that tells us the part - * of the page we're walking (otherwise switching from next to prev - * and vice-versa is just too complicated), so we map the WT_ROW and - * WT_INSERT_HEAD insert array slots into a single name space: slot 1 - * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is - * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are - * odd-numbered slots, and WT_ROW array slots are even-numbered slots. + * For row-store pages, we need a single item that tells us the part of the page we're walking + * (otherwise switching from next to prev and vice-versa is just too complicated), so we map the + * WT_ROW and WT_INSERT_HEAD insert array slots into a single name space: slot 1 is the + * "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is WT_INSERT_HEAD[0], and so on. This + * means WT_INSERT lists are odd-numbered slots, and WT_ROW array slots are even-numbered slots. * * Initialize for each new page. */ @@ -696,17 +689,13 @@ err: F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); #ifdef HAVE_DIAGNOSTIC /* - * Skip key order check, if prev is called after a next returned - * a prepare conflict error, i.e cursor has changed direction - * at a prepared update, hence current key returned could be - * same as earlier returned key. + * Skip the key order check if prev is called after a next returned a prepare conflict error, + * i.e., the cursor has changed direction at a prepared update, hence the current key returned + * could be the same as the earlier returned key. * - * eg: Initial data set : (1,2,3,...10) - * insert key 11 in a prepare transaction. - * loop on next will return 1,2,3...10 and subsequent call to - * next will return a prepare conflict. Now if we call prev - * key 10 will be returned which will be same as earlier - * returned key. + * e.g.: Initial data set: (1,2,3,...10); insert key 11 in a prepare transaction. A loop on next + * will return 1,2,3...10 and a subsequent call to next will return a prepare conflict. Now if + * we call prev, key 10 will be returned, which is the same as the earlier returned key. */ if (!F_ISSET(cbt, WT_CBT_ITERATE_RETRY_PREV)) ret = __wt_cursor_key_order_check(session, cbt, true); diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 315f0f5b654..2a91695ebd2 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -11,13 +11,12 @@ /* * Walking backwards through skip lists. * - * The skip list stack is an array of pointers set up by a search. It points - * to the position a node should go in the skip list. In other words, the skip - * list search stack always points *after* the search item (that is, into the - * search item's next array). + * The skip list stack is an array of pointers set up by a search. It points to the position a node + * should go in the skip list. In other words, the skip list search stack always points *after* the + * search item (that is, into the search item's next array). * - * Helper macros to go from a stack pointer at level i, pointing into a next - * array, back to the insert node containing that next array.
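The single name space described in the row-store comment above is pure arithmetic. A hypothetical restatement of the mapping (these helpers are illustrative, not WiredTiger functions): WT_ROW[i] occupies slot 2 * (i + 1), WT_INSERT_HEAD[i] occupies slot 2 * (i + 1) + 1, and slot 1 is reserved for the smallest-key insert list:

    #include <stdbool.h>
    #include <stdint.h>

    static inline uint32_t row_slot(uint32_t i) { return (2 * (i + 1)); }     /* WT_ROW[i] */
    static inline uint32_t ins_slot(uint32_t i) { return (2 * (i + 1) + 1); } /* WT_INSERT_HEAD[i] */
    static inline bool slot_is_insert_list(uint32_t slot) { return ((slot & 1) != 0); }
    static inline uint32_t slot_to_row(uint32_t slot) { return (slot / 2 - 1); } /* even slots only */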
+ * Helper macros to go from a stack pointer at level i, pointing into a next array, back to the + * insert node containing that next array. */ #undef PREV_ITEM #define PREV_ITEM(ins_head, insp, i) \ (((insp) == &(ins_head)->head[i] || (insp) == NULL) ? NULL : (WT_INSERT *)((char *)((insp) - (i)) - offsetof(WT_INSERT, next))) @@ -73,13 +72,11 @@ restart: break; /* - * Find a starting point for the new search. That is either at the - * non-moving node if we found a valid node, or the beginning of the - * next list down that is not the current node. + * Find a starting point for the new search. That is either at the non-moving node if we found a + * valid node, or the beginning of the next list down that is not the current node. * - * Since it is the beginning of a list, and we know the current node is - * has a skip depth at least this high, any node we find must sort - * before the current node. + * Since it is the beginning of a list, and we know the current node has a skip depth at + * least this high, any node we find must sort before the current node. */ if (ins == NULL || ins == current) for (; i >= 0; i--) { @@ -390,14 +387,12 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) if (__wt_cell_rle(&unpack) == 1) continue; /* - * There can be huge gaps in the variable-length - * column-store name space appearing as deleted - * records. If more than one deleted record, do - * the work of finding the next record to return - * instead of looping through the records. + * There can be huge gaps in the variable-length column-store name space appearing + * as deleted records. If more than one deleted record, do the work of finding the + * next record to return instead of looping through the records. * - * First, find the largest record in the update - * list that's smaller than the current record. + * First, find the largest record in the update list that's smaller than the current + * record. */ ins = __col_insert_search_lt(cbt->ins_head, cbt->recno); @@ -454,13 +449,11 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) cbt->iter_retry = WT_CBT_RETRY_NOTSET; /* - * For row-store pages, we need a single item that tells us the part - * of the page we're walking (otherwise switching from next to prev - * and vice-versa is just too complicated), so we map the WT_ROW and - * WT_INSERT_HEAD insert array slots into a single name space: slot 1 - * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is - * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are - * odd-numbered slots, and WT_ROW array slots are even-numbered slots. + * For row-store pages, we need a single item that tells us the part of the page we're walking + * (otherwise switching from next to prev and vice-versa is just too complicated), so we map the + * WT_ROW and WT_INSERT_HEAD insert array slots into a single name space: slot 1 is the + * "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is WT_INSERT_HEAD[0], and so on. This + * means WT_INSERT lists are odd-numbered slots, and WT_ROW array slots are even-numbered slots. * * Initialize for each new page. */ @@ -659,17 +652,13 @@ err: F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); #ifdef HAVE_DIAGNOSTIC /* - * Skip key order check, if next is called after a prev returned - * a prepare conflict error, i.e cursor has changed direction - * at a prepared update, hence current key returned could be - * same as earlier returned key.
+ * Skip the key order check if next is called after a prev returned a prepare conflict error, + * i.e., the cursor has changed direction at a prepared update, hence the current key returned + * could be the same as the earlier returned key. * - * eg: Initial data set : (2,3,...10) - * insert key 1 in a prepare transaction. - * loop on prev will return 10,...3,2 and subsequent call to - * prev will return a prepare conflict. Now if we call next - * key 2 will be returned which will be same as earlier - * returned key. + * e.g.: Initial data set: (2,3,...10); insert key 1 in a prepare transaction. A loop on prev + * will return 10,...3,2 and a subsequent call to prev will return a prepare conflict. Now if + * we call next, key 2 will be returned, which is the same as the earlier returned key. */ if (!F_ISSET(cbt, WT_CBT_ITERATE_RETRY_NEXT)) ret = __wt_cursor_key_order_check(session, cbt, false); diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index bc70a9d389f..8f64dd5562e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -195,15 +195,13 @@ static inline bool __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt) { /* - * When there's no exact match, column-store search returns the key - * nearest the searched-for key (continuing past keys smaller than the - * searched-for key to return the next-largest key). Therefore, if the - * returned comparison is -1, the searched-for key was larger than any - * row on the page's standard information or column-store insert list. + * When there's no exact match, column-store search returns the key nearest the searched-for key + * (continuing past keys smaller than the searched-for key to return the next-largest key). + * Therefore, if the returned comparison is -1, the searched-for key was larger than any row on + * the page's standard information or column-store insert list. * - * If the returned comparison is NOT -1, there was a row equal to or - * larger than the searched-for key, and we implicitly create missing - * rows. + * If the returned comparison is NOT -1, there was a row equal to or larger than the + * searched-for key, and we implicitly create missing rows. */ return (btree->type == BTREE_COL_FIX && cbt->compare != -1); } @@ -644,16 +642,14 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) __cursor_state_save(cursor, &state); /* - * If we have a row-store page pinned, search it; if we don't have a - * page pinned, or the search of the pinned page doesn't find an exact - * match, search from the root. Unlike WT_CURSOR.search, ignore pinned - * pages in the case of column-store, search-near isn't an interesting - * enough case for column-store to add the complexity needed to avoid - * the tree search. + * If we have a row-store page pinned, search it; if we don't have a page pinned, or the search + * of the pinned page doesn't find an exact match, search from the root. Unlike + * WT_CURSOR.search, ignore pinned pages in the case of column-store, search-near isn't an + * interesting enough case for column-store to add the complexity needed to avoid the tree + * search. * - * Set the "insert" flag for the btree row-store search; we may intend - * to position the cursor at the end of the tree, rather than match an - * existing record. + * Set the "insert" flag for the btree row-store search; we may intend to position the cursor at + * the end of the tree, rather than match an existing record.
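At the API level this behavior surfaces through WT_CURSOR.search_near and its "exact" result. A usage-level sketch, assuming an already-open session and an existing row-store table "table:example" with string keys (both names invented):

    #include <stdio.h>
    #include <wiredtiger.h>

    static int
    search_near_example(WT_SESSION *session, const char *key)
    {
        WT_CURSOR *cursor;
        int exact, ret;

        if ((ret = session->open_cursor(session, "table:example", NULL, NULL, &cursor)) != 0)
            return (ret);
        cursor->set_key(cursor, key);
        /* exact == 0: exact match; > 0: nearest larger key; < 0: nearest smaller key. */
        if ((ret = cursor->search_near(cursor, &exact)) == 0)
            printf("positioned %s the search key\n",
              exact == 0 ? "exactly on" : (exact > 0 ? "after" : "before"));
        (void)cursor->close(cursor);
        return (ret);
    }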
*/ valid = false; if (btree->type == BTREE_ROW && __cursor_page_pinned(cbt, true)) { @@ -681,17 +677,15 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) /* * If we find a valid key, return it. * - * Else, creating a record past the end of the tree in a fixed-length - * column-store implicitly fills the gap with empty records. In this - * case, we instantiate the empty record, it's an exact match. + * Else, creating a record past the end of the tree in a fixed-length column-store implicitly + * fills the gap with empty records. In this case, we instantiate the empty record, it's an + * exact match. * - * Else, move to the next key in the tree (bias for prefix searches). - * Cursor next skips invalid rows, so we don't have to test for them - * again. + * Else, move to the next key in the tree (bias for prefix searches). Cursor next skips invalid + * rows, so we don't have to test for them again. * - * Else, redo the search and move to the previous key in the tree. - * Cursor previous skips invalid rows, so we don't have to test for - * them again. + * Else, redo the search and move to the previous key in the tree. Cursor previous skips invalid + * rows, so we don't have to test for them again. * * If that fails, quit, there's no record to return. */ @@ -798,14 +792,12 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) __cursor_state_save(cursor, &state); /* - * If inserting with overwrite configured, and positioned to an on-page - * key, the update doesn't require another search. Cursors configured - * for append aren't included, regardless of whether or not they meet - * all other criteria. + * If inserting with overwrite configured, and positioned to an on-page key, the update doesn't + * require another search. Cursors configured for append aren't included, regardless of whether + * or not they meet all other criteria. * - * Fixed-length column store can never use a positioned cursor to update - * because the cursor may not be positioned to the correct record in the - * case of implicit records in the append list. + * Fixed-length column store can never use a positioned cursor to update because the cursor may + * not be positioned to the correct record in the case of implicit records in the append list. */ if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt, false) && F_ISSET(cursor, WT_CURSTD_OVERWRITE) && !append_key) { @@ -1011,27 +1003,22 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt, bool positioned) __cursor_state_save(cursor, &state); /* - * If remove positioned to an on-page key, the remove doesn't require - * another search. We don't care about the "overwrite" configuration - * because regardless of the overwrite setting, any existing record is - * removed, and the record must exist with a positioned cursor. + * If remove positioned to an on-page key, the remove doesn't require another search. We don't + * care about the "overwrite" configuration because regardless of the overwrite setting, any + * existing record is removed, and the record must exist with a positioned cursor. * - * There's trickiness in the page-pinned check. By definition a remove - * operation leaves a cursor positioned if it's initially positioned. - * However, if every item on the page is deleted and we unpin the page, - * eviction might delete the page and our search will re-instantiate an - * empty page for us. 
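The implicit records mentioned here are observable through the public API: writing a record past the end of a fixed-length column store makes the intervening records spring into existence as zero bytes. A sketch, with an invented table name:

    #include <wiredtiger.h>

    static int
    flcs_implicit_example(WT_SESSION *session)
    {
        WT_CURSOR *cursor;
        uint8_t value;
        int ret;

        if ((ret = session->create(session, "table:bits", "key_format=r,value_format=8t")) != 0)
            return (ret);
        if ((ret = session->open_cursor(session, "table:bits", NULL, NULL, &cursor)) != 0)
            return (ret);

        /* Insert record 37; records 1-36 now implicitly exist. */
        cursor->set_key(cursor, (uint64_t)37);
        cursor->set_value(cursor, 1);
        if ((ret = cursor->insert(cursor)) == 0) {
            /* Record 10 was never written, but reads back as a zero byte. */
            cursor->set_key(cursor, (uint64_t)10);
            if ((ret = cursor->search(cursor)) == 0)
                ret = cursor->get_value(cursor, &value);
        }
        (void)cursor->close(cursor);
        return (ret);
    }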
Cursor remove returns not-found whether or not - * that eviction/deletion happens and it's OK unless cursor-overwrite - * is configured (which means we return success even if there's no item - * to delete). In that case, we'll fail when we try to point the cursor - * at the key on the page to satisfy the positioned requirement. It's - * arguably safe to simply leave the key initialized in the cursor (as - * that's all a positioned cursor implies), but it's probably safer to - * avoid page eviction entirely in the positioned case. + * There's trickiness in the page-pinned check. By definition a remove operation leaves a cursor + * positioned if it's initially positioned. However, if every item on the page is deleted and we + * unpin the page, eviction might delete the page and our search will re-instantiate an empty + * page for us. Cursor remove returns not-found whether or not that eviction/deletion happens + * and it's OK unless cursor-overwrite is configured (which means we return success even if + * there's no item to delete). In that case, we'll fail when we try to point the cursor at the + * key on the page to satisfy the positioned requirement. It's arguably safe to simply leave the + * key initialized in the cursor (as that's all a positioned cursor implies), but it's probably + * safer to avoid page eviction entirely in the positioned case. * - * Fixed-length column store can never use a positioned cursor to update - * because the cursor may not be positioned to the correct record in the - * case of implicit records in the append list. + * Fixed-length column store can never use a positioned cursor to update because the cursor may + * not be positioned to the correct record in the case of implicit records in the append list. */ if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt, false)) { WT_ERR(__wt_txn_autocommit_check(session)); @@ -1050,12 +1037,11 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt, bool positioned) retry: /* - * Note these steps must be repeatable, we'll continue to take this path - * as long as we encounter WT_RESTART. + * Note these steps must be repeatable, we'll continue to take this path as long as we encounter + * WT_RESTART. * - * Any pinned page goes away if we do a search, including as a result of - * a restart. Get a local copy of any pinned key and re-save the cursor - * state: we may retry but eventually fail. + * Any pinned page goes away if we do a search, including as a result of a restart. Get a local + * copy of any pinned key and re-save the cursor state: we may retry but eventually fail. */ WT_ERR(__cursor_localkey(cursor)); __cursor_state_save(cursor, &state); @@ -1099,14 +1085,12 @@ retry: if (!__cursor_fix_implicit(btree, cbt)) goto search_notfound; /* - * Creating a record past the end of the tree in a - * fixed-length column-store implicitly fills the - * gap with empty records. Return success in that - * case, the record was deleted successfully. + * Creating a record past the end of the tree in a fixed-length column-store implicitly + * fills the gap with empty records. Return success in that case, the record was deleted + * successfully. * - * Correct the btree cursor's location: the search - * will have pointed us at the previous/next item, - * and that's not correct. + * Correct the btree cursor's location: the search will have pointed us at the + * previous/next item, and that's not correct. 
*/ cbt->recno = cursor->recno; } else @@ -1121,11 +1105,10 @@ err: if (ret == 0) { /* - * If positioned originally, but we had to do a search, acquire - * a position so we can return success. + * If positioned originally, but we had to do a search, acquire a position so we can return + * success. * - * If not positioned originally, leave it that way, clear any - * key and reset the cursor. + * If not positioned originally, leave it that way, clear any key and reset the cursor. */ if (positioned) { if (searched) @@ -1206,14 +1189,12 @@ __btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) __cursor_state_save(cursor, &state); /* - * If update positioned to an on-page key, the update doesn't require - * another search. We don't care about the "overwrite" configuration - * because regardless of the overwrite setting, any existing record is - * updated, and the record must exist with a positioned cursor. + * If update positioned to an on-page key, the update doesn't require another search. We don't + * care about the "overwrite" configuration because regardless of the overwrite setting, any + * existing record is updated, and the record must exist with a positioned cursor. * - * Fixed-length column store can never use a positioned cursor to update - * because the cursor may not be positioned to the correct record in the - * case of implicit records in the append list. + * Fixed-length column store can never use a positioned cursor to update because the cursor may + * not be positioned to the correct record in the case of implicit records in the append list. */ if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt, false)) { WT_ERR(__wt_txn_autocommit_check(session)); @@ -1363,23 +1344,20 @@ __cursor_chain_exceeded(WT_CURSOR_BTREE *cbt) /* * Step through the modify operations at the beginning of the chain. * - * Deleted or standard updates are anticipated to be sufficient to base - * the modify (although that's not guaranteed: they may not be visible - * or might abort before we read them). Also, this is not a hard - * limit, threads can race modifying updates. + * Deleted or standard updates are anticipated to be sufficient to base the modify (although + * that's not guaranteed: they may not be visible or might abort before we read them). Also, + * this is not a hard limit, threads can race modifying updates. * - * If the total size in bytes of the updates exceeds some factor of the - * underlying value size (which we know because the cursor is - * positioned), create a new full copy of the value. This limits the - * cache pressure from creating full copies to that factor: with the - * default factor of 1, the total size in memory of a set of modify - * updates is limited to double the size of the modifies. + * If the total size in bytes of the updates exceeds some factor of the underlying value size + * (which we know because the cursor is positioned), create a new full copy of the value. This + * limits the cache pressure from creating full copies to that factor: with the default factor + * of 1, the total size in memory of a set of modify updates is limited to double the size of + * the modifies. * - * Otherwise, limit the length of the update chain to a fixed size to - * bound the cost of rebuilding the value during reads. When history - * has to be maintained, creating extra copies of large documents - * multiplies cache pressure because the old ones cannot be freed, so - * allow the modify chain to grow. 
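The chain heuristic described above reduces to two tests: the accumulated size of the modify updates against a factor of the base value's size, and the chain length against a fixed cap when history need not be maintained. An illustrative restatement, with invented types and constants standing in for the WiredTiger internals:

    #include <stdbool.h>
    #include <stddef.h>

    #define MODIFY_MEM_FACTOR 1 /* build a full copy once modifies exceed the value size */
    #define MODIFY_MAX_CHAIN 10 /* cap on chain length when history isn't being kept */

    struct upd {
        size_t memsize;
        bool is_modify;
        struct upd *next;
    };

    static bool
    chain_exceeded(const struct upd *head, size_t value_size, bool keep_history)
    {
        const struct upd *upd;
        size_t chain_len, upd_size;

        chain_len = upd_size = 0;
        for (upd = head; upd != NULL && upd->is_modify; upd = upd->next) {
            ++chain_len;
            upd_size += upd->memsize;
            /* Modify bytes exceed a factor of the base value: time for a full copy. */
            if (upd_size >= MODIFY_MEM_FACTOR * value_size)
                return (true);
        }
        /* Without history to maintain, also bound the read-side rebuild cost. */
        return (!keep_history && chain_len > MODIFY_MAX_CHAIN);
    }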
+ * Otherwise, limit the length of the update chain to a fixed size to bound the cost of + * rebuilding the value during reads. When history has to be maintained, creating extra copies + * of large documents multiplies cache pressure because the old ones cannot be freed, so allow + * the modify chain to grow. */ for (i = 0, upd_size = 0; upd != NULL && upd->type == WT_UPDATE_MODIFY; ++i, upd = upd->next) { upd_size += WT_UPDATE_MEMSIZE(upd); @@ -1414,26 +1392,22 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) __cursor_state_save(cursor, &state); /* - * Get the current value and apply the modification to it, for a few - * reasons: first, we set the updated value so the application can - * retrieve the cursor's value; second, we use the updated value as - * the update if the update chain is too long; third, there's a check - * if the updated value is too large to store; fourth, to simplify the - * count of bytes being added/removed; fifth, we can get into serious - * trouble if we attempt to modify a value that doesn't exist or read - * a value that might not exist in the future. For the fifth reason, - * fail if in anything other than a snapshot transaction, read-committed - * and read-uncommitted imply values that might disappear out from under - * us or an inability to repeat point-in-time reads. + * Get the current value and apply the modification to it, for a few reasons: first, we set the + * updated value so the application can retrieve the cursor's value; second, we use the updated + * value as the update if the update chain is too long; third, there's a check if the updated + * value is too large to store; fourth, to simplify the count of bytes being added/removed; + * fifth, we can get into serious trouble if we attempt to modify a value that doesn't exist or + * read a value that might not exist in the future. For the fifth reason, fail if in anything + * other than a snapshot transaction, read-committed and read-uncommitted imply values that + * might disappear out from under us or an inability to repeat point-in-time reads. * - * Also, an application might read a value outside of a transaction and - * then call modify. For that to work, the read must be part of the - * transaction that performs the update for correctness, otherwise we - * could race with another thread and end up modifying the wrong value. - * A clever application could get this right (imagine threads that only - * updated non-overlapping, fixed-length byte strings), but it's unsafe - * because it will work most of the time and the failure is unlikely to - * be detected. Require explicit transactions for modify operations. + * Also, an application might read a value outside of a transaction and then call modify. For + * that to work, the read must be part of the transaction that performs the update for + * correctness, otherwise we could race with another thread and end up modifying the wrong + * value. A clever application could get this right (imagine threads that only updated + * non-overlapping, fixed-length byte strings), but it's unsafe because it will work most of the + * time and the failure is unlikely to be detected. Require explicit transactions for modify + * operations. */ if (session->txn.isolation != WT_ISO_SNAPSHOT) WT_ERR_MSG(session, ENOTSUP, @@ -1458,9 +1432,8 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) /* * WT_CURSOR.modify is update-without-overwrite. 
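Since modify requires an explicit snapshot transaction, a typical caller brackets the call itself. A usage-level sketch; the key and the byte range are invented:

    #include <wiredtiger.h>

    static int
    modify_example(WT_SESSION *session, WT_CURSOR *cursor)
    {
        WT_MODIFY entry;
        int ret;

        if ((ret = session->begin_transaction(session, "isolation=snapshot")) != 0)
            return (ret);

        cursor->set_key(cursor, "some-key");
        /* Replace 3 bytes at byte offset 5 of the existing value with "abc". */
        entry.data.data = "abc";
        entry.data.size = 3;
        entry.offset = 5;
        entry.size = 3;
        if ((ret = cursor->modify(cursor, &entry, 1)) != 0) {
            (void)session->rollback_transaction(session, NULL);
            return (ret);
        }
        return (session->commit_transaction(session, NULL));
    }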
* - * Use the modify buffer as the update if the data package saves us some - * memory and the update chain is under the limit, else use the complete - * value. + * Use the modify buffer as the update if the data package saves us some memory and the update + * chain is under the limit, else use the complete value. */ overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE); F_CLR(cursor, WT_CURSTD_OVERWRITE); @@ -1659,23 +1632,19 @@ __cursor_truncate(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BT yield_count = sleep_usecs = 0; /* - * First, call the cursor search method to re-position the cursor: we - * may not have a cursor position (if the higher-level truncate code - * switched the cursors to have an "external" cursor key, and because - * we don't save a copy of the page's write generation information, - * which we need to remove records). + * First, call the cursor search method to re-position the cursor: we may not have a cursor position + * (if the higher-level truncate code switched the cursors to have an "external" cursor key, and + * because we don't save a copy of the page's write generation information, which we need to remove + * records). * - * Once that's done, we can delete records without a full search, unless - * we encounter a restart error because the page was modified by some - * other thread of control; in that case, repeat the full search to - * refresh the page's modification information. + * Once that's done, we can delete records without a full search, unless we encounter a restart + * error because the page was modified by some other thread of control; in that case, repeat the + * full search to refresh the page's modification information. * - * If this is a row-store, we delete leaf pages having no overflow items - * without reading them; for that to work, we have to ensure we read the - * page referenced by the ending cursor, since we may be deleting only a - * partial page at the end of the truncation. Our caller already fully - * instantiated the end cursor, so we know that page is pinned in memory - * and we can proceed without concern. + * If this is a row-store, we delete leaf pages having no overflow items without reading them; for + * that to work, we have to ensure we read the page referenced by the ending cursor, since we may be + * deleting only a partial page at the end of the truncation. Our caller already fully instantiated + * the end cursor, so we know that page is pinned in memory and we can proceed without concern. */ retry: WT_ERR(__wt_btcur_search(start)); @@ -1717,23 +1686,20 @@ __cursor_truncate_fix(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSO yield_count = sleep_usecs = 0; /* - * Handle fixed-length column-store objects separately: for row-store - * and variable-length column-store objects we have "deleted" values - * and so returned objects actually exist: fixed-length column-store - * objects are filled-in if they don't exist, that is, if you create - * record 37, records 1-36 magically appear. Those records can't be - * deleted, which means we have to ignore already "deleted" records. + * Handle fixed-length column-store objects separately: for row-store and variable-length + * column-store objects we have "deleted" values and so returned objects actually exist: + * fixed-length column-store objects are filled-in if they don't exist, that is, if you create + * record 37, records 1-36 magically appear. Those records can't be deleted, which means we have to + * ignore already "deleted" records. 
* - * First, call the cursor search method to re-position the cursor: we - * may not have a cursor position (if the higher-level truncate code - * switched the cursors to have an "external" cursor key, and because - * we don't save a copy of the page's write generation information, - * which we need to remove records). + * First, call the cursor search method to re-position the cursor: we may not have a cursor position + * (if the higher-level truncate code switched the cursors to have an "external" cursor key, and + * because we don't save a copy of the page's write generation information, which we need to remove + * records). * - * Once that's done, we can delete records without a full search, unless - * we encounter a restart error because the page was modified by some - * other thread of control; in that case, repeat the full search to - * refresh the page's modification information. + * Once that's done, we can delete records without a full search, unless we encounter a restart + * error because the page was modified by some other thread of control; in that case, repeat the + * full search to refresh the page's modification information. */ retry: WT_ERR(__wt_btcur_search(start)); @@ -1778,13 +1744,12 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) WT_STAT_DATA_INCR(session, cursor_truncate); /* - * For recovery, log the start and stop keys for a truncate operation, - * not the individual records removed. On the other hand, for rollback - * we need to keep track of all the in-memory operations. + * For recovery, log the start and stop keys for a truncate operation, not the individual + * records removed. On the other hand, for rollback we need to keep track of all the in-memory + * operations. * - * We deal with this here by logging the truncate range first, then (in - * the logging code) disabling writing of the in-memory remove records - * to disk. + * We deal with this here by logging the truncate range first, then (in the logging code) + * disabling writing of the in-memory remove records to disk. */ if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED)) WT_RET(__wt_txn_truncate_log(session, start, stop)); diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 7ed85112b42..f971de0e4f0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -382,11 +382,9 @@ __wt_debug_offset( WT_ASSERT(session, S2BT_SAFE(session) != NULL); /* - * This routine depends on the default block manager's view of files, - * where an address consists of a file offset, length, and checksum. - * This is for debugging only: other block managers might not see a - * file or address the same way, that's why there's no block manager - * method. + * This routine depends on the default block manager's view of files, where an address consists + * of a file offset, length, and checksum. This is for debugging only: other block managers + * might not see a file or address the same way, that's why there's no block manager method. * * Convert the triplet into an address structure. 
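At the API level the range truncate discussed above is driven by WT_SESSION.truncate with two positioned cursors (either may be NULL to run to an end of the object). A sketch, assuming a row-store table with string keys; the table name and keys are invented:

    #include <wiredtiger.h>

    static int
    truncate_range_example(WT_SESSION *session)
    {
        WT_CURSOR *start, *stop;
        int ret;

        if ((ret = session->open_cursor(session, "table:example", NULL, NULL, &start)) != 0)
            return (ret);
        if ((ret = session->open_cursor(session, "table:example", NULL, NULL, &stop)) != 0) {
            (void)start->close(start);
            return (ret);
        }

        start->set_key(start, "key000100");
        stop->set_key(stop, "key000199");
        /* Removes every record in the inclusive range; logged as one operation. */
        ret = session->truncate(session, NULL, start, stop, NULL);

        (void)start->close(start);
        (void)stop->close(stop);
        return (ret);
    }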
*/ @@ -1181,7 +1179,7 @@ __debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte) WT_RET(ds->f(ds, ", start_ts %s", __wt_timestamp_to_string(upd->start_ts, ts_string))); if (upd->durable_ts != WT_TS_NONE) WT_RET( - ds->f(ds, ", durable-ts %s", __wt_timestamp_to_string(upd->durable_ts, ts_string))); + ds->f(ds, ", durable_ts %s", __wt_timestamp_to_string(upd->durable_ts, ts_string))); prepare_state = NULL; switch (upd->prepare_state) { diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index 9749cef3706..f7b63524d42 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -11,51 +11,42 @@ /* * Fast-delete support. * - * This file contains most of the code that allows WiredTiger to delete pages - * of data without reading them into the cache. (This feature is currently - * only available for row-store objects.) + * This file contains most of the code that allows WiredTiger to delete pages of data without + * reading them into the cache. (This feature is currently only available for row-store objects.) * - * The way cursor truncate works in a row-store object is it explicitly reads - * the first and last pages of the truncate range, then walks the tree with a - * flag so the tree walk code skips reading eligible pages within the range - * and instead just marks them as deleted, by changing their WT_REF state to - * WT_REF_DELETED. Pages ineligible for this fast path include pages already - * in the cache, having overflow items, or requiring lookaside records. - * Ineligible pages are read and have their rows updated/deleted individually. - * The transaction for the delete operation is stored in memory referenced by - * the WT_REF.page_del field. + * The way cursor truncate works in a row-store object is it explicitly reads the first and last + * pages of the truncate range, then walks the tree with a flag so the tree walk code skips reading + * eligible pages within the range and instead just marks them as deleted, by changing their WT_REF + * state to WT_REF_DELETED. Pages ineligible for this fast path include pages already in the cache, + * having overflow items, or requiring lookaside records. Ineligible pages are read and have their + * rows updated/deleted individually. The transaction for the delete operation is stored in memory + * referenced by the WT_REF.page_del field. * - * Future cursor walks of the tree will skip the deleted page based on the - * transaction stored for the delete, but it gets more complicated if a read is - * done using a random key, or a cursor walk is done with a transaction where - * the delete is not visible. In those cases, we read the original contents of - * the page. The page-read code notices a deleted page is being read, and as - * part of the read instantiates the contents of the page, creating a WT_UPDATE - * with a deleted operation, in the same transaction as deleted the page. In - * other words, the read process makes it appear as if the page was read and - * each individual row deleted, exactly as would have happened if the page had + * Future cursor walks of the tree will skip the deleted page based on the transaction stored for + * the delete, but it gets more complicated if a read is done using a random key, or a cursor walk + * is done with a transaction where the delete is not visible. In those cases, we read the original + * contents of the page. 
The page-read code notices a deleted page is being read, and as part of the + * read instantiates the contents of the page, creating a WT_UPDATE with a deleted operation, in the + * same transaction as deleted the page. In other words, the read process makes it appear as if the + * page was read and each individual row deleted, exactly as would have happened if the page had * been in the cache all along. * - * There's an additional complication to support rollback of the page delete. - * When the page was marked deleted, a pointer to the WT_REF was saved in the - * deleting session's transaction list and the delete is unrolled by resetting - * the WT_REF_DELETED state back to WT_REF_DISK. However, if the page has been - * instantiated by some reading thread, that's not enough, each individual row - * on the page must have the delete operation reset. If the page split, the - * WT_UPDATE lists might have been saved/restored during reconciliation and - * appear on multiple pages, and the WT_REF stored in the deleting session's - * transaction list is no longer useful. For this reason, when the page is - * instantiated by a read, a list of the WT_UPDATE structures on the page is - * stored in the WT_REF.page_del field, with the transaction ID, that way the - * session committing/unrolling the delete can find all WT_UPDATE structures - * that require update. + * There's an additional complication to support rollback of the page delete. When the page was + * marked deleted, a pointer to the WT_REF was saved in the deleting session's transaction list and + * the delete is unrolled by resetting the WT_REF_DELETED state back to WT_REF_DISK. However, if the + * page has been instantiated by some reading thread, that's not enough, each individual row on the + * page must have the delete operation reset. If the page split, the WT_UPDATE lists might have been + * saved/restored during reconciliation and appear on multiple pages, and the WT_REF stored in the + * deleting session's transaction list is no longer useful. For this reason, when the page is + * instantiated by a read, a list of the WT_UPDATE structures on the page is stored in the + * WT_REF.page_del field, with the transaction ID, that way the session committing/unrolling the + * delete can find all WT_UPDATE structures that require update. * - * One final note: pages can also be marked deleted if emptied and evicted. In - * that case, the WT_REF state will be set to WT_REF_DELETED but there will not - * be any associated WT_REF.page_del field. These pages are always skipped - * during cursor traversal (the page could not have been evicted if there were - * updates that weren't globally visible), and if read is forced to instantiate - * such a page, it simply creates an empty page from scratch. + * One final note: pages can also be marked deleted if emptied and evicted. In that case, the WT_REF + * state will be set to WT_REF_DELETED but there will not be any associated WT_REF.page_del field. + * These pages are always skipped during cursor traversal (the page could not have been evicted if + * there were updates that weren't globally visible), and if read is forced to instantiate such a + * page, it simply creates an empty page from scratch. */ /* @@ -102,12 +93,10 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) return (0); /* - * If this WT_REF was previously part of a truncate operation, there - * may be existing page-delete information. 
The structure is only read - * while the state is locked, free the previous version. + * If this WT_REF was previously part of a truncate operation, there may be existing page-delete + * information. The structure is only read while the state is locked, free the previous version. * - * Note: changes have been made, we must publish any state change from - * this point on. + * Note: changes have been made, we must publish any state change from this point on. */ if (ref->page_del != NULL) { WT_ASSERT(session, ref->page_del->txnid == WT_TXN_ABORTED); @@ -116,18 +105,15 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) } /* - * We cannot truncate pages that have overflow key/value items as the - * overflow blocks have to be discarded. The way we figure that out is - * to check the page's cell type, cells for leaf pages without overflow - * items are special. + * We cannot truncate pages that have overflow key/value items as the overflow blocks have to be + * discarded. The way we figure that out is to check the page's cell type, cells for leaf pages + * without overflow items are special. * - * To look at an on-page cell, we need to look at the parent page, and - * that's dangerous, our parent page could change without warning if - * the parent page were to split, deepening the tree. We can look at - * the parent page itself because the page can't change underneath us. - * However, if the parent page splits, our reference address can change; - * we don't care what version of it we read, as long as we don't read - * it twice. + * To look at an on-page cell, we need to look at the parent page, and that's dangerous, our + * parent page could change without warning if the parent page were to split, deepening the + * tree. We can look at the parent page itself because the page can't change underneath us. + * However, if the parent page splits, our reference address can change; we don't care what + * version of it we read, as long as we don't read it twice. */ WT_ORDERED_READ(ref_addr, ref->addr); if (ref_addr != NULL && (__wt_off_page(ref->home, ref_addr) ? @@ -219,15 +205,12 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) } /* - * We can't use the normal read path to get a copy of the page - * because the session may have closed the cursor, we no longer - * have the reference to the tree required for a hazard - * pointer. We're safe because with unresolved transactions, - * the page isn't going anywhere. + * We can't use the normal read path to get a copy of the page because the session may have + * closed the cursor, we no longer have the reference to the tree required for a hazard pointer. + * We're safe because with unresolved transactions, the page isn't going anywhere. * - * The page is in an in-memory state, which means it - * was instantiated at some point. Walk any list of - * update structures and abort them. + * The page is in an in-memory state, which means it was instantiated at some point. Walk any + * list of update structures and abort them. */ WT_ASSERT(session, locked); if ((updp = ref->page_del->update_list) != NULL) @@ -255,22 +238,19 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) bool skip; /* - * Deleted pages come from two sources: either it's a truncate as - * described above, or the page has been emptied by other operations - * and eviction deleted it. 
+ * Deleted pages come from two sources: either it's a truncate as described above, or the page + * has been emptied by other operations and eviction deleted it. * - * In both cases, the WT_REF state will be WT_REF_DELETED. In the case - * of a truncated page, there will be a WT_PAGE_DELETED structure with - * the transaction ID of the transaction that deleted the page, and the - * page is visible if that transaction ID is visible. In the case of an - * empty page, there will be no WT_PAGE_DELETED structure and the delete - * is by definition visible, eviction could not have deleted the page if - * there were changes on it that were not globally visible. + * In both cases, the WT_REF state will be WT_REF_DELETED. In the case of a truncated page, + * there will be a WT_PAGE_DELETED structure with the transaction ID of the transaction that + * deleted the page, and the page is visible if that transaction ID is visible. In the case of + * an empty page, there will be no WT_PAGE_DELETED structure and the delete is by definition + * visible, eviction could not have deleted the page if there were changes on it that were not + * globally visible. * - * We're here because we found a WT_REF state set to WT_REF_DELETED. It - * is possible the page is being read into memory right now, though, and - * the page could switch to an in-memory state at any time. Lock down - * the structure, just to be safe. + * We're here because we found a WT_REF state set to WT_REF_DELETED. It is possible the page is + * being read into memory right now, though, and the page could switch to an in-memory state at + * any time. Lock down the structure, just to be safe. */ if (ref->page_del == NULL && ref->page_las == NULL) return (true); @@ -362,26 +342,22 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) } /* - * An operation is accessing a "deleted" page, and we're building an - * in-memory version of the page (making it look like all entries in - * the page were individually updated by a remove operation). There - * are two cases where we end up here: + * An operation is accessing a "deleted" page, and we're building an in-memory version of the + * page (making it look like all entries in the page were individually updated by a remove + * operation). There are two cases where we end up here: * - * First, a running transaction used a truncate call to delete the page - * without reading it, in which case the page reference includes a - * structure with a transaction ID; the page we're building might split - * in the future, so we update that structure to include references to - * all of the update structures we create, so the transaction can abort. + * First, a running transaction used a truncate call to delete the page without reading it, in + * which case the page reference includes a structure with a transaction ID; the page we're + * building might split in the future, so we update that structure to include references to all + * of the update structures we create, so the transaction can abort. * - * Second, a truncate call deleted a page and the truncate committed, - * but an older transaction in the system forced us to keep the old - * version of the page around, then we crashed and recovered or we're - * running inside a checkpoint, and now we're being forced to read that - * page. 
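The two-case visibility rule above distills to a small decision function. An illustrative sketch with simplified, made-up types; the real check also locks the WT_REF against a racing read and considers lookaside state:

    #include <stdbool.h>
    #include <stdint.h>

    enum ref_state { REF_DISK, REF_DELETED, REF_MEM };

    struct page_del { uint64_t txnid; };
    struct ref {
        enum ref_state state;
        struct page_del *page_del;
    };

    /* txn_visible stands in for the transaction visibility test. */
    static bool
    deleted_page_skip(const struct ref *ref, bool (*txn_visible)(uint64_t))
    {
        if (ref->state != REF_DELETED)
            return (false);
        /* Emptied-and-evicted page: no transaction to check, by definition visible. */
        if (ref->page_del == NULL)
            return (true);
        /* Truncated page: skip it only if the deleting transaction is visible. */
        return (txn_visible(ref->page_del->txnid));
    }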
+ * Second, a truncate call deleted a page and the truncate committed, but an older transaction + * in the system forced us to keep the old version of the page around, then we crashed and + * recovered or we're running inside a checkpoint, and now we're being forced to read that page. * - * Expect a page-deleted structure if there's a running transaction that - * needs to be resolved, otherwise, there may not be one (and, if the - * transaction has resolved, we can ignore the page-deleted structure). + * Expect a page-deleted structure if there's a running transaction that needs to be resolved, + * otherwise, there may not be one (and, if the transaction has resolved, we can ignore the + * page-deleted structure). */ page_del = __wt_page_del_active(session, ref, true) ? ref->page_del : NULL; diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index c3b8a52d150..9dd84879ddf 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -24,8 +24,7 @@ void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref) { /* - * A version of the page-out function that allows us to make additional - * diagnostic checks. + * A version of the page-out function that allows us to make additional diagnostic checks. * * The WT_REF cannot be the eviction thread's location. */ @@ -336,9 +335,8 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) /* * Free the in-memory index array. * - * For each entry, see if the key was an allocation (that is, if it - * points somewhere other than the original page), and if so, free - * the memory. + * For each entry, see if the key was an allocation (that is, if it points somewhere other than + * the original page), and if so, free the memory. */ WT_ROW_FOREACH (page, rip, i) { copy = WT_ROW_KEY_COPY(rip); diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index d8994e7bfab..595eb55fc5c 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -26,12 +26,10 @@ __wt_btree_page_version_config(WT_SESSION_IMPL *session) conn = S2C(session); /* - * Write timestamp format pages if at the right version or if configured - * at build-time. + * Write timestamp format pages if at the right version or if configured at build-time. * - * WiredTiger version where timestamp page format is written. This is a - * future release, and the values may require update when the release is - * named. + * WiredTiger version where timestamp page format is written. This is a future release, and the + * values may require update when the release is named. */ #define WT_VERSION_TS_MAJOR 3 #define WT_VERSION_TS_MINOR 3 @@ -201,17 +199,15 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) } /* - * Eviction ignores trees until the handle's open flag is set, configure - * eviction before that happens. + * Eviction ignores trees until the handle's open flag is set, configure eviction before that + * happens. * - * Files that can still be bulk-loaded cannot be evicted. - * Permanently cache-resident files can never be evicted. - * Special operations don't enable eviction. The underlying commands may - * turn on eviction (for example, verify turns on eviction while working - * a file to keep from consuming the cache), but it's their decision. 
If - * an underlying command reconfigures eviction, it must either clear the - * evict-disabled-open flag or restore the eviction configuration when - * finished so that handle close behaves correctly. + * Files that can still be bulk-loaded cannot be evicted. Permanently cache-resident files can + * never be evicted. Special operations don't enable eviction. The underlying commands may turn + * on eviction (for example, verify turns on eviction while working a file to keep from + * consuming the cache), but it's their decision. If an underlying command reconfigures + * eviction, it must either clear the evict-disabled-open flag or restore the eviction + * configuration when finished so that handle close behaves correctly. */ if (btree->original || F_ISSET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_REBALANCE | WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) { @@ -243,12 +239,10 @@ __wt_btree_close(WT_SESSION_IMPL *session) btree = S2BT(session); /* - * The close process isn't the same as discarding the handle: we might - * re-open the handle, which isn't a big deal, but the backing blocks - * for the handle may not yet have been discarded from the cache, and - * eviction uses WT_BTREE structure elements. Free backing resources - * but leave the rest alone, and we'll discard the structure when we - * discard the data handle. + * The close process isn't the same as discarding the handle: we might re-open the handle, which + * isn't a big deal, but the backing blocks for the handle may not yet have been discarded from + * the cache, and eviction uses WT_BTREE structure elements. Free backing resources but leave + * the rest alone, and we'll discard the structure when we discard the data handle. * * Handles can be closed multiple times, ignore all but the first. */ @@ -532,14 +526,12 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) if (btree->compressor != NULL && btree->compressor->compress != NULL && btree->type != BTREE_COL_FIX) { /* - * Don't do compression adjustment when on-disk page sizes are - * less than 16KB. There's not enough compression going on to - * fine-tune the size, all we end up doing is hammering shared - * memory. + * Don't do compression adjustment when on-disk page sizes are less than 16KB. There's not + * enough compression going on to fine-tune the size, all we end up doing is hammering + * shared memory. * - * Don't do compression adjustment when on-disk page sizes are - * equal to the maximum in-memory page image, the bytes taken - * for compression can't grow past the base value. + * Don't do compression adjustment when on-disk page sizes are equal to the maximum + * in-memory page image, the bytes taken for compression can't grow past the base value. */ if (btree->maxintlpage >= 16 * 1024 && btree->maxmempage_image > btree->maxintlpage) { btree->intlpage_compadjust = true; @@ -611,9 +603,8 @@ __wt_btree_tree_open(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_ WT_CLEAR(dsk); /* - * Read and verify the page (verify to catch encrypted objects we can't - * decrypt, where we read the object successfully but we can't decrypt - * it, and we want to fail gracefully). + * Read and verify the page (verify to catch encrypted objects we can't decrypt, where we read + * the object successfully but we can't decrypt it, and we want to fail gracefully). * * Create a printable version of the address to pass to verify. */ @@ -939,8 +930,8 @@ __btree_page_sizes(WT_SESSION_IMPL *session) /* * Get the maximum internal/leaf page key/value sizes. 
 * - * In-memory configuration overrides any key/value sizes, there's no - * such thing as an overflow item in an in-memory configuration. + * In-memory configuration overrides any key/value sizes, there's no such thing as an overflow + * item in an in-memory configuration. */ if (F_ISSET(conn, WT_CONN_IN_MEMORY)) { btree->maxintlkey = WT_BTREE_MAX_OBJECT_SIZE; @@ -971,13 +962,12 @@ } /* - * Default/maximum for internal and leaf page keys: split-page / 10. - * Default for leaf page values: split-page / 2. + * Default/maximum for internal and leaf page keys: split-page / 10. Default for leaf page + * values: split-page / 2. * - * It's difficult for applications to configure this in any exact way as - * they have to duplicate our calculation of how many keys must fit on a - * page, and given a split-percentage and page header, that isn't easy - * to do. If the maximum internal key value is too large for the page, + * It's difficult for applications to configure this in any exact way as they have to duplicate + * our calculation of how many keys must fit on a page, and given a split-percentage and page + * header, that isn't easy to do. If the maximum internal key value is too large for the page, * reset it to the default. */ if (btree->maxintlkey == 0 || btree->maxintlkey > intl_split_size / 10) diff --git a/src/third_party/wiredtiger/src/btree/bt_import.c b/src/third_party/wiredtiger/src/btree/bt_import.c index 7a1e1cd936c..02f023567f5 100644 --- a/src/third_party/wiredtiger/src/btree/bt_import.c +++ b/src/third_party/wiredtiger/src/btree/bt_import.c @@ -83,17 +83,16 @@ __wt_import(WT_SESSION_IMPL *session, const char *uri) } /* - * OK, we've now got three chunks of data: the file's metadata from when - * the last checkpoint started, the array of checkpoints as of when the - * last checkpoint was almost complete (everything written but the avail - * list), and fixed-up checkpoint information from the last checkpoint. + * OK, we've now got three chunks of data: the file's metadata from when the last checkpoint + * started, the array of checkpoints as of when the last checkpoint was almost complete + * (everything written but the avail list), and fixed-up checkpoint information from the last + * checkpoint. * - * Build and flatten the metadata and the checkpoint list, then insert - * it into the metadata for this file. + * Build and flatten the metadata and the checkpoint list, then insert it into the metadata for + * this file. * - * Strip out the checkpoint-LSN, an imported file isn't associated - * with any log files. - * Assign a unique file ID. + * Strip out the checkpoint-LSN, an imported file isn't associated with any log files. Assign a + * unique file ID. */ filecfg[1] = a->data; filecfg[2] = checkpoint_list; @@ -107,30 +106,25 @@ __wt_import(WT_SESSION_IMPL *session, const char *uri) __wt_verbose(session, WT_VERB_CHECKPOINT, "import configuration: %s/%s", uri, fileconf); /* - * The just inserted metadata was correct as of immediately before the - * before the final checkpoint, but it's not quite right. The block - * manager returned the corrected final checkpoint, put it all together. + * The just-inserted metadata was correct as of immediately before the final checkpoint, but + * it's not quite right. The block manager returned the corrected final checkpoint, put it all + * together. * - * Get the checkpoint information from the file's metadata as an array - * of WT_CKPT structures.
+ * Get the checkpoint information from the file's metadata as an array of WT_CKPT structures. * - * XXX - * There's a problem here. If a file is imported from our future (leaf - * pages with unstable entries that have write-generations ahead of the - * current database's base write generation), we'll read the values and - * treat them as stable. A restart will fix this: when we added the - * imported file to our metadata, the write generation in the imported - * file's checkpoints updated our database's maximum write generation, - * and so a restart will have a maximum generation newer than the - * imported file's write generation. An alternative solution is to add - * a "base write generation" value to the imported file's metadata, and - * use that value instead of the connection's base write generation when - * deciding what page items should be read. Since all future writes to - * the imported file would be ahead of that write generation, it would - * have the effect we want. + * XXX There's a problem here. If a file is imported from our future (leaf pages with unstable + * entries that have write-generations ahead of the current database's base write generation), + * we'll read the values and treat them as stable. A restart will fix this: when we added the + * imported file to our metadata, the write generation in the imported file's checkpoints + * updated our database's maximum write generation, and so a restart will have a maximum + * generation newer than the imported file's write generation. An alternative solution is to add + * a "base write generation" value to the imported file's metadata, and use that value instead + * of the connection's base write generation when deciding what page items should be read. Since + * all future writes to the imported file would be ahead of that write generation, it would have + * the effect we want. * - * Update the last checkpoint with the corrected information. - * Update the file's metadata with the new checkpoint information. + * Update the last checkpoint with the corrected information. Update the file's metadata with + * the new checkpoint information. */ WT_ERR(__wt_meta_ckptlist_get(session, uri, false, &ckptbase)); WT_CKPT_FOREACH (ckptbase, ckpt) diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c index 25373fa592a..44b672251cb 100644 --- a/src/third_party/wiredtiger/src/btree/bt_io.c +++ b/src/third_party/wiredtiger/src/btree/bt_io.c @@ -309,19 +309,17 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint8_t *addr, size_t *add F_SET(dsk, WT_PAGE_ENCRYPTED); /* - * We increment the block's write generation so it's easy to identify - * newer versions of blocks during salvage. (It's common in WiredTiger, - * at least for the default block manager, for multiple blocks to be - * internally consistent with identical first and last keys, so we need - * a way to know the most recent state of the block. We could check - * which leaf is referenced by a valid internal page, but that implies - * salvaging internal pages, which I don't want to do, and it's not - * as good anyway, because the internal page may not have been written - * after the leaf page was updated. So, write generations it is. + * We increment the block's write generation so it's easy to identify newer versions of blocks + * during salvage. 
(It's common in WiredTiger, at least for the default block manager, for + * multiple blocks to be internally consistent with identical first and last keys, so we need a + * way to know the most recent state of the block. We could check which leaf is referenced by a + * valid internal page, but that implies salvaging internal pages, which I don't want to do, and + * it's not as good anyway, because the internal page may not have been written after the leaf + * page was updated. So, write generations it is.) * - * Nothing is locked at this point but two versions of a page with the - * same generation is pretty unlikely, and if we did, they're going to - * be roughly identical for the purposes of salvage, anyway. + * Nothing is locked at this point, but two versions of a page with the same generation are + * pretty unlikely, and if we did, they're going to be roughly identical for the purposes of + * salvage, anyway. */ dsk->write_gen = ++btree->write_gen; diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c index 4ad373c2ba5..8ea91b31fd2 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c +++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c @@ -21,11 +21,11 @@ __ovfl_read(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ btree = S2BT(session); /* - * Read the overflow item from the block manager, then reference the - * start of the data and set the data's length. + * Read the overflow item from the block manager, then reference the start of the data and set + * the data's length. * - * Overflow reads are synchronous. That may bite me at some point, but - * WiredTiger supports large page sizes, overflow items should be rare. + * Overflow reads are synchronous. That may bite me at some point, but WiredTiger supports large + * page sizes; overflow items should be rare. */ WT_RET(__wt_bt_read(session, store, addr, addr_size)); dsk = store->data; @@ -60,13 +60,11 @@ __wt_ovfl_read( return (__ovfl_read(session, unpack->data, unpack->size, store)); /* - * WT_CELL_VALUE_OVFL_RM cells: If reconciliation deleted an overflow - * value, but there was still a reader in the system that might need it, - * the on-page cell type will have been reset to WT_CELL_VALUE_OVFL_RM - * and we will be passed a page so we can check the on-page cell. + * WT_CELL_VALUE_OVFL_RM cells: If reconciliation deleted an overflow value, but there was still + * a reader in the system that might need it, the on-page cell type will have been reset to + * WT_CELL_VALUE_OVFL_RM and we will be passed a page so we can check the on-page cell. * - * Acquire the overflow lock, and retest the on-page cell's value inside - * the lock. + * Acquire the overflow lock, and retest the on-page cell's value inside the lock. */ __wt_readlock(session, &S2BT(session)->ovfl_lock); if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM) { @@ -188,12 +186,11 @@ __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack WT_RET(__ovfl_cache(session, page, unpack)); /* - * The second problem is to only remove the underlying blocks once, - * solved by the WT_CELL_VALUE_OVFL_RM flag. + * The second problem is to only remove the underlying blocks once, solved by the + * WT_CELL_VALUE_OVFL_RM flag. * - * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the - * underlying overflow value's blocks to be freed when reconciliation - * completes.
+ * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the underlying overflow value's + * blocks to be freed when reconciliation completes. */ return (__wt_ovfl_discard_add(session, page, unpack->cell)); } @@ -216,15 +213,13 @@ __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell) __wt_cell_unpack(session, page, cell, unpack); /* - * Finally remove overflow key/value objects, called when reconciliation - * finishes after successfully writing a page. + * Finally remove overflow key/value objects, called when reconciliation finishes after + * successfully writing a page. * - * Keys must have already been instantiated and value objects must have - * already been cached (if they might potentially still be read by any - * running transaction). + * Keys must have already been instantiated and value objects must have already been cached (if + * they might potentially still be read by any running transaction). * - * Acquire the overflow lock to avoid racing with a thread reading the - * backing overflow blocks. + * Acquire the overflow lock to avoid racing with a thread reading the backing overflow blocks. */ __wt_writelock(session, &btree->ovfl_lock); diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 407fbca7839..0db3e5216d2 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -148,14 +148,12 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32 case WT_PAGE_COL_INT: case WT_PAGE_COL_VAR: /* - * Column-store leaf page entries map one-to-one to the number - * of physical entries on the page (each physical entry is a - * value item). Note this value isn't necessarily correct, we + * Column-store leaf page entries map one-to-one to the number of physical entries on the + * page (each physical entry is a value item). Note this value isn't necessarily correct, we * may skip values when reading the disk image. * - * Column-store internal page entries map one-to-one to the - * number of physical entries on the page (each entry is a - * location cookie). + * Column-store internal page entries map one-to-one to the number of physical entries on + * the page (each entry is a location cookie). */ alloc_entries = dsk->u.entries; break; @@ -191,14 +189,12 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32 F_SET_ATOMIC(page, flags); /* - * Track the memory allocated to build this page so we can update the - * cache statistics in a single call. If the disk image is in allocated - * memory, start with that. + * Track the memory allocated to build this page so we can update the cache statistics in a + * single call. If the disk image is in allocated memory, start with that. * - * Accounting is based on the page-header's in-memory disk size instead - * of the buffer memory used to instantiate the page image even though - * the values might not match exactly, because that's the only value we - * have when discarding the page image and accounting needs to match. + * Accounting is based on the page-header's in-memory disk size instead of the buffer memory + * used to instantiate the page image even though the values might not match exactly, because + * that's the only value we have when discarding the page image and accounting needs to match. */ size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? 
dsk->mem_size : 0; @@ -454,21 +450,16 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) break; case WT_CELL_ADDR_DEL: /* - * A cell may reference a deleted leaf page: if a leaf - * page was deleted without being read (fast truncate), - * and the deletion committed, but older transactions - * in the system required the previous version of the - * page to remain available, a special deleted-address - * type cell is written. We'll see that cell on a page - * if we read from a checkpoint including a deleted - * cell or if we crash/recover and start off from such - * a checkpoint (absent running recovery, a version of - * the page without the deleted cell would eventually - * have been written). If we crash and recover to a - * page with a deleted-address cell, we want to discard - * the page from the backing store (it was never - * discarded), and, of course, by definition no earlier - * transaction will ever need it. + * A cell may reference a deleted leaf page: if a leaf page was deleted without being + * read (fast truncate), and the deletion committed, but older transactions in the + * system required the previous version of the page to remain available, a special + * deleted-address type cell is written. We'll see that cell on a page if we read from a + * checkpoint including a deleted cell or if we crash/recover and start off from such a + * checkpoint (absent running recovery, a version of the page without the deleted cell + * would eventually have been written). If we crash and recover to a page with a + * deleted-address cell, we want to discard the page from the backing store (it was + * never discarded), and, of course, by definition no earlier transaction will ever need + * it. * * Re-create the state of a deleted page. */ @@ -524,15 +515,14 @@ __inmem_row_leaf_entries(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, ui btree = S2BT(session); /* - * Leaf row-store page entries map to a maximum of one-to-one to the - * number of physical entries on the page (each physical entry might be - * a key without a subsequent data item). To avoid over-allocation in - * workloads without empty data items, first walk the page counting the + * Leaf row-store page entries map to a maximum of one-to-one to the number of physical entries + * on the page (each physical entry might be a key without a subsequent data item). To avoid + * over-allocation in workloads without empty data items, first walk the page counting the * number of keys, then allocate the indices. * - * The page contains key/data pairs. Keys are on-page (WT_CELL_KEY) or - * overflow (WT_CELL_KEY_OVFL) items, data are either non-existent or a - * single on-page (WT_CELL_VALUE) or overflow (WT_CELL_VALUE_OVFL) item. + * The page contains key/data pairs. Keys are on-page (WT_CELL_KEY) or overflow + * (WT_CELL_KEY_OVFL) items, data are either non-existent or a single on-page (WT_CELL_VALUE) or + * overflow (WT_CELL_VALUE_OVFL) item. */ nindx = 0; WT_CELL_FOREACH_BEGIN (session, btree, dsk, unpack) { diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c index f3f8b31b33e..6acccf699a4 100644 --- a/src/third_party/wiredtiger/src/btree/bt_random.c +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -316,8 +316,9 @@ __random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) if (ret == WT_NOTFOUND) { next = !next; /* Reverse direction. */ ret = next ? __wt_btcur_next(cbt, false) : __wt_btcur_prev(cbt, false); - WT_RET(ret); /* An empty tree. 
*/ } + WT_RET(ret); + if (i > 0) --i; else { @@ -397,15 +398,13 @@ restart: } /* - * There may be empty pages in the tree, and they're useless to - * us. If we don't find a non-empty page in "entries" random - * guesses, take the first non-empty page in the tree. If the - * search page contains nothing other than empty pages, restart - * from the root some number of times before giving up. + * There may be empty pages in the tree, and they're useless to us. If we don't find a + * non-empty page in "entries" random guesses, take the first non-empty page in the tree. If + * the search page contains nothing other than empty pages, restart from the root some + * number of times before giving up. * - * Random sampling is looking for a key/value pair on a random - * leaf page, and so will accept any page that contains a valid - * key/value pair, so on-disk is fine, but deleted is not. + * Random sampling is looking for a key/value pair on a random leaf page, and so will accept + * any page that contains a valid key/value pair, so on-disk is fine, but deleted is not. */ descent = NULL; for (i = 0; i < entries; ++i) { @@ -430,11 +429,10 @@ restart: } /* - * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search at the root. + * Swap the current page for the child page. If the page splits while we're retrieving it, + * restart the search at the root. * - * On other error, simply return, the swap call ensures we're - * holding nothing on failure. + * On other error, simply return, the swap call ensures we're holding nothing on failure. */ descend: if ((ret = __wt_page_swap(session, current, descent, flags)) == 0) { @@ -523,12 +521,11 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) } /* - * Cursor through the tree, skipping past the sample size of the leaf - * pages in the tree between each random key return to compensate for - * unbalanced trees. + * Cursor through the tree, skipping past the sample size of the leaf pages in the tree between + * each random key return to compensate for unbalanced trees. * - * If the random descent attempt failed, we don't have a configured - * sample size, use 100 for no particular reason. + * If the random descent attempt failed, we don't have a configured sample size, use 100 for no + * particular reason. */ if (cbt->next_random_sample_size == 0) cbt->next_random_sample_size = 100; @@ -555,19 +552,17 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) } /* - * Be paranoid about loop termination: first, if the last leaf page - * skipped was also the last leaf page in the tree, skip may be set to - * zero on return along with the NULL WT_REF end-of-walk condition. - * Second, if a tree has no valid pages at all (the condition after - * initial creation), we might make no progress at all, or finally, if - * a tree has only deleted pages, we'll make progress, but never get a - * useful WT_REF. And, of course, the tree can switch from one of these - * states to another without warning. Decrement skip regardless of what + * Be paranoid about loop termination: first, if the last leaf page skipped was also the last + * leaf page in the tree, skip may be set to zero on return along with the NULL WT_REF + * end-of-walk condition. Second, if a tree has no valid pages at all (the condition after + * initial creation), we might make no progress at all, or finally, if a tree has only deleted + * pages, we'll make progress, but never get a useful WT_REF. 
And, of course, the tree can + * switch from one of these states to another without warning. Decrement skip regardless of what * is happening in the search, guarantee we eventually quit. * - * Pages read for data sampling aren't "useful"; don't update the read - * generation of pages already in memory, and if a page is read, set - * its generation to a low value so it is evicted quickly. + * Pages read for data sampling aren't "useful"; don't update the read generation of pages + * already in memory, and if a page is read, set its generation to a low value so it is evicted + * quickly. */ for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) { n = skip; diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 176ade40575..e75680fc946 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -430,12 +430,10 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) WT_CLEAR(tmp); /* - * Attempt to set the state to WT_REF_READING for normal reads, or - * WT_REF_LOCKED, for deleted pages or pages with lookaside entries. - * The difference is that checkpoints can skip over clean pages that - * are being read into cache, but need to wait for deletes or lookaside - * updates to be resolved (in order for checkpoint to write the correct - * version of the page). + * Attempt to set the state to WT_REF_READING for normal reads, or WT_REF_LOCKED, for deleted + * pages or pages with lookaside entries. The difference is that checkpoints can skip over clean + * pages that are being read into cache, but need to wait for deletes or lookaside updates to be + * resolved (in order for checkpoint to write the correct version of the page). * * If successful, we've won the race, read the page. */ @@ -489,15 +487,13 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) } /* - * Build the in-memory version of the page. Clear our local reference to - * the allocated copy of the disk image on return, the in-memory object - * steals it. + * Build the in-memory version of the page. Clear our local reference to the allocated copy of + * the disk image on return, the in-memory object steals it. * - * If a page is read with eviction disabled, we don't count evicting it - * as progress. Since disabling eviction allows pages to be read even - * when the cache is full, we want to avoid workloads repeatedly reading - * a page with eviction disabled (e.g., a metadata page), then evicting - * that page and deciding that is a sign that eviction is unstuck. + * If a page is read with eviction disabled, we don't count evicting it as progress. Since + * disabling eviction allows pages to be read even when the cache is full, we want to avoid + * workloads repeatedly reading a page with eviction disabled (e.g., a metadata page), then + * evicting that page and deciding that is a sign that eviction is unstuck. */ page_flags = WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED; if (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) @@ -683,9 +679,8 @@ read: /* * The page is in memory. * - * Get a hazard pointer if one is required. We cannot - * be evicting if no hazard pointer is required, we're - * done. + * Get a hazard pointer if one is required. We cannot be evicting if no hazard pointer + * is required, we're done. 
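The hazard-pointer check described above follows the classic publish-then-recheck pattern: a reader publishes the page pointer it intends to use, then re-reads the reference to make sure an evictor didn't miss the publication. A generic C11 sketch of that pattern, with illustrative names (hazard_slot, hazard_acquire), not WiredTiger's real hazard-pointer routines:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct page; /* Opaque stand-in for the real page type. */

    /* One hazard slot per thread; a page with a published hazard pointer
     * must not be evicted. */
    static _Thread_local struct page *_Atomic hazard_slot;

    static bool
    hazard_acquire(struct page *_Atomic *refp)
    {
        struct page *p;

        if ((p = atomic_load(refp)) == NULL)
            return (false);
        atomic_store(&hazard_slot, p); /* Publish our intent to use it. */

        /* Re-check: if the reference changed underneath us, an evictor may
         * have scanned the slots before our store; back off and retry. */
        if (atomic_load(refp) != p) {
            atomic_store(&hazard_slot, NULL);
            return (false);
        }
        return (true);
    }

Eviction, for its part, removes a page only after scanning the hazard slots and finding no published reference to it, which is why, in trees where no hazard pointer is required, there can be no eviction to race with.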
*/ if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) goto skip_evict; @@ -761,14 +756,11 @@ read: skip_evict: /* - * If we read the page and are configured to not trash - * the cache, and no other thread has already used the - * page, set the read generation so the page is evicted - * soon. + * If we read the page and are configured to not trash the cache, and no other thread + * has already used the page, set the read generation so the page is evicted soon. * - * Otherwise, if we read the page, or, if configured to - * update the page's read generation and the page isn't - * already flagged for forced eviction, update the page + * Otherwise, if we read the page, or, if configured to update the page's read + * generation and the page isn't already flagged for forced eviction, update the page * read generation. */ page = ref->page; @@ -781,17 +773,13 @@ read: __wt_cache_read_gen_bump(session, page); /* - * Check if we need an autocommit transaction. - * Starting a transaction can trigger eviction, so skip - * it if eviction isn't permitted. + * Check if we need an autocommit transaction. Starting a transaction can trigger + * eviction, so skip it if eviction isn't permitted. * - * The logic here is a little weird: some code paths do - * a blanket ban on checking the cache size in - * sessions, but still require a transaction (e.g., - * when updating metadata or lookaside). If - * WT_READ_IGNORE_CACHE_SIZE was passed in explicitly, - * we're done. If we set WT_READ_IGNORE_CACHE_SIZE - * because it was set in the session then make sure we + * The logic here is a little weird: some code paths do a blanket ban on checking the + * cache size in sessions, but still require a transaction (e.g., when updating metadata + * or lookaside). If WT_READ_IGNORE_CACHE_SIZE was passed in explicitly, we're done. If + * we set WT_READ_IGNORE_CACHE_SIZE because it was set in the session then make sure we * start a transaction. */ return (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE) && diff --git a/src/third_party/wiredtiger/src/btree/bt_rebalance.c b/src/third_party/wiredtiger/src/btree/bt_rebalance.c index 304750bd1b5..ead542b77a1 100644 --- a/src/third_party/wiredtiger/src/btree/bt_rebalance.c +++ b/src/third_party/wiredtiger/src/btree/bt_rebalance.c @@ -243,12 +243,12 @@ __rebalance_row_leaf_key(WT_SESSION_IMPL *session, const uint8_t *addr, size_t a WT_PAGE *page; /* - * We need the first key from a leaf page. Leaf pages are relatively - * complex (Huffman encoding, prefix compression, and so on), do the - * work to instantiate the page and copy the first key to the buffer. + * We need the first key from a leaf page. Leaf pages are relatively complex (Huffman encoding, + * prefix compression, and so on), do the work to instantiate the page and copy the first key to + * the buffer. * - * Page flags are 0 because we aren't releasing the memory used to read - * the page into memory and we don't want page discard to free it. + * Page flags are 0 because we aren't releasing the memory used to read the page into memory and + * we don't want page discard to free it. */ WT_RET(__wt_bt_read(session, rs->tmp1, addr, addr_len)); WT_RET(__wt_page_inmem(session, NULL, rs->tmp1->data, 0, false, &page)); @@ -296,13 +296,12 @@ __rebalance_row_walk(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts, const break; case WT_CELL_KEY_OVFL: /* - * Any overflow key that references an internal page is - * of no further use, schedule its blocks to be freed. 
+ * Any overflow key that references an internal page is of no further use; schedule its + * blocks to be freed. * - * We could potentially use the same overflow key being - * freed here for the internal page we're creating, but - * that's more work to get reconciliation to understand - * and overflow keys are (well, should be), uncommon. + * We could potentially use the same overflow key being freed here for the internal page + * we're creating, but that's more work to get reconciliation to understand and overflow + * keys are (well, should be) uncommon. */ __wt_verbose(session, WT_VERB_REBALANCE, "free-list append overflow key: %s", __wt_addr_string(session, unpack.data, unpack.size, rs->tmp1)); diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c index 829a4c3a9f3..d9d1d8263a8 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ret.c +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -137,9 +137,8 @@ __wt_value_return_upd( allocated_bytes = 0; /* - * We're passed a "standard" or "modified" update that's visible to us. - * Our caller should have already checked for deleted items (we're too - * far down the call stack to return not-found). + * We're passed a "standard" or "modified" update that's visible to us. Our caller should have + * already checked for deleted items (we're too far down the call stack to return not-found). * * Fast path if it's a standard item, assert our caller's behavior. */ diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index 5ca21d61001..ea54d449576 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -170,19 +170,15 @@ __slvg_checkpoint(WT_SESSION_IMPL *session, WT_REF *root) config = NULL; /* - * XXX - * The salvage process reads and discards previous checkpoints, so the - * underlying block manager has to ignore any previous checkpoint - * entries when creating a new checkpoint. In other words, we can't use - * the metadata checkpoint list, it lists the previous checkpoints and - * we don't care about them. Build a clean checkpoint list and use it - * instead. + * XXX The salvage process reads and discards previous checkpoints, so the underlying block + * manager has to ignore any previous checkpoint entries when creating a new checkpoint. In + * other words, we can't use the metadata checkpoint list: it lists the previous checkpoints and + * we don't care about them. Build a clean checkpoint list and use it instead. * - * Don't first clear the metadata checkpoint list and call the function - * to get a list of checkpoints: a crash between clearing the metadata - * checkpoint list and creating a new checkpoint list would look like a - * create or open of a file without a checkpoint to roll-forward from, - * and the contents of the file would be discarded. + * Don't first clear the metadata checkpoint list and call the function to get a list of + * checkpoints: a crash between clearing the metadata checkpoint list and creating a new + * checkpoint list would look like a create or open of a file without a checkpoint to + * roll-forward from, and the contents of the file would be discarded. */ WT_RET(__wt_calloc_def(session, 2, &ckptbase)); WT_ERR(__wt_strdup(session, WT_CHECKPOINT, &ckptbase->name)); @@ -209,11 +205,11 @@ __slvg_checkpoint(WT_SESSION_IMPL *session, WT_REF *root) } /* - * If no checkpoint was created, clear all recorded checkpoints for the - * file.
This is expected if we didn't find any leaf pages to salvage. + * If no checkpoint was created, clear all recorded checkpoints for the file. This is expected + * if we didn't find any leaf pages to salvage. * - * If a checkpoint was created, life is good, replace any existing list - * of checkpoints with the single new one. + * If a checkpoint was created, life is good, replace any existing list of checkpoints with the + * single new one. */ if (ckptbase->raw.data == NULL) WT_TRET(__wt_meta_checkpoint_clear(session, dhandle->name)); @@ -259,13 +255,11 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(bm->salvage_start(bm, session)); /* - * Step 2: - * Read the file and build in-memory structures that reference any leaf - * or overflow page. Any pages other than leaf or overflow pages are - * added to the free list. + * Step 2: Read the file and build in-memory structures that reference any leaf or overflow + * page. Any pages other than leaf or overflow pages are added to the free list. * - * Turn off read checksum and verification error messages while we're - * reading the file, we expect to see corrupted blocks. + * Turn off read checksum and verification error messages while we're reading the file, we + * expect to see corrupted blocks. */ F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE); ret = __slvg_read(session, ss); @@ -348,12 +342,11 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) } /* - * Step 7: - * Build an internal page that references all of the leaf pages, - * and write it, as well as any merged pages, to the file. + * Step 7: Build an internal page that references all of the leaf pages, and write it, as well + * as any merged pages, to the file. * - * Count how many leaf pages we have (we could track this during the - * array shuffling/splitting, but that's a lot harder). + * Count how many leaf pages we have (we could track this during the array shuffling/splitting, + * but that's a lot harder). */ for (leaf_cnt = i = 0; i < ss->pages_next; ++i) if (ss->pages[i] != NULL) @@ -439,10 +432,9 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss) WT_ERR(__wt_progress(session, NULL, ss->fcnt)); /* - * Read (and potentially decompress) the block; the underlying - * block manager might return only good blocks if checksums are - * configured, or both good and bad blocks if we're relying on - * compression. + * Read (and potentially decompress) the block; the underlying block manager might return + * only good blocks if checksums are configured, or both good and bad blocks if we're + * relying on compression. * * Report the block's status to the block manager. */ @@ -464,11 +456,10 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss) /* * Make sure it's an expected page type for the file. * - * We only care about leaf and overflow pages from here on out; - * discard all of the others. We put them on the free list now, - * because we might as well overwrite them, we want the file to - * grow as little as possible, or shrink, and future salvage - * calls don't need them either. + * We only care about leaf and overflow pages from here on out; discard all of the others. + * We put them on the free list now, because we might as well overwrite them, we want the + * file to grow as little as possible, or shrink, and future salvage calls don't need them + * either. 
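Step 2's keep-or-free decision reduces to a filter over every block in the file. A compilable skeleton with stand-in types and stub helpers (struct block, read_next_block, track_block, and free_list_add are illustrative, not WiredTiger's interfaces):

    #include <stddef.h>

    enum page_type { PAGE_INTERNAL, PAGE_LEAF, PAGE_OVERFLOW };

    struct block {
        enum page_type type;
    };

    static struct block *read_next_block(void) { return (NULL); } /* Stub. */
    static void track_block(struct block *b) { (void)b; }   /* Remember it. */
    static void free_list_add(struct block *b) { (void)b; } /* Reuse space. */

    /* Walk every block, keep leaf and overflow blocks for the merge pass,
     * and put everything else on the free list so the file can be
     * overwritten in place and grow as little as possible. */
    static void
    salvage_read_pass(void)
    {
        struct block *b;

        while ((b = read_next_block()) != NULL)
            switch (b->type) {
            case PAGE_LEAF:
            case PAGE_OVERFLOW:
                track_block(b);
                break;
            default:
                free_list_add(b);
                break;
            }
    }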
*/ dsk = buf->data; switch (dsk->type) { @@ -617,16 +608,13 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, uint8_t *ad break; case WT_PAGE_ROW_LEAF: /* - * Row-store format: copy the first and last keys on the page. - * Keys are prefix-compressed, the simplest and slowest thing - * to do is instantiate the in-memory page, then instantiate - * and copy the full keys, then free the page. We do this on - * every leaf page, and if you need to speed up the salvage, - * it's probably a great place to start. + * Row-store format: copy the first and last keys on the page. Keys are prefix-compressed, + * the simplest and slowest thing to do is instantiate the in-memory page, then instantiate + * and copy the full keys, then free the page. We do this on every leaf page, and if you + * need to speed up the salvage, it's probably a great place to start. * - * Page flags are 0 because we aren't releasing the memory used - * to read the page into memory and we don't want page discard - * to free it. + * Page flags are 0 because we aren't releasing the memory used to read the page into memory + * and we don't want page discard to free it. */ WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, false, &page)); WT_ERR(__wt_row_leaf_key_copy(session, page, &page->pg_row[0], &trk->row_start)); @@ -768,16 +756,14 @@ __slvg_col_range(WT_SESSION_IMPL *session, WT_STUFF *ss) uint32_t i, j; /* - * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR - * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE - * BEING HANDLED. + * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR COLUMN-STORE CODE: THEY + * ARE IDENTICAL OTHER THAN THE PAGES THAT ARE BEING HANDLED. * - * Walk the page array looking for overlapping key ranges, adjusting - * the ranges based on the LSN until there are no overlaps. + * Walk the page array looking for overlapping key ranges, adjusting the ranges based on the LSN + * until there are no overlaps. * - * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE - * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE - * PLUS OFFSET. + * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE AS ENTRIES ARE SPLIT, SO + * ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE PLUS OFFSET. */ for (i = 0; i < ss->pages_next; ++i) { if (ss->pages[i] == NULL) @@ -951,12 +937,10 @@ __slvg_col_range_overlap(WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_s } /* - * Case #5: b_trk is more desirable and is a middle chunk of a_trk. - * Split a_trk into two parts, the key range before b_trk and the - * key range after b_trk. + * Case #5: b_trk is more desirable and is a middle chunk of a_trk. Split a_trk into two parts, + * the key range before b_trk and the key range after b_trk. * - * Allocate a new WT_TRACK object, and extend the array of pages as - * necessary. + * Allocate a new WT_TRACK object, and extend the array of pages as necessary. */ WT_RET(__wt_calloc_one(session, &new)); if ((ret = __wt_realloc_def(session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages)) != @@ -1356,16 +1340,14 @@ __slvg_row_range(WT_SESSION_IMPL *session, WT_STUFF *ss) btree = S2BT(session); /* - * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR - * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE - * BEING HANDLED. + * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR COLUMN-STORE CODE: THEY + * ARE IDENTICAL OTHER THAN THE PAGES THAT ARE BEING HANDLED. 
* - * Walk the page array looking for overlapping key ranges, adjusting - * the ranges based on the LSN until there are no overlaps. + * Walk the page array looking for overlapping key ranges, adjusting the ranges based on the LSN + * until there are no overlaps. * - * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE - * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE - * PLUS OFFSET. + * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE AS ENTRIES ARE SPLIT, SO + * ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE PLUS OFFSET. */ for (i = 0; i < ss->pages_next; ++i) { if (ss->pages[i] == NULL) @@ -1550,12 +1532,10 @@ __slvg_row_range_overlap(WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_s } /* - * Case #5: b_trk is more desirable and is a middle chunk of a_trk. - * Split a_trk into two parts, the key range before b_trk and the - * key range after b_trk. + * Case #5: b_trk is more desirable and is a middle chunk of a_trk. Split a_trk into two parts, + * the key range before b_trk and the key range after b_trk. * - * Allocate a new WT_TRACK object, and extend the array of pages as - * necessary. + * Allocate a new WT_TRACK object, and extend the array of pages as necessary. */ WT_RET(__wt_calloc_one(session, &new)); if ((ret = __wt_realloc_def(session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages)) != @@ -1819,19 +1799,16 @@ __slvg_row_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref, WT_S page = ref->page; /* - * Figure out how many page keys we want to take and how many we want - * to skip. + * Figure out how many page keys we want to take and how many we want to skip. * - * If checking the starting range key, the key we're searching for will - * be equal to the starting range key. This is because we figured out - * the true merged-page start key as part of discarding initial keys - * from the page (see the __slvg_row_range_overlap function, and its + * If checking the starting range key, the key we're searching for will be equal to the starting + * range key. This is because we figured out the true merged-page start key as part of + * discarding initial keys from the page (see the __slvg_row_range_overlap function, and its * calls to __slvg_row_trk_update_start for more information). * - * If checking the stopping range key, we want the keys on the page that - * are less-than the stopping range key. This is because we copied a - * key from another page to define this page's stop range: that page is - * the page that owns the "equal to" range space. + * If checking the stopping range key, we want the keys on the page that are less-than the + * stopping range key. This is because we copied a key from another page to define this page's + * stop range: that page is the page that owns the "equal to" range space. */ skip_start = skip_stop = 0; if (F_ISSET(trk, WT_TRACK_CHECK_START)) @@ -2043,11 +2020,10 @@ __slvg_ovfl_reconcile(WT_SESSION_IMPL *session, WT_STUFF *ss) slot = NULL; /* - * If an overflow page is referenced more than once, discard leaf pages - * with the lowest LSNs until overflow pages are only referenced once. + * If an overflow page is referenced more than once, discard leaf pages with the lowest LSNs + * until overflow pages are only referenced once. * - * This requires sorting the page list by LSN, and the overflow array by - * address cookie. + * This requires sorting the page list by LSN, and the overflow array by address cookie. 
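The two orderings described here are plain qsort passes with different comparators. A standalone sketch, assuming an illustrative struct track rather than WT_TRACK's real layout:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    struct track {
        uint64_t gen;           /* Write generation (LSN stand-in). */
        unsigned char addr[32]; /* Address cookie. */
        size_t addr_size;
    };

    /* Order track pointers by generation, oldest first. */
    static int
    compare_gen(const void *a, const void *b)
    {
        const struct track *at = *(const struct track *const *)a;
        const struct track *bt = *(const struct track *const *)b;

        return (at->gen < bt->gen ? -1 : (at->gen > bt->gen ? 1 : 0));
    }

    /* Order track pointers by address cookie. */
    static int
    compare_addr(const void *a, const void *b)
    {
        const struct track *at = *(const struct track *const *)a;
        const struct track *bt = *(const struct track *const *)b;
        size_t len = at->addr_size < bt->addr_size ? at->addr_size : bt->addr_size;
        int cmp = memcmp(at->addr, bt->addr, len);

        if (cmp != 0)
            return (cmp);
        return (at->addr_size < bt->addr_size ? -1 : (at->addr_size > bt->addr_size ? 1 : 0));
    }

    static void
    sort_for_reconcile(struct track **pages, size_t np, struct track **ovfl, size_t no)
    {
        qsort(pages, np, sizeof(struct track *), compare_gen);
        qsort(ovfl, no, sizeof(struct track *), compare_addr);
    }

With the page list in generation order, the lowest-generation pages referencing a shared overflow item are the first candidates to discard, and the address-sorted overflow list lets each reference be located by binary search.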
*/ __wt_qsort(ss->pages, (size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_gen); __wt_qsort(ss->ovfl, (size_t)ss->ovfl_next, sizeof(WT_TRACK *), __slvg_trk_compare_addr); @@ -2261,11 +2237,11 @@ __slvg_ovfl_discard(WT_SESSION_IMPL *session, WT_STUFF *ss) uint32_t i; /* - * Walk the overflow page array: if an overflow page isn't referenced, - * add its file blocks to the free list. + * Walk the overflow page array: if an overflow page isn't referenced, add its file blocks to + * the free list. * - * Clear the reference flag (it's reused to figure out if the overflow - * record is referenced, but never used, by merged pages). + * Clear the reference flag (it's reused to figure out if the overflow record is referenced, but + * never used, by merged pages). */ for (i = 0; i < ss->ovfl_next; ++i) { if ((trk = ss->ovfl[i]) == NULL) diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index ee373192a40..f22036e1ebb 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -210,21 +210,17 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, WT_REF **from_ref addr = NULL; /* - * The from-home argument is the page into which the "from" WT_REF may - * point, for example, if there's an on-page key the "from" WT_REF - * references, it will be on the page "from-home". + * The from-home argument is the page into which the "from" WT_REF may point, for example, if + * there's an on-page key the "from" WT_REF references, it will be on the page "from-home". * - * Instantiate row-store keys, and column- and row-store addresses in - * the WT_REF structures referenced by a page that's being split. The - * WT_REF structures aren't moving, but the index references are moving - * from the page we're splitting to a set of new pages, and so we can - * no longer reference the block image that remains with the page being - * split. + * Instantiate row-store keys, and column- and row-store addresses in the WT_REF structures + * referenced by a page that's being split. The WT_REF structures aren't moving, but the index + * references are moving from the page we're splitting to a set of new pages, and so we can no + * longer reference the block image that remains with the page being split. * - * No locking is required to update the WT_REF structure because we're - * the only thread splitting the page, and there's no way for readers - * to race with our updates of single pointers. The changes have to be - * written before the page goes away, of course, our caller owns that + * No locking is required to update the WT_REF structure because we're the only thread splitting + * the page, and there's no way for readers to race with our updates of single pointers. The + * changes have to be written before the page goes away, of course, our caller owns that * problem. */ if (from_home->type == WT_PAGE_ROW_INT) { @@ -336,22 +332,19 @@ __split_ref_prepare( locked = NULL; /* - * Update the moved WT_REFs so threads moving through them start looking - * at the created children's page index information. Because we've not - * yet updated the page index of the parent page into which we are going - * to split this subtree, a cursor moving through these WT_REFs will - * ascend into the created children, but eventually fail as that parent - * page won't yet know about the created children pages. That's OK, we - * spin there until the parent's page index is updated. 
+ * Update the moved WT_REFs so threads moving through them start looking at the created + * children's page index information. Because we've not yet updated the page index of the parent + * page into which we are going to split this subtree, a cursor moving through these WT_REFs + * will ascend into the created children, but eventually fail as that parent page won't yet know + * about the created children pages. That's OK, we spin there until the parent's page index is + * updated. * - * Lock the newly created page to ensure none of its children can split. - * First, to ensure all of the child pages are updated before any pages - * can split. Second, to ensure the original split completes before any - * of the children can split. The latter involves split generations: - * the original split page has references to these children. If they - * split immediately, they could free WT_REF structures based on split - * generations earlier than the split generation we'll eventually choose - * to protect the original split page's previous page index. + * Lock the newly created page to ensure none of its children can split. First, to ensure all of + * the child pages are updated before any pages can split. Second, to ensure the original split + * completes before any of the children can split. The latter involves split generations: the + * original split page has references to these children. If they split immediately, they could + * free WT_REF structures based on split generations earlier than the split generation we'll + * eventually choose to protect the original split page's previous page index. */ alloc = cnt = 0; for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { @@ -535,16 +528,13 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) __wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_2); /* - * Get a generation for this split, mark the root page. This must be - * after the new index is swapped into place in order to know that no - * readers are looking at the old index. + * Get a generation for this split, mark the root page. This must be after the new index is + * swapped into place in order to know that no readers are looking at the old index. * - * Note: as the root page cannot currently be evicted, the root split - * generation isn't ever used. That said, it future proofs eviction - * and isn't expensive enough to special-case. + * Note: as the root page cannot currently be evicted, the root split generation isn't ever + * used. That said, it future proofs eviction and isn't expensive enough to special-case. * - * Getting a new split generation implies a full barrier, no additional - * barrier is needed. + * Getting a new split generation implies a full barrier, no additional barrier is needed. */ split_gen = __wt_gen_next(session, WT_GEN_SPLIT); root->pg_intl_split_gen = split_gen; @@ -561,14 +551,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) complete = WT_ERR_IGNORE; /* - * We can't free the previous root's index, there may be threads using - * it. Add to the session's discard list, to be freed once we know no - * threads can still be using it. + * We can't free the previous root's index, there may be threads using it. Add to the session's + * discard list, to be freed once we know no threads can still be using it. * - * This change requires care with error handling: we have already - * updated the page with a new index. Even if stashing the old value - * fails, we don't roll back that change, because threads may already - * be using the new index. 
+ * This change requires care with error handling: we have already updated the page with a new + * index. Even if stashing the old value fails, we don't roll back that change, because threads + * may already be using the new index. */ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); @@ -644,14 +632,13 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t parent_entries = pindex->entries; /* - * Remove any refs to deleted pages while we are splitting, we have the - * internal page locked down, and are copying the refs into a new array - * anyway. Switch them to the special split state, so that any reading - * thread will restart. + * Remove any refs to deleted pages while we are splitting: we have the internal page locked + * down, and are copying the refs into a new array anyway. Switch them to the special split + * state, so that any reading thread will restart. * - * We can't do this if there is a sync running in the tree in another - * session: removing the refs frees the blocks for the deleted pages, - * which can corrupt the free list calculated by the sync. + * We can't do this if there is a sync running in the tree in another session: removing the refs + * frees the blocks for the deleted pages, which can corrupt the free list calculated by the + * sync. */ WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); for (deleted_entries = 0, i = 0; i < parent_entries; ++i) { @@ -687,14 +674,12 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t } /* - * Allocate and initialize a new page index array for the parent, then - * copy references from the original index array, plus references from - * the newly created split array, into place. + * Allocate and initialize a new page index array for the parent, then copy references from the + * original index array, plus references from the newly created split array, into place. * - * Update the WT_REF's page-index hint as we go. This can race with a - * thread setting the hint based on an older page-index, and the change - * isn't backed out in the case of an error, so there ways for the hint - * to be wrong; OK because it's just a hint. + * Update the WT_REF's page-index hint as we go. This can race with a thread setting the hint + * based on an older page-index, and the change isn't backed out in the case of an error, so + * there are ways for the hint to be wrong; OK because it's just a hint. */ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); @@ -737,12 +722,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t __wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_4); /* - * Get a generation for this split, mark the page. This must be after - * the new index is swapped into place in order to know that no readers - * are looking at the old index. + * Get a generation for this split, mark the page. This must be after the new index is swapped + * into place in order to know that no readers are looking at the old index. * - * Getting a new split generation implies a full barrier, no additional - * barrier is needed. + * Getting a new split generation implies a full barrier, no additional barrier is needed.
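The full-barrier claim can be pictured with a sequentially consistent counter. A loose C11 analogue with illustrative names (gen_next, gen_enter); the real code goes through WiredTiger's generation machinery (__wt_gen_next):

    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic uint64_t split_gen;

    /* Writer: swap the new page index into place, then bump the generation.
     * The seq_cst read-modify-write is a full barrier, so the index swap is
     * globally visible before the new generation number is observed. */
    static uint64_t
    gen_next(void)
    {
        return (atomic_fetch_add(&split_gen, 1) + 1);
    }

    /* Reader: snapshot the generation before walking an index; memory from
     * a retired index is freed only after every active snapshot has moved
     * past the generation in force when the index was retired. */
    static uint64_t
    gen_enter(void)
    {
        return (atomic_load(&split_gen));
    }

That deferred reclamation is what the __split_safe_free calls in these hunks implement.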
*/ split_gen = __wt_gen_next(session, WT_GEN_SPLIT); parent->pg_intl_split_gen = split_gen; @@ -798,18 +781,14 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t WT_ASSERT(session, next_ref->state == WT_REF_SPLIT); /* - * We set the WT_REF to split, discard it, freeing any resources - * it holds. + * We set the WT_REF to split, discard it, freeing any resources it holds. * - * Row-store trees where the old version of the page is being - * discarded: the previous parent page's key for this child page - * may have been an on-page overflow key. In that case, if the - * key hasn't been deleted, delete it now, including its backing - * blocks. We are exchanging the WT_REF that referenced it for - * the split page WT_REFs and their keys, and there's no longer - * any reference to it. Done after completing the split (if we - * failed, we'd leak the underlying blocks, but the parent page - * would be unaffected). + * Row-store trees where the old version of the page is being discarded: the previous parent + * page's key for this child page may have been an on-page overflow key. In that case, if + * the key hasn't been deleted, delete it now, including its backing blocks. We are + * exchanging the WT_REF that referenced it for the split page WT_REFs and their keys, and + * there's no longer any reference to it. Done after completing the split (if we failed, + * we'd leak the underlying blocks, but the parent page would be unaffected). */ if (parent->type == WT_PAGE_ROW_INT) { WT_TRET(__split_ovfl_key_cleanup(session, parent, next_ref)); @@ -955,14 +934,12 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) (void *)page, pindex->entries, children, (void *)parent); /* - * Ideally, we'd discard the original page, but that's hard since other - * threads of control are using it (for example, if eviction is walking - * the tree and looking at the page.) Instead, perform a right-split, - * moving all except the first chunk of the page's WT_REF objects to new + * Ideally, we'd discard the original page, but that's hard since other threads of control are + * using it (for example, if eviction is walking the tree and looking at the page.) Instead, + * perform a right-split, moving all except the first chunk of the page's WT_REF objects to new * pages. * - * Create and initialize a replacement WT_PAGE_INDEX for the original - * page. + * Create and initialize a replacement WT_PAGE_INDEX for the original page. */ size = sizeof(WT_PAGE_INDEX) + chunk * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &replace_index)); @@ -973,11 +950,11 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) replace_index->index[i] = *page_refp++; /* - * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted - * into the page's parent, replacing the page's page-index. + * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted into the page's parent, + * replacing the page's page-index. * - * The first slot of the new WT_PAGE_INDEX is the original page WT_REF. - * The remainder of the slots are allocated WT_REFs. + * The first slot of the new WT_PAGE_INDEX is the original page WT_REF. The remainder of the + * slots are allocated WT_REFs. 
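The allocation idiom in this hunk, a header plus a trailing pointer array sized as sizeof(header) + entries * sizeof(pointer), maps naturally onto a C flexible array member. A minimal sketch with illustrative types, not the real WT_PAGE_INDEX or WT_REF:

    #include <stdint.h>
    #include <stdlib.h>

    struct ref; /* Opaque stand-in for WT_REF. */

    struct page_index {
        uint32_t entries;
        struct ref *index[]; /* Flexible array member. */
    };

    /* Allocate an index with "children" slots; slot 0 holds the original
     * page's ref, the caller fills the remaining slots with newly
     * allocated refs. */
    static struct page_index *
    page_index_alloc(struct ref *orig, uint32_t children)
    {
        struct page_index *pindex;

        if (children == 0)
            return (NULL);
        pindex = calloc(1, sizeof(struct page_index) + children * sizeof(struct ref *));
        if (pindex == NULL)
            return (NULL);
        pindex->entries = children;
        pindex->index[0] = orig;
        return (pindex);
    }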
*/ size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); @@ -1073,12 +1050,10 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_6); /* - * Get a generation for this split, mark the parent page. This must be - * after the new index is swapped into place in order to know that no - * readers are looking at the old index. + * Get a generation for this split, mark the parent page. This must be after the new index is + * swapped into place in order to know that no readers are looking at the old index. * - * Getting a new split generation implies a full barrier, no additional - * barrier is needed. + * Getting a new split generation implies a full barrier, no additional barrier is needed. */ split_gen = __wt_gen_next(session, WT_GEN_SPLIT); page->pg_intl_split_gen = split_gen; @@ -1101,14 +1076,12 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __wt_free(session, alloc_index); /* - * We can't free the previous page's index, there may be threads using - * it. Add to the session's discard list, to be freed once we know no - * threads can still be using it. + * We can't free the previous page's index, there may be threads using it. Add to the session's + * discard list, to be freed once we know no threads can still be using it. * - * This change requires care with error handling, we've already updated - * the parent page. Even if stashing the old value fails, we don't roll - * back that change, because threads may already be using the new parent - * page. + * This change requires care with error handling, we've already updated the parent page. Even if + * stashing the old value fails, we don't roll back that change, because threads may already be + * using the new parent page. */ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); @@ -1132,22 +1105,18 @@ err: __wt_free(session, replace_index); /* - * The alloc-index variable is the array of new WT_REF entries - * intended to be inserted into the page being split's parent. + * The alloc-index variable is the array of new WT_REF entries intended to be inserted into + * the page being split's parent. * - * Except for the first slot (the original page's WT_REF), it's - * an array of newly allocated combined WT_PAGE_INDEX and WT_REF - * structures, each of which references a newly allocated (and - * modified) child page, each of which references an index of - * WT_REFs from the page being split. Free everything except for - * slot 1 and the WT_REFs in the child page indexes. + * Except for the first slot (the original page's WT_REF), it's an array of newly allocated + * combined WT_PAGE_INDEX and WT_REF structures, each of which references a newly allocated + * (and modified) child page, each of which references an index of WT_REFs from the page + * being split. Free everything except for slot 1 and the WT_REFs in the child page indexes. * - * First, skip slot 1. Second, we want to free all of the child - * pages referenced from the alloc-index array, but we can't - * just call the usual discard function because the WT_REFs - * referenced by the child pages remain referenced by the - * original page, after error. For each entry, free the child - * page's page index (so the underlying page-free function will + * First, skip slot 1. 
Second, we want to free all of the child pages referenced from the + * alloc-index array, but we can't just call the usual discard function because the WT_REFs + * referenced by the child pages remain referenced by the original page, after error. For + * each entry, free the child page's page index (so the underlying page-free function will * ignore it), then call the general-purpose discard function. */ if (alloc_index == NULL) @@ -1204,19 +1173,15 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, WT_PA return (__wt_set_return(session, EBUSY)); /* - * Get a page-level lock on the parent to single-thread splits into the - * page because we need to single-thread sizing/growing the page index. - * It's OK to queue up multiple splits as the child pages split, but the - * actual split into the parent has to be serialized. Note we allocate - * memory inside of the lock and may want to invest effort in making the - * locked period shorter. + * Get a page-level lock on the parent to single-thread splits into the page because we need to + * single-thread sizing/growing the page index. It's OK to queue up multiple splits as the child + * pages split, but the actual split into the parent has to be serialized. Note we allocate + * memory inside of the lock and may want to invest effort in making the locked period shorter. * - * We use the reconciliation lock here because not only do we have to - * single-thread the split, we have to lock out reconciliation of the - * parent because reconciliation of the parent can't deal with finding - * a split child during internal page traversal. Basically, there's no - * reason to use a different lock if we have to block reconciliation - * anyway. + * We use the reconciliation lock here because not only do we have to single-thread the split, + * we have to lock out reconciliation of the parent because reconciliation of the parent can't + * deal with finding a split child during internal page traversal. Basically, there's no reason + * to use a different lock if we have to block reconciliation anyway. */ for (;;) { parent = ref->home; @@ -1326,20 +1291,18 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page) } /* - * Page splits trickle up the tree, that is, as leaf pages grow large - * enough and are evicted, they'll split into their parent. And, as - * that parent page grows large enough and is evicted, it splits into - * its parent and so on. When the page split wave reaches the root, - * the tree will permanently deepen as multiple root pages are written. + * Page splits trickle up the tree, that is, as leaf pages grow large enough and are evicted, + * they'll split into their parent. And, as that parent page grows large enough and is evicted, + * it splits into its parent and so on. When the page split wave reaches the root, the tree will + * permanently deepen as multiple root pages are written. * - * However, this only helps if internal pages are evicted (and we resist - * evicting internal pages for obvious reasons), or if the tree were to - * be closed and re-opened from a disk image, which may be a rare event. + * However, this only helps if internal pages are evicted (and we resist evicting internal pages + * for obvious reasons), or if the tree were to be closed and re-opened from a disk image, which + * may be a rare event. * - * To avoid internal pages becoming too large absent eviction, check - * parent pages each time pages are split into them. 
If the page is big - * enough, either split the page into its parent or, in the case of the - * root, deepen the tree. + * To avoid internal pages becoming too large absent eviction, check parent pages each time + * pages are split into them. If the page is big enough, either split the page into its parent + * or, in the case of the root, deepen the tree. * * Split up the tree. */ @@ -1439,19 +1402,17 @@ __split_multi_inmem(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT WT_ASSERT(session, orig->type != WT_PAGE_COL_VAR || ref->ref_recno != 0); /* - * This code re-creates an in-memory page from a disk image, and adds - * references to any unresolved update chains to the new page. We get - * here either because an update could not be written when evicting a - * page, or eviction chose to keep a page in memory. + * This code re-creates an in-memory page from a disk image, and adds references to any + * unresolved update chains to the new page. We get here either because an update could not be + * written when evicting a page, or eviction chose to keep a page in memory. * - * Reconciliation won't create a disk image with entries the running - * database no longer cares about (at least, not based on the current - * tests we're performing), ignore the validity window. + * Reconciliation won't create a disk image with entries the running database no longer cares + * about (at least, not based on the current tests we're performing), ignore the validity + * window. * - * Steal the disk image and link the page into the passed-in WT_REF to - * simplify error handling: our caller will not discard the disk image - * when discarding the original page, and our caller will discard the - * allocated page on error, when discarding the allocated WT_REF. + * Steal the disk image and link the page into the passed-in WT_REF to simplify error handling: + * our caller will not discard the disk image when discarding the original page, and our caller + * will discard the allocated page on error, when discarding the allocated WT_REF. */ WT_RET(__wt_page_inmem(session, ref, multi->disk_image, WT_PAGE_DISK_ALLOC, false, &page)); multi->disk_image = NULL; @@ -1591,14 +1552,13 @@ static void __split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref) { /* - * We failed creating new in-memory pages. For error-handling reasons, - * we've left the update chains referenced by both the original and - * new pages. Discard the newly allocated WT_REF structures and their - * pages (setting a flag so the discard code doesn't discard the updates - * on the page). + * We failed creating new in-memory pages. For error-handling reasons, we've left the update + * chains referenced by both the original and new pages. Discard the newly allocated WT_REF + * structures and their pages (setting a flag so the discard code doesn't discard the updates on + * the page). * - * Our callers allocate WT_REF arrays, then individual WT_REFs, check - * for uninitialized information. + * Our callers allocate WT_REF arrays, then individual WT_REFs, check for uninitialized + * information. */ if (ref != NULL) { if (ref->page != NULL) @@ -1666,9 +1626,9 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_R /* * If there's an address, the page was written, set it. * - * Copy the address: we could simply take the buffer, but that would - * complicate error handling, freeing the reference array would have - * to avoid freeing the memory, and it's not worth the confusion. 
+ * Copy the address: we could simply take the buffer, but that would complicate error handling, + * freeing the reference array would have to avoid freeing the memory, and it's not worth the + * confusion. */ if (multi->addr.addr != NULL) { WT_RET(__wt_calloc_one(session, &addr)); @@ -1702,8 +1662,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_R } /* - * If we have a disk image and we're not closing the file, - * re-instantiate the page. + * If we have a disk image and we're not closing the file, re-instantiate the page. * * Discard any page image we don't use. */ @@ -1742,9 +1701,9 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) type = page->type; /* - * Assert splitting makes sense; specifically assert the page is dirty, - * we depend on that, otherwise the page might be evicted based on its - * last reconciliation which no longer matches reality after the split. + * Assert splitting makes sense; specifically assert the page is dirty, we depend on that, + * otherwise the page might be evicted based on its last reconciliation which no longer matches + * reality after the split. * * Note this page has already been through an in-memory split. */ @@ -1761,16 +1720,14 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) moved_ins = WT_SKIP_LAST(ins_head); /* - * The first page in the split is the current page, but we still have - * to create a replacement WT_REF, the original WT_REF will be set to - * split status and eventually freed. + * The first page in the split is the current page, but we still have to create a replacement + * WT_REF, the original WT_REF will be set to split status and eventually freed. * - * The new WT_REF is not quite identical: we have to instantiate a key, - * and the new reference is visible to readers once the split completes. + * The new WT_REF is not quite identical: we have to instantiate a key, and the new reference is + * visible to readers once the split completes. * - * Don't copy any deleted page state: we may be splitting a page that - * was instantiated after a truncate and that history should not be - * carried onto these new child pages. + * Don't copy any deleted page state: we may be splitting a page that was instantiated after a + * truncate and that history should not be carried onto these new child pages. */ WT_ERR(__wt_calloc_one(session, &split_ref[0])); parent_incr += sizeof(WT_REF); @@ -1869,12 +1826,10 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_MEM_TRANSFER(page_decr, right_incr, __wt_update_list_memsize(moved_ins->upd)); /* - * Move the last insert list item from the original page to the new - * page. + * Move the last insert list item from the original page to the new page. * - * First, update the item to the new child page. (Just append the entry - * for simplicity, the previous skip list pointers originally allocated - * can be ignored.) + * First, update the item to the new child page. (Just append the entry for simplicity, the + * previous skip list pointers originally allocated can be ignored.) */ tmp_ins_head = type == WT_PAGE_ROW_LEAF ? right->modify->mod_row_insert[0] : right->modify->mod_col_append[0]; @@ -1990,14 +1945,13 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) page->modify->mod_col_split_recno = WT_RECNO_OOB; /* - * Clear the allocated page's reference to the moved insert list element - * so it's not freed when we discard the page. 
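
The "just append the entry for simplicity" move in __split_insert can be sketched as follows; toy_ins and toy_list are hypothetical single-level stand-ins for WT_INSERT and its skiplist head, and the real code manages multiple skiplist levels and memory accounting that this omits.

    #include <stdio.h>

    struct toy_ins {
        int recno;
        struct toy_ins *next;
    };

    struct toy_list {
        struct toy_ins *head, *tail;
    };

    /* Append at level 0, ignoring whatever skip pointers were allocated. */
    static void
    toy_append_level0(struct toy_list *list, struct toy_ins *ins)
    {
        ins->next = NULL; /* stale skip pointers are dropped */
        if (list->tail == NULL)
            list->head = ins;
        else
            list->tail->next = ins;
        list->tail = ins;
    }

    int
    main(void)
    {
        struct toy_list right = {NULL, NULL};
        struct toy_ins moved = {42, NULL};

        toy_append_level0(&right, &moved);
        printf("first recno on new page: %d\n", right.head->recno); /* 42 */
        return (0);
    }
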
+ * Clear the allocated page's reference to the moved insert list element so it's not freed when + * we discard the page. * - * Move the element back to the original page list. For simplicity, the - * previous skip list pointers originally allocated can be ignored, just - * append the entry to the end of the level 0 list. As before, we depend - * on the list having multiple elements and ignore the edge cases small - * lists have. + * Move the element back to the original page list. For simplicity, the previous skip list + * pointers originally allocated can be ignored, just append the entry to the end of the level 0 + * list. As before, we depend on the list having multiple elements and ignore the edge cases + * small lists have. */ if (type == WT_PAGE_ROW_LEAF) right->modify->mod_row_insert[0]->head[0] = right->modify->mod_row_insert[0]->tail[0] = @@ -2123,8 +2077,7 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) /* * The split succeeded, we can no longer fail. * - * Finalize the move, discarding moved update lists from the original - * page. + * Finalize the move, discarding moved update lists from the original page. */ for (i = 0; i < new_entries; ++i) __split_multi_inmem_final(page, &mod->mod_multi[i]); @@ -2241,18 +2194,16 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi) __wt_verbose(session, WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref); /* - * This isn't a split: a reconciliation failed because we couldn't write - * something, and in the case of forced eviction, we need to stop this - * page from being such a problem. We have exclusive access, rewrite the - * page in memory. The code lives here because the split code knows how - * to re-create a page in memory after it's been reconciled, and that's - * exactly what we want to do. + * This isn't a split: a reconciliation failed because we couldn't write something, and in the + * case of forced eviction, we need to stop this page from being such a problem. We have + * exclusive access, rewrite the page in memory. The code lives here because the split code + * knows how to re-create a page in memory after it's been reconciled, and that's exactly what + * we want to do. * * Build the new page. * - * Allocate a WT_REF, the error path calls routines that free memory. - * The only field we need to set is the record number, as it's used by - * the search routines. + * Allocate a WT_REF, the error path calls routines that free memory. The only field we need to + * set is the record number, as it's used by the search routines. */ WT_RET(__wt_calloc_one(session, &new)); new->ref_recno = ref->ref_recno; @@ -2262,19 +2213,17 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi) /* * The rewrite succeeded, we can no longer fail. * - * Finalize the move, discarding moved update lists from the original - * page. + * Finalize the move, discarding moved update lists from the original page. */ __split_multi_inmem_final(page, multi); /* * Discard the original page. * - * Pages with unresolved changes are not marked clean during - * reconciliation, do it now. + * Pages with unresolved changes are not marked clean during reconciliation, do it now. * - * Don't count this as eviction making progress, we did a one-for-one - * rewrite of a page in memory, typical in the case of cache pressure. + * Don't count this as eviction making progress, we did a one-for-one rewrite of a page in + * memory, typical in the case of cache pressure. 
*/ __wt_page_modify_clear(session, page); F_SET_ATOMIC(page, WT_PAGE_EVICT_NO_PROGRESS); diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c index 5873e611189..faf0fc0e7ac 100644 --- a/src/third_party/wiredtiger/src/btree/bt_stat.c +++ b/src/third_party/wiredtiger/src/btree/bt_stat.c @@ -290,13 +290,11 @@ __stat_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **st } /* - * Overflow keys are hard: we have to walk the disk image to count them, - * the in-memory representation of the page doesn't necessarily contain - * a reference to the original cell. + * Overflow keys are hard: we have to walk the disk image to count them, the in-memory + * representation of the page doesn't necessarily contain a reference to the original cell. * - * Zero-length values are the same, we have to look at the disk image to - * know. They aren't stored but we know they exist if there are two keys - * in a row, or a key as the last item. + * Zero-length values are the same, we have to look at the disk image to know. They aren't + * stored but we know they exist if there are two keys in a row, or a key as the last item. */ if (page->dsk != NULL) { key = false; diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index a988793e6e7..6ede60b97e0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -41,16 +41,14 @@ __sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page) return (false); /* - * The problematic case is when a page was evicted but when there were - * unresolved updates and not every block associated with the page has - * a disk address. We can't skip such pages because we need a checkpoint - * write with valid addresses. + * The problematic case is when a page was evicted but when there were unresolved updates and + * not every block associated with the page has a disk address. We can't skip such pages because + * we need a checkpoint write with valid addresses. * - * The page's modification information can change underfoot if the page - * is being reconciled, so we'd normally serialize with reconciliation - * before reviewing page-modification information. However, checkpoint - * is the only valid writer of dirty leaf pages at this point, we skip - * the lock. + * The page's modification information can change underfoot if the page is being reconciled, so + * we'd normally serialize with reconciliation before reviewing page-modification information. + * However, checkpoint is the only valid writer of dirty leaf pages at this point, we skip the + * lock. */ if (mod->rec_result == WT_PM_REC_MULTIBLOCK) for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) @@ -152,9 +150,8 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) /* * Write all immediately available, dirty in-cache leaf pages. * - * Writing the leaf pages is done without acquiring a high-level - * lock, serialize so multiple threads don't walk the tree at - * the same time. + * Writing the leaf pages is done without acquiring a high-level lock, serialize so multiple + * threads don't walk the tree at the same time. 
*/ if (!btree->modified) return (0); @@ -195,27 +192,23 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) break; case WT_SYNC_CHECKPOINT: /* - * If we are flushing a file at read-committed isolation, which - * is of particular interest for flushing the metadata to make - * a schema-changing operation durable, get a transactional - * snapshot now. + * If we are flushing a file at read-committed isolation, which is of particular interest + * for flushing the metadata to make a schema-changing operation durable, get a + * transactional snapshot now. * - * All changes committed up to this point should be included. - * We don't update the snapshot in between pages because the - * metadata shouldn't have many pages. Instead, read-committed - * isolation ensures that all metadata updates completed before - * the checkpoint are included. + * All changes committed up to this point should be included. We don't update the snapshot + * in between pages because the metadata shouldn't have many pages. Instead, read-committed + * isolation ensures that all metadata updates completed before the checkpoint are included. */ if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); /* - * We cannot check the tree modified flag in the case of a - * checkpoint, the checkpoint code has already cleared it. + * We cannot check the tree modified flag in the case of a checkpoint, the checkpoint code + * has already cleared it. * - * Writing the leaf pages is done without acquiring a high-level - * lock, serialize so multiple threads don't walk the tree at - * the same time. We're holding the schema lock, but need the + * Writing the leaf pages is done without acquiring a high-level lock, serialize so multiple + * threads don't walk the tree at the same time. We're holding the schema lock, but need the * lower-level lock as well. */ __wt_spin_lock(session, &btree->flush_lock); @@ -284,21 +277,17 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) } /* - * If the page was pulled into cache by our read, try - * to evict it now. + * If the page was pulled into cache by our read, try to evict it now. * - * For eviction to have a chance, we first need to move - * the walk point to the next page checkpoint will - * visit. We want to avoid this code being too special - * purpose, so try to reuse the ordinary eviction path. + * For eviction to have a chance, we first need to move the walk point to the next page + * checkpoint will visit. We want to avoid this code being too special purpose, so try + * to reuse the ordinary eviction path. * - * Regardless of whether eviction succeeds or fails, - * the walk continues from the previous location. We - * remember whether we tried eviction, and don't try - * again. Even if eviction fails (the page may stay in - * cache clean but with history that cannot be - * discarded), that is not wasted effort because - * checkpoint doesn't need to write the page again. + * Regardless of whether eviction succeeds or fails, the walk continues from the + * previous location. We remember whether we tried eviction, and don't try again. Even + * if eviction fails (the page may stay in cache clean but with history that cannot be + * discarded), that is not wasted effort because checkpoint doesn't need to write the + * page again. 
*/ if (!WT_PAGE_IS_INTERNAL(page) && page->read_gen == WT_READGEN_WONT_NEED && !tried_eviction) { diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index f1aed89572a..7685547b351 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -537,12 +537,10 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *addr_unpack entry = 0; WT_INTL_FOREACH_BEGIN (session, page, child_ref) { /* - * It's a depth-first traversal: this entry's starting - * key should be larger than the largest key previously - * reviewed. + * It's a depth-first traversal: this entry's starting key should be larger than the + * largest key previously reviewed. * - * The 0th key of any internal page is magic, and we - * can't test against it. + * The 0th key of any internal page is magic, and we can't test against it. */ ++entry; if (entry != 1) @@ -638,12 +636,10 @@ __verify_row_leaf_key_order(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs /* * Compare the key against the largest key we've seen so far. * - * If we're comparing against a key taken from an internal page, - * we can compare equal (which is an expected path, the internal - * page key is often a copy of the leaf page's first key). But, - * in the case of the 0th slot on an internal page, the last key - * we've seen was a key from a previous leaf page, and it's not - * OK to compare equally in that case. + * If we're comparing against a key taken from an internal page, we can compare equal (which + * is an expected path, the internal page key is often a copy of the leaf page's first key). + * But, in the case of the 0th slot on an internal page, the last key we've seen was a key + * from a previous leaf page, and it's not OK to compare equally in that case. */ WT_RET(__wt_compare(session, btree->collator, vs->tmp1, (WT_ITEM *)vs->max_key, &cmp)); if (cmp < 0) diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c index 0e4bbf2f92d..2d6654ebd43 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c @@ -264,14 +264,13 @@ __verify_dsk_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, uint32_t char ts_string[2][WT_TS_INT_STRING_SIZE]; /* - * Check timestamp and transaction order, and optionally against parent - * values. Timestamps and transactions in the parent address aren't - * necessarily an exact match, but should be within the boundaries of - * the parent's information. + * Check timestamp and transaction order, and optionally against parent values. Timestamps and + * transactions in the parent address aren't necessarily an exact match, but should be within + * the boundaries of the parent's information. * - * There's no checking if validity information should appear on a page - * because the cell-unpacking code hides it by always returning durable - * values if they don't appear on the page. + * There's no checking if validity information should appear on a page because the + * cell-unpacking code hides it by always returning durable values if they don't appear on the + * page. */ switch (unpack->type) { case WT_CELL_ADDR_DEL: @@ -507,8 +506,7 @@ __verify_dsk_row( /* * Prefix compression checks. * - * Confirm the first non-overflow key on a page has a zero - * prefix compression count. + * Confirm the first non-overflow key on a page has a zero prefix compression count. 
*/ prefix = unpack->prefix; if (last_pfx->size == 0 && prefix != 0) @@ -563,9 +561,8 @@ __verify_dsk_row( /* * Compare the current key against the last key. * - * Be careful about the 0th key on internal pages: we only store - * the first byte and custom collators may not be able to handle - * truncated keys. + * Be careful about the 0th key on internal pages: we only store the first byte and custom + * collators may not be able to handle truncated keys. */ if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) || (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) { diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index f6cc0267a72..d1efbb2533d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -34,15 +34,12 @@ __ref_index_slot(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp, entries = pindex->entries; /* - * Use the page's reference hint: it should be correct unless - * there was a split or delete in the parent before our slot. - * If the hint is wrong, it can be either too big or too small, - * but often only by a small amount. Search up and down the - * index starting from the hint. + * Use the page's reference hint: it should be correct unless there was a split or delete in + * the parent before our slot. If the hint is wrong, it can be either too big or too small, + * but often only by a small amount. Search up and down the index starting from the hint. * - * It's not an error for the reference hint to be wrong, it - * just means the first retrieval (which sets the hint for - * subsequent retrievals), is slower. + * It's not an error for the reference hint to be wrong, it just means the first retrieval + * (which sets the hint for subsequent retrievals), is slower. */ slot = ref->pindex_hint; if (slot >= entries) @@ -175,28 +172,25 @@ __split_prev_race(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp WT_PAGE_INDEX *pindex; /* - * Handle a cursor moving backwards through the tree or setting up at - * the end of the tree. We're passed the child page into which we're - * descending, and the parent page's page-index we used to find that - * child page. + * Handle a cursor moving backwards through the tree or setting up at the end of the tree. We're + * passed the child page into which we're descending, and the parent page's page-index we used + * to find that child page. * - * When splitting an internal page into its parent, we move the split - * pages WT_REF structures, then update the parent's page index, then - * update the split page's page index, and nothing is atomic. A thread - * can read the parent page's replacement page index and then the split - * page's original index, or vice-versa, and either change can cause a - * cursor moving backwards through the tree to skip pages. + * When splitting an internal page into its parent, we move the split pages WT_REF structures, + * then update the parent's page index, then update the split page's page index, and nothing is + * atomic. A thread can read the parent page's replacement page index and then the split page's + * original index, or vice-versa, and either change can cause a cursor moving backwards through + * the tree to skip pages. 
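
A rough sketch of the hint-fanning search described in __ref_index_slot, under the simplifying assumption that the index array is stable while we search (the real function re-reads the page index and retries when it changes underfoot). toy_index_slot is a hypothetical name, not the WiredTiger function.

    #include <stddef.h>
    #include <stdio.h>

    /* Find "ref" in "index", fanning out from a possibly stale hint. */
    static int
    toy_index_slot(void **index, size_t entries, void *ref, size_t hint, size_t *slotp)
    {
        size_t down, up;

        if (entries == 0)
            return (-1);
        if (hint >= entries)
            hint = entries - 1;
        if (index[hint] == ref) {
            *slotp = hint;
            return (0);
        }
        /* A wrong hint is usually off by only a small amount. */
        for (down = hint, up = hint + 1;;) {
            if (down > 0 && index[--down] == ref) {
                *slotp = down;
                return (0);
            }
            if (up < entries && index[up] == ref) {
                *slotp = up;
                return (0);
            }
            if (++up > entries && down == 0)
                return (-1); /* not found: caller retries with a fresh index */
        }
    }

    int
    main(void)
    {
        int a, b, c;
        void *index[] = {&a, &b, &c};
        size_t slot;

        if (toy_index_slot(index, 3, &c, 0, &slot) == 0)
            printf("found at slot %zu\n", slot); /* 2 */
        return (0);
    }
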
* - * This isn't a problem for a cursor setting up at the start of the tree - * or moving forward through the tree because we do right-hand splits on - * internal pages and the initial part of the split page's namespace - * won't change as part of a split (in other words, a thread reading the - * parent page's and split page's indexes will move to the same slot no - * matter what order of indexes are read. + * This isn't a problem for a cursor setting up at the start of the tree or moving forward + * through the tree because we do right-hand splits on internal pages and the initial part of + * the split page's namespace won't change as part of a split (in other words, a thread reading + * the parent page's and split page's indexes will move to the same slot no matter what order of + * indexes are read. * - * Acquire the child's page index, then confirm the parent's page index - * hasn't changed, to check for reading an old version of the parent's - * page index and then reading a new version of the child's page index. + * Acquire the child's page index, then confirm the parent's page index hasn't changed, to check + * for reading an old version of the parent's page index and then reading a new version of the + * child's page index. */ WT_INTL_INDEX_GET(session, ref->page, pindex); if (__wt_split_descent_race(session, ref, *pindexp)) @@ -406,16 +400,13 @@ restart: for (;;) { /* - * Swap our previous hazard pointer for the page - * we'll return. + * Swap our previous hazard pointer for the page we'll return. * - * Not-found is an expected return, as eviction - * might have been attempted. The page can't be - * evicted, we're holding a hazard pointer on a - * child, spin until we're successful. + * Not-found is an expected return, as eviction might have been attempted. The page + * can't be evicted, we're holding a hazard pointer on a child, spin until we're + * successful. * - * Restart is not expected, our parent WT_REF - * should not have split. + * Restart is not expected, our parent WT_REF should not have split. */ ret = __wt_page_swap(session, couple, ref, WT_READ_NOTFOUND_OK | flags); if (ret == 0) { @@ -528,8 +519,8 @@ descend: } /* - * Not-found is an expected return when walking only - * in-cache pages, or if we see a deleted page. + * Not-found is an expected return when walking only in-cache pages, or if we see a + * deleted page. * * An expected error, so "couple" is unchanged. */ @@ -540,8 +531,7 @@ descend: } /* - * The page we're moving to might have split, in which - * case restart the movement. + * The page we're moving to might have split, in which case restart the movement. * * An expected error, so "couple" is unchanged. */ diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c index 8bbda44d706..8a6c6e8aa2e 100644 --- a/src/third_party/wiredtiger/src/btree/col_modify.c +++ b/src/third_party/wiredtiger/src/btree/col_modify.c @@ -109,12 +109,11 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, /* * Delete, insert or update a column-store entry. * - * If modifying a previously modified record, cursor.ins will be set to - * point to the correct update list. Create a new update entry and link - * it into the existing list. + * If modifying a previously modified record, cursor.ins will be set to point to the correct + * update list. Create a new update entry and link it into the existing list. 
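
The first case in __wt_col_modify, linking a new update into an existing chain, amounts to a newest-first list prepend. Below is a self-contained toy version; toy_upd is hypothetical, and the real WT_UPDATE carries transaction and timestamp state, with the link published by a serialized function rather than a plain store.

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_upd {
        int value;
        struct toy_upd *next;
    };

    /* Prepend a new update: newest first, older history stays reachable. */
    static int
    toy_upd_prepend(struct toy_upd **headp, int value)
    {
        struct toy_upd *upd;

        if ((upd = malloc(sizeof(*upd))) == NULL)
            return (-1);
        upd->value = value;
        upd->next = *headp;
        *headp = upd;
        return (0);
    }

    int
    main(void)
    {
        struct toy_upd *chain = NULL;

        toy_upd_prepend(&chain, 1);
        toy_upd_prepend(&chain, 2);
        printf("newest value: %d\n", chain->value); /* 2 */
        return (0);
    }
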
* - * Else, allocate an insert array as necessary, build an insert/update - * structure pair, and link it into place. + * Else, allocate an insert array as necessary, build an insert/update structure pair, and link + * it into place. */ if (cbt->compare == 0 && cbt->ins != NULL) { /* @@ -190,17 +189,15 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, ins_size += upd_size; /* - * If there was no insert list during the search, or there was - * no search because the record number has not been allocated - * yet, the cursor's information cannot be correct, search + * If there was no insert list during the search, or there was no search because the record + * number has not been allocated yet, the cursor's information cannot be correct, search * couldn't have initialized it. * - * Otherwise, point the new WT_INSERT item's skiplist to the - * next elements in the insert list (which we will check are - * still valid inside the serialization function). + * Otherwise, point the new WT_INSERT item's skiplist to the next elements in the insert + * list (which we will check are still valid inside the serialization function). * - * The serial mutex acts as our memory barrier to flush these - * writes before inserting them into the list. + * The serial mutex acts as our memory barrier to flush these writes before inserting them + * into the list. */ if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB) for (i = 0; i < skipdepth; i++) { diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index f202dbd7f7b..160f19ffc2a 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -176,16 +176,13 @@ descend: WT_DIAGNOSTIC_YIELD; /* - * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search at the root. - * We cannot restart in the "current" page; for example, if a - * thread is appending to the tree, the page it's waiting for - * did an insert-split into the parent, then the parent split - * into its parent, the name space we are searching for may have - * moved above the current page in the tree. + * Swap the current page for the child page. If the page splits while we're retrieving it, + * restart the search at the root. We cannot restart in the "current" page; for example, if + * a thread is appending to the tree, the page it's waiting for did an insert-split into the + * parent, then the parent split into its parent, the name space we are searching for may + * have moved above the current page in the tree. * - * On other error, simply return, the swap call ensures we're - * holding nothing on failure. + * On other error, simply return, the swap call ensures we're holding nothing on failure. */ read_flags = WT_READ_RESTART_OK; if (F_ISSET(cbt, WT_CBT_READ_ONCE)) @@ -220,15 +217,13 @@ leaf_only: /* * Search the leaf page. * - * Search after a page is pinned does a search of the pinned page before - * doing a full tree search, in which case we might be searching for a - * record logically before the page. Return failure, and there's nothing - * else to do, the record isn't going to be on this page. + * Search after a page is pinned does a search of the pinned page before doing a full tree + * search, in which case we might be searching for a record logically before the page. Return + * failure, and there's nothing else to do, the record isn't going to be on this page. 
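
The "serial mutex acts as our memory barrier" remark suggests a pattern like the following sketch: initialize the element's pointers before taking the lock, then re-validate and publish while holding it. This is a toy mutex-based list, not WiredTiger's skiplist serialization, and it assumes readers also synchronize on the same mutex; compile with -pthread.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct toy_node {
        int key;
        struct toy_node *next;
    };

    static struct toy_node *toy_head;
    static pthread_mutex_t toy_serial = PTHREAD_MUTEX_INITIALIZER;

    static int
    toy_insert(int key)
    {
        struct toy_node *node;

        if ((node = malloc(sizeof(*node))) == NULL)
            return (-1);
        node->key = key;
        node->next = toy_head;           /* set next pointers before locking */

        pthread_mutex_lock(&toy_serial); /* acquiring flushes the writes above */
        if (node->next != toy_head)      /* re-check: list changed underfoot */
            node->next = toy_head;
        toy_head = node;                 /* publish */
        pthread_mutex_unlock(&toy_serial);
        return (0);
    }

    int
    main(void)
    {
        toy_insert(1);
        toy_insert(2);
        printf("head key: %d\n", toy_head->key); /* 2 */
        return (0);
    }
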
* - * We don't check inside the search path for a record greater than the - * maximum record in the tree; in that case, we get here with a record - * that's impossibly large for the page. We do have additional setup to - * do in that case, the record may be appended to the page. + * We don't check inside the search path for a record greater than the maximum record in the + * tree; in that case, we get here with a record that's impossibly large for the page. We do + * have additional setup to do in that case, the record may be appended to the page. */ if (page->type == WT_PAGE_COL_FIX) { if (recno < current->ref_recno) { diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c index d278a5d0496..d0524dfe5a3 100644 --- a/src/third_party/wiredtiger/src/btree/row_key.c +++ b/src/third_party/wiredtiger/src/btree/row_key.c @@ -202,15 +202,12 @@ __wt_row_leaf_key_work( keyb->size = size; /* - * If this is the key we originally wanted, we don't - * care if we're rolling forward or backward, or if - * it's an overflow key or not, it's what we wanted. - * This shouldn't normally happen, the fast-path code - * that front-ends this function will have figured it - * out before we were called. + * If this is the key we originally wanted, we don't care if we're rolling forward or + * backward, or if it's an overflow key or not, it's what we wanted. This shouldn't + * normally happen, the fast-path code that front-ends this function will have figured + * it out before we were called. * - * The key doesn't need to be instantiated, skip past - * that test. + * The key doesn't need to be instantiated, skip past that test. */ if (slot_offset == 0) goto done; @@ -229,13 +226,11 @@ __wt_row_leaf_key_work( /* 2: the test for an instantiated off-page key. */ if (ikey != NULL) { /* - * If this is the key we originally wanted, we don't - * care if we're rolling forward or backward, or if - * it's an overflow key or not, it's what we wanted. - * Take a copy and wrap up. + * If this is the key we originally wanted, we don't care if we're rolling forward or + * backward, or if it's an overflow key or not, it's what we wanted. Take a copy and + * wrap up. * - * The key doesn't need to be instantiated, skip past - * that test. + * The key doesn't need to be instantiated, skip past that test. */ if (slot_offset == 0) { keyb->data = p; @@ -281,19 +276,15 @@ __wt_row_leaf_key_work( /* 3: the test for an on-page reference to an overflow key. */ if (unpack->type == WT_CELL_KEY_OVFL) { /* - * If this is the key we wanted from the start, we don't - * care if it's an overflow key, get a copy and wrap up. + * If this is the key we wanted from the start, we don't care if it's an overflow key, + * get a copy and wrap up. * - * Avoid racing with reconciliation deleting overflow - * keys. Deleted overflow keys must be instantiated - * first, acquire the overflow lock and check. Read - * the key if we still need to do so, but holding the - * overflow lock. Note we are not using the version of - * the cell-data-ref calls that acquire the overflow - * lock and do a look-aside into the tracking cache: - * this is an overflow key, not a value, meaning it's - * instantiated before being deleted, not copied into - * the tracking cache. + * Avoid racing with reconciliation deleting overflow keys. Deleted overflow keys must + * be instantiated first, acquire the overflow lock and check. Read the key if we still + * need to do so, but holding the overflow lock. 
Note we are not using the version of + * the cell-data-ref calls that acquire the overflow lock and do a look-aside into the + * tracking cache: this is an overflow key, not a value, meaning it's instantiated + * before being deleted, not copied into the tracking cache. */ if (slot_offset == 0) { __wt_readlock(session, &btree->ovfl_lock); @@ -362,16 +353,13 @@ __wt_row_leaf_key_work( */ if (direction == BACKWARD) { /* - * If there's a set of keys with identical prefixes, we - * don't want to instantiate each one, the prefixes are - * all the same. + * If there's a set of keys with identical prefixes, we don't want to instantiate each + * one, the prefixes are all the same. * - * As we roll backward through the page, track the last - * time the prefix decreased in size, so we can start - * with that key during our roll-forward. For a page - * populated with a single key prefix, we'll be able to - * instantiate the key we want as soon as we find a key - * without a prefix. + * As we roll backward through the page, track the last time the prefix decreased in + * size, so we can start with that key during our roll-forward. For a page populated + * with a single key prefix, we'll be able to instantiate the key we want as soon as we + * find a key without a prefix. */ if (slot_offset == 0) last_prefix = unpack->prefix; @@ -398,13 +386,11 @@ __wt_row_leaf_key_work( } /* - * Grow the buffer as necessary as well as ensure data - * has been copied into local buffer space, then append - * the suffix to the prefix already in the buffer. + * Grow the buffer as necessary as well as ensure data has been copied into local buffer + * space, then append the suffix to the prefix already in the buffer. * - * Don't grow the buffer unnecessarily or copy data we - * don't need, truncate the item's data length to the - * prefix bytes. + * Don't grow the buffer unnecessarily or copy data we don't need, truncate the item's + * data length to the prefix bytes. */ keyb->size = unpack->prefix; WT_ERR(__wt_buf_grow(session, keyb, keyb->size + size)); diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index 7298dee90a9..7a5b7fa2f91 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -68,13 +68,11 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, const WT_ITEM *k mod = page->modify; /* - * Modify: allocate an update array as necessary, build a WT_UPDATE - * structure, and call a serialized function to insert the WT_UPDATE - * structure. + * Modify: allocate an update array as necessary, build a WT_UPDATE structure, and call a + * serialized function to insert the WT_UPDATE structure. * - * Insert: allocate an insert array as necessary, build a WT_INSERT - * and WT_UPDATE structure pair, and call a serialized function to - * insert the WT_INSERT structure. + * Insert: allocate an insert array as necessary, build a WT_INSERT and WT_UPDATE structure + * pair, and call a serialized function to insert the WT_INSERT structure. */ if (cbt->compare == 0) { if (cbt->ins == NULL) { @@ -125,13 +123,11 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, const WT_ITEM *k /* * Allocate the insert array as necessary. * - * We allocate an additional insert array slot for insert keys - * sorting less than any key on the page. 
The test to select - * that slot is baroque: if the search returned the first page - * slot, we didn't end up processing an insert list, and the - * comparison value indicates the search key was smaller than - * the returned slot, then we're using the smallest-key insert - * slot. That's hard, so we set a flag. + * We allocate an additional insert array slot for insert keys sorting less than any key on + * the page. The test to select that slot is baroque: if the search returned the first page + * slot, we didn't end up processing an insert list, and the comparison value indicates the + * search key was smaller than the returned slot, then we're using the smallest-key insert + * slot. That's hard, so we set a flag. */ WT_PAGE_ALLOC_AND_SWAP(session, page, mod->mod_row_insert, ins_headp, page->entries + 1); @@ -167,16 +163,14 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, const WT_ITEM *k ins_size += upd_size; /* - * If there was no insert list during the search, the cursor's - * information cannot be correct, search couldn't have - * initialized it. + * If there was no insert list during the search, the cursor's information cannot be + * correct, search couldn't have initialized it. * - * Otherwise, point the new WT_INSERT item's skiplist to the - * next elements in the insert list (which we will check are - * still valid inside the serialization function). + * Otherwise, point the new WT_INSERT item's skiplist to the next elements in the insert + * list (which we will check are still valid inside the serialization function). * - * The serial mutex acts as our memory barrier to flush these - * writes before inserting them into the list. + * The serial mutex acts as our memory barrier to flush these writes before inserting them + * into the list. */ if (cbt->ins_stack[0] == NULL) for (i = 0; i < skipdepth; i++) { @@ -303,20 +297,17 @@ __wt_update_obsolete_check( oldest = txn_global->has_oldest_timestamp ? txn_global->oldest_timestamp : WT_TS_NONE; stable = txn_global->has_stable_timestamp ? txn_global->stable_timestamp : WT_TS_NONE; /* - * This function identifies obsolete updates, and truncates them from - * the rest of the chain; because this routine is called from inside - * a serialization function, the caller has responsibility for actually - * freeing the memory. + * This function identifies obsolete updates, and truncates them from the rest of the chain; + * because this routine is called from inside a serialization function, the caller has + * responsibility for actually freeing the memory. * * Walk the list of updates, looking for obsolete updates at the end. * - * Only updates with globally visible, self-contained data can terminate - * update chains. + * Only updates with globally visible, self-contained data can terminate update chains. * - * Birthmarks are a special case: once a birthmark becomes obsolete, it - * can be discarded and subsequent reads will see the on-page value (as - * expected). Inserting updates into the lookaside table relies on - * this behavior to avoid creating update chains with multiple + * Birthmarks are a special case: once a birthmark becomes obsolete, it can be discarded and + * subsequent reads will see the on-page value (as expected). Inserting updates into the + * lookaside table relies on this behavior to avoid creating update chains with multiple * birthmarks. 
*/ for (first = prev = NULL, count = 0; upd != NULL; prev = upd, upd = upd->next, count++) { diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 52057ad56b9..9b69c0aa9ed 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -30,10 +30,9 @@ __search_insert_append(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT if ((ins = WT_SKIP_LAST(ins_head)) == NULL) return (0); /* - * Since the head of the skip list doesn't get mutated within this - * function, the compiler may move this assignment above within the - * loop below if it needs to (and may read a different value on each - * loop due to other threads mutating the skip list). + * Since the head of the skip list doesn't get mutated within this function, the compiler may + * move this assignment above within the loop below if it needs to (and may read a different + * value on each loop due to other threads mutating the skip list). * * Place a read barrier here to avoid this issue. */ @@ -171,11 +170,10 @@ __check_leaf_key_range( return (0); /* - * Check if the search key is smaller than the parent's starting key for - * this page. + * Check if the search key is smaller than the parent's starting key for this page. * - * We can't compare against slot 0 on a row-store internal page because - * reconciliation doesn't build it, it may not be a valid key. + * We can't compare against slot 0 on a row-store internal page because reconciliation doesn't + * build it, it may not be a valid key. */ if (indx != 0) { __wt_ref_key(leaf->home, leaf, &item->data, &item->size); @@ -241,12 +239,11 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CU skiphigh = skiplow = 0; /* - * If a cursor repeatedly appends to the tree, compare the search key - * against the last key on each internal page during insert before - * doing the full binary search. + * If a cursor repeatedly appends to the tree, compare the search key against the last key on + * each internal page during insert before doing the full binary search. * - * Track if the descent is to the right-side of the tree, used to set - * the cursor's append history. + * Track if the descent is to the right-side of the tree, used to set the cursor's append + * history. */ append_check = insert && cbt->append_tree; descend_right = true; @@ -297,17 +294,14 @@ restart: /* * Fast-path appends. * - * The 0th key on an internal page is a problem for a couple of - * reasons. First, we have to force the 0th key to sort less - * than any application key, so internal pages don't have to be - * updated if the application stores a new, "smallest" key in - * the tree. Second, reconciliation is aware of this and will - * store a byte of garbage in the 0th key, so the comparison of - * an application key and a 0th key is meaningless (but doing - * the comparison could still incorrectly modify our tracking - * of the leading bytes in each key that we can skip during the - * comparison). For these reasons, special-case the 0th key, and - * never pass it to a collator. + * The 0th key on an internal page is a problem for a couple of reasons. First, we have to + * force the 0th key to sort less than any application key, so internal pages don't have to + * be updated if the application stores a new, "smallest" key in the tree. 
Second, + * reconciliation is aware of this and will store a byte of garbage in the 0th key, so the + * comparison of an application key and a 0th key is meaningless (but doing the comparison + * could still incorrectly modify our tracking of the leading bytes in each key that we can + * skip during the comparison). For these reasons, special-case the 0th key, and never pass + * it to a collator. */ if (append_check) { descent = pindex->index[pindex->entries - 1]; @@ -420,16 +414,13 @@ descend: WT_DIAGNOSTIC_YIELD; /* - * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search at the root. - * We cannot restart in the "current" page; for example, if a - * thread is appending to the tree, the page it's waiting for - * did an insert-split into the parent, then the parent split - * into its parent, the name space we are searching for may have - * moved above the current page in the tree. + * Swap the current page for the child page. If the page splits while we're retrieving it, + * restart the search at the root. We cannot restart in the "current" page; for example, if + * a thread is appending to the tree, the page it's waiting for did an insert-split into the + * parent, then the parent split into its parent, the name space we are searching for may + * have moved above the current page in the tree. * - * On other error, simply return, the swap call ensures we're - * holding nothing on failure. + * On other error, simply return, the swap call ensures we're holding nothing on failure. */ read_flags = WT_READ_RESTART_OK; if (F_ISSET(cbt, WT_CBT_READ_ONCE)) @@ -458,21 +449,17 @@ leaf_only: current = NULL; /* - * In the case of a right-side tree descent during an insert, do a fast - * check for an append to the page, try to catch cursors appending data - * into the tree. + * In the case of a right-side tree descent during an insert, do a fast check for an append to + * the page, try to catch cursors appending data into the tree. * - * It's tempting to make this test more rigorous: if a cursor inserts - * randomly into a two-level tree (a root referencing a single child - * that's empty except for an insert list), the right-side descent flag - * will be set and this comparison wasted. The problem resolves itself - * as the tree grows larger: either we're no longer doing right-side - * descent, or we'll avoid additional comparisons in internal pages, - * making up for the wasted comparison here. Similarly, the cursor's - * history is set any time it's an insert and a right-side descent, - * both to avoid a complicated/expensive test, and, in the case of - * multiple threads appending to the tree, we want to mark them all as - * appending, even if this test doesn't work. + * It's tempting to make this test more rigorous: if a cursor inserts randomly into a two-level + * tree (a root referencing a single child that's empty except for an insert list), the + * right-side descent flag will be set and this comparison wasted. The problem resolves itself + * as the tree grows larger: either we're no longer doing right-side descent, or we'll avoid + * additional comparisons in internal pages, making up for the wasted comparison here. + * Similarly, the cursor's history is set any time it's an insert and a right-side descent, both + * to avoid a complicated/expensive test, and, in the case of multiple threads appending to the + * tree, we want to mark them all as appending, even if this test doesn't work. 
*/ if (insert && descend_right) { cbt->append_tree = 1; diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index 20a8f3df3b6..5f4b4d20c9d 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -168,9 +168,8 @@ __wt_las_create(WT_SESSION_IMPL *session, const char **cfg) return (0); /* - * Done at startup: we cannot do it on demand because we require the - * schema lock to create and drop the table, and it may not always be - * available. + * Done at startup: we cannot do it on demand because we require the schema lock to create and + * drop the table, and it may not always be available. * * Discard any previous incarnation of the table. */ @@ -262,13 +261,11 @@ __wt_las_cursor_open(WT_SESSION_IMPL *session) S2C(session)->cache->las_fileid = btree->id; /* - * Set special flags for the lookaside table: the lookaside flag (used, - * for example, to avoid writing records during reconciliation), also - * turn off checkpoints and logging. + * Set special flags for the lookaside table: the lookaside flag (used, for example, to avoid + * writing records during reconciliation), also turn off checkpoints and logging. * - * Test flags before setting them so updates can't race in subsequent - * opens (the first update is safe because it's single-threaded from - * wiredtiger_open). + * Test flags before setting them so updates can't race in subsequent opens (the first update is + * safe because it's single-threaded from wiredtiger_open). */ if (!F_ISSET(btree, WT_BTREE_LOOKASIDE)) F_SET(btree, WT_BTREE_LOOKASIDE); @@ -296,13 +293,11 @@ __wt_las_cursor(WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session *cursorp = NULL; /* - * We don't want to get tapped for eviction after we start using the - * lookaside cursor; save a copy of the current eviction state, we'll - * turn eviction off before we return. + * We don't want to get tapped for eviction after we start using the lookaside cursor; save a + * copy of the current eviction state, we'll turn eviction off before we return. * - * Don't cache lookaside table pages, we're here because of eviction - * problems and there's no reason to believe lookaside pages will be - * useful more than once. + * Don't cache lookaside table pages, we're here because of eviction problems and there's no + * reason to believe lookaside pages will be useful more than once. */ *session_flags = F_MASK(session, WT_LAS_SESSION_FLAGS); @@ -400,21 +395,19 @@ __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref) txn = &session->txn; /* - * Skip lookaside pages if reading without a timestamp and all the - * updates in lookaside are in the past. + * Skip lookaside pages if reading without a timestamp and all the updates in lookaside are in + * the past. * - * Lookaside eviction preferentially chooses the newest updates when - * creating page images with no stable timestamp. If a stable timestamp - * has been set, we have to visit the page because eviction chooses old - * version of records in that case. + * Lookaside eviction preferentially chooses the newest updates when creating page images with + * no stable timestamp. If a stable timestamp has been set, we have to visit the page because + * eviction chooses old version of records in that case. * - * One case where we may need to visit the page is if lookaside eviction - * is active in tree 2 when a checkpoint has started and is working its - * way through tree 1. 
In that case, lookaside may have created a page - * image with updates in the future of the checkpoint. + * One case where we may need to visit the page is if lookaside eviction is active in tree 2 + * when a checkpoint has started and is working its way through tree 1. In that case, lookaside + * may have created a page image with updates in the future of the checkpoint. * - * We also need to instantiate a lookaside page if this is an update - * operation in progress or transaction is in prepared state. + * We also need to instantiate a lookaside page if this is an update operation in progress or + * transaction is in prepared state. */ if (F_ISSET(txn, WT_TXN_PREPARE | WT_TXN_UPDATE)) return (false); @@ -437,16 +430,14 @@ __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref) return (!ref->page_las->has_prepares && ref->page_las->min_skipped_ts == WT_TS_MAX); /* - * Skip lookaside history if reading as of a timestamp, we evicted new - * versions of data and all the updates are in the past. This is not - * possible for prepared updates, because the commit timestamp was not - * known when the page was evicted. + * Skip lookaside history if reading as of a timestamp, we evicted new versions of data and all + * the updates are in the past. This is not possible for prepared updates, because the commit + * timestamp was not known when the page was evicted. * - * Otherwise, skip reading lookaside history if everything on the page - * is older than the read timestamp, and the oldest update in lookaside - * newer than the page is in the future of the reader. This seems - * unlikely, but is exactly what eviction tries to do when a checkpoint - * is running. + * Otherwise, skip reading lookaside history if everything on the page is older than the read + * timestamp, and the oldest update in lookaside newer than the page is in the future of the + * reader. This seems unlikely, but is exactly what eviction tries to do when a checkpoint is + * running. */ if (!ref->page_las->has_prepares && ref->page_las->min_skipped_ts == WT_TS_MAX && txn->read_timestamp >= ref->page_las->max_ondisk_ts) @@ -830,13 +821,12 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint64_t pageid) WT_RET(cursor->next(cursor)); /* - * Because of the special visibility rules for lookaside, a new - * block can appear in between our search and the block of - * interest. Keep trying while we have a key lower than we + * Because of the special visibility rules for lookaside, a new block can appear in between + * our search and the block of interest. Keep trying while we have a key lower than we * expect. * - * There may be no block of lookaside entries if they have been - * removed by WT_CONNECTION::rollback_to_stable. + * There may be no block of lookaside entries if they have been removed by + * WT_CONNECTION::rollback_to_stable. */ WT_RET(cursor->get_key(cursor, &las_pageid, &las_id, &las_counter, &las_key)); if (las_pageid >= pageid) @@ -935,20 +925,17 @@ __las_sweep_count(WT_CACHE *cache) uint64_t las_entry_count; /* - * The sweep server is a slow moving thread. Try to review the entire - * lookaside table once every 5 minutes. + * The sweep server is a slow moving thread. Try to review the entire lookaside table once every + * 5 minutes. * - * The reason is because the lookaside table exists because we're seeing - * cache/eviction pressure (it allows us to trade performance and disk - * space for cache space), and it's likely lookaside blocks are being - * evicted, and reading them back in doesn't help things. 
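
The pacing arithmetic sketched below follows the five-minute coverage target above, with the upper and lower bounds the comment goes on to describe. The cycle length and clamp constants are invented for illustration, not WiredTiger's tuning values.

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define TOY_SWEEP_CYCLE_PASSES (5 * 60) /* one pass a second, five minutes */
    #define TOY_SWEEP_MIN 100               /* always make some progress */
    #define TOY_SWEEP_MAX 100000            /* never block readers for long */

    /* Entries to visit per pass: cover the table once per cycle, clamped. */
    static uint64_t
    toy_sweep_count(uint64_t entry_count)
    {
        uint64_t per_pass = entry_count / TOY_SWEEP_CYCLE_PASSES;

        if (per_pass < TOY_SWEEP_MIN)
            return (TOY_SWEEP_MIN);
        if (per_pass > TOY_SWEEP_MAX)
            return (TOY_SWEEP_MAX);
        return (per_pass);
    }

    int
    main(void)
    {
        printf("%" PRIu64 "\n", toy_sweep_count(3000000)); /* 10000 */
        return (0);
    }
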
A trickier, - * but possibly better, alternative might be to review all lookaside - * blocks in the cache in order to get rid of them, and slowly review - * lookaside blocks that have already been evicted. + * The reason is because the lookaside table exists because we're seeing cache/eviction pressure + * (it allows us to trade performance and disk space for cache space), and it's likely lookaside + * blocks are being evicted, and reading them back in doesn't help things. A trickier, but + * possibly better, alternative might be to review all lookaside blocks in the cache in order to + * get rid of them, and slowly review lookaside blocks that have already been evicted. * - * Put upper and lower bounds on the calculation: since reads of pages - * with lookaside entries are blocked during sweep, make sure we do - * some work but don't block reads for too long. + * Put upper and lower bounds on the calculation: since reads of pages with lookaside entries + * are blocked during sweep, make sure we do some work but don't block reads for too long. */ las_entry_count = __las_entry_count(cache); return ( @@ -973,22 +960,17 @@ __las_sweep_init(WT_SESSION_IMPL *session) /* * If no files have been dropped and the lookaside file is empty, there's nothing to do. */ - if (cache->las_dropped_next == 0) { - if (__wt_las_empty(session)) - ret = WT_NOTFOUND; - goto err; - } + if (cache->las_dropped_next == 0 && __wt_las_empty(session)) + WT_ERR(WT_NOTFOUND); /* * Record the current page ID: sweep will stop after this point. * - * Since the btree IDs we're scanning are closed, any eviction must - * have already completed, so we won't miss anything with this - * approach. + * Since the btree IDs we're scanning are closed, any eviction must have already completed, so + * we won't miss anything with this approach. * - * Also, if a tree is reopened and there is lookaside activity before - * this sweep completes, it will have a higher page ID and should not - * be removed. + * Also, if a tree is reopened and there is lookaside activity before this sweep completes, it + * will have a higher page ID and should not be removed. */ cache->las_sweep_max_pageid = cache->las_pageid; @@ -1068,12 +1050,10 @@ __wt_las_sweep(WT_SESSION_IMPL *session) __wt_timing_stress(session, WT_TIMING_STRESS_LOOKASIDE_SWEEP); /* - * When continuing a sweep, position the cursor using the key from the - * last call (we don't care if we're before or after the key, either - * side is fine). + * When continuing a sweep, position the cursor using the key from the last call (we don't care + * if we're before or after the key, either side is fine). * - * Otherwise, we're starting a new sweep, gather the list of trees to - * sweep. + * Otherwise, we're starting a new sweep, gather the list of trees to sweep. */ if (sweep_key->size != 0) { __wt_cursor_set_raw_key(cursor, sweep_key); @@ -1134,10 +1114,9 @@ __wt_las_sweep(WT_SESSION_IMPL *session) /* * If the entry belongs to a dropped tree, discard it. * - * Cursor opened overwrite=true: won't return WT_NOTFOUND - * should another thread remove the record before we do (not - * expected for dropped trees), and the cursor remains - * positioned in that case. + * Cursor opened overwrite=true: won't return WT_NOTFOUND should another thread remove the + * record before we do (not expected for dropped trees), and the cursor remains positioned + * in that case. 
 */
 if (las_id >= cache->las_sweep_dropmin && las_id <= cache->las_sweep_dropmax &&
   __bit_test(cache->las_sweep_dropmap, las_id - cache->las_sweep_dropmin)) {
@@ -1158,13 +1137,11 @@ __wt_las_sweep(WT_SESSION_IMPL *session)
   &prepare_state, &upd_type, &las_value));
 /*
- * Check to see if the page or key has changed this iteration,
- * and if they have, setup context for safely removing obsolete
- * updates.
+ * Check to see if the page or key has changed this iteration, and if they have, setup
+ * context for safely removing obsolete updates.
 *
- * It's important to check for page boundaries explicitly
- * because it is possible for the same key to be at the start
- * of the next block. See WT-3982 for details.
+ * It's important to check for page boundaries explicitly because it is possible for the
+ * same key to be at the start of the next block. See WT-3982 for details.
 */
 if (las_pageid != saved_pageid || saved_key->size != las_key.size ||
   memcmp(saved_key->data, las_key.data, las_key.size) != 0) {
diff --git a/src/third_party/wiredtiger/src/checksum/x86/crc32-x86-alt.c b/src/third_party/wiredtiger/src/checksum/x86/crc32-x86-alt.c
index db0e01c35fc..7484e9e72ce 100644
--- a/src/third_party/wiredtiger/src/checksum/x86/crc32-x86-alt.c
+++ b/src/third_party/wiredtiger/src/checksum/x86/crc32-x86-alt.c
@@ -32,12 +32,12 @@
 #include <stddef.h>
 /*
- * The hardware-accelerated checksum code that originally shipped on Windows
- * did not correctly handle memory that wasn't 8B aligned and a multiple of 8B.
- * It's likely that calculations were always 8B aligned, but there's some risk.
+ * The hardware-accelerated checksum code that originally shipped on Windows did not correctly
+ * handle memory that wasn't 8B aligned and a multiple of 8B. It's likely that calculations were
+ * always 8B aligned, but there's some risk.
 *
- * What we do is always write the correct checksum, and if a checksum test
- * fails, check it against the alternate version have before failing.
+ * What we do is always write the correct checksum, and if a checksum test fails, check it against
+ * the alternate version we have before failing.
 */
 #if defined(_M_AMD64) && !defined(HAVE_NO_CRC32_HARDWARE)
diff --git a/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c b/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c
index 3fcfcf69887..dfa1d9e03b2 100644
--- a/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c
+++ b/src/third_party/wiredtiger/src/checksum/zseries/crc32-s390x.c
@@ -1,9 +1,7 @@
 /*
- * CRC-32 algorithms implemented with the z/Architecture
- * Vector Extension Facility.
+ * CRC-32 algorithms implemented with the z/Architecture Vector Extension Facility.
 *
- * Copyright IBM Corp. 2015
- * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ * Copyright IBM Corp. 2015 Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
 *
 */
@@ -48,10 +46,9 @@ __wt_crc32c_le(unsigned int crc, const unsigned char *buf, size_t len)
 /*
 * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension
 *
- * Creates a function to perform a particular CRC-32 computation. Depending
- * on the message buffer, the hardware-accelerated or software implementation
- * is used. Note that the message buffer is aligned to improve fetch
- * operations of VECTOR LOAD MULTIPLE instructions.
+ * Creates a function to perform a particular CRC-32 computation. Depending on the message buffer,
+ * the hardware-accelerated or software implementation
+ * is used.
Note that the message buffer is + * aligned to improve fetch operations of VECTOR LOAD MULTIPLE instructions. * */ #define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \ diff --git a/src/third_party/wiredtiger/src/checksum/zseries/vx-insn.h b/src/third_party/wiredtiger/src/checksum/zseries/vx-insn.h index bf022d5ad9d..b8726e1f76f 100644 --- a/src/third_party/wiredtiger/src/checksum/zseries/vx-insn.h +++ b/src/third_party/wiredtiger/src/checksum/zseries/vx-insn.h @@ -1,11 +1,10 @@ /* * Support for Vector Instructions * - * Assembler macros to generate .byte/.word code for particular - * vector instructions that are supported by recent binutils (>= 2.26) only. + * Assembler macros to generate .byte/.word code for particular vector instructions that are + * supported by recent binutils (>= 2.26) only. * - * Copyright IBM Corp. 2015 - * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + * Copyright IBM Corp. 2015 Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ #ifndef __ASM_S390_VX_INSN_H diff --git a/src/third_party/wiredtiger/src/config/config_api.c b/src/third_party/wiredtiger/src/config/config_api.c index e489e932247..a50754a59a6 100644 --- a/src/third_party/wiredtiger/src/config/config_api.c +++ b/src/third_party/wiredtiger/src/config/config_api.c @@ -160,8 +160,7 @@ __conn_foc_add(WT_SESSION_IMPL *session, const void *p) conn = S2C(session); /* - * Callers of this function are expected to be holding the connection's - * api_lock. + * Callers of this function are expected to be holding the connection's api_lock. * * All callers of this function currently ignore errors. */ @@ -255,8 +254,7 @@ __wt_configure_method(WT_SESSION_IMPL *session, const char *method, const char * /* * Allocate new configuration entry and fill it in. * - * The new base value is the previous base value, a separator and the - * new configuration string. + * The new base value is the previous base value, a separator and the new configuration string. */ WT_ERR(__wt_calloc_one(session, &entry)); entry->method = (*epp)->method; diff --git a/src/third_party/wiredtiger/src/config/config_collapse.c b/src/third_party/wiredtiger/src/config/config_collapse.c index 292f3fcbe4a..e1bdc202b2b 100644 --- a/src/third_party/wiredtiger/src/config/config_collapse.c +++ b/src/third_party/wiredtiger/src/config/config_collapse.c @@ -56,9 +56,8 @@ __wt_config_collapse(WT_SESSION_IMPL *session, const char **cfg, char **config_r goto err; /* - * If the caller passes us no valid configuration strings, we get here - * with no bytes to copy -- that's OK, the underlying string copy can - * handle empty strings. + * If the caller passes us no valid configuration strings, we get here with no bytes to copy -- + * that's OK, the underlying string copy can handle empty strings. * * Strip any trailing comma. */ @@ -145,9 +144,8 @@ keep: goto err; /* - * If the caller passes us only default configuration strings, we get - * here with no bytes to copy -- that's OK, the underlying string copy - * can handle empty strings. + * If the caller passes us only default configuration strings, we get here with no bytes to copy + * -- that's OK, the underlying string copy can handle empty strings. * * Strip any trailing comma. 
*/ diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index 9e7964758ff..56b3febfeb1 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1351,12 +1351,11 @@ __conn_config_file( len = (size_t)size; /* - * Copy the configuration file into memory, with a little slop, I'm not - * interested in debugging off-by-ones. + * Copy the configuration file into memory, with a little slop, I'm not interested in debugging + * off-by-ones. * - * The beginning of a file is the same as if we run into an unquoted - * newline character, simplify the parsing loop by pretending that's - * what we're doing. + * The beginning of a file is the same as if we run into an unquoted newline character, simplify + * the parsing loop by pretending that's what we're doing. */ WT_ERR(__wt_buf_init(session, cbuf, len + 10)); WT_ERR(__wt_read(session, fh, (wt_off_t)0, len, ((uint8_t *)cbuf->mem) + 1)); @@ -1405,11 +1404,10 @@ __conn_config_file( } /* - * Replace any newline characters with commas (and strings of - * commas are safe). + * Replace any newline characters with commas (and strings of commas are safe). * - * After any newline, skip to a non-white-space character; if - * the next character is a hash mark, skip to the next newline. + * After any newline, skip to a non-white-space character; if the next character is a hash + * mark, skip to the next newline. */ for (;;) { for (*t++ = ','; --len > 0 && __wt_isspace((u_char) * ++p);) @@ -1472,8 +1470,8 @@ __conn_env_var(WT_SESSION_IMPL *session, const char *cfg[], const char *name, co /* * Security stuff: * - * Don't use the environment variable if the process has additional - * privileges, unless "use_environment_priv" is configured. + * Don't use the environment variable if the process has additional privileges, unless + * "use_environment_priv" is configured. */ if (!__wt_has_priv()) return (0); @@ -1632,14 +1630,12 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) is_create || exist ? WT_FS_OPEN_CREATE : 0, &conn->lock_fh); /* - * If this is a read-only connection and we cannot grab the lock file, - * check if it is because there's no write permission or if the file - * does not exist. If so, then ignore the error. - * XXX Ignoring the error does allow multiple read-only connections to - * exist at the same time on a read-only directory. + * If this is a read-only connection and we cannot grab the lock file, check if it is because + * there's no write permission or if the file does not exist. If so, then ignore the error. XXX + * Ignoring the error does allow multiple read-only connections to exist at the same time on a + * read-only directory. * - * If we got an expected permission or non-existence error then skip - * the byte lock. + * If we got an expected permission or non-existence error then skip the byte lock. */ if (F_ISSET(conn, WT_CONN_READONLY) && (ret == EACCES || ret == ENOENT)) { bytelock = false; @@ -1658,15 +1654,13 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) "another process"); /* - * If the size of the lock file is non-zero, we created it (or - * won a locking race with the thread that created it, it - * doesn't matter). + * If the size of the lock file is non-zero, we created it (or won a locking race with the thread + * that created it, it doesn't matter). * - * Write something into the file, zero-length files make me - * nervous. 
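A self-contained sketch of the newline handling the __conn_config_file comment above describes: newlines become commas (runs of commas are harmless to the parser) and a hash mark after a newline comments out the rest of its line. This shows the shape of the loop, not the actual implementation; the real code also strips a trailing comma afterwards:

    /* Flatten a configuration file image into a single parseable line, in place. */
    static void
    flatten_config(char *p)
    {
        char *t;

        for (t = p; *p != '\0';) {
            if (*p != '\n') {
                *t++ = *p++;
                continue;
            }
            *t++ = ','; /* newline becomes a comma */
            for (++p; *p == ' ' || *p == '\t'; ++p)
                ; /* skip to a non-white-space character */
            if (*p == '#')
                for (; *p != '\0' && *p != '\n'; ++p)
                    ; /* hash mark: skip to the next newline */
        }
        *t = '\0';
    }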
+ * Write something into the file, zero-length files make me nervous. * - * The test against the expected length is sheer paranoia (the - * length should be 0 or correct), but it shouldn't hurt. + * The test against the expected length is sheer paranoia (the length should be 0 or correct), but + * it shouldn't hurt. */ #define WT_SINGLETHREAD_STRING "WiredTiger lock file\n" WT_ERR(__wt_filesize(session, conn->lock_fh, &size)); @@ -2027,26 +2021,22 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[]) "# or create a WiredTiger.config file to override them.")); /* - * The base configuration file contains all changes to default settings - * made at create, and we include the user-configuration file in that - * list, even though we don't expect it to change. Of course, an - * application could leave that file as it is right now and not remove - * a configuration we need, but applications can also guarantee all - * database users specify consistent environment variables and - * wiredtiger_open configuration arguments -- if we protect against - * those problems, might as well include the application's configuration - * file in that protection. + * The base configuration file contains all changes to default settings made at create, and we + * include the user-configuration file in that list, even though we don't expect it to change. + * Of course, an application could leave that file as it is right now and not remove a + * configuration we need, but applications can also guarantee all database users specify + * consistent environment variables and wiredtiger_open configuration arguments -- if we protect + * against those problems, might as well include the application's configuration file in that + * protection. * - * We were passed the configuration items specified by the application. - * That list includes configuring the default settings, presumably if - * the application configured it explicitly, that setting should survive - * even if the default changes. + * We were passed the configuration items specified by the application. That list includes + * configuring the default settings, presumably if the application configured it explicitly, + * that setting should survive even if the default changes. * - * When writing the base configuration file, we write the version and - * any configuration information set by the application (in other words, - * the stack except for cfg[0]). However, some configuration values need - * to be stripped out from the base configuration file; do that now, and - * merge the rest to be written. + * When writing the base configuration file, we write the version and any configuration + * information set by the application (in other words, the stack except for cfg[0]). However, + * some configuration values need to be stripped out from the base configuration file; do that + * now, and merge the rest to be written. */ WT_ERR(__wt_config_merge(session, cfg + 1, "compatibility=(release=)," @@ -2313,14 +2303,13 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c WT_ERR(__conn_config_env(session, cfg, i1)); /* - * We need to know if configured for read-only or in-memory behavior - * before reading/writing the filesystem. The only way the application - * can configure that before we touch the filesystem is the wiredtiger - * config string or the WIREDTIGER_CONFIG environment variable. + * We need to know if configured for read-only or in-memory behavior before reading/writing the + * filesystem. 
The only way the application can configure that before we touch the filesystem is + * the wiredtiger config string or the WIREDTIGER_CONFIG environment variable. * - * The environment isn't trusted by default, for security reasons; if - * the application wants us to trust the environment before reading - * the filesystem, the wiredtiger_open config string is the only way. + * The environment isn't trusted by default, for security reasons; if the application wants us + * to trust the environment before reading the filesystem, the wiredtiger_open config string is + * the only way. */ WT_ERR(__wt_config_gets(session, cfg, "in_memory", &cval)); if (cval.val != 0) @@ -2445,14 +2434,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c /* * Configuration ... * - * We can't open sessions yet, so any configurations that cause - * sessions to be opened must be handled inside __wt_connection_open. + * We can't open sessions yet, so any configurations that cause sessions to be opened must be + * handled inside __wt_connection_open. * - * The error message configuration might have changed (if set in a - * configuration file, and not in the application's configuration - * string), get it again. Do it first, make error messages correct. - * Ditto verbose configuration so we dump everything the application - * wants to see. + * The error message configuration might have changed (if set in a configuration file, and not + * in the application's configuration string), get it again. Do it first, make error messages + * correct. Ditto verbose configuration so we dump everything the application wants to see. */ WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval)); if (cval.len != 0) { diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c index 1bb9bf887ff..f1f3bdf8ee9 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c @@ -473,9 +473,8 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest) cache = entry->cache; /* - * Figure out a delta since the last time we did an assessment - * for each metric we are tracking. Watch out for wrapping - * of values. + * Figure out a delta since the last time we did an assessment for each metric we are + * tracking. Watch out for wrapping of values. * * Count pages read, assuming pages are 4KB. */ @@ -652,15 +651,12 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, uint64_t highest, uint64_t bump_th cache->cp_quota - entry->cache_size); } /* - * Bounds checking: don't go over the pool size or under the - * reserved size for this cache. + * Bounds checking: don't go over the pool size or under the reserved size for this cache. * - * Shrink by a chunk size if that doesn't drop us - * below the reserved size. + * Shrink by a chunk size if that doesn't drop us below the reserved size. * - * Limit the reduction to half of the free space in the - * connection's cache. This should reduce cache sizes - * gradually without stalling application threads. + * Limit the reduction to half of the free space in the connection's cache. This should + * reduce cache sizes gradually without stalling application threads. 
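The bounds the cache-pool comment above spells out, as a hedged sketch; the function and parameter names are illustrative, and it assumes cache_size is at least the reserved size:

    #include <stdint.h>

    /*
     * Clamp a proposed shrink: never take a cache below its reserved size, and
     * never reclaim more than half the connection's free cache in one step, so
     * sizes change gradually without stalling application threads.
     */
    static uint64_t
    clamp_shrink(uint64_t adjustment, uint64_t cache_size, uint64_t reserved, uint64_t free_bytes)
    {
        if (adjustment > cache_size - reserved)
            adjustment = cache_size - reserved; /* stay above the reserve */
        if (adjustment > free_bytes / 2)
            adjustment = free_bytes / 2; /* shrink gradually */
        return (adjustment);
    }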
*/ if (adjustment > 0) { *adjustedp = true; diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c index 68a437be046..70b93a164c9 100644 --- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c +++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c @@ -140,8 +140,8 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn) /* * The checkpoint server gets its own session. * - * Checkpoint does enough I/O it may be called upon to perform slow - * operations for the block manager. + * Checkpoint does enough I/O it may be called upon to perform slow operations for the block + * manager. */ session_flags = WT_SESSION_CAN_WAIT; WT_RET(__wt_open_internal_session( diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index 8884fa5c23b..7c60e0d8239 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -50,18 +50,15 @@ __conn_dhandle_config_set(WT_SESSION_IMPL *session) } /* - * The defaults are included because persistent configuration - * information is stored in the metadata file and it may be from an - * earlier version of WiredTiger. If defaults are included in the - * configuration, we can add new configuration strings without - * upgrading the metadata file or writing special code in case a - * configuration string isn't initialized, as long as the new - * configuration string has an appropriate default value. + * The defaults are included because persistent configuration information is stored in the + * metadata file and it may be from an earlier version of WiredTiger. If defaults are included + * in the configuration, we can add new configuration strings without upgrading the metadata + * file or writing special code in case a configuration string isn't initialized, as long as the + * new configuration string has an appropriate default value. * - * The error handling is a little odd, but be careful: we're holding a - * chunk of allocated memory in metaconf. If we fail before we copy a - * reference to it into the object's configuration array, we must free - * it, after the copy, we don't want to free it. + * The error handling is a little odd, but be careful: we're holding a chunk of allocated memory + * in metaconf. If we fail before we copy a reference to it into the object's configuration + * array, we must free it; after the copy, we don't want to free it. */ WT_ERR(__wt_calloc_def(session, 3, &dhandle->cfg)); switch (dhandle->type) { @@ -155,12 +152,11 @@ __wt_conn_dhandle_alloc(WT_SESSION_IMPL *session, const char *uri, const char *c WT_ERR(__wt_spin_init(session, &dhandle->close_lock, "data handle close")); /* - * We are holding the data handle list lock, which protects most - * threads from seeing the new handle until that lock is released. + * We are holding the data handle list lock, which protects most threads from seeing the new + * handle until that lock is released. * - * However, the sweep server scans the list of handles without holding - * that lock, so we need a write barrier here to ensure the sweep - * server doesn't see a partially filled in structure. + * However, the sweep server scans the list of handles without holding that lock, so we need a + * write barrier here to ensure the sweep server doesn't see a partially filled in structure.
*/ WT_WRITE_BARRIER(); @@ -294,19 +290,15 @@ __wt_conn_dhandle_close(WT_SESSION_IMPL *session, bool final, bool mark_dead) marked_dead = true; /* - * Flush dirty data from any durable trees we couldn't mark - * dead. That involves writing a checkpoint, which can fail if - * an update cannot be written, causing the close to fail: if - * not the final close, return the EBUSY error to our caller - * for eventual retry. + * Flush dirty data from any durable trees we couldn't mark dead. That involves writing a + * checkpoint, which can fail if an update cannot be written, causing the close to fail: if + * not the final close, return the EBUSY error to our caller for eventual retry. * - * We can't discard non-durable trees yet: first we have to - * close the underlying btree handle, then we can mark the - * data handle dead. + * We can't discard non-durable trees yet: first we have to close the underlying btree + * handle, then we can mark the data handle dead. * - * If we are closing with timestamps enforced, then we have - * already checkpointed as of the timestamp as needed and any - * remaining dirty data should be discarded. + * If we are closing with timestamps enforced, then we have already checkpointed as of the + * timestamp as needed and any remaining dirty data should be discarded. */ if (!discard && !marked_dead) { if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP) || F_ISSET(conn, WT_CONN_IN_MEMORY) || @@ -407,16 +399,14 @@ __wt_conn_dhandle_open(WT_SESSION_IMPL *session, const char *cfg[], uint32_t fla WT_RET(__wt_evict_file_exclusive_on(session)); /* - * If the handle is already open, it has to be closed so it can be - * reopened with a new configuration. + * If the handle is already open, it has to be closed so it can be reopened with a new + * configuration. * - * This call can return EBUSY if there's an update in the tree that's - * not yet globally visible. That's not a problem because it can only - * happen when we're switching from a normal handle to a "special" one, - * so we're returning EBUSY to an attempt to verify or do other special - * operations. The reverse won't happen because when the handle from a - * verify or other special operation is closed, there won't be updates - * in the tree that can block the close. + * This call can return EBUSY if there's an update in the tree that's not yet globally visible. + * That's not a problem because it can only happen when we're switching from a normal handle to + * a "special" one, so we're returning EBUSY to an attempt to verify or do other special + * operations. The reverse won't happen because when the handle from a verify or other special + * operation is closed, there won't be updates in the tree that can block the close. */ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) WT_ERR(__wt_conn_dhandle_close(session, false, false)); diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index 9d5e5e75041..83dbe69f4b2 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -97,13 +97,11 @@ __logmgr_version(WT_SESSION_IMPL *session, bool reconfig) return (0); /* - * Set the log file format versions based on compatibility versions - * set in the connection. We must set this before we call log_open - * to open or create a log file. + * Set the log file format versions based on compatibility versions set in the connection. We + * must set this before we call log_open to open or create a log file. 
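The publish pattern behind that WT_WRITE_BARRIER call, sketched with C11 atomics instead of WiredTiger's macro; the structure and names are illustrative:

    #include <stdatomic.h>
    #include <stddef.h>

    struct handle {
        const char *name;
        int type;
    };

    static struct handle *list_head; /* scanned lock-free by a sweep thread */

    /*
     * Fill the structure in completely, fence, then publish: a reader that
     * scans the list without the lock never observes a partially filled
     * structure.
     */
    static void
    publish_handle(struct handle *h, const char *name, int type)
    {
        h->name = name;
        h->type = type;
        atomic_thread_fence(memory_order_release); /* the WT_WRITE_BARRIER step */
        list_head = h; /* a sketch: strictly, this store should be atomic too */
    }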
* - * Note: downgrade in this context means the new version is not the - * latest possible version. It does not mean the direction of change - * from the release we may be running currently. + * Note: downgrade in this context means the new version is not the latest possible version. It + * does not mean the direction of change from the release we may be running currently. */ if (conn->compat_major < WT_LOG_V2_MAJOR) { new_version = 1; @@ -148,15 +146,12 @@ __logmgr_version(WT_SESSION_IMPL *session, bool reconfig) if (log->log_version == new_version) return (0); /* - * If we are reconfiguring and at a new version we need to force - * the log file to advance so that we write out a log file at the - * correct version. When we are downgrading we must force a checkpoint - * and finally archive, even if disabled, so that all new version log - * files are gone. + * If we are reconfiguring and at a new version we need to force the log file to advance so that + * we write out a log file at the correct version. When we are downgrading we must force a + * checkpoint and finally archive, even if disabled, so that all new version log files are gone. * - * All of the version changes must be handled with locks on reconfigure - * because other threads may be changing log files, using pre-allocated - * files. + * All of the version changes must be handled with locks on reconfigure because other threads + * may be changing log files, using pre-allocated files. */ /* * Set the version. If it is a live change the logging subsystem will do other work as well to @@ -180,22 +175,20 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool rec bool enabled; /* - * A note on reconfiguration: the standard "is this configuration string - * allowed" checks should fail if reconfiguration has invalid strings, - * for example, "log=(enabled)", or "statistics_log=(path=XXX)", because - * the connection reconfiguration method doesn't allow those strings. - * Additionally, the base configuration values during reconfiguration - * are the currently configured values (so we don't revert to default - * values when repeatedly reconfiguring), and configuration processing - * of a currently set value should not change the currently set value. + * A note on reconfiguration: the standard "is this configuration string allowed" checks should + * fail if reconfiguration has invalid strings, for example, "log=(enabled)", or + * "statistics_log=(path=XXX)", because the connection reconfiguration method doesn't allow + * those strings. Additionally, the base configuration values during reconfiguration are the + * currently configured values (so we don't revert to default values when repeatedly + * reconfiguring), and configuration processing of a currently set value should not change the + * currently set value. * - * In this code path, log server reconfiguration does not stop/restart - * the log server, so there's no point in re-evaluating configuration - * strings that cannot be reconfigured, risking bugs in configuration - * setup, and depending on evaluation of currently set values to always - * result in the currently set value. Skip tests for any configuration - * strings which don't make sense during reconfiguration, but don't - * worry about error reporting because it should never happen. 
+ * In this code path, log server reconfiguration does not stop/restart the log server, so + * there's no point in re-evaluating configuration strings that cannot be reconfigured, risking + * bugs in configuration setup, and depending on evaluation of currently set values to always + * result in the currently set value. Skip tests for any configuration strings which don't make + * sense during reconfiguration, but don't worry about error reporting because it should never + * happen. */ conn = S2C(session); @@ -204,11 +197,10 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool rec enabled = cval.val != 0; /* - * If we're reconfiguring, enabled must match the already - * existing setting. + * If we're reconfiguring, enabled must match the already existing setting. * - * If it is off and the user it turning it on, or it is on - * and the user is turning it off, return an error. + * If it is off and the user is turning it on, or it is on and the user is turning it off, + * return an error. * * See above: should never happen. */ @@ -230,9 +222,8 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool rec *runp = enabled; /* - * Setup a log path and compression even if logging is disabled in case - * we are going to print a log. Only do this on creation. Once a - * compressor or log path are set they cannot be changed. + * Set up a log path and compression even if logging is disabled in case we are going to print a + * log. Only do this on creation. Once a compressor or log path are set they cannot be changed. * * See above: should never happen. */ @@ -254,9 +245,9 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool rec FLD_SET(conn->log_flags, WT_CONN_LOG_ARCHIVE); /* - * The file size cannot be reconfigured. The amount of memory allocated - * to the log slots may be based on the log file size at creation and we - * don't want to re-allocate that memory while running. + * The file size cannot be reconfigured. The amount of memory allocated to the log slots may be + * based on the log file size at creation and we don't want to re-allocate that memory while + * running. * * See above: should never happen. */ @@ -286,8 +277,8 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool rec conn->log_prealloc = 1; /* - * Note it's meaningless to reconfigure this value during runtime, it - * only matters on create before recovery runs. + * Note it's meaningless to reconfigure this value during runtime, it only matters on create + * before recovery runs. * * See above: should never happen. */ @@ -598,13 +589,11 @@ __log_file_server(void *arg) */ min_lsn = log->write_lsn; /* - * We have to wait until the LSN we asked for is - * written. If it isn't signal the wrlsn thread - * to get it written. + * We have to wait until the LSN we asked for is written. If it isn't, signal the wrlsn + * thread to get it written. * - * We also have to wait for the written LSN and the - * sync LSN to be in the same file so that we know we - * have synchronized all earlier log files. + * We also have to wait for the written LSN and the sync LSN to be in the same file so + * that we know we have synchronized all earlier log files.
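What the "enabled must match" check above amounts to, roughly reconstructed from the comment; the exact message and flag handling are assumptions, not the verbatim source:

    /* Logging cannot be toggled by reconfigure: reject any change in state. */
    if (reconfig &&
      ((enabled && !FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) ||
        (!enabled && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))))
        WT_RET_MSG(session, EINVAL, "log manager reconfigure: enabled mismatch with existing setting");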
*/ if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) { /* diff --git a/src/third_party/wiredtiger/src/conn/conn_reconfig.c b/src/third_party/wiredtiger/src/conn/conn_reconfig.c index 3cc46618a4a..315ae099c51 100644 --- a/src/third_party/wiredtiger/src/conn/conn_reconfig.c +++ b/src/third_party/wiredtiger/src/conn/conn_reconfig.c @@ -397,30 +397,27 @@ __wt_conn_reconfig(WT_SESSION_IMPL *session, const char **cfg) F_SET(conn, WT_CONN_RECONFIGURING); /* - * The configuration argument has been checked for validity, update the - * previous connection configuration. + * The configuration argument has been checked for validity, update the previous connection + * configuration. * - * DO NOT merge the configuration before the reconfigure calls. Some - * of the underlying reconfiguration functions do explicit checks with - * the second element of the configuration array, knowing the defaults - * are in slot #1 and the application's modifications are in slot #2. + * DO NOT merge the configuration before the reconfigure calls. Some of the underlying + * reconfiguration functions do explicit checks with the second element of the configuration + * array, knowing the defaults are in slot #1 and the application's modifications are in slot + * #2. * - * Replace the base configuration set up by CONNECTION_API_CALL with - * the current connection configuration, otherwise reconfiguration - * functions will find the base value instead of previously configured - * value. + * Replace the base configuration set up by CONNECTION_API_CALL with the current connection + * configuration, otherwise reconfiguration functions will find the base value instead of the + * previously configured value. */ cfg[0] = conn->cfg; /* * Reconfigure the system. * - * The compatibility version check is special: upgrade / downgrade - * cannot be done with transactions active, and checkpoints must not - * span a version change. Hold the checkpoint lock to avoid conflicts - * with WiredTiger's checkpoint thread, and rely on the documentation - * specifying that no new operations can start until the upgrade / - * downgrade completes. + * The compatibility version check is special: upgrade / downgrade cannot be done with + * transactions active, and checkpoints must not span a version change. Hold the checkpoint lock + * to avoid conflicts with WiredTiger's checkpoint thread, and rely on the documentation + * specifying that no new operations can start until the upgrade / downgrade completes. */ WT_WITH_CHECKPOINT_LOCK(session, ret = __wt_conn_compat_config(session, cfg, true)); WT_ERR(ret); diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c index 24397ed0666..4649fc9ef4d 100644 --- a/src/third_party/wiredtiger/src/conn/conn_stat.c +++ b/src/third_party/wiredtiger/src/conn/conn_stat.c @@ -101,19 +101,17 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) char **sources; /* - * A note on reconfiguration: the standard "is this configuration string - * allowed" checks should fail if reconfiguration has invalid strings, - * for example, "log=(enabled)", or "statistics_log=(path=XXX)", because - * the connection reconfiguration method doesn't allow those strings. - * Additionally, the base configuration values during reconfiguration - * are the currently configured values (so we don't revert to default - * values when repeatedly reconfiguring), and configuration processing - * of a currently set value should not change the currently set value.
+ * A note on reconfiguration: the standard "is this configuration string allowed" checks should + * fail if reconfiguration has invalid strings, for example, "log=(enabled)", or + * "statistics_log=(path=XXX)", because the connection reconfiguration method doesn't allow + * those strings. Additionally, the base configuration values during reconfiguration are the + * currently configured values (so we don't revert to default values when repeatedly + * reconfiguring), and configuration processing of a currently set value should not change the + * currently set value. * - * In this code path, a previous statistics log server reconfiguration - * may have stopped the server (and we're about to restart it). Because - * stopping the server discarded the configured information stored in - * the connection structure, we have to re-evaluate all configuration + * In this code path, a previous statistics log server reconfiguration may have stopped the + * server (and we're about to restart it). Because stopping the server discarded the configured + * information stored in the connection structure, we have to re-evaluate all configuration * values, reconfiguration can't skip any of them. */ @@ -336,8 +334,8 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, bool conn_stats) /* * Open the statistics cursor and dump the statistics. * - * If we don't find an underlying object, silently ignore it, the object - * may exist only intermittently. + * If we don't find an underlying object, silently ignore it, the object may exist only + * intermittently. */ if ((ret = __wt_curstat_open(session, uri, NULL, cfg, &cursor)) != 0) { if (ret == EBUSY || ret == ENOENT || ret == WT_NOTFOUND) @@ -420,18 +418,13 @@ __statlog_lsm_apply(WT_SESSION_IMPL *session) cnt = locked = 0; /* - * Walk the list of LSM trees, checking for a match on the set of - * sources. + * Walk the list of LSM trees, checking for a match on the set of sources. * - * XXX - * We can't hold the schema lock for the traversal because the LSM - * statistics code acquires the tree lock, and the LSM cursor code - * acquires the tree lock and then acquires the schema lock, it's a - * classic deadlock. This is temporary code so I'm not going to do - * anything fancy. - * It is OK to not keep holding the schema lock after populating - * the list of matching LSM trees, since the __wt_lsm_tree_get call - * will bump a reference count, so the tree won't go away. + * XXX We can't hold the schema lock for the traversal because the LSM statistics code acquires + * the tree lock, and the LSM cursor code acquires the tree lock and then acquires the schema + * lock, it's a classic deadlock. This is temporary code so I'm not going to do anything fancy. + * It is OK to not keep holding the schema lock after populating the list of matching LSM trees, + * since the __wt_lsm_tree_get call will bump a reference count, so the tree won't go away. */ __wt_spin_lock(session, &S2C(session)->schema_lock); locked = true; @@ -512,12 +505,9 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) WT_RET(__wt_conn_btree_apply(session, NULL, __statlog_apply, NULL, NULL)); /* - * Walk the list of open LSM trees, dumping any that match the - * the list of object sources. + * Walk the list of open LSM trees, dumping any that match the list of object sources. * - * XXX - * This code should be removed when LSM objects are converted to - * data handles. + * XXX This code should be removed when LSM objects are converted to data handles. 
*/ if (conn->stat_sources != NULL) WT_RET(__statlog_lsm_apply(session)); @@ -584,11 +574,11 @@ __statlog_server(void *arg) WT_CLEAR(tmp); /* - * We need a temporary place to build a path and an entry prefix. - * The length of the path plus 128 should be more than enough. + * We need a temporary place to build a path and an entry prefix. The length of the path plus + * 128 should be more than enough. * - * We also need a place to store the current path, because that's - * how we know when to close/re-open the file. + * We also need a place to store the current path, because that's how we know when to + * close/re-open the file. */ WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128)); WT_ERR(__wt_buf_setstr(session, &path, "")); @@ -640,12 +630,11 @@ __statlog_start(WT_CONNECTION_IMPL *conn) /* * Start the thread. * - * Statistics logging creates a thread per database, rather than using - * a single thread to do logging for all of the databases. If we ever - * see lots of databases at a time, doing statistics logging, and we - * want to reduce the number of threads, there's no reason we have to - * have more than one thread, I just didn't feel like writing the code - * to figure out the scheduling. + * Statistics logging creates a thread per database, rather than using a single thread to do + * logging for all of the databases. If we ever see lots of databases at a time, doing + * statistics logging, and we want to reduce the number of threads, there's no reason we have to + * have more than one thread, I just didn't feel like writing the code to figure out the + * scheduling. */ WT_RET(__wt_thread_create(session, &conn->stat_tid, __statlog_server, session)); conn->stat_tid_set = true; @@ -666,17 +655,15 @@ __wt_statlog_create(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); /* - * Stop any server that is already running. This means that each time - * reconfigure is called we'll bounce the server even if there are no - * configuration changes. This makes our life easier as the underlying - * configuration routine doesn't have to worry about freeing objects - * in the connection structure (it's guaranteed to always start with a - * blank slate), and we don't have to worry about races where a running - * server is reading configuration information that we're updating, and - * it's not expected that reconfiguration will happen a lot. + * Stop any server that is already running. This means that each time reconfigure is called + * we'll bounce the server even if there are no configuration changes. This makes our life + * easier as the underlying configuration routine doesn't have to worry about freeing objects in + * the connection structure (it's guaranteed to always start with a blank slate), and we don't + * have to worry about races where a running server is reading configuration information that + * we're updating, and it's not expected that reconfiguration will happen a lot. * - * If there's no server running, discard any configuration information - * so we don't leak memory during reconfiguration. + * If there's no server running, discard any configuration information so we don't leak memory + * during reconfiguration. 
*/ if (conn->stat_session == NULL) WT_RET(__stat_config_discard(session)); diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index a9c3775ae39..b762a4d8f42 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -89,8 +89,8 @@ __sweep_expire_one(WT_SESSION_IMPL *session) /* * Mark the handle dead and close the underlying handle. * - * For btree handles, closing the handle decrements the open file - * count, meaning the close loop won't overrun the configured minimum. + * For btree handles, closing the handle decrements the open file count, meaning the close loop + * won't overrun the configured minimum. */ ret = __wt_conn_dhandle_close(session, false, true); @@ -299,15 +299,13 @@ __sweep_server(void *arg) __wt_seconds(session, &now); /* - * Sweep the lookaside table. If the lookaside table hasn't yet - * been written, there's no work to do. + * Sweep the lookaside table. If the lookaside table hasn't yet been written, there's no + * work to do. * - * Don't sweep the lookaside table if the cache is stuck full. - * The sweep uses the cache and can exacerbate the problem. - * If we try to sweep when the cache is full or we aren't - * making progress in eviction, sweeping can wind up constantly - * bringing in and evicting pages from the lookaside table, - * which will stop the cache from moving into the stuck state. + * Don't sweep the lookaside table if the cache is stuck full. The sweep uses the cache and + * can exacerbate the problem. If we try to sweep when the cache is full or we aren't making + * progress in eviction, sweeping can wind up constantly bringing in and evicting pages from + * the lookaside table, which will stop the cache from moving into the stuck state. */ if ((FLD_ISSET(conn->timing_stress_flags, WT_TIMING_STRESS_AGGRESSIVE_SWEEP) || now - last >= WT_LAS_SWEEP_SEC) && diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c index 656cb3ac3a1..4869bcb3b71 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_backup.c +++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c @@ -242,20 +242,16 @@ __backup_start(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, bool is_dup, cons if (!is_dup) { /* - * The hot backup copy is done outside of WiredTiger, which - * means file blocks can't be freed and re-allocated until the - * backup completes. The checkpoint code checks the backup flag, - * and if a backup cursor is open checkpoints aren't discarded. - * We release the lock as soon as we've set the flag, we don't - * want to block checkpoints, we just want to make sure no - * checkpoints are deleted. The checkpoint code holds the lock - * until it's finished the checkpoint, otherwise we could start - * a hot backup that would race with an already-started + * The hot backup copy is done outside of WiredTiger, which means file blocks can't be freed + * and re-allocated until the backup completes. The checkpoint code checks the backup flag, + * and if a backup cursor is open checkpoints aren't discarded. We release the lock as soon + * as we've set the flag, we don't want to block checkpoints, we just want to make sure no + * checkpoints are deleted. The checkpoint code holds the lock until it's finished the + * checkpoint, otherwise we could start a hot backup that would race with an already-started * checkpoint. 
* - * We are holding the checkpoint and schema locks so schema - * operations will not see the backup file list until it is - * complete and valid. + * We are holding the checkpoint and schema locks so schema operations will not see the + * backup file list until it is complete and valid. */ WT_WITH_HOTBACKUP_WRITE_LOCK(session, WT_CONN_HOTBACKUP_START(conn)); @@ -313,15 +309,13 @@ __backup_start(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, bool is_dup, cons /* Add the hot backup and standard WiredTiger files to the list. */ if (log_only) { /* - * If this is not a duplicate cursor, using the log target is an - * incremental backup. If this is a duplicate cursor then using - * the log target on an existing backup cursor means this cursor - * returns the current list of log files. That list was set up - * when parsing the URI so we don't have anything to do here. + * If this is not a duplicate cursor, using the log target is an incremental backup. If this + * is a duplicate cursor then using the log target on an existing backup cursor means this + * cursor returns the current list of log files. That list was set up when parsing the URI + * so we don't have anything to do here. * - * We also open an incremental backup source file so that we can - * detect a crash with an incremental backup existing in the - * source directory versus an improper destination. + * We also open an incremental backup source file so that we can detect a crash with an + * incremental backup existing in the source directory versus an improper destination. */ dest = WT_INCREMENTAL_BACKUP; WT_ERR(__wt_fopen(session, WT_INCREMENTAL_SRC, WT_FS_OPEN_CREATE, WT_STREAM_WRITE, &srcfs)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c index bf90ad7238e..84a39e9292d 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_ds.c +++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c @@ -87,15 +87,13 @@ __curds_cursor_resolve(WT_CURSOR *cursor, int ret) source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; /* - * Update the cursor's key, value and flags. (We use the _INT flags in - * the same way as file objects: there's some chance the underlying data - * source is passing us a reference to data only pinned per operation, - * might as well be safe.) + * Update the cursor's key, value and flags. (We use the _INT flags in the same way as file + * objects: there's some chance the underlying data source is passing us a reference to data + * only pinned per operation, might as well be safe.) * - * There's also a requirement the underlying data-source never returns - * with the cursor/source key referencing application memory: it'd be - * great to do a copy as necessary here so the data-source doesn't have - * to worry about copying the key, but we don't have enough information + * There's also a requirement the underlying data-source never returns with the cursor/source + * key referencing application memory: it'd be great to do a copy as necessary here so the + * data-source doesn't have to worry about copying the key, but we don't have enough information * to know if a cursor is pointing at application or data-source memory. 
*/ if (ret == 0) { diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c index 8ab7c58f263..e675392939c 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_index.c +++ b/src/third_party/wiredtiger/src/cursor/cur_index.c @@ -222,15 +222,14 @@ __curindex_search(WT_CURSOR *cursor) WT_ERR(child->next(child)); /* - * We expect partial matches, and want the smallest record with a key - * greater than or equal to the search key. + * We expect partial matches, and want the smallest record with a key greater than or equal to + * the search key. * - * If the key we find is shorter than the search key, it can't possibly - * match. + * If the key we find is shorter than the search key, it can't possibly match. * - * The only way for the key to be exactly equal is if there is an index - * on the primary key, because otherwise the primary key columns will - * be appended to the index key, but we don't disallow that (odd) case. + * The only way for the key to be exactly equal is if there is an index on the primary key, + * because otherwise the primary key columns will be appended to the index key, but we don't + * disallow that (odd) case. */ found_key = child->key; if (found_key.size < cursor->key.size) @@ -301,14 +300,14 @@ __curindex_search_near(WT_CURSOR *cursor, int *exact) } /* - * We expect partial matches, and want the smallest record with a key - * greater than or equal to the search key. + * We expect partial matches, and want the smallest record with a key greater than or equal to + * the search key. * - * If the found key starts with the search key, we indicate a match by - * setting exact equal to zero. + * If the found key starts with the search key, we indicate a match by setting exact equal to + * zero. * - * The compare function expects application-supplied keys to come first - * so we flip the sign of the result to match what callers expect. + * The compare function expects application-supplied keys to come first so we flip the sign of + * the result to match what callers expect. */ found_key = child->key; if (found_key.size > cursor->key.size) { diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c index c58e032cb80..5b2dc711a7e 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_join.c +++ b/src/third_party/wiredtiger/src/cursor/cur_join.c @@ -590,14 +590,12 @@ __curjoin_entry_member( if (entry->bloom != NULL) { /* - * If the item is not in the Bloom filter, we return - * immediately, otherwise, we still may need to check the - * long way, since it may be a false positive. + * If the item is not in the Bloom filter, we return immediately, otherwise, we still may + * need to check the long way, since it may be a false positive. * - * If we don't own the Bloom filter, we must be sharing one - * in a previous entry. So the shared filter has already - * been checked and passed, we don't need to check it again. - * We'll still need to check the long way. + * If we don't own the Bloom filter, we must be sharing one in a previous entry. So the + * shared filter has already been checked and passed, we don't need to check it again. We'll + * still need to check the long way. 
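The short-circuit the join comment above describes, in outline; bloom_contains and full_lookup are placeholders for the real filter probe and index search:

    #include <stdbool.h>
    #include <stddef.h>

    extern bool bloom_contains(const void *filter, const void *key); /* may false-positive */
    extern bool full_lookup(const void *index, const void *key);     /* exact, but slower */

    /*
     * A Bloom filter answers "definitely absent" or "maybe present": a miss
     * returns immediately, while a hit still requires the exact check.
     */
    static bool
    join_entry_member(const void *filter, const void *index, const void *key)
    {
        if (filter != NULL && !bloom_contains(filter, key))
            return (false);
        return (full_lookup(index, key));
    }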
*/ if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM)) WT_ERR(__wt_bloom_inmem_get(entry->bloom, key)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_metadata.c b/src/third_party/wiredtiger/src/cursor/cur_metadata.c index 9933122f13c..14e295cddd5 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_metadata.c +++ b/src/third_party/wiredtiger/src/cursor/cur_metadata.c @@ -264,13 +264,11 @@ __curmetadata_next(WT_CURSOR *cursor) WT_ERR(__curmetadata_metadata_search(session, cursor)); else { /* - * When applications open metadata cursors, they expect to see - * all schema-level operations reflected in the results. Query - * at read-uncommitted to avoid confusion caused by the current - * transaction state. + * When applications open metadata cursors, they expect to see all schema-level operations + * reflected in the results. Query at read-uncommitted to avoid confusion caused by the + * current transaction state. * - * Don't exit from the scan if we find an incomplete entry: - * just skip over it. + * Don't exit from the scan if we find an incomplete entry: just skip over it. */ for (;;) { WT_WITH_TXN_ISOLATION( diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c index fa2b52d254d..6140b453f86 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_std.c +++ b/src/third_party/wiredtiger/src/cursor/cur_std.c @@ -980,17 +980,15 @@ __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor) WT_ITEM key; /* - * Get a copy of the cursor's raw key, and set it in the new cursor, - * then search for that key to position the cursor. + * Get a copy of the cursor's raw key, and set it in the new cursor, then search for that key to + * position the cursor. * - * We don't clear the WT_ITEM structure: all that happens when getting - * and setting the key is the data/size fields are reset to reference - * the original cursor's key. + * We don't clear the WT_ITEM structure: all that happens when getting and setting the key is + * the data/size fields are reset to reference the original cursor's key. * - * That said, we're playing games with the cursor flags: setting the key - * sets the key/value application-set flags in the new cursor, which may - * or may not be correct, but there's nothing simple that fixes it. We - * depend on the subsequent cursor search to clean things up, as search + * That said, we're playing games with the cursor flags: setting the key sets the key/value + * application-set flags in the new cursor, which may or may not be correct, but there's nothing + * simple that fixes it. We depend on the subsequent cursor search to clean things up, as search * is required to copy and/or reference private memory after success. */ WT_RET(__wt_cursor_get_raw_key(to_dup, &key)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c index fdf10a558a4..94acee0592e 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_table.c +++ b/src/third_party/wiredtiger/src/cursor/cur_table.c @@ -495,11 +495,10 @@ __curtable_insert(WT_CURSOR *cursor) /* * Split out the first insert, it may be allocating a recno. * - * If the table has indices, we also need to know whether this record - * is replacing an existing record so that the existing index entries - * can be removed. We discover if this is an overwrite by configuring - * the primary cursor for no-overwrite, and checking if the insert - * detects a duplicate key. 
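Seen from the public API, the overwrite probe described in the __curtable_insert comment above looks roughly like this; the table URI and string key/value formats are illustrative, and error handling is elided as in the examples:

    WT_CURSOR *cursor;
    int ret;

    error_check(session->open_cursor(session, "table:access", NULL, "overwrite=false", &cursor));
    cursor->set_key(cursor, "key1");
    cursor->set_value(cursor, "value1");
    ret = cursor->insert(cursor);
    if (ret == WT_DUPLICATE_KEY) {
        /* Replacing an existing record: the old index entries must be removed. */
    } else
        error_check(ret);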
+ * If the table has indices, we also need to know whether this record is replacing an existing + * record so that the existing index entries can be removed. We discover if this is an overwrite + * by configuring the primary cursor for no-overwrite, and checking if the insert detects a + * duplicate key. */ cp = ctable->cg_cursors; primary = *cp++; @@ -675,12 +674,12 @@ __curtable_reserve(WT_CURSOR *cursor) JOINABLE_CURSOR_UPDATE_API_CALL(cursor, session, update); /* - * We don't have to open the indices here, but it makes the code similar - * to other cursor functions, and it's odd for a reserve call to succeed - * but the subsequent update fail opening indices. + * We don't have to open the indices here, but it makes the code similar to other cursor + * functions, and it's odd for a reserve call to succeed but the subsequent update fail opening + * indices. * - * Check for a transaction before index open, opening the indices will - * start a transaction if one isn't running. + * Check for a transaction before index open, opening the indices will start a transaction if + * one isn't running. */ WT_ERR(__wt_txn_context_check(session, true)); WT_ERR(__curtable_open_indices(ctable)); @@ -731,10 +730,9 @@ __wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop) /* * Step through the cursor range, removing the index entries. * - * If there are indices, copy the key we're using to step through the - * cursor range (so we can reset the cursor to its original position), - * then remove all of the index records in the truncated range. Copy - * the raw key because the memory is only valid until the cursor moves. + * If there are indices, copy the key we're using to step through the cursor range (so we can + * reset the cursor to its original position), then remove all of the index records in the + * truncated range. Copy the raw key because the memory is only valid until the cursor moves. */ if (ctable->table->nindices > 0) { if (start == NULL) { diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c index 7f916ca4a1e..b8ec59372dc 100644 --- a/src/third_party/wiredtiger/src/evict/evict_file.c +++ b/src/third_party/wiredtiger/src/evict/evict_file.c @@ -49,24 +49,20 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) page = ref->page; /* - * Eviction can fail when a page in the evicted page's subtree - * switches state. For example, if we don't evict a page marked - * empty, because we expect it to be merged into its parent, it - * might no longer be empty after it's reconciled, in which case - * eviction of its parent would fail. We can either walk the - * tree multiple times (until it's finally empty), or reconcile - * each page to get it to its final state before considering if - * it's an eviction target or will be merged into its parent. + * Eviction can fail when a page in the evicted page's subtree switches state. For example, + * if we don't evict a page marked empty, because we expect it to be merged into its parent, + * it might no longer be empty after it's reconciled, in which case eviction of its parent + * would fail. We can either walk the tree multiple times (until it's finally empty), or + * reconcile each page to get it to its final state before considering if it's an eviction + * target or will be merged into its parent. 
* - * Don't limit this test to any particular page type, that tends - * to introduce bugs when the reconciliation of other page types - * changes, and there's no advantage to doing so. + * Don't limit this test to any particular page type, that tends to introduce bugs when the + * reconciliation of other page types changes, and there's no advantage to doing so. * - * Eviction can also fail because an update cannot be written. - * If sessions have disjoint sets of files open, updates in a - * no-longer-referenced file may not yet be globally visible, - * and the write will fail with EBUSY. Our caller handles that - * error, retrying later. + * Eviction can also fail because an update cannot be written. If sessions have disjoint + * sets of files open, updates in a no-longer-referenced file may not yet be globally + * visible, and the write will fail with EBUSY. Our caller handles that error, retrying + * later. */ if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page)) WT_ERR(__wt_reconcile(session, ref, NULL, WT_REC_EVICT | WT_REC_VISIBLE_ALL, NULL)); @@ -85,8 +81,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) /* * Evict the page. * - * Ensure the ref state is restored to the previous - * value if eviction fails. + * Ensure the ref state is restored to the previous value if eviction fails. */ WT_ERR(__wt_evict(session, ref, ref->state, WT_EVICT_CALL_CLOSING)); break; diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 00d02886920..2f9f3220106 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -428,15 +428,14 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) return (0); #endif /* - * If we're stuck for 5 minutes in diagnostic mode, or the verbose - * evict_stuck flag is configured, log the cache and transaction state. + * If we're stuck for 5 minutes in diagnostic mode, or the verbose evict_stuck flag is + * configured, log the cache and transaction state. * * If we're stuck for 5 minutes in diagnostic mode, give up. * - * We don't do this check for in-memory workloads because application - * threads are not blocked by the cache being full. If the cache becomes - * full of clean pages, we can be servicing reads while the cache - * appears stuck to eviction. + * We don't do this check for in-memory workloads because application threads are not blocked by + * the cache being full. If the cache becomes full of clean pages, we can be servicing reads + * while the cache appears stuck to eviction. */ if (F_ISSET(conn, WT_CONN_IN_MEMORY)) return (0); @@ -580,8 +579,7 @@ __evict_update_work(WT_SESSION_IMPL *session) /* * If we need space in the cache, try to find clean pages to evict. * - * Avoid division by zero if the cache size has not yet been set in a - * shared cache. + * Avoid division by zero if the cache size has not yet been set in a shared cache. */ bytes_max = conn->cache_size + 1; bytes_inuse = __wt_cache_bytes_inuse(cache); @@ -681,14 +679,12 @@ __evict_pass(WT_SESSION_IMPL *session) ++cache->evict_pass_gen; /* - * Update the oldest ID: we use it to decide whether pages are - * candidates for eviction. Without this, if all threads are - * blocked after a long-running transaction (such as a + * Update the oldest ID: we use it to decide whether pages are candidates for eviction. 
+ * Without this, if all threads are blocked after a long-running transaction (such as a
 * checkpoint) completes, we may never start evicting again.
 *
- * Do this every time the eviction server wakes up, regardless
- * of whether the cache is full, to prevent the oldest ID
- * falling too far behind. Don't wait to lock the table: with
+ * Do this every time the eviction server wakes up, regardless of whether the cache is full,
+ * to prevent the oldest ID falling too far behind. Don't wait to lock the table: with
 * highly threaded workloads, that creates a bottleneck.
 */
 WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT));
@@ -704,14 +700,12 @@ __evict_pass(WT_SESSION_IMPL *session)
 WT_RET(__evict_lru_walk(session));
 /*
- * If the queue has been empty recently, keep queuing more
- * pages to evict. If the rate of queuing pages is high
- * enough, this score will go to zero, in which case the
- * eviction server might as well help out with eviction.
+ * If the queue has been empty recently, keep queuing more pages to evict. If the rate of
+ * queuing pages is high enough, this score will go to zero, in which case the eviction
+ * server might as well help out with eviction.
 *
- * Also, if there is a single eviction server thread with no
- * workers, it must service the urgent queue in case all
- * application threads are busy.
+ * Also, if there is a single eviction server thread with no workers, it must service the
+ * urgent queue in case all application threads are busy.
 */
 if (!WT_EVICT_HAS_WORKERS(session) &&
 (cache->evict_empty_score < WT_EVICT_SCORE_CUTOFF ||
@@ -722,16 +716,13 @@
 break;
 /*
- * If we're making progress, keep going; if we're not making
- * any progress at all, mark the cache "stuck" and go back to
- * sleep, it's not something we can fix.
+ * If we're making progress, keep going; if we're not making any progress at all, mark the
+ * cache "stuck" and go back to sleep, it's not something we can fix.
 *
- * We check for progress every 20ms, the idea being that the
- * aggressive score will reach 10 after 200ms if we aren't
- * making progress and eviction will start considering more
- * pages. If there is still no progress after 2s, we will
- * treat the cache as stuck and start rolling back
- * transactions and writing updates to the lookaside table.
+ * We check for progress every 20ms, the idea being that the aggressive score will reach 10
+ * after 200ms if we aren't making progress and eviction will start considering more pages.
+ * If there is still no progress after 2s, we will treat the cache as stuck and start
+ * rolling back transactions and writing updates to the lookaside table.
 */
 if (eviction_progress == cache->eviction_progress) {
 if (WT_CLOCKDIFF_MS(time_now, time_prev) >= 20 &&
 */
 if (loop < 100 || cache->evict_aggressive_score < 100) {
 /*
- * Back off if we aren't making progress: walks
- * hold the handle list lock, blocking other
- * operations that can free space in cache,
- * such as LSM discarding handles.
+ * Back off if we aren't making progress: walks hold the handle list lock, blocking
+ * other operations that can free space in cache, such as LSM discarding handles.
 *
- * Allow this wait to be interrupted (e.g. if a
- * checkpoint completes): make sure we wait for
- * a non-zero number of microseconds).
+ * Allow this wait to be interrupted (e.g. if a checkpoint completes): make sure we
+ * wait for a non-zero number of microseconds.
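An aside on the arithmetic above: at one progress check per 20ms, reaching an aggressive score of 10 after 200ms and declaring the cache stuck around 2s implies one score bump per failed check. A minimal sketch of that escalation, with hypothetical names (this is not the eviction server's actual code):

    #include <stdint.h>

    #define EVICT_CHECK_INTERVAL_MS 20 /* Progress check period */
    #define EVICT_AGGRESSIVE_AT 10     /* ~200ms without progress */
    #define EVICT_STUCK_AT 100         /* ~2s without progress */

    static void
    evict_score_update(
      uint64_t now_ms, uint64_t *prev_msp, uint64_t progress, uint64_t *prev_progressp, uint32_t *scorep)
    {
        if (now_ms - *prev_msp < EVICT_CHECK_INTERVAL_MS)
            return;
        *prev_msp = now_ms;
        if (progress == *prev_progressp) {
            if (*scorep < EVICT_STUCK_AT)
                ++*scorep; /* Reaches 10 after ~200ms, 100 after ~2s. */
        } else
            *scorep = 0; /* Progress resets the escalation. */
        *prev_progressp = progress;
    }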
*/ WT_STAT_CONN_INCR(session, cache_eviction_server_slept); __wt_cond_wait(session, cache->evict_cond, WT_THOUSAND, NULL); @@ -1183,8 +1171,8 @@ __evict_lru_walk(WT_SESSION_IMPL *session) /* * Get some more pages to consider for eviction. * - * If the walk is interrupted, we still need to sort the queue: the - * next walk assumes there are no entries beyond WT_EVICT_WALK_BASE. + * If the walk is interrupted, we still need to sort the queue: the next walk assumes there are + * no entries beyond WT_EVICT_WALK_BASE. */ if ((ret = __evict_walk(cache->walk_session, queue)) == EBUSY) ret = 0; @@ -1266,15 +1254,12 @@ __evict_lru_walk(WT_SESSION_IMPL *session) queue->evict_candidates = candidates; else { /* - * Take all of the urgent pages plus a third of - * ordinary candidates (which could be expressed as - * WT_EVICT_WALK_INCR / WT_EVICT_WALK_BASE). In the - * steady state, we want to get as many candidates as - * the eviction walk adds to the queue. + * Take all of the urgent pages plus a third of ordinary candidates (which could be + * expressed as WT_EVICT_WALK_INCR / WT_EVICT_WALK_BASE). In the steady state, we want + * to get as many candidates as the eviction walk adds to the queue. * - * That said, if there is only one entry, which is - * normal when populating an empty file, don't exclude - * it. + * That said, if there is only one entry, which is normal when populating an empty file, + * don't exclude it. */ queue->evict_candidates = 1 + candidates + ((entries - candidates) - 1) / 3; cache->read_gen_oldest = read_gen_oldest; @@ -1470,11 +1455,9 @@ retry: /* * Skip files if we have too many active walks. * - * This used to be limited by the configured maximum number of - * hazard pointers per session. Even though that ceiling has - * been removed, we need to test eviction with huge numbers of - * active trees before allowing larger numbers of hazard - * pointers in the walk session. + * This used to be limited by the configured maximum number of hazard pointers per session. + * Even though that ceiling has been removed, we need to test eviction with huge numbers of + * active trees before allowing larger numbers of hazard pointers in the walk session. */ if (btree->evict_ref == NULL && session->nhazard > WT_EVICT_MAX_TREES) continue; @@ -1492,16 +1475,14 @@ retry: dhandle_locked = false; /* - * Re-check the "no eviction" flag, used to enforce exclusive - * access when a handle is being closed. + * Re-check the "no eviction" flag, used to enforce exclusive access when a handle is being + * closed. * - * Only try to acquire the lock and simply continue if we fail; - * the lock is held while the thread turning off eviction clears - * the tree's current eviction point, and part of the process is - * waiting on this thread to acknowledge that action. + * Only try to acquire the lock and simply continue if we fail; the lock is held while the + * thread turning off eviction clears the tree's current eviction point, and part of the + * process is waiting on this thread to acknowledge that action. * - * If a handle is being discarded, it will still be marked open, - * but won't have a root page. + * If a handle is being discarded, it will still be marked open, but won't have a root page. 
 */
 if (btree->evict_disabled == 0 && !__wt_spin_trylock(session, &cache->evict_walk_lock)) {
 if (btree->evict_disabled == 0 && btree->root.page != NULL) {
@@ -1890,9 +1871,8 @@ __evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, u_int max_ent
 /*
 * Pages that are empty or from dead trees are fast-tracked.
 *
- * Also evict lookaside table pages without further filtering:
- * the cache is under pressure by definition and we want to
- * free space.
+ * Also evict lookaside table pages without further filtering: the cache is under pressure
+ * by definition and we want to free space.
 */
 if (__wt_page_is_empty(page) || F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
 F_ISSET(btree, WT_BTREE_LOOKASIDE))
@@ -1922,15 +1902,12 @@ __evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, u_int max_ent
 continue;
 /*
- * Don't attempt eviction of internal pages with children in
- * cache (indicated by seeing an internal page that is the
- * parent of the last page we saw).
+ * Don't attempt eviction of internal pages with children in cache (indicated by seeing an
+ * internal page that is the parent of the last page we saw).
 *
- * Also skip internal page unless we get aggressive, the tree
- * is idle (indicated by the tree being skipped for walks),
- * or we are in eviction debug mode.
- * The goal here is that if trees become completely idle, we
- * eventually push them out of cache completely.
+ * Also skip internal pages unless we get aggressive, the tree is idle (indicated by the tree
+ * being skipped for walks), or we are in eviction debug mode. The goal here is that if
+ * trees become completely idle, we eventually push them out of cache completely.
 */
 if (!F_ISSET(cache, WT_CACHE_EVICT_DEBUG_MODE) && WT_PAGE_IS_INTERNAL(page)) {
 if (page == last_parent)
@@ -1989,18 +1966,15 @@ fast:
 /*
 * Give up the walk occasionally.
 *
- * If we happen to end up on the root page or a page requiring urgent
- * eviction, clear it. We have to track hazard pointers, and the root
- * page complicates that calculation.
+ * If we happen to end up on the root page or a page requiring urgent eviction, clear it. We
+ * have to track hazard pointers, and the root page complicates that calculation.
 *
- * Likewise if we found no new candidates during the walk: there is no
- * point keeping a page pinned, since it may be the only candidate in
- * an idle tree.
+ * Likewise if we found no new candidates during the walk: there is no point keeping a page
+ * pinned, since it may be the only candidate in an idle tree.
 *
- * If we land on a page requiring forced eviction, or that isn't an
- * ordinary in-memory page (e.g., WT_REF_LIMBO), move until we find an
- * ordinary page: we should not prevent exclusive access to the page
- * until the next walk.
+ * If we land on a page requiring forced eviction, or that isn't an ordinary in-memory page
+ * (e.g., WT_REF_LIMBO), move until we find an ordinary page: we should not prevent exclusive
+ * access to the page until the next walk.
 */
 if (ref != NULL) {
 if (__wt_ref_is_root(ref) || evict == start || give_up ||
@@ -2066,13 +2040,12 @@ __evict_get_ref(WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_
 }
 /*
- * The server repopulates whenever the other queue is not full, as long
- * as at least one page has been evicted out of the current queue.
* - * Note that there are pathological cases where there are only enough - * eviction candidates in the cache to fill one queue. In that case, - * we will continually evict one page and attempt to refill the queues. - * Such cases are extremely rare in real applications. + * Note that there are pathological cases where there are only enough eviction candidates in the + * cache to fill one queue. In that case, we will continually evict one page and attempt to + * refill the queues. Such cases are extremely rare in real applications. */ if (is_server && (!urgent_ok || __evict_queue_empty(urgent_queue, false)) && !__evict_queue_full(cache->evict_current_queue) && @@ -2090,9 +2063,8 @@ __evict_get_ref(WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_ /* * Check if the current queue needs to change. * - * The server will only evict half of the pages before looking - * for more, but should only switch queues if there are no - * other eviction workers. + * The server will only evict half of the pages before looking for more, but should only + * switch queues if there are no other eviction workers. */ queue = cache->evict_current_queue; other_queue = cache->evict_other_queue; @@ -2138,14 +2110,13 @@ __evict_get_ref(WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_ WT_ASSERT(session, evict->btree != NULL); /* - * Evicting a dirty page in the server thread could stall - * during a write and prevent eviction from finding new work. + * Evicting a dirty page in the server thread could stall during a write and prevent + * eviction from finding new work. * - * However, we can't skip entries in the urgent queue or they - * may never be found again. + * However, we can't skip entries in the urgent queue or they may never be found again. * - * Don't force application threads to evict dirty pages if they - * aren't stalled by the amount of dirty data in cache. + * Don't force application threads to evict dirty pages if they aren't stalled by the amount + * of dirty data in cache. */ if (!urgent_ok && (is_server || !F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD)) && __wt_page_is_modified(evict->ref->page)) { @@ -2235,13 +2206,11 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) } /* - * In case something goes wrong, don't pick the same set of pages every - * time. + * In case something goes wrong, don't pick the same set of pages every time. * - * We used to bump the page's read generation only if eviction failed, - * but that isn't safe: at that point, eviction has already unlocked - * the page and some other thread may have evicted it by the time we - * look at it. + * We used to bump the page's read generation only if eviction failed, but that isn't safe: at + * that point, eviction has already unlocked the page and some other thread may have evicted it + * by the time we look at it. */ __wt_cache_read_gen_bump(session, ref->page); @@ -2319,11 +2288,10 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d /* * Check if we have become busy. * - * If we're busy (because of the transaction check we just did - * or because our caller is waiting on a longer-than-usual event - * such as a page read), and the cache level drops below 100%, - * limit the work to 5 evictions and return. If that's not the - * case, we can do more. 
+ * If we're busy (because of the transaction check we just did or because our caller is
+ * waiting on a longer-than-usual event such as a page read), and the cache level drops
+ * below 100%, limit the work to 5 evictions and return. If that's not the case, we can do
+ * more.
 */
 if (!busy && txn_state->pinned_id != WT_TXN_NONE && txn_global->current != txn_global->oldest_id)
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index 785c6219c6b..a13526302a2 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -260,14 +260,12 @@ __evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
 ndeleted = __wt_atomic_addv32(&pindex->deleted_entries, 1);
 /*
- * If more than 10% of the parent references are deleted, try a
- * reverse split. Don't bother if there is a single deleted
- * reference: the internal page is empty and we have to wait
+ * If more than 10% of the parent references are deleted, try a reverse split. Don't bother
+ * if there is a single deleted reference: the internal page is empty and we have to wait
 * for eviction to notice.
 *
- * This will consume the deleted ref (and eventually free it).
- * If the reverse split can't get the access it needs because
- * something is busy, be sure that the page still ends up
+ * This will consume the deleted ref (and eventually free it). If the reverse split can't
+ * get the access it needs because something is busy, be sure that the page still ends up
 * marked deleted.
 */
 if (ndeleted > pindex->entries / 10 && pindex->entries > 1) {
@@ -361,20 +359,19 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_
 break;
 case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
 /*
- * Either a split where we reconciled a page and it turned into
- * a lot of pages or an in-memory page that got too large, we
- * forcibly evicted it, and there wasn't anything to write.
+ * Either a split where we reconciled a page and it turned into a lot
+ * of pages or an in-memory page that got too large, we forcibly
+ * evicted it, and there wasn't anything to write.
 *
- * The latter is a special case of forced eviction. Imagine a
- * thread updating a small set keys on a leaf page. The page
- * is too large or has too many deleted items, so we try and
- * evict it, but after reconciliation there's only a small
- * amount of live data (so it's a single page we can't split),
- * and if there's an older reader somewhere, there's data on
- * the page we can't write (so the page can't be evicted). In
- * that case, we end up here with a single block that we can't
- * write. Take advantage of the fact we have exclusive access
- * to the page and rewrite it in memory.
+ * The latter is a special case of forced eviction. Imagine a thread
+ * updating a small set of keys on a leaf page. The page is too large or
+ * has too many deleted items, so we try and evict it, but after
+ * reconciliation there's only a small amount of live data (so it's a
+ * single page we can't split), and if there's an older reader
+ * somewhere, there's data on the page we can't write (so the page
+ * can't be evicted). In that case, we end up here with a single
+ * block that we can't write. Take advantage of the fact we have
+ * exclusive access to the page and rewrite it in memory.
*/ if (mod->mod_multi_entries == 1) { WT_ASSERT(session, closing == false); @@ -386,11 +383,11 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_ /* * Update the parent to reference the replacement page. * - * A page evicted with lookaside entries may not have an - * address, if no updates were visible to reconciliation. + * A page evicted with lookaside entries may not have an address, if no + * updates were visible to reconciliation. * - * Publish: a barrier to ensure the structure fields are set - * before the state change makes the page available to readers. + * Publish: a barrier to ensure the structure fields are set before the + * state change makes the page available to readers. */ if (mod->mod_replace.addr != NULL) { WT_RET(__wt_calloc_one(session, &addr)); @@ -486,19 +483,16 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) break; case WT_REF_DELETED: /* On-disk, deleted */ /* - * If the child page was part of a truncate, - * transaction rollback might switch this page into its - * previous state at any time, so the delete must be - * resolved before the parent can be evicted. + * If the child page was part of a truncate, transaction rollback might + * switch this page into its previous state at any time, so the delete + * must be resolved before the parent can be evicted. * - * We have the internal page locked, which prevents a - * search from descending into it. However, a walk - * from an adjacent leaf page could attempt to hazard - * couple into a child page and free the page_del - * structure as we are examining it. Flip the state to - * locked to make this check safe: if that fails, we - * have raced with a read and should give up on - * evicting the parent. + * We have the internal page locked, which prevents a search from + * descending into it. However, a walk from an adjacent leaf page could + * attempt to hazard couple into a child page and free the page_del + * structure as we are examining it. Flip the state to locked to make + * this check safe: if that fails, we have raced with a read and should + * give up on evicting the parent. */ if (!__wt_atomic_casv32(&child->state, WT_REF_DELETED, WT_REF_LOCKED)) return (__wt_set_return(session, EBUSY)); @@ -613,32 +607,29 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool /* * If the page is dirty, reconcile it to decide if we can evict it. * - * If we have an exclusive lock (we're discarding the tree), assert - * there are no updates we cannot read. + * If we have an exclusive lock (we're discarding the tree), assert there are no updates we + * cannot read. * - * Don't set any other flags for internal pages: there are no update - * lists to be saved and restored, changes can't be written into the - * lookaside table, nor can we re-create internal pages in memory. + * Don't set any other flags for internal pages: there are no update lists to be saved and + * restored, changes can't be written into the lookaside table, nor can we re-create internal + * pages in memory. * * For leaf pages: * * In-memory pages are a known configuration. * - * Set the update/restore flag, so reconciliation will write blocks it - * can write and create a list of skipped updates for blocks it cannot - * write, along with disk images. 
This is how eviction of active, huge
- * pages works: we take a big page and reconcile it into blocks, some of
- * which we write and discard, the rest of which we re-create as smaller
- * in-memory pages, (restoring the updates that stopped us from writing
- * the block), and inserting the whole mess into the page's parent. Set
- * the flag in all cases because the incremental cost of update/restore
- * in reconciliation is minimal, eviction shouldn't have picked a page
- * where update/restore is necessary, absent some cache pressure. It's
- * possible updates occurred after we selected this page for eviction,
- * but it's unlikely and we don't try and manage that risk.
+ * Set the update/restore flag, so reconciliation will write blocks it can write and create a
+ * list of skipped updates for blocks it cannot write, along with disk images. This is how
+ * eviction of active, huge pages works: we take a big page and reconcile it into blocks, some
+ * of which we write and discard, the rest of which we re-create as smaller in-memory pages
+ * (restoring the updates that stopped us from writing the block), and insert the whole mess
+ * into the page's parent. Set the flag in all cases because the incremental cost of
+ * update/restore in reconciliation is minimal, eviction shouldn't have picked a page where
+ * update/restore is necessary, absent some cache pressure. It's possible updates occurred after
+ * we selected this page for eviction, but it's unlikely and we don't try and manage that risk.
 *
- * Additionally, if we aren't trying to free space in the cache, scrub
- * the page and keep it in memory.
+ * Additionally, if we aren't trying to free space in the cache, scrub the page and keep it in
+ * memory.
 */
 cache = conn->cache;
 lookaside_retry = false;
@@ -697,11 +688,10 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool
 /*
 * Give up on eviction during a checkpoint if the page splits.
 *
- * We get here if checkpoint reads a page with lookaside entries: if
- * more of those entries are visible now than when the original
- * eviction happened, the page could split. In most workloads, this is
- * very unlikely. However, since checkpoint is partway through
- * reconciling the parent page, a split can corrupt the checkpoint.
+ * We get here if checkpoint reads a page with lookaside entries: if more of those entries are
+ * visible now than when the original eviction happened, the page could split. In most
+ * workloads, this is very unlikely. However, since checkpoint is partway through reconciling
+ * the parent page, a split can corrupt the checkpoint.
 */
 if (WT_SESSION_BTREE_SYNC(session) && page->modify->rec_result == WT_PM_REC_MULTIBLOCK)
 return (__wt_set_return(session, EBUSY));
diff --git a/src/third_party/wiredtiger/src/include/async.h b/src/third_party/wiredtiger/src/include/async.h
index 9a32ce6e0d2..16862a5a4e9 100644
--- a/src/third_party/wiredtiger/src/include/async.h
+++ b/src/third_party/wiredtiger/src/include/async.h
@@ -63,16 +63,15 @@ struct __wt_async {
 WT_ASYNC_OP_IMPL **async_queue; /* Async ops work queue */
 uint32_t async_qsize; /* Async work queue size */
 /*
- * We need to have two head and tail values. All but one is
+ * maintained as an ever increasing value to ease wrap around.
 *
- * alloc_head: the next one to allocate for producers.
- * head: the current head visible to consumers.
- * head is always <= alloc_head.
- * alloc_tail: the next slot for consumers to dequeue.
- * alloc_tail is always <= head.
- * tail_slot: the last slot consumed.
- * A producer may need wait for tail_slot to advance.
+ * alloc_head: the next one to allocate for producers.
+ * head: the current head visible to consumers. head is always <= alloc_head.
+ * alloc_tail: the next slot for consumers to dequeue. alloc_tail is always <= head.
+ * tail_slot: the last slot consumed. A producer may need to wait for tail_slot to advance.
 */
 uint64_t alloc_head; /* Next slot to enqueue */
 uint64_t head; /* Head visible to worker */
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index d168d10593c..2997fb064a8 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -166,14 +166,13 @@ struct __wt_ovfl_reuse {
 uint8_t addr_size; /* Overflow addr size */
 /*
- * On each page reconciliation, we clear the entry's in-use flag, and
- * reset it as the overflow record is re-used. After reconciliation
- * completes, unused skiplist entries are discarded, along with their
- * underlying blocks.
+ * On each page reconciliation, we clear the entry's in-use flag, and reset it as the overflow
+ * record is re-used. After reconciliation completes, unused skiplist entries are discarded, along
+ * with their underlying blocks.
 *
- * On each page reconciliation, set the just-added flag for each new
- * skiplist entry; if reconciliation fails for any reason, discard the
- * newly added skiplist entries, along with their underlying blocks.
+ * On each page reconciliation, set the just-added flag for each new skiplist entry; if
+ * reconciliation fails for any reason, discard the newly added skiplist entries, along with their
+ * underlying blocks.
 */
 /* AUTOMATIC FLAG VALUE GENERATION START */
#define WT_OVFL_REUSE_INUSE 0x1u
@@ -293,20 +292,17 @@ struct __wt_page_modify {
 size_t bytes_dirty;
 /*
- * When pages are reconciled, the result is one or more replacement
- * blocks. A replacement block can be in one of two states: it was
- * written to disk, and so we have a block address, or it contained
- * unresolved modifications and we have a disk image for it with a
- * list of those unresolved modifications. The former is the common
- * case: we only build lists of unresolved modifications when we're
- * evicting a page, and we only expect to see unresolved modifications
- * on a page being evicted in the case of a hot page that's too large
- * to keep in memory as it is. In other words, checkpoints will skip
- * unresolved modifications, and will write the blocks rather than
- * build lists of unresolved modifications.
+ * When pages are reconciled, the result is one or more replacement blocks. A replacement block
+ * can be in one of two states: it was written to disk, and so we have a block address, or it
+ * contained unresolved modifications and we have a disk image for it with a list of those
+ * unresolved modifications. The former is the common case: we only build lists of unresolved
+ * modifications when we're evicting a page, and we only expect to see unresolved modifications
+ * on a page being evicted in the case of a hot page that's too large to keep in memory as it
+ * is. In other words, checkpoints will skip unresolved modifications, and will write the blocks
+ * rather than build lists of unresolved modifications.
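A hedged sketch of how such ever-increasing head/tail counters map onto a fixed-size slot array: the 64-bit counters never wrap in practice, the invariants (alloc_tail <= head <= alloc_head) hold on the raw values, and a physical slot is derived by taking the counter modulo the queue size. The struct layout and modulo mapping below are illustrative assumptions, not the exact async implementation:

    #include <stdint.h>

    #define QUEUE_SLOTS 1024

    struct op_queue {
        void *slots[QUEUE_SLOTS];
        uint64_t alloc_head; /* Next slot producers allocate */
        uint64_t head;       /* Head visible to consumers: head <= alloc_head */
        uint64_t alloc_tail; /* Next slot consumers dequeue: alloc_tail <= head */
        uint64_t tail_slot;  /* Last slot consumed */
    };

    /* Map an ever-increasing counter onto a physical slot. */
    static inline uint32_t
    queue_slot(uint64_t counter)
    {
        return ((uint32_t)(counter % QUEUE_SLOTS));
    }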
* - * Ugly union/struct layout to conserve memory, we never have both - * a replace address and multiple replacement blocks. + * Ugly union/struct layout to conserve memory, we never have both a replace address and + * multiple replacement blocks. */ union { struct { /* Single, written replacement block */ @@ -346,13 +342,12 @@ struct __wt_page_modify { void *disk_image; /* - * List of unresolved updates. Updates are either a row-store - * insert or update list, or column-store insert list. When - * creating lookaside records, there is an additional value, - * the committed item's transaction information. + * List of unresolved updates. Updates are either a row-store insert or update list, + * or column-store insert list. When creating lookaside records, there is an + * additional value, the committed item's transaction information. * - * If there are unresolved updates, the block wasn't written and - * there will always be a disk image. + * If there are unresolved updates, the block wasn't written and there will always + * be a disk image. */ struct __wt_save_upd { WT_INSERT *ins; /* Insert list reference */ @@ -382,12 +377,11 @@ struct __wt_page_modify { } u1; /* - * Internal pages need to be able to chain root-page splits and have a - * special transactional eviction requirement. Column-store leaf pages - * need update and append lists. + * Internal pages need to be able to chain root-page splits and have a special transactional + * eviction requirement. Column-store leaf pages need update and append lists. * - * Ugly union/struct layout to conserve memory, a page is either a leaf - * page or an internal page. + * Ugly union/struct layout to conserve memory, a page is either a leaf page or an internal + * page. */ union { struct { @@ -564,12 +558,12 @@ struct __wt_page { #define pg_intl_split_gen u.intl.split_gen /* - * Macros to copy/set the index because the name is obscured to ensure - * the field isn't read multiple times. + * Macros to copy/set the index because the name is obscured to ensure the field isn't read multiple + * times. * - * There are two versions of WT_INTL_INDEX_GET because the session split - * generation is usually set, but it's not always required: for example, - * if a page is locked for splitting, or being created or destroyed. + * There are two versions of WT_INTL_INDEX_GET because the session split generation is usually set, + * but it's not always required: for example, if a page is locked for splitting, or being created or + * destroyed. */ #define WT_INTL_INDEX_GET_SAFE(page) ((page)->u.intl.__index) #define WT_INTL_INDEX_GET(session, page, pindex) \ @@ -624,15 +618,12 @@ struct __wt_page { WT_COL *col_var; /* Values */ /* - * Variable-length column-store pages have an array - * of page entries with RLE counts greater than 1 when - * reading the page, so it's not necessary to walk the - * page counting records to find a specific entry. We - * can do a binary search in this array, then an offset - * calculation to find the cell. + * Variable-length column-store pages have an array of page entries with RLE counts + * greater than 1 when reading the page, so it's not necessary to walk the page counting + * records to find a specific entry. We can do a binary search in this array, then an + * offset calculation to find the cell. * - * It's a separate structure to keep the page structure - * as small as possible. + * It's a separate structure to keep the page structure as small as possible. 
*/ struct __wt_col_var_repeat { uint32_t nrepeats; /* repeat slots */ @@ -649,11 +640,11 @@ struct __wt_page { } u; /* - * Page entries, type and flags are positioned at the end of the WT_PAGE - * union to reduce cache misses in the row-store search function. + * Page entries, type and flags are positioned at the end of the WT_PAGE union to reduce cache + * misses in the row-store search function. * - * The entries field only applies to leaf pages, internal pages use the - * page-index entries instead. + * The entries field only applies to leaf pages, internal pages use the page-index entries + * instead. */ uint32_t entries; /* Leaf page entries */ @@ -1010,14 +1001,13 @@ struct __wt_row { /* On-page key, on-page cell, or off-page WT_IKEY */ */ struct __wt_col { /* - * Variable-length column-store data references are page offsets, not - * pointers (we boldly re-invent short pointers). The trade-off is 4B - * per K/V pair on a 64-bit machine vs. a single cycle for the addition - * of a base pointer. The on-page data is a WT_CELL (same as row-store + * Variable-length column-store data references are page offsets, not pointers (we boldly + * re-invent short pointers). The trade-off is 4B per K/V pair on a 64-bit machine vs. a single + * cycle for the addition of a base pointer. The on-page data is a WT_CELL (same as row-store * pages). * - * Obscure the field name, code shouldn't use WT_COL->__col_value, the - * public interface is WT_COL_PTR and WT_COL_PTR_SET. + * Obscure the field name, code shouldn't use WT_COL->__col_value, the public interface is + * WT_COL_PTR and WT_COL_PTR_SET. */ uint32_t __col_value; }; @@ -1143,33 +1133,28 @@ struct __wt_update { /* * WT_INSERT -- * - * Row-store leaf pages support inserts of new K/V pairs. When the first K/V - * pair is inserted, the WT_INSERT_HEAD array is allocated, with one slot for - * every existing element in the page, plus one additional slot. A slot points - * to a WT_INSERT_HEAD structure for the items which sort after the WT_ROW - * element that references it and before the subsequent WT_ROW element; the - * skiplist structure has a randomly chosen depth of next pointers in each - * inserted node. + * Row-store leaf pages support inserts of new K/V pairs. When the first K/V pair is inserted, the + * WT_INSERT_HEAD array is allocated, with one slot for every existing element in the page, plus one + * additional slot. A slot points to a WT_INSERT_HEAD structure for the items which sort after the + * WT_ROW element that references it and before the subsequent WT_ROW element; the skiplist + * structure has a randomly chosen depth of next pointers in each inserted node. * - * The additional slot is because it's possible to insert items smaller than any - * existing key on the page: for that reason, the first slot of the insert array - * holds keys smaller than any other key on the page. - * - * In column-store variable-length run-length encoded pages, a single indx - * entry may reference a large number of records, because there's a single - * on-page entry representing many identical records. (We don't expand those - * entries when the page comes into memory, as that would require resources as - * pages are moved to/from the cache, including read-only files.) 
Instead, a - * single indx entry represents all of the identical records originally found + * The additional slot is because it's possible to insert items smaller than any existing key on the + * page: for that reason, the first slot of the insert array holds keys smaller than any other key * on the page. * - * Modifying (or deleting) run-length encoded column-store records is hard - * because the page's entry no longer references a set of identical items. We - * handle this by "inserting" a new entry into the insert array, with its own - * record number. (This is the only case where it's possible to insert into a - * column-store: only appends are allowed, as insert requires re-numbering - * subsequent records. Berkeley DB did support mutable records, but it won't - * scale and it isn't useful enough to re-implement, IMNSHO.) + * In column-store variable-length run-length encoded pages, a single indx entry may reference a + * large number of records, because there's a single on-page entry representing many identical + * records. (We don't expand those entries when the page comes into memory, as that would require + * resources as pages are moved to/from the cache, including read-only files.) Instead, a single + * indx entry represents all of the identical records originally found on the page. + * + * Modifying (or deleting) run-length encoded column-store records is hard because the page's entry + * no longer references a set of identical items. We handle this by "inserting" a new entry into the + * insert array, with its own record number. (This is the only case where it's possible to insert + * into a column-store: only appends are allowed, as insert requires re-numbering subsequent + * records. Berkeley DB did support mutable records, but it won't scale and it isn't useful enough + * to re-implement, IMNSHO.) */ struct __wt_insert { WT_UPDATE *upd; /* value */ @@ -1282,17 +1267,15 @@ struct __wt_insert_head { ++(i), (v) = __bit_getv(WT_PAGE_HEADER_BYTE(btree, dsk), i, (btree)->bitcnt)) /* - * Manage split generation numbers. Splits walk the list of sessions to check - * when it is safe to free structures that have been replaced. We also check - * that list periodically (e.g., when wrapping up a transaction) to free any - * memory we can. + * Manage split generation numbers. Splits walk the list of sessions to check when it is safe to + * free structures that have been replaced. We also check that list periodically (e.g., when + * wrapping up a transaction) to free any memory we can. * - * Before a thread enters code that will examine page indexes (which are - * swapped out by splits), it publishes a copy of the current split generation - * into its session. Don't assume that threads never re-enter this code: if we - * already have a split generation, leave it alone. If our caller is examining - * an index, we don't want the oldest split generation to move forward and - * potentially free it. + * Before a thread enters code that will examine page indexes (which are swapped out by splits), it + * publishes a copy of the current split generation into its session. Don't assume that threads + * never re-enter this code: if we already have a split generation, leave it alone. If our caller is + * examining an index, we don't want the oldest split generation to move forward and potentially + * free it. 
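A usage sketch of the split-generation pattern just described: publish a generation before touching a page index, and leave an already-published generation alone on re-entry. WT_ENTER_PAGE_INDEX is defined immediately below; the matching WT_LEAVE_PAGE_INDEX, the index walk, and the examine() helper are assumed shapes, not code from this diff:

    WT_PAGE_INDEX *pindex;
    uint32_t i;

    WT_ENTER_PAGE_INDEX(session);
    /* Safe: our split generation is published, so this index can't be freed under us. */
    WT_INTL_INDEX_GET(session, parent_page, pindex);
    for (i = 0; i < pindex->entries; ++i)
        examine(pindex->index[i]); /* Hypothetical per-slot work. */
    WT_LEAVE_PAGE_INDEX(session);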
 */
#define WT_ENTER_PAGE_INDEX(session) \
 do { \
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index 248297e6f26..e5d091112a4 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -26,22 +26,19 @@
#define WT_BTREE_PAGE_SIZE_MAX (512 * WT_MEGABYTE)
 /*
- * The length of variable-length column-store values and row-store keys/values
- * are stored in a 4B type, so the largest theoretical key/value item is 4GB.
- * However, in the WT_UPDATE structure we use the UINT32_MAX size as a "deleted"
- * flag, and second, the size of an overflow object is constrained by what an
- * underlying block manager can actually write. (For example, in the default
- * block manager, writing an overflow item includes the underlying block's page
- * header and block manager specific structure, aligned to an allocation-sized
- * unit). The btree engine limits the size of a single object to (4GB - 1KB);
- * that gives us additional bytes if we ever want to store a structure length
- * plus the object size in 4B, or if we need additional flag values. Attempts
- * to store large key/value items in the tree trigger an immediate check to the
- * block manager, to make sure it can write the item. Storing 4GB objects in a
- * btree borders on clinical insanity, anyway.
+ * The lengths of variable-length column-store values and row-store keys/values are stored in a 4B
+ * type, so the largest theoretical key/value item is 4GB. However, first, in the WT_UPDATE
+ * structure we use the UINT32_MAX size as a "deleted" flag, and second, the size of an overflow
+ * object is constrained by what an underlying block manager can actually write. (For example, in
+ * the default block manager, writing an overflow item includes the underlying block's page header
+ * and block manager specific structure, aligned to an allocation-sized unit). The btree engine
+ * limits the size of a single object to (4GB - 1KB); that gives us additional bytes if we ever
+ * want to store a structure length plus the object size in 4B, or if we need additional flag
+ * values. Attempts to store large key/value items in the tree trigger an immediate check to the
+ * block manager, to make sure it can write the item. Storing 4GB objects in a btree borders on
+ * clinical insanity, anyway.
 *
- * Record numbers are stored in 64-bit unsigned integers, meaning the largest
- * record number is "really, really big".
+ * Record numbers are stored in 64-bit unsigned integers, meaning the largest record number is
+ * "really, really big".
 */
#define WT_BTREE_MAX_OBJECT_SIZE ((uint32_t)(UINT32_MAX - 1024))
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 2fa3e0d94d3..69bb0dec90a 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -476,34 +476,29 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
 last_running = S2C(session)->txn_global.last_running;
 /*
- * We depend on the atomic operation being a write barrier, that is, a
- * barrier to ensure all changes to the page are flushed before updating
- * the page state and/or marking the tree dirty, otherwise checkpoints
- * and/or page reconciliation might be looking at a clean page/tree.
+ * We depend on the atomic operation being a write barrier, that is, a barrier to ensure all + * changes to the page are flushed before updating the page state and/or marking the tree dirty, + * otherwise checkpoints and/or page reconciliation might be looking at a clean page/tree. * - * Every time the page transitions from clean to dirty, update the cache - * and transactional information. + * Every time the page transitions from clean to dirty, update the cache and transactional + * information. * - * The page state can only ever be incremented above dirty by the number - * of concurrently running threads, so the counter will never approach - * the point where it would wrap. + * The page state can only ever be incremented above dirty by the number of concurrently running + * threads, so the counter will never approach the point where it would wrap. */ if (page->modify->page_state < WT_PAGE_DIRTY && __wt_atomic_add32(&page->modify->page_state, 1) == WT_PAGE_DIRTY_FIRST) { __wt_cache_dirty_incr(session, page); /* - * We won the race to dirty the page, but another thread could - * have committed in the meantime, and the last_running field - * been updated past it. That is all very unlikely, but not - * impossible, so we take care to read the global state before - * the atomic increment. + * We won the race to dirty the page, but another thread could have committed in the + * meantime, and the last_running field been updated past it. That is all very unlikely, but + * not impossible, so we take care to read the global state before the atomic increment. * - * If the page was dirty on entry, then last_running == 0. The - * page could have become clean since then, if reconciliation - * completed. In that case, we leave the previous value for - * first_dirty_txn rather than potentially racing to update it, - * at worst, we'll unnecessarily write a page in a checkpoint. + * If the page was dirty on entry, then last_running == 0. The page could have become clean + * since then, if reconciliation completed. In that case, we leave the previous value for + * first_dirty_txn rather than potentially racing to update it, at worst, we'll + * unnecessarily write a page in a checkpoint. */ if (last_running != 0) page->modify->first_dirty_txn = last_running; @@ -524,10 +519,9 @@ __wt_tree_modify_set(WT_SESSION_IMPL *session) /* * Test before setting the dirty flag, it's a hot cache line. * - * The tree's modified flag is cleared by the checkpoint thread: set it - * and insert a barrier before dirtying the page. (I don't think it's - * a problem if the tree is marked dirty with all the pages clean, it - * might result in an extra checkpoint that doesn't do any work but it + * The tree's modified flag is cleared by the checkpoint thread: set it and insert a barrier + * before dirtying the page. (I don't think it's a problem if the tree is marked dirty with all + * the pages clean, it might result in an extra checkpoint that doesn't do any work but it * shouldn't cause problems; regardless, let's play it safe.) */ if (!S2BT(session)->modified) { @@ -554,21 +548,19 @@ static inline void __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page) { /* - * The page must be held exclusive when this call is made, this call - * can only be used when the page is owned by a single thread. + * The page must be held exclusive when this call is made, this call can only be used when the + * page is owned by a single thread. * * Allow the call to be made on clean pages. 
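A generic analogue of the clean-to-dirty transition described above, using C11 atomics: whichever thread's increment lands exactly on the "first dirty" value wins and performs the one-time accounting. This is a sketch of the pattern only, not WiredTiger's code:

    #include <stdatomic.h>
    #include <stdbool.h>

    enum { PAGE_CLEAN = 0, PAGE_DIRTY_FIRST = 1 };

    static bool
    page_mark_dirty(atomic_uint *page_state)
    {
        /* Only the increment that reaches PAGE_DIRTY_FIRST wins the race. */
        if (atomic_load(page_state) < PAGE_DIRTY_FIRST &&
          atomic_fetch_add(page_state, 1) + 1 == PAGE_DIRTY_FIRST)
            return (true); /* Caller does the dirty-bytes accounting exactly once. */
        return (false);
    }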
*/ if (__wt_page_is_modified(page)) { /* - * The only part where ordering matters is during - * reconciliation where updates on other threads are performing - * writes to the page state that need to be visible to the + * The only part where ordering matters is during reconciliation where updates on other + * threads are performing writes to the page state that need to be visible to the * reconciliation thread. * - * Since clearing of the page state is not going to be happening - * during reconciliation on a separate thread, there's no write - * barrier needed here. + * Since clearing of the page state is not going to be happening during reconciliation on a + * separate thread, there's no write barrier needed here. */ page->modify->page_state = WT_PAGE_CLEAN; __wt_cache_dirty_decr(session, page); @@ -1067,9 +1059,8 @@ __wt_ref_info( page = ref->home; /* - * If NULL, there is no location. - * If off-page, the pointer references a WT_ADDR structure. - * If on-page, the pointer references a cell. + * If NULL, there is no location. If off-page, the pointer references a WT_ADDR structure. If + * on-page, the pointer references a cell. * * The type is of a limited set: internal, leaf or no-overflow leaf. */ @@ -1214,16 +1205,14 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) return (false); /* - * Check for pages with append-only workloads. A common application - * pattern is to have multiple threads frantically appending to the - * tree. We want to reconcile and evict this page, but we'd like to - * do it without making the appending threads wait. See if it's worth - * doing a split to let the threads continue before doing eviction. + * Check for pages with append-only workloads. A common application pattern is to have multiple + * threads frantically appending to the tree. We want to reconcile and evict this page, but we'd + * like to do it without making the appending threads wait. See if it's worth doing a split to + * let the threads continue before doing eviction. * - * Ignore anything other than large, dirty leaf pages. We depend on the - * page being dirty for correctness (the page must be reconciled again - * before being evicted after the split, information from a previous - * reconciliation will be wrong, so we can't evict immediately). + * Ignore anything other than large, dirty leaf pages. We depend on the page being dirty for + * correctness (the page must be reconciled again before being evicted after the split, + * information from a previous reconciliation will be wrong, so we can't evict immediately). */ if (page->memory_footprint < btree->splitmempage) return (false); @@ -1384,15 +1373,13 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) } /* - * If a split created new internal pages, those newly created internal - * pages cannot be evicted until all threads are known to have exited - * the original parent page's index, because evicting an internal page - * discards its WT_REF array, and a thread traversing the original - * parent page index might see a freed WT_REF. + * If a split created new internal pages, those newly created internal pages cannot be evicted + * until all threads are known to have exited the original parent page's index, because evicting + * an internal page discards its WT_REF array, and a thread traversing the original parent page + * index might see a freed WT_REF. * - * One special case where we know this is safe is if the handle is - * locked exclusive (e.g., when the whole tree is being evicted). 
In
- * that case, no readers can be looking at an old index.
+ * One special case where we know this is safe is if the handle is locked exclusive (e.g., when
+ * the whole tree is being evicted). In that case, no readers can be looking at an old index.
 */
 if (WT_PAGE_IS_INTERNAL(page) && !F_ISSET(session->dhandle, WT_DHANDLE_EXCLUSIVE) &&
 __wt_gen_active(session, WT_GEN_SPLIT, page->pg_intl_split_gen))
@@ -1434,20 +1421,18 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
 return (0);
 /*
- * Attempt to evict pages with the special "oldest" read generation.
- * This is set for pages that grow larger than the configured
- * memory_page_max setting, when we see many deleted items, and when we
- * are attempting to scan without trashing the cache.
+ * Attempt to evict pages with the special "oldest" read generation. This is set for pages that
+ * grow larger than the configured memory_page_max setting, when we see many deleted items, and
+ * when we are attempting to scan without trashing the cache.
 *
- * Checkpoint should not queue pages for urgent eviction if they require
- * dirty eviction: there is a special exemption that allows checkpoint
- * to evict dirty pages in a tree that is being checkpointed, and no
- * other thread can help with that. Checkpoints don't rely on this code
- * for dirty eviction: that is handled explicitly in __wt_sync_file.
+ * Checkpoint should not queue pages for urgent eviction if they require dirty eviction: there
+ * is a special exemption that allows checkpoint to evict dirty pages in a tree that is being
+ * checkpointed, and no other thread can help with that. Checkpoints don't rely on this code for
+ * dirty eviction: that is handled explicitly in __wt_sync_file.
 *
- * If the operation has disabled eviction or splitting, or the session
- * is preventing from reconciling, then just queue the page for urgent
- * eviction. Otherwise, attempt to release and evict it.
+ * If the operation has disabled eviction or splitting, or the session is prevented from
+ * reconciling, then just queue the page for urgent eviction. Otherwise, attempt to release and
+ * evict it.
 */
 page = ref->page;
 if (WT_READGEN_EVICT_SOON(page->read_gen) && btree->evict_disabled == 0 &&
diff --git a/src/third_party/wiredtiger/src/include/btree_cmp.i b/src/third_party/wiredtiger/src/include/btree_cmp.i
index 2f0596bed13..7a77c74db9e 100644
--- a/src/third_party/wiredtiger/src/include/btree_cmp.i
+++ b/src/third_party/wiredtiger/src/include/btree_cmp.i
@@ -225,12 +225,10 @@ __wt_lex_compare_short(const WT_ITEM *user_item, const WT_ITEM *tree_item)
 treep = tree_item->data;
 /*
- * The maximum packed uint64_t is 9B, catch row-store objects using
- * packed record numbers as keys.
+ * The maximum packed uint64_t is 9B, catch row-store objects using packed record numbers as keys.
 *
- * Don't use a #define to compress this case statement: gcc7 complains
- * about implicit fallthrough and doesn't support explicit fallthrough
- * comments in macros.
+ * Don't use a #define to compress this case statement: gcc7 complains about implicit fallthrough
+ * and doesn't support explicit fallthrough comments in macros.
*/ #define WT_COMPARE_SHORT_MAXLEN 9 switch (len) { diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i index a4a762eae7f..fa770d49c4e 100644 --- a/src/third_party/wiredtiger/src/include/cache.i +++ b/src/third_party/wiredtiger/src/include/cache.i @@ -343,10 +343,9 @@ __wt_eviction_needed(WT_SESSION_IMPL *session, bool busy, bool readonly, double /* * Only check the dirty trigger when the session is not busy. * - * In other words, once we are pinning resources, try to finish the - * operation as quickly as possible without exceeding the cache size. - * The next transaction in this session will not be able to start until - * the cache is under the limit. + * In other words, once we are pinning resources, try to finish the operation as quickly as + * possible without exceeding the cache size. The next transaction in this session will not be + * able to start until the cache is under the limit. */ return (clean_needed || (!busy && dirty_needed)); } diff --git a/src/third_party/wiredtiger/src/include/cell.h b/src/third_party/wiredtiger/src/include/cell.h index ca9e8e50e91..a93fc3dabd0 100644 --- a/src/third_party/wiredtiger/src/include/cell.h +++ b/src/third_party/wiredtiger/src/include/cell.h @@ -84,17 +84,15 @@ #define WT_CELL_TXN_STOP 0x10 /* Newest-stop txn ID */ /* - * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf - * block location, and WT_CELL_ADDR_LEAF_NO is a leaf block location where the - * page has no overflow items. (The goal is to speed up truncation as we don't - * have to read pages without overflow items in order to delete them. Note, - * WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without - * overflow items, the only guarantee is that if set, the page has no overflow - * items.) - * - * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting - * value dictionaries: if the two values are the same, we only store them once - * and have any second and subsequent uses reference the original. + * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf block location, and + * WT_CELL_ADDR_LEAF_NO is a leaf block location where the page has no overflow items. (The goal is + * to speed up truncation as we don't have to read pages without overflow items in order to delete + * them. Note, WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without overflow + * items, the only guarantee is that if set, the page has no overflow items.) + * + * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting value dictionaries: + * if the two values are the same, we only store them once and have any second and subsequent uses + * reference the original. */ #define WT_CELL_ADDR_DEL (0) /* Address: deleted */ #define WT_CELL_ADDR_INT (1 << 4) /* Address: internal */ diff --git a/src/third_party/wiredtiger/src/include/column.i b/src/third_party/wiredtiger/src/include/column.i index d039386245c..138a185fa42 100644 --- a/src/third_party/wiredtiger/src/include/column.i +++ b/src/third_party/wiredtiger/src/include/column.i @@ -46,17 +46,14 @@ __col_insert_search_gt(WT_INSERT_HEAD *ins_head, uint64_t recno) } /* - * If we didn't find any records greater than or equal to the target, - * we never set the return value, set it to the first record in the - * list. + * If we didn't find any records greater than or equal to the target, we never set the return + * value, set it to the first record in the list. 
 *
- * Otherwise, it references a record less-than-or-equal to the target,
- * move to a later record, that is, a subsequent record greater than
- * the target. Because inserts happen concurrently, additional records
- * might be inserted after the searched-for record that are still
- * smaller than the target, continue to move forward until reaching a
- * record larger than the target. There isn't any safety testing
- * because we confirmed such a record exists before searching.
+ * Otherwise, it references a record less-than-or-equal to the target, move to a later record,
+ * that is, a subsequent record greater than the target. Because inserts happen concurrently,
+ * additional records might be inserted after the searched-for record that are still smaller
+ * than the target, continue to move forward until reaching a record larger than the target.
+ * There isn't any safety testing because we confirmed such a record exists before searching.
 */
 if ((ins = ret_ins) == NULL)
 ins = WT_SKIP_FIRST(ins_head);
@@ -282,11 +279,10 @@ __col_var_search(WT_REF *ref, uint64_t recno, uint64_t *start_recnop)
 /*
 * Find the matching slot.
 *
- * This is done in two stages: first, we do a binary search among any
- * repeating records to find largest repeating less than the search key.
- * Once there, we can do a simple offset calculation to find the correct
- * slot for this record number, because we know any intervening records
- * have repeat counts of 1.
+ * This is done in two stages: first, we do a binary search among any repeating records to find
+ * the largest repeating record less than the search key. Once there, we can do a simple offset
+ * calculation to find the correct slot for this record number, because we know any intervening
+ * records have repeat counts of 1.
 */
 for (base = 0, limit = WT_COL_VAR_REPEAT_SET(page) ? page->pg_var_nrepeats : 0; limit != 0; limit >>= 1) {
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index 32becc05467..db4b2e9b41e 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -250,15 +250,13 @@ struct __wt_connection_impl {
 uint32_t open_cursor_count; /* Atomic: open cursor handle count */
 /*
- * WiredTiger allocates space for 50 simultaneous sessions (threads of
- * control) by default. Growing the number of threads dynamically is
- * possible, but tricky since server threads are walking the array
- * without locking it.
+ * WiredTiger allocates space for 50 simultaneous sessions (threads of control) by default.
+ * Growing the number of threads dynamically is possible, but tricky since server threads are
+ * walking the array without locking it.
 *
- * There's an array of WT_SESSION_IMPL pointers that reference the
- * allocated array; we do it that way because we want an easy way for
- * the server thread code to avoid walking the entire array when only a
- * few threads are running.
+ * There's an array of WT_SESSION_IMPL pointers that reference the allocated array; we do it
+ * that way because we want an easy way for the server thread code to avoid walking the entire
+ * array when only a few threads are running.
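A sketch of the two-stage variable-length column-store search described above: binary-search a sorted array of repeat entries for the largest group starting at or before the target record number, then compute the slot by offset, since intervening records have repeat counts of 1. The types and names are hypothetical simplifications of the real page structures:

    #include <stdint.h>

    struct repeat {
        uint64_t recno; /* First record number in the repeating group */
        uint32_t slot;  /* Page slot of that group */
        uint32_t rle;   /* Repeat count */
    };

    static uint32_t
    var_search(const struct repeat *reps, uint32_t nreps, uint64_t first_recno, uint64_t recno)
    {
        uint32_t base, indx, limit;
        const struct repeat *match = NULL;

        /* Stage 1: largest repeating group starting at or before recno. */
        for (base = 0, limit = nreps; limit != 0; limit >>= 1) {
            indx = base + (limit >> 1);
            if (reps[indx].recno <= recno) {
                match = &reps[indx];
                base = indx + 1;
                --limit;
            }
        }
        /* Stage 2: offset calculation. */
        if (match == NULL) /* Before the first group: singletons only. */
            return ((uint32_t)(recno - first_recno));
        if (recno < match->recno + match->rle) /* Inside the group's single slot. */
            return (match->slot);
        /* Past the group: everything between it and recno is a singleton. */
        return (match->slot + 1 + (uint32_t)(recno - (match->recno + match->rle)));
    }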
*/ WT_SESSION_IMPL *sessions; /* Session reference */ uint32_t session_size; /* Session array size */ diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index b52bd2c86ca..b3d32ad8417 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -265,26 +265,22 @@ struct __wt_cursor_index { }; /* - * A join iterator structure is used to generate candidate primary keys. It - * is the responsibility of the caller of the iterator to filter these - * primary key against the other conditions of the join before returning - * them the caller of WT_CURSOR::next. + * A join iterator structure is used to generate candidate primary keys. It is the responsibility of + * the caller of the iterator to filter these primary keys against the other conditions of the join + * before returning them to the caller of WT_CURSOR::next. * - * For a conjunction join (the default), entry_count will be 1, meaning that - * the iterator only consumes the first entry (WT_CURSOR_JOIN_ENTRY). That - * is, it successively returns primary keys from a cursor for the first - * index that was joined. When the values returned by that cursor are - * exhausted, the iterator has completed. For a disjunction join, - * exhausting a cursor just means that the iterator advances to the next - * entry. If the next entry represents an index, a new cursor is opened and - * primary keys from that index are then successively returned. + * For a conjunction join (the default), entry_count will be 1, meaning that the iterator only + * consumes the first entry (WT_CURSOR_JOIN_ENTRY). That is, it successively returns primary keys + * from a cursor for the first index that was joined. When the values returned by that cursor are + * exhausted, the iterator has completed. For a disjunction join, exhausting a cursor just means + * that the iterator advances to the next entry. If the next entry represents an index, a new cursor + * is opened and primary keys from that index are then successively returned. * - * When positioned on an entry that represents a nested join, a new child - * iterator is created that will be bound to the nested WT_CURSOR_JOIN. - * That iterator is then used to generate candidate primary keys. When its - * iteration is completed, that iterator is destroyed and the parent - * iterator advances to the next entry. Thus, depending on how deeply joins - * are nested, a similarly deep stack of iterators is created. + * When positioned on an entry that represents a nested join, a new child iterator is created that + * will be bound to the nested WT_CURSOR_JOIN. That iterator is then used to generate candidate + * primary keys. When its iteration is completed, that iterator is destroyed and the parent iterator + * advances to the next entry. Thus, depending on how deeply joins are nested, a similarly deep + * stack of iterators is created.
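A toy model of the entry-consumption rule described above may help; everything here (types and names) is hypothetical and only mirrors the conjunction/disjunction behavior, not the WT_CURSOR_JOIN implementation.

    /* Each entry stands in for one joined index's cursor of candidate keys. */
    struct toy_entry { int *keys; int nkeys; };
    struct toy_iter  { struct toy_entry *entries; int entry_count, entry, pos; };

    /* Return the next candidate primary key, or -1 when iteration completes. */
    static int
    toy_iter_next(struct toy_iter *it)
    {
        while (it->entry < it->entry_count) {
            struct toy_entry *e = &it->entries[it->entry];
            if (it->pos < e->nkeys)
                return (e->keys[it->pos++]);
            /*
             * Cursor exhausted: a conjunction (entry_count == 1) completes
             * here, a disjunction advances to the next entry's cursor.
             */
            ++it->entry;
            it->pos = 0;
        }
        return (-1);
    }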
*/ struct __wt_cursor_join_iter { WT_SESSION_IMPL *session; diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index 730d69cbdc7..18c5d146a9e 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -384,10 +384,9 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd) copy = WT_ROW_KEY_COPY(rip); /* - * Get a key: we could just call __wt_row_leaf_key, but as a cursor - * is running through the tree, we may have additional information - * here (we may have the fully-built key that's immediately before - * the prefix-compressed key we want, so it's a faster construction). + * Get a key: we could just call __wt_row_leaf_key, but as a cursor is running through the tree, + * we may have additional information here (we may have the fully-built key that's immediately + * before the prefix-compressed key we want, so it's a faster construction). * * First, check for an immediately available key. */ @@ -399,14 +398,12 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd) goto slow; /* - * Unpack the cell and deal with overflow and prefix-compressed keys. - * Inline building simple prefix-compressed keys from a previous key, - * otherwise build from scratch. + * Unpack the cell and deal with overflow and prefix-compressed keys. Inline building simple + * prefix-compressed keys from a previous key, otherwise build from scratch. * - * Clear the key cell structure. It shouldn't be necessary (as far as I - * can tell, and we don't do it in lots of other places), but disabling - * shared builds (--disable-shared) results in the compiler complaining - * about uninitialized field use. + * Clear the key cell structure. It shouldn't be necessary (as far as I can tell, and we don't + * do it in lots of other places), but disabling shared builds (--disable-shared) results in the + * compiler complaining about uninitialized field use. */ kpack = &_kpack; memset(kpack, 0, sizeof(*kpack)); @@ -415,12 +412,11 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd) WT_ASSERT(session, cbt->row_key->size >= kpack->prefix); /* - * Grow the buffer as necessary as well as ensure data has been - * copied into local buffer space, then append the suffix to the - * prefix already in the buffer. + * Grow the buffer as necessary as well as ensure data has been copied into local buffer + * space, then append the suffix to the prefix already in the buffer. * - * Don't grow the buffer unnecessarily or copy data we don't - * need, truncate the item's data length to the prefix bytes. + * Don't grow the buffer unnecessarily or copy data we don't need, truncate the item's data + * length to the prefix bytes. */ cbt->row_key->size = kpack->prefix; WT_RET(__wt_buf_grow(session, cbt->row_key, cbt->row_key->size + kpack->size)); diff --git a/src/third_party/wiredtiger/src/include/hardware.h b/src/third_party/wiredtiger/src/include/hardware.h index 447d082393e..9947de8b26a 100644 --- a/src/third_party/wiredtiger/src/include/hardware.h +++ b/src/third_party/wiredtiger/src/include/hardware.h @@ -60,15 +60,14 @@ /* * Pad a structure so an array of structures get separate cache lines. * - * Note that we avoid compiler structure alignment because that requires - * allocating aligned blocks of memory, and alignment pollutes any other type - * that contains an aligned field. 
It is possible that a hot field positioned - * before this one will be on the same cache line, but not if it is also + * Note that we avoid compiler structure alignment because that requires allocating aligned blocks + * of memory, and alignment pollutes any other type that contains an aligned field. It is possible + * that a hot field positioned before this one will be on the same cache line, but not if it is also * padded. * - * This alignment has a small impact on portability as well, as we are using an - * anonymous union here which is supported under C11, earlier versions of - * the GNU standard, and MSVC versions as early as 2003. + * This alignment has a small impact on portability as well, as we are using an anonymous union here + * which is supported under C11, earlier versions of the GNU standard, and MSVC versions as early as + * 2003. */ #define WT_CACHE_LINE_PAD_BEGIN \ union { \ diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h index 0518d8dd0f9..f5f6bca7cc0 100644 --- a/src/third_party/wiredtiger/src/include/log.h +++ b/src/third_party/wiredtiger/src/include/log.h @@ -110,16 +110,17 @@ union __wt_lsn { * Possible values for the consolidation array slot states: * * WT_LOG_SLOT_CLOSE - slot is in use but closed to new joins. + * * WT_LOG_SLOT_FREE - slot is available for allocation. + * * WT_LOG_SLOT_WRITTEN - slot is written and should be processed by worker. * - * The slot state must be volatile: threads loop checking the state and can't - * cache the first value they see. + * The slot state must be volatile: threads loop checking the state and can't cache the first value + * they see. * - * The slot state is divided into two 32 bit sizes. One half is the - * amount joined and the other is the amount released. Since we use - * a few special states, reserve the top few bits for state. That makes - * the maximum size less than 32 bits for both joined and released. + * The slot state is divided into two 32 bit sizes. One half is the amount joined and the other is + * the amount released. Since we use a few special states, reserve the top few bits for state. That + * makes the maximum size less than 32 bits for both joined and released. */ /* * XXX The log slot bits are signed and should be rewritten as unsigned. For now, give the logging @@ -279,13 +280,11 @@ struct __wt_log { WT_CONDVAR *log_write_cond; /* - * Consolidation array information - * Our testing shows that the more consolidation we generate the - * better the performance we see which equates to an active slot - * slot count of one. + * Consolidation array information. Our testing shows that the more consolidation we generate, + * the better the performance we see, which equates to an active slot count of one. * - * Note: this can't be an array, we impose cache-line alignment and - * gcc doesn't support that for arrays. + * Note: this can't be an array, we impose cache-line alignment and gcc doesn't support that for + * arrays. */ #define WT_SLOT_POOL 128 WT_LOGSLOT *active_slot; /* Active slot */ @@ -309,12 +308,10 @@ struct __wt_log_record { uint32_t checksum; /* 04-07: Checksum of the record */ /* - * No automatic generation: flag values cannot change, they're written - * to disk. + * No automatic generation: flag values cannot change, they're written to disk. * - * Unused bits in the flags, as well as the 'unused' padding, - * are expected to be zeroed; we check that to help detect file - * corruption.
+ * Unused bits in the flags, as well as the 'unused' padding, are expected to be zeroed; we check + * that to help detect file corruption. */ #define WT_LOG_RECORD_COMPRESSED 0x01u /* Compressed except hdr */ #define WT_LOG_RECORD_ENCRYPTED 0x02u /* Encrypted except hdr */ diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index 046d724d1f7..c303edc9488 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -147,13 +147,13 @@ /* * Flag set, clear and test. * - * They come in 3 flavors: F_XXX (handles a field named "flags" in the structure - * referenced by its argument), LF_XXX (handles a local variable named "flags"), - * and FLD_XXX (handles any variable, anywhere). + * They come in 3 flavors: F_XXX (handles a field named "flags" in the structure referenced by its + * argument), LF_XXX (handles a local variable named "flags"), and FLD_XXX (handles any variable, + * anywhere). * - * Flags are unsigned 32-bit values -- we cast to keep the compiler quiet (the - * hex constant might be a negative integer), and to ensure the hex constant is - * the correct size before applying the bitwise not operator. + * Flags are unsigned 32-bit values -- we cast to keep the compiler quiet (the hex constant might be + * a negative integer), and to ensure the hex constant is the correct size before applying the + * bitwise not operator. */ #define FLD_CLR(field, mask) ((void)((field) &= ~(mask))) #define FLD_MASK(field, mask) ((field) & (mask)) @@ -173,8 +173,8 @@ /* * Insertion sort, for sorting small sets of values. * - * The "compare_lt" argument is a function or macro that returns true when - * its first argument is less than its second argument. + * The "compare_lt" argument is a function or macro that returns true when its first argument is + * less than its second argument. */ #define WT_INSERTION_SORT(arrayp, n, value_type, compare_lt) \ do { \ diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i index e937858ba5b..d739e78cf28 100644 --- a/src/third_party/wiredtiger/src/include/misc.i +++ b/src/third_party/wiredtiger/src/include/misc.i @@ -238,12 +238,12 @@ __wt_timing_stress(WT_SESSION_IMPL *session, u_int flag) } /* - * The hardware-accelerated checksum code that originally shipped on Windows - * did not correctly handle memory that wasn't 8B aligned and a multiple of 8B. - * It's likely that calculations were always 8B aligned, but there's some risk. + * The hardware-accelerated checksum code that originally shipped on Windows did not correctly + * handle memory that wasn't 8B aligned and a multiple of 8B. It's likely that calculations were + * always 8B aligned, but there's some risk. * - * What we do is always write the correct checksum, and if a checksum test - * fails, check it against the alternate version have before failing. + * What we do is always write the correct checksum, and if a checksum test fails, check it against + * the alternate version we have before failing.
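The fallback described above reduces to accepting either of two answers at verification time. A minimal sketch, assuming stand-in checksum functions (the real code uses CRC32-C; the "legacy" variant here merely models an alternate result, not the actual Windows bug):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Always-correct software path (illustrative polynomial-style hash). */
    static uint32_t
    checksum_sw(const uint8_t *p, size_t len)
    {
        uint32_t sum = 0;
        while (len-- > 0)
            sum = sum * 31 + *p++;
        return (sum);
    }

    /* Stand-in for whatever the old hardware path computed. */
    static uint32_t
    checksum_hw_legacy(const uint8_t *p, size_t len)
    {
        uint32_t sum = checksum_sw(p, len);
        return ((sum >> 24) | ((sum >> 8) & 0xff00) |
            ((sum << 8) & 0xff0000) | (sum << 24));
    }

    /* Accept either form before declaring a mismatch. */
    static bool
    checksum_match(const uint8_t *chunk, size_t len, uint32_t expected)
    {
        return (checksum_sw(chunk, len) == expected ||
            checksum_hw_legacy(chunk, len) == expected);
    }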
*/ #if defined(_M_AMD64) && !defined(HAVE_NO_CRC32_HARDWARE) diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h index 63283c92633..d65eea97b68 100644 --- a/src/third_party/wiredtiger/src/include/mutex.h +++ b/src/third_party/wiredtiger/src/include/mutex.h @@ -9,8 +9,8 @@ /* * Condition variables: * - * WiredTiger uses condition variables to signal between threads, and for - * locking operations that are expected to block. + * WiredTiger uses condition variables to signal between threads, and for locking operations that + * are expected to block. */ struct __wt_condvar { const char *name; /* Mutex name for debugging */ @@ -88,9 +88,8 @@ struct __wt_rwlock { /* Read/write lock */ /* * Spin locks: * - * WiredTiger uses spinlocks for fast mutual exclusion (where operations done - * while holding the spin lock are expected to complete in a small number of - * instructions). + * WiredTiger uses spinlocks for fast mutual exclusion (where operations done while holding the spin + * lock are expected to complete in a small number of instructions). */ #define SPINLOCK_GCC 0 #define SPINLOCK_MSVC 1 diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i index d9a93902fcd..45a0b3ab0f0 100644 --- a/src/third_party/wiredtiger/src/include/mutex.i +++ b/src/third_party/wiredtiger/src/include/mutex.i @@ -9,9 +9,8 @@ /* * Spin locks: * - * These used for cases where fast mutual exclusion is needed (where operations - * done while holding the spin lock are expected to complete in a small number - * of instructions. + * These are used for cases where fast mutual exclusion is needed (where operations done while + * holding the spin lock are expected to complete in a small number of instructions). */ /* diff --git a/src/third_party/wiredtiger/src/include/packing.i b/src/third_party/wiredtiger/src/include/packing.i index 1335334f142..a251322fcbb 100644 --- a/src/third_party/wiredtiger/src/include/packing.i +++ b/src/third_party/wiredtiger/src/include/packing.i @@ -9,9 +9,9 @@ /* * Throughout this code we have to be aware of default argument conversion. * - * Refer to Chapter 8 of "Expert C Programming" by Peter van der Linden for the - * gory details. The short version is that we have less cases to deal with - * because the compiler promotes shorter types to int or unsigned int. + * Refer to Chapter 8 of "Expert C Programming" by Peter van der Linden for the gory details. The + * short version is that we have fewer cases to deal with because the compiler promotes shorter + * types to int or unsigned int. */ typedef struct { union { diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h index 8403097e03a..1c02f8353c6 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.h +++ b/src/third_party/wiredtiger/src/include/reconcile.h @@ -53,43 +53,36 @@ struct __wt_reconcile { bool leave_dirty; /* - * Track if reconciliation has seen any overflow items. If a leaf page - * with no overflow items is written, the parent page's address cell is - * set to the leaf-no-overflow type. This means we can delete the leaf - * page without reading it because we don't have to discard any overflow + * Track if reconciliation has seen any overflow items. If a leaf page with no overflow items is + * written, the parent page's address cell is set to the leaf-no-overflow type.
This means we + can delete the leaf page without reading it because we don't have to discard any overflow * items it might reference. * - * The test test is per-page reconciliation, that is, once we see an - * overflow item on the page, all subsequent leaf pages written for the - * page will not be leaf-no-overflow type, regardless of whether or not - * they contain overflow items. In other words, leaf-no-overflow is not - * guaranteed to be set on every page that doesn't contain an overflow - * item, only that if it is set, the page contains no overflow items. - * XXX - * This was originally done because raw compression couldn't do better, - * now that raw compression has been removed, we should do better. + * The test is per-page reconciliation, that is, once we see an overflow item on the page, + * all subsequent leaf pages written for the page will not be leaf-no-overflow type, regardless + * of whether or not they contain overflow items. In other words, leaf-no-overflow is not + * guaranteed to be set on every page that doesn't contain an overflow item, only that if it is + * set, the page contains no overflow items. XXX This was originally done because raw + * compression couldn't do better, now that raw compression has been removed, we should do + * better. */ bool ovfl_items; /* - * Track if reconciliation of a row-store leaf page has seen empty (zero - * length) values. We don't write out anything for empty values, so if - * there are empty values on a page, we have to make two passes over the - * page when it's read to figure out how many keys it has, expensive in - * the common case of no empty values and (entries / 2) keys. Likewise, - * a page with only empty values is another common data set, and keys on - * that page will be equal to the number of entries. In both cases, set - * a flag in the page's on-disk header. + * Track if reconciliation of a row-store leaf page has seen empty (zero length) values. We + * don't write out anything for empty values, so if there are empty values on a page, we have to + * make two passes over the page when it's read to figure out how many keys it has, expensive in + * the common case of no empty values and (entries / 2) keys. Likewise, a page with only empty + * values is another common data set, and keys on that page will be equal to the number of + * entries. In both cases, set a flag in the page's on-disk header. * - * The test is per-page reconciliation as described above for the - * overflow-item test. + * The test is per-page reconciliation as described above for the overflow-item test. */ bool all_empty_value, any_empty_value; /* - * Reconciliation gets tricky if we have to split a page, which happens - * when the disk image we create exceeds the page type's maximum disk - * image size. + * Reconciliation gets tricky if we have to split a page, which happens when the disk image we + * create exceeds the page type's maximum disk image size. * * First, the target size of the page we're building. */ @@ -103,31 +96,26 @@ struct __wt_reconcile { uint32_t min_split_size; /* Minimum split page size */ /* - * We maintain two split chunks in the memory during reconciliation to - * be written out as pages. As we get to the end of the data, if the - * last one turns out to be smaller than the minimum split size, we go - * back into the penultimate chunk and split at this minimum split size - * boundary.
This moves some data from the penultimate chunk to the last - * chunk, hence increasing the size of the last page written without - * decreasing the penultimate page size beyond the minimum split size. - * For this reason, we maintain an expected split percentage boundary - * and a minimum split percentage boundary. + * We maintain two split chunks in memory during reconciliation to be written out as pages. + * As we get to the end of the data, if the last one turns out to be smaller than the minimum + * split size, we go back into the penultimate chunk and split at this minimum split size + * boundary. This moves some data from the penultimate chunk to the last chunk, hence increasing + * the size of the last page written without decreasing the penultimate page size beyond the + * minimum split size. For this reason, we maintain an expected split percentage boundary and a + * minimum split percentage boundary. * - * Chunks are referenced by current and previous pointers. In case of a - * split, previous references the first chunk and current switches to - * the second chunk. If reconciliation generates more split chunks, the - * the previous chunk is written to the disk and current and previous - * swap. + * Chunks are referenced by current and previous pointers. In case of a split, previous + * references the first chunk and current switches to the second chunk. If reconciliation + * generates more split chunks, the previous chunk is written to the disk and current and + * previous swap. */ struct __wt_rec_chunk { /* - * The recno and entries fields are the starting record number - * of the split chunk (for column-store splits), and the number - * of entries in the split chunk. + * The recno and entries fields are the starting record number of the split chunk (for + * column-store splits), and the number of entries in the split chunk. * - * The key for a row-store page; no column-store key is needed - * because the page's recno, stored in the recno field, is the - * column-store key. + * The key for a row-store page; no column-store key is needed because the page's recno, + * stored in the recno field, is the column-store key. */ uint32_t entries; uint64_t recno; diff --git a/src/third_party/wiredtiger/src/include/reconcile.i b/src/third_party/wiredtiger/src/include/reconcile.i index eabf9e58c4f..adad096da49 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.i +++ b/src/third_party/wiredtiger/src/include/reconcile.i @@ -20,19 +20,16 @@ static inline bool __wt_rec_need_split(WT_RECONCILE *r, size_t len) { /* - * In the case of a row-store leaf page, trigger a split if a threshold - * number of saved updates is reached. This allows pages to split for - * update/restore and lookaside eviction when there is no visible data - * causing the disk image to grow. + * In the case of a row-store leaf page, trigger a split if a threshold number of saved updates + * is reached. This allows pages to split for update/restore and lookaside eviction when there + * is no visible data causing the disk image to grow. * - * In the case of small pages or large keys, we might try to split when - * a page has no updates or entries, which isn't possible. To consider - * update/restore or lookaside information, require either page entries - * or updates that will be attached to the image.
The limit is one of - * either, but it doesn't make sense to create pages or images with few - * entries or updates, even where page sizes are small (especially as - * updates that will eventually become overflow items can throw off our - * calculations). Bound the combination at something reasonable. + * In the case of small pages or large keys, we might try to split when a page has no updates or + * entries, which isn't possible. To consider update/restore or lookaside information, require + * either page entries or updates that will be attached to the image. The limit is one of + * either, but it doesn't make sense to create pages or images with few entries or updates, even + * where page sizes are small (especially as updates that will eventually become overflow items + * can throw off our calculations). Bound the combination at something reasonable. */ if (r->page->type == WT_PAGE_ROW_LEAF && r->entries + r->supd_next > 10) len += r->supd_memsize; @@ -128,12 +125,11 @@ __wt_rec_image_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv) uint8_t *p, *t; /* - * If there's only one chunk of data to copy (because the cell and data - * are being copied from the original disk page), the cell length won't - * be set, the WT_ITEM data/length will reference the data to be copied. + * If there's only one chunk of data to copy (because the cell and data are being copied from + * the original disk page), the cell length won't be set, the WT_ITEM data/length will reference + * the data to be copied. * - * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do - * the copy in-line. + * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do the copy in-line. */ for (p = r->first_free, t = (uint8_t *)&kv->cell, len = kv->cell_len; len > 0; --len) *p++ = *t++; @@ -257,18 +253,15 @@ __wt_rec_dict_replace(WT_SESSION_IMPL *session, WT_RECONCILE *r, wt_timestamp_t uint64_t offset; /* - * We optionally create a dictionary of values and only write a unique - * value once per page, using a special "copy" cell for all subsequent - * copies of the value. We have to do the cell build and resolution at - * this low level because we need physical cell offsets for the page. + * We optionally create a dictionary of values and only write a unique value once per page, + * using a special "copy" cell for all subsequent copies of the value. We have to do the cell + * build and resolution at this low level because we need physical cell offsets for the page. * - * Sanity check: short-data cells can be smaller than dictionary-copy - * cells. If the data is already small, don't bother doing the work. - * This isn't just work avoidance: on-page cells can't grow as a result - * of writing a dictionary-copy cell, the reconciliation functions do a - * split-boundary test based on the size required by the value's cell; - * if we grow the cell after that test we'll potentially write off the - * end of the buffer's memory. + * Sanity check: short-data cells can be smaller than dictionary-copy cells. If the data is + * already small, don't bother doing the work. This isn't just work avoidance: on-page cells + * can't grow as a result of writing a dictionary-copy cell, the reconciliation functions do a + * split-boundary test based on the size required by the value's cell; if we grow the cell after + * that test we'll potentially write off the end of the buffer's memory. 
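A toy model of the value dictionary described above: the first occurrence of a value is written in full and its page offset remembered, later matches emit only a reference to that offset. All names here are illustrative, not the reconciliation API.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    struct dict_entry { const char *value; uint32_t offset; };

    /* Return the offset to reference, writing the value out on first use. */
    static uint32_t
    dict_lookup(struct dict_entry *dict, int n, const char *value,
        uint32_t (*emit_value)(const char *))
    {
        for (int i = 0; i < n; ++i)
            if (dict[i].value != NULL && strcmp(dict[i].value, value) == 0)
                return (dict[i].offset);        /* emit a copy cell instead */
        for (int i = 0; i < n; ++i)
            if (dict[i].value == NULL) {
                dict[i].value = value;
                dict[i].offset = emit_value(value); /* first use: write in full */
                return (dict[i].offset);
            }
        return (emit_value(value));             /* dictionary full, write anyway */
    }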
*/ if (val->buf.size <= WT_INTPACK32_MAXSIZE) return (0); @@ -277,11 +270,10 @@ __wt_rec_dict_replace(WT_SESSION_IMPL *session, WT_RECONCILE *r, wt_timestamp_t return (0); /* - * If the dictionary offset isn't set, we're creating a new entry in the - * dictionary, set its location. + * If the dictionary offset isn't set, we're creating a new entry in the dictionary, set its + * location. * - * If the dictionary offset is set, we have a matching value. Create a - * copy cell instead. + * If the dictionary offset is set, we have a matching value. Create a copy cell instead. */ if (dp->offset == 0) dp->offset = WT_PTRDIFF32(r->first_free, r->cur_ptr->image.mem); diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i index 4f8d6ac6611..66fb066153a 100644 --- a/src/third_party/wiredtiger/src/include/serial.i +++ b/src/third_party/wiredtiger/src/include/serial.i @@ -19,16 +19,14 @@ __insert_simple_func( WT_UNUSED(session); /* - * Update the skiplist elements referencing the new WT_INSERT item. - * If we fail connecting one of the upper levels in the skiplist, - * return success: the levels we updated are correct and sufficient. - * Even though we don't get the benefit of the memory we allocated, - * we can't roll back. + * Update the skiplist elements referencing the new WT_INSERT item. If we fail connecting one of + * the upper levels in the skiplist, return success: the levels we updated are correct and + * sufficient. Even though we don't get the benefit of the memory we allocated, we can't roll + * back. * - * All structure setup must be flushed before the structure is entered - * into the list. We need a write barrier here, our callers depend on - * it. Don't pass complex arguments to the macro, some implementations - * read the old value multiple times. + * All structure setup must be flushed before the structure is entered into the list. We need a + * write barrier here, our callers depend on it. Don't pass complex arguments to the macro, some + * implementations read the old value multiple times. */ for (i = 0; i < skipdepth; i++) { WT_INSERT *old_ins = *ins_stack[i]; @@ -55,16 +53,14 @@ __insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, WT_INSE /* * Update the skiplist elements referencing the new WT_INSERT item. * - * Confirm we are still in the expected position, and no item has been - * added where our insert belongs. If we fail connecting one of the - * upper levels in the skiplist, return success: the levels we updated - * are correct and sufficient. Even though we don't get the benefit of - * the memory we allocated, we can't roll back. + * Confirm we are still in the expected position, and no item has been added where our insert + * belongs. If we fail connecting one of the upper levels in the skiplist, return success: the + * levels we updated are correct and sufficient. Even though we don't get the benefit of the + * memory we allocated, we can't roll back. * - * All structure setup must be flushed before the structure is entered - * into the list. We need a write barrier here, our callers depend on - * it. Don't pass complex arguments to the macro, some implementations - * read the old value multiple times. + * All structure setup must be flushed before the structure is entered into the list. We need a + * write barrier here, our callers depend on it. Don't pass complex arguments to the macro, some + * implementations read the old value multiple times. 
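The publish pattern the skiplist comments depend on, reduced to a single-level list; WRITE_BARRIER is a stand-in for WiredTiger's WT_WRITE_BARRIER macro, here mapped to a GCC/Clang release fence.

    #define WRITE_BARRIER() __atomic_thread_fence(__ATOMIC_RELEASE)

    struct node { int value; struct node *next; };

    static void
    publish_insert(struct node **headp, struct node *n, int value)
    {
        n->value = value;   /* complete all structure setup first ... */
        n->next = *headp;
        WRITE_BARRIER();    /* ... flush it ... */
        *headp = n;         /* ... then make the node reachable */
    }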
*/ for (i = 0; i < skipdepth; i++) { WT_INSERT *old_ins = *ins_stack[i]; @@ -234,12 +230,11 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE **srch_upd *updp = NULL; /* - * All structure setup must be flushed before the structure is entered - * into the list. We need a write barrier here, our callers depend on - * it. + * All structure setup must be flushed before the structure is entered into the list. We need a + * write barrier here, our callers depend on it. * - * Swap the update into place. If that fails, a new update was added - * after our search, we raced. Check if our update is still permitted. + * Swap the update into place. If that fails, a new update was added after our search, we raced. + * Check if our update is still permitted. */ while (!__wt_atomic_cas_ptr(srch_upd, upd->next, upd)) { if ((ret = __wt_txn_update_check(session, upd->next = *srch_upd)) != 0) { diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index 01eae24cb44..f4b82b8f5e9 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -246,11 +246,11 @@ struct __wt_session_impl { /* * Hazard pointers. * - * Hazard information persists past session close because it's accessed - * by threads of control other than the thread owning the session. + * Hazard information persists past session close because it's accessed by threads of control other + * than the thread owning the session. * - * Use the non-NULL state of the hazard field to know if the session has - * previously been initialized. + * Use the non-NULL state of the hazard field to know if the session has previously been + * initialized. */ #define WT_SESSION_FIRST_USE(s) ((s)->hazard == NULL) diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 53d3f2126ae..14665c4df75 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -9,74 +9,67 @@ /* * Statistics counters: * - * We use an array of statistics structures; threads write different structures - * to avoid writing the same cache line and incurring cache coherency overheads, - * which can dramatically slow fast and otherwise read-mostly workloads. + * We use an array of statistics structures; threads write different structures to avoid writing the + * same cache line and incurring cache coherency overheads, which can dramatically slow fast and + * otherwise read-mostly workloads. * - * With an 8B statistics value and 64B cache-line alignment, 8 values share the - * same cache line. There are collisions when different threads choose the same - * statistics structure and update values that live on the cache line. There is - * likely some locality however: a thread updating the cursor search statistic - * is likely to update other cursor statistics with a chance of hitting already - * cached values. + * With an 8B statistics value and 64B cache-line alignment, 8 values share the same cache line. + * There are collisions when different threads choose the same statistics structure and update + * values that live on the cache line. There is likely some locality however: a thread updating the + * cursor search statistic is likely to update other cursor statistics with a chance of hitting + * already cached values. 
* - * The actual statistic value must be signed, because one thread might increment - * the value in its structure, and then another thread might decrement the same - * value in another structure (where the value was initially zero), so the value - * in the second thread's slot will go negative. + * The actual statistic value must be signed, because one thread might increment the value in its + * structure, and then another thread might decrement the same value in another structure (where the + * value was initially zero), so the value in the second thread's slot will go negative. * - * When reading a statistics value, the array values are summed and returned to - * the caller. The summation is performed without locking, so the value read - * may be inconsistent (and might be negative, if increments/decrements race - * with the reader). + * When reading a statistics value, the array values are summed and returned to the caller. The + * summation is performed without locking, so the value read may be inconsistent (and might be + * negative, if increments/decrements race with the reader). * - * Choosing how many structures isn't easy: obviously, a smaller number creates - * more conflicts while a larger number uses more memory. + * Choosing how many structures isn't easy: obviously, a smaller number creates more conflicts while + * a larger number uses more memory. * - * Ideally, if the application running on the system is CPU-intensive, and using - * all CPUs on the system, we want to use the same number of slots as there are - * CPUs (because their L1 caches are the units of coherency). However, in - * practice we cannot easily determine how many CPUs are actually available to - * the application. + * Ideally, if the application running on the system is CPU-intensive, and using all CPUs on the + * system, we want to use the same number of slots as there are CPUs (because their L1 caches are + * the units of coherency). However, in practice we cannot easily determine how many CPUs are + * actually available to the application. * - * Our next best option is to use the number of threads in the application as a - * heuristic for the number of CPUs (presumably, the application architect has - * figured out how many CPUs are available). However, inside WiredTiger we don't - * know when the application creates its threads. + * Our next best option is to use the number of threads in the application as a heuristic for the + * number of CPUs (presumably, the application architect has figured out how many CPUs are + * available). However, inside WiredTiger we don't know when the application creates its threads. * - * For now, we use a fixed number of slots. Ideally, we would approximate the - * largest number of cores we expect on any machine where WiredTiger might be - * run, however, we don't want to waste that much memory on smaller machines. - * As of 2015, machines with more than 24 CPUs are relatively rare. + * For now, we use a fixed number of slots. Ideally, we would approximate the largest number of + * cores we expect on any machine where WiredTiger might be run, however, we don't want to waste + * that much memory on smaller machines. As of 2015, machines with more than 24 CPUs are relatively + * rare. * - * Default hash table size; use a prime number of buckets rather than assuming - * a good hash (Reference Sedgewick, Algorithms in C, "Hash Functions"). 
+ * Default hash table size; use a prime number of buckets rather than assuming a good hash + * (Reference Sedgewick, Algorithms in C, "Hash Functions"). */ #define WT_COUNTER_SLOTS 23 /* * WT_STATS_SLOT_ID is the thread's slot ID for the array of structures. * - * Ideally, we want a slot per CPU, and we want each thread to index the slot - * corresponding to the CPU it runs on. Unfortunately, getting the ID of the - * current CPU is difficult: some operating systems provide a system call to - * acquire a CPU ID, but not all (regardless, making a system call to increment - * a statistics value is far too expensive). + * Ideally, we want a slot per CPU, and we want each thread to index the slot corresponding to the + * CPU it runs on. Unfortunately, getting the ID of the current CPU is difficult: some operating + * systems provide a system call to acquire a CPU ID, but not all (regardless, making a system call + * to increment a statistics value is far too expensive). * - * Our second-best option is to use the thread ID. Unfortunately, there is no - * portable way to obtain a unique thread ID that's a small-enough number to - * be used as an array index (portable thread IDs are usually a pointer or an - * opaque chunk, not a simple integer). + * Our second-best option is to use the thread ID. Unfortunately, there is no portable way to obtain + * a unique thread ID that's a small-enough number to be used as an array index (portable thread IDs + * are usually a pointer or an opaque chunk, not a simple integer). * - * Our solution is to use the session ID; there is normally a session per thread - * and the session ID is a small, monotonically increasing number. + * Our solution is to use the session ID; there is normally a session per thread and the session ID + * is a small, monotonically increasing number. */ #define WT_STATS_SLOT_ID(session) (((session)->id) % WT_COUNTER_SLOTS) /* - * Statistic structures are arrays of int64_t's. We have functions to read/write - * those structures regardless of the specific statistic structure we're working - * with, by translating statistics structure field names to structure offsets. + * Statistic structures are arrays of int64_t's. We have functions to read/write those structures + * regardless of the specific statistic structure we're working with, by translating statistics + * structure field names to structure offsets. * * Translate a statistic's value name to an offset in the array. */ @@ -109,20 +102,17 @@ __wt_stats_aggregate(void *stats_arg, int slot) aggr_v += stats[i][slot]; /* - * This can race. However, any implementation with a single value can - * race as well, different threads could set the same counter value - * simultaneously. While we are making races more likely, we are not - * fundamentally weakening the isolation semantics found in updating a - * single value. + * This can race. However, any implementation with a single value can race as well, different + * threads could set the same counter value simultaneously. While we are making races more + * likely, we are not fundamentally weakening the isolation semantics found in updating a single + * value. * - * Additionally, the aggregation can go negative (imagine a thread - * incrementing a value after aggregation has passed its slot and a - * second thread decrementing a value before aggregation has reached - * its slot). 
+ * Additionally, the aggregation can go negative (imagine a thread incrementing a value after + * aggregation has passed its slot and a second thread decrementing a value before aggregation + * has reached its slot). * - * For historic API compatibility, the external type is a uint64_t; - * limit our return to positive values, negative numbers would just - * look really, really large. + * For historic API compatibility, the external type is a uint64_t; limit our return to positive + * values, negative numbers would just look really, really large. */ if (aggr_v < 0) aggr_v = 0; @@ -223,12 +213,11 @@ __wt_stats_clear(void *stats_arg, int slot) #define WT_STAT_CONN_SET(session, fld, value) WT_STAT_SET(session, S2C(session)->stats, fld, value) /* - * Update data-source handle statistics if statistics gathering is enabled - * and the data-source handle is set. + * Update data-source handle statistics if statistics gathering is enabled and the data-source + * handle is set. * - * XXX - * We shouldn't have to check if the data-source handle is NULL, but it's - * necessary until everything is converted to using data-source handles. + * XXX We shouldn't have to check if the data-source handle is NULL, but it's necessary until + * everything is converted to using data-source handles. */ #define WT_STAT_DATA_DECRV(session, fld, value) \ do { \ diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index e67f680b076..7636cf42dd9 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -39,9 +39,8 @@ typedef enum { /* * Transaction ID comparison dealing with edge cases. * - * WT_TXN_ABORTED is the largest possible ID (never visible to a running - * transaction), WT_TXN_NONE is smaller than any possible ID (visible to all - * running transactions). + * WT_TXN_ABORTED is the largest possible ID (never visible to a running transaction), WT_TXN_NONE + * is smaller than any possible ID (visible to all running transactions). */ #define WT_TXNID_LE(t1, t2) ((t1) <= (t2)) @@ -158,15 +157,13 @@ struct __wt_txn_global { uint32_t read_timestampq_len; /* - * Track information about the running checkpoint. The transaction - * snapshot used when checkpointing are special. Checkpoints can run - * for a long time so we keep them out of regular visibility checks. - * Eviction and checkpoint operations know when they need to be aware - * of checkpoint transactions. + * Track information about the running checkpoint. The transaction snapshot used when + * checkpointing are special. Checkpoints can run for a long time so we keep them out of regular + * visibility checks. Eviction and checkpoint operations know when they need to be aware of + * checkpoint transactions. * - * We rely on the fact that (a) the only table a checkpoint updates is - * the metadata; and (b) once checkpoint has finished reading a table, - * it won't revisit it. + * We rely on the fact that (a) the only table a checkpoint updates is the metadata; and (b) + * once checkpoint has finished reading a table, it won't revisit it. */ volatile bool checkpoint_running; /* Checkpoint running */ volatile uint32_t checkpoint_id; /* Checkpoint's session ID */ @@ -277,8 +274,7 @@ struct __wt_txn { /* * Timestamp copied into updates created by this transaction. * - * In some use cases, this can be updated while the transaction is - * running. + * In some use cases, this can be updated while the transaction is running. 
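A condensed model of the slotted statistics described above: writers pick a slot from their session ID, readers sum all slots without locking and clamp negative totals for the unsigned external API. The array sizes and stat IDs are illustrative.

    #include <stdint.h>

    #define COUNTER_SLOTS 23
    #define STAT_MAX 8

    static int64_t stats[COUNTER_SLOTS][STAT_MAX];

    static void
    stat_incr(uint32_t session_id, int stat)
    {
        ++stats[session_id % COUNTER_SLOTS][stat]; /* WT_STATS_SLOT_ID analogue */
    }

    static uint64_t
    stat_read(int stat)
    {
        int64_t sum = 0;
        for (int i = 0; i < COUNTER_SLOTS; ++i)
            sum += stats[i][stat];          /* unlocked: may race, may go negative */
        return (sum < 0 ? 0 : (uint64_t)sum);
    }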
*/ wt_timestamp_t commit_timestamp; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 6d7ead93201..5359e296fa0 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -136,13 +136,11 @@ __txn_resolve_prepared_update(WT_SESSION_IMPL *session, WT_UPDATE *upd) txn = &session->txn; /* - * In case of a prepared transaction, the order of modification of the - * prepare timestamp to commit timestamp in the update chain will not - * affect the data visibility, a reader will encounter a prepared - * update resulting in prepare conflict. + * In case of a prepared transaction, the order of modification of the prepare timestamp to + * commit timestamp in the update chain will not affect the data visibility, a reader will + * encounter a prepared update resulting in prepare conflict. * - * As updating timestamp might not be an atomic operation, we will - * manage using state. + * As updating timestamp might not be an atomic operation, we will manage using state. */ upd->prepare_state = WT_PREPARE_LOCKED; WT_WRITE_BARRIER(); @@ -552,14 +550,12 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) WT_READ_BARRIER(); /* - * Checkpoint transactions often fall behind ordinary application - * threads. Take special effort to not keep changes pinned in cache - * if they are only required for the checkpoint and it has already - * seen them. + * Checkpoint transactions often fall behind ordinary application threads. Take special effort + * to not keep changes pinned in cache if they are only required for the checkpoint and it has + * already seen them. * - * If there is no active checkpoint or this handle is up to date with - * the active checkpoint then it's safe to ignore the checkpoint ID in - * the visibility check. + * If there is no active checkpoint or this handle is up to date with the active checkpoint then + * it's safe to ignore the checkpoint ID in the visibility check. */ checkpoint_pinned = txn_global->checkpoint_state.pinned_id; if (checkpoint_pinned == WT_TXN_NONE || WT_TXNID_LT(oldest_id, checkpoint_pinned)) @@ -586,14 +582,12 @@ __wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *pinned_tsp) *pinned_tsp = pinned_ts = txn_global->pinned_timestamp; /* - * Checkpoint transactions often fall behind ordinary application - * threads. Take special effort to not keep changes pinned in cache if - * they are only required for the checkpoint and it has already seen - * them. + * Checkpoint transactions often fall behind ordinary application threads. Take special effort + * to not keep changes pinned in cache if they are only required for the checkpoint and it has + * already seen them. * - * If there is no active checkpoint or this handle is up to date with - * the active checkpoint then it's safe to ignore the checkpoint ID in - * the visibility check. + * If there is no active checkpoint or this handle is up to date with the active checkpoint then + * it's safe to ignore the checkpoint ID in the visibility check. */ include_checkpoint_txn = btree == NULL || (!F_ISSET(btree, WT_BTREE_LOOKASIDE) && @@ -709,13 +703,11 @@ __txn_visible_id(WT_SESSION_IMPL *session, uint64_t id) return (true); /* - * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is - * not the result of a concurrent transaction, that is, if was - * committed before the snapshot was taken. 
+ * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is not the result of a + * concurrent transaction, that is, if it was committed before the snapshot was taken. * - * The order here is important: anything newer than the maximum ID we - * saw when taking the snapshot should be invisible, even if the - * snapshot is empty. + * The order here is important: anything newer than the maximum ID we saw when taking the + * snapshot should be invisible, even if the snapshot is empty. */ if (WT_TXNID_LE(txn->snap_max, id)) return (false); @@ -936,26 +928,21 @@ __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish) /* * Allocating transaction IDs involves several steps. * - * Firstly, publish that this transaction is allocating its ID, then - * publish the transaction ID as the current global ID. Note that this - * transaction ID might not be unique among threads and hence not valid - * at this moment. The flag will notify other transactions that are - * attempting to get their own snapshot for this transaction ID to - * retry. + * Firstly, publish that this transaction is allocating its ID, then publish the transaction ID + * as the current global ID. Note that this transaction ID might not be unique among threads and + * hence not valid at this moment. The flag will notify other transactions that are attempting + * to get their own snapshot for this transaction ID to retry. * - * Then we do an atomic increment to allocate a unique ID. This will - * give the valid ID to this transaction that we publish to the global - * transaction table. + * Then we do an atomic increment to allocate a unique ID. This will give the valid ID to this + * transaction that we publish to the global transaction table. * - * We want the global value to lead the allocated values, so that any - * allocated transaction ID eventually becomes globally visible. When - * there are no transactions running, the oldest_id will reach the - * global current ID, so we want post-increment semantics. Our atomic - * add primitive does pre-increment, so adjust the result here. + * We want the global value to lead the allocated values, so that any allocated transaction ID + * eventually becomes globally visible. When there are no transactions running, the oldest_id + * will reach the global current ID, so we want post-increment semantics. Our atomic add + * primitive does pre-increment, so adjust the result here. * - * We rely on atomic reads of the current ID to create snapshots, so - * for unlocked reads to be well defined, we must use an atomic - * increment here. + * We rely on atomic reads of the current ID to create snapshots, so for unlocked reads to be + * well defined, we must use an atomic increment here. */ if (publish) { WT_PUBLISH(txn_state->is_allocating, true); @@ -1086,8 +1073,8 @@ __wt_txn_read_last(WT_SESSION_IMPL *session) /* * Release the snap_min ID we put in the global table. * - * If the isolation has been temporarily forced, don't touch the - * snapshot here: it will be restored by WT_WITH_TXN_ISOLATION. + * If the isolation has been temporarily forced, don't touch the snapshot here: it will be + * restored by WT_WITH_TXN_ISOLATION.
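The ID-allocation ordering above, as a sketch with the memory barriers of WT_PUBLISH simplified to volatile stores; the atomic-add helper is assumed to return the post-add value, matching the "pre-increment, so adjust the result" note.

    #include <stdbool.h>
    #include <stdint.h>

    /* GCC/Clang builtin returning the new value, like an add-and-fetch. */
    #define ATOMIC_ADD64(p, v) __atomic_add_fetch(p, v, __ATOMIC_SEQ_CST)

    struct txn_state { volatile uint64_t id; volatile bool is_allocating; };

    static uint64_t
    txn_id_alloc(uint64_t *current, struct txn_state *st)
    {
        st->is_allocating = true;   /* snapshotting threads must retry */
        st->id = *current;          /* provisional, possibly not unique */
        /* Post-increment semantics keep the global value ahead of all IDs. */
        uint64_t id = ATOMIC_ADD64(current, 1) - 1;
        st->id = id;                /* now unique and valid */
        st->is_allocating = false;
        return (id);
    }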
*/ if ((!F_ISSET(txn, WT_TXN_RUNNING) || txn->isolation != WT_ISO_SNAPSHOT) && txn->forced_iso == 0) diff --git a/src/third_party/wiredtiger/src/include/verify_build.h b/src/third_party/wiredtiger/src/include/verify_build.h index a72289cc03f..d402d2d73c7 100644 --- a/src/third_party/wiredtiger/src/include/verify_build.h +++ b/src/third_party/wiredtiger/src/include/verify_build.h @@ -73,11 +73,10 @@ __wt_verify_build(void) WT_STATIC_ASSERT(sizeof(size_t) >= 8); /* - * We require a wt_off_t fit into an 8B chunk because 8B is the largest - * integral value we can encode into an address cookie. + * We require a wt_off_t fit into an 8B chunk because 8B is the largest integral value we can + * encode into an address cookie. * - * WiredTiger has never been tested on a system with 4B file offsets, - * disallow them for now. + * WiredTiger has never been tested on a system with 4B file offsets, disallow them for now. */ WT_STATIC_ASSERT(sizeof(wt_off_t) == 8); diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index d6f18f82bb9..aeda4608082 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -201,14 +201,12 @@ __log_fs_write( WT_DECL_RET; /* - * If we're writing into a new log file and we're running in - * compatibility mode to an older release, we have to wait for all - * writes to the previous log file to complete otherwise there could - * be a hole at the end of the previous log file that we cannot detect. + * If we're writing into a new log file and we're running in compatibility mode to an older + * release, we have to wait for all writes to the previous log file to complete otherwise there + * could be a hole at the end of the previous log file that we cannot detect. * - * NOTE: Check for a version less than the one writing the system - * record since we've had a log version change without any actual - * file format changes. + * NOTE: Check for a version less than the one writing the system record since we've had a log + * version change without any actual file format changes. */ if (S2C(session)->log->log_version < WT_LOG_VERSION_SYSTEM && slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) { @@ -784,9 +782,9 @@ __log_file_header(WT_SESSION_IMPL *session, WT_FH *fh, WT_LSN *end_lsn, bool pre /* * Now that the record is set up, initialize the record header. * - * Checksum a little-endian version of the header, and write everything - * in little-endian format. The checksum is (potentially) returned in a - * big-endian format, swap it into place in a separate step. + * Checksum a little-endian version of the header, and write everything in little-endian format. + * The checksum is (potentially) returned in a big-endian format, swap it into place in a + * separate step. */ logrec->len = log->allocsize; logrec->checksum = 0; @@ -1790,16 +1788,14 @@ __log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t log_size, wt_off_t break; } /* - * A presumed log record begins here where the buffer - * becomes non-zero. If we have enough of a log record - * present in the buffer, we either have a valid header - * or corruption. Verify the header of this record to - * determine whether it is just a hole or corruption. + * A presumed log record begins here where the buffer becomes non-zero. If we have + * enough of a log record present in the buffer, we either have a valid header or + * corruption. Verify the header of this record to determine whether it is just a hole + * or corruption. 
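The hole scan described above starts at the first non-zero byte; a minimal helper under that reading, with the hole-versus-corruption decision left to a header check on however many bytes remain.

    #include <stddef.h>
    #include <stdint.h>

    /* Return the first non-zero byte in [p, end), or NULL if all zero. */
    static const uint8_t *
    skip_zeroes(const uint8_t *p, const uint8_t *end)
    {
        for (; p < end; ++p)
            if (*p != 0)
                return (p);
        return (NULL);
    }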
* - * We don't bother making this check for backup copies, - * as records may have their beginning zeroed, hence - * the part after a hole may in fact be the middle of - * the record. + * We don't bother making this check for backup copies, as records may have their + * beginning zeroed, hence the part after a hole may in fact be the middle of the + * record. */ if (!F_ISSET(conn, WT_CONN_WAS_BACKUP)) { logrec = (WT_LOG_RECORD *)p; @@ -2348,13 +2344,12 @@ advance: next_lsn.l.offset += rdup_len; if (rd_lsn.l.offset != 0) { /* - * We need to manage the different buffers here. - * Buf is the buffer this function uses to read from - * the disk. The callback buffer may change based - * on whether encryption and compression are used. + * We need to manage the different buffers here. Buf is the buffer this function uses to + * read from the disk. The callback buffer may change based on whether encryption and + * compression are used. * - * We want to free any buffers from compression and - * encryption but keep the one we use for reading. + * We want to free any buffers from compression and encryption but keep the one we use + * for reading. */ cbbuf = buf; if (F_ISSET(logrec, WT_LOG_RECORD_ENCRYPTED)) { @@ -2589,12 +2584,11 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, ui myslot.slot = NULL; memset(&myslot, 0, sizeof(myslot)); /* - * Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a - * header at the beginning for us to fill in. + * Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a header at the beginning + * for us to fill in. * - * If using direct_io, the caller should pass us an aligned record. - * But we need to make sure it is big enough and zero-filled so - * that we can write the full amount. Do this whether or not + * If using direct_io, the caller should pass us an aligned record. But we need to make sure it + * is big enough and zero-filled so that we can write the full amount. Do this whether or not * direct_io is in use because it makes the reading code cleaner. */ WT_STAT_CONN_INCRV(session, log_bytes_payload, record->size); @@ -2602,8 +2596,8 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, ui WT_ERR(__wt_buf_grow(session, record, rdup_len)); WT_ASSERT(session, record->data == record->mem); /* - * If the caller's record only partially fills the necessary - * space, we need to zero-fill the remainder. + * If the caller's record only partially fills the necessary space, we need to zero-fill the + * remainder. * * The cast is safe, we've already checked to make sure it's in range. */ @@ -2611,28 +2605,23 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, ui if (fill_size != 0) { memset((uint8_t *)record->mem + record->size, 0, fill_size); /* - * Set the last byte of the log record to a non-zero value, - * that allows us, on the input side, to tell that a log - * record was completely written; there couldn't have been - * a partial write. That means that any checksum mismatch - * in those conditions is a log corruption. + * Set the last byte of the log record to a non-zero value, that allows us, on the input + * side, to tell that a log record was completely written; there couldn't have been a + * partial write. That means that any checksum mismatch in those conditions is a log + * corruption. * - * Without this changed byte, when we see a zeroed last byte, - * we must always treat a checksum error as a possible partial - * write. 
Since partial writes can happen as a result of an - * interrupted process (for example, a shutdown), we must - * treat a checksum error as a normal occurrence, and merely - * the place where the log must be truncated. So any real + * Without this changed byte, when we see a zeroed last byte, we must always treat a + * checksum error as a possible partial write. Since partial writes can happen as a result + * of an interrupted process (for example, a shutdown), we must treat a checksum error as a + * normal occurrence, and merely the place where the log must be truncated. So any real * corruption within log records is hard to detect as such. * - * However, we can only make this modification if there is - * more than one byte being filled, as the first zero byte - * past the actual record is needed to terminate the loop - * in txn_commit_apply. + * However, we can only make this modification if there is more than one byte being filled, + * as the first zero byte past the actual record is needed to terminate the loop in + * txn_commit_apply. * - * This is not a log format change, as we only are changing a - * byte in the padding portion of a record, and no logging code - * has ever checked that it is any particular value up to now. + * This is not a log format change, as we only are changing a byte in the padding portion of + * a record, and no logging code has ever checked that it is any particular value up to now. */ if (fill_size > 1) *((uint8_t *)record->mem + rdup_len - 1) = WT_DEBUG_BYTE; @@ -2681,8 +2670,7 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, ui __wt_log_slot_free(session, myslot.slot); } else if (force) { /* - * If we are going to wait for this slot to get written, - * signal the wrlsn thread. + * If we are going to wait for this slot to get written, signal the wrlsn thread. * * XXX I've seen times when conditions are NULL. */ diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index bc860952baf..42155e7df56 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -67,13 +67,12 @@ __wt_clsm_await_switch(WT_CURSOR_LSM *clsm) session = (WT_SESSION_IMPL *)clsm->iface.session; /* - * If there is no primary chunk, or a chunk has overflowed the hard - * limit, which either means a worker thread has fallen behind or there - * has just been a user-level checkpoint, wait until the tree changes. + * If there is no primary chunk, or a chunk has overflowed the hard limit, which either means a + * worker thread has fallen behind or there has just been a user-level checkpoint, wait until + * the tree changes. * - * We used to switch chunks in the application thread here, but that is - * problematic because there is a transaction in progress and it could - * roll back, leaving the metadata inconsistent. + * We used to switch chunks in the application thread here, but that is problematic because + * there is a transaction in progress and it could roll back, leaving the metadata inconsistent. */ for (waited = 0; lsm_tree->nchunks == 0 || clsm->dsk_gen == lsm_tree->dsk_gen; ++waited) { if (waited % WT_THOUSAND == 0) @@ -112,15 +111,13 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) } /* - * In LSM there are multiple btrees active at one time. The tree - * switch code needs to use btree API methods, and it wants to - * operate on the btree for the primary chunk. Set that up now. + * In LSM there are multiple btrees active at one time. 
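The chunk-switch wait in __wt_clsm_await_switch() above is a bounded spin; a simplified sketch, assuming illustrative field names and a hypothetical push_switch_work() helper (a real version would also need atomic reads of the shared fields):

    #include <stdint.h>
    #include <unistd.h>

    struct lsm_tree {
        uint32_t nchunks;
        uint64_t dsk_gen;
    };

    /* Hypothetical helper: queue a switch work unit for a worker thread. */
    void push_switch_work(struct lsm_tree *tree);

    /* Spin until a worker switches in a new primary chunk: the caller's
     * cached generation differing from the tree's means it happened.
     * Re-request the switch periodically in case the work unit was lost. */
    static void
    await_switch(struct lsm_tree *tree, uint64_t my_gen)
    {
        uint64_t waited;

        for (waited = 0; tree->nchunks == 0 || my_gen == tree->dsk_gen; ++waited) {
            if (waited % 1000 == 0)
                push_switch_work(tree);
            (void)usleep(10);
        }
    }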
The tree switch code needs to use btree + * API methods, and it wants to operate on the btree for the primary chunk. Set that up now. * - * If the primary chunk has grown too large, set a flag so the worker - * thread will switch when it gets a chance to avoid introducing high - * latency into application threads. Don't do this indefinitely: if a - * chunk grows twice as large as the configured size, block until it - * can be switched. + * If the primary chunk has grown too large, set a flag so the worker thread will switch when it + * gets a chance to avoid introducing high latency into application threads. Don't do this + * indefinitely: if a chunk grows twice as large as the configured size, block until it can be + * switched. */ hard_limit = lsm_tree->need_switch; @@ -200,19 +197,14 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) __wt_txn_cursor_op(session); /* - * Figure out how many updates are required for - * snapshot isolation. + * Figure out how many updates are required for snapshot isolation. * - * This is not a normal visibility check on the maximum - * transaction ID in each chunk: any transaction ID - * that overlaps with our snapshot is a potential - * conflict. + * This is not a normal visibility check on the maximum transaction ID in each chunk: + * any transaction ID that overlaps with our snapshot is a potential conflict. * - * Note that the pinned ID is correct here: it tracks - * concurrent transactions excluding special - * transactions such as checkpoint (which we can't - * conflict with because checkpoint only writes the - * metadata, which is not an LSM tree). + * Note that the pinned ID is correct here: it tracks concurrent transactions excluding + * special transactions such as checkpoint (which we can't conflict with because + * checkpoint only writes the metadata, which is not an LSM tree). */ clsm->nupdates = 1; if (txn->isolation == WT_ISO_SNAPSHOT && F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { @@ -557,9 +549,8 @@ retry: /* * Close any cursors we no longer need. * - * Drop the LSM tree lock while we do this: if the cache is - * full, we may block while closing a cursor. Save the - * generation number and retry if it has changed under us. + * Drop the LSM tree lock while we do this: if the cache is full, we may block while closing + * a cursor. Save the generation number and retry if it has changed under us. */ if (clsm->chunks != NULL && ngood < clsm->nchunks) { close_range_start = ngood; @@ -651,19 +642,16 @@ retry: btree = ((WT_CURSOR_BTREE *)primary)->btree; /* - * If the primary is not yet set as the primary, do that now. - * Note that eviction was configured off when the underlying - * object was created, which is what we want, leave it alone. + * If the primary is not yet set as the primary, do that now. Note that eviction was + * configured off when the underlying object was created, which is what we want, leave it + * alone. * - * We don't have to worry about races here: every thread that - * modifies the tree will have to come through here, at worse - * we set the flag repeatedly. We don't use a WT_BTREE handle - * flag, however, we could race doing the read-modify-write of - * the flags field. + * We don't have to worry about races here: every thread that modifies the tree will have to + * come through here, at worst we set the flag repeatedly. We don't use a WT_BTREE handle + * flag, however, because we could race doing the read-modify-write of the flags field. 
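The race being described is the difference between an idempotent whole-value store and a read-modify-write on a shared bitmask; in isolation (illustrative sketch):

    #include <stdbool.h>
    #include <stdint.h>

    struct btree {
        bool lsm_primary;   /* dedicated flag: safe to set repeatedly */
        uint32_t flags;     /* shared bitmask */
    };

    #define BTREE_SOME_FLAG 0x01u

    static void
    mark_primary(struct btree *b)
    {
        /* Safe: a whole-value store is idempotent, concurrent threads can
         * only all write "true". */
        b->lsm_primary = true;

        /* Racy: a read-modify-write on a shared bitmask (load, OR, store)
         * can lose another thread's concurrent flag update. */
        /* b->flags |= BTREE_SOME_FLAG; */
    }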
* - * If something caused the chunk to be closed and reopened - * since it was created, we can no longer use it as a primary - * chunk and we need to force a switch. We detect the tree was + * If something caused the chunk to be closed and reopened since it was created, we can no + * longer use it as a primary chunk and we need to force a switch. We detect the tree was * created when it was opened by checking the "original" flag. */ if (!btree->lsm_primary && btree->original) @@ -837,12 +825,11 @@ __clsm_position_chunk(WT_CURSOR_LSM *clsm, WT_CURSOR *c, bool forward, int *cmpp WT_RET(forward ? c->next(c) : c->prev(c)); /* - * With higher isolation levels, where we have stable reads, - * we're done: the cursor is now positioned as expected. + * With higher isolation levels, where we have stable reads, we're done: the cursor is now + * positioned as expected. * - * With read-uncommitted isolation, a new record could have - * appeared in between the search and stepping forward / back. - * In that case, keep going until we see a key in the expected + * With read-uncommitted isolation, a new record could have appeared in between the search + * and stepping forward / back. In that case, keep going until we see a key in the expected * range. */ if (session->txn.isolation != WT_ISO_READ_UNCOMMITTED) @@ -1270,14 +1257,13 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV); /* - * search_near is somewhat fiddly: we can't just use a nearby key from - * the in-memory chunk because there could be a closer key on disk. + * search_near is somewhat fiddly: we can't just use a nearby key from the in-memory chunk + * because there could be a closer key on disk. * - * As we search down the chunks, we stop as soon as we find an exact - * match. Otherwise, we maintain the smallest cursor larger than the - * search key and the largest cursor smaller than the search key. At - * the end, we prefer the larger cursor, but if no record is larger, - * position on the last record in the tree. + * As we search down the chunks, we stop as soon as we find an exact match. Otherwise, we + * maintain the smallest cursor larger than the search key and the largest cursor smaller than + * the search key. At the end, we prefer the larger cursor, but if no record is larger, position + * on the last record in the tree. */ WT_FORALL_CURSORS(clsm, c, i) { @@ -1435,13 +1421,12 @@ __clsm_put(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, const WT_ITEM *key, co } /* - * Update the record count. It is in a shared structure, but it's only - * approximate, so don't worry about protecting access. + * Update the record count. It is in a shared structure, but it's only approximate, so don't + * worry about protecting access. * - * Throttle if necessary. Every 100 update operations on each cursor, - * check if throttling is required. Don't rely only on the shared - * counter because it can race, and because for some workloads, there - * may not be enough records per chunk to get effective throttling. + * Throttle if necessary. Every 100 update operations on each cursor, check if throttling is + * required. Don't rely only on the shared counter because it can race, and because for some + * workloads, there may not be enough records per chunk to get effective throttling. 
*/ if ((++clsm->primary_chunk->count % 100 == 0 || ++clsm->update_count >= 100) && lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) { diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c index eaecb197b08..aa7a400d3c9 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c @@ -592,13 +592,11 @@ __wt_lsm_manager_push_entry( } /* - * Don't allow any work units unless a tree is active, this avoids - * races on shutdown between clearing out queues and pushing new - * work units. + * Don't allow any work units unless a tree is active, this avoids races on shutdown between + * clearing out queues and pushing new work units. * - * Increment the queue reference before checking the flag since - * on close, the flag is cleared and then the queue reference count - * is checked. + * Increment the queue reference before checking the flag since on close, the flag is cleared + * and then the queue reference count is checked. */ (void)__wt_atomic_add32(&lsm_tree->queue_ref, 1); if (!lsm_tree->active) { diff --git a/src/third_party/wiredtiger/src/lsm/lsm_merge.c b/src/third_party/wiredtiger/src/lsm/lsm_merge.c index 58b44f9cf2a..7110a75cec0 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_merge.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_merge.c @@ -199,18 +199,15 @@ __lsm_merge_span(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id, u_in return (WT_NOTFOUND); /* - * Look for the most efficient merge we can do. We define efficiency - * as collapsing as many levels as possible while processing the - * smallest number of rows. + * Look for the most efficient merge we can do. We define efficiency as collapsing as many levels as + * possible while processing the smallest number of rows. * - * We make a distinction between "major" and "minor" merges. The - * difference is whether the oldest chunk is involved: if it is, we can - * discard tombstones, because there can be no older record to marked - * deleted. + * We make a distinction between "major" and "minor" merges. The difference is whether the oldest + * chunk is involved: if it is, we can discard tombstones, because there can be no older record to + * be marked deleted. * - * Respect the configured limit on the number of chunks to merge: start - * with the most recent set of chunks and work backwards until going - * further becomes significantly less efficient. + * Respect the configured limit on the number of chunks to merge: start with the most recent set of + * chunks and work backwards until going further becomes significantly less efficient. */ retry_find: oldest_gen = youngest_gen = lsm_tree->chunk[end_chunk]->generation; @@ -539,12 +536,12 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) F_SET(chunk, WT_LSM_CHUNK_ONDISK); /* - * We have no current way of continuing if the metadata update fails, - * so we will panic in that case. Put some effort into cleaning up - * after ourselves here - so things have a chance of shutting down. + * We have no current way of continuing if the metadata update fails, so we will panic in that + * case. Put some effort into cleaning up after ourselves here - so things have a chance of + * shutting down. * - * Any errors that happened after the tree was locked are - * fatal - we can't guarantee the state of the tree. + * Any errors that happened after the tree was locked are fatal - we can't guarantee the state + * of the tree. 
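The increment-before-check handshake in __wt_lsm_manager_push_entry() above, sketched stand-alone with C11 atomics (the real code uses WiredTiger's own atomic wrappers):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_uint queue_ref;
    static atomic_bool tree_active; /* set while the tree is open */

    /* Pusher: take the reference first, then check the flag. Shutdown does
     * the mirror image (clear the flag, then check the reference count),
     * so no work unit can be queued after shutdown sees a drained queue. */
    static bool
    push_entry(void)
    {
        atomic_fetch_add(&queue_ref, 1);
        if (!atomic_load(&tree_active)) {
            atomic_fetch_sub(&queue_ref, 1);
            return (false);
        }
        /* ... enqueue the work unit; the consumer drops the reference ... */
        return (true);
    }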
*/ if ((ret = __wt_lsm_meta_write(session, lsm_tree, NULL)) != 0) WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge"); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_meta.c b/src/third_party/wiredtiger/src/lsm/lsm_meta.c index c6f7a82968c..c39e4756bed 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_meta.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_meta.c @@ -290,9 +290,8 @@ __lsm_meta_read_v1(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, const char * /* * Set up the config for each chunk. * - * Make the memory_page_max double the chunk size, so application - * threads don't immediately try to force evict the chunk when the - * worker thread clears the NO_EVICTION flag. + * Make the memory_page_max double the chunk size, so application threads don't immediately try + * to force evict the chunk when the worker thread clears the NO_EVICTION flag. */ file_cfg[1] = lsmconf; WT_ERR(__wt_scr_alloc(session, 0, &buf)); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c index 9d34eca0589..c30d77d6c05 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_stat.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c @@ -63,9 +63,8 @@ __curstat_lsm_init(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cs /* * Get the statistics for the chunk's underlying object. * - * XXX kludge: we may have an empty chunk where no checkpoint - * was written. If so, try to open the ordinary handle on that - * chunk instead. + * XXX kludge: we may have an empty chunk where no checkpoint was written. If so, try to + * open the ordinary handle on that chunk instead. */ WT_ERR(__wt_buf_fmt(session, uribuf, "statistics:%s", chunk->uri)); ret = __wt_curstat_open(session, uribuf->data, NULL, diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c index 9b6933a61e2..40cf169566c 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -289,13 +289,12 @@ __wt_lsm_tree_setup_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LS WT_RET(__wt_lsm_tree_chunk_name(session, lsm_tree, chunk->id, chunk->generation, &chunk->uri)); /* - * If the underlying file exists, drop the chunk first - there may be - * some content hanging over from an aborted merge or checkpoint. + * If the underlying file exists, drop the chunk first - there may be some content hanging over + * from an aborted merge or checkpoint. * - * Don't do this for the very first chunk: we are called during - * WT_SESSION::create, and doing a drop inside there does interesting - * things with handle locks and metadata tracking. It can never have - * been the result of an interrupted merge, anyway. + * Don't do this for the very first chunk: we are called during WT_SESSION::create, and doing a + * drop inside there does interesting things with handle locks and metadata tracking. It can + * never have been the result of an interrupted merge, anyway. */ if (chunk->id > 1) WT_RET(__lsm_tree_cleanup_old(session, chunk->uri)); @@ -660,8 +659,7 @@ __wt_lsm_tree_throttle(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool dec /* * Merge throttling, based on the number of on-disk, level 0 chunks. * - * Don't throttle if the tree has less than a single level's number - * of chunks. + * Don't throttle if the tree has less than a single level's number of chunks. 
*/ if (F_ISSET(lsm_tree, WT_LSM_TREE_MERGES)) { if (lsm_tree->nchunks < lsm_tree->merge_max) diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index 8f815277e6b..3be7acf7379 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -101,11 +101,11 @@ __wt_lsm_get_chunk_to_flush( } /* - * Don't be overly zealous about pushing old chunks from cache. - * Attempting too many drops can interfere with checkpoints. + * Don't be overly zealous about pushing old chunks from cache. Attempting too many drops can + * interfere with checkpoints. * - * If retrying a discard push an additional work unit so there are - * enough to trigger checkpoints. + * If retrying a discard, push an additional work unit so there are enough to trigger + * checkpoints. */ if (evict_chunk != NULL && flush_chunk != NULL) { chunk = (__wt_random(&session->rnd) & 1) ? evict_chunk : flush_chunk; @@ -619,10 +619,9 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri) const char *drop_cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_drop), "remove_files=false", NULL}; /* - * We need to grab the schema lock to drop the file, so first try to - * make sure there is minimal work to freeing space in the cache. Only - * bother trying to discard the checkpoint handle: the in-memory handle - * should have been closed already. + * We need to grab the schema lock to drop the file, so first try to make sure there is minimal + * work needed to free space in the cache. Only bother trying to discard the checkpoint handle: the + * in-memory handle should have been closed already. * * This will fail with EBUSY if the file is still in use. */ @@ -663,14 +662,12 @@ __lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) flush_metadata = false; /* - * Take a copy of the current state of the LSM tree and look for chunks - * to drop. We do it this way to avoid holding the LSM tree lock while - * doing I/O or waiting on the schema lock. + * Take a copy of the current state of the LSM tree and look for chunks to drop. We do it this + * way to avoid holding the LSM tree lock while doing I/O or waiting on the schema lock. * - * This is safe because only one thread will be in this function at a - * time. Merges may complete concurrently, and the old_chunks array - * may be extended, but we shuffle down the pointers each time we free - * one to keep the non-NULL slots at the beginning of the array. + * This is safe because only one thread will be in this function at a time. Merges may complete + * concurrently, and the old_chunks array may be extended, but we shuffle down the pointers each + * time we free one to keep the non-NULL slots at the beginning of the array. */ WT_CLEAR(cookie); WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, true)); diff --git a/src/third_party/wiredtiger/src/meta/meta_apply.c b/src/third_party/wiredtiger/src/meta/meta_apply.c index e8f5ac33605..752ad8c09db 100644 --- a/src/third_party/wiredtiger/src/meta/meta_apply.c +++ b/src/third_party/wiredtiger/src/meta/meta_apply.c @@ -41,15 +41,13 @@ __meta_btree_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor, continue; /* - * We need to pull the handle into the session handle cache - * and make sure it's referenced to stop other internal code - * dropping the handle (e.g in LSM when cleaning up obsolete - * chunks). Holding the schema lock isn't enough. 
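The copy-then-work pattern in __lsm_free_chunks() above generalizes to: snapshot the shared list under the lock, then do the slow I/O against the private copy. A sketch with hypothetical types and helpers:

    #include <stddef.h>

    struct chunk;
    struct chunk_list {
        struct chunk **chunk;
        size_t nchunks;
    };

    /* Hypothetical helpers standing in for the real locking and copying. */
    void tree_lock(void);
    void tree_unlock(void);
    struct chunk_list snapshot_chunks(void);
    void maybe_drop_chunk(struct chunk *);

    static void
    free_old_chunks(void)
    {
        struct chunk_list copy;
        size_t i;

        /* Copy the shared state under the lock... */
        tree_lock();
        copy = snapshot_chunks();
        tree_unlock();

        /* ...then do the slow, possibly blocking work without it. */
        for (i = 0; i < copy.nchunks; ++i)
            maybe_drop_chunk(copy.chunk[i]);
    }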
+ * We need to pull the handle into the session handle cache and make sure it's referenced to + * stop other internal code dropping the handle (e.g., in LSM when cleaning up obsolete + * chunks). Holding the schema lock isn't enough. * - * Handles that are busy are skipped without the whole - * operation failing. This deals among other cases with - * checkpoint encountering handles that are locked (e.g., for - * bulk loads or verify operations). + * Handles that are busy are skipped without the whole operation failing. This deals, among + * other cases, with checkpoint encountering handles that are locked (e.g., for bulk loads or + * verify operations). */ if ((t_ret = __wt_session_get_dhandle(session, uri, NULL, NULL, 0)) != 0) { WT_TRET_BUSY_OK(t_ret); diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c index 3b0749d9020..c8c4383f1a3 100644 --- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c +++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c @@ -40,9 +40,8 @@ __wt_meta_checkpoint( /* * Retrieve the named checkpoint or the last checkpoint. * - * If we don't find a named checkpoint, we're done, they're read-only. - * If we don't find a default checkpoint, it's creation, return "no - * data" and let our caller handle it. + * If we don't find a named checkpoint, we're done, they're read-only. If we don't find a + * default checkpoint, it's creation, return "no data" and let our caller handle it. */ if (checkpoint == NULL) { if ((ret = __ckpt_last(session, config, ckpt)) == WT_NOTFOUND) { @@ -358,13 +357,11 @@ __wt_meta_ckptlist_get( if (update) { /* - * This isn't clean, but there's necessary cooperation between - * the schema layer (that maintains the list of checkpoints), - * the btree layer (that knows when the root page is written, - * creating a new checkpoint), and the block manager (which - * actually creates the checkpoint). All of that cooperation is - * handled in the array of checkpoint structures referenced from - * the WT_BTREE structure. + * This isn't clean, but there's necessary cooperation between the schema layer (that + * maintains the list of checkpoints), the btree layer (that knows when the root page is + * written, creating a new checkpoint), and the block manager (which actually creates the + * checkpoint). All of that cooperation is handled in the array of checkpoint structures + * referenced from the WT_BTREE structure. * * Allocate a slot for a new value, plus a slot to mark the end. */ @@ -498,21 +495,17 @@ __ckptlist_review_write_gen(WT_SESSION_IMPL *session, WT_CKPT *ckpt) uint64_t v; /* - * Every page written in a given wiredtiger_open() session needs to be - * in a single "generation", it's how we know to ignore transactional - * information found on pages written in previous generations. We make - * this work by writing the maximum write generation we've ever seen - * as the write-generation of the metadata file's checkpoint. When - * wiredtiger_open() is called, we copy that write generation into the - * connection's name space as the base write generation value. Then, - * whenever we open a file, if the file's write generation is less than - * the base value, we update the file's write generation so all writes - * will appear after the base value, and we ignore transactions on pages - * where the write generation is less than the base value. 
+ * Every page written in a given wiredtiger_open() session needs to be in a single "generation", + * it's how we know to ignore transactional information found on pages written in previous + * generations. We make this work by writing the maximum write generation we've ever seen as the + * write-generation of the metadata file's checkpoint. When wiredtiger_open() is called, we copy + * that write generation into the connection's name space as the base write generation value. + * Then, whenever we open a file, if the file's write generation is less than the base value, we + * update the file's write generation so all writes will appear after the base value, and we + * ignore transactions on pages where the write generation is less than the base value. * - * At every checkpoint, if the file's checkpoint write generation is - * larger than the connection's maximum write generation, update the - * connection. + * At every checkpoint, if the file's checkpoint write generation is larger than the + * connection's maximum write generation, update the connection. */ do { WT_ORDERED_READ(v, S2C(session)->max_write_gen); diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c index f92a64e7e3d..69e4ca2e056 100644 --- a/src/third_party/wiredtiger/src/meta/meta_table.c +++ b/src/third_party/wiredtiger/src/meta/meta_table.c @@ -74,12 +74,11 @@ __wt_metadata_cursor_open(WT_SESSION_IMPL *session, const char *config, WT_CURSO btree = ((WT_CURSOR_BTREE *)(*cursorp))->btree; /* - * Special settings for metadata: skew eviction so metadata almost - * always stays in cache and make sure metadata is logged if possible. + * Special settings for metadata: skew eviction so metadata almost always stays in cache and make + * sure metadata is logged if possible. * - * Test before setting so updates can't race in subsequent opens (the - * first update is safe because it's single-threaded from - * wiredtiger_open). + * Test before setting so updates can't race in subsequent opens (the first update is safe because + * it's single-threaded from wiredtiger_open). */ #define WT_EVICT_META_SKEW 10000 if (btree->evict_priority == 0) diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c index e1289864c6c..80e409f380f 100644 --- a/src/third_party/wiredtiger/src/meta/meta_turtle.c +++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c @@ -197,19 +197,16 @@ __wt_turtle_init(WT_SESSION_IMPL *session) WT_RET(__wt_remove_if_exists(session, WT_METADATA_TURTLE_SET, false)); /* - * If we found a corrupted turtle file, then delete it and create a new. - * We could die after creating the turtle file and before creating the - * metadata file, or worse, the metadata file might be in some random - * state. Make sure that doesn't happen: if we don't find the turtle - * file, first create the metadata file, load any hot backup, and then - * create the turtle file. No matter what happens, if metadata file - * creation doesn't fully complete, we won't have a turtle file and we - * will repeat the process until we succeed. + * If we found a corrupted turtle file, then delete it and create a new one. We could die after + * creating the turtle file and before creating the metadata file, or worse, the metadata file + * might be in some random state. Make sure that doesn't happen: if we don't find the turtle + * file, first create the metadata file, load any hot backup, and then create the turtle file. 
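The ordering argument above is a commit-point construction: the turtle file is written last, so its existence proves everything before it completed. A schematic sketch with hypothetical helpers:

    #include <stdbool.h>

    /* Hypothetical helpers standing in for the real file operations. */
    bool turtle_exists(void);
    void create_metadata_file(void);
    void load_hot_backup(void);
    void create_turtle_file(void);

    static void
    turtle_init(void)
    {
        /* The turtle file is created last, so its existence implies the
         * metadata file completed; a crash anywhere earlier leaves no
         * turtle file and the whole sequence is simply re-run. */
        if (!turtle_exists()) {
            create_metadata_file();
            load_hot_backup();
            create_turtle_file(); /* commit point */
        }
    }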
+ * No matter what happens, if metadata file creation doesn't fully complete, we won't have a + * turtle file and we will repeat the process until we succeed. * - * Incremental backups can occur only if recovery is run and it becomes - * live. So, if there is a turtle file and an incremental backup file, - * that is an error. Otherwise, if there's already a turtle file, we're - * done. + * Incremental backups can occur only if recovery is run and it becomes live. So, if there is a + * turtle file and an incremental backup file, that is an error. Otherwise, if there's already a + * turtle file, we're done. */ WT_RET(__wt_fs_exist(session, WT_INCREMENTAL_BACKUP, &exist_incr)); WT_RET(__wt_fs_exist(session, WT_INCREMENTAL_SRC, &exist_isrc)); diff --git a/src/third_party/wiredtiger/src/os_common/os_alloc.c b/src/third_party/wiredtiger/src/os_common/os_alloc.c index 7933e01dedb..7ad9cf4ddc8 100644 --- a/src/third_party/wiredtiger/src/os_common/os_alloc.c +++ b/src/third_party/wiredtiger/src/os_common/os_alloc.c @@ -217,11 +217,10 @@ __wt_realloc_aligned( } #endif /* - * If there is no posix_memalign function, or no alignment configured, - * fall back to realloc. + * If there is no posix_memalign function, or no alignment configured, fall back to realloc. * - * Windows note: Visual C CRT memalign does not match POSIX behavior - * and would also double each allocation so it is bad for memory use. + * Windows note: Visual C CRT memalign does not match POSIX behavior and would also double each + * allocation so it is bad for memory use. */ return (__realloc_func(session, bytes_allocated_ret, bytes_to_allocate, false, retp)); } diff --git a/src/third_party/wiredtiger/src/os_common/os_fhandle.c b/src/third_party/wiredtiger/src/os_common/os_fhandle.c index bba63e2ae44..d3d12f76a11 100644 --- a/src/third_party/wiredtiger/src/os_common/os_fhandle.c +++ b/src/third_party/wiredtiger/src/os_common/os_fhandle.c @@ -235,8 +235,7 @@ __wt_open(WT_SESSION_IMPL *session, const char *name, WT_FS_OPEN_FILE_TYPE file_ fh->file_type = file_type; /* - * If this is a read-only connection, open all files read-only except - * the lock file. + * If this is a read-only connection, open all files read-only except the lock file. * * The only file created in read-only mode is the lock file. */ @@ -331,8 +330,7 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp) __wt_verbose(session, WT_VERB_FILEOPS, "%s: file-close", fh->name); /* - * If the reference count hasn't gone to 0, or if it's an in-memory - * object, we're done. + * If the reference count hasn't gone to 0, or if it's an in-memory object, we're done. * * Assert the reference count is correct, but don't let it wrap. */ diff --git a/src/third_party/wiredtiger/src/os_posix/os_dlopen.c b/src/third_party/wiredtiger/src/os_posix/os_dlopen.c index 7ba37803a44..afdee29f4ed 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_dlopen.c +++ b/src/third_party/wiredtiger/src/os_posix/os_dlopen.c @@ -65,9 +65,8 @@ __wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh) /* * FreeBSD dies inside __cxa_finalize when closing handles. * - * For now, just skip the dlclose: this may leak some resources until - * the process exits, but that is preferable to hard-to-debug crashes - * during exit. + * For now, just skip the dlclose: this may leak some resources until the process exits, but that is + * preferable to hard-to-debug crashes during exit. 
*/ #ifndef __FreeBSD__ if (dlclose(dlh->handle) != 0) { diff --git a/src/third_party/wiredtiger/src/os_posix/os_fallocate.c b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c index 06b65b2c921..341f4f85537 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_fallocate.c +++ b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c @@ -103,17 +103,14 @@ __wt_posix_file_extend(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_o /* * The first file extension call: figure out what this system has. * - * This function is configured as a locking call, so we know we're - * single-threaded through here. Set the nolock function first, then - * publish the NULL replacement to ensure the handle functions are - * always correct. + * This function is configured as a locking call, so we know we're single-threaded through here. + * Set the nolock function first, then publish the NULL replacement to ensure the handle + * functions are always correct. * - * We've seen Linux systems where posix_fallocate has corrupted existing - * file data (even though that is explicitly disallowed by POSIX). - * FreeBSD and Solaris support posix_fallocate, and so far we've seen - * no problems leaving it unlocked. Check for fallocate (and the system - * call version of fallocate) first to avoid locking on Linux if at all - * possible. + * We've seen Linux systems where posix_fallocate has corrupted existing file data (even though + * that is explicitly disallowed by POSIX). FreeBSD and Solaris support posix_fallocate, and so + * far we've seen no problems leaving it unlocked. Check for fallocate (and the system call + * version of fallocate) first to avoid locking on Linux if at all possible. */ if (__posix_std_fallocate(file_handle, wt_session, offset) == 0) { file_handle->fh_extend_nolock = __posix_std_fallocate; diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c index dfa075d1249..0e0794d6cfa 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_fs.c +++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c @@ -372,13 +372,11 @@ __posix_file_lock(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, bool lock pfh = (WT_FILE_HANDLE_POSIX *)file_handle; /* - * WiredTiger requires this function be able to acquire locks past - * the end of file. + * WiredTiger requires this function be able to acquire locks past the end of file. * - * Note we're using fcntl(2) locking: all fcntl locks associated with a - * file for a given process are removed when any file descriptor for the - * file is closed by the process, even if a lock was never requested for - * that file descriptor. + * Note we're using fcntl(2) locking: all fcntl locks associated with a file for a given process + * are removed when any file descriptor for the file is closed by the process, even if a lock + * was never requested for that file descriptor. */ fl.l_start = 0; fl.l_len = 1; diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c index bd68c7afdbd..a5a0854fa20 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c @@ -79,17 +79,14 @@ __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs locked = true; /* - * It's possible to race with threads waking us up. That's not a problem - * if there are multiple wakeups because the next wakeup will get us, or - * if we're only pausing for a short period. 
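The probe-once pattern in __wt_posix_file_extend() above caches the first extension mechanism that works in a function pointer; a sketch with illustrative names (only the posix_fallocate path stays behind the lock, per the comment):

    #include <sys/types.h>

    struct fh;
    typedef int (*extend_fn)(struct fh *, off_t);

    struct fh {
        extend_fn extend;        /* requires caller serialization */
        extend_fn extend_nolock; /* safe to call without locking */
    };

    /* Hypothetical probes for each extension mechanism. */
    int std_fallocate(struct fh *, off_t);
    int sys_fallocate(struct fh *, off_t);
    int posix_fallocate_ext(struct fh *, off_t);

    /* First extension call: probe in order of preference and cache the
     * winner, so only this call pays for the probing. The fallocate
     * variants have proven safe to run unlocked; posix_fallocate stays
     * locked because of the corruption seen on some Linux systems. */
    static int
    file_extend_first(struct fh *fh, off_t offset)
    {
        if (std_fallocate(fh, offset) == 0) {
            fh->extend_nolock = std_fallocate;
            return (0);
        }
        if (sys_fallocate(fh, offset) == 0) {
            fh->extend_nolock = sys_fallocate;
            return (0);
        }
        if (posix_fallocate_ext(fh, offset) == 0) {
            fh->extend = posix_fallocate_ext;
            return (0);
        }
        return (-1);
    }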
It's a problem if there's - * only a single wakeup, our waker is likely waiting for us to exit. - * After acquiring the mutex (so we're guaranteed to be awakened by any - * future wakeup call), optionally check if we're OK to keep running. - * This won't ensure our caller won't just loop and call us again, but - * at least it's not our fault. + * It's possible to race with threads waking us up. That's not a problem if there are multiple + * wakeups because the next wakeup will get us, or if we're only pausing for a short period. + * It's a problem if there's only a single wakeup, our waker is likely waiting for us to exit. + * After acquiring the mutex (so we're guaranteed to be awakened by any future wakeup call), + * optionally check if we're OK to keep running. This won't ensure our caller won't just loop + * and call us again, but at least it's not our fault. * - * Assert we're not waiting longer than a second if not checking the - * run status. + * Assert we're not waiting longer than a second if not checking the run status. */ WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION); if (run_func != NULL && !run_func(session)) @@ -97,17 +94,14 @@ __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs if (usecs > 0) { /* - * Get the current time as the basis for calculating when the - * wait should end. Prefer a monotonic clock source to avoid - * unexpectedly long sleeps when the system clock is adjusted. + * Get the current time as the basis for calculating when the wait should end. Prefer a monotonic + * clock source to avoid unexpectedly long sleeps when the system clock is adjusted. * - * Failing that, query the time directly and don't attempt to - * correct for the clock moving backwards, which would result - * in a sleep that is too long by however much the clock is - * updated. This isn't as good as a monotonic clock source but - * makes the window of vulnerability smaller (i.e., the - * calculated time is only incorrect if the system clock - * changes in between us querying it and waiting). + * Failing that, query the time directly and don't attempt to correct for the clock moving + * backwards, which would result in a sleep that is too long by however much the clock is updated. + * This isn't as good as a monotonic clock source but makes the window of vulnerability smaller + * (i.e., the calculated time is only incorrect if the system clock changes in between us querying + * it and waiting). */ #ifdef HAVE_PTHREAD_COND_MONOTONIC WT_SYSCALL_RETRY(clock_gettime(CLOCK_MONOTONIC, &ts), ret); diff --git a/src/third_party/wiredtiger/src/os_win/os_fs.c b/src/third_party/wiredtiger/src/os_win/os_fs.c index 2e67a0c8a61..c5015788613 100644 --- a/src/third_party/wiredtiger/src/os_win/os_fs.c +++ b/src/third_party/wiredtiger/src/os_win/os_fs.c @@ -184,9 +184,9 @@ __win_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session) /* * Close the primary and secondary handles. * - * We don't open Windows system handles when opening directories for - * flushing, as it's not necessary (or possible) to flush a directory - * on Windows. Confirm the file handle is open before closing it. + * We don't open Windows system handles when opening directories for flushing, as it's not + * necessary (or possible) to flush a directory on Windows. Confirm the file handle is open + * before closing it. 
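The clock discussion above comes down to how the absolute wait deadline is computed; a sketch using CLOCK_MONOTONIC (this assumes the condition variable was initialized with a monotonic clock attribute, as the HAVE_PTHREAD_COND_MONOTONIC path implies):

    #include <stdint.h>
    #include <time.h>

    /* Compute an absolute deadline "usecs" microseconds from now on the
     * monotonic clock, so adjustments to the wall clock can neither
     * lengthen nor shorten the wait. */
    static void
    wait_deadline(struct timespec *ts, uint64_t usecs)
    {
        uint64_t nsec;

        (void)clock_gettime(CLOCK_MONOTONIC, ts);
        nsec = (uint64_t)ts->tv_nsec + usecs * 1000;
        ts->tv_sec += (time_t)(nsec / 1000000000);
        ts->tv_nsec = (long)(nsec % 1000000000);
    }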
*/ if (win_fh->filehandle != INVALID_HANDLE_VALUE && CloseHandle(win_fh->filehandle) == 0) { windows_error = __wt_getlasterror(); @@ -486,12 +486,10 @@ __win_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char desired_access |= GENERIC_WRITE; /* - * Security: - * The application may spawn a new process, and we don't want another - * process to have access to our file handles. + * Security: The application may spawn a new process, and we don't want another process to have + * access to our file handles. * - * TODO: Set tighter file permissions but set bInheritHandle to false - * to prevent inheritance + * TODO: Set tighter file permissions but set bInheritHandle to false to prevent inheritance */ f = FILE_ATTRIBUTE_NORMAL; diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c index 75b0fe75478..af67fd6a264 100644 --- a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c +++ b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c @@ -59,17 +59,14 @@ __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs locked = true; /* - * It's possible to race with threads waking us up. That's not a problem - * if there are multiple wakeups because the next wakeup will get us, or - * if we're only pausing for a short period. It's a problem if there's - * only a single wakeup, our waker is likely waiting for us to exit. - * After acquiring the mutex (so we're guaranteed to be awakened by any - * future wakeup call), optionally check if we're OK to keep running. - * This won't ensure our caller won't just loop and call us again, but - * at least it's not our fault. + * It's possible to race with threads waking us up. That's not a problem if there are multiple + * wakeups because the next wakeup will get us, or if we're only pausing for a short period. + * It's a problem if there's only a single wakeup, our waker is likely waiting for us to exit. + * After acquiring the mutex (so we're guaranteed to be awakened by any future wakeup call), + * optionally check if we're OK to keep running. This won't ensure our caller won't just loop + * and call us again, but at least it's not our fault. * - * Assert we're not waiting longer than a second if not checking the - * run status. + * Assert we're not waiting longer than a second if not checking the run status. */ WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION); diff --git a/src/third_party/wiredtiger/src/os_win/os_setvbuf.c b/src/third_party/wiredtiger/src/os_win/os_setvbuf.c index 8b26c379e0a..9b027f60100 100644 --- a/src/third_party/wiredtiger/src/os_win/os_setvbuf.c +++ b/src/third_party/wiredtiger/src/os_win/os_setvbuf.c @@ -16,13 +16,12 @@ void __wt_stream_set_line_buffer(FILE *fp) { /* - * This function exists because MSVC doesn't support buffer sizes of 0 - * to the setvbuf call. To avoid re-introducing the bug, we have helper - * functions and disallow calling setvbuf directly in WiredTiger code. + * This function exists because MSVC doesn't support buffer sizes of 0 to the setvbuf call. To + * avoid re-introducing the bug, we have helper functions and disallow calling setvbuf directly + * in WiredTiger code. * - * Additionally, MSVC doesn't support line buffering, the result is the - * same as full-buffering. We assume our caller wants immediate output, - * set no-buffering instead. + * Additionally, MSVC doesn't support line buffering, the result is the same as full-buffering. 
+ * We assume our caller wants immediate output, so set no-buffering instead. */ __wt_stream_set_no_buffer(fp); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_child.c b/src/third_party/wiredtiger/src/reconcile/rec_child.c index b1d696e2ac6..390f183f651 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_child.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_child.c @@ -20,20 +20,18 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_C page_del = ref->page_del; /* - * Internal pages with child leaf pages in the WT_REF_DELETED state are - * a special case during reconciliation. First, if the deletion was a - * result of a session truncate call, the deletion may not be visible to - * us. In that case, we proceed as with any change not visible during - * reconciliation by ignoring the change for the purposes of writing the - * internal page. + * Internal pages with child leaf pages in the WT_REF_DELETED state are a special case during + * reconciliation. First, if the deletion was a result of a session truncate call, the deletion + * may not be visible to us. In that case, we proceed as with any change not visible during + * reconciliation by ignoring the change for the purposes of writing the internal page. * - * In this case, there must be an associated page-deleted structure, and - * it holds the transaction ID we care about. + * In this case, there must be an associated page-deleted structure, and it holds the + * transaction ID we care about. * * In some cases, there had better not be any updates we can't see. * - * A visible update to be in READY state (i.e. not in LOCKED or - * PREPARED state), for truly visible to others. + * A visible update must be in the READY state (i.e., not in the LOCKED or PREPARED state) to be + * truly visible to others. */ if (F_ISSET(r, WT_REC_VISIBILITY_ERR) && page_del != NULL && __wt_page_del_active(session, ref, false)) @@ -42,26 +40,22 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_C /* * Deal with any underlying disk blocks. * - * First, check to see if there is an address associated with this leaf: - * if there isn't, we're done, the underlying page is already gone. If - * the page still exists, check for any transactions in the system that - * might want to see the page's state before it's deleted. + * First, check to see if there is an address associated with this leaf: if there isn't, we're + * done, the underlying page is already gone. If the page still exists, check for any + * transactions in the system that might want to see the page's state before it's deleted. * - * If any such transactions exist, we cannot discard the underlying leaf - * page to the block manager because the transaction may eventually read - * it. However, this write might be part of a checkpoint, and should we - * recover to that checkpoint, we'll need to delete the leaf page, else - * we'd leak it. The solution is to write a proxy cell on the internal - * page ensuring the leaf page is eventually discarded. + * If any such transactions exist, we cannot discard the underlying leaf page to the block + * manager because the transaction may eventually read it. However, this write might be part of + * a checkpoint, and should we recover to that checkpoint, we'll need to delete the leaf page, + * else we'd leak it. The solution is to write a proxy cell on the internal page ensuring the + * leaf page is eventually discarded. 
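The two outcomes just described, discard the leaf now versus write a proxy cell, reduce to a short decision; an illustrative sketch, not the actual __rec_child_deleted() code:

    #include <stdbool.h>
    #include <stddef.h>

    enum child_state { CHILD_ORIGINAL, CHILD_IGNORE, CHILD_PROXY };

    struct ref {
        void *addr; /* on-disk address cookie, NULL if already gone */
    };

    /* Hypothetical helpers for visibility checking and block freeing. */
    bool page_del_visible_to_someone(struct ref *);
    void free_leaf_blocks(struct ref *);

    static void
    child_deleted(struct ref *ref, enum child_state *statep)
    {
        /* No possible reader: discard the leaf's blocks permanently and
         * write no cell at all; clearing the address is irrevocable. */
        if (ref->addr != NULL && !page_del_visible_to_someone(ref)) {
            free_leaf_blocks(ref);
            ref->addr = NULL;
            *statep = CHILD_IGNORE;
            return;
        }
        /* Someone may still read it: keep the address and write a proxy
         * cell so a recovered checkpoint can discard the leaf later. */
        *statep = CHILD_PROXY;
    }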
* - * If no such transactions exist, we can discard the leaf page to the - * block manager and no cell needs to be written at all. We do this - * outside of the underlying tracking routines because this action is - * permanent and irrevocable. (Clearing the address means we've lost - * track of the disk address in a permanent way. This is safe because - * there's no path to reading the leaf page again: if there's ever a - * read into this part of the name space again, the cache read function - * instantiates an entirely new page.) + * If no such transactions exist, we can discard the leaf page to the block manager and no cell + * needs to be written at all. We do this outside of the underlying tracking routines because + * this action is permanent and irrevocable. (Clearing the address means we've lost track of the + * disk address in a permanent way. This is safe because there's no path to reading the leaf + * page again: if there's ever a read into this part of the name space again, the cache read + * function instantiates an entirely new page.) */ if (ref->addr != NULL && !__wt_page_del_active(session, ref, true)) { /* @@ -98,13 +92,12 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_C r->leave_dirty = true; /* - * If the original page cannot be freed, we need to keep a slot on the - * page to reference it from the parent page. + * If the original page cannot be freed, we need to keep a slot on the page to reference it from + * the parent page. * - * If the delete is not visible in this checkpoint, write the original - * address normally. Otherwise, we have to write a proxy record. - * If the delete state is not ready, then delete is not visible as it - * is in prepared state. + * If the delete is not visible in this checkpoint, write the original address normally. + * Otherwise, we have to write a proxy record. If the delete state is not ready, then delete is + * not visible as it is in prepared state. */ if (!__wt_page_del_active(session, ref, false)) *statep = WT_CHILD_PROXY; @@ -130,16 +123,14 @@ __wt_rec_child_modify( *statep = WT_CHILD_ORIGINAL; /* - * This function is called when walking an internal page to decide how - * to handle child pages referenced by the internal page. + * This function is called when walking an internal page to decide how to handle child pages + * referenced by the internal page. * - * Internal pages are reconciled for two reasons: first, when evicting - * an internal page, second by the checkpoint code when writing internal - * pages. During eviction, all pages should be in the WT_REF_DISK or - * WT_REF_DELETED state. During checkpoint, eviction that might affect - * review of an internal page is prohibited, however, as the subtree is - * not reserved for our exclusive use, there are other page states that - * must be considered. + * Internal pages are reconciled for two reasons: first, when evicting an internal page, second + * by the checkpoint code when writing internal pages. During eviction, all pages should be in + * the WT_REF_DISK or WT_REF_DELETED state. During checkpoint, eviction that might affect review + * of an internal page is prohibited, however, as the subtree is not reserved for our exclusive + * use, there are other page states that must be considered. */ for (;; __wt_yield()) { switch (r->tested_ref_state = ref->state) { @@ -151,11 +142,9 @@ __wt_rec_child_modify( /* * The child is in a deleted state. 
* - * It's possible the state could change underneath us as - * the page is read in, and we can race between checking - * for a deleted state and looking at the transaction ID - * to see if the delete is visible to us. Lock down the - * structure. + * It's possible the state could change underneath us as the page is read in, and we can + * race between checking for a deleted state and looking at the transaction ID to see if + * the delete is visible to us. Lock down the structure. */ if (!WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED)) break; @@ -167,9 +156,8 @@ __wt_rec_child_modify( /* * Locked. * - * We should never be here during eviction, active child - * pages in an evicted page's subtree fails the eviction - * attempt. + * We should never be here during eviction, active child pages in an evicted page's + * subtree fail the eviction attempt. */ WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); if (F_ISSET(r, WT_REC_EVICT)) @@ -191,9 +179,8 @@ __wt_rec_child_modify( /* * On disk or in cache with lookaside updates. * - * We should never be here during eviction: active - * child pages in an evicted page's subtree fails the - * eviction attempt. + * We should never be here during eviction: active child pages in an evicted page's + * subtree fail the eviction attempt. */ if (F_ISSET(r, WT_REC_EVICT) && __wt_page_las_active(session, ref)) { WT_ASSERT(session, false); @@ -214,25 +201,23 @@ __wt_rec_child_modify( /* * In memory. * - * We should never be here during eviction, active child - * pages in an evicted page's subtree fails the eviction - * attempt. + * We should never be here during eviction, active child pages in an evicted page's + * subtree fail the eviction attempt. */ WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); if (F_ISSET(r, WT_REC_EVICT)) return (__wt_set_return(session, EBUSY)); /* - * If called during checkpoint, acquire a hazard pointer - * so the child isn't evicted, it's an in-memory case. + * If called during checkpoint, acquire a hazard pointer so the child isn't evicted, + * it's an in-memory case. * - * This call cannot return split/restart, we have a lock - * on the parent which prevents a child page split. + * This call cannot return split/restart, we have a lock on the parent which prevents a + * child page split. * - * Set WT_READ_NO_WAIT because we're only interested in - * the WT_REF's final state. Pages in transition might - * change WT_REF state during our read, and then return - * WT_NOTFOUND to us. In that case, loop and look again. + * Set WT_READ_NO_WAIT because we're only interested in the WT_REF's final state. Pages + * in transition might change WT_REF state during our read, and then return WT_NOTFOUND + * to us. In that case, loop and look again. */ ret = __wt_page_in( session, ref, WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT); @@ -248,9 +233,8 @@ __wt_rec_child_modify( /* * Being read, not modified by definition. * - * We should never be here during eviction, active child - * pages in an evicted page's subtree fails the eviction - * attempt. + * We should never be here during eviction, active child pages in an evicted page's + * subtree fail the eviction attempt. */ WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); if (F_ISSET(r, WT_REC_EVICT)) @@ -261,14 +245,12 @@ __wt_rec_child_modify( /* * The page was split out from under us. * - * We should never be here during eviction, active child - * pages in an evicted page's subtree fails the eviction - * attempt. 
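Locking down the structure, as described above, is one compare-and-swap on the ref's state; a stand-alone C11 sketch (WT_REF_CAS_STATE wraps WiredTiger's own atomics):

    #include <stdatomic.h>
    #include <stdbool.h>

    enum ref_state { REF_DISK, REF_DELETED, REF_LOCKED, REF_MEM };

    /* Move the ref from DELETED to LOCKED atomically: on success the
     * deletion's transaction ID can be examined without the state changing
     * underneath us; on failure the state already changed and the caller
     * loops to re-dispatch on the new state. */
    static bool
    ref_lock_deleted(_Atomic enum ref_state *state)
    {
        enum ref_state expected = REF_DELETED;

        return (atomic_compare_exchange_strong(state, &expected, REF_LOCKED));
    }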
+ * We should never be here during eviction, active child pages in an evicted page's + * subtree fail the eviction attempt. * - * We should never be here during checkpoint, dirty page - * eviction is shutout during checkpoint, all splits in - * process will have completed before we walk any pages - * for checkpoint. + * We should never be here during checkpoint, dirty page eviction is shut out during + * checkpoint, all splits in process will have completed before we walk any pages for + * checkpoint. */ WT_ASSERT(session, WT_REF_SPLIT != WT_REF_SPLIT); return (__wt_set_return(session, EBUSY)); @@ -281,25 +263,20 @@ in_memory: /* - * In-memory states: the child is potentially modified if the page's - * modify structure has been instantiated. If the modify structure - * exists and the page has actually been modified, set that state. - * If that's not the case, we would normally use the original cell's - * disk address as our reference, however there are two special cases, - * both flagged by a missing block address. + * In-memory states: the child is potentially modified if the page's modify structure has been + * instantiated. If the modify structure exists and the page has actually been modified, set + * that state. If that's not the case, we would normally use the original cell's disk address as + * our reference, however there are two special cases, both flagged by a missing block address. * - * First, if forced to instantiate a deleted child page and it's never - * modified, we end up here with a page that has a modify structure, no - * modifications, and no disk address. Ignore those pages, they're not - * modified and there is no reason to write the cell. + * First, if forced to instantiate a deleted child page and it's never modified, we end up here + * with a page that has a modify structure, no modifications, and no disk address. Ignore those + * pages, they're not modified and there is no reason to write the cell. * - * Second, insert splits are permitted during checkpoint. When doing the - * final checkpoint pass, we first walk the internal page's page-index - * and write out any dirty pages we find, then we write out the internal - * page in post-order traversal. If we found the split page in the first - * step, it will have an address; if we didn't find the split page in - * the first step, it won't have an address and we ignore it, it's not - * part of the checkpoint. + * Second, insert splits are permitted during checkpoint. When doing the final checkpoint pass, + * we first walk the internal page's page-index and write out any dirty pages we find, then we + * write out the internal page in post-order traversal. If we found the split page in the first + * step, it will have an address; if we didn't find the split page in the first step, it won't + * have an address and we ignore it, it's not part of the checkpoint. */ mod = ref->page->modify; if (mod != NULL && mod->rec_result != 0) diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c index d9a974cc68a..c4241f840ae 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_col.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c @@ -26,14 +26,12 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) if (cbulk->entry == cbulk->nrecs) { if (cbulk->entry != 0) { /* - * If everything didn't fit, update the counters and - * split. + * If everything didn't fit, update the counters and split. * * Boundary: split or write the page. 
* - * No need to have a minimum split size boundary, all - * pages are filled 100% except the last, allowing it to - * grow in the future. + * No need to have a minimum split size boundary, all pages are filled 100% except the + * last, allowing it to grow in the future. */ __wt_rec_incr( session, r, cbulk->entry, __bitstr_size((size_t)cbulk->entry * btree->bitcnt)); @@ -404,14 +402,12 @@ __wt_rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) } /* - * If everything didn't fit, update the counters and - * split. + * If everything didn't fit, update the counters and split. * * Boundary: split or write the page. * - * No need to have a minimum split size boundary, all - * pages are filled 100% except the last, allowing it to - * grow in the future. + * No need to have a minimum split size boundary, all pages are filled 100% except the + * last, allowing it to grow in the future. */ __wt_rec_incr(session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt)); WT_RET(__wt_rec_split(session, r, 0)); @@ -708,21 +704,16 @@ __wt_rec_col_var( goto record_loop; /* - * Overflow items are tricky: we don't know until we're - * finished processing the set of values if we need the - * overflow value or not. If we don't use the overflow - * item at all, we have to discard it from the backing - * file, otherwise we'll leak blocks on the checkpoint. - * That's safe because if the backing overflow value is - * still needed by any running transaction, we'll cache - * a copy in the update list. + * Overflow items are tricky: we don't know until we're finished processing the set of + * values if we need the overflow value or not. If we don't use the overflow item at all, we + * have to discard it from the backing file, otherwise we'll leak blocks on the checkpoint. + * That's safe because if the backing overflow value is still needed by any running + * transaction, we'll cache a copy in the update list. * - * Regardless, we avoid copying in overflow records: if - * there's a WT_INSERT entry that modifies a reference - * counted overflow record, we may have to write copies - * of the overflow record, and in that case we'll do the - * comparisons, but we don't read overflow items just to - * see if they match records on either side. + * Regardless, we avoid copying in overflow records: if there's a WT_INSERT entry that + * modifies a reference counted overflow record, we may have to write copies of the overflow + * record, and in that case we'll do the comparisons, but we don't read overflow items just + * to see if they match records on either side. */ if (vpack->ovfl) { ovfl_state = OVFL_UNUSED; @@ -799,15 +790,12 @@ __wt_rec_col_var( } } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) { /* - * If doing an update save and restore, and the - * underlying value is a removed overflow value, - * we end up here. + * If doing an update save and restore, and the underlying value is a removed + * overflow value, we end up here. * - * If necessary, when the overflow value was - * originally removed, reconciliation appended - * a globally visible copy of the value to the - * key's update list, meaning the on-page item - * isn't accessed after page re-instantiation. + * If necessary, when the overflow value was originally removed, reconciliation + * appended a globally visible copy of the value to the key's update list, meaning + * the on-page item isn't accessed after page re-instantiation. * * Assert the case. */ @@ -844,8 +832,7 @@ __wt_rec_col_var( /* * An as-yet-unused overflow item. 
* - * We're going to copy the on-page cell, - * write out any record we're tracking. + * We're going to copy the on-page cell, write out any record we're tracking. */ if (rle != 0) { WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, durable_ts, diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c index 733f450070e..27de9d69e67 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_row.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c @@ -343,14 +343,12 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* For each entry in the in-memory page... */ WT_INTL_FOREACH_BEGIN (session, page, ref) { /* - * There are different paths if the key is an overflow item vs. - * a straight-forward on-page value. If an overflow item, we - * would have instantiated it, and we can use that fact to set - * things up. + * There are different paths if the key is an overflow item vs. a straight-forward on-page + * value. If an overflow item, we would have instantiated it, and we can use that fact to + * set things up. * - * Note the cell reference and unpacked key cell are available - * only in the case of an instantiated, off-page key, we don't - * bother setting them if that's not possible. + * Note the cell reference and unpacked key cell are available only in the case of an + * instantiated, off-page key, we don't bother setting them if that's not possible. */ if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS)) { cell = NULL; @@ -372,11 +370,10 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* * Ignored child. * - * Overflow keys referencing pages we're not writing are - * no longer useful, schedule them for discard. Don't - * worry about instantiation, internal page keys are - * always instantiated. Don't worry about reuse, - * reusing this key in this reconciliation is unlikely. + * Overflow keys referencing pages we're not writing are no longer useful, schedule them + * for discard. Don't worry about instantiation, internal page keys are always + * instantiated. Don't worry about reuse, reusing this key in this reconciliation is + * unlikely. */ if (key_onpage_ovfl) WT_ERR(__wt_ovfl_discard_add(session, page, kpack->cell)); @@ -758,13 +755,11 @@ __wt_rec_row_leaf( dictionary = false; if (upd == NULL) { /* - * When the page was read into memory, there may not - * have been a value item. + * When the page was read into memory, there may not have been a value item. * - * If there was a value item, check if it's a dictionary - * cell (a copy of another item on the page). If it's a - * copy, we have to create a new value item as the old - * item might have been discarded from the page. + * If there was a value item, check if it's a dictionary cell (a copy of another item on + * the page). If it's a copy, we have to create a new value item as the old item might + * have been discarded from the page. */ if (vpack->raw == WT_CELL_VALUE_COPY) { /* If the item is Huffman encoded, decode it. */ @@ -782,36 +777,28 @@ __wt_rec_row_leaf( dictionary = true; } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) { /* - * If doing an update save and restore, and the - * underlying value is a removed overflow value, - * we end up here. + * If doing an update save and restore, and the underlying value is a removed + * overflow value, we end up here. 
* - * If necessary, when the overflow value was - * originally removed, reconciliation appended - * a globally visible copy of the value to the - * key's update list, meaning the on-page item - * isn't accessed after page re-instantiation. + * If necessary, when the overflow value was originally removed, reconciliation + * appended a globally visible copy of the value to the key's update list, meaning + * the on-page item isn't accessed after page re-instantiation. * * Assert the case. */ WT_ASSERT(session, F_ISSET(r, WT_REC_UPDATE_RESTORE)); /* - * If the key is also a removed overflow item, - * don't write anything at all. + * If the key is also a removed overflow item, don't write anything at all. * - * We don't have to write anything because the - * code re-instantiating the page gets the key - * to match the saved list of updates from the - * original page. By not putting the key on - * the page, we'll move the key/value set from - * a row-store leaf page slot to an insert list, - * but that shouldn't matter. + * We don't have to write anything because the code re-instantiating the page gets + * the key to match the saved list of updates from the original page. By not putting + * the key on the page, we'll move the key/value set from a row-store leaf page slot + * to an insert list, but that shouldn't matter. * - * The reason we bother with the test is because - * overflows are expensive to write. It's hard - * to imagine a real workload where this test is - * worth the effort, but it's a simple test. + * The reason we bother with the test is because overflows are expensive to write. + * It's hard to imagine a real workload where this test is worth the effort, but + * it's a simple test. */ if (kpack != NULL && kpack->raw == WT_CELL_KEY_OVFL_RM) goto leaf_insert; @@ -855,14 +842,11 @@ __wt_rec_row_leaf( break; case WT_UPDATE_TOMBSTONE: /* - * If this key/value pair was deleted, we're - * done. + * If this key/value pair was deleted, we're done. * - * Overflow keys referencing discarded values - * are no longer useful, discard the backing - * blocks. Don't worry about reuse, reusing - * keys from a row-store page reconciliation - * seems unlikely enough to ignore. + * Overflow keys referencing discarded values are no longer useful, discard the + * backing blocks. Don't worry about reuse, reusing keys from a row-store page + * reconciliation seems unlikely enough to ignore. */ if (kpack != NULL && kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM) { /* @@ -892,8 +876,7 @@ __wt_rec_row_leaf( /* * Build key cell. * - * If the key is an overflow key that hasn't been removed, use - * the original backing blocks. + * If the key is an overflow key that hasn't been removed, use the original backing blocks. */ key_onpage_ovfl = kpack != NULL && kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM; if (key_onpage_ovfl) { @@ -930,14 +913,11 @@ __wt_rec_row_leaf( WT_ASSERT(session, tmpkey->size != 0); /* - * Grow the buffer as necessary, ensuring data - * data has been copied into local buffer space, - * then append the suffix to the prefix already - * in the buffer. + * Grow the buffer as necessary, ensuring data has been copied into local + * buffer space, then append the suffix to the prefix already in the buffer. * - * Don't grow the buffer unnecessarily or copy - * data we don't need, truncate the item's data - * length to the prefix bytes. + * Don't grow the buffer unnecessarily or copy data we don't need, truncate the + * item's data length to the prefix bytes.
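The prefix-compression step just described (the diff resumes with the matching WiredTiger code immediately below) can be shown standalone: keep only the prefix bytes already in the buffer, grow once, then append the new suffix. WT_ITEM and __wt_buf_grow are replaced here by simplified stand-ins:

#include <stdlib.h>
#include <string.h>

struct buf {
    char *mem;
    size_t size;  /* bytes in use */
    size_t alloc; /* bytes allocated */
};

static int
buf_grow(struct buf *b, size_t need)
{
    char *p;

    if (need <= b->alloc)
        return (0);
    if ((p = realloc(b->mem, need)) == NULL)
        return (-1);
    b->mem = p;
    b->alloc = need;
    return (0);
}

static int
key_apply_suffix(struct buf *key, size_t prefix, const char *suffix, size_t suffix_len)
{
    /* Truncate to the shared prefix, grow exactly once, append the suffix. */
    key->size = prefix;
    if (buf_grow(key, prefix + suffix_len) != 0)
        return (-1);
    memcpy(key->mem + prefix, suffix, suffix_len);
    key->size = prefix + suffix_len;
    return (0);
}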
*/ tmpkey->size = kpack->prefix; WT_ERR(__wt_buf_grow(session, tmpkey, tmpkey->size + kpack->size)); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_track.c b/src/third_party/wiredtiger/src/reconcile/rec_track.c index 0ecd3f6998b..ae7fd9b6d79 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_track.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_track.c @@ -301,11 +301,9 @@ __ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) head = page->modify->ovfl_track->ovfl_reuse; /* - * Discard any overflow records that aren't in-use, freeing underlying - * blocks. + * Discard any overflow records that aren't in-use, freeing underlying blocks. * - * First, walk the overflow reuse lists (except for the lowest one), - * fixing up skiplist links. + * First, walk the overflow reuse lists (except for the lowest one), fixing up skiplist links. */ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) for (e = &head[i]; (reuse = *e) != NULL;) { @@ -317,15 +315,13 @@ __ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) } /* - * Second, discard any overflow record without an in-use flag, clear - * the flags for the next run. + * Second, discard any overflow record without an in-use flag, clear the flags for the next run. * - * As part of the pass through the lowest level, figure out how much - * space we added/subtracted from the page, and update its footprint. - * We don't get it exactly correct because we don't know the depth of - * the skiplist here, but it's close enough, and figuring out the - * memory footprint change in the reconciliation wrapup code means - * fewer atomic updates and less code overall. + * As part of the pass through the lowest level, figure out how much space we added/subtracted + * from the page, and update its footprint. We don't get it exactly correct because we don't + * know the depth of the skiplist here, but it's close enough, and figuring out the memory + * footprint change in the reconciliation wrapup code means fewer atomic updates and less code + * overall. */ decr = 0; for (e = &head[0]; (reuse = *e) != NULL;) { @@ -368,11 +364,9 @@ __ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) head = page->modify->ovfl_track->ovfl_reuse; /* - * Discard any overflow records that were just added, freeing underlying - * blocks. + * Discard any overflow records that were just added, freeing underlying blocks. * - * First, walk the overflow reuse lists (except for the lowest one), - * fixing up skiplist links. + * First, walk the overflow reuse lists (except for the lowest one), fixing up skiplist links. */ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) for (e = &head[i]; (reuse = *e) != NULL;) { @@ -464,14 +458,12 @@ __wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr skipdepth = __wt_skip_choose_depth(session); /* - * Allocate the WT_OVFL_REUSE structure, next pointers for the skip - * list, room for the address and value, then copy everything into - * place. + * Allocate the WT_OVFL_REUSE structure, next pointers for the skip list, room for the address + * and value, then copy everything into place. * - * To minimize the WT_OVFL_REUSE structure size, the address offset - * and size are single bytes: that's safe because the address follows - * the structure (which can't be more than about 100B), and address - * cookies are limited to 255B. 
+ * To minimize the WT_OVFL_REUSE structure size, the address offset and size are single bytes: + * that's safe because the address follows the structure (which can't be more than about 100B), + * and address cookies are limited to 255B. */ size = sizeof(WT_OVFL_REUSE) + skipdepth * sizeof(WT_OVFL_REUSE *) + addr_size + value_size; WT_RET(__wt_calloc(session, 1, size, &reuse)); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index 06dcf73fbb5..2150bf63559 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -68,13 +68,11 @@ __rec_append_orig_value( } /* - * We need the original on-page value for some reader: get a copy and - * append it to the end of the update list with a transaction ID that - * guarantees its visibility. + * We need the original on-page value for some reader: get a copy and append it to the end of + * the update list with a transaction ID that guarantees its visibility. * - * If we don't have a value cell, it's an insert/append list key/value - * pair which simply doesn't exist for some reader; place a deleted - * record at the end of the update list. + * If we don't have a value cell, it's an insert/append list key/value pair which simply doesn't + * exist for some reader; place a deleted record at the end of the update list. */ append = NULL; /* -Wconditional-uninitialized */ size = 0; /* -Wconditional-uninitialized */ @@ -87,12 +85,11 @@ __rec_append_orig_value( } /* - * If we're saving the original value for a birthmark, transfer over - * the transaction ID and clear out the birthmark update. + * If we're saving the original value for a birthmark, transfer over the transaction ID and + * clear out the birthmark update. * - * Else, set the entry's transaction information to the lowest possible - * value. Cleared memory matches the lowest possible transaction ID and - * timestamp, do nothing. + * Else, set the entry's transaction information to the lowest possible value. Cleared memory + * matches the lowest possible transaction ID and timestamp, do nothing. */ if (upd->type == WT_UPDATE_BIRTHMARK) { append->txnid = upd->txnid; @@ -236,9 +233,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v /* * Track the oldest update not on the page. * - * This is used to decide whether reads can use the - * page image, hence using the start rather than the - * durable timestamp. + * This is used to decide whether reads can use the page image, hence using the start + * rather than the durable timestamp. */ if (upd_select->upd == NULL && upd->start_ts < r->min_skipped_ts) r->min_skipped_ts = upd->start_ts; diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 26b1849693a..5746e20273b 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -315,13 +315,12 @@ __rec_write_check_complete( WT_RET(tret); /* - * Check if this reconciliation attempt is making progress. If there's - * any sign of progress, don't fall back to the lookaside table. + * Check if this reconciliation attempt is making progress. If there's any sign of progress, + * don't fall back to the lookaside table. * - * Check if the current reconciliation split, in which case we'll - * likely get to write at least one of the blocks. 
If we've created a - * page image for a page that previously didn't have one, or we had a - * page image and it is now empty, that's also progress. + * Check if the current reconciliation split, in which case we'll likely get to write at least + * one of the blocks. If we've created a page image for a page that previously didn't have one, + * or we had a page image and it is now empty, that's also progress. */ if (r->multi_next > 1) return (0); @@ -337,11 +336,11 @@ __rec_write_check_complete( return (0); /* - * Check if the current reconciliation applied some updates, in which - * case evict/restore should gain us some space. + * Check if the current reconciliation applied some updates, in which case evict/restore should + * gain us some space. * - * Check if lookaside eviction is possible. If any of the updates we - * saw were uncommitted, the lookaside table cannot be used. + * Check if lookaside eviction is possible. If any of the updates we saw were uncommitted, the + * lookaside table cannot be used. */ if (r->update_uncommitted || r->update_used) return (0); @@ -372,12 +371,10 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r) /* * The page remains dirty. * - * Any checkpoint call cleared the tree's modified flag before - * writing pages, so we must explicitly reset it. We insert a - * barrier after the change for clarity (the requirement is the - * flag be set before a subsequent checkpoint reads it, and - * as the current checkpoint is waiting on this reconciliation - * to complete, there's no risk of that happening). + * Any checkpoint call cleared the tree's modified flag before writing pages, so we must + * explicitly reset it. We insert a barrier after the change for clarity (the requirement is + * the flag be set before a subsequent checkpoint reads it, and as the current checkpoint is + * waiting on this reconciliation to complete, there's no risk of that happening). */ btree->modified = true; WT_FULL_BARRIER(); @@ -421,17 +418,15 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * We set the page state to mark it as having been dirtied for - * the first time prior to reconciliation. A failed atomic cas - * indicates that an update has taken place during + * We set the page state to mark it as having been dirtied for the first time prior to + * reconciliation. A failed atomic cas indicates that an update has taken place during * reconciliation. * - * The page only might be clean; if the page state is unchanged - * since reconciliation started, it's clean. + * The page only might be clean; if the page state is unchanged since reconciliation + * started, it's clean. * - * If the page state changed, the page has been written since - * reconciliation started and remains dirty (that can't happen - * when evicting, the page is exclusively locked). + * If the page state changed, the page has been written since reconciliation started and + * remains dirty (that can't happen when evicting, the page is exclusively locked). */ if (__wt_atomic_cas32(&mod->page_state, WT_PAGE_DIRTY_FIRST, WT_PAGE_CLEAN)) __wt_cache_dirty_decr(session, page); @@ -477,11 +472,11 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) session, WT_VERB_SPLIT, "root page split -> %" PRIu32 " pages", mod->mod_multi_entries); /* - * Create a new root page, initialize the array of child references, - * mark it dirty, then write it. + * Create a new root page, initialize the array of child references, mark it dirty, then write + * it. 
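The page_state handshake in __rec_write_page_status above reduces to a compare-and-swap: reconciliation publishes a marker before it starts, concurrent updaters overwrite it, and the page may be marked clean only if the marker survived. A sketch using C11 atomics in place of WiredTiger's wrappers (names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

enum { PAGE_CLEAN, PAGE_DIRTY_FIRST, PAGE_DIRTY };

static _Atomic uint32_t page_state;

/* Reconciliation start: every currently installed update will be written. */
static void
rec_begin(void)
{
    atomic_store(&page_state, PAGE_DIRTY_FIRST);
    atomic_thread_fence(memory_order_seq_cst); /* cf. WT_FULL_BARRIER */
}

/* An updating thread re-dirties the page. */
static void
page_modify(void)
{
    atomic_store(&page_state, PAGE_DIRTY);
}

/* Reconciliation end: clean only if nobody wrote in the meantime. */
static bool
rec_end_page_clean(void)
{
    uint32_t expect = PAGE_DIRTY_FIRST;

    return (atomic_compare_exchange_strong(&page_state, &expect, PAGE_CLEAN));
}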
* - * Don't count the eviction of this page as progress, checkpoint can - * repeatedly create and discard these pages. + * Don't count the eviction of this page as progress, checkpoint can repeatedly create and + * discard these pages. */ WT_RET(__wt_page_alloc(session, page->type, mod->mod_multi_entries, false, &next)); F_SET_ATOMIC(next, WT_PAGE_EVICT_NO_PROGRESS); @@ -576,12 +571,11 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO r->orig_txn_checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT); /* - * Update the page state to indicate that all currently installed - * updates will be included in this reconciliation if it would mark the - * page clean. + * Update the page state to indicate that all currently installed updates will be included in + * this reconciliation if it would mark the page clean. * - * Add a write barrier to make it more likely that a thread adding an - * update will see this state change. + * Add a write barrier to make it more likely that a thread adding an update will see this state + * change. */ page->modify->page_state = WT_PAGE_DIRTY_FIRST; WT_FULL_BARRIER(); @@ -596,17 +590,14 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO WT_ORDERED_READ(r->last_running, txn_global->last_running); /* - * Decide whether to skew on-page values towards newer or older - * versions. This is a heuristic attempting to minimize the number of - * pages that need to be rewritten by future checkpoints. + * Decide whether to skew on-page values towards newer or older versions. This is a heuristic + * attempting to minimize the number of pages that need to be rewritten by future checkpoints. * - * We usually prefer to skew to newer versions, the logic being that by - * the time the next checkpoint runs, it is likely that all the updates - * we choose will be stable. However, if checkpointing with a - * timestamp (indicated by a stable_timestamp being set), and there is - * a checkpoint already running, or this page was read with lookaside - * history, or the stable timestamp hasn't changed since last time this - * page was successfully, skew oldest instead. + * We usually prefer to skew to newer versions, the logic being that by the time the next + * checkpoint runs, it is likely that all the updates we choose will be stable. However, if + * checkpointing with a timestamp (indicated by a stable_timestamp being set), and there is a + * checkpoint already running, or this page was read with lookaside history, or the stable + * timestamp hasn't changed since the last time this page was successfully reconciled, skew + * oldest instead. */ if (F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DEBUG_MODE) && __wt_random(&session->rnd) % 3 == 0) @@ -686,9 +677,8 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO r->evict_matching_checksum_failed = false; /* - * Dictionary compression only writes repeated values once. We grow - * the dictionary as necessary, always using the largest size we've - * seen. + * Dictionary compression only writes repeated values once. We grow the dictionary as necessary, + * always using the largest size we've seen. * * Reset the dictionary. * @@ -945,12 +935,10 @@ __rec_split_chunk_init( * * Don't touch the disk image item memory, that memory is reused. * - * Clear the disk page header to ensure all of it is initialized, even - * the unused fields. + * Clear the disk page header to ensure all of it is initialized, even the unused fields.
* - * In the case of fixed-length column-store, clear the entire buffer: - * fixed-length column-store sets bits in bytes, where the bytes are - * assumed to initially be 0. + * In the case of fixed-length column-store, clear the entire buffer: fixed-length column-store + * sets bits in bytes, where the bytes are assumed to initially be 0. */ WT_RET(__wt_buf_init(session, &chunk->image, memsize)); memset(chunk->image.mem, 0, r->page->type == WT_PAGE_COL_FIX ? memsize : WT_PAGE_HEADER_SIZE); @@ -988,39 +976,32 @@ __wt_rec_split_init( r->page_size = (uint32_t)max; /* - * If we have to split, we want to choose a smaller page size for the - * split pages, because otherwise we could end up splitting one large - * packed page over and over. We don't want to pick the minimum size - * either, because that penalizes an application that did a bulk load - * and subsequently inserted a few items into packed pages. Currently - * defaulted to 75%, but I have no empirical evidence that's "correct". + * If we have to split, we want to choose a smaller page size for the split pages, because + * otherwise we could end up splitting one large packed page over and over. We don't want to + * pick the minimum size either, because that penalizes an application that did a bulk load and + * subsequently inserted a few items into packed pages. Currently defaulted to 75%, but I have + * no empirical evidence that's "correct". * - * The maximum page size may be a multiple of the split page size (for - * example, there's a maximum page size of 128KB, but because the table - * is active and we don't want to split a lot, the split size is 20KB). - * The maximum page size may NOT be an exact multiple of the split page + * The maximum page size may be a multiple of the split page size (for example, there's a + * maximum page size of 128KB, but because the table is active and we don't want to split a lot, + * the split size is 20KB). The maximum page size may NOT be an exact multiple of the split page * size. * - * It's lots of work to build these pages and don't want to start over - * when we reach the maximum page size (it's painful to restart after - * creating overflow items and compacted data, for example, as those - * items have already been written to disk). So, the loop calls the - * helper functions when approaching a split boundary, and we save the - * information at that point. We also save the boundary information at - * the minimum split size. We maintain two chunks (each boundary - * represents a chunk that gets written as a page) in the memory, - * writing out the older one to the disk as a page when we need to make - * space for a new chunk. On reaching the last chunk, if it turns out to - * be smaller than the minimum split size, we go back into the - * penultimate chunk and split at this minimum split size boundary. This - * moves some data from the penultimate chunk to the last chunk, hence - * increasing the size of the last page written without decreasing the - * penultimate page size beyond the minimum split size. + * It's lots of work to build these pages and we don't want to start over when we reach the + * maximum page size (it's painful to restart after creating overflow items and compacted data, + * for example, as those items have already been written to disk). So, the loop calls the helper + * functions when approaching a split boundary, and we save the information at that point. We + * also save the boundary information at the minimum split size.
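The 75% default mentioned above is plain arithmetic; a sketch of the proportional choice (the real function also rounds the result to the btree's allocation size, which is omitted here, and the names are illustrative):

#include <stdint.h>

static uint32_t
split_page_size(uint32_t maxpagesize, uint32_t split_pct)
{
    uint64_t a;

    a = maxpagesize; /* avoid 32-bit overflow in the multiply */
    return ((uint32_t)((a * split_pct) / 100));
}

/* With the quoted default, split_page_size(128 * 1024, 75) is 96KB: a 128KB
 * page that must split produces roughly 96KB chunks rather than minimum-size
 * ones, leaving room for future growth without repeated splitting. */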
We maintain two chunks (each + * boundary represents a chunk that gets written as a page) in memory, writing out the older + * one to disk as a page when we need to make space for a new chunk. On reaching the last + * chunk, if it turns out to be smaller than the minimum split size, we go back into the + * penultimate chunk and split at this minimum split size boundary. This moves some data from + * the penultimate chunk to the last chunk, hence increasing the size of the last page written + * without decreasing the penultimate page size beyond the minimum split size. * - * Finally, all this doesn't matter for fixed-size column-store pages - * and salvage. Fixed-size column store pages can split under (very) - * rare circumstances, but they're allocated at a fixed page size, never - * anything smaller. In salvage, as noted above, we can't split at all. + * Finally, all this doesn't matter for fixed-size column-store pages and salvage. Fixed-size + * column store pages can split under (very) rare circumstances, but they're allocated at a + * fixed page size, never anything smaller. In salvage, as noted above, we can't split at all. */ if (r->salvage != NULL) { r->split_size = 0; @@ -1094,14 +1075,12 @@ __rec_is_checkpoint(WT_SESSION_IMPL *session, WT_RECONCILE *r) * * This function exists as a place to hang this comment. * - * Any time we write the root page of the tree without splitting we are - * creating a checkpoint (and have to tell the underlying block manager - * so it creates and writes the additional information checkpoints - * require). However, checkpoints are completely consistent, and so we - * have to resolve information about the blocks we're expecting to free - * as part of the checkpoint, before writing the checkpoint. In short, - * we don't do checkpoint writes here; clear the boundary information as - * a reminder and create the checkpoint during wrapup. + * Any time we write the root page of the tree without splitting we are creating a checkpoint + * (and have to tell the underlying block manager so it creates and writes the additional + * information checkpoints require). However, checkpoints are completely consistent, and so we + * have to resolve information about the blocks we're expecting to free as part of the + * checkpoint, before writing the checkpoint. In short, we don't do checkpoint writes here; + * clear the boundary information as a reminder and create the checkpoint during wrapup. */ return (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT) && __wt_ref_is_root(r->ref)); } @@ -1124,36 +1103,30 @@ __rec_split_row_promote(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ITEM *key, int cmp; /* - * For a column-store, the promoted key is the recno and we already have - * a copy. For a row-store, it's the first key on the page, a variable- - * length byte string, get a copy. + * For a column-store, the promoted key is the recno and we already have a copy. For a + * row-store, it's the first key on the page, a variable-length byte string, get a copy. * - * This function is called from the split code at each split boundary, - * but that means we're not called before the first boundary, and we - * will eventually have to get the first key explicitly when splitting - * a page. + * This function is called from the split code at each split boundary, but that means we're not + * called before the first boundary, and we will eventually have to get the first key explicitly + * when splitting a page.
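The suffix compression this function goes on to describe keeps only enough bytes of the promoted key to sort above the last key on the preceding leaf page. A byte-string sketch of that truncation (illustrative, and assuming the promoted key compares larger, as the surrounding code guarantees):

#include <stddef.h>

static size_t
promote_key_len(const char *last, size_t last_len, const char *cur, size_t cur_len)
{
    size_t i, n;

    n = last_len < cur_len ? last_len : cur_len;
    for (i = 0; i < n; ++i)
        if (last[i] != cur[i])
            return (i + 1); /* the first distinguishing byte is enough */
    /* "cur" extends "last": keep one byte past the shared prefix. */
    return (n + 1 <= cur_len ? n + 1 : cur_len);
}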
* - * For the current slot, take the last key we built, after doing suffix - * compression. The "last key we built" describes some process: before - * calling the split code, we must place the last key on the page before - * the boundary into the "last" key structure, and the first key on the - * page after the boundary into the "current" key structure, we're going - * to compare them for suffix compression. + * For the current slot, take the last key we built, after doing suffix compression. The "last + * key we built" describes some process: before calling the split code, we must place the last + * key on the page before the boundary into the "last" key structure, and the first key on the + * page after the boundary into the "current" key structure, we're going to compare them for + * suffix compression. * - * Suffix compression is a hack to shorten keys on internal pages. We - * only need enough bytes in the promoted key to ensure searches go to - * the correct page: the promoted key has to be larger than the last key - * on the leaf page preceding it, but we don't need any more bytes than - * that. In other words, we can discard any suffix bytes not required - * to distinguish between the key being promoted and the last key on the - * leaf page preceding it. This can only be done for the first level of - * internal pages, you cannot repeat suffix truncation as you split up - * the tree, it loses too much information. + * Suffix compression is a hack to shorten keys on internal pages. We only need enough bytes in + * the promoted key to ensure searches go to the correct page: the promoted key has to be larger + * than the last key on the leaf page preceding it, but we don't need any more bytes than that. + * In other words, we can discard any suffix bytes not required to distinguish between the key + * being promoted and the last key on the leaf page preceding it. This can only be done for the + * first level of internal pages, you cannot repeat suffix truncation as you split up the tree, + * it loses too much information. * - * Note #1: if the last key on the previous page was an overflow key, - * we don't have the in-memory key against which to compare, and don't - * try to do suffix compression. The code for that case turns suffix - * compression off for the next key, we don't have to deal with it here. + * Note #1: if the last key on the previous page was an overflow key, we don't have the + * in-memory key against which to compare, and don't try to do suffix compression. The code for + * that case turns suffix compression off for the next key, we don't have to deal with it here. */ if (type != WT_PAGE_ROW_LEAF || !r->key_sfx_compress) return (__wt_buf_set(session, key, r->cur->data, r->cur->size)); @@ -1449,9 +1422,8 @@ __rec_split_finish_process_prev(WT_SESSION_IMPL *session, WT_RECONCILE *r) if (prev_ptr->min_offset != 0 && cur_ptr->image.size < r->min_split_size) { /* - * The last chunk, pointed to by the current image pointer, has - * less than the minimum data. Let's move any data more than the - * minimum from the previous image into the current. + * The last chunk, pointed to by the current image pointer, has less than the minimum data. + * Let's move any data more than the minimum from the previous image into the current. * * Grow the current buffer if it is not large enough. */ @@ -1504,13 +1476,11 @@ int __wt_rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) { /* - * We're done reconciling, write the final page. 
We may arrive here with - * no entries to write if the page was entirely empty or if nothing on - * the page was visible to us. + * We're done reconciling, write the final page. We may arrive here with no entries to write if + * the page was entirely empty or if nothing on the page was visible to us. * - * Pages with skipped or not-yet-globally visible updates aren't really - * empty; otherwise, the page is truly empty and we will merge it into - * its parent during the parent's reconciliation. + * Pages with skipped or not-yet-globally visible updates aren't really empty; otherwise, the + * page is truly empty and we will merge it into its parent during the parent's reconciliation. */ if (r->entries == 0 && r->supd_next == 0) return (0); @@ -1564,11 +1534,11 @@ __rec_split_write_supd( int cmp; /* - * Check if we've saved updates that belong to this block, and move - * any to the per-block structure. + * Check if we've saved updates that belong to this block, and move any to the per-block + * structure. * - * This code requires a key be filled in for the next block (or the - * last block flag be set, if there's no next block). + * This code requires a key be filled in for the next block (or the last block flag be set, if + * there's no next block). * * The last block gets all remaining saved updates. */ @@ -1580,13 +1550,11 @@ __rec_split_write_supd( } /* - * Get the saved update's key and compare it with the block's key range. - * If the saved update list belongs with the block we're about to write, - * move it to the per-block memory. Check only to the first update that - * doesn't go with the block, they must be in sorted order. + * Get the saved update's key and compare it with the block's key range. If the saved update + * list belongs with the block we're about to write, move it to the per-block memory. Check only + * to the first update that doesn't go with the block, they must be in sorted order. * - * The other chunk will have the key for the next page, that's what we - * compare against. + * The other chunk will have the key for the next page, that's what we compare against. */ next = chunk == r->cur_ptr ? r->prev_ptr : r->cur_ptr; page = r->page; @@ -1716,10 +1684,9 @@ __rec_split_write_reuse( /* * Calculating the checksum is the expensive part, try to avoid it. * - * Ignore the last block of any reconciliation. Pages are written in the - * same block order every time, so the last block written for a page is - * unlikely to match any previously written block or block written in - * the future, (absent a point-update earlier in the page which didn't + * Ignore the last block of any reconciliation. Pages are written in the same block order every + * time, so the last block written for a page is unlikely to match any previously written block + * or block written in the future, (absent a point-update earlier in the page which didn't * change the size of the on-page object in any way). */ if (last_block) @@ -1803,18 +1770,15 @@ __rec_compression_adjust(WT_SESSION_IMPL *session, uint32_t max, size_t compress if (compressed_size > max) { /* - * The compressed size is GT the page maximum. - * Check if the pre-compression size is larger than the maximum. - * If 10% of the page size larger than the maximum, decrease it - * by that amount. Else if it's not already at the page maximum, - * set it there. + * The compressed size is GT the page maximum. Check if the pre-compression size is larger + * than the maximum. 
If it's 10% of the page size larger than the maximum, decrease it by that + * amount. Else if it's not already at the page maximum, set it there. * - * Note we're using 10% of the maximum page size as our test for - * when to adjust the pre-compression size as well as the amount - * by which we adjust it. Not updating the value when it's close - * to the page size keeps us from constantly updating a shared - * memory location, and 10% of the page size is an OK step value - * as well, so we use it in both cases. + * Note we're using 10% of the maximum page size as our test for when to adjust the + * pre-compression size as well as the amount by which we adjust it. Not updating the value + * when it's close to the page size keeps us from constantly updating a shared memory + * location, and 10% of the page size is an OK step value as well, so we use it in both + * cases. */ adjust = current - max; if (adjust > ten_percent) @@ -1827,12 +1791,10 @@ __rec_compression_adjust(WT_SESSION_IMPL *session, uint32_t max, size_t compress /* * The compressed size is LTE the page maximum. * - * Don't increase the pre-compressed size on the last block, the - * last block might be tiny. + * Don't increase the pre-compressed size on the last block, the last block might be tiny. * - * If the compressed size is less than the page maximum by 10%, - * increase the pre-compression size by 10% of the page, or up - * to the maximum in-memory image size. + * If the compressed size is less than the page maximum by 10%, increase the pre-compression + * size by 10% of the page, or up to the maximum in-memory image size. * * Note we're using 10% of the maximum page size... see above. */ if (compressed_size <= max) @@ -1918,13 +1880,12 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK *chunk __rec_split_write_header(session, r, chunk, multi, compressed_image->mem); /* - * If we are writing the whole page in our first/only attempt, it might - * be a checkpoint (checkpoints are only a single page, by definition). - * Checkpoints aren't written here, the wrapup functions do the write. + * If we are writing the whole page in our first/only attempt, it might be a checkpoint + * (checkpoints are only a single page, by definition). Checkpoints aren't written here, the + * wrapup functions do the write. * - * Track the buffer with the image. (This is bad layering, but we can't - * write the image until the wrapup code, and we don't have a code path - * from here to there.) + * Track the buffer with the image. (This is bad layering, but we can't write the image until + * the wrapup code, and we don't have a code path from here to there.) */ if (last_block && r->multi_next == 1 && __rec_is_checkpoint(session, r)) { WT_ASSERT(session, r->supd_next == 0); @@ -2244,8 +2205,8 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* * Discard the replacement leaf page's blocks. * - * The exception is root pages are never tracked or free'd, they - * are checkpoints, and must be explicitly dropped. + * The exception is root pages are never tracked or free'd, they are + * checkpoints, and must be explicitly dropped. */ if (!__wt_ref_is_root(ref)) WT_RET(__wt_btree_block_free(session, mod->mod_replace.addr, mod->mod_replace.size)); @@ -2306,17 +2267,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) break; case 1: /* 1-for-1 page swap */ /* - * Because WiredTiger's pages grow without splitting, we're - * replacing a single page with another single page most of - * the time.
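The 10% rule in __rec_compression_adjust above, isolated as a pure function (illustrative names; the real code stores the result back into a shared per-btree field and caps growth at the maximum in-memory image size):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static uint32_t
compression_adjust(uint32_t current, uint32_t max, size_t compressed_size, bool last_block)
{
    uint32_t ten_percent = max / 10;

    if (compressed_size > max) {
        /* Came out too big: step the pre-compression size down. */
        if (current - max > ten_percent)
            return (current - ten_percent);
        return (max);
    }
    /* Came out small enough: maybe step up, but never on the last
     * (possibly tiny) block, and only when at least 10% under the maximum. */
    if (!last_block && compressed_size < max - ten_percent)
        return (current + ten_percent);
    return (current);
}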
+ * Because WiredTiger's pages grow without splitting, we're replacing a single page with + * another single page most of the time. * - * If in-memory, or saving/restoring changes for this page and - * there's only one block, there's nothing to write. Set up - * a single block as if to split, then use that disk image to - * rewrite the page in memory. This is separate from simple - * replacements where eviction has decided to retain the page - * in memory because the latter can't handle update lists and - * splits can. + * If in-memory, or saving/restoring changes for this page and there's only one block, + * there's nothing to write. Set up a single block as if to split, then use that disk + * image to rewrite the page in memory. This is separate from simple replacements where + * eviction has decided to retain the page in memory because the latter can't handle + * update lists and splits can. */ if (F_ISSET(r, WT_REC_IN_MEMORY) || (F_ISSET(r, WT_REC_UPDATE_RESTORE) && r->multi->supd_entries != 0)) @@ -2395,12 +2353,10 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) multi->addr.reuse = 0; /* - * On error, discard blocks we've written, they're unreferenced by the - * tree. This is not a question of correctness, we're avoiding block - * leaks. + * On error, discard blocks we've written, they're unreferenced by the tree. This is not a + * question of correctness, we're avoiding block leaks. * - * Don't discard backing blocks marked for reuse, they remain part of - * a previous reconciliation. + * Don't discard backing blocks marked for reuse, they remain part of a previous reconciliation. */ for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i) if (multi->addr.addr != NULL) { diff --git a/src/third_party/wiredtiger/src/schema/schema_create.c b/src/third_party/wiredtiger/src/schema/schema_create.c index bb4a61687eb..cd7609f7153 100644 --- a/src/third_party/wiredtiger/src/schema/schema_create.c +++ b/src/third_party/wiredtiger/src/schema/schema_create.c @@ -98,13 +98,12 @@ __create_file(WT_SESSION_IMPL *session, const char *uri, bool exclusive, const c } /* - * Open the file to check that it was setup correctly. We don't need to - * pass the configuration, we just wrote the collapsed configuration - * into the metadata file, and it's going to be read/used by underlying - * functions. + * Open the file to check that it was set up correctly. We don't need to pass the configuration, + * we just wrote the collapsed configuration into the metadata file, and it's going to be + * read/used by underlying functions. * - * Keep the handle exclusive until it is released at the end of the - * call, otherwise we could race with a drop. + * Keep the handle exclusive until it is released at the end of the call, otherwise we could + * race with a drop. */ WT_ERR(__wt_session_get_dhandle(session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE)); if (WT_META_TRACKING(session)) @@ -371,18 +370,15 @@ __create_index(WT_SESSION_IMPL *session, const char *name, bool exclusive, const name); /* - * Note: it would be better to keep the table exclusive here, while - * changing its indexes. We don't because some operation we perform - * below reacquire the table handle (such as opening a cursor on the - * table in order to fill the index). If we keep the handle exclusive - * here, those operations wanting ordinary access will conflict, - * leading to errors.
At the same time, we don't want to allow - * table cursors that have already been fully opened to remain open - * across this call. + * Note: it would be better to keep the table exclusive here, while changing its indexes. We + * don't because some operations we perform below reacquire the table handle (such as opening a + * cursor on the table in order to fill the index). If we keep the handle exclusive here, those + * operations wanting ordinary access will conflict, leading to errors. At the same time, we + * don't want to allow table cursors that have already been fully opened to remain open across + * this call. * - * Temporarily getting the table exclusively serves the purpose - * of ensuring that cursors on the table that are already open - * must at least be closed before this call proceeds. + * Temporarily getting the table exclusively serves the purpose of ensuring that cursors on the + * table that are already open must at least be closed before this call proceeds. */ tlen = (size_t)(idxname++ - tablename); if ((ret = __wt_schema_get_table( diff --git a/src/third_party/wiredtiger/src/schema/schema_drop.c b/src/third_party/wiredtiger/src/schema/schema_drop.c index 4a3e616d9ab..527e8540d74 100644 --- a/src/third_party/wiredtiger/src/schema/schema_drop.c +++ b/src/third_party/wiredtiger/src/schema/schema_drop.c @@ -113,15 +113,13 @@ __drop_table(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) /* * Open the table so we can drop its column groups and indexes. * - * Ideally we would keep the table locked exclusive across the drop, - * but for now we rely on the global table lock to prevent the table - * being reopened while it is being dropped. One issue is that the - * WT_WITHOUT_LOCKS macro can drop and reacquire the global table lock, - * avoiding deadlocks while waiting for LSM operation to quiesce. + * Ideally we would keep the table locked exclusive across the drop, but for now we rely on the + * global table lock to prevent the table being reopened while it is being dropped. One issue is + * that the WT_WITHOUT_LOCKS macro can drop and reacquire the global table lock, avoiding + * deadlocks while waiting for LSM operations to quiesce. * - * Temporarily getting the table exclusively serves the purpose - * of ensuring that cursors on the table that are already open - * must at least be closed before this call proceeds. + * Temporarily getting the table exclusively serves the purpose of ensuring that cursors on the + * table that are already open must at least be closed before this call proceeds. */ WT_ERR(__wt_schema_get_table_uri(session, uri, true, WT_DHANDLE_EXCLUSIVE, &table)); WT_ERR(__wt_schema_release_table(session, &table)); diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c index 4c6a8b02c26..4e9d98d77fb 100644 --- a/src/third_party/wiredtiger/src/schema/schema_open.c +++ b/src/third_party/wiredtiger/src/schema/schema_open.c @@ -148,14 +148,12 @@ __open_index(WT_SESSION_IMPL *session, WT_TABLE *table, WT_INDEX *idx) WT_ERR(__wt_strndup(session, cval.str, cval.len, &idx->key_format)); /* - * The key format for an index is somewhat subtle: the application - * specifies a set of columns that it will use for the key, but the - * engine usually adds some hidden columns in order to derive the - * primary key. These hidden columns are part of the file's key.
+ * The key format for an index is somewhat subtle: the application specifies a set of columns + * that it will use for the key, but the engine usually adds some hidden columns in order to + * derive the primary key. These hidden columns are part of the file's key. * - * The file's key_format is stored persistently, we need to calculate - * the index cursor key format (which will usually omit some of those - * keys). + * The file's key_format is stored persistently, we need to calculate the index cursor key + * format (which will usually omit some of those keys). */ WT_ERR(__wt_buf_init(session, buf, 0)); WT_ERR(__wt_config_getones(session, idx->config, "columns", &idx->colconf)); diff --git a/src/third_party/wiredtiger/src/schema/schema_project.c b/src/third_party/wiredtiger/src/schema/schema_project.c index ebcbe45e8fc..aa441d67f5f 100644 --- a/src/third_party/wiredtiger/src/schema/schema_project.c +++ b/src/third_party/wiredtiger/src/schema/schema_project.c @@ -310,11 +310,9 @@ __wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char * /* * Read the item we're about to overwrite. * - * There is subtlety here: the value format - * may not exactly match the cursor's format. - * In particular, we need lengths with raw - * columns in the middle of a packed struct, - * but not if they are at the end of a struct. + * There is subtlety here: the value format may not exactly match the cursor's + * format. In particular, we need lengths with raw columns in the middle of a packed + * struct, but not if they are at the end of a struct. */ WT_RET(__pack_next(&pack, &pv)); diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c index 304d7305504..a151b1640c8 100644 --- a/src/third_party/wiredtiger/src/schema/schema_rename.c +++ b/src/third_party/wiredtiger/src/schema/schema_rename.c @@ -207,10 +207,9 @@ __rename_table(WT_SESSION_IMPL *session, const char *uri, const char *newuri, co /* * Open the table so we can rename its column groups and indexes. * - * Ideally we would keep the table locked exclusive across the rename, - * but for now we rely on the global table lock to prevent the table - * being reopened while it is being renamed. One issue is that the - * WT_WITHOUT_LOCKS macro can drop and reacquire the global table lock, + * Ideally we would keep the table locked exclusive across the rename, but for now we rely on + * the global table lock to prevent the table being reopened while it is being renamed. One + * issue is that the WT_WITHOUT_LOCKS macro can drop and reacquire the global table lock, * avoiding deadlocks while waiting for LSM operation to quiesce. */ WT_RET(__wt_schema_get_table(session, oldname, strlen(oldname), false, 0, &table)); diff --git a/src/third_party/wiredtiger/src/schema/schema_stat.c b/src/third_party/wiredtiger/src/schema/schema_stat.c index f612129b2ce..33f8d5cc7d6 100644 --- a/src/third_party/wiredtiger/src/schema/schema_stat.c +++ b/src/third_party/wiredtiger/src/schema/schema_stat.c @@ -150,9 +150,8 @@ __wt_curstat_table_init( /* * Process the column groups. * - * Set the cursor to reference the data source statistics; we don't - * initialize it, instead we copy (rather than aggregate), the first - * column's statistics, which has the same effect. + * Set the cursor to reference the data source statistics; we don't initialize it, instead we + * copy (rather than aggregate) the first column's statistics, which has the same effect.
*/ stats = &cst->u.dsrc_stats; for (i = 0; i < WT_COLGROUPS(table); i++) { diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 09148db3018..098acb87c60 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -325,13 +325,12 @@ __session_close(WT_SESSION *wt_session, const char *config) WT_STAT_CONN_DECR(session, session_open); /* - * Sessions are re-used, clear the structure: the clear sets the active - * field to 0, which will exclude the hazard array from review by the - * eviction thread. Because some session fields are accessed by other - * threads, the structure must be cleared carefully. + * Sessions are re-used, clear the structure: the clear sets the active field to 0, which will + * exclude the hazard array from review by the eviction thread. Because some session fields are + * accessed by other threads, the structure must be cleared carefully. * - * We don't need to publish here, because regardless of the active field - * being non-zero, the hazard pointer is always valid. + * We don't need to publish here, because regardless of the active field being non-zero, the + * hazard pointer is always valid. */ __session_clear(session); session = conn->default_session; @@ -423,12 +422,11 @@ __session_open_cursor_int(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR * *cursorp = NULL; /* - * Open specific cursor types we know about, or call the generic data - * source open function. + * Open specific cursor types we know about, or call the generic data source open function. * - * Unwind a set of string comparisons into a switch statement hoping - * the compiler can make it fast, but list the common choices first - * instead of sorting so if/else patterns are still fast. + * Unwind a set of string comparisons into a switch statement hoping the compiler can make it + * fast, but list the common choices first instead of sorting so if/else patterns are still + * fast. */ switch (uri[0]) { /* @@ -596,13 +594,12 @@ err: WT_TRET(cursor->close(cursor)); } /* - * Opening a cursor on a non-existent data source will set ret to - * either of ENOENT or WT_NOTFOUND at this point. However, - * applications may reasonably do this inside a transaction to check - * for the existence of a table or index. + * Opening a cursor on a non-existent data source will set ret to either of ENOENT or + * WT_NOTFOUND at this point. However, applications may reasonably do this inside a transaction + * to check for the existence of a table or index. * - * Failure in opening a cursor should not set an error on the - * transaction and WT_NOTFOUND will be mapped to ENOENT. + * Failure in opening a cursor should not set an error on the transaction and WT_NOTFOUND will + * be mapped to ENOENT. */ API_END_RET_NO_TXN_ERROR(session, ret); @@ -1350,15 +1347,14 @@ __wt_session_range_truncate( WT_ERR(__wt_bad_object_type(session, stop->uri)); /* - * If both cursors set, check they're correctly ordered with respect to - * each other. We have to test this before any search, the search can - * change the initial cursor position. + * If both cursors are set, check they're correctly ordered with respect to each other. We have + * to test this before any search, the search can change the initial cursor position. * - * Rather happily, the compare routine will also confirm the cursors - * reference the same object and the keys are set.
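From the application side, the ordering check above corresponds to calling cursor->compare before WT_SESSION::truncate; compare also confirms both cursors reference the same object. A sketch using the public API, assuming both cursors have their keys set and eliding most error handling:

#include <errno.h>
#include <wiredtiger.h>

static int
truncate_range(WT_SESSION *session, WT_CURSOR *start, WT_CURSOR *stop)
{
    int cmp, ret;

    if ((ret = start->compare(start, stop, &cmp)) != 0)
        return (ret);
    if (cmp > 0) /* start positioned after stop: nothing sensible to do */
        return (EINVAL);
    return (session->truncate(session, NULL, start, stop, NULL));
}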
+ * Rather happily, the compare routine will also confirm the cursors reference the same object + * and the keys are set. * - * The test for a NULL start comparison function isn't necessary (we - * checked it above), but it quiets clang static analysis complaints. + * The test for a NULL start comparison function isn't necessary (we checked it above), but it + * quiets clang static analysis complaints. */ if (start != NULL && stop != NULL && start->compare != NULL) { WT_ERR(start->compare(start, stop, &cmp)); @@ -1391,13 +1387,12 @@ __wt_session_range_truncate( } /* - * We always truncate in the forward direction because the underlying - * data structures can move through pages faster forward than backward. - * If we don't have a start cursor, create one and position it at the - * first record. + * We always truncate in the forward direction because the underlying data structures can move + * through pages faster forward than backward. If we don't have a start cursor, create one and + * position it at the first record. * - * If start is NULL, stop must not be NULL, but static analyzers have - * a hard time with that, test explicitly. + * If start is NULL, stop must not be NULL, but static analyzers have a hard time with that, + * test explicitly. */ if (start == NULL && stop != NULL) { WT_ERR(__session_open_cursor((WT_SESSION *)session, stop->uri, NULL, NULL, &start)); @@ -1421,9 +1416,8 @@ err: /* * Close any locally-opened start cursor. * - * Reset application cursors, they've possibly moved and the - * application cannot use them. Note that we can make it here with a - * NULL start cursor (e.g., if the truncate range is empty). + * Reset application cursors, they've possibly moved and the application cannot use them. Note + * that we can make it here with a NULL start cursor (e.g., if the truncate range is empty). */ if (local_start) WT_TRET(start->close(start)); @@ -1450,12 +1444,12 @@ __session_truncate( WT_STAT_CONN_INCR(session, cursor_truncate); /* - * If the URI is specified, we don't need a start/stop, if start/stop - * is specified, we don't need a URI. One exception is the log URI - * which may truncate (archive) log files for a backup cursor. + * If the URI is specified, we don't need a start/stop, if start/stop is specified, we don't + * need a URI. One exception is the log URI which may truncate (archive) log files for a backup + * cursor. * - * If no URI is specified, and both cursors are specified, start/stop - * must reference the same object. + * If no URI is specified, and both cursors are specified, start/stop must reference the same + * object. * * Any specified cursor must have been initialized. */ @@ -1956,17 +1950,14 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config) WT_ERR(__wt_inmem_unsupported_op(session, NULL)); /* - * Checkpoints require a snapshot to write a transactionally consistent - * snapshot of the data. + * Checkpoints require a snapshot to write a transactionally consistent snapshot of the data. * - * We can't use an application's transaction: if it has uncommitted - * changes, they will be written in the checkpoint and may appear after - * a crash. + * We can't use an application's transaction: if it has uncommitted changes, they will be + * written in the checkpoint and may appear after a crash. * - * Use a real snapshot transaction: we don't want any chance of the - * snapshot being updated during the checkpoint. 
Eviction is prevented - * from evicting anything newer than this because we track the oldest - * transaction ID in the system that is not visible to all readers. + * Use a real snapshot transaction: we don't want any chance of the snapshot being updated + * during the checkpoint. Eviction is prevented from evicting anything newer than this because + * we track the oldest transaction ID in the system that is not visible to all readers. */ WT_ERR(__wt_txn_context_check(session, false)); diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c index 2be298c330e..f01962b2e78 100644 --- a/src/third_party/wiredtiger/src/session/session_compact.c +++ b/src/third_party/wiredtiger/src/session/session_compact.c @@ -287,11 +287,9 @@ __compact_worker(WT_SESSION_IMPL *session) } /* - * If compaction failed because checkpoint was running, - * continue with the next handle. We might continue to - * race with checkpoint on each handle, but that's OK, - * we'll step through all the handles, and then we'll - * block until a checkpoint completes. + * If compaction failed because checkpoint was running, continue with the next handle. + * We might continue to race with checkpoint on each handle, but that's OK, we'll step + * through all the handles, and then we'll block until a checkpoint completes. * * Just quit if eviction is the problem. */ diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c index 9a3fceeb48b..9bf35dca909 100644 --- a/src/third_party/wiredtiger/src/session/session_dhandle.c +++ b/src/third_party/wiredtiger/src/session/session_dhandle.c @@ -115,14 +115,12 @@ __wt_session_lock_dhandle(WT_SESSION_IMPL *session, uint32_t flags, bool *is_dea want_exclusive = LF_ISSET(WT_DHANDLE_EXCLUSIVE); /* - * If this session already has exclusive access to the handle, there is - * no point trying to lock it again. + * If this session already has exclusive access to the handle, there is no point trying to lock + * it again. * - * This should only happen if a checkpoint handle is locked multiple - * times during a checkpoint operation, or the handle is already open - * without any special flags. In particular, it must fail if - * attempting to checkpoint a handle opened for a bulk load, even in - * the same session. + * This should only happen if a checkpoint handle is locked multiple times during a checkpoint + * operation, or the handle is already open without any special flags. In particular, it must + * fail if attempting to checkpoint a handle opened for a bulk load, even in the same session. */ if (dhandle->excl_session == session) { if (!LF_ISSET(WT_DHANDLE_LOCK_ONLY) && @@ -134,14 +132,12 @@ __wt_session_lock_dhandle(WT_SESSION_IMPL *session, uint32_t flags, bool *is_dea } /* - * Check that the handle is open. We've already incremented - * the reference count, so once the handle is open it won't be - * closed by another thread. + * Check that the handle is open. We've already incremented the reference count, so once the + * handle is open it won't be closed by another thread. * - * If we can see the WT_DHANDLE_OPEN flag set while holding a - * lock on the handle, then it's really open and we can start - * using it. Alternatively, if we can get an exclusive lock - * and WT_DHANDLE_OPEN is still not set, we need to do the open. 
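The open-and-recheck dance described here is the classic double-checked locking shape: peek at the open flag without the lock, then recheck it once the lock is held. A generic sketch with a pthread rwlock (a simplification, not the real dhandle logic):

#include <pthread.h>
#include <stdbool.h>

struct handle {
    pthread_rwlock_t rwlock;
    bool open;
};

static int
handle_acquire(struct handle *h)
{
    for (;;) {
        if (h->open) {
            /* The unlocked peek looked open: take a read lock and recheck,
             * the flag may have been cleared before we got the lock. */
            pthread_rwlock_rdlock(&h->rwlock);
            if (h->open)
                return (0); /* hold the read lock while the handle is in use */
            pthread_rwlock_unlock(&h->rwlock);
        }
        /* Not open: get exclusive access and do the open ourselves. */
        pthread_rwlock_wrlock(&h->rwlock);
        if (!h->open)
            h->open = true; /* stands in for the real open work */
        pthread_rwlock_unlock(&h->rwlock);
    }
}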
+ * If we can see the WT_DHANDLE_OPEN flag set while holding a lock on the handle, then it's
+ * really open and we can start using it. Alternatively, if we can get an exclusive lock and
+ * WT_DHANDLE_OPEN is still not set, we need to do the open.
 */
 for (;;) {
 /* If the handle is dead, give up. */
@@ -159,11 +155,10 @@ __wt_session_lock_dhandle(WT_SESSION_IMPL *session, uint32_t flags, bool *is_dea
 /*
 * If the handle is open, get a read lock and recheck.
 *
- * Wait for a read lock if we want exclusive access and failed
- * to get it: the sweep server may be closing this handle, and
- * we need to wait for it to release its lock. If we want
- * exclusive access and find the handle open once we get the
- * read lock, give up: some other thread has it locked for real.
+ * Wait for a read lock if we want exclusive access and failed to get it: the sweep server
+ * may be closing this handle, and we need to wait for it to release its lock. If we want
+ * exclusive access and find the handle open once we get the read lock, give up: some other
+ * thread has it locked for real.
 */
 if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && (!want_exclusive || lock_busy)) {
 __wt_readlock(session, &dhandle->rwlock);
@@ -324,16 +319,15 @@ retry:
 __wt_free(session, checkpoint);
 /*
- * There's a potential race: we get the name of the most recent unnamed
- * checkpoint, but if it's discarded (or locked so it can be discarded)
- * by the time we try to open it, we'll fail the open. Retry in those
- * cases, a new "last" checkpoint should surface, and we can't return an
- * error, the application will be justifiably upset if we can't open the
- * last checkpoint instance of an object.
+ * There's a potential race: we get the name of the most recent unnamed checkpoint, but if it's
+ * discarded (or locked so it can be discarded) by the time we try to open it, we'll fail the
+ * open. Retry in those cases, a new "last" checkpoint should surface, and we can't return an
+ * error: the application will be justifiably upset if we can't open the last checkpoint
+ * instance of an object.
 *
- * The check against WT_NOTFOUND is correct: if there was no checkpoint
- * for the object (that is, the object has never been in a checkpoint),
- * we returned immediately after the call to search for that name.
+ * The check against WT_NOTFOUND is correct: if there was no checkpoint for the object (that is,
+ * the object has never been in a checkpoint), we returned immediately after the call to search
+ * for that name.
 */
 if (last_ckpt && (ret == WT_NOTFOUND || ret == EBUSY))
 goto retry;
@@ -485,14 +479,12 @@ __wt_session_get_dhandle(WT_SESSION_IMPL *session, const char *uri, const char *
 WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));
 /*
- * For now, we need the schema lock and handle list locks to
- * open a file for real.
+ * For now, we need the schema lock and handle list locks to open a file for real.
 *
- * Code needing exclusive access (such as drop or verify)
- * assumes that it can close all open handles, then open an
- * exclusive handle on the active tree and no other threads can
- * reopen handles in the meantime. A combination of the schema
- * and handle list locks are used to enforce this.
+ * Code needing exclusive access (such as drop or verify) assumes that it can close all open
+ * handles, then open an exclusive handle on the active tree and no other threads can reopen
+ * handles in the meantime. A combination of the schema and handle list locks is used to
+ * enforce this.
 */
 if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) {
 dhandle->excl_session = NULL;
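For context on the retry above: applications reach the most recent unnamed checkpoint through the
reserved checkpoint name, and this retry is what keeps that open reliable while checkpoints are
being replaced. A minimal sketch, assuming an open session; the table URI is illustrative:

    WT_CURSOR *ckpt_cursor;

    /* "WiredTigerCheckpoint" names the most recent checkpoint of the object. */
    error_check(session->open_cursor(
      session, "table:access", NULL, "checkpoint=WiredTigerCheckpoint", &ckpt_cursor));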
diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c index c967354564c..2d7dd124b4f 100644
--- a/src/third_party/wiredtiger/src/support/err.c
+++ b/src/third_party/wiredtiger/src/support/err.c
@@ -228,14 +228,13 @@ __eventv(WT_SESSION_IMPL *session, bool msg_event, int error, const char *func,
 if (error != 0) {
 /*
- * When the engine calls __wt_err on error, it often outputs an
- * error message including the string associated with the error
- * it's returning. We could change the calls to call __wt_errx,
- * but it's simpler to not append an error string if all we are
- * doing is duplicating an existing error string.
+ * When the engine calls __wt_err on error, it often outputs an error message including the
+ * string associated with the error it's returning. We could change the calls to call
+ * __wt_errx, but it's simpler to not append an error string if all we are doing is
+ * duplicating an existing error string.
 *
- * Use strcmp to compare: both strings are nul-terminated, and
- * we don't want to run past the end of the buffer.
+ * Use strcmp to compare: both strings are nul-terminated, and we don't want to run past the
+ * end of the buffer.
 */
 err = __wt_strerror(session, error, NULL, 0);
 len = strlen(err);
@@ -244,18 +243,15 @@ __eventv(WT_SESSION_IMPL *session, bool msg_event, int error, const char *func,
 /*
- * If a handler fails, return the error status: if we're in the process
- * of handling an error, any return value we provide will be ignored by
- * our caller, our caller presumably already has an error value it will
- * be returning.
+ * If a handler fails, return the error status: if we're in the process of handling an error,
+ * any return value we provide will be ignored by our caller, our caller presumably already has
+ * an error value it will be returning.
 *
- * If an application-specified or default informational message handler
- * fails, complain using the application-specified or default error
- * handler.
+ * If an application-specified or default informational message handler fails, complain using
+ * the application-specified or default error handler.
 *
- * If an application-specified error message handler fails, complain
- * using the default error handler. If the default error handler fails,
- * fallback to stderr.
+ * If an application-specified error message handler fails, complain using the default error
+ * handler. If the default error handler fails, fall back to stderr.
 */
 wt_session = (WT_SESSION *)session;
 handler = session->event_handler;
diff --git a/src/third_party/wiredtiger/src/support/generation.c b/src/third_party/wiredtiger/src/support/generation.c index 431ca2c5a2f..c6e9de2c25a 100644
--- a/src/third_party/wiredtiger/src/support/generation.c
+++ b/src/third_party/wiredtiger/src/support/generation.c
@@ -100,9 +100,8 @@ __wt_gen_drain(WT_SESSION_IMPL *session, int which, uint64_t generation)
 WT_ORDERED_READ(v, s->generations[which]);
 /*
- * The generation argument is newer than the limit. Wait
- * for threads in generations older than the argument
- * generation, threads in argument generations are OK.
+ * The generation argument is newer than the limit. Wait for threads in generations
+ * older than the argument generation, threads in argument generations are OK.
 *
 * The thread's generation may be 0 (that is, not set).
 */
diff --git a/src/third_party/wiredtiger/src/support/hazard.c b/src/third_party/wiredtiger/src/support/hazard.c index 3710da4ec5e..75901c8181d 100644
--- a/src/third_party/wiredtiger/src/support/hazard.c
+++ b/src/third_party/wiredtiger/src/support/hazard.c
@@ -124,16 +124,13 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp
 /*
 * Do the dance:
 *
- * The memory location which makes a page "real" is the WT_REF's state
- * of WT_REF_LIMBO or WT_REF_MEM, which can be set to WT_REF_LOCKED
- * at any time by the page eviction server.
+ * The memory location which makes a page "real" is the WT_REF's state of WT_REF_LIMBO or
+ * WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the page eviction server.
 *
- * Add the WT_REF reference to the session's hazard list and flush the
- * write, then see if the page's state is still valid. If so, we can
- * use the page because the page eviction server will see our hazard
- * pointer before it discards the page (the eviction server sets the
- * state to WT_REF_LOCKED, then flushes memory and checks the hazard
- * pointers).
+ * Add the WT_REF reference to the session's hazard list and flush the write, then see if the
+ * page's state is still valid. If so, we can use the page because the page eviction server will
+ * see our hazard pointer before it discards the page (the eviction server sets the state to
+ * WT_REF_LOCKED, then flushes memory and checks the hazard pointers).
 */
 hp->ref = ref;
#ifdef HAVE_DIAGNOSTIC
@@ -200,12 +197,11 @@ __wt_hazard_clear(WT_SESSION_IMPL *session, WT_REF *ref)
 hp->ref = NULL;
 /*
- * If this was the last hazard pointer in the session,
- * reset the size so that checks can skip this session.
+ * If this was the last hazard pointer in the session, reset the size so that checks can
+ * skip this session.
 *
- * A write-barrier() is necessary before the change to
- * the in-use value, the number of active references
- * can never be less than the number of in-use slots.
+ * A write-barrier() is necessary before the change to the in-use value, the number of
+ * active references can never be less than the number of in-use slots.
 */
 if (--session->nhazard == 0)
 WT_PUBLISH(session->hazard_inuse, 0);
@@ -280,16 +276,13 @@ static inline void hazard_get_reference(WT_SESSION_IMPL *session, WT_HAZARD **hazardp, uint32_t *hazard_inusep)
 {
 /*
- * Hazard pointer arrays can be swapped out from under us if they grow.
- * First, read the current in-use value. The read must precede the read
- * of the hazard pointer itself (so the in-use value is pessimistic
- * should the hazard array grow), and additionally ensure we only read
- * the in-use value once. Then, read the hazard pointer, also ensuring
- * we only read it once.
+ * Hazard pointer arrays can be swapped out from under us if they grow. First, read the current
+ * in-use value. The read must precede the read of the hazard pointer itself (so the in-use
+ * value is pessimistic should the hazard array grow), and additionally ensure we only read the
+ * in-use value once. Then, read the hazard pointer, also ensuring we only read it once.
 *
- * Use a barrier instead of marking the fields volatile because we don't
- * want to slow down the rest of the hazard pointer functions that don't
- * need special treatment.
+ * Use a barrier instead of marking the fields volatile because we don't want to slow down the
+ * rest of the hazard pointer functions that don't need special treatment.
 */
 WT_ORDERED_READ(*hazard_inusep, session->hazard_inuse);
 WT_ORDERED_READ(*hazardp, session->hazard);
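Condensed from the __wt_hazard_set comment above, the publish-then-recheck sequence is roughly
the following sketch (the slot search, diagnostic bookkeeping and restart handling are omitted;
hp is assumed to be an empty slot in the session's hazard array):

    hp->ref = ref;     /* Publish the reference... */
    WT_FULL_BARRIER(); /* ...and flush the write... */
    if (ref->state == WT_REF_MEM || ref->state == WT_REF_LIMBO) {
        *busyp = false; /* ...then recheck: eviction must now see our hazard pointer. */
        return (0);
    }
    hp->ref = NULL; /* The eviction server locked the page first: unpublish, report busy. */
    *busyp = true;
    return (0);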
diff --git a/src/third_party/wiredtiger/src/support/huffman.c b/src/third_party/wiredtiger/src/support/huffman.c index 8420a625b45..906dc311a63 100644
--- a/src/third_party/wiredtiger/src/support/huffman.c
+++ b/src/third_party/wiredtiger/src/support/huffman.c
@@ -87,8 +87,7 @@ typedef struct __wt_huffman_obj {
 /*
 * Queue element data structure.
 *
- * Consists of a pointer to a huffman tree node, and a pointer to the next
- * element in the queue.
+ * Consists of a pointer to a huffman tree node, and a pointer to the next element in the queue.
 */
 typedef struct node_queue_elem {
 WT_FREQTREE_NODE *node;
@@ -98,8 +97,8 @@ typedef struct node_queue_elem {
 /*
 * Queue of huffman tree nodes.
 *
- * Contains a pointer to the beginning and the end of the queue, which is
- * implemented as a linked list.
+ * Contains a pointer to the beginning and the end of the queue, which is implemented as a linked
+ * list.
 */
 typedef struct node_queue {
 NODE_QUEUE_ELEM *first;
@@ -381,9 +380,8 @@ __wt_huffman_open(
 /*
 * Adding the leaves to the queue.
 *
- * Discard symbols with a frequency of 0; this assumes these symbols
- * never occur in the source stream, and the purpose is to reduce the
- * huffman tree's size.
+ * Discard symbols with a frequency of 0; this assumes these symbols never occur in the source
+ * stream, and the purpose is to reduce the huffman tree's size.
 */
 for (i = 0; i < symcnt; ++i)
 if (indexed_freqs[i].frequency > 0) {
diff --git a/src/third_party/wiredtiger/src/support/modify.c b/src/third_party/wiredtiger/src/support/modify.c index 848289d264d..086fb4b3920 100644
--- a/src/third_party/wiredtiger/src/support/modify.c
+++ b/src/third_party/wiredtiger/src/support/modify.c
@@ -111,17 +111,15 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_ITEM *value, WT_MODIFY *modify,
 size = modify->size;
 /*
- * Grow the buffer to the maximum size we'll need. This is pessimistic
- * because it ignores replacement bytes, but it's a simpler calculation.
+ * Grow the buffer to the maximum size we'll need. This is pessimistic because it ignores
+ * replacement bytes, but it's a simpler calculation.
 *
- * Grow the buffer first. This function is often called using a cursor
- * buffer referencing on-page memory and it's easy to overwrite a page.
- * A side-effect of growing the buffer is to ensure the buffer's value
- * is in buffer-local memory.
+ * Grow the buffer first. This function is often called using a cursor buffer referencing
+ * on-page memory and it's easy to overwrite a page. A side-effect of growing the buffer is to
+ * ensure the buffer's value is in buffer-local memory.
 *
- * Because the buffer may reference an overflow item, the data may not
- * start at the start of the buffer's memory and we have to correct for
- * that.
+ * Because the buffer may reference an overflow item, the data may not start at the start of the
+ * buffer's memory and we have to correct for that.
 */
 item_offset = WT_DATA_IN_ITEM(value) ? WT_PTRDIFF(value->data, value->mem) : 0;
 WT_RET(__wt_buf_grow(
@@ -217,15 +215,12 @@ __modify_fast_path(WT_ITEM *value, const size_t *p, int nentries, int *nappliedp
 WT_CLEAR(prev); /* [-Werror=maybe-uninitialized] */
 /*
- * If the modifications are sorted and don't overlap in the old or new
- * values, we can do a fast application of all the modifications
- * modifications in a single pass.
+ * If the modifications are sorted and don't overlap in the old or new values, we can do a fast
+ * application of all the modifications in a single pass.
 *
- * The requirement for ordering is unfortunate, but modifications are
- * performed in order, and applications specify byte offsets based on
- * that. In other words, byte offsets are cumulative, modifications
- * that shrink or grow the data affect subsequent modification's byte
- * offsets.
+ * The requirement for ordering is unfortunate, but modifications are performed in order, and
+ * applications specify byte offsets based on that. In other words, byte offsets are cumulative,
+ * modifications that shrink or grow the data affect subsequent modifications' byte offsets.
 */
 fastpath = first = true;
 *nappliedp = 0;
@@ -348,14 +343,12 @@ __wt_modify_apply(WT_CURSOR *cursor, const void *modify)
 nentries = (int)tmp;
 /*
- * Grow the buffer first. This function is often called using a cursor
- * buffer referencing on-page memory and it's easy to overwrite a page.
- * A side-effect of growing the buffer is to ensure the buffer's value
- * is in buffer-local memory.
+ * Grow the buffer first. This function is often called using a cursor buffer referencing
+ * on-page memory and it's easy to overwrite a page. A side-effect of growing the buffer is to
+ * ensure the buffer's value is in buffer-local memory.
 *
- * Because the buffer may reference an overflow item, the data may not
- * start at the start of the buffer's memory and we have to correct for
- * that.
+ * Because the buffer may reference an overflow item, the data may not start at the start of the
+ * buffer's memory and we have to correct for that.
 */
 item_offset = WT_DATA_IN_ITEM(value) ? WT_PTRDIFF(value->data, value->mem) : 0;
 WT_RET(__wt_buf_grow(session, value, item_offset + value->size));
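To make the cumulative-offset rule concrete, this is how an application builds the WT_MODIFY
array the code above applies. A minimal sketch, assuming an open cursor inside a
snapshot-isolation transaction; the key, offsets and sizes are illustrative:

    WT_MODIFY entries[2];

    /* Replace the 5 bytes at offset 0 with "Hello". */
    entries[0].data.data = "Hello";
    entries[0].data.size = 5;
    entries[0].offset = 0;
    entries[0].size = 5;
    /* Offsets are cumulative: offset 10 is relative to the value after entries[0] is applied. */
    entries[1].data.data = "abc";
    entries[1].data.size = 3;
    entries[1].offset = 10;
    entries[1].size = 0; /* replace zero bytes, a pure insert */

    cursor->set_key(cursor, "some-key");
    error_check(cursor->modify(cursor, entries, 2));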
diff --git a/src/third_party/wiredtiger/src/support/mtx_rw.c b/src/third_party/wiredtiger/src/support/mtx_rw.c index bb89e343b69..ff35fce0c81 100644
--- a/src/third_party/wiredtiger/src/support/mtx_rw.c
+++ b/src/third_party/wiredtiger/src/support/mtx_rw.c
@@ -402,10 +402,9 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 /*
 * Wait for our group to start and any readers to drain.
 *
- * We take care here to do an atomic read of the full 64-bit lock
- * value. Otherwise, reads are not guaranteed to be ordered and we
- * could see no readers active from a different batch and decide that
- * we have the lock.
+ * We take care here to do an atomic read of the full 64-bit lock value. Otherwise, reads are
+ * not guaranteed to be ordered and we could see no readers active from a different batch and
+ * decide that we have the lock.
 */
 for (pause_cnt = 0, old.u.v = l->u.v; ticket != old.u.s.current || old.u.s.readers_active != 0; pause_cnt++, old.u.v = l->u.v) {
@@ -460,9 +459,8 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 /*
 * Allow the next batch to start.
 *
- * If there are readers in the next group, swap queued readers
- * to active: this could race with new readlock requests, so we
- * have to spin.
+ * If there are readers in the next group, swap queued readers to active: this could race
+ * with new readlock requests, so we have to spin.
 */
 new.u.v = old.u.v;
 if (++new.u.s.current == new.u.s.reader) {
diff --git a/src/third_party/wiredtiger/src/support/rand.c b/src/third_party/wiredtiger/src/support/rand.c index 264ee711755..fcc76147f7e 100644
--- a/src/third_party/wiredtiger/src/support/rand.c
+++ b/src/third_party/wiredtiger/src/support/rand.c
@@ -29,16 +29,15 @@
 #include "wt_internal.h"
 /*
- * This is an implementation of George Marsaglia's multiply-with-carry pseudo-
- * random number generator. Computationally fast, with reasonable randomness
- * properties, and a claimed period of > 2^60.
+ * This is an implementation of George Marsaglia's multiply-with-carry pseudo-random number
+ * generator. Computationally fast, with reasonable randomness properties, and a claimed period
+ * of > 2^60.
 *
- * Be very careful about races here. Multiple threads can call __wt_random
- * concurrently, and it is okay if those concurrent calls get the same return
- * value. What is *not* okay is if reading/writing the shared state races and
- * uses two different values for m_w or m_z. That can result in a stored value
- * of zero, in which case they will be stuck on zero forever. Take a local copy
- * of the values to avoid that, and read/write in atomic, 8B chunks.
+ * Be very careful about races here. Multiple threads can call __wt_random concurrently, and it is
+ * okay if those concurrent calls get the same return value. What is *not* okay is if
+ * reading/writing the shared state races and uses two different values for m_w or m_z. That can
+ * result in a stored value of zero, in which case they will be stuck on zero forever. Take a local
+ * copy of the values to avoid that, and read/write in atomic, 8B chunks.
 */
 #undef M_W
 #define M_W(r) r.x.w
diff --git a/src/third_party/wiredtiger/src/support/scratch.c b/src/third_party/wiredtiger/src/support/scratch.c index 294f8f2fe0f..74195d18502 100644
--- a/src/third_party/wiredtiger/src/support/scratch.c
+++ b/src/third_party/wiredtiger/src/support/scratch.c
@@ -241,11 +241,10 @@ __wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
 *scratchp = NULL;
 /*
- * Each WT_SESSION_IMPL has an array of scratch buffers available for
- * use by any function. We use WT_ITEM structures for scratch memory
- * because we already have functions that do variable-length allocation
- * on a WT_ITEM. Scratch buffers are allocated only by a single thread
- * of control, so no locking is necessary.
+ * Each WT_SESSION_IMPL has an array of scratch buffers available for use by any function. We
+ * use WT_ITEM structures for scratch memory because we already have functions that do
+ * variable-length allocation on a WT_ITEM. Scratch buffers are allocated only by a single
+ * thread of control, so no locking is necessary.
 *
 * Walk the array, looking for a buffer we can use.
 */
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 09caef4345e..7aaba221842 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -415,9 +415,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
 /*
 * Make sure the ID doesn't move past any named snapshots.
 *
- * Don't include the read/assignment in the assert statement. Coverity
- * complains if there are assignments only done in diagnostic builds,
- * and when the read is from a volatile.
+ * Don't include the read/assignment in the assert statement.
Coverity complains if there + * are assignments only done in diagnostic builds, and when the read is from a volatile. */ uint64_t id = txn_global->nsnap_oldest_id; WT_ASSERT(session, id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); @@ -475,8 +474,8 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) session->operation_timeout_us = S2C(session)->operation_timeout_us; /* - * The default sync setting is inherited from the connection, but can - * be overridden by an explicit "sync" setting for this transaction. + * The default sync setting is inherited from the connection, but can be overridden by an + * explicit "sync" setting for this transaction. * * We want to distinguish between inheriting implicitly and explicitly. */ @@ -615,9 +614,8 @@ __wt_txn_release(WT_SESSION_IMPL *session) /* * Ensure the transaction flags are cleared on exit * - * Purposely do NOT clear the commit and durable timestamps on release. - * Other readers may still find these transactions in the durable queue - * and will need to see those timestamps. + * Purposely do NOT clear the commit and durable timestamps on release. Other readers may still + * find these transactions in the durable queue and will need to see those timestamps. */ txn->flags = 0; txn->prepare_timestamp = WT_TS_NONE; @@ -1131,18 +1129,14 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) op->u.op_upd = NULL; WT_STAT_CONN_INCR(session, txn_prepared_updates_count); /* - * Set the key repeated flag which tells us that we've - * got multiple updates to the same key by the same txn. - * This is later used in txn commit. + * Set the key repeated flag which tells us that we've got multiple updates to the same + * key by the same txn. This is later used in txn commit. * - * When we see a reserved update we set the - * WT_UPDATE_RESERVED flag instead. We do this as we - * cannot know if our current update should specify the - * key repeated flag as we don't want to traverse the - * entire update chain to find out. i.e. if there is - * an update with our txnid after the reserved update - * we should set key repeated, but if there isn't we - * shouldn't. + * When we see a reserved update we set the WT_UPDATE_RESERVED flag instead. We do this + * as we cannot know if our current update should specify the key repeated flag as we + * don't want to traverse the entire update chain to find out. i.e. if there is an + * update with our txnid after the reserved update we should set key repeated, but if + * there isn't we shouldn't. */ if (upd->next != NULL && upd->txnid == upd->next->txnid) { if (upd->next->type == WT_UPDATE_RESERVE) diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index ccfd378b3b7..e960ec03d48 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -160,15 +160,12 @@ __checkpoint_apply_all( if (!target_list && op != NULL) { /* - * If the checkpoint is named or we're dropping checkpoints, we - * checkpoint both open and closed files; else, only checkpoint - * open files. + * If the checkpoint is named or we're dropping checkpoints, we checkpoint both open and + * closed files; else, only checkpoint open files. * - * XXX - * We don't optimize unnamed checkpoints of a list of targets, - * we open the targets and checkpoint them even if they are - * quiescent and don't need a checkpoint, believing applications - * unlikely to checkpoint a list of closed targets. 
+ * XXX We don't optimize unnamed checkpoints of a list of targets, we open the targets and + * checkpoint them even if they are quiescent and don't need a checkpoint, believing + * applications unlikely to checkpoint a list of closed targets. */ ckpt_closed = named; if (!ckpt_closed) { @@ -217,21 +214,17 @@ __checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[]) WT_NAMED_DATA_SOURCE *ndsrc; /* - * A place-holder, to support data sources: we assume calling the - * underlying data-source session checkpoint function is sufficient to - * checkpoint all objects in the data source, open or closed, and we - * don't attempt to optimize the checkpoint of individual targets. - * Those assumptions are not necessarily going to be true for all - * data sources. + * A place-holder, to support data sources: we assume calling the underlying data-source session + * checkpoint function is sufficient to checkpoint all objects in the data source, open or + * closed, and we don't attempt to optimize the checkpoint of individual targets. Those + * assumptions are not necessarily going to be true for all data sources. * - * It's not difficult to support data-source checkpoints of individual - * targets (__wt_schema_worker is the underlying function that will do - * the work, and it's already written to support data-sources, although - * we'd probably need to pass the URI of the object to the data source - * checkpoint function which we don't currently do). However, doing a - * full data checkpoint is trickier: currently, the connection code is - * written to ignore all objects other than "file:", and that code will - * require significant changes to work with data sources. + * It's not difficult to support data-source checkpoints of individual targets + * (__wt_schema_worker is the underlying function that will do the work, and it's already + * written to support data-sources, although we'd probably need to pass the URI of the object to + * the data source checkpoint function which we don't currently do). However, doing a full data + * checkpoint is trickier: currently, the connection code is written to ignore all objects other + * than "file:", and that code will require significant changes to work with data sources. */ TAILQ_FOREACH (ndsrc, &S2C(session)->dsrcqh, q) { dsrc = ndsrc->dsrc; @@ -407,9 +400,8 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) /* * We haven't reached the current target. * - * Don't wait indefinitely: there might be dirty pages - * that can't be evicted. If we can't meet the target, - * give up and start the checkpoint for real. + * Don't wait indefinitely: there might be dirty pages that can't be evicted. If we can't + * meet the target, give up and start the checkpoint for real. */ bytes_written_total = cache->bytes_written - bytes_written_start; if (bytes_written_total > max_write) @@ -541,9 +533,8 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[ /* * Start a snapshot transaction for the checkpoint. * - * Note: we don't go through the public API calls because they have - * side effects on cursors, which applications can hold open across - * calls to checkpoint. + * Note: we don't go through the public API calls because they have side effects on cursors, + * which applications can hold open across calls to checkpoint. */ WT_RET(__wt_txn_begin(session, txn_cfg)); @@ -574,8 +565,8 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[ /* * Remove the checkpoint transaction from the global table. 
* - * This allows ordinary visibility checks to move forward because - * checkpoints often take a long time and only write to the metadata. + * This allows ordinary visibility checks to move forward because checkpoints often take a long + * time and only write to the metadata. */ __wt_writelock(session, &txn_global->rwlock); txn_global->checkpoint_state = *txn_state; @@ -597,8 +588,8 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[ /* * Set the checkpoint transaction's timestamp, if requested. * - * We rely on having the global transaction data locked so the oldest - * timestamp can't move past the stable timestamp. + * We rely on having the global transaction data locked so the oldest timestamp can't move past + * the stable timestamp. */ WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT | WT_TXN_HAS_TS_READ | WT_TXN_TS_PUBLISHED | WT_TXN_PUBLIC_TS_READ)); @@ -627,8 +618,8 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[ session, txn->read_timestamp, "Checkpoint requested at stable timestamp"); /* - * The snapshot we established when the transaction started may - * be too early to match the timestamp we just read. + * The snapshot we established when the transaction started may be too early to match the + * timestamp we just read. * * Get a new one. */ @@ -636,11 +627,11 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[ } /* - * Get a list of handles we want to flush; for named checkpoints this - * may pull closed objects into the session cache. + * Get a list of handles we want to flush; for named checkpoints this may pull closed objects + * into the session cache. * - * First, gather all handles, then start the checkpoint transaction, - * then release any clean handles. + * First, gather all handles, then start the checkpoint transaction, then release any clean + * handles. */ WT_ASSERT(session, session->ckpt_handle_next == 0); WT_WITH_TABLE_READ_LOCK( @@ -673,12 +664,11 @@ __txn_checkpoint_can_skip( txn_global = &conn->txn_global; /* - * This function also parses out some configuration options and hands - * them back to the caller - make sure it does that parsing regardless - * of the result. + * This function also parses out some configuration options and hands them back to the caller - + * make sure it does that parsing regardless of the result. * - * Determine if this is going to be a full checkpoint, that is a - * checkpoint that applies to all data tables in a database. + * Determine if this is going to be a full checkpoint, that is a checkpoint that applies to all + * data tables in a database. */ WT_RET(__wt_config_gets(session, cfg, "target", &cval)); __wt_config_subinit(session, &targetconf, &cval); @@ -788,8 +778,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) /* * Update the global oldest ID so we do all possible cleanup. * - * This is particularly important for compact, so that all dirty pages - * can be fully written. + * This is particularly important for compact, so that all dirty pages can be fully written. */ WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); @@ -814,32 +803,29 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) /* * Start the checkpoint for real. * - * Bump the global checkpoint generation, used to figure out whether - * checkpoint has visited a tree. 
Use an atomic increment even though - * we are single-threaded because readers of the checkpoint generation - * don't hold the checkpoint lock. + * Bump the global checkpoint generation, used to figure out whether checkpoint has visited a + * tree. Use an atomic increment even though we are single-threaded because readers of the + * checkpoint generation don't hold the checkpoint lock. * - * We do need to update it before clearing the checkpoint's entry out - * of the transaction table, or a thread evicting in a tree could - * ignore the checkpoint's transaction. + * We do need to update it before clearing the checkpoint's entry out of the transaction table, + * or a thread evicting in a tree could ignore the checkpoint's transaction. */ generation = __wt_gen_next(session, WT_GEN_CHECKPOINT); WT_STAT_CONN_SET(session, txn_checkpoint_generation, generation); /* - * We want to skip checkpointing clean handles whenever possible. That - * is, when the checkpoint is not named or forced. However, we need to - * take care about ordering with respect to the checkpoint transaction. + * We want to skip checkpointing clean handles whenever possible. That is, when the checkpoint + * is not named or forced. However, we need to take care about ordering with respect to the + * checkpoint transaction. * - * We can't skip clean handles before starting the transaction or the - * checkpoint can miss updates in trees that become dirty as the - * checkpoint is starting. If we wait until the transaction has - * started before locking a handle, there could be a metadata-changing - * operation in between (e.g., salvage) that will cause a write - * conflict when the checkpoint goes to write the metadata. + * We can't skip clean handles before starting the transaction or the checkpoint can miss + * updates in trees that become dirty as the checkpoint is starting. If we wait until the + * transaction has started before locking a handle, there could be a metadata-changing operation + * in between (e.g., salvage) that will cause a write conflict when the checkpoint goes to write + * the metadata. * - * Hold the schema lock while starting the transaction and gathering - * handles so the set we get is complete and correct. + * Hold the schema lock while starting the transaction and gathering handles so the set we get + * is complete and correct. */ WT_WITH_SCHEMA_LOCK(session, ret = __checkpoint_prepare(session, &tracking, cfg)); WT_ERR(ret); @@ -910,15 +896,12 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_txn_commit(session, NULL)); /* - * Ensure that the metadata changes are durable before the checkpoint - * is resolved. Do this by either checkpointing the metadata or syncing - * the log file. - * Recovery relies on the checkpoint LSN in the metadata only being - * updated by full checkpoints so only checkpoint the metadata for - * full or non-logged checkpoints. + * Ensure that the metadata changes are durable before the checkpoint is resolved. Do this by + * either checkpointing the metadata or syncing the log file. Recovery relies on the checkpoint + * LSN in the metadata only being updated by full checkpoints so only checkpoint the metadata + * for full or non-logged checkpoints. * - * This is very similar to __wt_meta_track_off, ideally they would be - * merged. + * This is very similar to __wt_meta_track_off, ideally they would be merged. 
*/ if (full || !logging) { session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; @@ -978,17 +961,14 @@ err: conn->ckpt_timer_start.tv_sec = 0; /* - * XXX - * Rolling back the changes here is problematic. + * XXX Rolling back the changes here is problematic. * - * If we unroll here, we need a way to roll back changes to the avail - * list for each tree that was successfully synced before the error - * occurred. Otherwise, the next time we try this operation, we will - * try to free an old checkpoint again. + * If we unroll here, we need a way to roll back changes to the avail list for each tree that + * was successfully synced before the error occurred. Otherwise, the next time we try this + * operation, we will try to free an old checkpoint again. * - * OTOH, if we commit the changes after a failure, we have partially - * overwritten the checkpoint, so what ends up on disk is not - * consistent. + * OTOH, if we commit the changes after a failure, we have partially overwritten the checkpoint, + * so what ends up on disk is not consistent. */ failed = ret != 0; if (failed) @@ -1089,18 +1069,15 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting) /* * Don't highjack the session checkpoint thread for eviction. * - * Application threads are not generally available for potentially slow - * operations, but checkpoint does enough I/O it may be called upon to - * perform slow operations for the block manager. + * Application threads are not generally available for potentially slow operations, but checkpoint + * does enough I/O it may be called upon to perform slow operations for the block manager. * - * Application checkpoints wait until the checkpoint lock is available, - * compaction checkpoints don't. + * Application checkpoints wait until the checkpoint lock is available, compaction checkpoints + * don't. * - * Checkpoints should always use a separate session for lookaside - * updates, otherwise those updates are pinned until the checkpoint - * commits. Also, there are unfortunate interactions between the - * special rules for lookaside eviction and the special handling of the - * checkpoint transaction. + * Checkpoints should always use a separate session for lookaside updates, otherwise those updates + * are pinned until the checkpoint commits. Also, there are unfortunate interactions between the + * special rules for lookaside eviction and the special handling of the checkpoint transaction. */ #undef WT_CHECKPOINT_SESSION_FLAGS #define WT_CHECKPOINT_SESSION_FLAGS (WT_SESSION_CAN_WAIT | WT_SESSION_IGNORE_CACHE_SIZE) @@ -1259,11 +1236,9 @@ __checkpoint_lock_dirty_tree_int(WT_SESSION_IMPL *session, bool is_checkpoint, b /* * Lock the checkpoints that will be deleted. * - * Checkpoints are only locked when tracking is enabled, which covers - * checkpoint and drop operations, but not close. The reasoning is - * there should be no access to a checkpoint during close, because any - * thread accessing a checkpoint will also have the current file handle - * open. + * Checkpoints are only locked when tracking is enabled, which covers checkpoint and drop + * operations, but not close. The reasoning is there should be no access to a checkpoint during + * close, because any thread accessing a checkpoint will also have the current file handle open. 
*/ if (WT_META_TRACKING(session)) WT_CKPT_FOREACH (ckptbase, ckpt) { @@ -1426,26 +1401,22 @@ __checkpoint_mark_skip(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force) /* * Check for clean objects not requiring a checkpoint. * - * If we're closing a handle, and the object is clean, we can skip the - * checkpoint, whatever checkpoints we have are sufficient. (We might - * not have any checkpoints if the object was never modified, and that's - * OK: the object creation code doesn't mark the tree modified so we can - * skip newly created trees here.) + * If we're closing a handle, and the object is clean, we can skip the checkpoint, whatever + * checkpoints we have are sufficient. (We might not have any checkpoints if the object was + * never modified, and that's OK: the object creation code doesn't mark the tree modified so we + * can skip newly created trees here.) * - * If the application repeatedly checkpoints an object (imagine hourly - * checkpoints using the same explicit or internal name), there's no - * reason to repeat the checkpoint for clean objects. The test is if - * the only checkpoint we're deleting is the last one in the list and - * it has the same name as the checkpoint we're about to take, skip the - * work. (We can't skip checkpoints that delete more than the last - * checkpoint because deleting those checkpoints might free up space in - * the file.) This means an application toggling between two (or more) - * checkpoint names will repeatedly take empty checkpoints, but that's - * not likely enough to make detection worthwhile. + * If the application repeatedly checkpoints an object (imagine hourly checkpoints using the + * same explicit or internal name), there's no reason to repeat the checkpoint for clean + * objects. The test is if the only checkpoint we're deleting is the last one in the list and it + * has the same name as the checkpoint we're about to take, skip the work. (We can't skip + * checkpoints that delete more than the last checkpoint because deleting those checkpoints + * might free up space in the file.) This means an application toggling between two (or more) + * checkpoint names will repeatedly take empty checkpoints, but that's not likely enough to make + * detection worthwhile. * - * Checkpoint read-only objects otherwise: the application must be able - * to open the checkpoint in a cursor after taking any checkpoint, which - * means it must exist. + * Checkpoint read-only objects otherwise: the application must be able to open the checkpoint + * in a cursor after taking any checkpoint, which means it must exist. */ F_CLR(btree, WT_BTREE_SKIP_CKPT); if (!btree->modified && !force) { @@ -1602,16 +1573,14 @@ fake: /* * Update the object's metadata. * - * If the object is the metadata, the call to __wt_meta_ckptlist_set - * will update the turtle file and swap the new one into place. We - * need to make sure the metadata is on disk before the turtle file is - * updated. + * If the object is the metadata, the call to __wt_meta_ckptlist_set will update the turtle file + * and swap the new one into place. We need to make sure the metadata is on disk before the + * turtle file is updated. * - * If we are doing a checkpoint in a file without a transaction (e.g., - * closing a dirty tree before an exclusive operation like verify), - * the metadata update will be auto-committed. In that case, we need to - * sync the file here or we could roll forward the metadata in - * recovery and open a checkpoint that isn't yet durable. 
+ * If we are doing a checkpoint in a file without a transaction (e.g., closing a dirty tree + * before an exclusive operation like verify), the metadata update will be auto-committed. In + * that case, we need to sync the file here or we could roll forward the metadata in recovery + * and open a checkpoint that isn't yet durable. */ if (WT_IS_METADATA(dhandle) || !F_ISSET(&session->txn, WT_TXN_RUNNING)) WT_ERR(__wt_checkpoint_sync(session, NULL)); diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index f74f0d45562..124465cc529 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -28,12 +28,11 @@ __txn_op_log_row_key_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) memset(&key, 0, sizeof(key)); /* - * We used to take the row-store logging key from the page referenced by - * the cursor, then switched to taking it from the cursor itself. Check - * they are the same. + * We used to take the row-store logging key from the page referenced by the cursor, then + * switched to taking it from the cursor itself. Check they are the same. * - * If the cursor references a WT_INSERT item, take the key from there, - * else take the key from the original page. + * If the cursor references a WT_INSERT item, take the key from there, else take the key from + * the original page. */ if (cbt->ins == NULL) { session = (WT_SESSION_IMPL *)cbt->iface.session; diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index 6ccb7625108..24653712e13 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -55,12 +55,12 @@ __recovery_cursor( if (WT_LOGOP_IS_IGNORED(id)) return (0); /* - * Metadata operations have an id of 0. Match operations based - * on the id and the current pass of recovery for metadata. + * Metadata operations have an id of 0. Match operations based on the id and the current pass of + * recovery for metadata. * - * Only apply operations in the correct metadata phase, and if the LSN - * is more recent than the last checkpoint. If there is no entry for a - * file, assume it was dropped or missing after a hot backup. + * Only apply operations in the correct metadata phase, and if the LSN is more recent than the + * last checkpoint. If there is no entry for a file, assume it was dropped or missing after a + * hot backup. */ metadata_op = id == WT_METAFILE_ID; if (r->metadata_only != metadata_op) @@ -575,15 +575,13 @@ __wt_txn_recover(WT_SESSION_IMPL *session) } /* - * First, do a pass through the log to recover the metadata, and - * establish the last checkpoint LSN. Skip this when opening a hot - * backup: we already have the correct metadata in that case. + * First, do a pass through the log to recover the metadata, and establish the last checkpoint + * LSN. Skip this when opening a hot backup: we already have the correct metadata in that case. * - * If we're running with salvage and we hit an error, we ignore it - * and continue. In salvage we want to recover whatever part of the - * data we can from the last checkpoint up until whatever problem we - * detect in the log file. In salvage, we ignore errors from scanning - * the log so recovery can continue. Other errors remain errors. + * If we're running with salvage and we hit an error, we ignore it and continue. 
In salvage we + * want to recover whatever part of the data we can from the last checkpoint up until whatever + * problem we detect in the log file. In salvage, we ignore errors from scanning the log so + * recovery can continue. Other errors remain errors. */ if (!was_backup) { r.metadata_only = true; diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 7b6463f6f2e..d3d9c2b4dfb 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -105,13 +105,11 @@ __txn_abort_newer_update( first_upd = upd->next; } else if (rollback_timestamp < upd->durable_ts) { /* - * If any updates are aborted, all newer updates - * better be aborted as well. + * If any updates are aborted, all newer updates better be aborted as well. * - * Timestamp ordering relies on the validations at - * the time of commit. Thus if the table is not - * configured for key consistency check, the - * the timestamps could be out of order here. + * Timestamp ordering relies on the validations at the time of commit. Thus if the table + * is not configured for key consistency check, the timestamps could be out of order + * here. */ WT_ASSERT(session, !FLD_ISSET(S2BT(session)->assert_flags, WT_ASSERT_COMMIT_TS_KEYS) || upd == first_upd); @@ -227,22 +225,19 @@ __txn_abort_newer_updates(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t bool local_read; /* - * If we created a page image with updates that need to be rolled back, - * read the history into cache now and make sure the page is marked - * dirty. Otherwise, the history we need could be swept from the - * lookaside table before the page is read because the lookaside sweep - * code has no way to tell that the page image is invalid. + * If we created a page image with updates that need to be rolled back, read the history into + * cache now and make sure the page is marked dirty. Otherwise, the history we need could be + * swept from the lookaside table before the page is read because the lookaside sweep code has + * no way to tell that the page image is invalid. * - * So, if there is lookaside history for a page, first check if the - * history needs to be rolled back then ensure the history is loaded - * into cache. + * So, if there is lookaside history for a page, first check if the history needs to be rolled + * back then ensure the history is loaded into cache. * - * Also, we have separately discarded any lookaside history more recent - * than the rollback timestamp. For page_las structures in cache, - * reset any future timestamps back to the rollback timestamp. This - * allows those structures to be discarded once the rollback timestamp - * is stable (crucially for tests, they can be discarded if the - * connection is closed right after a rollback_to_stable call). + * Also, we have separately discarded any lookaside history more recent than the rollback + * timestamp. For page_las structures in cache, reset any future timestamps back to the rollback + * timestamp. This allows those structures to be discarded once the rollback timestamp is stable + * (crucially for tests, they can be discarded if the connection is closed right after a + * rollback_to_stable call). 
 */
 local_read = false;
 read_flags = WT_READ_WONT_NEED;
@@ -267,18 +262,14 @@ __txn_abort_newer_updates(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t
 WT_ERR(__wt_delete_page_rollback(session, ref));
 /*
- * If we have a ref with no page, or the page is clean, there is
- * nothing to roll back.
+ * If we have a ref with no page, or the page is clean, there is nothing to roll back.
 *
- * This check for a clean page is partly an optimization (checkpoint
- * only marks pages clean when they have no unwritten updates so
- * there's no point visiting them again), but also covers a corner case
- * of a checkpoint with use_timestamp=false. Such a checkpoint
- * effectively moves the stable timestamp forward, because changes that
- * are written in the checkpoint cannot be reliably rolled back. The
- * actual stable timestamp doesn't change, though, so if we try to roll
- * back clean pages the in-memory tree can get out of sync with the
- * on-disk tree.
+ * This check for a clean page is partly an optimization (checkpoint only marks pages clean when
+ * they have no unwritten updates so there's no point visiting them again), but also covers a
+ * corner case of a checkpoint with use_timestamp=false. Such a checkpoint effectively moves the
+ * stable timestamp forward, because changes that are written in the checkpoint cannot be
+ * reliably rolled back. The actual stable timestamp doesn't change, though, so if we try to
+ * roll back clean pages the in-memory tree can get out of sync with the on-disk tree.
 */
 if ((page = ref->page) == NULL || !__wt_page_is_modified(page))
 goto err;
@@ -468,13 +459,12 @@ __txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[])
 WT_STAT_CONN_INCR(session, txn_rollback_to_stable);
 /*
- * Mark that a rollback operation is in progress and wait for eviction
- * to drain. This is necessary because lookaside eviction uses
- * transactions and causes the check for a quiescent system to fail.
+ * Mark that a rollback operation is in progress and wait for eviction to drain. This is
+ * necessary because lookaside eviction uses transactions and causes the check for a quiescent
+ * system to fail.
 *
- * Configuring lookaside eviction off isn't atomic, safe because the
- * flag is only otherwise set when closing down the database. Assert
- * to avoid confusion in the future.
+ * Configuring lookaside eviction off isn't atomic, but that's safe because the flag is only
+ * otherwise set when closing down the database. Assert to avoid confusion in the future.
 */
 WT_ASSERT(session, !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE));
 F_SET(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
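For orientation, the public entry point for the rollback described here pairs a stable timestamp
with the rollback call. A minimal sketch, assuming an open and quiescent connection; the
timestamp value is illustrative (timestamps are hexadecimal strings):

    error_check(conn->set_timestamp(conn, "stable_timestamp=2a"));
    error_check(conn->rollback_to_stable(conn, NULL));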
diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index 2d9291ebbce..a9ee80c953f 100644
--- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c
+++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
@@ -210,20 +210,18 @@ __txn_get_published_timestamp(WT_SESSION_IMPL *session, WT_TXN *txn)
 wt_timestamp_t ts;
 /*
- * Any checking of bit flags in this logic is invalid. __wt_txn_release
- * may have already been called on this transaction which will set the
- * flags member to 0. So we need to deduce which timestamp to use purely
- * by inspecting the timestamp members which we deliberately preserve
+ * Any checking of bit flags in this logic is invalid. __wt_txn_release may have already been
+ * called on this transaction which will set the flags member to 0. So we need to deduce which
+ * timestamp to use purely by inspecting the timestamp members which we deliberately preserve
 * for reader threads such as ourselves.
 *
- * In the non-prepared case, the first commit will either be less than
- * the commit (in the case of multiple commits) in which case we should
- * return the first commit. Or it will be equal to the commit (in the
- * case of a single commit) and we can return durable (which is mirrored
- * from the commit timestamp).
+ * In the non-prepared case, the first commit will either be less than the commit (in the case
+ * of multiple commits) in which case we should return the first commit. Or it will be equal to
+ * the commit (in the case of a single commit) and we can return durable (which is mirrored from
+ * the commit timestamp).
 *
- * In the prepared case, the first commit will always be equal to the
- * commit so we'll return durable.
+ * In the prepared case, the first commit will always be equal to the commit so we'll return
+ * durable.
 */
 if (txn->commit_timestamp != txn->first_commit_timestamp)
 ts = txn->first_commit_timestamp;
@@ -546,14 +544,12 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
set:
 __wt_writelock(session, &txn_global->rwlock);
 /*
- * This method can be called from multiple threads, check that we are
- * moving the global timestamps forwards.
+ * This method can be called from multiple threads, check that we are moving the global
+ * timestamps forwards.
 *
- * The exception is the durable timestamp, where the application can
- * move it backwards (in fact, it only really makes sense to explicitly
- * move it backwards because it otherwise tracks the largest
- * durable_timestamp so it moves forward whenever transactions are
- * assigned timestamps).
+ * The exception is the durable timestamp, where the application can move it backwards (in fact,
+ * it only really makes sense to explicitly move it backwards because it otherwise tracks the
+ * largest durable_timestamp so it moves forward whenever transactions are assigned timestamps).
 */
 if (has_durable) {
 txn_global->durable_timestamp = durable_ts;
diff --git a/src/third_party/wiredtiger/src/utilities/util_loadtext.c b/src/third_party/wiredtiger/src/utilities/util_loadtext.c index 1d4414b47b5..879eb0270f6 100644
--- a/src/third_party/wiredtiger/src/utilities/util_loadtext.c
+++ b/src/third_party/wiredtiger/src/utilities/util_loadtext.c
@@ -68,8 +68,7 @@ text(WT_SESSION *session, const char *uri)
 /*
 * We're about to load strings, make sure the formats match.
 *
- * Row-store tables have key/value pairs, column-store tables only have
- * values.
+ * Row-store tables have key/value pairs, column-store tables only have values.
 */
 if (!WT_STREQ(cursor->value_format, "S") || (!WT_STREQ(cursor->key_format, "S") && !WT_STREQ(cursor->key_format, "r")))
diff --git a/src/third_party/wiredtiger/test/csuite/schema_abort/main.c b/src/third_party/wiredtiger/test/csuite/schema_abort/main.c index bd127d8a686..ffa46247106 100644
--- a/src/third_party/wiredtiger/test/csuite/schema_abort/main.c
+++ b/src/third_party/wiredtiger/test/csuite/schema_abort/main.c
@@ -153,13 +153,12 @@ static WT_EVENT_HANDLER event_handler = {
 };
 /*
- * The following are various schema-related functions to have some threads
- * performing during the test. The goal is to make sure that after a random
- * abort, the database is left in a recoverable state. Yield during the
- * schema operations to increase chance of abort during them.
+ * The following are various schema-related functions to have some threads performing during the + * test. The goal is to make sure that after a random abort, the database is left in a recoverable + * state. Yield during the schema operations to increase chance of abort during them. * - * TODO: Currently only verifies insert data, it would be ideal to modify the - * schema operations so that we can verify the state of the schema too. + * TODO: Currently only verifies insert data, it would be ideal to modify the schema operations so + * that we can verify the state of the schema too. */ static void diff --git a/src/third_party/wiredtiger/test/csuite/scope/main.c b/src/third_party/wiredtiger/test/csuite/scope/main.c index dc7b312e5c8..57947fcf166 100644 --- a/src/third_party/wiredtiger/test/csuite/scope/main.c +++ b/src/third_party/wiredtiger/test/csuite/scope/main.c @@ -138,9 +138,8 @@ cursor_scope_ops(WT_SESSION *session, const char *uri) } /* - * The application must keep key and value memory valid until - * the next operation that positions the cursor, modifies the - * data, or resets or closes the cursor. + * The application must keep key and value memory valid until the next operation that + * positions the cursor, modifies the data, or resets or closes the cursor. * * Modifying either the key or value buffers is not permitted. */ @@ -199,8 +198,8 @@ cursor_scope_ops(WT_SESSION *session, const char *uri) case INSERT: case REMOVE: /* - * Insert and remove configured with a search key do - * not position the cursor and have no key or value. + * Insert and remove configured with a search key do not position the cursor and have no + * key or value. * * There should be two error messages, ignore them. */ @@ -217,8 +216,7 @@ cursor_scope_ops(WT_SESSION *session, const char *uri) break; case REMOVE_POS: /* - * Remove configured with a cursor position has a key, - * but no value. + * Remove configured with a cursor position has a key, but no value. * * There should be one error message, ignore it. */ @@ -243,11 +241,10 @@ cursor_scope_ops(WT_SESSION *session, const char *uri) case SEARCH_NEAR: case UPDATE: /* - * Modify, reserve, search, search-near and update all - * position the cursor and have both a key and value. + * Modify, reserve, search, search-near and update all position the cursor and have both + * a key and value. * - * Any key/value should not reference application - * memory. + * Any key/value should not reference application memory. */ if (recno) { testutil_assert(cursor->get_key(cursor, &keyr) == 0); diff --git a/src/third_party/wiredtiger/test/csuite/truncated_log/main.c b/src/third_party/wiredtiger/test/csuite/truncated_log/main.c index befc30eab61..4f31496994a 100644 --- a/src/third_party/wiredtiger/test/csuite/truncated_log/main.c +++ b/src/third_party/wiredtiger/test/csuite/truncated_log/main.c @@ -154,13 +154,11 @@ fill_db(void) save_lsn.l.file = 0; /* - * Write data into the table until we move to log file 2. - * We do the calculation below so that we don't have to walk the - * log for every record. + * Write data into the table until we move to log file 2. We do the calculation below so that we + * don't have to walk the log for every record. * - * Calculate about how many records should fit in the log file. - * Subtract a bunch for metadata and file creation records. - * Then subtract out a few more records to be conservative. + * Calculate about how many records should fit in the log file. Subtract a bunch for metadata + * and file creation records. 
Then subtract out a few more records to be conservative. */ units = (K_SIZE + V_SIZE) / 128 + 1; min_key = 90000 / (units * 128) - 15; diff --git a/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c b/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c index 3a39ffa4c57..147e907430f 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c @@ -30,16 +30,14 @@ /* * JIRA ticket reference: WT-2853 * - * Test case description: create two threads: one is populating/updating - * records in a table with a few indices, the other is reading from table and - * indices. The test is adapted from one that uses cursor joins, this test - * does not, but simulates some of the access patterns. + * Test case description: create two threads: one is populating/updating records in a table with a + * few indices, the other is reading from table and indices. The test is adapted from one that uses + * cursor joins, this test does not, but simulates some of the access patterns. * - * Failure mode: after a second or two of progress by both threads, they both - * appear to slow dramatically, almost locking up. After some time (I've - * observed from a half minute to a few minutes), the lock up ends and both - * threads seem to be inserting and reading at a normal fast pace. That - * continues until the test ends (~30 seconds). + * Failure mode: after a second or two of progress by both threads, they both appear to slow + * dramatically, almost locking up. After some time (I've observed from a half minute to a few + * minutes), the lock up ends and both threads seem to be inserting and reading at a normal fast + * pace. That continues until the test ends (~30 seconds). */ static void *thread_insert(void *); diff --git a/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c b/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c index ff59ee95267..4e4e7f860a5 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2909_checkpoint_integrity/main.c @@ -31,41 +31,34 @@ #include <sys/wait.h> /* - * JIRA ticket reference: WT-2909 - * Test case description: + * JIRA ticket reference: WT-2909 Test case description: * - * This test attempts to check the integrity of checkpoints by injecting - * failures (by means of a custom file system) and then trying to recover. To - * insulate the top level program from various crashes that may occur when - * injecting failures, the "populate" code runs in another process, and is - * expected to sometimes fail. Then the top level program runs recovery (with - * the normal file system) and checks the results. Any failure at the top level - * indicates a checkpoint integrity problem. + * This test attempts to check the integrity of checkpoints by injecting failures (by means of a + * custom file system) and then trying to recover. To insulate the top level program from various + * crashes that may occur when injecting failures, the "populate" code runs in another process, and + * is expected to sometimes fail. Then the top level program runs recovery (with the normal file + * system) and checks the results. Any failure at the top level indicates a checkpoint integrity + * problem. * - * Each subtest uses the same kind of schema and data, the only variance is - * when the faults are injected. At the moment, this test only injects during - * checkpoints, and only injects write failures. 
It varies in the number of - * successful writes that occur before an injected failure (during a checkpoint - * operation), this can be indicated with "-o N". When N is not specified, the - * test attempts to find the optimal range of N for testing. Clearly when N is - * large, then the checkpoint may be successfully written, and the data - * represented by the checkpoint will be fully present. When N is small, - * nothing of interest is written and no data is present. To find the sweet - * spot where interesting failures occur, the test does a binary search to find - * the approximate N that divides the "small" and "large" cases. This is not - * strictly deterministic, a given N may give different results on different - * runs. But approximate optimal N can be determined, allowing a series of - * additional tests clustered around this N. + * Each subtest uses the same kind of schema and data, the only variance is when the faults are + * injected. At the moment, this test only injects during checkpoints, and only injects write + * failures. It varies in the number of successful writes that occur before an injected failure + * (during a checkpoint operation), this can be indicated with "-o N". When N is not specified, the + * test attempts to find the optimal range of N for testing. Clearly when N is large, then the + * checkpoint may be successfully written, and the data represented by the checkpoint will be fully + * present. When N is small, nothing of interest is written and no data is present. To find the + * sweet spot where interesting failures occur, the test does a binary search to find the + * approximate N that divides the "small" and "large" cases. This is not strictly deterministic, a + * given N may give different results on different runs. But approximate optimal N can be + * determined, allowing a series of additional tests clustered around this N. * - * The data is stored in two tables, one having indices. Both tables have - * the same keys and are updated with the same key in a single transaction. + * The data is stored in two tables, one having indices. Both tables have the same keys and are + * updated with the same key in a single transaction. * - * Failure mode: - * If one table is out of step with the other, that is detected as a failure at - * the top level. If an index is missing values (or has extra values), that is - * likewise a failure at the top level. If the tables or the home directory - * cannot be opened, that is a top level error. The tables must be present - * as an initial checkpoint is done without any injected fault. + * Failure mode: If one table is out of step with the other, that is detected as a failure at the + * top level. If an index is missing values (or has extra values), that is likewise a failure at the + * top level. If the tables or the home directory cannot be opened, that is a top level error. The + * tables must be present as an initial checkpoint is done without any injected fault. 
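
To make the binary search described above concrete, here is a minimal sketch. It assumes a hypothetical helper run_populate(n) that forks the populate child with a budget of n successful writes before the injected failure and reports whether any checkpointed data survived recovery; the test's real logic lives in this file and is more involved.

#include <stdbool.h>

static bool run_populate(int nwrites); /* hypothetical: true if checkpointed data survived */

/*
 * find_boundary_n --
 *     Sketch: binary-search the approximate write count N dividing runs where nothing
 *     interesting is written from runs where the checkpoint completes. The result is only
 *     approximate, a given N can behave differently from run to run.
 */
static int
find_boundary_n(int lo, int hi)
{
    int mid;

    while (lo < hi) {
        mid = lo + (hi - lo) / 2;
        if (run_populate(mid)) /* data survived: N is "large" */
            hi = mid;
        else /* nothing survived: N is "small" */
            lo = mid + 1;
    }
    return (lo);
}
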
 */

/*
diff --git a/src/third_party/wiredtiger/test/csuite/wt2999_join_extractor/main.c b/src/third_party/wiredtiger/test/csuite/wt2999_join_extractor/main.c
index 3bf02ed3f3c..a0443afa023 100644
--- a/src/third_party/wiredtiger/test/csuite/wt2999_join_extractor/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt2999_join_extractor/main.c
@@ -30,14 +30,12 @@
 /*
  * JIRA ticket reference: WT-2999
  *
- * Test case description: Create a table that stores ~4K size blobs;
- * two indices are defined using a pair of custom extractors
- * that pull the first and second 32-bit integers from the blob.
- * A simple join is created using the two indices, and iterated.
+ * Test case description: Create a table that stores ~4K size blobs; two indices are defined using a
+ * pair of custom extractors that pull the first and second 32-bit integers from the blob. A simple
+ * join is created using the two indices, and iterated.
  *
- * Failure mode: When a custom extractor is used with cursor
- * joins, there are memory leaks at the point where the extractor
- * sets the key.
+ * Failure mode: When a custom extractor is used with cursor joins, there are memory leaks at the
+ * point where the extractor sets the key.
  */
 static int custom_extract1(WT_EXTRACTOR *extractor, WT_SESSION *session, const WT_ITEM *key,
diff --git a/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c b/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c
index 97b2a1a03a2..d70a9e0475e 100644
--- a/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c
@@ -30,16 +30,15 @@
 /*
  * JIRA ticket reference: WT-3363
  *
- * Test case description: There are a number of operations that we run that we
- * expect not to conflict with or block against a running checkpoint. This test
- * aims to run repeated checkpoints in a thread, while running an assortment
- * of operations that we expect to execute quickly on further threads. To
- * ensure that we catch any blockages we introduce a very large delay into the
+ * Test case description: There are a number of operations that we run that we expect not to
+ * conflict with or block against a running checkpoint. This test aims to run repeated checkpoints
+ * in a thread, while running an assortment of operations that we expect to execute quickly on
+ * further threads. To ensure that we catch any blockages we introduce a very large delay into the
 * checkpoint and measure that no operation takes 1/2 the length of this delay.
 *
- * Failure mode: We monitor the execution time of all operations and if we find
- * any operation taking longer than 1/2 the delay time, we abort dumping a core
- * file which can be used to determine what operation was blocked.
+ * Failure mode: We monitor the execution time of all operations and, if we find any operation
+ * taking longer than 1/2 the delay time, we abort, dumping a core file that can be used to
+ * determine what operation was blocked.
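
The failure check described above reduces to timing each operation and aborting, so a core is dumped, whenever an operation exceeds half of the injected delay. A minimal sketch, assuming a hypothetical CKPT_DELAY_SECS for the artificial checkpoint delay and an op callback:

#include <stdlib.h>
#include <time.h>

#define CKPT_DELAY_SECS 20 /* hypothetical: the artificial delay injected into checkpoints */

/*
 * timed_op --
 *     Sketch: run one operation and abort (dumping core) if it blocked for more than half the
 *     checkpoint delay.
 */
static void
timed_op(void (*op)(void))
{
    struct timespec begin, end;
    double elapsed;

    (void)clock_gettime(CLOCK_MONOTONIC, &begin);
    op();
    (void)clock_gettime(CLOCK_MONOTONIC, &end);

    elapsed = (double)(end.tv_sec - begin.tv_sec) + (double)(end.tv_nsec - begin.tv_nsec) / 1e9;
    if (elapsed > CKPT_DELAY_SECS / 2.0)
        abort(); /* the core file shows where the operation was blocked */
}
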
*/ static WT_THREAD_RET do_checkpoints(void *); static WT_THREAD_RET do_ops(void *); diff --git a/src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c b/src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c index 264dbbb5679..442d3afb306 100644 --- a/src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c @@ -59,11 +59,11 @@ handle_message(WT_EVENT_HANDLER *handler, WT_SESSION *session, int error, const stderr, "Got cache overflow error (expect_panic=%s)\n", expect_panic ? "true" : "false"); /* - * If we're expecting a panic, exit with zero to indicate to the - * parent that this test was successful. + * If we're expecting a panic, exit with zero to indicate to the parent that this test was + * successful. * - * If not, don't intercept. We'll naturally exit with non-zero - * if we're terminating due to panic. + * If not, don't intercept. We'll naturally exit with non-zero if we're terminating due to + * panic. */ if (expect_panic) exit(EXIT_SUCCESS); @@ -101,15 +101,13 @@ las_workload(TEST_OPTS *opts, const char *las_file_max) } /* - * Open a snapshot isolation transaction in another session. This forces - * the cache to retain all previous values. Then update all keys with a - * new value in the original session while keeping that snapshot - * transaction open. With the large value buffer, small cache and lots - * of keys, this will force a lot of lookaside usage. + * Open a snapshot isolation transaction in another session. This forces the cache to retain all + * previous values. Then update all keys with a new value in the original session while keeping + * that snapshot transaction open. With the large value buffer, small cache and lots of keys, + * this will force a lot of lookaside usage. * - * When the file_max setting is small, the maximum size should easily be - * reached and we should panic. When the maximum size is large or not - * set, then we should succeed. + * When the file_max setting is small, the maximum size should easily be reached and we should + * panic. When the maximum size is large or not set, then we should succeed. */ testutil_check(opts->conn->open_session(opts->conn, NULL, NULL, &other_session)); testutil_check(other_session->begin_transaction(other_session, "isolation=snapshot")); @@ -147,11 +145,11 @@ test_las_workload(TEST_OPTS *opts, const char *las_file_max) testutil_make_work_dir(opts->home); /* - * Since it's possible that the workload will panic and abort, we will - * fork the process and execute the workload in the child process. + * Since it's possible that the workload will panic and abort, we will fork the process and + * execute the workload in the child process. * - * This way, we can safely check the exit code of the child process and - * confirm that it is what we expected. + * This way, we can safely check the exit code of the child process and confirm that it is what + * we expected. 
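
The fork-and-verify pattern the comment describes looks roughly like the following sketch. The workload callback and expect_success flag are stand-ins for this test's real arguments; testutil_assert() is the assertion macro from test/utility/test_util.h.

#include <stdbool.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

/*
 * run_workload_in_child --
 *     Sketch: run a workload that may panic in a child process, then check that the child's
 *     exit status matches our expectation. An expected panic is intercepted by the child's
 *     message handler and converted to a zero exit; an unexpected panic terminates the child
 *     with a non-zero status.
 */
static void
run_workload_in_child(void (*workload)(void), bool expect_success)
{
    pid_t pid;
    int status;

    testutil_assert((pid = fork()) != -1);
    if (pid == 0) { /* child */
        workload();
        _exit(EXIT_SUCCESS);
    }
    testutil_assert(waitpid(pid, &status, 0) == pid);
    testutil_assert(WIFEXITED(status));
    testutil_assert((WEXITSTATUS(status) == EXIT_SUCCESS) == expect_success);
}
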
*/ pid = fork(); if (pid < 0) diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index 578bddc84b7..a61c797e683 100755 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -4,7 +4,7 @@ # functions: - "fetch source" : + "get project" : command: git.get_project params: directory: wiredtiger @@ -13,7 +13,7 @@ functions: params: aws_key: ${aws_key} aws_secret: ${aws_secret} - remote_file: wiredtiger/${build_variant}/${revision}/artifacts/${build_id}.tgz + remote_file: wiredtiger/${build_variant}/${revision}/artifacts/${dependent_task|compile}_${build_id}.tgz bucket: build_external extract_to: wiredtiger "fetch artifacts from little-endian" : @@ -55,11 +55,12 @@ functions: set -o verbose if [ "Windows_NT" = "$OS" ]; then pip install scons==3.1.1 - scons-3.1.1.bat --enable-python=c:\\swigwin-3.0.2\\swig.exe --enable-diagnostic ${smp_command|} + scons-3.1.1.bat ${win_configure_flags|--enable-python=c:\\swigwin-3.0.2\\swig.exe --enable-diagnostic} ${smp_command|} else cd build_posix sh ./reconf - ${configure_env_vars|} ../configure ${configure_python_setting|} --enable-diagnostic --enable-python --enable-zlib --enable-strict --enable-static --prefix=$(pwd)/LOCAL_INSTALL + ${configure_env_vars|} ../configure ${configure_python_setting|} \ + ${posix_configure_flags|--enable-diagnostic --enable-python --enable-zlib --enable-strict --enable-static --prefix=$(pwd)/LOCAL_INSTALL} ${make_command|make} ${smp_command|} 2>&1 # On macOS, change the binary location with install_name_tool since DYLD_LIBRARY_PATH @@ -71,62 +72,59 @@ functions: install_name_tool -change /usr/local/lib/libwiredtiger-$WT_VERSION.dylib $(pwd)/.libs/libwiredtiger-$WT_VERSION.dylib .libs/wt fi fi - -pre: - - command: shell.exec - params: - script: | - rm -rf "wiredtiger" -post: - - command: shell.exec + "make check directory": + command: shell.exec params: - working_dir: "wiredtiger" + working_dir: "wiredtiger/build_posix" script: | set -o errexit set -o verbose - tar cfz ../wiredtiger.tgz . 
- - command: s3.put + + ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C ${directory} ${smp_command|} 2>&1 + "upload artifact": + - command: archive.targz_pack + params: + target: "wiredtiger.tgz" + source_dir: "wiredtiger" + include: + - "./**" + - command: s3.put + params: + aws_secret: ${aws_secret} + aws_key: ${aws_key} + local_file: wiredtiger.tgz + bucket: build_external + permissions: public-read + content_type: application/tar + display_name: Artifacts + remote_file: wiredtiger/${build_variant}/${revision}/artifacts/${task_name}_${build_id}.tgz + "cleanup": + command: shell.exec params: - aws_secret: ${aws_secret} - aws_key: ${aws_key} - local_file: wiredtiger.tgz - bucket: build_external - permissions: public-read - content_type: application/tar - display_name: Artifacts - remote_file: wiredtiger/${build_variant}/${revision}/artifacts/${task_id}.tgz + script: | + rm -rf "wiredtiger" + rm -rf "wiredtiger.tgz" + +pre: + - func: "cleanup" +post: + - func: "upload artifact" + - func: "cleanup" tasks: ## Base compile task on posix flavours - name: compile commands: - - func: "fetch source" - - command: git.apply_patch - params: - directory: wiredtiger + - func: "get project" - func: "compile wiredtiger" - - command: archive.targz_pack - params: - target: "wiredtiger.tgz" - source_dir: "wiredtiger" - include: - - "./**" - - command: s3.put - params: - aws_secret: ${aws_secret} - aws_key: ${aws_key} - local_file: wiredtiger.tgz - bucket: build_external - permissions: public-read - content_type: application/tar - display_name: Artifacts_compile - remote_file: wiredtiger/${build_variant}/${revision}/artifacts/${build_id}.tgz - # Remove the artifacts here so the later post commands won't perform duplicated archiving. - - command: shell.exec - params: - script: | - rm -rf "wiredtiger" - rm -rf "wiredtiger.tgz" + + - name: compile-asan + commands: + - func: "get project" + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=/opt/mongodbtoolchain/v3/bin/clang CXX=/opt/mongodbtoolchain/v3/bin/clang++ ASAN_OPTIONS=detect_leaks=1:abort_on_error=1:disable_coredump=0 ASAN_SYMBOLIZER_PATH=/opt/mongodbtoolchain/v3/bin/llvm-symbolizer CFLAGS=-fsanitize=address + posix_configure_flags: --enable-silent-rules --enable-strict --enable-diagnostic --disable-static - name: make-check-test depends_on: @@ -151,14 +149,9 @@ tasks: commands: - func: "fetch artifacts" - func: "compile wiredtiger" - - command: shell.exec - params: - working_dir: "wiredtiger/build_posix" - script: | - set -o errexit - set -o verbose - - ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C lang/python ${smp_command|} 2>&1 + - func: "make check directory" + vars: + directory: lang/python - name: examples-c-test depends_on: @@ -166,14 +159,24 @@ tasks: commands: - func: "fetch artifacts" - func: "compile wiredtiger" - - command: shell.exec - params: - working_dir: "wiredtiger/build_posix" - script: | - set -o errexit - set -o verbose + - func: "make check directory" + vars: + directory: examples/c - ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C examples/c ${smp_command|} 2>&1 + - name: examples-c-test-asan + depends_on: + - name: compile-asan + commands: + - func: "fetch artifacts" + vars: + dependent_task: compile-asan + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=/opt/mongodbtoolchain/v3/bin/clang CXX=/opt/mongodbtoolchain/v3/bin/clang++ ASAN_OPTIONS=detect_leaks=1:abort_on_error=1:disable_coredump=0 ASAN_SYMBOLIZER_PATH=/opt/mongodbtoolchain/v3/bin/llvm-symbolizer 
CFLAGS=-fsanitize=address + posix_configure_flags: --enable-silent-rules --enable-strict --enable-diagnostic --disable-static + - func: "make check directory" + vars: + directory: examples/c - name: bloom-test depends_on: @@ -181,14 +184,9 @@ tasks: commands: - func: "fetch artifacts" - func: "compile wiredtiger" - - command: shell.exec - params: - working_dir: "wiredtiger/build_posix" - script: | - set -o errexit - set -o verbose - - ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/bloom ${smp_command|} 2>&1 + - func: "make check directory" + vars: + directory: test/bloom - name: checkpoint-test depends_on: @@ -196,14 +194,9 @@ tasks: commands: - func: "fetch artifacts" - func: "compile wiredtiger" - - command: shell.exec - params: - working_dir: "wiredtiger/build_posix" - script: | - set -o errexit - set -o verbose - - ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/checkpoint ${smp_command|} 2>&1 + - func: "make check directory" + vars: + directory: test/checkpoint - name: cursor-order-test depends_on: @@ -211,14 +204,9 @@ tasks: commands: - func: "fetch artifacts" - func: "compile wiredtiger" - - command: shell.exec - params: - working_dir: "wiredtiger/build_posix" - script: | - set -o errexit - set -o verbose - - ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/cursor_order ${smp_command|} 2>&1 + - func: "make check directory" + vars: + directory: test/cursor_order - name: fops-test depends_on: @@ -226,14 +214,9 @@ tasks: commands: - func: "fetch artifacts" - func: "compile wiredtiger" - - command: shell.exec - params: - working_dir: "wiredtiger/build_posix" - script: | - set -o errexit - set -o verbose - - ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/fops ${smp_command|} 2>&1 + - func: "make check directory" + vars: + directory: test/fops - name: format-test depends_on: @@ -241,14 +224,9 @@ tasks: commands: - func: "fetch artifacts" - func: "compile wiredtiger" - - command: shell.exec - params: - working_dir: "wiredtiger/build_posix" - script: | - set -o errexit - set -o verbose - - ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/format ${smp_command|} 2>&1 + - func: "make check directory" + vars: + directory: test/format - name: huge-test depends_on: @@ -256,14 +234,9 @@ tasks: commands: - func: "fetch artifacts" - func: "compile wiredtiger" - - command: shell.exec - params: - working_dir: "wiredtiger/build_posix" - script: | - set -o errexit - set -o verbose - - ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/huge ${smp_command|} 2>&1 + - func: "make check directory" + vars: + directory: test/huge - name: manydbs-test depends_on: @@ -271,14 +244,9 @@ tasks: commands: - func: "fetch artifacts" - func: "compile wiredtiger" - - command: shell.exec - params: - working_dir: "wiredtiger/build_posix" - script: | - set -o errexit - set -o verbose - - ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/manydbs ${smp_command|} 2>&1 + - func: "make check directory" + vars: + directory: test/manydbs - name: packing-test depends_on: @@ -286,14 +254,9 @@ tasks: commands: - func: "fetch artifacts" - func: "compile wiredtiger" - - command: shell.exec - params: - working_dir: "wiredtiger/build_posix" - script: | - set -o errexit - set -o verbose - - ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/packing ${smp_command|} 2>&1 + - func: "make check directory" + vars: + directory: test/packing - name: readonly-test depends_on: @@ -301,14 +264,9 @@ tasks: commands: - func: "fetch 
artifacts" - func: "compile wiredtiger" - - command: shell.exec - params: - working_dir: "wiredtiger/build_posix" - script: | - set -o errexit - set -o verbose - - ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/readonly ${smp_command|} 2>&1 + - func: "make check directory" + vars: + directory: test/readonly - name: salvage-test depends_on: @@ -316,14 +274,9 @@ tasks: commands: - func: "fetch artifacts" - func: "compile wiredtiger" - - command: shell.exec - params: - working_dir: "wiredtiger/build_posix" - script: | - set -o errexit - set -o verbose - - ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/salvage ${smp_command|} 2>&1 + - func: "make check directory" + vars: + directory: test/salvage - name: thread-test depends_on: @@ -331,14 +284,9 @@ tasks: commands: - func: "fetch artifacts" - func: "compile wiredtiger" - - command: shell.exec - params: - working_dir: "wiredtiger/build_posix" - script: | - set -o errexit - set -o verbose - - ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C test/thread ${smp_command|} 2>&1 + - func: "make check directory" + vars: + directory: test/thread - name: bench-wtperf-test depends_on: @@ -346,14 +294,9 @@ tasks: commands: - func: "fetch artifacts" - func: "compile wiredtiger" - - command: shell.exec - params: - working_dir: "wiredtiger/build_posix" - script: | - set -o errexit - set -o verbose - - ${test_env_vars|} ${make_command|make} VERBOSE=1 check -C bench/wtperf ${smp_command|} 2>&1 + - func: "make check directory" + vars: + directory: bench/wtperf # End of normal make check test tasks @@ -857,7 +800,7 @@ tasks: # Avoid /usr/bin/python, at least on macOS: with System Integrity # Protection enabled, it ignores DYLD_LIBRARY_PATH and hence # doesn't find the WiredTiger library in the local tree. - ${test_env_vars|} python ../test/suite/run.py -v 2 ${smp_command|} 2>&1 + ${test_env_vars|} ${python_binary|python} ../test/suite/run.py -v 2 ${smp_command|} 2>&1 # Break out Python unit tests into multiple buckets/tasks based on test name and runtime # The test/suite/run.py script can work out test names by casting each command argument @@ -1028,7 +971,7 @@ tasks: - name: million-collection-test commands: - - func: "fetch source" + - func: "get project" - func: "fetch mongo-tests repo" - command: shell.exec params: @@ -1043,7 +986,7 @@ tasks: - name: compatibility-test-for-mongodb-releases commands: - - func: "fetch source" + - func: "get project" - command: shell.exec params: working_dir: "wiredtiger" @@ -1243,6 +1186,8 @@ buildvariants: - name: unit-test-bucket06 - name: unit-test-bucket07 - name: fops + - name: compile-asan + - name: examples-c-test-asan - name: ubuntu1804-python3 display_name: Ubuntu 18.04 (Python3) @@ -1266,6 +1211,21 @@ buildvariants: - name: unit-test-bucket06 - name: unit-test-bucket07 +- name: rhel80 + display_name: RHEL 8.0 + run_on: + - rhel80-test + expansions: + test_env_vars: PATH=/opt/mongodbtoolchain/v3/bin:$PATH LD_LIBRARY_PATH=$(pwd)/.libs top_srcdir=$(pwd)/.. 
top_builddir=$(pwd)
+ smp_command: -j $(grep -c ^processor /proc/cpuinfo)
+ configure_env_vars: CC=/opt/mongodbtoolchain/v3/bin/gcc CXX=/opt/mongodbtoolchain/v3/bin/g++ PATH=/opt/mongodbtoolchain/v3/bin:$PATH
+ make_command: PATH=/opt/mongodbtoolchain/v3/bin:$PATH make
+ tasks:
+ - name: compile
+ - name: make-check-test
+ - name: unit-test
+ - name: fops
+
- name: large-scale-test
 display_name: Large scale testing
 batchtime: 1440 # 1 day
diff --git a/src/third_party/wiredtiger/test/format/bulk.c b/src/third_party/wiredtiger/test/format/bulk.c
index 303b0e4dbca..0f46e645311 100644
--- a/src/third_party/wiredtiger/test/format/bulk.c
+++ b/src/third_party/wiredtiger/test/format/bulk.c
@@ -151,15 +151,13 @@ wts_load(void)
 }
 /*
- * We don't want to size the cache to ensure the initial data
- * set can load in the in-memory case, guaranteeing the load
- * succeeds probably means future updates are also guaranteed
- * to succeed, which isn't what we want. If we run out of space
- * in the initial load, reset the row counter and continue.
+ * We don't want to size the cache to ensure the initial data set can load in the in-memory
+ * case, guaranteeing the load succeeds probably means future updates are also guaranteed to
+ * succeed, which isn't what we want. If we run out of space in the initial load, reset the
+ * row counter and continue.
 *
- * Decrease inserts, they can't be successful if we're at the
- * cache limit, and increase the delete percentage to get some
- * extra space once the run starts.
+ * Decrease inserts, they can't be successful if we're at the cache limit, and increase the
+ * delete percentage to get some extra space once the run starts.
 */
 if ((ret = cursor->insert(cursor)) != 0) {
 testutil_assert(ret == WT_CACHE_FULL || ret == WT_ROLLBACK);
diff --git a/src/third_party/wiredtiger/test/format/compact.c b/src/third_party/wiredtiger/test/format/compact.c
index e0492b7d5d6..a8c7ea4b3f9 100644
--- a/src/third_party/wiredtiger/test/format/compact.c
+++ b/src/third_party/wiredtiger/test/format/compact.c
@@ -60,12 +60,11 @@ compact(void *arg)
 break;
 /*
- * Compact can return EBUSY if concurrent with alter or if there
- * is eviction pressure, or we collide with checkpoints.
+ * Compact can return EBUSY if concurrent with alter or if there is eviction pressure, or we
+ * collide with checkpoints.
 *
- * Compact returns ETIMEDOUT if the compaction doesn't finish in
- * in some number of seconds. We don't configure a timeout and
- * occasionally exceed the default of 1200 seconds.
+ * Compact returns ETIMEDOUT if the compaction doesn't finish in some number of seconds.
+ * We don't configure a timeout and occasionally exceed the default of 1200 seconds.
 */
 ret = session->compact(session, g.uri, NULL);
 if (ret != 0 && ret != EBUSY && ret != ETIMEDOUT && ret != WT_ROLLBACK)
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index 712bd27fffb..8cec1318efc 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -109,11 +109,10 @@ config_setup(void)
 /*
 * LSM requires a row-store and backing disk.
 *
- * Configuring truncation or timestamps results in LSM
- * cache problems, don't configure LSM if those set.
+ * Configuring truncation or timestamps results in LSM cache problems, don't
+ * configure LSM if those are set.
 *
- * XXX
- * Remove the timestamp test when WT-4162 resolved.
+ * XXX Remove the timestamp test when WT-4162 is resolved.
*/ if (g.type != ROW || g.c_in_memory) break; @@ -209,16 +208,14 @@ config_setup(void) /* * Run-length is configured by a number of operations and a timer. * - * If the operation count and the timer are both configured, do nothing. - * If only the timer is configured, clear the operations count. - * If only the operation count is configured, limit the run to 6 hours. - * If neither is configured, leave the operations count alone and limit - * the run to 30 minutes. + * If the operation count and the timer are both configured, do nothing. If only the timer is + * configured, clear the operations count. If only the operation count is configured, limit the + * run to 6 hours. If neither is configured, leave the operations count alone and limit the run + * to 30 minutes. * - * In other words, if we rolled the dice on everything, do a short run. - * If we chose a number of operations but the rest of the configuration - * means operations take a long time to complete (for example, a small - * cache and many worker threads), don't let it run forever. + * In other words, if we rolled the dice on everything, do a short run. If we chose a number of + * operations but the rest of the configuration means operations take a long time to complete + * (for example, a small cache and many worker threads), don't let it run forever. */ if (config_is_perm("timer")) { if (!config_is_perm("ops")) @@ -263,16 +260,14 @@ config_cache(void) /* * Maximum internal/leaf page size sanity. * - * Ensure we can service at least one operation per-thread concurrently - * without filling the cache with pinned pages, that is, every thread - * consuming an internal page and a leaf page (or a pair of leaf pages - * for cursor movements). + * Ensure we can service at least one operation per-thread concurrently without filling the + * cache with pinned pages, that is, every thread consuming an internal page and a leaf page (or + * a pair of leaf pages for cursor movements). * * Maximum memory pages are in units of MB. * - * This code is what dramatically increases the cache size when there - * are lots of threads, it grows the cache to several megabytes per - * thread. + * This code is what dramatically increases the cache size when there are lots of threads, it + * grows the cache to several megabytes per thread. */ g.c_cache = WT_MAX(g.c_cache, 2 * g.c_threads * g.c_memory_page_max); @@ -368,8 +363,7 @@ config_compression(const char *conf_name) /* * Select a compression type from the list of built-in engines. * - * Listed percentages are only correct if all of the possible engines - * are compiled in. + * Listed percentages are only correct if all of the possible engines are compiled in. */ switch (mmrand(NULL, 1, 20)) { #ifdef HAVE_BUILTIN_EXTENSION_LZ4 @@ -657,13 +651,11 @@ config_pct(void) } /* - * Walk the list, allocating random numbers of operations in a random - * order. + * Walk the list, allocating random numbers of operations in a random order. * - * If the "order" field is non-zero, we need to create a value for this - * operation. Find the largest order field in the array; if one non-zero - * order field is found, it's the last entry and gets the remainder of - * the operations. + * If the "order" field is non-zero, we need to create a value for this operation. Find the + * largest order field in the array; if one non-zero order field is found, it's the last entry + * and gets the remainder of the operations. 
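
Stepping back to the run-length rules in config_setup above, they restate as a small decision table. A sketch, assuming format's GLOBAL g with g.c_ops (operation count) and g.c_timer (timer in minutes), and hypothetical timer_set/ops_set flags standing in for the config_is_perm() checks:

#include <stdbool.h>

/*
 * config_run_length --
 *     Sketch: apply the run-length defaulting rules from config_setup.
 */
static void
config_run_length(bool timer_set, bool ops_set)
{
    if (timer_set && ops_set)
        return;             /* both configured: do nothing */
    if (timer_set)
        g.c_ops = 0;        /* timer only: clear the operation count */
    else if (ops_set)
        g.c_timer = 6 * 60; /* ops only: limit the run to 6 hours */
    else
        g.c_timer = 30;     /* neither: limit the run to 30 minutes */
}
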
 */
 for (pct = 100 - pct;;) {
 for (i = n = max_order = max_slot = 0; i < WT_ELEMENTS(list); ++i) {
diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h
index 890f03c845c..d8cfea5730a 100644
--- a/src/third_party/wiredtiger/test/format/format.h
+++ b/src/third_party/wiredtiger/test/format/format.h
@@ -90,13 +90,11 @@ typedef struct {
 WT_RAND_STATE rnd; /* Global RNG state */
 /*
- * Prepare will return an error if the prepare timestamp is less than
- * any active read timestamp. Lock across allocating prepare and read
- * timestamps.
+ * Prepare will return an error if the prepare timestamp is less than any active read timestamp.
+ * Lock across allocating prepare and read timestamps.
 *
- * We get the last committed timestamp periodically in order to update
- * the oldest timestamp, that requires locking out transactional ops
- * that set a timestamp.
+ * We get the last committed timestamp periodically in order to update the oldest timestamp,
+ * that requires locking out transactional ops that set a timestamp.
 */
 pthread_rwlock_t ts_lock;
diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c
index 6f5e7943c83..f136372260c 100644
--- a/src/third_party/wiredtiger/test/format/ops.c
+++ b/src/third_party/wiredtiger/test/format/ops.c
@@ -107,15 +107,13 @@ wts_ops(bool lastrun)
 modify_repl_init();
 /*
- * There are two mechanisms to specify the length of the run, a number
- * of operations and a timer, when either expire the run terminates.
+ * There are two mechanisms to specify the length of the run, a number of operations and a
+ * timer, when either expires the run terminates.
 *
- * Each thread does an equal share of the total operations (and make
- * sure that it's not 0).
+ * Each thread does an equal share of the total operations (and make sure that it's not 0).
 *
- * Calculate how many fourth-of-a-second sleeps until the timer expires.
- * If the timer expires and threads don't return in 15 minutes, assume
- * there is something hung, and force the quit.
+ * Calculate how many fourth-of-a-second sleeps until the timer expires. If the timer expires
+ * and threads don't return in 15 minutes, assume there is something hung, and force the quit.
 */
 if (g.c_ops == 0)
 thread_ops = -1;
@@ -340,9 +338,8 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp)
 /*
 * Otherwise, pick a current timestamp.
 *
- * Prepare returns an error if the prepare timestamp is less
- * than any active read timestamp, single-thread transaction
- * prepare and begin.
+ * Prepare returns an error if the prepare timestamp is less than any active read timestamp,
+ * single-thread transaction prepare and begin.
 *
 * Lock out the oldest timestamp update.
 */
@@ -474,12 +471,12 @@ prepare_transaction(TINFO *tinfo)
 ++tinfo->prepare;
 /*
- * Prepare timestamps must be less than or equal to the eventual commit
- * timestamp. Set the prepare timestamp to whatever the global value is
- * now. The subsequent commit will increment it, ensuring correctness.
+ * Prepare timestamps must be less than or equal to the eventual commit timestamp. Set the
+ * prepare timestamp to whatever the global value is now. The subsequent commit will increment
+ * it, ensuring correctness.
 *
- * Prepare returns an error if the prepare timestamp is less than any
- * active read timestamp, single-thread transaction prepare and begin.
+ * Prepare returns an error if the prepare timestamp is less than any active read timestamp, + * single-thread transaction prepare and begin. * * Lock out the oldest timestamp update. */ @@ -573,11 +570,9 @@ ops_open_session(TINFO *tinfo, bool *ckpt_handlep) } if (cursor == NULL) { /* - * Configure "append", in the case of column stores, we append - * when inserting new rows. + * Configure "append", in the case of column stores, we append when inserting new rows. * - * WT_SESSION.open_cursor can return EBUSY if concurrent with a - * metadata operation, retry. + * WT_SESSION.open_cursor can return EBUSY if concurrent with a metadata operation, retry. */ while ((ret = session->open_cursor(session, g.uri, NULL, "append", &cursor)) == EBUSY) __wt_yield(); @@ -842,16 +837,13 @@ ops(void *arg) tinfo->keyno = mmrand(&tinfo->rnd, 1, (u_int)g.rows); /* - * Truncate up to 5% of the table. If the range overlaps - * the beginning/end of the table, set the key to 0 (the - * truncate function then sets a cursor to NULL so that - * code is tested). + * Truncate up to 5% of the table. If the range overlaps the beginning/end of the table, + * set the key to 0 (the truncate function then sets a cursor to NULL so that code is + * tested). * - * This gets tricky: there are 2 directions (truncating - * from lower keys to the current position or from - * the current position to higher keys), and collation - * order (truncating from lower keys to higher keys or - * vice-versa). + * This gets tricky: there are 2 directions (truncating from lower keys to the current + * position or from the current position to higher keys), and collation order + * (truncating from lower keys to higher keys or vice-versa). */ greater_than = mmrand(&tinfo->rnd, 0, 1) == 1; range = g.rows < 20 ? 0 : mmrand(&tinfo->rnd, 0, (u_int)g.rows / 20); @@ -1583,30 +1575,26 @@ table_append(uint64_t keyno) ep = g.append + g.append_max; /* - * We don't want to ignore records we append, which requires we update - * the "last row" as we insert new records. Threads allocating record - * numbers can race with other threads, so the thread allocating record - * N may return after the thread allocating N + 1. We can't update a - * record before it's been inserted, and so we can't leave gaps when the - * count of records in the table is incremented. + * We don't want to ignore records we append, which requires we update the "last row" as we + * insert new records. Threads allocating record numbers can race with other threads, so the + * thread allocating record N may return after the thread allocating N + 1. We can't update a + * record before it's been inserted, and so we can't leave gaps when the count of records in the + * table is incremented. * - * The solution is the append table, which contains an unsorted list of - * appended records. Every time we finish appending a record, process - * the table, trying to update the total records in the object. + * The solution is the append table, which contains an unsorted list of appended records. Every + * time we finish appending a record, process the table, trying to update the total records in + * the object. * * First, enter the new key into the append list. * - * It's technically possible to race: we allocated space for 10 records - * per thread, but the check for the maximum number of records being - * appended doesn't lock. 
If a thread allocated a new record and went - * to sleep (so the append table fills up), then N threads of control - * used the same g.append_cnt value to decide there was an available - * slot in the append table and both allocated new records, we could run - * out of space in the table. It's unfortunately not even unlikely in - * the case of a large number of threads all inserting as fast as they - * can and a single thread going to sleep for an unexpectedly long time. - * If it happens, sleep and retry until earlier records are resolved - * and we find a slot. + * It's technically possible to race: we allocated space for 10 records per thread, but the + * check for the maximum number of records being appended doesn't lock. If a thread allocated a + * new record and went to sleep (so the append table fills up), then N threads of control used + * the same g.append_cnt value to decide there was an available slot in the append table and + * both allocated new records, we could run out of space in the table. It's unfortunately not + * even unlikely in the case of a large number of threads all inserting as fast as they can and + * a single thread going to sleep for an unexpectedly long time. If it happens, sleep and retry + * until earlier records are resolved and we find a slot. */ for (done = 0;;) { testutil_check(pthread_rwlock_wrlock(&g.append_lock)); diff --git a/src/third_party/wiredtiger/test/format/random.c b/src/third_party/wiredtiger/test/format/random.c index 131cb0bd258..c808c53d442 100644 --- a/src/third_party/wiredtiger/test/format/random.c +++ b/src/third_party/wiredtiger/test/format/random.c @@ -70,7 +70,16 @@ random_kv(void *arg) /* This is just a smoke-test, get some key/value pairs. */ for (i = mmrand(NULL, 0, 1000); i > 0; --i) { - testutil_check(cursor->next(cursor)); + switch (ret = cursor->next(cursor)) { + case 0: + break; + case WT_NOTFOUND: + case WT_ROLLBACK: + case WT_PREPARE_CONFLICT: + continue; + default: + testutil_check(ret); + } testutil_check(cursor->get_key(cursor, &key)); testutil_check(cursor->get_value(cursor, &value)); } diff --git a/src/third_party/wiredtiger/test/format/salvage.c b/src/third_party/wiredtiger/test/format/salvage.c index efe2e0162a4..8c6e003370b 100644 --- a/src/third_party/wiredtiger/test/format/salvage.c +++ b/src/third_party/wiredtiger/test/format/salvage.c @@ -61,12 +61,11 @@ corrupt(void) char buf[8 * 1024], copycmd[2 * 1024]; /* - * If it's a single Btree file (not LSM), open the file, and corrupt - * roughly 2% of the file at a random spot, including the beginning - * of the file and overlapping the end. + * If it's a single Btree file (not LSM), open the file, and corrupt roughly 2% of the file at a + * random spot, including the beginning of the file and overlapping the end. * - * It's a little tricky: if the data source is a file, we're looking - * for "wt", if the data source is a table, we're looking for "wt.wt". + * It's a little tricky: if the data source is a file, we're looking for "wt", if the data + * source is a table, we're looking for "wt.wt". 
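
The corruption step the comment describes can be sketched as follows, using format's mmrand() helper and plain POSIX I/O; the LSM check, the "wt" versus "wt.wt" naming logic, and the backup copy are omitted:

#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

/*
 * corrupt_file --
 *     Sketch: overwrite roughly 2% of the file, starting at a random offset that may be the
 *     beginning of the file and may run past its end (pwrite extends the file in that case).
 */
static void
corrupt_file(const char *path)
{
    struct stat sb;
    char junk[4096];
    off_t off, remain;
    ssize_t chunk;
    int fd;

    testutil_assert((fd = open(path, O_RDWR)) != -1);
    testutil_assert(fstat(fd, &sb) == 0);

    off = (off_t)mmrand(NULL, 0, (u_int)sb.st_size); /* assumes the file size fits in a u_int */
    memset(junk, 'X', sizeof(junk));
    for (remain = sb.st_size / 50; remain > 0; remain -= chunk, off += chunk) {
        chunk = remain < (off_t)sizeof(junk) ? (ssize_t)remain : (ssize_t)sizeof(junk);
        testutil_assert(pwrite(fd, junk, (size_t)chunk, off) == chunk);
    }
    testutil_assert(close(fd) == 0);
}
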
*/ testutil_check(__wt_snprintf(buf, sizeof(buf), "%s/%s", g.home, WT_NAME)); if ((fd = open(buf, O_RDWR)) != -1) { diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c index c46a12f45b2..7a43ca9f9b4 100644 --- a/src/third_party/wiredtiger/test/format/t.c +++ b/src/third_party/wiredtiger/test/format/t.c @@ -134,11 +134,10 @@ main(int argc, char *argv[]) } /* - * If we weren't given a configuration file, set values from "CONFIG", - * if it exists. + * If we weren't given a configuration file, set values from "CONFIG", if it exists. * - * Small hack to ignore any CONFIG file named ".", that just makes it - * possible to ignore any local CONFIG file, used when running checks. + * Small hack to ignore any CONFIG file named ".", that just makes it possible to ignore any + * local CONFIG file, used when running checks. */ if (config == NULL && access("CONFIG", R_OK) == 0) config = "CONFIG"; @@ -213,12 +212,10 @@ main(int argc, char *argv[]) wts_ops(reps == FORMAT_OPERATION_REPS); /* - * Copy out the run's statistics after the last - * set of operations. + * Copy out the run's statistics after the last set of operations. * - * XXX - * Verify closes the underlying handle and - * discards the statistics, read them first. + * XXX Verify closes the underlying handle and discards the statistics, read them + * first. */ if (reps == FORMAT_OPERATION_REPS) wts_stats(); diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c index 88c5afd8e06..bdc98d25b46 100644 --- a/src/third_party/wiredtiger/test/format/util.c +++ b/src/third_party/wiredtiger/test/format/util.c @@ -165,9 +165,8 @@ val_init(void) /* * Set initial buffer contents to recognizable text. * - * Add a few extra bytes in order to guarantee we can always offset - * into the buffer by a few extra bytes, used to generate different - * data for column-store run-length encoded files. + * Add a few extra bytes in order to guarantee we can always offset into the buffer by a few + * extra bytes, used to generate different data for column-store run-length encoded files. */ val_len = MAX(KILOBYTE(100), g.c_value_max) + 20; val_base = dmalloc(val_len); @@ -351,11 +350,11 @@ path_setup(const char *home) testutil_check(__wt_snprintf(g.home_stats, len, "%s/%s", g.home, "stats")); /* - * Home directory initialize command: create the directory if it doesn't - * exist, else remove everything except the RNG log file. + * Home directory initialize command: create the directory if it doesn't exist, else remove + * everything except the RNG log file. * - * Redirect the "cd" command to /dev/null so chatty cd implementations - * don't add the new working directory to our output. + * Redirect the "cd" command to /dev/null so chatty cd implementations don't add the new working + * directory to our output. */ #undef CMD #ifdef _WIN32 @@ -398,11 +397,10 @@ path_setup(const char *home) "BACKUP_COPY", g.home, "BACKUP", g.home, "BACKUP_COPY")); /* - * Salvage command, save the interesting files so we can replay the - * salvage command as necessary. + * Salvage command, save the interesting files so we can replay the salvage command as necessary. * - * Redirect the "cd" command to /dev/null so chatty cd implementations - * don't add the new working directory to our output. + * Redirect the "cd" command to /dev/null so chatty cd implementations don't add the new working + * directory to our output. 
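
Both command strings rely on the same trick: route cd's output to /dev/null inside the command itself. A sketch of how such a command is assembled (the copy command text and the SALVAGE.copy directory are illustrative, not the exact string util.c builds):

#include <stdlib.h>

/*
 * save_salvage_copy --
 *     Sketch: build and run a shell command that changes into the database home quietly, then
 *     saves the files needed to replay salvage.
 */
static void
save_salvage_copy(void)
{
    char cmd[1024];

    testutil_check(__wt_snprintf(
        cmd, sizeof(cmd), "cd %s > /dev/null && cp WiredTiger* wt* SALVAGE.copy/", g.home));
    testutil_assert(system(cmd) == 0);
}
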
 */
#undef CMD
#ifdef _WIN32
@@ -439,12 +437,11 @@ rng(WT_RAND_STATE *rnd)
 rnd = &g.rnd;
 /*
- * We can reproduce a single-threaded run based on the random numbers
- * used in the initial run, plus the configuration files.
+ * We can reproduce a single-threaded run based on the random numbers used in the initial run,
+ * plus the configuration files.
 *
- * Check g.replay and g.rand_log_stop: multithreaded runs log/replay
- * until they get to the operations phase, then turn off log/replay,
- * threaded operation order can't be replayed.
+ * Check g.replay and g.rand_log_stop: multithreaded runs log/replay until they get to the
+ * operations phase, then turn off log/replay, threaded operation order can't be replayed.
 */
 if (g.rand_log_stop)
 return (__wt_random(rnd));
diff --git a/src/third_party/wiredtiger/test/readonly/readonly.c b/src/third_party/wiredtiger/test/readonly/readonly.c
index bd6adae429a..fb10a0f61e2 100644
--- a/src/third_party/wiredtiger/test/readonly/readonly.c
+++ b/src/third_party/wiredtiger/test/readonly/readonly.c
@@ -291,11 +291,10 @@ main(int argc, char *argv[])
 testutil_die(ret, "wiredtiger_open readonly nolock");
 /*
- * Create a child to also open a connection handle to the databases.
- * We cannot use fork here because using fork the child inherits the
- * same memory image. Therefore the WT process structure is set in
- * the child even though it should not be. So use 'system' to spawn
- * an entirely new process.
+ * Create a child to also open a connection handle to the databases. We cannot use fork here
+ * because using fork the child inherits the same memory image. Therefore the WT process
+ * structure is set in the child even though it should not be. So use 'system' to spawn an
+ * entirely new process.
 *
 * The child will exit with success if its test passes.
 */
diff --git a/src/third_party/wiredtiger/test/utility/test_util.h b/src/third_party/wiredtiger/test/utility/test_util.h
index 398727a6ca8..3442e8edcec 100644
--- a/src/third_party/wiredtiger/test/utility/test_util.h
+++ b/src/third_party/wiredtiger/test/utility/test_util.h
@@ -185,8 +185,8 @@ u64_to_string(uint64_t n, char **pp)
 char *p;
 /*
- * The argument pointer references the last element of a buffer (which
- * must be large enough to hold any possible value).
+ * The argument pointer references the last element of a buffer (which must be large enough to
+ * hold any possible value).
 *
 * Nul-terminate the buffer.
 */
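
The buffer convention that comment documents pairs with the usual backwards-fill conversion: write the nul at the last element, emit digits right to left, and return a pointer to the first digit. A self-contained sketch of the technique (not a copy of the header's implementation):

#include <inttypes.h>
#include <stdio.h>

/*
 * u64_to_string_sketch --
 *     Sketch: convert n to decimal, filling the buffer backwards from the element *pp points
 *     at, and hand back the first digit through pp.
 */
static void
u64_to_string_sketch(uint64_t n, char **pp)
{
    char *p;

    p = *pp;     /* the buffer's last element */
    *p-- = '\0'; /* nul-terminate */
    do
        *p-- = (char)('0' + n % 10);
    while ((n /= 10) > 0);
    *pp = p + 1; /* first digit of the result */
}

int
main(void)
{
    char buf[32], *p;

    p = &buf[sizeof(buf) - 1];
    u64_to_string_sketch(18446744073709551615ULL, &p); /* UINT64_MAX: 20 digits */
    printf("%s\n", p);
    return (0);
}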