diff options
17 files changed, 402 insertions, 219 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 068abb517a5..e770694e17c 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -216,8 +216,8 @@ LLLLLLL LOGREC LOGSCAN LOOKASIDE -LRSVv LRU +LRrSVv LSB LSM LSN @@ -349,6 +349,7 @@ SSHH SSq STAILQ STEC +STR STRUCT Scalability Scalable diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index a7c02120dad..8131208bf28 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "c600bde20363629405082a3ea985b70dfb00850e", + "commit": "18d13b8f6bc8d345952f16a7f3c63608e405fd77", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-4.2" diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c index 525728b73dc..ae2c64a126d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_random.c +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -9,147 +9,332 @@ #include "wt_internal.h" /* - * __wt_row_random_leaf -- - * Return a random key from a row-store leaf page. + * __random_insert_valid -- + * Check if the inserted key/value pair is valid. 
*/ -int -__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +static int +__random_insert_valid( + WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_INSERT *ins, WT_UPDATE **updp, bool *validp) { - WT_INSERT *ins, **start, **stop; - WT_INSERT_HEAD *ins_head; - WT_PAGE *page; - uint64_t samples; - uint32_t choice, entries, i; - int level; + *updp = NULL; + *validp = false; - page = cbt->ref->page; - start = stop = NULL; /* [-Wconditional-uninitialized] */ - entries = 0; /* [-Wconditional-uninitialized] */ + __cursor_pos_clear(cbt); + cbt->slot = 0; + cbt->ins_head = ins_head; + cbt->ins = ins; + cbt->compare = 0; + + return (__wt_cursor_valid(cbt, updp, validp)); +} + +/* + * __random_slot_valid -- + * Check if the slot key/value pair is valid. + */ +static int +__random_slot_valid(WT_CURSOR_BTREE *cbt, uint32_t slot, WT_UPDATE **updp, bool *validp) +{ + *updp = NULL; + *validp = false; __cursor_pos_clear(cbt); + cbt->slot = slot; + cbt->compare = 0; + + return (__wt_cursor_valid(cbt, updp, validp)); +} - /* If the page has disk-based entries, select from them. */ - if (page->entries != 0) { - cbt->compare = 0; - cbt->slot = __wt_random(&session->rnd) % page->entries; +/* Magic constant: 5000 entries in a skip list is enough to forcibly evict. */ +#define WT_RANDOM_SKIP_EVICT_SOON 5000 +/* Magic constant: 50 entries in a skip list is enough to predict the size. */ +#define WT_RANDOM_SKIP_PREDICT 50 - /* - * The real row-store search function builds the key, so we have to as well. - */ - return (__wt_row_leaf_key(session, page, page->pg_row + cbt->slot, cbt->tmp, false)); - } +/* + * __random_skip_entries -- + * Return an estimate of how many entries are in a skip list. + */ +static uint32_t +__random_skip_entries(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head) +{ + WT_INSERT **t; + uint32_t entries; + int level; - /* - * If the tree is new (and not empty), it might have a large insert - * list. 
- * - * Walk down the list until we find a level with at least 50 entries, - * that's where we'll start rolling random numbers. The value 50 is - * used to ignore levels with only a few entries, that is, levels which - * are potentially badly skewed. - */ - F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) - return (WT_NOTFOUND); + entries = 0; /* [-Wconditional-uninitialized] */ + + if (ins_head == NULL) + return (0); + + /* Find a level with enough entries on it to predict the size of the list. */ for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { - start = &ins_head->head[level]; - for (entries = 0, stop = start; *stop != NULL; stop = &(*stop)->next[level]) + for (entries = 0, t = &ins_head->head[level]; *t != NULL; t = &(*t)->next[level]) ++entries; - if (entries > 50) + if (entries > WT_RANDOM_SKIP_PREDICT) break; } - /* - * If it's a tiny list and we went all the way to level 0, correct the level; entries is - * correctly set. - */ - if (level < 0) - level = 0; + /* Use the skiplist probability to estimate the size of the list. */ + WT_ASSERT(session, WT_SKIP_PROBABILITY == UINT32_MAX >> 2); + while (--level >= 0) + entries *= 4; /* - * Step down the skip list levels, selecting a random chunk of the name space at each level. + * Random lookups in newly created collections can be slow if a page consists of a large + * skiplist. Schedule the page for eviction if we encounter a large skiplist. This is worthwhile + * because applications that take a sample often take many samples, so the overhead of + * traversing the skip list each time accumulates to real time. */ - for (samples = entries; level > 0; samples += entries) { + if (entries > WT_RANDOM_SKIP_EVICT_SOON) + __wt_page_evict_soon(session, cbt->ref); + + return (entries); +} + +/* Magic constant: check 3 records before/after the selected record. */ +#define WT_RANDOM_SKIP_LOCAL 3 +/* Magic constant: retry 3 times in a skip list before giving up. 
*/ +#define WT_RANDOM_SKIP_RETRY 3 + +/* + * __random_leaf_skip -- + * Return a random key/value from a skip list. + */ +static int +__random_leaf_skip(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, + uint32_t entries, WT_UPDATE **updp, bool *validp) +{ + WT_INSERT *ins, *saved_ins; + uint32_t i; + int retry; + + *updp = NULL; + *validp = false; + + /* This is a relatively expensive test, try a few times then quit. */ + for (retry = 0; retry < WT_RANDOM_SKIP_RETRY; ++retry) { /* - * There are (entries) or (entries + 1) chunks of the name space - * considered at each level. They are: between start and the 1st - * element, between the 1st and 2nd elements, and so on to the - * last chunk which is the name space after the stop element on - * the current level. This last chunk of name space may or may - * not be there: as we descend the levels of the skip list, this - * chunk may appear, depending if the next level down has - * entries logically after the stop point in the current level. - * We can't ignore those entries: because of the algorithm used - * to determine the depth of a skiplist, there may be a large - * number of entries "revealed" by descending a level. - * - * If the next level down has more items after the current stop - * point, there are (entries + 1) chunks to consider, else there - * are (entries) chunks. + * Randomly select a record in the skip list and walk to it. Remember the entry a few + * records before our target so we can look around in case our chosen record isn't valid. 
*/ - if (*(stop - 1) == NULL) - choice = __wt_random(&session->rnd) % entries; - else - choice = __wt_random(&session->rnd) % (entries + 1); + saved_ins = NULL; + i = __wt_random(&session->rnd) % entries; + for (ins = WT_SKIP_FIRST(ins_head); ins != NULL; ins = WT_SKIP_NEXT(ins)) { + if (--i == 0) + break; + if (i == WT_RANDOM_SKIP_LOCAL * 2) + saved_ins = ins; + } - if (choice == entries) { - /* - * We selected the name space after the stop element on this level. Set the start point - * to the current stop point, descend a level and move the stop element to the end of - * the list, that is, the end of the newly discovered name space, counting entries as we - * go. - */ - start = stop; - --start; - --level; - for (entries = 0, stop = start; *stop != NULL; stop = &(*stop)->next[level]) - ++entries; - } else { - /* - * We selected another name space on the level. Move the start pointer the selected - * number of entries forward to the start of the selected chunk (if the selected number - * is 0, start won't move). Set the stop pointer to the next element in the list and - * drop both start and stop down a level. - */ - for (i = 0; i < choice; ++i) - start = &(*start)->next[level]; - stop = &(*start)->next[level]; + /* Try and return our selected record. */ + if (ins != NULL) { + WT_RET(__random_insert_valid(cbt, ins_head, ins, updp, validp)); + if (*validp) + return (0); + } + + /* Check a few records before/after our selected record. */ + i = WT_RANDOM_SKIP_LOCAL; + if (saved_ins != NULL) { + i = WT_RANDOM_SKIP_LOCAL * 2; + ins = saved_ins; + } + for (; --i > 0 && ins != NULL; ins = WT_SKIP_NEXT(ins)) { + WT_RET(__random_insert_valid(cbt, ins_head, ins, updp, validp)); + if (*validp) + return (0); + } + } + return (0); +} + +/* Magic constant: 100 entries in any randomly chosen skip list is enough to select from it. */ +#define WT_RANDOM_SKIP_INSERT_ENOUGH 100 +/* Magic constant: 1000 entries in an initial skip list is enough to always select from it. 
*/ +#define WT_RANDOM_SKIP_INSERT_SMALLEST_ENOUGH 1000 - --start; - --stop; - --level; +/* + * __random_leaf_insert -- + * Look for a large insert list from which we can select a random item. + */ +static int +__random_leaf_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *validp) +{ + WT_INSERT_HEAD *ins_head; + WT_PAGE *page; + uint32_t entries, slot, start; + + *updp = NULL; + *validp = false; + + page = cbt->ref->page; - /* Count the entries in the selected name space. */ - for (entries = 0, ins = *start; ins != *stop; ins = ins->next[level]) - ++entries; + /* Check for a large insert list with no items, that's common when tables are newly created. */ + ins_head = WT_ROW_INSERT_SMALLEST(page); + entries = __random_skip_entries(session, cbt, ins_head); + if (entries >= WT_RANDOM_SKIP_INSERT_SMALLEST_ENOUGH) { + WT_RET(__random_leaf_skip(session, cbt, ins_head, entries, updp, validp)); + if (*validp) + return (0); + } + + /* + * Look for any reasonably large insert list. We're selecting a random insert list and won't end + * up on the same insert list every time we search this page (unless there's only one list), so + * decrease the required number of records required to select from the list. 
+ */ + if (page->entries > 0) { + start = __wt_random(&session->rnd) % page->entries; + for (slot = start; slot < page->entries; ++slot) { + ins_head = WT_ROW_INSERT(page, &page->pg_row[slot]); + entries = __random_skip_entries(session, cbt, ins_head); + if (entries >= WT_RANDOM_SKIP_INSERT_ENOUGH) { + WT_RET(__random_leaf_skip(session, cbt, ins_head, entries, updp, validp)); + if (*validp) + return (0); + } } + for (slot = 0; slot < start; ++slot) { + ins_head = WT_ROW_INSERT(page, &page->pg_row[slot]); + entries = __random_skip_entries(session, cbt, ins_head); + if (entries >= WT_RANDOM_SKIP_INSERT_ENOUGH) { + WT_RET(__random_leaf_skip(session, cbt, ins_head, entries, updp, validp)); + if (*validp) + return (0); + } + } + } + + /* Fall back to the single insert list, if it's not tiny. */ + ins_head = WT_ROW_INSERT_SMALLEST(page); + entries = __random_skip_entries(session, cbt, ins_head); + if (entries >= WT_RANDOM_SKIP_INSERT_ENOUGH) { + WT_RET(__random_leaf_skip(session, cbt, ins_head, entries, updp, validp)); + if (*validp) + return (0); + } + return (0); +} + +/* Magic constant: retry 10 times in the disk-based entries before giving up. */ +#define WT_RANDOM_DISK_RETRY 10 + +/* + * __random_leaf_disk -- + * Return a random key/value from a page's on-disk entries. + */ +static int +__random_leaf_disk(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *validp) +{ + WT_PAGE *page; + uint32_t entries, slot; + int retry; + + *updp = NULL; + *validp = false; + + page = cbt->ref->page; + entries = cbt->ref->page->entries; + + /* This is a relatively cheap test, so try several times. */ + for (retry = 0; retry < WT_RANDOM_DISK_RETRY; ++retry) { + slot = __wt_random(&session->rnd) % entries; + WT_RET(__random_slot_valid(cbt, slot, updp, validp)); + if (!*validp) + continue; + + /* The row-store search function builds the key, so we have to as well. 
*/ + return (__wt_row_leaf_key(session, page, page->pg_row + slot, cbt->tmp, false)); } + return (0); +} + +/* Magic constant: cursor up to 250 next/previous records before selecting a key. */ +#define WT_RANDOM_CURSOR_MOVE 250 +/* Magic constant: 1000 disk-based entries in a page is enough to always select from them. */ +#define WT_RANDOM_DISK_ENOUGH 1000 + +/* + * __random_leaf -- + * Return a random key/value from a row-store leaf page. + */ +static int +__random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + WT_UPDATE *upd; + uint32_t i; + bool next, valid; + + cursor = (WT_CURSOR *)cbt; /* - * When we reach the bottom level, entries will already be set. Select - * a random entry from the name space and return it. - * - * It should be impossible for the entries count to be 0 at this point, - * but check for it out of paranoia and to quiet static testing tools. + * If the page has a sufficiently large number of disk-based entries, randomly select from them. + * Ignoring large insert lists could skew the results, but enough disk-based entries should span + * a reasonable chunk of the name space. */ - if (entries > 0) - entries = __wt_random(&session->rnd) % entries; - for (ins = *start; entries > 0; --entries) - ins = ins->next[0]; + if (cbt->ref->page->entries > WT_RANDOM_DISK_ENOUGH) { + WT_RET(__random_leaf_disk(session, cbt, &upd, &valid)); + if (valid) + return (__cursor_kv_return(session, cbt, upd)); + } - cbt->ins = ins; - cbt->ins_head = ins_head; - cbt->compare = 0; + /* Look for any large insert list and select from it. */ + WT_RET(__random_leaf_insert(session, cbt, &upd, &valid)); + if (valid) + return (__cursor_kv_return(session, cbt, upd)); /* - * Random lookups in newly created collections can be slow if a page consists of a large - * skiplist. Schedule the page for eviction if we encounter a large skiplist. 
This worthwhile - * because applications that take a sample often take many samples, so the overhead of - * traversing the skip list each time accumulates to real time. + * Try again if there are at least a few hundred disk-based entries: this may be a normal leaf + * page with big items. */ - if (samples > 5000) - __wt_page_evict_soon(session, cbt->ref); + if (cbt->ref->page->entries > WT_RANDOM_DISK_ENOUGH / 2) { + WT_RET(__random_leaf_disk(session, cbt, &upd, &valid)); + if (valid) + return (__cursor_kv_return(session, cbt, upd)); + } + + /* + * We don't have many disk-based entries, we didn't find any large insert lists. Where we get + * into trouble is a small number of pages with large numbers of deleted items. Try and move out + * of the problematic namespace into something we can use by cursoring forward or backward. On a + * page with a sufficiently large group of deleted items where the randomly selected entries are + * all deleted, simply moving to the next or previous record likely means moving to the same + * record every time, so move the cursor a random number of items. Further, detect if we're + * about to return the same item twice in a row and try to avoid it. (If there's only a single + * record, or only a pair of records, we'll still end up in trouble, but at some point the tree + * is too small to do anything better.) All of this is slow and expensive, but the alternative + * is customer complaints. + */ + __cursor_pos_clear(cbt); + cbt->slot = 0; + next = true; /* Forward from the beginning of the page. */ + for (i = __wt_random(&session->rnd) % WT_RANDOM_CURSOR_MOVE;;) { + ret = next ? __wt_btcur_next(cbt, false) : __wt_btcur_prev(cbt, false); + if (ret == WT_NOTFOUND) { + next = false; /* Reverse direction from the end of the tree. */ + ret = __wt_btcur_prev(cbt, false); + WT_RET(ret); /* An empty tree. */ + } + if (i > 0) + --i; + else { + /* + * Skip the record we returned last time, once. 
Clear the tracking value so we don't + * skip that record twice, it just means the tree is too small for anything reasonable. + */ + if (cursor->key.size == cbt->tmp->size && + memcmp(cursor->key.data, cbt->tmp->data, cbt->tmp->size) == 0) { + cbt->tmp->size = 0; + i = __wt_random(&session->rnd) % WT_RANDOM_CURSOR_MOVE; + } else { + WT_RET(__wt_buf_set(session, cbt->tmp, cursor->key.data, cursor->key.size)); + break; + } + } + } return (0); } @@ -280,15 +465,14 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; - WT_UPDATE *upd; wt_off_t size; uint64_t n, skip; uint32_t read_flags; - bool valid; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; + read_flags = WT_READ_RESTART_OK; if (F_ISSET(cbt, WT_CBT_READ_ONCE)) FLD_SET(read_flags, WT_READ_WONT_NEED); @@ -319,8 +503,10 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { WT_ERR(__cursor_func_init(cbt, true)); WT_WITH_PAGE_INDEX(session, ret = __wt_random_descent(session, &cbt->ref, read_flags)); - if (ret == 0) - goto random_page_entry; + if (ret == 0) { + WT_ERR(__random_leaf(session, cbt)); + return (0); + } /* * Random descent may return not-found: the tree might be empty or have so many deleted @@ -394,20 +580,9 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) if (cbt->ref == NULL) WT_ERR(__wt_btcur_next(cbt, false)); -random_page_entry: - /* - * Select a random entry from the leaf page. If it's not valid, move to the next entry, if that - * doesn't work, move to the previous entry. - */ - WT_ERR(__wt_row_random_leaf(session, cbt)); - WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); - if (valid) - WT_ERR(__cursor_kv_return(session, cbt, upd)); - else { - if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND) - ret = __wt_btcur_prev(cbt, false); - WT_ERR(ret); - } + /* Select a random entry from the leaf page. 
*/ + WT_ERR(__random_leaf(session, cbt)); + return (0); err: diff --git a/src/third_party/wiredtiger/src/docs/command-line.dox b/src/third_party/wiredtiger/src/docs/command-line.dox index 852e07dd70a..986a634e786 100644 --- a/src/third_party/wiredtiger/src/docs/command-line.dox +++ b/src/third_party/wiredtiger/src/docs/command-line.dox @@ -3,7 +3,7 @@ WiredTiger includes a command line utility, \c wt. @section util_global_synopsis SYNOPSIS -<code>wt [-LRVv] [-C config] [-E secretkey ] [-h directory] command [command-specific arguments]</code> +<code>wt [-LRrVv] [-C config] [-E secretkey ] [-h directory] command [command-specific arguments]</code> @section util_global_description DESCRIPTION The \c wt tool is a command-line utility that provides access to @@ -22,6 +22,8 @@ Specify a database home directory. Forcibly turn off logging subsystem for debugging purposes. @par <code>-R</code> Run recovery if the underlying database is configured to do so. +@par <code>-r</code> +Access the database via a readonly connection. @par <code>-V</code> Display WiredTiger version and exit. @par <code>-v</code> @@ -34,7 +36,11 @@ The \c wt tool supports several commands. If configured in the underlying database, some commands will run recovery when opening the database. If the user wants to force recovery on any command, use the \c -R option. In general, commands that modify the database or tables will run recovery -by default and commands that only read data will not run recovery. +by default and commands that only read data will not run recovery. It is +recommended when attempting to diagnose a corrupt database, that the -r +flag be used. This flag will open the connection read-only and prevent +utility commands from writing to +any of the existing database objects. <hr> @section util_alter wt alter @@ -142,7 +148,7 @@ which can be re-loaded into a new table using the \c load command. 
See @subpage dump_formats for details of the dump file formats. @subsection util_dump_synopsis Synopsis -<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] dump [-jrx] [-c checkpoint] [-f output] uri</code> +<code>wt [-RrVv] [-C config] [-E secretkey ] [-h directory] dump [-jrx] [-c checkpoint] [-f output] uri</code> @subsection util_dump_options Options The following are command-specific options for the \c dump command: @@ -176,7 +182,7 @@ database. If a URI is specified as an argument, only information about that data source is printed. @subsection util_list_synopsis Synopsis -<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] list [-cv] [uri]</code> +<code>wt [-RrVv] [-C config] [-E secretkey ] [-h directory] list [-cv] [uri]</code> @subsection util_list_options Options The following are command-specific options for the \c list command: @@ -292,7 +298,7 @@ Display the database log. The \c printlog command outputs the database log. @subsection util_printlog_synopsis Synopsis -<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] printlog [-x] [-f output]</code> +<code>wt [-RrVv] [-C config] [-E secretkey ] [-h directory] printlog [-x] [-f output]</code> @subsection util_printlog_options Options The following are command-specific options for the \c printlog command: @@ -316,7 +322,7 @@ with string or record number keys and string values. The \c read command exits non-zero if a specified record is not found. @subsection util_read_synopsis Synopsis -<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] read uri key ...</code> +<code>wt [-RrVv] [-C config] [-E secretkey ] [-h directory] read uri key ...</code> @subsection util_read_options Options The \c read command has no command-specific options. @@ -404,7 +410,7 @@ The \c verify command verifies the specified table, exiting success if the data source is correct, and failure if the data source is corrupted. 
@subsection util_verify_synopsis Synopsis -<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] verify uri</code> +<code>wt [-RrVv] [-C config] [-E secretkey ] [-h directory] verify uri</code> @subsection util_verify_options Options The \c verify command has no command-specific options. diff --git a/src/third_party/wiredtiger/src/docs/spell.ok b/src/third_party/wiredtiger/src/docs/spell.ok index 98ec49779a4..0677b92028d 100644 --- a/src/third_party/wiredtiger/src/docs/spell.ok +++ b/src/third_party/wiredtiger/src/docs/spell.ok @@ -53,7 +53,7 @@ LIBS LLVM LOGREC LRU -LRVv +LRrVv LSB LSM LZ @@ -79,6 +79,7 @@ RepMgr Riak RocksDB Roelofs +RrVv Rrx SCons Seward's diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index 18b17a3bebd..b52bd2c86ca 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -145,13 +145,15 @@ struct __wt_cursor_btree { WT_ROW *rip_saved; /* Last-returned key reference */ /* - * A temporary buffer for caching RLE values for column-store files (if - * RLE is non-zero, then we don't unpack the value every time we move - * to the next cursor position, we re-use the unpacked value we stored - * here the first time we hit the value). + * A temporary buffer, used in a few different ways: * - * A temporary buffer for building on-page keys when searching row-store - * files. + * 1) For caching RLE values for column-store files (if RLE is non-zero, then we don't unpack + * the value every time we move to the next cursor position, we re-use the unpacked value we + * stored here the first time we hit the value). + * + * 2) For building on-page keys when searching row-store files. + * + * 3) For tracking random return values to avoid repetition. 
*/ WT_ITEM *tmp, _tmp; diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index f069e683ff8..d02b4dca326 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -1194,8 +1194,6 @@ extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert, bool restore) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) diff --git a/src/third_party/wiredtiger/src/utilities/util_main.c b/src/third_party/wiredtiger/src/utilities/util_main.c index 4f609cdeeef..50879a6c862 100644 --- a/src/third_party/wiredtiger/src/utilities/util_main.c +++ b/src/third_party/wiredtiger/src/utilities/util_main.c @@ -11,7 +11,7 @@ const char *home = "."; /* Home directory */ const char *progname; /* Program name */ /* Global arguments */ -const char *usage_prefix = "[-LRSVv] [-C config] [-E secretkey] [-h home]"; +const char *usage_prefix = "[-LRrSVv] [-C config] [-E secretkey] [-h home]"; bool verbose = false; /* Verbose flag */ static const char *command; /* Command name */ @@ -45,6 +45,9 @@ usage(void) "-R\t" "run recovery (if recovery configured)\n" "\t" + "-r\t" + "access the database via a readonly connection\n" + "\t" "-S\t" "run salvage recovery (if recovery configured)\n" "\t" @@ -144,7 +147,7 @@ main(int argc, char *argv[]) rec_config = REC_ERROR; logoff = readonly = recover = salvage = false; /* Check for standard options. 
*/ - while ((ch = __wt_getopt(progname, argc, argv, "C:E:h:LRSVv")) != EOF) + while ((ch = __wt_getopt(progname, argc, argv, "C:E:h:LRrSVv")) != EOF) switch (ch) { case 'C': /* wiredtiger_open config */ cmd_config = __wt_optarg; @@ -168,6 +171,10 @@ main(int argc, char *argv[]) rec_config = REC_RECOVER; recover = true; break; + case 'r': + readonly_config = READONLY; + readonly = true; + break; case 'S': /* salvage */ rec_config = REC_SALVAGE; salvage = true; @@ -187,6 +194,10 @@ main(int argc, char *argv[]) fprintf(stderr, "Only one of -L, -R, and -S is allowed.\n"); goto err; } + if ((recover || salvage) && readonly) { + fprintf(stderr, "-R and -S cannot be used with -r\n"); + goto err; + } argc -= __wt_optind; argv += __wt_optind; @@ -225,20 +236,17 @@ main(int argc, char *argv[]) func = util_downgrade; else if (strcmp(command, "drop") == 0) func = util_drop; - else if (strcmp(command, "dump") == 0) { + else if (strcmp(command, "dump") == 0) func = util_dump; - readonly_config = READONLY; - } break; case 'i': if (strcmp(command, "import") == 0) func = util_import; break; case 'l': - if (strcmp(command, "list") == 0) { + if (strcmp(command, "list") == 0) func = util_list; - readonly_config = READONLY; - } else if (strcmp(command, "load") == 0) { + else if (strcmp(command, "load") == 0) { func = util_load; config = "create"; } else if (strcmp(command, "loadtext") == 0) { @@ -250,14 +258,12 @@ main(int argc, char *argv[]) if (strcmp(command, "printlog") == 0) { func = util_printlog; rec_config = REC_LOGOFF; - readonly_config = READONLY; } break; case 'r': - if (strcmp(command, "read") == 0) { + if (strcmp(command, "read") == 0) func = util_read; - readonly_config = READONLY; - } else if (strcmp(command, "rebalance") == 0) + else if (strcmp(command, "rebalance") == 0) func = util_rebalance; else if (strcmp(command, "rename") == 0) func = util_rename; @@ -268,7 +274,6 @@ main(int argc, char *argv[]) else if (strcmp(command, "stat") == 0) { func = util_stat; config = 
"statistics=(all)"; - readonly_config = READONLY; } break; case 't': @@ -280,10 +285,8 @@ main(int argc, char *argv[]) func = util_upgrade; break; case 'v': - if (strcmp(command, "verify") == 0) { + if (strcmp(command, "verify") == 0) func = util_verify; - readonly_config = READONLY; - } break; case 'w': if (strcmp(command, "write") == 0) @@ -297,13 +300,6 @@ main(int argc, char *argv[]) goto err; } - /* - * If the user has specified recovery or salvage disable readonly mode, as they are both not - * readonly operations. - */ - if (recover || salvage) - readonly_config = NULL; - /* Build the configuration string. */ len = 10; /* some slop */ p1 = p2 = p3 = ""; diff --git a/src/third_party/wiredtiger/test/csuite/import/smoke.sh b/src/third_party/wiredtiger/test/csuite/import/smoke.sh index 250c5896d8b..db73e28eb99 100755 --- a/src/third_party/wiredtiger/test/csuite/import/smoke.sh +++ b/src/third_party/wiredtiger/test/csuite/import/smoke.sh @@ -1,12 +1,6 @@ #! /bin/sh set -e -# XXX - The following line should not be necessary. But without it, the immediate -# test and exit reveals a bug and memory leak in /bin/bash when run with valgrind. -# That leak is reported in https://bugzilla.redhat.com/show_bug.cgi?id=1746101 -# When that bug is fixed, this script can remove this line. -date > /dev/null - # Bypass this test for slow machines, valgrind test "$TESTUTIL_SLOW_MACHINE" = "1" && exit 0 test "$TESTUTIL_BYPASS_VALGRIND" = "1" && exit 0 diff --git a/src/third_party/wiredtiger/test/csuite/random_abort/main.c b/src/third_party/wiredtiger/test/csuite/random_abort/main.c index b7dd1596f84..3e755565bf7 100644 --- a/src/third_party/wiredtiger/test/csuite/random_abort/main.c +++ b/src/third_party/wiredtiger/test/csuite/random_abort/main.c @@ -73,6 +73,10 @@ static bool inmem; #define MAX_MODIFY_ENTRIES 10 #define MAX_VAL 4096 +/* + * STR_MAX_VAL is set to MAX_VAL - 1 to account for the extra null character. 
+ */ +#define STR_MAX_VAL "4095" static void handler(int) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); static void usage(void) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); @@ -161,8 +165,6 @@ thread_run(void *arg) else testutil_check(session->open_cursor(session, uri, NULL, NULL, &cursor)); - data.data = buf; - data.size = sizeof(buf); /* * Write our portion of the key space until we're killed. */ @@ -495,7 +497,8 @@ recover_and_verify(uint32_t nthreads) * If it is modify operation, make sure value of the fetched record matches with * saved. */ - ret = fscanf(fp[MODIFY_RECORD_FILE_ID], "%s %" SCNu64 "\n", file_value, &key); + ret = fscanf( + fp[MODIFY_RECORD_FILE_ID], "%" STR_MAX_VAL "s %" SCNu64 "\n", file_value, &key); /* * Consider anything other than clear success in getting the key to be EOF. We've diff --git a/src/third_party/wiredtiger/test/csuite/wt2535_insert_race/main.c b/src/third_party/wiredtiger/test/csuite/wt2535_insert_race/main.c index 376dc5f81ef..7be2a800828 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2535_insert_race/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2535_insert_race/main.c @@ -49,6 +49,10 @@ main(int argc, char *argv[]) uint64_t current_value; int i; + /* Bypass this test for valgrind */ + if (testutil_is_flag_set("TESTUTIL_BYPASS_VALGRIND")) + return (EXIT_SUCCESS); + opts = &_opts; memset(opts, 0, sizeof(*opts)); opts->nthreads = 20; diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index 447691f24e2..bf20d7568bc 100755 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -79,12 +79,11 @@ pre: post: - command: shell.exec params: + working_dir: "wiredtiger" script: | set -o errexit set -o verbose - cd wiredtiger tar cfz ../wiredtiger.tgz . - cd .. 
- command: s3.put params: aws_secret: ${aws_secret} @@ -95,15 +94,10 @@ post: content_type: application/tar display_name: Artifacts remote_file: wiredtiger/${build_variant}/${revision}/artifacts/${task_id}.tgz - - command: shell.exec - params: - script: | - rm -rf "wiredtiger" tasks: ## Base compile task on posix flavours - name: compile - depends_on: [] commands: - func: "fetch source" - command: git.apply_patch diff --git a/src/third_party/wiredtiger/test/format/rebalance.c b/src/third_party/wiredtiger/test/format/rebalance.c index 7cdfaa188d3..8850f3c47b0 100644 --- a/src/third_party/wiredtiger/test/format/rebalance.c +++ b/src/third_party/wiredtiger/test/format/rebalance.c @@ -40,9 +40,9 @@ wts_rebalance(void) track("rebalance", 0ULL, NULL); - /* Dump the current object. Pass -R flag to avoid readonly check while logging statistics */ + /* Dump the current object */ testutil_check(__wt_snprintf(cmd, sizeof(cmd), ".." DIR_DELIM_STR ".." DIR_DELIM_STR "wt" - " -R -h %s dump -f %s/rebalance.orig %s", + " -h %s dump -f %s/rebalance.orig %s", g.home, g.home, g.uri)); testutil_checkfmt(system(cmd), "command failed: %s", cmd); @@ -60,7 +60,7 @@ wts_rebalance(void) wts_verify("post-rebalance verify"); wts_close(); testutil_check(__wt_snprintf(cmd, sizeof(cmd), ".." DIR_DELIM_STR ".." 
DIR_DELIM_STR "wt" - " -R -h %s dump -f %s/rebalance.new %s", + " -h %s dump -f %s/rebalance.new %s", g.home, g.home, g.uri)); testutil_checkfmt(system(cmd), "command failed: %s", cmd); diff --git a/src/third_party/wiredtiger/test/format/smoke.sh b/src/third_party/wiredtiger/test/format/smoke.sh index 0c86b5e57c6..309cedbc5ac 100755 --- a/src/third_party/wiredtiger/test/format/smoke.sh +++ b/src/third_party/wiredtiger/test/format/smoke.sh @@ -9,3 +9,5 @@ $TEST_WRAPPER ./t $args file_type=fix $TEST_WRAPPER ./t $args file_type=row $TEST_WRAPPER ./t $args file_type=row data_source=lsm $TEST_WRAPPER ./t $args file_type=var +# Force a rebalance to occur with statistics logging to test the utility +$TEST_WRAPPER ./t $args file_type=row statistics_server=1 rebalance=1 diff --git a/src/third_party/wiredtiger/test/suite/test_backup01.py b/src/third_party/wiredtiger/test/suite/test_backup01.py index 161746d5a7f..b002a9c4601 100644 --- a/src/third_party/wiredtiger/test/suite/test_backup01.py +++ b/src/third_party/wiredtiger/test/suite/test_backup01.py @@ -64,8 +64,7 @@ class test_backup(wttest.WiredTigerTestCase, suite_subprocess): # Compare the original and backed-up files using the wt dump command. def compare(self, uri): self.runWt(['dump', uri], outfilename='orig') - # Pass -R flag as we need to run recovery on the backup dir before calling dump. - self.runWt(['-h', self.dir, '-R', 'dump', uri], outfilename='backup') + self.runWt(['-h', self.dir, 'dump', uri], outfilename='backup') self.assertEqual(True, compare_files(self, 'orig', 'backup')) # Test simple backup cursor open/close. @@ -89,8 +88,7 @@ class test_backup(wttest.WiredTigerTestCase, suite_subprocess): # Make sure all the files were copied. self.runWt(['list'], outfilename='outfile.orig') - # Pass -R flag as we need to run recovery on the backup dir before calling list. 
- self.runWt(['-h', self.dir, '-R', 'list'], outfilename='outfile.backup') + self.runWt(['-h', self.dir, 'list'], outfilename='outfile.backup') self.assertEqual( True, compare_files(self, 'outfile.orig', 'outfile.backup')) diff --git a/src/third_party/wiredtiger/test/suite/test_backup03.py b/src/third_party/wiredtiger/test/suite/test_backup03.py index 9ccfd06b748..10bc8fe1215 100644 --- a/src/third_party/wiredtiger/test/suite/test_backup03.py +++ b/src/third_party/wiredtiger/test/suite/test_backup03.py @@ -95,8 +95,7 @@ class test_backup_target(wttest.WiredTigerTestCase, suite_subprocess): # Compare the original and backed-up files using the wt dump command. def compare(self, uri): self.runWt(['dump', uri], outfilename='orig') - # Pass -R flag as we need to run recovery on the backup dir before calling dump. - self.runWt(['-h', self.dir, '-R', 'dump', uri], outfilename='backup') + self.runWt(['-h', self.dir, 'dump', uri], outfilename='backup') self.assertEqual(True, compare_files(self, 'orig', 'backup')) # Check that a URI doesn't exist, both the meta-data and the file names. diff --git a/src/third_party/wiredtiger/test/suite/test_cursor_random.py b/src/third_party/wiredtiger/test/suite/test_cursor_random.py index a869aba3b4d..736f5f9d397 100644 --- a/src/third_party/wiredtiger/test/suite/test_cursor_random.py +++ b/src/third_party/wiredtiger/test/suite/test_cursor_random.py @@ -95,48 +95,58 @@ class test_cursor_random(wttest.WiredTigerTestCase): # Check that next_random works in the presence of a larger set of values, # where the values are in an insert list. - def test_cursor_random_multiple_insert_records(self): + def cursor_random_multiple_insert_records(self, n): uri = self.type - ds = self.dataset(self, uri, 100, + ds = self.dataset(self, uri, n, config='allocation_size=512,leaf_page_max=512') ds.populate() - # In a insert list, next_random always selects the middle key/value - # pair, all we can do is confirm cursor.next works. 
+ # Assert we only see 20% matches. We expect to see less than that, but we don't want + # to chase random test failures, either. cursor = self.session.open_cursor(uri, None, self.config) - self.assertEqual(cursor.next(), 0) + list=[] + for i in range(1,100): + self.assertEqual(cursor.next(), 0) + list.append(cursor.get_key()) + self.assertGreater(len(set(list)), 80) + + def test_cursor_random_multiple_insert_records_small(self): + self.cursor_random_multiple_insert_records(2000) + def test_cursor_random_multiple_insert_records_large(self): + self.cursor_random_multiple_insert_records(10000) # Check that next_random works in the presence of a larger set of values, # where the values are in a disk format page. - def cursor_random_multiple_page_records(self, reopen): + def cursor_random_multiple_page_records(self, n, reopen): uri = self.type - ds = self.dataset(self, uri, 10000, + ds = self.dataset(self, uri, n, config='allocation_size=512,leaf_page_max=512') ds.populate() - # Optionally close the connection so everything is forced to disk, - # insert lists are an entirely different path in the code. + # Optionally close the connection so everything is forced to disk, insert lists are an + # entirely different page format. if reopen: self.reopen_conn() + # Assert we only see 20% matches. We expect to see less than that, but we don't want + # to chase random test failures, either. 
cursor = self.session.open_cursor(uri, None, self.config) - last = '' - match = 0 - for i in range(1,10): + list=[] + for i in range(1, 100): self.assertEqual(cursor.next(), 0) - current = cursor.get_key() - if current == last: - match += 1 - last = current - self.assertLess(match, 5, - 'next_random did not return random records, too many matches found') - - def test_cursor_random_multiple_page_records_reopen(self): - self.cursor_random_multiple_page_records(1) - def test_cursor_random_multiple_page_records(self): - self.cursor_random_multiple_page_records(0) - - # Check that next_random fails in the presence of a set of values, some of + list.append(cursor.get_key()) + self.assertGreater(len(set(list)), 80) + + def test_cursor_random_multiple_page_records_reopen_small(self): + self.cursor_random_multiple_page_records(2000, True) + def test_cursor_random_multiple_page_records_reopen_large(self): + self.cursor_random_multiple_page_records(10000, True) + def test_cursor_random_multiple_page_records_small(self): + self.cursor_random_multiple_page_records(2000, False) + def test_cursor_random_multiple_page_records_large(self): + self.cursor_random_multiple_page_records(10000, False) + + # Check that next_random succeeds in the presence of a set of values, some of # which are deleted. def test_cursor_random_deleted_partial(self): uri = self.type |