diff options
Diffstat (limited to 'src/third_party/wiredtiger/src')
40 files changed, 1049 insertions, 771 deletions
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c index c20a294c07b..98cc10a6de1 100644 --- a/src/third_party/wiredtiger/src/block/block_ckpt.c +++ b/src/third_party/wiredtiger/src/block/block_ckpt.c @@ -342,6 +342,11 @@ __ckpt_verify(WT_SESSION_IMPL *session, WT_CKPT *ckptbase) break; /* FALLTHROUGH */ default: + /* + * Don't convert to WT_ILLEGAL_VALUE, it won't compile + * on some gcc compilers because they don't understand + * FALLTHROUGH as part of a macro. + */ return ( __wt_illegal_value(session, "checkpoint array")); } diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index d58dc78fbed..6e1ab526e52 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -1197,8 +1197,7 @@ __cursor_chain_exceeded(WT_CURSOR_BTREE *cbt) upd = page->modify->mod_row_update[cbt->slot]; for (i = 0; upd != NULL; ++i, upd = upd->next) { - if (upd->type == WT_UPDATE_DELETED || - upd->type == WT_UPDATE_STANDARD) + if (WT_UPDATE_DATA_VALUE(upd)) return (false); if (i >= WT_MAX_MODIFY_UPDATE) return (true); @@ -1219,7 +1218,7 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) WT_DECL_RET; WT_SESSION_IMPL *session; size_t orig, new; - bool chain_exceeded, overwrite; + bool overwrite; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; @@ -1259,13 +1258,13 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) /* * WT_CURSOR.modify is update-without-overwrite. * - * Use the modify buffer as the update if under the limit, else use the - * complete value. + * Use the modify buffer as the update if the data package saves us some + * memory and the update chain is under the limit, else use the complete + * value. */ overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE); F_CLR(cursor, WT_CURSTD_OVERWRITE); - chain_exceeded = __cursor_chain_exceeded(cbt); - if (chain_exceeded) + if (cursor->value.size <= 64 || __cursor_chain_exceeded(cbt)) ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD); else if ((ret = __wt_modify_pack(session, &modify, entries, nentries)) == 0) diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index b8d11be7b3e..d91ac027738 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -39,7 +39,6 @@ static int __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, bool); static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *); static int __debug_dsk_cell(WT_DBG *, const WT_PAGE_HEADER *); static int __debug_dsk_col_fix(WT_DBG *, const WT_PAGE_HEADER *); -static int __debug_item(WT_DBG *, const char *, const void *, size_t); static int __debug_page(WT_DBG *, WT_REF *, uint32_t); static int __debug_page_col_fix(WT_DBG *, WT_REF *); static int __debug_page_col_int(WT_DBG *, WT_PAGE *, uint32_t); @@ -81,6 +80,41 @@ __debug_hex_byte(WT_DBG *ds, uint8_t v) } /* + * __debug_bytes -- + * Dump a single set of bytes. + */ +static int +__debug_bytes(WT_DBG *ds, const void *data_arg, size_t size) +{ + size_t i; + u_char ch; + const uint8_t *data; + + for (data = data_arg, i = 0; i < size; ++i, ++data) { + ch = data[0]; + if (__wt_isprint(ch)) + WT_RET(ds->f(ds, "%c", (int)ch)); + else + WT_RET(__debug_hex_byte(ds, data[0])); + } + return (0); +} + +/* + * __debug_item -- + * Dump a single data/size pair, with an optional tag. + */ +static int +__debug_item(WT_DBG *ds, const char *tag, const void *data_arg, size_t size) +{ + WT_RET(ds->f(ds, + "\t%s%s{", tag == NULL ? "" : tag, tag == NULL ? "" : " ")); + WT_RET(__debug_bytes(ds, data_arg, size)); + WT_RET(ds->f(ds, "}\n")); + return (0); +} + +/* * __dmsg_event -- * Send a debug message to the event handler. */ @@ -993,23 +1027,26 @@ static int __debug_modified(WT_DBG *ds, WT_UPDATE *upd) { const size_t *p; - int nentries; + size_t nentries, data_size, offset, size; const uint8_t *data; - void *modify; - - modify = upd->data; - p = modify; - nentries = (int)*p++; - data = (uint8_t *)modify + + p = (size_t *)upd->data; + memcpy(&nentries, p++, sizeof(size_t)); + data = upd->data + sizeof(size_t) + ((size_t)nentries * 3 * sizeof(size_t)); - WT_RET(ds->f(ds, "%d: ", nentries)); - for (; nentries-- > 0; data += p[0], p += 3) + WT_RET(ds->f(ds, "%" WT_SIZET_FMT ": ", nentries)); + for (; nentries-- > 0; data += data_size) { + memcpy(&data_size, p++, sizeof(size_t)); + memcpy(&offset, p++, sizeof(size_t)); + memcpy(&size, p++, sizeof(size_t)); WT_RET(ds->f(ds, "{%" WT_SIZET_FMT ", %" WT_SIZET_FMT ", %" WT_SIZET_FMT - ", %.*s}%s", p[0], p[1], p[2], - (int)p[2], data, nentries == 0 ? "" : ", ")); + ", ", + data_size, offset, size)); + WT_RET(__debug_bytes(ds, data, data_size)); + WT_RET(ds->f(ds, "}%s", nentries == 0 ? "" : ", ")); + } return (0); } @@ -1052,17 +1089,10 @@ __debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte) #ifdef HAVE_TIMESTAMPS if (!__wt_timestamp_iszero( WT_TIMESTAMP_NULL(&upd->timestamp))) { -#if WT_TIMESTAMP_SIZE == 8 - WT_RET(ds->f(ds, - ", stamp %" PRIu64, upd->timestamp.val)); -#else - int i; - - WT_RET(ds->f(ds, ", stamp 0x")); - for (i = 0; i < WT_TIMESTAMP_SIZE; ++i) - WT_RET(ds->f(ds, - "%" PRIx8, upd->timestamp.ts[i])); -#endif + char hex_timestamp[2 * WT_TIMESTAMP_SIZE + 1]; + WT_RET(__wt_timestamp_to_hex_string( + ds->session, hex_timestamp, &upd->timestamp)); + WT_RET(ds->f(ds, ", stamp %s", hex_timestamp)); } #endif WT_RET(ds->f(ds, "\n")); @@ -1250,28 +1280,4 @@ __debug_cell_data(WT_DBG *ds, return (ret); } - -/* - * __debug_item -- - * Dump a single data/size pair, with an optional tag. - */ -static int -__debug_item(WT_DBG *ds, const char *tag, const void *data_arg, size_t size) -{ - size_t i; - u_char ch; - const uint8_t *data; - - WT_RET(ds->f(ds, - "\t%s%s{", tag == NULL ? "" : tag, tag == NULL ? "" : " ")); - for (data = data_arg, i = 0; i < size; ++i, ++data) { - ch = data[0]; - if (__wt_isprint(ch)) - WT_RET(ds->f(ds, "%c", (int)ch)); - else - WT_RET(__debug_hex_byte(ds, data[0])); - } - WT_RET(ds->f(ds, "}\n")); - return (0); -} #endif diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index bc9356e2669..806a9770057 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -249,9 +249,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) /* Free the overflow on-page, reuse and transaction-cache skiplists. */ __wt_ovfl_reuse_free(session, page); - if (mod->ovfl_track != NULL) - __wt_free(session, mod->ovfl_track->remove); __wt_ovfl_discard_free(session, page); + __wt_ovfl_discard_remove(session, page); __wt_free(session, page->modify->ovfl_track); __wt_spin_destroy(session, &page->modify->page_lock); diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c index f933245eaef..fab38f3cc8d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c +++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c @@ -49,7 +49,6 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, { WT_DECL_RET; WT_OVFL_TRACK *track; - WT_UPDATE *upd; size_t i; *decoded = false; @@ -74,14 +73,13 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, __wt_readlock(session, &S2BT(session)->ovfl_lock); if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM) { track = page->modify->ovfl_track; - for (upd = NULL, i = 0; i < track->remove_next; ++i) + for (i = 0; i < track->remove_next; ++i) if (track->remove[i].cell == unpack->cell) { - upd = track->remove[i].upd; + store->data = track->remove[i].data; + store->size = track->remove[i].size; break; } WT_ASSERT(session, i < track->remove_next); - store->data = upd->data; - store->size = upd->size; *decoded = true; } else ret = __ovfl_read(session, unpack->data, unpack->size, store); @@ -91,134 +89,56 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, } /* - * __ovfl_cache_col_visible -- - * column-store: check for a globally visible update. + * __wt_ovfl_discard_remove -- + * Free the on-page overflow value cache. */ -static bool -__ovfl_cache_col_visible( - WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack) +void +__wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE *page) { - /* - * Column-store is harder than row_store: we're here because there's a - * reader in the system that might read the original version of an - * overflow record, which might match a number of records. For example, - * the original overflow value was for records 100-200, we've replaced - * each of those records individually, but there exists a reader that - * might read any one of those records, and all of those records have - * different update entries with different transaction IDs. Since it's - * infeasible to determine if there's a globally visible update for each - * reader for each record, we test the simple case where a single record - * has a single, globally visible update. If that's not the case, cache - * the value. - */ - if (__wt_cell_rle(unpack) == 1 && - WT_UPDATE_DATA_VALUE(upd) && __wt_txn_upd_visible_all(session, upd)) - return (true); - return (false); -} - -/* - * __ovfl_cache_row_visible -- - * row-store: check for a globally visible update. - */ -static bool -__ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd) -{ - /* Check to see if there's a globally visible update. */ - for (; upd != NULL; upd = upd->next) - if (WT_UPDATE_DATA_VALUE(upd) && - __wt_txn_upd_visible_all(session, upd)) - return (true); - - return (false); + WT_OVFL_TRACK *track; + uint32_t i; + + if (page->modify != NULL && + (track = page->modify->ovfl_track) != NULL) { + for (i = 0; i < track->remove_next; ++i) + __wt_free(session, track->remove[i].data); + __wt_free(session, page->modify->ovfl_track->remove); + track->remove_allocated = 0; + track->remove_next = 0; + } } /* - * __ovfl_cache_append_update -- - * Append an overflow value to the update list. + * __ovfl_cache -- + * Cache an overflow value. */ static int -__ovfl_cache_append_update(WT_SESSION_IMPL *session, WT_PAGE *page, - WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack, WT_UPDATE **updp) +__ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack) { WT_DECL_ITEM(tmp); WT_DECL_RET; - WT_UPDATE *append, *upd; - size_t size; - - *updp = NULL; + WT_OVFL_TRACK *track; /* Read the overflow value. */ WT_RET(__wt_scr_alloc(session, 1024, &tmp)); WT_ERR(__wt_dsk_cell_data_ref(session, page->type, unpack, tmp)); - /* - * Create an update entry with no transaction ID to ensure global - * visibility, append it to the update list. - * - * We don't need locks or barriers in this function: any thread reading - * the update list will see our newly appended record or not, it doesn't - * matter until the on-page cell is set to WT_CELL_VALUE_OVFL_RM. That - * involves atomic operations which will act as our barrier. Regardless, - * we update the page footprint as part of this operation, which acts as - * a barrier as well. - * - * The update transaction ID choice is tricky, to work around an issue - * in variable-length column store. Imagine an overflow value with an - * RLE greater than 1. We append a copy to the end of an update chain, - * but it's possible it's the overflow value for more than one record, - * and appending it to the end of one record's update chain means a - * subsequent enter of a globally visible value to one of the records - * would allow the truncation of the overflow chain that leaves other - * records without a value. If appending such an overflow record, set - * the transaction ID to the first possible transaction ID. That ID is - * old enough to be globally visible, but we can use it as a flag if an - * update record cannot be discarded when truncating an update chain. - */ - WT_ERR(__wt_update_alloc( - session, tmp, &append, &size, WT_UPDATE_STANDARD)); - append->txnid = page->type == WT_PAGE_COL_VAR && - __wt_cell_rle(unpack) > 1 ? WT_TXN_FIRST : WT_TXN_NONE; - for (upd = upd_list; upd->next != NULL; upd = upd->next) - ; - WT_PUBLISH(upd->next, append); - - __wt_cache_page_inmem_incr(session, page, size); - - *updp = append; - -err: __wt_scr_free(session, &tmp); - return (ret); -} - -/* - * __ovfl_cache -- - * Cache an overflow value. - */ -static int -__ovfl_cache(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) -{ - WT_OVFL_TRACK *track; - WT_UPDATE *upd; - - /* Append a copy of the overflow value to the update list. */ - WT_RET(__ovfl_cache_append_update( - session, page, upd_list, unpack, &upd)); - /* Allocating tracking structures as necessary. */ if (page->modify->ovfl_track == NULL) - WT_RET(__wt_ovfl_track_init(session, page)); + WT_ERR(__wt_ovfl_track_init(session, page)); track = page->modify->ovfl_track; - /* Add the value's information to the update list. */ - WT_RET(__wt_realloc_def(session, + /* Copy the overflow item into place. */ + WT_ERR(__wt_realloc_def(session, &track->remove_allocated, track->remove_next + 1, &track->remove)); track->remove[track->remove_next].cell = unpack->cell; - track->remove[track->remove_next].upd = upd; + WT_ERR(__wt_strndup(session, + tmp->data, tmp->size, &track->remove[track->remove_next].data)); + track->remove[track->remove_next].size = tmp->size; ++track->remove_next; - return (0); +err: __wt_scr_free(session, &tmp); + return (ret); } /* @@ -227,12 +147,14 @@ __ovfl_cache(WT_SESSION_IMPL *session, */ int __wt_ovfl_remove(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) + WT_PAGE *page, WT_CELL_UNPACK *unpack, bool checkpoint) { - bool visible; - /* - * This function solves a problem in reconciliation. The scenario is: + * This function solves two problems in reconciliation. + * + * The first problem is snapshot readers needing on-page overflow values + * that have been removed. The scenario is as follows: + * * - reconciling a leaf page that references an overflow item * - the item is updated and the update committed * - a checkpoint runs, freeing the backing overflow blocks @@ -263,28 +185,16 @@ __wt_ovfl_remove(WT_SESSION_IMPL *session, * per overflow item. We don't do any of that because overflow values * are supposed to be rare and we shouldn't see contention for the lock. * - * Check for a globally visible update. If there is a globally visible - * update, we don't need to cache the item because it's not possible for - * a running thread to have moved past it. - */ - switch (page->type) { - case WT_PAGE_COL_VAR: - visible = __ovfl_cache_col_visible(session, upd_list, unpack); - break; - case WT_PAGE_ROW_LEAF: - visible = __ovfl_cache_row_visible(session, upd_list); - break; - WT_ILLEGAL_VALUE(session); - } - - /* - * If there's no globally visible update, there's a reader in the system - * that might try and read the old value, cache it. + * We only have to do this for checkpoints: in any eviction mode, there + * can't be threads sitting in our update lists. */ - if (!visible) - WT_RET(__ovfl_cache(session, page, upd_list, unpack)); + if (checkpoint) + WT_RET(__ovfl_cache(session, page, unpack)); /* + * The second problem is to only remove the underlying blocks once, + * solved by the WT_CELL_VALUE_OVFL_RM flag. + * * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the * underlying overflow value's blocks to be freed when reconciliation * completes. diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 49b12b2d4e9..0c3cb026421 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -96,7 +96,7 @@ __col_instantiate(WT_SESSION_IMPL *session, /* Search the page and add updates. */ WT_RET(__wt_col_search(session, recno, ref, cbt)); WT_RET(__wt_col_modify( - session, cbt, recno, NULL, updlist, updlist->type, false)); + session, cbt, recno, NULL, updlist, WT_UPDATE_INVALID, false)); return (0); } @@ -121,7 +121,7 @@ __row_instantiate(WT_SESSION_IMPL *session, /* Search the page and add updates. */ WT_RET(__wt_row_search(session, key, ref, cbt, true)); WT_RET(__wt_row_modify( - session, cbt, key, NULL, updlist, updlist->type, false)); + session, cbt, key, NULL, updlist, WT_UPDATE_INVALID, false)); return (0); } diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index a0db4457f62..ac90d6693d3 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -1446,8 +1446,8 @@ __split_multi_inmem( WT_ERR(__wt_col_search(session, recno, ref, &cbt)); /* Apply the modification. */ - WT_ERR(__wt_col_modify( - session, &cbt, recno, NULL, upd, upd->type, true)); + WT_ERR(__wt_col_modify(session, &cbt, + recno, NULL, upd, WT_UPDATE_INVALID, true)); break; case WT_PAGE_ROW_LEAF: /* Build a key. */ @@ -1468,8 +1468,8 @@ __split_multi_inmem( WT_ERR(__wt_row_search(session, key, ref, &cbt, true)); /* Apply the modification. */ - WT_ERR(__wt_row_modify( - session, &cbt, key, NULL, upd, upd->type, true)); + WT_ERR(__wt_row_modify(session, + &cbt, key, NULL, upd, WT_UPDATE_INVALID, true)); break; WT_ILLEGAL_VALUE_ERR(session); } diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index 3fdafcebfb9..261c0fc1937 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -634,8 +634,7 @@ err: WT_LEAVE_PAGE_INDEX(session); int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) { - return (__tree_walk_internal( - session, refp, NULL, NULL, NULL, flags)); + return (__tree_walk_internal(session, refp, NULL, NULL, NULL, flags)); } /* @@ -661,8 +660,8 @@ __wt_tree_walk_custom_skip( int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *), void *func_cookie, uint32_t flags) { - return (__tree_walk_internal(session, refp, - NULL, skip_func, func_cookie, flags)); + return (__tree_walk_internal( + session, refp, NULL, skip_func, func_cookie, flags)); } /* diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index 6e610b86376..5e84899999a 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -263,6 +263,13 @@ __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, *updp = NULL; /* + * The code paths leading here are convoluted: assert we never attempt + * to allocate an update structure if only intending to insert one we + * already have. + */ + WT_ASSERT(session, modify_type != WT_UPDATE_INVALID); + + /* * Allocate the WT_UPDATE structure and room for the value, then copy * the value into place. */ @@ -304,14 +311,11 @@ __wt_update_obsolete_check( * Walk the list of updates, looking for obsolete updates at the end. * * Only updates with globally visible, self-contained data can terminate - * update chains, ignore modified and reserved updates. Special case the - * first transaction ID, it flags column-store overflow values which can - * never be discarded. + * update chains. */ for (first = NULL, count = 0; upd != NULL; upd = upd->next, count++) if (WT_UPDATE_DATA_VALUE(upd) && - __wt_txn_upd_visible_all(session, upd) && - upd->txnid != WT_TXN_FIRST) { + __wt_txn_upd_visible_all(session, upd)) { if (first == NULL) first = upd; } else if (upd->txnid != WT_TXN_ABORTED) diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index c53a63ccb25..764006b024d 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -172,8 +172,8 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\"," "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," - "\"thread_group\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"thread_group\",\"timestamp\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -225,7 +225,6 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_checkpoint[] = { { "drop", "list", NULL, NULL, NULL, 0 }, { "force", "boolean", NULL, NULL, NULL, 0 }, { "name", "string", NULL, NULL, NULL, 0 }, - { "read_timestamp", "string", NULL, NULL, NULL, 0 }, { "target", "list", NULL, NULL, NULL, 0 }, { "use_timestamp", "boolean", NULL, NULL, NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } @@ -802,8 +801,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\"," "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," - "\"thread_group\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"thread_group\",\"timestamp\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", @@ -897,8 +896,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\"," "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," - "\"thread_group\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"thread_group\",\"timestamp\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -987,8 +986,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\"," "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," - "\"thread_group\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"thread_group\",\"timestamp\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -1077,8 +1076,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\"," "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," - "\"thread_group\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"thread_group\",\"timestamp\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", @@ -1173,9 +1172,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { confchk_WT_SESSION_begin_transaction, 6 }, { "WT_SESSION.checkpoint", - "drop=,force=false,name=,read_timestamp=,target=," - "use_timestamp=true", - confchk_WT_SESSION_checkpoint, 6 + "drop=,force=false,name=,target=,use_timestamp=true", + confchk_WT_SESSION_checkpoint, 5 }, { "WT_SESSION.close", "", diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index df71ddf18f6..b29b6184ce3 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -8,8 +8,6 @@ #include "wt_internal.h" -static int __conn_statistics_config(WT_SESSION_IMPL *, const char *[]); - /* * ext_collate -- * Call the collation function (external API version). @@ -190,45 +188,6 @@ __wt_conn_remove_collator(WT_SESSION_IMPL *session) } /* - * __conn_compat_config -- - * Configure compatibility version. - */ -static int -__conn_compat_config(WT_SESSION_IMPL *session, const char **cfg) -{ - WT_CONFIG_ITEM cval; - WT_CONNECTION_IMPL *conn; - uint16_t patch; - - conn = S2C(session); - WT_RET(__wt_config_gets(session, cfg, - "compatibility.release", &cval)); - if (cval.len != 0) { - /* - * Accept either a major.minor release string or a - * major.minor.patch release string. We ignore the patch - * value, but allow it in the string. - */ - if (sscanf(cval.str, "%" SCNu16 ".%" SCNu16, - &conn->compat_major, &conn->compat_minor) != 2 && - sscanf(cval.str, "%" SCNu16 ".%" SCNu16 ".%" SCNu16, - &conn->compat_major, &conn->compat_minor, &patch) != 3) - WT_RET_MSG(session, - EINVAL, "illegal compatibility release"); - if (conn->compat_major > WIREDTIGER_VERSION_MAJOR) - WT_RET_MSG(session, EINVAL, "unknown major version"); - if (conn->compat_major == WIREDTIGER_VERSION_MAJOR && - conn->compat_minor > WIREDTIGER_VERSION_MINOR) - WT_RET_MSG(session, - EINVAL, "illegal compatibility version"); - } else { - conn->compat_major = WIREDTIGER_VERSION_MAJOR; - conn->compat_minor = WIREDTIGER_VERSION_MINOR; - } - return (0); -} - -/* * __compressor_confchk -- * Validate the compressor. */ @@ -1143,57 +1102,12 @@ __conn_reconfigure(WT_CONNECTION *wt_conn, const char *config) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; - const char *p; - bool locked; conn = (WT_CONNECTION_IMPL *)wt_conn; - locked = false; CONNECTION_API_CALL(conn, session, reconfigure, config, cfg); - - /* Serialize reconfiguration. */ - __wt_spin_lock(session, &conn->reconfig_lock); - locked = true; - - /* - * The configuration argument has been checked for validity, update the - * previous connection configuration. - * - * DO NOT merge the configuration before the reconfigure calls. Some - * of the underlying reconfiguration functions do explicit checks with - * the second element of the configuration array, knowing the defaults - * are in slot #1 and the application's modifications are in slot #2. - * - * First, replace the base configuration set up by CONNECTION_API_CALL - * with the current connection configuration, otherwise reconfiguration - * functions will find the base value instead of previously configured - * value. - */ - cfg[0] = conn->cfg; - cfg[1] = config; - - /* Second, reconfigure the system. */ - WT_ERR(__conn_compat_config(session, cfg)); - WT_ERR(__conn_statistics_config(session, cfg)); - WT_ERR(__wt_async_reconfig(session, cfg)); - WT_ERR(__wt_cache_config(session, true, cfg)); - WT_ERR(__wt_checkpoint_server_create(session, cfg)); - WT_ERR(__wt_logmgr_reconfig(session, cfg)); - WT_ERR(__wt_lsm_manager_reconfig(session, cfg)); - WT_ERR(__wt_statlog_create(session, cfg)); - WT_ERR(__wt_sweep_config(session, cfg)); - WT_ERR(__wt_verbose_config(session, cfg)); - WT_ERR(__wt_timing_stress_config(session, cfg)); - - /* Third, merge everything together, creating a new connection state. */ - WT_ERR(__wt_config_merge(session, cfg, NULL, &p)); - __wt_free(session, conn->cfg); - conn->cfg = p; - -err: if (locked) - __wt_spin_unlock(session, &conn->reconfig_lock); - - API_END_RET(session, ret); + ret = __wt_conn_reconfig(session, cfg); +err: API_END_RET(session, ret); } /* @@ -1274,8 +1188,7 @@ __conn_rollback_to_stable(WT_CONNECTION *wt_conn, const char *config) conn = (WT_CONNECTION_IMPL *)wt_conn; - CONNECTION_API_CALL( - conn, session, rollback_to_stable, config, cfg); + CONNECTION_API_CALL(conn, session, rollback_to_stable, config, cfg); WT_TRET(__wt_txn_rollback_to_stable(session, cfg)); err: API_END_RET(session, ret); } @@ -1788,94 +1701,6 @@ err: /* return (ret); } -/* - * __conn_statistics_config -- - * Set statistics configuration. - */ -static int -__conn_statistics_config(WT_SESSION_IMPL *session, const char *cfg[]) -{ - WT_CONFIG_ITEM cval, sval; - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - uint32_t flags; - int set; - - conn = S2C(session); - - WT_RET(__wt_config_gets(session, cfg, "statistics", &cval)); - - flags = 0; - set = 0; - if ((ret = __wt_config_subgets( - session, &cval, "none", &sval)) == 0 && sval.val != 0) { - flags = 0; - ++set; - } - WT_RET_NOTFOUND_OK(ret); - - if ((ret = __wt_config_subgets( - session, &cval, "fast", &sval)) == 0 && sval.val != 0) { - LF_SET(WT_STAT_TYPE_FAST); - ++set; - } - WT_RET_NOTFOUND_OK(ret); - - if ((ret = __wt_config_subgets( - session, &cval, "all", &sval)) == 0 && sval.val != 0) { - LF_SET( - WT_STAT_TYPE_ALL | WT_STAT_TYPE_CACHE_WALK | - WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); - ++set; - } - WT_RET_NOTFOUND_OK(ret); - - if (set > 1) - WT_RET_MSG(session, EINVAL, - "Only one of all, fast, none configuration values should " - "be specified"); - - /* - * Now that we've parsed general statistics categories, process - * sub-categories. - */ - if ((ret = __wt_config_subgets( - session, &cval, "cache_walk", &sval)) == 0 && sval.val != 0) - /* - * Configuring cache walk statistics implies fast statistics. - * Keep that knowledge internal for now - it may change in the - * future. - */ - LF_SET(WT_STAT_TYPE_FAST | WT_STAT_TYPE_CACHE_WALK); - WT_RET_NOTFOUND_OK(ret); - - if ((ret = __wt_config_subgets( - session, &cval, "tree_walk", &sval)) == 0 && sval.val != 0) - /* - * Configuring tree walk statistics implies fast statistics. - * Keep that knowledge internal for now - it may change in the - * future. - */ - LF_SET(WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); - WT_RET_NOTFOUND_OK(ret); - - if ((ret = __wt_config_subgets( - session, &cval, "clear", &sval)) == 0 && sval.val != 0) { - if (!LF_ISSET(WT_STAT_TYPE_ALL | WT_STAT_TYPE_CACHE_WALK | - WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK)) - WT_RET_MSG(session, EINVAL, - "the value \"clear\" can only be specified if " - "statistics are enabled"); - LF_SET(WT_STAT_CLEAR); - } - WT_RET_NOTFOUND_OK(ret); - - /* Configuring statistics clears any existing values. */ - conn->stat_flags = flags; - - return (0); -} - /* Simple structure for name and flag configuration searches. */ typedef struct { const char *name; @@ -1916,6 +1741,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "split", WT_VERB_SPLIT }, { "temporary", WT_VERB_TEMPORARY }, { "thread_group", WT_VERB_THREAD_GROUP }, + { "timestamp", WT_VERB_TIMESTAMP }, { "transaction", WT_VERB_TRANSACTION }, { "verify", WT_VERB_VERIFY }, { "version", WT_VERB_VERSION }, @@ -2344,7 +2170,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, /* * Set compatibility versions early so that any subsystem sees it. */ - WT_ERR(__conn_compat_config(session, cfg)); + WT_ERR(__wt_conn_compat_config(session, cfg)); /* * If the application didn't configure its own file system, configure @@ -2531,7 +2357,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval)); conn->mmap = cval.val != 0; - WT_ERR(__conn_statistics_config(session, cfg)); + WT_ERR(__wt_conn_statistics_config(session, cfg)); WT_ERR(__wt_lsm_manager_config(session, cfg)); WT_ERR(__wt_sweep_config(session, cfg)); diff --git a/src/third_party/wiredtiger/src/conn/conn_reconfig.c b/src/third_party/wiredtiger/src/conn/conn_reconfig.c new file mode 100644 index 00000000000..e67f2c9a18d --- /dev/null +++ b/src/third_party/wiredtiger/src/conn/conn_reconfig.c @@ -0,0 +1,210 @@ +/*- + * Copyright (c) 2014-2017 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_conn_compat_config -- + * Configure compatibility version. + */ +int +__wt_conn_compat_config(WT_SESSION_IMPL *session, const char **cfg) +{ + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + uint16_t patch; + bool txn_active; + + conn = S2C(session); + WT_RET(__wt_config_gets(session, cfg, + "compatibility.release", &cval)); + if (cval.len == 0) { + conn->compat_major = WIREDTIGER_VERSION_MAJOR; + conn->compat_minor = WIREDTIGER_VERSION_MINOR; + return (0); + } + + /* + * Accept either a major.minor release string or a + * major.minor.patch release string. We ignore the patch + * value, but allow it in the string. + */ + if (sscanf(cval.str, "%" SCNu16 ".%" SCNu16, + &conn->compat_major, &conn->compat_minor) != 2 && + sscanf(cval.str, "%" SCNu16 ".%" SCNu16 ".%" SCNu16, + &conn->compat_major, &conn->compat_minor, &patch) != 3) + WT_RET_MSG(session, EINVAL, "illegal compatibility release"); + if (conn->compat_major > WIREDTIGER_VERSION_MAJOR) + WT_RET_MSG(session, EINVAL, "unknown major version"); + if (conn->compat_major == WIREDTIGER_VERSION_MAJOR && + conn->compat_minor > WIREDTIGER_VERSION_MINOR) + WT_RET_MSG(session, EINVAL, "illegal compatibility version"); + + /* + * We're doing an upgrade or downgrade, check whether transactions are + * active. + */ + WT_RET(__wt_txn_activity_check(session, &txn_active)); + if (txn_active) + WT_RET_MSG(session, ENOTSUP, + "upgrade / downgrade must run single-threaded"); + return (0); +} + +/* + * __wt_conn_statistics_config -- + * Set statistics configuration. + */ +int +__wt_conn_statistics_config(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONFIG_ITEM cval, sval; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + uint32_t flags; + int set; + + conn = S2C(session); + + WT_RET(__wt_config_gets(session, cfg, "statistics", &cval)); + + flags = 0; + set = 0; + if ((ret = __wt_config_subgets( + session, &cval, "none", &sval)) == 0 && sval.val != 0) { + flags = 0; + ++set; + } + WT_RET_NOTFOUND_OK(ret); + + if ((ret = __wt_config_subgets( + session, &cval, "fast", &sval)) == 0 && sval.val != 0) { + LF_SET(WT_STAT_TYPE_FAST); + ++set; + } + WT_RET_NOTFOUND_OK(ret); + + if ((ret = __wt_config_subgets( + session, &cval, "all", &sval)) == 0 && sval.val != 0) { + LF_SET( + WT_STAT_TYPE_ALL | WT_STAT_TYPE_CACHE_WALK | + WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); + ++set; + } + WT_RET_NOTFOUND_OK(ret); + + if (set > 1) + WT_RET_MSG(session, EINVAL, + "Only one of all, fast, none configuration values should " + "be specified"); + + /* + * Now that we've parsed general statistics categories, process + * sub-categories. + */ + if ((ret = __wt_config_subgets( + session, &cval, "cache_walk", &sval)) == 0 && sval.val != 0) + /* + * Configuring cache walk statistics implies fast statistics. + * Keep that knowledge internal for now - it may change in the + * future. + */ + LF_SET(WT_STAT_TYPE_FAST | WT_STAT_TYPE_CACHE_WALK); + WT_RET_NOTFOUND_OK(ret); + + if ((ret = __wt_config_subgets( + session, &cval, "tree_walk", &sval)) == 0 && sval.val != 0) + /* + * Configuring tree walk statistics implies fast statistics. + * Keep that knowledge internal for now - it may change in the + * future. + */ + LF_SET(WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); + WT_RET_NOTFOUND_OK(ret); + + if ((ret = __wt_config_subgets( + session, &cval, "clear", &sval)) == 0 && sval.val != 0) { + if (!LF_ISSET(WT_STAT_TYPE_ALL | WT_STAT_TYPE_CACHE_WALK | + WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK)) + WT_RET_MSG(session, EINVAL, + "the value \"clear\" can only be specified if " + "statistics are enabled"); + LF_SET(WT_STAT_CLEAR); + } + WT_RET_NOTFOUND_OK(ret); + + /* Configuring statistics clears any existing values. */ + conn->stat_flags = flags; + + return (0); +} + +/* + * __wt_conn_reconfig -- + * Reconfigure a connection (internal version). + */ +int +__wt_conn_reconfig(WT_SESSION_IMPL *session, const char **cfg) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + const char *p; + + conn = S2C(session); + + /* Serialize reconfiguration. */ + __wt_spin_lock(session, &conn->reconfig_lock); + + /* + * The configuration argument has been checked for validity, update the + * previous connection configuration. + * + * DO NOT merge the configuration before the reconfigure calls. Some + * of the underlying reconfiguration functions do explicit checks with + * the second element of the configuration array, knowing the defaults + * are in slot #1 and the application's modifications are in slot #2. + * + * Replace the base configuration set up by CONNECTION_API_CALL with + * the current connection configuration, otherwise reconfiguration + * functions will find the base value instead of previously configured + * value. + */ + cfg[0] = conn->cfg; + + /* + * Reconfigure the system. + * + * The compatibility version check is special: upgrade / downgrade + * cannot be done with transactions active, and checkpoints must not + * span a version change. Hold the checkpoint lock to avoid conflicts + * with WiredTiger's checkpoint thread, and rely on the documentation + * specifying that no new operations can start until the upgrade / + * downgrade completes. + */ + WT_WITH_CHECKPOINT_LOCK(session, + ret = __wt_conn_compat_config(session, cfg)); + WT_ERR(__wt_conn_statistics_config(session, cfg)); + WT_ERR(__wt_async_reconfig(session, cfg)); + WT_ERR(__wt_cache_config(session, true, cfg)); + WT_ERR(__wt_checkpoint_server_create(session, cfg)); + WT_ERR(__wt_logmgr_reconfig(session, cfg)); + WT_ERR(__wt_lsm_manager_reconfig(session, cfg)); + WT_ERR(__wt_statlog_create(session, cfg)); + WT_ERR(__wt_sweep_config(session, cfg)); + WT_ERR(__wt_verbose_config(session, cfg)); + WT_ERR(__wt_timing_stress_config(session, cfg)); + + /* Third, merge everything together, creating a new connection state. */ + WT_ERR(__wt_config_merge(session, cfg, NULL, &p)); + __wt_free(session, conn->cfg); + conn->cfg = p; + +err: __wt_spin_unlock(session, &conn->reconfig_lock); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c index 10de133be75..087c811747a 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_ds.c +++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c @@ -38,17 +38,16 @@ static int __curds_key_set(WT_CURSOR *cursor) { WT_CURSOR *source; - WT_DECL_RET; source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; - WT_ERR(__cursor_needkey(cursor)); + WT_RET(__cursor_needkey(cursor)); source->recno = cursor->recno; source->key.data = cursor->key.data; source->key.size = cursor->key.size; -err: return (ret); + return (0); } /* @@ -59,16 +58,15 @@ static int __curds_value_set(WT_CURSOR *cursor) { WT_CURSOR *source; - WT_DECL_RET; source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; - WT_ERR(__cursor_needvalue(cursor)); + WT_RET(__cursor_needvalue(cursor)); source->value.data = cursor->value.data; source->value.size = cursor->value.size; -err: return (ret); + return (0); } /* diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c index 855ad70d6e0..e3ae9dbd9f6 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_join.c +++ b/src/third_party/wiredtiger/src/cursor/cur_join.c @@ -499,9 +499,7 @@ __curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, passed = (cmp < 0); break; - default: - WT_RET(__wt_illegal_value(session, NULL)); - break; + WT_ILLEGAL_VALUE(session); } if (!passed) { diff --git a/src/third_party/wiredtiger/src/docs/Doxyfile b/src/third_party/wiredtiger/src/docs/Doxyfile index 8292df18e47..e95d8babe48 100644 --- a/src/third_party/wiredtiger/src/docs/Doxyfile +++ b/src/third_party/wiredtiger/src/docs/Doxyfile @@ -206,8 +206,8 @@ TAB_SIZE = 8 # You can put \n's in the value part of an alias to insert newlines. ALIASES = "notyet{1}=Note: <b>"\1"</b> not yet supported in WiredTiger.\n@todo fix when \1 supported\n\n" \ - "errors=@returns zero on success and a non-zero error code on failure. See @ref error_returns \"Error Returns\" for details." \ - "ebusy_errors=@returns zero on success, EBUSY if there are open cursors on the object and a non-zero error code on failure. See @ref error_returns \"Error Returns\" for details." \ + "errors=@returns zero on success and a non-zero error code on failure. See @ref error_handling \"Error handling\" for details." \ + "ebusy_errors=@returns zero on success, EBUSY if the object is not available for exclusive access, and a non-zero error code on failure. See @ref error_handling \"Error handling\" for details." \ "ex_ref{1}=@ref \1 \"\1\"" \ "ref_single=@ref" \ "subpage_single=@subpage" \ diff --git a/src/third_party/wiredtiger/src/docs/error-handling.dox b/src/third_party/wiredtiger/src/docs/error-handling.dox index eb9ca6bb82a..7b7c0cd03d5 100644 --- a/src/third_party/wiredtiger/src/docs/error-handling.dox +++ b/src/third_party/wiredtiger/src/docs/error-handling.dox @@ -3,11 +3,11 @@ WiredTiger operations return a value of 0 on success and a non-zero value on error. Error codes may be either positive or negative: positive error codes are standard error codes as described for -POSIX-like systems (for example, EINVAL or EBUSY), negative error codes -are WiredTiger-specific (for example, WT_ROLLBACK). +POSIX-like systems (for example, \c EINVAL or \c EBUSY), negative error +codes are WiredTiger-specific (for example, \c WT_ROLLBACK). WiredTiger-specific error codes always appear in the -31,800 to -31,999 -range. +range, inclusive. @m_if{java} Informational return values, like <code>wiredtiger.WT_NOTFOUND</code> @@ -29,11 +29,22 @@ correctly-written WiredTiger application will likely catch errors. Note that no further WiredTiger calls are required after \c WiredTigerPanicException is caught (and further calls will themselves immediately fail). +@m_endif + +WiredTiger returns \c EBUSY for operations requiring exclusive access, when +an object is not available for exclusive access. For example, the +WT_SESSION::drop or WT_SESSION::verify methods will fail if the object +has open cursors. Note that internal WiredTiger threads may temporarily +open cursors on objects (for example, threads performing operations like +statistics logging), and operations may temporarily fail and return \c EBUSY +when there are no application cursors open on the object. -The following is a complete list of possible WiredTiger-specific -return values, all constants defined in the com.wiredtiger.db.wiredtiger class: +@m_if{java} +The following is a complete list of the WiredTiger-specific return +values, all constants defined in the com.wiredtiger.db.wiredtiger class: @m_else -The following is a list of possible WiredTiger-specific errors: +The following is a complete list of the WiredTiger-specific return +values: @m_endif @if IGNORE_BUILT_BY_API_ERR_BEGIN diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 01a9179aedc..f0d810281c2 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -386,8 +386,9 @@ struct __wt_page_modify { /* Cached overflow value cell/update address pairs. */ struct { - WT_CELL *cell; - WT_UPDATE *upd; + WT_CELL *cell; + uint8_t *data; + size_t size; } *remove; size_t remove_allocated; uint32_t remove_next; @@ -895,10 +896,11 @@ struct __wt_update { uint32_t size; /* data length */ -#define WT_UPDATE_DELETED 0 /* deleted */ -#define WT_UPDATE_MODIFIED 1 /* partial-update modify value */ -#define WT_UPDATE_RESERVED 2 /* reserved */ -#define WT_UPDATE_STANDARD 3 /* complete value */ +#define WT_UPDATE_INVALID 0 /* diagnostic check */ +#define WT_UPDATE_DELETED 1 /* deleted */ +#define WT_UPDATE_MODIFIED 2 /* partial-update modify value */ +#define WT_UPDATE_RESERVED 3 /* reserved */ +#define WT_UPDATE_STANDARD 4 /* complete value */ uint8_t type; /* type (one byte to conserve memory) */ /* If the update includes a complete value. */ @@ -936,7 +938,7 @@ struct __wt_update { * Limit update chains to a small value to avoid penalizing reads and * permit truncation. */ -#define WT_MAX_MODIFY_UPDATE 100 +#define WT_MAX_MODIFY_UPDATE 10 /* * WT_INSERT -- diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index afd4c874cf1..9a86dbc1a26 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -150,7 +150,8 @@ extern const char *__wt_cell_type_string(uint8_t type); extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf); extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf); extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE *page); +extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, bool checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -278,6 +279,9 @@ extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIB extern int __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_connection_close(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_conn_compat_config(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_conn_statistics_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_conn_reconfig(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_conn_stat_init(WT_SESSION_IMPL *session); extern int __wt_statlog_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_statlog_destroy(WT_SESSION_IMPL *session, bool is_close) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -475,6 +479,7 @@ extern int __wt_lsm_tree_worker(WT_SESSION_IMPL *session, const char *uri, int ( extern int __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool force, WT_LSM_CHUNK **chunkp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_work_switch( WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **entryp, bool *ran) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern bool __wt_lsm_chunk_visible_all( WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk); extern int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -664,7 +669,7 @@ __wt_assert(WT_SESSION_IMPL *session, #endif WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern int __wt_panic(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_illegal_value(WT_SESSION_IMPL *session, const char *name) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_illegal_value_func( WT_SESSION_IMPL *session, const char *tag, const char *file, int line) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_inmem_unsupported_op(WT_SESSION_IMPL *session, const char *tag) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_object_unsupported(WT_SESSION_IMPL *session, const char *uri) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -683,7 +688,6 @@ extern int __wt_stash_add(WT_SESSION_IMPL *session, int which, uint64_t generati extern void __wt_stash_discard_all(WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session); extern int __wt_library_init(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_breakpoint(void); -extern void __wt_attach(WT_SESSION_IMPL *session); extern uint64_t __wt_hash_city64(const void *s, size_t len); extern uint64_t __wt_hash_fnv64(const void *string, size_t len); extern int @@ -809,6 +813,8 @@ extern int __wt_txn_named_snapshot_config(WT_SESSION_IMPL *session, const char * extern void __wt_txn_named_snapshot_destroy(WT_SESSION_IMPL *session); extern int __wt_txn_recover(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_timestamp_to_hex_string( WT_SESSION_IMPL *session, char *hex_timestamp, const wt_timestamp_t *ts_src) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_verbose_timestamp(WT_SESSION_IMPL *session, const wt_timestamp_t *ts, const char *msg); extern int __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_global_query_timestamp( WT_SESSION_IMPL *session, char *hex_timestamp, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h index 243716c2ecb..ccb32900dc4 100644 --- a/src/third_party/wiredtiger/src/include/flags.h +++ b/src/third_party/wiredtiger/src/include/flags.h @@ -113,10 +113,11 @@ #define WT_VERB_SPLIT 0x00800000 #define WT_VERB_TEMPORARY 0x01000000 #define WT_VERB_THREAD_GROUP 0x02000000 -#define WT_VERB_TRANSACTION 0x04000000 -#define WT_VERB_VERIFY 0x08000000 -#define WT_VERB_VERSION 0x10000000 -#define WT_VERB_WRITE 0x20000000 +#define WT_VERB_TIMESTAMP 0x04000000 +#define WT_VERB_TRANSACTION 0x08000000 +#define WT_VERB_VERIFY 0x10000000 +#define WT_VERB_VERSION 0x20000000 +#define WT_VERB_WRITE 0x40000000 #define WT_VISIBILITY_ERR 0x00000080 /* * flags section: END diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h index df7d6c8d5ca..397f17400de 100644 --- a/src/third_party/wiredtiger/src/include/lsm.h +++ b/src/third_party/wiredtiger/src/include/lsm.h @@ -97,6 +97,11 @@ struct __wt_lsm_chunk { * out, or by compact to get the most * recent chunk flushed. */ + WT_DECL_TIMESTAMP(switch_timestamp)/* + * The timestamp used to decide when + * updates need to detect conflicts. + */ + WT_SPINLOCK timestamp_spinlock; uint32_t id; /* ID used to generate URIs */ uint32_t generation; /* Merge generation */ @@ -107,10 +112,11 @@ struct __wt_lsm_chunk { int8_t evicted; /* 1/0: in-memory chunk was evicted */ uint8_t flushing; /* 1/0: chunk flush in progress */ -#define WT_LSM_CHUNK_BLOOM 0x01 -#define WT_LSM_CHUNK_MERGING 0x02 -#define WT_LSM_CHUNK_ONDISK 0x04 -#define WT_LSM_CHUNK_STABLE 0x08 +#define WT_LSM_CHUNK_BLOOM 0x01 +#define WT_LSM_CHUNK_HAS_TIMESTAMP 0x02 +#define WT_LSM_CHUNK_MERGING 0x04 +#define WT_LSM_CHUNK_ONDISK 0x08 +#define WT_LSM_CHUNK_STABLE 0x10 uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index bf7d36e19ca..a6cb56dd852 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -293,6 +293,10 @@ typedef void wt_timestamp_t; __wt_page_swap_func(session, held, want, flags) #endif +/* Called on unexpected code path: locate the failure. */ +#define __wt_illegal_value(session, msg) \ + __wt_illegal_value_func(session, msg, __FILE__, __LINE__) + /* Random number generator state. */ union __wt_rand_state { uint64_t v; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 61ab343151c..e0513a82892 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -69,7 +69,6 @@ struct __wt_named_snapshot { struct __wt_txn_state { WT_CACHE_LINE_PAD_BEGIN - WT_RWLOCK rwlock; volatile uint64_t id; volatile uint64_t pinned_id; volatile uint64_t metadata_pinned; @@ -105,6 +104,9 @@ struct __wt_txn_global { /* Protects the active transaction states. */ WT_RWLOCK rwlock; + /* Protects logging, checkpoints and transaction visibility. */ + WT_RWLOCK visibility_rwlock; + /* List of transactions sorted by commit timestamp. */ WT_RWLOCK commit_timestamp_rwlock; TAILQ_HEAD(__wt_txn_cts_qh, __wt_txn) commit_timestamph; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 30f29e0f5d0..8067b6128c5 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -11,6 +11,8 @@ static inline void __wt_txn_read_last(WT_SESSION_IMPL *session); #ifdef HAVE_TIMESTAMPS #if WT_TIMESTAMP_SIZE == 8 +#define WT_WITH_TIMESTAMP_READLOCK(session, l, e) e + /* * __wt_timestamp_cmp -- * Compare two timestamps. @@ -61,6 +63,12 @@ __wt_timestamp_set_zero(wt_timestamp_t *ts) ts->val = 0; } #else +#define WT_WITH_TIMESTAMP_READLOCK(s, l, e) do { \ + __wt_readlock((s), (l)); \ + e; \ + __wt_readunlock((s), (l)); \ +} while (0) + /* * __wt_timestamp_cmp -- * Compare two timestamps. @@ -90,8 +98,7 @@ __wt_timestamp_iszero(const wt_timestamp_t *ts) { static const wt_timestamp_t zero_timestamp; - return (memcmp(ts->ts, - WT_TIMESTAMP_NULL(&zero_timestamp), WT_TIMESTAMP_SIZE) == 0); + return (memcmp(ts->ts, &zero_timestamp, WT_TIMESTAMP_SIZE) == 0); } /* @@ -182,7 +189,17 @@ __wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd) op->type = F_ISSET(session, WT_SESSION_LOGGING_INMEM) ? WT_TXN_OP_INMEM : WT_TXN_OP_BASIC; #ifdef HAVE_TIMESTAMPS - if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) { + /* + * Mark the update with a timestamp, if we have one. + * + * Updates in the metadata never get timestamps (either now or at + * commit): metadata cannot be read at a point in time, only the most + * recently committed data matches files on disk. + */ + if (WT_IS_METADATA(session->dhandle)) { + if (!F_ISSET(session, WT_SESSION_LOGGING_INMEM)) + op->type = WT_TXN_OP_BASIC_TS; + } else if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) { __wt_timestamp_set(&upd->timestamp, &txn->commit_timestamp); if (!F_ISSET(session, WT_SESSION_LOGGING_INMEM)) op->type = WT_TXN_OP_BASIC_TS; @@ -285,9 +302,9 @@ __txn_visible_all_id(WT_SESSION_IMPL *session, uint64_t id) /* * __wt_txn_visible_all -- - * Check if a given transaction is "globally visible". This is, if - * all sessions in the system will see the transaction ID including the - * ID that belongs to a running checkpoint. + * Check if a given transaction is "globally visible". This is, if all + * sessions in the system will see the transaction ID including the ID + * that belongs to a running checkpoint. */ static inline bool __wt_txn_visible_all( @@ -302,12 +319,18 @@ __wt_txn_visible_all( int cmp; /* Timestamp check. */ - if (!txn_global->has_pinned_timestamp || timestamp == NULL) + if (timestamp == NULL || __wt_timestamp_iszero(timestamp)) return (true); - __wt_readlock(session, &txn_global->rwlock); - cmp = __wt_timestamp_cmp(timestamp, &txn_global->pinned_timestamp); - __wt_readunlock(session, &txn_global->rwlock); + /* + * If no oldest timestamp has been supplied, updates have to stay in + * cache until we are shutting down. + */ + if (!txn_global->has_pinned_timestamp) + return (F_ISSET(S2C(session), WT_CONN_CLOSING)); + + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + cmp = __wt_timestamp_cmp(timestamp, &txn_global->pinned_timestamp)); /* * We can discard updates with timestamps less than or equal to the @@ -581,8 +604,7 @@ __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish) /* * __wt_txn_id_check -- - * A transaction is going to do an update, start an auto commit - * transaction if required and allocate a transaction ID. + * A transaction is going to do an update, allocate a transaction ID. */ static inline int __wt_txn_id_check(WT_SESSION_IMPL *session) @@ -606,7 +628,7 @@ __wt_txn_id_check(WT_SESSION_IMPL *session) * more we can do. */ if (txn->id == WT_TXN_ABORTED) - WT_RET_MSG(session, ENOMEM, "Out of transaction IDs"); + WT_RET_MSG(session, WT_ERROR, "out of transaction IDs"); F_SET(txn, WT_TXN_HAS_ID); return (0); @@ -730,11 +752,11 @@ __wt_txn_am_oldest(WT_SESSION_IMPL *session) } /* - * __wt_txn_are_any_active -- + * __wt_txn_activity_check -- * Check whether there are any running transactions. */ static inline int -__wt_txn_are_any_active(WT_SESSION_IMPL *session, bool *any_active) +__wt_txn_activity_check(WT_SESSION_IMPL *session, bool *txn_active) { WT_TXN_GLOBAL *txn_global; @@ -747,6 +769,8 @@ __wt_txn_are_any_active(WT_SESSION_IMPL *session, bool *any_active) WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); - *any_active = (txn_global->oldest_id != txn_global->current); + *txn_active = (txn_global->oldest_id != txn_global->current || + txn_global->metadata_pinned != txn_global->current); + return (0); } diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 5d087447c5a..7825962d89f 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -504,6 +504,12 @@ struct __wt_cursor { * (as it partially depends on the underlying file configuration), but * is always a small number of bytes less than 4GB. * + * The WT_CURSOR::modify method stores a change record in cache and + * writes a change record to the log, instead of the usual complete + * value. This can reduce cache and logging requirements, but may result + * in slower reads because the complete value must be assembled during + * retrieval. + * * @param cursor the cursor handle * @param entries an array of modification data structures * @param nentries the number of modification data structures @@ -1537,7 +1543,7 @@ struct __wt_session { * @snippet ex_all.c Reset the session * * @param session the session handle - * @ebusy_errors + * @errors */ int __F(reset)(WT_SESSION *session); @@ -1998,8 +2004,10 @@ struct __wt_connection { * checkpoint; setting this value above 0 configures periodic * checkpoints., an integer between 0 and 100000; default \c 0.} * @config{ ),,} - * @config{compatibility = (, set compatibility version of database., a - * set of related configuration options defined below.} + * @config{compatibility = (, set compatibility version of database. + * Changing the compatibility version requires that there are no active + * operations for the duration of the call., a set of related + * configuration options defined below.} * @config{ release, compatibility release * version string., a string; default empty.} * @config{ ),,} @@ -2143,8 +2151,9 @@ struct __wt_connection { * "metadata"\, \c "mutex"\, \c "overflow"\, \c "read"\, \c * "rebalance"\, \c "reconcile"\, \c "recovery"\, \c * "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, - * \c "temporary"\, \c "thread_group"\, \c "transaction"\, \c "verify"\, - * \c "version"\, \c "write"; default empty.} + * \c "temporary"\, \c "thread_group"\, \c "timestamp"\, \c + * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default + * empty.} * @configend * @errors */ @@ -2258,6 +2267,8 @@ struct __wt_connection { * * @snippet ex_all.c set oldest timestamp * + * @snippet ex_all.c set stable timestamp + * * @param connection the connection handle * @configstart{WT_CONNECTION.set_timestamp, see dist/api_data.py} * @config{commit_timestamp, reset the maximum commit timestamp tracked @@ -2292,8 +2303,8 @@ struct __wt_connection { * WT_CONNECTION::set_timestamp. Any updates to checkpoint durable * tables that are more recent than the stable timestamp are removed. * - * This method requires that there are no active cursor operations - * for the duration of the call. + * This method requires that there are no active operations for the + * duration of the call. * * Any updates made to logged tables will not be rolled back. Any * updates made without an associated timestamp will not be rolled @@ -2527,10 +2538,12 @@ struct __wt_connection { * @config{ ),,} * @config{checkpoint_sync, flush files to stable storage when closing or * writing checkpoints., a boolean flag; default \c true.} - * @config{compatibility = (, set compatibility version of database., a set of - * related configuration options defined below.} - * @config{ release, compatibility release version - * string., a string; default empty.} + * @config{compatibility = (, set compatibility version of database. Changing + * the compatibility version requires that there are no active operations for + * the duration of the call., a set of related configuration options defined + * below.} + * @config{ release, compatibility release + * version string., a string; default empty.} * @config{ ),,} * @config{config_base, write the base configuration file if creating the * database. If \c false in the config passed directly to ::wiredtiger_open\, @@ -2766,8 +2779,8 @@ struct __wt_connection { * "lookaside_activity"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c * "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c - * "split"\, \c "temporary"\, \c "thread_group"\, \c "transaction"\, \c - * "verify"\, \c "version"\, \c "write"; default empty.} + * "split"\, \c "temporary"\, \c "thread_group"\, \c "timestamp"\, \c + * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default empty.} * @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to * files. Ignored on non-Windows systems. Options are given as a list\, such * as <code>"write_through=[data]"</code>. Configuring \c write_through requires @@ -3301,7 +3314,6 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp) * Error returns *******************************************/ /*! - * @anchor error_returns * @name Error returns * Most functions and methods in WiredTiger return an integer code indicating * whether the operation succeeded or failed. A return of zero indicates diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index 6a1709b03f2..39656c17ee0 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -538,8 +538,7 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { chunk = lsm_tree->chunk[ngood - 1]; clsm->chunks[ngood - 1]->switch_txn = chunk->switch_txn; - if (__wt_txn_visible_all( - session, chunk->switch_txn, NULL)) + if (__wt_lsm_chunk_visible_all(session, chunk)) break; } } else { @@ -937,10 +936,9 @@ retry: /* goto retry; err: __clsm_leave(clsm); - API_END(session, ret); if (ret == 0) __clsm_deleted_decode(clsm, &cursor->value); - return (ret); + API_END_RET(session, ret); } /* @@ -1029,8 +1027,7 @@ __clsm_next_random(WT_CURSOR *cursor) err: F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); } __clsm_leave(clsm); - API_END(session, ret); - return (ret); + API_END_RET(session, ret); } /* @@ -1116,10 +1113,9 @@ retry: /* goto retry; err: __clsm_leave(clsm); - API_END(session, ret); if (ret == 0) __clsm_deleted_decode(clsm, &cursor->value); - return (ret); + API_END_RET(session, ret); } /* @@ -1275,10 +1271,9 @@ __clsm_search(WT_CURSOR *cursor) ret = __clsm_lookup(clsm, &cursor->value); err: __clsm_leave(clsm); - API_END(session, ret); if (ret == 0) __clsm_deleted_decode(clsm, &cursor->value); - return (ret); + API_END_RET(session, ret); } /* @@ -1418,7 +1413,6 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) *exactp = cmp; err: __clsm_leave(clsm); - API_END(session, ret); if (closest != NULL) WT_TRET(closest->reset(closest)); @@ -1428,7 +1422,7 @@ err: __clsm_leave(clsm); } else clsm->current = NULL; - return (ret); + API_END_RET(session, ret); } /* diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c index 24a0429a184..3949d88cec4 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c @@ -208,14 +208,20 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session) conn = S2C(session); manager = &conn->lsm_manager; - if (F_ISSET(conn, WT_CONN_READONLY)) { - manager->lsm_workers = 0; - return (0); - } /* - * We need at least a manager, a switch thread and a generic - * worker. + * If readonly or the manager is running, or we've already failed, + * there's no work to do. */ + if (F_ISSET(conn, WT_CONN_READONLY) || + manager->lsm_workers != 0 || + F_ISSET(manager, WT_LSM_MANAGER_SHUTDOWN)) + return (0); + + /* It's possible to race, see if we're the winner. */ + if (!__wt_atomic_cas32(&manager->lsm_workers, 0, 1)) + return (0); + + /* We need at least a manager, a switch thread and a generic worker. */ WT_ASSERT(session, manager->lsm_workers_max > 2); /* @@ -245,6 +251,15 @@ err: for (i = 0; i++) WT_TRET((&worker_session->iface)->close( &worker_session->iface, NULL)); + + /* Make the failure permanent, we won't try again. */ + F_SET(manager, WT_LSM_MANAGER_SHUTDOWN); + + /* + * Reset the workers count (otherwise, LSM destroy will hang + * waiting for threads to exit. + */ + WT_PUBLISH(manager->lsm_workers, 0); } return (ret); } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c index 18e1f6d3115..e6eccf96467 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -35,6 +35,7 @@ __lsm_tree_discard_state(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if ((chunk = lsm_tree->chunk[i]) == NULL) continue; + __wt_spin_destroy(session, &chunk->timestamp_spinlock); __wt_free(session, chunk->bloom_uri); __wt_free(session, chunk->uri); __wt_free(session, chunk); @@ -44,6 +45,7 @@ __lsm_tree_discard_state(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) chunk = lsm_tree->old_chunks[i]; WT_ASSERT(session, chunk != NULL); + __wt_spin_destroy(session, &chunk->timestamp_spinlock); __wt_free(session, chunk->bloom_uri); __wt_free(session, chunk->uri); __wt_free(session, chunk); @@ -280,6 +282,8 @@ __wt_lsm_tree_setup_chunk( WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); __wt_epoch(session, &chunk->create_time); + __wt_spin_init(session, + &chunk->timestamp_spinlock, "LSM chunk timestamp"); WT_RET(__wt_lsm_tree_chunk_name( session, lsm_tree, chunk->id, &chunk->uri)); @@ -474,8 +478,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); /* Start the LSM manager thread if it isn't running. */ - if (__wt_atomic_cas32(&conn->lsm_manager.lsm_workers, 0, 1)) - WT_RET(__wt_lsm_manager_start(session)); + WT_RET(__wt_lsm_manager_start(session)); /* Make sure no one beat us to it. */ if ((ret = __lsm_tree_find( diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index 2f21e8acdc3..816eafebe99 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -256,6 +256,63 @@ err: } /* + * __wt_lsm_chunk_visible_all -- + * Setup a timestamp and check visibility for a chunk, can be called + * from multiple threads in parallel + */ +bool +__wt_lsm_chunk_visible_all( + WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk) +{ + /* Once a chunk has been flushed it's contents must be visible */ + if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK | WT_LSM_CHUNK_STABLE)) + return (true); + + if (chunk->switch_txn == WT_TXN_NONE || + !__wt_txn_visible_all(session, chunk->switch_txn, NULL)) + return (false); + +#ifdef HAVE_TIMESTAMPS + { + WT_TXN_GLOBAL *txn_global; + + txn_global = &S2C(session)->txn_global; + + /* + * Once all transactions with updates in the chunk are visible all + * timestamps associated with those updates are assigned so setup a + * timestamp for visibility checking. + */ + if (txn_global->has_commit_timestamp || + txn_global->has_pinned_timestamp) { + if (!F_ISSET(chunk, WT_LSM_CHUNK_HAS_TIMESTAMP)) { + __wt_spin_lock(session, &chunk->timestamp_spinlock); + /* Set the timestamp if we won the race */ + if (!F_ISSET(chunk, WT_LSM_CHUNK_HAS_TIMESTAMP)) { + __wt_readlock(session, &txn_global->rwlock); + __wt_timestamp_set(&chunk->switch_timestamp, + &txn_global->commit_timestamp); + __wt_readunlock(session, &txn_global->rwlock); + F_SET(chunk, WT_LSM_CHUNK_HAS_TIMESTAMP); + } + __wt_spin_unlock(session, &chunk->timestamp_spinlock); + } + if (!__wt_txn_visible_all( + session, chunk->switch_txn, &chunk->switch_timestamp)) + return (false); + } else + /* + * If timestamps aren't in use when the chunk becomes visible + * use the zero timestamp for visibility checks. Otherwise + * there could be confusion if timestamps start being used. + */ + F_SET(chunk, WT_LSM_CHUNK_HAS_TIMESTAMP); + } +#endif + return (true); +} + +/* * __wt_lsm_checkpoint_chunk -- * Flush a single LSM chunk to disk. */ @@ -295,14 +352,12 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, /* Stop if a running transaction needs the chunk. */ WT_RET(__wt_txn_update_oldest( session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); - if (chunk->switch_txn == WT_TXN_NONE || - !__wt_txn_visible_all(session, chunk->switch_txn, NULL)) { + if (!__wt_lsm_chunk_visible_all(session, chunk)) { __wt_verbose(session, WT_VERB_LSM, "LSM worker %s: running transaction, return", chunk->uri); return (0); } - if (!__wt_atomic_cas8(&chunk->flushing, 0, 1)) return (0); flush_set = true; diff --git a/src/third_party/wiredtiger/src/os_common/os_abort.c b/src/third_party/wiredtiger/src/os_common/os_abort.c index 905f3160acf..ebef001ce67 100644 --- a/src/third_party/wiredtiger/src/os_common/os_abort.c +++ b/src/third_party/wiredtiger/src/os_common/os_abort.c @@ -16,12 +16,18 @@ void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_ATTRIBUTE((noreturn)) { - __wt_errx(session, "aborting WiredTiger library"); +#ifdef HAVE_ATTACH + u_int i; -#ifdef HAVE_DIAGNOSTIC - __wt_attach(session); -#endif + __wt_errx(session, "process ID %" PRIdMAX + ": waiting for debugger...", (intmax_t)getpid()); + /* Sleep forever, the debugger will interrupt us when it attaches. */ + for (i = 0; i < WT_MILLION; ++i) + __wt_sleep(10, 0); +#else + __wt_errx(session, "aborting WiredTiger library"); +#endif abort(); /* NOTREACHED */ } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index a3cb6a53a09..10c2c0dc937 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -58,8 +58,12 @@ typedef struct { uint64_t orig_btree_checkpoint_gen; uint64_t orig_txn_checkpoint_gen; - /* Track the oldest transaction running when reconciliation starts. */ + /* + * Track the oldest running transaction and the stable timestamp when + * reconciliation starts. + */ uint64_t last_running; + WT_DECL_TIMESTAMP(stable_timestamp) /* Track the page's maximum transaction. */ uint64_t max_txn; @@ -506,6 +510,13 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_TRET(session->block_manager_cleanup(session)); WT_TRET(__rec_destroy_session(session)); + + /* + * We track removed overflow objects in case there's a reader + * in transit when they're removed. Any form of eviction locks + * out readers, we can discard them all. + */ + __wt_ovfl_discard_remove(session, page); } WT_RET(ret); @@ -881,6 +892,7 @@ __rec_init(WT_SESSION_IMPL *session, WT_BTREE *btree; WT_PAGE *page; WT_RECONCILE *r; + WT_TXN_GLOBAL *txn_global; btree = S2BT(session); page = ref->page; @@ -924,7 +936,13 @@ __rec_init(WT_SESSION_IMPL *session, * transaction running when reconciliation starts is considered * uncommitted. */ - WT_ORDERED_READ(r->last_running, S2C(session)->txn_global.last_running); + txn_global = &S2C(session)->txn_global; + WT_ORDERED_READ(r->last_running, txn_global->last_running); +#ifdef HAVE_TIMESTAMPS + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set( + &r->stable_timestamp, &txn_global->stable_timestamp)); +#endif /* * Lookaside table eviction is configured when eviction gets aggressive, @@ -1194,6 +1212,64 @@ __rec_update_move(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_SAVE_UPD *supd) } /* + * __rec_append_orig_value -- + * Append the key's original value to its update list. + */ +static int +__rec_append_orig_value(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) +{ + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_UPDATE *append, *upd; + size_t size; + + /* If at least one standard update is globally visible, we're done. */ + for (upd = upd_list; upd != NULL; upd = upd->next) + if (WT_UPDATE_DATA_VALUE(upd) && + __wt_txn_upd_visible_all(session, upd)) + return (0); + + /* + * We need the original on-page value for some reader: get a copy and + * append it to the end of the update list with a transaction ID that + * guarantees its visibility. + * + * If we don't have a value cell, it's an insert/append list key/value + * pair which simply doesn't exist for some reader; place a deleted + * record at the end of the update list. + */ + append = NULL; /* -Wconditional-uninitialized */ + size = 0; /* -Wconditional-uninitialized */ + if (unpack == NULL || unpack->type == WT_CELL_DEL) + WT_RET(__wt_update_alloc(session, + NULL, &append, &size, WT_UPDATE_DELETED)); + else { + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp)); + WT_ERR(__wt_update_alloc( + session, tmp, &append, &size, WT_UPDATE_STANDARD)); + } + + /* + * Give the entry no transaction ID to ensure global visibility, append + * it to the update list. + * + * Note the change to the actual reader-accessible update list: from now + * on, the original on-page value appears at the end of the update list, + * even if this reconciliation subsequently fails. + */ + append->txnid = WT_TXN_NONE; + for (upd = upd_list; upd->next != NULL; upd = upd->next) + ; + WT_PUBLISH(upd->next, append); + __wt_cache_page_inmem_incr(session, page, size); + +err: __wt_scr_free(session, &tmp); + return (ret); +} + +/* * __rec_txn_read -- * Return the update in a list that should be written (or NULL if none can * be written). @@ -1203,18 +1279,14 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) { WT_BTREE *btree; - WT_DECL_RET; - WT_DECL_ITEM(tmp); - WT_DECL_TIMESTAMP(min_timestamp) WT_DECL_TIMESTAMP(max_timestamp) WT_PAGE *page; - WT_UPDATE *append, *upd, *upd_list; - size_t size, update_mem; - uint64_t max_txn, min_txn, txnid; - bool append_origv, skipped; + WT_UPDATE *upd, *upd_list; + size_t update_mem; + uint64_t max_txn, txnid; + bool skipped; *updp = NULL; - append = NULL; /* -Wconditional-uninitialized */ btree = S2BT(session); page = r->page; @@ -1235,9 +1307,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, max_txn = WT_TXN_NONE; #ifdef HAVE_TIMESTAMPS __wt_timestamp_set_zero(&max_timestamp); - __wt_timestamp_set_inf(&min_timestamp); #endif - min_txn = UINT64_MAX; if (F_ISSET(r, WT_EVICTING)) { /* Discard obsolete updates. */ @@ -1258,8 +1328,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ if (WT_TXNID_LT(max_txn, txnid)) max_txn = txnid; - if (WT_TXNID_LT(txnid, min_txn)) - min_txn = txnid; /* * Find the first update we can use. @@ -1285,17 +1353,13 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (*updp == NULL) *updp = upd; + #ifdef HAVE_TIMESTAMPS /* Track min/max timestamps. */ if (__wt_timestamp_cmp( - &max_timestamp, &upd->timestamp) < 0) + &upd->timestamp, &max_timestamp) > 0) __wt_timestamp_set( &max_timestamp, &upd->timestamp); - - if (__wt_timestamp_cmp( - &min_timestamp, &upd->timestamp) > 0) - __wt_timestamp_set( - &min_timestamp, &upd->timestamp); #endif } } else @@ -1325,7 +1389,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, } } - /* Reconciliation should never see a reserved update. */ + /* Reconciliation should never see an aborted or reserved update. */ WT_ASSERT(session, *updp == NULL || ((*updp)->txnid != WT_TXN_ABORTED && (*updp)->type != WT_UPDATE_RESERVED)); @@ -1370,18 +1434,17 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (!skipped && (F_ISSET(btree, WT_BTREE_LOOKASIDE) || __wt_txn_visible_all(session, max_txn, WT_TIMESTAMP_NULL(&max_timestamp)))) { -#ifdef HAVE_DIAGNOSTIC /* * The checkpoint transaction is special. Make sure we never * write (metadata) updates from a checkpoint in a concurrent * session. */ - txnid = *updp == NULL ? WT_TXN_NONE : (*updp)->txnid; - WT_ASSERT(session, txnid == WT_TXN_NONE || - txnid != S2C(session)->txn_global.checkpoint_state.id || + WT_ASSERT(session, *updp == NULL || + (*updp)->txnid != + S2C(session)->txn_global.checkpoint_state.id || WT_SESSION_IS_CHECKPOINT(session)); -#endif - return (0); + + goto check_original_value; } /* @@ -1400,7 +1463,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ if (!F_ISSET(r, WT_EVICTING)) { r->leave_dirty = true; - return (0); + goto check_original_value; } /* @@ -1441,7 +1504,20 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (skipped) r->update_mem_uncommitted += update_mem; - append_origv = false; +#ifdef HAVE_TIMESTAMPS + /* + * Don't allow lookaside eviction with updates newer than the stable + * timestamp. Also don't recommend lookaside eviction in that case. + */ + if (__wt_timestamp_cmp(&max_timestamp, &r->stable_timestamp) > 0) { + if (!F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) + return (EBUSY); + + if (!skipped) + r->update_mem_uncommitted += update_mem; + } +#endif + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) { /* * The save/restore eviction path. @@ -1456,58 +1532,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, /* The page can't be marked clean. */ r->leave_dirty = true; - } else { - /* - * The lookaside table eviction path. - * - * If at least one update is globally visible, copy the update - * list and ignore the current on-page value. If no update is - * globally visible, readers require the page's original value. - */ - if (!__wt_txn_visible_all( - session, min_txn, WT_TIMESTAMP_NULL(&min_timestamp))) - append_origv = true; - } - - /* - * We need the original on-page value for some reason: get a copy and - * append it to the end of the update list with a transaction ID that - * guarantees its visibility. - */ - if (append_origv) { - /* - * If we don't have a value cell, it's an insert/append list - * key/value pair which simply doesn't exist for some reader; - * place a deleted record at the end of the update list. - */ - size = 0; /* -Wconditional-uninitialized */ - if (vpack == NULL || vpack->type == WT_CELL_DEL) - WT_RET(__wt_update_alloc(session, - NULL, &append, &size, WT_UPDATE_DELETED)); - else { - WT_RET(__wt_scr_alloc(session, 0, &tmp)); - if ((ret = __wt_page_cell_data_ref( - session, page, vpack, tmp)) == 0) - ret = __wt_update_alloc(session, - tmp, &append, &size, WT_UPDATE_STANDARD); - __wt_scr_free(session, &tmp); - WT_RET(ret); - } - - /* - * Give the entry no transaction ID to ensure global visibility, - * append it to the update list. - * - * Note the change to the actual reader-accessible update list: - * from now on, the original on-page value appears at the end - * of the update list, even if this reconciliation subsequently - * fails. - */ - append->txnid = WT_TXN_NONE; - for (upd = upd_list; upd->next != NULL; upd = upd->next) - ; - WT_PUBLISH(upd->next, append); - __wt_cache_page_inmem_incr(session, page, size); } /* @@ -1521,7 +1545,23 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * that transaction ID is globally visible, we know we no longer need * the lookaside table records, allowing them to be discarded. */ - return (__rec_update_save(session, r, ins, ripcip, *updp)); + WT_RET(__rec_update_save(session, r, ins, ripcip, *updp)); + +check_original_value: + /* + * Returning an update means the original on-page value might be lost, + * and that's a problem if there's a reader that needs it. There are + * two cases: any lookaside table eviction (because the backing disk + * image is rewritten), or any reconciliation of a backing overflow + * record that will be physically removed once it's no longer needed. + */ + if (*updp != NULL && + (F_ISSET(r, WT_EVICT_LOOKASIDE) || + (vpack != NULL && + vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM))) + WT_RET(__rec_append_orig_value(session, page, *updp, vpack)); + + return (0); } /* @@ -4708,7 +4748,7 @@ __rec_col_var(WT_SESSION_IMPL *session, * file, otherwise we'll leak blocks on the checkpoint. * That's safe because if the backing overflow value is * still needed by any running transaction, we'll cache - * a copy in the reconciliation tracking structures. + * a copy in the update list. * * Regardless, we avoid copying in overflow records: if * there's a WT_INSERT entry that modifies a reference @@ -4793,8 +4833,8 @@ record_loop: /* * The on-page value will never be accessed, * write a placeholder record. */ - data = "@"; - size = 1; + data = "ovfl-unused"; + size = WT_STORE_SIZE(strlen("ovfl-unused")); } else { update_no_copy = false; /* Maybe data copy */ @@ -4928,7 +4968,8 @@ compare: /* */ if (ovfl_state == OVFL_UNUSED && vpack->raw != WT_CELL_VALUE_OVFL_RM) - WT_ERR(__wt_ovfl_remove(session, page, upd, vpack)); + WT_ERR(__wt_ovfl_remove( + session, page, vpack, !F_ISSET(r, WT_EVICTING))); } /* Walk any append list. */ @@ -5535,8 +5576,9 @@ __rec_row_leaf(WT_SESSION_IMPL *session, * The on-page value will never be accessed, * write a placeholder record. */ - WT_ERR(__rec_cell_build_val( - session, r, "@", 1, (uint64_t)0)); + WT_ERR(__rec_cell_build_val(session, r, + "ovfl-unused", strlen("ovfl-unused"), + (uint64_t)0)); } else { val->buf.data = val_cell; val->buf.size = __wt_cell_total_len(vpack); @@ -5554,8 +5596,8 @@ __rec_row_leaf(WT_SESSION_IMPL *session, */ if (vpack != NULL && vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM) - WT_ERR(__wt_ovfl_remove( - session, page, upd, vpack)); + WT_ERR(__wt_ovfl_remove(session, + page, vpack, !F_ISSET(r, WT_EVICTING))); switch (upd->type) { case WT_UPDATE_DELETED: diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 52d11651191..1a63ed675b5 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -1848,9 +1848,9 @@ __open_session(WT_CONNECTION_IMPL *conn, if (!session_ret->active) break; if (i == conn->session_size) - WT_ERR_MSG(session, ENOMEM, - "only configured to support %" PRIu32 " sessions" - " (including %d additional internal sessions)", + WT_ERR_MSG(session, WT_ERROR, + "out of sessions, only configured to support %" PRIu32 + " sessions (including %d additional internal sessions)", conn->session_size, WT_EXTRA_INTERNAL_SESSIONS); /* diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c index 94ae27628c2..a6ab328864d 100644 --- a/src/third_party/wiredtiger/src/support/err.c +++ b/src/third_party/wiredtiger/src/support/err.c @@ -533,17 +533,20 @@ __wt_panic(WT_SESSION_IMPL *session) } /* - * __wt_illegal_value -- + * __wt_illegal_value_func -- * A standard error message when we detect an illegal value. */ int -__wt_illegal_value(WT_SESSION_IMPL *session, const char *name) +__wt_illegal_value_func( + WT_SESSION_IMPL *session, const char *tag, const char *file, int line) WT_GCC_FUNC_ATTRIBUTE((cold)) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { - __wt_errx(session, "%s%s%s", - name == NULL ? "" : name, name == NULL ? "" : ": ", - "encountered an illegal file format or internal value"); + __wt_errx(session, "%s%s%s: (%s, %d)", + tag == NULL ? "" : tag, + tag == NULL ? "" : ": ", + "encountered an illegal file format or internal value", + file, line); return (__wt_panic(session)); } diff --git a/src/third_party/wiredtiger/src/support/global.c b/src/third_party/wiredtiger/src/support/global.c index 6525fe21809..e425b690a5b 100644 --- a/src/third_party/wiredtiger/src/support/global.c +++ b/src/third_party/wiredtiger/src/support/global.c @@ -106,25 +106,4 @@ __wt_breakpoint(void) */ __wt_yield(); } - -/* - * __wt_attach -- - * A routine to wait for the debugging to attach. - */ -void -__wt_attach(WT_SESSION_IMPL *session) -{ -#ifdef HAVE_ATTACH - u_int i; - - __wt_errx(session, "process ID %" PRIdMAX - ": waiting for debugger...", (intmax_t)getpid()); - - /* Sleep forever, the debugger will interrupt us when it attaches. */ - for (i = 0; i < WT_MILLION; ++i) - __wt_sleep(10, 0); -#else - WT_UNUSED(session); -#endif -} #endif diff --git a/src/third_party/wiredtiger/src/support/time.c b/src/third_party/wiredtiger/src/support/time.c index 233bc871e06..240a77591a3 100644 --- a/src/third_party/wiredtiger/src/support/time.c +++ b/src/third_party/wiredtiger/src/support/time.c @@ -35,8 +35,7 @@ __time_check_monotonic(WT_SESSION_IMPL *session, struct timespec *tsp) /* * __wt_epoch -- - * Return the time since the Epoch, adjusted so it never appears to go - * backwards. + * Return the time since the Epoch. */ void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) @@ -45,9 +44,14 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) struct timespec tmp; /* - * Read into a local variable so that we're comparing the correct - * value when we check for monotonic increasing time. There are - * many places we read into an unlocked global variable. + * Read into a local variable, then check for monotonically increasing + * time, ensuring single threads never see time move backward. We don't + * prevent multiple threads from seeing time move backwards (even when + * reading time serially, the saved last-read time is per thread, not + * per timer, so multiple threads can race the time). Nor do we prevent + * multiple threads simultaneously reading the time from seeing random + * time or time moving backwards (assigning the time structure to the + * returned memory location implies multicycle writes to memory). */ __wt_epoch_raw(session, &tmp); __time_check_monotonic(session, &tmp); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 191f7e0ba0f..09efb2924bf 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -445,12 +445,11 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_txn_parse_timestamp( session, "read", &txn->read_timestamp, &cval)); - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set( - &oldest_timestamp, &txn_global->oldest_timestamp); - __wt_timestamp_set( - &stable_timestamp, &txn_global->stable_timestamp); - __wt_readunlock(session, &txn_global->rwlock); + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set( + &oldest_timestamp, &txn_global->oldest_timestamp); + __wt_timestamp_set( + &stable_timestamp, &txn_global->stable_timestamp)); if (__wt_timestamp_cmp( &txn->read_timestamp, &oldest_timestamp) < 0) WT_RET_MSG(session, EINVAL, @@ -568,18 +567,20 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; WT_TXN_OP *op; + u_int i; + bool did_update, locked; #ifdef HAVE_TIMESTAMPS - WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; wt_timestamp_t prev_commit_timestamp; bool update_timestamp; #endif - u_int i; - bool did_update; txn = &session->txn; conn = S2C(session); + txn_global = &conn->txn_global; did_update = txn->mod_count != 0; + locked = false; WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || !did_update); @@ -665,6 +666,14 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) * This is particularly important for checkpoints. */ __wt_txn_release_snapshot(session); + /* + * We hold the visibility lock for reading from the time + * we write our log record until the time we release our + * transaction so that the LSN any checkpoint gets will + * always reflect visible data. + */ + __wt_readlock(session, &txn_global->visibility_rwlock); + locked = true; WT_ERR(__wt_txn_log_commit(session, cfg)); } @@ -687,9 +696,12 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) #ifdef HAVE_TIMESTAMPS if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && - op->type != WT_TXN_OP_BASIC_TS) + op->type != WT_TXN_OP_BASIC_TS) { + WT_ASSERT(session, + op->fileid != WT_METAFILE_ID); __wt_timestamp_set(&op->u.upd->timestamp, &txn->commit_timestamp); + } #endif break; @@ -724,14 +736,15 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) #endif __wt_txn_release(session); + if (locked) + __wt_readunlock(session, &txn_global->visibility_rwlock); #ifdef HAVE_TIMESTAMPS /* First check if we've already committed something in the future. */ if (update_timestamp) { - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set( - &prev_commit_timestamp, &txn_global->commit_timestamp); - __wt_readunlock(session, &txn_global->rwlock); + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set( + &prev_commit_timestamp, &txn_global->commit_timestamp)); update_timestamp = __wt_timestamp_cmp( &txn->commit_timestamp, &prev_commit_timestamp) > 0; } @@ -760,6 +773,8 @@ err: /* * !!! * Nothing can fail after this point. */ + if (locked) + __wt_readunlock(session, &txn_global->visibility_rwlock); WT_TRET(__wt_txn_rollback(session, cfg)); return (ret); } @@ -930,6 +945,7 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_spin_init( session, &txn_global->id_lock, "transaction id lock")); WT_RET(__wt_rwlock_init(session, &txn_global->rwlock)); + WT_RET(__wt_rwlock_init(session, &txn_global->visibility_rwlock)); WT_RET(__wt_rwlock_init(session, &txn_global->commit_timestamp_rwlock)); TAILQ_INIT(&txn_global->commit_timestamph); @@ -971,6 +987,7 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session) __wt_rwlock_destroy(session, &txn_global->commit_timestamp_rwlock); __wt_rwlock_destroy(session, &txn_global->read_timestamp_rwlock); __wt_rwlock_destroy(session, &txn_global->nsnap_rwlock); + __wt_rwlock_destroy(session, &txn_global->visibility_rwlock); __wt_free(session, txn_global->states); } @@ -981,10 +998,7 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session) int __wt_txn_global_shutdown(WT_SESSION_IMPL *session) { - WT_DECL_RET; - WT_TXN_GLOBAL *txn_global; - - txn_global = &S2C(session)->txn_global; + bool txn_active; /* * We're shutting down. Make sure everything gets freed. @@ -995,10 +1009,8 @@ __wt_txn_global_shutdown(WT_SESSION_IMPL *session) * transaction ID will catch up with the current ID. */ for (;;) { - WT_TRET(__wt_txn_update_oldest(session, - WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); - if (txn_global->oldest_id == txn_global->current && - txn_global->metadata_pinned == txn_global->current) + WT_RET(__wt_txn_activity_check(session, &txn_active)); + if (!txn_active) break; WT_STAT_CONN_INCR(session, txn_release_blocked); @@ -1010,10 +1022,10 @@ __wt_txn_global_shutdown(WT_SESSION_IMPL *session) * Now that all transactions have completed, no timestamps should be * pinned. */ - __wt_timestamp_set_inf(&txn_global->pinned_timestamp); + __wt_timestamp_set_inf(&S2C(session)->txn_global.pinned_timestamp); #endif - return (ret); + return (0); } #if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) @@ -1031,7 +1043,9 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session) const char *iso_tag; uint64_t id; uint32_t i, session_cnt; - +#ifdef HAVE_TIMESTAMPS + char hex_timestamp[3][2 * WT_TIMESTAMP_SIZE + 1]; +#endif conn = S2C(session); txn_global = &conn->txn_global; @@ -1042,10 +1056,35 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session) WT_RET(__wt_msg(session, "last running ID: %" PRIu64, txn_global->last_running)); WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id)); - WT_RET(__wt_msg(session, - "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id)); - WT_RET(__wt_msg(session, "checkpoint running? %s", +#ifdef HAVE_TIMESTAMPS + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[0], &txn_global->commit_timestamp)); + WT_RET(__wt_msg(session, "commit timestamp: %s", hex_timestamp[0])); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[0], &txn_global->oldest_timestamp)); + WT_RET(__wt_msg(session, "oldest timestamp: %s", hex_timestamp[0])); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[0], &txn_global->pinned_timestamp)); + WT_RET(__wt_msg(session, "pinned timestamp: %s", hex_timestamp[0])); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[0], &txn_global->stable_timestamp)); + WT_RET(__wt_msg(session, "stable timestamp: %s", hex_timestamp[0])); + WT_RET(__wt_msg(session, "has_commit_timestamp: %s", + txn_global->has_commit_timestamp ? "yes" : "no")); + WT_RET(__wt_msg(session, "has_oldest_timestamp: %s", + txn_global->has_oldest_timestamp ? "yes" : "no")); + WT_RET(__wt_msg(session, "has_pinned_timestamp: %s", + txn_global->has_pinned_timestamp ? "yes" : "no")); + WT_RET(__wt_msg(session, "has_stable_timestamp: %s", + txn_global->has_stable_timestamp ? "yes" : "no")); + WT_RET(__wt_msg(session, "oldest_is_pinned: %s", + txn_global->oldest_is_pinned ? "yes" : "no")); + WT_RET(__wt_msg(session, "stable_is_pinned: %s", + txn_global->stable_is_pinned ? "yes" : "no")); +#endif + + WT_RET(__wt_msg(session, "checkpoint running: %s", txn_global->checkpoint_running ? "yes" : "no")); WT_RET(__wt_msg(session, "checkpoint generation: %" PRIu64, __wt_gen(session, WT_GEN_CHECKPOINT))); @@ -1054,9 +1093,11 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session) WT_RET(__wt_msg(session, "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_state.id)); + WT_RET(__wt_msg(session, + "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id)); + WT_ORDERED_READ(session_cnt, conn->session_cnt); WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt)); - WT_RET(__wt_msg(session, "Transaction state of active sessions:")); /* @@ -1083,7 +1124,40 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session) iso_tag = "WT_ISO_SNAPSHOT"; break; } - +#ifdef HAVE_TIMESTAMPS + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[0], &txn->commit_timestamp)); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[1], &txn->first_commit_timestamp)); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[2], &txn->read_timestamp)); + WT_RET(__wt_msg(session, + "ID: %8" PRIu64 + ", mod count: %u" + ", pinned ID: %8" PRIu64 + ", snap min: %" PRIu64 + ", snap max: %" PRIu64 + ", commit_timestamp: %s" + ", first_commit_timestamp: %s" + ", read_timestamp: %s" + ", metadata pinned ID: %" PRIu64 + ", flags: 0x%08" PRIx32 + ", name: %s" + ", isolation: %s", + id, + txn->mod_count, + s->pinned_id, + txn->snap_min, + txn->snap_max, + hex_timestamp[0], + hex_timestamp[1], + hex_timestamp[2], + s->metadata_pinned, + txn->flags, + conn->sessions[i].name == NULL ? + "EMPTY" : conn->sessions[i].name, + iso_tag)); +#else WT_RET(__wt_msg(session, "ID: %6" PRIu64 ", mod count: %u" @@ -1104,6 +1178,7 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session) conn->sessions[i].name == NULL ? "EMPTY" : conn->sessions[i].name, iso_tag)); +#endif } WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 532c4819d29..9065966fe8f 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -571,43 +571,17 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; - char timestamp_buf[2 * WT_TIMESTAMP_SIZE + 1], timestamp_config[100]; - const char *query_cfg[] = { WT_CONFIG_BASE(session, - WT_CONNECTION_query_timestamp), "get=stable", NULL }; const char *txn_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL, NULL }; + bool use_timestamp; conn = S2C(session); txn = &session->txn; txn_global = &conn->txn_global; txn_state = WT_SESSION_TXN_STATE(session); - /* - * Someone giving us a specific timestamp overrides the general - * use_timestamp. - */ - WT_RET(__wt_config_gets(session, cfg, "read_timestamp", &cval)); - if (cval.len > 0) { - WT_RET(__wt_snprintf(timestamp_config, sizeof(timestamp_config), - "read_timestamp=%.*s", (int)cval.len, cval.str)); - txn_cfg[2] = timestamp_config; - } else if (txn_global->has_stable_timestamp) { - WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval)); - /* - * Get the stable timestamp currently set. Then set that as - * the read timestamp for the transaction. - */ - if (cval.val != 0) { - if ((ret = __wt_txn_global_query_timestamp(session, - timestamp_buf, query_cfg)) != 0 && - ret != WT_NOTFOUND) - return (ret); - WT_RET(__wt_snprintf(timestamp_config, - sizeof(timestamp_config), - "read_timestamp=%s", timestamp_buf)); - txn_cfg[2] = timestamp_config; - } - } + WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval)); + use_timestamp = (cval.val != 0); /* * Start a snapshot transaction for the checkpoint. @@ -667,15 +641,33 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, const char *cfg[]) */ txn_state->id = txn_state->pinned_id = txn_state->metadata_pinned = WT_TXN_NONE; - __wt_writeunlock(session, &txn_global->rwlock); #ifdef HAVE_TIMESTAMPS /* - * Now that the checkpoint transaction is published, clear it from the - * regular lists. + * Set the checkpoint transaction's timestamp, if requested. + * + * We rely on having the global transaction data locked so the oldest + * timestamp can't move past the stable timestamp. */ - __wt_txn_clear_commit_timestamp(session); - __wt_txn_clear_read_timestamp(session); + WT_ASSERT(session, !F_ISSET(txn, + WT_TXN_HAS_TS_COMMIT | WT_TXN_HAS_TS_READ | + WT_TXN_PUBLIC_TS_COMMIT | WT_TXN_PUBLIC_TS_READ)); + + if (use_timestamp && txn_global->has_stable_timestamp) { + __wt_timestamp_set( + &txn->read_timestamp, &txn_global->stable_timestamp); + F_SET(txn, WT_TXN_HAS_TS_READ); + } +#else + WT_UNUSED(use_timestamp); +#endif + + __wt_writeunlock(session, &txn_global->rwlock); + +#ifdef HAVE_TIMESTAMPS + if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) + __wt_verbose_timestamp(session, &txn->read_timestamp, + "Checkpoint requested at stable timestamp"); #endif /* diff --git a/src/third_party/wiredtiger/src/txn/txn_ext.c b/src/third_party/wiredtiger/src/txn/txn_ext.c index 1fe4d6ddf47..103a1d38166 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ext.c +++ b/src/third_party/wiredtiger/src/txn/txn_ext.c @@ -72,7 +72,8 @@ __wt_ext_transaction_notify( if (txn->notify == notify) return (0); if (txn->notify != NULL) - return (ENOMEM); + WT_RET_MSG( + session, WT_ERROR, "transaction notify already scheduled"); txn->notify = notify; diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index 1fc74fb53a1..a03047b5392 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -357,12 +357,14 @@ __wt_txn_checkpoint_log( WT_ITEM *ckpt_snapshot, empty; WT_LSN *ckpt_lsn; WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; uint8_t *end, *p; size_t recsize; uint32_t i, rectype; const char *fmt; conn = S2C(session); + txn_global = &conn->txn_global; txn = &session->txn; ckpt_lsn = &txn->ckpt_lsn; @@ -408,6 +410,15 @@ __wt_txn_checkpoint_log( } /* + * We take and immediately release the visibility lock. + * Acquiring the write lock guarantees that any transaction + * that has written to the log has also made its transaction + * visible at this time. + */ + __wt_writelock(session, &txn_global->visibility_rwlock); + __wt_writeunlock(session, &txn_global->visibility_rwlock); + + /* * We need to make sure that the log records in the checkpoint * LSN are on disk. In particular to make sure that the * current log file exists. diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 9c02322c526..e19bbc73bb3 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -37,10 +37,10 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session) * updated while rolling back, accessing it without a lock would * violate protocol. */ - txn_global = &S2C(session)->txn_global; - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set(&rollback_timestamp, &txn_global->stable_timestamp); - __wt_readunlock(session, &txn_global->rwlock); + txn_global = &conn->txn_global; + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set( + &rollback_timestamp, &txn_global->stable_timestamp)); __wt_las_cursor(session, &cursor, &session_flags); @@ -120,11 +120,11 @@ __txn_abort_newer_update(WT_SESSION_IMPL *session, } /* - * __txn_abort_newer_row_skip -- + * __txn_abort_newer_insert -- * Apply the update abort check to each entry in an insert skip list */ static void -__txn_abort_newer_row_skip(WT_SESSION_IMPL *session, +__txn_abort_newer_insert(WT_SESSION_IMPL *session, WT_INSERT_HEAD *head, wt_timestamp_t *rollback_timestamp) { WT_INSERT *ins; @@ -134,6 +134,50 @@ __txn_abort_newer_row_skip(WT_SESSION_IMPL *session, } /* + * __txn_abort_newer_col_var -- + * Abort updates on a variable length col leaf page with timestamps newer + * than the rollback timestamp. + */ +static void +__txn_abort_newer_col_var( + WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t *rollback_timestamp) +{ + WT_COL *cip; + WT_INSERT_HEAD *ins; + uint32_t i; + + /* Review the changes to the original on-page data items */ + WT_COL_FOREACH(page, cip, i) + if ((ins = WT_COL_UPDATE(page, cip)) != NULL) + __txn_abort_newer_insert(session, + ins, rollback_timestamp); + + /* Review the append list */ + if ((ins = WT_COL_APPEND(page)) != NULL) + __txn_abort_newer_insert(session, ins, rollback_timestamp); +} + +/* + * __txn_abort_newer_col_fix -- + * Abort updates on a fixed length col leaf page with timestamps newer than + * the rollback timestamp. + */ +static void +__txn_abort_newer_col_fix( + WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t *rollback_timestamp) +{ + WT_INSERT_HEAD *ins; + + /* Review the changes to the original on-page data items */ + if ((ins = WT_COL_UPDATE_SINGLE(page)) != NULL) + __txn_abort_newer_insert(session, ins, rollback_timestamp); + + /* Review the append list */ + if ((ins = WT_COL_APPEND(page)) != NULL) + __txn_abort_newer_insert(session, ins, rollback_timestamp); +} + +/* * __txn_abort_newer_row_leaf -- * Abort updates on a row leaf page with timestamps newer than the * rollback timestamp. @@ -152,8 +196,7 @@ __txn_abort_newer_row_leaf( * page. */ if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) - __txn_abort_newer_row_skip( - session, insert, rollback_timestamp); + __txn_abort_newer_insert(session, insert, rollback_timestamp); /* * Review updates that belong to keys that are on the disk image, @@ -165,7 +208,7 @@ __txn_abort_newer_row_leaf( session, upd, rollback_timestamp); if ((insert = WT_ROW_INSERT(page, rip)) != NULL) - __txn_abort_newer_row_skip( + __txn_abort_newer_insert( session, insert, rollback_timestamp); } } @@ -182,6 +225,13 @@ __txn_abort_newer_updates( page = ref->page; switch (page->type) { + case WT_PAGE_COL_FIX: + __txn_abort_newer_col_fix(session, page, rollback_timestamp); + break; + case WT_PAGE_COL_VAR: + __txn_abort_newer_col_var(session, page, rollback_timestamp); + break; + case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: /* * There is nothing to do for internal pages, since we aren't @@ -193,9 +243,7 @@ __txn_abort_newer_updates( case WT_PAGE_ROW_LEAF: __txn_abort_newer_row_leaf(session, page, rollback_timestamp); break; - default: - WT_RET_MSG(session, EINVAL, "rollback_to_stable " - "is only supported for row store btrees"); + WT_ILLEGAL_VALUE(session); } return (0); @@ -209,14 +257,11 @@ static int __txn_rollback_to_stable_custom_skip( WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp) { - WT_UNUSED(session); WT_UNUSED(context); + WT_UNUSED(session); /* Review all pages that are in memory. */ - if (ref->state == WT_REF_MEM || ref->state == WT_REF_DELETED) - *skipp = false; - else - *skipp = true; + *skipp = !(ref->state == WT_REF_MEM || ref->state == WT_REF_DELETED); return (0); } @@ -284,8 +329,7 @@ __txn_rollback_to_stable_btree( * Add the btree ID to the bitstring, so we can exclude any * lookaside entries for this btree. */ - __bit_set( - S2C(session)->stable_rollback_bitstring, btree->id); + __bit_set(S2C(session)->stable_rollback_bitstring, btree->id); return (0); } @@ -297,19 +341,15 @@ __txn_rollback_to_stable_btree( if (btree->root.page == NULL) return (0); - if (btree->type != BTREE_ROW) - WT_RET_MSG(session, EINVAL, "rollback_to_stable " - "is only supported for row store btrees"); - /* * Copy the stable timestamp, otherwise we'd need to lock it each time * it's accessed. Even though the stable timestamp isn't supposed to be * updated while rolling back, accessing it without a lock would * violate protocol. */ - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set(&rollback_timestamp, &txn_global->stable_timestamp); - __wt_readunlock(session, &txn_global->rwlock); + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set( + &rollback_timestamp, &txn_global->stable_timestamp)); /* * Ensure the eviction server is out of the file - we don't @@ -333,15 +373,12 @@ static int __txn_rollback_to_stable_check(WT_SESSION_IMPL *session) { WT_TXN_GLOBAL *txn_global; - bool active_txns, stable_set; + bool txn_active; txn_global = &S2C(session)->txn_global; - __wt_readlock(session, &txn_global->rwlock); - stable_set = !__wt_timestamp_iszero(&txn_global->stable_timestamp); - __wt_readunlock(session, &txn_global->rwlock); - if (!stable_set) - WT_RET_MSG(session, EINVAL, "rollback_to_stable requires a " - "stable timestamp"); + if (!txn_global->has_stable_timestamp) + WT_RET_MSG(session, EINVAL, + "rollback_to_stable requires a stable timestamp"); /* * Help the user - see if they have any active transactions. I'd @@ -349,8 +386,8 @@ __txn_rollback_to_stable_check(WT_SESSION_IMPL *session) * require peeking into all open sessions, which isn't really * kosher. */ - WT_RET(__wt_txn_are_any_active(session, &active_txns)); - if (active_txns) + WT_RET(__wt_txn_activity_check(session, &txn_active)); + if (txn_active) WT_RET_MSG(session, EINVAL, "rollback_to_stable illegal with active transactions"); @@ -369,9 +406,8 @@ __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[]) #ifndef HAVE_TIMESTAMPS WT_UNUSED(cfg); - WT_RET_MSG(session, EINVAL, "rollback_to_stable " - "requires a version of WiredTiger built with timestamp " - "support"); + WT_RET_MSG(session, ENOTSUP, "rollback_to_stable " + "requires a version of WiredTiger built with timestamp support"); #else WT_CONNECTION_IMPL *conn; WT_DECL_RET; diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index 9e4a1e200cc..275ef941490 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -10,6 +10,83 @@ #ifdef HAVE_TIMESTAMPS /* + * __wt_timestamp_to_hex_string -- + * Convert a timestamp to hex string representation. + */ +int +__wt_timestamp_to_hex_string( + WT_SESSION_IMPL *session, char *hex_timestamp, const wt_timestamp_t *ts_src) +{ + wt_timestamp_t ts; + + __wt_timestamp_set(&ts, ts_src); + + if (__wt_timestamp_iszero(&ts)) { + hex_timestamp[0] = '0'; + hex_timestamp[1] = '\0'; + return (0); + } + +#if WT_TIMESTAMP_SIZE == 8 + { + char *p, v; + + for (p = hex_timestamp; ts.val != 0; ts.val >>= 4) + *p++ = (char)__wt_hex((u_char)(ts.val & 0x0f)); + *p = '\0'; + + /* Reverse the string. */ + for (--p; p > hex_timestamp;) { + v = *p; + *p-- = *hex_timestamp; + *hex_timestamp++ = v; + } + WT_UNUSED(session); + } +#else + { + WT_ITEM hexts; + size_t len; + uint8_t *tsp; + + /* Avoid memory allocation: set up an item guaranteed large enough. */ + hexts.data = hexts.mem = hex_timestamp; + hexts.memsize = 2 * WT_TIMESTAMP_SIZE + 1; + /* Trim leading zeros. */ + for (tsp = ts.ts, len = WT_TIMESTAMP_SIZE; + len > 0 && *tsp == 0; + ++tsp, --len) + ; + WT_RET(__wt_raw_to_hex(session, tsp, len, &hexts)); + } +#endif + return (0); +} + +/* + * __wt_verbose_timestamp -- + * Output a verbose message along with the specified timestamp + */ +void +__wt_verbose_timestamp(WT_SESSION_IMPL *session, + const wt_timestamp_t *ts, const char *msg) +{ +#ifdef HAVE_VERBOSE + char timestamp_buf[2 * WT_TIMESTAMP_SIZE + 1]; + + if (__wt_timestamp_to_hex_string(session, timestamp_buf, ts) != 0) + return; + + __wt_verbose(session, + WT_VERB_TIMESTAMP, "Timestamp %s : %s", timestamp_buf, msg); +#else + WT_UNUSED(session); + WT_UNUSED(ts); + WT_UNUSED(msg); +#endif +} + +/* * __wt_txn_parse_timestamp -- * Decodes and sets a timestamp. */ @@ -25,7 +102,7 @@ __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, /* Protect against unexpectedly long hex strings. */ if (cval->len > 2 * WT_TIMESTAMP_SIZE) WT_RET_MSG(session, EINVAL, - "Failed to parse %s timestamp '%.*s': too long", + "%s timestamp too long '%.*s'", name, (int)cval->len, cval->str); #if WT_TIMESTAMP_SIZE == 8 @@ -119,10 +196,9 @@ __txn_global_query_timestamp( if (WT_STRING_MATCH("all_committed", cval.str, cval.len)) { if (!txn_global->has_commit_timestamp) return (WT_NOTFOUND); - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set(&ts, &txn_global->commit_timestamp); + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set(&ts, &txn_global->commit_timestamp)); WT_ASSERT(session, !__wt_timestamp_iszero(&ts)); - __wt_readunlock(session, &txn_global->rwlock); /* Compare with the oldest running transaction. */ __wt_readlock(session, &txn_global->commit_timestamp_rwlock); @@ -157,9 +233,8 @@ __txn_global_query_timestamp( } else if (WT_STRING_MATCH("stable", cval.str, cval.len)) { if (!txn_global->has_stable_timestamp) return (WT_NOTFOUND); - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set(&ts, &txn_global->stable_timestamp); - __wt_readunlock(session, &txn_global->rwlock); + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set(&ts, &txn_global->stable_timestamp)); } else WT_RET_MSG(session, EINVAL, "unknown timestamp query %.*s", (int)cval.len, cval.str); @@ -181,47 +256,7 @@ __wt_txn_global_query_timestamp( wt_timestamp_t ts; WT_RET(__txn_global_query_timestamp(session, &ts, cfg)); - -#if WT_TIMESTAMP_SIZE == 8 - { - char *p, v; - - for (p = hex_timestamp; ts.val != 0; ts.val >>= 4) - *p++ = (char)__wt_hex((u_char)(ts.val & 0x0f)); - *p = '\0'; - - /* Reverse the string. */ - for (--p; p > hex_timestamp;) { - v = *p; - *p-- = *hex_timestamp; - *hex_timestamp++ = v; - } - } -#else - { - WT_ITEM hexts; - size_t len; - uint8_t *tsp; - - /* - * Keep clang-analyzer happy: it can't tell that ts will be set - * whenever the call below succeeds. - */ - __wt_timestamp_set_zero(&ts); - WT_RET(__txn_global_query_timestamp(session, &ts, cfg)); - - /* Avoid memory allocation: set up an item guaranteed large enough. */ - hexts.data = hexts.mem = hex_timestamp; - hexts.memsize = 2 * WT_TIMESTAMP_SIZE + 1; - /* Trim leading zeros. */ - for (tsp = ts.ts, len = WT_TIMESTAMP_SIZE; - len > 0 && *tsp == 0; - ++tsp, --len) - ; - WT_RET(__wt_raw_to_hex(session, tsp, len, &hexts)); - } -#endif - return (0); + return (__wt_timestamp_to_hex_string(session, hex_timestamp, &ts)); #else WT_UNUSED(hex_timestamp); WT_UNUSED(cfg); @@ -253,9 +288,9 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session) if (txn_global->oldest_is_pinned) return (0); - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set(&oldest_timestamp, &txn_global->oldest_timestamp); - __wt_readunlock(session, &txn_global->rwlock); + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set( + &oldest_timestamp, &txn_global->oldest_timestamp)); /* Scan to find the global pinned timestamp. */ if ((ret = __txn_global_query_timestamp( @@ -276,6 +311,8 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session) txn_global->oldest_is_pinned = __wt_timestamp_cmp( &txn_global->pinned_timestamp, &txn_global->oldest_timestamp) == 0; + __wt_verbose_timestamp(session, + &pinned_timestamp, "Updated pinned timestamp"); } __wt_writeunlock(session, &txn_global->rwlock); @@ -388,6 +425,8 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) if (has_commit) { __wt_timestamp_set(&txn_global->commit_timestamp, &commit_ts); txn_global->has_commit_timestamp = true; + __wt_verbose_timestamp(session, &commit_ts, + "Updated global commit timestamp"); } if (has_oldest && (!txn_global->has_oldest_timestamp || @@ -396,6 +435,8 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) __wt_timestamp_set(&txn_global->oldest_timestamp, &oldest_ts); txn_global->has_oldest_timestamp = true; txn_global->oldest_is_pinned = false; + __wt_verbose_timestamp(session, &oldest_ts, + "Updated global oldest timestamp"); } if (has_stable && (!txn_global->has_stable_timestamp || @@ -404,17 +445,18 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) __wt_timestamp_set(&txn_global->stable_timestamp, &stable_ts); txn_global->has_stable_timestamp = true; txn_global->stable_is_pinned = false; + __wt_verbose_timestamp(session, &stable_ts, + "Updated global stable timestamp"); } __wt_writeunlock(session, &txn_global->rwlock); if (has_oldest || has_stable) WT_RET(__wt_txn_update_pinned_timestamp(session)); - + } #else WT_RET_MSG(session, EINVAL, "set_timestamp requires a " "version of WiredTiger built with timestamp support"); #endif - } return (0); } |