diff options
author | Chenhao Qu <chenhao.qu@mongodb.com> | 2022-04-19 23:39:05 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-04-20 00:10:46 +0000 |
commit | e916372674aa762557c9e30141b277e019d8039f (patch) | |
tree | 554849d0df1ebcb3ba70140f9b3481e55c7a2a60 /src | |
parent | 12bbc9f4f1a5d7a4826f2f2847c0e03ebc93103e (diff) | |
download | mongo-e916372674aa762557c9e30141b277e019d8039f.tar.gz |
Import wiredtiger: ad0b418109f05284ecdc343683988388a1a7fd39 from branch mongodb-master
ref: 5013fd0ff6..ad0b418109
for: 6.1.0-rc0
WT-8447 Database corruption from RTS after fast-delete
Diffstat (limited to 'src')
22 files changed, 590 insertions, 216 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index f3446688f86..f054921907c 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-master", - "commit": "5013fd0ff65c0f86742c24039feeeda03369a987" + "commit": "ad0b418109f05284ecdc343683988388a1a7fd39" } diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index 1edd7afd169..ccb6225fd89 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -126,7 +126,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) /* Allocate and initialize the page-deleted structure. */ WT_ERR(__wt_calloc_one(session, &ref->ft_info.del)); - ref->ft_info.del->previous_state = previous_state; + ref->ft_info.del->previous_ref_state = previous_state; WT_ERR(__wt_txn_modify_page_delete(session, ref)); @@ -193,15 +193,15 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) * a hazard pointer. We're safe since pages with unresolved transactions aren't going anywhere. */ if (current_state == WT_REF_DELETED) - current_state = ref->ft_info.del->previous_state; + current_state = ref->ft_info.del->previous_ref_state; else if ((updp = ref->ft_info.update) != NULL) for (; *updp != NULL; ++updp) (*updp)->txnid = WT_TXN_ABORTED; /* - * We didn't set the WT_PAGE_DELETED transaction ID to aborted or discard any WT_UPDATE list, - * instead, we discard both structures entirely, it has the same effect. It's a single call, - * they're a union of two pointers. + * Don't set the WT_PAGE_DELETED transaction ID to aborted, discard any WT_UPDATE list or set + * the committed flag; instead, discard the structures, it has the same effect. It's a single + * call, they're a union of two pointers. */ __wt_free(session, ref->ft_info.del); @@ -304,29 +304,24 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) WT_STAT_CONN_DATA_INCR(session, cache_read_deleted); - /* - * Give the page a modify structure. - */ - WT_RET(__wt_page_modify_init(session, page)); + /* Track the prepared, fast-truncate pages we've had to instantiate. */ + if (ref->ft_info.del != NULL && ref->ft_info.del->prepare_state != WT_PREPARE_INIT) + WT_STAT_CONN_DATA_INCR(session, cache_read_deleted_prepared); /* - * We would like to free the deleted pages, but if the tree is newly created, there is nothing - * that needs to be freed. Furthermore, if the handle is read-only, we are not able to do so. + * Give the page a modify structure. If the tree is already dirty and so will be written, mark + * the page dirty. (We want to free the deleted pages, but if the handle is read-only or if the + * application never modifies the tree, we're not able to do so.) */ - if (!btree->original) + WT_RET(__wt_page_modify_init(session, page)); + if (btree->modified) __wt_page_modify_set(session, page); - /* - * Allocate the per-page update array if one doesn't already exist. (It might already exist - * because deletes are instantiated after the history store table updates.) - */ + /* Allocate the per-page update array if one doesn't already exist. */ if (page->entries != 0 && page->modify->mod_row_update == NULL) WT_PAGE_ALLOC_AND_SWAP( session, page, page->modify->mod_row_update, upd_array, page->entries); - if (ref->ft_info.del != NULL && ref->ft_info.del->prepare_state != WT_PREPARE_INIT) - WT_STAT_CONN_DATA_INCR(session, cache_read_deleted_prepared); - /* * An operation is accessing a "deleted" page, and we're building an in-memory version of the * page (making it look like all entries in the page were individually updated by a remove @@ -337,9 +332,10 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * building might split in the future, so we update that structure to include references to all * of the update structures we create, so the transaction can abort. * - * Second, a truncate call deleted a page and the truncate committed, but an older transaction - * in the system forced us to keep the old version of the page around, then we crashed and - * recovered or we're running inside a checkpoint, and now we're being forced to read that page. + * Second, a truncate call deleted a page and the truncate resolved, but an older transaction or + * the stable timestamp forced us to keep the old version of the page around, and then we + * crashed and recovered or we're running inside a checkpoint, and now we're being forced to + * read that page. * * If there's a page-deleted structure that's not yet globally visible, get a reference and * migrate transaction ID and timestamp information to the updates (globally visible means the @@ -350,7 +346,7 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * other pages. */ page_del = __wt_page_del_active(session, ref, true) ? ref->ft_info.del : NULL; - if (page_del != NULL && page_del->committed == 0) { + if (page_del != NULL && !page_del->committed) { count = 0; if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) WT_SKIP_FOREACH (ins, insert) diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index bdc946a18a5..1cc2db1294e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -593,13 +593,13 @@ __inmem_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page, bool *preparedp, size_t page->pg_fix_numtws = entry_num; /* - * If we skipped "quite a few" entries (threshold is arbitrary), mark the page dirty so it - * gets rewritten without them. + * If we skipped "quite a few" entries (threshold is arbitrary), and the tree is already + * dirty and so will be written, mark the page dirty so it gets rewritten without them. */ - if (!F_ISSET(btree, WT_BTREE_READONLY) && skipped >= auxhdr.entries / 4 && - skipped >= dsk->u.entries / 100 && skipped > 4) { + if (btree->modified && skipped >= auxhdr.entries / 4 && skipped >= dsk->u.entries / 100 && + skipped > 4) { WT_RET(__wt_page_modify_init(session, page)); - __wt_page_modify_set(session, page); + __wt_page_only_modify_set(session, page); } break; @@ -797,22 +797,18 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) break; case WT_CELL_ADDR_DEL: /* - * A cell may reference a deleted leaf page: if a leaf page was deleted without being - * read (fast truncate), and the deletion committed, but older transactions in the - * system required the previous version of the page to remain available, a special - * deleted-address type cell is written. We'll see that cell on a page if we read from a - * checkpoint including a deleted cell or if we crash/recover and start off from such a - * checkpoint (absent running recovery, a version of the page without the deleted cell - * would eventually have been written). If we crash and recover to a page with a - * deleted-address cell, we want to discard the page from the backing store (it was - * never discarded), and, of course, by definition no earlier transaction will ever need - * it. - * - * Re-create the state of a deleted page. + * If a page was deleted without being read (fast truncate), and the delete committed, + * but older transactions in the system required the previous version of the page to + * remain available or the delete can still be rolled back by RTS, a deleted-address + * type cell is written. We'll see that cell on a page if we read from a checkpoint + * including a deleted cell or if we crash/recover and start off from such a checkpoint. + * Recreate the fast-delete state for the page. */ - ref->addr = unpack.cell; + if (F_ISSET(page->dsk, WT_PAGE_FT_UPDATE)) { + WT_ERR(__wt_calloc_one(session, &ref->ft_info.del)); + *ref->ft_info.del = unpack.page_del; + } WT_REF_SET_STATE(ref, WT_REF_DELETED); - ++refp; /* * If the tree is already dirty and so will be written, mark the page dirty. (We want to @@ -821,8 +817,11 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) */ if (btree->modified) { WT_ERR(__wt_page_modify_init(session, page)); - __wt_page_modify_set(session, page); + __wt_page_only_modify_set(session, page); } + + ref->addr = unpack.cell; + ++refp; break; case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 6c771be480a..b4673fc9955 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -152,12 +152,13 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) WT_ERR(__wt_page_inmem_prepare(session, ref)); skip_read: - switch (previous_state) { - case WT_REF_DELETED: - /* Move all records to a deleted state. */ + /* + * In the case of a fast delete, move all of the page's records to a deleted state based on the + * fast-delete information. Skip for special commands that don't care about an in-memory state. + */ + if (previous_state == WT_REF_DELETED && + !F_ISSET(S2BT(session), WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) WT_ERR(__wt_delete_page_instantiate(session, ref)); - break; - } F_CLR(ref, WT_REF_FLAG_READING); WT_REF_SET_STATE(ref, WT_REF_MEM); diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c index 269a226991e..b9bc0a32086 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c @@ -122,6 +122,8 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_H LF_CLR(WT_PAGE_ENCRYPTED); if (LF_ISSET(WT_PAGE_UNUSED)) LF_CLR(WT_PAGE_UNUSED); + if (LF_ISSET(WT_PAGE_FT_UPDATE)) + LF_CLR(WT_PAGE_FT_UPDATE); if (flags != 0) WT_RET_VRFY(session, "page at %s has invalid flags set: 0x%" PRIx8, tag, flags); diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index ef2464b9d8c..ac3f4fecac4 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -258,8 +258,14 @@ __tree_walk_internal(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp /* Ensure we have a snapshot to check visibility or we only check global visibility. */ WT_ASSERT(session, LF_ISSET(WT_READ_VISIBLE_ALL) || F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT)); - /* All current tree walks skip deleted pages. */ - LF_SET(WT_READ_SKIP_DELETED); + /* + * All tree walks except for rollback-to-stable skip deleted pages. We set read-skip-deleted + * here because we didn't want to add a flag to all of the tree-walk callers, and we make it + * worse because we don't want to add a flag that turns the read-skip-deleted flag off, so we + * test the RTS flag itself. + */ + if (!F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE)) + LF_SET(WT_READ_SKIP_DELETED); /* * !!! @@ -436,7 +442,7 @@ descend: if (skip) break; empty_internal = false; - } else if (current_state == WT_REF_DELETED) { + } else if (LF_ISSET(WT_READ_SKIP_DELETED) && current_state == WT_REF_DELETED) { /* * Try to skip deleted pages visible to us. */ diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index 29253deda1f..625c1d53496 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -132,8 +132,8 @@ __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, WT_ASSERT(session, !WT_IS_HS(S2BT(session)->dhandle) || (*upd_entry == NULL || - ((*upd_entry)->type == WT_UPDATE_TOMBSTONE && (*upd_entry)->txnid == WT_TS_NONE && - (*upd_entry)->start_ts == WT_TS_NONE)) || + ((*upd_entry)->type == WT_UPDATE_TOMBSTONE && + (*upd_entry)->txnid == WT_TXN_NONE && (*upd_entry)->start_ts == WT_TS_NONE)) || (upd_arg->type == WT_UPDATE_TOMBSTONE && upd_arg->start_ts == WT_TS_NONE && upd_arg->next == NULL) || (upd_arg->type == WT_UPDATE_TOMBSTONE && upd_arg->next != NULL && diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index c22422dabf5..8eb9b1c3f49 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -77,6 +77,7 @@ struct __wt_page_header { #define WT_PAGE_EMPTY_V_NONE 0x04u /* Page has no zero-length values */ #define WT_PAGE_ENCRYPTED 0x08u /* Page is encrypted on disk */ #define WT_PAGE_UNUSED 0x10u /* Historic lookaside store page updates, no longer used */ +#define WT_PAGE_FT_UPDATE 0x20u /* Page contains updated fast-truncate information */ uint8_t flags; /* 25: flags */ /* A byte of padding, positioned to be added to the flags. */ @@ -857,20 +858,37 @@ struct __wt_page { * Related information for truncated pages. */ struct __wt_page_deleted { + /* + * Transaction IDs are set when updates are created (before they become visible) and only change + * when marked with WT_TXN_ABORTED. Transaction ID readers expect to copy a transaction ID into + * a local variable and see a stable value. In case a compiler might re-read the transaction ID + * from memory rather than using the local variable, mark the shared transaction IDs volatile to + * prevent unexpected repeated/reordered reads. + */ volatile uint64_t txnid; /* Transaction ID */ wt_timestamp_t timestamp; /* Timestamps */ wt_timestamp_t durable_timestamp; /* - * The state is used for transaction prepare to manage visibility and inheriting prepare state - * to update_list. + * The prepare state is used for transaction prepare to manage visibility and inheriting prepare + * state to update_list. */ - volatile uint8_t prepare_state; /* Prepare state. */ + volatile uint8_t prepare_state; - uint8_t previous_state; /* Previous state */ + /* + * The previous state of the WT_REF; if the fast-truncate transaction is rolled back without the + * page first being instantiated, this is the state to which the WT_REF returns. + */ + uint8_t previous_ref_state; - uint8_t committed; /* Committed */ + /* + * If the fast-truncate transaction has committed. If we're forced to instantiate the page, and + * the committed flag isn't set, we have to create an update structure list for the transaction + * to resolve in a subsequent commit. (This is tricky: if the transaction is rolled back, the + * entire structure is discarded, that is, the flag is set only on commit and not on rollback.) + */ + bool committed; }; /* @@ -1135,6 +1153,13 @@ struct __wt_ikey { * WT_UPDATE structures are formed into a forward-linked list. */ struct __wt_update { + /* + * Transaction IDs are set when updates are created (before they become visible) and only change + * when marked with WT_TXN_ABORTED. Transaction ID readers expect to copy a transaction ID into + * a local variable and see a stable value. In case a compiler might re-read the transaction ID + * from memory rather than using the local variable, mark the shared transaction IDs volatile to + * prevent unexpected repeated/reordered reads. + */ volatile uint64_t txnid; /* transaction ID */ wt_timestamp_t durable_ts; /* timestamps */ diff --git a/src/third_party/wiredtiger/src/include/cell.h b/src/third_party/wiredtiger/src/include/cell.h index 68e11c3d903..351c7ae6464 100644 --- a/src/third_party/wiredtiger/src/include/cell.h +++ b/src/third_party/wiredtiger/src/include/cell.h @@ -132,20 +132,21 @@ */ struct __wt_cell { /* - * Maximum of 71 bytes: + * Maximum of 98 bytes: * 1: cell descriptor byte * 1: prefix compression count * 1: secondary descriptor byte * 36: 4 timestamps (uint64_t encoding, max 9 bytes) * 18: 2 transaction IDs (uint64_t encoding, max 9 bytes) * 9: associated 64-bit value (uint64_t encoding, max 9 bytes) + * 27: fast-delete information (transaction ID, 2 timestamps) * 5: data length (uint32_t encoding, max 5 bytes) * - * This calculation is extremely pessimistic: the prefix compression - * count and 64V value overlap, and the validity window, 64V value - * and data length are all optional in some cases. + * This calculation is pessimistic: the prefix compression count and 64V value overlap, and the + * validity window, 64V value, fast-delete information and data length are all optional in some + * or even most cases. */ - uint8_t __chunk[1 + 1 + 1 + 7 * WT_INTPACK64_MAXSIZE + WT_INTPACK32_MAXSIZE]; + uint8_t __chunk[98]; }; /* AUTOMATIC FLAG VALUE GENERATION START 0 */ @@ -197,6 +198,8 @@ struct __wt_cell_unpack_addr { WT_CELL_COMMON_FIELDS; WT_TIME_AGGREGATE ta; /* Address validity window */ + + WT_PAGE_DELETED page_del; /* Fast-truncate information */ }; /* diff --git a/src/third_party/wiredtiger/src/include/cell_inline.h b/src/third_party/wiredtiger/src/include/cell_inline.h index e593ff17a0d..fd5bb9c5e69 100644 --- a/src/third_party/wiredtiger/src/include/cell_inline.h +++ b/src/third_party/wiredtiger/src/include/cell_inline.h @@ -195,7 +195,7 @@ __cell_pack_addr_validity(WT_SESSION_IMPL *session, uint8_t **pp, WT_TIME_AGGREG */ static inline size_t __wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type, uint64_t recno, - WT_TIME_AGGREGATE *ta, size_t size) + WT_PAGE_DELETED *page_del, WT_TIME_AGGREGATE *ta, size_t size) { uint8_t *p; @@ -205,6 +205,31 @@ __wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type, ui __cell_pack_addr_validity(session, &p, ta); + /* + * If passed fast-delete information, override the cell type and append the fast-delete + * information after the aggregated timestamp information. + */ + if (page_del != NULL) { + /* + * We only fast-truncate leaf pages without overflow items, however, we can write a proxy + * cell for a page, evict and then read the internal page, and then checkpoint is writing it + * again. + */ + WT_ASSERT(session, cell_type == WT_CELL_ADDR_DEL || cell_type == WT_CELL_ADDR_LEAF_NO); + cell_type = WT_CELL_ADDR_DEL; + + /* We should never be in an in-progress prepared state. */ + WT_ASSERT(session, + page_del->prepare_state == WT_PREPARE_INIT || + page_del->prepare_state == WT_PREPARE_RESOLVED); + + if (__wt_process.fast_truncate_2022) { + WT_IGNORE_RET(__wt_vpack_uint(&p, 0, page_del->txnid)); + WT_IGNORE_RET(__wt_vpack_uint(&p, 0, page_del->timestamp)); + WT_IGNORE_RET(__wt_vpack_uint(&p, 0, page_del->durable_timestamp)); + } + } + if (recno == WT_RECNO_OOB) cell->__chunk[0] |= (uint8_t)cell_type; /* Type */ else { @@ -212,6 +237,7 @@ __wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type, ui /* Record number */ WT_IGNORE_RET(__wt_vpack_uint(&p, 0, recno)); } + /* Length */ WT_IGNORE_RET(__wt_vpack_uint(&p, 0, (uint64_t)size)); return (WT_PTRDIFF(p, cell)); @@ -670,6 +696,7 @@ __wt_cell_unpack_safe(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CE WT_TIME_WINDOW tw; } copy; WT_CELL_UNPACK_COMMON *unpack; + WT_PAGE_DELETED *page_del; WT_TIME_AGGREGATE *ta; WT_TIME_WINDOW *tw; uint64_t v; @@ -767,6 +794,7 @@ copy_cell_restart: if ((cell->__chunk[0] & WT_CELL_SECOND_DESC) == 0) break; flags = *p++; /* skip second descriptor byte */ + WT_CELL_LEN_CHK(p, 0); if (LF_ISSET(WT_CELL_PREPARE)) ta->prepare = 1; @@ -810,6 +838,7 @@ copy_cell_restart: if ((cell->__chunk[0] & WT_CELL_SECOND_DESC) == 0) break; flags = *p++; /* skip second descriptor byte */ + WT_CELL_LEN_CHK(p, 0); if (LF_ISSET(WT_CELL_PREPARE)) tw->prepare = 1; @@ -845,6 +874,24 @@ copy_cell_restart: break; } + /* Unpack any fast-truncate information. */ + if (unpack->raw == WT_CELL_ADDR_DEL && F_ISSET(dsk, WT_PAGE_FT_UPDATE)) { + page_del = &unpack_addr->page_del; + WT_RET(__wt_vunpack_uint( + &p, end == NULL ? 0 : WT_PTRDIFF(end, p), (uint64_t *)&page_del->txnid)); + WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &page_del->timestamp)); + WT_RET(__wt_vunpack_uint( + &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &page_del->durable_timestamp)); + page_del->prepare_state = 0; /* No prepare can have been in progress. */ + page_del->previous_ref_state = WT_REF_DISK; /* The leaf page is on disk. */ + page_del->committed = true; /* There is no running transaction. */ + + /* Avoid a stale transaction ID on restart. */ + if (dsk->write_gen <= S2BT(session)->base_write_gen && + !F_ISSET(session, WT_SESSION_DEBUG_DO_NOT_CLEAR_TXN_ID)) + page_del->txnid = WT_TXN_NONE; + } + /* * Check for an RLE count or record number that optionally follows the cell descriptor byte on * column-store variable-length pages. diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index 2723ee14570..c8c69f919a6 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -27,6 +27,8 @@ struct __wt_process { double tsc_nsec_ratio; /* rdtsc ticks to nanoseconds */ bool use_epochtime; /* use expensive time */ + bool fast_truncate_2022; /* fast-truncate fix run-time configuration */ + WT_CACHE_POOL *cache_pool; /* shared cache information */ }; extern WT_PROCESS __wt_process; diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index f7a8c7a085a..e8ac5790016 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -1274,8 +1274,6 @@ extern int __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_IT ) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool insert, WT_REF *leaf, bool leaf_safe, bool *leaf_foundp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_rts_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, - bool visible_all, bool *skipp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) @@ -2206,7 +2204,7 @@ static inline int __wt_vunpack_uint(const uint8_t **pp, size_t maxlen, uint64_t static inline int __wt_write(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, const void *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline size_t __wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type, - uint64_t recno, WT_TIME_AGGREGATE *ta, size_t size) + uint64_t recno, WT_PAGE_DELETED *page_del, WT_TIME_AGGREGATE *ta, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline size_t __wt_cell_pack_copy(WT_SESSION_IMPL *session, WT_CELL *cell, WT_TIME_WINDOW *tw, uint64_t rle, uint64_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -2320,7 +2318,7 @@ static inline void __wt_rec_auximage_copy( static inline void __wt_rec_auxincr( WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size); static inline void __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, - WT_ADDR *addr, WT_CELL_UNPACK_ADDR *vpack, bool proxy_cell, uint64_t recno); + WT_ADDR *addr, WT_CELL_UNPACK_ADDR *vpack, uint64_t recno, WT_PAGE_DELETED *page_del); static inline void __wt_rec_image_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv); static inline void __wt_rec_incr( WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size); diff --git a/src/third_party/wiredtiger/src/include/reconcile_inline.h b/src/third_party/wiredtiger/src/include/reconcile_inline.h index 39209bcc9af..eb983b46e2e 100644 --- a/src/third_party/wiredtiger/src/include/reconcile_inline.h +++ b/src/third_party/wiredtiger/src/include/reconcile_inline.h @@ -323,22 +323,20 @@ __wt_rec_auximage_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t count */ static inline void __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ADDR *addr, - WT_CELL_UNPACK_ADDR *vpack, bool proxy_cell, uint64_t recno) + WT_CELL_UNPACK_ADDR *vpack, uint64_t recno, WT_PAGE_DELETED *page_del) { WT_REC_KV *val; + WT_TIME_AGGREGATE *ta; u_int cell_type; val = &r->v; /* - * Our caller optionally specifies a cell type (deleted proxy cells), otherwise go with what we - * know. + * Caller includes fast-delete information in the case of fast-delete proxy cells, which both + * flags the fast-delete case and provides the additional information written in the parent's + * address cell. */ - if (proxy_cell) - cell_type = WT_CELL_ADDR_DEL; - else if (vpack != NULL) - cell_type = vpack->type; - else { + if (vpack == NULL) { switch (addr->type) { case WT_ADDR_INT: cell_type = WT_CELL_ADDR_INT; @@ -352,16 +350,12 @@ __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ADDR *add break; } WT_ASSERT(session, addr->size != 0); + ta = &addr->ta; + } else { + cell_type = vpack->type; + ta = &vpack->ta; } - - __rec_cell_addr_stats(r, vpack == NULL ? &addr->ta : &vpack->ta); - - /* - * We don't check the address size because we can't store an address on an overflow page: if the - * address won't fit, the overflow page's address won't fit either. This possibility must be - * handled by Btree configuration, we have to disallow internal page sizes that are too small - * with respect to the largest address cookie the underlying block manager might return. - */ + __rec_cell_addr_stats(r, ta); /* * We don't copy the data into the buffer, it's not necessary; just re-point the buffer's @@ -371,16 +365,14 @@ __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ADDR *add WT_ASSERT(session, addr != NULL); val->buf.data = addr->addr; val->buf.size = addr->size; - val->cell_len = - __wt_cell_pack_addr(session, &val->cell, cell_type, recno, &addr->ta, val->buf.size); } else { WT_ASSERT(session, addr == NULL); val->buf.data = vpack->data; val->buf.size = vpack->size; - val->cell_len = - __wt_cell_pack_addr(session, &val->cell, cell_type, recno, &vpack->ta, val->buf.size); } + val->cell_len = + __wt_cell_pack_addr(session, &val->cell, cell_type, recno, page_del, ta, val->buf.size); val->len = val->cell_len + val->buf.size; } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_child.c b/src/third_party/wiredtiger/src/reconcile/rec_child.c index 1980d887a63..50ef393f78c 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_child.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_child.c @@ -17,88 +17,74 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_C { WT_PAGE_DELETED *page_del; WT_TXN *txn; + uint8_t prepare_state; + + *statep = WT_CHILD_IGNORE; - page_del = ref->ft_info.del; txn = session->txn; /* - * Internal pages with child leaf pages in the WT_REF_DELETED state are a special case during - * reconciliation. First, if the deletion was a result of a session truncate call, the deletion - * may not be visible to us. In that case, we proceed as with any change not visible during - * reconciliation by ignoring the change for the purposes of writing the internal page. - * - * In this case, there must be an associated page-deleted structure, and it holds the - * transaction ID we care about. - * - * In some cases, there had better not be any updates we can't see. + * The complicated case is a fast-delete which may not be visible or stable. Otherwise, discard + * any underlying disk blocks and don't write anything. + */ + page_del = ref->ft_info.del; + if (page_del == NULL) + return (ref->addr == NULL ? 0 : __wt_ref_block_free(session, ref)); + + /* + * The fast-delete may not yet be visible to us. In that case, we proceed as with any change not + * visible during reconciliation by ignoring the change for the purposes of writing the internal + * page. * - * A visible update to be in READY state (i.e. not in LOCKED or PREPARED state), for truly - * visible to others. + * We expect the page to be clean after reconciliation. If there are invisible updates, abort + * eviction. */ - if (F_ISSET(r, WT_REC_CLEAN_AFTER_REC | WT_REC_VISIBILITY_ERR) && page_del != NULL && - __wt_page_del_active(session, ref, !F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))) { + if (__wt_page_del_active(session, ref, !F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))) { if (F_ISSET(r, WT_REC_VISIBILITY_ERR)) WT_RET_PANIC(session, EINVAL, "reconciliation illegally skipped an update"); - return (__wt_set_return(session, EBUSY)); + if (F_ISSET(r, WT_REC_CLEAN_AFTER_REC)) + return (__wt_set_return(session, EBUSY)); + *statep = WT_CHILD_ORIGINAL; + return (0); } /* - * Deal with any underlying disk blocks. - * - * First, check to see if there is an address associated with this leaf: if there isn't, we're - * done, the underlying page is already gone. If the page still exists, check for any - * transactions in the system that might want to see the page's state before it's deleted. - * - * If any such transactions exist, we cannot discard the underlying leaf page to the block - * manager because the transaction may eventually read it. However, this write might be part of - * a checkpoint, and should we recover to that checkpoint, we'll need to delete the leaf page, - * else we'd leak it. The solution is to write a proxy cell on the internal page ensuring the - * leaf page is eventually discarded. + * A visible entry can be in a prepared state and checkpoints skip in-progress prepared changes. + * We can't race here, the entry won't be visible to the checkpoint, or will be in a prepared + * state, one or the other. * - * If no such transactions exist, we can discard the leaf page to the block manager and no cell - * needs to be written at all. We do this outside of the underlying tracking routines because - * this action is permanent and irrevocable. (Clearing the address means we've lost track of the - * disk address in a permanent way. This is safe because there's no path to reading the leaf - * page again: if there's ever a read into this part of the name space again, the cache read - * function instantiates an entirely new page.) + * We should never see an in-progress prepare in eviction: when we check to see if an internal + * page can be evicted, we check for an unresolved fast-truncate, which includes a fast-truncate + * in a prepared state, so it's an error to see that during eviction. */ - if (ref->addr != NULL && !__wt_page_del_active(session, ref, true)) { - WT_RET(__wt_ref_block_free(session, ref)); + WT_ORDERED_READ(prepare_state, page_del->prepare_state); + if (prepare_state == WT_PREPARE_INPROGRESS || prepare_state == WT_PREPARE_LOCKED) { + WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); - /* Any fast-truncate information can be freed as soon as the delete is stable. */ - __wt_overwrite_and_free(session, ref->ft_info.del); + *statep = WT_CHILD_ORIGINAL; + return (0); } /* - * If the original page is gone, we can skip the slot on the internal page. + * Deal with underlying disk blocks. If there are readers that might want to see the page's + * state before it's deleted, or the fast-delete can be undone by RTS, we can't discard the + * pages. Write a cell to the internal page with information describing the fast-delete. */ - if (ref->addr == NULL) { - *statep = WT_CHILD_IGNORE; + if (__wt_page_del_active(session, ref, true)) { + *statep = WT_CHILD_PROXY; return (0); } /* - * Internal pages with deletes that aren't stable cannot be evicted, we don't have sufficient - * information to restore the page's information if subsequently read (we wouldn't know which - * transactions should see the original page and which should see the deleted page). - */ - if (F_ISSET(r, WT_REC_EVICT)) - return (__wt_set_return(session, EBUSY)); - - /* If the page cannot be marked clean. */ - r->leave_dirty = true; - - /* - * If the original page cannot be freed, we need to keep a slot on the page to reference it from - * the parent page. - * - * If the delete is not visible in this checkpoint, write the original address normally. - * Otherwise, we have to write a proxy record. If the delete state is not ready, then delete is - * not visible as it is in prepared state. + * Otherwise, we can discard the leaf page to the block manager and no cell needs to be written. + * Done outside of the underlying tracking routines because this action is permanent and + * irrevocable. (Clearing the address means we've lost track of the disk address in a permanent + * way. This is safe because there's no path to reading the leaf page again: if there's ever a + * read into this part of the name space again, the cache read function instantiates an entirely + * new page.) */ - if (!__wt_page_del_active(session, ref, false)) - *statep = WT_CHILD_PROXY; - + WT_RET(__wt_ref_block_free(session, ref)); + __wt_overwrite_and_free(session, ref->ft_info.del); return (0); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c index 3a726134a6f..870e35d3067 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_col.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c @@ -187,7 +187,7 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* Build the value cell. */ addr = &multi->addr; - __wt_rec_cell_build_addr(session, r, addr, NULL, false, r->recno); + __wt_rec_cell_build_addr(session, r, addr, NULL, r->recno, NULL); /* Boundary: split or write the page. */ if (__wt_rec_need_split(r, val->len)) @@ -296,7 +296,7 @@ __wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) __wt_cell_unpack_addr(session, page->dsk, ref->addr, vpack); if (F_ISSET(vpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED)) { /* Need to rebuild the cell with the updated time info. */ - __wt_rec_cell_build_addr(session, r, NULL, vpack, false, ref->ref_recno); + __wt_rec_cell_build_addr(session, r, NULL, vpack, ref->ref_recno, NULL); } else { val->buf.data = ref->addr; val->buf.size = __wt_cell_total_len(vpack); @@ -305,7 +305,7 @@ __wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) } WT_TIME_AGGREGATE_COPY(&ta, &vpack->ta); } else { - __wt_rec_cell_build_addr(session, r, addr, NULL, false, ref->ref_recno); + __wt_rec_cell_build_addr(session, r, addr, NULL, ref->ref_recno, NULL); WT_TIME_AGGREGATE_COPY(&ta, &addr->ta); } WT_CHILD_RELEASE_ERR(session, hazard, ref); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c index a3657631701..cda9c4f7a54 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_row.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c @@ -256,7 +256,7 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) r->cell_zero = false; addr = &multi->addr; - __wt_rec_cell_build_addr(session, r, addr, NULL, false, WT_RECNO_OOB); + __wt_rec_cell_build_addr(session, r, addr, NULL, WT_RECNO_OOB, NULL); /* Boundary: split or write the page. */ if (__wt_rec_need_split(r, key->len + val->len)) @@ -290,7 +290,7 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_PAGE *child; WT_REC_KV *key, *val; WT_REF *ref; - WT_TIME_AGGREGATE ta; + WT_TIME_AGGREGATE ft_ta, ta; size_t size; bool hazard; const void *p; @@ -298,6 +298,7 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) btree = S2BT(session); child = NULL; hazard = false; + WT_TIME_AGGREGATE_INIT(&ft_ta); key = &r->k; kpack = &_kpack; @@ -388,32 +389,28 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* Original child. */ break; case WT_CHILD_PROXY: - /* Deleted child where we write a proxy cell. */ + /* Fast-delete child where we write a proxy cell. */ break; } /* * Build the value cell, the child page's address. Addr points to an on-page cell or an - * off-page WT_ADDR structure. There's a special cell type in the case of page deletion - * requiring a proxy cell, otherwise use the information from the addr or original cell. + * off-page WT_ADDR structure. */ if (__wt_off_page(page, addr)) { - __wt_rec_cell_build_addr(session, r, addr, NULL, state == WT_CHILD_PROXY, WT_RECNO_OOB); + __wt_rec_cell_build_addr(session, r, addr, NULL, WT_RECNO_OOB, + state == WT_CHILD_PROXY ? ref->ft_info.del : NULL); WT_TIME_AGGREGATE_COPY(&ta, &addr->ta); } else { __wt_cell_unpack_addr(session, page->dsk, ref->addr, vpack); - if (F_ISSET(vpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED)) { + if (F_ISSET(vpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED) || state == WT_CHILD_PROXY) { /* * The transaction ids are cleared after restart. Repack the cell with new validity - * to flush the cleared transaction ids. + * to flush the cleared transaction ids. The other use is proxy cells where we need + * to write additional information into the address cell. */ - __wt_rec_cell_build_addr( - session, r, NULL, vpack, state == WT_CHILD_PROXY, WT_RECNO_OOB); - } else if (state == WT_CHILD_PROXY) { - WT_ERR(__wt_buf_set(session, &val->buf, ref->addr, __wt_cell_total_len(vpack))); - __wt_cell_type_reset(session, val->buf.mem, 0, WT_CELL_ADDR_DEL); - val->cell_len = 0; - val->len = val->buf.size; + __wt_rec_cell_build_addr(session, r, NULL, vpack, WT_RECNO_OOB, + state == WT_CHILD_PROXY ? ref->ft_info.del : NULL); } else { val->buf.data = ref->addr; val->buf.size = __wt_cell_total_len(vpack); @@ -422,6 +419,19 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) } WT_TIME_AGGREGATE_COPY(&ta, &vpack->ta); } + + /* + * The fast-truncate is a stop time window and has to be considered in the internal page's + * aggregate information for RTS to find it. + */ + if (state == WT_CHILD_PROXY) { + ft_ta.newest_start_durable_ts = ta.newest_start_durable_ts; + ft_ta.newest_stop_durable_ts = ref->ft_info.del->durable_timestamp; + ft_ta.oldest_start_ts = ta.oldest_start_ts; + ft_ta.newest_txn = ref->ft_info.del->txnid; + ft_ta.newest_stop_ts = ref->ft_info.del->timestamp; + ft_ta.newest_stop_txn = ref->ft_info.del->txnid; + } WT_CHILD_RELEASE_ERR(session, hazard, ref); /* Build key cell. Truncate any 0th key, internal pages don't need 0th keys. */ @@ -438,6 +448,8 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* Copy the key and value onto the page. */ __wt_rec_image_copy(session, r, key); __wt_rec_image_copy(session, r, val); + if (state == WT_CHILD_PROXY) + WT_TIME_AGGREGATE_MERGE(session, &r->cur_ptr->ta, &ft_ta); WT_TIME_AGGREGATE_MERGE(session, &r->cur_ptr->ta, &ta); /* Update compression state. */ diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 758d972f954..4257deb2812 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -1862,17 +1862,18 @@ __rec_split_write_header(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK dsk->type = page->type; dsk->flags = 0; - - /* Set the zero-length value flag in the page header. */ + /* Set the all/none zero-length value flags. */ if (page->type == WT_PAGE_ROW_LEAF) { - F_CLR(dsk, WT_PAGE_EMPTY_V_ALL | WT_PAGE_EMPTY_V_NONE); - if (chunk->entries != 0 && r->all_empty_value) F_SET(dsk, WT_PAGE_EMPTY_V_ALL); if (chunk->entries != 0 && !r->any_empty_value) F_SET(dsk, WT_PAGE_EMPTY_V_NONE); } + /* Set the fast-truncate proxy cell information flag. */ + if (page->type == WT_PAGE_ROW_INT && __wt_process.fast_truncate_2022) + F_SET(dsk, WT_PAGE_FT_UPDATE); + dsk->unused = 0; dsk->version = WT_PAGE_VERSION_TS; diff --git a/src/third_party/wiredtiger/src/support/global.c b/src/third_party/wiredtiger/src/support/global.c index dc038c2b53c..720e6dcd5fc 100644 --- a/src/third_party/wiredtiger/src/support/global.c +++ b/src/third_party/wiredtiger/src/support/global.c @@ -123,6 +123,11 @@ __global_once(void) __wt_process.checksum = wiredtiger_crc32c_func(); __global_calibrate_ticks(); + + /* Run-time configuration. */ +#ifdef WT_STANDALONE_BUILD + __wt_process.fast_truncate_2022 = true; +#endif } /* diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 12c99d31d35..048935d3192 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -1666,7 +1666,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) if (op->type == WT_TXN_OP_REF_DELETE) { WT_REF_LOCK(session, op->u.ref, &previous_state); if (previous_state == WT_REF_DELETED) - op->u.ref->ft_info.del->committed = 1; + op->u.ref->ft_info.del->committed = true; else __wt_free(session, op->u.ref->ft_info.update); WT_REF_UNLOCK(op->u.ref, previous_state); diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index b4112f6f416..d994fcad9f8 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -1097,8 +1097,8 @@ __rollback_get_ref_max_durable_timestamp(WT_SESSION_IMPL *session, WT_TIME_AGGRE /* * __rollback_page_needs_abort -- - * Check whether the page needs rollback. Return true if the page has modifications newer than - * the given timestamp Otherwise return false. + * Check whether the page needs rollback, returning true if the page has modifications newer + * than the given timestamp. */ static bool __rollback_page_needs_abort( @@ -1224,54 +1224,40 @@ __rollback_abort_updates(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r } /* - * __rollback_abort_fast_truncate -- - * Abort fast truncate for an internal page of leaf pages. + * __rollback_to_stable_page_skip -- + * Skip if rollback to stable doesn't require reading this page. */ static int -__rollback_abort_fast_truncate( - WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp) -{ - WT_REF *child_ref; - - WT_INTL_FOREACH_BEGIN (session, ref->page, child_ref) { - /* - * A fast-truncate page is either in the WT_REF_DELETED state (where the WT_PAGE_DELETED - * structure has the timestamp information), or in an in-memory state where it started as a - * fast-truncate page which was then instantiated and the timestamp information moved to the - * individual WT_UPDATE structures. When reviewing internal pages, ignore the second case, - * an instantiated page is handled when the leaf page is visited. - */ - if (child_ref->state == WT_REF_DELETED && child_ref->ft_info.del != NULL && - rollback_timestamp < child_ref->ft_info.del->durable_timestamp) { - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "%p: deleted page rolled back", (void *)child_ref); - WT_RET(__wt_delete_page_rollback(session, child_ref)); - } - } - WT_INTL_FOREACH_END; - return (0); -} - -/* - * __wt_rts_page_skip -- - * Skip if rollback to stable doesn't requires to read this page. - */ -int -__wt_rts_page_skip( +__rollback_to_stable_page_skip( WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool visible_all, bool *skipp) { + WT_PAGE_DELETED *page_del; wt_timestamp_t rollback_timestamp; - rollback_timestamp = *(wt_timestamp_t *)(context); + rollback_timestamp = *(wt_timestamp_t *)context; + WT_UNUSED(visible_all); + *skipp = false; /* Default to reading */ - WT_UNUSED(visible_all); + /* + * Skip fast-truncate operations durable at or before the RTS timestamp (reading the page will + * delete it). A page without fast-truncate timestamp information is an old format page: skip + * them as there's no way to get correct behavior, and skipping them matches historic behavior. + */ + if (ref->state == WT_REF_DELETED) { + page_del = ref->ft_info.del; + if (page_del == NULL || + (__rollback_txn_visible_id(session, page_del->txnid) && + page_del->durable_timestamp <= rollback_timestamp)) + *skipp = true; + return (0); + } - /* If the page state is other than on disk, we want to look at it. */ + /* Otherwise, if the page state is other than on disk, we want to look at it. */ if (ref->state != WT_REF_DISK) return (0); - /* Check whether this ref has any possible updates to be aborted. */ + /* Check whether this on-disk page has any updates to be aborted. */ if (!__rollback_page_needs_abort(session, ref, rollback_timestamp)) { *skipp = true; __wt_verbose_multi( @@ -1294,13 +1280,11 @@ __rollback_to_stable_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollbac /* Walk the tree, marking commits aborted where appropriate. */ ref = NULL; - while ((ret = __wt_tree_walk_custom_skip(session, &ref, __wt_rts_page_skip, &rollback_timestamp, - WT_READ_NO_EVICT | WT_READ_WONT_NEED | WT_READ_VISIBLE_ALL)) == 0 && + while ( + (ret = __wt_tree_walk_custom_skip(session, &ref, __rollback_to_stable_page_skip, + &rollback_timestamp, WT_READ_NO_EVICT | WT_READ_WONT_NEED | WT_READ_VISIBLE_ALL)) == 0 && ref != NULL) - if (F_ISSET(ref, WT_REF_FLAG_INTERNAL)) - WT_WITH_PAGE_INDEX( - session, ret = __rollback_abort_fast_truncate(session, ref, rollback_timestamp)); - else + if (F_ISSET(ref, WT_REF_FLAG_LEAF)) WT_RET(__rollback_abort_updates(session, ref, rollback_timestamp)); return (ret); diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py new file mode 100644 index 00000000000..c88bd3d578b --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +from helper import simulate_crash_restart +from test_rollback_to_stable01 import test_rollback_to_stable_base +from wiredtiger import stat, WT_NOTFOUND +from wtdataset import SimpleDataSet +from wtscenario import make_scenarios + +# test_rollback_to_stable33.py +# Test interaction between fast-delete and RTS. +class test_rollback_to_stable33(test_rollback_to_stable_base): + session_config = 'isolation=snapshot' + conn_config = 'cache_size=50MB,statistics=(all),log=(enabled=false)' + + format_values = [ + ('column', dict(key_format='r', value_format='S', extraconfig='')), + ('column_fix', dict(key_format='r', value_format='8t', + extraconfig=',allocation_size=512,leaf_page_max=512')), + ('integer_row', dict(key_format='i', value_format='S', extraconfig='')), + ('string_row', dict(key_format='S', value_format='S', extraconfig='')), + ] + prepare_values = [ + ('no_prepare', dict(prepare=False)), + ('prepare', dict(prepare=True)), + ] + second_checkpoint_values = [ + ('second_checkpoint', dict(second_checkpoint=True)), + ('no_second_checkpoint', dict(second_checkpoint=False)), + ] + rollback_modes = [ + ('runtime', dict(crash=False)), + ('recovery', dict(crash=True)), + ] + + scenarios = make_scenarios(format_values, prepare_values, second_checkpoint_values, + rollback_modes) + + # Make all the values different so it's easier to see what happens if ranges go missing. + def mkdata(self, basevalue, i): + if self.value_format == '8t': + return basevalue + return basevalue + str(i) + + def evict(self, ds, lo, hi, basevalue): + evict_cursor = self.session.open_cursor(ds.uri, None, "debug=(release_evict)") + self.session.begin_transaction() + + # Evict every 3rd key to make sure we get all the pages but not write them out + # over and over again any more than necessary. FUTURE: improve this to evict + # each page once when we get a suitable interface for that. + for i in range(lo, hi, 3): + evict_cursor.set_key(ds.key(i)) + self.assertEquals(evict_cursor.search(), 0) + self.assertEquals(evict_cursor.get_value(), self.mkdata(basevalue, i)) + evict_cursor.reset() + self.session.rollback_transaction() + evict_cursor.close() + + # Call this checkx to distinguish it from the parent class's default check(). + def checkx(self, ds, nrows, read_ts, basevalue): + self.session.begin_transaction('read_timestamp=' + self.timestamp_str(read_ts)) + cursor = self.session.open_cursor(ds.uri) + i = 1 + for k, v in cursor: + self.assertEqual(v, self.mkdata(basevalue, i)) + self.assertEqual(k, ds.key(i)) + i += 1 + self.session.commit_transaction() + self.assertEqual(i, nrows + 1) + cursor.close() + + def test_rollback_to_stable(self): + # RTS will fail if there are uncommitted prepared transactions, so skip tests of prepare + # with a runtime call to RTS, that doesn't add useful testing scenarios. + if self.prepare and not self.crash: + return + + nrows = 10000 + + # Create a table without logging. + uri = "table:rollback_to_stable33" + ds = SimpleDataSet( + self, uri, 0, key_format=self.key_format, value_format=self.value_format, + config='log=(enabled=false)' + self.extraconfig) + ds.populate() + + if self.value_format == '8t': + valuea = 97 + valueb = 98 + else: + valuea = "aaaaa" * 100 + valueb = "bbbbb" * 100 + + # Pin oldest and stable timestamps to 10. + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(10) + + ',stable_timestamp=' + self.timestamp_str(10)) + + cursor = self.session.open_cursor(uri) + + # Write some baseline data out at time 20. + self.session.begin_transaction() + for i in range(1, nrows + 1): + cursor[ds.key(i)] = self.mkdata(valuea, i) + # Make a new transaction every 97 keys so the transactions don't get huge. + if i % 97 == 0: + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(20)) + self.session.begin_transaction() + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(20)) + + # Write some more data out at time 30. + self.session.begin_transaction() + for i in range(1, nrows + 1): + cursor[ds.key(i)] = self.mkdata(valueb, i) + # Make a new transaction every 97 keys so the transactions don't get huge. + if i % 97 == 0: + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(30)) + self.session.begin_transaction() + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(30)) + + cursor.close() + + # Evict the lot. + self.evict(ds, 1, nrows + 1, valueb) + + # Move stable to 25 (after the baseline data). + self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(25)) + + # Checkpoint. + self.session.checkpoint() + + # Now fast-delete the lot at time 35. + # Use a separate session for this so that if we leave the truncate prepared it + # doesn't obstruct the rest of the test. + session2 = self.conn.open_session() + session2.begin_transaction() + lo_cursor = session2.open_cursor(uri) + lo_cursor.set_key(ds.key(nrows // 2 + 1)) + hi_cursor = session2.open_cursor(uri) + hi_cursor.set_key(ds.key(nrows + 1)) + session2.truncate(None, lo_cursor, hi_cursor, None) + if self.prepare: + session2.prepare_transaction('prepare_timestamp=' + self.timestamp_str(35)) + else: + session2.commit_transaction('commit_timestamp=' + self.timestamp_str(35)) + hi_cursor.close() + lo_cursor.close() + + # Check stats to make sure we fast-deleted at least one page. + # Since VLCS and FLCS do not (yet) support fast-delete, instead assert we didn't. + stat_cursor = self.session.open_cursor('statistics:', None, None) + fastdelete_pages = stat_cursor[stat.conn.rec_page_delete_fast][2] + if self.key_format == 'r': + self.assertEqual(fastdelete_pages, 0) + else: + self.assertGreater(fastdelete_pages, 0) + + if self.second_checkpoint: + # Checkpoint again with the deletion. + self.session.checkpoint() + + # Roll back, either via crashing or by explicit RTS. + if self.crash: + simulate_crash_restart(self, ".", "RESTART") + else: + self.conn.rollback_to_stable() + + # We should see the original data at read-ts 20 and 30. + self.checkx(ds, nrows, 20, valuea) + self.checkx(ds, nrows, 30, valuea) diff --git a/src/third_party/wiredtiger/test/suite/test_truncate09.py b/src/third_party/wiredtiger/test/suite/test_truncate09.py new file mode 100644 index 00000000000..e4ace88710e --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_truncate09.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_truncate09.py +# Check for fast-truncate rollback-to-stable timestamps. + +import wttest +from helper import copy_wiredtiger_home, simulate_crash_restart +from wtdataset import simple_key, simple_value +from wtscenario import make_scenarios + +class test_truncate09(wttest.WiredTigerTestCase): + # We don't test FLCS, missing records return as 0 values. + format_values = [ + ('column', dict(key_format='r')), + ('row_integer', dict(key_format='i')), + ] + scenarios = make_scenarios(format_values) + + def test_truncate09(self): + # Create a large table with lots of pages. + uri = "table:test_truncate09" + format = 'key_format={},value_format=S'.format(self.key_format) + self.session.create(uri, 'allocation_size=512,leaf_page_max=512,' + format) + + cursor = self.session.open_cursor(uri) + for i in range(1, 80000): + cursor[simple_key(cursor, i)] = simple_value(cursor, i) + cursor.close() + + # Force to disk. + self.reopen_conn() + + # Set the oldest timestamp and the stable timestamp. + self.conn.set_timestamp('oldest_timestamp='+ self.timestamp_str(100)) + self.conn.set_timestamp('stable_timestamp='+ self.timestamp_str(100)) + + # Start a transaction. + self.session.begin_transaction() + + # Truncate a chunk. + c1 = self.session.open_cursor(uri, None) + c1.set_key(simple_key(c1, 20000)) + c2 = self.session.open_cursor(uri, None) + c2.set_key(simple_key(c1, 40000)) + self.session.truncate(None, c1, c2, None) + + # Commit the transaction. + self.session.timestamp_transaction('commit_timestamp=' + self.timestamp_str(150)) + self.session.commit_transaction() + + # Move the stable timestamp to make the previous truncate operation permanent. + self.conn.set_timestamp('stable_timestamp='+ self.timestamp_str(200)) + + # Checkpoint + self.session.checkpoint() + + # Start a transaction. + self.session.begin_transaction() + + # Truncate a chunk. + c1.set_key(simple_key(c1, 50000)) + c2.set_key(simple_key(c1, 70000)) + self.session.truncate(None, c1, c2, None) + + # Remove a single row. + c1.set_key(simple_key(c1, 75000)) + c1.remove() + + # Commit the transaction. + self.session.timestamp_transaction('commit_timestamp=' + self.timestamp_str(250)) + self.session.commit_transaction() + + # Checkpoint + self.session.checkpoint() + + # Restart, testing RTS on the copy. + copy_wiredtiger_home(self, ".", "RESTART") + simulate_crash_restart(self, ".", "RESTART") + + # Search for a key in the truncated range which is stabilised, hence should not find it. + cursor = self.session.open_cursor(uri) + cursor.set_key(simple_key(cursor, 30000)) + self.assertNotEqual(cursor.search(), 0) + + # Search for a key in the truncated range which is not stabilised, hence should find it. + cursor.set_key(simple_key(cursor, 60000)) + self.assertEqual(cursor.search(), 0) + + # Search for a removed key which is not stabilised, hence should find it. + cursor.set_key(simple_key(cursor, 75000)) + self.assertEqual(cursor.search(), 0) + +if __name__ == '__main__': + wttest.run() |