summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Chenhao Qu <chenhao.qu@mongodb.com> 2022-04-19 23:39:05 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-04-20 00:10:46 +0000
commit: e916372674aa762557c9e30141b277e019d8039f (patch)
tree: 554849d0df1ebcb3ba70140f9b3481e55c7a2a60 /src
parent: 12bbc9f4f1a5d7a4826f2f2847c0e03ebc93103e (diff)
download: mongo-e916372674aa762557c9e30141b277e019d8039f.tar.gz
Import wiredtiger: ad0b418109f05284ecdc343683988388a1a7fd39 from branch mongodb-master
ref: 5013fd0ff6..ad0b418109 for: 6.1.0-rc0 WT-8447 Database corruption from RTS after fast-delete
Diffstat (limited to 'src')
-rw-r--r-- src/third_party/wiredtiger/import.data 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_delete.c 42
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_page.c 39
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_read.c 11
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_walk.c 12
-rw-r--r-- src/third_party/wiredtiger/src/btree/row_modify.c 4
-rw-r--r-- src/third_party/wiredtiger/src/include/btmem.h 35
-rw-r--r-- src/third_party/wiredtiger/src/include/cell.h 13
-rw-r--r-- src/third_party/wiredtiger/src/include/cell_inline.h 49
-rw-r--r-- src/third_party/wiredtiger/src/include/connection.h 2
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h 6
-rw-r--r-- src/third_party/wiredtiger/src/include/reconcile_inline.h 34
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_child.c 104
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_col.c 6
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_row.c 42
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c 9
-rw-r--r-- src/third_party/wiredtiger/src/support/global.c 5
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn.c 2
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c 72
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py 195
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_truncate09.py 120
22 files changed, 590 insertions, 216 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index f3446688f86..f054921907c 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-master",
- "commit": "5013fd0ff65c0f86742c24039feeeda03369a987"
+ "commit": "ad0b418109f05284ecdc343683988388a1a7fd39"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index 1edd7afd169..ccb6225fd89 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -126,7 +126,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
/* Allocate and initialize the page-deleted structure. */
WT_ERR(__wt_calloc_one(session, &ref->ft_info.del));
- ref->ft_info.del->previous_state = previous_state;
+ ref->ft_info.del->previous_ref_state = previous_state;
WT_ERR(__wt_txn_modify_page_delete(session, ref));
@@ -193,15 +193,15 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
* a hazard pointer. We're safe since pages with unresolved transactions aren't going anywhere.
*/
if (current_state == WT_REF_DELETED)
- current_state = ref->ft_info.del->previous_state;
+ current_state = ref->ft_info.del->previous_ref_state;
else if ((updp = ref->ft_info.update) != NULL)
for (; *updp != NULL; ++updp)
(*updp)->txnid = WT_TXN_ABORTED;
/*
- * We didn't set the WT_PAGE_DELETED transaction ID to aborted or discard any WT_UPDATE list,
- * instead, we discard both structures entirely, it has the same effect. It's a single call,
- * they're a union of two pointers.
+ * Don't set the WT_PAGE_DELETED transaction ID to aborted, discard any WT_UPDATE list or set
+ * the committed flag; instead, discard the structures, it has the same effect. It's a single
+ * call, they're a union of two pointers.
*/
__wt_free(session, ref->ft_info.del);
@@ -304,29 +304,24 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
WT_STAT_CONN_DATA_INCR(session, cache_read_deleted);
- /*
- * Give the page a modify structure.
- */
- WT_RET(__wt_page_modify_init(session, page));
+ /* Track the prepared, fast-truncate pages we've had to instantiate. */
+ if (ref->ft_info.del != NULL && ref->ft_info.del->prepare_state != WT_PREPARE_INIT)
+ WT_STAT_CONN_DATA_INCR(session, cache_read_deleted_prepared);
/*
- * We would like to free the deleted pages, but if the tree is newly created, there is nothing
- * that needs to be freed. Furthermore, if the handle is read-only, we are not able to do so.
+ * Give the page a modify structure. If the tree is already dirty and so will be written, mark
+ * the page dirty. (We want to free the deleted pages, but if the handle is read-only or if the
+ * application never modifies the tree, we're not able to do so.)
*/
- if (!btree->original)
+ WT_RET(__wt_page_modify_init(session, page));
+ if (btree->modified)
__wt_page_modify_set(session, page);
- /*
- * Allocate the per-page update array if one doesn't already exist. (It might already exist
- * because deletes are instantiated after the history store table updates.)
- */
+ /* Allocate the per-page update array if one doesn't already exist. */
if (page->entries != 0 && page->modify->mod_row_update == NULL)
WT_PAGE_ALLOC_AND_SWAP(
session, page, page->modify->mod_row_update, upd_array, page->entries);
- if (ref->ft_info.del != NULL && ref->ft_info.del->prepare_state != WT_PREPARE_INIT)
- WT_STAT_CONN_DATA_INCR(session, cache_read_deleted_prepared);
-
/*
* An operation is accessing a "deleted" page, and we're building an in-memory version of the
* page (making it look like all entries in the page were individually updated by a remove
@@ -337,9 +332,10 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
* building might split in the future, so we update that structure to include references to all
* of the update structures we create, so the transaction can abort.
*
- * Second, a truncate call deleted a page and the truncate committed, but an older transaction
- * in the system forced us to keep the old version of the page around, then we crashed and
- * recovered or we're running inside a checkpoint, and now we're being forced to read that page.
+ * Second, a truncate call deleted a page and the truncate resolved, but an older transaction or
+ * the stable timestamp forced us to keep the old version of the page around, and then we
+ * crashed and recovered or we're running inside a checkpoint, and now we're being forced to
+ * read that page.
*
* If there's a page-deleted structure that's not yet globally visible, get a reference and
* migrate transaction ID and timestamp information to the updates (globally visible means the
@@ -350,7 +346,7 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
* other pages.
*/
page_del = __wt_page_del_active(session, ref, true) ? ref->ft_info.del : NULL;
- if (page_del != NULL && page_del->committed == 0) {
+ if (page_del != NULL && !page_del->committed) {
count = 0;
if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
WT_SKIP_FOREACH (ins, insert)
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index bdc946a18a5..1cc2db1294e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -593,13 +593,13 @@ __inmem_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page, bool *preparedp, size_t
page->pg_fix_numtws = entry_num;
/*
- * If we skipped "quite a few" entries (threshold is arbitrary), mark the page dirty so it
- * gets rewritten without them.
+ * If we skipped "quite a few" entries (threshold is arbitrary), and the tree is already
+ * dirty and so will be written, mark the page dirty so it gets rewritten without them.
*/
- if (!F_ISSET(btree, WT_BTREE_READONLY) && skipped >= auxhdr.entries / 4 &&
- skipped >= dsk->u.entries / 100 && skipped > 4) {
+ if (btree->modified && skipped >= auxhdr.entries / 4 && skipped >= dsk->u.entries / 100 &&
+ skipped > 4) {
WT_RET(__wt_page_modify_init(session, page));
- __wt_page_modify_set(session, page);
+ __wt_page_only_modify_set(session, page);
}
break;
@@ -797,22 +797,18 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
break;
case WT_CELL_ADDR_DEL:
/*
- * A cell may reference a deleted leaf page: if a leaf page was deleted without being
- * read (fast truncate), and the deletion committed, but older transactions in the
- * system required the previous version of the page to remain available, a special
- * deleted-address type cell is written. We'll see that cell on a page if we read from a
- * checkpoint including a deleted cell or if we crash/recover and start off from such a
- * checkpoint (absent running recovery, a version of the page without the deleted cell
- * would eventually have been written). If we crash and recover to a page with a
- * deleted-address cell, we want to discard the page from the backing store (it was
- * never discarded), and, of course, by definition no earlier transaction will ever need
- * it.
- *
- * Re-create the state of a deleted page.
+ * If a page was deleted without being read (fast truncate), and the delete committed,
+ * but older transactions in the system required the previous version of the page to
+ * remain available or the delete can still be rolled back by RTS, a deleted-address
+ * type cell is written. We'll see that cell on a page if we read from a checkpoint
+ * including a deleted cell or if we crash/recover and start off from such a checkpoint.
+ * Recreate the fast-delete state for the page.
*/
- ref->addr = unpack.cell;
+ if (F_ISSET(page->dsk, WT_PAGE_FT_UPDATE)) {
+ WT_ERR(__wt_calloc_one(session, &ref->ft_info.del));
+ *ref->ft_info.del = unpack.page_del;
+ }
WT_REF_SET_STATE(ref, WT_REF_DELETED);
- ++refp;
/*
* If the tree is already dirty and so will be written, mark the page dirty. (We want to
@@ -821,8 +817,11 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
*/
if (btree->modified) {
WT_ERR(__wt_page_modify_init(session, page));
- __wt_page_modify_set(session, page);
+ __wt_page_only_modify_set(session, page);
}
+
+ ref->addr = unpack.cell;
+ ++refp;
break;
case WT_CELL_ADDR_INT:
case WT_CELL_ADDR_LEAF:
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index 6c771be480a..b4673fc9955 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -152,12 +152,13 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
WT_ERR(__wt_page_inmem_prepare(session, ref));
skip_read:
- switch (previous_state) {
- case WT_REF_DELETED:
- /* Move all records to a deleted state. */
+ /*
+ * In the case of a fast delete, move all of the page's records to a deleted state based on the
+ * fast-delete information. Skip for special commands that don't care about an in-memory state.
+ */
+ if (previous_state == WT_REF_DELETED &&
+ !F_ISSET(S2BT(session), WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
WT_ERR(__wt_delete_page_instantiate(session, ref));
- break;
- }
F_CLR(ref, WT_REF_FLAG_READING);
WT_REF_SET_STATE(ref, WT_REF_MEM);
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
index 269a226991e..b9bc0a32086 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
@@ -122,6 +122,8 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_H
LF_CLR(WT_PAGE_ENCRYPTED);
if (LF_ISSET(WT_PAGE_UNUSED))
LF_CLR(WT_PAGE_UNUSED);
+ if (LF_ISSET(WT_PAGE_FT_UPDATE))
+ LF_CLR(WT_PAGE_FT_UPDATE);
if (flags != 0)
WT_RET_VRFY(session, "page at %s has invalid flags set: 0x%" PRIx8, tag, flags);
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index ef2464b9d8c..ac3f4fecac4 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -258,8 +258,14 @@ __tree_walk_internal(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp
/* Ensure we have a snapshot to check visibility or we only check global visibility. */
WT_ASSERT(session, LF_ISSET(WT_READ_VISIBLE_ALL) || F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT));
- /* All current tree walks skip deleted pages. */
- LF_SET(WT_READ_SKIP_DELETED);
+ /*
+ * All tree walks except for rollback-to-stable skip deleted pages. We set read-skip-deleted
+ * here because we didn't want to add a flag to all of the tree-walk callers, and we make it
+ * worse because we don't want to add a flag that turns the read-skip-deleted flag off, so we
+ * test the RTS flag itself.
+ */
+ if (!F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))
+ LF_SET(WT_READ_SKIP_DELETED);
/*
* !!!
@@ -436,7 +442,7 @@ descend:
if (skip)
break;
empty_internal = false;
- } else if (current_state == WT_REF_DELETED) {
+ } else if (LF_ISSET(WT_READ_SKIP_DELETED) && current_state == WT_REF_DELETED) {
/*
* Try to skip deleted pages visible to us.
*/
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
index 29253deda1f..625c1d53496 100644
--- a/src/third_party/wiredtiger/src/btree/row_modify.c
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -132,8 +132,8 @@ __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value,
WT_ASSERT(session,
!WT_IS_HS(S2BT(session)->dhandle) ||
(*upd_entry == NULL ||
- ((*upd_entry)->type == WT_UPDATE_TOMBSTONE && (*upd_entry)->txnid == WT_TS_NONE &&
- (*upd_entry)->start_ts == WT_TS_NONE)) ||
+ ((*upd_entry)->type == WT_UPDATE_TOMBSTONE &&
+ (*upd_entry)->txnid == WT_TXN_NONE && (*upd_entry)->start_ts == WT_TS_NONE)) ||
(upd_arg->type == WT_UPDATE_TOMBSTONE && upd_arg->start_ts == WT_TS_NONE &&
upd_arg->next == NULL) ||
(upd_arg->type == WT_UPDATE_TOMBSTONE && upd_arg->next != NULL &&
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index c22422dabf5..8eb9b1c3f49 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -77,6 +77,7 @@ struct __wt_page_header {
#define WT_PAGE_EMPTY_V_NONE 0x04u /* Page has no zero-length values */
#define WT_PAGE_ENCRYPTED 0x08u /* Page is encrypted on disk */
#define WT_PAGE_UNUSED 0x10u /* Historic lookaside store page updates, no longer used */
+#define WT_PAGE_FT_UPDATE 0x20u /* Page contains updated fast-truncate information */
uint8_t flags; /* 25: flags */
/* A byte of padding, positioned to be added to the flags. */
@@ -857,20 +858,37 @@ struct __wt_page {
* Related information for truncated pages.
*/
struct __wt_page_deleted {
+ /*
+ * Transaction IDs are set when updates are created (before they become visible) and only change
+ * when marked with WT_TXN_ABORTED. Transaction ID readers expect to copy a transaction ID into
+ * a local variable and see a stable value. In case a compiler might re-read the transaction ID
+ * from memory rather than using the local variable, mark the shared transaction IDs volatile to
+ * prevent unexpected repeated/reordered reads.
+ */
volatile uint64_t txnid; /* Transaction ID */
wt_timestamp_t timestamp; /* Timestamps */
wt_timestamp_t durable_timestamp;
/*
- * The state is used for transaction prepare to manage visibility and inheriting prepare state
- * to update_list.
+ * The prepare state is used for transaction prepare to manage visibility and inheriting prepare
+ * state to update_list.
*/
- volatile uint8_t prepare_state; /* Prepare state. */
+ volatile uint8_t prepare_state;
- uint8_t previous_state; /* Previous state */
+ /*
+ * The previous state of the WT_REF; if the fast-truncate transaction is rolled back without the
+ * page first being instantiated, this is the state to which the WT_REF returns.
+ */
+ uint8_t previous_ref_state;
- uint8_t committed; /* Committed */
+ /*
+ * If the fast-truncate transaction has committed. If we're forced to instantiate the page, and
+ * the committed flag isn't set, we have to create an update structure list for the transaction
+ * to resolve in a subsequent commit. (This is tricky: if the transaction is rolled back, the
+ * entire structure is discarded, that is, the flag is set only on commit and not on rollback.)
+ */
+ bool committed;
};
/*
@@ -1135,6 +1153,13 @@ struct __wt_ikey {
* WT_UPDATE structures are formed into a forward-linked list.
*/
struct __wt_update {
+ /*
+ * Transaction IDs are set when updates are created (before they become visible) and only change
+ * when marked with WT_TXN_ABORTED. Transaction ID readers expect to copy a transaction ID into
+ * a local variable and see a stable value. In case a compiler might re-read the transaction ID
+ * from memory rather than using the local variable, mark the shared transaction IDs volatile to
+ * prevent unexpected repeated/reordered reads.
+ */
volatile uint64_t txnid; /* transaction ID */
wt_timestamp_t durable_ts; /* timestamps */
diff --git a/src/third_party/wiredtiger/src/include/cell.h b/src/third_party/wiredtiger/src/include/cell.h
index 68e11c3d903..351c7ae6464 100644
--- a/src/third_party/wiredtiger/src/include/cell.h
+++ b/src/third_party/wiredtiger/src/include/cell.h
@@ -132,20 +132,21 @@
*/
struct __wt_cell {
/*
- * Maximum of 71 bytes:
+ * Maximum of 98 bytes:
* 1: cell descriptor byte
* 1: prefix compression count
* 1: secondary descriptor byte
* 36: 4 timestamps (uint64_t encoding, max 9 bytes)
* 18: 2 transaction IDs (uint64_t encoding, max 9 bytes)
* 9: associated 64-bit value (uint64_t encoding, max 9 bytes)
+ * 27: fast-delete information (transaction ID, 2 timestamps)
* 5: data length (uint32_t encoding, max 5 bytes)
*
- * This calculation is extremely pessimistic: the prefix compression
- * count and 64V value overlap, and the validity window, 64V value
- * and data length are all optional in some cases.
+ * This calculation is pessimistic: the prefix compression count and 64V value overlap, and the
+ * validity window, 64V value, fast-delete information and data length are all optional in some
+ * or even most cases.
*/
- uint8_t __chunk[1 + 1 + 1 + 7 * WT_INTPACK64_MAXSIZE + WT_INTPACK32_MAXSIZE];
+ uint8_t __chunk[98];
};
/* AUTOMATIC FLAG VALUE GENERATION START 0 */
@@ -197,6 +198,8 @@ struct __wt_cell_unpack_addr {
WT_CELL_COMMON_FIELDS;
WT_TIME_AGGREGATE ta; /* Address validity window */
+
+ WT_PAGE_DELETED page_del; /* Fast-truncate information */
};
/*
diff --git a/src/third_party/wiredtiger/src/include/cell_inline.h b/src/third_party/wiredtiger/src/include/cell_inline.h
index e593ff17a0d..fd5bb9c5e69 100644
--- a/src/third_party/wiredtiger/src/include/cell_inline.h
+++ b/src/third_party/wiredtiger/src/include/cell_inline.h
@@ -195,7 +195,7 @@ __cell_pack_addr_validity(WT_SESSION_IMPL *session, uint8_t **pp, WT_TIME_AGGREG
*/
static inline size_t
__wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type, uint64_t recno,
- WT_TIME_AGGREGATE *ta, size_t size)
+ WT_PAGE_DELETED *page_del, WT_TIME_AGGREGATE *ta, size_t size)
{
uint8_t *p;
@@ -205,6 +205,31 @@ __wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type, ui
__cell_pack_addr_validity(session, &p, ta);
+ /*
+ * If passed fast-delete information, override the cell type and append the fast-delete
+ * information after the aggregated timestamp information.
+ */
+ if (page_del != NULL) {
+ /*
+ * We only fast-truncate leaf pages without overflow items, however, we can write a proxy
+ * cell for a page, evict and then read the internal page, and then checkpoint is writing it
+ * again.
+ */
+ WT_ASSERT(session, cell_type == WT_CELL_ADDR_DEL || cell_type == WT_CELL_ADDR_LEAF_NO);
+ cell_type = WT_CELL_ADDR_DEL;
+
+ /* We should never be in an in-progress prepared state. */
+ WT_ASSERT(session,
+ page_del->prepare_state == WT_PREPARE_INIT ||
+ page_del->prepare_state == WT_PREPARE_RESOLVED);
+
+ if (__wt_process.fast_truncate_2022) {
+ WT_IGNORE_RET(__wt_vpack_uint(&p, 0, page_del->txnid));
+ WT_IGNORE_RET(__wt_vpack_uint(&p, 0, page_del->timestamp));
+ WT_IGNORE_RET(__wt_vpack_uint(&p, 0, page_del->durable_timestamp));
+ }
+ }
+
if (recno == WT_RECNO_OOB)
cell->__chunk[0] |= (uint8_t)cell_type; /* Type */
else {
@@ -212,6 +237,7 @@ __wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type, ui
/* Record number */
WT_IGNORE_RET(__wt_vpack_uint(&p, 0, recno));
}
+
/* Length */
WT_IGNORE_RET(__wt_vpack_uint(&p, 0, (uint64_t)size));
return (WT_PTRDIFF(p, cell));
@@ -670,6 +696,7 @@ __wt_cell_unpack_safe(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CE
WT_TIME_WINDOW tw;
} copy;
WT_CELL_UNPACK_COMMON *unpack;
+ WT_PAGE_DELETED *page_del;
WT_TIME_AGGREGATE *ta;
WT_TIME_WINDOW *tw;
uint64_t v;
@@ -767,6 +794,7 @@ copy_cell_restart:
if ((cell->__chunk[0] & WT_CELL_SECOND_DESC) == 0)
break;
flags = *p++; /* skip second descriptor byte */
+ WT_CELL_LEN_CHK(p, 0);
if (LF_ISSET(WT_CELL_PREPARE))
ta->prepare = 1;
@@ -810,6 +838,7 @@ copy_cell_restart:
if ((cell->__chunk[0] & WT_CELL_SECOND_DESC) == 0)
break;
flags = *p++; /* skip second descriptor byte */
+ WT_CELL_LEN_CHK(p, 0);
if (LF_ISSET(WT_CELL_PREPARE))
tw->prepare = 1;
@@ -845,6 +874,24 @@ copy_cell_restart:
break;
}
+ /* Unpack any fast-truncate information. */
+ if (unpack->raw == WT_CELL_ADDR_DEL && F_ISSET(dsk, WT_PAGE_FT_UPDATE)) {
+ page_del = &unpack_addr->page_del;
+ WT_RET(__wt_vunpack_uint(
+ &p, end == NULL ? 0 : WT_PTRDIFF(end, p), (uint64_t *)&page_del->txnid));
+ WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &page_del->timestamp));
+ WT_RET(__wt_vunpack_uint(
+ &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &page_del->durable_timestamp));
+ page_del->prepare_state = 0; /* No prepare can have been in progress. */
+ page_del->previous_ref_state = WT_REF_DISK; /* The leaf page is on disk. */
+ page_del->committed = true; /* There is no running transaction. */
+
+ /* Avoid a stale transaction ID on restart. */
+ if (dsk->write_gen <= S2BT(session)->base_write_gen &&
+ !F_ISSET(session, WT_SESSION_DEBUG_DO_NOT_CLEAR_TXN_ID))
+ page_del->txnid = WT_TXN_NONE;
+ }
+
/*
* Check for an RLE count or record number that optionally follows the cell descriptor byte on
* column-store variable-length pages.
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index 2723ee14570..c8c69f919a6 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -27,6 +27,8 @@ struct __wt_process {
double tsc_nsec_ratio; /* rdtsc ticks to nanoseconds */
bool use_epochtime; /* use expensive time */
+ bool fast_truncate_2022; /* fast-truncate fix run-time configuration */
+
WT_CACHE_POOL *cache_pool; /* shared cache information */
};
extern WT_PROCESS __wt_process;
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index f7a8c7a085a..e8ac5790016 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -1274,8 +1274,6 @@ extern int __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_IT
) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool insert, WT_REF *leaf,
bool leaf_safe, bool *leaf_foundp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_rts_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context,
- bool visible_all, bool *skipp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
@@ -2206,7 +2204,7 @@ static inline int __wt_vunpack_uint(const uint8_t **pp, size_t maxlen, uint64_t
static inline int __wt_write(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len,
const void *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline size_t __wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type,
- uint64_t recno, WT_TIME_AGGREGATE *ta, size_t size)
+ uint64_t recno, WT_PAGE_DELETED *page_del, WT_TIME_AGGREGATE *ta, size_t size)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline size_t __wt_cell_pack_copy(WT_SESSION_IMPL *session, WT_CELL *cell,
WT_TIME_WINDOW *tw, uint64_t rle, uint64_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -2320,7 +2318,7 @@ static inline void __wt_rec_auximage_copy(
static inline void __wt_rec_auxincr(
WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size);
static inline void __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r,
- WT_ADDR *addr, WT_CELL_UNPACK_ADDR *vpack, bool proxy_cell, uint64_t recno);
+ WT_ADDR *addr, WT_CELL_UNPACK_ADDR *vpack, uint64_t recno, WT_PAGE_DELETED *page_del);
static inline void __wt_rec_image_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv);
static inline void __wt_rec_incr(
WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size);
diff --git a/src/third_party/wiredtiger/src/include/reconcile_inline.h b/src/third_party/wiredtiger/src/include/reconcile_inline.h
index 39209bcc9af..eb983b46e2e 100644
--- a/src/third_party/wiredtiger/src/include/reconcile_inline.h
+++ b/src/third_party/wiredtiger/src/include/reconcile_inline.h
@@ -323,22 +323,20 @@ __wt_rec_auximage_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t count
*/
static inline void
__wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ADDR *addr,
- WT_CELL_UNPACK_ADDR *vpack, bool proxy_cell, uint64_t recno)
+ WT_CELL_UNPACK_ADDR *vpack, uint64_t recno, WT_PAGE_DELETED *page_del)
{
WT_REC_KV *val;
+ WT_TIME_AGGREGATE *ta;
u_int cell_type;
val = &r->v;
/*
- * Our caller optionally specifies a cell type (deleted proxy cells), otherwise go with what we
- * know.
+ * Caller includes fast-delete information in the case of fast-delete proxy cells, which both
+ * flags the fast-delete case and provides the additional information written in the parent's
+ * address cell.
*/
- if (proxy_cell)
- cell_type = WT_CELL_ADDR_DEL;
- else if (vpack != NULL)
- cell_type = vpack->type;
- else {
+ if (vpack == NULL) {
switch (addr->type) {
case WT_ADDR_INT:
cell_type = WT_CELL_ADDR_INT;
@@ -352,16 +350,12 @@ __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ADDR *add
break;
}
WT_ASSERT(session, addr->size != 0);
+ ta = &addr->ta;
+ } else {
+ cell_type = vpack->type;
+ ta = &vpack->ta;
}
-
- __rec_cell_addr_stats(r, vpack == NULL ? &addr->ta : &vpack->ta);
-
- /*
- * We don't check the address size because we can't store an address on an overflow page: if the
- * address won't fit, the overflow page's address won't fit either. This possibility must be
- * handled by Btree configuration, we have to disallow internal page sizes that are too small
- * with respect to the largest address cookie the underlying block manager might return.
- */
+ __rec_cell_addr_stats(r, ta);
/*
* We don't copy the data into the buffer, it's not necessary; just re-point the buffer's
@@ -371,16 +365,14 @@ __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ADDR *add
WT_ASSERT(session, addr != NULL);
val->buf.data = addr->addr;
val->buf.size = addr->size;
- val->cell_len =
- __wt_cell_pack_addr(session, &val->cell, cell_type, recno, &addr->ta, val->buf.size);
} else {
WT_ASSERT(session, addr == NULL);
val->buf.data = vpack->data;
val->buf.size = vpack->size;
- val->cell_len =
- __wt_cell_pack_addr(session, &val->cell, cell_type, recno, &vpack->ta, val->buf.size);
}
+ val->cell_len =
+ __wt_cell_pack_addr(session, &val->cell, cell_type, recno, page_del, ta, val->buf.size);
val->len = val->cell_len + val->buf.size;
}
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_child.c b/src/third_party/wiredtiger/src/reconcile/rec_child.c
index 1980d887a63..50ef393f78c 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_child.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_child.c
@@ -17,88 +17,74 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_C
{
WT_PAGE_DELETED *page_del;
WT_TXN *txn;
+ uint8_t prepare_state;
+
+ *statep = WT_CHILD_IGNORE;
- page_del = ref->ft_info.del;
txn = session->txn;
/*
- * Internal pages with child leaf pages in the WT_REF_DELETED state are a special case during
- * reconciliation. First, if the deletion was a result of a session truncate call, the deletion
- * may not be visible to us. In that case, we proceed as with any change not visible during
- * reconciliation by ignoring the change for the purposes of writing the internal page.
- *
- * In this case, there must be an associated page-deleted structure, and it holds the
- * transaction ID we care about.
- *
- * In some cases, there had better not be any updates we can't see.
+ * The complicated case is a fast-delete which may not be visible or stable. Otherwise, discard
+ * any underlying disk blocks and don't write anything.
+ */
+ page_del = ref->ft_info.del;
+ if (page_del == NULL)
+ return (ref->addr == NULL ? 0 : __wt_ref_block_free(session, ref));
+
+ /*
+ * The fast-delete may not yet be visible to us. In that case, we proceed as with any change not
+ * visible during reconciliation by ignoring the change for the purposes of writing the internal
+ * page.
*
- * A visible update to be in READY state (i.e. not in LOCKED or PREPARED state), for truly
- * visible to others.
+ * We expect the page to be clean after reconciliation. If there are invisible updates, abort
+ * eviction.
*/
- if (F_ISSET(r, WT_REC_CLEAN_AFTER_REC | WT_REC_VISIBILITY_ERR) && page_del != NULL &&
- __wt_page_del_active(session, ref, !F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))) {
+ if (__wt_page_del_active(session, ref, !F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))) {
if (F_ISSET(r, WT_REC_VISIBILITY_ERR))
WT_RET_PANIC(session, EINVAL, "reconciliation illegally skipped an update");
- return (__wt_set_return(session, EBUSY));
+ if (F_ISSET(r, WT_REC_CLEAN_AFTER_REC))
+ return (__wt_set_return(session, EBUSY));
+ *statep = WT_CHILD_ORIGINAL;
+ return (0);
}
/*
- * Deal with any underlying disk blocks.
- *
- * First, check to see if there is an address associated with this leaf: if there isn't, we're
- * done, the underlying page is already gone. If the page still exists, check for any
- * transactions in the system that might want to see the page's state before it's deleted.
- *
- * If any such transactions exist, we cannot discard the underlying leaf page to the block
- * manager because the transaction may eventually read it. However, this write might be part of
- * a checkpoint, and should we recover to that checkpoint, we'll need to delete the leaf page,
- * else we'd leak it. The solution is to write a proxy cell on the internal page ensuring the
- * leaf page is eventually discarded.
+ * A visible entry can be in a prepared state and checkpoints skip in-progress prepared changes.
+ * We can't race here, the entry won't be visible to the checkpoint, or will be in a prepared
+ * state, one or the other.
*
- * If no such transactions exist, we can discard the leaf page to the block manager and no cell
- * needs to be written at all. We do this outside of the underlying tracking routines because
- * this action is permanent and irrevocable. (Clearing the address means we've lost track of the
- * disk address in a permanent way. This is safe because there's no path to reading the leaf
- * page again: if there's ever a read into this part of the name space again, the cache read
- * function instantiates an entirely new page.)
+ * We should never see an in-progress prepare in eviction: when we check to see if an internal
+ * page can be evicted, we check for an unresolved fast-truncate, which includes a fast-truncate
+ * in a prepared state, so it's an error to see that during eviction.
*/
- if (ref->addr != NULL && !__wt_page_del_active(session, ref, true)) {
- WT_RET(__wt_ref_block_free(session, ref));
+ WT_ORDERED_READ(prepare_state, page_del->prepare_state);
+ if (prepare_state == WT_PREPARE_INPROGRESS || prepare_state == WT_PREPARE_LOCKED) {
+ WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
- /* Any fast-truncate information can be freed as soon as the delete is stable. */
- __wt_overwrite_and_free(session, ref->ft_info.del);
+ *statep = WT_CHILD_ORIGINAL;
+ return (0);
}
/*
- * If the original page is gone, we can skip the slot on the internal page.
+ * Deal with underlying disk blocks. If there are readers that might want to see the page's
+ * state before it's deleted, or the fast-delete can be undone by RTS, we can't discard the
+ * pages. Write a cell to the internal page with information describing the fast-delete.
*/
- if (ref->addr == NULL) {
- *statep = WT_CHILD_IGNORE;
+ if (__wt_page_del_active(session, ref, true)) {
+ *statep = WT_CHILD_PROXY;
return (0);
}
/*
- * Internal pages with deletes that aren't stable cannot be evicted, we don't have sufficient
- * information to restore the page's information if subsequently read (we wouldn't know which
- * transactions should see the original page and which should see the deleted page).
- */
- if (F_ISSET(r, WT_REC_EVICT))
- return (__wt_set_return(session, EBUSY));
-
- /* If the page cannot be marked clean. */
- r->leave_dirty = true;
-
- /*
- * If the original page cannot be freed, we need to keep a slot on the page to reference it from
- * the parent page.
- *
- * If the delete is not visible in this checkpoint, write the original address normally.
- * Otherwise, we have to write a proxy record. If the delete state is not ready, then delete is
- * not visible as it is in prepared state.
+ * Otherwise, we can discard the leaf page to the block manager and no cell needs to be written.
+ * Done outside of the underlying tracking routines because this action is permanent and
+ * irrevocable. (Clearing the address means we've lost track of the disk address in a permanent
+ * way. This is safe because there's no path to reading the leaf page again: if there's ever a
+ * read into this part of the name space again, the cache read function instantiates an entirely
+ * new page.)
*/
- if (!__wt_page_del_active(session, ref, false))
- *statep = WT_CHILD_PROXY;
-
+ WT_RET(__wt_ref_block_free(session, ref));
+ __wt_overwrite_and_free(session, ref->ft_info.del);
return (0);
}
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c
index 3a726134a6f..870e35d3067 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_col.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c
@@ -187,7 +187,7 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* Build the value cell. */
addr = &multi->addr;
- __wt_rec_cell_build_addr(session, r, addr, NULL, false, r->recno);
+ __wt_rec_cell_build_addr(session, r, addr, NULL, r->recno, NULL);
/* Boundary: split or write the page. */
if (__wt_rec_need_split(r, val->len))
@@ -296,7 +296,7 @@ __wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
__wt_cell_unpack_addr(session, page->dsk, ref->addr, vpack);
if (F_ISSET(vpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED)) {
/* Need to rebuild the cell with the updated time info. */
- __wt_rec_cell_build_addr(session, r, NULL, vpack, false, ref->ref_recno);
+ __wt_rec_cell_build_addr(session, r, NULL, vpack, ref->ref_recno, NULL);
} else {
val->buf.data = ref->addr;
val->buf.size = __wt_cell_total_len(vpack);
@@ -305,7 +305,7 @@ __wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
}
WT_TIME_AGGREGATE_COPY(&ta, &vpack->ta);
} else {
- __wt_rec_cell_build_addr(session, r, addr, NULL, false, ref->ref_recno);
+ __wt_rec_cell_build_addr(session, r, addr, NULL, ref->ref_recno, NULL);
WT_TIME_AGGREGATE_COPY(&ta, &addr->ta);
}
WT_CHILD_RELEASE_ERR(session, hazard, ref);
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c
index a3657631701..cda9c4f7a54 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_row.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c
@@ -256,7 +256,7 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
r->cell_zero = false;
addr = &multi->addr;
- __wt_rec_cell_build_addr(session, r, addr, NULL, false, WT_RECNO_OOB);
+ __wt_rec_cell_build_addr(session, r, addr, NULL, WT_RECNO_OOB, NULL);
/* Boundary: split or write the page. */
if (__wt_rec_need_split(r, key->len + val->len))
@@ -290,7 +290,7 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_PAGE *child;
WT_REC_KV *key, *val;
WT_REF *ref;
- WT_TIME_AGGREGATE ta;
+ WT_TIME_AGGREGATE ft_ta, ta;
size_t size;
bool hazard;
const void *p;
@@ -298,6 +298,7 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
btree = S2BT(session);
child = NULL;
hazard = false;
+ WT_TIME_AGGREGATE_INIT(&ft_ta);
key = &r->k;
kpack = &_kpack;
@@ -388,32 +389,28 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* Original child. */
break;
case WT_CHILD_PROXY:
- /* Deleted child where we write a proxy cell. */
+ /* Fast-delete child where we write a proxy cell. */
break;
}
/*
* Build the value cell, the child page's address. Addr points to an on-page cell or an
- * off-page WT_ADDR structure. There's a special cell type in the case of page deletion
- * requiring a proxy cell, otherwise use the information from the addr or original cell.
+ * off-page WT_ADDR structure.
*/
if (__wt_off_page(page, addr)) {
- __wt_rec_cell_build_addr(session, r, addr, NULL, state == WT_CHILD_PROXY, WT_RECNO_OOB);
+ __wt_rec_cell_build_addr(session, r, addr, NULL, WT_RECNO_OOB,
+ state == WT_CHILD_PROXY ? ref->ft_info.del : NULL);
WT_TIME_AGGREGATE_COPY(&ta, &addr->ta);
} else {
__wt_cell_unpack_addr(session, page->dsk, ref->addr, vpack);
- if (F_ISSET(vpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED)) {
+ if (F_ISSET(vpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED) || state == WT_CHILD_PROXY) {
/*
* The transaction ids are cleared after restart. Repack the cell with new validity
- * to flush the cleared transaction ids.
+ * to flush the cleared transaction ids. The other use is proxy cells where we need
+ * to write additional information into the address cell.
*/
- __wt_rec_cell_build_addr(
- session, r, NULL, vpack, state == WT_CHILD_PROXY, WT_RECNO_OOB);
- } else if (state == WT_CHILD_PROXY) {
- WT_ERR(__wt_buf_set(session, &val->buf, ref->addr, __wt_cell_total_len(vpack)));
- __wt_cell_type_reset(session, val->buf.mem, 0, WT_CELL_ADDR_DEL);
- val->cell_len = 0;
- val->len = val->buf.size;
+ __wt_rec_cell_build_addr(session, r, NULL, vpack, WT_RECNO_OOB,
+ state == WT_CHILD_PROXY ? ref->ft_info.del : NULL);
} else {
val->buf.data = ref->addr;
val->buf.size = __wt_cell_total_len(vpack);
@@ -422,6 +419,19 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
}
WT_TIME_AGGREGATE_COPY(&ta, &vpack->ta);
}
+
+ /*
+ * The fast-truncate is a stop time window and has to be considered in the internal page's
+ * aggregate information for RTS to find it.
+ */
+ if (state == WT_CHILD_PROXY) {
+ ft_ta.newest_start_durable_ts = ta.newest_start_durable_ts;
+ ft_ta.newest_stop_durable_ts = ref->ft_info.del->durable_timestamp;
+ ft_ta.oldest_start_ts = ta.oldest_start_ts;
+ ft_ta.newest_txn = ref->ft_info.del->txnid;
+ ft_ta.newest_stop_ts = ref->ft_info.del->timestamp;
+ ft_ta.newest_stop_txn = ref->ft_info.del->txnid;
+ }
WT_CHILD_RELEASE_ERR(session, hazard, ref);
/* Build key cell. Truncate any 0th key, internal pages don't need 0th keys. */
@@ -438,6 +448,8 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* Copy the key and value onto the page. */
__wt_rec_image_copy(session, r, key);
__wt_rec_image_copy(session, r, val);
+ if (state == WT_CHILD_PROXY)
+ WT_TIME_AGGREGATE_MERGE(session, &r->cur_ptr->ta, &ft_ta);
WT_TIME_AGGREGATE_MERGE(session, &r->cur_ptr->ta, &ta);
/* Update compression state. */
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 758d972f954..4257deb2812 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -1862,17 +1862,18 @@ __rec_split_write_header(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK
dsk->type = page->type;
dsk->flags = 0;
-
- /* Set the zero-length value flag in the page header. */
+ /* Set the all/none zero-length value flags. */
if (page->type == WT_PAGE_ROW_LEAF) {
- F_CLR(dsk, WT_PAGE_EMPTY_V_ALL | WT_PAGE_EMPTY_V_NONE);
-
if (chunk->entries != 0 && r->all_empty_value)
F_SET(dsk, WT_PAGE_EMPTY_V_ALL);
if (chunk->entries != 0 && !r->any_empty_value)
F_SET(dsk, WT_PAGE_EMPTY_V_NONE);
}
+ /* Set the fast-truncate proxy cell information flag. */
+ if (page->type == WT_PAGE_ROW_INT && __wt_process.fast_truncate_2022)
+ F_SET(dsk, WT_PAGE_FT_UPDATE);
+
dsk->unused = 0;
dsk->version = WT_PAGE_VERSION_TS;
diff --git a/src/third_party/wiredtiger/src/support/global.c b/src/third_party/wiredtiger/src/support/global.c
index dc038c2b53c..720e6dcd5fc 100644
--- a/src/third_party/wiredtiger/src/support/global.c
+++ b/src/third_party/wiredtiger/src/support/global.c
@@ -123,6 +123,11 @@ __global_once(void)
__wt_process.checksum = wiredtiger_crc32c_func();
__global_calibrate_ticks();
+
+ /* Run-time configuration. */
+#ifdef WT_STANDALONE_BUILD
+ __wt_process.fast_truncate_2022 = true;
+#endif
}
/*
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 12c99d31d35..048935d3192 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -1666,7 +1666,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
if (op->type == WT_TXN_OP_REF_DELETE) {
WT_REF_LOCK(session, op->u.ref, &previous_state);
if (previous_state == WT_REF_DELETED)
- op->u.ref->ft_info.del->committed = 1;
+ op->u.ref->ft_info.del->committed = true;
else
__wt_free(session, op->u.ref->ft_info.update);
WT_REF_UNLOCK(op->u.ref, previous_state);
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index b4112f6f416..d994fcad9f8 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -1097,8 +1097,8 @@ __rollback_get_ref_max_durable_timestamp(WT_SESSION_IMPL *session, WT_TIME_AGGRE
/*
* __rollback_page_needs_abort --
- * Check whether the page needs rollback. Return true if the page has modifications newer than
- * the given timestamp Otherwise return false.
+ * Check whether the page needs rollback, returning true if the page has modifications newer
+ * than the given timestamp.
*/
static bool
__rollback_page_needs_abort(
@@ -1224,54 +1224,40 @@ __rollback_abort_updates(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r
}
/*
- * __rollback_abort_fast_truncate --
- * Abort fast truncate for an internal page of leaf pages.
+ * __rollback_to_stable_page_skip --
+ * Skip if rollback to stable doesn't require reading this page.
*/
static int
-__rollback_abort_fast_truncate(
- WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp)
-{
- WT_REF *child_ref;
-
- WT_INTL_FOREACH_BEGIN (session, ref->page, child_ref) {
- /*
- * A fast-truncate page is either in the WT_REF_DELETED state (where the WT_PAGE_DELETED
- * structure has the timestamp information), or in an in-memory state where it started as a
- * fast-truncate page which was then instantiated and the timestamp information moved to the
- * individual WT_UPDATE structures. When reviewing internal pages, ignore the second case,
- * an instantiated page is handled when the leaf page is visited.
- */
- if (child_ref->state == WT_REF_DELETED && child_ref->ft_info.del != NULL &&
- rollback_timestamp < child_ref->ft_info.del->durable_timestamp) {
- __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
- "%p: deleted page rolled back", (void *)child_ref);
- WT_RET(__wt_delete_page_rollback(session, child_ref));
- }
- }
- WT_INTL_FOREACH_END;
- return (0);
-}
-
-/*
- * __wt_rts_page_skip --
- * Skip if rollback to stable doesn't requires to read this page.
- */
-int
-__wt_rts_page_skip(
+__rollback_to_stable_page_skip(
WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool visible_all, bool *skipp)
{
+ WT_PAGE_DELETED *page_del;
wt_timestamp_t rollback_timestamp;
- rollback_timestamp = *(wt_timestamp_t *)(context);
+ rollback_timestamp = *(wt_timestamp_t *)context;
+ WT_UNUSED(visible_all);
+
*skipp = false; /* Default to reading */
- WT_UNUSED(visible_all);
+ /*
+ * Skip fast-truncate operations durable at or before the RTS timestamp (reading the page will
+ * delete it). A page without fast-truncate timestamp information is an old format page: skip
+ * it as there's no way to get correct behavior, and skipping it matches historic behavior.
+ */
+ if (ref->state == WT_REF_DELETED) {
+ page_del = ref->ft_info.del;
+ if (page_del == NULL ||
+ (__rollback_txn_visible_id(session, page_del->txnid) &&
+ page_del->durable_timestamp <= rollback_timestamp))
+ *skipp = true;
+ return (0);
+ }
- /* If the page state is other than on disk, we want to look at it. */
+ /* Otherwise, if the page state is other than on disk, we want to look at it. */
if (ref->state != WT_REF_DISK)
return (0);
- /* Check whether this ref has any possible updates to be aborted. */
+ /* Check whether this on-disk page has any updates to be aborted. */
if (!__rollback_page_needs_abort(session, ref, rollback_timestamp)) {
*skipp = true;
__wt_verbose_multi(
@@ -1294,13 +1280,11 @@ __rollback_to_stable_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollbac
/* Walk the tree, marking commits aborted where appropriate. */
ref = NULL;
- while ((ret = __wt_tree_walk_custom_skip(session, &ref, __wt_rts_page_skip, &rollback_timestamp,
- WT_READ_NO_EVICT | WT_READ_WONT_NEED | WT_READ_VISIBLE_ALL)) == 0 &&
+ while (
+ (ret = __wt_tree_walk_custom_skip(session, &ref, __rollback_to_stable_page_skip,
+ &rollback_timestamp, WT_READ_NO_EVICT | WT_READ_WONT_NEED | WT_READ_VISIBLE_ALL)) == 0 &&
ref != NULL)
- if (F_ISSET(ref, WT_REF_FLAG_INTERNAL))
- WT_WITH_PAGE_INDEX(
- session, ret = __rollback_abort_fast_truncate(session, ref, rollback_timestamp));
- else
+ if (F_ISSET(ref, WT_REF_FLAG_LEAF))
WT_RET(__rollback_abort_updates(session, ref, rollback_timestamp));
return (ret);
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py
new file mode 100644
index 00000000000..c88bd3d578b
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable34.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+from helper import simulate_crash_restart
+from test_rollback_to_stable01 import test_rollback_to_stable_base
+from wiredtiger import stat, WT_NOTFOUND
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_rollback_to_stable34.py
+# Test interaction between fast-delete and RTS.
+class test_rollback_to_stable33(test_rollback_to_stable_base):
+ session_config = 'isolation=snapshot'
+ conn_config = 'cache_size=50MB,statistics=(all),log=(enabled=false)'
+
+ format_values = [
+ ('column', dict(key_format='r', value_format='S', extraconfig='')),
+ ('column_fix', dict(key_format='r', value_format='8t',
+ extraconfig=',allocation_size=512,leaf_page_max=512')),
+ ('integer_row', dict(key_format='i', value_format='S', extraconfig='')),
+ ('string_row', dict(key_format='S', value_format='S', extraconfig='')),
+ ]
+ prepare_values = [
+ ('no_prepare', dict(prepare=False)),
+ ('prepare', dict(prepare=True)),
+ ]
+ second_checkpoint_values = [
+ ('second_checkpoint', dict(second_checkpoint=True)),
+ ('no_second_checkpoint', dict(second_checkpoint=False)),
+ ]
+ rollback_modes = [
+ ('runtime', dict(crash=False)),
+ ('recovery', dict(crash=True)),
+ ]
+
+ scenarios = make_scenarios(format_values, prepare_values, second_checkpoint_values,
+ rollback_modes)
+
+ # Make all the values different so it's easier to see what happens if ranges go missing.
+ def mkdata(self, basevalue, i):
+ if self.value_format == '8t':
+ return basevalue
+ return basevalue + str(i)
+
+    def evict(self, ds, lo, hi, basevalue):
+        # Search keys in [lo, hi) with a release_evict cursor, checking each value read.
+        evict_cursor = self.session.open_cursor(ds.uri, None, "debug=(release_evict)")
+        self.session.begin_transaction()
+        # Evict every 3rd key to make sure we get all the pages but not write them out
+        # over and over again any more than necessary. FUTURE: improve this to evict
+        # each page once when we get a suitable interface for that.
+        for i in range(lo, hi, 3):
+            evict_cursor.set_key(ds.key(i))
+            self.assertEqual(evict_cursor.search(), 0)
+            self.assertEqual(evict_cursor.get_value(), self.mkdata(basevalue, i))
+            evict_cursor.reset()
+        self.session.rollback_transaction()
+        evict_cursor.close()
+
+ # Call this checkx to distinguish it from the parent class's default check().
+ def checkx(self, ds, nrows, read_ts, basevalue):
+ self.session.begin_transaction('read_timestamp=' + self.timestamp_str(read_ts))
+ cursor = self.session.open_cursor(ds.uri)
+ i = 1
+ for k, v in cursor:
+ self.assertEqual(v, self.mkdata(basevalue, i))
+ self.assertEqual(k, ds.key(i))
+ i += 1
+ self.session.commit_transaction()
+ self.assertEqual(i, nrows + 1)
+ cursor.close()
+
+ def test_rollback_to_stable(self):
+        # RTS will fail if there are uncommitted prepared transactions, so skip tests of prepare
+        # with a runtime call to RTS; that doesn't add useful testing scenarios.
+ if self.prepare and not self.crash:
+ return
+
+ nrows = 10000
+
+ # Create a table without logging.
+ uri = "table:rollback_to_stable33"
+ ds = SimpleDataSet(
+ self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+ config='log=(enabled=false)' + self.extraconfig)
+ ds.populate()
+
+ if self.value_format == '8t':
+ valuea = 97
+ valueb = 98
+ else:
+ valuea = "aaaaa" * 100
+ valueb = "bbbbb" * 100
+
+ # Pin oldest and stable timestamps to 10.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(10) +
+ ',stable_timestamp=' + self.timestamp_str(10))
+
+ cursor = self.session.open_cursor(uri)
+
+ # Write some baseline data out at time 20.
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = self.mkdata(valuea, i)
+ # Make a new transaction every 97 keys so the transactions don't get huge.
+ if i % 97 == 0:
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(20))
+ self.session.begin_transaction()
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(20))
+
+ # Write some more data out at time 30.
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = self.mkdata(valueb, i)
+ # Make a new transaction every 97 keys so the transactions don't get huge.
+ if i % 97 == 0:
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(30))
+ self.session.begin_transaction()
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(30))
+
+ cursor.close()
+
+ # Evict the lot.
+ self.evict(ds, 1, nrows + 1, valueb)
+
+ # Move stable to 25 (after the baseline data).
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(25))
+
+ # Checkpoint.
+ self.session.checkpoint()
+
+        # Now fast-delete the upper half of the table at time 35.
+ # Use a separate session for this so that if we leave the truncate prepared it
+ # doesn't obstruct the rest of the test.
+ session2 = self.conn.open_session()
+ session2.begin_transaction()
+ lo_cursor = session2.open_cursor(uri)
+ lo_cursor.set_key(ds.key(nrows // 2 + 1))
+ hi_cursor = session2.open_cursor(uri)
+ hi_cursor.set_key(ds.key(nrows + 1))
+ session2.truncate(None, lo_cursor, hi_cursor, None)
+ if self.prepare:
+ session2.prepare_transaction('prepare_timestamp=' + self.timestamp_str(35))
+ else:
+ session2.commit_transaction('commit_timestamp=' + self.timestamp_str(35))
+ hi_cursor.close()
+ lo_cursor.close()
+
+ # Check stats to make sure we fast-deleted at least one page.
+ # Since VLCS and FLCS do not (yet) support fast-delete, instead assert we didn't.
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ fastdelete_pages = stat_cursor[stat.conn.rec_page_delete_fast][2]
+ if self.key_format == 'r':
+ self.assertEqual(fastdelete_pages, 0)
+ else:
+ self.assertGreater(fastdelete_pages, 0)
+
+ if self.second_checkpoint:
+ # Checkpoint again with the deletion.
+ self.session.checkpoint()
+
+ # Roll back, either via crashing or by explicit RTS.
+ if self.crash:
+ simulate_crash_restart(self, ".", "RESTART")
+ else:
+ self.conn.rollback_to_stable()
+
+ # We should see the original data at read-ts 20 and 30.
+ self.checkx(ds, nrows, 20, valuea)
+ self.checkx(ds, nrows, 30, valuea)
diff --git a/src/third_party/wiredtiger/test/suite/test_truncate09.py b/src/third_party/wiredtiger/test/suite/test_truncate09.py
new file mode 100644
index 00000000000..e4ace88710e
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_truncate09.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_truncate09.py
+# Check for fast-truncate rollback-to-stable timestamps.
+
+import wttest
+from helper import copy_wiredtiger_home, simulate_crash_restart
+from wtdataset import simple_key, simple_value
+from wtscenario import make_scenarios
+
+class test_truncate09(wttest.WiredTigerTestCase):
+    # We don't test FLCS: missing records return as 0 values.
+ format_values = [
+ ('column', dict(key_format='r')),
+ ('row_integer', dict(key_format='i')),
+ ]
+ scenarios = make_scenarios(format_values)
+
+ def test_truncate09(self):
+ # Create a large table with lots of pages.
+ uri = "table:test_truncate09"
+ format = 'key_format={},value_format=S'.format(self.key_format)
+ self.session.create(uri, 'allocation_size=512,leaf_page_max=512,' + format)
+
+ cursor = self.session.open_cursor(uri)
+ for i in range(1, 80000):
+ cursor[simple_key(cursor, i)] = simple_value(cursor, i)
+ cursor.close()
+
+ # Force to disk.
+ self.reopen_conn()
+
+ # Set the oldest timestamp and the stable timestamp.
+ self.conn.set_timestamp('oldest_timestamp='+ self.timestamp_str(100))
+ self.conn.set_timestamp('stable_timestamp='+ self.timestamp_str(100))
+
+ # Start a transaction.
+ self.session.begin_transaction()
+
+ # Truncate a chunk.
+ c1 = self.session.open_cursor(uri, None)
+ c1.set_key(simple_key(c1, 20000))
+ c2 = self.session.open_cursor(uri, None)
+ c2.set_key(simple_key(c1, 40000))
+ self.session.truncate(None, c1, c2, None)
+
+ # Commit the transaction.
+ self.session.timestamp_transaction('commit_timestamp=' + self.timestamp_str(150))
+ self.session.commit_transaction()
+
+ # Move the stable timestamp to make the previous truncate operation permanent.
+ self.conn.set_timestamp('stable_timestamp='+ self.timestamp_str(200))
+
+ # Checkpoint
+ self.session.checkpoint()
+
+ # Start a transaction.
+ self.session.begin_transaction()
+
+ # Truncate a chunk.
+ c1.set_key(simple_key(c1, 50000))
+ c2.set_key(simple_key(c1, 70000))
+ self.session.truncate(None, c1, c2, None)
+
+ # Remove a single row.
+ c1.set_key(simple_key(c1, 75000))
+ c1.remove()
+
+ # Commit the transaction.
+ self.session.timestamp_transaction('commit_timestamp=' + self.timestamp_str(250))
+ self.session.commit_transaction()
+
+ # Checkpoint
+ self.session.checkpoint()
+
+ # Restart, testing RTS on the copy.
+ copy_wiredtiger_home(self, ".", "RESTART")
+ simulate_crash_restart(self, ".", "RESTART")
+
+ # Search for a key in the truncated range which is stabilised, hence should not find it.
+ cursor = self.session.open_cursor(uri)
+ cursor.set_key(simple_key(cursor, 30000))
+ self.assertNotEqual(cursor.search(), 0)
+
+ # Search for a key in the truncated range which is not stabilised, hence should find it.
+ cursor.set_key(simple_key(cursor, 60000))
+ self.assertEqual(cursor.search(), 0)
+
+ # Search for a removed key which is not stabilised, hence should find it.
+ cursor.set_key(simple_key(cursor, 75000))
+ self.assertEqual(cursor.search(), 0)
+
+if __name__ == '__main__':
+ wttest.run()