summary | refs | log | tree | commit | diff
path: root/src/third_party/wiredtiger/src/reconcile/rec_write.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/reconcile/rec_write.c')
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c | 143
1 files changed, 82 insertions, 61 deletions
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index af43a56f877..a62489cb661 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -1205,7 +1205,6 @@ static int
__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
{
- WT_BTREE *btree;
WT_PAGE *page;
WT_UPDATE *first_ts_upd, *first_txn_upd, *first_upd, *upd;
wt_timestamp_t *timestampp;
@@ -1214,7 +1213,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
*updp = NULL;
- btree = S2BT(session);
page = r->page;
first_ts_upd = first_txn_upd = NULL;
max_txn = WT_TXN_NONE;
@@ -1262,15 +1260,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* uncommitted updates). Lookaside eviction can save any
* committed update. Regular eviction checks that the maximum
* transaction ID and timestamp seen are stable.
- *
- * Use the first committed entry we find in the lookaside
- * table.
*/
- if (F_ISSET(btree, WT_BTREE_LOOKASIDE) && !uncommitted) {
- *updp = upd;
- break;
- }
-
if (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
!__wt_txn_upd_visible_all(session, upd) :
!__wt_txn_upd_visible(session, upd)) {
@@ -1326,9 +1316,10 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
/*
* The checkpoint transaction is special. Make sure we never write
- * (metadata) updates from a checkpoint in a concurrent session.
+ * metadata updates from a checkpoint in a concurrent session.
*/
- WT_ASSERT(session, *updp == NULL || (*updp)->txnid == WT_TXN_NONE ||
+ WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) ||
+ *updp == NULL || (*updp)->txnid == WT_TXN_NONE ||
(*updp)->txnid != S2C(session)->txn_global.checkpoint_state.id ||
WT_SESSION_IS_CHECKPOINT(session));
@@ -1352,13 +1343,10 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_UNUSED(first_ts_upd);
timestampp = NULL;
#endif
- if (F_ISSET(btree, WT_BTREE_LOOKASIDE))
- all_visible = !uncommitted;
- else
- all_visible = *updp == first_txn_upd &&
- (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
- __wt_txn_visible_all(session, max_txn, timestampp) :
- __wt_txn_visible(session, max_txn, timestampp));
+ all_visible = *updp == first_txn_upd &&
+ (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
+ __wt_txn_visible_all(session, max_txn, timestampp) :
+ __wt_txn_visible(session, max_txn, timestampp));
if (all_visible)
goto check_original_value;
@@ -1391,8 +1379,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* path is the WT_REC_UPDATE_RESTORE flag, the lookaside table path is
* the WT_REC_LOOKASIDE flag.
*/
- if (!F_ISSET(r, WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE) &&
- !F_ISSET(btree, WT_BTREE_LOOKASIDE))
+ if (!F_ISSET(r, WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE))
return (EBUSY);
if (uncommitted && !F_ISSET(r, WT_REC_UPDATE_RESTORE))
return (EBUSY);
@@ -1405,14 +1392,14 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
#ifdef HAVE_TIMESTAMPS
/* Track the oldest saved timestamp for lookaside. */
- if (F_ISSET(r, WT_REC_LOOKASIDE)) {
+ if (F_ISSET(r, WT_REC_LOOKASIDE))
for (upd = first_upd; upd->next != NULL; upd = upd->next)
- ;
- if (__wt_timestamp_cmp(
- &r->min_saved_timestamp, &upd->timestamp) > 0)
- __wt_timestamp_set(
- &r->min_saved_timestamp, &upd->timestamp);
- }
+ if (upd->txnid != WT_TXN_ABORTED &&
+ upd->txnid != WT_TXN_NONE &&
+ __wt_timestamp_cmp(
+ &upd->timestamp, &r->min_saved_timestamp) < 0)
+ __wt_timestamp_set(
+ &r->min_saved_timestamp, &upd->timestamp);
#endif
check_original_value:
@@ -1659,6 +1646,17 @@ __rec_child_modify(WT_SESSION_IMPL *session,
if (F_ISSET(r, WT_REC_EVICT))
return (EBUSY);
+ /*
+ * A page evicted with lookaside entries may not have
+ * an address, if no updates were visible to
+ * reconciliation. Any child pages in that state
+ * should be ignored.
+ */
+ if (ref->addr == NULL) {
+ *statep = WT_CHILD_IGNORE;
+ WT_CHILD_RELEASE(session, *hazardp, ref);
+ }
+
goto done;
case WT_REF_MEM:
@@ -1996,6 +1994,29 @@ __rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
return (page_size * 2);
}
+#define WT_REC_MAX_SAVED_UPDATES 100
+
+/*
+ * __rec_need_split --
+ * Return true if adding len bytes to the page being built needs a split.
+ *
+ * Row-store leaf pages split unconditionally (len is not consulted) once
+ * r->supd_next reaches WT_REC_MAX_SAVED_UPDATES, so update/restore and
+ * lookaside eviction can split pages even when no visible data grows the
+ * disk image. Otherwise a split is needed when len exceeds r->space_avail
+ * (raw compression) or WT_CHECK_CROSSING_BND(r, len) reports a crossing.
+ */
+static bool
+__rec_need_split(WT_RECONCILE *r, size_t len)
+{
+ if (r->page->type == WT_PAGE_ROW_LEAF &&
+ r->supd_next >= WT_REC_MAX_SAVED_UPDATES)
+ return (true);
+
+ return (r->raw_compression ?
+ len > r->space_avail : WT_CHECK_CROSSING_BND(r, len));
+}
+
/*
* __rec_split_page_size_from_pct --
* Given a split percentage, calculate split page size in bytes.
@@ -2456,7 +2477,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
btree = S2BT(session);
/* Fixed length col store can call with next_len 0 */
- WT_ASSERT(session, next_len == 0 || r->space_avail < next_len);
+ WT_ASSERT(session, next_len == 0 || __rec_need_split(r, next_len));
/*
* We should never split during salvage, and we're about to drop core
@@ -2474,7 +2495,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
* Additionally, grow the buffer to contain the current item if we
* haven't already consumed a reasonable portion of a split chunk.
*/
- if (inuse < r->split_size / 2)
+ if (inuse < r->split_size / 2 && !__rec_need_split(r, 0))
goto done;
/* All page boundaries reset the dictionary. */
@@ -2557,7 +2578,7 @@ __rec_split_crossing_bnd(
WT_BTREE *btree;
size_t min_offset;
- WT_ASSERT(session, WT_CHECK_CROSSING_BND(r, next_len));
+ WT_ASSERT(session, __rec_need_split(r, next_len));
/*
* If crossing the minimum split size boundary, store the boundary
@@ -2566,7 +2587,7 @@ __rec_split_crossing_bnd(
* large enough, just split at this point.
*/
if (WT_CROSSING_MIN_BND(r, next_len) &&
- !WT_CROSSING_SPLIT_BND(r, next_len)) {
+ !WT_CROSSING_SPLIT_BND(r, next_len) && !__rec_need_split(r, 0)) {
btree = S2BT(session);
WT_ASSERT(session, r->cur_ptr->min_offset == 0);
@@ -2640,7 +2661,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
/*
* We can get here if the first key/value pair won't fit.
*/
- if (r->entries == 0)
+ if (r->entries == 0 && !__rec_need_split(r, 0))
goto split_grow;
/*
@@ -4110,13 +4131,13 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
WT_CHILD_RELEASE_ERR(session, hazard, ref);
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (val->len > r->space_avail)
+ if (__rec_need_split(r, val->len)) {
+ if (r->raw_compression)
WT_ERR(__rec_split_raw(session, r, val->len));
- } else
- if (WT_CHECK_CROSSING_BND(r, val->len))
+ else
WT_ERR(__rec_split_crossing_bnd(
session, r, val->len));
+ }
/* Copy the value onto the page. */
__rec_copy_incr(session, r, val);
@@ -4158,13 +4179,13 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
addr->addr, addr->size, __rec_vtype(addr), r->recno);
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (val->len > r->space_avail)
+ if (__rec_need_split(r, val->len)) {
+ if (r->raw_compression)
WT_RET(__rec_split_raw(session, r, val->len));
- } else
- if (WT_CHECK_CROSSING_BND(r, val->len))
+ else
WT_RET(__rec_split_crossing_bnd(
session, r, val->len));
+ }
/* Copy the value onto the page. */
__rec_copy_incr(session, r, val);
@@ -4431,12 +4452,12 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
session, r, value->data, value->size, rle));
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (val->len > r->space_avail)
+ if (__rec_need_split(r, val->len)) {
+ if (r->raw_compression)
WT_RET(__rec_split_raw(session, r, val->len));
- } else
- if (WT_CHECK_CROSSING_BND(r, val->len))
+ else
WT_RET(__rec_split_crossing_bnd(session, r, val->len));
+ }
/* Copy the value onto the page. */
if (!deleted && !overflow_type && btree->dictionary)
@@ -5132,12 +5153,11 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
r->cell_zero = false;
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (key->len + val->len > r->space_avail)
+ if (__rec_need_split(r, key->len + val->len)) {
+ if (r->raw_compression)
WT_ERR(__rec_split_raw(
session, r, key->len + val->len));
- } else
- if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) {
+ else {
/*
* In one path above, we copied address blocks
* from the page rather than building the actual
@@ -5153,6 +5173,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_ERR(__rec_split_crossing_bnd(
session, r, key->len + val->len));
}
+ }
/* Copy the key and value onto the page. */
__rec_copy_incr(session, r, key);
@@ -5202,14 +5223,14 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB);
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (key->len + val->len > r->space_avail)
+ if (__rec_need_split(r, key->len + val->len)) {
+ if (r->raw_compression)
WT_RET(__rec_split_raw(
session, r, key->len + val->len));
- } else
- if (WT_CHECK_CROSSING_BND(r, key->len + val->len))
+ else
WT_RET(__rec_split_crossing_bnd(
session, r, key->len + val->len));
+ }
/* Copy the key and value onto the page. */
__rec_copy_incr(session, r, key);
@@ -5549,12 +5570,11 @@ build:
}
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (key->len + val->len > r->space_avail)
+ if (__rec_need_split(r, key->len + val->len)) {
+ if (r->raw_compression)
WT_ERR(__rec_split_raw(
session, r, key->len + val->len));
- } else
- if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) {
+ else {
/*
* If we copied address blocks from the page
* rather than building the actual key, we have
@@ -5585,6 +5605,7 @@ build:
WT_ERR(__rec_split_crossing_bnd(
session, r, key->len + val->len));
}
+ }
/* Copy the key/value pair onto the page. */
__rec_copy_incr(session, r, key);
@@ -5693,12 +5714,11 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key));
/* Boundary: split or write the page. */
- if (r->raw_compression) {
- if (key->len + val->len > r->space_avail)
+ if (__rec_need_split(r, key->len + val->len)) {
+ if (r->raw_compression)
WT_RET(__rec_split_raw(
session, r, key->len + val->len));
- } else
- if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) {
+ else {
/*
* Turn off prefix compression until a full key
* written to the new page, and (unless already
@@ -5717,6 +5737,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
WT_RET(__rec_split_crossing_bnd(
session, r, key->len + val->len));
}
+ }
/* Copy the key/value pair onto the page. */
__rec_copy_incr(session, r, key);