/*- * Copyright (c) 2014-2019 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * * See the file LICENSE for redistribution information. */ #include "wt_internal.h" /* * Fast-delete support. * * This file contains most of the code that allows WiredTiger to delete pages * of data without reading them into the cache. (This feature is currently * only available for row-store objects.) * * The way cursor truncate works in a row-store object is it explicitly reads * the first and last pages of the truncate range, then walks the tree with a * flag so the tree walk code skips reading eligible pages within the range * and instead just marks them as deleted, by changing their WT_REF state to * WT_REF_DELETED. Pages ineligible for this fast path include pages already * in the cache, having overflow items, or requiring lookaside records. * Ineligible pages are read and have their rows updated/deleted individually. * The transaction for the delete operation is stored in memory referenced by * the WT_REF.page_del field. * * Future cursor walks of the tree will skip the deleted page based on the * transaction stored for the delete, but it gets more complicated if a read is * done using a random key, or a cursor walk is done with a transaction where * the delete is not visible. In those cases, we read the original contents of * the page. The page-read code notices a deleted page is being read, and as * part of the read instantiates the contents of the page, creating a WT_UPDATE * with a deleted operation, in the same transaction as deleted the page. In * other words, the read process makes it appear as if the page was read and * each individual row deleted, exactly as would have happened if the page had * been in the cache all along. * * There's an additional complication to support rollback of the page delete. * When the page was marked deleted, a pointer to the WT_REF was saved in the * deleting session's transaction list and the delete is unrolled by resetting * the WT_REF_DELETED state back to WT_REF_DISK. However, if the page has been * instantiated by some reading thread, that's not enough, each individual row * on the page must have the delete operation reset. If the page split, the * WT_UPDATE lists might have been saved/restored during reconciliation and * appear on multiple pages, and the WT_REF stored in the deleting session's * transaction list is no longer useful. For this reason, when the page is * instantiated by a read, a list of the WT_UPDATE structures on the page is * stored in the WT_REF.page_del field, with the transaction ID, that way the * session committing/unrolling the delete can find all WT_UPDATE structures * that require update. * * One final note: pages can also be marked deleted if emptied and evicted. In * that case, the WT_REF state will be set to WT_REF_DELETED but there will not * be any associated WT_REF.page_del field. These pages are always skipped * during cursor traversal (the page could not have been evicted if there were * updates that weren't globally visible), and if read is forced to instantiate * such a page, it simply creates an empty page from scratch. */ /* * __wt_delete_page -- * If deleting a range, try to delete the page without instantiating it. */ int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { WT_ADDR *ref_addr; WT_DECL_RET; uint32_t previous_state; *skipp = false; /* If we have a clean page in memory, attempt to evict it. */ previous_state = ref->state; if ((previous_state == WT_REF_MEM || previous_state == WT_REF_LIMBO) && WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED)) { if (__wt_page_is_modified(ref->page)) { WT_REF_SET_STATE(ref, previous_state); return (0); } (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1); ret = __wt_evict(session, ref, previous_state, 0); (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1); WT_RET_BUSY_OK(ret); ret = 0; } /* * Fast check to see if it's worth locking, then atomically switch the page's state to lock it. */ previous_state = ref->state; switch (previous_state) { case WT_REF_DISK: case WT_REF_LOOKASIDE: break; default: return (0); } if (!WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED)) return (0); /* * If this WT_REF was previously part of a truncate operation, there * may be existing page-delete information. The structure is only read * while the state is locked, free the previous version. * * Note: changes have been made, we must publish any state change from * this point on. */ if (ref->page_del != NULL) { WT_ASSERT(session, ref->page_del->txnid == WT_TXN_ABORTED); __wt_free(session, ref->page_del->update_list); __wt_free(session, ref->page_del); } /* * We cannot truncate pages that have overflow key/value items as the * overflow blocks have to be discarded. The way we figure that out is * to check the page's cell type, cells for leaf pages without overflow * items are special. * * To look at an on-page cell, we need to look at the parent page, and * that's dangerous, our parent page could change without warning if * the parent page were to split, deepening the tree. We can look at * the parent page itself because the page can't change underneath us. * However, if the parent page splits, our reference address can change; * we don't care what version of it we read, as long as we don't read * it twice. */ WT_ORDERED_READ(ref_addr, ref->addr); if (ref_addr != NULL && (__wt_off_page(ref->home, ref_addr) ? ref_addr->type != WT_ADDR_LEAF_NO : __wt_cell_type_raw((WT_CELL *)ref_addr) != WT_CELL_ADDR_LEAF_NO)) goto err; /* * This action dirties the parent page: mark it dirty now, there's no future reconciliation of * the child leaf page that will dirty it as we write the tree. */ WT_ERR(__wt_page_parent_modify_set(session, ref, false)); /* Allocate and initialize the page-deleted structure. */ WT_ERR(__wt_calloc_one(session, &ref->page_del)); ref->page_del->previous_state = previous_state; WT_ERR(__wt_txn_modify_page_delete(session, ref)); *skipp = true; WT_STAT_CONN_INCR(session, rec_page_delete_fast); WT_STAT_DATA_INCR(session, rec_page_delete_fast); /* Publish the page to its new state, ensuring visibility. */ WT_REF_SET_STATE(ref, WT_REF_DELETED); return (0); err: __wt_free(session, ref->page_del); /* Publish the page to its previous state, ensuring visibility. */ WT_REF_SET_STATE(ref, previous_state); return (ret); } /* * __wt_delete_page_rollback -- * Abort pages that were deleted without being instantiated. */ int __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) { WT_UPDATE **updp; uint64_t sleep_usecs, yield_count; uint32_t current_state; bool locked; /* * If the page is still "deleted", it's as we left it, reset the state to on-disk and we're * done. Otherwise, we expect the page is either instantiated or being instantiated. Loop * because it's possible for the page to return to the deleted state if instantiation fails. */ for (locked = false, sleep_usecs = yield_count = 0;;) { switch (current_state = ref->state) { case WT_REF_DELETED: /* * If the page is still "deleted", it's as we left it, reset the state. */ if (WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, ref->page_del->previous_state)) goto done; break; case WT_REF_LOCKED: /* * A possible state, the page is being instantiated. */ break; case WT_REF_MEM: case WT_REF_SPLIT: if (WT_REF_CAS_STATE(session, ref, current_state, WT_REF_LOCKED)) locked = true; break; case WT_REF_DISK: case WT_REF_LIMBO: case WT_REF_LOOKASIDE: case WT_REF_READING: default: return (__wt_illegal_value(session, current_state)); } if (locked) break; /* * We wait for the change in page state, yield before retrying, and if we've yielded enough * times, start sleeping so we don't burn CPU to no purpose. */ __wt_spin_backoff(&yield_count, &sleep_usecs); WT_STAT_CONN_INCRV(session, page_del_rollback_blocked, sleep_usecs); } /* * We can't use the normal read path to get a copy of the page * because the session may have closed the cursor, we no longer * have the reference to the tree required for a hazard * pointer. We're safe because with unresolved transactions, * the page isn't going anywhere. * * The page is in an in-memory state, which means it * was instantiated at some point. Walk any list of * update structures and abort them. */ WT_ASSERT(session, locked); if ((updp = ref->page_del->update_list) != NULL) for (; *updp != NULL; ++updp) (*updp)->txnid = WT_TXN_ABORTED; WT_REF_SET_STATE(ref, current_state); done: /* * Now mark the truncate aborted: this must come last because after this point there is nothing * preventing the page from being evicted. */ WT_PUBLISH(ref->page_del->txnid, WT_TXN_ABORTED); return (0); } /* * __wt_delete_page_skip -- * If iterating a cursor, skip deleted pages that are either visible to us or globally visible. */ bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) { bool skip; /* * Deleted pages come from two sources: either it's a truncate as * described above, or the page has been emptied by other operations * and eviction deleted it. * * In both cases, the WT_REF state will be WT_REF_DELETED. In the case * of a truncated page, there will be a WT_PAGE_DELETED structure with * the transaction ID of the transaction that deleted the page, and the * page is visible if that transaction ID is visible. In the case of an * empty page, there will be no WT_PAGE_DELETED structure and the delete * is by definition visible, eviction could not have deleted the page if * there were changes on it that were not globally visible. * * We're here because we found a WT_REF state set to WT_REF_DELETED. It * is possible the page is being read into memory right now, though, and * the page could switch to an in-memory state at any time. Lock down * the structure, just to be safe. */ if (ref->page_del == NULL && ref->page_las == NULL) return (true); if (!WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED)) return (false); skip = !__wt_page_del_active(session, ref, visible_all) && !__wt_page_las_active(session, ref); /* * The page_del structure can be freed as soon as the delete is stable: it is only read when the * ref state is locked. It is worth checking every time we come through because once this is * freed, we no longer need synchronization to check the ref. */ if (skip && ref->page_del != NULL && (visible_all || __wt_txn_visible_all(session, ref->page_del->txnid, ref->page_del->timestamp))) { __wt_free(session, ref->page_del->update_list); __wt_free(session, ref->page_del); } WT_REF_SET_STATE(ref, WT_REF_DELETED); return (skip); } /* * __tombstone_update_alloc -- * Allocate and initialize a page-deleted tombstone update structure. */ static int __tombstone_update_alloc( WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del, WT_UPDATE **updp, size_t *sizep) { WT_UPDATE *upd; WT_RET(__wt_update_alloc(session, NULL, &upd, sizep, WT_UPDATE_TOMBSTONE)); /* * Cleared memory matches the lowest possible transaction ID and timestamp, do nothing. */ if (page_del != NULL) { upd->txnid = page_del->txnid; upd->start_ts = page_del->timestamp; upd->durable_ts = page_del->durable_timestamp; upd->prepare_state = page_del->prepare_state; } *updp = upd; return (0); } /* * __wt_delete_page_instantiate -- * Instantiate an entirely deleted row-store leaf page. */ int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; WT_DECL_RET; WT_INSERT *ins; WT_INSERT_HEAD *insert; WT_PAGE *page; WT_PAGE_DELETED *page_del; WT_ROW *rip; WT_UPDATE **upd_array, *upd; size_t size; uint32_t count, i; btree = S2BT(session); page = ref->page; WT_STAT_CONN_INCR(session, cache_read_deleted); WT_STAT_DATA_INCR(session, cache_read_deleted); /* * Give the page a modify structure. * * Mark tree dirty, unless the handle is read-only. * (We'd like to free the deleted pages, but if the handle is read-only, * we're not able to do so.) */ WT_RET(__wt_page_modify_init(session, page)); if (!F_ISSET(btree, WT_BTREE_READONLY)) __wt_page_modify_set(session, page); if (ref->page_del != NULL && ref->page_del->prepare_state != WT_PREPARE_INIT) { WT_STAT_CONN_INCR(session, cache_read_deleted_prepared); WT_STAT_DATA_INCR(session, cache_read_deleted_prepared); } /* * An operation is accessing a "deleted" page, and we're building an * in-memory version of the page (making it look like all entries in * the page were individually updated by a remove operation). There * are two cases where we end up here: * * First, a running transaction used a truncate call to delete the page * without reading it, in which case the page reference includes a * structure with a transaction ID; the page we're building might split * in the future, so we update that structure to include references to * all of the update structures we create, so the transaction can abort. * * Second, a truncate call deleted a page and the truncate committed, * but an older transaction in the system forced us to keep the old * version of the page around, then we crashed and recovered or we're * running inside a checkpoint, and now we're being forced to read that * page. * * Expect a page-deleted structure if there's a running transaction that * needs to be resolved, otherwise, there may not be one (and, if the * transaction has resolved, we can ignore the page-deleted structure). */ page_del = __wt_page_del_active(session, ref, true) ? ref->page_del : NULL; /* * Allocate the per-page update array if one doesn't already exist. (It might already exist * because deletes are instantiated after lookaside table updates.) */ if (page->entries != 0 && page->modify->mod_row_update == NULL) WT_RET(__wt_calloc_def(session, page->entries, &page->modify->mod_row_update)); /* * Allocate the per-reference update array; in the case of instantiating a page deleted in a * running transaction, we need a list of the update structures for the eventual commit or * abort. */ if (page_del != NULL) { count = 0; if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) WT_SKIP_FOREACH (ins, insert) ++count; WT_ROW_FOREACH (page, rip, i) { ++count; if ((insert = WT_ROW_INSERT(page, rip)) != NULL) WT_SKIP_FOREACH (ins, insert) ++count; } WT_RET(__wt_calloc_def(session, count + 1, &page_del->update_list)); } /* Walk the page entries, giving each one a tombstone. */ size = 0; count = 0; upd_array = page->modify->mod_row_update; if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) WT_SKIP_FOREACH (ins, insert) { WT_ERR(__tombstone_update_alloc(session, page_del, &upd, &size)); upd->next = ins->upd; ins->upd = upd; if (page_del != NULL) page_del->update_list[count++] = upd; } WT_ROW_FOREACH (page, rip, i) { WT_ERR(__tombstone_update_alloc(session, page_del, &upd, &size)); upd->next = upd_array[WT_ROW_SLOT(page, rip)]; upd_array[WT_ROW_SLOT(page, rip)] = upd; if (page_del != NULL) page_del->update_list[count++] = upd; if ((insert = WT_ROW_INSERT(page, rip)) != NULL) WT_SKIP_FOREACH (ins, insert) { WT_ERR(__tombstone_update_alloc(session, page_del, &upd, &size)); upd->next = ins->upd; ins->upd = upd; if (page_del != NULL) page_del->update_list[count++] = upd; } } __wt_cache_page_inmem_incr(session, page, size); return (0); err: /* * The page-delete update structure may have existed before we were called, and presumably might * be in use by a running transaction. The list of update structures cannot have been created * before we were called, and should not exist if we exit with an error. */ if (page_del != NULL) __wt_free(session, page_del->update_list); return (ret); }