/*- * Copyright (c) 2014-2015 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * * See the file LICENSE for redistribution information. */ #include "wt_internal.h" /* * Fast-delete support. * * This file contains most of the code that allows WiredTiger to delete pages * of data without reading them into the cache. (This feature is currently * only available for row-store objects.) * * The way cursor truncate works in a row-store object is it explicitly reads * the first and last pages of the truncate range, then walks the tree with a * flag so the cursor walk code marks any page within the range, that hasn't * yet been read and which has no overflow items, as deleted, by changing the * WT_REF state to WT_REF_DELETED. Pages already in the cache or with overflow * items, have their rows updated/deleted individually. The transaction for the * delete operation is stored in memory referenced by the WT_REF.page_del field. * * Future cursor walks of the tree will skip the deleted page based on the * transaction stored for the delete, but it gets more complicated if a read is * done using a random key, or a cursor walk is done with a transaction where * the delete is not visible. In those cases, we read the original contents of * the page. The page-read code notices a deleted page is being read, and as * part of the read instantiates the contents of the page, creating a WT_UPDATE * with a deleted operation, in the same transaction as deleted the page. In * other words, the read process makes it appear as if the page was read and * each individual row deleted, exactly as would have happened if the page had * been in the cache all along. * * There's an additional complication to support rollback of the page delete. * When the page was marked deleted, a pointer to the WT_REF was saved in the * deleting session's transaction list and the delete is unrolled by resetting * the WT_REF_DELETED state back to WT_REF_DISK. However, if the page has been * instantiated by some reading thread, that's not enough, each individual row * on the page must have the delete operation reset. If the page split, the * WT_UPDATE lists might have been saved/restored during reconciliation and * appear on multiple pages, and the WT_REF stored in the deleting session's * transaction list is no longer useful. For this reason, when the page is * instantiated by a read, a list of the WT_UPDATE structures on the page is * stored in the WT_REF.page_del field, with the transaction ID, that way the * session unrolling the delete can find all of the WT_UPDATE structures that * require update. * * One final note: pages can also be marked deleted if emptied and evicted. In * that case, the WT_REF state will be set to WT_REF_DELETED but there will not * be any associated WT_REF.page_del field. These pages are always skipped * during cursor traversal (the page could not have been evicted if there were * updates that weren't globally visible), and if read is forced to instantiate * such a page, it simply creates an empty page from scratch. */ /* * __wt_delete_page -- * If deleting a range, try to delete the page without instantiating it. */ int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { WT_DECL_RET; WT_PAGE *parent; *skipp = false; /* If we have a clean page in memory, attempt to evict it. */ if (ref->state == WT_REF_MEM && __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) { if (__wt_page_is_modified(ref->page)) { WT_PUBLISH(ref->state, WT_REF_MEM); return (0); } (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1); ret = __wt_evict_page(session, ref); (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1); WT_RET_BUSY_OK(ret); } /* * Atomically switch the page's state to lock it. If the page is not * on-disk, other threads may be using it, no fast delete. * * Possible optimization: if the page is already deleted and the delete * is visible to us (the delete has been committed), we could skip the * page instead of instantiating it and figuring out there are no rows * in the page. While that's a huge amount of work to no purpose, it's * unclear optimizing for overlapping range deletes is worth the effort. */ if (ref->state != WT_REF_DISK || !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED)) return (0); /* * We cannot fast-delete pages that have overflow key/value items as * the overflow blocks have to be discarded. The way we figure that * out is to check the on-page cell type for the page, cells for leaf * pages that have no overflow items are special. * * In some cases, the reference address may not reference an on-page * cell (for example, some combination of page splits), in which case * we can't check the original cell value and we fail. * * To look at an on-page cell, we need to look at the parent page, and * that's dangerous, our parent page could change without warning if * the parent page were to split, deepening the tree. It's safe: the * page's reference will always point to some valid page, and if we find * any problems we simply fail the fast-delete optimization. * * !!! * I doubt it's worth the effort, but we could copy the cell's type into * the reference structure, and then we wouldn't need an on-page cell. */ parent = ref->home; if (__wt_off_page(parent, ref->addr) || __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO) goto err; /* * This action dirties the parent page: mark it dirty now, there's no * future reconciliation of the child leaf page that will dirty it as * we write the tree. */ WT_ERR(__wt_page_parent_modify_set(session, ref, false)); /* * Record the change in the transaction structure and set the change's * transaction ID. */ WT_ERR(__wt_calloc_one(session, &ref->page_del)); ref->page_del->txnid = session->txn.id; WT_ERR(__wt_txn_modify_ref(session, ref)); *skipp = true; WT_PUBLISH(ref->state, WT_REF_DELETED); return (0); err: __wt_free(session, ref->page_del); /* * Restore the page to on-disk status, we'll have to instantiate it. */ WT_PUBLISH(ref->state, WT_REF_DISK); return (ret); } /* * __wt_delete_page_rollback -- * Abort pages that were deleted without being instantiated. */ void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) { WT_UPDATE **upd; /* * If the page is still "deleted", it's as we left it, reset the state * to on-disk and we're done. Otherwise, we expect the page is either * instantiated or being instantiated. Loop because it's possible for * the page to return to the deleted state if instantiation fails. */ for (;; __wt_yield()) switch (ref->state) { case WT_REF_DISK: case WT_REF_READING: WT_ASSERT(session, 0); /* Impossible, assert */ break; case WT_REF_DELETED: /* * If the page is still "deleted", it's as we left it, * reset the state. */ if (__wt_atomic_casv32( &ref->state, WT_REF_DELETED, WT_REF_DISK)) return; break; case WT_REF_LOCKED: /* * A possible state, the page is being instantiated. */ break; case WT_REF_MEM: case WT_REF_SPLIT: /* * We can't use the normal read path to get a copy of * the page because the session may have closed the * cursor, we no longer have the reference to the tree * required for a hazard pointer. We're safe because * with unresolved transactions, the page isn't going * anywhere. * * The page is in an in-memory state, walk the list of * update structures and abort them. */ for (upd = ref->page_del->update_list; *upd != NULL; ++upd) (*upd)->txnid = WT_TXN_ABORTED; /* * Discard the memory, the transaction can't abort * twice. */ __wt_free(session, ref->page_del->update_list); __wt_free(session, ref->page_del); return; } } /* * __wt_delete_page_skip -- * If iterating a cursor, skip deleted pages that are either visible to * us or globally visible. */ bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) { bool skip; /* * Deleted pages come from two sources: either it's a fast-delete as * described above, or the page has been emptied by other operations * and eviction deleted it. * * In both cases, the WT_REF state will be WT_REF_DELETED. In the case * of a fast-delete page, there will be a WT_PAGE_DELETED structure with * the transaction ID of the transaction that deleted the page, and the * page is visible if that transaction ID is visible. In the case of an * empty page, there will be no WT_PAGE_DELETED structure and the delete * is by definition visible, eviction could not have deleted the page if * there were changes on it that were not globally visible. * * We're here because we found a WT_REF state set to WT_REF_DELETED. It * is possible the page is being read into memory right now, though, and * the page could switch to an in-memory state at any time. Lock down * the structure, just to be safe. */ if (ref->page_del == NULL) return (true); if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED)) return (false); skip = ref->page_del == NULL || (visible_all ? __wt_txn_visible_all(session, ref->page_del->txnid) : __wt_txn_visible(session, ref->page_del->txnid)); /* * The page_del structure can be freed as soon as the delete is stable: * it is only read when the ref state is WT_REF_DELETED. It is worth * checking every time we come through because once this is freed, we * no longer need synchronization to check the ref. */ if (skip && ref->page_del != NULL && (visible_all || __wt_txn_visible_all(session, ref->page_del->txnid))) { __wt_free(session, ref->page_del->update_list); __wt_free(session, ref->page_del); } WT_PUBLISH(ref->state, WT_REF_DELETED); return (skip); } /* * __wt_delete_page_instantiate -- * Instantiate an entirely deleted row-store leaf page. */ int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_PAGE_DELETED *page_del; WT_UPDATE **upd_array, *upd; size_t size; uint32_t i; btree = S2BT(session); page = ref->page; page_del = ref->page_del; /* * Give the page a modify structure. * * If the tree is already dirty and so will be written, mark the page * dirty. (We'd like to free the deleted pages, but if the handle is * read-only or if the application never modifies the tree, we're not * able to do so.) */ if (btree->modified) { WT_RET(__wt_page_modify_init(session, page)); __wt_page_modify_set(session, page); } /* * An operation is accessing a "deleted" page, and we're building an * in-memory version of the page (making it look like all entries in * the page were individually updated by a remove operation). There * are two cases where we end up here: * * First, a running transaction used a truncate call to delete the page * without reading it, in which case the page reference includes a * structure with a transaction ID; the page we're building might split * in the future, so we update that structure to include references to * all of the update structures we create, so the transaction can abort. * * Second, a truncate call deleted a page and the truncate committed, * but an older transaction in the system forced us to keep the old * version of the page around, then we crashed and recovered, and now * we're being forced to read that page. * * In the first case, we have a page reference structure, in the second * second, we don't. * * Allocate the per-reference update array; in the case of instantiating * a page, deleted by a running transaction that might eventually abort, * we need a list of the update structures so we can do that abort. The * hard case is if a page splits: the update structures might be moved * to different pages, and we still have to find them all for an abort. */ if (page_del != NULL) WT_RET(__wt_calloc_def( session, page->pg_row_entries + 1, &page_del->update_list)); /* Allocate the per-page update array. */ WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &upd_array)); page->pg_row_upd = upd_array; /* * Fill in the per-reference update array with references to update * structures, fill in the per-page update array with references to * deleted items. */ for (i = 0, size = 0; i < page->pg_row_entries; ++i) { WT_ERR(__wt_calloc_one(session, &upd)); WT_UPDATE_DELETED_SET(upd); if (page_del == NULL) upd->txnid = WT_TXN_NONE; /* Globally visible */ else { upd->txnid = page_del->txnid; page_del->update_list[i] = upd; } upd->next = upd_array[i]; upd_array[i] = upd; size += sizeof(WT_UPDATE *) + WT_UPDATE_MEMSIZE(upd); } __wt_cache_page_inmem_incr(session, page, size); return (0); err: /* * There's no need to free the page update structures on error, our * caller will discard the page and do that work for us. We could * similarly leave the per-reference update array alone because it * won't ever be used by any page that's not in-memory, but cleaning * it up makes sense, especially if we come back in to this function * attempting to instantiate this page again. */ if (page_del != NULL) __wt_free(session, page_del->update_list); return (ret); }