/*- * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * * See the file LICENSE for redistribution information. */ #include "wt_internal.h" /* * __sync_checkpoint_can_skip -- * There are limited conditions under which we can skip writing a dirty * page during checkpoint. */ static inline bool __sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_PAGE_MODIFY *mod; WT_MULTI *multi; WT_TXN *txn; u_int i; mod = page->modify; txn = &session->txn; /* * We can skip some dirty pages during a checkpoint. The requirements: * * 1. they must be leaf pages, * 2. there is a snapshot transaction active (which is the case in * ordinary application checkpoints but not all internal cases), * 3. the first dirty update on the page is sufficiently recent the * checkpoint transaction would skip them, * 4. there's already an address for every disk block involved. */ if (WT_PAGE_IS_INTERNAL(page)) return (false); if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) return (false); if (!WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) return (false); /* * The problematic case is when a page was evicted but when there were * unresolved updates and not every block associated with the page has * a disk address. We can't skip such pages because we need a checkpoint * write with valid addresses. * * The page's modification information can change underfoot if the page * is being reconciled, so we'd normally serialize with reconciliation * before reviewing page-modification information. However, checkpoint * is the only valid writer of dirty leaf pages at this point, we skip * the lock. */ if (mod->rec_result == WT_PM_REC_MULTIBLOCK) for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) if (multi->addr.addr == NULL) return (false); return (true); } /* * __sync_file -- * Flush pages for a specific file. */ static int __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) { struct timespec end, start; WT_BTREE *btree; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; uint64_t oldest_id, saved_pinned_id; uint32_t flags; conn = S2C(session); btree = S2BT(session); walk = NULL; txn = &session->txn; saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; flags = WT_READ_CACHE | WT_READ_NO_GEN; internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) __wt_epoch(session, &start); switch (syncop) { case WT_SYNC_WRITE_LEAVES: /* * Write all immediately available, dirty in-cache leaf pages. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. */ if (!btree->modified) return (0); __wt_spin_lock(session, &btree->flush_lock); if (!btree->modified) { __wt_spin_unlock(session, &btree->flush_lock); return (0); } /* * Save the oldest transaction ID we need to keep around. * Otherwise, in a busy system, we could be updating pages so * fast that write leaves never catches up. We deliberately * have no transaction running at this point that would keep * the oldest ID from moving forwards as we walk the tree. */ oldest_id = __wt_txn_oldest_id(session); flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* * Write dirty pages if nobody beat us to it. Don't * try to write hot pages (defined as pages that have * been updated since the write phase leaves started): * checkpoint will have to visit them anyway. */ page = walk->page; if (__wt_page_is_modified(page) && WT_TXNID_LT(page->modify->update_txn, oldest_id)) { if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, WT_CHECKPOINTING, NULL)); } } break; case WT_SYNC_CHECKPOINT: /* * If we are flushing a file at read-committed isolation, which * is of particular interest for flushing the metadata to make * a schema-changing operation durable, get a transactional * snapshot now. * * All changes committed up to this point should be included. * We don't update the snapshot in between pages because the * metadata shouldn't have many pages. Instead, read-committed * isolation ensures that all metadata updates completed before * the checkpoint are included. */ if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); /* * We cannot check the tree modified flag in the case of a * checkpoint, the checkpoint code has already cleared it. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. We're holding the schema lock, but need the * lower-level lock as well. */ __wt_spin_lock(session, &btree->flush_lock); /* * In the final checkpoint pass, child pages cannot be evicted * from underneath internal pages nor can underlying blocks be * freed until the checkpoint's block lists are stable. Also, * we cannot split child pages into parents unless we know the * final pass will write a consistent view of that namespace. * Set the checkpointing flag to block such actions and wait for * any problematic eviction or page splits to complete. */ WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE); /* * Sync for checkpoint allows splits to happen while the queue * is being drained, but not reconciliation. We need to do this, * since draining the queue can take long enough for hot pages * to grow significantly larger than the configured maximum * size. */ F_SET(btree, WT_BTREE_NO_RECONCILE); ret = __wt_evict_file_exclusive_on(session); F_CLR(btree, WT_BTREE_NO_RECONCILE); WT_ERR(ret); __wt_evict_file_exclusive_off(session); WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING); /* Write all dirty in-cache pages. */ flags |= WT_READ_NO_EVICT; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* Skip clean pages. */ if (!__wt_page_is_modified(walk->page)) continue; /* * Take a local reference to the page modify structure * now that we know the page is dirty. It needs to be * done in this order otherwise the page modify * structure could have been created between taking the * reference and checking modified. */ page = walk->page; /* * Write dirty pages, if we can't skip them. If we skip * a page, mark the tree dirty. The checkpoint marked it * clean and we can't skip future checkpoints until this * page is written. */ if (__sync_checkpoint_can_skip(session, page)) { __wt_tree_modify_set(session); continue; } if (WT_PAGE_IS_INTERNAL(page)) { internal_bytes += page->memory_footprint; ++internal_pages; } else { leaf_bytes += page->memory_footprint; ++leaf_pages; } WT_ERR(__wt_reconcile( session, walk, NULL, WT_CHECKPOINTING, NULL)); } break; case WT_SYNC_CLOSE: case WT_SYNC_DISCARD: WT_ERR(__wt_illegal_value(session, NULL)); break; } if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { __wt_epoch(session, &end); __wt_verbose(session, WT_VERB_CHECKPOINT, "__sync_file WT_SYNC_%s wrote: %" PRIu64 " leaf pages (%" PRIu64 "B), %" PRIu64 " internal pages (%" PRIu64 "B), and took %" PRIu64 "ms", syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_pages, leaf_bytes, internal_pages, internal_bytes, WT_TIMEDIFF_MS(end, start)); } err: /* On error, clear any left-over tree walk. */ if (walk != NULL) WT_TRET(__wt_page_release(session, walk, flags)); /* * If we got a snapshot in order to write pages, and there was no * snapshot active when we started, release it. */ if (txn->isolation == WT_ISO_READ_COMMITTED && saved_pinned_id == WT_TXN_NONE) __wt_txn_release_snapshot(session); /* Clear the checkpoint flag and push the change. */ if (btree->checkpointing != WT_CKPT_OFF) WT_PUBLISH(btree->checkpointing, WT_CKPT_OFF); __wt_spin_unlock(session, &btree->flush_lock); /* * Leaves are written before a checkpoint (or as part of a file close, * before checkpointing the file). Start a flush to stable storage, * but don't wait for it. */ if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC)) WT_RET(btree->bm->sync(btree->bm, session, false)); return (ret); } /* * __wt_cache_op -- * Cache operations. */ int __wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op) { WT_DECL_RET; switch (op) { case WT_SYNC_CHECKPOINT: case WT_SYNC_CLOSE: /* * Make sure the checkpoint reference is set for * reconciliation; it's ugly, but drilling a function parameter * path from our callers to the reconciliation of the tree's * root page is going to be worse. */ WT_ASSERT(session, S2BT(session)->ckpt != NULL); break; case WT_SYNC_DISCARD: case WT_SYNC_WRITE_LEAVES: break; } switch (op) { case WT_SYNC_CHECKPOINT: case WT_SYNC_WRITE_LEAVES: ret = __sync_file(session, op); break; case WT_SYNC_CLOSE: case WT_SYNC_DISCARD: ret = __wt_evict_file(session, op); break; } return (ret); }