/*-
 * Copyright (c) 2014-2017 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __sync_checkpoint_can_skip --
 *	There are limited conditions under which we can skip writing a dirty
 * page during checkpoint.
 */
static inline bool
__sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_MULTI *multi;
	WT_PAGE_MODIFY *mod;
	WT_TXN *txn;
	u_int slot;

	mod = page->modify;
	txn = &session->txn;

	/*
	 * A dirty page may be skipped only when all of the following hold:
	 *
	 * 1. it is a leaf page,
	 * 2. there is a snapshot transaction active (which is the case in
	 *    ordinary application checkpoints but not all internal cases),
	 * 3. the first dirty update on the page is sufficiently recent that
	 *    the checkpoint transaction would skip it anyway.
	 */
	if (WT_PAGE_IS_INTERNAL(page) ||
	    !F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) ||
	    !WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn))
		return (false);

	/*
	 * 4. there must already be an address for every disk block involved.
	 *
	 * The problematic case is a page that was evicted while it had
	 * unresolved updates, leaving some blocks without a disk address:
	 * such pages can't be skipped because the checkpoint needs a write
	 * with valid addresses.
	 *
	 * The page's modification information can change underfoot if the
	 * page is being reconciled, so we'd normally serialize with
	 * reconciliation before reviewing page-modification information.
	 * However, checkpoint is the only valid writer of dirty leaf pages
	 * at this point, so we skip the lock.
	 */
	if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
		for (slot = 0, multi = mod->mod_multi;
		    slot < mod->mod_multi_entries; ++slot, ++multi)
			if (multi->addr.addr == NULL)
				return (false);

	return (true);
}

/*
 * __sync_dup_walk --
 *	Duplicate a tree walk point.
*/ static inline int __sync_dup_walk( WT_SESSION_IMPL *session, WT_REF *walk, uint32_t flags, WT_REF **dupp) { WT_REF *old; bool busy; if ((old = *dupp) != NULL) { *dupp = NULL; WT_RET(__wt_page_release(session, old, flags)); } /* It is okay to duplicate a walk before it starts. */ if (walk == NULL || __wt_ref_is_root(walk)) { *dupp = walk; return (0); } /* Get a duplicate hazard pointer. */ for (;;) { #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, walk, &busy, __func__, __LINE__)); #else WT_RET(__wt_hazard_set(session, walk, &busy)); #endif /* * We already have a hazard pointer, we should generally be able * to get another one. We can get spurious busy errors (e.g., if * eviction is attempting to lock the page. Keep trying: we have * one hazard pointer so we should be able to get another one. */ if (!busy) break; __wt_yield(); } *dupp = walk; return (0); } /* * __sync_evict_page -- * Attempt to evict a page during a checkpoint walk. */ static int __sync_evict_page(WT_SESSION_IMPL *session, WT_REF **walkp, uint32_t flags) { WT_DECL_RET; WT_REF *next, *to_evict; to_evict = *walkp; next = NULL; /* * Get the ref after the page we're trying to evicting. If the * eviction is successful, the walk will continue from here. */ WT_RET(__sync_dup_walk(session, to_evict, flags, &next)); WT_ERR(__wt_tree_walk(session, &next, flags)); WT_ERR(__wt_page_release_evict(session, to_evict)); /* Success: continue the walk at the next page. */ *walkp = next; return (0); err: WT_TRET(__wt_page_release(session, next, flags)); return (ret); } /* * __sync_file -- * Flush pages for a specific file. 
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	struct timespec end, start;
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_REF *prev, *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
	uint64_t oldest_id, saved_pinned_id;
	uint32_t flags;
	bool evict_failed, skip_walk, timer;

	conn = S2C(session);
	btree = S2BT(session);
	/* prev holds the previous walk position for eviction retry. */
	prev = walk = NULL;
	txn = &session->txn;
	evict_failed = skip_walk = false;
	flags = WT_READ_CACHE | WT_READ_NO_GEN;
	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	/* Remember the pinned ID so we know whether to release a snapshot. */
	saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
	timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT);
	if (timer)
		__wt_epoch(session, &start);

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a
		 * high-level lock, serialize so multiple threads don't walk
		 * the tree at the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		/* Re-check under the lock: another thread may have flushed. */
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		/*
		 * Save the oldest transaction ID we need to keep around.
		 * Otherwise, in a busy system, we could be updating pages so
		 * fast that write leaves never catches up. We deliberately
		 * have no transaction running at this point that would keep
		 * the oldest ID from moving forwards as we walk the tree.
		 */
		oldest_id = __wt_txn_oldest_id(session);

		LF_SET(WT_READ_NO_WAIT | WT_READ_SKIP_INTL);
		for (;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it. Don't
			 * try to write hot pages (defined as pages that have
			 * been updated since the write phase leaves started):
			 * checkpoint will have to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session, walk, NULL,
				    WT_REC_CHECKPOINT, NULL));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * If we are flushing a file at read-committed isolation, which
		 * is of particular interest for flushing the metadata to make
		 * a schema-changing operation durable, get a transactional
		 * snapshot now.
		 *
		 * All changes committed up to this point should be included.
		 * We don't update the snapshot in between pages because the
		 * metadata shouldn't have many pages. Instead, read-committed
		 * isolation ensures that all metadata updates completed before
		 * the checkpoint are included.
		 */
		if (txn->isolation == WT_ISO_READ_COMMITTED)
			__wt_txn_get_snapshot(session);

		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a
		 * high-level lock, serialize so multiple threads don't walk
		 * the tree at the same time. We're holding the schema lock,
		 * but need the lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * In the final checkpoint pass, child pages cannot be evicted
		 * from underneath internal pages nor can underlying blocks be
		 * freed until the checkpoint's block lists are stable. Also,
		 * we cannot split child pages into parents unless we know the
		 * final pass will write a consistent view of that namespace.
		 * Set the checkpointing flag to block such actions and wait
		 * for any problematic eviction or page splits to complete.
		 */
		btree->checkpointing = WT_CKPT_PREPARE;
		(void)__wt_gen_next_drain(session, WT_GEN_EVICT);
		btree->checkpointing = WT_CKPT_RUNNING;

		/* Write all dirty in-cache pages. */
		LF_SET(WT_READ_NO_EVICT);

		/* Read pages with lookaside entries and evict them asap. */
		LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED);

		for (;;) {
			/*
			 * When a forced eviction succeeded it already stepped
			 * the walk to the next ref; skip one walk step.
			 */
			if (!skip_walk) {
				WT_ERR(__sync_dup_walk(
				    session, walk, flags, &prev));
				WT_ERR(__wt_tree_walk(session, &walk, flags));
			}
			skip_walk = false;

			if (walk == NULL)
				break;

			/* Skip clean pages. */
			if (!__wt_page_is_modified(walk->page))
				continue;

			/*
			 * Take a local reference to the page modify structure
			 * now that we know the page is dirty. It needs to be
			 * done in this order otherwise the page modify
			 * structure could have been created between taking the
			 * reference and checking modified.
			 */
			page = walk->page;

			/*
			 * Write dirty pages, if we can't skip them. If we skip
			 * a page, mark the tree dirty. The checkpoint marked
			 * it clean and we can't skip future checkpoints until
			 * this page is written.
			 */
			if (__sync_checkpoint_can_skip(session, page)) {
				__wt_tree_modify_set(session);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}

			/*
			 * If the page needs forced eviction, try to do that
			 * now.
			 *
			 * For eviction to have a chance, we first need to move
			 * the walk point to the next page checkpoint will
			 * visit. We want to avoid this code being too special
			 * purpose, so try to reuse the ordinary eviction path.
			 *
			 * If eviction succeeded, it steps to the next ref, so
			 * we have to skip the next walk. If eviction fails,
			 * remember so we don't retry it.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    page->read_gen == WT_READGEN_OLDEST &&
			    !evict_failed) {
				if ((ret = __sync_evict_page(
				    session, &walk, flags)) == 0) {
					evict_failed = false;
					skip_walk = true;
				} else {
					/*
					 * On failure, resume from the saved
					 * previous position so the page is
					 * still written by reconciliation.
					 */
					walk = prev;
					prev = NULL;
					evict_failed = true;
				}
				/* EBUSY from eviction is not fatal. */
				WT_ERR_BUSY_OK(ret);
				continue;
			}

			evict_failed = false;
			WT_ERR(__wt_reconcile(
			    session, walk, NULL, WT_REC_CHECKPOINT, NULL));
		}
		break;
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
		/*
		 * Close/discard are handled by the eviction path (see
		 * __wt_cache_op); reaching here is a programming error.
		 * NOTE(review): this jumps to err, which unlocks flush_lock —
		 * presumably unreachable before the lock is taken; confirm.
		 */
		WT_ERR(__wt_illegal_value(session, NULL));
		break;
	}

	if (timer) {
		__wt_epoch(session, &end);
		__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote: %" PRIu64
		    " leaf pages (%" PRIu64 "B), %" PRIu64
		    " internal pages (%" PRIu64 "B), and took %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_pages, leaf_bytes, internal_pages, internal_bytes,
		    WT_TIMEDIFF_MS(end, start));
	}

err:	/* On error, clear any left-over tree walk. */
	WT_TRET(__wt_page_release(session, walk, flags));
	WT_TRET(__wt_page_release(session, prev, flags));

	/*
	 * If we got a snapshot in order to write pages, and there was no
	 * snapshot active when we started, release it.
	 */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    saved_pinned_id == WT_TXN_NONE)
		__wt_txn_release_snapshot(session);

	/* Clear the checkpoint flag. */
	btree->checkpointing = WT_CKPT_OFF;

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file). Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 &&
	    syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_RET(btree->bm->sync(btree->bm, session, false));

	return (ret);
}

/*
 * __wt_cache_op --
 *	Cache operations.
*/ int __wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op) { WT_DECL_RET; switch (op) { case WT_SYNC_CHECKPOINT: case WT_SYNC_CLOSE: /* * Make sure the checkpoint reference is set for * reconciliation; it's ugly, but drilling a function parameter * path from our callers to the reconciliation of the tree's * root page is going to be worse. */ WT_ASSERT(session, S2BT(session)->ckpt != NULL); break; case WT_SYNC_DISCARD: case WT_SYNC_WRITE_LEAVES: break; } switch (op) { case WT_SYNC_CHECKPOINT: case WT_SYNC_WRITE_LEAVES: ret = __sync_file(session, op); break; case WT_SYNC_CLOSE: case WT_SYNC_DISCARD: ret = __wt_evict_file(session, op); break; } return (ret); }