author    Michael Widenius <monty@askmonty.org>  2012-08-01 17:27:34 +0300
committer Michael Widenius <monty@askmonty.org>  2012-08-01 17:27:34 +0300
commit    1d0f70c2f894b27e98773a282871d32802f67964 (patch)
tree      833e683e0ced29c4323c29a9d845703d4dfcd81b /storage/innobase/buf/buf0flu.cc
parent    5a86a61219826aadf8d08cbc447fe438f2bf50c3 (diff)
Temporary commit of merge of MariaDB 10.0-base and MySQL 5.6
Diffstat (limited to 'storage/innobase/buf/buf0flu.cc')
-rw-r--r--  storage/innobase/buf/buf0flu.cc  2523
1 file changed, 2523 insertions, 0 deletions
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
new file mode 100644
index 00000000000..023ed766c62
--- /dev/null
+++ b/storage/innobase/buf/buf0flu.cc
@@ -0,0 +1,2523 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0flu.cc
+The database buffer buf_pool flush algorithm
+
+Created 11/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0flu.h"
+
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#ifndef UNIV_HOTBACKUP
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "page0page.h"
+#include "fil0fil.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+#include "os0file.h"
+#include "trx0sys.h"
+#include "srv0mon.h"
+#include "mysql/plugin.h"
+#include "mysql/service_thd_wait.h"
+#include "buf0dblwr.h"
+
+#ifdef UNIV_NONINL
+#include "buf0flu.ic"
+#endif
+
+/**********************************************************************
+These statistics are generated for heuristics used in estimating the
+rate at which we should flush the dirty blocks to avoid bursty IO
+activity. Note that the rate of flushing not only depends on how many
+dirty pages we have in the buffer pool but it is also a function of
+how much redo the workload is generating and at what rate. */
+/* @{ */
+
+/** Number of intervals for which we keep the history of these stats.
+Each interval is 1 second, defined by the rate at which
+srv_error_monitor_thread() calls buf_flush_stat_update(). */
+#define BUF_FLUSH_STAT_N_INTERVAL 20
+
+/** Sampled values of buf_flush_stat_cur.
+Not protected by any mutex. Updated by buf_flush_stat_update(). */
+static buf_flush_stat_t buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];
+
+/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
+static ulint buf_flush_stat_arr_ind;
+
+/** Values at start of the current interval. Reset by
+buf_flush_stat_update(). */
+static buf_flush_stat_t buf_flush_stat_cur;
+
+/** Running sum of past values of buf_flush_stat_cur.
+Updated by buf_flush_stat_update(). Not protected by any mutex. */
+static buf_flush_stat_t buf_flush_stat_sum;
+
+/** Number of pages flushed through non flush_list flushes. */
+static ulint buf_lru_flush_page_count = 0;
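+
+/* A minimal sketch (disabled with #if 0) of how a consumer could use
+the running sums above: averaging buf_flush_stat_sum over the
+BUF_FLUSH_STAT_N_INTERVAL one-second samples gives an approximate
+redo-generation rate per second. The helper name is hypothetical and
+nothing in this file calls it. */
+#if 0
+static lsn_t
+buf_flush_stat_avg_redo_per_sec_sketch(void)
+{
+	/* buf_flush_stat_sum.redo is the sum of the per-interval redo
+	amounts kept in buf_flush_stat_arr[], so dividing by the number
+	of intervals yields an average per second. */
+	return(buf_flush_stat_sum.redo / BUF_FLUSH_STAT_N_INTERVAL);
+}
+#endif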
+
+/** Flag indicating if the page_cleaner is in active state. This flag
+is set to TRUE by the page_cleaner thread when it is spawned and is set
+back to FALSE at shutdown by the page_cleaner as well. Therefore no
+need to protect it by a mutex. It is only ever read by the thread
+doing the shutdown */
+UNIV_INTERN ibool buf_page_cleaner_is_active = FALSE;
+
+/** LRU flush batch is further divided into this chunk size to
+reduce the wait time for the threads waiting for a clean block */
+#define PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE 100
+
+#ifdef UNIV_PFS_THREAD
+UNIV_INTERN mysql_pfs_key_t buf_page_cleaner_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+/** If LRU list of a buf_pool is less than this size then LRU eviction
+should not happen. This is because when we do LRU flushing we also put
+the blocks on free list. If LRU list is very small then we can end up
+in thrashing. */
+#define BUF_LRU_MIN_LEN 256
+
+/* @} */
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+static
+ibool
+buf_flush_validate_low(
+/*===================*/
+ buf_pool_t* buf_pool); /*!< in: Buffer pool instance */
+
+/******************************************************************//**
+Validates the flush list some of the time.
+@return TRUE if ok or the check was skipped */
+static
+ibool
+buf_flush_validate_skip(
+/*====================*/
+ buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
+{
+/** Try buf_flush_validate_low() every this many times */
+# define BUF_FLUSH_VALIDATE_SKIP 23
+
+ /** The buf_flush_validate_low() call skip counter.
+ Use a signed type because of the race condition below. */
+ static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+
+ /* There is a race condition below, but it does not matter,
+ because this call is only for heuristic purposes. We want to
+ reduce the call frequency of the costly buf_flush_validate_low()
+ check in debug builds. */
+ if (--buf_flush_validate_count > 0) {
+ return(TRUE);
+ }
+
+ buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+ return(buf_flush_validate_low(buf_pool));
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/******************************************************************//**
+Inserts a block in the flush_rbt and returns a pointer to its
+predecessor or NULL if no predecessor. The ordering is maintained
+on the basis of the <oldest_modification, space, offset> key.
+@return pointer to the predecessor or NULL if no predecessor. */
+static
+buf_page_t*
+buf_flush_insert_in_flush_rbt(
+/*==========================*/
+ buf_page_t* bpage) /*!< in: bpage to be inserted. */
+{
+ const ib_rbt_node_t* c_node;
+ const ib_rbt_node_t* p_node;
+ buf_page_t* prev = NULL;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+ /* Insert this buffer into the rbt. */
+ c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
+ ut_a(c_node != NULL);
+
+ /* Get the predecessor. */
+ p_node = rbt_prev(buf_pool->flush_rbt, c_node);
+
+ if (p_node != NULL) {
+ buf_page_t** value;
+ value = rbt_value(buf_page_t*, p_node);
+ prev = *value;
+ ut_a(prev != NULL);
+ }
+
+ return(prev);
+}
+
+/*********************************************************//**
+Delete a bpage from the flush_rbt. */
+static
+void
+buf_flush_delete_from_flush_rbt(
+/*============================*/
+ buf_page_t* bpage) /*!< in: bpage to be removed. */
+{
+#ifdef UNIV_DEBUG
+ ibool ret = FALSE;
+#endif /* UNIV_DEBUG */
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+#ifdef UNIV_DEBUG
+ ret =
+#endif /* UNIV_DEBUG */
+ rbt_delete(buf_pool->flush_rbt, &bpage);
+
+ ut_ad(ret);
+}
+
+/*****************************************************************//**
+Compare two modified blocks in the buffer pool. The key for comparison
+is:
+key = <oldest_modification, space, offset>
+This comparison is used to maintain ordering of blocks in the
+buf_pool->flush_rbt.
+Note that for the purpose of flush_rbt, we only need to order blocks
+on the oldest_modification. The other two fields are used to uniquely
+identify the blocks.
+@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
+static
+int
+buf_flush_block_cmp(
+/*================*/
+ const void* p1, /*!< in: block1 */
+ const void* p2) /*!< in: block2 */
+{
+ int ret;
+ const buf_page_t* b1 = *(const buf_page_t**) p1;
+ const buf_page_t* b2 = *(const buf_page_t**) p2;
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(b1);
+#endif /* UNIV_DEBUG */
+
+ ut_ad(b1 != NULL);
+ ut_ad(b2 != NULL);
+
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+ ut_ad(b1->in_flush_list);
+ ut_ad(b2->in_flush_list);
+
+ if (b2->oldest_modification > b1->oldest_modification) {
+ return(1);
+ } else if (b2->oldest_modification < b1->oldest_modification) {
+ return(-1);
+ }
+
+	/* If oldest_modification is the same then decide on the space. */
+ ret = (int)(b2->space - b1->space);
+
+ /* Or else decide ordering on the offset field. */
+ return(ret ? ret : (int)(b2->offset - b1->offset));
+}
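+
+/* A minimal sketch (disabled with #if 0, hypothetical values) of the
+tie-breaking in buf_flush_block_cmp(): when two dirty pages share the
+same oldest_modification, the space id decides the order, and only
+then the page offset. */
+#if 0
+static void
+buf_flush_block_cmp_order_sketch(void)
+{
+	/* Two pages with equal oldest_modification, spaces 5 and 7. */
+	ulint	b1_space = 5;
+	ulint	b2_space = 7;
+
+	/* The comparator returns (int) (b2->space - b1->space) here,
+	which is positive, i.e. "b2 > b1" in the flush_rbt order. */
+	ut_a((int) (b2_space - b1_space) > 0);
+}
+#endif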
+
+/********************************************************************//**
+Initialize the red-black tree to speed up insertions into the flush_list
+during recovery process. Should be called at the start of recovery
+process before any page has been read/written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void)
+/*==========================*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ /* Create red black tree for speedy insertions in flush list. */
+ buf_pool->flush_rbt = rbt_create(
+ sizeof(buf_page_t*), buf_flush_block_cmp);
+
+ buf_flush_list_mutex_exit(buf_pool);
+ }
+}
+
+/********************************************************************//**
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void)
+/*==========================*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ rbt_free(buf_pool->flush_rbt);
+ buf_pool->flush_rbt = NULL;
+
+ buf_flush_list_mutex_exit(buf_pool);
+ }
+}
+
+/********************************************************************//**
+Inserts a modified block into the flush list. */
+UNIV_INTERN
+void
+buf_flush_insert_into_flush_list(
+/*=============================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ buf_block_t* block, /*!< in/out: block which is modified */
+ lsn_t lsn) /*!< in: oldest modification */
+{
+ ut_ad(!buf_pool_mutex_own(buf_pool));
+ ut_ad(log_flush_order_mutex_own());
+ ut_ad(mutex_own(&block->mutex));
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
+ || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
+ <= lsn));
+
+ /* If we are in the recovery then we need to update the flush
+ red-black tree as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_list_mutex_exit(buf_pool);
+ buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
+ return;
+ }
+
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(!block->page.in_flush_list);
+
+ ut_d(block->page.in_flush_list = TRUE);
+ block->page.oldest_modification = lsn;
+ UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ {
+ ulint zip_size = buf_block_get_zip_size(block);
+
+ if (zip_size) {
+ UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+ } else {
+ UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
+ }
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_skip(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_list_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Inserts a modified block into the flush list in the right sorted position.
+This function is used by recovery, because there the modifications do not
+necessarily come in the order of lsn's. */
+UNIV_INTERN
+void
+buf_flush_insert_sorted_into_flush_list(
+/*====================================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_block_t* block, /*!< in/out: block which is modified */
+ lsn_t lsn) /*!< in: oldest modification */
+{
+ buf_page_t* prev_b;
+ buf_page_t* b;
+
+ ut_ad(!buf_pool_mutex_own(buf_pool));
+ ut_ad(log_flush_order_mutex_own());
+ ut_ad(mutex_own(&block->mutex));
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ /* The field in_LRU_list is protected by buf_pool->mutex, which
+ we are not holding. However, while a block is in the flush
+	list, it is dirty and cannot be discarded from either the
+	page_hash or the LRU list. At most, the uncompressed
+ page frame of a compressed block may be discarded or created
+ (copying the block->page to or from a buf_page_t that is
+ dynamically allocated from buf_buddy_alloc()). Because those
+ transitions hold block->mutex and the flush list mutex (via
+ buf_flush_relocate_on_flush_list()), there is no possibility
+ of a race condition in the assertions below. */
+ ut_ad(block->page.in_LRU_list);
+ ut_ad(block->page.in_page_hash);
+ /* buf_buddy_block_register() will take a block in the
+ BUF_BLOCK_MEMORY state, not a file page. */
+ ut_ad(!block->page.in_zip_hash);
+
+ ut_ad(!block->page.in_flush_list);
+ ut_d(block->page.in_flush_list = TRUE);
+ block->page.oldest_modification = lsn;
+
+#ifdef UNIV_DEBUG_VALGRIND
+ {
+ ulint zip_size = buf_block_get_zip_size(block);
+
+ if (zip_size) {
+ UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+ } else {
+ UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
+ }
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ prev_b = NULL;
+
+ /* For the most part when this function is called the flush_rbt
+ should not be NULL. In a very rare boundary case it is possible
+ that the flush_rbt has already been freed by the recovery thread
+ before the last page was hooked up in the flush_list by the
+ io-handler thread. In that case we'll just do a simple
+ linear search in the else block. */
+ if (buf_pool->flush_rbt) {
+
+ prev_b = buf_flush_insert_in_flush_rbt(&block->page);
+
+ } else {
+
+ b = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+ while (b && b->oldest_modification
+ > block->page.oldest_modification) {
+ ut_ad(b->in_flush_list);
+ prev_b = b;
+ b = UT_LIST_GET_NEXT(list, b);
+ }
+ }
+
+ if (prev_b == NULL) {
+ UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
+ } else {
+ UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
+ prev_b, &block->page);
+ }
+
+ MONITOR_INC(MONITOR_PAGE_INFLUSH);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_list_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Returns TRUE if the file page block is immediately suitable for replacement,
+i.e., the transition FILE_PAGE => NOT_USED is allowed.
+@return TRUE if can replace immediately */
+UNIV_INTERN
+ibool
+buf_flush_ready_for_replace(
+/*========================*/
+ buf_page_t* bpage) /*!< in: buffer control block, must be
+ buf_page_in_file(bpage) and in the LRU list */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(bpage->in_LRU_list);
+
+ if (UNIV_LIKELY(buf_page_in_file(bpage))) {
+
+ return(bpage->oldest_modification == 0
+ && buf_page_get_io_fix(bpage) == BUF_IO_NONE
+ && bpage->buf_fix_count == 0);
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: buffer block state %lu"
+ " in the LRU list!\n",
+ (ulong) buf_page_get_state(bpage));
+ ut_print_buf(stderr, bpage, sizeof(buf_page_t));
+ putc('\n', stderr);
+
+ return(FALSE);
+}
+
+/********************************************************************//**
+Returns TRUE if the block is modified and ready for flushing.
+@return TRUE if can flush immediately */
+UNIV_INLINE
+ibool
+buf_flush_ready_for_flush(
+/*======================*/
+ buf_page_t* bpage, /*!< in: buffer control block, must be
+ buf_page_in_file(bpage) */
+ enum buf_flush flush_type)/*!< in: type of flush */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ ut_a(buf_page_in_file(bpage));
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(flush_type < BUF_FLUSH_N_TYPES);
+
+ if (bpage->oldest_modification == 0
+ || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+ return(FALSE);
+ }
+
+ ut_ad(bpage->in_flush_list);
+
+ switch (flush_type) {
+ case BUF_FLUSH_LIST:
+ return(TRUE);
+
+ case BUF_FLUSH_LRU:
+ case BUF_FLUSH_SINGLE_PAGE:
+ /* Because any thread may call single page flush, even
+ when owning locks on pages, to avoid deadlocks, we must
+		make sure that it is not buffer fixed.
+ The same holds true for LRU flush because a user thread
+ may end up waiting for an LRU flush to end while
+ holding locks on other pages. */
+ return(bpage->buf_fix_count == 0);
+ case BUF_FLUSH_N_TYPES:
+ break;
+ }
+
+ ut_error;
+ return(FALSE);
+}
+
+/********************************************************************//**
+Remove a block from the flush list of modified blocks. */
+UNIV_INTERN
+void
+buf_flush_remove(
+/*=============*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(bpage->in_flush_list);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_PAGE:
+ /* Clean compressed pages should not be on the flush list */
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ return;
+ case BUF_BLOCK_ZIP_DIRTY:
+ buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
+ UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ buf_LRU_insert_zip_clean(bpage);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+ break;
+ }
+
+ /* If the flush_rbt is active then delete from there as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_delete_from_flush_rbt(bpage);
+ }
+
+ /* Must be done after we have removed it from the flush_rbt
+	because we assert on in_flush_list in the comparison function. */
+ ut_d(bpage->in_flush_list = FALSE);
+
+ bpage->oldest_modification = 0;
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_skip(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ MONITOR_DEC(MONITOR_PAGE_INFLUSH);
+
+ buf_flush_list_mutex_exit(buf_pool);
+}
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage.
+IMPORTANT: When this function is called bpage and dpage are not
+exact copies of each other. For example, they both will have different
+::state. Also the ::list pointers in dpage may be stale. We need to
+use the current list node (bpage) to do the list manipulation because
+the list pointers could have changed between the time that we copied
+the contents of bpage to the dpage and the flush list manipulation
+below. */
+UNIV_INTERN
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage) /*!< in/out: destination block */
+{
+ buf_page_t* prev;
+ buf_page_t* prev_b = NULL;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ /* Must reside in the same buffer pool. */
+ ut_ad(buf_pool == buf_pool_from_bpage(dpage));
+
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ /* FIXME: At this point we have both buf_pool and flush_list
+ mutexes. Theoretically removal of a block from flush list is
+ only covered by flush_list mutex but currently we do
+ have buf_pool mutex in buf_flush_remove() therefore this block
+ is guaranteed to be in the flush list. We need to check if
+ this will work without the assumption of block removing code
+ having the buf_pool mutex. */
+ ut_ad(bpage->in_flush_list);
+ ut_ad(dpage->in_flush_list);
+
+ /* If recovery is active we must swap the control blocks in
+ the flush_rbt as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_delete_from_flush_rbt(bpage);
+ prev_b = buf_flush_insert_in_flush_rbt(dpage);
+ }
+
+ /* Must be done after we have removed it from the flush_rbt
+	because we assert on in_flush_list in the comparison function. */
+ ut_d(bpage->in_flush_list = FALSE);
+
+ prev = UT_LIST_GET_PREV(list, bpage);
+ UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+
+ if (prev) {
+ ut_ad(prev->in_flush_list);
+ UT_LIST_INSERT_AFTER(
+ list,
+ buf_pool->flush_list,
+ prev, dpage);
+ } else {
+ UT_LIST_ADD_FIRST(
+ list,
+ buf_pool->flush_list,
+ dpage);
+ }
+
+ /* Just an extra check. Previous in flush_list
+ should be the same control block as in flush_rbt. */
+ ut_a(!buf_pool->flush_rbt || prev_b == prev);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_list_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Updates the flush system data structures when a write is completed. */
+UNIV_INTERN
+void
+buf_flush_write_complete(
+/*=====================*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ enum buf_flush flush_type;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(bpage);
+
+ buf_flush_remove(bpage);
+
+ flush_type = buf_page_get_flush_type(bpage);
+ buf_pool->n_flush[flush_type]--;
+
+ /* fprintf(stderr, "n pending flush %lu\n",
+ buf_pool->n_flush[flush_type]); */
+
+ if (buf_pool->n_flush[flush_type] == 0
+ && buf_pool->init_flush[flush_type] == FALSE) {
+
+ /* The running flush batch has ended */
+
+ os_event_set(buf_pool->no_flush[flush_type]);
+ }
+
+ switch (flush_type) {
+ case BUF_FLUSH_LIST:
+ case BUF_FLUSH_LRU:
+ buf_dblwr_update();
+ break;
+ case BUF_FLUSH_SINGLE_PAGE:
+ /* Single page flushes are synchronous. No need
+ to update doublewrite */
+ break;
+ case BUF_FLUSH_N_TYPES:
+ ut_error;
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Initializes a page for writing to the tablespace. */
+UNIV_INTERN
+void
+buf_flush_init_for_writing(
+/*=======================*/
+ byte* page, /*!< in/out: page */
+ void* page_zip_, /*!< in/out: compressed page, or NULL */
+ lsn_t newest_lsn) /*!< in: newest modification lsn
+ to the page */
+{
+ ib_uint32_t checksum = 0 /* silence bogus gcc warning */;
+
+ ut_ad(page);
+
+ if (page_zip_) {
+ page_zip_des_t* page_zip;
+ ulint zip_size;
+
+ page_zip = static_cast<page_zip_des_t*>(page_zip_);
+ zip_size = page_zip_get_size(page_zip);
+
+ ut_ad(zip_size);
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+
+ switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ /* These are essentially uncompressed pages. */
+ memcpy(page_zip->data, page, zip_size);
+ /* fall through */
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ case FIL_PAGE_INDEX:
+ checksum = page_zip_calc_checksum(
+ page_zip->data, zip_size,
+ static_cast<srv_checksum_algorithm_t>(
+ srv_checksum_algorithm));
+
+ mach_write_to_8(page_zip->data
+ + FIL_PAGE_LSN, newest_lsn);
+ memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+ mach_write_to_4(page_zip->data
+ + FIL_PAGE_SPACE_OR_CHKSUM,
+ checksum);
+ return;
+ }
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: ERROR: The compressed page to be written"
+ " seems corrupt:", stderr);
+ ut_print_buf(stderr, page, zip_size);
+ fputs("\nInnoDB: Possibly older version of the page:", stderr);
+ ut_print_buf(stderr, page_zip->data, zip_size);
+ putc('\n', stderr);
+ ut_error;
+ }
+
+ /* Write the newest modification lsn to the page header and trailer */
+ mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
+
+ mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ newest_lsn);
+
+ /* Store the new formula checksum */
+
+ switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ checksum = buf_calc_page_crc32(page);
+ break;
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ checksum = (ib_uint32_t) buf_calc_page_new_checksum(page);
+ break;
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ checksum = BUF_NO_CHECKSUM_MAGIC;
+ break;
+	/* no default so the compiler will emit a warning if a new
+	enum value is added and not handled here */
+ }
+
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
+
+ /* We overwrite the first 4 bytes of the end lsn field to store
+ the old formula checksum. Since it depends also on the field
+ FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
+ new formula checksum. */
+
+ if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB
+ || srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB) {
+
+ checksum = (ib_uint32_t) buf_calc_page_old_checksum(page);
+
+ /* In other cases we use the value assigned from above.
+ If CRC32 is used then it is faster to use that checksum
+ (calculated above) instead of calculating another one.
+ We can afford to store something other than
+ buf_calc_page_old_checksum() or BUF_NO_CHECKSUM_MAGIC in
+ this field because the file will not be readable by old
+ versions of MySQL/InnoDB anyway (older than MySQL 5.6.3) */
+ }
+
+ mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ checksum);
+}
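+
+/* A minimal sketch (disabled with #if 0, illustrative only) of where
+the two checksums written above end up on an uncompressed page: the
+new-formula (or CRC32/magic) value in the page header at
+FIL_PAGE_SPACE_OR_CHKSUM, and the old-formula value in the first 4
+bytes of the 8-byte end-LSN field at the page trailer. The helper name
+is hypothetical and nothing in this file calls it. */
+#if 0
+static void
+buf_flush_checksum_fields_sketch(const byte* page)
+{
+	ulint	header_checksum;
+	ulint	trailer_checksum;
+
+	header_checksum = mach_read_from_4(
+		page + FIL_PAGE_SPACE_OR_CHKSUM);
+
+	trailer_checksum = mach_read_from_4(
+		page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+	/* Nothing is verified here; the reads only show the field
+	locations used by buf_flush_init_for_writing(). */
+	(void) header_checksum;
+	(void) trailer_checksum;
+}
+#endif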
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Flush a batch of writes to the datafiles that have already been
+written by the OS. */
+UNIV_INTERN
+void
+buf_flush_sync_datafiles(void)
+/*==========================*/
+{
+ /* Wake possible simulated aio thread to actually post the
+ writes to the operating system */
+ os_aio_simulated_wake_handler_threads();
+
+	/* Wait until all async writes to tablespaces have been posted to
+ the OS */
+ os_aio_wait_until_no_pending_writes();
+
+ /* Now we flush the data to disk (for example, with fsync) */
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+ return;
+}
+
+/********************************************************************//**
+Does an asynchronous write of a buffer page. NOTE: in simulated aio and
+also when the doublewrite buffer is used, we must call
+buf_dblwr_flush_buffered_writes after we have posted a batch of
+writes! */
+static
+void
+buf_flush_write_block_low(
+/*======================*/
+ buf_page_t* bpage, /*!< in: buffer block to write */
+ enum buf_flush flush_type) /*!< in: type of flush */
+{
+ ulint zip_size = buf_page_get_zip_size(bpage);
+ page_t* frame = NULL;
+
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(!buf_pool_mutex_own(buf_pool));
+#endif
+
+#ifdef UNIV_LOG_DEBUG
+ static ibool univ_log_debug_warned;
+#endif /* UNIV_LOG_DEBUG */
+
+ ut_ad(buf_page_in_file(bpage));
+
+ /* We are not holding buf_pool->mutex or block_mutex here.
+ Nevertheless, it is safe to access bpage, because it is
+ io_fixed and oldest_modification != 0. Thus, it cannot be
+ relocated in the buffer pool or removed from flush_list or
+ LRU_list. */
+ ut_ad(!buf_pool_mutex_own(buf_pool));
+ ut_ad(!buf_flush_list_mutex_own(buf_pool));
+ ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
+ ut_ad(bpage->oldest_modification != 0);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
+#endif
+ ut_ad(bpage->newest_modification != 0);
+
+#ifdef UNIV_LOG_DEBUG
+ if (!univ_log_debug_warned) {
+ univ_log_debug_warned = TRUE;
+ fputs("Warning: cannot force log to disk if"
+ " UNIV_LOG_DEBUG is defined!\n"
+ "Crash recovery will not work!\n",
+ stderr);
+ }
+#else
+ /* Force the log to the disk before writing the modified block */
+ log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
+#endif
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ case BUF_BLOCK_ZIP_DIRTY:
+ frame = bpage->zip.data;
+
+ ut_a(page_zip_verify_checksum(frame, zip_size));
+
+ mach_write_to_8(frame + FIL_PAGE_LSN,
+ bpage->newest_modification);
+ memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ frame = bpage->zip.data;
+ if (!frame) {
+ frame = ((buf_block_t*) bpage)->frame;
+ }
+
+ buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
+ bpage->zip.data
+ ? &bpage->zip : NULL,
+ bpage->newest_modification);
+ break;
+ }
+
+ if (!srv_use_doublewrite_buf || !buf_dblwr) {
+ fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+ FALSE, buf_page_get_space(bpage), zip_size,
+ buf_page_get_page_no(bpage), 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ frame, bpage);
+ } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
+ buf_dblwr_write_single_page(bpage);
+ } else {
+ buf_dblwr_add_to_batch(bpage);
+ }
+}
+
+/********************************************************************//**
+Writes a flushable page asynchronously from the buffer pool to a file.
+NOTE: in simulated aio we must call
+os_aio_simulated_wake_handler_threads after we have posted a batch of
+writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
+held upon entering this function, and they will be released by this
+function. */
+static
+void
+buf_flush_page(
+/*===========*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_page_t* bpage, /*!< in: buffer control block */
+ enum buf_flush flush_type) /*!< in: type of flush */
+{
+ mutex_t* block_mutex;
+ ibool is_uncompressed;
+
+ ut_ad(flush_type < BUF_FLUSH_N_TYPES);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(buf_page_in_file(bpage));
+
+ block_mutex = buf_page_get_mutex(bpage);
+ ut_ad(mutex_own(block_mutex));
+
+ ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
+
+ buf_page_set_io_fix(bpage, BUF_IO_WRITE);
+
+ buf_page_set_flush_type(bpage, flush_type);
+
+ if (buf_pool->n_flush[flush_type] == 0) {
+
+ os_event_reset(buf_pool->no_flush[flush_type]);
+ }
+
+ buf_pool->n_flush[flush_type]++;
+
+ is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
+
+ switch (flush_type) {
+ ibool is_s_latched;
+ case BUF_FLUSH_LIST:
+ /* If the simulated aio thread is not running, we must
+ not wait for any latch, as we may end up in a deadlock:
+ if buf_fix_count == 0, then we know we need not wait */
+
+ is_s_latched = (bpage->buf_fix_count == 0);
+ if (is_s_latched && is_uncompressed) {
+ rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
+ BUF_IO_WRITE);
+ }
+
+ mutex_exit(block_mutex);
+ buf_pool_mutex_exit(buf_pool);
+
+ /* Even though bpage is not protected by any mutex at
+ this point, it is safe to access bpage, because it is
+ io_fixed and oldest_modification != 0. Thus, it
+ cannot be relocated in the buffer pool or removed from
+ flush_list or LRU_list. */
+
+ if (!is_s_latched) {
+ buf_dblwr_flush_buffered_writes();
+
+ if (is_uncompressed) {
+ rw_lock_s_lock_gen(&((buf_block_t*) bpage)
+ ->lock, BUF_IO_WRITE);
+ }
+ }
+
+ break;
+
+ case BUF_FLUSH_LRU:
+ case BUF_FLUSH_SINGLE_PAGE:
+ /* VERY IMPORTANT:
+ Because any thread may call single page flush, even when
+ owning locks on pages, to avoid deadlocks, we must make
+ sure that the s-lock is acquired on the page without
+ waiting: this is accomplished because
+ buf_flush_ready_for_flush() must hold, and that requires
+ the page not to be bufferfixed.
+ The same holds true for LRU flush because a user thread
+ may end up waiting for an LRU flush to end while
+ holding locks on other pages. */
+
+ if (is_uncompressed) {
+ rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
+ BUF_IO_WRITE);
+ }
+
+ /* Note that the s-latch is acquired before releasing the
+ buf_pool mutex: this ensures that the latch is acquired
+ immediately. */
+
+ mutex_exit(block_mutex);
+ buf_pool_mutex_exit(buf_pool);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ /* Even though bpage is not protected by any mutex at this
+ point, it is safe to access bpage, because it is io_fixed and
+ oldest_modification != 0. Thus, it cannot be relocated in the
+ buffer pool or removed from flush_list or LRU_list. */
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr,
+ "Flushing %u space %u page %u\n",
+ flush_type, bpage->space, bpage->offset);
+ }
+#endif /* UNIV_DEBUG */
+ buf_flush_write_block_low(bpage, flush_type);
+}
+
+# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/********************************************************************//**
+Writes a flushable page asynchronously from the buffer pool to a file.
+NOTE: buf_pool->mutex and block->mutex must be held upon entering this
+function, and they will be released by this function after flushing.
+This is loosely based on buf_flush_batch() and buf_flush_page().
+@return TRUE if the page was flushed and the mutexes released */
+UNIV_INTERN
+ibool
+buf_flush_page_try(
+/*===============*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
+ buf_block_t* block) /*!< in/out: buffer control block */
+{
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(mutex_own(&block->mutex));
+
+ if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
+ return(FALSE);
+ }
+
+ /* The following call will release the buffer pool and
+ block mutex. */
+ buf_flush_page(buf_pool, &block->page, BUF_FLUSH_SINGLE_PAGE);
+ buf_flush_sync_datafiles();
+ return(TRUE);
+}
+# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+/***********************************************************//**
+Flushes to disk all flushable pages within the flush area.
+@return number of pages flushed */
+static
+ulint
+buf_flush_try_neighbors(
+/*====================*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page offset */
+ enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU or
+ BUF_FLUSH_LIST */
+ ulint n_flushed, /*!< in: number of pages
+ flushed so far in this batch */
+ ulint n_to_flush) /*!< in: maximum number of pages
+ we are allowed to flush */
+{
+ ulint i;
+ ulint low;
+ ulint high;
+ ulint count = 0;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+ ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+
+ if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
+ || !srv_flush_neighbors) {
+ /* If there is little space or neighbor flushing is
+ not enabled then just flush the victim. */
+ low = offset;
+ high = offset + 1;
+ } else {
+ /* When flushed, dirty blocks are searched in
+ neighborhoods of this size, and flushed along with the
+ original page. */
+
+ ulint buf_flush_area;
+
+ buf_flush_area = ut_min(
+ BUF_READ_AHEAD_AREA(buf_pool),
+ buf_pool->curr_size / 16);
+
+ low = (offset / buf_flush_area) * buf_flush_area;
+ high = (offset / buf_flush_area + 1) * buf_flush_area;
+ }
+
+ /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
+
+ if (high > fil_space_get_size(space)) {
+ high = fil_space_get_size(space);
+ }
+
+ for (i = low; i < high; i++) {
+
+ buf_page_t* bpage;
+
+ if ((count + n_flushed) >= n_to_flush) {
+
+ /* We have already flushed enough pages and
+ should call it a day. There is, however, one
+ exception. If the page whose neighbors we
+ are flushing has not been flushed yet then
+ we'll try to flush the victim that we
+ selected originally. */
+ if (i <= offset) {
+ i = offset;
+ } else {
+ break;
+ }
+ }
+
+ buf_pool = buf_pool_get(space, i);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ /* We only want to flush pages from this buffer pool. */
+ bpage = buf_page_hash_get(buf_pool, space, i);
+
+ if (!bpage) {
+
+ buf_pool_mutex_exit(buf_pool);
+ continue;
+ }
+
+ ut_a(buf_page_in_file(bpage));
+
+ /* We avoid flushing 'non-old' blocks in an LRU flush,
+ because the flushed blocks are soon freed */
+
+ if (flush_type != BUF_FLUSH_LRU
+ || i == offset
+ || buf_page_is_old(bpage)) {
+ mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(block_mutex);
+
+ if (buf_flush_ready_for_flush(bpage, flush_type)
+ && (i == offset || !bpage->buf_fix_count)) {
+ /* We only try to flush those
+ neighbors != offset where the buf fix
+ count is zero, as we then know that we
+ probably can latch the page without a
+ semaphore wait. Semaphore waits are
+ expensive because we must flush the
+ doublewrite buffer before we start
+ waiting. */
+
+ buf_flush_page(buf_pool, bpage, flush_type);
+ ut_ad(!mutex_own(block_mutex));
+ ut_ad(!buf_pool_mutex_own(buf_pool));
+ count++;
+ continue;
+ } else {
+ mutex_exit(block_mutex);
+ }
+ }
+ buf_pool_mutex_exit(buf_pool);
+ }
+
+ if (count > 0) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+ MONITOR_FLUSH_NEIGHBOR_COUNT,
+ MONITOR_FLUSH_NEIGHBOR_PAGES,
+ (count - 1));
+ }
+
+ return(count);
+}
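+
+/* A minimal sketch (disabled with #if 0, hypothetical numbers) of the
+alignment arithmetic used in buf_flush_try_neighbors(): the victim
+offset is rounded down to the start of its flush area, and the area
+extends one full buf_flush_area beyond that. */
+#if 0
+static void
+buf_flush_area_bounds_sketch(void)
+{
+	ulint	buf_flush_area	= 64;
+	ulint	offset		= 200;
+	ulint	low;
+	ulint	high;
+
+	low = (offset / buf_flush_area) * buf_flush_area;
+	high = (offset / buf_flush_area + 1) * buf_flush_area;
+
+	ut_a(low == 192);
+	ut_a(high == 256);
+}
+#endif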
+
+/********************************************************************//**
+Check if the block is modified and ready for flushing. If the block
+is ready to flush then flush the page and try to flush its neighbors.
+
+@return TRUE if the buf_pool mutex was released during this function.
+This does not guarantee that any pages were actually written.
+The number of pages written is added to the count. */
+static
+ibool
+buf_flush_page_and_try_neighbors(
+/*=============================*/
+ buf_page_t* bpage, /*!< in: buffer control block,
+ must be
+ buf_page_in_file(bpage) */
+ enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+ ulint n_to_flush, /*!< in: number of pages to
+ flush */
+ ulint* count) /*!< in/out: number of pages
+ flushed */
+{
+ mutex_t* block_mutex;
+ ibool flushed = FALSE;
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+#endif /* UNIV_DEBUG */
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+
+ ut_a(buf_page_in_file(bpage));
+
+ if (buf_flush_ready_for_flush(bpage, flush_type)) {
+ ulint space;
+ ulint offset;
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_bpage(bpage);
+
+ buf_pool_mutex_exit(buf_pool);
+
+ /* These fields are protected by both the
+ buffer pool mutex and block mutex. */
+ space = buf_page_get_space(bpage);
+ offset = buf_page_get_page_no(bpage);
+
+ mutex_exit(block_mutex);
+
+ /* Try to flush also all the neighbors */
+ *count += buf_flush_try_neighbors(space,
+ offset,
+ flush_type,
+ *count,
+ n_to_flush);
+
+ buf_pool_mutex_enter(buf_pool);
+ flushed = TRUE;
+ } else {
+ mutex_exit(block_mutex);
+ }
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ return(flushed);
+}
+
+/*******************************************************************//**
+This utility moves the uncompressed frames of pages to the free list.
+Note that this function does not actually flush any data to disk. It
+just detaches the uncompressed frames from the compressed pages at the
+tail of the unzip_LRU and puts those freed frames in the free list.
+Note that it is a best effort attempt and it is not guaranteed that
+after a call to this function there will be 'max' blocks in the free
+list.
+@return number of blocks moved to the free list. */
+static
+ulint
+buf_free_from_unzip_LRU_list_batch(
+/*===============================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint max) /*!< in: desired number of
+ blocks in the free_list */
+{
+ buf_block_t* block;
+ ulint scanned = 0;
+ ulint count = 0;
+ ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
+ ulint lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
+ while (block != NULL && count < max
+ && free_len < srv_LRU_scan_depth
+ && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
+
+ ++scanned;
+ if (buf_LRU_free_block(&block->page, FALSE)) {
+ /* Block was freed. buf_pool->mutex potentially
+ released and reacquired */
+ ++count;
+ block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
+
+ } else {
+
+ block = UT_LIST_GET_PREV(unzip_LRU, block);
+ }
+
+ free_len = UT_LIST_GET_LEN(buf_pool->free);
+ lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+ }
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+ scanned);
+ }
+
+ return(count);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list.
+The calling thread is not allowed to own any latches on pages!
+It attempts to make 'max' blocks available in the free list. Note that
+it is a best effort attempt and it is not guaranteed that after a call
+to this function there will be 'max' blocks in the free list.
+@return number of blocks for which the write request was queued. */
+static
+ulint
+buf_flush_LRU_list_batch(
+/*=====================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint max) /*!< in: desired number of
+ blocks in the free_list */
+{
+ buf_page_t* bpage;
+ ulint scanned = 0;
+ ulint count = 0;
+ ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
+ ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ while (bpage != NULL && count < max
+ && free_len < srv_LRU_scan_depth
+ && lru_len > BUF_LRU_MIN_LEN) {
+
+ mutex_t* block_mutex = buf_page_get_mutex(bpage);
+ ibool evict;
+
+ mutex_enter(block_mutex);
+ evict = buf_flush_ready_for_replace(bpage);
+ mutex_exit(block_mutex);
+
+ ++scanned;
+
+ /* If the block is ready to be replaced we try to
+ free it i.e.: put it on the free list.
+ Otherwise we try to flush the block and its
+ neighbors. In this case we'll put it on the
+ free list in the next pass. We do this extra work
+ of putting blocks to the free list instead of
+ just flushing them because after every flush
+ we have to restart the scan from the tail of
+ the LRU list and if we don't clear the tail
+ of the flushed pages then the scan becomes
+ O(n*n). */
+ if (evict) {
+ if (buf_LRU_free_block(bpage, TRUE)) {
+ /* buf_pool->mutex was potentially
+ released and reacquired. */
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ } else {
+ bpage = UT_LIST_GET_PREV(LRU, bpage);
+ }
+ } else if (buf_flush_page_and_try_neighbors(
+ bpage,
+ BUF_FLUSH_LRU, max, &count)) {
+
+ /* buf_pool->mutex was released.
+ Restart the scan. */
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ } else {
+ bpage = UT_LIST_GET_PREV(LRU, bpage);
+ }
+
+ free_len = UT_LIST_GET_LEN(buf_pool->free);
+ lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+ }
+
+ /* We keep track of all flushes happening as part of LRU
+ flush. When estimating the desired rate at which flush_list
+ should be flushed, we factor in this value. */
+ buf_lru_flush_page_count += count;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+ scanned);
+ }
+
+ return(count);
+}
+
+/*******************************************************************//**
+Flush and move pages from LRU or unzip_LRU list to the free list.
+Whether LRU or unzip_LRU is used depends on the state of the system.
+@return number of blocks for which either the write request was queued
+or in case of unzip_LRU the number of blocks actually moved to the
+free list */
+static
+ulint
+buf_do_LRU_batch(
+/*=============*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint max) /*!< in: desired number of
+ blocks in the free_list */
+{
+ ulint count = 0;
+
+ if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
+ count += buf_free_from_unzip_LRU_list_batch(buf_pool, max);
+ }
+
+ if (max > count) {
+ count += buf_flush_LRU_list_batch(buf_pool, max - count);
+ }
+
+ return(count);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the flush_list.
+The calling thread is not allowed to own any latches on pages!
+@return number of blocks for which the write request was queued;
+ULINT_UNDEFINED if there was a flush of the same type already
+running */
+static
+ulint
+buf_do_flush_list_batch(
+/*====================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+	ulint		min_n,		/*!< in: wished minimum number
+ of blocks flushed (it is not
+ guaranteed that the actual
+ number is that big, though) */
+ lsn_t lsn_limit) /*!< all blocks whose
+ oldest_modification is smaller
+ than this should be flushed (if
+ their number does not exceed
+ min_n) */
+{
+ ulint len;
+ buf_page_t* bpage;
+ ulint count = 0;
+ ulint scanned = 0;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ /* If we have flushed enough, leave the loop */
+ do {
+ /* Start from the end of the list looking for a suitable
+ block to be flushed. */
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ /* We use len here because theoretically insertions can
+ happen in the flush_list below while we are traversing
+ it for a suitable candidate for flushing. We'd like to
+		set a limit on how far we are willing to traverse
+ the list. */
+ len = UT_LIST_GET_LEN(buf_pool->flush_list);
+ bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+
+ if (bpage) {
+ ut_a(bpage->oldest_modification > 0);
+ }
+
+ if (!bpage || bpage->oldest_modification >= lsn_limit) {
+
+ /* We have flushed enough */
+ buf_flush_list_mutex_exit(buf_pool);
+ break;
+ }
+
+ ut_a(bpage->oldest_modification > 0);
+
+ ut_ad(bpage->in_flush_list);
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ /* The list may change during the flushing and we cannot
+ safely preserve within this function a pointer to a
+ block in the list! */
+ while (bpage != NULL
+ && len > 0
+ && !buf_flush_page_and_try_neighbors(
+ bpage, BUF_FLUSH_LIST, min_n, &count)) {
+
+ ++scanned;
+ buf_flush_list_mutex_enter(buf_pool);
+
+ /* If we are here that means that buf_pool->mutex
+ was not released in buf_flush_page_and_try_neighbors()
+ above and this guarantees that bpage didn't get
+ relocated since we released the flush_list
+ mutex above. There is a chance, however, that
+ the bpage got removed from flush_list (not
+ currently possible because flush_list_remove()
+ also obtains buf_pool mutex but that may change
+ in future). To avoid this scenario we check
+ the oldest_modification and if it is zero
+ we start all over again. */
+ if (bpage->oldest_modification == 0) {
+ buf_flush_list_mutex_exit(buf_pool);
+ break;
+ }
+
+ bpage = UT_LIST_GET_PREV(list, bpage);
+
+ ut_ad(!bpage || bpage->in_flush_list);
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ --len;
+ }
+
+ } while (count < min_n && bpage != NULL && len > 0);
+
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
+ MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+ MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+ scanned);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ return(count);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list or flush_list.
+NOTE 1: in the case of an LRU flush the calling thread may own latches to
+pages: to avoid deadlocks, this function must be written so that it cannot
+end up waiting for these latches! NOTE 2: in the case of a flush list flush,
+the calling thread is not allowed to own any latches on pages!
+@return number of blocks for which the write request was queued;
+ULINT_UNDEFINED if there was a flush of the same type already running */
+static
+ulint
+buf_flush_batch(
+/*============*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU or
+ BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
+ then the caller must not own any
+ latches on pages */
+	ulint		min_n,		/*!< in: wished minimum number of blocks
+ flushed (it is not guaranteed that the
+ actual number is that big, though) */
+ lsn_t lsn_limit) /*!< in: in the case of BUF_FLUSH_LIST
+ all blocks whose oldest_modification is
+ smaller than this should be flushed
+ (if their number does not exceed
+ min_n), otherwise ignored */
+{
+ ulint count = 0;
+
+ ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad((flush_type != BUF_FLUSH_LIST)
+ || sync_thread_levels_empty_except_dict());
+#endif /* UNIV_SYNC_DEBUG */
+
+ buf_pool_mutex_enter(buf_pool);
+
+ /* Note: The buffer pool mutex is released and reacquired within
+ the flush functions. */
+ switch (flush_type) {
+ case BUF_FLUSH_LRU:
+ count = buf_do_LRU_batch(buf_pool, min_n);
+ break;
+ case BUF_FLUSH_LIST:
+ count = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit);
+ break;
+ default:
+ ut_error;
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ buf_dblwr_flush_buffered_writes();
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints && count > 0) {
+ fprintf(stderr, flush_type == BUF_FLUSH_LRU
+ ? "Flushed %lu pages in LRU flush\n"
+ : "Flushed %lu pages in flush list flush\n",
+ (ulong) count);
+ }
+#endif /* UNIV_DEBUG */
+
+ srv_buf_pool_flushed += count;
+
+ return(count);
+}
+
+/******************************************************************//**
+Gather the aggregated stats for both flush list and LRU list flushing */
+static
+void
+buf_flush_common(
+/*=============*/
+ enum buf_flush flush_type, /*!< in: type of flush */
+ ulint page_count) /*!< in: number of pages flushed */
+{
+ buf_dblwr_flush_buffered_writes();
+
+ ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints && page_count > 0) {
+ fprintf(stderr, flush_type == BUF_FLUSH_LRU
+ ? "Flushed %lu pages in LRU flush\n"
+ : "Flushed %lu pages in flush list flush\n",
+ (ulong) page_count);
+ }
+#endif /* UNIV_DEBUG */
+
+ srv_buf_pool_flushed += page_count;
+
+ if (flush_type == BUF_FLUSH_LRU) {
+ /* We keep track of all flushes happening as part of LRU
+ flush. When estimating the desired rate at which flush_list
+ should be flushed we factor in this value. */
+ buf_lru_flush_page_count += page_count;
+ }
+}
+
+/******************************************************************//**
+Start a buffer flush batch for LRU or flush list */
+static
+ibool
+buf_flush_start(
+/*============*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ enum buf_flush flush_type) /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+{
+ buf_pool_mutex_enter(buf_pool);
+
+ if (buf_pool->n_flush[flush_type] > 0
+ || buf_pool->init_flush[flush_type] == TRUE) {
+
+ /* There is already a flush batch of the same type running */
+
+ buf_pool_mutex_exit(buf_pool);
+
+ return(FALSE);
+ }
+
+ buf_pool->init_flush[flush_type] = TRUE;
+
+ buf_pool_mutex_exit(buf_pool);
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+End a buffer flush batch for LRU or flush list */
+static
+void
+buf_flush_end(
+/*==========*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ enum buf_flush flush_type) /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+{
+ buf_pool_mutex_enter(buf_pool);
+
+ buf_pool->init_flush[flush_type] = FALSE;
+
+ buf_pool->try_LRU_scan = TRUE;
+
+ if (buf_pool->n_flush[flush_type] == 0) {
+
+ /* The running flush batch has ended */
+
+ os_event_set(buf_pool->no_flush[flush_type]);
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+}
+
+/******************************************************************//**
+Waits until a flush batch of the given type ends */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end(
+/*=====================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ enum buf_flush type) /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+{
+ ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
+
+ if (buf_pool == NULL) {
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; ++i) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ thd_wait_begin(NULL, THD_WAIT_DISKIO);
+ os_event_wait(buf_pool->no_flush[type]);
+ thd_wait_end(NULL);
+ }
+ } else {
+ thd_wait_begin(NULL, THD_WAIT_DISKIO);
+ os_event_wait(buf_pool->no_flush[type]);
+ thd_wait_end(NULL);
+ }
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list and also
+puts replaceable clean pages from the end of the LRU list to the free
+list.
+NOTE: The calling thread is not allowed to own any latches on pages!
+@return number of blocks for which the write request was queued;
+ULINT_UNDEFINED if there was a flush of the same type already running */
+static
+ulint
+buf_flush_LRU(
+/*==========*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
+	ulint		min_n)		/*!< in: wished minimum number of blocks
+ flushed (it is not guaranteed that the
+ actual number is that big, though) */
+{
+ ulint page_count;
+
+ if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
+ return(ULINT_UNDEFINED);
+ }
+
+ page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0);
+
+ buf_flush_end(buf_pool, BUF_FLUSH_LRU);
+
+ buf_flush_common(BUF_FLUSH_LRU, page_count);
+
+ return(page_count);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the flush list of
+all buffer pool instances.
+NOTE: The calling thread is not allowed to own any latches on pages!
+@return number of blocks for which the write request was queued;
+ULINT_UNDEFINED if there was a flush of the same type already running */
+UNIV_INTERN
+ulint
+buf_flush_list(
+/*===========*/
+	ulint		min_n,		/*!< in: wished minimum number of blocks
+ flushed (it is not guaranteed that the
+ actual number is that big, though) */
+	lsn_t		lsn_limit)	/*!< in: in the case of BUF_FLUSH_LIST all
+ blocks whose oldest_modification is
+ smaller than this should be flushed
+ (if their number does not exceed
+ min_n), otherwise ignored */
+{
+ ulint i;
+ ulint total_page_count = 0;
+ ibool skipped = FALSE;
+
+ if (min_n != ULINT_MAX) {
+ /* Ensure that flushing is spread evenly amongst the
+ buffer pool instances. When min_n is ULINT_MAX
+ we need to flush everything up to the lsn limit
+ so no limit here. */
+ min_n = (min_n + srv_buf_pool_instances - 1)
+ / srv_buf_pool_instances;
+ }
+
+ /* Flush to lsn_limit in all buffer pool instances */
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+ ulint page_count = 0;
+
+ buf_pool = buf_pool_from_array(i);
+
+ if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) {
+ /* We have two choices here. If lsn_limit was
+ specified then skipping an instance of buffer
+ pool means we cannot guarantee that all pages
+			up to lsn_limit have been flushed. We can
+ return right now with failure or we can try
+ to flush remaining buffer pools up to the
+ lsn_limit. We attempt to flush other buffer
+ pools based on the assumption that it will
+ help in the retry which will follow the
+ failure. */
+ skipped = TRUE;
+
+ continue;
+ }
+
+ page_count = buf_flush_batch(
+ buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit);
+
+ buf_flush_end(buf_pool, BUF_FLUSH_LIST);
+
+ buf_flush_common(BUF_FLUSH_LIST, page_count);
+
+ total_page_count += page_count;
+
+ if (page_count) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_COUNT,
+ MONITOR_FLUSH_BATCH_PAGES,
+ page_count);
+ }
+ }
+
+ return(lsn_limit != LSN_MAX && skipped
+ ? ULINT_UNDEFINED : total_page_count);
+}
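+
+/* A minimal sketch (disabled with #if 0, hypothetical numbers) of the
+rounding-up division used in buf_flush_list() to spread min_n evenly
+over the buffer pool instances. */
+#if 0
+static void
+buf_flush_list_quota_sketch(void)
+{
+	ulint	min_n		= 1000;
+	ulint	n_instances	= 4;
+	ulint	per_instance;
+
+	per_instance = (min_n + n_instances - 1) / n_instances;
+
+	/* 1000 requested pages over 4 instances: 250 each; a request
+	of 1001 would round up to 251 per instance. */
+	ut_a(per_instance == 250);
+}
+#endif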
+
+/******************************************************************//**
+This function picks up a single dirty page from the tail of the LRU
+list, flushes it, removes it from page_hash and LRU list and puts
+it on the free list. It is called from user threads when they are
+unable to find a replaceable page at the tail of the LRU list i.e.:
+when the background LRU flushing in the page_cleaner thread is not
+fast enough to keep pace with the workload.
+@return TRUE if success. */
+UNIV_INTERN
+ibool
+buf_flush_single_page_from_LRU(
+/*===========================*/
+ buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */
+{
+ ulint scanned;
+ buf_page_t* bpage;
+ mutex_t* block_mutex;
+ ibool freed;
+ ibool evict_zip;
+
+ buf_pool_mutex_enter(buf_pool);
+
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), scanned = 1;
+ bpage != NULL;
+ bpage = UT_LIST_GET_PREV(LRU, bpage), ++scanned) {
+
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+ if (buf_flush_ready_for_flush(bpage,
+ BUF_FLUSH_SINGLE_PAGE)) {
+ /* buf_flush_page() will release the block
+ mutex */
+ break;
+ }
+ mutex_exit(block_mutex);
+ }
+
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
+ scanned);
+
+ if (!bpage) {
+ /* Can't find a single flushable page. */
+ buf_pool_mutex_exit(buf_pool);
+ return(FALSE);
+ }
+
+ /* The following call will release the buffer pool and
+ block mutex. */
+ buf_flush_page(buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE);
+
+ buf_flush_sync_datafiles();
+
+ /* At this point the page has been written to the disk.
+ As we are not holding the buffer pool or block mutex, we
+ cannot use the bpage safely. It may have been plucked out
+ of the LRU list by some other thread or it may even have
+ been relocated in the case of a compressed page. We need to
+ start the scan of the LRU list again to remove the block from
+ the LRU
+ list and put it on the free list. */
+ buf_pool_mutex_enter(buf_pool);
+
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_PREV(LRU, bpage)) {
+
+ ibool ready;
+
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+ ready = buf_flush_ready_for_replace(bpage);
+ mutex_exit(block_mutex);
+ if (ready) {
+ break;
+ }
+ }
+
+ if (!bpage) {
+ /* Can't find a single replaceable page. */
+ buf_pool_mutex_exit(buf_pool);
+ return(FALSE);
+ }
+
+ evict_zip = !buf_LRU_evict_from_unzip_LRU(buf_pool);
+
+ freed = buf_LRU_free_block(bpage, evict_zip);
+ buf_pool_mutex_exit(buf_pool);
+
+ return(freed);
+}
+
+/*********************************************************************
+Update the historical stats that we are collecting for flush rate
+heuristics at the end of each interval.
+The flush rate heuristic depends on (a) the rate of redo log generation
+and (b) the rate at which LRU flushing is happening. */
+UNIV_INTERN
+void
+buf_flush_stat_update(void)
+/*=======================*/
+{
+ buf_flush_stat_t* item;
+ lsn_t lsn_diff;
+ lsn_t lsn;
+ ulint n_flushed;
+
+ lsn = log_get_lsn();
+ if (buf_flush_stat_cur.redo == 0) {
+ /* First time around. Just update the current LSN
+ and return. */
+ buf_flush_stat_cur.redo = lsn;
+ return;
+ }
+
+ item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];
+
+ /* values for this interval */
+ lsn_diff = lsn - buf_flush_stat_cur.redo;
+ n_flushed = buf_lru_flush_page_count
+ - buf_flush_stat_cur.n_flushed;
+
+ /* add the current value and subtract the obsolete entry. */
+ buf_flush_stat_sum.redo += lsn_diff - item->redo;
+ buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;
+
+ /* put current entry in the array. */
+ item->redo = lsn_diff;
+ item->n_flushed = n_flushed;
+
+ /* update the index */
+ buf_flush_stat_arr_ind++;
+ buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;
+
+ /* reset the current entry. */
+ buf_flush_stat_cur.redo = lsn;
+ buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
+}
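+
+/* Worked example of the window maintenance above (hypothetical numbers):
+with BUF_FLUSH_STAT_N_INTERVAL == 20 the array keeps the last 20 one-second
+samples. If the slot being overwritten recorded 4 MB of redo while the
+interval that just ended generated 6 MB, buf_flush_stat_sum.redo grows by
+2 MB, so the running sum always equals the sum of the samples currently
+held in the array. */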
+
+/*********************************************************************
+Determines the fraction of dirty pages that need to be flushed based
+on the speed at which we generate redo log. Note that if redo log
+is generated at a significant rate without a corresponding increase
+in the number of dirty pages (for example, an in-memory workload),
+it can cause IO bursts of flushing. This function implements heuristics
+to avoid this burstiness.
+@return number of dirty pages to be flushed / second */
+static
+ulint
+buf_flush_get_desired_flush_rate(void)
+/*==================================*/
+{
+ ulint i;
+ lsn_t redo_avg;
+ ulint n_dirty = 0;
+ ib_uint64_t n_flush_req;
+ ib_uint64_t lru_flush_avg;
+ lsn_t lsn = log_get_lsn();
+ lsn_t log_capacity = log_get_capacity();
+
+ /* log_capacity should never be zero after the initialization
+ of the log subsystem. */
+ ut_ad(log_capacity != 0);
+
+ /* Get total number of dirty pages. It is OK to access
+ flush_list without holding any mutex as we are using this
+ only for heuristics. */
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+ n_dirty += UT_LIST_GET_LEN(buf_pool->flush_list);
+ }
+
+ /* An overflow can happen if we generate more than 2^32 bytes
+ of redo in this interval i.e.: 4G of redo in 1 second. We can
+ safely consider this as infinity because if we ever come close
+ to 4G we'll start a synchronous flush of dirty pages. */
+ /* redo_avg below is average at which redo is generated in
+ past BUF_FLUSH_STAT_N_INTERVAL + redo generated in the current
+ interval. */
+ redo_avg = buf_flush_stat_sum.redo / BUF_FLUSH_STAT_N_INTERVAL
+ + (lsn - buf_flush_stat_cur.redo);
+
+ /* An overflow can happen possibly if we flush more than 2^32
+ pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very very
+ unlikely scenario. Even when this happens it means that our
+ flush rate will be off the mark. It won't affect correctness
+ of any subsystem. */
+ /* lru_flush_avg below is rate at which pages are flushed as
+ part of LRU flush in past BUF_FLUSH_STAT_N_INTERVAL + the
+ number of pages flushed in the current interval. */
+ lru_flush_avg = buf_flush_stat_sum.n_flushed
+ / BUF_FLUSH_STAT_N_INTERVAL
+ + (buf_lru_flush_page_count
+ - buf_flush_stat_cur.n_flushed);
+
+ n_flush_req = (n_dirty * redo_avg) / log_capacity;
+
+ /* The number of pages that we want to flush from the flush
+ list is the difference between the required rate and the
+ number of pages that we are historically flushing from the
+ LRU list */
+ if (n_flush_req <= lru_flush_avg) {
+ return(0);
+ } else {
+ ib_uint64_t rate;
+
+ rate = n_flush_req - lru_flush_avg;
+
+ return((ulint) (rate < PCT_IO(100) ? rate : PCT_IO(100)));
+ }
+}
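+
+/* Worked example of the heuristic above (hypothetical numbers): with
+n_dirty == 100000 dirty pages, redo_avg == 16 MB of redo per second and
+log_capacity == 1600 MB, n_flush_req == 100000 * 16 / 1600 == 1000 pages
+per second. If lru_flush_avg == 200 pages per second, the flush list is
+asked for the remaining 800 pages per second, subject to the PCT_IO(100)
+cap. */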
+
+/*********************************************************************//**
+Clears up the tail of the LRU lists:
+* Put replaceable pages at the tail of LRU to the free list
+* Flush dirty pages at the tail of LRU to the disk
+The depth to which we scan each buffer pool is controlled by the dynamic
+config parameter innodb_LRU_scan_depth.
+@return total pages flushed */
+UNIV_INLINE
+ulint
+page_cleaner_flush_LRU_tail(void)
+/*=============================*/
+{
+ ulint i;
+ ulint j;
+ ulint total_flushed = 0;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+
+ buf_pool_t* buf_pool = buf_pool_from_array(i);
+
+ /* We divide LRU flush into smaller chunks because
+ there may be user threads waiting for the flush to
+ end in buf_LRU_get_free_block(). */
+ for (j = 0;
+ j < srv_LRU_scan_depth;
+ j += PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE) {
+
+ ulint n_flushed = buf_flush_LRU(buf_pool,
+ PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE);
+
+ /* Currently page_cleaner is the only thread
+ that can trigger an LRU flush. It is possible
+ that a batch triggered during the last iteration is
+ still running. */
+ if (n_flushed != ULINT_UNDEFINED) {
+ total_flushed += n_flushed;
+ }
+ }
+ }
+
+ if (total_flushed) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_COUNT,
+ MONITOR_LRU_BATCH_PAGES,
+ total_flushed);
+ }
+
+ return(total_flushed);
+}
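+
+/* For example, with srv_LRU_scan_depth == 1024 the inner loop above issues
+ceil(1024 / PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE) small LRU batches per buffer
+pool instance instead of one big batch, so a user thread waiting in
+buf_LRU_get_free_block() only has to wait for a short batch to finish. */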
+
+/*********************************************************************//**
+Wait for any possible LRU flushes that are in progress to end. */
+UNIV_INLINE
+void
+page_cleaner_wait_LRU_flush(void)
+/*=============================*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
+ || buf_pool->init_flush[BUF_FLUSH_LRU]) {
+
+ buf_pool_mutex_exit(buf_pool);
+ buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
+ } else {
+ buf_pool_mutex_exit(buf_pool);
+ }
+ }
+}
+
+/*********************************************************************//**
+Flush a batch of dirty pages from the flush list
+@return number of pages flushed, 0 if no page is flushed or if another
+flush_list type batch is running */
+static
+ulint
+page_cleaner_do_flush_batch(
+/*========================*/
+ ulint n_to_flush, /*!< in: number of pages that
+ we should attempt to flush. If
+ an lsn_limit is provided then
+ this value will have no effect */
+ lsn_t lsn_limit) /*!< in: LSN up to which flushing
+ must happen */
+{
+ ulint n_flushed;
+
+ ut_ad(n_to_flush == ULINT_MAX || lsn_limit == LSN_MAX);
+
+ n_flushed = buf_flush_list(n_to_flush, lsn_limit);
+ if (n_flushed == ULINT_UNDEFINED) {
+ n_flushed = 0;
+ }
+
+ return(n_flushed);
+}
+
+/*********************************************************************//**
+This function is called approximately once every second by the
+page_cleaner thread. Based on various factors it decides if there is a
+need to do flushing. If flushing is needed it is performed and the
+number of pages flushed is returned.
+@return number of pages flushed */
+static
+ulint
+page_cleaner_flush_pages_if_needed(void)
+/*====================================*/
+{
+ ulint n_pages_flushed = 0;
+ lsn_t lsn_limit = log_async_flush_lsn();
+
+ /* Currently we decide whether or not to flush and how much to
+ flush based on three factors.
+
+ 1) If the amount of redo (in terms of LSN) for which pages have
+ not yet been flushed to disk is greater than
+ log_sys->max_modified_age_async. This is the most urgent type
+ of flush and we attempt to clean up enough of the tail of the
+ flush_list to avoid flushing inside user threads.
+
+ 2) If the modified page ratio is greater than the limit
+ specified by the user. In that case we flush the full 100%
+ IO_CAPACITY of the server. Note that 1 and 2 are not mutually
+ exclusive. We can end up executing both steps.
+
+ 3) If adaptive_flushing is set by the user and neither 1
+ nor 2 has occurred above then we flush a batch based on our
+ heuristics. */
+
+ if (lsn_limit != LSN_MAX) {
+
+ /* async flushing is requested */
+ n_pages_flushed = page_cleaner_do_flush_batch(ULINT_MAX,
+ lsn_limit);
+
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_ASYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_ASYNC_COUNT,
+ MONITOR_FLUSH_ASYNC_PAGES,
+ n_pages_flushed);
+ }
+
+ if (UNIV_UNLIKELY(n_pages_flushed < PCT_IO(100)
+ && buf_get_modified_ratio_pct()
+ > srv_max_buf_pool_modified_pct)) {
+
+ /* Try to keep the number of modified pages in the
+ buffer pool under the limit wished by the user */
+
+ n_pages_flushed += page_cleaner_do_flush_batch(PCT_IO(100),
+ LSN_MAX);
+
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_MAX_DIRTY_TOTAL_PAGE,
+ MONITOR_FLUSH_MAX_DIRTY_COUNT,
+ MONITOR_FLUSH_MAX_DIRTY_PAGES,
+ n_pages_flushed);
+ }
+
+ if (srv_adaptive_flushing && n_pages_flushed == 0) {
+
+ /* Try to keep the rate of flushing of dirty
+ pages such that redo log generation does not
+ produce bursts of IO at checkpoint time. */
+ ulint n_flush = buf_flush_get_desired_flush_rate();
+
+ ut_ad(n_flush <= PCT_IO(100));
+ if (n_flush) {
+ n_pages_flushed = page_cleaner_do_flush_batch(
+ n_flush, LSN_MAX);
+
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_COUNT,
+ MONITOR_FLUSH_ADAPTIVE_PAGES,
+ n_pages_flushed);
+ }
+ }
+
+ return(n_pages_flushed);
+}
+
+/*********************************************************************//**
+Puts the page_cleaner thread to sleep if it has finished work in less
+than a second. */
+static
+void
+page_cleaner_sleep_if_needed(
+/*=========================*/
+ ulint next_loop_time) /*!< in: time when next loop iteration
+ should start */
+{
+ ulint cur_time = ut_time_ms();
+
+ if (next_loop_time > cur_time) {
+ /* Get the sleep interval in microseconds. We use
+ ut_min() to avoid a long sleep in case of
+ wraparound. */
+ os_thread_sleep(ut_min(1000000,
+ (next_loop_time - cur_time)
+ * 1000));
+ }
+}
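+
+/* For example, if the last iteration took 300 ms of the one-second budget,
+next_loop_time is still 700 ms away and the call above sleeps for
+700 * 1000 microseconds. The ut_min() cap keeps the sleep at no more than
+one second even if ut_time_ms() has wrapped around. */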
+
+/******************************************************************//**
+page_cleaner thread tasked with flushing dirty pages from the buffer
+pools. As of now we'll have only one instance of this thread.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_flush_page_cleaner_thread)(
+/*==========================================*/
+ void* arg __attribute__((unused)))
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+{
+ ulint next_loop_time = ut_time_ms() + 1000;
+ ulint n_flushed = 0;
+ ulint last_activity = srv_get_activity_count();
+ ulint i;
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(buf_page_cleaner_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "InnoDB: page_cleaner thread running, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+ buf_page_cleaner_is_active = TRUE;
+
+ while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+
+ /* The page_cleaner skips the sleep if the server is
+ idle, there are no pending read I/Os in the buffer pool
+ and there is work to do. */
+ if (srv_check_activity(last_activity)
+ || buf_get_n_pending_read_ios()
+ || n_flushed == 0) {
+ page_cleaner_sleep_if_needed(next_loop_time);
+ }
+
+ next_loop_time = ut_time_ms() + 1000;
+
+ if (srv_check_activity(last_activity)) {
+ last_activity = srv_get_activity_count();
+
+ /* Flush pages from end of LRU if required */
+ n_flushed = page_cleaner_flush_LRU_tail();
+
+ /* Flush pages from flush_list if required */
+ n_flushed += page_cleaner_flush_pages_if_needed();
+ } else {
+ n_flushed = page_cleaner_do_flush_batch(
+ PCT_IO(100),
+ LSN_MAX);
+
+ if (n_flushed) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_COUNT,
+ MONITOR_FLUSH_BACKGROUND_PAGES,
+ n_flushed);
+ }
+ }
+ }
+
+ ut_ad(srv_shutdown_state > 0);
+ if (srv_fast_shutdown == 2) {
+ /* In a very fast shutdown we simulate a crash of the
+ buffer pool. We are not required to do any flushing. */
+ goto thread_exit;
+ }
+
+ /* In case of normal and slow shutdown the page_cleaner thread
+ must wait for all other activity in the server to die down.
+ Note that we can start flushing the buffer pool as soon as the
+ server enters shutdown phase but we must stay alive long enough
+ to ensure that any work done by the master or purge threads is
+ also flushed.
+ During shutdown we pass through two stages. In the first stage,
+ when SRV_SHUTDOWN_CLEANUP is set, other threads like the master
+ and the purge threads may be working as well. We start flushing
+ the buffer pool but can't be sure that no new pages are being
+ dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. */
+
+ do {
+ n_flushed = page_cleaner_do_flush_batch(PCT_IO(100), LSN_MAX);
+
+ /* We sleep only if there are no pages to flush */
+ if (n_flushed == 0) {
+ os_thread_sleep(100000);
+ }
+ } while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
+
+ /* At this point all threads including the master and the purge
+ thread must have been suspended. */
+ ut_a(srv_get_active_thread_type() == SRV_NONE);
+ ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
+
+ /* We can now make a final sweep on flushing the buffer pool
+ and exit after we have cleaned the whole buffer pool.
+ It is important that we wait for any running batch that has
+ been triggered by us to finish. Otherwise we can end up
+ considering the end of that batch as the finish of our final
+ sweep and we'll come out of the loop leaving behind dirty pages
+ in the flush_list. */
+ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+ page_cleaner_wait_LRU_flush();
+
+ do {
+
+ n_flushed = buf_flush_list(PCT_IO(100), LSN_MAX);
+ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+ } while (n_flushed > 0);
+
+ /* Some sanity checks */
+ ut_a(srv_get_active_thread_type() == SRV_NONE);
+ ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool = buf_pool_from_array(i);
+ ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
+ }
+
+ /* We have lived our life. Time to die. */
+
+thread_exit:
+ buf_page_cleaner_is_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
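+
+/* Illustrative note (a sketch, not from this file): the thread above is
+started once during server startup, roughly as
+
+ os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL);
+
+and it exits on its own once srv_shutdown_state advances and the final
+flush sweep above has emptied the flush lists. */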
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+
+/** Functor to validate the flush list. */
+struct Check {
+ void operator()(const buf_page_t* elem)
+ {
+ ut_a(elem->in_flush_list);
+ }
+};
+
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+static
+ibool
+buf_flush_validate_low(
+/*===================*/
+ buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
+{
+ buf_page_t* bpage;
+ const ib_rbt_node_t* rnode = NULL;
+
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+ UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list, Check());
+
+ bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+ /* If we are in recovery mode i.e.: flush_rbt != NULL
+ then each block in the flush_list must also be present
+ in the flush_rbt. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ rnode = rbt_first(buf_pool->flush_rbt);
+ }
+
+ while (bpage != NULL) {
+ const lsn_t om = bpage->oldest_modification;
+
+ ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
+
+ ut_ad(bpage->in_flush_list);
+
+ /* A page in buf_pool->flush_list can be in
+ BUF_BLOCK_REMOVE_HASH state. This happens when a page
+ is in the middle of being relocated. In that case the
+ original descriptor can have this state and still be
+ in the flush list waiting to acquire the
+ buf_pool->flush_list_mutex to complete the relocation. */
+ ut_a(buf_page_in_file(bpage)
+ || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
+ ut_a(om > 0);
+
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_page_t** prpage;
+
+ ut_a(rnode);
+ prpage = rbt_value(buf_page_t*, rnode);
+
+ ut_a(*prpage);
+ ut_a(*prpage == bpage);
+ rnode = rbt_next(buf_pool->flush_rbt, rnode);
+ }
+
+ bpage = UT_LIST_GET_NEXT(list, bpage);
+
+ ut_a(!bpage || om >= bpage->oldest_modification);
+ }
+
+ /* By this time we must have exhausted the traversal of
+ flush_rbt (if active) as well. */
+ ut_a(rnode == NULL);
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+buf_flush_validate(
+/*===============*/
+ buf_pool_t* buf_pool) /*!< buffer pool instance */
+{
+ ibool ret;
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ ret = buf_flush_validate_low(buf_pool);
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ return(ret);
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* !UNIV_HOTBACKUP */