path: root/storage/xtradb/buf
Diffstat (limited to 'storage/xtradb/buf')
-rw-r--r--  storage/xtradb/buf/buf0buddy.c                                               595
-rw-r--r--  storage/xtradb/buf/buf0buddy.cc                                              726
-rw-r--r--  storage/xtradb/buf/buf0buf.cc (renamed from storage/xtradb/buf/buf0buf.c)   1911
-rw-r--r--  storage/xtradb/buf/buf0checksum.cc                                           155
-rw-r--r--  storage/xtradb/buf/buf0dblwr.cc                                             1136
-rw-r--r--  storage/xtradb/buf/buf0dump.cc                                               621
-rw-r--r--  storage/xtradb/buf/buf0flu.c                                                2402
-rw-r--r--  storage/xtradb/buf/buf0flu.cc                                               2938
-rw-r--r--  storage/xtradb/buf/buf0lru.cc (renamed from storage/xtradb/buf/buf0lru.c)   1976
-rw-r--r--  storage/xtradb/buf/buf0rea.cc (renamed from storage/xtradb/buf/buf0rea.c)    224
10 files changed, 7582 insertions, 5102 deletions
diff --git a/storage/xtradb/buf/buf0buddy.c b/storage/xtradb/buf/buf0buddy.c
deleted file mode 100644
index 439be08b01f..00000000000
--- a/storage/xtradb/buf/buf0buddy.c
+++ /dev/null
@@ -1,595 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file buf/buf0buddy.c
-Binary buddy allocator for compressed pages
-
-Created December 2006 by Marko Makela
-*******************************************************/
-
-#define THIS_MODULE
-#include "buf0buddy.h"
-#ifdef UNIV_NONINL
-# include "buf0buddy.ic"
-#endif
-#undef THIS_MODULE
-#include "buf0buf.h"
-#include "buf0lru.h"
-#include "buf0flu.h"
-#include "page0zip.h"
-
-/**********************************************************************//**
-Get the offset of the buddy of a compressed page frame.
-@return the buddy relative of page */
-UNIV_INLINE
-byte*
-buf_buddy_get(
-/*==========*/
- byte* page, /*!< in: compressed page */
- ulint size) /*!< in: page size in bytes */
-{
- ut_ad(ut_is_2pow(size));
- ut_ad(size >= BUF_BUDDY_LOW);
- ut_ad(size < BUF_BUDDY_HIGH);
- ut_ad(!ut_align_offset(page, size));
-
- if (((ulint) page) & size) {
- return(page - size);
- } else {
- return(page + size);
- }
-}
-
-/** Validate a given zip_free list. */
-#define BUF_BUDDY_LIST_VALIDATE(b, i) \
- UT_LIST_VALIDATE(zip_list, buf_page_t, \
- b->zip_free[i], \
- ut_ad(buf_page_get_state( \
- ut_list_node_313) \
- == BUF_BLOCK_ZIP_FREE))
-
-/**********************************************************************//**
-Add a block to the head of the appropriate buddy free list. */
-UNIV_INLINE
-void
-buf_buddy_add_to_free(
-/*==================*/
- buf_pool_t* buf_pool, /*!< in: buffer pool instance */
- buf_page_t* bpage, /*!< in,own: block to be freed */
- ulint i) /*!< in: index of
- buf_pool->zip_free[] */
-{
- //ut_ad(buf_pool_mutex_own(buf_pool));
- ut_ad(mutex_own(&buf_pool->zip_free_mutex));
- ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
- ut_ad(buf_pool->zip_free[i].start != bpage);
- UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_free[i], bpage);
-}
-
-/**********************************************************************//**
-Remove a block from the appropriate buddy free list. */
-UNIV_INLINE
-void
-buf_buddy_remove_from_free(
-/*=======================*/
- buf_pool_t* buf_pool, /*!< in: buffer pool instance */
- buf_page_t* bpage, /*!< in: block to be removed */
- ulint i) /*!< in: index of
- buf_pool->zip_free[] */
-{
-#ifdef UNIV_DEBUG
- buf_page_t* prev = UT_LIST_GET_PREV(zip_list, bpage);
- buf_page_t* next = UT_LIST_GET_NEXT(zip_list, bpage);
-
- ut_ad(!prev || buf_page_get_state(prev) == BUF_BLOCK_ZIP_FREE);
- ut_ad(!next || buf_page_get_state(next) == BUF_BLOCK_ZIP_FREE);
-#endif /* UNIV_DEBUG */
-
- //ut_ad(buf_pool_mutex_own(buf_pool));
- ut_ad(mutex_own(&buf_pool->zip_free_mutex));
- ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
- UT_LIST_REMOVE(zip_list, buf_pool->zip_free[i], bpage);
-}
-
-/**********************************************************************//**
-Try to allocate a block from buf_pool->zip_free[].
-@return allocated block, or NULL if buf_pool->zip_free[] was empty */
-static
-void*
-buf_buddy_alloc_zip(
-/*================*/
- buf_pool_t* buf_pool, /*!< in: buffer pool instance */
- ulint i) /*!< in: index of buf_pool->zip_free[] */
-{
- buf_page_t* bpage;
-
- //ut_ad(buf_pool_mutex_own(buf_pool));
- ut_ad(mutex_own(&buf_pool->zip_free_mutex));
- ut_a(i < BUF_BUDDY_SIZES);
- ut_a(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
-
- ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
-
- bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]);
-
- if (bpage) {
- ut_a(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
-
- buf_buddy_remove_from_free(buf_pool, bpage, i);
- } else if (i + 1 < BUF_BUDDY_SIZES) {
- /* Attempt to split. */
- bpage = buf_buddy_alloc_zip(buf_pool, i + 1);
-
- if (bpage) {
- buf_page_t* buddy = (buf_page_t*)
- (((char*) bpage) + (BUF_BUDDY_LOW << i));
-
- ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
- ut_d(memset(buddy, i, BUF_BUDDY_LOW << i));
- buddy->state = BUF_BLOCK_ZIP_FREE;
- buf_buddy_add_to_free(buf_pool, buddy, i);
- }
- }
-
- if (bpage) {
- ut_d(memset(bpage, ~i, BUF_BUDDY_LOW << i));
- UNIV_MEM_ALLOC(bpage, BUF_BUDDY_SIZES << i);
- }
-
- return(bpage);
-}
-
-/**********************************************************************//**
-Deallocate a buffer frame of UNIV_PAGE_SIZE. */
-static
-void
-buf_buddy_block_free(
-/*=================*/
- buf_pool_t* buf_pool, /*!< in: buffer pool instance */
- void* buf, /*!< in: buffer frame to deallocate */
- ibool have_page_hash_mutex)
-{
- const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf);
- buf_page_t* bpage;
- buf_block_t* block;
-
- //ut_ad(buf_pool_mutex_own(buf_pool));
- ut_ad(!mutex_own(&buf_pool->zip_mutex));
- ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE));
-
- mutex_enter(&buf_pool->zip_hash_mutex);
-
- HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage,
- ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY
- && bpage->in_zip_hash && !bpage->in_page_hash),
- ((buf_block_t*) bpage)->frame == buf);
- ut_a(bpage);
- ut_a(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY);
- ut_ad(!bpage->in_page_hash);
- ut_ad(bpage->in_zip_hash);
- ut_d(bpage->in_zip_hash = FALSE);
- HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage);
-
- mutex_exit(&buf_pool->zip_hash_mutex);
-
- ut_d(memset(buf, 0, UNIV_PAGE_SIZE));
- UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE);
-
- block = (buf_block_t*) bpage;
- mutex_enter(&block->mutex);
- buf_LRU_block_free_non_file_page(block, have_page_hash_mutex);
- mutex_exit(&block->mutex);
-
- ut_ad(buf_pool->buddy_n_frames > 0);
- ut_d(buf_pool->buddy_n_frames--);
-}
-
-/**********************************************************************//**
-Allocate a buffer block to the buddy allocator. */
-static
-void
-buf_buddy_block_register(
-/*=====================*/
- buf_block_t* block) /*!< in: buffer frame to allocate */
-{
- buf_pool_t* buf_pool = buf_pool_from_block(block);
- const ulint fold = BUF_POOL_ZIP_FOLD(block);
- //ut_ad(buf_pool_mutex_own(buf_pool));
- ut_ad(!mutex_own(&buf_pool->zip_mutex));
- ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE);
-
- buf_block_set_state(block, BUF_BLOCK_MEMORY);
-
- ut_a(block->frame);
- ut_a(!ut_align_offset(block->frame, UNIV_PAGE_SIZE));
-
- ut_ad(!block->page.in_page_hash);
- ut_ad(!block->page.in_zip_hash);
- ut_d(block->page.in_zip_hash = TRUE);
-
- mutex_enter(&buf_pool->zip_hash_mutex);
- HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page);
- mutex_exit(&buf_pool->zip_hash_mutex);
-
- ut_d(buf_pool->buddy_n_frames++);
-}
-
-/**********************************************************************//**
-Allocate a block from a bigger object.
-@return allocated block */
-static
-void*
-buf_buddy_alloc_from(
-/*=================*/
- buf_pool_t* buf_pool, /*!< in: buffer pool instance */
- void* buf, /*!< in: a block that is free to use */
- ulint i, /*!< in: index of
- buf_pool->zip_free[] */
- ulint j) /*!< in: size of buf as an index
- of buf_pool->zip_free[] */
-{
- ulint offs = BUF_BUDDY_LOW << j;
- ut_ad(j <= BUF_BUDDY_SIZES);
- ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
- ut_ad(j >= i);
- ut_ad(!ut_align_offset(buf, offs));
-
- /* Add the unused parts of the block to the free lists. */
- while (j > i) {
- buf_page_t* bpage;
-
- offs >>= 1;
- j--;
-
- bpage = (buf_page_t*) ((byte*) buf + offs);
- ut_d(memset(bpage, j, BUF_BUDDY_LOW << j));
- bpage->state = BUF_BLOCK_ZIP_FREE;
- ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
- buf_buddy_add_to_free(buf_pool, bpage, j);
- }
-
- return(buf);
-}
-
-/**********************************************************************//**
-Allocate a block. The thread calling this function must hold
-buf_pool->mutex and must not hold buf_pool->zip_mutex or any block->mutex.
-The buf_pool_mutex may be released and reacquired.
-@return allocated block, never NULL */
-UNIV_INTERN
-void*
-buf_buddy_alloc_low(
-/*================*/
- buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
- ulint i, /*!< in: index of buf_pool->zip_free[],
- or BUF_BUDDY_SIZES */
- ibool* lru, /*!< in: pointer to a variable that
- will be assigned TRUE if storage was
- allocated from the LRU list and
- buf_pool->mutex was temporarily
- released */
- ibool have_page_hash_mutex)
-{
- buf_block_t* block;
-
- ut_ad(lru);
- //ut_ad(buf_pool_mutex_own(buf_pool));
- ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
- ut_ad(!mutex_own(&buf_pool->zip_mutex));
- ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
-
- if (i < BUF_BUDDY_SIZES) {
- /* Try to allocate from the buddy system. */
- mutex_enter(&buf_pool->zip_free_mutex);
- block = buf_buddy_alloc_zip(buf_pool, i);
-
- if (block) {
- goto func_exit;
- }
- mutex_exit(&buf_pool->zip_free_mutex);
- }
-
- /* Try allocating from the buf_pool->free list. */
- block = buf_LRU_get_free_only(buf_pool);
-
- if (block) {
-
- goto alloc_big;
- }
-
- /* Try replacing an uncompressed page in the buffer pool. */
- //buf_pool_mutex_exit(buf_pool);
- mutex_exit(&buf_pool->LRU_list_mutex);
- if (have_page_hash_mutex) {
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
- }
- block = buf_LRU_get_free_block(buf_pool);
- *lru = TRUE;
- //buf_pool_mutex_enter(buf_pool);
- mutex_enter(&buf_pool->LRU_list_mutex);
- if (have_page_hash_mutex) {
- rw_lock_x_lock(&buf_pool->page_hash_latch);
- }
-
-alloc_big:
- buf_buddy_block_register(block);
-
- mutex_enter(&buf_pool->zip_free_mutex);
- block = buf_buddy_alloc_from(
- buf_pool, block->frame, i, BUF_BUDDY_SIZES);
-
-func_exit:
- buf_pool->buddy_stat[i].used++;
- mutex_exit(&buf_pool->zip_free_mutex);
-
- return(block);
-}
-
-/**********************************************************************//**
-Try to relocate a block.
-@return TRUE if relocated */
-static
-ibool
-buf_buddy_relocate(
-/*===============*/
- buf_pool_t* buf_pool, /*!< in: buffer pool instance */
- void* src, /*!< in: block to relocate */
- void* dst, /*!< in: free block to relocate to */
- ulint i, /*!< in: index of
- buf_pool->zip_free[] */
- ibool have_page_hash_mutex)
-{
- buf_page_t* bpage;
- const ulint size = BUF_BUDDY_LOW << i;
- mutex_t* mutex;
- ulint space;
- ulint page_no;
-
- //ut_ad(buf_pool_mutex_own(buf_pool));
- ut_ad(mutex_own(&buf_pool->zip_free_mutex));
- ut_ad(!mutex_own(&buf_pool->zip_mutex));
- ut_ad(!ut_align_offset(src, size));
- ut_ad(!ut_align_offset(dst, size));
- ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
- UNIV_MEM_ASSERT_W(dst, size);
-
- if (!have_page_hash_mutex) {
- mutex_exit(&buf_pool->zip_free_mutex);
- mutex_enter(&buf_pool->LRU_list_mutex);
- rw_lock_x_lock(&buf_pool->page_hash_latch);
- }
-
- /* We assume that all memory from buf_buddy_alloc()
- is used for compressed page frames. */
-
- /* We look inside the allocated objects returned by
- buf_buddy_alloc() and assume that each block is a compressed
- page that contains a valid space_id and page_no in the page
- header. Should the fields be invalid, we will be unable to
- relocate the block. */
-
- /* The src block may be split into smaller blocks,
- some of which may be free. Thus, the
- mach_read_from_4() calls below may attempt to read
- from free memory. The memory is "owned" by the buddy
- allocator (and it has been allocated from the buffer
- pool), so there is nothing wrong about this. The
- mach_read_from_4() calls here will only trigger bogus
- Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */
- space = mach_read_from_4((const byte *) src
- + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
- page_no = mach_read_from_4((const byte *) src
- + FIL_PAGE_OFFSET);
- /* Suppress Valgrind warnings about conditional jump
- on uninitialized value. */
- UNIV_MEM_VALID(&space, sizeof space);
- UNIV_MEM_VALID(&page_no, sizeof page_no);
- bpage = buf_page_hash_get(buf_pool, space, page_no);
-
- if (!bpage || bpage->zip.data != src) {
- /* The block has probably been freshly
- allocated by buf_LRU_get_free_block() but not
- added to buf_pool->page_hash yet. Obviously,
- it cannot be relocated. */
-
- if (!have_page_hash_mutex) {
- mutex_enter(&buf_pool->zip_free_mutex);
- mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
- }
- return(FALSE);
- }
-
- if (page_zip_get_size(&bpage->zip) != size) {
- /* The block is of different size. We would
- have to relocate all blocks covered by src.
- For the sake of simplicity, give up. */
- ut_ad(page_zip_get_size(&bpage->zip) < size);
-
- if (!have_page_hash_mutex) {
- mutex_enter(&buf_pool->zip_free_mutex);
- mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
- }
- return(FALSE);
- }
-
- /* To keep latch order */
- if (have_page_hash_mutex)
- mutex_exit(&buf_pool->zip_free_mutex);
-
- /* The block must have been allocated, but it may
- contain uninitialized data. */
- UNIV_MEM_ASSERT_W(src, size);
-
- mutex = buf_page_get_mutex_enter(bpage);
-
- mutex_enter(&buf_pool->zip_free_mutex);
-
- if (mutex && buf_page_can_relocate(bpage)) {
- /* Relocate the compressed page. */
- ullint usec = ut_time_us(NULL);
- ut_a(bpage->zip.data == src);
- memcpy(dst, src, size);
- bpage->zip.data = dst;
- mutex_exit(mutex);
- UNIV_MEM_INVALID(src, size);
- {
- buf_buddy_stat_t* buddy_stat
- = &buf_pool->buddy_stat[i];
- buddy_stat->relocated++;
- buddy_stat->relocated_usec
- += ut_time_us(NULL) - usec;
- }
-
- if (!have_page_hash_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
- }
- return(TRUE);
- }
-
- if (!have_page_hash_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
- }
-
- if (mutex) {
- mutex_exit(mutex);
- }
- return(FALSE);
-}
-
-/**********************************************************************//**
-Deallocate a block. */
-UNIV_INTERN
-void
-buf_buddy_free_low(
-/*===============*/
- buf_pool_t* buf_pool, /*!< in: buffer pool instance */
- void* buf, /*!< in: block to be freed, must not be
- pointed to by the buffer pool */
- ulint i, /*!< in: index of buf_pool->zip_free[],
- or BUF_BUDDY_SIZES */
- ibool have_page_hash_mutex)
-{
- buf_page_t* bpage;
- buf_page_t* buddy;
-
- //ut_ad(buf_pool_mutex_own(buf_pool));
- ut_ad(mutex_own(&buf_pool->zip_free_mutex));
- ut_ad(!mutex_own(&buf_pool->zip_mutex));
- ut_ad(i <= BUF_BUDDY_SIZES);
- ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
- ut_ad(buf_pool->buddy_stat[i].used > 0);
-
- buf_pool->buddy_stat[i].used--;
-recombine:
- UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i);
- ((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE;
-
- if (i == BUF_BUDDY_SIZES) {
- mutex_exit(&buf_pool->zip_free_mutex);
- buf_buddy_block_free(buf_pool, buf, have_page_hash_mutex);
- mutex_enter(&buf_pool->zip_free_mutex);
- return;
- }
-
- ut_ad(i < BUF_BUDDY_SIZES);
- ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
- ut_ad(!buf_pool_contains_zip(buf_pool, buf));
-
- /* Do not recombine blocks if there are few free blocks.
- We may waste up to 15360*max_len bytes to free blocks
- (1024 + 2048 + 4096 + 8192 = 15360) */
- if (UT_LIST_GET_LEN(buf_pool->zip_free[i]) < 16) {
- goto func_exit;
- }
-
- /* Try to combine adjacent blocks. */
- buddy = (buf_page_t*) buf_buddy_get(((byte*) buf), BUF_BUDDY_LOW << i);
-
-#ifndef UNIV_DEBUG_VALGRIND
- /* When Valgrind instrumentation is not enabled, we can read
- buddy->state to quickly determine that a block is not free.
- When the block is not free, buddy->state belongs to a compressed
- page frame that may be flagged uninitialized in our Valgrind
- instrumentation. */
-
- if (buddy->state != BUF_BLOCK_ZIP_FREE) {
-
- goto buddy_nonfree;
- }
-#endif /* !UNIV_DEBUG_VALGRIND */
-
- for (bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); bpage; ) {
- ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
-
- if (bpage == buddy) {
- /* The buddy is free: recombine */
- buf_buddy_remove_from_free(buf_pool, bpage, i);
-buddy_is_free:
- ut_ad(buf_page_get_state(buddy) == BUF_BLOCK_ZIP_FREE);
- ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
- i++;
- buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
-
- goto recombine;
- }
-
- ut_a(bpage != buf);
- UNIV_MEM_ASSERT_W(bpage, BUF_BUDDY_LOW << i);
- bpage = UT_LIST_GET_NEXT(zip_list, bpage);
- }
-
-#ifndef UNIV_DEBUG_VALGRIND
-buddy_nonfree:
-#endif /* !UNIV_DEBUG_VALGRIND */
-
- ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
-
- /* The buddy is not free. Is there a free block of this size? */
- bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]);
-
- if (bpage) {
-
- /* Remove the block from the free list, because a successful
- buf_buddy_relocate() will overwrite bpage->list. */
- buf_buddy_remove_from_free(buf_pool, bpage, i);
-
- /* Try to relocate the buddy of buf to the free block. */
- if (buf_buddy_relocate(buf_pool, buddy, bpage, i, have_page_hash_mutex)) {
-
- buddy->state = BUF_BLOCK_ZIP_FREE;
- goto buddy_is_free;
- }
-
- buf_buddy_add_to_free(buf_pool, bpage, i);
- }
-
-func_exit:
- /* Free the block to the buddy list. */
- bpage = buf;
-
- /* Fill large blocks with a constant pattern. */
- ut_d(memset(bpage, i, BUF_BUDDY_LOW << i));
- UNIV_MEM_INVALID(bpage, BUF_BUDDY_LOW << i);
- bpage->state = BUF_BLOCK_ZIP_FREE;
- buf_buddy_add_to_free(buf_pool, bpage, i);
-}
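Aside (not part of the patch): both the deleted buf0buddy.c above and the new buf0buddy.cc below locate a block's buddy the same way. For a block of size 2^k whose address is aligned to 2^k, the buddy is the other half of the enclosing 2^(k+1) block, so the two addresses differ only in the 2^k bit, which is exactly what the if/else in buf_buddy_get() tests. The standalone C sketch below only illustrates that computation; buddy_of and the example addresses are hypothetical names, not InnoDB code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Return the address of the buddy of a size-aligned block of 'size' bytes.
   Same test as buf_buddy_get(): flip the 'size' bit of the address. */
static uintptr_t buddy_of(uintptr_t block, size_t size)
{
	assert((size & (size - 1)) == 0);	/* size is a power of two  */
	assert((block & (size - 1)) == 0);	/* block is size-aligned   */

	if (block & size) {
		return block - size;		/* buddy precedes the block */
	}
	return block + size;			/* buddy follows the block  */
}

int main(void)
{
	uintptr_t base = 0x10000;	/* imagine a 64K-aligned arena       */
	uintptr_t lo = base;		/* first 4K half of an 8K region     */
	uintptr_t hi = base + 4096;	/* second 4K half                    */

	assert(buddy_of(lo, 4096) == hi);
	assert(buddy_of(hi, 4096) == lo);
	printf("buddy of 0x%lx is 0x%lx\n",
	       (unsigned long) lo, (unsigned long) buddy_of(lo, 4096));
	return 0;
}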
diff --git a/storage/xtradb/buf/buf0buddy.cc b/storage/xtradb/buf/buf0buddy.cc
new file mode 100644
index 00000000000..3f8f339a81a
--- /dev/null
+++ b/storage/xtradb/buf/buf0buddy.cc
@@ -0,0 +1,726 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buddy.cc
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#define THIS_MODULE
+#include "buf0buddy.h"
+#ifdef UNIV_NONINL
+# include "buf0buddy.ic"
+#endif
+#undef THIS_MODULE
+#include "buf0buf.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "page0zip.h"
+#include "srv0start.h"
+
+/** When freeing a buf we attempt to coalesce by looking at its buddy
+and deciding whether it is free or not. To ascertain if the buddy is
+free we look for BUF_BUDDY_STAMP_FREE at BUF_BUDDY_STAMP_OFFSET
+within the buddy. The question is how we can be sure that it is
+safe to look at BUF_BUDDY_STAMP_OFFSET.
+The answer lies in the following invariants:
+* All blocks allocated by the buddy allocator are used for compressed
+page frames.
+* A compressed table always has space_id < SRV_LOG_SPACE_FIRST_ID.
+* BUF_BUDDY_STAMP_OFFSET always points to the space_id field in
+a frame.
+ -- The above is true because we look at these fields when the
+ corresponding buddy block is free, which implies that:
+ * The block we are looking at must have an address aligned at
+ the same size that its free buddy has. For example, if we have
+ a free block of 8K then its buddy's address must be aligned at
+ 8K as well.
+ * It is possible that the block we are looking at may have been
+ further divided into smaller blocks, but its starting
+ address must still remain the start of a page frame, i.e. it
+ cannot be the middle of a block. For example, if we have a free
+ block of size 8K then its buddy may be divided into blocks
+ of, say, 1K, 1K, 2K and 4K, but the buddy's address will still be
+ the starting address of the first 1K compressed page.
+ * What is important to note is that, for any given block, the
+ buddy's address cannot be in the middle of a larger block, i.e.
+ in the above example, our 8K block cannot have a buddy whose address
+ is aligned on 8K but is part of a larger 16K block.
+*/
+
+/** Offset within buf_buddy_free_t where free or non_free stamps
+are written.*/
+#define BUF_BUDDY_STAMP_OFFSET FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
+
+/** Value that we stamp on all buffers that are currently on the zip_free
+list. This value is stamped at BUF_BUDDY_STAMP_OFFSET offset */
+#define BUF_BUDDY_STAMP_FREE (SRV_LOG_SPACE_FIRST_ID)
+
+/** Stamp value for non-free buffers. Will be overwritten by a non-zero
+value by the consumer of the block */
+#define BUF_BUDDY_STAMP_NONFREE (0XFFFFFFFF)
+
+#if BUF_BUDDY_STAMP_FREE >= BUF_BUDDY_STAMP_NONFREE
+# error "BUF_BUDDY_STAMP_FREE >= BUF_BUDDY_STAMP_NONFREE"
+#endif
+
+/** Return type of buf_buddy_is_free() */
+enum buf_buddy_state_t {
+ BUF_BUDDY_STATE_FREE, /*!< The buddy is completely free */
+ BUF_BUDDY_STATE_USED, /*!< The buddy is currently in use */
+ BUF_BUDDY_STATE_PARTIALLY_USED/*!< Some sub-blocks in the buddy
+ are in use */
+};
+
+#ifdef UNIV_DEBUG_VALGRIND
+/**********************************************************************//**
+Invalidate memory area that we won't access while page is free */
+UNIV_INLINE
+void
+buf_buddy_mem_invalid(
+/*==================*/
+ buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of zip_free[] */
+{
+ const size_t size = BUF_BUDDY_LOW << i;
+ ut_ad(i <= BUF_BUDDY_SIZES);
+
+ UNIV_MEM_ASSERT_W(buf, size);
+ UNIV_MEM_INVALID(buf, size);
+}
+#else /* UNIV_DEBUG_VALGRIND */
+# define buf_buddy_mem_invalid(buf, i) ut_ad((i) <= BUF_BUDDY_SIZES)
+#endif /* UNIV_DEBUG_VALGRIND */
+
+/**********************************************************************//**
+Check if a buddy is stamped free.
+@return whether the buddy is free */
+UNIV_INLINE __attribute__((warn_unused_result))
+bool
+buf_buddy_stamp_is_free(
+/*====================*/
+ const buf_buddy_free_t* buf) /*!< in: block to check */
+{
+ return(mach_read_from_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET)
+ == BUF_BUDDY_STAMP_FREE);
+}
+
+/**********************************************************************//**
+Stamps a buddy free. */
+UNIV_INLINE
+void
+buf_buddy_stamp_free(
+/*=================*/
+ buf_buddy_free_t* buf, /*!< in/out: block to stamp */
+ ulint i) /*!< in: block size */
+{
+ ut_d(memset(buf, i, BUF_BUDDY_LOW << i));
+ buf_buddy_mem_invalid(buf, i);
+ mach_write_to_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET,
+ BUF_BUDDY_STAMP_FREE);
+ buf->stamp.size = i;
+}
+
+/**********************************************************************//**
+Stamps a buddy nonfree.
+@param[in,out] buf block to stamp
+@param[in] i block size */
+#define buf_buddy_stamp_nonfree(buf, i) do { \
+ buf_buddy_mem_invalid(buf, i); \
+ memset(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, 0xff, 4); \
+} while (0)
+#if BUF_BUDDY_STAMP_NONFREE != 0xffffffff
+# error "BUF_BUDDY_STAMP_NONFREE != 0xffffffff"
+#endif
+
+/**********************************************************************//**
+Get the offset of the buddy of a compressed page frame.
+@return the buddy relative of page */
+UNIV_INLINE
+void*
+buf_buddy_get(
+/*==========*/
+ byte* page, /*!< in: compressed page */
+ ulint size) /*!< in: page size in bytes */
+{
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size >= BUF_BUDDY_LOW);
+ ut_ad(BUF_BUDDY_LOW <= UNIV_ZIP_SIZE_MIN);
+ ut_ad(size < BUF_BUDDY_HIGH);
+ ut_ad(BUF_BUDDY_HIGH == UNIV_PAGE_SIZE);
+ ut_ad(!ut_align_offset(page, size));
+
+ if (((ulint) page) & size) {
+ return(page - size);
+ } else {
+ return(page + size);
+ }
+}
+
+/** Validate a given zip_free list. */
+struct CheckZipFree {
+ ulint i;
+ CheckZipFree(ulint i) : i (i) {}
+
+ void operator()(const buf_buddy_free_t* elem) const
+ {
+ ut_a(buf_buddy_stamp_is_free(elem));
+ ut_a(elem->stamp.size <= i);
+ }
+};
+
+#define BUF_BUDDY_LIST_VALIDATE(bp, i) \
+ UT_LIST_VALIDATE(list, buf_buddy_free_t, \
+ bp->zip_free[i], CheckZipFree(i))
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Debug function to validate that a buffer is indeed free, i.e. in the
+zip_free[].
+@return true if free */
+UNIV_INLINE
+bool
+buf_buddy_check_free(
+/*=================*/
+ buf_pool_t* buf_pool,/*!< in: buffer pool instance */
+ const buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of buf_pool->zip_free[] */
+{
+ const ulint size = BUF_BUDDY_LOW << i;
+
+ ut_ad(mutex_own(&buf_pool->zip_free_mutex));
+ ut_ad(!ut_align_offset(buf, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ buf_buddy_free_t* itr;
+
+ for (itr = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
+ itr && itr != buf;
+ itr = UT_LIST_GET_NEXT(list, itr)) {
+ }
+
+ return(itr == buf);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Checks if a buf is free, i.e. in the zip_free[].
+@retval BUF_BUDDY_STATE_FREE if fully free
+@retval BUF_BUDDY_STATE_USED if currently in use
+@retval BUF_BUDDY_STATE_PARTIALLY_USED if partially in use. */
+static __attribute__((warn_unused_result))
+buf_buddy_state_t
+buf_buddy_is_free(
+/*==============*/
+ buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of
+ buf_pool->zip_free[] */
+{
+#ifdef UNIV_DEBUG
+ const ulint size = BUF_BUDDY_LOW << i;
+ ut_ad(!ut_align_offset(buf, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+#endif /* UNIV_DEBUG */
+
+ /* We assume that all memory from buf_buddy_alloc()
+ is used for compressed page frames. */
+
+ /* We look inside the allocated objects returned by
+ buf_buddy_alloc() and assume that each block is a compressed
+ page that contains one of the following in space_id.
+ * BUF_BUDDY_STAMP_FREE if the block is in a zip_free list or
+ * BUF_BUDDY_STAMP_NONFREE if the block has been allocated but
+ not initialized yet or
+ * A valid space_id of a compressed tablespace
+
+ The call below attempts to read from free memory. The memory
+ is "owned" by the buddy allocator (and it has been allocated
+ from the buffer pool), so there is nothing wrong about this. */
+ if (!buf_buddy_stamp_is_free(buf)) {
+ return(BUF_BUDDY_STATE_USED);
+ }
+
+ /* A block may be free but a fragment of it may still be in use.
+ To guard against that we write the free block size, in terms of the
+ zip_free index, at the start of the stamped block. Note that we can
+ safely rely on this value only if the buf is free. */
+ ut_ad(buf->stamp.size <= i);
+ return(buf->stamp.size == i
+ ? BUF_BUDDY_STATE_FREE
+ : BUF_BUDDY_STATE_PARTIALLY_USED);
+}
+
+/**********************************************************************//**
+Add a block to the head of the appropriate buddy free list. */
+UNIV_INLINE
+void
+buf_buddy_add_to_free(
+/*==================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_buddy_free_t* buf, /*!< in,own: block to be freed */
+ ulint i) /*!< in: index of
+ buf_pool->zip_free[] */
+{
+ ut_ad(mutex_own(&buf_pool->zip_free_mutex));
+ ut_ad(buf_pool->zip_free[i].start != buf);
+
+ buf_buddy_stamp_free(buf, i);
+ UT_LIST_ADD_FIRST(list, buf_pool->zip_free[i], buf);
+ ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
+}
+
+/**********************************************************************//**
+Remove a block from the appropriate buddy free list. */
+UNIV_INLINE
+void
+buf_buddy_remove_from_free(
+/*=======================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_buddy_free_t* buf, /*!< in,own: block to be freed */
+ ulint i) /*!< in: index of
+ buf_pool->zip_free[] */
+{
+ ut_ad(mutex_own(&buf_pool->zip_free_mutex));
+ ut_ad(buf_buddy_check_free(buf_pool, buf, i));
+
+ UT_LIST_REMOVE(list, buf_pool->zip_free[i], buf);
+ buf_buddy_stamp_nonfree(buf, i);
+}
+
+/**********************************************************************//**
+Try to allocate a block from buf_pool->zip_free[].
+@return allocated block, or NULL if buf_pool->zip_free[] was empty */
+static
+buf_buddy_free_t*
+buf_buddy_alloc_zip(
+/*================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint i) /*!< in: index of buf_pool->zip_free[] */
+{
+ buf_buddy_free_t* buf;
+
+ ut_ad(mutex_own(&buf_pool->zip_free_mutex));
+ ut_a(i < BUF_BUDDY_SIZES);
+ ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
+
+ buf = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
+
+ if (buf) {
+ buf_buddy_remove_from_free(buf_pool, buf, i);
+ } else if (i + 1 < BUF_BUDDY_SIZES) {
+ /* Attempt to split. */
+ buf = buf_buddy_alloc_zip(buf_pool, i + 1);
+
+ if (buf) {
+ buf_buddy_free_t* buddy =
+ reinterpret_cast<buf_buddy_free_t*>(
+ buf->stamp.bytes
+ + (BUF_BUDDY_LOW << i));
+
+ ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
+ buf_buddy_add_to_free(buf_pool, buddy, i);
+ }
+ }
+
+ if (buf) {
+ /* Trash the page except for the BUF_BUDDY_STAMP_NONFREE stamp. */
+ UNIV_MEM_TRASH(buf, ~i, BUF_BUDDY_STAMP_OFFSET);
+ UNIV_MEM_TRASH(BUF_BUDDY_STAMP_OFFSET + 4
+ + buf->stamp.bytes, ~i,
+ (BUF_BUDDY_LOW << i)
+ - (BUF_BUDDY_STAMP_OFFSET + 4));
+ ut_ad(mach_read_from_4(buf->stamp.bytes
+ + BUF_BUDDY_STAMP_OFFSET)
+ == BUF_BUDDY_STAMP_NONFREE);
+ }
+
+ return(buf);
+}
+
+/**********************************************************************//**
+Deallocate a buffer frame of UNIV_PAGE_SIZE. */
+static
+void
+buf_buddy_block_free(
+/*=================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ void* buf) /*!< in: buffer frame to deallocate */
+{
+ const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf);
+ buf_page_t* bpage;
+ buf_block_t* block;
+
+ ut_ad(!mutex_own(&buf_pool->zip_mutex));
+ ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE));
+
+ mutex_enter(&buf_pool->zip_hash_mutex);
+
+ HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage,
+ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY
+ && bpage->in_zip_hash && !bpage->in_page_hash),
+ ((buf_block_t*) bpage)->frame == buf);
+ ut_a(bpage);
+ ut_a(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY);
+ ut_ad(!bpage->in_page_hash);
+ ut_ad(bpage->in_zip_hash);
+ ut_d(bpage->in_zip_hash = FALSE);
+ HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage);
+
+ mutex_exit(&buf_pool->zip_hash_mutex);
+
+ ut_d(memset(buf, 0, UNIV_PAGE_SIZE));
+ UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE);
+
+ block = (buf_block_t*) bpage;
+ mutex_enter(&block->mutex);
+ buf_LRU_block_free_non_file_page(block);
+ mutex_exit(&block->mutex);
+
+ ut_ad(buf_pool->buddy_n_frames > 0);
+ ut_d(buf_pool->buddy_n_frames--);
+}
+
+/**********************************************************************//**
+Allocate a buffer block to the buddy allocator. */
+static
+void
+buf_buddy_block_register(
+/*=====================*/
+ buf_block_t* block) /*!< in: buffer frame to allocate */
+{
+ buf_pool_t* buf_pool = buf_pool_from_block(block);
+ const ulint fold = BUF_POOL_ZIP_FOLD(block);
+ ut_ad(!mutex_own(&buf_pool->zip_mutex));
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE);
+
+ buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+ ut_a(block->frame);
+ ut_a(!ut_align_offset(block->frame, UNIV_PAGE_SIZE));
+
+ ut_ad(!block->page.in_page_hash);
+ ut_ad(!block->page.in_zip_hash);
+ ut_d(block->page.in_zip_hash = TRUE);
+
+ mutex_enter(&buf_pool->zip_hash_mutex);
+ HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page);
+ mutex_exit(&buf_pool->zip_hash_mutex);
+
+ ut_d(buf_pool->buddy_n_frames++);
+}
+
+/**********************************************************************//**
+Allocate a block from a bigger object.
+@return allocated block */
+static
+void*
+buf_buddy_alloc_from(
+/*=================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ void* buf, /*!< in: a block that is free to use */
+ ulint i, /*!< in: index of
+ buf_pool->zip_free[] */
+ ulint j) /*!< in: size of buf as an index
+ of buf_pool->zip_free[] */
+{
+ ulint offs = BUF_BUDDY_LOW << j;
+ ut_ad(mutex_own(&buf_pool->zip_free_mutex));
+ ut_ad(j <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ ut_ad(j >= i);
+ ut_ad(!ut_align_offset(buf, offs));
+
+ /* Add the unused parts of the block to the free lists. */
+ while (j > i) {
+ buf_buddy_free_t* zip_buf;
+
+ offs >>= 1;
+ j--;
+
+ zip_buf = reinterpret_cast<buf_buddy_free_t*>(
+ reinterpret_cast<byte*>(buf) + offs);
+ buf_buddy_add_to_free(buf_pool, zip_buf, j);
+ }
+
+ buf_buddy_stamp_nonfree(reinterpret_cast<buf_buddy_free_t*>(buf), i);
+ return(buf);
+}
+
+/**********************************************************************//**
+Allocate a block. The thread calling this function must hold
+buf_pool->LRU_list_mutex and must not hold buf_pool->zip_mutex or any
+block->mutex. The buf_pool->LRU_list_mutex may be released and reacquired.
+@return allocated block, never NULL */
+UNIV_INTERN
+void*
+buf_buddy_alloc_low(
+/*================*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
+ ulint i, /*!< in: index of buf_pool->zip_free[],
+ or BUF_BUDDY_SIZES */
+ ibool* lru) /*!< in: pointer to a variable that
+ will be assigned TRUE if storage was
+ allocated from the LRU list and
+ buf_pool->LRU_list_mutex was
+ temporarily released */
+{
+ buf_block_t* block;
+
+ ut_ad(lru);
+ ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+ ut_ad(!mutex_own(&buf_pool->zip_mutex));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ if (i < BUF_BUDDY_SIZES) {
+ /* Try to allocate from the buddy system. */
+ mutex_enter(&buf_pool->zip_free_mutex);
+ block = (buf_block_t*) buf_buddy_alloc_zip(buf_pool, i);
+
+ if (block) {
+ goto func_exit;
+ }
+ mutex_exit(&buf_pool->zip_free_mutex);
+ }
+
+ /* Try allocating from the buf_pool->free list. */
+ block = buf_LRU_get_free_only(buf_pool);
+
+ if (block) {
+
+ goto alloc_big;
+ }
+
+ /* Try replacing an uncompressed page in the buffer pool. */
+ mutex_exit(&buf_pool->LRU_list_mutex);
+ block = buf_LRU_get_free_block(buf_pool);
+ *lru = TRUE;
+ mutex_enter(&buf_pool->LRU_list_mutex);
+
+alloc_big:
+ buf_buddy_block_register(block);
+
+ mutex_enter(&buf_pool->zip_free_mutex);
+ block = (buf_block_t*) buf_buddy_alloc_from(
+ buf_pool, block->frame, i, BUF_BUDDY_SIZES);
+
+func_exit:
+ buf_pool->buddy_stat[i].used++;
+ mutex_exit(&buf_pool->zip_free_mutex);
+
+ return(block);
+}
+
+/**********************************************************************//**
+Try to relocate a block. The caller must hold zip_free_mutex, and this
+function will release and lock it again.
+@return true if relocated */
+static
+bool
+buf_buddy_relocate(
+/*===============*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ void* src, /*!< in: block to relocate */
+ void* dst, /*!< in: free block to relocate to */
+ ulint i) /*!< in: index of
+ buf_pool->zip_free[] */
+{
+ buf_page_t* bpage;
+ const ulint size = BUF_BUDDY_LOW << i;
+ ib_mutex_t* mutex;
+ ulint space;
+ ulint offset;
+ prio_rw_lock_t* hash_lock;
+
+ ut_ad(mutex_own(&buf_pool->zip_free_mutex));
+ ut_ad(!mutex_own(&buf_pool->zip_mutex));
+ ut_ad(!ut_align_offset(src, size));
+ ut_ad(!ut_align_offset(dst, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ UNIV_MEM_ASSERT_W(dst, size);
+
+ space = mach_read_from_4((const byte*) src
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ offset = mach_read_from_4((const byte*) src
+ + FIL_PAGE_OFFSET);
+
+ /* Suppress Valgrind warnings about conditional jump
+ on uninitialized value. */
+ UNIV_MEM_VALID(&space, sizeof space);
+ UNIV_MEM_VALID(&offset, sizeof offset);
+
+ ut_ad(space != BUF_BUDDY_STAMP_FREE);
+
+ mutex_exit(&buf_pool->zip_free_mutex);
+ /* Lock page hash to prevent a relocation for the target page */
+ bpage = buf_page_hash_get_s_locked(buf_pool, space, offset, &hash_lock);
+
+ if (!bpage || bpage->zip.data != src) {
+ /* The block has probably been freshly
+ allocated by buf_LRU_get_free_block() but not
+ added to buf_pool->page_hash yet. Obviously,
+ it cannot be relocated. */
+
+ if (bpage) {
+ rw_lock_s_unlock(hash_lock);
+ }
+ mutex_enter(&buf_pool->zip_free_mutex);
+ return(false);
+ }
+
+ if (page_zip_get_size(&bpage->zip) != size) {
+ /* The block is of different size. We would
+ have to relocate all blocks covered by src.
+ For the sake of simplicity, give up. */
+ ut_ad(page_zip_get_size(&bpage->zip) < size);
+
+ rw_lock_s_unlock(hash_lock);
+ mutex_enter(&buf_pool->zip_free_mutex);
+ return(false);
+ }
+
+ /* The block must have been allocated, but it may
+ contain uninitialized data. */
+ UNIV_MEM_ASSERT_W(src, size);
+
+ mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(mutex);
+
+ rw_lock_s_unlock(hash_lock);
+
+ mutex_enter(&buf_pool->zip_free_mutex);
+
+ if (buf_page_can_relocate(bpage)) {
+ /* Relocate the compressed page. */
+ ullint usec = ut_time_us(NULL);
+ ut_a(bpage->zip.data == src);
+ memcpy(dst, src, size);
+ bpage->zip.data = (page_zip_t*) dst;
+ mutex_exit(mutex);
+ buf_buddy_mem_invalid(
+ reinterpret_cast<buf_buddy_free_t*>(src), i);
+
+ buf_buddy_stat_t* buddy_stat = &buf_pool->buddy_stat[i];
+ buddy_stat->relocated++;
+ buddy_stat->relocated_usec += ut_time_us(NULL) - usec;
+ return(true);
+ }
+
+ mutex_exit(mutex);
+ return(false);
+}
+
+/**********************************************************************//**
+Deallocate a block. */
+UNIV_INTERN
+void
+buf_buddy_free_low(
+/*===============*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ void* buf, /*!< in: block to be freed, must not be
+ pointed to by the buffer pool */
+ ulint i) /*!< in: index of buf_pool->zip_free[],
+ or BUF_BUDDY_SIZES */
+{
+ buf_buddy_free_t* buddy;
+
+ ut_ad(!mutex_own(&buf_pool->zip_mutex));
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ mutex_enter(&buf_pool->zip_free_mutex);
+
+ ut_ad(buf_pool->buddy_stat[i].used > 0);
+ buf_pool->buddy_stat[i].used--;
+recombine:
+ UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i);
+
+ if (i == BUF_BUDDY_SIZES) {
+ mutex_exit(&buf_pool->zip_free_mutex);
+ buf_buddy_block_free(buf_pool, buf);
+ return;
+ }
+
+ ut_ad(i < BUF_BUDDY_SIZES);
+ ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
+ ut_ad(!buf_pool_contains_zip(buf_pool, buf));
+
+ /* Do not recombine blocks if there are few free blocks.
+ We may waste up to 15360*max_len bytes to free blocks
+ (1024 + 2048 + 4096 + 8192 = 15360) */
+ if (UT_LIST_GET_LEN(buf_pool->zip_free[i]) < 16) {
+ goto func_exit;
+ }
+
+ /* Try to combine adjacent blocks. */
+ buddy = reinterpret_cast<buf_buddy_free_t*>(
+ buf_buddy_get(reinterpret_cast<byte*>(buf),
+ BUF_BUDDY_LOW << i));
+
+ switch (buf_buddy_is_free(buddy, i)) {
+ case BUF_BUDDY_STATE_FREE:
+ /* The buddy is free: recombine */
+ buf_buddy_remove_from_free(buf_pool, buddy, i);
+buddy_is_free:
+ ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
+ i++;
+ buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
+
+ goto recombine;
+
+ case BUF_BUDDY_STATE_USED:
+ ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
+
+ /* The buddy is not free. Is there a free block of
+ this size? */
+ if (buf_buddy_free_t* zip_buf =
+ UT_LIST_GET_FIRST(buf_pool->zip_free[i])) {
+
+ /* Remove the block from the free list, because
+ a successful buf_buddy_relocate() will overwrite
+ zip_free->list. */
+ buf_buddy_remove_from_free(buf_pool, zip_buf, i);
+
+ /* Try to relocate the buddy of buf to the free
+ block. */
+ if (buf_buddy_relocate(buf_pool, buddy, zip_buf, i)) {
+
+ goto buddy_is_free;
+ }
+
+ buf_buddy_add_to_free(buf_pool, zip_buf, i);
+ }
+
+ break;
+ case BUF_BUDDY_STATE_PARTIALLY_USED:
+ /* Some sub-blocks in the buddy are still in use.
+ Relocation will fail. No need to try. */
+ break;
+ }
+
+func_exit:
+ /* Free the block to the buddy list. */
+ buf_buddy_add_to_free(buf_pool,
+ reinterpret_cast<buf_buddy_free_t*>(buf),
+ i);
+ mutex_exit(&buf_pool->zip_free_mutex);
+}
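Aside (not part of the patch): the key behavioural change in the new buf0buddy.cc is that the freeness of a buddy is now decided by a stamp written into the block itself (BUF_BUDDY_STAMP_FREE at BUF_BUDDY_STAMP_OFFSET, plus the block's zip_free size class) rather than by scanning zip_free[i] as the deleted buf0buddy.c did. The sketch below is a minimal illustration of that idea only; the offset, the magic value and the helper names are simplified stand-ins, not the InnoDB definitions.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define STAMP_OFFSET	34		/* stand-in for BUF_BUDDY_STAMP_OFFSET */
#define STAMP_FREE	0xFFFFFF00u	/* stand-in for BUF_BUDDY_STAMP_FREE   */

/* Mark a block free: write the free magic and record the block's size
   class (like buf->stamp.size) so a split buddy is not mistaken for a
   fully free one. */
static void stamp_free(unsigned char *block, uint32_t size_class)
{
	uint32_t magic = STAMP_FREE;

	memcpy(block + STAMP_OFFSET, &magic, 4);
	memcpy(block, &size_class, 4);
}

/* 0 = used, 1 = fully free, 2 = partially used (only a smaller fragment
   of the buddy is free) -- the three cases of buf_buddy_is_free(). */
static int buddy_state(const unsigned char *block, uint32_t size_class)
{
	uint32_t magic, stored;

	memcpy(&magic, block + STAMP_OFFSET, 4);
	if (magic != STAMP_FREE) {
		return 0;
	}
	memcpy(&stored, block, 4);
	return stored == size_class ? 1 : 2;
}

int main(void)
{
	unsigned char block[1024] = {0};

	printf("in use    : %d\n", buddy_state(block, 3));	/* 0 */
	stamp_free(block, 2);		/* only a smaller fragment is free */
	printf("partial   : %d\n", buddy_state(block, 3));	/* 2 */
	stamp_free(block, 3);		/* the whole buddy is free         */
	printf("fully free: %d\n", buddy_state(block, 3));	/* 1 */
	return 0;
}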
diff --git a/storage/xtradb/buf/buf0buf.c b/storage/xtradb/buf/buf0buf.cc
index f06fd4abfb1..d4b170028d9 100644
--- a/storage/xtradb/buf/buf0buf.c
+++ b/storage/xtradb/buf/buf0buf.cc
@@ -18,13 +18,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/**************************************************//**
-@file buf/buf0buf.c
+@file buf/buf0buf.cc
The database buffer buf_pool
Created 11/5/1995 Heikki Tuuri
@@ -51,6 +51,8 @@ Created 11/5/1995 Heikki Tuuri
#include "dict0dict.h"
#include "log0recv.h"
#include "page0zip.h"
+#include "srv0mon.h"
+#include "buf0checksum.h"
#include "trx0trx.h"
#include "srv0start.h"
@@ -69,7 +71,8 @@ _increment_page_get_statistics(buf_block_t* block, trx_t* trx)
ut_ad(trx && trx->take_stats);
if (!trx->distinct_page_access_hash) {
- trx->distinct_page_access_hash = mem_alloc(DPAH_SIZE);
+ trx->distinct_page_access_hash
+ = static_cast<byte *>(mem_alloc(DPAH_SIZE));
memset(trx->distinct_page_access_hash, 0, DPAH_SIZE);
}
@@ -116,24 +119,9 @@ in the file along with the file page, resides in the control block.
Buffer pool struct
------------------
-The buffer buf_pool contains a single mutex which protects all the
+The buffer buf_pool contains several mutexes which protect all the
control data structures of the buf_pool. The content of a buffer frame is
protected by a separate read-write lock in its control block, though.
-These locks can be locked and unlocked without owning the buf_pool->mutex.
-The OS events in the buf_pool struct can be waited for without owning the
-buf_pool->mutex.
-
-The buf_pool->mutex is a hot-spot in main memory, causing a lot of
-memory bus traffic on multiprocessor systems when processors
-alternately access the mutex. On our Pentium, the mutex is accessed
-maybe every 10 microseconds. We gave up the solution to have mutexes
-for each control block, for instance, because it seemed to be
-complicated.
-
-A solution to reduce mutex contention of the buf_pool->mutex is to
-create a separate mutex for the page hash table. On Pentium,
-accessing the hash table takes 2 microseconds, about half
-of the total buf_pool->mutex hold time.
Control blocks
--------------
@@ -217,7 +205,7 @@ uncompressed pages are accessible via buf_block_t objects that are
reachable via buf_pool->chunks[].
The chains of free memory blocks (buf_pool->zip_free[]) are used by
-the buddy allocator (buf0buddy.c) to keep track of currently unused
+the buddy allocator (buf0buddy.cc) to keep track of currently unused
memory blocks of size sizeof(buf_page_t)..UNIV_PAGE_SIZE / 2. These
blocks are inside the UNIV_PAGE_SIZE-sized memory blocks of type
BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
@@ -298,7 +286,6 @@ UNIV_INTERN ibool buf_debug_prints = FALSE;
#ifdef UNIV_PFS_RWLOCK
/* Keys to register buffer block related rwlocks and mutexes with
performance schema */
-UNIV_INTERN mysql_pfs_key_t buf_pool_page_hash_key;
UNIV_INTERN mysql_pfs_key_t buf_block_lock_key;
# ifdef UNIV_SYNC_DEBUG
UNIV_INTERN mysql_pfs_key_t buf_block_debug_latch_key;
@@ -309,6 +296,7 @@ UNIV_INTERN mysql_pfs_key_t buf_block_debug_latch_key;
UNIV_INTERN mysql_pfs_key_t buffer_block_mutex_key;
UNIV_INTERN mysql_pfs_key_t buf_pool_mutex_key;
UNIV_INTERN mysql_pfs_key_t buf_pool_zip_mutex_key;
+UNIV_INTERN mysql_pfs_key_t buf_pool_flush_state_mutex_key;
UNIV_INTERN mysql_pfs_key_t buf_pool_LRU_list_mutex_key;
UNIV_INTERN mysql_pfs_key_t buf_pool_free_list_mutex_key;
UNIV_INTERN mysql_pfs_key_t buf_pool_zip_free_mutex_key;
@@ -334,19 +322,26 @@ be effective only if PFS_GROUP_BUFFER_SYNC is defined. */
# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
#endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
+/** Macro to determine whether the read or write counter is used depending
+on the io_type */
+#define MONITOR_RW_COUNTER(io_type, counter) \
+ ((io_type == BUF_IO_READ) \
+ ? (counter##_READ) \
+ : (counter##_WRITTEN))
+
/********************************************************************//**
Gets the smallest oldest_modification lsn for any page in the pool. Returns
zero if all modified pages have been flushed to disk.
@return oldest modification in pool, zero if none */
UNIV_INTERN
-ib_uint64_t
+lsn_t
buf_pool_get_oldest_modification(void)
/*==================================*/
{
ulint i;
buf_page_t* bpage;
- ib_uint64_t lsn = 0;
- ib_uint64_t oldest_lsn = 0;
+ lsn_t lsn = 0;
+ lsn_t oldest_lsn = 0;
/* When we traverse all the flush lists we don't want another
thread to add a dirty page to any flush list. */
@@ -403,6 +398,7 @@ buf_get_total_list_len(
buf_pool_t* buf_pool;
buf_pool = buf_pool_from_array(i);
+
*LRU_len += UT_LIST_GET_LEN(buf_pool->LRU);
*free_len += UT_LIST_GET_LEN(buf_pool->free);
*flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list);
@@ -418,11 +414,10 @@ buf_get_total_list_size_in_bytes(
buf_pools_list_size_t* buf_pools_list_size) /*!< out: list sizes
in all buffer pools */
{
- ulint i;
ut_ad(buf_pools_list_size);
memset(buf_pools_list_size, 0, sizeof(*buf_pools_list_size));
- for (i = 0; i < srv_buf_pool_instances; i++) {
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
buf_pool_t* buf_pool;
buf_pool = buf_pool_from_array(i);
@@ -500,97 +495,24 @@ buf_block_alloc(
#endif /* !UNIV_HOTBACKUP */
/********************************************************************//**
-Calculates a page checksum which is stored to the page when it is written
-to a file. Note that we must be careful to calculate the same value on
-32-bit and 64-bit architectures.
-@return checksum */
-UNIV_INTERN
-ulint
-buf_calc_page_new_checksum(
-/*=======================*/
- const byte* page) /*!< in: buffer page */
-{
- ulint checksum;
-
- /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
- ..._ARCH_LOG_NO, are written outside the buffer pool to the first
- pages of data files, we have to skip them in the page checksum
- calculation.
- We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
- checksum is stored, and also the last 8 bytes of page because
- there we store the old formula checksum. */
-
- checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
- FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
- + ut_fold_binary(page + FIL_PAGE_DATA,
- UNIV_PAGE_SIZE - FIL_PAGE_DATA
- - FIL_PAGE_END_LSN_OLD_CHKSUM);
- checksum = checksum & 0xFFFFFFFFUL;
-
- return(checksum);
-}
-
-UNIV_INTERN
-ulint
-buf_calc_page_new_checksum_32(
-/*==========================*/
- const byte* page) /*!< in: buffer page */
-{
- ulint checksum;
-
- checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
- FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
- + ut_fold_binary(page + FIL_PAGE_DATA,
- FIL_PAGE_DATA_ALIGN_32 - FIL_PAGE_DATA)
- + ut_fold_binary_32(page + FIL_PAGE_DATA_ALIGN_32,
- UNIV_PAGE_SIZE - FIL_PAGE_DATA_ALIGN_32
- - FIL_PAGE_END_LSN_OLD_CHKSUM);
-
- checksum = checksum & 0xFFFFFFFFUL;
-
- return(checksum);
-}
-
-/********************************************************************//**
-In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
-looked at the first few bytes of the page. This calculates that old
-checksum.
-NOTE: we must first store the new formula checksum to
-FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
-because this takes that field as an input!
-@return checksum */
-UNIV_INTERN
-ulint
-buf_calc_page_old_checksum(
-/*=======================*/
- const byte* page) /*!< in: buffer page */
-{
- ulint checksum;
-
- checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
-
- checksum = checksum & 0xFFFFFFFFUL;
-
- return(checksum);
-}
-
-/********************************************************************//**
Checks if a page is corrupt.
@return TRUE if corrupted */
UNIV_INTERN
ibool
buf_page_is_corrupted(
/*==================*/
- ibool check_lsn, /*!< in: TRUE if we need to check
+ bool check_lsn, /*!< in: true if we need to check
and complain about the LSN */
const byte* read_buf, /*!< in: a database page */
ulint zip_size) /*!< in: size of compressed page;
0 for uncompressed pages */
{
- ulint checksum_field;
- ulint old_checksum_field;
+ ulint checksum_field1;
+ ulint checksum_field2;
+ ibool crc32_inited = FALSE;
+ ib_uint32_t crc32 = ULINT32_UNDEFINED;
- if (UNIV_LIKELY(!zip_size)
+ if (!zip_size
&& memcmp(read_buf + FIL_PAGE_LSN + 4,
read_buf + UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
@@ -603,93 +525,205 @@ buf_page_is_corrupted(
#ifndef UNIV_HOTBACKUP
if (check_lsn && recv_lsn_checks_on) {
- ib_uint64_t current_lsn;
+ lsn_t current_lsn;
+
+ /* Since we are going to reset the page LSN during the import
+ phase it makes no sense to spam the log with error messages. */
if (log_peek_lsn(&current_lsn)
- && UNIV_UNLIKELY
- (current_lsn
- < mach_read_from_8(read_buf + FIL_PAGE_LSN))) {
+ && current_lsn
+ < mach_read_from_8(read_buf + FIL_PAGE_LSN)) {
ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB: Error: page %lu log sequence number"
- " %llu\n"
+ " InnoDB: Error: page %lu log sequence number"
+ " " LSN_PF "\n"
"InnoDB: is in the future! Current system "
- "log sequence number %llu.\n"
+ "log sequence number " LSN_PF ".\n"
"InnoDB: Your database may be corrupt or "
"you may have copied the InnoDB\n"
"InnoDB: tablespace but not the InnoDB "
"log files. See\n"
- "InnoDB: " REFMAN "forcing-innodb-recovery.html\n"
+ "InnoDB: " REFMAN
+ "forcing-innodb-recovery.html\n"
"InnoDB: for more information.\n",
- (ulong) mach_read_from_4(read_buf
- + FIL_PAGE_OFFSET),
- mach_read_from_8(read_buf + FIL_PAGE_LSN),
+ (ulong) mach_read_from_4(
+ read_buf + FIL_PAGE_OFFSET),
+ (lsn_t) mach_read_from_8(
+ read_buf + FIL_PAGE_LSN),
current_lsn);
}
}
#endif
- /* If we use checksums validation, make additional check before
- returning TRUE to ensure that the checksum is not equal to
- BUF_NO_CHECKSUM_MAGIC which might be stored by InnoDB with checksums
- disabled. Otherwise, skip checksum calculation and return FALSE */
+ /* Check whether the checksum fields have correct values */
+
+ if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_NONE) {
+ return(FALSE);
+ }
- if (UNIV_LIKELY(srv_use_checksums)) {
- checksum_field = mach_read_from_4(read_buf
- + FIL_PAGE_SPACE_OR_CHKSUM);
+ if (zip_size) {
+ return(!page_zip_verify_checksum(read_buf, zip_size));
+ }
- if (UNIV_UNLIKELY(zip_size)) {
- return(checksum_field != BUF_NO_CHECKSUM_MAGIC
- && checksum_field
- != page_zip_calc_checksum(read_buf, zip_size));
- }
+ checksum_field1 = mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
- old_checksum_field = mach_read_from_4(
- read_buf + UNIV_PAGE_SIZE
- - FIL_PAGE_END_LSN_OLD_CHKSUM);
+ checksum_field2 = mach_read_from_4(
+ read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM);
- /* There are 2 valid formulas for old_checksum_field:
+ /* declare empty pages non-corrupted */
+ if (checksum_field1 == 0 && checksum_field2 == 0
+ && mach_read_from_4(read_buf + FIL_PAGE_LSN) == 0) {
+ /* make sure that the page is really empty */
+ ut_d(for (ulint i = 0; i < UNIV_PAGE_SIZE; i++) {
+ ut_a(read_buf[i] == 0); });
+
+ return(FALSE);
+ }
+
+ switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+
+ crc32 = buf_calc_page_crc32(read_buf);
+
+ return(checksum_field1 != crc32 || checksum_field2 != crc32);
+
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+
+ return(checksum_field1
+ != buf_calc_page_new_checksum(read_buf)
+ || checksum_field2
+ != buf_calc_page_old_checksum(read_buf));
+
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+
+ return(checksum_field1 != BUF_NO_CHECKSUM_MAGIC
+ || checksum_field2 != BUF_NO_CHECKSUM_MAGIC);
+
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ /* There are 3 valid formulas for
+ checksum_field2 (old checksum field):
1. Very old versions of InnoDB only stored 8 byte lsn to the
start and the end of the page.
- 2. Newer InnoDB versions store the old formula checksum
- there. */
+ 2. InnoDB versions before MySQL 5.6.3 store the old formula
+ checksum (buf_calc_page_old_checksum()).
- if (old_checksum_field != mach_read_from_4(read_buf
- + FIL_PAGE_LSN)
- && old_checksum_field != BUF_NO_CHECKSUM_MAGIC
- && old_checksum_field
- != buf_calc_page_old_checksum(read_buf)) {
+ 3. InnoDB versions 5.6.3 and newer with
+ innodb_checksum_algorithm=strict_crc32|crc32 store CRC32. */
- return(TRUE);
+ /* since innodb_checksum_algorithm is not strict_* allow
+ any of the algos to match for the old field */
+
+ if (checksum_field2
+ != mach_read_from_4(read_buf + FIL_PAGE_LSN)
+ && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) {
+
+ /* The checksum does not match any of the
+ fast-to-check values. First check the selected algorithm
+ for writing checksums because we assume that the
+ chance of it matching is higher. */
+
+ if (srv_checksum_algorithm
+ == SRV_CHECKSUM_ALGORITHM_CRC32) {
+
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = TRUE;
+
+ if (checksum_field2 != crc32
+ && checksum_field2
+ != buf_calc_page_old_checksum(read_buf)) {
+
+ return(TRUE);
+ }
+ } else {
+ ut_ad(srv_checksum_algorithm
+ == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+ if (checksum_field2
+ != buf_calc_page_old_checksum(read_buf)) {
+
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = TRUE;
+
+ if (checksum_field2 != crc32) {
+ return(TRUE);
+ }
+ }
+ }
}
+ /* old field is fine, check the new field */
+
/* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
(always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
- if (!srv_fast_checksum
- && checksum_field != 0
- && checksum_field != BUF_NO_CHECKSUM_MAGIC
- && checksum_field
- != buf_calc_page_new_checksum(read_buf)) {
+ if (checksum_field1 != 0
+ && checksum_field1 != BUF_NO_CHECKSUM_MAGIC) {
- return(TRUE);
+ /* The checksum does not match any of the
+ fast-to-check values. First check the selected algorithm
+ for writing checksums because we assume that the
+ chance of it matching is higher. */
+
+ if (srv_checksum_algorithm
+ == SRV_CHECKSUM_ALGORITHM_CRC32) {
+
+ if (!crc32_inited) {
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = TRUE;
+ }
+
+ if (checksum_field1 != crc32
+ && checksum_field1
+ != buf_calc_page_new_checksum(read_buf)) {
+
+ return(TRUE);
+ }
+ } else {
+ ut_ad(srv_checksum_algorithm
+ == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+ if (checksum_field1
+ != buf_calc_page_new_checksum(read_buf)) {
+
+ if (!crc32_inited) {
+ crc32 = buf_calc_page_crc32(
+ read_buf);
+ crc32_inited = TRUE;
+ }
+
+ if (checksum_field1 != crc32) {
+ return(TRUE);
+ }
+ }
+ }
}
- if (srv_fast_checksum
- && checksum_field != 0
- && checksum_field != BUF_NO_CHECKSUM_MAGIC
- && checksum_field
- != buf_calc_page_new_checksum_32(read_buf)
- && checksum_field
- != buf_calc_page_new_checksum(read_buf)) {
+ /* If CRC32 is stored in at least one of the fields, then the
+ other field must also be CRC32 */
+ if (crc32_inited
+ && ((checksum_field1 == crc32
+ && checksum_field2 != crc32)
+ || (checksum_field1 != crc32
+ && checksum_field2 == crc32))) {
return(TRUE);
}
+
+ break;
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ /* should have returned FALSE earlier */
+ ut_error;
+ /* No default case, so the compiler will emit a warning if a new
+ enum value is added and not handled here. */
}
+ DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", return(TRUE); );
+
return(FALSE);
}
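/*
 * A minimal illustrative sketch (not part of the diff above) of the
 * acceptance rule the hunk implements for the non-strict crc32/innodb
 * settings: each checksum field may match either the CRC32 or the
 * legacy InnoDB formula, but if exactly one of the two fields matches
 * CRC32 the page is treated as corrupted.  The LSN/magic fallbacks and
 * the lazy CRC computation of the real code are omitted here, and
 * page_crc32()/page_innodb_new()/page_innodb_old() are placeholders
 * for buf_calc_page_crc32() and friends, not the actual InnoDB API.
 */
#include <stdbool.h>
#include <stdint.h>

uint32_t page_crc32(const unsigned char* page);      /* placeholder */
uint32_t page_innodb_new(const unsigned char* page); /* placeholder */
uint32_t page_innodb_old(const unsigned char* page); /* placeholder */

static bool
page_checksum_mismatch(const unsigned char* page,
                       uint32_t field1, uint32_t field2)
{
	uint32_t crc = page_crc32(page);

	bool f1_ok = (field1 == crc) || (field1 == page_innodb_new(page));
	bool f2_ok = (field2 == crc) || (field2 == page_innodb_old(page));

	if (!f1_ok || !f2_ok) {
		return true;	/* neither accepted formula matches */
	}

	/* If CRC32 was stored in one field it must be stored in both. */
	return (field1 == crc) != (field2 == crc);
}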
@@ -710,10 +744,14 @@ buf_page_print(
#ifndef UNIV_HOTBACKUP
dict_index_t* index;
#endif /* !UNIV_HOTBACKUP */
- ulint checksum;
- ulint checksum_32;
- ulint old_checksum;
- ulint size = zip_size;
+ ulint size = zip_size;
+
+ if (!read_buf) {
+ fprintf(stderr,
+ " InnoDB: Not dumping page as (in memory) pointer "
+ "is NULL\n");
+ return;
+ }
if (!size) {
size = UNIV_PAGE_SIZE;
@@ -722,7 +760,7 @@ buf_page_print(
if (!(flags & BUF_PAGE_PRINT_NO_FULL)) {
ut_print_timestamp(stderr);
fprintf(stderr,
- " InnoDB: Page dump in ascii and hex (%lu bytes):\n",
+ " InnoDB: Page dump in ascii and hex (%lu bytes):\n",
(ulong) size);
ut_print_buf(stderr, read_buf, size);
fputs("\nInnoDB: End of page dump\n", stderr);
@@ -730,104 +768,80 @@ buf_page_print(
if (zip_size) {
/* Print compressed page. */
-
- switch (fil_page_get_type(read_buf)) {
- case FIL_PAGE_TYPE_ZBLOB:
- case FIL_PAGE_TYPE_ZBLOB2:
- checksum = srv_use_checksums
- ? page_zip_calc_checksum(read_buf, zip_size)
- : BUF_NO_CHECKSUM_MAGIC;
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: Compressed BLOB page"
- " checksum %lu, stored %lu\n"
- "InnoDB: Page lsn %lu %lu\n"
- "InnoDB: Page number (if stored"
- " to page already) %lu,\n"
- "InnoDB: space id (if stored"
- " to page already) %lu\n",
- (ulong) checksum,
- (ulong) mach_read_from_4(
- read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
- (ulong) mach_read_from_4(
- read_buf + FIL_PAGE_LSN),
- (ulong) mach_read_from_4(
- read_buf + (FIL_PAGE_LSN + 4)),
- (ulong) mach_read_from_4(
- read_buf + FIL_PAGE_OFFSET),
- (ulong) mach_read_from_4(
- read_buf
- + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
- return;
- default:
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: unknown page type %lu,"
- " assuming FIL_PAGE_INDEX\n",
- fil_page_get_type(read_buf));
- /* fall through */
- case FIL_PAGE_INDEX:
- checksum = srv_use_checksums
- ? page_zip_calc_checksum(read_buf, zip_size)
- : BUF_NO_CHECKSUM_MAGIC;
-
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: Compressed page checksum %lu,"
- " stored %lu\n"
- "InnoDB: Page lsn %lu %lu\n"
- "InnoDB: Page number (if stored"
- " to page already) %lu,\n"
- "InnoDB: space id (if stored"
- " to page already) %lu\n",
- (ulong) checksum,
- (ulong) mach_read_from_4(
- read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
- (ulong) mach_read_from_4(
- read_buf + FIL_PAGE_LSN),
- (ulong) mach_read_from_4(
- read_buf + (FIL_PAGE_LSN + 4)),
- (ulong) mach_read_from_4(
- read_buf + FIL_PAGE_OFFSET),
- (ulong) mach_read_from_4(
- read_buf
- + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
- return;
- case FIL_PAGE_TYPE_XDES:
- /* This is an uncompressed page. */
- break;
- }
- }
-
- checksum = srv_use_checksums
- ? buf_calc_page_new_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
- checksum_32 = srv_use_checksums
- ? buf_calc_page_new_checksum_32(read_buf) : BUF_NO_CHECKSUM_MAGIC;
- old_checksum = srv_use_checksums
- ? buf_calc_page_old_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
-
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: Page checksum %lu (32bit_calc: %lu), prior-to-4.0.14-form"
- " checksum %lu\n"
- "InnoDB: stored checksum %lu, prior-to-4.0.14-form"
- " stored checksum %lu\n"
- "InnoDB: Page lsn %lu %lu, low 4 bytes of lsn"
- " at page end %lu\n"
- "InnoDB: Page number (if stored to page already) %lu,\n"
- "InnoDB: space id (if created with >= MySQL-4.1.1"
- " and stored already) %lu\n",
- (ulong) checksum, (ulong) checksum_32, (ulong) old_checksum,
- (ulong) mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
- (ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Compressed page type (" ULINTPF "); "
+ "stored checksum in field1 " ULINTPF "; "
+ "calculated checksums for field1: "
+ "%s " ULINTPF ", "
+ "%s " ULINTPF ", "
+ "%s " ULINTPF "; "
+ "page LSN " LSN_PF "; "
+ "page number (if stored to page already) " ULINTPF "; "
+ "space id (if stored to page already) " ULINTPF "\n",
+ fil_page_get_type(read_buf),
+ mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+ buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_CRC32),
+ page_zip_calc_checksum(read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_CRC32),
+ buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_INNODB),
+ page_zip_calc_checksum(read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_INNODB),
+ buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_NONE),
+ page_zip_calc_checksum(read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_NONE),
+ mach_read_from_8(read_buf + FIL_PAGE_LSN),
+ mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+ mach_read_from_4(read_buf
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: uncompressed page, "
+ "stored checksum in field1 " ULINTPF ", "
+ "calculated checksums for field1: "
+ "%s " UINT32PF ", "
+ "%s " ULINTPF ", "
+ "%s " ULINTPF ", "
+
+ "stored checksum in field2 " ULINTPF ", "
+ "calculated checksums for field2: "
+ "%s " UINT32PF ", "
+ "%s " ULINTPF ", "
+ "%s " ULINTPF ", "
+
+ "page LSN " ULINTPF " " ULINTPF ", "
+ "low 4 bytes of LSN at page end " ULINTPF ", "
+ "page number (if stored to page already) " ULINTPF ", "
+ "space id (if created with >= MySQL-4.1.1 "
+ "and stored already) %lu\n",
+ mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32),
+ buf_calc_page_crc32(read_buf),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB),
+ buf_calc_page_new_checksum(read_buf),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE),
+ BUF_NO_CHECKSUM_MAGIC,
+
+ mach_read_from_4(read_buf + UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN_OLD_CHKSUM),
- (ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN),
- (ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
- (ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32),
+ buf_calc_page_crc32(read_buf),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB),
+ buf_calc_page_old_checksum(read_buf),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE),
+ BUF_NO_CHECKSUM_MAGIC,
+
+ mach_read_from_4(read_buf + FIL_PAGE_LSN),
+ mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
+ mach_read_from_4(read_buf + UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
- (ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
- (ulong) mach_read_from_4(read_buf
+ mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+ mach_read_from_4(read_buf
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+ }
#ifndef UNIV_HOTBACKUP
if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE)
@@ -929,7 +943,7 @@ pfs_register_buffer_block(
PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER);
for (i = 0; i < num_to_register; i++) {
- mutex_t* mutex;
+ ib_mutex_t* mutex;
rw_lock_t* rwlock;
# ifdef UNIV_PFS_MUTEX
@@ -972,7 +986,7 @@ buf_block_init(
buf_block_t* block, /*!< in: pointer to control block */
byte* frame) /*!< in: pointer to buffer frame */
{
- UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE, block);
+ UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE);
block->frame = frame;
@@ -995,13 +1009,9 @@ buf_block_init(
block->page.in_zip_hash = FALSE;
block->page.in_flush_list = FALSE;
block->page.in_free_list = FALSE;
-#endif /* UNIV_DEBUG */
- block->page.flush_list.prev = NULL;
- block->page.flush_list.next = NULL;
- block->page.zip_list.prev = NULL;
- block->page.zip_list.next = NULL;
block->page.in_LRU_list = FALSE;
block->in_unzip_LRU_list = FALSE;
+#endif /* UNIV_DEBUG */
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
block->n_pointers = 0;
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
@@ -1033,7 +1043,6 @@ buf_block_init(
#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
ut_ad(rw_lock_validate(&(block->lock)));
-
}
/********************************************************************//**
@@ -1071,14 +1080,14 @@ buf_chunk_init(
/* Allocate the block descriptors from
the start of the memory block. */
- chunk->blocks = chunk->mem;
+ chunk->blocks = (buf_block_t*) chunk->mem;
/* Align a pointer to the first frame. Note that when
os_large_page_size is smaller than UNIV_PAGE_SIZE,
we may allocate one fewer block than requested. When
it is bigger, we may allocate more blocks than requested. */
- frame = ut_align(chunk->mem, UNIV_PAGE_SIZE);
+ frame = (byte*) ut_align(chunk->mem, UNIV_PAGE_SIZE);
chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
- (frame != chunk->mem);
@@ -1110,11 +1119,9 @@ buf_chunk_init(
UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);
/* Add the block to the free list */
- mutex_enter(&buf_pool->free_list_mutex);
- UT_LIST_ADD_LAST(free, buf_pool->free, (&block->page));
+ UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page));
ut_d(block->page.in_free_list = TRUE);
- mutex_exit(&buf_pool->free_list_mutex);
ut_ad(buf_pool_from_block(block) == buf_pool);
block++;
@@ -1169,8 +1176,6 @@ buf_pool_contains_zip(
buf_chunk_t* chunk = buf_pool->chunks;
ut_ad(buf_pool);
- //ut_ad(buf_pool_mutex_own(buf_pool));
- ut_ad(mutex_own(&buf_pool->zip_free_mutex));
for (n = buf_pool->n_chunks; n--; chunk++) {
buf_block_t* block = buf_chunk_contains_zip(chunk, data);
@@ -1202,7 +1207,7 @@ buf_chunk_not_freed(
ibool ready;
switch (buf_block_get_state(block)) {
- case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_POOL_WATCH:
case BUF_BLOCK_ZIP_PAGE:
case BUF_BLOCK_ZIP_DIRTY:
/* The uncompressed buffer pool should never
@@ -1221,8 +1226,9 @@ buf_chunk_not_freed(
ready = buf_flush_ready_for_replace(&block->page);
mutex_exit(&block->mutex);
- if (block->page.is_corrupt) {
- /* corrupt page may remain, it can be skipped */
+ if (UNIV_UNLIKELY(block->page.is_corrupt)) {
+ /* corrupt page may remain, it can be
+ skipped */
break;
}
@@ -1248,8 +1254,6 @@ buf_pool_set_sizes(void)
ulint i;
ulint curr_size = 0;
- buf_pool_mutex_enter_all();
-
for (i = 0; i < srv_buf_pool_instances; i++) {
buf_pool_t* buf_pool;
@@ -1259,8 +1263,6 @@ buf_pool_set_sizes(void)
srv_buf_pool_curr_size = curr_size;
srv_buf_pool_old_size = srv_buf_pool_size;
-
- buf_pool_mutex_exit_all();
}
/********************************************************************//**
@@ -1280,12 +1282,8 @@ buf_pool_init_instance(
/* 1. Initialize general fields
------------------------------- */
- mutex_create(buf_pool_mutex_key,
- &buf_pool->mutex, SYNC_BUF_POOL);
mutex_create(buf_pool_LRU_list_mutex_key,
&buf_pool->LRU_list_mutex, SYNC_BUF_LRU_LIST);
- rw_lock_create(buf_pool_page_hash_key,
- &buf_pool->page_hash_latch, SYNC_BUF_PAGE_HASH);
mutex_create(buf_pool_free_list_mutex_key,
&buf_pool->free_list_mutex, SYNC_BUF_FREE_LIST);
mutex_create(buf_pool_zip_free_mutex_key,
@@ -1294,14 +1292,14 @@ buf_pool_init_instance(
&buf_pool->zip_hash_mutex, SYNC_BUF_ZIP_HASH);
mutex_create(buf_pool_zip_mutex_key,
&buf_pool->zip_mutex, SYNC_BUF_BLOCK);
-
- mutex_enter(&buf_pool->LRU_list_mutex);
- rw_lock_x_lock(&buf_pool->page_hash_latch);
- buf_pool_mutex_enter(buf_pool);
+ mutex_create(buf_pool_flush_state_mutex_key,
+ &buf_pool->flush_state_mutex, SYNC_BUF_FLUSH_STATE);
if (buf_pool_size > 0) {
buf_pool->n_chunks = 1;
- buf_pool->chunks = chunk = mem_zalloc(sizeof *chunk);
+
+ buf_pool->chunks = chunk =
+ (buf_chunk_t*) mem_zalloc(sizeof *chunk);
UT_LIST_INIT(buf_pool->free);
@@ -1309,19 +1307,28 @@ buf_pool_init_instance(
mem_free(chunk);
mem_free(buf_pool);
- mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
- buf_pool_mutex_exit(buf_pool);
-
return(DB_ERROR);
}
buf_pool->instance_no = instance_no;
buf_pool->old_pool_size = buf_pool_size;
buf_pool->curr_size = chunk->size;
+ buf_pool->read_ahead_area
+ = ut_min(64, ut_2_power_up(buf_pool->curr_size / 32));
buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
- buf_pool->page_hash = hash_create(2 * buf_pool->curr_size);
+ /* Number of locks protecting page_hash must be a
+ power of two */
+ srv_n_page_hash_locks =
+ ut_2_power_up(srv_n_page_hash_locks);
+ ut_a(srv_n_page_hash_locks != 0);
+ ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);
+
+ buf_pool->page_hash = ha_create(2 * buf_pool->curr_size,
+ srv_n_page_hash_locks,
+ MEM_HEAP_FOR_PAGE_HASH,
+ SYNC_BUF_PAGE_HASH);
+
buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
buf_pool->last_printout_time = ut_time();
@@ -1333,17 +1340,15 @@ buf_pool_init_instance(
SYNC_BUF_FLUSH_LIST);
for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
- buf_pool->no_flush[i] = os_event_create(NULL);
+ buf_pool->no_flush[i] = os_event_create();
}
- /* 3. Initialize LRU fields
- --------------------------- */
+ buf_pool->watch = (buf_page_t*) mem_zalloc(
+ sizeof(*buf_pool->watch) * BUF_POOL_WATCH_SIZE);
/* All fields are initialized by mem_zalloc(). */
- mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
- buf_pool_mutex_exit(buf_pool);
+ buf_pool->try_LRU_scan = TRUE;
return(DB_SUCCESS);
}
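/*
 * An illustrative sketch (not part of the diff above) of why
 * srv_n_page_hash_locks is rounded up to a power of two: with 2^k lock
 * partitions, the fold-to-lock mapping is a single mask instead of a
 * division.  The hash_lock_t type and the array below are placeholders,
 * not the real ha_create()/prio_rw_lock_t machinery.
 */
typedef struct hash_lock_t { int dummy; } hash_lock_t;

#define N_PAGE_HASH_LOCKS 16U		/* must be a power of two */

static hash_lock_t page_hash_locks[N_PAGE_HASH_LOCKS];

static hash_lock_t*
page_hash_lock_get(unsigned long fold)
{
	/* Equivalent to fold % N_PAGE_HASH_LOCKS, but division-free
	   because N_PAGE_HASH_LOCKS is a power of two. */
	return &page_hash_locks[fold & (N_PAGE_HASH_LOCKS - 1)];
}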
@@ -1380,6 +1385,9 @@ buf_pool_free_instance(
bpage = prev_bpage;
}
+ mem_free(buf_pool->watch);
+ buf_pool->watch = NULL;
+
chunks = buf_pool->chunks;
chunk = chunks + buf_pool->n_chunks;
@@ -1388,6 +1396,7 @@ buf_pool_free_instance(
}
mem_free(buf_pool->chunks);
+ ha_clear(buf_pool->page_hash);
hash_table_free(buf_pool->page_hash);
hash_table_free(buf_pool->zip_hash);
}
@@ -1396,7 +1405,7 @@ buf_pool_free_instance(
Creates the buffer pool.
@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
UNIV_INTERN
-ulint
+dberr_t
buf_pool_init(
/*==========*/
ulint total_size, /*!< in: size of the total pool in bytes */
@@ -1410,10 +1419,8 @@ buf_pool_init(
ut_ad(n_instances <= MAX_BUFFER_POOLS);
ut_ad(n_instances == srv_buf_pool_instances);
- /* We create an extra buffer pool instance, this instance is used
- for flushing the flush lists, to keep track of n_flush for all
- the buffer pools and also used as a waiting object during flushing. */
- buf_pool_ptr = mem_zalloc(n_instances * sizeof *buf_pool_ptr);
+ buf_pool_ptr = (buf_pool_t*) mem_zalloc(
+ n_instances * sizeof *buf_pool_ptr);
for (i = 0; i < n_instances; i++) {
buf_pool_t* ptr = &buf_pool_ptr[i];
@@ -1464,11 +1471,7 @@ buf_pool_clear_hash_index(void)
ulint p;
#ifdef UNIV_SYNC_DEBUG
- ulint j;
-
- for (j = 0; j < btr_search_index_num; j++) {
- ut_ad(rw_lock_own(&btr_search_latch_arr[j], RW_LOCK_EX));
- }
+ ut_ad(btr_search_own_all(RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */
ut_ad(!btr_search_enabled);
@@ -1519,23 +1522,25 @@ buf_relocate(
ulint fold;
buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
- //ut_ad(buf_pool_mutex_own(buf_pool));
+ fold = buf_page_address_fold(bpage->space, bpage->offset);
+
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
-#ifdef UNIV_SYNC_DEBUG
- ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_EX));
-#endif
+ ut_ad(buf_page_hash_lock_held_x(buf_pool, bpage));
ut_ad(mutex_own(buf_page_get_mutex(bpage)));
ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
ut_a(bpage->buf_fix_count == 0);
ut_ad(bpage->in_LRU_list);
ut_ad(!bpage->in_zip_hash);
ut_ad(bpage->in_page_hash);
- ut_ad(bpage == buf_page_hash_get(buf_pool,
- bpage->space, bpage->offset));
+ ut_ad(bpage == buf_page_hash_get_low(buf_pool,
+ bpage->space,
+ bpage->offset,
+ fold));
+
ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
#ifdef UNIV_DEBUG
switch (buf_page_get_state(bpage)) {
- case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_POOL_WATCH:
case BUF_BLOCK_NOT_USED:
case BUF_BLOCK_READY_FOR_USE:
case BUF_BLOCK_FILE_PAGE:
@@ -1550,7 +1555,7 @@ buf_relocate(
memcpy(dpage, bpage, sizeof *dpage);
- bpage->in_LRU_list = FALSE;
+ ut_d(bpage->in_LRU_list = FALSE);
ut_d(bpage->in_page_hash = FALSE);
/* relocate buf_pool->LRU */
@@ -1580,12 +1585,10 @@ buf_relocate(
#endif /* UNIV_LRU_DEBUG */
}
- ut_d(UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU,
- ut_ad(ut_list_node_313->in_LRU_list)));
+ ut_d(UT_LIST_VALIDATE(
+ LRU, buf_page_t, buf_pool->LRU, CheckInLRUList()));
/* relocate buf_pool->page_hash */
- fold = buf_page_address_fold(bpage->space, bpage->offset);
-
HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
}
@@ -1600,6 +1603,8 @@ buf_pool_watch_is_sentinel(
buf_pool_t* buf_pool, /*!< buffer pool instance */
const buf_page_t* bpage) /*!< in: block */
{
+ /* We must also own the appropriate hash lock. */
+ ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage));
ut_ad(buf_page_in_file(bpage));
if (bpage < &buf_pool->watch[0]
@@ -1620,8 +1625,9 @@ buf_pool_watch_is_sentinel(
}
/****************************************************************//**
-Add watch for the given page to be read in. Caller must have the buffer pool
-mutex reserved.
+Add watch for the given page to be read in. Caller must have
+appropriate hash_lock for the bpage. This function may release the
+hash_lock and reacquire it.
@return NULL if watch set, block if the page is in the buffer pool */
UNIV_INTERN
buf_page_t*
@@ -1634,32 +1640,53 @@ buf_pool_watch_set(
buf_page_t* bpage;
ulint i;
buf_pool_t* buf_pool = buf_pool_get(space, offset);
- mutex_t* block_mutex;
+ prio_rw_lock_t* hash_lock;
- //ut_ad(buf_pool_mutex_own(buf_pool));
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
- rw_lock_x_lock(&buf_pool->page_hash_latch);
bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
if (UNIV_LIKELY_NULL(bpage)) {
-
- block_mutex = buf_page_get_mutex_enter(bpage);
- ut_a(block_mutex);
-
+page_found:
if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
/* The page was loaded meanwhile. */
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
return(bpage);
}
/* Add to an existing watch. */
+ mutex_enter(&buf_pool->zip_mutex);
bpage->buf_fix_count++;
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
- mutex_exit(block_mutex);
+ mutex_exit(&buf_pool->zip_mutex);
return(NULL);
}
- /* buf_pool->watch is protected by zip_mutex for now */
- mutex_enter(&buf_pool->zip_mutex);
+ /* From this point this function becomes fairly heavy in terms
+ of latching. We acquire all the hash_locks. They are needed
+ because we don't want to read any stale information in
+ buf_pool->watch[]. However, it is not in the critical code path
+ as this function will be called only by the purge thread. */
+
+
+ /* To obey latching order first release the hash_lock. */
+ rw_lock_x_unlock(hash_lock);
+
+ hash_lock_x_all(buf_pool->page_hash);
+
+ /* We have to recheck that the page was not
+ loaded, and that no watch was set by some other
+ purge thread, because of the small time window
+ between releasing the hash_lock and acquiring
+ all the hash locks above. */
+
+ bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
+ if (UNIV_LIKELY_NULL(bpage)) {
+ hash_unlock_x_all_but(buf_pool->page_hash, hash_lock);
+ goto page_found;
+ }
+
for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
bpage = &buf_pool->watch[i];
@@ -1674,21 +1701,25 @@ buf_pool_watch_set(
ut_ad(!bpage->in_page_hash);
ut_ad(bpage->buf_fix_count == 0);
- /* bpage is pointing to buf_pool->watch[],
- which is protected by buf_pool->mutex.
- Normally, buf_page_t objects are protected by
- buf_block_t::mutex or buf_pool->zip_mutex or both. */
+ mutex_enter(&buf_pool->zip_mutex);
bpage->state = BUF_BLOCK_ZIP_PAGE;
bpage->space = space;
bpage->offset = offset;
bpage->buf_fix_count = 1;
- bpage->buf_pool_index = buf_pool_index(buf_pool);
+
+ mutex_exit(&buf_pool->zip_mutex);
+
ut_d(bpage->in_page_hash = TRUE);
HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
fold, bpage);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
- mutex_exit(&buf_pool->zip_mutex);
+
+ /* Once the sentinel is in the page_hash we can
+ safely release all locks except just the
+ relevant hash_lock */
+ hash_unlock_x_all_but(buf_pool->page_hash,
+ hash_lock);
+
return(NULL);
case BUF_BLOCK_ZIP_PAGE:
ut_ad(bpage->in_page_hash);
@@ -1706,8 +1737,6 @@ buf_pool_watch_set(
ut_error;
/* Fix compiler warning */
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
- mutex_exit(&buf_pool->zip_mutex);
return(NULL);
}
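/*
 * A simplified sketch (illustration only, not from the patch) of the
 * locking pattern buf_pool_watch_set() follows above: the single bucket
 * lock is dropped, every bucket lock is taken so buf_pool->watch[] is
 * never read under a partial view of page_hash, the lookup is repeated,
 * and only then is a sentinel installed.  All functions below are
 * placeholders for hash_lock_x_all()/buf_page_hash_get_low(), not the
 * real API, and the return-value conventions are simplified.
 */
#include <stddef.h>

void  bucket_unlock_x(unsigned long fold);		/* placeholder */
void  all_buckets_lock_x(void);				/* placeholder */
void  all_buckets_unlock_x_but(unsigned long fold);	/* placeholder */
void* page_hash_lookup(unsigned long fold);		/* placeholder */
void* watch_sentinel_install(unsigned long fold);	/* placeholder */

void*
watch_set_sketch(unsigned long fold)
{
	/* The caller holds the x-lock on this fold's bucket. */
	void* page = page_hash_lookup(fold);

	if (page != NULL) {
		return page;	/* page already present or watched */
	}

	/* Obey the latching order: release the single bucket lock
	   before acquiring every bucket lock. */
	bucket_unlock_x(fold);
	all_buckets_lock_x();

	/* The page may have been loaded or watched in the window
	   above, so the lookup must be repeated. */
	page = page_hash_lookup(fold);
	if (page == NULL) {
		page = watch_sentinel_install(fold);
	}

	all_buckets_unlock_x_but(fold);
	return page;
}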
@@ -1725,11 +1754,14 @@ buf_pool_watch_remove(
space, offset) */
buf_page_t* watch) /*!< in/out: sentinel for watch */
{
- //ut_ad(buf_pool_mutex_own(buf_pool));
#ifdef UNIV_SYNC_DEBUG
- ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_EX));
-#endif
- ut_ad(mutex_own(&buf_pool->zip_mutex)); /* for now */
+ /* We must also own the appropriate hash_bucket mutex. */
+ prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+ ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(buf_page_get_state(watch) == BUF_BLOCK_ZIP_PAGE);
+ ut_ad(buf_own_zip_mutex_for_page(watch));
HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, watch);
ut_d(watch->in_page_hash = FALSE);
@@ -1750,32 +1782,34 @@ buf_pool_watch_unset(
buf_page_t* bpage;
buf_pool_t* buf_pool = buf_pool_get(space, offset);
ulint fold = buf_page_address_fold(space, offset);
+ prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool,
+ fold);
+
+ rw_lock_x_lock(hash_lock);
- //buf_pool_mutex_enter(buf_pool);
- rw_lock_x_lock(&buf_pool->page_hash_latch);
bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
/* The page must exist because buf_pool_watch_set()
increments buf_fix_count. */
ut_a(bpage);
if (UNIV_UNLIKELY(!buf_pool_watch_is_sentinel(buf_pool, bpage))) {
- mutex_t* mutex = buf_page_get_mutex_enter(bpage);
+ ib_mutex_t* mutex = buf_page_get_mutex(bpage);
+ mutex_enter(mutex);
ut_a(bpage->buf_fix_count > 0);
bpage->buf_fix_count--;
mutex_exit(mutex);
} else {
- mutex_enter(&buf_pool->zip_mutex);
ut_a(bpage->buf_fix_count > 0);
+ mutex_enter(&buf_pool->zip_mutex);
if (UNIV_LIKELY(!--bpage->buf_fix_count)) {
buf_pool_watch_remove(buf_pool, fold, bpage);
}
mutex_exit(&buf_pool->zip_mutex);
}
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
+ rw_lock_x_unlock(hash_lock);
}
/****************************************************************//**
@@ -1794,17 +1828,17 @@ buf_pool_watch_occurred(
buf_page_t* bpage;
buf_pool_t* buf_pool = buf_pool_get(space, offset);
ulint fold = buf_page_address_fold(space, offset);
+ prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool,
+ fold);
- //buf_pool_mutex_enter(buf_pool);
- rw_lock_s_lock(&buf_pool->page_hash_latch);
+ rw_lock_s_lock(hash_lock);
bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
/* The page must exist because buf_pool_watch_set()
increments buf_fix_count. */
ut_a(bpage);
ret = !buf_pool_watch_is_sentinel(buf_pool, bpage);
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
+ rw_lock_s_unlock(hash_lock);
return(ret);
}
@@ -1821,7 +1855,6 @@ buf_page_make_young(
{
buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
- //buf_pool_mutex_enter(buf_pool);
ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
mutex_enter(&buf_pool->LRU_list_mutex);
@@ -1829,7 +1862,6 @@ buf_page_make_young(
buf_LRU_make_block_young(bpage);
- //buf_pool_mutex_exit(buf_pool);
mutex_exit(&buf_pool->LRU_list_mutex);
}
@@ -1844,10 +1876,6 @@ buf_page_make_young_if_needed(
buf_page_t* bpage) /*!< in/out: buffer block of a
file page */
{
-#ifdef UNIV_DEBUG
- buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
- ut_ad(!buf_pool_mutex_own(buf_pool));
-#endif /* UNIV_DEBUG */
ut_a(buf_page_in_file(bpage));
if (buf_page_peek_if_too_old(bpage)) {
@@ -1868,18 +1896,12 @@ buf_reset_check_index_page_at_flush(
buf_block_t* block;
buf_pool_t* buf_pool = buf_pool_get(space, offset);
- //buf_pool_mutex_enter(buf_pool);
- rw_lock_s_lock(&buf_pool->page_hash_latch);
-
block = (buf_block_t*) buf_page_hash_get(buf_pool, space, offset);
if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) {
ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
block->check_index_page_at_flush = FALSE;
}
-
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
}
#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
@@ -1898,22 +1920,22 @@ buf_page_set_file_page_was_freed(
{
buf_page_t* bpage;
buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ prio_rw_lock_t* hash_lock;
- //buf_pool_mutex_enter(buf_pool);
- rw_lock_s_lock(&buf_pool->page_hash_latch);
-
- bpage = buf_page_hash_get(buf_pool, space, offset);
+ bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
+ &hash_lock);
if (bpage) {
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+ mutex_enter(block_mutex);
+ rw_lock_s_unlock(hash_lock);
/* bpage->file_page_was_freed can already hold
when this code is invoked from dict_drop_index_tree() */
bpage->file_page_was_freed = TRUE;
+ mutex_exit(block_mutex);
}
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
return(bpage);
}
@@ -1932,25 +1954,66 @@ buf_page_reset_file_page_was_freed(
{
buf_page_t* bpage;
buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ prio_rw_lock_t* hash_lock;
- //buf_pool_mutex_enter(buf_pool);
- rw_lock_s_lock(&buf_pool->page_hash_latch);
-
- bpage = buf_page_hash_get(buf_pool, space, offset);
-
+ bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
+ &hash_lock);
if (bpage) {
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+ mutex_enter(block_mutex);
+ rw_lock_s_unlock(hash_lock);
bpage->file_page_was_freed = FALSE;
+ mutex_exit(block_mutex);
}
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
return(bpage);
}
#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
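/*
 * A minimal sketch (illustration only, not from the patch) of the lock
 * hand-off used by the two debug functions above: the page is found
 * under the bucket s-lock, the block mutex is taken while the s-lock is
 * still held, and only then is the bucket lock released, so the
 * descriptor cannot change state or be relocated in between.  The
 * helpers below are placeholders, not the real buf_* API.
 */
#include <stdbool.h>
#include <stddef.h>

void  bucket_unlock_s(unsigned long fold);		/* placeholder */
void* page_lookup_s_locked(unsigned long fold);		/* placeholder */
void  block_mutex_enter(void* page);			/* placeholder */
void  block_mutex_exit(void* page);			/* placeholder */
void  mark_freed(void* page, bool freed);		/* placeholder */

void
set_page_freed_sketch(unsigned long fold, bool freed)
{
	/* Returns with the bucket s-lock held when the page is found. */
	void* page = page_lookup_s_locked(fold);

	if (page != NULL) {
		block_mutex_enter(page);	/* pin the state first... */
		bucket_unlock_s(fold);		/* ...then drop the bucket lock */
		mark_freed(page, freed);
		block_mutex_exit(page);
	}
}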
/********************************************************************//**
+Attempts to discard the uncompressed frame of a compressed page. The
+caller should not be holding any mutexes when this function is called.
+The attempt is best-effort; the outcome is not reported to the caller. */
+static
+void
+buf_block_try_discard_uncompressed(
+/*===============================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+ /* We need buf_pool->LRU_list_mutex to discard the
+ uncompressed frame, and the page_hash mutex ranks below it
+ in the latching order, so the page_hash mutex must be
+ released first. The block in question can then move out of
+ page_hash, which is why we look it up again after acquiring
+ buf_pool->LRU_list_mutex. */
+
+ mutex_enter(&buf_pool->LRU_list_mutex);
+
+ bpage = buf_page_hash_get(buf_pool, space, offset);
+
+ if (bpage) {
+
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(block_mutex);
+
+ if (buf_LRU_free_page(bpage, false)) {
+
+ mutex_exit(block_mutex);
+ return;
+ }
+ mutex_exit(block_mutex);
+ }
+
+ mutex_exit(&buf_pool->LRU_list_mutex);
+}
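/*
 * An illustrative sketch (not part of the diff above) of the
 * latching-order pattern in buf_block_try_discard_uncompressed():
 * because the LRU list mutex ranks above the page_hash locks, the
 * higher-ranked mutex is taken first and the page is looked up again,
 * since it may have left page_hash in the meantime.  The helpers are
 * placeholders, and try_free_page() is assumed to release the LRU
 * mutex itself on success, mirroring the early return above.
 */
#include <stddef.h>

void  lru_mutex_enter(void);					/* placeholder */
void  lru_mutex_exit(void);					/* placeholder */
void* page_lookup(unsigned long space, unsigned long page_no);	/* placeholder */
int   try_free_page(void* page);				/* placeholder */

void
discard_uncompressed_sketch(unsigned long space, unsigned long page_no)
{
	lru_mutex_enter();		/* higher-ranked latch first */

	/* Re-check: the block may have been evicted or relocated
	   before the LRU mutex was acquired. */
	void* page = page_lookup(space, page_no);

	if (page != NULL && try_free_page(page)) {
		/* Assumed: try_free_page() released the LRU mutex. */
		return;
	}

	lru_mutex_exit();
}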
+
+/********************************************************************//**
Get read access to a compressed page (usually of type
FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
The page must be released with buf_page_release_zip().
@@ -1968,7 +2031,9 @@ buf_page_get_zip(
ulint offset) /*!< in: page number */
{
buf_page_t* bpage;
- mutex_t* block_mutex;
+ ib_mutex_t* block_mutex;
+ prio_rw_lock_t* hash_lock;
+ ibool discard_attempted = FALSE;
ibool must_read;
trx_t* trx = NULL;
ulint sec;
@@ -1976,7 +2041,6 @@ buf_page_get_zip(
ib_uint64_t start_time;
ib_uint64_t finish_time;
buf_pool_t* buf_pool = buf_pool_get(space, offset);
- ibool have_LRU_mutex = FALSE;
if (UNIV_UNLIKELY(innobase_get_slow_log())) {
trx = innobase_get_trx();
@@ -1984,10 +2048,12 @@ buf_page_get_zip(
buf_pool->stat.n_page_gets++;
for (;;) {
- //buf_pool_mutex_enter(buf_pool);
lookup:
- rw_lock_s_lock(&buf_pool->page_hash_latch);
- bpage = buf_page_hash_get(buf_pool, space, offset);
+
+ /* The following call will also grab the page_hash
+ mutex if the page is found. */
+ bpage = buf_page_hash_get_s_locked(buf_pool, space,
+ offset, &hash_lock);
if (bpage) {
ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
break;
@@ -1995,9 +2061,7 @@ lookup:
/* Page not in buf_pool: needs to be read from file */
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
+ ut_ad(!hash_lock);
buf_read_page(space, zip_size, offset, trx);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
@@ -2005,88 +2069,52 @@ lookup:
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}
- if (UNIV_UNLIKELY(!bpage->zip.data)) {
+ ut_ad(buf_page_hash_lock_held_s(buf_pool, bpage));
+
+ if (!bpage->zip.data) {
/* There is no compressed page. */
err_exit:
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
+ rw_lock_s_unlock(hash_lock);
return(NULL);
}
if (UNIV_UNLIKELY(bpage->is_corrupt && srv_pass_corrupt_table <= 1)) {
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
+ rw_lock_s_unlock(hash_lock);
return(NULL);
}
- block_mutex = buf_page_get_mutex_enter(bpage);
-
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_POOL_WATCH:
case BUF_BLOCK_NOT_USED:
case BUF_BLOCK_READY_FOR_USE:
case BUF_BLOCK_MEMORY:
case BUF_BLOCK_REMOVE_HASH:
- case BUF_BLOCK_ZIP_FREE:
- if (block_mutex)
- mutex_exit(block_mutex);
break;
case BUF_BLOCK_ZIP_PAGE:
case BUF_BLOCK_ZIP_DIRTY:
- ut_a(block_mutex == &buf_pool->zip_mutex);
+ block_mutex = &buf_pool->zip_mutex;
+ mutex_enter(block_mutex);
bpage->buf_fix_count++;
goto got_block;
case BUF_BLOCK_FILE_PAGE:
- {
- ut_a(block_mutex == &((buf_block_t*) bpage)->mutex);
-
- /* release mutex to obey to latch-order */
- mutex_exit(block_mutex);
-
- /* get LRU_list_mutex for buf_LRU_free_block() */
- if (!have_LRU_mutex) {
- mutex_enter(&buf_pool->LRU_list_mutex);
- have_LRU_mutex = TRUE;
- }
-
- mutex_enter(block_mutex);
-
- if (UNIV_UNLIKELY(bpage->space != space
- || bpage->offset != offset
- || !bpage->in_LRU_list
- || !bpage->zip.data)) {
- /* someone should interrupt, retry */
- if (have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- have_LRU_mutex = FALSE;
- }
- mutex_exit(block_mutex);
- goto lookup;
- }
-
/* Discard the uncompressed page frame if possible. */
- if (buf_LRU_free_block(bpage, FALSE, &have_LRU_mutex)) {
- if (have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- have_LRU_mutex = FALSE;
- }
- mutex_exit(block_mutex);
+ if (!discard_attempted) {
+ rw_lock_s_unlock(hash_lock);
+ buf_block_try_discard_uncompressed(space,
+ offset);
+ discard_attempted = TRUE;
goto lookup;
}
- if (have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- have_LRU_mutex = FALSE;
- }
-
+ block_mutex = &((buf_block_t*) bpage)->mutex;
+ mutex_enter(block_mutex);
buf_block_buf_fix_inc((buf_block_t*) bpage,
__FILE__, __LINE__);
goto got_block;
- }
}
ut_error;
@@ -2095,7 +2123,10 @@ err_exit:
got_block:
must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ;
- //buf_pool_mutex_exit(buf_pool);
+ rw_lock_s_unlock(hash_lock);
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ ut_a(!bpage->file_page_was_freed);
+#endif
buf_page_set_accessed(bpage);
@@ -2103,10 +2134,6 @@ got_block:
buf_page_make_young_if_needed(bpage);
-#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
- ut_a(!bpage->file_page_was_freed);
-#endif
-
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(++buf_dbg_counter % 5771 || buf_validate());
ut_a(bpage->buf_fix_count > 0);
@@ -2181,26 +2208,28 @@ buf_zip_decompress(
buf_block_t* block, /*!< in/out: block */
ibool check) /*!< in: TRUE=verify the page checksum */
{
- const byte* frame = block->page.zip.data;
- ulint stamp_checksum = mach_read_from_4(
- frame + FIL_PAGE_SPACE_OR_CHKSUM);
+ const byte* frame = block->page.zip.data;
+ ulint size = page_zip_get_size(&block->page.zip);
ut_ad(buf_block_get_zip_size(block));
ut_a(buf_block_get_space(block) != 0);
- if (UNIV_LIKELY(check && stamp_checksum != BUF_NO_CHECKSUM_MAGIC)) {
- ulint calc_checksum = page_zip_calc_checksum(
- frame, page_zip_get_size(&block->page.zip));
+ if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {
- if (UNIV_UNLIKELY(stamp_checksum != calc_checksum)) {
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: compressed page checksum mismatch"
- " (space %u page %u): %lu != %lu\n",
- block->page.space, block->page.offset,
- stamp_checksum, calc_checksum);
- return(FALSE);
- }
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: compressed page checksum mismatch"
+ " (space %u page %u): stored: %lu, crc32: %lu "
+ "innodb: %lu, none: %lu\n",
+ block->page.space, block->page.offset,
+ mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM),
+ page_zip_calc_checksum(frame, size,
+ SRV_CHECKSUM_ALGORITHM_CRC32),
+ page_zip_calc_checksum(frame, size,
+ SRV_CHECKSUM_ALGORITHM_INNODB),
+ page_zip_calc_checksum(frame, size,
+ SRV_CHECKSUM_ALGORITHM_NONE));
+ return(FALSE);
}
switch (fil_page_get_type(frame)) {
@@ -2277,12 +2306,13 @@ buf_block_align_instance(
ut_ad(block->frame == page_align(ptr));
#ifdef UNIV_DEBUG
/* A thread that updates these fields must
- hold buf_pool->mutex and block->mutex. Acquire
+ hold one of the buf_pool mutexes, depending on the
+ page state, and block->mutex. Acquire
only the latter. */
mutex_enter(&block->mutex);
switch (buf_block_get_state(block)) {
- case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_POOL_WATCH:
case BUF_BLOCK_ZIP_PAGE:
case BUF_BLOCK_ZIP_DIRTY:
/* These types should only be used in
@@ -2373,8 +2403,8 @@ buf_pointer_is_block_field_instance(
/* TODO: protect buf_pool->chunks with a mutex (it will
currently remain constant after buf_pool_init()) */
while (chunk < echunk) {
- if (ptr >= (void *)chunk->blocks
- && ptr < (void *)(chunk->blocks + chunk->size)) {
+ if (ptr >= (void*) chunk->blocks
+ && ptr < (void*) (chunk->blocks + chunk->size)) {
return(TRUE);
}
@@ -2421,15 +2451,35 @@ buf_block_is_uncompressed(
const buf_block_t* block) /*!< in: pointer to block,
not dereferenced */
{
- //ut_ad(buf_pool_mutex_own(buf_pool));
-
if (UNIV_UNLIKELY((((ulint) block) % sizeof *block) != 0)) {
/* The pointer should be aligned. */
return(FALSE);
}
- return(buf_pointer_is_block_field_instance(buf_pool, (void *)block));
+ return(buf_pointer_is_block_field_instance(buf_pool, (void*) block));
+}
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/********************************************************************//**
+Return true if probe is enabled.
+@return true if probe enabled. */
+static
+bool
+buf_debug_execute_is_force_flush()
+/*==============================*/
+{
+ DBUG_EXECUTE_IF("ib_buf_force_flush", return(true); );
+
+ /* This is used during quiesce testing, where we want to ensure
+ maximum buffering by the change buffer. */
+
+ if (srv_ibuf_disable_background_merge) {
+ return(true);
+ }
+
+ return(false);
}
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
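/*
 * A hypothetical sketch (not part of the diff above) of the shape of a
 * debug-only gate like buf_debug_execute_is_force_flush(): an
 * environment variable stands in for the DBUG_EXECUTE_IF keyword and a
 * plain flag stands in for srv_ibuf_disable_background_merge.  Names
 * are illustrative only.
 */
#include <stdbool.h>
#include <stdlib.h>

static bool ibuf_background_merge_disabled;	/* stand-in flag */

static bool
force_flush_probe_enabled(void)
{
	/* Test hook: behave as if the probe fired. */
	if (getenv("IB_BUF_FORCE_FLUSH") != NULL) {
		return true;
	}

	/* During quiesce testing, force eviction so the change buffer
	   absorbs as much work as possible. */
	return ibuf_background_merge_disabled;
}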
/********************************************************************//**
This is the general function used to get access to a database page.
@@ -2456,15 +2506,15 @@ buf_page_get_gen(
unsigned access_time;
ulint fix_type;
ibool must_read;
+ prio_rw_lock_t* hash_lock;
+ ib_mutex_t* block_mutex;
ulint retries = 0;
- mutex_t* block_mutex = NULL;
trx_t* trx = NULL;
ulint sec;
ulint ms;
ib_uint64_t start_time;
ib_uint64_t finish_time;
buf_pool_t* buf_pool = buf_pool_get(space, offset);
- ibool have_LRU_mutex = FALSE;
ut_ad(mtr);
ut_ad(mtr->state == MTR_ACTIVE);
@@ -2498,48 +2548,36 @@ buf_page_get_gen(
}
buf_pool->stat.n_page_gets++;
fold = buf_page_address_fold(space, offset);
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
loop:
block = guess;
- //buf_pool_mutex_enter(buf_pool);
+ rw_lock_s_lock(hash_lock);
if (block) {
- block_mutex = buf_page_get_mutex_enter((buf_page_t*)block);
-
/* If the guess is a compressed page descriptor that
has been allocated by buf_page_alloc_descriptor(),
it may have been freed by buf_relocate(). */
- if (!block_mutex) {
- block = guess = NULL;
- } else if (!buf_block_is_uncompressed(buf_pool, block)
+ if (!buf_block_is_uncompressed(buf_pool, block)
|| offset != block->page.offset
|| space != block->page.space
|| buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
- mutex_exit(block_mutex);
-
+ /* Our guess was bogus or things have changed
+ since. */
block = guess = NULL;
} else {
ut_ad(!block->page.in_zip_hash);
- ut_ad(block->page.in_page_hash);
}
}
if (block == NULL) {
- rw_lock_s_lock(&buf_pool->page_hash_latch);
block = (buf_block_t*) buf_page_hash_get_low(
buf_pool, space, offset, fold);
- if (block) {
-
- block_mutex = buf_page_get_mutex_enter((buf_page_t*)block);
- ut_a(block_mutex);
- }
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
}
-loop2:
- if (block && buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
- mutex_exit(block_mutex);
+ if (!block || buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
+ rw_lock_s_unlock(hash_lock);
block = NULL;
}
@@ -2547,33 +2585,35 @@ loop2:
/* Page not in buf_pool: needs to be read from file */
if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+ rw_lock_x_lock(hash_lock);
block = (buf_block_t*) buf_pool_watch_set(
space, offset, fold);
if (UNIV_LIKELY_NULL(block)) {
- block_mutex = buf_page_get_mutex((buf_page_t*)block);
- ut_a(block_mutex);
- ut_ad(mutex_own(block_mutex));
+ /* We can release hash_lock after we
+ acquire block_mutex to make sure that
+ no state change takes place. */
+ block_mutex = buf_page_get_mutex(&block->page);
+ mutex_enter(block_mutex);
+
+ /* Now safe to release page_hash mutex */
+ rw_lock_x_unlock(hash_lock);
goto got_block;
}
- }
- //buf_pool_mutex_exit(buf_pool);
+ rw_lock_x_unlock(hash_lock);
+ }
if (mode == BUF_GET_IF_IN_POOL
|| mode == BUF_PEEK_IF_IN_POOL
|| mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
-
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
return(NULL);
}
- /* We should not hold LRU mutex below when trying
- to read the page */
- if (have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- have_LRU_mutex = FALSE;
- }
-
if (buf_read_page(space, zip_size, offset, trx)) {
buf_read_ahead_random(space, zip_size, offset,
ibuf_inside(mtr), trx);
@@ -2611,8 +2651,18 @@ loop2:
goto loop;
}
+
+ /* We can release hash_lock after we acquire block_mutex to
+ make sure that no state change takes place. */
+ block_mutex = buf_page_get_mutex(&block->page);
+ mutex_enter(block_mutex);
+
+ /* Now safe to release page_hash mutex */
+ rw_lock_s_unlock(hash_lock);
+
got_block:
ut_ad(page_zip_get_size(&block->page.zip) == zip_size);
+ ut_ad(mutex_own(block_mutex));
must_read = buf_block_get_io_fix(block) == BUF_IO_READ;
@@ -2623,7 +2673,6 @@ got_block:
but we cannot wait around for the read to
complete. */
null_exit:
- //buf_pool_mutex_exit(buf_pool);
mutex_exit(block_mutex);
return(NULL);
@@ -2633,24 +2682,19 @@ null_exit:
srv_pass_corrupt_table <= 1)) {
mutex_exit(block_mutex);
+
return(NULL);
}
switch (buf_block_get_state(block)) {
buf_page_t* bpage;
- ibool success;
case BUF_BLOCK_FILE_PAGE:
- if (block_mutex == &buf_pool->zip_mutex) {
- /* it is wrong mutex... */
- mutex_exit(block_mutex);
- goto loop;
- }
+ ut_ad(block_mutex != &buf_pool->zip_mutex);
break;
case BUF_BLOCK_ZIP_PAGE:
case BUF_BLOCK_ZIP_DIRTY:
- ut_ad(block_mutex == &buf_pool->zip_mutex);
if (mode == BUF_PEEK_IF_IN_POOL) {
/* This mode is only used for dropping an
adaptive hash index. There cannot be an
@@ -2660,20 +2704,17 @@ null_exit:
}
bpage = &block->page;
- /* Protect bpage->buf_fix_count. */
- //mutex_enter(&buf_pool->zip_mutex);
+ ut_ad(block_mutex == &buf_pool->zip_mutex);
if (bpage->buf_fix_count
|| buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
/* This condition often occurs when the buffer
is not buffer-fixed, but I/O-fixed by
buf_page_init_for_read(). */
- //mutex_exit(&buf_pool->zip_mutex);
+ mutex_exit(&buf_pool->zip_mutex);
wait_until_unfixed:
/* The block is buffer-fixed or I/O-fixed.
Try again later. */
- //buf_pool_mutex_exit(buf_pool);
- mutex_exit(block_mutex);
os_thread_sleep(WAIT_FOR_READ);
goto loop;
@@ -2685,30 +2726,22 @@ wait_until_unfixed:
bpage->buf_fix_count++;
/* Allocate an uncompressed page. */
- //buf_pool_mutex_exit(buf_pool);
- //mutex_exit(&buf_pool->zip_mutex);
- mutex_exit(block_mutex);
-
+ mutex_exit(&buf_pool->zip_mutex);
block = buf_LRU_get_free_block(buf_pool);
ut_a(block);
- block_mutex = &block->mutex;
- //buf_pool_mutex_enter(buf_pool);
- if (!have_LRU_mutex) {
- mutex_enter(&buf_pool->LRU_list_mutex);
- have_LRU_mutex = TRUE;
- }
+ mutex_enter(&buf_pool->LRU_list_mutex);
+
+ rw_lock_x_lock(hash_lock);
+ /* Buffer-fixing prevents the page_hash from changing. */
+ ut_ad(bpage == buf_page_hash_get_low(
+ buf_pool, space, offset, fold));
- rw_lock_x_lock(&buf_pool->page_hash_latch);
mutex_enter(&block->mutex);
mutex_enter(&buf_pool->zip_mutex);
- /* Buffer-fixing prevents the page_hash from changing. */
- ut_ad(bpage == buf_page_hash_get_low(buf_pool,
- space, offset, fold));
- if (UNIV_UNLIKELY
- (--bpage->buf_fix_count
- || buf_page_get_io_fix(bpage) != BUF_IO_NONE)) {
+ if (--bpage->buf_fix_count
+ || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
mutex_exit(&buf_pool->zip_mutex);
/* The block was buffer-fixed or I/O-fixed while
@@ -2717,15 +2750,11 @@ wait_until_unfixed:
This should be extremely unlikely, for example,
if buf_page_get_zip() was invoked. */
- buf_LRU_block_free_non_file_page(block, TRUE);
- //mutex_exit(&block->mutex);
-
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
-
- if (have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- have_LRU_mutex = FALSE;
- }
+ buf_LRU_block_free_non_file_page(block);
+ mutex_exit(&buf_pool->LRU_list_mutex);
+ mutex_exit(&buf_pool->zip_mutex);
+ rw_lock_x_unlock(hash_lock);
+ mutex_exit(&block->mutex);
goto wait_until_unfixed;
}
@@ -2734,19 +2763,16 @@ wait_until_unfixed:
and uncompress it. */
buf_relocate(bpage, &block->page);
-
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
-
buf_block_init_low(block);
block->lock_hash_val = lock_rec_hash(space, offset);
UNIV_MEM_DESC(&block->page.zip.data,
- page_zip_get_size(&block->page.zip), block);
+ page_zip_get_size(&block->page.zip));
if (buf_page_get_state(&block->page)
== BUF_BLOCK_ZIP_PAGE) {
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
- UT_LIST_REMOVE(zip_list, buf_pool->zip_clean,
+ UT_LIST_REMOVE(list, buf_pool->zip_clean,
&block->page);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
ut_ad(!block->page.in_flush_list);
@@ -2764,10 +2790,7 @@ wait_until_unfixed:
/* Insert at the front of unzip_LRU list */
buf_unzip_LRU_add_block(block, FALSE);
- if (have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- have_LRU_mutex = FALSE;
- }
+ mutex_exit(&buf_pool->LRU_list_mutex);
block->page.buf_fix_count = 1;
buf_block_set_io_fix(block, BUF_IO_READ);
@@ -2775,21 +2798,23 @@ wait_until_unfixed:
UNIV_MEM_INVALID(bpage, sizeof *bpage);
- access_time = buf_page_is_accessed(&block->page);
+ rw_lock_x_unlock(hash_lock);
- mutex_exit(block_mutex);
- mutex_exit(&buf_pool->zip_mutex);
+ os_atomic_increment_ulint(&buf_pool->n_pend_unzip, 1);
- buf_pool_mutex_enter(buf_pool);
- buf_pool->n_pend_unzip++;
- buf_pool_mutex_exit(buf_pool);
+ access_time = buf_page_is_accessed(&block->page);
+ mutex_exit(&block->mutex);
+ mutex_exit(&buf_pool->zip_mutex);
buf_page_free_descriptor(bpage);
/* Decompress the page while not holding
- buf_pool->mutex or block->mutex. */
- success = buf_zip_decompress(block, srv_use_checksums);
- ut_a(success);
+ any buf_pool or block->mutex. */
+
+ /* Page checksum verification is already done when
+ the page is read from disk. Hence page checksum
+ verification is not necessary when decompressing the page. */
+ ut_a(buf_zip_decompress(block, FALSE));
if (UNIV_LIKELY(!recv_no_ibuf_operations)) {
if (access_time) {
@@ -2803,20 +2828,15 @@ wait_until_unfixed:
}
/* Unfix and unlatch the block. */
- //buf_pool_mutex_enter(buf_pool);
- block_mutex = &block->mutex;
- mutex_enter(block_mutex);
+ mutex_enter(&block->mutex);
block->page.buf_fix_count--;
buf_block_set_io_fix(block, BUF_IO_NONE);
-
- buf_pool_mutex_enter(buf_pool);
- buf_pool->n_pend_unzip--;
- buf_pool_mutex_exit(buf_pool);
+ os_atomic_decrement_ulint(&buf_pool->n_pend_unzip, 1);
rw_lock_x_unlock(&block->lock);
break;
- case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_POOL_WATCH:
case BUF_BLOCK_NOT_USED:
case BUF_BLOCK_READY_FOR_USE:
case BUF_BLOCK_MEMORY:
@@ -2825,9 +2845,13 @@ wait_until_unfixed:
break;
}
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
- //mutex_enter(&block->mutex);
#if UNIV_WORD_SIZE == 4
/* On 32-bit systems, there is no padding in buf_page_t. On
other systems, Valgrind could complain about uninitialized pad
@@ -2835,64 +2859,65 @@ wait_until_unfixed:
UNIV_MEM_ASSERT_RW(&block->page, sizeof block->page);
#endif
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+
if ((mode == BUF_GET_IF_IN_POOL || mode == BUF_GET_IF_IN_POOL_OR_WATCH)
- && ibuf_debug) {
+ && (ibuf_debug || buf_debug_execute_is_force_flush())) {
/* Try to evict the block from the buffer pool, to use the
insert buffer (change buffer) as much as possible. */
- ulint page_no = buf_block_get_page_no(block);
- if (buf_LRU_free_block(&block->page, TRUE, &have_LRU_mutex)) {
- mutex_exit(block_mutex);
+ /* To obey the latching order, release the
+ block->mutex before acquiring buf_pool->LRU_list_mutex. Protect
+ the block from changes by temporarily buffer-fixing it
+ for the time we are not holding block->mutex. */
+
+ buf_block_buf_fix_inc(block, file, line);
+ mutex_exit(&block->mutex);
+ mutex_enter(&buf_pool->LRU_list_mutex);
+ mutex_enter(&block->mutex);
+ buf_block_buf_fix_dec(block);
+
+ if (buf_LRU_free_page(&block->page, true)) {
+ mutex_exit(&block->mutex);
+ rw_lock_x_lock(hash_lock);
+
if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
/* Set the watch, as it would have
been set if the page were not in the
buffer pool in the first place. */
block = (buf_block_t*) buf_pool_watch_set(
space, offset, fold);
+ } else {
+ block = (buf_block_t*) buf_page_hash_get_low(
+ buf_pool, space, offset, fold);
+ }
- if (UNIV_LIKELY_NULL(block)) {
- block_mutex = buf_page_get_mutex((buf_page_t*)block);
- ut_a(block_mutex);
- ut_ad(mutex_own(block_mutex));
+ rw_lock_x_unlock(hash_lock);
- /* The page entered the buffer
- pool for some reason. Try to
- evict it again. */
- goto got_block;
- }
+ if (UNIV_LIKELY_NULL(block)) {
+ /* Either the page has been read in or
+ a watch was set on that in the window
+ where we released the buf_pool::mutex
+ and before we acquire the hash_lock
+ above. Try again. */
+ guess = block;
+ goto loop;
}
- //buf_pool_mutex_exit(buf_pool);
+
fprintf(stderr,
"innodb_change_buffering_debug evict %u %u\n",
(unsigned) space, (unsigned) offset);
return(NULL);
- } else if (UNIV_UNLIKELY(buf_block_get_state(block)
- != BUF_BLOCK_FILE_PAGE
- || (buf_block_get_page_no(block) != page_no)
- || (buf_block_get_space(block) != space))) {
-
- /* buf_LRU_free_block temporarily releases the
- block mutex, and now block points to something
- else. */
- mutex_exit(block_mutex);
- block = NULL;
- goto loop2;
-
} else {
- /* We should not hold LRU mutex below when trying
- to flush page */
- if (have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- have_LRU_mutex = FALSE;
- }
- if (buf_flush_page_try(buf_pool, block)) {
- fprintf(stderr,
- "innodb_change_buffering_debug flush %u %u\n",
- (unsigned) space, (unsigned) offset);
- guess = block;
- goto loop;
- }
+ mutex_exit(&buf_pool->LRU_list_mutex);
+ }
+
+ if (buf_flush_page_try(buf_pool, block)) {
+ fprintf(stderr,
+ "innodb_change_buffering_debug flush %u %u\n",
+ (unsigned) space, (unsigned) offset);
+ guess = block;
+ goto loop;
}
/* Failed to evict the page; change it directly */
@@ -2904,7 +2929,6 @@ wait_until_unfixed:
ut_a(mode == BUF_GET_POSSIBLY_FREED
|| !block->page.file_page_was_freed);
#endif
-
/* Check if this is the first access to the page */
access_time = buf_page_is_accessed(&block->page);
@@ -2989,6 +3013,11 @@ wait_until_unfixed:
ut_a(ibuf_count_get(buf_block_get_space(block),
buf_block_get_page_no(block)) == 0);
#endif
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
if (UNIV_UNLIKELY(trx && trx->take_stats)) {
_increment_page_get_statistics(block, trx);
}
@@ -3006,8 +3035,7 @@ buf_page_optimistic_get(
/*====================*/
ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
buf_block_t* block, /*!< in: guessed buffer block */
- ib_uint64_t modify_clock,/*!< in: modify clock value if mode is
- ..._GUESS_ON_CLOCK */
+ ib_uint64_t modify_clock,/*!< in: modify clock value */
const char* file, /*!< in: file name */
ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mini-transaction */
@@ -3090,7 +3118,9 @@ buf_page_optimistic_get(
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
- ut_a(block->page.file_page_was_freed == FALSE);
+ mutex_enter(&block->mutex);
+ ut_a(!block->page.file_page_was_freed);
+ mutex_exit(&block->mutex);
#endif
if (UNIV_UNLIKELY(innobase_get_slow_log())) {
trx = innobase_get_trx();
@@ -3201,7 +3231,18 @@ buf_page_get_known_nowait(
ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
- ut_a(mode == BUF_KEEP_OLD || !block->page.file_page_was_freed);
+ if (mode != BUF_KEEP_OLD) {
+ /* If mode == BUF_KEEP_OLD, we are executing an I/O
+ completion routine. Avoid a bogus assertion failure
+ when ibuf_merge_or_delete_for_page() is processing a
+ page that was just freed due to DROP INDEX, or
+ deleting a record from SYS_INDEXES. This check will be
+ skipped in recv_recover_page() as well. */
+
+ mutex_enter(&block->mutex);
+ ut_a(!block->page.file_page_was_freed);
+ mutex_exit(&block->mutex);
+ }
#endif
#ifdef UNIV_IBUF_COUNT_DEBUG
@@ -3226,7 +3267,7 @@ buf_page_get_known_nowait(
/*******************************************************************//**
Given a tablespace id and page number tries to get that page. If the
page is not in the buffer pool it is not loaded and NULL is returned.
-Suitable for using when holding the kernel mutex.
+Suitable for using when holding the lock_sys_t::mutex.
@return pointer to a page or NULL */
UNIV_INTERN
const buf_block_t*
@@ -3242,25 +3283,25 @@ buf_page_try_get_func(
ibool success;
ulint fix_type;
buf_pool_t* buf_pool = buf_pool_get(space_id, page_no);
+ prio_rw_lock_t* hash_lock;
ut_ad(mtr);
ut_ad(mtr->state == MTR_ACTIVE);
- //buf_pool_mutex_enter(buf_pool);
- rw_lock_s_lock(&buf_pool->page_hash_latch);
- block = buf_block_hash_get(buf_pool, space_id, page_no);
+ block = buf_block_hash_get_s_locked(buf_pool, space_id,
+ page_no, &hash_lock);
if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
+ if (block) {
+ rw_lock_s_unlock(hash_lock);
+ }
return(NULL);
}
ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
mutex_enter(&block->mutex);
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
+ rw_lock_s_unlock(hash_lock);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
@@ -3299,7 +3340,9 @@ buf_page_try_get_func(
ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
- ut_a(block->page.file_page_was_freed == FALSE);
+ mutex_enter(&block->mutex);
+ ut_a(!block->page.file_page_was_freed);
+ mutex_exit(&block->mutex);
#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
@@ -3352,13 +3395,15 @@ buf_page_init(
buf_page_t* hash_page;
ut_ad(buf_pool == buf_pool_get(space, offset));
- //ut_ad(buf_pool_mutex_own(buf_pool));
-#ifdef UNIV_SYNC_DEBUG
- ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_EX));
-#endif
+
ut_ad(mutex_own(&(block->mutex)));
ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(buf_page_hash_lock_get(buf_pool, fold),
+ RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
/* Set the state of the block */
buf_block_set_file_page(block, space, offset);
@@ -3384,14 +3429,17 @@ buf_page_init(
if (UNIV_LIKELY(!hash_page)) {
} else if (buf_pool_watch_is_sentinel(buf_pool, hash_page)) {
/* Preserve the reference count. */
- ulint buf_fix_count;
mutex_enter(&buf_pool->zip_mutex);
- buf_fix_count = hash_page->buf_fix_count;
+
+ ulint buf_fix_count = hash_page->buf_fix_count;
+
ut_a(buf_fix_count > 0);
block->page.buf_fix_count += buf_fix_count;
buf_pool_watch_remove(buf_pool, fold, hash_page);
+
mutex_exit(&buf_pool->zip_mutex);
+
} else {
fprintf(stderr,
"InnoDB: Error: page %lu %lu already found"
@@ -3401,8 +3449,6 @@ buf_page_init(
(const void*) hash_page, (const void*) block);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
mutex_exit(&block->mutex);
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
buf_print();
buf_LRU_print();
buf_validate();
@@ -3435,7 +3481,7 @@ UNIV_INTERN
buf_page_t*
buf_page_init_for_read(
/*===================*/
- ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
+ dberr_t* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */
ulint space, /*!< in: space id */
ulint zip_size,/*!< in: compressed page size, or 0 */
@@ -3449,6 +3495,7 @@ buf_page_init_for_read(
buf_block_t* block;
buf_page_t* bpage = NULL;
buf_page_t* watch_page;
+ prio_rw_lock_t* hash_lock;
mtr_t mtr;
ulint fold;
ibool lru = FALSE;
@@ -3477,8 +3524,7 @@ buf_page_init_for_read(
ut_ad(mode == BUF_READ_ANY_PAGE);
}
- if (zip_size && UNIV_LIKELY(!unzip)
- && UNIV_LIKELY(!recv_recovery_is_on())) {
+ if (zip_size && !unzip && !recv_recovery_is_on()) {
block = NULL;
} else {
block = buf_LRU_get_free_block(buf_pool);
@@ -3487,29 +3533,24 @@ buf_page_init_for_read(
}
fold = buf_page_address_fold(space, offset);
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
- //buf_pool_mutex_enter(buf_pool);
ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
mutex_enter(&buf_pool->LRU_list_mutex);
- rw_lock_x_lock(&buf_pool->page_hash_latch);
+ rw_lock_x_lock(hash_lock);
watch_page = buf_page_hash_get_low(buf_pool, space, offset, fold);
-
if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) {
/* The page is already in the buffer pool. */
watch_page = NULL;
err_exit:
+ mutex_exit(&buf_pool->LRU_list_mutex);
+ rw_lock_x_unlock(hash_lock);
if (block) {
mutex_enter(&block->mutex);
- mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
- buf_LRU_block_free_non_file_page(block, FALSE);
+ buf_LRU_block_free_non_file_page(block);
mutex_exit(&block->mutex);
}
- else {
- mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
- }
bpage = NULL;
goto func_exit;
@@ -3526,16 +3567,18 @@ err_exit:
if (block) {
bpage = &block->page;
+
mutex_enter(&block->mutex);
ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
buf_page_init(buf_pool, space, offset, fold, zip_size, block);
-
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
+ rw_lock_x_unlock(hash_lock);
/* The block must be put to the LRU list, to the old blocks */
buf_LRU_add_block(bpage, TRUE/* to old blocks */);
+ mutex_exit(&buf_pool->LRU_list_mutex);
+ lru = TRUE;
/* We set a pass-type x-lock on the frame because then
the same thread which called for the read operation
@@ -3549,19 +3592,20 @@ err_exit:
rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
buf_page_set_io_fix(bpage, BUF_IO_READ);
- if (UNIV_UNLIKELY(zip_size)) {
- /* buf_pool->mutex may be released and
+ if (zip_size) {
+ /* buf_pool->LRU_list_mutex may be released and
reacquired by buf_buddy_alloc(). Thus, we
must release block->mutex in order not to
break the latching order in the reacquisition
- of buf_pool->mutex. We also must defer this
+ of buf_pool->LRU_list_mutex. We also must defer this
operation until after the block descriptor has
been added to buf_pool->LRU and
buf_pool->page_hash. */
mutex_exit(&block->mutex);
- data = buf_buddy_alloc(buf_pool, zip_size, &lru, FALSE);
+ mutex_enter(&buf_pool->LRU_list_mutex);
+ data = buf_buddy_alloc(buf_pool, zip_size, &lru);
mutex_enter(&block->mutex);
- block->page.zip.data = data;
+ block->page.zip.data = (page_zip_t*) data;
/* To maintain the invariant
block->in_unzip_LRU_list
@@ -3570,35 +3614,39 @@ err_exit:
after block->page.zip.data is set. */
ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
buf_unzip_LRU_add_block(block, TRUE);
+ mutex_exit(&buf_pool->LRU_list_mutex);
}
- mutex_exit(&buf_pool->LRU_list_mutex);
mutex_exit(&block->mutex);
} else {
+ rw_lock_x_unlock(hash_lock);
+
/* The compressed page must be allocated before the
control block (bpage), in order to avoid the
invocation of buf_buddy_relocate_block() on
uninitialized data. */
- data = buf_buddy_alloc(buf_pool, zip_size, &lru, TRUE);
+ data = buf_buddy_alloc(buf_pool, zip_size, &lru);
+
+ rw_lock_x_lock(hash_lock);
/* If buf_buddy_alloc() allocated storage from the LRU list,
- it released and reacquired buf_pool->mutex. Thus, we must
- check the page_hash again, as it may have been modified. */
+ it released and reacquired buf_pool->LRU_list_mutex. Thus, we
+ must check the page_hash again, as it may have been
+ modified. */
if (UNIV_UNLIKELY(lru)) {
watch_page = buf_page_hash_get_low(
buf_pool, space, offset, fold);
- if (watch_page
+ if (UNIV_UNLIKELY(watch_page
&& !buf_pool_watch_is_sentinel(buf_pool,
- watch_page)) {
+ watch_page))) {
/* The block was added by some other thread. */
- watch_page = NULL;
- buf_buddy_free(buf_pool, data, zip_size, TRUE);
-
mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
+ rw_lock_x_unlock(hash_lock);
+ watch_page = NULL;
+ buf_buddy_free(buf_pool, data, zip_size);
bpage = NULL;
goto func_exit;
@@ -3612,11 +3660,11 @@ err_exit:
page_zip_des_init(&bpage->zip);
page_zip_set_size(&bpage->zip, zip_size);
- bpage->zip.data = data;
+ bpage->zip.data = (page_zip_t*) data;
mutex_enter(&buf_pool->zip_mutex);
UNIV_MEM_DESC(bpage->zip.data,
- page_zip_get_size(&bpage->zip), bpage);
+ page_zip_get_size(&bpage->zip));
buf_page_init_low(bpage);
@@ -3629,15 +3677,17 @@ err_exit:
bpage->in_zip_hash = FALSE;
bpage->in_flush_list = FALSE;
bpage->in_free_list = FALSE;
-#endif /* UNIV_DEBUG */
bpage->in_LRU_list = FALSE;
+#endif /* UNIV_DEBUG */
ut_d(bpage->in_page_hash = TRUE);
if (UNIV_LIKELY_NULL(watch_page)) {
+
/* Preserve the reference count. */
ulint buf_fix_count = watch_page->buf_fix_count;
ut_a(buf_fix_count > 0);
+ ut_ad(buf_own_zip_mutex_for_page(bpage));
bpage->buf_fix_count += buf_fix_count;
ut_ad(buf_pool_watch_is_sentinel(buf_pool, watch_page));
buf_pool_watch_remove(buf_pool, fold, watch_page);
@@ -3646,15 +3696,14 @@ err_exit:
HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold,
bpage);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
+ rw_lock_x_unlock(hash_lock);
- /* The block must be put to the LRU list, to the old blocks
+ /* The block must be put to the LRU list, to the old blocks.
The zip_size is already set into the page zip */
buf_LRU_add_block(bpage, TRUE/* to old blocks */);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
buf_LRU_insert_zip_clean(bpage);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
mutex_exit(&buf_pool->LRU_list_mutex);
buf_page_set_io_fix(bpage, BUF_IO_READ);
@@ -3662,17 +3711,20 @@ err_exit:
mutex_exit(&buf_pool->zip_mutex);
}
- buf_pool_mutex_enter(buf_pool);
- buf_pool->n_pend_reads++;
- buf_pool_mutex_exit(buf_pool);
+ os_atomic_increment_ulint(&buf_pool->n_pend_reads, 1);
func_exit:
- //buf_pool_mutex_exit(buf_pool);
if (mode == BUF_READ_IBUF_PAGES_ONLY) {
ibuf_mtr_commit(&mtr);
}
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
ut_ad(!bpage || buf_page_in_file(bpage));
return(bpage);
}
@@ -3697,7 +3749,8 @@ buf_page_create(
buf_block_t* block;
ulint fold;
buf_block_t* free_block = NULL;
- buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ prio_rw_lock_t* hash_lock;
ut_ad(mtr);
ut_ad(mtr->state == MTR_ACTIVE);
@@ -3706,11 +3759,11 @@ buf_page_create(
free_block = buf_LRU_get_free_block(buf_pool);
fold = buf_page_address_fold(space, offset);
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
- //buf_pool_mutex_enter(buf_pool);
ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
mutex_enter(&buf_pool->LRU_list_mutex);
- rw_lock_x_lock(&buf_pool->page_hash_latch);
+ rw_lock_x_lock(hash_lock);
block = (buf_block_t*) buf_page_hash_get_low(
buf_pool, space, offset, fold);
@@ -3726,9 +3779,8 @@ buf_page_create(
#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
/* Page can be found in buf_pool */
- //buf_pool_mutex_exit(buf_pool);
+ rw_lock_x_unlock(hash_lock);
mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
buf_block_free(free_block);
@@ -3749,8 +3801,9 @@ buf_page_create(
mutex_enter(&block->mutex);
- buf_page_init(buf_pool, space, offset, fold, zip_size,block);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
+ buf_page_init(buf_pool, space, offset, fold, zip_size, block);
+
+ rw_lock_x_unlock(hash_lock);
/* The block must be put to the LRU list */
buf_LRU_add_block(&block->page, FALSE);
@@ -3763,21 +3816,22 @@ buf_page_create(
ibool lru;
/* Prevent race conditions during buf_buddy_alloc(),
- which may release and reacquire buf_pool->mutex,
+ which may release and reacquire buf_pool->LRU_list_mutex,
by IO-fixing and X-latching the block. */
buf_page_set_io_fix(&block->page, BUF_IO_READ);
rw_lock_x_lock(&block->lock);
+
mutex_exit(&block->mutex);
- /* buf_pool->mutex may be released and reacquired by
+ /* buf_pool->LRU_list_mutex may be released and reacquired by
buf_buddy_alloc(). Thus, we must release block->mutex
in order not to break the latching order in
- the reacquisition of buf_pool->mutex. We also must
+ the reacquisition of buf_pool->LRU_list_mutex. We also must
defer this operation until after the block descriptor
has been added to buf_pool->LRU and buf_pool->page_hash. */
- data = buf_buddy_alloc(buf_pool, zip_size, &lru, FALSE);
+ data = buf_buddy_alloc(buf_pool, zip_size, &lru);
mutex_enter(&block->mutex);
- block->page.zip.data = data;
+ block->page.zip.data = (page_zip_t*) data;
/* To maintain the invariant
block->in_unzip_LRU_list
@@ -3804,9 +3858,6 @@ buf_page_create(
ibuf_merge_or_delete_for_page(NULL, space, offset, zip_size, TRUE);
- /* Flush pages from the end of the LRU list if necessary */
- buf_flush_free_margin(buf_pool, FALSE);
-
frame = block->frame;
memset(frame + FIL_PAGE_PREV, 0xff, 4);
@@ -3832,6 +3883,114 @@ buf_page_create(
}
/********************************************************************//**
+Monitor the buffer page read/write activity, and increment corresponding
+counter value if MONITOR_MODULE_BUF_PAGE (module_buf_page) module is
+enabled. */
+static
+void
+buf_page_monitor(
+/*=============*/
+ const buf_page_t* bpage, /*!< in: pointer to the block */
+ enum buf_io_fix io_type)/*!< in: io_fix types */
+{
+ const byte* frame;
+ monitor_id_t counter;
+
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ /* If the counter module is not turned on, just return */
+ if (!MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)) {
+ return;
+ }
+
+ ut_a(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
+
+ frame = bpage->zip.data
+ ? bpage->zip.data
+ : ((buf_block_t*) bpage)->frame;
+
+ switch (fil_page_get_type(frame)) {
+ ulint level;
+
+ case FIL_PAGE_INDEX:
+ level = btr_page_get_level_low(frame);
+
+ /* Check if it is an index page for insert buffer */
+ if (btr_page_get_index_id(frame)
+ == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
+ if (level == 0) {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_IBUF_LEAF_PAGE);
+ } else {
+ counter = MONITOR_RW_COUNTER(
+ io_type,
+ MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
+ }
+ } else {
+ if (level == 0) {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_LEAF_PAGE);
+ } else {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_NON_LEAF_PAGE);
+ }
+ }
+ break;
+
+ case FIL_PAGE_UNDO_LOG:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE);
+ break;
+
+ case FIL_PAGE_INODE:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE);
+ break;
+
+ case FIL_PAGE_IBUF_FREE_LIST:
+ counter = MONITOR_RW_COUNTER(io_type,
+ MONITOR_IBUF_FREELIST_PAGE);
+ break;
+
+ case FIL_PAGE_IBUF_BITMAP:
+ counter = MONITOR_RW_COUNTER(io_type,
+ MONITOR_IBUF_BITMAP_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_SYS:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_TRX_SYS:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_FSP_HDR:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_XDES:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_BLOB:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_ZBLOB:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_ZBLOB2:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE);
+ break;
+
+ default:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE);
+ }
+
+ MONITOR_INC_NOCHECK(counter);
+}
+
+/********************************************************************//**
Mark a table with the specified space pointed by bpage->space corrupted.
Also remove the bpage from LRU list.
@return TRUE if successful */
@@ -3846,12 +4005,14 @@ buf_mark_space_corrupt(
== BUF_BLOCK_FILE_PAGE);
ulint space = bpage->space;
ibool ret = TRUE;
+ const ulint fold = buf_page_address_fold(bpage->space,
+ bpage->offset);
+ prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
/* First unfix and release lock on the bpage */
- //buf_pool_mutex_enter(buf_pool);
ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
mutex_enter(&buf_pool->LRU_list_mutex);
- rw_lock_x_lock(&buf_pool->page_hash_latch);
+ rw_lock_x_lock(hash_lock);
mutex_enter(buf_page_get_mutex(bpage));
ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
ut_ad(bpage->buf_fix_count == 0);
@@ -3869,18 +4030,14 @@ buf_mark_space_corrupt(
if (dict_set_corrupted_by_space(space)) {
buf_LRU_free_one_page(bpage);
} else {
+ mutex_exit(buf_page_get_mutex(bpage));
ret = FALSE;
}
- buf_pool_mutex_enter(buf_pool);
- ut_ad(buf_pool->n_pend_reads > 0);
- buf_pool->n_pend_reads--;
- buf_pool_mutex_exit(buf_pool);
-
- mutex_exit(buf_page_get_mutex(bpage));
- //buf_pool_mutex_exit(buf_pool);
mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
+
+ ut_ad(buf_pool->n_pend_reads > 0);
+ os_atomic_decrement_ulint(&buf_pool->n_pend_reads, 1);
return(ret);
}
@@ -3888,9 +4045,9 @@ buf_mark_space_corrupt(
/********************************************************************//**
Completes an asynchronous read or write request of a file page to or from
the buffer pool.
-@return TRUE if successful */
+@return true if successful */
UNIV_INTERN
-ibool
+bool
buf_page_io_complete(
/*=================*/
buf_page_t* bpage) /*!< in: pointer to the block in question */
@@ -3899,8 +4056,7 @@ buf_page_io_complete(
buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
const ibool uncompressed = (buf_page_get_state(bpage)
== BUF_BLOCK_FILE_PAGE);
- ibool have_LRU_mutex = FALSE;
- mutex_t* block_mutex;
+ bool have_LRU_mutex = false;
ut_a(buf_page_in_file(bpage));
@@ -3920,15 +4076,16 @@ buf_page_io_complete(
if (buf_page_get_zip_size(bpage)) {
frame = bpage->zip.data;
- buf_pool->n_pend_unzip++;
+ os_atomic_increment_ulint(&buf_pool->n_pend_unzip, 1);
if (uncompressed
&& !buf_zip_decompress((buf_block_t*) bpage,
FALSE)) {
- buf_pool->n_pend_unzip--;
+ os_atomic_decrement_ulint(
+ &buf_pool->n_pend_unzip, 1);
goto corrupt;
}
- buf_pool->n_pend_unzip--;
+ os_atomic_decrement_ulint(&buf_pool->n_pend_unzip, 1);
} else {
ut_a(uncompressed);
frame = ((buf_block_t*) bpage)->frame;
@@ -3941,9 +4098,8 @@ buf_page_io_complete(
read_space_id = mach_read_from_4(
frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
- if ((bpage->space == TRX_SYS_SPACE
- || (srv_doublewrite_file && bpage->space == TRX_DOUBLEWRITE_SPACE))
- && trx_doublewrite_page_inside(bpage->offset)) {
+ if (bpage->space == TRX_SYS_SPACE
+ && buf_dblwr_page_inside(bpage->offset)) {
ut_print_timestamp(stderr);
fprintf(stderr,
@@ -3977,8 +4133,20 @@ buf_page_io_complete(
/* From version 3.23.38 up we store the page checksum
to the 4 first bytes of the page end lsn field */
- if (buf_page_is_corrupted(TRUE, frame,
+ if (buf_page_is_corrupted(true, frame,
buf_page_get_zip_size(bpage))) {
+
+ /* Not a real corruption if it was triggered by
+ error injection */
+ DBUG_EXECUTE_IF("buf_page_is_corrupt_failure",
+ if (bpage->space > TRX_SYS_SPACE
+ && buf_mark_space_corrupt(bpage)) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Simulated page corruption");
+ return(true);
+ }
+ goto page_not_corrupt;
+ ;);
corrupt:
fprintf(stderr,
"InnoDB: Database page corruption on disk"
@@ -4016,7 +4184,7 @@ corrupt:
REFMAN "forcing-innodb-recovery.html\n"
"InnoDB: about forcing recovery.\n", stderr);
- if (srv_pass_corrupt_table && !trx_sys_sys_space(bpage->space)
+ if (srv_pass_corrupt_table && bpage->space != 0
&& bpage->space < SRV_LOG_SPACE_FIRST_ID) {
trx_t* trx;
@@ -4039,7 +4207,7 @@ corrupt:
table as corrupted instead of crashing server */
if (bpage->space > TRX_SYS_SPACE
&& buf_mark_space_corrupt(bpage)) {
- return(FALSE);
+ return(false);
} else {
fputs("InnoDB: Ending processing"
" because of"
@@ -4051,6 +4219,9 @@ corrupt:
}
} /**/
+ DBUG_EXECUTE_IF("buf_page_is_corrupt_failure",
+ page_not_corrupt: bpage = bpage; );
+
if (recv_recovery_is_on()) {
/* Pages must be uncompressed for crash recovery. */
ut_a(uncompressed);
@@ -4088,14 +4259,17 @@ corrupt:
buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY ||
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU)) {
+
+ have_LRU_mutex = TRUE; /* optimistic */
}
retry_mutex:
- if (!have_LRU_mutex) {
+ if (have_LRU_mutex) {
mutex_enter(&buf_pool->LRU_list_mutex);
- have_LRU_mutex = TRUE;
}
- block_mutex = buf_page_get_mutex_enter(bpage);
- ut_a(block_mutex);
+
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+
if (io_type == BUF_IO_WRITE
&& (
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
@@ -4108,7 +4282,6 @@ retry_mutex:
mutex_exit(block_mutex);
goto retry_mutex;
}
- buf_pool_mutex_enter(buf_pool);
#ifdef UNIV_IBUF_COUNT_DEBUG
if (io_type == BUF_IO_WRITE || uncompressed) {
@@ -4123,23 +4296,20 @@ retry_mutex:
removes the newest lock debug record, without checking the thread
id. */
- buf_page_set_io_fix(bpage, BUF_IO_NONE);
-
switch (io_type) {
case BUF_IO_READ:
+
+ buf_page_set_io_fix(bpage, BUF_IO_NONE);
+
/* NOTE that the call to ibuf may have moved the ownership of
the x-latch to this OS thread: do not let this confuse you in
debugging! */
- if (have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- have_LRU_mutex = FALSE;
- }
-
- ut_a(!have_LRU_mutex);
ut_ad(buf_pool->n_pend_reads > 0);
- buf_pool->n_pend_reads--;
- buf_pool->stat.n_pages_read++;
+ os_atomic_decrement_ulint(&buf_pool->n_pend_reads, 1);
+ os_atomic_increment_ulint(&buf_pool->stat.n_pages_read, 1);
+
+ ut_ad(!have_LRU_mutex);
if (uncompressed) {
rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock,
@@ -4154,9 +4324,10 @@ retry_mutex:
buf_flush_write_complete(bpage);
+ os_atomic_increment_ulint(&buf_pool->stat.n_pages_written, 1);
+
if (have_LRU_mutex) {
mutex_exit(&buf_pool->LRU_list_mutex);
- have_LRU_mutex = FALSE;
}
if (uncompressed) {
@@ -4164,14 +4335,14 @@ retry_mutex:
BUF_IO_WRITE);
}
- buf_pool->stat.n_pages_written++;
-
break;
default:
ut_error;
}
+ buf_page_monitor(bpage, io_type);
+
#ifdef UNIV_DEBUG
if (buf_debug_prints) {
fprintf(stderr, "Has %s page space %lu page no %lu\n",
@@ -4181,40 +4352,9 @@ retry_mutex:
}
#endif /* UNIV_DEBUG */
- buf_pool_mutex_exit(buf_pool);
mutex_exit(block_mutex);
- return(TRUE);
-}
-
-/********************************************************************//**
-*/
-UNIV_INTERN
-buf_block_t*
-buf_page_from_array(
-/*================*/
- buf_pool_t* buf_pool,
- ulint n_block)
-{
- ulint n_chunks, offset;
- buf_chunk_t* chunk;
-
- ut_a(n_block < buf_pool->curr_size);
-
- chunk = buf_pool->chunks;
- offset = n_block;
-
- for (n_chunks = buf_pool->n_chunks; n_chunks--; chunk++) {
- if (offset < chunk->size) {
- return(&chunk->blocks[offset]);
- }
-
- offset -= chunk->size;
- }
-
- ut_error;
-
- return(NULL);
+ return(true);
}
/*********************************************************************//**
@@ -4231,17 +4371,17 @@ buf_all_freed_instance(
ut_ad(buf_pool);
- //buf_pool_mutex_enter(buf_pool);
ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
- mutex_enter(&buf_pool->LRU_list_mutex);
- rw_lock_x_lock(&buf_pool->page_hash_latch);
-
chunk = buf_pool->chunks;
for (i = buf_pool->n_chunks; i--; chunk++) {
+ mutex_enter(&buf_pool->LRU_list_mutex);
+
const buf_block_t* block = buf_chunk_not_freed(chunk);
+ mutex_exit(&buf_pool->LRU_list_mutex);
+
if (UNIV_LIKELY_NULL(block)) {
fprintf(stderr,
"Page %lu %lu still fixed or dirty\n",
@@ -4251,10 +4391,6 @@ buf_all_freed_instance(
}
}
- //buf_pool_mutex_exit(buf_pool);
- mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
-
return(TRUE);
}
@@ -4266,10 +4402,11 @@ buf_pool_invalidate_instance(
/*=========================*/
buf_pool_t* buf_pool) /*!< in: buffer pool instance */
{
- ibool freed;
- enum buf_flush i;
+ ulint i;
- buf_pool_mutex_enter(buf_pool);
+ ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+
+ mutex_enter(&buf_pool->flush_state_mutex);
for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
@@ -4285,23 +4422,20 @@ buf_pool_invalidate_instance(
pool invalidation to proceed we must ensure there is NO
write activity happening. */
if (buf_pool->n_flush[i] > 0) {
- buf_pool_mutex_exit(buf_pool);
- buf_flush_wait_batch_end(buf_pool, i);
- buf_pool_mutex_enter(buf_pool);
+ buf_flush_t type = static_cast<buf_flush_t>(i);
+
+ mutex_exit(&buf_pool->flush_state_mutex);
+ buf_flush_wait_batch_end(buf_pool, type);
+ mutex_enter(&buf_pool->flush_state_mutex);
}
}
-
- buf_pool_mutex_exit(buf_pool);
+ mutex_exit(&buf_pool->flush_state_mutex);
ut_ad(buf_all_freed_instance(buf_pool));
- freed = TRUE;
-
- while (freed) {
- freed = buf_LRU_search_and_free_block(buf_pool, 100);
+ while (buf_LRU_scan_and_free_block(buf_pool, TRUE)) {
}
- //buf_pool_mutex_enter(buf_pool);
ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
mutex_enter(&buf_pool->LRU_list_mutex);
@@ -4311,13 +4445,11 @@ buf_pool_invalidate_instance(
buf_pool->freed_page_clock = 0;
buf_pool->LRU_old = NULL;
buf_pool->LRU_old_len = 0;
- buf_pool->LRU_flush_ended = 0;
+
+ mutex_exit(&buf_pool->LRU_list_mutex);
memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat));
buf_refresh_io_stats(buf_pool);
-
- //buf_pool_mutex_exit(buf_pool);
- mutex_exit(&buf_pool->LRU_list_mutex);
}
/*********************************************************************//**
@@ -4349,21 +4481,25 @@ buf_pool_validate_instance(
buf_page_t* b;
buf_chunk_t* chunk;
ulint i;
- ulint n_single_flush = 0;
ulint n_lru_flush = 0;
+ ulint n_page_flush = 0;
ulint n_list_flush = 0;
ulint n_lru = 0;
ulint n_flush = 0;
ulint n_free = 0;
ulint n_zip = 0;
+ ulint fold = 0;
+ ulint space = 0;
+ ulint offset = 0;
ut_ad(buf_pool);
- //buf_pool_mutex_enter(buf_pool);
ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
mutex_enter(&buf_pool->LRU_list_mutex);
- rw_lock_x_lock(&buf_pool->page_hash_latch);
- /* for keep the new latch order, it cannot validate correctly... */
+ hash_lock_x_all(buf_pool->page_hash);
+ mutex_enter(&buf_pool->zip_mutex);
+ mutex_enter(&buf_pool->free_list_mutex);
+ mutex_enter(&buf_pool->flush_state_mutex);
chunk = buf_pool->chunks;
@@ -4376,10 +4512,8 @@ buf_pool_validate_instance(
for (j = chunk->size; j--; block++) {
- mutex_enter(&block->mutex);
-
switch (buf_block_get_state(block)) {
- case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_POOL_WATCH:
case BUF_BLOCK_ZIP_PAGE:
case BUF_BLOCK_ZIP_DIRTY:
/* These should only occur on
@@ -4388,22 +4522,26 @@ buf_pool_validate_instance(
break;
case BUF_BLOCK_FILE_PAGE:
- ut_a(buf_page_hash_get(buf_pool,
- buf_block_get_space(
- block),
- buf_block_get_page_no(
- block))
+
+ space = buf_block_get_space(block);
+ offset = buf_block_get_page_no(block);
+ fold = buf_page_address_fold(space, offset);
+ ut_a(buf_page_hash_get_low(buf_pool,
+ space,
+ offset,
+ fold)
== &block->page);
#ifdef UNIV_IBUF_COUNT_DEBUG
- ut_a(buf_page_get_io_fix(&block->page)
+ ut_a(buf_page_get_io_fix_unlocked(&block->page)
== BUF_IO_READ
|| !ibuf_count_get(buf_block_get_space(
block),
buf_block_get_page_no(
block)));
#endif
- switch (buf_page_get_io_fix(&block->page)) {
+ switch (buf_page_get_io_fix_unlocked(
+ &block->page)) {
case BUF_IO_NONE:
break;
@@ -4411,16 +4549,8 @@ buf_pool_validate_instance(
switch (buf_page_get_flush_type(
&block->page)) {
case BUF_FLUSH_LRU:
- n_lru_flush++;
- ut_a(rw_lock_is_locked(
- &block->lock,
- RW_LOCK_SHARED));
- break;
- case BUF_FLUSH_LIST:
- n_list_flush++;
- break;
case BUF_FLUSH_SINGLE_PAGE:
- n_single_flush++;
+ case BUF_FLUSH_LIST:
break;
default:
ut_error;
@@ -4451,17 +4581,13 @@ buf_pool_validate_instance(
/* do nothing */
break;
}
-
- mutex_exit(&block->mutex);
}
}
- mutex_enter(&buf_pool->zip_mutex);
-
/* Check clean compressed-only blocks. */
for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
- b = UT_LIST_GET_NEXT(zip_list, b)) {
+ b = UT_LIST_GET_NEXT(list, b)) {
ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
switch (buf_page_get_io_fix(b)) {
case BUF_IO_NONE:
@@ -4469,7 +4595,7 @@ buf_pool_validate_instance(
/* All clean blocks should be I/O-unfixed. */
break;
case BUF_IO_READ:
- /* In buf_LRU_free_block(), we temporarily set
+ /* In buf_LRU_free_page(), we temporarily set
b->io_fix = BUF_IO_READ for a newly allocated
control block in order to prevent
buf_page_get_gen() from decompressing the block. */
@@ -4483,8 +4609,9 @@ buf_pool_validate_instance(
we have acquired buf_pool->zip_mutex above which acts
as the 'block->mutex' for these bpages. */
ut_a(!b->oldest_modification);
- ut_a(buf_page_hash_get(buf_pool, b->space, b->offset) == b);
-
+ fold = buf_page_address_fold(b->space, b->offset);
+ ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset,
+ fold) == b);
n_lru++;
n_zip++;
}
@@ -4493,7 +4620,7 @@ buf_pool_validate_instance(
buf_flush_list_mutex_enter(buf_pool);
for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
- b = UT_LIST_GET_NEXT(flush_list, b)) {
+ b = UT_LIST_GET_NEXT(list, b)) {
ut_ad(b->in_flush_list);
ut_a(b->oldest_modification);
n_flush++;
@@ -4502,7 +4629,9 @@ buf_pool_validate_instance(
case BUF_BLOCK_ZIP_DIRTY:
n_lru++;
n_zip++;
- switch (buf_page_get_io_fix(b)) {
+ /* fallthrough */
+ case BUF_BLOCK_FILE_PAGE:
+ switch (buf_page_get_io_fix_unlocked(b)) {
case BUF_IO_NONE:
case BUF_IO_READ:
case BUF_IO_PIN:
@@ -4512,22 +4641,21 @@ buf_pool_validate_instance(
case BUF_FLUSH_LRU:
n_lru_flush++;
break;
+ case BUF_FLUSH_SINGLE_PAGE:
+ n_page_flush++;
+ break;
case BUF_FLUSH_LIST:
n_list_flush++;
break;
- case BUF_FLUSH_SINGLE_PAGE:
- n_single_flush++;
- break;
default:
ut_error;
}
break;
+ default:
+ ut_error;
}
break;
- case BUF_BLOCK_FILE_PAGE:
- /* uncompressed page */
- break;
- case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_POOL_WATCH:
case BUF_BLOCK_ZIP_PAGE:
case BUF_BLOCK_NOT_USED:
case BUF_BLOCK_READY_FOR_USE:
@@ -4536,11 +4664,14 @@ buf_pool_validate_instance(
ut_error;
break;
}
- ut_a(buf_page_hash_get(buf_pool, b->space, b->offset) == b);
+ fold = buf_page_address_fold(b->space, b->offset);
+ ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset,
+ fold) == b);
}
ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
+ hash_unlock_x_all(buf_pool->page_hash);
buf_flush_list_mutex_exit(buf_pool);
mutex_exit(&buf_pool->zip_mutex);
@@ -4553,8 +4684,9 @@ buf_pool_validate_instance(
}
ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
- /* because of latching order with block->mutex, we cannot get needed mutexes before that */
-/*
+
+ mutex_exit(&buf_pool->LRU_list_mutex);
+
if (UT_LIST_GET_LEN(buf_pool->free) != n_free) {
fprintf(stderr, "Free list len %lu, free blocks %lu\n",
(ulong) UT_LIST_GET_LEN(buf_pool->free),
@@ -4562,14 +4694,13 @@ buf_pool_validate_instance(
ut_error;
}
- ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush);
+ mutex_exit(&buf_pool->free_list_mutex);
+
ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
-*/
+ ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush);
- //buf_pool_mutex_exit(buf_pool);
- mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
+ mutex_exit(&buf_pool->flush_state_mutex);
ut_a(buf_LRU_validate());
ut_a(buf_flush_validate(buf_pool));
@@ -4622,14 +4753,13 @@ buf_print_instance(
size = buf_pool->curr_size;
- index_ids = mem_alloc(size * sizeof *index_ids);
- counts = mem_alloc(sizeof(ulint) * size);
-
- //buf_pool_mutex_enter(buf_pool);
ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
- mutex_enter(&buf_pool->LRU_list_mutex);
- mutex_enter(&buf_pool->free_list_mutex);
- buf_flush_list_mutex_enter(buf_pool);
+ index_ids = static_cast<index_id_t*>(
+ mem_alloc(size * sizeof *index_ids));
+
+ counts = static_cast<ulint*>(mem_alloc(sizeof(ulint) * size));
+
+ /* Dirty reads below */
fprintf(stderr,
"buf_pool size %lu\n"
@@ -4656,12 +4786,12 @@ buf_print_instance(
(ulong) buf_pool->stat.n_pages_created,
(ulong) buf_pool->stat.n_pages_written);
- buf_flush_list_mutex_exit(buf_pool);
-
/* Count the number of blocks belonging to each index in the buffer */
n_found = 0;
+ mutex_enter(&buf_pool->LRU_list_mutex);
+
chunk = buf_pool->chunks;
for (i = buf_pool->n_chunks; i--; chunk++) {
@@ -4697,9 +4827,7 @@ buf_print_instance(
}
}
- //buf_pool_mutex_exit(buf_pool);
mutex_exit(&buf_pool->LRU_list_mutex);
- mutex_exit(&buf_pool->free_list_mutex);
for (i = 0; i < n_found; i++) {
index = dict_index_get_if_in_cache(index_ids[i]);
@@ -4756,7 +4884,8 @@ buf_get_latched_pages_number_instance(
buf_chunk_t* chunk;
ulint fixed_pages_number = 0;
- //buf_pool_mutex_enter(buf_pool);
+ /* The LRU list mutex is enough to protect the required fields below */
+ mutex_enter(&buf_pool->LRU_list_mutex);
chunk = buf_pool->chunks;
@@ -4773,24 +4902,23 @@ buf_get_latched_pages_number_instance(
continue;
}
- mutex_enter(&block->mutex);
-
if (block->page.buf_fix_count != 0
- || buf_page_get_io_fix(&block->page)
+ || buf_page_get_io_fix_unlocked(&block->page)
!= BUF_IO_NONE) {
fixed_pages_number++;
}
- mutex_exit(&block->mutex);
}
}
+ mutex_exit(&buf_pool->LRU_list_mutex);
+
mutex_enter(&buf_pool->zip_mutex);
/* Traverse the lists of clean and dirty compressed-only blocks. */
for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
- b = UT_LIST_GET_NEXT(zip_list, b)) {
+ b = UT_LIST_GET_NEXT(list, b)) {
ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE);
@@ -4802,7 +4930,7 @@ buf_get_latched_pages_number_instance(
buf_flush_list_mutex_enter(buf_pool);
for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
- b = UT_LIST_GET_NEXT(flush_list, b)) {
+ b = UT_LIST_GET_NEXT(list, b)) {
ut_ad(b->in_flush_list);
switch (buf_page_get_state(b)) {
@@ -4814,13 +4942,16 @@ buf_get_latched_pages_number_instance(
break;
case BUF_BLOCK_FILE_PAGE:
/* uncompressed page */
+ case BUF_BLOCK_REMOVE_HASH:
+ /* We hold flush list but not LRU list mutex here.
+ Thus encountering BUF_BLOCK_REMOVE_HASH pages is
+ possible. */
break;
- case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_POOL_WATCH:
case BUF_BLOCK_ZIP_PAGE:
case BUF_BLOCK_NOT_USED:
case BUF_BLOCK_READY_FOR_USE:
case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
ut_error;
break;
}
@@ -4828,7 +4959,6 @@ buf_get_latched_pages_number_instance(
buf_flush_list_mutex_exit(buf_pool);
mutex_exit(&buf_pool->zip_mutex);
- //buf_pool_mutex_exit(buf_pool);
return(fixed_pages_number);
}
@@ -4859,26 +4989,18 @@ buf_get_latched_pages_number(void)
#endif /* UNIV_DEBUG */
/*********************************************************************//**
-Returns the number of pending buf pool ios.
-@return number of pending I/O operations */
+Returns the number of pending buf pool read ios.
+@return number of pending read I/O operations */
UNIV_INTERN
ulint
-buf_get_n_pending_ios(void)
-/*=======================*/
+buf_get_n_pending_read_ios(void)
+/*============================*/
{
ulint i;
ulint pend_ios = 0;
for (i = 0; i < srv_buf_pool_instances; i++) {
- buf_pool_t* buf_pool;
-
- buf_pool = buf_pool_from_array(i);
-
- pend_ios +=
- buf_pool->n_pend_reads
- + buf_pool->n_flush[BUF_FLUSH_LRU]
- + buf_pool->n_flush[BUF_FLUSH_LIST]
- + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE];
+ pend_ios += buf_pool_from_array(i)->n_pend_reads;
}
return(pend_ios);
@@ -4936,8 +5058,6 @@ buf_stats_aggregate_pool_info(
total_info->n_pend_reads += pool_info->n_pend_reads;
total_info->n_pending_flush_lru += pool_info->n_pending_flush_lru;
total_info->n_pending_flush_list += pool_info->n_pending_flush_list;
- total_info->n_pending_flush_single_page +=
- pool_info->n_pending_flush_single_page;
total_info->n_pages_made_young += pool_info->n_pages_made_young;
total_info->n_pages_not_made_young += pool_info->n_pages_not_made_young;
total_info->n_pages_read += pool_info->n_pages_read;
@@ -4985,12 +5105,7 @@ buf_stats_get_pool_info(
/* Find appropriate pool_info to store stats for this buffer pool */
pool_info = &all_pool_info[pool_id];
-
ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
- mutex_enter(&buf_pool->LRU_list_mutex);
- mutex_enter(&buf_pool->free_list_mutex);
- buf_pool_mutex_enter(buf_pool);
- buf_flush_list_mutex_enter(buf_pool);
pool_info->pool_unique_id = pool_id;
@@ -5010,6 +5125,8 @@ buf_stats_get_pool_info(
pool_info->n_pend_reads = buf_pool->n_pend_reads;
+ mutex_enter(&buf_pool->flush_state_mutex);
+
pool_info->n_pending_flush_lru =
(buf_pool->n_flush[BUF_FLUSH_LRU]
+ buf_pool->init_flush[BUF_FLUSH_LRU]);
@@ -5019,9 +5136,10 @@ buf_stats_get_pool_info(
+ buf_pool->init_flush[BUF_FLUSH_LIST]);
pool_info->n_pending_flush_single_page =
- buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE];
+ (buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
+ + buf_pool->init_flush[BUF_FLUSH_SINGLE_PAGE]);
- buf_flush_list_mutex_exit(buf_pool);
+ mutex_exit(&buf_pool->flush_state_mutex);
current_time = time(NULL);
time_elapsed = 0.001 + difftime(current_time,
@@ -5104,9 +5222,6 @@ buf_stats_get_pool_info(
pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
buf_refresh_io_stats(buf_pool);
- mutex_exit(&buf_pool->LRU_list_mutex);
- mutex_exit(&buf_pool->free_list_mutex);
- buf_pool_mutex_exit(buf_pool);
}
/*********************************************************************//**
@@ -5122,7 +5237,7 @@ buf_print_io_instance(
fprintf(file,
"Buffer pool size %lu\n"
- "Buffer pool size, bytes %lu\n"
+ "Buffer pool size, bytes " ULINTPF "\n"
"Free buffers %lu\n"
"Database pages %lu\n"
"Old database pages %lu\n"
@@ -5212,8 +5327,10 @@ buf_print_io(
pool_info_total = &pool_info[srv_buf_pool_instances];
} else {
ut_a(srv_buf_pool_instances == 1);
- pool_info_total = pool_info = (buf_pool_info_t*) mem_zalloc(
- sizeof *pool_info)
+
+ pool_info_total = pool_info =
+ static_cast<buf_pool_info_t*>(
+ mem_zalloc(sizeof *pool_info));
}
for (i = 0; i < srv_buf_pool_instances; i++) {
@@ -5271,9 +5388,7 @@ void
buf_refresh_io_stats_all(void)
/*==========================*/
{
- ulint i;
-
- for (i = 0; i < srv_buf_pool_instances; i++) {
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
buf_pool_t* buf_pool;
buf_pool = buf_pool_from_array(i);
@@ -5290,9 +5405,7 @@ ibool
buf_all_freed(void)
/*===============*/
{
- ulint i;
-
- for (i = 0; i < srv_buf_pool_instances; i++) {
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
buf_pool_t* buf_pool;
buf_pool = buf_pool_from_array(i);
@@ -5300,7 +5413,7 @@ buf_all_freed(void)
if (!buf_all_freed_instance(buf_pool)) {
return(FALSE);
}
- }
+ }
return(TRUE);
}
@@ -5311,27 +5424,27 @@ pool.
@return number of pending i/o */
UNIV_INTERN
ulint
-buf_pool_check_num_pending_io(void)
-/*===============================*/
+buf_pool_check_no_pending_io(void)
+/*==============================*/
{
ulint i;
ulint pending_io = 0;
- buf_pool_mutex_enter_all();
-
for (i = 0; i < srv_buf_pool_instances; i++) {
- const buf_pool_t* buf_pool;
+ buf_pool_t* buf_pool;
buf_pool = buf_pool_from_array(i);
- pending_io += buf_pool->n_pend_reads
- + buf_pool->n_flush[BUF_FLUSH_LRU]
- + buf_pool->n_flush[BUF_FLUSH_LIST]
- + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE];
+ pending_io += buf_pool->n_pend_reads;
- }
+ mutex_enter(&buf_pool->flush_state_mutex);
- buf_pool_mutex_exit_all();
+ pending_io += buf_pool->n_flush[BUF_FLUSH_LRU];
+ pending_io += buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE];
+ pending_io += buf_pool->n_flush[BUF_FLUSH_LIST];
+
+ mutex_exit(&buf_pool->flush_state_mutex);
+ }
return(pending_io);
}
@@ -5348,12 +5461,10 @@ buf_get_free_list_len(void)
{
ulint len;
- //buf_pool_mutex_enter(buf_pool);
mutex_enter(&buf_pool->free_list_mutex);
len = UT_LIST_GET_LEN(buf_pool->free);
- //buf_pool_mutex_exit(buf_pool);
mutex_exit(&buf_pool->free_list_mutex);
return(len);
@@ -5382,7 +5493,7 @@ buf_page_init_for_backup_restore(
/* We assume that block->page.data has been allocated
with zip_size == UNIV_PAGE_SIZE. */
- ut_ad(zip_size <= UNIV_PAGE_SIZE);
+ ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
ut_ad(ut_is_2pow(zip_size));
page_zip_set_size(&block->page.zip, zip_size);
if (zip_size) {
diff --git a/storage/xtradb/buf/buf0checksum.cc b/storage/xtradb/buf/buf0checksum.cc
new file mode 100644
index 00000000000..ec79bbe6be9
--- /dev/null
+++ b/storage/xtradb/buf/buf0checksum.cc
@@ -0,0 +1,155 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0checksum.cc
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include "fil0fil.h" /* FIL_* */
+#include "ut0crc32.h" /* ut_crc32() */
+#include "ut0rnd.h" /* ut_fold_binary() */
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "srv0srv.h" /* SRV_CHECKSUM_* */
+#include "buf0types.h"
+
+/** the macro MYSQL_SYSVAR_ENUM() requires "long unsigned int" and if we
+use srv_checksum_algorithm_t here then we get a compiler error:
+ha_innodb.cc:12251: error: cannot convert 'srv_checksum_algorithm_t*' to
+ 'long unsigned int*' in initialization */
+UNIV_INTERN ulong srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_INNODB;
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************************//**
+Calculates a page CRC32 which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return checksum */
+UNIV_INTERN
+ib_uint32_t
+buf_calc_page_crc32(
+/*================*/
+ const byte* page) /*!< in: buffer page */
+{
+ ib_uint32_t checksum;
+
+ /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool
+ to the first pages of data files, we have to skip them in the page
+ checksum calculation.
+ We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
+ checksum is stored, and also the last 8 bytes of page because
+ there we store the old formula checksum. */
+
+ checksum = ut_crc32(page + FIL_PAGE_OFFSET,
+ FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
+ ^ ut_crc32(page + FIL_PAGE_DATA,
+ UNIV_PAGE_SIZE - FIL_PAGE_DATA
+ - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+ return(checksum);
+}
+
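The comment above spells out which byte ranges feed the CRC32. As a hedged illustration of how such a checksum would be consumed on the read side, the sketch below recomputes it and compares against the value stored in the FIL_PAGE_SPACE_OR_CHKSUM field; the helper name is hypothetical and this is not the real validation path, which is buf_page_is_corrupted() as seen earlier in this diff.

/* Illustrative sketch only: verify a page image against the CRC32
stored in its FIL_PAGE_SPACE_OR_CHKSUM field. Assumes the checksum
was produced by buf_calc_page_crc32() above. */
static
bool
page_crc32_matches(
/*===============*/
	const byte*	page)	/*!< in: buffer page */
{
	ulint	stored;
	ulint	calculated;

	stored = mach_read_from_4(page + FIL_PAGE_SPACE_OR_CHKSUM);
	calculated = buf_calc_page_crc32(page);

	return(stored == calculated);
}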
+/********************************************************************//**
+Calculates a page checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum(
+/*=======================*/
+ const byte* page) /*!< in: buffer page */
+{
+ ulint checksum;
+
+ /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool
+ to the first pages of data files, we have to skip them in the page
+ checksum calculation.
+ We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
+ checksum is stored, and also the last 8 bytes of page because
+ there we store the old formula checksum. */
+
+ checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
+ FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
+ + ut_fold_binary(page + FIL_PAGE_DATA,
+ UNIV_PAGE_SIZE - FIL_PAGE_DATA
+ - FIL_PAGE_END_LSN_OLD_CHKSUM);
+ checksum = checksum & 0xFFFFFFFFUL;
+
+ return(checksum);
+}
+
+/********************************************************************//**
+In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
+looked at the first few bytes of the page. This calculates that old
+checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_old_checksum(
+/*=======================*/
+ const byte* page) /*!< in: buffer page */
+{
+ ulint checksum;
+
+ checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
+
+ checksum = checksum & 0xFFFFFFFFUL;
+
+ return(checksum);
+}
+
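To make the NOTE above concrete: the old formula hashes the first FIL_PAGE_FILE_FLUSH_LSN bytes of the page, which include the FIL_PAGE_SPACE_OR_CHKSUM field, so a writer must stamp the new-formula checksum first. A minimal sketch of that ordering follows; the helper name is hypothetical, and the actual stamping happens when a page is prepared for flushing.

/* Hypothetical helper, shown only to illustrate the required ordering
of the two checksum writes described in the NOTE above. */
static
void
page_stamp_innodb_checksums(
/*========================*/
	byte*	page)	/*!< in/out: buffer page */
{
	/* 1. New-formula checksum into FIL_PAGE_SPACE_OR_CHKSUM. */
	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
			buf_calc_page_new_checksum(page));

	/* 2. Old-formula checksum into the page trailer; it reads the
	field written in step 1 as part of its input. */
	mach_write_to_4(page + UNIV_PAGE_SIZE
			- FIL_PAGE_END_LSN_OLD_CHKSUM,
			buf_calc_page_old_checksum(page));
}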
+#ifndef UNIV_INNOCHECKSUM
+
+/********************************************************************//**
+Return a printable string describing the checksum algorithm.
+@return algorithm name */
+UNIV_INTERN
+const char*
+buf_checksum_algorithm_name(
+/*========================*/
+ srv_checksum_algorithm_t algo) /*!< in: algorithm */
+{
+ switch (algo) {
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ return("crc32");
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ return("innodb");
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ return("none");
+ }
+
+ ut_error;
+ return(NULL);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
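Because srv_checksum_algorithm is declared as ulong to satisfy MYSQL_SYSVAR_ENUM (see the comment near the top of this file), a caller that wants the printable name has to cast back to the enum type. A small usage sketch, assuming the standard InnoDB logging idiom:

	/* Sketch: log the currently selected checksum algorithm. */
	fprintf(stderr, "InnoDB: using checksum algorithm %s\n",
		buf_checksum_algorithm_name(
			static_cast<srv_checksum_algorithm_t>(
				srv_checksum_algorithm)));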
diff --git a/storage/xtradb/buf/buf0dblwr.cc b/storage/xtradb/buf/buf0dblwr.cc
new file mode 100644
index 00000000000..506a5b177ba
--- /dev/null
+++ b/storage/xtradb/buf/buf0dblwr.cc
@@ -0,0 +1,1136 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dblwr.cc
+Doublewrite buffer module
+
+Created 2011/12/19
+*******************************************************/
+
+#include "buf0dblwr.h"
+
+#ifdef UNIV_NONINL
+#include "buf0buf.ic"
+#endif
+
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+
+#ifndef UNIV_HOTBACKUP
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t buf_dblwr_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/** The doublewrite buffer */
+UNIV_INTERN buf_dblwr_t* buf_dblwr = NULL;
+
+/** Set to TRUE when the doublewrite buffer is being created */
+UNIV_INTERN ibool buf_dblwr_being_created = FALSE;
+
+/****************************************************************//**
+Determines if a page number is located inside the doublewrite buffer.
+@return TRUE if the location is inside the two blocks of the
+doublewrite buffer */
+UNIV_INTERN
+ibool
+buf_dblwr_page_inside(
+/*==================*/
+ ulint page_no) /*!< in: page number */
+{
+ if (buf_dblwr == NULL) {
+
+ return(FALSE);
+ }
+
+ if (page_no >= buf_dblwr->block1
+ && page_no < buf_dblwr->block1
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ return(TRUE);
+ }
+
+ if (page_no >= buf_dblwr->block2
+ && page_no < buf_dblwr->block2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************//**
+Calls buf_page_get() on the TRX_SYS_PAGE and returns a pointer to the
+doublewrite buffer within it.
+@return pointer to the doublewrite buffer within the filespace header
+page. */
+UNIV_INLINE
+byte*
+buf_dblwr_get(
+/*==========*/
+ mtr_t* mtr) /*!< in/out: MTR to hold the page latch */
+{
+ buf_block_t* block;
+
+ block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+ RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ return(buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE);
+}
+
+/********************************************************************//**
+Flush a batch of writes to the datafiles that have already been
+written to the dblwr buffer on disk. */
+UNIV_INLINE
+void
+buf_dblwr_sync_datafiles()
+/*======================*/
+{
+ /* Wake possible simulated aio thread to actually post the
+ writes to the operating system */
+ os_aio_simulated_wake_handler_threads();
+
+	/* Wait until all async writes to tablespaces have been posted to
+ the OS */
+ os_aio_wait_until_no_pending_writes();
+
+ /* Now we flush the data to disk (for example, with fsync) */
+ fil_flush_file_spaces(FIL_TABLESPACE);
+}
+
+/****************************************************************//**
+Creates or initializes the doublewrite buffer at database startup. */
+static
+void
+buf_dblwr_init(
+/*===========*/
+ byte* doublewrite) /*!< in: pointer to the doublewrite buf
+ header on trx sys page */
+{
+ ulint buf_size;
+
+ buf_dblwr = static_cast<buf_dblwr_t*>(
+ mem_zalloc(sizeof(buf_dblwr_t)));
+
+ /* There are two blocks of same size in the doublewrite
+ buffer. */
+ buf_size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+
+	/* There must be at least one buffer for single page writes
+ and one buffer for batch writes. */
+ ut_a(srv_doublewrite_batch_size > 0
+ && srv_doublewrite_batch_size < buf_size);
+
+ mutex_create(buf_dblwr_mutex_key,
+ &buf_dblwr->mutex, SYNC_DOUBLEWRITE);
+
+ buf_dblwr->b_event = os_event_create();
+ buf_dblwr->s_event = os_event_create();
+ buf_dblwr->first_free = 0;
+ buf_dblwr->s_reserved = 0;
+ buf_dblwr->b_reserved = 0;
+
+ buf_dblwr->block1 = mach_read_from_4(
+ doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
+ buf_dblwr->block2 = mach_read_from_4(
+ doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
+
+ buf_dblwr->in_use = static_cast<bool*>(
+ mem_zalloc(buf_size * sizeof(bool)));
+
+ buf_dblwr->write_buf_unaligned = static_cast<byte*>(
+ ut_malloc((1 + buf_size) * UNIV_PAGE_SIZE));
+
+ buf_dblwr->write_buf = static_cast<byte*>(
+ ut_align(buf_dblwr->write_buf_unaligned,
+ UNIV_PAGE_SIZE));
+
+ buf_dblwr->buf_block_arr = static_cast<buf_page_t**>(
+ mem_zalloc(buf_size * sizeof(void*)));
+}
+
+/****************************************************************//**
+Creates the doublewrite buffer for a new InnoDB installation. The header of the
+doublewrite buffer is placed on the trx system header page. */
+UNIV_INTERN
+void
+buf_dblwr_create(void)
+/*==================*/
+{
+ buf_block_t* block2;
+ buf_block_t* new_block;
+ byte* doublewrite;
+ byte* fseg_header;
+ ulint page_no;
+ ulint prev_page_no;
+ ulint i;
+ mtr_t mtr;
+
+ if (buf_dblwr) {
+ /* Already inited */
+
+ return;
+ }
+
+start_again:
+ mtr_start(&mtr);
+ buf_dblwr_being_created = TRUE;
+
+ doublewrite = buf_dblwr_get(&mtr);
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+ == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+ /* The doublewrite buffer has already been created:
+ just read in some numbers */
+
+ buf_dblwr_init(doublewrite);
+
+ mtr_commit(&mtr);
+ buf_dblwr_being_created = FALSE;
+ return;
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Doublewrite buffer not found: creating new");
+
+ if (buf_pool_get_curr_size()
+ < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ + FSP_EXTENT_SIZE / 2 + 100)
+ * UNIV_PAGE_SIZE)) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot create doublewrite buffer: you must "
+ "increase your buffer pool size. Cannot continue "
+ "operation.");
+
+ exit(EXIT_FAILURE);
+ }
+
+ block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
+ TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
+
+ /* fseg_create acquires a second latch on the page,
+ therefore we must declare it: */
+
+ buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
+
+ if (block2 == NULL) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot create doublewrite buffer: you must "
+ "increase your tablespace size. "
+ "Cannot continue operation.");
+
+ /* We exit without committing the mtr to prevent
+ its modifications to the database getting to disk */
+
+ exit(EXIT_FAILURE);
+ }
+
+ fseg_header = doublewrite + TRX_SYS_DOUBLEWRITE_FSEG;
+ prev_page_no = 0;
+
+ for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ + FSP_EXTENT_SIZE / 2; i++) {
+ new_block = fseg_alloc_free_page(
+ fseg_header, prev_page_no + 1, FSP_UP, &mtr);
+ if (new_block == NULL) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot create doublewrite buffer: you must "
+ "increase your tablespace size. "
+ "Cannot continue operation.");
+
+ exit(EXIT_FAILURE);
+ }
+
+ /* We read the allocated pages to the buffer pool;
+ when they are written to disk in a flush, the space
+ id and page number fields are also written to the
+ pages. When we at database startup read pages
+ from the doublewrite buffer, we know that if the
+ space id and page number in them are the same as
+ the page position in the tablespace, then the page
+ has not been written to in doublewrite. */
+
+ ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+ page_no = buf_block_get_page_no(new_block);
+
+ if (i == FSP_EXTENT_SIZE / 2) {
+ ut_a(page_no == FSP_EXTENT_SIZE);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_BLOCK1,
+ page_no, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_REPEAT
+ + TRX_SYS_DOUBLEWRITE_BLOCK1,
+ page_no, MLOG_4BYTES, &mtr);
+
+ } else if (i == FSP_EXTENT_SIZE / 2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ ut_a(page_no == 2 * FSP_EXTENT_SIZE);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_BLOCK2,
+ page_no, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_REPEAT
+ + TRX_SYS_DOUBLEWRITE_BLOCK2,
+ page_no, MLOG_4BYTES, &mtr);
+
+ } else if (i > FSP_EXTENT_SIZE / 2) {
+ ut_a(page_no == prev_page_no + 1);
+ }
+
+ if (((i + 1) & 15) == 0) {
+ /* rw_locks can only be recursively x-locked
+ 2048 times. (on 32 bit platforms,
+ (lint) 0 - (X_LOCK_DECR * 2049)
+ is no longer a negative number, and thus
+ lock_word becomes like a shared lock).
+ For 4k page size this loop will
+ lock the fseg header too many times. Since
+ this code is not done while any other threads
+ are active, restart the MTR occasionally. */
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ doublewrite = buf_dblwr_get(&mtr);
+ fseg_header = doublewrite
+ + TRX_SYS_DOUBLEWRITE_FSEG;
+ }
+
+ prev_page_no = page_no;
+ }
+
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N,
+ MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
+ + TRX_SYS_DOUBLEWRITE_REPEAT,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N,
+ MLOG_4BYTES, &mtr);
+
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+ MLOG_4BYTES, &mtr);
+ mtr_commit(&mtr);
+
+ /* Flush the modified pages to disk and make a checkpoint */
+ log_make_checkpoint_at(LSN_MAX, TRUE);
+
+ /* Remove doublewrite pages from LRU */
+ buf_pool_invalidate();
+
+ ib_logf(IB_LOG_LEVEL_INFO, "Doublewrite buffer created");
+
+ goto start_again;
+}
+
+/****************************************************************//**
+At database startup, initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in a crash
+recovery, this function uses a possible doublewrite buffer to restore
+half-written pages in the data files. */
+UNIV_INTERN
+void
+buf_dblwr_init_or_restore_pages(
+/*============================*/
+ ibool restore_corrupt_pages) /*!< in: TRUE=restore pages */
+{
+ byte* buf;
+ byte* read_buf;
+ byte* unaligned_read_buf;
+ ulint block1;
+ ulint block2;
+ byte* page;
+ ibool reset_space_ids = FALSE;
+ byte* doublewrite;
+ ulint space_id;
+ ulint page_no;
+ ulint i;
+
+ /* We do the file i/o past the buffer pool */
+
+ unaligned_read_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
+
+ read_buf = static_cast<byte*>(
+ ut_align(unaligned_read_buf, UNIV_PAGE_SIZE));
+
+ /* Read the trx sys header to check if we are using the doublewrite
+ buffer */
+
+ fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0,
+ UNIV_PAGE_SIZE, read_buf, NULL);
+ doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+ == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+ /* The doublewrite buffer has been created */
+
+ buf_dblwr_init(doublewrite);
+
+ block1 = buf_dblwr->block1;
+ block2 = buf_dblwr->block2;
+
+ buf = buf_dblwr->write_buf;
+ } else {
+ goto leave_func;
+ }
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
+ != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
+
+ /* We are upgrading from a version < 4.1.x to a version where
+ multiple tablespaces are supported. We must reset the space id
+ field in the pages in the doublewrite buffer because starting
+ from this version the space id is stored to
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+
+ reset_space_ids = TRUE;
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Resetting space id's in the doublewrite buffer");
+ }
+
+ /* Read the pages from the doublewrite buffer to memory */
+
+ fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block1, 0,
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ buf, NULL);
+ fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block2, 0,
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ NULL);
+ /* Check if any of these pages is half-written in data files, in the
+ intended position */
+
+ page = buf;
+
+ for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
+
+ ulint source_page_no;
+ page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
+
+ if (reset_space_ids) {
+
+ space_id = 0;
+ mach_write_to_4(page
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0);
+ /* We do not need to calculate new checksums for the
+ pages because the field .._SPACE_ID does not affect
+ them. Write the page back to where we read it from. */
+
+ if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ source_page_no = block1 + i;
+ } else {
+ source_page_no = block2
+ + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ }
+
+ fil_io(OS_FILE_WRITE, true, 0, 0, source_page_no, 0,
+ UNIV_PAGE_SIZE, page, NULL);
+ } else {
+
+ space_id = mach_read_from_4(
+ page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ }
+
+ if (!restore_corrupt_pages) {
+ /* The database was shut down gracefully: no need to
+ restore pages */
+
+ } else if (!fil_tablespace_exists_in_mem(space_id)) {
+ /* Maybe we have dropped the single-table tablespace
+ and this page once belonged to it: do nothing */
+
+ } else if (!fil_check_adress_in_tablespace(space_id,
+ page_no)) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "A page in the doublewrite buffer is not "
+ "within space bounds; space id %lu "
+ "page number %lu, page %lu in "
+ "doublewrite buf.",
+ (ulong) space_id, (ulong) page_no, (ulong) i);
+
+ } else if (space_id == TRX_SYS_SPACE
+ && ((page_no >= block1
+ && page_no
+ < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+ || (page_no >= block2
+ && page_no
+ < (block2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) {
+
+ /* It is an unwritten doublewrite buffer page:
+ do nothing */
+ } else {
+ ulint zip_size = fil_space_get_zip_size(space_id);
+
+ /* Read in the actual page from the file */
+ fil_io(OS_FILE_READ, true, space_id, zip_size,
+ page_no, 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ read_buf, NULL);
+
+ /* Check if the page is corrupt */
+
+ if (buf_page_is_corrupted(true, read_buf, zip_size)) {
+
+ fprintf(stderr,
+ "InnoDB: Warning: database page"
+ " corruption or a failed\n"
+ "InnoDB: file read of"
+ " space %lu page %lu.\n"
+ "InnoDB: Trying to recover it from"
+ " the doublewrite buffer.\n",
+ (ulong) space_id, (ulong) page_no);
+
+ if (buf_page_is_corrupted(true,
+ page, zip_size)) {
+ fprintf(stderr,
+ "InnoDB: Dump of the page:\n");
+ buf_page_print(
+ read_buf, zip_size,
+ BUF_PAGE_PRINT_NO_CRASH);
+ fprintf(stderr,
+ "InnoDB: Dump of"
+ " corresponding page"
+ " in doublewrite buffer:\n");
+ buf_page_print(
+ page, zip_size,
+ BUF_PAGE_PRINT_NO_CRASH);
+
+ fprintf(stderr,
+ "InnoDB: Also the page in the"
+ " doublewrite buffer"
+ " is corrupt.\n"
+ "InnoDB: Cannot continue"
+ " operation.\n"
+ "InnoDB: You can try to"
+ " recover the database"
+ " with the my.cnf\n"
+ "InnoDB: option:\n"
+ "InnoDB:"
+ " innodb_force_recovery=6\n");
+ ut_error;
+ }
+
+ /* Write the good page from the
+ doublewrite buffer to the intended
+ position */
+
+ fil_io(OS_FILE_WRITE, true, space_id,
+ zip_size, page_no, 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ page, NULL);
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Recovered the page from"
+ " the doublewrite buffer.");
+ }
+ }
+
+ page += UNIV_PAGE_SIZE;
+ }
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+leave_func:
+ ut_free(unaligned_read_buf);
+}
+
+/****************************************************************//**
+Frees doublewrite buffer. */
+UNIV_INTERN
+void
+buf_dblwr_free(void)
+/*================*/
+{
+ /* Free the double write data structures. */
+ ut_a(buf_dblwr != NULL);
+ ut_ad(buf_dblwr->s_reserved == 0);
+ ut_ad(buf_dblwr->b_reserved == 0);
+
+ os_event_free(buf_dblwr->b_event);
+ os_event_free(buf_dblwr->s_event);
+ ut_free(buf_dblwr->write_buf_unaligned);
+ buf_dblwr->write_buf_unaligned = NULL;
+
+ mem_free(buf_dblwr->buf_block_arr);
+ buf_dblwr->buf_block_arr = NULL;
+
+ mem_free(buf_dblwr->in_use);
+ buf_dblwr->in_use = NULL;
+
+ mutex_free(&buf_dblwr->mutex);
+ mem_free(buf_dblwr);
+ buf_dblwr = NULL;
+}
+
+/********************************************************************//**
+Updates the doublewrite buffer when an IO request is completed. */
+UNIV_INTERN
+void
+buf_dblwr_update(
+/*=============*/
+ const buf_page_t* bpage, /*!< in: buffer block descriptor */
+ buf_flush_t flush_type)/*!< in: flush type */
+{
+ if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
+ return;
+ }
+
+ switch (flush_type) {
+ case BUF_FLUSH_LIST:
+ case BUF_FLUSH_LRU:
+ mutex_enter(&buf_dblwr->mutex);
+
+ ut_ad(buf_dblwr->batch_running);
+ ut_ad(buf_dblwr->b_reserved > 0);
+ ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free);
+
+ buf_dblwr->b_reserved--;
+
+ if (buf_dblwr->b_reserved == 0) {
+ mutex_exit(&buf_dblwr->mutex);
+ /* This will finish the batch. Sync data files
+ to the disk. */
+ fil_flush_file_spaces(FIL_TABLESPACE);
+ mutex_enter(&buf_dblwr->mutex);
+
+ /* We can now reuse the doublewrite memory buffer: */
+ buf_dblwr->first_free = 0;
+ buf_dblwr->batch_running = false;
+ os_event_set(buf_dblwr->b_event);
+ }
+
+ mutex_exit(&buf_dblwr->mutex);
+ break;
+ case BUF_FLUSH_SINGLE_PAGE:
+ {
+ const ulint size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ ulint i;
+ mutex_enter(&buf_dblwr->mutex);
+ for (i = srv_doublewrite_batch_size; i < size; ++i) {
+ if (buf_dblwr->buf_block_arr[i] == bpage) {
+ buf_dblwr->s_reserved--;
+ buf_dblwr->buf_block_arr[i] = NULL;
+ buf_dblwr->in_use[i] = false;
+ break;
+ }
+ }
+
+ /* The block we are looking for must exist as a
+ reserved block. */
+ ut_a(i < size);
+ }
+ os_event_set(buf_dblwr->s_event);
+ mutex_exit(&buf_dblwr->mutex);
+ break;
+ case BUF_FLUSH_N_TYPES:
+ ut_error;
+ }
+}
+
+/********************************************************************//**
+Check the LSN values on the page. */
+static
+void
+buf_dblwr_check_page_lsn(
+/*=====================*/
+ const page_t* page) /*!< in: page to check */
+{
+ if (memcmp(page + (FIL_PAGE_LSN + 4),
+ page + (UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
+ 4)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ERROR: The page to be written"
+ " seems corrupt!\n"
+ "InnoDB: The low 4 bytes of LSN fields do not match "
+ "(" ULINTPF " != " ULINTPF ")!"
+ " Noticed in the buffer pool.\n",
+ mach_read_from_4(
+ page + FIL_PAGE_LSN + 4),
+ mach_read_from_4(
+ page + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4));
+ }
+}
+
+/********************************************************************//**
+Asserts when a corrupt block is found while writing out data to the
+disk. */
+static
+void
+buf_dblwr_assert_on_corrupt_block(
+/*==============================*/
+ const buf_block_t* block) /*!< in: block to check */
+{
+ buf_page_print(block->frame, 0, BUF_PAGE_PRINT_NO_CRASH);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Apparent corruption of an"
+ " index page n:o %lu in space %lu\n"
+ "InnoDB: to be written to data file."
+ " We intentionally crash server\n"
+ "InnoDB: to prevent corrupt data"
+ " from ending up in data\n"
+ "InnoDB: files.\n",
+ (ulong) buf_block_get_page_no(block),
+ (ulong) buf_block_get_space(block));
+
+ ut_error;
+}
+
+/********************************************************************//**
+Check the LSN values on the page with which this block is associated.
+Also validate the page if the option is set. */
+static
+void
+buf_dblwr_check_block(
+/*==================*/
+ const buf_block_t* block) /*!< in: block to check */
+{
+ if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
+ || block->page.zip.data) {
+ /* No simple validate for compressed pages exists. */
+ return;
+ }
+
+ buf_dblwr_check_page_lsn(block->frame);
+
+ if (!block->check_index_page_at_flush) {
+ return;
+ }
+
+ if (page_is_comp(block->frame)) {
+ if (!page_simple_validate_new(block->frame)) {
+ buf_dblwr_assert_on_corrupt_block(block);
+ }
+ } else if (!page_simple_validate_old(block->frame)) {
+
+ buf_dblwr_assert_on_corrupt_block(block);
+ }
+}
+
+/********************************************************************//**
+Writes a page that has already been written to the doublewrite buffer
+to the datafile. It is the job of the caller to sync the datafile. */
+static
+void
+buf_dblwr_write_block_to_datafile(
+/*==============================*/
+ const buf_page_t* bpage, /*!< in: page to write */
+ bool sync) /*!< in: true if sync IO
+ is requested */
+{
+ ut_a(bpage);
+ ut_a(buf_page_in_file(bpage));
+
+ const ulint flags = sync
+ ? OS_FILE_WRITE
+ : OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER;
+
+ if (bpage->zip.data) {
+ fil_io(flags, sync, buf_page_get_space(bpage),
+ buf_page_get_zip_size(bpage),
+ buf_page_get_page_no(bpage), 0,
+ buf_page_get_zip_size(bpage),
+ (void*) bpage->zip.data,
+ (void*) bpage);
+
+ return;
+ }
+
+
+ const buf_block_t* block = (buf_block_t*) bpage;
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ buf_dblwr_check_page_lsn(block->frame);
+
+ fil_io(flags, sync, buf_block_get_space(block), 0,
+ buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
+ (void*) block->frame, (void*) block);
+}
+
+/********************************************************************//**
+Flushes possible buffered writes from the doublewrite memory buffer to disk,
+and also wakes up the aio thread if simulated aio is used. It is very
+important to call this function after a batch of writes has been posted,
+and also when we may have to wait for a page latch! Otherwise a deadlock
+of threads can occur. */
+UNIV_INTERN
+void
+buf_dblwr_flush_buffered_writes(void)
+/*=================================*/
+{
+ byte* write_buf;
+ ulint first_free;
+ ulint len;
+
+ if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
+ /* Sync the writes to the disk. */
+ buf_dblwr_sync_datafiles();
+ return;
+ }
+
+try_again:
+ mutex_enter(&buf_dblwr->mutex);
+
+ /* Write first to doublewrite buffer blocks. We use synchronous
+ aio and thus know that file write has been completed when the
+ control returns. */
+
+ if (buf_dblwr->first_free == 0) {
+
+ mutex_exit(&buf_dblwr->mutex);
+
+ return;
+ }
+
+ if (buf_dblwr->batch_running) {
+ /* Another thread is running the batch right now. Wait
+ for it to finish. */
+ ib_int64_t sig_count = os_event_reset(buf_dblwr->b_event);
+ mutex_exit(&buf_dblwr->mutex);
+
+ os_event_wait_low(buf_dblwr->b_event, sig_count);
+ goto try_again;
+ }
+
+ ut_a(!buf_dblwr->batch_running);
+ ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);
+
+ /* Disallow anyone else to post to doublewrite buffer or to
+ start another batch of flushing. */
+ buf_dblwr->batch_running = true;
+ first_free = buf_dblwr->first_free;
+
+	/* Now safe to release the mutex. Note that no other thread is
+	allowed to post to the doublewrite batch flushing, but threads
+	working on single page flushes are allowed to proceed. */
+ mutex_exit(&buf_dblwr->mutex);
+
+ write_buf = buf_dblwr->write_buf;
+
+ for (ulint len2 = 0, i = 0;
+ i < buf_dblwr->first_free;
+ len2 += UNIV_PAGE_SIZE, i++) {
+
+ const buf_block_t* block;
+
+ block = (buf_block_t*) buf_dblwr->buf_block_arr[i];
+
+ if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
+ || block->page.zip.data) {
+ /* No simple validate for compressed
+ pages exists. */
+ continue;
+ }
+
+ /* Check that the actual page in the buffer pool is
+ not corrupt and the LSN values are sane. */
+ buf_dblwr_check_block(block);
+
+ /* Check that the page as written to the doublewrite
+ buffer has sane LSN values. */
+ buf_dblwr_check_page_lsn(write_buf + len2);
+ }
+
+ /* Write out the first block of the doublewrite buffer */
+ len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
+ buf_dblwr->first_free) * UNIV_PAGE_SIZE;
+
+ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+ buf_dblwr->block1, 0, len,
+ (void*) write_buf, NULL);
+
+ if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ /* No unwritten pages in the second block. */
+ goto flush;
+ }
+
+ /* Write out the second block of the doublewrite buffer. */
+ len = (buf_dblwr->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+ * UNIV_PAGE_SIZE;
+
+ write_buf = buf_dblwr->write_buf
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
+
+ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+ buf_dblwr->block2, 0, len,
+ (void*) write_buf, NULL);
+
+flush:
+ /* increment the doublewrite flushed pages counter */
+ srv_stats.dblwr_pages_written.add(buf_dblwr->first_free);
+ srv_stats.dblwr_writes.inc();
+
+ /* Now flush the doublewrite buffer data to disk */
+ fil_flush(TRX_SYS_SPACE);
+
+ /* We know that the writes have been flushed to disk now
+ and in recovery we will find them in the doublewrite buffer
+ blocks. Next do the writes to the intended positions. */
+
+	/* Up to this point first_free and buf_dblwr->first_free are the
+	same because we have set the buf_dblwr->batch_running flag,
+	disallowing any other thread from posting a request, but we
+	can't safely access buf_dblwr->first_free in the loop below.
+ This is so because it is possible that after we are done with
+ the last iteration and before we terminate the loop, the batch
+ gets finished in the IO helper thread and another thread posts
+ a new batch setting buf_dblwr->first_free to a higher value.
+ If this happens and we are using buf_dblwr->first_free in the
+ loop termination condition then we'll end up dispatching
+ the same block twice from two different threads. */
+ ut_ad(first_free == buf_dblwr->first_free);
+ for (ulint i = 0; i < first_free; i++) {
+ buf_dblwr_write_block_to_datafile(
+ buf_dblwr->buf_block_arr[i], false);
+ }
+
+ /* Wake possible simulated aio thread to actually post the
+ writes to the operating system. We don't flush the files
+ at this point. We leave it to the IO helper thread to flush
+ datafiles when the whole batch has been processed. */
+ os_aio_simulated_wake_handler_threads();
+}
+
+/********************************************************************//**
+Posts a buffer page for writing. If the doublewrite memory buffer is
+full, calls buf_dblwr_flush_buffered_writes and waits for free
+space to appear. */
+UNIV_INTERN
+void
+buf_dblwr_add_to_batch(
+/*====================*/
+ buf_page_t* bpage) /*!< in: buffer block to write */
+{
+ ulint zip_size;
+
+ ut_a(buf_page_in_file(bpage));
+ ut_ad(!mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex));
+
+try_again:
+ mutex_enter(&buf_dblwr->mutex);
+
+ ut_a(buf_dblwr->first_free <= srv_doublewrite_batch_size);
+
+ if (buf_dblwr->batch_running) {
+
+		/* This is not nearly as bad as it looks. There is only
+		one page_cleaner thread which does background flushing
+		in batches, therefore it is unlikely to be a contention
+		point. The only exception is when a user thread is
+		forced to do a flush batch because of a sync
+		checkpoint. */
+ ib_int64_t sig_count = os_event_reset(buf_dblwr->b_event);
+ mutex_exit(&buf_dblwr->mutex);
+
+ os_event_wait_low(buf_dblwr->b_event, sig_count);
+ goto try_again;
+ }
+
+ if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
+ mutex_exit(&(buf_dblwr->mutex));
+
+ buf_dblwr_flush_buffered_writes();
+
+ goto try_again;
+ }
+
+ zip_size = buf_page_get_zip_size(bpage);
+
+ if (zip_size) {
+ UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
+ /* Copy the compressed page and clear the rest. */
+ memcpy(buf_dblwr->write_buf
+ + UNIV_PAGE_SIZE * buf_dblwr->first_free,
+ bpage->zip.data, zip_size);
+ memset(buf_dblwr->write_buf
+ + UNIV_PAGE_SIZE * buf_dblwr->first_free
+ + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
+ } else {
+ ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+ UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
+ UNIV_PAGE_SIZE);
+
+ memcpy(buf_dblwr->write_buf
+ + UNIV_PAGE_SIZE * buf_dblwr->first_free,
+ ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
+ }
+
+ buf_dblwr->buf_block_arr[buf_dblwr->first_free] = bpage;
+
+ buf_dblwr->first_free++;
+ buf_dblwr->b_reserved++;
+
+ ut_ad(!buf_dblwr->batch_running);
+ ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);
+ ut_ad(buf_dblwr->b_reserved <= srv_doublewrite_batch_size);
+
+ if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
+ mutex_exit(&(buf_dblwr->mutex));
+
+ buf_dblwr_flush_buffered_writes();
+
+ return;
+ }
+
+ mutex_exit(&(buf_dblwr->mutex));
+}
+
+/********************************************************************//**
+Writes a page to the doublewrite buffer on disk, syncs it, then writes
+the page to the datafile and syncs the datafile. This function is used
+for single page flushes. If all the buffers allocated for single page
+flushes in the doublewrite buffer are in use we wait here for one to
+become free. We are guaranteed that a slot will become free because any
+thread that is using a slot must also release the slot before leaving
+this function. */
+UNIV_INTERN
+void
+buf_dblwr_write_single_page(
+/*========================*/
+ buf_page_t* bpage, /*!< in: buffer block to write */
+ bool sync) /*!< in: true if sync IO requested */
+{
+ ulint n_slots;
+ ulint size;
+ ulint zip_size;
+ ulint offset;
+ ulint i;
+
+ ut_a(buf_page_in_file(bpage));
+ ut_a(srv_use_doublewrite_buf);
+ ut_a(buf_dblwr != NULL);
+
+	/* The slots available for single page flushes run from
+	srv_doublewrite_batch_size to the end of the buffer. */
+ size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ ut_a(size > srv_doublewrite_batch_size);
+ n_slots = size - srv_doublewrite_batch_size;
+
+ if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
+
+ /* Check that the actual page in the buffer pool is
+ not corrupt and the LSN values are sane. */
+ buf_dblwr_check_block((buf_block_t*) bpage);
+
+ /* Check that the page as written to the doublewrite
+ buffer has sane LSN values. */
+ if (!bpage->zip.data) {
+ buf_dblwr_check_page_lsn(
+ ((buf_block_t*) bpage)->frame);
+ }
+ }
+
+retry:
+ mutex_enter(&buf_dblwr->mutex);
+ if (buf_dblwr->s_reserved == n_slots) {
+
+ /* All slots are reserved. */
+ ib_int64_t sig_count =
+ os_event_reset(buf_dblwr->s_event);
+ mutex_exit(&buf_dblwr->mutex);
+ os_event_wait_low(buf_dblwr->s_event, sig_count);
+
+ goto retry;
+ }
+
+ for (i = srv_doublewrite_batch_size; i < size; ++i) {
+
+ if (!buf_dblwr->in_use[i]) {
+ break;
+ }
+ }
+
+ /* We are guaranteed to find a slot. */
+ ut_a(i < size);
+ buf_dblwr->in_use[i] = true;
+ buf_dblwr->s_reserved++;
+ buf_dblwr->buf_block_arr[i] = bpage;
+
+ /* increment the doublewrite flushed pages counter */
+ srv_stats.dblwr_pages_written.inc();
+ srv_stats.dblwr_writes.inc();
+
+ mutex_exit(&buf_dblwr->mutex);
+
+	/* Let's see if we are going to write in the first or second
+ block of the doublewrite buffer. */
+ if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ offset = buf_dblwr->block1 + i;
+ } else {
+ offset = buf_dblwr->block2 + i
+ - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ }
+
+ /* We deal with compressed and uncompressed pages a little
+ differently here. In case of uncompressed pages we can
+ directly write the block to the allocated slot in the
+ doublewrite buffer in the system tablespace and then after
+	syncing the system tablespace we can proceed to write the page
+	in the datafile.
+	In the case of a compressed page we first do a memcpy of the block
+	to the in-memory buffer of the doublewrite before proceeding to
+ write it. This is so because we want to pad the remaining
+ bytes in the doublewrite page with zeros. */
+
+ zip_size = buf_page_get_zip_size(bpage);
+ if (zip_size) {
+ memcpy(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i,
+ bpage->zip.data, zip_size);
+ memset(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i
+ + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
+
+ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+ offset, 0, UNIV_PAGE_SIZE,
+ (void*) (buf_dblwr->write_buf
+ + UNIV_PAGE_SIZE * i), NULL);
+ } else {
+ /* It is a regular page. Write it directly to the
+ doublewrite buffer */
+ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+ offset, 0, UNIV_PAGE_SIZE,
+ (void*) ((buf_block_t*) bpage)->frame,
+ NULL);
+ }
+
+ /* Now flush the doublewrite buffer data to disk */
+ fil_flush(TRX_SYS_SPACE);
+
+ /* We know that the write has been flushed to disk now
+ and during recovery we will find it in the doublewrite buffer
+ blocks. Next do the write to the intended position. */
+ buf_dblwr_write_block_to_datafile(bpage, sync);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/buf/buf0dump.cc b/storage/xtradb/buf/buf0dump.cc
new file mode 100644
index 00000000000..090e8cac63b
--- /dev/null
+++ b/storage/xtradb/buf/buf0dump.cc
@@ -0,0 +1,621 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dump.cc
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+
+#include <stdarg.h> /* va_* */
+#include <string.h> /* strerror() */
+
+#include "buf0buf.h" /* srv_buf_pool_instances */
+#include "buf0dump.h"
+#include "db0err.h"
+#include "dict0dict.h" /* dict_operation_lock */
+#include "os0file.h" /* OS_FILE_MAX_PATH */
+#include "os0sync.h" /* os_event* */
+#include "os0thread.h" /* os_thread_* */
+#include "srv0srv.h" /* srv_fast_shutdown, srv_buf_dump* */
+#include "srv0start.h" /* srv_shutdown_state */
+#include "sync0rw.h" /* rw_lock_s_lock() */
+#include "ut0byte.h" /* ut_ull_create() */
+#include "ut0sort.h" /* UT_SORT_FUNCTION_BODY */
+
+enum status_severity {
+ STATUS_INFO,
+ STATUS_NOTICE,
+ STATUS_ERR
+};
+
+#define SHUTTING_DOWN() (UNIV_UNLIKELY(srv_shutdown_state \
+ != SRV_SHUTDOWN_NONE))
+
+/* Flags that tell the buffer pool dump/load thread which action it should
+take after being woken up. */
+static ibool buf_dump_should_start = FALSE;
+static ibool buf_load_should_start = FALSE;
+
+static ibool buf_load_abort_flag = FALSE;
+
+/* Used to temporarily store dump info in order to avoid IO while holding
+the buffer pool LRU list mutex during dump, and also to sort the contents of the
+dump before reading the pages from disk during load.
+We store the space id in the high 32 bits and page no in low 32 bits. */
+typedef ib_uint64_t buf_dump_t;
+
+/* Aux macros to create buf_dump_t and to extract space and page from it */
+#define BUF_DUMP_CREATE(space, page) ut_ull_create(space, page)
+#define BUF_DUMP_SPACE(a) ((ulint) ((a) >> 32))
+#define BUF_DUMP_PAGE(a) ((ulint) ((a) & 0xFFFFFFFFUL))
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a dump. This function is called by MySQL code via buffer_pool_dump_now()
+and it should return immediately because the whole MySQL is frozen during
+its execution. */
+UNIV_INTERN
+void
+buf_dump_start()
+/*============*/
+{
+ buf_dump_should_start = TRUE;
+ os_event_set(srv_buf_dump_event);
+}
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a load. This function is called by MySQL code via buffer_pool_load_now()
+and it should return immediately because all of MySQL is frozen during
+its execution. */
+UNIV_INTERN
+void
+buf_load_start()
+/*============*/
+{
+ buf_load_should_start = TRUE;
+ os_event_set(srv_buf_dump_event);
+}
+
+/*****************************************************************//**
+Sets the global variable that feeds MySQL's innodb_buffer_pool_dump_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3). The value of this variable can be
+retrieved by:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS';
+or by:
+SHOW STATUS LIKE 'innodb_buffer_pool_dump_status'; */
+static __attribute__((nonnull, format(printf, 2, 3)))
+void
+buf_dump_status(
+/*============*/
+ enum status_severity severity,/*!< in: status severity */
+ const char* fmt, /*!< in: format */
+ ...) /*!< in: extra parameters according
+ to fmt */
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+
+ ut_vsnprintf(
+ export_vars.innodb_buffer_pool_dump_status,
+ sizeof(export_vars.innodb_buffer_pool_dump_status),
+ fmt, ap);
+
+ if (severity == STATUS_NOTICE || severity == STATUS_ERR) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: %s\n",
+ export_vars.innodb_buffer_pool_dump_status);
+ }
+
+ va_end(ap);
+}
+
+/*****************************************************************//**
+Sets the global variable that feeds MySQL's innodb_buffer_pool_load_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3). The value of this variable can be
+retrieved by:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS';
+or by:
+SHOW STATUS LIKE 'innodb_buffer_pool_load_status'; */
+static __attribute__((nonnull, format(printf, 2, 3)))
+void
+buf_load_status(
+/*============*/
+ enum status_severity severity,/*!< in: status severity */
+ const char* fmt, /*!< in: format */
+ ...) /*!< in: extra parameters according to fmt */
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+
+ ut_vsnprintf(
+ export_vars.innodb_buffer_pool_load_status,
+ sizeof(export_vars.innodb_buffer_pool_load_status),
+ fmt, ap);
+
+ if (severity == STATUS_NOTICE || severity == STATUS_ERR) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: %s\n",
+ export_vars.innodb_buffer_pool_load_status);
+ }
+
+ va_end(ap);
+}
+
+/*****************************************************************//**
+Perform a buffer pool dump into the file specified by
+innodb_buffer_pool_filename. If any errors occur then the value of
+innodb_buffer_pool_dump_status will be set accordingly, see buf_dump_status().
+The dump filename can be specified by (relative to srv_data_home):
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_dump(
+/*=====*/
+ ibool obey_shutdown) /*!< in: quit if we are in a shutting down
+ state */
+{
+#define SHOULD_QUIT() (SHUTTING_DOWN() && obey_shutdown)
+
+ char full_filename[OS_FILE_MAX_PATH];
+ char tmp_filename[OS_FILE_MAX_PATH];
+ char now[32];
+ FILE* f;
+ ulint i;
+ int ret;
+
+ ut_snprintf(full_filename, sizeof(full_filename),
+ "%s%c%s", srv_data_home, SRV_PATH_SEPARATOR,
+ srv_buf_dump_filename);
+
+ ut_snprintf(tmp_filename, sizeof(tmp_filename),
+ "%s.incomplete", full_filename);
+
+ buf_dump_status(STATUS_NOTICE, "Dumping buffer pool(s) to %s",
+ full_filename);
+
+ f = fopen(tmp_filename, "w");
+ if (f == NULL) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot open '%s' for writing: %s",
+ tmp_filename, strerror(errno));
+ return;
+ }
+ /* else */
+
+ /* walk through each buffer pool */
+ for (i = 0; i < srv_buf_pool_instances && !SHOULD_QUIT(); i++) {
+ buf_pool_t* buf_pool;
+ const buf_page_t* bpage;
+ buf_dump_t* dump;
+ ulint n_pages;
+ ulint j;
+
+ buf_pool = buf_pool_from_array(i);
+
+		/* obtain the buf_pool LRU list mutex before allocating, since
+		UT_LIST_GET_LEN(buf_pool->LRU) could change */
+ mutex_enter(&buf_pool->LRU_list_mutex);
+
+ n_pages = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ /* skip empty buffer pools */
+ if (n_pages == 0) {
+ mutex_exit(&buf_pool->LRU_list_mutex);
+ continue;
+ }
+
+ dump = static_cast<buf_dump_t*>(
+ ut_malloc(n_pages * sizeof(*dump))) ;
+
+ if (dump == NULL) {
+ mutex_exit(&buf_pool->LRU_list_mutex);
+ fclose(f);
+ buf_dump_status(STATUS_ERR,
+ "Cannot allocate " ULINTPF " bytes: %s",
+ (ulint) (n_pages * sizeof(*dump)),
+ strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), j = 0;
+ bpage != NULL;
+ bpage = UT_LIST_GET_PREV(LRU, bpage), j++) {
+
+ ut_a(buf_page_in_file(bpage));
+
+ dump[j] = BUF_DUMP_CREATE(buf_page_get_space(bpage),
+ buf_page_get_page_no(bpage));
+ }
+
+ ut_a(j == n_pages);
+
+ mutex_exit(&buf_pool->LRU_list_mutex);
+
+ for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) {
+ ret = fprintf(f, ULINTPF "," ULINTPF "\n",
+ BUF_DUMP_SPACE(dump[j]),
+ BUF_DUMP_PAGE(dump[j]));
+ if (ret < 0) {
+ ut_free(dump);
+ fclose(f);
+ buf_dump_status(STATUS_ERR,
+ "Cannot write to '%s': %s",
+ tmp_filename, strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+
+ if (j % 128 == 0) {
+ buf_dump_status(
+ STATUS_INFO,
+ "Dumping buffer pool "
+ ULINTPF "/" ULINTPF ", "
+ "page " ULINTPF "/" ULINTPF,
+ i + 1, srv_buf_pool_instances,
+ j + 1, n_pages);
+ }
+ }
+
+ ut_free(dump);
+ }
+
+ ret = fclose(f);
+ if (ret != 0) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot close '%s': %s",
+ tmp_filename, strerror(errno));
+ return;
+ }
+ /* else */
+
+ ret = unlink(full_filename);
+ if (ret != 0 && errno != ENOENT) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot delete '%s': %s",
+ full_filename, strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+ /* else */
+
+ ret = rename(tmp_filename, full_filename);
+ if (ret != 0) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot rename '%s' to '%s': %s",
+ tmp_filename, full_filename,
+ strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+ /* else */
+
+ /* success */
+
+ ut_sprintf_timestamp(now);
+
+ buf_dump_status(STATUS_NOTICE,
+ "Buffer pool(s) dump completed at %s", now);
+}
+
+/*****************************************************************//**
+Compare two buffer pool dump entries, used to sort the dump on
+space_no,page_no before loading in order to increase the chance for
+sequential IO.
+@return -1/0/1 if entry 1 is smaller/equal/bigger than entry 2 */
+static
+lint
+buf_dump_cmp(
+/*=========*/
+ const buf_dump_t d1, /*!< in: buffer pool dump entry 1 */
+ const buf_dump_t d2) /*!< in: buffer pool dump entry 2 */
+{
+ if (d1 < d2) {
+ return(-1);
+ } else if (d1 == d2) {
+ return(0);
+ } else {
+ return(1);
+ }
+}
+
+/*****************************************************************//**
+Sort a buffer pool dump on space_no, page_no. */
+static
+void
+buf_dump_sort(
+/*==========*/
+ buf_dump_t* dump, /*!< in/out: buffer pool dump to sort */
+ buf_dump_t* tmp, /*!< in/out: temp storage */
+ ulint low, /*!< in: lowest index (inclusive) */
+ ulint high) /*!< in: highest index (non-inclusive) */
+{
+ UT_SORT_FUNCTION_BODY(buf_dump_sort, dump, tmp, low, high,
+ buf_dump_cmp);
+}
+
+/*****************************************************************//**
+Perform a buffer pool load from the file specified by
+innodb_buffer_pool_filename. If any errors occur then the value of
+innodb_buffer_pool_load_status will be set accordingly, see buf_load_status().
+The dump filename can be specified by (relative to srv_data_home):
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_load()
+/*======*/
+{
+ char full_filename[OS_FILE_MAX_PATH];
+ char now[32];
+ FILE* f;
+ buf_dump_t* dump;
+ buf_dump_t* dump_tmp;
+ ulint dump_n;
+ ulint total_buffer_pools_pages;
+ ulint i;
+ ulint space_id;
+ ulint page_no;
+ int fscanf_ret;
+
+ /* Ignore any leftovers from before */
+ buf_load_abort_flag = FALSE;
+
+ ut_snprintf(full_filename, sizeof(full_filename),
+ "%s%c%s", srv_data_home, SRV_PATH_SEPARATOR,
+ srv_buf_dump_filename);
+
+ buf_load_status(STATUS_NOTICE,
+ "Loading buffer pool(s) from %s", full_filename);
+
+ f = fopen(full_filename, "r");
+ if (f == NULL) {
+ buf_load_status(STATUS_ERR,
+ "Cannot open '%s' for reading: %s",
+ full_filename, strerror(errno));
+ return;
+ }
+ /* else */
+
+ /* First scan the file to estimate how many entries are in it.
+	This file is tiny (approx 500KB per 1GB buffer pool), so reading it
+	twice is fine. */
+ dump_n = 0;
+ while (fscanf(f, ULINTPF "," ULINTPF, &space_id, &page_no) == 2
+ && !SHUTTING_DOWN()) {
+ dump_n++;
+ }
+
+ if (!SHUTTING_DOWN() && !feof(f)) {
+ /* fscanf() returned != 2 */
+ const char* what;
+ if (ferror(f)) {
+ what = "reading";
+ } else {
+ what = "parsing";
+ }
+ fclose(f);
+ buf_load_status(STATUS_ERR, "Error %s '%s', "
+ "unable to load buffer pool (stage 1)",
+ what, full_filename);
+ return;
+ }
+
+	/* If the dump is larger than the buffer pool(s), then we ignore the
+	extra trailing entries. This could happen if a dump is made, then the
+	buffer pool is shrunk and then a load is attempted. */
+ total_buffer_pools_pages = buf_pool_get_n_pages()
+ * srv_buf_pool_instances;
+ if (dump_n > total_buffer_pools_pages) {
+ dump_n = total_buffer_pools_pages;
+ }
+
+ dump = static_cast<buf_dump_t*>(ut_malloc(dump_n * sizeof(*dump)));
+
+ if (dump == NULL) {
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Cannot allocate " ULINTPF " bytes: %s",
+ (ulint) (dump_n * sizeof(*dump)),
+ strerror(errno));
+ return;
+ }
+
+ dump_tmp = static_cast<buf_dump_t*>(
+ ut_malloc(dump_n * sizeof(*dump_tmp)));
+
+ if (dump_tmp == NULL) {
+ ut_free(dump);
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Cannot allocate " ULINTPF " bytes: %s",
+ (ulint) (dump_n * sizeof(*dump_tmp)),
+ strerror(errno));
+ return;
+ }
+
+ rewind(f);
+
+ for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+ fscanf_ret = fscanf(f, ULINTPF "," ULINTPF,
+ &space_id, &page_no);
+
+ if (fscanf_ret != 2) {
+ if (feof(f)) {
+ break;
+ }
+ /* else */
+
+ ut_free(dump);
+ ut_free(dump_tmp);
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Error parsing '%s', unable "
+ "to load buffer pool (stage 2)",
+ full_filename);
+ return;
+ }
+
+ if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) {
+ ut_free(dump);
+ ut_free(dump_tmp);
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Error parsing '%s': bogus "
+ "space,page " ULINTPF "," ULINTPF
+ " at line " ULINTPF ", "
+ "unable to load buffer pool",
+ full_filename,
+ space_id, page_no,
+ i);
+ return;
+ }
+
+ dump[i] = BUF_DUMP_CREATE(space_id, page_no);
+ }
+
+	/* Set dump_n to the actual number of initialized elements;
+	i could be smaller than dump_n here if the file got truncated after
+	we read it the first time. */
+ dump_n = i;
+
+ fclose(f);
+
+ if (dump_n == 0) {
+ ut_free(dump);
+ ut_sprintf_timestamp(now);
+ buf_load_status(STATUS_NOTICE,
+ "Buffer pool(s) load completed at %s "
+ "(%s was empty)", now, full_filename);
+ return;
+ }
+
+ if (!SHUTTING_DOWN()) {
+ buf_dump_sort(dump, dump_tmp, 0, dump_n);
+ }
+
+ ut_free(dump_tmp);
+
+ for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+
+ buf_read_page_async(BUF_DUMP_SPACE(dump[i]),
+ BUF_DUMP_PAGE(dump[i]));
+
+ if (i % 64 == 63) {
+ os_aio_simulated_wake_handler_threads();
+ }
+
+ if (i % 128 == 0) {
+ buf_load_status(STATUS_INFO,
+ "Loaded " ULINTPF "/" ULINTPF " pages",
+ i + 1, dump_n);
+ }
+
+ if (buf_load_abort_flag) {
+ buf_load_abort_flag = FALSE;
+ ut_free(dump);
+ buf_load_status(
+ STATUS_NOTICE,
+ "Buffer pool(s) load aborted on request");
+ return;
+ }
+ }
+
+ ut_free(dump);
+
+ ut_sprintf_timestamp(now);
+
+ buf_load_status(STATUS_NOTICE,
+ "Buffer pool(s) load completed at %s", now);
+}
+
+/*****************************************************************//**
+Aborts a currently running buffer pool load. This function is called by
+MySQL code via buffer_pool_load_abort() and it should return immediately
+because all of MySQL is frozen during its execution. */
+UNIV_INTERN
+void
+buf_load_abort()
+/*============*/
+{
+ buf_load_abort_flag = TRUE;
+}
+
+/*****************************************************************//**
+This is the main thread for buffer pool dump/load. It waits for an
+event and, when woken up, either performs a dump or load and sleeps
+again.
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_dump_thread)(
+/*============================*/
+ void* arg __attribute__((unused))) /*!< in: a dummy parameter
+ required by os_thread_create */
+{
+ ut_ad(!srv_read_only_mode);
+
+ srv_buf_dump_thread_active = TRUE;
+
+ buf_dump_status(STATUS_INFO, "not started");
+ buf_load_status(STATUS_INFO, "not started");
+
+ if (srv_buffer_pool_load_at_startup) {
+ buf_load();
+ }
+
+ while (!SHUTTING_DOWN()) {
+
+ os_event_wait(srv_buf_dump_event);
+
+ if (buf_dump_should_start) {
+ buf_dump_should_start = FALSE;
+ buf_dump(TRUE /* quit on shutdown */);
+ }
+
+ if (buf_load_should_start) {
+ buf_load_should_start = FALSE;
+ buf_load();
+ }
+
+ os_event_reset(srv_buf_dump_event);
+ }
+
+ if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) {
+ buf_dump(FALSE /* ignore shutdown down flag,
+ keep going even if we are in a shutdown state */);
+ }
+
+ srv_buf_dump_thread_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
diff --git a/storage/xtradb/buf/buf0flu.c b/storage/xtradb/buf/buf0flu.c
deleted file mode 100644
index fea665eba40..00000000000
--- a/storage/xtradb/buf/buf0flu.c
+++ /dev/null
@@ -1,2402 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file buf/buf0flu.c
-The database buffer buf_pool flush algorithm
-
-Created 11/11/1995 Heikki Tuuri
-*******************************************************/
-
-#include "buf0flu.h"
-
-#ifdef UNIV_NONINL
-#include "buf0flu.ic"
-#endif
-
-#include "buf0buf.h"
-#include "srv0srv.h"
-#include "page0zip.h"
-#ifndef UNIV_HOTBACKUP
-#include "ut0byte.h"
-#include "ut0lst.h"
-#include "page0page.h"
-#include "fil0fil.h"
-#include "buf0lru.h"
-#include "buf0rea.h"
-#include "ibuf0ibuf.h"
-#include "log0log.h"
-#include "os0file.h"
-#include "trx0sys.h"
-#include "mysql/plugin.h"
-#include "mysql/service_thd_wait.h"
-
-/**********************************************************************
-These statistics are generated for heuristics used in estimating the
-rate at which we should flush the dirty blocks to avoid bursty IO
-activity. Note that the rate of flushing not only depends on how many
-dirty pages we have in the buffer pool but it is also a function of
-how much redo the workload is generating and at what rate. */
-/* @{ */
-
-/** Number of intervals for which we keep the history of these stats.
-Each interval is 1 second, defined by the rate at which
-srv_error_monitor_thread() calls buf_flush_stat_update(). */
-#define BUF_FLUSH_STAT_N_INTERVAL 20
-
-/** Sampled values buf_flush_stat_cur.
-Not protected by any mutex. Updated by buf_flush_stat_update(). */
-static buf_flush_stat_t buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];
-
-/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
-static ulint buf_flush_stat_arr_ind;
-
-/** Values at start of the current interval. Reset by
-buf_flush_stat_update(). */
-static buf_flush_stat_t buf_flush_stat_cur;
-
-/** Running sum of past values of buf_flush_stat_cur.
-Updated by buf_flush_stat_update(). Not protected by any mutex. */
-static buf_flush_stat_t buf_flush_stat_sum;
-
-/** Number of pages flushed through non flush_list flushes. */
-// static ulint buf_lru_flush_page_count = 0;
-
-/* @} */
-
-/******************************************************************//**
-Increases flush_list size in bytes with zip_size for compressed page,
-UNIV_PAGE_SIZE for uncompressed page in inline function */
-static inline
-void
-incr_flush_list_size_in_bytes(
-/*==========================*/
- buf_block_t* block, /*!< in: control block */
- buf_pool_t* buf_pool) /*!< in: buffer pool instance */
-{
- ulint zip_size;
- ut_ad(buf_flush_list_mutex_own(buf_pool));
- zip_size = page_zip_get_size(&block->page.zip);
- buf_pool->stat.flush_list_bytes += zip_size ? zip_size : UNIV_PAGE_SIZE;
- ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
-}
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/******************************************************************//**
-Validates the flush list.
-@return TRUE if ok */
-static
-ibool
-buf_flush_validate_low(
-/*===================*/
- buf_pool_t* buf_pool); /*!< in: Buffer pool instance */
-
-/******************************************************************//**
-Validates the flush list some of the time.
-@return TRUE if ok or the check was skipped */
-static
-ibool
-buf_flush_validate_skip(
-/*====================*/
- buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
-{
-/** Try buf_flush_validate_low() every this many times */
-# define BUF_FLUSH_VALIDATE_SKIP 23
-
- /** The buf_flush_validate_low() call skip counter.
- Use a signed type because of the race condition below. */
- static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
-
- /* There is a race condition below, but it does not matter,
- because this call is only for heuristic purposes. We want to
- reduce the call frequency of the costly buf_flush_validate_low()
- check in debug builds. */
- if (--buf_flush_validate_count > 0) {
- return(TRUE);
- }
-
- buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
- return(buf_flush_validate_low(buf_pool));
-}
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
-/******************************************************************//**
-Insert a block in the flush_rbt and returns a pointer to its
-predecessor or NULL if no predecessor. The ordering is maintained
-on the basis of the <oldest_modification, space, offset> key.
-@return pointer to the predecessor or NULL if no predecessor. */
-static
-buf_page_t*
-buf_flush_insert_in_flush_rbt(
-/*==========================*/
- buf_page_t* bpage) /*!< in: bpage to be inserted. */
-{
- const ib_rbt_node_t* c_node;
- const ib_rbt_node_t* p_node;
- buf_page_t* prev = NULL;
- buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
-
- ut_ad(buf_flush_list_mutex_own(buf_pool));
-
- /* Insert this buffer into the rbt. */
- c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
- ut_a(c_node != NULL);
-
- /* Get the predecessor. */
- p_node = rbt_prev(buf_pool->flush_rbt, c_node);
-
- if (p_node != NULL) {
- buf_page_t** value;
- value = rbt_value(buf_page_t*, p_node);
- prev = *value;
- ut_a(prev != NULL);
- }
-
- return(prev);
-}
-
-/*********************************************************//**
-Delete a bpage from the flush_rbt. */
-static
-void
-buf_flush_delete_from_flush_rbt(
-/*============================*/
- buf_page_t* bpage) /*!< in: bpage to be removed. */
-{
-#ifdef UNIV_DEBUG
- ibool ret = FALSE;
-#endif /* UNIV_DEBUG */
- buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
-
- ut_ad(buf_flush_list_mutex_own(buf_pool));
-
-#ifdef UNIV_DEBUG
- ret =
-#endif /* UNIV_DEBUG */
- rbt_delete(buf_pool->flush_rbt, &bpage);
-
- ut_ad(ret);
-}
-
-/*****************************************************************//**
-Compare two modified blocks in the buffer pool. The key for comparison
-is:
-key = <oldest_modification, space, offset>
-This comparison is used to maintain ordering of blocks in the
-buf_pool->flush_rbt.
-Note that for the purpose of flush_rbt, we only need to order blocks
-on the oldest_modification. The other two fields are used to uniquely
-identify the blocks.
-@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
-static
-int
-buf_flush_block_cmp(
-/*================*/
- const void* p1, /*!< in: block1 */
- const void* p2) /*!< in: block2 */
-{
- int ret;
- const buf_page_t* b1 = *(const buf_page_t**) p1;
- const buf_page_t* b2 = *(const buf_page_t**) p2;
-#ifdef UNIV_DEBUG
- buf_pool_t* buf_pool = buf_pool_from_bpage(b1);
-#endif /* UNIV_DEBUG */
-
- ut_ad(b1 != NULL);
- ut_ad(b2 != NULL);
-
- ut_ad(buf_flush_list_mutex_own(buf_pool));
-
- ut_ad(b1->in_flush_list);
- ut_ad(b2->in_flush_list);
-
- if (b2->oldest_modification > b1->oldest_modification) {
- return(1);
- } else if (b2->oldest_modification < b1->oldest_modification) {
- return(-1);
- }
-
- /* If oldest_modification is same then decide on the space. */
- ret = (int)(b2->space - b1->space);
-
- /* Or else decide ordering on the offset field. */
- return(ret ? ret : (int)(b2->offset - b1->offset));
-}
-
-/********************************************************************//**
-Initialize the red-black tree to speed up insertions into the flush_list
-during recovery process. Should be called at the start of recovery
-process before any page has been read/written. */
-UNIV_INTERN
-void
-buf_flush_init_flush_rbt(void)
-/*==========================*/
-{
- ulint i;
-
- for (i = 0; i < srv_buf_pool_instances; i++) {
- buf_pool_t* buf_pool;
-
- buf_pool = buf_pool_from_array(i);
-
- buf_flush_list_mutex_enter(buf_pool);
-
- /* Create red black tree for speedy insertions in flush list. */
- buf_pool->flush_rbt = rbt_create(
- sizeof(buf_page_t*), buf_flush_block_cmp);
-
- buf_flush_list_mutex_exit(buf_pool);
- }
-}
-
-/********************************************************************//**
-Frees up the red-black tree. */
-UNIV_INTERN
-void
-buf_flush_free_flush_rbt(void)
-/*==========================*/
-{
- ulint i;
-
- for (i = 0; i < srv_buf_pool_instances; i++) {
- buf_pool_t* buf_pool;
-
- buf_pool = buf_pool_from_array(i);
-
- buf_flush_list_mutex_enter(buf_pool);
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
- ut_a(buf_flush_validate_low(buf_pool));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
- rbt_free(buf_pool->flush_rbt);
- buf_pool->flush_rbt = NULL;
-
- buf_flush_list_mutex_exit(buf_pool);
- }
-}
-
-/********************************************************************//**
-Inserts a modified block into the flush list. */
-UNIV_INTERN
-void
-buf_flush_insert_into_flush_list(
-/*=============================*/
- buf_pool_t* buf_pool, /*!< buffer pool instance */
- buf_block_t* block, /*!< in/out: block which is modified */
- ib_uint64_t lsn) /*!< in: oldest modification */
-{
- ut_ad(!buf_pool_mutex_own(buf_pool));
- ut_ad(log_flush_order_mutex_own());
- ut_ad(mutex_own(&block->mutex));
-
- buf_flush_list_mutex_enter(buf_pool);
-
- ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
- || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
- <= lsn));
-
- /* If we are in the recovery then we need to update the flush
- red-black tree as well. */
- if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
- buf_flush_list_mutex_exit(buf_pool);
- buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
- return;
- }
-
- ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
- ut_ad(!block->page.in_flush_list);
-
- ut_d(block->page.in_flush_list = TRUE);
- block->page.oldest_modification = lsn;
- UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page);
- incr_flush_list_size_in_bytes(block, buf_pool);
-
-#ifdef UNIV_DEBUG_VALGRIND
- {
- ulint zip_size = buf_block_get_zip_size(block);
-
- if (UNIV_UNLIKELY(zip_size)) {
- UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
- } else {
- UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
- }
- }
-#endif /* UNIV_DEBUG_VALGRIND */
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
- ut_a(buf_flush_validate_skip(buf_pool));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
- buf_flush_list_mutex_exit(buf_pool);
-}
-
-/********************************************************************//**
-Inserts a modified block into the flush list in the right sorted position.
-This function is used by recovery, because there the modifications do not
-necessarily come in the order of lsn's. */
-UNIV_INTERN
-void
-buf_flush_insert_sorted_into_flush_list(
-/*====================================*/
- buf_pool_t* buf_pool, /*!< in: buffer pool instance */
- buf_block_t* block, /*!< in/out: block which is modified */
- ib_uint64_t lsn) /*!< in: oldest modification */
-{
- buf_page_t* prev_b;
- buf_page_t* b;
-
- ut_ad(!buf_pool_mutex_own(buf_pool));
- ut_ad(log_flush_order_mutex_own());
- ut_ad(mutex_own(&block->mutex));
- ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-
- buf_flush_list_mutex_enter(buf_pool);
-
- /* The field in_LRU_list is protected by buf_pool->mutex, which
- we are not holding. However, while a block is in the flush
- list, it is dirty and cannot be discarded, not from the
- page_hash or from the LRU list. At most, the uncompressed
- page frame of a compressed block may be discarded or created
- (copying the block->page to or from a buf_page_t that is
- dynamically allocated from buf_buddy_alloc()). Because those
- transitions hold block->mutex and the flush list mutex (via
- buf_flush_relocate_on_flush_list()), there is no possibility
- of a race condition in the assertions below. */
- ut_ad(block->page.in_LRU_list);
- ut_ad(block->page.in_page_hash);
- /* buf_buddy_block_register() will take a block in the
- BUF_BLOCK_MEMORY state, not a file page. */
- ut_ad(!block->page.in_zip_hash);
-
- ut_ad(!block->page.in_flush_list);
- ut_d(block->page.in_flush_list = TRUE);
- block->page.oldest_modification = lsn;
-
-#ifdef UNIV_DEBUG_VALGRIND
- {
- ulint zip_size = buf_block_get_zip_size(block);
-
- if (UNIV_UNLIKELY(zip_size)) {
- UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
- } else {
- UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
- }
- }
-#endif /* UNIV_DEBUG_VALGRIND */
-
- prev_b = NULL;
-
- /* For the most part when this function is called the flush_rbt
- should not be NULL. In a very rare boundary case it is possible
- that the flush_rbt has already been freed by the recovery thread
- before the last page was hooked up in the flush_list by the
- io-handler thread. In that case we'll just do a simple
- linear search in the else block. */
- if (buf_pool->flush_rbt) {
-
- prev_b = buf_flush_insert_in_flush_rbt(&block->page);
-
- } else {
-
- b = UT_LIST_GET_FIRST(buf_pool->flush_list);
-
- while (b && b->oldest_modification
- > block->page.oldest_modification) {
- ut_ad(b->in_flush_list);
- prev_b = b;
- b = UT_LIST_GET_NEXT(flush_list, b);
- }
- }
-
- if (prev_b == NULL) {
- UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page);
- } else {
- UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list,
- prev_b, &block->page);
- }
-
- incr_flush_list_size_in_bytes(block, buf_pool);
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
- ut_a(buf_flush_validate_low(buf_pool));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
- buf_flush_list_mutex_exit(buf_pool);
-}
-
-/********************************************************************//**
-Returns TRUE if the file page block is immediately suitable for replacement,
-i.e., the transition FILE_PAGE => NOT_USED allowed.
-@return TRUE if can replace immediately */
-UNIV_INTERN
-ibool
-buf_flush_ready_for_replace(
-/*========================*/
- buf_page_t* bpage) /*!< in: buffer control block, must be
- buf_page_in_file(bpage) and in the LRU list */
-{
-#ifdef UNIV_DEBUG
- //buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
- //ut_ad(buf_pool_mutex_own(buf_pool));
-#endif
- ut_ad(mutex_own(buf_page_get_mutex(bpage)));
- //ut_ad(bpage->in_LRU_list);
-
- if (UNIV_LIKELY(bpage->in_LRU_list && buf_page_in_file(bpage))) {
-
- return(bpage->oldest_modification == 0
- && buf_page_get_io_fix(bpage) == BUF_IO_NONE
- && bpage->buf_fix_count == 0);
- }
-
- /* permited not to own LRU_mutex.. */
-/*
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: Error: buffer block state %lu"
- " in the LRU list!\n",
- (ulong) buf_page_get_state(bpage));
- ut_print_buf(stderr, bpage, sizeof(buf_page_t));
- putc('\n', stderr);
-*/
-
- return(FALSE);
-}
-
-/********************************************************************//**
-Returns TRUE if the block is modified and ready for flushing.
-@return TRUE if can flush immediately */
-UNIV_INLINE
-ibool
-buf_flush_ready_for_flush(
-/*======================*/
- buf_page_t* bpage, /*!< in: buffer control block, must be
- buf_page_in_file(bpage) */
- enum buf_flush flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
-{
-#ifdef UNIV_DEBUG
- //buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
- //ut_ad(buf_pool_mutex_own(buf_pool));
-#endif
- //ut_a(buf_page_in_file(bpage));
- ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
- ut_ad(mutex_own(buf_page_get_mutex(bpage))
- || flush_type == BUF_FLUSH_LIST);
-
- if (buf_page_in_file(bpage) && bpage->oldest_modification != 0
- && buf_page_get_io_fix_unlocked(bpage) == BUF_IO_NONE) {
- ut_ad(bpage->in_flush_list);
-
- if (flush_type != BUF_FLUSH_LRU) {
-
- return(TRUE);
-
- } else if (bpage->buf_fix_count == 0) {
-
- /* If we are flushing the LRU list, to avoid deadlocks
- we require the block not to be bufferfixed, and hence
- not latched. */
-
- return(TRUE);
- }
- }
-
- return(FALSE);
-}
-
-/********************************************************************//**
-Remove a block from the flush list of modified blocks. */
-UNIV_INTERN
-void
-buf_flush_remove(
-/*=============*/
- buf_page_t* bpage) /*!< in: pointer to the block in question */
-{
- buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
- ulint zip_size;
-
- ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
- ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_DIRTY
- || mutex_own(&buf_pool->LRU_list_mutex));
-#endif
- ut_ad(bpage->in_flush_list);
-
- buf_flush_list_mutex_enter(buf_pool);
-
- switch (buf_page_get_state(bpage)) {
- case BUF_BLOCK_ZIP_PAGE:
- /* Clean compressed pages should not be on the flush list */
- case BUF_BLOCK_ZIP_FREE:
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_READY_FOR_USE:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- ut_error;
- return;
- case BUF_BLOCK_ZIP_DIRTY:
- buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
- UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
- buf_LRU_insert_zip_clean(bpage);
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
- break;
- case BUF_BLOCK_FILE_PAGE:
- UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
- break;
- }
-
- /* If the flush_rbt is active then delete from there as well. */
- if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
- buf_flush_delete_from_flush_rbt(bpage);
- }
-
- /* Must be done after we have removed it from the flush_rbt
- because we assert on in_flush_list in comparison function. */
- ut_d(bpage->in_flush_list = FALSE);
-
- zip_size = page_zip_get_size(&bpage->zip);
- buf_pool->stat.flush_list_bytes -= zip_size ? zip_size : UNIV_PAGE_SIZE;
-
- bpage->oldest_modification = 0;
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
- ut_a(buf_flush_validate_skip(buf_pool));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
- buf_flush_list_mutex_exit(buf_pool);
-}
-
-/*******************************************************************//**
-Relocates a buffer control block on the flush_list.
-Note that it is assumed that the contents of bpage have already been
-copied to dpage.
-IMPORTANT: When this function is called bpage and dpage are not
-exact copies of each other. For example, they both will have different
-::state. Also the ::list pointers in dpage may be stale. We need to
-use the current list node (bpage) to do the list manipulation because
-the list pointers could have changed between the time that we copied
-the contents of bpage to the dpage and the flush list manipulation
-below. */
-UNIV_INTERN
-void
-buf_flush_relocate_on_flush_list(
-/*=============================*/
- buf_page_t* bpage, /*!< in/out: control block being moved */
- buf_page_t* dpage) /*!< in/out: destination block */
-{
- buf_page_t* prev;
- buf_page_t* prev_b = NULL;
- buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
-
- //ut_ad(buf_pool_mutex_own(buf_pool));
- /* Must reside in the same buffer pool. */
- ut_ad(buf_pool == buf_pool_from_bpage(dpage));
-
- ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-
- buf_flush_list_mutex_enter(buf_pool);
-
- /* FIXME: At this point we have both buf_pool and flush_list
- mutexes. Theoretically removal of a block from flush list is
- only covered by flush_list mutex but currently we do
- have buf_pool mutex in buf_flush_remove() therefore this block
- is guaranteed to be in the flush list. We need to check if
- this will work without the assumption of block removing code
- having the buf_pool mutex. */
- ut_ad(bpage->in_flush_list);
- ut_ad(dpage->in_flush_list);
-
- /* If recovery is active we must swap the control blocks in
- the flush_rbt as well. */
- if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
- buf_flush_delete_from_flush_rbt(bpage);
- prev_b = buf_flush_insert_in_flush_rbt(dpage);
- }
-
- /* Must be done after we have removed it from the flush_rbt
- because we assert on in_flush_list in comparison function. */
- ut_d(bpage->in_flush_list = FALSE);
-
- prev = UT_LIST_GET_PREV(flush_list, bpage);
- UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
-
- if (prev) {
- ut_ad(prev->in_flush_list);
- UT_LIST_INSERT_AFTER(
- flush_list,
- buf_pool->flush_list,
- prev, dpage);
- } else {
- UT_LIST_ADD_FIRST(
- flush_list,
- buf_pool->flush_list,
- dpage);
- }
-
- /* Just an extra check. Previous in flush_list
- should be the same control block as in flush_rbt. */
- ut_a(!buf_pool->flush_rbt || prev_b == prev);
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
- ut_a(buf_flush_validate_low(buf_pool));
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-
- buf_flush_list_mutex_exit(buf_pool);
-}
-
-/********************************************************************//**
-Updates the flush system data structures when a write is completed. */
-UNIV_INTERN
-void
-buf_flush_write_complete(
-/*=====================*/
- buf_page_t* bpage) /*!< in: pointer to the block in question */
-{
- enum buf_flush flush_type;
- buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
-
- ut_ad(bpage);
-
- buf_flush_remove(bpage);
-
- flush_type = buf_page_get_flush_type(bpage);
- buf_pool->n_flush[flush_type]--;
-
- if (flush_type == BUF_FLUSH_LRU) {
- /* Put the block to the end of the LRU list to wait to be
- moved to the free list */
-
- buf_LRU_make_block_old(bpage);
-
- buf_pool->LRU_flush_ended++;
- }
-
- /* fprintf(stderr, "n pending flush %lu\n",
- buf_pool->n_flush[flush_type]); */
-
- if (buf_pool->n_flush[flush_type] == 0
- && buf_pool->init_flush[flush_type] == FALSE) {
-
- /* The running flush batch has ended */
-
- os_event_set(buf_pool->no_flush[flush_type]);
- }
-}
-
-/********************************************************************//**
-Flush a batch of writes to the datafiles that have already been
-written by the OS. */
-static
-void
-buf_flush_sync_datafiles(void)
-/*==========================*/
-{
- /* Wake possible simulated aio thread to actually post the
- writes to the operating system */
- os_aio_simulated_wake_handler_threads();
-
- /* Wait that all async writes to tablespaces have been posted to
- the OS */
- os_aio_wait_until_no_pending_writes();
-
- /* Now we flush the data to disk (for example, with fsync) */
- fil_flush_file_spaces(FIL_TABLESPACE);
-
- return;
-}
-
-/********************************************************************//**
-Flushes possible buffered writes from the doublewrite memory buffer to disk,
-and also wakes up the aio thread if simulated aio is used. It is very
-important to call this function after a batch of writes has been posted,
-and also when we may have to wait for a page latch! Otherwise a deadlock
-of threads can occur. */
-static
-void
-buf_flush_buffered_writes(void)
-/*===========================*/
-{
- byte* write_buf;
- ulint len;
- ulint len2;
- ulint i;
-
- if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
- /* Sync the writes to the disk. */
- buf_flush_sync_datafiles();
- return;
- }
-
- mutex_enter(&(trx_doublewrite->mutex));
-
- /* Write first to doublewrite buffer blocks. We use synchronous
- aio and thus know that file write has been completed when the
- control returns. */
-
- if (trx_doublewrite->first_free == 0) {
-
- mutex_exit(&(trx_doublewrite->mutex));
-
- return;
- }
-
- for (i = 0; i < trx_doublewrite->first_free; i++) {
-
- const buf_block_t* block;
-
- block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];
-
- if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
- || block->page.zip.data) {
- /* No simple validate for compressed pages exists. */
- continue;
- }
-
- if (UNIV_UNLIKELY
- (memcmp(block->frame + (FIL_PAGE_LSN + 4),
- block->frame + (UNIV_PAGE_SIZE
- - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
- 4))) {
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: ERROR: The page to be written"
- " seems corrupt!\n"
- "InnoDB: The lsn fields do not match!"
- " Noticed in the buffer pool\n"
- "InnoDB: before posting to the"
- " doublewrite buffer.\n");
- }
-
- if (!block->check_index_page_at_flush) {
- } else if (page_is_comp(block->frame)) {
- if (UNIV_UNLIKELY
- (!page_simple_validate_new(block->frame))) {
-corrupted_page:
- buf_page_print(block->frame, 0,
- BUF_PAGE_PRINT_NO_CRASH);
-
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: Apparent corruption of an"
- " index page n:o %lu in space %lu\n"
- "InnoDB: to be written to data file."
- " We intentionally crash server\n"
- "InnoDB: to prevent corrupt data"
- " from ending up in data\n"
- "InnoDB: files.\n",
- (ulong) buf_block_get_page_no(block),
- (ulong) buf_block_get_space(block));
-
- ut_error;
- }
- } else if (UNIV_UNLIKELY
- (!page_simple_validate_old(block->frame))) {
-
- goto corrupted_page;
- }
- }
-
- /* increment the doublewrite flushed pages counter */
- srv_dblwr_pages_written+= trx_doublewrite->first_free;
- srv_dblwr_writes++;
-
- len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
- trx_doublewrite->first_free) * UNIV_PAGE_SIZE;
-
- write_buf = trx_doublewrite->write_buf;
- i = 0;
-
- fil_io(OS_FILE_WRITE, TRUE,
- (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0,
- trx_doublewrite->block1, 0, len,
- (void*) write_buf, NULL);
-
- for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
- len2 += UNIV_PAGE_SIZE, i++) {
- const buf_block_t* block = (buf_block_t*)
- trx_doublewrite->buf_block_arr[i];
-
- if (UNIV_LIKELY(!block->page.zip.data)
- && UNIV_LIKELY(buf_block_get_state(block)
- == BUF_BLOCK_FILE_PAGE)
- && UNIV_UNLIKELY
- (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
- write_buf + len2
- + (UNIV_PAGE_SIZE
- - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: ERROR: The page to be written"
- " seems corrupt!\n"
- "InnoDB: The lsn fields do not match!"
- " Noticed in the doublewrite block1.\n");
- }
- }
-
- if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
- goto flush;
- }
-
- len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
- * UNIV_PAGE_SIZE;
-
- write_buf = trx_doublewrite->write_buf
- + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
- ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);
-
- fil_io(OS_FILE_WRITE, TRUE,
- (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0,
- trx_doublewrite->block2, 0, len,
- (void*) write_buf, NULL);
-
- for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
- len2 += UNIV_PAGE_SIZE, i++) {
- const buf_block_t* block = (buf_block_t*)
- trx_doublewrite->buf_block_arr[i];
-
- if (UNIV_LIKELY(!block->page.zip.data)
- && UNIV_LIKELY(buf_block_get_state(block)
- == BUF_BLOCK_FILE_PAGE)
- && UNIV_UNLIKELY
- (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
- write_buf + len2
- + (UNIV_PAGE_SIZE
- - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: ERROR: The page to be"
- " written seems corrupt!\n"
- "InnoDB: The lsn fields do not match!"
- " Noticed in"
- " the doublewrite block2.\n");
- }
- }
-
-flush:
- /* Now flush the doublewrite buffer data to disk */
-
- fil_flush(srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE, FALSE);
-
- /* We know that the writes have been flushed to disk now
- and in recovery we will find them in the doublewrite buffer
- blocks. Next do the writes to the intended positions. */
-
- for (i = 0; i < trx_doublewrite->first_free; i++) {
- const buf_block_t* block = (buf_block_t*)
- trx_doublewrite->buf_block_arr[i];
-
- ut_a(buf_page_in_file(&block->page));
- if (UNIV_LIKELY_NULL(block->page.zip.data)) {
- fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
- FALSE, buf_page_get_space(&block->page),
- buf_page_get_zip_size(&block->page),
- buf_page_get_page_no(&block->page), 0,
- buf_page_get_zip_size(&block->page),
- (void*)block->page.zip.data,
- (void*)block);
-
- /* Increment the counter of I/O operations used
- for selecting LRU policy. */
- buf_LRU_stat_inc_io();
-
- continue;
- }
-
- ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-
- if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
- block->frame
- + (UNIV_PAGE_SIZE
- - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
- 4))) {
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: ERROR: The page to be written"
- " seems corrupt!\n"
- "InnoDB: The lsn fields do not match!"
- " Noticed in the buffer pool\n"
- "InnoDB: after posting and flushing"
- " the doublewrite buffer.\n"
- "InnoDB: Page buf fix count %lu,"
- " io fix %lu, state %lu\n",
- (ulong)block->page.buf_fix_count,
- (ulong)buf_block_get_io_fix_unlocked(block),
- (ulong)buf_block_get_state(block));
- }
-
- fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
- FALSE, buf_block_get_space(block), 0,
- buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
- (void*)block->frame, (void*)block);
-
- /* Increment the counter of I/O operations used
- for selecting LRU policy. */
- buf_LRU_stat_inc_io();
- }
-
- /* Sync the writes to the disk. */
- buf_flush_sync_datafiles();
-
- /* We can now reuse the doublewrite memory buffer: */
- trx_doublewrite->first_free = 0;
-
- mutex_exit(&(trx_doublewrite->mutex));
-}
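-
-/* Editor's note: an illustrative sketch, not part of the original source.
-The corruption check repeated in buf_flush_buffered_writes() compares the
-low 32 bits of the LSN stored in the page header (FIL_PAGE_LSN) with the
-copy kept in the 8-byte page trailer (FIL_PAGE_END_LSN_OLD_CHKSUM), whose
-first 4 bytes hold the old-formula checksum.  A hypothetical helper doing
-the same check could look like:
-
-	static ibool
-	page_lsn_fields_match(const byte* frame)
-	{
-		return(!memcmp(frame + FIL_PAGE_LSN + 4,
-			       frame + UNIV_PAGE_SIZE
-			       - FIL_PAGE_END_LSN_OLD_CHKSUM + 4,
-			       4));
-	}
-
-page_lsn_fields_match() is an invented name; the code above inlines the
-same memcmp() directly at each check site. */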
-
-/********************************************************************//**
-Posts a buffer page for writing. If the doublewrite memory buffer is
-full, calls buf_flush_buffered_writes and waits for free space to
-appear. */
-static
-void
-buf_flush_post_to_doublewrite_buf(
-/*==============================*/
- buf_page_t* bpage) /*!< in: buffer block to write */
-{
- ulint zip_size;
-try_again:
- mutex_enter(&(trx_doublewrite->mutex));
-
- ut_a(buf_page_in_file(bpage));
-
- if (trx_doublewrite->first_free
- >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
- mutex_exit(&(trx_doublewrite->mutex));
-
- buf_flush_buffered_writes();
-
- goto try_again;
- }
-
- zip_size = buf_page_get_zip_size(bpage);
-
- if (UNIV_UNLIKELY(zip_size)) {
- UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
- /* Copy the compressed page and clear the rest. */
- memcpy(trx_doublewrite->write_buf
- + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
- bpage->zip.data, zip_size);
- memset(trx_doublewrite->write_buf
- + UNIV_PAGE_SIZE * trx_doublewrite->first_free
- + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
- } else {
- ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
- UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
- UNIV_PAGE_SIZE);
-
- memcpy(trx_doublewrite->write_buf
- + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
- ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
- }
-
- trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;
-
- trx_doublewrite->first_free++;
-
- if (trx_doublewrite->first_free
- >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
- mutex_exit(&(trx_doublewrite->mutex));
-
- buf_flush_buffered_writes();
-
- return;
- }
-
- mutex_exit(&(trx_doublewrite->mutex));
-}
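-
-/* Editor's note: an illustrative observation, not part of the original
-source.  Each posted page occupies one UNIV_PAGE_SIZE slot in
-trx_doublewrite->write_buf, so slot i starts at
-write_buf + i * UNIV_PAGE_SIZE.  Assuming the stock value of
-TRX_SYS_DOUBLEWRITE_BLOCK_SIZE (64 pages), the buffer is flushed as soon
-as first_free reaches 2 * 64 = 128 buffered pages, i.e. 2 MiB with the
-default 16 KiB page size. */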
-#endif /* !UNIV_HOTBACKUP */
-
-/********************************************************************//**
-Initializes a page for writing to the tablespace. */
-UNIV_INTERN
-void
-buf_flush_init_for_writing(
-/*=======================*/
- byte* page, /*!< in/out: page */
- void* page_zip_, /*!< in/out: compressed page, or NULL */
- ib_uint64_t newest_lsn) /*!< in: newest modification lsn
- to the page */
-{
- ut_ad(page);
-
- if (page_zip_) {
- page_zip_des_t* page_zip = page_zip_;
- ulint zip_size = page_zip_get_size(page_zip);
- ut_ad(zip_size);
- ut_ad(ut_is_2pow(zip_size));
- ut_ad(zip_size <= UNIV_PAGE_SIZE);
-
- switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
- case FIL_PAGE_TYPE_ALLOCATED:
- case FIL_PAGE_INODE:
- case FIL_PAGE_IBUF_BITMAP:
- case FIL_PAGE_TYPE_FSP_HDR:
- case FIL_PAGE_TYPE_XDES:
- /* These are essentially uncompressed pages. */
- memcpy(page_zip->data, page, zip_size);
- /* fall through */
- case FIL_PAGE_TYPE_ZBLOB:
- case FIL_PAGE_TYPE_ZBLOB2:
- case FIL_PAGE_INDEX:
- mach_write_to_8(page_zip->data
- + FIL_PAGE_LSN, newest_lsn);
- memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
- mach_write_to_4(page_zip->data
- + FIL_PAGE_SPACE_OR_CHKSUM,
- srv_use_checksums
- ? page_zip_calc_checksum(
- page_zip->data, zip_size)
- : BUF_NO_CHECKSUM_MAGIC);
- return;
- }
-
- ut_print_timestamp(stderr);
- fputs(" InnoDB: ERROR: The compressed page to be written"
- " seems corrupt:", stderr);
- ut_print_buf(stderr, page, zip_size);
- fputs("\nInnoDB: Possibly older version of the page:", stderr);
- ut_print_buf(stderr, page_zip->data, zip_size);
- putc('\n', stderr);
- ut_error;
- }
-
- /* Write the newest modification lsn to the page header and trailer */
- mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
-
- mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
- newest_lsn);
-
- /* Store the new formula checksum */
-
- mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
- srv_use_checksums
- ? (!srv_fast_checksum
- ? buf_calc_page_new_checksum(page)
- : buf_calc_page_new_checksum_32(page))
- : BUF_NO_CHECKSUM_MAGIC);
-
- /* We overwrite the first 4 bytes of the end lsn field to store
- the old formula checksum. Since it depends also on the field
- FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
- new formula checksum. */
-
- mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
- srv_use_checksums
- ? buf_calc_page_old_checksum(page)
- : BUF_NO_CHECKSUM_MAGIC);
-}
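-
-/* Editor's note: an illustrative sketch, not part of the original source.
-For an uncompressed page the order of the two checksum writes above
-matters: the old-formula checksum also covers the bytes at
-FIL_PAGE_SPACE_OR_CHKSUM, so it can only be computed after the
-new-formula checksum has been stored there.  Stripped of the
-srv_use_checksums and srv_fast_checksum branches, the sequence is:
-
-	mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
-	mach_write_to_8(page + UNIV_PAGE_SIZE
-			- FIL_PAGE_END_LSN_OLD_CHKSUM, newest_lsn);
-	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
-			buf_calc_page_new_checksum(page));
-	mach_write_to_4(page + UNIV_PAGE_SIZE
-			- FIL_PAGE_END_LSN_OLD_CHKSUM,
-			buf_calc_page_old_checksum(page));
-
-The last write deliberately overwrites only the first 4 bytes of the
-8-byte trailer; the remaining 4 bytes still hold the low half of the
-LSN. */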
-
-#ifndef UNIV_HOTBACKUP
-/********************************************************************//**
-Does an asynchronous write of a buffer page. NOTE: in simulated aio and
-also when the doublewrite buffer is used, we must call
-buf_flush_buffered_writes after we have posted a batch of writes! */
-static
-void
-buf_flush_write_block_low(
-/*======================*/
- buf_page_t* bpage) /*!< in: buffer block to write */
-{
- ulint zip_size = buf_page_get_zip_size(bpage);
- page_t* frame = NULL;
-
-#ifdef UNIV_DEBUG
- buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
- //ut_ad(!buf_pool_mutex_own(buf_pool));
-#endif
-
-#ifdef UNIV_LOG_DEBUG
- static ibool univ_log_debug_warned;
-#endif /* UNIV_LOG_DEBUG */
-
- ut_ad(buf_page_in_file(bpage));
-
- /* We are not holding buf_pool->mutex or block_mutex here.
- Nevertheless, it is safe to access bpage, because it is
- io_fixed and oldest_modification != 0. Thus, it cannot be
- relocated in the buffer pool or removed from flush_list or
- LRU_list. */
- //ut_ad(!buf_pool_mutex_own(buf_pool));
- ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
- ut_ad(!buf_flush_list_mutex_own(buf_pool));
- ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
- ut_ad(buf_page_get_io_fix_unlocked(bpage) == BUF_IO_WRITE);
- ut_ad(bpage->oldest_modification != 0);
-
-#ifdef UNIV_IBUF_COUNT_DEBUG
- ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
-#endif
- ut_ad(bpage->newest_modification != 0);
-
-#ifdef UNIV_LOG_DEBUG
- if (!univ_log_debug_warned) {
- univ_log_debug_warned = TRUE;
- fputs("Warning: cannot force log to disk if"
- " UNIV_LOG_DEBUG is defined!\n"
- "Crash recovery will not work!\n",
- stderr);
- }
-#else
- /* Force the log to the disk before writing the modified block */
- log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
-#endif
- switch (buf_page_get_state(bpage)) {
- case BUF_BLOCK_ZIP_FREE:
- case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_READY_FOR_USE:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- ut_error;
- break;
- case BUF_BLOCK_ZIP_DIRTY:
- frame = bpage->zip.data;
- if (UNIV_LIKELY(srv_use_checksums)) {
- ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
- == page_zip_calc_checksum(frame, zip_size));
- }
- mach_write_to_8(frame + FIL_PAGE_LSN,
- bpage->newest_modification);
- memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
- break;
- case BUF_BLOCK_FILE_PAGE:
- frame = bpage->zip.data;
- if (!frame) {
- frame = ((buf_block_t*) bpage)->frame;
- }
-
- buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
- bpage->zip.data
- ? &bpage->zip : NULL,
- bpage->newest_modification);
- break;
- }
-
- if (!srv_use_doublewrite_buf || !trx_doublewrite) {
- fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
- FALSE, buf_page_get_space(bpage), zip_size,
- buf_page_get_page_no(bpage), 0,
- zip_size ? zip_size : UNIV_PAGE_SIZE,
- frame, bpage);
- } else {
- buf_flush_post_to_doublewrite_buf(bpage);
- }
-}
-
-# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-/********************************************************************//**
-Writes a flushable page asynchronously from the buffer pool to a file.
-NOTE: block->mutex must be held upon entering this function, and it will be
-released by this function after flushing.
-This is loosely based on buf_flush_batch() and buf_flush_page().
-@return TRUE if the page was flushed and the mutex released */
-UNIV_INTERN
-ibool
-buf_flush_page_try(
-/*===============*/
- buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
- buf_block_t* block) /*!< in/out: buffer control block */
-{
- //ut_ad(buf_pool_mutex_own(buf_pool));
- ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
- ut_ad(mutex_own(&block->mutex));
-
- if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_LRU)) {
- return(FALSE);
- }
-
- buf_pool_mutex_enter(buf_pool);
-
- if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
- || buf_pool->init_flush[BUF_FLUSH_LRU]) {
- buf_pool_mutex_exit(buf_pool);
- /* There is already a flush batch of the same type running */
- return(FALSE);
- }
-
- buf_pool->init_flush[BUF_FLUSH_LRU] = TRUE;
-
- buf_page_set_io_fix(&block->page, BUF_IO_WRITE);
-
- buf_page_set_flush_type(&block->page, BUF_FLUSH_LRU);
-
- if (buf_pool->n_flush[BUF_FLUSH_LRU]++ == 0) {
-
- os_event_reset(buf_pool->no_flush[BUF_FLUSH_LRU]);
- }
-
- /* VERY IMPORTANT:
- Because any thread may call the LRU flush, even when owning
- locks on pages, to avoid deadlocks, we must make sure that the
- s-lock is acquired on the page without waiting: this is
- accomplished because buf_flush_ready_for_flush() must hold,
- and that requires the page not to be bufferfixed. */
-
- rw_lock_s_lock_gen(&block->lock, BUF_IO_WRITE);
-
- /* Note that the s-latch is acquired before releasing the
- buf_pool mutex: this ensures that the latch is acquired
- immediately. */
-
- mutex_exit(&block->mutex);
- buf_pool_mutex_exit(buf_pool);
-
- /* Even though block is not protected by any mutex at this
- point, it is safe to access block, because it is io_fixed and
- oldest_modification != 0. Thus, it cannot be relocated in the
- buffer pool or removed from flush_list or LRU_list. */
-
- buf_flush_write_block_low(&block->page);
-
- buf_pool_mutex_enter(buf_pool);
- buf_pool->init_flush[BUF_FLUSH_LRU] = FALSE;
-
- if (buf_pool->n_flush[BUF_FLUSH_LRU] == 0) {
- /* The running flush batch has ended */
- os_event_set(buf_pool->no_flush[BUF_FLUSH_LRU]);
- }
-
- buf_pool_mutex_exit(buf_pool);
- buf_flush_buffered_writes();
-
- return(TRUE);
-}
-# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
-
-/********************************************************************//**
-Writes a flushable page asynchronously from the buffer pool to a file.
-NOTE: in simulated aio we must call
-os_aio_simulated_wake_handler_threads after we have posted a batch of
-writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
-held upon entering this function, and they will be released by this
-function. */
-static
-void
-buf_flush_page(
-/*===========*/
- buf_pool_t* buf_pool, /*!< in: buffer pool instance */
- buf_page_t* bpage, /*!< in: buffer control block */
- enum buf_flush flush_type) /*!< in: BUF_FLUSH_LRU
- or BUF_FLUSH_LIST */
-{
- mutex_t* block_mutex;
- ibool is_uncompressed;
-
- ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
- //ut_ad(buf_pool_mutex_own(buf_pool));
-#ifdef UNIV_SYNC_DEBUG
- ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_SHARED));
-#endif
- ut_ad(buf_page_in_file(bpage));
-
- block_mutex = buf_page_get_mutex(bpage);
- ut_ad(mutex_own(block_mutex));
-
- buf_pool_mutex_enter(buf_pool);
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
- ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
-
- buf_page_set_io_fix(bpage, BUF_IO_WRITE);
-
- buf_page_set_flush_type(bpage, flush_type);
-
- if (buf_pool->n_flush[flush_type] == 0) {
-
- os_event_reset(buf_pool->no_flush[flush_type]);
- }
-
- buf_pool->n_flush[flush_type]++;
-
- is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
- ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
-
- switch (flush_type) {
- ibool is_s_latched;
- case BUF_FLUSH_LIST:
- /* If the simulated aio thread is not running, we must
- not wait for any latch, as we may end up in a deadlock:
- if buf_fix_count == 0, then we know we need not wait */
-
- is_s_latched = (bpage->buf_fix_count == 0);
- if (is_s_latched && is_uncompressed) {
- rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
- BUF_IO_WRITE);
- }
-
- mutex_exit(block_mutex);
- buf_pool_mutex_exit(buf_pool);
-
- /* Even though bpage is not protected by any mutex at
- this point, it is safe to access bpage, because it is
- io_fixed and oldest_modification != 0. Thus, it
- cannot be relocated in the buffer pool or removed from
- flush_list or LRU_list. */
-
- if (!is_s_latched) {
- buf_flush_buffered_writes();
-
- if (is_uncompressed) {
- rw_lock_s_lock_gen(&((buf_block_t*) bpage)
- ->lock, BUF_IO_WRITE);
- }
- }
-
- break;
-
- case BUF_FLUSH_LRU:
- /* VERY IMPORTANT:
- Because any thread may call the LRU flush, even when owning
- locks on pages, to avoid deadlocks, we must make sure that the
- s-lock is acquired on the page without waiting: this is
- accomplished because buf_flush_ready_for_flush() must hold,
- and that requires the page not to be bufferfixed. */
-
- if (is_uncompressed) {
- rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
- BUF_IO_WRITE);
- }
-
- /* Note that the s-latch is acquired before releasing the
- buf_pool mutex: this ensures that the latch is acquired
- immediately. */
-
- mutex_exit(block_mutex);
- buf_pool_mutex_exit(buf_pool);
- break;
-
- default:
- ut_error;
- }
-
- /* Even though bpage is not protected by any mutex at this
- point, it is safe to access bpage, because it is io_fixed and
- oldest_modification != 0. Thus, it cannot be relocated in the
- buffer pool or removed from flush_list or LRU_list. */
-
-#ifdef UNIV_DEBUG
- if (buf_debug_prints) {
- fprintf(stderr,
- "Flushing %u space %u page %u\n",
- flush_type, bpage->space, bpage->offset);
- }
-#endif /* UNIV_DEBUG */
- buf_flush_write_block_low(bpage);
-}
-
-/***********************************************************//**
-Flushes to disk all flushable pages within the flush area.
-@return number of pages flushed */
-static
-ulint
-buf_flush_try_neighbors(
-/*====================*/
- ulint space, /*!< in: space id */
- ulint offset, /*!< in: page offset */
- enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU or
- BUF_FLUSH_LIST */
- ulint n_flushed, /*!< in: number of pages
- flushed so far in this batch */
- ulint n_to_flush) /*!< in: maximum number of pages
- we are allowed to flush */
-{
- ulint i;
- ulint low;
- ulint high;
- ulint count = 0;
- buf_pool_t* buf_pool = buf_pool_get(space, offset);
- ibool is_forward_scan;
-
- ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
-
- if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN || !srv_flush_neighbor_pages) {
- /* If there is little space, it is better not to flush
- any block except from the end of the LRU list */
-
- low = offset;
- high = offset + 1;
- } else {
- /* When flushed, dirty blocks are searched in
- neighborhoods of this size, and flushed along with the
- original page. */
-
- ulint buf_flush_area;
-
- buf_flush_area = ut_min(
- BUF_READ_AHEAD_AREA(buf_pool),
- buf_pool->curr_size / 16);
-
- low = (offset / buf_flush_area) * buf_flush_area;
- high = (offset / buf_flush_area + 1) * buf_flush_area;
- }
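-
-	/* Editor's worked example, not part of the original source:
-	if buf_flush_area works out to 64 pages, then for offset = 150
-	low = (150 / 64) * 64 = 128 and high = (150 / 64 + 1) * 64 = 192,
-	so pages 128..191 of the space are the candidate neighbors. */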
-
- /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
-
- if (high > fil_space_get_size(space)) {
- high = fil_space_get_size(space);
- }
-
- if (srv_flush_neighbor_pages == 2) {
-
- /* In the case of contiguous flush where the requested page
- does not fall at the start of flush area, first scan backward
- from the page and later forward from it. */
- is_forward_scan = (offset == low);
- }
- else {
- is_forward_scan = TRUE;
- }
-
-scan:
- if (srv_flush_neighbor_pages == 2) {
- if (is_forward_scan) {
- i = offset;
- }
- else {
- i = offset - 1;
- }
- }
- else {
- i = low;
- }
-
- for (; is_forward_scan ? (i < high) : (i >= low);
- is_forward_scan ? i++ : i--) {
-
- buf_page_t* bpage;
-
- if ((count + n_flushed) >= n_to_flush) {
-
- /* We have already flushed enough pages and
- should call it a day. There is, however, one
- exception. If the page whose neighbors we
- are flushing has not been flushed yet then
- we'll try to flush the victim that we
- selected originally. */
- if (i <= offset) {
- i = offset;
- } else {
- break;
- }
- }
-
- buf_pool = buf_pool_get(space, i);
-
- //buf_pool_mutex_enter(buf_pool);
- rw_lock_s_lock(&buf_pool->page_hash_latch);
-
- /* We only want to flush pages from this buffer pool. */
- bpage = buf_page_hash_get(buf_pool, space, i);
-
- if (!bpage) {
-
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
- if (srv_flush_neighbor_pages == 2) {
-
- /* This is contiguous neighbor page flush and
- the pages here are not contiguous. */
- break;
- }
- continue;
- }
-
- ut_a(buf_page_in_file(bpage));
-
- /* We avoid flushing 'non-old' blocks in an LRU flush,
- because the flushed blocks are soon freed */
-
- if (flush_type != BUF_FLUSH_LRU
- || i == offset
- || buf_page_is_old(bpage)) {
- mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
-
- if (block_mutex && buf_flush_ready_for_flush(bpage, flush_type)
- && (i == offset || !bpage->buf_fix_count)) {
- /* We only try to flush those
- neighbors != offset where the buf fix
- count is zero, as we then know that we
- probably can latch the page without a
- semaphore wait. Semaphore waits are
- expensive because we must flush the
- doublewrite buffer before we start
- waiting. */
-
- buf_flush_page(buf_pool, bpage, flush_type);
- ut_ad(!mutex_own(block_mutex));
- ut_ad(!buf_pool_mutex_own(buf_pool));
- count++;
- continue;
- } else if (block_mutex) {
- mutex_exit(block_mutex);
- }
- }
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
- if (srv_flush_neighbor_pages == 2) {
-
- /* We are trying to do the contiguous neighbor page
- flush, but the last page we checked was unflushable,
- making a "hole" in the flush, so stop this attempt. */
- break;
- }
- }
-
- if (!is_forward_scan) {
-
- /* Backward scan done, now do the forward scan */
- ut_a (srv_flush_neighbor_pages == 2);
- is_forward_scan = TRUE;
- goto scan;
- }
-
- return(count);
-}
-
-/********************************************************************//**
-Check if the block is modified and ready for flushing. If the block
-is ready to flush then flush the page and try to flush its neighbors.
-
-@return TRUE if the LRU list mutex was not released during this function.
-This does not guarantee that any pages were written.
-The number of pages written is added to the count. */
-static
-ibool
-buf_flush_page_and_try_neighbors(
-/*=============================*/
- buf_page_t* bpage, /*!< in: buffer control block */
- enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU
- or BUF_FLUSH_LIST */
- ulint n_to_flush, /*!< in: number of pages to
- flush */
- ulint* count) /*!< in/out: number of pages
- flushed */
-{
- mutex_t* block_mutex = NULL;
- ibool flushed = FALSE;
-#ifdef UNIV_DEBUG
- buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
-#endif /* UNIV_DEBUG */
-
- ut_ad((flush_type == BUF_FLUSH_LRU
- && mutex_own(&buf_pool->LRU_list_mutex))
- || (flush_type == BUF_FLUSH_LIST
- && buf_flush_list_mutex_own(buf_pool)));
-
- if (flush_type == BUF_FLUSH_LRU) {
- block_mutex = buf_page_get_mutex_enter(bpage);
- ut_ad(block_mutex);
- }
-
- ut_a(buf_page_in_file(bpage));
-
- if (buf_flush_ready_for_flush(bpage, flush_type)) {
- ulint space;
- ulint offset;
- buf_pool_t* buf_pool;
-
- buf_pool = buf_pool_from_bpage(bpage);
-
- //buf_pool_mutex_exit(buf_pool);
- if (flush_type == BUF_FLUSH_LRU) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- }
-
- /* These fields are protected by both the
- buffer pool mutex and block mutex. */
- space = buf_page_get_space(bpage);
- offset = buf_page_get_page_no(bpage);
-
- if (flush_type == BUF_FLUSH_LRU) {
- mutex_exit(block_mutex);
- } else {
- buf_flush_list_mutex_exit(buf_pool);
- }
-
- /* Try to flush also all the neighbors */
- *count += buf_flush_try_neighbors(space,
- offset,
- flush_type,
- *count,
- n_to_flush);
-
- if (flush_type == BUF_FLUSH_LRU) {
- mutex_enter(&buf_pool->LRU_list_mutex);
- } else {
- buf_flush_list_mutex_enter(buf_pool);
- }
- flushed = TRUE;
- } else if (block_mutex) {
- mutex_exit(block_mutex);
- }
-
- ut_ad((flush_type == BUF_FLUSH_LRU
- && mutex_own(&buf_pool->LRU_list_mutex))
- || buf_flush_list_mutex_own(buf_pool));
-
- return(flushed);
-}
-
-/*******************************************************************//**
-This utility flushes dirty blocks from the end of the LRU list.
-In the case of an LRU flush the calling thread may own latches to
-pages: to avoid deadlocks, this function must be written so that it
-cannot end up waiting for these latches!
-@return number of blocks for which the write request was queued. */
-static
-ulint
-buf_flush_LRU_list_batch(
-/*=====================*/
- buf_pool_t* buf_pool, /*!< in: buffer pool instance */
- ulint max) /*!< in: max of blocks to flush */
-{
- buf_page_t* bpage;
- ulint count = 0;
-
- //ut_ad(buf_pool_mutex_own(buf_pool));
- ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
-
- do {
- /* Start from the end of the list looking for a
- suitable block to be flushed. */
- bpage = UT_LIST_GET_LAST(buf_pool->LRU);
-
- /* Iterate backwards over the flush list till we find
- a page that isn't ready for flushing. */
- while (bpage != NULL
- && !buf_flush_page_and_try_neighbors(
- bpage, BUF_FLUSH_LRU, max, &count)) {
-
- bpage = UT_LIST_GET_PREV(LRU, bpage);
- }
- } while (bpage != NULL && count < max);
-
- /* We keep track of all flushes happening as part of LRU
- flush. When estimating the desired rate at which flush_list
- should be flushed, we factor in this value. */
- buf_lru_flush_page_count += count;
-
- //ut_ad(buf_pool_mutex_own(buf_pool));
- ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
-
- return(count);
-}
-
-/*******************************************************************//**
-This utility flushes dirty blocks from the end of the flush_list.
-The calling thread is not allowed to own any latches on pages!
-@return number of blocks for which the write request was queued;
-ULINT_UNDEFINED if there was a flush of the same type already
-running */
-static
-ulint
-buf_flush_flush_list_batch(
-/*=======================*/
- buf_pool_t* buf_pool, /*!< in: buffer pool instance */
-	ulint min_n, /*!< in: wished minimum number
- of blocks flushed (it is not
- guaranteed that the actual
- number is that big, though) */
- ib_uint64_t lsn_limit) /*!< all blocks whose
- oldest_modification is smaller
- than this should be flushed (if
- their number does not exceed
- min_n) */
-{
- ulint len;
- buf_page_t* bpage;
- ulint count = 0;
-
- //ut_ad(buf_pool_mutex_own(buf_pool));
-
- /* If we have flushed enough, leave the loop */
- do {
- /* Start from the end of the list looking for a suitable
- block to be flushed. */
-
- buf_flush_list_mutex_enter(buf_pool);
-
- /* We use len here because theoretically insertions can
- happen in the flush_list below while we are traversing
- it for a suitable candidate for flushing. We'd like to
-	set a limit on how far we are willing to traverse
- the list. */
- len = UT_LIST_GET_LEN(buf_pool->flush_list);
- bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
-
- if (bpage) {
- ut_a(bpage->oldest_modification > 0);
- }
-
- if (!bpage || bpage->oldest_modification >= lsn_limit) {
-
- /* We have flushed enough */
- buf_flush_list_mutex_exit(buf_pool);
- break;
- }
-
- ut_a(bpage->oldest_modification > 0);
-
- ut_ad(bpage->in_flush_list);
-
- /* The list may change during the flushing and we cannot
- safely preserve within this function a pointer to a
- block in the list! */
- while (bpage != NULL
- && len > 0
- && !buf_flush_page_and_try_neighbors(
- bpage, BUF_FLUSH_LIST, min_n, &count)) {
-
- /* If we are here that means that buf_pool->mutex
- was not released in buf_flush_page_and_try_neighbors()
- above and this guarantees that bpage didn't get
- relocated since we released the flush_list
- mutex above. There is a chance, however, that
- the bpage got removed from flush_list (not
- currently possible because flush_list_remove()
- also obtains buf_pool mutex but that may change
- in future). To avoid this scenario we check
- the oldest_modification and if it is zero
- we start all over again. */
- if (bpage->oldest_modification == 0) {
- buf_flush_list_mutex_exit(buf_pool);
- break;
- }
-
- bpage = UT_LIST_GET_PREV(flush_list, bpage);
-
- ut_ad(!bpage || bpage->in_flush_list);
-
- --len;
- }
-
- buf_flush_list_mutex_exit(buf_pool);
-
- } while (count < min_n && bpage != NULL && len > 0);
-
- //ut_ad(buf_pool_mutex_own(buf_pool));
-
- return(count);
-}
-
-/*******************************************************************//**
-This utility flushes dirty blocks from the end of the LRU list or flush_list.
-NOTE 1: in the case of an LRU flush the calling thread may own latches to
-pages: to avoid deadlocks, this function must be written so that it cannot
-end up waiting for these latches! NOTE 2: in the case of a flush list flush,
-the calling thread is not allowed to own any latches on pages!
-@return number of blocks for which the write request was queued;
-ULINT_UNDEFINED if there was a flush of the same type already running */
-static
-ulint
-buf_flush_batch(
-/*============*/
- buf_pool_t* buf_pool, /*!< in: buffer pool instance */
- enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU or
- BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
- then the caller must not own any
- latches on pages */
-	ulint min_n, /*!< in: wished minimum number of blocks
- flushed (it is not guaranteed that the
- actual number is that big, though) */
- ib_uint64_t lsn_limit) /*!< in: in the case of BUF_FLUSH_LIST
- all blocks whose oldest_modification is
- smaller than this should be flushed
- (if their number does not exceed
- min_n), otherwise ignored */
-{
- ulint count = 0;
-
- ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
-#ifdef UNIV_SYNC_DEBUG
- ut_ad((flush_type != BUF_FLUSH_LIST)
- || sync_thread_levels_empty_except_dict());
-#endif /* UNIV_SYNC_DEBUG */
-
- //buf_pool_mutex_enter(buf_pool);
-
- /* Note: The buffer pool mutex is released and reacquired within
- the flush functions. */
- switch(flush_type) {
- case BUF_FLUSH_LRU:
- mutex_enter(&buf_pool->LRU_list_mutex);
- count = buf_flush_LRU_list_batch(buf_pool, min_n);
- mutex_exit(&buf_pool->LRU_list_mutex);
- break;
- case BUF_FLUSH_LIST:
- count = buf_flush_flush_list_batch(buf_pool, min_n, lsn_limit);
- break;
- default:
- ut_error;
- }
-
- //buf_pool_mutex_exit(buf_pool);
-
- buf_flush_buffered_writes();
-
-#ifdef UNIV_DEBUG
- if (buf_debug_prints && count > 0) {
- fprintf(stderr, flush_type == BUF_FLUSH_LRU
- ? "Flushed %lu pages in LRU flush\n"
- : "Flushed %lu pages in flush list flush\n",
- (ulong) count);
- }
-#endif /* UNIV_DEBUG */
-
- return(count);
-}
-
-/******************************************************************//**
-Gather the aggregated stats for both flush list and LRU list flushing */
-static
-void
-buf_flush_common(
-/*=============*/
- enum buf_flush flush_type, /*!< in: type of flush */
- ulint page_count) /*!< in: number of pages flushed */
-{
- buf_flush_buffered_writes();
-
- ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
-
-#ifdef UNIV_DEBUG
- if (buf_debug_prints && page_count > 0) {
- fprintf(stderr, flush_type == BUF_FLUSH_LRU
- ? "Flushed %lu pages in LRU flush\n"
- : "Flushed %lu pages in flush list flush\n",
- (ulong) page_count);
- }
-#endif /* UNIV_DEBUG */
-
- srv_buf_pool_flushed += page_count;
-}
-
-/******************************************************************//**
-Start a buffer flush batch for LRU or flush list */
-static
-ibool
-buf_flush_start(
-/*============*/
- buf_pool_t* buf_pool, /*!< buffer pool instance */
- enum buf_flush flush_type) /*!< in: BUF_FLUSH_LRU
- or BUF_FLUSH_LIST */
-{
- buf_pool_mutex_enter(buf_pool);
-
- if (buf_pool->n_flush[flush_type] > 0
- || buf_pool->init_flush[flush_type] == TRUE) {
-
- /* There is already a flush batch of the same type running */
-
- buf_pool_mutex_exit(buf_pool);
-
- return(FALSE);
- }
-
- buf_pool->init_flush[flush_type] = TRUE;
-
- buf_pool_mutex_exit(buf_pool);
-
- return(TRUE);
-}
-
-/******************************************************************//**
-End a buffer flush batch for LRU or flush list */
-static
-void
-buf_flush_end(
-/*==========*/
- buf_pool_t* buf_pool, /*!< buffer pool instance */
- enum buf_flush flush_type) /*!< in: BUF_FLUSH_LRU
- or BUF_FLUSH_LIST */
-{
- buf_pool_mutex_enter(buf_pool);
-
- buf_pool->init_flush[flush_type] = FALSE;
-
- if (buf_pool->n_flush[flush_type] == 0) {
-
- /* The running flush batch has ended */
-
- os_event_set(buf_pool->no_flush[flush_type]);
- }
-
- buf_pool_mutex_exit(buf_pool);
-}
-
-/******************************************************************//**
-Waits until a flush batch of the given type ends */
-UNIV_INTERN
-void
-buf_flush_wait_batch_end(
-/*=====================*/
- buf_pool_t* buf_pool, /*!< buffer pool instance */
- enum buf_flush type) /*!< in: BUF_FLUSH_LRU
- or BUF_FLUSH_LIST */
-{
- ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
-
- if (buf_pool == NULL) {
- ulint i;
-
- for (i = 0; i < srv_buf_pool_instances; ++i) {
- buf_pool_t* buf_pool;
-
- buf_pool = buf_pool_from_array(i);
-
- thd_wait_begin(NULL, THD_WAIT_DISKIO);
- os_event_wait(buf_pool->no_flush[type]);
- thd_wait_end(NULL);
- }
- } else {
- thd_wait_begin(NULL, THD_WAIT_DISKIO);
- os_event_wait(buf_pool->no_flush[type]);
- thd_wait_end(NULL);
- }
-}
-
-/*******************************************************************//**
-This utility flushes dirty blocks from the end of the LRU list.
-NOTE: The calling thread may own latches to pages: to avoid deadlocks,
-this function must be written so that it cannot end up waiting for these
-latches!
-@return number of blocks for which the write request was queued;
-ULINT_UNDEFINED if there was a flush of the same type already running */
-UNIV_INTERN
-ulint
-buf_flush_LRU(
-/*==========*/
- buf_pool_t* buf_pool, /*!< in: buffer pool instance */
-	ulint min_n) /*!< in: wished minimum number of blocks
- flushed (it is not guaranteed that the
- actual number is that big, though) */
-{
- ulint page_count;
-
- if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
- return(ULINT_UNDEFINED);
- }
-
- page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0);
-
- buf_flush_end(buf_pool, BUF_FLUSH_LRU);
-
- buf_flush_common(BUF_FLUSH_LRU, page_count);
-
- return(page_count);
-}
-
-/*******************************************************************//**
-This utility flushes dirty blocks from the end of the flush list of
-all buffer pool instances.
-NOTE: The calling thread is not allowed to own any latches on pages!
-@return number of blocks for which the write request was queued;
-ULINT_UNDEFINED if there was a flush of the same type already running */
-UNIV_INTERN
-ulint
-buf_flush_list(
-/*===========*/
-	ulint min_n, /*!< in: wished minimum number of blocks
- flushed (it is not guaranteed that the
- actual number is that big, though) */
- ib_uint64_t lsn_limit) /*!< in the case BUF_FLUSH_LIST all
- blocks whose oldest_modification is
- smaller than this should be flushed
- (if their number does not exceed
- min_n), otherwise ignored */
-{
- ulint i;
- ulint total_page_count = 0;
- ibool skipped = FALSE;
-
- if (min_n != ULINT_MAX) {
- /* Ensure that flushing is spread evenly amongst the
- buffer pool instances. When min_n is ULINT_MAX
- we need to flush everything up to the lsn limit
- so no limit here. */
- min_n = (min_n + srv_buf_pool_instances - 1)
- / srv_buf_pool_instances;
- }
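-
-	/* Editor's worked example, not part of the original source:
-	with min_n = 100 and srv_buf_pool_instances = 4 the rounding-up
-	division above yields (100 + 3) / 4 = 25 pages per instance, so
-	at least min_n pages are requested in total while the work is
-	spread evenly across the instances. */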
-
- /* Flush to lsn_limit in all buffer pool instances */
- for (i = 0; i < srv_buf_pool_instances; i++) {
- buf_pool_t* buf_pool;
- ulint page_count = 0;
-
- buf_pool = buf_pool_from_array(i);
-
- if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) {
- /* We have two choices here. If lsn_limit was
- specified then skipping an instance of buffer
- pool means we cannot guarantee that all pages
-	up to lsn_limit have been flushed. We can
- return right now with failure or we can try
- to flush remaining buffer pools up to the
- lsn_limit. We attempt to flush other buffer
- pools based on the assumption that it will
- help in the retry which will follow the
- failure. */
- skipped = TRUE;
-
- continue;
- }
-
- page_count = buf_flush_batch(
- buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit);
-
- buf_flush_end(buf_pool, BUF_FLUSH_LIST);
-
- buf_flush_common(BUF_FLUSH_LIST, page_count);
-
- total_page_count += page_count;
- }
-
- return(lsn_limit != IB_ULONGLONG_MAX && skipped
- ? ULINT_UNDEFINED : total_page_count);
-}
-
-/******************************************************************//**
-Gives a recommendation of how many blocks should be flushed to establish
-a big enough margin of replaceable blocks near the end of the LRU list
-and in the free list.
-@return number of blocks which should be flushed from the end of the
-LRU list */
-static
-ulint
-buf_flush_LRU_recommendation(
-/*=========================*/
- buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
-{
- buf_page_t* bpage;
- ulint n_replaceable;
- ulint distance = 0;
- ibool have_LRU_mutex = FALSE;
-
- if(UT_LIST_GET_LEN(buf_pool->unzip_LRU))
- have_LRU_mutex = TRUE;
-retry:
- //buf_pool_mutex_enter(buf_pool);
- if (have_LRU_mutex)
- mutex_enter(&buf_pool->LRU_list_mutex);
-
- n_replaceable = UT_LIST_GET_LEN(buf_pool->free);
-
- bpage = UT_LIST_GET_LAST(buf_pool->LRU);
-
- while ((bpage != NULL)
- && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
- + BUF_FLUSH_EXTRA_MARGIN(buf_pool))
- && (distance < BUF_LRU_FREE_SEARCH_LEN(buf_pool))) {
-
- mutex_t* block_mutex;
- if (!bpage->in_LRU_list) {
-	/* restart; but this is very optimistic */
- bpage = UT_LIST_GET_LAST(buf_pool->LRU);
- continue;
- }
- block_mutex = buf_page_get_mutex_enter(bpage);
-
- if (block_mutex && buf_flush_ready_for_replace(bpage)) {
- n_replaceable++;
- }
-
- if (block_mutex) {
- mutex_exit(block_mutex);
- }
-
- distance++;
-
- bpage = UT_LIST_GET_PREV(LRU, bpage);
- }
-
- //buf_pool_mutex_exit(buf_pool);
- if (have_LRU_mutex)
- mutex_exit(&buf_pool->LRU_list_mutex);
-
- if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)) {
-
- return(0);
- } else if (!have_LRU_mutex) {
-	/* confirm it again while holding the LRU mutex, for exactness */
- have_LRU_mutex = TRUE;
- distance = 0;
- goto retry;
- }
-
- return(BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
- + BUF_FLUSH_EXTRA_MARGIN(buf_pool)
- - n_replaceable);
-}
-
-/*********************************************************************//**
-Flushes pages from the end of the LRU list if there is too small a margin
-of replaceable pages there or in the free list. VERY IMPORTANT: this function
-is called also by threads which have locks on pages. To avoid deadlocks, we
-flush only pages such that the s-lock required for flushing can be acquired
-immediately, without waiting. */
-UNIV_INTERN
-void
-buf_flush_free_margin(
-/*==================*/
- buf_pool_t* buf_pool, /*!< in: Buffer pool instance */
- ibool wait)
-{
- ulint n_to_flush;
-
- n_to_flush = buf_flush_LRU_recommendation(buf_pool);
-
- if (n_to_flush > 0) {
- ulint n_flushed;
-
- n_flushed = buf_flush_LRU(buf_pool, n_to_flush);
-
- if (wait && n_flushed == ULINT_UNDEFINED) {
- /* There was an LRU type flush batch already running;
- let us wait for it to end */
-
- buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
- }
- }
-}
-
-/*********************************************************************//**
-Flushes pages from the end of all the LRU lists. */
-UNIV_INTERN
-void
-buf_flush_free_margins(
-/*========================*/
- ibool wait)
-{
- ulint i;
-
- for (i = 0; i < srv_buf_pool_instances; i++) {
- buf_pool_t* buf_pool;
-
- buf_pool = buf_pool_from_array(i);
-
- buf_flush_free_margin(buf_pool, wait);
- }
-}
-
-/*********************************************************************
-Update the historical stats that we are collecting for flush rate
-heuristics at the end of each interval.
-Flush rate heuristic depends on (a) rate of redo log generation and
-(b) the rate at which LRU flush is happening. */
-UNIV_INTERN
-void
-buf_flush_stat_update(void)
-/*=======================*/
-{
- buf_flush_stat_t* item;
- ib_uint64_t lsn_diff;
- ib_uint64_t lsn;
- ulint n_flushed;
-
- lsn = log_get_lsn();
- if (buf_flush_stat_cur.redo == 0) {
- /* First time around. Just update the current LSN
- and return. */
- buf_flush_stat_cur.redo = lsn;
- return;
- }
-
- item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];
-
- /* values for this interval */
- lsn_diff = lsn - buf_flush_stat_cur.redo;
- n_flushed = buf_lru_flush_page_count
- - buf_flush_stat_cur.n_flushed;
-
- /* add the current value and subtract the obsolete entry. */
- buf_flush_stat_sum.redo += lsn_diff - item->redo;
- buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;
-
- /* put current entry in the array. */
- item->redo = lsn_diff;
- item->n_flushed = n_flushed;
-
- /* update the index */
- buf_flush_stat_arr_ind++;
- buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;
-
- /* reset the current entry. */
- buf_flush_stat_cur.redo = lsn;
- buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
-}
-
-/*********************************************************************
-Determines the fraction of dirty pages that need to be flushed based
-on the speed at which we generate redo log. Note that if redo log
-is generated at a significant rate without corresponding increase
-in the number of dirty pages (for example, an in-memory workload)
-it can cause IO bursts of flushing. This function implements heuristics
-to avoid this burstiness.
-@return number of dirty pages to be flushed / second */
-UNIV_INTERN
-ulint
-buf_flush_get_desired_flush_rate(void)
-/*==================================*/
-{
- ulint i;
- lint rate;
- ulint redo_avg;
- ulint n_dirty = 0;
- ulint n_flush_req;
- ulint lru_flush_avg;
- ib_uint64_t lsn = log_get_lsn();
- ulint log_capacity = log_get_capacity();
-
- /* log_capacity should never be zero after the initialization
- of log subsystem. */
- ut_ad(log_capacity != 0);
-
- /* Get total number of dirty pages. It is OK to access
- flush_list without holding any mutex as we are using this
- only for heuristics. */
- for (i = 0; i < srv_buf_pool_instances; i++) {
- buf_pool_t* buf_pool;
-
- buf_pool = buf_pool_from_array(i);
- n_dirty += UT_LIST_GET_LEN(buf_pool->flush_list);
- }
-
- /* An overflow can happen if we generate more than 2^32 bytes
- of redo in this interval i.e.: 4G of redo in 1 second. We can
- safely consider this as infinity because if we ever come close
- to 4G we'll start a synchronous flush of dirty pages. */
- /* redo_avg below is average at which redo is generated in
- past BUF_FLUSH_STAT_N_INTERVAL + redo generated in the current
- interval. */
- redo_avg = (ulint) (buf_flush_stat_sum.redo
- / BUF_FLUSH_STAT_N_INTERVAL
- + (lsn - buf_flush_stat_cur.redo));
-
- /* An overflow can happen possibly if we flush more than 2^32
- pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very very
- unlikely scenario. Even when this happens it means that our
- flush rate will be off the mark. It won't affect correctness
- of any subsystem. */
- /* lru_flush_avg below is rate at which pages are flushed as
- part of LRU flush in past BUF_FLUSH_STAT_N_INTERVAL + the
- number of pages flushed in the current interval. */
- lru_flush_avg = buf_flush_stat_sum.n_flushed
- / BUF_FLUSH_STAT_N_INTERVAL
- + (buf_lru_flush_page_count
- - buf_flush_stat_cur.n_flushed);
-
- n_flush_req = (n_dirty * redo_avg) / log_capacity;
-
- /* The number of pages that we want to flush from the flush
- list is the difference between the required rate and the
- number of pages that we are historically flushing from the
- LRU list */
- rate = n_flush_req - lru_flush_avg;
- return(rate > 0 ? (ulint) rate : 0);
-}
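-
-/* Editor's worked example, not part of the original source, using purely
-invented numbers: if n_dirty = 10000 pages, redo_avg = 1 MB/s and
-log_capacity = 100 MB, then n_flush_req = (10000 * 1 MB) / 100 MB = 100
-pages/s; if the LRU flush already averages lru_flush_avg = 30 pages/s,
-the function recommends flushing 100 - 30 = 70 flush_list pages per
-second. */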
-
-#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-/******************************************************************//**
-Validates the flush list.
-@return TRUE if ok */
-static
-ibool
-buf_flush_validate_low(
-/*===================*/
- buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
-{
- buf_page_t* bpage;
- const ib_rbt_node_t* rnode = NULL;
-
- ut_ad(buf_flush_list_mutex_own(buf_pool));
-
- UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list,
- ut_ad(ut_list_node_313->in_flush_list));
-
- bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
-
- /* If we are in recovery mode i.e.: flush_rbt != NULL
- then each block in the flush_list must also be present
- in the flush_rbt. */
- if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
- rnode = rbt_first(buf_pool->flush_rbt);
- }
-
- while (bpage != NULL) {
- const ib_uint64_t om = bpage->oldest_modification;
-
- ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
-
- ut_ad(bpage->in_flush_list);
-
- /* A page in buf_pool->flush_list can be in
- BUF_BLOCK_REMOVE_HASH state. This happens when a page
- is in the middle of being relocated. In that case the
- original descriptor can have this state and still be
- in the flush list waiting to acquire the
- buf_pool->flush_list_mutex to complete the relocation. */
- ut_a(buf_page_in_file(bpage)
- || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
- ut_a(om > 0);
-
- if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
- buf_page_t** prpage;
-
- ut_a(rnode);
- prpage = rbt_value(buf_page_t*, rnode);
-
- ut_a(*prpage);
- ut_a(*prpage == bpage);
- rnode = rbt_next(buf_pool->flush_rbt, rnode);
- }
-
- bpage = UT_LIST_GET_NEXT(flush_list, bpage);
-
- ut_a(!bpage || om >= bpage->oldest_modification);
- }
-
- /* By this time we must have exhausted the traversal of
- flush_rbt (if active) as well. */
- ut_a(rnode == NULL);
-
- return(TRUE);
-}
-
-/******************************************************************//**
-Validates the flush list.
-@return TRUE if ok */
-UNIV_INTERN
-ibool
-buf_flush_validate(
-/*===============*/
- buf_pool_t* buf_pool) /*!< buffer pool instance */
-{
- ibool ret;
-
- buf_flush_list_mutex_enter(buf_pool);
-
- ret = buf_flush_validate_low(buf_pool);
-
- buf_flush_list_mutex_exit(buf_pool);
-
- return(ret);
-}
-#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc
new file mode 100644
index 00000000000..abcee504d2e
--- /dev/null
+++ b/storage/xtradb/buf/buf0flu.cc
@@ -0,0 +1,2938 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0flu.cc
+The database buffer buf_pool flush algorithm
+
+Created 11/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0flu.h"
+
+#ifdef UNIV_NONINL
+#include "buf0flu.ic"
+#endif
+
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#ifndef UNIV_HOTBACKUP
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "page0page.h"
+#include "fil0fil.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+#include "os0file.h"
+#include "trx0sys.h"
+#include "srv0mon.h"
+#include "mysql/plugin.h"
+#include "mysql/service_thd_wait.h"
+
+/** Number of pages flushed through non flush_list flushes. */
+// static ulint buf_lru_flush_page_count = 0;
+
+/** Flag indicating if the page_cleaner is in active state. This flag
+is set to TRUE by the page_cleaner thread when it is spawned and is set
+back to FALSE at shutdown by the page_cleaner as well. Therefore no
+need to protect it by a mutex. It is only ever read by the thread
+doing the shutdown. */
+UNIV_INTERN ibool buf_page_cleaner_is_active = FALSE;
+
+#ifdef UNIV_PFS_THREAD
+UNIV_INTERN mysql_pfs_key_t buf_page_cleaner_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+/** If LRU list of a buf_pool is less than this size then LRU eviction
+should not happen. This is because when we do LRU flushing we also put
+the blocks on free list. If LRU list is very small then we can end up
+in thrashing. */
+#define BUF_LRU_MIN_LEN 256
+
+/* @} */
+
+/** Handled page counters for a single flush */
+struct flush_counters_t {
+ ulint flushed; /*!< number of dirty pages flushed */
+ ulint evicted; /*!< number of clean pages evicted */
+};
+
+/******************************************************************//**
+Increases the flush_list size in bytes by zip_size for a compressed page,
+or by UNIV_PAGE_SIZE for an uncompressed page. */
+static inline
+void
+incr_flush_list_size_in_bytes(
+/*==========================*/
+ buf_block_t* block, /*!< in: control block */
+ buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+ ulint zip_size = page_zip_get_size(&block->page.zip);
+ buf_pool->stat.flush_list_bytes += zip_size ? zip_size : UNIV_PAGE_SIZE;
+ ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+static
+ibool
+buf_flush_validate_low(
+/*===================*/
+ buf_pool_t* buf_pool); /*!< in: Buffer pool instance */
+
+/******************************************************************//**
+Validates the flush list some of the time.
+@return TRUE if ok or the check was skipped */
+static
+ibool
+buf_flush_validate_skip(
+/*====================*/
+ buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
+{
+/** Try buf_flush_validate_low() every this many times */
+# define BUF_FLUSH_VALIDATE_SKIP 23
+
+ /** The buf_flush_validate_low() call skip counter.
+ Use a signed type because of the race condition below. */
+ static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+
+ /* There is a race condition below, but it does not matter,
+ because this call is only for heuristic purposes. We want to
+ reduce the call frequency of the costly buf_flush_validate_low()
+ check in debug builds. */
+ if (--buf_flush_validate_count > 0) {
+ return(TRUE);
+ }
+
+ buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+ return(buf_flush_validate_low(buf_pool));
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/*******************************************************************//**
+Sets hazard pointer during flush_list iteration. */
+UNIV_INLINE
+void
+buf_flush_set_hp(
+/*=============*/
+ buf_pool_t* buf_pool,/*!< in/out: buffer pool instance */
+ const buf_page_t* bpage) /*!< in: buffer control block */
+{
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+ ut_ad(buf_pool->flush_list_hp == NULL || bpage == NULL);
+ ut_ad(!bpage || buf_page_in_file(bpage)
+ || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
+ ut_ad(!bpage || bpage->in_flush_list);
+ ut_ad(!bpage || buf_pool_from_bpage(bpage) == buf_pool);
+
+ buf_pool->flush_list_hp = bpage;
+}
+
+/*******************************************************************//**
+Checks if the given block is a hazard pointer
+@return true if bpage is hazard pointer */
+UNIV_INLINE
+bool
+buf_flush_is_hp(
+/*============*/
+ buf_pool_t* buf_pool,/*!< in: buffer pool instance */
+ const buf_page_t* bpage) /*!< in: buffer control block */
+{
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+ return(buf_pool->flush_list_hp == bpage);
+}
+
+/*******************************************************************//**
+Whenever we move a block in flush_list (either to remove it or to
+relocate it) we check the hazard pointer set by some other thread
+doing the flush list scan. If the hazard pointer is the same as the
+one we are about to move, then we set it to NULL to force a rescan
+in the thread doing the batch. */
+UNIV_INLINE
+void
+buf_flush_update_hp(
+/*================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_page_t* bpage) /*!< in: buffer control block */
+{
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+ if (buf_flush_is_hp(buf_pool, bpage)) {
+ buf_flush_set_hp(buf_pool, NULL);
+ MONITOR_INC(MONITOR_FLUSH_HP_RESCAN);
+ }
+}
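+
+/* Editor's note: an illustrative sketch, not part of the original source,
+of how a flush_list scan can use the hazard pointer helpers above to
+survive releasing the flush list mutex while it writes out a page:
+
+	buf_flush_list_mutex_enter(buf_pool);
+	for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+	     bpage != NULL; ) {
+		buf_page_t*	prev = UT_LIST_GET_PREV(list, bpage);
+
+		buf_flush_set_hp(buf_pool, prev);
+		buf_flush_list_mutex_exit(buf_pool);
+
+		... flush bpage without holding the flush list mutex ...
+
+		buf_flush_list_mutex_enter(buf_pool);
+		if (!buf_flush_is_hp(buf_pool, prev)) {
+			bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+		} else {
+			buf_flush_set_hp(buf_pool, NULL);
+			bpage = prev;
+		}
+	}
+	buf_flush_list_mutex_exit(buf_pool);
+
+If another thread removes or relocates prev in the meantime,
+buf_flush_update_hp() resets the pointer and the scan restarts from the
+tail of the list.  The real batch code in this file is more involved;
+this only outlines the set/check protocol. */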
+
+/******************************************************************//**
+Insert a block in the flush_rbt and returns a pointer to its
+predecessor or NULL if no predecessor. The ordering is maintained
+on the basis of the <oldest_modification, space, offset> key.
+@return pointer to the predecessor or NULL if no predecessor. */
+static
+buf_page_t*
+buf_flush_insert_in_flush_rbt(
+/*==========================*/
+ buf_page_t* bpage) /*!< in: bpage to be inserted. */
+{
+ const ib_rbt_node_t* c_node;
+ const ib_rbt_node_t* p_node;
+ buf_page_t* prev = NULL;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+ /* Insert this buffer into the rbt. */
+ c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
+ ut_a(c_node != NULL);
+
+ /* Get the predecessor. */
+ p_node = rbt_prev(buf_pool->flush_rbt, c_node);
+
+ if (p_node != NULL) {
+ buf_page_t** value;
+ value = rbt_value(buf_page_t*, p_node);
+ prev = *value;
+ ut_a(prev != NULL);
+ }
+
+ return(prev);
+}
+
+/*********************************************************//**
+Delete a bpage from the flush_rbt. */
+static
+void
+buf_flush_delete_from_flush_rbt(
+/*============================*/
+ buf_page_t* bpage) /*!< in: bpage to be removed. */
+{
+#ifdef UNIV_DEBUG
+ ibool ret = FALSE;
+#endif /* UNIV_DEBUG */
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+#ifdef UNIV_DEBUG
+ ret =
+#endif /* UNIV_DEBUG */
+ rbt_delete(buf_pool->flush_rbt, &bpage);
+
+ ut_ad(ret);
+}
+
+/*****************************************************************//**
+Compare two modified blocks in the buffer pool. The key for comparison
+is:
+key = <oldest_modification, space, offset>
+This comparison is used to maintain ordering of blocks in the
+buf_pool->flush_rbt.
+Note that for the purpose of flush_rbt, we only need to order blocks
+on the oldest_modification. The other two fields are used to uniquely
+identify the blocks.
+@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
+static
+int
+buf_flush_block_cmp(
+/*================*/
+ const void* p1, /*!< in: block1 */
+ const void* p2) /*!< in: block2 */
+{
+ int ret;
+ const buf_page_t* b1 = *(const buf_page_t**) p1;
+ const buf_page_t* b2 = *(const buf_page_t**) p2;
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(b1);
+#endif /* UNIV_DEBUG */
+
+ ut_ad(b1 != NULL);
+ ut_ad(b2 != NULL);
+
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+ ut_ad(b1->in_flush_list);
+ ut_ad(b2->in_flush_list);
+
+ if (b2->oldest_modification > b1->oldest_modification) {
+ return(1);
+ } else if (b2->oldest_modification < b1->oldest_modification) {
+ return(-1);
+ }
+
+ /* If oldest_modification is same then decide on the space. */
+ ret = (int)(b2->space - b1->space);
+
+ /* Or else decide ordering on the offset field. */
+ return(ret ? ret : (int)(b2->offset - b1->offset));
+}
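+
+/* Editor's worked example, not part of the original source: if two dirty
+pages share oldest_modification = 100 and space = 5, with b1->offset = 7
+and b2->offset = 3, the LSN and space comparisons tie and the result is
+(int) (3 - 7) < 0, i.e. b2 < b1 under the convention documented above;
+the space and offset fields only serve to break ties between blocks with
+equal modification LSNs. */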
+
+/********************************************************************//**
+Initialize the red-black tree to speed up insertions into the flush_list
+during recovery process. Should be called at the start of recovery
+process before any page has been read/written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void)
+/*==========================*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ /* Create red black tree for speedy insertions in flush list. */
+ buf_pool->flush_rbt = rbt_create(
+ sizeof(buf_page_t*), buf_flush_block_cmp);
+
+ buf_flush_list_mutex_exit(buf_pool);
+ }
+}
+
+/********************************************************************//**
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void)
+/*==========================*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ rbt_free(buf_pool->flush_rbt);
+ buf_pool->flush_rbt = NULL;
+
+ buf_flush_list_mutex_exit(buf_pool);
+ }
+}
+
+/********************************************************************//**
+Inserts a modified block into the flush list. */
+UNIV_INTERN
+void
+buf_flush_insert_into_flush_list(
+/*=============================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ buf_block_t* block, /*!< in/out: block which is modified */
+ lsn_t lsn) /*!< in: oldest modification */
+{
+ ut_ad(log_flush_order_mutex_own());
+ ut_ad(mutex_own(&block->mutex));
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
+ || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
+ <= lsn));
+
+ /* If we are in the recovery then we need to update the flush
+ red-black tree as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_list_mutex_exit(buf_pool);
+ buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
+ return;
+ }
+
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(!block->page.in_flush_list);
+
+ ut_d(block->page.in_flush_list = TRUE);
+ block->page.oldest_modification = lsn;
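+	/* Outside of recovery the oldest_modification values are assigned
+	in order, under the log flush order mutex, so adding at the head
+	keeps the flush list sorted by oldest_modification in descending
+	order (see the assertion above). */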
+ UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
+ incr_flush_list_size_in_bytes(block, buf_pool);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ {
+ ulint zip_size = buf_block_get_zip_size(block);
+
+ if (zip_size) {
+ UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+ } else {
+ UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
+ }
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_skip(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_list_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Inserts a modified block into the flush list in the right sorted position.
+This function is used by recovery, because there the modifications do not
+necessarily come in the order of lsn's. */
+UNIV_INTERN
+void
+buf_flush_insert_sorted_into_flush_list(
+/*====================================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_block_t* block, /*!< in/out: block which is modified */
+ lsn_t lsn) /*!< in: oldest modification */
+{
+ buf_page_t* prev_b;
+ buf_page_t* b;
+
+ ut_ad(log_flush_order_mutex_own());
+ ut_ad(mutex_own(&block->mutex));
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ /* The field in_LRU_list is protected by buf_pool->LRU_list_mutex,
+ which we are not holding. However, while a block is in the flush
+	list, it is dirty and cannot be discarded from either the
+	page_hash or the LRU list. At most, the uncompressed
+ page frame of a compressed block may be discarded or created
+ (copying the block->page to or from a buf_page_t that is
+ dynamically allocated from buf_buddy_alloc()). Because those
+ transitions hold block->mutex and the flush list mutex (via
+ buf_flush_relocate_on_flush_list()), there is no possibility
+ of a race condition in the assertions below. */
+ ut_ad(block->page.in_LRU_list);
+ ut_ad(block->page.in_page_hash);
+ /* buf_buddy_block_register() will take a block in the
+ BUF_BLOCK_MEMORY state, not a file page. */
+ ut_ad(!block->page.in_zip_hash);
+
+ ut_ad(!block->page.in_flush_list);
+ ut_d(block->page.in_flush_list = TRUE);
+ block->page.oldest_modification = lsn;
+
+#ifdef UNIV_DEBUG_VALGRIND
+ {
+ ulint zip_size = buf_block_get_zip_size(block);
+
+ if (zip_size) {
+ UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+ } else {
+ UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
+ }
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ prev_b = NULL;
+
+ /* For the most part when this function is called the flush_rbt
+ should not be NULL. In a very rare boundary case it is possible
+ that the flush_rbt has already been freed by the recovery thread
+ before the last page was hooked up in the flush_list by the
+ io-handler thread. In that case we'll just do a simple
+ linear search in the else block. */
+ if (buf_pool->flush_rbt) {
+
+ prev_b = buf_flush_insert_in_flush_rbt(&block->page);
+
+ } else {
+
+ b = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+ while (b && b->oldest_modification
+ > block->page.oldest_modification) {
+ ut_ad(b->in_flush_list);
+ prev_b = b;
+ b = UT_LIST_GET_NEXT(list, b);
+ }
+ }
+
+ if (prev_b == NULL) {
+ UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
+ } else {
+ UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
+ prev_b, &block->page);
+ }
+
+ incr_flush_list_size_in_bytes(block, buf_pool);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_list_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Returns TRUE if the file page block is immediately suitable for replacement,
+i.e., the transition FILE_PAGE => NOT_USED is allowed.
+@return TRUE if can replace immediately */
+UNIV_INTERN
+ibool
+buf_flush_ready_for_replace(
+/*========================*/
+ buf_page_t* bpage) /*!< in: buffer control block, must be
+ buf_page_in_file(bpage) and in the LRU list */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+#endif
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(bpage->in_LRU_list);
+
+ if (UNIV_LIKELY(buf_page_in_file(bpage))) {
+
+ return(bpage->oldest_modification == 0
+ && buf_page_get_io_fix(bpage) == BUF_IO_NONE
+ && bpage->buf_fix_count == 0);
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: buffer block state %lu"
+ " in the LRU list!\n",
+ (ulong) buf_page_get_state(bpage));
+ ut_print_buf(stderr, bpage, sizeof(buf_page_t));
+ putc('\n', stderr);
+
+ return(FALSE);
+}
+
+/********************************************************************//**
+Returns true if the block is modified and ready for flushing.
+@return true if can flush immediately */
+UNIV_INTERN
+bool
+buf_flush_ready_for_flush(
+/*======================*/
+ buf_page_t* bpage, /*!< in: buffer control block, must be
+ buf_page_in_file(bpage) */
+ buf_flush_t flush_type)/*!< in: type of flush */
+{
+ ut_ad(flush_type < BUF_FLUSH_N_TYPES);
+ ut_ad(mutex_own(buf_page_get_mutex(bpage))
+ || flush_type == BUF_FLUSH_LIST);
+ ut_a(buf_page_in_file(bpage));
+
+ if (bpage->oldest_modification == 0
+ || buf_page_get_io_fix_unlocked(bpage) != BUF_IO_NONE) {
+ return(false);
+ }
+
+ ut_ad(bpage->in_flush_list);
+
+ switch (flush_type) {
+ case BUF_FLUSH_LIST:
+ case BUF_FLUSH_LRU:
+ case BUF_FLUSH_SINGLE_PAGE:
+ /* Because any thread may call single page flush, even
+ when owning locks on pages, to avoid deadlocks, we must
+		make sure that it is not buffer-fixed.
+ The same holds true for LRU flush because a user thread
+ may end up waiting for an LRU flush to end while
+ holding locks on other pages. */
+ return(bpage->buf_fix_count == 0);
+ case BUF_FLUSH_N_TYPES:
+ break;
+ }
+
+ ut_error;
+ return(false);
+}
+
+/********************************************************************//**
+Remove a block from the flush list of modified blocks. */
+UNIV_INTERN
+void
+buf_flush_remove(
+/*=============*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ulint zip_size;
+
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_DIRTY
+ || mutex_own(&buf_pool->LRU_list_mutex));
+#endif
+ ut_ad(bpage->in_flush_list);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_ZIP_PAGE:
+ /* Clean compressed pages should not be on the flush list */
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ return;
+ case BUF_BLOCK_ZIP_DIRTY:
+ buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
+ UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ buf_LRU_insert_zip_clean(bpage);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+ break;
+ }
+
+ /* If the flush_rbt is active then delete from there as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_delete_from_flush_rbt(bpage);
+ }
+
+ /* Must be done after we have removed it from the flush_rbt
+	because we assert on in_flush_list in the comparison function. */
+ ut_d(bpage->in_flush_list = FALSE);
+
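+	/* Adjust the flush list size accounting that was increased in
+	incr_flush_list_size_in_bytes() when the page was added: a page
+	with a compressed copy is accounted with its compressed size,
+	otherwise with UNIV_PAGE_SIZE. */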
+ zip_size = page_zip_get_size(&bpage->zip);
+ buf_pool->stat.flush_list_bytes -= zip_size ? zip_size : UNIV_PAGE_SIZE;
+
+ bpage->oldest_modification = 0;
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_skip(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_update_hp(buf_pool, bpage);
+ buf_flush_list_mutex_exit(buf_pool);
+}
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage.
+IMPORTANT: When this function is called bpage and dpage are not
+exact copies of each other. For example, they both will have different
+::state. Also the ::list pointers in dpage may be stale. We need to
+use the current list node (bpage) to do the list manipulation because
+the list pointers could have changed between the time that we copied
+the contents of bpage to the dpage and the flush list manipulation
+below. */
+UNIV_INTERN
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage) /*!< in/out: destination block */
+{
+ buf_page_t* prev;
+ buf_page_t* prev_b = NULL;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ /* Must reside in the same buffer pool. */
+ ut_ad(buf_pool == buf_pool_from_bpage(dpage));
+
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ ut_ad(bpage->in_flush_list);
+ ut_ad(dpage->in_flush_list);
+
+ /* If recovery is active we must swap the control blocks in
+ the flush_rbt as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_delete_from_flush_rbt(bpage);
+ prev_b = buf_flush_insert_in_flush_rbt(dpage);
+ }
+
+ /* Must be done after we have removed it from the flush_rbt
+	because we assert on in_flush_list in the comparison function. */
+ ut_d(bpage->in_flush_list = FALSE);
+
+ prev = UT_LIST_GET_PREV(list, bpage);
+ UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+
+ if (prev) {
+ ut_ad(prev->in_flush_list);
+ UT_LIST_INSERT_AFTER(
+ list,
+ buf_pool->flush_list,
+ prev, dpage);
+ } else {
+ UT_LIST_ADD_FIRST(
+ list,
+ buf_pool->flush_list,
+ dpage);
+ }
+
+ /* Just an extra check. Previous in flush_list
+ should be the same control block as in flush_rbt. */
+ ut_a(!buf_pool->flush_rbt || prev_b == prev);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_update_hp(buf_pool, bpage);
+ buf_flush_list_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Updates the flush system data structures when a write is completed. */
+UNIV_INTERN
+void
+buf_flush_write_complete(
+/*=====================*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ buf_flush_t flush_type = buf_page_get_flush_type(bpage);
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ mutex_enter(&buf_pool->flush_state_mutex);
+
+ buf_flush_remove(bpage);
+
+ buf_page_set_io_fix(bpage, BUF_IO_NONE);
+
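+	/* This write was counted in n_flush[] when buf_flush_page()
+	dispatched it; decrement the counter and, once the whole batch
+	has drained, wake up any thread waiting in
+	buf_flush_wait_batch_end(). */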
+ buf_pool->n_flush[flush_type]--;
+
+ /* fprintf(stderr, "n pending flush %lu\n",
+ buf_pool->n_flush[flush_type]); */
+
+ if (buf_pool->n_flush[flush_type] == 0
+ && buf_pool->init_flush[flush_type] == FALSE) {
+
+ /* The running flush batch has ended */
+
+ os_event_set(buf_pool->no_flush[flush_type]);
+ }
+
+ buf_dblwr_update(bpage, flush_type);
+
+ mutex_exit(&buf_pool->flush_state_mutex);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Calculate the checksum of a page from compressed table and update the page. */
+UNIV_INTERN
+void
+buf_flush_update_zip_checksum(
+/*==========================*/
+ buf_frame_t* page, /*!< in/out: Page to update */
+ ulint zip_size, /*!< in: Compressed page size */
+ lsn_t lsn) /*!< in: Lsn to stamp on the page */
+{
+ ut_a(zip_size > 0);
+
+ ib_uint32_t checksum = page_zip_calc_checksum(
+ page, zip_size,
+ static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm));
+
+ mach_write_to_8(page + FIL_PAGE_LSN, lsn);
+ memset(page + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
+}
+
+/********************************************************************//**
+Initializes a page for writing to the tablespace. */
+UNIV_INTERN
+void
+buf_flush_init_for_writing(
+/*=======================*/
+ byte* page, /*!< in/out: page */
+ void* page_zip_, /*!< in/out: compressed page, or NULL */
+ lsn_t newest_lsn) /*!< in: newest modification lsn
+ to the page */
+{
+ ib_uint32_t checksum = 0 /* silence bogus gcc warning */;
+
+ ut_ad(page);
+
+ if (page_zip_) {
+ page_zip_des_t* page_zip;
+ ulint zip_size;
+
+ page_zip = static_cast<page_zip_des_t*>(page_zip_);
+ zip_size = page_zip_get_size(page_zip);
+
+ ut_ad(zip_size);
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+
+ switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ /* These are essentially uncompressed pages. */
+ memcpy(page_zip->data, page, zip_size);
+ /* fall through */
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ case FIL_PAGE_INDEX:
+
+ buf_flush_update_zip_checksum(
+ page_zip->data, zip_size, newest_lsn);
+
+ return;
+ }
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: ERROR: The compressed page to be written"
+ " seems corrupt:", stderr);
+ ut_print_buf(stderr, page, zip_size);
+ fputs("\nInnoDB: Possibly older version of the page:", stderr);
+ ut_print_buf(stderr, page_zip->data, zip_size);
+ putc('\n', stderr);
+ ut_error;
+ }
+
+ /* Write the newest modification lsn to the page header and trailer */
+ mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
+
+ mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ newest_lsn);
+
+ /* Store the new formula checksum */
+
+ switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ checksum = buf_calc_page_crc32(page);
+ break;
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ checksum = (ib_uint32_t) buf_calc_page_new_checksum(page);
+ break;
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ checksum = BUF_NO_CHECKSUM_MAGIC;
+ break;
+ /* no default so the compiler will emit a warning if new enum
+ is added and not handled here */
+ }
+
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
+
+ /* We overwrite the first 4 bytes of the end lsn field to store
+ the old formula checksum. Since it depends also on the field
+ FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
+ new formula checksum. */
+
+ if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB
+ || srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB) {
+
+ checksum = (ib_uint32_t) buf_calc_page_old_checksum(page);
+
+ /* In other cases we use the value assigned from above.
+ If CRC32 is used then it is faster to use that checksum
+ (calculated above) instead of calculating another one.
+ We can afford to store something other than
+ buf_calc_page_old_checksum() or BUF_NO_CHECKSUM_MAGIC in
+ this field because the file will not be readable by old
+ versions of MySQL/InnoDB anyway (older than MySQL 5.6.3) */
+ }
+
+ mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ checksum);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Does an asynchronous write of a buffer page. NOTE: in simulated aio and
+also when the doublewrite buffer is used, we must call
+buf_dblwr_flush_buffered_writes after we have posted a batch of
+writes! */
+static
+void
+buf_flush_write_block_low(
+/*======================*/
+ buf_page_t* bpage, /*!< in: buffer block to write */
+ buf_flush_t flush_type, /*!< in: type of flush */
+ bool sync) /*!< in: true if sync IO request */
+{
+ ulint zip_size = buf_page_get_zip_size(bpage);
+ page_t* frame = NULL;
+
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+#endif
+
+#ifdef UNIV_LOG_DEBUG
+ static ibool univ_log_debug_warned;
+#endif /* UNIV_LOG_DEBUG */
+
+ ut_ad(buf_page_in_file(bpage));
+
+ /* We are not holding block_mutex here.
+ Nevertheless, it is safe to access bpage, because it is
+ io_fixed and oldest_modification != 0. Thus, it cannot be
+ relocated in the buffer pool or removed from flush_list or
+ LRU_list. */
+ ut_ad(!buf_flush_list_mutex_own(buf_pool));
+ ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_get_io_fix_unlocked(bpage) == BUF_IO_WRITE);
+ ut_ad(bpage->oldest_modification != 0);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
+#endif
+ ut_ad(bpage->newest_modification != 0);
+
+#ifdef UNIV_LOG_DEBUG
+ if (!univ_log_debug_warned) {
+ univ_log_debug_warned = TRUE;
+ fputs("Warning: cannot force log to disk if"
+ " UNIV_LOG_DEBUG is defined!\n"
+ "Crash recovery will not work!\n",
+ stderr);
+ }
+#else
+ /* Force the log to the disk before writing the modified block */
+ log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
+#endif
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ case BUF_BLOCK_ZIP_DIRTY:
+ frame = bpage->zip.data;
+
+ ut_a(page_zip_verify_checksum(frame, zip_size));
+
+ mach_write_to_8(frame + FIL_PAGE_LSN,
+ bpage->newest_modification);
+ memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ frame = bpage->zip.data;
+ if (!frame) {
+ frame = ((buf_block_t*) bpage)->frame;
+ }
+
+ buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
+ bpage->zip.data
+ ? &bpage->zip : NULL,
+ bpage->newest_modification);
+ break;
+ }
+
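+	/* Three write paths: with the doublewrite buffer disabled or not
+	yet created, the page goes directly to the data file via fil_io();
+	a single page flush uses the dedicated doublewrite slot; everything
+	else is added to the current doublewrite batch and written out
+	when the batch is flushed. */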
+ if (!srv_use_doublewrite_buf || !buf_dblwr) {
+ fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+ sync, buf_page_get_space(bpage), zip_size,
+ buf_page_get_page_no(bpage), 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ frame, bpage);
+ } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
+ buf_dblwr_write_single_page(bpage, sync);
+ } else {
+ ut_ad(!sync);
+ buf_dblwr_add_to_batch(bpage);
+ }
+
+ /* When doing single page flushing the IO is done synchronously
+ and we flush the changes to disk only for the tablespace we
+ are working on. */
+ if (sync) {
+ ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
+ fil_flush(buf_page_get_space(bpage));
+ buf_page_io_complete(bpage);
+ }
+
+ /* Increment the counter of I/O operations used
+ for selecting LRU policy. */
+ buf_LRU_stat_inc_io();
+}
+
+/********************************************************************//**
+Writes a flushable page asynchronously from the buffer pool to a file.
+NOTE: in simulated aio we must call
+os_aio_simulated_wake_handler_threads after we have posted a batch of
+writes! NOTE: buf_page_get_mutex(bpage) must be held upon entering this
+function, and it will be released by this function. */
+UNIV_INTERN
+void
+buf_flush_page(
+/*===========*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_page_t* bpage, /*!< in: buffer control block */
+ buf_flush_t flush_type, /*!< in: type of flush */
+ bool sync) /*!< in: true if sync IO request */
+{
+ ib_mutex_t* block_mutex;
+ ibool is_uncompressed;
+
+ ut_ad(flush_type < BUF_FLUSH_N_TYPES);
+ ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
+
+ block_mutex = buf_page_get_mutex(bpage);
+ ut_ad(mutex_own(block_mutex));
+
+ ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
+
+ mutex_enter(&buf_pool->flush_state_mutex);
+
+ buf_page_set_io_fix(bpage, BUF_IO_WRITE);
+
+ buf_page_set_flush_type(bpage, flush_type);
+
+ if (buf_pool->n_flush[flush_type] == 0) {
+
+ os_event_reset(buf_pool->no_flush[flush_type]);
+ }
+
+ buf_pool->n_flush[flush_type]++;
+
+ mutex_exit(&buf_pool->flush_state_mutex);
+
+ is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
+
+ switch (flush_type) {
+ ibool is_s_latched;
+ case BUF_FLUSH_LIST:
+ /* If the simulated aio thread is not running, we must
+ not wait for any latch, as we may end up in a deadlock:
+ if buf_fix_count == 0, then we know we need not wait */
+
+ is_s_latched = (bpage->buf_fix_count == 0);
+ if (is_s_latched && is_uncompressed) {
+ rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
+ BUF_IO_WRITE);
+ }
+
+ mutex_exit(block_mutex);
+
+ /* Even though bpage is not protected by any mutex at
+ this point, it is safe to access bpage, because it is
+ io_fixed and oldest_modification != 0. Thus, it
+ cannot be relocated in the buffer pool or removed from
+ flush_list or LRU_list. */
+
+ if (!is_s_latched) {
+ buf_dblwr_flush_buffered_writes();
+
+ if (is_uncompressed) {
+ rw_lock_s_lock_gen(&((buf_block_t*) bpage)
+ ->lock, BUF_IO_WRITE);
+ }
+ }
+
+ break;
+
+ case BUF_FLUSH_LRU:
+ case BUF_FLUSH_SINGLE_PAGE:
+ /* VERY IMPORTANT:
+ Because any thread may call single page flush, even when
+ owning locks on pages, to avoid deadlocks, we must make
+ sure that the s-lock is acquired on the page without
+ waiting: this is accomplished because
+ buf_flush_ready_for_flush() must hold, and that requires
+		the page not to be buffer-fixed.
+ The same holds true for LRU flush because a user thread
+ may end up waiting for an LRU flush to end while
+ holding locks on other pages. */
+
+ if (is_uncompressed) {
+ rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
+ BUF_IO_WRITE);
+ }
+
+ /* Note that the s-latch is acquired before releasing the
+ buf_page_get_mutex() mutex: this ensures that the latch is
+ acquired immediately. */
+
+ mutex_exit(block_mutex);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ /* Even though bpage is not protected by any mutex at this
+ point, it is safe to access bpage, because it is io_fixed and
+ oldest_modification != 0. Thus, it cannot be relocated in the
+ buffer pool or removed from flush_list or LRU_list. */
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr,
+ "Flushing %u space %u page %u\n",
+ flush_type, bpage->space, bpage->offset);
+ }
+#endif /* UNIV_DEBUG */
+ buf_flush_write_block_low(bpage, flush_type, sync);
+}
+
+# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/********************************************************************//**
+Writes a flushable page asynchronously from the buffer pool to a file.
+NOTE: block->mutex must be held upon entering this function, and it will be
+released by this function after flushing. This is loosely based on
+buf_flush_batch() and buf_flush_page().
+@return TRUE if the page was flushed and the mutexes released */
+UNIV_INTERN
+ibool
+buf_flush_page_try(
+/*===============*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
+ buf_block_t* block) /*!< in/out: buffer control block */
+{
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(mutex_own(&block->mutex));
+
+ if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
+ return(FALSE);
+ }
+
+ /* The following call will release the buffer pool and
+ block mutex. */
+ buf_flush_page(buf_pool, &block->page, BUF_FLUSH_SINGLE_PAGE, true);
+ return(TRUE);
+}
+# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+/***********************************************************//**
+Check whether the page is in the buffer pool and can be flushed.
+@return true if the page can be flushed. */
+static
+bool
+buf_flush_check_neighbor(
+/*=====================*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page offset */
+ buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU or
+ BUF_FLUSH_LIST */
+{
+ buf_page_t* bpage;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ bool ret;
+ prio_rw_lock_t* hash_lock;
+ ib_mutex_t* block_mutex;
+
+ ut_ad(flush_type == BUF_FLUSH_LRU
+ || flush_type == BUF_FLUSH_LIST);
+
+ /* We only want to flush pages from this buffer pool. */
+ bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
+ &hash_lock);
+
+ if (!bpage) {
+
+ return(false);
+ }
+
+ block_mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(block_mutex);
+
+ rw_lock_s_unlock(hash_lock);
+
+ ut_a(buf_page_in_file(bpage));
+
+ /* We avoid flushing 'non-old' blocks in an LRU flush,
+ because the flushed blocks are soon freed */
+
+ ret = false;
+ if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) {
+
+ if (buf_flush_ready_for_flush(bpage, flush_type)) {
+ ret = true;
+ }
+ }
+
+ mutex_exit(block_mutex);
+
+ return(ret);
+}
+
+/***********************************************************//**
+Flushes to disk all flushable pages within the flush area.
+@return number of pages flushed */
+static
+ulint
+buf_flush_try_neighbors(
+/*====================*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page offset */
+ buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU or
+ BUF_FLUSH_LIST */
+ ulint n_flushed, /*!< in: number of pages
+ flushed so far in this batch */
+ ulint n_to_flush) /*!< in: maximum number of pages
+ we are allowed to flush */
+{
+ ulint i;
+ ulint low;
+ ulint high;
+ ulint count = 0;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+ ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+ ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+ ut_ad(!buf_flush_list_mutex_own(buf_pool));
+
+ if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
+ || srv_flush_neighbors == 0) {
+ /* If there is little space or neighbor flushing is
+ not enabled then just flush the victim. */
+ low = offset;
+ high = offset + 1;
+ } else {
+ /* When flushed, dirty blocks are searched in
+ neighborhoods of this size, and flushed along with the
+ original page. */
+
+ ulint buf_flush_area;
+
+ buf_flush_area = ut_min(
+ BUF_READ_AHEAD_AREA(buf_pool),
+ buf_pool->curr_size / 16);
+
+ low = (offset / buf_flush_area) * buf_flush_area;
+ high = (offset / buf_flush_area + 1) * buf_flush_area;
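+		/* For example, with buf_flush_area == 64 and offset == 130
+		this gives low == 128 and high == 192, i.e. the aligned
+		64-page extent that contains the victim page. */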
+
+ if (srv_flush_neighbors == 1) {
+ /* adjust 'low' and 'high' to limit
+ for contiguous dirty area */
+ if (offset > low) {
+ for (i = offset - 1;
+ i >= low
+ && buf_flush_check_neighbor(
+ space, i, flush_type);
+ i--) {
+ /* do nothing */
+ }
+ low = i + 1;
+ }
+
+ for (i = offset + 1;
+ i < high
+ && buf_flush_check_neighbor(
+ space, i, flush_type);
+ i++) {
+ /* do nothing */
+ }
+ high = i;
+ }
+ }
+
+ /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
+
+ if (high > fil_space_get_size(space)) {
+ high = fil_space_get_size(space);
+ }
+
+ for (i = low; i < high; i++) {
+
+ buf_page_t* bpage;
+ prio_rw_lock_t* hash_lock;
+ ib_mutex_t* block_mutex;
+
+ if ((count + n_flushed) >= n_to_flush) {
+
+ /* We have already flushed enough pages and
+ should call it a day. There is, however, one
+ exception. If the page whose neighbors we
+ are flushing has not been flushed yet then
+ we'll try to flush the victim that we
+ selected originally. */
+ if (i <= offset) {
+ i = offset;
+ } else {
+ break;
+ }
+ }
+
+ buf_pool = buf_pool_get(space, i);
+
+ /* We only want to flush pages from this buffer pool. */
+ bpage = buf_page_hash_get_s_locked(buf_pool, space, i,
+ &hash_lock);
+
+ if (!bpage) {
+
+ continue;
+ }
+
+ block_mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(block_mutex);
+
+ rw_lock_s_unlock(hash_lock);
+
+ ut_a(buf_page_in_file(bpage));
+
+ /* We avoid flushing 'non-old' blocks in an LRU flush,
+ because the flushed blocks are soon freed */
+
+ if (flush_type != BUF_FLUSH_LRU
+ || i == offset
+ || buf_page_is_old(bpage)) {
+
+ if (buf_flush_ready_for_flush(bpage, flush_type)
+ && (i == offset || !bpage->buf_fix_count)) {
+ /* We only try to flush those
+ neighbors != offset where the buf fix
+ count is zero, as we then know that we
+ probably can latch the page without a
+ semaphore wait. Semaphore waits are
+ expensive because we must flush the
+ doublewrite buffer before we start
+ waiting. */
+
+ buf_flush_page(buf_pool, bpage, flush_type, false);
+ ut_ad(!mutex_own(block_mutex));
+ count++;
+ continue;
+ }
+ }
+
+ mutex_exit(block_mutex);
+ }
+
+ if (count > 0) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+ MONITOR_FLUSH_NEIGHBOR_COUNT,
+ MONITOR_FLUSH_NEIGHBOR_PAGES,
+ (count - 1));
+ }
+
+ return(count);
+}
+
+/********************************************************************//**
+Check if the block is modified and ready for flushing. If the block
+is ready to flush then flush the page and try to flush its neighbors.
+
+@return TRUE if, depending on the flush type, either LRU or flush list
+mutex was released during this function. This does not guarantee that some
+pages were written as well.
+The number of pages written is added to *count. */
+static
+ibool
+buf_flush_page_and_try_neighbors(
+/*=============================*/
+ buf_page_t* bpage, /*!< in: buffer control block,
+ must be
+ buf_page_in_file(bpage) */
+ buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+ ulint n_to_flush, /*!< in: number of pages to
+ flush */
+ ulint* count) /*!< in/out: number of pages
+ flushed */
+{
+ ib_mutex_t* block_mutex = NULL;
+ ibool flushed = FALSE;
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+#endif /* UNIV_DEBUG */
+
+ ut_ad((flush_type == BUF_FLUSH_LRU
+ && mutex_own(&buf_pool->LRU_list_mutex))
+ || (flush_type == BUF_FLUSH_LIST
+ && buf_flush_list_mutex_own(buf_pool)));
+
+ if (flush_type == BUF_FLUSH_LRU) {
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+ }
+
+ if (UNIV_UNLIKELY(buf_page_get_state(bpage)
+ == BUF_BLOCK_REMOVE_HASH)) {
+
+ /* In case we don't hold the LRU list mutex, we may see a page
+ that is about to be relocated on the flush list. Do not
+ attempt to flush it. */
+ ut_ad(flush_type == BUF_FLUSH_LIST);
+ return (flushed);
+ }
+
+ ut_a(buf_page_in_file(bpage));
+
+ if (buf_flush_ready_for_flush(bpage, flush_type)) {
+ ulint space;
+ ulint offset;
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_bpage(bpage);
+
+ if (flush_type == BUF_FLUSH_LRU) {
+ mutex_exit(&buf_pool->LRU_list_mutex);
+ }
+
+ /* These fields are protected by the buf_page_get_mutex()
+ mutex. */
+ space = buf_page_get_space(bpage);
+ offset = buf_page_get_page_no(bpage);
+
+ if (flush_type == BUF_FLUSH_LRU) {
+ mutex_exit(block_mutex);
+ } else {
+ buf_flush_list_mutex_exit(buf_pool);
+ }
+
+ /* Try to flush also all the neighbors */
+ *count += buf_flush_try_neighbors(space,
+ offset,
+ flush_type,
+ *count,
+ n_to_flush);
+
+ if (flush_type == BUF_FLUSH_LRU) {
+ mutex_enter(&buf_pool->LRU_list_mutex);
+ } else {
+ buf_flush_list_mutex_enter(buf_pool);
+ }
+ flushed = TRUE;
+ } else if (flush_type == BUF_FLUSH_LRU) {
+ mutex_exit(block_mutex);
+ }
+
+ ut_ad((flush_type == BUF_FLUSH_LRU
+ && mutex_own(&buf_pool->LRU_list_mutex))
+ || (flush_type == BUF_FLUSH_LIST
+ && buf_flush_list_mutex_own(buf_pool)));
+
+ return(flushed);
+}
+
+/*******************************************************************//**
+This utility moves the uncompressed frames of pages to the free list.
+Note that this function does not actually flush any data to disk. It
+just detaches the uncompressed frames from the compressed pages at the
+tail of the unzip_LRU and puts those freed frames in the free list.
+Note that it is a best effort attempt and it is not guaranteed that
+after a call to this function there will be 'max' blocks in the free
+list.
+@return number of blocks moved to the free list. */
+static
+ulint
+buf_free_from_unzip_LRU_list_batch(
+/*===============================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint max) /*!< in: desired number of
+ blocks in the free_list */
+{
+ buf_block_t* block;
+ ulint scanned = 0;
+ ulint count = 0;
+ ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
+ ulint lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+
+ ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+
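+	/* Walk the unzip_LRU from its tail, detaching uncompressed frames,
+	until 'max' blocks have been freed, the free list reaches
+	srv_LRU_scan_depth, or the unzip_LRU shrinks to a tenth of the
+	LRU list length. */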
+ block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
+ while (block != NULL && count < max
+ && free_len < srv_LRU_scan_depth
+ && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
+
+ ib_mutex_t* block_mutex = buf_page_get_mutex(&block->page);
+
+ ++scanned;
+
+ mutex_enter(block_mutex);
+
+ if (buf_LRU_free_page(&block->page, false)) {
+
+ mutex_exit(block_mutex);
+ /* Block was freed. LRU list mutex potentially
+ released and reacquired */
+ ++count;
+ mutex_enter(&buf_pool->LRU_list_mutex);
+ block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
+
+ } else {
+
+ mutex_exit(block_mutex);
+ block = UT_LIST_GET_PREV(unzip_LRU, block);
+ }
+
+ free_len = UT_LIST_GET_LEN(buf_pool->free);
+ lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+ }
+
+ ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+ scanned);
+ }
+
+ return(count);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list.
+The calling thread is not allowed to own any latches on pages!
+It attempts to make 'max' blocks available in the free list. Note that
+it is a best effort attempt and it is not guaranteed that after a call
+to this function there will be 'max' blocks in the free list.
+The flushed and evicted page counts are returned in *n. */
+__attribute__((nonnull))
+static
+void
+buf_flush_LRU_list_batch(
+/*=====================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint max, /*!< in: desired number of
+ blocks in the free_list */
+ bool limited_scan, /*!< in: if true, allow to scan only up
+ to srv_LRU_scan_depth pages in total */
+ flush_counters_t* n) /*!< out: flushed/evicted page
+ counts */
+{
+ buf_page_t* bpage;
+ ulint scanned = 0;
+ ulint lru_position = 0;
+ ulint max_lru_position;
+ ulint max_scanned_pages;
+ ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
+ ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ n->flushed = 0;
+ n->evicted = 0;
+
+ ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+
+ max_scanned_pages = limited_scan ? srv_LRU_scan_depth : lru_len * max;
+ max_lru_position = ut_min(srv_LRU_scan_depth, lru_len);
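+	/* The scan is bounded both by position in the LRU list and by the
+	total number of pages inspected (which keeps growing across
+	restarts from the tail), so a tail full of unflushable pages
+	cannot make a single batch walk the whole list. */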
+
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ while (bpage != NULL
+ && (srv_cleaner_eviction_factor ? n->evicted : n->flushed) < max
+ && free_len < srv_LRU_scan_depth
+ && lru_len > BUF_LRU_MIN_LEN
+ && lru_position < max_lru_position
+ && scanned < max_scanned_pages) {
+
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+ ibool evict;
+ ulint failed_acquire;
+
+ ++scanned;
+ ++lru_position;
+
+ failed_acquire = mutex_enter_nowait(block_mutex);
+
+ evict = UNIV_LIKELY(!failed_acquire)
+ && buf_flush_ready_for_replace(bpage);
+
+ if (UNIV_LIKELY(!failed_acquire) && !evict) {
+
+ mutex_exit(block_mutex);
+ }
+
+ /* If the block is ready to be replaced we try to
+ free it i.e.: put it on the free list.
+ Otherwise we try to flush the block and its
+ neighbors. In this case we'll put it on the
+ free list in the next pass. We do this extra work
+ of putting blocks to the free list instead of
+ just flushing them because after every flush
+ we have to restart the scan from the tail of
+ the LRU list and if we don't clear the tail
+ of the flushed pages then the scan becomes
+ O(n*n). */
+ if (evict) {
+ if (buf_LRU_free_page(bpage, true)) {
+
+ mutex_exit(block_mutex);
+ n->evicted++;
+ lru_position = 0;
+ mutex_enter(&buf_pool->LRU_list_mutex);
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ } else {
+
+ bpage = UT_LIST_GET_PREV(LRU, bpage);
+ mutex_exit(block_mutex);
+ }
+ } else if (UNIV_LIKELY(!failed_acquire)) {
+
+ if (buf_flush_page_and_try_neighbors(
+ bpage,
+ BUF_FLUSH_LRU, max, &n->flushed)) {
+
+ lru_position = 0;
+
+ /* LRU list mutex was released.
+ Restart the scan. */
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ } else {
+
+ bpage = UT_LIST_GET_PREV(LRU, bpage);
+ }
+ }
+
+ free_len = UT_LIST_GET_LEN(buf_pool->free);
+ lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+ }
+
+ ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+
+ /* We keep track of all flushes happening as part of LRU
+ flush. When estimating the desired rate at which flush_list
+ should be flushed, we factor in this value. */
+ buf_pool->stat.buf_lru_flush_page_count += n->flushed;
+
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+ scanned);
+ }
+}
+
+/*******************************************************************//**
+Flush and move pages from LRU or unzip_LRU list to the free list.
+Whether LRU or unzip_LRU is used depends on the state of the system.
+The flushed and evicted page counts are returned in *n; blocks moved
+from the unzip_LRU to the free list are counted as evicted. */
+__attribute__((nonnull))
+static
+void
+buf_do_LRU_batch(
+/*=============*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint max, /*!< in: desired number of
+ blocks in the free_list */
+ bool limited_scan, /*!< in: if true, allow to scan only up
+ to srv_LRU_scan_depth pages in total */
+ flush_counters_t* n) /*!< out: flushed/evicted page
+ counts */
+{
+ ulint count = 0;
+
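+	/* If the system currently prefers evicting uncompressed frames,
+	reclaim from the unzip_LRU first; whatever remains of the quota is
+	then used for a regular LRU flush/eviction pass. */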
+ if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
+ count += buf_free_from_unzip_LRU_list_batch(buf_pool, max);
+ }
+
+ if (max > count) {
+ buf_flush_LRU_list_batch(buf_pool, max - count, limited_scan,
+ n);
+ } else {
+ n->evicted = 0;
+ n->flushed = 0;
+ }
+
+ n->evicted += count;
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the flush_list.
+The calling thread is not allowed to own any latches on pages!
+@return number of blocks for which the write request was queued;
+ULINT_UNDEFINED if there was a flush of the same type already
+running */
+static
+ulint
+buf_do_flush_list_batch(
+/*====================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+	ulint		min_n,		/*!< in: wished minimum number
+ of blocks flushed (it is not
+ guaranteed that the actual
+ number is that big, though) */
+ lsn_t lsn_limit) /*!< all blocks whose
+ oldest_modification is smaller
+ than this should be flushed (if
+ their number does not exceed
+ min_n) */
+{
+ ulint count = 0;
+ ulint scanned = 0;
+
+ /* Start from the end of the list looking for a suitable
+ block to be flushed. */
+ buf_flush_list_mutex_enter(buf_pool);
+ ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
+
+ /* In order not to degenerate this scan to O(n*n) we attempt
+ to preserve pointer of previous block in the flush list. To do
+ so we declare it a hazard pointer. Any thread working on the
+ flush list must check the hazard pointer and if it is removing
+ the same block then it must reset it. */
+ for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+ count < min_n && bpage != NULL && len > 0
+ && bpage->oldest_modification < lsn_limit;
+ ++scanned) {
+
+ buf_page_t* prev;
+
+ ut_a(bpage->oldest_modification > 0);
+ ut_ad(bpage->in_flush_list);
+
+ prev = UT_LIST_GET_PREV(list, bpage);
+ buf_flush_set_hp(buf_pool, prev);
+
+#ifdef UNIV_DEBUG
+ bool flushed =
+#endif /* UNIV_DEBUG */
+ buf_flush_page_and_try_neighbors(
+ bpage, BUF_FLUSH_LIST, min_n, &count);
+
+ ut_ad(flushed || buf_flush_is_hp(buf_pool, prev));
+
+ if (!buf_flush_is_hp(buf_pool, prev)) {
+ /* The hazard pointer was reset by some other
+ thread. Restart the scan. */
+ ut_ad(buf_flush_is_hp(buf_pool, NULL));
+ bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+ len = UT_LIST_GET_LEN(buf_pool->flush_list);
+ } else {
+ bpage = prev;
+ --len;
+ buf_flush_set_hp(buf_pool, NULL);
+ }
+
+ ut_ad(!bpage || bpage->in_flush_list);
+ }
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
+ MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+ MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+ scanned);
+
+ return(count);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list or flush_list.
+NOTE 1: in the case of an LRU flush the calling thread may own latches to
+pages: to avoid deadlocks, this function must be written so that it cannot
+end up waiting for these latches! NOTE 2: in the case of a flush list flush,
+the calling thread is not allowed to own any latches on pages!
+The flushed and evicted page counts are returned in *n. */
+__attribute__((nonnull))
+static
+void
+buf_flush_batch(
+/*============*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU or
+ BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
+ then the caller must not own any
+ latches on pages */
+	ulint		min_n,		/*!< in: wished minimum number of blocks
+ flushed (it is not guaranteed that the
+ actual number is that big, though) */
+ lsn_t lsn_limit, /*!< in: in the case of BUF_FLUSH_LIST
+ all blocks whose oldest_modification is
+ smaller than this should be flushed
+ (if their number does not exceed
+ min_n), otherwise ignored */
+ bool limited_lru_scan,/*!< in: for LRU flushes, if true,
+ allow to scan only up to
+ srv_LRU_scan_depth pages in total */
+ flush_counters_t* n) /*!< out: flushed/evicted page
+ counts */
+{
+ ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad((flush_type != BUF_FLUSH_LIST)
+ || sync_thread_levels_empty_except_dict());
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* Note: The buffer pool mutexes are released and reacquired within
+ the flush functions. */
+ switch (flush_type) {
+ case BUF_FLUSH_LRU:
+ mutex_enter(&buf_pool->LRU_list_mutex);
+ buf_do_LRU_batch(buf_pool, min_n, limited_lru_scan, n);
+ mutex_exit(&buf_pool->LRU_list_mutex);
+ break;
+ case BUF_FLUSH_LIST:
+ ut_ad(!limited_lru_scan);
+ n->flushed = buf_do_flush_list_batch(buf_pool, min_n,
+ lsn_limit);
+ n->evicted = 0;
+ break;
+ default:
+ ut_error;
+ }
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints && n->flushed > 0) {
+ fprintf(stderr, flush_type == BUF_FLUSH_LRU
+ ? "Flushed %lu pages in LRU flush\n"
+ : "Flushed %lu pages in flush list flush\n",
+ (ulong) n->flushed);
+ }
+#endif /* UNIV_DEBUG */
+}
+
+/******************************************************************//**
+Gather the aggregated stats for both flush list and LRU list flushing */
+static
+void
+buf_flush_common(
+/*=============*/
+ buf_flush_t flush_type, /*!< in: type of flush */
+ ulint page_count) /*!< in: number of pages flushed */
+{
+ if (page_count) {
+ buf_dblwr_flush_buffered_writes();
+ }
+
+ ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints && page_count > 0) {
+ fprintf(stderr, flush_type == BUF_FLUSH_LRU
+ ? "Flushed %lu pages in LRU flush\n"
+ : "Flushed %lu pages in flush list flush\n",
+ (ulong) page_count);
+ }
+#endif /* UNIV_DEBUG */
+
+ srv_stats.buf_pool_flushed.add(page_count);
+}
+
+/******************************************************************//**
+Start a buffer flush batch for LRU or flush list */
+static
+ibool
+buf_flush_start(
+/*============*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+{
+ mutex_enter(&buf_pool->flush_state_mutex);
+
+ if (buf_pool->n_flush[flush_type] > 0
+ || buf_pool->init_flush[flush_type] == TRUE) {
+
+ /* There is already a flush batch of the same type running */
+
+ mutex_exit(&buf_pool->flush_state_mutex);
+
+ return(FALSE);
+ }
+
+ buf_pool->init_flush[flush_type] = TRUE;
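+	/* While init_flush is TRUE no other batch of the same type can be
+	started, and buf_flush_write_complete() will not signal no_flush[]
+	even if n_flush[] momentarily drops to zero while the batch is
+	still being queued. */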
+
+ mutex_exit(&buf_pool->flush_state_mutex);
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+End a buffer flush batch for LRU or flush list */
+static
+void
+buf_flush_end(
+/*==========*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+{
+ mutex_enter(&buf_pool->flush_state_mutex);
+
+ buf_pool->init_flush[flush_type] = FALSE;
+
+ buf_pool->try_LRU_scan = TRUE;
+
+ if (buf_pool->n_flush[flush_type] == 0) {
+
+ /* The running flush batch has ended */
+
+ os_event_set(buf_pool->no_flush[flush_type]);
+ }
+
+ mutex_exit(&buf_pool->flush_state_mutex);
+}
+
+/******************************************************************//**
+Waits until a flush batch of the given type ends */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end(
+/*=====================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ buf_flush_t type) /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+{
+ ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
+
+ if (buf_pool == NULL) {
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; ++i) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ thd_wait_begin(NULL, THD_WAIT_DISKIO);
+ os_event_wait(buf_pool->no_flush[type]);
+ thd_wait_end(NULL);
+ }
+ } else {
+ thd_wait_begin(NULL, THD_WAIT_DISKIO);
+ os_event_wait(buf_pool->no_flush[type]);
+ thd_wait_end(NULL);
+ }
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list and also
+puts replaceable clean pages from the end of the LRU list to the free
+list.
+NOTE: The calling thread is not allowed to own any latches on pages!
+@return true if a batch was queued successfully. false if another batch
+of the same type was already running. */
+__attribute__((nonnull))
+static
+bool
+buf_flush_LRU(
+/*==========*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
+	ulint		min_n,		/*!< in: wished minimum number of blocks
+ flushed (it is not guaranteed that the
+ actual number is that big, though) */
+ bool limited_scan, /*!< in: if true, allow to scan
+ only up to srv_LRU_scan_depth
+ pages in total */
+ flush_counters_t *n) /*!< out: flushed/evicted page
+ counts */
+{
+ if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
+ n->flushed = 0;
+ n->evicted = 0;
+ return(false);
+ }
+
+ buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0, limited_scan, n);
+
+ buf_flush_end(buf_pool, BUF_FLUSH_LRU);
+
+ buf_flush_common(BUF_FLUSH_LRU, n->flushed);
+
+ return(true);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the flush list of
+all buffer pool instances.
+NOTE: The calling thread is not allowed to own any latches on pages!
+@return true if a batch was queued successfully for each buffer pool
+instance. false if another batch of the same type was already running in
+at least one of the buffer pool instances */
+UNIV_INTERN
+bool
+buf_flush_list(
+/*===========*/
+	ulint		min_n,		/*!< in: wished minimum number of blocks
+ flushed (it is not guaranteed that the
+ actual number is that big, though) */
+ lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all
+ blocks whose oldest_modification is
+ smaller than this should be flushed
+ (if their number does not exceed
+ min_n), otherwise ignored */
+ ulint* n_processed) /*!< out: the number of pages
+ which were processed is passed
+ back to caller. Ignored if NULL */
+
+{
+ ulint i;
+
+ ulint requested_pages[MAX_BUFFER_POOLS];
+ bool active_instance[MAX_BUFFER_POOLS];
+ ulint remaining_instances = srv_buf_pool_instances;
+ bool timeout = false;
+ ulint flush_start_time = 0;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ requested_pages[i] = 0;
+ active_instance[i] = true;
+ }
+
+ if (n_processed) {
+ *n_processed = 0;
+ }
+
+ if (min_n != ULINT_MAX) {
+ /* Ensure that flushing is spread evenly amongst the
+ buffer pool instances. When min_n is ULINT_MAX
+ we need to flush everything up to the lsn limit
+ so no limit here. */
+ min_n = (min_n + srv_buf_pool_instances - 1)
+ / srv_buf_pool_instances;
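+		/* For example, a request for 1000 pages with 8 buffer pool
+		instances becomes (1000 + 7) / 8 == 125 pages per instance. */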
+ if (lsn_limit != LSN_MAX) {
+ flush_start_time = ut_time_ms();
+ }
+ }
+
+ /* Flush to lsn_limit in all buffer pool instances */
+ while (remaining_instances && !timeout) {
+
+ ulint flush_common_batch = 0;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+
+ if (flush_start_time
+ && (ut_time_ms() - flush_start_time
+ >= srv_cleaner_max_flush_time)) {
+
+ timeout = true;
+ break;
+ }
+
+ if (active_instance[i]) {
+
+ buf_pool_t* buf_pool;
+ ulint chunk_size;
+ flush_counters_t n;
+
+ chunk_size = ut_min(
+ srv_cleaner_flush_chunk_size,
+ min_n - requested_pages[i]);
+
+ buf_pool = buf_pool_from_array(i);
+
+ if (!buf_flush_start(buf_pool,
+ BUF_FLUSH_LIST)) {
+
+ continue;
+ }
+
+ buf_flush_batch(buf_pool, BUF_FLUSH_LIST,
+ chunk_size, lsn_limit, false,
+ &n);
+
+ buf_flush_end(buf_pool, BUF_FLUSH_LIST);
+
+ flush_common_batch += n.flushed;
+
+ if (n_processed) {
+ *n_processed += n.flushed;
+ }
+
+ requested_pages[i] += chunk_size;
+
+ if (requested_pages[i] >= min_n
+ || !n.flushed) {
+
+ active_instance[i] = false;
+ remaining_instances--;
+ }
+
+ if (n.flushed) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_COUNT,
+ MONITOR_FLUSH_BATCH_PAGES,
+ n.flushed);
+ }
+ }
+ }
+
+ buf_flush_common(BUF_FLUSH_LIST, flush_common_batch);
+ }
+
+	/* If we haven't flushed all the instances due to a timeout or a repeated
+ failure to start a flush, return failure */
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ if (active_instance[i]) {
+ return(false);
+ }
+ }
+
+ return(true);
+}
+
+/******************************************************************//**
+This function picks up a single dirty page from the tail of the LRU
+list, flushes it, removes it from page_hash and LRU list and puts
+it on the free list. It is called from user threads when they are
+unable to find a replaceable page at the tail of the LRU list i.e.:
+when the background LRU flushing in the page_cleaner thread is not
+fast enough to keep pace with the workload.
+@return TRUE if success. */
+UNIV_INTERN
+ibool
+buf_flush_single_page_from_LRU(
+/*===========================*/
+ buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */
+{
+ ulint scanned;
+ buf_page_t* bpage;
+ ib_mutex_t* block_mutex;
+ ibool freed;
+ bool evict_zip;
+
+ mutex_enter(&buf_pool->LRU_list_mutex);
+
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), scanned = 1;
+ bpage != NULL;
+ bpage = UT_LIST_GET_PREV(LRU, bpage), ++scanned) {
+
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+ if (buf_flush_ready_for_flush(bpage,
+ BUF_FLUSH_SINGLE_PAGE)) {
+ /* buf_flush_page() will release the block
+ mutex */
+ break;
+ }
+ mutex_exit(block_mutex);
+ }
+
+ mutex_exit(&buf_pool->LRU_list_mutex);
+
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
+ scanned);
+
+ if (!bpage) {
+ /* Can't find a single flushable page. */
+ return(FALSE);
+ }
+
+ /* The following call will release the buf_page_get_mutex() mutex. */
+ buf_flush_page(buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
+
+ /* At this point the page has been written to the disk.
+	As we are not holding the LRU list or buf_page_get_mutex() mutex,
+	we cannot use the bpage safely. It may have been plucked out
+	of the LRU list by some other thread or it may even have been
+	relocated in the case of a compressed page. We need to start
+ the scan of LRU list again to remove the block from the LRU
+ list and put it on the free list. */
+ mutex_enter(&buf_pool->LRU_list_mutex);
+
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_PREV(LRU, bpage)) {
+
+ ibool ready;
+
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+ ready = buf_flush_ready_for_replace(bpage);
+ if (ready) {
+ break;
+ }
+ mutex_exit(block_mutex);
+
+ }
+
+ if (!bpage) {
+ /* Can't find a single replaceable page. */
+ mutex_exit(&buf_pool->LRU_list_mutex);
+ return(FALSE);
+ }
+
+	evict_zip = !buf_LRU_evict_from_unzip_LRU(buf_pool);
+
+ freed = buf_LRU_free_page(bpage, evict_zip);
+	if (!freed) {
+		mutex_exit(&buf_pool->LRU_list_mutex);
+	}
+
+	mutex_exit(block_mutex);
+
+ return(freed);
+}
+
+/*********************************************************************//**
+Clears up tail of the LRU lists:
+* Put replaceable pages at the tail of LRU to the free list
+* Flush dirty pages at the tail of LRU to the disk
+The depth to which we scan each buffer pool is controlled by dynamic
+config parameter innodb_LRU_scan_depth.
+@return total pages flushed */
+UNIV_INTERN
+ulint
+buf_flush_LRU_tail(void)
+/*====================*/
+{
+ ulint total_flushed = 0;
+ ulint start_time = ut_time_ms();
+ ulint scan_depth[MAX_BUFFER_POOLS];
+ ulint requested_pages[MAX_BUFFER_POOLS];
+ bool active_instance[MAX_BUFFER_POOLS];
+ bool limited_scan[MAX_BUFFER_POOLS];
+ ulint previous_evicted[MAX_BUFFER_POOLS];
+ ulint remaining_instances = srv_buf_pool_instances;
+ ulint lru_chunk_size = srv_cleaner_lru_chunk_size;
+ ulint free_list_lwm = srv_LRU_scan_depth / 100
+ * srv_cleaner_free_list_lwm;
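+	/* free_list_lwm is srv_cleaner_free_list_lwm percent of the LRU
+	scan depth; note the integer division of the depth by 100 first,
+	so for example a depth of 1024 and a setting of 10 give
+	10 * 10 == 100 free pages. */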
+
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+
+ const buf_pool_t* buf_pool = buf_pool_from_array(i);
+
+ scan_depth[i] = ut_min(srv_LRU_scan_depth,
+ UT_LIST_GET_LEN(buf_pool->LRU));
+ requested_pages[i] = 0;
+ active_instance[i] = true;
+ limited_scan[i] = true;
+ previous_evicted[i] = 0;
+ }
+
+ while (remaining_instances) {
+
+ if (ut_time_ms() - start_time >= srv_cleaner_max_lru_time) {
+
+ break;
+ }
+
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+
+ if (!active_instance[i]) {
+ continue;
+ }
+
+ ulint free_len = free_list_lwm;
+ buf_pool_t* buf_pool = buf_pool_from_array(i);
+
+ do {
+ flush_counters_t n;
+
+ ut_ad(requested_pages[i] <= scan_depth[i]);
+
+ /* Currently page_cleaner is the only thread
+ that can trigger an LRU flush. It is possible
+				that a batch triggered during the last
+				iteration is still running. */
+ if (buf_flush_LRU(buf_pool, lru_chunk_size,
+ limited_scan[i], &n)) {
+
+ /* Allowed only one batch per
+ buffer pool instance. */
+ buf_flush_wait_batch_end(
+ buf_pool, BUF_FLUSH_LRU);
+ }
+
+ total_flushed += n.flushed;
+
+ /* When we evict less pages than we did on a
+ previous try we relax the LRU scan limit in
+ order to attempt to evict more */
+ limited_scan[i]
+ = (previous_evicted[i] > n.evicted);
+ previous_evicted[i] = n.evicted;
+
+ requested_pages[i] += lru_chunk_size;
+
+ if (requested_pages[i] >= scan_depth[i]
+ || !(srv_cleaner_eviction_factor
+ ? n.evicted : n.flushed)) {
+
+ active_instance[i] = false;
+ remaining_instances--;
+ } else {
+
+ free_len = UT_LIST_GET_LEN(
+ buf_pool->free);
+ }
+ } while (active_instance[i]
+ && free_len <= free_list_lwm);
+ }
+ }
+
+ if (total_flushed) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_COUNT,
+ MONITOR_LRU_BATCH_PAGES,
+ total_flushed);
+ }
+
+ return(total_flushed);
+}
+
+/*********************************************************************//**
+Wait for any possible LRU flushes that are in progress to end. */
+UNIV_INTERN
+void
+buf_flush_wait_LRU_batch_end(void)
+/*==============================*/
+{
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ mutex_enter(&buf_pool->flush_state_mutex);
+
+ if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
+ || buf_pool->init_flush[BUF_FLUSH_LRU]) {
+
+ mutex_exit(&buf_pool->flush_state_mutex);
+ buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
+ } else {
+ mutex_exit(&buf_pool->flush_state_mutex);
+ }
+ }
+}
+
+/*********************************************************************//**
+Flush a batch of dirty pages from the flush list
+@return number of pages flushed, 0 if no page is flushed or if another
+flush_list type batch is running */
+static
+ulint
+page_cleaner_do_flush_batch(
+/*========================*/
+ ulint n_to_flush, /*!< in: number of pages that
+ we should attempt to flush. */
+ lsn_t lsn_limit) /*!< in: LSN up to which flushing
+ must happen */
+{
+ ulint n_flushed;
+
+ buf_flush_list(n_to_flush, lsn_limit, &n_flushed);
+
+ return(n_flushed);
+}
+
+/*********************************************************************//**
+Calculates if flushing is required based on the number of dirty pages in
+the buffer pool.
+@return percent of io_capacity to flush to manage dirty page ratio */
+static
+ulint
+af_get_pct_for_dirty()
+/*==================*/
+{
+ ulint dirty_pct = buf_get_modified_ratio_pct();
+
+ ut_a(srv_max_dirty_pages_pct_lwm
+ <= srv_max_buf_pool_modified_pct);
+
+ if (srv_max_dirty_pages_pct_lwm == 0) {
+ /* The user has not set the option to preflush dirty
+ pages as we approach the high water mark. */
+ if (dirty_pct > srv_max_buf_pool_modified_pct) {
+ /* We have crossed the high water mark of dirty
+ pages. In this case we start flushing at 100% of
+ innodb_io_capacity. */
+ return(100);
+ }
+ } else if (dirty_pct > srv_max_dirty_pages_pct_lwm) {
+ /* We should start flushing pages gradually. */
+ return((dirty_pct * 100)
+ / (srv_max_buf_pool_modified_pct + 1));
+ }
+
+ return(0);
+}
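A small self-contained sketch of the dirty-page heuristic above. The threshold values are made-up stand-ins for srv_max_buf_pool_modified_pct and srv_max_dirty_pages_pct_lwm; only the control flow mirrors af_get_pct_for_dirty():

    #include <stdio.h>

    static unsigned long max_dirty_pct = 75; /* stand-in for srv_max_buf_pool_modified_pct */
    static unsigned long dirty_pct_lwm = 10; /* stand-in for srv_max_dirty_pages_pct_lwm */

    /* Returns the percent of io_capacity to flush for a given dirty ratio. */
    static unsigned long pct_for_dirty(unsigned long dirty_pct)
    {
            if (dirty_pct_lwm == 0) {
                    /* Pre-flushing disabled: flush at full capacity only
                    once the hard limit is crossed. */
                    return dirty_pct > max_dirty_pct ? 100 : 0;
            } else if (dirty_pct > dirty_pct_lwm) {
                    /* Ramp up gradually towards the hard limit. */
                    return dirty_pct * 100 / (max_dirty_pct + 1);
            }

            return 0;
    }

    int main(void)
    {
            /* 40% dirty with a 10% low-water mark: 40 * 100 / 76 == 52. */
            printf("%lu\n", pct_for_dirty(40));
            return 0;
    }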
+
+/*********************************************************************//**
+Calculates if flushing is required based on redo generation rate.
+@return percent of io_capacity to flush to manage redo space */
+static
+ulint
+af_get_pct_for_lsn(
+/*===============*/
+ lsn_t age) /*!< in: current age of LSN. */
+{
+ lsn_t max_async_age;
+ lsn_t lsn_age_factor;
+ lsn_t af_lwm = (srv_adaptive_flushing_lwm
+ * log_get_capacity()) / 100;
+
+ if (age < af_lwm) {
+ /* No adaptive flushing. */
+ return(0);
+ }
+
+ max_async_age = log_get_max_modified_age_async();
+
+ if (age < max_async_age && !srv_adaptive_flushing) {
+ /* We have still not reached the max_async point and
+ the user has disabled adaptive flushing. */
+ return(0);
+ }
+
+ /* If we are here then we know that either:
+ 1) the user has enabled adaptive flushing, or
+ 2) the user may have disabled adaptive flushing but we have reached
+ max_async_age. */
+ lsn_age_factor = (age * 100) / max_async_age;
+
+ ut_ad(srv_max_io_capacity >= srv_io_capacity);
+ switch ((srv_cleaner_lsn_age_factor_t)srv_cleaner_lsn_age_factor) {
+ case SRV_CLEANER_LSN_AGE_FACTOR_LEGACY:
+ return(static_cast<ulint>(
+ ((srv_max_io_capacity / srv_io_capacity)
+ * (lsn_age_factor
+ * sqrt((double)lsn_age_factor)))
+ / 7.5));
+ case SRV_CLEANER_LSN_AGE_FACTOR_HIGH_CHECKPOINT:
+ return(static_cast<ulint>(
+ ((srv_max_io_capacity / srv_io_capacity)
+ * (lsn_age_factor * lsn_age_factor
+ * sqrt((double)lsn_age_factor)))
+ / 700.5));
+ default:
+ ut_error;
+ }
+}
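A worked example of the legacy branch above, as a standalone sketch. The capacities are made-up stand-ins for srv_max_io_capacity and srv_io_capacity, and the integer division between them is kept deliberately, as in the original expression; the high-checkpoint variant only differs by squaring lsn_age_factor and dividing by 700.5:

    #include <math.h>
    #include <stdio.h>

    static unsigned long max_io_capacity = 400; /* stand-in for srv_max_io_capacity */
    static unsigned long io_capacity     = 200; /* stand-in for srv_io_capacity */

    /* Legacy age factor: (max/io) * lsn_age_factor^1.5 / 7.5. */
    static unsigned long pct_for_lsn_legacy(unsigned long lsn_age_factor)
    {
            return (unsigned long)(
                    ((double)(max_io_capacity / io_capacity)
                     * (lsn_age_factor * sqrt((double)lsn_age_factor)))
                    / 7.5);
    }

    int main(void)
    {
            /* age at 80% of max_modified_age_async:
            2 * 80 * sqrt(80) / 7.5 == 190 (percent of io_capacity). */
            printf("%lu\n", pct_for_lsn_legacy(80));
            return 0;
    }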
+
+/*********************************************************************//**
+This function is called approximately once every second by the
+page_cleaner thread. Based on various factors it decides if there is a
+need to do flushing. If flushing is needed, it is performed and the
+number of pages flushed is returned.
+@return number of pages flushed */
+static
+ulint
+page_cleaner_flush_pages_if_needed(void)
+/*====================================*/
+{
+ static lsn_t lsn_avg_rate = 0;
+ static lsn_t prev_lsn = 0;
+ static lsn_t last_lsn = 0;
+ static ulint sum_pages = 0;
+ static ulint last_pages = 0;
+ static ulint prev_pages = 0;
+ static ulint avg_page_rate = 0;
+ static ulint n_iterations = 0;
+ lsn_t oldest_lsn;
+ lsn_t cur_lsn;
+ lsn_t age;
+ lsn_t lsn_rate;
+ ulint n_pages = 0;
+ ulint pct_for_dirty = 0;
+ ulint pct_for_lsn = 0;
+ ulint pct_total = 0;
+ int age_factor = 0;
+
+ cur_lsn = log_get_lsn();
+
+ if (prev_lsn == 0) {
+ /* First time around. */
+ prev_lsn = cur_lsn;
+ return(0);
+ }
+
+ if (prev_lsn == cur_lsn) {
+ return(0);
+ }
+
+ /* We update our variables every srv_flushing_avg_loops
+ iterations to smooth out transitions in the workload. */
+ if (++n_iterations >= srv_flushing_avg_loops) {
+
+ avg_page_rate = ((sum_pages / srv_flushing_avg_loops)
+ + avg_page_rate) / 2;
+
+ /* How much LSN we have generated since last call. */
+ lsn_rate = (cur_lsn - prev_lsn) / srv_flushing_avg_loops;
+
+ lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
+
+ prev_lsn = cur_lsn;
+
+ n_iterations = 0;
+
+ sum_pages = 0;
+ }
+
+ oldest_lsn = buf_pool_get_oldest_modification();
+
+ ut_ad(oldest_lsn <= log_get_lsn());
+
+ age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
+
+ pct_for_dirty = af_get_pct_for_dirty();
+ pct_for_lsn = af_get_pct_for_lsn(age);
+
+ pct_total = ut_max(pct_for_dirty, pct_for_lsn);
+
+ /* Cap the maximum IO capacity that we are going to use at
+ srv_max_io_capacity. */
+ n_pages = PCT_IO(pct_total);
+ if (age < log_get_max_modified_age_async()) {
+ n_pages = (n_pages + avg_page_rate) / 2;
+ }
+
+ if (n_pages > srv_max_io_capacity) {
+ n_pages = srv_max_io_capacity;
+ }
+
+ if (last_pages && cur_lsn - last_lsn > lsn_avg_rate / 2) {
+ age_factor = prev_pages / last_pages;
+ }
+
+ MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
+
+ prev_pages = n_pages;
+ n_pages = page_cleaner_do_flush_batch(
+ n_pages, oldest_lsn + lsn_avg_rate * (age_factor + 1));
+
+ last_lsn = cur_lsn;
+ last_pages = n_pages + 1;
+
+ MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
+ MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
+ MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
+ MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
+
+ if (n_pages) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_COUNT,
+ MONITOR_FLUSH_ADAPTIVE_PAGES,
+ n_pages);
+
+ sum_pages += n_pages;
+ }
+
+ return(n_pages);
+}
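The smoothing above folds each completed window into the running averages by halving, so every older window contributes half as much as the next one. A minimal sketch of just that part, with plain C types and a made-up loop count standing in for srv_flushing_avg_loops (the real function keeps this state in function-local statics):

    #include <stdio.h>

    #define FLUSHING_AVG_LOOPS 30 /* stand-in for srv_flushing_avg_loops */

    static unsigned long avg_page_rate = 0;
    static unsigned long sum_pages = 0;
    static unsigned long n_iterations = 0;

    /* Fold one cleaner iteration's page count into the running average. */
    static void update_page_rate(unsigned long pages_this_round)
    {
            sum_pages += pages_this_round;

            if (++n_iterations >= FLUSHING_AVG_LOOPS) {
                    avg_page_rate = (sum_pages / FLUSHING_AVG_LOOPS
                                     + avg_page_rate) / 2;
                    sum_pages = 0;
                    n_iterations = 0;
            }
    }

    int main(void)
    {
            /* Two windows at 600 pages/iteration, then one idle window:
            the average goes 300, 450, 225. */
            for (unsigned long i = 0; i < 3 * FLUSHING_AVG_LOOPS; i++) {
                    update_page_rate(i < 2 * FLUSHING_AVG_LOOPS ? 600 : 0);
            }

            printf("avg_page_rate = %lu\n", avg_page_rate);
            return 0;
    }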
+
+/*********************************************************************//**
+Puts the page_cleaner thread to sleep if it has finished work in less
+than a second */
+static
+void
+page_cleaner_sleep_if_needed(
+/*=========================*/
+ ulint next_loop_time) /*!< in: time when next loop iteration
+ should start */
+{
+ ulint cur_time = ut_time_ms();
+
+ if (next_loop_time > cur_time) {
+ /* Get the sleep interval in microseconds. We use
+ ut_min() to avoid a long sleep in case of
+ wraparound. */
+ os_thread_sleep(ut_min(1000000,
+ (next_loop_time - cur_time)
+ * 1000));
+ }
+}
+
+/*********************************************************************//**
+Returns the aggregate free list length over all buffer pool instances.
+@return total free list length. */
+__attribute__((warn_unused_result))
+static
+ulint
+buf_get_total_free_list_length(void)
+/*================================*/
+{
+ ulint result = 0;
+
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+
+ result += UT_LIST_GET_LEN(buf_pool_from_array(i)->free);
+ }
+
+ return result;
+}
+
+/*********************************************************************//**
+Adjust the desired page cleaner thread sleep time for LRU flushes. */
+__attribute__((nonnull))
+static
+void
+page_cleaner_adapt_lru_sleep_time(
+/*==============================*/
+ ulint* lru_sleep_time) /*!< in/out: desired page cleaner thread sleep
+ time for LRU flushes */
+{
+ ulint free_len = buf_get_total_free_list_length();
+ ulint max_free_len = srv_LRU_scan_depth * srv_buf_pool_instances;
+
+ if (free_len < max_free_len / 100) {
+
+ /* Free lists filled less than 1%, no sleep */
+ *lru_sleep_time = 0;
+ } else if (free_len > max_free_len / 5) {
+
+ /* Free lists filled more than 20%, sleep a bit more */
+ *lru_sleep_time += 50;
+ if (*lru_sleep_time > srv_cleaner_max_lru_time)
+ *lru_sleep_time = srv_cleaner_max_lru_time;
+ } else if (free_len < max_free_len / 20 && *lru_sleep_time >= 50) {
+
+ /* Free lists filled less than 5%, sleep a bit less */
+ *lru_sleep_time -= 50;
+ } else {
+
+ /* Free lists filled between 5% and 20%, no change */
+ }
+}
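Note that the branch order above matters: the 20% check runs before the 5% check, so fill levels between 5% and 20%, as well as levels between 1% and 5% while the sleep time is already under 50 ms, fall through unchanged. A standalone sketch of the same ladder, with a made-up cap standing in for srv_cleaner_max_lru_time:

    #include <stdio.h>

    #define MAX_LRU_SLEEP_MS 1000 /* stand-in for srv_cleaner_max_lru_time */

    /* Same decision ladder as page_cleaner_adapt_lru_sleep_time(), with
    the free list length and its maximum passed in explicitly. */
    static void adapt_lru_sleep(unsigned long free_len,
                                unsigned long max_free_len,
                                unsigned long* sleep_ms)
    {
            if (free_len < max_free_len / 100) {
                    *sleep_ms = 0;                  /* under 1% full: no sleep */
            } else if (free_len > max_free_len / 5) {
                    *sleep_ms += 50;                /* over 20% full: back off */
                    if (*sleep_ms > MAX_LRU_SLEEP_MS) {
                            *sleep_ms = MAX_LRU_SLEEP_MS;
                    }
            } else if (free_len < max_free_len / 20 && *sleep_ms >= 50) {
                    *sleep_ms -= 50;                /* under 5% full: hurry up */
            }                                       /* otherwise: unchanged */
    }

    int main(void)
    {
            unsigned long sleep_ms = 1000;

            adapt_lru_sleep(30, 4096, &sleep_ms);   /* under 1%: prints 0 */
            printf("%lu\n", sleep_ms);

            adapt_lru_sleep(900, 4096, &sleep_ms);  /* over 20%: prints 50 */
            printf("%lu\n", sleep_ms);
            return 0;
    }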
+
+/*********************************************************************//**
+Get the desired page cleaner thread sleep time for flush list flushes.
+@return desired sleep time */
+__attribute__((warn_unused_result))
+static
+ulint
+page_cleaner_adapt_flush_sleep_time(void)
+/*=====================================*/
+{
+ lsn_t age = log_get_lsn() - log_sys->last_checkpoint_lsn;
+
+ if (age > log_sys->max_modified_age_sync) {
+
+ /* No sleep if in sync preflush zone */
+ return(0);
+ }
+
+ /* In all other cases flush list factors do not influence the page
+ cleaner sleep time */
+ return(srv_cleaner_max_flush_time);
+}
+
+/******************************************************************//**
+page_cleaner thread tasked with flushing dirty pages from the buffer
+pools. As of now we'll have only one instance of this thread.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_flush_page_cleaner_thread)(
+/*==========================================*/
+ void* arg __attribute__((unused)))
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+{
+ ulint next_loop_time = ut_time_ms() + 1000;
+ ulint n_flushed = 0;
+ ulint last_activity = srv_get_activity_count();
+ ulint lru_sleep_time = srv_cleaner_max_lru_time;
+
+ ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(buf_page_cleaner_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+ srv_cleaner_tid = os_thread_get_tid();
+
+ os_thread_set_priority(srv_cleaner_tid, srv_sched_priority_cleaner);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "InnoDB: page_cleaner thread running, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+ buf_page_cleaner_is_active = TRUE;
+
+ while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+
+ ulint flush_sleep_time;
+ ulint page_cleaner_sleep_time;
+
+ srv_current_thread_priority = srv_cleaner_thread_priority;
+
+ /* The page_cleaner skips the sleep if the server is
+ idle, there are no pending read I/Os in the buffer pool,
+ and the previous pass flushed some pages. */
+ if (srv_check_activity(last_activity)
+ || buf_get_n_pending_read_ios()
+ || n_flushed == 0) {
+ page_cleaner_sleep_if_needed(next_loop_time);
+ }
+
+ page_cleaner_adapt_lru_sleep_time(&lru_sleep_time);
+
+ flush_sleep_time = page_cleaner_adapt_flush_sleep_time();
+
+ page_cleaner_sleep_time = ut_min(lru_sleep_time,
+ flush_sleep_time);
+
+ next_loop_time = ut_time_ms() + page_cleaner_sleep_time;
+
+ /* Flush pages from end of LRU if required */
+ n_flushed = buf_flush_LRU_tail();
+
+ if (srv_check_activity(last_activity)) {
+ last_activity = srv_get_activity_count();
+
+ /* Flush pages from flush_list if required */
+ n_flushed += page_cleaner_flush_pages_if_needed();
+ } else {
+ n_flushed = page_cleaner_do_flush_batch(
+ PCT_IO(100),
+ LSN_MAX);
+
+ if (n_flushed) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_COUNT,
+ MONITOR_FLUSH_BACKGROUND_PAGES,
+ n_flushed);
+ }
+ }
+ }
+
+ ut_ad(srv_shutdown_state > 0);
+ if (srv_fast_shutdown == 2) {
+ /* In very fast shutdown we simulate a crash of the
+ buffer pool. We are not required to do any flushing. */
+ goto thread_exit;
+ }
+
+ /* In case of normal and slow shutdown the page_cleaner thread
+ must wait for all other activity in the server to die down.
+ Note that we can start flushing the buffer pool as soon as the
+ server enters the shutdown phase, but we must stay alive long enough
+ to ensure that any work done by the master or purge threads is
+ also flushed.
+ During shutdown we pass through two stages. In the first stage,
+ when SRV_SHUTDOWN_CLEANUP is set, other threads like the master
+ and the purge threads may be working as well. We start flushing
+ the buffer pool but can't be sure that no new pages are being
+ dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE. */
+
+ do {
+ n_flushed = page_cleaner_do_flush_batch(PCT_IO(100), LSN_MAX);
+
+ /* We sleep only if there are no pages to flush */
+ if (n_flushed == 0) {
+ os_thread_sleep(100000);
+ }
+ } while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
+
+ /* At this point all threads including the master and the purge
+ thread must have been suspended. */
+ ut_a(srv_get_active_thread_type() == SRV_NONE);
+ ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
+
+ /* We can now make a final sweep on flushing the buffer pool
+ and exit after we have cleaned the whole buffer pool.
+ It is important that we wait for any running batch that has
+ been triggered by us to finish. Otherwise we can end up
+ considering the end of that batch as the finish of our final
+ sweep and we'll come out of the loop leaving behind dirty pages
+ in the flush_list. */
+ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+ buf_flush_wait_LRU_batch_end();
+
+ bool success;
+
+ do {
+
+ success = buf_flush_list(PCT_IO(100), LSN_MAX, &n_flushed);
+ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+ } while (!success || n_flushed > 0);
+
+ /* Some sanity checks */
+ ut_a(srv_get_active_thread_type() == SRV_NONE);
+ ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool = buf_pool_from_array(i);
+ ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
+ }
+
+ /* We have lived our life. Time to die. */
+
+thread_exit:
+ buf_page_cleaner_is_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+
+/** Functor to validate the flush list. */
+struct Check {
+ void operator()(const buf_page_t* elem)
+ {
+ ut_a(elem->in_flush_list);
+ }
+};
+
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+static
+ibool
+buf_flush_validate_low(
+/*===================*/
+ buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
+{
+ buf_page_t* bpage;
+ const ib_rbt_node_t* rnode = NULL;
+
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+ UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list, Check());
+
+ bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+ /* If we are in recovery mode, i.e. flush_rbt != NULL,
+ then each block in the flush_list must also be present
+ in the flush_rbt. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ rnode = rbt_first(buf_pool->flush_rbt);
+ }
+
+ while (bpage != NULL) {
+ const lsn_t om = bpage->oldest_modification;
+
+ ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
+
+ ut_ad(bpage->in_flush_list);
+
+ /* A page in buf_pool->flush_list can be in
+ BUF_BLOCK_REMOVE_HASH state. This happens when a page
+ is in the middle of being relocated. In that case the
+ original descriptor can have this state and still be
+ in the flush list waiting to acquire the
+ buf_pool->flush_list_mutex to complete the relocation. */
+ ut_a(buf_page_in_file(bpage)
+ || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
+ ut_a(om > 0);
+
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_page_t** prpage;
+
+ ut_a(rnode);
+ prpage = rbt_value(buf_page_t*, rnode);
+
+ ut_a(*prpage);
+ ut_a(*prpage == bpage);
+ rnode = rbt_next(buf_pool->flush_rbt, rnode);
+ }
+
+ bpage = UT_LIST_GET_NEXT(list, bpage);
+
+ ut_a(!bpage || om >= bpage->oldest_modification);
+ }
+
+ /* By this time we must have exhausted the traversal of
+ flush_rbt (if active) as well. */
+ ut_a(rnode == NULL);
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+buf_flush_validate(
+/*===============*/
+ buf_pool_t* buf_pool) /*!< buffer pool instance */
+{
+ ibool ret;
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ ret = buf_flush_validate_low(buf_pool);
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ return(ret);
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Check if there are any dirty pages that belong to a space id in the flush
+list in a particular buffer pool.
+@return number of dirty pages present in a single buffer pool */
+UNIV_INTERN
+ulint
+buf_pool_get_dirty_pages_count(
+/*===========================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool */
+ ulint id) /*!< in: space id to check */
+
+{
+ ulint count = 0;
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ buf_page_t* bpage;
+
+ for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+ bpage != 0;
+ bpage = UT_LIST_GET_NEXT(list, bpage)) {
+
+ ut_ad(buf_page_in_file(bpage)
+ || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
+ ut_ad(bpage->in_flush_list);
+ ut_ad(bpage->oldest_modification > 0);
+
+ if (bpage->space == id) {
+ ++count;
+ }
+ }
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ return(count);
+}
+
+/******************************************************************//**
+Check if there are any dirty pages that belong to a space id in the flush list.
+@return number of dirty pages present in all the buffer pools */
+UNIV_INTERN
+ulint
+buf_flush_get_dirty_pages_count(
+/*============================*/
+ ulint id) /*!< in: space id to check */
+
+{
+ ulint count = 0;
+
+ for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ count += buf_pool_get_dirty_pages_count(buf_pool, id);
+ }
+
+ return(count);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/xtradb/buf/buf0lru.c b/storage/xtradb/buf/buf0lru.cc
index cfb45232084..8a6d042f4c7 100644
--- a/storage/xtradb/buf/buf0lru.c
+++ b/storage/xtradb/buf/buf0lru.cc
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/**************************************************//**
-@file buf/buf0lru.c
+@file buf/buf0lru.cc
The database buffer replacement algorithm
Created 11/5/1995 Heikki Tuuri
@@ -25,6 +25,7 @@ Created 11/5/1995 Heikki Tuuri
#include "buf0lru.h"
+#ifndef UNIV_HOTBACKUP
#ifdef UNIV_NONINL
#include "buf0lru.ic"
#endif
@@ -40,6 +41,7 @@ Created 11/5/1995 Heikki Tuuri
#include "btr0btr.h"
#include "buf0buddy.h"
#include "buf0buf.h"
+#include "buf0dblwr.h"
#include "buf0flu.h"
#include "buf0rea.h"
#include "btr0sea.h"
@@ -49,6 +51,10 @@ Created 11/5/1995 Heikki Tuuri
#include "log0recv.h"
#include "srv0srv.h"
#include "srv0start.h"
+#include "srv0mon.h"
+#include "lock0lock.h"
+
+#include "ha_prototypes.h"
/** The number of blocks from the LRU_old pointer onward, including
the block pointed to, must be buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
@@ -70,7 +76,7 @@ allowed to point to either end of the LRU list. */
/** When dropping the search hash index entries before deleting an ibd
file, we build a local array of pages belonging to that tablespace
in the buffer pool. Following is the size of that array.
-We also release buf_pool->mutex after scanning this many pages of the
+We also release buf_pool->LRU_list_mutex after scanning this many pages of the
flush_list when dropping a table. This is to ensure that other threads
are not blocked for extended period of time when using very large
buffer pools. */
@@ -126,21 +132,25 @@ UNIV_INTERN uint buf_LRU_old_threshold_ms;
/******************************************************************//**
Takes a block out of the LRU list and page hash table.
If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
-the object will be freed and buf_pool->zip_mutex will be released.
-
-If a compressed page or a compressed-only block descriptor is freed,
-other compressed pages or compressed-only block descriptors may be
-relocated.
-@return the new state of the block (BUF_BLOCK_ZIP_FREE if the state
-was BUF_BLOCK_ZIP_PAGE, or BUF_BLOCK_REMOVE_HASH otherwise) */
-static
-enum buf_page_state
-buf_LRU_block_remove_hashed_page(
-/*=============================*/
+the object will be freed.
+
+The caller must hold buf_pool->LRU_list_mutex, the buf_page_get_mutex() mutex
+and the appropriate hash_lock. This function will release the
+buf_page_get_mutex() and the hash_lock.
+
+If a compressed page is freed other compressed pages may be relocated.
+@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
+static __attribute__((nonnull, warn_unused_result))
+bool
+buf_LRU_block_remove_hashed(
+/*========================*/
buf_page_t* bpage, /*!< in: block, must contain a file page and
be in a state where it can be freed; there
may or may not be a hash index to the page */
- ibool zip); /*!< in: TRUE if should remove also the
+ bool zip); /*!< in: true if should remove also the
compressed page of an uncompressed page */
/******************************************************************//**
Puts a file page whose has no hash index to the free list. */
@@ -148,9 +158,8 @@ static
void
buf_LRU_block_free_hashed_page(
/*===========================*/
- buf_block_t* block, /*!< in: block, must contain a file page and
+ buf_block_t* block); /*!< in: block, must contain a file page and
be in a state where it can be freed */
- ibool have_page_hash_mutex);
/******************************************************************//**
Increases LRU size in bytes with zip_size for compressed page,
@@ -162,9 +171,8 @@ incr_LRU_size_in_bytes(
buf_page_t* bpage, /*!< in: control block */
buf_pool_t* buf_pool) /*!< in: buffer pool instance */
{
- ulint zip_size;
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
- zip_size = page_zip_get_size(&bpage->zip);
+ ulint zip_size = page_zip_get_size(&bpage->zip);
buf_pool->stat.LRU_bytes += zip_size ? zip_size : UNIV_PAGE_SIZE;
ut_ad(buf_pool->stat.LRU_bytes <= buf_pool->curr_pool_size);
}
@@ -173,28 +181,19 @@ incr_LRU_size_in_bytes(
Determines if the unzip_LRU list should be used for evicting a victim
instead of the general LRU list.
@return TRUE if should use unzip_LRU */
-UNIV_INLINE
+UNIV_INTERN
ibool
buf_LRU_evict_from_unzip_LRU(
/*=========================*/
- buf_pool_t* buf_pool,
- ibool* have_LRU_mutex)
+ buf_pool_t* buf_pool)
{
ulint io_avg;
ulint unzip_avg;
- //ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
- if (!*have_LRU_mutex) {
- mutex_enter(&buf_pool->LRU_list_mutex);
- *have_LRU_mutex = TRUE;
- }
/* If the unzip_LRU list is empty, we can only use the LRU. */
if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) {
- if (*have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- *have_LRU_mutex = FALSE;
- }
return(FALSE);
}
@@ -203,26 +202,14 @@ buf_LRU_evict_from_unzip_LRU(
decompressed pages in the buffer pool. */
if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)
<= UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
- if (*have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- *have_LRU_mutex = FALSE;
- }
return(FALSE);
}
/* If eviction hasn't started yet, we assume by default
that a workload is disk bound. */
if (buf_pool->freed_page_clock == 0) {
- if (*have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- *have_LRU_mutex = FALSE;
- }
return(TRUE);
}
- if (*have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- *have_LRU_mutex = FALSE;
- }
/* Calculate the average over past intervals, and add the values
of the current interval. */
@@ -266,11 +253,9 @@ buf_LRU_drop_page_hash_batch(
When doing a DROP TABLE/DISCARD TABLESPACE we have to drop all page
hash index entries belonging to that table. This function tries to
do that in batch. Note that this is a 'best effort' attempt and does
-not guarantee that ALL hash entries will be removed.
-
-@return number of hashed pages found*/
+not guarantee that ALL hash entries will be removed. */
static
-ulint
+void
buf_LRU_drop_page_hash_for_tablespace(
/*==================================*/
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
@@ -280,22 +265,18 @@ buf_LRU_drop_page_hash_for_tablespace(
ulint* page_arr;
ulint num_entries;
ulint zip_size;
- ulint num_found = 0;
zip_size = fil_space_get_zip_size(id);
if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
/* Somehow, the tablespace does not exist. Nothing to drop. */
ut_ad(0);
- return num_found;
+ return;
}
- page_arr = ut_malloc(
- sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE);
-
- //buf_pool_mutex_enter(buf_pool);
+ page_arr = static_cast<ulint*>(ut_malloc(
+ sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE));
- ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
mutex_enter(&buf_pool->LRU_list_mutex);
num_entries = 0;
@@ -303,17 +284,12 @@ scan_again:
bpage = UT_LIST_GET_LAST(buf_pool->LRU);
while (bpage != NULL) {
- /* bpage->state,space,io_fix,buf_fix_count are protected by block_mutex at XtraDB */
- mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
buf_page_t* prev_bpage;
ibool is_fixed;
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
- if (UNIV_UNLIKELY(!block_mutex)) {
- goto next_page;
- }
-
ut_a(buf_page_in_file(bpage));
if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE
@@ -322,40 +298,32 @@ scan_again:
/* Compressed pages are never hashed.
Skip blocks of other tablespaces.
Skip I/O-fixed blocks (to be dealt with later). */
- mutex_exit(block_mutex);
next_page:
bpage = prev_bpage;
continue;
}
- //mutex_enter(&((buf_block_t*) bpage)->mutex);
+ mutex_enter(block_mutex);
is_fixed = bpage->buf_fix_count > 0
|| !((buf_block_t*) bpage)->index;
- //mutex_exit(&((buf_block_t*) bpage)->mutex);
+ mutex_exit(block_mutex);
if (is_fixed) {
- mutex_exit(block_mutex);
goto next_page;
}
/* Store the page number so that we can drop the hash
index in a batch later. */
page_arr[num_entries] = bpage->offset;
-
- mutex_exit(block_mutex);
-
ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE);
-
++num_entries;
- ++num_found;
if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) {
goto next_page;
}
- /* Array full. We release the buf_pool->mutex to obey
+ /* Array full. We release the buf_pool->LRU_list_mutex to obey
the latching order. */
- //buf_pool_mutex_exit(buf_pool);
mutex_exit(&buf_pool->LRU_list_mutex);
buf_LRU_drop_page_hash_batch(
@@ -363,11 +331,9 @@ next_page:
num_entries = 0;
- //buf_pool_mutex_enter(buf_pool);
- ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
mutex_enter(&buf_pool->LRU_list_mutex);
- /* Note that we released the buf_pool mutex above
+ /* Note that we released the buf_pool->LRU_list_mutex above
after reading the prev_bpage during processing of a
page_hash_batch (i.e.: when the array was full).
Because prev_bpage could belong to a compressed-only
@@ -381,52 +347,38 @@ next_page:
guarantee that ALL such entries will be dropped. */
/* If, however, bpage has been removed from LRU list
- to the free list then we should restart the scan.
- bpage->state is protected by buf_pool mutex. */
-
- /* obtain block_mutex again to avoid race condition of bpage->state */
- block_mutex = buf_page_get_mutex_enter(bpage);
- if (!block_mutex) {
- goto scan_again;
- }
+ to the free list then we should restart the scan. */
if (bpage
&& buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
- mutex_exit(block_mutex);
goto scan_again;
}
- mutex_exit(block_mutex);
}
- //buf_pool_mutex_exit(buf_pool);
mutex_exit(&buf_pool->LRU_list_mutex);
/* Drop any remaining batch of search hashed pages. */
buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries);
ut_free(page_arr);
-
- return num_found;
}
/******************************************************************//**
While flushing (or removing dirty) pages from a tablespace we don't
-want to hog the CPU and resources. Release the LRU list and block
+want to hog the CPU and resources. Release the buffer pool and block
mutex and try to force a context switch. Then reacquire the same mutexes.
The current page is "fixed" before the release of the mutexes and then
"unfixed" again once we have reacquired the mutexes. */
-static
+static __attribute__((nonnull))
void
buf_flush_yield(
/*============*/
buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
buf_page_t* bpage) /*!< in/out: current page */
{
- mutex_t* block_mutex;
-
- block_mutex = buf_page_get_mutex(bpage);
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
- ut_ad(mutex_own(block_mutex));
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+ ut_ad(mutex_own(block_mutex));
ut_ad(buf_page_in_file(bpage));
/* "Fix" the block so that the position cannot be
@@ -434,40 +386,39 @@ buf_flush_yield(
block mutexes. */
buf_page_set_sticky(bpage);
- /* Now it is safe to release the LRU list mutex. */
+ /* Now it is safe to release the LRU list mutex */
mutex_exit(&buf_pool->LRU_list_mutex);
mutex_exit(block_mutex);
/* Try and force a context switch. */
os_thread_yield();
- ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
mutex_enter(&buf_pool->LRU_list_mutex);
mutex_enter(block_mutex);
/* "Unfix" the block now that we have both the
- LRU list and block mutex again. */
+ buffer pool and block mutex again. */
buf_page_unset_sticky(bpage);
mutex_exit(block_mutex);
}
/******************************************************************//**
-If we have hogged the resources for too long then release the LRU list
-and flush list mutex and do a thread yield. Set the current page to
-"sticky" so that it is not relocated during the yield.
-@return TRUE if yielded */
-static
-ibool
+If we have hogged the resources for too long then release the buffer
+pool and flush list mutex and do a thread yield. Set the current page
+to "sticky" so that it is not relocated during the yield.
+@return true if yielded */
+static __attribute__((nonnull(1), warn_unused_result))
+bool
buf_flush_try_yield(
/*================*/
buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
buf_page_t* bpage, /*!< in/out: bpage to remove */
ulint processed, /*!< in: number of pages processed */
- ibool* must_restart) /*!< in/out: if TRUE, we have to
+ bool* must_restart) /*!< in/out: if true, we have to
restart the flush list scan */
{
/* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
- loop we release buf_pool->mutex to let other threads
+ loop we release buf_pool->LRU_list_mutex to let other threads
do their job but only if the block is not IO fixed. This
ensures that the block stays in its position in the
flush_list. */
@@ -476,7 +427,7 @@ buf_flush_try_yield(
&& processed >= BUF_LRU_DROP_SEARCH_SIZE
&& buf_page_get_io_fix_unlocked(bpage) == BUF_IO_NONE) {
- mutex_t* block_mutex;
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
buf_flush_list_mutex_exit(buf_pool);
@@ -485,30 +436,24 @@ buf_flush_try_yield(
buf_page_get_gen() won't be called for pages from this
tablespace. */
- block_mutex = buf_page_get_mutex_enter(bpage);
- if (UNIV_UNLIKELY(block_mutex == NULL)) {
-
- buf_flush_list_mutex_enter(buf_pool);
-
- *must_restart = TRUE;
- return FALSE;
- }
-
+ mutex_enter(block_mutex);
/* Recheck the I/O fix and the flush list presence now that we
hold the right mutex */
if (UNIV_UNLIKELY(buf_page_get_io_fix(bpage) != BUF_IO_NONE
|| bpage->oldest_modification == 0)) {
mutex_exit(block_mutex);
+
+ *must_restart = true;
+
buf_flush_list_mutex_enter(buf_pool);
- *must_restart = TRUE;
- return FALSE;
+ return false;
}
- *must_restart = FALSE;
+ *must_restart = false;
- /* Release the LRU list and block mutex
+ /* Release the LRU list and buf_page_get_mutex() mutex
to give the other threads a go. */
buf_flush_yield(buf_pool, bpage);
@@ -521,36 +466,35 @@ buf_flush_try_yield(
ut_ad(bpage->in_flush_list);
- return(TRUE);
+ return(true);
}
- return(FALSE);
+ return(false);
}
/******************************************************************//**
Removes a single page from a given tablespace inside a specific
buffer pool instance.
-@return TRUE if page was removed. */
-static
-ibool
+@return true if page was removed. */
+static __attribute__((nonnull, warn_unused_result))
+bool
buf_flush_or_remove_page(
/*=====================*/
buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
buf_page_t* bpage, /*!< in/out: bpage to remove */
- ibool* must_restart) /*!< in/out: if TRUE, must restart the
+ bool flush, /*!< in: flush to disk if true but
+ don't remove else remove without
+ flushing to disk */
+ bool* must_restart) /*!< in/out: if true, must restart the
flush list scan */
{
- mutex_t* block_mutex;
- ibool processed = FALSE;
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
ut_ad(buf_flush_list_mutex_own(buf_pool));
- block_mutex = buf_page_get_mutex(bpage);
-
- /* bpage->space and bpage->io_fix are protected by
- buf_pool->mutex and block_mutex. It is safe to check
- them while holding buf_pool->mutex only. */
+ /* It is safe to check bpage->space and bpage->io_fix while holding
+ buf_pool->LRU_list_mutex only. */
if (UNIV_UNLIKELY(buf_page_get_io_fix_unlocked(bpage)
!= BUF_IO_NONE)) {
@@ -558,50 +502,71 @@ buf_flush_or_remove_page(
/* We cannot remove this page during this scan
yet; maybe the system is currently reading it
in, or flushing the modifications to the file */
+ return(false);
- } else {
-
- /* We have to release the flush_list_mutex to obey the
- latching order. We are not however guaranteed that the page
- will stay in the flush_list. */
+ }
- buf_flush_list_mutex_exit(buf_pool);
+ bool processed = false;
- /* We don't have to worry about bpage becoming a dangling
- pointer by a compressed page flush list relocation because
- buf_page_get_gen() won't be called for pages from this
- tablespace. */
+ buf_flush_list_mutex_exit(buf_pool);
- mutex_enter(block_mutex);
+ /* We don't have to worry about bpage becoming a dangling
+ pointer by a compressed page flush list relocation because
+ buf_page_get_gen() won't be called for pages from this
+ tablespace. */
- /* Recheck the page I/O fix and the flush list presence now
- thatwe hold the right mutex. */
- if (UNIV_UNLIKELY(buf_page_get_io_fix(bpage) != BUF_IO_NONE
- || bpage->oldest_modification == 0)) {
+ mutex_enter(block_mutex);
- /* The page became I/O-fixed or is not on the flush
- list anymore, this invalidates any flush-list-page
- pointers we have. */
- *must_restart = TRUE;
+ /* Recheck the page I/O fix and the flush list presence now
+ that we hold the right mutex. */
+ if (UNIV_UNLIKELY(buf_page_get_io_fix(bpage) != BUF_IO_NONE
+ || bpage->oldest_modification == 0)) {
- } else {
+ /* The page became I/O-fixed or is not on the flush
+ list anymore, this invalidates any flush-list-page
+ pointers we have. */
- ut_ad(bpage->oldest_modification != 0);
+ mutex_exit(block_mutex);
- if (bpage->buf_fix_count == 0) {
+ *must_restart = true;
- buf_flush_remove(bpage);
+ } else if (!flush) {
- processed = TRUE;
- }
- }
+ buf_flush_remove(bpage);
mutex_exit(block_mutex);
- buf_flush_list_mutex_enter(buf_pool);
+ processed = true;
+
+ } else if (buf_flush_ready_for_flush(bpage,
+ BUF_FLUSH_SINGLE_PAGE)) {
+
+ mutex_exit(&buf_pool->LRU_list_mutex);
+
+ /* The following call will release the buf_page_get_mutex()
+ mutex. */
+ buf_flush_page(buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, false);
+ ut_ad(!mutex_own(block_mutex));
+
+ /* Wake possible simulated aio thread to actually
+ post the writes to the operating system */
+ os_aio_simulated_wake_handler_threads();
+
+ mutex_enter(&buf_pool->LRU_list_mutex);
+
+ processed = true;
+ } else {
+ /* Not ready for flush. It can't be IO fixed because we
+ checked for that at the start of the function. It must
+ be buffer fixed. */
+ ut_ad(bpage->buf_fix_count > 0);
+ mutex_exit(block_mutex);
}
+ buf_flush_list_mutex_enter(buf_pool);
+
ut_ad(!mutex_own(block_mutex));
+ ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
return(processed);
}
@@ -611,49 +576,81 @@ Remove all dirty pages belonging to a given tablespace inside a specific
buffer pool instance when we are deleting the data file(s) of that
tablespace. The pages still remain a part of LRU and are evicted from
the list as they age towards the tail of the LRU.
-@return TRUE if all freed. */
-static
-ibool
+@retval DB_SUCCESS if all freed
+@retval DB_FAIL if not all freed
+@retval DB_INTERRUPTED if the transaction was interrupted */
+static __attribute__((nonnull(1), warn_unused_result))
+dberr_t
buf_flush_or_remove_pages(
/*======================*/
buf_pool_t* buf_pool, /*!< buffer pool instance */
- ulint id) /*!< in: target space id for which
+ ulint id, /*!< in: target space id for which
to remove or flush pages */
+ bool flush, /*!< in: flush to disk if true but
+ don't remove else remove without
+ flushing to disk */
+ const trx_t* trx) /*!< to check if the operation must
+ be interrupted, can be 0 */
{
buf_page_t* prev;
buf_page_t* bpage;
ulint processed = 0;
- ibool all_freed = TRUE;
- ibool must_restart = FALSE;
buf_flush_list_mutex_enter(buf_pool);
+rescan:
+ bool must_restart = false;
+ bool all_freed = true;
+
for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
- !must_restart && bpage != NULL;
+ bpage != NULL;
bpage = prev) {
ut_a(buf_page_in_file(bpage));
- ut_ad(bpage->in_flush_list);
/* Save the previous link because once we free the
page we can't rely on the links. */
- prev = UT_LIST_GET_PREV(flush_list, bpage);
+ prev = UT_LIST_GET_PREV(list, bpage);
if (buf_page_get_space(bpage) != id) {
/* Skip this block, as it does not belong to
the target space. */
- } else if (!buf_flush_or_remove_page(buf_pool, bpage,
+ } else if (!buf_flush_or_remove_page(buf_pool, bpage, flush,
&must_restart)) {
/* Remove was unsuccessful, we have to try again
- by scanning the entire list from the end. */
+ by scanning the entire list from the end.
+ This also means that we never released the
+ flush list mutex. Therefore we can trust the prev
+ pointer.
+ buf_flush_or_remove_page() released the
+ flush list mutex but not the LRU list mutex.
+ Therefore it is possible that a new page was
+ added to the flush list. For example, in case
+ where we are at the head of the flush list and
+ prev == NULL. That is OK because we have the
+ tablespace quiesced and no new pages for this
+ space-id should enter flush_list. This is
+ because the only callers of this function are
+ DROP TABLE and FLUSH TABLE FOR EXPORT.
+ We know that we'll have to do at least one more
+ scan but we don't break out of loop here and
+ try to do as much work as we can in this
+ iteration. */
+
+ all_freed = false;
+ } else if (flush) {
+
+ /* The processing was successful. And during the
+ processing we have released all the buf_pool mutexes
+ when calling buf_page_flush(). We cannot trust
+ prev pointer. */
+ goto rescan;
+ } else if (UNIV_UNLIKELY(must_restart)) {
- all_freed = FALSE;
- }
- if (UNIV_UNLIKELY(must_restart)) {
ut_ad(!all_freed);
break;
}
@@ -668,15 +665,27 @@ buf_flush_or_remove_pages(
/* Reset the batch size counter if we had to yield. */
processed = 0;
- } else if (UNIV_UNLIKELY(must_restart)) {
- all_freed = FALSE;
}
+#ifndef DBUG_OFF
+ if (flush) {
+ DBUG_EXECUTE_IF("ib_export_flush_crash",
+ static ulint n_pages;
+ if (++n_pages == 4) {DBUG_SUICIDE();});
+ }
+#endif /* !DBUG_OFF */
+
+ /* The check whether trx is interrupted is expensive; we want
+ to check it only every N iterations. */
+ if (!processed && trx && trx_is_interrupted(trx)) {
+ buf_flush_list_mutex_exit(buf_pool);
+ return(DB_INTERRUPTED);
+ }
}
buf_flush_list_mutex_exit(buf_pool);
- return(all_freed);
+ return(all_freed ? DB_SUCCESS : DB_FAIL);
}
/******************************************************************//**
@@ -684,36 +693,47 @@ Remove or flush all the dirty pages that belong to a given tablespace
inside a specific buffer pool instance. The pages will remain in the LRU
list and will be evicted from the LRU list as they age and move towards
the tail of the LRU list. */
-static
+static __attribute__((nonnull(1)))
void
buf_flush_dirty_pages(
/*==================*/
buf_pool_t* buf_pool, /*!< buffer pool instance */
- ulint id) /*!< in: space id */
+ ulint id, /*!< in: space id */
+ bool flush, /*!< in: flush to disk if true otherwise
+ remove the pages without flushing */
+ const trx_t* trx) /*!< to check if the operation must
+ be interrupted */
{
- ibool all_freed;
+ dberr_t err;
do {
- ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
mutex_enter(&buf_pool->LRU_list_mutex);
- all_freed = buf_flush_or_remove_pages(buf_pool, id);
+ err = buf_flush_or_remove_pages(buf_pool, id, flush, trx);
mutex_exit(&buf_pool->LRU_list_mutex);
ut_ad(buf_flush_validate(buf_pool));
- if (!all_freed) {
- os_thread_sleep(20000);
+ if (err == DB_FAIL) {
+ os_thread_sleep(2000);
}
- } while (!all_freed);
+ /* DB_FAIL is a soft error; it means that the task was not
+ completed and needs to be retried. */
+
+ ut_ad(buf_flush_validate(buf_pool));
+
+ } while (err == DB_FAIL);
+
+ ut_ad(err == DB_INTERRUPTED
+ || buf_pool_get_dirty_pages_count(buf_pool, id) == 0);
}
/******************************************************************//**
Remove all pages that belong to a given tablespace inside a specific
buffer pool instance when we are DISCARDing the tablespace. */
-static
+static __attribute__((nonnull))
void
buf_LRU_remove_all_pages(
/*=====================*/
@@ -724,10 +744,7 @@ buf_LRU_remove_all_pages(
ibool all_freed;
scan_again:
- //buf_pool_mutex_enter(buf_pool);
- ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
mutex_enter(&buf_pool->LRU_list_mutex);
- rw_lock_x_lock(&buf_pool->page_hash_latch);
all_freed = TRUE;
@@ -735,44 +752,52 @@ scan_again:
bpage != NULL;
/* No op */) {
+ prio_rw_lock_t* hash_lock;
buf_page_t* prev_bpage;
- mutex_t* block_mutex;
+ ib_mutex_t* block_mutex = NULL;
ut_a(buf_page_in_file(bpage));
ut_ad(bpage->in_LRU_list);
prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
- block_mutex = buf_page_get_mutex_enter(bpage);
-
- if (!block_mutex) {
- /* It may be impossible case...
- Something wrong, so will be scan_again */
-
- all_freed = FALSE;
- goto next_page;
- }
+ /* It is safe to check bpage->space and bpage->io_fix while
+ holding buf_pool->LRU_list_mutex only and later recheck
+ while holding the buf_page_get_mutex() mutex. */
if (buf_page_get_space(bpage) != id) {
/* Skip this block, as it does not belong to
the space that is being invalidated. */
-
- mutex_exit(block_mutex);
goto next_page;
- } else if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+ } else if (UNIV_UNLIKELY(buf_page_get_io_fix_unlocked(bpage)
+ != BUF_IO_NONE)) {
/* We cannot remove this page during this scan
yet; maybe the system is currently reading it
in, or flushing the modifications to the file */
- mutex_exit(block_mutex);
all_freed = FALSE;
goto next_page;
} else {
+ ulint fold = buf_page_address_fold(
+ bpage->space, bpage->offset);
+
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+ rw_lock_x_lock(hash_lock);
- if (bpage->buf_fix_count > 0) {
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+
+ if (UNIV_UNLIKELY(
+ buf_page_get_space(bpage) != id
+ || bpage->buf_fix_count > 0
+ || (buf_page_get_io_fix(bpage)
+ != BUF_IO_NONE))) {
mutex_exit(block_mutex);
+ rw_lock_x_unlock(hash_lock);
+
/* We cannot remove this page during
this scan yet; maybe the system is
currently reading it in, or flushing
@@ -802,13 +827,14 @@ scan_again:
ulint zip_size;
mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
zip_size = buf_page_get_zip_size(bpage);
page_no = buf_page_get_page_no(bpage);
mutex_exit(block_mutex);
+ rw_lock_x_unlock(hash_lock);
+
/* Note that the following call will acquire
and release block->lock X-latch. */
@@ -819,6 +845,7 @@ scan_again:
}
if (bpage->oldest_modification != 0) {
+
buf_flush_remove(bpage);
}
@@ -826,28 +853,28 @@ scan_again:
/* Remove from the LRU list. */
- if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
- != BUF_BLOCK_ZIP_FREE) {
+ if (buf_LRU_block_remove_hashed(bpage, true)) {
- buf_LRU_block_free_hashed_page((buf_block_t*) bpage, TRUE);
+ mutex_enter(block_mutex);
+ buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
mutex_exit(block_mutex);
-
} else {
- /* The block_mutex should have been released
- by buf_LRU_block_remove_hashed_page() when it
- returns BUF_BLOCK_ZIP_FREE. */
ut_ad(block_mutex == &buf_pool->zip_mutex);
}
ut_ad(!mutex_own(block_mutex));
+#ifdef UNIV_SYNC_DEBUG
+ /* buf_LRU_block_remove_hashed() releases the hash_lock */
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
next_page:
bpage = prev_bpage;
}
-// buf_pool_mutex_exit(buf_pool);
mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
if (!all_freed) {
os_thread_sleep(20000);
@@ -857,17 +884,63 @@ next_page:
}
/******************************************************************//**
-Removes all pages belonging to a given tablespace. */
+Remove pages belonging to a given tablespace inside a specific
+buffer pool instance when we are deleting the data file(s) of that
+tablespace. The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU only if buf_remove
+is BUF_REMOVE_FLUSH_NO_WRITE. */
+static __attribute__((nonnull(1)))
+void
+buf_LRU_remove_pages(
+/*=================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ ulint id, /*!< in: space id */
+ buf_remove_t buf_remove, /*!< in: remove or flush strategy */
+ const trx_t* trx) /*!< to check if the operation must
+ be interrupted */
+{
+ switch (buf_remove) {
+ case BUF_REMOVE_ALL_NO_WRITE:
+ buf_LRU_remove_all_pages(buf_pool, id);
+ break;
+
+ case BUF_REMOVE_FLUSH_NO_WRITE:
+ ut_a(trx == 0);
+ buf_flush_dirty_pages(buf_pool, id, false, NULL);
+ break;
+
+ case BUF_REMOVE_FLUSH_WRITE:
+ ut_a(trx != 0);
+ buf_flush_dirty_pages(buf_pool, id, true, trx);
+ /* Ensure that all asynchronous IO is completed. */
+ os_aio_wait_until_no_pending_writes();
+ fil_flush(id);
+ break;
+ }
+}
+
+/******************************************************************//**
+Flushes all dirty pages or removes all pages belonging
+to a given tablespace. A PROBLEM: if readahead is being started, what
+guarantees that it will not try to read in pages after this operation
+has completed? */
UNIV_INTERN
void
buf_LRU_flush_or_remove_pages(
/*==========================*/
- ulint id, /*!< in: space id */
- enum buf_remove_t buf_remove)/*!< in: remove or flush
- strategy */
+ ulint id, /*!< in: space id */
+ buf_remove_t buf_remove, /*!< in: remove or flush strategy */
+ const trx_t* trx) /*!< to check if the operation must
+ be interrupted */
{
ulint i;
+ /* Before we attempt to drop pages one by one we first
+ attempt to drop page hash index entries in batches to make
+ it more efficient. The batch drop is a best-effort
+ attempt and does not guarantee that all page hash entries
+ will be dropped. We get rid of the remaining page hash
+ entries one by one below. */
for (i = 0; i < srv_buf_pool_instances; i++) {
buf_pool_t* buf_pool;
@@ -875,28 +948,21 @@ buf_LRU_flush_or_remove_pages(
switch (buf_remove) {
case BUF_REMOVE_ALL_NO_WRITE:
- /* A DISCARD tablespace case. Remove AHI entries
- and evict all pages from LRU. */
-
- /* Before we attempt to drop pages hash entries
- one by one we first attempt to drop page hash
- index entries in batches to make it more
- efficient. The batching attempt is a best effort
- attempt and does not guarantee that all pages
- hash entries will be dropped. We get rid of
- remaining page hash entries one by one below. */
buf_LRU_drop_page_hash_for_tablespace(buf_pool, id);
- buf_LRU_remove_all_pages(buf_pool, id);
break;
case BUF_REMOVE_FLUSH_NO_WRITE:
- /* A DROP table case. AHI entries are already
- removed. No need to evict all pages from LRU
- list. Just evict pages from flush list without
- writing. */
- buf_flush_dirty_pages(buf_pool, id);
+ /* It is a DROP TABLE for a single table
+ tablespace. No AHI entries exist because
+ we already dealt with them when freeing up
+ extents. */
+ case BUF_REMOVE_FLUSH_WRITE:
+ /* We allow read-only queries against the
+ table; there is no need to drop the AHI entries. */
break;
}
+
+ buf_LRU_remove_pages(buf_pool, id, buf_remove, trx);
}
}
@@ -912,8 +978,7 @@ buf_LRU_insert_zip_clean(
buf_page_t* b;
buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
- //ut_ad(buf_pool_mutex_own(buf_pool));
- //ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+ ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
ut_ad(mutex_own(&buf_pool->zip_mutex));
ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
@@ -922,17 +987,17 @@ buf_LRU_insert_zip_clean(
b = bpage;
do {
b = UT_LIST_GET_NEXT(LRU, b);
- } while (b && (buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE || !b->in_LRU_list));
+ } while (b && buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE);
/* Insert bpage before b, i.e., after the predecessor of b. */
if (b) {
- b = UT_LIST_GET_PREV(zip_list, b);
+ b = UT_LIST_GET_PREV(list, b);
}
if (b) {
- UT_LIST_INSERT_AFTER(zip_list, buf_pool->zip_clean, b, bpage);
+ UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, bpage);
} else {
- UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, bpage);
+ UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, bpage);
}
}
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
@@ -946,63 +1011,48 @@ ibool
buf_LRU_free_from_unzip_LRU_list(
/*=============================*/
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
- ulint n_iterations, /*!< in: how many times this has
- been called repeatedly without
- result: a high value means that
- we should search farther; we will
- search n_iterations / 5 of the
- unzip_LRU list, or nothing if
- n_iterations >= 5 */
- ibool* have_LRU_mutex)
+ ibool scan_all) /*!< in: scan whole LRU list
+ if TRUE, otherwise scan only
+ srv_LRU_scan_depth / 2 blocks. */
{
buf_block_t* block;
- ulint distance;
-
- //ut_ad(buf_pool_mutex_own(buf_pool));
+ ibool freed;
+ ulint scanned;
- /* Theoratically it should be much easier to find a victim
- from unzip_LRU as we can choose even a dirty block (as we'll
- be evicting only the uncompressed frame). In a very unlikely
- eventuality that we are unable to find a victim from
- unzip_LRU, we fall back to the regular LRU list. We do this
- if we have done five iterations so far. */
-
- if (UNIV_UNLIKELY(n_iterations >= 5)
- || !buf_LRU_evict_from_unzip_LRU(buf_pool, have_LRU_mutex)) {
+ ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+ if (!buf_LRU_evict_from_unzip_LRU(buf_pool)) {
return(FALSE);
}
- distance = 100 + (n_iterations
- * UT_LIST_GET_LEN(buf_pool->unzip_LRU)) / 5;
+ for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU),
+ scanned = 1, freed = FALSE;
+ block != NULL && !freed
+ && (scan_all || scanned < srv_LRU_scan_depth);
+ ++scanned) {
-restart:
- for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
- UNIV_LIKELY(block != NULL) && UNIV_LIKELY(distance > 0);
- block = UT_LIST_GET_PREV(unzip_LRU, block), distance--) {
-
- ibool freed;
+ buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU,
+ block);
mutex_enter(&block->mutex);
- if (!block->in_unzip_LRU_list || !block->page.in_LRU_list
- || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
- mutex_exit(&block->mutex);
- goto restart;
- }
ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
ut_ad(block->in_unzip_LRU_list);
ut_ad(block->page.in_LRU_list);
- freed = buf_LRU_free_block(&block->page, FALSE, have_LRU_mutex);
+ freed = buf_LRU_free_page(&block->page, false);
+
mutex_exit(&block->mutex);
- if (freed) {
- return(TRUE);
- }
+ block = prev_block;
}
- return(FALSE);
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+ scanned);
+ return(freed);
}
/******************************************************************//**
@@ -1012,61 +1062,56 @@ UNIV_INLINE
ibool
buf_LRU_free_from_common_LRU_list(
/*==============================*/
- buf_pool_t* buf_pool,
- ulint n_iterations,
- /*!< in: how many times this has been called
- repeatedly without result: a high value means
- that we should search farther; if
- n_iterations < 10, then we search
- n_iterations / 10 * buf_pool->curr_size
- pages from the end of the LRU list */
- ibool* have_LRU_mutex)
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ibool scan_all) /*!< in: scan whole LRU list
+ if TRUE, otherwise scan only
+ srv_LRU_scan_depth / 2 blocks. */
{
buf_page_t* bpage;
- ulint distance;
-
- //ut_ad(buf_pool_mutex_own(buf_pool));
+ ibool freed;
+ ulint scanned;
- distance = 100 + (n_iterations * buf_pool->curr_size) / 10;
+ ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
-restart:
- for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
- UNIV_LIKELY(bpage != NULL) && UNIV_LIKELY(distance > 0);
- bpage = UT_LIST_GET_PREV(LRU, bpage), distance--) {
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU),
+ scanned = 1, freed = FALSE;
+ bpage != NULL && !freed
+ && (scan_all || scanned < srv_LRU_scan_depth);
+ ++scanned) {
- ibool freed;
unsigned accessed;
- mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
-
- if (!block_mutex) {
- goto restart;
- }
-
- if (!bpage->in_LRU_list
- || !buf_page_in_file(bpage)) {
- mutex_exit(block_mutex);
- goto restart;
- }
+ buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU,
+ bpage);
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
ut_ad(buf_page_in_file(bpage));
ut_ad(bpage->in_LRU_list);
accessed = buf_page_is_accessed(bpage);
- freed = buf_LRU_free_block(bpage, TRUE, have_LRU_mutex);
+
+ mutex_enter(block_mutex);
+
+ freed = buf_LRU_free_page(bpage, true);
+
mutex_exit(block_mutex);
- if (freed) {
+ if (freed && !accessed) {
/* Keep track of pages that are evicted without
ever being accessed. This gives us a measure of
the effectiveness of readahead */
- if (!accessed) {
- ++buf_pool->stat.n_ra_pages_evicted;
- }
- return(TRUE);
+ ++buf_pool->stat.n_ra_pages_evicted;
}
+
+ bpage = prev_bpage;
}
- return(FALSE);
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_SEARCH_SCANNED,
+ MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+ scanned);
+
+ return(freed);
}
/******************************************************************//**
@@ -1074,90 +1119,34 @@ Try to free a replaceable block.
@return TRUE if found and freed */
UNIV_INTERN
ibool
-buf_LRU_search_and_free_block(
-/*==========================*/
- buf_pool_t* buf_pool,
- /*!< in: buffer pool instance */
- ulint n_iterations)
- /*!< in: how many times this has been called
- repeatedly without result: a high value means
- that we should search farther; if
- n_iterations < 10, then we search
- n_iterations / 10 * buf_pool->curr_size
- pages from the end of the LRU list; if
- n_iterations < 5, then we will also search
- n_iterations / 5 of the unzip_LRU list. */
+buf_LRU_scan_and_free_block(
+/*========================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ibool scan_all) /*!< in: scan whole LRU list
+ if TRUE, otherwise scan only
+ 'old' blocks. */
{
ibool freed = FALSE;
- ibool have_LRU_mutex = FALSE;
+ bool use_unzip_list = UT_LIST_GET_LEN(buf_pool->unzip_LRU) > 0;
- //buf_pool_mutex_enter(buf_pool);
- if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)) {
- ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
- mutex_enter(&buf_pool->LRU_list_mutex);
- have_LRU_mutex = TRUE;
- }
-
- freed = buf_LRU_free_from_unzip_LRU_list(buf_pool, n_iterations, &have_LRU_mutex);
+ mutex_enter(&buf_pool->LRU_list_mutex);
- if (!freed) {
- freed = buf_LRU_free_from_common_LRU_list(
- buf_pool, n_iterations, &have_LRU_mutex);
+ if (use_unzip_list) {
+ freed = buf_LRU_free_from_unzip_LRU_list(buf_pool, scan_all);
}
- buf_pool_mutex_enter(buf_pool);
if (!freed) {
- buf_pool->LRU_flush_ended = 0;
- } else if (buf_pool->LRU_flush_ended > 0) {
- buf_pool->LRU_flush_ended--;
+ freed = buf_LRU_free_from_common_LRU_list(buf_pool, scan_all);
}
- buf_pool_mutex_exit(buf_pool);
- if (have_LRU_mutex)
+ if (!freed) {
mutex_exit(&buf_pool->LRU_list_mutex);
+ }
return(freed);
}
/******************************************************************//**
-Tries to remove LRU flushed blocks from the end of the LRU list and put them
-to the free list. This is beneficial for the efficiency of the insert buffer
-operation, as flushed pages from non-unique non-clustered indexes are here
-taken out of the buffer pool, and their inserts redirected to the insert
-buffer. Otherwise, the flushed blocks could get modified again before read
-operations need new buffer blocks, and the i/o work done in flushing would be
-wasted. */
-UNIV_INTERN
-void
-buf_LRU_try_free_flushed_blocks(
-/*============================*/
- buf_pool_t* buf_pool) /*!< in: buffer pool instance */
-{
-
- if (buf_pool == NULL) {
- ulint i;
-
- for (i = 0; i < srv_buf_pool_instances; i++) {
- buf_pool = buf_pool_from_array(i);
- buf_LRU_try_free_flushed_blocks(buf_pool);
- }
- } else {
- buf_pool_mutex_enter(buf_pool);
-
- while (buf_pool->LRU_flush_ended > 0) {
-
- buf_pool_mutex_exit(buf_pool);
-
- buf_LRU_search_and_free_block(buf_pool, 1);
-
- buf_pool_mutex_enter(buf_pool);
- }
-
- buf_pool_mutex_exit(buf_pool);
- }
-}
-
-/******************************************************************//**
Returns TRUE if less than 25 % of the buffer pool in any instance is
available. This can be used in heuristics to prevent huge transactions
eating up the whole buffer pool for their locks.
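
Editorial sketch (not part of the patch): the rewritten buf_LRU_scan_and_free_block() above enters the LRU list mutex but only releases it itself on the not-freed path, because buf_LRU_free_page() (rewritten later in this diff) releases the LRU list mutex as part of a successful free. The standalone C sketch below models that asymmetric convention with a plain pthread mutex; buf_pool, the list walking, and the names free_one_page()/scan_and_free() are illustrative stand-ins, not XtraDB code.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lru_list_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for buf_LRU_free_page(): on success the page is freed and
the LRU list mutex is released before returning, mirroring the contract
documented in the real function. */
static bool
free_one_page(bool can_free)
{
	if (!can_free) {
		/* Failure path: the caller keeps holding the mutex. */
		return(false);
	}
	/* ... unhash the page and move it to the free list ... */
	pthread_mutex_unlock(&lru_list_mutex);
	return(true);
}

/* Stand-in for buf_LRU_scan_and_free_block(): the mutex is released
here only when nothing was freed, because a successful free already
released it inside free_one_page(). */
static bool
scan_and_free(bool can_free)
{
	bool	freed;

	pthread_mutex_lock(&lru_list_mutex);
	freed = free_one_page(can_free);
	if (!freed) {
		pthread_mutex_unlock(&lru_list_mutex);
	}
	return(freed);
}

int main(void)
{
	return(scan_and_free(true) ? 0 : 1);
}
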
@@ -1175,11 +1164,6 @@ buf_LRU_buf_pool_running_out(void)
buf_pool = buf_pool_from_array(i);
- //buf_pool_mutex_enter(buf_pool);
- ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
- mutex_enter(&buf_pool->LRU_list_mutex);
- mutex_enter(&buf_pool->free_list_mutex);
-
if (!recv_recovery_on
&& UT_LIST_GET_LEN(buf_pool->free)
+ UT_LIST_GET_LEN(buf_pool->LRU)
@@ -1187,10 +1171,6 @@ buf_LRU_buf_pool_running_out(void)
ret = TRUE;
}
-
- //buf_pool_mutex_exit(buf_pool);
- mutex_exit(&buf_pool->LRU_list_mutex);
- mutex_exit(&buf_pool->free_list_mutex);
}
return(ret);
@@ -1208,9 +1188,8 @@ buf_LRU_get_free_only(
{
buf_block_t* block;
- //ut_ad(buf_pool_mutex_own(buf_pool));
+ mutex_enter_last(&buf_pool->free_list_mutex);
- mutex_enter(&buf_pool->free_list_mutex);
block = (buf_block_t*) UT_LIST_GET_LAST(buf_pool->free);
if (block) {
@@ -1220,44 +1199,37 @@ buf_LRU_get_free_only(
ut_ad(!block->page.in_flush_list);
ut_ad(!block->page.in_LRU_list);
ut_a(!buf_page_in_file(&block->page));
- UT_LIST_REMOVE(free, buf_pool->free, (&block->page));
+ UT_LIST_REMOVE(list, buf_pool->free, (&block->page));
+ buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE);
mutex_exit(&buf_pool->free_list_mutex);
mutex_enter(&block->mutex);
- buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE);
UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE);
ut_ad(buf_pool_from_block(block) == buf_pool);
mutex_exit(&block->mutex);
- } else {
- mutex_exit(&buf_pool->free_list_mutex);
+ return(block);
}
- return(block);
+ mutex_exit(&buf_pool->free_list_mutex);
+
+ return(NULL);
}
/******************************************************************//**
-Returns a free block from the buf_pool. The block is taken off the
-free list. If it is empty, blocks are moved from the end of the
-LRU list to the free list.
-@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
-UNIV_INTERN
-buf_block_t*
-buf_LRU_get_free_block(
-/*===================*/
- buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */
+Checks how much of buf_pool is occupied by non-data objects like
+AHI, lock heaps etc. Depending on the size of non-data objects this
+function will either assert or issue a warning and switch on the
+status monitor. */
+static
+void
+buf_LRU_check_size_of_non_data_objects(
+/*===================================*/
+ const buf_pool_t* buf_pool) /*!< in: buffer pool instance */
{
- buf_block_t* block = NULL;
- ibool freed;
- ulint n_iterations = 1;
- ibool mon_value_was = FALSE;
- ibool started_monitor = FALSE;
-loop:
- //buf_pool_mutex_enter(buf_pool);
-
if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free)
+ UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) {
ut_print_timestamp(stderr);
@@ -1309,7 +1281,7 @@ loop:
buf_lru_switched_on_innodb_mon = TRUE;
srv_print_innodb_monitor = TRUE;
- os_event_set(srv_lock_timeout_thread_event);
+ os_event_set(lock_sys->timeout_event);
}
} else if (buf_lru_switched_on_innodb_mon) {
@@ -1321,12 +1293,67 @@ loop:
buf_lru_switched_on_innodb_mon = FALSE;
srv_print_innodb_monitor = FALSE;
}
+}
+
+/** The maximum allowed backoff sleep time duration, microseconds */
+#define MAX_FREE_LIST_BACKOFF_SLEEP 10000
+
+/** The sleep reduction factor for high-priority waiter backoff sleeps */
+#define FREE_LIST_BACKOFF_HIGH_PRIO_DIVIDER 100
+
+/** The sleep reduction factor for low-priority waiter backoff sleeps */
+#define FREE_LIST_BACKOFF_LOW_PRIO_DIVIDER 1
+
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If the free list is empty, blocks are moved from the end of the
+LRU list to the free list.
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in the LRU scan
+we put it on the free list to be used.
+* iteration 0:
+ * get a block from free list, success:done
+ * if there is an LRU flush batch in progress:
+ * wait for batch to end: retry free list
+ * if buf_pool->try_LRU_scan is set
+ * scan LRU up to srv_LRU_scan_depth to find a clean block
+ * the above will put the block on free list
+ * success:retry the free list
+ * flush one dirty page from tail of LRU to disk
+ * the above will put the block on free list
+ * success: retry the free list
+* iteration 1:
+ * same as iteration 0 except:
+ * scan whole LRU list
+ * scan LRU list even if buf_pool->try_LRU_scan is not set
+* iteration > 1:
+ * same as iteration 1 but sleep 100ms
+@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_block(
+/*===================*/
+ buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */
+{
+ buf_block_t* block = NULL;
+ ibool freed = FALSE;
+ ulint n_iterations = 0;
+ ulint flush_failures = 0;
+ ibool mon_value_was = FALSE;
+ ibool started_monitor = FALSE;
+
+ ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+
+ MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
+loop:
+ buf_LRU_check_size_of_non_data_objects(buf_pool);
/* If there is a block in the free list, take it */
block = buf_LRU_get_free_only(buf_pool);
- //buf_pool_mutex_exit(buf_pool);
if (block) {
+
ut_ad(buf_pool_from_block(block) == buf_pool);
memset(&block->page.zip, 0, sizeof block->page.zip);
@@ -1337,20 +1364,108 @@ loop:
return(block);
}
- /* If no block was in the free list, search from the end of the LRU
- list and try to free a block there */
+ if (srv_empty_free_list_algorithm == SRV_EMPTY_FREE_LIST_BACKOFF
+ && buf_page_cleaner_is_active
+ && srv_shutdown_state == SRV_SHUTDOWN_NONE) {
- freed = buf_LRU_search_and_free_block(buf_pool, n_iterations);
+ /* Backoff to minimize the free list mutex contention while the
+ free list is empty */
+ ulint priority = srv_current_thread_priority;
+
+ if (n_iterations < 3) {
+
+ os_thread_yield();
+ if (!priority) {
+ os_thread_yield();
+ }
+ } else {
+
+ ulint i, b;
+
+ if (n_iterations < 6) {
+ i = n_iterations - 3;
+ } else if (n_iterations < 8) {
+ i = 4;
+ } else if (n_iterations < 11) {
+ i = 5;
+ } else {
+ i = n_iterations - 5;
+ }
+ b = 1 << i;
+ if (b > MAX_FREE_LIST_BACKOFF_SLEEP) {
+ b = MAX_FREE_LIST_BACKOFF_SLEEP;
+ }
+ os_thread_sleep(b / (priority
+ ? FREE_LIST_BACKOFF_HIGH_PRIO_DIVIDER
+ : FREE_LIST_BACKOFF_LOW_PRIO_DIVIDER));
+ }
+
+ /* In case of backoff, do not ever attempt single page flushes
+ and wait for the cleaner to free some pages instead. */
+
+ n_iterations++;
- if (freed > 0) {
+ goto loop;
+ } else {
+
+		/* The cleaner is not running or the Oracle MySQL 5.6
+		algorithm was requested, so we will perform a single page flush */
+ ut_ad((srv_empty_free_list_algorithm
+ == SRV_EMPTY_FREE_LIST_LEGACY)
+ || !buf_page_cleaner_is_active
+ || (srv_shutdown_state != SRV_SHUTDOWN_NONE));
+ }
+
+ mutex_enter(&buf_pool->flush_state_mutex);
+
+ if (buf_pool->init_flush[BUF_FLUSH_LRU]
+ && srv_use_doublewrite_buf
+ && buf_dblwr != NULL) {
+
+ mutex_exit(&buf_pool->flush_state_mutex);
+
+ /* If there is an LRU flush happening in the background
+ then we wait for it to end instead of trying a single
+ page flush. If, however, we are not using doublewrite
+ buffer then it is better to do our own single page
+ flush instead of waiting for LRU flush to end. */
+ buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
goto loop;
}
- if (n_iterations > 30) {
+ mutex_exit(&buf_pool->flush_state_mutex);
+
+ freed = FALSE;
+ if (buf_pool->try_LRU_scan || n_iterations > 0) {
+
+ /* If no block was in the free list, search from the
+ end of the LRU list and try to free a block there.
+		If we are doing this for the first time we'll scan only
+		the tail of the LRU list; otherwise we scan the whole
+		LRU list. */
+ freed = buf_LRU_scan_and_free_block(buf_pool,
+ n_iterations > 0);
+
+ if (!freed && n_iterations == 0) {
+ /* Tell other threads that there is no point
+ in scanning the LRU list. This flag is set to
+ TRUE again when we flush a batch from this
+ buffer pool. */
+ buf_pool->try_LRU_scan = FALSE;
+ }
+ }
+
+ if (freed) {
+ goto loop;
+
+ }
+
+ if (n_iterations > 20) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Warning: difficult to find free blocks in\n"
- "InnoDB: the buffer pool (%lu search iterations)!"
+ "InnoDB: the buffer pool (%lu search iterations)!\n"
+ "InnoDB: %lu failed attempts to flush a page!"
" Consider\n"
"InnoDB: increasing the buffer pool size.\n"
"InnoDB: It is also possible that"
@@ -1369,6 +1484,7 @@ loop:
"InnoDB: Starting InnoDB Monitor to print further\n"
"InnoDB: diagnostics to the standard output.\n",
(ulong) n_iterations,
+ (ulong) flush_failures,
(ulong) fil_n_pending_log_flushes,
(ulong) fil_n_pending_tablespace_flushes,
(ulong) os_n_file_reads, (ulong) os_n_file_writes,
@@ -1377,35 +1493,35 @@ loop:
mon_value_was = srv_print_innodb_monitor;
started_monitor = TRUE;
srv_print_innodb_monitor = TRUE;
- os_event_set(srv_lock_timeout_thread_event);
+ os_event_set(lock_sys->timeout_event);
}
- /* No free block was found: try to flush the LRU list */
-
- buf_flush_free_margin(buf_pool, TRUE);
- ++srv_buf_pool_wait_free;
-
- os_aio_simulated_wake_handler_threads();
-
- buf_pool_mutex_enter(buf_pool);
-
- if (buf_pool->LRU_flush_ended > 0) {
- /* We have written pages in an LRU flush. To make the insert
- buffer more efficient, we try to move these pages to the free
- list. */
-
- buf_pool_mutex_exit(buf_pool);
+ /* If we have scanned the whole LRU and still are unable to
+ find a free block then we should sleep here to let the
+ page_cleaner do an LRU batch for us.
+	TODO: It'd be better if we could signal the page_cleaner. Perhaps
+	we should use a timed wait for the page_cleaner. */
+ if (n_iterations > 1) {
- buf_LRU_try_free_flushed_blocks(buf_pool);
- } else {
- buf_pool_mutex_exit(buf_pool);
+ os_thread_sleep(100000);
}
- if (n_iterations > 10) {
-
- os_thread_sleep(500000);
+ /* No free block was found: try to flush the LRU list.
+ This call will flush one page from the LRU and put it on the
+ free list. That means that the free block is up for grabs for
+ all user threads.
+ TODO: A more elegant way would have been to return the freed
+ up block to the caller here but the code that deals with
+ removing the block from page_hash and LRU_list is fairly
+	involved (particularly in the case of compressed pages). We
+	can do that in a separate patch sometime in the future. */
+ if (!buf_flush_single_page_from_LRU(buf_pool)) {
+ MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
+ ++flush_failures;
}
+ srv_stats.buf_pool_wait_free.add(n_iterations, 1);
+
n_iterations++;
goto loop;
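
Editorial sketch (not part of the patch): the backoff branch added above sleeps for an exponentially growing interval derived from n_iterations, capped at MAX_FREE_LIST_BACKOFF_SLEEP (10000 microseconds) and divided by 100 for high-priority threads; iterations 0-2 only yield. A minimal self-contained sketch of just that arithmetic, with the constants copied from the hunk above and everything else (yields, mutexes, shutdown checks) omitted; backoff_sleep_us() is an illustrative name, not a function in the patch.

#include <stdio.h>

#define MAX_FREE_LIST_BACKOFF_SLEEP		10000
#define FREE_LIST_BACKOFF_HIGH_PRIO_DIVIDER	100
#define FREE_LIST_BACKOFF_LOW_PRIO_DIVIDER	1

/* Sleep time in microseconds for one backoff iteration; iterations
0..2 only yield in the real code, so this returns 0 for them. */
static unsigned long
backoff_sleep_us(unsigned long n_iterations, int high_priority)
{
	unsigned long	i;
	unsigned long	b;

	if (n_iterations < 3) {
		return(0);	/* os_thread_yield() only */
	} else if (n_iterations < 6) {
		i = n_iterations - 3;	/* 0, 1, 2 */
	} else if (n_iterations < 8) {
		i = 4;
	} else if (n_iterations < 11) {
		i = 5;
	} else {
		i = n_iterations - 5;
	}

	b = 1UL << i;
	if (b > MAX_FREE_LIST_BACKOFF_SLEEP) {
		b = MAX_FREE_LIST_BACKOFF_SLEEP;
	}

	return(b / (high_priority
		    ? FREE_LIST_BACKOFF_HIGH_PRIO_DIVIDER
		    : FREE_LIST_BACKOFF_LOW_PRIO_DIVIDER));
}

int main(void)
{
	/* e.g. iteration 10 -> 32 us, iteration 18 -> 8192 us,
	iteration 19 and beyond -> capped at 10000 us (low priority). */
	printf("%lu %lu %lu\n",
	       backoff_sleep_us(10, 0),
	       backoff_sleep_us(18, 0),
	       backoff_sleep_us(19, 0));
	return(0);
}
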
@@ -1424,7 +1540,6 @@ buf_LRU_old_adjust_len(
ulint new_len;
ut_a(buf_pool->LRU_old);
- //ut_ad(buf_pool_mutex_own(buf_pool));
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
ut_ad(buf_pool->LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN);
ut_ad(buf_pool->LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX);
@@ -1491,7 +1606,6 @@ buf_LRU_old_init(
{
buf_page_t* bpage;
- //ut_ad(buf_pool_mutex_own(buf_pool));
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN);
@@ -1527,14 +1641,13 @@ buf_unzip_LRU_remove_block_if_needed(
ut_ad(buf_pool);
ut_ad(bpage);
ut_ad(buf_page_in_file(bpage));
- //ut_ad(buf_pool_mutex_own(buf_pool));
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
if (buf_page_belongs_to_unzip_LRU(bpage)) {
buf_block_t* block = (buf_block_t*) bpage;
ut_ad(block->in_unzip_LRU_list);
- block->in_unzip_LRU_list = FALSE;
+ ut_d(block->in_unzip_LRU_list = FALSE);
UT_LIST_REMOVE(unzip_LRU, buf_pool->unzip_LRU, block);
}
@@ -1553,7 +1666,6 @@ buf_LRU_remove_block(
ut_ad(buf_pool);
ut_ad(bpage);
- //ut_ad(buf_pool_mutex_own(buf_pool));
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
ut_a(buf_page_in_file(bpage));
@@ -1584,7 +1696,7 @@ buf_LRU_remove_block(
/* Remove the block from the LRU list */
UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage);
- bpage->in_LRU_list = FALSE;
+ ut_d(bpage->in_LRU_list = FALSE);
zip_size = page_zip_get_size(&bpage->zip);
buf_pool->stat.LRU_bytes -= zip_size ? zip_size : UNIV_PAGE_SIZE;
@@ -1634,13 +1746,12 @@ buf_unzip_LRU_add_block(
ut_ad(buf_pool);
ut_ad(block);
- //ut_ad(buf_pool_mutex_own(buf_pool));
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
ut_a(buf_page_belongs_to_unzip_LRU(&block->page));
ut_ad(!block->in_unzip_LRU_list);
- block->in_unzip_LRU_list = TRUE;
+ ut_d(block->in_unzip_LRU_list = TRUE);
if (old) {
UT_LIST_ADD_LAST(unzip_LRU, buf_pool->unzip_LRU, block);
@@ -1664,14 +1775,13 @@ buf_LRU_add_block_to_end_low(
ut_ad(buf_pool);
ut_ad(bpage);
- //ut_ad(buf_pool_mutex_own(buf_pool));
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
ut_a(buf_page_in_file(bpage));
ut_ad(!bpage->in_LRU_list);
UT_LIST_ADD_LAST(LRU, buf_pool->LRU, bpage);
- bpage->in_LRU_list = TRUE;
+ ut_d(bpage->in_LRU_list = TRUE);
incr_LRU_size_in_bytes(bpage, buf_pool);
@@ -1721,7 +1831,6 @@ buf_LRU_add_block_low(
ut_ad(buf_pool);
ut_ad(bpage);
- //ut_ad(buf_pool_mutex_own(buf_pool));
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
ut_a(buf_page_in_file(bpage));
@@ -1747,7 +1856,7 @@ buf_LRU_add_block_low(
buf_pool->LRU_old_len++;
}
- bpage->in_LRU_list = TRUE;
+ ut_d(bpage->in_LRU_list = TRUE);
incr_LRU_size_in_bytes(bpage, buf_pool);
@@ -1806,7 +1915,6 @@ buf_LRU_make_block_young(
{
buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
- //ut_ad(buf_pool_mutex_own(buf_pool));
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
if (bpage->old) {
@@ -1833,32 +1941,36 @@ buf_LRU_make_block_old(
Try to free a block. If bpage is a descriptor of a compressed-only
page, the descriptor object will be freed as well.
-NOTE: If this function returns TRUE, it will temporarily
-release buf_pool->mutex. Furthermore, the page frame will no longer be
-accessible via bpage.
+NOTE: If this function returns true, it will release the LRU list mutex,
+and temporarily release and relock the buf_page_get_mutex() mutex.
+Furthermore, the page frame will no longer be accessible via bpage. If this
+function returns false, the buf_page_get_mutex() might be temporarily released
+and relocked too.
+
+The caller must hold the LRU list and buf_page_get_mutex() mutexes.
-The caller must hold buf_pool->mutex and buf_page_get_mutex(bpage) and
-release these two mutexes after the call. No other
-buf_page_get_mutex() may be held when calling this function.
-@return TRUE if freed, FALSE otherwise. */
+@return true if freed, false otherwise. */
UNIV_INTERN
-ibool
-buf_LRU_free_block(
+bool
+buf_LRU_free_page(
/*===============*/
buf_page_t* bpage, /*!< in: block to be freed */
- ibool zip, /*!< in: TRUE if should remove also the
+ bool zip) /*!< in: true if should remove also the
compressed page of an uncompressed page */
- ibool* have_LRU_mutex)
{
buf_page_t* b = NULL;
buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
- mutex_t* block_mutex = buf_page_get_mutex(bpage);
+ const ulint fold = buf_page_address_fold(bpage->space,
+ bpage->offset);
+ prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
- //ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
ut_ad(mutex_own(block_mutex));
ut_ad(buf_page_in_file(bpage));
- //ut_ad(bpage->in_LRU_list);
- ut_ad(!bpage->in_flush_list == !bpage->oldest_modification);
+ ut_ad(bpage->in_LRU_list);
+
#if UNIV_WORD_SIZE == 4
/* On 32-bit systems, there is no padding in buf_page_t. On
other systems, Valgrind could complain about uninitialized pad
@@ -1866,10 +1978,10 @@ buf_LRU_free_block(
UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
#endif
- if (!bpage->in_LRU_list || !block_mutex || !buf_page_can_relocate(bpage)) {
+ if (!buf_page_can_relocate(bpage)) {
/* Do not free buffer-fixed or I/O-fixed blocks. */
- return(FALSE);
+ return(false);
}
#ifdef UNIV_IBUF_COUNT_DEBUG
@@ -1881,28 +1993,32 @@ buf_LRU_free_block(
/* Do not completely free dirty blocks. */
if (bpage->oldest_modification) {
- return(FALSE);
+ return(false);
}
- } else if (bpage->oldest_modification) {
- /* Do not completely free dirty blocks. */
+ } else if ((bpage->oldest_modification)
+ && (buf_page_get_state(bpage)
+ != BUF_BLOCK_FILE_PAGE)) {
- if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
- ut_ad(buf_page_get_state(bpage)
- == BUF_BLOCK_ZIP_DIRTY);
- return(FALSE);
- }
+ ut_ad(buf_page_get_state(bpage)
+ == BUF_BLOCK_ZIP_DIRTY);
+
+ return(false);
- goto alloc;
} else if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
- /* Allocate the control block for the compressed page.
- If it cannot be allocated (without freeing a block
- from the LRU list), refuse to free bpage. */
-alloc:
b = buf_page_alloc_descriptor();
ut_a(b);
- //memcpy(b, bpage, sizeof *b);
}
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(bpage->in_LRU_list);
+ ut_ad(!bpage->in_flush_list == !bpage->oldest_modification);
+#if UNIV_WORD_SIZE == 4
+ /* On 32-bit systems, there is no padding in buf_page_t. On
+ other systems, Valgrind could complain about uninitialized pad
+ bytes. */
+ UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
+#endif
+
#ifdef UNIV_DEBUG
if (buf_debug_prints) {
fprintf(stderr, "Putting space %lu page %lu to free list\n",
@@ -1911,232 +2027,216 @@ alloc:
}
#endif /* UNIV_DEBUG */
- /* not to break latch order, must re-enter block_mutex */
mutex_exit(block_mutex);
- if (!*have_LRU_mutex) {
- mutex_enter(&buf_pool->LRU_list_mutex); /* optimistic */
- *have_LRU_mutex = TRUE;
- }
- rw_lock_x_lock(&buf_pool->page_hash_latch);
+ rw_lock_x_lock(hash_lock);
mutex_enter(block_mutex);
- /* recheck states of block */
- if (!bpage->in_LRU_list || block_mutex != buf_page_get_mutex(bpage)
- || !buf_page_can_relocate(bpage)) {
+ if (UNIV_UNLIKELY(!buf_page_can_relocate(bpage)
+ || ((zip || !bpage->zip.data)
+ && bpage->oldest_modification))) {
+
not_freed:
+ rw_lock_x_unlock(hash_lock);
if (b) {
buf_page_free_descriptor(b);
}
- if (*have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- *have_LRU_mutex = FALSE;
- }
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
- return(FALSE);
- } else if (zip || !bpage->zip.data) {
- if (bpage->oldest_modification)
- goto not_freed;
- } else if (bpage->oldest_modification) {
- if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
- ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY);
- goto not_freed;
- }
+
+ return(false);
+ } else if (UNIV_UNLIKELY(bpage->oldest_modification
+ && (buf_page_get_state(bpage)
+ != BUF_BLOCK_FILE_PAGE))) {
+
+ ut_ad(buf_page_get_state(bpage)
+ == BUF_BLOCK_ZIP_DIRTY);
+ goto not_freed;
}
if (b) {
memcpy(b, bpage, sizeof *b);
}
- if (buf_LRU_block_remove_hashed_page(bpage, zip)
- != BUF_BLOCK_ZIP_FREE) {
- ut_a(bpage->buf_fix_count == 0);
+ if (!buf_LRU_block_remove_hashed(bpage, zip)) {
+
+ mutex_exit(&buf_pool->LRU_list_mutex);
if (b) {
- buf_page_t* hash_b;
- buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
+ buf_page_free_descriptor(b);
+ }
- const ulint fold = buf_page_address_fold(
- bpage->space, bpage->offset);
+ mutex_enter(block_mutex);
- hash_b = buf_page_hash_get_low(
- buf_pool, bpage->space, bpage->offset, fold);
+ return(true);
+ }
- ut_a(!hash_b);
+#ifdef UNIV_SYNC_DEBUG
+ /* buf_LRU_block_remove_hashed() releases the hash_lock */
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)
+ && !rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* We have just freed a BUF_BLOCK_FILE_PAGE. If b != NULL
+ then it was a compressed page with an uncompressed frame and
+ we are interested in freeing only the uncompressed frame.
+ Therefore we have to reinsert the compressed page descriptor
+ into the LRU and page_hash (and possibly flush_list).
+ if b == NULL then it was a regular page that has been freed */
- while (prev_b && !prev_b->in_LRU_list) {
- prev_b = UT_LIST_GET_PREV(LRU, prev_b);
- }
+ if (b) {
+ buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
- b->state = b->oldest_modification
- ? BUF_BLOCK_ZIP_DIRTY
- : BUF_BLOCK_ZIP_PAGE;
- UNIV_MEM_DESC(b->zip.data,
- page_zip_get_size(&b->zip), b);
-
- /* The fields in_page_hash and in_LRU_list of
- the to-be-freed block descriptor should have
- been cleared in
- buf_LRU_block_remove_hashed_page(), which
- invokes buf_LRU_remove_block(). */
- ut_ad(!bpage->in_page_hash);
- ut_ad(!bpage->in_LRU_list);
- /* bpage->state was BUF_BLOCK_FILE_PAGE because
- b != NULL. The type cast below is thus valid. */
- ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
+ rw_lock_x_lock(hash_lock);
+ mutex_enter(block_mutex);
+
+ ut_a(!buf_page_hash_get_low(buf_pool,
+ bpage->space,
+ bpage->offset,
+ fold));
+
+ b->state = b->oldest_modification
+ ? BUF_BLOCK_ZIP_DIRTY
+ : BUF_BLOCK_ZIP_PAGE;
+ UNIV_MEM_DESC(b->zip.data,
+ page_zip_get_size(&b->zip));
+
+ /* The fields in_page_hash and in_LRU_list of
+ the to-be-freed block descriptor should have
+ been cleared in
+ buf_LRU_block_remove_hashed(), which
+ invokes buf_LRU_remove_block(). */
+ ut_ad(!bpage->in_page_hash);
+ ut_ad(!bpage->in_LRU_list);
+ /* bpage->state was BUF_BLOCK_FILE_PAGE because
+ b != NULL. The type cast below is thus valid. */
+ ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
- /* The fields of bpage were copied to b before
- buf_LRU_block_remove_hashed_page() was invoked. */
- ut_ad(!b->in_zip_hash);
- ut_ad(b->in_page_hash);
- ut_ad(b->in_LRU_list);
+ /* The fields of bpage were copied to b before
+ buf_LRU_block_remove_hashed() was invoked. */
+ ut_ad(!b->in_zip_hash);
+ ut_ad(b->in_page_hash);
+ ut_ad(b->in_LRU_list);
- HASH_INSERT(buf_page_t, hash,
- buf_pool->page_hash, fold, b);
+ HASH_INSERT(buf_page_t, hash,
+ buf_pool->page_hash, fold, b);
- /* Insert b where bpage was in the LRU list. */
- if (UNIV_LIKELY(prev_b != NULL)) {
- ulint lru_len;
+ /* Insert b where bpage was in the LRU list. */
+ if (UNIV_LIKELY(prev_b != NULL)) {
+ ulint lru_len;
- ut_ad(prev_b->in_LRU_list);
- ut_ad(buf_page_in_file(prev_b));
+ ut_ad(prev_b->in_LRU_list);
+ ut_ad(buf_page_in_file(prev_b));
#if UNIV_WORD_SIZE == 4
- /* On 32-bit systems, there is no
- padding in buf_page_t. On other
- systems, Valgrind could complain about
- uninitialized pad bytes. */
- UNIV_MEM_ASSERT_RW(prev_b, sizeof *prev_b);
+ /* On 32-bit systems, there is no
+ padding in buf_page_t. On other
+ systems, Valgrind could complain about
+ uninitialized pad bytes. */
+ UNIV_MEM_ASSERT_RW(prev_b, sizeof *prev_b);
#endif
- UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU,
- prev_b, b);
+ UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU,
+ prev_b, b);
- incr_LRU_size_in_bytes(b, buf_pool);
+ incr_LRU_size_in_bytes(b, buf_pool);
- if (buf_page_is_old(b)) {
- buf_pool->LRU_old_len++;
- if (UNIV_UNLIKELY
- (buf_pool->LRU_old
- == UT_LIST_GET_NEXT(LRU, b))) {
+ if (buf_page_is_old(b)) {
+ buf_pool->LRU_old_len++;
+ if (UNIV_UNLIKELY
+ (buf_pool->LRU_old
+ == UT_LIST_GET_NEXT(LRU, b))) {
- buf_pool->LRU_old = b;
- }
+ buf_pool->LRU_old = b;
}
+ }
- lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
-
- if (lru_len > BUF_LRU_OLD_MIN_LEN) {
- ut_ad(buf_pool->LRU_old);
- /* Adjust the length of the
- old block list if necessary */
- buf_LRU_old_adjust_len(buf_pool);
- } else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
- /* The LRU list is now long
- enough for LRU_old to become
- defined: init it */
- buf_LRU_old_init(buf_pool);
- }
+ lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ if (lru_len > BUF_LRU_OLD_MIN_LEN) {
+ ut_ad(buf_pool->LRU_old);
+ /* Adjust the length of the
+ old block list if necessary */
+ buf_LRU_old_adjust_len(buf_pool);
+ } else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
+ /* The LRU list is now long
+ enough for LRU_old to become
+ defined: init it */
+ buf_LRU_old_init(buf_pool);
+ }
#ifdef UNIV_LRU_DEBUG
- /* Check that the "old" flag is consistent
- in the block and its neighbours. */
- buf_page_set_old(b, buf_page_is_old(b));
+ /* Check that the "old" flag is consistent
+ in the block and its neighbours. */
+ buf_page_set_old(b, buf_page_is_old(b));
#endif /* UNIV_LRU_DEBUG */
- } else {
- b->in_LRU_list = FALSE;
- buf_LRU_add_block_low(b, buf_page_is_old(b));
- }
+ } else {
+ ut_d(b->in_LRU_list = FALSE);
+ buf_LRU_add_block_low(b, buf_page_is_old(b));
+ }
- mutex_enter(&buf_pool->zip_mutex);
- if (b->state == BUF_BLOCK_ZIP_PAGE) {
+ mutex_enter(&buf_pool->zip_mutex);
+ rw_lock_x_unlock(hash_lock);
+ if (b->state == BUF_BLOCK_ZIP_PAGE) {
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
- buf_LRU_insert_zip_clean(b);
+ buf_LRU_insert_zip_clean(b);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
- } else {
- /* Relocate on buf_pool->flush_list. */
- buf_flush_relocate_on_flush_list(bpage, b);
- }
-
- bpage->zip.data = NULL;
- page_zip_set_size(&bpage->zip, 0);
-
- /* Prevent buf_page_get_gen() from
- decompressing the block while we release
- buf_pool->mutex and block_mutex. */
- buf_page_set_sticky(b);
- mutex_exit(&buf_pool->zip_mutex);
+ } else {
+ /* Relocate on buf_pool->flush_list. */
+ buf_flush_relocate_on_flush_list(bpage, b);
}
- //buf_pool_mutex_exit(buf_pool);
- if (*have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- *have_LRU_mutex = FALSE;
- }
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
+ bpage->zip.data = NULL;
+ page_zip_set_size(&bpage->zip, 0);
+
+ /* Prevent buf_page_get_gen() from
+ decompressing the block while we release block_mutex. */
+ buf_page_set_sticky(b);
+ mutex_exit(&buf_pool->zip_mutex);
mutex_exit(block_mutex);
- /* Remove possible adaptive hash index on the page.
- The page was declared uninitialized by
- buf_LRU_block_remove_hashed_page(). We need to flag
- the contents of the page valid (which it still is) in
- order to avoid bogus Valgrind warnings.*/
+ }
- UNIV_MEM_VALID(((buf_block_t*) bpage)->frame,
- UNIV_PAGE_SIZE);
- btr_search_drop_page_hash_index((buf_block_t*) bpage);
- UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame,
- UNIV_PAGE_SIZE);
+ mutex_exit(&buf_pool->LRU_list_mutex);
- if (b) {
- /* Compute and stamp the compressed page
- checksum while not holding any mutex. The
- block is already half-freed
- (BUF_BLOCK_REMOVE_HASH) and removed from
- buf_pool->page_hash, thus inaccessible by any
- other thread. */
-
- mach_write_to_4(
- b->zip.data + FIL_PAGE_SPACE_OR_CHKSUM,
- UNIV_LIKELY(srv_use_checksums)
- ? page_zip_calc_checksum(
- b->zip.data,
- page_zip_get_size(&b->zip))
- : BUF_NO_CHECKSUM_MAGIC);
- }
+ /* Remove possible adaptive hash index on the page.
+ The page was declared uninitialized by
+ buf_LRU_block_remove_hashed(). We need to flag
+ the contents of the page valid (which it still is) in
+ order to avoid bogus Valgrind warnings.*/
- //buf_pool_mutex_enter(buf_pool);
- if (!*have_LRU_mutex) {
- mutex_enter(&buf_pool->LRU_list_mutex);
- *have_LRU_mutex = TRUE;
- }
- mutex_enter(block_mutex);
+ UNIV_MEM_VALID(((buf_block_t*) bpage)->frame,
+ UNIV_PAGE_SIZE);
+ btr_search_drop_page_hash_index((buf_block_t*) bpage);
+ UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame,
+ UNIV_PAGE_SIZE);
- if (b) {
- mutex_enter(&buf_pool->zip_mutex);
- buf_page_unset_sticky(b);
- mutex_exit(&buf_pool->zip_mutex);
- }
+ if (b) {
+ ib_uint32_t checksum;
+ /* Compute and stamp the compressed page
+ checksum while not holding any mutex. The
+ block is already half-freed
+ (BUF_BLOCK_REMOVE_HASH) and removed from
+ buf_pool->page_hash, thus inaccessible by any
+ other thread. */
- buf_LRU_block_free_hashed_page((buf_block_t*) bpage, FALSE);
+ checksum = page_zip_calc_checksum(
+ b->zip.data,
+ page_zip_get_size(&b->zip),
+ static_cast<srv_checksum_algorithm_t>(
+ srv_checksum_algorithm));
- if (*have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- *have_LRU_mutex = FALSE;
- }
+ mach_write_to_4(b->zip.data + FIL_PAGE_SPACE_OR_CHKSUM,
+ checksum);
+ }
- } else {
- /* The block_mutex should have been released by
- buf_LRU_block_remove_hashed_page() when it returns
- BUF_BLOCK_ZIP_FREE. */
- ut_ad(block_mutex == &buf_pool->zip_mutex);
- mutex_enter(block_mutex);
+ mutex_enter(block_mutex);
- if (*have_LRU_mutex) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- *have_LRU_mutex = FALSE;
- }
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
+ if (b) {
+ mutex_enter(&buf_pool->zip_mutex);
+ buf_page_unset_sticky(b);
+ mutex_exit(&buf_pool->zip_mutex);
}
- return(TRUE);
+ buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
+ ut_ad(mutex_own(block_mutex));
+ ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+ return(true);
}
/******************************************************************//**
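
Editorial sketch (not part of the patch): the checksum stamping near the end of buf_LRU_free_page() above recomputes the compressed page checksum once no mutex is held and writes it with mach_write_to_4() at FIL_PAGE_SPACE_OR_CHKSUM, i.e. big-endian into the first four bytes of the page. A minimal sketch of that stamping step, assuming FIL_PAGE_SPACE_OR_CHKSUM is byte offset 0 as in fil0fil.h; the trivial rolling sum below is only a stand-in for page_zip_calc_checksum() and the configured srv_checksum_algorithm, which this sketch does not reproduce.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Byte offset of the checksum field in an InnoDB page header. */
#define FIL_PAGE_SPACE_OR_CHKSUM	0

/* Stand-in checksum over the compressed frame, skipping the checksum
field itself (the real algorithms also skip further header fields). */
static uint32_t
fake_zip_checksum(const unsigned char* data, size_t size)
{
	uint32_t	sum = 0;
	size_t		i;

	for (i = FIL_PAGE_SPACE_OR_CHKSUM + 4; i < size; i++) {
		sum = sum * 31 + data[i];
	}

	return(sum);
}

/* Equivalent of mach_write_to_4(): store 4 bytes most significant first. */
static void
write_to_4(unsigned char* b, uint32_t n)
{
	b[0] = (unsigned char) (n >> 24);
	b[1] = (unsigned char) (n >> 16);
	b[2] = (unsigned char) (n >> 8);
	b[3] = (unsigned char) n;
}

int main(void)
{
	unsigned char	zip_frame[1024];

	memset(zip_frame, 0xAB, sizeof(zip_frame));

	/* Stamp the checksum into the frame, as done above for the
	retained compressed-only descriptor b. */
	write_to_4(zip_frame + FIL_PAGE_SPACE_OR_CHKSUM,
		   fake_zip_checksum(zip_frame, sizeof(zip_frame)));

	return(0);
}
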
@@ -2145,14 +2245,12 @@ UNIV_INTERN
void
buf_LRU_block_free_non_file_page(
/*=============================*/
- buf_block_t* block, /*!< in: block, must not contain a file page */
- ibool have_page_hash_mutex)
+ buf_block_t* block) /*!< in: block, must not contain a file page */
{
void* data;
buf_pool_t* buf_pool = buf_pool_from_block(block);
ut_ad(block);
- //ut_ad(buf_pool_mutex_own(buf_pool));
ut_ad(mutex_own(&block->mutex));
switch (buf_block_get_state(block)) {
@@ -2170,8 +2268,6 @@ buf_LRU_block_free_non_file_page(
ut_ad(!block->page.in_flush_list);
ut_ad(!block->page.in_LRU_list);
- buf_block_set_state(block, BUF_BLOCK_NOT_USED);
-
UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE);
#ifdef UNIV_DEBUG
/* Wipe contents of page to reveal possible stale pointers to it */
@@ -2186,19 +2282,17 @@ buf_LRU_block_free_non_file_page(
if (data) {
block->page.zip.data = NULL;
mutex_exit(&block->mutex);
- //buf_pool_mutex_exit_forbid(buf_pool);
buf_buddy_free(
- buf_pool, data, page_zip_get_size(&block->page.zip),
- have_page_hash_mutex);
+ buf_pool, data, page_zip_get_size(&block->page.zip));
- //buf_pool_mutex_exit_allow(buf_pool);
mutex_enter(&block->mutex);
page_zip_set_size(&block->page.zip, 0);
}
- mutex_enter(&buf_pool->free_list_mutex);
- UT_LIST_ADD_FIRST(free, buf_pool->free, (&block->page));
+ mutex_enter_first(&buf_pool->free_list_mutex);
+ buf_block_set_state(block, BUF_BLOCK_NOT_USED);
+ UT_LIST_ADD_FIRST(list, buf_pool->free, (&block->page));
ut_d(block->page.in_free_list = TRUE);
mutex_exit(&buf_pool->free_list_mutex);
@@ -2208,35 +2302,42 @@ buf_LRU_block_free_non_file_page(
/******************************************************************//**
Takes a block out of the LRU list and page hash table.
If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
-the object will be freed and buf_pool->zip_mutex will be released.
+the object will be freed.
-If a compressed page or a compressed-only block descriptor is freed,
-other compressed pages or compressed-only block descriptors may be
-relocated.
-@return the new state of the block (BUF_BLOCK_ZIP_FREE if the state
-was BUF_BLOCK_ZIP_PAGE, or BUF_BLOCK_REMOVE_HASH otherwise) */
+The caller must hold buf_pool->LRU_list_mutex, the buf_page_get_mutex() mutex
+and the appropriate hash_lock. This function will release the
+buf_page_get_mutex() and the hash_lock.
+
+If a compressed page is freed other compressed pages may be relocated.
+@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
static
-enum buf_page_state
-buf_LRU_block_remove_hashed_page(
-/*=============================*/
+bool
+buf_LRU_block_remove_hashed(
+/*========================*/
buf_page_t* bpage, /*!< in: block, must contain a file page and
be in a state where it can be freed; there
may or may not be a hash index to the page */
- ibool zip) /*!< in: TRUE if should remove also the
+ bool zip) /*!< in: true if should remove also the
compressed page of an uncompressed page */
{
ulint fold;
const buf_page_t* hashed_bpage;
buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ prio_rw_lock_t* hash_lock;
ut_ad(bpage);
- //ut_ad(buf_pool_mutex_own(buf_pool));
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
-#ifdef UNIV_SYNC_DEBUG
- ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_EX));
-#endif
ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ fold = buf_page_address_fold(bpage->space, bpage->offset);
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
ut_a(bpage->buf_fix_count == 0);
@@ -2312,7 +2413,7 @@ buf_LRU_block_remove_hashed_page(
UNIV_MEM_ASSERT_W(bpage->zip.data,
page_zip_get_size(&bpage->zip));
break;
- case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_POOL_WATCH:
case BUF_BLOCK_ZIP_DIRTY:
case BUF_BLOCK_NOT_USED:
case BUF_BLOCK_READY_FOR_USE:
@@ -2322,9 +2423,8 @@ buf_LRU_block_remove_hashed_page(
break;
}
- fold = buf_page_address_fold(bpage->space, bpage->offset);
- hashed_bpage = buf_page_hash_get_low(
- buf_pool, bpage->space, bpage->offset, fold);
+ hashed_bpage = buf_page_hash_get_low(buf_pool, bpage->space,
+ bpage->offset, fold);
if (UNIV_UNLIKELY(bpage != hashed_bpage)) {
fprintf(stderr,
@@ -2344,9 +2444,8 @@ buf_LRU_block_remove_hashed_page(
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
mutex_exit(buf_page_get_mutex(bpage));
- //buf_pool_mutex_exit(buf_pool);
+ rw_lock_x_unlock(hash_lock);
mutex_exit(&buf_pool->LRU_list_mutex);
- rw_lock_x_unlock(&buf_pool->page_hash_latch);
buf_print();
buf_LRU_print();
buf_validate();
@@ -2368,19 +2467,18 @@ buf_LRU_block_remove_hashed_page(
ut_a(buf_page_get_zip_size(bpage));
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
- UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, bpage);
+ UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
mutex_exit(&buf_pool->zip_mutex);
- //buf_pool_mutex_exit_forbid(buf_pool);
+ rw_lock_x_unlock(hash_lock);
buf_buddy_free(
buf_pool, bpage->zip.data,
- page_zip_get_size(&bpage->zip), TRUE);
+ page_zip_get_size(&bpage->zip));
- //buf_pool_mutex_exit_allow(buf_pool);
buf_page_free_descriptor(bpage);
- return(BUF_BLOCK_ZIP_FREE);
+ return(false);
case BUF_BLOCK_FILE_PAGE:
memset(((buf_block_t*) bpage)->frame
@@ -2391,6 +2489,29 @@ buf_LRU_block_remove_hashed_page(
UNIV_PAGE_SIZE);
buf_page_set_state(bpage, BUF_BLOCK_REMOVE_HASH);
+ /* Question: If we release bpage and hash mutex here
+ then what protects us against:
+		1) Some other thread buffer-fixing this page
+		2) Some other thread trying to read this page, not
+		finding it in the buffer pool, and attempting to read
+		it from the disk.
+		Answer:
+		1) Cannot happen because the page is no longer in the
+		page_hash. The only possibility is that while
+		invalidating a tablespace we buffer-fix the prev_page
+		in the LRU to avoid relocation during the scan. But
+		that is not possible because we hold the LRU list mutex.
+
+		2) Not possible because in buf_page_init_for_read()
+		we look up the page_hash while holding the LRU list
+		mutex, and since we hold the LRU list mutex here, by
+		the time we release it in the caller we will have
+		inserted the compressed-only descriptor in the
+		page_hash. */
+ ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+ rw_lock_x_unlock(hash_lock);
+ mutex_exit(&((buf_block_t*) bpage)->mutex);
+
if (zip && bpage->zip.data) {
/* Free the compressed page. */
void* data = bpage->zip.data;
@@ -2399,21 +2520,17 @@ buf_LRU_block_remove_hashed_page(
ut_ad(!bpage->in_free_list);
ut_ad(!bpage->in_flush_list);
ut_ad(!bpage->in_LRU_list);
- mutex_exit(&((buf_block_t*) bpage)->mutex);
- //buf_pool_mutex_exit_forbid(buf_pool);
buf_buddy_free(
buf_pool, data,
- page_zip_get_size(&bpage->zip), TRUE);
+ page_zip_get_size(&bpage->zip));
- //buf_pool_mutex_exit_allow(buf_pool);
- mutex_enter(&((buf_block_t*) bpage)->mutex);
page_zip_set_size(&bpage->zip, 0);
}
- return(BUF_BLOCK_REMOVE_HASH);
+ return(true);
- case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_POOL_WATCH:
case BUF_BLOCK_ZIP_DIRTY:
case BUF_BLOCK_NOT_USED:
case BUF_BLOCK_READY_FOR_USE:
@@ -2423,7 +2540,7 @@ buf_LRU_block_remove_hashed_page(
}
ut_error;
- return(BUF_BLOCK_ZIP_FREE);
+ return(false);
}
/******************************************************************//**
@@ -2432,19 +2549,14 @@ static
void
buf_LRU_block_free_hashed_page(
/*===========================*/
- buf_block_t* block, /*!< in: block, must contain a file page and
+ buf_block_t* block) /*!< in: block, must contain a file page and
be in a state where it can be freed */
- ibool have_page_hash_mutex)
{
-#ifdef UNIV_DEBUG
- //buf_pool_t* buf_pool = buf_pool_from_block(block);
- //ut_ad(buf_pool_mutex_own(buf_pool));
-#endif
ut_ad(mutex_own(&block->mutex));
buf_block_set_state(block, BUF_BLOCK_MEMORY);
- buf_LRU_block_free_non_file_page(block, have_page_hash_mutex);
+ buf_LRU_block_free_non_file_page(block);
}
/******************************************************************//**
@@ -2457,24 +2569,34 @@ buf_LRU_free_one_page(
be in a state where it can be freed; there
may or may not be a hash index to the page */
{
-#ifdef UNIV_DEBUG
+#if defined(UNIV_DEBUG) || defined(UNIV_SYNC_DEBUG)
buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
#endif
- mutex_t* block_mutex = buf_page_get_mutex(bpage);
+#ifdef UNIV_SYNC_DEBUG
+ const ulint fold = buf_page_address_fold(bpage->space,
+ bpage->offset);
+ prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+#endif
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
ut_ad(mutex_own(block_mutex));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif
- if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
- != BUF_BLOCK_ZIP_FREE) {
- buf_LRU_block_free_hashed_page((buf_block_t*) bpage, TRUE);
- } else {
- /* The block_mutex should have been released by
- buf_LRU_block_remove_hashed_page() when it returns
- BUF_BLOCK_ZIP_FREE. */
- ut_ad(block_mutex == &buf_pool->zip_mutex);
+ if (buf_LRU_block_remove_hashed(bpage, true)) {
mutex_enter(block_mutex);
+ buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
+ mutex_exit(block_mutex);
}
+
+ /* buf_LRU_block_remove_hashed() releases hash_lock and block_mutex */
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)
+ && !rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(!mutex_own(block_mutex));
}
/**********************************************************************//**
@@ -2501,8 +2623,6 @@ buf_LRU_old_ratio_update_instance(
}
if (adjust) {
- //buf_pool_mutex_enter(buf_pool);
- ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
mutex_enter(&buf_pool->LRU_list_mutex);
if (ratio != buf_pool->LRU_old_ratio) {
@@ -2515,12 +2635,11 @@ buf_LRU_old_ratio_update_instance(
}
}
- //buf_pool_mutex_exit(buf_pool);
mutex_exit(&buf_pool->LRU_list_mutex);
} else {
buf_pool->LRU_old_ratio = ratio;
}
- /* the reverse of
+ /* the reverse of
ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */
return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5));
}
@@ -2606,369 +2725,6 @@ func_exit:
memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur);
}
-/********************************************************************//**
-Dump the LRU page list to the specific file. */
-#define LRU_DUMP_FILE "ib_lru_dump"
-#define LRU_DUMP_TEMP_FILE "ib_lru_dump.tmp"
-#define LRU_OS_FILE_WRITE() \
- os_file_write(LRU_DUMP_FILE, dump_file, buffer, \
- (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL, \
- (buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)), \
- buffer_size)
-#define LRU_DUMP_PAGE_COUNT 1 /* Specifies how many dump pages
- should be filled for each hold
- of the LRU_list_mutex. */
-
-UNIV_INTERN
-ibool
-buf_LRU_file_dump(void)
-/*===================*/
-{
- os_file_t dump_file = (os_file_t) -1;
- ibool success;
- byte* buffer_base = NULL;
- byte* buffer = NULL;
- const ulint buffer_size = LRU_DUMP_PAGE_COUNT * UNIV_PAGE_SIZE;
- buf_page_t* bpage;
- buf_page_t* first_bpage;
- ulint buffers;
- ulint offset;
- ulint pages_written;
- ulint i;
- ulint total_pages;
-
- /* Sanity test to make sure page size is a multiple of
- assumed dump record size */
- ut_a(UNIV_PAGE_SIZE % 8 == 0);
-
- for (i = 0; i < srv_n_data_files; i++) {
- if (strstr(srv_data_file_names[i], LRU_DUMP_FILE) != NULL) {
- fprintf(stderr,
- " InnoDB: The name '%s' seems to be used for"
- " innodb_data_file_path. Dumping LRU list is"
- " not done for safeness.\n", LRU_DUMP_FILE);
- goto end;
- }
- }
-
- buffer_base = ut_malloc(UNIV_PAGE_SIZE + buffer_size);
- buffer = ut_align(buffer_base, UNIV_PAGE_SIZE);
- if (!buffer) {
- fprintf(stderr,
- " InnoDB: cannot allocate buffer.\n");
- goto end;
- }
-
- dump_file = os_file_create(innodb_file_temp_key, LRU_DUMP_TEMP_FILE,
- OS_FILE_OVERWRITE, OS_FILE_NORMAL, OS_DATA_FILE,
- &success);
-
- if (!success) {
- os_file_get_last_error(TRUE);
- fprintf(stderr,
- " InnoDB: cannot open %s\n", LRU_DUMP_FILE);
- goto end;
- }
-
- buffers = offset = 0;
- for (i = 0; i < srv_buf_pool_instances; i++) {
- buf_pool_t* buf_pool;
-
- buf_pool = buf_pool_from_array(i);
-
- ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
- mutex_enter(&buf_pool->LRU_list_mutex);
- bpage = first_bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
- total_pages = UT_LIST_GET_LEN(buf_pool->LRU);
-
- pages_written = 0;
- while (bpage != NULL && (pages_written++ < total_pages)) {
-
- buf_page_t* next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
-
- if (next_bpage == first_bpage) {
- /* Do not release list mutex here, it will be
- released just outside this while loop */
- fprintf(stderr,
- "InnoDB: detected cycle in LRU for"
- " buffer pool %lu, skipping to next"
- " buffer pool.\n", i);
- break;
- }
-
- mach_write_to_4(buffer + offset * 4, bpage->space);
- offset++;
- mach_write_to_4(buffer + offset * 4, bpage->offset);
- offset++;
-
- ut_a(offset <= buffer_size);
- if (offset == buffer_size/4) {
- mutex_t *next_block_mutex = NULL;
-
- if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- success = FALSE;
- fprintf(stderr,
- " InnoDB: stopped dumping lru"
- " pages because of server"
- " shutdown.\n");
- goto end;
- }
-
- /* While writing file, release buffer pool
- mutex but keep the next page fixed so we
- don't worry about our list iterator becoming
- invalid */
- if (next_bpage) {
- next_block_mutex = buf_page_get_mutex(
- next_bpage);
-
- mutex_enter(next_block_mutex);
- next_bpage->buf_fix_count++;
- mutex_exit(next_block_mutex);
- }
- mutex_exit(&buf_pool->LRU_list_mutex);
-
- success = LRU_OS_FILE_WRITE();
-
- /* Grab this here so that next_bpage can't
- be purged when we drop the fix_count */
- ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
- mutex_enter(&buf_pool->LRU_list_mutex);
-
- if (next_bpage) {
- mutex_enter(next_block_mutex);
- next_bpage->buf_fix_count--;
- mutex_exit(next_block_mutex);
- }
-
- if (!success) {
- mutex_exit(&buf_pool->LRU_list_mutex);
- fprintf(stderr,
- " InnoDB: cannot write page"
- " %lu of %s\n",
- buffers, LRU_DUMP_FILE);
- goto end;
- }
- buffers++;
- offset = 0;
-
- bpage = next_bpage;
- } else {
- bpage = UT_LIST_GET_NEXT(LRU, bpage);
- }
- } /* while(bpage ...) */
- mutex_exit(&buf_pool->LRU_list_mutex);
- } /* for(srv_buf_pool_instances ...) */
-
- mach_write_to_4(buffer + offset * 4, 0xFFFFFFFFUL);
- offset++;
- mach_write_to_4(buffer + offset * 4, 0xFFFFFFFFUL);
- offset++;
-
- success = LRU_OS_FILE_WRITE();
-end:
- if (dump_file != (os_file_t) -1) {
- if (success) {
- success = os_file_flush(dump_file, TRUE);
- }
- os_file_close(dump_file);
- }
- if (success) {
- success = os_file_rename(innodb_file_temp_key,
- LRU_DUMP_TEMP_FILE, LRU_DUMP_FILE);
- }
- if (buffer_base)
- ut_free(buffer_base);
-
- return(success);
-}
-
-typedef struct {
- ib_uint32_t space_id;
- ib_uint32_t page_no;
-} dump_record_t;
-
-static int dump_record_cmp(const void *a, const void *b)
-{
- const dump_record_t *rec1 = (dump_record_t *) a;
- const dump_record_t *rec2 = (dump_record_t *) b;
-
- if (rec1->space_id < rec2->space_id)
- return -1;
- if (rec1->space_id > rec2->space_id)
- return 1;
- if (rec1->page_no < rec2->page_no)
- return -1;
- return rec1->page_no > rec2->page_no;
-}
-
-/********************************************************************//**
-Read the pages based on the specific file.*/
-UNIV_INTERN
-ibool
-buf_LRU_file_restore(void)
-/*======================*/
-{
- os_file_t dump_file = (os_file_t) -1;
- ibool success;
- byte* buffer_base = NULL;
- byte* buffer = NULL;
- ulint buffers;
- ulint offset;
- ulint reads = 0;
- ulint req = 0;
- ibool terminated = FALSE;
- ibool ret = FALSE;
- dump_record_t* records = NULL;
- ulint size;
- ulint size_high;
- ulint recsize = sizeof(dump_record_t);
- ulint length;
-
- dump_file = os_file_create_simple_no_error_handling(innodb_file_temp_key,
- LRU_DUMP_FILE, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
- if (!success || !os_file_get_size(dump_file, &size, &size_high)) {
- os_file_get_last_error(TRUE);
- fprintf(stderr,
- " InnoDB: cannot open %s, "
- " buffer pool preload not done.\n", LRU_DUMP_FILE);
- goto end;
- }
-
- if (size == 0 || size_high > 0 || size % recsize) {
- fprintf(stderr, " InnoDB: broken LRU dump file,"
- " buffer pool preload not done\n");
- goto end;
- }
-
- ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: Restoring buffer pool pages from %s\n",
- LRU_DUMP_FILE);
-
- if (size == 0 || size_high > 0 || size % 8) {
- fprintf(stderr, " InnoDB: broken LRU dump file\n");
- goto end;
- }
- buffer_base = ut_malloc(2 * UNIV_PAGE_SIZE);
- buffer = ut_align(buffer_base, UNIV_PAGE_SIZE);
- records = ut_malloc(size);
- if (!buffer || !records) {
- fprintf(stderr,
- " InnoDB: cannot allocate buffer.\n");
- goto end;
- }
-
- buffers = 0;
- length = 0;
- while (!terminated) {
- success = os_file_read(dump_file, buffer,
- (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL,
- (buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)),
- UNIV_PAGE_SIZE);
- if (!success) {
- fprintf(stderr,
- " InnoDB: either could not read page %lu of %s,"
- " or terminated unexpectedly.\n",
- buffers, LRU_DUMP_FILE);
- goto end;
- }
-
- for (offset = 0; offset < UNIV_PAGE_SIZE/4; offset += 2) {
- ulint space_id;
- ulint page_no;
-
- space_id = mach_read_from_4(buffer + offset * 4);
- page_no = mach_read_from_4(buffer + (offset + 1) * 4);
- if (space_id == 0xFFFFFFFFUL
- || page_no == 0xFFFFFFFFUL) {
- terminated = TRUE;
- break;
- }
-
- records[length].space_id = space_id;
- records[length].page_no = page_no;
- length++;
- if (length * 8 >= size) {
- fprintf(stderr,
- " InnoDB: could not find the "
- "end-of-file marker after reading "
- "the expected %lu bytes from the "
- "LRU dump file.\n"
- " InnoDB: this could be caused by a "
- "broken or incomplete file.\n"
- " InnoDB: trying to process what has "
- "been read so far.\n",
- size);
- terminated= TRUE;
- break;
- }
- }
- buffers++;
- }
-
- qsort(records, length, sizeof(dump_record_t), dump_record_cmp);
-
- for (offset = 0; offset < length; offset++) {
- ulint space_id;
- ulint page_no;
- ulint zip_size;
- ulint err;
- ib_int64_t tablespace_version;
-
- space_id = records[offset].space_id;
- page_no = records[offset].page_no;
-
- if (offset % 16 == 15) {
- os_aio_simulated_wake_handler_threads();
- buf_flush_free_margins(FALSE);
- /* skip loading of the rest of the file if we are
- terminating anyway */
- if(srv_shutdown_state != SRV_SHUTDOWN_NONE) {
- fprintf(stderr,
- " InnoDB: stopped loading lru pages"
- " because of server shutdown\n");
- break;
- }
- }
-
- zip_size = fil_space_get_zip_size(space_id);
- if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
- continue;
- }
-
- if (fil_is_exist(space_id, page_no)) {
-
- tablespace_version = fil_space_get_version(space_id);
-
- req++;
- reads += buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
- | OS_AIO_SIMULATED_WAKE_LATER,
- space_id, zip_size, TRUE,
- tablespace_version, page_no, NULL);
- buf_LRU_stat_inc_io();
- }
- }
-
- os_aio_simulated_wake_handler_threads();
- buf_flush_free_margins(FALSE);
-
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: Completed reading buffer pool pages"
- " (requested: %lu, read: %lu)\n", req, reads);
- ret = TRUE;
-end:
- if (dump_file != (os_file_t) -1)
- os_file_close(dump_file);
- if (buffer_base)
- ut_free(buffer_base);
- if (records)
- ut_free(records);
-
- return(ret);
-}
-
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/**********************************************************************//**
Validates the LRU list for one buffer pool instance. */
@@ -2984,8 +2740,6 @@ buf_LRU_validate_instance(
ulint new_len;
ut_ad(buf_pool);
- //buf_pool_mutex_enter(buf_pool);
- ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
mutex_enter(&buf_pool->LRU_list_mutex);
if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) {
@@ -3002,17 +2756,16 @@ buf_LRU_validate_instance(
ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE);
}
- UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU,
- ut_ad(ut_list_node_313->in_LRU_list));
-
- bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+ UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU, CheckInLRUList());
old_len = 0;
- while (bpage != NULL) {
+ for (bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
switch (buf_page_get_state(bpage)) {
- case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_POOL_WATCH:
case BUF_BLOCK_NOT_USED:
case BUF_BLOCK_READY_FOR_USE:
case BUF_BLOCK_MEMORY:
@@ -3041,32 +2794,30 @@ buf_LRU_validate_instance(
ut_a(!next || buf_page_is_old(next));
}
-
- bpage = UT_LIST_GET_NEXT(LRU, bpage);
}
ut_a(buf_pool->LRU_old_len == old_len);
mutex_exit(&buf_pool->LRU_list_mutex);
+
mutex_enter(&buf_pool->free_list_mutex);
- UT_LIST_VALIDATE(free, buf_page_t, buf_pool->free,
- ut_ad(ut_list_node_313->in_free_list));
+ UT_LIST_VALIDATE(list, buf_page_t, buf_pool->free, CheckInFreeList());
for (bpage = UT_LIST_GET_FIRST(buf_pool->free);
bpage != NULL;
- bpage = UT_LIST_GET_NEXT(free, bpage)) {
+ bpage = UT_LIST_GET_NEXT(list, bpage)) {
ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED);
}
mutex_exit(&buf_pool->free_list_mutex);
- ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+
mutex_enter(&buf_pool->LRU_list_mutex);
- UT_LIST_VALIDATE(unzip_LRU, buf_block_t, buf_pool->unzip_LRU,
- ut_ad(ut_list_node_313->in_unzip_LRU_list
- && ut_list_node_313->page.in_LRU_list));
+ UT_LIST_VALIDATE(
+ unzip_LRU, buf_block_t, buf_pool->unzip_LRU,
+ CheckUnzipLRUAndLRUList());
for (block = UT_LIST_GET_FIRST(buf_pool->unzip_LRU);
block;
@@ -3077,7 +2828,6 @@ buf_LRU_validate_instance(
ut_a(buf_page_belongs_to_unzip_LRU(&block->page));
}
- //buf_pool_mutex_exit(buf_pool);
mutex_exit(&buf_pool->LRU_list_mutex);
}
@@ -3114,8 +2864,6 @@ buf_LRU_print_instance(
const buf_page_t* bpage;
ut_ad(buf_pool);
- //buf_pool_mutex_enter(buf_pool);
- ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
mutex_enter(&buf_pool->LRU_list_mutex);
bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
@@ -3173,7 +2921,6 @@ buf_LRU_print_instance(
bpage = UT_LIST_GET_NEXT(LRU, bpage);
}
- //buf_pool_mutex_exit(buf_pool);
mutex_exit(&buf_pool->LRU_list_mutex);
}
@@ -3193,3 +2940,4 @@ buf_LRU_print(void)
}
}
#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/buf/buf0rea.c b/storage/xtradb/buf/buf0rea.cc
index 44db27cf943..6e348bbf004 100644
--- a/storage/xtradb/buf/buf0rea.c
+++ b/storage/xtradb/buf/buf0rea.cc
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/**************************************************//**
-@file buf/buf0rea.c
+@file buf/buf0rea.cc
The database buffer read
Created 11/5/1995 Heikki Tuuri
@@ -31,6 +31,7 @@ Created 11/5/1995 Heikki Tuuri
#include "buf0buf.h"
#include "buf0flu.h"
#include "buf0lru.h"
+#include "buf0dblwr.h"
#include "ibuf0ibuf.h"
#include "log0recv.h"
#include "trx0sys.h"
@@ -60,12 +61,17 @@ buf_read_page_handle_error(
buf_page_t* bpage) /*!< in: pointer to the block */
{
buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
- const ibool uncompressed = (buf_page_get_state(bpage)
+ const bool uncompressed = (buf_page_get_state(bpage)
== BUF_BLOCK_FILE_PAGE);
+ const ulint fold = buf_page_address_fold(bpage->space,
+ bpage->offset);
+ prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
- /* First unfix and release lock on the bpage */
mutex_enter(&buf_pool->LRU_list_mutex);
+ rw_lock_x_lock(hash_lock);
mutex_enter(buf_page_get_mutex(bpage));
+
+ /* First unfix and release lock on the bpage */
ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
ut_ad(bpage->buf_fix_count == 0);
@@ -81,11 +87,10 @@ buf_read_page_handle_error(
/* remove the block from LRU list */
buf_LRU_free_one_page(bpage);
- ut_ad(buf_pool->n_pend_reads > 0);
- buf_pool->n_pend_reads--;
-
- mutex_exit(buf_page_get_mutex(bpage));
mutex_exit(&buf_pool->LRU_list_mutex);
+
+ ut_ad(buf_pool->n_pend_reads > 0);
+ os_atomic_decrement_ulint(&buf_pool->n_pend_reads, 1);
}
/********************************************************************//**
@@ -96,16 +101,16 @@ flag is cleared and the x-lock released by an i/o-handler thread.
@return 1 if a read request was queued, 0 if the page already resided
in buf_pool, or if the page is in the doublewrite buffer blocks in
which case it is never read into the pool, or if the tablespace does
-not exist or is being dropped
+not exist or is being dropped
@return 1 if read request is issued. 0 if it is not */
UNIV_INTERN
ulint
buf_read_page_low(
/*==============*/
- ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
+ dberr_t* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
trying to read from a non-existent tablespace, or a
tablespace which is just now being dropped */
- ibool sync, /*!< in: TRUE if synchronous aio is desired */
+ bool sync, /*!< in: true if synchronous aio is desired */
ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
at read-ahead functions) */
@@ -122,21 +127,17 @@ buf_read_page_low(
{
buf_page_t* bpage;
ulint wake_later;
+ ibool ignore_nonexistent_pages;
*err = DB_SUCCESS;
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
- if (trx_doublewrite
- && (space == TRX_SYS_SPACE
- || (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE))
- && ( (offset >= trx_doublewrite->block1
- && offset < trx_doublewrite->block1
- + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
- || (offset >= trx_doublewrite->block2
- && offset < trx_doublewrite->block2
- + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
+ ignore_nonexistent_pages = mode & BUF_READ_IGNORE_NONEXISTENT_PAGES;
+ mode &= ~BUF_READ_IGNORE_NONEXISTENT_PAGES;
+
+ if (space == TRX_SYS_SPACE && buf_dblwr_page_inside(offset)) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Warning: trying to read"
@@ -155,7 +156,7 @@ buf_read_page_low(
 	synchronous i/o, to make sure they do not get involved in
thread deadlocks. */
- sync = TRUE;
+ sync = true;
}
/* The following call will also check if the tablespace does not exist
@@ -178,7 +179,7 @@ buf_read_page_low(
}
/* recv_get_fil_addr_struct() */
- recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
+ recv_addr = (recv_addr_t*)HASH_GET_FIRST(recv_sys->addr_hash,
hash_calc_hash(ut_fold_ulint_pair(space, offset),
recv_sys->addr_hash));
while (recv_addr) {
@@ -186,7 +187,7 @@ buf_read_page_low(
&& (recv_addr->page_no == offset)) {
break;
}
- recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+ recv_addr = (recv_addr_t*)HASH_GET_NEXT(addr_hash, recv_addr);
}
if ((recv_addr == NULL)
@@ -212,26 +213,28 @@ not_to_recover:
#ifdef UNIV_DEBUG
if (buf_debug_prints) {
fprintf(stderr,
- "Posting read request for page %lu, sync %lu\n",
- (ulong) offset,
- (ulong) sync);
+ "Posting read request for page %lu, sync %s\n",
+ (ulong) offset, sync ? "true" : "false");
}
#endif
ut_ad(buf_page_in_file(bpage));
+ ut_ad(!mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex));
if (sync) {
thd_wait_begin(NULL, THD_WAIT_DISKIO);
}
if (zip_size) {
- *err = _fil_io(OS_FILE_READ | wake_later,
- sync, space, zip_size, offset, 0, zip_size,
- bpage->zip.data, bpage, trx);
+ *err = _fil_io(OS_FILE_READ | wake_later
+ | ignore_nonexistent_pages,
+ sync, space, zip_size, offset, 0, zip_size,
+ bpage->zip.data, bpage, trx);
} else {
ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
- *err = _fil_io(OS_FILE_READ | wake_later,
+ *err = _fil_io(OS_FILE_READ | wake_later
+ | ignore_nonexistent_pages,
sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
((buf_block_t*) bpage)->frame, bpage, trx);
}
@@ -240,13 +243,15 @@ not_to_recover:
thd_wait_end(NULL);
}
- if (*err == DB_TABLESPACE_DELETED) {
- buf_read_page_handle_error(bpage);
- return(0);
+ if (*err != DB_SUCCESS) {
+ if (ignore_nonexistent_pages || *err == DB_TABLESPACE_DELETED) {
+ buf_read_page_handle_error(bpage);
+ return(0);
+ }
+ SRV_CORRUPT_TABLE_CHECK(*err == DB_SUCCESS,
+ bpage->is_corrupt = TRUE;);
}
- SRV_CORRUPT_TABLE_CHECK(*err == DB_SUCCESS, bpage->is_corrupt = TRUE;);
-
if (sync) {
/* The i/o is already completed when we arrive from
fil_read */
@@ -291,7 +296,7 @@ buf_read_ahead_random(
ulint ibuf_mode;
ulint count;
ulint low, high;
- ulint err;
+ dberr_t err;
ulint i;
const ulint buf_read_ahead_random_area
= BUF_READ_AHEAD_AREA(buf_pool);
@@ -331,11 +336,8 @@ buf_read_ahead_random(
high = fil_space_get_size(space);
}
- buf_pool_mutex_enter(buf_pool);
-
if (buf_pool->n_pend_reads
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
- buf_pool_mutex_exit(buf_pool);
return(0);
}
@@ -344,8 +346,12 @@ buf_read_ahead_random(
that is, reside near the start of the LRU list. */
for (i = low; i < high; i++) {
+
+ prio_rw_lock_t* hash_lock;
+
const buf_page_t* bpage =
- buf_page_hash_get(buf_pool, space, i);
+ buf_page_hash_get_s_locked(buf_pool, space, i,
+ &hash_lock);
if (bpage
&& buf_page_is_accessed(bpage)
@@ -356,13 +362,16 @@ buf_read_ahead_random(
if (recent_blocks
>= BUF_READ_AHEAD_RANDOM_THRESHOLD(buf_pool)) {
- buf_pool_mutex_exit(buf_pool);
+ rw_lock_s_unlock(hash_lock);
goto read_ahead;
}
}
+
+ if (bpage) {
+ rw_lock_s_unlock(hash_lock);
+ }
}
- buf_pool_mutex_exit(buf_pool);
/* Do nothing */
return(0);
@@ -383,7 +392,7 @@ read_ahead:
if (!ibuf_bitmap_page(zip_size, i)) {
count += buf_read_page_low(
- &err, FALSE,
+ &err, false,
ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
space, zip_size, FALSE,
tablespace_version, i, trx);
@@ -420,7 +429,7 @@ read_ahead:
buf_LRU_stat_inc_io();
buf_pool->stat.n_ra_pages_read_rnd += count;
- srv_buf_pool_reads += count;
+ srv_stats.buf_pool_reads.add(count);
return(count);
}
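/* The hunks above drop the global buffer pool mutex from
   buf_read_ahead_random() in favour of per-page hash S-latches, but the
   heuristic itself is unchanged: count how many pages of the aligned
   area were accessed recently and trigger read-ahead of the whole area
   once a threshold is reached.  A minimal standalone model of that
   decision follows; is_recently_accessed() and its dummy data are
   invented for the example. */
#include <stdio.h>

typedef unsigned long ulint;

/* Stand-in for the real check that a page is in the pool and was
accessed recently (i.e. sits near the head of the LRU list). */
static int
is_recently_accessed(ulint page_no)
{
	return(page_no % 2 == 0);	/* dummy data for the example */
}

static int
should_read_ahead(ulint low, ulint high, ulint threshold)
{
	ulint	recent = 0;
	ulint	i;

	for (i = low; i < high; i++) {
		if (is_recently_accessed(i) && ++recent >= threshold) {
			return(1);	/* read the whole area */
		}
	}

	return(0);
}

int
main(void)
{
	printf("read ahead: %d\n", should_read_ahead(64, 128, 13));
	return(0);
}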
@@ -439,20 +448,19 @@ buf_read_page(
ulint offset, /*!< in: page number */
trx_t* trx)
{
- buf_pool_t* buf_pool = buf_pool_get(space, offset);
ib_int64_t tablespace_version;
ulint count;
- ulint err;
+ dberr_t err;
tablespace_version = fil_space_get_version(space);
/* We do the i/o in the synchronous aio mode to save thread
switches: hence TRUE */
- count = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
+ count = buf_read_page_low(&err, true, BUF_READ_ANY_PAGE, space,
zip_size, FALSE,
tablespace_version, offset, trx);
- srv_buf_pool_reads += count;
+ srv_stats.buf_pool_reads.add(count);
if (err == DB_TABLESPACE_DELETED) {
ut_print_timestamp(stderr);
fprintf(stderr,
@@ -463,9 +471,6 @@ buf_read_page(
(ulong) space, (ulong) offset);
}
- /* Flush pages from the end of the LRU list if necessary */
- buf_flush_free_margin(buf_pool, TRUE);
-
/* Increment number of I/O operations used for LRU policy. */
buf_LRU_stat_inc_io();
@@ -473,6 +478,49 @@ buf_read_page(
}
/********************************************************************//**
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@return TRUE if page has been read in, FALSE in case of failure */
+UNIV_INTERN
+ibool
+buf_read_page_async(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ ulint zip_size;
+ ib_int64_t tablespace_version;
+ ulint count;
+ dberr_t err;
+
+ zip_size = fil_space_get_zip_size(space);
+
+ if (zip_size == ULINT_UNDEFINED) {
+ return(FALSE);
+ }
+
+ tablespace_version = fil_space_get_version(space);
+
+ count = buf_read_page_low(&err, true, BUF_READ_ANY_PAGE
+ | OS_AIO_SIMULATED_WAKE_LATER
+ | BUF_READ_IGNORE_NONEXISTENT_PAGES,
+ space, zip_size, FALSE,
+ tablespace_version, offset, NULL);
+ srv_stats.buf_pool_reads.add(count);
+
+ /* We do not increment the number of I/O operations used for the LRU
+ policy here (buf_LRU_stat_inc_io()). That statistic feeds the
+ heuristics for evicting the uncompressed copies of compressed pages
+ from the buffer pool. Since this function is called from buffer pool
+ load, these I/Os are deliberate and not part of the normal workload,
+ so we can ignore them in the heuristics. */
+
+ return(count > 0);
+}
+
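/* buf_read_page_async() is added for the buffer pool load path, which
   replays a previously dumped list of (space id, page number) pairs.
   The standalone sketch below models such a caller; the stub and the
   dump array are assumptions, and only the call pattern is the point. */
#include <stdio.h>

typedef unsigned long ulint;
typedef int ibool;

/* Stub standing in for the real buf_read_page_async(): queue an
asynchronous read and report whether a request was issued. */
static ibool
buf_read_page_async_stub(ulint space, ulint offset)
{
	printf("queue async read: space %lu page %lu\n", space, offset);
	return(1);
}

int
main(void)
{
	/* A dumped list of pages: pairs of (space id, page number). */
	ulint	dump[][2] = { {0, 3}, {0, 4}, {5, 1} };
	ulint	i;

	for (i = 0; i < sizeof(dump) / sizeof(dump[0]); i++) {
		buf_read_page_async_stub(dump[i][0], dump[i][1]);
	}

	return(0);
}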
+/********************************************************************//**
Applies linear read-ahead if in the buf_pool the page is a border page of
a linear read-ahead area and all the pages in the area have been accessed.
Does not read any page if the read-ahead mechanism is not activated. Note
@@ -511,6 +559,7 @@ buf_read_ahead_linear(
buf_page_t* bpage;
buf_frame_t* frame;
buf_page_t* pred_bpage = NULL;
+ unsigned pred_bpage_is_accessed = 0;
ulint pred_offset;
ulint succ_offset;
ulint count;
@@ -519,13 +568,14 @@ buf_read_ahead_linear(
ulint fail_count;
ulint ibuf_mode;
ulint low, high;
- ulint err;
+ dberr_t err;
ulint i;
const ulint buf_read_ahead_linear_area
= BUF_READ_AHEAD_AREA(buf_pool);
ulint threshold;
- if (!(srv_read_ahead & 2)) {
+ /* check if readahead is disabled */
+ if (!srv_read_ahead_threshold) {
return(0);
}
@@ -561,10 +611,7 @@ buf_read_ahead_linear(
tablespace_version = fil_space_get_version(space);
- buf_pool_mutex_enter(buf_pool);
-
if (high > fil_space_get_size(space)) {
- buf_pool_mutex_exit(buf_pool);
/* The area is not whole, return */
return(0);
@@ -572,11 +619,9 @@ buf_read_ahead_linear(
if (buf_pool->n_pend_reads
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
- buf_pool_mutex_exit(buf_pool);
return(0);
}
- buf_pool_mutex_exit(buf_pool);
/* Check that almost all pages in the area have been accessed; if
offset == low, the accesses must be in a descending order, otherwise,
@@ -595,9 +640,12 @@ buf_read_ahead_linear(
fail_count = 0;
- rw_lock_s_lock(&buf_pool->page_hash_latch);
for (i = low; i < high; i++) {
- bpage = buf_page_hash_get(buf_pool, space, i);
+
+ prio_rw_lock_t* hash_lock;
+
+ bpage = buf_page_hash_get_s_locked(buf_pool, space, i,
+ &hash_lock);
if (bpage == NULL || !buf_page_is_accessed(bpage)) {
/* Not accessed */
@@ -614,7 +662,7 @@ buf_read_ahead_linear(
a little against this. */
int res = ut_ulint_cmp(
buf_page_is_accessed(bpage),
- buf_page_is_accessed(pred_bpage));
+ pred_bpage_is_accessed);
/* Accesses not in the right order */
if (res != 0 && res != asc_or_desc) {
fail_count++;
@@ -623,13 +671,20 @@ buf_read_ahead_linear(
if (fail_count > threshold) {
/* Too many failures: return */
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
+ if (bpage) {
+ rw_lock_s_unlock(hash_lock);
+ }
return(0);
}
- if (bpage && buf_page_is_accessed(bpage)) {
- pred_bpage = bpage;
+ if (bpage) {
+ if (buf_page_is_accessed(bpage)) {
+ pred_bpage = bpage;
+ pred_bpage_is_accessed
+ = buf_page_is_accessed(bpage);
+ }
+
+ rw_lock_s_unlock(hash_lock);
}
}
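/* The hunk above stops re-reading the access time of pred_bpage after
   its page_hash latch has been released and instead caches the value
   (pred_bpage_is_accessed) while the S-latch is still held.  The
   standalone sketch below shows that general pattern of copying a field
   out of a shared structure under the lock and comparing only the cached
   copy later; the struct and latch here are invented, not InnoDB types. */
#include <pthread.h>
#include <stdio.h>

struct page_info {
	pthread_rwlock_t	latch;
	unsigned		access_time;	/* 0 == never accessed */
};

int
main(void)
{
	struct page_info	pred = { PTHREAD_RWLOCK_INITIALIZER, 42 };
	unsigned		pred_access_time;

	/* Copy the field while the shared latch is held ... */
	pthread_rwlock_rdlock(&pred.latch);
	pred_access_time = pred.access_time;
	pthread_rwlock_unlock(&pred.latch);

	/* ... and use only the cached copy afterwards, so later changes
	to the now-unprotected structure cannot skew the comparison. */
	printf("cached access time: %u\n", pred_access_time);
	return(0);
}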
@@ -639,8 +694,6 @@ buf_read_ahead_linear(
bpage = buf_page_hash_get(buf_pool, space, offset);
if (bpage == NULL) {
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
return(0);
}
@@ -666,9 +719,6 @@ buf_read_ahead_linear(
pred_offset = fil_page_get_prev(frame);
succ_offset = fil_page_get_next(frame);
- //buf_pool_mutex_exit(buf_pool);
- rw_lock_s_unlock(&buf_pool->page_hash_latch);
-
if ((offset == low) && (succ_offset == offset + 1)) {
/* This is ok, we can continue */
@@ -721,7 +771,7 @@ buf_read_ahead_linear(
if (!ibuf_bitmap_page(zip_size, i)) {
count += buf_read_page_low(
- &err, FALSE,
+ &err, false,
ibuf_mode,
space, zip_size, FALSE, tablespace_version, i, trx);
if (err == DB_TABLESPACE_DELETED) {
@@ -743,9 +793,6 @@ buf_read_ahead_linear(
os_aio_simulated_wake_handler_threads();
- /* Flush pages from the end of the LRU list if necessary */
- buf_flush_free_margin(buf_pool, TRUE);
-
#ifdef UNIV_DEBUG
if (buf_debug_prints && (count > 0)) {
fprintf(stderr,
@@ -770,7 +817,7 @@ UNIV_INTERN
void
buf_read_ibuf_merge_pages(
/*======================*/
- ibool sync, /*!< in: TRUE if the caller
+ bool sync, /*!< in: true if the caller
wants this function to wait
for the highest address page
to get read in, before this
@@ -797,7 +844,7 @@ buf_read_ibuf_merge_pages(
#endif
for (i = 0; i < n_stored; i++) {
- ulint err;
+ dberr_t err;
buf_pool_t* buf_pool;
ulint zip_size = fil_space_get_zip_size(space_ids[i]);
@@ -831,9 +878,6 @@ tablespace_deleted:
os_aio_simulated_wake_handler_threads();
- /* Flush pages from the end of all the LRU lists if necessary */
- buf_flush_free_margins(FALSE);
-
#ifdef UNIV_DEBUG
if (buf_debug_prints) {
fprintf(stderr,
@@ -866,7 +910,7 @@ buf_read_recv_pages(
{
ib_int64_t tablespace_version;
ulint count;
- ulint err;
+ dberr_t err;
ulint i;
zip_size = fil_space_get_zip_size(space);
@@ -890,7 +934,7 @@ buf_read_recv_pages(
for (i = 0; i < n_stored; i++) {
/* recv_get_fil_addr_struct() */
- recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
+ recv_addr = (recv_addr_t*)HASH_GET_FIRST(recv_sys->addr_hash,
hash_calc_hash(ut_fold_ulint_pair(space, page_nos[i]),
recv_sys->addr_hash));
while (recv_addr) {
@@ -898,7 +942,7 @@ buf_read_recv_pages(
&& (recv_addr->page_no == page_nos[i])) {
break;
}
- recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+ recv_addr = (recv_addr_t*)HASH_GET_NEXT(addr_hash, recv_addr);
}
if ((recv_addr == NULL)
@@ -931,7 +975,8 @@ not_to_recover:
os_aio_print_debug = FALSE;
buf_pool = buf_pool_get(space, page_nos[i]);
- while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
+ while (buf_pool->n_pend_reads
+ >= recv_n_pool_free_frames / 2) {
os_aio_simulated_wake_handler_threads();
os_thread_sleep(10000);
@@ -947,7 +992,7 @@ not_to_recover:
"InnoDB: Number of pending reads %lu,"
" pending pread calls %lu\n",
(ulong) buf_pool->n_pend_reads,
- (ulong)os_file_n_pending_preads);
+ (ulong) os_file_n_pending_preads);
os_aio_print_debug = TRUE;
}
@@ -956,11 +1001,11 @@ not_to_recover:
os_aio_print_debug = FALSE;
if ((i + 1 == n_stored) && sync) {
- buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
+ buf_read_page_low(&err, true, BUF_READ_ANY_PAGE, space,
zip_size, TRUE, tablespace_version,
page_nos[i], NULL);
} else {
- buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
+ buf_read_page_low(&err, false, BUF_READ_ANY_PAGE
| OS_AIO_SIMULATED_WAKE_LATER,
space, zip_size, TRUE,
tablespace_version, page_nos[i], NULL);
@@ -969,9 +1014,6 @@ not_to_recover:
os_aio_simulated_wake_handler_threads();
- /* Flush pages from the end of all the LRU lists if necessary */
- buf_flush_free_margins(FALSE);
-
#ifdef UNIV_DEBUG
if (buf_debug_prints) {
fprintf(stderr,