Diffstat (limited to 'storage/innobase/buf/buf0buf.cc')
-rw-r--r--  storage/innobase/buf/buf0buf.cc  5236
1 files changed, 5236 insertions, 0 deletions
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
new file mode 100644
index 00000000000..2b543d8d1cf
--- /dev/null
+++ b/storage/innobase/buf/buf0buf.cc
@@ -0,0 +1,5236 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buf.cc
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0buf.h"
+
+#ifdef UNIV_NONINL
+#include "buf0buf.ic"
+#endif
+
+#include "mem0mem.h"
+#include "btr0btr.h"
+#include "fil0fil.h"
+#ifndef UNIV_HOTBACKUP
+#include "buf0buddy.h"
+#include "lock0lock.h"
+#include "btr0sea.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "log0log.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "srv0srv.h"
+#include "dict0dict.h"
+#include "log0recv.h"
+#include "page0zip.h"
+#include "srv0mon.h"
+#include "buf0checksum.h"
+#include "buf0dblwr.h"
+
+/*
+ IMPLEMENTATION OF THE BUFFER POOL
+ =================================
+
+Performance improvement:
+------------------------
+Thread scheduling in NT may be so slow that the OS wait mechanism should
+not be used even in waiting for disk reads to complete.
+Rather, we should put waiting query threads to the queue of
+waiting jobs, and let the OS thread do something useful while the i/o
+is processed. In this way we could remove most OS thread switches in
+an i/o-intensive benchmark like TPC-C.
+
+A possibility is to put a user space thread library between the database
+and NT. User space thread libraries might be very fast.
+
+SQL Server 7.0 can be configured to use 'fibers' which are lightweight
+threads in NT. These should be studied.
+
+ Buffer frames and blocks
+ ------------------------
+Following the terminology of Gray and Reuter, we call the memory
+blocks where file pages are loaded buffer frames. For each buffer
+frame there is a control block, or shortly, a block, in the buffer
+control array. The control info that does not need to be stored
+in the file along with the file page resides in the control block.
+
+ Buffer pool struct
+ ------------------
+The buffer buf_pool contains a single mutex which protects all the
+control data structures of the buf_pool. The content of a buffer frame is
+protected by a separate read-write lock in its control block, though.
+These locks can be locked and unlocked without owning the buf_pool->mutex.
+The OS events in the buf_pool struct can be waited for without owning the
+buf_pool->mutex.
+
+The buf_pool->mutex is a hot-spot in main memory, causing a lot of
+memory bus traffic on multiprocessor systems when processors
+alternately access the mutex. On our Pentium, the mutex is accessed
+maybe every 10 microseconds. We gave up the idea of having a separate
+mutex for each control block, for instance, because it seemed
+too complicated.
+
+A solution to reduce mutex contention of the buf_pool->mutex is to
+create a separate mutex for the page hash table. On Pentium,
+accessing the hash table takes 2 microseconds, about half
+of the total buf_pool->mutex hold time.
+
+ Control blocks
+ --------------
+
+The control block contains, for instance, the bufferfix count
+which is incremented when a thread wants a file page to be fixed
+in a buffer frame. The bufferfix operation does not lock the
+contents of the frame, however. For this purpose, the control
+block contains a read-write lock.
+
+The buffer frames have to be aligned so that the start memory
+address of a frame is divisible by the universal page size, which
+is a power of two.
+
+We intend to make the buffer buf_pool size on-line reconfigurable,
+that is, the buf_pool size can be changed without closing the database.
+Then the database administrator may adjust it to be bigger
+at night, for example. The control block array must
+contain enough control blocks for the maximum buffer buf_pool size
+which is used in the particular database.
+If the buf_pool size is cut, we exploit the virtual memory mechanism of
+the OS, and just refrain from using frames at high addresses. Then the OS
+can swap them to disk.
+
+The control blocks containing file pages are put into a hash table
+according to the file address of the page.
+We could speed up the access to an individual page by using
+"pointer swizzling": we could replace the page references on
+non-leaf index pages by direct pointers to the page, if it exists
+in the buf_pool. We could make a separate hash table where we could
+chain all the page references in non-leaf pages residing in the buf_pool,
+using the page reference as the hash key,
+and at the time of reading of a page update the pointers accordingly.
+Drawbacks of this solution are added complexity and,
+possibly, extra space required on non-leaf pages for memory pointers.
+A simpler solution is just to speed up the hash table mechanism
+in the database, using tables whose size is a power of 2.
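+
+For example, with a power-of-2 number of hash cells the cell index
+can be computed with a bit mask instead of a modulo operation
+(a sketch only; the real hash table implementation lives in ha0ha.cc,
+and n_cells here is just an illustrative name):
+
+	cell = buf_page_address_fold(space, offset) & (n_cells - 1);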
+
+ Lists of blocks
+ ---------------
+
+There are several lists of control blocks.
+
+The free list (buf_pool->free) contains blocks which are currently not
+used.
+
+The common LRU list contains all the blocks holding a file page
+except those for which the bufferfix count is non-zero.
+The pages are in the LRU list roughly in the order of the last
+access to the page, so that the oldest pages are at the end of the
+list. We also keep a pointer to near the end of the LRU list,
+which we can use when we want to artificially age a page in the
+buf_pool. This is used if we know that some page is not needed
+again for some time: we insert the block right after the pointer,
+causing it to be replaced sooner than would normally be the case.
+Currently this aging mechanism is used by the read-ahead mechanism,
+and it can also be used when there is a scan of a full table
+which cannot fit in memory. By putting the pages near the
+end of the LRU list, we make sure that most of the buf_pool stays
+in the main memory, undisturbed.
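+
+In code terms, the aging insert is roughly the following (a sketch
+only; age_block is an illustrative flag, and the real logic,
+including the LRU_old bookkeeping, is in buf0lru.cc):
+
+	if (age_block) {
+		UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU,
+				     buf_pool->LRU_old, bpage);
+	} else {
+		UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, bpage);
+	}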
+
+The unzip_LRU list contains a subset of the common LRU list. The
+blocks on the unzip_LRU list hold a compressed file page and the
+corresponding uncompressed page frame. A block is in unzip_LRU if and
+only if the predicate buf_page_belongs_to_unzip_LRU(&block->page)
+holds. The blocks in unzip_LRU will be in the same order as they are in
+the common LRU list. That is, each manipulation of the common LRU
+list will result in the same manipulation of the unzip_LRU list.
+
+The chain of modified blocks (buf_pool->flush_list) contains the blocks
+holding file pages that have been modified in the memory
+but not written to disk yet. The block with the oldest modification
+which has not yet been written to disk is at the end of the chain.
+The access to this list is protected by buf_pool->flush_list_mutex.
+
+The chain of unmodified compressed blocks (buf_pool->zip_clean)
+contains the control blocks (buf_page_t) of those compressed pages
+that are not in buf_pool->flush_list and for which no uncompressed
+page has been allocated in the buffer pool. The control blocks for
+uncompressed pages are accessible via buf_block_t objects that are
+reachable via buf_pool->chunks[].
+
+The chains of free memory blocks (buf_pool->zip_free[]) are used by
+the buddy allocator (buf0buddy.cc) to keep track of currently unused
+memory blocks of size sizeof(buf_page_t)..UNIV_PAGE_SIZE / 2. These
+blocks are inside the UNIV_PAGE_SIZE-sized memory blocks of type
+BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
+pool. The buddy allocator is solely used for allocating control
+blocks for compressed pages (buf_page_t) and compressed page frames.
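+
+A sketch of how an allocation maps to a zip_free[] size class
+(illustrative only; see buf_buddy_alloc() in buf0buddy.cc for the
+real code, where BUF_BUDDY_LOW is the smallest buddy size):
+
+	for (i = 0; (BUF_BUDDY_LOW << i) < size; i++) {
+	}
+	buddy = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);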
+
+ Loading a file page
+ -------------------
+
+First, a victim block for replacement has to be found in the
+buf_pool. It is taken from the free list or searched for from the
+end of the LRU-list. An exclusive lock is reserved for the frame,
+the io_fix field is set in the block fixing the block in buf_pool,
+and the io-operation for loading the page is queued. The io-handler thread
+releases the X-lock on the frame and resets the io_fix field
+when the io operation completes.
+
+A thread may request the above operation using the function
+buf_page_get(). It may then continue to request a lock on the frame.
+The lock is granted when the io-handler releases the x-lock.
+
+ Read-ahead
+ ----------
+
+The read-ahead mechanism is intended to be intelligent and
+isolated from the semantically higher levels of the database
+index management. From the higher level we only need the
+information if a file page has a natural successor or
+predecessor page. On the leaf level of a B-tree index,
+these are the next and previous pages in the natural
+order of the pages.
+
+Let us first explain the read-ahead mechanism when the leaf pages
+of a B-tree are scanned in an ascending or descending order.
+When a page is referenced for the first time in the buf_pool,
+the buffer manager checks if it is at the border of a so-called
+linear read-ahead area. The tablespace is divided into these
+areas of size 64 blocks, for example. So if the page is at the
+border of such an area, the read-ahead mechanism checks if
+all the other blocks in the area have been accessed in an
+ascending or descending order. If this is the case, the system
+looks at the natural successor or predecessor of the page,
+checks if that is at the border of another area, and in this case
+issues read-requests for all the pages in that area. Maybe
+we could relax the condition that all the pages in the area
+have to be accessed: if data is deleted from a table, there may
+appear holes of unused pages in the area.
+
+A different read-ahead mechanism is used when there appears
+to be a random access pattern to a file.
+If a new page is referenced in the buf_pool, and several pages
+of its random access area (for instance, 32 consecutive pages
+in a tablespace) have recently been referenced, we may predict
+that the whole area may be needed in the near future, and issue
+the read requests for the whole area.
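+
+The area arithmetic is simple. For a linear read-ahead area of 64
+pages, the border test reduces to (a sketch of what
+buf_read_ahead_linear() in buf0rea.cc computes):
+
+	low  = (offset / area) * area;
+	high = low + area;
+
+The page is "at the border" when offset == low or offset == high - 1;
+only then is the access pattern of [low, high) inspected and a batch
+of read requests possibly issued for the neighbouring area.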
+*/
+
+#ifndef UNIV_HOTBACKUP
+/** Value in microseconds */
+static const int WAIT_FOR_READ = 5000;
+/** Number of attempts made to read in a page in the buffer pool */
+static const ulint BUF_PAGE_READ_MAX_RETRIES = 100;
+
+/** The buffer pools of the database */
+UNIV_INTERN buf_pool_t* buf_pool_ptr;
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+static ulint buf_dbg_counter = 0; /*!< This is used to insert validation
+ operations in execution in the
+ debug version */
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_DEBUG
+/** If this is set TRUE, the program prints info whenever
+read-ahead or flush occurs */
+UNIV_INTERN ibool buf_debug_prints = FALSE;
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_PFS_RWLOCK
+/* Keys to register buffer block related rwlocks and mutexes with
+performance schema */
+UNIV_INTERN mysql_pfs_key_t buf_block_lock_key;
+# ifdef UNIV_SYNC_DEBUG
+UNIV_INTERN mysql_pfs_key_t buf_block_debug_latch_key;
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t buffer_block_mutex_key;
+UNIV_INTERN mysql_pfs_key_t buf_pool_mutex_key;
+UNIV_INTERN mysql_pfs_key_t buf_pool_zip_mutex_key;
+UNIV_INTERN mysql_pfs_key_t flush_list_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
+# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
+
+/* Buffer block mutexes and rwlocks can be registered
+in one group rather than individually. If PFS_GROUP_BUFFER_SYNC
+is defined, register buffer block mutex and rwlock
+in one group after their initialization. */
+# define PFS_GROUP_BUFFER_SYNC
+
+/* This define caps the number of mutexes/rwlocks that can
+be registered with performance schema. Developers can
+modify this define if necessary. Please note, this would
+be effective only if PFS_GROUP_BUFFER_SYNC is defined. */
+# define PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER ULINT_MAX
+
+# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
+#endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
+
+/** Macro to determine whether the read or write counter is used,
+depending on the io_type */
+#define MONITOR_RW_COUNTER(io_type, counter) \
+ ((io_type == BUF_IO_READ) \
+ ? (counter##_READ) \
+ : (counter##_WRITTEN))
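+
+/* Usage sketch (illustrative; MONITOR_INDEX_LEAF_PAGE is an assumed
+counter name, used here only to show the token pasting): on i/o
+completion one would bump the matching monitor with
+
+	MONITOR_INC(MONITOR_RW_COUNTER(io_type, MONITOR_INDEX_LEAF_PAGE));
+
+which expands to MONITOR_INDEX_LEAF_PAGE_READ for BUF_IO_READ and to
+MONITOR_INDEX_LEAF_PAGE_WRITTEN otherwise. */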
+
+/********************************************************************//**
+Gets the smallest oldest_modification lsn for any page in the pool. Returns
+zero if all modified pages have been flushed to disk.
+@return oldest modification in pool, zero if none */
+UNIV_INTERN
+lsn_t
+buf_pool_get_oldest_modification(void)
+/*==================================*/
+{
+ ulint i;
+ buf_page_t* bpage;
+ lsn_t lsn = 0;
+ lsn_t oldest_lsn = 0;
+
+ /* When we traverse all the flush lists we don't want another
+ thread to add a dirty page to any flush list. */
+ log_flush_order_mutex_enter();
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+
+		if (bpage != NULL) {
+			ut_ad(bpage->in_flush_list);
+			lsn = bpage->oldest_modification;
+		} else {
+			/* Reset the lsn so that an empty flush list
+			does not carry over a stale value from the
+			previous buffer pool instance. */
+			lsn = 0;
+		}
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+		if (lsn && (!oldest_lsn || oldest_lsn > lsn)) {
+			oldest_lsn = lsn;
+		}
+ }
+
+ log_flush_order_mutex_exit();
+
+ /* The returned answer may be out of date: the flush_list can
+ change after the mutex has been released. */
+
+ return(oldest_lsn);
+}
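+
+/* Usage sketch (illustrative): the log subsystem consults this
+function to decide how far a checkpoint may advance, roughly:
+
+	lsn_t	oldest = buf_pool_get_oldest_modification();
+
+The checkpoint must not pass oldest; when oldest == 0 all modified
+pages are already on disk and the checkpoint can advance to the
+current log lsn. */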
+
+/********************************************************************//**
+Get the total lengths of the LRU, free and flush lists over all
+buffer pool instances. */
+UNIV_INTERN
+void
+buf_get_total_list_len(
+/*===================*/
+ ulint* LRU_len, /*!< out: length of all LRU lists */
+ ulint* free_len, /*!< out: length of all free lists */
+ ulint* flush_list_len) /*!< out: length of all flush lists */
+{
+ ulint i;
+
+ *LRU_len = 0;
+ *free_len = 0;
+ *flush_list_len = 0;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ if (!buf_pool) {
+ continue;
+ }
+
+ *LRU_len += UT_LIST_GET_LEN(buf_pool->LRU);
+ *free_len += UT_LIST_GET_LEN(buf_pool->free);
+ *flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list);
+ }
+}
+
+/********************************************************************//**
+Get total buffer pool statistics. */
+UNIV_INTERN
+void
+buf_get_total_stat(
+/*===============*/
+ buf_pool_stat_t* tot_stat) /*!< out: buffer pool stats */
+{
+ ulint i;
+
+ memset(tot_stat, 0, sizeof(*tot_stat));
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_stat_t* buf_stat;
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ if (!buf_pool) {
+ continue;
+ }
+
+ buf_stat = &buf_pool->stat;
+ tot_stat->n_page_gets += buf_stat->n_page_gets;
+ tot_stat->n_pages_read += buf_stat->n_pages_read;
+ tot_stat->n_pages_written += buf_stat->n_pages_written;
+ tot_stat->n_pages_created += buf_stat->n_pages_created;
+ tot_stat->n_ra_pages_read_rnd += buf_stat->n_ra_pages_read_rnd;
+ tot_stat->n_ra_pages_read += buf_stat->n_ra_pages_read;
+ tot_stat->n_ra_pages_evicted += buf_stat->n_ra_pages_evicted;
+ tot_stat->n_pages_made_young += buf_stat->n_pages_made_young;
+
+ tot_stat->n_pages_not_made_young +=
+ buf_stat->n_pages_not_made_young;
+ }
+}
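+
+/* Usage sketch (illustrative): a monitoring caller can derive a
+pool-wide miss rate from the aggregated numbers, guarding against
+a zero n_page_gets:
+
+	buf_pool_stat_t	stat;
+
+	buf_get_total_stat(&stat);
+	miss_rate = stat.n_page_gets
+		? (double) stat.n_pages_read / (double) stat.n_page_gets
+		: 0.0;
+*/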
+
+/********************************************************************//**
+Allocates a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+UNIV_INTERN
+buf_block_t*
+buf_block_alloc(
+/*============*/
+ buf_pool_t* buf_pool) /*!< in/out: buffer pool instance,
+ or NULL for round-robin selection
+ of the buffer pool */
+{
+ buf_block_t* block;
+ ulint index;
+ static ulint buf_pool_index;
+
+ if (buf_pool == NULL) {
+		/* We are allocating memory from any buffer pool: ensure
+		we spread the load across all buffer pool instances. */
+ index = buf_pool_index++ % srv_buf_pool_instances;
+ buf_pool = buf_pool_from_array(index);
+ }
+
+ block = buf_LRU_get_free_block(buf_pool);
+
+ buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+ return(block);
+}
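+
+/* Usage sketch (illustrative): a caller needing a temporary frame,
+for example as a compression scratch buffer, pairs this with
+buf_block_free():
+
+	buf_block_t*	block = buf_block_alloc(NULL);
+
+	... use block->frame as a UNIV_PAGE_SIZE scratch area ...
+
+	buf_block_free(block);
+*/
+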
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Checks if a page is corrupt.
+@return TRUE if corrupted */
+UNIV_INTERN
+ibool
+buf_page_is_corrupted(
+/*==================*/
+ const byte* read_buf, /*!< in: a database page */
+ ulint zip_size) /*!< in: size of compressed page;
+ 0 for uncompressed pages */
+{
+ ulint checksum_field1;
+ ulint checksum_field2;
+ ibool crc32_inited = FALSE;
+ ib_uint32_t crc32 = ULINT32_UNDEFINED;
+
+ if (!zip_size
+ && memcmp(read_buf + FIL_PAGE_LSN + 4,
+ read_buf + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
+
+ /* Stored log sequence numbers at the start and the end
+ of page do not match */
+
+ return(TRUE);
+ }
+
+#ifndef UNIV_HOTBACKUP
+ if (recv_lsn_checks_on) {
+ lsn_t current_lsn;
+
+ if (log_peek_lsn(&current_lsn)
+ && UNIV_UNLIKELY
+ (current_lsn
+ < mach_read_from_8(read_buf + FIL_PAGE_LSN))) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: page %lu log sequence number"
+ " " LSN_PF "\n"
+ "InnoDB: is in the future! Current system "
+ "log sequence number " LSN_PF ".\n"
+ "InnoDB: Your database may be corrupt or "
+ "you may have copied the InnoDB\n"
+ "InnoDB: tablespace but not the InnoDB "
+ "log files. See\n"
+ "InnoDB: " REFMAN
+ "forcing-innodb-recovery.html\n"
+ "InnoDB: for more information.\n",
+ (ulong) mach_read_from_4(
+ read_buf + FIL_PAGE_OFFSET),
+ (lsn_t) mach_read_from_8(
+ read_buf + FIL_PAGE_LSN),
+ current_lsn);
+ }
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Check whether the checksum fields have correct values */
+
+ if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_NONE) {
+ return(FALSE);
+ }
+
+ if (zip_size) {
+ return(!page_zip_verify_checksum(read_buf, zip_size));
+ }
+
+ checksum_field1 = mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
+
+ checksum_field2 = mach_read_from_4(
+ read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+ /* declare empty pages non-corrupted */
+ if (checksum_field1 == 0 && checksum_field2 == 0
+ && mach_read_from_4(read_buf + FIL_PAGE_LSN) == 0) {
+ /* make sure that the page is really empty */
+ ut_d(for (ulint i = 0; i < UNIV_PAGE_SIZE; i++) {
+ ut_a(read_buf[i] == 0); });
+
+ return(FALSE);
+ }
+
+ switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+
+ crc32 = buf_calc_page_crc32(read_buf);
+
+ return(checksum_field1 != crc32 || checksum_field2 != crc32);
+
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+
+ return(checksum_field1
+ != buf_calc_page_new_checksum(read_buf)
+ || checksum_field2
+ != buf_calc_page_old_checksum(read_buf));
+
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+
+ return(checksum_field1 != BUF_NO_CHECKSUM_MAGIC
+ || checksum_field2 != BUF_NO_CHECKSUM_MAGIC);
+
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ /* There are 3 valid formulas for
+ checksum_field2 (old checksum field):
+
+ 1. Very old versions of InnoDB only stored 8 byte lsn to the
+ start and the end of the page.
+
+ 2. InnoDB versions before MySQL 5.6.3 store the old formula
+ checksum (buf_calc_page_old_checksum()).
+
+ 3. InnoDB versions 5.6.3 and newer with
+ innodb_checksum_algorithm=strict_crc32|crc32 store CRC32. */
+
+		/* Since innodb_checksum_algorithm is not strict_*, allow
+		any of the algorithms to match for the old field. */
+
+ if (checksum_field2
+ != mach_read_from_4(read_buf + FIL_PAGE_LSN)
+ && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) {
+
+			/* The checksum does not match any of the
+			fast-to-check values. First try the algorithm
+			selected for writing checksums, because we
+			assume that its chance of matching is higher. */
+
+ if (srv_checksum_algorithm
+ == SRV_CHECKSUM_ALGORITHM_CRC32) {
+
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = TRUE;
+
+ if (checksum_field2 != crc32
+ && checksum_field2
+ != buf_calc_page_old_checksum(read_buf)) {
+
+ return(TRUE);
+ }
+ } else {
+ ut_ad(srv_checksum_algorithm
+ == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+ if (checksum_field2
+ != buf_calc_page_old_checksum(read_buf)) {
+
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = TRUE;
+
+ if (checksum_field2 != crc32) {
+ return(TRUE);
+ }
+ }
+ }
+ }
+
+ /* old field is fine, check the new field */
+
+	/* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
+	(always equal to 0) in FIL_PAGE_SPACE_OR_CHKSUM. */
+
+ if (checksum_field1 != 0
+ && checksum_field1 != BUF_NO_CHECKSUM_MAGIC) {
+
+			/* The checksum does not match any of the
+			fast-to-check values. First try the algorithm
+			selected for writing checksums, because we
+			assume that its chance of matching is higher. */
+
+ if (srv_checksum_algorithm
+ == SRV_CHECKSUM_ALGORITHM_CRC32) {
+
+ if (!crc32_inited) {
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = TRUE;
+ }
+
+ if (checksum_field1 != crc32
+ && checksum_field1
+ != buf_calc_page_new_checksum(read_buf)) {
+
+ return(TRUE);
+ }
+ } else {
+ ut_ad(srv_checksum_algorithm
+ == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+ if (checksum_field1
+ != buf_calc_page_new_checksum(read_buf)) {
+
+ if (!crc32_inited) {
+ crc32 = buf_calc_page_crc32(
+ read_buf);
+ crc32_inited = TRUE;
+ }
+
+ if (checksum_field1 != crc32) {
+ return(TRUE);
+ }
+ }
+ }
+ }
+
+ /* If CRC32 is stored in at least one of the fields, then the
+ other field must also be CRC32 */
+ if (crc32_inited
+ && ((checksum_field1 == crc32
+ && checksum_field2 != crc32)
+ || (checksum_field1 != crc32
+ && checksum_field2 == crc32))) {
+
+ return(TRUE);
+ }
+
+ break;
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ /* should have returned FALSE earlier */
+ ut_error;
+ /* no default so the compiler will emit a warning if new enum
+ is added and not handled here */
+ }
+
+ return(FALSE);
+}
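+
+/* Usage sketch (illustrative): i/o completion code validates a frame
+right after it has been read from the data file:
+
+	if (buf_page_is_corrupted(frame, zip_size)) {
+		buf_page_print(frame, zip_size, BUF_PAGE_PRINT_NO_CRASH);
+		... report the corruption, possibly retry the read ...
+	}
+*/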
+
+/********************************************************************//**
+Prints a page to stderr. */
+UNIV_INTERN
+void
+buf_page_print(
+/*===========*/
+ const byte* read_buf, /*!< in: a database page */
+ ulint zip_size, /*!< in: compressed page size, or
+ 0 for uncompressed pages */
+ ulint flags) /*!< in: 0 or
+ BUF_PAGE_PRINT_NO_CRASH or
+ BUF_PAGE_PRINT_NO_FULL */
+
+{
+#ifndef UNIV_HOTBACKUP
+ dict_index_t* index;
+#endif /* !UNIV_HOTBACKUP */
+ ulint size = zip_size;
+
+ if (!size) {
+ size = UNIV_PAGE_SIZE;
+ }
+
+ if (!(flags & BUF_PAGE_PRINT_NO_FULL)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Page dump in ascii and hex (%lu bytes):\n",
+ (ulong) size);
+ ut_print_buf(stderr, read_buf, size);
+ fputs("\nInnoDB: End of page dump\n", stderr);
+ }
+
+ if (zip_size) {
+ /* Print compressed page. */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Compressed page type (" ULINTPF "); "
+ "stored checksum in field1 " ULINTPF "; "
+ "calculated checksums for field1: "
+ "%s " ULINTPF ", "
+ "%s " ULINTPF ", "
+ "%s " ULINTPF "; "
+ "page LSN " LSN_PF "; "
+ "page number (if stored to page already) " ULINTPF "; "
+ "space id (if stored to page already) " ULINTPF "\n",
+ fil_page_get_type(read_buf),
+ mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+ buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_CRC32),
+ page_zip_calc_checksum(read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_CRC32),
+ buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_INNODB),
+ page_zip_calc_checksum(read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_INNODB),
+ buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_NONE),
+ page_zip_calc_checksum(read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_NONE),
+ mach_read_from_8(read_buf + FIL_PAGE_LSN),
+ mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+ mach_read_from_4(read_buf
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: uncompressed page, "
+ "stored checksum in field1 " ULINTPF ", "
+ "calculated checksums for field1: "
+ "%s " UINT32PF ", "
+ "%s " ULINTPF ", "
+ "%s " ULINTPF ", "
+
+ "stored checksum in field2 " ULINTPF ", "
+ "calculated checksums for field2: "
+ "%s " UINT32PF ", "
+ "%s " ULINTPF ", "
+ "%s " ULINTPF ", "
+
+ "page LSN " ULINTPF " " ULINTPF ", "
+ "low 4 bytes of LSN at page end " ULINTPF ", "
+ "page number (if stored to page already) " ULINTPF ", "
+ "space id (if created with >= MySQL-4.1.1 "
+ "and stored already) %lu\n",
+ mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32),
+ buf_calc_page_crc32(read_buf),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB),
+ buf_calc_page_new_checksum(read_buf),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE),
+ BUF_NO_CHECKSUM_MAGIC,
+
+ mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32),
+ buf_calc_page_crc32(read_buf),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB),
+ buf_calc_page_old_checksum(read_buf),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE),
+ BUF_NO_CHECKSUM_MAGIC,
+
+ mach_read_from_4(read_buf + FIL_PAGE_LSN),
+ mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
+ mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
+ mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+ mach_read_from_4(read_buf
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+ }
+
+#ifndef UNIV_HOTBACKUP
+ if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE)
+ == TRX_UNDO_INSERT) {
+ fprintf(stderr,
+ "InnoDB: Page may be an insert undo log page\n");
+ } else if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE)
+ == TRX_UNDO_UPDATE) {
+ fprintf(stderr,
+ "InnoDB: Page may be an update undo log page\n");
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ switch (fil_page_get_type(read_buf)) {
+ index_id_t index_id;
+ case FIL_PAGE_INDEX:
+ index_id = btr_page_get_index_id(read_buf);
+ fprintf(stderr,
+ "InnoDB: Page may be an index page where"
+ " index id is %llu\n",
+ (ullint) index_id);
+#ifndef UNIV_HOTBACKUP
+ index = dict_index_find_on_id_low(index_id);
+ if (index) {
+ fputs("InnoDB: (", stderr);
+ dict_index_name_print(stderr, NULL, index);
+ fputs(")\n", stderr);
+ }
+#endif /* !UNIV_HOTBACKUP */
+ break;
+ case FIL_PAGE_INODE:
+ fputs("InnoDB: Page may be an 'inode' page\n", stderr);
+ break;
+ case FIL_PAGE_IBUF_FREE_LIST:
+ fputs("InnoDB: Page may be an insert buffer free list page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_ALLOCATED:
+ fputs("InnoDB: Page may be a freshly allocated page\n",
+ stderr);
+ break;
+ case FIL_PAGE_IBUF_BITMAP:
+ fputs("InnoDB: Page may be an insert buffer bitmap page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_SYS:
+ fputs("InnoDB: Page may be a system page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_TRX_SYS:
+ fputs("InnoDB: Page may be a transaction system page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_FSP_HDR:
+ fputs("InnoDB: Page may be a file space header page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_XDES:
+ fputs("InnoDB: Page may be an extent descriptor page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_BLOB:
+ fputs("InnoDB: Page may be a BLOB page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ fputs("InnoDB: Page may be a compressed BLOB page\n",
+ stderr);
+ break;
+ }
+
+ ut_ad(flags & BUF_PAGE_PRINT_NO_CRASH);
+}
+
+#ifndef UNIV_HOTBACKUP
+
+# ifdef PFS_GROUP_BUFFER_SYNC
+/********************************************************************//**
+This function registers mutexes and rwlocks in buffer blocks with
+performance schema. If PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER is
+defined to be a value less than chunk->size, then only mutexes
+and rwlocks in the first PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER
+blocks are registered. */
+static
+void
+pfs_register_buffer_block(
+/*======================*/
+ buf_chunk_t* chunk) /*!< in/out: chunk of buffers */
+{
+ ulint i;
+ ulint num_to_register;
+ buf_block_t* block;
+
+ block = chunk->blocks;
+
+ num_to_register = ut_min(chunk->size,
+ PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER);
+
+ for (i = 0; i < num_to_register; i++) {
+ mutex_t* mutex;
+ rw_lock_t* rwlock;
+
+# ifdef UNIV_PFS_MUTEX
+ mutex = &block->mutex;
+ ut_a(!mutex->pfs_psi);
+ mutex->pfs_psi = (PSI_server)
+ ? PSI_server->init_mutex(buffer_block_mutex_key, mutex)
+ : NULL;
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+ rwlock = &block->lock;
+ ut_a(!rwlock->pfs_psi);
+ rwlock->pfs_psi = (PSI_server)
+ ? PSI_server->init_rwlock(buf_block_lock_key, rwlock)
+ : NULL;
+
+# ifdef UNIV_SYNC_DEBUG
+ rwlock = &block->debug_latch;
+ ut_a(!rwlock->pfs_psi);
+ rwlock->pfs_psi = (PSI_server)
+ ? PSI_server->init_rwlock(buf_block_debug_latch_key,
+ rwlock)
+ : NULL;
+# endif /* UNIV_SYNC_DEBUG */
+
+# endif /* UNIV_PFS_RWLOCK */
+ block++;
+ }
+}
+# endif /* PFS_GROUP_BUFFER_SYNC */
+
+/********************************************************************//**
+Initializes a buffer control block when the buf_pool is created. */
+static
+void
+buf_block_init(
+/*===========*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_block_t* block, /*!< in: pointer to control block */
+ byte* frame) /*!< in: pointer to buffer frame */
+{
+ UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE);
+
+ block->frame = frame;
+
+ block->page.buf_pool_index = buf_pool_index(buf_pool);
+ block->page.state = BUF_BLOCK_NOT_USED;
+ block->page.buf_fix_count = 0;
+ block->page.io_fix = BUF_IO_NONE;
+
+ block->modify_clock = 0;
+
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ block->page.file_page_was_freed = FALSE;
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+
+ block->check_index_page_at_flush = FALSE;
+ block->index = NULL;
+
+#ifdef UNIV_DEBUG
+ block->page.in_page_hash = FALSE;
+ block->page.in_zip_hash = FALSE;
+ block->page.in_flush_list = FALSE;
+ block->page.in_free_list = FALSE;
+ block->page.in_LRU_list = FALSE;
+ block->in_unzip_LRU_list = FALSE;
+#endif /* UNIV_DEBUG */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ block->n_pointers = 0;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ page_zip_des_init(&block->page.zip);
+
+#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC
+ /* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration
+ of buffer block mutex/rwlock with performance schema. If
+ PFS_GROUP_BUFFER_SYNC is defined, skip the registration
+ since buffer block mutex/rwlock will be registered later in
+ pfs_register_buffer_block() */
+
+ mutex_create(PFS_NOT_INSTRUMENTED, &block->mutex, SYNC_BUF_BLOCK);
+ rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);
+
+# ifdef UNIV_SYNC_DEBUG
+ rw_lock_create(PFS_NOT_INSTRUMENTED,
+ &block->debug_latch, SYNC_NO_ORDER_CHECK);
+# endif /* UNIV_SYNC_DEBUG */
+
+#else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
+ mutex_create(buffer_block_mutex_key, &block->mutex, SYNC_BUF_BLOCK);
+ rw_lock_create(buf_block_lock_key, &block->lock, SYNC_LEVEL_VARYING);
+
+# ifdef UNIV_SYNC_DEBUG
+ rw_lock_create(buf_block_debug_latch_key,
+ &block->debug_latch, SYNC_NO_ORDER_CHECK);
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
+
+ ut_ad(rw_lock_validate(&(block->lock)));
+}
+
+/********************************************************************//**
+Allocates a chunk of buffer frames.
+@return chunk, or NULL on failure */
+static
+buf_chunk_t*
+buf_chunk_init(
+/*===========*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_chunk_t* chunk, /*!< out: chunk of buffers */
+ ulint mem_size) /*!< in: requested size in bytes */
+{
+ buf_block_t* block;
+ byte* frame;
+ ulint i;
+
+ /* Round down to a multiple of page size,
+ although it already should be. */
+ mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);
+ /* Reserve space for the block descriptors. */
+ mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block)
+ + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
+
+ chunk->mem_size = mem_size;
+ chunk->mem = os_mem_alloc_large(&chunk->mem_size);
+
+ if (UNIV_UNLIKELY(chunk->mem == NULL)) {
+
+ return(NULL);
+ }
+
+ /* Allocate the block descriptors from
+ the start of the memory block. */
+ chunk->blocks = (buf_block_t*) chunk->mem;
+
+ /* Align a pointer to the first frame. Note that when
+ os_large_page_size is smaller than UNIV_PAGE_SIZE,
+ we may allocate one fewer block than requested. When
+ it is bigger, we may allocate more blocks than requested. */
+
+ frame = (byte*) ut_align(chunk->mem, UNIV_PAGE_SIZE);
+ chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
+ - (frame != chunk->mem);
+
+ /* Subtract the space needed for block descriptors. */
+ {
+ ulint size = chunk->size;
+
+ while (frame < (byte*) (chunk->blocks + size)) {
+ frame += UNIV_PAGE_SIZE;
+ size--;
+ }
+
+ chunk->size = size;
+ }
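+
+	/* Numeric sketch: one UNIV_PAGE_SIZE frame holds
+	UNIV_PAGE_SIZE / sizeof(buf_block_t) descriptors, so the loop
+	above trims roughly one frame per that many blocks off the
+	usable chunk size. */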
+
+	/* Init the block structs and assign frames to them; the
+	memory for the frames was already mapped above. */
+
+ block = chunk->blocks;
+
+ for (i = chunk->size; i--; ) {
+
+ buf_block_init(buf_pool, block, frame);
+ UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);
+
+ /* Add the block to the free list */
+ UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page));
+
+ ut_d(block->page.in_free_list = TRUE);
+ ut_ad(buf_pool_from_block(block) == buf_pool);
+
+ block++;
+ frame += UNIV_PAGE_SIZE;
+ }
+
+#ifdef PFS_GROUP_BUFFER_SYNC
+ pfs_register_buffer_block(chunk);
+#endif
+ return(chunk);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Finds a block in the given buffer chunk that points to a
+given compressed page.
+@return buffer block pointing to the compressed page, or NULL */
+static
+buf_block_t*
+buf_chunk_contains_zip(
+/*===================*/
+ buf_chunk_t* chunk, /*!< in: chunk being checked */
+ const void* data) /*!< in: pointer to compressed page */
+{
+ buf_block_t* block;
+ ulint i;
+
+ block = chunk->blocks;
+
+ for (i = chunk->size; i--; block++) {
+ if (block->page.zip.data == data) {
+
+ return(block);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Finds a block in the buffer pool that points to a
+given compressed page.
+@return buffer block pointing to the compressed page, or NULL */
+UNIV_INTERN
+buf_block_t*
+buf_pool_contains_zip(
+/*==================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ const void* data) /*!< in: pointer to compressed page */
+{
+ ulint n;
+ buf_chunk_t* chunk = buf_pool->chunks;
+
+ ut_ad(buf_pool);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ for (n = buf_pool->n_chunks; n--; chunk++) {
+
+ buf_block_t* block = buf_chunk_contains_zip(chunk, data);
+
+ if (block) {
+ return(block);
+ }
+ }
+
+ return(NULL);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Checks that all file pages in the buffer chunk are in a replaceable state.
+@return address of a non-free block, or NULL if all freed */
+static
+const buf_block_t*
+buf_chunk_not_freed(
+/*================*/
+ buf_chunk_t* chunk) /*!< in: chunk being checked */
+{
+ buf_block_t* block;
+ ulint i;
+
+ block = chunk->blocks;
+
+ for (i = chunk->size; i--; block++) {
+ ibool ready;
+
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ /* The uncompressed buffer pool should never
+ contain compressed block descriptors. */
+ ut_error;
+ break;
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ /* Skip blocks that are not being used for
+ file pages. */
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ mutex_enter(&block->mutex);
+ ready = buf_flush_ready_for_replace(&block->page);
+ mutex_exit(&block->mutex);
+
+ if (!ready) {
+
+ return(block);
+ }
+
+ break;
+ }
+ }
+
+ return(NULL);
+}
+
+/********************************************************************//**
+Set buffer pool size variables after resizing it */
+static
+void
+buf_pool_set_sizes(void)
+/*====================*/
+{
+ ulint i;
+ ulint curr_size = 0;
+
+ buf_pool_mutex_enter_all();
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+ curr_size += buf_pool->curr_pool_size;
+ }
+
+ srv_buf_pool_curr_size = curr_size;
+ srv_buf_pool_old_size = srv_buf_pool_size;
+
+ buf_pool_mutex_exit_all();
+}
+
+/********************************************************************//**
+Initialize a buffer pool instance.
+@return DB_SUCCESS if all goes well. */
+UNIV_INTERN
+ulint
+buf_pool_init_instance(
+/*===================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint buf_pool_size, /*!< in: size in bytes */
+ ulint instance_no) /*!< in: id of the instance */
+{
+ ulint i;
+ buf_chunk_t* chunk;
+
+ /* 1. Initialize general fields
+ ------------------------------- */
+ mutex_create(buf_pool_mutex_key,
+ &buf_pool->mutex, SYNC_BUF_POOL);
+ mutex_create(buf_pool_zip_mutex_key,
+ &buf_pool->zip_mutex, SYNC_BUF_BLOCK);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ if (buf_pool_size > 0) {
+ buf_pool->n_chunks = 1;
+
+ buf_pool->chunks = chunk =
+ (buf_chunk_t*) mem_zalloc(sizeof *chunk);
+
+ UT_LIST_INIT(buf_pool->free);
+
+ if (!buf_chunk_init(buf_pool, chunk, buf_pool_size)) {
+ mem_free(chunk);
+ mem_free(buf_pool);
+
+ buf_pool_mutex_exit(buf_pool);
+
+ return(DB_ERROR);
+ }
+
+ buf_pool->instance_no = instance_no;
+ buf_pool->old_pool_size = buf_pool_size;
+ buf_pool->curr_size = chunk->size;
+ buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
+
+ /* Number of locks protecting page_hash must be a
+ power of two */
+ srv_n_page_hash_locks =
+ ut_2_power_up(srv_n_page_hash_locks);
+ ut_a(srv_n_page_hash_locks != 0);
+ ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);
+
+ buf_pool->page_hash = ha_create(2 * buf_pool->curr_size,
+ srv_n_page_hash_locks,
+ MEM_HEAP_FOR_PAGE_HASH,
+ SYNC_BUF_PAGE_HASH);
+
+ buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
+
+ buf_pool->last_printout_time = ut_time();
+ }
+ /* 2. Initialize flushing fields
+ -------------------------------- */
+
+ mutex_create(flush_list_mutex_key, &buf_pool->flush_list_mutex,
+ SYNC_BUF_FLUSH_LIST);
+
+ for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
+ buf_pool->no_flush[i] = os_event_create(NULL);
+ }
+
+ buf_pool->watch = (buf_page_t*) mem_zalloc(
+ sizeof(*buf_pool->watch) * BUF_POOL_WATCH_SIZE);
+
+ /* All fields are initialized by mem_zalloc(). */
+
+ buf_pool->try_LRU_scan = TRUE;
+
+ buf_pool_mutex_exit(buf_pool);
+
+ return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Free one buffer pool instance. */
+static
+void
+buf_pool_free_instance(
+/*===================*/
+	buf_pool_t*	buf_pool)	/*!< in,own: buffer pool instance
+					to free */
+{
+ buf_chunk_t* chunk;
+ buf_chunk_t* chunks;
+ buf_page_t* bpage;
+
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ while (bpage != NULL) {
+ buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+ enum buf_page_state state = buf_page_get_state(bpage);
+
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(bpage->in_LRU_list);
+
+ if (state != BUF_BLOCK_FILE_PAGE) {
+ /* We must not have any dirty block except
+ when doing a fast shutdown. */
+ ut_ad(state == BUF_BLOCK_ZIP_PAGE
+ || srv_fast_shutdown == 2);
+ buf_page_free_descriptor(bpage);
+ }
+
+ bpage = prev_bpage;
+ }
+
+ mem_free(buf_pool->watch);
+ buf_pool->watch = NULL;
+
+ chunks = buf_pool->chunks;
+ chunk = chunks + buf_pool->n_chunks;
+
+ while (--chunk >= chunks) {
+ os_mem_free_large(chunk->mem, chunk->mem_size);
+ }
+
+ mem_free(buf_pool->chunks);
+ ha_clear(buf_pool->page_hash);
+ hash_table_free(buf_pool->page_hash);
+ hash_table_free(buf_pool->zip_hash);
+}
+
+/********************************************************************//**
+Creates the buffer pool.
+@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
+UNIV_INTERN
+ulint
+buf_pool_init(
+/*==========*/
+ ulint total_size, /*!< in: size of the total pool in bytes */
+ ulint n_instances) /*!< in: number of instances */
+{
+ ulint i;
+ const ulint size = total_size / n_instances;
+
+ ut_ad(n_instances > 0);
+ ut_ad(n_instances <= MAX_BUFFER_POOLS);
+ ut_ad(n_instances == srv_buf_pool_instances);
+
+ buf_pool_ptr = (buf_pool_t*) mem_zalloc(
+ n_instances * sizeof *buf_pool_ptr);
+
+ for (i = 0; i < n_instances; i++) {
+ buf_pool_t* ptr = &buf_pool_ptr[i];
+
+ if (buf_pool_init_instance(ptr, size, i) != DB_SUCCESS) {
+
+ /* Free all the instances created so far. */
+ buf_pool_free(i);
+
+ return(DB_ERROR);
+ }
+ }
+
+ buf_pool_set_sizes();
+ buf_LRU_old_ratio_update(100 * 3/ 8, FALSE);
+
+ btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64);
+
+ return(DB_SUCCESS);
+}
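+
+/* Usage sketch (illustrative): at startup the server creates the pool
+from the configured sizes, roughly:
+
+	if (buf_pool_init(srv_buf_pool_size, srv_buf_pool_instances)
+	    != DB_SUCCESS) {
+		... abort startup: not enough memory ...
+	}
+*/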
+
+/********************************************************************//**
+Frees the buffer pool at shutdown. This must not be invoked before
+freeing all mutexes. */
+UNIV_INTERN
+void
+buf_pool_free(
+/*==========*/
+	ulint	n_instances)	/*!< in: number of instances to free */
+{
+ ulint i;
+
+ for (i = 0; i < n_instances; i++) {
+ buf_pool_free_instance(buf_pool_from_array(i));
+ }
+
+ mem_free(buf_pool_ptr);
+ buf_pool_ptr = NULL;
+}
+
+/********************************************************************//**
+Clears the adaptive hash index on all pages in the buffer pool. */
+UNIV_INTERN
+void
+buf_pool_clear_hash_index(void)
+/*===========================*/
+{
+ ulint p;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(!btr_search_enabled);
+
+ for (p = 0; p < srv_buf_pool_instances; p++) {
+ buf_pool_t* buf_pool = buf_pool_from_array(p);
+ buf_chunk_t* chunks = buf_pool->chunks;
+ buf_chunk_t* chunk = chunks + buf_pool->n_chunks;
+
+ while (--chunk >= chunks) {
+ buf_block_t* block = chunk->blocks;
+ ulint i = chunk->size;
+
+ for (; i--; block++) {
+ dict_index_t* index = block->index;
+
+ /* We can set block->index = NULL
+ when we have an x-latch on btr_search_latch;
+ see the comment in buf0buf.h */
+
+ if (!index) {
+ /* Not hashed */
+ continue;
+ }
+
+ block->index = NULL;
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ block->n_pointers = 0;
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ }
+ }
+ }
+}
+
+/********************************************************************//**
+Relocate a buffer control block. Relocates the block on the LRU list
+and in buf_pool->page_hash. Does not relocate bpage->list.
+The caller must take care of relocating bpage->list. */
+UNIV_INTERN
+void
+buf_relocate(
+/*=========*/
+ buf_page_t* bpage, /*!< in/out: control block being relocated;
+ buf_page_get_state(bpage) must be
+ BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
+ buf_page_t* dpage) /*!< in/out: destination control block */
+{
+ buf_page_t* b;
+ ulint fold;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ fold = buf_page_address_fold(bpage->space, bpage->offset);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(buf_page_hash_lock_held_x(buf_pool, bpage));
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+ ut_a(bpage->buf_fix_count == 0);
+ ut_ad(bpage->in_LRU_list);
+ ut_ad(!bpage->in_zip_hash);
+ ut_ad(bpage->in_page_hash);
+ ut_ad(bpage == buf_page_hash_get_low(buf_pool,
+ bpage->space,
+ bpage->offset,
+ fold));
+
+ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+#ifdef UNIV_DEBUG
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_FILE_PAGE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_ZIP_PAGE:
+ break;
+ }
+#endif /* UNIV_DEBUG */
+
+ memcpy(dpage, bpage, sizeof *dpage);
+
+ ut_d(bpage->in_LRU_list = FALSE);
+ ut_d(bpage->in_page_hash = FALSE);
+
+ /* relocate buf_pool->LRU */
+ b = UT_LIST_GET_PREV(LRU, bpage);
+ UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage);
+
+ if (b) {
+ UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, b, dpage);
+ } else {
+ UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, dpage);
+ }
+
+ if (UNIV_UNLIKELY(buf_pool->LRU_old == bpage)) {
+ buf_pool->LRU_old = dpage;
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool->LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool->LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
+ || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
+ || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
+ } else {
+ /* Check that the "old" flag is consistent in
+ the block and its neighbours. */
+ buf_page_set_old(dpage, buf_page_is_old(dpage));
+#endif /* UNIV_LRU_DEBUG */
+ }
+
+ ut_d(UT_LIST_VALIDATE(
+ LRU, buf_page_t, buf_pool->LRU, CheckInLRUList()));
+
+ /* relocate buf_pool->page_hash */
+ HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
+}
+
+/********************************************************************//**
+Determine if a block is a sentinel for a buffer pool watch.
+@return TRUE if a sentinel for a buffer pool watch, FALSE if not */
+UNIV_INTERN
+ibool
+buf_pool_watch_is_sentinel(
+/*=======================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ const buf_page_t* bpage) /*!< in: block */
+{
+ /* We must also own the appropriate hash lock. */
+ ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage));
+ ut_ad(buf_page_in_file(bpage));
+
+ if (bpage < &buf_pool->watch[0]
+ || bpage >= &buf_pool->watch[BUF_POOL_WATCH_SIZE]) {
+
+ ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_PAGE
+ || bpage->zip.data != NULL);
+
+ return(FALSE);
+ }
+
+ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
+ ut_ad(!bpage->in_zip_hash);
+ ut_ad(bpage->in_page_hash);
+ ut_ad(bpage->zip.data == NULL);
+ ut_ad(bpage->buf_fix_count > 0);
+ return(TRUE);
+}
+
+/****************************************************************//**
+Add watch for the given page to be read in. Caller must have
+appropriate hash_lock for the bpage. This function may release the
+hash_lock and reacquire it.
+@return NULL if watch set, block if the page is in the buffer pool */
+UNIV_INTERN
+buf_page_t*
+buf_pool_watch_set(
+/*===============*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page number */
+ ulint fold) /*!< in: buf_page_address_fold(space, offset) */
+{
+ buf_page_t* bpage;
+ ulint i;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ rw_lock_t* hash_lock;
+
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
+
+ if (UNIV_LIKELY_NULL(bpage)) {
+page_found:
+ if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
+ /* The page was loaded meanwhile. */
+ return(bpage);
+ }
+ /* Add to an existing watch. */
+ bpage->buf_fix_count++;
+ return(NULL);
+ }
+
+ /* From this point this function becomes fairly heavy in terms
+ of latching. We acquire the buf_pool mutex as well as all the
+ hash_locks. buf_pool mutex is needed because any changes to
+ the page_hash must be covered by it and hash_locks are needed
+ because we don't want to read any stale information in
+ buf_pool->watch[]. However, it is not in the critical code path
+ as this function will be called only by the purge thread. */
+
+	/* To obey the latching order, first release the hash_lock. */
+ rw_lock_x_unlock(hash_lock);
+
+ buf_pool_mutex_enter(buf_pool);
+ hash_lock_x_all(buf_pool->page_hash);
+
+	/* We have to recheck that the page was not loaded, and that
+	a watch was not set by some other purge thread, because of the
+	small time window between releasing the hash_lock above and
+	acquiring the buf_pool mutex. */
+
+ bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
+ if (UNIV_LIKELY_NULL(bpage)) {
+ buf_pool_mutex_exit(buf_pool);
+ hash_unlock_x_all_but(buf_pool->page_hash, hash_lock);
+ goto page_found;
+ }
+
+ for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
+ bpage = &buf_pool->watch[i];
+
+ ut_ad(bpage->access_time == 0);
+ ut_ad(bpage->newest_modification == 0);
+ ut_ad(bpage->oldest_modification == 0);
+ ut_ad(bpage->zip.data == NULL);
+ ut_ad(!bpage->in_zip_hash);
+
+ switch (bpage->state) {
+ case BUF_BLOCK_POOL_WATCH:
+ ut_ad(!bpage->in_page_hash);
+ ut_ad(bpage->buf_fix_count == 0);
+
+ /* bpage is pointing to buf_pool->watch[],
+ which is protected by buf_pool->mutex.
+ Normally, buf_page_t objects are protected by
+ buf_block_t::mutex or buf_pool->zip_mutex or both. */
+
+ bpage->state = BUF_BLOCK_ZIP_PAGE;
+ bpage->space = space;
+ bpage->offset = offset;
+ bpage->buf_fix_count = 1;
+
+ ut_d(bpage->in_page_hash = TRUE);
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
+ fold, bpage);
+
+ buf_pool_mutex_exit(buf_pool);
+ /* Once the sentinel is in the page_hash we can
+ safely release all locks except just the
+ relevant hash_lock */
+ hash_unlock_x_all_but(buf_pool->page_hash,
+ hash_lock);
+
+ return(NULL);
+ case BUF_BLOCK_ZIP_PAGE:
+ ut_ad(bpage->in_page_hash);
+ ut_ad(bpage->buf_fix_count > 0);
+ break;
+ default:
+ ut_error;
+ }
+ }
+
+ /* Allocation failed. Either the maximum number of purge
+ threads should never exceed BUF_POOL_WATCH_SIZE, or this code
+ should be modified to return a special non-NULL value and the
+ caller should purge the record directly. */
+ ut_error;
+
+ /* Fix compiler warning */
+ return(NULL);
+}
+
+/****************************************************************//**
+Remove the sentinel block for the watch before replacing it with a real block.
+buf_pool_watch_unset() or buf_pool_watch_occurred() will notice that
+the block has been replaced with the real block. */
+static
+void
+buf_pool_watch_remove(
+/*==================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ ulint fold, /*!< in: buf_page_address_fold(
+ space, offset) */
+ buf_page_t* watch) /*!< in/out: sentinel for watch */
+{
+#ifdef UNIV_SYNC_DEBUG
+ /* We must also own the appropriate hash_bucket mutex. */
+ rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+ ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, watch);
+ ut_d(watch->in_page_hash = FALSE);
+ watch->buf_fix_count = 0;
+ watch->state = BUF_BLOCK_POOL_WATCH;
+}
+
+/****************************************************************//**
+Stop watching if the page has been read in.
+buf_pool_watch_set(space,offset) must have returned NULL before. */
+UNIV_INTERN
+void
+buf_pool_watch_unset(
+/*=================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ ulint fold = buf_page_address_fold(space, offset);
+ rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool,
+ fold);
+
+	/* We only need the buf_pool mutex in the case where we end
+	up calling buf_pool_watch_remove, but to obey the latching
+	order we acquire it here before acquiring the hash_lock. This
+	should not cause too much grief as this function is only ever
+	called from the purge thread. */
+ buf_pool_mutex_enter(buf_pool);
+
+ rw_lock_x_lock(hash_lock);
+
+ bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
+ /* The page must exist because buf_pool_watch_set()
+ increments buf_fix_count. */
+ ut_a(bpage);
+
+ if (UNIV_UNLIKELY(!buf_pool_watch_is_sentinel(buf_pool, bpage))) {
+ mutex_t* mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(mutex);
+ ut_a(bpage->buf_fix_count > 0);
+ bpage->buf_fix_count--;
+ mutex_exit(mutex);
+ } else {
+ ut_a(bpage->buf_fix_count > 0);
+
+ if (UNIV_LIKELY(!--bpage->buf_fix_count)) {
+ buf_pool_watch_remove(buf_pool, fold, bpage);
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+ rw_lock_x_unlock(hash_lock);
+}
+
+/****************************************************************//**
+Check if the page has been read in.
+This may only be called after buf_pool_watch_set(space,offset)
+has returned NULL and before invoking buf_pool_watch_unset(space,offset).
+@return FALSE if the given page was not read in, TRUE if it was */
+UNIV_INTERN
+ibool
+buf_pool_watch_occurred(
+/*====================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ ibool ret;
+ buf_page_t* bpage;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ ulint fold = buf_page_address_fold(space, offset);
+ rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool,
+ fold);
+
+ rw_lock_s_lock(hash_lock);
+
+ bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
+ /* The page must exist because buf_pool_watch_set()
+ increments buf_fix_count. */
+ ut_a(bpage);
+ ret = !buf_pool_watch_is_sentinel(buf_pool, bpage);
+ rw_lock_s_unlock(hash_lock);
+
+ return(ret);
+}
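+
+/* Usage sketch (illustrative): the purge thread ties the three watch
+functions together roughly as follows; a non-NULL return from
+buf_pool_watch_set() means the page was already in the pool and no
+watch was set:
+
+	if (buf_pool_watch_set(space, offset, fold) == NULL) {
+		... do the work that must detect a concurrent read ...
+		if (buf_pool_watch_occurred(space, offset)) {
+			... the page was read in meanwhile ...
+		}
+		buf_pool_watch_unset(space, offset);
+	}
+*/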
+
+/********************************************************************//**
+Moves a page to the start of the buffer pool LRU list. This high-level
+function can be used to prevent an important page from slipping out of
+the buffer pool. */
+UNIV_INTERN
+void
+buf_page_make_young(
+/*================*/
+ buf_page_t* bpage) /*!< in: buffer block of a file page */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ ut_a(buf_page_in_file(bpage));
+
+ buf_LRU_make_block_young(bpage);
+
+ buf_pool_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Sets the time of the first access of a page and moves a page to the
+start of the buffer pool LRU list if it is too old. This high-level
+function can be used to prevent an important page from slipping
+out of the buffer pool. */
+static
+void
+buf_page_set_accessed_make_young(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: buffer block of a
+ file page */
+ unsigned access_time) /*!< in: bpage->access_time
+ read under mutex protection,
+ or 0 if unknown */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(!buf_pool_mutex_own(buf_pool));
+ ut_a(buf_page_in_file(bpage));
+
+ if (buf_page_peek_if_too_old(bpage)) {
+ buf_pool_mutex_enter(buf_pool);
+ buf_LRU_make_block_young(bpage);
+ buf_pool_mutex_exit(buf_pool);
+ } else if (!access_time) {
+ ulint time_ms = ut_time_ms();
+ buf_pool_mutex_enter(buf_pool);
+ buf_page_set_accessed(bpage, time_ms);
+ buf_pool_mutex_exit(buf_pool);
+ }
+}
+
+/********************************************************************//**
+Resets the check_index_page_at_flush field of a page if found in the buffer
+pool. */
+UNIV_INTERN
+void
+buf_reset_check_index_page_at_flush(
+/*================================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_block_t* block;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ block = (buf_block_t*) buf_page_hash_get(buf_pool, space, offset);
+
+ if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) {
+ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
+ block->check_index_page_at_flush = FALSE;
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+}
+
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+/********************************************************************//**
+Sets file_page_was_freed TRUE if the page is found in the buffer pool.
+This function should be called when we free a file page and want the
+debug version to check that it is not accessed any more unless
+reallocated.
+@return control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_set_file_page_was_freed(
+/*=============================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ rw_lock_t* hash_lock;
+
+ bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
+ &hash_lock);
+
+ if (bpage) {
+ mutex_t* block_mutex = buf_page_get_mutex(bpage);
+ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+ mutex_enter(block_mutex);
+ rw_lock_s_unlock(hash_lock);
+ /* bpage->file_page_was_freed can already hold
+ when this code is invoked from dict_drop_index_tree() */
+ bpage->file_page_was_freed = TRUE;
+ mutex_exit(block_mutex);
+ }
+
+ return(bpage);
+}
+
+/********************************************************************//**
+Sets file_page_was_freed FALSE if the page is found in the buffer pool.
+This function should be called when we reallocate a previously freed
+file page, so that the debug version no longer flags accesses to it.
+@return control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_reset_file_page_was_freed(
+/*===============================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ rw_lock_t* hash_lock;
+
+ bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
+ &hash_lock);
+ if (bpage) {
+ mutex_t* block_mutex = buf_page_get_mutex(bpage);
+ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+ mutex_enter(block_mutex);
+ rw_lock_s_unlock(hash_lock);
+ bpage->file_page_was_freed = FALSE;
+ mutex_exit(block_mutex);
+ }
+
+ return(bpage);
+}
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+
+/********************************************************************//**
+Attempts to discard the uncompressed frame of a compressed page. The
+caller should not be holding any mutexes when this function is called. */
+static
+void
+buf_block_try_discard_uncompressed(
+/*===============================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+	/* Since we need to acquire the buf_pool mutex to discard
+	the uncompressed frame, and the page_hash mutex resides
+	below the buf_pool mutex in the sync ordering, we must not
+	be holding the page_hash mutex here. This means that the
+	block in question can move out of page_hash, so we have to
+	look it up again under the buf_pool mutex. */
+ buf_pool_mutex_enter(buf_pool);
+
+ bpage = buf_page_hash_get(buf_pool, space, offset);
+
+ if (bpage) {
+ buf_LRU_free_block(bpage, FALSE);
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with buf_page_release_zip().
+NOTE: the page is not protected by any latch. Mutual exclusion has to
+be implemented at a higher level. In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+@return pointer to the block */
+UNIV_INTERN
+buf_page_t*
+buf_page_get_zip(
+/*=============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ mutex_t* block_mutex;
+ rw_lock_t* hash_lock;
+ ibool discard_attempted = FALSE;
+ ibool must_read;
+ unsigned access_time;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+ buf_pool->stat.n_page_gets++;
+
+ for (;;) {
+lookup:
+
+ /* The following call will also grab the page_hash
+ mutex if the page is found. */
+ bpage = buf_page_hash_get_s_locked(buf_pool, space,
+ offset, &hash_lock);
+ if (bpage) {
+ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+ break;
+ }
+
+ /* Page not in buf_pool: needs to be read from file */
+
+ ut_ad(!hash_lock);
+ buf_read_page(space, zip_size, offset);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
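+		/* buf_validate() is expensive; the prime modulus throttles
+		it to roughly one call per 37 page gets in debug builds. */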
+ ut_a(++buf_dbg_counter % 37 || buf_validate());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ }
+
+ ut_ad(buf_page_hash_lock_held_s(buf_pool, bpage));
+
+ if (!bpage->zip.data) {
+ /* There is no compressed page. */
+err_exit:
+ rw_lock_s_unlock(hash_lock);
+ return(NULL);
+ }
+
+ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ case BUF_BLOCK_ZIP_FREE:
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ block_mutex = &buf_pool->zip_mutex;
+ mutex_enter(block_mutex);
+ bpage->buf_fix_count++;
+ goto got_block;
+ case BUF_BLOCK_FILE_PAGE:
+ /* Discard the uncompressed page frame if possible. */
+ if (!discard_attempted) {
+ rw_lock_s_unlock(hash_lock);
+ buf_block_try_discard_uncompressed(space,
+ offset);
+ discard_attempted = TRUE;
+ goto lookup;
+ }
+
+ block_mutex = &((buf_block_t*) bpage)->mutex;
+ mutex_enter(block_mutex);
+ buf_block_buf_fix_inc((buf_block_t*) bpage,
+ __FILE__, __LINE__);
+ goto got_block;
+ }
+
+ ut_error;
+ goto err_exit;
+
+got_block:
+ must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ;
+ access_time = buf_page_is_accessed(bpage);
+
+ rw_lock_s_unlock(hash_lock);
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ ut_a(!bpage->file_page_was_freed);
+#endif
+ mutex_exit(block_mutex);
+
+ buf_page_set_accessed_make_young(bpage, access_time);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(bpage->buf_fix_count > 0);
+ ut_a(buf_page_in_file(bpage));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ if (must_read) {
+ /* Let us wait until the read operation
+ completes */
+
+ for (;;) {
+ enum buf_io_fix io_fix;
+
+ mutex_enter(block_mutex);
+ io_fix = buf_page_get_io_fix(bpage);
+ mutex_exit(block_mutex);
+
+ if (io_fix == BUF_IO_READ) {
+
+ os_thread_sleep(WAIT_FOR_READ);
+ } else {
+ break;
+ }
+ }
+ }
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_page_get_space(bpage),
+ buf_page_get_page_no(bpage)) == 0);
+#endif
+ return(bpage);
+}
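+
+/* A minimal caller-side sketch (illustrative; as noted above, the
+caller must serialize accesses to the page at a higher level):
+
+	buf_page_t*	bpage = buf_page_get_zip(space, zip_size, offset);
+
+	if (bpage != NULL) {
+		... read the compressed frame via bpage->zip.data ...
+		buf_page_release_zip(bpage);
+	}
+*/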
+
+/********************************************************************//**
+Initialize some fields of a control block. */
+UNIV_INLINE
+void
+buf_block_init_low(
+/*===============*/
+ buf_block_t* block) /*!< in: block to init */
+{
+ block->check_index_page_at_flush = FALSE;
+ block->index = NULL;
+
+ block->n_hash_helps = 0;
+ block->n_fields = 1;
+ block->n_bytes = 0;
+ block->left_side = TRUE;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Decompress a block.
+@return TRUE if successful */
+UNIV_INTERN
+ibool
+buf_zip_decompress(
+/*===============*/
+ buf_block_t* block, /*!< in/out: block */
+ ibool check) /*!< in: TRUE=verify the page checksum */
+{
+ const byte* frame = block->page.zip.data;
+ ulint size = page_zip_get_size(&block->page.zip);
+
+ ut_ad(buf_block_get_zip_size(block));
+ ut_a(buf_block_get_space(block) != 0);
+
+ if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: compressed page checksum mismatch"
+ " (space %u page %u): stored: %lu, crc32: %lu "
+ "innodb: %lu, none: %lu\n",
+ block->page.space, block->page.offset,
+ mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM),
+ page_zip_calc_checksum(frame, size,
+ SRV_CHECKSUM_ALGORITHM_CRC32),
+ page_zip_calc_checksum(frame, size,
+ SRV_CHECKSUM_ALGORITHM_INNODB),
+ page_zip_calc_checksum(frame, size,
+ SRV_CHECKSUM_ALGORITHM_NONE));
+ return(FALSE);
+ }
+
+ switch (fil_page_get_type(frame)) {
+ case FIL_PAGE_INDEX:
+ if (page_zip_decompress(&block->page.zip,
+ block->frame, TRUE)) {
+ return(TRUE);
+ }
+
+ fprintf(stderr,
+ "InnoDB: unable to decompress space %lu page %lu\n",
+ (ulong) block->page.space,
+ (ulong) block->page.offset);
+ return(FALSE);
+
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ /* Copy to uncompressed storage. */
+ memcpy(block->frame, frame,
+ buf_block_get_zip_size(block));
+ return(TRUE);
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: unknown compressed page"
+ " type %lu\n",
+ fil_page_get_type(frame));
+ return(FALSE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Gets the block whose frame the pointer points into, if the frame is
+found in this buffer pool instance.
+@return pointer to block */
+UNIV_INTERN
+buf_block_t*
+buf_block_align_instance(
+/*=====================*/
+ buf_pool_t* buf_pool, /*!< in: buffer in which the block
+ resides */
+ const byte* ptr) /*!< in: pointer to a frame */
+{
+ buf_chunk_t* chunk;
+ ulint i;
+
+ /* TODO: protect buf_pool->chunks with a mutex (it will
+ currently remain constant after buf_pool_init()) */
+ for (chunk = buf_pool->chunks, i = buf_pool->n_chunks; i--; chunk++) {
+ ulint offs;
+
+ if (UNIV_UNLIKELY(ptr < chunk->blocks->frame)) {
+
+ continue;
+ }
+ /* else */
+
+ offs = ptr - chunk->blocks->frame;
+
+ offs >>= UNIV_PAGE_SIZE_SHIFT;
+
+ if (UNIV_LIKELY(offs < chunk->size)) {
+ buf_block_t* block = &chunk->blocks[offs];
+
+ /* The function buf_chunk_init() invokes
+ buf_block_init() so that block[n].frame ==
+ block->frame + n * UNIV_PAGE_SIZE. Check it. */
+ ut_ad(block->frame == page_align(ptr));
+#ifdef UNIV_DEBUG
+ /* A thread that updates these fields must
+ hold buf_pool->mutex and block->mutex. Acquire
+ only the latter. */
+ mutex_enter(&block->mutex);
+
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ /* These types should only be used in
+ the compressed buffer pool, whose
+ memory is allocated from
+ buf_pool->chunks, in UNIV_PAGE_SIZE
+ blocks flagged as BUF_BLOCK_MEMORY. */
+ ut_error;
+ break;
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ /* Some data structures contain
+ "guess" pointers to file pages. The
+ file pages may have been freed and
+ reused. Do not complain. */
+ break;
+ case BUF_BLOCK_REMOVE_HASH:
+ /* buf_LRU_block_remove_hashed_page()
+ will overwrite the FIL_PAGE_OFFSET and
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID with
+ 0xff and set the state to
+ BUF_BLOCK_REMOVE_HASH. */
+ ut_ad(page_get_space_id(page_align(ptr))
+ == 0xffffffff);
+ ut_ad(page_get_page_no(page_align(ptr))
+ == 0xffffffff);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ ut_ad(block->page.space
+ == page_get_space_id(page_align(ptr)));
+ ut_ad(block->page.offset
+ == page_get_page_no(page_align(ptr)));
+ break;
+ }
+
+ mutex_exit(&block->mutex);
+#endif /* UNIV_DEBUG */
+
+ return(block);
+ }
+ }
+
+ return(NULL);
+}
+
+/*******************************************************************//**
+Gets the block whose frame the pointer points into.
+@return pointer to block, never NULL */
+UNIV_INTERN
+buf_block_t*
+buf_block_align(
+/*============*/
+ const byte* ptr) /*!< in: pointer to a frame */
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_block_t* block;
+
+ block = buf_block_align_instance(
+ buf_pool_from_array(i), ptr);
+ if (block) {
+ return(block);
+ }
+ }
+
+ /* The block should always be found. */
+ ut_error;
+ return(NULL);
+}
+
+/********************************************************************//**
+Find out if a pointer belongs to a buf_block_t. It can be a pointer to
+the buf_block_t itself or a member of it. This function checks one
+buffer pool instance.
+@return TRUE if ptr belongs to a buf_block_t struct */
+static
+ibool
+buf_pointer_is_block_field_instance(
+/*================================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ const void* ptr) /*!< in: pointer not dereferenced */
+{
+ const buf_chunk_t* chunk = buf_pool->chunks;
+ const buf_chunk_t* const echunk = chunk + buf_pool->n_chunks;
+
+ /* TODO: protect buf_pool->chunks with a mutex (it will
+ currently remain constant after buf_pool_init()) */
+ while (chunk < echunk) {
+ if (ptr >= (void*) chunk->blocks
+ && ptr < (void*) (chunk->blocks + chunk->size)) {
+
+ return(TRUE);
+ }
+
+ chunk++;
+ }
+
+ return(FALSE);
+}
+
+/********************************************************************//**
+Find out if a pointer belongs to a buf_block_t. It can be a pointer to
+the buf_block_t itself or a member of it.
+@return TRUE if ptr belongs to a buf_block_t struct */
+UNIV_INTERN
+ibool
+buf_pointer_is_block_field(
+/*=======================*/
+ const void* ptr) /*!< in: pointer not dereferenced */
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ ibool found;
+
+ found = buf_pointer_is_block_field_instance(
+ buf_pool_from_array(i), ptr);
+ if (found) {
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/********************************************************************//**
+Find out if a buffer block was created by buf_chunk_init().
+@return TRUE if "block" has been added to buf_pool->free by buf_chunk_init() */
+static
+ibool
+buf_block_is_uncompressed(
+/*======================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ const buf_block_t* block) /*!< in: pointer to block,
+ not dereferenced */
+{
+ if (UNIV_UNLIKELY((((ulint) block) % sizeof *block) != 0)) {
+ /* The pointer should be aligned. */
+ return(FALSE);
+ }
+
+ return(buf_pointer_is_block_field_instance(buf_pool, (void*) block));
+}
+
+/********************************************************************//**
+This is the general function used to get access to a database page.
+@return pointer to the block or NULL */
+UNIV_INTERN
+buf_block_t*
+buf_page_get_gen(
+/*=============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint offset, /*!< in: page number */
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
+ buf_block_t* guess, /*!< in: guessed block or NULL */
+	ulint		mode,	/*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
+				BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH,
+				BUF_GET_IF_IN_POOL_OR_WATCH, or
+				BUF_GET_POSSIBLY_FREED */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ buf_block_t* block;
+ ulint fold;
+ unsigned access_time;
+ ulint fix_type;
+ ibool must_read;
+ rw_lock_t* hash_lock;
+ mutex_t* block_mutex;
+ buf_page_t* hash_bpage;
+ ulint retries = 0;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad((rw_latch == RW_S_LATCH)
+ || (rw_latch == RW_X_LATCH)
+ || (rw_latch == RW_NO_LATCH));
+#ifdef UNIV_DEBUG
+ switch (mode) {
+ case BUF_GET_NO_LATCH:
+ ut_ad(rw_latch == RW_NO_LATCH);
+ break;
+ case BUF_GET:
+ case BUF_GET_IF_IN_POOL:
+ case BUF_PEEK_IF_IN_POOL:
+ case BUF_GET_IF_IN_POOL_OR_WATCH:
+ case BUF_GET_POSSIBLY_FREED:
+ break;
+ default:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+ ut_ad(zip_size == fil_space_get_zip_size(space));
+ ut_ad(ut_is_2pow(zip_size));
+#ifndef UNIV_LOG_DEBUG
+ ut_ad(!ibuf_inside(mtr)
+ || ibuf_page_low(space, zip_size, offset,
+ FALSE, file, line, NULL));
+#endif
+ buf_pool->stat.n_page_gets++;
+ fold = buf_page_address_fold(space, offset);
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+loop:
+ block = guess;
+
+ rw_lock_s_lock(hash_lock);
+ if (block) {
+ /* If the guess is a compressed page descriptor that
+ has been allocated by buf_page_alloc_descriptor(),
+ it may have been freed by buf_relocate(). */
+
+ if (!buf_block_is_uncompressed(buf_pool, block)
+ || offset != block->page.offset
+ || space != block->page.space
+ || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
+
+ /* Our guess was bogus or things have changed
+ since. */
+ block = guess = NULL;
+ } else {
+ ut_ad(!block->page.in_zip_hash);
+ ut_ad(block->page.in_page_hash);
+ }
+ }
+
+ if (block == NULL) {
+ block = (buf_block_t*) buf_page_hash_get_low(
+ buf_pool, space, offset, fold);
+ }
+
+ if (!block || buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
+ rw_lock_s_unlock(hash_lock);
+ block = NULL;
+ }
+
+ if (block == NULL) {
+ /* Page not in buf_pool: needs to be read from file */
+
+ if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+ rw_lock_x_lock(hash_lock);
+ block = (buf_block_t*) buf_pool_watch_set(
+ space, offset, fold);
+
+ if (UNIV_LIKELY_NULL(block)) {
+ /* We can release hash_lock after we
+ acquire block_mutex to make sure that
+ no state change takes place. */
+ block_mutex = buf_page_get_mutex(&block->page);
+ mutex_enter(block_mutex);
+
+ /* Now safe to release page_hash mutex */
+ rw_lock_x_unlock(hash_lock);
+ goto got_block;
+ }
+
+ rw_lock_x_unlock(hash_lock);
+ }
+
+ if (mode == BUF_GET_IF_IN_POOL
+ || mode == BUF_PEEK_IF_IN_POOL
+ || mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ return(NULL);
+ }
+
+ if (buf_read_page(space, zip_size, offset)) {
+ buf_read_ahead_random(space, zip_size, offset,
+ ibuf_inside(mtr));
+
+ retries = 0;
+ } else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
+ ++retries;
+ } else {
+ fprintf(stderr, "InnoDB: Error: Unable"
+ " to read tablespace %lu page no"
+ " %lu into the buffer pool after"
+ " %lu attempts\n"
+				"InnoDB: The most probable cause"
+				" of this error is that the"
+				" table has been corrupted.\n"
+ "InnoDB: You can try to fix this"
+ " problem by using"
+ " innodb_force_recovery.\n"
+ "InnoDB: Please see reference manual"
+ " for more details.\n"
+ "InnoDB: Aborting...\n",
+ space, offset,
+ BUF_PAGE_READ_MAX_RETRIES);
+
+ ut_error;
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 37 || buf_validate());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ goto loop;
+ }
+
+ /* We can release hash_lock after we acquire block_mutex to
+ make sure that no state change takes place. */
+ block_mutex = buf_page_get_mutex(&block->page);
+ mutex_enter(block_mutex);
+
+ /* Now safe to release page_hash mutex */
+ rw_lock_s_unlock(hash_lock);
+
+got_block:
+ ut_ad(page_zip_get_size(&block->page.zip) == zip_size);
+ ut_ad(mutex_own(block_mutex));
+
+ must_read = buf_block_get_io_fix(block) == BUF_IO_READ;
+
+ if (must_read && (mode == BUF_GET_IF_IN_POOL
+ || mode == BUF_PEEK_IF_IN_POOL)) {
+
+ /* The page is being read to buffer pool,
+ but we cannot wait around for the read to
+ complete. */
+ mutex_exit(block_mutex);
+
+ return(NULL);
+ }
+
+ switch (buf_block_get_state(block)) {
+ buf_page_t* bpage;
+
+ case BUF_BLOCK_FILE_PAGE:
+ break;
+
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ bpage = &block->page;
+
+ if (bpage->buf_fix_count
+ || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+ /* This condition often occurs when the buffer
+ is not buffer-fixed, but I/O-fixed by
+ buf_page_init_for_read(). */
+ mutex_exit(block_mutex);
+wait_until_unfixed:
+ /* The block is buffer-fixed or I/O-fixed.
+ Try again later. */
+ os_thread_sleep(WAIT_FOR_READ);
+
+ goto loop;
+ }
+
+ /* Allocate an uncompressed page. */
+ mutex_exit(block_mutex);
+ block = buf_LRU_get_free_block(buf_pool);
+ ut_a(block);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ /* As we have released the page_hash lock and the
+ block_mutex to allocate an uncompressed page it is
+ possible that page_hash might have changed. We do
+ another lookup here while holding the hash_lock
+ to verify that bpage is indeed still a part of
+ page_hash. */
+ rw_lock_x_lock(hash_lock);
+ hash_bpage = buf_page_hash_get_low(buf_pool, space,
+ offset, fold);
+
+ mutex_enter(&block->mutex);
+ if (UNIV_UNLIKELY(bpage != hash_bpage)) {
+ /* The buf_pool->page_hash was modified
+ while buf_pool->mutex was released.
+ Free the block that was allocated. */
+
+ buf_LRU_block_free_non_file_page(block);
+ buf_pool_mutex_exit(buf_pool);
+ mutex_exit(&block->mutex);
+ rw_lock_x_unlock(hash_lock);
+
+ block = NULL;
+ goto loop;
+ }
+
+ if (UNIV_UNLIKELY
+ (bpage->buf_fix_count
+ || buf_page_get_io_fix(bpage) != BUF_IO_NONE)) {
+
+ rw_lock_x_unlock(hash_lock);
+ /* The block was buffer-fixed or I/O-fixed
+ while buf_pool->mutex was not held by this thread.
+ Free the block that was allocated and try again.
+ This should be extremely unlikely. */
+
+ buf_LRU_block_free_non_file_page(block);
+ buf_pool_mutex_exit(buf_pool);
+ mutex_exit(&block->mutex);
+
+ goto wait_until_unfixed;
+ }
+
+ /* Move the compressed page from bpage to block,
+ and uncompress it. */
+
+ mutex_enter(&buf_pool->zip_mutex);
+
+ buf_relocate(bpage, &block->page);
+ buf_block_init_low(block);
+ block->lock_hash_val = lock_rec_hash(space, offset);
+
+ UNIV_MEM_DESC(&block->page.zip.data,
+ page_zip_get_size(&block->page.zip));
+
+ if (buf_page_get_state(&block->page)
+ == BUF_BLOCK_ZIP_PAGE) {
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ UT_LIST_REMOVE(list, buf_pool->zip_clean,
+ &block->page);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ ut_ad(!block->page.in_flush_list);
+ } else {
+ /* Relocate buf_pool->flush_list. */
+ buf_flush_relocate_on_flush_list(bpage,
+ &block->page);
+ }
+
+ /* Buffer-fix, I/O-fix, and X-latch the block
+ for the duration of the decompression.
+ Also add the block to the unzip_LRU list. */
+ block->page.state = BUF_BLOCK_FILE_PAGE;
+
+ /* Insert at the front of unzip_LRU list */
+ buf_unzip_LRU_add_block(block, FALSE);
+
+ block->page.buf_fix_count = 1;
+ buf_block_set_io_fix(block, BUF_IO_READ);
+ rw_lock_x_lock_func(&block->lock, 0, file, line);
+
+ UNIV_MEM_INVALID(bpage, sizeof *bpage);
+
+ rw_lock_x_unlock(hash_lock);
+ mutex_exit(&block->mutex);
+ mutex_exit(&buf_pool->zip_mutex);
+ buf_pool->n_pend_unzip++;
+
+ buf_pool_mutex_exit(buf_pool);
+
+ buf_page_free_descriptor(bpage);
+
+ /* Decompress the page and apply buffered operations
+ while not holding buf_pool->mutex or block->mutex. */
+
+ ut_a(buf_zip_decompress(block, TRUE));
+
+ if (UNIV_LIKELY(!recv_no_ibuf_operations)) {
+ ibuf_merge_or_delete_for_page(block, space, offset,
+ zip_size, TRUE);
+ }
+
+ /* Unfix and unlatch the block. */
+ buf_pool_mutex_enter(buf_pool);
+ mutex_enter(&block->mutex);
+ block->page.buf_fix_count--;
+ buf_block_set_io_fix(block, BUF_IO_NONE);
+ buf_pool->n_pend_unzip--;
+ buf_pool_mutex_exit(buf_pool);
+ rw_lock_x_unlock(&block->lock);
+
+ break;
+
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+#if UNIV_WORD_SIZE == 4
+ /* On 32-bit systems, there is no padding in buf_page_t. On
+ other systems, Valgrind could complain about uninitialized pad
+ bytes. */
+ UNIV_MEM_ASSERT_RW(&block->page, sizeof block->page);
+#endif
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+ if ((mode == BUF_GET_IF_IN_POOL || mode == BUF_GET_IF_IN_POOL_OR_WATCH)
+ && ibuf_debug) {
+ /* Try to evict the block from the buffer pool, to use the
+ insert buffer (change buffer) as much as possible. */
+
+ /* To obey the latching order, release the
+ block->mutex before acquiring buf_pool->mutex. Protect
+ the block from changes by temporarily buffer-fixing it
+ for the time we are not holding block->mutex. */
+ buf_block_buf_fix_inc(block, file, line);
+ mutex_exit(&block->mutex);
+ buf_pool_mutex_enter(buf_pool);
+ mutex_enter(&block->mutex);
+ buf_block_buf_fix_dec(block);
+ mutex_exit(&block->mutex);
+
+ /* Now we are only holding the buf_pool->mutex,
+ not block->mutex or hash_lock. Blocks cannot be
+ relocated or enter or exit the buf_pool while we
+ are holding the buf_pool->mutex. */
+
+ if (buf_LRU_free_block(&block->page, TRUE)) {
+ buf_pool_mutex_exit(buf_pool);
+ rw_lock_x_lock(hash_lock);
+
+ if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+ /* Set the watch, as it would have
+ been set if the page were not in the
+ buffer pool in the first place. */
+ block = (buf_block_t*) buf_pool_watch_set(
+ space, offset, fold);
+ } else {
+ block = (buf_block_t*) buf_page_hash_get_low(
+ buf_pool, space, offset, fold);
+ }
+
+ if (UNIV_LIKELY_NULL(block)) {
+ block_mutex = buf_page_get_mutex(
+ &block->page);
+ /* The page entered the buffer
+ pool for some reason. Try to
+ evict it again. */
+ mutex_enter(block_mutex);
+ rw_lock_x_unlock(hash_lock);
+
+ goto got_block;
+ }
+
+ rw_lock_x_unlock(hash_lock);
+ fprintf(stderr,
+ "innodb_change_buffering_debug evict %u %u\n",
+ (unsigned) space, (unsigned) offset);
+ return(NULL);
+ }
+
+ mutex_enter(&block->mutex);
+
+ if (buf_flush_page_try(buf_pool, block)) {
+ fprintf(stderr,
+ "innodb_change_buffering_debug flush %u %u\n",
+ (unsigned) space, (unsigned) offset);
+ guess = block;
+ goto loop;
+ }
+
+ /* Failed to evict the page; change it directly */
+
+ buf_pool_mutex_exit(buf_pool);
+ }
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+ buf_block_buf_fix_inc(block, file, line);
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ ut_a(mode == BUF_GET_POSSIBLY_FREED
+ || !block->page.file_page_was_freed);
+#endif
+ mutex_exit(&block->mutex);
+
+ /* Check if this is the first access to the page */
+
+ access_time = buf_page_is_accessed(&block->page);
+
+ if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL)) {
+ buf_page_set_accessed_make_young(&block->page, access_time);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(block->page.buf_fix_count > 0);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ switch (rw_latch) {
+ case RW_NO_LATCH:
+ if (must_read) {
+ /* Let us wait until the read operation
+ completes */
+
+ for (;;) {
+ enum buf_io_fix io_fix;
+
+ mutex_enter(&block->mutex);
+ io_fix = buf_block_get_io_fix(block);
+ mutex_exit(&block->mutex);
+
+ if (io_fix == BUF_IO_READ) {
+
+ os_thread_sleep(WAIT_FOR_READ);
+ } else {
+ break;
+ }
+ }
+ }
+
+ fix_type = MTR_MEMO_BUF_FIX;
+ break;
+
+ case RW_S_LATCH:
+ rw_lock_s_lock_func(&(block->lock), 0, file, line);
+
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ break;
+
+ default:
+ ut_ad(rw_latch == RW_X_LATCH);
+ rw_lock_x_lock_func(&(block->lock), 0, file, line);
+
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ break;
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+
+ if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL) && !access_time) {
+ /* In the case of a first access, try to apply linear
+ read-ahead */
+
+ buf_read_ahead_linear(space, zip_size, offset,
+ ibuf_inside(mtr));
+ }
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_block_get_space(block),
+ buf_block_get_page_no(block)) == 0);
+#endif
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ return(block);
+}
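+
+/* Most callers do not invoke buf_page_get_gen() directly: buf0buf.h
+wraps it in the buf_page_get() macro, which supplies guess=NULL,
+mode=BUF_GET and the caller's __FILE__/__LINE__. A typical read access
+then looks roughly like this (illustrative sketch):
+
+	mtr_t		mtr;
+	buf_block_t*	block;
+
+	mtr_start(&mtr);
+	block = buf_page_get(space, zip_size, offset, RW_S_LATCH, &mtr);
+	... read block->frame; the S-latch and the buffer-fix are
+	released by mtr_commit() ...
+	mtr_commit(&mtr);
+*/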
+
+/********************************************************************//**
+This is the general function used to get optimistic access to a database
+page.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+buf_page_optimistic_get(
+/*====================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: guessed buffer block */
+ ib_uint64_t modify_clock,/*!< in: modify clock value */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ buf_pool_t* buf_pool;
+ unsigned access_time;
+ ibool success;
+ ulint fix_type;
+
+ ut_ad(block);
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+
+ mutex_enter(&block->mutex);
+
+ if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) {
+
+ mutex_exit(&block->mutex);
+
+ return(FALSE);
+ }
+
+ buf_block_buf_fix_inc(block, file, line);
+
+ mutex_exit(&block->mutex);
+
+ /* Check if this is the first access to the page.
+ We do a dirty read on purpose, to avoid mutex contention.
+ This field is only used for heuristic purposes; it does not
+ affect correctness. */
+
+ access_time = buf_page_is_accessed(&block->page);
+ buf_page_set_accessed_make_young(&block->page, access_time);
+
+ ut_ad(!ibuf_inside(mtr)
+ || ibuf_page(buf_block_get_space(block),
+ buf_block_get_zip_size(block),
+ buf_block_get_page_no(block), NULL));
+
+ if (rw_latch == RW_S_LATCH) {
+ success = rw_lock_s_lock_nowait(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ } else {
+ success = rw_lock_x_lock_func_nowait(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ }
+
+ if (UNIV_UNLIKELY(!success)) {
+ mutex_enter(&block->mutex);
+ buf_block_buf_fix_dec(block);
+ mutex_exit(&block->mutex);
+
+ return(FALSE);
+ }
+
+ if (UNIV_UNLIKELY(modify_clock != block->modify_clock)) {
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ if (rw_latch == RW_S_LATCH) {
+ rw_lock_s_unlock(&(block->lock));
+ } else {
+ rw_lock_x_unlock(&(block->lock));
+ }
+
+ mutex_enter(&block->mutex);
+ buf_block_buf_fix_dec(block);
+ mutex_exit(&block->mutex);
+
+ return(FALSE);
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(block->page.buf_fix_count > 0);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ mutex_enter(&block->mutex);
+ ut_a(!block->page.file_page_was_freed);
+ mutex_exit(&block->mutex);
+#endif
+
+ if (UNIV_UNLIKELY(!access_time)) {
+ /* In the case of a first access, try to apply linear
+ read-ahead */
+
+ buf_read_ahead_linear(buf_block_get_space(block),
+ buf_block_get_zip_size(block),
+ buf_block_get_page_no(block),
+ ibuf_inside(mtr));
+ }
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_block_get_space(block),
+ buf_block_get_page_no(block)) == 0);
+#endif
+ buf_pool = buf_pool_from_block(block);
+ buf_pool->stat.n_page_gets++;
+
+ return(TRUE);
+}
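+
+/* A sketch of the optimistic pattern (cf. btr0pcur.cc): a cursor
+remembers the block pointer and its modify clock while holding the
+latch, releases the latch, and later tries to resume without a page
+hash lookup (illustrative only):
+
+	guess = block;
+	version = buf_block_get_modify_clock(block);
+	... release the latch, do something else ...
+
+	if (buf_page_optimistic_get(RW_S_LATCH, guess, version,
+				    __FILE__, __LINE__, &mtr)) {
+		... the block was neither relocated nor modified ...
+	}
+*/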
+
+/********************************************************************//**
+This is used to get access to a known database page, when no waiting can be
+done. For example, if a search in an adaptive hash index leads us to this
+frame.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+buf_page_get_known_nowait(
+/*======================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: the known page */
+ ulint mode, /*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ buf_pool_t* buf_pool;
+ ibool success;
+ ulint fix_type;
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+
+ mutex_enter(&block->mutex);
+
+ if (buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH) {
+ /* Another thread is just freeing the block from the LRU list
+ of the buffer pool: do not try to access this page; this
+ attempt to access the page can only come through the hash
+ index because when the buffer block state is ..._REMOVE_HASH,
+ we have already removed it from the page address hash table
+ of the buffer pool. */
+
+ mutex_exit(&block->mutex);
+
+ return(FALSE);
+ }
+
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ buf_block_buf_fix_inc(block, file, line);
+
+ mutex_exit(&block->mutex);
+
+ buf_pool = buf_pool_from_block(block);
+
+ if (mode == BUF_MAKE_YOUNG && buf_page_peek_if_too_old(&block->page)) {
+ buf_pool_mutex_enter(buf_pool);
+ buf_LRU_make_block_young(&block->page);
+ buf_pool_mutex_exit(buf_pool);
+ } else if (!buf_page_is_accessed(&block->page)) {
+ /* Above, we do a dirty read on purpose, to avoid
+ mutex contention. The field buf_page_t::access_time
+ is only used for heuristic purposes. Writes to the
+ field must be protected by mutex, however. */
+ ulint time_ms = ut_time_ms();
+
+ buf_pool_mutex_enter(buf_pool);
+ buf_page_set_accessed(&block->page, time_ms);
+ buf_pool_mutex_exit(buf_pool);
+ }
+
+ ut_ad(!ibuf_inside(mtr) || mode == BUF_KEEP_OLD);
+
+ if (rw_latch == RW_S_LATCH) {
+ success = rw_lock_s_lock_nowait(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ } else {
+ success = rw_lock_x_lock_func_nowait(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ }
+
+ if (!success) {
+ mutex_enter(&block->mutex);
+ buf_block_buf_fix_dec(block);
+ mutex_exit(&block->mutex);
+
+ return(FALSE);
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(block->page.buf_fix_count > 0);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ if (mode != BUF_KEEP_OLD) {
+ /* If mode == BUF_KEEP_OLD, we are executing an I/O
+ completion routine. Avoid a bogus assertion failure
+ when ibuf_merge_or_delete_for_page() is processing a
+ page that was just freed due to DROP INDEX, or
+ deleting a record from SYS_INDEXES. This check will be
+ skipped in recv_recover_page() as well. */
+
+ mutex_enter(&block->mutex);
+ ut_a(!block->page.file_page_was_freed);
+ mutex_exit(&block->mutex);
+ }
+#endif
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a((mode == BUF_KEEP_OLD)
+ || (ibuf_count_get(buf_block_get_space(block),
+ buf_block_get_page_no(block)) == 0));
+#endif
+ buf_pool->stat.n_page_gets++;
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Given a tablespace id and page number, tries to get that page. If the
+page is not in the buffer pool it is not loaded and NULL is returned.
+Suitable for use while holding the lock_sys_t::mutex.
+@return pointer to a page or NULL */
+UNIV_INTERN
+const buf_block_t*
+buf_page_try_get_func(
+/*==================*/
+ ulint space_id,/*!< in: tablespace id */
+ ulint page_no,/*!< in: page number */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ buf_block_t* block;
+ ibool success;
+ ulint fix_type;
+ buf_pool_t* buf_pool = buf_pool_get(space_id, page_no);
+ rw_lock_t* hash_lock;
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ block = buf_block_hash_get_s_locked(buf_pool, space_id,
+ page_no, &hash_lock);
+
+ if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
+ if (block) {
+ rw_lock_s_unlock(hash_lock);
+ }
+ return(NULL);
+ }
+
+ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
+
+ mutex_enter(&block->mutex);
+ rw_lock_s_unlock(hash_lock);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_a(buf_block_get_space(block) == space_id);
+ ut_a(buf_block_get_page_no(block) == page_no);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_block_buf_fix_inc(block, file, line);
+ mutex_exit(&block->mutex);
+
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ success = rw_lock_s_lock_nowait(&block->lock, file, line);
+
+ if (!success) {
+ /* Let us try to get an X-latch. If the current thread
+ is holding an X-latch on the page, we cannot get an
+ S-latch. */
+
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ success = rw_lock_x_lock_func_nowait(&block->lock,
+ file, line);
+ }
+
+ if (!success) {
+ mutex_enter(&block->mutex);
+ buf_block_buf_fix_dec(block);
+ mutex_exit(&block->mutex);
+
+ return(NULL);
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(block->page.buf_fix_count > 0);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ mutex_enter(&block->mutex);
+ ut_a(!block->page.file_page_was_freed);
+ mutex_exit(&block->mutex);
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ buf_pool->stat.n_page_gets++;
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_block_get_space(block),
+ buf_block_get_page_no(block)) == 0);
+#endif
+
+ return(block);
+}
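+
+/* buf0buf.h wraps this function in the buf_page_try_get() macro,
+which supplies the caller's __FILE__/__LINE__ (illustrative):
+
+	block = buf_page_try_get(space_id, page_no, &mtr);
+*/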
+
+/********************************************************************//**
+Initialize some fields of a control block. */
+UNIV_INLINE
+void
+buf_page_init_low(
+/*==============*/
+ buf_page_t* bpage) /*!< in: block to init */
+{
+ bpage->flush_type = BUF_FLUSH_LRU;
+ bpage->io_fix = BUF_IO_NONE;
+ bpage->buf_fix_count = 0;
+ bpage->freed_page_clock = 0;
+ bpage->access_time = 0;
+ bpage->newest_modification = 0;
+ bpage->oldest_modification = 0;
+ HASH_INVALIDATE(bpage, hash);
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ bpage->file_page_was_freed = FALSE;
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+}
+
+/********************************************************************//**
+Initializes a page in the buffer buf_pool. */
+static __attribute__((nonnull))
+void
+buf_page_init(
+/*==========*/
+ buf_pool_t* buf_pool,/*!< in/out: buffer pool */
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space
+ in units of a page */
+ ulint fold, /*!< in: buf_page_address_fold(space,offset) */
+ buf_block_t* block) /*!< in/out: block to init */
+{
+ buf_page_t* hash_page;
+
+ ut_ad(buf_pool == buf_pool_get(space, offset));
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ ut_ad(mutex_own(&(block->mutex)));
+ ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(buf_page_hash_lock_get(buf_pool, fold),
+ RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* Set the state of the block */
+ buf_block_set_file_page(block, space, offset);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ if (!space) {
+ /* Silence valid Valgrind warnings about uninitialized
+ data being written to data files. There are some unused
+ bytes on some pages that InnoDB does not initialize. */
+ UNIV_MEM_VALID(block->frame, UNIV_PAGE_SIZE);
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ buf_block_init_low(block);
+
+ block->lock_hash_val = lock_rec_hash(space, offset);
+
+ buf_page_init_low(&block->page);
+
+ /* Insert into the hash table of file pages */
+
+ hash_page = buf_page_hash_get_low(buf_pool, space, offset, fold);
+
+ if (UNIV_LIKELY(!hash_page)) {
+ } else if (buf_pool_watch_is_sentinel(buf_pool, hash_page)) {
+ /* Preserve the reference count. */
+ ulint buf_fix_count = hash_page->buf_fix_count;
+
+ ut_a(buf_fix_count > 0);
+ block->page.buf_fix_count += buf_fix_count;
+ buf_pool_watch_remove(buf_pool, fold, hash_page);
+ } else {
+ fprintf(stderr,
+ "InnoDB: Error: page %lu %lu already found"
+ " in the hash table: %p, %p\n",
+ (ulong) space,
+ (ulong) offset,
+ (const void*) hash_page, (const void*) block);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ mutex_exit(&block->mutex);
+ buf_pool_mutex_exit(buf_pool);
+ buf_print();
+ buf_LRU_print();
+ buf_validate();
+ buf_LRU_validate();
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ ut_error;
+ }
+
+ ut_ad(!block->page.in_zip_hash);
+ ut_ad(!block->page.in_page_hash);
+ ut_d(block->page.in_page_hash = TRUE);
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
+ fold, &block->page);
+}
+
+/********************************************************************//**
+Initializes a page for reading into the buffer buf_pool. If the page is
+(1) already in buf_pool, or
+(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) if the space is deleted or being deleted,
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later.
+@return pointer to the block or NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_init_for_read(
+/*===================*/
+ ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
+ ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ ibool unzip, /*!< in: TRUE=request uncompressed page */
+ ib_int64_t tablespace_version,
+ /*!< in: prevents reading from a wrong
+ version of the tablespace in case we have done
+ DISCARD + IMPORT */
+ ulint offset) /*!< in: page number */
+{
+ buf_block_t* block;
+ buf_page_t* bpage = NULL;
+ buf_page_t* watch_page;
+ rw_lock_t* hash_lock;
+ mtr_t mtr;
+ ulint fold;
+ ibool lru = FALSE;
+ void* data;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+ ut_ad(buf_pool);
+
+ *err = DB_SUCCESS;
+
+ if (mode == BUF_READ_IBUF_PAGES_ONLY) {
+ /* It is a read-ahead within an ibuf routine */
+
+ ut_ad(!ibuf_bitmap_page(zip_size, offset));
+
+ ibuf_mtr_start(&mtr);
+
+ if (!recv_no_ibuf_operations
+ && !ibuf_page(space, zip_size, offset, &mtr)) {
+
+ ibuf_mtr_commit(&mtr);
+
+ return(NULL);
+ }
+ } else {
+ ut_ad(mode == BUF_READ_ANY_PAGE);
+ }
+
+ if (zip_size && !unzip && !recv_recovery_is_on()) {
+ block = NULL;
+ } else {
+ block = buf_LRU_get_free_block(buf_pool);
+ ut_ad(block);
+ ut_ad(buf_pool_from_block(block) == buf_pool);
+ }
+
+ fold = buf_page_address_fold(space, offset);
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+ buf_pool_mutex_enter(buf_pool);
+ rw_lock_x_lock(hash_lock);
+
+ watch_page = buf_page_hash_get_low(buf_pool, space, offset, fold);
+ if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) {
+ /* The page is already in the buffer pool. */
+ watch_page = NULL;
+err_exit:
+ rw_lock_x_unlock(hash_lock);
+ if (block) {
+ mutex_enter(&block->mutex);
+ buf_LRU_block_free_non_file_page(block);
+ mutex_exit(&block->mutex);
+ }
+
+ bpage = NULL;
+ goto func_exit;
+ }
+
+ if (fil_tablespace_deleted_or_being_deleted_in_mem(
+ space, tablespace_version)) {
+ /* The page belongs to a space which has been
+ deleted or is being deleted. */
+ *err = DB_TABLESPACE_DELETED;
+
+ goto err_exit;
+ }
+
+ if (block) {
+ bpage = &block->page;
+
+ mutex_enter(&block->mutex);
+
+ ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
+
+ buf_page_init(buf_pool, space, offset, fold, block);
+ rw_lock_x_unlock(hash_lock);
+
+ /* The block must be put to the LRU list, to the old blocks */
+ buf_LRU_add_block(bpage, TRUE/* to old blocks */);
+
+ /* We set a pass-type x-lock on the frame because then
+ the same thread which called for the read operation
+ (and is running now at this point of code) can wait
+ for the read to complete by waiting for the x-lock on
+ the frame; if the x-lock were recursive, the same
+ thread would illegally get the x-lock before the page
+ read is completed. The x-lock is cleared by the
+ io-handler thread. */
+
+ rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
+ buf_page_set_io_fix(bpage, BUF_IO_READ);
+
+ if (zip_size) {
+ page_zip_set_size(&block->page.zip, zip_size);
+
+ /* buf_pool->mutex may be released and
+ reacquired by buf_buddy_alloc(). Thus, we
+ must release block->mutex in order not to
+ break the latching order in the reacquisition
+ of buf_pool->mutex. We also must defer this
+ operation until after the block descriptor has
+ been added to buf_pool->LRU and
+ buf_pool->page_hash. */
+ mutex_exit(&block->mutex);
+ data = buf_buddy_alloc(buf_pool, zip_size, &lru);
+ mutex_enter(&block->mutex);
+ block->page.zip.data = (page_zip_t*) data;
+
+ /* To maintain the invariant
+ block->in_unzip_LRU_list
+ == buf_page_belongs_to_unzip_LRU(&block->page)
+ we have to add this block to unzip_LRU
+ after block->page.zip.data is set. */
+ ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
+ buf_unzip_LRU_add_block(block, TRUE);
+ }
+
+ mutex_exit(&block->mutex);
+ } else {
+ rw_lock_x_unlock(hash_lock);
+
+ /* The compressed page must be allocated before the
+ control block (bpage), in order to avoid the
+ invocation of buf_buddy_relocate_block() on
+ uninitialized data. */
+ data = buf_buddy_alloc(buf_pool, zip_size, &lru);
+
+ rw_lock_x_lock(hash_lock);
+
+ /* If buf_buddy_alloc() allocated storage from the LRU list,
+ it released and reacquired buf_pool->mutex. Thus, we must
+ check the page_hash again, as it may have been modified. */
+ if (UNIV_UNLIKELY(lru)) {
+
+ watch_page = buf_page_hash_get_low(
+ buf_pool, space, offset, fold);
+
+ if (UNIV_UNLIKELY(watch_page
+ && !buf_pool_watch_is_sentinel(buf_pool,
+ watch_page))) {
+
+ /* The block was added by some other thread. */
+ rw_lock_x_unlock(hash_lock);
+ watch_page = NULL;
+ buf_buddy_free(buf_pool, data, zip_size);
+
+ bpage = NULL;
+ goto func_exit;
+ }
+ }
+
+ bpage = buf_page_alloc_descriptor();
+
+ /* Initialize the buf_pool pointer. */
+ bpage->buf_pool_index = buf_pool_index(buf_pool);
+
+ page_zip_des_init(&bpage->zip);
+ page_zip_set_size(&bpage->zip, zip_size);
+ bpage->zip.data = (page_zip_t*) data;
+
+ mutex_enter(&buf_pool->zip_mutex);
+ UNIV_MEM_DESC(bpage->zip.data,
+ page_zip_get_size(&bpage->zip));
+
+ buf_page_init_low(bpage);
+
+ bpage->state = BUF_BLOCK_ZIP_PAGE;
+ bpage->space = space;
+ bpage->offset = offset;
+
+#ifdef UNIV_DEBUG
+ bpage->in_page_hash = FALSE;
+ bpage->in_zip_hash = FALSE;
+ bpage->in_flush_list = FALSE;
+ bpage->in_free_list = FALSE;
+ bpage->in_LRU_list = FALSE;
+#endif /* UNIV_DEBUG */
+
+ ut_d(bpage->in_page_hash = TRUE);
+
+ if (UNIV_LIKELY_NULL(watch_page)) {
+
+ /* Preserve the reference count. */
+ ulint buf_fix_count = watch_page->buf_fix_count;
+ ut_a(buf_fix_count > 0);
+ bpage->buf_fix_count += buf_fix_count;
+ ut_ad(buf_pool_watch_is_sentinel(buf_pool, watch_page));
+ buf_pool_watch_remove(buf_pool, fold, watch_page);
+ }
+
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold,
+ bpage);
+
+ rw_lock_x_unlock(hash_lock);
+
+ /* The block must be put to the LRU list, to the old blocks */
+ buf_LRU_add_block(bpage, TRUE/* to old blocks */);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ buf_LRU_insert_zip_clean(bpage);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_page_set_io_fix(bpage, BUF_IO_READ);
+
+ mutex_exit(&buf_pool->zip_mutex);
+ }
+
+ buf_pool->n_pend_reads++;
+func_exit:
+ buf_pool_mutex_exit(buf_pool);
+
+ if (mode == BUF_READ_IBUF_PAGES_ONLY) {
+
+ ibuf_mtr_commit(&mtr);
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(!bpage || buf_page_in_file(bpage));
+ return(bpage);
+}
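+
+/* A sketch of the asynchronous read flow, as driven from buf0rea.cc:
+this function allocates and io-fixes the block, the read is then
+posted with fil_io(), and the i/o handler thread finishes it with
+buf_page_io_complete() (illustrative; fil_io() parameters elided):
+
+	bpage = buf_page_init_for_read(&err, mode, space, zip_size,
+				       unzip, tablespace_version, offset);
+	if (bpage != NULL) {
+		fil_io(OS_FILE_READ, ...);
+		... later, in the i/o handler thread ...
+		buf_page_io_complete(bpage);
+	}
+*/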
+
+/********************************************************************//**
+Initializes a page in the buffer buf_pool. The page is usually not read
+from a file, even if it cannot be found in the buffer buf_pool. This is
+one of the functions which perform a block state transition NOT_USED =>
+FILE_PAGE (the other is buf_page_get_gen).
+@return	pointer to the block, page buffer-fixed */
+UNIV_INTERN
+buf_block_t*
+buf_page_create(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space in units of
+ a page */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ buf_frame_t* frame;
+ buf_block_t* block;
+ ulint fold;
+ buf_block_t* free_block = NULL;
+ ulint time_ms = ut_time_ms();
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ rw_lock_t* hash_lock;
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad(space || !zip_size);
+
+ free_block = buf_LRU_get_free_block(buf_pool);
+
+ fold = buf_page_address_fold(space, offset);
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+ buf_pool_mutex_enter(buf_pool);
+ rw_lock_x_lock(hash_lock);
+
+ block = (buf_block_t*) buf_page_hash_get_low(
+ buf_pool, space, offset, fold);
+
+ if (block
+ && buf_page_in_file(&block->page)
+ && !buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(space, offset) == 0);
+#endif
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ block->page.file_page_was_freed = FALSE;
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+
+ /* Page can be found in buf_pool */
+ buf_pool_mutex_exit(buf_pool);
+ rw_lock_x_unlock(hash_lock);
+
+ buf_block_free(free_block);
+
+ return(buf_page_get_with_no_latch(space, zip_size,
+ offset, mtr));
+ }
+
+ /* If we get here, the page was not in buf_pool: init it there */
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr, "Creating space %lu page %lu to buffer\n",
+ (ulong) space, (ulong) offset);
+ }
+#endif /* UNIV_DEBUG */
+
+ block = free_block;
+
+ mutex_enter(&block->mutex);
+
+ buf_page_init(buf_pool, space, offset, fold, block);
+
+ rw_lock_x_unlock(hash_lock);
+
+ /* The block must be put to the LRU list */
+ buf_LRU_add_block(&block->page, FALSE);
+
+ buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+ buf_pool->stat.n_pages_created++;
+
+ if (zip_size) {
+ void* data;
+ ibool lru;
+
+ /* Prevent race conditions during buf_buddy_alloc(),
+ which may release and reacquire buf_pool->mutex,
+ by IO-fixing and X-latching the block. */
+
+ buf_page_set_io_fix(&block->page, BUF_IO_READ);
+ rw_lock_x_lock(&block->lock);
+
+ page_zip_set_size(&block->page.zip, zip_size);
+ mutex_exit(&block->mutex);
+ /* buf_pool->mutex may be released and reacquired by
+ buf_buddy_alloc(). Thus, we must release block->mutex
+ in order not to break the latching order in
+ the reacquisition of buf_pool->mutex. We also must
+ defer this operation until after the block descriptor
+ has been added to buf_pool->LRU and buf_pool->page_hash. */
+ data = buf_buddy_alloc(buf_pool, zip_size, &lru);
+ mutex_enter(&block->mutex);
+ block->page.zip.data = (page_zip_t*) data;
+
+ /* To maintain the invariant
+ block->in_unzip_LRU_list
+ == buf_page_belongs_to_unzip_LRU(&block->page)
+ we have to add this block to unzip_LRU after
+ block->page.zip.data is set. */
+ ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
+ buf_unzip_LRU_add_block(block, FALSE);
+
+ buf_page_set_io_fix(&block->page, BUF_IO_NONE);
+ rw_lock_x_unlock(&block->lock);
+ }
+
+ buf_page_set_accessed(&block->page, time_ms);
+
+ buf_pool_mutex_exit(buf_pool);
+
+ mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);
+
+ mutex_exit(&block->mutex);
+
+ /* Delete possible entries for the page from the insert buffer:
+ such can exist if the page belonged to an index which was dropped */
+
+ ibuf_merge_or_delete_for_page(NULL, space, offset, zip_size, TRUE);
+
+ frame = block->frame;
+
+ memset(frame + FIL_PAGE_PREV, 0xff, 4);
+ memset(frame + FIL_PAGE_NEXT, 0xff, 4);
+ mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
+
+	/* Reset to zero the file flush lsn field in the page; if the first
+	page of an ibdata file is 'created' into the buffer pool by this
+	function, we lose the original contents of the file flush lsn stamp.
+	InnoDB could then, during crash recovery, print a big but false
+	corruption warning if the stamp contains an lsn bigger than the
+	ib_logfile lsn. */
+
+ memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 357 || buf_validate());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_block_get_space(block),
+ buf_block_get_page_no(block)) == 0);
+#endif
+ return(block);
+}
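+
+/* Typical use (sketch, cf. fsp0fsp.cc): right after a page number has
+been allocated within a tablespace, the allocator creates the frame in
+the buffer pool and latches it in the same mini-transaction, so that no
+read i/o is ever issued for the brand-new page:
+
+	block = buf_page_create(space, page_no, zip_size, mtr);
+	... x-latch block->lock and register it in mtr ...
+*/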
+
+/********************************************************************//**
+Monitors the buffer page read/write activity, and increments the
+corresponding counter if the MONITOR_MODULE_BUF_PAGE (module_buf_page)
+module is enabled. */
+static
+void
+buf_page_monitor(
+/*=============*/
+ const buf_page_t* bpage, /*!< in: pointer to the block */
+ enum buf_io_fix io_type)/*!< in: io_fix types */
+{
+ const byte* frame;
+ monitor_id_t counter;
+
+ /* If the counter module is not turned on, just return */
+ if (!MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)) {
+ return;
+ }
+
+ ut_a(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
+
+ frame = bpage->zip.data
+ ? bpage->zip.data
+ : ((buf_block_t*) bpage)->frame;
+
+ switch (fil_page_get_type(frame)) {
+ ulint level;
+
+ case FIL_PAGE_INDEX:
+ level = btr_page_get_level_low(frame);
+
+ /* Check if it is an index page for insert buffer */
+ if (btr_page_get_index_id(frame)
+ == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
+ if (level == 0) {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_IBUF_LEAF_PAGE);
+ } else {
+ counter = MONITOR_RW_COUNTER(
+ io_type,
+ MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
+ }
+ } else {
+ if (level == 0) {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_LEAF_PAGE);
+ } else {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_NON_LEAF_PAGE);
+ }
+ }
+ break;
+
+ case FIL_PAGE_UNDO_LOG:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE);
+ break;
+
+ case FIL_PAGE_INODE:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE);
+ break;
+
+ case FIL_PAGE_IBUF_FREE_LIST:
+ counter = MONITOR_RW_COUNTER(io_type,
+ MONITOR_IBUF_FREELIST_PAGE);
+ break;
+
+ case FIL_PAGE_IBUF_BITMAP:
+ counter = MONITOR_RW_COUNTER(io_type,
+ MONITOR_IBUF_BITMAP_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_SYS:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_TRX_SYS:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_FSP_HDR:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_XDES:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_BLOB:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_ZBLOB:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_ZBLOB2:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE);
+ break;
+
+ default:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE);
+ }
+
+ MONITOR_INC_NOCHECK(counter);
+}
+
+/********************************************************************//**
+Marks the table whose tablespace is identified by bpage->space as
+corrupted, and removes bpage from the LRU list.
+@return TRUE if successful */
+static
+ibool
+buf_mark_space_corrupt(
+/*===================*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ const ibool uncompressed = (buf_page_get_state(bpage)
+ == BUF_BLOCK_FILE_PAGE);
+ ulint space = bpage->space;
+ ibool ret = TRUE;
+
+ /* First unfix and release lock on the bpage */
+ buf_pool_mutex_enter(buf_pool);
+ mutex_enter(buf_page_get_mutex(bpage));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
+ ut_ad(bpage->buf_fix_count == 0);
+
+ /* Set BUF_IO_NONE before we remove the block from LRU list */
+ buf_page_set_io_fix(bpage, BUF_IO_NONE);
+
+ if (uncompressed) {
+ rw_lock_x_unlock_gen(
+ &((buf_block_t*) bpage)->lock,
+ BUF_IO_READ);
+ }
+
+ /* Find the table with specified space id, and mark it corrupted */
+ if (dict_set_corrupted_by_space(space)) {
+ buf_LRU_free_one_page(bpage);
+ } else {
+ ret = FALSE;
+ }
+
+ ut_ad(buf_pool->n_pend_reads > 0);
+ buf_pool->n_pend_reads--;
+
+ mutex_exit(buf_page_get_mutex(bpage));
+ buf_pool_mutex_exit(buf_pool);
+
+ return(ret);
+}
+
+/********************************************************************//**
+Completes an asynchronous read or write request of a file page to or from
+the buffer pool.
+@return TRUE if successful */
+UNIV_INTERN
+ibool
+buf_page_io_complete(
+/*=================*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ enum buf_io_fix io_type;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ const ibool uncompressed = (buf_page_get_state(bpage)
+ == BUF_BLOCK_FILE_PAGE);
+
+ ut_a(buf_page_in_file(bpage));
+
+	/* We do not need to protect io_fix here with a mutex to read
+	it, because this is the only function where we can change the value
+	from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
+	ensures that this is the only thread that handles the i/o for this
+	block. */
+
+ io_type = buf_page_get_io_fix(bpage);
+ ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
+
+ if (io_type == BUF_IO_READ) {
+ ulint read_page_no;
+ ulint read_space_id;
+ byte* frame;
+
+ if (buf_page_get_zip_size(bpage)) {
+ frame = bpage->zip.data;
+ buf_pool->n_pend_unzip++;
+ if (uncompressed
+ && !buf_zip_decompress((buf_block_t*) bpage,
+ FALSE)) {
+
+ buf_pool->n_pend_unzip--;
+ goto corrupt;
+ }
+ buf_pool->n_pend_unzip--;
+ } else {
+ ut_a(uncompressed);
+ frame = ((buf_block_t*) bpage)->frame;
+ }
+
+		/* If this page is not uninitialized and not in the
+		doublewrite buffer, then the page number and space id
+		should be the same as in the block. */
+ read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET);
+ read_space_id = mach_read_from_4(
+ frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ if (bpage->space == TRX_SYS_SPACE
+ && buf_dblwr_page_inside(bpage->offset)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: reading page %lu\n"
+ "InnoDB: which is in the"
+ " doublewrite buffer!\n",
+ (ulong) bpage->offset);
+ } else if (!read_space_id && !read_page_no) {
+ /* This is likely an uninitialized page. */
+ } else if ((bpage->space
+ && bpage->space != read_space_id)
+ || bpage->offset != read_page_no) {
+			/* We do not compare space_id to read_space_id
+			when bpage->space == 0, because the field on the
+			page may contain garbage in MySQL < 4.1.1,
+			which only supported bpage->space == 0. */
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: space id and page n:o"
+ " stored in the page\n"
+ "InnoDB: read in are %lu:%lu,"
+ " should be %lu:%lu!\n",
+ (ulong) read_space_id, (ulong) read_page_no,
+ (ulong) bpage->space,
+ (ulong) bpage->offset);
+ }
+
+		/* From version 3.23.38 up we store the page checksum
+		in the first 4 bytes of the page end lsn field */
+
+ if (buf_page_is_corrupted(frame,
+ buf_page_get_zip_size(bpage))) {
+corrupt:
+ fprintf(stderr,
+ "InnoDB: Database page corruption on disk"
+ " or a failed\n"
+ "InnoDB: file read of page %lu.\n"
+ "InnoDB: You may have to recover"
+ " from a backup.\n",
+ (ulong) bpage->offset);
+ buf_page_print(frame, buf_page_get_zip_size(bpage),
+ BUF_PAGE_PRINT_NO_CRASH);
+ fprintf(stderr,
+ "InnoDB: Database page corruption on disk"
+ " or a failed\n"
+ "InnoDB: file read of page %lu.\n"
+ "InnoDB: You may have to recover"
+ " from a backup.\n",
+ (ulong) bpage->offset);
+ fputs("InnoDB: It is also possible that"
+ " your operating\n"
+ "InnoDB: system has corrupted its"
+ " own file cache\n"
+ "InnoDB: and rebooting your computer"
+ " removes the\n"
+ "InnoDB: error.\n"
+ "InnoDB: If the corrupt page is an index page\n"
+ "InnoDB: you can also try to"
+ " fix the corruption\n"
+ "InnoDB: by dumping, dropping,"
+ " and reimporting\n"
+ "InnoDB: the corrupt table."
+ " You can use CHECK\n"
+ "InnoDB: TABLE to scan your"
+ " table for corruption.\n"
+ "InnoDB: See also "
+ REFMAN "forcing-innodb-recovery.html\n"
+ "InnoDB: about forcing recovery.\n", stderr);
+
+ if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
+				/* If the page space id is larger than
+				TRX_SYS_SPACE (0), we will attempt to
+				mark the corresponding table as corrupted
+				instead of crashing the server */
+ if (bpage->space > TRX_SYS_SPACE
+ && buf_mark_space_corrupt(bpage)) {
+ return(FALSE);
+ } else {
+ fputs("InnoDB: Ending processing"
+ " because of"
+ " a corrupt database page.\n",
+ stderr);
+ ut_error;
+ }
+ }
+ }
+
+ if (recv_recovery_is_on()) {
+ /* Pages must be uncompressed for crash recovery. */
+ ut_a(uncompressed);
+ recv_recover_page(TRUE, (buf_block_t*) bpage);
+ }
+
+ if (uncompressed && !recv_no_ibuf_operations) {
+ ibuf_merge_or_delete_for_page(
+ (buf_block_t*) bpage, bpage->space,
+ bpage->offset, buf_page_get_zip_size(bpage),
+ TRUE);
+ }
+ }
+
+ buf_pool_mutex_enter(buf_pool);
+ mutex_enter(buf_page_get_mutex(bpage));
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ if (io_type == BUF_IO_WRITE || uncompressed) {
+ /* For BUF_IO_READ of compressed-only blocks, the
+ buffered operations will be merged by buf_page_get_gen()
+ after the block has been uncompressed. */
+ ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
+ }
+#endif /* UNIV_IBUF_COUNT_DEBUG */
+	/* Because the thread that does the unlocking is not the same one
+	that did the locking, we use a pass value != 0 in unlock, which
+	simply removes the newest lock debug record without checking the
+	thread id. */
+
+ buf_page_set_io_fix(bpage, BUF_IO_NONE);
+
+ switch (io_type) {
+ case BUF_IO_READ:
+ /* NOTE that the call to ibuf may have moved the ownership of
+ the x-latch to this OS thread: do not let this confuse you in
+ debugging! */
+
+ ut_ad(buf_pool->n_pend_reads > 0);
+ buf_pool->n_pend_reads--;
+ buf_pool->stat.n_pages_read++;
+
+ if (uncompressed) {
+ rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock,
+ BUF_IO_READ);
+ }
+
+ break;
+
+ case BUF_IO_WRITE:
+ /* Write means a flush operation: call the completion
+ routine in the flush system */
+
+ buf_flush_write_complete(bpage);
+
+ if (uncompressed) {
+ rw_lock_s_unlock_gen(&((buf_block_t*) bpage)->lock,
+ BUF_IO_WRITE);
+ }
+
+ buf_pool->stat.n_pages_written++;
+
+ break;
+
+ default:
+ ut_error;
+ }
+
+ buf_page_monitor(bpage, io_type);
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr, "Has %s page space %lu page no %lu\n",
+ io_type == BUF_IO_READ ? "read" : "written",
+ (ulong) buf_page_get_space(bpage),
+ (ulong) buf_page_get_page_no(bpage));
+ }
+#endif /* UNIV_DEBUG */
+
+ mutex_exit(buf_page_get_mutex(bpage));
+ buf_pool_mutex_exit(buf_pool);
+
+ return(TRUE);
+}
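+
+/* An illustrative sketch (not part of this file) of how an i/o helper
+thread might drive buf_page_io_complete(): the thread waits for an
+asynchronous request to finish, and the completion callback then hands
+the page to buf_page_io_complete(). The loop and the use of
+fil_aio_wait() as the wait primitive are assumptions for illustration. */
+#if 0
+static
+void
+example_io_handler_loop(
+	ulint	segment)	/*!< in: aio segment to service */
+{
+	for (;;) {
+		/* Blocks until one asynchronous read or write on this
+		segment completes; the completion path ends up calling
+		buf_page_io_complete() for buffer pool pages. */
+		fil_aio_wait(segment);
+	}
+}
+#endif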
+
+/*********************************************************************//**
+Asserts that all file pages in the buffer are in a replaceable state.
+@return TRUE */
+static
+ibool
+buf_all_freed_instance(
+/*===================*/
+	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
+{
+ ulint i;
+ buf_chunk_t* chunk;
+
+ ut_ad(buf_pool);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ chunk = buf_pool->chunks;
+
+ for (i = buf_pool->n_chunks; i--; chunk++) {
+
+ const buf_block_t* block = buf_chunk_not_freed(chunk);
+
+ if (UNIV_LIKELY_NULL(block)) {
+ fprintf(stderr,
+ "Page %lu %lu still fixed or dirty\n",
+ (ulong) block->page.space,
+ (ulong) block->page.offset);
+ ut_error;
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Invalidates file pages in one buffer pool instance */
+static
+void
+buf_pool_invalidate_instance(
+/*=========================*/
+ buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ ulint i;
+
+ buf_pool_mutex_enter(buf_pool);
+
+ for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
+
+		/* As this function is called during startup and
+		during the redo application phase of recovery, InnoDB
+		is single threaded (apart from IO helper threads) at
+		this stage. No new write batch can be in the
+		initialization stage at this point. */
+ ut_ad(buf_pool->init_flush[i] == FALSE);
+
+ /* However, it is possible that a write batch that has
+ been posted earlier is still not complete. For buffer
+ pool invalidation to proceed we must ensure there is NO
+ write activity happening. */
+ if (buf_pool->n_flush[i] > 0) {
+ enum buf_flush type = static_cast<enum buf_flush>(i);
+
+ buf_pool_mutex_exit(buf_pool);
+ buf_flush_wait_batch_end(buf_pool, type);
+ buf_pool_mutex_enter(buf_pool);
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ ut_ad(buf_all_freed_instance(buf_pool));
+
+ buf_pool_mutex_enter(buf_pool);
+
+ while (buf_LRU_scan_and_free_block(buf_pool, TRUE)) {
+ }
+
+ ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
+ ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0);
+
+ buf_pool->freed_page_clock = 0;
+ buf_pool->LRU_old = NULL;
+ buf_pool->LRU_old_len = 0;
+
+ memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat));
+ buf_refresh_io_stats(buf_pool);
+
+ buf_pool_mutex_exit(buf_pool);
+}
+
+/*********************************************************************//**
+Invalidates the file pages in the buffer pool when an archive recovery is
+completed. All the file pages buffered must be in a replaceable state when
+this function is called: not latched and not modified. */
+UNIV_INTERN
+void
+buf_pool_invalidate(void)
+/*=====================*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_invalidate_instance(buf_pool_from_array(i));
+ }
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Validates data in one buffer pool instance
+@return TRUE */
+static
+ibool
+buf_pool_validate_instance(
+/*=======================*/
+ buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ buf_page_t* b;
+ buf_chunk_t* chunk;
+ ulint i;
+ ulint n_lru_flush = 0;
+ ulint n_page_flush = 0;
+ ulint n_list_flush = 0;
+ ulint n_lru = 0;
+ ulint n_flush = 0;
+ ulint n_free = 0;
+ ulint n_zip = 0;
+ ulint fold = 0;
+ ulint space = 0;
+ ulint offset = 0;
+
+ ut_ad(buf_pool);
+
+ buf_pool_mutex_enter(buf_pool);
+ hash_lock_x_all(buf_pool->page_hash);
+
+ chunk = buf_pool->chunks;
+
+ /* Check the uncompressed blocks. */
+
+ for (i = buf_pool->n_chunks; i--; chunk++) {
+
+ ulint j;
+ buf_block_t* block = chunk->blocks;
+
+ for (j = chunk->size; j--; block++) {
+
+ mutex_enter(&block->mutex);
+
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ /* These should only occur on
+ zip_clean, zip_free[], or flush_list. */
+ ut_error;
+ break;
+
+ case BUF_BLOCK_FILE_PAGE:
+ space = buf_block_get_space(block);
+ offset = buf_block_get_page_no(block);
+ fold = buf_page_address_fold(space, offset);
+ ut_a(buf_page_hash_get_low(buf_pool,
+ space,
+ offset,
+ fold)
+ == &block->page);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(buf_page_get_io_fix(&block->page)
+ == BUF_IO_READ
+ || !ibuf_count_get(buf_block_get_space(
+ block),
+ buf_block_get_page_no(
+ block)));
+#endif /* UNIV_IBUF_COUNT_DEBUG */
+ switch (buf_page_get_io_fix(&block->page)) {
+ case BUF_IO_NONE:
+ break;
+
+ case BUF_IO_WRITE:
+ switch (buf_page_get_flush_type(
+ &block->page)) {
+ case BUF_FLUSH_LRU:
+ n_lru_flush++;
+ goto assert_s_latched;
+ case BUF_FLUSH_SINGLE_PAGE:
+ n_page_flush++;
+assert_s_latched:
+ ut_a(rw_lock_is_locked(
+ &block->lock,
+ RW_LOCK_SHARED));
+ break;
+ case BUF_FLUSH_LIST:
+ n_list_flush++;
+ break;
+ default:
+ ut_error;
+ }
+
+ break;
+
+ case BUF_IO_READ:
+
+ ut_a(rw_lock_is_locked(&block->lock,
+ RW_LOCK_EX));
+ break;
+
+ case BUF_IO_PIN:
+ break;
+ }
+
+ n_lru++;
+ break;
+
+ case BUF_BLOCK_NOT_USED:
+ n_free++;
+ break;
+
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ /* do nothing */
+ break;
+ }
+
+ mutex_exit(&block->mutex);
+ }
+ }
+
+ mutex_enter(&buf_pool->zip_mutex);
+
+ /* Check clean compressed-only blocks. */
+
+ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
+ b = UT_LIST_GET_NEXT(list, b)) {
+ ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
+ switch (buf_page_get_io_fix(b)) {
+ case BUF_IO_NONE:
+ case BUF_IO_PIN:
+ /* All clean blocks should be I/O-unfixed. */
+ break;
+ case BUF_IO_READ:
+ /* In buf_LRU_free_block(), we temporarily set
+ b->io_fix = BUF_IO_READ for a newly allocated
+ control block in order to prevent
+ buf_page_get_gen() from decompressing the block. */
+ break;
+ default:
+ ut_error;
+ break;
+ }
+
+		/* It is OK to read oldest_modification here because
+		we have acquired buf_pool->zip_mutex above, which acts
+		as the 'block->mutex' for these bpages. */
+ ut_a(!b->oldest_modification);
+ fold = buf_page_address_fold(b->space, b->offset);
+ ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset,
+ fold) == b);
+ n_lru++;
+ n_zip++;
+ }
+
+ /* Check dirty blocks. */
+
+ buf_flush_list_mutex_enter(buf_pool);
+ for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
+ b = UT_LIST_GET_NEXT(list, b)) {
+ ut_ad(b->in_flush_list);
+ ut_a(b->oldest_modification);
+ n_flush++;
+
+ switch (buf_page_get_state(b)) {
+ case BUF_BLOCK_ZIP_DIRTY:
+ n_lru++;
+ n_zip++;
+ switch (buf_page_get_io_fix(b)) {
+ case BUF_IO_NONE:
+ case BUF_IO_READ:
+ case BUF_IO_PIN:
+ break;
+ case BUF_IO_WRITE:
+ switch (buf_page_get_flush_type(b)) {
+ case BUF_FLUSH_LRU:
+ n_lru_flush++;
+ break;
+ case BUF_FLUSH_SINGLE_PAGE:
+ n_page_flush++;
+ break;
+ case BUF_FLUSH_LIST:
+ n_list_flush++;
+ break;
+ default:
+ ut_error;
+ }
+ break;
+ }
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ /* uncompressed page */
+ break;
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+ fold = buf_page_address_fold(b->space, b->offset);
+ ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset,
+ fold) == b);
+ }
+
+ ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
+
+ hash_unlock_x_all(buf_pool->page_hash);
+ buf_flush_list_mutex_exit(buf_pool);
+
+ mutex_exit(&buf_pool->zip_mutex);
+
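+	/* Sanity check the counts: buf_pool->curr_size is the number of
+	buffer frames, and every block on the LRU or free list occupies
+	a frame, except for the n_zip compressed-only blocks on the LRU,
+	which have a control block but no uncompressed frame. Hence
+	n_lru + n_free must not exceed curr_size + n_zip. */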
+ if (n_lru + n_free > buf_pool->curr_size + n_zip) {
+ fprintf(stderr, "n LRU %lu, n free %lu, pool %lu zip %lu\n",
+ (ulong) n_lru, (ulong) n_free,
+ (ulong) buf_pool->curr_size, (ulong) n_zip);
+ ut_error;
+ }
+
+ ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
+ if (UT_LIST_GET_LEN(buf_pool->free) != n_free) {
+ fprintf(stderr, "Free list len %lu, free blocks %lu\n",
+ (ulong) UT_LIST_GET_LEN(buf_pool->free),
+ (ulong) n_free);
+ ut_error;
+ }
+
+ ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
+ ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
+ ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush);
+
+ buf_pool_mutex_exit(buf_pool);
+
+ ut_a(buf_LRU_validate());
+ ut_a(buf_flush_validate(buf_pool));
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Validates the buffer buf_pool data structure.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_validate(void)
+/*==============*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_pool_validate_instance(buf_pool);
+ }
+ return(TRUE);
+}
+
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Prints info of the buffer buf_pool data structure for one instance. */
+static
+void
+buf_print_instance(
+/*===============*/
+ buf_pool_t* buf_pool)
+{
+ index_id_t* index_ids;
+ ulint* counts;
+ ulint size;
+ ulint i;
+ ulint j;
+ index_id_t id;
+ ulint n_found;
+ buf_chunk_t* chunk;
+ dict_index_t* index;
+
+ ut_ad(buf_pool);
+
+ size = buf_pool->curr_size;
+
+ index_ids = static_cast<index_id_t*>(
+ mem_alloc(size * sizeof *index_ids));
+
+ counts = static_cast<ulint*>(mem_alloc(sizeof(ulint) * size));
+
+ buf_pool_mutex_enter(buf_pool);
+ buf_flush_list_mutex_enter(buf_pool);
+
+ fprintf(stderr,
+ "buf_pool size %lu\n"
+ "database pages %lu\n"
+ "free pages %lu\n"
+ "modified database pages %lu\n"
+ "n pending decompressions %lu\n"
+ "n pending reads %lu\n"
+ "n pending flush LRU %lu list %lu single page %lu\n"
+ "pages made young %lu, not young %lu\n"
+ "pages read %lu, created %lu, written %lu\n",
+ (ulong) size,
+ (ulong) UT_LIST_GET_LEN(buf_pool->LRU),
+ (ulong) UT_LIST_GET_LEN(buf_pool->free),
+ (ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
+ (ulong) buf_pool->n_pend_unzip,
+ (ulong) buf_pool->n_pend_reads,
+ (ulong) buf_pool->n_flush[BUF_FLUSH_LRU],
+ (ulong) buf_pool->n_flush[BUF_FLUSH_LIST],
+ (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE],
+ (ulong) buf_pool->stat.n_pages_made_young,
+ (ulong) buf_pool->stat.n_pages_not_made_young,
+ (ulong) buf_pool->stat.n_pages_read,
+ (ulong) buf_pool->stat.n_pages_created,
+ (ulong) buf_pool->stat.n_pages_written);
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ /* Count the number of blocks belonging to each index in the buffer */
+
+ n_found = 0;
+
+ chunk = buf_pool->chunks;
+
+ for (i = buf_pool->n_chunks; i--; chunk++) {
+ buf_block_t* block = chunk->blocks;
+ ulint n_blocks = chunk->size;
+
+ for (; n_blocks--; block++) {
+ const buf_frame_t* frame = block->frame;
+
+ if (fil_page_get_type(frame) == FIL_PAGE_INDEX) {
+
+ id = btr_page_get_index_id(frame);
+
+ /* Look for the id in the index_ids array */
+ j = 0;
+
+ while (j < n_found) {
+
+ if (index_ids[j] == id) {
+ counts[j]++;
+
+ break;
+ }
+ j++;
+ }
+
+ if (j == n_found) {
+ n_found++;
+ index_ids[j] = id;
+ counts[j] = 1;
+ }
+ }
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ for (i = 0; i < n_found; i++) {
+ index = dict_index_get_if_in_cache(index_ids[i]);
+
+ fprintf(stderr,
+ "Block count for index %llu in buffer is about %lu",
+ (ullint) index_ids[i],
+ (ulong) counts[i]);
+
+ if (index) {
+ putc(' ', stderr);
+ dict_index_name_print(stderr, NULL, index);
+ }
+
+ putc('\n', stderr);
+ }
+
+ mem_free(index_ids);
+ mem_free(counts);
+
+ ut_a(buf_pool_validate_instance(buf_pool));
+}
+
+/*********************************************************************//**
+Prints info of the buffer buf_pool data structure. */
+UNIV_INTERN
+void
+buf_print(void)
+/*===========*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+ buf_print_instance(buf_pool);
+ }
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the number of latched pages in one buffer pool instance.
+@return number of latched pages */
+UNIV_INTERN
+ulint
+buf_get_latched_pages_number_instance(
+/*==================================*/
+ buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ buf_page_t* b;
+ ulint i;
+ buf_chunk_t* chunk;
+ ulint fixed_pages_number = 0;
+
+ buf_pool_mutex_enter(buf_pool);
+
+ chunk = buf_pool->chunks;
+
+ for (i = buf_pool->n_chunks; i--; chunk++) {
+ buf_block_t* block;
+ ulint j;
+
+ block = chunk->blocks;
+
+ for (j = chunk->size; j--; block++) {
+ if (buf_block_get_state(block)
+ != BUF_BLOCK_FILE_PAGE) {
+
+ continue;
+ }
+
+ mutex_enter(&block->mutex);
+
+ if (block->page.buf_fix_count != 0
+ || buf_page_get_io_fix(&block->page)
+ != BUF_IO_NONE) {
+ fixed_pages_number++;
+ }
+
+ mutex_exit(&block->mutex);
+ }
+ }
+
+ mutex_enter(&buf_pool->zip_mutex);
+
+ /* Traverse the lists of clean and dirty compressed-only blocks. */
+
+ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
+ b = UT_LIST_GET_NEXT(list, b)) {
+ ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
+ ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE);
+
+ if (b->buf_fix_count != 0
+ || buf_page_get_io_fix(b) != BUF_IO_NONE) {
+ fixed_pages_number++;
+ }
+ }
+
+ buf_flush_list_mutex_enter(buf_pool);
+ for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
+ b = UT_LIST_GET_NEXT(list, b)) {
+ ut_ad(b->in_flush_list);
+
+ switch (buf_page_get_state(b)) {
+ case BUF_BLOCK_ZIP_DIRTY:
+ if (b->buf_fix_count != 0
+ || buf_page_get_io_fix(b) != BUF_IO_NONE) {
+ fixed_pages_number++;
+ }
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ /* uncompressed page */
+ break;
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+ }
+
+ buf_flush_list_mutex_exit(buf_pool);
+ mutex_exit(&buf_pool->zip_mutex);
+ buf_pool_mutex_exit(buf_pool);
+
+ return(fixed_pages_number);
+}
+
+/*********************************************************************//**
+Returns the number of latched pages in all the buffer pools.
+@return number of latched pages */
+UNIV_INTERN
+ulint
+buf_get_latched_pages_number(void)
+/*==============================*/
+{
+ ulint i;
+ ulint total_latched_pages = 0;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ total_latched_pages += buf_get_latched_pages_number_instance(
+ buf_pool);
+ }
+
+ return(total_latched_pages);
+}
+
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Returns the number of pending buf pool read ios.
+@return number of pending read I/O operations */
+UNIV_INTERN
+ulint
+buf_get_n_pending_read_ios(void)
+/*============================*/
+{
+ ulint i;
+ ulint pend_ios = 0;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ pend_ios += buf_pool_from_array(i)->n_pend_reads;
+ }
+
+ return(pend_ios);
+}
+
+/*********************************************************************//**
+Returns the ratio, in per cent, of modified pages to all database pages
+in the buffer pool.
+@return modified page percentage ratio */
+UNIV_INTERN
+ulint
+buf_get_modified_ratio_pct(void)
+/*============================*/
+{
+ ulint ratio;
+ ulint lru_len = 0;
+ ulint free_len = 0;
+ ulint flush_list_len = 0;
+
+ buf_get_total_list_len(&lru_len, &free_len, &flush_list_len);
+
+ ratio = (100 * flush_list_len) / (1 + lru_len + free_len);
+
+ /* 1 + is there to avoid division by zero */
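+	/* For example, with lru_len = 900, free_len = 99 and
+	flush_list_len = 300, the result is
+	(100 * 300) / (1 + 900 + 99) = 30 per cent. */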
+
+ return(ratio);
+}
+
+/*******************************************************************//**
+Aggregates one buffer pool's stats information into the total buffer
+pool stats */
+static
+void
+buf_stats_aggregate_pool_info(
+/*==========================*/
+ buf_pool_info_t* total_info, /*!< in/out: the buffer pool
+ info to store aggregated
+ result */
+ const buf_pool_info_t* pool_info) /*!< in: individual buffer pool
+ stats info */
+{
+ ut_a(total_info && pool_info);
+
+ /* Nothing to copy if total_info is the same as pool_info */
+ if (total_info == pool_info) {
+ return;
+ }
+
+ total_info->pool_size += pool_info->pool_size;
+ total_info->lru_len += pool_info->lru_len;
+ total_info->old_lru_len += pool_info->old_lru_len;
+ total_info->free_list_len += pool_info->free_list_len;
+ total_info->flush_list_len += pool_info->flush_list_len;
+ total_info->n_pend_unzip += pool_info->n_pend_unzip;
+ total_info->n_pend_reads += pool_info->n_pend_reads;
+ total_info->n_pending_flush_lru += pool_info->n_pending_flush_lru;
+ total_info->n_pending_flush_list += pool_info->n_pending_flush_list;
+ total_info->n_pages_made_young += pool_info->n_pages_made_young;
+ total_info->n_pages_not_made_young += pool_info->n_pages_not_made_young;
+ total_info->n_pages_read += pool_info->n_pages_read;
+ total_info->n_pages_created += pool_info->n_pages_created;
+ total_info->n_pages_written += pool_info->n_pages_written;
+ total_info->n_page_gets += pool_info->n_page_gets;
+ total_info->n_ra_pages_read_rnd += pool_info->n_ra_pages_read_rnd;
+ total_info->n_ra_pages_read += pool_info->n_ra_pages_read;
+ total_info->n_ra_pages_evicted += pool_info->n_ra_pages_evicted;
+ total_info->page_made_young_rate += pool_info->page_made_young_rate;
+ total_info->page_not_made_young_rate +=
+ pool_info->page_not_made_young_rate;
+ total_info->pages_read_rate += pool_info->pages_read_rate;
+ total_info->pages_created_rate += pool_info->pages_created_rate;
+ total_info->pages_written_rate += pool_info->pages_written_rate;
+ total_info->n_page_get_delta += pool_info->n_page_get_delta;
+ total_info->page_read_delta += pool_info->page_read_delta;
+ total_info->young_making_delta += pool_info->young_making_delta;
+ total_info->not_young_making_delta += pool_info->not_young_making_delta;
+ total_info->pages_readahead_rnd_rate += pool_info->pages_readahead_rnd_rate;
+ total_info->pages_readahead_rate += pool_info->pages_readahead_rate;
+ total_info->pages_evicted_rate += pool_info->pages_evicted_rate;
+ total_info->unzip_lru_len += pool_info->unzip_lru_len;
+ total_info->io_sum += pool_info->io_sum;
+ total_info->io_cur += pool_info->io_cur;
+ total_info->unzip_sum += pool_info->unzip_sum;
+ total_info->unzip_cur += pool_info->unzip_cur;
+}
+/*******************************************************************//**
+Collects buffer pool stats information for a buffer pool. Also
+records aggregated stats if there is more than one buffer pool
+in the server */
+UNIV_INTERN
+void
+buf_stats_get_pool_info(
+/*====================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool */
+ ulint pool_id, /*!< in: buffer pool ID */
+ buf_pool_info_t* all_pool_info) /*!< in/out: buffer pool info
+ to fill */
+{
+ buf_pool_info_t* pool_info;
+ time_t current_time;
+ double time_elapsed;
+
+ /* Find appropriate pool_info to store stats for this buffer pool */
+ pool_info = &all_pool_info[pool_id];
+
+ buf_pool_mutex_enter(buf_pool);
+ buf_flush_list_mutex_enter(buf_pool);
+
+ pool_info->pool_unique_id = pool_id;
+
+ pool_info->pool_size = buf_pool->curr_size;
+
+ pool_info->lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ pool_info->old_lru_len = buf_pool->LRU_old_len;
+
+ pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool->free);
+
+ pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool->flush_list);
+
+ pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+
+ pool_info->n_pend_reads = buf_pool->n_pend_reads;
+
+ pool_info->n_pending_flush_lru =
+ (buf_pool->n_flush[BUF_FLUSH_LRU]
+ + buf_pool->init_flush[BUF_FLUSH_LRU]);
+
+ pool_info->n_pending_flush_list =
+ (buf_pool->n_flush[BUF_FLUSH_LIST]
+ + buf_pool->init_flush[BUF_FLUSH_LIST]);
+
+ pool_info->n_pending_flush_single_page =
+ (buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
+ + buf_pool->init_flush[BUF_FLUSH_SINGLE_PAGE]);
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ current_time = time(NULL);
+ time_elapsed = 0.001 + difftime(current_time,
+ buf_pool->last_printout_time);
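+
+	/* The 0.001 s term guards against division by zero when two
+	printouts happen within the same second. For example, a delta of
+	500 pages read over 10 s gives a pages_read_rate of
+	500 / 10.001, i.e. about 50 reads/s. */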
+
+ pool_info->n_pages_made_young = buf_pool->stat.n_pages_made_young;
+
+ pool_info->n_pages_not_made_young =
+ buf_pool->stat.n_pages_not_made_young;
+
+ pool_info->n_pages_read = buf_pool->stat.n_pages_read;
+
+ pool_info->n_pages_created = buf_pool->stat.n_pages_created;
+
+ pool_info->n_pages_written = buf_pool->stat.n_pages_written;
+
+ pool_info->n_page_gets = buf_pool->stat.n_page_gets;
+
+ pool_info->n_ra_pages_read_rnd = buf_pool->stat.n_ra_pages_read_rnd;
+ pool_info->n_ra_pages_read = buf_pool->stat.n_ra_pages_read;
+
+ pool_info->n_ra_pages_evicted = buf_pool->stat.n_ra_pages_evicted;
+
+ pool_info->page_made_young_rate =
+ (buf_pool->stat.n_pages_made_young
+ - buf_pool->old_stat.n_pages_made_young) / time_elapsed;
+
+ pool_info->page_not_made_young_rate =
+ (buf_pool->stat.n_pages_not_made_young
+ - buf_pool->old_stat.n_pages_not_made_young) / time_elapsed;
+
+ pool_info->pages_read_rate =
+ (buf_pool->stat.n_pages_read
+ - buf_pool->old_stat.n_pages_read) / time_elapsed;
+
+ pool_info->pages_created_rate =
+ (buf_pool->stat.n_pages_created
+ - buf_pool->old_stat.n_pages_created) / time_elapsed;
+
+ pool_info->pages_written_rate =
+ (buf_pool->stat.n_pages_written
+ - buf_pool->old_stat.n_pages_written) / time_elapsed;
+
+ pool_info->n_page_get_delta = buf_pool->stat.n_page_gets
+ - buf_pool->old_stat.n_page_gets;
+
+ if (pool_info->n_page_get_delta) {
+ pool_info->page_read_delta = buf_pool->stat.n_pages_read
+ - buf_pool->old_stat.n_pages_read;
+
+ pool_info->young_making_delta =
+ buf_pool->stat.n_pages_made_young
+ - buf_pool->old_stat.n_pages_made_young;
+
+ pool_info->not_young_making_delta =
+ buf_pool->stat.n_pages_not_made_young
+ - buf_pool->old_stat.n_pages_not_made_young;
+ }
+ pool_info->pages_readahead_rnd_rate =
+ (buf_pool->stat.n_ra_pages_read_rnd
+ - buf_pool->old_stat.n_ra_pages_read_rnd) / time_elapsed;
+
+ pool_info->pages_readahead_rate =
+ (buf_pool->stat.n_ra_pages_read
+ - buf_pool->old_stat.n_ra_pages_read) / time_elapsed;
+
+ pool_info->pages_evicted_rate =
+ (buf_pool->stat.n_ra_pages_evicted
+ - buf_pool->old_stat.n_ra_pages_evicted) / time_elapsed;
+
+ pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+
+ pool_info->io_sum = buf_LRU_stat_sum.io;
+
+ pool_info->io_cur = buf_LRU_stat_cur.io;
+
+ pool_info->unzip_sum = buf_LRU_stat_sum.unzip;
+
+ pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
+
+ buf_refresh_io_stats(buf_pool);
+ buf_pool_mutex_exit(buf_pool);
+}
+
+/*********************************************************************//**
+Prints info of the buffer i/o for one buffer pool instance. */
+UNIV_INTERN
+void
+buf_print_io_instance(
+/*==================*/
+	buf_pool_info_t* pool_info,	/*!< in: buffer pool info */
+ FILE* file) /*!< in/out: buffer where to print */
+{
+ ut_ad(pool_info);
+
+ fprintf(file,
+ "Buffer pool size %lu\n"
+ "Free buffers %lu\n"
+ "Database pages %lu\n"
+ "Old database pages %lu\n"
+ "Modified db pages %lu\n"
+ "Pending reads %lu\n"
+ "Pending writes: LRU %lu, flush list %lu single page %lu\n",
+ pool_info->pool_size,
+ pool_info->free_list_len,
+ pool_info->lru_len,
+ pool_info->old_lru_len,
+ pool_info->flush_list_len,
+ pool_info->n_pend_reads,
+ pool_info->n_pending_flush_lru,
+ pool_info->n_pending_flush_list,
+ pool_info->n_pending_flush_single_page);
+
+ fprintf(file,
+ "Pages made young %lu, not young %lu\n"
+ "%.2f youngs/s, %.2f non-youngs/s\n"
+ "Pages read %lu, created %lu, written %lu\n"
+ "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
+ pool_info->n_pages_made_young,
+ pool_info->n_pages_not_made_young,
+ pool_info->page_made_young_rate,
+ pool_info->page_not_made_young_rate,
+ pool_info->n_pages_read,
+ pool_info->n_pages_created,
+ pool_info->n_pages_written,
+ pool_info->pages_read_rate,
+ pool_info->pages_created_rate,
+ pool_info->pages_written_rate);
+
+ if (pool_info->n_page_get_delta) {
+ fprintf(file,
+ "Buffer pool hit rate %lu / 1000,"
+ " young-making rate %lu / 1000 not %lu / 1000\n",
+ (ulong) (1000 - (1000 * pool_info->page_read_delta
+ / pool_info->n_page_get_delta)),
+ (ulong) (1000 * pool_info->young_making_delta
+ / pool_info->n_page_get_delta),
+ (ulong) (1000 * pool_info->not_young_making_delta
+ / pool_info->n_page_get_delta));
+ } else {
+ fputs("No buffer pool page gets since the last printout\n",
+ file);
+ }
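+
+	/* E.g. 10000 page gets with 300 physical page reads since the
+	last printout yield a hit rate of
+	1000 - (1000 * 300 / 10000) = 970 / 1000. */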
+
+ /* Statistics about read ahead algorithm */
+ fprintf(file, "Pages read ahead %.2f/s,"
+ " evicted without access %.2f/s,"
+ " Random read ahead %.2f/s\n",
+
+ pool_info->pages_readahead_rate,
+ pool_info->pages_evicted_rate,
+ pool_info->pages_readahead_rnd_rate);
+
+ /* Print some values to help us with visualizing what is
+ happening with LRU eviction. */
+ fprintf(file,
+ "LRU len: %lu, unzip_LRU len: %lu\n"
+ "I/O sum[%lu]:cur[%lu], unzip sum[%lu]:cur[%lu]\n",
+ pool_info->lru_len, pool_info->unzip_lru_len,
+ pool_info->io_sum, pool_info->io_cur,
+ pool_info->unzip_sum, pool_info->unzip_cur);
+}
+
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+UNIV_INTERN
+void
+buf_print_io(
+/*=========*/
+ FILE* file) /*!< in/out: buffer where to print */
+{
+ ulint i;
+ buf_pool_info_t* pool_info;
+ buf_pool_info_t* pool_info_total;
+
+	/* If srv_buf_pool_instances is greater than 1, allocate
+	one extra buf_pool_info_t; the last one stores
+	aggregated/total values from all pools */
+ if (srv_buf_pool_instances > 1) {
+ pool_info = (buf_pool_info_t*) mem_zalloc((
+ srv_buf_pool_instances + 1) * sizeof *pool_info);
+
+ pool_info_total = &pool_info[srv_buf_pool_instances];
+ } else {
+ ut_a(srv_buf_pool_instances == 1);
+
+ pool_info_total = pool_info =
+ static_cast<buf_pool_info_t*>(
+ mem_zalloc(sizeof *pool_info));
+ }
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ /* Fetch individual buffer pool info and calculate
+ aggregated stats along the way */
+ buf_stats_get_pool_info(buf_pool, i, pool_info);
+
+ /* If we have more than one buffer pool, store
+ the aggregated stats */
+ if (srv_buf_pool_instances > 1) {
+ buf_stats_aggregate_pool_info(pool_info_total,
+ &pool_info[i]);
+ }
+ }
+
+	/* Print the aggregate buffer pool info */
+ buf_print_io_instance(pool_info_total, file);
+
+	/* If there is more than one buffer pool, print each individual
+	pool's info */
+ if (srv_buf_pool_instances > 1) {
+ fputs("----------------------\n"
+ "INDIVIDUAL BUFFER POOL INFO\n"
+ "----------------------\n", file);
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ fprintf(file, "---BUFFER POOL %lu\n", i);
+ buf_print_io_instance(&pool_info[i], file);
+ }
+ }
+
+ mem_free(pool_info);
+}
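+
+/* A minimal sketch (illustrative only) of invoking the printout above.
+In the server this output appears in SHOW ENGINE INNODB STATUS; the
+standalone wrapper below is hypothetical. */
+#if 0
+static
+void
+example_dump_buffer_pool_stats(void)
+{
+	/* Prints the aggregated stats and, when
+	srv_buf_pool_instances > 1, the per-instance stats. */
+	buf_print_io(stderr);
+}
+#endif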
+
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+buf_refresh_io_stats(
+/*=================*/
+ buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ buf_pool->last_printout_time = ut_time();
+ buf_pool->old_stat = buf_pool->stat;
+}
+
+/**********************************************************************//**
+Refreshes, for all buffer pool instances, the statistics used to print
+per-second averages. */
+UNIV_INTERN
+void
+buf_refresh_io_stats_all(void)
+/*==========================*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_refresh_io_stats(buf_pool);
+ }
+}
+
+/**********************************************************************//**
+Checks whether all pages in all buffer pools are in a replaceable state.
+@return FALSE if not */
+UNIV_INTERN
+ibool
+buf_all_freed(void)
+/*===============*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ if (!buf_all_freed_instance(buf_pool)) {
+ return(FALSE);
+ }
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Checks that there currently are no pending i/o operations for the buffer
+pool.
+@return number of pending i/o */
+UNIV_INTERN
+ulint
+buf_pool_check_no_pending_io(void)
+/*==============================*/
+{
+ ulint i;
+ ulint pending_io = 0;
+
+ buf_pool_mutex_enter_all();
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ const buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ pending_io += buf_pool->n_pend_reads
+ + buf_pool->n_flush[BUF_FLUSH_LRU]
+ + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
+ + buf_pool->n_flush[BUF_FLUSH_LIST];
+
+ }
+
+ buf_pool_mutex_exit_all();
+
+ return(pending_io);
+}
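+
+/* An illustrative sketch (not part of this file) of how a shutdown
+path might use buf_pool_check_no_pending_io(): poll until no i/o
+remains pending. The wrapper name and the 100 ms sleep interval are
+assumptions. */
+#if 0
+static
+void
+example_wait_for_pending_io(void)
+{
+	while (buf_pool_check_no_pending_io() > 0) {
+		/* Reads or flushes are still in flight: sleep
+		briefly and re-check. */
+		os_thread_sleep(100000);	/* 0.1 s */
+	}
+}
+#endif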
+
+#if 0
+Code currently not used
+/*********************************************************************//**
+Gets the current length of the free list of buffer blocks.
+@return length of the free list */
+UNIV_INTERN
+ulint
+buf_get_free_list_len(void)
+/*=======================*/
+{
+ ulint len;
+
+ buf_pool_mutex_enter(buf_pool);
+
+ len = UT_LIST_GET_LEN(buf_pool->free);
+
+ buf_pool_mutex_exit(buf_pool);
+
+ return(len);
+}
+#endif
+
+#else /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Inits a page in the buffer buf_pool, for use in ibbackup --restore. */
+UNIV_INTERN
+void
+buf_page_init_for_backup_restore(
+/*=============================*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space
+ in units of a page */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ buf_block_t* block) /*!< in: block to init */
+{
+ block->page.state = BUF_BLOCK_FILE_PAGE;
+ block->page.space = space;
+ block->page.offset = offset;
+
+ page_zip_des_init(&block->page.zip);
+
+ /* We assume that block->page.data has been allocated
+ with zip_size == UNIV_PAGE_SIZE. */
+ ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+ ut_ad(ut_is_2pow(zip_size));
+ page_zip_set_size(&block->page.zip, zip_size);
+ if (zip_size) {
+ block->page.zip.data = block->frame + UNIV_PAGE_SIZE;
+ }
+}
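+
+/* A hypothetical sketch of calling buf_page_init_for_backup_restore()
+while restoring a backup: zip_size must be 0 for uncompressed pages, or
+a power of two (1024 to 16384 bytes) for compressed pages. The space id
+and page number below are arbitrary. */
+#if 0
+static
+void
+example_restore_page(
+	buf_block_t*	block)	/*!< in: block to init */
+{
+	buf_page_init_for_backup_restore(
+		10,	/* space id (arbitrary) */
+		42,	/* page number within the space */
+		8192,	/* compressed page size in bytes */
+		block);
+}
+#endif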
+#endif /* !UNIV_HOTBACKUP */