diff options
Diffstat (limited to 'storage/innobase/buf/buf0buf.c')
-rw-r--r-- | storage/innobase/buf/buf0buf.c | 2395 |
1 files changed, 2395 insertions, 0 deletions
diff --git a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c new file mode 100644 index 00000000000..89f851709db --- /dev/null +++ b/storage/innobase/buf/buf0buf.c @@ -0,0 +1,2395 @@ +/* Innobase relational database engine; Copyright (C) 2001 Innobase Oy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License 2 + as published by the Free Software Foundation in June 1991. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License 2 + along with this program (in file COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +/****************************************************** +The database buffer buf_pool + +(c) 1995 Innobase Oy + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0buf.h" + +#ifdef UNIV_NONINL +#include "buf0buf.ic" +#endif + +#include "mem0mem.h" +#include "btr0btr.h" +#include "fil0fil.h" +#include "lock0lock.h" +#include "btr0sea.h" +#include "ibuf0ibuf.h" +#include "dict0dict.h" +#include "log0recv.h" +#include "log0log.h" +#include "trx0undo.h" +#include "srv0srv.h" + +/* + IMPLEMENTATION OF THE BUFFER POOL + ================================= + +Performance improvement: +------------------------ +Thread scheduling in NT may be so slow that the OS wait mechanism should +not be used even in waiting for disk reads to complete. +Rather, we should put waiting query threads to the queue of +waiting jobs, and let the OS thread do something useful while the i/o +is processed. In this way we could remove most OS thread switches in +an i/o-intensive benchmark like TPC-C. 
+ +A possibility is to put a user space thread library between the database +and NT. User space thread libraries might be very fast. + +SQL Server 7.0 can be configured to use 'fibers' which are lightweight +threads in NT. These should be studied. + + Buffer frames and blocks + ------------------------ +Following the terminology of Gray and Reuter, we call the memory +blocks where file pages are loaded buffer frames. For each buffer +frame there is a control block, or shortly, a block, in the buffer +control array. The control info which does not need to be stored +in the file along with the file page, resides in the control block. + + Buffer pool struct + ------------------ +The buffer buf_pool contains a single mutex which protects all the +control data structures of the buf_pool. The content of a buffer frame is +protected by a separate read-write lock in its control block, though. +These locks can be locked and unlocked without owning the buf_pool mutex. +The OS events in the buf_pool struct can be waited for without owning the +buf_pool mutex. + +The buf_pool mutex is a hot-spot in main memory, causing a lot of +memory bus traffic on multiprocessor systems when processors +alternately access the mutex. On our Pentium, the mutex is accessed +maybe every 10 microseconds. We gave up the solution to have mutexes +for each control block, for instance, because it seemed to be +complicated. + +A solution to reduce mutex contention of the buf_pool mutex is to +create a separate mutex for the page hash table. On Pentium, +accessing the hash table takes 2 microseconds, about half +of the total buf_pool mutex hold time. + + Control blocks + -------------- + +The control block contains, for instance, the bufferfix count +which is incremented when a thread wants a file page to be fixed +in a buffer frame. The bufferfix operation does not lock the +contents of the frame, however. For this purpose, the control +block contains a read-write lock. 
+ +The buffer frames have to be aligned so that the start memory +address of a frame is divisible by the universal page size, which +is a power of two. + +We intend to make the buffer buf_pool size on-line reconfigurable, +that is, the buf_pool size can be changed without closing the database. +Then the database administrator may adjust it to be bigger +at night, for example. The control block array must +contain enough control blocks for the maximum buffer buf_pool size +which is used in the particular database. +If the buf_pool size is cut, we exploit the virtual memory mechanism of +the OS, and just refrain from using frames at high addresses. Then the OS +can swap them to disk. + +The control blocks containing file pages are put to a hash table +according to the file address of the page. +We could speed up the access to an individual page by using +"pointer swizzling": we could replace the page references on +non-leaf index pages by direct pointers to the page, if it exists +in the buf_pool. We could make a separate hash table where we could +chain all the page references in non-leaf pages residing in the buf_pool, +using the page reference as the hash key, +and at the time of reading of a page update the pointers accordingly. +Drawbacks of this solution are added complexity and, +possibly, extra space required on non-leaf pages for memory pointers. +A simpler solution is just to speed up the hash table mechanism +in the database, using tables whose size is a power of 2. + + Lists of blocks + --------------- + +There are several lists of control blocks. The free list contains +blocks which are currently not used. + +The LRU-list contains all the blocks holding a file page +except those for which the bufferfix count is non-zero. +The pages are in the LRU list roughly in the order of the last +access to the page, so that the oldest pages are at the end of the +list. 
We also keep a pointer to near the end of the LRU list, +which we can use when we want to artificially age a page in the +buf_pool. This is used if we know that some page is not needed +again for some time: we insert the block right after the pointer, +causing it to be replaced sooner than would normally be the case. +Currently this aging mechanism is used for read-ahead mechanism +of pages, and it can also be used when there is a scan of a full +table which cannot fit in the memory. Putting the pages near the end +of the LRU list, we make sure that most of the buf_pool stays in the +main memory, undisturbed. + +The chain of modified blocks contains the blocks +holding file pages that have been modified in the memory +but not written to disk yet. The block with the oldest modification +which has not yet been written to disk is at the end of the chain. + + Loading a file page + ------------------- + +First, a victim block for replacement has to be found in the +buf_pool. It is taken from the free list or searched for from the +end of the LRU-list. An exclusive lock is reserved for the frame, +the io_fix field is set in the block fixing the block in buf_pool, +and the io-operation for loading the page is queued. The io-handler thread +releases the X-lock on the frame and resets the io_fix field +when the io operation completes. + +A thread may request the above operation using the buf_page_get- +function. It may then continue to request a lock on the frame. +The lock is granted when the io-handler releases the x-lock. + + Read-ahead + ---------- + +The read-ahead mechanism is intended to be intelligent and +isolated from the semantically higher levels of the database +index management. From the higher level we only need the +information if a file page has a natural successor or +predecessor page. On the leaf level of a B-tree index, +these are the next and previous pages in the natural +order of the pages. 
+ +Let us first explain the read-ahead mechanism when the leafs +of a B-tree are scanned in an ascending or descending order. +When a read page is the first time referenced in the buf_pool, +the buffer manager checks if it is at the border of a so-called +linear read-ahead area. The tablespace is divided into these +areas of size 64 blocks, for example. So if the page is at the +border of such an area, the read-ahead mechanism checks if +all the other blocks in the area have been accessed in an +ascending or descending order. If this is the case, the system +looks at the natural successor or predecessor of the page, +checks if that is at the border of another area, and in this case +issues read-requests for all the pages in that area. Maybe +we could relax the condition that all the pages in the area +have to be accessed: if data is deleted from a table, there may +appear holes of unused pages in the area. + +A different read-ahead mechanism is used when there appears +to be a random access pattern to a file. +If a new page is referenced in the buf_pool, and several pages +of its random access area (for instance, 32 consecutive pages +in a tablespace) have recently been referenced, we may predict +that the whole area may be needed in the near future, and issue +the read requests for the whole area. + + AWE implementation + ------------------ + +By a 'block' we mean the buffer header of type buf_block_t. By a 'page' +we mean the physical 16 kB memory area allocated from RAM for that block. +By a 'frame' we mean a 16 kB area in the virtual address space of the +process, in the frame_mem of buf_pool. + +We can map pages to the frames of the buffer pool. + +1) A buffer block allocated to use as a non-data page, e.g., to the lock +table, is always mapped to a frame. +2) A bufferfixed or io-fixed data page is always mapped to a frame. 
+3) When we need to map a block to frame, we look from the list +awe_LRU_free_mapped and try to unmap its last block, but note that +bufferfixed or io-fixed pages cannot be unmapped. +4) For every frame in the buffer pool there is always a block whose page is +mapped to it. When we create the buffer pool, we map the first elements +in the free list to the frames. +5) When we have AWE enabled, we disable adaptive hash indexes. +*/ + +buf_pool_t* buf_pool = NULL; /* The buffer buf_pool of the database */ + +ulint buf_dbg_counter = 0; /* This is used to insert validation + operations in excution in the + debug version */ +ibool buf_debug_prints = FALSE; /* If this is set TRUE, + the program prints info whenever + read-ahead or flush occurs */ + +/************************************************************************ +Calculates a page checksum which is stored to the page when it is written +to a file. Note that we must be careful to calculate the same value on +32-bit and 64-bit architectures. */ + +ulint +buf_calc_page_new_checksum( +/*=======================*/ + /* out: checksum */ + byte* page) /* in: buffer page */ +{ + ulint checksum; + + /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x + ..._ARCH_LOG_NO, are written outside the buffer pool to the first + pages of data files, we have to skip them in the page checksum + calculation. + We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the + checksum is stored, and also the last 8 bytes of page because + there we store the old formula checksum. 
*/ + + checksum = ut_fold_binary(page + FIL_PAGE_OFFSET, + FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET) + + ut_fold_binary(page + FIL_PAGE_DATA, + UNIV_PAGE_SIZE - FIL_PAGE_DATA + - FIL_PAGE_END_LSN_OLD_CHKSUM); + checksum = checksum & 0xFFFFFFFFUL; + + return(checksum); +} + +/************************************************************************ +In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only +looked at the first few bytes of the page. This calculates that old +checksum. +NOTE: we must first store the new formula checksum to +FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum +because this takes that field as an input! */ + +ulint +buf_calc_page_old_checksum( +/*=======================*/ + /* out: checksum */ + byte* page) /* in: buffer page */ +{ + ulint checksum; + + checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN); + + checksum = checksum & 0xFFFFFFFFUL; + + return(checksum); +} + +/************************************************************************ +Checks if a page is corrupt. */ + +ibool +buf_page_is_corrupted( +/*==================*/ + /* out: TRUE if corrupted */ + byte* read_buf) /* in: a database page */ +{ + ulint checksum; + ulint old_checksum; + ulint checksum_field; + ulint old_checksum_field; +#ifndef UNIV_HOTBACKUP + dulint current_lsn; +#endif + if (mach_read_from_4(read_buf + FIL_PAGE_LSN + 4) + != mach_read_from_4(read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) { + + /* Stored log sequence numbers at the start and the end + of page do not match */ + + return(TRUE); + } + +#ifndef UNIV_HOTBACKUP + if (recv_lsn_checks_on && log_peek_lsn(¤t_lsn)) { + if (ut_dulint_cmp(current_lsn, + mach_read_from_8(read_buf + FIL_PAGE_LSN)) + < 0) { + ut_print_timestamp(stderr); + + fprintf(stderr, +" InnoDB: Error: page %lu log sequence number %lu %lu\n" +"InnoDB: is in the future! 
Current system log sequence number %lu %lu.\n" +"InnoDB: Your database may be corrupt.\n", + (ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET), + (ulong) ut_dulint_get_high( + mach_read_from_8(read_buf + FIL_PAGE_LSN)), + (ulong) ut_dulint_get_low( + mach_read_from_8(read_buf + FIL_PAGE_LSN)), + (ulong) ut_dulint_get_high(current_lsn), + (ulong) ut_dulint_get_low(current_lsn)); + } + } +#endif + + /* If we use checksums validation, make additional check before returning + TRUE to ensure that the checksum is not equal to BUF_NO_CHECKSUM_MAGIC which + might be stored by InnoDB with checksums disabled. + Otherwise, skip checksum calculation and return FALSE */ + + if (srv_use_checksums) { + old_checksum = buf_calc_page_old_checksum(read_buf); + + old_checksum_field = mach_read_from_4(read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM); + + /* There are 2 valid formulas for old_checksum_field: + 1. Very old versions of InnoDB only stored 8 byte lsn to the start + and the end of the page. + 2. Newer InnoDB versions store the old formula checksum there. */ + + if (old_checksum_field != mach_read_from_4(read_buf + FIL_PAGE_LSN) + && old_checksum_field != old_checksum + && old_checksum_field != BUF_NO_CHECKSUM_MAGIC) { + + return(TRUE); + } + + checksum = buf_calc_page_new_checksum(read_buf); + checksum_field = mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM); + + /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id + (always equal to 0), to FIL_PAGE_SPACE_SPACE_OR_CHKSUM */ + + if (checksum_field != 0 && checksum_field != checksum + && checksum_field != BUF_NO_CHECKSUM_MAGIC) { + + return(TRUE); + } + } + + return(FALSE); +} + +/************************************************************************ +Prints a page to stderr. 
*/ + +void +buf_page_print( +/*===========*/ + byte* read_buf) /* in: a database page */ +{ + dict_index_t* index; + ulint checksum; + ulint old_checksum; + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Page dump in ascii and hex (%lu bytes):\n", + (ulint)UNIV_PAGE_SIZE); + ut_print_buf(stderr, read_buf, UNIV_PAGE_SIZE); + fputs("InnoDB: End of page dump\n", stderr); + + checksum = srv_use_checksums ? + buf_calc_page_new_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC; + old_checksum = srv_use_checksums ? + buf_calc_page_old_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC; + + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Page checksum %lu, prior-to-4.0.14-form checksum %lu\n" +"InnoDB: stored checksum %lu, prior-to-4.0.14-form stored checksum %lu\n", + (ulong) checksum, (ulong) old_checksum, + (ulong) mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM), + (ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM)); + fprintf(stderr, +"InnoDB: Page lsn %lu %lu, low 4 bytes of lsn at page end %lu\n" +"InnoDB: Page number (if stored to page already) %lu,\n" +"InnoDB: space id (if created with >= MySQL-4.1.1 and stored already) %lu\n", + (ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN), + (ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN + 4), + (ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), + (ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET), + (ulong) mach_read_from_4(read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)); + + if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) + == TRX_UNDO_INSERT) { + fprintf(stderr, + "InnoDB: Page may be an insert undo log page\n"); + } else if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) + == TRX_UNDO_UPDATE) { + fprintf(stderr, + "InnoDB: Page may be an update undo log page\n"); + } + + if (fil_page_get_type(read_buf) == FIL_PAGE_INDEX) { + fprintf(stderr, +"InnoDB: Page may be an index page where index id 
is %lu %lu\n", + (ulong) ut_dulint_get_high(btr_page_get_index_id(read_buf)), + (ulong) ut_dulint_get_low(btr_page_get_index_id(read_buf))); + + /* If the code is in ibbackup, dict_sys may be uninitialized, + i.e., NULL */ + + if (dict_sys != NULL) { + + index = dict_index_find_on_id_low( + btr_page_get_index_id(read_buf)); + if (index) { + fputs("InnoDB: (", stderr); + dict_index_name_print(stderr, NULL, index); + fputs(")\n", stderr); + } + } + } else if (fil_page_get_type(read_buf) == FIL_PAGE_INODE) { + fputs("InnoDB: Page may be an 'inode' page\n", stderr); + } else if (fil_page_get_type(read_buf) == FIL_PAGE_IBUF_FREE_LIST) { + fputs("InnoDB: Page may be an insert buffer free list page\n", + stderr); + } +} + +/************************************************************************ +Initializes a buffer control block when the buf_pool is created. */ +static +void +buf_block_init( +/*===========*/ + buf_block_t* block, /* in: pointer to control block */ + byte* frame) /* in: pointer to buffer frame, or NULL if in + the case of AWE there is no frame */ +{ + block->state = BUF_BLOCK_NOT_USED; + + block->frame = frame; + + block->awe_info = NULL; + + block->modify_clock = ut_dulint_zero; + + block->file_page_was_freed = FALSE; + + block->check_index_page_at_flush = FALSE; + block->index = NULL; + + block->in_free_list = FALSE; + block->in_LRU_list = FALSE; + + block->n_pointers = 0; + + rw_lock_create(&(block->lock)); + ut_ad(rw_lock_validate(&(block->lock))); + +#ifdef UNIV_SYNC_DEBUG + rw_lock_create(&(block->debug_latch)); + rw_lock_set_level(&(block->debug_latch), SYNC_NO_ORDER_CHECK); +#endif /* UNIV_SYNC_DEBUG */ +} + +/************************************************************************ +Creates the buffer pool. 
*/ + +buf_pool_t* +buf_pool_init( +/*==========*/ + /* out, own: buf_pool object, NULL if not + enough memory or error */ + ulint max_size, /* in: maximum size of the buf_pool in + blocks */ + ulint curr_size, /* in: current size to use, must be <= + max_size, currently must be equal to + max_size */ + ulint n_frames) /* in: number of frames; if AWE is used, + this is the size of the address space window + where physical memory pages are mapped; if + AWE is not used then this must be the same + as max_size */ +{ + byte* frame; + ulint i; + buf_block_t* block; + + ut_a(max_size == curr_size); + ut_a(srv_use_awe || n_frames == max_size); + + if (n_frames > curr_size) { + fprintf(stderr, +"InnoDB: AWE: Error: you must specify in my.cnf .._awe_mem_mb larger\n" +"InnoDB: than .._buffer_pool_size. Now the former is %lu pages,\n" +"InnoDB: the latter %lu pages.\n", (ulong) curr_size, (ulong) n_frames); + + return(NULL); + } + + buf_pool = mem_alloc(sizeof(buf_pool_t)); + + /* 1. Initialize general fields + ---------------------------- */ + mutex_create(&(buf_pool->mutex)); + mutex_set_level(&(buf_pool->mutex), SYNC_BUF_POOL); + + mutex_enter(&(buf_pool->mutex)); + + if (srv_use_awe) { + /*----------------------------------------*/ + /* Allocate the virtual address space window, i.e., the + buffer pool frames */ + + buf_pool->frame_mem = os_awe_allocate_virtual_mem_window( + UNIV_PAGE_SIZE * (n_frames + 1)); + + /* Allocate the physical memory for AWE and the AWE info array + for buf_pool */ + + if ((curr_size % ((1024 * 1024) / UNIV_PAGE_SIZE)) != 0) { + + fprintf(stderr, +"InnoDB: AWE: Error: physical memory must be allocated in full megabytes.\n" +"InnoDB: Trying to allocate %lu database pages.\n", + (ulong) curr_size); + + return(NULL); + } + + if (!os_awe_allocate_physical_mem(&(buf_pool->awe_info), + curr_size / ((1024 * 1024) / UNIV_PAGE_SIZE))) { + + return(NULL); + } + /*----------------------------------------*/ + } else { + buf_pool->frame_mem = 
os_mem_alloc_large( + UNIV_PAGE_SIZE * (n_frames + 1), + TRUE, FALSE); + } + + if (buf_pool->frame_mem == NULL) { + + return(NULL); + } + + buf_pool->blocks = ut_malloc(sizeof(buf_block_t) * max_size); + + if (buf_pool->blocks == NULL) { + + return(NULL); + } + + buf_pool->max_size = max_size; + buf_pool->curr_size = curr_size; + + buf_pool->n_frames = n_frames; + + /* Align pointer to the first frame */ + + frame = ut_align(buf_pool->frame_mem, UNIV_PAGE_SIZE); + + buf_pool->frame_zero = frame; + buf_pool->high_end = frame + UNIV_PAGE_SIZE * n_frames; + + if (srv_use_awe) { + /*----------------------------------------*/ + /* Map an initial part of the allocated physical memory to + the window */ + + os_awe_map_physical_mem_to_window(buf_pool->frame_zero, + n_frames * + (UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE), + buf_pool->awe_info); + /*----------------------------------------*/ + } + + buf_pool->blocks_of_frames = ut_malloc(sizeof(void*) * n_frames); + + if (buf_pool->blocks_of_frames == NULL) { + + return(NULL); + } + + /* Init block structs and assign frames for them; in the case of + AWE there are less frames than blocks. Then we assign the frames + to the first blocks (we already mapped the memory above). We also + init the awe_info for every block. 
*/ + + for (i = 0; i < max_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + + if (i < n_frames) { + frame = buf_pool->frame_zero + i * UNIV_PAGE_SIZE; + *(buf_pool->blocks_of_frames + i) = block; + } else { + frame = NULL; + } + + buf_block_init(block, frame); + + if (srv_use_awe) { + /*----------------------------------------*/ + block->awe_info = buf_pool->awe_info + + i * (UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE); + /*----------------------------------------*/ + } + } + + buf_pool->page_hash = hash_create(2 * max_size); + + buf_pool->n_pend_reads = 0; + + buf_pool->last_printout_time = time(NULL); + + buf_pool->n_pages_read = 0; + buf_pool->n_pages_written = 0; + buf_pool->n_pages_created = 0; + buf_pool->n_pages_awe_remapped = 0; + + buf_pool->n_page_gets = 0; + buf_pool->n_page_gets_old = 0; + buf_pool->n_pages_read_old = 0; + buf_pool->n_pages_written_old = 0; + buf_pool->n_pages_created_old = 0; + buf_pool->n_pages_awe_remapped_old = 0; + + /* 2. Initialize flushing fields + ---------------------------- */ + UT_LIST_INIT(buf_pool->flush_list); + + for (i = BUF_FLUSH_LRU; i <= BUF_FLUSH_LIST; i++) { + buf_pool->n_flush[i] = 0; + buf_pool->init_flush[i] = FALSE; + buf_pool->no_flush[i] = os_event_create(NULL); + } + + buf_pool->LRU_flush_ended = 0; + + buf_pool->ulint_clock = 1; + buf_pool->freed_page_clock = 0; + + /* 3. 
Initialize LRU fields + ---------------------------- */ + UT_LIST_INIT(buf_pool->LRU); + + buf_pool->LRU_old = NULL; + + UT_LIST_INIT(buf_pool->awe_LRU_free_mapped); + + /* Add control blocks to the free list */ + UT_LIST_INIT(buf_pool->free); + + for (i = 0; i < curr_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + + if (block->frame) { + /* Wipe contents of frame to eliminate a Purify + warning */ + +#ifdef HAVE_purify + memset(block->frame, '\0', UNIV_PAGE_SIZE); +#endif + if (srv_use_awe) { + /* Add to the list of blocks mapped to + frames */ + + UT_LIST_ADD_LAST(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, block); + } + } + + UT_LIST_ADD_LAST(free, buf_pool->free, block); + block->in_free_list = TRUE; + } + + mutex_exit(&(buf_pool->mutex)); + + if (srv_use_adaptive_hash_indexes) { + btr_search_sys_create( + curr_size * UNIV_PAGE_SIZE / sizeof(void*) / 64); + } else { + /* Create only a small dummy system */ + btr_search_sys_create(1000); + } + + return(buf_pool); +} + +/************************************************************************ +Maps the page of block to a frame, if not mapped yet. Unmaps some page +from the end of the awe_LRU_free_mapped. 
*/ + +void +buf_awe_map_page_to_frame( +/*======================*/ + buf_block_t* block, /* in: block whose page should be + mapped to a frame */ + ibool add_to_mapped_list) /* in: TRUE if we in the case + we need to map the page should also + add the block to the + awe_LRU_free_mapped list */ +{ + buf_block_t* bck; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(buf_pool->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(block); + + if (block->frame) { + + return; + } + + /* Scan awe_LRU_free_mapped from the end and try to find a block + which is not bufferfixed or io-fixed */ + + bck = UT_LIST_GET_LAST(buf_pool->awe_LRU_free_mapped); + + while (bck) { + if (bck->state == BUF_BLOCK_FILE_PAGE + && (bck->buf_fix_count != 0 || bck->io_fix != 0)) { + + /* We have to skip this */ + bck = UT_LIST_GET_PREV(awe_LRU_free_mapped, bck); + } else { + /* We can map block to the frame of bck */ + + os_awe_map_physical_mem_to_window( + bck->frame, + UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE, + block->awe_info); + + block->frame = bck->frame; + + *(buf_pool->blocks_of_frames + + (((ulint)(block->frame + - buf_pool->frame_zero)) + >> UNIV_PAGE_SIZE_SHIFT)) + = block; + + bck->frame = NULL; + UT_LIST_REMOVE(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, + bck); + + if (add_to_mapped_list) { + UT_LIST_ADD_FIRST(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, + block); + } + + buf_pool->n_pages_awe_remapped++; + + return; + } + } + + fprintf(stderr, +"InnoDB: AWE: Fatal error: cannot find a page to unmap\n" +"InnoDB: awe_LRU_free_mapped list length %lu\n", + (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped)); + + ut_a(0); +} + +/************************************************************************ +Allocates a buffer block. 
*/ +UNIV_INLINE +buf_block_t* +buf_block_alloc(void) +/*=================*/ + /* out, own: the allocated block; also if AWE + is used it is guaranteed that the page is + mapped to a frame */ +{ + buf_block_t* block; + + block = buf_LRU_get_free_block(); + + return(block); +} + +/************************************************************************ +Moves to the block to the start of the LRU list if there is a danger +that the block would drift out of the buffer pool. */ +UNIV_INLINE +void +buf_block_make_young( +/*=================*/ + buf_block_t* block) /* in: block to make younger */ +{ + if (buf_pool->freed_page_clock >= block->freed_page_clock + + 1 + (buf_pool->curr_size / 1024)) { + + /* There has been freeing activity in the LRU list: + best to move to the head of the LRU list */ + + buf_LRU_make_block_young(block); + } +} + +/************************************************************************ +Moves a page to the start of the buffer pool LRU list. This high-level +function can be used to prevent an important page from from slipping out of +the buffer pool. */ + +void +buf_page_make_young( +/*=================*/ + buf_frame_t* frame) /* in: buffer frame of a file page */ +{ + buf_block_t* block; + + mutex_enter(&(buf_pool->mutex)); + + block = buf_block_align(frame); + + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + + buf_LRU_make_block_young(block); + + mutex_exit(&(buf_pool->mutex)); +} + +/************************************************************************ +Frees a buffer block which does not contain a file page. */ +UNIV_INLINE +void +buf_block_free( +/*===========*/ + buf_block_t* block) /* in, own: block to be freed */ +{ + ut_a(block->state != BUF_BLOCK_FILE_PAGE); + + mutex_enter(&(buf_pool->mutex)); + + buf_LRU_block_free_non_file_page(block); + + mutex_exit(&(buf_pool->mutex)); +} + +/************************************************************************* +Allocates a buffer frame. 
*/ + +buf_frame_t* +buf_frame_alloc(void) +/*=================*/ + /* out: buffer frame */ +{ + return(buf_block_alloc()->frame); +} + +/************************************************************************* +Frees a buffer frame which does not contain a file page. */ + +void +buf_frame_free( +/*===========*/ + buf_frame_t* frame) /* in: buffer frame */ +{ + buf_block_free(buf_block_align(frame)); +} + +/************************************************************************ +Returns the buffer control block if the page can be found in the buffer +pool. NOTE that it is possible that the page is not yet read +from disk, though. This is a very low-level function: use with care! */ + +buf_block_t* +buf_page_peek_block( +/*================*/ + /* out: control block if found from page hash table, + otherwise NULL; NOTE that the page is not necessarily + yet read from disk! */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + + mutex_enter_fast(&(buf_pool->mutex)); + + block = buf_page_hash_get(space, offset); + + mutex_exit(&(buf_pool->mutex)); + + return(block); +} + +/************************************************************************ +Resets the check_index_page_at_flush field of a page if found in the buffer +pool. */ + +void +buf_reset_check_index_page_at_flush( +/*================================*/ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + + mutex_enter_fast(&(buf_pool->mutex)); + + block = buf_page_hash_get(space, offset); + + if (block) { + block->check_index_page_at_flush = FALSE; + } + + mutex_exit(&(buf_pool->mutex)); +} + +/************************************************************************ +Returns the current state of is_hashed of a page. FALSE if the page is +not in the pool. NOTE that this operation does not fix the page in the +pool if it is found there. 
*/ + +ibool +buf_page_peek_if_search_hashed( +/*===========================*/ + /* out: TRUE if page hash index is built in search + system */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + ibool is_hashed; + + mutex_enter_fast(&(buf_pool->mutex)); + + block = buf_page_hash_get(space, offset); + + if (!block) { + is_hashed = FALSE; + } else { + is_hashed = block->is_hashed; + } + + mutex_exit(&(buf_pool->mutex)); + + return(is_hashed); +} + +/************************************************************************ +Returns TRUE if the page can be found in the buffer pool hash table. NOTE +that it is possible that the page is not yet read from disk, though. */ + +ibool +buf_page_peek( +/*==========*/ + /* out: TRUE if found from page hash table, + NOTE that the page is not necessarily yet read + from disk! */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + if (buf_page_peek_block(space, offset)) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************ +Sets file_page_was_freed TRUE if the page is found in the buffer pool. +This function should be called when we free a file page and want the +debug version to check that it is not accessed any more unless +reallocated. */ + +buf_block_t* +buf_page_set_file_page_was_freed( +/*=============================*/ + /* out: control block if found from page hash table, + otherwise NULL */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + + mutex_enter_fast(&(buf_pool->mutex)); + + block = buf_page_hash_get(space, offset); + + if (block) { + block->file_page_was_freed = TRUE; + } + + mutex_exit(&(buf_pool->mutex)); + + return(block); +} + +/************************************************************************ +Sets file_page_was_freed FALSE if the page is found in the buffer pool. 
This function should be called when we free a file page and want the
debug version to check that it is not accessed any more unless
reallocated. */

buf_block_t*
buf_page_reset_file_page_was_freed(
/*===============================*/
			/* out: control block if found from page hash table,
			otherwise NULL */
	ulint	space,	/* in: space id */
	ulint	offset)	/* in: page number */
{
	buf_block_t*	block;

	mutex_enter_fast(&(buf_pool->mutex));

	block = buf_page_hash_get(space, offset);

	if (block) {
		block->file_page_was_freed = FALSE;
	}

	mutex_exit(&(buf_pool->mutex));

	return(block);
}

/************************************************************************
This is the general function used to get access to a database page. */

buf_frame_t*
buf_page_get_gen(
/*=============*/
			/* out: pointer to the frame or NULL */
	ulint		space,	/* in: space id */
	ulint		offset,	/* in: page number */
	ulint		rw_latch,/* in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
	buf_frame_t*	guess,	/* in: guessed frame or NULL */
	ulint		mode,	/* in: BUF_GET, BUF_GET_IF_IN_POOL,
				BUF_GET_NO_LATCH, BUF_GET_NOWAIT */
	const char*	file,	/* in: file name */
	ulint		line,	/* in: line where called */
	mtr_t*		mtr)	/* in: mini-transaction */
{
	buf_block_t*	block;
	ibool		accessed;
	ulint		fix_type;
	ibool		success;
	ibool		must_read;

	ut_ad(mtr);
	ut_ad((rw_latch == RW_S_LATCH)
	      || (rw_latch == RW_X_LATCH)
	      || (rw_latch == RW_NO_LATCH));
	ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH));
	ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL)
	      || (mode == BUF_GET_NO_LATCH) || (mode == BUF_GET_NOWAIT));
#ifndef UNIV_LOG_DEBUG
	ut_ad(!ibuf_inside() || ibuf_page(space, offset));
#endif
	buf_pool->n_page_gets++;
loop:
	mutex_enter_fast(&(buf_pool->mutex));

	block = NULL;

	if (guess) {
		/* Validate the caller's guess: it must still hold the same
		page and still be a file page before we trust it */
		block = buf_block_align(guess);

		if ((offset != block->offset) || (space != block->space)
		    || (block->state != BUF_BLOCK_FILE_PAGE)) {

			block = NULL;
		}
	}

	if (block == NULL) {
		block = buf_page_hash_get(space, offset);
	}

	if (block == NULL) {
		/* Page not in buf_pool: needs to be read from file */

		mutex_exit(&(buf_pool->mutex));

		if (mode == BUF_GET_IF_IN_POOL) {

			return(NULL);
		}

		buf_read_page(space, offset);

#ifdef UNIV_DEBUG
		buf_dbg_counter++;

		if (buf_dbg_counter % 37 == 0) {
			ut_ad(buf_validate());
		}
#endif
		/* Retry the whole lookup: the read may still be in
		progress, or the page may already have been evicted */
		goto loop;
	}

	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	must_read = FALSE;

	if (block->io_fix == BUF_IO_READ) {

		must_read = TRUE;

		if (mode == BUF_GET_IF_IN_POOL) {

			/* The page is only being read to buffer */
			mutex_exit(&(buf_pool->mutex));

			return(NULL);
		}
	}

	/* If AWE is enabled and the page is not mapped to a frame, then
	map it */

	if (block->frame == NULL) {
		ut_a(srv_use_awe);

		/* We set second parameter TRUE because the block is in the
		LRU list and we must put it to awe_LRU_free_mapped list once
		mapped to a frame */

		buf_awe_map_page_to_frame(block, TRUE);
	}

#ifdef UNIV_SYNC_DEBUG
	buf_block_buf_fix_inc_debug(block, file, line);
#else
	buf_block_buf_fix_inc(block);
#endif
	buf_block_make_young(block);

	/* Check if this is the first access to the page */

	accessed = block->accessed;

	block->accessed = TRUE;

#ifdef UNIV_DEBUG_FILE_ACCESSES
	ut_a(block->file_page_was_freed == FALSE);
#endif
	mutex_exit(&(buf_pool->mutex));

#ifdef UNIV_DEBUG
	buf_dbg_counter++;

	if (buf_dbg_counter % 5771 == 0) {
		ut_ad(buf_validate());
	}
#endif
	ut_ad(block->buf_fix_count > 0);
	ut_ad(block->state == BUF_BLOCK_FILE_PAGE);

	if (mode == BUF_GET_NOWAIT) {
		if (rw_latch == RW_S_LATCH) {
			success = rw_lock_s_lock_func_nowait(&(block->lock),
								file, line);
			fix_type = MTR_MEMO_PAGE_S_FIX;
		} else {
			ut_ad(rw_latch == RW_X_LATCH);
			success = rw_lock_x_lock_func_nowait(&(block->lock),
								file, line);
			fix_type = MTR_MEMO_PAGE_X_FIX;
		}

		if (!success) {
			/* Could not latch without waiting: undo the
			buffer-fix and give up */
			mutex_enter(&(buf_pool->mutex));

			block->buf_fix_count--;
#ifdef UNIV_SYNC_DEBUG
			rw_lock_s_unlock(&(block->debug_latch));
#endif
			mutex_exit(&(buf_pool->mutex));

			return(NULL);
		}
	} else if (rw_latch == RW_NO_LATCH) {

		if (must_read) {
			/* Let us wait until the read operation
			completes */

			for (;;) {
				mutex_enter(&(buf_pool->mutex));

				if (block->io_fix == BUF_IO_READ) {

					mutex_exit(&(buf_pool->mutex));

					/* Sleep 20 milliseconds */

					os_thread_sleep(20000);
				} else {

					mutex_exit(&(buf_pool->mutex));

					break;
				}
			}
		}

		fix_type = MTR_MEMO_BUF_FIX;
	} else if (rw_latch == RW_S_LATCH) {

		rw_lock_s_lock_func(&(block->lock), 0, file, line);

		fix_type = MTR_MEMO_PAGE_S_FIX;
	} else {
		rw_lock_x_lock_func(&(block->lock), 0, file, line);

		fix_type = MTR_MEMO_PAGE_X_FIX;
	}

	mtr_memo_push(mtr, block, fix_type);

	if (!accessed) {
		/* In the case of a first access, try to apply linear
		read-ahead */

		buf_read_ahead_linear(space, offset);
	}

#ifdef UNIV_IBUF_DEBUG
	ut_a(ibuf_count_get(block->space, block->offset) == 0);
#endif
	return(block->frame);
}

/************************************************************************
This is the general function used to get optimistic access to a database
page.
*/

ibool
buf_page_optimistic_get_func(
/*=========================*/
			/* out: TRUE if success */
	ulint		rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */
	buf_block_t*	block,	/* in: guessed buffer block */
	buf_frame_t*	guess,	/* in: guessed frame; note that AWE may move
				frames */
	dulint		modify_clock,/* in: modify clock value if mode is
				..._GUESS_ON_CLOCK */
	const char*	file,	/* in: file name */
	ulint		line,	/* in: line where called */
	mtr_t*		mtr)	/* in: mini-transaction */
{
	ibool		accessed;
	ibool		success;
	ulint		fix_type;

	ut_ad(mtr && block);
	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));

	mutex_enter(&(buf_pool->mutex));

	/* If AWE is used, block may have a different frame now, e.g., NULL */

	if (block->state != BUF_BLOCK_FILE_PAGE || block->frame != guess) {

		mutex_exit(&(buf_pool->mutex));

		return(FALSE);
	}

#ifdef UNIV_SYNC_DEBUG
	buf_block_buf_fix_inc_debug(block, file, line);
#else
	buf_block_buf_fix_inc(block);
#endif
	buf_block_make_young(block);

	/* Check if this is the first access to the page */

	accessed = block->accessed;

	block->accessed = TRUE;

	mutex_exit(&(buf_pool->mutex));

	ut_ad(!ibuf_inside() || ibuf_page(block->space, block->offset));

	if (rw_latch == RW_S_LATCH) {
		success = rw_lock_s_lock_func_nowait(&(block->lock),
								file, line);
		fix_type = MTR_MEMO_PAGE_S_FIX;
	} else {
		success = rw_lock_x_lock_func_nowait(&(block->lock),
								file, line);
		fix_type = MTR_MEMO_PAGE_X_FIX;
	}

	if (!success) {
		/* Latch attempt failed: release the buffer-fix taken
		above and report failure */
		mutex_enter(&(buf_pool->mutex));

		block->buf_fix_count--;
#ifdef UNIV_SYNC_DEBUG
		rw_lock_s_unlock(&(block->debug_latch));
#endif
		mutex_exit(&(buf_pool->mutex));

		return(FALSE);
	}

	if (!UT_DULINT_EQ(modify_clock, block->modify_clock)) {
		/* The page has been modified since the guess was made:
		the optimistic access fails; undo latch and fix */
#ifdef UNIV_SYNC_DEBUG
		buf_page_dbg_add_level(block->frame, SYNC_NO_ORDER_CHECK);
#endif /* UNIV_SYNC_DEBUG */
		if (rw_latch == RW_S_LATCH) {
			rw_lock_s_unlock(&(block->lock));
		} else {
			rw_lock_x_unlock(&(block->lock));
		}

		mutex_enter(&(buf_pool->mutex));

		block->buf_fix_count--;
#ifdef UNIV_SYNC_DEBUG
		rw_lock_s_unlock(&(block->debug_latch));
#endif
		mutex_exit(&(buf_pool->mutex));

		return(FALSE);
	}

	mtr_memo_push(mtr, block, fix_type);

#ifdef UNIV_DEBUG
	buf_dbg_counter++;

	if (buf_dbg_counter % 5771 == 0) {
		ut_ad(buf_validate());
	}
#endif
	ut_ad(block->buf_fix_count > 0);
	ut_ad(block->state == BUF_BLOCK_FILE_PAGE);

#ifdef UNIV_DEBUG_FILE_ACCESSES
	ut_a(block->file_page_was_freed == FALSE);
#endif
	if (!accessed) {
		/* In the case of a first access, try to apply linear
		read-ahead */

		buf_read_ahead_linear(buf_frame_get_space_id(guess),
					buf_frame_get_page_no(guess));
	}

#ifdef UNIV_IBUF_DEBUG
	ut_a(ibuf_count_get(block->space, block->offset) == 0);
#endif
	buf_pool->n_page_gets++;

	return(TRUE);
}

/************************************************************************
This is used to get access to a known database page, when no waiting can be
done. For example, if a search in an adaptive hash index leads us to this
frame.
*/

ibool
buf_page_get_known_nowait(
/*======================*/
			/* out: TRUE if success */
	ulint		rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */
	buf_frame_t*	guess,	/* in: the known page frame */
	ulint		mode,	/* in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
	const char*	file,	/* in: file name */
	ulint		line,	/* in: line where called */
	mtr_t*		mtr)	/* in: mini-transaction */
{
	buf_block_t*	block;
	ibool		success;
	ulint		fix_type;

	ut_ad(mtr);
	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));

	mutex_enter(&(buf_pool->mutex));

	block = buf_block_align(guess);

	if (block->state == BUF_BLOCK_REMOVE_HASH) {
		/* Another thread is just freeing the block from the LRU list
		of the buffer pool: do not try to access this page; this
		attempt to access the page can only come through the hash
		index because when the buffer block state is ..._REMOVE_HASH,
		we have already removed it from the page address hash table
		of the buffer pool. */

		mutex_exit(&(buf_pool->mutex));

		return(FALSE);
	}

	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

#ifdef UNIV_SYNC_DEBUG
	buf_block_buf_fix_inc_debug(block, file, line);
#else
	buf_block_buf_fix_inc(block);
#endif
	if (mode == BUF_MAKE_YOUNG) {
		buf_block_make_young(block);
	}

	mutex_exit(&(buf_pool->mutex));

	ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD));

	if (rw_latch == RW_S_LATCH) {
		success = rw_lock_s_lock_func_nowait(&(block->lock),
								file, line);
		fix_type = MTR_MEMO_PAGE_S_FIX;
	} else {
		success = rw_lock_x_lock_func_nowait(&(block->lock),
								file, line);
		fix_type = MTR_MEMO_PAGE_X_FIX;
	}

	if (!success) {
		/* Latch attempt failed: undo the buffer-fix and give up */
		mutex_enter(&(buf_pool->mutex));

		block->buf_fix_count--;
#ifdef UNIV_SYNC_DEBUG
		rw_lock_s_unlock(&(block->debug_latch));
#endif
		mutex_exit(&(buf_pool->mutex));

		return(FALSE);
	}

	mtr_memo_push(mtr, block, fix_type);

#ifdef UNIV_DEBUG
	buf_dbg_counter++;

	if (buf_dbg_counter % 5771 == 0) {
		ut_ad(buf_validate());
	}
#endif
	ut_ad(block->buf_fix_count > 0);
	ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
#ifdef UNIV_DEBUG_FILE_ACCESSES
	ut_a(block->file_page_was_freed == FALSE);
#endif

#ifdef UNIV_IBUF_DEBUG
	ut_a((mode == BUF_KEEP_OLD)
	     || (ibuf_count_get(block->space, block->offset) == 0));
#endif
	buf_pool->n_page_gets++;

	return(TRUE);
}

/************************************************************************
Inits a page to the buffer buf_pool, for use in ibbackup --restore. */

void
buf_page_init_for_backup_restore(
/*=============================*/
	ulint		space,	/* in: space id */
	ulint		offset,	/* in: offset of the page within space
				in units of a page */
	buf_block_t*	block)	/* in: block to init */
{
	/* NOTE: unlike buf_page_init, this variant does not insert the
	block into the page hash table or take the buffer pool mutex */

	/* Set the state of the block */
	block->magic_n		= BUF_BLOCK_MAGIC_N;

	block->state		= BUF_BLOCK_FILE_PAGE;
	block->space		= space;
	block->offset		= offset;

	block->lock_hash_val	= 0;
	block->lock_mutex	= NULL;

	block->freed_page_clock = 0;

	block->newest_modification = ut_dulint_zero;
	block->oldest_modification = ut_dulint_zero;

	block->accessed		= FALSE;
	block->buf_fix_count	= 0;
	block->io_fix		= 0;

	block->n_hash_helps	= 0;
	block->is_hashed	= FALSE;
	block->n_fields		= 1;
	block->n_bytes		= 0;
	block->side		= BTR_SEARCH_LEFT_SIDE;

	block->file_page_was_freed = FALSE;
}

/************************************************************************
Inits a page to the buffer buf_pool.
*/
static
void
buf_page_init(
/*==========*/
	ulint		space,	/* in: space id */
	ulint		offset,	/* in: offset of the page within space
				in units of a page */
	buf_block_t*	block)	/* in: block to init */
{
#ifdef UNIV_SYNC_DEBUG
	ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
	ut_a(block->state != BUF_BLOCK_FILE_PAGE);

	/* Set the state of the block */
	block->magic_n		= BUF_BLOCK_MAGIC_N;

	block->state		= BUF_BLOCK_FILE_PAGE;
	block->space		= space;
	block->offset		= offset;

	block->check_index_page_at_flush = FALSE;
	block->index		= NULL;

	block->lock_hash_val	= lock_rec_hash(space, offset);
	block->lock_mutex	= NULL;

	/* Insert into the hash table of file pages */

	if (buf_page_hash_get(space, offset)) {
		/* Duplicate page address: the pool invariant that a page
		lives in the hash table at most once is broken; abort */
		fprintf(stderr,
"InnoDB: Error: page %lu %lu already found from the hash table\n",
			(ulong) space,
			(ulong) offset);
#ifdef UNIV_DEBUG
		buf_print();
		buf_LRU_print();
		buf_validate();
		buf_LRU_validate();
#endif /* UNIV_DEBUG */
		ut_a(0);
	}

	HASH_INSERT(buf_block_t, hash, buf_pool->page_hash,
			buf_page_address_fold(space, offset), block);

	block->freed_page_clock = 0;

	block->newest_modification = ut_dulint_zero;
	block->oldest_modification = ut_dulint_zero;

	block->accessed		= FALSE;
	block->buf_fix_count	= 0;
	block->io_fix		= 0;

	block->n_hash_helps	= 0;
	block->is_hashed	= FALSE;
	block->n_fields		= 1;
	block->n_bytes		= 0;
	block->side		= BTR_SEARCH_LEFT_SIDE;

	block->file_page_was_freed = FALSE;
}

/************************************************************************
Function which inits a page for read to the buffer buf_pool. If the page is
(1) already in buf_pool, or
(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
(3) if the space is deleted or being deleted,
then this function does nothing.
Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
on the buffer frame.
The io-handler must take care that the flag is cleared
and the lock released later. This is one of the functions which perform the
state transition NOT_USED => FILE_PAGE to a block (the other is
buf_page_create). */

buf_block_t*
buf_page_init_for_read(
/*===================*/
			/* out: pointer to the block or NULL */
	ulint*		err,	/* out: DB_SUCCESS or DB_TABLESPACE_DELETED */
	ulint		mode,	/* in: BUF_READ_IBUF_PAGES_ONLY, ... */
	ulint		space,	/* in: space id */
	ib_longlong	tablespace_version,/* in: prevents reading from a wrong
				version of the tablespace in case we have done
				DISCARD + IMPORT */
	ulint		offset)	/* in: page number */
{
	buf_block_t*	block;
	mtr_t		mtr;

	ut_ad(buf_pool);

	*err = DB_SUCCESS;

	if (mode == BUF_READ_IBUF_PAGES_ONLY) {
		/* It is a read-ahead within an ibuf routine */

		ut_ad(!ibuf_bitmap_page(offset));
		ut_ad(ibuf_inside());

		mtr_start(&mtr);

		if (!ibuf_page_low(space, offset, &mtr)) {

			mtr_commit(&mtr);

			return(NULL);
		}
	} else {
		ut_ad(mode == BUF_READ_ANY_PAGE);
	}

	block = buf_block_alloc();

	ut_a(block);

	mutex_enter(&(buf_pool->mutex));

	if (fil_tablespace_deleted_or_being_deleted_in_mem(space,
						tablespace_version)) {
		*err = DB_TABLESPACE_DELETED;
	}

	if (*err == DB_TABLESPACE_DELETED
	    || NULL != buf_page_hash_get(space, offset)) {

		/* The page belongs to a space which has been deleted or is
		being deleted, or the page is already in buf_pool, return */

		mutex_exit(&(buf_pool->mutex));
		buf_block_free(block);

		if (mode == BUF_READ_IBUF_PAGES_ONLY) {

			mtr_commit(&mtr);
		}

		return(NULL);
	}

	ut_ad(block);

	buf_page_init(space, offset, block);

	/* The block must be put to the LRU list, to the old blocks */

	buf_LRU_add_block(block, TRUE);	/* TRUE == to old blocks */

	block->io_fix = BUF_IO_READ;
	buf_pool->n_pend_reads++;

	/* We set a pass-type x-lock on the frame because then the same
	thread which called for the read operation (and is running now at
	this point of code) can wait for the read to complete by waiting
	for the x-lock on the frame; if the x-lock were recursive, the
	same thread would illegally get the x-lock before the page read
	is completed. The x-lock is cleared by the io-handler thread. */

	rw_lock_x_lock_gen(&(block->lock), BUF_IO_READ);

	mutex_exit(&(buf_pool->mutex));

	if (mode == BUF_READ_IBUF_PAGES_ONLY) {

		mtr_commit(&mtr);
	}

	return(block);
}

/************************************************************************
Initializes a page to the buffer buf_pool. The page is usually not read
from a file even if it cannot be found in the buffer buf_pool. This is one
of the functions which perform to a block a state transition NOT_USED =>
FILE_PAGE (the other is buf_page_init_for_read above). */

buf_frame_t*
buf_page_create(
/*============*/
			/* out: pointer to the frame, page bufferfixed */
	ulint	space,	/* in: space id */
	ulint	offset,	/* in: offset of the page within space in units of
			a page */
	mtr_t*	mtr)	/* in: mini-transaction handle */
{
	buf_frame_t*	frame;
	buf_block_t*	block;
	buf_block_t*	free_block	= NULL;

	ut_ad(mtr);

	/* Reserve a free block before taking the pool mutex */
	free_block = buf_LRU_get_free_block();

	mutex_enter(&(buf_pool->mutex));

	block = buf_page_hash_get(space, offset);

	if (block != NULL) {
#ifdef UNIV_IBUF_DEBUG
		ut_a(ibuf_count_get(block->space, block->offset) == 0);
#endif
		block->file_page_was_freed = FALSE;

		/* Page can be found in buf_pool */
		mutex_exit(&(buf_pool->mutex));

		buf_block_free(free_block);

		frame = buf_page_get_with_no_latch(space, offset, mtr);

		return(frame);
	}

	/* If we get here, the page was not in buf_pool: init it there */

	if (buf_debug_prints) {
		fprintf(stderr, "Creating space %lu page %lu to buffer\n",
			(ulong) space, (ulong) offset);
	}

	block = free_block;

	buf_page_init(space, offset, block);

	/* The block must be put to the LRU list */
	buf_LRU_add_block(block, FALSE);

#ifdef UNIV_SYNC_DEBUG
	buf_block_buf_fix_inc_debug(block, __FILE__, __LINE__);
#else
	buf_block_buf_fix_inc(block);
#endif
	mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);

	block->accessed = TRUE;

	buf_pool->n_pages_created++;

	mutex_exit(&(buf_pool->mutex));

	/* Delete possible entries for the page from the insert buffer:
	such can exist if the page belonged to an index which was dropped */

	ibuf_merge_or_delete_for_page(NULL, space, offset, TRUE);

	/* Flush pages from the end of the LRU list if necessary */
	buf_flush_free_margin();

	frame = block->frame;
#ifdef UNIV_DEBUG
	buf_dbg_counter++;

	if (buf_dbg_counter % 357 == 0) {
		ut_ad(buf_validate());
	}
#endif
#ifdef UNIV_IBUF_DEBUG
	ut_a(ibuf_count_get(block->space, block->offset) == 0);
#endif
	return(frame);
}

/************************************************************************
Completes an asynchronous read or write request of a file page to or from
the buffer pool. */

void
buf_page_io_complete(
/*=================*/
	buf_block_t*	block)	/* in: pointer to the block in question */
{
	ulint	io_type;
	ulint	read_page_no;

	ut_ad(block);

	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	io_type = block->io_fix;

	if (io_type == BUF_IO_READ) {
		/* If this page is not uninitialized and not in the
		doublewrite buffer, then the page number should be the
		same as in block */

		read_page_no = mach_read_from_4((block->frame)
						+ FIL_PAGE_OFFSET);
		if (read_page_no != 0
		    && !trx_doublewrite_page_inside(read_page_no)
		    && read_page_no != block->offset) {

			fprintf(stderr,
"InnoDB: Error: page n:o stored in the page read in is %lu, should be %lu!\n",
				(ulong) read_page_no, (ulong) block->offset);
		}
		/* From version 3.23.38 up we store the page checksum
		to the 4 first bytes of the page end lsn field */

		if (buf_page_is_corrupted(block->frame)) {
			fprintf(stderr,
		"InnoDB: Database page corruption on disk or a failed\n"
		"InnoDB: file read of page %lu.\n", (ulong) block->offset);

			fputs(
		"InnoDB: You may have to recover from a backup.\n", stderr);

			buf_page_print(block->frame);

			/* The message is printed again after the page dump
			so that it is visible on both sides of the dump */
			fprintf(stderr,
		"InnoDB: Database page corruption on disk or a failed\n"
		"InnoDB: file read of page %lu.\n", (ulong) block->offset);
			fputs(
		"InnoDB: You may have to recover from a backup.\n", stderr);
			fputs(
		"InnoDB: It is also possible that your operating\n"
		"InnoDB: system has corrupted its own file cache\n"
		"InnoDB: and rebooting your computer removes the\n"
		"InnoDB: error.\n"
		"InnoDB: If the corrupt page is an index page\n"
		"InnoDB: you can also try to fix the corruption\n"
		"InnoDB: by dumping, dropping, and reimporting\n"
		"InnoDB: the corrupt table. You can use CHECK\n"
		"InnoDB: TABLE to scan your table for corruption.\n"
		"InnoDB: See also "
		"http://dev.mysql.com/doc/mysql/en/Forcing_recovery.html\n"
		"InnoDB: about forcing recovery.\n", stderr);

			if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
				fputs(
	"InnoDB: Ending processing because of a corrupt database page.\n",
					stderr);
				exit(1);
			}
		}

		if (recv_recovery_is_on()) {
			recv_recover_page(FALSE, TRUE, block->frame,
						block->space, block->offset);
		}

		if (!recv_no_ibuf_operations) {
			ibuf_merge_or_delete_for_page(block->frame,
					block->space, block->offset, TRUE);
		}
	}

#ifdef UNIV_IBUF_DEBUG
	ut_a(ibuf_count_get(block->space, block->offset) == 0);
#endif
	mutex_enter(&(buf_pool->mutex));

	/* Because this thread which does the unlocking is not the same that
	did the locking, we use a pass value != 0 in unlock, which simply
	removes the newest lock debug record, without checking the thread
	id. */

	block->io_fix = 0;

	if (io_type == BUF_IO_READ) {
		/* NOTE that the call to ibuf may have moved the ownership of
		the x-latch to this OS thread: do not let this confuse you in
		debugging! */

		ut_ad(buf_pool->n_pend_reads > 0);
		buf_pool->n_pend_reads--;
		buf_pool->n_pages_read++;

		rw_lock_x_unlock_gen(&(block->lock), BUF_IO_READ);

		if (buf_debug_prints) {
			fputs("Has read ", stderr);
		}
	} else {
		ut_ad(io_type == BUF_IO_WRITE);

		/* Write means a flush operation: call the completion
		routine in the flush system */

		buf_flush_write_complete(block);

		rw_lock_s_unlock_gen(&(block->lock), BUF_IO_WRITE);

		buf_pool->n_pages_written++;

		if (buf_debug_prints) {
			fputs("Has written ", stderr);
		}
	}

	mutex_exit(&(buf_pool->mutex));

	if (buf_debug_prints) {
		fprintf(stderr, "page space %lu page no %lu\n",
			(ulong) block->space, (ulong) block->offset);
	}
}

/*************************************************************************
Invalidates the file pages in the buffer pool when an archive recovery is
completed. All the file pages buffered must be in a replaceable state when
this function is called: not latched and not modified. */

void
buf_pool_invalidate(void)
/*=====================*/
{
	ibool	freed;

	ut_ad(buf_all_freed());

	freed = TRUE;

	while (freed) {
		freed = buf_LRU_search_and_free_block(100);
	}

	mutex_enter(&(buf_pool->mutex));

	ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);

	mutex_exit(&(buf_pool->mutex));
}

/*************************************************************************
Validates the buffer buf_pool data structure.
*/ + +ibool +buf_validate(void) +/*==============*/ +{ + buf_block_t* block; + ulint i; + ulint n_single_flush = 0; + ulint n_lru_flush = 0; + ulint n_list_flush = 0; + ulint n_lru = 0; + ulint n_flush = 0; + ulint n_free = 0; + ulint n_page = 0; + + ut_ad(buf_pool); + + mutex_enter(&(buf_pool->mutex)); + + for (i = 0; i < buf_pool->curr_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + + if (block->state == BUF_BLOCK_FILE_PAGE) { + + ut_a(buf_page_hash_get(block->space, + block->offset) == block); + n_page++; + +#ifdef UNIV_IBUF_DEBUG + ut_a((block->io_fix == BUF_IO_READ) + || ibuf_count_get(block->space, block->offset) + == 0); +#endif + if (block->io_fix == BUF_IO_WRITE) { + + if (block->flush_type == BUF_FLUSH_LRU) { + n_lru_flush++; + ut_a(rw_lock_is_locked(&(block->lock), + RW_LOCK_SHARED)); + } else if (block->flush_type == + BUF_FLUSH_LIST) { + n_list_flush++; + } else if (block->flush_type == + BUF_FLUSH_SINGLE_PAGE) { + n_single_flush++; + } else { + ut_error; + } + + } else if (block->io_fix == BUF_IO_READ) { + + ut_a(rw_lock_is_locked(&(block->lock), + RW_LOCK_EX)); + } + + n_lru++; + + if (ut_dulint_cmp(block->oldest_modification, + ut_dulint_zero) > 0) { + n_flush++; + } + + } else if (block->state == BUF_BLOCK_NOT_USED) { + n_free++; + } + } + + if (n_lru + n_free > buf_pool->curr_size) { + fprintf(stderr, "n LRU %lu, n free %lu\n", (ulong) n_lru, (ulong) n_free); + ut_error; + } + + ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru); + if (UT_LIST_GET_LEN(buf_pool->free) != n_free) { + fprintf(stderr, "Free list len %lu, free blocks %lu\n", + (ulong) UT_LIST_GET_LEN(buf_pool->free), (ulong) n_free); + ut_error; + } + ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush); + + ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush); + ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush); + ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush); + + mutex_exit(&(buf_pool->mutex)); + + ut_a(buf_LRU_validate()); + 
ut_a(buf_flush_validate()); + + return(TRUE); +} + +/************************************************************************* +Prints info of the buffer buf_pool data structure. */ + +void +buf_print(void) +/*===========*/ +{ + dulint* index_ids; + ulint* counts; + ulint size; + ulint i; + ulint j; + dulint id; + ulint n_found; + buf_frame_t* frame; + dict_index_t* index; + + ut_ad(buf_pool); + + size = buf_pool->curr_size; + + index_ids = mem_alloc(sizeof(dulint) * size); + counts = mem_alloc(sizeof(ulint) * size); + + mutex_enter(&(buf_pool->mutex)); + + fprintf(stderr, + "buf_pool size %lu\n" + "database pages %lu\n" + "free pages %lu\n" + "modified database pages %lu\n" + "n pending reads %lu\n" + "n pending flush LRU %lu list %lu single page %lu\n" + "pages read %lu, created %lu, written %lu\n", + (ulong) size, + (ulong) UT_LIST_GET_LEN(buf_pool->LRU), + (ulong) UT_LIST_GET_LEN(buf_pool->free), + (ulong) UT_LIST_GET_LEN(buf_pool->flush_list), + (ulong) buf_pool->n_pend_reads, + (ulong) buf_pool->n_flush[BUF_FLUSH_LRU], + (ulong) buf_pool->n_flush[BUF_FLUSH_LIST], + (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE], + (ulong) buf_pool->n_pages_read, buf_pool->n_pages_created, + (ulong) buf_pool->n_pages_written); + + /* Count the number of blocks belonging to each index in the buffer */ + + n_found = 0; + + for (i = 0 ; i < size; i++) { + counts[i] = 0; + } + + for (i = 0; i < size; i++) { + frame = buf_pool_get_nth_block(buf_pool, i)->frame; + + if (fil_page_get_type(frame) == FIL_PAGE_INDEX) { + + id = btr_page_get_index_id(frame); + + /* Look for the id in the index_ids array */ + j = 0; + + while (j < n_found) { + + if (ut_dulint_cmp(index_ids[j], id) == 0) { + (counts[j])++; + + break; + } + j++; + } + + if (j == n_found) { + n_found++; + index_ids[j] = id; + counts[j] = 1; + } + } + } + + mutex_exit(&(buf_pool->mutex)); + + for (i = 0; i < n_found; i++) { + index = dict_index_get_if_in_cache(index_ids[i]); + + fprintf(stderr, + "Block count for index %lu 
in buffer is about %lu", + (ulong) ut_dulint_get_low(index_ids[i]), + (ulong) counts[i]); + + if (index) { + putc(' ', stderr); + dict_index_name_print(stderr, NULL, index); + } + + putc('\n', stderr); + } + + mem_free(index_ids); + mem_free(counts); + + ut_a(buf_validate()); +} + +/************************************************************************* +Returns the number of latched pages in the buffer pool. */ + +ulint +buf_get_latched_pages_number(void) +{ + buf_block_t* block; + ulint i; + ulint fixed_pages_number = 0; + + mutex_enter(&(buf_pool->mutex)); + + for (i = 0; i < buf_pool->curr_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + + if (((block->buf_fix_count != 0) || (block->io_fix != 0)) && + block->magic_n == BUF_BLOCK_MAGIC_N ) + fixed_pages_number++; + } + + mutex_exit(&(buf_pool->mutex)); + return fixed_pages_number; +} + +/************************************************************************* +Returns the number of pending buf pool ios. */ + +ulint +buf_get_n_pending_ios(void) +/*=======================*/ +{ + return(buf_pool->n_pend_reads + + buf_pool->n_flush[BUF_FLUSH_LRU] + + buf_pool->n_flush[BUF_FLUSH_LIST] + + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]); +} + +/************************************************************************* +Returns the ratio in percents of modified pages in the buffer pool / +database pages in the buffer pool. */ + +ulint +buf_get_modified_ratio_pct(void) +/*============================*/ +{ + ulint ratio; + + mutex_enter(&(buf_pool->mutex)); + + ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list)) + / (1 + UT_LIST_GET_LEN(buf_pool->LRU) + + UT_LIST_GET_LEN(buf_pool->free)); + + /* 1 + is there to avoid division by zero */ + + mutex_exit(&(buf_pool->mutex)); + + return(ratio); +} + +/************************************************************************* +Prints info of the buffer i/o. 
*/

void
buf_print_io(
/*=========*/
	FILE*	file)	/* in/out: buffer where to print */
{
	time_t	current_time;
	double	time_elapsed;
	ulint	size;

	ut_ad(buf_pool);
	size = buf_pool->curr_size;

	mutex_enter(&(buf_pool->mutex));

	if (srv_use_awe) {
		fprintf(stderr,
	"AWE: Buffer pool memory frames %lu\n",
			(ulong) buf_pool->n_frames);

		fprintf(stderr,
	"AWE: Database pages and free buffers mapped in frames %lu\n",
			(ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
	}
	fprintf(file,
		"Buffer pool size   %lu\n"
		"Free buffers       %lu\n"
		"Database pages     %lu\n"
		"Modified db pages  %lu\n"
		"Pending reads %lu\n"
		"Pending writes: LRU %lu, flush list %lu, single page %lu\n",
		(ulong) size,
		(ulong) UT_LIST_GET_LEN(buf_pool->free),
		(ulong) UT_LIST_GET_LEN(buf_pool->LRU),
		(ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
		(ulong) buf_pool->n_pend_reads,
		(ulong) buf_pool->n_flush[BUF_FLUSH_LRU]
				+ buf_pool->init_flush[BUF_FLUSH_LRU],
		(ulong) buf_pool->n_flush[BUF_FLUSH_LIST]
				+ buf_pool->init_flush[BUF_FLUSH_LIST],
		(ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);

	/* 0.001 avoids division by zero when no time has elapsed since
	the last printout */
	current_time = time(NULL);
	time_elapsed = 0.001 + difftime(current_time,
					buf_pool->last_printout_time);
	buf_pool->last_printout_time = current_time;

	fprintf(file,
		"Pages read %lu, created %lu, written %lu\n"
		"%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
		(ulong) buf_pool->n_pages_read,
		(ulong) buf_pool->n_pages_created,
		(ulong) buf_pool->n_pages_written,
		(buf_pool->n_pages_read - buf_pool->n_pages_read_old)
		/ time_elapsed,
		(buf_pool->n_pages_created - buf_pool->n_pages_created_old)
		/ time_elapsed,
		(buf_pool->n_pages_written - buf_pool->n_pages_written_old)
		/ time_elapsed);

	if (srv_use_awe) {
		fprintf(file, "AWE: %.2f page remaps/s\n",
			(buf_pool->n_pages_awe_remapped
			- buf_pool->n_pages_awe_remapped_old)
			/ time_elapsed);
	}

	if (buf_pool->n_page_gets > buf_pool->n_page_gets_old) {
		fprintf(file, "Buffer pool hit rate %lu / 1000\n",
		(ulong) (1000
		- ((1000 *
		   (buf_pool->n_pages_read - buf_pool->n_pages_read_old))
		   / (buf_pool->n_page_gets - buf_pool->n_page_gets_old))));
	} else {
		fputs("No buffer pool page gets since the last printout\n",
			file);
	}

	/* Snapshot the counters for the next per-second average window */
	buf_pool->n_page_gets_old = buf_pool->n_page_gets;
	buf_pool->n_pages_read_old = buf_pool->n_pages_read;
	buf_pool->n_pages_created_old = buf_pool->n_pages_created;
	buf_pool->n_pages_written_old = buf_pool->n_pages_written;
	buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped;

	mutex_exit(&(buf_pool->mutex));
}

/**************************************************************************
Refreshes the statistics used to print per-second averages. */

void
buf_refresh_io_stats(void)
/*======================*/
{
	buf_pool->last_printout_time = time(NULL);
	buf_pool->n_page_gets_old = buf_pool->n_page_gets;
	buf_pool->n_pages_read_old = buf_pool->n_pages_read;
	buf_pool->n_pages_created_old = buf_pool->n_pages_created;
	buf_pool->n_pages_written_old = buf_pool->n_pages_written;
	buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped;
}

/*************************************************************************
Checks that all file pages in the buffer are in a replaceable state. */

ibool
buf_all_freed(void)
/*===============*/
{
	buf_block_t*	block;
	ulint		i;

	ut_ad(buf_pool);

	mutex_enter(&(buf_pool->mutex));

	for (i = 0; i < buf_pool->curr_size; i++) {

		block = buf_pool_get_nth_block(buf_pool, i);

		if (block->state == BUF_BLOCK_FILE_PAGE) {

			if (!buf_flush_ready_for_replace(block)) {

				fprintf(stderr,
					"Page %lu %lu still fixed or dirty\n",
				(ulong) block->space, (ulong) block->offset);
				ut_error;
			}
		}
	}

	mutex_exit(&(buf_pool->mutex));

	return(TRUE);
}

/*************************************************************************
Checks that there currently are no pending i/o-operations for the buffer
pool. */

ibool
buf_pool_check_no_pending_io(void)
/*==============================*/
			/* out: TRUE if there is no pending i/o */
{
	ibool	ret;

	mutex_enter(&(buf_pool->mutex));

	if (buf_pool->n_pend_reads + buf_pool->n_flush[BUF_FLUSH_LRU]
			+ buf_pool->n_flush[BUF_FLUSH_LIST]
			+ buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]) {
		ret = FALSE;
	} else {
		ret = TRUE;
	}

	mutex_exit(&(buf_pool->mutex));

	return(ret);
}

/*************************************************************************
Gets the current length of the free list of buffer blocks. */

ulint
buf_get_free_list_len(void)
/*=======================*/
{
	ulint	len;

	mutex_enter(&(buf_pool->mutex));

	len = UT_LIST_GET_LEN(buf_pool->free);

	mutex_exit(&(buf_pool->mutex));

	return(len);
}