author    unknown <monty@donna.mysql.com>  2001-02-17 14:19:19 +0200
committer unknown <monty@donna.mysql.com>  2001-02-17 14:19:19 +0200
commit    2662b59306ef0cd495fa6e2edf7129e58a11393a (patch)
tree      bfe39951a73e906579ab819bf5198ad8f3a64a36 /innobase/buf/buf0buf.c
parent    66de55a56bdcf2f7a9c0c4f8e19b3e761475e202 (diff)
download  mariadb-git-2662b59306ef0cd495fa6e2edf7129e58a11393a.tar.gz
Added Innobase to source distribution
Docs/manual.texi:
Added Innobase documentation
configure.in:
Incremented version
include/my_base.h:
Added option for Innobase
myisam/mi_check.c:
cleanup
mysql-test/t/bdb.test:
cleanup
mysql-test/t/innobase.test:
Extended with new tests from bdb.test
mysql-test/t/merge.test:
Added test of SHOW CREATE
mysys/my_init.c:
Fix for UNIXWARE 7
scripts/mysql_install_db.sh:
Always write how to start mysqld
scripts/safe_mysqld.sh:
Fixed typo
sql/ha_innobase.cc:
Update to new version
sql/ha_innobase.h:
Update to new version
sql/handler.h:
Added 'update_table_comment()' and 'append_create_info()'
sql/sql_delete.cc:
Fixes for Innobase
sql/sql_select.cc:
Fixes for Innobase
sql/sql_show.cc:
Append create information (for MERGE tables)
sql/sql_update.cc:
Fixes for Innobase
Diffstat (limited to 'innobase/buf/buf0buf.c')
-rw-r--r--  innobase/buf/buf0buf.c  1568
1 file changed, 1568 insertions, 0 deletions
diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c
new file mode 100644
index 00000000000..4ffda8772f3
--- /dev/null
+++ b/innobase/buf/buf0buf.c
@@ -0,0 +1,1568 @@
+/* Innobase relational database engine; Copyright (C) 2001 Innobase Oy
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License 2
+   as published by the Free Software Foundation in June 1991.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License 2
+   along with this program (in file COPYING); if not, write to the Free
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+/******************************************************
+The database buffer buf_pool
+
+(c) 1995 Innobase Oy
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0buf.h"
+
+#ifdef UNIV_NONINL
+#include "buf0buf.ic"
+#endif
+
+#include "mem0mem.h"
+#include "btr0btr.h"
+#include "fil0fil.h"
+#include "lock0lock.h"
+#include "btr0sea.h"
+#include "ibuf0ibuf.h"
+#include "dict0dict.h"
+#include "log0recv.h"
+
+/*
+		IMPLEMENTATION OF THE BUFFER POOL
+		=================================
+
+Performance improvement:
+------------------------
+Thread scheduling in NT may be so slow that the OS wait mechanism should
+not be used even in waiting for disk reads to complete.
+Rather, we should put waiting query threads to the queue of
+waiting jobs, and let the OS thread do something useful while the i/o
+is processed. In this way we could remove most OS thread switches in
+an i/o-intensive benchmark like TPC-C.
+
+A possibility is to put a user space thread library between the database
+and NT. User space thread libraries might be very fast.
+
+SQL Server 7.0 can be configured to use 'fibers' which are lightweight
+threads in NT. These should be studied.
+
+		Buffer frames and blocks
+		------------------------
+Following the terminology of Gray and Reuter, we call the memory
+blocks where file pages are loaded buffer frames. For each buffer
+frame there is a control block, or shortly, a block, in the buffer
+control array. The control info which does not need to be stored
+in the file along with the file page, resides in the control block.
+
+		Buffer pool struct
+		------------------
+The buffer buf_pool contains a single mutex which protects all the
+control data structures of the buf_pool. The content of a buffer frame is
+protected by a separate read-write lock in its control block, though.
+These locks can be locked and unlocked without owning the buf_pool mutex.
+The OS events in the buf_pool struct can be waited for without owning the
+buf_pool mutex.
+
+The buf_pool mutex is a hot-spot in main memory, causing a lot of
+memory bus traffic on multiprocessor systems when processors
+alternately access the mutex. On our Pentium, the mutex is accessed
+maybe every 10 microseconds. We gave up the solution to have mutexes
+for each control block, for instance, because it seemed to be
+complicated.
+
+A solution to reduce mutex contention of the buf_pool mutex is to
+create a separate mutex for the page hash table. On Pentium,
+accessing the hash table takes 2 microseconds, about half
+of the total buf_pool mutex hold time.
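The discipline described above (a single buf_pool mutex guarding the control structures, with each frame's contents guarded by its own read-write lock that is taken only after the pool mutex has been released) can be sketched with ordinary POSIX primitives. The toy_* names, the fixed-size array and the linear scan below are stand-ins for InnoDB's real page hash and rw-lock code; this is a model of the locking order, not the actual implementation.

/* Toy version of the locking order: the pool mutex protects the lookup
   and the bufferfix count, while the per-block rwlock that guards the
   frame contents is taken only after the pool mutex has been released. */
#include <pthread.h>
#include <stdio.h>

#define TOY_POOL_SIZE 8

typedef struct {
    unsigned long    space;          /* tablespace id of the cached page */
    unsigned long    offset;         /* page number within the space */
    int              buf_fix_count;  /* pins the block in the pool */
    pthread_rwlock_t lock;           /* protects the frame contents */
} toy_block;

typedef struct {
    pthread_mutex_t mutex;           /* protects blocks[] and the counters */
    toy_block       blocks[TOY_POOL_SIZE];
} toy_pool;

/* Find a page, pin it under the pool mutex, then latch its contents in
   shared mode after the pool mutex has been given up. */
static toy_block *toy_fix_and_latch(toy_pool *pool, unsigned long space,
                                    unsigned long offset)
{
    toy_block *block = NULL;
    int        i;

    pthread_mutex_lock(&pool->mutex);
    for (i = 0; i < TOY_POOL_SIZE; i++) {
        if (pool->blocks[i].space == space
            && pool->blocks[i].offset == offset) {
            block = &pool->blocks[i];
            block->buf_fix_count++;       /* bufferfix: keep it resident */
            break;
        }
    }
    pthread_mutex_unlock(&pool->mutex);

    if (block != NULL) {
        pthread_rwlock_rdlock(&block->lock);  /* no pool mutex held here */
    }
    return block;
}

int main(void)
{
    toy_pool   pool;
    toy_block *block;
    int        i;

    pthread_mutex_init(&pool.mutex, NULL);
    for (i = 0; i < TOY_POOL_SIZE; i++) {
        pool.blocks[i].space         = 0;
        pool.blocks[i].offset        = (unsigned long) i;
        pool.blocks[i].buf_fix_count = 0;
        pthread_rwlock_init(&pool.blocks[i].lock, NULL);
    }

    block = toy_fix_and_latch(&pool, 0, 3);
    printf("fixed page (0, 3), fix count %d\n", block->buf_fix_count);
    pthread_rwlock_unlock(&block->lock);
    return 0;
}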
+
+		Control blocks
+		--------------
+
+The control block contains, for instance, the bufferfix count
+which is incremented when a thread wants a file page to be fixed
+in a buffer frame. The bufferfix operation does not lock the
+contents of the frame, however. For this purpose, the control
+block contains a read-write lock.
+
+The buffer frames have to be aligned so that the start memory
+address of a frame is divisible by the universal page size, which
+is a power of two.
+
+We intend to make the buffer buf_pool size on-line reconfigurable,
+that is, the buf_pool size can be changed without closing the database.
+Then the database administrator may adjust it to be bigger
+at night, for example. The control block array must
+contain enough control blocks for the maximum buffer buf_pool size
+which is used in the particular database.
+If the buf_pool size is cut, we exploit the virtual memory mechanism of
+the OS, and just refrain from using frames at high addresses. Then the OS
+can swap them to disk.
+
+The control blocks containing file pages are put to a hash table
+according to the file address of the page.
+We could speed up the access to an individual page by using
+"pointer swizzling": we could replace the page references on
+non-leaf index pages by direct pointers to the page, if it exists
+in the buf_pool. We could make a separate hash table where we could
+chain all the page references in non-leaf pages residing in the buf_pool,
+using the page reference as the hash key,
+and at the time of reading of a page update the pointers accordingly.
+Drawbacks of this solution are added complexity and,
+possibly, extra space required on non-leaf pages for memory pointers.
+A simpler solution is just to speed up the hash table mechanism
+in the database, using tables whose size is a power of 2.
+
+		Lists of blocks
+		---------------
+
+There are several lists of control blocks. The free list contains
+blocks which are currently not used.
+
+The LRU-list contains all the blocks holding a file page
+except those for which the bufferfix count is non-zero.
+The pages are in the LRU list roughly in the order of the last
+access to the page, so that the oldest pages are at the end of the
+list. We also keep a pointer to near the end of the LRU list,
+which we can use when we want to artificially age a page in the
+buf_pool. This is used if we know that some page is not needed
+again for some time: we insert the block right after the pointer,
+causing it to be replaced sooner than would normally be the case.
+Currently this aging mechanism is used for the read-ahead mechanism
+of pages, and it can also be used when there is a scan of a full
+table which cannot fit in the memory. Putting the pages near the end
+of the LRU list, we make sure that most of the buf_pool stays in the
+main memory, undisturbed.
+
+The chain of modified blocks contains the blocks
+holding file pages that have been modified in the memory
+but not written to disk yet. The block with the oldest modification
+which has not yet been written to disk is at the end of the chain.
+
+		Loading a file page
+		-------------------
+
+First, a victim block for replacement has to be found in the
+buf_pool. It is taken from the free list or searched for from the
+end of the LRU-list. An exclusive lock is reserved for the frame,
+the io_fix field is set in the block fixing the block in buf_pool,
+and the io-operation for loading the page is queued.
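The alignment requirement stated above (frame start addresses divisible by the universal page size) is met later in this file by over-allocating one extra page and rounding the start address up, as buf_pool_create does with ut_align. A minimal stand-alone sketch of that trick, assuming a 16 KiB page size in place of UNIV_PAGE_SIZE:

/* Allocate one extra page, then round the start address up to the next
   page boundary so every frame begins at an address divisible by the
   (power-of-two) page size.  Illustrative only; not InnoDB code. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_PAGE_SIZE 16384UL            /* must be a power of two */

static void *toy_align_up(void *ptr, unsigned long size)
{
    uintptr_t addr = (uintptr_t) ptr;

    return (void *) ((addr + size - 1) & ~(uintptr_t) (size - 1));
}

int main(void)
{
    unsigned long n_frames = 4;
    /* One extra page guarantees room for n_frames aligned frames. */
    void *mem = malloc(TOY_PAGE_SIZE * (n_frames + 1));
    void *frame;

    if (mem == NULL) {
        return 1;
    }
    frame = toy_align_up(mem, TOY_PAGE_SIZE);

    printf("raw %p, first aligned frame %p\n", mem, frame);
    printf("aligned address %% page size = %lu\n",
           (unsigned long) ((uintptr_t) frame % TOY_PAGE_SIZE));

    free(mem);
    return 0;
}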
The io-handler thread +releases the X-lock on the frame and resets the io_fix field +when the io operation completes. + +A thread may request the above operation using the buf_page_get- +function. It may then continue to request a lock on the frame. +The lock is granted when the io-handler releases the x-lock. + + Read-ahead + ---------- + +The read-ahead mechanism is intended to be intelligent and +isolated from the semantically higher levels of the database +index management. From the higher level we only need the +information if a file page has a natural successor or +predecessor page. On the leaf level of a B-tree index, +these are the next and previous pages in the natural +order of the pages. + +Let us first explain the read-ahead mechanism when the leafs +of a B-tree are scanned in an ascending or descending order. +When a read page is the first time referenced in the buf_pool, +the buffer manager checks if it is at the border of a so-called +linear read-ahead area. The tablespace is divided into these +areas of size 64 blocks, for example. So if the page is at the +border of such an area, the read-ahead mechanism checks if +all the other blocks in the area have been accessed in an +ascending or descending order. If this is the case, the system +looks at the natural successor or predecessor of the page, +checks if that is at the border of another area, and in this case +issues read-requests for all the pages in that area. Maybe +we could relax the condition that all the pages in the area +have to be accessed: if data is deleted from a table, there may +appear holes of unused pages in the area. + +A different read-ahead mechanism is used when there appears +to be a random access pattern to a file. +If a new page is referenced in the buf_pool, and several pages +of its random access area (for instance, 32 consecutive pages +in a tablespace) have recently been referenced, we may predict +that the whole area may be needed in the near future, and issue +the read requests for the whole area. */ + +buf_pool_t* buf_pool = NULL; /* The buffer buf_pool of the database */ + +ulint buf_dbg_counter = 0; /* This is used to insert validation + operations in excution in the + debug version */ +ibool buf_debug_prints = FALSE; /* If this is set TRUE, + the program prints info whenever + read-ahead or flush occurs */ + +/************************************************************************ +Initializes a buffer control block when the buf_pool is created. */ +static +void +buf_block_init( +/*===========*/ + buf_block_t* block, /* in: pointer to control block */ + byte* frame) /* in: pointer to buffer frame */ +{ + block->state = BUF_BLOCK_NOT_USED; + + block->frame = frame; + + block->modify_clock = ut_dulint_zero; + + rw_lock_create(&(block->lock)); + ut_ad(rw_lock_validate(&(block->lock))); + + rw_lock_create(&(block->read_lock)); + rw_lock_set_level(&(block->read_lock), SYNC_NO_ORDER_CHECK); + + rw_lock_create(&(block->debug_latch)); + rw_lock_set_level(&(block->debug_latch), SYNC_NO_ORDER_CHECK); +} + +/************************************************************************ +Creates a buffer buf_pool object. 
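The linear read-ahead rule described in the comment above, with areas of, say, 64 pages and read-ahead triggered only when a first reference lands on an area border and the rest of the area has been accessed, reduces to a little arithmetic. A simplified model, with the access-history bookkeeping collapsed into a caller-supplied count:

/* Simplified model of the linear read-ahead border test: the tablespace
   is viewed as areas of 64 pages, and read-ahead is only considered when
   the first reference to a page lands on an area border. */
#include <stdio.h>

#define TOY_READ_AHEAD_AREA 64UL

static unsigned long toy_area_low(unsigned long page_no)
{
    return page_no - page_no % TOY_READ_AHEAD_AREA;
}

static int toy_is_area_border(unsigned long page_no)
{
    return page_no == toy_area_low(page_no)
        || page_no == toy_area_low(page_no) + TOY_READ_AHEAD_AREA - 1;
}

/* Decide whether to read the adjacent area: only on a border page and
   only when (in this toy model) the rest of the area was accessed. */
static int toy_should_read_ahead(unsigned long page_no,
                                 unsigned long n_accessed_in_area)
{
    return toy_is_area_border(page_no)
        && n_accessed_in_area >= TOY_READ_AHEAD_AREA - 1;
}

int main(void)
{
    printf("page 127 on border? %d\n", toy_is_area_border(127));   /* 1 */
    printf("page 130 on border? %d\n", toy_is_area_border(130));   /* 0 */
    printf("read ahead at 127 with 63 hits? %d\n",
           toy_should_read_ahead(127, 63));                        /* 1 */
    return 0;
}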
*/ +static +buf_pool_t* +buf_pool_create( +/*============*/ + /* out, own: buf_pool object, NULL if not + enough memory */ + ulint max_size, /* in: maximum size of the buf_pool in + blocks */ + ulint curr_size) /* in: current size to use, must be <= + max_size, currently must be equal to + max_size */ +{ + byte* frame; + ulint i; + buf_block_t* block; + + ut_a(max_size == curr_size); + + buf_pool = mem_alloc(sizeof(buf_pool_t)); + + /* 1. Initialize general fields + ---------------------------- */ + mutex_create(&(buf_pool->mutex)); + mutex_set_level(&(buf_pool->mutex), SYNC_BUF_POOL); + + mutex_enter(&(buf_pool->mutex)); + + buf_pool->frame_mem = ut_malloc(UNIV_PAGE_SIZE * (max_size + 1)); + + if (buf_pool->frame_mem == NULL) { + + return(NULL); + } + + buf_pool->blocks = ut_malloc(sizeof(buf_block_t) * max_size); + + if (buf_pool->blocks == NULL) { + + return(NULL); + } + + buf_pool->max_size = max_size; + buf_pool->curr_size = curr_size; + + /* Align pointer to the first frame */ + + frame = ut_align(buf_pool->frame_mem, UNIV_PAGE_SIZE); + buf_pool->frame_zero = frame; + + /* Init block structs and assign frames for them */ + for (i = 0; i < max_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + buf_block_init(block, frame); + frame = frame + UNIV_PAGE_SIZE; + } + + buf_pool->page_hash = hash_create(2 * max_size); + + buf_pool->n_pend_reads = 0; + buf_pool->n_pages_read = 0; + buf_pool->n_pages_written = 0; + buf_pool->n_pages_created = 0; + + /* 2. Initialize flushing fields + ---------------------------- */ + UT_LIST_INIT(buf_pool->flush_list); + + for (i = BUF_FLUSH_LRU; i <= BUF_FLUSH_LIST; i++) { + buf_pool->n_flush[i] = 0; + buf_pool->no_flush[i] = os_event_create(NULL); + } + + buf_pool->LRU_flush_ended = 0; + + buf_pool->ulint_clock = 1; + buf_pool->freed_page_clock = 0; + + /* 3. Initialize LRU fields + ---------------------------- */ + UT_LIST_INIT(buf_pool->LRU); + + buf_pool->LRU_old = NULL; + + /* Add control blocks to the free list */ + UT_LIST_INIT(buf_pool->free); + for (i = 0; i < curr_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + UT_LIST_ADD_FIRST(free, buf_pool->free, block); + } + + mutex_exit(&(buf_pool->mutex)); + + btr_search_sys_create(curr_size * UNIV_PAGE_SIZE / sizeof(void*) / 64); + + return(buf_pool); +} + +/************************************************************************ +Initializes the buffer buf_pool of the database. */ + +void +buf_pool_init( +/*==========*/ + ulint max_size, /* in: maximum size of the buf_pool in blocks */ + ulint curr_size) /* in: current size to use, must be <= + max_size */ +{ + ut_a(buf_pool == NULL); + + buf_pool_create(max_size, curr_size); + + ut_ad(buf_validate()); +} + +/************************************************************************ +Allocates a buffer block. */ +UNIV_INLINE +buf_block_t* +buf_block_alloc(void) +/*=================*/ + /* out, own: the allocated block */ +{ + buf_block_t* block; + + block = buf_LRU_get_free_block(); + + return(block); +} + +/************************************************************************ +Moves to the block to the start of the LRU list if there is a danger +that the block would drift out of the buffer pool. 
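buf_pool_create above threads every control block onto the free list with UT_LIST_ADD_FIRST, and buf_block_alloc later takes blocks from it. A toy intrusive list showing just that add-first/remove-first behaviour; the UT_LIST_* macros themselves are defined elsewhere in InnoDB and are replaced here by two plain functions:

/* Toy free list: control blocks are pushed onto an intrusive list at
   startup and popped off the front when a block is allocated. */
#include <stddef.h>
#include <stdio.h>

typedef struct toy_block {
    unsigned long     id;      /* stand-in for the frame this block owns */
    struct toy_block *next;    /* intrusive link, as in UT_LIST_NODE_T */
} toy_block;

typedef struct {
    toy_block    *first;
    unsigned long len;
} toy_list;

static void toy_list_add_first(toy_list *list, toy_block *block)
{
    block->next = list->first;
    list->first = block;
    list->len++;
}

static toy_block *toy_list_remove_first(toy_list *list)
{
    toy_block *block = list->first;

    if (block != NULL) {
        list->first = block->next;
        list->len--;
    }
    return block;
}

int main(void)
{
    toy_block     blocks[4];
    toy_list      free_list = {NULL, 0};
    toy_block    *block;
    unsigned long i;

    for (i = 0; i < 4; i++) {              /* as in pool creation */
        blocks[i].id = i;
        toy_list_add_first(&free_list, &blocks[i]);
    }

    block = toy_list_remove_first(&free_list);   /* allocation-style pop */
    printf("popped block %lu, free list length now %lu\n",
           block->id, free_list.len);
    return 0;
}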
*/ +UNIV_INLINE +void +buf_block_make_young( +/*=================*/ + buf_block_t* block) /* in: block to make younger */ +{ + if (buf_pool->freed_page_clock >= block->freed_page_clock + + 1 + (buf_pool->curr_size / 1024)) { + + /* There has been freeing activity in the LRU list: + best to move to the head of the LRU list */ + + buf_LRU_make_block_young(block); + } +} + +/************************************************************************ +Moves a page to the start of the buffer pool LRU list. This high-level +function can be used to prevent an important page from from slipping out of +the buffer pool. */ + +void +buf_page_make_young( +/*=================*/ + buf_frame_t* frame) /* in: buffer frame of a file page */ +{ + buf_block_t* block; + + mutex_enter(&(buf_pool->mutex)); + + block = buf_block_align(frame); + + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + + buf_LRU_make_block_young(block); + + mutex_exit(&(buf_pool->mutex)); +} + +/************************************************************************ +Frees a buffer block which does not contain a file page. */ +UNIV_INLINE +void +buf_block_free( +/*===========*/ + buf_block_t* block) /* in, own: block to be freed */ +{ + ut_ad(block->state != BUF_BLOCK_FILE_PAGE); + + mutex_enter(&(buf_pool->mutex)); + + buf_LRU_block_free_non_file_page(block); + + mutex_exit(&(buf_pool->mutex)); +} + +/************************************************************************* +Allocates a buffer frame. */ + +buf_frame_t* +buf_frame_alloc(void) +/*=================*/ + /* out: buffer frame */ +{ + return(buf_block_alloc()->frame); +} + +/************************************************************************* +Frees a buffer frame which does not contain a file page. */ + +void +buf_frame_free( +/*===========*/ + buf_frame_t* frame) /* in: buffer frame */ +{ + buf_block_free(buf_block_align(frame)); +} + +/************************************************************************ +Returns the buffer control block if the page can be found in the buffer +pool. NOTE that it is possible that the page is not yet read +from disk, though. This is a very low-level function: use with care! */ + +buf_block_t* +buf_page_peek_block( +/*================*/ + /* out: control block if found from page hash table, + otherwise NULL; NOTE that the page is not necessarily + yet read from disk! */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + + mutex_enter_fast(&(buf_pool->mutex)); + + block = buf_page_hash_get(space, offset); + + mutex_exit(&(buf_pool->mutex)); + + return(block); +} + +/************************************************************************ +Returns the current state of is_hashed of a page. FALSE if the page is +not in the pool. NOTE that this operation does not fix the page in the +pool if it is found there. */ + +ibool +buf_page_peek_if_search_hashed( +/*===========================*/ + /* out: TRUE if page hash index is built in search + system */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + ibool is_hashed; + + mutex_enter_fast(&(buf_pool->mutex)); + + block = buf_page_hash_get(space, offset); + + if (!block) { + is_hashed = FALSE; + } else { + is_hashed = block->is_hashed; + } + + mutex_exit(&(buf_pool->mutex)); + + return(is_hashed); +} + +/************************************************************************ +Returns TRUE if the page can be found in the buffer pool hash table. 
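The heuristic in buf_block_make_young above moves a block to the LRU head only when the pool-wide freed_page_clock has advanced far enough past the block's own stamp that the block risks drifting out of the pool. The predicate, isolated with simplified field names:

/* The "make young" test from buf_block_make_young, isolated so the
   heuristic is easier to see. */
#include <stdio.h>

typedef struct {
    unsigned long freed_page_clock;   /* pool-wide count of freed pages */
    unsigned long curr_size;          /* pool size in blocks */
} toy_pool_stats;

static int toy_should_make_young(const toy_pool_stats *pool,
                                 unsigned long block_freed_page_clock)
{
    /* Same shape as the condition in buf_block_make_young(). */
    return pool->freed_page_clock
        >= block_freed_page_clock + 1 + pool->curr_size / 1024;
}

int main(void)
{
    toy_pool_stats pool = {5000, 8192};  /* 8192 blocks, 5000 pages freed */

    printf("recently touched block: %d\n",
           toy_should_make_young(&pool, 4999));   /* 0: leave in place */
    printf("long-idle block: %d\n",
           toy_should_make_young(&pool, 100));    /* 1: move to LRU head */
    return 0;
}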
NOTE +that it is possible that the page is not yet read from disk, though. */ + +ibool +buf_page_peek( +/*==========*/ + /* out: TRUE if found from page hash table, + NOTE that the page is not necessarily yet read + from disk! */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + if (buf_page_peek_block(space, offset)) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************ +This is the general function used to get access to a database page. */ + +buf_frame_t* +buf_page_get_gen( +/*=============*/ + /* out: pointer to the frame or NULL */ + ulint space, /* in: space id */ + ulint offset, /* in: page number */ + ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ + buf_frame_t* guess, /* in: guessed frame or NULL */ + ulint mode, /* in: BUF_GET, BUF_GET_IF_IN_POOL, + BUF_GET_NO_LATCH, BUF_GET_NOWAIT */ +#ifdef UNIV_SYNC_DEBUG + char* file, /* in: file name */ + ulint line, /* in: line where called */ +#endif + mtr_t* mtr) /* in: mini-transaction */ +{ + buf_block_t* block; + ibool accessed; + ulint fix_type; + ibool success; + ibool must_read; + + ut_ad(mtr); + ut_ad((rw_latch == RW_S_LATCH) + || (rw_latch == RW_X_LATCH) + || (rw_latch == RW_NO_LATCH)); + ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH)); + ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL) + || (mode == BUF_GET_NO_LATCH) || (mode == BUF_GET_NOWAIT)); +#ifndef UNIV_LOG_DEBUG + ut_ad(!ibuf_inside() || ibuf_page(space, offset)); +#endif +loop: + mutex_enter_fast(&(buf_pool->mutex)); + + block = NULL; + + if (guess) { + block = buf_block_align(guess); + + if ((offset != block->offset) || (space != block->space) + || (block->state != BUF_BLOCK_FILE_PAGE)) { + + block = NULL; + } + } + + if (block == NULL) { + block = buf_page_hash_get(space, offset); + } + + if (block == NULL) { + /* Page not in buf_pool: needs to be read from file */ + + mutex_exit(&(buf_pool->mutex)); + + if (mode == BUF_GET_IF_IN_POOL) { + + return(NULL); + } + + buf_read_page(space, offset); + + #ifdef UNIV_DEBUG + buf_dbg_counter++; + + if (buf_dbg_counter % 37 == 0) { + ut_ad(buf_validate()); + } + #endif + goto loop; + } + + must_read = FALSE; + + if (block->io_fix == BUF_IO_READ) { + + must_read = TRUE; + + if (mode == BUF_GET_IF_IN_POOL) { + + /* The page is only being read to buffer */ + mutex_exit(&(buf_pool->mutex)); + + return(NULL); + } + } + +#ifdef UNIV_SYNC_DEBUG + buf_block_buf_fix_inc_debug(block, file, line); +#else + buf_block_buf_fix_inc(block); +#endif + buf_block_make_young(block); + + /* Check if this is the first access to the page */ + + accessed = block->accessed; + + block->accessed = TRUE; + + mutex_exit(&(buf_pool->mutex)); + +#ifdef UNIV_DEBUG + buf_dbg_counter++; + + if (buf_dbg_counter % 5771 == 0) { + ut_ad(buf_validate()); + } +#endif + ut_ad(block->buf_fix_count > 0); + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + + if (mode == BUF_GET_NOWAIT) { + if (rw_latch == RW_S_LATCH) { + success = rw_lock_s_lock_func_nowait(&(block->lock) + #ifdef UNIV_SYNC_DEBUG + ,file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { + ut_ad(rw_latch == RW_X_LATCH); + success = rw_lock_x_lock_func_nowait(&(block->lock) + #ifdef UNIV_SYNC_DEBUG + ,file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_X_FIX; + } + + if (!success) { + mutex_enter(&(buf_pool->mutex)); + + block->buf_fix_count--; +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&(block->debug_latch)); +#endif + mutex_exit(&(buf_pool->mutex)); + + return(NULL); 
+ } + } else if (rw_latch == RW_NO_LATCH) { + + if (must_read) { + rw_lock_x_lock(&(block->read_lock)); + rw_lock_x_unlock(&(block->read_lock)); + } + + fix_type = MTR_MEMO_BUF_FIX; + } else if (rw_latch == RW_S_LATCH) { + + rw_lock_s_lock_func(&(block->lock) + #ifdef UNIV_SYNC_DEBUG + ,0, file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { + rw_lock_x_lock_func(&(block->lock), 0 + #ifdef UNIV_SYNC_DEBUG + , file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_X_FIX; + } + + mtr_memo_push(mtr, block, fix_type); + + if (!accessed) { + /* In the case of a first access, try to apply linear + read-ahead */ + + buf_read_ahead_linear(space, offset); + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(ibuf_count_get(block->space, block->offset) == 0); +#endif + return(block->frame); +} + +/************************************************************************ +This is the general function used to get optimistic access to a database +page. */ + +ibool +buf_page_optimistic_get_func( +/*=========================*/ + /* out: TRUE if success */ + ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ + buf_frame_t* guess, /* in: guessed frame */ + dulint modify_clock,/* in: modify clock value if mode is + ..._GUESS_ON_CLOCK */ +#ifdef UNIV_SYNC_DEBUG + char* file, /* in: file name */ + ulint line, /* in: line where called */ +#endif + mtr_t* mtr) /* in: mini-transaction */ +{ + buf_block_t* block; + ibool accessed; + ibool success; + ulint fix_type; + + ut_ad(mtr && guess); + ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); + + block = buf_block_align(guess); + + mutex_enter(&(buf_pool->mutex)); + + if (block->state != BUF_BLOCK_FILE_PAGE) { + + mutex_exit(&(buf_pool->mutex)); + + return(FALSE); + } + +#ifdef UNIV_SYNC_DEBUG + buf_block_buf_fix_inc_debug(block, file, line); +#else + buf_block_buf_fix_inc(block); +#endif + buf_block_make_young(block); + + /* Check if this is the first access to the page */ + + accessed = block->accessed; + + block->accessed = TRUE; + + mutex_exit(&(buf_pool->mutex)); + + ut_ad(!ibuf_inside() || ibuf_page(block->space, block->offset)); + + if (rw_latch == RW_S_LATCH) { + success = rw_lock_s_lock_func_nowait(&(block->lock) + #ifdef UNIV_SYNC_DEBUG + , file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { + success = rw_lock_x_lock_func_nowait(&(block->lock) + #ifdef UNIV_SYNC_DEBUG + , file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_X_FIX; + } + + if (!success) { + mutex_enter(&(buf_pool->mutex)); + + block->buf_fix_count--; +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&(block->debug_latch)); +#endif + mutex_exit(&(buf_pool->mutex)); + + return(FALSE); + } + + if (!UT_DULINT_EQ(modify_clock, block->modify_clock)) { + + buf_page_dbg_add_level(block->frame, SYNC_NO_ORDER_CHECK); + + if (rw_latch == RW_S_LATCH) { + rw_lock_s_unlock(&(block->lock)); + } else { + rw_lock_x_unlock(&(block->lock)); + } + + mutex_enter(&(buf_pool->mutex)); + + block->buf_fix_count--; +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&(block->debug_latch)); +#endif + mutex_exit(&(buf_pool->mutex)); + + return(FALSE); + } + + mtr_memo_push(mtr, block, fix_type); + +#ifdef UNIV_DEBUG + buf_dbg_counter++; + + if (buf_dbg_counter % 5771 == 0) { + ut_ad(buf_validate()); + } +#endif + ut_ad(block->buf_fix_count > 0); + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + + if (!accessed) { + /* In the case of a first access, try to apply linear + read-ahead */ + + buf_read_ahead_linear(buf_frame_get_space_id(guess), + buf_frame_get_page_no(guess)); + } + +#ifdef UNIV_IBUF_DEBUG + 
ut_a(ibuf_count_get(block->space, block->offset) == 0); +#endif + return(TRUE); +} + +/************************************************************************ +This is used to get access to a known database page, when no waiting can be +done. */ + +ibool +buf_page_get_known_nowait( +/*======================*/ + /* out: TRUE if success */ + ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ + buf_frame_t* guess, /* in: the known page frame */ + ulint mode, /* in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */ +#ifdef UNIV_SYNC_DEBUG + char* file, /* in: file name */ + ulint line, /* in: line where called */ +#endif + mtr_t* mtr) /* in: mini-transaction */ +{ + buf_block_t* block; + ibool success; + ulint fix_type; + + ut_ad(mtr); + ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); + + block = buf_block_align(guess); + + mutex_enter(&(buf_pool->mutex)); + +#ifdef UNIV_SYNC_DEBUG + buf_block_buf_fix_inc_debug(block, file, line); +#else + buf_block_buf_fix_inc(block); +#endif + if (mode == BUF_MAKE_YOUNG) { + buf_block_make_young(block); + } + + mutex_exit(&(buf_pool->mutex)); + + ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD)); + + if (rw_latch == RW_S_LATCH) { + success = rw_lock_s_lock_func_nowait(&(block->lock) + #ifdef UNIV_SYNC_DEBUG + , file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { + success = rw_lock_x_lock_func_nowait(&(block->lock) + #ifdef UNIV_SYNC_DEBUG + , file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_X_FIX; + } + + if (!success) { + mutex_enter(&(buf_pool->mutex)); + + block->buf_fix_count--; +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&(block->debug_latch)); +#endif + mutex_exit(&(buf_pool->mutex)); + + return(FALSE); + } + + mtr_memo_push(mtr, block, fix_type); + +#ifdef UNIV_DEBUG + buf_dbg_counter++; + + if (buf_dbg_counter % 5771 == 0) { + ut_ad(buf_validate()); + } +#endif + ut_ad(block->buf_fix_count > 0); + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + +#ifdef UNIV_IBUF_DEBUG + ut_a((mode == BUF_KEEP_OLD) + || (ibuf_count_get(block->space, block->offset) == 0)); +#endif + return(TRUE); +} + +/************************************************************************ +Inits a page to the buffer buf_pool. */ +static +void +buf_page_init( +/*==========*/ + /* out: pointer to the block */ + ulint space, /* in: space id */ + ulint offset, /* in: offset of the page within space + in units of a page */ + buf_block_t* block) /* in: block to init */ +{ + ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(block->state == BUF_BLOCK_READY_FOR_USE); + + /* Set the state of the block */ + block->state = BUF_BLOCK_FILE_PAGE; + block->space = space; + block->offset = offset; + + block->lock_hash_val = lock_rec_hash(space, offset); + block->lock_mutex = NULL; + + /* Insert into the hash table of file pages */ + + HASH_INSERT(buf_block_t, hash, buf_pool->page_hash, + buf_page_address_fold(space, offset), block); + + block->freed_page_clock = 0; + + block->newest_modification = ut_dulint_zero; + block->oldest_modification = ut_dulint_zero; + + block->accessed = FALSE; + block->buf_fix_count = 0; + block->io_fix = 0; + + block->n_hash_helps = 0; + block->is_hashed = FALSE; +} + +/************************************************************************ +Function which inits a page for read to the buffer buf_pool. If the page is +already in buf_pool, does nothing. Sets the io_fix flag to BUF_IO_READ and +sets a non-recursive exclusive lock on the buffer frame. The io-handler must +take care that the flag is cleared and the lock released later. 
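buf_page_optimistic_get_func above trusts a guessed frame pointer only if the block's modify_clock still has the value the caller remembered; otherwise the guess is stale and the caller must fall back to a normal lookup. A reduced model of that version-counter check, with latching and buffer-fixing omitted:

/* Model of the modify_clock check: a cursor remembers a per-block
   version counter and later trusts its cached pointer only if the
   counter has not moved. */
#include <stdio.h>

typedef struct {
    unsigned long long modify_clock;   /* bumped on structural change */
    int                payload;
} toy_block;

typedef struct {
    toy_block          *guess;         /* cached pointer into the pool */
    unsigned long long  saved_clock;   /* modify_clock when it was cached */
} toy_cursor;

static void toy_cursor_store(toy_cursor *cur, toy_block *block)
{
    cur->guess       = block;
    cur->saved_clock = block->modify_clock;
}

static int toy_cursor_restore(const toy_cursor *cur)
{
    /* TRUE only if nothing invalidated the remembered position. */
    return cur->guess->modify_clock == cur->saved_clock;
}

int main(void)
{
    toy_block  block = {0, 42};
    toy_cursor cur;

    toy_cursor_store(&cur, &block);
    printf("restore before change: %d\n", toy_cursor_restore(&cur)); /* 1 */

    block.modify_clock++;              /* e.g. the page was reorganized */
    printf("restore after change:  %d\n", toy_cursor_restore(&cur)); /* 0 */
    return 0;
}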
This is one +of the functions which perform the state transition NOT_USED => FILE_PAGE to +a block (the other is buf_page_create). */ + +buf_block_t* +buf_page_init_for_read( +/*===================*/ + /* out: pointer to the block or NULL */ + ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + mtr_t mtr; + + ut_ad(buf_pool); + + if (mode == BUF_READ_IBUF_PAGES_ONLY) { + /* It is a read-ahead within an ibuf routine */ + + ut_ad(!ibuf_bitmap_page(offset)); + ut_ad(ibuf_inside()); + + mtr_start(&mtr); + + if (!ibuf_page_low(space, offset, &mtr)) { + + mtr_commit(&mtr); + + return(NULL); + } + } else { + ut_ad(mode == BUF_READ_ANY_PAGE); + } + + block = buf_block_alloc(); + + ut_ad(block); + + mutex_enter(&(buf_pool->mutex)); + + if (NULL != buf_page_hash_get(space, offset)) { + + /* The page is already in buf_pool, return */ + + mutex_exit(&(buf_pool->mutex)); + buf_block_free(block); + + if (mode == BUF_READ_IBUF_PAGES_ONLY) { + + mtr_commit(&mtr); + } + + return(NULL); + } + + ut_ad(block); + + buf_page_init(space, offset, block); + + /* The block must be put to the LRU list, to the old blocks */ + + buf_LRU_add_block(block, TRUE); /* TRUE == to old blocks */ + + block->io_fix = BUF_IO_READ; + buf_pool->n_pend_reads++; + + /* We set a pass-type x-lock on the frame because then the same + thread which called for the read operation (and is running now at + this point of code) can wait for the read to complete by waiting + for the x-lock on the frame; if the x-lock were recursive, the + same thread would illegally get the x-lock before the page read + is completed. The x-lock is cleared by the io-handler thread. */ + + rw_lock_x_lock_gen(&(block->lock), BUF_IO_READ); + + rw_lock_x_lock_gen(&(block->read_lock), BUF_IO_READ); + + mutex_exit(&(buf_pool->mutex)); + + if (mode == BUF_READ_IBUF_PAGES_ONLY) { + + mtr_commit(&mtr); + } + + return(block); +} + +/************************************************************************ +Initializes a page to the buffer buf_pool. The page is usually not read +from a file even if it cannot be found in the buffer buf_pool. This is one +of the functions which perform to a block a state transition NOT_USED => +FILE_PAGE (the other is buf_page_init_for_read above). 
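buf_page_init_for_read above marks the block with io_fix = BUF_IO_READ and takes a pass-type x-lock so that the requesting thread can wait for the read while the i/o handler releases the lock from another thread. Plain pthread rwlocks cannot be handed between threads like that, so the sketch below shows the same wait-for-completion idea with a mutex and condition variable instead; it is a model of the protocol, not of InnoDB's rw-locks:

/* Wait-for-read model: the thread that queues the read marks the block
   busy; readers wait until the (simulated) i/o handler clears the flag. */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

typedef struct {
    pthread_mutex_t mutex;
    pthread_cond_t  io_done;
    int             io_fix;            /* 1 while a read is in flight */
} toy_block;

static void *toy_io_handler(void *arg)  /* completes the queued read */
{
    toy_block *block = arg;

    sleep(1);                           /* pretend the disk is working */
    pthread_mutex_lock(&block->mutex);
    block->io_fix = 0;
    pthread_cond_broadcast(&block->io_done);
    pthread_mutex_unlock(&block->mutex);
    return NULL;
}

static void toy_wait_for_read(toy_block *block)
{
    pthread_mutex_lock(&block->mutex);
    while (block->io_fix) {
        pthread_cond_wait(&block->io_done, &block->mutex);
    }
    pthread_mutex_unlock(&block->mutex);
}

int main(void)
{
    toy_block block = {PTHREAD_MUTEX_INITIALIZER,
                       PTHREAD_COND_INITIALIZER, 1};
    pthread_t io_thread;

    pthread_create(&io_thread, NULL, toy_io_handler, &block);
    toy_wait_for_read(&block);          /* returns once io_fix is cleared */
    printf("read completed, frame usable\n");
    pthread_join(io_thread, NULL);
    return 0;
}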
*/ + +buf_frame_t* +buf_page_create( +/*============*/ + /* out: pointer to the frame, page bufferfixed */ + ulint space, /* in: space id */ + ulint offset, /* in: offset of the page within space in units of + a page */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + buf_frame_t* frame; + buf_block_t* block; + buf_block_t* free_block = NULL; + + ut_ad(mtr); + + free_block = buf_LRU_get_free_block(); + + /* Delete possible entries for the page from the insert buffer: + such can exist if the page belonged to an index which was dropped */ + + ibuf_merge_or_delete_for_page(NULL, space, offset); + + mutex_enter(&(buf_pool->mutex)); + + block = buf_page_hash_get(space, offset); + + if (block != NULL) { +#ifdef UNIV_IBUF_DEBUG + ut_a(ibuf_count_get(block->space, block->offset) == 0); +#endif + /* Page can be found in buf_pool */ + mutex_exit(&(buf_pool->mutex)); + + buf_block_free(free_block); + + frame = buf_page_get_with_no_latch(space, offset, mtr); + + return(frame); + } + + /* If we get here, the page was not in buf_pool: init it there */ + + if (buf_debug_prints) { + printf("Creating space %lu page %lu to buffer\n", space, + offset); + } + + block = free_block; + + buf_page_init(space, offset, block); + + /* The block must be put to the LRU list */ + buf_LRU_add_block(block, FALSE); + +#ifdef UNIV_SYNC_DEBUG + buf_block_buf_fix_inc_debug(block, __FILE__, __LINE__); +#else + buf_block_buf_fix_inc(block); +#endif + mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); + + block->accessed = TRUE; + + buf_pool->n_pages_created++; + + mutex_exit(&(buf_pool->mutex)); + + /* Flush pages from the end of the LRU list if necessary */ + buf_flush_free_margin(); + + frame = block->frame; +#ifdef UNIV_DEBUG + buf_dbg_counter++; + + if (buf_dbg_counter % 357 == 0) { + ut_ad(buf_validate()); + } +#endif +#ifdef UNIV_IBUF_DEBUG + ut_a(ibuf_count_get(block->space, block->offset) == 0); +#endif + return(frame); +} + +/************************************************************************ +Completes an asynchronous read or write request of a file page to or from +the buffer pool. */ + +void +buf_page_io_complete( +/*=================*/ + buf_block_t* block) /* in: pointer to the block in question */ +{ + dulint id; + dict_index_t* index; + ulint io_type; + + ut_ad(block); + + io_type = block->io_fix; + + if (io_type == BUF_IO_READ) { + if (recv_recovery_is_on()) { + recv_recover_page(TRUE, block->frame, block->space, + block->offset); + } + + if (!recv_no_ibuf_operations) { + ibuf_merge_or_delete_for_page(block->frame, + block->space, block->offset); + } + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(ibuf_count_get(block->space, block->offset) == 0); +#endif + mutex_enter(&(buf_pool->mutex)); + + /* Because this thread which does the unlocking is not the same that + did the locking, we use a pass value != 0 in unlock, which simply + removes the newest lock debug record, without checking the thread + id. */ + + block->io_fix = 0; + + if (io_type == BUF_IO_READ) { + /* NOTE that the call to ibuf may have moved the ownership of + the x-latch to this OS thread: do not let this confuse you in + debugging! 
*/ + + ut_ad(buf_pool->n_pend_reads > 0); + buf_pool->n_pend_reads--; + buf_pool->n_pages_read++; +/* + if (0 != ut_dulint_cmp( + mach_read_from_8(block->frame + FIL_PAGE_LSN), + mach_read_from_8(block->frame + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN))) { + + printf("DB error: file page corrupted!\n"); + + ut_error; + } +*/ + rw_lock_x_unlock_gen(&(block->lock), BUF_IO_READ); + rw_lock_x_unlock_gen(&(block->read_lock), BUF_IO_READ); + + if (buf_debug_prints) { + printf("Has read "); + } + } else { + ut_ad(io_type == BUF_IO_WRITE); + + /* Write means a flush operation: call the completion + routine in the flush system */ + + buf_flush_write_complete(block); + + rw_lock_s_unlock_gen(&(block->lock), BUF_IO_WRITE); + + buf_pool->n_pages_written++; + + if (buf_debug_prints) { + printf("Has written "); + } + } + + mutex_exit(&(buf_pool->mutex)); + + if (buf_debug_prints) { + printf("page space %lu page no %lu", block->space, + block->offset); + id = btr_page_get_index_id(block->frame); + + index = NULL; + /* The following can cause deadlocks if used: */ + /* + index = dict_index_get_if_in_cache(id); + + if (index) { + printf(" index name %s table %s", index->name, + index->table->name); + } + */ + + printf("\n"); + } +} + +/************************************************************************* +Invalidates the file pages in the buffer pool when an archive recovery is +completed. All the file pages buffered must be in a replaceable state when +this function is called: not latched and not modified. */ + +void +buf_pool_invalidate(void) +/*=====================*/ +{ + ibool freed; + + ut_ad(buf_all_freed()); + + freed = TRUE; + + while (freed) { + freed = buf_LRU_search_and_free_block(0); + } + + mutex_enter(&(buf_pool->mutex)); + + ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0); + + mutex_exit(&(buf_pool->mutex)); +} + +/************************************************************************* +Validates the buffer buf_pool data structure. 
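The corruption check that is commented out in buf_page_io_complete above compares the log sequence number stored in the page header with the copy kept at the very end of the page; a mismatch indicates a torn or garbled read. A stand-alone model of that check; the offsets used here are invented stand-ins for FIL_PAGE_LSN and FIL_PAGE_END_LSN, whose real values are defined elsewhere in InnoDB:

/* A page stores its LSN once in the header and once at the very end;
   a clean read must see the same value twice. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TOY_PAGE_SIZE   16384
#define TOY_HEADER_LSN  16              /* assumed header offset */
#define TOY_TRAILER_LSN (TOY_PAGE_SIZE - 8)

static uint64_t toy_read_8(const unsigned char *p)   /* big-endian read */
{
    uint64_t v = 0;
    int      i;

    for (i = 0; i < 8; i++) {
        v = (v << 8) | p[i];
    }
    return v;
}

static int toy_page_lsn_ok(const unsigned char *page)
{
    return toy_read_8(page + TOY_HEADER_LSN)
        == toy_read_8(page + TOY_TRAILER_LSN);
}

int main(void)
{
    unsigned char page[TOY_PAGE_SIZE];

    memset(page, 0, sizeof(page));
    page[TOY_HEADER_LSN + 7]  = 0x2a;   /* header LSN = 42 */
    page[TOY_TRAILER_LSN + 7] = 0x2a;   /* matching trailer copy */
    printf("page ok? %d\n", toy_page_lsn_ok(page));   /* 1 */

    page[TOY_TRAILER_LSN + 7] = 0x00;   /* torn write: trailer differs */
    printf("page ok? %d\n", toy_page_lsn_ok(page));   /* 0 */
    return 0;
}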
*/ + +ibool +buf_validate(void) +/*==============*/ +{ + buf_block_t* block; + ulint i; + ulint n_single_flush = 0; + ulint n_lru_flush = 0; + ulint n_list_flush = 0; + ulint n_lru = 0; + ulint n_flush = 0; + ulint n_free = 0; + ulint n_page = 0; + + ut_ad(buf_pool); + + mutex_enter(&(buf_pool->mutex)); + + for (i = 0; i < buf_pool->curr_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + + if (block->state == BUF_BLOCK_FILE_PAGE) { + + ut_a(buf_page_hash_get(block->space, + block->offset) == block); + n_page++; + +#ifdef UNIV_IBUF_DEBUG + ut_a((block->io_fix == BUF_IO_READ) + || ibuf_count_get(block->space, block->offset) + == 0); +#endif + if (block->io_fix == BUF_IO_WRITE) { + + if (block->flush_type == BUF_FLUSH_LRU) { + n_lru_flush++; + ut_a(rw_lock_is_locked(&(block->lock), + RW_LOCK_SHARED)); + } else if (block->flush_type == + BUF_FLUSH_LIST) { + n_list_flush++; + } else if (block->flush_type == + BUF_FLUSH_SINGLE_PAGE) { + n_single_flush++; + } else { + ut_error; + } + + } else if (block->io_fix == BUF_IO_READ) { + + ut_a(rw_lock_is_locked(&(block->lock), + RW_LOCK_EX)); + } + + n_lru++; + + if (ut_dulint_cmp(block->oldest_modification, + ut_dulint_zero) > 0) { + n_flush++; + } + + } else if (block->state == BUF_BLOCK_NOT_USED) { + n_free++; + } + } + + if (n_lru + n_free > buf_pool->curr_size) { + printf("n LRU %lu, n free %lu\n", n_lru, n_free); + ut_error; + } + + ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru); + if (UT_LIST_GET_LEN(buf_pool->free) != n_free) { + printf("Free list len %lu, free blocks %lu\n", + UT_LIST_GET_LEN(buf_pool->free), n_free); + ut_error; + } + ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush); + + ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush); + ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush); + ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush); + + mutex_exit(&(buf_pool->mutex)); + + ut_a(buf_LRU_validate()); + ut_a(buf_flush_validate()); + + return(TRUE); +} + +/************************************************************************* +Prints info of the buffer buf_pool data structure. 
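buf_validate above walks every control block, tallies blocks by state and flush type, and asserts that the tallies agree with the lengths of the LRU, free and flush lists. The overall shape of such an invariant check, cut down to two states and two list-length counters:

/* Walk the blocks, tally the states, and require the tallies to match
   the list lengths maintained elsewhere. */
#include <assert.h>
#include <stdio.h>

enum toy_state { TOY_NOT_USED, TOY_FILE_PAGE };

typedef struct {
    enum toy_state state;
} toy_block;

int main(void)
{
    toy_block     blocks[6] = {{TOY_FILE_PAGE}, {TOY_NOT_USED},
                               {TOY_FILE_PAGE}, {TOY_FILE_PAGE},
                               {TOY_NOT_USED},  {TOY_FILE_PAGE}};
    unsigned long lru_list_len  = 4;    /* maintained by the pool code */
    unsigned long free_list_len = 2;
    unsigned long n_lru = 0, n_free = 0;
    unsigned long i;

    for (i = 0; i < 6; i++) {
        if (blocks[i].state == TOY_FILE_PAGE) {
            n_lru++;
        } else {
            n_free++;
        }
    }

    /* buf_validate() uses ut_a() for the same kind of assertions. */
    assert(n_lru == lru_list_len);
    assert(n_free == free_list_len);
    printf("pool consistent: %lu in LRU, %lu free\n", n_lru, n_free);
    return 0;
}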
*/ + +void +buf_print(void) +/*===========*/ +{ + dulint* index_ids; + ulint* counts; + ulint size; + ulint i; + ulint j; + dulint id; + ulint n_found; + buf_frame_t* frame; + dict_index_t* index; + + ut_ad(buf_pool); + + size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE; + + index_ids = mem_alloc(sizeof(dulint) * size); + counts = mem_alloc(sizeof(ulint) * size); + + mutex_enter(&(buf_pool->mutex)); + + printf("LRU len %lu \n", UT_LIST_GET_LEN(buf_pool->LRU)); + printf("free len %lu \n", UT_LIST_GET_LEN(buf_pool->free)); + printf("flush len %lu \n", UT_LIST_GET_LEN(buf_pool->flush_list)); + printf("buf_pool size %lu \n", size); + + printf("n pending reads %lu \n", buf_pool->n_pend_reads); + + printf("n pending flush LRU %lu list %lu single page %lu\n", + buf_pool->n_flush[BUF_FLUSH_LRU], + buf_pool->n_flush[BUF_FLUSH_LIST], + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]); + + printf("pages read %lu, created %lu, written %lu\n", + buf_pool->n_pages_read, buf_pool->n_pages_created, + buf_pool->n_pages_written); + + /* Count the number of blocks belonging to each index in the buffer */ + + n_found = 0; + + for (i = 0 ; i < size; i++) { + counts[i] = 0; + } + + for (i = 0; i < size; i++) { + frame = buf_pool_get_nth_block(buf_pool, i)->frame; + + if (fil_page_get_type(frame) == FIL_PAGE_INDEX) { + + id = btr_page_get_index_id(frame); + + /* Look for the id in the index_ids array */ + j = 0; + + while (j < n_found) { + + if (ut_dulint_cmp(index_ids[j], id) == 0) { + (counts[j])++; + + break; + } + j++; + } + + if (j == n_found) { + n_found++; + index_ids[j] = id; + counts[j] = 1; + } + } + } + + mutex_exit(&(buf_pool->mutex)); + + for (i = 0; i < n_found; i++) { + index = dict_index_get_if_in_cache(index_ids[i]); + + printf("Block count for index %lu in buffer is about %lu", + ut_dulint_get_low(index_ids[i]), counts[i]); + + if (index) { + printf(" index name %s table %s", index->name, + index->table->name); + } + + printf("\n"); + } + + mem_free(index_ids); + mem_free(counts); + + ut_a(buf_validate()); +} + +/************************************************************************* +Prints info of the buffer i/o. */ + +void +buf_print_io(void) +/*==============*/ +{ + ut_ad(buf_pool); + + mutex_enter(&(buf_pool->mutex)); + + printf("pages read %lu, created %lu, written %lu\n", + buf_pool->n_pages_read, buf_pool->n_pages_created, + buf_pool->n_pages_written); + mutex_exit(&(buf_pool->mutex)); +} + +/************************************************************************* +Checks that all file pages in the buffer are in a replaceable state. */ + +ibool +buf_all_freed(void) +/*===============*/ +{ + buf_block_t* block; + ulint i; + + ut_ad(buf_pool); + + mutex_enter(&(buf_pool->mutex)); + + for (i = 0; i < buf_pool->curr_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + + if (block->state == BUF_BLOCK_FILE_PAGE) { + + if (!buf_flush_ready_for_replace(block)) { + + /* printf("Page %lu %lu still fixed or dirty\n", + block->space, block->offset); */ + ut_error; + } + } + } + + mutex_exit(&(buf_pool->mutex)); + + return(TRUE); +} + +/************************************************************************* +Checks that there currently are no pending i/o-operations for the buffer +pool. 
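buf_print above estimates how many cached pages belong to each index by collecting distinct index ids and their counts in two parallel arrays during a single scan. The same counting idiom on plain numbers:

/* Collect distinct ids and their occurrence counts in parallel arrays,
   as buf_print does for index ids of cached pages. */
#include <stdio.h>

#define TOY_MAX_IDS 16

int main(void)
{
    unsigned long page_index_ids[] = {7, 3, 7, 7, 9, 3}; /* one per page */
    unsigned long ids[TOY_MAX_IDS];
    unsigned long counts[TOY_MAX_IDS];
    unsigned long n_found = 0;
    unsigned long n_pages = sizeof(page_index_ids) / sizeof(*page_index_ids);
    unsigned long i, j;

    for (i = 0; i < n_pages; i++) {
        for (j = 0; j < n_found; j++) {
            if (ids[j] == page_index_ids[i]) {
                counts[j]++;
                break;
            }
        }
        if (j == n_found) {             /* first time this id is seen */
            ids[n_found]    = page_index_ids[i];
            counts[n_found] = 1;
            n_found++;
        }
    }

    for (j = 0; j < n_found; j++) {
        printf("index %lu: about %lu pages in the pool\n",
               ids[j], counts[j]);
    }
    return 0;
}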
*/ + +ibool +buf_pool_check_no_pending_io(void) +/*==============================*/ + /* out: TRUE if there is no pending i/o */ +{ + ibool ret; + + mutex_enter(&(buf_pool->mutex)); + + if (buf_pool->n_pend_reads + buf_pool->n_flush[BUF_FLUSH_LRU] + + buf_pool->n_flush[BUF_FLUSH_LIST] + + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]) { + ret = FALSE; + } else { + ret = TRUE; + } + + mutex_exit(&(buf_pool->mutex)); + + return(ret); +} + +/************************************************************************* +Gets the current length of the free list of buffer blocks. */ + +ulint +buf_get_free_list_len(void) +/*=======================*/ +{ + ulint len; + + mutex_enter(&(buf_pool->mutex)); + + len = UT_LIST_GET_LEN(buf_pool->free); + + mutex_exit(&(buf_pool->mutex)); + + return(len); +} |