author     unknown <monty@donna.mysql.com>   2001-02-17 14:19:19 +0200
committer  unknown <monty@donna.mysql.com>   2001-02-17 14:19:19 +0200
commit     2662b59306ef0cd495fa6e2edf7129e58a11393a (patch)
tree       bfe39951a73e906579ab819bf5198ad8f3a64a36 /innobase/buf
parent     66de55a56bdcf2f7a9c0c4f8e19b3e761475e202 (diff)
Added Innobase to source distribution
Docs/manual.texi:
Added Innobase documentation
configure.in:
Incremented version
include/my_base.h:
Added option for Innobase
myisam/mi_check.c:
cleanup
mysql-test/t/bdb.test:
cleanup
mysql-test/t/innobase.test:
Extended with new tests from bdb.test
mysql-test/t/merge.test:
Added test of SHOW CREATE
mysys/my_init.c:
Fix for UNIXWARE 7
scripts/mysql_install_db.sh:
Always write how to start mysqld
scripts/safe_mysqld.sh:
Fixed typo
sql/ha_innobase.cc:
Update to new version
sql/ha_innobase.h:
Update to new version
sql/handler.h:
Added 'update_table_comment()' and 'append_create_info()'
sql/sql_delete.cc:
Fixes for Innobase
sql/sql_select.cc:
Fixes for Innobase
sql/sql_show.cc:
Append create information (for MERGE tables)
sql/sql_update.cc:
Fixes for Innobase
Diffstat (limited to 'innobase/buf')
-rw-r--r--  innobase/buf/Makefile.am     24
-rw-r--r--  innobase/buf/buf0buf.c     1568
-rw-r--r--  innobase/buf/buf0flu.c      702
-rw-r--r--  innobase/buf/buf0lru.c      734
-rw-r--r--  innobase/buf/buf0rea.c      559
-rw-r--r--  innobase/buf/makefilewin     20
-rw-r--r--  innobase/buf/ts/makefile     20
-rw-r--r--  innobase/buf/ts/tsbuf.c     885
-rw-r--r--  innobase/buf/ts/tsos.c      185
9 files changed, 4697 insertions, 0 deletions
diff --git a/innobase/buf/Makefile.am b/innobase/buf/Makefile.am new file mode 100644 index 00000000000..b1463c2220e --- /dev/null +++ b/innobase/buf/Makefile.am @@ -0,0 +1,24 @@ +# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB +# & Innobase Oy +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +include ../include/Makefile.i + +libs_LIBRARIES = libbuf.a + +libbuf_a_SOURCES = buf0buf.c buf0flu.c buf0lru.c buf0rea.c + +EXTRA_PROGRAMS = diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c new file mode 100644 index 00000000000..4ffda8772f3 --- /dev/null +++ b/innobase/buf/buf0buf.c @@ -0,0 +1,1568 @@ +/* Innobase relational database engine; Copyright (C) 2001 Innobase Oy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License 2 + as published by the Free Software Foundation in June 1991. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License 2 + along with this program (in file COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +/****************************************************** +The database buffer buf_pool + +(c) 1995 Innobase Oy + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0buf.h" + +#ifdef UNIV_NONINL +#include "buf0buf.ic" +#endif + +#include "mem0mem.h" +#include "btr0btr.h" +#include "fil0fil.h" +#include "lock0lock.h" +#include "btr0sea.h" +#include "ibuf0ibuf.h" +#include "dict0dict.h" +#include "log0recv.h" + +/* + IMPLEMENTATION OF THE BUFFER POOL + ================================= + +Performance improvement: +------------------------ +Thread scheduling in NT may be so slow that the OS wait mechanism should +not be used even in waiting for disk reads to complete. +Rather, we should put waiting query threads to the queue of +waiting jobs, and let the OS thread do something useful while the i/o +is processed. In this way we could remove most OS thread switches in +an i/o-intensive benchmark like TPC-C. + +A possibility is to put a user space thread library between the database +and NT. User space thread libraries might be very fast. + +SQL Server 7.0 can be configured to use 'fibers' which are lightweight +threads in NT. These should be studied. + + Buffer frames and blocks + ------------------------ +Following the terminology of Gray and Reuter, we call the memory +blocks where file pages are loaded buffer frames. For each buffer +frame there is a control block, or shortly, a block, in the buffer +control array. 
The control info which does not need to be stored +in the file along with the file page resides in the control block. + + Buffer pool struct + ------------------ +The buffer buf_pool contains a single mutex which protects all the +control data structures of the buf_pool. The content of a buffer frame is +protected by a separate read-write lock in its control block, though. +These locks can be locked and unlocked without owning the buf_pool mutex. +The OS events in the buf_pool struct can be waited for without owning the +buf_pool mutex. + +The buf_pool mutex is a hot-spot in main memory, causing a lot of +memory bus traffic on multiprocessor systems when processors +alternately access the mutex. On our Pentium, the mutex is accessed +maybe every 10 microseconds. We gave up the solution to have mutexes +for each control block, for instance, because it seemed to be +complicated. + +A solution to reduce mutex contention of the buf_pool mutex is to +create a separate mutex for the page hash table. On Pentium, +accessing the hash table takes 2 microseconds, about half +of the total buf_pool mutex hold time. + + Control blocks + -------------- + +The control block contains, for instance, the bufferfix count +which is incremented when a thread wants a file page to be fixed +in a buffer frame. The bufferfix operation does not lock the +contents of the frame, however. For this purpose, the control +block contains a read-write lock. + +The buffer frames have to be aligned so that the start memory +address of a frame is divisible by the universal page size, which +is a power of two. + +We intend to make the buffer buf_pool size on-line reconfigurable, +that is, the buf_pool size can be changed without closing the database. +Then the database administrator may adjust it to be bigger +at night, for example. The control block array must +contain enough control blocks for the maximum buffer buf_pool size +which is used in the particular database. +If the buf_pool size is cut, we exploit the virtual memory mechanism of +the OS, and just refrain from using frames at high addresses. Then the OS +can swap them to disk. + +The control blocks containing file pages are put to a hash table +according to the file address of the page. +We could speed up the access to an individual page by using +"pointer swizzling": we could replace the page references on +non-leaf index pages by direct pointers to the page, if it exists +in the buf_pool. We could make a separate hash table where we could +chain all the page references in non-leaf pages residing in the buf_pool, +using the page reference as the hash key, +and at the time of reading of a page update the pointers accordingly. +Drawbacks of this solution are added complexity and, +possibly, extra space required on non-leaf pages for memory pointers. +A simpler solution is just to speed up the hash table mechanism +in the database, using tables whose size is a power of 2. + + Lists of blocks + --------------- + +There are several lists of control blocks. The free list contains +blocks which are currently not used. + +The LRU-list contains all the blocks holding a file page +except those for which the bufferfix count is non-zero. +The pages are in the LRU list roughly in the order of the last +access to the page, so that the oldest pages are at the end of the +list. We also keep a pointer to near the end of the LRU list, +which we can use when we want to artificially age a page in the +buf_pool.
This is used if we know that some page is not needed +again for some time: we insert the block right after the pointer, +causing it to be replaced sooner than would normally be the case. +Currently this aging mechanism is used for the read-ahead mechanism +of pages, and it can also be used when there is a scan of a full +table which cannot fit in the memory. Putting the pages near the end +of the LRU list, we make sure that most of the buf_pool stays in the +main memory, undisturbed. + +The chain of modified blocks contains the blocks +holding file pages that have been modified in the memory +but not written to disk yet. The block with the oldest modification +which has not yet been written to disk is at the end of the chain. + + Loading a file page + ------------------- + +First, a victim block for replacement has to be found in the +buf_pool. It is taken from the free list or searched for from the +end of the LRU-list. An exclusive lock is reserved for the frame, +the io_fix field is set in the block fixing the block in buf_pool, +and the io-operation for loading the page is queued. The io-handler thread +releases the X-lock on the frame and resets the io_fix field +when the io operation completes. + +A thread may request the above operation using the buf_page_get- +function. It may then continue to request a lock on the frame. +The lock is granted when the io-handler releases the x-lock. + + Read-ahead + ---------- + +The read-ahead mechanism is intended to be intelligent and +isolated from the semantically higher levels of the database +index management. From the higher level we only need the +information if a file page has a natural successor or +predecessor page. On the leaf level of a B-tree index, +these are the next and previous pages in the natural +order of the pages. + +Let us first explain the read-ahead mechanism when the leaves +of a B-tree are scanned in an ascending or descending order. +When a page is referenced in the buf_pool for the first time, +the buffer manager checks if it is at the border of a so-called +linear read-ahead area. The tablespace is divided into these +areas of size 64 blocks, for example. So if the page is at the +border of such an area, the read-ahead mechanism checks if +all the other blocks in the area have been accessed in an +ascending or descending order. If this is the case, the system +looks at the natural successor or predecessor of the page, +checks if that is at the border of another area, and in this case +issues read-requests for all the pages in that area. Maybe +we could relax the condition that all the pages in the area +have to be accessed: if data is deleted from a table, there may +appear holes of unused pages in the area. + +A different read-ahead mechanism is used when there appears +to be a random access pattern to a file. +If a new page is referenced in the buf_pool, and several pages +of its random access area (for instance, 32 consecutive pages +in a tablespace) have recently been referenced, we may predict +that the whole area may be needed in the near future, and issue +the read requests for the whole area.
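
A minimal sketch of the linear read-ahead border arithmetic (the
variable names here are illustrative; the actual logic lives in
buf_read_ahead_linear() in buf0rea.c):

	area = 64;			(pages per linear area)
	low  = (offset / area) * area;	(first page of the area)
	high = low + area;		(one past the last page)

	if (offset == low || offset == high - 1) {
		the page is at an area border: check whether the other
		pages of the area were accessed in ascending or
		descending order, and if so, issue the read requests
		for the adjacent area
	}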
*/ + +buf_pool_t* buf_pool = NULL; /* The buffer buf_pool of the database */ + +ulint buf_dbg_counter = 0; /* This is used to insert validation + operations in execution in the + debug version */ +ibool buf_debug_prints = FALSE; /* If this is set TRUE, + the program prints info whenever + read-ahead or flush occurs */ + +/************************************************************************ +Initializes a buffer control block when the buf_pool is created. */ +static +void +buf_block_init( +/*===========*/ + buf_block_t* block, /* in: pointer to control block */ + byte* frame) /* in: pointer to buffer frame */ +{ + block->state = BUF_BLOCK_NOT_USED; + + block->frame = frame; + + block->modify_clock = ut_dulint_zero; + + rw_lock_create(&(block->lock)); + ut_ad(rw_lock_validate(&(block->lock))); + + rw_lock_create(&(block->read_lock)); + rw_lock_set_level(&(block->read_lock), SYNC_NO_ORDER_CHECK); + + rw_lock_create(&(block->debug_latch)); + rw_lock_set_level(&(block->debug_latch), SYNC_NO_ORDER_CHECK); +} + +/************************************************************************ +Creates a buffer buf_pool object. */ +static +buf_pool_t* +buf_pool_create( +/*============*/ + /* out, own: buf_pool object, NULL if not + enough memory */ + ulint max_size, /* in: maximum size of the buf_pool in + blocks */ + ulint curr_size) /* in: current size to use, must be <= + max_size, currently must be equal to + max_size */ +{ + byte* frame; + ulint i; + buf_block_t* block; + + ut_a(max_size == curr_size); + + buf_pool = mem_alloc(sizeof(buf_pool_t)); + + /* 1. Initialize general fields + ---------------------------- */ + mutex_create(&(buf_pool->mutex)); + mutex_set_level(&(buf_pool->mutex), SYNC_BUF_POOL); + + mutex_enter(&(buf_pool->mutex)); + + buf_pool->frame_mem = ut_malloc(UNIV_PAGE_SIZE * (max_size + 1)); + + if (buf_pool->frame_mem == NULL) { + + return(NULL); + } + + buf_pool->blocks = ut_malloc(sizeof(buf_block_t) * max_size); + + if (buf_pool->blocks == NULL) { + + return(NULL); + } + + buf_pool->max_size = max_size; + buf_pool->curr_size = curr_size; + + /* Align pointer to the first frame */ + + frame = ut_align(buf_pool->frame_mem, UNIV_PAGE_SIZE); + buf_pool->frame_zero = frame; + + /* Init block structs and assign frames for them */ + for (i = 0; i < max_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + buf_block_init(block, frame); + frame = frame + UNIV_PAGE_SIZE; + } + + buf_pool->page_hash = hash_create(2 * max_size); + + buf_pool->n_pend_reads = 0; + buf_pool->n_pages_read = 0; + buf_pool->n_pages_written = 0; + buf_pool->n_pages_created = 0; + + /* 2. Initialize flushing fields + ---------------------------- */ + UT_LIST_INIT(buf_pool->flush_list); + + for (i = BUF_FLUSH_LRU; i <= BUF_FLUSH_LIST; i++) { + buf_pool->n_flush[i] = 0; + buf_pool->no_flush[i] = os_event_create(NULL); + } + + buf_pool->LRU_flush_ended = 0; + + buf_pool->ulint_clock = 1; + buf_pool->freed_page_clock = 0; + + /* 3.
Initialize LRU fields + ---------------------------- */ + UT_LIST_INIT(buf_pool->LRU); + + buf_pool->LRU_old = NULL; + + /* Add control blocks to the free list */ + UT_LIST_INIT(buf_pool->free); + for (i = 0; i < curr_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + UT_LIST_ADD_FIRST(free, buf_pool->free, block); + } + + mutex_exit(&(buf_pool->mutex)); + + btr_search_sys_create(curr_size * UNIV_PAGE_SIZE / sizeof(void*) / 64); + + return(buf_pool); +} + +/************************************************************************ +Initializes the buffer buf_pool of the database. */ + +void +buf_pool_init( +/*==========*/ + ulint max_size, /* in: maximum size of the buf_pool in blocks */ + ulint curr_size) /* in: current size to use, must be <= + max_size */ +{ + ut_a(buf_pool == NULL); + + buf_pool_create(max_size, curr_size); + + ut_ad(buf_validate()); +} + +/************************************************************************ +Allocates a buffer block. */ +UNIV_INLINE +buf_block_t* +buf_block_alloc(void) +/*=================*/ + /* out, own: the allocated block */ +{ + buf_block_t* block; + + block = buf_LRU_get_free_block(); + + return(block); +} + +/************************************************************************ +Moves the block to the start of the LRU list if there is a danger +that the block would drift out of the buffer pool. */ +UNIV_INLINE +void +buf_block_make_young( +/*=================*/ + buf_block_t* block) /* in: block to make younger */ +{ + if (buf_pool->freed_page_clock >= block->freed_page_clock + + 1 + (buf_pool->curr_size / 1024)) { + + /* There has been freeing activity in the LRU list: + best to move to the head of the LRU list */ + + buf_LRU_make_block_young(block); + } +} + +/************************************************************************ +Moves a page to the start of the buffer pool LRU list. This high-level +function can be used to prevent an important page from slipping out of +the buffer pool. */ + +void +buf_page_make_young( +/*=================*/ + buf_frame_t* frame) /* in: buffer frame of a file page */ +{ + buf_block_t* block; + + mutex_enter(&(buf_pool->mutex)); + + block = buf_block_align(frame); + + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + + buf_LRU_make_block_young(block); + + mutex_exit(&(buf_pool->mutex)); +} + +/************************************************************************ +Frees a buffer block which does not contain a file page. */ +UNIV_INLINE +void +buf_block_free( +/*===========*/ + buf_block_t* block) /* in, own: block to be freed */ +{ + ut_ad(block->state != BUF_BLOCK_FILE_PAGE); + + mutex_enter(&(buf_pool->mutex)); + + buf_LRU_block_free_non_file_page(block); + + mutex_exit(&(buf_pool->mutex)); +} + +/************************************************************************* +Allocates a buffer frame. */ + +buf_frame_t* +buf_frame_alloc(void) +/*=================*/ + /* out: buffer frame */ +{ + return(buf_block_alloc()->frame); +} + +/************************************************************************* +Frees a buffer frame which does not contain a file page. */ + +void +buf_frame_free( +/*===========*/ + buf_frame_t* frame) /* in: buffer frame */ +{ + buf_block_free(buf_block_align(frame)); +} + +/************************************************************************ +Returns the buffer control block if the page can be found in the buffer +pool. NOTE that it is possible that the page is not yet read +from disk, though. This is a very low-level function: use with care!
*/ + +buf_block_t* +buf_page_peek_block( +/*================*/ + /* out: control block if found from page hash table, + otherwise NULL; NOTE that the page is not necessarily + yet read from disk! */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + + mutex_enter_fast(&(buf_pool->mutex)); + + block = buf_page_hash_get(space, offset); + + mutex_exit(&(buf_pool->mutex)); + + return(block); +} + +/************************************************************************ +Returns the current state of is_hashed of a page. FALSE if the page is +not in the pool. NOTE that this operation does not fix the page in the +pool if it is found there. */ + +ibool +buf_page_peek_if_search_hashed( +/*===========================*/ + /* out: TRUE if page hash index is built in search + system */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + ibool is_hashed; + + mutex_enter_fast(&(buf_pool->mutex)); + + block = buf_page_hash_get(space, offset); + + if (!block) { + is_hashed = FALSE; + } else { + is_hashed = block->is_hashed; + } + + mutex_exit(&(buf_pool->mutex)); + + return(is_hashed); +} + +/************************************************************************ +Returns TRUE if the page can be found in the buffer pool hash table. NOTE +that it is possible that the page is not yet read from disk, though. */ + +ibool +buf_page_peek( +/*==========*/ + /* out: TRUE if found from page hash table, + NOTE that the page is not necessarily yet read + from disk! */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + if (buf_page_peek_block(space, offset)) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************ +This is the general function used to get access to a database page. 
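
A minimal usage sketch, assuming the usual buf_page_get macro which
invokes this function with mode BUF_GET:

	mtr_start(&mtr);

	frame = buf_page_get(space, offset, RW_S_LATCH, &mtr);

	(read the page contents through frame)

	mtr_commit(&mtr);	(releases the buffer fix and the latch)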
*/ + +buf_frame_t* +buf_page_get_gen( +/*=============*/ + /* out: pointer to the frame or NULL */ + ulint space, /* in: space id */ + ulint offset, /* in: page number */ + ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ + buf_frame_t* guess, /* in: guessed frame or NULL */ + ulint mode, /* in: BUF_GET, BUF_GET_IF_IN_POOL, + BUF_GET_NO_LATCH, BUF_GET_NOWAIT */ +#ifdef UNIV_SYNC_DEBUG + char* file, /* in: file name */ + ulint line, /* in: line where called */ +#endif + mtr_t* mtr) /* in: mini-transaction */ +{ + buf_block_t* block; + ibool accessed; + ulint fix_type; + ibool success; + ibool must_read; + + ut_ad(mtr); + ut_ad((rw_latch == RW_S_LATCH) + || (rw_latch == RW_X_LATCH) + || (rw_latch == RW_NO_LATCH)); + ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH)); + ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL) + || (mode == BUF_GET_NO_LATCH) || (mode == BUF_GET_NOWAIT)); +#ifndef UNIV_LOG_DEBUG + ut_ad(!ibuf_inside() || ibuf_page(space, offset)); +#endif +loop: + mutex_enter_fast(&(buf_pool->mutex)); + + block = NULL; + + if (guess) { + block = buf_block_align(guess); + + if ((offset != block->offset) || (space != block->space) + || (block->state != BUF_BLOCK_FILE_PAGE)) { + + block = NULL; + } + } + + if (block == NULL) { + block = buf_page_hash_get(space, offset); + } + + if (block == NULL) { + /* Page not in buf_pool: needs to be read from file */ + + mutex_exit(&(buf_pool->mutex)); + + if (mode == BUF_GET_IF_IN_POOL) { + + return(NULL); + } + + buf_read_page(space, offset); + + #ifdef UNIV_DEBUG + buf_dbg_counter++; + + if (buf_dbg_counter % 37 == 0) { + ut_ad(buf_validate()); + } + #endif + goto loop; + } + + must_read = FALSE; + + if (block->io_fix == BUF_IO_READ) { + + must_read = TRUE; + + if (mode == BUF_GET_IF_IN_POOL) { + + /* The page is only being read to buffer */ + mutex_exit(&(buf_pool->mutex)); + + return(NULL); + } + } + +#ifdef UNIV_SYNC_DEBUG + buf_block_buf_fix_inc_debug(block, file, line); +#else + buf_block_buf_fix_inc(block); +#endif + buf_block_make_young(block); + + /* Check if this is the first access to the page */ + + accessed = block->accessed; + + block->accessed = TRUE; + + mutex_exit(&(buf_pool->mutex)); + +#ifdef UNIV_DEBUG + buf_dbg_counter++; + + if (buf_dbg_counter % 5771 == 0) { + ut_ad(buf_validate()); + } +#endif + ut_ad(block->buf_fix_count > 0); + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + + if (mode == BUF_GET_NOWAIT) { + if (rw_latch == RW_S_LATCH) { + success = rw_lock_s_lock_func_nowait(&(block->lock) + #ifdef UNIV_SYNC_DEBUG + ,file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { + ut_ad(rw_latch == RW_X_LATCH); + success = rw_lock_x_lock_func_nowait(&(block->lock) + #ifdef UNIV_SYNC_DEBUG + ,file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_X_FIX; + } + + if (!success) { + mutex_enter(&(buf_pool->mutex)); + + block->buf_fix_count--; +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&(block->debug_latch)); +#endif + mutex_exit(&(buf_pool->mutex)); + + return(NULL); + } + } else if (rw_latch == RW_NO_LATCH) { + + if (must_read) { + rw_lock_x_lock(&(block->read_lock)); + rw_lock_x_unlock(&(block->read_lock)); + } + + fix_type = MTR_MEMO_BUF_FIX; + } else if (rw_latch == RW_S_LATCH) { + + rw_lock_s_lock_func(&(block->lock) + #ifdef UNIV_SYNC_DEBUG + ,0, file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { + rw_lock_x_lock_func(&(block->lock), 0 + #ifdef UNIV_SYNC_DEBUG + , file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_X_FIX; + } + + mtr_memo_push(mtr, block, fix_type); + + 
if (!accessed) { + /* In the case of a first access, try to apply linear + read-ahead */ + + buf_read_ahead_linear(space, offset); + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(ibuf_count_get(block->space, block->offset) == 0); +#endif + return(block->frame); +} + +/************************************************************************ +This is the general function used to get optimistic access to a database +page. */ + +ibool +buf_page_optimistic_get_func( +/*=========================*/ + /* out: TRUE if success */ + ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ + buf_frame_t* guess, /* in: guessed frame */ + dulint modify_clock,/* in: modify clock value if mode is + ..._GUESS_ON_CLOCK */ +#ifdef UNIV_SYNC_DEBUG + char* file, /* in: file name */ + ulint line, /* in: line where called */ +#endif + mtr_t* mtr) /* in: mini-transaction */ +{ + buf_block_t* block; + ibool accessed; + ibool success; + ulint fix_type; + + ut_ad(mtr && guess); + ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); + + block = buf_block_align(guess); + + mutex_enter(&(buf_pool->mutex)); + + if (block->state != BUF_BLOCK_FILE_PAGE) { + + mutex_exit(&(buf_pool->mutex)); + + return(FALSE); + } + +#ifdef UNIV_SYNC_DEBUG + buf_block_buf_fix_inc_debug(block, file, line); +#else + buf_block_buf_fix_inc(block); +#endif + buf_block_make_young(block); + + /* Check if this is the first access to the page */ + + accessed = block->accessed; + + block->accessed = TRUE; + + mutex_exit(&(buf_pool->mutex)); + + ut_ad(!ibuf_inside() || ibuf_page(block->space, block->offset)); + + if (rw_latch == RW_S_LATCH) { + success = rw_lock_s_lock_func_nowait(&(block->lock) + #ifdef UNIV_SYNC_DEBUG + , file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { + success = rw_lock_x_lock_func_nowait(&(block->lock) + #ifdef UNIV_SYNC_DEBUG + , file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_X_FIX; + } + + if (!success) { + mutex_enter(&(buf_pool->mutex)); + + block->buf_fix_count--; +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&(block->debug_latch)); +#endif + mutex_exit(&(buf_pool->mutex)); + + return(FALSE); + } + + if (!UT_DULINT_EQ(modify_clock, block->modify_clock)) { + + buf_page_dbg_add_level(block->frame, SYNC_NO_ORDER_CHECK); + + if (rw_latch == RW_S_LATCH) { + rw_lock_s_unlock(&(block->lock)); + } else { + rw_lock_x_unlock(&(block->lock)); + } + + mutex_enter(&(buf_pool->mutex)); + + block->buf_fix_count--; +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&(block->debug_latch)); +#endif + mutex_exit(&(buf_pool->mutex)); + + return(FALSE); + } + + mtr_memo_push(mtr, block, fix_type); + +#ifdef UNIV_DEBUG + buf_dbg_counter++; + + if (buf_dbg_counter % 5771 == 0) { + ut_ad(buf_validate()); + } +#endif + ut_ad(block->buf_fix_count > 0); + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + + if (!accessed) { + /* In the case of a first access, try to apply linear + read-ahead */ + + buf_read_ahead_linear(buf_frame_get_space_id(guess), + buf_frame_get_page_no(guess)); + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(ibuf_count_get(block->space, block->offset) == 0); +#endif + return(TRUE); +} + +/************************************************************************ +This is used to get access to a known database page, when no waiting can be +done. 
*/ + +ibool +buf_page_get_known_nowait( +/*======================*/ + /* out: TRUE if success */ + ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ + buf_frame_t* guess, /* in: the known page frame */ + ulint mode, /* in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */ +#ifdef UNIV_SYNC_DEBUG + char* file, /* in: file name */ + ulint line, /* in: line where called */ +#endif + mtr_t* mtr) /* in: mini-transaction */ +{ + buf_block_t* block; + ibool success; + ulint fix_type; + + ut_ad(mtr); + ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); + + block = buf_block_align(guess); + + mutex_enter(&(buf_pool->mutex)); + +#ifdef UNIV_SYNC_DEBUG + buf_block_buf_fix_inc_debug(block, file, line); +#else + buf_block_buf_fix_inc(block); +#endif + if (mode == BUF_MAKE_YOUNG) { + buf_block_make_young(block); + } + + mutex_exit(&(buf_pool->mutex)); + + ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD)); + + if (rw_latch == RW_S_LATCH) { + success = rw_lock_s_lock_func_nowait(&(block->lock) + #ifdef UNIV_SYNC_DEBUG + , file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { + success = rw_lock_x_lock_func_nowait(&(block->lock) + #ifdef UNIV_SYNC_DEBUG + , file, line + #endif + ); + fix_type = MTR_MEMO_PAGE_X_FIX; + } + + if (!success) { + mutex_enter(&(buf_pool->mutex)); + + block->buf_fix_count--; +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&(block->debug_latch)); +#endif + mutex_exit(&(buf_pool->mutex)); + + return(FALSE); + } + + mtr_memo_push(mtr, block, fix_type); + +#ifdef UNIV_DEBUG + buf_dbg_counter++; + + if (buf_dbg_counter % 5771 == 0) { + ut_ad(buf_validate()); + } +#endif + ut_ad(block->buf_fix_count > 0); + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + +#ifdef UNIV_IBUF_DEBUG + ut_a((mode == BUF_KEEP_OLD) + || (ibuf_count_get(block->space, block->offset) == 0)); +#endif + return(TRUE); +} + +/************************************************************************ +Inits a page to the buffer buf_pool. */ +static +void +buf_page_init( +/*==========*/ + /* out: pointer to the block */ + ulint space, /* in: space id */ + ulint offset, /* in: offset of the page within space + in units of a page */ + buf_block_t* block) /* in: block to init */ +{ + ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(block->state == BUF_BLOCK_READY_FOR_USE); + + /* Set the state of the block */ + block->state = BUF_BLOCK_FILE_PAGE; + block->space = space; + block->offset = offset; + + block->lock_hash_val = lock_rec_hash(space, offset); + block->lock_mutex = NULL; + + /* Insert into the hash table of file pages */ + + HASH_INSERT(buf_block_t, hash, buf_pool->page_hash, + buf_page_address_fold(space, offset), block); + + block->freed_page_clock = 0; + + block->newest_modification = ut_dulint_zero; + block->oldest_modification = ut_dulint_zero; + + block->accessed = FALSE; + block->buf_fix_count = 0; + block->io_fix = 0; + + block->n_hash_helps = 0; + block->is_hashed = FALSE; +} + +/************************************************************************ +Function which inits a page for read to the buffer buf_pool. If the page is +already in buf_pool, does nothing. Sets the io_fix flag to BUF_IO_READ and +sets a non-recursive exclusive lock on the buffer frame. The io-handler must +take care that the flag is cleared and the lock released later. This is one +of the functions which perform the state transition NOT_USED => FILE_PAGE to +a block (the other is buf_page_create). 
*/ + +buf_block_t* +buf_page_init_for_read( +/*===================*/ + /* out: pointer to the block or NULL */ + ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + mtr_t mtr; + + ut_ad(buf_pool); + + if (mode == BUF_READ_IBUF_PAGES_ONLY) { + /* It is a read-ahead within an ibuf routine */ + + ut_ad(!ibuf_bitmap_page(offset)); + ut_ad(ibuf_inside()); + + mtr_start(&mtr); + + if (!ibuf_page_low(space, offset, &mtr)) { + + mtr_commit(&mtr); + + return(NULL); + } + } else { + ut_ad(mode == BUF_READ_ANY_PAGE); + } + + block = buf_block_alloc(); + + ut_ad(block); + + mutex_enter(&(buf_pool->mutex)); + + if (NULL != buf_page_hash_get(space, offset)) { + + /* The page is already in buf_pool, return */ + + mutex_exit(&(buf_pool->mutex)); + buf_block_free(block); + + if (mode == BUF_READ_IBUF_PAGES_ONLY) { + + mtr_commit(&mtr); + } + + return(NULL); + } + + ut_ad(block); + + buf_page_init(space, offset, block); + + /* The block must be put to the LRU list, to the old blocks */ + + buf_LRU_add_block(block, TRUE); /* TRUE == to old blocks */ + + block->io_fix = BUF_IO_READ; + buf_pool->n_pend_reads++; + + /* We set a pass-type x-lock on the frame because then the same + thread which called for the read operation (and is running now at + this point of code) can wait for the read to complete by waiting + for the x-lock on the frame; if the x-lock were recursive, the + same thread would illegally get the x-lock before the page read + is completed. The x-lock is cleared by the io-handler thread. */ + + rw_lock_x_lock_gen(&(block->lock), BUF_IO_READ); + + rw_lock_x_lock_gen(&(block->read_lock), BUF_IO_READ); + + mutex_exit(&(buf_pool->mutex)); + + if (mode == BUF_READ_IBUF_PAGES_ONLY) { + + mtr_commit(&mtr); + } + + return(block); +} + +/************************************************************************ +Initializes a page to the buffer buf_pool. The page is usually not read +from a file even if it cannot be found in the buffer buf_pool. This is one +of the functions which perform to a block a state transition NOT_USED => +FILE_PAGE (the other is buf_page_init_for_read above). 
*/ + +buf_frame_t* +buf_page_create( +/*============*/ + /* out: pointer to the frame, page bufferfixed */ + ulint space, /* in: space id */ + ulint offset, /* in: offset of the page within space in units of + a page */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + buf_frame_t* frame; + buf_block_t* block; + buf_block_t* free_block = NULL; + + ut_ad(mtr); + + free_block = buf_LRU_get_free_block(); + + /* Delete possible entries for the page from the insert buffer: + such can exist if the page belonged to an index which was dropped */ + + ibuf_merge_or_delete_for_page(NULL, space, offset); + + mutex_enter(&(buf_pool->mutex)); + + block = buf_page_hash_get(space, offset); + + if (block != NULL) { +#ifdef UNIV_IBUF_DEBUG + ut_a(ibuf_count_get(block->space, block->offset) == 0); +#endif + /* Page can be found in buf_pool */ + mutex_exit(&(buf_pool->mutex)); + + buf_block_free(free_block); + + frame = buf_page_get_with_no_latch(space, offset, mtr); + + return(frame); + } + + /* If we get here, the page was not in buf_pool: init it there */ + + if (buf_debug_prints) { + printf("Creating space %lu page %lu to buffer\n", space, + offset); + } + + block = free_block; + + buf_page_init(space, offset, block); + + /* The block must be put to the LRU list */ + buf_LRU_add_block(block, FALSE); + +#ifdef UNIV_SYNC_DEBUG + buf_block_buf_fix_inc_debug(block, __FILE__, __LINE__); +#else + buf_block_buf_fix_inc(block); +#endif + mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); + + block->accessed = TRUE; + + buf_pool->n_pages_created++; + + mutex_exit(&(buf_pool->mutex)); + + /* Flush pages from the end of the LRU list if necessary */ + buf_flush_free_margin(); + + frame = block->frame; +#ifdef UNIV_DEBUG + buf_dbg_counter++; + + if (buf_dbg_counter % 357 == 0) { + ut_ad(buf_validate()); + } +#endif +#ifdef UNIV_IBUF_DEBUG + ut_a(ibuf_count_get(block->space, block->offset) == 0); +#endif + return(frame); +} + +/************************************************************************ +Completes an asynchronous read or write request of a file page to or from +the buffer pool. */ + +void +buf_page_io_complete( +/*=================*/ + buf_block_t* block) /* in: pointer to the block in question */ +{ + dulint id; + dict_index_t* index; + ulint io_type; + + ut_ad(block); + + io_type = block->io_fix; + + if (io_type == BUF_IO_READ) { + if (recv_recovery_is_on()) { + recv_recover_page(TRUE, block->frame, block->space, + block->offset); + } + + if (!recv_no_ibuf_operations) { + ibuf_merge_or_delete_for_page(block->frame, + block->space, block->offset); + } + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(ibuf_count_get(block->space, block->offset) == 0); +#endif + mutex_enter(&(buf_pool->mutex)); + + /* Because this thread which does the unlocking is not the same that + did the locking, we use a pass value != 0 in unlock, which simply + removes the newest lock debug record, without checking the thread + id. */ + + block->io_fix = 0; + + if (io_type == BUF_IO_READ) { + /* NOTE that the call to ibuf may have moved the ownership of + the x-latch to this OS thread: do not let this confuse you in + debugging! 
*/ + + ut_ad(buf_pool->n_pend_reads > 0); + buf_pool->n_pend_reads--; + buf_pool->n_pages_read++; +/* + if (0 != ut_dulint_cmp( + mach_read_from_8(block->frame + FIL_PAGE_LSN), + mach_read_from_8(block->frame + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN))) { + + printf("DB error: file page corrupted!\n"); + + ut_error; + } +*/ + rw_lock_x_unlock_gen(&(block->lock), BUF_IO_READ); + rw_lock_x_unlock_gen(&(block->read_lock), BUF_IO_READ); + + if (buf_debug_prints) { + printf("Has read "); + } + } else { + ut_ad(io_type == BUF_IO_WRITE); + + /* Write means a flush operation: call the completion + routine in the flush system */ + + buf_flush_write_complete(block); + + rw_lock_s_unlock_gen(&(block->lock), BUF_IO_WRITE); + + buf_pool->n_pages_written++; + + if (buf_debug_prints) { + printf("Has written "); + } + } + + mutex_exit(&(buf_pool->mutex)); + + if (buf_debug_prints) { + printf("page space %lu page no %lu", block->space, + block->offset); + id = btr_page_get_index_id(block->frame); + + index = NULL; + /* The following can cause deadlocks if used: */ + /* + index = dict_index_get_if_in_cache(id); + + if (index) { + printf(" index name %s table %s", index->name, + index->table->name); + } + */ + + printf("\n"); + } +} + +/************************************************************************* +Invalidates the file pages in the buffer pool when an archive recovery is +completed. All the file pages buffered must be in a replaceable state when +this function is called: not latched and not modified. */ + +void +buf_pool_invalidate(void) +/*=====================*/ +{ + ibool freed; + + ut_ad(buf_all_freed()); + + freed = TRUE; + + while (freed) { + freed = buf_LRU_search_and_free_block(0); + } + + mutex_enter(&(buf_pool->mutex)); + + ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0); + + mutex_exit(&(buf_pool->mutex)); +} + +/************************************************************************* +Validates the buffer buf_pool data structure. 
*/ + +ibool +buf_validate(void) +/*==============*/ +{ + buf_block_t* block; + ulint i; + ulint n_single_flush = 0; + ulint n_lru_flush = 0; + ulint n_list_flush = 0; + ulint n_lru = 0; + ulint n_flush = 0; + ulint n_free = 0; + ulint n_page = 0; + + ut_ad(buf_pool); + + mutex_enter(&(buf_pool->mutex)); + + for (i = 0; i < buf_pool->curr_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + + if (block->state == BUF_BLOCK_FILE_PAGE) { + + ut_a(buf_page_hash_get(block->space, + block->offset) == block); + n_page++; + +#ifdef UNIV_IBUF_DEBUG + ut_a((block->io_fix == BUF_IO_READ) + || ibuf_count_get(block->space, block->offset) + == 0); +#endif + if (block->io_fix == BUF_IO_WRITE) { + + if (block->flush_type == BUF_FLUSH_LRU) { + n_lru_flush++; + ut_a(rw_lock_is_locked(&(block->lock), + RW_LOCK_SHARED)); + } else if (block->flush_type == + BUF_FLUSH_LIST) { + n_list_flush++; + } else if (block->flush_type == + BUF_FLUSH_SINGLE_PAGE) { + n_single_flush++; + } else { + ut_error; + } + + } else if (block->io_fix == BUF_IO_READ) { + + ut_a(rw_lock_is_locked(&(block->lock), + RW_LOCK_EX)); + } + + n_lru++; + + if (ut_dulint_cmp(block->oldest_modification, + ut_dulint_zero) > 0) { + n_flush++; + } + + } else if (block->state == BUF_BLOCK_NOT_USED) { + n_free++; + } + } + + if (n_lru + n_free > buf_pool->curr_size) { + printf("n LRU %lu, n free %lu\n", n_lru, n_free); + ut_error; + } + + ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru); + if (UT_LIST_GET_LEN(buf_pool->free) != n_free) { + printf("Free list len %lu, free blocks %lu\n", + UT_LIST_GET_LEN(buf_pool->free), n_free); + ut_error; + } + ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush); + + ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush); + ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush); + ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush); + + mutex_exit(&(buf_pool->mutex)); + + ut_a(buf_LRU_validate()); + ut_a(buf_flush_validate()); + + return(TRUE); +} + +/************************************************************************* +Prints info of the buffer buf_pool data structure. 
*/ + +void +buf_print(void) +/*===========*/ +{ + dulint* index_ids; + ulint* counts; + ulint size; + ulint i; + ulint j; + dulint id; + ulint n_found; + buf_frame_t* frame; + dict_index_t* index; + + ut_ad(buf_pool); + + size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE; + + index_ids = mem_alloc(sizeof(dulint) * size); + counts = mem_alloc(sizeof(ulint) * size); + + mutex_enter(&(buf_pool->mutex)); + + printf("LRU len %lu \n", UT_LIST_GET_LEN(buf_pool->LRU)); + printf("free len %lu \n", UT_LIST_GET_LEN(buf_pool->free)); + printf("flush len %lu \n", UT_LIST_GET_LEN(buf_pool->flush_list)); + printf("buf_pool size %lu \n", size); + + printf("n pending reads %lu \n", buf_pool->n_pend_reads); + + printf("n pending flush LRU %lu list %lu single page %lu\n", + buf_pool->n_flush[BUF_FLUSH_LRU], + buf_pool->n_flush[BUF_FLUSH_LIST], + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]); + + printf("pages read %lu, created %lu, written %lu\n", + buf_pool->n_pages_read, buf_pool->n_pages_created, + buf_pool->n_pages_written); + + /* Count the number of blocks belonging to each index in the buffer */ + + n_found = 0; + + for (i = 0 ; i < size; i++) { + counts[i] = 0; + } + + for (i = 0; i < size; i++) { + frame = buf_pool_get_nth_block(buf_pool, i)->frame; + + if (fil_page_get_type(frame) == FIL_PAGE_INDEX) { + + id = btr_page_get_index_id(frame); + + /* Look for the id in the index_ids array */ + j = 0; + + while (j < n_found) { + + if (ut_dulint_cmp(index_ids[j], id) == 0) { + (counts[j])++; + + break; + } + j++; + } + + if (j == n_found) { + n_found++; + index_ids[j] = id; + counts[j] = 1; + } + } + } + + mutex_exit(&(buf_pool->mutex)); + + for (i = 0; i < n_found; i++) { + index = dict_index_get_if_in_cache(index_ids[i]); + + printf("Block count for index %lu in buffer is about %lu", + ut_dulint_get_low(index_ids[i]), counts[i]); + + if (index) { + printf(" index name %s table %s", index->name, + index->table->name); + } + + printf("\n"); + } + + mem_free(index_ids); + mem_free(counts); + + ut_a(buf_validate()); +} + +/************************************************************************* +Prints info of the buffer i/o. */ + +void +buf_print_io(void) +/*==============*/ +{ + ut_ad(buf_pool); + + mutex_enter(&(buf_pool->mutex)); + + printf("pages read %lu, created %lu, written %lu\n", + buf_pool->n_pages_read, buf_pool->n_pages_created, + buf_pool->n_pages_written); + mutex_exit(&(buf_pool->mutex)); +} + +/************************************************************************* +Checks that all file pages in the buffer are in a replaceable state. */ + +ibool +buf_all_freed(void) +/*===============*/ +{ + buf_block_t* block; + ulint i; + + ut_ad(buf_pool); + + mutex_enter(&(buf_pool->mutex)); + + for (i = 0; i < buf_pool->curr_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + + if (block->state == BUF_BLOCK_FILE_PAGE) { + + if (!buf_flush_ready_for_replace(block)) { + + /* printf("Page %lu %lu still fixed or dirty\n", + block->space, block->offset); */ + ut_error; + } + } + } + + mutex_exit(&(buf_pool->mutex)); + + return(TRUE); +} + +/************************************************************************* +Checks that there currently are no pending i/o-operations for the buffer +pool. 
*/ + +ibool +buf_pool_check_no_pending_io(void) +/*==============================*/ + /* out: TRUE if there is no pending i/o */ +{ + ibool ret; + + mutex_enter(&(buf_pool->mutex)); + + if (buf_pool->n_pend_reads + buf_pool->n_flush[BUF_FLUSH_LRU] + + buf_pool->n_flush[BUF_FLUSH_LIST] + + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]) { + ret = FALSE; + } else { + ret = TRUE; + } + + mutex_exit(&(buf_pool->mutex)); + + return(ret); +} + +/************************************************************************* +Gets the current length of the free list of buffer blocks. */ + +ulint +buf_get_free_list_len(void) +/*=======================*/ +{ + ulint len; + + mutex_enter(&(buf_pool->mutex)); + + len = UT_LIST_GET_LEN(buf_pool->free); + + mutex_exit(&(buf_pool->mutex)); + + return(len); +} diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c new file mode 100644 index 00000000000..d3a86b0c18d --- /dev/null +++ b/innobase/buf/buf0flu.c @@ -0,0 +1,702 @@ +/****************************************************** +The database buffer buf_pool flush algorithm + +(c) 1995 Innobase Oy + +Created 11/11/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0flu.h" + +#ifdef UNIV_NONINL +#include "buf0flu.ic" +#endif + +#include "ut0byte.h" +#include "ut0lst.h" +#include "fil0fil.h" + +#include "buf0buf.h" +#include "buf0lru.h" +#include "buf0rea.h" +#include "ibuf0ibuf.h" +#include "log0log.h" +#include "os0file.h" + +/* When flushed, dirty blocks are searched in neighborhoods of this size, and +flushed along with the original page. */ + +#define BUF_FLUSH_AREA ut_min(BUF_READ_AHEAD_AREA,\ + buf_pool->curr_size / 16) + +/********************************************************************** +Validates the flush list. */ +static +ibool +buf_flush_validate_low(void); +/*========================*/ + /* out: TRUE if ok */ + +/************************************************************************ +Inserts a modified block into the flush list. */ + +void +buf_flush_insert_into_flush_list( +/*=============================*/ + buf_block_t* block) /* in: block which is modified */ +{ + ut_ad(mutex_own(&(buf_pool->mutex))); + + ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL) + || (ut_dulint_cmp( + (UT_LIST_GET_FIRST(buf_pool->flush_list)) + ->oldest_modification, + block->oldest_modification) <= 0)); + + UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block); + + ut_ad(buf_flush_validate_low()); +} + +/************************************************************************ +Inserts a modified block into the flush list in the right sorted position. +This function is used by recovery, because there the modifications do not +necessarily come in the order of lsn's.
*/ + +void +buf_flush_insert_sorted_into_flush_list( +/*====================================*/ + buf_block_t* block) /* in: block which is modified */ +{ + buf_block_t* prev_b; + buf_block_t* b; + + ut_ad(mutex_own(&(buf_pool->mutex))); + + prev_b = NULL; + b = UT_LIST_GET_FIRST(buf_pool->flush_list); + + while (b && (ut_dulint_cmp(b->oldest_modification, + block->oldest_modification) > 0)) { + prev_b = b; + b = UT_LIST_GET_NEXT(flush_list, b); + } + + if (prev_b == NULL) { + UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block); + } else { + UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list, prev_b, + block); + } + + ut_ad(buf_flush_validate_low()); +} + +/************************************************************************ +Returns TRUE if the file page block is immediately suitable for replacement, +i.e., the transition FILE_PAGE => NOT_USED allowed. */ + +ibool +buf_flush_ready_for_replace( +/*========================*/ + /* out: TRUE if can replace immediately */ + buf_block_t* block) /* in: buffer control block, must be in state + BUF_BLOCK_FILE_PAGE and in the LRU list*/ +{ + ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + + if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0) + || (block->buf_fix_count != 0) + || (block->io_fix != 0)) { + + return(FALSE); + } + + return(TRUE); +} + +/************************************************************************ +Returns TRUE if the block is modified and ready for flushing. */ +UNIV_INLINE +ibool +buf_flush_ready_for_flush( +/*======================*/ + /* out: TRUE if can flush immediately */ + buf_block_t* block, /* in: buffer control block, must be in state + BUF_BLOCK_FILE_PAGE */ + ulint flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ +{ + ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + + if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0) + && (block->io_fix == 0)) { + + if (flush_type != BUF_FLUSH_LRU) { + + return(TRUE); + + } else if ((block->old || (UT_LIST_GET_LEN(buf_pool->LRU) + < BUF_LRU_OLD_MIN_LEN)) + && (block->buf_fix_count == 0)) { + + /* If we are flushing the LRU list, to avoid deadlocks + we require the block not to be bufferfixed, and hence + not latched. Since LRU flushed blocks are soon moved + to the free list, it is good to flush only old blocks + from the end of the LRU list. */ + + return(TRUE); + } + } + + return(FALSE); +} + +/************************************************************************ +Updates the flush system data structures when a write is completed. 
*/ + +void +buf_flush_write_complete( +/*=====================*/ + buf_block_t* block) /* in: pointer to the block in question */ +{ + ut_ad(block); + ut_ad(mutex_own(&(buf_pool->mutex))); + + block->oldest_modification = ut_dulint_zero; + + UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block); + + ut_d(UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list)); + + (buf_pool->n_flush[block->flush_type])--; + + if (block->flush_type == BUF_FLUSH_LRU) { + /* Put the block to the end of the LRU list to wait to be + moved to the free list */ + + buf_LRU_make_block_old(block); + + buf_pool->LRU_flush_ended++; + } + +/* printf("n pending flush %lu\n", + buf_pool->n_flush[block->flush_type]); */ + + if ((buf_pool->n_flush[block->flush_type] == 0) + && (buf_pool->init_flush[block->flush_type] == FALSE)) { + + /* The running flush batch has ended */ + + os_event_set(buf_pool->no_flush[block->flush_type]); + } +} + +/************************************************************************ +Does an asynchronous write of a buffer page. NOTE: in simulated aio we must +call os_aio_simulated_wake_handler_threads after we have posted a batch +of writes! */ +static +void +buf_flush_write_block_low( +/*======================*/ + buf_block_t* block) /* in: buffer block to write */ +{ +#ifdef UNIV_IBUF_DEBUG + ut_a(ibuf_count_get(block->space, block->offset) == 0); +#endif + ut_ad(!ut_dulint_is_zero(block->newest_modification)); + +#ifdef UNIV_LOG_DEBUG + printf( + "Warning: cannot force log to disk in the log debug version!\n"); +#else + /* Force the log to the disk before writing the modified block */ + log_flush_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS); +#endif + /* Write the newest modification lsn to the page */ + mach_write_to_8(block->frame + FIL_PAGE_LSN, + block->newest_modification); + mach_write_to_8(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN, + block->newest_modification); + + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE, + (void*)block->frame, (void*)block); +} + +/************************************************************************ +Writes a page asynchronously from the buffer buf_pool to a file, if it can be +found in the buf_pool and it is in a flushable state. NOTE: in simulated aio +we must call os_aio_simulated_wake_handler_threads after we have posted a batch +of writes! 
*/ +static +ulint +buf_flush_try_page( +/*===============*/ + /* out: 1 if a page was flushed, 0 otherwise */ + ulint space, /* in: space id */ + ulint offset, /* in: page offset */ + ulint flush_type) /* in: BUF_FLUSH_LRU, BUF_FLUSH_LIST, or + BUF_FLUSH_SINGLE_PAGE */ +{ + buf_block_t* block; + ibool locked; + + ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST) + || (flush_type == BUF_FLUSH_SINGLE_PAGE)); + + mutex_enter(&(buf_pool->mutex)); + + block = buf_page_hash_get(space, offset); + + if ((flush_type == BUF_FLUSH_LIST) + && block && buf_flush_ready_for_flush(block, flush_type)) { + + block->io_fix = BUF_IO_WRITE; + block->flush_type = flush_type; + + if (buf_pool->n_flush[block->flush_type] == 0) { + + os_event_reset(buf_pool->no_flush[block->flush_type]); + } + + (buf_pool->n_flush[flush_type])++; + + locked = FALSE; + + /* If the simulated aio thread is not running, we must + not wait for any latch, as we may end up in a deadlock: + if buf_fix_count == 0, then we know we need not wait */ + + if (block->buf_fix_count == 0) { + rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE); + + locked = TRUE; + } + + mutex_exit(&(buf_pool->mutex)); + + if (!locked) { + os_aio_simulated_wake_handler_threads(); + + rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE); + } + + if (buf_debug_prints) { + printf("Flushing page space %lu, page no %lu \n", + block->space, block->offset); + } + + buf_flush_write_block_low(block); + + return(1); + + } else if ((flush_type == BUF_FLUSH_LRU) && block + && buf_flush_ready_for_flush(block, flush_type)) { + + /* VERY IMPORTANT: + Because any thread may call the LRU flush, even when owning + locks on pages, to avoid deadlocks, we must make sure that the + s-lock is acquired on the page without waiting: this is + accomplished because in the if-condition above we require + the page not to be bufferfixed (in function + ..._ready_for_flush). */ + + block->io_fix = BUF_IO_WRITE; + block->flush_type = flush_type; + + (buf_pool->n_flush[flush_type])++; + + rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE); + + /* Note that the s-latch is acquired before releasing the + buf_pool mutex: this ensures that the latch is acquired + immediately. */ + + mutex_exit(&(buf_pool->mutex)); + + buf_flush_write_block_low(block); + + return(1); + + } else if ((flush_type == BUF_FLUSH_SINGLE_PAGE) && block + && buf_flush_ready_for_flush(block, flush_type)) { + + block->io_fix = BUF_IO_WRITE; + block->flush_type = flush_type; + + if (buf_pool->n_flush[block->flush_type] == 0) { + + os_event_reset(buf_pool->no_flush[block->flush_type]); + } + + (buf_pool->n_flush[flush_type])++; + + mutex_exit(&(buf_pool->mutex)); + + rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE); + + if (buf_debug_prints) { + printf("Flushing single page space %lu, page no %lu \n", + block->space, block->offset); + } + + buf_flush_write_block_low(block); + + return(1); + } else { + mutex_exit(&(buf_pool->mutex)); + + return(0); + } +} + +/*************************************************************** +Flushes to disk all flushable pages within the flush area. 
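
For illustration, with BUF_FLUSH_AREA equal to 64 pages (an example
value; the define above computes the real one), a request concerning
page 100 considers the neighborhood

	low  = (100 / 64) * 64 =  64
	high = low + 64        = 128

so pages 64..127 of the same space are probed for flushable neighbors.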
*/ +static +ulint +buf_flush_try_neighbors( +/*====================*/ + /* out: number of pages flushed */ + ulint space, /* in: space id */ + ulint offset, /* in: page offset */ + ulint flush_type) /* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ +{ + buf_block_t* block; + ulint low, high; + ulint count = 0; + ulint i; + + ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); + + low = (offset / BUF_FLUSH_AREA) * BUF_FLUSH_AREA; + high = (offset / BUF_FLUSH_AREA + 1) * BUF_FLUSH_AREA; + + if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) { + /* If there is little space, it is better not to flush any + block except from the end of the LRU list */ + + low = offset; + high = offset + 1; + } + + /* printf("Flush area: low %lu high %lu\n", low, high); */ + + if (high > fil_space_get_size(space)) { + high = fil_space_get_size(space); + } + + mutex_enter(&(buf_pool->mutex)); + + for (i = low; i < high; i++) { + + block = buf_page_hash_get(space, i); + + if (block && buf_flush_ready_for_flush(block, flush_type)) { + + mutex_exit(&(buf_pool->mutex)); + + /* Note: as we release the buf_pool mutex above, in + buf_flush_try_page we cannot be sure the page is still + in a flushable state: therefore we check it again + inside that function. */ + + count += buf_flush_try_page(space, i, flush_type); + + mutex_enter(&(buf_pool->mutex)); + } + } + + mutex_exit(&(buf_pool->mutex)); + + /* In simulated aio we wake up the i/o-handler threads now that + we have posted a batch of writes: */ + + os_aio_simulated_wake_handler_threads(); + + return(count); +} + +/*********************************************************************** +This utility flushes dirty blocks from the end of the LRU list or flush_list. +NOTE 1: in the case of an LRU flush the calling thread may own latches to +pages: to avoid deadlocks, this function must be written so that it cannot +end up waiting for these latches! NOTE 2: in the case of a flush list flush, +the calling thread is not allowed to own any latches on pages! */ + +ulint +buf_flush_batch( +/*============*/ + /* out: number of blocks for which the write + request was queued; ULINT_UNDEFINED if there + was a flush of the same type already running */ + ulint flush_type, /* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST; if + BUF_FLUSH_LIST, then the caller must not own + any latches on pages */ + ulint min_n, /* in: wished minimum number of blocks flushed + (it is not guaranteed that the actual number + is that big, though) */ + dulint lsn_limit) /* in the case BUF_FLUSH_LIST all blocks whose + oldest_modification is smaller than this + should be flushed (if their number does not + exceed min_n), otherwise ignored */ +{ + buf_block_t* block; + ulint page_count = 0; + ulint old_page_count; + ulint space; + ulint offset; + ibool found; + + ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST)); + ut_ad((flush_type != BUF_FLUSH_LIST) || + sync_thread_levels_empty_gen(TRUE)); + + mutex_enter(&(buf_pool->mutex)); + + if ((buf_pool->n_flush[flush_type] > 0) + || (buf_pool->init_flush[flush_type] == TRUE)) { + + /* There is already a flush batch of the same type running */ + + mutex_exit(&(buf_pool->mutex)); + + return(ULINT_UNDEFINED); + } + + (buf_pool->init_flush)[flush_type] = TRUE; + + for (;;) { + /* If we have flushed enough, leave the loop */ + if (page_count >= min_n) { + + break; + } + + /* Start from the end of the list looking for a suitable + block to be flushed.
*/ + + if (flush_type == BUF_FLUSH_LRU) { + block = UT_LIST_GET_LAST(buf_pool->LRU); + } else { + ut_ad(flush_type == BUF_FLUSH_LIST); + + block = UT_LIST_GET_LAST(buf_pool->flush_list); + + if (!block + || (ut_dulint_cmp(block->oldest_modification, + lsn_limit) >= 0)) { + /* We have flushed enough */ + + break; + } + } + + found = FALSE; + + /* Note that after finding a single flushable page, we try to + flush also all its neighbors, and after that start from the + END of the LRU list or flush list again: the list may change + during the flushing and we cannot safely preserve within this + function a pointer to a block in the list! */ + + while ((block != NULL) && !found) { + + if (buf_flush_ready_for_flush(block, flush_type)) { + + found = TRUE; + space = block->space; + offset = block->offset; + + mutex_exit(&(buf_pool->mutex)); + + old_page_count = page_count; + + /* Try to flush also all the neighbors */ + page_count += + buf_flush_try_neighbors(space, offset, + flush_type); + + /* printf( + "Flush type %lu, page no %lu, neighb %lu\n", + flush_type, offset, + page_count - old_page_count); */ + + mutex_enter(&(buf_pool->mutex)); + + } else if (flush_type == BUF_FLUSH_LRU) { + + block = UT_LIST_GET_PREV(LRU, block); + + } else { + ut_ad(flush_type == BUF_FLUSH_LIST); + + block = UT_LIST_GET_PREV(flush_list, block); + } + } + + /* If we could not find anything to flush, leave the loop */ + + if (!found) { + break; + } + } + + (buf_pool->init_flush)[flush_type] = FALSE; + + if ((buf_pool->n_flush[flush_type] == 0) + && (buf_pool->init_flush[flush_type] == FALSE)) { + + /* The running flush batch has ended */ + + os_event_set(buf_pool->no_flush[flush_type]); + } + + mutex_exit(&(buf_pool->mutex)); + + if (buf_debug_prints && (page_count > 0)) { + if (flush_type == BUF_FLUSH_LRU) { + printf("To flush %lu pages in LRU flush\n", + page_count, flush_type); + } else if (flush_type == BUF_FLUSH_LIST) { + printf("To flush %lu pages in flush list flush\n", + page_count, flush_type); + } else { + ut_error; + } + } + + return(page_count); +} + +/********************************************************************** +Waits until a flush batch of the given type ends */ + +void +buf_flush_wait_batch_end( +/*=====================*/ + ulint type) /* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ +{ + ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST)); + + os_event_wait(buf_pool->no_flush[type]); +} + +/********************************************************************** +Gives a recommendation of how many blocks should be flushed to establish +a big enough margin of replaceable blocks near the end of the LRU list +and in the free list. 
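+As a numeric sketch (the margin constants are defined elsewhere; the values
+here are made up): if BUF_FLUSH_FREE_BLOCK_MARGIN were 100 and
+BUF_FLUSH_EXTRA_MARGIN 50, then finding 30 replaceable blocks would yield a
+recommendation of 100 + 50 - 30 = 120 blocks, while finding 100 or more
+would yield 0.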
*/ +static +ulint +buf_flush_LRU_recommendation(void) +/*==============================*/ + /* out: number of blocks which should be flushed + from the end of the LRU list */ +{ + buf_block_t* block; + ulint n_replaceable; + ulint distance = 0; + + mutex_enter(&(buf_pool->mutex)); + + n_replaceable = UT_LIST_GET_LEN(buf_pool->free); + + block = UT_LIST_GET_LAST(buf_pool->LRU); + + while ((block != NULL) + && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN + + BUF_FLUSH_EXTRA_MARGIN) + && (distance < BUF_LRU_FREE_SEARCH_LEN)) { + + if (buf_flush_ready_for_replace(block)) { + n_replaceable++; + } + + distance++; + + block = UT_LIST_GET_PREV(LRU, block); + } + + mutex_exit(&(buf_pool->mutex)); + + if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) { + + return(0); + } + + return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN + - n_replaceable); +} + +/************************************************************************* +Flushes pages from the end of the LRU list if there is too small a margin +of replaceable pages there or in the free list. VERY IMPORTANT: this function +is called also by threads which have locks on pages. To avoid deadlocks, we +flush only pages such that the s-lock required for flushing can be acquired +immediately, without waiting. */ + +void +buf_flush_free_margin(void) +/*=======================*/ +{ + ulint n_to_flush; + + n_to_flush = buf_flush_LRU_recommendation(); + + if (n_to_flush > 0) { + buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, ut_dulint_zero); + } +} + +/********************************************************************** +Validates the flush list. */ +static +ibool +buf_flush_validate_low(void) +/*========================*/ + /* out: TRUE if ok */ +{ + buf_block_t* block; + dulint om; + + UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list); + + block = UT_LIST_GET_FIRST(buf_pool->flush_list); + + while (block != NULL) { + om = block->oldest_modification; + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + ut_a(ut_dulint_cmp(om, ut_dulint_zero) > 0); + + block = UT_LIST_GET_NEXT(flush_list, block); + + if (block) { + ut_a(ut_dulint_cmp(om, block->oldest_modification) + >= 0); + } + } + + return(TRUE); +} + +/********************************************************************** +Validates the flush list. */ + +ibool +buf_flush_validate(void) +/*====================*/ + /* out: TRUE if ok */ +{ + ibool ret; + + mutex_enter(&(buf_pool->mutex)); + + ret = buf_flush_validate_low(); + + mutex_exit(&(buf_pool->mutex)); + + return(ret); +} diff --git a/innobase/buf/buf0lru.c b/innobase/buf/buf0lru.c new file mode 100644 index 00000000000..26bdd7db1fe --- /dev/null +++ b/innobase/buf/buf0lru.c @@ -0,0 +1,734 @@ +/****************************************************** +The database buffer replacement algorithm + +(c) 1995 Innobase Oy + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0lru.h" + +#ifdef UNIV_NONINL +#include "buf0lru.ic" +#endif + +#include "ut0byte.h" +#include "ut0lst.h" +#include "ut0rnd.h" +#include "sync0sync.h" +#include "sync0rw.h" +#include "hash0hash.h" +#include "os0sync.h" +#include "fil0fil.h" +#include "btr0btr.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0rea.h" +#include "btr0sea.h" +#include "os0file.h" + +/* The number of blocks from the LRU_old pointer onward, including the block +pointed to, must be 3/8 of the whole LRU list length, except that the +tolerance defined below is allowed. 
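+For example, with an LRU list of 800 blocks the old segment is kept at
+3 * 800 / 8 = 300 blocks, give or take the tolerance.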
Note that the tolerance must be small +enough such that for even the BUF_LRU_OLD_MIN_LEN long LRU list, the +LRU_old pointer is not allowed to point to either end of the LRU list. */ + +#define BUF_LRU_OLD_TOLERANCE 20 + +/* The whole LRU list length is divided by this number to determine an +initial segment in buf_LRU_get_recent_limit */ + +#define BUF_LRU_INITIAL_RATIO 8 + +/********************************************************************** +Takes a block out of the LRU list and page hash table and sets the block +state to BUF_BLOCK_REMOVE_HASH. */ +static +void +buf_LRU_block_remove_hashed_page( +/*=============================*/ + buf_block_t* block); /* in: block, must contain a file page and + be in a state where it can be freed; there + may or may not be a hash index to the page */ +/********************************************************************** +Puts a file page whose has no hash index to the free list. */ +static +void +buf_LRU_block_free_hashed_page( +/*===========================*/ + buf_block_t* block); /* in: block, must contain a file page and + be in a state where it can be freed */ + +/********************************************************************** +Gets the minimum LRU_position field for the blocks in an initial segment +(determined by BUF_LRU_INITIAL_RATIO) of the LRU list. The limit is not +guaranteed to be precise, because the ulint_clock may wrap around. */ + +ulint +buf_LRU_get_recent_limit(void) +/*==========================*/ + /* out: the limit; zero if could not determine it */ +{ + buf_block_t* block; + ulint len; + ulint limit; + + mutex_enter(&(buf_pool->mutex)); + + len = UT_LIST_GET_LEN(buf_pool->LRU); + + if (len < BUF_LRU_OLD_MIN_LEN) { + /* The LRU list is too short to do read-ahead */ + + mutex_exit(&(buf_pool->mutex)); + + return(0); + } + + block = UT_LIST_GET_FIRST(buf_pool->LRU); + + limit = block->LRU_position - len / BUF_LRU_INITIAL_RATIO; + + mutex_exit(&(buf_pool->mutex)); + + return(limit); +} + +/********************************************************************** +Look for a replaceable block from the end of the LRU list and put it to +the free list if found. */ + +ibool +buf_LRU_search_and_free_block( +/*==========================*/ + /* out: TRUE if freed */ + ulint n_iterations) /* in: how many times this has been called + repeatedly without result: a high value + means that we should search farther */ +{ + buf_block_t* block; + ulint distance; + ibool freed; + ulint i; + + mutex_enter(&(buf_pool->mutex)); + + freed = FALSE; + + distance = BUF_LRU_FREE_SEARCH_LEN * (1 + n_iterations / 5); + + i = 0; + block = UT_LIST_GET_LAST(buf_pool->LRU); + + while (i < distance && block != NULL) { + + if (buf_flush_ready_for_replace(block)) { + + if (buf_debug_prints) { + printf( + "Putting space %lu page %lu to free list\n", + block->space, block->offset); + } + + buf_LRU_block_remove_hashed_page(block); + + mutex_exit(&(buf_pool->mutex)); + + btr_search_drop_page_hash_index(block->frame); + + mutex_enter(&(buf_pool->mutex)); + + buf_LRU_block_free_hashed_page(block); + + freed = TRUE; + + break; + } + + block = UT_LIST_GET_PREV(LRU, block); + } + + if (buf_pool->LRU_flush_ended > 0) { + buf_pool->LRU_flush_ended--; + } + + if (!freed) { + buf_pool->LRU_flush_ended = 0; + } + + mutex_exit(&(buf_pool->mutex)); + + return(freed); +} + +/********************************************************************** +Tries to remove LRU flushed blocks from the end of the LRU list and put them +to the free list. 
This is beneficial for the efficiency of the insert buffer +operation, as flushed pages from non-unique non-clustered indexes are here +taken out of the buffer pool, and their inserts redirected to the insert +buffer. Otherwise, the flushed blocks could get modified again before read +operations need new buffer blocks, and the i/o work done in flushing would be +wasted. */ + +void +buf_LRU_try_free_flushed_blocks(void) +/*=================================*/ +{ + mutex_enter(&(buf_pool->mutex)); + + while (buf_pool->LRU_flush_ended > 0) { + + mutex_exit(&(buf_pool->mutex)); + + buf_LRU_search_and_free_block(0); + + mutex_enter(&(buf_pool->mutex)); + } + + mutex_exit(&(buf_pool->mutex)); +} + +/********************************************************************** +Returns a free block from buf_pool. The block is taken off the free list. +If it is empty, blocks are moved from the end of the LRU list to the free +list. */ + +buf_block_t* +buf_LRU_get_free_block(void) +/*========================*/ + /* out: the free control block */ +{ + buf_block_t* block = NULL; + ibool freed; + ulint n_iterations = 0; +loop: + mutex_enter(&(buf_pool->mutex)); + + if (buf_pool->LRU_flush_ended > 0) { + mutex_exit(&(buf_pool->mutex)); + + buf_LRU_try_free_flushed_blocks(); + + mutex_enter(&(buf_pool->mutex)); + } + + /* If there is a block in the free list, take it */ + if (UT_LIST_GET_LEN(buf_pool->free) > 0) { + + block = UT_LIST_GET_FIRST(buf_pool->free); + UT_LIST_REMOVE(free, buf_pool->free, block); + block->state = BUF_BLOCK_READY_FOR_USE; + + mutex_exit(&(buf_pool->mutex)); + + return(block); + } + + /* If no block was in the free list, search from the end of the LRU + list and try to free a block there */ + + mutex_exit(&(buf_pool->mutex)); + + freed = buf_LRU_search_and_free_block(n_iterations); + + if (freed > 0) { + goto loop; + } + + /* No free block was found near the end of the list: try to flush + the LRU list */ + + buf_flush_free_margin(); + + os_event_wait(buf_pool->no_flush[BUF_FLUSH_LRU]); + + n_iterations++; + + os_aio_simulated_wake_handler_threads(); + + if (n_iterations > 10) { + + os_thread_sleep(500000); + } + + if (n_iterations > 20) { +/* buf_print(); + os_aio_print(); + rw_lock_list_print_info(); +*/ + if (n_iterations > 30) { + fprintf(stderr, + "Innobase: Warning: difficult to find free blocks from\n" + "Innobase: the buffer pool! Consider increasing the\n" + "Innobase: buffer pool size.\n"); + } + } + + goto loop; +} + +/*********************************************************************** +Moves the LRU_old pointer so that the length of the old blocks list +is inside the allowed limits. 
*/ +UNIV_INLINE +void +buf_LRU_old_adjust_len(void) +/*========================*/ +{ + ulint old_len; + ulint new_len; + + ut_ad(buf_pool->LRU_old); + ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(3 * (BUF_LRU_OLD_MIN_LEN / 8) > BUF_LRU_OLD_TOLERANCE + 5); + + for (;;) { + old_len = buf_pool->LRU_old_len; + new_len = 3 * (UT_LIST_GET_LEN(buf_pool->LRU) / 8); + + /* Update the LRU_old pointer if necessary */ + + if (old_len < new_len - BUF_LRU_OLD_TOLERANCE) { + + buf_pool->LRU_old = UT_LIST_GET_PREV(LRU, + buf_pool->LRU_old); + (buf_pool->LRU_old)->old = TRUE; + buf_pool->LRU_old_len++; + + } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) { + + (buf_pool->LRU_old)->old = FALSE; + buf_pool->LRU_old = UT_LIST_GET_NEXT(LRU, + buf_pool->LRU_old); + buf_pool->LRU_old_len--; + } else { + ut_ad(buf_pool->LRU_old); /* Check that we did not + fall out of the LRU list */ + return; + } + } +} + +/*********************************************************************** +Initializes the old blocks pointer in the LRU list. +This function should be called when the LRU list grows to +BUF_LRU_OLD_MIN_LEN length. */ +static +void +buf_LRU_old_init(void) +/*==================*/ +{ + buf_block_t* block; + + ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN); + + /* We first initialize all blocks in the LRU list as old and then use + the adjust function to move the LRU_old pointer to the right + position */ + + block = UT_LIST_GET_FIRST(buf_pool->LRU); + + while (block != NULL) { + block->old = TRUE; + block = UT_LIST_GET_NEXT(LRU, block); + } + + buf_pool->LRU_old = UT_LIST_GET_FIRST(buf_pool->LRU); + buf_pool->LRU_old_len = UT_LIST_GET_LEN(buf_pool->LRU); + + buf_LRU_old_adjust_len(); +} + +/********************************************************************** +Removes a block from the LRU list. */ +UNIV_INLINE +void +buf_LRU_remove_block( +/*=================*/ + buf_block_t* block) /* in: control block */ +{ + ut_ad(buf_pool); + ut_ad(block); + ut_ad(mutex_own(&(buf_pool->mutex))); + + /* If the LRU_old pointer is defined and points to just this block, + move it backward one step */ + + if (block == buf_pool->LRU_old) { + + /* Below: the previous block is guaranteed to exist, because + the LRU_old pointer is only allowed to differ by the + tolerance value from strict 3/8 of the LRU list length. */ + + buf_pool->LRU_old = UT_LIST_GET_PREV(LRU, block); + (buf_pool->LRU_old)->old = TRUE; + + buf_pool->LRU_old_len++; + ut_ad(buf_pool->LRU_old); + } + + /* Remove the block from the LRU list */ + UT_LIST_REMOVE(LRU, buf_pool->LRU, block); + + /* If the LRU list is so short that LRU_old not defined, return */ + if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) { + + buf_pool->LRU_old = NULL; + + return; + } + + ut_ad(buf_pool->LRU_old); + + /* Update the LRU_old_len field if necessary */ + if (block->old) { + + buf_pool->LRU_old_len--; + } + + /* Adjust the length of the old block list if necessary */ + buf_LRU_old_adjust_len(); +} + +/********************************************************************** +Adds a block to the LRU list end. 
*/ +UNIV_INLINE +void +buf_LRU_add_block_to_end_low( +/*=========================*/ + buf_block_t* block) /* in: control block */ +{ + buf_block_t* last_block; + + ut_ad(buf_pool); + ut_ad(block); + ut_ad(mutex_own(&(buf_pool->mutex))); + + block->old = TRUE; + + last_block = UT_LIST_GET_LAST(buf_pool->LRU); + + if (last_block) { + block->LRU_position = last_block->LRU_position; + } else { + block->LRU_position = buf_pool_clock_tic(); + } + + UT_LIST_ADD_LAST(LRU, buf_pool->LRU, block); + + if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { + + buf_pool->LRU_old_len++; + } + + if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) { + + ut_ad(buf_pool->LRU_old); + + /* Adjust the length of the old block list if necessary */ + + buf_LRU_old_adjust_len(); + + } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) { + + /* The LRU list is now long enough for LRU_old to become + defined: init it */ + + buf_LRU_old_init(); + } +} + +/********************************************************************** +Adds a block to the LRU list. */ +UNIV_INLINE +void +buf_LRU_add_block_low( +/*==================*/ + buf_block_t* block, /* in: control block */ + ibool old) /* in: TRUE if should be put to the old blocks + in the LRU list, else put to the start; if the + LRU list is very short, the block is added to + the start, regardless of this parameter */ +{ + ulint cl; + + ut_ad(buf_pool); + ut_ad(block); + ut_ad(mutex_own(&(buf_pool->mutex))); + + block->old = old; + cl = buf_pool_clock_tic(); + + if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) { + + UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, block); + + block->LRU_position = cl; + block->freed_page_clock = buf_pool->freed_page_clock; + } else { + UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, buf_pool->LRU_old, + block); + buf_pool->LRU_old_len++; + + /* We copy the LRU position field of the previous block + to the new block */ + + block->LRU_position = (buf_pool->LRU_old)->LRU_position; + } + + if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) { + + ut_ad(buf_pool->LRU_old); + + /* Adjust the length of the old block list if necessary */ + + buf_LRU_old_adjust_len(); + + } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) { + + /* The LRU list is now long enough for LRU_old to become + defined: init it */ + + buf_LRU_old_init(); + } +} + +/********************************************************************** +Adds a block to the LRU list. */ + +void +buf_LRU_add_block( +/*==============*/ + buf_block_t* block, /* in: control block */ + ibool old) /* in: TRUE if should be put to the old + blocks in the LRU list, else put to the start; + if the LRU list is very short, the block is + added to the start, regardless of this + parameter */ +{ + buf_LRU_add_block_low(block, old); +} + +/********************************************************************** +Moves a block to the start of the LRU list. */ + +void +buf_LRU_make_block_young( +/*=====================*/ + buf_block_t* block) /* in: control block */ +{ + buf_LRU_remove_block(block); + buf_LRU_add_block_low(block, FALSE); +} + +/********************************************************************** +Moves a block to the end of the LRU list. */ + +void +buf_LRU_make_block_old( +/*===================*/ + buf_block_t* block) /* in: control block */ +{ + buf_LRU_remove_block(block); + buf_LRU_add_block_to_end_low(block); +} + +/********************************************************************** +Puts a block back to the free list. 
*/ + +void +buf_LRU_block_free_non_file_page( +/*=============================*/ + buf_block_t* block) /* in: block, must not contain a file page */ +{ + ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(block); + + ut_ad((block->state == BUF_BLOCK_MEMORY) + || (block->state == BUF_BLOCK_READY_FOR_USE)); + + block->state = BUF_BLOCK_NOT_USED; + + UT_LIST_ADD_FIRST(free, buf_pool->free, block); +} + +/********************************************************************** +Takes a block out of the LRU list and page hash table and sets the block +state to BUF_BLOCK_REMOVE_HASH. */ +static +void +buf_LRU_block_remove_hashed_page( +/*=============================*/ + buf_block_t* block) /* in: block, must contain a file page and + be in a state where it can be freed; there + may or may not be a hash index to the page */ +{ + ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(block); + + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + + ut_a(block->io_fix == 0); + ut_a(block->buf_fix_count == 0); + ut_a(ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) == 0); + + buf_LRU_remove_block(block); + + buf_pool->freed_page_clock += 1; + + buf_frame_modify_clock_inc(block->frame); + + HASH_DELETE(buf_block_t, hash, buf_pool->page_hash, + buf_page_address_fold(block->space, block->offset), + block); + + block->state = BUF_BLOCK_REMOVE_HASH; +} + +/********************************************************************** +Puts a file page whose has no hash index to the free list. */ +static +void +buf_LRU_block_free_hashed_page( +/*===========================*/ + buf_block_t* block) /* in: block, must contain a file page and + be in a state where it can be freed */ +{ + ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(block->state == BUF_BLOCK_REMOVE_HASH); + + block->state = BUF_BLOCK_MEMORY; + + buf_LRU_block_free_non_file_page(block); +} + +/************************************************************************** +Validates the LRU list. */ + +ibool +buf_LRU_validate(void) +/*==================*/ +{ + buf_block_t* block; + ulint old_len; + ulint new_len; + ulint LRU_pos; + + ut_ad(buf_pool); + mutex_enter(&(buf_pool->mutex)); + + if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { + + ut_a(buf_pool->LRU_old); + old_len = buf_pool->LRU_old_len; + new_len = 3 * (UT_LIST_GET_LEN(buf_pool->LRU) / 8); + ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE); + ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE); + } + + UT_LIST_VALIDATE(LRU, buf_block_t, buf_pool->LRU); + + block = UT_LIST_GET_FIRST(buf_pool->LRU); + + old_len = 0; + + while (block != NULL) { + + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + + if (block->old) { + old_len++; + } + + if (buf_pool->LRU_old && (old_len == 1)) { + ut_a(buf_pool->LRU_old == block); + } + + LRU_pos = block->LRU_position; + + block = UT_LIST_GET_NEXT(LRU, block); + + if (block) { + /* If the following assert fails, it may + not be an error: just the buf_pool clock + has wrapped around */ + ut_a(LRU_pos >= block->LRU_position); + } + } + + if (buf_pool->LRU_old) { + ut_a(buf_pool->LRU_old_len == old_len); + } + + UT_LIST_VALIDATE(free, buf_block_t, buf_pool->free); + + block = UT_LIST_GET_FIRST(buf_pool->free); + + while (block != NULL) { + ut_a(block->state == BUF_BLOCK_NOT_USED); + + block = UT_LIST_GET_NEXT(free, block); + } + + mutex_exit(&(buf_pool->mutex)); + return(TRUE); +} + +/************************************************************************** +Prints the LRU list. 
*/ + +void +buf_LRU_print(void) +/*===============*/ +{ + buf_block_t* block; + buf_frame_t* frame; + ulint len; + + ut_ad(buf_pool); + mutex_enter(&(buf_pool->mutex)); + + printf("Pool ulint clock %lu\n", buf_pool->ulint_clock); + + block = UT_LIST_GET_FIRST(buf_pool->LRU); + + len = 0; + + while (block != NULL) { + + printf("BLOCK %lu ", block->offset); + + if (block->old) { + printf("old "); + } + + if (block->buf_fix_count) { + printf("buffix count %lu ", block->buf_fix_count); + } + + if (block->io_fix) { + printf("io_fix %lu ", block->io_fix); + } + + if (ut_dulint_cmp(block->oldest_modification, + ut_dulint_zero) > 0) { + printf("modif. "); + } + + printf("LRU pos %lu ", block->LRU_position); + + frame = buf_block_get_frame(block); + + printf("type %lu ", fil_page_get_type(frame)); + printf("index id %lu ", ut_dulint_get_low( + btr_page_get_index_id(frame))); + + block = UT_LIST_GET_NEXT(LRU, block); + len++; + if (len % 10 == 0) { + printf("\n"); + } + } + + mutex_exit(&(buf_pool->mutex)); +} diff --git a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c new file mode 100644 index 00000000000..13e9ed0476b --- /dev/null +++ b/innobase/buf/buf0rea.c @@ -0,0 +1,559 @@ +/****************************************************** +The database buffer read + +(c) 1995 Innobase Oy + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0rea.h" + +#include "fil0fil.h" +#include "mtr0mtr.h" + +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0lru.h" +#include "ibuf0ibuf.h" +#include "log0recv.h" +#include "trx0sys.h" +#include "os0file.h" + +/* The size in blocks of the area where the random read-ahead algorithm counts +the accessed pages when deciding whether to read-ahead */ +#define BUF_READ_AHEAD_RANDOM_AREA BUF_READ_AHEAD_AREA + +/* There must be at least this many pages in buf_pool in the area to start +a random read-ahead */ +#define BUF_READ_AHEAD_RANDOM_THRESHOLD (5 + BUF_READ_AHEAD_RANDOM_AREA / 8) + +/* The linear read-ahead area size */ +#define BUF_READ_AHEAD_LINEAR_AREA BUF_READ_AHEAD_AREA + +/* The linear read-ahead threshold */ +#define BUF_READ_AHEAD_LINEAR_THRESHOLD (3 * BUF_READ_AHEAD_LINEAR_AREA / 8) + +/* If there are buf_pool->curr_size per the number below pending reads, then +read-ahead is not done: this is to prevent flooding the buffer pool with +i/o-fixed buffer blocks */ +#define BUF_READ_AHEAD_PEND_LIMIT 2 + +/************************************************************************ +Low-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there, in which case does nothing. +Sets the io_fix flag and sets an exclusive lock on the buffer frame. The +flag is cleared and the x-lock released by an i/o-handler thread. 
*/ +static +ulint +buf_read_page_low( +/*==============*/ + /* out: 1 if a read request was queued, 0 if the page + already resided in buf_pool */ + ibool sync, /* in: TRUE if synchronous aio is desired */ + ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ..., + ORed to OS_AIO_SIMULATED_WAKE_LATER (see below + at read-ahead functions) */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + ulint wake_later; + + wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; + mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER; + +#ifdef UNIV_LOG_DEBUG + if (space % 2 == 1) { + /* We are updating a replicate space while holding the + log mutex: the read must be handled before other reads + which might incur ibuf operations and thus write to the log */ + + printf("Log debug: reading replicate page in sync mode\n"); + + sync = TRUE; + } +#endif + if (trx_sys_hdr_page(space, offset)) { + + /* Trx sys header is so low in the latching order that we play + safe and do not leave the i/o-completion to an asynchronous + i/o-thread: */ + + sync = TRUE; + } + + block = buf_page_init_for_read(mode, space, offset); + + if (block != NULL) { + fil_io(OS_FILE_READ | wake_later, + sync, space, offset, 0, UNIV_PAGE_SIZE, + (void*)block->frame, (void*)block); + if (sync) { + /* The i/o is already completed when we arrive from + fil_read */ + buf_page_io_complete(block); + } + + return(1); + } + + return(0); +} + +/************************************************************************ +Applies a random read-ahead in buf_pool if there are at least a threshold +value of accessed pages from the random read-ahead area. Does not read any +page, not even the one at the position (space, offset), if the read-ahead +mechanism is not activated. NOTE 1: the calling thread may own latches on +pages: to avoid deadlocks this function must be written such that it cannot +end up waiting for these latches! NOTE 2: the calling thread must want +access to the page given: this rule is set to prevent unintended read-aheads +performed by ibuf routines, a situation which could result in a deadlock if +the OS does not support asynchronous i/o. */ +static +ulint +buf_read_ahead_random( +/*==================*/ + /* out: number of page read requests issued; NOTE + that if we read ibuf pages, it may happen that + the page at the given page number does not get + read even if we return a value > 0! */ + ulint space, /* in: space id */ + ulint offset) /* in: page number of a page which the current thread + wants to access */ +{ + buf_block_t* block; + ulint recent_blocks = 0; + ulint count; + ulint LRU_recent_limit; + ulint ibuf_mode; + ulint low, high; + ulint i; + + if (ibuf_bitmap_page(offset)) { + + /* If it is an ibuf bitmap page, we do no read-ahead, as + that could break the ibuf page access order */ + + return(0); + } + + low = (offset / BUF_READ_AHEAD_RANDOM_AREA) + * BUF_READ_AHEAD_RANDOM_AREA; + high = (offset / BUF_READ_AHEAD_RANDOM_AREA + 1) + * BUF_READ_AHEAD_RANDOM_AREA; + + if (high > fil_space_get_size(space)) { + + high = fil_space_get_size(space); + } + + /* Get the minimum LRU_position field value for an initial segment + of the LRU list, to determine which blocks have recently been added + to the start of the list. 
*/ + + LRU_recent_limit = buf_LRU_get_recent_limit(); + + mutex_enter(&(buf_pool->mutex)); + + if (buf_pool->n_pend_reads > + buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { + mutex_exit(&(buf_pool->mutex)); + + return(0); + } + + /* Count how many blocks in the area have been recently accessed, + that is, reside near the start of the LRU list. */ + + for (i = low; i < high; i++) { + + block = buf_page_hash_get(space, i); + + if ((block) + && (block->LRU_position > LRU_recent_limit) + && block->accessed) { + + recent_blocks++; + } + } + + mutex_exit(&(buf_pool->mutex)); + + if (recent_blocks < BUF_READ_AHEAD_RANDOM_THRESHOLD) { + /* Do nothing */ + + return(0); + } + + /* Read all the suitable blocks within the area */ + + if (ibuf_inside()) { + ibuf_mode = BUF_READ_IBUF_PAGES_ONLY; + } else { + ibuf_mode = BUF_READ_ANY_PAGE; + } + + count = 0; + + for (i = low; i < high; i++) { + /* It is only sensible to do read-ahead in the non-sync aio + mode: hence FALSE as the first parameter */ + + if (!ibuf_bitmap_page(i)) { + + count += buf_read_page_low(FALSE, ibuf_mode + | OS_AIO_SIMULATED_WAKE_LATER, + space, i); + } + } + + /* In simulated aio we wake the aio handler threads only after + queuing all aio requests, in native aio the following call does + nothing: */ + + os_aio_simulated_wake_handler_threads(); + + if (buf_debug_prints && (count > 0)) { + + printf("Random read-ahead space %lu offset %lu pages %lu\n", + space, offset, count); + } + + return(count); +} + +/************************************************************************ +High-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there. Sets the io_fix flag and sets +an exclusive lock on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. Does a random read-ahead if it seems +sensible. */ + +ulint +buf_read_page( +/*==========*/ + /* out: number of page read requests issued: this can + be > 1 if read-ahead occurred */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + ulint count; + ulint count2; + + count = buf_read_ahead_random(space, offset); + + /* We do the i/o in the synchronous aio mode to save thread + switches: hence TRUE */ + + count2 = buf_read_page_low(TRUE, BUF_READ_ANY_PAGE, space, offset); + + /* Flush pages from the end of the LRU list if necessary */ + buf_flush_free_margin(); + + return(count + count2); +} + +/************************************************************************ +Applies linear read-ahead if in the buf_pool the page is a border page of +a linear read-ahead area and all the pages in the area have been accessed. +Does not read any page if the read-ahead mechanism is not activated. Note +that the the algorithm looks at the 'natural' adjacent successor and +predecessor of the page, which on the leaf level of a B-tree are the next +and previous page in the chain of leaves. To know these, the page specified +in (space, offset) must already be present in the buf_pool. Thus, the +natural way to use this function is to call it when a page in the buf_pool +is accessed the first time, calling this function just after it has been +bufferfixed. +NOTE 1: as this function looks at the natural predecessor and successor +fields on the page, what happens, if these are not initialized to any +sensible value? No problem, before applying read-ahead we check that the +area to read is within the span of the space, if not, read-ahead is not +applied. 
An uninitialized value may result in a useless read operation, but +only very improbably. +NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this +function must be written such that it cannot end up waiting for these +latches! +NOTE 3: the calling thread must want access to the page given: this rule is +set to prevent unintended read-aheads performed by ibuf routines, a situation +which could result in a deadlock if the OS does not support asynchronous io. */ + +ulint +buf_read_ahead_linear( +/*==================*/ + /* out: number of page read requests issued */ + ulint space, /* in: space id */ + ulint offset) /* in: page number of a page; NOTE: the current thread + must want access to this page (see NOTE 3 above) */ +{ + buf_block_t* block; + buf_frame_t* frame; + buf_block_t* pred_block = NULL; + ulint pred_offset; + ulint succ_offset; + ulint count; + int asc_or_desc; + ulint new_offset; + ulint fail_count; + ulint ibuf_mode; + ulint low, high; + ulint i; + + if (ibuf_bitmap_page(offset)) { + /* If it is an ibuf bitmap page, we do no read-ahead, as + that could break the ibuf page access order */ + + return(0); + } + + low = (offset / BUF_READ_AHEAD_LINEAR_AREA) + * BUF_READ_AHEAD_LINEAR_AREA; + high = (offset / BUF_READ_AHEAD_LINEAR_AREA + 1) + * BUF_READ_AHEAD_LINEAR_AREA; + + if ((offset != low) && (offset != high - 1)) { + /* This is not a border page of the area: return */ + + return(0); + } + + if (high > fil_space_get_size(space)) { + /* The area is not whole, return */ + + return(0); + } + + mutex_enter(&(buf_pool->mutex)); + + if (buf_pool->n_pend_reads > + buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { + mutex_exit(&(buf_pool->mutex)); + + return(0); + } + + /* Check that almost all pages in the area have been accessed; if + offset == low, the accesses must be in a descending order, otherwise, + in an ascending order. */ + + asc_or_desc = 1; + + if (offset == low) { + asc_or_desc = -1; + } + + fail_count = 0; + + for (i = low; i < high; i++) { + + block = buf_page_hash_get(space, i); + + if ((block == NULL) || !block->accessed) { + + /* Not accessed */ + fail_count++; + + } else if (pred_block && (ut_ulint_cmp(block->LRU_position, + pred_block->LRU_position) + != asc_or_desc)) { + + /* Accesses not in the right order */ + + fail_count++; + pred_block = block; + } + } + + if (fail_count > BUF_READ_AHEAD_LINEAR_AREA - + BUF_READ_AHEAD_LINEAR_THRESHOLD) { + /* Too many failures: return */ + + mutex_exit(&(buf_pool->mutex)); + + return(0); + } + + /* If we got this far, we know that enough pages in the area have + been accessed in the right order: linear read-ahead can be sensible */ + + block = buf_page_hash_get(space, offset); + + if (block == NULL) { + mutex_exit(&(buf_pool->mutex)); + + return(0); + } + + frame = block->frame; + + /* Read the natural predecessor and successor page addresses from + the page; NOTE that because the calling thread may have an x-latch + on the page, we do not acquire an s-latch on the page, this is to + prevent deadlocks. Even if we read values which are nonsense, the + algorithm will work. 
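+As a sketch of the decision below: when offset == low and the page's
+successor link equals offset + 1, the pages are being scanned in
+descending order, and the area around the predecessor is read ahead; the
+ascending case is symmetric.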
*/ + + pred_offset = fil_page_get_prev(frame); + succ_offset = fil_page_get_next(frame); + + mutex_exit(&(buf_pool->mutex)); + + if ((offset == low) && (succ_offset == offset + 1)) { + + /* This is ok, we can continue */ + new_offset = pred_offset; + + } else if ((offset == high - 1) && (pred_offset == offset - 1)) { + + /* This is ok, we can continue */ + new_offset = succ_offset; + } else { + /* Successor or predecessor not in the right order */ + + return(0); + } + + low = (new_offset / BUF_READ_AHEAD_LINEAR_AREA) + * BUF_READ_AHEAD_LINEAR_AREA; + high = (new_offset / BUF_READ_AHEAD_LINEAR_AREA + 1) + * BUF_READ_AHEAD_LINEAR_AREA; + + if ((new_offset != low) && (new_offset != high - 1)) { + /* This is not a border page of the area: return */ + + return(0); + } + + if (high > fil_space_get_size(space)) { + /* The area is not whole, return */ + + return(0); + } + + /* If we got this far, read-ahead can be sensible: do it */ + + if (ibuf_inside()) { + ibuf_mode = BUF_READ_IBUF_PAGES_ONLY; + } else { + ibuf_mode = BUF_READ_ANY_PAGE; + } + + count = 0; + + for (i = low; i < high; i++) { + /* It is only sensible to do read-ahead in the non-sync + aio mode: hence FALSE as the first parameter */ + + if (!ibuf_bitmap_page(i)) { + count += buf_read_page_low(FALSE, ibuf_mode + | OS_AIO_SIMULATED_WAKE_LATER, + space, i); + } + } + + /* In simulated aio we wake the aio handler threads only after + queuing all aio requests, in native aio the following call does + nothing: */ + + os_aio_simulated_wake_handler_threads(); + + /* Flush pages from the end of the LRU list if necessary */ + buf_flush_free_margin(); + + if (buf_debug_prints && (count > 0)) { + printf( + "LINEAR read-ahead space %lu offset %lu pages %lu\n", + space, offset, count); + } + + return(count); +} + +/************************************************************************ +Issues read requests for pages which the ibuf module wants to read in, in +order to contract insert buffer trees. Technically, this function is like +a read-ahead function. */ + +void +buf_read_ibuf_merge_pages( +/*======================*/ + ibool sync, /* in: TRUE if the caller wants this function + to wait for the highest address page to get + read in, before this function returns */ + ulint space, /* in: space id */ + ulint* page_nos, /* in: array of page numbers to read, with the + highest page number the last in the array */ + ulint n_stored) /* in: number of page numbers in the array */ +{ + ulint i; + + ut_ad(!ibuf_inside()); +#ifdef UNIV_IBUF_DEBUG + ut_a(n_stored < UNIV_PAGE_SIZE); +#endif + while (buf_pool->n_pend_reads > + buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { + os_thread_sleep(500000); + } + + for (i = 0; i < n_stored; i++) { + if ((i + 1 == n_stored) && sync) { + buf_read_page_low(TRUE, BUF_READ_ANY_PAGE, space, + page_nos[i]); + } else { + buf_read_page_low(FALSE, BUF_READ_ANY_PAGE, space, + page_nos[i]); + } + } + + /* Flush pages from the end of the LRU list if necessary */ + buf_flush_free_margin(); + + if (buf_debug_prints) { + printf("Ibuf merge read-ahead space %lu pages %lu\n", + space, n_stored); + } +} + +/************************************************************************ +Issues read requests for pages which recovery wants to read in. 
*/ + +void +buf_read_recv_pages( +/*================*/ + ibool sync, /* in: TRUE if the caller wants this function + to wait for the highest address page to get + read in, before this function returns */ + ulint space, /* in: space id */ + ulint* page_nos, /* in: array of page numbers to read, with the + highest page number the last in the array */ + ulint n_stored) /* in: number of page numbers in the array */ +{ + ulint i; + + for (i = 0; i < n_stored; i++) { + + while (buf_pool->n_pend_reads >= RECV_POOL_N_FREE_BLOCKS / 2) { + + os_aio_simulated_wake_handler_threads(); + os_thread_sleep(500000); + } + + if ((i + 1 == n_stored) && sync) { + buf_read_page_low(TRUE, BUF_READ_ANY_PAGE, space, + page_nos[i]); + } else { + buf_read_page_low(FALSE, BUF_READ_ANY_PAGE + | OS_AIO_SIMULATED_WAKE_LATER, + space, page_nos[i]); + } + } + + os_aio_simulated_wake_handler_threads(); + + /* Flush pages from the end of the LRU list if necessary */ + buf_flush_free_margin(); + + if (buf_debug_prints) { + printf("Recovery applies read-ahead pages %lu\n", n_stored); + } +} diff --git a/innobase/buf/makefilewin b/innobase/buf/makefilewin new file mode 100644 index 00000000000..ce62cb95958 --- /dev/null +++ b/innobase/buf/makefilewin @@ -0,0 +1,20 @@ +include ..\include\makefile.i + +buf.lib: buf0buf.obj buf0lru.obj buf0flu.obj buf0rea.obj + lib -out:..\libs\buf.lib buf0buf.obj buf0lru.obj buf0flu.obj buf0rea.obj + +buf0buf.obj: buf0buf.c + $(CCOM) $(CFL) -c buf0buf.c + +buf0lru.obj: buf0lru.c + $(CCOM) $(CFL) -c buf0lru.c + +buf0flu.obj: buf0flu.c + $(CCOM) $(CFL) -c buf0flu.c + +buf0rea.obj: buf0rea.c + $(CCOM) $(CFL) -c buf0rea.c + + + + diff --git a/innobase/buf/ts/makefile b/innobase/buf/ts/makefile new file mode 100644 index 00000000000..5886d06d7fa --- /dev/null +++ b/innobase/buf/ts/makefile @@ -0,0 +1,20 @@ + + + +include ..\..\makefile.i + +doall: tsbuf + +tsbuf: ..\buf.lib tsbuf.c + $(CCOM) $(CFL) -I.. -I..\.. ..\buf.lib ..\..\btr.lib ..\..\trx.lib ..\..\pars.lib ..\..\que.lib ..\..\lock.lib ..\..\row.lib ..\..\read.lib ..\..\srv.lib ..\..\com.lib ..\..\usr.lib ..\..\thr.lib ..\..\fut.lib ..\..\fsp.lib ..\..\page.lib ..\..\dyn.lib ..\..\mtr.lib ..\..\log.lib ..\..\rem.lib ..\..\fil.lib ..\..\dict.lib ..\..\data.lib ..\..\mach.lib ..\..\ha.lib ..\..\ut.lib ..\..\sync.lib ..\..\mem.lib ..\..\os.lib tsbuf.c $(LFL) + +tsos: ..\buf.lib tsos.c + $(CCOM) $(CFL) -I.. -I..\.. 
..\buf.lib ..\..\mach.lib ..\..\fil.lib ..\..\ha.lib ..\..\ut.lib ..\..\sync.lib ..\..\mem.lib ..\..\os.lib tsos.c $(LFL) + + + + + + + + diff --git a/innobase/buf/ts/tsbuf.c b/innobase/buf/ts/tsbuf.c new file mode 100644 index 00000000000..fd6ae69653f --- /dev/null +++ b/innobase/buf/ts/tsbuf.c @@ -0,0 +1,885 @@ +/************************************************************************ +The test module for the file system and buffer manager + +(c) 1995 Innobase Oy + +Created 11/16/1995 Heikki Tuuri +*************************************************************************/ + +#include "string.h" + +#include "os0thread.h" +#include "os0file.h" +#include "ut0ut.h" +#include "ut0byte.h" +#include "sync0sync.h" +#include "mem0mem.h" +#include "fil0fil.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "log0log.h" +#include "mach0data.h" +#include "..\buf0buf.h" +#include "..\buf0flu.h" +#include "..\buf0lru.h" + +os_file_t files[1000]; + +mutex_t ios_mutex; +ulint ios; +ulint n[10]; + +mutex_t incs_mutex; +ulint incs; + +#define N_SPACES 1 +#define N_FILES 1 +#define FILE_SIZE 4000 +#define POOL_SIZE 1000 +#define COUNTER_OFFSET 1500 + +#define LOOP_SIZE 150 +#define N_THREADS 5 + + +ulint zero = 0; + +buf_frame_t* bl_arr[POOL_SIZE]; + +/************************************************************************ +Io-handler thread function. */ + +ulint +handler_thread( +/*===========*/ + void* arg) +{ + ulint segment; + void* mess; + ulint i; + bool ret; + + segment = *((ulint*)arg); + + printf("Io handler thread %lu starts\n", segment); + + for (i = 0;; i++) { + ret = fil_aio_wait(segment, &mess); + ut_a(ret); + + buf_page_io_complete((buf_block_t*)mess); + + mutex_enter(&ios_mutex); + ios++; + mutex_exit(&ios_mutex); + + } + + return(0); +} + +/************************************************************************* +This thread reports the status of sync system. */ + +ulint +info_thread( +/*========*/ + void* arg) +{ + ulint segment; + + segment = *((ulint*)arg); + + for (;;) { + sync_print(); + os_aio_print(); + printf("Debug stop threads == %lu\n", ut_dbg_stop_threads); + os_thread_sleep(30000000); + } + + return(0); +} + +/************************************************************************* +Creates the files for the file system test and inserts them to +the file system. 
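+A caveat for readers: strcpy below leaves name holding the 8-character
+string "tsfile00", so the stores to name[9] and name[10] land beyond the
+terminating NUL and do not change the visible file name; presumably
+indices 6 and 7 were intended. With N_SPACES == 1 and N_FILES == 1 only a
+single file is created, so the test runs regardless.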
*/ + +void +create_files(void) +/*==============*/ +{ + bool ret; + ulint i, k; + char name[20]; + os_thread_t thr[5]; + os_thread_id_t id[5]; + ulint err; + + printf("--------------------------------------------------------\n"); + printf("Create or open database files\n"); + + strcpy(name, "tsfile00"); + + for (k = 0; k < N_SPACES; k++) { + for (i = 0; i < N_FILES; i++) { + + name[9] = (char)((ulint)'0' + k); + name[10] = (char)((ulint)'0' + i); + + files[i] = os_file_create(name, OS_FILE_CREATE, + OS_FILE_TABLESPACE, &ret); + + if (ret == FALSE) { + err = os_file_get_last_error(); + if (err != OS_FILE_ALREADY_EXISTS) { + printf("OS error %lu in file creation\n", err); + ut_error; + } + + files[i] = os_file_create( + name, OS_FILE_OPEN, + OS_FILE_TABLESPACE, &ret); + + ut_a(ret); + } + + ret = os_file_close(files[i]); + ut_a(ret); + + if (i == 0) { + fil_space_create(name, k, OS_FILE_TABLESPACE); + } + + ut_a(fil_validate()); + + fil_node_create(name, FILE_SIZE, k); + } + } + + ios = 0; + + mutex_create(&ios_mutex); + + for (i = 0; i < 5; i++) { + n[i] = i; + + thr[i] = os_thread_create(handler_thread, n + i, id + i); + } +/* + n[9] = 9; + os_thread_create(info_thread, n + 9, id); +*/ +} + +/************************************************************************ +Creates the test database files. */ + +void +create_db(void) +/*===========*/ +{ + ulint i; + byte* frame; + ulint j; + ulint tm, oldtm; + mtr_t mtr; + + printf("--------------------------------------------------------\n"); + printf("Write database pages\n"); + + oldtm = ut_clock(); + + for (i = 0; i < N_SPACES; i++) { + for (j = 0; j < FILE_SIZE * N_FILES; j++) { + mtr_start(&mtr); + mtr_set_log_mode(&mtr, MTR_LOG_NONE); + + frame = buf_page_create(i, j, &mtr); + buf_page_get(i, j, RW_X_LATCH, &mtr); + + if (j > FILE_SIZE * N_FILES - 64 * 2 - 1) { + mlog_write_ulint(frame + FIL_PAGE_PREV, j - 5, + MLOG_4BYTES, &mtr); + mlog_write_ulint(frame + FIL_PAGE_NEXT, j - 7, + MLOG_4BYTES, &mtr); + } else { + mlog_write_ulint(frame + FIL_PAGE_PREV, j - 1, + MLOG_4BYTES, &mtr); + mlog_write_ulint(frame + FIL_PAGE_NEXT, j + 1, + MLOG_4BYTES, &mtr); + } + + mlog_write_ulint(frame + FIL_PAGE_OFFSET, j, + MLOG_4BYTES, &mtr); + mlog_write_ulint(frame + FIL_PAGE_SPACE, i, + MLOG_4BYTES, &mtr); + mlog_write_ulint(frame + COUNTER_OFFSET, 0, + MLOG_4BYTES, &mtr); + + mtr_commit(&mtr); + } + } + + tm = ut_clock(); + printf("Wall clock time for test %lu milliseconds\n", tm - oldtm); + + printf("--------------------------------------------------------\n"); + printf("TEST 1 A. Test of page creation when page resides in buffer\n"); + for (i = 0; i < N_SPACES; i++) { + for (j = FILE_SIZE * N_FILES - 200; + j < FILE_SIZE * N_FILES; j++) { + mtr_start(&mtr); + mtr_set_log_mode(&mtr, MTR_LOG_NONE); + + frame = buf_page_create(i, j, &mtr); + buf_page_get(i, j, RW_X_LATCH, &mtr); + + mlog_write_ulint(frame + FIL_PAGE_PREV, + j - 1, MLOG_4BYTES, &mtr); + + mlog_write_ulint(frame + FIL_PAGE_NEXT, + j + 1, MLOG_4BYTES, &mtr); + + mlog_write_ulint(frame + FIL_PAGE_OFFSET, j, + MLOG_4BYTES, &mtr); + mlog_write_ulint(frame + FIL_PAGE_SPACE, i, + MLOG_4BYTES, &mtr); + mtr_commit(&mtr); + } + } + + printf("--------------------------------------------------------\n"); + printf("TEST 1 B. Flush pages\n"); + + buf_flush_batch(BUF_FLUSH_LIST, POOL_SIZE / 2); + buf_validate(); + + printf("--------------------------------------------------------\n"); + printf("TEST 1 C. 
Allocate POOL_SIZE blocks to flush pages\n"); + + buf_validate(); + /* Flush the pool of dirty pages */ + for (i = 0; i < POOL_SIZE; i++) { + + bl_arr[i] = buf_frame_alloc(); + } + buf_validate(); + buf_LRU_print(); + + for (i = 0; i < POOL_SIZE; i++) { + + buf_frame_free(bl_arr[i]); + } + + buf_validate(); + ut_a(buf_all_freed()); + + mtr_start(&mtr); + frame = buf_page_get(0, 313, RW_S_LATCH, &mtr); +#ifdef UNIV_ASYNC_IO + ut_a(buf_page_io_query(buf_block_align(frame)) == TRUE); +#endif + mtr_commit(&mtr); +} + +/************************************************************************ +Reads the test database files. */ + +void +test1(void) +/*=======*/ +{ + ulint i, j, k, c; + byte* frame; + ulint tm, oldtm; + mtr_t mtr; + + printf("--------------------------------------------------------\n"); + printf("TEST 1 D. Read linearly database files\n"); + + oldtm = ut_clock(); + + for (k = 0; k < 1; k++) { + for (i = 0; i < N_SPACES; i++) { + for (j = 0; j < N_FILES * FILE_SIZE; j++) { + mtr_start(&mtr); + + frame = buf_page_get(i, j, RW_S_LATCH, &mtr); + + ut_a(mtr_read_ulint(frame + FIL_PAGE_OFFSET, + MLOG_4BYTES, &mtr) + == j); + ut_a(mtr_read_ulint(frame + FIL_PAGE_SPACE, + MLOG_4BYTES, &mtr) + == i); + + mtr_commit(&mtr); + } + } + } + + tm = ut_clock(); + printf("Wall clock time for %lu pages %lu milliseconds\n", + k * i * j, tm - oldtm); + buf_validate(); + + printf("--------------------------------------------------------\n"); + printf("TEST 1 E. Read linearly downward database files\n"); + + oldtm = ut_clock(); + + c = 0; + + for (k = 0; k < 1; k++) { + for (i = 0; i < N_SPACES; i++) { + for (j = ut_min(1000, FILE_SIZE - 1); j > 0; j--) { + mtr_start(&mtr); + + frame = buf_page_get(i, j, RW_S_LATCH, &mtr); + c++; + + ut_a(mtr_read_ulint(frame + FIL_PAGE_OFFSET, + MLOG_4BYTES, &mtr) + == j); + ut_a(mtr_read_ulint(frame + FIL_PAGE_SPACE, + MLOG_4BYTES, &mtr) + == i); + + + ut_a(buf_page_io_query(buf_block_align(frame)) + == FALSE); + + mtr_commit(&mtr); + } + } + } + + tm = ut_clock(); + printf("Wall clock time for %lu pages %lu milliseconds\n", + c, tm - oldtm); + buf_validate(); +} + +/************************************************************************ +Reads the test database files. */ + +void +test2(void) +/*=======*/ +{ + ulint i, j, k; + byte* frame; + ulint tm, oldtm; + mtr_t mtr; + + printf("--------------------------------------------------------\n"); + printf("TEST 2. Read randomly database files\n"); + + oldtm = ut_clock(); + + for (k = 0; k < 100; k++) { + i = ut_rnd_gen_ulint() % N_SPACES; + j = ut_rnd_gen_ulint() % (N_FILES * FILE_SIZE); + + mtr_start(&mtr); + + frame = buf_page_get(i, j, RW_S_LATCH, &mtr); + + ut_a(mtr_read_ulint(frame + FIL_PAGE_OFFSET, + MLOG_4BYTES, &mtr) + == j); + ut_a(mtr_read_ulint(frame + FIL_PAGE_SPACE, + MLOG_4BYTES, &mtr) + == i); + + mtr_commit(&mtr); + } + + tm = ut_clock(); + printf("Wall clock time for random %lu read %lu milliseconds\n", + k, tm - oldtm); +} + +/************************************************************************ +Reads the test database files. 
*/ + +void +test3(void) +/*=======*/ +{ + ulint i, j, k; + byte* frame; + ulint tm, oldtm; + ulint rnd; + mtr_t mtr; + + if (FILE_SIZE < POOL_SIZE + 3050 + ut_dbg_zero) { + return; + } + + printf("Flush the pool of high-offset pages\n"); + + /* Flush the pool of high-offset pages */ + for (i = 0; i < POOL_SIZE; i++) { + + mtr_start(&mtr); + + frame = buf_page_get(0, i, RW_S_LATCH, &mtr); + + mtr_commit(&mtr); + } + buf_validate(); + + printf("--------------------------------------------------------\n"); + printf("TEST 3. Read randomly database pages, no read-ahead\n"); + + oldtm = ut_clock(); + + rnd = 123; + + for (k = 0; k < 400; k++) { + rnd += 23477; + + i = 0; + j = POOL_SIZE + 10 + rnd % 3000; + + mtr_start(&mtr); + + frame = buf_page_get(i, j, RW_S_LATCH, &mtr); + + ut_a(mtr_read_ulint(frame + FIL_PAGE_OFFSET, + MLOG_4BYTES, &mtr) + == j); + ut_a(mtr_read_ulint(frame + FIL_PAGE_SPACE, + MLOG_4BYTES, &mtr) + == i); + mtr_commit(&mtr); + } + + tm = ut_clock(); + printf( + "Wall clock time for %lu random no read-ahead %lu milliseconds\n", + k, tm - oldtm); + + buf_validate(); + printf("Flush the pool of high-offset pages\n"); + /* Flush the pool of high-offset pages */ + for (i = 0; i < POOL_SIZE; i++) { + + mtr_start(&mtr); + + frame = buf_page_get(0, i, RW_S_LATCH, &mtr); + + mtr_commit(&mtr); + } + + buf_validate(); + printf("--------------------------------------------------------\n"); + printf("TEST 3 B. Read randomly database pages, random read-ahead\n"); + + oldtm = ut_clock(); + + rnd = 123; + for (k = 0; k < 400; k++) { + rnd += 23477; + + i = 0; + j = POOL_SIZE + 10 + rnd % 400; + + mtr_start(&mtr); + + frame = buf_page_get(i, j, RW_S_LATCH, &mtr); + + ut_a(mtr_read_ulint(frame + FIL_PAGE_OFFSET, + MLOG_4BYTES, &mtr) + == j); + ut_a(mtr_read_ulint(frame + FIL_PAGE_SPACE, + MLOG_4BYTES, &mtr) + == i); + mtr_commit(&mtr); + } + + tm = ut_clock(); + printf( + "Wall clock time for %lu random read-ahead %lu milliseconds\n", + k, tm - oldtm); +} + +/************************************************************************ +Tests speed of CPU algorithms. */ + +void +test4(void) +/*=======*/ +{ + ulint i, j; + ulint tm, oldtm; + mtr_t mtr; + buf_frame_t* frame; + + os_thread_sleep(2000000); + + printf("--------------------------------------------------------\n"); + printf("TEST 4. Speed of CPU algorithms\n"); + + oldtm = ut_clock(); + + for (j = 0; j < 1000; j++) { + + mtr_start(&mtr); + for (i = 0; i < 20; i++) { + + frame = buf_page_get(0, i, RW_S_LATCH, &mtr); + } + mtr_commit(&mtr); + } + + tm = ut_clock(); + printf("Wall clock time for %lu page get-release %lu milliseconds\n", + i * j, tm - oldtm); + + buf_validate(); + + oldtm = ut_clock(); + + for (i = 0; i < 10000; i++) { + frame = buf_frame_alloc(); + buf_frame_free(frame); + } + + tm = ut_clock(); + printf("Wall clock time for %lu block alloc-free %lu milliseconds\n", + i, tm - oldtm); + + ha_print_info(buf_pool->page_hash); + buf_print(); +} + +/************************************************************************ +Tests various points of code. */ + +void +test5(void) +/*=======*/ +{ + buf_frame_t* frame; + fil_addr_t addr; + ulint space; + mtr_t mtr; + + printf("--------------------------------------------------------\n"); + printf("TEST 5. 
Various tests \n"); + + mtr_start(&mtr); + + frame = buf_page_get(0, 313, RW_S_LATCH, &mtr); + + ut_a(buf_frame_get_space_id(frame) == 0); + ut_a(buf_frame_get_page_no(frame) == 313); + + ut_a(buf_frame_align(frame + UNIV_PAGE_SIZE - 1) == frame); + ut_a(buf_frame_align(frame) == frame); + + ut_a(buf_block_align(frame + UNIV_PAGE_SIZE - 1) == + buf_block_align(frame)); + + buf_ptr_get_fsp_addr(frame + UNIV_PAGE_SIZE - 1, &space, &addr); + + ut_a(addr.page == 313) + ut_a(addr.boffset == UNIV_PAGE_SIZE - 1); + ut_a(space == 0); + + mtr_commit(&mtr); +} + +/************************************************************************ +Random test thread function. */ + +ulint +random_thread( +/*===========*/ + void* arg) +{ + ulint n; + ulint i, j, r, t, p, sp, count; + ulint s; + buf_frame_t* arr[POOL_SIZE / N_THREADS]; + buf_frame_t* frame; + mtr_t mtr; + mtr_t mtr2; + + n = *((ulint*)arg); + + printf("Random test thread %lu starts\n", os_thread_get_curr_id()); + + for (i = 0; i < 30; i++) { + t = ut_rnd_gen_ulint() % 10; + r = ut_rnd_gen_ulint() % 100; + s = ut_rnd_gen_ulint() % (POOL_SIZE / N_THREADS); + p = ut_rnd_gen_ulint(); + sp = ut_rnd_gen_ulint() % N_SPACES; + + if (i % 100 == 0) { + printf("Thr %lu tst %lu starts\n", os_thread_get_curr_id(), t); + } + ut_a(buf_validate()); + + mtr_start(&mtr); + if (t == 6) { + /* Allocate free blocks */ + for (j = 0; j < s; j++) { + arr[j] = buf_frame_alloc(); + ut_a(arr[j]); + } + for (j = 0; j < s; j++) { + buf_frame_free(arr[j]); + } + } else if (t == 9) { +/* buf_flush_batch(BUF_FLUSH_LIST, 30); */ + + } else if (t == 7) { + /* x-lock many blocks */ + for (j = 0; j < s; j++) { + arr[j] = buf_page_get(sp, (p + j) + % (N_FILES * FILE_SIZE), + RW_X_LATCH, + &mtr); + ut_a(arr[j]); + if (j > 0) { + ut_a(arr[j] != arr[j - 1]); + } + } + ut_a(buf_validate()); + } else if (t == 8) { + /* s-lock many blocks */ + for (j = 0; j < s; j++) { + arr[j] = buf_page_get(sp, (p + j) + % (N_FILES * FILE_SIZE), + RW_S_LATCH, + &mtr); + ut_a(arr[j]); + if (j > 0) { + ut_a(arr[j] != arr[j - 1]); + } + } + } else if (t <= 2) { + for (j = 0; j < r; j++) { + /* Read pages */ + mtr_start(&mtr2); + frame = buf_page_get(sp, + p % (N_FILES * FILE_SIZE), + RW_S_LATCH, &mtr2); + + ut_a(mtr_read_ulint(frame + FIL_PAGE_OFFSET, + MLOG_4BYTES, &mtr2) + == p % (N_FILES * FILE_SIZE)); + ut_a(mtr_read_ulint(frame + FIL_PAGE_SPACE, + MLOG_4BYTES, &mtr2) + == sp); + mtr_commit(&mtr2); + if (t == 0) { + p++; /* upward */ + } else if (t == 1) { + p--; /* downward */ + } else if (t == 2) { + p = ut_rnd_gen_ulint(); /* randomly */ + } + } + } else if (t <= 5) { + for (j = 0; j < r; j++) { + /* Write pages */ + mtr_start(&mtr2); + frame = buf_page_get(sp, p % (N_FILES * FILE_SIZE), + RW_X_LATCH, &mtr2); + count = 1 + mtr_read_ulint(frame + COUNTER_OFFSET, + MLOG_4BYTES, &mtr2); + mutex_enter(&incs_mutex); + incs++; + mutex_exit(&incs_mutex); + mlog_write_ulint(frame + COUNTER_OFFSET, count, + MLOG_4BYTES, &mtr2); + mtr_commit(&mtr2); + if (t == 3) { + p++; /* upward */ + } else if (t == 4) { + p--; /* downward */ + } else if (t == 5) { + p = ut_rnd_gen_ulint(); /* randomly */ + } + } + } /* if t = */ + + mtr_commit(&mtr); +/* printf("Thr %lu tst %lu ends ", os_thread_get_curr_id(), t); */ + ut_a(buf_validate()); + } /* for i */ + printf("\nRandom test thread %lu exits\n", os_thread_get_curr_id()); + return(0); +} + +/************************************************************************ +Random test thread function which reports the rw-lock list. 
*/ + +ulint +rw_list_thread( +/*===========*/ + void* arg) +{ + ulint n; + ulint i; + + n = *((ulint*)arg); + + printf("\nRw list test thread %lu starts\n", os_thread_get_curr_id()); + + for (i = 0; i < 10; i++) { + os_thread_sleep(3000000); + rw_lock_list_print_info(); + buf_validate(); + } + + return(0); +} + +/************************************************************************* +Performs random operations on the buffer with several threads. */ + +void +test6(void) +/*=======*/ +{ + ulint i, j; + os_thread_t thr[N_THREADS + 1]; + os_thread_id_t id[N_THREADS + 1]; + ulint n[N_THREADS + 1]; + ulint count = 0; + buf_frame_t* frame; + mtr_t mtr; + + printf("--------------------------------------------------------\n"); + printf("TEST 6. Random multi-thread test on the buffer \n"); + + incs = 0; + mutex_create(&incs_mutex); + + for (i = 0; i < N_THREADS; i++) { + n[i] = i; + + thr[i] = os_thread_create(random_thread, n + i, id + i); + } +/* + n[N_THREADS] = N_THREADS; + + thr[N_THREADS] = os_thread_create(rw_list_thread, n + N_THREADS, + id + N_THREADS); +*/ + for (i = 0; i < N_THREADS; i++) { + os_thread_wait(thr[i]); + } + +/* os_thread_wait(thr[N_THREADS]); */ + + for (i = 0; i < N_SPACES; i++) { + for (j = 0; j < N_FILES * FILE_SIZE; j++) { + mtr_start(&mtr); + + frame = buf_page_get(i, j, RW_S_LATCH, &mtr); + + ut_a(mtr_read_ulint(frame + FIL_PAGE_OFFSET, + MLOG_4BYTES, &mtr) + == j); + ut_a(mtr_read_ulint(frame + FIL_PAGE_SPACE, + MLOG_4BYTES, &mtr) + == i); + + count += mtr_read_ulint(frame + COUNTER_OFFSET, + MLOG_4BYTES, &mtr); + + mtr_commit(&mtr); + } + } + + printf("Count %lu incs %lu\n", count, incs); + ut_a(count == incs); +} + +/************************************************************************ +Frees the spaces in the file system. */ + +void +free_system(void) +/*=============*/ +{ + ulint i; + + for (i = 0; i < N_SPACES; i++) { + fil_space_free(i); + } +} + +/************************************************************************ +Main test function. 
+
+void
+main(void)
+/*======*/
+{
+	ulint	tm, oldtm;
+
+/*	buf_debug_prints = TRUE; */
+
+	oldtm = ut_clock();
+
+	os_aio_init(160, 5);
+	sync_init();
+	mem_init(1500000);
+	fil_init(26);	/* Allow 25 open files at a time */
+	buf_pool_init(POOL_SIZE, POOL_SIZE);
+	log_init();
+
+	buf_validate();
+
+	ut_a(fil_validate());
+
+	create_files();
+
+	create_db();
+
+	buf_validate();
+
+	test1();
+	buf_validate();
+
+	test2();
+	buf_validate();
+
+	test3();
+	buf_validate();
+
+	test4();
+
+	test5();
+
+	buf_validate();
+
+	test6();
+
+	buf_validate();
+
+	buf_print();
+
+	buf_flush_batch(BUF_FLUSH_LIST, POOL_SIZE + 1);
+	buf_print();
+	buf_validate();
+
+	os_thread_sleep(1000000);
+
+	buf_print();
+	buf_all_freed();
+
+	free_system();
+
+	tm = ut_clock();
+	printf("Wall clock time for test %lu milliseconds\n", tm - oldtm);
+	printf("TESTS COMPLETED SUCCESSFULLY!\n");
+}
diff --git a/innobase/buf/ts/tsos.c b/innobase/buf/ts/tsos.c
new file mode 100644
index 00000000000..c1cc3f27172
--- /dev/null
+++ b/innobase/buf/ts/tsos.c
@@ -0,0 +1,185 @@
+/************************************************************************
+The test module for the operating system interface
+
+(c) 1995 Innobase Oy
+
+Created 9/27/1995 Heikki Tuuri
+*************************************************************************/
+
+
+#include "../os0thread.h"
+#include "../os0shm.h"
+#include "../os0proc.h"
+#include "../os0sync.h"
+#include "../os0file.h"
+#include "ut0ut.h"
+#include "sync0sync.h"
+#include "mem0mem.h"
+
+ulint	last_thr = 1;
+
+byte	global_buf[1000000];
+
+os_file_t	file;
+os_file_t	file2;
+
+os_event_t	gl_ready;
+
+mutex_t	ios_mutex;
+ulint	ios;
+
+/************************************************************************
+Io-handler thread function. */
+
+ulint
+handler_thread(
+/*===========*/
+	void*	arg)
+{
+	ulint	segment;
+	void*	mess;
+	ulint	i;
+	bool	ret;
+
+	segment = *((ulint*)arg);
+
+	printf("Thread %lu starts\n", segment);
+
+	for (i = 0;; i++) {
+		ret = os_aio_wait(segment, &mess);
+
+		mutex_enter(&ios_mutex);
+		ios++;
+		mutex_exit(&ios_mutex);
+
+		ut_a(ret);
+/*		printf("Message for thread %lu %lu\n", segment,
+						(ulint)mess); */
+		if ((ulint)mess == 3333) {
+			os_event_set(gl_ready);
+		}
+	}
+
+	return(0);
+}
+
+/************************************************************************
+Test of io-handler threads */
+
+void
+test4(void)
+/*=======*/
+{
+	ulint	i;
+	bool	ret;
+	void*	buf;
+	ulint	rnd;
+	ulint	tm, oldtm;
+
+	os_thread_t	thr[5];
+	os_thread_id_t	id[5];
+	ulint	n[5];
+
+	printf("-------------------------------------------\n");
+	printf("OS-TEST 4. Test of asynchronous file io\n");
+
+	/* Align the buffer for file io */
+
+	buf = (void*)(((ulint)global_buf + 6300) & (~0xFFF));
+
+	gl_ready = os_event_create(NULL);
+	ios = 0;
+
+	sync_init();
+	mem_init();
+
+	mutex_create(&ios_mutex);
+
+	for (i = 0; i < 5; i++) {
+		n[i] = i;
+
+		thr[i] = os_thread_create(handler_thread, n + i, id + i);
+	}
+
+	rnd = 0;
+
+	oldtm = ut_clock();
+
+	for (i = 0; i < 4096; i++) {
+		ret = os_aio_read(file, (byte*)buf + 8192 * (rnd % 100),
+				8192 * (rnd % 4096), 0,
+				8192, (void*)i);
+		ut_a(ret);
+		rnd += 1;
+	}
+
+	ret = os_aio_read(file, buf, 8192 * (rnd % 1024), 0, 8192,
+							(void*)3333);
+	ut_a(ret);
+
+	ut_a(!os_aio_all_slots_free());
+
+	tm = ut_clock();
+
+	printf("All ios queued! N ios: %lu\n", ios);
+
+	printf("Wall clock time for test %lu milliseconds\n", tm - oldtm);
+
+	os_event_wait(gl_ready);
+
+	tm = ut_clock();
+	printf("N ios: %lu\n", ios);
+	printf("Wall clock time for test %lu milliseconds\n", tm - oldtm);
+
+	os_thread_sleep(2000000);
+
+	printf("N ios: %lu\n", ios);
+
+	ut_a(os_aio_all_slots_free());
+}
+
+/*************************************************************************
+Initializes the asynchronous io system for tests. */
+
+void
+init_aio(void)
+/*==========*/
+{
+	bool	ret;
+	void*	buf;
+
+	buf = (void*)(((ulint)global_buf + 6300) & (~0xFFF));
+
+	os_aio_init(160, 5);
+	file = os_file_create("j:\\tsfile4", OS_FILE_CREATE, OS_FILE_TABLESPACE,
+									&ret);
+
+	if (ret == FALSE) {
+		ut_a(os_file_get_last_error() == OS_FILE_ALREADY_EXISTS);
+
+		file = os_file_create("j:\\tsfile4", OS_FILE_OPEN,
+					OS_FILE_TABLESPACE, &ret);
+
+		ut_a(ret);
+	}
+}
+
+/************************************************************************
+Main test function. */
+
+void
+main(void)
+/*======*/
+{
+	ulint	tm, oldtm;
+
+	oldtm = ut_clock();
+
+	init_aio();
+
+	test4();
+
+	tm = ut_clock();
+	printf("Wall clock time for test %lu milliseconds\n", tm - oldtm);
+	printf("TESTS COMPLETED SUCCESSFULLY!\n");
+}
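A note on the buffer alignment used in test4() and init_aio() above: the expression (((ulint)global_buf + 6300) & (~0xFFF)) skips a little way into the static array and then rounds the address down to a 4096-byte (0x1000) boundary, because asynchronous and unbuffered file io typically requires page-aligned buffers. The stand-alone sketch below reproduces that arithmetic; it is illustrative only (the names aligned_in, scratch and ALIGNMENT are ours, not part of the Innobase sources), and, like the original ulint cast, it assumes a pointer value fits in an unsigned long.

#include <assert.h>
#include <stdio.h>

#define ALIGNMENT	4096UL	/* 0x1000; ~(ALIGNMENT - 1) == ~0xFFF */

static unsigned char	scratch[1000000];	/* stands in for global_buf */

/* Returns a pointer inside base, aligned down to an ALIGNMENT boundary
after skipping skip bytes; skip must be >= ALIGNMENT - 1 so that the
downward rounding cannot step in front of the array. */
static void*
aligned_in(unsigned char* base, unsigned long skip)
{
	unsigned long	addr = (unsigned long)base + skip;

	return((void*)(addr & ~(ALIGNMENT - 1)));
}

int
main(void)
{
	void*	buf = aligned_in(scratch, 6300);

	/* The result is page aligned and still points into scratch */
	assert(((unsigned long)buf & (ALIGNMENT - 1)) == 0);
	assert((unsigned char*)buf >= scratch);
	assert((unsigned char*)buf < scratch + sizeof(scratch));

	printf("base %p aligned %p\n", (void*)scratch, buf);

	return(0);
}

The skip of 6300 bytes in the original code is larger than the minimum of ALIGNMENT - 1 = 4095, which leaves some slack before the aligned pointer; any value of at least 4095 would keep the rounded-down address inside the array.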