diff options
Diffstat (limited to 'storage/innobase/buf/buf0dblwr.cc')
-rw-r--r-- | storage/innobase/buf/buf0dblwr.cc | 1086 |
1 files changed, 1086 insertions, 0 deletions
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc new file mode 100644 index 00000000000..ad6ef7c4cef --- /dev/null +++ b/storage/innobase/buf/buf0dblwr.cc @@ -0,0 +1,1086 @@ +/***************************************************************************** + +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0dblwr.cc +Doublewrite buffer module + +Created 2011/12/19 +*******************************************************/ + +#include "buf0dblwr.h" + +#include "buf0buf.h" +#include "buf0lru.h" +#include "buf0flu.h" +#include "buf0checksum.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "page0zip.h" +#include "trx0sys.h" +#include "page0page.h" +#include "mtr0log.h" + +#ifndef UNIV_HOTBACKUP + +/** Time in microseconds (10000 = 10 ms) that we sleep when unable to find a +slot in the doublewrite buffer or when we have to wait for a running batch +to end. 
*/ +#define TRX_DOUBLEWRITE_BATCH_POLL_DELAY 10000 + +#ifdef UNIV_PFS_MUTEX +/* Key to register the mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t buf_dblwr_mutex_key; +#endif /* UNIV_PFS_RWLOCK */ + +/** The doublewrite buffer */ +UNIV_INTERN buf_dblwr_t* buf_dblwr = NULL; + +/** Set to TRUE when the doublewrite buffer is being created */ +UNIV_INTERN ibool buf_dblwr_being_created = FALSE; + +/****************************************************************//** +Determines if a page number is located inside the doublewrite buffer. +@return TRUE if the location is inside the two blocks of the +doublewrite buffer */ +UNIV_INTERN +ibool +buf_dblwr_page_inside( +/*==================*/ + ulint page_no) /*!< in: page number */ +{ + if (buf_dblwr == NULL) { + + return(FALSE); + } + + if (page_no >= buf_dblwr->block1 + && page_no < buf_dblwr->block1 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + return(TRUE); + } + + if (page_no >= buf_dblwr->block2 + && page_no < buf_dblwr->block2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + return(TRUE); + } + + return(FALSE); +} + +/****************************************************************//** +Calls buf_page_get() on the TRX_SYS_PAGE and returns a pointer to the +doublewrite buffer within it. +@return pointer to the doublewrite buffer within the filespace header +page. */ +UNIV_INLINE +byte* +buf_dblwr_get( +/*==========*/ + mtr_t* mtr) /*!< in/out: MTR to hold the page latch */ +{ + buf_block_t* block; + + block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, + RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + return(buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE); +} + + +/****************************************************************//** +Creates or initialializes the doublewrite buffer at a database start. 
*/
static
void
buf_dblwr_init(
/*===========*/
	byte*	doublewrite)	/*!< in: pointer to the doublewrite buf
				header on trx sys page */
{
	/* The doublewrite buffer consists of two blocks of equal size. */
	const ulint	buf_size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;

	buf_dblwr = static_cast<buf_dblwr_t*>(
		mem_zalloc(sizeof(buf_dblwr_t)));

	/* The batch area must leave at least one slot for single page
	writes, and must itself contain at least one slot. */
	ut_a(srv_doublewrite_batch_size > 0
	     && srv_doublewrite_batch_size < buf_size);

	mutex_create(buf_dblwr_mutex_key,
		     &buf_dblwr->mutex, SYNC_DOUBLEWRITE);

	buf_dblwr_t*	dblwr = buf_dblwr;

	/* Nothing is queued or reserved yet */
	dblwr->first_free = 0;
	dblwr->s_reserved = 0;
	dblwr->b_reserved = 0;

	/* First page numbers of the two blocks, as recorded in the
	header on the trx sys page */
	dblwr->block1 = mach_read_from_4(
		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
	dblwr->block2 = mach_read_from_4(
		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);

	/* One in-use flag per slot */
	dblwr->in_use = static_cast<ibool*>(
		mem_zalloc(buf_size * sizeof(ibool)));

	/* One extra page is allocated so that the buffer can be aligned
	to UNIV_PAGE_SIZE */
	dblwr->write_buf_unaligned = static_cast<byte*>(
		ut_malloc((1 + buf_size) * UNIV_PAGE_SIZE));

	dblwr->write_buf = static_cast<byte*>(
		ut_align(dblwr->write_buf_unaligned, UNIV_PAGE_SIZE));

	/* One page descriptor pointer per slot */
	dblwr->buf_block_arr = static_cast<buf_page_t**>(
		mem_zalloc(buf_size * sizeof(void*)));
}

/****************************************************************//**
Creates the doublewrite buffer to a new InnoDB installation. The header of the
doublewrite buffer is placed on the trx system header page.
*/
UNIV_INTERN
void
buf_dblwr_create(void)
/*==================*/
{
	buf_block_t*	block2;
	buf_block_t*	new_block;
	byte*		doublewrite;
	byte*		fseg_header;
	ulint		page_no;
	ulint		prev_page_no;
	ulint		i;
	mtr_t		mtr;

	if (buf_dblwr) {
		/* Already inited */

		return;
	}

start_again:
	mtr_start(&mtr);
	buf_dblwr_being_created = TRUE;

	doublewrite = buf_dblwr_get(&mtr);

	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
		/* The doublewrite buffer has already been created:
		just read in some numbers */

		buf_dblwr_init(doublewrite);

		mtr_commit(&mtr);
		buf_dblwr_being_created = FALSE;
		return;
	}

	ut_print_timestamp(stderr);
	fprintf(stderr,
		" InnoDB: Doublewrite buffer not found:"
		" creating new\n");

	if (buf_pool_get_curr_size()
	    < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
		+ FSP_EXTENT_SIZE / 2 + 100)
	       * UNIV_PAGE_SIZE)) {
		fprintf(stderr,
			"InnoDB: Cannot create doublewrite buffer:"
			" you must\n"
			"InnoDB: increase your buffer pool size.\n"
			"InnoDB: Cannot continue operation.\n");

		exit(1);
	}

	block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
			     TRX_SYS_DOUBLEWRITE
			     + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);

	/* Test the result before anything else is handed the block:
	the latch-level declaration below must never see NULL. */
	if (block2 == NULL) {
		fprintf(stderr,
			"InnoDB: Cannot create doublewrite buffer:"
			" you must\n"
			"InnoDB: increase your tablespace size.\n"
			"InnoDB: Cannot continue operation.\n");

		/* We exit without committing the mtr to prevent
		its modifications to the database getting to disk */

		exit(1);
	}

	/* fseg_create acquires a second latch on the page,
	therefore we must declare it: */

	buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);

	fseg_header = doublewrite + TRX_SYS_DOUBLEWRITE_FSEG;
	prev_page_no = 0;

	for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
		     + FSP_EXTENT_SIZE / 2; i++) {
		new_block = fseg_alloc_free_page(
			fseg_header, prev_page_no + 1, FSP_UP, &mtr);
		if (new_block == NULL) {
			fprintf(stderr,
				"InnoDB: Cannot create doublewrite"
				" buffer: you must\n"
				"InnoDB: increase your"
				" tablespace size.\n"
				"InnoDB: Cannot continue operation.\n"
				);

			exit(1);
		}

		/* We read the allocated pages to the buffer pool;
		when they are written to disk in a flush, the space
		id and page number fields are also written to the
		pages. When we at database startup read pages
		from the doublewrite buffer, we know that if the
		space id and page number in them are the same as
		the page position in the tablespace, then the page
		has not been written to in doublewrite. */

		ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
		page_no = buf_block_get_page_no(new_block);

		if (i == FSP_EXTENT_SIZE / 2) {
			/* First page of block1: record its page number
			in the header and in the repeated copy */
			ut_a(page_no == FSP_EXTENT_SIZE);
			mlog_write_ulint(doublewrite
					 + TRX_SYS_DOUBLEWRITE_BLOCK1,
					 page_no, MLOG_4BYTES, &mtr);
			mlog_write_ulint(doublewrite
					 + TRX_SYS_DOUBLEWRITE_REPEAT
					 + TRX_SYS_DOUBLEWRITE_BLOCK1,
					 page_no, MLOG_4BYTES, &mtr);

		} else if (i == FSP_EXTENT_SIZE / 2
			   + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
			/* First page of block2: same bookkeeping */
			ut_a(page_no == 2 * FSP_EXTENT_SIZE);
			mlog_write_ulint(doublewrite
					 + TRX_SYS_DOUBLEWRITE_BLOCK2,
					 page_no, MLOG_4BYTES, &mtr);
			mlog_write_ulint(doublewrite
					 + TRX_SYS_DOUBLEWRITE_REPEAT
					 + TRX_SYS_DOUBLEWRITE_BLOCK2,
					 page_no, MLOG_4BYTES, &mtr);

		} else if (i > FSP_EXTENT_SIZE / 2) {
			/* Inside a block the pages must be contiguous */
			ut_a(page_no == prev_page_no + 1);
		}

		if (((i + 1) & 15) == 0) {
			/* rw_locks can only be recursively x-locked
			2048 times. (on 32 bit platforms,
			(lint) 0 - (X_LOCK_DECR * 2049)
			is no longer a negative number, and thus
			lock_word becomes like a shared lock).
			For 4k page size this loop will
			lock the fseg header too many times. Since
			this code is not done while any other threads
			are active, restart the MTR occasionally. */
			mtr_commit(&mtr);
			mtr_start(&mtr);
			doublewrite = buf_dblwr_get(&mtr);
			fseg_header = doublewrite
				+ TRX_SYS_DOUBLEWRITE_FSEG;
		}

		prev_page_no = page_no;
	}

	/* Stamp the header (and its repeated copy) with the magic number
	so that the next pass through start_again finds the buffer */
	mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
			 TRX_SYS_DOUBLEWRITE_MAGIC_N,
			 MLOG_4BYTES, &mtr);
	mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
			 + TRX_SYS_DOUBLEWRITE_REPEAT,
			 TRX_SYS_DOUBLEWRITE_MAGIC_N,
			 MLOG_4BYTES, &mtr);

	mlog_write_ulint(doublewrite
			 + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
			 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
			 MLOG_4BYTES, &mtr);
	mtr_commit(&mtr);

	/* Flush the modified pages to disk and make a checkpoint */
	log_make_checkpoint_at(LSN_MAX, TRUE);

	/* Remove doublewrite pages from LRU */
	buf_pool_invalidate();

	ut_print_timestamp(stderr);
	fprintf(stderr, " InnoDB: Doublewrite buffer created\n");

	/* Re-read the header: this time the magic number matches and the
	in-memory structure gets initialized */
	goto start_again;
}

/****************************************************************//**
At a database startup initializes the doublewrite buffer memory structure if
we already have a doublewrite buffer created in the data files. If we are
upgrading to an InnoDB version which supports multiple tablespaces, then this
function performs the necessary update operations. If we are in a crash
recovery, this function uses a possible doublewrite buffer to restore
half-written pages in the data files.
*/ +UNIV_INTERN +void +buf_dblwr_init_or_restore_pages( +/*============================*/ + ibool restore_corrupt_pages) /*!< in: TRUE=restore pages */ +{ + byte* buf; + byte* read_buf; + byte* unaligned_read_buf; + ulint block1; + ulint block2; + byte* page; + ibool reset_space_ids = FALSE; + byte* doublewrite; + ulint space_id; + ulint page_no; + ulint i; + + /* We do the file i/o past the buffer pool */ + + unaligned_read_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE)); + + read_buf = static_cast<byte*>( + ut_align(unaligned_read_buf, UNIV_PAGE_SIZE)); + + /* Read the trx sys header to check if we are using the doublewrite + buffer */ + + fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0, + UNIV_PAGE_SIZE, read_buf, NULL); + doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) + == TRX_SYS_DOUBLEWRITE_MAGIC_N) { + /* The doublewrite buffer has been created */ + + buf_dblwr_init(doublewrite); + + block1 = buf_dblwr->block1; + block2 = buf_dblwr->block2; + + buf = buf_dblwr->write_buf; + } else { + goto leave_func; + } + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED) + != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) { + + /* We are upgrading from a version < 4.1.x to a version where + multiple tablespaces are supported. We must reset the space id + field in the pages in the doublewrite buffer because starting + from this version the space id is stored to + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. 
*/ + + reset_space_ids = TRUE; + + fprintf(stderr, + "InnoDB: Resetting space id's in the" + " doublewrite buffer\n"); + } + + /* Read the pages from the doublewrite buffer to memory */ + + fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block1, 0, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + buf, NULL); + fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block2, 0, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + NULL); + /* Check if any of these pages is half-written in data files, in the + intended position */ + + page = buf; + + for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) { + + ulint source_page_no; + page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); + + if (reset_space_ids) { + + space_id = 0; + mach_write_to_4(page + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0); + /* We do not need to calculate new checksums for the + pages because the field .._SPACE_ID does not affect + them. Write the page back to where we read it from. 
*/ + + if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + source_page_no = block1 + i; + } else { + source_page_no = block2 + + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; + } + + fil_io(OS_FILE_WRITE, TRUE, 0, 0, source_page_no, 0, + UNIV_PAGE_SIZE, page, NULL); + } else { + + space_id = mach_read_from_4( + page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + } + + if (!restore_corrupt_pages) { + /* The database was shut down gracefully: no need to + restore pages */ + + } else if (!fil_tablespace_exists_in_mem(space_id)) { + /* Maybe we have dropped the single-table tablespace + and this page once belonged to it: do nothing */ + + } else if (!fil_check_adress_in_tablespace(space_id, + page_no)) { + fprintf(stderr, + "InnoDB: Warning: a page in the" + " doublewrite buffer is not within space\n" + "InnoDB: bounds; space id %lu" + " page number %lu, page %lu in" + " doublewrite buf.\n", + (ulong) space_id, (ulong) page_no, (ulong) i); + + } else if (space_id == TRX_SYS_SPACE + && ((page_no >= block1 + && page_no + < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) + || (page_no >= block2 + && page_no + < (block2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) { + + /* It is an unwritten doublewrite buffer page: + do nothing */ + } else { + ulint zip_size = fil_space_get_zip_size(space_id); + + /* Read in the actual page from the file */ + fil_io(OS_FILE_READ, TRUE, space_id, zip_size, + page_no, 0, + zip_size ? 
zip_size : UNIV_PAGE_SIZE, + read_buf, NULL); + + /* Check if the page is corrupt */ + + if (UNIV_UNLIKELY + (buf_page_is_corrupted(read_buf, zip_size))) { + + fprintf(stderr, + "InnoDB: Warning: database page" + " corruption or a failed\n" + "InnoDB: file read of" + " space %lu page %lu.\n" + "InnoDB: Trying to recover it from" + " the doublewrite buffer.\n", + (ulong) space_id, (ulong) page_no); + + if (buf_page_is_corrupted(page, zip_size)) { + fprintf(stderr, + "InnoDB: Dump of the page:\n"); + buf_page_print( + read_buf, zip_size, + BUF_PAGE_PRINT_NO_CRASH); + fprintf(stderr, + "InnoDB: Dump of" + " corresponding page" + " in doublewrite buffer:\n"); + buf_page_print( + page, zip_size, + BUF_PAGE_PRINT_NO_CRASH); + + fprintf(stderr, + "InnoDB: Also the page in the" + " doublewrite buffer" + " is corrupt.\n" + "InnoDB: Cannot continue" + " operation.\n" + "InnoDB: You can try to" + " recover the database" + " with the my.cnf\n" + "InnoDB: option:\n" + "InnoDB:" + " innodb_force_recovery=6\n"); + ut_error; + } + + /* Write the good page from the + doublewrite buffer to the intended + position */ + + fil_io(OS_FILE_WRITE, TRUE, space_id, + zip_size, page_no, 0, + zip_size ? zip_size : UNIV_PAGE_SIZE, + page, NULL); + fprintf(stderr, + "InnoDB: Recovered the page from" + " the doublewrite buffer.\n"); + } + } + + page += UNIV_PAGE_SIZE; + } + + fil_flush_file_spaces(FIL_TABLESPACE); + +leave_func: + ut_free(unaligned_read_buf); +} + +/****************************************************************//** +Frees doublewrite buffer. */ +UNIV_INTERN +void +buf_dblwr_free(void) +/*================*/ +{ + /* Free the double write data structures. 
*/ + ut_a(buf_dblwr != NULL); + ut_ad(buf_dblwr->s_reserved == 0); + ut_ad(buf_dblwr->b_reserved == 0); + + ut_free(buf_dblwr->write_buf_unaligned); + buf_dblwr->write_buf_unaligned = NULL; + + mem_free(buf_dblwr->buf_block_arr); + buf_dblwr->buf_block_arr = NULL; + + mem_free(buf_dblwr->in_use); + buf_dblwr->in_use = NULL; + + mutex_free(&buf_dblwr->mutex); + mem_free(buf_dblwr); + buf_dblwr = NULL; +} + +/********************************************************************//** +Updates the doublewrite buffer when an IO request that is part of an +LRU or flush batch is completed. */ +UNIV_INTERN +void +buf_dblwr_update(void) +/*==================*/ +{ + if (!srv_use_doublewrite_buf || buf_dblwr == NULL) { + return; + } + + mutex_enter(&buf_dblwr->mutex); + + ut_ad(buf_dblwr->batch_running); + ut_ad(buf_dblwr->b_reserved > 0); + + buf_dblwr->b_reserved--; + if (buf_dblwr->b_reserved == 0) { + + mutex_exit(&buf_dblwr->mutex); + /* This will finish the batch. Sync data files + to the disk. */ + fil_flush_file_spaces(FIL_TABLESPACE); + mutex_enter(&buf_dblwr->mutex); + + /* We can now reuse the doublewrite memory buffer: */ + buf_dblwr->first_free = 0; + buf_dblwr->batch_running = FALSE; + } + + mutex_exit(&buf_dblwr->mutex); +} + +/********************************************************************//** +Check the LSN values on the page. */ +static +void +buf_dblwr_check_page_lsn( +/*=====================*/ + const page_t* page) /*!< in: page to check */ +{ + if (memcmp(page + (FIL_PAGE_LSN + 4), + page + (UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), + 4)) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: ERROR: The page to be written" + " seems corrupt!\n" + "InnoDB: The low 4 bytes of LSN fields do not match " + "(" ULINTPF " != " ULINTPF ")!" 
+ " Noticed in the buffer pool.\n", + mach_read_from_4( + page + FIL_PAGE_LSN + 4), + mach_read_from_4( + page + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)); + } +} + +/********************************************************************//** +Asserts when a corrupt block is find during writing out data to the +disk. */ +static +void +buf_dblwr_assert_on_corrupt_block( +/*==============================*/ + const buf_block_t* block) /*!< in: block to check */ +{ + buf_page_print(block->frame, 0, BUF_PAGE_PRINT_NO_CRASH); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Apparent corruption of an" + " index page n:o %lu in space %lu\n" + "InnoDB: to be written to data file." + " We intentionally crash server\n" + "InnoDB: to prevent corrupt data" + " from ending up in data\n" + "InnoDB: files.\n", + (ulong) buf_block_get_page_no(block), + (ulong) buf_block_get_space(block)); + + ut_error; +} + +/********************************************************************//** +Check the LSN values on the page with which this block is associated. +Also validate the page if the option is set. */ +static +void +buf_dblwr_check_block( +/*==================*/ + const buf_block_t* block) /*!< in: block to check */ +{ + if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE + || block->page.zip.data) { + /* No simple validate for compressed pages exists. */ + return; + } + + buf_dblwr_check_page_lsn(block->frame); + + if (!block->check_index_page_at_flush) { + return; + } + + if (page_is_comp(block->frame)) { + if (!page_simple_validate_new(block->frame)) { + buf_dblwr_assert_on_corrupt_block(block); + } + } else if (!page_simple_validate_old(block->frame)) { + + buf_dblwr_assert_on_corrupt_block(block); + } +} + +/********************************************************************//** +Writes a page that has already been written to the doublewrite buffer +to the datafile. It is the job of the caller to sync the datafile. 
*/ +static +void +buf_dblwr_write_block_to_datafile( +/*==============================*/ + const buf_block_t* block) /*!< in: block to write */ +{ + ut_a(block); + ut_a(buf_page_in_file(&block->page)); + + if (block->page.zip.data) { + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, buf_page_get_space(&block->page), + buf_page_get_zip_size(&block->page), + buf_page_get_page_no(&block->page), 0, + buf_page_get_zip_size(&block->page), + (void*) block->page.zip.data, + (void*) block); + + goto exit; + } + + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + buf_dblwr_check_page_lsn(block->frame); + + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, buf_block_get_space(block), 0, + buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE, + (void*) block->frame, (void*) block); + +exit: + /* Increment the counter of I/O operations used + for selecting LRU policy. */ + buf_LRU_stat_inc_io(); +} + +/********************************************************************//** +Flushes possible buffered writes from the doublewrite memory buffer to disk, +and also wakes up the aio thread if simulated aio is used. It is very +important to call this function after a batch of writes has been posted, +and also when we may have to wait for a page latch! Otherwise a deadlock +of threads can occur. */ +UNIV_INTERN +void +buf_dblwr_flush_buffered_writes(void) +/*=================================*/ +{ + byte* write_buf; + ulint len; + ulint len2; + ulint i; + + if (!srv_use_doublewrite_buf || buf_dblwr == NULL) { + /* Sync the writes to the disk. */ + buf_flush_sync_datafiles(); + return; + } + +try_again: + mutex_enter(&(buf_dblwr->mutex)); + + /* Write first to doublewrite buffer blocks. We use synchronous + aio and thus know that file write has been completed when the + control returns. 
*/ + + if (buf_dblwr->first_free == 0) { + + mutex_exit(&(buf_dblwr->mutex)); + + return; + } + + if (buf_dblwr->batch_running) { + mutex_exit(&buf_dblwr->mutex); + + /* Another thread is running the batch right now. Wait + for it to finish. */ + os_thread_sleep(TRX_DOUBLEWRITE_BATCH_POLL_DELAY); + goto try_again; + } + + ut_a(!buf_dblwr->batch_running); + + /* Disallow anyone else to post to doublewrite buffer or to + start another batch of flushing. */ + buf_dblwr->batch_running = TRUE; + + /* Now safe to release the mutex. Note that though no other + thread is allowed to post to the doublewrite batch flushing + but any threads working on single page flushes are allowed + to proceed. */ + mutex_exit(&buf_dblwr->mutex); + + write_buf = buf_dblwr->write_buf; + + for (len2 = 0, i = 0; + i < buf_dblwr->first_free; + len2 += UNIV_PAGE_SIZE, i++) { + + const buf_block_t* block; + + block = (buf_block_t*) buf_dblwr->buf_block_arr[i]; + + if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE + || block->page.zip.data) { + /* No simple validate for compressed + pages exists. */ + continue; + } + + /* Check that the actual page in the buffer pool is + not corrupt and the LSN values are sane. */ + buf_dblwr_check_block(block); + + /* Check that the page as written to the doublewrite + buffer has sane LSN values. */ + buf_dblwr_check_page_lsn(write_buf + len2); + } + + /* Write out the first block of the doublewrite buffer */ + len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE, + buf_dblwr->first_free) * UNIV_PAGE_SIZE; + + fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, + buf_dblwr->block1, 0, len, + (void*) write_buf, NULL); + + if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + /* No unwritten pages in the second block. */ + goto flush; + } + + /* Write out the second block of the doublewrite buffer. 
*/ + len = (buf_dblwr->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) + * UNIV_PAGE_SIZE; + + write_buf = buf_dblwr->write_buf + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; + + fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, + buf_dblwr->block2, 0, len, + (void*) write_buf, NULL); + +flush: + /* increment the doublewrite flushed pages counter */ + srv_dblwr_pages_written += buf_dblwr->first_free; + srv_dblwr_writes++; + + /* Now flush the doublewrite buffer data to disk */ + fil_flush(TRX_SYS_SPACE); + + /* We know that the writes have been flushed to disk now + and in recovery we will find them in the doublewrite buffer + blocks. Next do the writes to the intended positions. */ + + for (i = 0; i < buf_dblwr->first_free; i++) { + const buf_block_t* block = (buf_block_t*) + buf_dblwr->buf_block_arr[i]; + + buf_dblwr_write_block_to_datafile(block); + } + + /* Wake possible simulated aio thread to actually post the + writes to the operating system. We don't flush the files + at this point. We leave it to the IO helper thread to flush + datafiles when the whole batch has been processed. */ + os_aio_simulated_wake_handler_threads(); +} + +/********************************************************************//** +Posts a buffer page for writing. If the doublewrite memory buffer is +full, calls buf_dblwr_flush_buffered_writes and waits for for free +space to appear. */ +UNIV_INTERN +void +buf_dblwr_add_to_batch( +/*====================*/ + buf_page_t* bpage) /*!< in: buffer block to write */ +{ + ulint zip_size; + + ut_a(buf_page_in_file(bpage)); + +try_again: + mutex_enter(&(buf_dblwr->mutex)); + + ut_a(buf_dblwr->first_free <= srv_doublewrite_batch_size); + + if (buf_dblwr->batch_running) { + mutex_exit(&buf_dblwr->mutex); + + /* This not nearly as bad as it looks. There is only + page_cleaner thread which does background flushing + in batches therefore it is unlikely to be a contention + point. 
The only exception is when a user thread is + forced to do a flush batch because of a sync + checkpoint. */ + os_thread_sleep(TRX_DOUBLEWRITE_BATCH_POLL_DELAY); + goto try_again; + } + + if (buf_dblwr->first_free == srv_doublewrite_batch_size) { + mutex_exit(&(buf_dblwr->mutex)); + + buf_dblwr_flush_buffered_writes(); + + goto try_again; + } + + zip_size = buf_page_get_zip_size(bpage); + + if (zip_size) { + UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size); + /* Copy the compressed page and clear the rest. */ + memcpy(buf_dblwr->write_buf + + UNIV_PAGE_SIZE * buf_dblwr->first_free, + bpage->zip.data, zip_size); + memset(buf_dblwr->write_buf + + UNIV_PAGE_SIZE * buf_dblwr->first_free + + zip_size, 0, UNIV_PAGE_SIZE - zip_size); + } else { + ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame, + UNIV_PAGE_SIZE); + + memcpy(buf_dblwr->write_buf + + UNIV_PAGE_SIZE * buf_dblwr->first_free, + ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE); + } + + buf_dblwr->buf_block_arr[buf_dblwr->first_free] = bpage; + + buf_dblwr->first_free++; + buf_dblwr->b_reserved++; + + ut_ad(buf_dblwr->b_reserved <= srv_doublewrite_batch_size); + + if (buf_dblwr->first_free == srv_doublewrite_batch_size) { + mutex_exit(&(buf_dblwr->mutex)); + + buf_dblwr_flush_buffered_writes(); + + return; + } + + mutex_exit(&(buf_dblwr->mutex)); +} + +/********************************************************************//** +Writes a page to the doublewrite buffer on disk, sync it, then write +the page to the datafile and sync the datafile. This function is used +for single page flushes. If all the buffers allocated for single page +flushes in the doublewrite buffer are in use we wait here for one to +become free. We are guaranteed that a slot will become free because any +thread that is using a slot must also release the slot before leaving +this function. 
*/ +UNIV_INTERN +void +buf_dblwr_write_single_page( +/*========================*/ + buf_page_t* bpage) /*!< in: buffer block to write */ +{ + ulint n_slots; + ulint size; + ulint zip_size; + ulint offset; + ulint i; + + ut_a(buf_page_in_file(bpage)); + ut_a(srv_use_doublewrite_buf); + ut_a(buf_dblwr != NULL); + + /* total number of slots available for single page flushes + starts from srv_doublewrite_batch_size to the end of the + buffer. */ + size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; + ut_a(size > srv_doublewrite_batch_size); + n_slots = size - srv_doublewrite_batch_size; + + if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) { + + /* Check that the actual page in the buffer pool is + not corrupt and the LSN values are sane. */ + buf_dblwr_check_block((buf_block_t*) bpage); + + /* Check that the page as written to the doublewrite + buffer has sane LSN values. */ + if (!bpage->zip.data) { + buf_dblwr_check_page_lsn( + ((buf_block_t*) bpage)->frame); + } + } + +retry: + mutex_enter(&buf_dblwr->mutex); + if (buf_dblwr->s_reserved == n_slots) { + + mutex_exit(&buf_dblwr->mutex); + /* All slots are reserved. Since it involves two IOs + during the processing a sleep of 10ms should be + enough. */ + os_thread_sleep(TRX_DOUBLEWRITE_BATCH_POLL_DELAY); + goto retry; + } + + for (i = srv_doublewrite_batch_size; i < size; ++i) { + + if (!buf_dblwr->in_use[i]) { + break; + } + } + + /* We are guaranteed to find a slot. */ + ut_a(i < size); + buf_dblwr->in_use[i] = TRUE; + buf_dblwr->s_reserved++; + buf_dblwr->buf_block_arr[i] = bpage; + mutex_exit(&buf_dblwr->mutex); + + /* Lets see if we are going to write in the first or second + block of the doublewrite buffer. */ + if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + offset = buf_dblwr->block1 + i; + } else { + offset = buf_dblwr->block2 + i + - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; + } + + /* We deal with compressed and uncompressed pages a little + differently here. 
In case of uncompressed pages we can + directly write the block to the allocated slot in the + doublewrite buffer in the system tablespace and then after + syncing the system table space we can proceed to write the page + in the datafile. + In case of compressed page we first do a memcpy of the block + to the in-memory buffer of doublewrite before proceeding to + write it. This is so because we want to pad the remaining + bytes in the doublewrite page with zeros. */ + + zip_size = buf_page_get_zip_size(bpage); + if (zip_size) { + memcpy(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i, + bpage->zip.data, zip_size); + memset(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i + + zip_size, 0, UNIV_PAGE_SIZE - zip_size); + + fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, + offset, 0, UNIV_PAGE_SIZE, + (void*) (buf_dblwr->write_buf + + UNIV_PAGE_SIZE * i), NULL); + } else { + /* It is a regular page. Write it directly to the + doublewrite buffer */ + fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, + offset, 0, UNIV_PAGE_SIZE, + (void*) ((buf_block_t*) bpage)->frame, + NULL); + } + + /* Now flush the doublewrite buffer data to disk */ + fil_flush(TRX_SYS_SPACE); + + /* We know that the write has been flushed to disk now + and during recovery we will find it in the doublewrite buffer + blocks. Next do the write to the intended position. */ + buf_dblwr_write_block_to_datafile((buf_block_t*) bpage); + + /* Sync the writes to the disk. */ + buf_flush_sync_datafiles(); + + mutex_enter(&buf_dblwr->mutex); + + buf_dblwr->s_reserved--; + buf_dblwr->buf_block_arr[i] = NULL; + buf_dblwr->in_use[i] = FALSE; + + /* increment the doublewrite flushed pages counter */ + srv_dblwr_pages_written += buf_dblwr->first_free; + srv_dblwr_writes++; + + mutex_exit(&(buf_dblwr->mutex)); + +} +#endif /* !UNIV_HOTBACKUP */ |