diff options
author | heikki@donna.mysql.fi <> | 2001-08-04 19:36:14 +0300 |
---|---|---|
committer | heikki@donna.mysql.fi <> | 2001-08-04 19:36:14 +0300 |
commit | 94db78ce61a998d28a9335bade3e5e1df558a4ea (patch) | |
tree | e6dc89cb458f496f2b93e907afb60d3cd886cc18 /innobase/buf | |
parent | 596d69b5ce815c325d8a1af7934ed50efce5aed3 (diff) | |
download | mariadb-git-94db78ce61a998d28a9335bade3e5e1df558a4ea.tar.gz |
srv0srv.h Support raw disk partitions as data files
srv0start.c Support raw disk partitions as data files
srv0srv.c Support raw disk partitions as data files
row0purge.c < 4 GB rows, doublewrite, hang fixes
row0row.c < 4 GB rows, doublewrite, hang fixes
row0sel.c < 4 GB rows, doublewrite, hang fixes
row0uins.c < 4 GB rows, doublewrite, hang fixes
row0umod.c < 4 GB rows, doublewrite, hang fixes
row0undo.c < 4 GB rows, doublewrite, hang fixes
row0upd.c < 4 GB rows, doublewrite, hang fixes
srv0srv.c < 4 GB rows, doublewrite, hang fixes
srv0start.c < 4 GB rows, doublewrite, hang fixes
sync0rw.c < 4 GB rows, doublewrite, hang fixes
sync0sync.c < 4 GB rows, doublewrite, hang fixes
trx0purge.c < 4 GB rows, doublewrite, hang fixes
trx0rec.c < 4 GB rows, doublewrite, hang fixes
trx0sys.c < 4 GB rows, doublewrite, hang fixes
btr0btr.c < 4 GB rows, doublewrite, hang fixes
btr0cur.c < 4 GB rows, doublewrite, hang fixes
buf0buf.c < 4 GB rows, doublewrite, hang fixes
buf0flu.c < 4 GB rows, doublewrite, hang fixes
buf0rea.c < 4 GB rows, doublewrite, hang fixes
data0data.c < 4 GB rows, doublewrite, hang fixes
fil0fil.c < 4 GB rows, doublewrite, hang fixes
fsp0fsp.c < 4 GB rows, doublewrite, hang fixes
ibuf0ibuf.c < 4 GB rows, doublewrite, hang fixes
lock0lock.c < 4 GB rows, doublewrite, hang fixes
log0log.c < 4 GB rows, doublewrite, hang fixes
log0recv.c < 4 GB rows, doublewrite, hang fixes
os0file.c < 4 GB rows, doublewrite, hang fixes
page0cur.c < 4 GB rows, doublewrite, hang fixes
pars0pars.c < 4 GB rows, doublewrite, hang fixes
rem0cmp.c < 4 GB rows, doublewrite, hang fixes
rem0rec.c < 4 GB rows, doublewrite, hang fixes
row0ins.c < 4 GB rows, doublewrite, hang fixes
row0mysql.c < 4 GB rows, doublewrite, hang fixes
univ.i < 4 GB rows, doublewrite, hang fixes
data0data.ic < 4 GB rows, doublewrite, hang fixes
mach0data.ic < 4 GB rows, doublewrite, hang fixes
rem0rec.ic < 4 GB rows, doublewrite, hang fixes
row0upd.ic < 4 GB rows, doublewrite, hang fixes
trx0rec.ic < 4 GB rows, doublewrite, hang fixes
rem0cmp.h < 4 GB rows, doublewrite, hang fixes
rem0rec.h < 4 GB rows, doublewrite, hang fixes
row0ins.h < 4 GB rows, doublewrite, hang fixes
row0mysql.h < 4 GB rows, doublewrite, hang fixes
row0row.h < 4 GB rows, doublewrite, hang fixes
row0upd.h < 4 GB rows, doublewrite, hang fixes
srv0srv.h < 4 GB rows, doublewrite, hang fixes
sync0sync.h < 4 GB rows, doublewrite, hang fixes
trx0rec.h < 4 GB rows, doublewrite, hang fixes
trx0sys.h < 4 GB rows, doublewrite, hang fixes
trx0types.h < 4 GB rows, doublewrite, hang fixes
trx0undo.h < 4 GB rows, doublewrite, hang fixes
ut0dbg.h < 4 GB rows, doublewrite, hang fixes
ut0ut.h < 4 GB rows, doublewrite, hang fixes
btr0btr.h < 4 GB rows, doublewrite, hang fixes
btr0cur.h < 4 GB rows, doublewrite, hang fixes
buf0buf.h < 4 GB rows, doublewrite, hang fixes
buf0flu.h < 4 GB rows, doublewrite, hang fixes
data0data.h < 4 GB rows, doublewrite, hang fixes
dict0mem.h < 4 GB rows, doublewrite, hang fixes
fil0fil.h < 4 GB rows, doublewrite, hang fixes
fsp0fsp.h < 4 GB rows, doublewrite, hang fixes
os0file.h < 4 GB rows, doublewrite, hang fixes
Diffstat (limited to 'innobase/buf')
-rw-r--r-- | innobase/buf/buf0buf.c | 81 | ||||
-rw-r--r-- | innobase/buf/buf0flu.c | 195 | ||||
-rw-r--r-- | innobase/buf/buf0rea.c | 14 |
3 files changed, 242 insertions, 48 deletions
diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c index ede9e621462..3fabe6c6d0e 100644 --- a/innobase/buf/buf0buf.c +++ b/innobase/buf/buf0buf.c @@ -216,14 +216,44 @@ buf_calc_page_checksum( /* out: checksum */ byte* page) /* in: buffer page */ { - ulint checksum; + ulint checksum; - checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN); - + ut_fold_binary(page + FIL_PAGE_DATA, UNIV_PAGE_SIZE - FIL_PAGE_DATA - - FIL_PAGE_END_LSN); - checksum = checksum & 0xFFFFFFFF; + checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN); + + ut_fold_binary(page + FIL_PAGE_DATA, + UNIV_PAGE_SIZE - FIL_PAGE_DATA + - FIL_PAGE_END_LSN); + checksum = checksum & 0xFFFFFFFF; - return(checksum); + return(checksum); +} + +/************************************************************************ +Checks if a page is corrupt. */ + +ibool +buf_page_is_corrupted( +/*==================*/ + /* out: TRUE if corrupted */ + byte* read_buf) /* in: a database page */ +{ + ulint checksum; + + checksum = buf_calc_page_checksum(read_buf); + + if ((mach_read_from_4(read_buf + FIL_PAGE_LSN + 4) + != mach_read_from_4(read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN + 4)) + || (checksum != mach_read_from_4(read_buf + + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN) + && mach_read_from_4(read_buf + FIL_PAGE_LSN) + != mach_read_from_4(read_buf + + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN))) { + return(TRUE); + } + + return(FALSE); } /************************************************************************ @@ -1265,34 +1295,22 @@ buf_page_io_complete( dulint id; dict_index_t* index; ulint io_type; - ulint checksum; ut_ad(block); io_type = block->io_fix; if (io_type == BUF_IO_READ) { - checksum = buf_calc_page_checksum(block->frame); - /* From version 3.23.38 up we store the page checksum to the 4 upper bytes of the page end lsn field */ - if ((mach_read_from_4(block->frame + FIL_PAGE_LSN + 4) - != mach_read_from_4(block->frame + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN + 4)) - || (checksum != mach_read_from_4(block->frame - + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN) - && mach_read_from_4(block->frame + FIL_PAGE_LSN) - != mach_read_from_4(block->frame - + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN))) { - fprintf(stderr, + if (buf_page_is_corrupted(block->frame)) { + fprintf(stderr, "InnoDB: Database page corruption or a failed\n" "InnoDB: file read of page %lu.\n", block->offset); - fprintf(stderr, + fprintf(stderr, "InnoDB: You may have to recover from a backup.\n"); - exit(1); + exit(1); } if (recv_recovery_is_on()) { @@ -1601,11 +1619,28 @@ void buf_print_io(void) /*==============*/ { + ulint size; + ut_ad(buf_pool); + size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE; + mutex_enter(&(buf_pool->mutex)); + + printf("LRU list length %lu \n", UT_LIST_GET_LEN(buf_pool->LRU)); + printf("Free list length %lu \n", UT_LIST_GET_LEN(buf_pool->free)); + printf("Flush list length %lu \n", + UT_LIST_GET_LEN(buf_pool->flush_list)); + printf("Buffer pool size in pages %lu\n", size); - printf("pages read %lu, created %lu, written %lu\n", + printf("Pending reads %lu \n", buf_pool->n_pend_reads); + + printf("Pending writes: LRU %lu, flush list %lu, single page %lu\n", + buf_pool->n_flush[BUF_FLUSH_LRU], + buf_pool->n_flush[BUF_FLUSH_LIST], + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]); + + printf("Pages read %lu, created %lu, written %lu\n", buf_pool->n_pages_read, buf_pool->n_pages_created, buf_pool->n_pages_written); mutex_exit(&(buf_pool->mutex)); diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c index 7129b8d20a9..0f27cee45a5 100644 --- a/innobase/buf/buf0flu.c +++ b/innobase/buf/buf0flu.c @@ -1,7 +1,7 @@ /****************************************************** The database buffer buf_pool flush algorithm -(c) 1995 Innobase Oy +(c) 1995-2001 Innobase Oy Created 11/11/1995 Heikki Tuuri *******************************************************/ @@ -15,7 +15,6 @@ Created 11/11/1995 Heikki Tuuri #include "ut0byte.h" #include "ut0lst.h" #include "fil0fil.h" - #include "buf0buf.h" #include "buf0lru.h" #include "buf0rea.h" @@ -195,9 +194,145 @@ buf_flush_write_complete( } /************************************************************************ -Does an asynchronous write of a buffer page. NOTE: in simulated aio we must -call os_aio_simulated_wake_handler_threads after we have posted a batch -of writes! */ +Flushes possible buffered writes from the doublewrite memory buffer to disk, +and also wakes up the aio thread if simulated aio is used. It is very +important to call this function after a batch of writes has been posted, +and also when we may have to wait for a page latch! Otherwise a deadlock +of threads can occur. */ +static +void +buf_flush_buffered_writes(void) +/*===========================*/ +{ + buf_block_t* block; + ulint len; + ulint i; + + if (trx_doublewrite == NULL) { + os_aio_simulated_wake_handler_threads(); + + return; + } + + mutex_enter(&(trx_doublewrite->mutex)); + + /* Write first to doublewrite buffer blocks. We use synchronous + aio and thus know that file write has been completed when the + control returns. */ + + if (trx_doublewrite->first_free == 0) { + + mutex_exit(&(trx_doublewrite->mutex)); + + return; + } + + if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; + } else { + len = trx_doublewrite->first_free * UNIV_PAGE_SIZE; + } + + fil_io(OS_FILE_WRITE, + TRUE, TRX_SYS_SPACE, + trx_doublewrite->block1, 0, len, + (void*)trx_doublewrite->write_buf, NULL); + + if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + len = (trx_doublewrite->first_free + - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE; + + fil_io(OS_FILE_WRITE, + TRUE, TRX_SYS_SPACE, + trx_doublewrite->block2, 0, len, + (void*)(trx_doublewrite->write_buf + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE), + NULL); + } + + /* Now flush the doublewrite buffer data to disk */ + + fil_flush(TRX_SYS_SPACE); + + /* We know that the writes have been flushed to disk now + and in recovery we will find them in the doublewrite buffer + blocks. Next do the writes to the intended positions. */ + + for (i = 0; i < trx_doublewrite->first_free; i++) { + block = trx_doublewrite->buf_block_arr[i]; + + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE, + (void*)block->frame, (void*)block); + } + + /* Wake possible simulated aio thread to actually post the + writes to the operating system */ + + os_aio_simulated_wake_handler_threads(); + + /* Wait that all async writes to tablespaces have been posted to + the OS */ + + os_aio_wait_until_no_pending_writes(); + + /* Now we flush the data to disk (for example, with fsync) */ + + fil_flush_file_spaces(FIL_TABLESPACE); + + /* We can now reuse the doublewrite memory buffer: */ + + trx_doublewrite->first_free = 0; + + mutex_exit(&(trx_doublewrite->mutex)); +} + +/************************************************************************ +Posts a buffer page for writing. If the doublewrite memory buffer is +full, calls buf_flush_buffered_writes and waits for for free space to +appear. */ +static +void +buf_flush_post_to_doublewrite_buf( +/*==============================*/ + buf_block_t* block) /* in: buffer block to write */ +{ +try_again: + mutex_enter(&(trx_doublewrite->mutex)); + + if (trx_doublewrite->first_free + >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + mutex_exit(&(trx_doublewrite->mutex)); + + buf_flush_buffered_writes(); + + goto try_again; + } + + ut_memcpy(trx_doublewrite->write_buf + + UNIV_PAGE_SIZE * trx_doublewrite->first_free, + block->frame, UNIV_PAGE_SIZE); + + trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = block; + + trx_doublewrite->first_free++; + + if (trx_doublewrite->first_free + >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + mutex_exit(&(trx_doublewrite->mutex)); + + buf_flush_buffered_writes(); + + return; + } + + mutex_exit(&(trx_doublewrite->mutex)); +} + +/************************************************************************ +Does an asynchronous write of a buffer page. NOTE: in simulated aio and +also when the doublewrite buffer is used, we must call +buf_flush_buffered_writes after we have posted a batch of writes! */ static void buf_flush_write_block_low( @@ -222,15 +357,24 @@ buf_flush_write_block_low( mach_write_to_8(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN, block->newest_modification); + /* Write to the page the space id and page number */ + + mach_write_to_4(block->frame + FIL_PAGE_SPACE, block->space); + mach_write_to_4(block->frame + FIL_PAGE_OFFSET, block->offset); + /* We overwrite the first 4 bytes of the end lsn field to store a page checksum */ mach_write_to_4(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN, buf_calc_page_checksum(block->frame)); - fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, - FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE, + if (!trx_doublewrite) { + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE, (void*)block->frame, (void*)block); + } else { + buf_flush_post_to_doublewrite_buf(block); + } } /************************************************************************ @@ -251,14 +395,14 @@ buf_flush_try_page( buf_block_t* block; ibool locked; - ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST) - || (flush_type == BUF_FLUSH_SINGLE_PAGE)); + ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST + || flush_type == BUF_FLUSH_SINGLE_PAGE); mutex_enter(&(buf_pool->mutex)); block = buf_page_hash_get(space, offset); - if ((flush_type == BUF_FLUSH_LIST) + if (flush_type == BUF_FLUSH_LIST && block && buf_flush_ready_for_flush(block, flush_type)) { block->io_fix = BUF_IO_WRITE; @@ -286,7 +430,7 @@ buf_flush_try_page( mutex_exit(&(buf_pool->mutex)); if (!locked) { - os_aio_simulated_wake_handler_threads(); + buf_flush_buffered_writes(); rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE); } @@ -300,7 +444,7 @@ buf_flush_try_page( return(1); - } else if ((flush_type == BUF_FLUSH_LRU) && block + } else if (flush_type == BUF_FLUSH_LRU && block && buf_flush_ready_for_flush(block, flush_type)) { /* VERY IMPORTANT: @@ -328,7 +472,7 @@ buf_flush_try_page( return(1); - } else if ((flush_type == BUF_FLUSH_SINGLE_PAGE) && block + } else if (flush_type == BUF_FLUSH_SINGLE_PAGE && block && buf_flush_ready_for_flush(block, flush_type)) { block->io_fix = BUF_IO_WRITE; @@ -387,6 +531,14 @@ buf_flush_try_neighbors( low = offset; high = offset + 1; + } else if (flush_type == BUF_FLUSH_LIST) { + /* Since semaphore waits require us to flush the + doublewrite buffer to disk, it is best that the + search area is just the page itself, to minimize + chances for semaphore waits */ + + low = offset; + high = offset + 1; } /* printf("Flush area: low %lu high %lu\n", low, high); */ @@ -418,13 +570,6 @@ buf_flush_try_neighbors( mutex_exit(&(buf_pool->mutex)); - /* In simulated aio we wake up the i/o-handler threads now that - we have posted a batch of writes: */ - - /* printf("Flush count %lu ; Waking i/o handlers\n", count); */ - - os_aio_simulated_wake_handler_threads(); - return(count); } @@ -565,13 +710,15 @@ buf_flush_batch( mutex_exit(&(buf_pool->mutex)); - if (buf_debug_prints && (page_count > 0)) { + buf_flush_buffered_writes(); + + if (buf_debug_prints && page_count > 0) { if (flush_type == BUF_FLUSH_LRU) { - printf("To flush %lu pages in LRU flush\n", + printf("Flushed %lu pages in LRU flush\n", page_count); } else if (flush_type == BUF_FLUSH_LIST) { - printf("To flush %lu pages in flush list flush\n", - page_count, flush_type); + printf("Flushed %lu pages in flush list flush\n", + page_count); } else { ut_error; } diff --git a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c index 728bf4404b8..db187cdd896 100644 --- a/innobase/buf/buf0rea.c +++ b/innobase/buf/buf0rea.c @@ -49,7 +49,9 @@ ulint buf_read_page_low( /*==============*/ /* out: 1 if a read request was queued, 0 if the page - already resided in buf_pool */ + already resided in buf_pool or if the page is in + the doublewrite buffer blocks in which case it is never + read into the pool */ ibool sync, /* in: TRUE if synchronous aio is desired */ ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ..., ORed to OS_AIO_SIMULATED_WAKE_LATER (see below @@ -63,6 +65,16 @@ buf_read_page_low( wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER; + if (trx_doublewrite && space == TRX_SYS_SPACE + && ( (offset >= trx_doublewrite->block1 + && offset < trx_doublewrite->block1 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) + || (offset >= trx_doublewrite->block2 + && offset < trx_doublewrite->block2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) { + return(0); + } + #ifdef UNIV_LOG_DEBUG if (space % 2 == 1) { /* We are updating a replicate space while holding the |