summaryrefslogtreecommitdiff
path: root/innobase/buf
diff options
context:
space:
mode:
author: heikki@donna.mysql.fi <> 2001-08-04 19:36:14 +0300
committer: heikki@donna.mysql.fi <> 2001-08-04 19:36:14 +0300
commit: 94db78ce61a998d28a9335bade3e5e1df558a4ea (patch)
tree: e6dc89cb458f496f2b93e907afb60d3cd886cc18 /innobase/buf
parent: 596d69b5ce815c325d8a1af7934ed50efce5aed3 (diff)
download: mariadb-git-94db78ce61a998d28a9335bade3e5e1df558a4ea.tar.gz
srv0srv.h Support raw disk partitions as data files
srv0start.c Support raw disk partitions as data files srv0srv.c Support raw disk partitions as data files row0purge.c < 4 GB rows, doublewrite, hang fixes row0row.c < 4 GB rows, doublewrite, hang fixes row0sel.c < 4 GB rows, doublewrite, hang fixes row0uins.c < 4 GB rows, doublewrite, hang fixes row0umod.c < 4 GB rows, doublewrite, hang fixes row0undo.c < 4 GB rows, doublewrite, hang fixes row0upd.c < 4 GB rows, doublewrite, hang fixes srv0srv.c < 4 GB rows, doublewrite, hang fixes srv0start.c < 4 GB rows, doublewrite, hang fixes sync0rw.c < 4 GB rows, doublewrite, hang fixes sync0sync.c < 4 GB rows, doublewrite, hang fixes trx0purge.c < 4 GB rows, doublewrite, hang fixes trx0rec.c < 4 GB rows, doublewrite, hang fixes trx0sys.c < 4 GB rows, doublewrite, hang fixes btr0btr.c < 4 GB rows, doublewrite, hang fixes btr0cur.c < 4 GB rows, doublewrite, hang fixes buf0buf.c < 4 GB rows, doublewrite, hang fixes buf0flu.c < 4 GB rows, doublewrite, hang fixes buf0rea.c < 4 GB rows, doublewrite, hang fixes data0data.c < 4 GB rows, doublewrite, hang fixes fil0fil.c < 4 GB rows, doublewrite, hang fixes fsp0fsp.c < 4 GB rows, doublewrite, hang fixes ibuf0ibuf.c < 4 GB rows, doublewrite, hang fixes lock0lock.c < 4 GB rows, doublewrite, hang fixes log0log.c < 4 GB rows, doublewrite, hang fixes log0recv.c < 4 GB rows, doublewrite, hang fixes os0file.c < 4 GB rows, doublewrite, hang fixes page0cur.c < 4 GB rows, doublewrite, hang fixes pars0pars.c < 4 GB rows, doublewrite, hang fixes rem0cmp.c < 4 GB rows, doublewrite, hang fixes rem0rec.c < 4 GB rows, doublewrite, hang fixes row0ins.c < 4 GB rows, doublewrite, hang fixes row0mysql.c < 4 GB rows, doublewrite, hang fixes univ.i < 4 GB rows, doublewrite, hang fixes data0data.ic < 4 GB rows, doublewrite, hang fixes mach0data.ic < 4 GB rows, doublewrite, hang fixes rem0rec.ic < 4 GB rows, doublewrite, hang fixes row0upd.ic < 4 GB rows, doublewrite, hang fixes trx0rec.ic < 4 GB rows, doublewrite, hang fixes rem0cmp.h < 4 GB rows, 
doublewrite, hang fixes rem0rec.h < 4 GB rows, doublewrite, hang fixes row0ins.h < 4 GB rows, doublewrite, hang fixes row0mysql.h < 4 GB rows, doublewrite, hang fixes row0row.h < 4 GB rows, doublewrite, hang fixes row0upd.h < 4 GB rows, doublewrite, hang fixes srv0srv.h < 4 GB rows, doublewrite, hang fixes sync0sync.h < 4 GB rows, doublewrite, hang fixes trx0rec.h < 4 GB rows, doublewrite, hang fixes trx0sys.h < 4 GB rows, doublewrite, hang fixes trx0types.h < 4 GB rows, doublewrite, hang fixes trx0undo.h < 4 GB rows, doublewrite, hang fixes ut0dbg.h < 4 GB rows, doublewrite, hang fixes ut0ut.h < 4 GB rows, doublewrite, hang fixes btr0btr.h < 4 GB rows, doublewrite, hang fixes btr0cur.h < 4 GB rows, doublewrite, hang fixes buf0buf.h < 4 GB rows, doublewrite, hang fixes buf0flu.h < 4 GB rows, doublewrite, hang fixes data0data.h < 4 GB rows, doublewrite, hang fixes dict0mem.h < 4 GB rows, doublewrite, hang fixes fil0fil.h < 4 GB rows, doublewrite, hang fixes fsp0fsp.h < 4 GB rows, doublewrite, hang fixes os0file.h < 4 GB rows, doublewrite, hang fixes
Diffstat (limited to 'innobase/buf')
-rw-r--r-- innobase/buf/buf0buf.c 81
-rw-r--r-- innobase/buf/buf0flu.c 195
-rw-r--r-- innobase/buf/buf0rea.c 14
3 files changed, 242 insertions, 48 deletions
diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c
index ede9e621462..3fabe6c6d0e 100644
--- a/innobase/buf/buf0buf.c
+++ b/innobase/buf/buf0buf.c
@@ -216,14 +216,44 @@ buf_calc_page_checksum(
/* out: checksum */
byte* page) /* in: buffer page */
{
- ulint checksum;
+ ulint checksum;
- checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
- + ut_fold_binary(page + FIL_PAGE_DATA, UNIV_PAGE_SIZE - FIL_PAGE_DATA
- - FIL_PAGE_END_LSN);
- checksum = checksum & 0xFFFFFFFF;
+ checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
+ + ut_fold_binary(page + FIL_PAGE_DATA,
+ UNIV_PAGE_SIZE - FIL_PAGE_DATA
+ - FIL_PAGE_END_LSN);
+ checksum = checksum & 0xFFFFFFFF;
- return(checksum);
+ return(checksum);
+}
+
+/************************************************************************
+Checks if a page is corrupt. The decision is made by comparing fields
+stored near the start of the page with fields stored in the last
+FIL_PAGE_END_LSN bytes of the page:
+(1) the 4 bytes at FIL_PAGE_LSN + 4 must equal the 4 bytes at
+UNIV_PAGE_SIZE - FIL_PAGE_END_LSN + 4, and
+(2) the 4 bytes at UNIV_PAGE_SIZE - FIL_PAGE_END_LSN must equal either
+the value returned by buf_calc_page_checksum() for the page or the
+4 bytes at FIL_PAGE_LSN.
+The page is reported as corrupted if (1) or (2) does not hold.
+NOTE(review): the second alternative in (2) presumably accepts pages
+written before the checksum was stored in the end lsn field; confirm. */
+
+ibool
+buf_page_is_corrupted(
+/*==================*/
+ /* out: TRUE if corrupted, FALSE otherwise */
+ byte* read_buf) /* in: a database page; only read, never
+ modified by this function */
+{
+ ulint checksum;
+
+ checksum = buf_calc_page_checksum(read_buf);
+
+ if ((mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
+ != mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN + 4))
+ || (checksum != mach_read_from_4(read_buf
+ + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN)
+ && mach_read_from_4(read_buf + FIL_PAGE_LSN)
+ != mach_read_from_4(read_buf
+ + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN))) {
+ return(TRUE);
+ }
+
+ return(FALSE);
}
/************************************************************************
@@ -1265,34 +1295,22 @@ buf_page_io_complete(
dulint id;
dict_index_t* index;
ulint io_type;
- ulint checksum;
ut_ad(block);
io_type = block->io_fix;
if (io_type == BUF_IO_READ) {
- checksum = buf_calc_page_checksum(block->frame);
-
/* From version 3.23.38 up we store the page checksum
to the 4 upper bytes of the page end lsn field */
- if ((mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
- != mach_read_from_4(block->frame + UNIV_PAGE_SIZE
- - FIL_PAGE_END_LSN + 4))
- || (checksum != mach_read_from_4(block->frame
- + UNIV_PAGE_SIZE
- - FIL_PAGE_END_LSN)
- && mach_read_from_4(block->frame + FIL_PAGE_LSN)
- != mach_read_from_4(block->frame
- + UNIV_PAGE_SIZE
- - FIL_PAGE_END_LSN))) {
- fprintf(stderr,
+ if (buf_page_is_corrupted(block->frame)) {
+ fprintf(stderr,
"InnoDB: Database page corruption or a failed\n"
"InnoDB: file read of page %lu.\n", block->offset);
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: You may have to recover from a backup.\n");
- exit(1);
+ exit(1);
}
if (recv_recovery_is_on()) {
@@ -1601,11 +1619,28 @@ void
buf_print_io(void)
/*==============*/
{
+ ulint size;
+
ut_ad(buf_pool);
+ size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE;
+
mutex_enter(&(buf_pool->mutex));
+
+ printf("LRU list length %lu \n", UT_LIST_GET_LEN(buf_pool->LRU));
+ printf("Free list length %lu \n", UT_LIST_GET_LEN(buf_pool->free));
+ printf("Flush list length %lu \n",
+ UT_LIST_GET_LEN(buf_pool->flush_list));
+ printf("Buffer pool size in pages %lu\n", size);
- printf("pages read %lu, created %lu, written %lu\n",
+ printf("Pending reads %lu \n", buf_pool->n_pend_reads);
+
+ printf("Pending writes: LRU %lu, flush list %lu, single page %lu\n",
+ buf_pool->n_flush[BUF_FLUSH_LRU],
+ buf_pool->n_flush[BUF_FLUSH_LIST],
+ buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
+
+ printf("Pages read %lu, created %lu, written %lu\n",
buf_pool->n_pages_read, buf_pool->n_pages_created,
buf_pool->n_pages_written);
mutex_exit(&(buf_pool->mutex));
diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c
index 7129b8d20a9..0f27cee45a5 100644
--- a/innobase/buf/buf0flu.c
+++ b/innobase/buf/buf0flu.c
@@ -1,7 +1,7 @@
/******************************************************
The database buffer buf_pool flush algorithm
-(c) 1995 Innobase Oy
+(c) 1995-2001 Innobase Oy
Created 11/11/1995 Heikki Tuuri
*******************************************************/
@@ -15,7 +15,6 @@ Created 11/11/1995 Heikki Tuuri
#include "ut0byte.h"
#include "ut0lst.h"
#include "fil0fil.h"
-
#include "buf0buf.h"
#include "buf0lru.h"
#include "buf0rea.h"
@@ -195,9 +194,145 @@ buf_flush_write_complete(
}
/************************************************************************
-Does an asynchronous write of a buffer page. NOTE: in simulated aio we must
-call os_aio_simulated_wake_handler_threads after we have posted a batch
-of writes! */
+Flushes possible buffered writes from the doublewrite memory buffer to disk,
+and also wakes up the aio thread if simulated aio is used. It is very
+important to call this function after a batch of writes has been posted,
+and also when we may have to wait for a page latch! Otherwise a deadlock
+of threads can occur. */
+static
+void
+buf_flush_buffered_writes(void)
+/*===========================*/
+{
+ buf_block_t* block;
+ ulint len;
+ ulint i;
+
+ if (trx_doublewrite == NULL) { /* no doublewrite buffer exists: the
+ only work to do is to wake the simulated aio
+ handler threads */
+ os_aio_simulated_wake_handler_threads();
+
+ return;
+ }
+
+ mutex_enter(&(trx_doublewrite->mutex));
+
+ /* Write first to doublewrite buffer blocks. We use synchronous
+ aio and thus know that file write has been completed when the
+ control returns. The first TRX_SYS_DOUBLEWRITE_BLOCK_SIZE buffered
+ pages are written to block1 and any pages beyond that to block2.
+ NOTE: trx_doublewrite->mutex is held over all the i/o below, until
+ first_free has been reset to 0. */
+
+ if (trx_doublewrite->first_free == 0) { /* no pages have been posted
+ to the memory buffer: nothing to write */
+
+ mutex_exit(&(trx_doublewrite->mutex));
+
+ return;
+ }
+
+ if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
+ } else {
+ len = trx_doublewrite->first_free * UNIV_PAGE_SIZE;
+ }
+
+ fil_io(OS_FILE_WRITE,
+ TRUE, TRX_SYS_SPACE,
+ trx_doublewrite->block1, 0, len,
+ (void*)trx_doublewrite->write_buf, NULL);
+
+ if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ len = (trx_doublewrite->first_free
+ - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE;
+
+ fil_io(OS_FILE_WRITE,
+ TRUE, TRX_SYS_SPACE,
+ trx_doublewrite->block2, 0, len,
+ (void*)(trx_doublewrite->write_buf
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE),
+ NULL);
+ }
+
+ /* Now flush the doublewrite buffer data to disk */
+
+ fil_flush(TRX_SYS_SPACE);
+
+ /* We know that the writes have been flushed to disk now
+ and in recovery we will find them in the doublewrite buffer
+ blocks. Next do the writes to the intended positions. Each buffered
+ block is written from its own frame to (block->space, block->offset),
+ and the block pointer is passed as the last argument of fil_io. */
+
+ for (i = 0; i < trx_doublewrite->first_free; i++) {
+ block = trx_doublewrite->buf_block_arr[i];
+
+ fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+ FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
+ (void*)block->frame, (void*)block);
+ }
+
+ /* Wake possible simulated aio thread to actually post the
+ writes to the operating system */
+
+ os_aio_simulated_wake_handler_threads();
+
+ /* Wait that all async writes to tablespaces have been posted to
+ the OS */
+
+ os_aio_wait_until_no_pending_writes();
+
+ /* Now we flush the data to disk (for example, with fsync) */
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+ /* We can now reuse the doublewrite memory buffer: */
+
+ trx_doublewrite->first_free = 0;
+
+ mutex_exit(&(trx_doublewrite->mutex));
+}
+
+/************************************************************************
+Posts a buffer page for writing. If the doublewrite memory buffer is
+full, calls buf_flush_buffered_writes and waits for free space to
+appear. Otherwise copies the page frame to the next free slot of
+trx_doublewrite->write_buf, records the block in
+trx_doublewrite->buf_block_arr and increments first_free. If that fills
+the memory buffer (2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE pages), the buffer
+is flushed right away with buf_flush_buffered_writes.
+The function dereferences trx_doublewrite without a NULL check, so the
+caller must only call it when trx_doublewrite is non-NULL. */
+static
+void
+buf_flush_post_to_doublewrite_buf(
+/*==============================*/
+ buf_block_t* block) /* in: buffer block to write */
+{
+try_again:
+ mutex_enter(&(trx_doublewrite->mutex));
+
+ if (trx_doublewrite->first_free
+ >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ mutex_exit(&(trx_doublewrite->mutex)); /* release first:
+ buf_flush_buffered_writes acquires this
+ mutex itself */
+
+ buf_flush_buffered_writes();
+
+ goto try_again;
+ }
+
+ ut_memcpy(trx_doublewrite->write_buf
+ + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
+ block->frame, UNIV_PAGE_SIZE);
+
+ trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = block;
+
+ trx_doublewrite->first_free++;
+
+ if (trx_doublewrite->first_free
+ >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ mutex_exit(&(trx_doublewrite->mutex)); /* the page just posted
+ filled the buffer: release the mutex
+ and flush immediately */
+
+ buf_flush_buffered_writes();
+
+ return;
+ }
+
+ mutex_exit(&(trx_doublewrite->mutex));
+}
+
+/************************************************************************
+Does an asynchronous write of a buffer page. NOTE: in simulated aio and
+also when the doublewrite buffer is used, we must call
+buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
@@ -222,15 +357,24 @@ buf_flush_write_block_low(
mach_write_to_8(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN,
block->newest_modification);
+ /* Write to the page the space id and page number */
+
+ mach_write_to_4(block->frame + FIL_PAGE_SPACE, block->space);
+ mach_write_to_4(block->frame + FIL_PAGE_OFFSET, block->offset);
+
/* We overwrite the first 4 bytes of the end lsn field to store
a page checksum */
mach_write_to_4(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN,
buf_calc_page_checksum(block->frame));
- fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
- FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
+ if (!trx_doublewrite) {
+ fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+ FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
(void*)block->frame, (void*)block);
+ } else {
+ buf_flush_post_to_doublewrite_buf(block);
+ }
}
/************************************************************************
@@ -251,14 +395,14 @@ buf_flush_try_page(
buf_block_t* block;
ibool locked;
- ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST)
- || (flush_type == BUF_FLUSH_SINGLE_PAGE));
+ ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
+ || flush_type == BUF_FLUSH_SINGLE_PAGE);
mutex_enter(&(buf_pool->mutex));
block = buf_page_hash_get(space, offset);
- if ((flush_type == BUF_FLUSH_LIST)
+ if (flush_type == BUF_FLUSH_LIST
&& block && buf_flush_ready_for_flush(block, flush_type)) {
block->io_fix = BUF_IO_WRITE;
@@ -286,7 +430,7 @@ buf_flush_try_page(
mutex_exit(&(buf_pool->mutex));
if (!locked) {
- os_aio_simulated_wake_handler_threads();
+ buf_flush_buffered_writes();
rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
}
@@ -300,7 +444,7 @@ buf_flush_try_page(
return(1);
- } else if ((flush_type == BUF_FLUSH_LRU) && block
+ } else if (flush_type == BUF_FLUSH_LRU && block
&& buf_flush_ready_for_flush(block, flush_type)) {
/* VERY IMPORTANT:
@@ -328,7 +472,7 @@ buf_flush_try_page(
return(1);
- } else if ((flush_type == BUF_FLUSH_SINGLE_PAGE) && block
+ } else if (flush_type == BUF_FLUSH_SINGLE_PAGE && block
&& buf_flush_ready_for_flush(block, flush_type)) {
block->io_fix = BUF_IO_WRITE;
@@ -387,6 +531,14 @@ buf_flush_try_neighbors(
low = offset;
high = offset + 1;
+ } else if (flush_type == BUF_FLUSH_LIST) {
+ /* Since semaphore waits require us to flush the
+ doublewrite buffer to disk, it is best that the
+ search area is just the page itself, to minimize
+ chances for semaphore waits */
+
+ low = offset;
+ high = offset + 1;
}
/* printf("Flush area: low %lu high %lu\n", low, high); */
@@ -418,13 +570,6 @@ buf_flush_try_neighbors(
mutex_exit(&(buf_pool->mutex));
- /* In simulated aio we wake up the i/o-handler threads now that
- we have posted a batch of writes: */
-
- /* printf("Flush count %lu ; Waking i/o handlers\n", count); */
-
- os_aio_simulated_wake_handler_threads();
-
return(count);
}
@@ -565,13 +710,15 @@ buf_flush_batch(
mutex_exit(&(buf_pool->mutex));
- if (buf_debug_prints && (page_count > 0)) {
+ buf_flush_buffered_writes();
+
+ if (buf_debug_prints && page_count > 0) {
if (flush_type == BUF_FLUSH_LRU) {
- printf("To flush %lu pages in LRU flush\n",
+ printf("Flushed %lu pages in LRU flush\n",
page_count);
} else if (flush_type == BUF_FLUSH_LIST) {
- printf("To flush %lu pages in flush list flush\n",
- page_count, flush_type);
+ printf("Flushed %lu pages in flush list flush\n",
+ page_count);
} else {
ut_error;
}
diff --git a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c
index 728bf4404b8..db187cdd896 100644
--- a/innobase/buf/buf0rea.c
+++ b/innobase/buf/buf0rea.c
@@ -49,7 +49,9 @@ ulint
buf_read_page_low(
/*==============*/
/* out: 1 if a read request was queued, 0 if the page
- already resided in buf_pool */
+ already resided in buf_pool or if the page is in
+ the doublewrite buffer blocks in which case it is never
+ read into the pool */
ibool sync, /* in: TRUE if synchronous aio is desired */
ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ...,
ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
@@ -63,6 +65,16 @@ buf_read_page_low(
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
+ if (trx_doublewrite && space == TRX_SYS_SPACE
+ && ( (offset >= trx_doublewrite->block1
+ && offset < trx_doublewrite->block1
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+ || (offset >= trx_doublewrite->block2
+ && offset < trx_doublewrite->block2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
+ return(0);
+ }
+
#ifdef UNIV_LOG_DEBUG
if (space % 2 == 1) {
/* We are updating a replicate space while holding the