summaryrefslogtreecommitdiff
path: root/innobase
diff options
context:
space:
mode:
authorunknown <heikki@hundin.mysql.fi>2003-01-06 22:07:25 +0200
committerunknown <heikki@hundin.mysql.fi>2003-01-06 22:07:25 +0200
commit9497c997041e20baab3ebaa985c9e29235fdf0ca (patch)
tree88f03b8d4b471aa94f27b4a3ed510bcd7c2c3ccb /innobase
parentef62b4c948d68108c0945ebde33540b03b392a6d (diff)
downloadmariadb-git-9497c997041e20baab3ebaa985c9e29235fdf0ca.tar.gz
buf0buf.c, buf0buf.ic, buf0buf.h:
Reduce memory usage of the buffer headers Many files: Merge InnoDB-4.1 with AWE support sql/mysqld.cc: Merge InnoDB-4.1 with AWE support sql/set_var.cc: Merge InnoDB-4.1 with AWE support sql/ha_innodb.h: Merge InnoDB-4.1 with AWE support sql/ha_innodb.cc: Merge InnoDB-4.1 with AWE support innobase/btr/btr0cur.c: Merge InnoDB-4.1 with AWE support innobase/btr/btr0pcur.c: Merge InnoDB-4.1 with AWE support innobase/buf/buf0flu.c: Merge InnoDB-4.1 with AWE support innobase/buf/buf0lru.c: Merge InnoDB-4.1 with AWE support innobase/buf/buf0rea.c: Merge InnoDB-4.1 with AWE support innobase/include/btr0pcur.h: Merge InnoDB-4.1 with AWE support innobase/include/buf0lru.h: Merge InnoDB-4.1 with AWE support innobase/include/log0recv.h: Merge InnoDB-4.1 with AWE support innobase/include/os0proc.h: Merge InnoDB-4.1 with AWE support innobase/include/srv0srv.h: Merge InnoDB-4.1 with AWE support innobase/log/log0log.c: Merge InnoDB-4.1 with AWE support innobase/log/log0recv.c: Merge InnoDB-4.1 with AWE support innobase/os/os0file.c: Merge InnoDB-4.1 with AWE support innobase/os/os0proc.c: Merge InnoDB-4.1 with AWE support innobase/srv/srv0srv.c: Merge InnoDB-4.1 with AWE support innobase/srv/srv0start.c: Merge InnoDB-4.1 with AWE support innobase/trx/trx0sys.c: Merge InnoDB-4.1 with AWE support innobase/trx/trx0trx.c: Merge InnoDB-4.1 with AWE support innobase/ut/ut0ut.c: Merge InnoDB-4.1 with AWE support innobase/include/buf0buf.h: Reduce memory usage of the buffer headers innobase/include/buf0buf.ic: Reduce memory usage of the buffer headers innobase/buf/buf0buf.c: Reduce memory usage of the buffer headers
Diffstat (limited to 'innobase')
-rw-r--r--innobase/btr/btr0cur.c7
-rw-r--r--innobase/btr/btr0pcur.c42
-rw-r--r--innobase/buf/buf0buf.c307
-rw-r--r--innobase/buf/buf0flu.c46
-rw-r--r--innobase/buf/buf0lru.c62
-rw-r--r--innobase/buf/buf0rea.c2
-rw-r--r--innobase/include/btr0pcur.h3
-rw-r--r--innobase/include/buf0buf.h120
-rw-r--r--innobase/include/buf0buf.ic119
-rw-r--r--innobase/include/buf0lru.h4
-rw-r--r--innobase/include/log0recv.h7
-rw-r--r--innobase/include/os0proc.h70
-rw-r--r--innobase/include/srv0srv.h3
-rw-r--r--innobase/log/log0log.c36
-rw-r--r--innobase/log/log0recv.c24
-rw-r--r--innobase/os/os0file.c2
-rw-r--r--innobase/os/os0proc.c462
-rw-r--r--innobase/srv/srv0srv.c38
-rw-r--r--innobase/srv/srv0start.c41
-rw-r--r--innobase/trx/trx0sys.c6
-rw-r--r--innobase/trx/trx0trx.c2
-rw-r--r--innobase/ut/ut0ut.c1
22 files changed, 1222 insertions, 182 deletions
diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c
index 24f0447d55d..e1d12c9adc4 100644
--- a/innobase/btr/btr0cur.c
+++ b/innobase/btr/btr0cur.c
@@ -291,6 +291,7 @@ btr_cur_search_to_nth_level(
&& latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
&& !estimate
&& mode != PAGE_CUR_LE_OR_EXTENDS
+ && srv_use_adaptive_hash_indexes
&& btr_search_guess_on_hash(index, info, tuple, mode,
latch_mode, cursor,
has_search_latch, mtr)) {
@@ -495,9 +496,11 @@ retry_page_get:
cursor->up_bytes = up_bytes;
#ifdef BTR_CUR_ADAPT
- btr_search_info_update(index, cursor);
-#endif
+ if (srv_use_adaptive_hash_indexes) {
+ btr_search_info_update(index, cursor);
+ }
+#endif
ut_ad(cursor->up_match != ULINT_UNDEFINED
|| mode != PAGE_CUR_GE);
ut_ad(cursor->up_match != ULINT_UNDEFINED
diff --git a/innobase/btr/btr0pcur.c b/innobase/btr/btr0pcur.c
index b2115dfdd6c..13efacb9da3 100644
--- a/innobase/btr/btr0pcur.c
+++ b/innobase/btr/btr0pcur.c
@@ -95,7 +95,9 @@ btr_pcur_store_position(
ut_a(cursor->latch_mode != BTR_NO_LATCHES);
if (page_get_n_recs(page) == 0) {
- /* It must be an empty index tree */
+ /* It must be an empty index tree; NOTE that in this case
+ we do not store the modify_clock, but always do a search
+ if we restore the cursor position */
ut_a(btr_page_get_next(page, mtr) == FIL_NULL
&& btr_page_get_prev(page, mtr) == FIL_NULL);
@@ -128,12 +130,13 @@ btr_pcur_store_position(
} else {
cursor->rel_pos = BTR_PCUR_ON;
}
-
+
cursor->old_stored = BTR_PCUR_OLD_STORED;
cursor->old_rec = dict_tree_copy_rec_order_prefix(tree, rec,
&(cursor->old_rec_buf),
&(cursor->buf_size));
+ cursor->block_when_stored = buf_block_align(page);
cursor->modify_clock = buf_frame_get_modify_clock(page);
}
@@ -205,6 +208,9 @@ btr_pcur_restore_position(
if (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
|| cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
+ /* In these cases we do not try an optimistic restoration,
+ but always do a search */
+
if (cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
from_left = TRUE;
} else {
@@ -214,6 +220,10 @@ btr_pcur_restore_position(
btr_cur_open_at_index_side(from_left,
btr_pcur_get_btr_cur(cursor)->index, latch_mode,
btr_pcur_get_btr_cur(cursor), mtr);
+
+ cursor->block_when_stored =
+ buf_block_align(btr_pcur_get_page(cursor));
+
return(FALSE);
}
@@ -224,8 +234,9 @@ btr_pcur_restore_position(
if (latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF) {
/* Try optimistic restoration */
- if (buf_page_optimistic_get(latch_mode, page,
- cursor->modify_clock, mtr)) {
+ if (buf_page_optimistic_get(latch_mode,
+ cursor->block_when_stored, page,
+ cursor->modify_clock, mtr)) {
cursor->pos_state = BTR_PCUR_IS_POSITIONED;
buf_page_dbg_add_level(page, SYNC_TREE_NODE);
@@ -270,8 +281,6 @@ btr_pcur_restore_position(
btr_pcur_open_with_no_init(btr_pcur_get_btr_cur(cursor)->index, tuple,
mode, latch_mode, cursor, 0, mtr);
-
- cursor->old_stored = BTR_PCUR_OLD_STORED;
/* Restore the old search mode */
cursor->search_mode = old_mode;
@@ -280,11 +289,18 @@ btr_pcur_restore_position(
&& btr_pcur_is_on_user_rec(cursor, mtr)
&& 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor))) {
- /* We have to store the NEW value for the modify clock, since
- the cursor can now be on a different page! */
+ /* We have to store the NEW value for the modify clock, since
+ the cursor can now be on a different page! But we can retain
+ the value of old_rec */
+
+ cursor->modify_clock =
+ buf_frame_get_modify_clock(btr_pcur_get_page(cursor));
+
+ cursor->block_when_stored =
+ buf_block_align(btr_pcur_get_page(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_STORED;
- cursor->modify_clock = buf_frame_get_modify_clock(
- buf_frame_align(btr_pcur_get_rec(cursor)));
mem_heap_free(heap);
return(TRUE);
@@ -292,6 +308,12 @@ btr_pcur_restore_position(
mem_heap_free(heap);
+ /* We have to store new position information, modify_clock etc.,
+ to the cursor because it can now be on a different page, the record
+ under it may have been removed, etc. */
+
+ btr_pcur_store_position(cursor, mtr);
+
return(FALSE);
}
diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c
index c9a5ec5307f..58c4ca5271b 100644
--- a/innobase/buf/buf0buf.c
+++ b/innobase/buf/buf0buf.c
@@ -196,7 +196,29 @@ If a new page is referenced in the buf_pool, and several pages
of its random access area (for instance, 32 consecutive pages
in a tablespace) have recently been referenced, we may predict
that the whole area may be needed in the near future, and issue
-the read requests for the whole area. */
+the read requests for the whole area.
+
+ AWE implementation
+ ------------------
+
+By a 'block' we mean the buffer header of type buf_block_t. By a 'page'
+we mean the physical 16 kB memory area allocated from RAM for that block.
+By a 'frame' we mean a 16 kB area in the virtual address space of the
+process, in the frame_mem of buf_pool.
+
+We can map pages to the frames of the buffer pool.
+
+1) A buffer block allocated to use as a non-data page, e.g., to the lock
+table, is always mapped to a frame.
+2) A bufferfixed or io-fixed data page is always mapped to a frame.
+3) When we need to map a block to a frame, we scan the list
+awe_LRU_free_mapped from its end and try to unmap the last block we can;
+note that bufferfixed or io-fixed pages cannot be unmapped.
+4) For every frame in the buffer pool there is always a block whose page is
+mapped to it. When we create the buffer pool, we map the first elements
+in the free list to the frames.
+5) When we have AWE enabled, we disable adaptive hash indexes.
+*/
buf_pool_t* buf_pool = NULL; /* The buffer buf_pool of the database */
@@ -346,12 +368,15 @@ void
buf_block_init(
/*===========*/
buf_block_t* block, /* in: pointer to control block */
- byte* frame) /* in: pointer to buffer frame */
+ byte* frame) /* in: pointer to buffer frame, or NULL if,
+ in the case of AWE, there is no frame */
{
block->state = BUF_BLOCK_NOT_USED;
block->frame = frame;
+ block->awe_info = NULL;
+
block->modify_clock = ut_dulint_zero;
block->file_page_was_freed = FALSE;
@@ -364,29 +389,37 @@ buf_block_init(
rw_lock_create(&(block->read_lock));
rw_lock_set_level(&(block->read_lock), SYNC_NO_ORDER_CHECK);
+#ifdef UNIV_SYNC_DEBUG
rw_lock_create(&(block->debug_latch));
rw_lock_set_level(&(block->debug_latch), SYNC_NO_ORDER_CHECK);
+#endif
}
/************************************************************************
-Creates a buffer buf_pool object. */
-static
+Creates the buffer pool. */
+
buf_pool_t*
-buf_pool_create(
-/*============*/
+buf_pool_init(
+/*==========*/
/* out, own: buf_pool object, NULL if not
- enough memory */
+ enough memory or error */
ulint max_size, /* in: maximum size of the buf_pool in
blocks */
- ulint curr_size) /* in: current size to use, must be <=
+ ulint curr_size, /* in: current size to use, must be <=
max_size, currently must be equal to
max_size */
+ ulint n_frames) /* in: number of frames; if AWE is used,
+ this is the size of the address space window
+ where physical memory pages are mapped; if
+ AWE is not used then this must be the same
+ as max_size */
{
byte* frame;
ulint i;
buf_block_t* block;
ut_a(max_size == curr_size);
+ ut_a(srv_use_awe || n_frames == max_size);
buf_pool = mem_alloc(sizeof(buf_pool_t));
@@ -396,8 +429,38 @@ buf_pool_create(
mutex_set_level(&(buf_pool->mutex), SYNC_BUF_POOL);
mutex_enter(&(buf_pool->mutex));
-
- buf_pool->frame_mem = ut_malloc(UNIV_PAGE_SIZE * (max_size + 1));
+
+ if (srv_use_awe) {
+ /*----------------------------------------*/
+ /* Allocate the virtual address space window, i.e., the
+ buffer pool frames */
+
+ buf_pool->frame_mem = os_awe_allocate_virtual_mem_window(
+ UNIV_PAGE_SIZE * (n_frames + 1));
+
+ /* Allocate the physical memory for AWE and the AWE info array
+ for buf_pool */
+
+ if ((curr_size % ((1024 * 1024) / UNIV_PAGE_SIZE)) != 0) {
+
+ fprintf(stderr,
+"InnoDB: AWE: Error: physical memory must be allocated in full megabytes.\n"
+"InnoDB: Trying to allocate %lu database pages.\n",
+ curr_size);
+
+ return(NULL);
+ }
+
+ if (!os_awe_allocate_physical_mem(&(buf_pool->awe_info),
+ curr_size / ((1024 * 1024) / UNIV_PAGE_SIZE))) {
+
+ return(NULL);
+ }
+ /*----------------------------------------*/
+ } else {
+ buf_pool->frame_mem = ut_malloc(
+ UNIV_PAGE_SIZE * (n_frames + 1));
+ }
if (buf_pool->frame_mem == NULL) {
@@ -414,21 +477,60 @@ buf_pool_create(
buf_pool->max_size = max_size;
buf_pool->curr_size = curr_size;
+ buf_pool->n_frames = n_frames;
+
/* Align pointer to the first frame */
frame = ut_align(buf_pool->frame_mem, UNIV_PAGE_SIZE);
- buf_pool->frame_zero = frame;
+ buf_pool->frame_zero = frame;
buf_pool->high_end = frame + UNIV_PAGE_SIZE * curr_size;
- /* Init block structs and assign frames for them */
+ if (srv_use_awe) {
+ /*----------------------------------------*/
+ /* Map an initial part of the allocated physical memory to
+ the window */
+
+ os_awe_map_physical_mem_to_window(buf_pool->frame_zero,
+ n_frames *
+ (UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE),
+ buf_pool->awe_info);
+ /*----------------------------------------*/
+ }
+
+ buf_pool->blocks_of_frames = ut_malloc(sizeof(void*) * n_frames);
+
+ if (buf_pool->blocks_of_frames == NULL) {
+
+ return(NULL);
+ }
+
+ /* Init block structs and assign frames for them; in the case of
+ AWE there are fewer frames than blocks. Then we assign the frames
+ to the first blocks (we already mapped the memory above). We also
+ init the awe_info for every block. */
+
for (i = 0; i < max_size; i++) {
block = buf_pool_get_nth_block(buf_pool, i);
+
+ if (i < n_frames) {
+ frame = buf_pool->frame_zero + i * UNIV_PAGE_SIZE;
+ *(buf_pool->blocks_of_frames + i) = block;
+ } else {
+ frame = NULL;
+ }
+
buf_block_init(block, frame);
- frame = frame + UNIV_PAGE_SIZE;
+
+ if (srv_use_awe) {
+ /*----------------------------------------*/
+ block->awe_info = buf_pool->awe_info
+ + i * (UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE);
+ /*----------------------------------------*/
+ }
}
-
+
buf_pool->page_hash = hash_create(2 * max_size);
buf_pool->n_pend_reads = 0;
@@ -438,12 +540,14 @@ buf_pool_create(
buf_pool->n_pages_read = 0;
buf_pool->n_pages_written = 0;
buf_pool->n_pages_created = 0;
-
+ buf_pool->n_pages_awe_remapped = 0;
+
buf_pool->n_page_gets = 0;
buf_pool->n_page_gets_old = 0;
buf_pool->n_pages_read_old = 0;
buf_pool->n_pages_written_old = 0;
buf_pool->n_pages_created_old = 0;
+ buf_pool->n_pages_awe_remapped_old = 0;
/* 2. Initialize flushing fields
---------------------------- */
@@ -466,40 +570,120 @@ buf_pool_create(
buf_pool->LRU_old = NULL;
+ UT_LIST_INIT(buf_pool->awe_LRU_free_mapped);
+
/* Add control blocks to the free list */
UT_LIST_INIT(buf_pool->free);
+
for (i = 0; i < curr_size; i++) {
block = buf_pool_get_nth_block(buf_pool, i);
- /* Wipe contents of page to eliminate a Purify warning */
- memset(block->frame, '\0', UNIV_PAGE_SIZE);
+ if (block->frame) {
+ /* Wipe contents of frame to eliminate a Purify
+ warning */
+
+ memset(block->frame, '\0', UNIV_PAGE_SIZE);
+
+ if (srv_use_awe) {
+ /* Add to the list of blocks mapped to
+ frames */
+
+ UT_LIST_ADD_LAST(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped, block);
+ }
+ }
- UT_LIST_ADD_FIRST(free, buf_pool->free, block);
+ UT_LIST_ADD_LAST(free, buf_pool->free, block);
}
mutex_exit(&(buf_pool->mutex));
- btr_search_sys_create(curr_size * UNIV_PAGE_SIZE / sizeof(void*) / 64);
+ if (srv_use_adaptive_hash_indexes) {
+ btr_search_sys_create(
+ curr_size * UNIV_PAGE_SIZE / sizeof(void*) / 64);
+ } else {
+ /* Create only a small dummy system */
+ btr_search_sys_create(1000);
+ }
return(buf_pool);
}
/************************************************************************
-Initializes the buffer buf_pool of the database. */
+Maps the page of a block to a frame, if it is not mapped yet. Unmaps some
+page from the end of the awe_LRU_free_mapped list. */
void
-buf_pool_init(
-/*==========*/
- ulint max_size, /* in: maximum size of the buf_pool in blocks */
- ulint curr_size) /* in: current size to use, must be <=
- max_size */
+buf_awe_map_page_to_frame(
+/*======================*/
+ buf_block_t* block, /* in: block whose page should be
+ mapped to a frame */
+ ibool add_to_mapped_list) /* in: TRUE if, in the case where
+ we need to map the page, we should also
+ add the block to the
+ awe_LRU_free_mapped list */
{
- ut_a(buf_pool == NULL);
+ buf_block_t* bck;
+
+ ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(block);
+
+ if (block->frame) {
+
+ return;
+ }
+
+ /* Scan awe_LRU_free_mapped from the end and try to find a block
+ which is not bufferfixed or io-fixed */
+
+ bck = UT_LIST_GET_LAST(buf_pool->awe_LRU_free_mapped);
- buf_pool_create(max_size, curr_size);
+ while (bck) {
+ if (bck->state == BUF_BLOCK_FILE_PAGE
+ && (bck->buf_fix_count != 0 || bck->io_fix != 0)) {
+
+ /* We have to skip this */
+ bck = UT_LIST_GET_PREV(awe_LRU_free_mapped, bck);
+ } else {
+ /* We can map block to the frame of bck */
+
+ os_awe_map_physical_mem_to_window(
+ bck->frame,
+ UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE,
+ block->awe_info);
+
+ block->frame = bck->frame;
+
+ *(buf_pool->blocks_of_frames
+ + (((ulint)(block->frame
+ - buf_pool->frame_zero))
+ >> UNIV_PAGE_SIZE_SHIFT))
+ = block;
+
+ bck->frame = NULL;
+ UT_LIST_REMOVE(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped,
+ bck);
+
+ if (add_to_mapped_list) {
+ UT_LIST_ADD_FIRST(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped,
+ block);
+ }
+
+ buf_pool->n_pages_awe_remapped++;
+
+ return;
+ }
+ }
+
+ fprintf(stderr,
+"InnoDB: AWE: Fatal error: cannot find a page to unmap\n"
+"InnoDB: awe_LRU_free_mapped list length %lu\n",
+ UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
- ut_ad(buf_validate());
+ ut_a(0);
}
/************************************************************************
@@ -508,7 +692,9 @@ UNIV_INLINE
buf_block_t*
buf_block_alloc(void)
/*=================*/
- /* out, own: the allocated block */
+ /* out, own: the allocated block; also if AWE
+ is used it is guaranteed that the page is
+ mapped to a frame */
{
buf_block_t* block;
@@ -846,6 +1032,19 @@ loop:
}
}
+ /* If AWE is enabled and the page is not mapped to a frame, then
+ map it */
+
+ if (block->frame == NULL) {
+ ut_a(srv_use_awe);
+
+ /* We set second parameter TRUE because the block is in the
+ LRU list and we must put it to awe_LRU_free_mapped list once
+ mapped to a frame */
+
+ buf_awe_map_page_to_frame(block, TRUE);
+ }
+
#ifdef UNIV_SYNC_DEBUG
buf_block_buf_fix_inc_debug(block, file, line);
#else
@@ -940,28 +1139,27 @@ buf_page_optimistic_get_func(
/*=========================*/
/* out: TRUE if success */
ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */
- buf_frame_t* guess, /* in: guessed frame */
+ buf_block_t* block, /* in: guessed buffer block */
+ buf_frame_t* guess, /* in: guessed frame; note that AWE may move
+ frames */
dulint modify_clock,/* in: modify clock value if mode is
..._GUESS_ON_CLOCK */
char* file, /* in: file name */
ulint line, /* in: line where called */
mtr_t* mtr) /* in: mini-transaction */
{
- buf_block_t* block;
ibool accessed;
ibool success;
ulint fix_type;
- ut_ad(mtr && guess);
+ ut_ad(mtr && block);
ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
-
- buf_pool->n_page_gets++;
-
- block = buf_block_align(guess);
mutex_enter(&(buf_pool->mutex));
- if (block->state != BUF_BLOCK_FILE_PAGE) {
+ /* If AWE is used, block may have a different frame now, e.g., NULL */
+
+ if (block->state != BUF_BLOCK_FILE_PAGE || block->frame != guess) {
mutex_exit(&(buf_pool->mutex));
@@ -1054,12 +1252,15 @@ buf_page_optimistic_get_func(
#ifdef UNIV_IBUF_DEBUG
ut_a(ibuf_count_get(block->space, block->offset) == 0);
#endif
+ buf_pool->n_page_gets++;
+
return(TRUE);
}
/************************************************************************
This is used to get access to a known database page, when no waiting can be
-done. */
+done. For example, if a search in an adaptive hash index leads us to this
+frame. */
ibool
buf_page_get_known_nowait(
@@ -1078,13 +1279,11 @@ buf_page_get_known_nowait(
ut_ad(mtr);
ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
-
- buf_pool->n_page_gets++;
-
- block = buf_block_align(guess);
mutex_enter(&(buf_pool->mutex));
+ block = buf_block_align(guess);
+
if (block->state == BUF_BLOCK_REMOVE_HASH) {
/* Another thread is just freeing the block from the LRU list
of the buffer pool: do not try to access this page; this
@@ -1152,6 +1351,8 @@ buf_page_get_known_nowait(
ut_a((mode == BUF_KEEP_OLD)
|| (ibuf_count_get(block->space, block->offset) == 0));
#endif
+ buf_pool->n_page_gets++;
+
return(TRUE);
}
@@ -1732,7 +1933,7 @@ buf_print(void)
ut_ad(buf_pool);
- size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE;
+ size = buf_pool->curr_size;
index_ids = mem_alloc(sizeof(dulint) * size);
counts = mem_alloc(sizeof(ulint) * size);
@@ -1847,7 +2048,7 @@ buf_print_io(
return;
}
- size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE;
+ size = buf_pool->curr_size;
mutex_enter(&(buf_pool->mutex));
@@ -1866,6 +2067,15 @@ buf_print_io(
buf += sprintf(buf,
"Modified db pages %lu\n",
UT_LIST_GET_LEN(buf_pool->flush_list));
+ if (srv_use_awe) {
+ buf += sprintf(buf,
+ "AWE: Buffer pool memory frames %lu\n",
+ buf_pool->n_frames);
+
+ buf += sprintf(buf,
+ "AWE: Database pages and free buffers mapped in frames %lu\n",
+ UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
+ }
buf += sprintf(buf, "Pending reads %lu \n", buf_pool->n_pend_reads);
@@ -1891,6 +2101,13 @@ buf_print_io(
(buf_pool->n_pages_written - buf_pool->n_pages_written_old)
/ time_elapsed);
+ if (srv_use_awe) {
+ buf += sprintf(buf, "AWE: %.2f page remaps/s\n",
+ (buf_pool->n_pages_awe_remapped
+ - buf_pool->n_pages_awe_remapped_old)
+ / time_elapsed);
+ }
+
if (buf_pool->n_page_gets > buf_pool->n_page_gets_old) {
buf += sprintf(buf, "Buffer pool hit rate %lu / 1000\n",
1000
@@ -1906,6 +2123,7 @@ buf_print_io(
buf_pool->n_pages_read_old = buf_pool->n_pages_read;
buf_pool->n_pages_created_old = buf_pool->n_pages_created;
buf_pool->n_pages_written_old = buf_pool->n_pages_written;
+ buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped;
mutex_exit(&(buf_pool->mutex));
}
@@ -1922,6 +2140,7 @@ buf_refresh_io_stats(void)
buf_pool->n_pages_read_old = buf_pool->n_pages_read;
buf_pool->n_pages_created_old = buf_pool->n_pages_created;
buf_pool->n_pages_written_old = buf_pool->n_pages_written;
+ buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped;
}
/*************************************************************************
diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c
index 78bde60c9b2..02587487a92 100644
--- a/innobase/buf/buf0flu.c
+++ b/innobase/buf/buf0flu.c
@@ -24,6 +24,7 @@ Created 11/11/1995 Heikki Tuuri
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"
+#include "srv0srv.h"
/* When flushed, dirty blocks are searched in neigborhoods of this size, and
flushed along with the original page. */
@@ -103,7 +104,7 @@ buf_flush_ready_for_replace(
/*========================*/
/* out: TRUE if can replace immediately */
buf_block_t* block) /* in: buffer control block, must be in state
- BUF_BLOCK_FILE_PAGE and in the LRU list*/
+ BUF_BLOCK_FILE_PAGE and in the LRU list */
{
ut_ad(mutex_own(&(buf_pool->mutex)));
ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
@@ -134,7 +135,6 @@ buf_flush_ready_for_flush(
if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
&& (block->io_fix == 0)) {
-
if (flush_type != BUF_FLUSH_LRU) {
return(TRUE);
@@ -436,6 +436,20 @@ buf_flush_try_page(
&& block && buf_flush_ready_for_flush(block, flush_type)) {
block->io_fix = BUF_IO_WRITE;
+
+ /* If AWE is enabled and the page is not mapped to a frame,
+ then map it */
+
+ if (block->frame == NULL) {
+ ut_a(srv_use_awe);
+
+ /* We set second parameter TRUE because the block is
+ in the LRU list and we must put it to
+ awe_LRU_free_mapped list once mapped to a frame */
+
+ buf_awe_map_page_to_frame(block, TRUE);
+ }
+
block->flush_type = flush_type;
if (buf_pool->n_flush[flush_type] == 0) {
@@ -486,6 +500,20 @@ buf_flush_try_page(
..._ready_for_flush). */
block->io_fix = BUF_IO_WRITE;
+
+ /* If AWE is enabled and the page is not mapped to a frame,
+ then map it */
+
+ if (block->frame == NULL) {
+ ut_a(srv_use_awe);
+
+ /* We set second parameter TRUE because the block is
+ in the LRU list and we must put it to
+ awe_LRU_free_mapped list once mapped to a frame */
+
+ buf_awe_map_page_to_frame(block, TRUE);
+ }
+
block->flush_type = flush_type;
if (buf_pool->n_flush[flush_type] == 0) {
@@ -511,6 +539,20 @@ buf_flush_try_page(
&& buf_flush_ready_for_flush(block, flush_type)) {
block->io_fix = BUF_IO_WRITE;
+
+ /* If AWE is enabled and the page is not mapped to a frame,
+ then map it */
+
+ if (block->frame == NULL) {
+ ut_a(srv_use_awe);
+
+ /* We set second parameter TRUE because the block is
+ in the LRU list and we must put it to
+ awe_LRU_free_mapped list once mapped to a frame */
+
+ buf_awe_map_page_to_frame(block, TRUE);
+ }
+
block->flush_type = flush_type;
if (buf_pool->n_flush[block->flush_type] == 0) {
diff --git a/innobase/buf/buf0lru.c b/innobase/buf/buf0lru.c
index 2ec1506c522..051aa0191f6 100644
--- a/innobase/buf/buf0lru.c
+++ b/innobase/buf/buf0lru.c
@@ -132,7 +132,13 @@ buf_LRU_search_and_free_block(
mutex_exit(&(buf_pool->mutex));
- btr_search_drop_page_hash_index(block->frame);
+ /* Remove possible adaptive hash index built on the
+ page; in the case of AWE the block may not have a
+ frame at all */
+
+ if (block->frame) {
+ btr_search_drop_page_hash_index(block->frame);
+ }
mutex_enter(&(buf_pool->mutex));
@@ -196,7 +202,9 @@ list. */
buf_block_t*
buf_LRU_get_free_block(void)
/*========================*/
- /* out: the free control block */
+ /* out: the free control block; also if AWE is
+ used, it is guaranteed that the block has its
+ page mapped to a frame when we return */
{
buf_block_t* block = NULL;
ibool freed;
@@ -257,6 +265,22 @@ loop:
block = UT_LIST_GET_FIRST(buf_pool->free);
UT_LIST_REMOVE(free, buf_pool->free, block);
+
+ if (srv_use_awe) {
+ if (block->frame) {
+ /* Remove from the list of mapped pages */
+
+ UT_LIST_REMOVE(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped, block);
+ } else {
+ /* We map the page to a frame; second param
+ FALSE below because we do not want it to be
+ added to the awe_LRU_free_mapped list */
+
+ buf_awe_map_page_to_frame(block, FALSE);
+ }
+ }
+
block->state = BUF_BLOCK_READY_FOR_USE;
mutex_exit(&(buf_pool->mutex));
@@ -429,6 +453,13 @@ buf_LRU_remove_block(
/* Remove the block from the LRU list */
UT_LIST_REMOVE(LRU, buf_pool->LRU, block);
+ if (srv_use_awe && block->frame) {
+ /* Remove from the list of mapped pages */
+
+ UT_LIST_REMOVE(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped, block);
+ }
+
/* If the LRU list is so short that LRU_old not defined, return */
if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
@@ -475,6 +506,13 @@ buf_LRU_add_block_to_end_low(
UT_LIST_ADD_LAST(LRU, buf_pool->LRU, block);
+ if (srv_use_awe && block->frame) {
+ /* Add to the list of mapped pages */
+
+ UT_LIST_ADD_LAST(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped, block);
+ }
+
if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) {
buf_pool->LRU_old_len++;
@@ -518,6 +556,15 @@ buf_LRU_add_block_low(
block->old = old;
cl = buf_pool_clock_tic();
+ if (srv_use_awe && block->frame) {
+ /* Add to the list of mapped pages; for simplicity we always
+ add to the start, even if the user would have set 'old'
+ TRUE */
+
+ UT_LIST_ADD_FIRST(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped, block);
+ }
+
if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) {
UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, block);
@@ -613,6 +660,13 @@ buf_LRU_block_free_non_file_page(
memset(block->frame, '\0', UNIV_PAGE_SIZE);
#endif
UT_LIST_ADD_FIRST(free, buf_pool->free, block);
+
+ if (srv_use_awe && block->frame) {
+ /* Add to the list of mapped pages */
+
+ UT_LIST_ADD_FIRST(awe_LRU_free_mapped,
+ buf_pool->awe_LRU_free_mapped, block);
+ }
}
/**********************************************************************
@@ -639,7 +693,9 @@ buf_LRU_block_remove_hashed_page(
buf_pool->freed_page_clock += 1;
- buf_frame_modify_clock_inc(block->frame);
+ /* Note that if AWE is enabled the block may not have a frame at all */
+
+ buf_block_modify_clock_inc(block);
HASH_DELETE(buf_block_t, hash, buf_pool->page_hash,
buf_page_address_fold(block->space, block->offset),
diff --git a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c
index 475a5bd9cbd..bb6670296b9 100644
--- a/innobase/buf/buf0rea.c
+++ b/innobase/buf/buf0rea.c
@@ -576,7 +576,7 @@ buf_read_recv_pages(
os_aio_print_debug = FALSE;
- while (buf_pool->n_pend_reads >= RECV_POOL_N_FREE_BLOCKS / 2) {
+ while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
os_aio_simulated_wake_handler_threads();
os_thread_sleep(500000);
diff --git a/innobase/include/btr0pcur.h b/innobase/include/btr0pcur.h
index 9d07dd0de18..81f19af4d40 100644
--- a/innobase/include/btr0pcur.h
+++ b/innobase/include/btr0pcur.h
@@ -466,6 +466,9 @@ struct btr_pcur_struct{
BTR_PCUR_AFTER, depending on whether
cursor was on, before, or after the
old_rec record */
+ buf_block_t* block_when_stored;/* buffer block when the position was
+ stored; note that if AWE is on, frames
+ may move */
dulint modify_clock; /* the modify clock value of the
buffer block when the cursor position
was stored */
diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h
index 395f88a2c7c..c7db3d9bcc9 100644
--- a/innobase/include/buf0buf.h
+++ b/innobase/include/buf0buf.h
@@ -30,6 +30,7 @@ Created 11/5/1995 Heikki Tuuri
#include "sync0rw.h"
#include "hash0hash.h"
#include "ut0byte.h"
+#include "os0proc.h"
/* Flags for flush types */
#define BUF_FLUSH_LRU 1
@@ -58,23 +59,34 @@ extern ibool buf_debug_prints;/* If this is set TRUE, the program
occurs */
/************************************************************************
-Initializes the buffer pool of the database. */
+Creates the buffer pool. */
-void
+buf_pool_t*
buf_pool_init(
/*==========*/
- ulint max_size, /* in: maximum size of the pool in blocks */
- ulint curr_size); /* in: current size to use, must be <=
+ /* out, own: buf_pool object, NULL if not
+ enough memory or error */
+ ulint max_size, /* in: maximum size of the buf_pool in
+ blocks */
+ ulint curr_size, /* in: current size to use, must be <=
+ max_size, currently must be equal to
max_size */
+ ulint n_frames); /* in: number of frames; if AWE is used,
+ this is the size of the address space window
+ where physical memory pages are mapped; if
+ AWE is not used then this must be the same
+ as max_size */
/*************************************************************************
-Gets the current size of buffer pool in bytes. */
+Gets the current size of the buffer pool in bytes. In the case of AWE, the
+size of the AWE window (= the frames). */
UNIV_INLINE
ulint
buf_pool_get_curr_size(void);
/*========================*/
/* out: size in bytes */
/*************************************************************************
-Gets the maximum size of buffer pool in bytes. */
+Gets the maximum size of buffer pool in bytes. In the case of AWE, the
+size of AWE window (= the frames). */
UNIV_INLINE
ulint
buf_pool_get_max_size(void);
@@ -138,8 +150,8 @@ improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed as LA! */
NOTE! The following macros should be used instead of
buf_page_optimistic_get_func, to improve debugging. Only values RW_S_LATCH and
RW_X_LATCH are allowed as LA! */
-#define buf_page_optimistic_get(LA, G, MC, MTR) buf_page_optimistic_get_func(\
- LA, G, MC, IB__FILE__, __LINE__, MTR)
+#define buf_page_optimistic_get(LA, BL, G, MC, MTR) buf_page_optimistic_get_func(\
+ LA, BL, G, MC, IB__FILE__, __LINE__, MTR)
/************************************************************************
This is the general function used to get optimistic access to a database
page. */
@@ -149,7 +161,9 @@ buf_page_optimistic_get_func(
/*=========================*/
/* out: TRUE if success */
ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */
- buf_frame_t* guess, /* in: guessed frame */
+ buf_block_t* block, /* in: guessed block */
+ buf_frame_t* guess, /* in: guessed frame; note that AWE may move
+ frames */
dulint modify_clock,/* in: modify clock value if mode is
..._GUESS_ON_CLOCK */
char* file, /* in: file name */
@@ -350,6 +364,16 @@ buf_frame_modify_clock_inc(
/* out: new value */
buf_frame_t* frame); /* in: pointer to a frame */
/************************************************************************
+Increments the modify clock of a block by 1. The caller must either (1) own
+the buf_pool mutex while the block's bufferfix count is zero, or (2) own an
+x-lock on the block. */
+UNIV_INLINE
+dulint
+buf_block_modify_clock_inc(
+/*=======================*/
+ /* out: new value */
+ buf_block_t* block); /* in: block */
+/************************************************************************
Returns the value of the modify clock. The caller must have an s-lock
or x-lock on the block. */
UNIV_INLINE
@@ -428,7 +452,7 @@ UNIV_INLINE
buf_frame_t*
buf_frame_align(
/*============*/
- /* out: pointer to block */
+ /* out: pointer to frame */
byte* ptr); /* in: pointer to a frame */
/***********************************************************************
Checks if a pointer points to the block array of the buffer pool (blocks, not
@@ -505,6 +529,19 @@ buf_pool_invalidate(void);
--------------------------- LOWER LEVEL ROUTINES -------------------------
=========================================================================*/
+/************************************************************************
+Maps the page of block to a frame, if not mapped yet. Unmaps some page
+from the end of the awe_LRU_free_mapped. */
+
+void
+buf_awe_map_page_to_frame(
+/*======================*/
+ buf_block_t* block, /* in: block whose page should be
+ mapped to a frame */
+ ibool add_to_mapped_list);/* in: TRUE if, in the case
+ we need to map the page, we should
+ also add the block to the
+ awe_LRU_free_mapped list */
/*************************************************************************
Adds latch level info for the rw-lock protecting the buffer frame. This
should be called in the debug version after a successful latching of a
@@ -638,7 +675,16 @@ struct buf_block_struct{
byte* frame; /* pointer to buffer frame which
is of size UNIV_PAGE_SIZE, and
aligned to an address divisible by
- UNIV_PAGE_SIZE */
+ UNIV_PAGE_SIZE; if AWE is used, this
+ will be NULL for the pages which are
+ currently not mapped into the virtual
+ address space window of the buffer
+ pool */
+ os_awe_t* awe_info; /* if AWE is used, then an array of
+ awe page infos for
+ UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE
+ (normally = 4) physical memory
+ pages; otherwise NULL */
ulint space; /* space id of the page */
ulint offset; /* page number within the space */
ulint lock_hash_val; /* hashed value of the page address
@@ -691,6 +737,10 @@ struct buf_block_struct{
/* node of the free block list */
UT_LIST_NODE_T(buf_block_t) LRU;
/* node of the LRU list */
+ UT_LIST_NODE_T(buf_block_t) awe_LRU_free_mapped;
+ /* in the AWE version node in the
+ list of free and LRU blocks which are
+ mapped to a frame */
ulint LRU_position; /* value which monotonically
decreases (or may stay constant if
the block is in the old blocks) toward
@@ -758,11 +808,12 @@ struct buf_block_struct{
BTR_SEARCH_RIGHT_SIDE in hash
indexing */
/* 6. Debug fields */
-
+#ifdef UNIV_SYNC_DEBUG
rw_lock_t debug_latch; /* in the debug version, each thread
which bufferfixes the block acquires
an s-latch here; so we can use the
debug utilities in sync0rw */
+#endif
ibool file_page_was_freed;
/* this is set to TRUE when fsp
frees a page in buffer pool */
@@ -781,16 +832,36 @@ struct buf_pool_struct{
struct and control blocks, except the
read-write lock in them */
byte* frame_mem; /* pointer to the memory area which
- was allocated for the frames */
+ was allocated for the frames; in AWE
+ this is the virtual address space
+ window where we map pages stored
+ in physical memory */
byte* frame_zero; /* pointer to the first buffer frame:
this may differ from frame_mem, because
this is aligned by the frame size */
- byte* high_end; /* pointer to the end of the
- buffer pool */
+ byte* high_end; /* pointer to the end of the buffer
+ frames */
+ ulint n_frames; /* number of frames */
buf_block_t* blocks; /* array of buffer control blocks */
+ buf_block_t** blocks_of_frames;/* inverse mapping which can be used
+ to retrieve the buffer control block
+ of a frame; this is an array which
+ lists the blocks of frames in the
+ order frame_zero,
+ frame_zero + UNIV_PAGE_SIZE, ...
+ a control block is always assigned
+ for each frame, even if the frame does
+ not contain any data; note that in AWE
+ there are more control blocks than
+ buffer frames */
+ os_awe_t* awe_info; /* if AWE is used, AWE info for the
+ physical 4 kB memory pages associated
+ with buffer frames */
ulint max_size; /* number of control blocks ==
maximum pool size in pages */
- ulint curr_size; /* current pool size in pages */
+ ulint curr_size; /* current pool size in pages;
+ currently always the same as
+ max_size */
hash_table_t* page_hash; /* hash table of the file pages */
ulint n_pend_reads; /* number of pending read operations */
@@ -802,11 +873,14 @@ struct buf_pool_struct{
ulint n_pages_created;/* number of pages created in the pool
with no read */
ulint n_page_gets; /* number of page gets performed;
- also successful seraches through
+ also successful searches through
the adaptive hash index are
counted as page gets; this field
is NOT protected by the buffer
pool mutex */
+ ulint n_pages_awe_remapped; /* if AWE is enabled, the
+ number of remaps of blocks to
+ buffer frames */
ulint n_page_gets_old;/* n_page_gets when buf_print was
last time called: used to calculate
hit rate */
@@ -815,6 +889,7 @@ struct buf_pool_struct{
ulint n_pages_written_old;/* number write operations */
ulint n_pages_created_old;/* number of pages created in
the pool with no read */
+ ulint n_pages_awe_remapped_old;
/* 2. Page flushing algorithm fields */
UT_LIST_BASE_NODE_T(buf_block_t) flush_list;
@@ -847,7 +922,10 @@ struct buf_pool_struct{
/* 3. LRU replacement algorithm fields */
UT_LIST_BASE_NODE_T(buf_block_t) free;
- /* base node of the free block list */
+ /* base node of the free block list;
+ in the case of AWE, at the start are
+ always free blocks for which the
+ physical memory is mapped to a frame */
UT_LIST_BASE_NODE_T(buf_block_t) LRU;
/* base node of the LRU list */
buf_block_t* LRU_old; /* pointer to the about 3/8 oldest
@@ -859,6 +937,12 @@ struct buf_pool_struct{
see buf0lru.c for the restrictions
on this value; not defined if
LRU_old == NULL */
+ UT_LIST_BASE_NODE_T(buf_block_t) awe_LRU_free_mapped;
+ /* list of those blocks which are
+ in the LRU list or the free list, and
+ where the page is mapped to a frame;
+ thus, frames allocated, e.g., to the
+ lock table, are not in this list */
};
/* States of a control block */
diff --git a/innobase/include/buf0buf.ic b/innobase/include/buf0buf.ic
index 7227c79dc6a..d4e7122f3f9 100644
--- a/innobase/include/buf0buf.ic
+++ b/innobase/include/buf0buf.ic
@@ -36,25 +36,27 @@ buf_block_peek_if_too_old(
}
/*************************************************************************
-Gets the current size of buffer buf_pool in bytes. */
+Gets the current size of buffer buf_pool in bytes. In the case of AWE, the
+size of AWE window (= the frames). */
UNIV_INLINE
ulint
buf_pool_get_curr_size(void)
/*========================*/
/* out: size in bytes */
{
- return((buf_pool->curr_size) * UNIV_PAGE_SIZE);
+ return((buf_pool->n_frames) * UNIV_PAGE_SIZE);
}
/*************************************************************************
-Gets the maximum size of buffer buf_pool in bytes. */
+Gets the maximum size of buffer buf_pool in bytes. In the case of AWE, the
+size of AWE window (= the frames). */
UNIV_INLINE
ulint
buf_pool_get_max_size(void)
/*=======================*/
/* out: size in bytes */
{
- return((buf_pool->max_size) * UNIV_PAGE_SIZE);
+ return((buf_pool->n_frames) * UNIV_PAGE_SIZE);
}
/***********************************************************************
@@ -207,54 +209,24 @@ buf_block_align(
frame_zero = buf_pool->frame_zero;
- ut_ad((ulint)ptr >= (ulint)frame_zero);
-
- block = buf_pool_get_nth_block(buf_pool, ((ulint)(ptr - frame_zero))
- >> UNIV_PAGE_SIZE_SHIFT);
- if (block < buf_pool->blocks
- || block >= buf_pool->blocks + buf_pool->max_size) {
+ if ((ulint)ptr < (ulint)frame_zero
+ || (ulint)ptr > (ulint)(buf_pool->high_end)) {
+ ut_print_timestamp(stderr);
fprintf(stderr,
-"InnoDB: Error: trying to access a stray pointer %lx\n"
-"InnoDB: buf pool start is at %lx, number of pages %lu\n", (ulint)ptr,
- (ulint)frame_zero, buf_pool->max_size);
+" InnoDB: Error: trying to access a stray pointer %lx\n"
+"InnoDB: buf pool start is at %lx, end at %lx\n"
+"InnoDB: Probable reason is database corruption or memory\n"
+"InnoDB: corruption. If this happens in an InnoDB database recovery,\n"
+"InnoDB: you can look from section 6.1 at http://www.innodb.com/ibman.html\n"
+"InnoDB: how to force recovery.\n",
+ (ulint)ptr, (ulint)frame_zero,
+ (ulint)(buf_pool->high_end));
ut_a(0);
}
-
- return(block);
-}
-
-/***********************************************************************
-Gets the block to whose frame the pointer is pointing to. Does not
-require a file page to be bufferfixed. */
-UNIV_INLINE
-buf_block_t*
-buf_block_align_low(
-/*================*/
- /* out: pointer to block */
- byte* ptr) /* in: pointer to a frame */
-{
- buf_block_t* block;
- buf_frame_t* frame_zero;
-
- ut_ad(ptr);
-
- frame_zero = buf_pool->frame_zero;
-
- ut_ad((ulint)ptr >= (ulint)frame_zero);
-
- block = buf_pool_get_nth_block(buf_pool, ((ulint)(ptr - frame_zero))
- >> UNIV_PAGE_SIZE_SHIFT);
- if (block < buf_pool->blocks
- || block >= buf_pool->blocks + buf_pool->max_size) {
-
- fprintf(stderr,
-"InnoDB: Error: trying to access a stray pointer %lx\n"
-"InnoDB: buf pool start is at %lx, number of pages %lu\n", (ulint)ptr,
- (ulint)frame_zero, buf_pool->max_size);
- ut_a(0);
- }
-
+
+ block = *(buf_pool->blocks_of_frames + (((ulint)(ptr - frame_zero))
+ >> UNIV_PAGE_SIZE_SHIFT));
return(block);
}
@@ -264,7 +236,7 @@ UNIV_INLINE
buf_frame_t*
buf_frame_align(
/*============*/
- /* out: pointer to block */
+ /* out: pointer to frame */
byte* ptr) /* in: pointer to a frame */
{
buf_frame_t* frame;
@@ -273,14 +245,19 @@ buf_frame_align(
frame = ut_align_down(ptr, UNIV_PAGE_SIZE);
- if (((ulint)frame
- < (ulint)(buf_pool->frame_zero))
- || ((ulint)frame > (ulint)(buf_pool_get_nth_block(buf_pool,
- buf_pool->max_size - 1)->frame))) {
+ if (((ulint)frame < (ulint)(buf_pool->frame_zero))
+ || (ulint)frame >= (ulint)(buf_pool->high_end)) {
+
+ ut_print_timestamp(stderr);
fprintf(stderr,
-"InnoDB: Error: trying to access a stray pointer %lx\n"
-"InnoDB: buf pool start is at %lx, number of pages %lu\n", (ulint)ptr,
- (ulint)(buf_pool->frame_zero), buf_pool->max_size);
+" InnoDB: Error: trying to access a stray pointer %lx\n"
+"InnoDB: buf pool start is at %lx, end at %lx\n"
+"InnoDB: Probable reason is database corruption or memory\n"
+"InnoDB: corruption. If this happens in an InnoDB database recovery,\n"
+"InnoDB: you can look from section 6.1 at http://www.innodb.com/ibman.html\n"
+"InnoDB: how to force recovery.\n",
+ (ulint)ptr, (ulint)(buf_pool->frame_zero),
+ (ulint)(buf_pool->high_end));
ut_a(0);
}
@@ -469,7 +446,7 @@ buf_frame_modify_clock_inc(
ut_ad(frame);
- block = buf_block_align_low(frame);
+ block = buf_block_align(frame);
ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0))
|| rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
@@ -480,6 +457,25 @@ buf_frame_modify_clock_inc(
}
/************************************************************************
+Increments the modify clock of a block by 1 and returns the new value. The
+caller must either (1) own the buf_pool mutex while the block's bufferfix
+count is zero, or (2) own an x-lock on the block; this precondition is what
+the ut_ad() assertion in the function body checks. */
+UNIV_INLINE
+dulint
+buf_block_modify_clock_inc(
+/*=======================*/
+ /* out: new value of the modify clock */
+ buf_block_t* block) /* in: block whose modify clock is incremented */
+{
+ ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0))
+ || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
+
+ UT_DULINT_INC(block->modify_clock);
+
+ return(block->modify_clock);
+}
+
+/************************************************************************
Returns the value of the modify clock. The caller must have an s-lock
or x-lock on the block. */
UNIV_INLINE
@@ -508,15 +504,16 @@ void
buf_block_buf_fix_inc_debug(
/*========================*/
buf_block_t* block, /* in: block to bufferfix */
- char* file, /* in: file name */
- ulint line) /* in: line */
+ char* file __attribute__ ((unused)), /* in: file name */
+ ulint line __attribute__ ((unused))) /* in: line */
{
+#ifdef UNIV_SYNC_DEBUG
ibool ret;
-
+
ret = rw_lock_s_lock_func_nowait(&(block->debug_latch), file, line);
ut_ad(ret == TRUE);
-
+#endif
block->buf_fix_count++;
}
diff --git a/innobase/include/buf0lru.h b/innobase/include/buf0lru.h
index 946b6c4e31d..6a3c948507d 100644
--- a/innobase/include/buf0lru.h
+++ b/innobase/include/buf0lru.h
@@ -53,7 +53,9 @@ LRU list to the free list. */
buf_block_t*
buf_LRU_get_free_block(void);
/*=========================*/
- /* out: the free control block */
+ /* out: the free control block; also if AWE is
+ used, it is guaranteed that the block has its
+ page mapped to a frame when we return */
/**********************************************************************
Puts a block back to the free list. */
diff --git a/innobase/include/log0recv.h b/innobase/include/log0recv.h
index 7418e4abf1b..bef42cfec1c 100644
--- a/innobase/include/log0recv.h
+++ b/innobase/include/log0recv.h
@@ -355,12 +355,7 @@ in the debug version: spaces with an odd number as the id are replicate
spaces */
#define RECV_REPLICA_SPACE_ADD 1
-/* This many blocks must be left free in the buffer pool when we scan
-the log and store the scanned log records in the buffer pool: we will
-use these free blocks to read in pages when we start applying the
-log records to the database. */
-
-#define RECV_POOL_N_FREE_BLOCKS (ut_min(256, buf_pool_get_curr_size() / 8))
+extern ulint recv_n_pool_free_frames;
#ifndef UNIV_NONINL
#include "log0recv.ic"
diff --git a/innobase/include/os0proc.h b/innobase/include/os0proc.h
index 79750e5c1f7..08510db4366 100644
--- a/innobase/include/os0proc.h
+++ b/innobase/include/os0proc.h
@@ -15,6 +15,76 @@ Created 9/30/1995 Heikki Tuuri
typedef void* os_process_t;
typedef unsigned long int os_process_id_t;
+/* The cell type in os_awe_allocate_mem page info */
+#ifdef __NT__
+typedef ULONG_PTR os_awe_t;
+#else
+typedef ulint os_awe_t;
+#endif
+
+/* Physical page size when Windows AWE is used. This is the normal
+page size of an Intel x86 processor. We cannot use AWE with 2 MB or 4 MB
+pages. */
+#define OS_AWE_X86_PAGE_SIZE 4096
+
+/********************************************************************
+Windows AWE support. Tries to enable the "lock pages in memory" privilege for
+the current process so that the current process can allocate memory-locked
+virtual address space to act as the window where AWE maps physical memory. */
+
+ibool
+os_awe_enable_lock_pages_in_mem(void);
+/*=================================*/
+ /* out: TRUE if success, FALSE if error;
+ prints error info to stderr if no success */
+/********************************************************************
+Allocates physical RAM memory up to 64 GB in an Intel 32-bit x86
+processor. */
+
+ibool
+os_awe_allocate_physical_mem(
+/*=========================*/
+ /* out: TRUE if success */
+ os_awe_t** page_info, /* out, own: array of opaque data containing
+ the info for allocated physical memory pages;
+ each allocated 4 kB physical memory page has
+ one slot of type os_awe_t in the array */
+ ulint n_megabytes); /* in: number of megabytes to allocate */
+/********************************************************************
+Allocates a window in the virtual address space where we can then map
+pages of physical memory. */
+
+byte*
+os_awe_allocate_virtual_mem_window(
+/*===============================*/
+ /* out, own: allocated memory, or NULL if did not
+ succeed */
+ ulint size); /* in: virtual memory allocation size in bytes, must
+ be < 2 GB */
+/********************************************************************
+With this function you can map parts of physical memory allocated with
+the ..._allocate_physical_mem to the virtual address space allocated with
+the previous function. Intel implements this so that the process page
+tables are updated accordingly. A test on a 1.5 GHz AMD processor and XP
+showed that this takes < 1 microsecond, much better than the estimated 80 us
+for copying a 16 kB page memory to memory. But, the operation will at least
+partially invalidate the translation lookaside buffer (TLB) of all
+processors. Under a real-world load the performance hit may be bigger. */
+
+ibool
+os_awe_map_physical_mem_to_window(
+/*==============================*/
+ /* out: TRUE if success; the function
+ calls exit(1) in case of an error */
+ byte* ptr, /* in: a page-aligned pointer to
+ somewhere in the virtual address
+ space window; we map the physical mem
+ pages here */
+ ulint n_mem_pages, /* in: number of 4 kB mem pages to
+ map */
+ os_awe_t* page_info); /* in: array of page infos for those
+ pages; each page has one slot in the
+ array */
/********************************************************************
Converts the current process id to a number. It is not guaranteed that the
number is unique. In Linux returns the 'process number' of the current
diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h
index ad6f71f7a3a..bc0960ae023 100644
--- a/innobase/include/srv0srv.h
+++ b/innobase/include/srv0srv.h
@@ -61,6 +61,7 @@ extern ulint srv_flush_log_at_trx_commit;
extern byte srv_latin1_ordering[256];/* The sort order table of the latin1
character set */
extern ulint srv_pool_size;
+extern ulint srv_awe_window_size;
extern ulint srv_mem_pool_size;
extern ulint srv_lock_table_size;
@@ -86,6 +87,8 @@ extern ibool srv_use_doublewrite_buf;
extern ibool srv_set_thread_priorities;
extern int srv_query_thread_priority;
+extern ibool srv_use_awe;
+extern ibool srv_use_adaptive_hash_indexes;
/*-------------------------------------------*/
extern ulint srv_n_rows_inserted;
diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c
index f9b785ccbd5..bdfce783a43 100644
--- a/innobase/log/log0log.c
+++ b/innobase/log/log0log.c
@@ -437,25 +437,29 @@ log_group_calc_lsn_offset(
dulint lsn, /* in: lsn, must be within 4 GB of group->lsn */
log_group_t* group) /* in: log group */
{
- dulint gr_lsn;
- ulint gr_lsn_size_offset;
- ulint difference;
- ulint group_size;
- ulint offset;
+ dulint gr_lsn;
+ ib_longlong gr_lsn_size_offset;
+ ib_longlong difference;
+ ib_longlong group_size;
+ ib_longlong offset;
ut_ad(mutex_own(&(log_sys->mutex)));
+ /* If total log file size is > 2 GB we can easily get overflows
+ with 32-bit integers. Use 64-bit integers instead. */
+
gr_lsn = group->lsn;
- gr_lsn_size_offset = log_group_calc_size_offset(group->lsn_offset,
- group);
- group_size = log_group_get_capacity(group);
+ gr_lsn_size_offset = (ib_longlong)
+ log_group_calc_size_offset(group->lsn_offset, group);
+
+ group_size = (ib_longlong) log_group_get_capacity(group);
if (ut_dulint_cmp(lsn, gr_lsn) >= 0) {
- difference = ut_dulint_minus(lsn, gr_lsn);
+ difference = (ib_longlong) ut_dulint_minus(lsn, gr_lsn);
} else {
- difference = ut_dulint_minus(gr_lsn, lsn);
+ difference = (ib_longlong) ut_dulint_minus(gr_lsn, lsn);
difference = difference % group_size;
@@ -464,7 +468,13 @@ log_group_calc_lsn_offset(
offset = (gr_lsn_size_offset + difference) % group_size;
- return(log_group_calc_real_offset(offset, group));
+ ut_a(offset <= 0xFFFFFFFF);
+
+ /* printf("Offset is %lu gr_lsn_offset is %lu difference is %lu\n",
+ (ulint)offset,(ulint)gr_lsn_size_offset, (ulint)difference);
+ */
+
+ return(log_group_calc_real_offset((ulint)offset, group));
}
/***********************************************************************
@@ -3054,8 +3064,8 @@ log_check_log_recs(
ut_memcpy(scan_buf, start, end - start);
recv_scan_log_recs(TRUE,
- buf_pool_get_curr_size() -
- RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE,
+ (buf_pool->n_frames -
+ recv_n_pool_free_frames) * UNIV_PAGE_SIZE,
FALSE, scan_buf, end - start,
ut_dulint_align_down(buf_start_lsn,
OS_FILE_LOG_BLOCK_SIZE),
diff --git a/innobase/log/log0recv.c b/innobase/log/log0recv.c
index dfe67c444b4..3945b47933d 100644
--- a/innobase/log/log0recv.c
+++ b/innobase/log/log0recv.c
@@ -71,6 +71,14 @@ ulint recv_previous_parsed_rec_is_multi = 0;
ulint recv_max_parsed_page_no = 0;
+/* This many frames must be left free in the buffer pool when we scan
+the log and store the scanned log records in the buffer pool: we will
+use these free frames to read in pages when we start applying the
+log records to the database. */
+
+ulint recv_n_pool_free_frames = 256;
+
+
/************************************************************
Creates the recovery system. */
@@ -1018,10 +1026,10 @@ recv_recover_page(
block = buf_block_align(page);
if (just_read_in) {
- /* Move the ownership of the x-latch on the page to this OS
- thread, so that we can acquire a second x-latch on it. This
- is needed for the operations to the page to pass the debug
- checks. */
+ /* Move the ownership of the x-latch on the page to
+ this OS thread, so that we can acquire a second
+ x-latch on it. This is needed for the operations to
+ the page to pass the debug checks. */
rw_lock_x_lock_move_ownership(&(block->lock));
}
@@ -2362,8 +2370,8 @@ recv_group_scan_log_recs(
group, start_lsn, end_lsn);
finished = recv_scan_log_recs(TRUE,
- buf_pool_get_curr_size()
- - RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE,
+ (buf_pool->n_frames
+ - recv_n_pool_free_frames) * UNIV_PAGE_SIZE,
TRUE, log_sys->buf,
RECV_SCAN_SIZE, start_lsn,
contiguous_lsn, group_scanned_lsn);
@@ -3001,8 +3009,8 @@ ask_again:
read_offset % UNIV_PAGE_SIZE, len, buf, NULL);
ret = recv_scan_log_recs(TRUE,
- buf_pool_get_curr_size() -
- RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE,
+ (buf_pool->n_frames -
+ recv_n_pool_free_frames) * UNIV_PAGE_SIZE,
TRUE, buf, len, start_lsn,
&dummy_lsn, &scanned_lsn);
diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c
index fa5482a8cd1..9eae358c7fb 100644
--- a/innobase/os/os0file.c
+++ b/innobase/os/os0file.c
@@ -2127,7 +2127,7 @@ os_aio_simulated_handle(
ulint offs;
ulint lowest_offset;
byte* combined_buf;
- byte* combined_buf2= 0; /* Remove warning */
+ byte* combined_buf2;
ibool ret;
ulint n;
ulint i;
diff --git a/innobase/os/os0proc.c b/innobase/os/os0proc.c
index 1ee448a4a44..61db7bd13b2 100644
--- a/innobase/os/os0proc.c
+++ b/innobase/os/os0proc.c
@@ -12,11 +12,469 @@ Created 9/30/1995 Heikki Tuuri
#include "os0proc.ic"
#endif
+#include "ut0mem.h"
+#include "ut0byte.h"
+
+
+/*
+How to get AWE to compile on Windows?
+-------------------------------------
+
+the Visual C++ has to be relatively recent and _WIN32_WINNT has to be
+defined to a value >= 0x0500 when windows.h is included. An easy way
+to accomplish that is to put
+
+#define _WIN32_WINNT 0x0500
+
+to the start of file \mysql\include\config-win.h
+
+Where does AWE work?
+-------------------
+
+See the error message in os_awe_allocate_physical_mem().
+
+How to assign privileges for mysqld to use AWE?
+-----------------------------------------------
+
+See the error message in os_awe_enable_lock_pages_in_mem().
+
+Use Windows AWE functions in this order
+---------------------------------------
+
+(1) os_awe_enable_lock_pages_in_mem();
+(2) os_awe_allocate_physical_mem();
+(3) os_awe_allocate_virtual_mem_window();
+(4) os_awe_map_physical_mem_to_window().
+
+To test 'AWE' in a computer which does not have the AWE API,
+you can compile with UNIV_SIMULATE_AWE defined in this file.
+*/
+
+#ifdef UNIV_SIMULATE_AWE
+/* If we simulate AWE, we allocate the 'physical memory' here */
+byte* os_awe_simulate_mem;
+ulint os_awe_simulate_mem_size;
+os_awe_t* os_awe_simulate_page_info;
+byte* os_awe_simulate_window;
+ulint os_awe_simulate_window_size;
+/* In simulated AWE the following contains a NULL pointer or a pointer
+to a mapped 'physical page' for each 4 kB page in the AWE window */
+byte** os_awe_simulate_map;
+#endif
+
+#ifdef __NT__
+os_awe_t* os_awe_page_info;
+ulint os_awe_n_pages;
+byte* os_awe_window;
+ulint os_awe_window_size;
+#endif
+
+/********************************************************************
+Windows AWE support. Tries to enable the "lock pages in memory" privilege for
+the current process so that the current process can allocate memory-locked
+virtual address space to act as the window where AWE maps physical memory.
+The process token opened here is closed again on every return path. */
+
+ibool
+os_awe_enable_lock_pages_in_mem(void)
+/*=================================*/
+				/* out: TRUE if success, FALSE if error;
+				prints error info to stderr if no success */
+{
+#ifdef UNIV_SIMULATE_AWE
+
+	/* Simulated AWE needs no privileges */
+
+	return(TRUE);
+
+#elif defined(__NT__)
+	struct {
+		DWORD			Count;
+		LUID_AND_ATTRIBUTES	Privilege[1];
+	} Info;
+	HANDLE	hProcess;
+	HANDLE	Token;
+	BOOL	Result;
+	DWORD	err;
+
+	hProcess = GetCurrentProcess();
+
+	/* Open the token of the current process */
+
+	Result = OpenProcessToken(hProcess,
+				TOKEN_ADJUST_PRIVILEGES,
+				&Token);
+	if (Result != TRUE) {
+		fprintf(stderr,
+			"InnoDB: AWE: Cannot open process token, error %lu\n",
+			(ulint)GetLastError());
+		return(FALSE);
+	}
+
+	Info.Count = 1;
+
+	Info.Privilege[0].Attributes = SE_PRIVILEGE_ENABLED;
+
+	/* Get the local unique identifier (LUID) of the SE_LOCK_MEMORY
+	privilege */
+
+	Result = LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME,
+				&(Info.Privilege[0].Luid));
+	if (Result != TRUE) {
+		/* Save the error code first so that closing the token
+		cannot disturb it */
+
+		err = GetLastError();
+		CloseHandle(Token);
+
+		fprintf(stderr,
+	"InnoDB: AWE: Cannot get local privilege value for %s, error %lu.\n",
+			SE_LOCK_MEMORY_NAME, (ulint)err);
+
+		return(FALSE);
+	}
+
+	/* Try to adjust the privilege */
+
+	Result = AdjustTokenPrivileges(Token, FALSE,
+				(PTOKEN_PRIVILEGES)&Info,
+				0, NULL, NULL);
+
+	/* The error code is inspected even when the call returns TRUE,
+	because a TRUE return does not yet mean that the privilege was
+	enabled. Fetch the code before closing the token; after this point
+	the token is no longer needed on any path. */
+
+	err = GetLastError();
+	CloseHandle(Token);
+
+	/* Check the result */
+
+	if (Result != TRUE) {
+		fprintf(stderr,
+	"InnoDB: AWE: Cannot adjust process token privileges, error %lu.\n",
+			(ulint)err);
+		return(FALSE);
+	}
+
+	if (err != ERROR_SUCCESS) {
+		fprintf(stderr,
+"InnoDB: AWE: Cannot enable SE_LOCK_MEMORY privilege, error %lu.\n"
+"InnoDB: In Windows XP Home you cannot use AWE. In Windows 2000 and XP\n"
+"InnoDB: Professional you must go to the Control Panel, to\n"
+"InnoDB: Security Settings, to Local Policies, and enable\n"
+"InnoDB: the 'lock pages in memory' privilege for the user who runs\n"
+"InnoDB: the MySQL server.\n", (ulint)err);
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+#else
#ifdef __WIN__
-#include <windows.h>
+	fprintf(stderr,
+"InnoDB: AWE: Error: to use AWE you must use a ...-nt MySQL executable.\n");
+#endif
+	return(FALSE);
#endif
+}
-#include "ut0mem.h"
+/********************************************************************
+Allocates physical RAM memory up to 64 GB in an Intel 32-bit x86
+processor. In the __NT__ build the "lock pages in memory" privilege is
+enabled first, then the page info array is allocated from the process heap,
+and finally the physical pages themselves are requested. */
+
+ibool
+os_awe_allocate_physical_mem(
+/*=========================*/
+				/* out: TRUE if success */
+	os_awe_t** page_info,	/* out, own: array of opaque data containing
+				the info for allocated physical memory pages;
+				each allocated 4 kB physical memory page has
+				one slot of type os_awe_t in the array */
+	ulint	  n_megabytes)	/* in: number of megabytes to allocate */
+{
+#ifdef UNIV_SIMULATE_AWE
+	/* In the simulation the 'physical memory' is ordinary heap memory
+	aligned to 4096 bytes, with one page info slot per 4 kB page */
+
+	os_awe_simulate_page_info = ut_malloc(sizeof(os_awe_t) *
+		n_megabytes * ((1024 * 1024) / OS_AWE_X86_PAGE_SIZE));
+
+	os_awe_simulate_mem = ut_align(ut_malloc(
+					4096 + 1024 * 1024 * n_megabytes),
+					4096);
+	os_awe_simulate_mem_size = n_megabytes * 1024 * 1024;
+
+	*page_info = os_awe_simulate_page_info;
+
+	return(TRUE);
+
+#elif defined(__NT__)
+	BOOL		bResult;
+	ULONG_PTR	NumberOfPages;	/* Question: why does Windows
+					use the name ULONG_PTR for
+					a scalar integer type? Maybe
+					because we may also refer to
+					&NumberOfPages? */
+	ULONG_PTR	NumberOfPagesInitial;
+	SYSTEM_INFO	sSysInfo;
+	int		PFNArraySize;
+
+	if (n_megabytes > 64 * 1024) {
+
+		fprintf(stderr,
+"InnoDB: AWE: Error: tried to allocate %lu MB.\n"
+"InnoDB: AWE cannot allocate more than 64 GB in any computer.\n", n_megabytes);
+
+		return(FALSE);
+	}
+
+	GetSystemInfo(&sSysInfo);  /* fill the system information structure */
+
+	if ((ulint)OS_AWE_X86_PAGE_SIZE != (ulint)sSysInfo.dwPageSize) {
+		fprintf(stderr,
+"InnoDB: AWE: Error: this computer has a page size of %lu.\n"
+"InnoDB: Should be 4096 bytes for InnoDB AWE support to work.\n",
+			(ulint)sSysInfo.dwPageSize);
+
+		return(FALSE);
+	}
+
+	/* Enable this process' privilege to lock pages to physical memory.
+	This is done before anything is allocated, so that a missing
+	privilege does not leave an orphaned page info array behind. */
+
+	if (!os_awe_enable_lock_pages_in_mem()) {
+
+		return(FALSE);
+	}
+
+	/* Calculate the number of pages of memory to request */
+
+	NumberOfPages = n_megabytes * ((1024 * 1024) / OS_AWE_X86_PAGE_SIZE);
+
+	/* Calculate the size of page_info for allocated physical pages */
+
+	PFNArraySize = NumberOfPages * sizeof(ULONG_PTR);
+
+	*page_info = (ULONG_PTR*)HeapAlloc(GetProcessHeap(), 0, PFNArraySize);
+
+	if (*page_info == NULL) {
+		fprintf(stderr,
+"InnoDB: AWE: Failed to allocate page info array from process heap, error %lu\n",
+			(ulint)GetLastError());
+
+		return(FALSE);
+	}
+
+	ut_total_allocated_memory += PFNArraySize;
+
+	/* Allocate the physical memory */
+
+	NumberOfPagesInitial = NumberOfPages;
+
+	os_awe_page_info = *page_info;
+	os_awe_n_pages = (ulint)NumberOfPages;
+
+	/* Compilation note: if the compiler complains the function is not
+	defined, see the note at the start of this file */
+
+	/* NOTE(review): on the two failure paths below the page info array
+	(and, for a partial allocation, the pages already obtained) are not
+	released; presumably the caller gives up on startup when FALSE is
+	returned - confirm against the caller. */
+
+	bResult = AllocateUserPhysicalPages(GetCurrentProcess(),
+						&NumberOfPages,
+						*page_info);
+	if (bResult != TRUE) {
+		fprintf(stderr,
+"InnoDB: AWE: Cannot allocate physical pages, error %lu.\n",
+			(ulint)GetLastError());
+
+		return(FALSE);
+	}
+
+	/* The call updates NumberOfPages to the number of pages it really
+	allocated: anything less than requested is treated as an error */
+
+	if (NumberOfPagesInitial != NumberOfPages) {
+		fprintf(stderr,
+"InnoDB: AWE: Error: allocated only %lu pages of %lu requested.\n"
+"InnoDB: Check that you have enough free RAM.\n"
+"InnoDB: In Windows XP Professional and 2000 Professional\n"
+"InnoDB: Windows PAE size is max 4 GB. In 2000 and .NET\n"
+"InnoDB: Advanced Servers and 2000 Datacenter Server it is 32 GB,\n"
+"InnoDB: and in .NET Datacenter Server it is 64 GB.\n"
+"InnoDB: A Microsoft web page said that the processor must be an Intel\n"
+"InnoDB: processor.\n",
+			(ulint)NumberOfPages,
+			(ulint)NumberOfPagesInitial);
+
+		return(FALSE);
+	}
+
+	fprintf(stderr,
+"InnoDB: Using Address Windowing Extensions (AWE); allocated %lu MB\n",
+		n_megabytes);
+
+	return(TRUE);
+#else
+	return(FALSE);
+#endif
+}
+
+/********************************************************************
+Allocates a window in the virtual address space where we can then map
+pages of physical memory. In the __NT__ build the window start and size
+are also remembered in os_awe_window and os_awe_window_size. */
+
+byte*
+os_awe_allocate_virtual_mem_window(
+/*===============================*/
+			/* out, own: allocated memory, or NULL if did not
+			succeed */
+	ulint	size)	/* in: virtual memory allocation size in bytes, must
+			be < 2 GB */
+{
+#ifdef UNIV_SIMULATE_AWE
+	ulint	i;
+
+	/* The simulated window is ordinary heap memory aligned to 4096
+	bytes; the map has one slot per 4 kB page of the window and starts
+	out with nothing mapped */
+
+	os_awe_simulate_window = ut_align(ut_malloc(4096 + size), 4096);
+	os_awe_simulate_window_size = size;
+
+	os_awe_simulate_map = ut_malloc(sizeof(byte*) * (size / 4096));
+
+	for (i = 0; i < (size / 4096); i++) {
+		*(os_awe_simulate_map + i) = NULL;
+	}
+
+	return(os_awe_simulate_window);
+
+#elif defined(__NT__)
+	byte*	ptr;
+
+	/* Reject windows of 2 GB or more. The limit is 0x7FFFFFFF (8 hex
+	digits): a longer constant does not fit in a 32-bit ulint and
+	would make this check impossible to trigger. */
+
+	if (size > 0x7FFFFFFFUL) {
+		fprintf(stderr,
+"InnoDB: AWE: Cannot allocate %lu bytes of virtual memory\n", size);
+
+		return(NULL);
+	}
+
+	ptr = VirtualAlloc(NULL, (SIZE_T)size, MEM_RESERVE | MEM_PHYSICAL,
+							PAGE_READWRITE);
+	if (ptr == NULL) {
+		fprintf(stderr,
+"InnoDB: AWE: Cannot allocate %lu bytes of virtual memory, error %lu\n",
+			size, (ulint)GetLastError());
+
+		return(NULL);
+	}
+
+	os_awe_window = ptr;
+	os_awe_window_size = size;
+
+	ut_total_allocated_memory += size;
+
+	return(ptr);
+#else
+	return(NULL);
+#endif
+}
+
+/********************************************************************
+With this function you can map parts of physical memory allocated with
+the ..._allocate_physical_mem to the virtual address space allocated with
+the previous function. Intel implements this so that the process page
+tables are updated accordingly. A test on a 1.5 GHz AMD processor and XP
+showed that this takes < 1 microsecond, much better than the estimated 80 us
+for copying a 16 kB page memory to memory. But, the operation will at least
+partially invalidate the translation lookaside buffer (TLB) of all
+processors. Under a real-world load the performance hit may be bigger.
+
+In the __NT__ build the whole range of n_mem_pages pages is checked to lie
+inside the AWE window, and the n_mem_pages page info slots inside the page
+info array, before the mapping is attempted; a violation is an assertion
+failure. */
+
+ibool
+os_awe_map_physical_mem_to_window(
+/*==============================*/
+					/* out: TRUE if success; the function
+					calls exit(1) in case of an error */
+	byte*		ptr,		/* in: a page-aligned pointer to
+					somewhere in the virtual address
+					space window; we map the physical mem
+					pages here */
+	ulint		n_mem_pages,	/* in: number of 4 kB mem pages to
+					map */
+	os_awe_t*	page_info)	/* in: array of page infos for those
+					pages; each page has one slot in the
+					array */
+{
+#ifdef UNIV_SIMULATE_AWE
+	ulint	i;
+	byte**	map;
+	byte*	page;
+	byte*	phys_page;
+
+	ut_a(ptr >= os_awe_simulate_window);
+	ut_a(ptr < os_awe_simulate_window + os_awe_simulate_window_size);
+	ut_a(page_info >= os_awe_simulate_page_info);
+	ut_a(page_info < os_awe_simulate_page_info +
+					(os_awe_simulate_mem_size / 4096));
+
+	/* First look if some other 'physical pages' are mapped at ptr,
+	and copy them back to where they were if yes */
+
+	map = os_awe_simulate_map
+			+ ((ulint)(ptr - os_awe_simulate_window)) / 4096;
+	page = ptr;
+
+	for (i = 0; i < n_mem_pages; i++) {
+		if (*map != NULL) {
+			ut_memcpy(*map, page, 4096);
+		}
+		map++;
+		page += 4096;
+	}
+
+	/* Then copy to ptr the 'physical pages' determined by page_info; we
+	assume page_info is a segment of the array we created at the start */
+
+	phys_page = os_awe_simulate_mem
+			+ (ulint)(page_info - os_awe_simulate_page_info)
+			  * 4096;
+
+	ut_memcpy(ptr, phys_page, n_mem_pages * 4096);
+
+	/* Update the map */
+
+	map = os_awe_simulate_map
+			+ ((ulint)(ptr - os_awe_simulate_window)) / 4096;
+
+	for (i = 0; i < n_mem_pages; i++) {
+		*map = phys_page;
+
+		map++;
+		phys_page += 4096;
+	}
+
+	return(TRUE);
+
+#elif defined(__NT__)
+	BOOL		bResult;
+	ULONG_PTR	n_pages;
+	ulint		win_offs;	/* byte offset of ptr in the window */
+	ulint		info_offs;	/* slot offset of page_info in the
+					page info array */
+
+	n_pages = (ULONG_PTR)n_mem_pages;
+
+	if (ptr < os_awe_window) {
+		fprintf(stderr,
+"InnoDB: AWE: Error: trying to map to address %lx but AWE window start %lx\n",
+		(ulint)ptr, (ulint)os_awe_window);
+		ut_a(0);
+	}
+
+	/* The end of the mapped range must not pass the end of the window.
+	The test is written with offsets and a division so that it cannot
+	wrap around; it uses the real number of pages to map instead of
+	assuming that exactly one UNIV_PAGE_SIZE page is mapped. */
+
+	win_offs = (ulint)(ptr - os_awe_window);
+
+	if (win_offs > os_awe_window_size
+	    || n_mem_pages > (os_awe_window_size - win_offs)
+						/ OS_AWE_X86_PAGE_SIZE) {
+		fprintf(stderr,
+"InnoDB: AWE: Error: trying to map to address %lx but AWE window end %lx\n",
+		(ulint)ptr, (ulint)os_awe_window + os_awe_window_size);
+		ut_a(0);
+	}
+
+	if (page_info < os_awe_page_info) {
+		fprintf(stderr,
+"InnoDB: AWE: Error: trying to map page info at %lx but array start %lx\n",
+		(ulint)page_info, (ulint)os_awe_page_info);
+		ut_a(0);
+	}
+
+	/* Likewise all n_mem_pages slots must lie inside the page info
+	array; subtracting only after the comparison avoids an unsigned
+	underflow when the array is shorter than the request */
+
+	info_offs = (ulint)(page_info - os_awe_page_info);
+
+	if (info_offs > os_awe_n_pages
+	    || n_mem_pages > os_awe_n_pages - info_offs) {
+		fprintf(stderr,
+"InnoDB: AWE: Error: trying to map page info at %lx but array end %lx\n",
+		(ulint)page_info, (ulint)(os_awe_page_info + os_awe_n_pages));
+		ut_a(0);
+	}
+
+	bResult = MapUserPhysicalPages((PVOID)ptr, n_pages, page_info);
+
+	if (bResult != TRUE) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+"  InnoDB: AWE: Mapping of %lu physical pages to address %lx failed,\n"
+"InnoDB: error %lu.\n"
+"InnoDB: Cannot continue operation.\n",
+			n_mem_pages, (ulint)ptr, (ulint)GetLastError());
+		exit(1);
+	}
+
+	return(TRUE);
+#else
+	return(FALSE);
+#endif
+}
/********************************************************************
Converts the current process id to a number. It is not guaranteed that the
diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c
index f9eba721cbc..56971ab86eb 100644
--- a/innobase/srv/srv0srv.c
+++ b/innobase/srv/srv0srv.c
@@ -140,9 +140,14 @@ byte srv_latin1_ordering[256] /* The sort order table of the latin1
, 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xFF
};
-ulint srv_pool_size = ULINT_MAX; /* size in database pages;
- MySQL originally sets this
- value in megabytes */
+ulint srv_pool_size = ULINT_MAX; /* size in pages; MySQL inits
+ this to size in kilobytes but
+ srv_boot() normalizes it to pages;
+ with AWE: size of the AWE memory */
+ulint srv_awe_window_size = 0; /* size in pages; MySQL inits
+ this to bytes, but srv_boot()
+ normalizes it to pages; this is
+ the size of the AWE memory window */
ulint srv_mem_pool_size = ULINT_MAX; /* size in bytes */
ulint srv_lock_table_size = ULINT_MAX;
@@ -218,6 +223,13 @@ ibool srv_use_doublewrite_buf = TRUE;
ibool srv_set_thread_priorities = TRUE;
int srv_query_thread_priority = 0;
+
+/* srv_use_awe: TRUE if the Address Windowing Extensions (AWE) of Windows are
+used; in that case startup sets srv_use_adaptive_hash_indexes to FALSE */
+ibool srv_use_awe = FALSE;
+ibool srv_use_adaptive_hash_indexes = TRUE;
+
+
/*-------------------------------------------*/
ulint srv_n_spin_wait_rounds = 20;
ulint srv_spin_wait_delay = 5;
@@ -1956,9 +1968,19 @@ srv_normalize_init_values(void)
srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE;
- srv_pool_size = srv_pool_size / UNIV_PAGE_SIZE;
+ srv_pool_size = srv_pool_size / (UNIV_PAGE_SIZE / 1024);
+
+ srv_awe_window_size = srv_awe_window_size / UNIV_PAGE_SIZE;
- srv_lock_table_size = 20 * srv_pool_size;
+ if (srv_use_awe) {
+ /* If we are using AWE we must save memory in the 32-bit
+ address space of the process, and cannot bind the lock
+ table size to the real buffer pool size. */
+
+ srv_lock_table_size = 20 * srv_awe_window_size;
+ } else {
+ srv_lock_table_size = 20 * srv_pool_size;
+ }
return(DB_SUCCESS);
}
@@ -2323,6 +2345,12 @@ srv_sprintf_innodb_monitor(
"Total memory allocated %lu; in additional pool allocated %lu\n",
ut_total_allocated_memory,
mem_pool_get_reserved(mem_comm_pool));
+ if (srv_use_awe) {
+ buf += sprintf(buf,
+ "In addition to that %lu MB of AWE memory allocated\n",
+ srv_pool_size / ((1024 * 1024) / UNIV_PAGE_SIZE));
+ }
+
buf_print_io(buf, buf_end);
buf = buf + strlen(buf);
ut_a(buf < buf_end + 1500);
diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c
index ec674b69256..e1d436a879c 100644
--- a/innobase/srv/srv0start.c
+++ b/innobase/srv/srv0start.c
@@ -935,6 +935,7 @@ innobase_start_or_create_for_mysql(void)
/*====================================*/
/* out: DB_SUCCESS or error code */
{
+ buf_pool_t* ret;
ibool create_new_db;
ibool log_file_created;
ibool log_created = FALSE;
@@ -972,6 +973,11 @@ innobase_start_or_create_for_mysql(void)
"InnoDB: !!!!!!!!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!!!!!!!\n");
#endif
+#ifdef UNIV_SIMULATE_AWE
+ fprintf(stderr,
+"InnoDB: !!!!!!!!!!!!!! UNIV_SIMULATE_AWE switched on !!!!!!!!!!!!!!!!!\n");
+#endif
+
if (srv_sizeof_trx_t_in_ha_innodb_cc != (ulint)sizeof(trx_t)) {
fprintf(stderr,
"InnoDB: Error: trx_t size is %lu in ha_innodb.cc but %lu in srv0start.c\n"
@@ -1002,6 +1008,17 @@ innobase_start_or_create_for_mysql(void)
srv_startup_is_before_trx_rollback_phase = TRUE;
os_aio_use_native_aio = FALSE;
+#if !defined(__NT__) && !defined(UNIV_SIMULATE_AWE)
+ if (srv_use_awe) {
+
+ fprintf(stderr,
+"InnoDB: Error: You have specified innodb_buffer_pool_awe_mem_mb\n"
+"InnoDB: in my.cnf, but AWE can only be used in Windows 2000 and later.\n");
+
+ return(DB_ERROR);
+ }
+#endif
+
#ifdef __WIN__
if (os_get_os_version() == OS_WIN95
|| os_get_os_version() == OS_WIN31
@@ -1057,6 +1074,9 @@ innobase_start_or_create_for_mysql(void)
return(DB_ERROR);
}
+ /* Note that the call srv_boot() also changes the values of
+ srv_pool_size etc. to the units used by InnoDB internally */
+
err = srv_boot();
if (err != DB_SUCCESS) {
@@ -1088,7 +1108,26 @@ innobase_start_or_create_for_mysql(void)
fil_init(SRV_MAX_N_OPEN_FILES);
- buf_pool_init(srv_pool_size, srv_pool_size);
+ if (srv_use_awe) {
+ fprintf(stderr,
+"InnoDB: Using AWE: Memory window is %lu MB and AWE memory is %lu MB\n",
+ srv_awe_window_size / ((1024 * 1024) / UNIV_PAGE_SIZE),
+ srv_pool_size / ((1024 * 1024) / UNIV_PAGE_SIZE));
+
+ /* We must disable adaptive hash indexes because they do not
+ tolerate remapping of pages in AWE */
+
+ srv_use_adaptive_hash_indexes = FALSE;
+ ret = buf_pool_init(srv_pool_size, srv_pool_size,
+ srv_awe_window_size);
+ } else {
+ ret = buf_pool_init(srv_pool_size, srv_pool_size,
+ srv_pool_size);
+ }
+
+ if (ret == NULL) {
+ return(DB_ERROR);
+ }
fsp_init();
log_init();
diff --git a/innobase/trx/trx0sys.c b/innobase/trx/trx0sys.c
index 0c10040847e..33c962772e8 100644
--- a/innobase/trx/trx0sys.c
+++ b/innobase/trx/trx0sys.c
@@ -472,9 +472,9 @@ trx_sys_update_mysql_binlog_offset(
if (0 != ut_memcmp(sys_header + field + TRX_SYS_MYSQL_LOG_NAME,
file_name, 1 + ut_strlen(file_name))) {
- mlog_write_string((byte*) (sys_header + field
- + TRX_SYS_MYSQL_LOG_NAME),
- (byte*) file_name, 1 + ut_strlen(file_name), mtr);
+ mlog_write_string(sys_header + field
+ + TRX_SYS_MYSQL_LOG_NAME,
+ file_name, 1 + ut_strlen(file_name), mtr);
}
if (mach_read_from_4(sys_header + field
diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c
index d2219ed019f..f0077f941de 100644
--- a/innobase/trx/trx0trx.c
+++ b/innobase/trx/trx0trx.c
@@ -99,7 +99,7 @@ trx_create(
trx->mysql_log_file_name = NULL;
trx->mysql_log_offset = 0;
- trx->mysql_master_log_file_name = (char*) "";
+ trx->mysql_master_log_file_name = "";
trx->mysql_master_log_pos = 0;
trx->ignore_duplicates_in_insert = FALSE;
diff --git a/innobase/ut/ut0ut.c b/innobase/ut/ut0ut.c
index bb5eb662cd7..ff5d11d84ed 100644
--- a/innobase/ut/ut0ut.c
+++ b/innobase/ut/ut0ut.c
@@ -197,6 +197,7 @@ ut_get_year_month_day(
*month = (ulint)cal_tm.wMonth;
*day = (ulint)cal_tm.wDay;
#else
+ struct tm cal_tm;
struct tm* cal_tm_ptr;
time_t tm;