diff options
author | unknown <heikki@hundin.mysql.fi> | 2003-01-06 22:07:25 +0200 |
---|---|---|
committer | unknown <heikki@hundin.mysql.fi> | 2003-01-06 22:07:25 +0200 |
commit | 9497c997041e20baab3ebaa985c9e29235fdf0ca (patch) | |
tree | 88f03b8d4b471aa94f27b4a3ed510bcd7c2c3ccb /innobase | |
parent | ef62b4c948d68108c0945ebde33540b03b392a6d (diff) | |
download | mariadb-git-9497c997041e20baab3ebaa985c9e29235fdf0ca.tar.gz |
buf0buf.c, buf0buf.ic, buf0buf.h:
Reduce memory usage of the buffer headers
Many files:
Merge InnoDB-4.1 with AWE support
sql/mysqld.cc:
Merge InnoDB-4.1 with AWE support
sql/set_var.cc:
Merge InnoDB-4.1 with AWE support
sql/ha_innodb.h:
Merge InnoDB-4.1 with AWE support
sql/ha_innodb.cc:
Merge InnoDB-4.1 with AWE support
innobase/btr/btr0cur.c:
Merge InnoDB-4.1 with AWE support
innobase/btr/btr0pcur.c:
Merge InnoDB-4.1 with AWE support
innobase/buf/buf0flu.c:
Merge InnoDB-4.1 with AWE support
innobase/buf/buf0lru.c:
Merge InnoDB-4.1 with AWE support
innobase/buf/buf0rea.c:
Merge InnoDB-4.1 with AWE support
innobase/include/btr0pcur.h:
Merge InnoDB-4.1 with AWE support
innobase/include/buf0lru.h:
Merge InnoDB-4.1 with AWE support
innobase/include/log0recv.h:
Merge InnoDB-4.1 with AWE support
innobase/include/os0proc.h:
Merge InnoDB-4.1 with AWE support
innobase/include/srv0srv.h:
Merge InnoDB-4.1 with AWE support
innobase/log/log0log.c:
Merge InnoDB-4.1 with AWE support
innobase/log/log0recv.c:
Merge InnoDB-4.1 with AWE support
innobase/os/os0file.c:
Merge InnoDB-4.1 with AWE support
innobase/os/os0proc.c:
Merge InnoDB-4.1 with AWE support
innobase/srv/srv0srv.c:
Merge InnoDB-4.1 with AWE support
innobase/srv/srv0start.c:
Merge InnoDB-4.1 with AWE support
innobase/trx/trx0sys.c:
Merge InnoDB-4.1 with AWE support
innobase/trx/trx0trx.c:
Merge InnoDB-4.1 with AWE support
innobase/ut/ut0ut.c:
Merge InnoDB-4.1 with AWE support
innobase/include/buf0buf.h:
Reduce memory usage of the buffer headers
innobase/include/buf0buf.ic:
Reduce memory usage of the buffer headers
innobase/buf/buf0buf.c:
Reduce memory usage of the buffer headers
Diffstat (limited to 'innobase')
-rw-r--r-- | innobase/btr/btr0cur.c | 7 | ||||
-rw-r--r-- | innobase/btr/btr0pcur.c | 42 | ||||
-rw-r--r-- | innobase/buf/buf0buf.c | 307 | ||||
-rw-r--r-- | innobase/buf/buf0flu.c | 46 | ||||
-rw-r--r-- | innobase/buf/buf0lru.c | 62 | ||||
-rw-r--r-- | innobase/buf/buf0rea.c | 2 | ||||
-rw-r--r-- | innobase/include/btr0pcur.h | 3 | ||||
-rw-r--r-- | innobase/include/buf0buf.h | 120 | ||||
-rw-r--r-- | innobase/include/buf0buf.ic | 119 | ||||
-rw-r--r-- | innobase/include/buf0lru.h | 4 | ||||
-rw-r--r-- | innobase/include/log0recv.h | 7 | ||||
-rw-r--r-- | innobase/include/os0proc.h | 70 | ||||
-rw-r--r-- | innobase/include/srv0srv.h | 3 | ||||
-rw-r--r-- | innobase/log/log0log.c | 36 | ||||
-rw-r--r-- | innobase/log/log0recv.c | 24 | ||||
-rw-r--r-- | innobase/os/os0file.c | 2 | ||||
-rw-r--r-- | innobase/os/os0proc.c | 462 | ||||
-rw-r--r-- | innobase/srv/srv0srv.c | 38 | ||||
-rw-r--r-- | innobase/srv/srv0start.c | 41 | ||||
-rw-r--r-- | innobase/trx/trx0sys.c | 6 | ||||
-rw-r--r-- | innobase/trx/trx0trx.c | 2 | ||||
-rw-r--r-- | innobase/ut/ut0ut.c | 1 |
22 files changed, 1222 insertions, 182 deletions
diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c index 24f0447d55d..e1d12c9adc4 100644 --- a/innobase/btr/btr0cur.c +++ b/innobase/btr/btr0cur.c @@ -291,6 +291,7 @@ btr_cur_search_to_nth_level( && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ && !estimate && mode != PAGE_CUR_LE_OR_EXTENDS + && srv_use_adaptive_hash_indexes && btr_search_guess_on_hash(index, info, tuple, mode, latch_mode, cursor, has_search_latch, mtr)) { @@ -495,9 +496,11 @@ retry_page_get: cursor->up_bytes = up_bytes; #ifdef BTR_CUR_ADAPT - btr_search_info_update(index, cursor); -#endif + if (srv_use_adaptive_hash_indexes) { + btr_search_info_update(index, cursor); + } +#endif ut_ad(cursor->up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); ut_ad(cursor->up_match != ULINT_UNDEFINED diff --git a/innobase/btr/btr0pcur.c b/innobase/btr/btr0pcur.c index b2115dfdd6c..13efacb9da3 100644 --- a/innobase/btr/btr0pcur.c +++ b/innobase/btr/btr0pcur.c @@ -95,7 +95,9 @@ btr_pcur_store_position( ut_a(cursor->latch_mode != BTR_NO_LATCHES); if (page_get_n_recs(page) == 0) { - /* It must be an empty index tree */ + /* It must be an empty index tree; NOTE that in this case + we do not store the modify_clock, but always do a search + if we restore the cursor position */ ut_a(btr_page_get_next(page, mtr) == FIL_NULL && btr_page_get_prev(page, mtr) == FIL_NULL); @@ -128,12 +130,13 @@ btr_pcur_store_position( } else { cursor->rel_pos = BTR_PCUR_ON; } - + cursor->old_stored = BTR_PCUR_OLD_STORED; cursor->old_rec = dict_tree_copy_rec_order_prefix(tree, rec, &(cursor->old_rec_buf), &(cursor->buf_size)); + cursor->block_when_stored = buf_block_align(page); cursor->modify_clock = buf_frame_get_modify_clock(page); } @@ -205,6 +208,9 @@ btr_pcur_restore_position( if (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) { + /* In these cases we do not try an optimistic restoration, + but always do a search */ + if (cursor->rel_pos == 
BTR_PCUR_BEFORE_FIRST_IN_TREE) { from_left = TRUE; } else { @@ -214,6 +220,10 @@ btr_pcur_restore_position( btr_cur_open_at_index_side(from_left, btr_pcur_get_btr_cur(cursor)->index, latch_mode, btr_pcur_get_btr_cur(cursor), mtr); + + cursor->block_when_stored = + buf_block_align(btr_pcur_get_page(cursor)); + return(FALSE); } @@ -224,8 +234,9 @@ btr_pcur_restore_position( if (latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF) { /* Try optimistic restoration */ - if (buf_page_optimistic_get(latch_mode, page, - cursor->modify_clock, mtr)) { + if (buf_page_optimistic_get(latch_mode, + cursor->block_when_stored, page, + cursor->modify_clock, mtr)) { cursor->pos_state = BTR_PCUR_IS_POSITIONED; buf_page_dbg_add_level(page, SYNC_TREE_NODE); @@ -270,8 +281,6 @@ btr_pcur_restore_position( btr_pcur_open_with_no_init(btr_pcur_get_btr_cur(cursor)->index, tuple, mode, latch_mode, cursor, 0, mtr); - - cursor->old_stored = BTR_PCUR_OLD_STORED; /* Restore the old search mode */ cursor->search_mode = old_mode; @@ -280,11 +289,18 @@ btr_pcur_restore_position( && btr_pcur_is_on_user_rec(cursor, mtr) && 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor))) { - /* We have to store the NEW value for the modify clock, since - the cursor can now be on a different page! */ + /* We have to store the NEW value for the modify clock, since + the cursor can now be on a different page! 
But we can retain + the value of old_rec */ + + cursor->modify_clock = + buf_frame_get_modify_clock(btr_pcur_get_page(cursor)); + + cursor->block_when_stored = + buf_block_align(btr_pcur_get_page(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_STORED; - cursor->modify_clock = buf_frame_get_modify_clock( - buf_frame_align(btr_pcur_get_rec(cursor))); mem_heap_free(heap); return(TRUE); @@ -292,6 +308,12 @@ btr_pcur_restore_position( mem_heap_free(heap); + /* We have to store new position information, modify_clock etc., + to the cursor because it can now be on a different page, the record + under it may have been removed, etc. */ + + btr_pcur_store_position(cursor, mtr); + return(FALSE); } diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c index c9a5ec5307f..58c4ca5271b 100644 --- a/innobase/buf/buf0buf.c +++ b/innobase/buf/buf0buf.c @@ -196,7 +196,29 @@ If a new page is referenced in the buf_pool, and several pages of its random access area (for instance, 32 consecutive pages in a tablespace) have recently been referenced, we may predict that the whole area may be needed in the near future, and issue -the read requests for the whole area. */ +the read requests for the whole area. + + AWE implementation + ------------------ + +By a 'block' we mean the buffer header of type buf_block_t. By a 'page' +we mean the physical 16 kB memory area allocated from RAM for that block. +By a 'frame' we mean a 16 kB area in the virtual address space of the +process, in the frame_mem of buf_pool. + +We can map pages to the frames of the buffer pool. + +1) A buffer block allocated to use as a non-data page, e.g., to the lock +table, is always mapped to a frame. +2) A bufferfixed or io-fixed data page is always mapped to a frame. +3) When we need to map a block to frame, we look from the list +awe_LRU_free_mapped and try to unmap its last block, but note that +bufferfixed or io-fixed pages cannot be unmapped. 
+4) For every frame in the buffer pool there is always a block whose page is +mapped to it. When we create the buffer pool, we map the first elements +in the free list to the frames. +5) When we have AWE enabled, we disable adaptive hash indexes. +*/ buf_pool_t* buf_pool = NULL; /* The buffer buf_pool of the database */ @@ -346,12 +368,15 @@ void buf_block_init( /*===========*/ buf_block_t* block, /* in: pointer to control block */ - byte* frame) /* in: pointer to buffer frame */ + byte* frame) /* in: pointer to buffer frame, or NULL if in + the case of AWE there is no frame */ { block->state = BUF_BLOCK_NOT_USED; block->frame = frame; + block->awe_info = NULL; + block->modify_clock = ut_dulint_zero; block->file_page_was_freed = FALSE; @@ -364,29 +389,37 @@ buf_block_init( rw_lock_create(&(block->read_lock)); rw_lock_set_level(&(block->read_lock), SYNC_NO_ORDER_CHECK); +#ifdef UNIV_SYNC_DEBUG rw_lock_create(&(block->debug_latch)); rw_lock_set_level(&(block->debug_latch), SYNC_NO_ORDER_CHECK); +#endif } /************************************************************************ -Creates a buffer buf_pool object. */ -static +Creates the buffer pool. 
*/ + buf_pool_t* -buf_pool_create( -/*============*/ +buf_pool_init( +/*==========*/ /* out, own: buf_pool object, NULL if not - enough memory */ + enough memory or error */ ulint max_size, /* in: maximum size of the buf_pool in blocks */ - ulint curr_size) /* in: current size to use, must be <= + ulint curr_size, /* in: current size to use, must be <= max_size, currently must be equal to max_size */ + ulint n_frames) /* in: number of frames; if AWE is used, + this is the size of the address space window + where physical memory pages are mapped; if + AWE is not used then this must be the same + as max_size */ { byte* frame; ulint i; buf_block_t* block; ut_a(max_size == curr_size); + ut_a(srv_use_awe || n_frames == max_size); buf_pool = mem_alloc(sizeof(buf_pool_t)); @@ -396,8 +429,38 @@ buf_pool_create( mutex_set_level(&(buf_pool->mutex), SYNC_BUF_POOL); mutex_enter(&(buf_pool->mutex)); - - buf_pool->frame_mem = ut_malloc(UNIV_PAGE_SIZE * (max_size + 1)); + + if (srv_use_awe) { + /*----------------------------------------*/ + /* Allocate the virtual address space window, i.e., the + buffer pool frames */ + + buf_pool->frame_mem = os_awe_allocate_virtual_mem_window( + UNIV_PAGE_SIZE * (n_frames + 1)); + + /* Allocate the physical memory for AWE and the AWE info array + for buf_pool */ + + if ((curr_size % ((1024 * 1024) / UNIV_PAGE_SIZE)) != 0) { + + fprintf(stderr, +"InnoDB: AWE: Error: physical memory must be allocated in full megabytes.\n" +"InnoDB: Trying to allocate %lu database pages.\n", + curr_size); + + return(NULL); + } + + if (!os_awe_allocate_physical_mem(&(buf_pool->awe_info), + curr_size / ((1024 * 1024) / UNIV_PAGE_SIZE))) { + + return(NULL); + } + /*----------------------------------------*/ + } else { + buf_pool->frame_mem = ut_malloc( + UNIV_PAGE_SIZE * (n_frames + 1)); + } if (buf_pool->frame_mem == NULL) { @@ -414,21 +477,60 @@ buf_pool_create( buf_pool->max_size = max_size; buf_pool->curr_size = curr_size; + buf_pool->n_frames = n_frames; + /* 
Align pointer to the first frame */ frame = ut_align(buf_pool->frame_mem, UNIV_PAGE_SIZE); - buf_pool->frame_zero = frame; + buf_pool->frame_zero = frame; buf_pool->high_end = frame + UNIV_PAGE_SIZE * curr_size; - /* Init block structs and assign frames for them */ + if (srv_use_awe) { + /*----------------------------------------*/ + /* Map an initial part of the allocated physical memory to + the window */ + + os_awe_map_physical_mem_to_window(buf_pool->frame_zero, + n_frames * + (UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE), + buf_pool->awe_info); + /*----------------------------------------*/ + } + + buf_pool->blocks_of_frames = ut_malloc(sizeof(void*) * n_frames); + + if (buf_pool->blocks_of_frames == NULL) { + + return(NULL); + } + + /* Init block structs and assign frames for them; in the case of + AWE there are less frames than blocks. Then we assign the frames + to the first blocks (we already mapped the memory above). We also + init the awe_info for every block. */ + for (i = 0; i < max_size; i++) { block = buf_pool_get_nth_block(buf_pool, i); + + if (i < n_frames) { + frame = buf_pool->frame_zero + i * UNIV_PAGE_SIZE; + *(buf_pool->blocks_of_frames + i) = block; + } else { + frame = NULL; + } + buf_block_init(block, frame); - frame = frame + UNIV_PAGE_SIZE; + + if (srv_use_awe) { + /*----------------------------------------*/ + block->awe_info = buf_pool->awe_info + + i * (UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE); + /*----------------------------------------*/ + } } - + buf_pool->page_hash = hash_create(2 * max_size); buf_pool->n_pend_reads = 0; @@ -438,12 +540,14 @@ buf_pool_create( buf_pool->n_pages_read = 0; buf_pool->n_pages_written = 0; buf_pool->n_pages_created = 0; - + buf_pool->n_pages_awe_remapped = 0; + buf_pool->n_page_gets = 0; buf_pool->n_page_gets_old = 0; buf_pool->n_pages_read_old = 0; buf_pool->n_pages_written_old = 0; buf_pool->n_pages_created_old = 0; + buf_pool->n_pages_awe_remapped_old = 0; /* 2. 
Initialize flushing fields ---------------------------- */ @@ -466,40 +570,120 @@ buf_pool_create( buf_pool->LRU_old = NULL; + UT_LIST_INIT(buf_pool->awe_LRU_free_mapped); + /* Add control blocks to the free list */ UT_LIST_INIT(buf_pool->free); + for (i = 0; i < curr_size; i++) { block = buf_pool_get_nth_block(buf_pool, i); - /* Wipe contents of page to eliminate a Purify warning */ - memset(block->frame, '\0', UNIV_PAGE_SIZE); + if (block->frame) { + /* Wipe contents of frame to eliminate a Purify + warning */ + + memset(block->frame, '\0', UNIV_PAGE_SIZE); + + if (srv_use_awe) { + /* Add to the list of blocks mapped to + frames */ + + UT_LIST_ADD_LAST(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, block); + } + } - UT_LIST_ADD_FIRST(free, buf_pool->free, block); + UT_LIST_ADD_LAST(free, buf_pool->free, block); } mutex_exit(&(buf_pool->mutex)); - btr_search_sys_create(curr_size * UNIV_PAGE_SIZE / sizeof(void*) / 64); + if (srv_use_adaptive_hash_indexes) { + btr_search_sys_create( + curr_size * UNIV_PAGE_SIZE / sizeof(void*) / 64); + } else { + /* Create only a small dummy system */ + btr_search_sys_create(1000); + } return(buf_pool); } /************************************************************************ -Initializes the buffer buf_pool of the database. */ +Maps the page of block to a frame, if not mapped yet. Unmaps some page +from the end of the awe_LRU_free_mapped. 
*/ void -buf_pool_init( -/*==========*/ - ulint max_size, /* in: maximum size of the buf_pool in blocks */ - ulint curr_size) /* in: current size to use, must be <= - max_size */ +buf_awe_map_page_to_frame( +/*======================*/ + buf_block_t* block, /* in: block whose page should be + mapped to a frame */ + ibool add_to_mapped_list) /* in: TRUE if we in the case + we need to map the page should also + add the block to the + awe_LRU_free_mapped list */ { - ut_a(buf_pool == NULL); + buf_block_t* bck; + + ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(block); + + if (block->frame) { + + return; + } + + /* Scan awe_LRU_free_mapped from the end and try to find a block + which is not bufferfixed or io-fixed */ + + bck = UT_LIST_GET_LAST(buf_pool->awe_LRU_free_mapped); - buf_pool_create(max_size, curr_size); + while (bck) { + if (bck->state == BUF_BLOCK_FILE_PAGE + && (bck->buf_fix_count != 0 || bck->io_fix != 0)) { + + /* We have to skip this */ + bck = UT_LIST_GET_PREV(awe_LRU_free_mapped, bck); + } else { + /* We can map block to the frame of bck */ + + os_awe_map_physical_mem_to_window( + bck->frame, + UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE, + block->awe_info); + + block->frame = bck->frame; + + *(buf_pool->blocks_of_frames + + (((ulint)(block->frame + - buf_pool->frame_zero)) + >> UNIV_PAGE_SIZE_SHIFT)) + = block; + + bck->frame = NULL; + UT_LIST_REMOVE(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, + bck); + + if (add_to_mapped_list) { + UT_LIST_ADD_FIRST(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, + block); + } + + buf_pool->n_pages_awe_remapped++; + + return; + } + } + + fprintf(stderr, +"InnoDB: AWE: Fatal error: cannot find a page to unmap\n" +"InnoDB: awe_LRU_free_mapped list length %lu\n", + UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped)); - ut_ad(buf_validate()); + ut_a(0); } /************************************************************************ @@ -508,7 +692,9 @@ UNIV_INLINE buf_block_t* buf_block_alloc(void) /*=================*/ - 
/* out, own: the allocated block */ + /* out, own: the allocated block; also if AWE + is used it is guaranteed that the page is + mapped to a frame */ { buf_block_t* block; @@ -846,6 +1032,19 @@ loop: } } + /* If AWE is enabled and the page is not mapped to a frame, then + map it */ + + if (block->frame == NULL) { + ut_a(srv_use_awe); + + /* We set second parameter TRUE because the block is in the + LRU list and we must put it to awe_LRU_free_mapped list once + mapped to a frame */ + + buf_awe_map_page_to_frame(block, TRUE); + } + #ifdef UNIV_SYNC_DEBUG buf_block_buf_fix_inc_debug(block, file, line); #else @@ -940,28 +1139,27 @@ buf_page_optimistic_get_func( /*=========================*/ /* out: TRUE if success */ ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ - buf_frame_t* guess, /* in: guessed frame */ + buf_block_t* block, /* in: guessed buffer block */ + buf_frame_t* guess, /* in: guessed frame; note that AWE may move + frames */ dulint modify_clock,/* in: modify clock value if mode is ..._GUESS_ON_CLOCK */ char* file, /* in: file name */ ulint line, /* in: line where called */ mtr_t* mtr) /* in: mini-transaction */ { - buf_block_t* block; ibool accessed; ibool success; ulint fix_type; - ut_ad(mtr && guess); + ut_ad(mtr && block); ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); - - buf_pool->n_page_gets++; - - block = buf_block_align(guess); mutex_enter(&(buf_pool->mutex)); - if (block->state != BUF_BLOCK_FILE_PAGE) { + /* If AWE is used, block may have a different frame now, e.g., NULL */ + + if (block->state != BUF_BLOCK_FILE_PAGE || block->frame != guess) { mutex_exit(&(buf_pool->mutex)); @@ -1054,12 +1252,15 @@ buf_page_optimistic_get_func( #ifdef UNIV_IBUF_DEBUG ut_a(ibuf_count_get(block->space, block->offset) == 0); #endif + buf_pool->n_page_gets++; + return(TRUE); } /************************************************************************ This is used to get access to a known database page, when no waiting can be -done. */ +done. 
For example, if a search in an adaptive hash index leads us to this +frame. */ ibool buf_page_get_known_nowait( @@ -1078,13 +1279,11 @@ buf_page_get_known_nowait( ut_ad(mtr); ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); - - buf_pool->n_page_gets++; - - block = buf_block_align(guess); mutex_enter(&(buf_pool->mutex)); + block = buf_block_align(guess); + if (block->state == BUF_BLOCK_REMOVE_HASH) { /* Another thread is just freeing the block from the LRU list of the buffer pool: do not try to access this page; this @@ -1152,6 +1351,8 @@ buf_page_get_known_nowait( ut_a((mode == BUF_KEEP_OLD) || (ibuf_count_get(block->space, block->offset) == 0)); #endif + buf_pool->n_page_gets++; + return(TRUE); } @@ -1732,7 +1933,7 @@ buf_print(void) ut_ad(buf_pool); - size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE; + size = buf_pool->curr_size; index_ids = mem_alloc(sizeof(dulint) * size); counts = mem_alloc(sizeof(ulint) * size); @@ -1847,7 +2048,7 @@ buf_print_io( return; } - size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE; + size = buf_pool->curr_size; mutex_enter(&(buf_pool->mutex)); @@ -1866,6 +2067,15 @@ buf_print_io( buf += sprintf(buf, "Modified db pages %lu\n", UT_LIST_GET_LEN(buf_pool->flush_list)); + if (srv_use_awe) { + buf += sprintf(buf, + "AWE: Buffer pool memory frames %lu\n", + buf_pool->n_frames); + + buf += sprintf(buf, + "AWE: Database pages and free buffers mapped in frames %lu\n", + UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped)); + } buf += sprintf(buf, "Pending reads %lu \n", buf_pool->n_pend_reads); @@ -1891,6 +2101,13 @@ buf_print_io( (buf_pool->n_pages_written - buf_pool->n_pages_written_old) / time_elapsed); + if (srv_use_awe) { + buf += sprintf(buf, "AWE: %.2f page remaps/s\n", + (buf_pool->n_pages_awe_remapped + - buf_pool->n_pages_awe_remapped_old) + / time_elapsed); + } + if (buf_pool->n_page_gets > buf_pool->n_page_gets_old) { buf += sprintf(buf, "Buffer pool hit rate %lu / 1000\n", 1000 @@ -1906,6 +2123,7 @@ buf_print_io( 
buf_pool->n_pages_read_old = buf_pool->n_pages_read; buf_pool->n_pages_created_old = buf_pool->n_pages_created; buf_pool->n_pages_written_old = buf_pool->n_pages_written; + buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped; mutex_exit(&(buf_pool->mutex)); } @@ -1922,6 +2140,7 @@ buf_refresh_io_stats(void) buf_pool->n_pages_read_old = buf_pool->n_pages_read; buf_pool->n_pages_created_old = buf_pool->n_pages_created; buf_pool->n_pages_written_old = buf_pool->n_pages_written; + buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped; } /************************************************************************* diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c index 78bde60c9b2..02587487a92 100644 --- a/innobase/buf/buf0flu.c +++ b/innobase/buf/buf0flu.c @@ -24,6 +24,7 @@ Created 11/11/1995 Heikki Tuuri #include "log0log.h" #include "os0file.h" #include "trx0sys.h" +#include "srv0srv.h" /* When flushed, dirty blocks are searched in neigborhoods of this size, and flushed along with the original page. 
*/ @@ -103,7 +104,7 @@ buf_flush_ready_for_replace( /*========================*/ /* out: TRUE if can replace immediately */ buf_block_t* block) /* in: buffer control block, must be in state - BUF_BLOCK_FILE_PAGE and in the LRU list*/ + BUF_BLOCK_FILE_PAGE and in the LRU list */ { ut_ad(mutex_own(&(buf_pool->mutex))); ut_ad(block->state == BUF_BLOCK_FILE_PAGE); @@ -134,7 +135,6 @@ buf_flush_ready_for_flush( if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0) && (block->io_fix == 0)) { - if (flush_type != BUF_FLUSH_LRU) { return(TRUE); @@ -436,6 +436,20 @@ buf_flush_try_page( && block && buf_flush_ready_for_flush(block, flush_type)) { block->io_fix = BUF_IO_WRITE; + + /* If AWE is enabled and the page is not mapped to a frame, + then map it */ + + if (block->frame == NULL) { + ut_a(srv_use_awe); + + /* We set second parameter TRUE because the block is + in the LRU list and we must put it to + awe_LRU_free_mapped list once mapped to a frame */ + + buf_awe_map_page_to_frame(block, TRUE); + } + block->flush_type = flush_type; if (buf_pool->n_flush[flush_type] == 0) { @@ -486,6 +500,20 @@ buf_flush_try_page( ..._ready_for_flush). 
*/ block->io_fix = BUF_IO_WRITE; + + /* If AWE is enabled and the page is not mapped to a frame, + then map it */ + + if (block->frame == NULL) { + ut_a(srv_use_awe); + + /* We set second parameter TRUE because the block is + in the LRU list and we must put it to + awe_LRU_free_mapped list once mapped to a frame */ + + buf_awe_map_page_to_frame(block, TRUE); + } + block->flush_type = flush_type; if (buf_pool->n_flush[flush_type] == 0) { @@ -511,6 +539,20 @@ buf_flush_try_page( && buf_flush_ready_for_flush(block, flush_type)) { block->io_fix = BUF_IO_WRITE; + + /* If AWE is enabled and the page is not mapped to a frame, + then map it */ + + if (block->frame == NULL) { + ut_a(srv_use_awe); + + /* We set second parameter TRUE because the block is + in the LRU list and we must put it to + awe_LRU_free_mapped list once mapped to a frame */ + + buf_awe_map_page_to_frame(block, TRUE); + } + block->flush_type = flush_type; if (buf_pool->n_flush[block->flush_type] == 0) { diff --git a/innobase/buf/buf0lru.c b/innobase/buf/buf0lru.c index 2ec1506c522..051aa0191f6 100644 --- a/innobase/buf/buf0lru.c +++ b/innobase/buf/buf0lru.c @@ -132,7 +132,13 @@ buf_LRU_search_and_free_block( mutex_exit(&(buf_pool->mutex)); - btr_search_drop_page_hash_index(block->frame); + /* Remove possible adaptive hash index built on the + page; in the case of AWE the block may not have a + frame at all */ + + if (block->frame) { + btr_search_drop_page_hash_index(block->frame); + } mutex_enter(&(buf_pool->mutex)); @@ -196,7 +202,9 @@ list. 
*/ buf_block_t* buf_LRU_get_free_block(void) /*========================*/ - /* out: the free control block */ + /* out: the free control block; also if AWE is + used, it is guaranteed that the block has its + page mapped to a frame when we return */ { buf_block_t* block = NULL; ibool freed; @@ -257,6 +265,22 @@ loop: block = UT_LIST_GET_FIRST(buf_pool->free); UT_LIST_REMOVE(free, buf_pool->free, block); + + if (srv_use_awe) { + if (block->frame) { + /* Remove from the list of mapped pages */ + + UT_LIST_REMOVE(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, block); + } else { + /* We map the page to a frame; second param + FALSE below because we do not want it to be + added to the awe_LRU_free_mapped list */ + + buf_awe_map_page_to_frame(block, FALSE); + } + } + block->state = BUF_BLOCK_READY_FOR_USE; mutex_exit(&(buf_pool->mutex)); @@ -429,6 +453,13 @@ buf_LRU_remove_block( /* Remove the block from the LRU list */ UT_LIST_REMOVE(LRU, buf_pool->LRU, block); + if (srv_use_awe && block->frame) { + /* Remove from the list of mapped pages */ + + UT_LIST_REMOVE(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, block); + } + /* If the LRU list is so short that LRU_old not defined, return */ if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) { @@ -475,6 +506,13 @@ buf_LRU_add_block_to_end_low( UT_LIST_ADD_LAST(LRU, buf_pool->LRU, block); + if (srv_use_awe && block->frame) { + /* Add to the list of mapped pages */ + + UT_LIST_ADD_LAST(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, block); + } + if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { buf_pool->LRU_old_len++; @@ -518,6 +556,15 @@ buf_LRU_add_block_low( block->old = old; cl = buf_pool_clock_tic(); + if (srv_use_awe && block->frame) { + /* Add to the list of mapped pages; for simplicity we always + add to the start, even if the user would have set 'old' + TRUE */ + + UT_LIST_ADD_FIRST(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, block); + } + if (!old || 
(UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) { UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, block); @@ -613,6 +660,13 @@ buf_LRU_block_free_non_file_page( memset(block->frame, '\0', UNIV_PAGE_SIZE); #endif UT_LIST_ADD_FIRST(free, buf_pool->free, block); + + if (srv_use_awe && block->frame) { + /* Add to the list of mapped pages */ + + UT_LIST_ADD_FIRST(awe_LRU_free_mapped, + buf_pool->awe_LRU_free_mapped, block); + } } /********************************************************************** @@ -639,7 +693,9 @@ buf_LRU_block_remove_hashed_page( buf_pool->freed_page_clock += 1; - buf_frame_modify_clock_inc(block->frame); + /* Note that if AWE is enabled the block may not have a frame at all */ + + buf_block_modify_clock_inc(block); HASH_DELETE(buf_block_t, hash, buf_pool->page_hash, buf_page_address_fold(block->space, block->offset), diff --git a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c index 475a5bd9cbd..bb6670296b9 100644 --- a/innobase/buf/buf0rea.c +++ b/innobase/buf/buf0rea.c @@ -576,7 +576,7 @@ buf_read_recv_pages( os_aio_print_debug = FALSE; - while (buf_pool->n_pend_reads >= RECV_POOL_N_FREE_BLOCKS / 2) { + while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) { os_aio_simulated_wake_handler_threads(); os_thread_sleep(500000); diff --git a/innobase/include/btr0pcur.h b/innobase/include/btr0pcur.h index 9d07dd0de18..81f19af4d40 100644 --- a/innobase/include/btr0pcur.h +++ b/innobase/include/btr0pcur.h @@ -466,6 +466,9 @@ struct btr_pcur_struct{ BTR_PCUR_AFTER, depending on whether cursor was on, before, or after the old_rec record */ + buf_block_t* block_when_stored;/* buffer block when the position was + stored; note that if AWE is on, frames + may move */ dulint modify_clock; /* the modify clock value of the buffer block when the cursor position was stored */ diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h index 395f88a2c7c..c7db3d9bcc9 100644 --- a/innobase/include/buf0buf.h +++ b/innobase/include/buf0buf.h @@ -30,6 
+30,7 @@ Created 11/5/1995 Heikki Tuuri #include "sync0rw.h" #include "hash0hash.h" #include "ut0byte.h" +#include "os0proc.h" /* Flags for flush types */ #define BUF_FLUSH_LRU 1 @@ -58,23 +59,34 @@ extern ibool buf_debug_prints;/* If this is set TRUE, the program occurs */ /************************************************************************ -Initializes the buffer pool of the database. */ +Creates the buffer pool. */ -void +buf_pool_t* buf_pool_init( /*==========*/ - ulint max_size, /* in: maximum size of the pool in blocks */ - ulint curr_size); /* in: current size to use, must be <= + /* out, own: buf_pool object, NULL if not + enough memory or error */ + ulint max_size, /* in: maximum size of the buf_pool in + blocks */ + ulint curr_size, /* in: current size to use, must be <= + max_size, currently must be equal to max_size */ + ulint n_frames); /* in: number of frames; if AWE is used, + this is the size of the address space window + where physical memory pages are mapped; if + AWE is not used then this must be the same + as max_size */ /************************************************************************* -Gets the current size of buffer pool in bytes. */ +Gets the current size of buffer buf_pool in bytes. In the case of AWE, the +size of AWE window (= the frames). */ UNIV_INLINE ulint buf_pool_get_curr_size(void); /*========================*/ /* out: size in bytes */ /************************************************************************* -Gets the maximum size of buffer pool in bytes. */ +Gets the maximum size of buffer pool in bytes. In the case of AWE, the +size of AWE window (= the frames). */ UNIV_INLINE ulint buf_pool_get_max_size(void); @@ -138,8 +150,8 @@ improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed as LA! */ NOTE! The following macros should be used instead of buf_page_optimistic_get_func, to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed as LA! 
*/ -#define buf_page_optimistic_get(LA, G, MC, MTR) buf_page_optimistic_get_func(\ - LA, G, MC, IB__FILE__, __LINE__, MTR) +#define buf_page_optimistic_get(LA, BL, G, MC, MTR) buf_page_optimistic_get_func(\ + LA, BL, G, MC, IB__FILE__, __LINE__, MTR) /************************************************************************ This is the general function used to get optimistic access to a database page. */ @@ -149,7 +161,9 @@ buf_page_optimistic_get_func( /*=========================*/ /* out: TRUE if success */ ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ - buf_frame_t* guess, /* in: guessed frame */ + buf_block_t* block, /* in: guessed block */ + buf_frame_t* guess, /* in: guessed frame; note that AWE may move + frames */ dulint modify_clock,/* in: modify clock value if mode is ..._GUESS_ON_CLOCK */ char* file, /* in: file name */ @@ -350,6 +364,16 @@ buf_frame_modify_clock_inc( /* out: new value */ buf_frame_t* frame); /* in: pointer to a frame */ /************************************************************************ +Increments the modify clock of a frame by 1. The caller must (1) own the +buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock +on the block. */ +UNIV_INLINE +dulint +buf_block_modify_clock_inc( +/*=======================*/ + /* out: new value */ + buf_block_t* block); /* in: block */ +/************************************************************************ Returns the value of the modify clock. The caller must have an s-lock or x-lock on the block. 
*/ UNIV_INLINE @@ -428,7 +452,7 @@ UNIV_INLINE buf_frame_t* buf_frame_align( /*============*/ - /* out: pointer to block */ + /* out: pointer to frame */ byte* ptr); /* in: pointer to a frame */ /*********************************************************************** Checks if a pointer points to the block array of the buffer pool (blocks, not @@ -505,6 +529,19 @@ buf_pool_invalidate(void); --------------------------- LOWER LEVEL ROUTINES ------------------------- =========================================================================*/ +/************************************************************************ +Maps the page of block to a frame, if not mapped yet. Unmaps some page +from the end of the awe_LRU_free_mapped list. */ + +void +buf_awe_map_page_to_frame( +/*======================*/ + buf_block_t* block, /* in: block whose page should be + mapped to a frame */ + ibool add_to_mapped_list);/* in: TRUE if, in the case + we need to map the page, we should also + add the block to the + awe_LRU_free_mapped list */ /************************************************************************* Adds latch level info for the rw-lock protecting the buffer frame.
This should be called in the debug version after a successful latching of a @@ -638,7 +675,16 @@ struct buf_block_struct{ byte* frame; /* pointer to buffer frame which is of size UNIV_PAGE_SIZE, and aligned to an address divisible by - UNIV_PAGE_SIZE */ + UNIV_PAGE_SIZE; if AWE is used, this + will be NULL for the pages which are + currently not mapped into the virtual + address space window of the buffer + pool */ + os_awe_t* awe_info; /* if AWE is used, then an array of + awe page infos for + UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE + (normally = 4) physical memory + pages; otherwise NULL */ ulint space; /* space id of the page */ ulint offset; /* page number within the space */ ulint lock_hash_val; /* hashed value of the page address @@ -691,6 +737,10 @@ struct buf_block_struct{ /* node of the free block list */ UT_LIST_NODE_T(buf_block_t) LRU; /* node of the LRU list */ + UT_LIST_NODE_T(buf_block_t) awe_LRU_free_mapped; + /* in the AWE version node in the + list of free and LRU blocks which are + mapped to a frame */ ulint LRU_position; /* value which monotonically decreases (or may stay constant if the block is in the old blocks) toward @@ -758,11 +808,12 @@ struct buf_block_struct{ BTR_SEARCH_RIGHT_SIDE in hash indexing */ /* 6. 
Debug fields */ - +#ifdef UNIV_SYNC_DEBUG rw_lock_t debug_latch; /* in the debug version, each thread which bufferfixes the block acquires an s-latch here; so we can use the debug utilities in sync0rw */ +#endif ibool file_page_was_freed; /* this is set to TRUE when fsp frees a page in buffer pool */ @@ -781,16 +832,36 @@ struct buf_pool_struct{ struct and control blocks, except the read-write lock in them */ byte* frame_mem; /* pointer to the memory area which - was allocated for the frames */ + was allocated for the frames; in AWE + this is the virtual address space + window where we map pages stored + in physical memory */ byte* frame_zero; /* pointer to the first buffer frame: this may differ from frame_mem, because this is aligned by the frame size */ - byte* high_end; /* pointer to the end of the - buffer pool */ + byte* high_end; /* pointer to the end of the buffer + frames */ + ulint n_frames; /* number of frames */ buf_block_t* blocks; /* array of buffer control blocks */ + buf_block_t** blocks_of_frames;/* inverse mapping which can be used + to retrieve the buffer control block + of a frame; this is an array which + lists the blocks of frames in the + order frame_zero, + frame_zero + UNIV_PAGE_SIZE, ... 
+ a control block is always assigned + for each frame, even if the frame does + not contain any data; note that in AWE + there are more control blocks than + buffer frames */ + os_awe_t* awe_info; /* if AWE is used, AWE info for the + physical 4 kB memory pages associated + with buffer frames */ ulint max_size; /* number of control blocks == maximum pool size in pages */ - ulint curr_size; /* current pool size in pages */ + ulint curr_size; /* current pool size in pages; + currently always the same as + max_size */ hash_table_t* page_hash; /* hash table of the file pages */ ulint n_pend_reads; /* number of pending read operations */ @@ -802,11 +873,14 @@ struct buf_pool_struct{ ulint n_pages_created;/* number of pages created in the pool with no read */ ulint n_page_gets; /* number of page gets performed; - also successful seraches through + also successful searches through the adaptive hash index are counted as page gets; this field is NOT protected by the buffer pool mutex */ + ulint n_pages_awe_remapped; /* if AWE is enabled, the + number of remaps of blocks to + buffer frames */ ulint n_page_gets_old;/* n_page_gets when buf_print was last time called: used to calculate hit rate */ @@ -815,6 +889,7 @@ struct buf_pool_struct{ ulint n_pages_written_old;/* number write operations */ ulint n_pages_created_old;/* number of pages created in the pool with no read */ + ulint n_pages_awe_remapped_old; /* 2. Page flushing algorithm fields */ UT_LIST_BASE_NODE_T(buf_block_t) flush_list; @@ -847,7 +922,10 @@ struct buf_pool_struct{ /* 3. 
LRU replacement algorithm fields */ UT_LIST_BASE_NODE_T(buf_block_t) free; - /* base node of the free block list */ + /* base node of the free block list; + in the case of AWE, at the start are + always free blocks for which the + physical memory is mapped to a frame */ UT_LIST_BASE_NODE_T(buf_block_t) LRU; /* base node of the LRU list */ buf_block_t* LRU_old; /* pointer to the about 3/8 oldest @@ -859,6 +937,12 @@ struct buf_pool_struct{ see buf0lru.c for the restrictions on this value; not defined if LRU_old == NULL */ + UT_LIST_BASE_NODE_T(buf_block_t) awe_LRU_free_mapped; + /* list of those blocks which are + in the LRU list or the free list, and + where the page is mapped to a frame; + thus, frames allocated, e.g., to the + lock table, are not in this list */ }; /* States of a control block */ diff --git a/innobase/include/buf0buf.ic b/innobase/include/buf0buf.ic index 7227c79dc6a..d4e7122f3f9 100644 --- a/innobase/include/buf0buf.ic +++ b/innobase/include/buf0buf.ic @@ -36,25 +36,27 @@ buf_block_peek_if_too_old( } /************************************************************************* -Gets the current size of buffer buf_pool in bytes. */ +Gets the current size of buffer buf_pool in bytes. In the case of AWE, the +size of AWE window (= the frames). */ UNIV_INLINE ulint buf_pool_get_curr_size(void) /*========================*/ /* out: size in bytes */ { - return((buf_pool->curr_size) * UNIV_PAGE_SIZE); + return((buf_pool->n_frames) * UNIV_PAGE_SIZE); } /************************************************************************* -Gets the maximum size of buffer buf_pool in bytes. */ +Gets the maximum size of buffer buf_pool in bytes. In the case of AWE, the +size of AWE window (= the frames).
*/ UNIV_INLINE ulint buf_pool_get_max_size(void) /*=======================*/ /* out: size in bytes */ { - return((buf_pool->max_size) * UNIV_PAGE_SIZE); + return((buf_pool->n_frames) * UNIV_PAGE_SIZE); } /*********************************************************************** @@ -207,54 +209,24 @@ buf_block_align( frame_zero = buf_pool->frame_zero; - ut_ad((ulint)ptr >= (ulint)frame_zero); - - block = buf_pool_get_nth_block(buf_pool, ((ulint)(ptr - frame_zero)) - >> UNIV_PAGE_SIZE_SHIFT); - if (block < buf_pool->blocks - || block >= buf_pool->blocks + buf_pool->max_size) { + if ((ulint)ptr < (ulint)frame_zero + || (ulint)ptr > (ulint)(buf_pool->high_end)) { + ut_print_timestamp(stderr); fprintf(stderr, -"InnoDB: Error: trying to access a stray pointer %lx\n" -"InnoDB: buf pool start is at %lx, number of pages %lu\n", (ulint)ptr, - (ulint)frame_zero, buf_pool->max_size); +" InnoDB: Error: trying to access a stray pointer %lx\n" +"InnoDB: buf pool start is at %lx, end at %lx\n" +"InnoDB: Probable reason is database corruption or memory\n" +"InnoDB: corruption. If this happens in an InnoDB database recovery,\n" +"InnoDB: you can look from section 6.1 at http://www.innodb.com/ibman.html\n" +"InnoDB: how to force recovery.\n", + (ulint)ptr, (ulint)frame_zero, + (ulint)(buf_pool->high_end)); ut_a(0); } - - return(block); -} - -/*********************************************************************** -Gets the block to whose frame the pointer is pointing to. Does not -require a file page to be bufferfixed. 
*/ -UNIV_INLINE -buf_block_t* -buf_block_align_low( -/*================*/ - /* out: pointer to block */ - byte* ptr) /* in: pointer to a frame */ -{ - buf_block_t* block; - buf_frame_t* frame_zero; - - ut_ad(ptr); - - frame_zero = buf_pool->frame_zero; - - ut_ad((ulint)ptr >= (ulint)frame_zero); - - block = buf_pool_get_nth_block(buf_pool, ((ulint)(ptr - frame_zero)) - >> UNIV_PAGE_SIZE_SHIFT); - if (block < buf_pool->blocks - || block >= buf_pool->blocks + buf_pool->max_size) { - - fprintf(stderr, -"InnoDB: Error: trying to access a stray pointer %lx\n" -"InnoDB: buf pool start is at %lx, number of pages %lu\n", (ulint)ptr, - (ulint)frame_zero, buf_pool->max_size); - ut_a(0); - } - + + block = *(buf_pool->blocks_of_frames + (((ulint)(ptr - frame_zero)) + >> UNIV_PAGE_SIZE_SHIFT)); return(block); } @@ -264,7 +236,7 @@ UNIV_INLINE buf_frame_t* buf_frame_align( /*============*/ - /* out: pointer to block */ + /* out: pointer to frame */ byte* ptr) /* in: pointer to a frame */ { buf_frame_t* frame; @@ -273,14 +245,19 @@ buf_frame_align( frame = ut_align_down(ptr, UNIV_PAGE_SIZE); - if (((ulint)frame - < (ulint)(buf_pool->frame_zero)) - || ((ulint)frame > (ulint)(buf_pool_get_nth_block(buf_pool, - buf_pool->max_size - 1)->frame))) { + if (((ulint)frame < (ulint)(buf_pool->frame_zero)) + || (ulint)frame >= (ulint)(buf_pool->high_end)) { + + ut_print_timestamp(stderr); fprintf(stderr, -"InnoDB: Error: trying to access a stray pointer %lx\n" -"InnoDB: buf pool start is at %lx, number of pages %lu\n", (ulint)ptr, - (ulint)(buf_pool->frame_zero), buf_pool->max_size); +" InnoDB: Error: trying to access a stray pointer %lx\n" +"InnoDB: buf pool start is at %lx, end at %lx\n" +"InnoDB: Probable reason is database corruption or memory\n" +"InnoDB: corruption. 
If this happens in an InnoDB database recovery,\n" +"InnoDB: you can look from section 6.1 at http://www.innodb.com/ibman.html\n" +"InnoDB: how to force recovery.\n", + (ulint)ptr, (ulint)(buf_pool->frame_zero), + (ulint)(buf_pool->high_end)); ut_a(0); } @@ -469,7 +446,7 @@ buf_frame_modify_clock_inc( ut_ad(frame); - block = buf_block_align_low(frame); + block = buf_block_align(frame); ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0)) || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); @@ -480,6 +457,25 @@ buf_frame_modify_clock_inc( } /************************************************************************ +Increments the modify clock of a frame by 1. The caller must (1) own the +buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock +on the block. */ +UNIV_INLINE +dulint +buf_block_modify_clock_inc( +/*=======================*/ + /* out: new value */ + buf_block_t* block) /* in: block */ +{ + ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0)) + || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); + + UT_DULINT_INC(block->modify_clock); + + return(block->modify_clock); +} + +/************************************************************************ Returns the value of the modify clock. The caller must have an s-lock or x-lock on the block. 
*/ UNIV_INLINE @@ -508,15 +504,16 @@ void buf_block_buf_fix_inc_debug( /*========================*/ buf_block_t* block, /* in: block to bufferfix */ - char* file, /* in: file name */ - ulint line) /* in: line */ + char* file __attribute__ ((unused)), /* in: file name */ + ulint line __attribute__ ((unused))) /* in: line */ { +#ifdef UNIV_SYNC_DEBUG ibool ret; - + ret = rw_lock_s_lock_func_nowait(&(block->debug_latch), file, line); ut_ad(ret == TRUE); - +#endif block->buf_fix_count++; } diff --git a/innobase/include/buf0lru.h b/innobase/include/buf0lru.h index 946b6c4e31d..6a3c948507d 100644 --- a/innobase/include/buf0lru.h +++ b/innobase/include/buf0lru.h @@ -53,7 +53,9 @@ LRU list to the free list. */ buf_block_t* buf_LRU_get_free_block(void); /*=========================*/ - /* out: the free control block */ + /* out: the free control block; also if AWE is + used, it is guaranteed that the block has its + page mapped to a frame when we return */ /********************************************************************** Puts a block back to the free list. */ diff --git a/innobase/include/log0recv.h b/innobase/include/log0recv.h index 7418e4abf1b..bef42cfec1c 100644 --- a/innobase/include/log0recv.h +++ b/innobase/include/log0recv.h @@ -355,12 +355,7 @@ in the debug version: spaces with an odd number as the id are replicate spaces */ #define RECV_REPLICA_SPACE_ADD 1 -/* This many blocks must be left free in the buffer pool when we scan -the log and store the scanned log records in the buffer pool: we will -use these free blocks to read in pages when we start applying the -log records to the database. 
*/ - -#define RECV_POOL_N_FREE_BLOCKS (ut_min(256, buf_pool_get_curr_size() / 8)) +extern ulint recv_n_pool_free_frames; #ifndef UNIV_NONINL #include "log0recv.ic" diff --git a/innobase/include/os0proc.h b/innobase/include/os0proc.h index 79750e5c1f7..08510db4366 100644 --- a/innobase/include/os0proc.h +++ b/innobase/include/os0proc.h @@ -15,6 +15,76 @@ Created 9/30/1995 Heikki Tuuri typedef void* os_process_t; typedef unsigned long int os_process_id_t; +/* The cell type in os_awe_allocate_mem page info */ +#ifdef __NT__ +typedef ULONG_PTR os_awe_t; +#else +typedef ulint os_awe_t; +#endif + +/* Physical page size when Windows AWE is used. This is the normal +page size of an Intel x86 processor. We cannot use AWE with 2 MB or 4 MB +pages. */ +#define OS_AWE_X86_PAGE_SIZE 4096 + +/******************************************************************** +Windows AWE support. Tries to enable the "lock pages in memory" privilege for +the current process so that the current process can allocate memory-locked +virtual address space to act as the window where AWE maps physical memory. */ + +ibool +os_awe_enable_lock_pages_in_mem(void); +/*=================================*/ + /* out: TRUE if success, FALSE if error; + prints error info to stderr if no success */ +/******************************************************************** +Allocates physical RAM memory up to 64 GB in an Intel 32-bit x86 +processor. */ + +ibool +os_awe_allocate_physical_mem( +/*=========================*/ + /* out: TRUE if success */ + os_awe_t** page_info, /* out, own: array of opaque data containing + the info for allocated physical memory pages; + each allocated 4 kB physical memory page has + one slot of type os_awe_t in the array */ + ulint n_megabytes); /* in: number of megabytes to allocate */ +/******************************************************************** +Allocates a window in the virtual address space where we can map the +pages of physical memory.
*/ + +byte* +os_awe_allocate_virtual_mem_window( +/*===============================*/ + /* out, own: allocated memory, or NULL if did not + succeed */ + ulint size); /* in: virtual memory allocation size in bytes, must + be < 2 GB */ +/******************************************************************** +With this function you can map parts of physical memory allocated with +the ..._allocate_physical_mem to the virtual address space allocated with +the previous function. Intel implements this so that the process page +tables are updated accordingly. A test on a 1.5 GHz AMD processor and XP +showed that this takes < 1 microsecond, much better than the estimated 80 us +for copying a 16 kB page memory to memory. But, the operation will at least +partially invalidate the translation lookaside buffer (TLB) of all +processors. Under a real-world load the performance hit may be bigger. */ + +ibool +os_awe_map_physical_mem_to_window( +/*==============================*/ + /* out: TRUE if success; the function + calls exit(1) in case of an error */ + byte* ptr, /* in: a page-aligned pointer to + somewhere in the virtual address + space window; we map the physical mem + pages here */ + ulint n_mem_pages, /* in: number of 4 kB mem pages to + map */ + os_awe_t* page_info); /* in: array of page infos for those + pages; each page has one slot in the + array */ /******************************************************************** Converts the current process id to a number. It is not guaranteed that the number is unique. 
In Linux returns the 'process number' of the current diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h index ad6f71f7a3a..bc0960ae023 100644 --- a/innobase/include/srv0srv.h +++ b/innobase/include/srv0srv.h @@ -61,6 +61,7 @@ extern ulint srv_flush_log_at_trx_commit; extern byte srv_latin1_ordering[256];/* The sort order table of the latin1 character set */ extern ulint srv_pool_size; +extern ulint srv_awe_window_size; extern ulint srv_mem_pool_size; extern ulint srv_lock_table_size; @@ -86,6 +87,8 @@ extern ibool srv_use_doublewrite_buf; extern ibool srv_set_thread_priorities; extern int srv_query_thread_priority; +extern ibool srv_use_awe; +extern ibool srv_use_adaptive_hash_indexes; /*-------------------------------------------*/ extern ulint srv_n_rows_inserted; diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c index f9b785ccbd5..bdfce783a43 100644 --- a/innobase/log/log0log.c +++ b/innobase/log/log0log.c @@ -437,25 +437,29 @@ log_group_calc_lsn_offset( dulint lsn, /* in: lsn, must be within 4 GB of group->lsn */ log_group_t* group) /* in: log group */ { - dulint gr_lsn; - ulint gr_lsn_size_offset; - ulint difference; - ulint group_size; - ulint offset; + dulint gr_lsn; + ib_longlong gr_lsn_size_offset; + ib_longlong difference; + ib_longlong group_size; + ib_longlong offset; ut_ad(mutex_own(&(log_sys->mutex))); + /* If total log file size is > 2 GB we can easily get overflows + with 32-bit integers. Use 64-bit integers instead. 
*/ + gr_lsn = group->lsn; - gr_lsn_size_offset = log_group_calc_size_offset(group->lsn_offset, - group); - group_size = log_group_get_capacity(group); + gr_lsn_size_offset = (ib_longlong) + log_group_calc_size_offset(group->lsn_offset, group); + + group_size = (ib_longlong) log_group_get_capacity(group); if (ut_dulint_cmp(lsn, gr_lsn) >= 0) { - difference = ut_dulint_minus(lsn, gr_lsn); + difference = (ib_longlong) ut_dulint_minus(lsn, gr_lsn); } else { - difference = ut_dulint_minus(gr_lsn, lsn); + difference = (ib_longlong) ut_dulint_minus(gr_lsn, lsn); difference = difference % group_size; @@ -464,7 +468,13 @@ log_group_calc_lsn_offset( offset = (gr_lsn_size_offset + difference) % group_size; - return(log_group_calc_real_offset(offset, group)); + ut_a(offset <= 0xFFFFFFFF); + + /* printf("Offset is %lu gr_lsn_offset is %lu difference is %lu\n", + (ulint)offset,(ulint)gr_lsn_size_offset, (ulint)difference); + */ + + return(log_group_calc_real_offset((ulint)offset, group)); } /*********************************************************************** @@ -3054,8 +3064,8 @@ log_check_log_recs( ut_memcpy(scan_buf, start, end - start); recv_scan_log_recs(TRUE, - buf_pool_get_curr_size() - - RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE, + (buf_pool->n_frames - + recv_n_pool_free_frames) * UNIV_PAGE_SIZE, FALSE, scan_buf, end - start, ut_dulint_align_down(buf_start_lsn, OS_FILE_LOG_BLOCK_SIZE), diff --git a/innobase/log/log0recv.c b/innobase/log/log0recv.c index dfe67c444b4..3945b47933d 100644 --- a/innobase/log/log0recv.c +++ b/innobase/log/log0recv.c @@ -71,6 +71,14 @@ ulint recv_previous_parsed_rec_is_multi = 0; ulint recv_max_parsed_page_no = 0; +/* This many frames must be left free in the buffer pool when we scan +the log and store the scanned log records in the buffer pool: we will +use these free frames to read in pages when we start applying the +log records to the database. 
*/ + +ulint recv_n_pool_free_frames = 256; + + /************************************************************ Creates the recovery system. */ @@ -1018,10 +1026,10 @@ recv_recover_page( block = buf_block_align(page); if (just_read_in) { - /* Move the ownership of the x-latch on the page to this OS - thread, so that we can acquire a second x-latch on it. This - is needed for the operations to the page to pass the debug - checks. */ + /* Move the ownership of the x-latch on the page to + this OS thread, so that we can acquire a second + x-latch on it. This is needed for the operations to + the page to pass the debug checks. */ rw_lock_x_lock_move_ownership(&(block->lock)); } @@ -2362,8 +2370,8 @@ recv_group_scan_log_recs( group, start_lsn, end_lsn); finished = recv_scan_log_recs(TRUE, - buf_pool_get_curr_size() - - RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE, + (buf_pool->n_frames + - recv_n_pool_free_frames) * UNIV_PAGE_SIZE, TRUE, log_sys->buf, RECV_SCAN_SIZE, start_lsn, contiguous_lsn, group_scanned_lsn); @@ -3001,8 +3009,8 @@ ask_again: read_offset % UNIV_PAGE_SIZE, len, buf, NULL); ret = recv_scan_log_recs(TRUE, - buf_pool_get_curr_size() - - RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE, + (buf_pool->n_frames - + recv_n_pool_free_frames) * UNIV_PAGE_SIZE, TRUE, buf, len, start_lsn, &dummy_lsn, &scanned_lsn); diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c index fa5482a8cd1..9eae358c7fb 100644 --- a/innobase/os/os0file.c +++ b/innobase/os/os0file.c @@ -2127,7 +2127,7 @@ os_aio_simulated_handle( ulint offs; ulint lowest_offset; byte* combined_buf; - byte* combined_buf2= 0; /* Remove warning */ + byte* combined_buf2; ibool ret; ulint n; ulint i; diff --git a/innobase/os/os0proc.c b/innobase/os/os0proc.c index 1ee448a4a44..61db7bd13b2 100644 --- a/innobase/os/os0proc.c +++ b/innobase/os/os0proc.c @@ -12,11 +12,469 @@ Created 9/30/1995 Heikki Tuuri #include "os0proc.ic" #endif +#include "ut0mem.h" +#include "ut0byte.h" + + +/* +How to get AWE to compile on Windows? 
+------------------------------------- + +the Visual C++ has to be relatively recent and _WIN32_WINNT has to be +defined to a value >= 0x0500 when windows.h is included. An easy way +to accomplish that is to put + +#define _WIN32_WINNT 0x0500 + +to the start of file \mysql\include\config-win.h + +Where does AWE work? +------------------- + +See the error message in os_awe_allocate_physical_mem(). + +How to assign privileges for mysqld to use AWE? +----------------------------------------------- + +See the error message in os_awe_enable_lock_pages_in_mem(). + +Use Windows AWE functions in this order +--------------------------------------- + +(1) os_awe_enable_lock_pages_in_mem(); +(2) os_awe_allocate_physical_mem(); +(3) os_awe_allocate_virtual_mem_window(); +(4) os_awe_map_physical_mem_to_window(). + +To test 'AWE' in a computer which does not have the AWE API, +you can compile with UNIV_SIMULATE_AWE defined in this file. +*/ + +#ifdef UNIV_SIMULATE_AWE +/* If we simulate AWE, we allocate the 'physical memory' here */ +byte* os_awe_simulate_mem; +ulint os_awe_simulate_mem_size; +os_awe_t* os_awe_simulate_page_info; +byte* os_awe_simulate_window; +ulint os_awe_simulate_window_size; +/* In simulated AWE the following contains a NULL pointer or a pointer +to a mapped 'physical page' for each 4 kB page in the AWE window */ +byte** os_awe_simulate_map; +#endif + +#ifdef __NT__ +os_awe_t* os_awe_page_info; +ulint os_awe_n_pages; +byte* os_awe_window; +ulint os_awe_window_size; +#endif + +/******************************************************************** +Windows AWE support. Tries to enable the "lock pages in memory" privilege for +the current process so that the current process can allocate memory-locked +virtual address space to act as the window where AWE maps physical memory. 
*/ + +ibool +os_awe_enable_lock_pages_in_mem(void) +/*=================================*/ + /* out: TRUE if success, FALSE if error; + prints error info to stderr if no success */ +{ +#ifdef UNIV_SIMULATE_AWE + + return(TRUE); + +#elif defined(__NT__) + struct { + DWORD Count; + LUID_AND_ATTRIBUTES Privilege[1]; + } Info; + HANDLE hProcess; + HANDLE Token; + BOOL Result; + + hProcess = GetCurrentProcess(); + + /* Open the token of the current process */ + + Result = OpenProcessToken(hProcess, + TOKEN_ADJUST_PRIVILEGES, + &Token); + if (Result != TRUE) { + fprintf(stderr, + "InnoDB: AWE: Cannot open process token, error %lu\n", + (ulint)GetLastError()); + return(FALSE); + } + + Info.Count = 1; + + Info.Privilege[0].Attributes = SE_PRIVILEGE_ENABLED; + + /* Get the local unique identifier (LUID) of the SE_LOCK_MEMORY + privilege */ + + Result = LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, + &(Info.Privilege[0].Luid)); + if (Result != TRUE) { + fprintf(stderr, + "InnoDB: AWE: Cannot get local privilege value for %s, error %lu.\n", + SE_LOCK_MEMORY_NAME, (ulint)GetLastError()); + + return(FALSE); + } + + /* Try to adjust the privilege */ + + Result = AdjustTokenPrivileges(Token, FALSE, + (PTOKEN_PRIVILEGES)&Info, + 0, NULL, NULL); + /* Check the result */ + + if (Result != TRUE) { + fprintf(stderr, + "InnoDB: AWE: Cannot adjust process token privileges, error %u.\n", + GetLastError()); + return(FALSE); + } else if (GetLastError() != ERROR_SUCCESS) { + fprintf(stderr, +"InnoDB: AWE: Cannot enable SE_LOCK_MEMORY privilege, error %lu.\n" +"InnoDB: In Windows XP Home you cannot use AWE. 
In Windows 2000 and XP\n" +"InnoDB: Professional you must go to the Control Panel, to\n" +"InnoDB: Security Settings, to Local Policies, and enable\n" +"InnoDB: the 'lock pages in memory' privilege for the user who runs\n" +"InnoDB: the MySQL server.\n", GetLastError()); + + return(FALSE); + } + + CloseHandle(Token); + + return(TRUE); +#else #ifdef __WIN__ -#include <windows.h> + fprintf(stderr, +"InnoDB: AWE: Error: to use AWE you must use a ...-nt MySQL executable.\n"); +#endif + return(FALSE); #endif +} -#include "ut0mem.h" +/******************************************************************** +Allocates physical RAM memory up to 64 GB in an Intel 32-bit x86 +processor. */ + +ibool +os_awe_allocate_physical_mem( +/*=========================*/ + /* out: TRUE if success */ + os_awe_t** page_info, /* out, own: array of opaque data containing + the info for allocated physical memory pages; + each allocated 4 kB physical memory page has + one slot of type os_awe_t in the array */ + ulint n_megabytes) /* in: number of megabytes to allocate */ +{ +#ifdef UNIV_SIMULATE_AWE + os_awe_simulate_page_info = ut_malloc(sizeof(os_awe_t) * + n_megabytes * ((1024 * 1024) / OS_AWE_X86_PAGE_SIZE)); + + os_awe_simulate_mem = ut_align(ut_malloc( + 4096 + 1024 * 1024 * n_megabytes), + 4096); + os_awe_simulate_mem_size = n_megabytes * 1024 * 1024; + + *page_info = os_awe_simulate_page_info; + + return(TRUE); + +#elif defined(__NT__) + BOOL bResult; + ULONG_PTR NumberOfPages; /* Question: why does Windows + use the name ULONG_PTR for + a scalar integer type? Maybe + because we may also refer to + &NumberOfPages? 
*/ + ULONG_PTR NumberOfPagesInitial; + SYSTEM_INFO sSysInfo; + int PFNArraySize; + + if (n_megabytes > 64 * 1024) { + + fprintf(stderr, +"InnoDB: AWE: Error: tried to allocate %lu MB.\n" +"InnoDB: AWE cannot allocate more than 64 GB in any computer.\n", n_megabytes); + + return(FALSE); + } + + GetSystemInfo(&sSysInfo); /* fill the system information structure */ + + if ((ulint)OS_AWE_X86_PAGE_SIZE != (ulint)sSysInfo.dwPageSize) { + fprintf(stderr, +"InnoDB: AWE: Error: this computer has a page size of %lu.\n" +"InnoDB: Should be 4096 bytes for InnoDB AWE support to work.\n", + (ulint)sSysInfo.dwPageSize); + + return(FALSE); + } + + /* Calculate the number of pages of memory to request */ + + NumberOfPages = n_megabytes * ((1024 * 1024) / OS_AWE_X86_PAGE_SIZE); + + /* Calculate the size of page_info for allocated physical pages */ + + PFNArraySize = NumberOfPages * sizeof(ULONG_PTR); + + *page_info = (ULONG_PTR*)HeapAlloc(GetProcessHeap(), 0, PFNArraySize); + + if (*page_info == NULL) { + fprintf(stderr, +"InnoDB: AWE: Failed to allocate page info array from process heap, error %lu\n", + (ulint)GetLastError()); + + return(FALSE); + } + + ut_total_allocated_memory += PFNArraySize; + + /* Enable this process' privilege to lock pages to physical memory */ + + if (!os_awe_enable_lock_pages_in_mem()) { + + return(FALSE); + } + + /* Allocate the physical memory */ + + NumberOfPagesInitial = NumberOfPages; + + os_awe_page_info = *page_info; + os_awe_n_pages = (ulint)NumberOfPages; + + /* Compilation note: if the compiler complains the function is not + defined, see the note at the start of this file */ + + bResult = AllocateUserPhysicalPages(GetCurrentProcess(), + &NumberOfPages, + *page_info); + if (bResult != TRUE) { + fprintf(stderr, +"InnoDB: AWE: Cannot allocate physical pages, error %lu.\n", + (ulint)GetLastError()); + + return(FALSE); + } + + if (NumberOfPagesInitial != NumberOfPages) { + fprintf(stderr, +"InnoDB: AWE: Error: allocated only %lu pages of %lu 
requested.\n" +"InnoDB: Check that you have enough free RAM.\n" +"InnoDB: In Windows XP Professional and 2000 Professional\n" +"InnoDB: Windows PAE size is max 4 GB. In 2000 and .NET" +"InnoDB: Advanced Servers and 2000 Datacenter Server it is 32 GB,\n" +"InnoDB: and in .NET Datacenter Server it is 64 GB.\n" +"InnoDB: A Microsoft web page said that the processor must be an Intel\n" +"InnoDB: processor.", + (ulint)NumberOfPages, + (ulint)NumberOfPagesInitial); + + return(FALSE); + } + + fprintf(stderr, +"InnoDB: Using Address Windowing Extensions (AWE); allocated %lu MB\n", + n_megabytes); + + return(TRUE); +#else + return(FALSE); +#endif +} + +/******************************************************************** +Allocates a window in the virtual address space where we can map the +pages of physical memory. */ + +byte* +os_awe_allocate_virtual_mem_window( +/*===============================*/ + /* out, own: allocated memory, or NULL if did not + succeed */ + ulint size) /* in: virtual memory allocation size in bytes, must + be < 2 GB */ +{ +#ifdef UNIV_SIMULATE_AWE + ulint i; + + os_awe_simulate_window = ut_align(ut_malloc(4096 + size), 4096); + os_awe_simulate_window_size = size; + + os_awe_simulate_map = ut_malloc(sizeof(byte*) * (size / 4096)); + + for (i = 0; i < (size / 4096); i++) { + *(os_awe_simulate_map + i) = NULL; + } + + return(os_awe_simulate_window); + +#elif defined(__NT__) + byte* ptr; + + if (size > 0x7FFFFFFFFF) { + fprintf(stderr, +"InnoDB: AWE: Cannot allocate %lu bytes of virtual memory\n", size); + + return(NULL); + } + + ptr = VirtualAlloc(NULL, (SIZE_T)size, MEM_RESERVE | MEM_PHYSICAL, + PAGE_READWRITE); + if (ptr == NULL) { + fprintf(stderr, +"InnoDB: AWE: Cannot allocate %lu bytes of virtual memory, error %lu\n", + size, (ulint)GetLastError()); + + return(NULL); + } + + os_awe_window = ptr; + os_awe_window_size = size; + + ut_total_allocated_memory += size; + + return(ptr); +#else + return(NULL); +#endif +} +
+/******************************************************************** +With this function you can map parts of physical memory allocated with +the ..._allocate_physical_mem to the virtual address space allocated with +the previous function. Intel implements this so that the process page +tables are updated accordingly. A test on a 1.5 GHz AMD processor and XP +showed that this takes < 1 microsecond, much better than the estimated 80 us +for copying a 16 kB page memory to memory. But, the operation will at least +partially invalidate the translation lookaside buffer (TLB) of all +processors. Under a real-world load the performance hit may be bigger. */ + +ibool +os_awe_map_physical_mem_to_window( +/*==============================*/ + /* out: TRUE if success; the function + calls exit(1) in case of an error */ + byte* ptr, /* in: a page-aligned pointer to + somewhere in the virtual address + space window; we map the physical mem + pages here */ + ulint n_mem_pages, /* in: number of 4 kB mem pages to + map */ + os_awe_t* page_info) /* in: array of page infos for those + pages; each page has one slot in the + array */ +{ +#ifdef UNIV_SIMULATE_AWE + ulint i; + byte** map; + byte* page; + byte* phys_page; + + ut_a(ptr >= os_awe_simulate_window); + ut_a(ptr < os_awe_simulate_window + os_awe_simulate_window_size); + ut_a(page_info >= os_awe_simulate_page_info); + ut_a(page_info < os_awe_simulate_page_info + + (os_awe_simulate_mem_size / 4096)); + + /* First look if some other 'physical pages' are mapped at ptr, + and copy them back to where they were if yes */ + + map = os_awe_simulate_map + + ((ulint)(ptr - os_awe_simulate_window)) / 4096; + page = ptr; + + for (i = 0; i < n_mem_pages; i++) { + if (*map != NULL) { + ut_memcpy(*map, page, 4096); + } + map++; + page += 4096; + } + + /* Then copy to ptr the 'physical pages' determined by page_info; we + assume page_info is a segment of the array we created at the start */ + + phys_page = os_awe_simulate_mem + + 
(ulint)(page_info - os_awe_simulate_page_info) + * 4096; + + ut_memcpy(ptr, phys_page, n_mem_pages * 4096); + + /* Update the map */ + + map = os_awe_simulate_map + + ((ulint)(ptr - os_awe_simulate_window)) / 4096; + + for (i = 0; i < n_mem_pages; i++) { + *map = phys_page; + + map++; + phys_page += 4096; + } + + return(TRUE); + +#elif defined(__NT__) + BOOL bResult; + ULONG_PTR n_pages; + + n_pages = (ULONG_PTR)n_mem_pages; + + if (!(ptr >= os_awe_window)) { + fprintf(stderr, +"InnoDB: AWE: Error: trying to map to address %lx but AWE window start %lx\n", + (ulint)ptr, (ulint)os_awe_window); + ut_a(0); + } + + if (!(ptr <= os_awe_window + os_awe_window_size - UNIV_PAGE_SIZE)) { + fprintf(stderr, +"InnoDB: AWE: Error: trying to map to address %lx but AWE window end %lx\n", + (ulint)ptr, (ulint)os_awe_window + os_awe_window_size); + ut_a(0); + } + + if (!(page_info >= os_awe_page_info)) { + fprintf(stderr, +"InnoDB: AWE: Error: trying to map page info at %lx but array start %lx\n", + (ulint)page_info, (ulint)os_awe_page_info); + ut_a(0); + } + + if (!(page_info <= os_awe_page_info + (os_awe_n_pages - 4))) { + fprintf(stderr, +"InnoDB: AWE: Error: trying to map page info at %lx but array end %lx\n", + (ulint)page_info, (ulint)(os_awe_page_info + os_awe_n_pages)); + ut_a(0); + } + + bResult = MapUserPhysicalPages((PVOID)ptr, n_pages, page_info); + + if (bResult != TRUE) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: AWE: Mapping of %lu physical pages to address %lx failed,\n" +"InnoDB: error %lu.\n" +"InnoDB: Cannot continue operation.\n", + n_mem_pages, (ulint)ptr, (ulint)GetLastError()); + exit(1); + } + + return(TRUE); +#else + return(FALSE); +#endif +} /******************************************************************** Converts the current process id to a number. 
It is not guaranteed that the diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c index f9eba721cbc..56971ab86eb 100644 --- a/innobase/srv/srv0srv.c +++ b/innobase/srv/srv0srv.c @@ -140,9 +140,14 @@ byte srv_latin1_ordering[256] /* The sort order table of the latin1 , 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xFF }; -ulint srv_pool_size = ULINT_MAX; /* size in database pages; - MySQL originally sets this - value in megabytes */ +ulint srv_pool_size = ULINT_MAX; /* size in pages; MySQL inits + this to size in kilobytes but + we normalize this to pages in + srv_boot() */ +ulint srv_awe_window_size = 0; /* size in pages; MySQL inits + this to bytes, but we + normalize it to pages in + srv_boot() */ ulint srv_mem_pool_size = ULINT_MAX; /* size in bytes */ ulint srv_lock_table_size = ULINT_MAX; @@ -218,6 +223,13 @@ ibool srv_use_doublewrite_buf = TRUE; ibool srv_set_thread_priorities = TRUE; int srv_query_thread_priority = 0; + +/* TRUE if the Address Windowing Extensions of Windows are used; then we must +disable adaptive hash indexes */ +ibool srv_use_awe = FALSE; +ibool srv_use_adaptive_hash_indexes = TRUE; + + /*-------------------------------------------*/ ulint srv_n_spin_wait_rounds = 20; ulint srv_spin_wait_delay = 5; @@ -1956,9 +1968,19 @@ srv_normalize_init_values(void) srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE; - srv_pool_size = srv_pool_size / UNIV_PAGE_SIZE; + srv_pool_size = srv_pool_size / (UNIV_PAGE_SIZE / 1024); + + srv_awe_window_size = srv_awe_window_size / UNIV_PAGE_SIZE; - srv_lock_table_size = 20 * srv_pool_size; + if (srv_use_awe) { + /* If we are using AWE we must save memory in the 32-bit + address space of the process, and cannot bind the lock + table size to the real buffer pool size. 
*/ + + srv_lock_table_size = 20 * srv_awe_window_size; + } else { + srv_lock_table_size = 20 * srv_pool_size; + } return(DB_SUCCESS); } @@ -2323,6 +2345,12 @@ srv_sprintf_innodb_monitor( "Total memory allocated %lu; in additional pool allocated %lu\n", ut_total_allocated_memory, mem_pool_get_reserved(mem_comm_pool)); + if (srv_use_awe) { + buf += sprintf(buf, + "In addition to that %lu MB of AWE memory allocated\n", + srv_pool_size / ((1024 * 1024) / UNIV_PAGE_SIZE)); + } + buf_print_io(buf, buf_end); buf = buf + strlen(buf); ut_a(buf < buf_end + 1500); diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c index ec674b69256..e1d436a879c 100644 --- a/innobase/srv/srv0start.c +++ b/innobase/srv/srv0start.c @@ -935,6 +935,7 @@ innobase_start_or_create_for_mysql(void) /*====================================*/ /* out: DB_SUCCESS or error code */ { + buf_pool_t* ret; ibool create_new_db; ibool log_file_created; ibool log_created = FALSE; @@ -972,6 +973,11 @@ innobase_start_or_create_for_mysql(void) "InnoDB: !!!!!!!!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!!!!!!!\n"); #endif +#ifdef UNIV_SIMULATE_AWE + fprintf(stderr, +"InnoDB: !!!!!!!!!!!!!! 
UNIV_SIMULATE_AWE switched on !!!!!!!!!!!!!!!!!\n"); +#endif + if (srv_sizeof_trx_t_in_ha_innodb_cc != (ulint)sizeof(trx_t)) { fprintf(stderr, "InnoDB: Error: trx_t size is %lu in ha_innodb.cc but %lu in srv0start.c\n" @@ -1002,6 +1008,17 @@ innobase_start_or_create_for_mysql(void) srv_startup_is_before_trx_rollback_phase = TRUE; os_aio_use_native_aio = FALSE; +#if !defined(__NT__) && !defined(UNIV_SIMULATE_AWE) + if (srv_use_awe) { + + fprintf(stderr, +"InnoDB: Error: You have specified innodb_buffer_pool_awe_mem_mb\n" +"InnoDB: in my.cnf, but AWE can only be used in Windows 2000 and later.\n"); + + return(DB_ERROR); + } +#endif + #ifdef __WIN__ if (os_get_os_version() == OS_WIN95 || os_get_os_version() == OS_WIN31 @@ -1057,6 +1074,9 @@ innobase_start_or_create_for_mysql(void) return(DB_ERROR); } + /* Note that the call srv_boot() also changes the values of + srv_pool_size etc. to the units used by InnoDB internally */ + err = srv_boot(); if (err != DB_SUCCESS) { @@ -1088,7 +1108,26 @@ innobase_start_or_create_for_mysql(void) fil_init(SRV_MAX_N_OPEN_FILES); - buf_pool_init(srv_pool_size, srv_pool_size); + if (srv_use_awe) { + fprintf(stderr, +"InnoDB: Using AWE: Memory window is %lu MB and AWE memory is %lu MB\n", + srv_awe_window_size / ((1024 * 1024) / UNIV_PAGE_SIZE), + srv_pool_size / ((1024 * 1024) / UNIV_PAGE_SIZE)); + + /* We must disable adaptive hash indexes because they do not + tolerate remapping of pages in AWE */ + + srv_use_adaptive_hash_indexes = FALSE; + ret = buf_pool_init(srv_pool_size, srv_pool_size, + srv_awe_window_size); + } else { + ret = buf_pool_init(srv_pool_size, srv_pool_size, + srv_pool_size); + } + + if (ret == NULL) { + return(DB_ERROR); + } fsp_init(); log_init(); diff --git a/innobase/trx/trx0sys.c b/innobase/trx/trx0sys.c index 0c10040847e..33c962772e8 100644 --- a/innobase/trx/trx0sys.c +++ b/innobase/trx/trx0sys.c @@ -472,9 +472,9 @@ trx_sys_update_mysql_binlog_offset( if (0 != ut_memcmp(sys_header + field + 
TRX_SYS_MYSQL_LOG_NAME, file_name, 1 + ut_strlen(file_name))) { - mlog_write_string((byte*) (sys_header + field - + TRX_SYS_MYSQL_LOG_NAME), - (byte*) file_name, 1 + ut_strlen(file_name), mtr); + mlog_write_string(sys_header + field + + TRX_SYS_MYSQL_LOG_NAME, + file_name, 1 + ut_strlen(file_name), mtr); } if (mach_read_from_4(sys_header + field diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index d2219ed019f..f0077f941de 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -99,7 +99,7 @@ trx_create( trx->mysql_log_file_name = NULL; trx->mysql_log_offset = 0; - trx->mysql_master_log_file_name = (char*) ""; + trx->mysql_master_log_file_name = ""; trx->mysql_master_log_pos = 0; trx->ignore_duplicates_in_insert = FALSE; diff --git a/innobase/ut/ut0ut.c b/innobase/ut/ut0ut.c index bb5eb662cd7..ff5d11d84ed 100644 --- a/innobase/ut/ut0ut.c +++ b/innobase/ut/ut0ut.c @@ -197,6 +197,7 @@ ut_get_year_month_day( *month = (ulint)cal_tm.wMonth; *day = (ulint)cal_tm.wDay; #else + struct tm cal_tm; struct tm* cal_tm_ptr; time_t tm; |