diff options
author | heikki@hundin.mysql.fi <> | 2003-01-06 22:07:25 +0200 |
---|---|---|
committer | heikki@hundin.mysql.fi <> | 2003-01-06 22:07:25 +0200 |
commit | b1b47e93b130da5540cc80e0e464041ff37ea57a (patch) | |
tree | 88f03b8d4b471aa94f27b4a3ed510bcd7c2c3ccb /innobase/include | |
parent | edb019aeaf616442d93db2eab0df4b2b09003d14 (diff) | |
download | mariadb-git-b1b47e93b130da5540cc80e0e464041ff37ea57a.tar.gz |
buf0buf.c, buf0buf.ic, buf0buf.h:
Reduce memory usage of the buffer headers
Many files:
Merge InnoDB-4.1 with AWE support
Diffstat (limited to 'innobase/include')
-rw-r--r-- | innobase/include/btr0pcur.h | 3 | ||||
-rw-r--r-- | innobase/include/buf0buf.h | 120 | ||||
-rw-r--r-- | innobase/include/buf0buf.ic | 119 | ||||
-rw-r--r-- | innobase/include/buf0lru.h | 4 | ||||
-rw-r--r-- | innobase/include/log0recv.h | 7 | ||||
-rw-r--r-- | innobase/include/os0proc.h | 70 | ||||
-rw-r--r-- | innobase/include/srv0srv.h | 3 |
7 files changed, 240 insertions, 86 deletions
diff --git a/innobase/include/btr0pcur.h b/innobase/include/btr0pcur.h index 9d07dd0de18..81f19af4d40 100644 --- a/innobase/include/btr0pcur.h +++ b/innobase/include/btr0pcur.h @@ -466,6 +466,9 @@ struct btr_pcur_struct{ BTR_PCUR_AFTER, depending on whether cursor was on, before, or after the old_rec record */ + buf_block_t* block_when_stored;/* buffer block when the position was + stored; note that if AWE is on, frames + may move */ dulint modify_clock; /* the modify clock value of the buffer block when the cursor position was stored */ diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h index 395f88a2c7c..c7db3d9bcc9 100644 --- a/innobase/include/buf0buf.h +++ b/innobase/include/buf0buf.h @@ -30,6 +30,7 @@ Created 11/5/1995 Heikki Tuuri #include "sync0rw.h" #include "hash0hash.h" #include "ut0byte.h" +#include "os0proc.h" /* Flags for flush types */ #define BUF_FLUSH_LRU 1 @@ -58,23 +59,34 @@ extern ibool buf_debug_prints;/* If this is set TRUE, the program occurs */ /************************************************************************ -Initializes the buffer pool of the database. */ +Creates the buffer pool. */ -void +buf_pool_t* buf_pool_init( /*==========*/ - ulint max_size, /* in: maximum size of the pool in blocks */ - ulint curr_size); /* in: current size to use, must be <= + /* out, own: buf_pool object, NULL if not + enough memory or error */ + ulint max_size, /* in: maximum size of the buf_pool in + blocks */ + ulint curr_size, /* in: current size to use, must be <= + max_size, currently must be equal to max_size */ + ulint n_frames); /* in: number of frames; if AWE is used, + this is the size of the address space window + where physical memory pages are mapped; if + AWE is not used then this must be the same + as max_size */ /************************************************************************* -Gets the current size of buffer pool in bytes. */ +Gets the current size of buffer buf_pool in bytes. 
In the case of AWE, the +size of AWE window (= the frames). */ UNIV_INLINE ulint buf_pool_get_curr_size(void); /*========================*/ /* out: size in bytes */ /************************************************************************* -Gets the maximum size of buffer pool in bytes. */ +Gets the maximum size of buffer pool in bytes. In the case of AWE, the +size of AWE window (= the frames). */ UNIV_INLINE ulint buf_pool_get_max_size(void); @@ -138,8 +150,8 @@ improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed as LA! */ NOTE! The following macros should be used instead of buf_page_optimistic_get_func, to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed as LA! */ -#define buf_page_optimistic_get(LA, G, MC, MTR) buf_page_optimistic_get_func(\ - LA, G, MC, IB__FILE__, __LINE__, MTR) +#define buf_page_optimistic_get(LA, BL, G, MC, MTR) buf_page_optimistic_get_func(\ + LA, BL, G, MC, IB__FILE__, __LINE__, MTR) /************************************************************************ This is the general function used to get optimistic access to a database page. */ @@ -149,7 +161,9 @@ buf_page_optimistic_get_func( /*=========================*/ /* out: TRUE if success */ ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ - buf_frame_t* guess, /* in: guessed frame */ + buf_block_t* block, /* in: guessed block */ + buf_frame_t* guess, /* in: guessed frame; note that AWE may move + frames */ dulint modify_clock,/* in: modify clock value if mode is ..._GUESS_ON_CLOCK */ char* file, /* in: file name */ @@ -350,6 +364,16 @@ buf_frame_modify_clock_inc( /* out: new value */ buf_frame_t* frame); /* in: pointer to a frame */ /************************************************************************ +Increments the modify clock of a frame by 1. The caller must (1) own the +buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock +on the block. 
*/ +UNIV_INLINE +dulint +buf_block_modify_clock_inc( +/*=======================*/ + /* out: new value */ + buf_block_t* block); /* in: block */ +/************************************************************************ Returns the value of the modify clock. The caller must have an s-lock or x-lock on the block. */ UNIV_INLINE @@ -428,7 +452,7 @@ UNIV_INLINE buf_frame_t* buf_frame_align( /*============*/ - /* out: pointer to block */ + /* out: pointer to frame */ byte* ptr); /* in: pointer to a frame */ /*********************************************************************** Checks if a pointer points to the block array of the buffer pool (blocks, not @@ -505,6 +529,19 @@ buf_pool_invalidate(void); --------------------------- LOWER LEVEL ROUTINES ------------------------- =========================================================================*/ +/************************************************************************ +Maps the page of block to a frame, if not mapped yet. Unmaps some page +from the end of the awe_LRU_free_mapped list. */ + +void +buf_awe_map_page_to_frame( +/*======================*/ + buf_block_t* block, /* in: block whose page should be + mapped to a frame */ + ibool add_to_mapped_list);/* in: TRUE if, in the case + we need to map the page, we should also + add the block to the + awe_LRU_free_mapped list */ /************************************************************************* Adds latch level info for the rw-lock protecting the buffer frame. 
This should be called in the debug version after a successful latching of a @@ -638,7 +675,16 @@ struct buf_block_struct{ byte* frame; /* pointer to buffer frame which is of size UNIV_PAGE_SIZE, and aligned to an address divisible by - UNIV_PAGE_SIZE */ + UNIV_PAGE_SIZE; if AWE is used, this + will be NULL for the pages which are + currently not mapped into the virtual + address space window of the buffer + pool */ + os_awe_t* awe_info; /* if AWE is used, then an array of + awe page infos for + UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE + (normally = 4) physical memory + pages; otherwise NULL */ ulint space; /* space id of the page */ ulint offset; /* page number within the space */ ulint lock_hash_val; /* hashed value of the page address @@ -691,6 +737,10 @@ struct buf_block_struct{ /* node of the free block list */ UT_LIST_NODE_T(buf_block_t) LRU; /* node of the LRU list */ + UT_LIST_NODE_T(buf_block_t) awe_LRU_free_mapped; + /* in the AWE version node in the + list of free and LRU blocks which are + mapped to a frame */ ulint LRU_position; /* value which monotonically decreases (or may stay constant if the block is in the old blocks) toward @@ -758,11 +808,12 @@ struct buf_block_struct{ BTR_SEARCH_RIGHT_SIDE in hash indexing */ /* 6. 
Debug fields */ - +#ifdef UNIV_SYNC_DEBUG rw_lock_t debug_latch; /* in the debug version, each thread which bufferfixes the block acquires an s-latch here; so we can use the debug utilities in sync0rw */ +#endif ibool file_page_was_freed; /* this is set to TRUE when fsp frees a page in buffer pool */ @@ -781,16 +832,36 @@ struct buf_pool_struct{ struct and control blocks, except the read-write lock in them */ byte* frame_mem; /* pointer to the memory area which - was allocated for the frames */ + was allocated for the frames; in AWE + this is the virtual address space + window where we map pages stored + in physical memory */ byte* frame_zero; /* pointer to the first buffer frame: this may differ from frame_mem, because this is aligned by the frame size */ - byte* high_end; /* pointer to the end of the - buffer pool */ + byte* high_end; /* pointer to the end of the buffer + frames */ + ulint n_frames; /* number of frames */ buf_block_t* blocks; /* array of buffer control blocks */ + buf_block_t** blocks_of_frames;/* inverse mapping which can be used + to retrieve the buffer control block + of a frame; this is an array which + lists the blocks of frames in the + order frame_zero, + frame_zero + UNIV_PAGE_SIZE, ... 
+ a control block is always assigned + for each frame, even if the frame does + not contain any data; note that in AWE + there are more control blocks than + buffer frames */ + os_awe_t* awe_info; /* if AWE is used, AWE info for the + physical 4 kB memory pages associated + with buffer frames */ ulint max_size; /* number of control blocks == maximum pool size in pages */ - ulint curr_size; /* current pool size in pages */ + ulint curr_size; /* current pool size in pages; + currently always the same as + max_size */ hash_table_t* page_hash; /* hash table of the file pages */ ulint n_pend_reads; /* number of pending read operations */ @@ -802,11 +873,14 @@ struct buf_pool_struct{ ulint n_pages_created;/* number of pages created in the pool with no read */ ulint n_page_gets; /* number of page gets performed; - also successful seraches through + also successful searches through the adaptive hash index are counted as page gets; this field is NOT protected by the buffer pool mutex */ + ulint n_pages_awe_remapped; /* if AWE is enabled, the + number of remaps of blocks to + buffer frames */ ulint n_page_gets_old;/* n_page_gets when buf_print was last time called: used to calculate hit rate */ @@ -815,6 +889,7 @@ struct buf_pool_struct{ ulint n_pages_written_old;/* number write operations */ ulint n_pages_created_old;/* number of pages created in the pool with no read */ + ulint n_pages_awe_remapped_old; /* 2. Page flushing algorithm fields */ UT_LIST_BASE_NODE_T(buf_block_t) flush_list; @@ -847,7 +922,10 @@ struct buf_pool_struct{ /* 3. 
LRU replacement algorithm fields */ UT_LIST_BASE_NODE_T(buf_block_t) free; - /* base node of the free block list */ + /* base node of the free block list; + in the case of AWE, at the start are + always free blocks for which the + physical memory is mapped to a frame */ UT_LIST_BASE_NODE_T(buf_block_t) LRU; /* base node of the LRU list */ buf_block_t* LRU_old; /* pointer to the about 3/8 oldest @@ -859,6 +937,12 @@ struct buf_pool_struct{ see buf0lru.c for the restrictions on this value; not defined if LRU_old == NULL */ + UT_LIST_BASE_NODE_T(buf_block_t) awe_LRU_free_mapped; + /* list of those blocks which are + in the LRU list or the free list, and + where the page is mapped to a frame; + thus, frames allocated, e.g., to the + lock table, are not in this list */ }; /* States of a control block */ diff --git a/innobase/include/buf0buf.ic b/innobase/include/buf0buf.ic index 7227c79dc6a..d4e7122f3f9 100644 --- a/innobase/include/buf0buf.ic +++ b/innobase/include/buf0buf.ic @@ -36,25 +36,27 @@ buf_block_peek_if_too_old( } /************************************************************************* -Gets the current size of buffer buf_pool in bytes. */ +Gets the current size of buffer buf_pool in bytes. In the case of AWE, the +size of AWE window (= the frames). */ UNIV_INLINE ulint buf_pool_get_curr_size(void) /*========================*/ /* out: size in bytes */ { - return((buf_pool->curr_size) * UNIV_PAGE_SIZE); + return((buf_pool->n_frames) * UNIV_PAGE_SIZE); } /************************************************************************* -Gets the maximum size of buffer buf_pool in bytes. */ +Gets the maximum size of buffer buf_pool in bytes. In the case of AWE, the +size of AWE window (= the frames). 
*/ UNIV_INLINE ulint buf_pool_get_max_size(void) /*=======================*/ /* out: size in bytes */ { - return((buf_pool->max_size) * UNIV_PAGE_SIZE); + return((buf_pool->n_frames) * UNIV_PAGE_SIZE); } /*********************************************************************** @@ -207,54 +209,24 @@ buf_block_align( frame_zero = buf_pool->frame_zero; - ut_ad((ulint)ptr >= (ulint)frame_zero); - - block = buf_pool_get_nth_block(buf_pool, ((ulint)(ptr - frame_zero)) - >> UNIV_PAGE_SIZE_SHIFT); - if (block < buf_pool->blocks - || block >= buf_pool->blocks + buf_pool->max_size) { + if ((ulint)ptr < (ulint)frame_zero + || (ulint)ptr > (ulint)(buf_pool->high_end)) { + ut_print_timestamp(stderr); fprintf(stderr, -"InnoDB: Error: trying to access a stray pointer %lx\n" -"InnoDB: buf pool start is at %lx, number of pages %lu\n", (ulint)ptr, - (ulint)frame_zero, buf_pool->max_size); +" InnoDB: Error: trying to access a stray pointer %lx\n" +"InnoDB: buf pool start is at %lx, end at %lx\n" +"InnoDB: Probable reason is database corruption or memory\n" +"InnoDB: corruption. If this happens in an InnoDB database recovery,\n" +"InnoDB: you can look from section 6.1 at http://www.innodb.com/ibman.html\n" +"InnoDB: how to force recovery.\n", + (ulint)ptr, (ulint)frame_zero, + (ulint)(buf_pool->high_end)); ut_a(0); } - - return(block); -} - -/*********************************************************************** -Gets the block to whose frame the pointer is pointing to. Does not -require a file page to be bufferfixed. 
*/ -UNIV_INLINE -buf_block_t* -buf_block_align_low( -/*================*/ - /* out: pointer to block */ - byte* ptr) /* in: pointer to a frame */ -{ - buf_block_t* block; - buf_frame_t* frame_zero; - - ut_ad(ptr); - - frame_zero = buf_pool->frame_zero; - - ut_ad((ulint)ptr >= (ulint)frame_zero); - - block = buf_pool_get_nth_block(buf_pool, ((ulint)(ptr - frame_zero)) - >> UNIV_PAGE_SIZE_SHIFT); - if (block < buf_pool->blocks - || block >= buf_pool->blocks + buf_pool->max_size) { - - fprintf(stderr, -"InnoDB: Error: trying to access a stray pointer %lx\n" -"InnoDB: buf pool start is at %lx, number of pages %lu\n", (ulint)ptr, - (ulint)frame_zero, buf_pool->max_size); - ut_a(0); - } - + + block = *(buf_pool->blocks_of_frames + (((ulint)(ptr - frame_zero)) + >> UNIV_PAGE_SIZE_SHIFT)); return(block); } @@ -264,7 +236,7 @@ UNIV_INLINE buf_frame_t* buf_frame_align( /*============*/ - /* out: pointer to block */ + /* out: pointer to frame */ byte* ptr) /* in: pointer to a frame */ { buf_frame_t* frame; @@ -273,14 +245,19 @@ buf_frame_align( frame = ut_align_down(ptr, UNIV_PAGE_SIZE); - if (((ulint)frame - < (ulint)(buf_pool->frame_zero)) - || ((ulint)frame > (ulint)(buf_pool_get_nth_block(buf_pool, - buf_pool->max_size - 1)->frame))) { + if (((ulint)frame < (ulint)(buf_pool->frame_zero)) + || (ulint)frame >= (ulint)(buf_pool->high_end)) { + + ut_print_timestamp(stderr); fprintf(stderr, -"InnoDB: Error: trying to access a stray pointer %lx\n" -"InnoDB: buf pool start is at %lx, number of pages %lu\n", (ulint)ptr, - (ulint)(buf_pool->frame_zero), buf_pool->max_size); +" InnoDB: Error: trying to access a stray pointer %lx\n" +"InnoDB: buf pool start is at %lx, end at %lx\n" +"InnoDB: Probable reason is database corruption or memory\n" +"InnoDB: corruption. 
If this happens in an InnoDB database recovery,\n" +"InnoDB: you can look from section 6.1 at http://www.innodb.com/ibman.html\n" +"InnoDB: how to force recovery.\n", + (ulint)ptr, (ulint)(buf_pool->frame_zero), + (ulint)(buf_pool->high_end)); ut_a(0); } @@ -469,7 +446,7 @@ buf_frame_modify_clock_inc( ut_ad(frame); - block = buf_block_align_low(frame); + block = buf_block_align(frame); ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0)) || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); @@ -480,6 +457,25 @@ buf_frame_modify_clock_inc( } /************************************************************************ +Increments the modify clock of a frame by 1. The caller must (1) own the +buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock +on the block. */ +UNIV_INLINE +dulint +buf_block_modify_clock_inc( +/*=======================*/ + /* out: new value */ + buf_block_t* block) /* in: block */ +{ + ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0)) + || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); + + UT_DULINT_INC(block->modify_clock); + + return(block->modify_clock); +} + +/************************************************************************ Returns the value of the modify clock. The caller must have an s-lock or x-lock on the block. 
*/ UNIV_INLINE @@ -508,15 +504,16 @@ void buf_block_buf_fix_inc_debug( /*========================*/ buf_block_t* block, /* in: block to bufferfix */ - char* file, /* in: file name */ - ulint line) /* in: line */ + char* file __attribute__ ((unused)), /* in: file name */ + ulint line __attribute__ ((unused))) /* in: line */ { +#ifdef UNIV_SYNC_DEBUG ibool ret; - + ret = rw_lock_s_lock_func_nowait(&(block->debug_latch), file, line); ut_ad(ret == TRUE); - +#endif block->buf_fix_count++; } diff --git a/innobase/include/buf0lru.h b/innobase/include/buf0lru.h index 946b6c4e31d..6a3c948507d 100644 --- a/innobase/include/buf0lru.h +++ b/innobase/include/buf0lru.h @@ -53,7 +53,9 @@ LRU list to the free list. */ buf_block_t* buf_LRU_get_free_block(void); /*=========================*/ - /* out: the free control block */ + /* out: the free control block; also if AWE is + used, it is guaranteed that the block has its + page mapped to a frame when we return */ /********************************************************************** Puts a block back to the free list. */ diff --git a/innobase/include/log0recv.h b/innobase/include/log0recv.h index 7418e4abf1b..bef42cfec1c 100644 --- a/innobase/include/log0recv.h +++ b/innobase/include/log0recv.h @@ -355,12 +355,7 @@ in the debug version: spaces with an odd number as the id are replicate spaces */ #define RECV_REPLICA_SPACE_ADD 1 -/* This many blocks must be left free in the buffer pool when we scan -the log and store the scanned log records in the buffer pool: we will -use these free blocks to read in pages when we start applying the -log records to the database. 
*/ - -#define RECV_POOL_N_FREE_BLOCKS (ut_min(256, buf_pool_get_curr_size() / 8)) +extern ulint recv_n_pool_free_frames; #ifndef UNIV_NONINL #include "log0recv.ic" diff --git a/innobase/include/os0proc.h b/innobase/include/os0proc.h index 79750e5c1f7..08510db4366 100644 --- a/innobase/include/os0proc.h +++ b/innobase/include/os0proc.h @@ -15,6 +15,76 @@ Created 9/30/1995 Heikki Tuuri typedef void* os_process_t; typedef unsigned long int os_process_id_t; +/* The cell type in os_awe_allocate_physical_mem page info */ +#ifdef __NT__ +typedef ULONG_PTR os_awe_t; +#else +typedef ulint os_awe_t; +#endif + +/* Physical page size when Windows AWE is used. This is the normal +page size of an Intel x86 processor. We cannot use AWE with 2 MB or 4 MB +pages. */ +#define OS_AWE_X86_PAGE_SIZE 4096 + +/******************************************************************** +Windows AWE support. Tries to enable the "lock pages in memory" privilege for +the current process so that the current process can allocate memory-locked +virtual address space to act as the window where AWE maps physical memory. */ + +ibool +os_awe_enable_lock_pages_in_mem(void); +/*=================================*/ + /* out: TRUE if success, FALSE if error; + prints error info to stderr if no success */ +/******************************************************************** +Allocates physical RAM memory up to 64 GB in an Intel 32-bit x86 +processor. */ + +ibool +os_awe_allocate_physical_mem( +/*=========================*/ + /* out: TRUE if success */ + os_awe_t** page_info, /* out, own: array of opaque data containing + the info for allocated physical memory pages; + each allocated 4 kB physical memory page has + one slot of type os_awe_t in the array */ + ulint n_megabytes); /* in: number of megabytes to allocate */ +/******************************************************************** +Allocates a window in the virtual address space where we can then map +pages of physical memory. 
*/ + +byte* +os_awe_allocate_virtual_mem_window( +/*===============================*/ + /* out, own: allocated memory, or NULL if did not + succeed */ + ulint size); /* in: virtual memory allocation size in bytes, must + be < 2 GB */ +/******************************************************************** +With this function you can map parts of physical memory allocated with +the ..._allocate_physical_mem to the virtual address space allocated with +the previous function. Intel implements this so that the process page +tables are updated accordingly. A test on a 1.5 GHz AMD processor and XP +showed that this takes < 1 microsecond, much better than the estimated 80 us +for copying a 16 kB page memory to memory. But, the operation will at least +partially invalidate the translation lookaside buffer (TLB) of all +processors. Under a real-world load the performance hit may be bigger. */ + +ibool +os_awe_map_physical_mem_to_window( +/*==============================*/ + /* out: TRUE if success; the function + calls exit(1) in case of an error */ + byte* ptr, /* in: a page-aligned pointer to + somewhere in the virtual address + space window; we map the physical mem + pages here */ + ulint n_mem_pages, /* in: number of 4 kB mem pages to + map */ + os_awe_t* page_info); /* in: array of page infos for those + pages; each page has one slot in the + array */ /******************************************************************** Converts the current process id to a number. It is not guaranteed that the number is unique. 
In Linux returns the 'process number' of the current diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h index ad6f71f7a3a..bc0960ae023 100644 --- a/innobase/include/srv0srv.h +++ b/innobase/include/srv0srv.h @@ -61,6 +61,7 @@ extern ulint srv_flush_log_at_trx_commit; extern byte srv_latin1_ordering[256];/* The sort order table of the latin1 character set */ extern ulint srv_pool_size; +extern ulint srv_awe_window_size; extern ulint srv_mem_pool_size; extern ulint srv_lock_table_size; @@ -86,6 +87,8 @@ extern ibool srv_use_doublewrite_buf; extern ibool srv_set_thread_priorities; extern int srv_query_thread_priority; +extern ibool srv_use_awe; +extern ibool srv_use_adaptive_hash_indexes; /*-------------------------------------------*/ extern ulint srv_n_rows_inserted; |