Diffstat (limited to 'sql/uniques.cc')
-rw-r--r-- | sql/uniques.cc | 244
1 files changed, 244 insertions, 0 deletions
diff --git a/sql/uniques.cc b/sql/uniques.cc
index a4a5250192d..037474fae5d 100644
--- a/sql/uniques.cc
+++ b/sql/uniques.cc
@@ -63,12 +63,256 @@ Unique::Unique(qsort_cmp2 comp_func, void * comp_func_fixed_arg,
             comp_func_fixed_arg);
   /* If the following fail's the next add will also fail */
   my_init_dynamic_array(&file_ptrs, sizeof(BUFFPEK), 16, 16);
+  /*
+    If you change the following, change it in get_max_elements function, too.
+  */
   max_elements= max_in_memory_size / ALIGN_SIZE(sizeof(TREE_ELEMENT)+size);
   open_cached_file(&file, mysql_tmpdir,TEMP_PREFIX, DISK_BUFFER_SIZE,
                    MYF(MY_WME));
 }
 
+/*
+  Calculate log2(n!)
+
+  NOTES
+    Stirling's approximate formula is used:
+
+      n! ~= sqrt(2*M_PI*n) * (n/M_E)^n
+
+    Derivation of the formula used for calculations is as follows:
+
+    log2(n!) = log(n!)/log(2) = log(sqrt(2*M_PI*n)*(n/M_E)^n) / log(2) =
+
+      = (log(2*M_PI*n)/2 + n*log(n/M_E)) / log(2).
+*/
+
+inline double log2_n_fact(double x)
+{
+  return (log(2*M_PI*x)/2 + x*log(x/M_E)) / M_LN2;
+}
+
+
+/*
+  Calculate the cost of a merge_buffers call for a given sequence of input
+  stream lengths and store the number of rows in the result stream in
+  *output_buff.
+
+  SYNOPSIS
+    get_merge_buffers_cost()
+      buff_elems   Array of #s of elements in buffers
+      elem_size    Size of element stored in buffer
+      output_buff  Pointer to storage for result buffer size
+      first        Pointer to first merged element size
+      last         Pointer to last merged element size
+
+  RETURN
+    Cost of merge_buffers operation in disk seeks.
+
+  NOTES
+    It is assumed that no rows are eliminated during merge.
+    The cost is calculated as
+
+      cost(read_and_write) + cost(merge_comparisons).
+
+    All bytes in the sequences are read and written back during merge, so the
+    cost of disk io is 2*elem_size*total_buf_elems/IO_SIZE (2 is for read +
+    write).
+
+    For comparison cost calculations we assume that all merged sequences have
+    the same length, so each of total_buf_elems elements will be added to a
+    sort heap with (n_buffers-1) elements. This gives the comparison cost:
+
+      total_buf_elems * log2(n_buffers) / TIME_FOR_COMPARE_ROWID;
+*/
+
+static double get_merge_buffers_cost(uint *buff_elems, uint elem_size,
+                                     uint *output_buff, uint *first,
+                                     uint *last)
+{
+  uint total_buf_elems= 0;
+  for (uint *pbuf= first; pbuf <= last; pbuf++)
+    total_buf_elems+= *pbuf;
+  *output_buff= total_buf_elems;
+
+  int n_buffers= last - first + 1;
+
+  /* Using log2(n)=log(n)/log(2) formula */
+  return 2*((double)total_buf_elems*elem_size) / IO_SIZE +
+     total_buf_elems*log(n_buffers) / (TIME_FOR_COMPARE_ROWID * M_LN2);
+}
+
+
+/*
+  Calculate the cost of merging buffers into one in Unique::get, i.e.
+  calculate how long (in terms of disk seeks) the two calls
+    merge_many_buffs(...);
+    merge_buffers(...);
+  will take.
+
+  SYNOPSIS
+    get_merge_many_buffs_cost()
+      buffer        buffer space for temporary data, at least
+                    Unique::get_cost_calc_buff_size bytes
+      maxbuffer     # of full buffers
+      max_n_elems   # of elements in first maxbuffer buffers
+      last_n_elems  # of elements in last buffer
+      elem_size     size of buffer element
+
+  NOTES
+    maxbuffer+1 buffers are merged, where the first maxbuffer buffers contain
+    max_n_elems elements each and the last buffer contains last_n_elems
+    elements.
+
+    The current implementation does a dumb simulation of merge_many_buff
+    function actions.
+
+  RETURN
+    Cost of merge in disk seeks.
+*/
+
+static double get_merge_many_buffs_cost(uint *buffer,
+                                        uint maxbuffer, uint max_n_elems,
+                                        uint last_n_elems, int elem_size)
+{
+  register int i;
+  double total_cost= 0.0;
+  uint *buff_elems= buffer; /* #s of elements in each of merged sequences */
+  uint *lastbuff;
+
+  /*
+    Set initial state: first maxbuffer sequences contain max_n_elems elements
+    each, last sequence contains last_n_elems elements.
+  */
+  for (i = 0; i < (int)maxbuffer; i++)
+    buff_elems[i]= max_n_elems;
+  buff_elems[maxbuffer]= last_n_elems;
+
+  /*
+    Do it exactly as merge_many_buff function does, calling
+    get_merge_buffers_cost to get the cost of each merge_buffers call.
+  */
+  if (maxbuffer >= MERGEBUFF2)
+  {
+    while (maxbuffer >= MERGEBUFF2)
+    {
+      lastbuff= buff_elems;
+      for (i = 0; i <= (int) maxbuffer - MERGEBUFF*3/2; i += MERGEBUFF)
+        total_cost+= get_merge_buffers_cost(buff_elems, elem_size, lastbuff++,
+                                            buff_elems + i,
+                                            buff_elems + i + MERGEBUFF-1);
+      total_cost+= get_merge_buffers_cost(buff_elems, elem_size, lastbuff++,
+                                          buff_elems + i,
+                                          buff_elems + maxbuffer);
+      maxbuffer= (uint) (lastbuff - buff_elems) - 1;
+    }
+  }
+
+  /* Simulate the final merge_buffers call. */
+  total_cost+= get_merge_buffers_cost(buff_elems, elem_size, buff_elems,
+                                      buff_elems, buff_elems + maxbuffer);
+  return total_cost;
+}
+
+
+/*
+  Calculate the cost of using Unique for processing nkeys elements of size
+  key_size using max_in_memory_size bytes of memory.
+
+  SYNOPSIS
+    Unique::get_use_cost()
+      buffer              space for temporary data, use
+                          Unique::get_cost_calc_buff_size to get the # of
+                          bytes needed.
+      nkeys               # of elements in Unique
+      key_size            size of each element in bytes
+      max_in_memory_size  amount of memory Unique will be allowed to use
+
+  RETURN
+    Cost in disk seeks.
+
+  NOTES
+    cost(using_unique) =
+      cost(create_trees) +    (see #1)
+      cost(merge) +           (see #2)
+      cost(read_result)       (see #3)
+
+    1. Cost of tree creation
+       For each Unique::put operation there will be 2*log2(n+1) element
+       comparisons, where n runs from 1 to tree_size (we assume that all added
+       elements are different). Together this gives:
+
+         n_compares = 2*(log2(2) + log2(3) + ... + log2(N+1)) = 2*log2((N+1)!)
+
+       then cost(tree_creation) = n_compares / TIME_FOR_COMPARE_ROWID;
+
+       Total cost of creating trees:
+         (n_trees - 1)*max_size_tree_cost + non_max_size_tree_cost.
+
+       An approximate value of log2(N!) is calculated by the log2_n_fact
+       function.
+
+    2. Cost of merging.
+       If only one tree is created by Unique, no merging will be necessary.
+       Otherwise, we model the execution of the merge_many_buff function and
+       count the # of merges. (The reason behind this is that the number of
+       buffers is small, while the size of the buffers is big, and we don't
+       want to lose precision with an O(x)-style formula.)
+
+    3. If only one tree is created by Unique, no disk io will happen.
+       Otherwise, ceil(key_size*nkeys/IO_SIZE) disk seeks are necessary. We
+       assume these will be random seeks.
+*/
+
+double Unique::get_use_cost(uint *buffer, uint nkeys, uint key_size,
+                            ulong max_in_memory_size)
+{
+  ulong max_elements_in_tree;
+  ulong last_tree_elems;
+  int n_full_trees;       /* number of trees in unique - 1 */
+  double result;
+
+  max_elements_in_tree=
+    max_in_memory_size / ALIGN_SIZE(sizeof(TREE_ELEMENT)+key_size);
+
+  n_full_trees=    nkeys / max_elements_in_tree;
+  last_tree_elems= nkeys % max_elements_in_tree;
+
+  /* Calculate cost of creating trees */
+  result= 2*log2_n_fact(last_tree_elems + 1.0);
+  if (n_full_trees)
+    result+= n_full_trees * log2_n_fact(max_elements_in_tree + 1.0);
+  result /= TIME_FOR_COMPARE_ROWID;
+
+  DBUG_PRINT("info",("unique trees sizes: %u=%u*%lu + %lu", nkeys,
+                     n_full_trees, n_full_trees?max_elements_in_tree:0,
+                     last_tree_elems));
+
+  if (!n_full_trees)
+    return result;
+
+  /*
+    There is more than one tree and merging is necessary.
+    First, add the cost of writing all trees to disk, assuming that all disk
+    writes are sequential.
+  */
+  result += DISK_SEEK_BASE_COST * n_full_trees *
+            ceil(((double) key_size)*max_elements_in_tree / IO_SIZE);
+  result += DISK_SEEK_BASE_COST *
+            ceil(((double) key_size)*last_tree_elems / IO_SIZE);
+
+  /* Cost of merge */
+  double merge_cost= get_merge_many_buffs_cost(buffer, n_full_trees,
+                                               max_elements_in_tree,
+                                               last_tree_elems, key_size);
+  if (merge_cost < 0.0)
+    return merge_cost;
+
+  result += merge_cost;
+  /*
+    Add the cost of reading the resulting sequence, assuming there were no
+    duplicate elements.
+  */
+  result += ceil((double)key_size*nkeys/IO_SIZE);
+
+  return result;
+}
+
 Unique::~Unique()
 {
   close_cached_file(&file);
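The Stirling-based log2_n_fact() added by this patch can be sanity-checked outside the server. The following standalone program (not part of the patch, only <math.h>/<stdio.h> assumed) restates the same formula and compares it with the exact value of log2(n!), computed as the sum of log2(k) for k = 2..n:

/*
  Standalone check: Stirling approximation of log2(n!) vs. the exact sum.
  M_PI, M_E and M_LN2 come from <math.h>.
*/
#include <math.h>
#include <stdio.h>

static double log2_n_fact_stirling(double x)
{
  return (log(2*M_PI*x)/2 + x*log(x/M_E)) / M_LN2;
}

static double log2_n_fact_exact(unsigned n)
{
  double s= 0.0;
  for (unsigned k= 2; k <= n; k++)
    s+= log((double) k) / M_LN2;           /* log2(k) = ln(k)/ln(2) */
  return s;
}

int main()
{
  /* The relative error shrinks quickly as n grows. */
  unsigned tests[]= { 10, 100, 1000, 100000 };
  for (unsigned i= 0; i < sizeof(tests)/sizeof(tests[0]); i++)
  {
    unsigned n= tests[i];
    printf("n=%-7u stirling=%.3f exact=%.3f\n",
           n, log2_n_fact_stirling((double) n), log2_n_fact_exact(n));
  }
  return 0;
}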
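The per-pass cost formula used in get_merge_buffers_cost() — 2*total_bytes/IO_SIZE for the read+write pass plus total_elems*log2(n_buffers)/TIME_FOR_COMPARE_ROWID for the sort-heap comparisons — can also be explored in isolation. The sketch below restates that formula with placeholder values for IO_SIZE and TIME_FOR_COMPARE_ROWID; the real values come from the server headers and may differ:

#include <math.h>
#include <stdio.h>

static const double kIoSize= 4096.0;        /* assumed IO_SIZE */
static const double kCmpRowidPerSeek= 10.0; /* assumed TIME_FOR_COMPARE_ROWID */

/*
  Cost, in disk seeks, of one pass that merges n_buffers runs holding
  total_elems elements of elem_size bytes each (no rows eliminated).
*/
static double merge_pass_cost(double total_elems, double elem_size,
                              double n_buffers)
{
  double io_cost=  2.0 * total_elems * elem_size / kIoSize;  /* read + write */
  double cmp_cost= total_elems * (log(n_buffers) / M_LN2) / kCmpRowidPerSeek;
  return io_cost + cmp_cost;
}

int main()
{
  /* e.g. 7 runs of 100000 rowids of 8 bytes merged in a single pass */
  printf("cost=%.2f seeks\n", merge_pass_cost(7*100000.0, 8.0, 7.0));
  return 0;
}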
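Finally, to see how Unique::get_use_cost() composes its estimate, the standalone sketch below reproduces the documented decomposition — tree creation, sequential writes of each tree, and reading the merged result — with the merge term omitted (in the server it comes from get_merge_many_buffs_cost). DISK_SEEK_BASE_COST, IO_SIZE, TIME_FOR_COMPARE_ROWID and the TREE_ELEMENT overhead are placeholder assumptions here, and ALIGN_SIZE is ignored:

#include <math.h>
#include <stdio.h>

static const double   kIoSize= 4096.0;          /* assumed IO_SIZE */
static const double   kCmpRowidPerSeek= 10.0;   /* assumed TIME_FOR_COMPARE_ROWID */
static const double   kSeekBaseCost= 1.0;       /* assumed DISK_SEEK_BASE_COST */
static const unsigned kTreeElemOverhead= 24;    /* assumed TREE_ELEMENT overhead */

static double log2_n_fact(double x)
{
  return (log(2*M_PI*x)/2 + x*log(x/M_E)) / M_LN2;
}

int main()
{
  unsigned long nkeys= 1000000, key_size= 8, max_in_memory= 1UL << 20;

  unsigned long max_elems_in_tree= max_in_memory / (kTreeElemOverhead + key_size);
  unsigned long n_full_trees=      nkeys / max_elems_in_tree;
  unsigned long last_tree_elems=   nkeys % max_elems_in_tree;

  /* 1. Tree creation: ~2*log2((N+1)!) comparisons; mirrors the code above,
        which applies the factor 2 only to the last, partly filled tree. */
  double cost= 2*log2_n_fact(last_tree_elems + 1.0);
  cost+= n_full_trees * log2_n_fact(max_elems_in_tree + 1.0);
  cost/= kCmpRowidPerSeek;

  /* 2. Sequential writes of every full tree, plus the last one. */
  cost+= kSeekBaseCost * n_full_trees *
         ceil((double) key_size * max_elems_in_tree / kIoSize);
  cost+= kSeekBaseCost * ceil((double) key_size * last_tree_elems / kIoSize);

  /* 3. Reading the merged result back (the merge cost itself is omitted). */
  cost+= ceil((double) key_size * nkeys / kIoSize);

  printf("trees=%lu+1, approx cost (w/o merge) = %.1f seeks\n",
         n_full_trees, cost);
  return 0;
}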