diff options
Diffstat (limited to 'sql')
-rw-r--r-- | sql/Makefile.am | 4 | ||||
-rw-r--r-- | sql/ds_mrr.cc | 1333 | ||||
-rw-r--r-- | sql/ds_mrr.h | 68 | ||||
-rw-r--r-- | sql/handler.cc | 56 | ||||
-rw-r--r-- | sql/handler.h | 258 | ||||
-rw-r--r-- | sql/item.h | 2 | ||||
-rw-r--r-- | sql/mysql_priv.h | 4 | ||||
-rw-r--r-- | sql/mysqld.cc | 9 | ||||
-rw-r--r-- | sql/opt_range.cc | 1158 | ||||
-rw-r--r-- | sql/opt_range.h | 142 | ||||
-rw-r--r-- | sql/set_var.cc | 14 | ||||
-rw-r--r-- | sql/sql_class.h | 16 | ||||
-rw-r--r-- | sql/sql_select.cc | 283 | ||||
-rw-r--r-- | sql/sql_select.h | 32 | ||||
-rw-r--r-- | sql/structs.h | 3 | ||||
-rw-r--r-- | sql/table.cc | 12 |
16 files changed, 2738 insertions, 656 deletions
diff --git a/sql/Makefile.am b/sql/Makefile.am index 00342f7034e..b2972072866 100644 --- a/sql/Makefile.am +++ b/sql/Makefile.am @@ -47,7 +47,7 @@ mysqld_LDADD = libndb.la \ $(LDADD) $(CXXLDFLAGS) $(WRAPLIBS) @LIBDL@ \ $(yassl_libs) $(openssl_libs) @MYSQLD_EXTRA_LIBS@ -noinst_HEADERS = item.h item_func.h item_sum.h item_cmpfunc.h \ +noinst_HEADERS = ds_mrr.h item.h item_func.h item_sum.h item_cmpfunc.h \ item_strfunc.h item_timefunc.h \ item_xmlfunc.h \ item_create.h item_subselect.h item_row.h \ @@ -79,7 +79,7 @@ noinst_HEADERS = item.h item_func.h item_sum.h item_cmpfunc.h \ sql_partition.h partition_info.h partition_element.h \ contributors.h sql_servers.h -mysqld_SOURCES = sql_lex.cc sql_handler.cc sql_partition.cc \ +mysqld_SOURCES = ds_mrr.cc sql_lex.cc sql_handler.cc sql_partition.cc \ item.cc item_sum.cc item_buff.cc item_func.cc \ item_cmpfunc.cc item_strfunc.cc item_timefunc.cc \ thr_malloc.cc item_create.cc item_subselect.cc \ diff --git a/sql/ds_mrr.cc b/sql/ds_mrr.cc new file mode 100644 index 00000000000..7a0fc65b8f5 --- /dev/null +++ b/sql/ds_mrr.cc @@ -0,0 +1,1333 @@ +#include "mysql_priv.h" +#include "sql_select.h" + +/* ************************************************************************** + * DS-MRR implementation + ***************************************************************************/ + +/** + DS-MRR: Initialize and start MRR scan + + Initialize and start the MRR scan. Depending on the mode parameter, this + may use default or DS-MRR implementation. + + @param h Table handler to be used + @param key Index to be used + @param seq_funcs Interval sequence enumeration functions + @param seq_init_param Interval sequence enumeration parameter + @param n_ranges Number of ranges in the sequence. + @param mode HA_MRR_* modes to use + @param buf INOUT Buffer to use + + @retval 0 Ok, Scan started. + @retval other Error +*/ + +int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, uint mode, + HANDLER_BUFFER *buf) +{ + uint elem_size; + Item *pushed_cond= NULL; + handler *new_h2= 0; + DBUG_ENTER("DsMrr_impl::dsmrr_init"); + + /* + index_merge may invoke a scan on an object for which dsmrr_info[_const] + has not been called, so set the owner handler here as well. + */ + h= h_arg; + if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED) + { + use_default_impl= TRUE; + const int retval= + h->handler::multi_range_read_init(seq_funcs, seq_init_param, + n_ranges, mode, buf); + DBUG_RETURN(retval); + } + rowids_buf= buf->buffer; + + is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION); + + if (is_mrr_assoc) + status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count); + + rowids_buf_end= buf->buffer_end; + elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*); + rowids_buf_last= rowids_buf + + ((rowids_buf_end - rowids_buf)/ elem_size)* + elem_size; + rowids_buf_end= rowids_buf_last; + + /* + There can be two cases: + - This is the first call since index_init(), h2==NULL + Need to setup h2 then. + - This is not the first call, h2 is initalized and set up appropriately. + The caller might have called h->index_init(), need to switch h to + rnd_pos calls. + */ + if (!h2) + { + /* Create a separate handler object to do rndpos() calls. */ + THD *thd= current_thd; + /* + ::clone() takes up a lot of stack, especially on 64 bit platforms. + The constant 5 is an empiric result. + */ + if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2)) + DBUG_RETURN(1); + DBUG_ASSERT(h->active_index != MAX_KEY); + uint mrr_keyno= h->active_index; + + /* Create a separate handler object to do rndpos() calls. */ + if (!(new_h2= h->clone(thd->mem_root)) || + new_h2->ha_external_lock(thd, F_RDLCK)) + { + delete new_h2; + DBUG_RETURN(1); + } + + if (mrr_keyno == h->pushed_idx_cond_keyno) + pushed_cond= h->pushed_idx_cond; + + /* + Caution: this call will invoke this->dsmrr_close(). Do not put the + created secondary table handler into this->h2 or it will delete it. + */ + if (h->ha_index_end()) + { + h2=new_h2; + goto error; + } + + h2= new_h2; /* Ok, now can put it into h2 */ + table->prepare_for_position(); + h2->extra(HA_EXTRA_KEYREAD); + + if (h2->ha_index_init(mrr_keyno, FALSE)) + goto error; + + use_default_impl= FALSE; + if (pushed_cond) + h2->idx_cond_push(mrr_keyno, pushed_cond); + } + else + { + /* + We get here when the access alternates betwen MRR scan(s) and non-MRR + scans. + + Calling h->index_end() will invoke dsmrr_close() for this object, + which will delete h2. We need to keep it, so save put it away and dont + let it be deleted: + */ + handler *save_h2= h2; + h2= NULL; + int res= (h->inited == handler::INDEX && h->ha_index_end()); + h2= save_h2; + use_default_impl= FALSE; + if (res) + goto error; + } + + if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, + mode, buf) || + dsmrr_fill_buffer()) + { + goto error; + } + /* + If the above call has scanned through all intervals in *seq, then + adjust *buf to indicate that the remaining buffer space will not be used. + */ + if (dsmrr_eof) + buf->end_of_used_area= rowids_buf_last; + + /* + h->inited == INDEX may occur when 'range checked for each record' is + used. + */ + if ((h->inited != handler::RND) && + ((h->inited==handler::INDEX? h->ha_index_end(): FALSE) || + (h->ha_rnd_init(FALSE)))) + goto error; + + use_default_impl= FALSE; + h->mrr_funcs= *seq_funcs; + + DBUG_RETURN(0); +error: + h2->ha_index_or_rnd_end(); + h2->ha_external_lock(current_thd, F_UNLCK); + h2->close(); + delete h2; + h2= NULL; + DBUG_RETURN(1); +} + + +void DsMrr_impl::dsmrr_close() +{ + DBUG_ENTER("DsMrr_impl::dsmrr_close"); + if (h2) + { + h2->ha_index_or_rnd_end(); + h2->ha_external_lock(current_thd, F_UNLCK); + h2->close(); + delete h2; + h2= NULL; + } + use_default_impl= TRUE; + DBUG_VOID_RETURN; +} + + +static int rowid_cmp(void *h, uchar *a, uchar *b) +{ + return ((handler*)h)->cmp_ref(a, b); +} + + +/** + DS-MRR: Fill the buffer with rowids and sort it by rowid + + {This is an internal function of DiskSweep MRR implementation} + Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into + buffer. When the buffer is full or scan is completed, sort the buffer by + rowid and return. + + The function assumes that rowids buffer is empty when it is invoked. + + @param h Table handler + + @retval 0 OK, the next portion of rowids is in the buffer, + properly ordered + @retval other Error +*/ + +int DsMrr_impl::dsmrr_fill_buffer() +{ + char *range_info; + int res; + DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer"); + + rowids_buf_cur= rowids_buf; + while ((rowids_buf_cur < rowids_buf_end) && + !(res= h2->handler::multi_range_read_next(&range_info))) + { + KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range; + if (h2->mrr_funcs.skip_index_tuple && + h2->mrr_funcs.skip_index_tuple(h2->mrr_iter, curr_range->ptr)) + continue; + + /* Put rowid, or {rowid, range_id} pair into the buffer */ + h2->position(table->record[0]); + memcpy(rowids_buf_cur, h2->ref, h2->ref_length); + rowids_buf_cur += h2->ref_length; + + if (is_mrr_assoc) + { + memcpy(rowids_buf_cur, &range_info, sizeof(void*)); + rowids_buf_cur += sizeof(void*); + } + } + + if (res && res != HA_ERR_END_OF_FILE) + DBUG_RETURN(res); + dsmrr_eof= test(res == HA_ERR_END_OF_FILE); + + /* Sort the buffer contents by rowid */ + uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*); + uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size; + + my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp, + (void*)h); + rowids_buf_last= rowids_buf_cur; + rowids_buf_cur= rowids_buf; + DBUG_RETURN(0); +} + + +/** + DS-MRR implementation: multi_range_read_next() function +*/ + +int DsMrr_impl::dsmrr_next(char **range_info) +{ + int res; + uchar *cur_range_info= 0; + uchar *rowid; + + if (use_default_impl) + return h->handler::multi_range_read_next(range_info); + + do + { + if (rowids_buf_cur == rowids_buf_last) + { + if (dsmrr_eof) + { + res= HA_ERR_END_OF_FILE; + goto end; + } + res= dsmrr_fill_buffer(); + if (res) + goto end; + } + + /* return eof if there are no rowids in the buffer after re-fill attempt */ + if (rowids_buf_cur == rowids_buf_last) + { + res= HA_ERR_END_OF_FILE; + goto end; + } + rowid= rowids_buf_cur; + + if (is_mrr_assoc) + memcpy(&cur_range_info, rowids_buf_cur + h->ref_length, sizeof(uchar**)); + + rowids_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc); + if (h2->mrr_funcs.skip_record && + h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid)) + continue; + res= h->rnd_pos(table->record[0], rowid); + break; + } while (true); + + if (is_mrr_assoc) + { + memcpy(range_info, rowid + h->ref_length, sizeof(void*)); + } +end: + return res; +} + + +/** + DS-MRR implementation: multi_range_read_info() function +*/ +ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, + uint *bufsz, uint *flags, COST_VECT *cost) +{ + ha_rows res; + uint def_flags= *flags; + uint def_bufsz= *bufsz; + + /* Get cost/flags/mem_usage of default MRR implementation */ + res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz, + &def_flags, cost); + DBUG_ASSERT(!res); + + if ((*flags & HA_MRR_USE_DEFAULT_IMPL) || + choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost)) + { + /* Default implementation is choosen */ + DBUG_PRINT("info", ("Default MRR implementation choosen")); + *flags= def_flags; + *bufsz= def_bufsz; + } + else + { + /* *flags and *bufsz were set by choose_mrr_impl */ + DBUG_PRINT("info", ("DS-MRR implementation choosen")); + } + return 0; +} + + +/** + DS-MRR Implementation: multi_range_read_info_const() function +*/ + +ha_rows DsMrr_impl::dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, + void *seq_init_param, uint n_ranges, + uint *bufsz, uint *flags, COST_VECT *cost) +{ + ha_rows rows; + uint def_flags= *flags; + uint def_bufsz= *bufsz; + /* Get cost/flags/mem_usage of default MRR implementation */ + rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param, + n_ranges, &def_bufsz, + &def_flags, cost); + if (rows == HA_POS_ERROR) + { + /* Default implementation can't perform MRR scan => we can't either */ + return rows; + } + + /* + If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to + use the default MRR implementation (we need it for UPDATE/DELETE). + Otherwise, make a choice based on cost and @@optimizer_use_mrr. + */ + if ((*flags & HA_MRR_USE_DEFAULT_IMPL) || + choose_mrr_impl(keyno, rows, flags, bufsz, cost)) + { + DBUG_PRINT("info", ("Default MRR implementation choosen")); + *flags= def_flags; + *bufsz= def_bufsz; + } + else + { + /* *flags and *bufsz were set by choose_mrr_impl */ + DBUG_PRINT("info", ("DS-MRR implementation choosen")); + } + return rows; +} + + +/** + Check if key has partially-covered columns + + We can't use DS-MRR to perform range scans when the ranges are over + partially-covered keys, because we'll not have full key part values + (we'll have their prefixes from the index) and will not be able to check + if we've reached the end the range. + + @param keyno Key to check + + @todo + Allow use of DS-MRR in cases where the index has partially-covered + components but they are not used for scanning. + + @retval TRUE Yes + @retval FALSE No +*/ + +bool key_uses_partial_cols(TABLE *table, uint keyno) +{ + KEY_PART_INFO *kp= table->key_info[keyno].key_part; + KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts; + for (; kp != kp_end; kp++) + { + if (!kp->field->part_of_key.is_set(keyno)) + return TRUE; + } + return FALSE; +} + +/** + DS-MRR Internals: Choose between Default MRR implementation and DS-MRR + + Make the choice between using Default MRR implementation and DS-MRR. + This function contains common functionality factored out of dsmrr_info() + and dsmrr_info_const(). The function assumes that the default MRR + implementation's applicability requirements are satisfied. + + @param keyno Index number + @param rows E(full rows to be retrieved) + @param flags IN MRR flags provided by the MRR user + OUT If DS-MRR is choosen, flags of DS-MRR implementation + else the value is not modified + @param bufsz IN If DS-MRR is choosen, buffer use of DS-MRR implementation + else the value is not modified + @param cost IN Cost of default MRR implementation + OUT If DS-MRR is choosen, cost of DS-MRR scan + else the value is not modified + + @retval TRUE Default MRR implementation should be used + @retval FALSE DS-MRR implementation should be used +*/ + +bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, + uint *bufsz, COST_VECT *cost) +{ + COST_VECT dsmrr_cost; + bool res; + THD *thd= current_thd; + if (thd->variables.optimizer_use_mrr == 2 || *flags & HA_MRR_INDEX_ONLY || + (keyno == table->s->primary_key && h->primary_key_is_clustered()) || + key_uses_partial_cols(table, keyno)) + { + /* Use the default implementation */ + *flags |= HA_MRR_USE_DEFAULT_IMPL; + return TRUE; + } + + uint add_len= table->key_info[keyno].key_length + h->ref_length; + *bufsz -= add_len; + if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost)) + return TRUE; + *bufsz += add_len; + + bool force_dsmrr; + /* + If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of + DS-MRR and Default implementations cost. This allows one to force use of + DS-MRR whenever it is applicable without affecting other cost-based + choices. + */ + if ((force_dsmrr= (thd->variables.optimizer_use_mrr == 1)) && + dsmrr_cost.total_cost() > cost->total_cost()) + dsmrr_cost= *cost; + + if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost()) + { + *flags &= ~HA_MRR_USE_DEFAULT_IMPL; /* Use the DS-MRR implementation */ + *flags &= ~HA_MRR_SORTED; /* We will return unordered output */ + *cost= dsmrr_cost; + res= FALSE; + } + else + { + /* Use the default MRR implementation */ + res= TRUE; + } + return res; +} + + +static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost); + + +/** + Get cost of DS-MRR scan + + @param keynr Index to be used + @param rows E(Number of rows to be scanned) + @param flags Scan parameters (HA_MRR_* flags) + @param buffer_size INOUT Buffer size + @param cost OUT The cost + + @retval FALSE OK + @retval TRUE Error, DS-MRR cannot be used (the buffer is too small + for even 1 rowid) +*/ + +bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, + uint *buffer_size, COST_VECT *cost) +{ + ulong max_buff_entries, elem_size; + ha_rows rows_in_full_step, rows_in_last_step; + uint n_full_steps; + double index_read_cost; + + elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION)); + max_buff_entries = *buffer_size / elem_size; + + if (!max_buff_entries) + return TRUE; /* Buffer has not enough space for even 1 rowid */ + + /* Number of iterations we'll make with full buffer */ + n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries); + + /* + Get numbers of rows we'll be processing in + - non-last sweep, with full buffer + - last iteration, with non-full buffer + */ + rows_in_full_step= max_buff_entries; + rows_in_last_step= rows % max_buff_entries; + + /* Adjust buffer size if we expect to use only part of the buffer */ + if (n_full_steps) + { + get_sort_and_sweep_cost(table, rows, cost); + cost->multiply(n_full_steps); + } + else + { + cost->zero(); + *buffer_size= max(*buffer_size, + (size_t)(1.2*rows_in_last_step) * elem_size + + h->ref_length + table->key_info[keynr].key_length); + } + + COST_VECT last_step_cost; + get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost); + cost->add(&last_step_cost); + + if (n_full_steps != 0) + cost->mem_cost= *buffer_size; + else + cost->mem_cost= (double)rows_in_last_step * elem_size; + + /* Total cost of all index accesses */ + index_read_cost= h->index_only_read_time(keynr, (double)rows); + cost->add_io(index_read_cost, 1 /* Random seeks */); + return FALSE; +} + + +/* + Get cost of one sort-and-sweep step + + SYNOPSIS + get_sort_and_sweep_cost() + table Table being accessed + nrows Number of rows to be sorted and retrieved + cost OUT The cost + + DESCRIPTION + Get cost of these operations: + - sort an array of #nrows ROWIDs using qsort + - read #nrows records from table in a sweep. +*/ + +static +void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost) +{ + if (nrows) + { + get_sweep_read_cost(table, nrows, FALSE, cost); + /* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */ + double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID); + if (cmp_op < 3) + cmp_op= 3; + cost->cpu_cost += cmp_op * log2(cmp_op); + } + else + cost->zero(); +} + + +/** + Get cost of reading nrows table records in a "disk sweep" + + A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that made + for an ordered sequence of rowids. + + We assume hard disk IO. The read is performed as follows: + + 1. The disk head is moved to the needed cylinder + 2. The controller waits for the plate to rotate + 3. The data is transferred + + Time to do #3 is insignificant compared to #2+#1. + + Time to move the disk head is proportional to head travel distance. + + Time to wait for the plate to rotate depends on whether the disk head + was moved or not. + + If disk head wasn't moved, the wait time is proportional to distance + between the previous block and the block we're reading. + + If the head was moved, we don't know how much we'll need to wait for the + plate to rotate. We assume the wait time to be a variate with a mean of + 0.5 of full rotation time. + + Our cost units are "random disk seeks". The cost of random disk seek is + actually not a constant, it depends one range of cylinders we're going + to access. We make it constant by introducing a fuzzy concept of "typical + datafile length" (it's fuzzy as it's hard to tell whether it should + include index file, temp.tables etc). Then random seek cost is: + + 1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length + + We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9. + + @param table Table to be accessed + @param nrows Number of rows to retrieve + @param interrupted TRUE <=> Assume that the disk sweep will be + interrupted by other disk IO. FALSE - otherwise. + @param cost OUT The cost. +*/ + +void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, + COST_VECT *cost) +{ + DBUG_ENTER("get_sweep_read_cost"); + + cost->zero(); + if (table->file->primary_key_is_clustered()) + { + cost->io_count= table->file->read_time(table->s->primary_key, + (uint) nrows, nrows); + } + else + { + double n_blocks= + ceil(ulonglong2double(table->file->stats.data_file_length) / IO_SIZE); + double busy_blocks= + n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows))); + if (busy_blocks < 1.0) + busy_blocks= 1.0; + + DBUG_PRINT("info",("sweep: nblocks=%g, busy_blocks=%g", n_blocks, + busy_blocks)); + cost->io_count= busy_blocks; + + if (!interrupted) + { + /* Assume reading is done in one 'sweep' */ + cost->avg_io_cost= (DISK_SEEK_BASE_COST + + DISK_SEEK_PROP_COST*n_blocks/busy_blocks); + } + } + DBUG_PRINT("info",("returning cost=%g", cost->total_cost())); + DBUG_VOID_RETURN; +} + + +/* ************************************************************************** + * DS-MRR implementation ends + ***************************************************************************/ + +/* ************************************************************************** + * Index Condition Pushdown code starts + ***************************************************************************/ +/* + Check if given expression uses only table fields covered by the given index + + SYNOPSIS + uses_index_fields_only() + item Expression to check + tbl The table having the index + keyno The index number + other_tbls_ok TRUE <=> Fields of other non-const tables are allowed + + DESCRIPTION + Check if given expression only uses fields covered by index #keyno in the + table tbl. The expression can use any fields in any other tables. + + The expression is guaranteed not to be AND or OR - those constructs are + handled outside of this function. + + RETURN + TRUE Yes + FALSE No +*/ + +bool uses_index_fields_only(Item *item, TABLE *tbl, uint keyno, + bool other_tbls_ok) +{ + if (item->const_item()) + return TRUE; + + /* + Don't push down the triggered conditions. Nested outer joins execution + code may need to evaluate a condition several times (both triggered and + untriggered), and there is no way to put thi + TODO: Consider cloning the triggered condition and using the copies for: + 1. push the first copy down, to have most restrictive index condition + possible + 2. Put the second copy into tab->select_cond. + */ + if (item->type() == Item::FUNC_ITEM && + ((Item_func*)item)->functype() == Item_func::TRIG_COND_FUNC) + return FALSE; + + if (!(item->used_tables() & tbl->map)) + return other_tbls_ok; + + Item::Type item_type= item->type(); + switch (item_type) { + case Item::FUNC_ITEM: + { + /* This is a function, apply condition recursively to arguments */ + Item_func *item_func= (Item_func*)item; + Item **child; + Item **item_end= (item_func->arguments()) + item_func->argument_count(); + for (child= item_func->arguments(); child != item_end; child++) + { + if (!uses_index_fields_only(*child, tbl, keyno, other_tbls_ok)) + return FALSE; + } + return TRUE; + } + case Item::COND_ITEM: + { + /* + This is a AND/OR condition. Regular AND/OR clauses are handled by + make_cond_for_index() which will chop off the part that can be + checked with index. This code is for handling non-top-level AND/ORs, + e.g. func(x AND y). + */ + List_iterator<Item> li(*((Item_cond*)item)->argument_list()); + Item *item; + while ((item=li++)) + { + if (!uses_index_fields_only(item, tbl, keyno, other_tbls_ok)) + return FALSE; + } + return TRUE; + } + case Item::FIELD_ITEM: + { + Item_field *item_field= (Item_field*)item; + if (item_field->field->table != tbl) + return TRUE; + /* + The below is probably a repetition - the first part checks the + other two, but let's play it safe: + */ + return item_field->field->part_of_key.is_set(keyno) && + item_field->field->type() != MYSQL_TYPE_GEOMETRY && + item_field->field->type() != MYSQL_TYPE_BLOB; + } + case Item::REF_ITEM: + return uses_index_fields_only(item->real_item(), tbl, keyno, + other_tbls_ok); + default: + return FALSE; /* Play it safe, don't push unknown non-const items */ + } +} + +#define ICP_COND_USES_INDEX_ONLY 10 + +/* + Get a part of the condition that can be checked using only index fields + + SYNOPSIS + make_cond_for_index() + cond The source condition + table The table that is partially available + keyno The index in the above table. Only fields covered by the index + are available + other_tbls_ok TRUE <=> Fields of other non-const tables are allowed + + DESCRIPTION + Get a part of the condition that can be checked when for the given table + we have values only of fields covered by some index. The condition may + refer to other tables, it is assumed that we have values of all of their + fields. + + Example: + make_cond_for_index( + "cond(t1.field) AND cond(t2.key1) AND cond(t2.non_key) AND cond(t2.key2)", + t2, keyno(t2.key1)) + will return + "cond(t1.field) AND cond(t2.key2)" + + RETURN + Index condition, or NULL if no condition could be inferred. +*/ + +Item *make_cond_for_index(Item *cond, TABLE *table, uint keyno, + bool other_tbls_ok) +{ + if (!cond) + return NULL; + if (cond->type() == Item::COND_ITEM) + { + uint n_marked= 0; + if (((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC) + { + table_map used_tables= 0; + Item_cond_and *new_cond=new Item_cond_and; + if (!new_cond) + return (COND*) 0; + List_iterator<Item> li(*((Item_cond*) cond)->argument_list()); + Item *item; + while ((item=li++)) + { + Item *fix= make_cond_for_index(item, table, keyno, other_tbls_ok); + if (fix) + { + new_cond->argument_list()->push_back(fix); + used_tables|= fix->used_tables(); + } + n_marked += test(item->marker == ICP_COND_USES_INDEX_ONLY); + } + if (n_marked ==((Item_cond*)cond)->argument_list()->elements) + cond->marker= ICP_COND_USES_INDEX_ONLY; + switch (new_cond->argument_list()->elements) { + case 0: + return (COND*) 0; + case 1: + new_cond->used_tables_cache= used_tables; + return new_cond->argument_list()->head(); + default: + new_cond->quick_fix_field(); + new_cond->used_tables_cache= used_tables; + return new_cond; + } + } + else /* It's OR */ + { + Item_cond_or *new_cond=new Item_cond_or; + if (!new_cond) + return (COND*) 0; + List_iterator<Item> li(*((Item_cond*) cond)->argument_list()); + Item *item; + while ((item=li++)) + { + Item *fix= make_cond_for_index(item, table, keyno, other_tbls_ok); + if (!fix) + return (COND*) 0; + new_cond->argument_list()->push_back(fix); + n_marked += test(item->marker == ICP_COND_USES_INDEX_ONLY); + } + if (n_marked ==((Item_cond*)cond)->argument_list()->elements) + cond->marker= ICP_COND_USES_INDEX_ONLY; + new_cond->quick_fix_field(); + new_cond->used_tables_cache= ((Item_cond_or*) cond)->used_tables_cache; + new_cond->top_level_item(); + return new_cond; + } + } + + if (!uses_index_fields_only(cond, table, keyno, other_tbls_ok)) + return (COND*) 0; + cond->marker= ICP_COND_USES_INDEX_ONLY; + return cond; +} + + +Item *make_cond_remainder(Item *cond, bool exclude_index) +{ + if (exclude_index && cond->marker == ICP_COND_USES_INDEX_ONLY) + return 0; /* Already checked */ + + if (cond->type() == Item::COND_ITEM) + { + table_map tbl_map= 0; + if (((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC) + { + /* Create new top level AND item */ + Item_cond_and *new_cond=new Item_cond_and; + if (!new_cond) + return (COND*) 0; + List_iterator<Item> li(*((Item_cond*) cond)->argument_list()); + Item *item; + while ((item=li++)) + { + Item *fix= make_cond_remainder(item, exclude_index); + if (fix) + { + new_cond->argument_list()->push_back(fix); + tbl_map |= fix->used_tables(); + } + } + switch (new_cond->argument_list()->elements) { + case 0: + return (COND*) 0; + case 1: + return new_cond->argument_list()->head(); + default: + new_cond->quick_fix_field(); + ((Item_cond*)new_cond)->used_tables_cache= tbl_map; + return new_cond; + } + } + else /* It's OR */ + { + Item_cond_or *new_cond=new Item_cond_or; + if (!new_cond) + return (COND*) 0; + List_iterator<Item> li(*((Item_cond*) cond)->argument_list()); + Item *item; + while ((item=li++)) + { + Item *fix= make_cond_remainder(item, FALSE); + if (!fix) + return (COND*) 0; + new_cond->argument_list()->push_back(fix); + tbl_map |= fix->used_tables(); + } + new_cond->quick_fix_field(); + ((Item_cond*)new_cond)->used_tables_cache= tbl_map; + new_cond->top_level_item(); + return new_cond; + } + } + return cond; +} + + +/* + Try to extract and push the index condition + + SYNOPSIS + push_index_cond() + tab A join tab that has tab->table->file and its condition + in tab->select_cond + keyno Index for which extract and push the condition + other_tbls_ok TRUE <=> Fields of other non-const tables are allowed + + DESCRIPTION + Try to extract and push the index condition down to table handler +*/ + +void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok) +{ + DBUG_ENTER("push_index_cond"); + Item *idx_cond; + bool do_index_cond_pushdown= + ((tab->table->file->index_flags(keyno, 0, 1) & + HA_DO_INDEX_COND_PUSHDOWN) && + tab->join->thd->variables.engine_condition_pushdown); + // psergey: + // + KEY *key_info= tab->table->key_info + keyno; + for (uint kp= 0; kp < key_info->key_parts; kp++) + { + if ((key_info->key_part[kp].key_part_flag & HA_PART_KEY_SEG)) + { + do_index_cond_pushdown= FALSE; + break; + } + } + // :psergey + /* + When WL#5116 is done this DBUG statement must be removed. It's just a + temporary hack to allow us to discriminate whether a test failure relates + to *Engine* or *Index* Condition Pushdown. + */ + DBUG_EXECUTE_IF("optimizer_no_icp", do_index_cond_pushdown= false;); + if (do_index_cond_pushdown) + { + DBUG_EXECUTE("where", + print_where(tab->select_cond, "full cond", QT_ORDINARY);); + + idx_cond= make_cond_for_index(tab->select_cond, tab->table, keyno, + other_tbls_ok); + + DBUG_EXECUTE("where", + print_where(idx_cond, "idx cond", QT_ORDINARY);); + + if (idx_cond) + { + Item *idx_remainder_cond= 0; + tab->pre_idx_push_select_cond= tab->select_cond; +#if 0 + // The following is only needed for BKA: + /* + For BKA cache we store condition to special BKA cache field + because evaluation of the condition requires additional operations + before the evaluation. This condition is used in + JOIN_CACHE_BKA[_UNIQUE]::skip_index_tuple() functions. + */ + if (tab->use_join_cache && + /* + if cache is used then the value is TRUE only + for BKA[_UNIQUE] cache (see check_join_cache_usage func). + In this case other_tbls_ok is an equivalent of + cache->is_key_access(). + */ + other_tbls_ok && + (idx_cond->used_tables() & + ~(tab->table->map | tab->join->const_table_map))) + tab->cache_idx_cond= idx_cond; + else +#endif + idx_remainder_cond= tab->table->file->idx_cond_push(keyno, idx_cond); + + /* + Disable eq_ref's "lookup cache" if we've pushed down an index + condition. + TODO: This check happens to work on current ICP implementations, but + there may exist a compliant implementation that will not work + correctly with it. Sort this out when we stabilize the condition + pushdown APIs. + */ + if (idx_remainder_cond != idx_cond) + tab->ref.disable_cache= TRUE; + + Item *row_cond= make_cond_remainder(tab->select_cond, TRUE); + + DBUG_EXECUTE("where", + print_where(row_cond, "remainder cond", QT_ORDINARY);); + + if (row_cond) + { + if (!idx_remainder_cond) + tab->select_cond= row_cond; + else + { + COND *new_cond= new Item_cond_and(row_cond, idx_remainder_cond); + tab->select_cond= new_cond; + tab->select_cond->quick_fix_field(); + ((Item_cond_and*)tab->select_cond)->used_tables_cache= + row_cond->used_tables() | idx_remainder_cond->used_tables(); + } + } + else + tab->select_cond= idx_remainder_cond; + if (tab->select) + { + DBUG_EXECUTE("where", + print_where(tab->select->cond, + "select_cond", + QT_ORDINARY);); + + tab->select->cond= tab->select_cond; + } + } + } + DBUG_VOID_RETURN; +} + +/* ************************************************************************** + * Default MRR implementation starts + ***************************************************************************/ + + +/**************************************************************************** + * Default MRR implementation (MRR to non-MRR converter) + ***************************************************************************/ + +/** + Get cost and other information about MRR scan over a known list of ranges + + Calculate estimated cost and other information about an MRR scan for given + sequence of ranges. + + @param keyno Index number + @param seq Range sequence to be traversed + @param seq_init_param First parameter for seq->init() + @param n_ranges_arg Number of ranges in the sequence, or 0 if the caller + can't efficiently determine it + @param bufsz INOUT IN: Size of the buffer available for use + OUT: Size of the buffer that is expected to be actually + used, or 0 if buffer is not needed. + @param flags INOUT A combination of HA_MRR_* flags + @param cost OUT Estimated cost of MRR access + + @note + This method (or an overriding one in a derived class) must check for + thd->killed and return HA_POS_ERROR if it is not zero. This is required + for a user to be able to interrupt the calculation by killing the + connection/query. + + @retval + HA_POS_ERROR Error or the engine is unable to perform the requested + scan. Values of OUT parameters are undefined. + @retval + other OK, *cost contains cost of the scan, *bufsz and *flags + contain scan parameters. +*/ + +ha_rows +handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, + void *seq_init_param, uint n_ranges_arg, + uint *bufsz, uint *flags, COST_VECT *cost) +{ + KEY_MULTI_RANGE range; + range_seq_t seq_it; + ha_rows rows, total_rows= 0; + uint n_ranges=0; + THD *thd= current_thd; + + /* Default MRR implementation doesn't need buffer */ + *bufsz= 0; + + seq_it= seq->init(seq_init_param, n_ranges, *flags); + while (!seq->next(seq_it, &range)) + { + if (unlikely(thd->killed != 0)) + return HA_POS_ERROR; + + n_ranges++; + key_range *min_endp, *max_endp; + if (range.range_flag & GEOM_FLAG) + { + /* In this case tmp_min_flag contains the handler-read-function */ + range.start_key.flag= (ha_rkey_function) (range.range_flag ^ GEOM_FLAG); + min_endp= &range.start_key; + max_endp= NULL; + } + else + { + min_endp= range.start_key.length? &range.start_key : NULL; + max_endp= range.end_key.length? &range.end_key : NULL; + } + if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE)) + rows= 1; /* there can be at most one row */ + else + { + if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp, + max_endp))) + { + /* Can't scan one range => can't do MRR scan at all */ + total_rows= HA_POS_ERROR; + break; + } + } + total_rows += rows; + } + + if (total_rows != HA_POS_ERROR) + { + /* The following calculation is the same as in multi_range_read_info(): */ + *flags |= HA_MRR_USE_DEFAULT_IMPL; + cost->zero(); + cost->avg_io_cost= 1; /* assume random seeks */ + if ((*flags & HA_MRR_INDEX_ONLY) && total_rows > 2) + cost->io_count= index_only_read_time(keyno, (uint)total_rows); + else + cost->io_count= read_time(keyno, n_ranges, total_rows); + cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01; + } + return total_rows; +} + + +/** + Get cost and other information about MRR scan over some sequence of ranges + + Calculate estimated cost and other information about an MRR scan for some + sequence of ranges. + + The ranges themselves will be known only at execution phase. When this + function is called we only know number of ranges and a (rough) E(#records) + within those ranges. + + Currently this function is only called for "n-keypart singlepoint" ranges, + i.e. each range is "keypart1=someconst1 AND ... AND keypartN=someconstN" + + The flags parameter is a combination of those flags: HA_MRR_SORTED, + HA_MRR_INDEX_ONLY, HA_MRR_NO_ASSOCIATION, HA_MRR_LIMITS. + + @param keyno Index number + @param n_ranges Estimated number of ranges (i.e. intervals) in the + range sequence. + @param n_rows Estimated total number of records contained within all + of the ranges + @param bufsz INOUT IN: Size of the buffer available for use + OUT: Size of the buffer that will be actually used, or + 0 if buffer is not needed. + @param flags INOUT A combination of HA_MRR_* flags + @param cost OUT Estimated cost of MRR access + + @retval + 0 OK, *cost contains cost of the scan, *bufsz and *flags contain scan + parameters. + @retval + other Error or can't perform the requested scan +*/ + +ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows, + uint *bufsz, uint *flags, COST_VECT *cost) +{ + *bufsz= 0; /* Default implementation doesn't need a buffer */ + + *flags |= HA_MRR_USE_DEFAULT_IMPL; + + cost->zero(); + cost->avg_io_cost= 1; /* assume random seeks */ + + /* Produce the same cost as non-MRR code does */ + if (*flags & HA_MRR_INDEX_ONLY) + cost->io_count= index_only_read_time(keyno, n_rows); + else + cost->io_count= read_time(keyno, n_ranges, n_rows); + return 0; +} + + +/** + Initialize the MRR scan + + Initialize the MRR scan. This function may do heavyweight scan + initialization like row prefetching/sorting/etc (NOTE: but better not do + it here as we may not need it, e.g. if we never satisfy WHERE clause on + previous tables. For many implementations it would be natural to do such + initializations in the first multi_read_range_next() call) + + mode is a combination of the following flags: HA_MRR_SORTED, + HA_MRR_INDEX_ONLY, HA_MRR_NO_ASSOCIATION + + @param seq Range sequence to be traversed + @param seq_init_param First parameter for seq->init() + @param n_ranges Number of ranges in the sequence + @param mode Flags, see the description section for the details + @param buf INOUT: memory buffer to be used + + @note + One must have called index_init() before calling this function. Several + multi_range_read_init() calls may be made in course of one query. + + Until WL#2623 is done (see its text, section 3.2), the following will + also hold: + The caller will guarantee that if "seq->init == mrr_ranges_array_init" + then seq_init_param is an array of n_ranges KEY_MULTI_RANGE structures. + This property will only be used by NDB handler until WL#2623 is done. + + Buffer memory management is done according to the following scenario: + The caller allocates the buffer and provides it to the callee by filling + the members of HANDLER_BUFFER structure. + The callee consumes all or some fraction of the provided buffer space, and + sets the HANDLER_BUFFER members accordingly. + The callee may use the buffer memory until the next multi_range_read_init() + call is made, all records have been read, or until index_end() call is + made, whichever comes first. + + @retval 0 OK + @retval 1 Error +*/ + +int +handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param, + uint n_ranges, uint mode, HANDLER_BUFFER *buf) +{ + DBUG_ENTER("handler::multi_range_read_init"); + mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode); + mrr_funcs= *seq_funcs; + mrr_is_output_sorted= test(mode & HA_MRR_SORTED); + mrr_have_range= FALSE; + DBUG_RETURN(0); +} + + +/** + Get next record in MRR scan + + Default MRR implementation: read the next record + + @param range_info OUT Undefined if HA_MRR_NO_ASSOCIATION flag is in effect + Otherwise, the opaque value associated with the range + that contains the returned record. + + @retval 0 OK + @retval other Error code +*/ + +int handler::multi_range_read_next(char **range_info) +{ + int UNINIT_VAR(result); + int range_res; + DBUG_ENTER("handler::multi_range_read_next"); + + if (!mrr_have_range) + { + mrr_have_range= TRUE; + goto start; + } + + do + { + /* Save a call if there can be only one row in range. */ + if (mrr_cur_range.range_flag != (UNIQUE_RANGE | EQ_RANGE)) + { + result= read_range_next(); + /* On success or non-EOF errors jump to the end. */ + if (result != HA_ERR_END_OF_FILE) + break; + } + else + { + if (was_semi_consistent_read()) + goto scan_it_again; + /* + We need to set this for the last range only, but checking this + condition is more expensive than just setting the result code. + */ + result= HA_ERR_END_OF_FILE; + } + +start: + /* Try the next range(s) until one matches a record. */ + while (!(range_res= mrr_funcs.next(mrr_iter, &mrr_cur_range))) + { +scan_it_again: + result= read_range_first(mrr_cur_range.start_key.keypart_map ? + &mrr_cur_range.start_key : 0, + mrr_cur_range.end_key.keypart_map ? + &mrr_cur_range.end_key : 0, + test(mrr_cur_range.range_flag & EQ_RANGE), + mrr_is_output_sorted); + if (result != HA_ERR_END_OF_FILE) + break; + } + } + while ((result == HA_ERR_END_OF_FILE) && !range_res); + + *range_info= mrr_cur_range.ptr; + DBUG_PRINT("exit",("handler::multi_range_read_next result %d", result)); + DBUG_RETURN(result); +} + diff --git a/sql/ds_mrr.h b/sql/ds_mrr.h new file mode 100644 index 00000000000..02abe72a4c1 --- /dev/null +++ b/sql/ds_mrr.h @@ -0,0 +1,68 @@ + + +/** + A Disk-Sweep MRR interface implementation + + This implementation makes range (and, in the future, 'ref') scans to read + table rows in disk sweeps. + + Currently it is used by MyISAM and InnoDB. Potentially it can be used with + any table handler that has non-clustered indexes and on-disk rows. +*/ + +class DsMrr_impl +{ +public: + typedef void (handler::*range_check_toggle_func_t)(bool on); + + DsMrr_impl() + : h2(NULL) {}; + + /* + The "owner" handler object (the one that calls dsmrr_XXX functions. + It is used to retrieve full table rows by calling rnd_pos(). + */ + handler *h; + TABLE *table; /* Always equal to h->table */ +private: + /* Secondary handler object. It is used for scanning the index */ + handler *h2; + + /* Buffer to store rowids, or (rowid, range_id) pairs */ + uchar *rowids_buf; + uchar *rowids_buf_cur; /* Current position when reading/writing */ + uchar *rowids_buf_last; /* When reading: end of used buffer space */ + uchar *rowids_buf_end; /* End of the buffer */ + + bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */ + + /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */ + bool is_mrr_assoc; + + bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */ +public: + void init(handler *h_arg, TABLE *table_arg) + { + h= h_arg; + table= table_arg; + } + int dsmrr_init(handler *h, RANGE_SEQ_IF *seq_funcs, void *seq_init_param, + uint n_ranges, uint mode, HANDLER_BUFFER *buf); + void dsmrr_close(); + int dsmrr_fill_buffer(); + int dsmrr_next(char **range_info); + + ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint *bufsz, + uint *flags, COST_VECT *cost); + + ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, + void *seq_init_param, uint n_ranges, uint *bufsz, + uint *flags, COST_VECT *cost); +private: + bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, + COST_VECT *cost); + bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, + uint *buffer_size, COST_VECT *cost); +}; + + diff --git a/sql/handler.cc b/sql/handler.cc index 3ff9a63b76e..3f4fec8f23a 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -4161,6 +4161,41 @@ void ha_binlog_log_query(THD *thd, handlerton *hton, } #endif + +/** + Calculate cost of 'index only' scan for given index and number of records + + @param keynr Index number + @param records Estimated number of records to be retrieved + + @note + It is assumed that we will read trough the whole key range and that all + key blocks are half full (normally things are much better). It is also + assumed that each time we read the next key from the index, the handler + performs a random seek, thus the cost is proportional to the number of + blocks read. + + @todo + Consider joining this function and handler::read_time() into one + handler::read_time(keynr, records, ranges, bool index_only) function. + + @return + Estimated cost of 'index only' scan +*/ + +double handler::index_only_read_time(uint keynr, double records) +{ + double read_time; + uint keys_per_block= (stats.block_size/2/ + (table->key_info[keynr].key_length + ref_length) + 1); + read_time=((double) (records + keys_per_block-1) / + (double) keys_per_block); + return read_time; +} + +#if 0 +// psergey-mrr + /** Read the first row of a multi-range set. @@ -4287,7 +4322,7 @@ scan_it_again: DBUG_PRINT("exit",("handler::read_multi_range_next: result %d", result)); DBUG_RETURN(result); } - +#endif /** Read first row between two ranges. @@ -4391,7 +4426,7 @@ int handler::read_range_next() int handler::compare_key(key_range *range) { int cmp; - if (!range) + if (!range || in_range_check_pushed_down) return 0; // No max range cmp= key_cmp(range_key_part, range->key, range->length); if (!cmp) @@ -4400,6 +4435,23 @@ int handler::compare_key(key_range *range) } +/* + Same as compare_key() but doesn't check have in_range_check_pushed_down. + This is used by index condition pushdown implementation. +*/ + +int handler::compare_key2(key_range *range) +{ + int cmp; + if (!range) + return 0; // no max range + cmp= key_cmp(range_key_part, range->key, range->length); + if (!cmp) + cmp= key_compare_result_on_equal; + return cmp; +} + + int handler::index_read_idx_map(uchar * buf, uint index, const uchar * key, key_part_map keypart_map, enum ha_rkey_function find_flag) diff --git a/sql/handler.h b/sql/handler.h index cfea6175733..36c1a37f200 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -132,6 +132,8 @@ /* Has automatic checksums and uses the new checksum format */ #define HA_HAS_NEW_CHECKSUM (LL(1) << 36) +#define HA_MRR_CANT_SORT (LL(1) << 37) + /* Set of all binlog flags. Currently only contain the capabilities flags. @@ -147,6 +149,15 @@ #define HA_KEYREAD_ONLY 64 /* Support HA_EXTRA_KEYREAD */ /* + Index scan will not return records in rowid order. Not guaranteed to be + set for unordered (e.g. HASH) indexes. +*/ +#define HA_KEY_SCAN_NOT_ROR 128 +#define HA_DO_INDEX_COND_PUSHDOWN 256 /* Supports Index Condition Pushdown */ + + + +/* bits in alter_table_flags: */ /* @@ -199,12 +210,6 @@ #define HA_FAST_CHANGE_PARTITION (1L << 13) #define HA_PARTITION_ONE_PHASE (1L << 14) -/* - Index scan will not return records in rowid order. Not guaranteed to be - set for unordered (e.g. HASH) indexes. -*/ -#define HA_KEY_SCAN_NOT_ROR 128 - /* operations for disable/enable indexes */ #define HA_KEY_SWITCH_NONUNIQ 0 #define HA_KEY_SWITCH_ALL 1 @@ -1015,6 +1020,186 @@ typedef struct st_ha_check_opt } HA_CHECK_OPT; +/******************************************************************************** + * MRR + ********************************************************************************/ + +typedef void *range_seq_t; + +typedef struct st_range_seq_if +{ + /* + Initialize the traversal of range sequence + + SYNOPSIS + init() + init_params The seq_init_param parameter + n_ranges The number of ranges obtained + flags A combination of HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY + + RETURN + An opaque value to be used as RANGE_SEQ_IF::next() parameter + */ + range_seq_t (*init)(void *init_params, uint n_ranges, uint flags); + + + /* + Get the next range in the range sequence + + SYNOPSIS + next() + seq The value returned by RANGE_SEQ_IF::init() + range OUT Information about the next range + + RETURN + 0 - Ok, the range structure filled with info about the next range + 1 - No more ranges + */ + uint (*next) (range_seq_t seq, KEY_MULTI_RANGE *range); + + /* + Check whether range_info orders to skip the next record + + SYNOPSIS + skip_record() + seq The value returned by RANGE_SEQ_IF::init() + range_info Information about the next range + (Ignored if MRR_NO_ASSOCIATION is set) + rowid Rowid of the record to be checked (ignored if set to 0) + + RETURN + 1 - Record with this range_info and/or this rowid shall be filtered + out from the stream of records returned by multi_range_read_next() + 0 - The record shall be left in the stream + */ + bool (*skip_record) (range_seq_t seq, char *range_info, uchar *rowid); + + /* + Check if the record combination matches the index condition + SYNOPSIS + skip_index_tuple() + seq The value returned by RANGE_SEQ_IF::init() + range_info Information about the next range + + RETURN + 0 - The record combination satisfies the index condition + 1 - Otherwise + */ + bool (*skip_index_tuple) (range_seq_t seq, char *range_info); +} RANGE_SEQ_IF; + +class COST_VECT +{ +public: + double io_count; /* number of I/O */ + double avg_io_cost; /* cost of an average I/O oper. */ + double cpu_cost; /* cost of operations in CPU */ + double mem_cost; /* cost of used memory */ + double import_cost; /* cost of remote operations */ + + enum { IO_COEFF=1 }; + enum { CPU_COEFF=1 }; + enum { MEM_COEFF=1 }; + enum { IMPORT_COEFF=1 }; + + COST_VECT() {} // keep gcc happy + + double total_cost() + { + return IO_COEFF*io_count*avg_io_cost + CPU_COEFF * cpu_cost + + MEM_COEFF*mem_cost + IMPORT_COEFF*import_cost; + } + + void zero() + { + avg_io_cost= 1.0; + io_count= cpu_cost= mem_cost= import_cost= 0.0; + } + + void multiply(double m) + { + io_count *= m; + cpu_cost *= m; + import_cost *= m; + /* Don't multiply mem_cost */ + } + + void add(const COST_VECT* cost) + { + double io_count_sum= io_count + cost->io_count; + add_io(cost->io_count, cost->avg_io_cost); + io_count= io_count_sum; + cpu_cost += cost->cpu_cost; + } + void add_io(double add_io_cnt, double add_avg_cost) + { + double io_count_sum= io_count + add_io_cnt; + avg_io_cost= (io_count * avg_io_cost + + add_io_cnt * add_avg_cost) / io_count_sum; + io_count= io_count_sum; + } + + /* + To be used when we go from old single value-based cost calculations to + the new COST_VECT-based. + */ + void convert_from_cost(double cost) + { + zero(); + avg_io_cost= 1.0; + io_count= cost; + } +}; + +void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, + COST_VECT *cost); + +/* + The below two are not used (and not handled) in this milestone of this WL + entry because there seems to be no use for them at this stage of + implementation. +*/ +#define HA_MRR_SINGLE_POINT 1 +#define HA_MRR_FIXED_KEY 2 + +/* + Indicates that RANGE_SEQ_IF::next(&range) doesn't need to fill in the + 'range' parameter. +*/ +#define HA_MRR_NO_ASSOCIATION 4 + +/* + The MRR user will provide ranges in key order, and MRR implementation + must return rows in key order. +*/ +#define HA_MRR_SORTED 8 + +/* MRR implementation doesn't have to retrieve full records */ +#define HA_MRR_INDEX_ONLY 16 + +/* + The passed memory buffer is of maximum possible size, the caller can't + assume larger buffer. +*/ +#define HA_MRR_LIMITS 32 + + +/* + Flag set <=> default MRR implementation is used + (The choice is made by **_info[_const]() function which may set this + flag. SQL layer remembers the flag value and then passes it to + multi_read_range_init(). +*/ +#define HA_MRR_USE_DEFAULT_IMPL 64 + +/* + Used only as parameter to multi_range_read_info(): + Flag set <=> the caller guarantees that the bounds of the scanned ranges + will not have NULL values. +*/ +#define HA_MRR_NO_NULL_ENDPOINTS 128 + + /* This is a buffer area that the handler can use to store rows. @@ -1025,8 +1210,8 @@ typedef struct st_ha_check_opt typedef struct st_handler_buffer { - const uchar *buffer; /* Buffer one can start using */ - const uchar *buffer_end; /* End of buffer */ + /* const? */uchar *buffer; /* Buffer one can start using */ + /* const? */uchar *buffer_end; /* End of buffer */ uchar *end_of_used_area; /* End of area that was used by handler */ } HANDLER_BUFFER; @@ -1057,11 +1242,16 @@ public: ulong update_time; uint block_size; /* index block size */ + /* + number of buffer bytes that native mrr implementation needs, + */ + uint mrr_length_per_rec; + ha_statistics(): data_file_length(0), max_data_file_length(0), index_file_length(0), delete_length(0), auto_increment_value(0), records(0), deleted(0), mean_rec_length(0), create_time(0), - check_time(0), update_time(0), block_size(0) + check_time(0), update_time(0), block_size(0), mrr_length_per_rec(0) {} }; @@ -1101,10 +1291,27 @@ public: ha_statistics stats; /** The following are for read_multi_range */ +//////psergey: was: +#if 0 bool multi_range_sorted; KEY_MULTI_RANGE *multi_range_curr; KEY_MULTI_RANGE *multi_range_end; HANDLER_BUFFER *multi_range_buffer; +#endif +//////psergey + /** MultiRangeRead-related members: */ + range_seq_t mrr_iter; /* Interator to traverse the range sequence */ + RANGE_SEQ_IF mrr_funcs; /* Range sequence traversal functions */ + HANDLER_BUFFER *multi_range_buffer; /* MRR buffer info */ + uint ranges_in_seq; /* Total number of ranges in the traversed sequence */ + /* TRUE <=> source MRR ranges and the output are ordered */ + bool mrr_is_output_sorted; + + /** TRUE <=> we're currently traversing a range in mrr_cur_range. */ + bool mrr_have_range; + /** Current range (the one we're now returning rows from) */ + KEY_MULTI_RANGE mrr_cur_range; +//////psergey /** The following are for read_range() */ key_range save_end_range, *end_range; @@ -1112,6 +1319,12 @@ public: int key_compare_result_on_equal; bool eq_range; + /* + TRUE <=> the engine guarantees that returned records are within the range + being scanned. + */ + bool in_range_check_pushed_down; + uint errkey; /* Last dup key */ uint key_used_on_scan; uint active_index; @@ -1122,6 +1335,8 @@ public: bool locked; bool implicit_emptied; /* Can be !=0 only if HEAP */ const COND *pushed_cond; + Item *pushed_idx_cond; + uint pushed_idx_cond_keyno; /* The index which the above condition is for */ /** next_insert_id is the next value which should be inserted into the auto_increment column: in a inserting-multi-row statement (like INSERT @@ -1161,7 +1376,8 @@ public: handler(handlerton *ht_arg, TABLE_SHARE *share_arg) :table_share(share_arg), table(0), estimation_rows_to_insert(0), ht(ht_arg), - ref(0), key_used_on_scan(MAX_KEY), active_index(MAX_KEY), + ref(0), in_range_check_pushed_down(FALSE), + key_used_on_scan(MAX_KEY), active_index(MAX_KEY), ref_length(sizeof(my_off_t)), ft_handler(0), inited(NONE), locked(FALSE), implicit_emptied(0), @@ -1192,6 +1408,7 @@ public: DBUG_ASSERT(inited==NONE); if (!(result= index_init(idx, sorted))) inited=INDEX; + end_range= NULL; DBUG_RETURN(result); } int ha_index_end() @@ -1199,6 +1416,7 @@ public: DBUG_ENTER("ha_index_end"); DBUG_ASSERT(inited==INDEX); inited=NONE; + end_range= NULL; DBUG_RETURN(index_end()); } /* This is called after index_init() if we need to do a index scan */ @@ -1307,6 +1525,7 @@ public: { return ulonglong2double(stats.data_file_length) / IO_SIZE + 2; } virtual double read_time(uint index, uint ranges, ha_rows rows) { return rows2double(ranges+rows); } + virtual double index_only_read_time(uint keynr, double records); virtual const key_map *keys_to_use_for_scanning() { return &key_map_empty; } bool has_transactions() { return (ha_table_flags() & HA_NO_TRANSACTIONS) == 0; } @@ -1523,16 +1742,30 @@ public: update_index_statistics(); return error; } - +#if 0 virtual int read_multi_range_first(KEY_MULTI_RANGE **found_range_p, KEY_MULTI_RANGE *ranges, uint range_count, bool sorted, HANDLER_BUFFER *buffer); virtual int read_multi_range_next(KEY_MULTI_RANGE **found_range_p); +#endif + //psergey-mrr: + virtual ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, + void *seq_init_param, + uint n_ranges, uint *bufsz, + uint *flags, COST_VECT *cost); + virtual ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys, + uint *bufsz, uint *flags, COST_VECT *cost); + virtual int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param, + uint n_ranges, uint mode, + HANDLER_BUFFER *buf); + virtual int multi_range_read_next(char **range_info); + //psergey-mrr: ends virtual int read_range_first(const key_range *start_key, const key_range *end_key, bool eq_range, bool sorted); virtual int read_range_next(); int compare_key(key_range *range); + int compare_key2(key_range *range); virtual int ft_init() { return HA_ERR_WRONG_COMMAND; } void ft_end() { ft_handler=NULL; } virtual FT_INFO *ft_init_ext(uint flags, uint inx,String *key) @@ -1878,6 +2111,7 @@ public: Pops the top if condition stack, if stack is not empty. */ virtual void cond_pop() { return; }; + virtual Item *idx_cond_push(uint keyno, Item* idx_cond) { return idx_cond; } virtual bool check_if_incompatible_data(HA_CREATE_INFO *create_info, uint table_changes) { return COMPATIBLE_DATA_NO; } @@ -2093,8 +2327,10 @@ private: virtual int rename_partitions(const char *path) { return HA_ERR_WRONG_COMMAND; } friend class ha_partition; + friend class DsMrr_impl; }; +#include "ds_mrr.h" /* Some extern variables used with handlers */ diff --git a/sql/item.h b/sql/item.h index 82fb3f3a3e0..640b0cc5375 100644 --- a/sql/item.h +++ b/sql/item.h @@ -25,7 +25,7 @@ bool trace_unsupported_func(const char *where, const char *processor_name) sprintf(buff, "%s::%s", where, processor_name); DBUG_ENTER(buff); sprintf(buff, "%s returns TRUE: unsupported function", processor_name); - DBUG_PRINT("info", (buff)); + DBUG_PRINT("info", ("%s", buff)); DBUG_RETURN(TRUE); } diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h index 77f55db145a..4f576bfca4c 100644 --- a/sql/mysql_priv.h +++ b/sql/mysql_priv.h @@ -345,11 +345,11 @@ protected: The cost of average seek DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST*BLOCKS_IN_AVG_SEEK =1.0. */ -#define DISK_SEEK_BASE_COST ((double)0.5) +#define DISK_SEEK_BASE_COST ((double)0.9) #define BLOCKS_IN_AVG_SEEK 128 -#define DISK_SEEK_PROP_COST ((double)0.5/BLOCKS_IN_AVG_SEEK) +#define DISK_SEEK_PROP_COST ((double)0.1/BLOCKS_IN_AVG_SEEK) /** diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 81f02157ee0..9ef69e3f5a6 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -3526,6 +3526,8 @@ static int init_common_variables(const char *conf_file_name, int argc, global_system_variables.character_set_results= default_charset_info; global_system_variables.character_set_client= default_charset_info; + global_system_variables.optimizer_use_mrr= 1; + if (!(character_set_filesystem= get_charset_by_csname(character_set_filesystem_name, MY_CS_PRIMARY, MYF(MY_WME)))) @@ -6961,11 +6963,6 @@ The minimum value for this variable is 4096.", (uchar**) &global_system_variables.min_examined_row_limit, (uchar**) &max_system_variables.min_examined_row_limit, 0, GET_ULONG, REQUIRED_ARG, 0, 0, (longlong) ULONG_MAX, 0, 1L, 0}, - {"multi_range_count", OPT_MULTI_RANGE_COUNT, - "Number of key ranges to request at once.", - (uchar**) &global_system_variables.multi_range_count, - (uchar**) &max_system_variables.multi_range_count, 0, - GET_ULONG, REQUIRED_ARG, 256, 1, (longlong) ULONG_MAX, 0, 1, 0}, {"myisam_block_size", OPT_MYISAM_BLOCK_SIZE, "Block size to be used for MyISAM index pages.", (uchar**) &opt_myisam_block_size, @@ -7129,7 +7126,7 @@ The minimum value for this variable is 4096.", (uchar**) &global_system_variables.read_rnd_buff_size, (uchar**) &max_system_variables.read_rnd_buff_size, 0, GET_ULONG, REQUIRED_ARG, 256*1024L, IO_SIZE*2+MALLOC_OVERHEAD, - INT_MAX32, MALLOC_OVERHEAD, IO_SIZE, 0}, + INT_MAX32, MALLOC_OVERHEAD, 1 /* Small overhead to be able to test MRR, was: IO_SIZE*/ , 0}, {"record_buffer", OPT_RECORD_BUFFER, "Alias for read_buffer_size", (uchar**) &global_system_variables.read_buff_size, diff --git a/sql/opt_range.cc b/sql/opt_range.cc index adb5a7c8c96..c75999da00c 100644 --- a/sql/opt_range.cc +++ b/sql/opt_range.cc @@ -34,7 +34,7 @@ The lists are returned in form of complicated structure of interlinked SEL_TREE/SEL_IMERGE/SEL_ARG objects. - See check_quick_keys, find_used_partitions for examples of how to walk + See quick_range_seq_next, find_used_partitions for examples of how to walk this structure. All direct "users" of this module are located within this file, too. @@ -59,6 +59,48 @@ Record retrieval code for range/index_merge/groupby-min-max. Implementations of QUICK_*_SELECT classes. + + KeyTupleFormat + ~~~~~~~~~~~~~~ + The code in this file (and elsewhere) makes operations on key value tuples. + Those tuples are stored in the following format: + + The tuple is a sequence of key part values. The length of key part value + depends only on its type (and not depends on the what value is stored) + + KeyTuple: keypart1-data, keypart2-data, ... + + The value of each keypart is stored in the following format: + + keypart_data: [isnull_byte] keypart-value-bytes + + If a keypart may have a NULL value (key_part->field->real_maybe_null() can + be used to check this), then the first byte is a NULL indicator with the + following valid values: + 1 - keypart has NULL value. + 0 - keypart has non-NULL value. + + <questionable-statement> If isnull_byte==1 (NULL value), then the following + keypart->length bytes must be 0. + </questionable-statement> + + keypart-value-bytes holds the value. Its format depends on the field type. + The length of keypart-value-bytes may or may not depend on the value being + stored. The default is that length is static and equal to + KEY_PART_INFO::length. + + Key parts with (key_part_flag & HA_BLOB_PART) have length depending of the + value: + + keypart-value-bytes: value_length value_bytes + + The value_length part itself occupies HA_KEY_BLOB_LENGTH=2 bytes. + + See key_copy() and key_restore() for code to move data between index tuple + and table record + + CAUTION: the above description is only sergefp's understanding of the + subject and may omit some details. */ #ifdef USE_PRAGMA_IMPLEMENTATION @@ -403,6 +445,7 @@ public: /* returns a number of keypart values (0 or 1) appended to the key buffer */ int store_min(uint length, uchar **min_key,uint min_key_flag) { + /* "(kp1 > c1) AND (kp2 OP c2) AND ..." -> (kp1 > c1) */ if ((min_flag & GEOM_FLAG) || (!(min_flag & NO_MIN_RANGE) && !(min_key_flag & (NO_MIN_RANGE | NEAR_MIN)))) @@ -627,6 +670,11 @@ public: */ bool using_real_indexes; + /* + Aggressively remove "scans" that do not have conditions on first + keyparts. Such scans are usable when doing partition pruning but not + regular range optimization. + */ bool remove_jump_scans; /* @@ -634,8 +682,17 @@ public: using_real_indexes==TRUE */ uint real_keynr[MAX_KEY]; + + /* + Used to store 'current key tuples', in both range analysis and + partitioning (list) analysis + */ + uchar min_key[MAX_KEY_LENGTH+MAX_FIELD_WIDTH], + max_key[MAX_KEY_LENGTH+MAX_FIELD_WIDTH]; + /* Number of SEL_ARG objects allocated by SEL_ARG::clone_tree operations */ uint alloced_sel_args; + bool force_default_mrr; }; class PARAM : public RANGE_OPT_PARAM @@ -645,8 +702,6 @@ public: longlong baseflag; uint max_key_part, range_count; - uchar min_key[MAX_KEY_LENGTH+MAX_FIELD_WIDTH], - max_key[MAX_KEY_LENGTH+MAX_FIELD_WIDTH]; bool quick; // Don't calulate possible keys uint fields_bitmap_size; @@ -683,15 +738,14 @@ static SEL_ARG *get_mm_leaf(RANGE_OPT_PARAM *param,COND *cond_func,Field *field, static SEL_TREE *get_mm_tree(RANGE_OPT_PARAM *param,COND *cond); static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts); -static ha_rows check_quick_select(PARAM *param,uint index,SEL_ARG *key_tree, - bool update_tbl_stats); -static ha_rows check_quick_keys(PARAM *param,uint index,SEL_ARG *key_tree, - uchar *min_key, uint min_key_flag, int, - uchar *max_key, uint max_key_flag, int); +static ha_rows check_quick_select(PARAM *param, uint idx, bool index_only, + SEL_ARG *tree, bool update_tbl_stats, + uint *mrr_flags, uint *bufsize, + COST_VECT *cost); QUICK_RANGE_SELECT *get_quick_select(PARAM *param,uint index, - SEL_ARG *key_tree, - MEM_ROOT *alloc = NULL); + SEL_ARG *key_tree, uint mrr_flags, + uint mrr_buf_size, MEM_ROOT *alloc); static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree, bool index_read_must_be_used, bool update_tbl_stats, @@ -709,8 +763,6 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge, double read_time); static TRP_GROUP_MIN_MAX *get_best_group_min_max(PARAM *param, SEL_TREE *tree); -static double get_index_only_read_time(const PARAM* param, ha_rows records, - int keynr); #ifndef DBUG_OFF static void print_sel_tree(PARAM *param, SEL_TREE *tree, key_map *tree_map, @@ -1099,25 +1151,22 @@ QUICK_SELECT_I::QUICK_SELECT_I() {} QUICK_RANGE_SELECT::QUICK_RANGE_SELECT(THD *thd, TABLE *table, uint key_nr, - bool no_alloc, MEM_ROOT *parent_alloc) - :dont_free(0),error(0),free_file(0),in_range(0),cur_range(NULL),last_range(0) + bool no_alloc, MEM_ROOT *parent_alloc, + bool *create_error) + :free_file(0),cur_range(NULL),last_range(0),dont_free(0) { my_bitmap_map *bitmap; DBUG_ENTER("QUICK_RANGE_SELECT::QUICK_RANGE_SELECT"); in_ror_merged_scan= 0; - sorted= 0; index= key_nr; head= table; key_part_info= head->key_info[index].key_part; my_init_dynamic_array(&ranges, sizeof(QUICK_RANGE*), 16, 16); /* 'thd' is not accessible in QUICK_RANGE_SELECT::reset(). */ - multi_range_bufsiz= thd->variables.read_rnd_buff_size; - multi_range_count= thd->variables.multi_range_count; - multi_range_length= 0; - multi_range= NULL; - multi_range_buff= NULL; + mrr_buf_size= thd->variables.read_rnd_buff_size; + mrr_buf_desc= NULL; if (!no_alloc && !parent_alloc) { @@ -1132,12 +1181,12 @@ QUICK_RANGE_SELECT::QUICK_RANGE_SELECT(THD *thd, TABLE *table, uint key_nr, save_read_set= head->read_set; save_write_set= head->write_set; - /* Allocate a bitmap for used columns */ + /* Allocate a bitmap for used columns (Q: why not on MEM_ROOT?) */ if (!(bitmap= (my_bitmap_map*) my_malloc(head->s->column_bitmap_size, MYF(MY_WME)))) { column_bitmap.bitmap= 0; - error= 1; + *create_error= 1; } else bitmap_init(&column_bitmap, bitmap, head->s->fields, FALSE); @@ -1145,6 +1194,20 @@ QUICK_RANGE_SELECT::QUICK_RANGE_SELECT(THD *thd, TABLE *table, uint key_nr, } +void QUICK_RANGE_SELECT::need_sorted_output() +{ + if (!(mrr_flags & HA_MRR_SORTED)) + { + /* + Native implementation can't produce sorted output. We'll have to + switch to default + */ + mrr_flags |= HA_MRR_USE_DEFAULT_IMPL; + } + mrr_flags |= HA_MRR_SORTED; +} + + int QUICK_RANGE_SELECT::init() { DBUG_ENTER("QUICK_RANGE_SELECT::init"); @@ -1190,8 +1253,7 @@ QUICK_RANGE_SELECT::~QUICK_RANGE_SELECT() my_free((char*) column_bitmap.bitmap, MYF(MY_ALLOW_ZERO_PTR)); } head->column_bitmaps_set(save_read_set, save_write_set); - x_free(multi_range); - x_free(multi_range_buff); + x_free(mrr_buf_desc); DBUG_VOID_RETURN; } @@ -1966,9 +2028,11 @@ class TRP_RANGE : public TABLE_READ_PLAN public: SEL_ARG *key; /* set of intervals to be used in "range" method retrieval */ uint key_idx; /* key number in PARAM::key */ + uint mrr_flags; + uint mrr_buf_size; - TRP_RANGE(SEL_ARG *key_arg, uint idx_arg) - : key(key_arg), key_idx(idx_arg) + TRP_RANGE(SEL_ARG *key_arg, uint idx_arg, uint mrr_flags_arg) + : key(key_arg), key_idx(idx_arg), mrr_flags(mrr_flags_arg) {} virtual ~TRP_RANGE() {} /* Remove gcc warning */ @@ -1977,7 +2041,8 @@ public: { DBUG_ENTER("TRP_RANGE::make_quick"); QUICK_RANGE_SELECT *quick; - if ((quick= get_quick_select(param, key_idx, key, parent_alloc))) + if ((quick= get_quick_select(param, key_idx, key, mrr_flags, + mrr_buf_size, parent_alloc))) { quick->records= records; quick->read_time= read_cost; @@ -2202,7 +2267,8 @@ static int fill_used_fields_bitmap(PARAM *param) int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, table_map prev_tables, - ha_rows limit, bool force_quick_range) + ha_rows limit, bool force_quick_range, + bool ordered_output) { uint idx; double scan_time; @@ -2260,6 +2326,7 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, param.imerge_cost_buff_size= 0; param.using_real_indexes= TRUE; param.remove_jump_scans= TRUE; + param.force_default_mrr= ordered_output; thd->no_errors=1; // Don't warn about NULL init_sql_alloc(&alloc, thd->variables.range_alloc_block_size, 0); @@ -2313,9 +2380,10 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, if (!head->covering_keys.is_clear_all()) { int key_for_use= find_shortest_key(head, &head->covering_keys); - double key_read_time= (get_index_only_read_time(¶m, records, - key_for_use) + - (double) records / TIME_FOR_COMPARE); + double key_read_time= + param.table->file->index_only_read_time(key_for_use, + rows2double(records)) + + (double) records / TIME_FOR_COMPARE; DBUG_PRINT("info", ("'all'+'using index' scan will be using key %d, " "read time %g", key_for_use, key_read_time)); if (key_read_time < read_time) @@ -3937,42 +4005,6 @@ skip_to_ror_scan: } -/* - Calculate cost of 'index only' scan for given index and number of records. - - SYNOPSIS - get_index_only_read_time() - param parameters structure - records #of records to read - keynr key to read - - NOTES - It is assumed that we will read trough the whole key range and that all - key blocks are half full (normally things are much better). It is also - assumed that each time we read the next key from the index, the handler - performs a random seek, thus the cost is proportional to the number of - blocks read. - - TODO: - Move this to handler->read_time() by adding a flag 'index-only-read' to - this call. The reason for doing this is that the current function doesn't - handle the case when the row is stored in the b-tree (like in innodb - clustered index) -*/ - -static double get_index_only_read_time(const PARAM* param, ha_rows records, - int keynr) -{ - double read_time; - uint keys_per_block= (param->table->file->stats.block_size/2/ - (param->table->key_info[keynr].key_length+ - param->table->file->ref_length) + 1); - read_time=((double) (records+keys_per_block-1)/ - (double) keys_per_block); - return read_time; -} - - typedef struct st_ror_scan_info { uint idx; /* # of used key in param->keys */ @@ -4048,9 +4080,9 @@ ROR_SCAN_INFO *make_ror_scan(const PARAM *param, int idx, SEL_ARG *sel_arg) if (bitmap_is_set(¶m->needed_fields, key_part->fieldnr-1)) bitmap_set_bit(&ror_scan->covered_fields, key_part->fieldnr-1); } + double rows= rows2double(param->table->quick_rows[ror_scan->keynr]); ror_scan->index_read_cost= - get_index_only_read_time(param, param->table->quick_rows[ror_scan->keynr], - ror_scan->keynr); + param->table->file->index_only_read_time(ror_scan->keynr, rows); DBUG_RETURN(ror_scan); } @@ -4837,9 +4869,11 @@ static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree, bool update_tbl_stats, double read_time) { - int idx; + uint idx; SEL_ARG **key,**end, **key_to_read= NULL; ha_rows UNINIT_VAR(best_records); /* protected by key_to_read */ + uint UNINIT_VAR(best_mrr_flags), /* protected by key_to_read */ + UNINIT_VAR(best_buf_size); /* protected by key_to_read */ TRP_RANGE* read_plan= NULL; bool pk_is_clustered= param->table->file->primary_key_is_clustered(); DBUG_ENTER("get_key_scans_params"); @@ -4852,64 +4886,40 @@ static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree, "tree scans");); tree->ror_scans_map.clear_all(); tree->n_ror_scans= 0; - for (idx= 0,key=tree->keys, end=key+param->keys; - key != end ; - key++,idx++) + for (idx= 0,key=tree->keys, end=key+param->keys; key != end; key++,idx++) { - ha_rows found_records; - double found_read_time; if (*key) { + ha_rows found_records; + COST_VECT cost; + double found_read_time; + uint mrr_flags, buf_size; uint keynr= param->real_keynr[idx]; if ((*key)->type == SEL_ARG::MAYBE_KEY || (*key)->maybe_flag) param->needed_reg->set_bit(keynr); - bool read_index_only= index_read_must_be_used ? TRUE : - (bool) param->table->covering_keys.is_set(keynr); + bool read_index_only= index_read_must_be_used || + param->table->covering_keys.is_set(keynr); - found_records= check_quick_select(param, idx, *key, update_tbl_stats); - if (param->is_ror_scan) + found_records= check_quick_select(param, idx, read_index_only, *key, + update_tbl_stats, &mrr_flags, + &buf_size, &cost); + + if ((found_records != HA_POS_ERROR) && param->is_ror_scan) { tree->n_ror_scans++; tree->ror_scans_map.set_bit(idx); } - double cpu_cost= (double) found_records / TIME_FOR_COMPARE; - if (found_records != HA_POS_ERROR && found_records > 2 && - read_index_only && - (param->table->file->index_flags(keynr, param->max_key_part,1) & - HA_KEYREAD_ONLY) && - !(pk_is_clustered && keynr == param->table->s->primary_key)) - { - /* - We can resolve this by only reading through this key. - 0.01 is added to avoid races between range and 'index' scan. - */ - found_read_time= get_index_only_read_time(param,found_records,keynr) + - cpu_cost + 0.01; - } - else - { - /* - cost(read_through_index) = cost(disk_io) + cost(row_in_range_checks) - The row_in_range check is in QUICK_RANGE_SELECT::cmp_next function. - */ - found_read_time= param->table->file->read_time(keynr, - param->range_count, - found_records) + - cpu_cost + 0.01; - } - DBUG_PRINT("info",("key %s: found_read_time: %g (cur. read_time: %g)", - param->table->key_info[keynr].name, found_read_time, - read_time)); - - if (read_time > found_read_time && found_records != HA_POS_ERROR) + if (found_records != HA_POS_ERROR && + read_time > (found_read_time= cost.total_cost())) { read_time= found_read_time; best_records= found_records; key_to_read= key; + best_mrr_flags= mrr_flags; + best_buf_size= buf_size; } - } } @@ -4918,11 +4928,13 @@ static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree, if (key_to_read) { idx= key_to_read - tree->keys; - if ((read_plan= new (param->mem_root) TRP_RANGE(*key_to_read, idx))) + if ((read_plan= new (param->mem_root) TRP_RANGE(*key_to_read, idx, + best_mrr_flags))) { read_plan->records= best_records; read_plan->is_ror= tree->ror_scans_map.is_set(idx); read_plan->read_cost= read_time; + read_plan->mrr_buf_size= best_buf_size; DBUG_PRINT("info", ("Returning range plan for key %s, cost %g, records %lu", param->table->key_info[param->real_keynr[idx]].name, @@ -4985,7 +4997,9 @@ QUICK_SELECT_I *TRP_ROR_INTERSECT::make_quick(PARAM *param, for (; first_scan != last_scan;++first_scan) { if (!(quick= get_quick_select(param, (*first_scan)->idx, - (*first_scan)->sel_arg, alloc)) || + (*first_scan)->sel_arg, + HA_MRR_USE_DEFAULT_IMPL | HA_MRR_SORTED, + 0, alloc)) || quick_intrsect->push_quick_back(quick)) { delete quick_intrsect; @@ -4995,7 +5009,9 @@ QUICK_SELECT_I *TRP_ROR_INTERSECT::make_quick(PARAM *param, if (cpk_scan) { if (!(quick= get_quick_select(param, cpk_scan->idx, - cpk_scan->sel_arg, alloc))) + cpk_scan->sel_arg, + HA_MRR_USE_DEFAULT_IMPL | HA_MRR_SORTED, + 0, alloc))) { delete quick_intrsect; DBUG_RETURN(NULL); @@ -7266,250 +7282,260 @@ void SEL_ARG::test_use_count(SEL_ARG *root) #endif +/**************************************************************************** + MRR Range Sequence Interface implementation that walks a SEL_ARG* tree. + ****************************************************************************/ + +/* MRR range sequence, SEL_ARG* implementation: stack entry */ +typedef struct st_range_seq_entry +{ + /* + Pointers in min and max keys. They point to right-after-end of key + images. The 0-th entry has these pointing to key tuple start. + */ + uchar *min_key, *max_key; + + /* + Flags, for {keypart0, keypart1, ... this_keypart} subtuple. + min_key_flag may have NULL_RANGE set. + */ + uint min_key_flag, max_key_flag; + + /* Number of key parts */ + uint min_key_parts, max_key_parts; + SEL_ARG *key_tree; +} RANGE_SEQ_ENTRY; + /* - Calculate estimate of number records that will be retrieved by a range - scan on given index using given SEL_ARG intervals tree. - SYNOPSIS - check_quick_select - param Parameter from test_quick_select - idx Number of index to use in tree->keys - tree Transformed selection condition, tree->keys[idx] - holds the range tree to be used for scanning. - update_tbl_stats If true, update table->quick_keys with information - about range scan we've evaluated. + MRR range sequence, SEL_ARG* implementation: SEL_ARG graph traversal context +*/ +typedef struct st_sel_arg_range_seq +{ + uint keyno; /* index of used tree in SEL_TREE structure */ + uint real_keyno; /* Number of the index in tables */ + PARAM *param; + SEL_ARG *start; /* Root node of the traversed SEL_ARG* graph */ + + RANGE_SEQ_ENTRY stack[MAX_REF_PARTS]; + int i; /* Index of last used element in the above array */ + + bool at_start; /* TRUE <=> The traversal has just started */ +} SEL_ARG_RANGE_SEQ; - NOTES - param->is_ror_scan is set to reflect if the key scan is a ROR (see - is_key_scan_ror function for more info) - param->table->quick_*, param->range_count (and maybe others) are - updated with data of given key scan, see check_quick_keys for details. - RETURN - Estimate # of records to be retrieved. - HA_POS_ERROR if estimate calculation failed due to table handler problems. +/* + Range sequence interface, SEL_ARG* implementation: Initialize the traversal + + SYNOPSIS + init() + init_params SEL_ARG tree traversal context + n_ranges [ignored] The number of ranges obtained + flags [ignored] HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY + RETURN + Value of init_param */ -static ha_rows -check_quick_select(PARAM *param,uint idx,SEL_ARG *tree, bool update_tbl_stats) +range_seq_t sel_arg_range_seq_init(void *init_param, uint n_ranges, uint flags) { - ha_rows records; - bool cpk_scan; - uint key; - DBUG_ENTER("check_quick_select"); + SEL_ARG_RANGE_SEQ *seq= (SEL_ARG_RANGE_SEQ*)init_param; + seq->at_start= TRUE; + seq->stack[0].key_tree= NULL; + seq->stack[0].min_key= seq->param->min_key; + seq->stack[0].min_key_flag= 0; + seq->stack[0].min_key_parts= 0; - param->is_ror_scan= FALSE; - param->first_null_comp= 0; + seq->stack[0].max_key= seq->param->max_key; + seq->stack[0].max_key_flag= 0; + seq->stack[0].max_key_parts= 0; + seq->i= 0; + return init_param; +} - if (!tree) - DBUG_RETURN(HA_POS_ERROR); // Can't use it - param->max_key_part=0; - param->range_count=0; - key= param->real_keynr[idx]; - if (tree->type == SEL_ARG::IMPOSSIBLE) - DBUG_RETURN(0L); // Impossible select. return - if (tree->type != SEL_ARG::KEY_RANGE || tree->part != 0) - DBUG_RETURN(HA_POS_ERROR); // Don't use tree +static void step_down_to(SEL_ARG_RANGE_SEQ *arg, SEL_ARG *key_tree) +{ + RANGE_SEQ_ENTRY *cur= &arg->stack[arg->i+1]; + RANGE_SEQ_ENTRY *prev= &arg->stack[arg->i]; + + cur->key_tree= key_tree; + cur->min_key= prev->min_key; + cur->max_key= prev->max_key; + cur->min_key_parts= prev->min_key_parts; + cur->max_key_parts= prev->max_key_parts; - enum ha_key_alg key_alg= param->table->key_info[key].algorithm; - if ((key_alg != HA_KEY_ALG_BTREE) && (key_alg!= HA_KEY_ALG_UNDEF)) - { - /* Records are not ordered by rowid for other types of indexes. */ - cpk_scan= FALSE; - } - else - { - /* - Clustered PK scan is a special case, check_quick_keys doesn't recognize - CPK scans as ROR scans (while actually any CPK scan is a ROR scan). - */ - cpk_scan= ((param->table->s->primary_key == param->real_keynr[idx]) && - param->table->file->primary_key_is_clustered()); - param->is_ror_scan= !cpk_scan; - } - param->n_ranges= 0; + uint16 stor_length= arg->param->key[arg->keyno][key_tree->part].store_length; + cur->min_key_parts += key_tree->store_min(stor_length, &cur->min_key, + prev->min_key_flag); + cur->max_key_parts += key_tree->store_max(stor_length, &cur->max_key, + prev->max_key_flag); - records= check_quick_keys(param, idx, tree, - param->min_key, 0, -1, - param->max_key, 0, -1); - if (records != HA_POS_ERROR) - { - if (update_tbl_stats) - { - param->table->quick_keys.set_bit(key); - param->table->quick_key_parts[key]=param->max_key_part+1; - param->table->quick_n_ranges[key]= param->n_ranges; - param->table->quick_condition_rows= - min(param->table->quick_condition_rows, records); - } - /* - Need to save quick_rows in any case as it is used when calculating - cost of ROR intersection: - */ - param->table->quick_rows[key]=records; - if (cpk_scan) - param->is_ror_scan= TRUE; - } - if (param->table->file->index_flags(key, 0, TRUE) & HA_KEY_SCAN_NOT_ROR) - param->is_ror_scan= FALSE; - DBUG_PRINT("exit", ("Records: %lu", (ulong) records)); - DBUG_RETURN(records); + cur->min_key_flag= prev->min_key_flag | key_tree->min_flag; + cur->max_key_flag= prev->max_key_flag | key_tree->max_flag; + + if (key_tree->is_null_interval()) + cur->min_key_flag |= NULL_RANGE; + (arg->i)++; } /* - Recursively calculate estimate of # rows that will be retrieved by - key scan on key idx. + Range sequence interface, SEL_ARG* implementation: get the next interval + SYNOPSIS - check_quick_keys() - param Parameter from test_quick select function. - idx Number of key to use in PARAM::keys in list of used keys - (param->real_keynr[idx] holds the key number in table) - key_tree SEL_ARG tree being examined. - min_key Buffer with partial min key value tuple - min_key_flag - max_key Buffer with partial max key value tuple - max_key_flag - - NOTES - The function does the recursive descent on the tree via SEL_ARG::left, - SEL_ARG::right, and SEL_ARG::next_key_part edges. The #rows estimates - are calculated using records_in_range calls at the leaf nodes and then - summed. - - param->min_key and param->max_key are used to hold prefixes of key value - tuples. - - The side effects are: - - param->max_key_part is updated to hold the maximum number of key parts used - in scan minus 1. + sel_arg_range_seq_next() + rseq Value returned from sel_arg_range_seq_init + range OUT Store information about the range here - param->range_count is incremented if the function finds a range that - wasn't counted by the caller. + DESCRIPTION + This is "get_next" function for Range sequence interface implementation + for SEL_ARG* tree. - param->is_ror_scan is cleared if the function detects that the key scan is - not a Rowid-Ordered Retrieval scan ( see comments for is_key_scan_ror - function for description of which key scans are ROR scans) + IMPLEMENTATION + The traversal also updates those param members: + - is_ror_scan + - range_count + - max_key_part RETURN - #records E(#records) for given subtree - HA_POS_ERROR if subtree cannot be used for record retrieval - + 0 Ok + 1 No more ranges in the sequence */ -static ha_rows -check_quick_keys(PARAM *param, uint idx, SEL_ARG *key_tree, - uchar *min_key, uint min_key_flag, int min_keypart, - uchar *max_key, uint max_key_flag, int max_keypart) +//psergey-merge-todo: support check_quick_keys:max_keypart +uint sel_arg_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range) { - ha_rows records=0, tmp; - uint tmp_min_flag, tmp_max_flag, keynr, min_key_length, max_key_length; - uint tmp_min_keypart= min_keypart, tmp_max_keypart= max_keypart; - uchar *tmp_min_key, *tmp_max_key; - uint8 save_first_null_comp= param->first_null_comp; - - param->max_key_part=max(param->max_key_part,key_tree->part); - if (key_tree->left != &null_element) + SEL_ARG *key_tree; + SEL_ARG_RANGE_SEQ *seq= (SEL_ARG_RANGE_SEQ*)rseq; + if (seq->at_start) { - /* - There are at least two intervals for current key part, i.e. condition - was converted to something like - (keyXpartY less/equals c1) OR (keyXpartY more/equals c2). - This is not a ROR scan if the key is not Clustered Primary Key. - */ - param->is_ror_scan= FALSE; - records=check_quick_keys(param, idx, key_tree->left, - min_key, min_key_flag, min_keypart, - max_key, max_key_flag, max_keypart); - if (records == HA_POS_ERROR) // Impossible - return records; + key_tree= seq->start; + seq->at_start= FALSE; + goto walk_up_n_right; } - tmp_min_key= min_key; - tmp_max_key= max_key; - tmp_min_keypart+= key_tree->store_min(param->key[idx][key_tree->part].store_length, - &tmp_min_key, min_key_flag); - tmp_max_keypart+= key_tree->store_max(param->key[idx][key_tree->part].store_length, - &tmp_max_key, max_key_flag); - min_key_length= (uint) (tmp_min_key - param->min_key); - max_key_length= (uint) (tmp_max_key - param->max_key); - - if (param->is_ror_scan) + key_tree= seq->stack[seq->i].key_tree; + /* Ok, we're at some "full tuple" position in the tree */ + + /* Step down if we can */ + if (key_tree->next && key_tree->next != &null_element) { - /* - If the index doesn't cover entire key, mark the scan as non-ROR scan. - Actually we're cutting off some ROR scans here. - */ - uint16 fieldnr= param->table->key_info[param->real_keynr[idx]]. - key_part[key_tree->part].fieldnr - 1; - if (param->table->field[fieldnr]->key_length() != - param->key[idx][key_tree->part].length) - param->is_ror_scan= FALSE; + //step down; (update the tuple, we'll step right and stay there) + seq->i--; + step_down_to(seq, key_tree->next); + key_tree= key_tree->next; + seq->param->is_ror_scan= FALSE; + goto walk_right_n_up; } - if (!param->first_null_comp && key_tree->is_null_interval()) - param->first_null_comp= key_tree->part+1; + /* Ok, can't step down, walk left until we can step down */ + while (1) + { + if (seq->i == 1) // can't step left + return 1; + /* Step left */ + seq->i--; + key_tree= seq->stack[seq->i].key_tree; - if (key_tree->next_key_part && - key_tree->next_key_part->part == key_tree->part+1 && - key_tree->next_key_part->type == SEL_ARG::KEY_RANGE) - { // const key as prefix - if (min_key_length == max_key_length && - !memcmp(min_key, max_key, (uint) (tmp_max_key - max_key)) && - !key_tree->min_flag && !key_tree->max_flag) - { - tmp=check_quick_keys(param,idx,key_tree->next_key_part, tmp_min_key, - min_key_flag | key_tree->min_flag, tmp_min_keypart, - tmp_max_key, max_key_flag | key_tree->max_flag, - tmp_max_keypart); - goto end; // Ugly, but efficient - } - else + /* Step down if we can */ + if (key_tree->next && key_tree->next != &null_element) { - /* The interval for current key part is not c1 <= keyXpartY <= c1 */ - param->is_ror_scan= FALSE; + // Step down; update the tuple + seq->i--; + step_down_to(seq, key_tree->next); + key_tree= key_tree->next; + break; } - - tmp_min_flag=key_tree->min_flag; - tmp_max_flag=key_tree->max_flag; - if (!tmp_min_flag) - tmp_min_keypart+= - key_tree->next_key_part->store_min_key(param->key[idx], &tmp_min_key, - &tmp_min_flag); - if (!tmp_max_flag) - tmp_max_keypart+= - key_tree->next_key_part->store_max_key(param->key[idx], &tmp_max_key, - &tmp_max_flag); - min_key_length= (uint) (tmp_min_key - param->min_key); - max_key_length= (uint) (tmp_max_key - param->max_key); } - else + + /* + Ok, we've stepped down from the path to previous tuple. + Walk right-up while we can + */ +walk_right_n_up: + while (key_tree->next_key_part && key_tree->next_key_part != &null_element && + key_tree->next_key_part->part == key_tree->part + 1 && + key_tree->next_key_part->type == SEL_ARG::KEY_RANGE) { - tmp_min_flag= min_key_flag | key_tree->min_flag; - tmp_max_flag= max_key_flag | key_tree->max_flag; + { + RANGE_SEQ_ENTRY *cur= &seq->stack[seq->i]; + uint min_key_length= cur->min_key - seq->param->min_key; + uint max_key_length= cur->max_key - seq->param->max_key; + uint len= cur->min_key - cur[-1].min_key; + if (!(min_key_length == max_key_length && + !memcmp(cur[-1].min_key, cur[-1].max_key, len) && + !key_tree->min_flag && !key_tree->max_flag)) + { + seq->param->is_ror_scan= FALSE; + if (!key_tree->min_flag) + cur->min_key_parts += + key_tree->next_key_part->store_min_key(seq->param->key[seq->keyno], + &cur->min_key, + &cur->min_key_flag); + if (!key_tree->max_flag) + cur->max_key_parts += + key_tree->next_key_part->store_max_key(seq->param->key[seq->keyno], + &cur->max_key, + &cur->max_key_flag); + break; + } + } + + /* + Ok, current atomic interval is in form "t.field=const" and there is + next_key_part interval. Step right, and walk up from there. + */ + key_tree= key_tree->next_key_part; + +walk_up_n_right: + while (key_tree->prev && key_tree->prev != &null_element) + { + /* Step up */ + key_tree= key_tree->prev; + } + step_down_to(seq, key_tree); } - if (unlikely(param->thd->killed != 0)) - return HA_POS_ERROR; + /* Ok got a tuple */ + RANGE_SEQ_ENTRY *cur= &seq->stack[seq->i]; + uint min_key_length= cur->min_key - seq->param->min_key; - keynr=param->real_keynr[idx]; - param->range_count++; - if (!tmp_min_flag && ! tmp_max_flag && - (uint) key_tree->part+1 == param->table->key_info[keynr].key_parts && - (param->table->key_info[keynr].flags & (HA_NOSAME | HA_END_SPACE_KEY)) == - HA_NOSAME && min_key_length == max_key_length && - !memcmp(param->min_key, param->max_key, min_key_length) && - !param->first_null_comp) - { - tmp=1; // Max one record - param->n_ranges++; + range->ptr= (char*)(int)(key_tree->part); + if (cur->min_key_flag & GEOM_FLAG) + { + range->range_flag= cur->min_key_flag; + + /* Here minimum contains also function code bits, and maximum is +inf */ + range->start_key.key= seq->param->min_key; + range->start_key.length= min_key_length; + range->start_key.flag= (ha_rkey_function) (cur->min_key_flag ^ GEOM_FLAG); } else { - if (param->is_ror_scan) + range->range_flag= cur->min_key_flag | cur->max_key_flag; + + range->start_key.key= seq->param->min_key; + range->start_key.length= cur->min_key - seq->param->min_key; + range->start_key.keypart_map= make_prev_keypart_map(cur->min_key_parts); + range->start_key.flag= (cur->min_key_flag & NEAR_MIN ? HA_READ_AFTER_KEY : + HA_READ_KEY_EXACT); + + range->end_key.key= seq->param->max_key; + range->end_key.length= cur->max_key - seq->param->max_key; + range->end_key.flag= (cur->max_key_flag & NEAR_MAX ? HA_READ_BEFORE_KEY : + HA_READ_AFTER_KEY); + range->end_key.keypart_map= make_prev_keypart_map(cur->max_key_parts); + + if (!(cur->min_key_flag & ~NULL_RANGE) && !cur->max_key_flag && + (uint)key_tree->part+1 == seq->param->table->key_info[seq->real_keyno].key_parts && + (seq->param->table->key_info[seq->real_keyno].flags & (HA_NOSAME | HA_END_SPACE_KEY)) == + HA_NOSAME && + range->start_key.length == range->end_key.length && + !memcmp(seq->param->min_key,seq->param->max_key,range->start_key.length)) + range->range_flag= UNIQUE_RANGE | (cur->min_key_flag & NULL_RANGE); + + if (seq->param->is_ror_scan) { /* If we get here, the condition on the key was converted to form @@ -7520,69 +7546,137 @@ check_quick_keys(PARAM *param, uint idx, SEL_ARG *key_tree, uncovered "tail" of KeyX parts is either empty or is identical to first members of clustered primary key. */ - if (!(min_key_length == max_key_length && - !memcmp(min_key, max_key, (uint) (tmp_max_key - max_key)) && - !key_tree->min_flag && !key_tree->max_flag && - is_key_scan_ror(param, keynr, key_tree->part + 1))) - param->is_ror_scan= FALSE; + if (!(!(cur->min_key_flag & ~NULL_RANGE) && !cur->max_key_flag && + (range->start_key.length == range->end_key.length) && + !memcmp(range->start_key.key, range->end_key.key, range->start_key.length) && + is_key_scan_ror(seq->param, seq->real_keyno, key_tree->part + 1))) + seq->param->is_ror_scan= FALSE; } - param->n_ranges++; + } + seq->param->range_count++; + seq->param->max_key_part=max(seq->param->max_key_part,key_tree->part); + return 0; +} - if (tmp_min_flag & GEOM_FLAG) - { - key_range min_range; - min_range.key= param->min_key; - min_range.length= min_key_length; - min_range.keypart_map= make_keypart_map(tmp_min_keypart); - /* In this case tmp_min_flag contains the handler-read-function */ - min_range.flag= (ha_rkey_function) (tmp_min_flag ^ GEOM_FLAG); - tmp= param->table->file->records_in_range(keynr, - &min_range, (key_range*) 0); - } - else +/* + Calculate cost and E(#rows) for a given index and intervals tree + + SYNOPSIS + check_quick_select() + param Parameter from test_quick_select + idx Number of index to use in PARAM::key SEL_TREE::key + index_only TRUE - assume only index tuples will be accessed + FALSE - assume full table rows will be read + tree Transformed selection condition, tree->key[idx] holds + the intervals for the given index. + update_tbl_stats TRUE <=> update table->quick_* with information + about range scan we've evaluated. + mrr_flags INOUT MRR access flags + cost OUT Scan cost + + NOTES + param->is_ror_scan is set to reflect if the key scan is a ROR (see + is_key_scan_ror function for more info) + param->table->quick_*, param->range_count (and maybe others) are + updated with data of given key scan, see quick_range_seq_next for details. + + RETURN + Estimate # of records to be retrieved. + HA_POS_ERROR if estimate calculation failed due to table handler problems. +*/ + +static +ha_rows check_quick_select(PARAM *param, uint idx, bool index_only, + SEL_ARG *tree, bool update_tbl_stats, + uint *mrr_flags, uint *bufsize, COST_VECT *cost) +{ + SEL_ARG_RANGE_SEQ seq; + RANGE_SEQ_IF seq_if = {sel_arg_range_seq_init, sel_arg_range_seq_next, 0, 0}; + handler *file= param->table->file; + ha_rows rows; + uint keynr= param->real_keynr[idx]; + DBUG_ENTER("check_quick_select"); + + /* Handle cases when we don't have a valid non-empty list of range */ + if (!tree) + DBUG_RETURN(HA_POS_ERROR); + if (tree->type == SEL_ARG::IMPOSSIBLE) + DBUG_RETURN(0L); + if (tree->type != SEL_ARG::KEY_RANGE || tree->part != 0) + DBUG_RETURN(HA_POS_ERROR); + + seq.keyno= idx; + seq.real_keyno= keynr; + seq.param= param; + seq.start= tree; + + param->range_count=0; + param->max_key_part=0; + + param->is_ror_scan= TRUE; + if (file->index_flags(keynr, 0, TRUE) & HA_KEY_SCAN_NOT_ROR) + param->is_ror_scan= FALSE; + + *mrr_flags= param->force_default_mrr? HA_MRR_USE_DEFAULT_IMPL: 0; + /* + Pass HA_MRR_SORTED to see if MRR implementation can handle sorting. + */ + *mrr_flags|= HA_MRR_NO_ASSOCIATION | HA_MRR_SORTED; + + bool pk_is_clustered= file->primary_key_is_clustered(); + if (index_only && + (file->index_flags(keynr, param->max_key_part, 1) & HA_KEYREAD_ONLY) && + !(pk_is_clustered && keynr == param->table->s->primary_key)) + *mrr_flags |= HA_MRR_INDEX_ONLY; + + if (current_thd->lex->sql_command != SQLCOM_SELECT) + *mrr_flags |= HA_MRR_USE_DEFAULT_IMPL; + + *bufsize= param->thd->variables.read_rnd_buff_size; + rows= file->multi_range_read_info_const(keynr, &seq_if, (void*)&seq, 0, + bufsize, mrr_flags, cost); + if (rows != HA_POS_ERROR) + { + param->table->quick_rows[keynr]=rows; + if (update_tbl_stats) { - key_range min_range, max_range; - - min_range.key= param->min_key; - min_range.length= min_key_length; - min_range.flag= (tmp_min_flag & NEAR_MIN ? HA_READ_AFTER_KEY : - HA_READ_KEY_EXACT); - min_range.keypart_map= make_keypart_map(tmp_min_keypart); - max_range.key= param->max_key; - max_range.length= max_key_length; - max_range.flag= (tmp_max_flag & NEAR_MAX ? - HA_READ_BEFORE_KEY : HA_READ_AFTER_KEY); - max_range.keypart_map= make_keypart_map(tmp_max_keypart); - tmp=param->table->file->records_in_range(keynr, - (min_key_length ? &min_range : - (key_range*) 0), - (max_key_length ? &max_range : - (key_range*) 0)); + param->table->quick_keys.set_bit(keynr); + param->table->quick_key_parts[keynr]=param->max_key_part+1; + param->table->quick_n_ranges[keynr]= param->range_count; + param->table->quick_condition_rows= + min(param->table->quick_condition_rows, rows); } } - end: - if (tmp == HA_POS_ERROR) // Impossible range - return tmp; - records+=tmp; - if (key_tree->right != &null_element) + /* Figure out if the key scan is ROR (returns rows in ROWID order) or not */ + enum ha_key_alg key_alg= param->table->key_info[seq.real_keyno].algorithm; + if ((key_alg != HA_KEY_ALG_BTREE) && (key_alg!= HA_KEY_ALG_UNDEF)) { - /* - There are at least two intervals for current key part, i.e. condition - was converted to something like - (keyXpartY less/equals c1) OR (keyXpartY more/equals c2). - This is not a ROR scan if the key is not Clustered Primary Key. + /* + All scans are non-ROR scans for those index types. + TODO: Don't have this logic here, make table engines return + appropriate flags instead. + */ + param->is_ror_scan= FALSE; + } + else if (param->table->s->primary_key == keynr && pk_is_clustered) + { + /* Clustered PK scan is always a ROR scan (TODO: same as above) */ + param->is_ror_scan= TRUE; + } + else if (param->range_count > 1) + { + /* + Scaning multiple key values in the index: the records are ROR + for each value, but not between values. E.g, "SELECT ... x IN + (1,3)" returns ROR order for all records with x=1, then ROR + order for records with x=3 */ param->is_ror_scan= FALSE; - tmp=check_quick_keys(param, idx, key_tree->right, - min_key, min_key_flag, min_keypart, - max_key, max_key_flag, max_keypart); - if (tmp == HA_POS_ERROR) - return tmp; - records+=tmp; } - param->first_null_comp= save_first_null_comp; - return records; + + DBUG_PRINT("exit", ("Records: %lu", (ulong) rows)); + DBUG_RETURN(rows); //psergey-merge:todo: maintain first_null_comp. } @@ -7610,13 +7704,14 @@ check_quick_keys(PARAM *param, uint idx, SEL_ARG *key_tree, where the index is defined on (key1_1, ..., key1_N [,a_1, ..., a_n]) and the table has a clustered Primary Key defined as - PRIMARY KEY(a_1, ..., a_n, b1, ..., b_k) i.e. the first key parts of it are identical to uncovered parts ot the key being scanned. This function assumes that the index flags do not include HA_KEY_SCAN_NOT_ROR flag (that is checked elsewhere). + Check (1) is made in quick_range_seq_next() + RETURN TRUE The scan is ROR-scan FALSE Otherwise @@ -7629,9 +7724,19 @@ static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts) KEY_PART_INFO *key_part_end= (table_key->key_part + table_key->key_parts); uint pk_number; + + for (KEY_PART_INFO *kp= table_key->key_part; kp < key_part; kp++) + { + uint16 fieldnr= param->table->key_info[keynr]. + key_part[kp - table_key->key_part].fieldnr - 1; + if (param->table->field[fieldnr]->key_length() != kp->length) + return FALSE; + } if (key_part == key_part_end) return TRUE; + + key_part= table_key->key_part + nparts; pk_number= param->table->s->primary_key; if (!param->table->file->primary_key_is_clustered() || pk_number == MAX_KEY) return FALSE; @@ -7656,12 +7761,14 @@ static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts) SYNOPSIS get_quick_select() param - idx Index of used key in param->key. - key_tree SEL_ARG tree for the used key - parent_alloc If not NULL, use it to allocate memory for - quick select data. Otherwise use quick->alloc. + idx Index of used key in param->key. + key_tree SEL_ARG tree for the used key + mrr_flags MRR parameter for quick select + mrr_buf_size MRR parameter for quick select + parent_alloc If not NULL, use it to allocate memory for + quick select data. Otherwise use quick->alloc. NOTES - The caller must call QUICK_SELECT::init for returned quick select + The caller must call QUICK_SELECT::init for returned quick select. CAUTION! This function may change thd->mem_root to a MEM_ROOT which will be deallocated when the returned quick select is deleted. @@ -7672,25 +7779,26 @@ static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts) */ QUICK_RANGE_SELECT * -get_quick_select(PARAM *param,uint idx,SEL_ARG *key_tree, - MEM_ROOT *parent_alloc) +get_quick_select(PARAM *param,uint idx,SEL_ARG *key_tree, uint mrr_flags, + uint mrr_buf_size, MEM_ROOT *parent_alloc) { QUICK_RANGE_SELECT *quick; + bool create_err= FALSE; DBUG_ENTER("get_quick_select"); if (param->table->key_info[param->real_keynr[idx]].flags & HA_SPATIAL) quick=new QUICK_RANGE_SELECT_GEOM(param->thd, param->table, param->real_keynr[idx], test(parent_alloc), - parent_alloc); + parent_alloc, &create_err); else quick=new QUICK_RANGE_SELECT(param->thd, param->table, param->real_keynr[idx], - test(parent_alloc)); + test(parent_alloc), NULL, &create_err); if (quick) { - if (quick->error || + if (create_err || get_quick_keys(param,quick,param->key[idx],key_tree,param->min_key,0, param->max_key,0)) { @@ -7699,6 +7807,8 @@ get_quick_select(PARAM *param,uint idx,SEL_ARG *key_tree, } else { + quick->mrr_flags= mrr_flags; + quick->mrr_buf_size= mrr_buf_size; quick->key_parts=(KEY_PART*) memdup_root(parent_alloc? parent_alloc : &quick->alloc, (char*) param->key[idx], @@ -7847,7 +7957,20 @@ bool QUICK_RANGE_SELECT::unique_key_range() } -/* Returns TRUE if any part of the key is NULL */ + +/* + Return TRUE if any part of the key is NULL + + SYNOPSIS + null_part_in_key() + key_part Array of key parts (index description) + key Key values tuple + length Length of key values tuple in bytes. + + RETURN + TRUE The tuple has at least one "keypartX is NULL" + FALSE Otherwise +*/ static bool null_part_in_key(KEY_PART *key_part, const uchar *key, uint length) { @@ -7904,6 +8027,19 @@ bool QUICK_ROR_UNION_SELECT::is_keys_used(const MY_BITMAP *fields) } +FT_SELECT *get_ft_select(THD *thd, TABLE *table, uint key) +{ + bool create_err= FALSE; + FT_SELECT *fts= new FT_SELECT(thd, table, key, &create_err); + if (create_err) + { + delete fts; + return NULL; + } + else + return fts; +} + /* Create quick select from ref/ref_or_null scan. @@ -7932,10 +8068,12 @@ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table, KEY_PART *key_part; QUICK_RANGE *range; uint part; + bool create_err= FALSE; + COST_VECT cost; old_root= thd->mem_root; /* The following call may change thd->mem_root */ - quick= new QUICK_RANGE_SELECT(thd, table, ref->key, 0); + quick= new QUICK_RANGE_SELECT(thd, table, ref->key, 0, 0, &create_err); /* save mem_root set by QUICK_RANGE_SELECT constructor */ alloc= thd->mem_root; /* @@ -7944,7 +8082,7 @@ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table, */ thd->mem_root= old_root; - if (!quick) + if (!quick || create_err) return 0; /* no ranges found */ if (quick->init()) goto err; @@ -7999,8 +8137,24 @@ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table, goto err; } - return quick; + /* Call multi_range_read_info() to get the MRR flags and buffer size */ + quick->mrr_flags= HA_MRR_NO_ASSOCIATION | + (table->key_read ? HA_MRR_INDEX_ONLY : 0); + if (thd->lex->sql_command != SQLCOM_SELECT) + quick->mrr_flags |= HA_MRR_USE_DEFAULT_IMPL; +#ifdef WITH_NDBCLUSTER_STORAGE_ENGINE + if (!ref->null_ref_key && !key_has_nulls(key_info, range->min_key, + ref->key_length)) + quick->mrr_flags |= HA_MRR_NO_NULL_ENDPOINTS; +#endif + quick->mrr_buf_size= thd->variables.read_rnd_buff_size; + if (table->file->multi_range_read_info(quick->index, 1, (uint)records, + &quick->mrr_buf_size, + &quick->mrr_flags, &cost)) + goto err; + + return quick; err: delete quick; return 0; @@ -8312,80 +8466,123 @@ int QUICK_ROR_UNION_SELECT::get_next() int QUICK_RANGE_SELECT::reset() { - uint mrange_bufsiz; + uint buf_size; uchar *mrange_buff; + int error; + HANDLER_BUFFER empty_buf; DBUG_ENTER("QUICK_RANGE_SELECT::reset"); - next=0; last_range= NULL; - in_range= FALSE; cur_range= (QUICK_RANGE**) ranges.buffer; if (file->inited == handler::NONE && (error= file->ha_index_init(index,1))) DBUG_RETURN(error); - /* Do not allocate the buffers twice. */ - if (multi_range_length) - { - DBUG_ASSERT(multi_range_length == min(multi_range_count, ranges.elements)); - DBUG_RETURN(0); - } - - /* Allocate the ranges array. */ - DBUG_ASSERT(ranges.elements); - multi_range_length= min(multi_range_count, ranges.elements); - DBUG_ASSERT(multi_range_length > 0); - while (multi_range_length && ! (multi_range= (KEY_MULTI_RANGE*) - my_malloc(multi_range_length * - sizeof(KEY_MULTI_RANGE), - MYF(MY_WME)))) - { - /* Try to shrink the buffers until it is 0. */ - multi_range_length/= 2; - } - if (! multi_range) - { - multi_range_length= 0; - DBUG_RETURN(HA_ERR_OUT_OF_MEM); - } - - /* Allocate the handler buffer if necessary. */ - if (file->ha_table_flags() & HA_NEED_READ_RANGE_BUFFER) + /* Allocate buffer if we need one but haven't allocated it yet */ + if (mrr_buf_size && !mrr_buf_desc) { - mrange_bufsiz= min(multi_range_bufsiz, - ((uint)QUICK_SELECT_I::records + 1)* head->s->reclength); - - while (mrange_bufsiz && - ! my_multi_malloc(MYF(MY_WME), - &multi_range_buff, - (uint) sizeof(*multi_range_buff), - &mrange_buff, (uint) mrange_bufsiz, - NullS)) + buf_size= mrr_buf_size; + while (buf_size && !my_multi_malloc(MYF(MY_WME), + &mrr_buf_desc, sizeof(*mrr_buf_desc), + &mrange_buff, buf_size, + NullS)) { /* Try to shrink the buffers until both are 0. */ - mrange_bufsiz/= 2; + buf_size/= 2; } - if (! multi_range_buff) - { - my_free((char*) multi_range, MYF(0)); - multi_range= NULL; - multi_range_length= 0; + if (!mrr_buf_desc) DBUG_RETURN(HA_ERR_OUT_OF_MEM); - } /* Initialize the handler buffer. */ - multi_range_buff->buffer= mrange_buff; - multi_range_buff->buffer_end= mrange_buff + mrange_bufsiz; - multi_range_buff->end_of_used_area= mrange_buff; -#ifdef HAVE_valgrind + mrr_buf_desc->buffer= mrange_buff; + mrr_buf_desc->buffer_end= mrange_buff + buf_size; + mrr_buf_desc->end_of_used_area= mrange_buff; +#ifdef HAVE_purify /* We need this until ndb will use the buffer efficiently (Now ndb stores complete row in here, instead of only the used fields which gives us valgrind warnings in compare_record[]) */ - bzero((char*) mrange_buff, mrange_bufsiz); + bzero((char*) mrange_buff, buf_size); #endif } - DBUG_RETURN(0); + + if (!mrr_buf_desc) + empty_buf.buffer= empty_buf.buffer_end= empty_buf.end_of_used_area= NULL; + + RANGE_SEQ_IF seq_funcs= {quick_range_seq_init, quick_range_seq_next, 0, 0}; + error= file->multi_range_read_init(&seq_funcs, (void*)this, ranges.elements, + mrr_flags, mrr_buf_desc? mrr_buf_desc: + &empty_buf); + DBUG_RETURN(error); +} + + +/* + Range sequence interface implementation for array<QUICK_RANGE>: initialize + + SYNOPSIS + quick_range_seq_init() + init_param Caller-opaque paramenter: QUICK_RANGE_SELECT* pointer + n_ranges Number of ranges in the sequence (ignored) + flags MRR flags (currently not used) + + RETURN + Opaque value to be passed to quick_range_seq_next +*/ + +range_seq_t quick_range_seq_init(void *init_param, uint n_ranges, uint flags) +{ + QUICK_RANGE_SELECT *quick= (QUICK_RANGE_SELECT*)init_param; + quick->qr_traversal_ctx.first= (QUICK_RANGE**)quick->ranges.buffer; + quick->qr_traversal_ctx.cur= (QUICK_RANGE**)quick->ranges.buffer; + quick->qr_traversal_ctx.last= quick->qr_traversal_ctx.cur + + quick->ranges.elements; + return &quick->qr_traversal_ctx; +} + + +/* + Range sequence interface implementation for array<QUICK_RANGE>: get next + + SYNOPSIS + quick_range_seq_next() + rseq Value returned from quick_range_seq_init + range OUT Store information about the range here + + RETURN + 0 Ok + 1 No more ranges in the sequence +*/ + +uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range) +{ + QUICK_RANGE_SEQ_CTX *ctx= (QUICK_RANGE_SEQ_CTX*)rseq; + + if (ctx->cur == ctx->last) + return 1; /* no more ranges */ + + QUICK_RANGE *cur= *(ctx->cur); + key_range *start_key= &range->start_key; + key_range *end_key= &range->end_key; + + start_key->key= cur->min_key; + start_key->length= cur->min_length; + start_key->keypart_map= cur->min_keypart_map; + start_key->flag= ((cur->flag & NEAR_MIN) ? HA_READ_AFTER_KEY : + (cur->flag & EQ_RANGE) ? + HA_READ_KEY_EXACT : HA_READ_KEY_OR_NEXT); + end_key->key= cur->max_key; + end_key->length= cur->max_length; + end_key->keypart_map= cur->max_keypart_map; + /* + We use HA_READ_AFTER_KEY here because if we are reading on a key + prefix. We want to find all keys with this prefix. + */ + end_key->flag= (cur->flag & NEAR_MAX ? HA_READ_BEFORE_KEY : + HA_READ_AFTER_KEY); + range->range_flag= cur->flag; + ctx->cur++; + return 0; } @@ -8406,15 +8603,8 @@ int QUICK_RANGE_SELECT::reset() int QUICK_RANGE_SELECT::get_next() { - int result; - KEY_MULTI_RANGE *mrange; - key_range *start_key; - key_range *end_key; + char *dummy; DBUG_ENTER("QUICK_RANGE_SELECT::get_next"); - DBUG_ASSERT(multi_range_length && multi_range && - (cur_range >= (QUICK_RANGE**) ranges.buffer) && - (cur_range <= (QUICK_RANGE**) ranges.buffer + ranges.elements)); - if (in_ror_merged_scan) { /* @@ -8424,63 +8614,8 @@ int QUICK_RANGE_SELECT::get_next() head->column_bitmaps_set_no_signal(&column_bitmap, &column_bitmap); } - for (;;) - { - if (in_range) - { - /* We did already start to read this key. */ - result= file->read_multi_range_next(&mrange); - if (result != HA_ERR_END_OF_FILE) - goto end; - } + int result= file->multi_range_read_next(&dummy); - uint count= min(multi_range_length, ranges.elements - - (cur_range - (QUICK_RANGE**) ranges.buffer)); - if (count == 0) - { - /* Ranges have already been used up before. None is left for read. */ - in_range= FALSE; - if (in_ror_merged_scan) - head->column_bitmaps_set_no_signal(save_read_set, save_write_set); - DBUG_RETURN(HA_ERR_END_OF_FILE); - } - KEY_MULTI_RANGE *mrange_slot, *mrange_end; - for (mrange_slot= multi_range, mrange_end= mrange_slot+count; - mrange_slot < mrange_end; - mrange_slot++) - { - start_key= &mrange_slot->start_key; - end_key= &mrange_slot->end_key; - last_range= *(cur_range++); - - start_key->key= (const uchar*) last_range->min_key; - start_key->length= last_range->min_length; - start_key->flag= ((last_range->flag & NEAR_MIN) ? HA_READ_AFTER_KEY : - (last_range->flag & EQ_RANGE) ? - HA_READ_KEY_EXACT : HA_READ_KEY_OR_NEXT); - start_key->keypart_map= last_range->min_keypart_map; - end_key->key= (const uchar*) last_range->max_key; - end_key->length= last_range->max_length; - /* - We use HA_READ_AFTER_KEY here because if we are reading on a key - prefix. We want to find all keys with this prefix. - */ - end_key->flag= (last_range->flag & NEAR_MAX ? HA_READ_BEFORE_KEY : - HA_READ_AFTER_KEY); - end_key->keypart_map= last_range->max_keypart_map; - - mrange_slot->range_flag= last_range->flag; - } - - result= file->read_multi_range_first(&mrange, multi_range, count, - sorted, multi_range_buff); - if (result != HA_ERR_END_OF_FILE) - goto end; - in_range= FALSE; /* No matching rows; go to next set of ranges. */ - } - -end: - in_range= ! result; if (in_ror_merged_scan) { /* Restore bitmaps set on entry */ @@ -8489,6 +8624,7 @@ end: DBUG_RETURN(result); } + /* Get the next record with a different prefix. @@ -8529,9 +8665,7 @@ int QUICK_RANGE_SELECT::get_next_prefix(uint prefix_length, key_range start_key, end_key; if (last_range) { - /* - Read the next record in the same range with prefix after cur_prefix. - */ + /* Read the next record in the same range with prefix after cur_prefix. */ DBUG_ASSERT(cur_prefix != 0); result= file->ha_index_read_map(record, cur_prefix, keypart_map, HA_READ_AFTER_KEY); @@ -8667,7 +8801,8 @@ bool QUICK_RANGE_SELECT::row_in_ranges() */ QUICK_SELECT_DESC::QUICK_SELECT_DESC(QUICK_RANGE_SELECT *q, - uint used_key_parts_arg) + uint used_key_parts_arg, + bool *create_err) :QUICK_RANGE_SELECT(*q), rev_it(rev_ranges), used_key_parts (used_key_parts_arg) { @@ -8676,9 +8811,9 @@ QUICK_SELECT_DESC::QUICK_SELECT_DESC(QUICK_RANGE_SELECT *q, Use default MRR implementation for reverse scans. No table engine currently can do an MRR scan with output in reverse index order. */ - multi_range_length= 0; - multi_range= NULL; - multi_range_buff= NULL; + mrr_buf_desc= NULL; + mrr_flags |= HA_MRR_USE_DEFAULT_IMPL; + mrr_buf_size= 0; QUICK_RANGE **pr= (QUICK_RANGE**)ranges.buffer; QUICK_RANGE **end_range= pr + ranges.elements; @@ -8787,6 +8922,7 @@ int QUICK_SELECT_DESC::get_next() /* Compare if found key is over max-value Returns 0 if key <= range->max_key + TODO: Figure out why can't this function be as simple as cmp_prev(). */ int QUICK_RANGE_SELECT::cmp_next(QUICK_RANGE *range_arg) @@ -9518,8 +9654,14 @@ get_best_group_min_max(PARAM *param, SEL_TREE *tree) cur_index_tree= get_index_range_tree(cur_index, tree, param, &cur_param_idx); /* Check if this range tree can be used for prefix retrieval. */ + COST_VECT dummy_cost; + uint mrr_flags= HA_MRR_USE_DEFAULT_IMPL; + uint mrr_bufsize=0; cur_quick_prefix_records= check_quick_select(param, cur_param_idx, - cur_index_tree, TRUE); + FALSE /*don't care(*/, + cur_index_tree, TRUE, + &mrr_flags, &mrr_bufsize, + &dummy_cost); } cost_group_min_max(table, cur_index_info, cur_used_key_parts, cur_group_key_parts, tree, cur_index_tree, @@ -10103,6 +10245,7 @@ TRP_GROUP_MIN_MAX::make_quick(PARAM *param, bool retrieve_full_rows, /* Make a QUICK_RANGE_SELECT to be used for group prefix retrieval. */ quick->quick_prefix_select= get_quick_select(param, param_idx, index_tree, + HA_MRR_USE_DEFAULT_IMPL, 0, &quick->alloc); /* @@ -11209,6 +11352,7 @@ static void print_ror_scans_arr(TABLE *table, const char *msg, DBUG_VOID_RETURN; } + /***************************************************************************** ** Print a quick range for debugging ** TODO: diff --git a/sql/opt_range.h b/sql/opt_range.h index 6f7ef7842e7..dfd0cdac003 100644 --- a/sql/opt_range.h +++ b/sql/opt_range.h @@ -114,13 +114,16 @@ class QUICK_RANGE :public Sql_alloc { 4. Delete the select: delete quick; - + + NOTE + quick select doesn't use Sql_alloc/MEM_ROOT allocation because "range + checked for each record" functionality may create/destroy + O(#records_in_some_table) quick selects during query execution. */ class QUICK_SELECT_I { public: - bool sorted; ha_rows records; /* estimate of # of records to be retrieved */ double read_time; /* time to perform this retrieval */ TABLE *head; @@ -193,6 +196,12 @@ public: virtual bool reverse_sorted() = 0; virtual bool unique_key_range() { return false; } + /* + Request that this quick select produces sorted output. Not all quick + selects can do it, the caller is responsible for calling this function + only for those quick selects that can. + */ + virtual void need_sorted_output() = 0; enum { QS_TYPE_RANGE = 0, QS_TYPE_INDEX_MERGE = 1, @@ -272,6 +281,22 @@ struct st_qsel_param; class PARAM; class SEL_ARG; + +/* + MRR range sequence, array<QUICK_RANGE> implementation: sequence traversal + context. +*/ +typedef struct st_quick_range_seq_ctx +{ + QUICK_RANGE **first; + QUICK_RANGE **cur; + QUICK_RANGE **last; +} QUICK_RANGE_SEQ_CTX; + +range_seq_t quick_range_seq_init(void *init_param, uint n_ranges, uint flags); +uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range); + + /* Quick select that does a range scan on a single key. The records are returned in key order. @@ -279,60 +304,45 @@ class SEL_ARG; class QUICK_RANGE_SELECT : public QUICK_SELECT_I { protected: - bool next,dont_free,in_ror_merged_scan; -public: - int error; -protected: handler *file; - /* - If true, this quick select has its "own" handler object which should be - closed no later then this quick select is deleted. - */ - bool free_file; - bool in_range; - uint multi_range_count; /* copy from thd->variables.multi_range_count */ - uint multi_range_length; /* the allocated length for the array */ - uint multi_range_bufsiz; /* copy from thd->variables.read_rnd_buff_size */ - KEY_MULTI_RANGE *multi_range; /* the multi-range array (allocated and - freed by QUICK_RANGE_SELECT) */ - HANDLER_BUFFER *multi_range_buff; /* the handler buffer (allocated and - freed by QUICK_RANGE_SELECT) */ + + /* Members to deal with case when this quick select is a ROR-merged scan */ + bool in_ror_merged_scan; MY_BITMAP column_bitmap, *save_read_set, *save_write_set; + bool free_file; /* TRUE <=> this->file is "owned" by this quick select */ - friend class TRP_ROR_INTERSECT; - friend - QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table, - struct st_table_ref *ref, - ha_rows records); - friend bool get_quick_keys(PARAM *param, - QUICK_RANGE_SELECT *quick,KEY_PART *key, - SEL_ARG *key_tree, - uchar *min_key, uint min_key_flag, - uchar *max_key, uint max_key_flag); - friend QUICK_RANGE_SELECT *get_quick_select(PARAM*,uint idx, - SEL_ARG *key_tree, - MEM_ROOT *alloc); - friend class QUICK_SELECT_DESC; - friend class QUICK_INDEX_MERGE_SELECT; - friend class QUICK_ROR_INTERSECT_SELECT; - friend class QUICK_GROUP_MIN_MAX_SELECT; + /* Range pointers to be used when not using MRR interface */ + /* Members needed to use the MRR interface */ + QUICK_RANGE_SEQ_CTX qr_traversal_ctx; +public: + uint mrr_flags; /* Flags to be used with MRR interface */ +protected: + uint mrr_buf_size; /* copy from thd->variables.read_rnd_buff_size */ + HANDLER_BUFFER *mrr_buf_desc; /* the handler buffer */ + /* Info about index we're scanning */ + DYNAMIC_ARRAY ranges; /* ordered array of range ptrs */ QUICK_RANGE **cur_range; /* current element in ranges */ - + QUICK_RANGE *last_range; + KEY_PART *key_parts; KEY_PART_INFO *key_part_info; + + bool dont_free; /* Used by QUICK_SELECT_DESC */ + int cmp_next(QUICK_RANGE *range); int cmp_prev(QUICK_RANGE *range); bool row_in_ranges(); public: MEM_ROOT alloc; - QUICK_RANGE_SELECT(THD *thd, TABLE *table,uint index_arg,bool no_alloc=0, - MEM_ROOT *parent_alloc=NULL); + QUICK_RANGE_SELECT(THD *thd, TABLE *table,uint index_arg,bool no_alloc, + MEM_ROOT *parent_alloc, bool *create_err); ~QUICK_RANGE_SELECT(); - + + void need_sorted_output(); int init(); int reset(void); int get_next(); @@ -352,6 +362,27 @@ public: #endif private: /* Default copy ctor used by QUICK_SELECT_DESC */ + friend class TRP_ROR_INTERSECT; + friend + QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table, + struct st_table_ref *ref, + ha_rows records); + friend bool get_quick_keys(PARAM *param, QUICK_RANGE_SELECT *quick, + KEY_PART *key, SEL_ARG *key_tree, + uchar *min_key, uint min_key_flag, + uchar *max_key, uint max_key_flag); + friend QUICK_RANGE_SELECT *get_quick_select(PARAM*,uint idx, + SEL_ARG *key_tree, + uint mrr_flags, + uint mrr_buf_size, + MEM_ROOT *alloc); + friend class QUICK_SELECT_DESC; + friend class QUICK_INDEX_MERGE_SELECT; + friend class QUICK_ROR_INTERSECT_SELECT; + friend class QUICK_GROUP_MIN_MAX_SELECT; + friend uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range); + friend range_seq_t quick_range_seq_init(void *init_param, + uint n_ranges, uint flags); }; @@ -359,8 +390,10 @@ class QUICK_RANGE_SELECT_GEOM: public QUICK_RANGE_SELECT { public: QUICK_RANGE_SELECT_GEOM(THD *thd, TABLE *table, uint index_arg, - bool no_alloc, MEM_ROOT *parent_alloc) - :QUICK_RANGE_SELECT(thd, table, index_arg, no_alloc, parent_alloc) + bool no_alloc, MEM_ROOT *parent_alloc, + bool *create_err) + :QUICK_RANGE_SELECT(thd, table, index_arg, no_alloc, parent_alloc, + create_err) {}; virtual int get_next(); }; @@ -432,6 +465,7 @@ public: ~QUICK_INDEX_MERGE_SELECT(); int init(); + void need_sorted_output() { DBUG_ASSERT(0); /* Can't do it */ } int reset(void); int get_next(); bool reverse_sorted() { return false; } @@ -491,6 +525,7 @@ public: ~QUICK_ROR_INTERSECT_SELECT(); int init(); + void need_sorted_output() { DBUG_ASSERT(0); /* Can't do it */ } int reset(void); int get_next(); bool reverse_sorted() { return false; } @@ -545,6 +580,7 @@ public: ~QUICK_ROR_UNION_SELECT(); int init(); + void need_sorted_output() { DBUG_ASSERT(0); /* Can't do it */ } int reset(void); int get_next(); bool reverse_sorted() { return false; } @@ -664,6 +700,7 @@ public: void adjust_prefix_ranges(); bool alloc_buffers(); int init(); + void need_sorted_output() { /* always do it */ } int reset(); int get_next(); bool reverse_sorted() { return false; } @@ -679,7 +716,8 @@ public: class QUICK_SELECT_DESC: public QUICK_RANGE_SELECT { public: - QUICK_SELECT_DESC(QUICK_RANGE_SELECT *q, uint used_key_parts); + QUICK_SELECT_DESC(QUICK_RANGE_SELECT *q, uint used_key_parts, + bool *create_err); int get_next(); bool reverse_sorted() { return 1; } int get_type() { return QS_TYPE_RANGE_DESC; } @@ -712,25 +750,29 @@ class SQL_SELECT :public Sql_alloc { { key_map tmp; tmp.set_all(); - return test_quick_select(thd, tmp, 0, limit, force_quick_range) < 0; + return test_quick_select(thd, tmp, 0, limit, force_quick_range, FALSE) < 0; } inline bool skip_record() { return cond ? cond->val_int() == 0 : 0; } int test_quick_select(THD *thd, key_map keys, table_map prev_tables, - ha_rows limit, bool force_quick_range); + ha_rows limit, bool force_quick_range, + bool ordered_output); }; -class FT_SELECT: public QUICK_RANGE_SELECT { +class FT_SELECT: public QUICK_RANGE_SELECT +{ public: - FT_SELECT(THD *thd, TABLE *table, uint key) : - QUICK_RANGE_SELECT (thd, table, key, 1) { VOID(init()); } + FT_SELECT(THD *thd, TABLE *table, uint key, bool *create_err) : + QUICK_RANGE_SELECT (thd, table, key, 1, NULL, create_err) + { (void) init(); } ~FT_SELECT() { file->ft_end(); } - int init() { return error=file->ft_init(); } + int init() { return file->ft_init(); } int reset() { return 0; } - int get_next() { return error= file->ha_ft_read(record); } + int get_next() { return file->ha_ft_read(record); } int get_type() { return QS_TYPE_FULLTEXT; } }; +FT_SELECT *get_ft_select(THD *thd, TABLE *table, uint key); QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table, struct st_table_ref *ref, ha_rows records); diff --git a/sql/set_var.cc b/sql/set_var.cc index 7eec4c3e41b..dde8a2c6002 100644 --- a/sql/set_var.cc +++ b/sql/set_var.cc @@ -439,8 +439,6 @@ static sys_var_long_ptr sys_max_write_lock_count(&vars, "max_write_lock_count", &max_write_lock_count); static sys_var_thd_ulong sys_min_examined_row_limit(&vars, "min_examined_row_limit", &SV::min_examined_row_limit); -static sys_var_thd_ulong sys_multi_range_count(&vars, "multi_range_count", - &SV::multi_range_count); static sys_var_long_ptr sys_myisam_data_pointer_size(&vars, "myisam_data_pointer_size", &myisam_data_pointer_size); static sys_var_thd_ulonglong sys_myisam_max_sort_file_size(&vars, "myisam_max_sort_file_size", &SV::myisam_max_sort_file_size, fix_myisam_max_sort_file_size, 1); @@ -493,6 +491,18 @@ static sys_var_thd_ulong sys_optimizer_search_depth(&vars, "optimizer_sea &SV::optimizer_search_depth); static sys_var_thd_optimizer_switch sys_optimizer_switch(&vars, "optimizer_switch", &SV::optimizer_switch); + +const char *optimizer_use_mrr_names[] = {"auto", "force", "disable", NullS}; +TYPELIB optimizer_use_mrr_typelib= { + array_elements(optimizer_use_mrr_names) - 1, "", + optimizer_use_mrr_names, NULL +}; + +static sys_var_thd_enum sys_optimizer_use_mrr(&vars, "optimizer_use_mrr", + &SV::optimizer_use_mrr, + &optimizer_use_mrr_typelib, + NULL); + static sys_var_const sys_pid_file(&vars, "pid_file", OPT_GLOBAL, SHOW_CHAR, (uchar*) pidfile_name); diff --git a/sql/sql_class.h b/sql/sql_class.h index 72fd927cc1e..4a71b6776e3 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -313,7 +313,6 @@ struct system_variables ulong max_tmp_tables; ulong max_insert_delayed_threads; ulong min_examined_row_limit; - ulong multi_range_count; ulong myisam_repair_threads; ulong myisam_sort_buff_size; ulong myisam_stats_method; @@ -327,6 +326,14 @@ struct system_variables ulong optimizer_search_depth; /* A bitmap for switching optimizations on/off */ ulong optimizer_switch; + /* + Controls use of Engine-MRR: + 0 - auto, based on cost + 1 - force MRR when the storage engine is capable of doing it + 2 - disable MRR. + */ + ulong optimizer_use_mrr; + ulong preload_buff_size; ulong profiling_history_size; ulong query_cache_type; @@ -430,6 +437,13 @@ typedef struct system_status_var ulong ha_read_prev_count; ulong ha_read_rnd_count; ulong ha_read_rnd_next_count; + /* + This number doesn't include calls to the default implementation and + calls made by range access. The intent is to count only calls made by + BatchedKeyAccess. + */ + ulong ha_multi_range_read_init_count; + ulong ha_rollback_count; ulong ha_update_count; ulong ha_write_count; diff --git a/sql/sql_select.cc b/sql/sql_select.cc index f903c193270..434afe8fd69 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -178,11 +178,14 @@ int join_read_always_key_or_null(JOIN_TAB *tab); int join_read_next_same_or_null(READ_RECORD *info); static COND *make_cond_for_table(COND *cond,table_map table, table_map used_table); +static COND *make_cond_for_table_from_pred(COND *root_cond, COND *cond, + table_map tables, + table_map used_table); static Item* part_of_refkey(TABLE *form,Field *field); uint find_shortest_key(TABLE *table, const key_map *usable_keys); static bool test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order, ha_rows select_limit, bool no_changes, - key_map *map); + const key_map *map); static bool list_contains_unique_index(TABLE *table, bool (*find_func) (Field *, void *), void *data); static bool find_field_in_item_list (Field *field, void *data); @@ -237,8 +240,8 @@ static void select_describe(JOIN *join, bool need_tmp_table,bool need_order, bool distinct, const char *message=NullS); static Item *remove_additional_cond(Item* conds); static void add_group_and_distinct_keys(JOIN *join, JOIN_TAB *join_tab); -static bool test_if_ref(Item_field *left_item,Item *right_item); - +static bool test_if_ref(COND *root_cond, + Item_field *left_item,Item *right_item); /** This handles SELECT with and without UNION. @@ -711,7 +714,8 @@ void JOIN::remove_subq_pushed_predicates(Item **where) ((Item_func *)this->conds)->functype() == Item_func::EQ_FUNC && ((Item_func *)conds)->arguments()[0]->type() == Item::REF_ITEM && ((Item_func *)conds)->arguments()[1]->type() == Item::FIELD_ITEM && - test_if_ref ((Item_field *)((Item_func *)conds)->arguments()[1], + test_if_ref (this->conds, + (Item_field *)((Item_func *)conds)->arguments()[1], ((Item_func *)conds)->arguments()[0])) { *where= 0; @@ -2119,7 +2123,7 @@ JOIN::exec() */ curr_table->select->cond->quick_fix_field(); } - curr_table->select_cond= curr_table->select->cond; + curr_table->set_select_cond(curr_table->select->cond, __LINE__); curr_table->select_cond->top_level_item(); DBUG_EXECUTE("where",print_where(curr_table->select->cond, "select and having", @@ -2471,7 +2475,7 @@ static ha_rows get_quick_record_count(THD *thd, SQL_SELECT *select, select->head=table; table->reginfo.impossible_range=0; if ((error= select->test_quick_select(thd, *(key_map *)keys,(table_map) 0, - limit, 0)) == 1) + limit, 0, FALSE)) == 1) DBUG_RETURN(select->quick->records); if (error == -1) { @@ -5678,6 +5682,7 @@ static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse, j->ref.key_buff2=j->ref.key_buff+ALIGN_SIZE(length); j->ref.key_err=1; j->ref.null_rejecting= 0; + j->ref.disable_cache= FALSE; keyuse=org_keyuse; store_key **ref_key= j->ref.key_copy; @@ -5871,6 +5876,7 @@ JOIN::make_simple_join(JOIN *parent, TABLE *tmp_table) join_tab->cache.buff=0; /* No caching */ join_tab->table=tmp_table; join_tab->select=0; + join_tab->set_select_cond(NULL, __LINE__); join_tab->select_cond=0; join_tab->quick=0; join_tab->type= JT_ALL; /* Map through all records */ @@ -5901,6 +5907,7 @@ inline void add_cond_and_fix(Item **e1, Item *e2) { *e1= res; res->quick_fix_field(); + res->update_used_tables(); } } else @@ -5998,7 +6005,9 @@ static void add_not_null_conds(JOIN *join) DBUG_EXECUTE("where",print_where(notnull, referred_tab->table->alias, QT_ORDINARY);); - add_cond_and_fix(&referred_tab->select_cond, notnull); + COND *new_cond= referred_tab->select_cond; + add_cond_and_fix(&new_cond, notnull); + referred_tab->set_select_cond(new_cond, __LINE__); } } } @@ -6171,9 +6180,9 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond) if (!tmp) DBUG_RETURN(1); tmp->quick_fix_field(); - cond_tab->select_cond= !cond_tab->select_cond ? tmp : - new Item_cond_and(cond_tab->select_cond, - tmp); + COND *new_cond= !cond_tab->select_cond ? tmp : + new Item_cond_and(cond_tab->select_cond, tmp); + cond_tab->set_select_cond(new_cond, __LINE__); if (!cond_tab->select_cond) DBUG_RETURN(1); cond_tab->select_cond->quick_fix_field(); @@ -6254,7 +6263,8 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond) } } - if (tmp || !cond || tab->type == JT_REF) + if (tmp || !cond || tab->type == JT_REF || tab->type == JT_REF_OR_NULL || + tab->type == JT_EQ_REF || first_inner_tab) { DBUG_EXECUTE("where",print_where(tmp,tab->table->alias, QT_ORDINARY);); SQL_SELECT *sel= tab->select= ((SQL_SELECT*) @@ -6276,11 +6286,12 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond) */ if (!(tmp= add_found_match_trig_cond(first_inner_tab, tmp, 0))) DBUG_RETURN(1); - tab->select_cond=sel->cond=tmp; + sel->cond=tmp; + tab->set_select_cond(tmp, __LINE__); /* Push condition to storage engine if this is enabled and the condition is not guarded */ tab->table->file->pushed_cond= NULL; - if (thd->variables.engine_condition_pushdown) + if (thd->variables.engine_condition_pushdown && !first_inner_tab) { COND *push_cond= make_cond_for_table(tmp, current_map, current_map); @@ -6293,7 +6304,11 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond) } } else - tab->select_cond= sel->cond= NULL; + { + sel->cond= NULL; + tab->set_select_cond(NULL, __LINE__); + } + sel->head=tab->table; DBUG_EXECUTE("where",print_where(tmp,tab->table->alias, QT_ORDINARY);); @@ -6360,7 +6375,8 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond) (join->select_options & OPTION_FOUND_ROWS ? HA_POS_ERROR : - join->unit->select_limit_cnt), 0) < 0) + join->unit->select_limit_cnt), 0, + FALSE) < 0) { /* Before reporting "Impossible WHERE" for the whole query @@ -6373,7 +6389,8 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond) (join->select_options & OPTION_FOUND_ROWS ? HA_POS_ERROR : - join->unit->select_limit_cnt),0) < 0) + join->unit->select_limit_cnt),0, + FALSE) < 0) DBUG_RETURN(1); // Impossible WHERE } else @@ -6502,6 +6519,8 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond) if (!cond_tab->select_cond) DBUG_RETURN(1); cond_tab->select_cond->quick_fix_field(); + if (cond_tab->select) + cond_tab->select->cond= cond_tab->select_cond; } } first_inner_tab= first_inner_tab->first_upper; @@ -6560,6 +6579,8 @@ make_join_readinfo(JOIN *join, ulonglong options) table->key_read=1; table->file->extra(HA_EXTRA_KEYREAD); } + else + push_index_cond(tab, tab->ref.key, TRUE); break; case JT_EQ_REF: table->status=STATUS_NO_RECORD; @@ -6578,6 +6599,8 @@ make_join_readinfo(JOIN *join, ulonglong options) table->key_read=1; table->file->extra(HA_EXTRA_KEYREAD); } + else + push_index_cond(tab, tab->ref.key, TRUE); break; case JT_REF_OR_NULL: case JT_REF: @@ -6589,12 +6612,6 @@ make_join_readinfo(JOIN *join, ulonglong options) } delete tab->quick; tab->quick=0; - if (table->covering_keys.is_set(tab->ref.key) && - !table->no_keyread) - { - table->key_read=1; - table->file->extra(HA_EXTRA_KEYREAD); - } if (tab->type == JT_REF) { tab->read_first_record= join_read_always_key; @@ -6605,6 +6622,14 @@ make_join_readinfo(JOIN *join, ulonglong options) tab->read_first_record= join_read_always_key_or_null; tab->read_record.read_record= join_read_next_same_or_null; } + if (table->covering_keys.is_set(tab->ref.key) && + !table->no_keyread) + { + table->key_read=1; + table->file->extra(HA_EXTRA_KEYREAD); + } + else + push_index_cond(tab, tab->ref.key, TRUE); break; case JT_FT: table->status=STATUS_NO_RECORD; @@ -6700,6 +6725,9 @@ make_join_readinfo(JOIN *join, ulonglong options) tab->type=JT_NEXT; // Read with index_first / index_next } } + if (tab->select && tab->select->quick && + tab->select->quick->index != MAX_KEY && ! tab->table->key_read) + push_index_cond(tab, tab->select->quick->index, FALSE); } break; default: @@ -12098,7 +12126,8 @@ test_if_quick_select(JOIN_TAB *tab) delete tab->select->quick; tab->select->quick=0; return tab->select->test_quick_select(tab->join->thd, tab->keys, - (table_map) 0, HA_POS_ERROR, 0); + (table_map) 0, HA_POS_ERROR, 0, + FALSE); } @@ -12754,12 +12783,24 @@ end_write_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)), 1 if right_item is used removable reference key on left_item */ -static bool test_if_ref(Item_field *left_item,Item *right_item) +static bool test_if_ref(Item *root_cond, Item_field *left_item,Item *right_item) { Field *field=left_item->field; - // No need to change const test. We also have to keep tests on LEFT JOIN - if (!field->table->const_table && !field->table->maybe_null) + JOIN_TAB *join_tab= field->table->reginfo.join_tab; + // No need to change const test + if (!field->table->const_table && join_tab && + (!join_tab->first_inner || + *join_tab->first_inner->on_expr_ref == root_cond)) { + // Cond guards + for (uint i = 0; i < join_tab->ref.key_parts; i++) + { + if (join_tab->ref.cond_guards[i]) + { + return FALSE; + } + } + // Item *ref_item=part_of_refkey(field->table,field); if (ref_item && ref_item->eq(right_item,1)) { @@ -12790,9 +12831,49 @@ static bool test_if_ref(Item_field *left_item,Item *right_item) } +/* + Extract a condition that can be checked after reading given table + + SYNOPSIS + make_cond_for_table() + cond Condition to analyze + tables Tables for which "current field values" are available + used_table Table that we're extracting the condition for (may + also include PSEUDO_TABLE_BITS + exclude_expensive_cond Do not push expensive conditions + + DESCRIPTION + Extract the condition that can be checked after reading the table + specified in 'used_table', given that current-field values for tables + specified in 'tables' bitmap are available. + + The function assumes that + - Constant parts of the condition has already been checked. + - Condition that could be checked for tables in 'tables' has already + been checked. + + The function takes into account that some parts of the condition are + guaranteed to be true by employed 'ref' access methods (the code that + does this is located at the end, search down for "EQ_FUNC"). + + + SEE ALSO + make_cond_for_info_schema uses similar algorithm + + RETURN + Extracted condition +*/ + static COND * make_cond_for_table(COND *cond, table_map tables, table_map used_table) { + return make_cond_for_table_from_pred(cond, cond, tables, used_table); +} + +static COND * +make_cond_for_table_from_pred(COND *root_cond, COND *cond, + table_map tables, table_map used_table) +{ if (used_table && !(cond->used_tables() & used_table)) return (COND*) 0; // Already checked if (cond->type() == Item::COND_ITEM) @@ -12807,7 +12888,7 @@ make_cond_for_table(COND *cond, table_map tables, table_map used_table) Item *item; while ((item=li++)) { - Item *fix=make_cond_for_table(item,tables,used_table); + Item *fix=make_cond_for_table_from_pred(root_cond, item, tables, used_table); if (fix) new_cond->argument_list()->push_back(fix); } @@ -12837,7 +12918,7 @@ make_cond_for_table(COND *cond, table_map tables, table_map used_table) Item *item; while ((item=li++)) { - Item *fix=make_cond_for_table(item,tables,0L); + Item *fix=make_cond_for_table_from_pred(root_cond, item, tables, 0L); if (!fix) return (COND*) 0; // Always true new_cond->argument_list()->push_back(fix); @@ -12864,18 +12945,19 @@ make_cond_for_table(COND *cond, table_map tables, table_map used_table) if (cond->marker == 2 || cond->eq_cmp_result() == Item::COND_OK) return cond; // Not boolean op - if (((Item_func*) cond)->functype() == Item_func::EQ_FUNC) + if (cond->type() == Item::FUNC_ITEM && + ((Item_func*) cond)->functype() == Item_func::EQ_FUNC) { - Item *left_item= ((Item_func*) cond)->arguments()[0]; - Item *right_item= ((Item_func*) cond)->arguments()[1]; + Item *left_item= ((Item_func*) cond)->arguments()[0]->real_item(); + Item *right_item= ((Item_func*) cond)->arguments()[1]->real_item(); if (left_item->type() == Item::FIELD_ITEM && - test_if_ref((Item_field*) left_item,right_item)) + test_if_ref(root_cond, (Item_field*) left_item,right_item)) { cond->marker=3; // Checked when read return (COND*) 0; } if (right_item->type() == Item::FIELD_ITEM && - test_if_ref((Item_field*) right_item,left_item)) + test_if_ref(root_cond, (Item_field*) right_item,left_item)) { cond->marker=3; // Checked when read return (COND*) 0; @@ -13259,7 +13341,7 @@ find_field_in_item_list (Field *field, void *data) static bool test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, - bool no_changes, key_map *map) + bool no_changes, const key_map *map) { int ref_key; uint ref_key_parts; @@ -13269,6 +13351,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, SQL_SELECT *select=tab->select; key_map usable_keys; QUICK_SELECT_I *save_quick= 0; + COND *orig_select_cond= 0; DBUG_ENTER("test_if_skip_sort_order"); LINT_INIT(ref_key_parts); @@ -13288,7 +13371,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, } usable_keys.intersect(((Item_field*) item)->field->part_of_sortkey); if (usable_keys.is_clear_all()) - DBUG_RETURN(0); // No usable keys + goto use_filesort; // No usable keys } ref_key= -1; @@ -13298,7 +13381,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, ref_key= tab->ref.key; ref_key_parts= tab->ref.key_parts; if (tab->type == JT_REF_OR_NULL || tab->type == JT_FT) - DBUG_RETURN(0); + goto use_filesort; } else if (select && select->quick) // Range found by opt_range { @@ -13313,7 +13396,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE || quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION || quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT) - DBUG_RETURN(0); + goto use_filesort; ref_key= select->quick->index; ref_key_parts= select->quick->used_key_parts; } @@ -13335,10 +13418,15 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, */ if (table->covering_keys.is_set(ref_key)) usable_keys.intersect(table->covering_keys); + if (tab->pre_idx_push_select_cond) + orig_select_cond= tab->set_cond(tab->pre_idx_push_select_cond); + if ((new_ref_key= test_if_subkey(order, table, ref_key, ref_key_parts, &usable_keys)) < MAX_KEY) { /* Found key that can be used to retrieve data in sorted order */ + //psergey-mrr:if (tab->pre_idx_push_select_cond) + // tab->select_cond= tab->select->cond= tab->pre_idx_push_select_cond; if (tab->ref.key >= 0) { /* @@ -13352,9 +13440,10 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, KEYUSE *keyuse= tab->keyuse; while (keyuse->key != new_ref_key && keyuse->table == tab->table) keyuse++; + if (create_ref_for_key(tab->join, tab, keyuse, tab->join->const_table_map)) - DBUG_RETURN(0); + goto use_filesort; } else { @@ -13374,9 +13463,10 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, (tab->join->select_options & OPTION_FOUND_ROWS) ? HA_POS_ERROR : - tab->join->unit->select_limit_cnt,0) <= + tab->join->unit->select_limit_cnt,0, + TRUE) <= 0) - DBUG_RETURN(0); + goto use_filesort; } ref_key= new_ref_key; } @@ -13424,7 +13514,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, index order and not using join cache */ if (tab->type == JT_ALL && tab->join->tables > tab->join->const_tables + 1) - DBUG_RETURN(0); + goto use_filesort; keys= *table->file->keys_to_use_for_scanning(); keys.merge(table->covering_keys); @@ -13580,7 +13670,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, join->select_options & OPTION_FOUND_ROWS ? HA_POS_ERROR : join->unit->select_limit_cnt, - 0) > 0; + TRUE, FALSE) > 0; } if (!no_changes) { @@ -13609,6 +13699,18 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, table->key_read=1; table->file->extra(HA_EXTRA_KEYREAD); } + if (tab->pre_idx_push_select_cond) + { + COND *tmp_cond= tab->pre_idx_push_select_cond; + if (orig_select_cond) + { + tmp_cond= and_conds(tmp_cond, orig_select_cond); + tmp_cond->quick_fix_field(); + } + tab->set_cond(tmp_cond); + /* orig_select_cond was merged, no need to restore original one. */ + orig_select_cond= 0; + } table->file->ha_index_or_rnd_end(); if (join->select_options & SELECT_DESCRIBE) { @@ -13642,7 +13744,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit, order_direction= best_key_direction; } else - DBUG_RETURN(0); + goto use_filesort; } check_reverse_order: @@ -13657,6 +13759,7 @@ check_reverse_order: if (!select->quick->reverse_sorted()) { QUICK_SELECT_DESC *tmp; + bool error= FALSE; int quick_type= select->quick->get_type(); if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE || quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT || @@ -13665,18 +13768,18 @@ check_reverse_order: { tab->limit= 0; select->quick= save_quick; - DBUG_RETURN(0); // Use filesort + goto use_filesort; // Use filesort } /* ORDER BY range_key DESC */ tmp= new QUICK_SELECT_DESC((QUICK_RANGE_SELECT*)(select->quick), - used_key_parts); - if (!tmp || tmp->error) + used_key_parts, &error); + if (!tmp || error) { delete tmp; select->quick= save_quick; tab->limit= 0; - DBUG_RETURN(0); // Reverse sort not supported + goto use_filesort; // Reverse sort not supported } select->quick=tmp; } @@ -13695,8 +13798,14 @@ check_reverse_order: } } else if (select && select->quick) - select->quick->sorted= 1; + select->quick->need_sorted_output(); + if (orig_select_cond) + tab->set_cond(orig_select_cond); DBUG_RETURN(1); +use_filesort: + if (orig_select_cond) + tab->set_cond(orig_select_cond); + DBUG_RETURN(0); } @@ -13797,7 +13906,7 @@ create_sort_index(THD *thd, JOIN *join, ORDER *order, field, quick will contain an empty record set. */ if (!(select->quick= (tab->type == JT_FT ? - new FT_SELECT(thd, table, tab->ref.key) : + get_ft_select(thd, table, tab->ref.key) : get_quick_select_for_ref(thd, table, &tab->ref, tab->found_records)))) goto err; @@ -13836,7 +13945,7 @@ create_sort_index(THD *thd, JOIN *join, ORDER *order, table->quick_keys.clear_all(); // as far as we cleanup select->quick table->sort.io_cache= tablesort_result_cache; } - tab->select_cond=0; + tab->set_select_cond(NULL, __LINE__); tab->last_inner= 0; tab->first_unmatched= 0; tab->type=JT_ALL; // Read with normal read_record @@ -13877,7 +13986,7 @@ static bool fix_having(JOIN *join, Item **having) sort_table_cond)) || table->select->cond->fix_fields(join->thd, &table->select->cond)) return 1; - table->select_cond=table->select->cond; + table->set_select_cond(table->select->cond, __LINE__); table->select_cond->top_level_item(); DBUG_EXECUTE("where",print_where(table->select_cond, "select and having", @@ -14468,19 +14577,50 @@ read_cached_record(JOIN_TAB *tab) } +/* + eq_ref: Create the lookup key and check if it is the same as saved key + + SYNOPSIS + cmp_buffer_with_ref() + tab Join tab of the accessed table + table The table to read. This is usually tab->table, except for + semi-join when we might need to make a lookup in a temptable + instead. + tab_ref The structure with methods to collect index lookup tuple. + This is usually table->ref, except for the case of when we're + doing lookup into semi-join materialization table. + + DESCRIPTION + Used by eq_ref access method: create the index lookup key and check if + we've used this key at previous lookup (If yes, we don't need to repeat + the lookup - the record has been already fetched) + + RETURN + TRUE No cached record for the key, or failed to create the key (due to + out-of-domain error) + FALSE The created key is the same as the previous one (and the record + is already in table->record) +*/ + static bool cmp_buffer_with_ref(JOIN_TAB *tab) { - bool diff; - if (!(diff=tab->ref.key_err)) + TABLE_REF *tab_ref= &tab->ref; + bool no_prev_key; + if (!tab_ref->disable_cache) { - memcpy(tab->ref.key_buff2, tab->ref.key_buff, tab->ref.key_length); + if (!(no_prev_key= tab_ref->key_err)) + { + /* Previous access found a row. Copy its key */ + memcpy(tab_ref->key_buff2, tab_ref->key_buff, tab_ref->key_length); + } } - if ((tab->ref.key_err= cp_buffer_from_ref(tab->join->thd, tab->table, - &tab->ref)) || - diff) + else + no_prev_key= TRUE; + if ((tab_ref->key_err= cp_buffer_from_ref(tab->join->thd, tab->table, tab_ref)) || + no_prev_key) return 1; - return memcmp(tab->ref.key_buff2, tab->ref.key_buff, tab->ref.key_length) + return memcmp(tab_ref->key_buff2, tab_ref->key_buff, tab_ref->key_length) != 0; } @@ -15764,11 +15904,12 @@ static bool add_ref_to_table_cond(THD *thd, JOIN_TAB *join_tab) if (join_tab->select) { error=(int) cond->add(join_tab->select->cond); - join_tab->select_cond=join_tab->select->cond=cond; + join_tab->select->cond= cond; + join_tab->set_select_cond(cond, __LINE__); } else if ((join_tab->select= make_select(join_tab->table, 0, 0, cond, 0, &error))) - join_tab->select_cond=cond; + join_tab->set_select_cond(cond, __LINE__); DBUG_RETURN(error ? TRUE : FALSE); } @@ -16584,6 +16725,20 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order, } else { + uint keyno= MAX_KEY; + if (tab->ref.key_parts) + keyno= tab->ref.key; + else if (tab->select && tab->select->quick) + keyno = tab->select->quick->index; + + if (keyno != MAX_KEY && keyno == table->file->pushed_idx_cond_keyno && + table->file->pushed_idx_cond) + extra.append(STRING_WITH_LEN("; Using index condition")); + /** + psergey-mrr: + else if (tab->cache_idx_cond) + extra.append(STRING_WITH_LEN("; Using index condition(BKA)")); + **/ if (quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION || quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT || quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE) @@ -16647,6 +16802,14 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order, } if (table->reginfo.not_exists_optimize) extra.append(STRING_WITH_LEN("; Not exists")); + + if (quick_type == QUICK_SELECT_I::QS_TYPE_RANGE && + !(((QUICK_RANGE_SELECT*)(tab->select->quick))->mrr_flags & + HA_MRR_USE_DEFAULT_IMPL)) + { + extra.append(STRING_WITH_LEN("; Using MRR")); + } + if (need_tmp_table) { need_tmp_table=0; diff --git a/sql/sql_select.h b/sql/sql_select.h index 271c88ebf66..0a6702a8135 100644 --- a/sql/sql_select.h +++ b/sql/sql_select.h @@ -85,6 +85,12 @@ typedef struct st_table_ref table_map depend_map; ///< Table depends on these tables. /* null byte position in the key_buf. Used for REF_OR_NULL optimization */ uchar *null_ref_key; + + /* + TRUE <=> disable the "cache" as doing lookup with the same key value may + produce different results (because of Index Condition Pushdown) + */ + bool disable_cache; } TABLE_REF; @@ -145,6 +151,14 @@ typedef struct st_join_table { SQL_SELECT *select; COND *select_cond; QUICK_SELECT_I *quick; + /* + The value of select_cond before we've attempted to do Index Condition + Pushdown. We may need to restore everything back if we first choose one + index but then reconsider (see test_if_skip_sort_order() for such + scenarios). + NULL means no index condition pushdown was performed. + */ + Item *pre_idx_push_select_cond; Item **on_expr_ref; /**< pointer to the associated on expression */ COND_EQUAL *cond_equal; /**< multiple equalities for the on expression */ st_join_table *first_inner; /**< first inner table for including outerjoin */ @@ -218,6 +232,20 @@ typedef struct st_join_table { (select->quick->get_type() == QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX)); } + void set_select_cond(COND *to, uint line) + { + DBUG_PRINT("info", ("select_cond changes %p -> %p at line %u tab %p", + select_cond, to, line, this)); + select_cond= to; + } + COND *set_cond(COND *new_cond) + { + COND *tmp_select_cond= select_cond; + set_select_cond(new_cond, __LINE__); + if (select) + select->cond= new_cond; + return tmp_select_cond; + } } JOIN_TAB; enum_nested_loop_state sub_select_cache(JOIN *join, JOIN_TAB *join_tab, bool @@ -752,3 +780,7 @@ inline bool optimizer_flag(THD *thd, uint flag) void eliminate_tables(JOIN *join); +/// psergey-mrr: +void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok); + + diff --git a/sql/structs.h b/sql/structs.h index 3a090fd203f..9dffe1e1b8c 100644 --- a/sql/structs.h +++ b/sql/structs.h @@ -53,7 +53,8 @@ typedef struct st_key_part_info { /* Info about a key part */ Field *field; uint offset; /* offset in record (from 0) */ uint null_offset; /* Offset to null_bit in record */ - uint16 length; /* Length of keypart value in bytes */ + /* Length of key part in bytes, excluding NULL flag and length bytes */ + uint16 length; /* Number of bytes required to store the keypart value. This may be different from the "length" field as it also counts diff --git a/sql/table.cc b/sql/table.cc index 1c10958fdef..c2d58bac982 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -1556,21 +1556,11 @@ static int open_binary_frm(THD *thd, TABLE_SHARE *share, uchar *head, share->table_name.str, share->table_name.str); share->crashed= 1; // Marker for CHECK TABLE - goto to_be_deleted; + continue; } #endif key_part->key_part_flag|= HA_PART_KEY_SEG; } - - to_be_deleted: - - /* - If the field can be NULL, don't optimize away the test - key_part_column = expression from the WHERE clause - as we need to test for NULL = NULL. - */ - if (field->real_maybe_null()) - key_part->key_part_flag|= HA_NULL_PART; } keyinfo->usable_key_parts= usable_parts; // Filesort |