author | Igor Babaev <igor@askmonty.org> | 2019-02-03 14:56:12 -0800 |
---|---|---|
committer | Igor Babaev <igor@askmonty.org> | 2019-02-03 14:56:12 -0800 |
commit | 658128af43b4d7c6db445164f8ed25ed4d1e3109 (patch) | |
tree | 7a71580cca55759b8bb2730e117436478948d77f /sql | |
parent | 5f46670bd09babbee75a24ac82eb4ade0706da66 (diff) | |
download | mariadb-git-658128af43b4d7c6db445164f8ed25ed4d1e3109.tar.gz |
MDEV-16188 Use in-memory PK filters built from range index scans
This patch contains a full implementation of the optimization
that allows in-memory rowid / primary key filters built for range
conditions over indexes to be used. In many cases the use of such
filters reduces the number of disk seeks spent on fetching table rows.
In this implementation the choice of which filter to apply, if any,
is made purely on cost-based considerations.
This implementation re-architects the partial implementation of
the feature pushed by Galina Shalygina in commit
8d5a11122c32f4d9eb87536886c6e893377bdd07.
In addition, the patch contains a better implementation of the generic
handler function handler::multi_range_read_info_const() that
takes gaps between ranges into account when calculating the cost of
range index scans. It also contains some corrections to the
implementation of the handler function records_in_range() for MyISAM.
The patch supports the feature for InnoDB and MyISAM.
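An illustration of the kind of query this optimization targets (an editor's sketch, not part of the original commit message; table and column names are hypothetical): when the range condition on a secondary index is selective, the optimizer can build an in-memory filter of rowids from a range scan over that index and use it to discard most of the rows located through the access index before their full records are fetched.

    CREATE TABLE t1 (
      pk INT PRIMARY KEY,
      a INT,
      b INT,
      INDEX ix_a(a),
      INDEX ix_b(b)
    ) ENGINE=InnoDB;

    -- Rows are looked up through ix_a; a rowid filter built from a
    -- range scan over ix_b can reject non-matching rowids before the
    -- corresponding table rows are fetched.
    SELECT * FROM t1
    WHERE a BETWEEN 10 AND 20 AND b BETWEEN 100 AND 110;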
Diffstat (limited to 'sql')
-rw-r--r-- | sql/ha_partition.cc | 55 |
-rw-r--r-- | sql/ha_partition.h | 4 |
-rw-r--r-- | sql/handler.cc | 71 |
-rw-r--r-- | sql/handler.h | 79 |
-rw-r--r-- | sql/multi_range_read.cc | 141 |
-rw-r--r-- | sql/opt_range.cc | 23 |
-rw-r--r-- | sql/opt_subselect.h | 1 |
-rw-r--r-- | sql/rowid_filter.cc | 610 |
-rw-r--r-- | sql/rowid_filter.h | 522 |
-rw-r--r-- | sql/sql_array.h | 13 |
-rw-r--r-- | sql/sql_class.cc | 2 |
-rw-r--r-- | sql/sql_const.h | 6 |
-rw-r--r-- | sql/sql_explain.cc | 74 |
-rw-r--r-- | sql/sql_explain.h | 50 |
-rw-r--r-- | sql/sql_select.cc | 280 |
-rw-r--r-- | sql/sql_select.h | 27 |
-rw-r--r-- | sql/structs.h | 14 |
-rw-r--r-- | sql/table.cc | 45 |
-rw-r--r-- | sql/table.h | 42 |
19 files changed, 1603 insertions, 456 deletions
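To orient the reader before the patch (an editor's paraphrase of the cost model added in sql/rowid_filter.cc below, not text from the commit): for a candidate range rowid filter over index k with selectivity s and an estimated N elements, Range_rowid_filter_cost_info computes roughly

    lookup_cost = 0.01 * log(N)                                       (sorted array container)
    a = (1 + 1/TIME_FOR_COMPARE) * (1 - s) - lookup_cost              (average gain per checked row)
    b = quick_index_only_costs[k] + ARRAY_WRITE_COST*N
        + ARRAY_SORT_C*N*log(N)                                       (cost of building the filter)
    cross_x = b / a

A filter can pay off only for join orders where the table is expected to be accessed more than cross_x times; among the applicable filters the one promising the largest total gain is chosen (see best_range_rowid_filter_for_partial_join() in the diff).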
diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc index 37a3decdbca..651ace759a3 100644 --- a/sql/ha_partition.cc +++ b/sql/ha_partition.cc @@ -6265,19 +6265,22 @@ ha_rows ha_partition::multi_range_read_info_const(uint keyno, uint ret_mrr_mode= 0; range_seq_t seq_it; part_id_range save_part_spec; + Cost_estimate part_cost; DBUG_ENTER("ha_partition::multi_range_read_info_const"); DBUG_PRINT("enter", ("partition this: %p", this)); m_mrr_new_full_buffer_size= 0; save_part_spec= m_part_spec; + cost->reset(); + seq_it= seq->init(seq_init_param, n_ranges, *mrr_mode); if (unlikely((error= multi_range_key_create_key(seq, seq_it)))) { if (likely(error == HA_ERR_END_OF_FILE)) // No keys in range { rows= 0; - goto calc_cost; + goto end; } /* This error means that we can't do multi_range_read for the moment @@ -6306,18 +6309,20 @@ ha_rows ha_partition::multi_range_read_info_const(uint keyno, ha_rows tmp_rows; uint tmp_mrr_mode; m_mrr_buffer_size[i]= 0; + part_cost.reset(); tmp_mrr_mode= *mrr_mode; tmp_rows= (*file)-> multi_range_read_info_const(keyno, &m_part_seq_if, &m_partition_part_key_multi_range_hld[i], m_part_mrr_range_length[i], &m_mrr_buffer_size[i], - &tmp_mrr_mode, cost); + &tmp_mrr_mode, &part_cost); if (tmp_rows == HA_POS_ERROR) { m_part_spec= save_part_spec; DBUG_RETURN(HA_POS_ERROR); } + cost->add(&part_cost); rows+= tmp_rows; ret_mrr_mode|= tmp_mrr_mode; m_mrr_new_full_buffer_size+= m_mrr_buffer_size[i]; @@ -6325,15 +6330,8 @@ ha_rows ha_partition::multi_range_read_info_const(uint keyno, } while (*(++file)); *mrr_mode= ret_mrr_mode; -calc_cost: +end: m_part_spec= save_part_spec; - cost->reset(); - cost->avg_io_cost= 1; - if ((*mrr_mode & HA_MRR_INDEX_ONLY) && rows > 2) - cost->io_count= keyread_time(keyno, n_ranges, (uint) rows); - else - cost->io_count= read_time(keyno, n_ranges, rows); - cost->cpu_cost= (double) rows / TIME_FOR_COMPARE + 0.01; DBUG_RETURN(rows); } @@ -9341,6 +9339,43 @@ double ha_partition::scan_time() /** + @brief + Caculate time to scan the given index (index only scan) + + @param inx Index number to scan + + @return time for scanning index inx +*/ + +double ha_partition::key_scan_time(uint inx) +{ + double scan_time= 0; + uint i; + DBUG_ENTER("ha_partition::key_scan_time"); + for (i= bitmap_get_first_set(&m_part_info->read_partitions); + i < m_tot_parts; + i= bitmap_get_next_set(&m_part_info->read_partitions, i)) + scan_time+= m_file[i]->key_scan_time(inx); + DBUG_RETURN(scan_time); +} + + +double ha_partition::keyread_time(uint inx, uint ranges, ha_rows rows) +{ + double read_time= 0; + uint i; + DBUG_ENTER("ha_partition::keyread_time"); + if (!ranges) + DBUG_RETURN(handler::keyread_time(inx, ranges, rows)); + for (i= bitmap_get_first_set(&m_part_info->read_partitions); + i < m_tot_parts; + i= bitmap_get_next_set(&m_part_info->read_partitions, i)) + read_time+= m_file[i]->keyread_time(inx, ranges, rows); + DBUG_RETURN(read_time); +} + + +/** Find number of records in a range. @param inx Index number @param min_key Start of range diff --git a/sql/ha_partition.h b/sql/ha_partition.h index 202f27840dc..f19e9ff96c8 100644 --- a/sql/ha_partition.h +++ b/sql/ha_partition.h @@ -912,6 +912,10 @@ public: */ virtual double scan_time(); + virtual double key_scan_time(uint inx); + + virtual double keyread_time(uint inx, uint ranges, ha_rows rows); + /* The next method will never be called if you do not implement indexes. 
*/ diff --git a/sql/handler.cc b/sql/handler.cc index e5081accb30..30ce3f654df 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -43,6 +43,7 @@ #include "debug_sync.h" // DEBUG_SYNC #include "sql_audit.h" #include "ha_sequence.h" +#include "rowid_filter.h" #ifdef WITH_PARTITION_STORAGE_ENGINE #include "ha_partition.h" @@ -2602,36 +2603,26 @@ LEX_CSTRING *handler::engine_name() } -/** - The method returns the cost of the random I/O accesses when - index is used. +/* + It is assumed that the value of the parameter 'ranges' can be only 0 or 1. + If ranges == 1 then the function returns the cost of index only scan + by index 'keyno' of one range containing 'rows' key entries. + If ranges == 0 then the function returns only the cost of copying + those key entries into the engine buffers. */ -double handler::get_io_cost(uint index, ha_rows rows, uint *length) +double handler::keyread_time(uint index, uint ranges, ha_rows rows) { - uint len= table->key_info[index].key_length + ref_length; + DBUG_ASSERT(ranges == 0 || ranges == 1); + size_t len= table->key_info[index].key_length + ref_length; if (index == table->s->primary_key && table->file->primary_key_is_clustered()) len= table->s->stored_rec_length; - double keys_per_block= (stats.block_size/2.0/len+1); - *length= len; - return (rows + keys_per_block-1)/ keys_per_block; -} - - -double handler::keyread_time(uint index, uint ranges, ha_rows rows) -{ - /* - It is assumed that we will read trough the whole key range and that all - key blocks are half full (normally things are much better). It is also - assumed that each time we read the next key from the index, the handler - performs a random seek, thus the cost is proportional to the number of - blocks read. This model does not take into account clustered indexes - - engines that support that (e.g. InnoDB) may want to overwrite this method. - The model counts in the time to read index entries from cache. - */ - uint len; - return get_io_cost(index, rows, &len) + - len*rows/(stats.block_size+1)/TIME_FOR_COMPARE ; + uint keys_per_block= (stats.block_size/2.0/len+1); + ulonglong blocks= !rows ? 
0 : (rows-1) / keys_per_block + 1; + double cost= (double)rows*len/(stats.block_size+1)*IDX_BLOCK_COPY_COST; + if (ranges) + cost+= blocks; + return cost; } void **handler::ha_data(THD *thd) const @@ -5766,6 +5757,35 @@ extern "C" enum icp_result handler_index_cond_check(void* h_arg) return res; } + +/** + Rowid filter callback - to be called by an engine to check rowid / primary + keys of the rows whose data is to be fetched against the used rowid filter +*/ + +extern "C" int handler_rowid_filter_check(void *h_arg) +{ + handler *h= (handler*) h_arg; + TABLE *tab= h->get_table(); + h->position(tab->record[0]); + return h->pushed_rowid_filter->check((char *) h->ref); +} + + +/** + Callback function for an engine to check whether the used rowid filter + has been already built +*/ + +extern "C" int handler_rowid_filter_is_active(void *h_arg) +{ + if (!h_arg) + return false; + handler *h= (handler*) h_arg; + return h->rowid_filter_is_active; +} + + int handler::index_read_idx_map(uchar * buf, uint index, const uchar * key, key_part_map keypart_map, enum ha_rkey_function find_flag) @@ -6230,6 +6250,7 @@ int handler::ha_reset() /* Reset information about pushed engine conditions */ cancel_pushed_idx_cond(); /* Reset information about pushed index conditions */ + cancel_pushed_rowid_filter(); clear_top_table_fields(); DBUG_RETURN(reset()); } diff --git a/sql/handler.h b/sql/handler.h index e5a6deb6659..ee730e8a725 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -47,6 +47,7 @@ class Alter_info; class Virtual_column_info; class sequence_definition; +class Rowid_filter; // the following is for checking tables @@ -324,6 +325,8 @@ enum enum_alter_inplace_result { */ #define HA_CLUSTERED_INDEX 512 +#define HA_DO_RANGE_FILTER_PUSHDOWN 1024 + /* bits in alter_table_flags: */ @@ -2561,11 +2564,14 @@ typedef bool (*SKIP_INDEX_TUPLE_FUNC) (range_seq_t seq, range_id_t range_info); class Cost_estimate { public: - double io_count; /* number of I/O */ - double avg_io_cost; /* cost of an average I/O oper. */ - double cpu_cost; /* cost of operations in CPU */ - double import_cost; /* cost of remote operations */ - double mem_cost; /* cost of used memory */ + double io_count; /* number of I/O to fetch records */ + double avg_io_cost; /* cost of an average I/O oper. to fetch records */ + double idx_io_count; /* number of I/O to read keys */ + double idx_avg_io_cost; /* cost of an average I/O oper. 
to fetch records */ + double cpu_cost; /* total cost of operations in CPU */ + double idx_cpu_cost; /* cost of operations in CPU for index */ + double import_cost; /* cost of remote operations */ + double mem_cost; /* cost of used memory */ enum { IO_COEFF=1 }; enum { CPU_COEFF=1 }; @@ -2579,10 +2585,18 @@ public: double total_cost() { - return IO_COEFF*io_count*avg_io_cost + CPU_COEFF * cpu_cost + + return IO_COEFF*io_count*avg_io_cost + + IO_COEFF*idx_io_count*idx_avg_io_cost + + CPU_COEFF*cpu_cost + MEM_COEFF*mem_cost + IMPORT_COEFF*import_cost; } + double index_only_cost() + { + return IO_COEFF*idx_io_count*idx_avg_io_cost + + CPU_COEFF*idx_cpu_cost; + } + /** Whether or not all costs in the object are zero @@ -2590,30 +2604,48 @@ public: */ bool is_zero() const { - return io_count == 0.0 && cpu_cost == 0.0 && + return io_count == 0.0 && idx_io_count && cpu_cost == 0.0 && import_cost == 0.0 && mem_cost == 0.0; } void reset() { avg_io_cost= 1.0; - io_count= cpu_cost= mem_cost= import_cost= 0.0; + idx_avg_io_cost= 1.0; + io_count= idx_io_count= cpu_cost= idx_cpu_cost= mem_cost= import_cost= 0.0; } void multiply(double m) { io_count *= m; cpu_cost *= m; + idx_io_count *= m; + idx_cpu_cost *= m; import_cost *= m; /* Don't multiply mem_cost */ } void add(const Cost_estimate* cost) { - double io_count_sum= io_count + cost->io_count; - add_io(cost->io_count, cost->avg_io_cost); - io_count= io_count_sum; + if (cost->io_count) + { + double io_count_sum= io_count + cost->io_count; + avg_io_cost= (io_count * avg_io_cost + + cost->io_count * cost->avg_io_cost) + /io_count_sum; + io_count= io_count_sum; + } + if (cost->idx_io_count) + { + double idx_io_count_sum= idx_io_count + cost->idx_io_count; + idx_avg_io_cost= (idx_io_count * idx_avg_io_cost + + cost->idx_io_count * cost->idx_avg_io_cost) + /idx_io_count_sum; + idx_io_count= idx_io_count_sum; + } cpu_cost += cost->cpu_cost; + idx_cpu_cost += cost->idx_cpu_cost; + import_cost += cost->import_cost; } void add_io(double add_io_cnt, double add_avg_cost) @@ -2790,6 +2822,9 @@ public: extern "C" enum icp_result handler_index_cond_check(void* h_arg); +extern "C" int handler_rowid_filter_check(void* h_arg); +extern "C" int handler_rowid_filter_is_active(void* h_arg); + uint calculate_key_len(TABLE *, uint, const uchar *, key_part_map); /* bitmap with first N+1 bits set @@ -2956,6 +2991,11 @@ public: Item *pushed_idx_cond; uint pushed_idx_cond_keyno; /* The index which the above condition is for */ + /* Rowid filter pushed into the engine */ + Rowid_filter *pushed_rowid_filter; + /* true when the pushed rowid filter has been already filled */ + bool rowid_filter_is_active; + Discrete_interval auto_inc_interval_for_cur_row; /** Number of reserved auto-increment intervals. Serves as a heuristic @@ -3020,6 +3060,8 @@ public: tracker(NULL), pushed_idx_cond(NULL), pushed_idx_cond_keyno(MAX_KEY), + pushed_rowid_filter(NULL), + rowid_filter_is_active(0), auto_inc_intervals_count(0), m_psi(NULL), set_top_table_fields(FALSE), top_table(0), top_table_field(0), top_table_fields(0), @@ -3226,6 +3268,11 @@ public: virtual double scan_time() { return ulonglong2double(stats.data_file_length) / IO_SIZE + 2; } + virtual double key_scan_time(uint index) + { + return keyread_time(index, 1, records()); + } + /** The cost of reading a set of ranges from the table using an index to access it. 
@@ -3249,8 +3296,6 @@ public: */ virtual double keyread_time(uint index, uint ranges, ha_rows rows); - double get_io_cost(uint index, ha_rows rows, uint *length); - virtual const key_map *keys_to_use_for_scanning() { return &key_map_empty; } /* @@ -4046,6 +4091,14 @@ public: in_range_check_pushed_down= false; } + virtual void cancel_pushed_rowid_filter() + { + pushed_rowid_filter= NULL; + rowid_filter_is_active= false; + } + + virtual bool rowid_filter_push(Rowid_filter *rowid_filter) { return true; } + /* Needed for partition / spider */ virtual TABLE_LIST *get_next_global_for_child() { return NULL; } diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc index d6952e71899..cab278f5859 100644 --- a/sql/multi_range_read.cc +++ b/sql/multi_range_read.cc @@ -20,6 +20,12 @@ #include "key.h" #include "sql_statistics.h" +static ulonglong key_block_no(handler *h, uint keyno, uint keyentry_pos) +{ + return (ulonglong) (h->keyread_time(keyno, 1, keyentry_pos + 1) - + h->keyread_time(keyno, 0, keyentry_pos + 1) + 0.5) + 1; +} + /**************************************************************************** * Default MRR implementation (MRR to non-MRR converter) ***************************************************************************/ @@ -47,6 +53,24 @@ for a user to be able to interrupt the calculation by killing the connection/query. + @note + Starting from 10.4 the implementation of this method tries to take into + account gaps between range intervals. Before this we had such paradoxical + cases when, for example, the cost of the index scan by range [1..3] was + almost twice as less than the cost of of the index scan by two intervals + [1..1] and [3..3]. + + @note + The current implementation of the method is not efficient for it + requires extra dives for gaps. Although these dives are not expensive + as they touch the index nodes that with very high probability are in + cache this is still not good. We could avoid it if records in range + also returned positions of the ends of range intervals. It's not + hard to implement it now for MyISAM as this engine provides a function + returning an approximation of the relative position of a key tuple + among other index key tuples. Unfortunately InnoDB now does not provide + anything like this function. + @retval HA_POS_ERROR Error or the engine is unable to perform the requested scan. Values of OUT parameters are undefined. 
@@ -61,12 +85,21 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, uint *bufsz, uint *flags, Cost_estimate *cost) { KEY_MULTI_RANGE range; + key_range prev_start_key; range_seq_t seq_it; - ha_rows rows, total_rows= 0; + ha_rows min_pos= 0; + ha_rows total_rows= 0; uint n_ranges=0; + uint n_eq_ranges= 0; + ulonglong total_touched_blocks= 0; + key_range *prev_min_endp= 0; + ulonglong prev_max_block_no=0; + ha_rows max_rows= stats.records; THD *thd= table->in_use; - uint limit= thd->variables.eq_range_index_dive_limit; + StringBuffer<64> key_value; + uint limit= thd->variables.eq_range_index_dive_limit; + bool use_statistics_for_eq_range= eq_ranges_exceeds_limit(seq, seq_init_param, limit); @@ -77,10 +110,15 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, seq_it= seq->init(seq_init_param, n_ranges, *flags); while (!seq->next(seq_it, &range)) { + ha_rows rows; + ulonglong new_touched_blocks= 0; + if (unlikely(thd->killed != 0)) return HA_POS_ERROR; n_ranges++; + if (range.range_flag & EQ_RANGE) + n_eq_ranges++; key_range *min_endp, *max_endp; if (range.range_flag & GEOM_FLAG) { @@ -95,38 +133,93 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, max_endp= range.end_key.length? &range.end_key : NULL; } int keyparts_used= my_count_bits(range.start_key.keypart_map); - if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE)) - rows= 1; /* there can be at most one row */ - else if (use_statistics_for_eq_range && - !(range.range_flag & NULL_RANGE) && - (range.range_flag & EQ_RANGE) && - table->key_info[keyno].actual_rec_per_key(keyparts_used - 1) > 0.5) - rows= - (ha_rows) table->key_info[keyno].actual_rec_per_key(keyparts_used - 1); + if (use_statistics_for_eq_range && + !(range.range_flag & NULL_RANGE) && + (range.range_flag & EQ_RANGE) && + table->key_info[keyno].actual_rec_per_key(keyparts_used - 1) > 0.5) + { + if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE)) + rows= 1; /* there can be at most one row */ + else + rows= + (ha_rows) table->key_info[keyno].actual_rec_per_key(keyparts_used-1); + } else { - if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp, + ulonglong min_block_no; + ulonglong max_block_no; + if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE)) + rows= 1; /* there can be at most one row */ + else if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp, max_endp))) { /* Can't scan one range => can't do MRR scan at all */ total_rows= HA_POS_ERROR; break; } + if (!max_endp && !(prev_min_endp && prev_min_endp->length)) + min_pos+= max_rows - rows; + else + { + key_range *start_endp= prev_min_endp; + if (start_endp && !start_endp->keypart_map) + start_endp= 0; + /* + Get the estimate of rows in the previous gap + and two ranges surrounding this gap + */ + ha_rows r= this->records_in_range(keyno,start_endp,max_endp); + if (r == HA_POS_ERROR) + { + /* Some engine cannot estimate such ranges */ + total_rows += rows; + continue; + } + min_pos+= r - rows; + } + min_block_no= key_block_no(this, keyno, min_pos); + max_block_no= key_block_no(this, keyno, min_pos + rows); + new_touched_blocks= max_block_no - min_block_no + + MY_TEST(min_block_no != prev_max_block_no); + prev_max_block_no= max_block_no; + if (!prev_min_endp) + prev_min_endp= &prev_start_key; + /* Save range.start_key for the next iteration step */ + prev_start_key= range.start_key; + key_value.copy((const char *) prev_start_key.key, prev_start_key.length, + 
key_value.charset()); + prev_start_key.key= (const uchar *) key_value.ptr(); } total_rows += rows; + total_touched_blocks+= new_touched_blocks; } if (total_rows != HA_POS_ERROR) { + set_if_smaller(total_rows, max_rows); /* The following calculation is the same as in multi_range_read_info(): */ *flags |= HA_MRR_USE_DEFAULT_IMPL; cost->reset(); cost->avg_io_cost= 1; /* assume random seeks */ - if ((*flags & HA_MRR_INDEX_ONLY) && total_rows > 2) - cost->io_count= keyread_time(keyno, n_ranges, (uint)total_rows); + cost->idx_avg_io_cost= 1; + if (!(keyno == table->s->primary_key && primary_key_is_clustered())) + { + cost->idx_io_count= total_touched_blocks + + keyread_time(keyno, 0, total_rows); + cost->cpu_cost= cost->idx_cpu_cost= + (double) total_rows / TIME_FOR_COMPARE_IDX + + (2 * n_ranges - n_eq_ranges) * IDX_LOOKUP_COST; + if (!(*flags & HA_MRR_INDEX_ONLY)) + { + cost->io_count= read_time(keyno, 0, total_rows); + cost->cpu_cost+= (double) total_rows / TIME_FOR_COMPARE; + } + } else - cost->io_count= read_time(keyno, n_ranges, total_rows); - cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01; + { + cost->io_count= read_time(keyno, total_touched_blocks, (uint) total_rows); + cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01; + } } return total_rows; } @@ -183,10 +276,22 @@ ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows, cost->avg_io_cost= 1; /* assume random seeks */ /* Produce the same cost as non-MRR code does */ - if (*flags & HA_MRR_INDEX_ONLY) - cost->io_count= keyread_time(keyno, n_ranges, n_rows); + if (!(keyno == table->s->primary_key && primary_key_is_clustered())) + { + cost->idx_io_count= n_ranges + keyread_time(keyno, 0, n_rows); + cost->cpu_cost= cost->idx_cpu_cost= + (double) n_rows / TIME_FOR_COMPARE_IDX + n_ranges * IDX_LOOKUP_COST; + if (!(*flags & HA_MRR_INDEX_ONLY)) + { + cost->io_count= read_time(keyno, 0, n_rows); + cost->cpu_cost+= (double) n_rows / TIME_FOR_COMPARE; + } + } else - cost->io_count= read_time(keyno, n_ranges, n_rows); + { + cost->io_count= read_time(keyno, n_ranges, (uint)n_rows); + cost->cpu_cost= (double) n_rows / TIME_FOR_COMPARE + 0.01; + } return 0; } diff --git a/sql/opt_range.cc b/sql/opt_range.cc index b9b74bdd0c4..ba2705bab40 100644 --- a/sql/opt_range.cc +++ b/sql/opt_range.cc @@ -2520,6 +2520,7 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, quick=0; needed_reg.clear_all(); quick_keys.clear_all(); + head->with_impossible_ranges.clear_all(); DBUG_ASSERT(!head->is_filled_at_execution()); if (keys_to_use.is_clear_all() || head->is_filled_at_execution()) DBUG_RETURN(0); @@ -2639,8 +2640,8 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, if (!force_quick_range && !head->covering_keys.is_clear_all()) { int key_for_use= find_shortest_key(head, &head->covering_keys); - double key_read_time= head->file->keyread_time(key_for_use, 1, records) + - (double) records / TIME_FOR_COMPARE; + double key_read_time= head->file->key_scan_time(key_for_use) + + (double) records / TIME_FOR_COMPARE_IDX; DBUG_PRINT("info", ("'all'+'using index' scan will be using key %d, " "read time %g", key_for_use, key_read_time)); if (key_read_time < read_time) @@ -4790,6 +4791,7 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge, double roru_index_costs; ha_rows roru_total_records; double roru_intersect_part= 1.0; + double limit_read_time= read_time; size_t n_child_scans; DBUG_ENTER("get_best_disjunct_quick"); DBUG_PRINT("info", ("Full table scan cost: %g", 
read_time)); @@ -4936,7 +4938,7 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge, if (imerge_trp) { TABLE_READ_PLAN *trp= merge_same_index_scans(param, imerge, imerge_trp, - read_time); + limit_read_time); if (trp != imerge_trp) DBUG_RETURN(trp); } @@ -5409,9 +5411,8 @@ bool prepare_search_best_index_intersect(PARAM *param, same_index_prefix(cpk_scan->key_info, key_info, used_key_parts)) continue; - cost= table->file->keyread_time((*index_scan)->keynr, - (*index_scan)->range_count, - (*index_scan)->records); + cost= table->quick_index_only_costs[(*index_scan)->keynr]; + if (cost >= cutoff_cost) continue; @@ -8556,6 +8557,7 @@ int and_range_trees(RANGE_OPT_PARAM *param, SEL_TREE *tree1, SEL_TREE *tree2, if (key && key->type == SEL_ARG::IMPOSSIBLE) { result->type= SEL_TREE::IMPOSSIBLE; + param->table->with_impossible_ranges.set_bit(param->real_keynr[key_no]); DBUG_RETURN(1); } result_keys.set_bit(key_no); @@ -10541,7 +10543,6 @@ ha_rows check_quick_select(PARAM *param, uint idx, bool index_only, handler *file= param->table->file; ha_rows rows= HA_POS_ERROR; uint keynr= param->real_keynr[idx]; - uint length; DBUG_ENTER("check_quick_select"); /* Handle cases when we don't have a valid non-empty list of range */ @@ -10600,8 +10601,10 @@ ha_rows check_quick_select(PARAM *param, uint idx, bool index_only, MY_MIN(param->table->quick_condition_rows, rows); param->table->quick_rows[keynr]= rows; param->table->quick_costs[keynr]= cost->total_cost(); - param->table->quick_key_io[keynr]= - file->get_io_cost(keynr, param->table->quick_rows[keynr], &length); + if (keynr == param->table->s->primary_key && pk_is_clustered) + param->table->quick_index_only_costs[keynr]= 0; + else + param->table->quick_index_only_costs[keynr]= cost->index_only_cost(); } } /* Figure out if the key scan is ROR (returns rows in ROWID order) or not */ @@ -13656,7 +13659,7 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts, 1/double(2*TIME_FOR_COMPARE); const double cpu_cost= num_groups * - (tree_traversal_cost + 1/double(TIME_FOR_COMPARE)); + (tree_traversal_cost + 1/double(TIME_FOR_COMPARE_IDX)); *read_cost= io_cost + cpu_cost; *records= num_groups; diff --git a/sql/opt_subselect.h b/sql/opt_subselect.h index 031118288b9..e81b100e2a1 100644 --- a/sql/opt_subselect.h +++ b/sql/opt_subselect.h @@ -303,6 +303,7 @@ public: pos->loosescan_picker.loosescan_parts= best_max_loose_keypart + 1; pos->use_join_buffer= FALSE; pos->table= tab; + pos->range_rowid_filter_info= tab->range_rowid_filter_info; // todo need ref_depend_map ? DBUG_PRINT("info", ("Produced a LooseScan plan, key %s, %s", tab->table->key_info[best_loose_scan_key].name.str, diff --git a/sql/rowid_filter.cc b/sql/rowid_filter.cc index bcca9a0f528..4348eea081d 100644 --- a/sql/rowid_filter.cc +++ b/sql/rowid_filter.cc @@ -1,218 +1,556 @@ +/* + Copyright (c) 2018, 2019 MariaDB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include "mariadb.h" +#include "table.h" +#include "sql_class.h" +#include "opt_range.h" #include "rowid_filter.h" +#include "sql_select.h" + + +inline +double Range_rowid_filter_cost_info::lookup_cost( + Rowid_filter_container_type cont_type) +{ + switch (cont_type) { + case SORTED_ARRAY_CONTAINER: + return log(est_elements)*0.01; + default: + DBUG_ASSERT(0); + return 0; + } +} /** - Sets information about filter with key_numb index. - It sets a cardinality of filter, calculates its selectivity - and gets slope and interscept values. + @brief + The average gain in cost per row to use the range filter with this cost info */ -void Range_filter_cost_info::init(TABLE *tab, uint key_numb) +inline +double Range_rowid_filter_cost_info::avg_access_and_eval_gain_per_row( + Rowid_filter_container_type cont_type) { - table= tab; - key_no= key_numb; - cardinality= table->quick_rows[key_no]; - b= filter_io_cost() + filter_write_cost() + filter_sort_cost(); - selectivity= cardinality/((double) table->stat_records()); - a= (1 + COST_COND_EVAL)*(1 - selectivity) - lookup_cost(); - intersect_x_axis_abcissa= b/a; + return (1+1.0/TIME_FOR_COMPARE) * (1 - selectivity) - + lookup_cost(cont_type); } - /** @brief - Sort available filters by their building cost in the increasing order + Initialize the cost info structure for a range filter - @details - The method starts sorting available filters from the first filter that - is not defined as the best filter. If there are two filters that are - defined as the best filters there is no need to sort other filters. - Best filters are already sorted by their building cost and have the - smallest bulding cost in comparison with other filters by definition. + @param cont_type The type of the container of the range filter + @param tab The table for which the range filter is evaluated + @param idx The index used to create this range filter +*/ + +void Range_rowid_filter_cost_info::init(Rowid_filter_container_type cont_type, + TABLE *tab, uint idx) +{ + container_type= cont_type; + table= tab; + key_no= idx; + est_elements= table->quick_rows[key_no]; + b= build_cost(container_type); + selectivity= est_elements/((double) table->stat_records()); + a= avg_access_and_eval_gain_per_row(container_type); + if (a > 0) + cross_x= b/a; + abs_independent.clear_all(); +} - As the sorting method bubble sort is used. 
+ +/** + @brief + Return the cost of building a range filter of a certain type */ -void TABLE::sort_range_filter_cost_info_array() +double +Range_rowid_filter_cost_info::build_cost(Rowid_filter_container_type cont_type) { - if (best_filter_count == 2) - return; + double cost= 0; - for (uint i= best_filter_count; i < range_filter_cost_info_elements-1; i++) - { - for (uint j= i+1; j < range_filter_cost_info_elements; j++) - { - if (range_filter_cost_info[i].intersect_x_axis_abcissa > - range_filter_cost_info[j].intersect_x_axis_abcissa) - swap_variables(Range_filter_cost_info, - range_filter_cost_info[i], - range_filter_cost_info[j]); - } + cost+= table->quick_index_only_costs[key_no]; + + switch (cont_type) { + + case SORTED_ARRAY_CONTAINER: + cost+= ARRAY_WRITE_COST * est_elements; /* cost filling the container */ + cost+= ARRAY_SORT_C * est_elements * log(est_elements); /* sorting cost */ + break; + default: + DBUG_ASSERT(0); } + + return cost; +} + + +Rowid_filter_container *Range_rowid_filter_cost_info::create_container() +{ + THD *thd= table->in_use; + uint elem_sz= table->file->ref_length; + Rowid_filter_container *res= 0; + + switch (container_type) { + case SORTED_ARRAY_CONTAINER: + res= new (thd->mem_root) Rowid_filter_sorted_array(est_elements, elem_sz); + break; + default: + DBUG_ASSERT(0); + } + return res; +} + + +static +int compare_range_rowid_filter_cost_info_by_a( + Range_rowid_filter_cost_info **filter_ptr_1, + Range_rowid_filter_cost_info **filter_ptr_2) +{ + double diff= (*filter_ptr_2)->get_a() - (*filter_ptr_1)->get_a(); + return (diff < 0 ? -1 : (diff > 0 ? 1 : 0)); } /** @brief - The method searches for the filters that can reduce the join cost the most + Prepare the array with cost info on range filters to be used by optimizer @details - The method looks through the available filters trying to choose the best - filter and eliminate as many filters as possible. - - Filters are considered as a linear functions. The best filter is the linear - function that intersects all other linear functions not in the I quadrant - and has the biggest a (slope) value. This filter will reduce the partial - join cost the most. If it is possible the second best filter is also - chosen. The second best filter can be used if the ref access is made on - the index of the first best filter. - - So there is no need to store all other filters except filters that - intersect in the I quadrant. It is impossible to say on this step which - filter is better and will give the biggest gain. - - The number of filters that can be used is stored in the - range_filter_cost_info_elements variable. + The function removes the array of cost info on range filters the elements + for those range filters that won't be ever chosen as the best filter, no + matter what index will be used to access the table and at what step the + table will be joined. */ -void TABLE::prune_range_filters() +void TABLE::prune_range_rowid_filters() { - key_map pruned_filter_map; - pruned_filter_map.clear_all(); - Range_filter_cost_info *max_slope_filters[2] = {0, 0}; - - for (uint i= 0; i < range_filter_cost_info_elements; i++) + uint i, j; + + /* + For the elements of the array with cost info on range filters + build a bit matrix of absolutely independent elements. + Two elements are absolutely independent if they such indexes that + there is no other index that overlaps both of them or is constraint + correlated with both of them. Use abs_independent key maps to store + the elements if this bit matrix. 
+ */ + + Range_rowid_filter_cost_info **filter_ptr_1= range_rowid_filter_cost_info_ptr; + for (i= 0; i < range_rowid_filter_cost_info_elems; i++, filter_ptr_1++) { - Range_filter_cost_info *filter= &range_filter_cost_info[i]; - if (filter->a < 0) + uint key_no= (*filter_ptr_1)->key_no; + Range_rowid_filter_cost_info **filter_ptr_2= filter_ptr_1 + 1; + for (j= i+1; j < range_rowid_filter_cost_info_elems; j++, filter_ptr_2++) { - range_filter_cost_info_elements--; - swap_variables(Range_filter_cost_info, range_filter_cost_info[i], - range_filter_cost_info[range_filter_cost_info_elements]); - continue; - } - for (uint j= i+1; j < range_filter_cost_info_elements; j++) - { - Range_filter_cost_info *cand_filter= &range_filter_cost_info[j]; - - double intersect_x= filter->get_intersect_x(cand_filter); - double intersect_y= filter->get_intersect_y(intersect_x); - - if (intersect_x > 0 && intersect_y > 0) + key_map map= key_info[key_no].overlapped; + map.intersect(key_info[(*filter_ptr_2)->key_no].overlapped); + if (map.is_clear_all()) { - pruned_filter_map.set_bit(cand_filter->key_no); - pruned_filter_map.set_bit(filter->key_no); + (*filter_ptr_1)->abs_independent.set_bit((*filter_ptr_2)->key_no); + (*filter_ptr_2)->abs_independent.set_bit(key_no); } } - if (!pruned_filter_map.is_set(filter->key_no)) + } + + /* Sort the array range_filter_cost_info by 'a' in descending order */ + my_qsort(range_rowid_filter_cost_info_ptr, + range_rowid_filter_cost_info_elems, + sizeof(Range_rowid_filter_cost_info *), + (qsort_cmp) compare_range_rowid_filter_cost_info_by_a); + + /* + For each element check whether it is created for the filter that + can be ever chosen as the best one. If it's not the case remove + from the array. Otherwise put it in the array in such a place + that all already checked elements left the array are ordered by + cross_x. + */ + + Range_rowid_filter_cost_info **cand_filter_ptr= + range_rowid_filter_cost_info_ptr; + for (i= 0; i < range_rowid_filter_cost_info_elems; i++, cand_filter_ptr++) + { + bool is_pruned= false; + Range_rowid_filter_cost_info **usable_filter_ptr= + range_rowid_filter_cost_info_ptr; + key_map abs_indep; + abs_indep.clear_all(); + for (uint j= 0; j < i; j++, usable_filter_ptr++) { - if (!max_slope_filters[0]) - max_slope_filters[0]= filter; + if ((*cand_filter_ptr)->cross_x >= (*usable_filter_ptr)->cross_x) + { + if (abs_indep.is_set((*usable_filter_ptr)->key_no)) + { + /* + The following is true here for the element e being checked: + There are at 2 elements e1 and e2 among already selected such that + e1.cross_x < e.cross_x and e1.a > e.a + and + e2.cross_x < e_cross_x and e2.a > e.a, + i.e. the range filters f1, f2 of both e1 and e2 always promise + better gains then the range filter of e. + As e1 and e2 are absolutely independent one of the range filters + f1, f2 will be always a better choice than f no matter what index + is chosen to access the table. Because of this the element e + can be safely removed from the array. 
+ */ + + is_pruned= true; + break; + } + abs_indep.merge((*usable_filter_ptr)->abs_independent); + } else { - if (!max_slope_filters[1] || - max_slope_filters[1]->a < filter->a) - max_slope_filters[1]= filter; - if (max_slope_filters[0]->a < max_slope_filters[1]->a) - swap_variables(Range_filter_cost_info*, max_slope_filters[0], - max_slope_filters[1]); + /* + Move the element being checked to the proper position to have all + elements that have been already checked to be sorted by cross_x + */ + Range_rowid_filter_cost_info *moved= *cand_filter_ptr; + memmove(usable_filter_ptr+1, usable_filter_ptr, + sizeof(Range_rowid_filter_cost_info *) * (i-j-1)); + *usable_filter_ptr= moved; } } - } - - for (uint i= 0; i<2; i++) - { - if (max_slope_filters[i]) + if (is_pruned) { - swap_variables(Range_filter_cost_info, - range_filter_cost_info[i], - *max_slope_filters[i]); - if (i == 0 && - max_slope_filters[1] == &range_filter_cost_info[0]) - max_slope_filters[1]= max_slope_filters[0]; - - best_filter_count++; - max_slope_filters[i]= &range_filter_cost_info[i]; + /* Remove the checked element from the array */ + memmove(cand_filter_ptr, cand_filter_ptr+1, + sizeof(Range_rowid_filter_cost_info *) * + (range_rowid_filter_cost_info_elems - 1 - i)); + range_rowid_filter_cost_info_elems--; } } - sort_range_filter_cost_info_array(); } -void TABLE::select_usable_range_filters(THD *thd) +/** + @brief + Return maximum number of elements that a container allowed to have + */ + +static uint +get_max_range_rowid_filter_elems_for_table( + THD *thd, TABLE *tab, + Rowid_filter_container_type cont_type) +{ + switch (cont_type) { + case SORTED_ARRAY_CONTAINER : + return thd->variables.max_rowid_filter_size/tab->file->ref_length; + default : + DBUG_ASSERT(0); + return 0; + } +} + + +/** + @brief + Prepare info on possible range filters used by optimizer + + @param table The thread handler + + @details + The function first selects the indexes of the table that potentially + can be used for range filters and allocates an array of the objects + of the Range_rowid_filter_cost_info type to store cost info on + possible range filters and an array of pointers to these objects. + The latter is created for easy sorting of the objects with cost info + by different sort criteria. Then the function initializes the allocated + array with cost info for each possible range filter. After this + the function calls the method TABLE::prune_range_rowid_filters(). + The method removes the elements of the array for the filters that + promise less gain then others remaining in the array in any situation + and optimizes the order of the elements for faster choice of the best + range filter. 
+*/ + +void TABLE::init_cost_info_for_usable_range_rowid_filters(THD *thd) { uint key_no; key_map usable_range_filter_keys; usable_range_filter_keys.clear_all(); key_map::Iterator it(quick_keys); + + /* + From all indexes that can be used for range accesses select only such that + - range filter pushdown is supported by the engine for them (1) + - they are not clustered primary (2) + - the range filter containers for them are not too large (3) + */ while ((key_no= it++) != key_map::Iterator::BITMAP_END) { - if (quick_rows[key_no] > - thd->variables.max_rowid_filter_size/file->ref_length) + if (!(file->index_flags(key_no, 0, 1) & HA_DO_RANGE_FILTER_PUSHDOWN)) // !1 + continue; + if (key_no == s->primary_key && file->primary_key_is_clustered()) // !2 + continue; + if (quick_rows[key_no] > + get_max_range_rowid_filter_elems_for_table(thd, this, + SORTED_ARRAY_CONTAINER)) // !3 continue; usable_range_filter_keys.set_bit(key_no); } - if (usable_range_filter_keys.is_clear_all()) + /* + Allocate an array of objects to store cost info for the selected filters + and allocate an array of pointers to these objects + */ + + range_rowid_filter_cost_info_elems= usable_range_filter_keys.bits_set(); + if (!range_rowid_filter_cost_info_elems) + return; + + range_rowid_filter_cost_info_ptr= + (Range_rowid_filter_cost_info **) + thd->calloc(sizeof(Range_rowid_filter_cost_info *) * + range_rowid_filter_cost_info_elems); + range_rowid_filter_cost_info= + new (thd->mem_root) + Range_rowid_filter_cost_info[range_rowid_filter_cost_info_elems]; + if (!range_rowid_filter_cost_info_ptr || !range_rowid_filter_cost_info) + { + range_rowid_filter_cost_info_elems= 0; return; + } + + /* Fill the allocated array with cost info on the selected range filters */ - range_filter_cost_info_elements= usable_range_filter_keys.bits_set(); - range_filter_cost_info= - new (thd->mem_root) Range_filter_cost_info [range_filter_cost_info_elements]; - Range_filter_cost_info *curr_filter_cost_info= range_filter_cost_info; + Range_rowid_filter_cost_info **curr_ptr= range_rowid_filter_cost_info_ptr; + Range_rowid_filter_cost_info *curr_filter_cost_info= + range_rowid_filter_cost_info; key_map::Iterator li(usable_range_filter_keys); while ((key_no= li++) != key_map::Iterator::BITMAP_END) { - curr_filter_cost_info->init(this, key_no); + *curr_ptr= curr_filter_cost_info; + curr_filter_cost_info->init(SORTED_ARRAY_CONTAINER, this, key_no); + curr_ptr++; curr_filter_cost_info++; } - prune_range_filters(); + + prune_range_rowid_filters(); } -Range_filter_cost_info -*TABLE::best_filter_for_current_join_order(uint ref_key_no, - double record_count, - double records) +/** + @brief + Choose the best range filter for the given access of the table + + @param access_key_no The index by which the table is accessed + @param records The estimated total number of key tuples with this access + + @details + The function looks through the array of cost info for range filters + and chooses the element for the range filter that promise the greatest + gain with the the ref or range access of the table by access_key_no. 
+ As the array is sorted by cross_x in ascending order the function stops + the look through as soon as it reaches the first element with + cross_x > records because the range filter for this element and the + range filters for all remaining elements do not promise positive gains + + @retval Pointer to the cost info for the range filter that promises + the greatest gain, NULL if there is no such range filter +*/ + +Range_rowid_filter_cost_info * +TABLE::best_range_rowid_filter_for_partial_join(uint access_key_no, + double records) { - if (!this || range_filter_cost_info_elements == 0) + if (!this || range_rowid_filter_cost_info_elems == 0 || + covering_keys.is_set(access_key_no)) return 0; - double card= record_count*records; - Range_filter_cost_info *best_filter= &range_filter_cost_info[0]; - - if (card < best_filter->intersect_x_axis_abcissa) + /* + Currently we do not support usage of range filters if the table + is accessed by the clustered primary key. It does not make sense + if a full key is used. If the table is accessed by a partial + clustered primary key it would, but the current InnoDB code does not + allow it. Later this limitation will be lifted + */ + if (access_key_no == s->primary_key && file->primary_key_is_clustered()) return 0; - if (best_filter_count != 0) + + Range_rowid_filter_cost_info *best_filter= 0; + double best_filter_gain= 0; + + key_map *overlapped= &key_info[access_key_no].overlapped; + for (uint i= 0; i < range_rowid_filter_cost_info_elems ; i++) { - if (best_filter->key_no == ref_key_no) + double curr_gain = 0; + Range_rowid_filter_cost_info *filter= range_rowid_filter_cost_info_ptr[i]; + + /* + Do not use a range filter that uses an in index correlated with + the index by which the table is accessed + */ + if ((filter->key_no == access_key_no) || + overlapped->is_set(filter->key_no)) + continue; + + if (records < filter->cross_x) { - if (best_filter_count == 2) - { - best_filter= &range_filter_cost_info[1]; - if (card < best_filter->intersect_x_axis_abcissa) - return 0; - return best_filter; - } + /* Does not make sense to look through the remaining filters */ + break; + } + + curr_gain= filter->get_gain(records); + if (best_filter_gain < curr_gain) + { + best_filter_gain= curr_gain; + best_filter= filter; + } + } + return best_filter; +} + + +/** + @brief + Fill the range rowid filter performing the associated range index scan + + @details + This function performs the range index scan associated with this + range filter and place into the filter the rowids / primary keys + read from key tuples when doing this scan. + @retval + false on success + true otherwise + + @note + The function assumes that the quick select object to perform + the index range scan has been already created. + + @note + Currently the same table handler is used to access the joined table + and to perform range index scan filling the filter. + In the future two different handlers will be used for this + purposes to facilitate a lazy building of the filter. 
+*/ + +bool Range_rowid_filter::fill() +{ + int rc= 0; + handler *file= table->file; + THD *thd= table->in_use; + QUICK_RANGE_SELECT* quick= (QUICK_RANGE_SELECT*) select->quick; + + uint table_status_save= table->status; + Item *pushed_idx_cond_save= file->pushed_idx_cond; + uint pushed_idx_cond_keyno_save= file->pushed_idx_cond_keyno; + bool in_range_check_pushed_down_save= file->in_range_check_pushed_down; + + table->status= 0; + file->pushed_idx_cond= 0; + file->pushed_idx_cond_keyno= MAX_KEY; + file->in_range_check_pushed_down= false; + + /* We're going to just read rowids / primary keys */ + table->prepare_for_position(); + + table->file->ha_start_keyread(quick->index); + + if (quick->init() || quick->reset()) + rc= 1; + + while (!rc) + { + rc= quick->get_next(); + if (thd->killed) + rc= 1; + if (!rc) + { + file->position(quick->record); + if (container->add(NULL, (char*) file->ref)) + rc= 1; } + } + + quick->range_end(); + table->file->ha_end_keyread(); + + table->status= table_status_save; + file->pushed_idx_cond= pushed_idx_cond_save; + file->pushed_idx_cond_keyno= pushed_idx_cond_keyno_save; + file->in_range_check_pushed_down= in_range_check_pushed_down_save; + + if (rc != HA_ERR_END_OF_FILE) + return 1; + table->file->rowid_filter_is_active= true; + return 0; +} + + +/** + @brief + Binary search in the sorted array of a rowid filter + + @param ctxt context of the search + @parab elem rowid / primary key to look for + + @details + The function looks for the rowid / primary key ' elem' in this container + assuming that ctxt contains a pointer to the TABLE structure created + for the table to whose row elem refers to. + + @retval + true elem is found in the container + false otherwise +*/ + +bool Rowid_filter_sorted_array::check(void *ctxt, char *elem) +{ + TABLE *table= (TABLE *) ctxt; + if (!is_checked) + { + refpos_container.sort(refpos_order_cmp, (void *) (table->file)); + is_checked= true; + } + int l= 0; + int r= refpos_container.elements()-1; + while (l <= r) + { + int m= (l + r) / 2; + int cmp= refpos_order_cmp((void *) (table->file), + refpos_container.get_pos(m), elem); + if (cmp == 0) + return true; + if (cmp < 0) + l= m + 1; else - return best_filter; + r= m-1; } + return false; +} - double best_filter_improvement= 0.0; - best_filter= 0; - for (uint i= best_filter_count; i < range_filter_cost_info_elements; i++) +Range_rowid_filter::~Range_rowid_filter() +{ + delete container; + container= 0; + if (select) { - Range_filter_cost_info *filter= &range_filter_cost_info[i]; - if (card < filter->intersect_x_axis_abcissa) - break; - if (best_filter_improvement < filter->get_filter_gain(card)) + if (select->quick) { - best_filter_improvement= filter->get_filter_gain(card); - best_filter= filter; + delete select->quick; + select->quick= 0; } + delete select; + select= 0; } - return best_filter; } diff --git a/sql/rowid_filter.h b/sql/rowid_filter.h index 0d8520f25c5..3cc2c31555c 100644 --- a/sql/rowid_filter.h +++ b/sql/rowid_filter.h @@ -1,183 +1,451 @@ +/* + Copyright (c) 2018, 2019 MariaDB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + #ifndef ROWID_FILTER_INCLUDED #define ROWID_FILTER_INCLUDED -/** - It makes sense to apply filters for a certain join order when the following - inequality holds: - #T + c4*#T > #T*sel(Fi) + c4*#T*sel(Fi) + - I/O(Fi) + c1*#(Fi) + c2*#(Fi)*log(#(Fi)) + - c3*#T (1), +#include "mariadb.h" +#include "sql_array.h" + +/* + + What rowid / primary filters are + -------------------------------- + + Consider a join query Q of the form + SELECT * FROM T1, ... , Tk WHERE P. + + For any of the table reference Ti(Q) from the from clause of Q different + rowid / primary key filters (pk-filters for short) can be built. + A pk-filter F built for Ti(Q) is a set of rowids / primary keys of Ti + F= {pk1,...,pkN} such that for any row r=r1||...||rk from the result set of Q + ri's rowid / primary key pk(ri) is contained in F. + + When pk-filters are useful + -------------------------- + + If building a pk-filter F for Ti(Q )is not too costly and its cardinality #F + is much less than the cardinality of T - #T then using the pk-filter when + executing Q might be quite beneficial. + + Let r be a random row from Ti. Let s(F) be the probability that pk(r) + belongs to F. Let BC(F) be the cost of building F. + + Suppose that the optimizer has chosen for Q a plan with this join order + T1 => ... Tk and that the table Ti is accessed by a ref access using index I. + Let K = {k1,...,kM} be the set of all rowid/primary keys values used to access + rows of Ti when looking for matches in this table.to join Ti by index I. + + Let's assume that two set sets K and F are uncorrelated. With this assumption + if before accessing data from Ti by the rowid / primary key k we first + check whether k is in F then we can expect saving on M*(1-s(S)) accesses of + data rows from Ti. If we can guarantee that test whether k is in F is + relatively cheap then we can gain a lot assuming that BC(F) is much less + then the cost of fetching M*(1-s(S)) records from Ti and following + evaluation of conditions pushed into Ti. + + Making pk-filter test cheap + --------------------------- + + If the search structure to test whether an element is in F can be fully + placed in RAM then this test is expected to be be much cheaper than a random + access of a record from Ti. We'll consider two search structures for + pk-filters: ordered array and bloom filter. Ordered array is easy to + implement, but it's space consuming. If a filter contains primary keys + then at least space for each primary key from the filter must be allocated + in the search structure. On a the opposite a bloom filter requires a + fixed number of bits and this number does not depend on the cardinality + of the pk-filter (10 bits per element will serve pk-filter of any size). 
- where #T - the fanout of the partial join - Fi - a filter for the index with the number i in - the key_map of available indexes for this table - sel(Fi) - the selectivity of the index with the number - i - c4*#T, - c4*#T*sel(Fi) - a cost to apply available predicates - c4 - a constant to apply available predicates - I/O(Fi) - a cost of the I/O accesses to Fi - #(Fi) - a number of estimated records that range - access would use - c1*#(Fi) - a cost to write in Fi - c1 - a constant to write one element in Fi - c2*#(Fi)*log(#(Fi)) - a cost to sort in Fi - c2 - a sorting constant - c3*(#T) - a cost to look-up into a current partial join - c3 - a constant to look-up into Fi +*/ - Let's set a new variable FBCi (filter building cost for the filter with - index i): +/* + + How and when the optimizer builds and uses range rowid filters + -------------------------------------------------------------- + + 1. In make_join_statistics() + for each join table s + after the call of get_quick_record_count() + the TABLE::method init_cost_info_for_usable_range_rowid_filters() + is called + The method build an array of Range_rowid_filter_cost_info elements + containing the cost info on possible range filters for s->table. + The array is optimized for further usage. + + 2. For each partial join order when the optimizer considers joining + table s to this partial join + In the function best_access_path() + a. When evaluating a ref access r by index idx to join s + the optimizer estimates the effect of usage of each possible + range filter f and chooses one with the best gain. The gain + is taken into account when the cost of thr ref access r is + calculated. If it turns out that this is the best ref access + to join s then the info about the chosen filter together + with the info on r is remembered in the corresponding element + of the array of POSITION structures. + [We evaluate every pair (ref access, range_filter) rather then + every pair (best ref access, range filter) because if the index + ref_idx used for ref access r correlates with the index rf_idx + used by the filter f then the pair (r,f) is not evaluated + at all as we don't know how to estimate the effect of correlation + between ref_idx and rf_idx.] + b. When evaluating the best range access to join table s the + optimizer estimates the effect of usage of each possible + range filter f and chooses one with the best gain. + [Here we should have evaluated every pair (range access, + range filter) as well, but it's not done yet.] + + 3. When the cheapest execution plan has been chosen and after the + call of JOIN::get_best_combination() + The method JOIN::make_range_rowid_filters() is called + For each range rowid filter used in the chosen execution plan + the method creates a quick select object to be able to perform + index range scan to fill the filter at the execution stage. + The method also creates Range_rowid_filter objects that are + used at the execution stage. + + 4. Just before the execution stage + The method JOIN::init_range_rowid_filters() is called. + For each join table s that is to be accessed with usage of a range + filter the method allocates containers for the range filter and + it lets the engine know that the filter will be used when + accessing s. + + 5. 
At the execution stage + In the function sub_select() just before the first access of a join + table s employing a range filter + The method JOIN_TAB::build_range_rowid_filter_if_needed() is called + The method fills the filter using the quick select created by + JOIN::make_range_rowid_filters(). + + 6. The accessed key tuples are checked against the filter within the engine + using the info pushed into it. - FBCi = I/O(Fi) + c1*#(Fi) + c2*#(Fi)*log(#(Fi)) +*/ - It can be seen that FBCi doesn't depend on #T. +class TABLE; +class SQL_SELECT; +class Rowid_filter_container; +class Range_rowid_filter_cost_info; - So using this variable (1) can be rewritten: +/* Cost to write rowid into array */ +#define ARRAY_WRITE_COST 0.005 +/* Factor used to calculate cost of sorting rowids in array */ +#define ARRAY_SORT_C 0.01 +/* Cost to evaluate condition */ +#define COST_COND_EVAL 0.2 - #T + c4*#T > #T*sel(Fi) + c4*#T*sel(Fi) + - FBCi + - c3*#T +typedef enum +{ + SORTED_ARRAY_CONTAINER, + BLOOM_FILTER_CONTAINER +} Rowid_filter_container_type; - To get a possible cost improvement when a filter is used right part - of the (1) inequality should be deducted from the left part. - Denote it as G(#T): +/** + @class Rowid_filter_container - G(#T)= #T + c4*#T - (#T*sel(Fi) + c4*#T*sel(Fi) + FBCi + c3*#T) (2) + The interface for different types of containers to store info on the set + of rowids / primary keys that defines a pk-filter. - On the prepare stage when filters are created #T value isn't known. + There will be two implementations of this abstract class. + - sorted array + - bloom filter +*/ - To find out what filter is the best among available one for the table - (what filter gives the biggest gain) a knowledge about linear functions - can be used. Consider filter gain as a linear function: +class Rowid_filter_container : public Sql_alloc +{ +public: - Gi(#T)= ai*#T + bi (3) + virtual Rowid_filter_container_type get_type() = 0; - where ai= 1+c4-c3-sel(Fi)*(1+c4), - bi= -FBCi + /* Allocate memory for the container */ + virtual bool alloc() = 0; - Filter gain can be interpreted as an ordinate, #T as abscissa. + /* + @brief Add info on a rowid / primary to the container + @param ctxt The context info (opaque) + @param elem The rowid / primary key to be added to the container + @retval true if elem is successfully added + */ + virtual bool add(void *ctxt, char *elem) = 0; - So the aim is to find the linear function that has the biggest ordinate value - for each positive abscissa (because #T can't be negative) comparing with - the other available functions. + /* + @brief Check whether a rowid / primary key is in container + @param ctxt The context info (opaque) + @param elem The rowid / primary key to be checked against the container + @retval False if elem is definitely not in the container + */ + virtual bool check(void *ctxt, char *elem) = 0; - Consider two filters Fi, Fj or linear functions with a positive slope. - To find out which linear function is better let's find their intersection - point coordinates. + virtual ~Rowid_filter_container() {} +}; - Gi(#T0)= Gj(#T0) (using (2))=> - #T0= (bj - bi)/(ai - aj) (using (3)) - => - #T0= (BCFj-BCFi)/((sel(Fj)-sel(Fi))*(1+c4)) - If put #T0 value into the (3) formula G(#T0) can be easily found. +/** + @class Rowid_filter - It can be seen that if two linear functions intersect in II, III or IV - quadrants the linear function with a bigger slope value will always - be better. 
+ The interface for different types of pk-filters - If two functions intersect in the I quadrant for #T1 < #T0 a function - with a smaller slope value will give a better gain and when #T1 > #T0 - function with a bigger slope will give better gain. + Currently we support only range pk filters. +*/ - for each #T1 > #T0 if (ai > aj) => (Gi(#T1) >= Gj(#T1)) - #T1 <= #T0 if (ai > aj) => (Gi(#T1) <= Gj(#T1)) +class Rowid_filter : public Sql_alloc +{ +protected: - So both linear functions should be saved. + /* The container to store info the set of elements in the filter */ + Rowid_filter_container *container; - Interesting cases: +public: + Rowid_filter(Rowid_filter_container *container_arg) + : container(container_arg) {} - 1. For Fi,Fj filters ai=aj. + /* + Build the filter : + fill it with info on the set of elements placed there + */ + virtual bool build() = 0; - In this case intercepts bi and bj should be compared. - The filter with the biggest intercept will give a better result. + /* + Check whether an element is in the filter. + Returns false is the elements is definitely not in the filter. + */ + virtual bool check(char *elem) = 0; - 2. Only one filter remains after the calculations and for some join order - it is equal to the index that is used to access table. Therefore, this - filter can't be used. - - In this case the gain is computed for every filter that can be constructed - for this table. + virtual ~Rowid_filter() {} - After information about filters is computed for each partial join order - it is checked if the filter can be applied to the current table. - If it gives a cost improvement it is saved as the best plan for this - partial join. -*/ + Rowid_filter_container *get_container() { return container; } +}; -#include "mariadb.h" -#include "sql_class.h" -#include "table.h" -/* Cost to write into filter */ -#define COST_WRITE 0.01 -/* Weight factor for filter sorting */ -#define CNST_SORT 0.01 -/* Cost to evaluate condition */ -#define COST_COND_EVAL 0.2 +/** + @class Rowid_filter_container -class QUICK_RANGE_SELECT; + The implementation of the Rowid_interface used for pk-filters + that are filled when performing range index scans. 
+*/ -class Range_filter_cost_info : public Sql_alloc +class Range_rowid_filter: public Rowid_filter { -public: + /* The table for which the rowid filter is built */ TABLE *table; - uint key_no; - double cardinality; - double b; // intercept of the linear function - double a; // slope of the linear function - double selectivity; - double intersect_x_axis_abcissa; + /* The select to perform the range scan to fill the filter */ + SQL_SELECT *select; + /* The cost info on the filter (used for EXPLAIN/ANALYZE) */ + Range_rowid_filter_cost_info *cost_info; + +public: + Range_rowid_filter(TABLE *tab, + Range_rowid_filter_cost_info *cost_arg, + Rowid_filter_container *container_arg, + SQL_SELECT *sel) + : Rowid_filter(container_arg), table(tab), select(sel), cost_info(cost_arg) + {} + + ~Range_rowid_filter(); + + bool build() { return fill(); } - /** - Filter cost functions + bool check(char *elem) { return container->check(table, elem); } + + bool fill(); + + SQL_SELECT *get_select() { return select; } +}; + + +/** + @class Refpos_container_sorted_array + + The wrapper class over Dynamic_array<char> to facilitate operations over + array of elements of the type char[N] where N is the same for all elements +*/ + +class Refpos_container_sorted_array : public Sql_alloc +{ + /* + Maximum number of elements in the array + (Now is used only at the initialization of the dynamic array) */ - /* Cost to lookup into filter */ - inline double lookup_cost() + uint max_elements; + /* Number of bytes allocated for an element */ + uint elem_size; + /* The dynamic array over which the wrapper is built */ + Dynamic_array<char> *array; + +public: + + Refpos_container_sorted_array(uint max_elems, uint elem_sz) + : max_elements(max_elems), elem_size(elem_sz), array(0) {} + + ~Refpos_container_sorted_array() { - return log(cardinality)*0.01; + delete array; + array= 0; } - /* IO accesses cost to access filter */ - inline double filter_io_cost() - { return table->quick_key_io[key_no]; } + bool alloc() + { + array= new Dynamic_array<char> (elem_size * max_elements, + elem_size * max_elements/sizeof(char) + 1); + return array == NULL; + } - /* Cost to write elements in filter */ - inline double filter_write_cost() - { return COST_WRITE*cardinality; } + bool add(char *elem) + { + for (uint i= 0; i < elem_size; i++) + { + if (array->append(elem[i])) + return true; + } + return false; + } - /* Cost to sort elements in filter */ - inline double filter_sort_cost() - { - return CNST_SORT*cardinality*log(cardinality); + char *get_pos(uint n) + { + return array->get_pos(n * elem_size); } - /* End of filter cost functions */ - Range_filter_cost_info() : table(0), key_no(0) {} + uint elements() { return array->elements() / elem_size; } + + void sort (int (*cmp) (void *ctxt, const void *el1, const void *el2), + void *cmp_arg) + { + my_qsort2(array->front(), array->elements()/elem_size, + elem_size, (qsort2_cmp) cmp, cmp_arg); + } +}; + + +/** + @class Rowid_filter_sorted_array + + The implementation of the Rowid_filter_container interface as + a sorted array container of rowids / primary keys +*/ + +class Rowid_filter_sorted_array: public Rowid_filter_container +{ + /* The dynamic array to store rowids / primary keys */ + Refpos_container_sorted_array refpos_container; + /* Initially false, becomes true after the first call of (check() */ + bool is_checked; + +public: + Rowid_filter_sorted_array(uint elems, uint elem_size) + : refpos_container(elems, elem_size), is_checked(false) {} + + Rowid_filter_container_type get_type() + { 
return SORTED_ARRAY_CONTAINER; } + + bool alloc() { return refpos_container.alloc(); } + + bool add(void *ctxt, char *elem) { return refpos_container.add(elem); } + + bool check(void *ctxt, char *elem); +}; + +/** + @class Range_rowid_filter_cost_info + + An objects of this class is created for each potentially usable + range filter. It contains the info that allows to figure out + whether usage of the range filter promises some gain. +*/ + +class Range_rowid_filter_cost_info : public Sql_alloc +{ + /* The table for which the range filter is to be built (if needed) */ + TABLE *table; + /* Estimated number of elements in the filter */ + double est_elements; + /* The cost of building the range filter */ + double b; + /* + a*N-b yields the gain of the filter + for N key tuples of the index key_no + */ + double a; + /* The value of N where the gain is 0 */ + double cross_x; + /* Used for pruning of the potential range filters */ + key_map abs_independent; + +public: + /* The type of the container of the range filter */ + Rowid_filter_container_type container_type; + /* The index whose range scan would be used to build the range filter */ + uint key_no; + /* The selectivity of the range filter */ + double selectivity; + + Range_rowid_filter_cost_info() : table(0), key_no(0) {} - void init(TABLE *tab, uint key_numb); + void init(Rowid_filter_container_type cont_type, + TABLE *tab, uint key_no); - inline double get_intersect_x(Range_filter_cost_info *filter) + double build_cost(Rowid_filter_container_type container_type); + + inline double lookup_cost(Rowid_filter_container_type cont_type); + + inline double + avg_access_and_eval_gain_per_row(Rowid_filter_container_type cont_type); + + /* Get the gain that usage of filter promises for 'rows' key tuples */ + inline double get_gain(double rows) { - if (a == filter->a) - return DBL_MAX; - return (b - filter->b)/(a - filter->a); + return rows * a - b; } - inline double get_intersect_y(double intersect_x) + + /* + The gain promised by usage of the filter at the assumption that + when the table is accessed without the filter at most worst_seeks + pages are fetched from disk to access the data rows of the table + */ + inline double get_adjusted_gain(double rows, double worst_seeks) { - if (intersect_x == DBL_MAX) - return DBL_MAX; - return intersect_x*a - b; + return get_gain(rows) - + (1 - selectivity) * (rows - MY_MIN(rows, worst_seeks)); } - /** - Get a gain that a usage of filter in some partial join order - with the cardinaly card gives + /* + The gain promised by usage of the filter + due to less condition evaluations */ - inline double get_filter_gain(double card) - { return card*a - b; } + inline double get_cmp_gain(double rows) + { + return rows * (1 - selectivity) / TIME_FOR_COMPARE; + } + + Rowid_filter_container *create_container(); + + double get_a() { return a; } + + friend + void TABLE::prune_range_rowid_filters(); + + friend + void TABLE::init_cost_info_for_usable_range_rowid_filters(THD *thd); + + friend + Range_rowid_filter_cost_info * + TABLE::best_range_rowid_filter_for_partial_join(uint access_key_no, + double records); }; #endif /* ROWID_FILTER_INCLUDED */ diff --git a/sql/sql_array.h b/sql/sql_array.h index 0e5246b7e2a..4a49ade5576 100644 --- a/sql/sql_array.h +++ b/sql/sql_array.h @@ -166,6 +166,19 @@ public: return ((const Elem*)array.buffer) + array.elements - 1; } + /// @returns pointer to n-th element + Elem *get_pos(size_t idx) + { + return ((Elem*)array.buffer) + idx; + } + + /// @returns pointer to n-th element + const 
Elem *get_pos(size_t idx) const + { + return ((const Elem*)array.buffer) + idx; + } + + /** @retval false ok @retval true OOM, @c my_error() has been called. diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 0dd82351719..eae7d988772 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -2673,7 +2673,7 @@ void THD::make_explain_field_list(List<Item> &field_list, uint8 explain_flags, mem_root); item->maybe_null=1; field_list.push_back(item=new (mem_root) - Item_empty_string(this, "ref|filter", + Item_empty_string(this, "ref", NAME_CHAR_LEN*MAX_REF_PARTS, cs), mem_root); item->maybe_null=1; diff --git a/sql/sql_const.h b/sql/sql_const.h index be26de872df..e6dcc3420cf 100644 --- a/sql/sql_const.h +++ b/sql/sql_const.h @@ -194,7 +194,11 @@ instead of reading with keys. The number says how many evaluation of the WHERE clause is comparable to reading one extra row from a table. */ -#define TIME_FOR_COMPARE 5 // 5 compares == one read +#define TIME_FOR_COMPARE 5 // 5 compares == one read +#define TIME_FOR_COMPARE_IDX 20 + +#define IDX_BLOCK_COPY_COST ((double) 1 / TIME_FOR_COMPARE) +#define IDX_LOOKUP_COST ((double) 1 / 8) /** Number of comparisons of table rowids equivalent to reading one row from a diff --git a/sql/sql_explain.cc b/sql/sql_explain.cc index 23bc1e906a0..c7c6f15f7cc 100644 --- a/sql/sql_explain.cc +++ b/sql/sql_explain.cc @@ -380,11 +380,13 @@ int print_explain_row(select_result_sink *result, item_list.push_back(item, mem_root); /* 'rows' */ + StringBuffer<64> rows_str; if (rows) { + rows_str.append_ulonglong((ulonglong)(*rows)); item_list.push_back(new (mem_root) - Item_int(thd, *rows, MY_INT64_NUM_DECIMAL_DIGITS), - mem_root); + Item_string_sys(thd, rows_str.ptr(), + rows_str.length()), mem_root); } else item_list.push_back(item_null, mem_root); @@ -1113,12 +1115,14 @@ void Explain_table_access::fill_key_str(String *key_str, bool is_json) const - this is just used key length for ref/range - for index_merge, it is a comma-separated list of lengths. - for hash join, it is key_len:pseudo_key_len + - [tabular form only] rowid filter length is added after "|". - The column looks identical in tabular and json forms. In JSON, we consider - the column legacy, it is superceded by used_key_parts. + In JSON, we consider this column to be legacy, it is superceded by + used_key_parts. 
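For example (illustrative values only, not taken from a real plan): a ref access through a 4-byte key combined with a rowid filter built over a 5-byte index key would appear in the tabular key_len column as "4|5", matching the "ref|filter" marker that the patch appends to the type column.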
*/ -void Explain_table_access::fill_key_len_str(String *key_len_str) const +void Explain_table_access::fill_key_len_str(String *key_len_str, + bool is_json) const { bool is_hj= (type == JT_HASH || type == JT_HASH_NEXT || type == JT_HASH_RANGE || type == JT_HASH_INDEX_MERGE); @@ -1147,13 +1151,12 @@ void Explain_table_access::fill_key_len_str(String *key_len_str) const key_len_str->append(buf, length); } - if (key.get_filter_key_length() != (uint)-1) + if (!is_json && rowid_filter) { - char buf[64]; - size_t length; - key_len_str->append(','); - length= longlong10_to_str(key.get_filter_key_length(), buf, 10) - buf; - key_len_str->append(buf, length); + key_len_str->append('|'); + StringBuffer<64> filter_key_len; + rowid_filter->quick->print_key_len(&filter_key_len); + key_len_str->append(filter_key_len); } } @@ -1240,7 +1243,18 @@ int Explain_table_access::print_explain(select_result_sink *output, uint8 explai } /* `type` column */ - push_str(thd, &item_list, join_type_str[type]); + StringBuffer<64> join_type_buf; + if (rowid_filter == NULL) + push_str(thd, &item_list, join_type_str[type]); + else + { + join_type_buf.append(join_type_str[type]); + join_type_buf.append("|filter"); + item_list.push_back(new (mem_root) + Item_string_sys(thd, join_type_buf.ptr(), + join_type_buf.length()), + mem_root); + } /* `possible_keys` column */ StringBuffer<64> possible_keys_buf; @@ -1252,6 +1266,14 @@ int Explain_table_access::print_explain(select_result_sink *output, uint8 explai /* `key` */ StringBuffer<64> key_str; fill_key_str(&key_str, false); + + if (rowid_filter) + { + key_str.append("|"); + StringBuffer<64> rowid_key_str; + rowid_filter->quick->print_key(&rowid_key_str); + key_str.append(rowid_key_str); + } if (key_str.length() > 0) push_string(thd, &item_list, &key_str); @@ -1260,7 +1282,7 @@ int Explain_table_access::print_explain(select_result_sink *output, uint8 explai /* `key_len` */ StringBuffer<64> key_len_str; - fill_key_len_str(&key_len_str); + fill_key_len_str(&key_len_str, false); if (key_len_str.length() > 0) push_string(thd, &item_list, &key_len_str); @@ -1288,10 +1310,10 @@ int Explain_table_access::print_explain(select_result_sink *output, uint8 explai { rows_str.append_ulonglong((ulonglong)rows); - if (is_filter_set()) + if (rowid_filter) { rows_str.append(" ("); - rows_str.append_ulonglong(filter_perc); + rows_str.append_ulonglong(round(rowid_filter->selectivity * 100.0)); rows_str.append("%)"); } item_list.push_back(new (mem_root) @@ -1376,7 +1398,7 @@ int Explain_table_access::print_explain(select_result_sink *output, uint8 explai extra_buf.append(STRING_WITH_LEN("Using filesort")); } - if (is_filter_set()) + if (rowid_filter) { if (first) first= false; @@ -1573,6 +1595,19 @@ void add_json_keyset(Json_writer *writer, const char *elem_name, } +void Explain_rowid_filter::print_explain_json(Explain_query *query, + Json_writer *writer, + bool is_analyze) +{ + Json_writer_nesting_guard guard(writer); + writer->add_member("rowid_filter").start_object(); + quick->print_json(writer); + writer->add_member("rows").add_ll(rows); + writer->add_member("selectivity_pct").add_double(selectivity * 100.0); + writer->end_object(); // rowid_filter +} + + void Explain_table_access::print_explain_json(Explain_query *query, Json_writer *writer, bool is_analyze) @@ -1645,7 +1680,7 @@ void Explain_table_access::print_explain_json(Explain_query *query, /* `key_length` */ StringBuffer<64> key_len_str; - fill_key_len_str(&key_len_str); + fill_key_len_str(&key_len_str, true); if (key_len_str.length()) 
writer->add_member("key_length").add_str(key_len_str); @@ -1670,6 +1705,11 @@ void Explain_table_access::print_explain_json(Explain_query *query, if (!ref_list.is_empty()) print_json_array(writer, "ref", ref_list); + if (rowid_filter) + { + rowid_filter->print_explain_json(query, writer, is_analyze); + } + /* r_loops (not present in tabular output) */ if (is_analyze) { diff --git a/sql/sql_explain.h b/sql/sql_explain.h index 71f90477977..a161f6c1049 100644 --- a/sql/sql_explain.h +++ b/sql/sql_explain.h @@ -583,7 +583,8 @@ class Explain_index_use : public Sql_alloc { char *key_name; uint key_len; - uint key_len_for_filter; + char *filter_name; + uint filter_len; public: String_list key_parts_list; @@ -596,16 +597,43 @@ public: { key_name= NULL; key_len= (uint)-1; - key_len_for_filter= (uint)-1; + filter_name= NULL; + filter_len= (uint)-1; } bool set(MEM_ROOT *root, KEY *key_name, uint key_len_arg); bool set_pseudo_key(MEM_ROOT *root, const char *key_name); - void set_filter_key_length(uint key_length_arg) - { key_len_for_filter= key_length_arg; } inline const char *get_key_name() const { return key_name; } inline uint get_key_len() const { return key_len; } - inline uint get_filter_key_length() const { return key_len_for_filter; } + //inline const char *get_filter_name() const { return filter_name; } +}; + + +/* + Query Plan data structure for Rowid filter. +*/ +class Explain_rowid_filter : public Sql_alloc +{ +public: + /* Quick select used to collect the rowids into filter */ + Explain_quick_select *quick; + + /* How many rows the above quick select is expected to return */ + ha_rows rows; + + /* Expected selectivity for the filter */ + double selectivity; + + void print_explain_json(Explain_query *query, Json_writer *writer, + bool is_analyze); + + /* + TODO: + Here should be ANALYZE members: + - r_rows for the quick select + - An object that tracked the table access time + - real selectivity of the filter. + */ }; @@ -675,6 +703,7 @@ public: void print_json(Json_writer *writer, bool is_analyze); }; + /* EXPLAIN data structure for a single JOIN_TAB. */ @@ -695,7 +724,7 @@ public: pushed_index_cond(NULL), sjm_nest(NULL), pre_join_sort(NULL), - filter_perc(UINT_MAX) + rowid_filter(NULL) {} ~Explain_table_access() { delete sjm_nest; } @@ -802,12 +831,7 @@ public: Exec_time_tracker op_tracker; Table_access_tracker jbuf_tracker; - /** - How many rows are left after the filter was applied - to the initial rows count in percentages. 
- */ - double filter_perc; - inline bool is_filter_set() const { return (filter_perc != UINT_MAX); } + Explain_rowid_filter *rowid_filter; int print_explain(select_result_sink *output, uint8 explain_flags, bool is_analyze, @@ -819,7 +843,7 @@ public: private: void append_tag_name(String *str, enum explain_extra_tag tag); void fill_key_str(String *key_str, bool is_json) const; - void fill_key_len_str(String *key_len_str) const; + void fill_key_len_str(String *key_len_str, bool is_json) const; double get_r_filtered(); void tag_to_json(Json_writer *writer, enum explain_extra_tag tag); }; diff --git a/sql/sql_select.cc b/sql/sql_select.cc index 8f55497d8d0..b5b77c2c43b 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -101,6 +101,9 @@ static int sort_keyuse(KEYUSE *a,KEYUSE *b); static bool are_tables_local(JOIN_TAB *jtab, table_map used_tables); static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse, bool allow_full_scan, table_map used_tables); +static ha_rows get_quick_record_count(THD *thd, SQL_SELECT *select, + TABLE *table, + const key_map *keys,ha_rows limit); void best_access_path(JOIN *join, JOIN_TAB *s, table_map remaining_tables, uint idx, bool disable_jbuf, double record_count, @@ -1462,6 +1465,116 @@ int JOIN::optimize() } +/** + @brief + Create range filters objects needed in execution for all join tables + + @details + For each join table from the chosen execution plan such that a range filter + is used when joining this table the function creates a Rowid_filter object + for this range filter. In order to do this the function first constructs + a quick select to scan the range for this range filter. Then it creates + a container for the range filter and finally constructs a Range_rowid_filter + object a pointer to which is set in the field JOIN_TAB::rowid_filter of + the joined table. 
+ + @retval false always +*/ + +bool JOIN::make_range_rowid_filters() +{ + DBUG_ENTER("make_range_rowid_filters"); + + JOIN_TAB *tab; + + for (tab= first_linear_tab(this, WITH_BUSH_ROOTS, WITHOUT_CONST_TABLES); + tab; + tab= next_linear_tab(this, tab, WITH_BUSH_ROOTS)) + { + if (!tab->range_rowid_filter_info) + continue; + int err; + SQL_SELECT *sel= NULL; + Rowid_filter_container *filter_container= NULL; + Item **sargable_cond= get_sargable_cond(this, tab->table); + sel= make_select(tab->table, const_table_map, const_table_map, + *sargable_cond, (SORT_INFO*) 0, 1, &err); + if (!sel) + continue; + + key_map filter_map; + filter_map.clear_all(); + filter_map.set_bit(tab->range_rowid_filter_info->key_no); + filter_map.merge(tab->table->with_impossible_ranges); + bool force_index_save= tab->table->force_index; + tab->table->force_index= true; + (void) sel->test_quick_select(thd, filter_map, (table_map) 0, + (ha_rows) HA_POS_ERROR, + true, false, true); + tab->table->force_index= force_index_save; + if (thd->is_error()) + goto no_filter; + DBUG_ASSERT(sel->quick); + filter_container= + tab->range_rowid_filter_info->create_container(); + if (filter_container) + { + tab->rowid_filter= + new (thd->mem_root) Range_rowid_filter(tab->table, + tab->range_rowid_filter_info, + filter_container, sel); + if (tab->rowid_filter) + continue; + } + no_filter: + if (sel->quick) + delete sel->quick; + delete sel; + } + + DBUG_RETURN(0); +} + + +/** + @brief + Allocate memory the rowid containers of the used the range filters + + @details + For each join table from the chosen execution plan such that a range filter + is used when joining this table the function allocate memory for the + rowid container employed by the filter. On success it lets the table engine + know that what rowid filter will be used when accessing the table rows. 
+ + @retval false always +*/ + +bool +JOIN::init_range_rowid_filters() +{ + DBUG_ENTER("init_range_rowid_filters"); + + JOIN_TAB *tab; + + for (tab= first_linear_tab(this, WITH_BUSH_ROOTS, WITHOUT_CONST_TABLES); + tab; + tab= next_linear_tab(this, tab, WITH_BUSH_ROOTS)) + { + if (!tab->rowid_filter) + continue; + if (tab->rowid_filter->get_container()->alloc()) + { + delete tab->rowid_filter; + tab->rowid_filter= 0; + continue; + } + tab->table->file->rowid_filter_push(tab->rowid_filter); + tab->is_rowid_filter_built= false; + } + DBUG_RETURN(0); +} + + int JOIN::init_join_caches() { JOIN_TAB *tab; @@ -1980,6 +2093,9 @@ int JOIN::optimize_stage2() if (get_best_combination()) DBUG_RETURN(1); + if (make_range_rowid_filters()) + DBUG_RETURN(1); + if (select_lex->handle_derived(thd->lex, DT_OPTIMIZE)) DBUG_RETURN(1); @@ -2644,6 +2760,9 @@ int JOIN::optimize_stage2() if (init_join_caches()) DBUG_RETURN(1); + if (init_range_rowid_filters()) + DBUG_RETURN(1); + error= 0; if (select_options & SELECT_DESCRIBE) @@ -5035,9 +5154,8 @@ make_join_statistics(JOIN *join, List<TABLE_LIST> &tables_list, select->quick=0; impossible_range= records == 0 && s->table->reginfo.impossible_range; if (join->thd->lex->sql_command == SQLCOM_SELECT && - join->table_count > 1 && optimizer_flag(join->thd, OPTIMIZER_SWITCH_USE_ROWID_FILTER)) - s->table->select_usable_range_filters(join->thd); + s->table->init_cost_info_for_usable_range_rowid_filters(join->thd); } if (!impossible_range) { @@ -6843,14 +6961,14 @@ best_access_path(JOIN *join, double best_time= DBL_MAX; double records= DBL_MAX; table_map best_ref_depends_map= 0; - Range_filter_cost_info *best_filter= 0; + Range_rowid_filter_cost_info *best_filter= 0; double tmp; ha_rows rec; bool best_uses_jbuf= FALSE; MY_BITMAP *eq_join_set= &s->table->eq_join_set; KEYUSE *hj_start_key= 0; SplM_plan_info *spl_plan= 0; - Range_filter_cost_info *filter= 0; + Range_rowid_filter_cost_info *filter= 0; disable_jbuf= disable_jbuf || idx == join->const_tables; @@ -7243,11 +7361,18 @@ best_access_path(JOIN *join, loose_scan_opt.check_ref_access_part2(key, start_key, records, tmp); } /* not ft_key */ - filter= table->best_filter_for_current_join_order(start_key->key, - records, - record_count); - if (filter && (filter->get_filter_gain(record_count*records) < tmp)) - tmp= tmp - filter->get_filter_gain(record_count*records); + if (records < DBL_MAX) + { + double rows= record_count * records; + filter= + table->best_range_rowid_filter_for_partial_join(start_key->key, rows); + if (filter) + { + tmp-= filter->get_adjusted_gain(rows, s->worst_seeks) - + filter->get_cmp_gain(rows); + DBUG_ASSERT(tmp >= 0); + } + } if (tmp + 0.0001 < best_time - records/(double) TIME_FOR_COMPARE) { @@ -7352,6 +7477,7 @@ best_access_path(JOIN *join, Here we estimate its cost. 
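As a worked illustration of the gain accessors of Range_rowid_filter_cost_info used in best_access_path above (all numbers are made up): for a filter with a= 0.6, b= 100 and selectivity= 0.2 applied to rows= 1000 key tuples, get_gain() yields 1000*0.6 - 100 = 500; with worst_seeks= 2000 (>= rows) the adjustment term (1 - 0.2)*(1000 - MY_MIN(1000, 2000)) is zero, so get_adjusted_gain() is also 500; and with TIME_FOR_COMPARE= 5, get_cmp_gain() yields 1000*(1 - 0.2)/5 = 160, the cost-unit equivalent of the condition evaluations the filter saves.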
*/ + filter= 0; if (s->quick) { /* @@ -7367,6 +7493,19 @@ best_access_path(JOIN *join, (s->quick->read_time + (s->found_records - rnd_records)/(double) TIME_FOR_COMPARE); + if ( s->quick->get_type() == QUICK_SELECT_I::QS_TYPE_RANGE) + { + double rows= record_count * s->found_records; + uint key_no= s->quick->index; + filter= s->table->best_range_rowid_filter_for_partial_join(key_no, + rows); + if (filter) + { + tmp-= filter->get_gain(rows); + DBUG_ASSERT(tmp >= 0); + } + } + loose_scan_opt.check_range_access(join, idx, s->quick); } else @@ -7412,21 +7551,23 @@ best_access_path(JOIN *join, else tmp+= s->startup_cost; - filter= s->table->best_filter_for_current_join_order(MAX_KEY, - rnd_records, - record_count); - if (filter && (filter->get_filter_gain(record_count*rnd_records) < tmp)) - tmp= tmp - filter->get_filter_gain(record_count*rnd_records); - /* We estimate the cost of evaluating WHERE clause for found records as record_count * rnd_records / TIME_FOR_COMPARE. This cost plus tmp give us total cost of using TABLE SCAN */ + + double filter_cmp_gain= 0; + if (filter) + { + filter_cmp_gain= filter->get_cmp_gain(record_count * s->found_records); + } + if (best == DBL_MAX || (tmp + record_count/(double) TIME_FOR_COMPARE*rnd_records < (best_key->is_for_hash_join() ? best_time : - best + record_count/(double) TIME_FOR_COMPARE*records))) + best + record_count/(double) TIME_FOR_COMPARE*records - + filter_cmp_gain))) { /* If the table has a range (s->quick is set) make_join_select() @@ -7435,7 +7576,9 @@ best_access_path(JOIN *join, best= tmp; records= best_records; best_key= 0; - best_filter= filter; + best_filter= 0; + if (s->quick && s->quick->get_type() == QUICK_SELECT_I::QS_TYPE_RANGE) + best_filter= filter; /* range/index_merge/ALL/index access method are "independent", so: */ best_ref_depends_map= 0; best_uses_jbuf= MY_TEST(!disable_jbuf && !((s->table->map & @@ -7453,7 +7596,7 @@ best_access_path(JOIN *join, pos->loosescan_picker.loosescan_key= MAX_KEY; pos->use_join_buffer= best_uses_jbuf; pos->spl_plan= spl_plan; - pos->filter= best_filter; + pos->range_rowid_filter_info= best_filter; loose_scan_opt.save_to_position(s, loose_scan_pos); @@ -9719,7 +9862,7 @@ bool JOIN::get_best_combination() is_hash_join_key_no(j->ref.key)) hash_join= TRUE; - j->filter= best_positions[tablenr].filter; + j->range_rowid_filter_info= best_positions[tablenr].range_rowid_filter_info; loop_end: /* @@ -10782,6 +10925,11 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond) sel->quick=tab->quick; // Use value from get_quick_... sel->quick_keys.clear_all(); sel->needed_reg.clear_all(); + if (is_hj && tab->rowid_filter) + { + delete tab->rowid_filter; + tab->rowid_filter= 0; + } } else { @@ -12455,6 +12603,23 @@ bool error_if_full_join(JOIN *join) } +void JOIN_TAB::build_range_rowid_filter_if_needed() +{ + if (rowid_filter && !is_rowid_filter_built) + { + if (!rowid_filter->build()) + { + is_rowid_filter_built= true; + } + else + { + delete rowid_filter; + rowid_filter= 0; + } + } +} + + /** cleanup JOIN_TAB. @@ -12478,6 +12643,11 @@ void JOIN_TAB::cleanup() select= 0; delete quick; quick= 0; + if (rowid_filter) + { + delete rowid_filter; + rowid_filter= 0; + } if (cache) { cache->free(); @@ -12589,9 +12759,7 @@ ha_rows JOIN_TAB::get_examined_rows() double examined_rows; SQL_SELECT *sel= filesort? 
filesort->select : this->select; - if (filter) - examined_rows= records_read; - else if (sel && sel->quick && use_quick != 2) + if (sel && sel->quick && use_quick != 2) examined_rows= (double)sel->quick->records; else if (type == JT_NEXT || type == JT_ALL || type == JT_HASH || type ==JT_HASH_NEXT) @@ -19361,6 +19529,8 @@ sub_select(JOIN *join,JOIN_TAB *join_tab,bool end_of_records) if (!join_tab->preread_init_done && join_tab->preread_init()) DBUG_RETURN(NESTED_LOOP_ERROR); + join_tab->build_range_rowid_filter_if_needed(); + join->return_tab= join_tab; if (join_tab->last_inner) @@ -20305,6 +20475,8 @@ int join_init_read_record(JOIN_TAB *tab) if (tab->filesort && tab->sort_table()) // Sort table. return 1; + tab->build_range_rowid_filter_if_needed(); + DBUG_EXECUTE_IF("kill_join_init_read_record", tab->join->thd->set_killed(KILL_QUERY);); if (tab->select && tab->select->quick && tab->select->quick->reset()) @@ -20319,6 +20491,8 @@ int join_init_read_record(JOIN_TAB *tab) tab->join->thd->reset_killed();); if (!tab->preread_init_done && tab->preread_init()) return 1; + + if (init_read_record(&tab->read_record, tab->join->thd, tab->table, tab->select, tab->filesort_result, 1,1, FALSE)) return 1; @@ -22352,6 +22526,12 @@ check_reverse_order: tab->use_quick=1; tab->ref.key= -1; tab->ref.key_parts=0; // Don't use ref key. + tab->range_rowid_filter_info= 0; + if (tab->rowid_filter) + { + delete tab->rowid_filter; + tab->rowid_filter= 0; + } tab->read_first_record= join_init_read_record; if (tab->is_using_loose_index_scan()) tab->join->tmp_table_param.precomputed_group_by= TRUE; @@ -24977,11 +25157,13 @@ int print_explain_message_line(select_result_sink *result, item_list.push_back(item_null, mem_root); /* `rows` */ + StringBuffer<64> rows_str; if (rows) { - item_list.push_back(new (mem_root) Item_int(thd, *rows, - MY_INT64_NUM_DECIMAL_DIGITS), - mem_root); + rows_str.append_ulonglong((ulonglong)(*rows)); + item_list.push_back(new (mem_root) + Item_string_sys(thd, rows_str.ptr(), + rows_str.length()), mem_root); } else item_list.push_back(item_null, mem_root); @@ -25046,31 +25228,6 @@ int append_possible_keys(MEM_ROOT *alloc, String_list &list, TABLE *table, } -/** - This method saves the data that should be printed in EXPLAIN - if any filter was used for this table. -*/ - -bool JOIN_TAB::save_filter_explain_data(Explain_table_access *eta) -{ - if (!filter) - return 0; - KEY *pk_key= get_keyinfo_by_key_no(filter->key_no); - StringBuffer<64> buff_for_pk; - const char *tmp_buff; - buff_for_pk.append("filter:"); - tmp_buff= pk_key->name.str; - buff_for_pk.append(tmp_buff, strlen(tmp_buff), system_charset_info); - if (!(eta->ref_list.append_str(join->thd->mem_root, - buff_for_pk.c_ptr_safe()))) - return 1; - eta->key.set_filter_key_length(pk_key->key_length); - (filter->selectivity*100 >= 1) ? eta->filter_perc= round(filter->selectivity*100) : - eta->filter_perc= 1; - return 0; -} - - bool JOIN_TAB::save_explain_data(Explain_table_access *eta, table_map prefix_tables, bool distinct_arg, JOIN_TAB *first_top_tab) @@ -25106,7 +25263,7 @@ bool JOIN_TAB::save_explain_data(Explain_table_access *eta, filesort))) return 1; } - + // psergey-todo: data for filtering! 
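The two execution-stage hooks above (sub_select() and join_init_read_record() both call build_range_rowid_filter_if_needed() before the first row is fetched) implement a build-once, drop-on-failure pattern. A minimal standalone sketch of that pattern follows, with toy types standing in for JOIN_TAB and Rowid_filter; it mirrors the real JOIN_TAB::build_range_rowid_filter_if_needed() shown earlier in this diff.

/* Illustrative stand-ins; not the real JOIN_TAB / Rowid_filter classes. */
struct Toy_filter
{
  // Would run the range scan and fill the container; false means success.
  bool build() { return false; }
};

struct Toy_join_tab
{
  Toy_filter *rowid_filter= nullptr;
  bool is_rowid_filter_built= false;

  void build_range_rowid_filter_if_needed()
  {
    if (rowid_filter && !is_rowid_filter_built)
    {
      if (!rowid_filter->build())
        is_rowid_filter_built= true;   // filter is ready for the engine
      else
      {
        delete rowid_filter;           // could not fill it: fall back to
        rowid_filter= nullptr;         // the unfiltered access path
      }
    }
  }
};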
tracker= &eta->tracker; jbuf_tracker= &eta->jbuf_tracker; @@ -25207,6 +25364,20 @@ bool JOIN_TAB::save_explain_data(Explain_table_access *eta, // psergey-todo: ^ check for error return code /* Build "key", "key_len", and "ref" */ + + if (rowid_filter) + { + Range_rowid_filter *range_filter= (Range_rowid_filter *) rowid_filter; + QUICK_SELECT_I *quick= range_filter->get_select()->quick; + + Explain_rowid_filter *erf= new (thd->mem_root) Explain_rowid_filter; + erf->quick= quick->get_explain(thd->mem_root); + erf->selectivity= range_rowid_filter_info->selectivity; + erf->rows= quick->records; + eta->rowid_filter= erf; + //psergey-todo: also do setup for ANALYZE here. + } + if (tab_type == JT_NEXT) { key_info= table->key_info+index; @@ -25326,13 +25497,6 @@ bool JOIN_TAB::save_explain_data(Explain_table_access *eta, eta->filtered= f; } - if ((tab_select && tab_select->quick && tab_type != JT_CONST) || - (key_info && ref.key_parts && tab_type != JT_FT)) - { - if (save_filter_explain_data(eta)) - return 1; - } - /* Build "Extra" field and save it */ key_read= table->file->keyread_enabled(); if ((tab_type == JT_NEXT || tab_type == JT_CONST) && diff --git a/sql/sql_select.h b/sql/sql_select.h index 1d08b746279..3de0e894c93 100644 --- a/sql/sql_select.h +++ b/sql/sql_select.h @@ -510,9 +510,18 @@ typedef struct st_join_table { uint n_sj_tables; bool preread_init_done; - /* Copy of POSITION::filter, set by get_best_combination() */ - Range_filter_cost_info *filter; - Dynamic_array<char*> rowid_filter_pk; + + /* + Cost info to the range filter used when joining this join table + (Defined when the best join order has been already chosen) + */ + Range_rowid_filter_cost_info *range_rowid_filter_info; + /* Rowid filter to be used when joining this join table */ + Rowid_filter *rowid_filter; + /* Becomes true just after the used range filter has been built / filled */ + bool is_rowid_filter_built; + + void build_range_rowid_filter_if_needed(); void cleanup(); inline bool is_using_loose_index_scan() @@ -663,7 +672,6 @@ typedef struct st_join_table { SplM_plan_info *choose_best_splitting(double record_count, table_map remaining_tables); bool fix_splitting(SplM_plan_info *spl_plan, table_map remaining_tables); - bool save_filter_explain_data(Explain_table_access *eta); } JOIN_TAB; @@ -889,7 +897,8 @@ public: }; -class Range_filter_cost_info; +class Range_rowid_filter_cost_info; +class Rowid_filter; /** @@ -974,8 +983,10 @@ typedef struct st_position /* Info on splitting plan used at this position */ SplM_plan_info *spl_plan; - /* The index for which filter can be built */ - Range_filter_cost_info *filter; + + /* Cost info for the range filter used at this position */ + Range_rowid_filter_cost_info *range_rowid_filter_info; + } POSITION; typedef Bounds_checked_array<Item_null_result*> Item_null_array; @@ -1625,6 +1636,8 @@ public: bool optimize_unflattened_subqueries(); bool optimize_constant_subqueries(); int init_join_caches(); + bool make_range_rowid_filters(); + bool init_range_rowid_filters(); bool make_sum_func_list(List<Item> &all_fields, List<Item> &send_fields, bool before_group_by, bool recompute= FALSE); diff --git a/sql/structs.h b/sql/structs.h index 9ff52bccb40..12b5dee0055 100644 --- a/sql/structs.h +++ b/sql/structs.h @@ -27,6 +27,15 @@ #include "thr_lock.h" /* thr_lock_type */ #include "my_base.h" /* ha_rows, ha_key_alg */ #include <mysql_com.h> /* USERNAME_LENGTH */ +#include "sql_bitmap.h" + +#if MAX_INDEXES <= 64 +typedef Bitmap<64> key_map; /* Used for finding keys */ +#elif MAX_INDEXES 
> 128 +#error "MAX_INDEXES values greater than 128 is not supported." +#else +typedef Bitmap<((MAX_INDEXES+7)/8*8)> key_map; /* Used for finding keys */ +#endif struct TABLE; class Type_handler; @@ -111,6 +120,11 @@ typedef struct st_key { */ LEX_CSTRING name; key_part_map ext_key_part_map; + /* + Bitmap of indexes having common parts with this index + (only key parts from key definitions are taken into account) + */ + key_map overlapped; uint block_size; enum ha_key_alg algorithm; /* diff --git a/sql/table.cc b/sql/table.cc index 18be06c0b95..3df4c9f543a 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -1227,6 +1227,44 @@ static const Type_handler *old_frm_type_handler(uint pack_flag, return Type_handler::get_handler_by_real_type(field_type); } +/* Set overlapped bitmaps for each index */ + +void TABLE_SHARE::set_overlapped_keys() +{ + KEY *key1= key_info; + for (uint i= 0; i < keys; i++, key1++) + { + key1->overlapped.clear_all(); + key1->overlapped.set_bit(i); + } + key1= key_info; + for (uint i= 0; i < keys; i++, key1++) + { + KEY *key2= key1 + 1; + for (uint j= i+1; j < keys; j++, key2++) + { + KEY_PART_INFO *key_part1= key1->key_part; + uint n1= key1->user_defined_key_parts; + uint n2= key2->user_defined_key_parts; + for (uint k= 0; k < n1; k++, key_part1++) + { + KEY_PART_INFO *key_part2= key2->key_part; + for (uint l= 0; l < n2; l++, key_part2++) + { + if (key_part1->fieldnr == key_part2->fieldnr) + { + key1->overlapped.set_bit(j); + key2->overlapped.set_bit(i); + goto end_checking_overlap; + } + } + } + end_checking_overlap: + ; + } + } +} + /** Read data from a binary .frm file image into a TABLE_SHARE @@ -2522,6 +2560,8 @@ int TABLE_SHARE::init_from_binary_frm_image(THD *thd, bool write, null_length, 255); } + set_overlapped_keys(); + /* Handle virtual expressions */ if (vcol_screen_length && share->frm_version >= FRM_VER_EXPRESSSIONS) { @@ -4656,8 +4696,9 @@ void TABLE::init(THD *thd, TABLE_LIST *tl) created= TRUE; cond_selectivity= 1.0; cond_selectivity_sampling_explain= NULL; - best_filter_count= 0; - range_filter_cost_info_elements= 0; + range_rowid_filter_cost_info_elems= 0; + range_rowid_filter_cost_info_ptr= NULL; + range_rowid_filter_cost_info= NULL; #ifdef HAVE_REPLICATION /* used in RBR Triggers */ master_had_triggers= 0; diff --git a/sql/table.h b/sql/table.h index 3c782c34bc3..1b8b837c35b 100644 --- a/sql/table.h +++ b/sql/table.h @@ -55,7 +55,7 @@ class Virtual_column_info; class Table_triggers_list; class TMP_TABLE_PARAM; class SEQUENCE; -class Range_filter_cost_info; +class Range_rowid_filter_cost_info; /* Used to identify NESTED_JOIN structures within a join (applicable only to @@ -1002,6 +1002,8 @@ struct TABLE_SHARE /* frees the memory allocated in read_frm_image */ void free_frm_image(const uchar *frm); + + void set_overlapped_keys(); }; @@ -1193,7 +1195,14 @@ public: and max #key parts that range access would use. */ ha_rows quick_rows[MAX_KEY]; + uint quick_key_parts[MAX_KEY]; + double quick_costs[MAX_KEY]; + /* + If there is a range access by i-th index then the cost of + index only access for it is stored in quick_index_only_costs[i] + */ + double quick_index_only_costs[MAX_KEY]; /* Bitmaps of key parts that =const for the duration of join execution. 
If @@ -1202,10 +1211,7 @@ public: */ key_part_map const_key_parts[MAX_KEY]; - uint quick_key_parts[MAX_KEY]; uint quick_n_ranges[MAX_KEY]; - /* For each key I/O access cost is stored */ - double quick_key_io[MAX_KEY]; /* Estimate of number of records that satisfy SARGable part of the table @@ -1497,21 +1503,21 @@ public: double get_materialization_cost(); // Now used only if is_splittable()==true void add_splitting_info_for_key_field(struct KEY_FIELD *key_field); + key_map with_impossible_ranges; + + /* Number of cost info elements for possible range filters */ + uint range_rowid_filter_cost_info_elems; + /* Pointer to the array of cost info elements for range filters */ + Range_rowid_filter_cost_info *range_rowid_filter_cost_info; + /* The array of pointers to cost info elements for range filters */ + Range_rowid_filter_cost_info **range_rowid_filter_cost_info_ptr; + + void init_cost_info_for_usable_range_rowid_filters(THD *thd); + void prune_range_rowid_filters(); + Range_rowid_filter_cost_info * + best_range_rowid_filter_for_partial_join(uint access_key_no, + double records); - /** - Range filter info - */ - /* Minimum possible #T value to apply filter*/ - uint best_filter_count; - uint range_filter_cost_info_elements; - Range_filter_cost_info *range_filter_cost_info; - Range_filter_cost_info - *best_filter_for_current_join_order(uint ref_key_no, - double record_count, - double records); - void sort_range_filter_cost_info_array(); - void prune_range_filters(); - void select_usable_range_filters(THD *thd); /** System Versioning support */ |