diff options
author | Sergey Petrunya <psergey@askmonty.org> | 2010-11-15 04:45:49 +0300 |
---|---|---|
committer | Sergey Petrunya <psergey@askmonty.org> | 2010-11-15 04:45:49 +0300 |
commit | 9ab54acbf0a357584568633be616d3df4a7e1a2a (patch) | |
tree | 974cc4a097fa28046d414e29843034fff36849fe /sql | |
parent | 9441a9cc284b3e7b7d5582f071429340f41a3ca6 (diff) | |
parent | 3d698ac61618f658e5a25219d3eb3b29b451db00 (diff) | |
download | mariadb-git-9ab54acbf0a357584568633be616d3df4a7e1a2a.tar.gz |
Merge MWL#121-125 DS-MRR improvements into MWL#128 Classic hash join tree
Diffstat (limited to 'sql')
-rwxr-xr-x | sql/CMakeLists.txt | 1 | ||||
-rw-r--r-- | sql/Makefile.am | 1 | ||||
-rw-r--r-- | sql/filesort.cc | 15 | ||||
-rw-r--r-- | sql/handler.h | 29 | ||||
-rw-r--r-- | sql/multi_range_read.cc | 1077 | ||||
-rw-r--r-- | sql/multi_range_read.h | 585 | ||||
-rw-r--r-- | sql/mysql_priv.h | 22 | ||||
-rw-r--r-- | sql/mysqld.cc | 3 | ||||
-rw-r--r-- | sql/opt_index_cond_pushdown.cc | 1 | ||||
-rw-r--r-- | sql/opt_range.cc | 3 | ||||
-rw-r--r-- | sql/opt_range.h | 7 | ||||
-rw-r--r-- | sql/sql_join_cache.cc | 15 | ||||
-rw-r--r-- | sql/sql_lifo_buffer.h | 340 | ||||
-rw-r--r-- | sql/sql_select.cc | 4 | ||||
-rw-r--r-- | sql/sql_select.h | 1 |
15 files changed, 1857 insertions, 247 deletions
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt index 26957c5ea43..8fc32f1e0d1 100755 --- a/sql/CMakeLists.txt +++ b/sql/CMakeLists.txt @@ -63,6 +63,7 @@ SET (SQL_SOURCE sql_cache.cc sql_class.cc sql_client.cc sql_crypt.cc sql_crypt.h sql_cursor.cc sql_db.cc sql_delete.cc sql_derived.cc sql_do.cc sql_error.cc sql_handler.cc sql_help.cc sql_insert.cc + sql_lifo_buffer.h sql_join_cache.cc sql_lex.cc sql_list.cc sql_load.cc sql_manager.cc sql_map.cc sql_parse.cc sql_partition.cc sql_plugin.cc sql_prepare.cc sql_rename.cc diff --git a/sql/Makefile.am b/sql/Makefile.am index f7a3945311f..53d27300f74 100644 --- a/sql/Makefile.am +++ b/sql/Makefile.am @@ -66,6 +66,7 @@ noinst_HEADERS = item.h item_func.h item_sum.h item_cmpfunc.h \ log.h log_slow.h sql_show.h rpl_rli.h rpl_mi.h \ sql_select.h structs.h table.h sql_udf.h hash_filo.h \ lex.h lex_symbol.h sql_acl.h sql_crypt.h \ + sql_lifo_buffer.h \ sql_repl.h slave.h rpl_filter.h rpl_injector.h \ log_event.h rpl_record.h \ log_event_old.h rpl_record_old.h \ diff --git a/sql/filesort.cc b/sql/filesort.cc index aa808a5e6b5..1ee3972c4fd 100644 --- a/sql/filesort.cc +++ b/sql/filesort.cc @@ -543,11 +543,6 @@ static ha_rows find_all_keys(SORTPARAM *param, SQL_SELECT *select, current_thd->variables.read_buff_size); } - if (quick_select) - { - if (select->quick->reset()) - DBUG_RETURN(HA_POS_ERROR); - } /* Remember original bitmaps */ save_read_set= sort_form->read_set; @@ -561,9 +556,19 @@ static ha_rows find_all_keys(SORTPARAM *param, SQL_SELECT *select, if (select && select->cond) select->cond->walk(&Item::register_field_in_read_map, 1, (uchar*) sort_form); + if (select && select->pre_idx_push_select_cond) + select->pre_idx_push_select_cond->walk(&Item::register_field_in_read_map, + 1, (uchar*) sort_form); sort_form->column_bitmaps_set(&sort_form->tmp_set, &sort_form->tmp_set, &sort_form->tmp_set); + + if (quick_select) + { + if (select->quick->reset()) + DBUG_RETURN(HA_POS_ERROR); + } + for (;;) { if (quick_select) diff 
--git a/sql/handler.h b/sql/handler.h index 345411f6346..bf49698939c 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -1283,9 +1283,9 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, COST_VECT *cost); /* - The below two are not used (and not handled) in this milestone of this WL - entry because there seems to be no use for them at this stage of - implementation. + Indicates that all scanned ranges will be singlepoint (aka equality) ranges. + The ranges may not use the full key but all of them will use the same number + of key parts. */ #define HA_MRR_SINGLE_POINT 1 #define HA_MRR_FIXED_KEY 2 @@ -1327,6 +1327,16 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, */ #define HA_MRR_NO_NULL_ENDPOINTS 128 +/* + The MRR user has materialized range keys somewhere in the user's buffer. + This can be used for optimization of the procedure that sorts these keys + since in this case key values don't have to be copied into the MRR buffer. + + In other words, it is guaranteed that after RANGE_SEQ_IF::next() call the + pointer in range->start_key.key will point to a key value that will remain + there until the end of the MRR scan. +*/ +#define HA_MRR_MATERIALIZED_KEYS 256 /* @@ -1817,14 +1827,19 @@ public: inline int ha_index_first(uchar * buf); inline int ha_index_last(uchar * buf); inline int ha_index_next_same(uchar *buf, const uchar *key, uint keylen); + /* + TODO: should we make for those functions non-virtual ha_func_name wrappers, + too? 
+ */ virtual ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, void *seq_init_param, uint n_ranges, uint *bufsz, uint *flags, COST_VECT *cost); virtual ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys, - uint *bufsz, uint *flags, COST_VECT *cost); + uint key_parts, uint *bufsz, + uint *flags, COST_VECT *cost); virtual int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param, - uint n_ranges, uint mode, + uint n_ranges, uint mode, HANDLER_BUFFER *buf); virtual int multi_range_read_next(char **range_info); virtual int read_range_first(const key_range *start_key, @@ -2183,7 +2198,8 @@ public: TRUE if the engine supports virtual columns */ virtual bool check_if_supported_virtual_columns(void) { return FALSE;} - + + TABLE* get_table() { return table; } protected: /* deprecated, don't use in new engines */ inline void ha_statistic_increment(ulong SSV::*offset) const { } @@ -2376,7 +2392,6 @@ private: virtual int rename_partitions(const char *path) { return HA_ERR_WRONG_COMMAND; } friend class ha_partition; - friend class DsMrr_impl; public: /* XXX to be removed, see ha_partition::partition_ht() */ virtual handlerton *partition_ht() const diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc index 6eb1be7985c..0caa5e55e51 100644 --- a/sql/multi_range_read.cc +++ b/sql/multi_range_read.cc @@ -1,4 +1,5 @@ #include "mysql_priv.h" +#include <my_bit.h> #include "sql_select.h" /**************************************************************************** @@ -136,10 +137,16 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, */ ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows, - uint *bufsz, uint *flags, COST_VECT *cost) + uint key_parts, uint *bufsz, + uint *flags, COST_VECT *cost) { - *bufsz= 0; /* Default implementation doesn't need a buffer */ + /* + Currently we expect this function to be called only in preparation of scan + with HA_MRR_SINGLE_POINT property. 
+ */ + DBUG_ASSERT(*flags | HA_MRR_SINGLE_POINT); + *bufsz= 0; /* Default implementation doesn't need a buffer */ *flags |= HA_MRR_USE_DEFAULT_IMPL; cost->zero(); @@ -207,7 +214,6 @@ handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param, DBUG_RETURN(0); } - /** Get next record in MRR scan @@ -277,7 +283,432 @@ scan_it_again: } /**************************************************************************** - * DS-MRR implementation + * Mrr_*_reader classes (building blocks for DS-MRR) + ***************************************************************************/ + +int Mrr_simple_index_reader::init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, + uint mode, Buffer_manager *buf_manager_arg) +{ + HANDLER_BUFFER no_buffer = {NULL, NULL, NULL}; + h= h_arg; + return h->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, + mode, &no_buffer); +} + + +int Mrr_simple_index_reader::get_next(char **range_info) +{ + int res; + while (!(res= h->handler::multi_range_read_next(range_info))) + { + KEY_MULTI_RANGE *curr_range= &h->handler::mrr_cur_range; + if (!h->mrr_funcs.skip_index_tuple || + !h->mrr_funcs.skip_index_tuple(h->mrr_iter, curr_range->ptr)) + break; + } + return res; +} + + +/** + @brief Get next index record + + @param range_info OUT identifier of range that the returned record belongs to + + @note + We actually iterate over nested sequences: + - an ordered sequence of groups of identical keys + - each key group has key value, which has multiple matching records + - thus, each record matches all members of the key group + + @retval 0 OK, next record was successfully read + @retval HA_ERR_END_OF_FILE End of records + @retval Other Some other error +*/ + +int Mrr_ordered_index_reader::get_next(char **range_info) +{ + int res; + DBUG_ENTER("Mrr_ordered_index_reader::get_next"); + + if (!know_key_tuple_params) + { + /* + We're at the very start, haven't filled the buffer or even know what + will be 
there. Force the caller to call refill_buffer(): + */ + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + + while (1) + { + bool have_record= FALSE; + if (scanning_key_val_iter) + { + if ((res= kv_it.get_next())) + { + kv_it.close(); + scanning_key_val_iter= FALSE; + if ((res != HA_ERR_KEY_NOT_FOUND && res != HA_ERR_END_OF_FILE)) + DBUG_RETURN(res); + } + else + have_record= TRUE; + } + else + { + while ((res= kv_it.init(this))) + { + if ((res != HA_ERR_KEY_NOT_FOUND && res != HA_ERR_END_OF_FILE) || + key_buffer->is_empty()) + { + DBUG_RETURN(res); + } + } + scanning_key_val_iter= TRUE; + } + + if (have_record && + !skip_index_tuple(*(char**)cur_range_info) && + !skip_record(*(char**)cur_range_info, NULL)) + { + break; + } + /* Go get another (record, range_id) combination */ + } /* while */ + + memcpy(range_info, cur_range_info, sizeof(void*)); + DBUG_RETURN(0); +} + + +/** + Fill the buffer with (lookup_tuple, range_id) pairs and sort + + @note + We don't know lookup_tuple before we get the first key from + mrr_funcs.get_next(). Not knowing tuple length means we can't setup the + key buffer (in particular, which part of the buffer space it should occupy + when we have both key and rowid buffers). This problem is solved by having + know_key_tuple_params variabe, and buf_manager, which we ask to set/reset + buffers for us. +*/ + +int Mrr_ordered_index_reader::refill_buffer() +{ + int res; + KEY_MULTI_RANGE cur_range; + uchar **range_info_ptr= (uchar**)&cur_range.ptr; + uchar *key_ptr; + DBUG_ENTER("Mrr_ordered_index_reader::refill_buffer"); + + DBUG_ASSERT(!know_key_tuple_params || key_buffer->is_empty()); + if (know_key_tuple_params) + { + buf_manager->reset_buffer_sizes(); + key_buffer->reset(); + key_buffer->setup_writing(&key_ptr, keypar.key_size_in_keybuf, + is_mrr_assoc? 
(uchar**)&range_info_ptr : NULL, + sizeof(uchar*)); + } + + while ((!know_key_tuple_params || key_buffer->can_write()) && + !(res= mrr_funcs.next(mrr_iter, &cur_range))) + { + DBUG_ASSERT(cur_range.range_flag & EQ_RANGE); + + if (!know_key_tuple_params) + { + /* This only happens when we've just started filling the buffer */ + key_range *sample_key= &cur_range.start_key; + know_key_tuple_params= TRUE; + keypar.key_tuple_length= sample_key->length; + keypar.key_tuple_map= sample_key->keypart_map; + keypar.key_size_in_keybuf= keypar.use_key_pointers ? sizeof(char*) : keypar.key_tuple_length; + KEY *key_info= &h->get_table()->key_info[h->active_index]; + keypar.index_ranges_unique= test(key_info->flags & HA_NOSAME && + key_info->key_parts == + my_count_bits(sample_key->keypart_map)); + buf_manager->setup_buffer_sizes(keypar.key_size_in_keybuf, keypar.key_tuple_map); + key_buffer= buf_manager->get_key_buffer(); + key_buffer->setup_writing(&key_ptr, keypar.key_size_in_keybuf, + is_mrr_assoc? (uchar**)&range_info_ptr : NULL, + sizeof(uchar*)); + DBUG_ASSERT(key_buffer->can_write()); + } + + /* Put key, or {key, range_id} pair into the buffer */ + if (keypar.use_key_pointers) + key_ptr=(uchar*) &cur_range.start_key.key; + else + key_ptr=(uchar*) cur_range.start_key.key; + + key_buffer->write(); + } + + bool no_more_keys= test(res); + scanning_key_val_iter= FALSE; + + if (no_more_keys && (!know_key_tuple_params || key_buffer->is_empty())) + DBUG_RETURN(HA_ERR_END_OF_FILE); + + key_buffer->sort((key_buffer->type() == Lifo_buffer::FORWARD)? 
+ (qsort2_cmp)Mrr_ordered_index_reader::key_tuple_cmp_reverse : + (qsort2_cmp)Mrr_ordered_index_reader::key_tuple_cmp, + (void*)this); + DBUG_RETURN(0); +} + + +int Mrr_ordered_index_reader::init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, + uint mode, Buffer_manager *buf_manager_arg) +{ + h= h_arg; + mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode); + keypar.use_key_pointers= test(mode & HA_MRR_MATERIALIZED_KEYS); + is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION); + mrr_funcs= *seq_funcs; + know_key_tuple_params= FALSE; + buf_manager= buf_manager_arg; + /* + Short: don't do identical key handling when we have a pushed index + condition. + + Long: In order to check pushed index condition, we need to have both + index tuple table->record[0] and range_id. + + Key_value_records_iterator has special handling for case when we have + multiple (key_value, range_id) pairs with the same key_value. In that + case it will make an index lookup only for the first such element, + for subsequent elements it will only return the new range_id. + + The problem here is that h->table->record[0] is shared with the part that + does full record retrieval with rnd_pos() calls, and if we have the + following scenario: + + 1. We scan ranges {(key_value, range_id1), (key_value, range_id2)} + 2. Iterator makes a lookup with key_value, produces the (index_tuple, + range_id1) pair. Index tuple is read into table->record[0], which + allows us to check index condition. + 3. At this point, we figure that key buffer is full, so we sort it, + and return control to Mrr_ordered_rndpos_reader. + 3.1 Mrr_ordered_rndpos_reader gets rowids and makes rnd_pos() calls, which + puts some arbitrary data into table->record[0] in the process. + 3.2 We ask the iterator for the next (rowid, range_id) pair. 
The iterator + puts in range_id2, and that shuld be sufficient (this is identical key + handling at work) + However, index tuple in table->record[0] has been destroyed and we + can't check index conditon for (index_tuple, range_id2) now. + + TODO: It is possible to support identical key handling and index condition + pushdown, working together (one possible solution is to save/restore the + contents of table->record[0]). We will probably implement that. + + */ + disallow_identical_key_handling= test(mrr_funcs.skip_index_tuple); + return 0; +} + + +static int rowid_cmp_reverse(void *h, uchar *a, uchar *b) +{ + return - ((handler*)h)->cmp_ref(a, b); +} + + +int Mrr_ordered_rndpos_reader::init(handler *h_arg, + Mrr_index_reader *index_reader_arg, + uint mode, + Lifo_buffer *buf) +{ + h= h_arg; + index_reader= index_reader_arg; + rowid_buffer= buf; + is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION); + index_reader_exhausted= FALSE; + return 0; +} + + +/** + DS-MRR: Fill and sort the rowid buffer + + Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into + buffer. When the buffer is full or scan is completed, sort the buffer by + rowid and return. + + When this function returns, either rowid buffer is not empty, or the source + of lookup keys (i.e. ranges) is exhaused. + + dsmrr_eof is set to indicate whether we've exhausted the list of ranges we're + scanning. This function never returns HA_ERR_END_OF_FILE. 
+ + @retval 0 OK, the next portion of rowids is in the buffer, + properly ordered + @retval other Error +*/ + +int Mrr_ordered_rndpos_reader::refill_buffer() +{ + int res; + DBUG_ENTER("Mrr_ordered_rndpos_reader::refill_buffer"); + + if (index_reader_exhausted) + DBUG_RETURN(HA_ERR_END_OF_FILE); + + while ((res= refill_from_key_buffer() == HA_ERR_END_OF_FILE)) + { + if ((res= index_reader->refill_buffer())) + { + if (res == HA_ERR_END_OF_FILE) + index_reader_exhausted= TRUE; + break; + } + } + DBUG_RETURN(res); +} + + +void Mrr_index_reader::position() +{ + h->position(h->get_table()->record[0]); +} + + +/* + @brief Try to refill the rowid buffer without calling + index_reader->refill_buffer(). +*/ + +int Mrr_ordered_rndpos_reader::refill_from_key_buffer() +{ + char *range_info; + uchar **range_info_ptr= (uchar**)&range_info; + int res; + DBUG_ENTER("Mrr_ordered_rndpos_reader::refill_from_key_buffer"); + + DBUG_ASSERT(rowid_buffer->is_empty()); + index_rowid= index_reader->get_rowid_ptr(); + rowid_buffer->reset(); + rowid_buffer->setup_writing(&index_rowid, h->ref_length, + is_mrr_assoc? (uchar**)&range_info_ptr: NULL, + sizeof(void*)); + + last_identical_rowid= NULL; + + while (rowid_buffer->can_write()) + { + res= index_reader->get_next(&range_info); + + if (res) + break; + + /* Put rowid, or {rowid, range_id} pair into the buffer */ + index_reader->position(); + + rowid_buffer->write(); + } + + /* Sort the buffer contents by rowid */ + rowid_buffer->sort((qsort2_cmp)rowid_cmp_reverse, (void*)h); + + rowid_buffer->setup_reading(&rowid, h->ref_length, + is_mrr_assoc? (uchar**)&rowids_range_id: NULL, + sizeof(void*)); + DBUG_RETURN(rowid_buffer->is_empty()? HA_ERR_END_OF_FILE : 0); +} + + +/* + Get the next {record, range_id} using ordered array of rowid+range_id pairs + + @note + Since we have sorted rowids, we try not to make multiple rnd_pos() calls + with the same rowid value. 
+*/ + +int Mrr_ordered_rndpos_reader::get_next(char **range_info) +{ + int res; + + while (last_identical_rowid) + { + /* + Current record (the one we've returned in previous call) was obtained + from a rowid that matched multiple range_ids. Return this record again, + with next matching range_id. + */ + bool UNINIT_VAR(bres); + bres= rowid_buffer->read(); + DBUG_ASSERT(!bres); + + if (is_mrr_assoc) + memcpy(range_info, rowids_range_id, sizeof(uchar*)); + + if (rowid == last_identical_rowid) + { + last_identical_rowid= NULL; /* reached the last of identical rowids */ + } + + if (!index_reader->skip_record((char*)*range_info, rowid)) + { + return 0; + } + } + + while (1) + { + last_identical_rowid= NULL; + + /* Return eof if there are no rowids in the buffer after re-fill attempt */ + if (rowid_buffer->read()) + return HA_ERR_END_OF_FILE; + + if (is_mrr_assoc) + { + memcpy(range_info, rowids_range_id, sizeof(uchar*)); + } + + if (index_reader->skip_record(*range_info, rowid)) + continue; + + res= h->ha_rnd_pos(h->get_table()->record[0], rowid); + + if (res == HA_ERR_RECORD_DELETED) + continue; + + /* + Check if subsequent buffer elements have the same rowid value as this + one. If yes, remember this fact so that we don't make any more rnd_pos() + calls with this value. + */ + if (!res) + { + uchar *cur_rowid= rowid; + /* + Note: this implies that SQL layer doesn't touch table->record[0] + between calls. + */ + Lifo_buffer_iterator it; + it.init(rowid_buffer); + while (!it.read()) // reads to (rowid, ...) 
+ { + if (h->cmp_ref(rowid, cur_rowid)) + break; + last_identical_rowid= rowid; + } + } + return 0; + } + + return res; +} + + +/**************************************************************************** + * Top-level DS-MRR implementation functions (the ones called by storage engine) ***************************************************************************/ /** @@ -302,9 +733,8 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, void *seq_init_param, uint n_ranges, uint mode, HANDLER_BUFFER *buf) { - uint elem_size; - Item *pushed_cond= NULL; - handler *new_h2= 0; + THD *thd= current_thd; + int res; DBUG_ENTER("DsMrr_impl::dsmrr_init"); /* @@ -312,50 +742,135 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, has not been called, so set the owner handler here as well. */ h= h_arg; - if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED) + is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION); + + if ((mode & HA_MRR_USE_DEFAULT_IMPL) || (mode & HA_MRR_SORTED)) { - use_default_impl= TRUE; - const int retval= - h->handler::multi_range_read_init(seq_funcs, seq_init_param, - n_ranges, mode, buf); - DBUG_RETURN(retval); + DBUG_ASSERT(h->inited == handler::INDEX); + Mrr_simple_index_reader *s= &reader_factory.simple_index_reader; + res= s->init(h, seq_funcs, seq_init_param, n_ranges, mode, this); + strategy= s; + DBUG_RETURN(res); } - rowids_buf= buf->buffer; + + /* Neither of strategies used below can handle sorting */ + DBUG_ASSERT(!(mode & HA_MRR_SORTED)); - is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION); + /* + Determine whether we'll need to do key sorting and/or rnd_pos() scan + */ + index_strategy= NULL; + Mrr_ordered_index_reader *ordered_idx_reader= NULL; + if ((mode & HA_MRR_SINGLE_POINT) && + optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS)) + { + index_strategy= ordered_idx_reader= &reader_factory.ordered_index_reader; + } + else + index_strategy= &reader_factory.simple_index_reader; + + strategy= 
index_strategy; + /* + We don't need a rowid-to-rndpos step if + - We're doing a scan on clustered primary key + - [In the future] We're doing an index_only read + */ + DBUG_ASSERT(h->inited == handler::INDEX || + (h->inited == handler::RND && h2 && + h2->inited == handler::INDEX)); + + handler *h_idx= (h->inited == handler::INDEX)? h: h2; + keyno= h_idx->active_index; + + Mrr_ordered_rndpos_reader *disk_strategy= NULL; + if (!(keyno == table->s->primary_key && h_idx->primary_key_is_clustered())) + { + strategy= disk_strategy= &reader_factory.ordered_rndpos_reader; + } if (is_mrr_assoc) - status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count); - - rowids_buf_end= buf->buffer_end; - elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*); - rowids_buf_last= rowids_buf + - ((rowids_buf_end - rowids_buf)/ elem_size)* - elem_size; - rowids_buf_end= rowids_buf_last; + status_var_increment(thd->status_var.ha_multi_range_read_init_count); + full_buf= buf->buffer; + full_buf_end= buf->buffer_end; + + if (strategy == index_strategy) + { + /* Index strategy serves it all. We don't need two handlers, etc */ + /* Give the buffer to index strategy */ + if ((res= index_strategy->init(h, seq_funcs, seq_init_param, n_ranges, + mode, this))) + goto error; + } + else + { /* - There can be two cases: - - This is the first call since index_init(), h2==NULL - Need to setup h2 then. - - This is not the first call, h2 is initalized and set up appropriately. - The caller might have called h->index_init(), need to switch h to - rnd_pos calls. + If we got here the request is served by both index and rndpos strategies + working together. 
+ + */ + rowid_buffer.set_buffer_space(buf->buffer, buf->buffer_end); + + if ((res= setup_two_handlers())) + DBUG_RETURN(res); + + if ((res= index_strategy->init(h2, seq_funcs, seq_init_param, n_ranges, + mode, this)) || + (res= disk_strategy->init(h, index_strategy, mode, &rowid_buffer))) + { + goto error; + } + } + + res= strategy->refill_buffer(); + if (res && res != HA_ERR_END_OF_FILE) //psergey-todo: remove EOF check here + goto error; + + /* + If we have scanned through all intervals in *seq, then adjust *buf to + indicate that the remaining buffer space will not be used. */ +// if (dsmrr_eof) +// buf->end_of_used_area= rowid_buffer.end_of_space(); + + + DBUG_RETURN(0); +error: + close_second_handler(); + strategy= NULL; + DBUG_RETURN(1); +} + + +/* + Whatever the current state is, make it so that we have two handler objects: + - h (the primary) - initialized for rnd_pos() scan + - h2 (the secondary) - initialized for scanning the index specified in + this->keyno + RETURN + 0 OK + HA_XXX Error code +*/ + +int DsMrr_impl::setup_two_handlers() +{ + int res; + THD *thd= current_thd; + DBUG_ENTER("DsMrr_impl::setup_two_handlers"); if (!h2) { - /* Create a separate handler object to do rndpos() calls. */ - THD *thd= current_thd; + handler *new_h2; + Item *pushed_cond= NULL; + DBUG_ASSERT(h->inited == handler::INDEX); + /* Create a separate handler object to do rnd_pos() calls. */ /* ::clone() takes up a lot of stack, especially on 64 bit platforms. The constant 5 is an empiric result. */ if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2)) DBUG_RETURN(1); - DBUG_ASSERT(h->active_index != MAX_KEY); - uint mrr_keyno= h->active_index; - /* Create a separate handler object to do rndpos() calls. */ + /* Create a separate handler object to do rnd_pos() calls. 
*/ if (!(new_h2= h->clone(thd->mem_root)) || new_h2->ha_external_lock(thd, F_RDLCK)) { @@ -363,88 +878,69 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, DBUG_RETURN(1); } - if (mrr_keyno == h->pushed_idx_cond_keyno) + if (keyno == h->pushed_idx_cond_keyno) pushed_cond= h->pushed_idx_cond; - + + Mrr_reader *save_strategy= strategy; + strategy= NULL; /* Caution: this call will invoke this->dsmrr_close(). Do not put the - created secondary table handler into this->h2 or it will delete it. + created secondary table handler new_h2 into this->h2 or it will delete + it. Also, save the picked strategy */ - if (h->ha_index_end()) - { - h2=new_h2; - goto error; - } + res= h->ha_index_end(); + strategy= save_strategy; h2= new_h2; /* Ok, now can put it into h2 */ + + if (res || (res= (h->ha_rnd_init(FALSE)))) + goto error; + table->prepare_for_position(); h2->extra(HA_EXTRA_KEYREAD); - - if (h2->ha_index_init(mrr_keyno, FALSE)) + h2->mrr_iter= h->mrr_iter; + + if ((res= h2->ha_index_init(keyno, FALSE))) goto error; - use_default_impl= FALSE; if (pushed_cond) - h2->idx_cond_push(mrr_keyno, pushed_cond); + h2->idx_cond_push(keyno, pushed_cond); } else { + DBUG_ASSERT(h2 && h2->inited==handler::INDEX); /* We get here when the access alternates betwen MRR scan(s) and non-MRR scans. Calling h->index_end() will invoke dsmrr_close() for this object, - which will delete h2. We need to keep it, so save put it away and dont + which will delete h2. 
We need to keep it, so put it away and dont let it be deleted: */ - handler *save_h2= h2; - h2= NULL; - int res= (h->inited == handler::INDEX && h->ha_index_end()); - h2= save_h2; - use_default_impl= FALSE; - if (res) + if (h->inited == handler::INDEX) + { + handler *save_h2= h2; + Mrr_reader *save_strategy= strategy; + h2= NULL; + strategy= NULL; + res= h->ha_index_end(); + h2= save_h2; + strategy= save_strategy; + if (res) + goto error; + } + if ((h->inited != handler::RND) && h->ha_rnd_init(FALSE)) goto error; } - - if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, - mode, buf) || - dsmrr_fill_buffer()) - { - goto error; - } - /* - If the above call has scanned through all intervals in *seq, then - adjust *buf to indicate that the remaining buffer space will not be used. - */ - if (dsmrr_eof) - buf->end_of_used_area= rowids_buf_last; - - /* - h->inited == INDEX may occur when 'range checked for each record' is - used. - */ - if ((h->inited != handler::RND) && - ((h->inited==handler::INDEX? 
h->ha_index_end(): FALSE) || - (h->ha_rnd_init(FALSE)))) - goto error; - - use_default_impl= FALSE; - h->mrr_funcs= *seq_funcs; - DBUG_RETURN(0); + error: - h2->ha_index_or_rnd_end(); - h2->ha_external_lock(current_thd, F_UNLCK); - h2->close(); - delete h2; - h2= NULL; - DBUG_RETURN(1); + DBUG_RETURN(res); } -void DsMrr_impl::dsmrr_close() +void DsMrr_impl::close_second_handler() { - DBUG_ENTER("DsMrr_impl::dsmrr_close"); if (h2) { h2->ha_index_or_rnd_end(); @@ -453,128 +949,301 @@ void DsMrr_impl::dsmrr_close() delete h2; h2= NULL; } - use_default_impl= TRUE; +} + + +void DsMrr_impl::dsmrr_close() +{ + DBUG_ENTER("DsMrr_impl::dsmrr_close"); + close_second_handler(); + strategy= NULL; DBUG_VOID_RETURN; } -static int rowid_cmp(void *h, uchar *a, uchar *b) +/* + my_qsort2-compatible function to compare key tuples +*/ + +int Mrr_ordered_index_reader::key_tuple_cmp(void* arg, uchar* key1, uchar* key2) +{ + Mrr_ordered_index_reader *this_= (Mrr_ordered_index_reader*)arg; + TABLE *table= this_->h->get_table(); + int res; + KEY_PART_INFO *part= table->key_info[this_->h->active_index].key_part; + + if (this_->keypar.use_key_pointers) + { + /* the buffer stores pointers to keys, get to the keys */ + key1= *((uchar**)key1); + key2= *((uchar**)key2); // todo is this alignment-safe? + } + + uchar *key1_end= key1 + this_->keypar.key_tuple_length; + + while (key1 < key1_end) + { + Field* f = part->field; + int len = part->store_length; + if (part->null_bit) + { + if (*key1) // key1 == NULL + { + if (!*key2) // key1(NULL) < key2(notNULL) + return -1; + goto equals; + } + else if (*key2) // key1(notNULL) > key2 (NULL) + return 1; + // Step over NULL byte for f->cmp(). 
+ key1++; + key2++; + len--; + } + + if ((res= f->key_cmp(key1, key2))) + return res; +equals: + key1 += len; + key2 += len; + part++; + } + return 0; +} + + +int Mrr_ordered_index_reader::key_tuple_cmp_reverse(void* arg, uchar* key1, + uchar* key2) { - return ((handler*)h)->cmp_ref(a, b); + return -key_tuple_cmp(arg, key1, key2); } /** - DS-MRR: Fill the buffer with rowids and sort it by rowid + Setup key/rowid buffer sizes based on sample_key and its length. + + @param + sample_key A lookup key to use as a sample. It is assumed that + all other keys will have the same length/etc. + @note + This function must be called when all buffers are empty +*/ - {This is an internal function of DiskSweep MRR implementation} - Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into - buffer. When the buffer is full or scan is completed, sort the buffer by - rowid and return. +void DsMrr_impl::setup_buffer_sizes(uint key_size_in_keybuf, + key_part_map key_tuple_map) +{ + uint key_buff_elem_size= key_size_in_keybuf + + (int)is_mrr_assoc * sizeof(void*); + + KEY *key_info= &h->get_table()->key_info[keyno]; + if (strategy == index_strategy) + { + /* Give all space to the key buffer, key buffer must be forward */ + key_buffer= &forward_key_buf; + key_buffer->set_buffer_space(full_buf, full_buf_end); + DBUG_ASSERT(key_buffer->have_space_for(key_buff_elem_size)); + + /* Just in case, tell rowid buffer that it has zero size: */ + rowid_buffer.set_buffer_space(full_buf_end, full_buf_end); + return; + } - The function assumes that rowids buffer is empty when it is invoked. + /* + Ok if we got here we need to allocate one part of the buffer + for keys and another part for rowids. + */ + uint rowid_buf_elem_size= h->ref_length + + (int)is_mrr_assoc * sizeof(char*); - @param h Table handler + /* + Use rec_per_key statistics as a basis to find out how many rowids + we'll get for each key value. + TODO: are we guaranteed to get r_p_c==1 for unique keys? 
+ TODO: what should be the default value to use when there is no + statistics? + */ + uint parts= my_count_bits(key_tuple_map); + ulong rpc; + if ((rpc= key_info->rec_per_key[parts - 1])) + { + rowid_buf_elem_size *= rpc; + } - @retval 0 OK, the next portion of rowids is in the buffer, - properly ordered - @retval other Error -*/ + double fraction_for_rowids= + ((double) rowid_buf_elem_size / + ((double)rowid_buf_elem_size + key_buff_elem_size)); -int DsMrr_impl::dsmrr_fill_buffer() -{ - char *range_info; - int res; - DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer"); + size_t bytes_for_rowids= + round(fraction_for_rowids * (full_buf_end - full_buf)); + + uint bytes_for_keys= (full_buf_end - full_buf) - bytes_for_rowids; - rowids_buf_cur= rowids_buf; - while ((rowids_buf_cur < rowids_buf_end) && - !(res= h2->handler::multi_range_read_next(&range_info))) + if (bytes_for_keys < key_buff_elem_size + 1) { - KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range; - if (h2->mrr_funcs.skip_index_tuple && - h2->mrr_funcs.skip_index_tuple(h2->mrr_iter, curr_range->ptr)) - continue; - - /* Put rowid, or {rowid, range_id} pair into the buffer */ - h2->position(table->record[0]); - memcpy(rowids_buf_cur, h2->ref, h2->ref_length); - rowids_buf_cur += h2->ref_length; + uint add= key_buff_elem_size + 1 - bytes_for_keys; + bytes_for_keys= key_buff_elem_size + 1; + bytes_for_rowids -= add; + DBUG_ASSERT(bytes_for_rowids >= rowid_buf_elem_size + 1); + } - if (is_mrr_assoc) - { - memcpy(rowids_buf_cur, &range_info, sizeof(void*)); - rowids_buf_cur += sizeof(void*); - } + if (bytes_for_rowids < rowid_buf_elem_size + 1) + { + uint add= rowid_buf_elem_size + 1 - bytes_for_rowids; + bytes_for_rowids= rowid_buf_elem_size + 1; + bytes_for_keys -= add; + DBUG_ASSERT(bytes_for_keys >= key_buff_elem_size + 1); } - if (res && res != HA_ERR_END_OF_FILE) - DBUG_RETURN(res); - dsmrr_eof= test(res == HA_ERR_END_OF_FILE); + rowid_buffer_end= full_buf + bytes_for_rowids; + 
rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end); + key_buffer= &backward_key_buf; + key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end); + DBUG_ASSERT(key_buffer->have_space_for(key_buff_elem_size)); + DBUG_ASSERT(rowid_buffer.have_space_for(rowid_buf_elem_size)); +} - /* Sort the buffer contents by rowid */ - uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*); - uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size; - - my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp, - (void*)h); - rowids_buf_last= rowids_buf_cur; - rowids_buf_cur= rowids_buf; - DBUG_RETURN(0); + +void DsMrr_impl::reset_buffer_sizes() +{ + if (strategy != index_strategy) + { + /* + Ok we have both ordered index reader and there is a disk reader. + Redistribute the buffer space. + */ + rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end); + key_buffer= &backward_key_buf; + key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end); + } } -/** - DS-MRR implementation: multi_range_read_next() function +/* + Take unused space from the key buffer and give it to the rowid buffer */ -int DsMrr_impl::dsmrr_next(char **range_info) +void DsMrr_impl::redistribute_buffer_space() +{ + uchar *unused_start, *unused_end; + key_buffer->remove_unused_space(&unused_start, &unused_end); + rowid_buffer.grow(unused_start, unused_end); +} + + +/* + @brief Initialize the iterator + + @note + Initialize the iterator to produce matches for the key of the first element + in owner_arg->key_buffer + + @retval 0 OK + @retval HA_ERR_END_OF_FILE Either the owner->key_buffer is empty or + no matches for the key we've tried (check + key_buffer->is_empty() to tell these apart) + @retval other code Fatal error +*/ + +int Key_value_records_iterator::init(Mrr_ordered_index_reader *owner_arg) { int res; - uchar *cur_range_info= 0; - uchar *rowid; + owner= owner_arg; + + identical_key_it.init(owner->key_buffer); + /* Get the first pair into (cur_index_tuple, cur_range_info)
*/ + owner->key_buffer->setup_reading(&cur_index_tuple, + owner->keypar.key_size_in_keybuf, + owner->is_mrr_assoc? + (uchar**)&owner->cur_range_info: NULL, + sizeof(void*)); + + if (identical_key_it.read()) + return HA_ERR_END_OF_FILE; - if (use_default_impl) - return h->handler::multi_range_read_next(range_info); + uchar *key_in_buf= cur_index_tuple; + + last_identical_key_ptr= cur_index_tuple; + if (owner->keypar.use_key_pointers) + cur_index_tuple= *((uchar**)cur_index_tuple); - do + /* Check out how many more identical keys are following */ + uchar *save_cur_index_tuple= cur_index_tuple; + while (!identical_key_it.read()) { - if (rowids_buf_cur == rowids_buf_last) - { - if (dsmrr_eof) - { - res= HA_ERR_END_OF_FILE; - goto end; - } - res= dsmrr_fill_buffer(); - if (res) - goto end; - } - - /* return eof if there are no rowids in the buffer after re-fill attempt */ - if (rowids_buf_cur == rowids_buf_last) + if (owner->disallow_identical_key_handling || + Mrr_ordered_index_reader::key_tuple_cmp(owner, key_in_buf, + cur_index_tuple)) + break; + last_identical_key_ptr= cur_index_tuple; + } + identical_key_it.init(owner->key_buffer); + cur_index_tuple= save_cur_index_tuple; + res= owner->h->ha_index_read_map(owner->h->get_table()->record[0], + cur_index_tuple, + owner->keypar.key_tuple_map, + HA_READ_KEY_EXACT); + + if (res) + { + close(); + return res; + } + get_next_row= FALSE; + return 0; +} + + +int Key_value_records_iterator::get_next() +{ + int res; + + if (get_next_row) + { + if (owner->keypar.index_ranges_unique) + return HA_ERR_END_OF_FILE; /* Max one match */ + + handler *h= owner->h; + if ((res= h->ha_index_next_same(h->get_table()->record[0], + cur_index_tuple, + owner->keypar.key_tuple_length))) { - res= HA_ERR_END_OF_FILE; - goto end; + /* It's either HA_ERR_END_OF_FILE or some other error */ + return res; } - rowid= rowids_buf_cur; + identical_key_it.init(owner->key_buffer); + get_next_row= FALSE; + } - if (is_mrr_assoc) - memcpy(&cur_range_info, 
rowids_buf_cur + h->ref_length, sizeof(uchar**)); + identical_key_it.read(); /* This gets us next range_id */ + if (!last_identical_key_ptr || (cur_index_tuple == last_identical_key_ptr)) + { + get_next_row= TRUE; + } + return 0; +} - rowids_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc); - if (h2->mrr_funcs.skip_record && - h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid)) - continue; - res= h->ha_rnd_pos(table->record[0], rowid); - break; - } while (true); - - if (is_mrr_assoc) + +void Key_value_records_iterator::close() +{ + while (!owner->key_buffer->read() && + (cur_index_tuple != last_identical_key_ptr)) {} +} + + +/** + DS-MRR implementation: multi_range_read_next() function. + + Calling convention is like multi_range_read_next() has. +*/ + +int DsMrr_impl::dsmrr_next(char **range_info) +{ + int res; + while ((res= strategy->get_next(range_info)) == HA_ERR_END_OF_FILE) { - memcpy(range_info, rowid + h->ref_length, sizeof(void*)); + if ((res= strategy->refill_buffer())) + break; /* EOF or error */ } -end: return res; } @@ -582,7 +1251,8 @@ end: /** DS-MRR implementation: multi_range_read_info() function */ -ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, +ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, + uint key_parts, uint *bufsz, uint *flags, COST_VECT *cost) { ha_rows res; @@ -590,8 +1260,8 @@ ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, uint def_bufsz= *bufsz; /* Get cost/flags/mem_usage of default MRR implementation */ - res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz, - &def_flags, cost); + res= h->handler::multi_range_read_info(keyno, n_ranges, rows, key_parts, + &def_bufsz, &def_flags, cost); DBUG_ASSERT(!res); if ((*flags & HA_MRR_USE_DEFAULT_IMPL) || @@ -683,7 +1353,29 @@ bool key_uses_partial_cols(TABLE *table, uint keyno) return FALSE; } -/** + +/* + Check if key/flags allow DS-MRR/CPK strategy to be used + + @param thd 
+ @param keyno Index that will be used + @param mrr_flags + + @retval TRUE DS-MRR/CPK should be used + @retval FALSE Otherwise +*/ + +bool DsMrr_impl::check_cpk_scan(THD *thd, uint keyno, uint mrr_flags) +{ + return test((mrr_flags & HA_MRR_SINGLE_POINT) && // check + // !(mrr_flags & HA_MRR_SORTED) && + keyno == table->s->primary_key && + h->primary_key_is_clustered() && + optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS)); //check +} + + +/* DS-MRR Internals: Choose between Default MRR implementation and DS-MRR Make the choice between using Default MRR implementation and DS-MRR. @@ -706,21 +1398,25 @@ bool key_uses_partial_cols(TABLE *table, uint keyno) @retval FALSE DS-MRR implementation should be used */ + bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, COST_VECT *cost) { COST_VECT dsmrr_cost; bool res; THD *thd= current_thd; + + bool doing_cpk_scan= check_cpk_scan(thd, keyno, *flags); + bool using_cpk= test(keyno == table->s->primary_key && + h->primary_key_is_clustered()); if (thd->variables.optimizer_use_mrr == 2 || *flags & HA_MRR_INDEX_ONLY || - (keyno == table->s->primary_key && h->primary_key_is_clustered()) || - key_uses_partial_cols(table, keyno)) + (using_cpk && !doing_cpk_scan) || key_uses_partial_cols(table, keyno)) { /* Use the default implementation */ *flags |= HA_MRR_USE_DEFAULT_IMPL; return TRUE; } - + uint add_len= table->key_info[keyno].key_length + h->ref_length; *bufsz -= add_len; if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost)) @@ -744,6 +1440,10 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, *flags &= ~HA_MRR_SORTED; /* We will return unordered output */ *cost= dsmrr_cost; res= FALSE; + + if ((*flags & HA_MRR_SINGLE_POINT) && + optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS)) + *flags |= HA_MRR_MATERIALIZED_KEYS; } else { @@ -828,17 +1528,14 @@ bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, /* Get cost of one 
sort-and-sweep step + + It consists of two parts: + - sort an array of #nrows ROWIDs using qsort + - read #nrows records from table in a sweep. - SYNOPSIS - get_sort_and_sweep_cost() - table Table being accessed - nrows Number of rows to be sorted and retrieved - cost OUT The cost - - DESCRIPTION - Get cost of these operations: - - sort an array of #nrows ROWIDs using qsort - - read #nrows records from table in a sweep. + @param table Table being accessed + @param nrows Number of rows to be sorted and retrieved + @param cost OUT The cost of scan */ static diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h index 90e2e4c93d6..8e2743df5f1 100644 --- a/sql/multi_range_read.h +++ b/sql/multi_range_read.h @@ -1,49 +1,507 @@ -/* - This file contains declarations for - - Disk-Sweep MultiRangeRead (DS-MRR) implementation +/** + @defgroup DS-MRR declarations + @{ */ /** - A Disk-Sweep MRR interface implementation + A Disk-Sweep implementation of MRR Interface (DS-MRR for short) - This implementation makes range (and, in the future, 'ref') scans to read - table rows in disk sweeps. - - Currently it is used by MyISAM and InnoDB. Potentially it can be used with - any table handler that has non-clustered indexes and on-disk rows. + This is a "plugin"(*) for storage engines that allows to + 1. When doing index scans, read table rows in rowid order; + 2. when making many index lookups, do them in key order and don't + lookup the same key value multiple times; + 3. Do both #1 and #2, when applicable. + These changes are expected to speed up query execution for disk-based + storage engines running io-bound loads and "big" queries (ie. queries that + do joins and enumerate lots of records). + + (*) - only conceptually. No dynamic loading or binary compatibility of any + kind. 
+ + General scheme of things: + + SQL Layer code + | | | + v v v + -|---|---|---- handler->multi_range_read_XXX() function calls + | | | + _____________________________________ + / DS-MRR module \ + | (order/de-duplicate lookup keys, | + | scan indexes in key order, | + | order/de-duplicate rowids, | + | retrieve full record reads in rowid | + | order) | + \_____________________________________/ + | | | + -|---|---|----- handler->read_range_first()/read_range_next(), + | | | handler->index_read(), handler->rnd_pos() calls. + | | | + v v v + Storage engine internals + + + Currently DS-MRR is used by MyISAM, InnoDB/XtraDB and Maria storage engines. + Potentially it can be used with any table handler that has disk-based data + storage and has better performance when reading data in rowid order. */ -class DsMrr_impl +#include "sql_lifo_buffer.h" + +class DsMrr_impl; +class Mrr_ordered_index_reader; + + +/* A structure with key parameters that's shared among several classes */ +class Key_parameters { public: - typedef void (handler::*range_check_toggle_func_t)(bool on); + /* TRUE <=> We can get at most one index tuple for a lookup key */ + bool index_ranges_unique; - DsMrr_impl() - : h2(NULL) {}; + uint key_tuple_length; /* Length of index lookup tuple, in bytes */ + key_part_map key_tuple_map; /* keyparts used in index lookup tuples */ + + /* + This is + = key_tuple_length if we copy keys to buffer + = sizeof(void*) if we're using pointers to materialized keys. + */ + uint key_size_in_keybuf; + + /* TRUE <=> don't copy key values, use pointers to them instead. */ + bool use_key_pointers; +}; + + +/** + A class to enumerate (record, range_id) pairs that match given key value. + + @note + + The idea is that we have a Lifo_buffer which holds (key, range_id) pairs + ordered by key value. From the front of the buffer we see + + (key_val1, range_id1), (key_val1, range_id2) ... 
(key_val2, range_idN) + + we take the first elements that have the same key value (key_val1 in the + example above), and make lookup into the table. The table will have + multiple matches for key_val1: + + == Table Index == + ... + key_val1 -> key_val1, index_tuple1 + key_val1, index_tuple2 + ... + key_val1, index_tupleN + ... + + Our goal is to produce all possible combinations, i.e. we need: + + {(key_val1, index_tuple1), range_id1} + {(key_val1, index_tuple1), range_id2} + ... ... | + {(key_val1, index_tuple1), range_idN}, + + {(key_val1, index_tuple2), range_id1} + {(key_val1, index_tuple2), range_id2} + ... ... | + {(key_val1, index_tuple2), range_idN}, + + ... ... ... + + {(key_val1, index_tupleK), range_idN} +*/ + +class Key_value_records_iterator +{ + /* Use this to get table handler, key buffer and other parameters */ + Mrr_ordered_index_reader *owner; + + /* Iterator to get (key, range_id) pairs from */ + Lifo_buffer_iterator identical_key_it; + /* + Last of the identical key values (when we get this pointer from + identical_key_it, it will be time to stop). + */ + uchar *last_identical_key_ptr; + /* - The "owner" handler object (the one that calls dsmrr_XXX functions. - It is used to retrieve full table rows by calling rnd_pos(). + FALSE <=> we're right after the init() call, the record has been already + read with owner->h->index_read_map() call */ - handler *h; - TABLE *table; /* Always equal to h->table */ + bool get_next_row; + + uchar *cur_index_tuple; /* key_buffer.read() reads to here */ +public: + int init(Mrr_ordered_index_reader *owner_arg); + int get_next(); + void close(); +}; + + +/* + Buffer manager interface. Mrr_reader objects use it to ask DsMrr_impl + to manage buffer space for them.
+*/ +class Buffer_manager +{ +public: + /* + Index-based reader calls this when it gets the first key, so we get to know + key length and the key parts used in lookup tuples + */ + virtual void setup_buffer_sizes(uint key_size_in_keybuf, + key_part_map key_tuple_map) = 0; + + virtual void redistribute_buffer_space() = 0; + /* + This is called when both key and rowid buffers are empty, and so it's time + to reset them to their original size (They've lost their original size, + because we were dynamically growing rowid buffer and shrinking key buffer). + */ + virtual void reset_buffer_sizes() = 0; + + virtual Lifo_buffer* get_key_buffer() = 0; + + virtual ~Buffer_manager(){} /* Shut up the compiler */ +}; + + +/* + Mrr_reader - DS-MRR execution strategy abstraction + + A reader produces ([index]_record, range_info) pairs, and requires periodic + refill operations. + + - one starts using the reader by calling reader->get_next(), + - when a get_next() call returns HA_ERR_END_OF_FILE, one must call + refill_buffer() before they can make more get_next() calls. + - when refill_buffer() returns HA_ERR_END_OF_FILE, this means the real + end of stream and get_next() should not be called anymore. + + Both functions can return other error codes, these mean unrecoverable errors + after which one cannot continue.
+*/ + +class Mrr_reader +{ +public: + virtual int get_next(char **range_info) = 0; + virtual int refill_buffer() = 0; + virtual ~Mrr_reader() {}; /* just to remove compiler warning */ +}; + + +/* + A common base for readers that do index scans and produce index tuples +*/ + +class Mrr_index_reader : public Mrr_reader +{ +protected: + handler *h; /* Handler object to use */ +public: + virtual int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, + uint mode, Buffer_manager *buf_manager_arg) = 0; + + /* Get pointer to place where every get_next() call will put rowid */ + virtual uchar *get_rowid_ptr() = 0; + /* Get the rowid (call this after get_next() call) */ + void position(); + virtual bool skip_record(char *range_id, uchar *rowid) = 0; +}; + + +/* + A "bypass" index reader that just does an index scan. The index scan is done + by calling default MRR implementation (i.e. handler::multi_range_read_XXX()) + functions. +*/ + +class Mrr_simple_index_reader : public Mrr_index_reader +{ +public: + int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, + uint mode, Buffer_manager *buf_manager_arg); + int get_next(char **range_info); + int refill_buffer() { return HA_ERR_END_OF_FILE; } + uchar *get_rowid_ptr() { return h->ref; } + bool skip_record(char *range_id, uchar *rowid) + { + return (h->mrr_funcs.skip_record && + h->mrr_funcs.skip_record(h->mrr_iter, range_id, rowid)); + } +}; + + +/* + A reader that sorts the key values before it makes the index lookups.
+*/ + +class Mrr_ordered_index_reader : public Mrr_index_reader +{ +public: + int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, + uint mode, Buffer_manager *buf_manager_arg); + int get_next(char **range_info); + int refill_buffer(); + uchar *get_rowid_ptr() { return h->ref; } + + bool skip_record(char *range_info, uchar *rowid) + { + return (mrr_funcs.skip_record && + mrr_funcs.skip_record(mrr_iter, range_info, rowid)); + } + + bool skip_index_tuple(char *range_info) + { + return (mrr_funcs.skip_index_tuple && + mrr_funcs.skip_index_tuple(mrr_iter, range_info)); + } + private: - /* Secondary handler object. It is used for scanning the index */ - handler *h2; + Key_value_records_iterator kv_it; - /* Buffer to store rowids, or (rowid, range_id) pairs */ - uchar *rowids_buf; - uchar *rowids_buf_cur; /* Current position when reading/writing */ - uchar *rowids_buf_last; /* When reading: end of used buffer space */ - uchar *rowids_buf_end; /* End of the buffer */ + bool scanning_key_val_iter; + + /* Key_value_records_iterator::read() will place range_info here */ + char *cur_range_info; - bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */ + /* Buffer to store (key, range_id) pairs */ + Lifo_buffer *key_buffer; + + /* This manages key buffer allocation and sizing for us */ + Buffer_manager *buf_manager; + + /* + Initially FALSE, becomes TRUE when we saw the first lookup key and set + keypar's member. 
+ */ + bool know_key_tuple_params; + + Key_parameters keypar; /* index scan and lookup tuple parameters */ + + /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ + bool is_mrr_assoc; + + /* + TRUE <=> Don't do optimizations for identical key value (see comment in + Mrr_ordered_index_reader::init for details) + */ + bool disallow_identical_key_handling; + + /* Range sequence iteration members */ + RANGE_SEQ_IF mrr_funcs; + range_seq_t mrr_iter; + + static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2); + static int key_tuple_cmp_reverse(void* arg, uchar* key1, uchar* key2); + + friend class Key_value_records_iterator; + friend class DsMrr_impl; + friend class Mrr_ordered_rndpos_reader; +}; - /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */ + +/* + A reader that gets rowids from an Mrr_index_reader, and then sorts them + before getting full records with handler->rndpos() calls. +*/ + +class Mrr_ordered_rndpos_reader : public Mrr_reader +{ +public: + int init(handler *h, Mrr_index_reader *index_reader, uint mode, + Lifo_buffer *buf); + int get_next(char **range_info); + int refill_buffer(); +private: + handler *h; /* Handler to use */ + + /* This what we get (rowid, range_info) pairs from */ + Mrr_index_reader *index_reader; + + /* index_reader->get_next() puts rowid here */ + uchar *index_rowid; + + /* TRUE <=> index_reader->refill_buffer() call has returned EOF */ + bool index_reader_exhausted; + + /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ bool is_mrr_assoc; + + /* + When reading from ordered rowid buffer: the rowid element of the last + buffer element that has rowid identical to this one. 
+ */ + uchar *last_identical_rowid; + + /* Buffer to store (rowid, range_id) pairs */ + Lifo_buffer *rowid_buffer; + + /* rowid_buffer.read() will set the following: */ + uchar *rowid; + uchar *rowids_range_id; + + int refill_from_key_buffer(); +}; + + +/* + A primitive "factory" of various Mrr_*_reader classes (the point is to + get various kinds of readers without having to allocate them on the heap) +*/ + +class Mrr_reader_factory +{ +public: + Mrr_ordered_rndpos_reader ordered_rndpos_reader; + Mrr_ordered_index_reader ordered_index_reader; + Mrr_simple_index_reader simple_index_reader; +}; + + +/* + DS-MRR implementation for one table. Create/use one object of this class for + each ha_{myisam/innobase/etc} object. That object will be further referred to + as "the handler" + + DsMrr_impl supports has the following execution strategies: + + - Bypass DS-MRR, pass all calls to default MRR implementation, which is + an MRR-to-non-MRR call converter. + - Key-Ordered Retrieval + - Rowid-Ordered Retrieval + + DsMrr_impl will use one of the above strategies, or a combination of them, + according to the following diagram: + + (mrr function calls) + | + +----------------->-----------------+ + | | + ___________v______________ _______________v________________ + / default: use lookup keys \ / KEY-ORDERED RETRIEVAL: \ + | (or ranges) in whatever | | sort lookup keys and then make | + | order they are supplied | | index lookups in index order | + \__________________________/ \________________________________/ + | | | | | + +---<---+ | +--------------->-----------|----+ + | | | | + | | +---------------+ | + | ______v___ ______ | _______________v_______________ + | / default: read \ | / ROWID-ORDERED RETRIEVAL: \ + | | table records | | | Before reading table records, | + v | in random order | v | sort their rowids and then | + | \_________________/ | | read them in rowid order | + | | | \_______________________________/ + | | | | + | | | | + +-->---+ | 
+----<------+-----------<--------+ + | | | + v v v + (table records and range_ids) + + The choice of strategy depends on MRR scan properties, table properties + (whether we're scanning clustered primary key), and @@optimizer_switch + settings. + + Key-Ordered Retrieval + --------------------- + The idea is: if MRR scan is essentially a series of lookups on + + tbl.key=value1 OR tbl.key=value2 OR ... OR tbl.key=valueN + + then it makes sense to collect and order the set of lookup values, i.e. + + sort(value1, value2, .. valueN) + + and then do index lookups in index order. This results in fewer index page + fetch operations, and we also can avoid making multiple index lookups for the + same value. That is, if value1=valueN we can easily discover that after + sorting and make one index lookup for them instead of two. - bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */ + Rowid-Ordered Retrieval + ----------------------- + If we do a regular index scan or a series of index lookups, we'll be hitting + table records at random. For disk-based engines, this is much slower than + reading the same records in disk order. We assume that disk ordering of + rows is the same as ordering of their rowids (which is provided by + handler::cmp_ref()) + In order to retrieve records in different order, we must separate index + scanning and record fetching, that is, MRR scan uses the following steps: + + 1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and + fill a buffer with {rowid, range_id} pairs + 2. Sort the buffer by rowid value + 3. for each {rowid, range_id} pair in the buffer + get record by rowid and return the {record, range_id} pair + 4. Repeat the above steps until we've exhausted the list of ranges we're + scanning. 
+ + Buffer space management considerations + -------------------------------------- + With regards to buffer/memory management, MRR interface specifies that + - SQL layer provides multi_range_read_init() with buffer of certain size. + - MRR implementation may use (i.e. have at its disposal till the end of + the MRR scan) all of the buffer, or return the unused end of the buffer + to SQL layer. + + DS-MRR needs buffer in order to accumulate and sort rowids and/or keys. When + we need to accumulate/sort only keys (or only rowids), it is fairly trivial. + + When we need to accumulate/sort both keys and rowids, efficient buffer use + gets complicated. We need to: + - First, accumulate keys and sort them + - Then use the keys (smaller values go first) to obtain rowids. A key is not + needed after we've got matching rowids for it. + - Make sure that rowids are accumulated at the front of the buffer, so that we + can return the end part of the buffer to SQL layer, should there be too + few rowid values to occupy the buffer. + + All of these goals are achieved by using the following scheme: + + | | We get an empty buffer from SQL layer. + + | *-| + | *----| First, we fill the buffer with keys. Key_buffer + | *-------| part grows from end of the buffer space to start + | *----------| (In this picture, the buffer is big enough to + | *-------------| accommodate all keys and even have some space left) + + | *=============| We want to do key-ordered index scan, so we sort + the keys + + |-x *===========| Then we use the keys to get rowids. Rowids are + |----x *========| stored from start of buffer space towards the end. + |--------x *=====| The part of the buffer occupied with keys + |------------x *===| gradually frees up space for rowids. In this + |--------------x *=| picture we run out of keys before we've run out + |----------------x | of buffer space (it can be other way as well). + + |================x | Then we sort the rowids.
+ + | |~~~| The unused part of the buffer is at the end, so + we can return it to the SQL layer. + + |================* Sorted rowids are then used to read table records + in disk order + +*/ + +class DsMrr_impl : public Buffer_manager +{ public: + typedef void (handler::*range_check_toggle_func_t)(bool on); + + DsMrr_impl() + : h2(NULL) {}; + void init(handler *h_arg, TABLE *table_arg) { h= h_arg; @@ -52,19 +510,86 @@ public: int dsmrr_init(handler *h, RANGE_SEQ_IF *seq_funcs, void *seq_init_param, uint n_ranges, uint mode, HANDLER_BUFFER *buf); void dsmrr_close(); - int dsmrr_fill_buffer(); int dsmrr_next(char **range_info); - ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint *bufsz, - uint *flags, COST_VECT *cost); + ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts, + uint *bufsz, uint *flags, COST_VECT *cost); ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, void *seq_init_param, uint n_ranges, uint *bufsz, uint *flags, COST_VECT *cost); private: + /* Buffer to store (key, range_id) pairs */ + Lifo_buffer *key_buffer; + + /* + The "owner" handler object (the one that is expected to "own" this object + and call its functions). + */ + handler *h; + TABLE *table; /* Always equal to h->table */ + + /* + Secondary handler object. (created when needed, we need it when we need + to run both index scan and rnd_pos() scan at the same time) + */ + handler *h2; + + uint keyno; /* index we're running the scan on */ + /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ + bool is_mrr_assoc; + + Mrr_reader_factory reader_factory; + Mrr_reader *strategy; + Mrr_index_reader *index_strategy; + + /* The whole buffer space that we're using */ + uchar *full_buf; + uchar *full_buf_end; + + /* + When using both rowid and key buffers: the boundary between key and rowid + parts of the buffer. 
This is the "original" value, actual memory ranges + used by key and rowid parts may be different because of dynamic space + reallocation between them. + */ + uchar *rowid_buffer_end; + + /* + One of the following two is used for key buffer: forward is used when + we only need key buffer, backward is used when we need both key and rowid + buffers. + */ + Forward_lifo_buffer forward_key_buf; + Backward_lifo_buffer backward_key_buf; + + /* + Buffer to store (rowid, range_id) pairs, or just rowids if + is_mrr_assoc==FALSE + */ + Forward_lifo_buffer rowid_buffer; + bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, COST_VECT *cost); bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, uint *buffer_size, COST_VECT *cost); + bool check_cpk_scan(THD *thd, uint keyno, uint mrr_flags); + + /* Buffer_manager implementation */ + void setup_buffer_sizes(uint key_size_in_keybuf, key_part_map key_tuple_map); + void redistribute_buffer_space(); + void reset_buffer_sizes(); + Lifo_buffer* get_key_buffer() { return key_buffer; } + + friend class Key_value_records_iterator; + friend class Mrr_ordered_index_reader; + friend class Mrr_ordered_rndpos_reader; + + int setup_two_handlers(); + void close_second_handler(); }; +/** + @} (end of group DS-MRR declarations) +*/ + diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h index d77089d194c..d1e7cea68a3 100644 --- a/sql/mysql_priv.h +++ b/sql/mysql_priv.h @@ -570,17 +570,18 @@ protected: #define OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE 512 #define OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN 1024 #define OPTIMIZER_SWITCH_SUBQUERY_CACHE (1<<11) -#define OPTIMIZER_SWITCH_OUTER_JOIN_WITH_CACHE (1<<12) -#define OPTIMIZER_SWITCH_SEMIJOIN_WITH_CACHE (1<<13) -#define OPTIMIZER_SWITCH_JOIN_CACHE_INCREMENTAL (1<<14) -#define OPTIMIZER_SWITCH_JOIN_CACHE_HASHED (1<<15) -#define OPTIMIZER_SWITCH_JOIN_CACHE_BKA (1<<16) +#define OPTIMIZER_SWITCH_MRR_SORT_KEYS (1<<12) +#define OPTIMIZER_SWITCH_OUTER_JOIN_WITH_CACHE 
(1<<13) +#define OPTIMIZER_SWITCH_SEMIJOIN_WITH_CACHE (1<<14) +#define OPTIMIZER_SWITCH_JOIN_CACHE_INCREMENTAL (1<<15) +#define OPTIMIZER_SWITCH_JOIN_CACHE_HASHED (1<<16) +#define OPTIMIZER_SWITCH_JOIN_CACHE_BKA (1<<17) #ifdef DBUG_OFF -# define OPTIMIZER_SWITCH_LAST (1<<17) -#else -# define OPTIMIZER_SWITCH_TABLE_ELIMINATION (1<<17) # define OPTIMIZER_SWITCH_LAST (1<<18) +#else +# define OPTIMIZER_SWITCH_TABLE_ELIMINATION (1<<18) +# define OPTIMIZER_SWITCH_LAST (1<<19) #endif #ifdef DBUG_OFF @@ -596,6 +597,8 @@ protected: OPTIMIZER_SWITCH_SEMIJOIN | \ OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE|\ OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN|\ + OPTIMIZER_SWITCH_SUBQUERY_CACHE|\ + OPTIMIZER_SWITCH_MRR_SORT_KEYS|\ OPTIMIZER_SWITCH_SUBQUERY_CACHE | \ OPTIMIZER_SWITCH_JOIN_CACHE_INCREMENTAL | \ OPTIMIZER_SWITCH_JOIN_CACHE_HASHED | \ @@ -613,7 +616,8 @@ protected: OPTIMIZER_SWITCH_SEMIJOIN | \ OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE|\ OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN|\ - OPTIMIZER_SWITCH_SUBQUERY_CACHE | \ + OPTIMIZER_SWITCH_SUBQUERY_CACHE|\ + OPTIMIZER_SWITCH_MRR_SORT_KEYS|\ OPTIMIZER_SWITCH_JOIN_CACHE_INCREMENTAL | \ OPTIMIZER_SWITCH_JOIN_CACHE_HASHED | \ OPTIMIZER_SWITCH_JOIN_CACHE_BKA) diff --git a/sql/mysqld.cc b/sql/mysqld.cc index f324d243c8e..46a8fa83acd 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -345,6 +345,7 @@ static const char *optimizer_switch_names[]= "partial_match_rowid_merge", "partial_match_table_scan", "subquery_cache", + "mrr_sort_keys", "outer_join_with_cache", "semijoin_with_cache", "join_cache_incremental", @@ -371,6 +372,7 @@ static const unsigned int optimizer_switch_names_len[]= sizeof("partial_match_rowid_merge") - 1, sizeof("partial_match_table_scan") - 1, sizeof("subquery_cache") - 1, + sizeof("mrr_sort_keys") - 1, sizeof("outer_join_with_cache") - 1, sizeof("semijoin_with_cache") - 1, sizeof("join_cache_incremental") - 1, @@ -475,6 +477,7 @@ static const char *optimizer_switch_str="index_merge=on,index_merge_union=on," 
"partial_match_rowid_merge=on," "partial_match_table_scan=on," "subquery_cache=on," + "mrr_sort_keys=on," "join_cache_incremental=on," "join_cache_hashed=on," "join_cache_bka=on" diff --git a/sql/opt_index_cond_pushdown.cc b/sql/opt_index_cond_pushdown.cc index 71eadfb45cf..6f0210a3d21 100644 --- a/sql/opt_index_cond_pushdown.cc +++ b/sql/opt_index_cond_pushdown.cc @@ -381,6 +381,7 @@ void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok, QT_ORDINARY);); tab->select->cond= tab->select_cond; + tab->select->pre_idx_push_select_cond= tab->pre_idx_push_select_cond; } } } diff --git a/sql/opt_range.cc b/sql/opt_range.cc index d77893bdb1c..e139839cf56 100644 --- a/sql/opt_range.cc +++ b/sql/opt_range.cc @@ -1119,7 +1119,7 @@ SQL_SELECT *make_select(TABLE *head, table_map const_tables, } -SQL_SELECT::SQL_SELECT() :quick(0),cond(0),free_cond(0) +SQL_SELECT::SQL_SELECT() :quick(0),cond(0),pre_idx_push_select_cond(NULL),free_cond(0) { quick_keys.clear_all(); needed_reg.clear_all(); my_b_clear(&file); @@ -8006,6 +8006,7 @@ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table, quick->mrr_buf_size= thd->variables.mrr_buff_size; if (table->file->multi_range_read_info(quick->index, 1, (uint)records, + uint(-1), &quick->mrr_buf_size, &quick->mrr_flags, &cost)) goto err; diff --git a/sql/opt_range.h b/sql/opt_range.h index 1f2df1d7d55..072b83d4fbf 100644 --- a/sql/opt_range.h +++ b/sql/opt_range.h @@ -818,6 +818,13 @@ class SQL_SELECT :public Sql_alloc { public: QUICK_SELECT_I *quick; // If quick-select used COND *cond; // where condition + + /* + When using Index Condition Pushdown: condition that we've had before + extracting and pushing index condition. + In other cases, NULL. 
+ */ + Item *pre_idx_push_select_cond; TABLE *head; IO_CACHE file; // Positions to used records ha_rows records; // Records in use if read from file diff --git a/sql/sql_join_cache.cc b/sql/sql_join_cache.cc index c3f2dfa5541..103275cf673 100644 --- a/sql/sql_join_cache.cc +++ b/sql/sql_join_cache.cc @@ -2614,6 +2614,7 @@ int JOIN_CACHE_HASHED::realloc_buffer() return rc; } + /* Get maximum size of the additional space per record used for record keys @@ -3859,6 +3860,7 @@ void JOIN_CACHE_BKA::read_next_candidate_for_match(uchar *rec_ptr) int JOIN_CACHE_BKA::init() { + int res; bool check_only_first_match= join_tab->check_only_first_match(); RANGE_SEQ_IF rs_funcs= { bka_range_seq_init, @@ -3869,11 +3871,18 @@ int JOIN_CACHE_BKA::init() DBUG_ENTER("JOIN_CACHE_BKA::init"); - if (!(join_tab_scan= new JOIN_TAB_SCAN_MRR(join, join_tab, - mrr_mode, rs_funcs))) + JOIN_TAB_SCAN_MRR *jsm; + if (!(join_tab_scan= jsm= new JOIN_TAB_SCAN_MRR(join, join_tab, + mrr_mode, rs_funcs))) DBUG_RETURN(1); - DBUG_RETURN(JOIN_CACHE::init()); + if ((res= JOIN_CACHE::init())) + DBUG_RETURN(res); + + if (use_emb_key) + jsm->mrr_mode |= HA_MRR_MATERIALIZED_KEYS; + + DBUG_RETURN(0); } diff --git a/sql/sql_lifo_buffer.h b/sql/sql_lifo_buffer.h new file mode 100644 index 00000000000..8718177fc4f --- /dev/null +++ b/sql/sql_lifo_buffer.h @@ -0,0 +1,340 @@ +/** + @defgroup Bi-directional LIFO buffers used by DS-MRR implementation + @{ +*/ + +class Forward_lifo_buffer; +class Backward_lifo_buffer; + + +/* + A base class for in-memory buffer used by DS-MRR implementation. Common + properties: + - The buffer is last-in-first-out, i.e. elements that are written last are + read first. + - The buffer contains fixed-size elements. The elements are either atomic + byte sequences or pairs of them. + - The buffer resides in the memory provided by the user. It is possible to + = dynamically (ie. 
between write operations) add adjacent memory space to + the buffer + = dynamically remove unused space from the buffer. + The intent of this is to allow to have two buffers on adjacent memory + space, one is being read from (and so its space shrinks), while the other + is being written to (and so it needs more and more space). + + There are two concrete classes, Forward_lifo_buffer and Backward_lifo_buffer. +*/ + +class Lifo_buffer +{ +protected: + /** + Pointers to data to be written. write() call will assume that + (*write_ptr1) points to size1 bytes of data to be written. + If write_ptr2 != NULL then the buffer stores pairs, and (*write_ptr2) + points to size2 bytes of data that form the second component. + */ + uchar **write_ptr1; + size_t size1; + uchar **write_ptr2; + size_t size2; + + /** + read() will do reading by storing pointer to read data into *read_ptr1 (if + the buffer stores atomic elements), or into {*read_ptr1, *read_ptr2} (if + the buffer stores pairs). + */ + uchar **read_ptr1; + uchar **read_ptr2; + + uchar *start; /**< points to start of buffer space */ + uchar *end; /**< points to just beyond the end of buffer space */ +public: + + enum enum_direction { + BACKWARD=-1, /**< buffer is filled/read from bigger to smaller memory addresses */ + FORWARD=1 /**< buffer is filled/read from smaller to bigger memory addresses */ + }; + + virtual enum_direction type() = 0; + + /* Buffer space control functions */ + + /** Let the buffer store data in the given space. */ + void set_buffer_space(uchar *start_arg, uchar *end_arg) + { + start= start_arg; + end= end_arg; + TRASH(start, end - start); + reset(); + } + + /** + Specify where write() should get the source data from, as well as source + data size. + */ + void setup_writing(uchar **data1, size_t len1, uchar **data2, size_t len2) + { + write_ptr1= data1; + size1= len1; + write_ptr2= data2; + size2= len2; + } + + /** + Specify where read() should store pointers to read data, as well as read + data size. 
The sizes must match those passed to setup_writing(). + */ + void setup_reading(uchar **data1, size_t len1, uchar **data2, size_t len2) + { + read_ptr1= data1; + DBUG_ASSERT(len1 == size1); + read_ptr2= data2; + DBUG_ASSERT(len2 == size2); + } + + bool can_write() + { + return have_space_for(size1 + (write_ptr2 ? size2 : 0)); + } + virtual void write() = 0; + + bool is_empty() { return used_size() == 0; } + virtual bool read() = 0; + + void sort(qsort2_cmp cmp_func, void *cmp_func_arg) + { + uint elem_size= size1 + (write_ptr2 ? size2 : 0); + uint n_elements= used_size() / elem_size; + my_qsort2(used_area(), n_elements, elem_size, cmp_func, cmp_func_arg); + } + + virtual void reset() = 0; + virtual uchar *end_of_space() = 0; +protected: + virtual size_t used_size() = 0; + + /* To be used only by iterator class: */ + virtual uchar *get_pos()= 0; + virtual bool read(uchar **position)= 0; + friend class Lifo_buffer_iterator; +public: + virtual bool have_space_for(size_t bytes) = 0; + + virtual void remove_unused_space(uchar **unused_start, uchar **unused_end)=0; + virtual uchar *used_area() = 0; + virtual ~Lifo_buffer() {}; +}; + + +/** + Forward LIFO buffer + + The buffer that is being written to from start to end and read in the + reverse. 'pos' points to just beyond the end of used space. 
+ + It is possible to grow/shrink the buffer at the end bound + + used space unused space + *==============*-----------------* + ^ ^ ^ + | | +--- end + | +---- pos + +--- start +*/ + +class Forward_lifo_buffer: public Lifo_buffer +{ + uchar *pos; +public: + enum_direction type() { return FORWARD; } + size_t used_size() + { + return pos - start; + } + void reset() + { + pos= start; + } + uchar *end_of_space() { return pos; } + bool have_space_for(size_t bytes) + { + return (pos + bytes < end); + } + + void write() + { + write_bytes(*write_ptr1, size1); + if (write_ptr2) + write_bytes(*write_ptr2, size2); + } + void write_bytes(const uchar *data, size_t bytes) + { + DBUG_ASSERT(have_space_for(bytes)); + memcpy(pos, data, bytes); + pos += bytes; + } + bool have_data(uchar *position, size_t bytes) + { + return ((position - start) >= (ptrdiff_t)bytes); + } + uchar *read_bytes(uchar **position, size_t bytes) + { + DBUG_ASSERT(have_data(*position, bytes)); + *position= (*position) - bytes; + return *position; + } + bool read() { return read(&pos); } + bool read(uchar **position) + { + if (!have_data(*position, size1 + (read_ptr2 ? size2 : 0))) + return TRUE; + if (read_ptr2) + *read_ptr2= read_bytes(position, size2); + *read_ptr1= read_bytes(position, size1); + return FALSE; + } + void remove_unused_space(uchar **unused_start, uchar **unused_end) + { + DBUG_ASSERT(0); /* Don't need this yet */ + } + /** + Add more space to the buffer. The caller is responsible that the space + being added is adjacent to the end of the buffer. 
+ + @param unused_start Start of space + @param unused_end End of space + */ + void grow(uchar *unused_start, uchar *unused_end) + { + DBUG_ASSERT(unused_end >= unused_start); + DBUG_ASSERT(end == unused_start); + TRASH(unused_start, unused_end - unused_start); + end= unused_end; + } + /* Return pointer to start of the memory area that is occupied by the data */ + uchar *used_area() { return start; } + friend class Lifo_buffer_iterator; + uchar *get_pos() { return pos; } +}; + + + +/** + Backward LIFO buffer + + The buffer that is being written to from end to start and read in the + reverse. 'pos' points to the start of used space. + + It is possible to grow/shrink the buffer at the start. + + unused space used space + *--------------*=================* + ^ ^ ^ + | | +--- end + | +---- pos + +--- start +*/ +class Backward_lifo_buffer: public Lifo_buffer +{ + uchar *pos; +public: + enum_direction type() { return BACKWARD; } + + size_t used_size() + { + return end - pos; + } + void reset() + { + pos= end; + } + uchar *end_of_space() { return end; } + bool have_space_for(size_t bytes) + { + return (pos - bytes >= start); + } + void write() + { + if (write_ptr2) + write_bytes(*write_ptr2, size2); + write_bytes(*write_ptr1, size1); + } + void write_bytes(const uchar *data, size_t bytes) + { + DBUG_ASSERT(have_space_for(bytes)); + pos -= bytes; + memcpy(pos, data, bytes); + } + bool read() + { + return read(&pos); + } + bool read(uchar **position) + { + if (!have_data(*position, size1 + (read_ptr2 ? 
size2 : 0))) + return TRUE; + *read_ptr1= read_bytes(position, size1); + if (read_ptr2) + *read_ptr2= read_bytes(position, size2); + return FALSE; + } + bool have_data(uchar *position, size_t bytes) + { + return ((end - position) >= (ptrdiff_t)bytes); + } + uchar *read_bytes(uchar **position, size_t bytes) + { + DBUG_ASSERT(have_data(*position, bytes)); + uchar *ret= *position; + *position= *position + bytes; + return ret; + } + /** + Stop using/return the unused part of the space + @param unused_start OUT Start of the unused space + @param unused_end OUT End of the unused space + */ + void remove_unused_space(uchar **unused_start, uchar **unused_end) + { + *unused_start= start; + *unused_end= pos; + start= pos; + } + void grow(uchar *unused_start, uchar *unused_end) + { + DBUG_ASSERT(0); /* Not used for backward buffers */ + } + /* Return pointer to start of the memory area that is occupied by the data */ + uchar *used_area() { return pos; } + friend class Lifo_buffer_iterator; + uchar *get_pos() { return pos; } +}; + + + +/** Iterator to walk over contents of the buffer without reading it. */ +class Lifo_buffer_iterator +{ + uchar *pos; + Lifo_buffer *buf; +public: + void init(Lifo_buffer *buf_arg) + { + buf= buf_arg; + pos= buf->get_pos(); + } + /* + Read the next value. The calling convention is the same as buf->read() + has. 
+ + @retval FALSE - ok + @retval TRUE - EOF, reached the end of the buffer + */ + bool read() + { + return buf->read(&pos); + } +}; + + diff --git a/sql/sql_select.cc b/sql/sql_select.cc index 8ce2c96da17..5067e595198 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -7722,11 +7722,11 @@ uint check_join_cache_usage(JOIN_TAB *tab, case JT_EQ_REF: if (cache_level <=2 || (no_hashed_cache && no_bka_cache)) goto no_join_cache; - - flags= HA_MRR_NO_NULL_ENDPOINTS; + flags= HA_MRR_NO_NULL_ENDPOINTS | HA_MRR_SINGLE_POINT; if (tab->table->covering_keys.is_set(tab->ref.key)) flags|= HA_MRR_INDEX_ONLY; rows= tab->table->file->multi_range_read_info(tab->ref.key, 10, 20, + tab->ref.key_parts, &bufsz, &flags, &cost); if ((cache_level <=4 && !no_hashed_cache) || no_bka_cache || diff --git a/sql/sql_select.h b/sql/sql_select.h index 1e57f487f69..33ce35833be 100644 --- a/sql/sql_select.h +++ b/sql/sql_select.h @@ -1590,6 +1590,7 @@ public: int next(); + friend class JOIN_CACHE_BKA; //for mrr_mode access }; /* |