diff options
Diffstat (limited to 'sql/multi_range_read.h')
-rw-r--r-- | sql/multi_range_read.h | 633 |
1 files changed, 600 insertions, 33 deletions
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h index 90e2e4c93d6..1b72e71944d 100644 --- a/sql/multi_range_read.h +++ b/sql/multi_range_read.h @@ -1,70 +1,637 @@ -/* - This file contains declarations for - - Disk-Sweep MultiRangeRead (DS-MRR) implementation +/** + @defgroup DS-MRR declarations + @{ */ /** - A Disk-Sweep MRR interface implementation + A Disk-Sweep implementation of MRR Interface (DS-MRR for short) + + This is a "plugin"(*) for storage engines that allows to + 1. When doing index scans, read table rows in rowid order; + 2. when making many index lookups, do them in key order and don't + lookup the same key value multiple times; + 3. Do both #1 and #2, when applicable. + These changes are expected to speed up query execution for disk-based + storage engines running io-bound loads and "big" queries (ie. queries that + do joins and enumerate lots of records). + + (*) - only conceptually. No dynamic loading or binary compatibility of any + kind. + + General scheme of things: + + SQL Layer code + | | | + v v v + -|---|---|---- handler->multi_range_read_XXX() function calls + | | | + _____________________________________ + / DS-MRR module \ + | (order/de-duplicate lookup keys, | + | scan indexes in key order, | + | order/de-duplicate rowids, | + | retrieve full record reads in rowid | + | order) | + \_____________________________________/ + | | | + -|---|---|----- handler->read_range_first()/read_range_next(), + | | | handler->index_read(), handler->rnd_pos() calls. + | | | + v v v + Storage engine internals + + + Currently DS-MRR is used by MyISAM, InnoDB/XtraDB and Maria storage engines. + Potentially it can be used with any table handler that has disk-based data + storage and has better performance when reading data in rowid order. +*/ + +#include "sql_lifo_buffer.h" + +class DsMrr_impl; +class Mrr_ordered_index_reader; + + +/* A structure with key parameters that's shared among several classes */ +class Key_parameters +{ +public: + uint key_tuple_length; /* Length of index lookup tuple, in bytes */ + key_part_map key_tuple_map; /* keyparts used in index lookup tuples */ + + /* + This is + = key_tuple_length if we copy keys to buffer + = sizeof(void*) if we're using pointers to materialized keys. + */ + uint key_size_in_keybuf; + + /* TRUE <=> don't copy key values, use pointers to them instead. */ + bool use_key_pointers; - This implementation makes range (and, in the future, 'ref') scans to read - table rows in disk sweeps. + /* TRUE <=> We can get at most one index tuple for a lookup key */ + bool index_ranges_unique; +}; + + +/** + A class to enumerate (record, range_id) pairs that match given key value. - Currently it is used by MyISAM and InnoDB. Potentially it can be used with - any table handler that has non-clustered indexes and on-disk rows. + @note + + The idea is that we have a Lifo_buffer which holds (key, range_id) pairs + ordered by key value. From the front of the buffer we see + + (key_val1, range_id1), (key_val1, range_id2) ... (key_val2, range_idN) + + we take the first elements that have the same key value (key_val1 in the + example above), and make lookup into the table. The table will have + multiple matches for key_val1: + + == Table Index == + ... + key_val1 -> key_val1, index_tuple1 + key_val1, index_tuple2 + ... + key_val1, index_tupleN + ... + + Our goal is to produce all possible combinations, i.e. we need: + + {(key_val1, index_tuple1), range_id1} + {(key_val1, index_tuple1), range_id2} + ... ... | + {(key_val1, index_tuple1), range_idN}, + + {(key_val1, index_tuple2), range_id1} + {(key_val1, index_tuple2), range_id2} + ... ... | + {(key_val1, index_tuple2), range_idN}, + + ... ... ... + + {(key_val1, index_tupleK), range_idN} */ -class DsMrr_impl +class Key_value_records_iterator { + /* Use this to get table handler, key buffer and other parameters */ + Mrr_ordered_index_reader *owner; + + /* Iterator to get (key, range_id) pairs from */ + Lifo_buffer_iterator identical_key_it; + + /* + Last of the identical key values (when we get this pointer from + identical_key_it, it will be time to stop). + */ + uchar *last_identical_key_ptr; + + /* + FALSE <=> we're right after the init() call, the record has been already + read with owner->file->index_read_map() call + */ + bool get_next_row; + public: - typedef void (handler::*range_check_toggle_func_t)(bool on); + int init(Mrr_ordered_index_reader *owner_arg); + int get_next(range_id_t *range_info); + void move_to_next_key_value(); +}; - DsMrr_impl() - : h2(NULL) {}; + +/* + Buffer manager interface. Mrr_reader objects use it to inqure DsMrr_impl + to manage buffer space for them. +*/ +typedef struct st_buffer_manager +{ +public: + /* Opaque value to be passed as the first argument to all member functions */ + void *arg; /* - The "owner" handler object (the one that calls dsmrr_XXX functions. - It is used to retrieve full table rows by calling rnd_pos(). + This is called when we've freed more space from the rowid buffer. The + callee will get the unused space from the rowid buffer and give it to the + key buffer. + */ + void (*redistribute_buffer_space)(void *arg); + + /* + This is called when both key and rowid buffers are empty, and so it's time + to reset them to their original size (They've lost their original size, + because we were dynamically growing rowid buffer and shrinking key buffer). */ - handler *h; - TABLE *table; /* Always equal to h->table */ + void (*reset_buffer_sizes)(void *arg); + +} Buffer_manager; + + +/* + Mrr_reader - DS-MRR execution strategy abstraction + + A reader produces ([index]_record, range_info) pairs, and requires periodic + refill operations. + + - one starts using the reader by calling reader->get_next(), + - when a get_next() call returns HA_ERR_END_OF_FILE, one must call + refill_buffer() before they can make more get_next() calls. + - when refill_buffer() returns HA_ERR_END_OF_FILE, this means the real + end of stream and get_next() should not be called anymore. + + Both functions can return other error codes, these mean unrecoverable errors + after which one cannot continue. +*/ + +class Mrr_reader +{ +public: + virtual int get_next(range_id_t *range_info) = 0; + virtual int refill_buffer(bool initial) = 0; + virtual ~Mrr_reader() {}; /* just to remove compiler warning */ +}; + + +/* + A common base for readers that do index scans and produce index tuples +*/ + +class Mrr_index_reader : public Mrr_reader +{ +protected: + handler *file; /* Handler object to use */ +public: + virtual int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, + uint mode, Key_parameters *key_par, + Lifo_buffer *key_buffer, + Buffer_manager *buf_manager_arg) = 0; + + /* Get pointer to place where every get_next() call will put rowid */ + virtual uchar *get_rowid_ptr() = 0; + /* Get the rowid (call this after get_next() call) */ + virtual void position(); + virtual bool skip_record(range_id_t range_id, uchar *rowid) = 0; + + virtual void interrupt_read() {} + virtual void resume_read() {} +}; + + +/* + A "bypass" index reader that just does and index scan. The index scan is done + by calling default MRR implementation (i.e. handler::multi_range_read_XXX()) + functions. +*/ + +class Mrr_simple_index_reader : public Mrr_index_reader +{ +public: + int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, + uint mode, Key_parameters *key_par, + Lifo_buffer *key_buffer, + Buffer_manager *buf_manager_arg); + int get_next(range_id_t *range_info); + int refill_buffer(bool initial) { return initial? 0: HA_ERR_END_OF_FILE; } + uchar *get_rowid_ptr() { return file->ref; } + bool skip_record(range_id_t range_id, uchar *rowid) + { + return (file->mrr_funcs.skip_record && + file->mrr_funcs.skip_record(file->mrr_iter, range_id, rowid)); + } +}; + + +/* + A reader that sorts the key values before it makes the index lookups. +*/ + +class Mrr_ordered_index_reader : public Mrr_index_reader +{ +public: + int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, + uint mode, Key_parameters *key_par, + Lifo_buffer *key_buffer, + Buffer_manager *buf_manager_arg); + int get_next(range_id_t *range_info); + int refill_buffer(bool initial); + uchar *get_rowid_ptr() { return file->ref; } + + bool skip_record(range_id_t range_info, uchar *rowid) + { + return (mrr_funcs.skip_record && + mrr_funcs.skip_record(mrr_iter, range_info, rowid)); + } + + bool skip_index_tuple(range_id_t range_info) + { + return (mrr_funcs.skip_index_tuple && + mrr_funcs.skip_index_tuple(mrr_iter, range_info)); + } + + bool set_interruption_temp_buffer(uint rowid_length, uint key_len, + uint saved_pk_len, + uchar **space_start, uchar *space_end); + void set_no_interruption_temp_buffer(); + + void interrupt_read(); + void resume_read(); + void position(); private: - /* Secondary handler object. It is used for scanning the index */ - handler *h2; + Key_value_records_iterator kv_it; + + bool scanning_key_val_iter; + + /* Buffer to store (key, range_id) pairs */ + Lifo_buffer *key_buffer; + + /* This manages key buffer allocation and sizing for us */ + Buffer_manager *buf_manager; - /* Buffer to store rowids, or (rowid, range_id) pairs */ - uchar *rowids_buf; - uchar *rowids_buf_cur; /* Current position when reading/writing */ - uchar *rowids_buf_last; /* When reading: end of used buffer space */ - uchar *rowids_buf_end; /* End of the buffer */ + Key_parameters keypar; /* index scan and lookup tuple parameters */ - bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */ + /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ + bool is_mrr_assoc; + + /* Range sequence iteration members */ + RANGE_SEQ_IF mrr_funcs; + range_seq_t mrr_iter; + + /* TRUE == reached eof when enumerating ranges */ + bool source_exhausted; + + /* + Following members are for interrupt_read()/resume_read(). The idea is that + in some cases index scan that is done by this object is interrupted by + rnd_pos() calls made by Mrr_ordered_rndpos_reader. The problem is that + we're sharing handler->record[0] with that object, and it destroys its + contents. + We need to save/restore our current + - index tuple (for pushed index condition checks) + - clustered primary key values (again, for pushed index condition checks) + - rowid of the last record we've retrieved (in case this rowid matches + multiple ranges and we'll need to return it again) + */ + bool support_scan_interruptions; + /* Space where we save the rowid of the last record we've returned */ + uchar *saved_rowid; + + /* TRUE <=> saved_rowid has the last saved rowid */ + bool have_saved_rowid; + + uchar *saved_key_tuple; /* Saved current key tuple */ + uchar *saved_primary_key; /* Saved current primary key tuple */ + + static int compare_keys(void* arg, uchar* key1, uchar* key2); + static int compare_keys_reverse(void* arg, uchar* key1, uchar* key2); + + friend class Key_value_records_iterator; + friend class DsMrr_impl; + friend class Mrr_ordered_rndpos_reader; +}; + + +/* + A reader that gets rowids from an Mrr_index_reader, and then sorts them + before getting full records with handler->rndpos() calls. +*/ + +class Mrr_ordered_rndpos_reader : public Mrr_reader +{ +public: + int init(handler *file, Mrr_index_reader *index_reader, uint mode, + Lifo_buffer *buf); + int get_next(range_id_t *range_info); + int refill_buffer(bool initial); +private: + handler *file; /* Handler to use */ + + /* This what we get (rowid, range_info) pairs from */ + Mrr_index_reader *index_reader; + + /* index_reader->get_next() puts rowid here */ + uchar *index_rowid; + + /* TRUE <=> index_reader->refill_buffer() call has returned EOF */ + bool index_reader_exhausted; + + /* + TRUE <=> We should call index_reader->refill_buffer(). This happens if + 1. we've made index_reader->get_next() call which returned EOF + 2. we haven't made any index_reader calls (and our first call should + be index_reader->refill_buffer(initial=TRUE) + */ + bool index_reader_needs_refill; - /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */ + /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ bool is_mrr_assoc; + + /* + When reading from ordered rowid buffer: the rowid element of the last + buffer element that has rowid identical to this one. + */ + uchar *last_identical_rowid; + + /* Buffer to store (rowid, range_id) pairs */ + Lifo_buffer *rowid_buffer; + + int refill_from_index_reader(); +}; + + +/* + A primitive "factory" of various Mrr_*_reader classes (the point is to + get various kinds of readers without having to allocate them on the heap) +*/ + +class Mrr_reader_factory +{ +public: + Mrr_ordered_rndpos_reader ordered_rndpos_reader; + Mrr_ordered_index_reader ordered_index_reader; + Mrr_simple_index_reader simple_index_reader; +}; + + +#define DSMRR_IMPL_SORT_KEYS HA_MRR_IMPLEMENTATION_FLAG1 +#define DSMRR_IMPL_SORT_ROWIDS HA_MRR_IMPLEMENTATION_FLAG2 + +/* + DS-MRR implementation for one table. Create/use one object of this class for + each ha_{myisam/innobase/etc} object. That object will be further referred to + as "the handler" + + DsMrr_impl supports has the following execution strategies: + + - Bypass DS-MRR, pass all calls to default MRR implementation, which is + an MRR-to-non-MRR call converter. + - Key-Ordered Retrieval + - Rowid-Ordered Retrieval + + DsMrr_impl will use one of the above strategies, or a combination of them, + according to the following diagram: + + (mrr function calls) + | + +----------------->-----------------+ + | | + ___________v______________ _______________v________________ + / default: use lookup keys \ / KEY-ORDERED RETRIEVAL: \ + | (or ranges) in whatever | | sort lookup keys and then make | + | order they are supplied | | index lookups in index order | + \__________________________/ \________________________________/ + | | | | | + +---<---+ | +--------------->-----------|----+ + | | | | + | | +---------------+ | + | ______v___ ______ | _______________v_______________ + | / default: read \ | / ROWID-ORDERED RETRIEVAL: \ + | | table records | | | Before reading table records, | + v | in random order | v | sort their rowids and then | + | \_________________/ | | read them in rowid order | + | | | \_______________________________/ + | | | | + | | | | + +-->---+ | +----<------+-----------<--------+ + | | | + v v v + (table records and range_ids) + + The choice of strategy depends on MRR scan properties, table properties + (whether we're scanning clustered primary key), and @@optimizer_switch + settings. + + Key-Ordered Retrieval + --------------------- + The idea is: if MRR scan is essentially a series of lookups on + + tbl.key=value1 OR tbl.key=value2 OR ... OR tbl.key=valueN + + then it makes sense to collect and order the set of lookup values, i.e. + + sort(value1, value2, .. valueN) + + and then do index lookups in index order. This results in fewer index page + fetch operations, and we also can avoid making multiple index lookups for the + same value. That is, if value1=valueN we can easily discover that after + sorting and make one index lookup for them instead of two. + + Rowid-Ordered Retrieval + ----------------------- + If we do a regular index scan or a series of index lookups, we'll be hitting + table records at random. For disk-based engines, this is much slower than + reading the same records in disk order. We assume that disk ordering of + rows is the same as ordering of their rowids (which is provided by + handler::cmp_ref()) + In order to retrieve records in different order, we must separate index + scanning and record fetching, that is, MRR scan uses the following steps: + + 1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and + fill a buffer with {rowid, range_id} pairs + 2. Sort the buffer by rowid value + 3. for each {rowid, range_id} pair in the buffer + get record by rowid and return the {record, range_id} pair + 4. Repeat the above steps until we've exhausted the list of ranges we're + scanning. - bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */ + Buffer space management considerations + -------------------------------------- + With regards to buffer/memory management, MRR interface specifies that + - SQL layer provides multi_range_read_init() with buffer of certain size. + - MRR implementation may use (i.e. have at its disposal till the end of + the MRR scan) all of the buffer, or return the unused end of the buffer + to SQL layer. + + DS-MRR needs buffer in order to accumulate and sort rowids and/or keys. When + we need to accumulate/sort only keys (or only rowids), it is fairly trivial. + + When we need to accumulate/sort both keys and rowids, efficient buffer use + gets complicated. We need to: + - First, accumulate keys and sort them + - Then use the keys (smaller values go first) to obtain rowids. A key is not + needed after we've got matching rowids for it. + - Make sure that rowids are accumulated at the front of the buffer, so that we + can return the end part of the buffer to SQL layer, should there be too + few rowid values to occupy the buffer. + + All of these goals are achieved by using the following scheme: + + | | We get an empty buffer from SQL layer. + + | *-| + | *----| First, we fill the buffer with keys. Key_buffer + | *-------| part grows from end of the buffer space to start + | *----------| (In this picture, the buffer is big enough to + | *-------------| accomodate all keys and even have some space left) + + | *=============| We want to do key-ordered index scan, so we sort + the keys + + |-x *===========| Then we use the keys get rowids. Rowids are + |----x *========| stored from start of buffer space towards the end. + |--------x *=====| The part of the buffer occupied with keys + |------------x *===| gradually frees up space for rowids. In this + |--------------x *=| picture we run out of keys before we've ran out + |----------------x | of buffer space (it can be other way as well). + + |================x | Then we sort the rowids. + + | |~~~| The unused part of the buffer is at the end, so + we can return it to the SQL layer. + + |================* Sorted rowids are then used to read table records + in disk order + +*/ + +class DsMrr_impl +{ public: + typedef void (handler::*range_check_toggle_func_t)(bool on); + + DsMrr_impl() + : secondary_file(NULL) {}; + void init(handler *h_arg, TABLE *table_arg) { - h= h_arg; + primary_file= h_arg; table= table_arg; } - int dsmrr_init(handler *h, RANGE_SEQ_IF *seq_funcs, void *seq_init_param, - uint n_ranges, uint mode, HANDLER_BUFFER *buf); + int dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, uint mode, + HANDLER_BUFFER *buf); void dsmrr_close(); - int dsmrr_fill_buffer(); - int dsmrr_next(char **range_info); + int dsmrr_next(range_id_t *range_info); - ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint *bufsz, - uint *flags, COST_VECT *cost); + ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts, + uint *bufsz, uint *flags, COST_VECT *cost); ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, void *seq_init_param, uint n_ranges, uint *bufsz, uint *flags, COST_VECT *cost); + + int dsmrr_explain_info(uint mrr_mode, char *str, size_t size); private: + /* Buffer to store (key, range_id) pairs */ + Lifo_buffer *key_buffer; + + /* + The "owner" handler object (the one that is expected to "own" this object + and call its functions). + */ + handler *primary_file; + TABLE *table; /* Always equal to primary_file->table */ + + /* + Secondary handler object. (created when needed, we need it when we need + to run both index scan and rnd_pos() scan at the same time) + */ + handler *secondary_file; + + uint keyno; /* index we're running the scan on */ + /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ + bool is_mrr_assoc; + + Mrr_reader_factory reader_factory; + + Mrr_reader *strategy; + bool strategy_exhausted; + + Mrr_index_reader *index_strategy; + + /* The whole buffer space that we're using */ + uchar *full_buf; + uchar *full_buf_end; + + /* + When using both rowid and key buffers: the boundary between key and rowid + parts of the buffer. This is the "original" value, actual memory ranges + used by key and rowid parts may be different because of dynamic space + reallocation between them. + */ + uchar *rowid_buffer_end; + + /* + One of the following two is used for key buffer: forward is used when + we only need key buffer, backward is used when we need both key and rowid + buffers. + */ + Forward_lifo_buffer forward_key_buf; + Backward_lifo_buffer backward_key_buf; + + /* + Buffer to store (rowid, range_id) pairs, or just rowids if + is_mrr_assoc==FALSE + */ + Forward_lifo_buffer rowid_buffer; + bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, COST_VECT *cost); bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, uint *buffer_size, COST_VECT *cost); + bool check_cpk_scan(THD *thd, uint keyno, uint mrr_flags); + + bool setup_buffer_sharing(uint key_size_in_keybuf, key_part_map key_tuple_map); + + /* Buffer_manager and its member functions */ + Buffer_manager buf_manager; + static void redistribute_buffer_space(void *dsmrr_arg); + static void reset_buffer_sizes(void *dsmrr_arg); + static void do_nothing(void *dsmrr_arg); + + Lifo_buffer* get_key_buffer() { return key_buffer; } + + friend class Key_value_records_iterator; + friend class Mrr_ordered_index_reader; + friend class Mrr_ordered_rndpos_reader; + + int setup_two_handlers(); + void close_second_handler(); }; +/** + @} (end of group DS-MRR declarations) +*/ + |