diff options
-rw-r--r-- | sql/field.cc | 72 | ||||
-rw-r--r-- | sql/field.h | 11 | ||||
-rw-r--r-- | sql/opt_range.cc | 73 | ||||
-rw-r--r-- | sql/sql_select.h | 6 | ||||
-rw-r--r-- | sql/sql_statistics.cc | 200 | ||||
-rw-r--r-- | sql/sql_statistics.h | 4 |
6 files changed, 318 insertions, 48 deletions
diff --git a/sql/field.cc b/sql/field.cc index 103d868d834..e491afd9f25 100644 --- a/sql/field.cc +++ b/sql/field.cc @@ -1274,7 +1274,24 @@ out_of_range: } -double Field_num::middle_point_pos(Field *min, Field *max) +/** + @brief + Determine the relative position of the field value in a numeric interval + + @details + The function returns a double number between 0.0 and 1.0 as the relative + position of the value of the this field in the numeric interval of [min,max]. + If the value is not in the interval the the function returns 0.0 when + the value is less than min, and, 1.0 when the value is greater than max. + + @param min value of the left end of the interval + @param max value of the right end of the interval + + @return + relative position of the field value in the numeric interval [min,max] +*/ + +double Field_num::pos_in_interval(Field *min, Field *max) { double n, d; n= val_real() - min->val_real(); @@ -6196,7 +6213,39 @@ inline ulonglong char_prefix_to_ulonglong(uchar *src) return uint8korr(src); } -double Field_str::middle_point_pos(Field *min, Field *max) +/** + @brief + Determine the relative position of the field value in a string interval + + @details + The function returns a double number between 0.0 and 1.0 as the relative + position of the value of the this field in the string interval of [min,max]. + If the value is not in the interval the the function returns 0.0 when + the value is less than min, and, 1.0 when the value is greater than max. + + @note + To calculate the relative position of the string value v in the interval + [min, max] the function first converts the beginning of these three + strings v, min, max into the strings that are used for byte comparison. + For each string not more sizeof(ulonglong) first bytes are taken + from the result of conversion. Then these bytes are interpreted as the + big-endian representation of an ulonglong integer. The values of these + integer numbers obtained for the strings v, min, max are used to calculate + the position of v in [min,max] in the same way is it's done for numeric + fields (see Field_num::pos_in_interval). + + @todo + Improve the procedure for the case when min and max have the same + beginning + + @param min value of the left end of the interval + @param max value of the right end of the interval + + @return + relative position of the field value in the string interval [min,max] +*/ + +double Field_str::pos_in_interval(Field *min, Field *max) { uchar mp_prefix[sizeof(ulonglong)]; uchar minp_prefix[sizeof(ulonglong)]; @@ -8435,7 +8484,24 @@ my_decimal *Field_bit::val_decimal(my_decimal *deciaml_value) } -double Field_bit::middle_point_pos(Field *min, Field *max) +/** + @brief + Determine the relative position of the field value in a bit interval + + @details + The function returns a double number between 0.0 and 1.0 as the relative + position of the value of the this field in the bit interval of [min,max]. + If the value is not in the interval the the function returns 0.0 when + the value is less than min, and, 1.0 when the value is greater than max. + + @param min value of the left end of the interval + @param max value of the right end of the interval + + @return + relative position of the field value in the bit interval [min,max] +*/ + +double Field_bit::pos_in_interval(Field *min, Field *max) { double n, d; n= val_real() - min->val_real(); diff --git a/sql/field.h b/sql/field.h index 550ae9d65b9..48d873beb32 100644 --- a/sql/field.h +++ b/sql/field.h @@ -723,9 +723,10 @@ public: virtual bool hash_join_is_possible() { return TRUE; } virtual bool eq_cmp_as_binary() { return TRUE; } - virtual double middle_point_pos(Field *min, Field *max) + /* Position of the field value within the interval of [min, max] */ + virtual double pos_in_interval(Field *min, Field *max) { - return (double) 1.0; + return (double) 0.5; } friend int cre_myisam(char * name, register TABLE *form, uint options, @@ -846,7 +847,7 @@ public: bool get_int(CHARSET_INFO *cs, const char *from, uint len, longlong *rnd, ulonglong unsigned_max, longlong signed_min, longlong signed_max); - double middle_point_pos(Field *min, Field *max); + double pos_in_interval(Field *min, Field *max); }; @@ -893,7 +894,7 @@ public: uint is_equal(Create_field *new_field); bool eq_cmp_as_binary() { return test(flags & BINARY_FLAG); } virtual uint length_size() { return 0; } - double middle_point_pos(Field *min, Field *max); + double pos_in_interval(Field *min, Field *max); }; /* base class for Field_string, Field_varstring and Field_blob */ @@ -2308,7 +2309,7 @@ public: { store(*((longlong *)val), TRUE); } - double middle_point_pos(Field *min, Field *max); + double pos_in_interval(Field *min, Field *max); void get_image(uchar *buff, uint length, CHARSET_INFO *cs) { get_key_image(buff, length, itRAW); } void set_image(const uchar *buff,uint length, CHARSET_INFO *cs) diff --git a/sql/opt_range.cc b/sql/opt_range.cc index 5cadd26da6e..c29a888ea6f 100644 --- a/sql/opt_range.cc +++ b/sql/opt_range.cc @@ -3222,6 +3222,26 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, * Condition selectivity module ****************************************************************************/ + +/* + Build descriptors of pseudo-indexes over columns to perform range analysis + + SYNOPSIS + create_key_parts_for_pseudo_indexes() + param IN/OUT data structure for the descriptors to be built + used_fields bitmap of columns for which the descriptors are to be built + + DESCRIPTION + For each column marked in the bitmap used_fields the function builds + a descriptor of a single-component pseudo-index over this column that + can be used for the range analysis of the predicates over this columns. + The descriptors are created in the memory of param->mem_root. + + RETURN + FALSE in the case of success + TRUE otherwise +*/ + static bool create_key_parts_for_pseudo_indexes(RANGE_OPT_PARAM *param, MY_BITMAP *used_fields) @@ -3275,6 +3295,31 @@ bool create_key_parts_for_pseudo_indexes(RANGE_OPT_PARAM *param, } +/* + Estimate the number of rows in all ranges built for a column + by the range optimizer + + SYNOPSIS + records_in_column_ranges() + param the data structure to access descriptors of pseudo indexes + built over columns used in the condition of the processed query + idx the index of the descriptor of interest in param + tree the tree representing ranges built for the interesting column + + DESCRIPTION + This function retrieves the ranges represented by the SEL_ARG 'tree' and + for each of them r it calls the function get_column_range_cardinality() + that estimates the number of expected rows in r. It is assumed that param + is the data structure containing the descriptors of pseudo-indexes that + has been built to perform range analysis of the range conditions imposed + on the columns used in the processed query, while idx is the index of the + descriptor created in 'param' exactly for the column for which 'tree' + has been built by the range optimizer. + + RETURN + the number of rows in the retrieved ranges +*/ + static double records_in_column_ranges(PARAM *param, uint idx, SEL_ARG *tree) @@ -3322,6 +3367,29 @@ double records_in_column_ranges(PARAM *param, uint idx, } +/* + Calculate the selectivity of the condition imposed on the rows of a table + + SYNOPSIS + calculate_cond_selectivity_for_table() + thd the context handle + table the table of interest + cond conditions imposed on the rows of the table + + DESCRIPTION + This function calculates the selectivity of range conditions cond imposed + on the rows of 'table' in the processed query. + The calculated selectivity is assigned to the field table->cond_selectivity. + + NOTE + Currently the selectivities of range conditions over different columns are + considered independent. + + RETURN + FALSE on success + TRUE otherwise +*/ + bool calculate_cond_selectivity_for_table(THD *thd, TABLE *table, Item *cond) { uint keynr; @@ -3338,6 +3406,11 @@ bool calculate_cond_selectivity_for_table(THD *thd, TABLE *table, Item *cond) if (thd->variables.optimizer_use_condition_selectivity > 2 && !bitmap_is_clear_all(used_fields)) { + /* + Calculate the selectivity of the range conditions not supported + by any index + */ + PARAM param; MEM_ROOT alloc; SEL_TREE *tree; diff --git a/sql/sql_select.h b/sql/sql_select.h index d52d312f14b..9cda68bd434 100644 --- a/sql/sql_select.h +++ b/sql/sql_select.h @@ -292,7 +292,8 @@ typedef struct st_join_table { /* psergey-todo: make the below have type double, like POSITION::records_read? */ ha_rows records_read; - double cond_selectivity; + /* The selectivity of the conditions that can be pushed to the table */ + double cond_selectivity; /* Startup cost for execution */ double startup_cost; @@ -774,7 +775,8 @@ typedef struct st_position :public Sql_alloc */ double records_read; - double cond_selectivity; + /* The selectivity of the pushed down conditions */ + double cond_selectivity; /* Cost accessing the table in course of the entire complete join execution, diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc index df86686f773..4df4b4d257d 100644 --- a/sql/sql_statistics.cc +++ b/sql/sql_statistics.cc @@ -250,12 +250,13 @@ public: and column_name with column_name taken out of the only parameter f of the Field* type passed to this method. After this get_stat_values looks for a row by the set key value. If the row is found the values of statistical - data columns min_value, max_value, nulls_ratio, avg_length, avg_frequency - are read into internal structures. Values of nulls_ratio, avg_length, - avg_frequency are read into the corresponding fields of the read_stat - structure from the Field object f, while values from min_value and max_value - are copied into the min_value and max_value record buffers attached to the - TABLE structure for table t. + data columns min_value, max_value, nulls_ratio, avg_length, avg_frequency, + hist_size, hist_type, histogram are read into internal structures. Values + of nulls_ratio, avg_length, avg_frequency, hist_size, hist_type, histogram + are read into the corresponding fields of the read_stat structure from + the Field object f, while values from min_value and max_value are copied + into the min_value and max_value record buffers attached to the TABLE + structure for table t. If the value of a statistical column in the found row is null, then the corresponding flag in the f->read_stat.column_stat_nulls bitmap is set off. Otherwise the flag is set on. If no row is found for the column the all flags @@ -868,11 +869,12 @@ public: @details This implementation of a purely virtual method sets the value of the - columns 'min_value', 'max_value', 'nulls_ratio', 'avg_length' and - 'avg_frequency' of the stistical table columns_stat according to the - contents of the bitmap write_stat.column_stat_nulls and the values - of the fields min_value, max_value, nulls_ratio, avg_length and - avg_frequency of the structure write_stat from the Field structure + columns 'min_value', 'max_value', 'nulls_ratio', 'avg_length', + 'avg_frequency', 'hist_size', 'hist_type' and 'histogram' of the + stistical table columns_stat according to the contents of the bitmap + write_stat.column_stat_nulls and the values of the fields min_value, + max_value, nulls_ratio, avg_length, avg_frequency, hist_size, hist_type + and histogram of the structure write_stat from the Field structure for the field 'table_field'. The value of the k-th column in the table columns_stat is set to NULL if the k-th bit in the bitmap 'column_stat_nulls' is set to 1. @@ -951,14 +953,15 @@ public: @details This implementation of a purely virtual method first looks for a record - the statistical table column_stats by its primary key set the record + in the statistical table column_stats by its primary key set in the record buffer with the help of Column_stat::set_key_fields. Then, if the row is found, the function reads the values of the columns 'min_value', - 'max_value', 'nulls_ratio', 'avg_length' and 'avg_frequency' of the - table column_stat and sets accordingly the value of the bitmap - read_stat.column_stat_nulls' and the values of the fields min_value, - max_value, nulls_ratio, avg_length and avg_frequency of the structure - read_stat from the Field structure for the field 'table_field'. + 'max_value', 'nulls_ratio', 'avg_length', 'avg_frequency', 'hist_size' and + 'hist_type" of the table column_stat and sets accordingly the value of + the bitmap read_stat.column_stat_nulls' and the values of the fields + min_value, max_value, nulls_ratio, avg_length, avg_frequency, hist_size and + hist_type of the structure read_stat from the Field structure for the field + 'table_field'. */ void get_stat_values() @@ -1022,6 +1025,22 @@ public: } } + + /** + @brief + Read histogram from of column_stats + + @details + This method first looks for a record in the statistical table column_stats + by its primary key set the record buffer with the help of + Column_stat::set_key_fields. Then, if the row is found, the function reads + the value of the column 'histogram' of the table column_stat and sets + accordingly the corresponding bit in the bitmap read_stat.column_stat_nulls. + The method assumes that the value of histogram size and the pointer to + the histogram location has been already set in the fields size and values + of read_stats->histogram. + */ + void get_histogram_value() { if (find_stat()) @@ -1238,20 +1257,24 @@ public: }; +/* + Histogram_builder is a helper class that is used to build histograms + for columns +*/ class Histogram_builder { - Field *column; - uint col_length; - ha_rows records; - Field *min_value; - Field *max_value; - Histogram *histogram; - uint hist_width; - double bucket_capacity; - uint curr_bucket; - ulonglong count; - ulonglong count_distinct; + Field *column; /* table field for which the histogram is built */ + uint col_length; /* size of this field */ + ha_rows records; /* number of records the histogram is built for */ + Field *min_value; /* pointer to the minimal value for the field */ + Field *max_value; /* pointer to the maximal value for the field */ + Histogram *histogram; /* the histogram location */ + uint hist_width; /* the number of points in the histogram */ + double bucket_capacity; /* number of rows in a bucket of the histogram */ + uint curr_bucket; /* number of the current bucket to be built */ + ulonglong count; /* number of values retrieved */ + ulonglong count_distinct; /* number of distinct values retrieved */ public: Histogram_builder(Field *col, uint col_len, ha_rows rows) @@ -1280,7 +1303,7 @@ public: { column->store_field_value((uchar *) elem, col_length); histogram->set_value(curr_bucket, - column->middle_point_pos(min_value, max_value)); + column->pos_in_interval(min_value, max_value)); curr_bucket++; while (curr_bucket != hist_width && count > bucket_capacity * (curr_bucket + 1)) @@ -1389,6 +1412,10 @@ public: return count; } + /* + @brief + Build the histogram for the elements accumulated in the container of 'tree' + */ ulonglong get_value_with_histogram(ha_rows rows) { Histogram_builder hist_builder(table_field, tree_key_length, rows); @@ -1396,11 +1423,19 @@ public: return hist_builder.get_count_distinct(); } + /* + @brief + Get the size of the histogram in bytes built for table_field + */ uint get_hist_size() { return table_field->collected_stats->histogram.get_size(); } + /* + @brief + Get the pointer to the histogram built for table_field + */ uchar *get_histogram() { return table_field->collected_stats->histogram.get_values(); @@ -1936,7 +1971,6 @@ inline bool statistics_for_command_is_needed(THD *thd) @note Currently the function always is called with the parameter is_safe set to FALSE. - */ int alloc_statistics_for_table_share(THD* thd, TABLE_SHARE *table_share, @@ -2051,6 +2085,36 @@ int alloc_statistics_for_table_share(THD* thd, TABLE_SHARE *table_share, DBUG_RETURN(0); } + +/** + @brief + Allocate memory for the histogram used by a table share + + @param + thd Thread handler + @param + table_share Table share for which the memory for histogram data is allocated + @param + is_safe TRUE <-> at any time only one thread can perform the function + + @note + The function allocates the memory for the histogram built for a table in the + table's share memory with the intention to read the data there from the + system persistent statistical table mysql.column_stats, + The memory is allocated in the table_share's mem_root. + If the parameter is_safe is TRUE then it is guaranteed that at any given time + only one thread is executed the code of the function. + + @retval + 0 If the memory for all statistical data has been successfully allocated + @retval + 1 Otherwise + + @note + Currently the function always is called with the parameter is_safe set + to FALSE. +*/ + static int alloc_histograms_for_table_share(THD* thd, TABLE_SHARE *table_share, bool is_safe) @@ -2781,6 +2845,38 @@ bool statistics_for_tables_is_needed(THD *thd, TABLE_LIST *tables) } +/** + @brief + Read histogram for a table from the persistent statistical tables + + @param + thd The thread handle + @param + table The table to read histograms for + @param + stat_tables The array of TABLE_LIST objects for statistical tables + + @details + For the statistical table columns_stats the function looks for the rows + from this table that contain statistical data on 'table'. If such rows + are found the histograms from them are read into the memory allocated + for histograms of 'table'. Later at the query processing these histogram + are supposed to be used by the optimizer. + The parameter stat_tables should point to an array of TABLE_LIST + objects for all statistical tables linked into a list. All statistical + tables are supposed to be opened. + The function is called by read_statistics_for_tables_if_needed(). + + @retval + 0 If data has been successfully read for the table + @retval + 1 Otherwise + + @note + Objects of the helper Column_stat are employed read histogram + from the statistical table column_stats now. +*/ + static int read_histograms_for_table(THD *thd, TABLE *table, TABLE_LIST *stat_tables) { @@ -3311,6 +3407,17 @@ void set_statistics_for_table(THD *thd, TABLE *table) } +/** + @brief + Get the average frequency for a column + + @param + field The column whose average frequency is required + + @retval + The required average frequency +*/ + double get_column_avg_frequency(Field * field) { double res; @@ -3337,6 +3444,27 @@ double get_column_avg_frequency(Field * field) } +/** + @brief + Estimate the number of rows in a column range using data from stat tables + + @param + field The column whose range cardinality is to be estimated + @param + min_endp The left end of the range whose cardinality is required + @param + max_endp The right end of the range whose cardinality is required + @param + range_flag The range flags + + @details + The function gets an estimate of the number of rows in a column range + using the statistical data from the table column_stats. + + @retval + The required estimate of the rows in the column range +*/ + double get_column_range_cardinality(Field *field, key_range *min_endp, key_range *max_endp, @@ -3377,8 +3505,8 @@ double get_column_range_cardinality(Field *field, Histogram *hist= &col_stats->histogram; if (hist->is_available()) { - double pos= field->middle_point_pos(col_stats->min_value, - col_stats->max_value); + double pos= field->pos_in_interval(col_stats->min_value, + col_stats->max_value); res= col_non_nulls * hist->point_selectivity(pos, avg_frequency / col_non_nulls); @@ -3396,8 +3524,8 @@ double get_column_range_cardinality(Field *field, { store_key_image_to_rec(field, (uchar *) min_endp->key, min_endp->length); - min_mp_pos= field->middle_point_pos(col_stats->min_value, - col_stats->max_value); + min_mp_pos= field->pos_in_interval(col_stats->min_value, + col_stats->max_value); } else min_mp_pos= 0.0; @@ -3405,8 +3533,8 @@ double get_column_range_cardinality(Field *field, { store_key_image_to_rec(field, (uchar *) max_endp->key, max_endp->length); - max_mp_pos= field->middle_point_pos(col_stats->min_value, - col_stats->max_value); + max_mp_pos= field->pos_in_interval(col_stats->min_value, + col_stats->max_value); } else max_mp_pos= 1.0; diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index 8c1e95df1f0..c6a72478c34 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -271,8 +271,8 @@ public: Column_statistics *column_stats; /* Array of statistical data for columns */ Index_statistics *index_stats; /* Array of statistical data for indexes */ ulong *idx_avg_frequency; /* Array of records per key for index prefixes */ - ulong total_hist_size; - uchar *histograms; /* Sequence of histograms */ + ulong total_hist_size; /* Total size of all histograms */ + uchar *histograms; /* Sequence of histograms */ }; |