diff options
author | Sergey Petrunya <psergey@askmonty.org> | 2014-03-26 17:55:00 +0400 |
---|---|---|
committer | Sergey Petrunya <psergey@askmonty.org> | 2014-03-26 17:55:00 +0400 |
commit | ad842b5f058d5342c22cdc86542baa2ae9db5e70 (patch) | |
tree | 14cf4a48c33756fe92f346a21c039ec606f0920f /sql/sql_statistics.h | |
parent | e59dec0345bba78ba83928060c20c5861cf205f1 (diff) | |
download | mariadb-git-ad842b5f058d5342c22cdc86542baa2ae9db5e70.tar.gz |
MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong
[Attempt #2]
- Use a new selectivity calculation formula in Histogram::point_selectivity.
The formula is different from the old one because it was developed from scratch.
it doesn't have any possible division-by-zero problems.
Diffstat (limited to 'sql/sql_statistics.h')
-rw-r--r-- | sql/sql_statistics.h | 92 |
1 files changed, 81 insertions, 11 deletions
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index 68aacd69d98..936f23f1091 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -113,7 +113,7 @@ class Histogram private: Histogram_type type; - uint8 size; + uint8 size; /* Size of values array, in bytes */ uchar *values; uint prec_factor() @@ -142,6 +142,7 @@ public: private: uint get_value(uint i) { + DBUG_ASSERT(i < get_width()); switch (type) { case SINGLE_PREC_HB: return (uint) (((uint8 *) values)[i]); @@ -150,7 +151,7 @@ private: } return 0; } - + /* Find the bucket which value 'pos' falls into. */ uint find_bucket(double pos, bool first) { uint val= (uint) (pos * prec_factor()); @@ -169,6 +170,10 @@ private: else break; } + + if (val > get_value(i)) + i++; + if (val == get_value(i)) { if (first) @@ -234,24 +239,89 @@ public: sel= bucket_sel * (max - min + 1); return sel; } + + + /* + Estimate selectivity of "col=const" using a histogram + + @param pos Position of the "const" between column's min_value and + max_value. This is a number in [0..1] range. + @param avg_sel Average selectivity of condition "col=const" in this table. + It is calcuated as (#non_null_values / #distinct_values). + + @return + Expected condition selectivity (a number between 0 and 1) + */ double point_selectivity(double pos, double avg_sel) { double sel; - double bucket_sel= 1.0/(get_width() + 1); + /* Find the bucket that contains the value 'pos'. */ uint min= find_bucket(pos, TRUE); + uint pos_value= (uint) (pos * prec_factor()); + + /* Find how many buckets this value occupies */ uint max= min; - while (max + 1 < get_width() && get_value(max + 1) == get_value(max)) + while (max + 1 < get_width() && get_value(max + 1) == pos_value) max++; - double inv_prec_factor= (double) 1.0 / prec_factor(); - double width= (max + 1 == get_width() ? - 1.0 : get_value(max) * inv_prec_factor) - - (min == 0 ? - 0.0 : get_value(min-1) * inv_prec_factor); - sel= avg_sel * (bucket_sel * (max + 1 - min)) / width; + + if (max > min) + { + /* + The value occupies multiple buckets. Use start_bucket ... end_bucket as + selectivity. + */ + double bucket_sel= 1.0/(get_width() + 1); + sel= bucket_sel * (max - min + 1); + } + else + { + /* + The value 'pos' fits within one single histogram bucket. + + Histogram buckets have the same numbers of rows, but they cover + different ranges of values. + + We assume that values are uniformly distributed across the [0..1] value + range. + */ + + /* + If all buckets covered value ranges of the same size, the width of + value range would be: + */ + double avg_bucket_width= 1.0 / (get_width() + 1); + + /* + Let's see what is the width of value range that our bucket is covering. + (min==max currently. they are kept in the formula just in case we + will want to extend it to handle multi-bucket case) + */ + double inv_prec_factor= (double) 1.0 / prec_factor(); + double current_bucket_width= + (max + 1 == get_width() ? 1.0 : (get_value(max) * inv_prec_factor)) - + (min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor)); + + /* + So: + - each bucket has the same #rows + - values are unformly distributed across the [min_value,max_value] domain. + + If a bucket has value range that's N times bigger then average, than + each value will have to have N times fewer rows than average. + */ + DBUG_ASSERT(current_bucket_width); + sel= avg_sel * avg_bucket_width / current_bucket_width; + + /* + (Q: if we just follow this proportion we may end up in a situation + where number of different values we expect to find in this bucket + exceeds the number of rows that this histogram has in a bucket. Are + we ok with this or we would want to have certain caps?) + */ + } return sel; } - }; |