author     Sergey Petrunya <psergey@askmonty.org>    2014-03-26 17:55:00 +0400
committer  Sergey Petrunya <psergey@askmonty.org>    2014-03-26 17:55:00 +0400
commit     ad842b5f058d5342c22cdc86542baa2ae9db5e70 (patch)
tree       14cf4a48c33756fe92f346a21c039ec606f0920f /sql/sql_statistics.h
parent     e59dec0345bba78ba83928060c20c5861cf205f1 (diff)
download   mariadb-git-ad842b5f058d5342c22cdc86542baa2ae9db5e70.tar.gz
MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong
[Attempt #2] - Use a new selectivity calculation formula in Histogram::point_selectivity. The formula was developed from scratch and, unlike the old one, it has no possible division-by-zero problems.
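
In rough terms, the new estimate for a constant that falls inside a single bucket is (a sketch of the idea only; the authoritative code is in the diff below):

    sel = avg_sel * avg_bucket_width / current_bucket_width

i.e. a bucket whose value range is N times wider than average yields a point selectivity N times smaller than the table-wide average. current_bucket_width is asserted to be non-zero, and the case where the constant spans several buckets is handled separately, without any division.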
Diffstat (limited to 'sql/sql_statistics.h')
-rw-r--r--  sql/sql_statistics.h  92
1 file changed, 81 insertions(+), 11 deletions(-)
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h
index 68aacd69d98..936f23f1091 100644
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@@ -113,7 +113,7 @@ class Histogram
private:
Histogram_type type;
- uint8 size;
+ uint8 size; /* Size of values array, in bytes */
uchar *values;
uint prec_factor()
@@ -142,6 +142,7 @@ public:
private:
uint get_value(uint i)
{
+ DBUG_ASSERT(i < get_width());
switch (type) {
case SINGLE_PREC_HB:
return (uint) (((uint8 *) values)[i]);
@@ -150,7 +151,7 @@ private:
}
return 0;
}
-
+ /* Find the bucket that the value 'pos' falls into. */
uint find_bucket(double pos, bool first)
{
uint val= (uint) (pos * prec_factor());
@@ -169,6 +170,10 @@ private:
else
break;
}
+
+ if (val > get_value(i))
+ i++;
+
if (val == get_value(i))
{
if (first)
@@ -234,24 +239,89 @@ public:
sel= bucket_sel * (max - min + 1);
return sel;
}
+
+
+ /*
+ Estimate selectivity of "col=const" using a histogram
+
+ @param pos Position of the "const" between column's min_value and
+ max_value. This is a number in the [0..1] range.
+ @param avg_sel Average selectivity of condition "col=const" in this table.
+ It is calculated as (#non_null_values / #distinct_values).
+
+ @return
+ Expected condition selectivity (a number between 0 and 1)
+ */
double point_selectivity(double pos, double avg_sel)
{
double sel;
- double bucket_sel= 1.0/(get_width() + 1);
+ /* Find the bucket that contains the value 'pos'. */
uint min= find_bucket(pos, TRUE);
+ uint pos_value= (uint) (pos * prec_factor());
+
+ /* Find how many buckets this value occupies */
uint max= min;
- while (max + 1 < get_width() && get_value(max + 1) == get_value(max))
+ while (max + 1 < get_width() && get_value(max + 1) == pos_value)
max++;
- double inv_prec_factor= (double) 1.0 / prec_factor();
- double width= (max + 1 == get_width() ?
- 1.0 : get_value(max) * inv_prec_factor) -
- (min == 0 ?
- 0.0 : get_value(min-1) * inv_prec_factor);
- sel= avg_sel * (bucket_sel * (max + 1 - min)) / width;
+
+ if (max > min)
+ {
+ /*
+ The value occupies multiple buckets. Use the fraction of buckets it
+ spans (start_bucket ... end_bucket) as the selectivity.
+ */
+ double bucket_sel= 1.0/(get_width() + 1);
+ sel= bucket_sel * (max - min + 1);
+ }
+ else
+ {
+ /*
+ The value 'pos' fits within a single histogram bucket.
+
+ Histogram buckets have the same number of rows, but they cover
+ different ranges of values.
+
+ We assume that values are uniformly distributed across the [0..1] value
+ range.
+ */
+
+ /*
+ If all buckets covered value ranges of the same size, the width of
+ each bucket's value range would be:
+ */
+ double avg_bucket_width= 1.0 / (get_width() + 1);
+
+ /*
+ Let's see how wide the value range covered by our bucket actually is.
+ (min==max currently; they are kept in the formula just in case we
+ want to extend it to handle the multi-bucket case)
+ */
+ double inv_prec_factor= (double) 1.0 / prec_factor();
+ double current_bucket_width=
+ (max + 1 == get_width() ? 1.0 : (get_value(max) * inv_prec_factor)) -
+ (min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor));
+
+ /*
+ So:
+ - each bucket has the same #rows
+ - values are uniformly distributed across the [min_value,max_value] domain.
+
+ If a bucket covers a value range that's N times bigger than average, then
+ each value in it will have N times fewer rows than average.
+ */
+ DBUG_ASSERT(current_bucket_width);
+ sel= avg_sel * avg_bucket_width / current_bucket_width;
+
+ /*
+ (Q: if we just follow this proportion, we may end up in a situation
+ where the number of different values we expect to find in this bucket
+ exceeds the number of rows that the histogram has per bucket. Are
+ we ok with this, or do we want to put a cap on it?)
+ */
+ }
return sel;
}
-
};
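
For illustration, here is a minimal standalone sketch of the single-bucket arithmetic this patch introduces. It is not the class method from the diff: the function name, the bucket widths and the avg_sel value below are made-up examples.

#include <cassert>
#include <cstdio>

/*
  Hypothetical standalone version of the single-bucket case of
  Histogram::point_selectivity():
    sel = avg_sel * avg_bucket_width / current_bucket_width
  current_bucket_width must be non-zero, mirroring the DBUG_ASSERT in the
  patch.
*/
static double point_sel_single_bucket(double avg_sel,
                                      double avg_bucket_width,
                                      double current_bucket_width)
{
  assert(current_bucket_width > 0.0);           /* no division by zero */
  return avg_sel * avg_bucket_width / current_bucket_width;
}

int main()
{
  /*
    Example: a histogram with 3 stored bucket bounds splits the [0..1]
    value domain into 4 ranges, so avg_bucket_width = 1.0/4 = 0.25.
    Assume the average selectivity of "col=const" is 0.001.
  */
  double avg_sel= 0.001;
  double avg_width= 0.25;

  /* Bucket twice as wide as average: each value is half as frequent. */
  printf("%g\n", point_sel_single_bucket(avg_sel, avg_width, 0.50));  /* 0.0005 */
  /* Bucket half as wide as average: each value is twice as frequent. */
  printf("%g\n", point_sel_single_bucket(avg_sel, avg_width, 0.125)); /* 0.002 */
  return 0;
}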