summaryrefslogtreecommitdiff
path: root/sql/sql_statistics.h
diff options
context:
space:
mode:
authorSergey Petrunya <psergey@askmonty.org>2014-03-27 13:08:00 +0400
committerSergey Petrunya <psergey@askmonty.org>2014-03-27 13:08:00 +0400
commit79a8a6130b0c43e98a64a1fde8f277e0df06da5d (patch)
tree2f1f0f604942584382be90140695865f890d88c9 /sql/sql_statistics.h
parent0d67aafaa2c383b4d2d76f8621109f8adbfb2532 (diff)
downloadmariadb-git-79a8a6130b0c43e98a64a1fde8f277e0df06da5d.tar.gz
Code cleanup:
- Move [some] engine-agnostic tests from t/selectivity.test to t/selectivity_no_engine.test - Move Histogram::point_selectivity to sql_statistics.cc
Diffstat (limited to 'sql/sql_statistics.h')
-rw-r--r--sql/sql_statistics.h112
1 files changed, 1 insertions, 111 deletions
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h
index d0db0a3bf33..331e3559203 100644
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@@ -241,120 +241,10 @@ public:
return sel;
}
-
/*
Estimate selectivity of "col=const" using a histogram
-
- @param pos Position of the "const" between column's min_value and
- max_value. This is a number in [0..1] range.
- @param avg_sel Average selectivity of condition "col=const" in this table.
- It is calcuated as (#non_null_values / #distinct_values).
-
- @return
- Expected condition selectivity (a number between 0 and 1)
-
- @notes
- [re_zero_length_buckets] If a bucket with zero value-length is in the
- middle of the histogram, we will not have min==max. Example: suppose,
- pos_value=0x12, and the histogram is:
-
- #n #n+1 #n+2
- ... 0x10 0x12 0x12 0x14 ...
- |
- +------------- bucket with zero value-length
-
- Here, we will get min=#n+1, max=#n+2, and use the multi-bucket formula.
-
- The problem happens at the histogram ends. if pos_value=0, and the
- histogram is:
-
- 0x00 0x10 ...
-
- then min=0, max=0. This means pos_value is contained within bucket #0,
- but on the other hand, histogram data says that the bucket has only one
- value.
*/
-
- double point_selectivity(double pos, double avg_sel)
- {
- double sel;
- /* Find the bucket that contains the value 'pos'. */
- uint min= find_bucket(pos, TRUE);
- uint pos_value= (uint) (pos * prec_factor());
-
- /* Find how many buckets this value occupies */
- uint max= min;
- while (max + 1 < get_width() && get_value(max + 1) == pos_value)
- max++;
-
- /*
- A special case: we're looking at a single bucket, and that bucket has
- zero value-length. Use the multi-bucket formula (attempt to use
- single-bucket formula will cause divison by zero).
-
- For more details see [re_zero_length_buckets] above.
- */
- if (max == min && get_value(max) == ((max==0)? 0 : get_value(max-1)))
- max++;
-
- if (max > min)
- {
- /*
- The value occupies multiple buckets. Use start_bucket ... end_bucket as
- selectivity.
- */
- double bucket_sel= 1.0/(get_width() + 1);
- sel= bucket_sel * (max - min + 1);
- }
- else
- {
- /*
- The value 'pos' fits within one single histogram bucket.
-
- Histogram buckets have the same numbers of rows, but they cover
- different ranges of values.
-
- We assume that values are uniformly distributed across the [0..1] value
- range.
- */
-
- /*
- If all buckets covered value ranges of the same size, the width of
- value range would be:
- */
- double avg_bucket_width= 1.0 / (get_width() + 1);
-
- /*
- Let's see what is the width of value range that our bucket is covering.
- (min==max currently. they are kept in the formula just in case we
- will want to extend it to handle multi-bucket case)
- */
- double inv_prec_factor= (double) 1.0 / prec_factor();
- double current_bucket_width=
- (max + 1 == get_width() ? 1.0 : (get_value(max) * inv_prec_factor)) -
- (min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor));
-
- DBUG_ASSERT(current_bucket_width); /* We shouldn't get a one zero-width bucket */
-
- /*
- So:
- - each bucket has the same #rows
- - values are unformly distributed across the [min_value,max_value] domain.
-
- If a bucket has value range that's N times bigger then average, than
- each value will have to have N times fewer rows than average.
- */
- sel= avg_sel * avg_bucket_width / current_bucket_width;
-
- /*
- (Q: if we just follow this proportion we may end up in a situation
- where number of different values we expect to find in this bucket
- exceeds the number of rows that this histogram has in a bucket. Are
- we ok with this or we would want to have certain caps?)
- */
- }
- return sel;
- }
+ double point_selectivity(double pos, double avg_sel);
};