diff options
author | Sergey Petrunya <psergey@askmonty.org> | 2014-03-27 12:30:49 +0400 |
---|---|---|
committer | Sergey Petrunya <psergey@askmonty.org> | 2014-03-27 12:30:49 +0400 |
commit | ab061a2bb3723c00eb5c88ecc1cb90ee7f1458e6 (patch) | |
tree | de6934d92a0ee16aff1547b6547c22b101a6e151 /sql/sql_statistics.h | |
parent | dee11f9633be3091bd7d3c0b868e4ea1efe4ac7f (diff) | |
download | mariadb-git-ab061a2bb3723c00eb5c88ecc1cb90ee7f1458e6.tar.gz |
MDEV-5926, MDEV-4362 post-fixes:
- Histogram::find_bucket() should not walk off the end of the value range.
- Address review feedback in Histogram::point_selectivity(): different handling
for zero-width buckets, and explanations.
Diffstat (limited to 'sql/sql_statistics.h')
-rw-r--r-- | sql/sql_statistics.h | 66 |
1 files changed, 44 insertions, 22 deletions
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index da6a9035b44..d0db0a3bf33 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -151,6 +151,7 @@ private: } return 0; } + /* Find the bucket which value 'pos' falls into. */ uint find_bucket(double pos, bool first) { @@ -171,7 +172,7 @@ private: break; } - if (val > get_value(i)) + if (val > get_value(i) && i < (get_width() - 1)) i++; if (val == get_value(i)) @@ -251,6 +252,27 @@ public: @return Expected condition selectivity (a number between 0 and 1) + + @notes + [re_zero_length_buckets] If a bucket with zero value-length is in the + middle of the histogram, we will not have min==max. Example: suppose, + pos_value=0x12, and the histogram is: + + #n #n+1 #n+2 + ... 0x10 0x12 0x12 0x14 ... + | + +------------- bucket with zero value-length + + Here, we will get min=#n+1, max=#n+2, and use the multi-bucket formula. + + The problem happens at the histogram ends. if pos_value=0, and the + histogram is: + + 0x00 0x10 ... + + then min=0, max=0. This means pos_value is contained within bucket #0, + but on the other hand, histogram data says that the bucket has only one + value. */ double point_selectivity(double pos, double avg_sel) @@ -264,6 +286,16 @@ public: uint max= min; while (max + 1 < get_width() && get_value(max + 1) == pos_value) max++; + + /* + A special case: we're looking at a single bucket, and that bucket has + zero value-length. Use the multi-bucket formula (attempt to use + single-bucket formula will cause divison by zero). + + For more details see [re_zero_length_buckets] above. + */ + if (max == min && get_value(max) == ((max==0)? 0 : get_value(max-1))) + max++; if (max > min) { @@ -302,27 +334,17 @@ public: (max + 1 == get_width() ? 1.0 : (get_value(max) * inv_prec_factor)) - (min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor)); - if (current_bucket_width < 1e-16) - { - /* - A special case: we are at the first (or the last) bucket in the - histogram, the bucket's value range is a singlepoint [x,x], and - pos_value=0 (for the first bucket) or pos_value=1 (for the last). - */ - sel= avg_sel; - } - else - { - /* - So: - - each bucket has the same #rows - - values are unformly distributed across the [min_value,max_value] domain. - - If a bucket has value range that's N times bigger then average, than - each value will have to have N times fewer rows than average. - */ - sel= avg_sel * avg_bucket_width / current_bucket_width; - } + DBUG_ASSERT(current_bucket_width); /* We shouldn't get a one zero-width bucket */ + + /* + So: + - each bucket has the same #rows + - values are unformly distributed across the [min_value,max_value] domain. + + If a bucket has value range that's N times bigger then average, than + each value will have to have N times fewer rows than average. + */ + sel= avg_sel * avg_bucket_width / current_bucket_width; /* (Q: if we just follow this proportion we may end up in a situation |