diff options
author | Sergei Petrunia <psergey@askmonty.org> | 2021-10-22 19:43:19 +0300 |
---|---|---|
committer | Sergei Petrunia <psergey@askmonty.org> | 2022-01-19 18:10:11 +0300 |
commit | 05877df472fa95cab763dfb1299e81681a12da73 (patch) | |
tree | 00753389e2ddeb3931cb3fdb605b2c92064373b1 | |
parent | f3f78bed8530e1e858d5ed87054f2ac672760824 (diff) | |
download | mariadb-git-05877df472fa95cab763dfb1299e81681a12da73.tar.gz |
MDEV-26849: JSON Histograms: point selectivity estimates are off
.. for non-existent values.
Handle this special case.
-rw-r--r-- | mysql-test/main/statistics.result | 1 | ||||
-rw-r--r-- | mysql-test/main/statistics.test | 2 | ||||
-rw-r--r-- | mysql-test/main/statistics_json.result | 102 | ||||
-rw-r--r-- | mysql-test/main/statistics_json.test | 22 | ||||
-rw-r--r-- | sql/opt_histogram_json.cc | 10 | ||||
-rw-r--r-- | sql/opt_histogram_json.h | 3 | ||||
-rw-r--r-- | sql/sql_statistics.cc | 6 | ||||
-rw-r--r-- | sql/sql_statistics.h | 6 |
8 files changed, 142 insertions, 10 deletions
diff --git a/mysql-test/main/statistics.result b/mysql-test/main/statistics.result index 7556900f4a2..3299c8904a9 100644 --- a/mysql-test/main/statistics.result +++ b/mysql-test/main/statistics.result @@ -1885,6 +1885,7 @@ t1 id 1 17384 0.0000 4.0000 14.0000 0.15705,0.15711,0.21463,0.15705,0.15711,0.15 explain select * from t1; id select_type table type possible_keys key key_len ref rows Extra 1 SIMPLE t1 ALL NULL NULL NULL NULL 229376 +drop table t0; drop table t1; set analyze_sample_percentage=@save_analyze_sample_percentage; set histogram_size=@save_histogram_size; diff --git a/mysql-test/main/statistics.test b/mysql-test/main/statistics.test index cbb923bee66..e39a95300af 100644 --- a/mysql-test/main/statistics.test +++ b/mysql-test/main/statistics.test @@ -1019,7 +1019,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a from mysql.column_stats; explain select * from t1; - +drop table t0; drop table t1; set analyze_sample_percentage=@save_analyze_sample_percentage; set histogram_size=@save_histogram_size; diff --git a/mysql-test/main/statistics_json.result b/mysql-test/main/statistics_json.result index 3cd9bfb181f..1e7e0d88e72 100644 --- a/mysql-test/main/statistics_json.result +++ b/mysql-test/main/statistics_json.result @@ -4148,6 +4148,7 @@ t1 id 1 17384 0.0000 4.0000 14.0000 { explain select * from t1; id select_type table type possible_keys key key_len ref rows Extra 1 SIMPLE t1 ALL NULL NULL NULL NULL 229376 +drop table t0; drop table t1; set analyze_sample_percentage=@save_analyze_sample_percentage; set histogram_size=@save_histogram_size; @@ -7530,3 +7531,104 @@ select c from t1 where c > '1'; id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra 1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 75.00 75.00 Using where drop table t1; +# +# MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values +# +# +create table t0(a int); +insert into t0 (a) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1(a int); +insert into t1 select 100*A.a from t0 A, t0 B, t0 C; +select a, count(*) from t1 group by a order by a; +a count(*) +0 100 +100 100 +200 100 +300 100 +400 100 +500 100 +600 100 +700 100 +800 100 +900 100 +set histogram_type=json_hb, histogram_size=default; +analyze table t1 persistent for all; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status OK +select * from mysql.column_stats where table_name='t1'; +db_name table_name column_name min_value max_value nulls_ratio avg_length avg_frequency hist_size hist_type histogram +test t1 a 0 900 0.0000 4.0000 100.0000 10 JSON_HB { + "histogram_hb_v2": [ + { + "start": "0", + "size": 0.1, + "ndv": 1 + }, + { + "start": "100", + "size": 0.1, + "ndv": 1 + }, + { + "start": "200", + "size": 0.1, + "ndv": 1 + }, + { + "start": "300", + "size": 0.1, + "ndv": 1 + }, + { + "start": "400", + "size": 0.1, + "ndv": 1 + }, + { + "start": "500", + "size": 0.1, + "ndv": 1 + }, + { + "start": "600", + "size": 0.1, + "ndv": 1 + }, + { + "start": "700", + "size": 0.1, + "ndv": 1 + }, + { + "start": "800", + "size": 0.1, + "ndv": 1 + }, + { + "start": "900", + "end": "900", + "size": 0.1, + "ndv": 1 + } + ] +} +analyze select * from t1 where a=0; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 10.00 10.00 Using where +analyze select * from t1 where a=50; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 0.10 0.00 Using where +analyze select * from t1 where a=70; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 0.10 0.00 Using where +analyze select * from t1 where a=100; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 10.00 10.00 Using where +analyze select * from t1 where a=150; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 0.10 0.00 Using where +analyze select * from t1 where a=200; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 1000.00 10.00 10.00 Using where +drop table t0,t1; diff --git a/mysql-test/main/statistics_json.test b/mysql-test/main/statistics_json.test index 352150e8497..e968c910f4b 100644 --- a/mysql-test/main/statistics_json.test +++ b/mysql-test/main/statistics_json.test @@ -295,3 +295,25 @@ select c from t1 where c > '1'; drop table t1; +--echo # +--echo # MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values +--echo # +--echo # + +create table t0(a int); +insert into t0 (a) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1(a int); +insert into t1 select 100*A.a from t0 A, t0 B, t0 C; +select a, count(*) from t1 group by a order by a; +set histogram_type=json_hb, histogram_size=default; +analyze table t1 persistent for all; +select * from mysql.column_stats where table_name='t1'; +analyze select * from t1 where a=0; +analyze select * from t1 where a=50; +analyze select * from t1 where a=70; +analyze select * from t1 where a=100; +analyze select * from t1 where a=150; +analyze select * from t1 where a=200; + +drop table t0,t1; + diff --git a/sql/opt_histogram_json.cc b/sql/opt_histogram_json.cc index 1793aa7df0d..89af9fe3c3f 100644 --- a/sql/opt_histogram_json.cc +++ b/sql/opt_histogram_json.cc @@ -616,7 +616,7 @@ double position_in_interval(Field *field, const uchar *key, uint key_len, double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint, - double avg_sel) + double avg_sel, double total_rows) { const uchar *key = endpoint->key; if (field->real_maybe_null()) @@ -631,9 +631,11 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint, if (buckets[idx].ndv == 1 && !equal) { - // The bucket has a single value and it doesn't match! Use the global - // average. - sel= avg_sel; + /* + The bucket has a single value and it doesn't match! Return a very + small value. + */ + sel= 1.0 / total_rows; } else { diff --git a/sql/opt_histogram_json.h b/sql/opt_histogram_json.h index df7547b31ee..6771942e94e 100644 --- a/sql/opt_histogram_json.h +++ b/sql/opt_histogram_json.h @@ -109,7 +109,8 @@ public: ulonglong size) override; double point_selectivity(Field *field, key_range *endpoint, - double avg_selection) override; + double avg_selection, + double total_rows) override; double range_selectivity(Field *field, key_range *min_endp, key_range *max_endp) override; diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc index c62230a9af1..c87330316f4 100644 --- a/sql/sql_statistics.cc +++ b/sql/sql_statistics.cc @@ -3882,7 +3882,8 @@ double get_column_range_cardinality(Field *field, { res= col_non_nulls * hist->point_selectivity(field, min_endp, - avg_frequency / col_non_nulls); + avg_frequency / col_non_nulls, + tab_records); } } else if (avg_frequency == 0.0) @@ -3973,7 +3974,8 @@ double get_column_range_cardinality(Field *field, */ double Histogram_binary::point_selectivity(Field *field, key_range *endpoint, - double avg_sel) + double avg_sel, + double total_records) { double sel; Column_statistics *col_stats= field->read_stats; diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index 20a9bb9dd83..c5b1aa54399 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -186,7 +186,8 @@ public: virtual double point_selectivity(Field *field, key_range *endpoint, - double avg_selection)=0; + double avg_selectivity, + double total_rows)=0; virtual double range_selectivity(Field *field, key_range *min_endp, key_range *max_endp)=0; @@ -355,7 +356,8 @@ public: Estimate selectivity of "col=const" using a histogram */ double point_selectivity(Field *field, key_range *endpoint, - double avg_sel) override; + double avg_sel, + double total_rows) override; }; |