MDEV-26849: JSON Histograms: point selectivity estimates are off

.. for non-existent values. Handle this special case.
author: Sergei Petrunia <psergey@askmonty.org> 2021-10-22 19:43:19 +0300
committer: Sergei Petrunia <psergey@askmonty.org> 2022-01-19 18:10:11 +0300
commit: 05877df472fa95cab763dfb1299e81681a12da73 (patch)
tree: 00753389e2ddeb3931cb3fdb605b2c92064373b1
parent: f3f78bed8530e1e858d5ed87054f2ac672760824 (diff)
download: mariadb-git-05877df472fa95cab763dfb1299e81681a12da73.tar.gz
8 files changed, 142 insertions, 10 deletions
diff --git a/mysql-test/main/statistics.result b/mysql-test/main/statistics.result
index 7556900f4a2..3299c8904a9 100644
--- a/mysql-test/main/statistics.result
+++ b/mysql-test/main/statistics.result
@@ -1885,6 +1885,7 @@ t1	id	1	17384	0.0000	4.0000	14.0000	0.15705,0.15711,0.21463,0.15705,0.15711,0.15
 explain select * from t1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	229376	
+drop table t0;
 drop table t1;
 set analyze_sample_percentage=@save_analyze_sample_percentage;
 set histogram_size=@save_histogram_size;
diff --git a/mysql-test/main/statistics.test b/mysql-test/main/statistics.test
index cbb923bee66..e39a95300af 100644
--- a/mysql-test/main/statistics.test
+++ b/mysql-test/main/statistics.test
@@ -1019,7 +1019,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 from mysql.column_stats;
 explain select * from t1;
 
-
+drop table t0;
 drop table t1;
 set analyze_sample_percentage=@save_analyze_sample_percentage;
 set histogram_size=@save_histogram_size;
diff --git a/mysql-test/main/statistics_json.result b/mysql-test/main/statistics_json.result
index 3cd9bfb181f..1e7e0d88e72 100644
--- a/mysql-test/main/statistics_json.result
+++ b/mysql-test/main/statistics_json.result
@@ -4148,6 +4148,7 @@ t1	id	1	17384	0.0000	4.0000	14.0000	{
 explain select * from t1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	229376	
+drop table t0;
 drop table t1;
 set analyze_sample_percentage=@save_analyze_sample_percentage;
 set histogram_size=@save_histogram_size;
@@ -7530,3 +7531,104 @@ select c from t1 where c > '1';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	16	16.00	75.00	75.00	Using where
 drop table t1;
+#
+# MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values
+#
+#
+create table t0(a int);
+insert into t0 (a) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(a int);
+insert into t1 select 100*A.a from t0 A, t0 B, t0 C;
+select a, count(*) from t1 group by a order by a;
+a	count(*)
+0	100
+100	100
+200	100
+300	100
+400	100
+500	100
+600	100
+700	100
+800	100
+900	100
+set histogram_type=json_hb, histogram_size=default;
+analyze table t1 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t1	analyze	status	Engine-independent statistics collected
+test.t1	analyze	status	OK
+select * from mysql.column_stats where table_name='t1';
+db_name	table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	hist_size	hist_type	histogram
+test	t1	a	0	900	0.0000	4.0000	100.0000	10	JSON_HB	{
+  "histogram_hb_v2": [
+    {
+      "start": "0",
+      "size": 0.1,
+      "ndv": 1
+    },
+    {
+      "start": "100",
+      "size": 0.1,
+      "ndv": 1
+    },
+    {
+      "start": "200",
+      "size": 0.1,
+      "ndv": 1
+    },
+    {
+      "start": "300",
+      "size": 0.1,
+      "ndv": 1
+    },
+    {
+      "start": "400",
+      "size": 0.1,
+      "ndv": 1
+    },
+    {
+      "start": "500",
+      "size": 0.1,
+      "ndv": 1
+    },
+    {
+      "start": "600",
+      "size": 0.1,
+      "ndv": 1
+    },
+    {
+      "start": "700",
+      "size": 0.1,
+      "ndv": 1
+    },
+    {
+      "start": "800",
+      "size": 0.1,
+      "ndv": 1
+    },
+    {
+      "start": "900",
+      "end": "900",
+      "size": 0.1,
+      "ndv": 1
+    }
+  ]
+}
+analyze select * from t1 where a=0;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	1000.00	10.00	10.00	Using where
+analyze select * from t1 where a=50;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	1000.00	0.10	0.00	Using where
+analyze select * from t1 where a=70;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	1000.00	0.10	0.00	Using where
+analyze select * from t1 where a=100;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	1000.00	10.00	10.00	Using where
+analyze select * from t1 where a=150;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	1000.00	0.10	0.00	Using where
+analyze select * from t1 where a=200;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	1000.00	10.00	10.00	Using where
+drop table t0,t1;
diff --git a/mysql-test/main/statistics_json.test b/mysql-test/main/statistics_json.test
index 352150e8497..e968c910f4b 100644
--- a/mysql-test/main/statistics_json.test
+++ b/mysql-test/main/statistics_json.test
@@ -295,3 +295,25 @@ select c from t1 where c > '1';
 
 drop table t1;
 
+--echo #
+--echo # MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values
+--echo #
+--echo #
+
+create table t0(a int);
+insert into t0 (a) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(a int);
+insert into t1 select 100*A.a from t0 A, t0 B, t0 C;
+select a, count(*) from t1 group by a order by a;
+set histogram_type=json_hb, histogram_size=default;
+analyze table t1 persistent for all;
+select * from mysql.column_stats where table_name='t1';
+analyze select * from t1 where a=0;
+analyze select * from t1 where a=50;
+analyze select * from t1 where a=70;
+analyze select * from t1 where a=100;
+analyze select * from t1 where a=150;
+analyze select * from t1 where a=200;
+
+drop table t0,t1;
+
diff --git a/sql/opt_histogram_json.cc b/sql/opt_histogram_json.cc
index 1793aa7df0d..89af9fe3c3f 100644
--- a/sql/opt_histogram_json.cc
+++ b/sql/opt_histogram_json.cc
@@ -616,7 +616,7 @@ double position_in_interval(Field *field, const  uchar *key, uint key_len,
 
 
 double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
-                                            double avg_sel)
+                                            double avg_sel, double total_rows)
 {
   const uchar *key = endpoint->key;
   if (field->real_maybe_null())
@@ -631,9 +631,11 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
 
   if (buckets[idx].ndv == 1 && !equal)
   {
-    // The bucket has a single value and it doesn't match! Use the global
-    // average.
-    sel= avg_sel;
+    /*
+      The bucket has a single value and it doesn't match! Return a very
+      small value.
+    */
+    sel= 1.0 / total_rows;
   }
   else
   {
diff --git a/sql/opt_histogram_json.h b/sql/opt_histogram_json.h
index df7547b31ee..6771942e94e 100644
--- a/sql/opt_histogram_json.h
+++ b/sql/opt_histogram_json.h
@@ -109,7 +109,8 @@ public:
                            ulonglong size) override;
 
   double point_selectivity(Field *field, key_range *endpoint,
-                           double avg_selection) override;
+                           double avg_selection,
+                           double total_rows) override;
   double range_selectivity(Field *field, key_range *min_endp,
                            key_range *max_endp) override;
 
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index c62230a9af1..c87330316f4 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -3882,7 +3882,8 @@ double get_column_range_cardinality(Field *field,
         {
           res= col_non_nulls * 
 	       hist->point_selectivity(field, min_endp,
-                                       avg_frequency / col_non_nulls);
+                                       avg_frequency / col_non_nulls,
+                                       tab_records);
         }
       }
       else if (avg_frequency == 0.0)
@@ -3973,7 +3974,8 @@ double get_column_range_cardinality(Field *field,
 */
 
 double Histogram_binary::point_selectivity(Field *field, key_range *endpoint,
-                                           double avg_sel)
+                                           double avg_sel,
+                                           double total_records)
 {
   double sel;
   Column_statistics *col_stats= field->read_stats;
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h
index 20a9bb9dd83..c5b1aa54399 100644
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@@ -186,7 +186,8 @@ public:
 
 
   virtual double point_selectivity(Field *field, key_range *endpoint,
-                                   double avg_selection)=0;
+                                   double avg_selectivity,
+                                   double total_rows)=0;
   virtual double range_selectivity(Field *field, key_range *min_endp,
                                    key_range *max_endp)=0;
 
@@ -355,7 +356,8 @@ public:
     Estimate selectivity of "col=const" using a histogram
   */
   double point_selectivity(Field *field, key_range *endpoint,
-                           double avg_sel) override;
+                           double avg_sel,
+                           double total_rows) override;
 };
author	Sergei Petrunia <psergey@askmonty.org>	2021-10-22 19:43:19 +0300
committer	Sergei Petrunia <psergey@askmonty.org>	2022-01-19 18:10:11 +0300
commit	05877df472fa95cab763dfb1299e81681a12da73 (patch)
tree	00753389e2ddeb3931cb3fdb605b2c92064373b1
parent	f3f78bed8530e1e858d5ed87054f2ac672760824 (diff)
download	mariadb-git-05877df472fa95cab763dfb1299e81681a12da73.tar.gz