MDEV-22360: Sufficient conditions for accurate calculation of join cardinality10.6-mdev22360

The aim of this task is to check if the estimate of join cardinality are accurate or not. The implementation to check if we have the accurate estimate of the join cardinality is a simple one, we have to walk over the WHERE clause. The approach can be broken into 2 cases: Case 1: WHERE clause is an AND conjunct For an AND item at the top level, we need to walk over all the top level conjuncts and call walk individually on them. This is done in such a way because for an AND conjunct at the top level we may have accurate selectivity, even if the predicate belongs to a different column. Eg: t1.a > 10 and t2.a < 5. For this AND item we will have accurate selectivities. For AND conjuncts (not at the top level), the entire conjunct needs to be resolved to one column. Eg: t1.a = t2.a AND ( (t1.a > 5 AND t2.a < 10) OR t1.a <= 0) Case 2: 2a) OR item For an OR item at the top level, we need to make sure that all the columns inside the OR conjunct need to belong to one column directly or indirectly. This needs to happen for an OR conjunct even if it is not at the top level. Eg: (t1.a > 5 or t1.a < 0); 2b) Single predicate at the top level Eg: t1.a= t2.a [ For this case we need to make sure we know number of distinct values for t1.a and t2.a ] t1.a > 5 [ sargable predicate, get the estimate from the range optimizer ] We need to make sure that for the predicates in the WHERE clause we have estimates either from the first component of the index or from the EITS. The implementation of these is covered with the callback function passed to walk function.
author: Varun Gupta <varun.gupta@mariadb.com> 2020-05-20 18:59:52 +0530
committer: Varun Gupta <varun.gupta@mariadb.com> 2020-11-26 01:20:49 +0530
commit: efb171c2eac06508489804601f20c702bca1954c (patch)
tree: 7da131efa4776cce13a51b632dad3f6df961ec5e /sql
parent: c498250888ec126fddda2867d1239b2a7734482f (diff)
download: mariadb-git-10.6-mdev22360.tar.gz
13 files changed, 646 insertions, 40 deletions
diff --git a/sql/field.cc b/sql/field.cc
index fe3aebce05d..91dc1deb360 100644
--- a/sql/field.cc
+++ b/sql/field.cc
@@ -36,6 +36,8 @@
 #include "tztime.h"                      // struct Time_zone
 #include "filesort.h"                    // change_double_for_sort
 #include "log_event.h"                   // class Table_map_log_event
+#include "sql_statistics.h"
+#include "sql_partition.h"
 #include <m_ctype.h>
 
 // Maximum allowed exponent value for converting string to decimal
@@ -1851,6 +1853,7 @@ Field::Field(uchar *ptr_arg,uint32 length_arg,uchar *null_ptr_arg,
   field_index= 0;
   cond_selectivity= 1.0;
   next_equal_field= NULL;
+  stats_available= 0;
 }
 
 
@@ -11358,6 +11361,203 @@ void Field::print_key_value_binary(String *out, const uchar* key, uint32 length)
 }
 
 
+/*
+  @brief
+    Check if statistics for a column are available via keys
+
+  @details
+    If the column is the first component of a key, then statistics
+    for the column are available from the range optimizer.
+    Sets the bit in Field::stats_table
+      a) NDV is available
+      b) Statistics are available for the non-const argument of a
+         range predicate
+*/
+
+void Field::statistics_available_via_keys()
+{
+  uint key;
+  key_map::Iterator it(key_start);
+  while ((key= it++) != key_map::Iterator::BITMAP_END)
+  {
+    KEY *keyinfo= table->key_info + key;
+    if (keyinfo->usable_key_parts == 1 &&
+        field_index + 1 == keyinfo->key_part->fieldnr)
+    {
+      stats_available|= (1 << STATISTICS_FOR_RANGE_PREDICATES_AVAILABLE);
+      return;
+    }
+  }
+}
+
+
+/*
+  @brief
+    Check if statistics for a column are available via stat tables
+*/
+
+void Field::statistics_available_via_stat_tables()
+{
+  THD *thd= table->in_use;
+  if (!(check_eits_preferred(thd) &&
+        thd->variables.optimizer_use_condition_selectivity > 2))
+    return;
+  if (!(table->stats_is_read &&
+        read_stats && !read_stats->no_stat_values_provided()))
+    return;
+  stats_available|= (1 << STATISTICS_FOR_RANGE_PREDICATES_AVAILABLE);
+  if (!read_stats->is_null(COLUMN_STAT_AVG_FREQUENCY))
+    stats_available|= (1 << STATISTICS_FOR_NDV_AVAILABLE);
+}
+
+
+/*
+  @brief
+    Check if statistics for a column are available via indexes or stat tables
+
+  @retval
+    TRUE   : statistics available for the column
+    FALSE  : OTHERWISE
+*/
+
+bool Field::is_statistics_available_for_range_predicates()
+{
+  if (!(stats_available & (1 << STATISTICS_CACHED)))
+  {
+    is_statistics_available();
+    stats_available|= (1 << STATISTICS_CACHED);
+  }
+  return (stats_available & (1 << STATISTICS_FOR_RANGE_PREDICATES_AVAILABLE));
+}
+
+
+void Field::is_statistics_available()
+{
+  statistics_available_via_keys();
+  statistics_available_via_stat_tables();
+  is_ndv_available_via_keys();
+  is_ndv_available_via_stat_tables();
+}
+
+/*
+  @brief
+    Check if ndv for a column are available via indexes or stat tables
+
+  @retval
+    TRUE   : ndv available for the column
+    FALSE  : OTHERWISE
+*/
+
+bool Field::is_ndv_available()
+{
+  if (!(stats_available & (1 << STATISTICS_CACHED)))
+  {
+    is_statistics_available();
+    stats_available|= (1 << STATISTICS_CACHED);
+  }
+  return (stats_available & (1 << STATISTICS_FOR_NDV_AVAILABLE));
+}
+
+
+/*
+  @brief
+    Check if number of distinct values(ndv) for a column are available via keys
+
+  @retval
+    TRUE     : ndv available from keys
+    FALSE    : otherwise
+*/
+
+bool Field::is_ndv_available_via_keys()
+{
+  uint key;
+  key_map::Iterator it(key_start);
+  while ((key= it++) != key_map::Iterator::BITMAP_END)
+  {
+    KEY *keyinfo= table->key_info + key;
+    if (is_first_component_of_key(keyinfo) && keyinfo->actual_rec_per_key(0))
+    {
+      stats_available|= (1 << STATISTICS_FOR_NDV_AVAILABLE);
+      return true;
+    }
+  }
+  return false;
+}
+
+
+/*
+  @brief
+    Check if ndv for a column are available via statistical tables
+
+  @retval
+    TRUE     : ndv available from statistical tables
+    FALSE    : otherwise
+*/
+
+bool Field::is_ndv_available_via_stat_tables()
+{
+  if (!check_eits_preferred(table->in_use))
+    return false;
+  if (!(read_stats && !read_stats->no_stat_values_provided() &&
+        !read_stats->is_null(COLUMN_STAT_AVG_FREQUENCY)))
+    return false;
+  stats_available|= (1 << STATISTICS_FOR_NDV_AVAILABLE);
+  return true;
+}
+
+
+/*
+  @brief
+    Checks if a field is the first component of a given key
+
+  @param
+    key      given key
+
+  @retval
+    TRUE     : field is the first component of the given key
+    FALSE    : otherwise
+*/
+
+bool Field::is_first_component_of_key(KEY *key)
+{
+  DBUG_ASSERT(key->usable_key_parts >= 1);
+  return field_index + 1 == key->key_part->fieldnr;
+}
+
+
+/*
+  Check whether EITS statistics for a field are usable or not
+
+  TRUE : Use EITS for the columns
+  FALSE: Otherwise
+*/
+
+bool Field::is_eits_usable()
+{
+  // check if column_statistics was allocated for this field
+  if (!read_stats)
+    return false;
+
+  DBUG_ASSERT(table->stats_is_read);
+
+  /*
+    (1): checks if we have EITS statistics for a particular column
+    (2): Don't use EITS for GEOMETRY columns
+    (3): Disabling reading EITS statistics for columns involved in the
+         partition list of a table. We assume the selectivity for
+         such columns would be handled during partition pruning.
+  */
+
+  return !read_stats->no_stat_values_provided() &&        //(1)
+          type() != MYSQL_TYPE_GEOMETRY &&              //(2)
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+    (!table->part_info ||
+     !table->part_info->field_in_partition_expr(this)) &&     //(3)
+#endif
+    true;
+}
+
+
 Virtual_column_info* Virtual_column_info::clone(THD *thd)
 {
   Virtual_column_info* dst= new (thd->mem_root) Virtual_column_info(*this);
diff --git a/sql/field.h b/sql/field.h
index dfc02149f9d..c60c5e3bece 100644
--- a/sql/field.h
+++ b/sql/field.h
@@ -869,6 +869,18 @@ public:
   */
   Column_statistics_collected *collected_stats;
 
+  enum stats_availability
+  {
+    STATISTICS_CACHED=0,
+    STATISTICS_FOR_RANGE_PREDICATES_AVAILABLE,
+    STATISTICS_FOR_NDV_AVAILABLE
+  };
+
+  /*
+    Caches the value of whether statistics are available for a field or not.
+  */
+  uint stats_available;
+
   /* 
     This is additional data provided for any computed(virtual) field,
     default function or check constraint.
@@ -1907,6 +1919,16 @@ public:
   /* Mark field in read map. Updates also virtual fields */
   void register_field_in_read_map();
 
+  bool is_first_component_of_key(KEY *key);
+  void statistics_available_via_keys();
+  void statistics_available_via_stat_tables();
+  void is_statistics_available();
+  bool is_statistics_available_for_range_predicates();
+  bool is_ndv_available();
+  bool is_ndv_available_via_stat_tables();
+  bool is_ndv_available_via_keys();
+  bool is_eits_usable();
+
   virtual Compression_method *compression_method() const { return 0; }
 
   virtual Virtual_tmp_table **virtual_tmp_table_addr()
diff --git a/sql/item.cc b/sql/item.cc
index 52274380cd1..f80db684bb3 100644
--- a/sql/item.cc
+++ b/sql/item.cc
@@ -7491,6 +7491,122 @@ Item *Item::build_pushable_cond(THD *thd,
 }
 
 
+/*
+  @brief
+    Check if selectivity estimates are accurate for a conditional formula
+
+  @details
+    This function checks whether this item belongs to a certain class of
+    condition for which we can calculate an accurate selectivity estimate.
+
+    The definition of the class of condition is recursive.
+    1. Simple formula
+         a. Formula in the form of range predicates:
+
+            The predicate would be of type:
+               col op const
+              where op can be
+
+                op:
+                   |  >
+                   |  >=
+                   |  <
+                   |  <=
+                   |  =
+                   |  <>
+              Also the other cases are with
+              [NOT] IN predicate,
+              [NOT] NULL predicate and
+              LIKE predicate.
+            The predicate should have only one non-constant argument and
+            this argument will be a reference to a column that is used either
+            as the first component of an index or statistics are available via
+            statistical tables.
+
+         b. Equalities:
+              For an equality to have accurate selectivity estimates,
+              the number of distinct values for each column in the equality
+              needs to be known.
+              Eg: t1.a= t2.a  is transformed to MULTIPLE_EQUAL(t1.a, t2.a)
+              For this case we need to make sure we know number of distinct
+              values for t1.a and t2.a
+
+              The number of distinct values for a column can be known by
+                1) from indexes via rec_per_key
+                2) from statistical tables via avg_frequency.
+
+    2. AND / OR formula over formulas defined in section 1 of the definition.
+
+        a) AND Formula
+         For AND formula the check for accurate selectivity estimates depends
+         whether or not the AND formula is at the top level.
+
+          i) Top level
+            For an AND formula at the top level, we need to check if
+            accurate estimates are available for all the predicates
+            inside an AND formula.
+            If this is true then accurate selectivity estimates are available
+            for the AND formula.
+
+              Eg: t1.a > 10 and t2.a < 5
+
+              if we have accurate selectivity estimates
+              for t1.a > 10 and t2.a < 5 via indexes or statistical tables,
+              then selectivity estimates for this AND formula are accurate
+
+          ii) Non-top level
+            For all the predicates inside an AND formula
+            accurate selectivity estimates are needed
+            and each predicate need to be resolved by one
+            column (table column). If this scenario is satisfied then
+            accurate selectivity estimates is available for the AND formula.
+              Eg: t1.a = t2.a AND ( (t1.a > 5 AND t2.a < 10) OR t1.a <= 0)
+
+      b) OR Formula
+
+          For an OR predicate, we need to make sure that the
+          whole OR predicate can be resolved by one column
+          directly or indirectly (that is via multiple equalities).
+          If this is possible then for the resolved column we need to have
+          statistics either from the first component of an index or
+          via statistical tables.
+
+          Eg: t1.a=t2.b and (t2.b > 5 or t1.a < 0);
+
+          In the end for all fields we may have selectivity from an index or
+          statistical tables.
+
+  @notes
+    The implementation for this function use the 'walk' method to traverse
+    the tree of this item with predicate_selectivity_checker() as the
+    call-back parameter of the method.
+
+
+  @retval
+    TRUE         selectivity estimates are accurate
+    FALSE        OTHERWISE
+*/
+
+bool Item::with_accurate_selectivity_estimation()
+{
+  if (type() == Item::COND_ITEM &&
+      ((Item_cond*) this)->functype() == Item_func::COND_AND_FUNC)
+  {
+    List_iterator<Item> li(*((Item_cond*) this)->argument_list());
+    Item *item;
+    while ((item= li++))
+    {
+      SAME_FIELD arg= {NULL, false};
+      if (item->walk(&Item::predicate_selectivity_checker, 0, &arg))
+        return false;
+    }
+    return true;
+  }
+  SAME_FIELD arg= {NULL, false};
+  return !walk(&Item::predicate_selectivity_checker, 0, &arg);
+}
+
+
 static
 Item *get_field_item_for_having(THD *thd, Item *item, st_select_lex *sel)
 {
@@ -9217,6 +9333,103 @@ Item_field::excl_dep_on_grouping_fields(st_select_lex *sel)
 }
 
 
+/*
+  @brief
+    Checks if a formula of a condition contains the same column
+
+  @details
+    In the function we try to check if a formula of a condition depends
+    (directly or indirectly through equalities inferred from the
+    conjuncted multiple equalities) only on one column.
+
+    Eg:
+      WHERE clause is:
+         t1.a=t2.b and (t1.a > 5 or t2.b < 1);
+
+      the predicate (t1.a > 5 or t2.b < 1) can be resolved with the help of
+      equalities to conclude that it depends on one column.
+
+    This is used mostly for OR conjuncts where we need to make sure
+    that the entire OR conjunct contains only one column, so that we may
+    get accurate estimates.
+
+  @retval
+    TRUE   : the formula does not depend on one column
+    FALSE  : OTHERWISE
+*/
+
+bool Item_field::dep_on_one_column(void *arg)
+{
+  SAME_FIELD *same_field_arg= (SAME_FIELD*)arg;
+
+  /*
+    The same_field_arg is passed as a parameter because when we start walking
+    over the condition tree we don't know which column the predicate will be
+    dependent on. So as soon as we encounter a leaf of the condition tree
+    which is a field item, we set the SAME_FIELD::item to the found
+    field item and then compare the rest of the fields in the predicate with
+    the field item.
+  */
+
+  if (same_field_arg->item == NULL)
+  {
+    same_field_arg->item= this;
+    same_field_arg->is_stats_available=
+                field->is_statistics_available_for_range_predicates() ||
+                (item_equal &&
+                 item_equal->is_statistics_available_for_range_predicates());
+    return !same_field_arg->is_stats_available;
+  }
+
+  /* Found the same field while traversing the condition tree */
+  DBUG_ASSERT(same_field_arg->item->real_item()->type() == Item::FIELD_ITEM);
+  if (((Item_field*)same_field_arg->item->real_item())->field == field)
+    return false;
+
+  if (!same_field_arg->item->get_item_equal())
+    return true;
+
+  return !(same_field_arg->item->get_item_equal() == item_equal);
+}
+
+
+bool Item_direct_view_ref::dep_on_one_column(void *arg)
+{
+  SAME_FIELD *same_field_arg= (SAME_FIELD*)arg;
+  DBUG_ASSERT(real_item()->type() == Item::FIELD_ITEM);
+  Item_field *field_item= (Item_field*)real_item();
+
+  /*
+    The same_field_arg is passed as a parameter because when we start walking
+    over the condition tree we don't know which column the predicate will be
+    dependent on. So as soon as we encounter a leaf of the condition tree
+    which is a field item, we set the SAME_FIELD::item to the found
+    field item and then compare the rest of the fields in the predicate with
+    the field item.
+  */
+
+  if (same_field_arg->item == NULL)
+  {
+    same_field_arg->item= this;
+    same_field_arg->is_stats_available=
+          field_item->field->is_statistics_available_for_range_predicates() ||
+          (item_equal &&
+           item_equal->is_statistics_available_for_range_predicates());
+    return !same_field_arg->is_stats_available;
+  }
+
+  /* Found the same field while traversing the condition tree */
+  DBUG_ASSERT(same_field_arg->item->real_item()->type() == Item::FIELD_ITEM);
+  if (((Item_field*)same_field_arg->item->real_item())->field == field_item->field)
+    return false;
+
+  if (!same_field_arg->item->get_item_equal())
+    return true;
+
+  return !(same_field_arg->item->get_item_equal() == item_equal);
+}
+
+
 bool Item_direct_view_ref::excl_dep_on_table(table_map tab_map)
 {
   table_map used= used_tables();
diff --git a/sql/item.h b/sql/item.h
index fb480b4c578..8a6b16ad82d 100644
--- a/sql/item.h
+++ b/sql/item.h
@@ -446,6 +446,26 @@ typedef struct replace_equal_field_arg
   struct st_join_table *context_tab;
 } REPLACE_EQUAL_FIELD_ARG;
 
+
+/*
+  Structure storing information for a field on which the entire predicate is
+  dependent on (directly or indirectly via equalities)
+*/
+typedef struct same_field
+{
+  /*
+    field item for the first encountered column while traversing
+    over the conditional predicate
+  */
+  Item *item;
+  /*
+    Set to true if the statistics for the field are available
+    directly (via keys or stat tables) or indirectly (via equalities)
+  */
+  bool is_stats_available;
+}SAME_FIELD;
+
+
 class Settable_routine_parameter
 {
 public:
@@ -1968,7 +1988,25 @@ public:
   virtual bool count_sargable_conds(void *arg) { return 0; }
   virtual bool limit_index_condition_pushdown_processor(void *arg) { return 0; }
   virtual bool exists2in_processor(void *arg) { return 0; }
+
   virtual bool find_selective_predicates_list_processor(void *arg) { return 0; }
+
+  bool with_accurate_selectivity_estimation();
+
+  /*
+    @brief
+      Check if selectivity of a predicate is available via indexes or EITS
+
+    @param
+      arg      Structure storing information whether the AND/OR conjunct
+               can be resolved via a single column.
+
+    @retval
+      FALSE  :  SUCCESS
+      TRUE   :  OTHERWISE
+  */
+  virtual bool predicate_selectivity_checker(void *arg) { return FALSE; }
+  virtual bool dep_on_one_column(void *arg) { return true; }
   bool cleanup_is_expensive_cache_processor(void *arg)
   {
     is_expensive_cache= (int8)(-1);
@@ -3576,6 +3614,7 @@ public:
     return field->table->pos_in_table_list->outer_join;
   }
   bool check_index_dependence(void *arg);
+  bool dep_on_one_column(void *arg);
   friend class Item_default_value;
   friend class Item_insert_value;
   friend class st_select_lex_unit;
@@ -5946,6 +5985,7 @@ public:
   Item *field_transformer_for_having_pushdown(THD *thd, uchar *arg)
   { return this; }
   Item *remove_item_direct_ref() { return this; }
+  bool dep_on_one_column(void *arg);
 };
 
 
diff --git a/sql/item_cmpfunc.cc b/sql/item_cmpfunc.cc
index 83eb605f463..8084af72f60 100644
--- a/sql/item_cmpfunc.cc
+++ b/sql/item_cmpfunc.cc
@@ -2106,6 +2106,31 @@ bool Item_func_between::count_sargable_conds(void *arg)
   return 0;
 }
 
+bool Item_func_between::predicate_selectivity_checker(void *arg)
+{
+  if (arguments()[0]->real_item()->type() == Item::FIELD_ITEM)
+  {
+    if (is_range_predicate(args[0], args[1]) &&
+        is_range_predicate(args[0], args[2]))
+    {
+      return args[0]->dep_on_one_column(arg);
+    }
+    return true;
+  }
+
+  for (uint i= 1 ; i < arg_count ; i++)
+  {
+    if (arguments()[i]->real_item()->type() == Item::FIELD_ITEM)
+    {
+      if (!is_range_predicate(args[i], args[0]))
+        return true;
+      if (args[i]->dep_on_one_column(arg))
+        return true;
+    }
+  }
+  return false;
+}
+
 
 void Item_func_between::fix_after_pullout(st_select_lex *new_parent,
                                           Item **ref, bool merge)
@@ -4290,6 +4315,16 @@ bool Item_func_in::count_sargable_conds(void *arg)
 }
 
 
+bool Item_func_in::predicate_selectivity_checker(void *arg)
+{
+  if (!all_items_are_consts(args + 1, arg_count - 1))
+    return true;
+  if (key_item()->dep_on_one_column(arg))
+    return true;
+  return false;
+}
+
+
 bool Item_func_in::list_contains_null()
 {
   Item **arg,**arg_end;
@@ -5516,6 +5551,16 @@ bool Item_func_null_predicate::count_sargable_conds(void *arg)
 }
 
 
+bool Item_func_null_predicate::predicate_selectivity_checker(void *arg)
+{
+  if (is_range_predicate(args[0], NULL))
+  {
+    return args[0]->dep_on_one_column(arg);
+  }
+  return true;
+}
+
+
 longlong Item_func_isnull::val_int()
 {
   DBUG_ASSERT(fixed == 1);
@@ -5597,6 +5642,17 @@ bool Item_bool_func2::count_sargable_conds(void *arg)
   return 0;
 }
 
+
+bool Item_bool_func2::predicate_selectivity_checker(void *arg)
+{
+  if (is_range_predicate(args[0], args[1]))
+    return args[0]->dep_on_one_column(arg);
+  if (is_range_predicate(args[1], args[0]))
+    return args[1]->dep_on_one_column(arg);
+  return true;
+}
+
+
 void Item_func_like::print(String *str, enum_query_type query_type)
 {
   args[0]->print_parenthesised(str, query_type, precedence());
@@ -5696,8 +5752,18 @@ SEL_TREE *Item_func_like::get_mm_tree(RANGE_OPT_PARAM *param, Item **cond_ptr)
   bool sargable_pattern= with_sargable_pattern();
   param->thd->mem_root= tmp_root;
   return sargable_pattern ?
-    Item_bool_func2::get_mm_tree(param, cond_ptr) :
-    Item_func::get_mm_tree(param, cond_ptr);
+         Item_bool_func2::get_mm_tree(param, cond_ptr) :
+         Item_func::get_mm_tree(param, cond_ptr);
+}
+
+
+bool Item_func_like::predicate_selectivity_checker(void *arg)
+{
+  if (with_sargable_pattern())
+  {
+      return args[0]->dep_on_one_column(arg);
+  }
+  return true;
 }
 
 
@@ -7139,6 +7205,49 @@ bool Item_equal::count_sargable_conds(void *arg)
 }
 
 
+bool Item_equal::predicate_selectivity_checker(void *arg)
+{
+  /*
+    For equality conditions like tbl1.col = tbl2.col
+    we only want to know if the number of distinct values (ndv) is
+    available for all the fields in the multiple equality or not.
+  */
+  Item_equal_fields_iterator it(*this);
+  while (it++)
+  {
+    Field *field= it.get_curr_field();
+    if (!(field->is_ndv_available()))
+      return true;
+  }
+
+  it.rewind();
+  Item *item;
+  SAME_FIELD *same_field= (SAME_FIELD *) arg;
+  while ((item= it++))
+  {
+    if (same_field->item)
+      return item->dep_on_one_column(arg);
+    item->dep_on_one_column(arg);
+    break;
+  }
+  return false;
+}
+
+
+bool Item_equal::is_statistics_available_for_range_predicates()
+{
+  bool found= false;
+  Item_equal_fields_iterator it(*this);
+  while (it++)
+  {
+    Field *field= it.get_curr_field();
+    if (field->is_statistics_available_for_range_predicates())
+      found= true;
+  }
+  return found;
+}
+
+
 /**
   @brief
   Evaluate multiple equality
diff --git a/sql/item_cmpfunc.h b/sql/item_cmpfunc.h
index fa715badfc7..f467efd0e10 100644
--- a/sql/item_cmpfunc.h
+++ b/sql/item_cmpfunc.h
@@ -227,6 +227,7 @@ public:
   bool fix_length_and_dec() { decimals=0; max_length=1; return FALSE; }
   uint decimal_precision() const { return 1; }
   bool need_parentheses_in_default() { return true; }
+  bool predicate_selectivity_checker(void *arg) { return TRUE; }
 };
 
 
@@ -418,6 +419,7 @@ public:
   COND *remove_eq_conds(THD *thd, Item::cond_result *cond_value,
                         bool top_level);
   bool count_sargable_conds(void *arg);
+  bool predicate_selectivity_checker(void *arg);
   /*
     Specifies which result type the function uses to compare its arguments.
     This method is used in equal field propagation.
@@ -936,6 +938,7 @@ public:
   bool find_not_null_fields(table_map allowed);
   void fix_after_pullout(st_select_lex *new_parent, Item **ref, bool merge);
   bool count_sargable_conds(void *arg);
+  bool predicate_selectivity_checker(void *arg);
   void add_key_fields(JOIN *join, KEY_FIELD **key_fields,
                       uint *and_level, table_map usable_tables,
                       SARGABLE_PARAM **sargables);
@@ -2473,6 +2476,7 @@ public:
   bool find_not_null_fields(table_map allowed);
   void fix_after_pullout(st_select_lex *new_parent, Item **ref, bool merge);
   bool count_sargable_conds(void *arg);
+  bool predicate_selectivity_checker(void *arg);
   Item *get_copy(THD *thd)
   { return get_item_copy<Item_func_in>(thd, this); }
   Item *build_clone(THD *thd)
@@ -2570,6 +2574,7 @@ public:
     return FALSE;
   }
   bool count_sargable_conds(void *arg);
+  bool predicate_selectivity_checker(void *arg);
 };
 
 
@@ -2813,6 +2818,7 @@ public:
   
   Item *get_copy(THD *thd)
   { return get_item_copy<Item_func_like>(thd, this); }
+  bool predicate_selectivity_checker(void *arg);
 };
 
 
@@ -3220,6 +3226,8 @@ public:
   uint elements_count() { return equal_items.elements; }
   friend class Item_equal_fields_iterator;
   bool count_sargable_conds(void *arg);
+  bool predicate_selectivity_checker(void *arg);
+  bool is_statistics_available_for_range_predicates();
   Item *multiple_equality_transformer(THD *thd, uchar *arg);
   friend class Item_equal_iterator<List_iterator_fast,Item>;
   friend class Item_equal_iterator<List_iterator,Item>;
@@ -3371,6 +3379,7 @@ public:
   SEL_TREE *get_mm_tree(RANGE_OPT_PARAM *param, Item **cond_ptr);
   Item *get_copy(THD *thd)
   { return get_item_copy<Item_cond_and>(thd, this); }
+  bool predicate_selectivity_checker(void *arg) { return FALSE; }
 };
 
 inline bool is_cond_and(Item *item)
@@ -3395,6 +3404,7 @@ public:
   Item *neg_transformer(THD *thd);
   Item *get_copy(THD *thd)
   { return get_item_copy<Item_cond_or>(thd, this); }
+  bool predicate_selectivity_checker(void *arg) { return FALSE; }
 };
 
 class Item_func_dyncol_check :public Item_bool_func
diff --git a/sql/item_func.h b/sql/item_func.h
index 6a4a9fa5dae..d04cc8643cf 100644
--- a/sql/item_func.h
+++ b/sql/item_func.h
@@ -399,6 +399,12 @@ public:
   Item_func *get_item_func() { return this; }
   bool is_simplified_cond_processor(void *arg)
   { return const_item() && !val_int(); }
+  bool predicate_selectivity_checker(void *arg)
+  {
+    if (const_item() && !is_expensive())
+     return false;
+    return true;
+  }
 };
 
 
diff --git a/sql/opt_range.cc b/sql/opt_range.cc
index adc0572cb1c..d3f49097189 100644
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -3105,7 +3105,7 @@ bool create_key_parts_for_pseudo_indexes(RANGE_OPT_PARAM *param,
   {
     Field *field= *field_ptr;
     if (bitmap_is_set(used_fields, field->field_index) &&
-        is_eits_usable(field))
+        field->is_eits_usable())
       parts++;
   }
 
@@ -3126,7 +3126,7 @@ bool create_key_parts_for_pseudo_indexes(RANGE_OPT_PARAM *param,
     Field *field= *field_ptr;
     if (bitmap_is_set(used_fields, field->field_index))
     {
-      if (!is_eits_usable(field))
+      if (!field->is_eits_usable())
         continue;
 
       uint16 store_length;
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index bb9d82a7039..3553ee78c5b 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -8296,6 +8296,11 @@ choose_plan(JOIN *join, table_map join_tables)
             jtab_sort_func, (void*)join->emb_sjm_nest);
 
   Json_writer_object wrapper(thd);
+
+  if (join->conds)
+    wrapper.add("cardinality_accurate",
+                join->conds->with_accurate_selectivity_estimation());
+
   Json_writer_array trace_plan(thd,"considered_execution_plans");
 
   if (!join->emb_sjm_nest)
@@ -29449,6 +29454,38 @@ void unpack_to_base_table_fields(TABLE *table)
 }
 
 
+
+/*
+  @brief
+    Checks if a predicate is a range predicate with a constant part
+
+  @param
+
+    @item          the item referring to the field of the table
+    @value         the item referring to the expression on the
+                   rhs of a predicate
+
+  @details
+    Range predicate is defined as the form of field op const
+    where op can be operators like </<=/=/>/>=/BETWEEN etc.
+    Also the statistics for the field should be available via
+    an index or statistical tables.
+
+  @retval
+    TRUE  : Success
+    FALSE : Otherwise
+*/
+
+bool is_range_predicate(Item *item, Item *value)
+{
+  Item *field= item->real_item();
+  if (field->type() == Item::FIELD_ITEM && !field->const_item() &&
+      (!value || !value->is_expensive()))
+    return true;
+  return false;
+}
+
+
 /**
   @} (end of group Query_Optimizer)
 */
diff --git a/sql/sql_select.h b/sql/sql_select.h
index 2aba63ddc79..4d5a6e0abf5 100644
--- a/sql/sql_select.h
+++ b/sql/sql_select.h
@@ -2531,5 +2531,6 @@ void propagate_new_equalities(THD *thd, Item *cond,
                               List<Item_equal> *new_equalities,
                               COND_EQUAL *inherited,
                               bool *is_simplifiable_cond);
+bool is_range_predicate(Item *item, Item *value);
 
 #endif /* SQL_SELECT_INCLUDED */
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index 2636299e330..71d472feaba 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -31,7 +31,6 @@
 #include "opt_range.h"
 #include "uniques.h"
 #include "sql_show.h"
-#include "sql_partition.h"
 
 /*
   The system variable 'use_stat_tables' can take one of the
@@ -3159,7 +3158,10 @@ static void dump_stats_from_share_to_table(TABLE *table)
   Field **field_ptr= table_share->field;
   Field **table_field_ptr= table->field;
   for ( ; *field_ptr; field_ptr++, table_field_ptr++)
+  {
     (*table_field_ptr)->read_stats= (*field_ptr)->read_stats;
+    (*table_field_ptr)->stats_available= (*field_ptr)->stats_available;
+  }
   table->stats_is_read= true;
 }
 
@@ -3978,37 +3980,3 @@ bool is_stat_table(const LEX_CSTRING *db, LEX_CSTRING *table)
   }
   return false;
 }
-
-/*
-  Check wheter we can use EITS statistics for a field or not
-
-  TRUE : Use EITS for the columns
-  FALSE: Otherwise
-*/
-
-bool is_eits_usable(Field *field)
-{
-  Column_statistics* col_stats= field->read_stats;
-  
-  // check if column_statistics was allocated for this field
-  if (!col_stats)
-    return false;
-
-  DBUG_ASSERT(field->table->stats_is_read);
-
-  /*
-    (1): checks if we have EITS statistics for a particular column
-    (2): Don't use EITS for GEOMETRY columns
-    (3): Disabling reading EITS statistics for columns involved in the
-         partition list of a table. We assume the selecticivity for
-         such columns would be handled during partition pruning.
-  */
-
-  return !col_stats->no_stat_values_provided() &&        //(1)
-    field->type() != MYSQL_TYPE_GEOMETRY &&              //(2)
-#ifdef WITH_PARTITION_STORAGE_ENGINE
-    (!field->table->part_info ||
-     !field->table->part_info->field_in_partition_expr(field)) &&     //(3)
-#endif
-    true;
-}
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h
index 20ecf06bfee..34471fd5270 100644
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@@ -138,7 +138,6 @@ double get_column_range_cardinality(Field *field,
                                     key_range *max_endp,
                                     uint range_flag);
 bool is_stat_table(const LEX_CSTRING *db, LEX_CSTRING *table);
-bool is_eits_usable(Field* field);
 
 class Histogram
 {
diff --git a/sql/table.cc b/sql/table.cc
index c48a6fed89a..02f4ece9c40 100644
--- a/sql/table.cc
+++ b/sql/table.cc
@@ -5435,6 +5435,7 @@ void TABLE::init(THD *thd, TABLE_LIST *tl)
   {
     (*f_ptr)->next_equal_field= NULL;
     (*f_ptr)->cond_selectivity= 1.0;
+    (*f_ptr)->stats_available= 0;
   }
 
   notnull_cond= 0;
author	Varun Gupta <varun.gupta@mariadb.com>	2020-05-20 18:59:52 +0530
committer	Varun Gupta <varun.gupta@mariadb.com>	2020-11-26 01:20:49 +0530
commit	efb171c2eac06508489804601f20c702bca1954c (patch)
tree	7da131efa4776cce13a51b632dad3f6df961ec5e /sql
parent	c498250888ec126fddda2867d1239b2a7734482f (diff)
download	mariadb-git-10.6-mdev22360.tar.gz