author    Igor Babaev <igor@askmonty.org>  2019-02-03 14:56:12 -0800
committer Igor Babaev <igor@askmonty.org>  2019-02-03 14:56:12 -0800
commit    658128af43b4d7c6db445164f8ed25ed4d1e3109 (patch)
tree      7a71580cca55759b8bb2730e117436478948d77f /sql
parent    5f46670bd09babbee75a24ac82eb4ade0706da66 (diff)
download  mariadb-git-658128af43b4d7c6db445164f8ed25ed4d1e3109.tar.gz
MDEV-16188 Use in-memory PK filters built from range index scans
This patch contains a full implementation of the optimization that allows using in-memory rowid / primary key filters built for range conditions over indexes. In many cases usage of such filters reduces the number of disk seeks spent on fetching table rows. In this implementation the choice of which filter to apply, if any, is made purely on cost-based considerations.

This implementation re-architected the partial implementation of the feature pushed by Galina Shalygina in commit 8d5a11122c32f4d9eb87536886c6e893377bdd07.

Besides this, the patch contains a better implementation of the generic handler function handler::multi_range_read_info_const() that takes into account the gaps between ranges when calculating the cost of range index scans. It also contains some corrections to the implementation of the handler function records_in_range() for MyISAM.

The patch supports the feature for InnoDB and MyISAM.
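The cost-based choice mentioned above relies on a simple linear gain model implemented in sql/rowid_filter.{h,cc} below: a filter with average per-row gain a and build cost b starts to pay off once the access it is attached to fetches more than cross_x = b/a rows. The following standalone sketch, which is not part of the patch, mirrors that model for the sorted-array container; TIME_FOR_COMPARE = 5 is an assumed value (sql_const.h), while ARRAY_WRITE_COST, ARRAY_SORT_C and lookup_cost() reproduce the definitions that appear in this diff.

/* Standalone sketch, not patch code: the gain model used by
   Range_rowid_filter_cost_info for the sorted-array container.
   TIME_FOR_COMPARE = 5 is an assumption; the other constants are
   taken from rowid_filter.h / rowid_filter.cc in this diff. */
#include <cmath>
#include <cstdio>

static const double TIME_FOR_COMPARE = 5.0;    /* assumed value        */
static const double ARRAY_WRITE_COST = 0.005;  /* from rowid_filter.h  */
static const double ARRAY_SORT_C     = 0.01;   /* from rowid_filter.h  */

/* Cost of one probe into a sorted array of n rowids */
static double lookup_cost(double n) { return std::log(n) * 0.01; }

/* b: cost of building the filter = index-only range scan + fill + sort */
static double build_cost(double index_only_cost, double n)
{
  return index_only_cost + ARRAY_WRITE_COST * n +
         ARRAY_SORT_C * n * std::log(n);
}

/* a: average gain per row whose fetch the filter may save;
   selectivity = n / stat_records */
static double gain_per_row(double selectivity, double n)
{
  return (1 + 1.0 / TIME_FOR_COMPARE) * (1 - selectivity) - lookup_cost(n);
}

int main()
{
  double n= 1000, stat_records= 100000, index_only_cost= 30;
  double a= gain_per_row(n / stat_records, n);
  double b= build_cost(index_only_cost, n);
  /* The filter is worth using once an access fetches more than b/a rows */
  std::printf("a= %.3f  b= %.1f  cross_x= %.0f rows\n", a, b, b / a);
  return 0;
}

With these example numbers a filter over about 1000 rowids pays for itself after roughly a hundred fetched rows, which is why best_range_rowid_filter_for_partial_join() below compares cross_x against the expected number of key tuples for each candidate access.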
Diffstat (limited to 'sql')
-rw-r--r--  sql/ha_partition.cc       55
-rw-r--r--  sql/ha_partition.h         4
-rw-r--r--  sql/handler.cc            71
-rw-r--r--  sql/handler.h             79
-rw-r--r--  sql/multi_range_read.cc  141
-rw-r--r--  sql/opt_range.cc          23
-rw-r--r--  sql/opt_subselect.h        1
-rw-r--r--  sql/rowid_filter.cc      610
-rw-r--r--  sql/rowid_filter.h       522
-rw-r--r--  sql/sql_array.h           13
-rw-r--r--  sql/sql_class.cc           2
-rw-r--r--  sql/sql_const.h            6
-rw-r--r--  sql/sql_explain.cc        74
-rw-r--r--  sql/sql_explain.h         50
-rw-r--r--  sql/sql_select.cc        280
-rw-r--r--  sql/sql_select.h          27
-rw-r--r--  sql/structs.h             14
-rw-r--r--  sql/table.cc              45
-rw-r--r--  sql/table.h               42
19 files changed, 1603 insertions, 456 deletions
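One generic change in sql/multi_range_read.cc deserves a note before the diff itself: handler::multi_range_read_info_const() now estimates how many index blocks a sequence of ranges really touches and charges an extra block read only when a range starts in a block that the previous range has not already read. Below is a minimal standalone sketch of that accounting; it assumes a fixed number of key entries per index block instead of deriving block numbers through the engine's keyread_time() as the patch does.

/* Simplified sketch (assumptions, not patch code) of the gap-aware
   counting of touched index blocks in multi_range_read_info_const(). */
#include <cstdint>
#include <cstdio>
#include <vector>

struct Range { uint64_t min_pos; uint64_t rows; };  /* positions among key entries */

/* The patch derives block numbers via keyread_time(); here we just assume
   'keys_per_block' key entries fit into one index block. */
static uint64_t block_no(uint64_t keyentry_pos, uint64_t keys_per_block)
{
  return keyentry_pos / keys_per_block + 1;
}

static uint64_t touched_blocks(const std::vector<Range> &ranges,
                               uint64_t keys_per_block)
{
  uint64_t total= 0, prev_max_block= 0;
  for (const Range &r : ranges)
  {
    uint64_t min_block= block_no(r.min_pos, keys_per_block);
    uint64_t max_block= block_no(r.min_pos + r.rows, keys_per_block);
    /* Charge the first block only if the previous range did not read it */
    total+= max_block - min_block + (min_block != prev_max_block ? 1 : 0);
    prev_max_block= max_block;
  }
  return total;
}

int main()
{
  /* Two adjacent point ranges share one block; a distant one adds a seek */
  std::vector<Range> ranges= { {10, 1}, {12, 1}, {5000, 1} };
  std::printf("touched blocks: %llu\n",
              (unsigned long long) touched_blocks(ranges, 100));
  return 0;
}

This is what removes the paradox described in the new @note in multi_range_read.cc: two point ranges that fall into the same index block are no longer charged as two independent range scans.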
diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc
index 37a3decdbca..651ace759a3 100644
--- a/sql/ha_partition.cc
+++ b/sql/ha_partition.cc
@@ -6265,19 +6265,22 @@ ha_rows ha_partition::multi_range_read_info_const(uint keyno,
uint ret_mrr_mode= 0;
range_seq_t seq_it;
part_id_range save_part_spec;
+ Cost_estimate part_cost;
DBUG_ENTER("ha_partition::multi_range_read_info_const");
DBUG_PRINT("enter", ("partition this: %p", this));
m_mrr_new_full_buffer_size= 0;
save_part_spec= m_part_spec;
+ cost->reset();
+
seq_it= seq->init(seq_init_param, n_ranges, *mrr_mode);
if (unlikely((error= multi_range_key_create_key(seq, seq_it))))
{
if (likely(error == HA_ERR_END_OF_FILE)) // No keys in range
{
rows= 0;
- goto calc_cost;
+ goto end;
}
/*
This error means that we can't do multi_range_read for the moment
@@ -6306,18 +6309,20 @@ ha_rows ha_partition::multi_range_read_info_const(uint keyno,
ha_rows tmp_rows;
uint tmp_mrr_mode;
m_mrr_buffer_size[i]= 0;
+ part_cost.reset();
tmp_mrr_mode= *mrr_mode;
tmp_rows= (*file)->
multi_range_read_info_const(keyno, &m_part_seq_if,
&m_partition_part_key_multi_range_hld[i],
m_part_mrr_range_length[i],
&m_mrr_buffer_size[i],
- &tmp_mrr_mode, cost);
+ &tmp_mrr_mode, &part_cost);
if (tmp_rows == HA_POS_ERROR)
{
m_part_spec= save_part_spec;
DBUG_RETURN(HA_POS_ERROR);
}
+ cost->add(&part_cost);
rows+= tmp_rows;
ret_mrr_mode|= tmp_mrr_mode;
m_mrr_new_full_buffer_size+= m_mrr_buffer_size[i];
@@ -6325,15 +6330,8 @@ ha_rows ha_partition::multi_range_read_info_const(uint keyno,
} while (*(++file));
*mrr_mode= ret_mrr_mode;
-calc_cost:
+end:
m_part_spec= save_part_spec;
- cost->reset();
- cost->avg_io_cost= 1;
- if ((*mrr_mode & HA_MRR_INDEX_ONLY) && rows > 2)
- cost->io_count= keyread_time(keyno, n_ranges, (uint) rows);
- else
- cost->io_count= read_time(keyno, n_ranges, rows);
- cost->cpu_cost= (double) rows / TIME_FOR_COMPARE + 0.01;
DBUG_RETURN(rows);
}
@@ -9341,6 +9339,43 @@ double ha_partition::scan_time()
/**
+ @brief
+ Calculate time to scan the given index (index only scan)
+
+ @param inx Index number to scan
+
+ @return time for scanning index inx
+*/
+
+double ha_partition::key_scan_time(uint inx)
+{
+ double scan_time= 0;
+ uint i;
+ DBUG_ENTER("ha_partition::key_scan_time");
+ for (i= bitmap_get_first_set(&m_part_info->read_partitions);
+ i < m_tot_parts;
+ i= bitmap_get_next_set(&m_part_info->read_partitions, i))
+ scan_time+= m_file[i]->key_scan_time(inx);
+ DBUG_RETURN(scan_time);
+}
+
+
+double ha_partition::keyread_time(uint inx, uint ranges, ha_rows rows)
+{
+ double read_time= 0;
+ uint i;
+ DBUG_ENTER("ha_partition::keyread_time");
+ if (!ranges)
+ DBUG_RETURN(handler::keyread_time(inx, ranges, rows));
+ for (i= bitmap_get_first_set(&m_part_info->read_partitions);
+ i < m_tot_parts;
+ i= bitmap_get_next_set(&m_part_info->read_partitions, i))
+ read_time+= m_file[i]->keyread_time(inx, ranges, rows);
+ DBUG_RETURN(read_time);
+}
+
+
+/**
Find number of records in a range.
@param inx Index number
@param min_key Start of range
diff --git a/sql/ha_partition.h b/sql/ha_partition.h
index 202f27840dc..f19e9ff96c8 100644
--- a/sql/ha_partition.h
+++ b/sql/ha_partition.h
@@ -912,6 +912,10 @@ public:
*/
virtual double scan_time();
+ virtual double key_scan_time(uint inx);
+
+ virtual double keyread_time(uint inx, uint ranges, ha_rows rows);
+
/*
The next method will never be called if you do not implement indexes.
*/
diff --git a/sql/handler.cc b/sql/handler.cc
index e5081accb30..30ce3f654df 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -43,6 +43,7 @@
#include "debug_sync.h" // DEBUG_SYNC
#include "sql_audit.h"
#include "ha_sequence.h"
+#include "rowid_filter.h"
#ifdef WITH_PARTITION_STORAGE_ENGINE
#include "ha_partition.h"
@@ -2602,36 +2603,26 @@ LEX_CSTRING *handler::engine_name()
}
-/**
- The method returns the cost of the random I/O accesses when
- index is used.
+/*
+ It is assumed that the value of the parameter 'ranges' can be only 0 or 1.
+ If ranges == 1 then the function returns the cost of index only scan
+ by index 'keyno' of one range containing 'rows' key entries.
+ If ranges == 0 then the function returns only the cost of copying
+ those key entries into the engine buffers.
*/
-double handler::get_io_cost(uint index, ha_rows rows, uint *length)
+double handler::keyread_time(uint index, uint ranges, ha_rows rows)
{
- uint len= table->key_info[index].key_length + ref_length;
+ DBUG_ASSERT(ranges == 0 || ranges == 1);
+ size_t len= table->key_info[index].key_length + ref_length;
if (index == table->s->primary_key && table->file->primary_key_is_clustered())
len= table->s->stored_rec_length;
- double keys_per_block= (stats.block_size/2.0/len+1);
- *length= len;
- return (rows + keys_per_block-1)/ keys_per_block;
-}
-
-
-double handler::keyread_time(uint index, uint ranges, ha_rows rows)
-{
- /*
- It is assumed that we will read trough the whole key range and that all
- key blocks are half full (normally things are much better). It is also
- assumed that each time we read the next key from the index, the handler
- performs a random seek, thus the cost is proportional to the number of
- blocks read. This model does not take into account clustered indexes -
- engines that support that (e.g. InnoDB) may want to overwrite this method.
- The model counts in the time to read index entries from cache.
- */
- uint len;
- return get_io_cost(index, rows, &len) +
- len*rows/(stats.block_size+1)/TIME_FOR_COMPARE ;
+ uint keys_per_block= (stats.block_size/2.0/len+1);
+ ulonglong blocks= !rows ? 0 : (rows-1) / keys_per_block + 1;
+ double cost= (double)rows*len/(stats.block_size+1)*IDX_BLOCK_COPY_COST;
+ if (ranges)
+ cost+= blocks;
+ return cost;
}
void **handler::ha_data(THD *thd) const
@@ -5766,6 +5757,35 @@ extern "C" enum icp_result handler_index_cond_check(void* h_arg)
return res;
}
+
+/**
+ Rowid filter callback - to be called by an engine to check rowid / primary
+ keys of the rows whose data is to be fetched against the used rowid filter
+*/
+
+extern "C" int handler_rowid_filter_check(void *h_arg)
+{
+ handler *h= (handler*) h_arg;
+ TABLE *tab= h->get_table();
+ h->position(tab->record[0]);
+ return h->pushed_rowid_filter->check((char *) h->ref);
+}
+
+
+/**
+ Callback function for an engine to check whether the used rowid filter
+ has been already built
+*/
+
+extern "C" int handler_rowid_filter_is_active(void *h_arg)
+{
+ if (!h_arg)
+ return false;
+ handler *h= (handler*) h_arg;
+ return h->rowid_filter_is_active;
+}
+
+
int handler::index_read_idx_map(uchar * buf, uint index, const uchar * key,
key_part_map keypart_map,
enum ha_rkey_function find_flag)
@@ -6230,6 +6250,7 @@ int handler::ha_reset()
/* Reset information about pushed engine conditions */
cancel_pushed_idx_cond();
/* Reset information about pushed index conditions */
+ cancel_pushed_rowid_filter();
clear_top_table_fields();
DBUG_RETURN(reset());
}
diff --git a/sql/handler.h b/sql/handler.h
index e5a6deb6659..ee730e8a725 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -47,6 +47,7 @@
class Alter_info;
class Virtual_column_info;
class sequence_definition;
+class Rowid_filter;
// the following is for checking tables
@@ -324,6 +325,8 @@ enum enum_alter_inplace_result {
*/
#define HA_CLUSTERED_INDEX 512
+#define HA_DO_RANGE_FILTER_PUSHDOWN 1024
+
/*
bits in alter_table_flags:
*/
@@ -2561,11 +2564,14 @@ typedef bool (*SKIP_INDEX_TUPLE_FUNC) (range_seq_t seq, range_id_t range_info);
class Cost_estimate
{
public:
- double io_count; /* number of I/O */
- double avg_io_cost; /* cost of an average I/O oper. */
- double cpu_cost; /* cost of operations in CPU */
- double import_cost; /* cost of remote operations */
- double mem_cost; /* cost of used memory */
+ double io_count; /* number of I/O to fetch records */
+ double avg_io_cost; /* cost of an average I/O oper. to fetch records */
+ double idx_io_count; /* number of I/O to read keys */
+ double idx_avg_io_cost; /* cost of an average I/O oper. to read keys */
+ double cpu_cost; /* total cost of operations in CPU */
+ double idx_cpu_cost; /* cost of operations in CPU for index */
+ double import_cost; /* cost of remote operations */
+ double mem_cost; /* cost of used memory */
enum { IO_COEFF=1 };
enum { CPU_COEFF=1 };
@@ -2579,10 +2585,18 @@ public:
double total_cost()
{
- return IO_COEFF*io_count*avg_io_cost + CPU_COEFF * cpu_cost +
+ return IO_COEFF*io_count*avg_io_cost +
+ IO_COEFF*idx_io_count*idx_avg_io_cost +
+ CPU_COEFF*cpu_cost +
MEM_COEFF*mem_cost + IMPORT_COEFF*import_cost;
}
+ double index_only_cost()
+ {
+ return IO_COEFF*idx_io_count*idx_avg_io_cost +
+ CPU_COEFF*idx_cpu_cost;
+ }
+
/**
Whether or not all costs in the object are zero
@@ -2590,30 +2604,48 @@ public:
*/
bool is_zero() const
{
- return io_count == 0.0 && cpu_cost == 0.0 &&
+ return io_count == 0.0 && idx_io_count == 0.0 && cpu_cost == 0.0 &&
import_cost == 0.0 && mem_cost == 0.0;
}
void reset()
{
avg_io_cost= 1.0;
- io_count= cpu_cost= mem_cost= import_cost= 0.0;
+ idx_avg_io_cost= 1.0;
+ io_count= idx_io_count= cpu_cost= idx_cpu_cost= mem_cost= import_cost= 0.0;
}
void multiply(double m)
{
io_count *= m;
cpu_cost *= m;
+ idx_io_count *= m;
+ idx_cpu_cost *= m;
import_cost *= m;
/* Don't multiply mem_cost */
}
void add(const Cost_estimate* cost)
{
- double io_count_sum= io_count + cost->io_count;
- add_io(cost->io_count, cost->avg_io_cost);
- io_count= io_count_sum;
+ if (cost->io_count)
+ {
+ double io_count_sum= io_count + cost->io_count;
+ avg_io_cost= (io_count * avg_io_cost +
+ cost->io_count * cost->avg_io_cost)
+ /io_count_sum;
+ io_count= io_count_sum;
+ }
+ if (cost->idx_io_count)
+ {
+ double idx_io_count_sum= idx_io_count + cost->idx_io_count;
+ idx_avg_io_cost= (idx_io_count * idx_avg_io_cost +
+ cost->idx_io_count * cost->idx_avg_io_cost)
+ /idx_io_count_sum;
+ idx_io_count= idx_io_count_sum;
+ }
cpu_cost += cost->cpu_cost;
+ idx_cpu_cost += cost->idx_cpu_cost;
+ import_cost += cost->import_cost;
}
void add_io(double add_io_cnt, double add_avg_cost)
@@ -2790,6 +2822,9 @@ public:
extern "C" enum icp_result handler_index_cond_check(void* h_arg);
+extern "C" int handler_rowid_filter_check(void* h_arg);
+extern "C" int handler_rowid_filter_is_active(void* h_arg);
+
uint calculate_key_len(TABLE *, uint, const uchar *, key_part_map);
/*
bitmap with first N+1 bits set
@@ -2956,6 +2991,11 @@ public:
Item *pushed_idx_cond;
uint pushed_idx_cond_keyno; /* The index which the above condition is for */
+ /* Rowid filter pushed into the engine */
+ Rowid_filter *pushed_rowid_filter;
+ /* true when the pushed rowid filter has been already filled */
+ bool rowid_filter_is_active;
+
Discrete_interval auto_inc_interval_for_cur_row;
/**
Number of reserved auto-increment intervals. Serves as a heuristic
@@ -3020,6 +3060,8 @@ public:
tracker(NULL),
pushed_idx_cond(NULL),
pushed_idx_cond_keyno(MAX_KEY),
+ pushed_rowid_filter(NULL),
+ rowid_filter_is_active(0),
auto_inc_intervals_count(0),
m_psi(NULL), set_top_table_fields(FALSE), top_table(0),
top_table_field(0), top_table_fields(0),
@@ -3226,6 +3268,11 @@ public:
virtual double scan_time()
{ return ulonglong2double(stats.data_file_length) / IO_SIZE + 2; }
+ virtual double key_scan_time(uint index)
+ {
+ return keyread_time(index, 1, records());
+ }
+
/**
The cost of reading a set of ranges from the table using an index
to access it.
@@ -3249,8 +3296,6 @@ public:
*/
virtual double keyread_time(uint index, uint ranges, ha_rows rows);
- double get_io_cost(uint index, ha_rows rows, uint *length);
-
virtual const key_map *keys_to_use_for_scanning() { return &key_map_empty; }
/*
@@ -4046,6 +4091,14 @@ public:
in_range_check_pushed_down= false;
}
+ virtual void cancel_pushed_rowid_filter()
+ {
+ pushed_rowid_filter= NULL;
+ rowid_filter_is_active= false;
+ }
+
+ virtual bool rowid_filter_push(Rowid_filter *rowid_filter) { return true; }
+
/* Needed for partition / spider */
virtual TABLE_LIST *get_next_global_for_child() { return NULL; }
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index d6952e71899..cab278f5859 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -20,6 +20,12 @@
#include "key.h"
#include "sql_statistics.h"
+static ulonglong key_block_no(handler *h, uint keyno, uint keyentry_pos)
+{
+ return (ulonglong) (h->keyread_time(keyno, 1, keyentry_pos + 1) -
+ h->keyread_time(keyno, 0, keyentry_pos + 1) + 0.5) + 1;
+}
+
/****************************************************************************
* Default MRR implementation (MRR to non-MRR converter)
***************************************************************************/
@@ -47,6 +53,24 @@
for a user to be able to interrupt the calculation by killing the
connection/query.
+ @note
+ Starting from 10.4 the implementation of this method tries to take into
+ account gaps between range intervals. Before this we had such paradoxical
+ cases when, for example, the cost of the index scan by range [1..3] was
+ almost half of the cost of the index scan by the two intervals
+ [1..1] and [3..3].
+
+ @note
+ The current implementation of the method is not efficient because it
+ requires extra dives for gaps. Although these dives are not expensive,
+ as they touch index nodes that with very high probability are in
+ the cache, this is still not good. We could avoid them if records_in_range()
+ also returned the positions of the ends of range intervals. It would not
+ be hard to implement this now for MyISAM, as this engine provides a function
+ returning an approximation of the relative position of a key tuple
+ among other index key tuples. Unfortunately InnoDB currently does not provide
+ anything like this function.
+
@retval
HA_POS_ERROR Error or the engine is unable to perform the requested
scan. Values of OUT parameters are undefined.
@@ -61,12 +85,21 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
uint *bufsz, uint *flags, Cost_estimate *cost)
{
KEY_MULTI_RANGE range;
+ key_range prev_start_key;
range_seq_t seq_it;
- ha_rows rows, total_rows= 0;
+ ha_rows min_pos= 0;
+ ha_rows total_rows= 0;
uint n_ranges=0;
+ uint n_eq_ranges= 0;
+ ulonglong total_touched_blocks= 0;
+ key_range *prev_min_endp= 0;
+ ulonglong prev_max_block_no=0;
+ ha_rows max_rows= stats.records;
THD *thd= table->in_use;
- uint limit= thd->variables.eq_range_index_dive_limit;
+ StringBuffer<64> key_value;
+ uint limit= thd->variables.eq_range_index_dive_limit;
+
bool use_statistics_for_eq_range= eq_ranges_exceeds_limit(seq,
seq_init_param,
limit);
@@ -77,10 +110,15 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
seq_it= seq->init(seq_init_param, n_ranges, *flags);
while (!seq->next(seq_it, &range))
{
+ ha_rows rows;
+ ulonglong new_touched_blocks= 0;
+
if (unlikely(thd->killed != 0))
return HA_POS_ERROR;
n_ranges++;
+ if (range.range_flag & EQ_RANGE)
+ n_eq_ranges++;
key_range *min_endp, *max_endp;
if (range.range_flag & GEOM_FLAG)
{
@@ -95,38 +133,93 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
max_endp= range.end_key.length? &range.end_key : NULL;
}
int keyparts_used= my_count_bits(range.start_key.keypart_map);
- if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE))
- rows= 1; /* there can be at most one row */
- else if (use_statistics_for_eq_range &&
- !(range.range_flag & NULL_RANGE) &&
- (range.range_flag & EQ_RANGE) &&
- table->key_info[keyno].actual_rec_per_key(keyparts_used - 1) > 0.5)
- rows=
- (ha_rows) table->key_info[keyno].actual_rec_per_key(keyparts_used - 1);
+ if (use_statistics_for_eq_range &&
+ !(range.range_flag & NULL_RANGE) &&
+ (range.range_flag & EQ_RANGE) &&
+ table->key_info[keyno].actual_rec_per_key(keyparts_used - 1) > 0.5)
+ {
+ if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE))
+ rows= 1; /* there can be at most one row */
+ else
+ rows=
+ (ha_rows) table->key_info[keyno].actual_rec_per_key(keyparts_used-1);
+ }
else
{
- if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp,
+ ulonglong min_block_no;
+ ulonglong max_block_no;
+ if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE))
+ rows= 1; /* there can be at most one row */
+ else if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp,
max_endp)))
{
/* Can't scan one range => can't do MRR scan at all */
total_rows= HA_POS_ERROR;
break;
}
+ if (!max_endp && !(prev_min_endp && prev_min_endp->length))
+ min_pos+= max_rows - rows;
+ else
+ {
+ key_range *start_endp= prev_min_endp;
+ if (start_endp && !start_endp->keypart_map)
+ start_endp= 0;
+ /*
+ Get the estimate of rows in the previous gap
+ and two ranges surrounding this gap
+ */
+ ha_rows r= this->records_in_range(keyno,start_endp,max_endp);
+ if (r == HA_POS_ERROR)
+ {
+ /* Some engine cannot estimate such ranges */
+ total_rows += rows;
+ continue;
+ }
+ min_pos+= r - rows;
+ }
+ min_block_no= key_block_no(this, keyno, min_pos);
+ max_block_no= key_block_no(this, keyno, min_pos + rows);
+ new_touched_blocks= max_block_no - min_block_no +
+ MY_TEST(min_block_no != prev_max_block_no);
+ prev_max_block_no= max_block_no;
+ if (!prev_min_endp)
+ prev_min_endp= &prev_start_key;
+ /* Save range.start_key for the next iteration step */
+ prev_start_key= range.start_key;
+ key_value.copy((const char *) prev_start_key.key, prev_start_key.length,
+ key_value.charset());
+ prev_start_key.key= (const uchar *) key_value.ptr();
}
total_rows += rows;
+ total_touched_blocks+= new_touched_blocks;
}
if (total_rows != HA_POS_ERROR)
{
+ set_if_smaller(total_rows, max_rows);
/* The following calculation is the same as in multi_range_read_info(): */
*flags |= HA_MRR_USE_DEFAULT_IMPL;
cost->reset();
cost->avg_io_cost= 1; /* assume random seeks */
- if ((*flags & HA_MRR_INDEX_ONLY) && total_rows > 2)
- cost->io_count= keyread_time(keyno, n_ranges, (uint)total_rows);
+ cost->idx_avg_io_cost= 1;
+ if (!(keyno == table->s->primary_key && primary_key_is_clustered()))
+ {
+ cost->idx_io_count= total_touched_blocks +
+ keyread_time(keyno, 0, total_rows);
+ cost->cpu_cost= cost->idx_cpu_cost=
+ (double) total_rows / TIME_FOR_COMPARE_IDX +
+ (2 * n_ranges - n_eq_ranges) * IDX_LOOKUP_COST;
+ if (!(*flags & HA_MRR_INDEX_ONLY))
+ {
+ cost->io_count= read_time(keyno, 0, total_rows);
+ cost->cpu_cost+= (double) total_rows / TIME_FOR_COMPARE;
+ }
+ }
else
- cost->io_count= read_time(keyno, n_ranges, total_rows);
- cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01;
+ {
+ cost->io_count= read_time(keyno, total_touched_blocks, (uint) total_rows);
+ cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01;
+ }
}
return total_rows;
}
@@ -183,10 +276,22 @@ ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
cost->avg_io_cost= 1; /* assume random seeks */
/* Produce the same cost as non-MRR code does */
- if (*flags & HA_MRR_INDEX_ONLY)
- cost->io_count= keyread_time(keyno, n_ranges, n_rows);
+ if (!(keyno == table->s->primary_key && primary_key_is_clustered()))
+ {
+ cost->idx_io_count= n_ranges + keyread_time(keyno, 0, n_rows);
+ cost->cpu_cost= cost->idx_cpu_cost=
+ (double) n_rows / TIME_FOR_COMPARE_IDX + n_ranges * IDX_LOOKUP_COST;
+ if (!(*flags & HA_MRR_INDEX_ONLY))
+ {
+ cost->io_count= read_time(keyno, 0, n_rows);
+ cost->cpu_cost+= (double) n_rows / TIME_FOR_COMPARE;
+ }
+ }
else
- cost->io_count= read_time(keyno, n_ranges, n_rows);
+ {
+ cost->io_count= read_time(keyno, n_ranges, (uint)n_rows);
+ cost->cpu_cost= (double) n_rows / TIME_FOR_COMPARE + 0.01;
+ }
return 0;
}
diff --git a/sql/opt_range.cc b/sql/opt_range.cc
index b9b74bdd0c4..ba2705bab40 100644
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -2520,6 +2520,7 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use,
quick=0;
needed_reg.clear_all();
quick_keys.clear_all();
+ head->with_impossible_ranges.clear_all();
DBUG_ASSERT(!head->is_filled_at_execution());
if (keys_to_use.is_clear_all() || head->is_filled_at_execution())
DBUG_RETURN(0);
@@ -2639,8 +2640,8 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use,
if (!force_quick_range && !head->covering_keys.is_clear_all())
{
int key_for_use= find_shortest_key(head, &head->covering_keys);
- double key_read_time= head->file->keyread_time(key_for_use, 1, records) +
- (double) records / TIME_FOR_COMPARE;
+ double key_read_time= head->file->key_scan_time(key_for_use) +
+ (double) records / TIME_FOR_COMPARE_IDX;
DBUG_PRINT("info", ("'all'+'using index' scan will be using key %d, "
"read time %g", key_for_use, key_read_time));
if (key_read_time < read_time)
@@ -4790,6 +4791,7 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge,
double roru_index_costs;
ha_rows roru_total_records;
double roru_intersect_part= 1.0;
+ double limit_read_time= read_time;
size_t n_child_scans;
DBUG_ENTER("get_best_disjunct_quick");
DBUG_PRINT("info", ("Full table scan cost: %g", read_time));
@@ -4936,7 +4938,7 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge,
if (imerge_trp)
{
TABLE_READ_PLAN *trp= merge_same_index_scans(param, imerge, imerge_trp,
- read_time);
+ limit_read_time);
if (trp != imerge_trp)
DBUG_RETURN(trp);
}
@@ -5409,9 +5411,8 @@ bool prepare_search_best_index_intersect(PARAM *param,
same_index_prefix(cpk_scan->key_info, key_info, used_key_parts))
continue;
- cost= table->file->keyread_time((*index_scan)->keynr,
- (*index_scan)->range_count,
- (*index_scan)->records);
+ cost= table->quick_index_only_costs[(*index_scan)->keynr];
+
if (cost >= cutoff_cost)
continue;
@@ -8556,6 +8557,7 @@ int and_range_trees(RANGE_OPT_PARAM *param, SEL_TREE *tree1, SEL_TREE *tree2,
if (key && key->type == SEL_ARG::IMPOSSIBLE)
{
result->type= SEL_TREE::IMPOSSIBLE;
+ param->table->with_impossible_ranges.set_bit(param->real_keynr[key_no]);
DBUG_RETURN(1);
}
result_keys.set_bit(key_no);
@@ -10541,7 +10543,6 @@ ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
handler *file= param->table->file;
ha_rows rows= HA_POS_ERROR;
uint keynr= param->real_keynr[idx];
- uint length;
DBUG_ENTER("check_quick_select");
/* Handle cases when we don't have a valid non-empty list of range */
@@ -10600,8 +10601,10 @@ ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
MY_MIN(param->table->quick_condition_rows, rows);
param->table->quick_rows[keynr]= rows;
param->table->quick_costs[keynr]= cost->total_cost();
- param->table->quick_key_io[keynr]=
- file->get_io_cost(keynr, param->table->quick_rows[keynr], &length);
+ if (keynr == param->table->s->primary_key && pk_is_clustered)
+ param->table->quick_index_only_costs[keynr]= 0;
+ else
+ param->table->quick_index_only_costs[keynr]= cost->index_only_cost();
}
}
/* Figure out if the key scan is ROR (returns rows in ROWID order) or not */
@@ -13656,7 +13659,7 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts,
1/double(2*TIME_FOR_COMPARE);
const double cpu_cost= num_groups *
- (tree_traversal_cost + 1/double(TIME_FOR_COMPARE));
+ (tree_traversal_cost + 1/double(TIME_FOR_COMPARE_IDX));
*read_cost= io_cost + cpu_cost;
*records= num_groups;
diff --git a/sql/opt_subselect.h b/sql/opt_subselect.h
index 031118288b9..e81b100e2a1 100644
--- a/sql/opt_subselect.h
+++ b/sql/opt_subselect.h
@@ -303,6 +303,7 @@ public:
pos->loosescan_picker.loosescan_parts= best_max_loose_keypart + 1;
pos->use_join_buffer= FALSE;
pos->table= tab;
+ pos->range_rowid_filter_info= tab->range_rowid_filter_info;
// todo need ref_depend_map ?
DBUG_PRINT("info", ("Produced a LooseScan plan, key %s, %s",
tab->table->key_info[best_loose_scan_key].name.str,
diff --git a/sql/rowid_filter.cc b/sql/rowid_filter.cc
index bcca9a0f528..4348eea081d 100644
--- a/sql/rowid_filter.cc
+++ b/sql/rowid_filter.cc
@@ -1,218 +1,556 @@
+/*
+ Copyright (c) 2018, 2019 MariaDB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
+
+#include "mariadb.h"
+#include "table.h"
+#include "sql_class.h"
+#include "opt_range.h"
#include "rowid_filter.h"
+#include "sql_select.h"
+
+
+inline
+double Range_rowid_filter_cost_info::lookup_cost(
+ Rowid_filter_container_type cont_type)
+{
+ switch (cont_type) {
+ case SORTED_ARRAY_CONTAINER:
+ return log(est_elements)*0.01;
+ default:
+ DBUG_ASSERT(0);
+ return 0;
+ }
+}
/**
- Sets information about filter with key_numb index.
- It sets a cardinality of filter, calculates its selectivity
- and gets slope and interscept values.
+ @brief
+ The average gain in cost per row to use the range filter with this cost info
*/
-void Range_filter_cost_info::init(TABLE *tab, uint key_numb)
+inline
+double Range_rowid_filter_cost_info::avg_access_and_eval_gain_per_row(
+ Rowid_filter_container_type cont_type)
{
- table= tab;
- key_no= key_numb;
- cardinality= table->quick_rows[key_no];
- b= filter_io_cost() + filter_write_cost() + filter_sort_cost();
- selectivity= cardinality/((double) table->stat_records());
- a= (1 + COST_COND_EVAL)*(1 - selectivity) - lookup_cost();
- intersect_x_axis_abcissa= b/a;
+ return (1+1.0/TIME_FOR_COMPARE) * (1 - selectivity) -
+ lookup_cost(cont_type);
}
-
/**
@brief
- Sort available filters by their building cost in the increasing order
+ Initialize the cost info structure for a range filter
- @details
- The method starts sorting available filters from the first filter that
- is not defined as the best filter. If there are two filters that are
- defined as the best filters there is no need to sort other filters.
- Best filters are already sorted by their building cost and have the
- smallest bulding cost in comparison with other filters by definition.
+ @param cont_type The type of the container of the range filter
+ @param tab The table for which the range filter is evaluated
+ @param idx The index used to create this range filter
+*/
+
+void Range_rowid_filter_cost_info::init(Rowid_filter_container_type cont_type,
+ TABLE *tab, uint idx)
+{
+ container_type= cont_type;
+ table= tab;
+ key_no= idx;
+ est_elements= table->quick_rows[key_no];
+ b= build_cost(container_type);
+ selectivity= est_elements/((double) table->stat_records());
+ a= avg_access_and_eval_gain_per_row(container_type);
+ if (a > 0)
+ cross_x= b/a;
+ abs_independent.clear_all();
+}
- As the sorting method bubble sort is used.
+
+/**
+ @brief
+ Return the cost of building a range filter of a certain type
*/
-void TABLE::sort_range_filter_cost_info_array()
+double
+Range_rowid_filter_cost_info::build_cost(Rowid_filter_container_type cont_type)
{
- if (best_filter_count == 2)
- return;
+ double cost= 0;
- for (uint i= best_filter_count; i < range_filter_cost_info_elements-1; i++)
- {
- for (uint j= i+1; j < range_filter_cost_info_elements; j++)
- {
- if (range_filter_cost_info[i].intersect_x_axis_abcissa >
- range_filter_cost_info[j].intersect_x_axis_abcissa)
- swap_variables(Range_filter_cost_info,
- range_filter_cost_info[i],
- range_filter_cost_info[j]);
- }
+ cost+= table->quick_index_only_costs[key_no];
+
+ switch (cont_type) {
+
+ case SORTED_ARRAY_CONTAINER:
+ cost+= ARRAY_WRITE_COST * est_elements; /* cost filling the container */
+ cost+= ARRAY_SORT_C * est_elements * log(est_elements); /* sorting cost */
+ break;
+ default:
+ DBUG_ASSERT(0);
}
+
+ return cost;
+}
+
+
+Rowid_filter_container *Range_rowid_filter_cost_info::create_container()
+{
+ THD *thd= table->in_use;
+ uint elem_sz= table->file->ref_length;
+ Rowid_filter_container *res= 0;
+
+ switch (container_type) {
+ case SORTED_ARRAY_CONTAINER:
+ res= new (thd->mem_root) Rowid_filter_sorted_array(est_elements, elem_sz);
+ break;
+ default:
+ DBUG_ASSERT(0);
+ }
+ return res;
+}
+
+
+static
+int compare_range_rowid_filter_cost_info_by_a(
+ Range_rowid_filter_cost_info **filter_ptr_1,
+ Range_rowid_filter_cost_info **filter_ptr_2)
+{
+ double diff= (*filter_ptr_2)->get_a() - (*filter_ptr_1)->get_a();
+ return (diff < 0 ? -1 : (diff > 0 ? 1 : 0));
}
/**
@brief
- The method searches for the filters that can reduce the join cost the most
+ Prepare the array with cost info on range filters to be used by optimizer
@details
- The method looks through the available filters trying to choose the best
- filter and eliminate as many filters as possible.
-
- Filters are considered as a linear functions. The best filter is the linear
- function that intersects all other linear functions not in the I quadrant
- and has the biggest a (slope) value. This filter will reduce the partial
- join cost the most. If it is possible the second best filter is also
- chosen. The second best filter can be used if the ref access is made on
- the index of the first best filter.
-
- So there is no need to store all other filters except filters that
- intersect in the I quadrant. It is impossible to say on this step which
- filter is better and will give the biggest gain.
-
- The number of filters that can be used is stored in the
- range_filter_cost_info_elements variable.
+ The function removes from the array of cost info on range filters the elements
+ for those range filters that won't be ever chosen as the best filter, no
+ matter what index will be used to access the table and at what step the
+ table will be joined.
*/
-void TABLE::prune_range_filters()
+void TABLE::prune_range_rowid_filters()
{
- key_map pruned_filter_map;
- pruned_filter_map.clear_all();
- Range_filter_cost_info *max_slope_filters[2] = {0, 0};
-
- for (uint i= 0; i < range_filter_cost_info_elements; i++)
+ uint i, j;
+
+ /*
+ For the elements of the array with cost info on range filters
+ build a bit matrix of absolutely independent elements.
+ Two elements are absolutely independent if they use such indexes that
+ there is no other index that overlaps both of them or is constraint
+ correlated with both of them. Use abs_independent key maps to store
+ the elements of this bit matrix.
+ */
+
+ Range_rowid_filter_cost_info **filter_ptr_1= range_rowid_filter_cost_info_ptr;
+ for (i= 0; i < range_rowid_filter_cost_info_elems; i++, filter_ptr_1++)
{
- Range_filter_cost_info *filter= &range_filter_cost_info[i];
- if (filter->a < 0)
+ uint key_no= (*filter_ptr_1)->key_no;
+ Range_rowid_filter_cost_info **filter_ptr_2= filter_ptr_1 + 1;
+ for (j= i+1; j < range_rowid_filter_cost_info_elems; j++, filter_ptr_2++)
{
- range_filter_cost_info_elements--;
- swap_variables(Range_filter_cost_info, range_filter_cost_info[i],
- range_filter_cost_info[range_filter_cost_info_elements]);
- continue;
- }
- for (uint j= i+1; j < range_filter_cost_info_elements; j++)
- {
- Range_filter_cost_info *cand_filter= &range_filter_cost_info[j];
-
- double intersect_x= filter->get_intersect_x(cand_filter);
- double intersect_y= filter->get_intersect_y(intersect_x);
-
- if (intersect_x > 0 && intersect_y > 0)
+ key_map map= key_info[key_no].overlapped;
+ map.intersect(key_info[(*filter_ptr_2)->key_no].overlapped);
+ if (map.is_clear_all())
{
- pruned_filter_map.set_bit(cand_filter->key_no);
- pruned_filter_map.set_bit(filter->key_no);
+ (*filter_ptr_1)->abs_independent.set_bit((*filter_ptr_2)->key_no);
+ (*filter_ptr_2)->abs_independent.set_bit(key_no);
}
}
- if (!pruned_filter_map.is_set(filter->key_no))
+ }
+
+ /* Sort the array range_filter_cost_info by 'a' in descending order */
+ my_qsort(range_rowid_filter_cost_info_ptr,
+ range_rowid_filter_cost_info_elems,
+ sizeof(Range_rowid_filter_cost_info *),
+ (qsort_cmp) compare_range_rowid_filter_cost_info_by_a);
+
+ /*
+ For each element check whether it is created for the filter that
+ can ever be chosen as the best one. If it's not the case remove it
+ from the array. Otherwise put it in the array in such a place
+ that all already checked elements left in the array are ordered by
+ cross_x.
+ */
+
+ Range_rowid_filter_cost_info **cand_filter_ptr=
+ range_rowid_filter_cost_info_ptr;
+ for (i= 0; i < range_rowid_filter_cost_info_elems; i++, cand_filter_ptr++)
+ {
+ bool is_pruned= false;
+ Range_rowid_filter_cost_info **usable_filter_ptr=
+ range_rowid_filter_cost_info_ptr;
+ key_map abs_indep;
+ abs_indep.clear_all();
+ for (uint j= 0; j < i; j++, usable_filter_ptr++)
{
- if (!max_slope_filters[0])
- max_slope_filters[0]= filter;
+ if ((*cand_filter_ptr)->cross_x >= (*usable_filter_ptr)->cross_x)
+ {
+ if (abs_indep.is_set((*usable_filter_ptr)->key_no))
+ {
+ /*
+ The following is true here for the element e being checked:
+ There are at least 2 elements e1 and e2 among those already selected such that
+ e1.cross_x < e.cross_x and e1.a > e.a
+ and
+ e2.cross_x < e.cross_x and e2.a > e.a,
+ i.e. the range filters f1, f2 of both e1 and e2 always promise
+ better gains than the range filter f of e.
+ As e1 and e2 are absolutely independent, one of the range filters
+ f1, f2 will always be a better choice than f no matter what index
+ is chosen to access the table. Because of this the element e
+ can be safely removed from the array.
+ */
+
+ is_pruned= true;
+ break;
+ }
+ abs_indep.merge((*usable_filter_ptr)->abs_independent);
+ }
else
{
- if (!max_slope_filters[1] ||
- max_slope_filters[1]->a < filter->a)
- max_slope_filters[1]= filter;
- if (max_slope_filters[0]->a < max_slope_filters[1]->a)
- swap_variables(Range_filter_cost_info*, max_slope_filters[0],
- max_slope_filters[1]);
+ /*
+ Move the element being checked to the proper position to have all
+ elements that have been already checked to be sorted by cross_x
+ */
+ Range_rowid_filter_cost_info *moved= *cand_filter_ptr;
+ memmove(usable_filter_ptr+1, usable_filter_ptr,
+ sizeof(Range_rowid_filter_cost_info *) * (i-j-1));
+ *usable_filter_ptr= moved;
}
}
- }
-
- for (uint i= 0; i<2; i++)
- {
- if (max_slope_filters[i])
+ if (is_pruned)
{
- swap_variables(Range_filter_cost_info,
- range_filter_cost_info[i],
- *max_slope_filters[i]);
- if (i == 0 &&
- max_slope_filters[1] == &range_filter_cost_info[0])
- max_slope_filters[1]= max_slope_filters[0];
-
- best_filter_count++;
- max_slope_filters[i]= &range_filter_cost_info[i];
+ /* Remove the checked element from the array */
+ memmove(cand_filter_ptr, cand_filter_ptr+1,
+ sizeof(Range_rowid_filter_cost_info *) *
+ (range_rowid_filter_cost_info_elems - 1 - i));
+ range_rowid_filter_cost_info_elems--;
}
}
- sort_range_filter_cost_info_array();
}
-void TABLE::select_usable_range_filters(THD *thd)
+/**
+ @brief
+ Return the maximum number of elements that a container is allowed to have
+ */
+
+static uint
+get_max_range_rowid_filter_elems_for_table(
+ THD *thd, TABLE *tab,
+ Rowid_filter_container_type cont_type)
+{
+ switch (cont_type) {
+ case SORTED_ARRAY_CONTAINER :
+ return thd->variables.max_rowid_filter_size/tab->file->ref_length;
+ default :
+ DBUG_ASSERT(0);
+ return 0;
+ }
+}
+
+
+/**
+ @brief
+ Prepare info on possible range filters used by optimizer
+
+ @param thd The thread handler
+
+ @details
+ The function first selects the indexes of the table that potentially
+ can be used for range filters and allocates an array of the objects
+ of the Range_rowid_filter_cost_info type to store cost info on
+ possible range filters and an array of pointers to these objects.
+ The latter is created for easy sorting of the objects with cost info
+ by different sort criteria. Then the function initializes the allocated
+ array with cost info for each possible range filter. After this
+ the function calls the method TABLE::prune_range_rowid_filters().
+ The method removes the elements of the array for the filters that
+ promise less gain then others remaining in the array in any situation
+ and optimizes the order of the elements for faster choice of the best
+ range filter.
+*/
+
+void TABLE::init_cost_info_for_usable_range_rowid_filters(THD *thd)
{
uint key_no;
key_map usable_range_filter_keys;
usable_range_filter_keys.clear_all();
key_map::Iterator it(quick_keys);
+
+ /*
+ From all indexes that can be used for range accesses select only such that
+ - range filter pushdown is supported by the engine for them (1)
+ - they are not clustered primary (2)
+ - the range filter containers for them are not too large (3)
+ */
while ((key_no= it++) != key_map::Iterator::BITMAP_END)
{
- if (quick_rows[key_no] >
- thd->variables.max_rowid_filter_size/file->ref_length)
+ if (!(file->index_flags(key_no, 0, 1) & HA_DO_RANGE_FILTER_PUSHDOWN)) // !1
+ continue;
+ if (key_no == s->primary_key && file->primary_key_is_clustered()) // !2
+ continue;
+ if (quick_rows[key_no] >
+ get_max_range_rowid_filter_elems_for_table(thd, this,
+ SORTED_ARRAY_CONTAINER)) // !3
continue;
usable_range_filter_keys.set_bit(key_no);
}
- if (usable_range_filter_keys.is_clear_all())
+ /*
+ Allocate an array of objects to store cost info for the selected filters
+ and allocate an array of pointers to these objects
+ */
+
+ range_rowid_filter_cost_info_elems= usable_range_filter_keys.bits_set();
+ if (!range_rowid_filter_cost_info_elems)
+ return;
+
+ range_rowid_filter_cost_info_ptr=
+ (Range_rowid_filter_cost_info **)
+ thd->calloc(sizeof(Range_rowid_filter_cost_info *) *
+ range_rowid_filter_cost_info_elems);
+ range_rowid_filter_cost_info=
+ new (thd->mem_root)
+ Range_rowid_filter_cost_info[range_rowid_filter_cost_info_elems];
+ if (!range_rowid_filter_cost_info_ptr || !range_rowid_filter_cost_info)
+ {
+ range_rowid_filter_cost_info_elems= 0;
return;
+ }
+
+ /* Fill the allocated array with cost info on the selected range filters */
- range_filter_cost_info_elements= usable_range_filter_keys.bits_set();
- range_filter_cost_info=
- new (thd->mem_root) Range_filter_cost_info [range_filter_cost_info_elements];
- Range_filter_cost_info *curr_filter_cost_info= range_filter_cost_info;
+ Range_rowid_filter_cost_info **curr_ptr= range_rowid_filter_cost_info_ptr;
+ Range_rowid_filter_cost_info *curr_filter_cost_info=
+ range_rowid_filter_cost_info;
key_map::Iterator li(usable_range_filter_keys);
while ((key_no= li++) != key_map::Iterator::BITMAP_END)
{
- curr_filter_cost_info->init(this, key_no);
+ *curr_ptr= curr_filter_cost_info;
+ curr_filter_cost_info->init(SORTED_ARRAY_CONTAINER, this, key_no);
+ curr_ptr++;
curr_filter_cost_info++;
}
- prune_range_filters();
+
+ prune_range_rowid_filters();
}
-Range_filter_cost_info
-*TABLE::best_filter_for_current_join_order(uint ref_key_no,
- double record_count,
- double records)
+/**
+ @brief
+ Choose the best range filter for the given access of the table
+
+ @param access_key_no The index by which the table is accessed
+ @param records The estimated total number of key tuples with this access
+
+ @details
+ The function looks through the array of cost info for range filters
+ and chooses the element for the range filter that promises the greatest
+ gain with the ref or range access of the table by access_key_no.
+ As the array is sorted by cross_x in ascending order the function stops
+ the look through as soon as it reaches the first element with
+ cross_x > records because the range filter for this element and the
+ range filters for all remaining elements do not promise positive gains
+
+ @retval Pointer to the cost info for the range filter that promises
+ the greatest gain, NULL if there is no such range filter
+*/
+
+Range_rowid_filter_cost_info *
+TABLE::best_range_rowid_filter_for_partial_join(uint access_key_no,
+ double records)
{
- if (!this || range_filter_cost_info_elements == 0)
+ if (!this || range_rowid_filter_cost_info_elems == 0 ||
+ covering_keys.is_set(access_key_no))
return 0;
- double card= record_count*records;
- Range_filter_cost_info *best_filter= &range_filter_cost_info[0];
-
- if (card < best_filter->intersect_x_axis_abcissa)
+ /*
+ Currently we do not support usage of range filters if the table
+ is accessed by the clustered primary key. It does not make sense
+ if a full key is used. If the table is accessed by a partial
+ clustered primary key it would, but the current InnoDB code does not
+ allow it. Later this limitation will be lifted
+ */
+ if (access_key_no == s->primary_key && file->primary_key_is_clustered())
return 0;
- if (best_filter_count != 0)
+
+ Range_rowid_filter_cost_info *best_filter= 0;
+ double best_filter_gain= 0;
+
+ key_map *overlapped= &key_info[access_key_no].overlapped;
+ for (uint i= 0; i < range_rowid_filter_cost_info_elems ; i++)
{
- if (best_filter->key_no == ref_key_no)
+ double curr_gain = 0;
+ Range_rowid_filter_cost_info *filter= range_rowid_filter_cost_info_ptr[i];
+
+ /*
+ Do not use a range filter that uses an index correlated with
+ the index by which the table is accessed
+ */
+ if ((filter->key_no == access_key_no) ||
+ overlapped->is_set(filter->key_no))
+ continue;
+
+ if (records < filter->cross_x)
{
- if (best_filter_count == 2)
- {
- best_filter= &range_filter_cost_info[1];
- if (card < best_filter->intersect_x_axis_abcissa)
- return 0;
- return best_filter;
- }
+ /* Does not make sense to look through the remaining filters */
+ break;
+ }
+
+ curr_gain= filter->get_gain(records);
+ if (best_filter_gain < curr_gain)
+ {
+ best_filter_gain= curr_gain;
+ best_filter= filter;
+ }
+ }
+ return best_filter;
+}
+
+
+/**
+ @brief
+ Fill the range rowid filter performing the associated range index scan
+
+ @details
+ This function performs the range index scan associated with this
+ range filter and places into the filter the rowids / primary keys
+ read from key tuples when doing this scan.
+ @retval
+ false on success
+ true otherwise
+
+ @note
+ The function assumes that the quick select object to perform
+ the index range scan has been already created.
+
+ @note
+ Currently the same table handler is used to access the joined table
+ and to perform range index scan filling the filter.
+ In the future two different handlers will be used for these
+ purposes to facilitate lazy building of the filter.
+*/
+
+bool Range_rowid_filter::fill()
+{
+ int rc= 0;
+ handler *file= table->file;
+ THD *thd= table->in_use;
+ QUICK_RANGE_SELECT* quick= (QUICK_RANGE_SELECT*) select->quick;
+
+ uint table_status_save= table->status;
+ Item *pushed_idx_cond_save= file->pushed_idx_cond;
+ uint pushed_idx_cond_keyno_save= file->pushed_idx_cond_keyno;
+ bool in_range_check_pushed_down_save= file->in_range_check_pushed_down;
+
+ table->status= 0;
+ file->pushed_idx_cond= 0;
+ file->pushed_idx_cond_keyno= MAX_KEY;
+ file->in_range_check_pushed_down= false;
+
+ /* We're going to just read rowids / primary keys */
+ table->prepare_for_position();
+
+ table->file->ha_start_keyread(quick->index);
+
+ if (quick->init() || quick->reset())
+ rc= 1;
+
+ while (!rc)
+ {
+ rc= quick->get_next();
+ if (thd->killed)
+ rc= 1;
+ if (!rc)
+ {
+ file->position(quick->record);
+ if (container->add(NULL, (char*) file->ref))
+ rc= 1;
}
+ }
+
+ quick->range_end();
+ table->file->ha_end_keyread();
+
+ table->status= table_status_save;
+ file->pushed_idx_cond= pushed_idx_cond_save;
+ file->pushed_idx_cond_keyno= pushed_idx_cond_keyno_save;
+ file->in_range_check_pushed_down= in_range_check_pushed_down_save;
+
+ if (rc != HA_ERR_END_OF_FILE)
+ return 1;
+ table->file->rowid_filter_is_active= true;
+ return 0;
+}
+
+
+/**
+ @brief
+ Binary search in the sorted array of a rowid filter
+
+ @param ctxt context of the search
+ @param elem The rowid / primary key to look for
+
+ @details
+ The function looks for the rowid / primary key 'elem' in this container
+ assuming that ctxt contains a pointer to the TABLE structure created
+ for the table to whose row elem refers.
+
+ @retval
+ true elem is found in the container
+ false otherwise
+*/
+
+bool Rowid_filter_sorted_array::check(void *ctxt, char *elem)
+{
+ TABLE *table= (TABLE *) ctxt;
+ if (!is_checked)
+ {
+ refpos_container.sort(refpos_order_cmp, (void *) (table->file));
+ is_checked= true;
+ }
+ int l= 0;
+ int r= refpos_container.elements()-1;
+ while (l <= r)
+ {
+ int m= (l + r) / 2;
+ int cmp= refpos_order_cmp((void *) (table->file),
+ refpos_container.get_pos(m), elem);
+ if (cmp == 0)
+ return true;
+ if (cmp < 0)
+ l= m + 1;
else
- return best_filter;
+ r= m-1;
}
+ return false;
+}
- double best_filter_improvement= 0.0;
- best_filter= 0;
- for (uint i= best_filter_count; i < range_filter_cost_info_elements; i++)
+Range_rowid_filter::~Range_rowid_filter()
+{
+ delete container;
+ container= 0;
+ if (select)
{
- Range_filter_cost_info *filter= &range_filter_cost_info[i];
- if (card < filter->intersect_x_axis_abcissa)
- break;
- if (best_filter_improvement < filter->get_filter_gain(card))
+ if (select->quick)
{
- best_filter_improvement= filter->get_filter_gain(card);
- best_filter= filter;
+ delete select->quick;
+ select->quick= 0;
}
+ delete select;
+ select= 0;
}
- return best_filter;
}
diff --git a/sql/rowid_filter.h b/sql/rowid_filter.h
index 0d8520f25c5..3cc2c31555c 100644
--- a/sql/rowid_filter.h
+++ b/sql/rowid_filter.h
@@ -1,183 +1,451 @@
+/*
+ Copyright (c) 2018, 2019 MariaDB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
+
#ifndef ROWID_FILTER_INCLUDED
#define ROWID_FILTER_INCLUDED
-/**
- It makes sense to apply filters for a certain join order when the following
- inequality holds:
- #T + c4*#T > #T*sel(Fi) + c4*#T*sel(Fi) +
- I/O(Fi) + c1*#(Fi) + c2*#(Fi)*log(#(Fi)) +
- c3*#T (1),
+#include "mariadb.h"
+#include "sql_array.h"
+
+/*
+
+ What rowid / primary filters are
+ --------------------------------
+
+ Consider a join query Q of the form
+ SELECT * FROM T1, ... , Tk WHERE P.
+
+ For any of the table reference Ti(Q) from the from clause of Q different
+ rowid / primary key filters (pk-filters for short) can be built.
+ A pk-filter F built for Ti(Q) is a set of rowids / primary keys of Ti
+ F= {pk1,...,pkN} such that for any row r=r1||...||rk from the result set of Q
+ ri's rowid / primary key pk(ri) is contained in F.
+
+ When pk-filters are useful
+ --------------------------
+
+ If building a pk-filter F for Ti(Q) is not too costly and its cardinality #F
+ is much less than the cardinality of Ti (#Ti), then using the pk-filter when
+ executing Q might be quite beneficial.
+
+ Let r be a random row from Ti. Let s(F) be the probability that pk(r)
+ belongs to F. Let BC(F) be the cost of building F.
+
+ Suppose that the optimizer has chosen for Q a plan with this join order
+ T1 => ... Tk and that the table Ti is accessed by a ref access using index I.
+ Let K = {k1,...,kM} be the set of all rowid/primary keys values used to access
+ rows of Ti when looking for matches in this table.to join Ti by index I.
+
+ Let's assume that the two sets K and F are uncorrelated. With this assumption,
+ if before accessing data from Ti by the rowid / primary key k we first
+ check whether k is in F, then we can expect to save on M*(1-s(F)) accesses of
+ data rows from Ti. If we can guarantee that the test whether k is in F is
+ relatively cheap, then we can gain a lot, assuming that BC(F) is much less
+ than the cost of fetching M*(1-s(F)) records from Ti and the subsequent
+ evaluation of conditions pushed into Ti.
+
+ Making pk-filter test cheap
+ ---------------------------
+
+ If the search structure to test whether an element is in F can be fully
+ placed in RAM then this test is expected to be much cheaper than a random
+ access of a record from Ti. We'll consider two search structures for
+ pk-filters: ordered array and bloom filter. Ordered array is easy to
+ implement, but it's space consuming. If a filter contains primary keys
+ then at least space for each primary key from the filter must be allocated
+ in the search structure. In contrast, a bloom filter requires a
+ fixed number of bits and this number does not depend on the cardinality
+ of the pk-filter (10 bits per element will serve pk-filter of any size).
- where #T - the fanout of the partial join
- Fi - a filter for the index with the number i in
- the key_map of available indexes for this table
- sel(Fi) - the selectivity of the index with the number
- i
- c4*#T,
- c4*#T*sel(Fi) - a cost to apply available predicates
- c4 - a constant to apply available predicates
- I/O(Fi) - a cost of the I/O accesses to Fi
- #(Fi) - a number of estimated records that range
- access would use
- c1*#(Fi) - a cost to write in Fi
- c1 - a constant to write one element in Fi
- c2*#(Fi)*log(#(Fi)) - a cost to sort in Fi
- c2 - a sorting constant
- c3*(#T) - a cost to look-up into a current partial join
- c3 - a constant to look-up into Fi
+*/
- Let's set a new variable FBCi (filter building cost for the filter with
- index i):
+/*
+
+ How and when the optimizer builds and uses range rowid filters
+ --------------------------------------------------------------
+
+ 1. In make_join_statistics()
+ for each join table s
+ after the call of get_quick_record_count()
+ the method TABLE::init_cost_info_for_usable_range_rowid_filters()
+ is called.
+ The method builds an array of Range_rowid_filter_cost_info elements
+ containing the cost info on possible range filters for s->table.
+ The array is optimized for further usage.
+
+ 2. For each partial join order when the optimizer considers joining
+ table s to this partial join
+ In the function best_access_path()
+ a. When evaluating a ref access r by index idx to join s
+ the optimizer estimates the effect of usage of each possible
+ range filter f and chooses one with the best gain. The gain
+ is taken into account when the cost of the ref access r is
+ calculated. If it turns out that this is the best ref access
+ to join s then the info about the chosen filter together
+ with the info on r is remembered in the corresponding element
+ of the array of POSITION structures.
+ [We evaluate every pair (ref access, range_filter) rather than
+ every pair (best ref access, range filter) because if the index
+ ref_idx used for ref access r correlates with the index rf_idx
+ used by the filter f then the pair (r,f) is not evaluated
+ at all as we don't know how to estimate the effect of correlation
+ between ref_idx and rf_idx.]
+ b. When evaluating the best range access to join table s the
+ optimizer estimates the effect of usage of each possible
+ range filter f and chooses one with the best gain.
+ [Here we should have evaluated every pair (range access,
+ range filter) as well, but it's not done yet.]
+
+ 3. When the cheapest execution plan has been chosen and after the
+ call of JOIN::get_best_combination()
+ The method JOIN::make_range_rowid_filters() is called
+ For each range rowid filter used in the chosen execution plan
+ the method creates a quick select object to be able to perform
+ index range scan to fill the filter at the execution stage.
+ The method also creates Range_rowid_filter objects that are
+ used at the execution stage.
+
+ 4. Just before the execution stage
+ The method JOIN::init_range_rowid_filters() is called.
+ For each join table s that is to be accessed with usage of a range
+ filter the method allocates containers for the range filter and
+ it lets the engine know that the filter will be used when
+ accessing s.
+
+ 5. At the execution stage
+ In the function sub_select() just before the first access of a join
+ table s employing a range filter
+ The method JOIN_TAB::build_range_rowid_filter_if_needed() is called
+ The method fills the filter using the quick select created by
+ JOIN::make_range_rowid_filters().
+
+ 6. The accessed key tuples are checked against the filter within the engine
+ using the info pushed into it.
- FBCi = I/O(Fi) + c1*#(Fi) + c2*#(Fi)*log(#(Fi))
+*/
- It can be seen that FBCi doesn't depend on #T.
+class TABLE;
+class SQL_SELECT;
+class Rowid_filter_container;
+class Range_rowid_filter_cost_info;
- So using this variable (1) can be rewritten:
+/* Cost to write rowid into array */
+#define ARRAY_WRITE_COST 0.005
+/* Factor used to calculate cost of sorting rowids in array */
+#define ARRAY_SORT_C 0.01
+/* Cost to evaluate condition */
+#define COST_COND_EVAL 0.2
- #T + c4*#T > #T*sel(Fi) + c4*#T*sel(Fi) +
- FBCi +
- c3*#T
+typedef enum
+{
+ SORTED_ARRAY_CONTAINER,
+ BLOOM_FILTER_CONTAINER
+} Rowid_filter_container_type;
- To get a possible cost improvement when a filter is used right part
- of the (1) inequality should be deducted from the left part.
- Denote it as G(#T):
+/**
+ @class Rowid_filter_container
- G(#T)= #T + c4*#T - (#T*sel(Fi) + c4*#T*sel(Fi) + FBCi + c3*#T) (2)
+ The interface for different types of containers to store info on the set
+ of rowids / primary keys that defines a pk-filter.
- On the prepare stage when filters are created #T value isn't known.
+ There will be two implementations of this abstract class.
+ - sorted array
+ - bloom filter
+*/
- To find out what filter is the best among available one for the table
- (what filter gives the biggest gain) a knowledge about linear functions
- can be used. Consider filter gain as a linear function:
+class Rowid_filter_container : public Sql_alloc
+{
+public:
- Gi(#T)= ai*#T + bi (3)
+ virtual Rowid_filter_container_type get_type() = 0;
- where ai= 1+c4-c3-sel(Fi)*(1+c4),
- bi= -FBCi
+ /* Allocate memory for the container */
+ virtual bool alloc() = 0;
- Filter gain can be interpreted as an ordinate, #T as abscissa.
+ /*
+ @brief Add info on a rowid / primary to the container
+ @param ctxt The context info (opaque)
+ @param elem The rowid / primary key to be added to the container
+ @retval true if elem is successfully added
+ */
+ virtual bool add(void *ctxt, char *elem) = 0;
- So the aim is to find the linear function that has the biggest ordinate value
- for each positive abscissa (because #T can't be negative) comparing with
- the other available functions.
+ /*
+ @brief Check whether a rowid / primary key is in container
+ @param ctxt The context info (opaque)
+ @param elem The rowid / primary key to be checked against the container
+ @retval False if elem is definitely not in the container
+ */
+ virtual bool check(void *ctxt, char *elem) = 0;
- Consider two filters Fi, Fj or linear functions with a positive slope.
- To find out which linear function is better let's find their intersection
- point coordinates.
+ virtual ~Rowid_filter_container() {}
+};
- Gi(#T0)= Gj(#T0) (using (2))=>
- #T0= (bj - bi)/(ai - aj) (using (3))
- =>
- #T0= (BCFj-BCFi)/((sel(Fj)-sel(Fi))*(1+c4))
- If put #T0 value into the (3) formula G(#T0) can be easily found.
+/**
+ @class Rowid_filter
- It can be seen that if two linear functions intersect in II, III or IV
- quadrants the linear function with a bigger slope value will always
- be better.
+ The interface for different types of pk-filters
- If two functions intersect in the I quadrant for #T1 < #T0 a function
- with a smaller slope value will give a better gain and when #T1 > #T0
- function with a bigger slope will give better gain.
+ Currently we support only range pk filters.
+*/
- for each #T1 > #T0 if (ai > aj) => (Gi(#T1) >= Gj(#T1))
- #T1 <= #T0 if (ai > aj) => (Gi(#T1) <= Gj(#T1))
+class Rowid_filter : public Sql_alloc
+{
+protected:
- So both linear functions should be saved.
+ /* The container to store info on the set of elements in the filter */
+ Rowid_filter_container *container;
- Interesting cases:
+public:
+ Rowid_filter(Rowid_filter_container *container_arg)
+ : container(container_arg) {}
- 1. For Fi,Fj filters ai=aj.
+ /*
+ Build the filter:
+ fill it with info on the set of elements placed there
+ */
+ virtual bool build() = 0;
- In this case intercepts bi and bj should be compared.
- The filter with the biggest intercept will give a better result.
+ /*
+ Check whether an element is in the filter.
+ Returns false if the element is definitely not in the filter.
+ */
+ virtual bool check(char *elem) = 0;
- 2. Only one filter remains after the calculations and for some join order
- it is equal to the index that is used to access table. Therefore, this
- filter can't be used.
-
- In this case the gain is computed for every filter that can be constructed
- for this table.
+ virtual ~Rowid_filter() {}
- After information about filters is computed for each partial join order
- it is checked if the filter can be applied to the current table.
- If it gives a cost improvement it is saved as the best plan for this
- partial join.
-*/
+ Rowid_filter_container *get_container() { return container; }
+};
-#include "mariadb.h"
-#include "sql_class.h"
-#include "table.h"
-/* Cost to write into filter */
-#define COST_WRITE 0.01
-/* Weight factor for filter sorting */
-#define CNST_SORT 0.01
-/* Cost to evaluate condition */
-#define COST_COND_EVAL 0.2
+/**
+ @class Range_rowid_filter
-class QUICK_RANGE_SELECT;
+ The implementation of the Rowid_filter interface used for pk-filters
+ that are filled when performing range index scans.
+*/
-class Range_filter_cost_info : public Sql_alloc
+class Range_rowid_filter: public Rowid_filter
{
-public:
+ /* The table for which the rowid filter is built */
TABLE *table;
- uint key_no;
- double cardinality;
- double b; // intercept of the linear function
- double a; // slope of the linear function
- double selectivity;
- double intersect_x_axis_abcissa;
+ /* The select to perform the range scan to fill the filter */
+ SQL_SELECT *select;
+ /* The cost info on the filter (used for EXPLAIN/ANALYZE) */
+ Range_rowid_filter_cost_info *cost_info;
+
+public:
+ Range_rowid_filter(TABLE *tab,
+ Range_rowid_filter_cost_info *cost_arg,
+ Rowid_filter_container *container_arg,
+ SQL_SELECT *sel)
+ : Rowid_filter(container_arg), table(tab), select(sel), cost_info(cost_arg)
+ {}
+
+ ~Range_rowid_filter();
+
+ bool build() { return fill(); }
- /**
- Filter cost functions
+ bool check(char *elem) { return container->check(table, elem); }
+
+ bool fill();
+
+ SQL_SELECT *get_select() { return select; }
+};
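The body of Range_rowid_filter::fill() lives in sql/rowid_filter.cc, which is not part of this excerpt. A hedged sketch of what it presumably does, given the members declared above (the helper name and the error handling are assumptions; the usual server headers are assumed to be included):

    /* Sketch: scan the ranges with the quick select and collect rowids */
    static bool fill_filter_sketch(TABLE *table, SQL_SELECT *select,
                                   Rowid_filter_container *container)
    {
      handler *file= table->file;
      if (select->quick->reset())            /* start the range scan */
        return true;
      int rc;
      while ((rc= select->quick->get_next()) == 0)
      {
        file->position(table->record[0]);    /* fills file->ref with the rowid/PK */
        if (container->add(table, (char *) file->ref))
          return true;
      }
      return rc != HA_ERR_END_OF_FILE;       /* any other return code is an error */
    }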
+
+
+/**
+ @class Refpos_container_sorted_array
+
+ The wrapper class over Dynamic_array<char> to facilitate operations over
+ an array of elements of the type char[N] where N is the same for all elements
+*/
+
+class Refpos_container_sorted_array : public Sql_alloc
+{
+ /*
+ Maximum number of elements in the array
+ (Currently used only at the initialization of the dynamic array)
*/
- /* Cost to lookup into filter */
- inline double lookup_cost()
+ uint max_elements;
+ /* Number of bytes allocated for an element */
+ uint elem_size;
+ /* The dynamic array over which the wrapper is built */
+ Dynamic_array<char> *array;
+
+public:
+
+ Refpos_container_sorted_array(uint max_elems, uint elem_sz)
+ : max_elements(max_elems), elem_size(elem_sz), array(0) {}
+
+ ~Refpos_container_sorted_array()
{
- return log(cardinality)*0.01;
+ delete array;
+ array= 0;
}
- /* IO accesses cost to access filter */
- inline double filter_io_cost()
- { return table->quick_key_io[key_no]; }
+ bool alloc()
+ {
+ array= new Dynamic_array<char> (elem_size * max_elements,
+ elem_size * max_elements/sizeof(char) + 1);
+ return array == NULL;
+ }
- /* Cost to write elements in filter */
- inline double filter_write_cost()
- { return COST_WRITE*cardinality; }
+ bool add(char *elem)
+ {
+ for (uint i= 0; i < elem_size; i++)
+ {
+ if (array->append(elem[i]))
+ return true;
+ }
+ return false;
+ }
- /* Cost to sort elements in filter */
- inline double filter_sort_cost()
- {
- return CNST_SORT*cardinality*log(cardinality);
+ char *get_pos(uint n)
+ {
+ return array->get_pos(n * elem_size);
}
- /* End of filter cost functions */
- Range_filter_cost_info() : table(0), key_no(0) {}
+ uint elements() { return array->elements() / elem_size; }
+
+ void sort (int (*cmp) (void *ctxt, const void *el1, const void *el2),
+ void *cmp_arg)
+ {
+ my_qsort2(array->front(), array->elements()/elem_size,
+ elem_size, (qsort2_cmp) cmp, cmp_arg);
+ }
+};
+
+
+/**
+ @class Rowid_filter_sorted_array
+
+ The implementation of the Rowid_filter_container interface as
+ a sorted array container of rowids / primary keys
+*/
+
+class Rowid_filter_sorted_array: public Rowid_filter_container
+{
+ /* The dynamic array to store rowids / primary keys */
+ Refpos_container_sorted_array refpos_container;
+ /* Initially false, becomes true after the first call of check() */
+ bool is_checked;
+
+public:
+ Rowid_filter_sorted_array(uint elems, uint elem_size)
+ : refpos_container(elems, elem_size), is_checked(false) {}
+
+ Rowid_filter_container_type get_type()
+ { return SORTED_ARRAY_CONTAINER; }
+
+ bool alloc() { return refpos_container.alloc(); }
+
+ bool add(void *ctxt, char *elem) { return refpos_container.add(elem); }
+
+ bool check(void *ctxt, char *elem);
+};
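Rowid_filter_sorted_array::check() is only declared here; its implementation is in sql/rowid_filter.cc. A plausible sketch, assuming the is_checked flag triggers a one-time sort and that rowids are ordered with handler::cmp_ref() (both assumptions, not code from the patch):

    /* Comparator matching the signature expected by Refpos_container_sorted_array::sort() */
    static int refpos_cmp_sketch(void *arg, const void *a, const void *b)
    {
      TABLE *table= (TABLE *) arg;
      return table->file->cmp_ref((const uchar *) a, (const uchar *) b);
    }

    /* Sketch: lazy sort on the first probe, then binary search */
    static bool sorted_array_check_sketch(Refpos_container_sorted_array *c,
                                          bool *is_checked, TABLE *table,
                                          char *elem)
    {
      if (!*is_checked)
      {
        c->sort(refpos_cmp_sketch, (void *) table);
        *is_checked= true;
      }
      uint lo= 0, hi= c->elements();
      while (lo < hi)
      {
        uint mid= (lo + hi) / 2;
        int res= table->file->cmp_ref((uchar *) c->get_pos(mid), (uchar *) elem);
        if (res == 0)
          return true;                       /* elem is in the filter */
        if (res < 0)
          lo= mid + 1;
        else
          hi= mid;
      }
      return false;                          /* definitely not in the filter */
    }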
+
+/**
+ @class Range_rowid_filter_cost_info
+
+ An object of this class is created for each potentially usable
+ range filter. It contains the info that makes it possible to figure out
+ whether usage of the range filter promises some gain.
+*/
+
+class Range_rowid_filter_cost_info : public Sql_alloc
+{
+ /* The table for which the range filter is to be built (if needed) */
+ TABLE *table;
+ /* Estimated number of elements in the filter */
+ double est_elements;
+ /* The cost of building the range filter */
+ double b;
+ /*
+ a*N-b yields the gain of the filter
+ for N key tuples of the index key_no
+ */
+ double a;
+ /* The value of N where the gain is 0 */
+ double cross_x;
+ /* Used for pruning of the potential range filters */
+ key_map abs_independent;
+
+public:
+ /* The type of the container of the range filter */
+ Rowid_filter_container_type container_type;
+ /* The index whose range scan would be used to build the range filter */
+ uint key_no;
+ /* The selectivity of the range filter */
+ double selectivity;
+
+ Range_rowid_filter_cost_info() : table(0), key_no(0) {}
- void init(TABLE *tab, uint key_numb);
+ void init(Rowid_filter_container_type cont_type,
+ TABLE *tab, uint key_no);
- inline double get_intersect_x(Range_filter_cost_info *filter)
+ double build_cost(Rowid_filter_container_type container_type);
+
+ inline double lookup_cost(Rowid_filter_container_type cont_type);
+
+ inline double
+ avg_access_and_eval_gain_per_row(Rowid_filter_container_type cont_type);
+
+ /* Get the gain that usage of the filter promises for 'rows' key tuples */
+ inline double get_gain(double rows)
{
- if (a == filter->a)
- return DBL_MAX;
- return (b - filter->b)/(a - filter->a);
+ return rows * a - b;
}
- inline double get_intersect_y(double intersect_x)
+
+ /*
+ The gain promised by usage of the filter under the assumption that
+ when the table is accessed without the filter at most worst_seeks
+ pages are fetched from disk to access the data rows of the table
+ */
+ inline double get_adjusted_gain(double rows, double worst_seeks)
{
- if (intersect_x == DBL_MAX)
- return DBL_MAX;
- return intersect_x*a - b;
+ return get_gain(rows) -
+ (1 - selectivity) * (rows - MY_MIN(rows, worst_seeks));
}
- /**
- Get a gain that a usage of filter in some partial join order
- with the cardinaly card gives
+ /*
+ The gain promised by usage of the filter
+ due to fewer condition evaluations
*/
- inline double get_filter_gain(double card)
- { return card*a - b; }
+ inline double get_cmp_gain(double rows)
+ {
+ return rows * (1 - selectivity) / TIME_FOR_COMPARE;
+ }
+
+ Rowid_filter_container *create_container();
+
+ double get_a() { return a; }
+
+ friend
+ void TABLE::prune_range_rowid_filters();
+
+ friend
+ void TABLE::init_cost_info_for_usable_range_rowid_filters(THD *thd);
+
+ friend
+ Range_rowid_filter_cost_info *
+ TABLE::best_range_rowid_filter_for_partial_join(uint access_key_no,
+ double records);
};
#endif /* ROWID_FILTER_INCLUDED */
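For intuition about the cost members of Range_rowid_filter_cost_info, a worked example with invented numbers (TIME_FOR_COMPARE is 5, see sql_const.h below): take a= 0.9, b= 200, selectivity= 0.1, worst_seeks= 400 and rows= 1000 probed key tuples. Then

    get_gain(1000)               = 1000*0.9 - 200                  = 700
    cross_x                      = b/a = 200/0.9                   ~ 222  (below ~222 probes the filter is a net loss)
    get_cmp_gain(1000)           = 1000*(1 - 0.1)/TIME_FOR_COMPARE = 180
    get_adjusted_gain(1000, 400) = 700 - (1 - 0.1)*(1000 - 400)    = 160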
diff --git a/sql/sql_array.h b/sql/sql_array.h
index 0e5246b7e2a..4a49ade5576 100644
--- a/sql/sql_array.h
+++ b/sql/sql_array.h
@@ -166,6 +166,19 @@ public:
return ((const Elem*)array.buffer) + array.elements - 1;
}
+ /// @returns pointer to n-th element
+ Elem *get_pos(size_t idx)
+ {
+ return ((Elem*)array.buffer) + idx;
+ }
+
+ /// @returns pointer to n-th element
+ const Elem *get_pos(size_t idx) const
+ {
+ return ((const Elem*)array.buffer) + idx;
+ }
+
+
/**
@retval false ok
@retval true OOM, @c my_error() has been called.
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 0dd82351719..eae7d988772 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -2673,7 +2673,7 @@ void THD::make_explain_field_list(List<Item> &field_list, uint8 explain_flags,
mem_root);
item->maybe_null=1;
field_list.push_back(item=new (mem_root)
- Item_empty_string(this, "ref|filter",
+ Item_empty_string(this, "ref",
NAME_CHAR_LEN*MAX_REF_PARTS, cs),
mem_root);
item->maybe_null=1;
diff --git a/sql/sql_const.h b/sql/sql_const.h
index be26de872df..e6dcc3420cf 100644
--- a/sql/sql_const.h
+++ b/sql/sql_const.h
@@ -194,7 +194,11 @@
instead of reading with keys. The number says how many evaluation of the
WHERE clause is comparable to reading one extra row from a table.
*/
-#define TIME_FOR_COMPARE 5 // 5 compares == one read
+#define TIME_FOR_COMPARE 5 // 5 compares == one read
+#define TIME_FOR_COMPARE_IDX 20
+
+#define IDX_BLOCK_COPY_COST ((double) 1 / TIME_FOR_COMPARE)
+#define IDX_LOOKUP_COST ((double) 1 / 8)
/**
Number of comparisons of table rowids equivalent to reading one row from a
diff --git a/sql/sql_explain.cc b/sql/sql_explain.cc
index 23bc1e906a0..c7c6f15f7cc 100644
--- a/sql/sql_explain.cc
+++ b/sql/sql_explain.cc
@@ -380,11 +380,13 @@ int print_explain_row(select_result_sink *result,
item_list.push_back(item, mem_root);
/* 'rows' */
+ StringBuffer<64> rows_str;
if (rows)
{
+ rows_str.append_ulonglong((ulonglong)(*rows));
item_list.push_back(new (mem_root)
- Item_int(thd, *rows, MY_INT64_NUM_DECIMAL_DIGITS),
- mem_root);
+ Item_string_sys(thd, rows_str.ptr(),
+ rows_str.length()), mem_root);
}
else
item_list.push_back(item_null, mem_root);
@@ -1113,12 +1115,14 @@ void Explain_table_access::fill_key_str(String *key_str, bool is_json) const
- this is just used key length for ref/range
- for index_merge, it is a comma-separated list of lengths.
- for hash join, it is key_len:pseudo_key_len
+ - [tabular form only] rowid filter length is added after "|".
- The column looks identical in tabular and json forms. In JSON, we consider
- the column legacy, it is superceded by used_key_parts.
+ In JSON, we consider this column to be legacy; it is superseded by
+ used_key_parts.
*/
-void Explain_table_access::fill_key_len_str(String *key_len_str) const
+void Explain_table_access::fill_key_len_str(String *key_len_str,
+ bool is_json) const
{
bool is_hj= (type == JT_HASH || type == JT_HASH_NEXT ||
type == JT_HASH_RANGE || type == JT_HASH_INDEX_MERGE);
@@ -1147,13 +1151,12 @@ void Explain_table_access::fill_key_len_str(String *key_len_str) const
key_len_str->append(buf, length);
}
- if (key.get_filter_key_length() != (uint)-1)
+ if (!is_json && rowid_filter)
{
- char buf[64];
- size_t length;
- key_len_str->append(',');
- length= longlong10_to_str(key.get_filter_key_length(), buf, 10) - buf;
- key_len_str->append(buf, length);
+ key_len_str->append('|');
+ StringBuffer<64> filter_key_len;
+ rowid_filter->quick->print_key_len(&filter_key_len);
+ key_len_str->append(filter_key_len);
}
}
@@ -1240,7 +1243,18 @@ int Explain_table_access::print_explain(select_result_sink *output, uint8 explai
}
/* `type` column */
- push_str(thd, &item_list, join_type_str[type]);
+ StringBuffer<64> join_type_buf;
+ if (rowid_filter == NULL)
+ push_str(thd, &item_list, join_type_str[type]);
+ else
+ {
+ join_type_buf.append(join_type_str[type]);
+ join_type_buf.append("|filter");
+ item_list.push_back(new (mem_root)
+ Item_string_sys(thd, join_type_buf.ptr(),
+ join_type_buf.length()),
+ mem_root);
+ }
/* `possible_keys` column */
StringBuffer<64> possible_keys_buf;
@@ -1252,6 +1266,14 @@ int Explain_table_access::print_explain(select_result_sink *output, uint8 explai
/* `key` */
StringBuffer<64> key_str;
fill_key_str(&key_str, false);
+
+ if (rowid_filter)
+ {
+ key_str.append("|");
+ StringBuffer<64> rowid_key_str;
+ rowid_filter->quick->print_key(&rowid_key_str);
+ key_str.append(rowid_key_str);
+ }
if (key_str.length() > 0)
push_string(thd, &item_list, &key_str);
@@ -1260,7 +1282,7 @@ int Explain_table_access::print_explain(select_result_sink *output, uint8 explai
/* `key_len` */
StringBuffer<64> key_len_str;
- fill_key_len_str(&key_len_str);
+ fill_key_len_str(&key_len_str, false);
if (key_len_str.length() > 0)
push_string(thd, &item_list, &key_len_str);
@@ -1288,10 +1310,10 @@ int Explain_table_access::print_explain(select_result_sink *output, uint8 explai
{
rows_str.append_ulonglong((ulonglong)rows);
- if (is_filter_set())
+ if (rowid_filter)
{
rows_str.append(" (");
- rows_str.append_ulonglong(filter_perc);
+ rows_str.append_ulonglong(round(rowid_filter->selectivity * 100.0));
rows_str.append("%)");
}
item_list.push_back(new (mem_root)
@@ -1376,7 +1398,7 @@ int Explain_table_access::print_explain(select_result_sink *output, uint8 explai
extra_buf.append(STRING_WITH_LEN("Using filesort"));
}
- if (is_filter_set())
+ if (rowid_filter)
{
if (first)
first= false;
@@ -1573,6 +1595,19 @@ void add_json_keyset(Json_writer *writer, const char *elem_name,
}
+void Explain_rowid_filter::print_explain_json(Explain_query *query,
+ Json_writer *writer,
+ bool is_analyze)
+{
+ Json_writer_nesting_guard guard(writer);
+ writer->add_member("rowid_filter").start_object();
+ quick->print_json(writer);
+ writer->add_member("rows").add_ll(rows);
+ writer->add_member("selectivity_pct").add_double(selectivity * 100.0);
+ writer->end_object(); // rowid_filter
+}
+
+
void Explain_table_access::print_explain_json(Explain_query *query,
Json_writer *writer,
bool is_analyze)
@@ -1645,7 +1680,7 @@ void Explain_table_access::print_explain_json(Explain_query *query,
/* `key_length` */
StringBuffer<64> key_len_str;
- fill_key_len_str(&key_len_str);
+ fill_key_len_str(&key_len_str, true);
if (key_len_str.length())
writer->add_member("key_length").add_str(key_len_str);
@@ -1670,6 +1705,11 @@ void Explain_table_access::print_explain_json(Explain_query *query,
if (!ref_list.is_empty())
print_json_array(writer, "ref", ref_list);
+ if (rowid_filter)
+ {
+ rowid_filter->print_explain_json(query, writer, is_analyze);
+ }
+
/* r_loops (not present in tabular output) */
if (is_analyze)
{
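Based on the tabular formatting changes above, a hedged illustration (key names and numbers invented) of how the affected EXPLAIN columns could look when a rowid filter built from index ix_filt is applied to a ref access on index ix_ref:

    type:     ref|filter
    key:      ix_ref|ix_filt
    key_len:  5|4
    rows:     1000 (10%)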
diff --git a/sql/sql_explain.h b/sql/sql_explain.h
index 71f90477977..a161f6c1049 100644
--- a/sql/sql_explain.h
+++ b/sql/sql_explain.h
@@ -583,7 +583,8 @@ class Explain_index_use : public Sql_alloc
{
char *key_name;
uint key_len;
- uint key_len_for_filter;
+ char *filter_name;
+ uint filter_len;
public:
String_list key_parts_list;
@@ -596,16 +597,43 @@ public:
{
key_name= NULL;
key_len= (uint)-1;
- key_len_for_filter= (uint)-1;
+ filter_name= NULL;
+ filter_len= (uint)-1;
}
bool set(MEM_ROOT *root, KEY *key_name, uint key_len_arg);
bool set_pseudo_key(MEM_ROOT *root, const char *key_name);
- void set_filter_key_length(uint key_length_arg)
- { key_len_for_filter= key_length_arg; }
inline const char *get_key_name() const { return key_name; }
inline uint get_key_len() const { return key_len; }
- inline uint get_filter_key_length() const { return key_len_for_filter; }
+ //inline const char *get_filter_name() const { return filter_name; }
+};
+
+
+/*
+ Query Plan data structure for Rowid filter.
+*/
+class Explain_rowid_filter : public Sql_alloc
+{
+public:
+ /* Quick select used to collect the rowids into the filter */
+ Explain_quick_select *quick;
+
+ /* How many rows the above quick select is expected to return */
+ ha_rows rows;
+
+ /* Expected selectivity for the filter */
+ double selectivity;
+
+ void print_explain_json(Explain_query *query, Json_writer *writer,
+ bool is_analyze);
+
+ /*
+ TODO:
+ ANALYZE members should be added here:
+ - r_rows for the quick select
+ - An object that tracks the table access time
+ - the real selectivity of the filter.
+ */
};
@@ -675,6 +703,7 @@ public:
void print_json(Json_writer *writer, bool is_analyze);
};
+
/*
EXPLAIN data structure for a single JOIN_TAB.
*/
@@ -695,7 +724,7 @@ public:
pushed_index_cond(NULL),
sjm_nest(NULL),
pre_join_sort(NULL),
- filter_perc(UINT_MAX)
+ rowid_filter(NULL)
{}
~Explain_table_access() { delete sjm_nest; }
@@ -802,12 +831,7 @@ public:
Exec_time_tracker op_tracker;
Table_access_tracker jbuf_tracker;
- /**
- How many rows are left after the filter was applied
- to the initial rows count in percentages.
- */
- double filter_perc;
- inline bool is_filter_set() const { return (filter_perc != UINT_MAX); }
+ Explain_rowid_filter *rowid_filter;
int print_explain(select_result_sink *output, uint8 explain_flags,
bool is_analyze,
@@ -819,7 +843,7 @@ public:
private:
void append_tag_name(String *str, enum explain_extra_tag tag);
void fill_key_str(String *key_str, bool is_json) const;
- void fill_key_len_str(String *key_len_str) const;
+ void fill_key_len_str(String *key_len_str, bool is_json) const;
double get_r_filtered();
void tag_to_json(Json_writer *writer, enum explain_extra_tag tag);
};
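Correspondingly, Explain_rowid_filter::print_explain_json() (see the sql_explain.cc hunk above) nests a rowid_filter object under the table node. A hedged sample of its shape, with invented values and the quick-select members elided:

    "rowid_filter": {
      ... members written by the filter's Explain_quick_select::print_json() ...,
      "rows": 1000,
      "selectivity_pct": 10
    }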
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index 8f55497d8d0..b5b77c2c43b 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -101,6 +101,9 @@ static int sort_keyuse(KEYUSE *a,KEYUSE *b);
static bool are_tables_local(JOIN_TAB *jtab, table_map used_tables);
static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
bool allow_full_scan, table_map used_tables);
+static ha_rows get_quick_record_count(THD *thd, SQL_SELECT *select,
+ TABLE *table,
+ const key_map *keys,ha_rows limit);
void best_access_path(JOIN *join, JOIN_TAB *s,
table_map remaining_tables, uint idx,
bool disable_jbuf, double record_count,
@@ -1462,6 +1465,116 @@ int JOIN::optimize()
}
+/**
+ @brief
+ Create range filter objects needed at the execution stage for all join tables
+
+ @details
+ For each join table of the chosen execution plan that is to be joined
+ with usage of a range filter, the function creates a Rowid_filter object
+ for this range filter. In order to do this the function first constructs
+ a quick select to scan the ranges for this range filter. Then it creates
+ a container for the range filter and finally constructs a Range_rowid_filter
+ object, a pointer to which is set in the field JOIN_TAB::rowid_filter of
+ the joined table.
+
+ @retval false always
+*/
+
+bool JOIN::make_range_rowid_filters()
+{
+ DBUG_ENTER("make_range_rowid_filters");
+
+ JOIN_TAB *tab;
+
+ for (tab= first_linear_tab(this, WITH_BUSH_ROOTS, WITHOUT_CONST_TABLES);
+ tab;
+ tab= next_linear_tab(this, tab, WITH_BUSH_ROOTS))
+ {
+ if (!tab->range_rowid_filter_info)
+ continue;
+ int err;
+ SQL_SELECT *sel= NULL;
+ Rowid_filter_container *filter_container= NULL;
+ Item **sargable_cond= get_sargable_cond(this, tab->table);
+ sel= make_select(tab->table, const_table_map, const_table_map,
+ *sargable_cond, (SORT_INFO*) 0, 1, &err);
+ if (!sel)
+ continue;
+
+ key_map filter_map;
+ filter_map.clear_all();
+ filter_map.set_bit(tab->range_rowid_filter_info->key_no);
+ filter_map.merge(tab->table->with_impossible_ranges);
+ bool force_index_save= tab->table->force_index;
+ tab->table->force_index= true;
+ (void) sel->test_quick_select(thd, filter_map, (table_map) 0,
+ (ha_rows) HA_POS_ERROR,
+ true, false, true);
+ tab->table->force_index= force_index_save;
+ if (thd->is_error())
+ goto no_filter;
+ DBUG_ASSERT(sel->quick);
+ filter_container=
+ tab->range_rowid_filter_info->create_container();
+ if (filter_container)
+ {
+ tab->rowid_filter=
+ new (thd->mem_root) Range_rowid_filter(tab->table,
+ tab->range_rowid_filter_info,
+ filter_container, sel);
+ if (tab->rowid_filter)
+ continue;
+ }
+ no_filter:
+ if (sel->quick)
+ delete sel->quick;
+ delete sel;
+ }
+
+ DBUG_RETURN(0);
+}
+
+
+/**
+ @brief
+ Allocate memory for the rowid containers of the range filters used
+
+ @details
+ For each join table of the chosen execution plan that is to be joined
+ with usage of a range filter, the function allocates memory for the
+ rowid container employed by the filter. On success it lets the table engine
+ know what rowid filter will be used when accessing the table rows.
+
+ @retval false always
+*/
+
+bool
+JOIN::init_range_rowid_filters()
+{
+ DBUG_ENTER("init_range_rowid_filters");
+
+ JOIN_TAB *tab;
+
+ for (tab= first_linear_tab(this, WITH_BUSH_ROOTS, WITHOUT_CONST_TABLES);
+ tab;
+ tab= next_linear_tab(this, tab, WITH_BUSH_ROOTS))
+ {
+ if (!tab->rowid_filter)
+ continue;
+ if (tab->rowid_filter->get_container()->alloc())
+ {
+ delete tab->rowid_filter;
+ tab->rowid_filter= 0;
+ continue;
+ }
+ tab->table->file->rowid_filter_push(tab->rowid_filter);
+ tab->is_rowid_filter_built= false;
+ }
+ DBUG_RETURN(0);
+}
+
+
int JOIN::init_join_caches()
{
JOIN_TAB *tab;
@@ -1980,6 +2093,9 @@ int JOIN::optimize_stage2()
if (get_best_combination())
DBUG_RETURN(1);
+ if (make_range_rowid_filters())
+ DBUG_RETURN(1);
+
if (select_lex->handle_derived(thd->lex, DT_OPTIMIZE))
DBUG_RETURN(1);
@@ -2644,6 +2760,9 @@ int JOIN::optimize_stage2()
if (init_join_caches())
DBUG_RETURN(1);
+ if (init_range_rowid_filters())
+ DBUG_RETURN(1);
+
error= 0;
if (select_options & SELECT_DESCRIBE)
@@ -5035,9 +5154,8 @@ make_join_statistics(JOIN *join, List<TABLE_LIST> &tables_list,
select->quick=0;
impossible_range= records == 0 && s->table->reginfo.impossible_range;
if (join->thd->lex->sql_command == SQLCOM_SELECT &&
- join->table_count > 1 &&
optimizer_flag(join->thd, OPTIMIZER_SWITCH_USE_ROWID_FILTER))
- s->table->select_usable_range_filters(join->thd);
+ s->table->init_cost_info_for_usable_range_rowid_filters(join->thd);
}
if (!impossible_range)
{
@@ -6843,14 +6961,14 @@ best_access_path(JOIN *join,
double best_time= DBL_MAX;
double records= DBL_MAX;
table_map best_ref_depends_map= 0;
- Range_filter_cost_info *best_filter= 0;
+ Range_rowid_filter_cost_info *best_filter= 0;
double tmp;
ha_rows rec;
bool best_uses_jbuf= FALSE;
MY_BITMAP *eq_join_set= &s->table->eq_join_set;
KEYUSE *hj_start_key= 0;
SplM_plan_info *spl_plan= 0;
- Range_filter_cost_info *filter= 0;
+ Range_rowid_filter_cost_info *filter= 0;
disable_jbuf= disable_jbuf || idx == join->const_tables;
@@ -7243,11 +7361,18 @@ best_access_path(JOIN *join,
loose_scan_opt.check_ref_access_part2(key, start_key, records, tmp);
} /* not ft_key */
- filter= table->best_filter_for_current_join_order(start_key->key,
- records,
- record_count);
- if (filter && (filter->get_filter_gain(record_count*records) < tmp))
- tmp= tmp - filter->get_filter_gain(record_count*records);
+ if (records < DBL_MAX)
+ {
+ double rows= record_count * records;
+ filter=
+ table->best_range_rowid_filter_for_partial_join(start_key->key, rows);
+ if (filter)
+ {
+ tmp-= filter->get_adjusted_gain(rows, s->worst_seeks) -
+ filter->get_cmp_gain(rows);
+ DBUG_ASSERT(tmp >= 0);
+ }
+ }
if (tmp + 0.0001 < best_time - records/(double) TIME_FOR_COMPARE)
{
@@ -7352,6 +7477,7 @@ best_access_path(JOIN *join,
Here we estimate its cost.
*/
+ filter= 0;
if (s->quick)
{
/*
@@ -7367,6 +7493,19 @@ best_access_path(JOIN *join,
(s->quick->read_time +
(s->found_records - rnd_records)/(double) TIME_FOR_COMPARE);
+ if ( s->quick->get_type() == QUICK_SELECT_I::QS_TYPE_RANGE)
+ {
+ double rows= record_count * s->found_records;
+ uint key_no= s->quick->index;
+ filter= s->table->best_range_rowid_filter_for_partial_join(key_no,
+ rows);
+ if (filter)
+ {
+ tmp-= filter->get_gain(rows);
+ DBUG_ASSERT(tmp >= 0);
+ }
+ }
+
loose_scan_opt.check_range_access(join, idx, s->quick);
}
else
@@ -7412,21 +7551,23 @@ best_access_path(JOIN *join,
else
tmp+= s->startup_cost;
- filter= s->table->best_filter_for_current_join_order(MAX_KEY,
- rnd_records,
- record_count);
- if (filter && (filter->get_filter_gain(record_count*rnd_records) < tmp))
- tmp= tmp - filter->get_filter_gain(record_count*rnd_records);
-
/*
We estimate the cost of evaluating WHERE clause for found records
as record_count * rnd_records / TIME_FOR_COMPARE. This cost plus
tmp give us total cost of using TABLE SCAN
*/
+
+ double filter_cmp_gain= 0;
+ if (filter)
+ {
+ filter_cmp_gain= filter->get_cmp_gain(record_count * s->found_records);
+ }
+
if (best == DBL_MAX ||
(tmp + record_count/(double) TIME_FOR_COMPARE*rnd_records <
(best_key->is_for_hash_join() ? best_time :
- best + record_count/(double) TIME_FOR_COMPARE*records)))
+ best + record_count/(double) TIME_FOR_COMPARE*records -
+ filter_cmp_gain)))
{
/*
If the table has a range (s->quick is set) make_join_select()
@@ -7435,7 +7576,9 @@ best_access_path(JOIN *join,
best= tmp;
records= best_records;
best_key= 0;
- best_filter= filter;
+ best_filter= 0;
+ if (s->quick && s->quick->get_type() == QUICK_SELECT_I::QS_TYPE_RANGE)
+ best_filter= filter;
/* range/index_merge/ALL/index access method are "independent", so: */
best_ref_depends_map= 0;
best_uses_jbuf= MY_TEST(!disable_jbuf && !((s->table->map &
@@ -7453,7 +7596,7 @@ best_access_path(JOIN *join,
pos->loosescan_picker.loosescan_key= MAX_KEY;
pos->use_join_buffer= best_uses_jbuf;
pos->spl_plan= spl_plan;
- pos->filter= best_filter;
+ pos->range_rowid_filter_info= best_filter;
loose_scan_opt.save_to_position(s, loose_scan_pos);
@@ -9719,7 +9862,7 @@ bool JOIN::get_best_combination()
is_hash_join_key_no(j->ref.key))
hash_join= TRUE;
- j->filter= best_positions[tablenr].filter;
+ j->range_rowid_filter_info= best_positions[tablenr].range_rowid_filter_info;
loop_end:
/*
@@ -10782,6 +10925,11 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
sel->quick=tab->quick; // Use value from get_quick_...
sel->quick_keys.clear_all();
sel->needed_reg.clear_all();
+ if (is_hj && tab->rowid_filter)
+ {
+ delete tab->rowid_filter;
+ tab->rowid_filter= 0;
+ }
}
else
{
@@ -12455,6 +12603,23 @@ bool error_if_full_join(JOIN *join)
}
+void JOIN_TAB::build_range_rowid_filter_if_needed()
+{
+ if (rowid_filter && !is_rowid_filter_built)
+ {
+ if (!rowid_filter->build())
+ {
+ is_rowid_filter_built= true;
+ }
+ else
+ {
+ delete rowid_filter;
+ rowid_filter= 0;
+ }
+ }
+}
+
+
/**
cleanup JOIN_TAB.
@@ -12478,6 +12643,11 @@ void JOIN_TAB::cleanup()
select= 0;
delete quick;
quick= 0;
+ if (rowid_filter)
+ {
+ delete rowid_filter;
+ rowid_filter= 0;
+ }
if (cache)
{
cache->free();
@@ -12589,9 +12759,7 @@ ha_rows JOIN_TAB::get_examined_rows()
double examined_rows;
SQL_SELECT *sel= filesort? filesort->select : this->select;
- if (filter)
- examined_rows= records_read;
- else if (sel && sel->quick && use_quick != 2)
+ if (sel && sel->quick && use_quick != 2)
examined_rows= (double)sel->quick->records;
else if (type == JT_NEXT || type == JT_ALL ||
type == JT_HASH || type ==JT_HASH_NEXT)
@@ -19361,6 +19529,8 @@ sub_select(JOIN *join,JOIN_TAB *join_tab,bool end_of_records)
if (!join_tab->preread_init_done && join_tab->preread_init())
DBUG_RETURN(NESTED_LOOP_ERROR);
+ join_tab->build_range_rowid_filter_if_needed();
+
join->return_tab= join_tab;
if (join_tab->last_inner)
@@ -20305,6 +20475,8 @@ int join_init_read_record(JOIN_TAB *tab)
if (tab->filesort && tab->sort_table()) // Sort table.
return 1;
+ tab->build_range_rowid_filter_if_needed();
+
DBUG_EXECUTE_IF("kill_join_init_read_record",
tab->join->thd->set_killed(KILL_QUERY););
if (tab->select && tab->select->quick && tab->select->quick->reset())
@@ -20319,6 +20491,8 @@ int join_init_read_record(JOIN_TAB *tab)
tab->join->thd->reset_killed(););
if (!tab->preread_init_done && tab->preread_init())
return 1;
+
+
if (init_read_record(&tab->read_record, tab->join->thd, tab->table,
tab->select, tab->filesort_result, 1,1, FALSE))
return 1;
@@ -22352,6 +22526,12 @@ check_reverse_order:
tab->use_quick=1;
tab->ref.key= -1;
tab->ref.key_parts=0; // Don't use ref key.
+ tab->range_rowid_filter_info= 0;
+ if (tab->rowid_filter)
+ {
+ delete tab->rowid_filter;
+ tab->rowid_filter= 0;
+ }
tab->read_first_record= join_init_read_record;
if (tab->is_using_loose_index_scan())
tab->join->tmp_table_param.precomputed_group_by= TRUE;
@@ -24977,11 +25157,13 @@ int print_explain_message_line(select_result_sink *result,
item_list.push_back(item_null, mem_root);
/* `rows` */
+ StringBuffer<64> rows_str;
if (rows)
{
- item_list.push_back(new (mem_root) Item_int(thd, *rows,
- MY_INT64_NUM_DECIMAL_DIGITS),
- mem_root);
+ rows_str.append_ulonglong((ulonglong)(*rows));
+ item_list.push_back(new (mem_root)
+ Item_string_sys(thd, rows_str.ptr(),
+ rows_str.length()), mem_root);
}
else
item_list.push_back(item_null, mem_root);
@@ -25046,31 +25228,6 @@ int append_possible_keys(MEM_ROOT *alloc, String_list &list, TABLE *table,
}
-/**
- This method saves the data that should be printed in EXPLAIN
- if any filter was used for this table.
-*/
-
-bool JOIN_TAB::save_filter_explain_data(Explain_table_access *eta)
-{
- if (!filter)
- return 0;
- KEY *pk_key= get_keyinfo_by_key_no(filter->key_no);
- StringBuffer<64> buff_for_pk;
- const char *tmp_buff;
- buff_for_pk.append("filter:");
- tmp_buff= pk_key->name.str;
- buff_for_pk.append(tmp_buff, strlen(tmp_buff), system_charset_info);
- if (!(eta->ref_list.append_str(join->thd->mem_root,
- buff_for_pk.c_ptr_safe())))
- return 1;
- eta->key.set_filter_key_length(pk_key->key_length);
- (filter->selectivity*100 >= 1) ? eta->filter_perc= round(filter->selectivity*100) :
- eta->filter_perc= 1;
- return 0;
-}
-
-
bool JOIN_TAB::save_explain_data(Explain_table_access *eta,
table_map prefix_tables,
bool distinct_arg, JOIN_TAB *first_top_tab)
@@ -25106,7 +25263,7 @@ bool JOIN_TAB::save_explain_data(Explain_table_access *eta,
filesort)))
return 1;
}
-
+ // psergey-todo: data for filtering!
tracker= &eta->tracker;
jbuf_tracker= &eta->jbuf_tracker;
@@ -25207,6 +25364,20 @@ bool JOIN_TAB::save_explain_data(Explain_table_access *eta,
// psergey-todo: ^ check for error return code
/* Build "key", "key_len", and "ref" */
+
+ if (rowid_filter)
+ {
+ Range_rowid_filter *range_filter= (Range_rowid_filter *) rowid_filter;
+ QUICK_SELECT_I *quick= range_filter->get_select()->quick;
+
+ Explain_rowid_filter *erf= new (thd->mem_root) Explain_rowid_filter;
+ erf->quick= quick->get_explain(thd->mem_root);
+ erf->selectivity= range_rowid_filter_info->selectivity;
+ erf->rows= quick->records;
+ eta->rowid_filter= erf;
+ //psergey-todo: also do setup for ANALYZE here.
+ }
+
if (tab_type == JT_NEXT)
{
key_info= table->key_info+index;
@@ -25326,13 +25497,6 @@ bool JOIN_TAB::save_explain_data(Explain_table_access *eta,
eta->filtered= f;
}
- if ((tab_select && tab_select->quick && tab_type != JT_CONST) ||
- (key_info && ref.key_parts && tab_type != JT_FT))
- {
- if (save_filter_explain_data(eta))
- return 1;
- }
-
/* Build "Extra" field and save it */
key_read= table->file->keyread_enabled();
if ((tab_type == JT_NEXT || tab_type == JT_CONST) &&
diff --git a/sql/sql_select.h b/sql/sql_select.h
index 1d08b746279..3de0e894c93 100644
--- a/sql/sql_select.h
+++ b/sql/sql_select.h
@@ -510,9 +510,18 @@ typedef struct st_join_table {
uint n_sj_tables;
bool preread_init_done;
- /* Copy of POSITION::filter, set by get_best_combination() */
- Range_filter_cost_info *filter;
- Dynamic_array<char*> rowid_filter_pk;
+
+ /*
+ Cost info to the range filter used when joining this join table
+ (Defined when the best join order has been already chosen)
+ */
+ Range_rowid_filter_cost_info *range_rowid_filter_info;
+ /* Rowid filter to be used when joining this join table */
+ Rowid_filter *rowid_filter;
+ /* Becomes true just after the used range filter has been built / filled */
+ bool is_rowid_filter_built;
+
+ void build_range_rowid_filter_if_needed();
void cleanup();
inline bool is_using_loose_index_scan()
@@ -663,7 +672,6 @@ typedef struct st_join_table {
SplM_plan_info *choose_best_splitting(double record_count,
table_map remaining_tables);
bool fix_splitting(SplM_plan_info *spl_plan, table_map remaining_tables);
- bool save_filter_explain_data(Explain_table_access *eta);
} JOIN_TAB;
@@ -889,7 +897,8 @@ public:
};
-class Range_filter_cost_info;
+class Range_rowid_filter_cost_info;
+class Rowid_filter;
/**
@@ -974,8 +983,10 @@ typedef struct st_position
/* Info on splitting plan used at this position */
SplM_plan_info *spl_plan;
- /* The index for which filter can be built */
- Range_filter_cost_info *filter;
+
+ /* Cost info for the range filter used at this position */
+ Range_rowid_filter_cost_info *range_rowid_filter_info;
+
} POSITION;
typedef Bounds_checked_array<Item_null_result*> Item_null_array;
@@ -1625,6 +1636,8 @@ public:
bool optimize_unflattened_subqueries();
bool optimize_constant_subqueries();
int init_join_caches();
+ bool make_range_rowid_filters();
+ bool init_range_rowid_filters();
bool make_sum_func_list(List<Item> &all_fields, List<Item> &send_fields,
bool before_group_by, bool recompute= FALSE);
diff --git a/sql/structs.h b/sql/structs.h
index 9ff52bccb40..12b5dee0055 100644
--- a/sql/structs.h
+++ b/sql/structs.h
@@ -27,6 +27,15 @@
#include "thr_lock.h" /* thr_lock_type */
#include "my_base.h" /* ha_rows, ha_key_alg */
#include <mysql_com.h> /* USERNAME_LENGTH */
+#include "sql_bitmap.h"
+
+#if MAX_INDEXES <= 64
+typedef Bitmap<64> key_map; /* Used for finding keys */
+#elif MAX_INDEXES > 128
+#error "MAX_INDEXES values greater than 128 are not supported."
+#else
+typedef Bitmap<((MAX_INDEXES+7)/8*8)> key_map; /* Used for finding keys */
+#endif
struct TABLE;
class Type_handler;
@@ -111,6 +120,11 @@ typedef struct st_key {
*/
LEX_CSTRING name;
key_part_map ext_key_part_map;
+ /*
+ Bitmap of indexes having common parts with this index
+ (only key parts from key definitions are taken into account)
+ */
+ key_map overlapped;
uint block_size;
enum ha_key_alg algorithm;
/*
diff --git a/sql/table.cc b/sql/table.cc
index 18be06c0b95..3df4c9f543a 100644
--- a/sql/table.cc
+++ b/sql/table.cc
@@ -1227,6 +1227,44 @@ static const Type_handler *old_frm_type_handler(uint pack_flag,
return Type_handler::get_handler_by_real_type(field_type);
}
+/* Set overlapped bitmaps for each index */
+
+void TABLE_SHARE::set_overlapped_keys()
+{
+ KEY *key1= key_info;
+ for (uint i= 0; i < keys; i++, key1++)
+ {
+ key1->overlapped.clear_all();
+ key1->overlapped.set_bit(i);
+ }
+ key1= key_info;
+ for (uint i= 0; i < keys; i++, key1++)
+ {
+ KEY *key2= key1 + 1;
+ for (uint j= i+1; j < keys; j++, key2++)
+ {
+ KEY_PART_INFO *key_part1= key1->key_part;
+ uint n1= key1->user_defined_key_parts;
+ uint n2= key2->user_defined_key_parts;
+ for (uint k= 0; k < n1; k++, key_part1++)
+ {
+ KEY_PART_INFO *key_part2= key2->key_part;
+ for (uint l= 0; l < n2; l++, key_part2++)
+ {
+ if (key_part1->fieldnr == key_part2->fieldnr)
+ {
+ key1->overlapped.set_bit(j);
+ key2->overlapped.set_bit(i);
+ goto end_checking_overlap;
+ }
+ }
+ }
+ end_checking_overlap:
+ ;
+ }
+ }
+}
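A small worked example of TABLE_SHARE::set_overlapped_keys() above, for a hypothetical table with keys k0(a,b), k1(b,c) and k2(d): the first loop gives every key its own bit, and since k0 and k1 share column b the nested loops add each to the other's bitmap, so k0->overlapped= {k0,k1}, k1->overlapped= {k0,k1} and k2->overlapped= {k2}.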
+
/**
Read data from a binary .frm file image into a TABLE_SHARE
@@ -2522,6 +2560,8 @@ int TABLE_SHARE::init_from_binary_frm_image(THD *thd, bool write,
null_length, 255);
}
+ set_overlapped_keys();
+
/* Handle virtual expressions */
if (vcol_screen_length && share->frm_version >= FRM_VER_EXPRESSSIONS)
{
@@ -4656,8 +4696,9 @@ void TABLE::init(THD *thd, TABLE_LIST *tl)
created= TRUE;
cond_selectivity= 1.0;
cond_selectivity_sampling_explain= NULL;
- best_filter_count= 0;
- range_filter_cost_info_elements= 0;
+ range_rowid_filter_cost_info_elems= 0;
+ range_rowid_filter_cost_info_ptr= NULL;
+ range_rowid_filter_cost_info= NULL;
#ifdef HAVE_REPLICATION
/* used in RBR Triggers */
master_had_triggers= 0;
diff --git a/sql/table.h b/sql/table.h
index 3c782c34bc3..1b8b837c35b 100644
--- a/sql/table.h
+++ b/sql/table.h
@@ -55,7 +55,7 @@ class Virtual_column_info;
class Table_triggers_list;
class TMP_TABLE_PARAM;
class SEQUENCE;
-class Range_filter_cost_info;
+class Range_rowid_filter_cost_info;
/*
Used to identify NESTED_JOIN structures within a join (applicable only to
@@ -1002,6 +1002,8 @@ struct TABLE_SHARE
/* frees the memory allocated in read_frm_image */
void free_frm_image(const uchar *frm);
+
+ void set_overlapped_keys();
};
@@ -1193,7 +1195,14 @@ public:
and max #key parts that range access would use.
*/
ha_rows quick_rows[MAX_KEY];
+ uint quick_key_parts[MAX_KEY];
+
double quick_costs[MAX_KEY];
+ /*
+ If there is a range access by the i-th index then the cost of
+ index-only access for it is stored in quick_index_only_costs[i]
+ */
+ double quick_index_only_costs[MAX_KEY];
/*
Bitmaps of key parts that =const for the duration of join execution. If
@@ -1202,10 +1211,7 @@ public:
*/
key_part_map const_key_parts[MAX_KEY];
- uint quick_key_parts[MAX_KEY];
uint quick_n_ranges[MAX_KEY];
- /* For each key I/O access cost is stored */
- double quick_key_io[MAX_KEY];
/*
Estimate of number of records that satisfy SARGable part of the table
@@ -1497,21 +1503,21 @@ public:
double get_materialization_cost(); // Now used only if is_splittable()==true
void add_splitting_info_for_key_field(struct KEY_FIELD *key_field);
+ key_map with_impossible_ranges;
+
+ /* Number of cost info elements for possible range filters */
+ uint range_rowid_filter_cost_info_elems;
+ /* Pointer to the array of cost info elements for range filters */
+ Range_rowid_filter_cost_info *range_rowid_filter_cost_info;
+ /* The array of pointers to cost info elements for range filters */
+ Range_rowid_filter_cost_info **range_rowid_filter_cost_info_ptr;
+
+ void init_cost_info_for_usable_range_rowid_filters(THD *thd);
+ void prune_range_rowid_filters();
+ Range_rowid_filter_cost_info *
+ best_range_rowid_filter_for_partial_join(uint access_key_no,
+ double records);
- /**
- Range filter info
- */
- /* Minimum possible #T value to apply filter*/
- uint best_filter_count;
- uint range_filter_cost_info_elements;
- Range_filter_cost_info *range_filter_cost_info;
- Range_filter_cost_info
- *best_filter_for_current_join_order(uint ref_key_no,
- double record_count,
- double records);
- void sort_range_filter_cost_info_array();
- void prune_range_filters();
- void select_usable_range_filters(THD *thd);
/**
System Versioning support
*/