MDEV-16188 Use in-memory PK filters built from range index scans

This patch contains a full implementation of the optimization that allows to use in-memory rowid / primary filters built for range conditions over indexes. In many cases usage of such filters reduce the number of disk seeks spent for fetching table rows. In this implementation the choice of what possible filter to be applied (if any) is made purely on cost-based considerations. This implementation re-achitectured the partial implementation of the feature pushed by Galina Shalygina in the commit 8d5a11122c32f4d9eb87536886c6e893377bdd07. Besides this patch contains a better implementation of the generic handler function handler::multi_range_read_info_const() that takes into account gaps between ranges when calculating the cost of range index scans. It also contains some corrections of the implementation of the handler function records_in_range() for MyISAM. This patch supports the feature for InnoDB and MyISAM.
author: Igor Babaev <igor@askmonty.org> 2019-02-03 14:56:12 -0800
committer: Igor Babaev <igor@askmonty.org> 2019-02-03 14:56:12 -0800
commit: 658128af43b4d7c6db445164f8ed25ed4d1e3109 (patch)
tree: 7a71580cca55759b8bb2730e117436478948d77f /sql/multi_range_read.cc
parent: 5f46670bd09babbee75a24ac82eb4ade0706da66 (diff)
download: mariadb-git-658128af43b4d7c6db445164f8ed25ed4d1e3109.tar.gz
1 files changed, 123 insertions, 18 deletions
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index d6952e71899..cab278f5859 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -20,6 +20,12 @@
 #include "key.h"
 #include "sql_statistics.h"
 
+static ulonglong key_block_no(handler *h, uint keyno, uint keyentry_pos)
+{
+  return (ulonglong) (h->keyread_time(keyno, 1, keyentry_pos + 1) -
+	              h->keyread_time(keyno, 0, keyentry_pos + 1) + 0.5) + 1;
+}
+
 /****************************************************************************
  * Default MRR implementation (MRR to non-MRR converter)
  ***************************************************************************/
@@ -47,6 +53,24 @@
     for a user to be able to interrupt the calculation by killing the
     connection/query.
 
+  @note
+    Starting from 10.4 the implementation of this method tries to take into
+    account gaps between range intervals. Before this we had such paradoxical
+    cases when, for example, the cost of the index scan by range [1..3] was
+    almost twice as less than the cost of of the index scan by two intervals
+    [1..1] and [3..3].
+
+  @note
+    The current implementation of the method is not efficient for it
+    requires extra dives for gaps. Although these dives are not expensive
+    as they touch the index nodes that with very high probability are in
+    cache this is still not good. We could avoid it if records in range
+    also returned positions of the ends of range intervals. It's not
+    hard to implement it now for MyISAM as this engine provides a function
+    returning an approximation of the relative position of a key tuple
+    among other index key tuples. Unfortunately InnoDB now does not provide
+    anything like this function.
+
   @retval
     HA_POS_ERROR  Error or the engine is unable to perform the requested
                   scan. Values of OUT parameters are undefined.
@@ -61,12 +85,21 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                      uint *bufsz, uint *flags, Cost_estimate *cost)
 {
   KEY_MULTI_RANGE range;
+  key_range prev_start_key;
   range_seq_t seq_it;
-  ha_rows rows, total_rows= 0;
+  ha_rows min_pos= 0;
+  ha_rows total_rows= 0;
   uint n_ranges=0;
+  uint n_eq_ranges= 0;
+  ulonglong total_touched_blocks= 0;
+  key_range *prev_min_endp= 0;
+  ulonglong prev_max_block_no=0;
+  ha_rows max_rows= stats.records;
   THD *thd= table->in_use;
-  uint limit= thd->variables.eq_range_index_dive_limit;
+  StringBuffer<64> key_value;
   
+  uint limit= thd->variables.eq_range_index_dive_limit;
+
   bool use_statistics_for_eq_range= eq_ranges_exceeds_limit(seq,
                                                             seq_init_param,
                                                             limit);
@@ -77,10 +110,15 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
   seq_it= seq->init(seq_init_param, n_ranges, *flags);
   while (!seq->next(seq_it, &range))
   {
+    ha_rows rows;
+    ulonglong new_touched_blocks= 0;
+
     if (unlikely(thd->killed != 0))
       return HA_POS_ERROR;
     
     n_ranges++;
+    if (range.range_flag & EQ_RANGE)
+      n_eq_ranges++;
     key_range *min_endp, *max_endp;
     if (range.range_flag & GEOM_FLAG)
     {
@@ -95,38 +133,93 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
       max_endp= range.end_key.length? &range.end_key : NULL;
     }
     int keyparts_used= my_count_bits(range.start_key.keypart_map);
-    if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE))
-      rows= 1; /* there can be at most one row */
-    else if (use_statistics_for_eq_range &&
-             !(range.range_flag & NULL_RANGE) &&
-             (range.range_flag & EQ_RANGE) &&
-             table->key_info[keyno].actual_rec_per_key(keyparts_used - 1) > 0.5)
-      rows=
-        (ha_rows) table->key_info[keyno].actual_rec_per_key(keyparts_used - 1);
+    if (use_statistics_for_eq_range &&
+        !(range.range_flag & NULL_RANGE) &&
+        (range.range_flag & EQ_RANGE) &&
+        table->key_info[keyno].actual_rec_per_key(keyparts_used - 1) > 0.5)
+    {
+      if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE))
+        rows= 1; /* there can be at most one row */
+      else
+        rows=
+          (ha_rows) table->key_info[keyno].actual_rec_per_key(keyparts_used-1);
+    }
     else
     {
-      if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp, 
+      ulonglong min_block_no;
+      ulonglong max_block_no;
+      if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE))
+        rows= 1; /* there can be at most one row */
+      else if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp,
                                                         max_endp)))
       {
         /* Can't scan one range => can't do MRR scan at all */
         total_rows= HA_POS_ERROR;
         break;
       }
+      if (!max_endp && !(prev_min_endp && prev_min_endp->length))
+        min_pos+= max_rows - rows;
+      else
+      {
+        key_range *start_endp= prev_min_endp;
+        if (start_endp && !start_endp->keypart_map)
+          start_endp= 0;
+        /*
+           Get the estimate of rows in the previous gap
+           and two ranges surrounding this gap
+        */
+        ha_rows r= this->records_in_range(keyno,start_endp,max_endp);
+        if (r == HA_POS_ERROR)
+	{
+          /* Some engine cannot estimate such ranges */
+          total_rows += rows;
+          continue;
+        }
+        min_pos+= r - rows;
+      }
+      min_block_no= key_block_no(this, keyno, min_pos);
+      max_block_no= key_block_no(this, keyno, min_pos + rows);
+      new_touched_blocks= max_block_no - min_block_no +
+	                  MY_TEST(min_block_no != prev_max_block_no);
+      prev_max_block_no= max_block_no;
+      if (!prev_min_endp)
+        prev_min_endp= &prev_start_key;
+      /* Save range.start_key for the next iteration step */
+      prev_start_key= range.start_key;
+      key_value.copy((const char *) prev_start_key.key, prev_start_key.length,
+                     key_value.charset());
+      prev_start_key.key= (const uchar *) key_value.ptr();
     }
     total_rows += rows;
+    total_touched_blocks+= new_touched_blocks;
   }
   
   if (total_rows != HA_POS_ERROR)
   {
+    set_if_smaller(total_rows, max_rows);
     /* The following calculation is the same as in multi_range_read_info(): */
     *flags |= HA_MRR_USE_DEFAULT_IMPL;
     cost->reset();
     cost->avg_io_cost= 1; /* assume random seeks */
-    if ((*flags & HA_MRR_INDEX_ONLY) && total_rows > 2)
-      cost->io_count= keyread_time(keyno, n_ranges, (uint)total_rows);
+    cost->idx_avg_io_cost= 1;
+    if (!(keyno == table->s->primary_key && primary_key_is_clustered()))
+    {
+      cost->idx_io_count= total_touched_blocks +
+	                  keyread_time(keyno, 0, total_rows);
+      cost->cpu_cost= cost->idx_cpu_cost=
+        (double) total_rows / TIME_FOR_COMPARE_IDX +
+        (2 * n_ranges - n_eq_ranges) * IDX_LOOKUP_COST;
+      if (!(*flags & HA_MRR_INDEX_ONLY))
+      {
+        cost->io_count= read_time(keyno, 0, total_rows);
+        cost->cpu_cost+= (double) total_rows / TIME_FOR_COMPARE;
+      }
+    }
     else
-      cost->io_count= read_time(keyno, n_ranges, total_rows);
-    cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01;
+    {
+      cost->io_count= read_time(keyno, total_touched_blocks, (uint) total_rows);
+      cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01;
+    }
   }
   return total_rows;
 }
@@ -183,10 +276,22 @@ ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
   cost->avg_io_cost= 1; /* assume random seeks */
 
   /* Produce the same cost as non-MRR code does */
-  if (*flags & HA_MRR_INDEX_ONLY)
-    cost->io_count= keyread_time(keyno, n_ranges, n_rows);
+  if (!(keyno == table->s->primary_key && primary_key_is_clustered()))
+  {
+    cost->idx_io_count=  n_ranges + keyread_time(keyno, 0, n_rows);
+    cost->cpu_cost= cost->idx_cpu_cost=
+      (double) n_rows / TIME_FOR_COMPARE_IDX + n_ranges * IDX_LOOKUP_COST;
+    if (!(*flags & HA_MRR_INDEX_ONLY))
+    {
+      cost->io_count= read_time(keyno, 0, n_rows);
+      cost->cpu_cost+= (double) n_rows / TIME_FOR_COMPARE;
+    }
+  }
   else
-    cost->io_count= read_time(keyno, n_ranges, n_rows);
+  {
+    cost->io_count= read_time(keyno, n_ranges, (uint)n_rows);
+    cost->cpu_cost= (double) n_rows / TIME_FOR_COMPARE + 0.01;
+  }
   return 0;
 }
author	Igor Babaev <igor@askmonty.org>	2019-02-03 14:56:12 -0800
committer	Igor Babaev <igor@askmonty.org>	2019-02-03 14:56:12 -0800
commit	658128af43b4d7c6db445164f8ed25ed4d1e3109 (patch)
tree	7a71580cca55759b8bb2730e117436478948d77f /sql/multi_range_read.cc
parent	5f46670bd09babbee75a24ac82eb4ade0706da66 (diff)
download	mariadb-git-658128af43b4d7c6db445164f8ed25ed4d1e3109.tar.gz