16 files changed, 2738 insertions, 656 deletions
diff --git a/sql/Makefile.am b/sql/Makefile.am
index 00342f7034e..b2972072866 100644
--- a/sql/Makefile.am
+++ b/sql/Makefile.am
@@ -47,7 +47,7 @@ mysqld_LDADD =		libndb.la \
 			$(LDADD)  $(CXXLDFLAGS) $(WRAPLIBS) @LIBDL@ \
 			$(yassl_libs) $(openssl_libs) @MYSQLD_EXTRA_LIBS@
 
-noinst_HEADERS =	item.h item_func.h item_sum.h item_cmpfunc.h \
+noinst_HEADERS =	ds_mrr.h item.h item_func.h item_sum.h item_cmpfunc.h \
 			item_strfunc.h item_timefunc.h \
 			item_xmlfunc.h \
 			item_create.h item_subselect.h item_row.h \
@@ -79,7 +79,7 @@ noinst_HEADERS =	item.h item_func.h item_sum.h item_cmpfunc.h \
 			sql_partition.h partition_info.h partition_element.h \
 			contributors.h sql_servers.h
 
-mysqld_SOURCES =	sql_lex.cc sql_handler.cc sql_partition.cc \
+mysqld_SOURCES =	ds_mrr.cc sql_lex.cc sql_handler.cc sql_partition.cc \
 			item.cc item_sum.cc item_buff.cc item_func.cc \
 			item_cmpfunc.cc item_strfunc.cc item_timefunc.cc \
 			thr_malloc.cc item_create.cc item_subselect.cc \
diff --git a/sql/ds_mrr.cc b/sql/ds_mrr.cc
new file mode 100644
index 00000000000..7a0fc65b8f5
--- /dev/null
+++ b/sql/ds_mrr.cc
@@ -0,0 +1,1333 @@
+#include "mysql_priv.h"
+#include "sql_select.h"
+
+/* **************************************************************************
+ * DS-MRR implementation 
+ ***************************************************************************/
+
+/**
+  DS-MRR: Initialize and start MRR scan
+
+  Initialize and start the MRR scan. Depending on the mode parameter, this
+  may use default or DS-MRR implementation.
+
+  @param h               Table handler to be used
+  @param key             Index to be used
+  @param seq_funcs       Interval sequence enumeration functions
+  @param seq_init_param  Interval sequence enumeration parameter
+  @param n_ranges        Number of ranges in the sequence.
+  @param mode            HA_MRR_* modes to use
+  @param buf             INOUT Buffer to use
+
+  @retval 0     Ok, Scan started.
+  @retval other Error
+*/
+
+int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+                           void *seq_init_param, uint n_ranges, uint mode,
+                           HANDLER_BUFFER *buf)
+{
+  uint elem_size;
+  Item *pushed_cond= NULL;
+  handler *new_h2= 0;
+  DBUG_ENTER("DsMrr_impl::dsmrr_init");
+
+  /*
+    index_merge may invoke a scan on an object for which dsmrr_info[_const]
+    has not been called, so set the owner handler here as well.
+  */
+  h= h_arg;
+  if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
+  {
+    use_default_impl= TRUE;
+    const int retval=
+      h->handler::multi_range_read_init(seq_funcs, seq_init_param,
+                                        n_ranges, mode, buf);
+    DBUG_RETURN(retval);
+  }
+  rowids_buf= buf->buffer;
+
+  is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
+
+  if (is_mrr_assoc)
+    status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
+ 
+  rowids_buf_end= buf->buffer_end;
+  elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
+  rowids_buf_last= rowids_buf + 
+                      ((rowids_buf_end - rowids_buf)/ elem_size)*
+                      elem_size;
+  rowids_buf_end= rowids_buf_last;
+
+    /*
+    There can be two cases:
+    - This is the first call since index_init(), h2==NULL
+       Need to setup h2 then.
+    - This is not the first call, h2 is initalized and set up appropriately.
+       The caller might have called h->index_init(), need to switch h to
+       rnd_pos calls.
+  */
+  if (!h2)
+  {
+    /* Create a separate handler object to do rndpos() calls. */
+    THD *thd= current_thd;
+    /*
+      ::clone() takes up a lot of stack, especially on 64 bit platforms.
+      The constant 5 is an empiric result.
+    */
+    if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2))
+      DBUG_RETURN(1);
+    DBUG_ASSERT(h->active_index != MAX_KEY);
+    uint mrr_keyno= h->active_index;
+
+    /* Create a separate handler object to do rndpos() calls. */
+    if (!(new_h2= h->clone(thd->mem_root)) || 
+        new_h2->ha_external_lock(thd, F_RDLCK))
+    {
+      delete new_h2;
+      DBUG_RETURN(1);
+    }
+
+    if (mrr_keyno == h->pushed_idx_cond_keyno)
+      pushed_cond= h->pushed_idx_cond;
+
+    /*
+      Caution: this call will invoke this->dsmrr_close(). Do not put the
+      created secondary table handler into this->h2 or it will delete it.
+    */
+    if (h->ha_index_end())
+    {
+      h2=new_h2;
+      goto error;
+    }
+
+    h2= new_h2; /* Ok, now can put it into h2 */
+    table->prepare_for_position();
+    h2->extra(HA_EXTRA_KEYREAD);
+  
+    if (h2->ha_index_init(mrr_keyno, FALSE))
+      goto error;
+
+    use_default_impl= FALSE;
+    if (pushed_cond)
+      h2->idx_cond_push(mrr_keyno, pushed_cond);
+  }
+  else
+  {
+    /* 
+      We get here when the access alternates betwen MRR scan(s) and non-MRR
+      scans.
+
+      Calling h->index_end() will invoke dsmrr_close() for this object,
+      which will delete h2. We need to keep it, so save put it away and dont
+      let it be deleted:
+    */
+    handler *save_h2= h2;
+    h2= NULL;
+    int res= (h->inited == handler::INDEX && h->ha_index_end());
+    h2= save_h2;
+    use_default_impl= FALSE;
+    if (res)
+      goto error;
+  }
+
+  if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
+                                          mode, buf) || 
+      dsmrr_fill_buffer())
+  {
+    goto error;
+  }
+  /*
+    If the above call has scanned through all intervals in *seq, then
+    adjust *buf to indicate that the remaining buffer space will not be used.
+  */
+  if (dsmrr_eof) 
+    buf->end_of_used_area= rowids_buf_last;
+
+  /*
+     h->inited == INDEX may occur when 'range checked for each record' is
+     used.
+  */
+  if ((h->inited != handler::RND) && 
+      ((h->inited==handler::INDEX? h->ha_index_end(): FALSE) || 
+       (h->ha_rnd_init(FALSE))))
+      goto error;
+
+  use_default_impl= FALSE;
+  h->mrr_funcs= *seq_funcs;
+  
+  DBUG_RETURN(0);
+error:
+  h2->ha_index_or_rnd_end();
+  h2->ha_external_lock(current_thd, F_UNLCK);
+  h2->close();
+  delete h2;
+  h2= NULL;
+  DBUG_RETURN(1);
+}
+
+
+void DsMrr_impl::dsmrr_close()
+{
+  DBUG_ENTER("DsMrr_impl::dsmrr_close");
+  if (h2)
+  {
+    h2->ha_index_or_rnd_end();
+    h2->ha_external_lock(current_thd, F_UNLCK);
+    h2->close();
+    delete h2;
+    h2= NULL;
+  }
+  use_default_impl= TRUE;
+  DBUG_VOID_RETURN;
+}
+
+
+static int rowid_cmp(void *h, uchar *a, uchar *b)
+{
+  return ((handler*)h)->cmp_ref(a, b);
+}
+
+
+/**
+  DS-MRR: Fill the buffer with rowids and sort it by rowid
+
+  {This is an internal function of DiskSweep MRR implementation}
+  Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into 
+  buffer. When the buffer is full or scan is completed, sort the buffer by 
+  rowid and return.
+  
+  The function assumes that rowids buffer is empty when it is invoked. 
+  
+  @param h  Table handler
+
+  @retval 0      OK, the next portion of rowids is in the buffer,
+                 properly ordered
+  @retval other  Error
+*/
+
+int DsMrr_impl::dsmrr_fill_buffer()
+{
+  char *range_info;
+  int res;
+  DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer");
+
+  rowids_buf_cur= rowids_buf;
+  while ((rowids_buf_cur < rowids_buf_end) && 
+         !(res= h2->handler::multi_range_read_next(&range_info)))
+  {
+    KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range;
+    if (h2->mrr_funcs.skip_index_tuple &&
+        h2->mrr_funcs.skip_index_tuple(h2->mrr_iter, curr_range->ptr))
+      continue;
+    
+    /* Put rowid, or {rowid, range_id} pair into the buffer */
+    h2->position(table->record[0]);
+    memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
+    rowids_buf_cur += h2->ref_length;
+
+    if (is_mrr_assoc)
+    {
+      memcpy(rowids_buf_cur, &range_info, sizeof(void*));
+      rowids_buf_cur += sizeof(void*);
+    }
+  }
+
+  if (res && res != HA_ERR_END_OF_FILE)
+    DBUG_RETURN(res); 
+  dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
+
+  /* Sort the buffer contents by rowid */
+  uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
+  uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
+  
+  my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
+            (void*)h);
+  rowids_buf_last= rowids_buf_cur;
+  rowids_buf_cur=  rowids_buf;
+  DBUG_RETURN(0);
+}
+
+
+/**
+  DS-MRR implementation: multi_range_read_next() function
+*/
+
+int DsMrr_impl::dsmrr_next(char **range_info)
+{
+  int res;
+  uchar *cur_range_info= 0;
+  uchar *rowid;
+
+  if (use_default_impl)
+    return h->handler::multi_range_read_next(range_info);
+  
+  do
+  {
+    if (rowids_buf_cur == rowids_buf_last)
+    {
+      if (dsmrr_eof)
+      {
+        res= HA_ERR_END_OF_FILE;
+        goto end;
+      }
+    res= dsmrr_fill_buffer();
+      if (res)
+        goto end;
+    }
+   
+    /* return eof if there are no rowids in the buffer after re-fill attempt */
+    if (rowids_buf_cur == rowids_buf_last)
+    {
+      res= HA_ERR_END_OF_FILE;
+      goto end;
+    }
+    rowid= rowids_buf_cur;
+
+    if (is_mrr_assoc)
+      memcpy(&cur_range_info, rowids_buf_cur + h->ref_length, sizeof(uchar**));
+
+    rowids_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
+    if (h2->mrr_funcs.skip_record &&
+	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
+      continue;
+    res= h->rnd_pos(table->record[0], rowid);
+    break;
+  } while (true);
+ 
+  if (is_mrr_assoc)
+  {
+    memcpy(range_info, rowid + h->ref_length, sizeof(void*));
+  }
+end:
+  return res;
+}
+
+
+/**
+  DS-MRR implementation: multi_range_read_info() function
+*/
+ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows,
+                               uint *bufsz, uint *flags, COST_VECT *cost)
+{  
+  ha_rows res;
+  uint def_flags= *flags;
+  uint def_bufsz= *bufsz;
+
+  /* Get cost/flags/mem_usage of default MRR implementation */
+  res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
+                                         &def_flags, cost);
+  DBUG_ASSERT(!res);
+
+  if ((*flags & HA_MRR_USE_DEFAULT_IMPL) || 
+      choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
+  {
+    /* Default implementation is choosen */
+    DBUG_PRINT("info", ("Default MRR implementation choosen"));
+    *flags= def_flags;
+    *bufsz= def_bufsz;
+  }
+  else
+  {
+    /* *flags and *bufsz were set by choose_mrr_impl */
+    DBUG_PRINT("info", ("DS-MRR implementation choosen"));
+  }
+  return 0;
+}
+
+
+/**
+  DS-MRR Implementation: multi_range_read_info_const() function
+*/
+
+ha_rows DsMrr_impl::dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                 void *seq_init_param, uint n_ranges, 
+                                 uint *bufsz, uint *flags, COST_VECT *cost)
+{
+  ha_rows rows;
+  uint def_flags= *flags;
+  uint def_bufsz= *bufsz;
+  /* Get cost/flags/mem_usage of default MRR implementation */
+  rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
+                                                n_ranges, &def_bufsz, 
+                                                &def_flags, cost);
+  if (rows == HA_POS_ERROR)
+  {
+    /* Default implementation can't perform MRR scan => we can't either */
+    return rows;
+  }
+
+  /*
+    If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
+    use the default MRR implementation (we need it for UPDATE/DELETE).
+    Otherwise, make a choice based on cost and @@optimizer_use_mrr.
+  */
+  if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
+      choose_mrr_impl(keyno, rows, flags, bufsz, cost))
+  {
+    DBUG_PRINT("info", ("Default MRR implementation choosen"));
+    *flags= def_flags;
+    *bufsz= def_bufsz;
+  }
+  else
+  {
+    /* *flags and *bufsz were set by choose_mrr_impl */
+    DBUG_PRINT("info", ("DS-MRR implementation choosen"));
+  }
+  return rows;
+}
+
+
+/**
+  Check if key has partially-covered columns
+
+  We can't use DS-MRR to perform range scans when the ranges are over
+  partially-covered keys, because we'll not have full key part values
+  (we'll have their prefixes from the index) and will not be able to check
+  if we've reached the end the range.
+
+  @param keyno  Key to check
+
+  @todo
+    Allow use of DS-MRR in cases where the index has partially-covered
+    components but they are not used for scanning.
+
+  @retval TRUE   Yes
+  @retval FALSE  No
+*/
+
+bool key_uses_partial_cols(TABLE *table, uint keyno)
+{
+  KEY_PART_INFO *kp= table->key_info[keyno].key_part;
+  KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
+  for (; kp != kp_end; kp++)
+  {
+    if (!kp->field->part_of_key.is_set(keyno))
+      return TRUE;
+  }
+  return FALSE;
+}
+
+/**
+  DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
+
+  Make the choice between using Default MRR implementation and DS-MRR.
+  This function contains common functionality factored out of dsmrr_info()
+  and dsmrr_info_const(). The function assumes that the default MRR
+  implementation's applicability requirements are satisfied.
+
+  @param keyno       Index number
+  @param rows        E(full rows to be retrieved)
+  @param flags  IN   MRR flags provided by the MRR user
+                OUT  If DS-MRR is choosen, flags of DS-MRR implementation
+                     else the value is not modified
+  @param bufsz  IN   If DS-MRR is choosen, buffer use of DS-MRR implementation
+                     else the value is not modified
+  @param cost   IN   Cost of default MRR implementation
+                OUT  If DS-MRR is choosen, cost of DS-MRR scan
+                     else the value is not modified
+
+  @retval TRUE   Default MRR implementation should be used
+  @retval FALSE  DS-MRR implementation should be used
+*/
+
+bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
+                                 uint *bufsz, COST_VECT *cost)
+{
+  COST_VECT dsmrr_cost;
+  bool res;
+  THD *thd= current_thd;
+  if (thd->variables.optimizer_use_mrr == 2 || *flags & HA_MRR_INDEX_ONLY ||
+      (keyno == table->s->primary_key && h->primary_key_is_clustered()) ||
+       key_uses_partial_cols(table, keyno))
+  {
+    /* Use the default implementation */
+    *flags |= HA_MRR_USE_DEFAULT_IMPL;
+    return TRUE;
+  }
+  
+  uint add_len= table->key_info[keyno].key_length + h->ref_length; 
+  *bufsz -= add_len;
+  if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
+    return TRUE;
+  *bufsz += add_len;
+  
+  bool force_dsmrr;
+  /* 
+    If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
+    DS-MRR and Default implementations cost. This allows one to force use of
+    DS-MRR whenever it is applicable without affecting other cost-based
+    choices.
+  */
+  if ((force_dsmrr= (thd->variables.optimizer_use_mrr == 1)) &&
+      dsmrr_cost.total_cost() > cost->total_cost())
+    dsmrr_cost= *cost;
+
+  if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
+  {
+    *flags &= ~HA_MRR_USE_DEFAULT_IMPL;  /* Use the DS-MRR implementation */
+    *flags &= ~HA_MRR_SORTED;          /* We will return unordered output */
+    *cost= dsmrr_cost;
+    res= FALSE;
+  }
+  else
+  {
+    /* Use the default MRR implementation */
+    res= TRUE;
+  }
+  return res;
+}
+
+
+static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost);
+
+
+/**
+  Get cost of DS-MRR scan
+
+  @param keynr              Index to be used
+  @param rows               E(Number of rows to be scanned)
+  @param flags              Scan parameters (HA_MRR_* flags)
+  @param buffer_size INOUT  Buffer size
+  @param cost        OUT    The cost
+
+  @retval FALSE  OK
+  @retval TRUE   Error, DS-MRR cannot be used (the buffer is too small
+                 for even 1 rowid)
+*/
+
+bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
+                                         uint *buffer_size, COST_VECT *cost)
+{
+  ulong max_buff_entries, elem_size;
+  ha_rows rows_in_full_step, rows_in_last_step;
+  uint n_full_steps;
+  double index_read_cost;
+
+  elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
+  max_buff_entries = *buffer_size / elem_size;
+
+  if (!max_buff_entries)
+    return TRUE; /* Buffer has not enough space for even 1 rowid */
+
+  /* Number of iterations we'll make with full buffer */
+  n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries);
+  
+  /* 
+    Get numbers of rows we'll be processing in 
+     - non-last sweep, with full buffer 
+     - last iteration, with non-full buffer
+  */
+  rows_in_full_step= max_buff_entries;
+  rows_in_last_step= rows % max_buff_entries;
+  
+  /* Adjust buffer size if we expect to use only part of the buffer */
+  if (n_full_steps)
+  {
+    get_sort_and_sweep_cost(table, rows, cost);
+    cost->multiply(n_full_steps);
+  }
+  else
+  {
+    cost->zero();
+    *buffer_size= max(*buffer_size, 
+                      (size_t)(1.2*rows_in_last_step) * elem_size + 
+                      h->ref_length + table->key_info[keynr].key_length);
+  }
+  
+  COST_VECT last_step_cost;
+  get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
+  cost->add(&last_step_cost);
+ 
+  if (n_full_steps != 0)
+    cost->mem_cost= *buffer_size;
+  else
+    cost->mem_cost= (double)rows_in_last_step * elem_size;
+  
+  /* Total cost of all index accesses */
+  index_read_cost= h->index_only_read_time(keynr, (double)rows);
+  cost->add_io(index_read_cost, 1 /* Random seeks */);
+  return FALSE;
+}
+
+
+/* 
+  Get cost of one sort-and-sweep step
+
+  SYNOPSIS
+    get_sort_and_sweep_cost()
+      table       Table being accessed
+      nrows       Number of rows to be sorted and retrieved
+      cost   OUT  The cost
+
+  DESCRIPTION
+    Get cost of these operations:
+     - sort an array of #nrows ROWIDs using qsort
+     - read #nrows records from table in a sweep.
+*/
+
+static 
+void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost)
+{
+  if (nrows)
+  {
+    get_sweep_read_cost(table, nrows, FALSE, cost);
+    /* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
+    double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
+    if (cmp_op < 3)
+      cmp_op= 3;
+    cost->cpu_cost += cmp_op * log2(cmp_op);
+  }
+  else
+    cost->zero();
+}
+
+
+/**
+  Get cost of reading nrows table records in a "disk sweep"
+
+  A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that made
+  for an ordered sequence of rowids.
+
+  We assume hard disk IO. The read is performed as follows:
+
+   1. The disk head is moved to the needed cylinder
+   2. The controller waits for the plate to rotate
+   3. The data is transferred
+
+  Time to do #3 is insignificant compared to #2+#1.
+
+  Time to move the disk head is proportional to head travel distance.
+
+  Time to wait for the plate to rotate depends on whether the disk head
+  was moved or not. 
+
+  If disk head wasn't moved, the wait time is proportional to distance
+  between the previous block and the block we're reading.
+
+  If the head was moved, we don't know how much we'll need to wait for the
+  plate to rotate. We assume the wait time to be a variate with a mean of
+  0.5 of full rotation time.
+
+  Our cost units are "random disk seeks". The cost of random disk seek is
+  actually not a constant, it depends one range of cylinders we're going
+  to access. We make it constant by introducing a fuzzy concept of "typical 
+  datafile length" (it's fuzzy as it's hard to tell whether it should
+  include index file, temp.tables etc). Then random seek cost is:
+
+    1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
+
+  We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
+
+  @param table             Table to be accessed
+  @param nrows             Number of rows to retrieve
+  @param interrupted       TRUE <=> Assume that the disk sweep will be
+                           interrupted by other disk IO. FALSE - otherwise.
+  @param cost         OUT  The cost.
+*/
+
+void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, 
+                         COST_VECT *cost)
+{
+  DBUG_ENTER("get_sweep_read_cost");
+
+  cost->zero();
+  if (table->file->primary_key_is_clustered())
+  {
+    cost->io_count= table->file->read_time(table->s->primary_key,
+                                           (uint) nrows, nrows);
+  }
+  else
+  {
+    double n_blocks=
+      ceil(ulonglong2double(table->file->stats.data_file_length) / IO_SIZE);
+    double busy_blocks=
+      n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
+    if (busy_blocks < 1.0)
+      busy_blocks= 1.0;
+
+    DBUG_PRINT("info",("sweep: nblocks=%g, busy_blocks=%g", n_blocks,
+                       busy_blocks));
+    cost->io_count= busy_blocks;
+
+    if (!interrupted)
+    {
+      /* Assume reading is done in one 'sweep' */
+      cost->avg_io_cost= (DISK_SEEK_BASE_COST +
+                          DISK_SEEK_PROP_COST*n_blocks/busy_blocks);
+    }
+  }
+  DBUG_PRINT("info",("returning cost=%g", cost->total_cost()));
+  DBUG_VOID_RETURN;
+}
+
+
+/* **************************************************************************
+ * DS-MRR implementation ends
+ ***************************************************************************/
+
+/* **************************************************************************
+ * Index Condition Pushdown code starts
+ ***************************************************************************/
+/* 
+  Check if given expression uses only table fields covered by the given index
+
+  SYNOPSIS
+    uses_index_fields_only()
+      item           Expression to check
+      tbl            The table having the index
+      keyno          The index number
+      other_tbls_ok  TRUE <=> Fields of other non-const tables are allowed
+
+  DESCRIPTION
+    Check if given expression only uses fields covered by index #keyno in the
+    table tbl. The expression can use any fields in any other tables.
+    
+    The expression is guaranteed not to be AND or OR - those constructs are 
+    handled outside of this function.
+
+  RETURN
+    TRUE   Yes
+    FALSE  No
+*/
+
+bool uses_index_fields_only(Item *item, TABLE *tbl, uint keyno, 
+                            bool other_tbls_ok)
+{
+  if (item->const_item())
+    return TRUE;
+
+  /* 
+    Don't push down the triggered conditions. Nested outer joins execution 
+    code may need to evaluate a condition several times (both triggered and
+    untriggered), and there is no way to put thi
+    TODO: Consider cloning the triggered condition and using the copies for:
+      1. push the first copy down, to have most restrictive index condition
+         possible
+      2. Put the second copy into tab->select_cond. 
+  */
+  if (item->type() == Item::FUNC_ITEM && 
+      ((Item_func*)item)->functype() == Item_func::TRIG_COND_FUNC)
+    return FALSE;
+
+  if (!(item->used_tables() & tbl->map))
+    return other_tbls_ok;
+
+  Item::Type item_type= item->type();
+  switch (item_type) {
+  case Item::FUNC_ITEM:
+    {
+      /* This is a function, apply condition recursively to arguments */
+      Item_func *item_func= (Item_func*)item;
+      Item **child;
+      Item **item_end= (item_func->arguments()) + item_func->argument_count();
+      for (child= item_func->arguments(); child != item_end; child++)
+      {
+        if (!uses_index_fields_only(*child, tbl, keyno, other_tbls_ok))
+          return FALSE;
+      }
+      return TRUE;
+    }
+  case Item::COND_ITEM:
+    {
+      /*
+        This is a AND/OR condition. Regular AND/OR clauses are handled by
+        make_cond_for_index() which will chop off the part that can be
+        checked with index. This code is for handling non-top-level AND/ORs,
+        e.g. func(x AND y).
+      */
+      List_iterator<Item> li(*((Item_cond*)item)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+        if (!uses_index_fields_only(item, tbl, keyno, other_tbls_ok))
+          return FALSE;
+      }
+      return TRUE;
+    }
+  case Item::FIELD_ITEM:
+    {
+      Item_field *item_field= (Item_field*)item;
+      if (item_field->field->table != tbl) 
+        return TRUE;
+      /*
+        The below is probably a repetition - the first part checks the
+        other two, but let's play it safe:
+      */
+      return item_field->field->part_of_key.is_set(keyno) &&
+             item_field->field->type() != MYSQL_TYPE_GEOMETRY &&
+             item_field->field->type() != MYSQL_TYPE_BLOB;
+    }
+  case Item::REF_ITEM:
+    return uses_index_fields_only(item->real_item(), tbl, keyno,
+                                  other_tbls_ok);
+  default:
+    return FALSE; /* Play it safe, don't push unknown non-const items */
+  }
+}
+
+#define ICP_COND_USES_INDEX_ONLY 10
+
+/*
+  Get a part of the condition that can be checked using only index fields
+
+  SYNOPSIS
+    make_cond_for_index()
+      cond           The source condition
+      table          The table that is partially available
+      keyno          The index in the above table. Only fields covered by the index
+                     are available
+      other_tbls_ok  TRUE <=> Fields of other non-const tables are allowed
+
+  DESCRIPTION
+    Get a part of the condition that can be checked when for the given table 
+    we have values only of fields covered by some index. The condition may
+    refer to other tables, it is assumed that we have values of all of their 
+    fields.
+
+    Example:
+      make_cond_for_index(
+         "cond(t1.field) AND cond(t2.key1) AND cond(t2.non_key) AND cond(t2.key2)",
+          t2, keyno(t2.key1)) 
+      will return
+        "cond(t1.field) AND cond(t2.key2)"
+
+  RETURN
+    Index condition, or NULL if no condition could be inferred.
+*/
+
+Item *make_cond_for_index(Item *cond, TABLE *table, uint keyno,
+                          bool other_tbls_ok)
+{
+  if (!cond)
+    return NULL;
+  if (cond->type() == Item::COND_ITEM)
+  {
+    uint n_marked= 0;
+    if (((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC)
+    {
+      table_map used_tables= 0;
+      Item_cond_and *new_cond=new Item_cond_and;
+      if (!new_cond)
+	return (COND*) 0;
+      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+	Item *fix= make_cond_for_index(item, table, keyno, other_tbls_ok);
+	if (fix)
+        {
+	  new_cond->argument_list()->push_back(fix);
+          used_tables|= fix->used_tables();
+        }
+        n_marked += test(item->marker == ICP_COND_USES_INDEX_ONLY);
+      }
+      if (n_marked ==((Item_cond*)cond)->argument_list()->elements)
+        cond->marker= ICP_COND_USES_INDEX_ONLY;
+      switch (new_cond->argument_list()->elements) {
+      case 0:
+	return (COND*) 0;
+      case 1:
+        new_cond->used_tables_cache= used_tables;
+	return new_cond->argument_list()->head();
+      default:
+	new_cond->quick_fix_field();
+        new_cond->used_tables_cache= used_tables;
+	return new_cond;
+      }
+    }
+    else /* It's OR */
+    {
+      Item_cond_or *new_cond=new Item_cond_or;
+      if (!new_cond)
+	return (COND*) 0;
+      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+	Item *fix= make_cond_for_index(item, table, keyno, other_tbls_ok);
+	if (!fix)
+	  return (COND*) 0;
+	new_cond->argument_list()->push_back(fix);
+        n_marked += test(item->marker == ICP_COND_USES_INDEX_ONLY);
+      }
+      if (n_marked ==((Item_cond*)cond)->argument_list()->elements)
+        cond->marker= ICP_COND_USES_INDEX_ONLY;
+      new_cond->quick_fix_field();
+      new_cond->used_tables_cache= ((Item_cond_or*) cond)->used_tables_cache;
+      new_cond->top_level_item();
+      return new_cond;
+    }
+  }
+
+  if (!uses_index_fields_only(cond, table, keyno, other_tbls_ok))
+    return (COND*) 0;
+  cond->marker= ICP_COND_USES_INDEX_ONLY;
+  return cond;
+}
+
+
+Item *make_cond_remainder(Item *cond, bool exclude_index)
+{
+  if (exclude_index && cond->marker == ICP_COND_USES_INDEX_ONLY)
+    return 0; /* Already checked */
+
+  if (cond->type() == Item::COND_ITEM)
+  {
+    table_map tbl_map= 0;
+    if (((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC)
+    {
+      /* Create new top level AND item */
+      Item_cond_and *new_cond=new Item_cond_and;
+      if (!new_cond)
+	return (COND*) 0;
+      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+	Item *fix= make_cond_remainder(item, exclude_index);
+	if (fix)
+        {
+	  new_cond->argument_list()->push_back(fix);
+          tbl_map |= fix->used_tables();
+        }
+      }
+      switch (new_cond->argument_list()->elements) {
+      case 0:
+	return (COND*) 0;
+      case 1:
+	return new_cond->argument_list()->head();
+      default:
+	new_cond->quick_fix_field();
+        ((Item_cond*)new_cond)->used_tables_cache= tbl_map;
+	return new_cond;
+      }
+    }
+    else /* It's OR */
+    {
+      Item_cond_or *new_cond=new Item_cond_or;
+      if (!new_cond)
+	return (COND*) 0;
+      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+	Item *fix= make_cond_remainder(item, FALSE);
+	if (!fix)
+	  return (COND*) 0;
+	new_cond->argument_list()->push_back(fix);
+        tbl_map |= fix->used_tables();
+      }
+      new_cond->quick_fix_field();
+      ((Item_cond*)new_cond)->used_tables_cache= tbl_map;
+      new_cond->top_level_item();
+      return new_cond;
+    }
+  }
+  return cond;
+}
+
+
+/*
+  Try to extract and push the index condition
+
+  SYNOPSIS
+    push_index_cond()
+      tab            A join tab that has tab->table->file and its condition
+                     in tab->select_cond
+      keyno          Index for which extract and push the condition
+      other_tbls_ok  TRUE <=> Fields of other non-const tables are allowed
+
+  DESCRIPTION
+    Try to extract and push the index condition down to table handler
+*/
+
+void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok)
+{
+  DBUG_ENTER("push_index_cond");
+  Item *idx_cond;
+  bool do_index_cond_pushdown=
+    ((tab->table->file->index_flags(keyno, 0, 1) &
+      HA_DO_INDEX_COND_PUSHDOWN) &&
+     tab->join->thd->variables.engine_condition_pushdown);
+  // psergey:
+  //
+  KEY *key_info= tab->table->key_info + keyno;
+  for (uint kp= 0; kp < key_info->key_parts; kp++)
+  {
+    if ((key_info->key_part[kp].key_part_flag & HA_PART_KEY_SEG))
+    {
+      do_index_cond_pushdown= FALSE;
+      break;
+    }
+  }
+  // :psergey
+  /*
+    When WL#5116 is done this DBUG statement must be removed. It's just a
+    temporary hack to allow us to discriminate whether a test failure relates
+    to *Engine* or *Index* Condition Pushdown.
+  */
+  DBUG_EXECUTE_IF("optimizer_no_icp", do_index_cond_pushdown= false;);
+  if (do_index_cond_pushdown)
+  {
+    DBUG_EXECUTE("where",
+                 print_where(tab->select_cond, "full cond", QT_ORDINARY););
+
+    idx_cond= make_cond_for_index(tab->select_cond, tab->table, keyno,
+                                  other_tbls_ok);
+
+    DBUG_EXECUTE("where",
+                 print_where(idx_cond, "idx cond", QT_ORDINARY););
+
+    if (idx_cond)
+    {
+      Item *idx_remainder_cond= 0;
+      tab->pre_idx_push_select_cond= tab->select_cond;
+#if 0
+      // The following is only needed for BKA:
+      /*
+        For BKA cache we store condition to special BKA cache field
+        because evaluation of the condition requires additional operations
+        before the evaluation. This condition is used in 
+        JOIN_CACHE_BKA[_UNIQUE]::skip_index_tuple() functions.
+      */
+      if (tab->use_join_cache &&
+          /*
+            if cache is used then the value is TRUE only 
+            for BKA[_UNIQUE] cache (see check_join_cache_usage func).
+            In this case other_tbls_ok is an equivalent of
+            cache->is_key_access().
+          */
+          other_tbls_ok &&
+          (idx_cond->used_tables() &
+           ~(tab->table->map | tab->join->const_table_map)))
+        tab->cache_idx_cond= idx_cond;
+      else
+#endif
+        idx_remainder_cond= tab->table->file->idx_cond_push(keyno, idx_cond);
+
+      /*
+        Disable eq_ref's "lookup cache" if we've pushed down an index
+        condition. 
+        TODO: This check happens to work on current ICP implementations, but
+        there may exist a compliant implementation that will not work 
+        correctly with it. Sort this out when we stabilize the condition
+        pushdown APIs.
+      */
+      if (idx_remainder_cond != idx_cond)
+        tab->ref.disable_cache= TRUE;
+
+      Item *row_cond= make_cond_remainder(tab->select_cond, TRUE);
+
+      DBUG_EXECUTE("where",
+                   print_where(row_cond, "remainder cond", QT_ORDINARY););
+      
+      if (row_cond)
+      {
+        if (!idx_remainder_cond)
+          tab->select_cond= row_cond;
+        else
+        {
+          COND *new_cond= new Item_cond_and(row_cond, idx_remainder_cond);
+          tab->select_cond= new_cond;
+	  tab->select_cond->quick_fix_field();
+          ((Item_cond_and*)tab->select_cond)->used_tables_cache= 
+            row_cond->used_tables() | idx_remainder_cond->used_tables();
+        }
+      }
+      else
+        tab->select_cond= idx_remainder_cond;
+      if (tab->select)
+      {
+        DBUG_EXECUTE("where",
+                     print_where(tab->select->cond,
+                                 "select_cond",
+                                 QT_ORDINARY););
+
+        tab->select->cond= tab->select_cond;
+      }
+    }
+  }
+  DBUG_VOID_RETURN;
+}
+
+/* **************************************************************************
+ * Default MRR implementation starts
+ ***************************************************************************/
+
+
+/****************************************************************************
+ * Default MRR implementation (MRR to non-MRR converter)
+ ***************************************************************************/
+
+/**
+  Get cost and other information about MRR scan over a known list of ranges
+
+  Calculate estimated cost and other information about an MRR scan for given
+  sequence of ranges.
+
+  @param keyno           Index number
+  @param seq             Range sequence to be traversed
+  @param seq_init_param  First parameter for seq->init()
+  @param n_ranges_arg    Number of ranges in the sequence, or 0 if the caller
+                         can't efficiently determine it
+  @param bufsz    INOUT  IN:  Size of the buffer available for use
+                         OUT: Size of the buffer that is expected to be actually
+                              used, or 0 if buffer is not needed.
+  @param flags    INOUT  A combination of HA_MRR_* flags
+  @param cost     OUT    Estimated cost of MRR access
+
+  @note
+    This method (or an overriding one in a derived class) must check for
+    thd->killed and return HA_POS_ERROR if it is not zero. This is required
+    for a user to be able to interrupt the calculation by killing the
+    connection/query.
+
+  @retval
+    HA_POS_ERROR  Error or the engine is unable to perform the requested
+                  scan. Values of OUT parameters are undefined.
+  @retval
+    other         OK, *cost contains cost of the scan, *bufsz and *flags
+                  contain scan parameters.
+*/
+
+ha_rows 
+handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                     void *seq_init_param, uint n_ranges_arg,
+                                     uint *bufsz, uint *flags, COST_VECT *cost)
+{
+  KEY_MULTI_RANGE range;
+  range_seq_t seq_it;
+  ha_rows rows, total_rows= 0;
+  uint n_ranges=0;
+  THD *thd= current_thd;
+  
+  /* Default MRR implementation doesn't need buffer */
+  *bufsz= 0;
+
+  seq_it= seq->init(seq_init_param, n_ranges, *flags);
+  while (!seq->next(seq_it, &range))
+  {
+    if (unlikely(thd->killed != 0))
+      return HA_POS_ERROR;
+    
+    n_ranges++;
+    key_range *min_endp, *max_endp;
+    if (range.range_flag & GEOM_FLAG)
+    {
+      /* In this case tmp_min_flag contains the handler-read-function */
+      range.start_key.flag= (ha_rkey_function) (range.range_flag ^ GEOM_FLAG);
+      min_endp= &range.start_key;
+      max_endp= NULL;
+    }
+    else
+    {
+      min_endp= range.start_key.length? &range.start_key : NULL;
+      max_endp= range.end_key.length? &range.end_key : NULL;
+    }
+    if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE))
+      rows= 1; /* there can be at most one row */
+    else
+    {
+      if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp, 
+                                                        max_endp)))
+      {
+        /* Can't scan one range => can't do MRR scan at all */
+        total_rows= HA_POS_ERROR;
+        break;
+      }
+    }
+    total_rows += rows;
+  }
+  
+  if (total_rows != HA_POS_ERROR)
+  {
+    /* The following calculation is the same as in multi_range_read_info(): */
+    *flags |= HA_MRR_USE_DEFAULT_IMPL;
+    cost->zero();
+    cost->avg_io_cost= 1; /* assume random seeks */
+    if ((*flags & HA_MRR_INDEX_ONLY) && total_rows > 2)
+      cost->io_count= index_only_read_time(keyno, (uint)total_rows);
+    else
+      cost->io_count= read_time(keyno, n_ranges, total_rows);
+    cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01;
+  }
+  return total_rows;
+}
+
+
+/**
+  Get cost and other information about MRR scan over some sequence of ranges
+
+  Calculate estimated cost and other information about an MRR scan for some
+  sequence of ranges.
+
+  The ranges themselves will be known only at execution phase. When this
+  function is called we only know number of ranges and a (rough) E(#records)
+  within those ranges.
+
+  Currently this function is only called for "n-keypart singlepoint" ranges,
+  i.e. each range is "keypart1=someconst1 AND ... AND keypartN=someconstN"
+
+  The flags parameter is a combination of those flags: HA_MRR_SORTED,
+  HA_MRR_INDEX_ONLY, HA_MRR_NO_ASSOCIATION, HA_MRR_LIMITS.
+
+  @param keyno           Index number
+  @param n_ranges        Estimated number of ranges (i.e. intervals) in the
+                         range sequence.
+  @param n_rows          Estimated total number of records contained within all
+                         of the ranges
+  @param bufsz    INOUT  IN:  Size of the buffer available for use
+                         OUT: Size of the buffer that will be actually used, or
+                              0 if buffer is not needed.
+  @param flags    INOUT  A combination of HA_MRR_* flags
+  @param cost     OUT    Estimated cost of MRR access
+
+  @retval
+    0     OK, *cost contains cost of the scan, *bufsz and *flags contain scan
+          parameters.
+  @retval
+    other Error or can't perform the requested scan
+*/
+
+ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
+                                       uint *bufsz, uint *flags, COST_VECT *cost)
+{
+  *bufsz= 0; /* Default implementation doesn't need a buffer */
+
+  *flags |= HA_MRR_USE_DEFAULT_IMPL;
+
+  cost->zero();
+  cost->avg_io_cost= 1; /* assume random seeks */
+
+  /* Produce the same cost as non-MRR code does */
+  if (*flags & HA_MRR_INDEX_ONLY)
+    cost->io_count= index_only_read_time(keyno, n_rows);
+  else
+    cost->io_count= read_time(keyno, n_ranges, n_rows);
+  return 0;
+}
+
+
+/**
+  Initialize the MRR scan
+
+  Initialize the MRR scan. This function may do heavyweight scan 
+  initialization like row prefetching/sorting/etc (NOTE: but better not do
+  it here as we may not need it, e.g. if we never satisfy WHERE clause on
+  previous tables. For many implementations it would be natural to do such
+  initializations in the first multi_read_range_next() call)
+
+  mode is a combination of the following flags: HA_MRR_SORTED,
+  HA_MRR_INDEX_ONLY, HA_MRR_NO_ASSOCIATION 
+
+  @param seq             Range sequence to be traversed
+  @param seq_init_param  First parameter for seq->init()
+  @param n_ranges        Number of ranges in the sequence
+  @param mode            Flags, see the description section for the details
+  @param buf             INOUT: memory buffer to be used
+
+  @note
+    One must have called index_init() before calling this function. Several
+    multi_range_read_init() calls may be made in course of one query.
+
+    Until WL#2623 is done (see its text, section 3.2), the following will 
+    also hold:
+    The caller will guarantee that if "seq->init == mrr_ranges_array_init"
+    then seq_init_param is an array of n_ranges KEY_MULTI_RANGE structures.
+    This property will only be used by NDB handler until WL#2623 is done.
+     
+    Buffer memory management is done according to the following scenario:
+    The caller allocates the buffer and provides it to the callee by filling
+    the members of HANDLER_BUFFER structure.
+    The callee consumes all or some fraction of the provided buffer space, and
+    sets the HANDLER_BUFFER members accordingly.
+    The callee may use the buffer memory until the next multi_range_read_init()
+    call is made, all records have been read, or until index_end() call is
+    made, whichever comes first.
+
+  @retval 0  OK
+  @retval 1  Error
+*/
+
+int
+handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
+                               uint n_ranges, uint mode, HANDLER_BUFFER *buf)
+{
+  DBUG_ENTER("handler::multi_range_read_init");
+  mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
+  mrr_funcs= *seq_funcs;
+  mrr_is_output_sorted= test(mode & HA_MRR_SORTED);
+  mrr_have_range= FALSE;
+  DBUG_RETURN(0);
+}
+
+
+/**
+  Get next record in MRR scan
+
+  Default MRR implementation: read the next record
+
+  @param range_info  OUT  Undefined if HA_MRR_NO_ASSOCIATION flag is in effect
+                          Otherwise, the opaque value associated with the range
+                          that contains the returned record.
+
+  @retval 0      OK
+  @retval other  Error code
+*/
+
+int handler::multi_range_read_next(char **range_info)
+{
+  int UNINIT_VAR(result);
+  int range_res;
+  DBUG_ENTER("handler::multi_range_read_next");
+
+  if (!mrr_have_range)
+  {
+    mrr_have_range= TRUE;
+    goto start;
+  }
+
+  do
+  {
+    /* Save a call if there can be only one row in range. */
+    if (mrr_cur_range.range_flag != (UNIQUE_RANGE | EQ_RANGE))
+    {
+      result= read_range_next();
+      /* On success or non-EOF errors jump to the end. */
+      if (result != HA_ERR_END_OF_FILE)
+        break;
+    }
+    else
+    {
+      if (was_semi_consistent_read())
+        goto scan_it_again;
+      /*
+        We need to set this for the last range only, but checking this
+        condition is more expensive than just setting the result code.
+      */
+      result= HA_ERR_END_OF_FILE;
+    }
+
+start:
+    /* Try the next range(s) until one matches a record. */
+    while (!(range_res= mrr_funcs.next(mrr_iter, &mrr_cur_range)))
+    {
+scan_it_again:
+      result= read_range_first(mrr_cur_range.start_key.keypart_map ?
+                                 &mrr_cur_range.start_key : 0,
+                               mrr_cur_range.end_key.keypart_map ?
+                                 &mrr_cur_range.end_key : 0,
+                               test(mrr_cur_range.range_flag & EQ_RANGE),
+                               mrr_is_output_sorted);
+      if (result != HA_ERR_END_OF_FILE)
+        break;
+    }
+  }
+  while ((result == HA_ERR_END_OF_FILE) && !range_res);
+
+  *range_info= mrr_cur_range.ptr;
+  DBUG_PRINT("exit",("handler::multi_range_read_next result %d", result));
+  DBUG_RETURN(result);
+}
+
diff --git a/sql/ds_mrr.h b/sql/ds_mrr.h
new file mode 100644
index 00000000000..02abe72a4c1
--- /dev/null
+++ b/sql/ds_mrr.h
@@ -0,0 +1,68 @@
+
+
+/**
+  A Disk-Sweep MRR interface implementation
+
+  This implementation makes range (and, in the future, 'ref') scans to read
+  table rows in disk sweeps. 
+  
+  Currently it is used by MyISAM and InnoDB. Potentially it can be used with
+  any table handler that has non-clustered indexes and on-disk rows.
+*/
+
+class DsMrr_impl
+{
+public:
+  typedef void (handler::*range_check_toggle_func_t)(bool on);
+
+  DsMrr_impl()
+    : h2(NULL) {};
+  
+  /*
+    The "owner" handler object (the one that calls dsmrr_XXX functions.
+    It is used to retrieve full table rows by calling rnd_pos().
+  */
+  handler *h;
+  TABLE *table; /* Always equal to h->table */
+private:
+  /* Secondary handler object.  It is used for scanning the index */
+  handler *h2;
+
+  /* Buffer to store rowids, or (rowid, range_id) pairs */
+  uchar *rowids_buf;
+  uchar *rowids_buf_cur;   /* Current position when reading/writing */
+  uchar *rowids_buf_last;  /* When reading: end of used buffer space */
+  uchar *rowids_buf_end;   /* End of the buffer */
+
+  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
+
+  /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
+  bool is_mrr_assoc;
+
+  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
+public:
+  void init(handler *h_arg, TABLE *table_arg)
+  {
+    h= h_arg; 
+    table= table_arg;
+  }
+  int dsmrr_init(handler *h, RANGE_SEQ_IF *seq_funcs, void *seq_init_param, 
+                 uint n_ranges, uint mode, HANDLER_BUFFER *buf);
+  void dsmrr_close();
+  int dsmrr_fill_buffer();
+  int dsmrr_next(char **range_info);
+
+  ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint *bufsz,
+                     uint *flags, COST_VECT *cost);
+
+  ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, 
+                            void *seq_init_param, uint n_ranges, uint *bufsz,
+                            uint *flags, COST_VECT *cost);
+private:
+  bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 
+                       COST_VECT *cost);
+  bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 
+                               uint *buffer_size, COST_VECT *cost);
+};
+
+
diff --git a/sql/handler.cc b/sql/handler.cc
index 3ff9a63b76e..3f4fec8f23a 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -4161,6 +4161,41 @@ void ha_binlog_log_query(THD *thd, handlerton *hton,
 }
 #endif
 
+
+/**
+  Calculate cost of 'index only' scan for given index and number of records
+
+  @param keynr    Index number
+  @param records  Estimated number of records to be retrieved
+
+  @note
+    It is assumed that we will read trough the whole key range and that all
+    key blocks are half full (normally things are much better). It is also
+    assumed that each time we read the next key from the index, the handler
+    performs a random seek, thus the cost is proportional to the number of
+    blocks read.
+
+  @todo
+    Consider joining this function and handler::read_time() into one
+    handler::read_time(keynr, records, ranges, bool index_only) function.
+
+  @return
+    Estimated cost of 'index only' scan
+*/
+
+double handler::index_only_read_time(uint keynr, double records)
+{
+  double read_time;
+  uint keys_per_block= (stats.block_size/2/
+			(table->key_info[keynr].key_length + ref_length) + 1);
+  read_time=((double) (records + keys_per_block-1) /
+             (double) keys_per_block);
+  return read_time;
+}
+
+#if 0 
+// psergey-mrr
+
 /**
   Read the first row of a multi-range set.
 
@@ -4287,7 +4322,7 @@ scan_it_again:
   DBUG_PRINT("exit",("handler::read_multi_range_next: result %d", result));
   DBUG_RETURN(result);
 }
-
+#endif
 
 /**
   Read first row between two ranges.
@@ -4391,7 +4426,7 @@ int handler::read_range_next()
 int handler::compare_key(key_range *range)
 {
   int cmp;
-  if (!range)
+  if (!range || in_range_check_pushed_down)
     return 0;					// No max range
   cmp= key_cmp(range_key_part, range->key, range->length);
   if (!cmp)
@@ -4400,6 +4435,23 @@ int handler::compare_key(key_range *range)
 }
 
 
+/*
+  Same as compare_key() but doesn't check have in_range_check_pushed_down.
+  This is used by index condition pushdown implementation.
+*/
+
+int handler::compare_key2(key_range *range)
+{
+  int cmp;
+  if (!range)
+    return 0;					// no max range
+  cmp= key_cmp(range_key_part, range->key, range->length);
+  if (!cmp)
+    cmp= key_compare_result_on_equal;
+  return cmp;
+}
+
+
 int handler::index_read_idx_map(uchar * buf, uint index, const uchar * key,
                                 key_part_map keypart_map,
                                 enum ha_rkey_function find_flag)
diff --git a/sql/handler.h b/sql/handler.h
index cfea6175733..36c1a37f200 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -132,6 +132,8 @@
 /* Has automatic checksums and uses the new checksum format */
 #define HA_HAS_NEW_CHECKSUM    (LL(1) << 36)
 
+#define HA_MRR_CANT_SORT       (LL(1) << 37)
+
 /*
   Set of all binlog flags. Currently only contain the capabilities
   flags.
@@ -147,6 +149,15 @@
 #define HA_KEYREAD_ONLY         64	/* Support HA_EXTRA_KEYREAD */
 
 /*
+  Index scan will not return records in rowid order. Not guaranteed to be
+  set for unordered (e.g. HASH) indexes.
+*/
+#define HA_KEY_SCAN_NOT_ROR     128 
+#define HA_DO_INDEX_COND_PUSHDOWN  256 /* Supports Index Condition Pushdown */
+
+
+
+/*
   bits in alter_table_flags:
 */
 /*
@@ -199,12 +210,6 @@
 #define HA_FAST_CHANGE_PARTITION                (1L << 13)
 #define HA_PARTITION_ONE_PHASE                  (1L << 14)
 
-/*
-  Index scan will not return records in rowid order. Not guaranteed to be
-  set for unordered (e.g. HASH) indexes.
-*/
-#define HA_KEY_SCAN_NOT_ROR     128 
-
 /* operations for disable/enable indexes */
 #define HA_KEY_SWITCH_NONUNIQ      0
 #define HA_KEY_SWITCH_ALL          1
@@ -1015,6 +1020,186 @@ typedef struct st_ha_check_opt
 } HA_CHECK_OPT;
 
 
+/********************************************************************************
+ * MRR
+ ********************************************************************************/
+
+typedef void *range_seq_t;
+
+typedef struct st_range_seq_if
+{
+  /*
+    Initialize the traversal of range sequence
+    
+    SYNOPSIS
+      init()
+        init_params  The seq_init_param parameter 
+        n_ranges     The number of ranges obtained 
+        flags        A combination of HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY
+
+    RETURN
+      An opaque value to be used as RANGE_SEQ_IF::next() parameter
+  */
+  range_seq_t (*init)(void *init_params, uint n_ranges, uint flags);
+
+
+  /*
+    Get the next range in the range sequence
+
+    SYNOPSIS
+      next()
+        seq    The value returned by RANGE_SEQ_IF::init()
+        range  OUT Information about the next range
+    
+    RETURN
+      0 - Ok, the range structure filled with info about the next range
+      1 - No more ranges
+  */
+  uint (*next) (range_seq_t seq, KEY_MULTI_RANGE *range);
+
+  /*
+    Check whether range_info orders to skip the next record
+
+    SYNOPSIS
+      skip_record()
+        seq         The value returned by RANGE_SEQ_IF::init()
+        range_info  Information about the next range 
+                    (Ignored if MRR_NO_ASSOCIATION is set)
+        rowid       Rowid of the record to be checked (ignored if set to 0)
+    
+    RETURN
+      1 - Record with this range_info and/or this rowid shall be filtered
+          out from the stream of records returned by multi_range_read_next()
+      0 - The record shall be left in the stream
+  */ 
+  bool (*skip_record) (range_seq_t seq, char *range_info, uchar *rowid);
+
+  /*
+    Check if the record combination matches the index condition
+    SYNOPSIS
+      skip_index_tuple()
+        seq         The value returned by RANGE_SEQ_IF::init()
+        range_info  Information about the next range 
+    
+    RETURN
+      0 - The record combination satisfies the index condition
+      1 - Otherwise
+  */ 
+  bool (*skip_index_tuple) (range_seq_t seq, char *range_info);
+} RANGE_SEQ_IF;
+
+class COST_VECT
+{ 
+public:
+  double io_count;     /* number of I/O                 */
+  double avg_io_cost;  /* cost of an average I/O oper.  */
+  double cpu_cost;     /* cost of operations in CPU     */
+  double mem_cost;     /* cost of used memory           */ 
+  double import_cost;  /* cost of remote operations     */
+  
+  enum { IO_COEFF=1 };
+  enum { CPU_COEFF=1 };
+  enum { MEM_COEFF=1 };
+  enum { IMPORT_COEFF=1 };
+
+  COST_VECT() {}                              // keep gcc happy
+
+  double total_cost() 
+  {
+    return IO_COEFF*io_count*avg_io_cost + CPU_COEFF * cpu_cost +
+           MEM_COEFF*mem_cost + IMPORT_COEFF*import_cost;
+  }
+
+  void zero()
+  {
+    avg_io_cost= 1.0;
+    io_count= cpu_cost= mem_cost= import_cost= 0.0;
+  }
+
+  void multiply(double m)
+  {
+    io_count *= m;
+    cpu_cost *= m;
+    import_cost *= m;
+    /* Don't multiply mem_cost */
+  }
+
+  void add(const COST_VECT* cost)
+  {
+    double io_count_sum= io_count + cost->io_count;
+    add_io(cost->io_count, cost->avg_io_cost);
+    io_count= io_count_sum;
+    cpu_cost += cost->cpu_cost;
+  }
+  void add_io(double add_io_cnt, double add_avg_cost)
+  {
+    double io_count_sum= io_count + add_io_cnt;
+    avg_io_cost= (io_count * avg_io_cost + 
+                  add_io_cnt * add_avg_cost) / io_count_sum;
+    io_count= io_count_sum;
+  }
+
+  /*
+    To be used when we go from old single value-based cost calculations to
+    the new COST_VECT-based.
+  */
+  void convert_from_cost(double cost)
+  {
+    zero();
+    avg_io_cost= 1.0;
+    io_count= cost;
+  }
+};
+
+void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, 
+                         COST_VECT *cost);
+
+/*
+  The below two are not used (and not handled) in this milestone of this WL
+  entry because there seems to be no use for them at this stage of
+  implementation.
+*/
+#define HA_MRR_SINGLE_POINT 1
+#define HA_MRR_FIXED_KEY  2
+
+/* 
+  Indicates that RANGE_SEQ_IF::next(&range) doesn't need to fill in the
+  'range' parameter.
+*/
+#define HA_MRR_NO_ASSOCIATION 4
+
+/* 
+  The MRR user will provide ranges in key order, and MRR implementation
+  must return rows in key order.
+*/
+#define HA_MRR_SORTED 8
+
+/* MRR implementation doesn't have to retrieve full records */
+#define HA_MRR_INDEX_ONLY 16
+
+/* 
+  The passed memory buffer is of maximum possible size, the caller can't
+  assume larger buffer.
+*/
+#define HA_MRR_LIMITS 32
+
+
+/*
+  Flag set <=> default MRR implementation is used
+  (The choice is made by **_info[_const]() function which may set this
+   flag. SQL layer remembers the flag value and then passes it to
+   multi_read_range_init().
+*/
+#define HA_MRR_USE_DEFAULT_IMPL 64
+
+/*
+  Used only as parameter to multi_range_read_info():
+  Flag set <=> the caller guarantees that the bounds of the scanned ranges
+  will not have NULL values.
+*/
+#define HA_MRR_NO_NULL_ENDPOINTS 128
+
+
 
 /*
   This is a buffer area that the handler can use to store rows.
@@ -1025,8 +1210,8 @@ typedef struct st_ha_check_opt
 
 typedef struct st_handler_buffer
 {
-  const uchar *buffer;         /* Buffer one can start using */
-  const uchar *buffer_end;     /* End of buffer */
+  /* const? */uchar *buffer;         /* Buffer one can start using */
+  /* const? */uchar *buffer_end;     /* End of buffer */
   uchar *end_of_used_area;     /* End of area that was used by handler */
 } HANDLER_BUFFER;
 
@@ -1057,11 +1242,16 @@ public:
   ulong update_time;
   uint block_size;			/* index block size */
 
+  /*
+    number of buffer bytes that native mrr implementation needs,
+  */
+  uint mrr_length_per_rec; 
+
   ha_statistics():
     data_file_length(0), max_data_file_length(0),
     index_file_length(0), delete_length(0), auto_increment_value(0),
     records(0), deleted(0), mean_rec_length(0), create_time(0),
-    check_time(0), update_time(0), block_size(0)
+    check_time(0), update_time(0), block_size(0), mrr_length_per_rec(0)
   {}
 };
 
@@ -1101,10 +1291,27 @@ public:
   ha_statistics stats;
 
   /** The following are for read_multi_range */
+//////psergey: was: 
+#if 0
   bool multi_range_sorted;
   KEY_MULTI_RANGE *multi_range_curr;
   KEY_MULTI_RANGE *multi_range_end;
   HANDLER_BUFFER *multi_range_buffer;
+#endif
+//////psergey
+  /** MultiRangeRead-related members: */
+  range_seq_t mrr_iter;    /* Interator to traverse the range sequence */
+  RANGE_SEQ_IF mrr_funcs;  /* Range sequence traversal functions */
+  HANDLER_BUFFER *multi_range_buffer; /* MRR buffer info */
+  uint ranges_in_seq; /* Total number of ranges in the traversed sequence */
+  /* TRUE <=> source MRR ranges and the output are ordered */
+  bool mrr_is_output_sorted;
+
+  /** TRUE <=> we're currently traversing a range in mrr_cur_range. */
+  bool mrr_have_range;
+  /** Current range (the one we're now returning rows from) */
+  KEY_MULTI_RANGE mrr_cur_range;
+//////psergey
 
   /** The following are for read_range() */
   key_range save_end_range, *end_range;
@@ -1112,6 +1319,12 @@ public:
   int key_compare_result_on_equal;
   bool eq_range;
 
+  /* 
+    TRUE <=> the engine guarantees that returned records are within the range
+    being scanned.
+  */
+  bool in_range_check_pushed_down;
+
   uint errkey;				/* Last dup key */
   uint key_used_on_scan;
   uint active_index;
@@ -1122,6 +1335,8 @@ public:
   bool locked;
   bool implicit_emptied;                /* Can be !=0 only if HEAP */
   const COND *pushed_cond;
+  Item *pushed_idx_cond;
+  uint pushed_idx_cond_keyno;  /* The index which the above condition is for */
   /**
     next_insert_id is the next value which should be inserted into the
     auto_increment column: in a inserting-multi-row statement (like INSERT
@@ -1161,7 +1376,8 @@ public:
   handler(handlerton *ht_arg, TABLE_SHARE *share_arg)
     :table_share(share_arg), table(0),
     estimation_rows_to_insert(0), ht(ht_arg),
-    ref(0), key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
+    ref(0), in_range_check_pushed_down(FALSE),
+    key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
     ref_length(sizeof(my_off_t)),
     ft_handler(0), inited(NONE),
     locked(FALSE), implicit_emptied(0),
@@ -1192,6 +1408,7 @@ public:
     DBUG_ASSERT(inited==NONE);
     if (!(result= index_init(idx, sorted)))
       inited=INDEX;
+    end_range= NULL;
     DBUG_RETURN(result);
   }
   int ha_index_end()
@@ -1199,6 +1416,7 @@ public:
     DBUG_ENTER("ha_index_end");
     DBUG_ASSERT(inited==INDEX);
     inited=NONE;
+    end_range= NULL;
     DBUG_RETURN(index_end());
   }
   /* This is called after index_init() if we need to do a index scan */
@@ -1307,6 +1525,7 @@ public:
   { return ulonglong2double(stats.data_file_length) / IO_SIZE + 2; }
   virtual double read_time(uint index, uint ranges, ha_rows rows)
   { return rows2double(ranges+rows); }
+  virtual double index_only_read_time(uint keynr, double records);
   virtual const key_map *keys_to_use_for_scanning() { return &key_map_empty; }
   bool has_transactions()
   { return (ha_table_flags() & HA_NO_TRANSACTIONS) == 0; }
@@ -1523,16 +1742,30 @@ public:
       update_index_statistics();
     return error;
   }
-
+#if 0
   virtual int read_multi_range_first(KEY_MULTI_RANGE **found_range_p,
                                      KEY_MULTI_RANGE *ranges, uint range_count,
                                      bool sorted, HANDLER_BUFFER *buffer);
   virtual int read_multi_range_next(KEY_MULTI_RANGE **found_range_p);
+#endif 
+  //psergey-mrr: 
+  virtual ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                              void *seq_init_param, 
+                                              uint n_ranges, uint *bufsz,
+                                              uint *flags, COST_VECT *cost);
+  virtual ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+                                        uint *bufsz, uint *flags, COST_VECT *cost);
+  virtual int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
+                                    uint n_ranges, uint mode,
+                                    HANDLER_BUFFER *buf);
+  virtual int multi_range_read_next(char **range_info);
+  //psergey-mrr: ends
   virtual int read_range_first(const key_range *start_key,
                                const key_range *end_key,
                                bool eq_range, bool sorted);
   virtual int read_range_next();
   int compare_key(key_range *range);
+  int compare_key2(key_range *range);
   virtual int ft_init() { return HA_ERR_WRONG_COMMAND; }
   void ft_end() { ft_handler=NULL; }
   virtual FT_INFO *ft_init_ext(uint flags, uint inx,String *key)
@@ -1878,6 +2111,7 @@ public:
    Pops the top if condition stack, if stack is not empty.
  */
  virtual void cond_pop() { return; };
+ virtual Item *idx_cond_push(uint keyno, Item* idx_cond) { return idx_cond; }
  virtual bool check_if_incompatible_data(HA_CREATE_INFO *create_info,
 					 uint table_changes)
  { return COMPATIBLE_DATA_NO; }
@@ -2093,8 +2327,10 @@ private:
   virtual int rename_partitions(const char *path)
   { return HA_ERR_WRONG_COMMAND; }
   friend class ha_partition;
+  friend class DsMrr_impl;
 };
 
+#include "ds_mrr.h"
 
 	/* Some extern variables used with handlers */
 
diff --git a/sql/item.h b/sql/item.h
index 82fb3f3a3e0..640b0cc5375 100644
--- a/sql/item.h
+++ b/sql/item.h
@@ -25,7 +25,7 @@ bool trace_unsupported_func(const char *where, const char *processor_name)
   sprintf(buff, "%s::%s", where, processor_name);
   DBUG_ENTER(buff);
   sprintf(buff, "%s returns TRUE: unsupported function", processor_name);
-  DBUG_PRINT("info", (buff));
+  DBUG_PRINT("info", ("%s", buff));
   DBUG_RETURN(TRUE);
 }
 
diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h
index 77f55db145a..4f576bfca4c 100644
--- a/sql/mysql_priv.h
+++ b/sql/mysql_priv.h
@@ -345,11 +345,11 @@ protected:
   The cost of average seek 
     DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST*BLOCKS_IN_AVG_SEEK =1.0.
 */
-#define DISK_SEEK_BASE_COST ((double)0.5)
+#define DISK_SEEK_BASE_COST ((double)0.9)
 
 #define BLOCKS_IN_AVG_SEEK  128
 
-#define DISK_SEEK_PROP_COST ((double)0.5/BLOCKS_IN_AVG_SEEK)
+#define DISK_SEEK_PROP_COST ((double)0.1/BLOCKS_IN_AVG_SEEK)
 
 
 /**
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index 81f02157ee0..9ef69e3f5a6 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -3526,6 +3526,8 @@ static int init_common_variables(const char *conf_file_name, int argc,
   global_system_variables.character_set_results= default_charset_info;
   global_system_variables.character_set_client= default_charset_info;
 
+  global_system_variables.optimizer_use_mrr= 1;
+
   if (!(character_set_filesystem=
         get_charset_by_csname(character_set_filesystem_name,
                               MY_CS_PRIMARY, MYF(MY_WME))))
@@ -6961,11 +6963,6 @@ The minimum value for this variable is 4096.",
    (uchar**) &global_system_variables.min_examined_row_limit,
    (uchar**) &max_system_variables.min_examined_row_limit, 0, GET_ULONG,
   REQUIRED_ARG, 0, 0, (longlong) ULONG_MAX, 0, 1L, 0},
-  {"multi_range_count", OPT_MULTI_RANGE_COUNT,
-   "Number of key ranges to request at once.",
-   (uchar**) &global_system_variables.multi_range_count,
-   (uchar**) &max_system_variables.multi_range_count, 0,
-   GET_ULONG, REQUIRED_ARG, 256, 1, (longlong) ULONG_MAX, 0, 1, 0},
   {"myisam_block_size", OPT_MYISAM_BLOCK_SIZE,
    "Block size to be used for MyISAM index pages.",
    (uchar**) &opt_myisam_block_size,
@@ -7129,7 +7126,7 @@ The minimum value for this variable is 4096.",
    (uchar**) &global_system_variables.read_rnd_buff_size,
    (uchar**) &max_system_variables.read_rnd_buff_size, 0,
    GET_ULONG, REQUIRED_ARG, 256*1024L, IO_SIZE*2+MALLOC_OVERHEAD,
-   INT_MAX32, MALLOC_OVERHEAD, IO_SIZE, 0},
+   INT_MAX32, MALLOC_OVERHEAD, 1 /* Small overhead to be able to test MRR, was: IO_SIZE*/ , 0},
   {"record_buffer", OPT_RECORD_BUFFER,
    "Alias for read_buffer_size",
    (uchar**) &global_system_variables.read_buff_size,
diff --git a/sql/opt_range.cc b/sql/opt_range.cc
index adb5a7c8c96..c75999da00c 100644
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -34,7 +34,7 @@
     
     The lists are returned in form of complicated structure of interlinked
     SEL_TREE/SEL_IMERGE/SEL_ARG objects.
-    See check_quick_keys, find_used_partitions for examples of how to walk 
+    See quick_range_seq_next, find_used_partitions for examples of how to walk 
     this structure.
     All direct "users" of this module are located within this file, too.
 
@@ -59,6 +59,48 @@
 
   Record retrieval code for range/index_merge/groupby-min-max.
     Implementations of QUICK_*_SELECT classes.
+
+  KeyTupleFormat
+  ~~~~~~~~~~~~~~
+  The code in this file (and elsewhere) makes operations on key value tuples.
+  Those tuples are stored in the following format:
+  
+  The tuple is a sequence of key part values. The length of key part value
+  depends only on its type (and not depends on the what value is stored)
+  
+    KeyTuple: keypart1-data, keypart2-data, ...
+  
+  The value of each keypart is stored in the following format:
+  
+    keypart_data: [isnull_byte] keypart-value-bytes
+
+  If a keypart may have a NULL value (key_part->field->real_maybe_null() can
+  be used to check this), then the first byte is a NULL indicator with the 
+  following valid values:
+    1  - keypart has NULL value.
+    0  - keypart has non-NULL value.
+
+  <questionable-statement> If isnull_byte==1 (NULL value), then the following
+  keypart->length bytes must be 0.
+  </questionable-statement>
+
+  keypart-value-bytes holds the value. Its format depends on the field type.
+  The length of keypart-value-bytes may or may not depend on the value being
+  stored. The default is that length is static and equal to 
+  KEY_PART_INFO::length.
+  
+  Key parts with (key_part_flag & HA_BLOB_PART) have length depending of the 
+  value:
+  
+     keypart-value-bytes: value_length value_bytes
+
+  The value_length part itself occupies HA_KEY_BLOB_LENGTH=2 bytes.
+
+  See key_copy() and key_restore() for code to move data between index tuple
+  and table record
+
+  CAUTION: the above description is only sergefp's understanding of the 
+           subject and may omit some details.
 */
 
 #ifdef USE_PRAGMA_IMPLEMENTATION
@@ -403,6 +445,7 @@ public:
   /* returns a number of keypart values (0 or 1) appended to the key buffer */
   int store_min(uint length, uchar **min_key,uint min_key_flag)
   {
+    /* "(kp1 > c1) AND (kp2 OP c2) AND ..." -> (kp1 > c1) */
     if ((min_flag & GEOM_FLAG) ||
         (!(min_flag & NO_MIN_RANGE) &&
 	!(min_key_flag & (NO_MIN_RANGE | NEAR_MIN))))
@@ -627,6 +670,11 @@ public:
   */
   bool using_real_indexes;
   
+  /*
+    Aggressively remove "scans" that do not have conditions on first
+    keyparts. Such scans are usable when doing partition pruning but not
+    regular range optimization.
+  */
   bool remove_jump_scans;
   
   /*
@@ -634,8 +682,17 @@ public:
     using_real_indexes==TRUE
   */
   uint real_keynr[MAX_KEY];
+
+  /*
+    Used to store 'current key tuples', in both range analysis and
+    partitioning (list) analysis
+  */
+  uchar min_key[MAX_KEY_LENGTH+MAX_FIELD_WIDTH],
+    max_key[MAX_KEY_LENGTH+MAX_FIELD_WIDTH];
+
   /* Number of SEL_ARG objects allocated by SEL_ARG::clone_tree operations */
   uint alloced_sel_args; 
+  bool force_default_mrr;
 };
 
 class PARAM : public RANGE_OPT_PARAM
@@ -645,8 +702,6 @@ public:
   longlong baseflag;
   uint max_key_part, range_count;
 
-  uchar min_key[MAX_KEY_LENGTH+MAX_FIELD_WIDTH],
-    max_key[MAX_KEY_LENGTH+MAX_FIELD_WIDTH];
   bool quick;				// Don't calulate possible keys
 
   uint fields_bitmap_size;
@@ -683,15 +738,14 @@ static SEL_ARG *get_mm_leaf(RANGE_OPT_PARAM *param,COND *cond_func,Field *field,
 static SEL_TREE *get_mm_tree(RANGE_OPT_PARAM *param,COND *cond);
 
 static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts);
-static ha_rows check_quick_select(PARAM *param,uint index,SEL_ARG *key_tree,
-                                  bool update_tbl_stats);
-static ha_rows check_quick_keys(PARAM *param,uint index,SEL_ARG *key_tree,
-                                uchar *min_key, uint min_key_flag, int,
-                                uchar *max_key, uint max_key_flag, int);
+static ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
+                                  SEL_ARG *tree, bool update_tbl_stats, 
+                                  uint *mrr_flags, uint *bufsize,
+                                  COST_VECT *cost);
 
 QUICK_RANGE_SELECT *get_quick_select(PARAM *param,uint index,
-                                     SEL_ARG *key_tree,
-                                     MEM_ROOT *alloc = NULL);
+                                     SEL_ARG *key_tree, uint mrr_flags, 
+                                     uint mrr_buf_size, MEM_ROOT *alloc);
 static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree,
                                        bool index_read_must_be_used,
                                        bool update_tbl_stats,
@@ -709,8 +763,6 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge,
                                          double read_time);
 static
 TRP_GROUP_MIN_MAX *get_best_group_min_max(PARAM *param, SEL_TREE *tree);
-static double get_index_only_read_time(const PARAM* param, ha_rows records,
-                                       int keynr);
 
 #ifndef DBUG_OFF
 static void print_sel_tree(PARAM *param, SEL_TREE *tree, key_map *tree_map,
@@ -1099,25 +1151,22 @@ QUICK_SELECT_I::QUICK_SELECT_I()
 {}
 
 QUICK_RANGE_SELECT::QUICK_RANGE_SELECT(THD *thd, TABLE *table, uint key_nr,
-                                       bool no_alloc, MEM_ROOT *parent_alloc)
-  :dont_free(0),error(0),free_file(0),in_range(0),cur_range(NULL),last_range(0)
+                                       bool no_alloc, MEM_ROOT *parent_alloc,
+                                       bool *create_error)
+  :free_file(0),cur_range(NULL),last_range(0),dont_free(0)
 {
   my_bitmap_map *bitmap;
   DBUG_ENTER("QUICK_RANGE_SELECT::QUICK_RANGE_SELECT");
 
   in_ror_merged_scan= 0;
-  sorted= 0;
   index= key_nr;
   head=  table;
   key_part_info= head->key_info[index].key_part;
   my_init_dynamic_array(&ranges, sizeof(QUICK_RANGE*), 16, 16);
 
   /* 'thd' is not accessible in QUICK_RANGE_SELECT::reset(). */
-  multi_range_bufsiz= thd->variables.read_rnd_buff_size;
-  multi_range_count= thd->variables.multi_range_count;
-  multi_range_length= 0;
-  multi_range= NULL;
-  multi_range_buff= NULL;
+  mrr_buf_size= thd->variables.read_rnd_buff_size;
+  mrr_buf_desc= NULL;
 
   if (!no_alloc && !parent_alloc)
   {
@@ -1132,12 +1181,12 @@ QUICK_RANGE_SELECT::QUICK_RANGE_SELECT(THD *thd, TABLE *table, uint key_nr,
   save_read_set= head->read_set;
   save_write_set= head->write_set;
 
-  /* Allocate a bitmap for used columns */
+  /* Allocate a bitmap for used columns (Q: why not on MEM_ROOT?) */
   if (!(bitmap= (my_bitmap_map*) my_malloc(head->s->column_bitmap_size,
                                            MYF(MY_WME))))
   {
     column_bitmap.bitmap= 0;
-    error= 1;
+    *create_error= 1;
   }
   else
     bitmap_init(&column_bitmap, bitmap, head->s->fields, FALSE);
@@ -1145,6 +1194,20 @@ QUICK_RANGE_SELECT::QUICK_RANGE_SELECT(THD *thd, TABLE *table, uint key_nr,
 }
 
 
+void QUICK_RANGE_SELECT::need_sorted_output()
+{
+  if (!(mrr_flags & HA_MRR_SORTED))
+  {
+    /*
+      Native implementation can't produce sorted output. We'll have to
+      switch to default
+    */
+    mrr_flags |= HA_MRR_USE_DEFAULT_IMPL; 
+  }
+  mrr_flags |= HA_MRR_SORTED;
+}
+
+
 int QUICK_RANGE_SELECT::init()
 {
   DBUG_ENTER("QUICK_RANGE_SELECT::init");
@@ -1190,8 +1253,7 @@ QUICK_RANGE_SELECT::~QUICK_RANGE_SELECT()
     my_free((char*) column_bitmap.bitmap, MYF(MY_ALLOW_ZERO_PTR));
   }
   head->column_bitmaps_set(save_read_set, save_write_set);
-  x_free(multi_range);
-  x_free(multi_range_buff);
+  x_free(mrr_buf_desc);
   DBUG_VOID_RETURN;
 }
 
@@ -1966,9 +2028,11 @@ class TRP_RANGE : public TABLE_READ_PLAN
 public:
   SEL_ARG *key; /* set of intervals to be used in "range" method retrieval */
   uint     key_idx; /* key number in PARAM::key */
+  uint     mrr_flags; 
+  uint     mrr_buf_size;
 
-  TRP_RANGE(SEL_ARG *key_arg, uint idx_arg)
-   : key(key_arg), key_idx(idx_arg)
+  TRP_RANGE(SEL_ARG *key_arg, uint idx_arg, uint mrr_flags_arg)
+   : key(key_arg), key_idx(idx_arg), mrr_flags(mrr_flags_arg)
   {}
   virtual ~TRP_RANGE() {}                     /* Remove gcc warning */
 
@@ -1977,7 +2041,8 @@ public:
   {
     DBUG_ENTER("TRP_RANGE::make_quick");
     QUICK_RANGE_SELECT *quick;
-    if ((quick= get_quick_select(param, key_idx, key, parent_alloc)))
+    if ((quick= get_quick_select(param, key_idx, key,  mrr_flags, 
+                                 mrr_buf_size, parent_alloc)))
     {
       quick->records= records;
       quick->read_time= read_cost;
@@ -2202,7 +2267,8 @@ static int fill_used_fields_bitmap(PARAM *param)
 
 int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use,
 				  table_map prev_tables,
-				  ha_rows limit, bool force_quick_range)
+				  ha_rows limit, bool force_quick_range, 
+                                  bool ordered_output)
 {
   uint idx;
   double scan_time;
@@ -2260,6 +2326,7 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use,
     param.imerge_cost_buff_size= 0;
     param.using_real_indexes= TRUE;
     param.remove_jump_scans= TRUE;
+    param.force_default_mrr= ordered_output;
 
     thd->no_errors=1;				// Don't warn about NULL
     init_sql_alloc(&alloc, thd->variables.range_alloc_block_size, 0);
@@ -2313,9 +2380,10 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use,
     if (!head->covering_keys.is_clear_all())
     {
       int key_for_use= find_shortest_key(head, &head->covering_keys);
-      double key_read_time= (get_index_only_read_time(&param, records,
-                                                     key_for_use) +
-                             (double) records / TIME_FOR_COMPARE);
+      double key_read_time= 
+        param.table->file->index_only_read_time(key_for_use, 
+                                                rows2double(records)) +
+        (double) records / TIME_FOR_COMPARE;
       DBUG_PRINT("info",  ("'all'+'using index' scan will be using key %d, "
                            "read time %g", key_for_use, key_read_time));
       if (key_read_time < read_time)
@@ -3937,42 +4005,6 @@ skip_to_ror_scan:
 }
 
 
-/*
-  Calculate cost of 'index only' scan for given index and number of records.
-
-  SYNOPSIS
-    get_index_only_read_time()
-      param    parameters structure
-      records  #of records to read
-      keynr    key to read
-
-  NOTES
-    It is assumed that we will read trough the whole key range and that all
-    key blocks are half full (normally things are much better). It is also
-    assumed that each time we read the next key from the index, the handler
-    performs a random seek, thus the cost is proportional to the number of
-    blocks read.
-
-  TODO:
-    Move this to handler->read_time() by adding a flag 'index-only-read' to
-    this call. The reason for doing this is that the current function doesn't
-    handle the case when the row is stored in the b-tree (like in innodb
-    clustered index)
-*/
-
-static double get_index_only_read_time(const PARAM* param, ha_rows records,
-                                       int keynr)
-{
-  double read_time;
-  uint keys_per_block= (param->table->file->stats.block_size/2/
-			(param->table->key_info[keynr].key_length+
-			 param->table->file->ref_length) + 1);
-  read_time=((double) (records+keys_per_block-1)/
-             (double) keys_per_block);
-  return read_time;
-}
-
-
 typedef struct st_ror_scan_info
 {
   uint      idx;      /* # of used key in param->keys */
@@ -4048,9 +4080,9 @@ ROR_SCAN_INFO *make_ror_scan(const PARAM *param, int idx, SEL_ARG *sel_arg)
     if (bitmap_is_set(&param->needed_fields, key_part->fieldnr-1))
       bitmap_set_bit(&ror_scan->covered_fields, key_part->fieldnr-1);
   }
+  double rows= rows2double(param->table->quick_rows[ror_scan->keynr]);
   ror_scan->index_read_cost=
-    get_index_only_read_time(param, param->table->quick_rows[ror_scan->keynr],
-                             ror_scan->keynr);
+    param->table->file->index_only_read_time(ror_scan->keynr, rows);
   DBUG_RETURN(ror_scan);
 }
 
@@ -4837,9 +4869,11 @@ static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree,
                                        bool update_tbl_stats,
                                        double read_time)
 {
-  int idx;
+  uint idx;
   SEL_ARG **key,**end, **key_to_read= NULL;
   ha_rows UNINIT_VAR(best_records);              /* protected by key_to_read */
+  uint    UNINIT_VAR(best_mrr_flags),            /* protected by key_to_read */
+          UNINIT_VAR(best_buf_size);             /* protected by key_to_read */
   TRP_RANGE* read_plan= NULL;
   bool pk_is_clustered= param->table->file->primary_key_is_clustered();
   DBUG_ENTER("get_key_scans_params");
@@ -4852,64 +4886,40 @@ static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree,
                                       "tree scans"););
   tree->ror_scans_map.clear_all();
   tree->n_ror_scans= 0;
-  for (idx= 0,key=tree->keys, end=key+param->keys;
-       key != end ;
-       key++,idx++)
+  for (idx= 0,key=tree->keys, end=key+param->keys; key != end; key++,idx++)
   {
-    ha_rows found_records;
-    double found_read_time;
     if (*key)
     {
+      ha_rows found_records;
+      COST_VECT cost;
+      double found_read_time;
+      uint mrr_flags, buf_size;
       uint keynr= param->real_keynr[idx];
       if ((*key)->type == SEL_ARG::MAYBE_KEY ||
           (*key)->maybe_flag)
         param->needed_reg->set_bit(keynr);
 
-      bool read_index_only= index_read_must_be_used ? TRUE :
-                            (bool) param->table->covering_keys.is_set(keynr);
+      bool read_index_only= index_read_must_be_used || 
+                            param->table->covering_keys.is_set(keynr);
 
-      found_records= check_quick_select(param, idx, *key, update_tbl_stats);
-      if (param->is_ror_scan)
+      found_records= check_quick_select(param, idx, read_index_only, *key,
+                                        update_tbl_stats, &mrr_flags,
+                                        &buf_size, &cost);
+
+      if ((found_records != HA_POS_ERROR) && param->is_ror_scan)
       {
         tree->n_ror_scans++;
         tree->ror_scans_map.set_bit(idx);
       }
-      double cpu_cost= (double) found_records / TIME_FOR_COMPARE;
-      if (found_records != HA_POS_ERROR && found_records > 2 &&
-          read_index_only &&
-          (param->table->file->index_flags(keynr, param->max_key_part,1) &
-           HA_KEYREAD_ONLY) &&
-          !(pk_is_clustered && keynr == param->table->s->primary_key))
-      {
-        /*
-          We can resolve this by only reading through this key. 
-          0.01 is added to avoid races between range and 'index' scan.
-        */
-        found_read_time= get_index_only_read_time(param,found_records,keynr) +
-                         cpu_cost + 0.01;
-      }
-      else
-      {
-        /*
-          cost(read_through_index) = cost(disk_io) + cost(row_in_range_checks)
-          The row_in_range check is in QUICK_RANGE_SELECT::cmp_next function.
-        */
-	found_read_time= param->table->file->read_time(keynr,
-                                                       param->range_count,
-                                                       found_records) +
-			 cpu_cost + 0.01;
-      }
-      DBUG_PRINT("info",("key %s: found_read_time: %g (cur. read_time: %g)",
-                         param->table->key_info[keynr].name, found_read_time,
-                         read_time));
-
-      if (read_time > found_read_time && found_records != HA_POS_ERROR)
+      if (found_records != HA_POS_ERROR &&
+          read_time > (found_read_time= cost.total_cost()))
       {
         read_time=    found_read_time;
         best_records= found_records;
         key_to_read=  key;
+        best_mrr_flags= mrr_flags;
+        best_buf_size=  buf_size;
       }
-
     }
   }
 
@@ -4918,11 +4928,13 @@ static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree,
   if (key_to_read)
   {
     idx= key_to_read - tree->keys;
-    if ((read_plan= new (param->mem_root) TRP_RANGE(*key_to_read, idx)))
+    if ((read_plan= new (param->mem_root) TRP_RANGE(*key_to_read, idx,
+                                                    best_mrr_flags)))
     {
       read_plan->records= best_records;
       read_plan->is_ror= tree->ror_scans_map.is_set(idx);
       read_plan->read_cost= read_time;
+      read_plan->mrr_buf_size= best_buf_size;
       DBUG_PRINT("info",
                  ("Returning range plan for key %s, cost %g, records %lu",
                   param->table->key_info[param->real_keynr[idx]].name,
@@ -4985,7 +4997,9 @@ QUICK_SELECT_I *TRP_ROR_INTERSECT::make_quick(PARAM *param,
     for (; first_scan != last_scan;++first_scan)
     {
       if (!(quick= get_quick_select(param, (*first_scan)->idx,
-                                    (*first_scan)->sel_arg, alloc)) ||
+                                    (*first_scan)->sel_arg,
+                                    HA_MRR_USE_DEFAULT_IMPL | HA_MRR_SORTED,
+                                    0, alloc)) ||
           quick_intrsect->push_quick_back(quick))
       {
         delete quick_intrsect;
@@ -4995,7 +5009,9 @@ QUICK_SELECT_I *TRP_ROR_INTERSECT::make_quick(PARAM *param,
     if (cpk_scan)
     {
       if (!(quick= get_quick_select(param, cpk_scan->idx,
-                                    cpk_scan->sel_arg, alloc)))
+                                    cpk_scan->sel_arg,
+                                    HA_MRR_USE_DEFAULT_IMPL | HA_MRR_SORTED,
+                                    0, alloc)))
       {
         delete quick_intrsect;
         DBUG_RETURN(NULL);
@@ -7266,250 +7282,260 @@ void SEL_ARG::test_use_count(SEL_ARG *root)
 
 #endif
 
+/****************************************************************************
+  MRR Range Sequence Interface implementation that walks a SEL_ARG* tree.
+ ****************************************************************************/
+
+/* MRR range sequence, SEL_ARG* implementation: stack entry */
+typedef struct st_range_seq_entry 
+{
+  /* 
+    Pointers in min and max keys. They point to right-after-end of key
+    images. The 0-th entry has these pointing to key tuple start.
+  */
+  uchar *min_key, *max_key;
+  
+  /* 
+    Flags, for {keypart0, keypart1, ... this_keypart} subtuple.
+    min_key_flag may have NULL_RANGE set.
+  */
+  uint min_key_flag, max_key_flag;
+  
+  /* Number of key parts */
+  uint min_key_parts, max_key_parts;
+  SEL_ARG *key_tree;
+} RANGE_SEQ_ENTRY;
+
 
 /*
-  Calculate estimate of number records that will be retrieved by a range
-  scan on given index using given SEL_ARG intervals tree.
-  SYNOPSIS
-    check_quick_select
-      param  Parameter from test_quick_select
-      idx               Number of index to use in tree->keys
-      tree              Transformed selection condition, tree->keys[idx]
-                        holds the range tree to be used for scanning.
-      update_tbl_stats  If true, update table->quick_keys with information
-                        about range scan we've evaluated.
+  MRR range sequence, SEL_ARG* implementation: SEL_ARG graph traversal context
+*/
+typedef struct st_sel_arg_range_seq
+{
+  uint keyno;      /* index of used tree in SEL_TREE structure */
+  uint real_keyno; /* Number of the index in tables */
+  PARAM *param;
+  SEL_ARG *start; /* Root node of the traversed SEL_ARG* graph */
+  
+  RANGE_SEQ_ENTRY stack[MAX_REF_PARTS];
+  int i; /* Index of last used element in the above array */
+  
+  bool at_start; /* TRUE <=> The traversal has just started */
+} SEL_ARG_RANGE_SEQ;
 
-  NOTES
-    param->is_ror_scan is set to reflect if the key scan is a ROR (see
-    is_key_scan_ror function for more info)
-    param->table->quick_*, param->range_count (and maybe others) are
-    updated with data of given key scan, see check_quick_keys for details.
 
-  RETURN
-    Estimate # of records to be retrieved.
-    HA_POS_ERROR if estimate calculation failed due to table handler problems.
+/*
+  Range sequence interface, SEL_ARG* implementation: Initialize the traversal
+
+  SYNOPSIS
+    init()
+      init_params  SEL_ARG tree traversal context
+      n_ranges     [ignored] The number of ranges obtained 
+      flags        [ignored] HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY
 
+  RETURN
+    Value of init_param
 */
 
-static ha_rows
-check_quick_select(PARAM *param,uint idx,SEL_ARG *tree, bool update_tbl_stats)
+range_seq_t sel_arg_range_seq_init(void *init_param, uint n_ranges, uint flags)
 {
-  ha_rows records;
-  bool    cpk_scan;
-  uint key;
-  DBUG_ENTER("check_quick_select");
+  SEL_ARG_RANGE_SEQ *seq= (SEL_ARG_RANGE_SEQ*)init_param;
+  seq->at_start= TRUE;
+  seq->stack[0].key_tree= NULL;
+  seq->stack[0].min_key= seq->param->min_key;
+  seq->stack[0].min_key_flag= 0;
+  seq->stack[0].min_key_parts= 0;
 
-  param->is_ror_scan= FALSE;
-  param->first_null_comp= 0;
+  seq->stack[0].max_key= seq->param->max_key;
+  seq->stack[0].max_key_flag= 0;
+  seq->stack[0].max_key_parts= 0;
+  seq->i= 0;
+  return init_param;
+}
 
-  if (!tree)
-    DBUG_RETURN(HA_POS_ERROR);			// Can't use it
-  param->max_key_part=0;
-  param->range_count=0;
-  key= param->real_keynr[idx];
 
-  if (tree->type == SEL_ARG::IMPOSSIBLE)
-    DBUG_RETURN(0L);				// Impossible select. return
-  if (tree->type != SEL_ARG::KEY_RANGE || tree->part != 0)
-    DBUG_RETURN(HA_POS_ERROR);				// Don't use tree
+static void step_down_to(SEL_ARG_RANGE_SEQ *arg, SEL_ARG *key_tree)
+{
+  RANGE_SEQ_ENTRY *cur= &arg->stack[arg->i+1];
+  RANGE_SEQ_ENTRY *prev= &arg->stack[arg->i];
+  
+  cur->key_tree= key_tree;
+  cur->min_key= prev->min_key;
+  cur->max_key= prev->max_key;
+  cur->min_key_parts= prev->min_key_parts;
+  cur->max_key_parts= prev->max_key_parts;
 
-  enum ha_key_alg key_alg= param->table->key_info[key].algorithm;
-  if ((key_alg != HA_KEY_ALG_BTREE) && (key_alg!= HA_KEY_ALG_UNDEF))
-  {
-    /* Records are not ordered by rowid for other types of indexes. */
-    cpk_scan= FALSE;
-  }
-  else
-  {
-    /*
-      Clustered PK scan is a special case, check_quick_keys doesn't recognize
-      CPK scans as ROR scans (while actually any CPK scan is a ROR scan).
-    */
-    cpk_scan= ((param->table->s->primary_key == param->real_keynr[idx]) &&
-               param->table->file->primary_key_is_clustered());
-    param->is_ror_scan= !cpk_scan;
-  }
-  param->n_ranges= 0;
+  uint16 stor_length= arg->param->key[arg->keyno][key_tree->part].store_length;
+  cur->min_key_parts += key_tree->store_min(stor_length, &cur->min_key,
+                                            prev->min_key_flag);
+  cur->max_key_parts += key_tree->store_max(stor_length, &cur->max_key,
+                                            prev->max_key_flag);
 
-  records= check_quick_keys(param, idx, tree,
-                            param->min_key, 0, -1,
-                            param->max_key, 0, -1);
-  if (records != HA_POS_ERROR)
-  {
-    if (update_tbl_stats)
-    {
-      param->table->quick_keys.set_bit(key);
-      param->table->quick_key_parts[key]=param->max_key_part+1;
-      param->table->quick_n_ranges[key]= param->n_ranges;
-      param->table->quick_condition_rows=
-        min(param->table->quick_condition_rows, records);
-    }
-    /*
-      Need to save quick_rows in any case as it is used when calculating
-      cost of ROR intersection:
-    */
-    param->table->quick_rows[key]=records;
-    if (cpk_scan)
-      param->is_ror_scan= TRUE;
-  }
-  if (param->table->file->index_flags(key, 0, TRUE) & HA_KEY_SCAN_NOT_ROR)
-    param->is_ror_scan= FALSE;
-  DBUG_PRINT("exit", ("Records: %lu", (ulong) records));
-  DBUG_RETURN(records);
+  cur->min_key_flag= prev->min_key_flag | key_tree->min_flag;
+  cur->max_key_flag= prev->max_key_flag | key_tree->max_flag;
+
+  if (key_tree->is_null_interval())
+    cur->min_key_flag |= NULL_RANGE;
+  (arg->i)++;
 }
 
 
 /*
-  Recursively calculate estimate of # rows that will be retrieved by
-  key scan on key idx.
+  Range sequence interface, SEL_ARG* implementation: get the next interval
+  
   SYNOPSIS
-    check_quick_keys()
-      param         Parameter from test_quick select function.
-      idx           Number of key to use in PARAM::keys in list of used keys
-                    (param->real_keynr[idx] holds the key number in table)
-      key_tree      SEL_ARG tree being examined.
-      min_key       Buffer with partial min key value tuple
-      min_key_flag
-      max_key       Buffer with partial max key value tuple
-      max_key_flag
-
-  NOTES
-    The function does the recursive descent on the tree via SEL_ARG::left,
-    SEL_ARG::right, and SEL_ARG::next_key_part edges. The #rows estimates
-    are calculated using records_in_range calls at the leaf nodes and then
-    summed.
-
-    param->min_key and param->max_key are used to hold prefixes of key value
-    tuples.
-
-    The side effects are:
-
-    param->max_key_part is updated to hold the maximum number of key parts used
-      in scan minus 1.
+    sel_arg_range_seq_next()
+      rseq        Value returned from sel_arg_range_seq_init
+      range  OUT  Store information about the range here
 
-    param->range_count is incremented if the function finds a range that
-      wasn't counted by the caller.
+  DESCRIPTION
+    This is "get_next" function for Range sequence interface implementation
+    for SEL_ARG* tree.
 
-    param->is_ror_scan is cleared if the function detects that the key scan is
-      not a Rowid-Ordered Retrieval scan ( see comments for is_key_scan_ror
-      function for description of which key scans are ROR scans)
+  IMPLEMENTATION
+    The traversal also updates those param members:
+      - is_ror_scan
+      - range_count
+      - max_key_part
 
   RETURN
-    #records      E(#records) for given subtree
-    HA_POS_ERROR  if subtree cannot be used for record retrieval
-
+    0  Ok
+    1  No more ranges in the sequence
 */
 
-static ha_rows
-check_quick_keys(PARAM *param, uint idx, SEL_ARG *key_tree,
-		 uchar *min_key, uint min_key_flag, int min_keypart,
-                 uchar *max_key, uint max_key_flag, int max_keypart)
+//psergey-merge-todo: support check_quick_keys:max_keypart
+uint sel_arg_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
 {
-  ha_rows records=0, tmp;
-  uint tmp_min_flag, tmp_max_flag, keynr, min_key_length, max_key_length;
-  uint tmp_min_keypart= min_keypart, tmp_max_keypart= max_keypart;
-  uchar *tmp_min_key, *tmp_max_key;
-  uint8 save_first_null_comp= param->first_null_comp;
-
-  param->max_key_part=max(param->max_key_part,key_tree->part);
-  if (key_tree->left != &null_element)
+  SEL_ARG *key_tree;
+  SEL_ARG_RANGE_SEQ *seq= (SEL_ARG_RANGE_SEQ*)rseq;
+  if (seq->at_start)
   {
-    /*
-      There are at least two intervals for current key part, i.e. condition
-      was converted to something like
-        (keyXpartY less/equals c1) OR (keyXpartY more/equals c2).
-      This is not a ROR scan if the key is not Clustered Primary Key.
-    */
-    param->is_ror_scan= FALSE;
-    records=check_quick_keys(param, idx, key_tree->left,
-                             min_key, min_key_flag, min_keypart,
-			     max_key, max_key_flag, max_keypart);
-    if (records == HA_POS_ERROR)			// Impossible
-      return records;
+    key_tree= seq->start;
+    seq->at_start= FALSE;
+    goto walk_up_n_right;
   }
 
-  tmp_min_key= min_key;
-  tmp_max_key= max_key;
-  tmp_min_keypart+= key_tree->store_min(param->key[idx][key_tree->part].store_length,
-                                        &tmp_min_key, min_key_flag);
-  tmp_max_keypart+= key_tree->store_max(param->key[idx][key_tree->part].store_length,
-                                        &tmp_max_key, max_key_flag);
-  min_key_length= (uint) (tmp_min_key - param->min_key);
-  max_key_length= (uint) (tmp_max_key - param->max_key);
-
-  if (param->is_ror_scan)
+  key_tree= seq->stack[seq->i].key_tree;
+  /* Ok, we're at some "full tuple" position in the tree */
+ 
+  /* Step down if we can */
+  if (key_tree->next && key_tree->next != &null_element)
   {
-    /*
-      If the index doesn't cover entire key, mark the scan as non-ROR scan.
-      Actually we're cutting off some ROR scans here.
-    */
-    uint16 fieldnr= param->table->key_info[param->real_keynr[idx]].
-                    key_part[key_tree->part].fieldnr - 1;
-    if (param->table->field[fieldnr]->key_length() !=
-        param->key[idx][key_tree->part].length)
-      param->is_ror_scan= FALSE;
+    //step down; (update the tuple, we'll step right and stay there)
+    seq->i--;
+    step_down_to(seq, key_tree->next);
+    key_tree= key_tree->next;
+    seq->param->is_ror_scan= FALSE;
+    goto walk_right_n_up;
   }
 
-  if (!param->first_null_comp && key_tree->is_null_interval())
-    param->first_null_comp= key_tree->part+1;
+  /* Ok, can't step down, walk left until we can step down */
+  while (1)
+  {
+    if (seq->i == 1) // can't step left
+      return 1;
+    /* Step left */
+    seq->i--;
+    key_tree= seq->stack[seq->i].key_tree;
 
-  if (key_tree->next_key_part &&
-      key_tree->next_key_part->part == key_tree->part+1 &&
-      key_tree->next_key_part->type == SEL_ARG::KEY_RANGE)
-  {						// const key as prefix
-    if (min_key_length == max_key_length &&
-	!memcmp(min_key, max_key, (uint) (tmp_max_key - max_key)) &&
-	!key_tree->min_flag && !key_tree->max_flag)
-    {
-      tmp=check_quick_keys(param,idx,key_tree->next_key_part, tmp_min_key,
-                           min_key_flag | key_tree->min_flag, tmp_min_keypart,
-                           tmp_max_key, max_key_flag | key_tree->max_flag,
-                           tmp_max_keypart);
-      goto end;					// Ugly, but efficient
-    }
-    else
+    /* Step down if we can */
+    if (key_tree->next && key_tree->next != &null_element)
     {
-      /* The interval for current key part is not c1 <= keyXpartY <= c1 */
-      param->is_ror_scan= FALSE;
+      // Step down; update the tuple
+      seq->i--;
+      step_down_to(seq, key_tree->next);
+      key_tree= key_tree->next;
+      break;
     }
-
-    tmp_min_flag=key_tree->min_flag;
-    tmp_max_flag=key_tree->max_flag;
-    if (!tmp_min_flag)
-      tmp_min_keypart+=
-      key_tree->next_key_part->store_min_key(param->key[idx], &tmp_min_key,
-					     &tmp_min_flag);
-    if (!tmp_max_flag)
-      tmp_max_keypart+=
-      key_tree->next_key_part->store_max_key(param->key[idx], &tmp_max_key,
-					     &tmp_max_flag);
-    min_key_length= (uint) (tmp_min_key - param->min_key);
-    max_key_length= (uint) (tmp_max_key - param->max_key);
   }
-  else
+
+  /*
+    Ok, we've stepped down from the path to previous tuple.
+    Walk right-up while we can
+  */
+walk_right_n_up:
+  while (key_tree->next_key_part && key_tree->next_key_part != &null_element && 
+         key_tree->next_key_part->part == key_tree->part + 1 &&
+         key_tree->next_key_part->type == SEL_ARG::KEY_RANGE)
   {
-    tmp_min_flag= min_key_flag | key_tree->min_flag;
-    tmp_max_flag= max_key_flag | key_tree->max_flag;
+    {
+      RANGE_SEQ_ENTRY *cur= &seq->stack[seq->i];
+      uint min_key_length= cur->min_key - seq->param->min_key;
+      uint max_key_length= cur->max_key - seq->param->max_key;
+      uint len= cur->min_key - cur[-1].min_key;
+      if (!(min_key_length == max_key_length &&
+            !memcmp(cur[-1].min_key, cur[-1].max_key, len) &&
+            !key_tree->min_flag && !key_tree->max_flag))
+      {
+        seq->param->is_ror_scan= FALSE;
+        if (!key_tree->min_flag)
+          cur->min_key_parts += 
+            key_tree->next_key_part->store_min_key(seq->param->key[seq->keyno],
+                                                   &cur->min_key,
+                                                   &cur->min_key_flag);
+        if (!key_tree->max_flag)
+          cur->max_key_parts += 
+            key_tree->next_key_part->store_max_key(seq->param->key[seq->keyno],
+                                                   &cur->max_key,
+                                                   &cur->max_key_flag);
+        break;
+      }
+    }
+  
+    /*
+      Ok, current atomic interval is in form "t.field=const" and there is
+      next_key_part interval. Step right, and walk up from there.
+    */
+    key_tree= key_tree->next_key_part;
+
+walk_up_n_right:
+    while (key_tree->prev && key_tree->prev != &null_element)
+    {
+      /* Step up */
+      key_tree= key_tree->prev;
+    }
+    step_down_to(seq, key_tree);
   }
 
-  if (unlikely(param->thd->killed != 0))
-    return HA_POS_ERROR;
+  /* Ok got a tuple */
+  RANGE_SEQ_ENTRY *cur= &seq->stack[seq->i];
+  uint min_key_length= cur->min_key - seq->param->min_key;
   
-  keynr=param->real_keynr[idx];
-  param->range_count++;
-  if (!tmp_min_flag && ! tmp_max_flag &&
-      (uint) key_tree->part+1 == param->table->key_info[keynr].key_parts &&
-      (param->table->key_info[keynr].flags & (HA_NOSAME | HA_END_SPACE_KEY)) ==
-      HA_NOSAME && min_key_length == max_key_length &&
-      !memcmp(param->min_key, param->max_key, min_key_length) &&
-      !param->first_null_comp)
-  {
-    tmp=1;					// Max one record
-    param->n_ranges++;
+  range->ptr= (char*)(int)(key_tree->part);
+  if (cur->min_key_flag & GEOM_FLAG)
+  {
+    range->range_flag= cur->min_key_flag;
+
+    /* Here minimum contains also function code bits, and maximum is +inf */
+    range->start_key.key=    seq->param->min_key;
+    range->start_key.length= min_key_length;
+    range->start_key.flag=  (ha_rkey_function) (cur->min_key_flag ^ GEOM_FLAG);
   }
   else
   {
-    if (param->is_ror_scan)
+    range->range_flag= cur->min_key_flag | cur->max_key_flag;
+    
+    range->start_key.key=    seq->param->min_key;
+    range->start_key.length= cur->min_key - seq->param->min_key;
+    range->start_key.keypart_map= make_prev_keypart_map(cur->min_key_parts);
+    range->start_key.flag= (cur->min_key_flag & NEAR_MIN ? HA_READ_AFTER_KEY : 
+                                                           HA_READ_KEY_EXACT);
+
+    range->end_key.key=    seq->param->max_key;
+    range->end_key.length= cur->max_key - seq->param->max_key;
+    range->end_key.flag= (cur->max_key_flag & NEAR_MAX ? HA_READ_BEFORE_KEY : 
+                                                         HA_READ_AFTER_KEY);
+    range->end_key.keypart_map= make_prev_keypart_map(cur->max_key_parts);
+
+    if (!(cur->min_key_flag & ~NULL_RANGE) && !cur->max_key_flag &&
+        (uint)key_tree->part+1 == seq->param->table->key_info[seq->real_keyno].key_parts &&
+        (seq->param->table->key_info[seq->real_keyno].flags & (HA_NOSAME | HA_END_SPACE_KEY)) ==
+        HA_NOSAME &&
+        range->start_key.length == range->end_key.length &&
+        !memcmp(seq->param->min_key,seq->param->max_key,range->start_key.length))
+      range->range_flag= UNIQUE_RANGE | (cur->min_key_flag & NULL_RANGE);
+      
+    if (seq->param->is_ror_scan)
     {
       /*
         If we get here, the condition on the key was converted to form
@@ -7520,69 +7546,137 @@ check_quick_keys(PARAM *param, uint idx, SEL_ARG *key_tree,
           uncovered "tail" of KeyX parts is either empty or is identical to
           first members of clustered primary key.
       */
-      if (!(min_key_length == max_key_length &&
-            !memcmp(min_key, max_key, (uint) (tmp_max_key - max_key)) &&
-            !key_tree->min_flag && !key_tree->max_flag &&
-            is_key_scan_ror(param, keynr, key_tree->part + 1)))
-        param->is_ror_scan= FALSE;
+      if (!(!(cur->min_key_flag & ~NULL_RANGE) && !cur->max_key_flag &&
+            (range->start_key.length == range->end_key.length) &&
+            !memcmp(range->start_key.key, range->end_key.key, range->start_key.length) &&
+            is_key_scan_ror(seq->param, seq->real_keyno, key_tree->part + 1)))
+        seq->param->is_ror_scan= FALSE;
     }
-    param->n_ranges++;
+  }
+  seq->param->range_count++;
+  seq->param->max_key_part=max(seq->param->max_key_part,key_tree->part);
+  return 0;
+}
 
-    if (tmp_min_flag & GEOM_FLAG)
-    {
-      key_range min_range;
-      min_range.key=    param->min_key;
-      min_range.length= min_key_length;
-      min_range.keypart_map= make_keypart_map(tmp_min_keypart);
-      /* In this case tmp_min_flag contains the handler-read-function */
-      min_range.flag=   (ha_rkey_function) (tmp_min_flag ^ GEOM_FLAG);
 
-      tmp= param->table->file->records_in_range(keynr,
-                                                &min_range, (key_range*) 0);
-    }
-    else
+/*
+  Calculate cost and E(#rows) for a given index and intervals tree 
+
+  SYNOPSIS
+    check_quick_select()
+      param             Parameter from test_quick_select
+      idx               Number of index to use in PARAM::key SEL_TREE::key
+      index_only        TRUE  - assume only index tuples will be accessed
+                        FALSE - assume full table rows will be read
+      tree              Transformed selection condition, tree->key[idx] holds
+                        the intervals for the given index.
+      update_tbl_stats  TRUE <=> update table->quick_* with information
+                        about range scan we've evaluated.
+      mrr_flags   INOUT MRR access flags
+      cost        OUT   Scan cost
+
+  NOTES
+    param->is_ror_scan is set to reflect if the key scan is a ROR (see
+    is_key_scan_ror function for more info)
+    param->table->quick_*, param->range_count (and maybe others) are
+    updated with data of given key scan, see quick_range_seq_next for details.
+
+  RETURN
+    Estimate # of records to be retrieved.
+    HA_POS_ERROR if estimate calculation failed due to table handler problems.
+*/
+
+static
+ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
+                           SEL_ARG *tree, bool update_tbl_stats, 
+                           uint *mrr_flags, uint *bufsize, COST_VECT *cost)
+{
+  SEL_ARG_RANGE_SEQ seq;
+  RANGE_SEQ_IF seq_if = {sel_arg_range_seq_init, sel_arg_range_seq_next, 0, 0};
+  handler *file= param->table->file;
+  ha_rows rows;
+  uint keynr= param->real_keynr[idx];
+  DBUG_ENTER("check_quick_select");
+  
+  /* Handle cases when we don't have a valid non-empty list of range */
+  if (!tree)
+    DBUG_RETURN(HA_POS_ERROR);
+  if (tree->type == SEL_ARG::IMPOSSIBLE)
+    DBUG_RETURN(0L);
+  if (tree->type != SEL_ARG::KEY_RANGE || tree->part != 0)
+    DBUG_RETURN(HA_POS_ERROR);
+
+  seq.keyno= idx;
+  seq.real_keyno= keynr;
+  seq.param= param;
+  seq.start= tree;
+
+  param->range_count=0;
+  param->max_key_part=0;
+
+  param->is_ror_scan= TRUE;
+  if (file->index_flags(keynr, 0, TRUE) & HA_KEY_SCAN_NOT_ROR)
+    param->is_ror_scan= FALSE;
+  
+  *mrr_flags= param->force_default_mrr? HA_MRR_USE_DEFAULT_IMPL: 0;
+  /*
+    Pass HA_MRR_SORTED to see if MRR implementation can handle sorting.
+  */
+  *mrr_flags|= HA_MRR_NO_ASSOCIATION | HA_MRR_SORTED;
+
+  bool pk_is_clustered= file->primary_key_is_clustered();
+  if (index_only && 
+      (file->index_flags(keynr, param->max_key_part, 1) & HA_KEYREAD_ONLY) &&
+      !(pk_is_clustered && keynr == param->table->s->primary_key))
+     *mrr_flags |= HA_MRR_INDEX_ONLY;
+  
+  if (current_thd->lex->sql_command != SQLCOM_SELECT)
+    *mrr_flags |= HA_MRR_USE_DEFAULT_IMPL;
+
+  *bufsize= param->thd->variables.read_rnd_buff_size;
+  rows= file->multi_range_read_info_const(keynr, &seq_if, (void*)&seq, 0,
+                                          bufsize, mrr_flags, cost);
+  if (rows != HA_POS_ERROR)
+  {
+    param->table->quick_rows[keynr]=rows;
+    if (update_tbl_stats)
     {
-      key_range min_range, max_range;
-
-      min_range.key=    param->min_key;
-      min_range.length= min_key_length;
-      min_range.flag=   (tmp_min_flag & NEAR_MIN ? HA_READ_AFTER_KEY :
-                         HA_READ_KEY_EXACT);
-      min_range.keypart_map= make_keypart_map(tmp_min_keypart);
-      max_range.key=    param->max_key;
-      max_range.length= max_key_length;
-      max_range.flag=   (tmp_max_flag & NEAR_MAX ?
-                         HA_READ_BEFORE_KEY : HA_READ_AFTER_KEY);
-      max_range.keypart_map= make_keypart_map(tmp_max_keypart);
-      tmp=param->table->file->records_in_range(keynr,
-                                               (min_key_length ? &min_range :
-                                                (key_range*) 0),
-                                               (max_key_length ? &max_range :
-                                                (key_range*) 0));
+      param->table->quick_keys.set_bit(keynr);
+      param->table->quick_key_parts[keynr]=param->max_key_part+1;
+      param->table->quick_n_ranges[keynr]= param->range_count;
+      param->table->quick_condition_rows=
+        min(param->table->quick_condition_rows, rows);
     }
   }
- end:
-  if (tmp == HA_POS_ERROR)			// Impossible range
-    return tmp;
-  records+=tmp;
-  if (key_tree->right != &null_element)
+  /* Figure out if the key scan is ROR (returns rows in ROWID order) or not */
+  enum ha_key_alg key_alg= param->table->key_info[seq.real_keyno].algorithm;
+  if ((key_alg != HA_KEY_ALG_BTREE) && (key_alg!= HA_KEY_ALG_UNDEF))
   {
-    /*
-      There are at least two intervals for current key part, i.e. condition
-      was converted to something like
-        (keyXpartY less/equals c1) OR (keyXpartY more/equals c2).
-      This is not a ROR scan if the key is not Clustered Primary Key.
+    /* 
+      All scans are non-ROR scans for those index types.
+      TODO: Don't have this logic here, make table engines return 
+      appropriate flags instead.
+    */
+    param->is_ror_scan= FALSE;
+  }
+  else if (param->table->s->primary_key == keynr && pk_is_clustered)
+  {
+    /* Clustered PK scan is always a ROR scan (TODO: same as above) */
+    param->is_ror_scan= TRUE;
+  }
+  else if (param->range_count > 1)
+  {
+    /* 
+      Scaning multiple key values in the index: the records are ROR
+      for each value, but not between values. E.g, "SELECT ... x IN
+      (1,3)" returns ROR order for all records with x=1, then ROR
+      order for records with x=3
     */
     param->is_ror_scan= FALSE;
-    tmp=check_quick_keys(param, idx, key_tree->right,
-                         min_key, min_key_flag, min_keypart,
-                         max_key, max_key_flag, max_keypart);
-    if (tmp == HA_POS_ERROR)
-      return tmp;
-    records+=tmp;
   }
-  param->first_null_comp= save_first_null_comp;
-  return records;
+
+  DBUG_PRINT("exit", ("Records: %lu", (ulong) rows));
+  DBUG_RETURN(rows); //psergey-merge:todo: maintain first_null_comp.
 }
 
 
@@ -7610,13 +7704,14 @@ check_quick_keys(PARAM *param, uint idx, SEL_ARG *key_tree,
     where the index is defined on (key1_1, ..., key1_N [,a_1, ..., a_n])
 
     and the table has a clustered Primary Key defined as 
-
       PRIMARY KEY(a_1, ..., a_n, b1, ..., b_k) 
     
     i.e. the first key parts of it are identical to uncovered parts ot the 
     key being scanned. This function assumes that the index flags do not
     include HA_KEY_SCAN_NOT_ROR flag (that is checked elsewhere).
 
+    Check (1) is made in quick_range_seq_next()
+
   RETURN
     TRUE   The scan is ROR-scan
     FALSE  Otherwise
@@ -7629,9 +7724,19 @@ static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts)
   KEY_PART_INFO *key_part_end= (table_key->key_part +
                                 table_key->key_parts);
   uint pk_number;
+  
+  for (KEY_PART_INFO *kp= table_key->key_part; kp < key_part; kp++)
+  {
+    uint16 fieldnr= param->table->key_info[keynr].
+                    key_part[kp - table_key->key_part].fieldnr - 1;
+    if (param->table->field[fieldnr]->key_length() != kp->length)
+      return FALSE;
+  }
 
   if (key_part == key_part_end)
     return TRUE;
+
+  key_part= table_key->key_part + nparts;
   pk_number= param->table->s->primary_key;
   if (!param->table->file->primary_key_is_clustered() || pk_number == MAX_KEY)
     return FALSE;
@@ -7656,12 +7761,14 @@ static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts)
   SYNOPSIS
     get_quick_select()
       param
-      idx          Index of used key in param->key.
-      key_tree     SEL_ARG tree for the used key
-      parent_alloc If not NULL, use it to allocate memory for
-                   quick select data. Otherwise use quick->alloc.
+      idx            Index of used key in param->key.
+      key_tree       SEL_ARG tree for the used key
+      mrr_flags      MRR parameter for quick select
+      mrr_buf_size   MRR parameter for quick select
+      parent_alloc   If not NULL, use it to allocate memory for
+                     quick select data. Otherwise use quick->alloc.
   NOTES
-    The caller must call QUICK_SELECT::init for returned quick select
+    The caller must call QUICK_SELECT::init for returned quick select.
 
     CAUTION! This function may change thd->mem_root to a MEM_ROOT which will be
     deallocated when the returned quick select is deleted.
@@ -7672,25 +7779,26 @@ static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts)
 */
 
 QUICK_RANGE_SELECT *
-get_quick_select(PARAM *param,uint idx,SEL_ARG *key_tree,
-                 MEM_ROOT *parent_alloc)
+get_quick_select(PARAM *param,uint idx,SEL_ARG *key_tree, uint mrr_flags,
+                 uint mrr_buf_size, MEM_ROOT *parent_alloc)
 {
   QUICK_RANGE_SELECT *quick;
+  bool create_err= FALSE;
   DBUG_ENTER("get_quick_select");
 
   if (param->table->key_info[param->real_keynr[idx]].flags & HA_SPATIAL)
     quick=new QUICK_RANGE_SELECT_GEOM(param->thd, param->table,
                                       param->real_keynr[idx],
                                       test(parent_alloc),
-                                      parent_alloc);
+                                      parent_alloc, &create_err);
   else
     quick=new QUICK_RANGE_SELECT(param->thd, param->table,
                                  param->real_keynr[idx],
-                                 test(parent_alloc));
+                                 test(parent_alloc), NULL, &create_err);
 
   if (quick)
   {
-    if (quick->error ||
+    if (create_err ||
 	get_quick_keys(param,quick,param->key[idx],key_tree,param->min_key,0,
 		       param->max_key,0))
     {
@@ -7699,6 +7807,8 @@ get_quick_select(PARAM *param,uint idx,SEL_ARG *key_tree,
     }
     else
     {
+      quick->mrr_flags= mrr_flags;
+      quick->mrr_buf_size= mrr_buf_size;
       quick->key_parts=(KEY_PART*)
         memdup_root(parent_alloc? parent_alloc : &quick->alloc,
                     (char*) param->key[idx],
@@ -7847,7 +7957,20 @@ bool QUICK_RANGE_SELECT::unique_key_range()
 }
 
 
-/* Returns TRUE if any part of the key is NULL */
+
+/*
+  Return TRUE if any part of the key is NULL
+
+  SYNOPSIS
+    null_part_in_key()    
+      key_part  Array of key parts (index description)
+      key       Key values tuple
+      length    Length of key values tuple in bytes.
+
+  RETURN
+    TRUE   The tuple has at least one "keypartX is NULL"
+    FALSE  Otherwise
+*/
 
 static bool null_part_in_key(KEY_PART *key_part, const uchar *key, uint length)
 {
@@ -7904,6 +8027,19 @@ bool QUICK_ROR_UNION_SELECT::is_keys_used(const MY_BITMAP *fields)
 }
 
 
+FT_SELECT *get_ft_select(THD *thd, TABLE *table, uint key)
+{
+  bool create_err= FALSE;
+  FT_SELECT *fts= new FT_SELECT(thd, table, key, &create_err);
+  if (create_err)
+  {
+    delete fts;
+    return NULL;
+  }
+  else
+    return fts;
+}
+
 /*
   Create quick select from ref/ref_or_null scan.
 
@@ -7932,10 +8068,12 @@ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
   KEY_PART *key_part;
   QUICK_RANGE *range;
   uint part;
+  bool create_err= FALSE;
+  COST_VECT cost;
 
   old_root= thd->mem_root;
   /* The following call may change thd->mem_root */
-  quick= new QUICK_RANGE_SELECT(thd, table, ref->key, 0);
+  quick= new QUICK_RANGE_SELECT(thd, table, ref->key, 0, 0, &create_err);
   /* save mem_root set by QUICK_RANGE_SELECT constructor */
   alloc= thd->mem_root;
   /*
@@ -7944,7 +8082,7 @@ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
   */
   thd->mem_root= old_root;
 
-  if (!quick)
+  if (!quick || create_err)
     return 0;			/* no ranges found */
   if (quick->init())
     goto err;
@@ -7999,8 +8137,24 @@ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
       goto err;
   }
 
-  return quick;
+  /* Call multi_range_read_info() to get the MRR flags and buffer size */
+  quick->mrr_flags= HA_MRR_NO_ASSOCIATION | 
+                    (table->key_read ? HA_MRR_INDEX_ONLY : 0);
+  if (thd->lex->sql_command != SQLCOM_SELECT)
+    quick->mrr_flags |= HA_MRR_USE_DEFAULT_IMPL;
+#ifdef WITH_NDBCLUSTER_STORAGE_ENGINE
+  if (!ref->null_ref_key && !key_has_nulls(key_info, range->min_key,
+                                           ref->key_length))
+    quick->mrr_flags |= HA_MRR_NO_NULL_ENDPOINTS;
+#endif
 
+  quick->mrr_buf_size= thd->variables.read_rnd_buff_size;
+  if (table->file->multi_range_read_info(quick->index, 1, (uint)records,
+                                         &quick->mrr_buf_size,
+                                         &quick->mrr_flags, &cost))
+    goto err;
+
+  return quick;
 err:
   delete quick;
   return 0;
@@ -8312,80 +8466,123 @@ int QUICK_ROR_UNION_SELECT::get_next()
 
 int QUICK_RANGE_SELECT::reset()
 {
-  uint  mrange_bufsiz;
+  uint  buf_size;
   uchar *mrange_buff;
+  int   error;
+  HANDLER_BUFFER empty_buf;
   DBUG_ENTER("QUICK_RANGE_SELECT::reset");
-  next=0;
   last_range= NULL;
-  in_range= FALSE;
   cur_range= (QUICK_RANGE**) ranges.buffer;
 
   if (file->inited == handler::NONE && (error= file->ha_index_init(index,1)))
     DBUG_RETURN(error);
  
-  /* Do not allocate the buffers twice. */
-  if (multi_range_length)
-  {
-    DBUG_ASSERT(multi_range_length == min(multi_range_count, ranges.elements));
-    DBUG_RETURN(0);
-  }
-
-  /* Allocate the ranges array. */
-  DBUG_ASSERT(ranges.elements);
-  multi_range_length= min(multi_range_count, ranges.elements);
-  DBUG_ASSERT(multi_range_length > 0);
-  while (multi_range_length && ! (multi_range= (KEY_MULTI_RANGE*)
-                                  my_malloc(multi_range_length *
-                                            sizeof(KEY_MULTI_RANGE),
-                                            MYF(MY_WME))))
-  {
-    /* Try to shrink the buffers until it is 0. */
-    multi_range_length/= 2;
-  }
-  if (! multi_range)
-  {
-    multi_range_length= 0;
-    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
-  }
-
-  /* Allocate the handler buffer if necessary.  */
-  if (file->ha_table_flags() & HA_NEED_READ_RANGE_BUFFER)
+  /* Allocate buffer if we need one but haven't allocated it yet */
+  if (mrr_buf_size && !mrr_buf_desc)
   {
-    mrange_bufsiz= min(multi_range_bufsiz,
-                       ((uint)QUICK_SELECT_I::records + 1)* head->s->reclength);
-
-    while (mrange_bufsiz &&
-           ! my_multi_malloc(MYF(MY_WME),
-                             &multi_range_buff,
-                             (uint) sizeof(*multi_range_buff),
-                             &mrange_buff, (uint) mrange_bufsiz,
-                             NullS))
+    buf_size= mrr_buf_size;
+    while (buf_size && !my_multi_malloc(MYF(MY_WME),
+                                        &mrr_buf_desc, sizeof(*mrr_buf_desc),
+                                        &mrange_buff, buf_size,
+                                        NullS))
     {
       /* Try to shrink the buffers until both are 0. */
-      mrange_bufsiz/= 2;
+      buf_size/= 2;
     }
-    if (! multi_range_buff)
-    {
-      my_free((char*) multi_range, MYF(0));
-      multi_range= NULL;
-      multi_range_length= 0;
+    if (!mrr_buf_desc)
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
-    }
 
     /* Initialize the handler buffer. */
-    multi_range_buff->buffer= mrange_buff;
-    multi_range_buff->buffer_end= mrange_buff + mrange_bufsiz;
-    multi_range_buff->end_of_used_area= mrange_buff;
-#ifdef HAVE_valgrind
+    mrr_buf_desc->buffer= mrange_buff;
+    mrr_buf_desc->buffer_end= mrange_buff + buf_size;
+    mrr_buf_desc->end_of_used_area= mrange_buff;
+#ifdef HAVE_purify
     /*
       We need this until ndb will use the buffer efficiently
       (Now ndb stores  complete row in here, instead of only the used fields
       which gives us valgrind warnings in compare_record[])
     */
-    bzero((char*) mrange_buff, mrange_bufsiz);
+    bzero((char*) mrange_buff, buf_size);
 #endif
   }
-  DBUG_RETURN(0);
+
+  if (!mrr_buf_desc)
+    empty_buf.buffer= empty_buf.buffer_end= empty_buf.end_of_used_area= NULL;
+ 
+  RANGE_SEQ_IF seq_funcs= {quick_range_seq_init, quick_range_seq_next, 0, 0};
+  error= file->multi_range_read_init(&seq_funcs, (void*)this, ranges.elements,
+                                     mrr_flags, mrr_buf_desc? mrr_buf_desc: 
+                                                              &empty_buf);
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Range sequence interface implementation for array<QUICK_RANGE>: initialize
+  
+  SYNOPSIS
+    quick_range_seq_init()
+      init_param  Caller-opaque paramenter: QUICK_RANGE_SELECT* pointer
+      n_ranges    Number of ranges in the sequence (ignored)
+      flags       MRR flags (currently not used) 
+
+  RETURN
+    Opaque value to be passed to quick_range_seq_next
+*/
+
+range_seq_t quick_range_seq_init(void *init_param, uint n_ranges, uint flags)
+{
+  QUICK_RANGE_SELECT *quick= (QUICK_RANGE_SELECT*)init_param;
+  quick->qr_traversal_ctx.first=  (QUICK_RANGE**)quick->ranges.buffer;
+  quick->qr_traversal_ctx.cur=    (QUICK_RANGE**)quick->ranges.buffer;
+  quick->qr_traversal_ctx.last=   quick->qr_traversal_ctx.cur + 
+                                  quick->ranges.elements;
+  return &quick->qr_traversal_ctx;
+}
+
+
+/*
+  Range sequence interface implementation for array<QUICK_RANGE>: get next
+  
+  SYNOPSIS
+    quick_range_seq_next()
+      rseq        Value returned from quick_range_seq_init
+      range  OUT  Store information about the range here
+
+  RETURN
+    0  Ok
+    1  No more ranges in the sequence
+*/
+
+uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+{
+  QUICK_RANGE_SEQ_CTX *ctx= (QUICK_RANGE_SEQ_CTX*)rseq;
+
+  if (ctx->cur == ctx->last)
+    return 1; /* no more ranges */
+
+  QUICK_RANGE *cur= *(ctx->cur);
+  key_range *start_key= &range->start_key;
+  key_range *end_key=   &range->end_key;
+
+  start_key->key=    cur->min_key;
+  start_key->length= cur->min_length;
+  start_key->keypart_map= cur->min_keypart_map;
+  start_key->flag=   ((cur->flag & NEAR_MIN) ? HA_READ_AFTER_KEY :
+                      (cur->flag & EQ_RANGE) ?
+                      HA_READ_KEY_EXACT : HA_READ_KEY_OR_NEXT);
+  end_key->key=      cur->max_key;
+  end_key->length=   cur->max_length;
+  end_key->keypart_map= cur->max_keypart_map;
+  /*
+    We use HA_READ_AFTER_KEY here because if we are reading on a key
+    prefix. We want to find all keys with this prefix.
+  */
+  end_key->flag=     (cur->flag & NEAR_MAX ? HA_READ_BEFORE_KEY :
+                      HA_READ_AFTER_KEY);
+  range->range_flag= cur->flag;
+  ctx->cur++;
+  return 0;
 }
 
 
@@ -8406,15 +8603,8 @@ int QUICK_RANGE_SELECT::reset()
 
 int QUICK_RANGE_SELECT::get_next()
 {
-  int             result;
-  KEY_MULTI_RANGE *mrange;
-  key_range       *start_key;
-  key_range       *end_key;
+  char *dummy;
   DBUG_ENTER("QUICK_RANGE_SELECT::get_next");
-  DBUG_ASSERT(multi_range_length && multi_range &&
-              (cur_range >= (QUICK_RANGE**) ranges.buffer) &&
-              (cur_range <= (QUICK_RANGE**) ranges.buffer + ranges.elements));
-
   if (in_ror_merged_scan)
   {
     /*
@@ -8424,63 +8614,8 @@ int QUICK_RANGE_SELECT::get_next()
     head->column_bitmaps_set_no_signal(&column_bitmap, &column_bitmap);
   }
 
-  for (;;)
-  {
-    if (in_range)
-    {
-      /* We did already start to read this key. */
-      result= file->read_multi_range_next(&mrange);
-      if (result != HA_ERR_END_OF_FILE)
-        goto end;
-    }
+  int result= file->multi_range_read_next(&dummy);
 
-    uint count= min(multi_range_length, ranges.elements -
-                    (cur_range - (QUICK_RANGE**) ranges.buffer));
-    if (count == 0)
-    {
-      /* Ranges have already been used up before. None is left for read. */
-      in_range= FALSE;
-      if (in_ror_merged_scan)
-        head->column_bitmaps_set_no_signal(save_read_set, save_write_set);
-      DBUG_RETURN(HA_ERR_END_OF_FILE);
-    }
-    KEY_MULTI_RANGE *mrange_slot, *mrange_end;
-    for (mrange_slot= multi_range, mrange_end= mrange_slot+count;
-         mrange_slot < mrange_end;
-         mrange_slot++)
-    {
-      start_key= &mrange_slot->start_key;
-      end_key= &mrange_slot->end_key;
-      last_range= *(cur_range++);
-
-      start_key->key=    (const uchar*) last_range->min_key;
-      start_key->length= last_range->min_length;
-      start_key->flag=   ((last_range->flag & NEAR_MIN) ? HA_READ_AFTER_KEY :
-                          (last_range->flag & EQ_RANGE) ?
-                          HA_READ_KEY_EXACT : HA_READ_KEY_OR_NEXT);
-      start_key->keypart_map= last_range->min_keypart_map;
-      end_key->key=      (const uchar*) last_range->max_key;
-      end_key->length=   last_range->max_length;
-      /*
-        We use HA_READ_AFTER_KEY here because if we are reading on a key
-        prefix. We want to find all keys with this prefix.
-      */
-      end_key->flag=     (last_range->flag & NEAR_MAX ? HA_READ_BEFORE_KEY :
-                          HA_READ_AFTER_KEY);
-      end_key->keypart_map= last_range->max_keypart_map;
-
-      mrange_slot->range_flag= last_range->flag;
-    }
-
-    result= file->read_multi_range_first(&mrange, multi_range, count,
-                                         sorted, multi_range_buff);
-    if (result != HA_ERR_END_OF_FILE)
-      goto end;
-    in_range= FALSE; /* No matching rows; go to next set of ranges. */
-  }
-
-end:
-  in_range= ! result;
   if (in_ror_merged_scan)
   {
     /* Restore bitmaps set on entry */
@@ -8489,6 +8624,7 @@ end:
   DBUG_RETURN(result);
 }
 
+
 /*
   Get the next record with a different prefix.
 
@@ -8529,9 +8665,7 @@ int QUICK_RANGE_SELECT::get_next_prefix(uint prefix_length,
     key_range start_key, end_key;
     if (last_range)
     {
-      /*
-        Read the next record in the same range with prefix after cur_prefix.
-       */
+      /* Read the next record in the same range with prefix after cur_prefix. */
       DBUG_ASSERT(cur_prefix != 0);
       result= file->ha_index_read_map(record, cur_prefix, keypart_map,
                                       HA_READ_AFTER_KEY);
@@ -8667,7 +8801,8 @@ bool QUICK_RANGE_SELECT::row_in_ranges()
  */
 
 QUICK_SELECT_DESC::QUICK_SELECT_DESC(QUICK_RANGE_SELECT *q,
-                                     uint used_key_parts_arg)
+                                     uint used_key_parts_arg,
+                                     bool *create_err)
  :QUICK_RANGE_SELECT(*q), rev_it(rev_ranges),
   used_key_parts (used_key_parts_arg)
 {
@@ -8676,9 +8811,9 @@ QUICK_SELECT_DESC::QUICK_SELECT_DESC(QUICK_RANGE_SELECT *q,
     Use default MRR implementation for reverse scans. No table engine
     currently can do an MRR scan with output in reverse index order.
   */
-  multi_range_length= 0;
-  multi_range= NULL;
-  multi_range_buff= NULL;
+  mrr_buf_desc= NULL;
+  mrr_flags |= HA_MRR_USE_DEFAULT_IMPL;
+  mrr_buf_size= 0;
 
   QUICK_RANGE **pr= (QUICK_RANGE**)ranges.buffer;
   QUICK_RANGE **end_range= pr + ranges.elements;
@@ -8787,6 +8922,7 @@ int QUICK_SELECT_DESC::get_next()
 /*
   Compare if found key is over max-value
   Returns 0 if key <= range->max_key
+  TODO: Figure out why can't this function be as simple as cmp_prev(). 
 */
 
 int QUICK_RANGE_SELECT::cmp_next(QUICK_RANGE *range_arg)
@@ -9518,8 +9654,14 @@ get_best_group_min_max(PARAM *param, SEL_TREE *tree)
       cur_index_tree= get_index_range_tree(cur_index, tree, param,
                                            &cur_param_idx);
       /* Check if this range tree can be used for prefix retrieval. */
+      COST_VECT dummy_cost;
+      uint mrr_flags= HA_MRR_USE_DEFAULT_IMPL;
+      uint mrr_bufsize=0;
       cur_quick_prefix_records= check_quick_select(param, cur_param_idx,
-                                                    cur_index_tree, TRUE);
+                                                   FALSE /*don't care(*/,
+                                                   cur_index_tree, TRUE,
+                                                   &mrr_flags, &mrr_bufsize,
+                                                   &dummy_cost);
     }
     cost_group_min_max(table, cur_index_info, cur_used_key_parts,
                        cur_group_key_parts, tree, cur_index_tree,
@@ -10103,6 +10245,7 @@ TRP_GROUP_MIN_MAX::make_quick(PARAM *param, bool retrieve_full_rows,
       /* Make a QUICK_RANGE_SELECT to be used for group prefix retrieval. */
       quick->quick_prefix_select= get_quick_select(param, param_idx,
                                                    index_tree,
+                                                   HA_MRR_USE_DEFAULT_IMPL, 0,
                                                    &quick->alloc);
 
     /*
@@ -11209,6 +11352,7 @@ static void print_ror_scans_arr(TABLE *table, const char *msg,
   DBUG_VOID_RETURN;
 }
 
+
 /*****************************************************************************
 ** Print a quick range for debugging
 ** TODO:
diff --git a/sql/opt_range.h b/sql/opt_range.h
index 6f7ef7842e7..dfd0cdac003 100644
--- a/sql/opt_range.h
+++ b/sql/opt_range.h
@@ -114,13 +114,16 @@ class QUICK_RANGE :public Sql_alloc {
 
   4. Delete the select:
     delete quick;
-
+  
+  NOTE 
+    quick select doesn't use Sql_alloc/MEM_ROOT allocation because "range
+    checked for each record" functionality may create/destroy
+    O(#records_in_some_table) quick selects during query execution.
 */
 
 class QUICK_SELECT_I
 {
 public:
-  bool sorted;
   ha_rows records;  /* estimate of # of records to be retrieved */
   double  read_time; /* time to perform this retrieval          */
   TABLE   *head;
@@ -193,6 +196,12 @@ public:
   virtual bool reverse_sorted() = 0;
   virtual bool unique_key_range() { return false; }
 
+  /*
+    Request that this quick select produces sorted output. Not all quick
+    selects can do it, the caller is responsible for calling this function
+    only for those quick selects that can.
+  */
+  virtual void need_sorted_output() = 0;
   enum {
     QS_TYPE_RANGE = 0,
     QS_TYPE_INDEX_MERGE = 1,
@@ -272,6 +281,22 @@ struct st_qsel_param;
 class PARAM;
 class SEL_ARG;
 
+
+/*
+  MRR range sequence, array<QUICK_RANGE> implementation: sequence traversal
+  context.
+*/
+typedef struct st_quick_range_seq_ctx
+{
+  QUICK_RANGE **first;
+  QUICK_RANGE **cur;
+  QUICK_RANGE **last;
+} QUICK_RANGE_SEQ_CTX;
+
+range_seq_t quick_range_seq_init(void *init_param, uint n_ranges, uint flags);
+uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
+
+
 /*
   Quick select that does a range scan on a single key. The records are
   returned in key order.
@@ -279,60 +304,45 @@ class SEL_ARG;
 class QUICK_RANGE_SELECT : public QUICK_SELECT_I
 {
 protected:
-  bool next,dont_free,in_ror_merged_scan;
-public:
-  int error;
-protected:
   handler *file;
-  /*
-    If true, this quick select has its "own" handler object which should be
-    closed no later then this quick select is deleted.
-  */
-  bool free_file;
-  bool in_range;
-  uint multi_range_count; /* copy from thd->variables.multi_range_count */
-  uint multi_range_length; /* the allocated length for the array */
-  uint multi_range_bufsiz; /* copy from thd->variables.read_rnd_buff_size */
-  KEY_MULTI_RANGE *multi_range; /* the multi-range array (allocated and
-                                       freed by QUICK_RANGE_SELECT) */
-  HANDLER_BUFFER *multi_range_buff; /* the handler buffer (allocated and
-                                       freed by QUICK_RANGE_SELECT) */
+
+  /* Members to deal with case when this quick select is a ROR-merged scan */
+  bool in_ror_merged_scan;
   MY_BITMAP column_bitmap, *save_read_set, *save_write_set;
+  bool free_file;   /* TRUE <=> this->file is "owned" by this quick select */
 
-  friend class TRP_ROR_INTERSECT;
-  friend
-  QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
-                                               struct st_table_ref *ref,
-                                               ha_rows records);
-  friend bool get_quick_keys(PARAM *param,
-                             QUICK_RANGE_SELECT *quick,KEY_PART *key,
-                             SEL_ARG *key_tree,
-                             uchar *min_key, uint min_key_flag,
-                             uchar *max_key, uint max_key_flag);
-  friend QUICK_RANGE_SELECT *get_quick_select(PARAM*,uint idx,
-                                              SEL_ARG *key_tree,
-                                              MEM_ROOT *alloc);
-  friend class QUICK_SELECT_DESC;
-  friend class QUICK_INDEX_MERGE_SELECT;
-  friend class QUICK_ROR_INTERSECT_SELECT;
-  friend class QUICK_GROUP_MIN_MAX_SELECT;
+  /* Range pointers to be used when not using MRR interface */
+  /* Members needed to use the MRR interface */
+  QUICK_RANGE_SEQ_CTX qr_traversal_ctx;
+public:
+  uint mrr_flags; /* Flags to be used with MRR interface */
+protected:
+  uint mrr_buf_size; /* copy from thd->variables.read_rnd_buff_size */  
+  HANDLER_BUFFER *mrr_buf_desc; /* the handler buffer */
 
+  /* Info about index we're scanning */
+  
   DYNAMIC_ARRAY ranges;     /* ordered array of range ptrs */
   QUICK_RANGE **cur_range;  /* current element in ranges  */
-
+  
   QUICK_RANGE *last_range;
+  
   KEY_PART *key_parts;
   KEY_PART_INFO *key_part_info;
+  
+  bool dont_free; /* Used by QUICK_SELECT_DESC */
+
   int cmp_next(QUICK_RANGE *range);
   int cmp_prev(QUICK_RANGE *range);
   bool row_in_ranges();
 public:
   MEM_ROOT alloc;
 
-  QUICK_RANGE_SELECT(THD *thd, TABLE *table,uint index_arg,bool no_alloc=0,
-                     MEM_ROOT *parent_alloc=NULL);
+  QUICK_RANGE_SELECT(THD *thd, TABLE *table,uint index_arg,bool no_alloc,
+                     MEM_ROOT *parent_alloc, bool *create_err);
   ~QUICK_RANGE_SELECT();
-
+  
+  void need_sorted_output();
   int init();
   int reset(void);
   int get_next();
@@ -352,6 +362,27 @@ public:
 #endif
 private:
   /* Default copy ctor used by QUICK_SELECT_DESC */
+  friend class TRP_ROR_INTERSECT;
+  friend
+  QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
+                                               struct st_table_ref *ref,
+                                               ha_rows records);
+  friend bool get_quick_keys(PARAM *param, QUICK_RANGE_SELECT *quick, 
+                             KEY_PART *key, SEL_ARG *key_tree, 
+                             uchar *min_key, uint min_key_flag,
+                             uchar *max_key, uint max_key_flag);
+  friend QUICK_RANGE_SELECT *get_quick_select(PARAM*,uint idx,
+                                              SEL_ARG *key_tree,
+                                              uint mrr_flags,
+                                              uint mrr_buf_size,
+                                              MEM_ROOT *alloc);
+  friend class QUICK_SELECT_DESC;
+  friend class QUICK_INDEX_MERGE_SELECT;
+  friend class QUICK_ROR_INTERSECT_SELECT;
+  friend class QUICK_GROUP_MIN_MAX_SELECT;
+  friend uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
+  friend range_seq_t quick_range_seq_init(void *init_param,
+                                          uint n_ranges, uint flags);
 };
 
 
@@ -359,8 +390,10 @@ class QUICK_RANGE_SELECT_GEOM: public QUICK_RANGE_SELECT
 {
 public:
   QUICK_RANGE_SELECT_GEOM(THD *thd, TABLE *table, uint index_arg,
-                          bool no_alloc, MEM_ROOT *parent_alloc)
-    :QUICK_RANGE_SELECT(thd, table, index_arg, no_alloc, parent_alloc)
+                          bool no_alloc, MEM_ROOT *parent_alloc, 
+                          bool *create_err)
+    :QUICK_RANGE_SELECT(thd, table, index_arg, no_alloc, parent_alloc,
+    create_err)
     {};
   virtual int get_next();
 };
@@ -432,6 +465,7 @@ public:
   ~QUICK_INDEX_MERGE_SELECT();
 
   int  init();
+  void need_sorted_output() { DBUG_ASSERT(0); /* Can't do it */ }
   int  reset(void);
   int  get_next();
   bool reverse_sorted() { return false; }
@@ -491,6 +525,7 @@ public:
   ~QUICK_ROR_INTERSECT_SELECT();
 
   int  init();
+  void need_sorted_output() { DBUG_ASSERT(0); /* Can't do it */ }
   int  reset(void);
   int  get_next();
   bool reverse_sorted() { return false; }
@@ -545,6 +580,7 @@ public:
   ~QUICK_ROR_UNION_SELECT();
 
   int  init();
+  void need_sorted_output() { DBUG_ASSERT(0); /* Can't do it */ }
   int  reset(void);
   int  get_next();
   bool reverse_sorted() { return false; }
@@ -664,6 +700,7 @@ public:
   void adjust_prefix_ranges();
   bool alloc_buffers();
   int init();
+  void need_sorted_output() { /* always do it */ }
   int reset();
   int get_next();
   bool reverse_sorted() { return false; }
@@ -679,7 +716,8 @@ public:
 class QUICK_SELECT_DESC: public QUICK_RANGE_SELECT
 {
 public:
-  QUICK_SELECT_DESC(QUICK_RANGE_SELECT *q, uint used_key_parts);
+  QUICK_SELECT_DESC(QUICK_RANGE_SELECT *q, uint used_key_parts, 
+                    bool *create_err);
   int get_next();
   bool reverse_sorted() { return 1; }
   int get_type() { return QS_TYPE_RANGE_DESC; }
@@ -712,25 +750,29 @@ class SQL_SELECT :public Sql_alloc {
   {
     key_map tmp;
     tmp.set_all();
-    return test_quick_select(thd, tmp, 0, limit, force_quick_range) < 0;
+    return test_quick_select(thd, tmp, 0, limit, force_quick_range, FALSE) < 0;
   }
   inline bool skip_record() { return cond ? cond->val_int() == 0 : 0; }
   int test_quick_select(THD *thd, key_map keys, table_map prev_tables,
-			ha_rows limit, bool force_quick_range);
+			ha_rows limit, bool force_quick_range, 
+                        bool ordered_output);
 };
 
 
-class FT_SELECT: public QUICK_RANGE_SELECT {
+class FT_SELECT: public QUICK_RANGE_SELECT 
+{
 public:
-  FT_SELECT(THD *thd, TABLE *table, uint key) :
-      QUICK_RANGE_SELECT (thd, table, key, 1) { VOID(init()); }
+  FT_SELECT(THD *thd, TABLE *table, uint key, bool *create_err) :
+      QUICK_RANGE_SELECT (thd, table, key, 1, NULL, create_err) 
+  { (void) init(); }
   ~FT_SELECT() { file->ft_end(); }
-  int init() { return error=file->ft_init(); }
+  int init() { return file->ft_init(); }
   int reset() { return 0; }
-  int get_next() { return error= file->ha_ft_read(record); }
+  int get_next() { return file->ha_ft_read(record); }
   int get_type() { return QS_TYPE_FULLTEXT; }
 };
 
+FT_SELECT *get_ft_select(THD *thd, TABLE *table, uint key);
 QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
                                              struct st_table_ref *ref,
                                              ha_rows records);
diff --git a/sql/set_var.cc b/sql/set_var.cc
index 7eec4c3e41b..dde8a2c6002 100644
--- a/sql/set_var.cc
+++ b/sql/set_var.cc
@@ -439,8 +439,6 @@ static sys_var_long_ptr	sys_max_write_lock_count(&vars, "max_write_lock_count",
 						 &max_write_lock_count);
 static sys_var_thd_ulong       sys_min_examined_row_limit(&vars, "min_examined_row_limit",
                                                           &SV::min_examined_row_limit);
-static sys_var_thd_ulong       sys_multi_range_count(&vars, "multi_range_count",
-                                              &SV::multi_range_count);
 static sys_var_long_ptr	sys_myisam_data_pointer_size(&vars, "myisam_data_pointer_size",
                                                     &myisam_data_pointer_size);
 static sys_var_thd_ulonglong	sys_myisam_max_sort_file_size(&vars, "myisam_max_sort_file_size", &SV::myisam_max_sort_file_size, fix_myisam_max_sort_file_size, 1);
@@ -493,6 +491,18 @@ static sys_var_thd_ulong        sys_optimizer_search_depth(&vars, "optimizer_sea
                                                    &SV::optimizer_search_depth);
 static sys_var_thd_optimizer_switch   sys_optimizer_switch(&vars, "optimizer_switch",
                                      &SV::optimizer_switch);
+
+const char *optimizer_use_mrr_names[] = {"auto", "force", "disable", NullS};
+TYPELIB optimizer_use_mrr_typelib= {
+  array_elements(optimizer_use_mrr_names) - 1, "",
+  optimizer_use_mrr_names, NULL
+};
+
+static sys_var_thd_enum        sys_optimizer_use_mrr(&vars, "optimizer_use_mrr",
+                                              &SV::optimizer_use_mrr,
+                                              &optimizer_use_mrr_typelib,
+                                              NULL);
+
 static sys_var_const            sys_pid_file(&vars, "pid_file",
                                              OPT_GLOBAL, SHOW_CHAR,
                                              (uchar*) pidfile_name);
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 72fd927cc1e..4a71b6776e3 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -313,7 +313,6 @@ struct system_variables
   ulong max_tmp_tables;
   ulong max_insert_delayed_threads;
   ulong min_examined_row_limit;
-  ulong multi_range_count;
   ulong myisam_repair_threads;
   ulong myisam_sort_buff_size;
   ulong myisam_stats_method;
@@ -327,6 +326,14 @@ struct system_variables
   ulong optimizer_search_depth;
   /* A bitmap for switching optimizations on/off */
   ulong optimizer_switch;
+  /*
+    Controls use of Engine-MRR:
+      0 - auto, based on cost
+      1 - force MRR when the storage engine is capable of doing it
+      2 - disable MRR.
+  */
+  ulong optimizer_use_mrr; 
+
   ulong preload_buff_size;
   ulong profiling_history_size;
   ulong query_cache_type;
@@ -430,6 +437,13 @@ typedef struct system_status_var
   ulong ha_read_prev_count;
   ulong ha_read_rnd_count;
   ulong ha_read_rnd_next_count;
+  /*
+    This number doesn't include calls to the default implementation and
+    calls made by range access. The intent is to count only calls made by
+    BatchedKeyAccess.
+  */
+  ulong ha_multi_range_read_init_count;
+
   ulong ha_rollback_count;
   ulong ha_update_count;
   ulong ha_write_count;
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index f903c193270..434afe8fd69 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -178,11 +178,14 @@ int join_read_always_key_or_null(JOIN_TAB *tab);
 int join_read_next_same_or_null(READ_RECORD *info);
 static COND *make_cond_for_table(COND *cond,table_map table,
 				 table_map used_table);
+static COND *make_cond_for_table_from_pred(COND *root_cond, COND *cond,
+                                           table_map tables,
+                                           table_map used_table);
 static Item* part_of_refkey(TABLE *form,Field *field);
 uint find_shortest_key(TABLE *table, const key_map *usable_keys);
 static bool test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,
 				    ha_rows select_limit, bool no_changes,
-                                    key_map *map);
+                                    const key_map *map);
 static bool list_contains_unique_index(TABLE *table,
                           bool (*find_func) (Field *, void *), void *data);
 static bool find_field_in_item_list (Field *field, void *data);
@@ -237,8 +240,8 @@ static void select_describe(JOIN *join, bool need_tmp_table,bool need_order,
 			    bool distinct, const char *message=NullS);
 static Item *remove_additional_cond(Item* conds);
 static void add_group_and_distinct_keys(JOIN *join, JOIN_TAB *join_tab);
-static bool test_if_ref(Item_field *left_item,Item *right_item);
-
+static bool test_if_ref(COND *root_cond, 
+                        Item_field *left_item,Item *right_item);
 
 /**
   This handles SELECT with and without UNION.
@@ -711,7 +714,8 @@ void JOIN::remove_subq_pushed_predicates(Item **where)
       ((Item_func *)this->conds)->functype() == Item_func::EQ_FUNC &&
       ((Item_func *)conds)->arguments()[0]->type() == Item::REF_ITEM &&
       ((Item_func *)conds)->arguments()[1]->type() == Item::FIELD_ITEM &&
-      test_if_ref ((Item_field *)((Item_func *)conds)->arguments()[1],
+      test_if_ref (this->conds,
+                   (Item_field *)((Item_func *)conds)->arguments()[1],
                    ((Item_func *)conds)->arguments()[0]))
   {
     *where= 0;
@@ -2119,7 +2123,7 @@ JOIN::exec()
 	  */
 	  curr_table->select->cond->quick_fix_field();
 	}
-	curr_table->select_cond= curr_table->select->cond;
+        curr_table->set_select_cond(curr_table->select->cond, __LINE__);
 	curr_table->select_cond->top_level_item();
 	DBUG_EXECUTE("where",print_where(curr_table->select->cond,
 					 "select and having",
@@ -2471,7 +2475,7 @@ static ha_rows get_quick_record_count(THD *thd, SQL_SELECT *select,
     select->head=table;
     table->reginfo.impossible_range=0;
     if ((error= select->test_quick_select(thd, *(key_map *)keys,(table_map) 0,
-                                          limit, 0)) == 1)
+                                          limit, 0, FALSE)) == 1)
       DBUG_RETURN(select->quick->records);
     if (error == -1)
     {
@@ -5678,6 +5682,7 @@ static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
   j->ref.key_buff2=j->ref.key_buff+ALIGN_SIZE(length);
   j->ref.key_err=1;
   j->ref.null_rejecting= 0;
+  j->ref.disable_cache= FALSE;
   keyuse=org_keyuse;
 
   store_key **ref_key= j->ref.key_copy;
@@ -5871,6 +5876,7 @@ JOIN::make_simple_join(JOIN *parent, TABLE *tmp_table)
   join_tab->cache.buff=0;			/* No caching */
   join_tab->table=tmp_table;
   join_tab->select=0;
+  join_tab->set_select_cond(NULL, __LINE__);
   join_tab->select_cond=0;
   join_tab->quick=0;
   join_tab->type= JT_ALL;			/* Map through all records */
@@ -5901,6 +5907,7 @@ inline void add_cond_and_fix(Item **e1, Item *e2)
     {
       *e1= res;
       res->quick_fix_field();
+      res->update_used_tables();
     }
   }
   else
@@ -5998,7 +6005,9 @@ static void add_not_null_conds(JOIN *join)
           DBUG_EXECUTE("where",print_where(notnull,
                                            referred_tab->table->alias,
                                            QT_ORDINARY););
-          add_cond_and_fix(&referred_tab->select_cond, notnull);
+          COND *new_cond= referred_tab->select_cond;
+          add_cond_and_fix(&new_cond, notnull);
+          referred_tab->set_select_cond(new_cond, __LINE__);
         }
       }
     }
@@ -6171,9 +6180,9 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
             if (!tmp)
               DBUG_RETURN(1);
             tmp->quick_fix_field();
-            cond_tab->select_cond= !cond_tab->select_cond ? tmp :
-	                            new Item_cond_and(cond_tab->select_cond,
-                                                      tmp);
+            COND *new_cond= !cond_tab->select_cond ? tmp :
+              new Item_cond_and(cond_tab->select_cond, tmp);
+            cond_tab->set_select_cond(new_cond, __LINE__);
             if (!cond_tab->select_cond)
 	      DBUG_RETURN(1);
             cond_tab->select_cond->quick_fix_field();
@@ -6254,7 +6263,8 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
         }
 
       }
-      if (tmp || !cond || tab->type == JT_REF)
+      if (tmp || !cond || tab->type == JT_REF || tab->type == JT_REF_OR_NULL ||
+          tab->type == JT_EQ_REF || first_inner_tab)
       {
         DBUG_EXECUTE("where",print_where(tmp,tab->table->alias, QT_ORDINARY););
 	SQL_SELECT *sel= tab->select= ((SQL_SELECT*)
@@ -6276,11 +6286,12 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
           */
           if (!(tmp= add_found_match_trig_cond(first_inner_tab, tmp, 0)))
             DBUG_RETURN(1);
-          tab->select_cond=sel->cond=tmp;
+          sel->cond=tmp;
+          tab->set_select_cond(tmp, __LINE__);
           /* Push condition to storage engine if this is enabled
              and the condition is not guarded */
           tab->table->file->pushed_cond= NULL;
-	  if (thd->variables.engine_condition_pushdown)
+	  if (thd->variables.engine_condition_pushdown && !first_inner_tab)
           {
             COND *push_cond= 
               make_cond_for_table(tmp, current_map, current_map);
@@ -6293,7 +6304,11 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
           }
         }
         else
-          tab->select_cond= sel->cond= NULL;
+        {
+          sel->cond= NULL;
+          tab->set_select_cond(NULL, __LINE__);
+        }
+
 
 	sel->head=tab->table;
         DBUG_EXECUTE("where",print_where(tmp,tab->table->alias, QT_ORDINARY););
@@ -6360,7 +6375,8 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
 				       (join->select_options &
 					OPTION_FOUND_ROWS ?
 					HA_POS_ERROR :
-					join->unit->select_limit_cnt), 0) < 0)
+					join->unit->select_limit_cnt), 0,
+                                        FALSE) < 0)
             {
 	      /*
 		Before reporting "Impossible WHERE" for the whole query
@@ -6373,7 +6389,8 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
                                          (join->select_options &
                                           OPTION_FOUND_ROWS ?
                                           HA_POS_ERROR :
-                                          join->unit->select_limit_cnt),0) < 0)
+                                          join->unit->select_limit_cnt),0,
+                                          FALSE) < 0)
 		DBUG_RETURN(1);			// Impossible WHERE
             }
             else
@@ -6502,6 +6519,8 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
             if (!cond_tab->select_cond)
 	      DBUG_RETURN(1);
             cond_tab->select_cond->quick_fix_field();
+            if (cond_tab->select)
+              cond_tab->select->cond= cond_tab->select_cond;
           }              
         }
         first_inner_tab= first_inner_tab->first_upper;       
@@ -6560,6 +6579,8 @@ make_join_readinfo(JOIN *join, ulonglong options)
         table->key_read=1;
         table->file->extra(HA_EXTRA_KEYREAD);
       }
+      else
+        push_index_cond(tab, tab->ref.key, TRUE);
       break;
     case JT_EQ_REF:
       table->status=STATUS_NO_RECORD;
@@ -6578,6 +6599,8 @@ make_join_readinfo(JOIN *join, ulonglong options)
 	table->key_read=1;
 	table->file->extra(HA_EXTRA_KEYREAD);
       }
+      else
+        push_index_cond(tab, tab->ref.key, TRUE);
       break;
     case JT_REF_OR_NULL:
     case JT_REF:
@@ -6589,12 +6612,6 @@ make_join_readinfo(JOIN *join, ulonglong options)
       }
       delete tab->quick;
       tab->quick=0;
-      if (table->covering_keys.is_set(tab->ref.key) &&
-	  !table->no_keyread)
-      {
-	table->key_read=1;
-	table->file->extra(HA_EXTRA_KEYREAD);
-      }
       if (tab->type == JT_REF)
       {
 	tab->read_first_record= join_read_always_key;
@@ -6605,6 +6622,14 @@ make_join_readinfo(JOIN *join, ulonglong options)
 	tab->read_first_record= join_read_always_key_or_null;
 	tab->read_record.read_record= join_read_next_same_or_null;
       }
+      if (table->covering_keys.is_set(tab->ref.key) &&
+	  !table->no_keyread)
+      {
+	table->key_read=1;
+	table->file->extra(HA_EXTRA_KEYREAD);
+      }
+      else
+        push_index_cond(tab, tab->ref.key, TRUE);
       break;
     case JT_FT:
       table->status=STATUS_NO_RECORD;
@@ -6700,6 +6725,9 @@ make_join_readinfo(JOIN *join, ulonglong options)
 	    tab->type=JT_NEXT;		// Read with index_first / index_next
 	  }
 	}
+        if (tab->select && tab->select->quick &&
+            tab->select->quick->index != MAX_KEY && ! tab->table->key_read)
+          push_index_cond(tab, tab->select->quick->index, FALSE);
       }
       break;
     default:
@@ -12098,7 +12126,8 @@ test_if_quick_select(JOIN_TAB *tab)
   delete tab->select->quick;
   tab->select->quick=0;
   return tab->select->test_quick_select(tab->join->thd, tab->keys,
-					(table_map) 0, HA_POS_ERROR, 0);
+					(table_map) 0, HA_POS_ERROR, 0,
+                                        FALSE);
 }
 
 
@@ -12754,12 +12783,24 @@ end_write_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
     1 if right_item is used removable reference key on left_item
 */
 
-static bool test_if_ref(Item_field *left_item,Item *right_item)
+static bool test_if_ref(Item *root_cond, Item_field *left_item,Item *right_item)
 {
   Field *field=left_item->field;
-  // No need to change const test. We also have to keep tests on LEFT JOIN
-  if (!field->table->const_table && !field->table->maybe_null)
+  JOIN_TAB *join_tab= field->table->reginfo.join_tab;
+  // No need to change const test
+  if (!field->table->const_table && join_tab &&
+      (!join_tab->first_inner ||
+       *join_tab->first_inner->on_expr_ref == root_cond))
   {
+    // Cond guards
+    for (uint i = 0; i < join_tab->ref.key_parts; i++)
+    {
+      if (join_tab->ref.cond_guards[i])
+      {
+        return FALSE;
+      }
+    }
+    //
     Item *ref_item=part_of_refkey(field->table,field);
     if (ref_item && ref_item->eq(right_item,1))
     {
@@ -12790,9 +12831,49 @@ static bool test_if_ref(Item_field *left_item,Item *right_item)
 }
 
 
+/*
+  Extract a condition that can be checked after reading given table
+  
+  SYNOPSIS
+    make_cond_for_table()
+      cond         Condition to analyze
+      tables       Tables for which "current field values" are available
+      used_table   Table that we're extracting the condition for (may 
+                   also include PSEUDO_TABLE_BITS
+      exclude_expensive_cond  Do not push expensive conditions
+
+  DESCRIPTION
+    Extract the condition that can be checked after reading the table
+    specified in 'used_table', given that current-field values for tables
+    specified in 'tables' bitmap are available.
+
+    The function assumes that
+      - Constant parts of the condition has already been checked.
+      - Condition that could be checked for tables in 'tables' has already 
+        been checked.
+        
+    The function takes into account that some parts of the condition are
+    guaranteed to be true by employed 'ref' access methods (the code that
+    does this is located at the end, search down for "EQ_FUNC").
+
+
+  SEE ALSO 
+    make_cond_for_info_schema uses similar algorithm
+
+  RETURN
+    Extracted condition
+*/
+
 static COND *
 make_cond_for_table(COND *cond, table_map tables, table_map used_table)
 {
+  return make_cond_for_table_from_pred(cond, cond, tables, used_table);
+}
+               
+static COND *
+make_cond_for_table_from_pred(COND *root_cond, COND *cond,
+                              table_map tables, table_map used_table)
+{
   if (used_table && !(cond->used_tables() & used_table))
     return (COND*) 0;				// Already checked
   if (cond->type() == Item::COND_ITEM)
@@ -12807,7 +12888,7 @@ make_cond_for_table(COND *cond, table_map tables, table_map used_table)
       Item *item;
       while ((item=li++))
       {
-	Item *fix=make_cond_for_table(item,tables,used_table);
+	Item *fix=make_cond_for_table_from_pred(root_cond, item, tables, used_table);
 	if (fix)
 	  new_cond->argument_list()->push_back(fix);
       }
@@ -12837,7 +12918,7 @@ make_cond_for_table(COND *cond, table_map tables, table_map used_table)
       Item *item;
       while ((item=li++))
       {
-	Item *fix=make_cond_for_table(item,tables,0L);
+	Item *fix=make_cond_for_table_from_pred(root_cond, item, tables, 0L);
 	if (!fix)
 	  return (COND*) 0;			// Always true
 	new_cond->argument_list()->push_back(fix);
@@ -12864,18 +12945,19 @@ make_cond_for_table(COND *cond, table_map tables, table_map used_table)
   if (cond->marker == 2 || cond->eq_cmp_result() == Item::COND_OK)
     return cond;				// Not boolean op
 
-  if (((Item_func*) cond)->functype() == Item_func::EQ_FUNC)
+  if (cond->type() == Item::FUNC_ITEM && 
+      ((Item_func*) cond)->functype() == Item_func::EQ_FUNC)
   {
-    Item *left_item=	((Item_func*) cond)->arguments()[0];
-    Item *right_item= ((Item_func*) cond)->arguments()[1];
+    Item *left_item=	((Item_func*) cond)->arguments()[0]->real_item();
+    Item *right_item= ((Item_func*) cond)->arguments()[1]->real_item();
     if (left_item->type() == Item::FIELD_ITEM &&
-	test_if_ref((Item_field*) left_item,right_item))
+	test_if_ref(root_cond, (Item_field*) left_item,right_item))
     {
       cond->marker=3;			// Checked when read
       return (COND*) 0;
     }
     if (right_item->type() == Item::FIELD_ITEM &&
-	test_if_ref((Item_field*) right_item,left_item))
+	test_if_ref(root_cond, (Item_field*) right_item,left_item))
     {
       cond->marker=3;			// Checked when read
       return (COND*) 0;
@@ -13259,7 +13341,7 @@ find_field_in_item_list (Field *field, void *data)
 
 static bool
 test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
-			bool no_changes, key_map *map)
+			bool no_changes, const key_map *map)
 {
   int ref_key;
   uint ref_key_parts;
@@ -13269,6 +13351,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
   SQL_SELECT *select=tab->select;
   key_map usable_keys;
   QUICK_SELECT_I *save_quick= 0;
+  COND *orig_select_cond= 0;
   DBUG_ENTER("test_if_skip_sort_order");
   LINT_INIT(ref_key_parts);
 
@@ -13288,7 +13371,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
     }
     usable_keys.intersect(((Item_field*) item)->field->part_of_sortkey);
     if (usable_keys.is_clear_all())
-      DBUG_RETURN(0);					// No usable keys
+      goto use_filesort;					// No usable keys
   }
 
   ref_key= -1;
@@ -13298,7 +13381,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
     ref_key=	   tab->ref.key;
     ref_key_parts= tab->ref.key_parts;
     if (tab->type == JT_REF_OR_NULL || tab->type == JT_FT)
-      DBUG_RETURN(0);
+      goto use_filesort;
   }
   else if (select && select->quick)		// Range found by opt_range
   {
@@ -13313,7 +13396,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
     if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE || 
         quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION || 
         quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT)
-      DBUG_RETURN(0);
+      goto use_filesort;
     ref_key=	   select->quick->index;
     ref_key_parts= select->quick->used_key_parts;
   }
@@ -13335,10 +13418,15 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
       */
       if (table->covering_keys.is_set(ref_key))
 	usable_keys.intersect(table->covering_keys);
+      if (tab->pre_idx_push_select_cond)
+        orig_select_cond= tab->set_cond(tab->pre_idx_push_select_cond);
+
       if ((new_ref_key= test_if_subkey(order, table, ref_key, ref_key_parts,
 				       &usable_keys)) < MAX_KEY)
       {
 	/* Found key that can be used to retrieve data in sorted order */
+        //psergey-mrr:if (tab->pre_idx_push_select_cond)
+        //  tab->select_cond= tab->select->cond= tab->pre_idx_push_select_cond;
 	if (tab->ref.key >= 0)
 	{
           /*
@@ -13352,9 +13440,10 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
           KEYUSE *keyuse= tab->keyuse;
           while (keyuse->key != new_ref_key && keyuse->table == tab->table)
             keyuse++;
+
           if (create_ref_for_key(tab->join, tab, keyuse, 
                                  tab->join->const_table_map))
-            DBUG_RETURN(0);
+            goto use_filesort;
 	}
 	else
 	{
@@ -13374,9 +13463,10 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
                                         (tab->join->select_options &
                                          OPTION_FOUND_ROWS) ?
                                         HA_POS_ERROR :
-                                        tab->join->unit->select_limit_cnt,0) <=
+                                        tab->join->unit->select_limit_cnt,0,
+                                        TRUE) <=
               0)
-            DBUG_RETURN(0);
+            goto use_filesort;
 	}
         ref_key= new_ref_key;
       }
@@ -13424,7 +13514,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
         index order and not using join cache
         */
       if (tab->type == JT_ALL && tab->join->tables > tab->join->const_tables + 1)
-        DBUG_RETURN(0);
+        goto use_filesort;
       keys= *table->file->keys_to_use_for_scanning();
       keys.merge(table->covering_keys);
 
@@ -13580,7 +13670,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
                                     join->select_options & OPTION_FOUND_ROWS ?
                                     HA_POS_ERROR :
                                     join->unit->select_limit_cnt,
-                                    0) > 0;
+                                    TRUE, FALSE) > 0;
       }
       if (!no_changes)
       {
@@ -13609,6 +13699,18 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
             table->key_read=1;
             table->file->extra(HA_EXTRA_KEYREAD);
           }
+          if (tab->pre_idx_push_select_cond)
+          {
+            COND *tmp_cond= tab->pre_idx_push_select_cond;
+            if (orig_select_cond)
+            {
+              tmp_cond= and_conds(tmp_cond, orig_select_cond);
+              tmp_cond->quick_fix_field();
+            }
+            tab->set_cond(tmp_cond);
+            /* orig_select_cond was merged, no need to restore original one. */
+            orig_select_cond= 0;
+          }
           table->file->ha_index_or_rnd_end();
           if (join->select_options & SELECT_DESCRIBE)
           {
@@ -13642,7 +13744,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
       order_direction= best_key_direction;
     }
     else
-      DBUG_RETURN(0); 
+      goto use_filesort; 
   } 
 
 check_reverse_order:                  
@@ -13657,6 +13759,7 @@ check_reverse_order:
       if (!select->quick->reverse_sorted())
       {
         QUICK_SELECT_DESC *tmp;
+        bool error= FALSE;
         int quick_type= select->quick->get_type();
         if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE ||
             quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT ||
@@ -13665,18 +13768,18 @@ check_reverse_order:
         {
           tab->limit= 0;
           select->quick= save_quick;
-          DBUG_RETURN(0);                   // Use filesort
+          goto use_filesort;                   // Use filesort
         }
             
         /* ORDER BY range_key DESC */
 	tmp= new QUICK_SELECT_DESC((QUICK_RANGE_SELECT*)(select->quick),
-                                    used_key_parts);
-	if (!tmp || tmp->error)
+                                    used_key_parts, &error);
+	if (!tmp || error)
 	{
 	  delete tmp;
           select->quick= save_quick;
           tab->limit= 0;
-	  DBUG_RETURN(0);		// Reverse sort not supported
+	  goto use_filesort;		// Reverse sort not supported
 	}
 	select->quick=tmp;
       }
@@ -13695,8 +13798,14 @@ check_reverse_order:
     }
   }
   else if (select && select->quick)
-    select->quick->sorted= 1;
+    select->quick->need_sorted_output();
+  if (orig_select_cond)
+    tab->set_cond(orig_select_cond);
   DBUG_RETURN(1);
+use_filesort:
+  if (orig_select_cond)
+    tab->set_cond(orig_select_cond);
+  DBUG_RETURN(0);
 }
 
 
@@ -13797,7 +13906,7 @@ create_sort_index(THD *thd, JOIN *join, ORDER *order,
 	field, quick will contain an empty record set.
       */
       if (!(select->quick= (tab->type == JT_FT ?
-			    new FT_SELECT(thd, table, tab->ref.key) :
+			    get_ft_select(thd, table, tab->ref.key) :
 			    get_quick_select_for_ref(thd, table, &tab->ref, 
                                                      tab->found_records))))
 	goto err;
@@ -13836,7 +13945,7 @@ create_sort_index(THD *thd, JOIN *join, ORDER *order,
     table->quick_keys.clear_all();  // as far as we cleanup select->quick
     table->sort.io_cache= tablesort_result_cache;
   }
-  tab->select_cond=0;
+  tab->set_select_cond(NULL, __LINE__);
   tab->last_inner= 0;
   tab->first_unmatched= 0;
   tab->type=JT_ALL;				// Read with normal read_record
@@ -13877,7 +13986,7 @@ static bool fix_having(JOIN *join, Item **having)
 						   sort_table_cond)) ||
 	  table->select->cond->fix_fields(join->thd, &table->select->cond))
 	return 1;
-    table->select_cond=table->select->cond;
+    table->set_select_cond(table->select->cond, __LINE__);
     table->select_cond->top_level_item();
     DBUG_EXECUTE("where",print_where(table->select_cond,
 				     "select and having",
@@ -14468,19 +14577,50 @@ read_cached_record(JOIN_TAB *tab)
 }
 
 
+/*
+  eq_ref: Create the lookup key and check if it is the same as saved key
+
+  SYNOPSIS
+    cmp_buffer_with_ref()
+      tab      Join tab of the accessed table
+      table    The table to read.  This is usually tab->table, except for 
+               semi-join when we might need to make a lookup in a temptable
+               instead.
+      tab_ref  The structure with methods to collect index lookup tuple. 
+               This is usually table->ref, except for the case of when we're 
+               doing lookup into semi-join materialization table.
+
+  DESCRIPTION 
+    Used by eq_ref access method: create the index lookup key and check if 
+    we've used this key at previous lookup (If yes, we don't need to repeat
+    the lookup - the record has been already fetched)
+
+  RETURN 
+    TRUE   No cached record for the key, or failed to create the key (due to
+           out-of-domain error)
+    FALSE  The created key is the same as the previous one (and the record 
+           is already in table->record)
+*/
+
 static bool
 cmp_buffer_with_ref(JOIN_TAB *tab)
 {
-  bool diff;
-  if (!(diff=tab->ref.key_err))
+  TABLE_REF *tab_ref= &tab->ref;
+  bool no_prev_key;
+  if (!tab_ref->disable_cache)
   {
-    memcpy(tab->ref.key_buff2, tab->ref.key_buff, tab->ref.key_length);
+    if (!(no_prev_key= tab_ref->key_err))
+    {
+      /* Previous access found a row. Copy its key */
+      memcpy(tab_ref->key_buff2, tab_ref->key_buff, tab_ref->key_length);
+    }
   }
-  if ((tab->ref.key_err= cp_buffer_from_ref(tab->join->thd, tab->table,
-                                            &tab->ref)) ||
-      diff)
+  else 
+    no_prev_key= TRUE;
+  if ((tab_ref->key_err= cp_buffer_from_ref(tab->join->thd, tab->table, tab_ref)) ||
+      no_prev_key)
     return 1;
-  return memcmp(tab->ref.key_buff2, tab->ref.key_buff, tab->ref.key_length)
+  return memcmp(tab_ref->key_buff2, tab_ref->key_buff, tab_ref->key_length)
     != 0;
 }
 
@@ -15764,11 +15904,12 @@ static bool add_ref_to_table_cond(THD *thd, JOIN_TAB *join_tab)
   if (join_tab->select)
   {
     error=(int) cond->add(join_tab->select->cond);
-    join_tab->select_cond=join_tab->select->cond=cond;
+    join_tab->select->cond= cond;
+    join_tab->set_select_cond(cond, __LINE__);
   }
   else if ((join_tab->select= make_select(join_tab->table, 0, 0, cond, 0,
                                           &error)))
-    join_tab->select_cond=cond;
+    join_tab->set_select_cond(cond, __LINE__);
 
   DBUG_RETURN(error ? TRUE : FALSE);
 }
@@ -16584,6 +16725,20 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
       }
       else
       {
+        uint keyno= MAX_KEY;
+        if (tab->ref.key_parts)
+          keyno= tab->ref.key;
+        else if (tab->select && tab->select->quick)
+          keyno = tab->select->quick->index;
+
+        if (keyno != MAX_KEY && keyno == table->file->pushed_idx_cond_keyno &&
+            table->file->pushed_idx_cond)
+          extra.append(STRING_WITH_LEN("; Using index condition"));
+        /**
+        psergey-mrr:
+        else if (tab->cache_idx_cond)
+          extra.append(STRING_WITH_LEN("; Using index condition(BKA)"));
+        **/
         if (quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION || 
             quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT ||
             quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE)
@@ -16647,6 +16802,14 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
         }
 	if (table->reginfo.not_exists_optimize)
 	  extra.append(STRING_WITH_LEN("; Not exists"));
+
+        if (quick_type == QUICK_SELECT_I::QS_TYPE_RANGE &&
+            !(((QUICK_RANGE_SELECT*)(tab->select->quick))->mrr_flags &
+             HA_MRR_USE_DEFAULT_IMPL))
+        {
+	  extra.append(STRING_WITH_LEN("; Using MRR"));
+        }
+
 	if (need_tmp_table)
 	{
 	  need_tmp_table=0;
diff --git a/sql/sql_select.h b/sql/sql_select.h
index 271c88ebf66..0a6702a8135 100644
--- a/sql/sql_select.h
+++ b/sql/sql_select.h
@@ -85,6 +85,12 @@ typedef struct st_table_ref
   table_map	depend_map;		  ///< Table depends on these tables.
   /* null byte position in the key_buf. Used for REF_OR_NULL optimization */
   uchar          *null_ref_key;
+
+  /*
+    TRUE <=> disable the "cache" as doing lookup with the same key value may
+    produce different results (because of Index Condition Pushdown)
+  */
+  bool          disable_cache;
 } TABLE_REF;
 
 
@@ -145,6 +151,14 @@ typedef struct st_join_table {
   SQL_SELECT	*select;
   COND		*select_cond;
   QUICK_SELECT_I *quick;
+  /* 
+    The value of select_cond before we've attempted to do Index Condition
+    Pushdown. We may need to restore everything back if we first choose one
+    index but then reconsider (see test_if_skip_sort_order() for such
+    scenarios).
+    NULL means no index condition pushdown was performed.
+  */
+  Item          *pre_idx_push_select_cond;
   Item	       **on_expr_ref;   /**< pointer to the associated on expression   */
   COND_EQUAL    *cond_equal;    /**< multiple equalities for the on expression */
   st_join_table *first_inner;   /**< first inner table for including outerjoin */
@@ -218,6 +232,20 @@ typedef struct st_join_table {
             (select->quick->get_type() ==
              QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX));
   }
+  void set_select_cond(COND *to, uint line)
+  {
+    DBUG_PRINT("info", ("select_cond changes %p -> %p at line %u tab %p",
+                        select_cond, to, line, this));
+    select_cond= to;
+  }
+  COND *set_cond(COND *new_cond)
+  {
+    COND *tmp_select_cond= select_cond;
+    set_select_cond(new_cond, __LINE__);
+    if (select)
+      select->cond= new_cond;
+    return tmp_select_cond;
+  }
 } JOIN_TAB;
 
 enum_nested_loop_state sub_select_cache(JOIN *join, JOIN_TAB *join_tab, bool
@@ -752,3 +780,7 @@ inline bool optimizer_flag(THD *thd, uint flag)
 
 void eliminate_tables(JOIN *join);
 
+/// psergey-mrr:
+void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok);
+
+
diff --git a/sql/structs.h b/sql/structs.h
index 3a090fd203f..9dffe1e1b8c 100644
--- a/sql/structs.h
+++ b/sql/structs.h
@@ -53,7 +53,8 @@ typedef struct st_key_part_info {	/* Info about a key part */
   Field *field;
   uint	offset;				/* offset in record (from 0) */
   uint	null_offset;			/* Offset to null_bit in record */
-  uint16 length;                        /* Length of keypart value in bytes */
+  /* Length of key part in bytes, excluding NULL flag and length bytes */
+  uint16 length;
   /* 
     Number of bytes required to store the keypart value. This may be
     different from the "length" field as it also counts
diff --git a/sql/table.cc b/sql/table.cc
index 1c10958fdef..c2d58bac982 100644
--- a/sql/table.cc
+++ b/sql/table.cc
@@ -1556,21 +1556,11 @@ static int open_binary_frm(THD *thd, TABLE_SHARE *share, uchar *head,
                                 share->table_name.str,
                                 share->table_name.str);
             share->crashed= 1;                // Marker for CHECK TABLE
-            goto to_be_deleted;
+            continue;
           }
 #endif
           key_part->key_part_flag|= HA_PART_KEY_SEG;
         }
-
-	to_be_deleted:
-
-        /*
-          If the field can be NULL, don't optimize away the test
-          key_part_column = expression from the WHERE clause
-          as we need to test for NULL = NULL.
-        */
-        if (field->real_maybe_null())
-          key_part->key_part_flag|= HA_NULL_PART;
       }
       keyinfo->usable_key_parts= usable_parts; // Filesort