84 files changed, 28437 insertions, 3435 deletions
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index 53ba75c4d8c..9f31bda262b 100755..100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -58,9 +58,12 @@ SET (SQL_SOURCE
                sp_rcontext.cc spatial.cc sql_acl.cc sql_analyse.cc sql_base.cc 
                sql_cache.cc sql_class.cc sql_client.cc sql_crypt.cc sql_crypt.h 
                sql_cursor.cc sql_db.cc sql_delete.cc sql_derived.cc sql_do.cc 
-               sql_error.cc sql_handler.cc sql_help.cc sql_insert.cc sql_lex.cc 
-               sql_list.cc sql_load.cc sql_manager.cc sql_map.cc sql_parse.cc 
-               sql_partition.cc sql_plugin.cc sql_prepare.cc sql_rename.cc 
+               sql_error.cc sql_handler.cc sql_help.cc sql_insert.cc
+               sql_lifo_buffer.h
+               sql_join_cache.h sql_join_cache.cc
+               sql_lex.cc sql_list.cc sql_load.cc sql_manager.cc
+               sql_map.cc sql_parse.cc  sql_partition.cc sql_plugin.cc
+               sql_prepare.cc sql_rename.cc 
                debug_sync.cc debug_sync.h
                sql_repl.cc sql_select.cc sql_show.cc sql_state.c sql_string.cc 
                sql_table.cc sql_test.cc sql_trigger.cc sql_udf.cc sql_union.cc
@@ -73,7 +76,11 @@ SET (SQL_SOURCE
                rpl_rli.cc rpl_mi.cc sql_servers.cc
                sql_connect.cc scheduler.cc 
                sql_profile.cc event_parse_data.cc opt_table_elimination.cc
+               multi_range_read.cc
+               opt_subselect.cc
+               opt_index_cond_pushdown.cc
                create_options.cc
+               sql_expression_cache.cc
                ${PROJECT_SOURCE_DIR}/sql/sql_yacc.cc
                ${PROJECT_SOURCE_DIR}/sql/sql_yacc.h
                ${PROJECT_SOURCE_DIR}/include/mysqld_error.h
diff --git a/sql/Makefile.am b/sql/Makefile.am
index c1920f41cc7..be8294fbb3f 100644
--- a/sql/Makefile.am
+++ b/sql/Makefile.am
@@ -61,10 +61,12 @@ noinst_HEADERS =	item.h item_func.h item_sum.h item_cmpfunc.h \
 			ha_partition.h rpl_constants.h \
 			debug_sync.h \
 			opt_range.h protocol.h rpl_tblmap.h rpl_utility.h \
+                        opt_subselect.h \
 			rpl_reporting.h \
 			log.h log_slow.h sql_show.h rpl_rli.h rpl_mi.h \
 			sql_select.h structs.h table.h sql_udf.h hash_filo.h \
 			lex.h lex_symbol.h sql_acl.h sql_crypt.h  \
+                        sql_lifo_buffer.h \
 			sql_repl.h slave.h rpl_filter.h rpl_injector.h \
 			log_event.h rpl_record.h \
 			log_event_old.h rpl_record_old.h \
@@ -79,7 +81,10 @@ noinst_HEADERS =	item.h item_func.h item_sum.h item_cmpfunc.h \
 			event_data_objects.h event_scheduler.h \
 			sql_partition.h partition_info.h partition_element.h \
 			contributors.h sql_servers.h \
-			create_options.h
+                        multi_range_read.h sql_handler.h \
+                        sql_join_cache.h \
+			create_options.h \
+			sql_expression_cache.h
 
 mysqld_SOURCES =	sql_lex.cc sql_handler.cc sql_partition.cc \
 			item.cc item_sum.cc item_buff.cc item_func.cc \
@@ -93,6 +98,7 @@ mysqld_SOURCES =	sql_lex.cc sql_handler.cc sql_partition.cc \
 			mysqld.cc password.c hash_filo.cc hostname.cc \
 			sql_connect.cc scheduler.cc sql_parse.cc \
 			set_var.cc sql_yacc.yy \
+                        sql_join_cache.cc \
 			sql_base.cc table.cc sql_select.cc sql_insert.cc \
 			sql_profile.cc \
 			sql_prepare.cc sql_error.cc sql_locale.cc \
@@ -102,7 +108,8 @@ mysqld_SOURCES =	sql_lex.cc sql_handler.cc sql_partition.cc \
 			unireg.cc des_key_file.cc \
 			log_event.cc rpl_record.cc \
 			log_event_old.cc rpl_record_old.cc \
-			discover.cc time.cc opt_range.cc opt_sum.cc \
+			discover.cc time.cc opt_range.cc opt_subselect.cc \
+                        opt_sum.cc \
 		   	records.cc filesort.cc handler.cc \
 		        ha_partition.cc \
 			debug_sync.cc \
@@ -125,7 +132,9 @@ mysqld_SOURCES =	sql_lex.cc sql_handler.cc sql_partition.cc \
 			sql_plugin.cc sql_binlog.cc \
 			sql_builtin.cc sql_tablespace.cc partition_info.cc \
 			sql_servers.cc event_parse_data.cc \
-                        opt_table_elimination.cc create_options.cc
+                        opt_table_elimination.cc create_options.cc \
+			multi_range_read.cc \
+			opt_index_cond_pushdown.cc sql_expression_cache.cc
 
 nodist_mysqld_SOURCES =	mini_client_errors.c pack.c client.c my_time.c my_user.c client_plugin.c
 
@@ -153,7 +162,7 @@ BUILT_SOURCES =		$(BUILT_MAINT_SRC) lex_hash.h link_sources
 EXTRA_DIST =		udf_example.c udf_example.def $(BUILT_MAINT_SRC) \
 			nt_servc.cc nt_servc.h \
 			message.mc  message.h message.rc MSG00001.bin \
-			CMakeLists.txt
+			CMakeLists.txt opt_range_mrr.cc
 
 CLEANFILES =        	lex_hash.h sql_yacc.output link_sources
 DISTCLEANFILES =        $(EXTRA_PROGRAMS)
diff --git a/sql/debug_sync.cc b/sql/debug_sync.cc
index c9c580c6442..322db38adf2 100644
--- a/sql/debug_sync.cc
+++ b/sql/debug_sync.cc
@@ -1942,4 +1942,7 @@ bool debug_sync_set_action(THD *thd, const char *action_str, size_t len)
 }
 
 
+#else /* defined(ENABLED_DEBUG_SYNC) */
+/* prevent linker/lib warning about file without public symbols */
+int debug_sync_dummy; 
 #endif /* defined(ENABLED_DEBUG_SYNC) */
diff --git a/sql/event_db_repository.cc b/sql/event_db_repository.cc
index c1a64446c12..13a2f8cf7c2 100644
--- a/sql/event_db_repository.cc
+++ b/sql/event_db_repository.cc
@@ -896,7 +896,11 @@ Event_db_repository::find_named_event(LEX_STRING db, LEX_STRING name,
     same fields.
   */
   if (db.length > table->field[ET_FIELD_DB]->field_length ||
-      name.length > table->field[ET_FIELD_NAME]->field_length)
+      name.length > table->field[ET_FIELD_NAME]->field_length ||
+      table->s->keys == 0 ||
+      table->key_info[0].key_parts != 2 ||
+      table->key_info[0].key_part[0].fieldnr != ET_FIELD_DB+1 ||
+      table->key_info[0].key_part[1].fieldnr != ET_FIELD_NAME+1)
     DBUG_RETURN(TRUE);
 
   table->field[ET_FIELD_DB]->store(db.str, db.length, &my_charset_bin);
diff --git a/sql/event_queue.cc b/sql/event_queue.cc
index d68dc8ef479..2a354fe6cfd 100644
--- a/sql/event_queue.cc
+++ b/sql/event_queue.cc
@@ -136,9 +136,9 @@ Event_queue::init_queue(THD *thd)
 
   LOCK_QUEUE_DATA();
 
-  if (init_queue_ex(&queue, EVENT_QUEUE_INITIAL_SIZE , 0 /*offset*/,
-                    0 /*max_on_top*/, event_queue_element_compare_q,
-                    NULL, EVENT_QUEUE_EXTENT))
+  if (::init_queue(&queue, EVENT_QUEUE_INITIAL_SIZE , 0 /*offset*/,
+                   0 /*max_on_top*/, event_queue_element_compare_q,
+                   NullS, 0, EVENT_QUEUE_EXTENT))
   {
     sql_print_error("Event Scheduler: Can't initialize the execution queue");
     goto err;
@@ -325,11 +325,13 @@ void
 Event_queue::drop_matching_events(THD *thd, LEX_STRING pattern,
                            bool (*comparator)(LEX_STRING, Event_basic *))
 {
-  uint i= 0;
+  uint i;
   DBUG_ENTER("Event_queue::drop_matching_events");
   DBUG_PRINT("enter", ("pattern=%s", pattern.str));
 
-  while (i < queue.elements)
+  for (i= queue_first_element(&queue) ;
+       i <= queue_last_element(&queue) ;
+       )
   {
     Event_queue_element *et= (Event_queue_element *) queue_element(&queue, i);
     DBUG_PRINT("info", ("[%s.%s]?", et->dbname.str, et->name.str));
@@ -339,7 +341,8 @@ Event_queue::drop_matching_events(THD *thd, LEX_STRING pattern,
         The queue is ordered. If we remove an element, then all elements
         after it will shift one position to the left, if we imagine it as
         an array from left to the right. In this case we should not
-        increment the counter and the (i < queue.elements) condition is ok.
+        increment the counter and the (i <= queue_last_element()) condition
+        is ok.
       */
       queue_remove(&queue, i);
       delete et;
@@ -403,7 +406,9 @@ Event_queue::find_n_remove_event(LEX_STRING db, LEX_STRING name)
   uint i;
   DBUG_ENTER("Event_queue::find_n_remove_event");
 
-  for (i= 0; i < queue.elements; ++i)
+  for (i= queue_first_element(&queue);
+       i <= queue_last_element(&queue);
+       i++)
   {
     Event_queue_element *et= (Event_queue_element *) queue_element(&queue, i);
     DBUG_PRINT("info", ("[%s.%s]==[%s.%s]?", db.str, name.str,
@@ -441,7 +446,9 @@ Event_queue::recalculate_activation_times(THD *thd)
 
   LOCK_QUEUE_DATA();
   DBUG_PRINT("info", ("%u loaded events to be recalculated", queue.elements));
-  for (i= 0; i < queue.elements; i++)
+  for (i= queue_first_element(&queue);
+       i <= queue_last_element(&queue);
+       i++)
   {
     ((Event_queue_element*)queue_element(&queue, i))->compute_next_execution_time();
     ((Event_queue_element*)queue_element(&queue, i))->update_timing_fields(thd);
@@ -454,16 +461,19 @@ Event_queue::recalculate_activation_times(THD *thd)
     have removed all. The queue has been ordered in a way the disabled
     events are at the end.
   */
-  for (i= queue.elements; i > 0; i--)
+  for (i= queue_last_element(&queue);
+       (int) i >= (int) queue_first_element(&queue);
+       i--)
   {
-    Event_queue_element *element = (Event_queue_element*)queue_element(&queue, i - 1);
+    Event_queue_element *element=
+      (Event_queue_element*)queue_element(&queue, i);
     if (element->status != Event_parse_data::DISABLED)
       break;
     /*
       This won't cause queue re-order, because we remove
       always the last element.
     */
-    queue_remove(&queue, i - 1);
+    queue_remove(&queue, i);
     delete element;
   }
   UNLOCK_QUEUE_DATA();
@@ -499,7 +509,9 @@ Event_queue::empty_queue()
   sql_print_information("Event Scheduler: Purging the queue. %u events",
                         queue.elements);
   /* empty the queue */
-  for (i= 0; i < queue.elements; ++i)
+  for (i= queue_first_element(&queue);
+       i <= queue_last_element(&queue);
+       i++)
   {
     Event_queue_element *et= (Event_queue_element *) queue_element(&queue, i);
     delete et;
@@ -525,7 +537,9 @@ Event_queue::dbug_dump_queue(time_t now)
   uint i;
   DBUG_ENTER("Event_queue::dbug_dump_queue");
   DBUG_PRINT("info", ("Dumping queue . Elements=%u", queue.elements));
-  for (i = 0; i < queue.elements; i++)
+  for (i= queue_first_element(&queue);
+       i <= queue_last_element(&queue);
+       i++)
   {
     et= ((Event_queue_element*)queue_element(&queue, i));
     DBUG_PRINT("info", ("et: 0x%lx  name: %s.%s", (long) et,
@@ -592,7 +606,7 @@ Event_queue::get_top_for_execution_if_time(THD *thd,
       continue;
     }
 
-    top= ((Event_queue_element*) queue_element(&queue, 0));
+    top= (Event_queue_element*) queue_top(&queue);
 
     thd->set_current_time(); /* Get current time */
 
@@ -634,10 +648,10 @@ Event_queue::get_top_for_execution_if_time(THD *thd,
                             top->dbname.str, top->name.str,
                             top->dropped? "Dropping.":"");
       delete top;
-      queue_remove(&queue, 0);
+      queue_remove_top(&queue);
     }
     else
-      queue_replaced(&queue);
+      queue_replace_top(&queue);
 
     dbug_dump_queue(thd->query_start());
     break;
diff --git a/sql/event_scheduler.cc b/sql/event_scheduler.cc
index 4d6636eedb2..4d6636eedb2 100755..100644
--- a/sql/event_scheduler.cc
+++ b/sql/event_scheduler.cc
diff --git a/sql/field.cc b/sql/field.cc
index 13162fcb0d6..461d6c1eda2 100644
--- a/sql/field.cc
+++ b/sql/field.cc
@@ -1706,8 +1706,8 @@ my_decimal *Field_str::val_decimal(my_decimal *decimal_value)
 uint Field::fill_cache_field(CACHE_FIELD *copy)
 {
   uint store_length;
-  copy->str=ptr;
-  copy->length=pack_length();
+  copy->str= ptr;
+  copy->length= pack_length();
   copy->field= this;
   if (flags & BLOB_FLAG)
   {
@@ -1719,9 +1719,15 @@ uint Field::fill_cache_field(CACHE_FIELD *copy)
            (type() == MYSQL_TYPE_STRING && copy->length >= 4 &&
             copy->length < 256))
   {
-    copy->type= CACHE_STRIPPED;
+    copy->type= CACHE_STRIPPED;			    /* Remove end space */
     store_length= 2;
   }
+  else if (type() ==  MYSQL_TYPE_VARCHAR)
+  {
+    copy->type= pack_length()-row_pack_length() == 1 ? CACHE_VARSTR1:
+                                                      CACHE_VARSTR2;
+    store_length= 0;
+  }
   else
   {
     copy->type= 0;
diff --git a/sql/field.h b/sql/field.h
index c205c9b5582..d695479f197 100644
--- a/sql/field.h
+++ b/sql/field.h
@@ -590,6 +590,11 @@ public:
   }
   /* Hash value */
   virtual void hash(ulong *nr, ulong *nr2);
+
+  /* Check whether the field can be used as a join attribute in hash join */
+  virtual bool hash_join_is_possible() { return TRUE; }
+  virtual bool eq_cmp_as_binary() { return TRUE; }
+
   friend bool reopen_table(THD *,struct st_table *,bool);
   friend int cre_myisam(char * name, register TABLE *form, uint options,
 			ulonglong auto_increment_value);
@@ -765,6 +770,7 @@ public:
   my_decimal *val_decimal(my_decimal *);
   virtual bool str_needs_quotes() { return TRUE; }
   uint is_equal(Create_field *new_field);
+  bool eq_cmp_as_binary() { return test(flags & BINARY_FLAG); }
 };
 
 
@@ -1909,6 +1915,7 @@ public:
   uint size_of() const { return sizeof(*this); }
   int  reset(void) { return !maybe_null() || Field_blob::reset(); }
   geometry_type get_geometry_type() { return geom_type; };
+  bool hash_join_is_possible() { return FALSE; }
 };
 #endif /*HAVE_SPATIAL*/
 
diff --git a/sql/field_conv.cc b/sql/field_conv.cc
index d934e8d007a..3e52e67f2d9 100644
--- a/sql/field_conv.cc
+++ b/sql/field_conv.cc
@@ -449,7 +449,8 @@ static void do_varstring1(Copy_field *copy)
   if (length > copy->to_length- 1)
   {
     length=copy->to_length - 1;
-    if (copy->from_field->table->in_use->count_cuted_fields)
+    if (copy->from_field->table->in_use->count_cuted_fields &&
+        copy->to_field)
       copy->to_field->set_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
                                   WARN_DATA_TRUNCATED, 1);
   }
@@ -485,7 +486,8 @@ static void do_varstring2(Copy_field *copy)
   if (length > copy->to_length- HA_KEY_BLOB_LENGTH)
   {
     length=copy->to_length-HA_KEY_BLOB_LENGTH;
-    if (copy->from_field->table->in_use->count_cuted_fields)
+    if (copy->from_field->table->in_use->count_cuted_fields &&
+        copy->to_field)
       copy->to_field->set_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
                                   WARN_DATA_TRUNCATED, 1);
   }
@@ -549,9 +551,9 @@ void Copy_field::set(uchar *to,Field *from)
       do_copy=	  do_field_to_null_str;
   }
   else
-  {
+  { 
     to_null_ptr=  0;				// For easy debugging
-    do_copy=	  do_field_eq;
+    do_copy= do_field_eq;
   }
 }
 
@@ -710,6 +712,9 @@ Copy_field::get_copy_func(Field *to,Field *from)
                                                     do_varstring1_mb) :
                   (from->charset()->mbmaxlen == 1 ? do_varstring2 :
                                                     do_varstring2_mb));
+        else 
+          return  (((Field_varstring*) from)->length_bytes == 1 ?
+                    do_varstring1 : do_varstring2);
       }
       else if (to_length < from_length)
 	return (from->charset()->mbmaxlen == 1 ?
diff --git a/sql/filesort.cc b/sql/filesort.cc
index 3f2e2e55046..6e3bf27afcc 100644
--- a/sql/filesort.cc
+++ b/sql/filesort.cc
@@ -50,10 +50,6 @@ static int write_keys(SORTPARAM *param,uchar * *sort_keys,
 		      uint count, IO_CACHE *buffer_file, IO_CACHE *tempfile);
 static void make_sortkey(SORTPARAM *param,uchar *to, uchar *ref_pos);
 static void register_used_fields(SORTPARAM *param);
-static int merge_index(SORTPARAM *param,uchar *sort_buffer,
-		       BUFFPEK *buffpek,
-		       uint maxbuffer,IO_CACHE *tempfile,
-		       IO_CACHE *outfile);
 static bool save_index(SORTPARAM *param,uchar **sort_keys, uint count, 
                        FILESORT_INFO *table_sort);
 static uint suffix_length(ulong string_length);
@@ -145,8 +141,6 @@ ha_rows filesort(THD *thd, TABLE *table, SORT_FIELD *sortorder, uint s_length,
   /* filesort cannot handle zero-length records. */
   DBUG_ASSERT(param.sort_length);
   param.ref_length= table->file->ref_length;
-  param.addon_field= 0;
-  param.addon_length= 0;
   if (!(table->file->ha_table_flags() & HA_FAST_KEY_READ) &&
       !table->fulltext_searched && !sort_positions)
   {
@@ -543,11 +537,6 @@ static ha_rows find_all_keys(SORTPARAM *param, SQL_SELECT *select,
 		    current_thd->variables.read_buff_size);
   }
 
-  if (quick_select)
-  {
-    if (select->quick->reset())
-      DBUG_RETURN(HA_POS_ERROR);
-  }
 
   /* Remember original bitmaps */
   save_read_set=  sort_form->read_set;
@@ -561,9 +550,19 @@ static ha_rows find_all_keys(SORTPARAM *param, SQL_SELECT *select,
   if (select && select->cond)
     select->cond->walk(&Item::register_field_in_read_map, 1,
                        (uchar*) sort_form);
+  if (select && select->pre_idx_push_select_cond)
+    select->pre_idx_push_select_cond->walk(&Item::register_field_in_read_map,
+                                           1, (uchar*) sort_form);
   sort_form->column_bitmaps_set(&sort_form->tmp_set, &sort_form->tmp_set, 
                                 &sort_form->tmp_set);
 
+
+  if (quick_select)
+  {
+    if (select->quick->reset())
+      DBUG_RETURN(HA_POS_ERROR);
+  }
+
   for (;;)
   {
     if (quick_select)
@@ -1161,7 +1160,9 @@ uint read_to_buffer(IO_CACHE *fromfile, BUFFPEK *buffpek,
 void reuse_freed_buff(QUEUE *queue, BUFFPEK *reuse, uint key_length)
 {
   uchar *reuse_end= reuse->base + reuse->max_keys * key_length;
-  for (uint i= 0; i < queue->elements; ++i)
+  for (uint i= queue_first_element(queue);
+       i <= queue_last_element(queue);
+       i++)
   {
     BUFFPEK *bp= (BUFFPEK *) queue_element(queue, i);
     if (bp->base + bp->max_keys * key_length == reuse->base)
@@ -1214,8 +1215,11 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
   QUEUE queue;
   qsort2_cmp cmp;
   void *first_cmp_arg;
-  volatile THD::killed_state *killed= &current_thd->killed;
+  element_count dupl_count= 0;
+  uchar *src;
   THD::killed_state not_killable;
+  uchar *unique_buff= param->unique_buff;
+  volatile THD::killed_state *killed= &current_thd->killed;
   DBUG_ENTER("merge_buffers");
 
   status_var_increment(current_thd->status_var.filesort_merge_passes);
@@ -1230,7 +1234,13 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
   rec_length= param->rec_length;
   res_length= param->res_length;
   sort_length= param->sort_length;
-  offset= rec_length-res_length;
+  uint dupl_count_ofs= rec_length-sizeof(element_count);
+  uint min_dupl_count= param->min_dupl_count;
+  bool check_dupl_count= flag && min_dupl_count;
+  offset= (rec_length-
+           (flag && min_dupl_count ? sizeof(dupl_count) : 0)-res_length);
+  uint wr_len= flag ? res_length : rec_length;
+  uint wr_offset= flag ? offset : 0;
   maxcount= (ulong) (param->keys/((uint) (Tb-Fb) +1));
   to_start_filepos= my_b_tell(to_file);
   strpos= sort_buffer;
@@ -1239,7 +1249,7 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
   /* The following will fire if there is not enough space in sort_buffer */
   DBUG_ASSERT(maxcount!=0);
   
-  if (param->unique_buff)
+  if (unique_buff)
   {
     cmp= param->compare;
     first_cmp_arg= (void *) &param->cmp_context;
@@ -1250,7 +1260,7 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
     first_cmp_arg= (void*) &sort_length;
   }
   if (init_queue(&queue, (uint) (Tb-Fb)+1, offsetof(BUFFPEK,key), 0,
-                 (queue_compare) cmp, first_cmp_arg))
+                 (queue_compare) cmp, first_cmp_arg, 0, 0))
     DBUG_RETURN(1);                                /* purecov: inspected */
   for (buffpek= Fb ; buffpek <= Tb ; buffpek++)
   {
@@ -1264,30 +1274,31 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
     queue_insert(&queue, (uchar*) buffpek);
   }
 
-  if (param->unique_buff)
+  if (unique_buff)
   {
     /* 
        Called by Unique::get()
-       Copy the first argument to param->unique_buff for unique removal.
+       Copy the first argument to unique_buff for unique removal.
        Store it also in 'to_file'.
-
-       This is safe as we know that there is always more than one element
-       in each block to merge (This is guaranteed by the Unique:: algorithm
     */
     buffpek= (BUFFPEK*) queue_top(&queue);
-    memcpy(param->unique_buff, buffpek->key, rec_length);
-    if (my_b_write(to_file, (uchar*) buffpek->key, rec_length))
-    {
-      error=1; goto err;                        /* purecov: inspected */
-    }
+    memcpy(unique_buff, buffpek->key, rec_length);
+    if (min_dupl_count)
+      memcpy(&dupl_count, unique_buff+dupl_count_ofs, 
+             sizeof(dupl_count));
     buffpek->key+= rec_length;
-    buffpek->mem_count--;
-    if (!--max_rows)
+    if (! --buffpek->mem_count)
     {
-      error= 0;                                       /* purecov: inspected */
-      goto end;                                       /* purecov: inspected */
+      if (!(error= (int) read_to_buffer(from_file, buffpek,
+                                        rec_length)))
+      {
+        VOID(queue_remove_top(&queue));  /* top element, as in the main loop */
+        reuse_freed_buff(&queue, buffpek, rec_length);
+      }
+      else if (error == -1)
+        goto err;                        /* purecov: inspected */ 
     }
-    queue_replaced(&queue);                        // Top element has been used
+    queue_replace_top(&queue);            // Top element has been used
   }
   else
     cmp= 0;                                        // Not unique
@@ -1301,27 +1312,50 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
     for (;;)
     {
       buffpek= (BUFFPEK*) queue_top(&queue);
+      src= buffpek->key;
       if (cmp)                                        // Remove duplicates
       {
-        if (!(*cmp)(first_cmp_arg, &(param->unique_buff),
+        if (!(*cmp)(first_cmp_arg, &unique_buff,
                     (uchar**) &buffpek->key))
-              goto skip_duplicate;
-            memcpy(param->unique_buff, (uchar*) buffpek->key, rec_length);
-      }
-      if (flag == 0)
-      {
-        if (my_b_write(to_file,(uchar*) buffpek->key, rec_length))
-        {
-          error=1; goto err;                        /* purecov: inspected */
+	{
+          if (min_dupl_count)
+	  {
+            element_count cnt;
+            memcpy(&cnt, (uchar *) buffpek->key+dupl_count_ofs, sizeof(cnt));
+            dupl_count+= cnt;
+          }
+          goto skip_duplicate;
         }
+        if (min_dupl_count)
+	{
+          memcpy(unique_buff+dupl_count_ofs, &dupl_count,
+                 sizeof(dupl_count));
+        }
+	src= unique_buff;
       }
-      else
+        
+      /* 
+        Do not write into the output file if this is the final merge called
+        for a Unique object used for intersection and dupl_count is less
+        than min_dupl_count.
+        If the Unique object is used to intersect N sets of unique elements
+        then for any element:
+        dupl_count >= N <=> the element occurs in each of these N sets.
+      */          
+      if (!check_dupl_count || dupl_count >= min_dupl_count)
       {
-        if (my_b_write(to_file, (uchar*) buffpek->key+offset, res_length))
+        if (my_b_write(to_file, src+wr_offset, wr_len))
         {
           error=1; goto err;                        /* purecov: inspected */
         }
       }
+      if (cmp)
+      {   
+        memcpy(unique_buff, (uchar*) buffpek->key, rec_length);
+        if (min_dupl_count)
+          memcpy(&dupl_count, unique_buff+dupl_count_ofs, 
+                 sizeof(dupl_count));
+      }
       if (!--max_rows)
       {
         error= 0;                               /* purecov: inspected */
@@ -1332,17 +1366,17 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
       buffpek->key+= rec_length;
       if (! --buffpek->mem_count)
       {
-        if (!(error= (int) read_to_buffer(from_file,buffpek,
+        if (!(error= (int) read_to_buffer(from_file, buffpek,
                                           rec_length)))
         {
-          VOID(queue_remove(&queue,0));
+          VOID(queue_remove_top(&queue));
           reuse_freed_buff(&queue, buffpek, rec_length);
           break;                        /* One buffer have been removed */
         }
         else if (error == -1)
           goto err;                        /* purecov: inspected */
       }
-      queue_replaced(&queue);              /* Top element has been replaced */
+      queue_replace_top(&queue);   	/* Top element has been replaced */
     }
   }
   buffpek= (BUFFPEK*) queue_top(&queue);
@@ -1355,11 +1389,35 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
   */
   if (cmp)
   {
-    if (!(*cmp)(first_cmp_arg, &(param->unique_buff), (uchar**) &buffpek->key))
+    if (!(*cmp)(first_cmp_arg, &unique_buff, (uchar**) &buffpek->key))
     {
-      buffpek->key+= rec_length;         // Remove duplicate
+      if (min_dupl_count)
+      {
+        element_count cnt;
+        memcpy(&cnt, (uchar *) buffpek->key+dupl_count_ofs, sizeof(cnt));
+        dupl_count+= cnt;
+      }
+      buffpek->key+= rec_length;         
       --buffpek->mem_count;
     }
+
+    if (min_dupl_count)
+      memcpy(unique_buff+dupl_count_ofs, &dupl_count,
+             sizeof(dupl_count));
+
+    if (!check_dupl_count || dupl_count >= min_dupl_count)
+    {
+      src= unique_buff;
+      if (my_b_write(to_file, src+wr_offset, wr_len))
+      {
+        error=1; goto err;                        /* purecov: inspected */
+      }
+      if (!--max_rows)
+      {
+        error= 0;                               
+        goto end;                             
+      }
+    }   
   }
 
   do
@@ -1372,7 +1430,7 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
     max_rows-= buffpek->mem_count;
     if (flag == 0)
     {
-      if (my_b_write(to_file,(uchar*) buffpek->key,
+      if (my_b_write(to_file, (uchar*) buffpek->key,
                      (rec_length*buffpek->mem_count)))
       {
         error= 1; goto err;                        /* purecov: inspected */
@@ -1381,19 +1439,25 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
     else
     {
       register uchar *end;
-      strpos= buffpek->key+offset;
-      for (end= strpos+buffpek->mem_count*rec_length ;
-           strpos != end ;
-           strpos+= rec_length)
-      {     
-        if (my_b_write(to_file, strpos, res_length))
+      src= buffpek->key+offset;
+      for (end= src+buffpek->mem_count*rec_length ;
+           src != end ;
+           src+= rec_length)
+      {
+        if (check_dupl_count)
+        {
+          memcpy((uchar *) &dupl_count, src+dupl_count_ofs, sizeof(dupl_count)); 
+          if (dupl_count < min_dupl_count)
+	    continue;
+        }
+        if (my_b_write(to_file, src, wr_len))
         {
           error=1; goto err;                        
         }
       }
     }
   }
-  while ((error=(int) read_to_buffer(from_file,buffpek, rec_length))
+  while ((error=(int) read_to_buffer(from_file, buffpek, rec_length))
          != -1 && error != 0);
 
 end:
@@ -1407,7 +1471,7 @@ err:
 
 	/* Do a merge to output-file (save only positions) */
 
-static int merge_index(SORTPARAM *param, uchar *sort_buffer,
+int merge_index(SORTPARAM *param, uchar *sort_buffer,
 		       BUFFPEK *buffpek, uint maxbuffer,
 		       IO_CACHE *tempfile, IO_CACHE *outfile)
 {
@@ -1713,3 +1777,4 @@ void change_double_for_sort(double nr,uchar *to)
     }
   }
 }
+
diff --git a/sql/ha_ndbcluster.cc b/sql/ha_ndbcluster.cc
index b7b7cdf1cce..a70e5aff996 100644
--- a/sql/ha_ndbcluster.cc
+++ b/sql/ha_ndbcluster.cc
@@ -8691,6 +8691,8 @@ ha_ndbcluster::null_value_index_search(KEY_MULTI_RANGE *ranges,
   DBUG_RETURN(FALSE);
 }
 
+#if 0 
+/* MRR/NDB is disabled; for details see the method declarations in ha_ndbcluster.h */
 int
 ha_ndbcluster::read_multi_range_first(KEY_MULTI_RANGE **found_range_p,
                                       KEY_MULTI_RANGE *ranges, 
@@ -9063,6 +9065,7 @@ found_next:
   m_multi_range_result_ptr += reclength;
   DBUG_RETURN(0);
 }
+#endif 
 
 int
 ha_ndbcluster::setup_recattr(const NdbRecAttr* curr)
diff --git a/sql/ha_ndbcluster.h b/sql/ha_ndbcluster.h
index a17323d3fd6..7c9dba9e30e 100644
--- a/sql/ha_ndbcluster.h
+++ b/sql/ha_ndbcluster.h
@@ -259,10 +259,20 @@ class ha_ndbcluster: public handler
   /**
    * Multi range stuff
    */
+#if 0 
+  /*
+    MRR/NDB is disabled in MariaDB. This is because in MariaDB, we've
+    backported
+     - the latest version of MRR interface (BKA needs this)
+     - the latest version of DS-MRR implementation
+    but didn't backport the latest version of the MRR/NDB implementation.
+
+  */
   int read_multi_range_first(KEY_MULTI_RANGE **found_range_p,
                              KEY_MULTI_RANGE*ranges, uint range_count,
                              bool sorted, HANDLER_BUFFER *buffer);
   int read_multi_range_next(KEY_MULTI_RANGE **found_range_p);
+#endif  
   bool null_value_index_search(KEY_MULTI_RANGE *ranges,
 			       KEY_MULTI_RANGE *end_range,
 			       HANDLER_BUFFER *buffer);
diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc
index 56a4005e3e2..74ea36b5bd9 100644
--- a/sql/ha_partition.cc
+++ b/sql/ha_partition.cc
@@ -2662,7 +2662,7 @@ int ha_partition::open(const char *name, int mode, uint test_if_locked)
     Initialize priority queue, initialized to reading forward.
   */
   if ((error= init_queue(&m_queue, m_tot_parts, (uint) PARTITION_BYTES_IN_POS,
-                         0, key_rec_cmp, (void*)this)))
+                         0, key_rec_cmp, (void*)this, 0, 0)))
     goto err_handler;
 
   /*
@@ -3997,6 +3997,7 @@ int ha_partition::index_init(uint inx, bool sorted)
   m_part_spec.start_part= NO_CURRENT_PART_ID;
   m_start_key.length= 0;
   m_ordered= sorted;
+  m_ordered_scan_ongoing= FALSE;
   m_curr_key_info[0]= table->key_info+inx;
   if (m_pkey_is_clustered && table->s->primary_key != MAX_KEY)
   {
@@ -4797,7 +4798,7 @@ int ha_partition::handle_unordered_scan_next_partition(uchar * buf)
 int ha_partition::handle_ordered_index_scan(uchar *buf, bool reverse_order)
 {
   uint i;
-  uint j= 0;
+  uint j= queue_first_element(&m_queue);
   bool found= FALSE;
   DBUG_ENTER("ha_partition::handle_ordered_index_scan");
 
@@ -4813,6 +4814,12 @@ int ha_partition::handle_ordered_index_scan(uchar *buf, bool reverse_order)
     int error;
     handler *file= m_file[i];
 
+    /*
+      Reset null bits to avoid valgrind warnings and to give a default
+      value for null fields that are not read.
+    */
+    bfill(rec_buf_ptr, table->s->null_bytes, 255);
+
     switch (m_index_scan_type) {
     case partition_index_read:
       error= file->ha_index_read_map(rec_buf_ptr,
@@ -4865,7 +4872,7 @@ int ha_partition::handle_ordered_index_scan(uchar *buf, bool reverse_order)
     */
     queue_set_max_at_top(&m_queue, reverse_order);
     queue_set_cmp_arg(&m_queue, (void*)m_curr_key_info);
-    m_queue.elements= j;
+    m_queue.elements= j - queue_first_element(&m_queue);
     queue_fix(&m_queue);
     return_top_record(buf);
     table->status= 0;
@@ -4936,7 +4943,7 @@ int ha_partition::handle_ordered_next(uchar *buf, bool is_next_same)
     if (error == HA_ERR_END_OF_FILE)
     {
       /* Return next buffered row */
-      queue_remove(&m_queue, (uint) 0);
+      queue_remove_top(&m_queue);
       if (m_queue.elements)
       {
          DBUG_PRINT("info", ("Record returned from partition %u (2)",
@@ -4948,7 +4955,7 @@ int ha_partition::handle_ordered_next(uchar *buf, bool is_next_same)
     }
     DBUG_RETURN(error);
   }
-  queue_replaced(&m_queue);
+  queue_replace_top(&m_queue);
   return_top_record(buf);
   DBUG_PRINT("info", ("Record returned from partition %u", m_top_entry));
   DBUG_RETURN(0);
@@ -4979,7 +4986,7 @@ int ha_partition::handle_ordered_prev(uchar *buf)
   {
     if (error == HA_ERR_END_OF_FILE)
     {
-      queue_remove(&m_queue, (uint) 0);
+      queue_remove_top(&m_queue);
       if (m_queue.elements)
       {
 	return_top_record(buf);
@@ -4991,7 +4998,7 @@ int ha_partition::handle_ordered_prev(uchar *buf)
     }
     DBUG_RETURN(error);
   }
-  queue_replaced(&m_queue);
+  queue_replace_top(&m_queue);
   return_top_record(buf);
   DBUG_PRINT("info", ("Record returned from partition %d", m_top_entry));
   DBUG_RETURN(0);
diff --git a/sql/handler.cc b/sql/handler.cc
index a89c3c7f05c..c5a870e77ad 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -2701,6 +2701,12 @@ void handler::print_error(int error, myf errflag)
     SET_FATAL_ERROR;
     textno=ER_KEY_NOT_FOUND;
     break;
+  case HA_ERR_ABORTED_BY_USER:
+  {
+    DBUG_ASSERT(table->in_use->killed);
+    table->in_use->send_kill_message();
+    DBUG_VOID_RETURN;
+  }
   case HA_ERR_WRONG_MRG_TABLE_DEF:
     textno=ER_WRONG_MRG_TABLE;
     break;
@@ -4301,133 +4307,6 @@ void ha_binlog_log_query(THD *thd, handlerton *hton,
 }
 #endif
 
-/**
-  Read the first row of a multi-range set.
-
-  @param found_range_p       Returns a pointer to the element in 'ranges' that
-                             corresponds to the returned row.
-  @param ranges              An array of KEY_MULTI_RANGE range descriptions.
-  @param range_count         Number of ranges in 'ranges'.
-  @param sorted	      If result should be sorted per key.
-  @param buffer              A HANDLER_BUFFER for internal handler usage.
-
-  @note
-    - Record is read into table->record[0].
-    - *found_range_p returns a valid value only if read_multi_range_first()
-    returns 0.
-    - Sorting is done within each range. If you want an overall sort, enter
-    'ranges' with sorted ranges.
-
-  @retval
-    0			OK, found a row
-  @retval
-    HA_ERR_END_OF_FILE	No rows in range
-  @retval
-    \#			Error code
-*/
-int handler::read_multi_range_first(KEY_MULTI_RANGE **found_range_p,
-                                    KEY_MULTI_RANGE *ranges, uint range_count,
-                                    bool sorted, HANDLER_BUFFER *buffer)
-{
-  int result= HA_ERR_END_OF_FILE;
-  DBUG_ENTER("handler::read_multi_range_first");
-  multi_range_sorted= sorted;
-  multi_range_buffer= buffer;
-
-  table->mark_columns_used_by_index_no_reset(active_index, table->read_set);
-  table->column_bitmaps_set(table->read_set, table->write_set);
-
-  for (multi_range_curr= ranges, multi_range_end= ranges + range_count;
-       multi_range_curr < multi_range_end;
-       multi_range_curr++)
-  {
-    result= read_range_first(multi_range_curr->start_key.keypart_map ?
-                             &multi_range_curr->start_key : 0,
-                             multi_range_curr->end_key.keypart_map ?
-                             &multi_range_curr->end_key : 0,
-                             test(multi_range_curr->range_flag & EQ_RANGE),
-                             multi_range_sorted);
-    if (result != HA_ERR_END_OF_FILE)
-      break;
-  }
-
-  *found_range_p= multi_range_curr;
-  DBUG_PRINT("exit",("result %d", result));
-  DBUG_RETURN(result);
-}
-
-
-/**
-  Read the next row of a multi-range set.
-
-  @param found_range_p       Returns a pointer to the element in 'ranges' that
-                             corresponds to the returned row.
-
-  @note
-    - Record is read into table->record[0].
-    - *found_range_p returns a valid value only if read_multi_range_next()
-    returns 0.
-
-  @retval
-    0			OK, found a row
-  @retval
-    HA_ERR_END_OF_FILE	No (more) rows in range
-  @retval
-    \#			Error code
-*/
-int handler::read_multi_range_next(KEY_MULTI_RANGE **found_range_p)
-{
-  int result= 0;
-  DBUG_ENTER("handler::read_multi_range_next");
-
-  /* We should not be called after the last call returned EOF. */
-  DBUG_ASSERT(multi_range_curr < multi_range_end);
-
-  do
-  {
-    /* Save a call if there can be only one row in range. */
-    if (multi_range_curr->range_flag != (UNIQUE_RANGE | EQ_RANGE))
-    {
-      result= read_range_next();
-
-      /* On success or non-EOF errors jump to the end. */
-      if (result != HA_ERR_END_OF_FILE)
-        break;
-    }
-    else
-    {
-      if (was_semi_consistent_read())
-        goto scan_it_again;
-      /*
-        We need to set this for the last range only, but checking this
-        condition is more expensive than just setting the result code.
-      */
-      result= HA_ERR_END_OF_FILE;
-    }
-
-    multi_range_curr++;
-scan_it_again:
-    /* Try the next range(s) until one matches a record. */
-    for (; multi_range_curr < multi_range_end; multi_range_curr++)
-    {
-      result= read_range_first(multi_range_curr->start_key.keypart_map ?
-                               &multi_range_curr->start_key : 0,
-                               multi_range_curr->end_key.keypart_map ?
-                               &multi_range_curr->end_key : 0,
-                               test(multi_range_curr->range_flag & EQ_RANGE),
-                               multi_range_sorted);
-      if (result != HA_ERR_END_OF_FILE)
-        break;
-    }
-  }
-  while ((result == HA_ERR_END_OF_FILE) &&
-         (multi_range_curr < multi_range_end));
-
-  *found_range_p= multi_range_curr;
-  DBUG_PRINT("exit",("handler::read_multi_range_next: result %d", result));
-  DBUG_RETURN(result);
-}
-
 
 /**
   Read first row between two ranges.
@@ -4531,7 +4410,7 @@ int handler::read_range_next()
 int handler::compare_key(key_range *range)
 {
   int cmp;
-  if (!range)
+  if (!range || in_range_check_pushed_down)
     return 0;					// No max range
   cmp= key_cmp(range_key_part, range->key, range->length);
   if (!cmp)
@@ -4540,6 +4419,23 @@ int handler::compare_key(key_range *range)
 }
 
 
+/*
+  Same as compare_key() but does not look at in_range_check_pushed_down,
+  i.e. the key is always compared. Used by index condition pushdown code.
+*/
+
+int handler::compare_key2(key_range *range)
+{
+  int cmp;
+  if (!range)
+    return 0;					// no max range
+  cmp= key_cmp(range_key_part, range->key, range->length);
+  if (!cmp)
+    cmp= key_compare_result_on_equal;
+  return cmp;
+}
+
+
 int handler::index_read_idx_map(uchar * buf, uint index, const uchar * key,
                                 key_part_map keypart_map,
                                 enum ha_rkey_function find_flag)
diff --git a/sql/handler.h b/sql/handler.h
index 270c7bb0be2..aabb60e4252 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -139,6 +139,9 @@
 #define HA_HAS_NEW_CHECKSUM    (LL(1) << 36)
 #define HA_CAN_VIRTUAL_COLUMNS (LL(1) << 37)
 
+#define HA_MRR_CANT_SORT       (LL(1) << 38) /* 37 is HA_CAN_VIRTUAL_COLUMNS */
+#define HA_RECORD_MUST_BE_CLEAN_ON_WRITE (LL(1) << 39)
+
 /*
   Set of all binlog flags. Currently only contain the capabilities
   flags.
@@ -154,6 +157,15 @@
 #define HA_KEYREAD_ONLY         64	/* Support HA_EXTRA_KEYREAD */
 
 /*
+  Index scan will not return records in rowid order. Not guaranteed to be
+  set for unordered (e.g. HASH) indexes.
+*/
+#define HA_KEY_SCAN_NOT_ROR     128 
+#define HA_DO_INDEX_COND_PUSHDOWN  256 /* Supports Index Condition Pushdown */
+
+
+
+/*
   bits in alter_table_flags:
 */
 /*
@@ -206,12 +218,6 @@
 #define HA_FAST_CHANGE_PARTITION                (1L << 13)
 #define HA_PARTITION_ONE_PHASE                  (1L << 14)
 
-/*
-  Index scan will not return records in rowid order. Not guaranteed to be
-  set for unordered (e.g. HASH) indexes.
-*/
-#define HA_KEY_SCAN_NOT_ROR     128 
-
 /* operations for disable/enable indexes */
 #define HA_KEY_SWITCH_NONUNIQ      0
 #define HA_KEY_SWITCH_ALL          1
@@ -1143,6 +1149,215 @@ typedef struct st_ha_check_opt
 } HA_CHECK_OPT;
 
 
+/********************************************************************************
+ * MRR
+ ********************************************************************************/
+
+typedef void *range_seq_t;
+
+typedef struct st_range_seq_if
+{
+  /*
+    Get key information
+ 
+    SYNOPSIS
+      get_key_info()
+        init_params  The seq_init_param parameter 
+        length       OUT length of the keys in this range sequence
+        map          OUT key_part_map of the keys in this range sequence
+
+    DESCRIPTION
+      This function is set only when using HA_MRR_FIXED_KEY mode. In that mode, 
+      all ranges are single-point equality ranges that use the same set of key
+      parts. This function allows the MRR implementation to get the length of
+      a key, and which keyparts it uses.
+  */
+  void (*get_key_info)(void *init_params, uint *length, key_part_map *map);
+
+  /*
+    Initialize the traversal of range sequence
+    
+    SYNOPSIS
+      init()
+        init_params  The seq_init_param parameter 
+        n_ranges     The number of ranges obtained 
+        flags        A combination of HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY
+
+    RETURN
+      An opaque value to be used as RANGE_SEQ_IF::next() parameter
+  */
+  range_seq_t (*init)(void *init_params, uint n_ranges, uint flags);
+
+
+  /*
+    Get the next range in the range sequence
+
+    SYNOPSIS
+      next()
+        seq    The value returned by RANGE_SEQ_IF::init()
+        range  OUT Information about the next range
+    
+    RETURN
+      FALSE - Ok, the range structure filled with info about the next range
+      TRUE  - No more ranges
+  */
+  bool (*next) (range_seq_t seq, KEY_MULTI_RANGE *range);
+
+  /*
+    Check whether range_info orders to skip the next record
+
+    SYNOPSIS
+      skip_record()
+        seq         The value returned by RANGE_SEQ_IF::init()
+        range_info  Information about the next range 
+                    (Ignored if HA_MRR_NO_ASSOCIATION is set)
+        rowid       Rowid of the record to be checked (ignored if set to 0)
+    
+    RETURN
+      1 - Record with this range_info and/or this rowid shall be filtered
+          out from the stream of records returned by multi_range_read_next()
+      0 - The record shall be left in the stream
+  */ 
+  bool (*skip_record) (range_seq_t seq, range_id_t range_info, uchar *rowid);
+
+  /*
+    Check if the record combination matches the index condition
+    SYNOPSIS
+      skip_index_tuple()
+        seq         The value returned by RANGE_SEQ_IF::init()
+        range_info  Information about the next range 
+    
+    RETURN
+      0 - The record combination satisfies the index condition
+      1 - Otherwise
+  */ 
+  bool (*skip_index_tuple) (range_seq_t seq, range_id_t range_info);
+} RANGE_SEQ_IF;
+
+typedef bool (*SKIP_INDEX_TUPLE_FUNC) (range_seq_t seq, range_id_t range_info);
+
+class COST_VECT  /* cost estimate split into I/O, CPU, memory and import parts */
+{ 
+public:
+  double io_count;     /* number of I/O                 */
+  double avg_io_cost;  /* cost of an average I/O oper.  */
+  double cpu_cost;     /* cost of operations in CPU     */
+  double mem_cost;     /* cost of used memory           */ 
+  double import_cost;  /* cost of remote operations     */
+  /* Weights of the individual components in total_cost() */
+  enum { IO_COEFF=1 };
+  enum { CPU_COEFF=1 };
+  enum { MEM_COEFF=1 };
+  enum { IMPORT_COEFF=1 };
+
+  COST_VECT() {}          // keep gcc happy; members are NOT initialized here
+  /* Weighted sum of all cost components */
+  double total_cost() 
+  {
+    return IO_COEFF*io_count*avg_io_cost + CPU_COEFF * cpu_cost +
+           MEM_COEFF*mem_cost + IMPORT_COEFF*import_cost;
+  }
+  /* Reset all costs to 0 and the average I/O cost to 1.0 */
+  void zero()
+  {
+    avg_io_cost= 1.0;
+    io_count= cpu_cost= mem_cost= import_cost= 0.0;
+  }
+  /* Scale io_count, cpu_cost and import_cost by m */
+  void multiply(double m)
+  {
+    io_count *= m;
+    cpu_cost *= m;
+    import_cost *= m;
+    /* Don't multiply mem_cost */
+  }
+  /* Add the I/O and CPU parts of *cost (mem_cost/import_cost are not added) */
+  void add(const COST_VECT* cost)
+  {
+    add_io(cost->io_count, cost->avg_io_cost);  /* updates io_count as well */
+    cpu_cost += cost->cpu_cost;
+  }
+  /* Add add_io_cnt I/Os of average cost add_avg_cost and re-average */
+  void add_io(double add_io_cnt, double add_avg_cost)
+  {
+    double io_count_sum= io_count + add_io_cnt;
+    if (io_count_sum > 0)         /* else 0/0 would make avg_io_cost NaN */
+      avg_io_cost= (io_count * avg_io_cost + 
+                    add_io_cnt * add_avg_cost) / io_count_sum;
+    io_count= io_count_sum;
+  }
+
+  /*
+    To be used when we go from old single value-based cost calculations to
+    the new COST_VECT-based: the old cost becomes io_count with avg. cost 1.0.
+  */
+  void convert_from_cost(double cost)
+  {
+    zero();
+    avg_io_cost= 1.0;
+    io_count= cost;
+  }
+};
+
+void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, 
+                         COST_VECT *cost);
+
+/*
+  Indicates that all scanned ranges will be singlepoint (aka equality) ranges.
+  The ranges may not use the full key but all of them will use the same number
+  of key parts.
+*/
+#define HA_MRR_SINGLE_POINT 1
+#define HA_MRR_FIXED_KEY  2
+
+/* 
+  Indicates that RANGE_SEQ_IF::next(&range) doesn't need to fill in the
+  'range' parameter.
+*/
+#define HA_MRR_NO_ASSOCIATION 4
+
+/* 
+  The MRR user will provide ranges in key order, and MRR implementation
+  must return rows in key order.
+*/
+#define HA_MRR_SORTED 8
+
+/* MRR implementation doesn't have to retrieve full records */
+#define HA_MRR_INDEX_ONLY 16
+
+/* 
+  The passed memory buffer is of maximum possible size, the caller can't
+  assume larger buffer.
+*/
+#define HA_MRR_LIMITS 32
+
+
+/*
+  Flag set <=> default MRR implementation is used
+  (The choice is made by the **_info[_const]() function, which may set this
+   flag. The SQL layer remembers the flag value and then passes it to
+   multi_range_read_init().)
+*/
+#define HA_MRR_USE_DEFAULT_IMPL 64
+
+/*
+  Used only as parameter to multi_range_read_info():
+  Flag set <=> the caller guarantees that the bounds of the scanned ranges
+  will not have NULL values.
+*/
+#define HA_MRR_NO_NULL_ENDPOINTS 128
+
+/*
+  The MRR user has materialized range keys somewhere in the user's buffer.
+  This can be used for optimization of the procedure that sorts these keys
+  since in this case key values don't have to be copied into the MRR buffer.
+
+  In other words, it is guaranteed that after RANGE_SEQ_IF::next() call the 
+  pointer in range->start_key.key will point to a key value that will remain 
+  there until the end of the MRR scan.
+*/
+#define HA_MRR_MATERIALIZED_KEYS 256
+
 
 /*
   This is a buffer area that the handler can use to store rows.
@@ -1153,8 +1368,8 @@ typedef struct st_ha_check_opt
 
 typedef struct st_handler_buffer
 {
-  const uchar *buffer;         /* Buffer one can start using */
-  const uchar *buffer_end;     /* End of buffer */
+  /* const? */uchar *buffer;         /* Buffer one can start using */
+  /* const? */uchar *buffer_end;     /* End of buffer */
   uchar *end_of_used_area;     /* End of area that was used by handler */
 } HANDLER_BUFFER;
 
@@ -1185,11 +1400,16 @@ public:
   time_t update_time;
   uint block_size;			/* index block size */
 
+  /*
+    Number of buffer bytes that the native MRR implementation needs.
+  */
+  uint mrr_length_per_rec; 
+
   ha_statistics():
     data_file_length(0), max_data_file_length(0),
     index_file_length(0), delete_length(0), auto_increment_value(0),
     records(0), deleted(0), mean_rec_length(0), create_time(0),
-    check_time(0), update_time(0), block_size(0)
+    check_time(0), update_time(0), block_size(0), mrr_length_per_rec(0)
   {}
 };
 
@@ -1228,11 +1448,18 @@ public:
 
   ha_statistics stats;
 
-  /** The following are for read_multi_range */
-  bool multi_range_sorted;
-  KEY_MULTI_RANGE *multi_range_curr;
-  KEY_MULTI_RANGE *multi_range_end;
-  HANDLER_BUFFER *multi_range_buffer;
+  /** MultiRangeRead-related members: */
+  range_seq_t mrr_iter;    /* Iterator to traverse the range sequence */
+  RANGE_SEQ_IF mrr_funcs;  /* Range sequence traversal functions */
+  HANDLER_BUFFER *multi_range_buffer; /* MRR buffer info */
+  uint ranges_in_seq; /* Total number of ranges in the traversed sequence */
+  /* TRUE <=> source MRR ranges and the output are ordered */
+  bool mrr_is_output_sorted;
+
+  /** TRUE <=> we're currently traversing a range in mrr_cur_range. */
+  bool mrr_have_range;
+  /** Current range (the one we're now returning rows from) */
+  KEY_MULTI_RANGE mrr_cur_range;
 
   /** The following are for read_range() */
   key_range save_end_range, *end_range;
@@ -1240,6 +1467,12 @@ public:
   int key_compare_result_on_equal;
   bool eq_range;
 
+  /* 
+    TRUE <=> the engine guarantees that returned records are within the range
+    being scanned.
+  */
+  bool in_range_check_pushed_down;
+
   uint errkey;				/* Last dup key */
   uint key_used_on_scan;
   uint active_index;
@@ -1252,6 +1485,8 @@ public:
   bool mark_trx_done;
   bool cloned;                          /* 1 if this was created with clone */
   const COND *pushed_cond;
+  Item *pushed_idx_cond;
+  uint pushed_idx_cond_keyno;  /* The index which the above condition is for */
   /**
     next_insert_id is the next value which should be inserted into the
     auto_increment column: in a inserting-multi-row statement (like INSERT
@@ -1291,11 +1526,14 @@ public:
   handler(handlerton *ht_arg, TABLE_SHARE *share_arg)
     :table_share(share_arg), table(0),
     estimation_rows_to_insert(0), ht(ht_arg),
-    ref(0), key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
+    ref(0), in_range_check_pushed_down(FALSE),
+    key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
     ref_length(sizeof(my_off_t)),
     ft_handler(0), inited(NONE),
     locked(FALSE), implicit_emptied(FALSE), mark_trx_done(FALSE), cloned(0),
-    pushed_cond(0), next_insert_id(0), insert_id_for_cur_row(0),
+    pushed_cond(0), pushed_idx_cond(NULL),
+    pushed_idx_cond_keyno(MAX_KEY),
+    next_insert_id(0), insert_id_for_cur_row(0),
     auto_inc_intervals_count(0)
     {
       reset_statistics();
@@ -1324,6 +1562,7 @@ public:
     {
       inited=       INDEX;
       active_index= idx;
+      end_range= NULL;
     }
     DBUG_RETURN(result);
   }
@@ -1333,6 +1572,7 @@ public:
     DBUG_ASSERT(inited==INDEX);
     inited=       NONE;
     active_index= MAX_KEY;
+    end_range=    NULL;
     DBUG_RETURN(index_end());
   }
   /* This is called after index_init() if we need to do a index scan */
@@ -1616,16 +1856,27 @@ public:
   inline int ha_index_first(uchar * buf);
   inline int ha_index_last(uchar * buf);
   inline int ha_index_next_same(uchar *buf, const uchar *key, uint keylen);
-
-  virtual int read_multi_range_first(KEY_MULTI_RANGE **found_range_p,
-                                     KEY_MULTI_RANGE *ranges, uint range_count,
-                                     bool sorted, HANDLER_BUFFER *buffer);
-  virtual int read_multi_range_next(KEY_MULTI_RANGE **found_range_p);
+  /*
+    TODO: should we also provide non-virtual ha_func_name wrappers for these
+    functions?
+  */
+  virtual ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                              void *seq_init_param, 
+                                              uint n_ranges, uint *bufsz,
+                                              uint *flags, COST_VECT *cost);
+  virtual ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+                                        uint key_parts, uint *bufsz, 
+                                        uint *flags, COST_VECT *cost);
+  virtual int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
+                                    uint n_ranges, uint mode, 
+                                    HANDLER_BUFFER *buf);
+  virtual int multi_range_read_next(range_id_t *range_info);
   virtual int read_range_first(const key_range *start_key,
                                const key_range *end_key,
                                bool eq_range, bool sorted);
   virtual int read_range_next();
   int compare_key(key_range *range);
+  int compare_key2(key_range *range);
   virtual int ft_init() { return HA_ERR_WRONG_COMMAND; }
   void ft_end() { ft_handler=NULL; }
   virtual FT_INFO *ft_init_ext(uint flags, uint inx,String *key)
@@ -1947,6 +2198,7 @@ public:
    Pops the top if condition stack, if stack is not empty.
  */
  virtual void cond_pop() { return; };
+ virtual Item *idx_cond_push(uint keyno, Item* idx_cond) { return idx_cond; }
  virtual bool check_if_incompatible_data(HA_CREATE_INFO *create_info,
 					 uint table_changes)
  { return COMPATIBLE_DATA_NO; }
@@ -1966,6 +2218,18 @@ public:
 
   LEX_STRING *engine_name() { return hton_name(ht); }
 
+  /*
+    @brief
+    Check whether the engine supports virtual columns
+    
+    @retval
+      FALSE   if the engine does not support virtual columns    
+    @retval
+      TRUE    if the engine supports virtual columns
+  */
+  virtual bool check_if_supported_virtual_columns(void) { return FALSE;}
+  
+  TABLE* get_table() { return table; }
 protected:
   /* deprecated, don't use in new engines */
   inline void ha_statistic_increment(ulong SSV::*offset) const { }
@@ -2162,8 +2426,12 @@ public:
   /* XXX to be removed, see ha_partition::partition_ht() */
   virtual handlerton *partition_ht() const
   { return ht; }
+  inline int ha_write_tmp_row(uchar *buf);
 };
 
+#include "multi_range_read.h"
+
+bool key_uses_partial_cols(TABLE *table, uint keyno);
 
 	/* Some extern variables used with handlers */
 
diff --git a/sql/item.cc b/sql/item.cc
index 92dbff867c5..13b6484c4ce 100644
--- a/sql/item.cc
+++ b/sql/item.cc
@@ -28,6 +28,9 @@
 
 const String my_null_string("NULL", 4, default_charset_info);
 
+static int save_field_in_field(Field *from, bool *null_value,
+                               Field *to, bool no_conversions);
+
 /****************************************************************************/
 
 /* Hybrid_type_traits {_real} */
@@ -374,8 +377,8 @@ int Item::save_str_value_in_field(Field *field, String *result)
 
 
 Item::Item():
-  rsize(0), name(0), orig_name(0), name_length(0), fixed(0),
-  is_autogenerated_name(TRUE),
+  is_expensive_cache(-1), rsize(0), name(0), orig_name(0), name_length(0),
+  fixed(0), is_autogenerated_name(TRUE),
   collation(&my_charset_bin, DERIVATION_COERCIBLE)
 {
   marker= 0;
@@ -412,6 +415,7 @@ Item::Item():
   tables.
 */
 Item::Item(THD *thd, Item *item):
+  is_expensive_cache(-1),
   rsize(0),
   str_value(item->str_value),
   name(item->name),
@@ -543,6 +547,36 @@ Item* Item::transform(Item_transformer transformer, uchar *arg)
 }
 
 
+/**
+  Create and set up an expression cache for this item
+
+  @param thd             Thread handle
+  @param depends_on      List of the expression parameters
+
+  @details
+  The function wraps this item into a new Item_cache_wrapper, fixes the
+  wrapper and then calls its set_cache() for the parameters specified by
+  the 'depends_on' list. The wrapper is returned as the function result.
+
+  @returns
+  A pointer to created wrapper item if successful, NULL - otherwise
+*/
+
+Item* Item::set_expr_cache(THD *thd, List<Item *> &depends_on)
+{
+  DBUG_ENTER("Item::set_expr_cache");
+  Item_cache_wrapper *wrapper;
+  if ((wrapper= new Item_cache_wrapper(this)) &&
+      !wrapper->fix_fields(thd, (Item**)&wrapper))
+  {
+    if (wrapper->set_cache(thd, depends_on))   /* non-zero result => give up */
+      DBUG_RETURN(NULL);
+    DBUG_RETURN(wrapper);
+  }
+  DBUG_RETURN(NULL);              /* no wrapper, or fix_fields() returned TRUE */
+}
+
+
 Item_ident::Item_ident(Name_resolution_context *context_arg,
                        const char *db_name_arg,const char *table_name_arg,
 		       const char *field_name_arg)
@@ -551,7 +585,7 @@ Item_ident::Item_ident(Name_resolution_context *context_arg,
    db_name(db_name_arg), table_name(table_name_arg),
    field_name(field_name_arg),
    alias_name_used(FALSE), cached_field_index(NO_CACHED_FIELD_INDEX),
-   cached_table(0), depended_from(0)
+   cached_table(0), depended_from(0), can_be_depended(TRUE)
 {
   name = (char*) field_name_arg;
 }
@@ -563,7 +597,7 @@ Item_ident::Item_ident(TABLE_LIST *view_arg, const char *field_name_arg)
    db_name(NullS), table_name(view_arg->alias),
    field_name(field_name_arg),
    alias_name_used(FALSE), cached_field_index(NO_CACHED_FIELD_INDEX),
-   cached_table(NULL), depended_from(NULL)
+   cached_table(NULL), depended_from(NULL), can_be_depended(TRUE)
 {
   name = (char*) field_name_arg;
 }
@@ -585,7 +619,8 @@ Item_ident::Item_ident(THD *thd, Item_ident *item)
    alias_name_used(item->alias_name_used),
    cached_field_index(item->cached_field_index),
    cached_table(item->cached_table),
-   depended_from(item->depended_from)
+   depended_from(item->depended_from),
+   can_be_depended(item->can_be_depended)
 {}
 
 void Item_ident::cleanup()
@@ -603,7 +638,8 @@ void Item_ident::cleanup()
   db_name= orig_db_name; 
   table_name= orig_table_name;
   field_name= orig_field_name;
-  depended_from= 0;
+  /* Remember whether this item was dependent (had depended_from set) */
+  can_be_depended= test(depended_from);
   DBUG_VOID_RETURN;
 }
 
@@ -652,6 +688,16 @@ bool Item_field::collect_item_field_processor(uchar *arg)
 }
 
 
+bool Item_field::add_field_to_set_processor(uchar *arg)
+{
+  DBUG_ENTER("Item_field::add_field_to_set_processor");
+  DBUG_PRINT("info", ("%s", field->field_name ? field->field_name : "noname"));
+  TABLE *table= (TABLE *) arg;          /* arg is the TABLE whose tmp_set is filled */
+  if (field->table == table)            /* fields of other tables are ignored */
+    bitmap_set_bit(&table->tmp_set, field->field_index);
+  DBUG_RETURN(FALSE);                   /* always returns FALSE */
+}
+
 /**
   Check if an Item_field references some field from a list of fields.
 
@@ -1421,7 +1467,7 @@ void Item::split_sum_func2(THD *thd, Item **ref_pointer_array,
   /* An item of type Item_sum  is registered <=> ref_by != 0 */ 
   if (type() == SUM_FUNC_ITEM && skip_registered && 
       ((Item_sum *) this)->ref_by)
-    return;                                                 
+    return;
   if ((type() != SUM_FUNC_ITEM && with_sum_func) ||
       (type() == FUNC_ITEM &&
        (((Item_func *) this)->functype() == Item_func::ISNOTNULLTEST_FUNC ||
@@ -1959,7 +2005,7 @@ void Item_field::reset_field(Field *f)
 bool Item_field::enumerate_field_refs_processor(uchar *arg)
 {
   Field_enumerator *fe= (Field_enumerator*)arg;
-  fe->visit_field(field);
+  fe->visit_field(this);
   return FALSE;
 }
 
@@ -2119,6 +2165,12 @@ bool Item_field::get_time(MYSQL_TIME *ltime)
   return 0;
 }
 
+void Item_field::save_result(Field *to)  /* save result_field's value into 'to' */
+{
+  save_field_in_field(result_field, &null_value, to, TRUE); /* no_conversions= TRUE */
+}
+
+
 double Item_field::val_result()
 {
   if ((null_value=result_field->is_null()))
@@ -2212,6 +2264,24 @@ table_map Item_field::used_tables() const
 }
 
 
+void Item_field::fix_after_pullout(st_select_lex *new_parent, Item **ref) /* 'ref' is unused */
+{
+  if (new_parent == depended_from)  /* presumably: no longer an outer reference */
+    depended_from= NULL;
+  Name_resolution_context *ctx= new Name_resolution_context(); /* NOTE(review): result not checked for NULL */
+  ctx->outer_context= NULL; // We don't build a complete name resolver
+  ctx->table_list= NULL;    // We rely on first_name_resolution_table instead
+  ctx->select_lex= new_parent;
+  ctx->first_name_resolution_table= context->first_name_resolution_table;
+  ctx->last_name_resolution_table=  context->last_name_resolution_table;
+  ctx->error_processor=             context->error_processor;
+  ctx->error_processor_data=        context->error_processor_data;
+  ctx->resolve_in_select_list=      context->resolve_in_select_list;
+  ctx->security_ctx=                context->security_ctx;
+  this->context=ctx;                /* the item now uses the new, partial context */
+}
+
+
 Item *Item_field::get_tmp_table_item(THD *thd)
 {
   Item_field *new_item= new Item_field(thd, this);
@@ -3566,6 +3636,15 @@ bool Item::fix_fields(THD *thd, Item **ref)
   return FALSE;
 }
 
+
+void Item_ref_null_helper::save_val(Field *to)
+{
+  DBUG_ASSERT(fixed == 1);
+  (*ref)->save_val(to);                   /* delegate to the referenced item */
+  owner->was_null|= null_value= (*ref)->null_value; /* copy NULL flag, OR it into owner->was_null */
+}
+
+
 double Item_ref_null_helper::val_real()
 {
   DBUG_ASSERT(fixed == 1);
@@ -3629,7 +3708,7 @@ bool Item_ref_null_helper::get_date(MYSQL_TIME *ltime, uint fuzzydate)
                          substitution)
 */
 
-static void mark_as_dependent(THD *thd, SELECT_LEX *last, SELECT_LEX *current,
+static bool mark_as_dependent(THD *thd, SELECT_LEX *last, SELECT_LEX *current,
                               Item_ident *resolved_item,
                               Item_ident *mark_item)
 {
@@ -3638,9 +3717,11 @@ static void mark_as_dependent(THD *thd, SELECT_LEX *last, SELECT_LEX *current,
   const char *table_name= (resolved_item->table_name ?
                            resolved_item->table_name : "");
   /* store pointer on SELECT_LEX from which item is dependent */
-  if (mark_item)
+  if (mark_item && mark_item->can_be_depended)
     mark_item->depended_from= last;
-  current->mark_as_dependent(last, resolved_item);
+  if (current->mark_as_dependent(thd, last, /** resolved_item psergey-thu
+    **/mark_item))
+    return TRUE;
   if (thd->lex->describe & DESCRIBE_EXTENDED)
   {
     push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
@@ -3650,6 +3731,7 @@ static void mark_as_dependent(THD *thd, SELECT_LEX *last, SELECT_LEX *current,
                  resolved_item->field_name,
                  current->select_number, last->select_number);
   }
+  return FALSE;
 }
 
 
@@ -4101,6 +4183,9 @@ Item_field::fix_outer_field(THD *thd, Field **from_field, Item **reference)
                             ((ref_type == REF_ITEM || ref_type == FIELD_ITEM) ?
                              (Item_ident*) (*reference) :
                              0));
+          context->select_lex->
+              register_dependency_item(last_checked_context->select_lex,
+                                       reference);
           /*
             A reference to a view field had been found and we
             substituted it instead of this Item (find_field_in_tables
@@ -4199,8 +4284,12 @@ Item_field::fix_outer_field(THD *thd, Field **from_field, Item **reference)
       return -1;
 
     mark_as_dependent(thd, last_checked_context->select_lex,
-                      context->select_lex, this,
+                      context->select_lex, rf,
                       rf);
+    context->select_lex->
+              register_dependency_item(last_checked_context->select_lex,
+                                       reference);
+
     return 0;
   }
   else
@@ -4208,6 +4297,9 @@ Item_field::fix_outer_field(THD *thd, Field **from_field, Item **reference)
     mark_as_dependent(thd, last_checked_context->select_lex,
                       context->select_lex,
                       this, (Item_ident*)*reference);
+    context->select_lex->
+              register_dependency_item(last_checked_context->select_lex,
+                                       reference);
     if (last_checked_context->select_lex->having_fix_field)
     {
       Item_ref *rf;
@@ -4514,6 +4606,7 @@ void Item_field::cleanup()
 {
   DBUG_ENTER("Item_field::cleanup");
   Item_ident::cleanup();
+  depended_from= NULL;
   /*
     Even if this object was created by direct link to field in setup_wild()
     it will be linked correctly next time by name of field and table alias.
@@ -4709,7 +4802,8 @@ bool Item_field::set_no_const_sub(uchar *arg)
   Replace an Item_field for an equal Item_field that evaluated earlier
   (if any).
 
-  The function returns a pointer to an item that is taken from
+  If this->item_equal points to some item and coincides with arg then
+  the function returns a pointer to an item that is taken from
   the very beginning of the item_equal list which the Item_field
   object refers to (belongs to) unless item_equal contains  a constant
   item. In this case the function returns this constant item, 
@@ -4717,7 +4811,7 @@ bool Item_field::set_no_const_sub(uchar *arg)
   If the Item_field object does not refer any Item_equal object
   'this' is returned .
 
-  @param arg   a dummy parameter, is not used here
+  @param arg   NULL or points to some item of the Item_equal type
 
 
   @note
@@ -4732,7 +4826,7 @@ bool Item_field::set_no_const_sub(uchar *arg)
 
 Item *Item_field::replace_equal_field(uchar *arg)
 {
-  if (item_equal)
+  if (item_equal && item_equal == (Item_equal *) arg)
   {
     Item *const_item= item_equal->get_const();
     if (const_item)
@@ -4742,8 +4836,8 @@ Item *Item_field::replace_equal_field(uchar *arg)
         return this;
       return const_item;
     }
-    Item_field *subst= item_equal->get_first();
-    if (subst && field->table != subst->field->table && !field->eq(subst->field))
+    Item_field *subst= item_equal->get_first(this);
+    if (subst && !field->eq(subst->field))
       return subst;
   }
   return this;
@@ -5072,47 +5166,69 @@ void Item_field::make_field(Send_field *tmp_field)
 
 
 /**
-  Set a field's value from a item.
-*/
+  Save a field value in another field
 
-void Item_field::save_org_in_field(Field *to)
-{
-  if (field->is_null())
-  {
-    null_value=1;
-    set_field_to_null_with_conversions(to, 1);
-  }
-  else
-  {
-    to->set_notnull();
-    field_conv(to,field);
-    null_value=0;
-  }
-}
+  @param from             Field to take the value from
+  @param [out] null_value Pointer to the null_value flag to set
+  @param to               Field to save the value in
+  @param no_conversions   How to deal with NULL value
 
-int Item_field::save_in_field(Field *to, bool no_conversions)
+  @details
+  The function takes the value of the field 'from' and, if this value
+  is not null, saves it in the field 'to' and clears the flag referenced
+  by 'null_value'. Otherwise the flag is set and the field 'to' is
+  also set to null, possibly with conversion.
+
+  @note
+  This function is used by the functions Item_field::save_in_field,
+  Item_field::save_org_in_field and Item_ref::save_in_field
+
+  @retval FALSE OK
+  @retval TRUE  Error
+
+*/
+
+static int save_field_in_field(Field *from, bool *null_value,
+                               Field *to, bool no_conversions)
 {
   int res;
-  if (result_field->is_null())
+  DBUG_ENTER("save_field_in_field");
+  if (from->is_null())
   {
-    null_value=1;
-    return set_field_to_null_with_conversions(to, no_conversions);
+    (*null_value)= 1;
+    DBUG_RETURN(set_field_to_null_with_conversions(to, no_conversions));
   }
   to->set_notnull();
 
   /*
     If we're setting the same field as the one we're reading from there's 
     nothing to do. This can happen in 'SET x = x' type of scenarios.
-  */  
-  if (to == result_field)
+  */
+  if (to == from)
   {
-    null_value=0;
-    return 0;
+    (*null_value)= 0;
+    DBUG_RETURN(0);
   }
 
-  res= field_conv(to,result_field);
-  null_value=0;
-  return res;
+  res= field_conv(to, from);
+  (*null_value)= 0;
+  DBUG_RETURN(res);
+}
+
+
+/**
+  Set a field's value from an item.
+*/
+
+void Item_field::save_org_in_field(Field *to)
+{
+  save_field_in_field(field, &null_value, to, TRUE);
+}
+
+
+int Item_field::save_in_field(Field *to, bool no_conversions)
+{
+  return save_field_in_field(result_field, &null_value, to, no_conversions);
 }
 
 
@@ -5795,6 +5911,35 @@ Item_ref::Item_ref(Name_resolution_context *context_arg,
     set_properties();
 }
 
+/*
+  A Field_enumerator-compatible class that invokes mark_as_dependent() for
+  each field that is a reference to some ancestor of current_select.
+*/
+class Dependency_marker: public Field_enumerator
+{
+public:
+  THD *thd;
+  st_select_lex *current_select;
+  virtual void visit_field(Item_field *item)
+  {
+    // Find which select the field is in. This is achieved by walking up 
+    // the select tree and looking for the table of interest.
+    st_select_lex *sel;
+    for (sel= current_select; sel; sel= sel->outer_select())
+    {
+      TABLE_LIST *tbl;
+      for (tbl= sel->leaf_tables; tbl; tbl= tbl->next_leaf)
+      {
+        if (tbl->table == item->field->table)
+        {
+          if (sel != current_select)
+            mark_as_dependent(thd, sel, current_select, item, item);
+          return;
+        }
+      }
+    }
+  }
+};
 
 Item_ref::Item_ref(TABLE_LIST *view_arg, Item **item,
                    const char *field_name_arg, bool alias_name_used_arg)
@@ -5983,6 +6128,9 @@ bool Item_ref::fix_fields(THD *thd, Item **reference)
                                 refer_type == FIELD_ITEM) ?
                                (Item_ident*) (*reference) :
                                0));
+           context->select_lex->
+              register_dependency_item(last_checked_context->select_lex,
+                                       reference);
             /*
               view reference found, we substituted it instead of this
               Item, so can quit
@@ -6032,7 +6180,10 @@ bool Item_ref::fix_fields(THD *thd, Item **reference)
           goto error;
         thd->change_item_tree(reference, fld);
         mark_as_dependent(thd, last_checked_context->select_lex,
-                          thd->lex->current_select, this, fld);
+                          thd->lex->current_select, fld, fld);
+        context->select_lex->
+              register_dependency_item(last_checked_context->select_lex,
+                                       reference);
         /*
           A reference is resolved to a nest level that's outer or the same as
           the nest level of the enclosing set function : adjust the value of
@@ -6056,6 +6207,9 @@ bool Item_ref::fix_fields(THD *thd, Item **reference)
       DBUG_ASSERT(*ref && (*ref)->fixed);
       mark_as_dependent(thd, last_checked_context->select_lex,
                         context->select_lex, this, this);
+      context->select_lex->
+              register_dependency_item(last_checked_context->select_lex,
+                                       reference);
       /*
         A reference is resolved to a nest level that's outer or the same as
         the nest level of the enclosing set function : adjust the value of
@@ -6068,6 +6222,24 @@ bool Item_ref::fix_fields(THD *thd, Item **reference)
                       last_checked_context->select_lex->nest_level);
     }
   }
+  else
+  {
+    if (depended_from && reference)
+    {
+      DBUG_ASSERT(context->select_lex != depended_from);
+      context->select_lex->register_dependency_item(depended_from, reference);
+    }
+    /*
+      It could be that we're referring to something that's in ancestor selects.
+      We must make an appropriate mark_as_dependent() call for each such
+      outside reference.
+    */
+    Dependency_marker dep_marker;
+    dep_marker.current_select= current_sel;
+    dep_marker.thd= thd;
+    (*ref)->walk(&Item::enumerate_field_refs_processor, FALSE,
+                 (uchar*)&dep_marker);
+  }
 
   DBUG_ASSERT(*ref);
   /*
@@ -6133,6 +6305,85 @@ void Item_ref::cleanup()
 }
 
 
+/**
+  Transform an Item_ref object with a transformer callback function.
+
+  The function first applies the transform method to the item
+  referenced by this Item_ref object. If this returns a new item, the
+  old item is replaced by the new one. After this the transformer
+  is applied to the Item_ref object.
+
+  @param transformer   the transformer callback function to be applied to
+                       the nodes of the tree of the object
+  @param argument      parameter to be passed to the transformer
+
+  @return Item returned as the result of transformation of the Item_ref object
+    @retval !NULL The transformation was successful
+    @retval NULL  Out of memory error
+*/
+
+Item* Item_ref::transform(Item_transformer transformer, uchar *arg)
+{
+  DBUG_ASSERT(!current_thd->is_stmt_prepare());
+  DBUG_ASSERT((*ref) != NULL);
+
+  /* Transform the object we are referencing. */
+  Item *new_item= (*ref)->transform(transformer, arg);
+  if (!new_item)
+    return NULL;
+
+  /*
+    THD::change_item_tree() should be called only if the tree was
+    really transformed, i.e. when a new item has been created.
+    Otherwise we'll be allocating a lot of unnecessary memory for
+    change records at each execution.
+  */
+  if (*ref != new_item)
+    current_thd->change_item_tree(ref, new_item);
+
+  /* Transform the item ref object. */
+  return (this->*transformer)(arg);
+}
+
+
+/**
+  Compile an Item_ref object with an analyzer and a transformer
+  callback function.
+
+  First the function applies the analyzer to the Item_ref object. Then,
+  if the analyzer succeeds, it applies the compile method to the
+  object the Item_ref object is referencing.  If this returns a new
+  item, the old item is replaced by the new one.  After this the
+  transformer is applied to the Item_ref object itself.
+
+  @param analyzer      the analyzer callback function to be applied to the
+                       nodes of the tree of the object
+  @param[in,out] arg_p parameter to be passed to the analyzer
+  @param transformer   the transformer callback function to be applied to the
+                       nodes of the tree of the object
+  @param arg_t         parameter to be passed to the transformer
+
+  @return Item returned as the result of transformation of the Item_ref object
+*/
+
+Item* Item_ref::compile(Item_analyzer analyzer, uchar **arg_p,
+                        Item_transformer transformer, uchar *arg_t)
+{
+  /* Analyze this Item object. */
+  if (!(this->*analyzer)(arg_p))
+    return NULL;
+
+  /* Compile the Item we are referencing. */
+  DBUG_ASSERT((*ref) != NULL);
+  Item *new_item= (*ref)->compile(analyzer, arg_p, transformer, arg_t);
+  if (new_item && *ref != new_item)
+    current_thd->change_item_tree(ref, new_item);
+  
+  /* Transform this Item object. */
+  return (this->*transformer)(arg_t);
+}
+
+
 void Item_ref::print(String *str, enum_query_type query_type)
 {
   if (ref)
@@ -6247,6 +6498,25 @@ bool Item_ref::val_bool_result()
 }
 
 
+void Item_ref::save_result(Field *to)
+{
+  if (result_field)
+  {
+    save_field_in_field(result_field, &null_value, to, TRUE);
+    return;
+  }
+  (*ref)->save_result(to);
+  null_value= (*ref)->null_value;
+}
+
+
+void Item_ref::save_val(Field *to)
+{
+  (*ref)->save_result(to);
+  null_value= (*ref)->null_value;
+}
+
+
 double Item_ref::val_real()
 {
   DBUG_ASSERT(fixed);
@@ -6364,6 +6634,13 @@ void Item_ref_null_helper::print(String *str, enum_query_type query_type)
 }
 
 
+void Item_direct_ref::save_val(Field *to)
+{
+  (*ref)->save_val(to);
+  null_value=(*ref)->null_value;
+}
+
+
 double Item_direct_ref::val_real()
 {
   double tmp=(*ref)->val_real();
@@ -6416,6 +6693,385 @@ bool Item_direct_ref::get_date(MYSQL_TIME *ltime,uint fuzzydate)
 }
 
 
+Item_cache_wrapper::~Item_cache_wrapper()
+{
+  delete expr_cache;
+  /* expr_value is Item so it will be destroyed from list of Items */
+}
+
+
+Item_cache_wrapper::Item_cache_wrapper(Item *item_arg)
+:orig_item(item_arg), expr_cache(NULL), expr_value(NULL)
+{
+  DBUG_ASSERT(orig_item->fixed);
+  max_length= orig_item->max_length;
+  maybe_null= orig_item->maybe_null;
+  decimals=   orig_item->decimals;
+  collation.set(orig_item->collation);
+  with_sum_func= orig_item->with_sum_func;
+  unsigned_flag= orig_item->unsigned_flag;
+  name= item_arg->name;
+  name_length= item_arg->name_length;
+
+  if ((expr_value= Item_cache::get_cache(orig_item)))
+    expr_value->setup(orig_item);
+
+  fixed= 1;
+}
+
+
+void Item_cache_wrapper::print(String *str, enum_query_type query_type)
+{
+  str->append(func_name());
+  if (expr_cache)
+    expr_cache->print(str, query_type);
+  else
+    str->append(STRING_WITH_LEN("<<DISABLED>>"));
+  str->append('(');
+  orig_item->print(str, query_type);
+  str->append(')');
+}
+
+
+/**
+  Prepare the expression cache wrapper (do nothing)
+
+  @retval FALSE OK
+*/
+
+bool Item_cache_wrapper::fix_fields(THD *thd  __attribute__((unused)),
+                                    Item **it __attribute__((unused)))
+{
+  DBUG_ASSERT(orig_item->fixed);
+  DBUG_ASSERT(fixed);
+  return FALSE;
+}
+
+
+/**
+  Clean the expression cache wrapper up before reusing it.
+*/
+
+void Item_cache_wrapper::cleanup()
+{
+  delete expr_cache;
+  expr_cache= 0;
+  // expr_value is Item so it will be destroyed from list of Items
+  expr_value= 0;
+}
+
+
+/**
+  Create an expression cache that uses a temporary table
+
+  @param thd           Thread handle
+  @param depends_on    Parameters of the expression to create cache for
+
+  @details
+  The function takes 'depends_on' as the list of all parameters for
+  the expression wrapped into this object and creates an expression
+  cache in a temporary table containing the field for the parameters
+  and the result of the expression.
+
+  @retval FALSE OK
+  @retval TRUE  Error
+*/
+
+bool Item_cache_wrapper::set_cache(THD *thd, List<Item*> &depends_on)
+{
+  DBUG_ENTER("Item_cache_wrapper::set_cache");
+  expr_cache= new Expression_cache_tmptable(thd, depends_on, expr_value);
+  DBUG_RETURN(expr_cache == NULL);
+}
+
+
+/**
+  Check if the current values of the parameters are in the expression cache
+
+  @details
+  The function checks whether the current set of the parameters of the
+  referenced item can be found in the expression cache. If so the function
+  returns the item by which the result of the expression can be easily
+  extracted from the cache with the corresponding val_* method.
+
+  @retval NULL    - parameters are not in the cache
+  @retval <item*> - item providing the result of the expression found in cache
+*/
+
+Item *Item_cache_wrapper::check_cache()
+{
+  DBUG_ENTER("Item_cache_wrapper::check_cache");
+  if (expr_cache)
+  {
+    Expression_cache_tmptable::result res;
+    Item *cached_value;
+    res= expr_cache->check_value(&cached_value);
+    if (res == Expression_cache_tmptable::HIT)
+      DBUG_RETURN(cached_value);
+  }
+  DBUG_RETURN(NULL);
+}
+
+
+/**
+  Get the value of the cached expression and put it in the cache
+*/
+
+inline void Item_cache_wrapper::cache()
+{
+  expr_value->store(orig_item);
+  expr_value->cache_value();
+  expr_cache->put_value(expr_value); // put in expr_cache
+}
+
+
+/**
+  Store the value of the possibly cached item in the field 'to'.
+*/
+
+void Item_cache_wrapper::save_val(Field *to)
+{
+  Item *cached_value;
+  DBUG_ENTER("Item_cache_wrapper::save_val");
+  if (!expr_cache)                     /* no cache: use the wrapped item */
+  {
+    orig_item->save_val(to);
+    null_value= orig_item->null_value;
+    DBUG_VOID_RETURN;
+  }
+
+  if ((cached_value= check_cache()))   /* hit: take the value from the cache */
+  {
+    cached_value->save_val(to);
+    null_value= cached_value->null_value;
+    DBUG_VOID_RETURN;
+  }
+  cache();                             /* miss: evaluate and put in the cache */
+  null_value= expr_value->null_value;
+  expr_value->save_val(to);
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Get the integer value of the possibly cached item.
+*/
+
+longlong Item_cache_wrapper::val_int()
+{
+  Item *cached_value;
+  DBUG_ENTER("Item_cache_wrapper::val_int");
+  if (!expr_cache)
+  {
+    longlong tmp= orig_item->val_int();
+    null_value= orig_item->null_value;
+    DBUG_RETURN(tmp);
+  }
+
+  if ((cached_value= check_cache()))
+  {
+    longlong tmp= cached_value->val_int();
+    null_value= cached_value->null_value;
+    DBUG_RETURN(tmp);
+  }
+  cache();
+  null_value= expr_value->null_value;
+  DBUG_RETURN(expr_value->val_int());
+}
+
+
+/**
+  Get the real value of the possibly cached item
+*/
+
+double Item_cache_wrapper::val_real()
+{
+  Item *cached_value;
+  DBUG_ENTER("Item_cache_wrapper::val_real");
+  if (!expr_cache)
+  {
+    double tmp= orig_item->val_real();
+    null_value= orig_item->null_value;
+    DBUG_RETURN(tmp);
+  }
+
+  if ((cached_value= check_cache()))
+  {
+    double tmp= cached_value->val_real();
+    null_value= cached_value->null_value;
+    DBUG_RETURN(tmp);
+  }
+  cache();
+  null_value= expr_value->null_value;
+  DBUG_RETURN(expr_value->val_real());
+}
+
+
+/**
+  Get the string value of the possibly cached item
+*/
+
+String *Item_cache_wrapper::val_str(String* str)
+{
+  Item *cached_value;
+  DBUG_ENTER("Item_cache_wrapper::val_str");
+  if (!expr_cache)
+  {
+    String *tmp= orig_item->val_str(str);
+    null_value= orig_item->null_value;
+    DBUG_RETURN(tmp);
+  }
+
+  if ((cached_value= check_cache()))
+  {
+    String *tmp= cached_value->val_str(str);
+    null_value= cached_value->null_value;
+    DBUG_RETURN(tmp);
+  }
+  cache();
+  if ((null_value= expr_value->null_value))
+    DBUG_RETURN(NULL);
+  DBUG_RETURN(expr_value->val_str(str));
+}
+
+
+/**
+  Get the decimal value of the possibly cached item
+*/
+
+my_decimal *Item_cache_wrapper::val_decimal(my_decimal* decimal_value)
+{
+  Item *cached_value;
+  DBUG_ENTER("Item_cache_wrapper::val_decimal");
+  if (!expr_cache)
+  {
+    my_decimal *tmp= orig_item->val_decimal(decimal_value);
+    null_value= orig_item->null_value;
+    DBUG_RETURN(tmp);
+  }
+
+  if ((cached_value= check_cache()))
+  {
+    my_decimal *tmp= cached_value->val_decimal(decimal_value);
+    null_value= cached_value->null_value;
+    DBUG_RETURN(tmp);
+  }
+  cache();
+  if ((null_value= expr_value->null_value))
+    DBUG_RETURN(NULL);
+  DBUG_RETURN(expr_value->val_decimal(decimal_value));
+}
+
+
+/**
+  Get the boolean value of the possibly cached item
+*/
+
+bool Item_cache_wrapper::val_bool()
+{
+  Item *cached_value;
+  DBUG_ENTER("Item_cache_wrapper::val_bool");
+  if (!expr_cache)
+  {
+    bool tmp= orig_item->val_bool();
+    null_value= orig_item->null_value;
+    DBUG_RETURN(tmp);
+  }
+
+  if ((cached_value= check_cache()))
+  {
+    bool tmp= cached_value->val_bool();
+    null_value= cached_value->null_value;
+    DBUG_RETURN(tmp);
+  }
+  cache();
+  null_value= expr_value->null_value;
+  DBUG_RETURN(expr_value->val_bool());
+}
+
+
+/**
+  Check for NULL the value of the possibly cached item
+*/
+
+bool Item_cache_wrapper::is_null()
+{
+  Item *cached_value;
+  DBUG_ENTER("Item_cache_wrapper::is_null");
+  if (!expr_cache)
+  {
+    bool tmp= orig_item->is_null();
+    null_value= orig_item->null_value;
+    DBUG_RETURN(tmp);
+  }
+
+  if ((cached_value= check_cache()))
+  {
+    bool tmp= cached_value->is_null();
+    null_value= cached_value->null_value;
+    DBUG_RETURN(tmp);
+  }
+  cache();
+  DBUG_RETURN((null_value= expr_value->null_value));
+}
+
+
+/**
+  Get the date value of the possibly cached item
+*/
+
+bool Item_cache_wrapper::get_date(MYSQL_TIME *ltime, uint fuzzydate)
+{
+  Item *cached_value;
+  DBUG_ENTER("Item_cache_wrapper::get_date");
+  if (!expr_cache)
+    DBUG_RETURN((null_value= orig_item->get_date(ltime, fuzzydate)));
+
+  if ((cached_value= check_cache()))
+    DBUG_RETURN((null_value= cached_value->get_date(ltime, fuzzydate)));
+
+  cache();
+  DBUG_RETURN((null_value= expr_value->get_date(ltime, fuzzydate)));
+}
+
+
+/**
+  Get the time value of the possibly cached item
+*/
+
+bool Item_cache_wrapper::get_time(MYSQL_TIME *ltime)
+{
+  Item *cached_value;
+  DBUG_ENTER("Item_cache_wrapper::get_time");
+  if (!expr_cache)
+    DBUG_RETURN((null_value= orig_item->get_time(ltime)));
+
+  if ((cached_value= check_cache()))
+    DBUG_RETURN((null_value= cached_value->get_time(ltime)));
+
+  cache();
+  DBUG_RETURN((null_value= expr_value->get_time(ltime)));
+}
+
+
+int Item_cache_wrapper::save_in_field(Field *to, bool no_conversions)
+{
+  int res;
+  DBUG_ASSERT(!result_field);
+  res= orig_item->save_in_field(to, no_conversions);
+  null_value= orig_item->null_value;
+  return res;
+}
+
+
+Item* Item_cache_wrapper::get_tmp_table_item(THD *thd_arg)
+{
+  if (!orig_item->with_sum_func && !orig_item->const_item())
+    return new Item_field(result_field);
+  return copy_or_same(thd_arg);
+}
+
+
 /**
   Prepare referenced field then call usual Item_direct_ref::fix_fields .
 
@@ -6484,6 +7140,23 @@ bool Item_outer_ref::fix_fields(THD *thd, Item **reference)
 }
 
 
+void Item_outer_ref::fix_after_pullout(st_select_lex *new_parent, Item **ref)
+{
+  if (depended_from == new_parent)
+  {
+    *ref= outer_ref;
+    (*ref)->fix_after_pullout(new_parent, ref);
+  }
+}
+
+void Item_ref::fix_after_pullout(st_select_lex *new_parent, Item **refptr)
+{
+  (*ref)->fix_after_pullout(new_parent, ref);
+  if (depended_from == new_parent)
+    depended_from= NULL;
+}
+
+
 /**
   Mark references from inner selects used in group by clause
 
@@ -7162,8 +7835,11 @@ void Item_cache_int::store_longlong(Item *item, longlong val_arg)
 String *Item_cache_int::val_str(String *str)
 {
   DBUG_ASSERT(fixed == 1);
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return NULL;
+  }
   str->set(value, default_charset());
   return str;
 }
@@ -7172,8 +7848,11 @@ String *Item_cache_int::val_str(String *str)
 my_decimal *Item_cache_int::val_decimal(my_decimal *decimal_val)
 {
   DBUG_ASSERT(fixed == 1);
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return NULL;
+  }
   int2my_decimal(E_DEC_FATAL_ERROR, value, unsigned_flag, decimal_val);
   return decimal_val;
 }
@@ -7181,16 +7860,22 @@ my_decimal *Item_cache_int::val_decimal(my_decimal *decimal_val)
 double Item_cache_int::val_real()
 {
   DBUG_ASSERT(fixed == 1);
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return 0.0;
+  }
   return (double) value;
 }
 
 longlong Item_cache_int::val_int()
 {
   DBUG_ASSERT(fixed == 1);
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return 0;
+  }
   return value;
 }
 
@@ -7208,16 +7893,22 @@ bool Item_cache_real::cache_value()
 double Item_cache_real::val_real()
 {
   DBUG_ASSERT(fixed == 1);
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return 0.0;
+  }
   return value;
 }
 
 longlong Item_cache_real::val_int()
 {
   DBUG_ASSERT(fixed == 1);
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return 0;
+  }
   return (longlong) rint(value);
 }
 
@@ -7225,8 +7916,11 @@ longlong Item_cache_real::val_int()
 String* Item_cache_real::val_str(String *str)
 {
   DBUG_ASSERT(fixed == 1);
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return NULL;
+  }
   str->set_real(value, decimals, default_charset());
   return str;
 }
@@ -7235,8 +7929,11 @@ String* Item_cache_real::val_str(String *str)
 my_decimal *Item_cache_real::val_decimal(my_decimal *decimal_val)
 {
   DBUG_ASSERT(fixed == 1);
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return NULL;
+  }
   double2my_decimal(E_DEC_FATAL_ERROR, value, decimal_val);
   return decimal_val;
 }
@@ -7257,8 +7954,11 @@ double Item_cache_decimal::val_real()
 {
   DBUG_ASSERT(fixed);
   double res;
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return 0.0;
+  }
   my_decimal2double(E_DEC_FATAL_ERROR, &decimal_value, &res);
   return res;
 }
@@ -7267,8 +7967,11 @@ longlong Item_cache_decimal::val_int()
 {
   DBUG_ASSERT(fixed);
   longlong res;
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return 0;
+  }
   my_decimal2int(E_DEC_FATAL_ERROR, &decimal_value, unsigned_flag, &res);
   return res;
 }
@@ -7276,8 +7979,11 @@ longlong Item_cache_decimal::val_int()
 String* Item_cache_decimal::val_str(String *str)
 {
   DBUG_ASSERT(fixed);
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return NULL;
+  }
   my_decimal_round(E_DEC_FATAL_ERROR, &decimal_value, decimals, FALSE,
                    &decimal_value);
   my_decimal2string(E_DEC_FATAL_ERROR, &decimal_value, 0, 0, 0, str);
@@ -7287,8 +7993,11 @@ String* Item_cache_decimal::val_str(String *str)
 my_decimal *Item_cache_decimal::val_decimal(my_decimal *val)
 {
   DBUG_ASSERT(fixed);
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return NULL;
+  }
   return &decimal_value;
 }
 
@@ -7323,12 +8032,13 @@ double Item_cache_str::val_real()
   DBUG_ASSERT(fixed == 1);
   int err_not_used;
   char *end_not_used;
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return 0.0;
-  if (value)
-    return my_strntod(value->charset(), (char*) value->ptr(),
-		      value->length(), &end_not_used, &err_not_used);
-  return (double) 0;
+  }
+  return my_strntod(value->charset(), (char*) value->ptr(),
+                    value->length(), &end_not_used, &err_not_used);
 }
 
 
@@ -7336,21 +8046,24 @@ longlong Item_cache_str::val_int()
 {
   DBUG_ASSERT(fixed == 1);
   int err;
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return 0;
-  if (value)
-    return my_strntoll(value->charset(), value->ptr(),
-		       value->length(), 10, (char**) 0, &err);
-  else
-    return (longlong)0;
+  }
+  return my_strntoll(value->charset(), value->ptr(),
+                     value->length(), 10, (char**) 0, &err);
 }
 
 
 String* Item_cache_str::val_str(String *str)
 {
   DBUG_ASSERT(fixed == 1);
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return 0;
+  }
   return value;
 }
 
@@ -7358,20 +8071,24 @@ String* Item_cache_str::val_str(String *str)
 my_decimal *Item_cache_str::val_decimal(my_decimal *decimal_val)
 {
   DBUG_ASSERT(fixed == 1);
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    null_value= TRUE;
     return NULL;
-  if (value)
-    string2my_decimal(E_DEC_FATAL_ERROR, value, decimal_val);
-  else
-    decimal_val= 0;
+  }
+  string2my_decimal(E_DEC_FATAL_ERROR, value, decimal_val);
   return decimal_val;
 }
 
 
 int Item_cache_str::save_in_field(Field *field, bool no_conversions)
 {
-  if (!value_cached && !cache_value())
+  if ((!value_cached && !cache_value()) || null_value)
+  {
+    field->set_notnull();
+    null_value= TRUE;
     return 0;
+  }
   int res= Item_cache::save_in_field(field, no_conversions);
   return (is_varbinary && field->type() == MYSQL_TYPE_STRING &&
           value->length() < field->field_length) ? 1 : res;
diff --git a/sql/item.h b/sql/item.h
index 28856842b99..c16e1d89001 100644
--- a/sql/item.h
+++ b/sql/item.h
@@ -506,12 +506,16 @@ public:
 	     FIELD_VARIANCE_ITEM, INSERT_VALUE_ITEM,
              SUBSELECT_ITEM, ROW_ITEM, CACHE_ITEM, TYPE_HOLDER,
              PARAM_ITEM, TRIGGER_FIELD_ITEM, DECIMAL_ITEM,
-             XPATH_NODESET, XPATH_NODESET_CMP};
+             XPATH_NODESET, XPATH_NODESET_CMP,
+             VIEW_FIXER_ITEM, EXPR_CACHE_ITEM};
 
   enum cond_result { COND_UNDEF,COND_OK,COND_TRUE,COND_FALSE };
 
   enum traverse_order { POSTFIX, PREFIX };
   
+  /* Cache of the result of is_expensive(). */
+  int8 is_expensive_cache;
+  
   /* Reuse size, only used by SP local variable assignment, otherwize 0 */
   uint rsize;
 
@@ -574,6 +578,12 @@ public:
   Field *make_string_field(TABLE *table);
   virtual bool fix_fields(THD *, Item **);
   /*
+    Fix after some tables have been pulled out. Basically re-calculate all
+    attributes that are dependent on the tables.
+  */
+  virtual void fix_after_pullout(st_select_lex *new_parent, Item **ref) {};
+
+  /*
     This method should be used in case where we are sure that we do not need
     complete fix_fields() procedure.
     Usually this method is used by the optimizer when it has to create a new
@@ -736,6 +746,17 @@ public:
   */
   virtual bool val_bool();
   virtual String *val_nodeset(String*) { return 0; }
+
+  /*
+    save_val() is a method of the val_* family which stores the value in
+    the given field.
+  */
+  virtual void save_val(Field *to) { save_org_in_field(to); }
+  /*
+    save_result() is a method of the val*result() family which stores the
+    value in the given field.
+  */
+  virtual void save_result(Field *to) { save_val(to); }
   /* Helper functions, see item_sum.cc */
   String *val_string_from_real(String *str);
   String *val_string_from_int(String *str);
@@ -928,6 +949,7 @@ public:
   virtual bool remove_fixed(uchar * arg) { fixed= 0; return 0; }
   virtual bool cleanup_processor(uchar *arg);
   virtual bool collect_item_field_processor(uchar * arg) { return 0; }
+  virtual bool add_field_to_set_processor(uchar * arg) { return 0; }
   virtual bool find_item_in_field_list_processor(uchar *arg) { return 0; }
   virtual bool change_context_processor(uchar *context) { return 0; }
   virtual bool reset_query_id_processor(uchar *query_id_arg) { return 0; }
@@ -1106,6 +1128,8 @@ public:
 
   virtual Item *neg_transformer(THD *thd) { return NULL; }
   virtual Item *update_value_transformer(uchar *select_arg) { return this; }
+  virtual Item *expr_cache_insert_transformer(uchar *thd_arg) { return this; }
+  virtual bool expr_cache_is_needed(THD *) { return FALSE; }
   virtual Item *safe_charset_converter(CHARSET_INFO *tocs);
   void delete_self()
   {
@@ -1132,15 +1156,43 @@ public:
   */
   virtual bool result_as_longlong() { return FALSE; }
   bool is_datetime();
+
+  /*
+    Test whether an expression is expensive to compute. Used during
+    optimization to avoid computing expensive expressions during this
+    phase. Also used to force temp tables when sorting on expensive
+    functions.
+    TODO:
+    Normally we should have a method:
+      cost Item::execution_cost(),
+    where 'cost' is either 'double' or some structure of various cost
+    parameters.
+
+    NOTE
+      This function is now used to prevent evaluation of materialized IN
+      subquery predicates before it is allowed. grep for 
+      DontEvaluateMaterializedSubqueryTooEarly to see the uses.
+  */
+  virtual bool is_expensive()
+  {
+    if (is_expensive_cache < 0)
+      is_expensive_cache= walk(&Item::is_expensive_processor, 0, (uchar*)0);
+    return test(is_expensive_cache);
+  }
   virtual Field::geometry_type get_geometry_type() const
     { return Field::GEOM_GEOMETRY; };
   String *check_well_formed_result(String *str, bool send_error= 0);
   bool eq_by_collation(Item *item, bool binary_cmp, CHARSET_INFO *cs); 
+
+  Item* set_expr_cache(THD *thd, List<Item*> &depends_on);
+  virtual Item *get_cached_item() { return NULL; }
 };
 
 
 /*
-  Class to be used to enumerate all field references in an item tree.
+  Class to be used to enumerate all field references in an item tree. This
+  includes references to outside but not fields of the tables within a
+  subquery.
   Suggested usage:
 
     class My_enumerator : public Field_enumerator 
@@ -1157,7 +1209,7 @@ public:
 class Field_enumerator
 {
 public:
-  virtual void visit_field(Field *field)= 0;
+  virtual void visit_field(Item_field *field)= 0;
   virtual ~Field_enumerator() {};             /* purecov: inspected */
   Field_enumerator() {}                       /* Remove gcc warning */
 };
@@ -1513,6 +1565,19 @@ public:
   */
   TABLE_LIST *cached_table;
   st_select_lex *depended_from;
+  /*
+    Some Items resolved in another select should not be marked as dependency
+    of the subquery where they are. During normal name resolution, we check
+    this. Stored procedures and prepared statements first try to resolve an
+    ident item using a cached table reference and field position from the
+    previous query execution (cached_table/cached_field_index). If the
+    tables were not changed, the ident matches the table/field, and we have
+    faster resolution of the ident without looking through all tables and
+    fields in the query. But in this case, we can not check all conditions
+    about this ident item dependency, so we should cache the condition in
+    this variable.
+  */
+  bool can_be_depended;
   Item_ident(Name_resolution_context *context_arg,
              const char *db_name_arg, const char *table_name_arg,
              const char *field_name_arg);
@@ -1595,6 +1660,7 @@ public:
   longlong val_int();
   my_decimal *val_decimal(my_decimal *);
   String *val_str(String*);
+  void save_result(Field *to);
   double val_result();
   longlong val_int_result();
   String *str_result(String* tmp);
@@ -1604,6 +1670,7 @@ public:
   bool send(Protocol *protocol, String *str_arg);
   void reset_field(Field *f);
   bool fix_fields(THD *, Item **);
+  void fix_after_pullout(st_select_lex *new_parent, Item **ref);
   void make_field(Send_field *tmp_field);
   int save_in_field(Field *field,bool no_conversions);
   void save_org_in_field(Field *field);
@@ -1634,6 +1701,7 @@ public:
   void update_null_value();
   Item *get_tmp_table_item(THD *thd);
   bool collect_item_field_processor(uchar * arg);
+  bool add_field_to_set_processor(uchar * arg);
   bool find_item_in_field_list_processor(uchar *arg);
   bool register_field_in_read_map(uchar *arg);
   bool register_field_in_bitmap(uchar *arg);
@@ -1913,8 +1981,6 @@ public:
   virtual void print(String *str, enum_query_type query_type);
   Item_num *neg ();
   uint decimal_precision() const { return max_length; }
-  bool check_partition_func_processor(uchar *bool_arg) { return FALSE;}
-  bool check_vcol_func_processor(uchar *arg) { return FALSE;}
 };
 
 
@@ -2376,6 +2442,8 @@ public:
     Item *it= ((Item *) item)->real_item();
     return ref && (*ref)->eq(it, binary_cmp);
   }
+  void save_val(Field *to);
+  void save_result(Field *to);
   double val_real();
   longlong val_int();
   my_decimal *val_decimal(my_decimal *);
@@ -2392,6 +2460,7 @@ public:
   bool send(Protocol *prot, String *tmp);
   void make_field(Send_field *field);
   bool fix_fields(THD *, Item **);
+  void fix_after_pullout(st_select_lex *new_parent, Item **ref);
   int save_in_field(Field *field, bool no_conversions);
   void save_org_in_field(Field *field);
   enum Item_result result_type () const { return (*ref)->result_type(); }
@@ -2424,10 +2493,18 @@ public:
     return ref ? (*ref)->real_item() : this;
   }
   bool walk(Item_processor processor, bool walk_subquery, uchar *arg)
-  {
-    return (*ref)->walk(processor, walk_subquery, arg) ||
-           (this->*processor)(arg);
+  { 
+    if (ref && *ref)
+      return (*ref)->walk(processor, walk_subquery, arg) ||
+             (this->*processor)(arg); 
+    else
+      return FALSE;
   }
+  Item* transform(Item_transformer, uchar *arg);
+  Item* compile(Item_analyzer analyzer, uchar **arg_p,
+                Item_transformer transformer, uchar *arg_t);
+  bool enumerate_field_refs_processor(uchar *arg)
+  { return (*ref)->enumerate_field_refs_processor(arg); }
   void no_rows_in_result()
   {
     (*ref)->no_rows_in_result();
@@ -2508,6 +2585,7 @@ public:
               alias_name_used_arg)
   {}
 
+  void save_val(Field *to);
   double val_real();
   longlong val_int();
   String *val_str(String* tmp);
@@ -2518,6 +2596,131 @@ public:
   virtual Ref_Type ref_type() { return DIRECT_REF; }
 };
 
+class Expression_cache;
+class Item_cache;
+
+
+/**
+  Objects of this class can store their values in an expression cache.
+*/
+
+class Item_cache_wrapper :public Item_result_field
+{
+private:
+  /* Pointer to the cached expression */
+  Item *orig_item;
+  Expression_cache *expr_cache;
+  /*
+    In order to put the expression into the expression cache and return
+    value of val_*() method, we will need to get the expression value twice
+    (probably in different types).  In order to avoid making two
+    (potentially costly) orig_item->val_*() calls, we store expression value
+    in this Item_cache object.
+  */
+  Item_cache *expr_value;
+
+  Item *check_cache();
+  inline void cache();
+
+public:
+  Item_cache_wrapper(Item *item_arg);
+  ~Item_cache_wrapper();
+
+  const char *func_name() const { return "<expr_cache>"; }
+  enum Type type() const { return EXPR_CACHE_ITEM; }
+  virtual Item *get_cached_item() { return orig_item; }
+
+  bool set_cache(THD *thd, List<Item*> &depends_on);
+
+  bool fix_fields(THD *thd, Item **it);
+  void fix_length_and_dec() {}
+  void cleanup();
+
+  /* Methods of getting value which should be cached in the cache */
+  void save_val(Field *to);
+  double val_real();
+  longlong val_int();
+  String *val_str(String* tmp);
+  my_decimal *val_decimal(my_decimal *);
+  bool val_bool();
+  bool is_null();
+  bool get_date(MYSQL_TIME *ltime, uint fuzzydate);
+  bool get_time(MYSQL_TIME *ltime);
+  bool send(Protocol *protocol, String *buffer)
+  {
+    if (result_field)
+      return protocol->store(result_field);
+    return Item::send(protocol, buffer);
+  }
+  void save_org_in_field(Field *field)
+  {
+    save_val(field);
+  }
+  void save_in_result_field(bool no_conversions)
+  {
+    save_val(result_field);
+  }
+  Item* get_tmp_table_item(THD *thd_arg);
+
+  /* Following methods make this item transparent as much as possible */
+
+  virtual void print(String *str, enum_query_type query_type);
+  virtual const char *full_name() const { return orig_item->full_name(); }
+  virtual void make_field(Send_field *field) { orig_item->make_field(field); }
+  bool eq(const Item *item, bool binary_cmp) const
+  {
+    Item *it= ((Item *) item)->real_item();
+    return orig_item->eq(it, binary_cmp);
+  }
+  void fix_after_pullout(st_select_lex *new_parent, Item **refptr)
+  {
+    orig_item->fix_after_pullout(new_parent, &orig_item);
+  }
+  int save_in_field(Field *to, bool no_conversions);
+  enum Item_result result_type () const { return orig_item->result_type(); }
+  enum_field_types field_type() const   { return orig_item->field_type(); }
+  table_map used_tables() const { return orig_item->used_tables(); }
+  void update_used_tables() { orig_item->update_used_tables(); }
+  bool const_item() const { return orig_item->const_item(); }
+  table_map not_null_tables() const { return orig_item->not_null_tables(); }
+  bool walk(Item_processor processor, bool walk_subquery, uchar *arg)
+  {
+    return orig_item->walk(processor, walk_subquery, arg) ||
+      (this->*processor)(arg);
+  }
+  bool enumerate_field_refs_processor(uchar *arg)
+  { return orig_item->enumerate_field_refs_processor(arg); }
+  bool result_as_longlong() { return orig_item->result_as_longlong(); }
+  Item_field *filed_for_view_update()
+  { return orig_item->filed_for_view_update(); }
+
+  /* Row emulation: forwarding of ROW-related calls to orig_item */
+  uint cols()
+  { return result_type() == ROW_RESULT ? orig_item->cols() : 1; }
+  Item* element_index(uint i)
+  { return result_type() == ROW_RESULT ? orig_item->element_index(i) : this; }
+  Item** addr(uint i)
+  { return result_type() == ROW_RESULT ? orig_item->addr(i) : 0; }
+  bool check_cols(uint c)
+  {
+    return (result_type() == ROW_RESULT ?
+            orig_item->check_cols(c) :
+            Item::check_cols(c));
+  }
+  bool null_inside()
+  { return result_type() == ROW_RESULT ? orig_item->null_inside() : 0; }
+  void bring_value()
+  {
+    if (result_type() == ROW_RESULT)
+      orig_item->bring_value();
+  }
+  bool check_vcol_func_processor(uchar *arg)
+  {
+    return trace_unsupported_by_check_vcol_func_processor("cache");
+  }
+};
+
+
 /*
   Class for view fields, the same as Item_direct_ref, but call fix_fields
   of reference if it is not called yet
@@ -2595,6 +2798,7 @@ public:
     outer_ref->save_org_in_field(result_field);
   }
   bool fix_fields(THD *, Item **);
+  void fix_after_pullout(st_select_lex *new_parent, Item **ref);
   table_map used_tables() const
   {
     return (*ref)->const_item() ? 0 : OUTER_REF_TABLE_BIT;
@@ -2627,6 +2831,7 @@ public:
 		       const char *table_name_arg, const char *field_name_arg)
     :Item_ref(context_arg, item, table_name_arg, field_name_arg),
      owner(master) {}
+  void save_val(Field *to);
   double val_real();
   longlong val_int();
   String* val_str(String* s);
@@ -2889,6 +3094,17 @@ public:
 };
 
 
+/*
+  Cached_item_XXX objects are not exactly caches. They do the following:
+
+  Each Cached_item_XXX object has
+   - its source item
+   - saved value of the source item
+   - cmp() method that compares the saved value with the current value of the
+     source item, and if they were not equal saves item's value into the saved
+     value.
+*/
+
 class Cached_item :public Sql_alloc
 {
 public:
@@ -2945,9 +3161,10 @@ class Cached_item_field :public Cached_item
   uint length;
 
 public:
-  Cached_item_field(Item_field *item)
+  Cached_item_field(Field *arg_field) : field(arg_field)
   {
-    field= item->field;
+    field= arg_field;
+    /* TODO: take the memory allocation below out of the constructor. */
     buff= (uchar*) sql_calloc(length=field->pack_length());
   }
   bool cmp(void);
@@ -3125,6 +3342,13 @@ private:
 };
 
 
+/**
+  @todo
+  Implement the is_null() method for this class. Currently calling is_null()
+  on any Item_cache object resolves to Item::is_null(), which returns FALSE
+  for any value.
+*/
+
 class Item_cache: public Item_basic_constant
 {
 protected:
@@ -3150,7 +3374,8 @@ public:
     example(0), used_table_map(0), cached_field(0), cached_field_type(MYSQL_TYPE_STRING),
     value_cached(0)
   {
-    fixed= 1; 
+    fixed= 1;
+    maybe_null= 1;
     null_value= 1;
   }
   Item_cache(enum_field_types field_type_arg):
@@ -3158,6 +3383,7 @@ public:
     value_cached(0)
   {
     fixed= 1;
+    maybe_null= 1;
     null_value= 1;
   }
 
@@ -3398,7 +3624,8 @@ void mark_select_range_as_dependent(THD *thd,
                                     Field *found_field, Item *found_item,
                                     Item_ident *resolved_item);
 
-extern Cached_item *new_Cached_item(THD *thd, Item *item);
+extern Cached_item *new_Cached_item(THD *thd, Item *item,
+                                    bool pass_through_ref);
 extern Item_result item_cmp_type(Item_result a,Item_result b);
 extern void resolve_const_item(THD *thd, Item **ref, Item *cmp_item);
 extern int stored_field_cmp_to_item(THD *thd, Field *field, Item *item);
diff --git a/sql/item_buff.cc b/sql/item_buff.cc
index 0ac4edb3656..b15083f54a2 100644
--- a/sql/item_buff.cc
+++ b/sql/item_buff.cc
@@ -27,11 +27,15 @@
   Create right type of Cached_item for an item.
 */
 
-Cached_item *new_Cached_item(THD *thd, Item *item)
+Cached_item *new_Cached_item(THD *thd, Item *item, bool pass_through_ref)
 {
-  if (item->real_item()->type() == Item::FIELD_ITEM &&
+  if (pass_through_ref && item->real_item()->type() == Item::FIELD_ITEM &&
       !(((Item_field *) (item->real_item()))->field->flags & BLOB_FLAG))
-    return new Cached_item_field((Item_field *) (item->real_item()));
+  {
+    Item_field *real_item= (Item_field *) item->real_item();
+    Field *cached_field= real_item->field;
+    return new Cached_item_field(cached_field);
+  }
   switch (item->result_type()) {
   case STRING_RESULT:
     return new Cached_item_str(thd, (Item_field *) item);
@@ -117,14 +121,20 @@ bool Cached_item_int::cmp(void)
 
 bool Cached_item_field::cmp(void)
 {
-  bool tmp= field->cmp(buff) != 0;		// This is not a blob!
-  if (tmp)
-    field->get_image(buff,length,field->charset());
+  bool tmp= FALSE;                              // Value is identical
+  /* Note that field can't be a blob here ! */
   if (null_value != field->is_null())
   {
     null_value= !null_value;
-    tmp=TRUE;
+    tmp= TRUE;                                  // Value has changed
   }
+
+  /*
+    If value is not null and value changed (from null to not null or
+    because of value change), then copy the new value to buffer.
+    */
+  if (! null_value && (tmp || (tmp= (field->cmp(buff) != 0))))
+    field->get_image(buff,length,field->charset());
   return tmp;
 }
 
diff --git a/sql/item_cmpfunc.cc b/sql/item_cmpfunc.cc
index 51a73d774a8..7ac31a44004 100644
--- a/sql/item_cmpfunc.cc
+++ b/sql/item_cmpfunc.cc
@@ -1775,6 +1775,54 @@ bool Item_in_optimizer::fix_fields(THD *thd, Item **ref)
 
 
 /**
+  Add an expression cache for this subquery if it is needed
+
+  @param thd_arg         Thread handle
+
+  @details
+  The function checks whether an expression cache is needed for this item
+  and if so wraps the item into an item of the class
+  Item_cache_wrapper with an appropriate expression cache set up there.
+
+  @note
+  used from Item::transform()
+
+  @return
+  new wrapper item if an expression cache is needed,
+  this item - otherwise
+*/
+
+Item *Item_in_optimizer::expr_cache_insert_transformer(uchar *thd_arg)
+{
+  THD *thd= (THD*) thd_arg;
+  DBUG_ENTER("Item_in_optimizer::expr_cache_insert_transformer");
+  List<Item*> &depends_on= ((Item_subselect *)args[1])->depends_on;
+
+  if (expr_cache)
+    DBUG_RETURN(expr_cache);
+
+  /* Add left expression to the list of the parameters of the subquery */
+  if (args[0]->cols() == 1)
+    depends_on.push_front((Item**)args);
+  else
+  {
+    for (uint i= 0; i < args[0]->cols(); i++)
+    {
+      depends_on.push_front(args[0]->addr(i));
+    }
+  }
+
+  if (args[1]->expr_cache_is_needed(thd) &&
+      (expr_cache= set_expr_cache(thd, depends_on)))
+    DBUG_RETURN(expr_cache);
+
+  /* no cache => return list in original state just to be safe */
+  for (uint i= 0; i < args[0]->cols(); i++)
+    depends_on.pop();
+  DBUG_RETURN(this);
+}
+
+/*
    The implementation of optimized \<outer expression\> [NOT] IN \<subquery\>
    predicates. The implementation works as follows.
 
@@ -1844,6 +1892,7 @@ bool Item_in_optimizer::fix_fields(THD *thd, Item **ref)
      @see Item_in_subselect::val_bool()
      @see Item_is_not_null_test::val_int()
  */
+
 longlong Item_in_optimizer::val_int()
 {
   bool tmp;
@@ -1945,6 +1994,7 @@ void Item_in_optimizer::cleanup()
   Item_bool_func::cleanup();
   if (!save_cache)
     cache= 0;
+  expr_cache= 0;
   DBUG_VOID_RETURN;
 }
 
@@ -1956,6 +2006,70 @@ bool Item_in_optimizer::is_null()
 }
 
 
+/**
+  Transform an Item_in_optimizer and its arguments with a callback function.
+
+  @param transformer the transformer callback function to be applied to the
+         nodes of the tree of the object
+  @param parameter to be passed to the transformer
+
+  @detail
+    Recursively transform the left and the right operand of this Item. The
+    Right operand is an Item_in_subselect or its subclass. To avoid the
+    creation of new Items, we use the fact that the left operand of the
+    Item_in_subselect is the same as the one of 'this', so instead of
+    transforming its operand, we just assign the left operand of the
+    Item_in_subselect to be equal to the left operand of 'this'.
+    The transformation is not applied further to the subquery operand
+    of the IN predicate.
+
+  @returns
+    @retval pointer to the transformed item
+    @retval NULL if an error occurred
+*/
+
+Item *Item_in_optimizer::transform(Item_transformer transformer, uchar *argument)
+{
+  Item *new_item;
+
+  DBUG_ASSERT(!current_thd->is_stmt_prepare());
+  DBUG_ASSERT(arg_count == 2);
+
+  /* Transform the left IN operand. */
+  new_item= (*args)->transform(transformer, argument);
+  if (!new_item)
+    return 0;
+  /*
+    THD::change_item_tree() should be called only if the tree was
+    really transformed, i.e. when a new item has been created.
+    Otherwise we'll be allocating a lot of unnecessary memory for
+    change records at each execution.
+  */
+  if ((*args) != new_item)
+    current_thd->change_item_tree(args, new_item);
+
+  /*
+    Transform the right IN operand which should be an Item_in_subselect or a
+    subclass of it. The left operand of the IN must be the same as the left
+    operand of this Item_in_optimizer, so in this case there is no further
+    transformation, we only make both operands the same.
+    TODO: is it the way it should be?
+  */
+  DBUG_ASSERT((args[1])->type() == Item::SUBSELECT_ITEM &&
+              (((Item_subselect*)(args[1]))->substype() ==
+               Item_subselect::IN_SUBS ||
+               ((Item_subselect*)(args[1]))->substype() ==
+               Item_subselect::ALL_SUBS ||
+               ((Item_subselect*)(args[1]))->substype() ==
+               Item_subselect::ANY_SUBS));
+
+  Item_in_subselect *in_arg= (Item_in_subselect*)args[1];
+  in_arg->left_expr= args[0];
+
+  return (this->*transformer)(argument);
+}
+
+
 longlong Item_func_eq::val_int()
 {
   DBUG_ASSERT(fixed == 1);
@@ -4204,11 +4318,15 @@ Item_cond::fix_fields(THD *thd, Item **ref)
   DBUG_ASSERT(fixed == 0);
   List_iterator<Item> li(list);
   Item *item;
+  TABLE_LIST *save_emb_on_expr_nest= thd->thd_marker.emb_on_expr_nest;
 #ifndef EMBEDDED_LIBRARY
   uchar buff[sizeof(char*)];			// Max local vars in function
 #endif
   not_null_tables_cache= used_tables_cache= 0;
   const_item_cache= 1;
+
+  if (functype() != COND_AND_FUNC)
+    thd->thd_marker.emb_on_expr_nest= NULL;
   /*
     and_table_cache is the value that Item_cond_or() returns for
     not_null_tables()
@@ -4267,11 +4385,45 @@ Item_cond::fix_fields(THD *thd, Item **ref)
       maybe_null=1;
   }
   thd->lex->current_select->cond_count+= list.elements;
+  thd->thd_marker.emb_on_expr_nest= save_emb_on_expr_nest;
   fix_length_and_dec();
   fixed= 1;
   return FALSE;
 }
 
+
+void Item_cond::fix_after_pullout(st_select_lex *new_parent, Item **ref)
+{
+  List_iterator<Item> li(list);
+  Item *item;
+
+  used_tables_cache=0;
+  const_item_cache=1;
+
+  and_tables_cache= ~(table_map) 0; // Here and below we do as fix_fields does
+  not_null_tables_cache= 0;
+
+  while ((item=li++))
+  {
+    table_map tmp_table_map;
+    item->fix_after_pullout(new_parent, li.ref());
+    item= *li.ref();
+    used_tables_cache|= item->used_tables();
+    const_item_cache&= item->const_item();
+
+    if (item->const_item())
+      and_tables_cache= (table_map) 0;
+    else
+    {
+      tmp_table_map= item->not_null_tables();
+      not_null_tables_cache|= tmp_table_map;
+      and_tables_cache&= tmp_table_map;
+      const_item_cache= FALSE;
+    }  
+  }
+}
+
+
 bool Item_cond::walk(Item_processor processor, bool walk_subquery, uchar *arg)
 {
   List_iterator_fast<Item> li(list);
@@ -5522,7 +5674,7 @@ void Item_equal::merge(Item_equal *item)
   If cmp(item_field1,item_field2,arg)<0 than item_field1 must be
   placed after item_fiel2.
 
-  The function sorts field items by the exchange sort algorithm.
+  The function sorts field items by the bubble sort algorithm.
   The list of field items is looked through and whenever two neighboring
   members follow in a wrong order they are swapped. This is performed
   again and again until we get all members in a right order.
@@ -5533,7 +5685,7 @@ void Item_equal::merge(Item_equal *item)
 
 void Item_equal::sort(Item_field_cmpfunc compare, void *arg)
 {
-  exchange_sort<Item_field>(&fields, compare, arg);
+  bubble_sort<Item_field>(&fields, compare, arg);
 }
 
 
@@ -5564,7 +5716,7 @@ void Item_equal::update_const()
 bool Item_equal::fix_fields(THD *thd, Item **ref)
 {
   List_iterator_fast<Item_field> li(fields);
-  Item *item;
+  Item_field *item;
   not_null_tables_cache= used_tables_cache= 0;
   const_item_cache= 0;
   while ((item= li++))
@@ -5575,6 +5727,7 @@ bool Item_equal::fix_fields(THD *thd, Item **ref)
     not_null_tables_cache|= tmp_table_map;
     if (item->maybe_null)
       maybe_null=1;
+    item->item_equal= this;
   }
   fix_length_and_dec();
   fixed= 1;
@@ -5621,7 +5774,7 @@ longlong Item_equal::val_int()
 
 void Item_equal::fix_length_and_dec()
 {
-  Item *item= get_first();
+  Item *item= get_first(NULL);
   eval_item= cmp_item::get_comparator(item->result_type(),
                                       item->collation.collation);
 }
@@ -5684,3 +5837,130 @@ void Item_equal::print(String *str, enum_query_type query_type)
   str->append(')');
 }
 
+
+/*
+  @brief Get the first equal field of multiple equality.
+  @param[in] field   the field to get equal field to
+
+  @details Get the first field of multiple equality that is equal to the
+  given field. In order to make semi-join materialization strategy work
+  correctly we can't propagate equal fields from upper select to a
+  materialized semi-join.
+  Thus the field is returned according to the following rules:
+
+  1) If the given field belongs to a semi-join then the first field in
+     multiple equality which belongs to the same semi-join is returned.
+     Otherwise NULL is returned.
+  2) If the given field doesn't belong to a semi-join then
+     the first field in the multiple equality that doesn't belong to any
+     semi-join is returned.
+     If all fields in the equality belong to semi-join(s) then NULL
+     is returned.
+  3) If no field is given then the first field in the multiple equality
+     is returned without regarding whether it belongs to a semi-join or not.
+
+  @retval Found first field in the multiple equality.
+  @retval 0 if no field found.
+*/
+
+Item_field* Item_equal::get_first(Item_field *field)
+{
+  List_iterator<Item_field> it(fields);
+  Item_field *item;
+  JOIN_TAB *field_tab;
+  if (!field)
+    return fields.head();
+
+  /*
+    Of all equal fields, return the first one we can use. Normally, this is the
+    field which belongs to the table that is the first in the join order.
+
+    There is one exception to this: When semi-join materialization strategy is
+    used, and the given field belongs to a table within the semi-join nest, we
+    must pick the first field in the semi-join nest.
+
+    Example: suppose we have a join order:
+
+       ot1 ot2  SJ-Mat(it1  it2  it3)  ot3
+
+    and equality ot2.col = it1.col = it2.col
+    If we're looking for best substitute for 'it2.col', we should pick it1.col
+    and not ot2.col.
+    
+    eliminate_item_equal() also has code that deals with equality substitution
+    in the presence of SJM nests.
+  */
+
+  field_tab= field->field->table->reginfo.join_tab;
+
+  TABLE_LIST *emb_nest= field->field->table->pos_in_table_list->embedding;
+
+  if (emb_nest && emb_nest->sj_mat_info && emb_nest->sj_mat_info->is_used)
+  {
+    /*
+      It's a field from a materialized semi-join. We can substitute it only
+      for a field from the same semi-join.
+    */
+    JOIN_TAB *first= field_tab;
+    JOIN *join= field_tab->join;
+    int tab_idx= field_tab - field_tab->join->join_tab;
+
+    DBUG_ASSERT(join->join_tab[tab_idx].table->map &
+                emb_nest->sj_inner_tables);
+
+    /* Find the first table of this semi-join nest */
+    for (int i= tab_idx-1; i >= (int)join->const_tables; i--)
+    {
+      if (join->join_tab[i].table->map & emb_nest->sj_inner_tables)
+        first= join->join_tab + i;
+      else
+        // Found first tab that doesn't belong to current SJ.
+        break;
+    }
+    /* Find an item to substitute for. */
+    while ((item= it++))
+    {
+      if (item->field->table->reginfo.join_tab >= first)
+      {
+        /*
+          If we found given field then return NULL to avoid unnecessary
+          substitution.
+        */
+        return (item != field) ? item : NULL;
+      }
+    }
+  }
+  else
+  {
+#if 0    
+    /*
+      The field is not in SJ-Materialization nest. We must return the first
+      field that's not embedded in a SJ-Materialization nest.
+      Example: suppose we have a join order:
+
+          SJ-Mat(it1  it2)  ot1  ot2
+
+      and equality ot2.col = ot1.col = it2.col
+      If we're looking for best substitute for 'ot2.col', we should pick ot1.col
+      and not it2.col, because when we run a join between ot1 and ot2
+      execution of SJ-Mat(...) has already finished and we can't rely on the
+      value of it*.*.
+      psergey-fix-fix: ^^ THAT IS INCORRECT ^^. Pick the first, whatever that
+      is.
+    */
+    while ((item= it++))
+    {
+      TABLE_LIST *emb_nest= item->field->table->pos_in_table_list->embedding;
+      if (!emb_nest || !emb_nest->sj_mat_info || 
+          !emb_nest->sj_mat_info->is_used)
+      {
+        return item;
+      }
+    }
+#endif
+    return fields.head();
+  }
+  // Shouldn't get here.
+  DBUG_ASSERT(0);
+  return NULL;
+}
diff --git a/sql/item_cmpfunc.h b/sql/item_cmpfunc.h
index 2cacd92bc8a..2e5e4df6c2c 100644
--- a/sql/item_cmpfunc.h
+++ b/sql/item_cmpfunc.h
@@ -242,6 +242,7 @@ class Item_in_optimizer: public Item_bool_func
 {
 protected:
   Item_cache *cache;
+  Item *expr_cache;
   bool save_cache;
   /* 
     Stores the value of "NULL IN (SELECT ...)" for uncorrelated subqueries:
@@ -252,7 +253,7 @@ protected:
   int result_for_null_param;
 public:
   Item_in_optimizer(Item *a, Item_in_subselect *b):
-    Item_bool_func(a, my_reinterpret_cast(Item *)(b)), cache(0),
+    Item_bool_func(a, my_reinterpret_cast(Item *)(b)), cache(0), expr_cache(0),
     save_cache(0), result_for_null_param(UNKNOWN)
   {}
   bool fix_fields(THD *, Item **);
@@ -263,6 +264,8 @@ public:
   const char *func_name() const { return "<in_optimizer>"; }
   Item_cache **get_cache() { return &cache; }
   void keep_top_level_cache();
+  Item *transform(Item_transformer transformer, uchar *arg);
+  virtual Item *expr_cache_insert_transformer(uchar *thd_arg);
 };
 
 class Comp_creator
@@ -370,6 +373,7 @@ public:
   CHARSET_INFO *compare_collation() { return cmp.cmp_collation.collation; }
   uint decimal_precision() const { return 1; }
   void top_level_item() { abort_on_null= TRUE; }
+  Arg_comparator *get_comparator() { return &cmp; }
   void cleanup()
   {
     Item_int_func::cleanup();
@@ -486,13 +490,23 @@ public:
 class Item_func_eq :public Item_bool_rowready_func2
 {
 public:
-  Item_func_eq(Item *a,Item *b) :Item_bool_rowready_func2(a,b) {}
+  Item_func_eq(Item *a,Item *b) :
+    Item_bool_rowready_func2(a,b), in_equality_no(UINT_MAX)
+  {}
   longlong val_int();
   enum Functype functype() const { return EQ_FUNC; }
   enum Functype rev_functype() const { return EQ_FUNC; }
   cond_result eq_cmp_result() const { return COND_TRUE; }
   const char *func_name() const { return "="; }
   Item *negated_item();
+  /* 
+    - If this equality is created from the subquery's IN-equality:
+      number of the item it was created from, e.g. for
+       (a,b) IN (SELECT c,d ...)  a=c will have in_equality_no=0, 
+       and b=d will have in_equality_no=1.
+    - Otherwise, UINT_MAX
+  */
+  uint in_equality_no;
 };
 
 class Item_func_equal :public Item_bool_rowready_func2
@@ -1500,6 +1514,7 @@ public:
     list.prepand(nlist);
   }
   bool fix_fields(THD *, Item **ref);
+  void fix_after_pullout(st_select_lex *new_parent, Item **ref);
 
   enum Type type() const { return COND_ITEM; }
   List<Item>* argument_list() { return &list; }
@@ -1619,7 +1634,7 @@ public:
   void add(Item_field *f);
   uint members();
   bool contains(Field *field);
-  Item_field* get_first() { return fields.head(); }
+  Item_field* get_first(Item_field *field);
   uint n_fields() { return fields.elements; }
   void merge(Item_equal *item);
   void update_const();
@@ -1637,6 +1652,9 @@ public:
   virtual void print(String *str, enum_query_type query_type);
   CHARSET_INFO *compare_collation() 
   { return fields.head()->collation.collation; }
+  friend Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
+                           Item_equal *item_equal);
+  friend bool setup_sj_materialization(struct st_join_table *tab);
 }; 
 
 class COND_EQUAL: public Sql_alloc
@@ -1743,14 +1761,34 @@ inline bool is_cond_or(Item *item)
 class Item_cond_xor :public Item_cond
 {
 public:
-  Item_cond_xor() :Item_cond() {}
-  Item_cond_xor(Item *i1,Item *i2) :Item_cond(i1,i2) {}
+  Item_cond_xor(Item *i1,Item *i2) :Item_cond(i1,i2) 
+  {
+    /* 
+      Items must be stored in args[] as well because this Item_cond is
+      treated as a FUNC_ITEM (see type()). I.e., users of it will get
+      its children by calling arguments(), not argument_list(). This
+      is a temporary solution until XOR is optimized and treated like
+      a full Item_cond citizen.
+     */
+    arg_count= 2;
+    args= tmp_arg;
+    args[0]= i1; 
+    args[1]= i2;
+  }
   enum Functype functype() const { return COND_XOR_FUNC; }
   /* TODO: remove the next line when implementing XOR optimization */
   enum Type type() const { return FUNC_ITEM; }
   longlong val_int();
   const char *func_name() const { return "xor"; }
   void top_level_item() {}
+  /* Since child Items are stored in args[], Items cannot be added.
+     However, since Item_cond_xor is treated as a FUNC_ITEM (see
+     type()), the methods below should never be called. 
+  */
+  bool add(Item *item) { DBUG_ASSERT(FALSE); return FALSE; }
+  bool add_at_head(Item *item) { DBUG_ASSERT(FALSE); return FALSE; }
+  bool add_at_head(List<Item> *nlist) { DBUG_ASSERT(FALSE); return FALSE; }
+  void copy_andor_arguments(THD *thd, Item_cond *item) { DBUG_ASSERT(FALSE); }
 };
 
 
diff --git a/sql/item_func.cc b/sql/item_func.cc
index 6ff743ed91d..1823f4bb7a5 100644
--- a/sql/item_func.cc
+++ b/sql/item_func.cc
@@ -150,9 +150,11 @@ Item_func::fix_fields(THD *thd, Item **ref)
 {
   DBUG_ASSERT(fixed == 0);
   Item **arg,**arg_end;
+  TABLE_LIST *save_emb_on_expr_nest= thd->thd_marker.emb_on_expr_nest;
 #ifndef EMBEDDED_LIBRARY			// Avoid compiler warning
   uchar buff[STACK_BUFF_ALLOC];			// Max argument in function
 #endif
+  thd->thd_marker.emb_on_expr_nest= NULL;
 
   used_tables_cache= not_null_tables_cache= 0;
   const_item_cache=1;
@@ -206,6 +208,7 @@ Item_func::fix_fields(THD *thd, Item **ref)
   if (thd->is_error()) // An error inside fix_length_and_dec occured
     return TRUE;
   fixed= 1;
+  thd->thd_marker.emb_on_expr_nest= save_emb_on_expr_nest;
   return FALSE;
 }
 
@@ -225,6 +228,28 @@ Item_func::quick_fix_field()
 }
 
 
+void Item_func::fix_after_pullout(st_select_lex *new_parent, Item **ref)
+{
+  Item **arg,**arg_end;
+
+  used_tables_cache= not_null_tables_cache= 0;
+  const_item_cache=1;
+
+  if (arg_count)
+  {
+    for (arg=args, arg_end=args+arg_count; arg != arg_end ; arg++)
+    {
+      (*arg)->fix_after_pullout(new_parent, arg);
+      Item *item= *arg;
+
+      used_tables_cache|=     item->used_tables();
+      not_null_tables_cache|= item->not_null_tables();
+      const_item_cache&=      item->const_item();
+    }
+  }
+}
+
+
 bool Item_func::walk(Item_processor processor, bool walk_subquery,
                      uchar *argument)
 {
@@ -486,12 +511,12 @@ Field *Item_func::tmp_table_field(TABLE *table)
   return field;
 }
 
-
+/*
 bool Item_func::is_expensive_processor(uchar *arg)
 {
   return is_expensive();
 }
-
+*/
 
 my_decimal *Item_func::val_decimal(my_decimal *decimal_value)
 {
@@ -2967,7 +2992,7 @@ udf_handler::fix_fields(THD *thd, Item_result_field *func,
           String *res= arguments[i]->val_str(&buffers[i]);
           if (arguments[i]->null_value)
             continue;
-          f_args.args[i]= (char*) res->c_ptr();
+          f_args.args[i]= (char*) res->c_ptr_safe();
           f_args.lengths[i]= res->length();
           break;
         }
diff --git a/sql/item_func.h b/sql/item_func.h
index a41324b4102..36f089de529 100644
--- a/sql/item_func.h
+++ b/sql/item_func.h
@@ -117,6 +117,7 @@ public:
   // Constructor used for Item_cond_and/or (see Item comment)
   Item_func(THD *thd, Item_func *item);
   bool fix_fields(THD *, Item **ref);
+  void fix_after_pullout(st_select_lex *new_parent, Item **ref);
   void quick_fix_field();
   table_map used_tables() const;
   table_map not_null_tables() const;
@@ -181,8 +182,8 @@ public:
                 Item_transformer transformer, uchar *arg_t);
   void traverse_cond(Cond_traverser traverser,
                      void * arg, traverse_order order);
-  bool is_expensive_processor(uchar *arg);
-  virtual bool is_expensive() { return 0; }
+ // bool is_expensive_processor(uchar *arg);
+ // virtual bool is_expensive() { return 0; }
   inline double fix_result(double value)
   {
     if (isfinite(value))
@@ -1125,6 +1126,7 @@ class Item_udf_func :public Item_func
 {
 protected:
   udf_handler udf;
+  bool is_expensive_processor(uchar *arg) { return TRUE; }
 
 public:
   Item_udf_func(udf_func *udf_arg)
@@ -1748,6 +1750,9 @@ private:
   bool execute();
   bool execute_impl(THD *thd);
   bool init_result_field(THD *thd);
+
+protected:
+  bool is_expensive_processor(uchar *arg) { return TRUE; }
   
 public:
 
diff --git a/sql/item_row.cc b/sql/item_row.cc
index 408bc11eb9b..22421199b76 100644
--- a/sql/item_row.cc
+++ b/sql/item_row.cc
@@ -127,6 +127,20 @@ void Item_row::update_used_tables()
   }
 }
 
+
+void Item_row::fix_after_pullout(st_select_lex *new_parent, Item **ref)
+{
+  used_tables_cache= 0;
+  const_item_cache= 1;
+  for (uint i= 0; i < arg_count; i++)
+  {
+    items[i]->fix_after_pullout(new_parent, &items[i]);
+    used_tables_cache|= items[i]->used_tables();
+    const_item_cache&= items[i]->const_item();
+  }
+}
+
+
 bool Item_row::check_cols(uint c)
 {
   if (c != arg_count)
diff --git a/sql/item_row.h b/sql/item_row.h
index 3566ff079c6..c7fb8d59220 100644
--- a/sql/item_row.h
+++ b/sql/item_row.h
@@ -60,6 +60,7 @@ public:
     return 0;
   };
   bool fix_fields(THD *thd, Item **ref);
+  void fix_after_pullout(st_select_lex *new_parent, Item **ref);
   void cleanup();
   void split_sum_func(THD *thd, Item **ref_pointer_array, List<Item> &fields);
   table_map used_tables() const { return used_tables_cache; };
diff --git a/sql/item_subselect.cc b/sql/item_subselect.cc
index 02f708cdf91..854b328f862 100644
--- a/sql/item_subselect.cc
+++ b/sql/item_subselect.cc
@@ -31,21 +31,18 @@
 #include "mysql_priv.h"
 #include "sql_select.h"
 
-inline Item * and_items(Item* cond, Item *item)
-{
-  return (cond? (new Item_cond_and(cond, item)) : item);
-}
 
 Item_subselect::Item_subselect():
   Item_result_field(), value_assigned(0), thd(0), substitution(0),
-  engine(0), old_engine(0), used_tables_cache(0), have_to_be_excluded(0),
-  const_item_cache(1), in_fix_fields(0), eliminated(FALSE), 
-  engine_changed(0), changed(0), is_correlated(FALSE)
+  expr_cache(0), engine(0), old_engine(0), used_tables_cache(0),
+  have_to_be_excluded(0), const_item_cache(1), inside_first_fix_fields(0),
+  done_first_fix_fields(FALSE), eliminated(FALSE), engine_changed(0),
+  changed(0), is_correlated(FALSE)
 {
   with_subselect= 1;
   reset();
   /*
-    item value is NULL if select_subselect not changed this value
+    Item value is NULL if select_result_interceptor didn't change this value
     (i.e. some rows will be found returned)
   */
   null_value= TRUE;
@@ -53,7 +50,7 @@ Item_subselect::Item_subselect():
 
 
 void Item_subselect::init(st_select_lex *select_lex,
-			  select_subselect *result)
+			  select_result_interceptor *result)
 {
   /*
     Please see Item_singlerow_subselect::invalidate_and_restore_select_lex(),
@@ -63,6 +60,7 @@ void Item_subselect::init(st_select_lex *select_lex,
   DBUG_ENTER("Item_subselect::init");
   DBUG_PRINT("enter", ("select_lex: 0x%lx", (long) select_lex));
   unit= select_lex->master_unit();
+  thd= unit->thd;
 
   if (unit->item)
   {
@@ -79,6 +77,7 @@ void Item_subselect::init(st_select_lex *select_lex,
   else
   {
     SELECT_LEX *outer_select= unit->outer_select();
+    DBUG_ASSERT(thd);
     /*
       do not take into account expression inside aggregate functions because
       they can access original table fields
@@ -87,14 +86,16 @@ void Item_subselect::init(st_select_lex *select_lex,
                     NO_MATTER :
                     outer_select->parsing_place);
     if (unit->is_union())
-      engine= new subselect_union_engine(unit, result, this);
+      engine= new subselect_union_engine(thd, unit, result, this);
     else
-      engine= new subselect_single_select_engine(select_lex, result, this);
+      engine= new subselect_single_select_engine(thd, select_lex, result, this);
   }
   {
     SELECT_LEX *upper= unit->outer_select();
     if (upper->parsing_place == IN_HAVING)
       upper->subquery_in_having= 1;
+    /* The subquery is an expression cache candidate */
+    upper->expr_cache_may_be_used[upper->parsing_place]= TRUE;
   }
   DBUG_VOID_RETURN;
 }
@@ -118,8 +119,10 @@ void Item_subselect::cleanup()
   }
   if (engine)
     engine->cleanup();
+  depends_on.empty();
   reset();
   value_assigned= 0;
+  expr_cache= 0;
   DBUG_VOID_RETURN;
 }
 
@@ -132,6 +135,22 @@ void Item_singlerow_subselect::cleanup()
   DBUG_VOID_RETURN;
 }
 
+
+void Item_in_subselect::cleanup()
+{
+  DBUG_ENTER("Item_in_subselect::cleanup");
+  if (left_expr_cache)
+  {
+    left_expr_cache->delete_elements();
+    delete left_expr_cache;
+    left_expr_cache= NULL;
+  }
+  first_execution= TRUE;
+  is_constant= FALSE;
+  Item_subselect::cleanup();
+  DBUG_VOID_RETURN;
+}
+
 Item_subselect::~Item_subselect()
 {
   delete engine;
@@ -152,21 +171,42 @@ bool Item_subselect::fix_fields(THD *thd_param, Item **ref)
   bool res;
 
   DBUG_ASSERT(fixed == 0);
-  engine->set_thd((thd= thd_param));
-  if (!in_fix_fields)
-    refers_to.empty();
+  /* There is no reason to get a different THD. */
+  DBUG_ASSERT(thd == thd_param);
+  if (!done_first_fix_fields)
+  {
+    done_first_fix_fields= TRUE;
+    inside_first_fix_fields= TRUE;
+    upper_refs.empty();
+    /*
+      psergey-todo: remove _first_fix_fields calls, we need changes on every
+      execution
+    */
+  }
+
   eliminated= FALSE;
+  parent_select= thd_param->lex->current_select;
 
   if (check_stack_overrun(thd, STACK_MIN_SIZE, (uchar*)&res))
     return TRUE;
   
-  in_fix_fields++;
 
   if (!(res= engine->prepare()))
   {
     // all transformation is done (used by prepared statements)
     changed= 1;
+  inside_first_fix_fields= FALSE;
+
 
+    // all transformation is done (used by prepared statements)
+    changed= 1;
+
+    /*
+      Substitute the current item with an Item_in_optimizer that was
+      created by Item_in_subselect::select_in_like_transformer and
+      call fix_fields for the substituted item which in turn calls
+      engine->prepare for the subquery predicate.
+    */
     if (substitution)
     {
       // did we changed top item of WHERE condition
@@ -184,12 +224,13 @@ bool Item_subselect::fix_fields(THD *thd_param, Item **ref)
       if (!(*ref)->fixed)
 	res= (*ref)->fix_fields(thd, ref);
       goto end;
+//psergey-merge:  done_first_fix_fields= FALSE;
     }
     // Is it one field subselect?
     if (engine->cols() > max_columns)
     {
       my_error(ER_OPERAND_COLUMNS, MYF(0), 1);
       res= 1;
       goto end;
     }
     fix_length_and_dec();
@@ -206,7 +247,7 @@ bool Item_subselect::fix_fields(THD *thd_param, Item **ref)
   fixed= 1;
 
 end:
-  in_fix_fields--;
+  done_first_fix_fields= FALSE;
   thd->where= save_where;
   return res;
 }
@@ -214,11 +255,12 @@ end:
 
 bool Item_subselect::enumerate_field_refs_processor(uchar *arg)
 {
-  List_iterator<Item> it(refers_to);
-  Item *item;
-  while ((item= it++))
+  List_iterator<Ref_to_outside> it(upper_refs);
+  Ref_to_outside *upper;
+  
+  while ((upper= it++))
   {
-    if (item->walk(&Item::enumerate_field_refs_processor, FALSE, arg))
+    if (upper->item->walk(&Item::enumerate_field_refs_processor, FALSE, arg))
       return TRUE;
   }
   return FALSE;
@@ -230,6 +272,144 @@ bool Item_subselect::mark_as_eliminated_processor(uchar *arg)
   return FALSE;
 }
 
+
+bool Item_subselect::mark_as_dependent(THD *thd, st_select_lex *select, 
+                                       Item *item)
+{
+  if (inside_first_fix_fields)
+  {
+    is_correlated= TRUE;
+    Ref_to_outside *upper;
+    if (!(upper= new (thd->stmt_arena->mem_root) Ref_to_outside()))
+      return TRUE;
+    upper->select= select;
+    upper->item= item;
+    if (upper_refs.push_back(upper, thd->stmt_arena->mem_root))
+      return TRUE;
+  }
+  return FALSE;
+}
+
+
+/*
+  Adjust attributes after our parent select has been merged into grandparent
+
+  DESCRIPTION
+    Subquery is a composite object which may be correlated, that is, it may
+    have
+    1. references to tables of the parent select (i.e. one that has the clause
+      with the subquery predicate)
+    2. references to tables of the grandparent select
+    3. references to tables of further ancestors.
+    
+    Before the pullout, this item indicates:
+    - #1 with table bits in used_tables()
+    - #2 and #3 with OUTER_REF_TABLE_BIT.
+
+    After parent has been merged with grandparent:
+    - references to parent and grandparent tables should be indicated with 
+      table bits.
+    - references to greatgrandparent and further ancestors - with
+      OUTER_REF_TABLE_BIT.
+*/
+
+void Item_subselect::fix_after_pullout(st_select_lex *new_parent, Item **ref)
+{
+  recalc_used_tables(new_parent, TRUE);
+  parent_select= new_parent;
+}
+
+
+class Field_fixer: public Field_enumerator
+{
+public:
+  table_map used_tables; /* Collect used_tables here */
+  st_select_lex *new_parent; /* Select we're in */
+  virtual void visit_field(Item_field *item)
+  {
+    //for (TABLE_LIST *tbl= new_parent->leaf_tables; tbl; tbl= tbl->next_local)
+    //{
+    //  if (tbl->table == field->table)
+    //  {
+        used_tables|= item->field->table->map;
+    //    return;
+    //  }
+    //}
+    //used_tables |= OUTER_REF_TABLE_BIT;
+  }
+};
+
+
+/*
+  Recalculate used_tables_cache 
+*/
+
+void Item_subselect::recalc_used_tables(st_select_lex *new_parent, 
+                                        bool after_pullout)
+{
+  List_iterator<Ref_to_outside> it(upper_refs);
+  Ref_to_outside *upper;
+  
+  used_tables_cache= 0;
+  while ((upper= it++))
+  {
+    bool found= FALSE;
+    /*
+      Check if
+        1. the upper reference refers to the new immediate parent select, or
+        2. one of the further ancestors.
+
+      We rely on the fact that the tree of selects is modified by some kind of
+      'flattening', i.e. a process where child selects are merged into their
+      parents.
+      The merged selects are removed from the select tree but keep pointers to
+      their parents.
+    */
+    for (st_select_lex *sel= upper->select; sel; sel= sel->outer_select())
+    {
+      /* 
+        If we've reached the new parent select by walking upwards from
+        reference's original select, this means that the reference is now 
+        referring to the direct parent:
+      */
+      if (sel == new_parent)
+      {
+        found= TRUE;
+        /* 
+          upper->item may be NULL when we've referred to a grouping function,
+          in which case we don't care about what its table_map really is,
+          because item->with_sum_func==1 will ensure correct placement of the
+          item.
+        */
+        if (upper->item)
+        {
+          // Now, iterate over fields and collect used_tables() attribute:
+          Field_fixer fixer;
+          fixer.used_tables= 0;
+          fixer.new_parent= new_parent;
+          upper->item->walk(&Item::enumerate_field_refs_processor, FALSE,
+                            (uchar*)&fixer);
+          used_tables_cache |= fixer.used_tables;
+          /*
+          if (after_pullout)
+            upper->item->fix_after_pullout(new_parent, &(upper->item));
+          upper->item->update_used_tables();
+          used_tables_cache |= upper->item->used_tables();
+          */
+        }
+      }
+    }
+    if (!found)
+      used_tables_cache|= OUTER_REF_TABLE_BIT;
+  }
+  /* 
+    Don't update const_tables_cache yet as we don't yet know which of the
+    parent's tables are constant. Parent will call update_used_tables() after
+    he has done const table detection, and that will be our chance to update
+    const_tables_cache.
+  */
+}
+
 bool Item_subselect::walk(Item_processor processor, bool walk_subquery,
                           uchar *argument)
 {
@@ -298,6 +478,97 @@ bool Item_subselect::exec()
   return (res);
 }
 
+
+/**
+  Check if an expression cache is needed for this subquery
+
+  @param thd             Thread handle
+
+  @details
+  The function checks whether a cache is needed for a subquery and whether
+  the result of the subquery can be put in cache.
+
+  @retval TRUE  cache is needed
+  @retval FALSE otherwise
+*/
+
+bool Item_subselect::expr_cache_is_needed(THD *thd)
+{
+  return (depends_on.elements &&
+          engine->cols() == 1 &&
+          optimizer_flag(thd, OPTIMIZER_SWITCH_SUBQUERY_CACHE) &&
+          !(engine->uncacheable() & (UNCACHEABLE_RAND |
+                                     UNCACHEABLE_SIDEEFFECT)));
+}
+
+
+/**
+  Check if an expression cache is needed for this subquery
+
+  @param thd             Thread handle
+
+  @details
+  The function checks whether a cache is needed for a subquery and whether
+  the result of the subquery can be put in cache.
+
+  @note
+  This method allows many columns in the subquery because it is supported by
+  Item_in optimizer and result of the IN subquery will be scalar in this
+  case.
+
+  @retval TRUE  cache is needed
+  @retval FALSE otherwise
+*/
+
+bool Item_in_subselect::expr_cache_is_needed(THD *thd)
+{
+  return (depends_on.elements &&
+          optimizer_flag(thd, OPTIMIZER_SWITCH_SUBQUERY_CACHE) &&
+          !(engine->uncacheable() & (UNCACHEABLE_RAND |
+                                     UNCACHEABLE_SIDEEFFECT)));
+}
+
+
+/*
+  Compute the IN predicate if the left operand's cache changed.
+*/
+
+bool Item_in_subselect::exec()
+{
+  DBUG_ENTER("Item_in_subselect::exec");
+  /*
+    Initialize the cache of the left predicate operand. This has to be done as
+    late as now, because Cached_item directly contains a resolved field (not
+    an item), and in some cases (when temp tables are created), these fields
+    end up pointing to the wrong field. One solution is to change Cached_item
+    to not resolve its field upon creation, but to resolve it dynamically
+    from a given Item_ref object.
+    TODO: the cache should be applied conditionally based on:
+    - rules - e.g. only if the left operand is known to be ordered, and/or
+    - on a cost-based basis, that takes into account the cost of a cache
+      lookup, the cache hit rate, and the savings per cache hit.
+  */
+  if (!left_expr_cache && exec_method == MATERIALIZATION)
+    init_left_expr_cache();
+
+  /*
+    If the new left operand is already in the cache, reuse the old result.
+    Use the cached result only if this is not the first execution of IN
+    because the cache is not valid for the first execution.
+  */
+  if (!first_execution && left_expr_cache &&
+      test_if_item_cache_changed(*left_expr_cache) < 0)
+    DBUG_RETURN(FALSE);
+
+  /*
+    The exec() method below updates item::value, and item::null_value, thus if
+    we don't call it, the next call to item::val_int() will return whatever
+    result was computed by its previous call.
+  */
+  DBUG_RETURN(Item_subselect::exec());
+}
+
+
 Item::Type Item_subselect::type() const
 {
   return SUBSELECT_ITEM;
@@ -330,6 +601,7 @@ Item *Item_subselect::get_tmp_table_item(THD *thd_arg)
 
 void Item_subselect::update_used_tables()
 {
+  recalc_used_tables(parent_select, FALSE);
   if (!engine->uncacheable())
   {
     // did all used tables become static?
@@ -458,8 +730,9 @@ void Item_singlerow_subselect::reset()
 Item_subselect::trans_res
 Item_singlerow_subselect::select_transformer(JOIN *join)
 {
+  DBUG_ENTER("Item_singlerow_subselect::select_transformer");
   if (changed)
-    return RES_OK;
+    DBUG_RETURN(RES_OK);
 
   SELECT_LEX *select_lex= join->select_lex;
   Query_arena *arena= thd->stmt_arena;
@@ -502,15 +775,18 @@ Item_singlerow_subselect::select_transformer(JOIN *join)
     */
     substitution->walk(&Item::remove_dependence_processor, 0,
 		       (uchar *) select_lex->outer_select());
-    return RES_REDUCE;
+    DBUG_RETURN(RES_REDUCE);
   }
-  return RES_OK;
+  DBUG_RETURN(RES_OK);
 }
 
 
 void Item_singlerow_subselect::store(uint i, Item *item)
 {
   row[i]->store(item);
+  //psergey-merge: can do without that: row[i]->cache_value();
+  //psergey-backport-timours: ^ really, without that ^ 
+  //psergey-try-merge-again:
   row[i]->cache_value();
 }
 
@@ -551,6 +827,40 @@ void Item_singlerow_subselect::fix_length_and_dec()
     maybe_null= engine->may_be_null();
 }
 
+
+/**
+  Add an expression cache for this subquery if it is needed
+
+  @param thd_arg         Thread handle
+
+  @details
+  The function checks whether an expression cache is needed for this item
+  and if so wraps the item into an item of the class
+  Item_exp_cache_wrapper with an appropriate expression cache set up there.
+
+  @note
+  used from Item::transform()
+
+  @return
+  new wrapper item if an expression cache is needed,
+  this item - otherwise
+*/
+
+Item* Item_singlerow_subselect::expr_cache_insert_transformer(uchar *thd_arg)
+{
+  THD *thd= (THD*) thd_arg;
+  DBUG_ENTER("Item_singlerow_subselect::expr_cache_insert_transformer");
+
+  if (expr_cache)
+    DBUG_RETURN(expr_cache);
+
+  if (expr_cache_is_needed(thd) &&
+      (expr_cache= set_expr_cache(thd, depends_on)))
+    DBUG_RETURN(expr_cache);
+  DBUG_RETURN(this);
+}
+
+
 uint Item_singlerow_subselect::cols()
 {
   return engine->cols();
@@ -697,8 +1007,9 @@ bool Item_in_subselect::test_limit(st_select_lex_unit *unit_arg)
 
 Item_in_subselect::Item_in_subselect(Item * left_exp,
 				     st_select_lex *select_lex):
-  Item_exists_subselect(), optimizer(0), transformed(0),
-  pushed_cond_guards(NULL), upper_item(0)
+  Item_exists_subselect(), left_expr_cache(0), first_execution(TRUE),
+  is_constant(FALSE), optimizer(0), pushed_cond_guards(NULL),
+  exec_method(NOT_TRANSFORMED), upper_item(0)
 {
   DBUG_ENTER("Item_in_subselect::Item_in_subselect");
   left_expr= left_exp;
@@ -712,13 +1023,18 @@ Item_in_subselect::Item_in_subselect(Item * left_exp,
   DBUG_VOID_RETURN;
 }
 
+int Item_in_subselect::get_identifier()
+{
+  return engine->get_identifier();
+}
+
 Item_allany_subselect::Item_allany_subselect(Item * left_exp,
                                              chooser_compare_func_creator fc,
 					     st_select_lex *select_lex,
 					     bool all_arg)
   :Item_in_subselect(), func_creator(fc), all(all_arg)
 {
-  DBUG_ENTER("Item_in_subselect::Item_in_subselect");
+  DBUG_ENTER("Item_allany_subselect::Item_allany_subselect");
   left_expr= left_exp;
   func= func_creator(all_arg);
   init(select_lex, new select_exists_subselect(this));
@@ -740,6 +1056,40 @@ void Item_exists_subselect::fix_length_and_dec()
   unit->global_parameters->select_limit= new Item_int((int32) 1);
 }
 
+
+/**
+  Add an expression cache for this subquery if it is needed
+
+  @param thd_arg         Thread handle
+
+  @details
+  The function checks whether an expression cache is needed for this item
+  and if so wraps the item into an item of the class
+  Item_exp_cache_wrapper with an appropriate expression cache set up there.
+
+  @note
+  used from Item::transform()
+
+  @return
+  new wrapper item if an expression cache is needed,
+  this item - otherwise
+*/
+
+Item* Item_exists_subselect::expr_cache_insert_transformer(uchar *thd_arg)
+{
+  THD *thd= (THD*) thd_arg;
+  DBUG_ENTER("Item_exists_subselect::expr_cache_insert_transformer");
+
+  if (expr_cache)
+    DBUG_RETURN(expr_cache);
+
+  if (substype() == EXISTS_SUBS && expr_cache_is_needed(thd) &&
+      (expr_cache= set_expr_cache(thd, depends_on)))
+    DBUG_RETURN(expr_cache);
+  DBUG_RETURN(this);
+}
+
+
 double Item_exists_subselect::val_real()
 {
   DBUG_ASSERT(fixed == 1);
@@ -889,6 +1239,8 @@ bool Item_in_subselect::val_bool()
 {
   DBUG_ASSERT(fixed == 1);
   null_value= was_null= FALSE;
+  if (is_constant)
+    return value;
   if (exec())
   {
     reset();
@@ -964,10 +1316,10 @@ my_decimal *Item_in_subselect::val_decimal(my_decimal *decimal_value)
           HAVING trigcond(<is_not_null_test>(ie))
 
   RETURN
-    RES_OK     - OK, either subquery was transformed, or appopriate
-                 predicates where injected into it.
-    RES_REDUCE - The subquery was reduced to non-subquery
-    RES_ERROR  - Error
+    RES_OK     Either subquery was transformed, or appropriate
+                       predicates were injected into it.
+    RES_REDUCE The subquery was reduced to non-subquery
+    RES_ERROR  Error
 */
 
 Item_subselect::trans_res
@@ -981,6 +1333,7 @@ Item_in_subselect::single_value_transformer(JOIN *join,
     Check that the right part of the subselect contains no more than one
     column. E.g. in SELECT 1 IN (SELECT * ..) the right part is (SELECT * ...)
   */
+  // psergey: duplicated_subselect_card_check
   if (select_lex->item_list.elements > 1)
   {
     my_error(ER_OPERAND_COLUMNS, MYF(0), 1);
@@ -1086,6 +1439,9 @@ Item_in_subselect::single_value_transformer(JOIN *join,
     }
     thd->lex->current_select= current;
 
+    /* We will refer to upper level cache array => we have to save it for SP */
+    optimizer->keep_top_level_cache();
+
     /*
       As far as  Item_ref_in_optimizer do not substitute itself on fix_fields
       we can use same item for all selects.
@@ -1095,8 +1451,17 @@ Item_in_subselect::single_value_transformer(JOIN *join,
 			      (char *)"<no matter>",
 			      (char *)in_left_expr_name);
 
-    master_unit->uncacheable|= UNCACHEABLE_DEPENDENT;
+    /*
+      The uncacheable property controls a number of actions, e.g. whether to
+      save/restore (via init_save_join_tab/restore_tmp) the original JOIN for
+      plans with a temp table where the original JOIN was overridden by
+      make_simple_join. The UNCACHEABLE_EXPLAIN is ignored by EXPLAIN, thus
+      non-correlated subqueries will not appear as such to EXPLAIN.
+    */
+    master_unit->uncacheable|= UNCACHEABLE_EXPLAIN;
+    select_lex->uncacheable|= UNCACHEABLE_EXPLAIN;
   }
+
   if (!abort_on_null && left_expr->maybe_null && !pushed_cond_guards)
   {
     if (!(pushed_cond_guards= (bool*)join->thd->alloc(sizeof(bool))))
@@ -1104,6 +1469,63 @@ Item_in_subselect::single_value_transformer(JOIN *join,
     pushed_cond_guards[0]= TRUE;
   }
 
+  /*
+    If this IN predicate can be computed via materialization, do not
+    perform the IN -> EXISTS transformation.
+  */
+  if (exec_method == MATERIALIZATION)
+    DBUG_RETURN(RES_OK);
+
+  /* Perform the IN=>EXISTS transformation. */
+  DBUG_RETURN(single_value_in_to_exists_transformer(join, func));
+}
+
+
+/**
+  Transform an IN predicate into EXISTS via predicate injection.
+
+  @details The transformation injects additional predicates into the subquery
+  (and makes the subquery correlated) as follows.
+
+  - If the subquery has aggregates, GROUP BY, or HAVING, convert to
+
+    SELECT ie FROM ...  HAVING subq_having AND 
+                               trigcond(oe $cmp$ ref_or_null_helper<ie>)
+                                   
+    the addition is wrapped into trigger only when we want to distinguish
+    between NULL and FALSE results.
+
+  - Otherwise (no aggregates/GROUP BY/HAVING) convert it to one of the
+    following:
+
+    = If we don't need to distinguish between NULL and FALSE subquery:
+        
+      SELECT 1 FROM ... WHERE (oe $cmp$ ie) AND subq_where
+
+    = If we need to distinguish between those:
+
+      SELECT 1 FROM ...
+        WHERE  subq_where AND trigcond((oe $cmp$ ie) OR (ie IS NULL))
+        HAVING trigcond(<is_not_null_test>(ie))
+
+    @param join  Join object of the subquery (i.e. 'child' join).
+    @param func  Subquery comparison creator
+
+    @retval RES_OK     Either subquery was transformed, or appropriate
+                       predicates were injected into it.
+    @retval RES_REDUCE The subquery was reduced to non-subquery
+    @retval RES_ERROR  Error
+*/
+
+Item_subselect::trans_res
+Item_in_subselect::single_value_in_to_exists_transformer(JOIN * join, Comp_creator *func)
+{
+  SELECT_LEX *select_lex= join->select_lex;
+  DBUG_ENTER("Item_in_subselect::single_value_in_to_exists_transformer");
+
+  /*
+    The IN=>EXISTS transformation makes non-correlated subqueries correlated.
+  */
   select_lex->uncacheable|= UNCACHEABLE_DEPENDENT;
   if (join->having || select_lex->with_sum_func ||
       select_lex->group_list.elements)
@@ -1283,19 +1705,21 @@ Item_subselect::trans_res
 Item_in_subselect::row_value_transformer(JOIN *join)
 {
   SELECT_LEX *select_lex= join->select_lex;
-  Item *having_item= 0;
   uint cols_num= left_expr->cols();
-  bool is_having_used= (join->having || select_lex->with_sum_func ||
-                        select_lex->group_list.first ||
-                        !select_lex->table_list.elements);
+
   DBUG_ENTER("Item_in_subselect::row_value_transformer");
 
-  if (select_lex->item_list.elements != left_expr->cols())
+  // psergey: duplicated_subselect_card_check
+  if (select_lex->item_list.elements != cols_num)
   {
-    my_error(ER_OPERAND_COLUMNS, MYF(0), left_expr->cols());
+    my_error(ER_OPERAND_COLUMNS, MYF(0), cols_num);
     DBUG_RETURN(RES_ERROR);
   }
 
+  /*
+    Wrap the current IN predicate in an Item_in_optimizer. The actual
+    substitution in the Item tree takes place in Item_subselect::fix_fields.
+  */
   if (!substitution)
   {
     //first call for this unit
@@ -1315,7 +1739,15 @@ Item_in_subselect::row_value_transformer(JOIN *join)
     optimizer->keep_top_level_cache();
 
     thd->lex->current_select= current;
-    master_unit->uncacheable|= UNCACHEABLE_DEPENDENT;
+    /*
+      The uncacheable property controls a number of actions, e.g. whether to
+      save/restore (via init_save_join_tab/restore_tmp) the original JOIN for
+      plans with a temp table where the original JOIN was overriden by
+      make_simple_join. The UNCACHEABLE_EXPLAIN is ignored by EXPLAIN, thus
+      non-correlated subqueries will not appear as such to EXPLAIN.
+    */
+    master_unit->uncacheable|= UNCACHEABLE_EXPLAIN;
+    select_lex->uncacheable|= UNCACHEABLE_EXPLAIN;
 
     if (!abort_on_null && left_expr->maybe_null && !pushed_cond_guards)
     {
@@ -1327,6 +1759,51 @@ Item_in_subselect::row_value_transformer(JOIN *join)
     }
   }
 
+  /*
+    If this IN predicate can be computed via materialization, do not
+    perform the IN -> EXISTS transformation.
+  */
+  if (exec_method == MATERIALIZATION)
+    DBUG_RETURN(RES_OK);
+
+  /* Perform the IN=>EXISTS transformation. */
+  DBUG_RETURN(row_value_in_to_exists_transformer(join));
+}
+
+
+/**
+  Transform a (possibly non-correlated) IN subquery into a correlated EXISTS.
+
+  @todo
+  The IF-ELSE below can be refactored so that there is no duplication of the
+  statements that create the new conditions. For this we have to invert the IF
+  and the FOR statements as this:
+  for (each left operand)
+    create the equi-join condition
+    if (is_having_used || !abort_on_null)
+      create the "is null" and is_not_null_test items
+    if (is_having_used)
+      add the equi-join and the null tests to HAVING
+    else
+      add the equi-join and the "is null" to WHERE
+      add the is_not_null_test to HAVING
+*/
+
+Item_subselect::trans_res
+Item_in_subselect::row_value_in_to_exists_transformer(JOIN * join)
+{
+  SELECT_LEX *select_lex= join->select_lex;
+  Item *having_item= 0;
+  uint cols_num= left_expr->cols();
+  bool is_having_used= (join->having || select_lex->with_sum_func ||
+                        select_lex->group_list.first ||
+                        !select_lex->table_list.elements);
+
+  DBUG_ENTER("Item_in_subselect::row_value_in_to_exists_transformer");
+
+  /*
+    The IN=>EXISTS transformation makes non-correlated subqueries correlated.
+  */
   select_lex->uncacheable|= UNCACHEABLE_DEPENDENT;
   if (is_having_used)
   {
@@ -1347,7 +1824,7 @@ Item_in_subselect::row_value_transformer(JOIN *join)
     for (uint i= 0; i < cols_num; i++)
     {
       DBUG_ASSERT((left_expr->fixed &&
-                   select_lex->ref_pointer_array[i]->fixed) ||
+                  select_lex->ref_pointer_array[i]->fixed) ||
                   (select_lex->ref_pointer_array[i]->type() == REF_ITEM &&
                    ((Item_ref*)(select_lex->ref_pointer_array[i]))->ref_type() ==
                     Item_ref::OUTER_REF));
@@ -1425,7 +1902,7 @@ Item_in_subselect::row_value_transformer(JOIN *join)
     {
       Item *item, *item_isnull;
       DBUG_ASSERT((left_expr->fixed &&
-                   select_lex->ref_pointer_array[i]->fixed) ||
+                  select_lex->ref_pointer_array[i]->fixed) ||
                   (select_lex->ref_pointer_array[i]->type() == REF_ITEM &&
                    ((Item_ref*)(select_lex->ref_pointer_array[i]))->ref_type() ==
                     Item_ref::OUTER_REF));
@@ -1570,9 +2047,7 @@ Item_in_subselect::select_in_like_transformer(JOIN *join, Comp_creator *func)
   }
 
   if (changed)
-  {
     DBUG_RETURN(RES_OK);
-  }
 
   thd->where= "IN/ALL/ANY subquery";
 
@@ -1580,6 +2055,8 @@ Item_in_subselect::select_in_like_transformer(JOIN *join, Comp_creator *func)
     In some optimisation cases we will not need this Item_in_optimizer
     object, but we can't know it here, but here we need address correct
     reference on left expresion.
+
+    //psergey: he means degenerate cases like "... IN (SELECT 1)"
   */
   if (!optimizer)
   {
@@ -1601,7 +2078,12 @@ Item_in_subselect::select_in_like_transformer(JOIN *join, Comp_creator *func)
   if (result)
     goto err;
 
-  transformed= 1;
+  /*
+    If we didn't choose an execution method up to this point, we choose
+    the IN=>EXISTS transformation.
+  */
+  if (exec_method == NOT_TRANSFORMED)
+    exec_method= IN_TO_EXISTS;
   arena= thd->activate_stmt_arena_if_needed(&backup);
 
   /*
@@ -1635,7 +2117,7 @@ err:
 
 void Item_in_subselect::print(String *str, enum_query_type query_type)
 {
-  if (transformed)
+  if (exec_method == IN_TO_EXISTS)
     str->append(STRING_WITH_LEN("<exists>"));
   else
   {
@@ -1648,29 +2130,250 @@ void Item_in_subselect::print(String *str, enum_query_type query_type)
 
 bool Item_in_subselect::fix_fields(THD *thd_arg, Item **ref)
 {
-  bool result = 0;
-  
+  uint outer_cols_num;
+  List<Item> *inner_cols;
+
+  if (exec_method == SEMI_JOIN)
+    return !( (*ref)= new Item_int(1));
+
+  /*
+    Check if the outer and inner IN operands match in those cases when we
+    will not perform IN=>EXISTS transformation. Currently this is when we
+    use subquery materialization.
+
+    The condition below is true when this method was called recursively from
+    inside JOIN::prepare for the JOIN object created by the call chain
+    Item_subselect::fix_fields -> subselect_single_select_engine::prepare,
+    which creates a JOIN object for the subquery and calls JOIN::prepare for
+    the JOIN of the subquery.
+    Notice that in some cases, this doesn't happen, and the check_cols()
+    test for each Item happens later in
+    Item_in_subselect::row_value_in_to_exists_transformer.
+    The reason for this mess is that our JOIN::prepare phase works top-down
+    instead of bottom-up, so we first do name resolution and semantic checks
+    for the outer selects, then for the inner.
+  */
+  if (engine &&
+      engine->engine_type() == subselect_engine::SINGLE_SELECT_ENGINE &&
+      ((subselect_single_select_engine*)engine)->join)
+  {
+    outer_cols_num= left_expr->cols();
+
+    if (unit->is_union())
+      inner_cols= &(unit->types);
+    else
+      inner_cols= &(unit->first_select()->item_list);
+    if (outer_cols_num != inner_cols->elements)
+    {
+      my_error(ER_OPERAND_COLUMNS, MYF(0), outer_cols_num);
+      return TRUE;
+    }
+    if (outer_cols_num > 1)
+    {
+      List_iterator<Item> inner_col_it(*inner_cols);
+      Item *inner_col;
+      for (uint i= 0; i < outer_cols_num; i++)
+      {
+        inner_col= inner_col_it++;
+        if (inner_col->check_cols(left_expr->element_index(i)->cols()))
+          return TRUE;
+      }
+    }
+  }
+
   if ((thd_arg->lex->context_analysis_only & CONTEXT_ANALYSIS_ONLY_VIEW) &&
-      left_expr && !left_expr->fixed)
-    result = left_expr->fix_fields(thd_arg, &left_expr);
+      left_expr && !left_expr->fixed &&
+      left_expr->fix_fields(thd_arg, &left_expr))
+    return TRUE;
+  if (Item_subselect::fix_fields(thd_arg, ref))
+    return TRUE;
+
+  fixed= TRUE;
+  return FALSE;
+}
+
+
+void Item_in_subselect::fix_after_pullout(st_select_lex *new_parent, Item **ref)
+{
+  left_expr->fix_after_pullout(new_parent, &left_expr);
+  Item_subselect::fix_after_pullout(new_parent, ref);
+}
+
+void Item_in_subselect::update_used_tables()
+{
+  Item_subselect::update_used_tables();
+  left_expr->update_used_tables();
+  used_tables_cache |= left_expr->used_tables();
+}
+
+/**
+  Try to create an engine to compute the subselect via materialization,
+  and if this fails, revert to execution via the IN=>EXISTS transformation.
+
+  @details
+    The purpose of this method is to hide the implementation details
+    of this Item's execution. The method creates a new engine for
+    materialized execution, and initializes the engine.
+
+    If this initialization fails
+    - either because it wasn't possible to create the needed temporary table
+      and its index,
+    - or because of a memory allocation error,
+    then we revert back to execution via the IN=>EXISTS transformation.
+
+    The initialization of the new engine is divided in two parts - a permanent
+    one that lives across prepared statements, and one that is repeated for each
+    execution.
+
+  @returns
+    @retval TRUE  memory allocation error occurred
+    @retval FALSE an execution method was chosen successfully
+*/
+
+bool Item_in_subselect::setup_engine()
+{
+  subselect_hash_sj_engine *new_engine= NULL;
+  bool res= FALSE;
+
+  DBUG_ENTER("Item_in_subselect::setup_engine");
+
+  if (engine->engine_type() == subselect_engine::SINGLE_SELECT_ENGINE)
+  {
+    /* Create/initialize objects in permanent memory. */
+    subselect_single_select_engine *old_engine;
+    Query_arena *arena= thd->stmt_arena, backup;
+
+    old_engine= (subselect_single_select_engine*) engine;
+
+    if (arena->is_conventional())
+      arena= 0;
+    else
+      thd->set_n_backup_active_arena(arena, &backup);
+
+    if (!(new_engine= new subselect_hash_sj_engine(thd, this,
+                                                   old_engine)) ||
+        new_engine->init_permanent(unit->get_unit_column_types()))
+    {
+      Item_subselect::trans_res trans_res;
+      /*
+        If for some reason we cannot use materialization for this IN predicate,
+        delete all materialization-related objects, and apply the IN=>EXISTS
+        transformation.
+      */
+      delete new_engine;
+      new_engine= NULL;
+      exec_method= NOT_TRANSFORMED;
+      if (left_expr->cols() == 1)
+        trans_res= single_value_in_to_exists_transformer(old_engine->join,
+                                                         &eq_creator);
+      else
+        trans_res= row_value_in_to_exists_transformer(old_engine->join);
+      res= (trans_res != Item_subselect::RES_OK);
+    }
+    if (new_engine)
+      engine= new_engine;
 
-  return result || Item_subselect::fix_fields(thd_arg, ref);
+    if (arena)
+      thd->restore_active_arena(arena, &backup);
+  }
+  else
+  {
+    DBUG_ASSERT(engine->engine_type() == subselect_engine::HASH_SJ_ENGINE);
+    new_engine= (subselect_hash_sj_engine*) engine;
+  }
+
+  /* Initializations done in runtime memory, repeated for each execution. */
+  if (new_engine)
+  {
+    /*
+      Reset the LIMIT 1 set in Item_exists_subselect::fix_length_and_dec.
+      TODO:
+      Currently we set the subquery LIMIT to infinity, and this is correct
+      because we forbid at parse time LIMIT inside IN subqueries (see
+      Item_in_subselect::test_limit). However, once we allow this, here
+      we should set the correct limit if given in the query.
+    */
+    unit->global_parameters->select_limit= NULL;
+    if ((res= new_engine->init_runtime()))
+      DBUG_RETURN(res);
+  }
+
+  DBUG_RETURN(res);
+}
+
+
+/**
+  Initialize the cache of the left operand of the IN predicate.
+
+  @note This method has the same purpose as alloc_group_fields(),
+  but it takes a different kind of collection of items, and the
+  list we push to is dynamically allocated.
+
+  @retval TRUE  if a memory allocation error occurred or the cache is
+                not applicable to the current query
+  @retval FALSE if success
+*/
+
+bool Item_in_subselect::init_left_expr_cache()
+{
+  JOIN *outer_join;
+
+  outer_join= unit->outer_select()->join;
+  /*
+    An IN predicate might be evaluated in a query for which all tables have
+    been optimized away.
+  */ 
+  if (!outer_join || !outer_join->tables || !outer_join->tables_list)
+    return TRUE;
+
+  if (!(left_expr_cache= new List<Cached_item>))
+    return TRUE;
+
+  for (uint i= 0; i < left_expr->cols(); i++)
+  {
+    Cached_item *cur_item_cache= new_Cached_item(thd,
+                                                 left_expr->element_index(i),
+                                                 FALSE);
+    if (!cur_item_cache || left_expr_cache->push_front(cur_item_cache))
+      return TRUE;
+  }
+  return FALSE;
+}
+
+
+/*
+  Callback to test if an IN predicate is expensive.
+
+  @details
+    IN predicates are considered expensive only if they will be executed via
+    materialization. The return value affects the behavior of
+    make_cond_for_table() in such a way that it is unchanged when we use
+    the IN=>EXISTS transformation to compute IN.
+
+  @retval TRUE  if the predicate is expensive
+  @retval FALSE otherwise
+*/
+
+bool Item_in_subselect::is_expensive_processor(uchar *arg)
+{
+  return exec_method == MATERIALIZATION;
 }
 
 
 Item_subselect::trans_res
 Item_allany_subselect::select_transformer(JOIN *join)
 {
-  transformed= 1;
+  DBUG_ENTER("Item_allany_subselect::select_transformer");
+  exec_method= IN_TO_EXISTS;
   if (upper_item)
     upper_item->show= 1;
-  return select_in_like_transformer(join, func);
+  DBUG_RETURN(select_in_like_transformer(join, func));
 }
 
 
 void Item_allany_subselect::print(String *str, enum_query_type query_type)
 {
-  if (transformed)
+  if (exec_method == IN_TO_EXISTS)
     str->append(STRING_WITH_LEN("<exists>"));
   else
   {
@@ -1692,21 +2395,24 @@ void subselect_engine::set_thd(THD *thd_arg)
 
 
 subselect_single_select_engine::
-subselect_single_select_engine(st_select_lex *select,
-			       select_subselect *result_arg,
+subselect_single_select_engine(THD *thd_arg, st_select_lex *select,
+			       select_result_interceptor *result_arg,
 			       Item_subselect *item_arg)
-  :subselect_engine(item_arg, result_arg),
-   prepared(0), optimized(0), executed(0),
-   select_lex(select), join(0)
+  :subselect_engine(thd_arg, item_arg, result_arg),
+   prepared(0), executed(0), select_lex(select), join(0)
 {
   select_lex->master_unit()->item= item_arg;
 }
 
+int subselect_single_select_engine::get_identifier()
+{
+  return select_lex->select_number; 
+}
 
 void subselect_single_select_engine::cleanup()
 {
   DBUG_ENTER("subselect_single_select_engine::cleanup");
-  prepared= optimized= executed= 0;
+  prepared= executed= 0;
   join= 0;
   result->cleanup();
   DBUG_VOID_RETURN;
@@ -1749,21 +2455,21 @@ bool subselect_union_engine::no_rows()
   return test(!unit->fake_select_lex->join->send_records);
 }
 
+
 void subselect_uniquesubquery_engine::cleanup()
 {
   DBUG_ENTER("subselect_uniquesubquery_engine::cleanup");
-  /*
-    subselect_uniquesubquery_engine have not 'result' assigbed, so we do not
-    cleanup() it
-  */
+  /* Tell handler we don't need the index anymore */
+  if (tab->table->file->inited)
+    tab->table->file->ha_index_end();
   DBUG_VOID_RETURN;
 }
 
 
-subselect_union_engine::subselect_union_engine(st_select_lex_unit *u,
-					       select_subselect *result_arg,
+subselect_union_engine::subselect_union_engine(THD *thd_arg, st_select_lex_unit *u,
+					       select_result_interceptor *result_arg,
 					       Item_subselect *item_arg)
-  :subselect_engine(item_arg, result_arg)
+  :subselect_engine(thd_arg, item_arg, result_arg)
 {
   unit= u;
   if (!result_arg)				//out of memory
@@ -1772,6 +2478,32 @@ subselect_union_engine::subselect_union_engine(st_select_lex_unit *u,
 }
 
 
+/**
+  Create and prepare the JOIN object that represents the query execution
+  plan for the subquery.
+
+  @details
+  This method is called from Item_subselect::fix_fields. For prepared
+  statements it is called both during the PREPARE and EXECUTE phases in the
+  following ways:
+  - During PREPARE the optimizer needs some properties
+    (join->fields_list.elements) of the JOIN to proceed with preparation of
+    the remaining query (namely to complete ::fix_fields for the subselect
+    related classes). At the end of PREPARE the JOIN is deleted.
+  - When we EXECUTE the query, Item_subselect::fix_fields is called again, and
+    the JOIN object is re-created again, prepared and executed. In the end of
+    execution it is deleted.
+  In all cases the JOIN is created in runtime memory (not in the permanent
+  memory root).
+
+  @todo
+  Re-check what properties of 'join' are needed during prepare, and see if
+  we can avoid creating a JOIN during JOIN::prepare of the outer join.
+
+  @retval 0  if success
+  @retval 1  if error
+*/
+
 int subselect_single_select_engine::prepare()
 {
   if (prepared)
@@ -1813,8 +2545,8 @@ int subselect_union_engine::prepare()
 
 int subselect_uniquesubquery_engine::prepare()
 {
-  //this never should be called
-  DBUG_ASSERT(0);
+  /* Should never be called. */
+  DBUG_ASSERT(FALSE);
   return 1;
 }
 
@@ -1861,7 +2593,7 @@ void subselect_engine::set_row(List<Item> &item_list, Item_cache **row)
     if (!(row[i]= Item_cache::get_cache(sel_item)))
       return;
     row[i]->setup(sel_item);
-    row[i]->store(sel_item);
+ //psergey-backport-timours:   row[i]->store(sel_item);
   }
   if (item_list.elements > 1)
     res_type= ROW_RESULT;
@@ -1909,11 +2641,10 @@ int subselect_single_select_engine::exec()
   char const *save_where= thd->where;
   SELECT_LEX *save_select= thd->lex->current_select;
   thd->lex->current_select= select_lex;
-  if (!optimized)
+  if (!join->optimized)
   {
     SELECT_LEX_UNIT *unit= select_lex->master_unit();
 
-    optimized= 1;
     unit->set_limit(unit->global_parameters);
     if (join->optimize())
     {
@@ -2068,14 +2799,23 @@ int subselect_uniquesubquery_engine::scan_table()
   for (;;)
   {
     error=table->file->ha_rnd_next(table->record[0]);
-    if (error && error != HA_ERR_END_OF_FILE)
-    {
-      error= report_error(table, error);
-      break;
+    if (error) {
+      if (error == HA_ERR_RECORD_DELETED)
+      {
+        error= 0;
+        continue;
+      }
+      if (error == HA_ERR_END_OF_FILE)
+      {
+        error= 0;
+        break;
+      }
+      else
+      {
+        error= report_error(table, error);
+        break;
+      }
     }
-    /* No more rows */
-    if (table->status)
-      break;
 
     if (!cond || cond->val_int())
     {
@@ -2186,6 +2926,56 @@ bool subselect_uniquesubquery_engine::copy_ref_key()
 
 
 /*
+  @retval  1  A NULL was found in the outer reference, index lookup is
+              not applicable, the outer ref is unusable as a lookup key,
+              use some other method to find a match.
+  @retval  0  The outer ref was copied into an index lookup key.
+  @retval -1  The outer ref cannot possibly match any row, IN is FALSE.
+*/
+/* TIMOUR: this method is a variant of copy_ref_key(), needs refactoring. */
+
+int subselect_uniquesubquery_engine::copy_ref_key_simple()
+{
+  for (store_key **copy= tab->ref.key_copy ; *copy ; copy++)
+  {
+    enum store_key::store_key_result store_res;
+    store_res= (*copy)->copy();
+    tab->ref.key_err= store_res;
+
+    /*
+      When there is a NULL part in the key we don't need to make index
+      lookup for such key thus we don't need to copy whole key.
+      If we later should do a sequential scan return OK. Fail otherwise.
+
+      See also the comment for the subselect_uniquesubquery_engine::exec()
+      function.
+    */
+    null_keypart= (*copy)->null_key;
+    if (null_keypart)
+      return 1;
+
+    /*
+      Check if the error is equal to STORE_KEY_FATAL. This is not expressed 
+      using the store_key::store_key_result enum because ref.key_err is a 
+      boolean and we want to detect both TRUE and STORE_KEY_FATAL from the 
+      space of the union of the values of [TRUE, FALSE] and 
+      store_key::store_key_result.  
+      TODO: fix the variable and return types.
+    */
+    if (store_res == store_key::STORE_KEY_FATAL)
+    {
+      /*
+       Error converting the left IN operand to the column type of the right
+       IN operand. 
+      */
+      return -1;
+    }
+  }
+  return 0;
+}
+
+
+/*
   Execute subselect
 
   SYNOPSIS
@@ -2225,7 +3015,13 @@ int subselect_uniquesubquery_engine::exec()
  
   /* TODO: change to use of 'full_scan' here? */
   if (copy_ref_key())
+  {
+    /*
+      TIMOUR: copy_ref_key() == 1 means NULL result, not error, why return 1?
+      Check who relies on this result.
+    */
     DBUG_RETURN(1);
+  }
   if (table->status)
   {
     /* 
@@ -2266,10 +3062,51 @@ int subselect_uniquesubquery_engine::exec()
 }
 
 
+/*
+  Exact-key index lookup: IN value is 1 if the row found satisfies 'cond', else 0.
+*/
+
+int subselect_uniquesubquery_engine::index_lookup()
+{
+  DBUG_ENTER("subselect_uniquesubquery_engine::index_lookup");
+  int error;
+  TABLE *table= tab->table;
+ 
+  if (!table->file->inited)
+    table->file->ha_index_init(tab->ref.key, 0);
+  error= table->file->ha_index_read_map(table->record[0],
+                                        tab->ref.key_buff,
+                                        make_prev_keypart_map(tab->
+                                                              ref.key_parts),
+                                        HA_READ_KEY_EXACT);
+  DBUG_PRINT("info", ("lookup result: %i", error));
+
+  if (error && error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE)
+  {
+    /*
+      TIMOUR: I don't understand at all when do we need to call report_error.
+      In most places where we access an index, we don't do this. Why here?
+    */
+    error= report_error(table, error);
+    DBUG_RETURN(error);
+  }
+
+  table->null_row= 0;
+  if (!error && (!cond || cond->val_int()))
+    ((Item_in_subselect *) item)->value= 1;
+  else
+    ((Item_in_subselect *) item)->value= 0;
+
+  DBUG_RETURN(0);
+}
+
+
+
 subselect_uniquesubquery_engine::~subselect_uniquesubquery_engine()
 {
   /* Tell handler we don't need the index anymore */
-  tab->table->file->ha_index_end();
+  //psergey-merge-todo: the following was gone in 6.0:
+ //psergey-merge: don't need this after all: tab->table->file->ha_index_end();
 }
 
 
@@ -2277,7 +3114,7 @@ subselect_uniquesubquery_engine::~subselect_uniquesubquery_engine()
   Index-lookup subselect 'engine' - run the subquery
 
   SYNOPSIS
-    subselect_uniquesubquery_engine:exec()
+    subselect_indexsubquery_engine:exec()
       full_scan 
 
   DESCRIPTION
@@ -2415,8 +3252,10 @@ int subselect_indexsubquery_engine::exec()
 
 uint subselect_single_select_engine::cols()
 {
-  DBUG_ASSERT(select_lex->join != 0); // should be called after fix_fields()
-  return select_lex->join->fields_list.elements;
+  //psergey-sj-backport: the following assert was gone in 6.0:
+  //DBUG_ASSERT(select_lex->join != 0); // should be called after fix_fields()
+  //return select_lex->join->fields_list.elements;
+  return select_lex->item_list.elements;
 }
 
 
@@ -2498,10 +3337,20 @@ void subselect_union_engine::print(String *str, enum_query_type query_type)
 void subselect_uniquesubquery_engine::print(String *str,
                                             enum_query_type query_type)
 {
+  char *table_name= tab->table->s->table_name.str;
   str->append(STRING_WITH_LEN("<primary_index_lookup>("));
   tab->ref.items[0]->print(str, query_type);
   str->append(STRING_WITH_LEN(" in "));
-  str->append(tab->table->s->table_name.str, tab->table->s->table_name.length);
+  if (tab->table->s->table_category == TABLE_CATEGORY_TEMPORARY)
+  {
+    /*
+      Temporary tables' names change across runs, so they can't be used for
+      EXPLAIN EXTENDED.
+    */
+    str->append(STRING_WITH_LEN("<temporary table>"));
+  }
+  else
+    str->append(table_name, tab->table->s->table_name.length);
   KEY *key_info= tab->table->key_info+ tab->ref.key;
   str->append(STRING_WITH_LEN(" on "));
   str->append(key_info->name);
@@ -2513,6 +3362,29 @@ void subselect_uniquesubquery_engine::print(String *str,
   str->append(')');
 }
 
+/*
+TODO:
+The above ::print method should be changed as below. Do it after
+all other tests pass.
+
+void subselect_uniquesubquery_engine::print(String *str)
+{
+  KEY *key_info= tab->table->key_info + tab->ref.key;
+  str->append(STRING_WITH_LEN("<primary_index_lookup>("));
+  for (uint i= 0; i < key_info->key_parts; i++)
+    tab->ref.items[i]->print(str);
+  str->append(STRING_WITH_LEN(" in "));
+  str->append(tab->table->s->table_name.str, tab->table->s->table_name.length);
+  str->append(STRING_WITH_LEN(" on "));
+  str->append(key_info->name);
+  if (cond)
+  {
+    str->append(STRING_WITH_LEN(" where "));
+    cond->print(str);
+  }
+  str->append(')');
+}
+*/
 
 void subselect_indexsubquery_engine::print(String *str,
                                            enum_query_type query_type)
@@ -2552,7 +3424,7 @@ void subselect_indexsubquery_engine::print(String *str,
 */
 
 bool subselect_single_select_engine::change_result(Item_subselect *si,
-                                                 select_subselect *res)
+                                                 select_result_interceptor *res)
 {
   item= si;
   result= res;
@@ -2573,7 +3445,7 @@ bool subselect_single_select_engine::change_result(Item_subselect *si,
 */
 
 bool subselect_union_engine::change_result(Item_subselect *si,
-                                         select_subselect *res)
+                                           select_result_interceptor *res)
 {
   item= si;
   int rc= unit->change_result(res, result);
@@ -2595,7 +3467,7 @@ bool subselect_union_engine::change_result(Item_subselect *si,
 */
 
 bool subselect_uniquesubquery_engine::change_result(Item_subselect *si,
-                                                  select_subselect *res)
+                                                    select_result_interceptor *res)
 {
   DBUG_ASSERT(0);
   return TRUE;
@@ -2663,5 +3535,1722 @@ bool subselect_union_engine::no_tables()
 bool subselect_uniquesubquery_engine::no_tables()
 {
   /* returning value is correct, but this method should never be called */
+  DBUG_ASSERT(FALSE);
   return 0;
 }
+
+
+/******************************************************************************
+  WL#1110 - Implementation of class subselect_hash_sj_engine
+******************************************************************************/
+
+
+/**
+  Check if an IN predicate should be executed via partial matching using
+  only schema information.
+
+  @details
+  This test essentially has three results:
+  - partial matching is applicable, but cannot be executed due to a
+    limitation in the total number of indexes, as a result we can't
+    use subquery materialization at all.
+  - partial matching is either applicable or not, and this can be
+    determined by looking at 'this->max_keys'.
+  If max_keys > 1, then we need partial matching because there are
+  more indexes than just the one we use during materialization to
+  remove duplicates.
+
+  @note
+  TIMOUR: The schema-based analysis for partial matching can be done once for
+  prepared statement and remembered. It is done here to remove the need to
+  save/restore all related variables between each re-execution, thus making
+  the code simpler.
+
+  @retval PARTIAL_MATCH  if a partial match should be used
+  @retval COMPLETE_MATCH if a complete match (index lookup) should be used
+*/
+
+subselect_hash_sj_engine::exec_strategy
+subselect_hash_sj_engine::get_strategy_using_schema()
+{
+  Item_in_subselect *item_in= (Item_in_subselect *) item;
+
+  if (item_in->is_top_level_item())
+    return COMPLETE_MATCH;
+  else
+  {
+    List_iterator<Item> inner_col_it(*item_in->unit->get_unit_column_types());
+    Item *outer_col, *inner_col;
+
+    for (uint i= 0; i < item_in->left_expr->cols(); i++)
+    {
+      outer_col= item_in->left_expr->element_index(i);
+      inner_col= inner_col_it++;
+
+      if (!inner_col->maybe_null && !outer_col->maybe_null)
+        bitmap_set_bit(&non_null_key_parts, i);
+      else
+      {
+        bitmap_set_bit(&partial_match_key_parts, i);
+        ++count_partial_match_columns;
+      }
+    }
+  }
+
+  /* If no column contains NULLs use regular hash index lookups. */
+  if (count_partial_match_columns)
+    return PARTIAL_MATCH;
+  return COMPLETE_MATCH;
+}
+
+
+/**
+  Test whether an IN predicate must be computed via partial matching
+  based on the NULL statistics for each column of a materialized subquery.
+
+  @details The procedure analyzes column NULL statistics, updates the
+  matching type of columns that cannot be NULL or that contain only NULLs.
+  Based on this, the procedure determines the final execution strategy for
+  the [NOT] IN predicate.
+
+  @retval PARTIAL_MATCH  if a partial match should be used
+  @retval COMPLETE_MATCH if a complete match (index lookup) should be used
+*/
+
+subselect_hash_sj_engine::exec_strategy
+subselect_hash_sj_engine::get_strategy_using_data()
+{
+  Item_in_subselect *item_in= (Item_in_subselect *) item;
+  select_materialize_with_stats *result_sink=
+    (select_materialize_with_stats *) result;
+  Item *outer_col;
+
+  /*
+    If we already determined that a complete match is enough based on schema
+    information, nothing can be better.
+  */
+  if (strategy == COMPLETE_MATCH)
+    return COMPLETE_MATCH;
+
+  for (uint i= 0; i < item_in->left_expr->cols(); i++)
+  {
+    if (!bitmap_is_set(&partial_match_key_parts, i))
+      continue;
+    outer_col= item_in->left_expr->element_index(i);
+    /*
+      If column 'i' doesn't contain NULLs, and the corresponding outer reference
+      cannot have a NULL value, then 'i' is a non-nullable column.
+    */
+    if (result_sink->get_null_count_of_col(i) == 0 && !outer_col->maybe_null)
+    {
+      bitmap_clear_bit(&partial_match_key_parts, i);
+      bitmap_set_bit(&non_null_key_parts, i);
+      --count_partial_match_columns;
+    }
+    if (result_sink->get_null_count_of_col(i) ==
+               tmp_table->file->stats.records)
+      ++count_null_only_columns;
+  }
+
+  /* If no column contains NULLs use regular hash index lookups. */
+  if (!count_partial_match_columns)
+    return COMPLETE_MATCH;
+  return PARTIAL_MATCH;
+}
+
+
+void
+subselect_hash_sj_engine::choose_partial_match_strategy(
+  bool has_non_null_key, bool has_covering_null_row,
+  MY_BITMAP *partial_match_key_parts)
+{
+  ulonglong pm_buff_size;
+
+  DBUG_ASSERT(strategy == PARTIAL_MATCH);
+  /*
+    Choose according to global optimizer switch. If only one of the switches is
+    'ON', then the remaining strategy is the only possible one. The only cases
+    when this will be overridden is when the total size of all buffers for the
+    merge strategy is bigger than the 'rowid_merge_buff_size' system variable,
+    or if there isn't enough physical memory to allocate the buffers.
+  */
+  if (!optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE) &&
+       optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN))
+    strategy= PARTIAL_MATCH_SCAN;
+  else if
+     ( optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE) &&
+      !optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN))
+    strategy= PARTIAL_MATCH_MERGE;
+
+  /*
+    If both switches are ON, or both are OFF, we interpret that as "let the
+    optimizer decide". Perform a cost based choice between the two partial
+    matching strategies.
+  */
+  /*
+    TIMOUR: the above interpretation of the switch values could be changed to:
+    - if both are ON - let the optimizer decide,
+    - if both are OFF - do not use partial matching, therefore do not use
+      materialization in non-top-level predicates.
+    The problem with this is that we know for sure if we need partial matching
+    only after the subquery is materialized, and this is too late to revert to
+    the IN=>EXISTS strategy.
+  */
+  if (strategy == PARTIAL_MATCH)
+  {
+    /*
+      TIMOUR: Currently we use a super simplistic measure. This will be
+      addressed in a separate task.
+    */
+    if (tmp_table->file->stats.records < 100)
+      strategy= PARTIAL_MATCH_SCAN;
+    else
+      strategy= PARTIAL_MATCH_MERGE;
+  }
+
+  /* Check if there is enough memory for the rowid merge strategy. */
+  if (strategy == PARTIAL_MATCH_MERGE)
+  {
+    pm_buff_size= rowid_merge_buff_size(has_non_null_key,
+                                        has_covering_null_row,
+                                        partial_match_key_parts);
+    if (pm_buff_size > thd->variables.rowid_merge_buff_size)
+      strategy= PARTIAL_MATCH_SCAN;
+  }
+}
+
+
+/*
+  Compute the memory size of all buffers proportional to the number of rows
+  in tmp_table.
+
+  @details
+  If the result is bigger than thd->variables.rowid_merge_buff_size, partial
+  matching via merging is not applicable.
+*/
+
+ulonglong subselect_hash_sj_engine::rowid_merge_buff_size(
+  bool has_non_null_key, bool has_covering_null_row,
+  MY_BITMAP *partial_match_key_parts)
+{
+  /* Total size of all buffers used by partial matching. */
+  ulonglong buff_size;
+  ha_rows row_count= tmp_table->file->stats.records;
+  uint rowid_length= tmp_table->file->ref_length;
+  select_materialize_with_stats *result_sink=
+    (select_materialize_with_stats *) result;
+
+  /* Size of the subselect_rowid_merge_engine::row_num_to_rowid buffer. */
+  buff_size= row_count * rowid_length * sizeof(uchar);
+
+  if (has_non_null_key)
+  {
+    /* Add the size of Ordered_key::key_buff of the only non-NULL key. */
+    buff_size+= row_count * sizeof(rownum_t);
+  }
+
+  if (!has_covering_null_row)
+  {
+    for (uint i= 0; i < partial_match_key_parts->n_bits; i++)
+    {
+      if (!bitmap_is_set(partial_match_key_parts, i) ||
+          result_sink->get_null_count_of_col(i) == row_count)
+        continue; /* In these cases we wouldn't construct Ordered keys. */
+
+      /* Add the size of Ordered_key::key_buff */
+      buff_size+= (row_count - result_sink->get_null_count_of_col(i)) *
+                         sizeof(rownum_t);
+      /* Add the size of Ordered_key::null_key */
+      buff_size+= bitmap_buffer_size(result_sink->get_max_null_of_col(i));
+    }
+  }
+
+  return buff_size;
+}
+
+
+/*
+  Initialize a MY_BITMAP with a buffer allocated on the current
+  memory root.
+  TIMOUR: move to bitmap C file?
+*/
+
+static my_bool
+bitmap_init_memroot(MY_BITMAP *map, uint n_bits, MEM_ROOT *mem_root)
+{
+  my_bitmap_map *bitmap_buf;
+
+  if (!(bitmap_buf= (my_bitmap_map*) alloc_root(mem_root,
+                                                bitmap_buffer_size(n_bits))) ||
+      bitmap_init(map, bitmap_buf, n_bits, FALSE))
+    return TRUE;
+  bitmap_clear_all(map);
+  return FALSE;
+}
+
+
+/**
+  Create all structures needed for IN execution that can live between PS
+  reexecution.
+
+  @param tmp_columns the items that produce the data for the temp table
+
+  @details
+  - Create a temporary table to store the result of the IN subquery. The
+    temporary table has one hash index on all its columns.
+  - Create a new result sink that sends the result stream of the subquery to
+    the temporary table.
+
+  @notice:
+    Currently Item_subselect::init() already chooses and creates at parse
+    time an engine with a corresponding JOIN to execute the subquery.
+
+  @retval TRUE  if error
+  @retval FALSE otherwise
+*/
+
+bool subselect_hash_sj_engine::init_permanent(List<Item> *tmp_columns)
+{
+  /* Options to create_tmp_table. */
+  ulonglong tmp_create_options= thd->options | TMP_TABLE_ALL_COLUMNS;
+                             /* | TMP_TABLE_FORCE_MYISAM; TIMOUR: force MYISAM */
+
+  DBUG_ENTER("subselect_hash_sj_engine::init_permanent");
+
+  if (bitmap_init_memroot(&non_null_key_parts, tmp_columns->elements,
+                            thd->mem_root) ||
+      bitmap_init_memroot(&partial_match_key_parts, tmp_columns->elements,
+                            thd->mem_root))
+    DBUG_RETURN(TRUE);
+
+  /*
+    Create and initialize a select result interceptor that stores the
+    result stream in a temporary table. The temporary table itself is
+    managed (created/filled/etc) internally by the interceptor.
+  */
+/*
+  TIMOUR:
+  Select a more efficient result sink when we know there is no need to collect
+  data statistics.
+
+  if (strategy == COMPLETE_MATCH)
+  {
+    if (!(result= new select_union))
+      DBUG_RETURN(TRUE);
+  }
+  else if (strategy == PARTIAL_MATCH)
+  {
+  if (!(result= new select_materialize_with_stats))
+    DBUG_RETURN(TRUE);
+  }
+*/
+  if (!(result= new select_materialize_with_stats))
+    DBUG_RETURN(TRUE);
+
+  if (((select_union*) result)->create_result_table(
+                         thd, tmp_columns, TRUE, tmp_create_options,
+                         "materialized subselect", TRUE))
+    DBUG_RETURN(TRUE);
+
+  tmp_table= ((select_union*) result)->table;
+
+  /*
+    If the subquery has blobs, or the total key length is bigger than
+    some length, or the total number of key parts is more than the
+    allowed maximum (currently MAX_REF_PARTS == 16), then the created
+    index cannot be used for lookups and we can't use hash semi
+    join. If this is the case, delete the temporary table since it
+    will not be used, and tell the caller we failed to initialize the
+    engine.
+  */
+  if (tmp_table->s->keys == 0)
+  {
+    DBUG_ASSERT(
+      tmp_table->s->uniques ||
+      tmp_table->key_info->key_length >= tmp_table->file->max_key_length() ||
+      tmp_table->key_info->key_parts > tmp_table->file->max_key_parts());
+    free_tmp_table(thd, tmp_table);
+    tmp_table= NULL;
+    delete result;
+    result= NULL;
+    DBUG_RETURN(TRUE);
+  }
+
+  /*
+    Make sure there is only one index on the temp table, and it doesn't have
+    the extra key part created when s->uniques > 0.
+  */
+  DBUG_ASSERT(tmp_table->s->keys == 1 &&
+              ((Item_in_subselect *) item)->left_expr->cols() ==
+              tmp_table->key_info->key_parts);
+
+  if (make_semi_join_conds() ||
+      /* A unique_engine is used both for complete and partial matching. */
+      !(lookup_engine= make_unique_engine()))
+    DBUG_RETURN(TRUE);
+
+  DBUG_RETURN(FALSE);
+}
+
+
+/*
+  Create an artificial condition to post-filter those rows matched by index
+  lookups that cannot be distinguished by the index lookup procedure.
+
+  @notes
+  The need for post-filtering may occur e.g. because of
+  truncation. Prepared statements execution requires that fix_fields is
+  called for every execution. In order to call fix_fields we need to
+  create a Name_resolution_context and a corresponding TABLE_LIST for
+  the temporary table for the subquery, so that all column references
+  to the materialized subquery table can be resolved correctly.
+
+  @returns
+    @retval TRUE  memory allocation error occurred
+    @retval FALSE the conditions were created and resolved (fixed)
+*/
+
+bool subselect_hash_sj_engine::make_semi_join_conds()
+{
+  /*
+    Table reference for tmp_table that is used to resolve column references
+    (Item_fields) to columns in tmp_table.
+  */
+  TABLE_LIST *tmp_table_ref;
+  /* Name resolution context for all tmp_table columns created below. */
+  Name_resolution_context *context;
+  Item_in_subselect *item_in= (Item_in_subselect *) item;
+
+  DBUG_ENTER("subselect_hash_sj_engine::make_semi_join_conds");
+  DBUG_ASSERT(semi_join_conds == NULL);
+
+  if (!(semi_join_conds= new Item_cond_and))
+    DBUG_RETURN(TRUE);
+
+  if (!(tmp_table_ref= (TABLE_LIST*) thd->alloc(sizeof(TABLE_LIST))))
+    DBUG_RETURN(TRUE);
+
+  tmp_table_ref->init_one_table("", "materialized subselect", TL_READ);
+  tmp_table_ref->table= tmp_table;
+
+  /*
+    Check the allocation before using the context, like every other
+    allocation in this function.
+  */
+  if (!(context= new Name_resolution_context))
+    DBUG_RETURN(TRUE);
+  context->init();
+  context->first_name_resolution_table=
+    context->last_name_resolution_table= tmp_table_ref;
+
+  /* Build one (left_expr[i] = tmp_table.field[i]) equality per IN column. */
+  for (uint i= 0; i < item_in->left_expr->cols(); i++)
+  {
+    Item_func_eq *eq_cond; /* New equi-join condition for the current column. */
+    /* Item for the corresponding field from the materialized temp table. */
+    Item_field *right_col_item;
+
+    if (!(right_col_item= new Item_field(thd, context, tmp_table->field[i])) ||
+        !(eq_cond= new Item_func_eq(item_in->left_expr->element_index(i),
+                                    right_col_item)) ||
+        (((Item_cond_and*)semi_join_conds)->add(eq_cond)))
+    {
+      delete semi_join_conds;
+      semi_join_conds= NULL;
+      DBUG_RETURN(TRUE);
+    }
+  }
+  /* Resolve the whole conjunction against the temp table. */
+  if (semi_join_conds->fix_fields(thd, (Item**)&semi_join_conds))
+    DBUG_RETURN(TRUE);
+
+  DBUG_RETURN(FALSE);
+}
+
+
+/**
+  Create a new uniquesubquery engine for the execution of an IN predicate.
+
+  @details
+  Create and initialize a new JOIN_TAB, and Table_ref objects to perform
+  lookups into the indexed temporary table.
+
+  @retval A new subselect_uniquesubquery_engine object
+  @retval NULL if a memory allocation error occurs
+*/
+
+subselect_uniquesubquery_engine*
+subselect_hash_sj_engine::make_unique_engine()
+{
+  Item_in_subselect *in_predicate= (Item_in_subselect *) item;
+  /* Iterates over the columns of the left operand of IN. */
+  Item_iterator_row left_cols(in_predicate->left_expr);
+  /* The temp table has a single index; this is it. */
+  KEY *lookup_key= tmp_table->key_info;
+  JOIN_TAB *lookup_tab;
+  subselect_uniquesubquery_engine *unique_engine;
+
+  DBUG_ENTER("subselect_hash_sj_engine::make_unique_engine");
+
+  /*
+    The JOIN_TAB describes an index lookup into the materialized result.
+    It belongs to no JOIN, and only the members set below are initialized
+    here, so the object is deliberately incomplete.
+  */
+  lookup_tab= (JOIN_TAB*) thd->alloc(sizeof(JOIN_TAB));
+  if (!lookup_tab)
+    DBUG_RETURN(NULL);
+
+  lookup_tab->table= tmp_table;
+  lookup_tab->ref.tmp_table_index_lookup_init(thd, lookup_key, left_cols,
+                                              FALSE);
+
+  unique_engine= new subselect_uniquesubquery_engine(thd, lookup_tab, item,
+                                                     semi_join_conds);
+  DBUG_RETURN(unique_engine);
+}
+
+
+/**
+  Initialize members of the engine that need to be re-initialized at each
+  execution.
+
+  @retval TRUE  if a memory allocation error occurred
+  @retval FALSE if success
+*/
+
+bool subselect_hash_sj_engine::init_runtime()
+{
+  /*
+    Create and optimize the JOIN that will be used to materialize
+    the subquery if not yet created. A failure here must be reported:
+    otherwise the join taken from the engine below is used unchecked.
+  */
+  if (materialize_engine->prepare())
+    return TRUE;
+  /*
+    Repeat name resolution for 'cond' since cond is not part of any
+    clause of the query, and it is not 'fixed' during JOIN::prepare.
+  */
+  if (semi_join_conds && !semi_join_conds->fixed &&
+      semi_join_conds->fix_fields(thd, (Item**)&semi_join_conds))
+    return TRUE;
+  /* Let our engine reuse this query plan for materialization. */
+  materialize_join= materialize_engine->join;
+  /* Redirect the output of the join into our materialization sink. */
+  materialize_join->change_result(result);
+  return FALSE;
+}
+
+
+subselect_hash_sj_engine::~subselect_hash_sj_engine()
+{
+  /* Release the lookup engine and the result sink owned by this engine. */
+  delete lookup_engine;
+  delete result;
+  /* tmp_table is NULL when init_permanent() already dropped it. */
+  if (tmp_table)
+    free_tmp_table(thd, tmp_table);
+}
+
+
+/**
+  Cleanup performed after each PS execution.
+
+  @details
+  Called in the end of JOIN::prepare for PS from Item_subselect::cleanup.
+*/
+
+void subselect_hash_sj_engine::cleanup()
+{
+  /* Remember the type now; lookup_engine may be replaced further down. */
+  enum_engine_type lookup_engine_type= lookup_engine->engine_type();
+  /* Reset all per-execution state so the next execution starts afresh. */
+  is_materialized= FALSE;
+  bitmap_clear_all(&non_null_key_parts);
+  bitmap_clear_all(&partial_match_key_parts);
+  count_partial_match_columns= 0;
+  count_null_only_columns= 0;
+  strategy= UNDEFINED;
+  materialize_engine->cleanup();
+  if (lookup_engine_type == TABLE_SCAN_ENGINE ||
+      lookup_engine_type == ROWID_MERGE_ENGINE)
+  {
+    subselect_engine *inner_lookup_engine;
+    /* Save the wrapped engine before the wrapper is destroyed. */
+    inner_lookup_engine=
+      ((subselect_partial_match_engine*) lookup_engine)->lookup_engine;
+    /*
+      Partial match engines are recreated for each PS execution inside
+      subselect_hash_sj_engine::exec().
+    */
+    delete lookup_engine;
+    lookup_engine= inner_lookup_engine;
+  }
+  /* At this point only the permanent unique-lookup engine must remain. */
+  DBUG_ASSERT(lookup_engine->engine_type() == UNIQUESUBQUERY_ENGINE);
+  lookup_engine->cleanup();
+  result->cleanup(); /* Resets the temp table as well. */
+}
+
+
+/**
+  Execute a subquery IN predicate via materialization.
+
+  @details
+  If needed materialize the subquery into a temporary table, then
+  compute the predicate via a lookup into this table.
+
+  @retval TRUE  if error
+  @retval FALSE otherwise
+*/
+
+int subselect_hash_sj_engine::exec()
+{
+  Item_in_subselect *item_in= (Item_in_subselect *) item;
+  /* The select that is current on entry; restored on every exit path. */
+  SELECT_LEX *save_select= thd->lex->current_select;
+  subselect_partial_match_engine *pm_engine= NULL;
+  int res= 0;
+
+  DBUG_ENTER("subselect_hash_sj_engine::exec");
+
+  /*
+    Optimize and materialize the subquery during the first execution of
+    the subquery predicate.
+  */
+  thd->lex->current_select= materialize_engine->select_lex;
+  if ((res= materialize_join->optimize()))
+    goto err; /* purecov: inspected */
+  DBUG_ASSERT(!is_materialized); /* We should materialize only once. */
+  materialize_join->exec();
+  if ((res= test(materialize_join->error || thd->is_fatal_error)))
+    goto err;
+
+  /*
+    TODO:
+    - Unlock all subquery tables as we don't need them. To implement this
+      we need to add new functionality to JOIN::join_free that can unlock
+      all tables in a subquery (and all its subqueries).
+    - The temp table used for grouping in the subquery can be freed
+      immediately after materialization (yet it's done together with
+      unlocking).
+  */
+  is_materialized= TRUE;
+  /*
+    If the subquery returned no rows, the temporary table is empty, so we know
+    directly that the result of IN is FALSE. We first update the table
+    statistics, then we test if the temporary table for the query result is
+    empty.
+  */
+  tmp_table->file->info(HA_STATUS_VARIABLE);
+  if (!tmp_table->file->stats.records)
+  {
+    item_in->value= FALSE;
+    /* The value of IN will not change during this execution. */
+    item_in->is_constant= TRUE;
+    item_in->set_first_execution();
+    /* TIMOUR: check if we need this: item_in->null_value= FALSE; */
+    /*
+      Restore the current select here as well: this early exit must not
+      leave thd->lex->current_select pointing at the subquery select.
+    */
+    thd->lex->current_select= save_select;
+    DBUG_RETURN(FALSE);
+  }
+
+  /*
+    TIMOUR: The schema-based analysis for partial matching can be done once for
+    prepared statement and remembered. It is done here to remove the need to
+    save/restore all related variables between each re-execution, thus making
+    the code simpler.
+  */
+  strategy= get_strategy_using_schema();
+  /* This call may discover that we don't need partial matching at all. */
+  strategy= get_strategy_using_data();
+  if (strategy == PARTIAL_MATCH)
+  {
+    uint count_pm_keys; /* Total number of keys needed for partial matching. */
+    MY_BITMAP *nn_key_parts; /* The key parts of the only non-NULL index. */
+    uint covering_null_row_width;
+    select_materialize_with_stats *result_sink=
+      (select_materialize_with_stats *) result;
+
+    /* A non-NULL key exists only if some column is not partially matched. */
+    nn_key_parts= (count_partial_match_columns < tmp_table->s->fields) ?
+                  &non_null_key_parts : NULL;
+
+    /*
+      A row covers all NULL-able columns when its NULL count equals the
+      number of columns outside the non-NULL key.
+    */
+    if (result_sink->get_max_nulls_in_row() ==
+        tmp_table->s->fields -
+        (nn_key_parts ? bitmap_bits_set(nn_key_parts) : 0))
+      covering_null_row_width= result_sink->get_max_nulls_in_row();
+    else
+      covering_null_row_width= 0;
+
+    if (covering_null_row_width)
+      count_pm_keys= nn_key_parts ? 1 : 0;
+    else
+      count_pm_keys= count_partial_match_columns - count_null_only_columns +
+        (nn_key_parts ? 1 : 0);
+
+    choose_partial_match_strategy(test(nn_key_parts),
+                                  test(covering_null_row_width),
+                                  &partial_match_key_parts);
+    DBUG_ASSERT(strategy == PARTIAL_MATCH_MERGE ||
+                strategy == PARTIAL_MATCH_SCAN);
+    if (strategy == PARTIAL_MATCH_MERGE)
+    {
+      pm_engine=
+        new subselect_rowid_merge_engine(thd, (subselect_uniquesubquery_engine*)
+                                         lookup_engine, tmp_table,
+                                         count_pm_keys,
+                                         covering_null_row_width,
+                                         item, result,
+                                         semi_join_conds->argument_list());
+      if (!pm_engine ||
+          ((subselect_rowid_merge_engine*) pm_engine)->
+            init(nn_key_parts, &partial_match_key_parts))
+      {
+        /*
+          The call to init() would fail if there was not enough memory to allocate
+          all buffers for the rowid merge strategy. In this case revert to table
+          scanning which doesn't need any big buffers.
+        */
+        delete pm_engine;
+        pm_engine= NULL;
+        strategy= PARTIAL_MATCH_SCAN;
+      }
+    }
+
+    if (strategy == PARTIAL_MATCH_SCAN)
+    {
+      if (!(pm_engine=
+            new subselect_table_scan_engine(thd, (subselect_uniquesubquery_engine*)
+                                            lookup_engine, tmp_table,
+                                            item, result,
+                                            semi_join_conds->argument_list(),
+                                            covering_null_row_width)))
+      {
+        /* This is an irrecoverable error. */
+        res= 1;
+        goto err;
+      }
+    }
+  }
+
+  /* From now on the IN predicate is computed by the chosen lookup engine. */
+  if (pm_engine)
+    lookup_engine= pm_engine;
+  item_in->change_engine(lookup_engine);
+
+err:
+  thd->lex->current_select= save_select;
+  DBUG_RETURN(res);
+}
+
+
+/**
+  Print the state of this engine into a string for debugging and views.
+*/
+
+void subselect_hash_sj_engine::print(String *str, enum_query_type query_type)
+{
+  /* First the subquery that gets materialized. */
+  str->append(STRING_WITH_LEN(" <materialize> ("));
+  materialize_engine->print(str, query_type);
+  str->append(STRING_WITH_LEN(" ), "));
+
+  /* Then the lookup engine, when one has been set. */
+  if (!lookup_engine)
+  {
+    str->append(STRING_WITH_LEN("<engine selected at execution time>"));
+    return;
+  }
+  lookup_engine->print(str, query_type);
+}
+
+void subselect_hash_sj_engine::fix_length_and_dec(Item_cache** row)
+{
+  /* Not expected to be called for this engine: asserts in debug builds. */
+  DBUG_ASSERT(FALSE);
+}
+
+void subselect_hash_sj_engine::exclude()
+{
+  /* Not expected to be called for this engine: asserts in debug builds. */
+  DBUG_ASSERT(FALSE);
+}
+
+bool subselect_hash_sj_engine::no_tables()
+{
+  /* Not expected to be called for this engine: asserts in debug builds. */
+  DBUG_ASSERT(FALSE);
+  /* Value returned when assertions are compiled out. */
+  return FALSE;
+}
+
+bool subselect_hash_sj_engine::change_result(Item_subselect *si,
+                                             select_result_interceptor *res)
+{
+  /* Not expected to be called for this engine: asserts in debug builds. */
+  DBUG_ASSERT(FALSE);
+  /* Value returned when assertions are compiled out. */
+  return TRUE;
+}
+
+
+Ordered_key::Ordered_key(uint keyid_arg, TABLE *tbl_arg, Item *search_key_arg,
+                         ha_rows null_count_arg, ha_rows min_null_row_arg,
+                         ha_rows max_null_row_arg, uchar *row_num_to_rowid_arg)
+  : keyid(keyid_arg), tbl(tbl_arg), search_key(search_key_arg),
+    row_num_to_rowid(row_num_to_rowid_arg), null_count(null_count_arg)
+{
+  DBUG_ASSERT(tbl->file->stats.records > null_count);
+  /* Only rows without a NULL in the indexed column(s) get a key entry. */
+  key_buff_elements= tbl->file->stats.records - null_count;
+  cur_key_idx= HA_POS_ERROR;
+
+  /*
+    The buffers are allocated later by alloc_keys_buffers(). Start from a
+    known empty state so that the destructor, which frees both
+    unconditionally, is safe on a key whose init() failed before or during
+    allocation.
+  */
+  key_buff= NULL;
+  null_key.bitmap= NULL;
+
+  DBUG_ASSERT((null_count && min_null_row_arg && max_null_row_arg) ||
+              (!null_count && !min_null_row_arg && !max_null_row_arg));
+  if (null_count)
+  {
+    /* The counters are 1-based, for key access we need 0-based indexes. */
+    min_null_row= min_null_row_arg - 1;
+    max_null_row= max_null_row_arg - 1;
+  }
+  else
+    min_null_row= max_null_row= 0;
+}
+
+
+Ordered_key::~Ordered_key()
+{
+  /* Release the two buffers allocated in alloc_keys_buffers(). */
+  my_free((char*) key_buff, MYF(0));
+  bitmap_free(&null_key);
+}
+
+
+/*
+  Cleanup that needs to be done for each PS (re)execution.
+*/
+
+void Ordered_key::cleanup()
+{
+  /*
+    Intentionally empty.
+    Currently these keys are recreated for each PS re-execution, thus
+    there is nothing to cleanup, the whole object goes away after execution
+    is over. All handler related initialization/deinitialization is done by
+    the parent subselect_rowid_merge_engine object.
+  */
+}
+
+
+/*
+  Initialize a multi-column index.
+*/
+
+bool Ordered_key::init(MY_BITMAP *columns_to_index)
+{
+  THD *thd= tbl->in_use;
+  uint cur_key_col= 0;
+  Item_field *cur_tmp_field;
+  Item_func_lt *fn_less_than;
+
+  key_column_count= bitmap_bits_set(columns_to_index);
+
+  /*
+    Any allocation or name resolution failure below is reported as TRUE, so
+    that the caller can revert to another strategy instead of dereferencing
+    a NULL pointer.
+  */
+  key_columns= (Item_field**) thd->alloc(key_column_count *
+                                         sizeof(Item_field*));
+  compare_pred= (Item_func_lt**) thd->alloc(key_column_count *
+                                            sizeof(Item_func_lt*));
+  if (!key_columns || !compare_pred)
+    return TRUE;
+
+  /* Walk all columns, creating one key part per bit set in the bitmap. */
+  for (uint i= 0; i < columns_to_index->n_bits; i++)
+  {
+    if (!bitmap_is_set(columns_to_index, i))
+      continue;
+    if (!(cur_tmp_field= new Item_field(tbl->field[i])))
+      return TRUE;
+    /* Create the predicate (tmp_column[i] < outer_ref[i]). */
+    if (!(fn_less_than= new Item_func_lt(cur_tmp_field,
+                                         search_key->element_index(i))) ||
+        fn_less_than->fix_fields(thd, (Item**) &fn_less_than))
+      return TRUE;
+    key_columns[cur_key_col]= cur_tmp_field;
+    compare_pred[cur_key_col]= fn_less_than;
+    ++cur_key_col;
+  }
+
+  if (alloc_keys_buffers())
+  {
+    /* TIMOUR revert to partial match via table scan. */
+    return TRUE;
+  }
+  return FALSE;
+}
+
+
+/*
+  Initialize a single-column index.
+*/
+
+bool Ordered_key::init(int col_idx)
+{
+  THD *thd= tbl->in_use;
+
+  key_column_count= 1;
+
+  /*
+    Any allocation or name resolution failure below is reported as TRUE, so
+    that the caller can revert to another strategy instead of dereferencing
+    a NULL pointer.
+  */
+  key_columns= (Item_field**) thd->alloc(sizeof(Item_field*));
+  compare_pred= (Item_func_lt**) thd->alloc(sizeof(Item_func_lt*));
+  if (!key_columns || !compare_pred)
+    return TRUE;
+
+  if (!(key_columns[0]= new Item_field(tbl->field[col_idx])))
+    return TRUE;
+  /* Create the predicate (tmp_column[i] < outer_ref[i]). */
+  if (!(compare_pred[0]= new Item_func_lt(key_columns[0],
+                                          search_key->element_index(col_idx))) ||
+      compare_pred[0]->fix_fields(thd, (Item**)&compare_pred[0]))
+    return TRUE;
+
+  if (alloc_keys_buffers())
+  {
+    /* TIMOUR revert to partial match via table scan. */
+    return TRUE;
+  }
+  return FALSE;
+}
+
+
+/*
+  Allocate the buffers for both the row number, and the NULL-bitmap indexes.
+*/
+
+bool Ordered_key::alloc_keys_buffers()
+{
+  size_t key_buff_bytes;
+  uint null_bitmap_bits;
+
+  DBUG_ASSERT(key_buff_elements > 0);
+
+  /* One row number per non-NULL key value. */
+  key_buff_bytes= (size_t)(key_buff_elements * sizeof(rownum_t));
+  key_buff= (rownum_t*) my_malloc(key_buff_bytes, MYF(MY_WME));
+  if (!key_buff)
+    return TRUE;
+
+  /*
+    max_null_row is the largest array index, a bit count is needed, so +1.
+    TIMOUR: a bitmap of (max_null_row - min_null_row) bits would be enough
+    if min_null_row were used as a lookup offset.
+  */
+  null_bitmap_bits= (uint)(max_null_row + 1);
+  if (bitmap_init(&null_key, NULL, null_bitmap_bits, FALSE))
+    return TRUE;
+
+  /* No current position until a lookup is done. */
+  cur_key_idx= HA_POS_ERROR;
+
+  return FALSE;
+}
+
+
+/*
+  Quick sort comparison function that compares two rows of the same table
+  identified by their row numbers.
+
+  @retval -1
+  @retval  0
+  @retval +1
+*/
+
+int
+Ordered_key::cmp_keys_by_row_data(ha_rows a, ha_rows b)
+{
+  /* Byte length of one rowid (position) of the table. */
+  uint pos_length= tbl->file->ref_length;
+  uchar *pos_a, *pos_b;
+  int error, field_cmp;
+
+  /* A row always equals itself, no need to fetch anything. */
+  if (a == b)
+    return 0;
+
+  /* Translate the row numbers into rowids. */
+  pos_a= row_num_to_rowid + a * pos_length;
+  pos_b= row_num_to_rowid + b * pos_length;
+
+  /* Read row 'a' into record[0] and row 'b' into record[1]. */
+  error= tbl->file->ha_rnd_pos(tbl->record[0], pos_a);
+  DBUG_ASSERT(!error);
+  error= tbl->file->ha_rnd_pos(tbl->record[1], pos_b);
+  DBUG_ASSERT(!error);
+
+  /* The first indexed column that differs decides the order. */
+  for (uint col= 0; col < key_column_count; col++)
+  {
+    field_cmp= key_columns[col]->field->cmp_offset(tbl->s->rec_buff_length);
+    if (field_cmp)
+      return (field_cmp > 0 ? 1 : -1);
+  }
+  return 0;
+}
+
+
+int
+Ordered_key::cmp_keys_by_row_data_and_rownum(Ordered_key *key,
+                                             rownum_t* a, rownum_t* b)
+{
+  /* Primary order: the values of the indexed columns. */
+  const int by_row_data= key->cmp_keys_by_row_data(*a, *b);
+  if (by_row_data != 0)
+    return by_row_data;
+
+  /* Equal column values: fall back to the row numbers themselves. */
+  if (*a < *b)
+    return -1;
+  if (*a > *b)
+    return 1;
+  return 0;
+}
+
+
+void Ordered_key::sort_keys()
+{
+  /*
+    Sort the row numbers in key_buff by row data, then by row number;
+    'this' is handed to the comparison function as its first argument.
+  */
+  my_qsort2(key_buff, (size_t) key_buff_elements, sizeof(rownum_t),
+            (qsort2_cmp) &cmp_keys_by_row_data_and_rownum, (void*) this);
+  /* Invalidate the current row position. */
+  cur_key_idx= HA_POS_ERROR;
+}
+
+
+/*
+  The fraction of rows that do not contain NULL in the columns indexed by
+  this key.
+
+  @retval  1  if there are no NULLs
+  @retval  0  if only NULLs
+*/
+
+double Ordered_key::null_selectivity()
+{
+  /* Total number of rows of the table; empty tables are never processed. */
+  const double total_rows= (double) tbl->file->stats.records;
+  DBUG_ASSERT(tbl->file->stats.records);
+
+  /* Fraction of NULL rows, subtracted from one. */
+  const double null_fraction= (double) null_count / total_rows;
+  return (1 - null_fraction);
+}
+
+
+/*
+  Compare the value(s) of the current key in 'search_key' with the
+  data of the current table record.
+
+  @notes The comparison result follows from the way compare_pred
+  is created in Ordered_key::init. Currently compare_pred compares
+  a field in of the current row with the corresponding Item that
+  contains the search key.
+
+  @param row_num  Number of the row (not index in the key_buff array)
+
+  @retval -1  if (current row  < search_key)
+  @retval  0  if (current row == search_key)
+  @retval +1  if (current row  > search_key)
+*/
+
+int Ordered_key::cmp_key_with_search_key(rownum_t row_num)
+{
+  /* Byte length of one rowid (position) of the table. */
+  uint pos_length= tbl->file->ref_length;
+  /* Rowid of the row to compare with the search key. */
+  uchar *row_pos= row_num_to_rowid + row_num * pos_length;
+  int error, pred_cmp;
+
+  /* The comparison predicates read their field values from record[0]. */
+  error= tbl->file->ha_rnd_pos(tbl->record[0], row_pos);
+  DBUG_ASSERT(!error);
+
+  /* The first key column that differs from the search key decides. */
+  for (uint col= 0; col < key_column_count; col++)
+  {
+    pred_cmp= compare_pred[col]->get_comparator()->compare();
+    /* Unlike Arg_comparator::compare_row() here there should be no NULLs. */
+    DBUG_ASSERT(!compare_pred[col]->null_value);
+    if (pred_cmp)
+      return (pred_cmp > 0 ? 1 : -1);
+  }
+  return 0;
+}
+
+
+/*
+  Find a key in a sorted array of keys via binary search.
+
+  see create_subq_in_equalities()
+*/
+
+bool Ordered_key::lookup()
+{
+  /*
+    Binary search in key_buff for the leftmost entry whose row equals the
+    search key. On success cur_key_idx is set to that entry and TRUE is
+    returned; otherwise cur_key_idx is invalidated and FALSE is returned.
+  */
+  DBUG_ASSERT(key_buff_elements);
+
+  ha_rows lo= 0;
+  ha_rows hi= key_buff_elements - 1;
+  ha_rows mid;
+  int cmp_res;
+
+  while (lo <= hi)
+  {
+    /* Midpoint computed this way cannot overflow. */
+    mid= lo + (hi - lo) / 2;
+    cmp_res= cmp_key_with_search_key(key_buff[mid]);
+    /*
+      In order to find the minimum match, check if the previous element is
+      equal or smaller than the found one. If equal, we need to search further
+      to the left.
+    */
+    if (!cmp_res && mid > 0)
+      cmp_res= !cmp_key_with_search_key(key_buff[mid - 1]) ? 1 : 0;
+
+    if (cmp_res == -1)
+    {
+      /* row[mid] < search_key */
+      lo= mid + 1;
+    }
+    else if (cmp_res == 1)
+    {
+      /* row[mid] > search_key */
+      /* The indexes are unsigned: stop here rather than compute 0 - 1. */
+      if (!mid)
+        goto not_found;
+      hi= mid - 1;
+    }
+    else
+    {
+      /* row[mid] == search_key */
+      cur_key_idx= mid;
+      return TRUE;
+    }
+  }
+not_found:
+  cur_key_idx= HA_POS_ERROR;
+  return FALSE;
+}
+
+
+/*
+  Move the current index pointer to the next key with the same column
+  values as the current key. Since the index is sorted, all such keys
+  are contiguous.
+*/
+
+bool Ordered_key::next_same()
+{
+  DBUG_ASSERT(key_buff_elements);
+
+  /* No next element to move to. */
+  if (cur_key_idx >= key_buff_elements - 1)
+    return FALSE;
+
+  /*
+    TIMOUR:
+    This is inefficient: every row except the last one is fetched twice.
+    Swapping record[0] and record[1] and reading only the new record would
+    avoid that.
+  */
+  if (cmp_keys_by_row_data(key_buff[cur_key_idx], key_buff[cur_key_idx + 1]))
+    return FALSE;                       /* The next key has other values. */
+
+  ++cur_key_idx;
+  return TRUE;
+}
+
+
+void Ordered_key::print(String *str)
+{
+  uint i;
+  /*
+    String::qs_append() writes the number in place and does not grow the
+    buffer, so room for it must be reserved before each call. 64 bytes is
+    generous for the integers and row counts printed here.
+    NOTE(review): output is truncated if reserving fails - confirm this is
+    acceptable for a debug print.
+  */
+  const uint32 number_space= 64;
+
+  str->append("{idx=");
+  if (str->reserve(number_space))
+    return;
+  str->qs_append(keyid);
+  str->append(", (");
+  /* All column names but the last are followed by a separator. */
+  for (i= 0; i < key_column_count - 1; i++)
+  {
+    str->append(key_columns[i]->field->field_name);
+    str->append(", ");
+  }
+  str->append(key_columns[i]->field->field_name);
+  str->append("), ");
+
+  str->append("null_bitmap: (bits=");
+  if (str->reserve(number_space))
+    return;
+  str->qs_append(null_key.n_bits);
+  str->append(", nulls= ");
+  if (str->reserve(number_space))
+    return;
+  str->qs_append((double)null_count);
+  str->append(", min_null= ");
+  if (str->reserve(number_space))
+    return;
+  str->qs_append((double)min_null_row);
+  str->append(", max_null= ");
+  if (str->reserve(number_space))
+    return;
+  str->qs_append((double)max_null_row);
+  str->append("), ");
+
+  str->append('}');
+}
+
+
+subselect_partial_match_engine::subselect_partial_match_engine(
+  THD *thd_arg, subselect_uniquesubquery_engine *engine_arg,
+  TABLE *tmp_table_arg, Item_subselect *item_arg,
+  select_result_interceptor *result_arg,
+  List<Item> *equi_join_conds_arg,
+  uint covering_null_row_width_arg)
+  :subselect_engine(thd_arg, item_arg, result_arg),
+   tmp_table(tmp_table_arg), lookup_engine(engine_arg),
+   equi_join_conds(equi_join_conds_arg),
+   covering_null_row_width(covering_null_row_width_arg)
+{
+  /* Only stores its arguments; engine_arg is used by exec() for lookups. */
+}
+
+
+int subselect_partial_match_engine::exec()
+{
+  Item_in_subselect *item_in= (Item_in_subselect *) item;
+
+  /* Step 1: try to find a complete match by index lookup. */
+  const int copy_res= lookup_engine->copy_ref_key_simple();
+  if (copy_res == -1)
+  {
+    /* The outer reference alone already makes the result FALSE. */
+    item_in->value= 0;
+    item_in->null_value= 0;
+    return 0;
+  }
+  if (copy_res == 0)
+  {
+    const int lookup_err= lookup_engine->index_lookup();
+    if (lookup_err)
+    {
+      /* The lookup itself failed, report the error to the caller. */
+      item_in->value= 0;
+      item_in->null_value= 0;
+      return lookup_err;
+    }
+    if (item_in->value)
+    {
+      /*
+        Complete match, IN is TRUE. The value was set by the lookup engine,
+        which shares its item with this engine.
+      */
+      return 0;
+    }
+  }
+
+  /* Step 2: a NULL-only row over all columns makes the result UNKNOWN. */
+  if (covering_null_row_width == tmp_table->s->fields)
+  {
+    /*
+      TIMOUR: which one is the right way to propagate an UNKNOWN result?
+      Should we also set empty_result_set= FALSE; ???
+      (the alternative would be item_in->was_null= 1)
+    */
+    item_in->value= 0;
+    item_in->null_value= 1;
+    return 0;
+  }
+
+  /*
+    Step 3: no complete match. Look for a partial match (UNKNOWN result),
+    or no match (FALSE). Any index access left open is closed first.
+  */
+  if (tmp_table->file->inited)
+    tmp_table->file->ha_index_end();
+
+  /*
+    TIMOUR: which one is the right way to propagate an UNKNOWN result?
+    Should we also set empty_result_set= FALSE; ???
+    (the alternative would be to set item_in->was_null)
+  */
+  if (partial_match())
+  {
+    /* Partial match: IN is UNKNOWN. */
+    item_in->value= 0;
+    item_in->null_value= 1;
+    return 0;
+  }
+
+  /* No match at all: IN is FALSE. */
+  item_in->value= 0;
+  item_in->null_value= 0;
+  return 0;
+}
+
+
+void subselect_partial_match_engine::print(String *str,
+                                           enum_query_type query_type)
+{
+  /*
+    Intentionally prints nothing.
+    Should never be called as the actual engine cannot be known at query
+    optimization time.
+    DBUG_ASSERT(FALSE);
+  */
+}
+
+
+/*
+  Initialize the engine: create the ordered keys and fill them with the rows
+  of the temporary table.
+
+  @param non_null_key_parts       The columns of the only non-NULL key, or
+                                  NULL if there is no such key.
+  @param partial_match_key_parts  A union of all single-column NULL key parts.
+
+  @retval FALSE  the engine was initialized successfully
+  @retval TRUE   there was some (memory allocation) error during initialization,
+                 such errors should be interpreted as revert to other strategy
+*/
+
+bool
+subselect_rowid_merge_engine::init(MY_BITMAP *non_null_key_parts,
+                                   MY_BITMAP *partial_match_key_parts)
+{
+  /* The length in bytes of the rowids (positions) of tmp_table. */
+  uint rowid_length= tmp_table->file->ref_length;
+  ha_rows row_count= tmp_table->file->stats.records;
+  rownum_t cur_rownum= 0;
+  select_materialize_with_stats *result_sink=
+    (select_materialize_with_stats *) result;
+  uint cur_keyid= 0;
+  Item_in_subselect *item_in= (Item_in_subselect*) item;
+  int error;
+
+  if (keys_count == 0)
+  {
+    /* There is nothing to initialize, we will only do regular lookups. */
+    return FALSE;
+  }
+
+  DBUG_ASSERT(!covering_null_row_width || (covering_null_row_width &&
+                                           keys_count == 1 &&
+                                           non_null_key_parts));
+  /*
+    Allocate buffers to hold the merged keys and the mapping between rowids and
+    row numbers. The key array is zero-filled so that, if this function fails
+    half way, the destructor's loop over merge_keys only sees keys that were
+    really created or NULL.
+  */
+  if (!(merge_keys= (Ordered_key**) thd->calloc(keys_count *
+                                                sizeof(Ordered_key*))) ||
+      !(row_num_to_rowid= (uchar*) my_malloc((size_t)(row_count * rowid_length),
+        MYF(MY_WME))))
+    return TRUE;
+
+  /* Create the only non-NULL key if there is any. */
+  if (non_null_key_parts)
+  {
+    non_null_key= new Ordered_key(cur_keyid, tmp_table, item_in->left_expr,
+                                  0, 0, 0, row_num_to_rowid);
+    /* Out of memory is reported like any other initialization failure. */
+    if (!non_null_key || non_null_key->init(non_null_key_parts))
+      return TRUE;
+    merge_keys[cur_keyid]= non_null_key;
+    merge_keys[cur_keyid]->first();
+    ++cur_keyid;
+  }
+
+  /*
+    If there is a covering NULL row, the only key that is needed is the
+    only non-NULL key that is already created above. We create keys on
+    NULL-able columns only if there is no covering NULL row.
+  */
+  if (!covering_null_row_width)
+  {
+    if (bitmap_init_memroot(&matching_keys, keys_count, thd->mem_root) ||
+        bitmap_init_memroot(&matching_outer_cols, keys_count, thd->mem_root) ||
+        bitmap_init_memroot(&null_only_columns, keys_count, thd->mem_root))
+      return TRUE;
+
+    /*
+      Create one single-column NULL-key for each column in
+      partial_match_key_parts.
+    */
+    for (uint i= 0; i < partial_match_key_parts->n_bits; i++)
+    {
+      if (!bitmap_is_set(partial_match_key_parts, i))
+        continue;
+
+      if (result_sink->get_null_count_of_col(i) == row_count)
+      {
+        /* The column holds only NULLs: no key is created for it. */
+        bitmap_set_bit(&null_only_columns, cur_keyid);
+        continue;
+      }
+      else
+      {
+        merge_keys[cur_keyid]= new Ordered_key(
+                                     cur_keyid, tmp_table,
+                                     item_in->left_expr->element_index(i),
+                                     result_sink->get_null_count_of_col(i),
+                                     result_sink->get_min_null_of_col(i),
+                                     result_sink->get_max_null_of_col(i),
+                                     row_num_to_rowid);
+        /* Out of memory is reported like any other initialization failure. */
+        if (!merge_keys[cur_keyid] || merge_keys[cur_keyid]->init(i))
+          return TRUE;
+        merge_keys[cur_keyid]->first();
+      }
+      ++cur_keyid;
+    }
+  }
+  DBUG_ASSERT(cur_keyid == keys_count);
+
+  /* Populate the indexes with data from the temporary table. */
+  if (tmp_table->file->ha_rnd_init_with_error(1))
+    return TRUE;
+  tmp_table->file->extra_opt(HA_EXTRA_CACHE,
+                             current_thd->variables.read_buff_size);
+  tmp_table->null_row= 0;
+  while (TRUE)
+  {
+    error= tmp_table->file->ha_rnd_next(tmp_table->record[0]);
+    if (error == HA_ERR_RECORD_DELETED)
+    {
+      /* We get this for duplicate records that should not be in tmp_table. */
+      continue;
+    }
+    /*
+      This is a temp table that we fully own, there should be no other
+      cause to stop the iteration than EOF.
+    */
+    DBUG_ASSERT(!error || error == HA_ERR_END_OF_FILE);
+    if (error == HA_ERR_END_OF_FILE)
+    {
+      DBUG_ASSERT(cur_rownum == tmp_table->file->stats.records);
+      break;
+    }
+
+    /*
+      Save the position of this record in the row_num -> rowid mapping.
+    */
+    tmp_table->file->position(tmp_table->record[0]);
+    memcpy(row_num_to_rowid + cur_rownum * rowid_length,
+           tmp_table->file->ref, rowid_length);
+
+    /* Add the current row number to the corresponding keys. */
+    if (non_null_key)
+    {
+      /* By definition there are no NULLs in the non-NULL key. */
+      non_null_key->add_key(cur_rownum);
+    }
+
+    /* The non-NULL key, if any, sits in slot 0 and was handled above. */
+    for (uint i= (non_null_key ? 1 : 0); i < keys_count; i++)
+    {
+      /*
+        Check if the first and only indexed column contains NULL in the current
+        row, and add the row number to the corresponding key.
+      */
+      if (tmp_table->field[merge_keys[i]->get_field_idx(0)]->is_null())
+        merge_keys[i]->set_null(cur_rownum);
+      else
+        merge_keys[i]->add_key(cur_rownum);
+    }
+    ++cur_rownum;
+  }
+
+  tmp_table->file->ha_rnd_end();
+
+  /* Sort all the keys by their NULL selectivity. */
+  my_qsort(merge_keys, keys_count, sizeof(Ordered_key*),
+           (qsort_cmp) cmp_keys_by_null_selectivity);
+
+  /* Sort the keys in each of the indexes. */
+  for (uint i= 0; i < keys_count; i++)
+    merge_keys[i]->sort_keys();
+
+  /* Queue of keys, ordered by the row number each key currently points to. */
+  if (init_queue(&pq, keys_count, 0, FALSE,
+                 subselect_rowid_merge_engine::cmp_keys_by_cur_rownum, NULL,
+                 0, 0))
+    return TRUE;
+
+  return FALSE;
+}
+
+
+subselect_rowid_merge_engine::~subselect_rowid_merge_engine()
+{
+  /* Without ordered keys init() allocated nothing, so nothing to release. */
+  if (!keys_count)
+    return;
+
+  my_free((char*) row_num_to_rowid, MYF(0));
+  for (uint key_no= 0; key_no < keys_count; key_no++)
+    delete merge_keys[key_no];
+  delete_queue(&pq);
+  /* End a table scan that is still open on the temp table. */
+  if (tmp_table->file->inited == handler::RND)
+    tmp_table->file->ha_rnd_end();
+}
+
+
+void subselect_rowid_merge_engine::cleanup()
+{ /* Empty: the destructor is what releases this engine's resources. */
+}
+
+
+/*
+  qsort() callback that orders keys by decreasing bitmap selectivity, i.e.
+  the key with the largest null_selectivity() value is sorted first.
+
+  @param  k1 first key to compare
+  @param  k2 second key to compare
+
+  @retval  1  if null_selectivity() of k1 is smaller than that of k2
+  @retval  0  if both keys report the same null_selectivity()
+  @retval -1  if null_selectivity() of k1 is greater than that of k2
+*/
+
+int
+subselect_rowid_merge_engine::cmp_keys_by_null_selectivity(Ordered_key **k1,
+                                                           Ordered_key **k2)
+{
+  const double sel_first= (*k1)->null_selectivity();
+  const double sel_second= (*k2)->null_selectivity();
+
+  /*
+    Inverted ordering: the key with the smaller value sorts after the other.
+  */
+  return (sel_first < sel_second) ? 1 : (sel_first > sel_second) ? -1 : 0;
+}
+
+
+/* Queue comparison callback: orders two keys by their current() value. */
+
+int
+subselect_rowid_merge_engine::cmp_keys_by_cur_rownum(void *arg,
+                                                     uchar *k1, uchar *k2)
+{
+  const rownum_t left_row= ((Ordered_key*) k1)->current();
+  const rownum_t right_row= ((Ordered_key*) k2)->current();
+  if (left_row == right_row)
+    return 0;
+  return (left_row < right_row) ? -1 : 1;
+}
+
+
+/*
+  Test whether the row numbered 'row_num' holds a NULL in every column whose
+  key is not marked in 'matching_keys'.
+
+  Keys that are marked in 'matching_keys' are skipped; the remaining keys are
+  probed with Ordered_key::is_null().
+
+  @param row_num  number of the row to test
+
+  @retval TRUE   each key outside 'matching_keys' reports NULL for the row
+  @retval FALSE  some key outside 'matching_keys' has a non-NULL value there
+*/
+
+bool subselect_rowid_merge_engine::test_null_row(rownum_t row_num)
+{
+  for (uint key_no= 0; key_no < keys_count; key_no++)
+  {
+    Ordered_key *key= merge_keys[key_no];
+    /*
+      A key whose id is set in 'matching_keys' already matched a value in
+      this row, so that column cannot be a NULL match and is not examined.
+    */
+    const bool already_matched= bitmap_is_set(&matching_keys,
+                                              key->get_keyid());
+    if (!already_matched && !key->is_null(row_num))
+      return FALSE;
+  }
+  return TRUE;
+}
+
+
+/*
+  Look for a partial match using the Ordered_key indexes in merge_keys.
+  @retval TRUE  there is a partial match (UNKNOWN)
+  @retval FALSE  there is no match at all (FALSE)
+*/
+
+bool subselect_rowid_merge_engine::partial_match()
+{
+  Ordered_key *min_key; /* Key that contains the current minimum position. */
+  rownum_t min_row_num; /* Current row number of min_key. */
+  Ordered_key *cur_key;
+  rownum_t cur_row_num;
+  uint count_nulls_in_search_key= 0;
+  bool res= FALSE;
+
+  /* If there is a non-NULL key, it must be the first key in the keys array. */
+  DBUG_ASSERT(!non_null_key || (non_null_key && merge_keys[0] == non_null_key));
+  /* The priority queue for keys must be empty. */
+  DBUG_ASSERT(!pq.elements);
+
+  /* All data accesses during execution are via handler::ha_rnd_pos() */
+  if (tmp_table->file->ha_rnd_init_with_error(0))
+  {
+    res= FALSE;
+    goto end;
+  }
+
+  /* Check if there is a match for the columns of the only non-NULL key. */
+  if (non_null_key && !non_null_key->lookup())
+  {
+    res= FALSE;
+    goto end;
+  }
+
+  /*
+    If there is a NULL (sub)row that covers all NULL-able columns, then there
+    is a guaranteed partial match, and we need not search for a matching row.
+  */
+  if (covering_null_row_width)
+  {
+    res= TRUE;
+    goto end;
+  }
+
+  if (non_null_key)
+    queue_insert(&pq, (uchar *) non_null_key);
+  bitmap_clear_all(&matching_outer_cols);
+  /* Skip non_null_key here: it was already looked up and queued above. */
+  for (uint i= test(non_null_key); i < keys_count; i++)
+  {
+    DBUG_ASSERT(merge_keys[i]->get_column_count() == 1);
+    if (merge_keys[i]->get_search_key(0)->null_value)
+    {
+      ++count_nulls_in_search_key;
+      bitmap_set_bit(&matching_outer_cols, merge_keys[i]->get_keyid());
+    }
+    else if (merge_keys[i]->lookup())
+      queue_insert(&pq, (uchar *) merge_keys[i]);
+  }
+
+  /*
+    If the outer reference consists of only NULLs, or if it has NULLs in all
+    nullable columns, the result is UNKNOWN.
+  */
+  if (count_nulls_in_search_key ==
+      ((Item_in_subselect *) item)->left_expr->cols() -
+      (non_null_key ? non_null_key->get_column_count() : 0))
+  {
+    res= TRUE;
+    goto end;
+  }
+
+  /*
+    If there is no NULL (sub)row that covers all NULL columns, and there is no
+    single match for any of the NULL columns, the result is FALSE.
+  */
+  if (pq.elements - test(non_null_key) == 0)
+  {
+    res= FALSE;
+    goto end;
+  }
+
+  DBUG_ASSERT(pq.elements);
+
+  min_key= (Ordered_key*) queue_remove_top(&pq);
+  min_row_num= min_key->current();
+  bitmap_copy(&matching_keys, &null_only_columns);
+  bitmap_set_bit(&matching_keys, min_key->get_keyid());
+  bitmap_union(&matching_keys, &matching_outer_cols);
+  if (min_key->next_same())
+    queue_insert(&pq, (uchar *) min_key);
+
+  if (pq.elements == 0)
+  {
+    /*
+      Check the only matching row of the only key min_key for NULL matches
+      in the other columns.
+    */
+    res= test_null_row(min_row_num);
+    goto end;
+  }
+
+  while (TRUE)
+  {
+    cur_key= (Ordered_key*) queue_remove_top(&pq);
+    cur_row_num= cur_key->current();
+
+    if (cur_row_num == min_row_num)
+      bitmap_set_bit(&matching_keys, cur_key->get_keyid());
+    else
+    {
+      /* Follows from the correct use of priority queue. */
+      DBUG_ASSERT(cur_row_num > min_row_num);
+      if (test_null_row(min_row_num))
+      {
+        res= TRUE;
+        goto end;
+      }
+      else
+      {
+        min_key= cur_key;
+        min_row_num= cur_row_num;
+        bitmap_copy(&matching_keys, &null_only_columns);
+        bitmap_set_bit(&matching_keys, min_key->get_keyid());
+        bitmap_union(&matching_keys, &matching_outer_cols);
+      }
+    }
+
+    if (cur_key->next_same())
+      queue_insert(&pq, (uchar *) cur_key);
+
+    if (pq.elements == 0)
+    {
+      /* Check the last row of the last column in PQ for NULL matches. */
+      res= test_null_row(min_row_num);
+      goto end;
+    }
+  }
+
+  /* We should never get here - all branches must be handled explicitly above. */
+  DBUG_ASSERT(FALSE);
+
+end:
+  queue_remove_all(&pq);
+  /* ha_rnd_init_with_error() may have failed, leaving no scan to close. */
+  if (tmp_table->file->inited == handler::RND)
+    tmp_table->file->ha_rnd_end();
+  return res;
+}
+
+
+subselect_table_scan_engine::subselect_table_scan_engine(
+  THD *thd_arg, subselect_uniquesubquery_engine *engine_arg,
+  TABLE *tmp_table_arg,
+  Item_subselect *item_arg,
+  select_result_interceptor *result_arg,
+  List<Item> *equi_join_conds_arg,
+  uint covering_null_row_width_arg)
+  :subselect_partial_match_engine(thd_arg, engine_arg, tmp_table_arg, item_arg,
+                                  result_arg, equi_join_conds_arg,
+                                  covering_null_row_width_arg)
+{} /* Every argument is forwarded unchanged to the base constructor. */
+
+
+/*
+  Look for a partial match by scanning the rows of tmp_table one by one.
+
+  TIMOUR:
+  This method is based on subselect_uniquesubquery_engine::scan_table().
+  Consider refactoring somehow, 80% of the code is the same.
+
+  for each row_i in tmp_table
+  {
+    count_matches= 0;
+    for each row element row_i[j]
+    {
+      if (outer_ref[j] is NULL || row_i[j] is NULL || outer_ref[j] == row_i[j])
+        ++count_matches;
+    }
+    if (count_matches == outer_ref.elements)
+      return TRUE
+  }
+  return FALSE
+
+  @retval TRUE   a matching row was found (see the pseudo-code above)
+  @retval FALSE  no matching row was found, the scan could not be started,
+                 or reading a row failed
+*/
+
+bool subselect_table_scan_engine::partial_match()
+{
+  List_iterator_fast<Item> equality_it(*equi_join_conds);
+  Item *cur_eq;
+  uint count_matches;
+  int error;
+  bool res= FALSE;
+
+  if (tmp_table->file->ha_rnd_init_with_error(1))
+  {
+    /* Without a usable scan no row can be examined: the result is FALSE. */
+    goto end;
+  }
+
+  tmp_table->file->extra_opt(HA_EXTRA_CACHE,
+                             current_thd->variables.read_buff_size);
+  /*
+  TIMOUR:
+  scan_table() also calls "table->null_row= 0;", why, do we need it?
+  */
+  for (;;)
+  {
+    error= tmp_table->file->ha_rnd_next(tmp_table->record[0]);
+    if (error == HA_ERR_RECORD_DELETED)
+      continue;
+    if (error)
+    {
+      /* Reaching the end of the table stops the scan without a report. */
+      if (error != HA_ERR_END_OF_FILE)
+        (void) report_error(tmp_table, error);
+      break;
+    }
+
+    equality_it.rewind();
+    count_matches= 0;
+    while ((cur_eq= equality_it++))
+    {
+      DBUG_ASSERT(cur_eq->type() == Item::FUNC_ITEM &&
+                  ((Item_func*)cur_eq)->functype() == Item_func::EQ_FUNC);
+      /* An equality that is FALSE and not NULL disqualifies the row. */
+      if (!cur_eq->val_int() && !cur_eq->null_value)
+        break;
+      ++count_matches;
+    }
+    if (count_matches == tmp_table->s->fields)
+    {
+      res= TRUE; /* Found a matching row. */
+      break;
+    }
+  }
+
+end:
+  /* Close the scan only if ha_rnd_init_with_error() actually opened it. */
+  if (tmp_table->file->inited == handler::RND)
+    tmp_table->file->ha_rnd_end();
+  return res;
+}
+
+
+void subselect_table_scan_engine::cleanup()
+{ /* Empty: this engine does nothing on cleanup(). */
+}
diff --git a/sql/item_subselect.h b/sql/item_subselect.h
index 16b901e1333..3b43d75f43f 100644
--- a/sql/item_subselect.h
+++ b/sql/item_subselect.h
@@ -22,9 +22,11 @@
 class st_select_lex;
 class st_select_lex_unit;
 class JOIN;
-class select_subselect;
+class select_result_interceptor;
 class subselect_engine;
+class subselect_hash_sj_engine;
 class Item_bool_func2;
+class Cached_item;
 
 /* base class for subselects */
 
@@ -34,10 +36,24 @@ class Item_subselect :public Item_result_field
 protected:
   /* thread handler, will be assigned in fix_fields only */
   THD *thd;
-  /* substitution instead of subselect in case of optimization */
+  /* 
+    Used inside Item_subselect::fix_fields() according to this scenario:
+      > Item_subselect::fix_fields
+        > engine->prepare
+          > child_join->prepare
+            (Here we realize we need to do the rewrite and set
+             substitution= some new Item, eg. Item_in_optimizer )
+          < child_join->prepare
+        < engine->prepare
+        *ref= substitution;
+      < Item_subselect::fix_fields
+  */
   Item *substitution;
+public:
   /* unit of subquery */
   st_select_lex_unit *unit;
+protected:
+  Item *expr_cache;
   /* engine that perform execution of subselect (single select or union) */
   subselect_engine *engine;
   /* old engine if engine was changed */
@@ -53,13 +69,39 @@ protected:
   /* cache of constant state */
   bool const_item_cache;
   
+  bool inside_first_fix_fields;
+  bool done_first_fix_fields;
 public:
-  /* 
-    References from inside the subquery to the select that this predicate is
-    in.  References to parent selects not included.
+  /* A reference from inside subquery predicate to somewhere outside of it */
+  class Ref_to_outside : public Sql_alloc
+  {
+  public:
+    st_select_lex *select; /* Select where the reference is pointing to */
+    /* 
+      What is being referred. This may be NULL when we're referring to an
+      aggregate function.
+    */ 
+    Item *item; 
+  };
+  /*
+    References from within this subquery to somewhere outside of it (i.e. to
+    parent select, grandparent select, etc)
+  */
+  List<Ref_to_outside> upper_refs;
+  st_select_lex *parent_select;
+
+  /**
+     List of references on items subquery depends on (externally resolved);
+
+     @note We can't store direct links on Items because it could be
+           substituted with other item (for example for grouping).
+   */
+  List<Item*> depends_on;
+
+  /*
+   TRUE<=>Table Elimination has made it redundant to evaluate this select
+          (and so it is not part of QEP, etc)
   */
-  List<Item> refers_to;
-  int in_fix_fields;
   bool eliminated;
   
   /* changed engine indicator */
@@ -79,12 +121,12 @@ public:
   virtual subs_type substype() { return UNKNOWN_SUBS; }
 
   /*
-     We need this method, because some compilers do not allow 'this'
-     pointer in constructor initialization list, but we need pass pointer
-     to subselect Item class to select_subselect classes constructor.
+    We need this method, because some compilers do not allow 'this'
+    pointer in constructor initialization list, but we need to pass a pointer
+    to subselect Item class to select_result_interceptor's constructor.
   */
   virtual void init (st_select_lex *select_lex,
-		     select_subselect *result);
+		     select_result_interceptor *result);
 
   ~Item_subselect();
   void cleanup();
@@ -103,6 +145,9 @@ public:
     return null_value;
   }
   bool fix_fields(THD *thd, Item **ref);
+  bool mark_as_dependent(THD *thd, st_select_lex *select, Item *item);
+  void fix_after_pullout(st_select_lex *new_parent, Item **ref);
+  void recalc_used_tables(st_select_lex *new_parent, bool after_pullout);
   virtual bool exec();
   virtual void fix_length_and_dec();
   table_map used_tables() const;
@@ -148,8 +193,9 @@ public:
   */
   st_select_lex* get_select_lex();
   const char *func_name() const { DBUG_ASSERT(0); return "subselect"; }
+  virtual bool expr_cache_is_needed(THD *);
 
-  friend class select_subselect;
+  friend class select_result_interceptor;
   friend class Item_in_optimizer;
   friend bool Item_field::fix_fields(THD *, Item **);
   friend int  Item_field::fix_outer_field(THD *, Field **, Item **);
@@ -157,6 +203,8 @@ public:
   friend void mark_select_range_as_dependent(THD*,
                                              st_select_lex*, st_select_lex*,
                                              Field*, Item*, Item_ident*);
+  friend bool convert_join_subqueries_to_semijoins(JOIN *join);
+
 };
 
 /* single value subselect */
@@ -168,7 +216,8 @@ protected:
   Item_cache *value, **row;
 public:
   Item_singlerow_subselect(st_select_lex *select_lex);
-  Item_singlerow_subselect() :Item_subselect(), value(0), row (0) {}
+  Item_singlerow_subselect() :Item_subselect(), value(0), row (0)
+  {}
 
   void cleanup();
   subs_type substype() { return SINGLEROW_SUBS; }
@@ -206,6 +255,8 @@ public:
   */
   st_select_lex* invalidate_and_restore_select_lex();
 
+  Item* expr_cache_insert_transformer(uchar *thd_arg);
+
   friend class select_singlerow_subselect;
 };
 
@@ -253,20 +304,23 @@ public:
   void fix_length_and_dec();
   virtual void print(String *str, enum_query_type query_type);
 
+  Item* expr_cache_insert_transformer(uchar *thd_arg);
+
   friend class select_exists_subselect;
   friend class subselect_uniquesubquery_engine;
   friend class subselect_indexsubquery_engine;
 };
 
 
-/*
-  IN subselect: this represents "left_exr IN (SELECT ...)"
+/**
+  Representation of IN subquery predicates of the form
+  "left_expr IN (SELECT ...)".
 
+  @details
   This class has: 
-   - (as a descendant of Item_subselect) a "subquery execution engine" which 
-      allows it to evaluate subqueries. (and this class participates in
-      execution by having was_null variable where part of execution result
-      is stored.
+   - A "subquery execution engine" (as a subclass of Item_subselect) that allows
+     it to evaluate subqueries (this class also participates in execution by
+     having a was_null variable where part of the execution result is stored).
    - Transformation methods (todo: more on this).
 
   This class is not used directly, it is "wrapped" into Item_in_optimizer
@@ -275,8 +329,22 @@ public:
 
 class Item_in_subselect :public Item_exists_subselect
 {
-protected:
+public:
   Item *left_expr;
+protected:
+  /*
+    Cache of the left operand of the subquery predicate. Allocated in the
+    runtime memory root, for each execution, thus need not be freed.
+  */
+  List<Cached_item> *left_expr_cache;
+  bool first_execution;
+  /*
+    Set to TRUE if at query execution time we determine that this item's
+    value is a constant during this execution. We need this member because
+    it is not possible to substitute 'this' with a constant item.
+  */
+  bool is_constant;
+
   /*
     expr & optimizer used in subselect rewriting to store Item for
     all JOIN in UNION
@@ -285,10 +353,48 @@ protected:
   Item_in_optimizer *optimizer;
   bool was_null;
   bool abort_on_null;
-  bool transformed;
 public:
   /* Used to trigger on/off conditions that were pushed down to subselect */
   bool *pushed_cond_guards;
+  
+  /* Priority of this predicate in the convert-to-semi-join-nest process. */
+  int sj_convert_priority;
+  /*
+    Used by subquery optimizations to keep track about in which clause this
+    subquery predicate is located: 
+      (TABLE_LIST*) 1   - the predicate is an AND-part of the WHERE
+      join nest pointer - the predicate is an AND-part of ON expression
+                          of a join nest   
+      NULL              - for all other locations
+    See also THD::emb_on_expr_nest.
+  */
+  TABLE_LIST *emb_on_expr_nest;
+  /* 
+    Location of the subquery predicate. It is either
+     - pointer to join nest if the subquery predicate is in the ON expression
+     - (TABLE_LIST*)1 if the predicate is in the WHERE.
+  */
+  TABLE_LIST *expr_join_nest;
+  /*
+    Types of left_expr and subquery's select list allow to perform subquery
+    materialization. Currently, we set this to FALSE when it as well could
+    be TRUE. This is to be properly addressed with fix for BUG#36752.
+  */
+  bool types_allow_materialization;
+
+  /* 
+    Same as above, but they also allow to scan the materialized table. 
+  */
+  bool sjm_scan_allowed;
+
+  /* The method chosen to execute the IN predicate.  */
+  enum enum_exec_method {
+    NOT_TRANSFORMED, /* No execution method was chosen for this IN. */
+    SEMI_JOIN,   /* IN was converted to semi-join nest and should be removed. */
+    IN_TO_EXISTS, /* IN was converted to correlated EXISTS. */
+    MATERIALIZATION /* IN will be executed via subquery materialization. */
+  };
+  enum_exec_method exec_method;
 
   bool *get_cond_guard(int i)
   {
@@ -305,10 +411,11 @@ public:
 
   Item_in_subselect(Item * left_expr, st_select_lex *select_lex);
   Item_in_subselect()
-    :Item_exists_subselect(), optimizer(0), abort_on_null(0), transformed(0),
-     pushed_cond_guards(NULL), upper_item(0)
+    :Item_exists_subselect(), left_expr_cache(0), first_execution(TRUE),
+    is_constant(FALSE), optimizer(0), abort_on_null(0),
+    pushed_cond_guards(NULL), exec_method(NOT_TRANSFORMED), upper_item(0)
   {}
-
+  void cleanup();
   subs_type substype() { return IN_SUBS; }
   void reset() 
   {
@@ -321,6 +428,10 @@ public:
   trans_res select_in_like_transformer(JOIN *join, Comp_creator *func);
   trans_res single_value_transformer(JOIN *join, Comp_creator *func);
   trans_res row_value_transformer(JOIN * join);
+  trans_res single_value_in_to_exists_transformer(JOIN * join,
+                                                  Comp_creator *func);
+  trans_res row_value_in_to_exists_transformer(JOIN * join);
+  virtual bool exec();
   longlong val_int();
   double val_real();
   String *val_str(String*);
@@ -332,10 +443,26 @@ public:
   bool test_limit(st_select_lex_unit *unit);
   virtual void print(String *str, enum_query_type query_type);
   bool fix_fields(THD *thd, Item **ref);
-
+  void fix_after_pullout(st_select_lex *new_parent, Item **ref);
+  void update_used_tables();
+  bool setup_engine();
+  bool init_left_expr_cache();
+  /* Inform 'this' that it was computed, and contains a valid result. */
+  void set_first_execution() { if (first_execution) first_execution= FALSE; }
+  bool is_expensive_processor(uchar *arg);
+  bool expr_cache_is_needed(THD *thd);
+  
+  /* 
+    Return the identifier that we could use to identify the subquery for the
+    user.
+  */
+  int get_identifier();
   friend class Item_ref_null_helper;
   friend class Item_is_not_null_test;
+  friend class Item_in_optimizer;
   friend class subselect_indexsubquery_engine;
+  friend class subselect_hash_sj_engine;
+  friend class subselect_partial_match_engine;
 };
 
 
@@ -360,7 +487,7 @@ public:
 class subselect_engine: public Sql_alloc
 {
 protected:
-  select_subselect *result; /* results storage class */
+  select_result_interceptor *result; /* results storage class */
   THD *thd; /* pointer to current THD */
   Item_subselect *item; /* item, that use this engine */
   enum Item_result res_type; /* type of results */
@@ -368,14 +495,20 @@ protected:
   bool maybe_null; /* may be null (first item in select) */
 public:
 
-  subselect_engine(Item_subselect *si, select_subselect *res)
-    :thd(0)
+  enum enum_engine_type {ABSTRACT_ENGINE, SINGLE_SELECT_ENGINE,
+                         UNION_ENGINE, UNIQUESUBQUERY_ENGINE,
+                         INDEXSUBQUERY_ENGINE, HASH_SJ_ENGINE,
+                         ROWID_MERGE_ENGINE, TABLE_SCAN_ENGINE};
+
+  subselect_engine(THD *thd_arg, Item_subselect *si,
+                   select_result_interceptor *res)
   {
     result= res;
     item= si;
     res_type= STRING_RESULT;
     res_field_type= MYSQL_TYPE_VAR_STRING;
     maybe_null= 0;
+    set_thd(thd_arg);
   }
   virtual ~subselect_engine() {}; // to satisfy compiler
   virtual void cleanup()= 0;
@@ -418,12 +551,14 @@ public:
   virtual table_map upper_select_const_tables()= 0;
   static table_map calc_const_tables(TABLE_LIST *);
   virtual void print(String *str, enum_query_type query_type)= 0;
-  virtual bool change_result(Item_subselect *si, select_subselect *result)= 0;
+  virtual bool change_result(Item_subselect *si,
+                             select_result_interceptor *result)= 0;
   virtual bool no_tables()= 0;
   virtual bool is_executed() const { return FALSE; }
   /* Check if subquery produced any rows during last query execution */
   virtual bool no_rows() = 0;
-
+  virtual enum_engine_type engine_type() { return ABSTRACT_ENGINE; }
+  virtual int get_identifier() { DBUG_ASSERT(0); return 0; }
 protected:
   void set_row(List<Item> &item_list, Item_cache **row);
 };
@@ -437,8 +572,8 @@ class subselect_single_select_engine: public subselect_engine
   st_select_lex *select_lex; /* corresponding select_lex */
   JOIN * join; /* corresponding JOIN structure */
 public:
-  subselect_single_select_engine(st_select_lex *select,
-				 select_subselect *result,
+  subselect_single_select_engine(THD *thd_arg, st_select_lex *select,
+				 select_result_interceptor *result,
 				 Item_subselect *item);
   void cleanup();
   int prepare();
@@ -449,11 +584,16 @@ public:
   void exclude();
   table_map upper_select_const_tables();
   virtual void print (String *str, enum_query_type query_type);
-  bool change_result(Item_subselect *si, select_subselect *result);
+  bool change_result(Item_subselect *si, select_result_interceptor *result);
   bool no_tables();
   bool may_be_null();
   bool is_executed() const { return executed; }
   bool no_rows();
+  virtual enum_engine_type engine_type() { return SINGLE_SELECT_ENGINE; }
+  int get_identifier();
+
+  friend class subselect_hash_sj_engine;
+  friend class Item_in_subselect;
 };
 
 
@@ -461,8 +601,8 @@ class subselect_union_engine: public subselect_engine
 {
   st_select_lex_unit *unit;  /* corresponding unit structure */
 public:
-  subselect_union_engine(st_select_lex_unit *u,
-			 select_subselect *result,
+  subselect_union_engine(THD *thd_arg, st_select_lex_unit *u,
+			 select_result_interceptor *result,
 			 Item_subselect *item);
   void cleanup();
   int prepare();
@@ -473,10 +613,11 @@ public:
   void exclude();
   table_map upper_select_const_tables();
   virtual void print (String *str, enum_query_type query_type);
-  bool change_result(Item_subselect *si, select_subselect *result);
+  bool change_result(Item_subselect *si, select_result_interceptor *result);
   bool no_tables();
   bool is_executed() const;
   bool no_rows();
+  virtual enum_engine_type engine_type() { return UNION_ENGINE; }
 };
 
 
@@ -516,10 +657,8 @@ public:
   // constructor can assign THD because it will be called after JOIN::prepare
   subselect_uniquesubquery_engine(THD *thd_arg, st_join_table *tab_arg,
 				  Item_subselect *subs, Item *where)
-    :subselect_engine(subs, 0), tab(tab_arg), cond(where)
-  {
-    set_thd(thd_arg);
-  }
+    :subselect_engine(thd_arg, subs, 0), tab(tab_arg), cond(where)
+  {}
   ~subselect_uniquesubquery_engine();
   void cleanup();
   int prepare();
@@ -530,11 +669,14 @@ public:
   void exclude();
   table_map upper_select_const_tables() { return 0; }
   virtual void print (String *str, enum_query_type query_type);
-  bool change_result(Item_subselect *si, select_subselect *result);
+  bool change_result(Item_subselect *si, select_result_interceptor *result);
   bool no_tables();
+  int index_lookup(); /* TIMOUR: this method needs refactoring. */
   int scan_table();
   bool copy_ref_key();
+  int copy_ref_key_simple();  /* TIMOUR: this method needs refactoring. */
   bool no_rows() { return empty_result_set; }
+  virtual enum_engine_type engine_type() { return UNIQUESUBQUERY_ENGINE; }
 };
 
 
@@ -584,6 +726,7 @@ public:
   {}
   int exec();
   virtual void print (String *str, enum_query_type query_type);
+  virtual enum_engine_type engine_type() { return INDEXSUBQUERY_ENGINE; }
 };
 
 
@@ -592,9 +735,446 @@ inline bool Item_subselect::is_evaluated() const
   return engine->is_executed();
 }
 
+
 inline bool Item_subselect::is_uncacheable() const
 {
   return engine->uncacheable();
 }
 
 
+/**
+  Compute an IN predicate via a hash semi-join. This class is responsible for
+  the materialization of the subquery, and the selection of the correct and
+  optimal execution method (e.g. direct index lookup, or partial matching) for
+  the IN predicate.
+*/
+
+class subselect_hash_sj_engine : public subselect_engine
+{
+protected:
+  /* The table into which the subquery is materialized. */
+  TABLE *tmp_table;
+  /* TRUE if the subquery was materialized into a temp table. */
+  bool is_materialized;
+  /*
+    The old engine already chosen at parse time and stored in permanent memory.
+    Through this member we can re-create and re-prepare materialize_join for
+    each execution of a prepared statement. We also reuse the functionality
+    of subselect_single_select_engine::[prepare | cols].
+  */
+  subselect_single_select_engine *materialize_engine;
+  /* The engine used to compute the IN predicate. */
+  subselect_engine *lookup_engine;
+  /*
+    QEP to execute the subquery and materialize its result into a
+    temporary table. Created during the first call to exec().
+  */
+  JOIN *materialize_join;
+
+  /* Keyparts of the only non-NULL composite index in a rowid merge. */
+  MY_BITMAP non_null_key_parts;
+  /* Keyparts of the single column indexes with NULL, one keypart per index. */
+  MY_BITMAP partial_match_key_parts;
+  uint count_partial_match_columns;
+  uint count_null_only_columns;
+  /*
+    A conjunction of all the equality conditions between all pairs of
+    expressions that are arguments of an IN predicate. We need these to
+    post-filter some IN results because index lookups sometimes match values
+    that are actually not equal to the search key in SQL terms.
+  */
+  Item_cond_and *semi_join_conds;
+  /* Possible execution strategies that can be used to compute hash semi-join.*/
+  enum exec_strategy {
+    UNDEFINED,
+    COMPLETE_MATCH, /* Use regular index lookups. */
+    PARTIAL_MATCH,  /* Use some partial matching strategy. */
+    PARTIAL_MATCH_MERGE, /* Use partial matching through index merging. */
+    PARTIAL_MATCH_SCAN,  /* Use partial matching through table scan. */
+    IMPOSSIBLE      /* Subquery materialization is not applicable. */
+  };
+  /* The chosen execution strategy. Computed after materialization. */
+  exec_strategy strategy;
+protected:
+  exec_strategy get_strategy_using_schema();
+  exec_strategy get_strategy_using_data();
+  ulonglong rowid_merge_buff_size(bool has_non_null_key,
+                                  bool has_covering_null_row,
+                                  MY_BITMAP *partial_match_key_parts);
+  void choose_partial_match_strategy(bool has_non_null_key,
+                                     bool has_covering_null_row,
+                                     MY_BITMAP *partial_match_key_parts);
+  bool make_semi_join_conds();
+  subselect_uniquesubquery_engine* make_unique_engine();
+
+public:
+  subselect_hash_sj_engine(THD *thd, Item_subselect *in_predicate,
+                           subselect_single_select_engine *old_engine)
+    :subselect_engine(thd, in_predicate, NULL), tmp_table(NULL),
+    is_materialized(FALSE), materialize_engine(old_engine), lookup_engine(NULL),
+    materialize_join(NULL), count_partial_match_columns(0),
+    count_null_only_columns(0), semi_join_conds(NULL), strategy(UNDEFINED)
+  {}
+  ~subselect_hash_sj_engine();
+
+  bool init_permanent(List<Item> *tmp_columns);
+  bool init_runtime();
+  void cleanup();
+  int prepare() { return 0; } /* Override virtual function in base class. */
+  int exec();
+  virtual void print(String *str, enum_query_type query_type);
+  uint cols()
+  {
+    return materialize_engine->cols();
+  }
+  uint8 uncacheable() { return materialize_engine->uncacheable(); }
+  table_map upper_select_const_tables() { return 0; }
+  bool no_rows() { return !tmp_table->file->stats.records; }
+  virtual enum_engine_type engine_type() { return HASH_SJ_ENGINE; }
+  /*
+    TODO: factor out all these methods in a base subselect_index_engine class
+    because all of them have dummy implementations and should never be called.
+  */
+  void fix_length_and_dec(Item_cache** row); /* TODO: move to base class. */
+  void exclude(); /* TODO: move to base class. */
+  /* TODO: move to base class. */
+  bool change_result(Item_subselect *si, select_result_interceptor *result);
+  bool no_tables(); /* TODO: move to base class. */
+};
+
+
+/*
+  Distinguish the type of (0-based) row numbers from the type of the index into
+  an array of row numbers.
+*/
+typedef ha_rows rownum_t;
+
+
+/*
+  An Ordered_key is an in-memory table index that allows O(log(N)) time
+  lookups of a multi-part key.
+
+  If the index is over a single column, then this column may contain NULLs, and
+  the NULLs are stored and tested separately for NULL in O(1) via is_null().
+  Multi-part indexes assume that the indexed columns do not contain NULLs.
+
+  TODO:
+  = Due to the unnatural asymmetry between single and multi-part indexes, it
+    makes sense to somehow refactor or extend the class.
+
+  = This class can be refactored into a base abstract interface, and two
+    subclasses:
+    - one to represent single-column indexes, and
+    - another to represent multi-column indexes.
+    Such separation would allow slightly more efficient implementation of
+    the single-column indexes.
+  = The current design requires such indexes to be fully recreated for each
+    PS (re)execution, however most of the comprising objects can be reused.
+*/
+
+class Ordered_key : public Sql_alloc
+{
+protected:
+  /*
+    Index of the key in an array of keys. This index makes it possible to
+    construct (sub)sets of keys represented by bitmaps.
+  */
+  uint keyid;
+  /* The table being indexed. */
+  TABLE *tbl;
+  /* The columns being indexed. */
+  Item_field **key_columns;
+  /* Number of elements in 'key_columns' (number of key parts). */
+  uint key_column_count;
+  /*
+    An expression, or sequence of expressions that forms the search key.
+    The search key is a sequence when it is Item_row. Each element of the
+    sequence is accessible via Item::element_index(int i).
+  */
+  Item *search_key;
+
+/* Value index related members. */
+  /*
+    The actual value index: a sorted sequence of row numbers.
+  */
+  rownum_t *key_buff;
+  /* Number of elements in key_buff. */
+  ha_rows key_buff_elements;
+  /* Current element in 'key_buff'. */
+  ha_rows cur_key_idx;
+  /*
+    Mapping from row numbers to row ids. The element row_num_to_rowid[i]
+    contains a buffer with the rowid for the row numbered 'i'.
+    The memory for this member is not maintained by this class because
+    all Ordered_key indexes of the same table share the same mapping.
+  */
+  uchar *row_num_to_rowid;
+  /*
+    A sequence of predicates to compare the search key with the corresponding
+    columns of a table row from the index.
+  */
+  Item_func_lt **compare_pred;
+
+/* Null index related members. */
+  MY_BITMAP null_key; /* Bit 'i' is set by set_null(i); see is_null(). */
+  /* Count of NULLs per column. */
+  ha_rows null_count;
+  /* The row number that contains the first NULL in a column. */
+  ha_rows min_null_row;
+  /* The row number that contains the last NULL in a column. */
+  ha_rows max_null_row;
+
+protected:
+  bool alloc_keys_buffers();
+  /*
+    Quick sort comparison function that compares two rows of the same table
+    identified by their row numbers.
+  */
+  int cmp_keys_by_row_data(rownum_t a, rownum_t b);
+  static int cmp_keys_by_row_data_and_rownum(Ordered_key *key,
+                                             rownum_t* a, rownum_t* b);
+
+  int cmp_key_with_search_key(rownum_t row_num);
+
+public:
+  Ordered_key(uint keyid_arg, TABLE *tbl_arg,
+              Item *search_key_arg, ha_rows null_count_arg,
+              ha_rows min_null_row_arg, ha_rows max_null_row_arg,
+              uchar *row_num_to_rowid_arg);
+  ~Ordered_key();
+  void cleanup();
+  /* Initialize a multi-column index. */
+  bool init(MY_BITMAP *columns_to_index);
+  /* Initialize a single-column index. */
+  bool init(int col_idx);
+
+  uint get_column_count() { return key_column_count; }
+  uint get_keyid() { return keyid; }
+  uint get_field_idx(uint i)
+  {
+    DBUG_ASSERT(i < key_column_count);
+    return key_columns[i]->field->field_index;
+  }
+  /*
+    Get the search key element that corresponds to the i-th key part of this
+    index. Unlike get_field_idx(), 'i' is not range-checked here.
+  */
+  Item *get_search_key(uint i)
+  {
+    return search_key->element_index(key_columns[i]->field->field_index);
+  }
+  void add_key(rownum_t row_num)
+  {
+    /* The caller must know how many elements to add. */
+    DBUG_ASSERT(key_buff_elements && cur_key_idx < key_buff_elements);
+    key_buff[cur_key_idx]= row_num;
+    ++cur_key_idx;
+  }
+
+  void sort_keys();
+  double null_selectivity();
+
+  /*
+    Position the current element at the first row that matches the key.
+    The key itself is propagated by evaluating the current value(s) of
+    this->search_key.
+  */
+  bool lookup();
+  /* Move the current index cursor to the first key. */
+  void first()
+  {
+    DBUG_ASSERT(key_buff_elements);
+    cur_key_idx= 0;
+  }
+  /* TODO: document (only declared here; defined out of line). */
+  bool next_same();
+  /* Advance the cursor to the next key; FALSE if already at the last. */
+  bool next()
+  {
+    DBUG_ASSERT(key_buff_elements);
+    if (cur_key_idx < key_buff_elements - 1)
+    {
+      ++cur_key_idx;
+      return TRUE;
+    }
+    return FALSE;
+  };
+  /* Return the current index element. */
+  rownum_t current()
+  {
+    DBUG_ASSERT(key_buff_elements && cur_key_idx < key_buff_elements);
+    return key_buff[cur_key_idx];
+  }
+
+  void set_null(rownum_t row_num)
+  {
+    bitmap_set_bit(&null_key, (uint)row_num);
+  }
+  bool is_null(rownum_t row_num)
+  {
+    /*
+      Indexes consisting of only NULLs do not have a bitmap buffer at all.
+      Their only initialized member is 'n_bits', which is equal to the number
+      of temp table rows.
+    */
+    if (null_count == tbl->file->stats.records)
+    {
+      DBUG_ASSERT(tbl->file->stats.records == null_key.n_bits);
+      return TRUE;
+    }
+    if (row_num > max_null_row || row_num < min_null_row)
+      return FALSE;
+    return bitmap_is_set(&null_key, (uint)row_num);
+  }
+  void print(String *str);
+};
+
+
+class subselect_partial_match_engine : public subselect_engine
+{
+protected:
+  /* The temporary table that contains a materialized subquery. */
+  TABLE *tmp_table;
+  /*
+    The engine used to check whether an IN predicate is TRUE or not. If not
+    TRUE, then subselect_rowid_merge_engine further distinguishes between
+    FALSE and UNKNOWN.
+  */
+  subselect_uniquesubquery_engine *lookup_engine;
+  /* A list of equalities between each pair of IN operands. */
+  List<Item> *equi_join_conds;
+  /*
+    If there is a row, such that all its NULL-able components are NULL, this
+    member is set to the number of covered columns. If there is no covering
+    row, then this is 0.
+  */
+  uint covering_null_row_width;
+protected:
+  virtual bool partial_match()= 0;
+public:
+  subselect_partial_match_engine(THD *thd_arg,
+                                 subselect_uniquesubquery_engine *engine_arg,
+                                 TABLE *tmp_table_arg, Item_subselect *item_arg,
+                                 select_result_interceptor *result_arg,
+                                 List<Item> *equi_join_conds_arg,
+                                 uint covering_null_row_width_arg);
+  int prepare() { return 0; }
+  int exec();
+  void fix_length_and_dec(Item_cache**) {}
+  uint cols() { /* TODO: what is the correct value? */ return 1; }
+  uint8 uncacheable() { return UNCACHEABLE_DEPENDENT; }
+  void exclude() {}
+  table_map upper_select_const_tables() { return 0; }
+  bool change_result(Item_subselect*, select_result_interceptor*)
+  { DBUG_ASSERT(FALSE); return false; }
+  bool no_tables() { return false; }
+  bool no_rows()
+  {
+    /*
+      TODO: The intended semantics of this method are unclear. The
+      current result (the negated null_value of the IN item) is computed
+      so that the call to no_rows() from Item_in_optimizer::val_int()
+      sets Item_in_optimizer::null_value correctly.
+    */
+    return !(((Item_in_subselect *) item)->null_value);
+  }
+  void print(String*, enum_query_type);
+
+  friend void subselect_hash_sj_engine::cleanup();
+};
+
+
+class subselect_rowid_merge_engine: public subselect_partial_match_engine
+{
+protected:
+  /*
+    Mapping from row numbers to row ids. The rowids are stored sequentially
+    in the array - rowid[i] is located in row_num_to_rowid + i * rowid_length.
+  */
+  uchar *row_num_to_rowid;
+  /*
+    A subset of all the keys for which there is a match for the same row.
+    Used during execution. Computed for each outer reference.
+  */
+  MY_BITMAP matching_keys;
+  /*
+    The columns of the outer reference that are NULL. Computed for each
+    outer reference.
+  */
+  MY_BITMAP matching_outer_cols;
+  /*
+    Columns that consist of only NULLs. Such columns match any value.
+    Computed once per query execution.
+  */
+  MY_BITMAP null_only_columns;
+  /*
+    Indexes of row numbers, sorted by <column_value, row_number>. If an
+    index may contain NULLs, the NULLs are stored efficiently in a bitmap.
+
+    The indexes are sorted by the selectivity of their NULL sub-indexes, the
+    one with the fewer NULLs is first. Thus, if there is any index on
+    non-NULL columns, it is contained in merge_keys[0].
+  */
+  Ordered_key **merge_keys;
+  /* The number of elements in merge_keys. */
+  uint keys_count;
+  /*
+    An index on all non-NULL columns of 'tmp_table'. The index has the
+    logical form: <[v_i1 | ... | v_ik], rownum>. It allows to find the row
+    number where the columns c_i1,...,c_ik contain the values v_i1,...,v_ik.
+    If such an index exists, it is always the first element of 'merge_keys'.
+  */
+  Ordered_key *non_null_key;
+  /*
+    Priority queue of Ordered_key indexes, one per NULLable column.
+    This queue is used by the partial match algorithm in method exec().
+  */
+  QUEUE pq;
+protected:
+  /*
+    Comparison function to compare keys in order of decreasing bitmap
+    selectivity.
+  */
+  static int cmp_keys_by_null_selectivity(Ordered_key **k1, Ordered_key **k2);
+  /*
+    Comparison function used by the priority queue pq, the 'smaller' key
+    is the one with the smaller current row number.
+  */
+  static int cmp_keys_by_cur_rownum(void *arg, uchar *k1, uchar *k2);
+
+  bool test_null_row(rownum_t row_num);
+  bool partial_match();
+public:
+  subselect_rowid_merge_engine(THD *thd_arg,
+                               subselect_uniquesubquery_engine *engine_arg,
+                               TABLE *tmp_table_arg, uint keys_count_arg,
+                               uint covering_null_row_width_arg,
+                               Item_subselect *item_arg,
+                               select_result_interceptor *result_arg,
+                               List<Item> *equi_join_conds_arg)
+    :subselect_partial_match_engine(thd_arg, engine_arg, tmp_table_arg,
+                                    item_arg, result_arg, equi_join_conds_arg,
+                                    covering_null_row_width_arg),
+    keys_count(keys_count_arg), non_null_key(NULL)
+  {}
+  ~subselect_rowid_merge_engine();
+  bool init(MY_BITMAP *non_null_key_parts, MY_BITMAP *partial_match_key_parts);
+  void cleanup();
+  virtual enum_engine_type engine_type() { return ROWID_MERGE_ENGINE; }
+};
+
+
+class subselect_table_scan_engine: public subselect_partial_match_engine
+{ /* Partial match engine that reports TABLE_SCAN_ENGINE as its type. */
+protected:
+  bool partial_match(); /* Implements the base class' pure virtual. */
+public:
+  subselect_table_scan_engine(THD *thd_arg,
+                              subselect_uniquesubquery_engine *engine_arg,
+                              TABLE *tmp_table_arg, Item_subselect *item_arg,
+                              select_result_interceptor *result_arg,
+                              List<Item> *equi_join_conds_arg,
+                              uint covering_null_row_width_arg);
+  void cleanup();
+  virtual enum_engine_type engine_type() { return TABLE_SCAN_ENGINE; }
+};
diff --git a/sql/item_sum.cc b/sql/item_sum.cc
index 40ece652ba3..cc2704efb32 100644
--- a/sql/item_sum.cc
+++ b/sql/item_sum.cc
@@ -319,6 +319,7 @@ bool Item_sum::register_sum_func(THD *thd, Item **ref)
   if (aggr_level >= 0)
   {
     ref_by= ref;
+    thd->lex->current_select->register_dependency_item(aggr_sel, ref);
     /* Add the object to the list of registered objects assigned to aggr_sel */
     if (!aggr_sel->inner_sum_func_list)
       next= this;
@@ -350,7 +351,7 @@ bool Item_sum::register_sum_func(THD *thd, Item **ref)
          sl= sl->master_unit()->outer_select() )
       sl->master_unit()->item->with_sum_func= 1;
   }
-  thd->lex->current_select->mark_as_dependent(aggr_sel, NULL);
+  thd->lex->current_select->mark_as_dependent(thd, aggr_sel, NULL);
   return FALSE;
 }
 
diff --git a/sql/key.cc b/sql/key.cc
index 89423e5280e..19db7e9ec1f 100644
--- a/sql/key.cc
+++ b/sql/key.cc
@@ -103,30 +103,46 @@ int find_ref_key(KEY *key, uint key_count, uchar *record, Field *field,
   @param from_record full record to be copied from
   @param key_info    descriptor of the index
   @param key_length  specifies length of all keyparts that will be copied
+  @param with_zerofill  skipped bytes in the key buffer to be filled with 0
 */
 
 void key_copy(uchar *to_key, uchar *from_record, KEY *key_info,
-              uint key_length)
+              uint key_length, bool with_zerofill)
 {
   uint length;
   KEY_PART_INFO *key_part;
 
   if (key_length == 0)
     key_length= key_info->key_length;
-  for (key_part= key_info->key_part; (int) key_length > 0; key_part++)
+  for (key_part= key_info->key_part;
+       (int) key_length > 0;
+       key_part++, to_key+= length, key_length-= length)
   {
     if (key_part->null_bit)
     {
       *to_key++= test(from_record[key_part->null_offset] &
 		   key_part->null_bit);
       key_length--;
+      if (to_key[-1])
+      {
+        /*
+          Don't copy data for null values
+          The -1 below is to subtract the null byte which is already handled
+        */
+        length= min(key_length, (uint) key_part->store_length-1);
+        if (with_zerofill)
+          bzero((char*) to_key, length);
+        continue;
+      }
     }
     if (key_part->key_part_flag & HA_BLOB_PART ||
         key_part->key_part_flag & HA_VAR_LENGTH_PART)
     {
       key_length-= HA_KEY_BLOB_LENGTH;
       length= min(key_length, key_part->length);
-      key_part->field->get_key_image(to_key, length, Field::itRAW);
+      uint bytes= key_part->field->get_key_image(to_key, length, Field::itRAW);
+      if (with_zerofill && bytes < length)
+        bzero((char*) to_key + bytes, length - bytes);
       to_key+= HA_KEY_BLOB_LENGTH;
     }
     else
@@ -138,8 +154,6 @@ void key_copy(uchar *to_key, uchar *from_record, KEY *key_info,
       if (bytes < length)
         cs->cset->fill(cs, (char*) to_key + bytes, length - bytes, ' ');
     }
-    to_key+= length;
-    key_length-= length;
   }
 }
 
@@ -166,16 +180,28 @@ void key_restore(uchar *to_record, uchar *from_key, KEY *key_info,
   {
     key_length= key_info->key_length;
   }
-  for (key_part= key_info->key_part ; (int) key_length > 0 ; key_part++)
+  for (key_part= key_info->key_part ;
+       (int) key_length > 0 ;
+       key_part++, from_key+= length, key_length-= length)
   {
     uchar used_uneven_bits= 0;
     if (key_part->null_bit)
     {
-      if (*from_key++)
+      bool null_value; 
+      if ((null_value= *from_key++))
 	to_record[key_part->null_offset]|= key_part->null_bit;
       else
 	to_record[key_part->null_offset]&= ~key_part->null_bit;
       key_length--;
+      if (null_value)
+      {
+        /*
+          Don't copy data for null values
+          The -1 below is to subtract the null byte which is already handled
+        */
+        length= min(key_length, (uint) key_part->store_length-1);
+        continue;
+      }
     }
     if (key_part->type == HA_KEYTYPE_BIT)
     {
@@ -229,8 +255,6 @@ void key_restore(uchar *to_record, uchar *from_key, KEY *key_info,
       memcpy(to_record + key_part->offset, from_key + used_uneven_bits
              , (size_t) length - used_uneven_bits);
     }
-    from_key+= length;
-    key_length-= length;
   }
 }
 
@@ -548,3 +572,280 @@ next_loop:
   } while (key_info); /* no more keys to test */
   DBUG_RETURN(0);
 }
+
+
+/*
+  Compare two key tuples.
+
+  @brief
+    Compare two key tuples, i.e. two key values in KeyTupleFormat.
+
+  @param part          First KEY_PART_INFO of the key (one per key part)
+  @param key1          First key to compare
+  @param key2          Second key to compare
+  @param tuple_length  Length of key1 (and key2, they are the same) in bytes.
+
+  @return
+    @retval  0  key1 == key2
+    @retval -1  key1 < key2 (a NULL part sorts before a non-NULL part)
+    @retval +1  key1 > key2
+*/
+
+int key_tuple_cmp(KEY_PART_INFO *part, uchar *key1, uchar *key2, 
+                  uint tuple_length)
+{
+  uchar *key1_end= key1 + tuple_length;
+  int len;
+  int res;
+  LINT_INIT(len);
+  for (;key1 < key1_end; key1 += len, key2 += len, part++)
+  {
+    len= part->store_length; // bytes this key part occupies in the tuple
+    if (part->null_bit)
+    {
+      if (*key1) // key1 == NULL
+      {
+        if (!*key2) // key1(NULL) < key2(notNULL)
+          return -1;
+        continue; // both NULL: equal for this part, go to the next one
+      }
+      else if (*key2) // key1(notNULL) > key2 (NULL)
+        return 1;
+      /* Step over the NULL bytes for key_cmp() call */
+      key1++;
+      key2++;
+      len--;
+    }
+    if ((res= part->field->key_cmp(key1, key2)))
+      return res;
+  }
+  return 0;
+}
+
+
+/**
+  Get hash value for the key from a key buffer
+
+  @param  key_info        the key descriptor
+  @param  used_key_parts  number of key parts used for the key
+  @param  key             pointer to the buffer with the key value
+
+  @details
+  When hashing we should take special care only of:
+  1. NULLs (and keyparts which can be null so one byte reserved for it);
+  2. Strings for which we have to take into account their collations
+  and the values of their lengths in the prefixes.
+
+  @return  hash value calculated for the key
+*/
+
+ulong key_hashnr(KEY *key_info, uint used_key_parts, const uchar *key)
+{
+  ulong nr=1, nr2=4;
+  KEY_PART_INFO *key_part= key_info->key_part;
+  KEY_PART_INFO *end_key_part= key_part + used_key_parts;
+
+  for (; key_part < end_key_part; key_part++)
+  {
+    uchar *pos= (uchar*)key;
+    CHARSET_INFO *cs;
+    uint length, pack_length;
+    bool is_string= TRUE;
+    LINT_INIT(cs);
+    key+= key_part->length;        /* 'pos' stays at the start of the part */
+    if (key_part->null_bit)
+    {
+      key++;                       /* Skip null byte */
+      if (*pos)                    /* Found null */
+      {
+        nr^= (nr << 1) | 1;
+        /* Add key pack length to key for VARCHAR segments */
+        switch (key_part->type) {
+        case HA_KEYTYPE_VARTEXT1:
+        case HA_KEYTYPE_VARBINARY1:
+        case HA_KEYTYPE_VARTEXT2:
+        case HA_KEYTYPE_VARBINARY2:
+          key+= 2;
+          break;
+        default:
+          ;
+        }
+    continue;
+      }
+      pos++;                       /* Skip null byte */
+    }
+    /* If it is string set parameters of the string */
+    switch (key_part->type) {
+    case HA_KEYTYPE_TEXT:
+      cs= key_part->field->charset();
+      length= key_part->length;
+      pack_length= 0;
+      break;
+    case HA_KEYTYPE_BINARY :
+      cs= &my_charset_bin;
+      length= key_part->length;
+      pack_length= 0;
+      break;
+    case HA_KEYTYPE_VARTEXT1:
+    case HA_KEYTYPE_VARTEXT2:
+      cs= key_part->field->charset();
+      length= uint2korr(pos);
+      pack_length= 2;
+      break;
+    case HA_KEYTYPE_VARBINARY1:
+    case HA_KEYTYPE_VARBINARY2:
+      cs= &my_charset_bin;
+      length= uint2korr(pos);
+      pack_length= 2;
+      break;
+    default:
+      is_string= FALSE;
+    }
+
+    if (is_string)
+    {
+      if (cs->mbmaxlen > 1)
+      {
+        uint char_length= my_charpos(cs, pos + pack_length,
+                                     pos + pack_length + length,
+                                     length / cs->mbmaxlen);
+        set_if_smaller(length, char_length);
+      }
+      cs->coll->hash_sort(cs, pos+pack_length, length, &nr, &nr2);
+      key+= pack_length;
+    }
+    else
+    {
+      for (; pos < (uchar*)key ; pos++)
+      {
+        nr^=(ulong) ((((uint) nr & 63)+nr2)*((uint) *pos)) + (nr << 8);
+        nr2+=3;
+      }
+    }
+  }
+  DBUG_PRINT("exit", ("hash: %lx", nr));
+  return(nr);
+}
+
+
+/**
+  Check whether two keys in the key buffers are equal
+
+  @param key_info        the key descriptor
+  @param used_key_parts  number of key parts used for the keys
+  @param key1            pointer to the buffer with the first key
+  @param key2            pointer to the buffer with the second key
+
+  @details See details of key_hashnr().
+
+  @retval TRUE  keys in the buffers are NOT equal
+  @retval FALSE keys in the buffers are equal
+*/
+
+bool key_buf_cmp(KEY *key_info, uint used_key_parts,
+                 const uchar *key1, const uchar *key2)
+{
+  KEY_PART_INFO *key_part= key_info->key_part;
+  KEY_PART_INFO *end_key_part= key_part + used_key_parts;
+
+  for (; key_part < end_key_part; key_part++)
+  {
+    uchar *pos1= (uchar*)key1;
+    uchar *pos2= (uchar*)key2;
+    CHARSET_INFO *cs;
+    uint length1, length2, pack_length;
+    bool is_string= TRUE;
+    LINT_INIT(cs);
+    key1+= key_part->length;
+    key2+= key_part->length;
+    if (key_part->null_bit)
+    {
+      key1++; key2++;                           /* Skip null byte */
+      if (*pos1 && *pos2)                       /* Both are null */
+      {
+        /* Add key pack length to key for VARCHAR segments */
+        switch (key_part->type) {
+        case HA_KEYTYPE_VARTEXT1:
+        case HA_KEYTYPE_VARBINARY1:
+        case HA_KEYTYPE_VARTEXT2:
+        case HA_KEYTYPE_VARBINARY2:
+          key1+= 2; key2+= 2;
+          break;
+        default:
+          ;
+        }
+        continue;
+      }
+      if (*pos1 != *pos2)
+        return TRUE;                            /* Null bytes differ */
+      pos1++; pos2++;
+    }
+
+    /* If it is string set parameters of the string */
+    switch (key_part->type) {
+    case HA_KEYTYPE_TEXT:
+      cs= key_part->field->charset();
+      length1= length2= key_part->length;
+      pack_length= 0;
+      break;
+    case HA_KEYTYPE_BINARY :
+      cs= &my_charset_bin;
+      length1= length2= key_part->length;
+      pack_length= 0;
+      break;
+    case HA_KEYTYPE_VARTEXT1:
+    case HA_KEYTYPE_VARTEXT2:
+      cs= key_part->field->charset();
+      length1= uint2korr(pos1);
+      length2= uint2korr(pos2);
+      pack_length= 2;
+      break;
+    case HA_KEYTYPE_VARBINARY1:
+    case HA_KEYTYPE_VARBINARY2:
+      cs= &my_charset_bin;
+      length1= uint2korr(pos1);
+      length2= uint2korr(pos2);
+      pack_length= 2;
+      break;
+    default:
+      is_string= FALSE;
+    }
+
+    if (is_string)
+    {
+      /*
+        Compare the strings taking into account length in characters
+        and collation
+      */
+      uint byte_len1= length1, byte_len2= length2;
+      if (cs->mbmaxlen > 1)
+      {
+        uint char_length1= my_charpos(cs, pos1 + pack_length,
+                                      pos1 + pack_length + length1,
+                                      length1 / cs->mbmaxlen);
+        uint char_length2= my_charpos(cs, pos2 + pack_length,
+                                      pos2 + pack_length + length2,
+                                      length2 / cs->mbmaxlen);
+        set_if_smaller(length1, char_length1);
+        set_if_smaller(length2, char_length2);
+      }
+      if (length1 != length2 ||
+          cs->coll->strnncollsp(cs,
+                                pos1 + pack_length, byte_len1,
+                                pos2 + pack_length, byte_len2,
+                                1))
+        return TRUE;
+      key1+= pack_length; key2+= pack_length;
+    }
+    else
+    {
+      /* it is OK to compare non-string byte per byte */
+      for (; pos1 < (uchar*)key1 ; pos1++, pos2++)
+      {
+        if (pos1[0] != pos2[0])
+          return TRUE;
+      }
+    }
+  }
+  return FALSE;
+}
diff --git a/sql/lock.cc b/sql/lock.cc
index 9a069d6d96f..740b54f9153 100644
--- a/sql/lock.cc
+++ b/sql/lock.cc
@@ -84,41 +84,10 @@
 
 extern HASH open_cache;
 
-/* flags for get_lock_data */
-#define GET_LOCK_UNLOCK         1
-#define GET_LOCK_STORE_LOCKS    2
-
-static MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table,uint count,
-				 uint flags, TABLE **write_locked);
-static void reset_lock_data(MYSQL_LOCK *sql_lock);
 static int lock_external(THD *thd, TABLE **table,uint count);
 static int unlock_external(THD *thd, TABLE **table,uint count);
 static void print_lock_error(int error, const char *);
 
-/*
-  Lock tables.
-
-  SYNOPSIS
-    mysql_lock_tables()
-    thd                         The current thread.
-    tables                      An array of pointers to the tables to lock.
-    count                       The number of tables to lock.
-    flags                       Options:
-      MYSQL_LOCK_IGNORE_GLOBAL_READ_LOCK      Ignore a global read lock
-      MYSQL_LOCK_IGNORE_GLOBAL_READ_ONLY      Ignore SET GLOBAL READ_ONLY
-      MYSQL_LOCK_IGNORE_FLUSH                 Ignore a flush tables.
-      MYSQL_LOCK_NOTIFY_IF_NEED_REOPEN        Instead of reopening altered
-                                              or dropped tables by itself,
-                                              mysql_lock_tables() should
-                                              notify upper level and rely
-                                              on caller doing this.
-    need_reopen                 Out parameter, TRUE if some tables were altered
-                                or deleted and should be reopened by caller.
-
-  RETURN
-    A lock structure pointer on success.
-    NULL on error or if some tables should be reopen.
-*/
 
 /* Map the return value of thr_lock to an error from errmsg.txt */
 static int thr_lock_errno_to_mysql[]=
@@ -132,6 +101,7 @@ static int thr_lock_errno_to_mysql[]=
   @param flags Lock flags
   @return 0 if all the check passed, non zero if a check failed.
 */
+
 int mysql_lock_tables_check(THD *thd, TABLE **tables, uint count, uint flags)
 {
   bool log_table_write_query;
@@ -194,81 +164,118 @@ int mysql_lock_tables_check(THD *thd, TABLE **tables, uint count, uint flags)
   DBUG_RETURN(0);
 }
 
+
+/*
+  Lock tables.
+
+  SYNOPSIS
+    mysql_lock_tables()
+    thd                         The current thread.
+    tables                      An array of pointers to the tables to lock.
+    count                       The number of tables to lock.
+    flags                       Options:
+      MYSQL_LOCK_IGNORE_GLOBAL_READ_LOCK      Ignore a global read lock
+      MYSQL_LOCK_IGNORE_GLOBAL_READ_ONLY      Ignore SET GLOBAL READ_ONLY
+      MYSQL_LOCK_IGNORE_FLUSH                 Ignore a flush tables.
+      MYSQL_LOCK_NOTIFY_IF_NEED_REOPEN        Instead of reopening altered
+                                              or dropped tables by itself,
+                                              mysql_lock_tables() should
+                                              notify upper level and rely
+                                              on caller doing this.
+    need_reopen                 Out parameter, TRUE if some tables were altered
+                                or deleted and should be reopened by caller.
+
+  RETURN
+    A lock structure pointer on success.
+    NULL on error or if some tables should be reopened.
+*/
+
 MYSQL_LOCK *mysql_lock_tables(THD *thd, TABLE **tables, uint count,
                               uint flags, bool *need_reopen)
 {
-  MYSQL_LOCK *sql_lock;
   TABLE *write_lock_used;
-  int rc;
-
-  DBUG_ENTER("mysql_lock_tables");
+  MYSQL_LOCK *sql_lock;
+  DBUG_ENTER("mysql_lock_tables(tables)");
 
   *need_reopen= FALSE;
-
   if (mysql_lock_tables_check(thd, tables, count, flags))
-    DBUG_RETURN (NULL);
+    DBUG_RETURN(NULL);
 
-  for (;;)
+  if (!(sql_lock= get_lock_data(thd, tables, count, GET_LOCK_STORE_LOCKS,
+                                &write_lock_used)) ||
+      ! sql_lock->table_count)
+    DBUG_RETURN(sql_lock);
+
+  if (mysql_lock_tables(thd, sql_lock, write_lock_used != 0, flags,
+                        need_reopen))
   {
-    if (! (sql_lock= get_lock_data(thd, tables, count, GET_LOCK_STORE_LOCKS,
-                                   &write_lock_used)) ||
-        ! sql_lock->table_count)
-      break;
+    /* Clear the lock type of all lock data to avoid reusage. */
+    reset_lock_data(sql_lock, 1);
+    my_free(sql_lock, MYF(0));
+    sql_lock= 0;
+  }
+  DBUG_RETURN(sql_lock);
+}
 
-    if (global_read_lock && write_lock_used &&
-        ! (flags & MYSQL_LOCK_IGNORE_GLOBAL_READ_LOCK))
+
+/**
+   Lock a table based on a MYSQL_LOCK structure.
+
+   mysql_lock_tables()
+
+   @param thd			The current thread.
+   @param sql_lock		Tables that should be locked
+   @param write_lock_used 	1 if any of the tables are write locked
+   @param flags			See mysql_lock_tables()
+   @param need_reopen           Out parameter, TRUE if some tables were altered
+                                or deleted and should be reopened by caller.
+
+   @return 0   ok
+   @return 1  error
+*/
+
+bool mysql_lock_tables(THD *thd, MYSQL_LOCK *sql_lock,
+                       bool write_lock_used,
+                       uint flags, bool *need_reopen)
+{
+  int rc;
+  bool error= 1;
+  DBUG_ENTER("mysql_lock_tables(sql_lock)");
+
+  *need_reopen= FALSE;
+  for (;;)
+  {
+    if (write_lock_used && !(flags & MYSQL_LOCK_IGNORE_GLOBAL_READ_LOCK))
     {
-      /*
-	Someone has issued LOCK ALL TABLES FOR READ and we want a write lock
-	Wait until the lock is gone
-      */
-      if (wait_if_global_read_lock(thd, 1, 1))
+      if (global_read_lock)
       {
-        /* Clear the lock type of all lock data to avoid reusage. */
-        reset_lock_data(sql_lock);
-	my_free((uchar*) sql_lock,MYF(0));
-	sql_lock=0;
-	break;
+        /*
+          Someone has issued LOCK ALL TABLES FOR READ and we want a write lock
+          Wait until the lock is gone
+        */
+        if (wait_if_global_read_lock(thd, 1, 1))
+          break;
+        if (thd->version != refresh_version)
+          goto retry;
       }
-      if (thd->version != refresh_version)
+
+      if (opt_readonly &&
+          !(thd->security_ctx->master_access & SUPER_ACL) &&
+          !thd->slave_thread)
       {
-        /* Clear the lock type of all lock data to avoid reusage. */
-        reset_lock_data(sql_lock);
-	my_free((uchar*) sql_lock,MYF(0));
-	goto retry;
+        /*
+          Someone has issued SET GLOBAL READ_ONLY=1 and we want a write lock.
+          We do not wait for READ_ONLY=0, and fail.
+        */
+        my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
+        break;
       }
     }
 
-    if (!(flags & MYSQL_LOCK_IGNORE_GLOBAL_READ_ONLY) &&
-        write_lock_used &&
-        opt_readonly &&
-        !(thd->security_ctx->master_access & SUPER_ACL) &&
-        !thd->slave_thread)
-    {
-      /*
-	Someone has issued SET GLOBAL READ_ONLY=1 and we want a write lock.
-        We do not wait for READ_ONLY=0, and fail.
-      */
-      reset_lock_data(sql_lock);
-      my_free((uchar*) sql_lock, MYF(0));
-      sql_lock=0;
-      my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
-      break;
-    }
-
     thd_proc_info(thd, "System lock");
-    DBUG_PRINT("info", ("thd->proc_info %s", thd->proc_info));
     if (lock_external(thd, sql_lock->table, sql_lock->table_count))
-    {
-      /* Clear the lock type of all lock data to avoid reusage. */
-      reset_lock_data(sql_lock);
-      my_free((uchar*) sql_lock,MYF(0));
-      sql_lock=0;
       break;
-    }
     thd_proc_info(thd, "Table lock");
-    DBUG_PRINT("info", ("thd->proc_info %s", thd->proc_info));
-    thd->locked=1;
     /* Copy the lock data array. thr_multi_lock() reorders its contens. */
     memcpy(sql_lock->locks + sql_lock->lock_count, sql_lock->locks,
            sql_lock->lock_count * sizeof(*sql_lock->locks));
@@ -277,70 +284,66 @@ MYSQL_LOCK *mysql_lock_tables(THD *thd, TABLE **tables, uint count,
                                                      sql_lock->lock_count,
                                                      sql_lock->lock_count,
                                                      thd->lock_id)];
-    if (rc > 1)                                 /* a timeout or a deadlock */
+    if (rc)                                     /* Locking failed */
     {
       VOID(unlock_external(thd, sql_lock->table, sql_lock->table_count));
-      my_error(rc, MYF(0));
-      my_free((uchar*) sql_lock,MYF(0));
-      sql_lock= 0;
-      break;
-    }
-    else if (rc == 1)                           /* aborted */
-    {
-      /*
-        reset_lock_data is required here. If thr_multi_lock fails it
-        resets lock type for tables, which were locked before (and
-        including) one that caused error. Lock type for other tables
-        preserved.
-      */
-      reset_lock_data(sql_lock);
-      thd->some_tables_deleted=1;		// Try again
-      sql_lock->lock_count= 0;                  // Locks are already freed
+      if (rc > 1)
+      {
+        /* a timeout or a deadlock */
+        my_error(rc, MYF(0));
+        break;
+      }
+      /* We were aborted and should try again from the upper level */
+      thd->some_tables_deleted= 1;
     }
-    else if (!thd->some_tables_deleted || (flags & MYSQL_LOCK_IGNORE_FLUSH))
+    else
     {
       /*
-        Thread was killed or lock aborted. Let upper level close all
-        used tables and retry or give error.
+        Lock worked. Now check that nothing happened while we were waiting
+        to get the lock that would require us to free it.
       */
-      thd->locked=0;
-      break;
-    }
-    else if (!thd->open_tables)
-    {
-      // Only using temporary tables, no need to unlock
-      thd->some_tables_deleted=0;
-      thd->locked=0;
-      break;
+      error= 0;
+      if (!thd->some_tables_deleted || (flags & MYSQL_LOCK_IGNORE_FLUSH))
+      {
+        /*
+          Table was not signaled for deletion or we don't care if it was.
+          Return with table as locked.
+        */
+        break;
+      }
+      else if (!thd->open_tables && !(flags & MYSQL_LOCK_NOT_TEMPORARY))
+      {
+        /*
+          Only using temporary tables, no need to unlock.
+          We need the flag as open_tables is not enough to distinguish if
+          we are only using temporary tables for tables used through
+          the HANDLER interface.
+
+          We reset some_tables_deleted as it doesn't make sense to have this
+          one when we are only using temporary tables.
+        */
+        thd->some_tables_deleted=0;
+        break;
+      }
+      /* some table was altered or deleted. reopen tables marked deleted */
+      error= 1;
+      mysql_unlock_tables(thd, sql_lock, 0);
     }
-    thd_proc_info(thd, 0);
 
-    /* some table was altered or deleted. reopen tables marked deleted */
-    mysql_unlock_tables(thd,sql_lock);
-    thd->locked=0;
 retry:
-    sql_lock=0;
     if (flags & MYSQL_LOCK_NOTIFY_IF_NEED_REOPEN)
     {
       *need_reopen= TRUE;
       break;
     }
     if (wait_for_tables(thd))
-      break;					// Couldn't open tables
-  }
-  thd_proc_info(thd, 0);
-  if (thd->killed)
-  {
-    thd->send_kill_message();
-    if (sql_lock)
-    {
-      mysql_unlock_tables(thd,sql_lock);
-      sql_lock=0;
-    }
+      break;                                // Couldn't open tables
+    reset_lock_data(sql_lock, 0);           // Set org locks and retry
   }
 
+  thd_proc_info(thd, 0);
   thd->set_time_after_lock();
-  DBUG_RETURN (sql_lock);
+  DBUG_RETURN(error);
 }
 
 
@@ -380,15 +383,15 @@ static int lock_external(THD *thd, TABLE **tables, uint count)
   DBUG_RETURN(0);
 }
 
-
-void mysql_unlock_tables(THD *thd, MYSQL_LOCK *sql_lock)
+void mysql_unlock_tables(THD *thd, MYSQL_LOCK *sql_lock, bool free_lock)
 {
   DBUG_ENTER("mysql_unlock_tables");
   if (sql_lock->table_count)
     VOID(unlock_external(thd,sql_lock->table,sql_lock->table_count));
   if (sql_lock->lock_count)
     thr_multi_unlock(sql_lock->locks,sql_lock->lock_count, 0);
-  my_free((uchar*) sql_lock,MYF(0));
+  if (free_lock)
+    my_free((uchar*) sql_lock,MYF(0));
   DBUG_VOID_RETURN;
 }
 
@@ -847,12 +850,12 @@ static int unlock_external(THD *thd, TABLE **table,uint count)
   @param write_lock_used   Store pointer to last table with WRITE_ALLOW_WRITE
 */
 
-static MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table_ptr, uint count,
-				 uint flags, TABLE **write_lock_used)
+MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table_ptr, uint count,
+                          uint flags, TABLE **write_lock_used)
 {
   uint i,tables,lock_count;
   MYSQL_LOCK *sql_lock;
-  THR_LOCK_DATA **locks, **locks_buf, **locks_start;
+  THR_LOCK_DATA **locks, **locks_buf;
   TABLE **to, **table_buf;
   DBUG_ENTER("get_lock_data");
 
@@ -891,7 +894,7 @@ static MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table_ptr, uint count,
   {
     TABLE *table;
     enum thr_lock_type lock_type;
-
+    THR_LOCK_DATA **locks_start;
     if ((table=table_ptr[i])->s->tmp_table == NON_TRANSACTIONAL_TMP_TABLE)
       continue;
     lock_type= table->reginfo.lock_type;
@@ -904,12 +907,11 @@ static MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table_ptr, uint count,
 	my_error(ER_OPEN_AS_READONLY,MYF(0),table->alias.c_ptr());
         /* Clear the lock type of the lock data that are stored already. */
         sql_lock->lock_count= (uint) (locks - sql_lock->locks);
-        reset_lock_data(sql_lock);
+        reset_lock_data(sql_lock, 1);
 	my_free((uchar*) sql_lock,MYF(0));
 	DBUG_RETURN(0);
       }
     }
-    THR_LOCK_DATA **org_locks = locks;
     locks_start= locks;
     locks= table->file->store_lock(thd, locks,
                                    (flags & GET_LOCK_UNLOCK) ? TL_IGNORE :
@@ -922,11 +924,14 @@ static MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table_ptr, uint count,
     }
     *to++= table;
     if (locks)
-      for ( ; org_locks != locks ; org_locks++)
+    {
+      for ( ; locks_start != locks ; locks_start++)
       {
-	(*org_locks)->debug_print_param= (void *) table;
-	(*org_locks)->lock->name=         table->alias.c_ptr();
+	(*locks_start)->debug_print_param= (void *) table;
+	(*locks_start)->lock->name=         table->alias.c_ptr();
+	(*locks_start)->org_type=           (*locks_start)->type;
       }
+    }
   }
   /*
     We do not use 'tables', because there are cases where store_lock()
@@ -967,10 +972,13 @@ static MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table_ptr, uint count,
   Clear the lock type of all lock data. This ensures that the next
   lock request will set its lock type properly.
 
-  @param sql_lock                  The MySQL lock.
+  @param sql_lock               The MySQL lock.
+  @param unlock			If set, then set lock type to TL_UNLOCK,
+  				otherwise set to original lock type from
+				get_store_lock().
 */
 
-static void reset_lock_data(MYSQL_LOCK *sql_lock)
+void reset_lock_data(MYSQL_LOCK *sql_lock, bool unlock)
 {
   THR_LOCK_DATA **ldata;
   THR_LOCK_DATA **ldata_end;
@@ -978,10 +986,7 @@ static void reset_lock_data(MYSQL_LOCK *sql_lock)
   for (ldata= sql_lock->locks, ldata_end= ldata + sql_lock->lock_count;
        ldata < ldata_end;
        ldata++)
-  {
-    /* Reset lock type. */
-    (*ldata)->type= TL_UNLOCK;
-  }
+    (*ldata)->type= unlock ? TL_UNLOCK : (*ldata)->org_type;
 }
 
 
diff --git a/sql/log.cc b/sql/log.cc
index 5c72ce96890..73eeb5e6dba 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -1899,6 +1899,7 @@ static int find_uniq_filename(char *name)
   size_t		buf_length, length;
   char			*start, *end;
   DBUG_ENTER("find_uniq_filename");
+  LINT_INIT(number);
 
   length= dirname_part(buff, name, &buf_length);
   start=  name + length;
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
new file mode 100644
index 00000000000..2bea804beef
--- /dev/null
+++ b/sql/multi_range_read.cc
@@ -0,0 +1,1750 @@
+#include "mysql_priv.h"
+#include <my_bit.h>
+#include "sql_select.h"
+
+/****************************************************************************
+ * Default MRR implementation (MRR to non-MRR converter)
+ ***************************************************************************/
+
+/**
+  Get cost and other information about MRR scan over a known list of ranges
+
+  Calculate estimated cost and other information about an MRR scan for given
+  sequence of ranges.
+
+  @param keyno           Index number
+  @param seq             Range sequence to be traversed
+  @param seq_init_param  First parameter for seq->init()
+  @param n_ranges_arg    Number of ranges in the sequence, or 0 if the caller
+                         can't efficiently determine it
+  @param bufsz    INOUT  IN:  Size of the buffer available for use
+                         OUT: Size of the buffer that is expected to be actually
+                              used, or 0 if buffer is not needed.
+  @param flags    INOUT  A combination of HA_MRR_* flags
+  @param cost     OUT    Estimated cost of MRR access
+
+  @note
+    This method (or an overriding one in a derived class) must check for
+    thd->killed and return HA_POS_ERROR if it is not zero. This is required
+    for a user to be able to interrupt the calculation by killing the
+    connection/query.
+
+  @retval
+    HA_POS_ERROR  Error or the engine is unable to perform the requested
+                  scan. Values of OUT parameters are undefined.
+  @retval
+    other         OK, *cost contains cost of the scan, *bufsz and *flags
+                  contain scan parameters.
+*/
+
+ha_rows 
+handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                     void *seq_init_param, uint n_ranges_arg,
+                                     uint *bufsz, uint *flags, COST_VECT *cost)
+{
+  KEY_MULTI_RANGE range;
+  range_seq_t seq_it;
+  ha_rows rows, total_rows= 0;
+  uint n_ranges=0;
+  THD *thd= current_thd;
+  
+  /* Default MRR implementation doesn't need buffer */
+  *bufsz= 0;
+
+  seq_it= seq->init(seq_init_param, n_ranges, *flags);
+  while (!seq->next(seq_it, &range))
+  {
+    if (unlikely(thd->killed != 0))
+      return HA_POS_ERROR;
+    
+    n_ranges++;
+    key_range *min_endp, *max_endp;
+    if (range.range_flag & GEOM_FLAG)
+    {
+      /* In this case tmp_min_flag contains the handler-read-function */
+      range.start_key.flag= (ha_rkey_function) (range.range_flag ^ GEOM_FLAG);
+      min_endp= &range.start_key;
+      max_endp= NULL;
+    }
+    else
+    {
+      min_endp= range.start_key.length? &range.start_key : NULL;
+      max_endp= range.end_key.length? &range.end_key : NULL;
+    }
+    if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE))
+      rows= 1; /* there can be at most one row */
+    else
+    {
+      if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp, 
+                                                        max_endp)))
+      {
+        /* Can't scan one range => can't do MRR scan at all */
+        total_rows= HA_POS_ERROR;
+        break;
+      }
+    }
+    total_rows += rows;
+  }
+  
+  if (total_rows != HA_POS_ERROR)
+  {
+    /* The following calculation is the same as in multi_range_read_info(): */
+    *flags |= HA_MRR_USE_DEFAULT_IMPL;
+    cost->zero();
+    cost->avg_io_cost= 1; /* assume random seeks */
+    if ((*flags & HA_MRR_INDEX_ONLY) && total_rows > 2)
+      cost->io_count= keyread_time(keyno, n_ranges, (uint)total_rows);
+    else
+      cost->io_count= read_time(keyno, n_ranges, total_rows);
+    cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01;
+  }
+  return total_rows;
+}
+
+
+/**
+  Get cost and other information about MRR scan over some sequence of ranges
+
+  Calculate estimated cost and other information about an MRR scan for some
+  sequence of ranges.
+
+  The ranges themselves will be known only at execution phase. When this
+  function is called we only know number of ranges and a (rough) E(#records)
+  within those ranges.
+
+  Currently this function is only called for "n-keypart singlepoint" ranges,
+  i.e. each range is "keypart1=someconst1 AND ... AND keypartN=someconstN"
+
+  The flags parameter is a combination of those flags: HA_MRR_SORTED,
+  HA_MRR_INDEX_ONLY, HA_MRR_NO_ASSOCIATION, HA_MRR_LIMITS.
+
+  @param keyno           Index number
+  @param n_ranges        Estimated number of ranges (i.e. intervals) in the
+                         range sequence.
+  @param n_rows          Estimated total number of records in all the ranges
+  @param key_parts       Not used by this default implementation
+  @param bufsz    INOUT  IN:  Size of the buffer available for use
+                         OUT: Size of the buffer that will be actually used, or
+                              0 if buffer is not needed.
+  @param flags    INOUT  A combination of HA_MRR_* flags
+  @param cost     OUT    Estimated cost of MRR access
+
+  @retval
+    0     OK, *cost contains cost of the scan, *bufsz and *flags contain scan
+          parameters.
+  @retval
+    other Error or can't perform the requested scan
+*/
+
+ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
+                                       uint key_parts, uint *bufsz, 
+                                       uint *flags, COST_VECT *cost)
+{
+  /*
+    Currently we expect this function to be called only in preparation of scan
+    with HA_MRR_SINGLE_POINT property, so check that the bit is really set.
+  */
+  DBUG_ASSERT(*flags & HA_MRR_SINGLE_POINT);
+
+  *bufsz= 0; /* Default implementation doesn't need a buffer */
+  *flags |= HA_MRR_USE_DEFAULT_IMPL;
+
+  cost->zero();
+  cost->avg_io_cost= 1; /* assume random seeks */
+
+  /* Produce the same cost as non-MRR code does */
+  if (*flags & HA_MRR_INDEX_ONLY)
+    cost->io_count= keyread_time(keyno, n_ranges, n_rows);
+  else
+    cost->io_count= read_time(keyno, n_ranges, n_rows);
+  return 0;
+}
+
+
+/**
+  Initialize the MRR scan
+
+  Initialize the MRR scan. This function may do heavyweight scan 
+  initialization like row prefetching/sorting/etc (NOTE: but better not do
+  it here as we may not need it, e.g. if we never satisfy WHERE clause on
+  previous tables. For many implementations it would be natural to do such
+  initializations in the first multi_range_read_next() call)
+
+  mode is a combination of the following flags: HA_MRR_SORTED,
+  HA_MRR_INDEX_ONLY, HA_MRR_NO_ASSOCIATION 
+
+  @param seq_funcs       Range sequence to be traversed
+  @param seq_init_param  First parameter for seq->init()
+  @param n_ranges        Number of ranges in the sequence
+  @param mode            Flags, see the description section for the details
+  @param buf             INOUT: memory buffer to be used
+
+  @note
+    One must have called index_init() before calling this function. Several
+    multi_range_read_init() calls may be made in course of one query.
+
+    Until WL#2623 is done (see its text, section 3.2), the following will 
+    also hold:
+    The caller will guarantee that if "seq->init == mrr_ranges_array_init"
+    then seq_init_param is an array of n_ranges KEY_MULTI_RANGE structures.
+    This property will only be used by NDB handler until WL#2623 is done.
+     
+    Buffer memory management is done according to the following scenario:
+    The caller allocates the buffer and provides it to the callee by filling
+    the members of HANDLER_BUFFER structure.
+    The callee consumes all or some fraction of the provided buffer space, and
+    sets the HANDLER_BUFFER members accordingly.
+    The callee may use the buffer memory until the next multi_range_read_init()
+    call is made, all records have been read, or until index_end() call is
+    made, whichever comes first.
+
+  @retval 0  OK
+  @retval 1  Error
+*/
+
+int
+handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
+                               uint n_ranges, uint mode, HANDLER_BUFFER *buf)
+{
+  DBUG_ENTER("handler::multi_range_read_init");
+  mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
+  mrr_funcs= *seq_funcs;
+  mrr_is_output_sorted= test(mode & HA_MRR_SORTED);
+  mrr_have_range= FALSE;
+  DBUG_RETURN(0);
+}
+
+/**
+  Get next record in MRR scan
+
+  Default MRR implementation: read the next record
+
+  @param range_info  OUT  Undefined if HA_MRR_NO_ASSOCIATION flag is in effect
+                          Otherwise, the opaque value associated with the range
+                          that contains the returned record.
+
+  @retval 0      OK
+  @retval other  Error code
+*/
+
+int handler::multi_range_read_next(range_id_t *range_info)
+{
+  int result= HA_ERR_END_OF_FILE;
+  bool range_res;
+  DBUG_ENTER("handler::multi_range_read_next");
+
+  if (!mrr_have_range)
+  {
+    mrr_have_range= TRUE;
+    goto start;
+  }
+
+  do
+  {
+    /* Save a call if there can be only one row in range. */
+    if (mrr_cur_range.range_flag != (UNIQUE_RANGE | EQ_RANGE))
+    {
+      result= read_range_next();
+      /* On success or non-EOF errors jump to the end. */
+      if (result != HA_ERR_END_OF_FILE)
+        break;
+    }
+    else
+    {
+      if (was_semi_consistent_read())
+      {
+        /*
+          The following assignment is redundant, but for extra safety and to
+          remove the compiler warning:
+        */
+        range_res= FALSE;
+        goto scan_it_again;
+      }
+      /*
+        We need to set this for the last range only, but checking this
+        condition is more expensive than just setting the result code.
+      */
+      result= HA_ERR_END_OF_FILE;
+    }
+
+start:
+    /* Try the next range(s) until one matches a record. */
+    while (!(range_res= mrr_funcs.next(mrr_iter, &mrr_cur_range)))
+    {
+scan_it_again:
+      result= read_range_first(mrr_cur_range.start_key.keypart_map ?
+                                 &mrr_cur_range.start_key : 0,
+                               mrr_cur_range.end_key.keypart_map ?
+                                 &mrr_cur_range.end_key : 0,
+                               test(mrr_cur_range.range_flag & EQ_RANGE),
+                               mrr_is_output_sorted);
+      if (result != HA_ERR_END_OF_FILE)
+        break;
+    }
+  }
+  while ((result == HA_ERR_END_OF_FILE) && !range_res);
+
+  *range_info= mrr_cur_range.ptr;
+  DBUG_PRINT("exit",("handler::multi_range_read_next result %d", result));
+  DBUG_RETURN(result);
+}
+
+/****************************************************************************
+ * Mrr_*_reader classes (building blocks for DS-MRR)
+ ***************************************************************************/
+
+int Mrr_simple_index_reader::init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+                                  void *seq_init_param, uint n_ranges,
+                                  uint mode,  Key_parameters *key_par_arg,
+                                  Lifo_buffer *key_buffer_arg,
+                                  Buffer_manager *buf_manager_arg)
+{
+  HANDLER_BUFFER no_buffer = {NULL, NULL, NULL};
+  file= h_arg;
+  return file->handler::multi_range_read_init(seq_funcs, seq_init_param,
+                                              n_ranges, mode, &no_buffer);
+}
+
+
+int Mrr_simple_index_reader::get_next(range_id_t *range_info)
+{
+  int res;
+  while (!(res= file->handler::multi_range_read_next(range_info)))
+  {
+    KEY_MULTI_RANGE *curr_range= &file->handler::mrr_cur_range;
+    if (!file->mrr_funcs.skip_index_tuple ||
+        !file->mrr_funcs.skip_index_tuple(file->mrr_iter, curr_range->ptr))
+      break;
+  }
+  if (res && res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
+    file->print_error(res, MYF(0));             // Fatal error
+  return res;
+}
+
+
+/**
+  @brief Get next index record
+
+  @param range_info  OUT identifier of range that the returned record belongs to
+  
+  @note
+    We actually iterate over nested sequences:
+    - an ordered sequence of groups of identical keys
+      - each key group has key value, which has multiple matching records 
+        - thus, each record matches all members of the key group
+
+  @retval 0                   OK, next record was successfully read
+  @retval HA_ERR_END_OF_FILE  End of records
+  @retval Other               Some other error; Error is printed
+*/
+
+int Mrr_ordered_index_reader::get_next(range_id_t *range_info)
+{
+  int res;
+  DBUG_ENTER("Mrr_ordered_index_reader::get_next");
+  
+  for(;;)
+  {
+    if (!scanning_key_val_iter)
+    {
+      while ((res= kv_it.init(this))) /* retry until an iterator is set up */
+      {
+        if ((res != HA_ERR_KEY_NOT_FOUND && res != HA_ERR_END_OF_FILE))
+          DBUG_RETURN(res); /* Some fatal error */
+
+        if (key_buffer->is_empty())
+        {
+          DBUG_RETURN(HA_ERR_END_OF_FILE); /* no keys left in the buffer */
+        }
+      }
+      scanning_key_val_iter= TRUE;
+    }
+
+    if ((res= kv_it.get_next(range_info)))
+    {
+      scanning_key_val_iter= FALSE; /* next pass re-runs kv_it.init() */
+      if ((res != HA_ERR_KEY_NOT_FOUND && res != HA_ERR_END_OF_FILE))
+        DBUG_RETURN(res); /* Some fatal error */
+      kv_it.move_to_next_key_value();
+      continue;
+    }
+    if (!skip_index_tuple(*range_info) &&
+        !skip_record(*range_info, NULL))
+    {
+      break; /* neither filter rejected the combination: return it */
+    }
+    /* Go get another (record, range_id) combination */
+  } /* for(;;) */
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Supply index reader with the O(1) space it needs for scan interrupt/restore
+  operation
+*/
+
+bool Mrr_ordered_index_reader::set_interruption_temp_buffer(uint rowid_length,
+                                                            uint key_len, 
+                                                            uint saved_pk_len,
+                                                            uchar **space_start,
+                                                            uchar *space_end)
+{
+  if (space_end - *space_start <= (ptrdiff_t)(rowid_length + key_len + saved_pk_len))
+    return TRUE;
+  support_scan_interruptions= TRUE; 
+  
+  saved_rowid= *space_start;
+  *space_start += rowid_length;
+  
+  if (saved_pk_len)
+  {
+    saved_primary_key= *space_start;
+    *space_start += saved_pk_len;
+  }
+  else
+    saved_primary_key= NULL;
+
+  saved_key_tuple= *space_start;
+  *space_start += key_len;
+
+  have_saved_rowid= FALSE;
+  return FALSE;
+}
+
+void Mrr_ordered_index_reader::set_no_interruption_temp_buffer()
+{
+  support_scan_interruptions= FALSE;
+  saved_key_tuple= saved_rowid= saved_primary_key= NULL; /* safety */
+  have_saved_rowid= FALSE;
+}
+
+void Mrr_ordered_index_reader::interrupt_read()
+{
+  DBUG_ASSERT(support_scan_interruptions);
+  TABLE *table= file->get_table();
+  /* Save the current key value */
+  key_copy(saved_key_tuple, table->record[0],
+           &table->key_info[file->active_index],
+           keypar.key_tuple_length);
+  
+  if (saved_primary_key)
+  {
+    key_copy(saved_primary_key, table->record[0], 
+             &table->key_info[table->s->primary_key],
+             table->key_info[table->s->primary_key].key_length);
+  }
+
+  /* Save the last rowid */
+  memcpy(saved_rowid, file->ref, file->ref_length);
+  have_saved_rowid= TRUE;
+}
+
+void Mrr_ordered_index_reader::position()
+{
+  if (have_saved_rowid)
+    memcpy(file->ref, saved_rowid, file->ref_length);
+  else
+    Mrr_index_reader::position();
+}
+
+void Mrr_ordered_index_reader::resume_read()
+{
+  TABLE *table= file->get_table();
+  key_restore(table->record[0], saved_key_tuple, 
+              &table->key_info[file->active_index],
+              keypar.key_tuple_length);
+  if (saved_primary_key)
+  {
+    key_restore(table->record[0], saved_primary_key, 
+                &table->key_info[table->s->primary_key],
+                table->key_info[table->s->primary_key].key_length);
+  }
+}
+
+
+/**
+  Fill the buffer with (lookup_tuple, range_id) pairs and sort
+*/
+
+int Mrr_ordered_index_reader::refill_buffer(bool initial)
+{
+  KEY_MULTI_RANGE cur_range;
+  DBUG_ENTER("Mrr_ordered_index_reader::refill_buffer");
+
+  DBUG_ASSERT(key_buffer->is_empty());
+
+  if (source_exhausted)
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
+
+  buf_manager->reset_buffer_sizes(buf_manager->arg);
+  key_buffer->reset();
+  key_buffer->setup_writing(keypar.key_size_in_keybuf,
+                            is_mrr_assoc? sizeof(range_id_t) : 0);
+
+  while (key_buffer->can_write() && 
+         !(source_exhausted= mrr_funcs.next(mrr_iter, &cur_range)))
+  {
+    DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
+
+    /* Put key, or {key, range_id} pair into the buffer */
+    key_buffer->write_ptr1= keypar.use_key_pointers ?
+                              (uchar*)&cur_range.start_key.key : 
+                              (uchar*)cur_range.start_key.key;
+    key_buffer->write_ptr2= (uchar*)&cur_range.ptr;
+    key_buffer->write();
+  }
+  
+  /* Force get_next() to start with kv_it.init() call: */
+  scanning_key_val_iter= FALSE;
+
+  if (source_exhausted && key_buffer->is_empty())
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
+
+  key_buffer->sort((key_buffer->type() == Lifo_buffer::FORWARD)? 
+                     (qsort2_cmp)Mrr_ordered_index_reader::compare_keys_reverse : 
+                     (qsort2_cmp)Mrr_ordered_index_reader::compare_keys, 
+                   this);
+  DBUG_RETURN(0);
+}
+
+
+int Mrr_ordered_index_reader::init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
+                                   void *seq_init_param, uint n_ranges,
+                                   uint mode, Key_parameters *key_par_arg,
+                                   Lifo_buffer *key_buffer_arg,
+                                   Buffer_manager *buf_manager_arg)
+{
+  file= h_arg;
+  key_buffer= key_buffer_arg;
+  buf_manager= buf_manager_arg;
+  keypar= *key_par_arg;
+
+  KEY *key_info= &file->get_table()->key_info[file->active_index];
+  keypar.index_ranges_unique= test(key_info->flags & HA_NOSAME && 
+                                   key_info->key_parts == 
+                                   my_count_bits(keypar.key_tuple_map));
+
+  mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
+  is_mrr_assoc=    !test(mode & HA_MRR_NO_ASSOCIATION);
+  mrr_funcs= *seq_funcs;
+  source_exhausted= FALSE;
+  if (support_scan_interruptions)
+    bzero(saved_key_tuple, keypar.key_tuple_length);
+  have_saved_rowid= FALSE;
+  return 0;
+}
+
+
+static int rowid_cmp_reverse(void *file, uchar *a, uchar *b)
+{
+  return - ((handler*)file)->cmp_ref(a, b);
+}
+
+
+int Mrr_ordered_rndpos_reader::init(handler *h_arg, 
+                                    Mrr_index_reader *index_reader_arg,
+                                    uint mode,
+                                    Lifo_buffer *buf)
+{
+  file= h_arg;
+  index_reader= index_reader_arg;
+  rowid_buffer= buf;
+  is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
+  index_reader_exhausted= FALSE;
+  index_reader_needs_refill= TRUE;
+  return 0;
+}
+
+
+/**
+  DS-MRR: Fill and sort the rowid buffer
+
+  Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into 
+  buffer. When the buffer is full or scan is completed, sort the buffer by 
+  rowid and return.
+
+  When this function returns, either rowid buffer is not empty, or the source
+  of lookup keys (i.e. ranges) is exhausted.
+  
+  @retval 0      OK, the next portion of rowids is in the buffer,
+                 properly ordered
+  @retval other  Error
+*/
+
+int Mrr_ordered_rndpos_reader::refill_buffer(bool initial)
+{
+  int res; /* set on every path: by the loop condition or by the loop body */
+  DBUG_ENTER("Mrr_ordered_rndpos_reader::refill_buffer");
+
+  if (index_reader_exhausted)
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
+
+  while (initial || index_reader_needs_refill || 
+         (res= refill_from_index_reader()) == HA_ERR_END_OF_FILE)
+  {
+    if ((res= index_reader->refill_buffer(initial)))
+    {
+      if (res == HA_ERR_END_OF_FILE)
+        index_reader_exhausted= TRUE; /* later calls return EOF at once */
+      break; /* res (EOF or another error) is returned to the caller */
+    }
+    initial= FALSE;
+    index_reader_needs_refill= FALSE; /* index reader has just been refilled */
+  }
+  DBUG_RETURN(res);
+}
+
+
+void Mrr_index_reader::position()
+{
+  file->position(file->get_table()->record[0]);
+}
+
+
+/* 
+  @brief Try to refill the rowid buffer without calling
+  index_reader->refill_buffer(). 
+*/
+
+int Mrr_ordered_rndpos_reader::refill_from_index_reader()
+{
+  range_id_t range_info;
+  int res;
+  DBUG_ENTER("Mrr_ordered_rndpos_reader::refill_from_index_reader");
+
+  DBUG_ASSERT(rowid_buffer->is_empty());
+  index_rowid= index_reader->get_rowid_ptr();
+  rowid_buffer->reset();
+  rowid_buffer->setup_writing(file->ref_length,
+                              is_mrr_assoc? sizeof(range_id_t) : 0);
+
+  last_identical_rowid= NULL;
+
+  index_reader->resume_read();
+  while (rowid_buffer->can_write())
+  {
+    res= index_reader->get_next(&range_info);
+
+    if (res)
+    {
+      if (res != HA_ERR_END_OF_FILE)
+        DBUG_RETURN(res);
+      index_reader_needs_refill=TRUE;
+      break;
+    }
+
+    index_reader->position();
+
+    /* Put rowid, or {rowid, range_id} pair into the buffer */
+    rowid_buffer->write_ptr1= index_rowid;
+    rowid_buffer->write_ptr2= (uchar*)&range_info;
+    rowid_buffer->write();
+  }
+   
+  index_reader->interrupt_read();
+  /* Sort the buffer contents by rowid */
+  rowid_buffer->sort((qsort2_cmp)rowid_cmp_reverse, (void*)file);
+
+  rowid_buffer->setup_reading(file->ref_length,
+                              is_mrr_assoc ? sizeof(range_id_t) : 0);
+  DBUG_RETURN(rowid_buffer->is_empty()? HA_ERR_END_OF_FILE : 0);
+}
+
+
+/*
+  Get the next {record, range_id} using ordered array of rowid+range_id pairs
+
+  @note
+    Since we have sorted rowids, we try not to make multiple rnd_pos() calls
+    with the same rowid value.
+*/
+
+int Mrr_ordered_rndpos_reader::get_next(range_id_t *range_info)
+{
+  int res;
+  
+  /* 
+    First, check if rowid buffer has elements with the same rowid value as
+    the previous.
+  */
+  while (last_identical_rowid)
+  {
+    /*
+      Current record (the one we've returned in previous call) was obtained
+      from a rowid that matched multiple range_ids. Return this record again,
+      with next matching range_id.
+    */
+    (void)rowid_buffer->read();
+
+    if (rowid_buffer->read_ptr1 == last_identical_rowid)
+      last_identical_rowid= NULL; /* reached the last of identical rowids */
+
+    if (!is_mrr_assoc)
+      return 0;
+
+    memcpy(range_info, rowid_buffer->read_ptr2, sizeof(range_id_t));
+    if (!index_reader->skip_record(*range_info, rowid_buffer->read_ptr1))
+      return 0;
+  }
+  
+  /* 
+     Ok, last_identical_rowid==NULL, it's time to read next different rowid
+     value and get record for it.
+  */
+  for(;;)
+  {
+    /* Return eof if there are no rowids in the buffer after re-fill attempt */
+    if (rowid_buffer->read())
+      return HA_ERR_END_OF_FILE;
+
+    if (is_mrr_assoc)
+    {
+      memcpy(range_info, rowid_buffer->read_ptr2, sizeof(range_id_t));
+      if (index_reader->skip_record(*range_info, rowid_buffer->read_ptr1))
+        continue;
+    }
+
+    res= file->ha_rnd_pos(file->get_table()->record[0], 
+                          rowid_buffer->read_ptr1);
+
+    if (res == HA_ERR_RECORD_DELETED)
+    {
+      /* not likely to get this code with current storage engines, but still */
+      continue;
+    }
+
+    if (res)
+      return res; /* Some fatal error */
+
+    break; /* Got another record */
+  }
+
+  /* 
+    Check if subsequent buffer elements have the same rowid value as this
+    one. If yes, remember this fact so that we don't make any more rnd_pos()
+    calls with this value.
+
+    Note: this implies that SQL layer doesn't touch table->record[0]
+    between calls.
+  */
+  Lifo_buffer_iterator it;
+  it.init(rowid_buffer);
+  while (!it.read())
+  {
+    if (file->cmp_ref(it.read_ptr1, rowid_buffer->read_ptr1))
+      break;
+    last_identical_rowid= it.read_ptr1;
+  }
+  return 0;
+}
+
+
+/****************************************************************************
+ * Top-level DS-MRR implementation functions (the ones called by storage engine)
+ ***************************************************************************/
+
+/**
+  DS-MRR: Initialize and start MRR scan
+
+  Initialize and start the MRR scan. Depending on the mode parameter, this
+  may use default or DS-MRR implementation.
+
+  @param h_arg           Table handler to be used
+  @param key             Index to be used
+  @param seq_funcs       Interval sequence enumeration functions
+  @param seq_init_param  Interval sequence enumeration parameter
+  @param n_ranges        Number of ranges in the sequence.
+  @param mode            HA_MRR_* modes to use
+  @param buf             INOUT Buffer to use
+
+  @retval 0     Ok, Scan started.
+  @retval other Error
+*/
+
+int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+                           void *seq_init_param, uint n_ranges, uint mode,
+                           HANDLER_BUFFER *buf)
+{
+  THD *thd= current_thd;
+  int res;
+  Key_parameters keypar;
+  uint key_buff_elem_size;
+  handler *h_idx;
+  Mrr_ordered_rndpos_reader *disk_strategy= NULL;
+  bool do_sort_keys= FALSE;
+  DBUG_ENTER("DsMrr_impl::dsmrr_init");
+  LINT_INIT(key_buff_elem_size); /* set/used when do_sort_keys==TRUE */
+  /*
+    index_merge may invoke a scan on an object for which dsmrr_info[_const]
+    has not been called, so set the owner handler here as well.
+  */
+  primary_file= h_arg;
+  is_mrr_assoc=    !test(mode & HA_MRR_NO_ASSOCIATION);
+
+  strategy_exhausted= FALSE;
+  
+  /* By default, have do-nothing buffer manager */
+  buf_manager.arg= this;
+  buf_manager.reset_buffer_sizes= do_nothing;
+  buf_manager.redistribute_buffer_space= do_nothing;
+
+  if (mode & (HA_MRR_USE_DEFAULT_IMPL | HA_MRR_SORTED))
+    goto use_default_impl;
+  
+  /*
+    Determine whether we'll need to do key sorting and/or rnd_pos() scan
+  */
+  index_strategy= NULL;
+  if ((mode & HA_MRR_SINGLE_POINT) &&
+      optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS))
+  {
+    do_sort_keys= TRUE;
+    index_strategy= &reader_factory.ordered_index_reader;
+  }
+  else
+    index_strategy= &reader_factory.simple_index_reader;
+
+  strategy= index_strategy;
+  /*
+    We don't need a rowid-to-rndpos step if
+     - We're doing a scan on clustered primary key
+     - [In the future] We're doing an index_only read
+  */
+  DBUG_ASSERT(primary_file->inited == handler::INDEX || 
+              (primary_file->inited == handler::RND && 
+               secondary_file && 
+               secondary_file->inited == handler::INDEX));
+
+  h_idx= (primary_file->inited == handler::INDEX)? primary_file: secondary_file;
+  keyno= h_idx->active_index;
+
+  if (!(keyno == table->s->primary_key && h_idx->primary_key_is_clustered()))
+  {
+    strategy= disk_strategy= &reader_factory.ordered_rndpos_reader;
+  }
+
+  if (is_mrr_assoc)
+    status_var_increment(thd->status_var.ha_multi_range_read_init_count);
+
+  full_buf= buf->buffer;
+  full_buf_end= buf->buffer_end;
+
+  if (do_sort_keys)
+  {
+    /* Pre-calculate some parameters of key sorting */
+    keypar.use_key_pointers= test(mode & HA_MRR_MATERIALIZED_KEYS);
+    seq_funcs->get_key_info(seq_init_param, &keypar.key_tuple_length, 
+                            &keypar.key_tuple_map);
+    keypar.key_size_in_keybuf= keypar.use_key_pointers? 
+                                 sizeof(char*) : keypar.key_tuple_length;
+    key_buff_elem_size= keypar.key_size_in_keybuf + (int)is_mrr_assoc * sizeof(void*);
+    
+    /* Ordered index reader needs some space to store an index tuple */
+    if (strategy != index_strategy)
+    {
+      uint saved_pk_length=0;
+      if (h_idx->primary_key_is_clustered())
+      {
+        uint pk= h_idx->get_table()->s->primary_key;
+        saved_pk_length= h_idx->get_table()->key_info[pk].key_length;
+      }
+
+      if (reader_factory.ordered_index_reader.
+            set_interruption_temp_buffer(primary_file->ref_length,
+                                         keypar.key_tuple_length,
+                                         saved_pk_length,
+                                         &full_buf, full_buf_end))
+        goto use_default_impl;
+    }
+    else
+      reader_factory.ordered_index_reader.set_no_interruption_temp_buffer();
+  }
+
+  if (strategy == index_strategy)
+  {
+    /* 
+      Index strategy alone handles the record retrieval. Give all buffer space
+      to it. Key buffer should have forward orientation so we can return the
+      end of it.
+    */
+    key_buffer= &forward_key_buf;
+    key_buffer->set_buffer_space(full_buf, full_buf_end);
+    
+    /* Safety: specify that rowid buffer has zero size: */
+    rowid_buffer.set_buffer_space(full_buf_end, full_buf_end);
+
+    if (do_sort_keys && !key_buffer->have_space_for(key_buff_elem_size))
+      goto use_default_impl;
+
+    if ((res= index_strategy->init(primary_file, seq_funcs, seq_init_param, n_ranges,
+                                   mode, &keypar, key_buffer, &buf_manager)))
+      goto error;
+  }
+  else
+  {
+    /* We'll have both index and rndpos strategies working together */
+    if (do_sort_keys)
+    {
+      /* Both strategies will need buffer space, share the buffer */
+      if (setup_buffer_sharing(keypar.key_size_in_keybuf, keypar.key_tuple_map))
+        goto use_default_impl;
+
+      buf_manager.reset_buffer_sizes= reset_buffer_sizes;
+      buf_manager.redistribute_buffer_space= redistribute_buffer_space;
+    }
+    else
+    {
+      /* index strategy doesn't need buffer, give all space to rowids*/
+      rowid_buffer.set_buffer_space(full_buf, full_buf_end);
+      if (!rowid_buffer.have_space_for(primary_file->ref_length + 
+                                       (int)is_mrr_assoc * sizeof(range_id_t)))
+        goto use_default_impl;
+    }
+
+    if ((res= setup_two_handlers()))
+      goto error;
+
+    if ((res= index_strategy->init(secondary_file, seq_funcs, seq_init_param,
+                                   n_ranges, mode, &keypar, key_buffer, 
+                                   &buf_manager)) || 
+        (res= disk_strategy->init(primary_file, index_strategy, mode, 
+                                  &rowid_buffer)))
+    {
+      goto error;
+    }
+  }
+
+  res= strategy->refill_buffer(TRUE);
+  if (res)
+  {
+    if (res != HA_ERR_END_OF_FILE)
+      goto error;
+    strategy_exhausted= TRUE;
+  }
+
+  /*
+    If we have scanned through all intervals in *seq, then adjust *buf to 
+    indicate that the remaining buffer space will not be used.
+  */
+//  if (dsmrr_eof) 
+//    buf->end_of_used_area= rowid_buffer.end_of_space();
+
+  
+  DBUG_RETURN(0);
+error:
+  close_second_handler();
+   /* Safety, not really needed but: */
+  strategy= NULL;
+  DBUG_RETURN(res);
+
+use_default_impl:
+  if (primary_file->inited != handler::INDEX)
+  {
+    /* We can get here when 
+       - we've previously successfully done a DS-MRR scan (and so have 
+         secondary_file!= NULL, secondary_file->inited= INDEX, 
+         primary_file->inited=RND)
+       - for this invocation, we haven't got enough buffer space, and so we
+         have to use the default MRR implementation.
+
+      note: primary_file->ha_index_end() will call dsmrr_close() which will
+      close/destroy the secondary_file, this is intentional. 
+      (Yes this is slow, but one can't expect performance with join buffer 
+       so small that it can accomodate one rowid and one index tuple)
+    */
+    if ((res= primary_file->ha_rnd_end()) || 
+        (res= primary_file->ha_index_init(keyno, test(mode & HA_MRR_SORTED))))
+    {
+      DBUG_RETURN(res);
+    }
+  }
+  /* Call correct init function and assign to top level object */
+  Mrr_simple_index_reader *s= &reader_factory.simple_index_reader;
+  res= s->init(primary_file, seq_funcs, seq_init_param, n_ranges, mode, NULL, 
+               NULL, NULL);
+  strategy= s;
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Whatever the current state is, make it so that we have two handler objects:
+  - primary_file       -  initialized for rnd_pos() scan
+  - secondary_file     -  a clone of primary_file, initialized for scanning
+                          the index specified in this->keyno
+  RETURN 
+    0        OK
+    other    1 if the clone could not be set up, else a handler error code
+*/
+
+int DsMrr_impl::setup_two_handlers()
+{
+  int res;
+  THD *thd= primary_file->get_table()->in_use;
+  DBUG_ENTER("DsMrr_impl::setup_two_handlers");
+  if (!secondary_file)
+  {
+    handler *new_h2;
+    Item *pushed_cond= NULL;
+    DBUG_ASSERT(primary_file->inited == handler::INDEX);
+    /* First use: no second handler exists yet, so clone primary_file. */
+    /*
+      ::clone() takes up a lot of stack, especially on 64 bit platforms.
+      The constant 5 is an empiric result.
+    */
+    if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2))
+      DBUG_RETURN(1);
+
+    /* Create the clone (the future secondary_file) and read-lock it. */
+    if (!(new_h2= primary_file->clone(thd->mem_root)) || 
+        new_h2->ha_external_lock(thd, F_RDLCK))
+    {
+      delete new_h2;
+      DBUG_RETURN(1);
+    }
+    /* Remember an index condition pushed down for this key, if any */
+    if (keyno == primary_file->pushed_idx_cond_keyno)
+      pushed_cond= primary_file->pushed_idx_cond;
+    
+    Mrr_reader *save_strategy= strategy;
+    strategy= NULL;
+    /*
+      Caution: this call will invoke this->dsmrr_close(), which deletes
+      this->secondary_file and resets this->strategy. So new_h2 is stored in
+      secondary_file only afterwards and the picked strategy is restored.
+    */
+    res= primary_file->ha_index_end();
+
+    strategy= save_strategy;
+    secondary_file= new_h2;
+
+    if (res || (res= (primary_file->ha_rnd_init(FALSE))))
+      goto error;
+
+    table->prepare_for_position();
+    secondary_file->extra(HA_EXTRA_KEYREAD);
+    secondary_file->mrr_iter= primary_file->mrr_iter;
+
+    if ((res= secondary_file->ha_index_init(keyno, FALSE)))
+      goto error;
+    /* The clone scans the index now, so it gets the pushed condition */
+    if (pushed_cond)
+      secondary_file->idx_cond_push(keyno, pushed_cond);
+  }
+  else
+  {
+    DBUG_ASSERT(secondary_file && secondary_file->inited==handler::INDEX);
+    /* 
+      We get here when the access alternates between MRR scan(s) and non-MRR
+      scans.
+
+      Calling primary_file->index_end() will invoke dsmrr_close() for this
+      object, which will delete secondary_file. We need to keep it, so put it
+      away and don't let it be deleted:
+    */
+    if (primary_file->inited == handler::INDEX)
+    {
+      handler *save_h2= secondary_file;
+      Mrr_reader *save_strategy= strategy;
+      secondary_file= NULL;
+      strategy= NULL;
+      res= primary_file->ha_index_end();
+      secondary_file= save_h2;
+      strategy= save_strategy;
+      if (res)
+        goto error;
+    }
+    if ((primary_file->inited != handler::RND) && 
+        (res= primary_file->ha_rnd_init(FALSE)))
+      goto error;
+  }
+  DBUG_RETURN(0);
+  /* No cleanup here; the caller's error path calls close_second_handler() */
+error:
+  DBUG_RETURN(res);
+}
+
+
+void DsMrr_impl::close_second_handler()
+{
+  if (!secondary_file)
+    return;                             /* No cloned handler to dispose of */
+  /* End the scan, release the lock, then close and free the handler */
+  secondary_file->ha_index_or_rnd_end();
+  secondary_file->ha_external_lock(current_thd, F_UNLCK);
+  secondary_file->close();
+  delete secondary_file;
+  secondary_file= NULL;
+}
+
+
+void DsMrr_impl::dsmrr_close()
+{
+  DBUG_ENTER("DsMrr_impl::dsmrr_close");
+  close_second_handler();  /* unlock, close and delete secondary_file if set */
+  strategy= NULL;          /* no reader strategy is selected any more */
+  DBUG_VOID_RETURN;
+}
+
+
+/* 
+  Key tuple comparator with the signature my_qsort2 expects (static member).
+
+  arg is the Mrr_ordered_index_reader; key1_arg/key2_arg are key buffer
+  elements (tuples, or pointers to tuples when use_key_pointers is set).
+*/
+
+int Mrr_ordered_index_reader::compare_keys(void* arg, uchar* key1_arg, 
+                                           uchar* key2_arg)
+{
+  Mrr_ordered_index_reader *self= (Mrr_ordered_index_reader*)arg;
+  handler *h= self->file;
+  uchar *lhs= key1_arg;
+  uchar *rhs= key2_arg;
+
+  if (self->keypar.use_key_pointers)
+  {
+    /* Buffer elements are pointers to the tuples: load the real addresses */
+    memcpy(&lhs, key1_arg, sizeof(char*));
+    memcpy(&rhs, key2_arg, sizeof(char*));
+  }
+
+  /* Compare using the key parts of the index the handler is scanning */
+  return key_tuple_cmp(h->get_table()->key_info[h->active_index].key_part,
+                       lhs, rhs, self->keypar.key_tuple_length);
+}
+
+
+int Mrr_ordered_index_reader::compare_keys_reverse(void* arg, uchar* key1, 
+                                                   uchar* key2)
+{
+  return -compare_keys(arg, key1, key2); /* negated => reverse key order */
+}
+
+
+/**
+  Split [full_buf, full_buf_end) between the rowid buffer and the key buffer
+
+  @param key_size_in_keybuf  Bytes one key occupies in the key buffer
+  @param key_tuple_map       Key parts used; picks the rec_per_key[] entry
+
+  @return FALSE  ok 
+  @return TRUE   Too little space for one rowid element plus one key element,
+                 the strategy can't be used (mainly targeted at testing).
+*/
+
+bool DsMrr_impl::setup_buffer_sharing(uint key_size_in_keybuf, 
+                                      key_part_map key_tuple_map)
+{
+  long key_buff_elem_size= key_size_in_keybuf + 
+                           (int)is_mrr_assoc * sizeof(range_id_t);
+  
+  KEY *key_info= &primary_file->get_table()->key_info[keyno];
+  /* 
+    Ok if we got here we need to allocate one part of the buffer 
+    for keys and another part for rowids.
+  */
+  ulonglong rowid_buf_elem_size= primary_file->ref_length + 
+                                 (int)is_mrr_assoc * sizeof(range_id_t);
+  
+  /*
+    Use rec_per_key statistics as a basis to find out how many rowids 
+    we'll get for each key value.
+     TODO: what should be the default value when there are no statistics?
+     NOTE(review): parts - 1 assumes key_tuple_map is non-zero - confirm.
+  */
+  uint parts= my_count_bits(key_tuple_map);
+  ulong rpc;
+  ulonglong rowids_size= rowid_buf_elem_size;
+  if ((rpc= key_info->rec_per_key[parts - 1]))
+    rowids_size= rowid_buf_elem_size * rpc;
+  /* Rowids get the share rowids_size / (rowids_size + key element size) */
+  double fraction_for_rowids=
+    (ulonglong2double(rowids_size) / 
+     (ulonglong2double(rowids_size) + key_buff_elem_size));
+
+  ptrdiff_t bytes_for_rowids= 
+    (ptrdiff_t)floor(0.5 + fraction_for_rowids * (full_buf_end - full_buf));
+  
+  ptrdiff_t bytes_for_keys= (full_buf_end - full_buf) - bytes_for_rowids;
+  /* Give the key part at least one element plus one byte, taken from rowids */
+  if (bytes_for_keys < key_buff_elem_size + 1)
+  {
+    ptrdiff_t add= key_buff_elem_size + 1 - bytes_for_keys;
+    bytes_for_keys= key_buff_elem_size + 1;
+    bytes_for_rowids -= add;
+  }
+  /* Likewise guarantee the rowid part its minimum, at the key part's expense */
+  if (bytes_for_rowids < (ptrdiff_t)rowid_buf_elem_size + 1)
+  {
+    ptrdiff_t add= (ptrdiff_t)(rowid_buf_elem_size + 1 - bytes_for_rowids);
+    bytes_for_rowids= (ptrdiff_t)rowid_buf_elem_size + 1;
+    bytes_for_keys -= add;
+  }
+  /* Rowids take the front of the buffer, backward_key_buf gets the rest */
+  rowid_buffer_end= full_buf + bytes_for_rowids;
+  rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end);
+  key_buffer= &backward_key_buf;
+  key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end); 
+
+  if (!key_buffer->have_space_for(key_buff_elem_size) ||
+      !rowid_buffer.have_space_for((size_t)rowid_buf_elem_size))
+    return TRUE; /* Failed to provide minimum space for one of the buffers */
+
+  return FALSE;
+}
+
+
+void DsMrr_impl::do_nothing(void *dsmrr_arg)
+{
+  /* Intentionally empty: dsmrr_arg is ignored and no state is changed */
+}
+
+
+void DsMrr_impl::reset_buffer_sizes(void *dsmrr_arg)
+{
+  /* Restore the rowid/key split: rowids first, keys after rowid_buffer_end */
+  DsMrr_impl *self= static_cast<DsMrr_impl*>(dsmrr_arg);
+  self->rowid_buffer.set_buffer_space(self->full_buf, self->rowid_buffer_end);
+  self->key_buffer->set_buffer_space(self->rowid_buffer_end,
+                                     self->full_buf_end);
+}
+
+
+/*
+  Hand the part of the key buffer that is not in use over to the rowid buffer
+*/
+
+void DsMrr_impl::redistribute_buffer_space(void *dsmrr_arg)
+{
+  DsMrr_impl *self= static_cast<DsMrr_impl*>(dsmrr_arg);
+  uchar *spare_begin, *spare_end;
+  self->key_buffer->remove_unused_space(&spare_begin, &spare_end);
+  self->rowid_buffer.grow(spare_begin, spare_end);
+}
+
+
+/*
+  @brief Initialize the iterator
+  @param owner_arg  Reader that supplies the key buffer and the handler
+  @note
+  Initialize the iterator to produce matches for the key of the first element 
+  in owner_arg->key_buffer
+
+  @retval  0                    OK
+  @retval  HA_ERR_END_OF_FILE   Either the owner->key_buffer is empty or 
+                                no matches for the key we've tried (check
+                                key_buffer->is_empty() to tell these apart)
+  @retval  other code           Fatal error
+*/
+
+int Key_value_records_iterator::init(Mrr_ordered_index_reader *owner_arg)
+{
+  int res;
+  owner= owner_arg;
+  /* Elements are read as a key part plus, if is_mrr_assoc, a pointer-sized part */
+  identical_key_it.init(owner->key_buffer);
+  owner->key_buffer->setup_reading(owner->keypar.key_size_in_keybuf,
+                                   owner->is_mrr_assoc ? sizeof(void*) : 0);
+  /* Nothing could be read from the key buffer: nothing to iterate over */
+  if (identical_key_it.read())
+    return HA_ERR_END_OF_FILE;
+
+  uchar *key_in_buf= last_identical_key_ptr= identical_key_it.read_ptr1;
+  /* With use_key_pointers the element stores the address of the key tuple */
+  uchar *index_tuple= key_in_buf;
+  if (owner->keypar.use_key_pointers)
+    memcpy(&index_tuple, key_in_buf, sizeof(char*));
+  
+  /* Check out how many more identical keys are following */
+  while (!identical_key_it.read())
+  {
+    if (Mrr_ordered_index_reader::compare_keys(owner, key_in_buf, 
+                                               identical_key_it.read_ptr1))
+      break;
+    last_identical_key_ptr= identical_key_it.read_ptr1;
+  }
+  identical_key_it.init(owner->key_buffer); /* restart for get_next() */
+  res= owner->file->ha_index_read_map(owner->file->get_table()->record[0], 
+                                      index_tuple, 
+                                      owner->keypar.key_tuple_map, 
+                                      HA_READ_KEY_EXACT);
+
+  if (res)
+  {
+    /* Failed to find any matching records: skip this key value's keys */
+    move_to_next_key_value();
+    return res;
+  }
+  owner->have_saved_rowid= FALSE;
+  get_next_row= FALSE; /* the record just read still has to be returned */
+  return 0;
+}
+
+
+int Key_value_records_iterator::get_next(range_id_t *range_info)
+{
+  /*
+    Return the next (index record, range_id) pair for the current key value;
+    each matching record is paired with every identical buffered key.
+    Returns 0, HA_ERR_END_OF_FILE (no more records) or a handler error code.
+  */
+  int res;
+  if (get_next_row)
+  {
+    /* We're using a full unique key, no point to call index_next_same */
+    if (owner->keypar.index_ranges_unique)
+      return HA_ERR_END_OF_FILE;
+    /*
+      With use_key_pointers a buffer element holds the address of the key
+      tuple (init() and compare_keys() dereference it too), so load the real
+      key instead of handing the pointer bytes to ha_index_next_same().
+    */
+    handler *h= owner->file;
+    uchar *lookup_key= identical_key_it.read_ptr1;
+    if (owner->keypar.use_key_pointers)
+      memcpy(&lookup_key, identical_key_it.read_ptr1, sizeof(char*));
+    /* A non-zero result is either HA_ERR_END_OF_FILE or some other error */
+    if ((res= h->ha_index_next_same(h->get_table()->record[0], lookup_key,
+                                    owner->keypar.key_tuple_length)))
+      return res;
+    /* New record: walk the identical keys from the start again */
+    identical_key_it.init(owner->key_buffer);
+    owner->have_saved_rowid= FALSE;
+    get_next_row= FALSE;
+  }
+
+  identical_key_it.read(); /* This gets us next range_id */
+  memcpy(range_info, identical_key_it.read_ptr2, sizeof(range_id_t));
+  /* Last identical key consumed: read the next index record next time */
+  if (!last_identical_key_ptr || 
+      (identical_key_it.read_ptr1 == last_identical_key_ptr))
+    get_next_row= TRUE;
+  return 0;
+}
+
+
+void Key_value_records_iterator::move_to_next_key_value()
+{
+  while (!owner->key_buffer->read())   /* skip the current key value's keys */
+    if (owner->key_buffer->read_ptr1 == last_identical_key_ptr) break;
+}
+
+
+/**
+  DS-MRR implementation: multi_range_read_next() function.
+
+  Follows the multi_range_read_next() calling convention.
+*/
+
+int DsMrr_impl::dsmrr_next(range_id_t *range_info)
+{
+  if (strategy_exhausted)
+    return HA_ERR_END_OF_FILE;
+
+  for (;;)
+  {
+    /* Only an EOF from get_next() followed by a clean refill loops again */
+    int rc= strategy->get_next(range_info);
+    if (rc != HA_ERR_END_OF_FILE || (rc= strategy->refill_buffer(FALSE)))
+      return rc; /* A row, an error, or EOF/error from the refill */
+  }
+}
+
+
+/**
+  DS-MRR implementation: multi_range_read_info() function.
+  On return *flags, *bufsz and *cost describe the implementation that was
+  chosen (default MRR or DS-MRR). Always returns 0.
+*/
+ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, 
+                               uint key_parts,
+                               uint *bufsz, uint *flags, COST_VECT *cost)
+{  
+  ha_rows res;
+  uint def_flags= *flags;
+  uint def_bufsz= *bufsz;
+  /* Get cost/flags/mem_usage of default MRR implementation */
+  res= primary_file->handler::multi_range_read_info(keyno, n_ranges, rows,
+                                                    key_parts, &def_bufsz, 
+                                                    &def_flags, cost);
+  DBUG_ASSERT(!res);
+
+  /* Pass the caller's flags/bufsz so that a DS-MRR choice is reported back */
+  if ((*flags & HA_MRR_USE_DEFAULT_IMPL) || 
+      choose_mrr_impl(keyno, rows, flags, bufsz, cost))
+  {
+    DBUG_PRINT("info", ("Default MRR implementation choosen"));
+    *flags= def_flags;
+    *bufsz= def_bufsz;
+  }
+  else
+  {
+    DBUG_PRINT("info", ("DS-MRR implementation choosen"));
+  }
+  return 0;
+}
+
+
+/**
+  DS-MRR Implementation: multi_range_read_info_const() function
+
+  @return The row estimate of handler::multi_range_read_info_const()
+*/
+
+ha_rows DsMrr_impl::dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                 void *seq_init_param, uint n_ranges, 
+                                 uint *bufsz, uint *flags, COST_VECT *cost)
+{
+  /* The default implementation is costed on copies of the in/out values */
+  uint default_flags= *flags;
+  uint default_bufsz= *bufsz;
+  const ha_rows rows=
+    primary_file->handler::multi_range_read_info_const(keyno, seq, 
+                                                       seq_init_param,
+                                                       n_ranges, 
+                                                       &default_bufsz, 
+                                                       &default_flags, cost);
+
+  /* Default implementation can't perform MRR scan => we can't either */
+  if (rows == HA_POS_ERROR)
+    return rows;
+
+  /*
+    DS-MRR is used only if the caller did not order the default
+    implementation with HA_MRR_USE_DEFAULT_IMPL (needed for UPDATE/DELETE)
+    and the cost and @@optimizer_use_mrr based choice favours it.
+  */
+  if (!(*flags & HA_MRR_USE_DEFAULT_IMPL) &&
+      !choose_mrr_impl(keyno, rows, flags, bufsz, cost))
+  {
+    /* *flags and *bufsz were set by choose_mrr_impl */
+    DBUG_PRINT("info", ("DS-MRR implementation choosen"));
+    return rows;
+  }
+
+  DBUG_PRINT("info", ("Default MRR implementation choosen"));
+  *flags= default_flags;
+  *bufsz= default_bufsz;
+  return rows;
+}
+
+
+/**
+  Check if key has partially-covered columns
+
+  We can't use DS-MRR to perform range scans when the ranges are over
+  partially-covered keys, because we'll not have full key part values
+  (we'll have their prefixes from the index) and will not be able to check
+  if we've reached the end of the range.
+
+  @param keyno  Key to check
+
+  @todo
+    Allow use of DS-MRR in cases where the index has partially-covered
+    components but they are not used for scanning.
+
+  @retval TRUE   Yes
+  @retval FALSE  No
+*/
+
+bool key_uses_partial_cols(TABLE *table, uint keyno)
+{
+  const KEY *key= &table->key_info[keyno];
+  for (uint i= 0; i < key->key_parts; i++)
+  {
+    /* The field of this key part isn't flagged as part of the key keyno */
+    if (!key->key_part[i].field->part_of_key.is_set(keyno))
+      return TRUE;
+  }
+  return FALSE;
+}
+
+
+/*
+  Check if key/flags allow DS-MRR/CPK strategy to be used
+  
+  @param thd
+  @param keyno      Index that will be used
+  @param  mrr_flags  
+  
+  @retval TRUE   DS-MRR/CPK should be used
+  @retval FALSE  Otherwise
+*/
+
+bool DsMrr_impl::check_cpk_scan(THD *thd, uint keyno, uint mrr_flags)
+{
+  if (!(mrr_flags & HA_MRR_SINGLE_POINT) || keyno != table->s->primary_key)
+    return FALSE;      /* Not single-point lookups, or not the primary key */
+  return test(primary_file->primary_key_is_clustered() &&
+              optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS));
+}
+
+
+/*
+  DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
+
+  Make the choice between using Default MRR implementation and DS-MRR.
+  This function contains common functionality factored out of dsmrr_info()
+  and dsmrr_info_const(). The function assumes that the default MRR
+  implementation's applicability requirements are satisfied.
+
+  @param keyno       Index number
+  @param rows        E(full rows to be retrieved)
+  @param flags  IN   MRR flags provided by the MRR user
+                OUT  If DS-MRR is chosen, flags of DS-MRR implementation
+                     else HA_MRR_USE_DEFAULT_IMPL may have been added
+  @param bufsz  IN   Size of the buffer available for the scan
+                OUT  If DS-MRR is chosen, buffer use of DS-MRR implementation
+                     else unspecified, the caller must not rely on it
+  @param cost   IN   Cost of default MRR implementation
+                OUT  If DS-MRR is chosen, cost of DS-MRR scan
+                     else the value is not modified
+
+  @retval TRUE   Default MRR implementation should be used
+  @retval FALSE  DS-MRR implementation should be used
+*/
+
+
+bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
+                                 uint *bufsz, COST_VECT *cost)
+{
+  COST_VECT dsmrr_cost;
+  THD *thd= current_thd;
+
+  bool doing_cpk_scan= check_cpk_scan(thd, keyno, *flags); 
+  bool using_cpk= test(keyno == table->s->primary_key &&
+                       primary_file->primary_key_is_clustered());
+  if (thd->variables.optimizer_use_mrr == 2 || *flags & HA_MRR_INDEX_ONLY ||
+      (using_cpk && !doing_cpk_scan) || key_uses_partial_cols(table, keyno))
+  {
+    /* Use the default implementation */
+    *flags |= HA_MRR_USE_DEFAULT_IMPL;
+    return TRUE;
+  }
+
+  /*
+    key_length + ref_length bytes are set aside before costing. A buffer
+    smaller than that can't serve DS-MRR: report it, rather than let the
+    unsigned subtraction wrap and cost the scan as if the buffer were huge.
+  */
+  uint add_len= table->key_info[keyno].key_length + primary_file->ref_length; 
+  if (*bufsz < add_len)
+    return TRUE;
+  *bufsz -= add_len;
+  if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
+    return TRUE;
+  *bufsz += add_len;
+  
+  /* 
+    If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
+    DS-MRR and Default implementations cost. This allows one to force use of
+    DS-MRR whenever it is applicable without affecting other cost-based
+    choices.
+  */
+  bool force_dsmrr= (thd->variables.optimizer_use_mrr == 1);
+  if (force_dsmrr && dsmrr_cost.total_cost() > cost->total_cost())
+    dsmrr_cost= *cost;
+
+  /* Without force, DS-MRR is used only if it costs no more than the default */
+  if (!force_dsmrr && !(dsmrr_cost.total_cost() <= cost->total_cost()))
+    return TRUE;
+
+  *flags &= ~HA_MRR_USE_DEFAULT_IMPL;  /* Use the DS-MRR implementation */
+  *flags &= ~HA_MRR_SORTED;          /* We will return unordered output */
+  *cost= dsmrr_cost;
+  if ((*flags & HA_MRR_SINGLE_POINT) && 
+       optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS))
+    *flags |= HA_MRR_MATERIALIZED_KEYS;
+  return FALSE;
+}
+
+
+static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost);
+
+
+/**
+  Get cost of DS-MRR scan
+
+  @param keynr              Index to be used
+  @param rows               E(Number of rows to be scanned)
+  @param flags              Scan parameters (HA_MRR_* flags)
+  @param buffer_size INOUT  Buffer size
+  @param cost        OUT    The cost
+
+  @retval FALSE  OK
+  @retval TRUE   Error, DS-MRR cannot be used (the buffer is too small
+                 for even 1 rowid)
+*/
+
+bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
+                                         uint *buffer_size, COST_VECT *cost)
+{
+  ulong max_buff_entries, elem_size;
+  ha_rows rows_in_full_step, rows_in_last_step;
+  uint n_full_steps;
+  double index_read_cost;
+
+  elem_size= primary_file->ref_length + 
+             sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
+  max_buff_entries = *buffer_size / elem_size;
+
+  if (!max_buff_entries)
+    return TRUE; /* Buffer has not enough space for even 1 rowid */
+
+  /* Number of iterations we'll make with full buffer */
+  n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries);
+  
+  /* 
+    Get numbers of rows we'll be processing in 
+     - non-last sweep, with full buffer 
+     - last iteration, with non-full buffer
+  */
+  rows_in_full_step= max_buff_entries;
+  rows_in_last_step= rows % max_buff_entries;
+  /* Full-buffer steps: cost one step of rows_in_full_step rows and multiply
+     it by their number (costing all of 'rows' per step would overcharge) */
+  if (n_full_steps)
+  {
+    get_sort_and_sweep_cost(table, rows_in_full_step, cost);
+    cost->multiply(n_full_steps);
+  }
+  else
+  {
+    cost->zero();     /* No full steps; only the buffer size is adjusted */
+    *buffer_size= max(*buffer_size, 
+                      (size_t)(1.2*rows_in_last_step) * elem_size + 
+                      primary_file->ref_length + table->key_info[keynr].key_length);
+  }
+  
+  COST_VECT last_step_cost;
+  get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
+  cost->add(&last_step_cost);
+ 
+  if (n_full_steps != 0)
+    cost->mem_cost= *buffer_size;
+  else
+    cost->mem_cost= (double)rows_in_last_step * elem_size;
+  
+  /* Total cost of all index accesses */
+  index_read_cost= primary_file->keyread_time(keynr, 1, rows);
+  cost->add_io(index_read_cost, 1 /* Random seeks */);
+  return FALSE;
+}
+
+
+/* 
+  Estimate the cost of a single sort-and-sweep step
+  
+  A step does two things:
+   - qsort an array of #nrows ROWIDs
+   - fetch #nrows table records in one sweep.
+
+  @param table       Table being accessed
+  @param nrows       Number of rows to be sorted and retrieved
+  @param cost   OUT  The cost of the step (zeroed when nrows is 0)
+*/
+
+static 
+void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost)
+{
+  if (!nrows)
+  {
+    cost->zero();
+    return;
+  }
+
+  get_sweep_read_cost(table, nrows, FALSE, cost);
+  /* Add the qsort term x*log2(x), x= nrows/TIME_FOR_COMPARE_ROWID, x >= 3 */
+  double n_cmp= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
+  n_cmp= (n_cmp < 3) ? 3 : n_cmp;
+  cost->cpu_cost += n_cmp * log2(n_cmp);
+}
+
+
+/**
+  Get cost of reading nrows table records in a "disk sweep"
+
+  A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that made
+  for an ordered sequence of rowids.
+
+  We assume hard disk IO. The read is performed as follows:
+
+   1. The disk head is moved to the needed cylinder
+   2. The controller waits for the plate to rotate
+   3. The data is transferred
+
+  Time to do #3 is insignificant compared to #2+#1.
+
+  Time to move the disk head is proportional to head travel distance.
+
+  Time to wait for the plate to rotate depends on whether the disk head
+  was moved or not. 
+
+  If disk head wasn't moved, the wait time is proportional to distance
+  between the previous block and the block we're reading.
+
+  If the head was moved, we don't know how much we'll need to wait for the
+  plate to rotate. We assume the wait time to be a variate with a mean of
+  0.5 of full rotation time.
+
+  Our cost units are "random disk seeks". The cost of random disk seek is
+  actually not a constant, it depends on the range of cylinders we're going
+  to access. We make it constant by introducing a fuzzy concept of "typical 
+  datafile length" (it's fuzzy as it's hard to tell whether it should
+  include index file, temp.tables etc). Then random seek cost is:
+
+    1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
+
+  We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
+
+  @param table             Table to be accessed
+  @param nrows             Number of rows to retrieve
+  @param interrupted       TRUE <=> Assume that the disk sweep will be
+                           interrupted by other disk IO. FALSE - otherwise.
+  @param cost         OUT  The cost.
+*/
+
+void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, 
+                         COST_VECT *cost)
+{
+  DBUG_ENTER("get_sweep_read_cost");
+  cost->zero();
+  if (table->file->primary_key_is_clustered())
+  {
+    cost->io_count= table->file->read_time(table->s->primary_key,
+                                           (uint) nrows, nrows);
+  }
+  else
+  {
+    double n_blocks=
+      ceil(ulonglong2double(table->file->stats.data_file_length) / IO_SIZE);
+    /* data_file_length may be 0; avoid 1.0/0 turning the cost into NaN */
+    if (n_blocks < 1.0)
+      n_blocks= 1.0;
+    double busy_blocks=
+      n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
+    if (busy_blocks < 1.0)
+      busy_blocks= 1.0;
+    DBUG_PRINT("info",("sweep: nblocks=%g, busy_blocks=%g", n_blocks,
+                       busy_blocks));
+    cost->io_count= busy_blocks;
+    if (!interrupted)
+    {
+      /* Assume reading is done in one 'sweep' */
+      cost->avg_io_cost= (DISK_SEEK_BASE_COST +
+                          DISK_SEEK_PROP_COST*n_blocks/busy_blocks);
+    }
+  }
+  DBUG_PRINT("info",("returning cost=%g", cost->total_cost()));
+  DBUG_VOID_RETURN;
+}
+
+
+/* **************************************************************************
+ * DS-MRR implementation ends
+ ***************************************************************************/
+
+
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
new file mode 100644
index 00000000000..08ac1b6f6a4
--- /dev/null
+++ b/sql/multi_range_read.h
@@ -0,0 +1,632 @@
+/**
+  @defgroup DS-MRR declarations
+  @{
+*/
+
+/**
+  A Disk-Sweep implementation of MRR Interface (DS-MRR for short)
+
+  This is a "plugin"(*) for storage engines that makes it possible to
+    1. When doing index scans, read table rows in rowid order;
+    2. when making many index lookups, do them in key order and don't
+       lookup the same key value multiple times;
+    3. Do both #1 and #2, when applicable.
+  These changes are expected to speed up query execution for disk-based 
+  storage engines running io-bound loads and "big" queries (ie. queries that
+  do joins and enumerate lots of records).
+
+  (*) - only conceptually. No dynamic loading or binary compatibility of any
+        kind.
+
+  General scheme of things:
+   
+      SQL Layer code
+       |   |   |
+       v   v   v 
+      -|---|---|---- handler->multi_range_read_XXX() function calls
+       |   |   |
+      _____________________________________
+     / DS-MRR module                       \
+     | (order/de-duplicate lookup keys,    |
+     | scan indexes in key order,          |
+     | order/de-duplicate rowids,          |
+     | retrieve full record reads in rowid |
+     | order)                              |
+     \_____________________________________/
+       |   |   |
+      -|---|---|----- handler->read_range_first()/read_range_next(), 
+       |   |   |      handler->index_read(), handler->rnd_pos() calls.
+       |   |   |
+       v   v   v
+      Storage engine internals
+
+
+  Currently DS-MRR is used by MyISAM, InnoDB/XtraDB and Maria storage engines.
+  Potentially it can be used with any table handler that has disk-based data
+  storage and has better performance when reading data in rowid order.
+*/
+
+#include "sql_lifo_buffer.h"
+
+class DsMrr_impl;
+class Mrr_ordered_index_reader;
+
+
+/* A structure with key parameters that's shared among several classes */
+class Key_parameters
+{
+public:
+  uint         key_tuple_length; /* Length of index lookup tuple, in bytes */
+  key_part_map key_tuple_map;    /* keyparts used in index lookup tuples */
+
+  /*
+    Size in bytes of one key element as stored in the key buffer:
+      = key_tuple_length   if we copy keys to buffer
+      = sizeof(void*)      if we're using pointers to materialized keys.
+  */
+  uint key_size_in_keybuf;
+
+  /* TRUE <=> don't copy key values, use pointers to them instead.  */
+  bool use_key_pointers;
+
+  /* TRUE <=> We can get at most one index tuple for a lookup key */
+  bool index_ranges_unique;
+};
+
+
+/**
+  A class to enumerate (record, range_id) pairs that match given key value.
+  
+  @note
+
+  The idea is that we have a Lifo_buffer which holds (key, range_id) pairs
+  ordered by key value. From the front of the buffer we see
+
+    (key_val1, range_id1), (key_val1, range_id2) ... (key_val2, range_idN)
+
+  we take the first elements that have the same key value (key_val1 in the
+  example above), and make lookup into the table.  The table will have 
+  multiple matches for key_val1:
+ 
+                  == Table Index ==
+                   ...
+     key_val1 ->  key_val1, index_tuple1
+                  key_val1, index_tuple2
+                   ...
+                  key_val1, index_tupleN
+                   ...
+  
+  Our goal is to produce all possible combinations, i.e. we need:
+  
+    {(key_val1, index_tuple1), range_id1}
+    {(key_val1, index_tuple1), range_id2}
+       ...           ...               |
+    {(key_val1, index_tuple1), range_idN},
+                  
+    {(key_val1, index_tuple2), range_id1}
+    {(key_val1, index_tuple2), range_id2}
+        ...          ...               |
+    {(key_val1, index_tuple2), range_idN},
+
+        ...          ...          ...                          
+
+    {(key_val1, index_tupleK), range_idN}
+*/
+
+class Key_value_records_iterator
+{
+  /* Use this to get table handler, key buffer and other parameters */
+  Mrr_ordered_index_reader *owner;
+
+  /* Iterator to get (key, range_id) pairs from */
+  Lifo_buffer_iterator identical_key_it;
+  
+  /* 
+    Last of the identical key values (when we get this pointer from
+    identical_key_it, it will be time to stop).
+  */
+  uchar *last_identical_key_ptr;
+
+  /*
+    FALSE <=> we're right after the init() call: the record has already been
+    read with an owner->file->index_read_map() call
+  */
+  bool get_next_row;
+  
+public:
+  int init(Mrr_ordered_index_reader *owner_arg);
+  int get_next(range_id_t *range_info);  /* next (record, range_id) pair */
+  void move_to_next_key_value(); /* presumably skips rest of this key value */
+};
+
+
+/*
+  Buffer manager interface. Mrr_reader objects use it to ask DsMrr_impl
+  to manage buffer space for them.
+*/
+typedef struct st_buffer_manager
+{
+public:
+  /* Opaque value to be passed as the first argument to the callbacks below */
+  void *arg;
+  
+  /*
+    This is called when we've freed more space from the rowid buffer. The
+    callee will get the unused space from the rowid buffer and give it to the
+    key buffer.
+  */
+  void (*redistribute_buffer_space)(void *arg);
+
+  /* 
+    This is called when both key and rowid buffers are empty, and so it's time 
+    to reset them to their original size (they've lost their original size
+    because we were dynamically growing rowid buffer and shrinking key buffer).
+  */
+  void (*reset_buffer_sizes)(void *arg);
+
+} Buffer_manager;
+
+
+/* 
+  Mrr_reader - DS-MRR execution strategy abstraction
+
+  A reader produces ([index]_record, range_info) pairs, and requires periodic
+  refill operations.
+
+  - one starts using the reader by calling reader->get_next(),
+  - when a get_next() call returns HA_ERR_END_OF_FILE, one must call 
+    refill_buffer() before they can make more get_next() calls.
+  - when refill_buffer() returns HA_ERR_END_OF_FILE, this means the real
+    end of stream and get_next() should not be called anymore.
+
+  Both functions can return other error codes, these mean unrecoverable errors
+  after which one cannot continue.
+*/
+
+class Mrr_reader 
+{
+public:
+  virtual int get_next(range_id_t *range_info) = 0; /* EOF => needs a refill */
+  virtual int refill_buffer(bool initial) = 0; /* EOF => end of stream */
+  virtual ~Mrr_reader() {}; /* just to remove compiler warning */
+};
+
+
+/* 
+  A common base for readers that do index scans and produce index tuples 
+*/
+
+class Mrr_index_reader : public Mrr_reader
+{
+protected:
+  handler *file; /* Handler object to use */
+public:
+  virtual int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+                   void *seq_init_param, uint n_ranges,
+                   uint mode, Key_parameters *key_par, 
+                   Lifo_buffer *key_buffer, 
+                   Buffer_manager *buf_manager_arg) = 0;
+
+  /* Get pointer to the place where every get_next() call will put the rowid */
+  virtual uchar *get_rowid_ptr() = 0;
+  /* Get the rowid (call this after a get_next() call) */
+  virtual void position();
+  virtual bool skip_record(range_id_t range_id, uchar *rowid) = 0;
+
+  virtual void interrupt_read() {}  /* no-op unless overridden */
+  virtual void resume_read() {}     /* no-op unless overridden */
+};
+
+
+/*
+  A "bypass" index reader that just does an index scan. The index scan is done 
+  by calling default MRR implementation (i.e.  handler::multi_range_read_XXX())
+  functions.
+*/
+
+class Mrr_simple_index_reader : public Mrr_index_reader
+{
+public:
+  int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
+           void *seq_init_param, uint n_ranges,
+           uint mode, Key_parameters *key_par,
+           Lifo_buffer *key_buffer,
+           Buffer_manager *buf_manager_arg);
+  int get_next(range_id_t *range_info);
+  int refill_buffer(bool initial) { return initial? 0: HA_ERR_END_OF_FILE; }
+  uchar *get_rowid_ptr() { return file->ref; }
+  bool skip_record(range_id_t range_id, uchar *rowid)
+  {
+    return (file->mrr_funcs.skip_record &&    /* the callback is optional */
+            file->mrr_funcs.skip_record(file->mrr_iter, range_id, rowid));
+  }
+};
+
+
+/* 
+  A reader that sorts the key values before it makes the index lookups.
+*/
+
+class Mrr_ordered_index_reader : public Mrr_index_reader
+{
+public:
+  int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+           void *seq_init_param, uint n_ranges,
+           uint mode, Key_parameters *key_par,
+           Lifo_buffer *key_buffer,
+           Buffer_manager *buf_manager_arg);
+  int get_next(range_id_t *range_info);
+  int refill_buffer(bool initial);
+  uchar *get_rowid_ptr() { return file->ref; }
+  
+  bool skip_record(range_id_t range_info, uchar *rowid)
+  {
+    return (mrr_funcs.skip_record &&          /* the callback is optional */
+            mrr_funcs.skip_record(mrr_iter, range_info, rowid));
+  }
+
+  bool skip_index_tuple(range_id_t range_info)
+  {
+    return (mrr_funcs.skip_index_tuple &&     /* the callback is optional */
+            mrr_funcs.skip_index_tuple(mrr_iter, range_info));
+  }
+  
+  bool set_interruption_temp_buffer(uint rowid_length, uint key_len, 
+                                    uint saved_pk_len,
+                                    uchar **space_start, uchar *space_end);
+  void set_no_interruption_temp_buffer();
+
+  void interrupt_read();
+  void resume_read();
+  void position();
+private:
+  Key_value_records_iterator kv_it;
+
+  bool scanning_key_val_iter; /* presumably: a kv_it scan is in progress */
+  
+  /* Buffer to store (key, range_id) pairs */
+  Lifo_buffer *key_buffer;
+  
+  /* This manages key buffer allocation and sizing for us */
+  Buffer_manager *buf_manager;
+
+  Key_parameters  keypar; /* index scan and lookup tuple parameters */
+
+  /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
+  bool is_mrr_assoc;
+  
+  /* Range sequence iteration members */
+  RANGE_SEQ_IF mrr_funcs;
+  range_seq_t mrr_iter;
+  
+  /* TRUE <=> reached eof when enumerating ranges */
+  bool source_exhausted;
+   
+  /* 
+    Following members are for interrupt_read()/resume_read(). The idea is that 
+    in some cases index scan that is done by this object is interrupted by
+    rnd_pos() calls made by Mrr_ordered_rndpos_reader. The problem is that
+    we're sharing handler->record[0] with that object, and it destroys its
+    contents.
+    We need to save/restore our current
+    - index tuple (for pushed index condition checks)
+    - clustered primary key values (again, for pushed index condition checks)
+    - rowid of the last record we've retrieved (in case this rowid matches
+      multiple ranges and we'll need to return it again)
+  */ 
+  bool support_scan_interruptions;
+  /* Space where we save the rowid of the last record we've returned */
+  uchar *saved_rowid;
+  
+  /* TRUE <=> saved_rowid has the last saved rowid */
+  bool have_saved_rowid;
+  
+  uchar *saved_key_tuple; /* Saved current key tuple */
+  uchar *saved_primary_key; /* Saved current primary key tuple */
+
+  static int compare_keys(void* arg, uchar* key1, uchar* key2);
+  static int compare_keys_reverse(void* arg, uchar* key1, uchar* key2);
+  
+  friend class Key_value_records_iterator; 
+  friend class DsMrr_impl;
+  friend class Mrr_ordered_rndpos_reader;
+};
+
+
+/* 
+  A reader that gets rowids from an Mrr_index_reader, and then sorts them 
+  before getting full records with handler->rnd_pos() calls.
+*/
+
+class Mrr_ordered_rndpos_reader : public Mrr_reader 
+{
+public:
+  int init(handler *file, Mrr_index_reader *index_reader, uint mode,
+           Lifo_buffer *buf);
+  int get_next(range_id_t *range_info);
+  int refill_buffer(bool initial);
+private:
+  handler *file; /* Handler to use */
+  
+  /* This is what we get (rowid, range_info) pairs from */
+  Mrr_index_reader *index_reader;
+
+  /* index_reader->get_next() puts rowid here */
+  uchar *index_rowid;
+  
+  /* TRUE <=> index_reader->refill_buffer() call has returned EOF */
+  bool index_reader_exhausted;
+  
+  /* 
+    TRUE <=> We should call index_reader->refill_buffer(). This happens if
+    1. we've made index_reader->get_next() call which returned EOF
+    2. we haven't made any index_reader calls (and our first call should 
+       be index_reader->refill_buffer(initial=TRUE))
+  */
+  bool index_reader_needs_refill;
+
+  /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
+  bool is_mrr_assoc;
+  
+  /* 
+    When reading from ordered rowid buffer: the rowid element of the last
+    buffer element that has rowid identical to this one.
+  */
+  uchar *last_identical_rowid;
+
+  /* Buffer to store (rowid, range_id) pairs */
+  Lifo_buffer *rowid_buffer;
+  
+  int refill_from_index_reader();
+};
+
+
+/*
+  A primitive "factory" of various Mrr_*_reader classes (the point is to 
+  get various kinds of readers without having to allocate them on the heap)
+*/
+
+class Mrr_reader_factory
+{
+public:
+  Mrr_ordered_rndpos_reader ordered_rndpos_reader; /* sorts rowids */
+  Mrr_ordered_index_reader  ordered_index_reader;  /* sorts lookup keys */
+  Mrr_simple_index_reader   simple_index_reader;   /* "bypass" index scan */
+};
+
+
+/*
+  DS-MRR implementation for one table. Create/use one object of this class for
+  each ha_{myisam/innobase/etc} object. That object will be further referred to
+  as "the handler"
+
+  DsMrr_impl supports the following execution strategies:
+
+  - Bypass DS-MRR, pass all calls to default MRR implementation, which is 
+    an MRR-to-non-MRR call converter.
+  - Key-Ordered Retrieval
+  - Rowid-Ordered Retrieval
+
+  DsMrr_impl will use one of the above strategies, or a combination of them, 
+  according to the following diagram:
+
+         (mrr function calls)
+                |
+                +----------------->-----------------+
+                |                                   |
+     ___________v______________      _______________v________________
+    / default: use lookup keys \    / KEY-ORDERED RETRIEVAL:         \
+    | (or ranges) in whatever  |    | sort lookup keys and then make | 
+    | order they are supplied  |    | index lookups in index order   |
+    \__________________________/    \________________________________/
+              | |  |                           |    |
+      +---<---+ |  +--------------->-----------|----+
+      |         |                              |    |
+      |         |              +---------------+    |
+      |   ______v___ ______    |     _______________v_______________
+      |  / default: read   \   |    / ROWID-ORDERED RETRIEVAL:      \
+      |  | table records   |   |    | Before reading table records, |
+      v  | in random order |   v    | sort their rowids and then    |
+      |  \_________________/   |    | read them in rowid order      |
+      |         |              |    \_______________________________/
+      |         |              |                    |
+      |         |              |                    |
+      +-->---+  |  +----<------+-----------<--------+
+             |  |  |                                
+             v  v  v
+      (table records and range_ids)
+
+  The choice of strategy depends on MRR scan properties, table properties
+  (whether we're scanning clustered primary key), and @@optimizer_switch
+  settings.
+  
+  Key-Ordered Retrieval
+  ---------------------
+  The idea is: if MRR scan is essentially a series of lookups on 
+   
+    tbl.key=value1 OR tbl.key=value2 OR ... OR tbl.key=valueN
+  
+  then it makes sense to collect and order the set of lookup values, i.e.
+   
+     sort(value1, value2, .. valueN)
+
+  and then do index lookups in index order. This results in fewer index page
+  fetch operations, and we also can avoid making multiple index lookups for the
+  same value. That is, if value1=valueN we can easily discover that after
+  sorting and make one index lookup for them instead of two.
+
+  Rowid-Ordered Retrieval
+  -----------------------
+  If we do a regular index scan or a series of index lookups, we'll be hitting
+  table records at random. For disk-based engines, this is much slower than 
+  reading the same records in disk order. We assume that disk ordering of
+  rows is the same as ordering of their rowids (which is provided by 
+  handler::cmp_ref())
+  In order to retrieve records in different order, we must separate index
+  scanning and record fetching, that is, MRR scan uses the following steps:
+
+    1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and 
+        fill a buffer with {rowid, range_id} pairs
+    2. Sort the buffer by rowid value
+    3. for each {rowid, range_id} pair in the buffer
+         get record by rowid and return the {record, range_id} pair
+    4. Repeat the above steps until we've exhausted the list of ranges we're
+       scanning.
+
+  Buffer space management considerations
+  --------------------------------------
+  With regards to buffer/memory management, MRR interface specifies that 
+   - SQL layer provides multi_range_read_init() with buffer of certain size.
+   - MRR implementation may use (i.e. have at its disposal till the end of 
+     the MRR scan) all of the buffer, or return the unused end of the buffer 
+     to SQL layer.
+
+  DS-MRR needs buffer in order to accumulate and sort rowids and/or keys. When
+  we need to accumulate/sort only keys (or only rowids), it is fairly trivial.
+
+  When we need to accumulate/sort both keys and rowids, efficient buffer use
+  gets complicated. We need to:
+   - First, accumulate keys and sort them
+   - Then use the keys (smaller values go first) to obtain rowids. A key is not
+     needed after we've got matching rowids for it.
+   - Make sure that rowids are accumulated at the front of the buffer, so that we
+     can return the end part of the buffer to SQL layer, should there be too
+     few rowid values to occupy the buffer.
+
+  All of these goals are achieved by using the following scheme:
+
+     |                    |   We get an empty buffer from SQL layer.   
+
+     |                  *-|    
+     |               *----|   First, we fill the buffer with keys. Key_buffer
+     |            *-------|   part grows from end of the buffer space to start
+     |         *----------|   (In this picture, the buffer is big enough to
+     |      *-------------|    accommodate all keys and even have some space left)
+
+     |      *=============|   We want to do key-ordered index scan, so we sort
+                              the keys
+
+     |-x      *===========|   Then we use the keys to get rowids. Rowids are 
+     |----x      *========|   stored from start of buffer space towards the end.
+     |--------x     *=====|   The part of the buffer occupied with keys
+     |------------x   *===|   gradually frees up space for rowids. In this
+     |--------------x   *=|   picture we run out of keys before we've run out
+     |----------------x   |   of buffer space (it can be other way as well).
+
+     |================x   |   Then we sort the rowids.
+                     
+     |                |~~~|   The unused part of the buffer is at the end, so
+                              we can return it to the SQL layer.
+
+     |================*       Sorted rowids are then used to read table records 
+                              in disk order
+
+*/
+
+class DsMrr_impl
+{
+public:
+  typedef void (handler::*range_check_toggle_func_t)(bool on);
+
+  DsMrr_impl()
+    : secondary_file(NULL) {}; /* only this member is initialized here */
+  
+  void init(handler *h_arg, TABLE *table_arg) /* bind to the owner handler */
+  {
+    primary_file= h_arg; 
+    table= table_arg;
+  }
+  int dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+                 void *seq_init_param, uint n_ranges, uint mode, 
+                 HANDLER_BUFFER *buf);
+  void dsmrr_close();
+  int dsmrr_next(range_id_t *range_info);
+
+  ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts, 
+                     uint *bufsz, uint *flags, COST_VECT *cost);
+
+  ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, 
+                            void *seq_init_param, uint n_ranges, uint *bufsz,
+                            uint *flags, COST_VECT *cost);
+private:
+  /* Buffer to store (key, range_id) pairs */
+  Lifo_buffer *key_buffer;
+
+  /*
+    The "owner" handler object (the one that is expected to "own" this object
+    and call its functions).
+  */
+  handler *primary_file;
+  TABLE *table; /* Always equal to primary_file->table */
+
+  /*
+    Secondary handler object (created when needed: we need it when we have 
+    to run both index scan and rnd_pos() scan at the same time)
+  */
+  handler *secondary_file;
+  
+  uint keyno; /* index we're running the scan on */
+  /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
+  bool is_mrr_assoc;
+
+  Mrr_reader_factory reader_factory; /* holds the reader objects by value */
+
+  Mrr_reader *strategy;    /* presumably the reader in use - confirm */
+  bool strategy_exhausted; /* presumably: strategy has hit EOF - confirm */
+
+  Mrr_index_reader *index_strategy;
+
+  /* The whole buffer space that we're using */
+  uchar *full_buf;
+  uchar *full_buf_end;
+  
+  /* 
+    When using both rowid and key buffers: the boundary between key and rowid
+    parts of the buffer. This is the "original" value, actual memory ranges 
+    used by key and rowid parts may be different because of dynamic space 
+    reallocation between them.
+  */
+  uchar *rowid_buffer_end;
+ 
+  /*
+    One of the following two is used for key buffer: forward is used when 
+    we only need key buffer, backward is used when we need both key and rowid
+    buffers.
+  */
+  Forward_lifo_buffer forward_key_buf;
+  Backward_lifo_buffer backward_key_buf;
+
+  /*
+    Buffer to store (rowid, range_id) pairs, or just rowids if 
+    is_mrr_assoc==FALSE
+  */
+  Forward_lifo_buffer rowid_buffer;
+  
+  bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 
+                       COST_VECT *cost);
+  bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 
+                               uint *buffer_size, COST_VECT *cost);
+  bool check_cpk_scan(THD *thd, uint keyno, uint mrr_flags);
+
+  bool setup_buffer_sharing(uint key_size_in_keybuf, key_part_map key_tuple_map);
+
+  /* Buffer_manager and its member functions */
+  Buffer_manager buf_manager;
+  static void redistribute_buffer_space(void *dsmrr_arg);
+  static void reset_buffer_sizes(void *dsmrr_arg);
+  static void do_nothing(void *dsmrr_arg); /* presumably a no-op callback */
+
+  Lifo_buffer* get_key_buffer() { return key_buffer; }
+
+  friend class Key_value_records_iterator;
+  friend class Mrr_ordered_index_reader;
+  friend class Mrr_ordered_rndpos_reader;
+
+  int  setup_two_handlers();
+  void close_second_handler();
+};
+
+/**
+  @} (end of group DS-MRR declarations)
+*/
+
diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h
index 46e202df071..d919eddfffe 100644
--- a/sql/mysql_priv.h
+++ b/sql/mysql_priv.h
@@ -351,7 +351,10 @@ protected:
   Number of comparisons of table rowids equivalent to reading one row from a 
   table.
 */
-#define TIME_FOR_COMPARE_ROWID  (TIME_FOR_COMPARE*2)
+#define TIME_FOR_COMPARE_ROWID  (TIME_FOR_COMPARE*100)
+
+/* cost1 is better than cost2 only if cost1 + COST_EPS < cost2 */
+#define COST_EPS  0.001
 
 /*
   For sequential disk seeks the cost formula is:
@@ -360,11 +363,11 @@ protected:
   The cost of average seek 
     DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST*BLOCKS_IN_AVG_SEEK =1.0.
 */
-#define DISK_SEEK_BASE_COST ((double)0.5)
+#define DISK_SEEK_BASE_COST ((double)0.9)
 
 #define BLOCKS_IN_AVG_SEEK  128
 
-#define DISK_SEEK_PROP_COST ((double)0.5/BLOCKS_IN_AVG_SEEK)
+#define DISK_SEEK_PROP_COST ((double)0.1/BLOCKS_IN_AVG_SEEK)
 
 
 /**
@@ -374,6 +377,12 @@ protected:
 */
 #define MATCHING_ROWS_IN_OTHER_TABLE 10
 
+/*
+  Subquery materialization-related constants
+*/
+#define HEAP_TEMPTABLE_LOOKUP_COST 0.05
+#define DISK_TEMPTABLE_LOOKUP_COST 1.0
+
 /** Don't pack string keys shorter than this (if PACK_KEYS=1 isn't used). */
 #define KEY_DEFAULT_PACK_LENGTH 8
 
@@ -555,12 +564,28 @@ protected:
 #define OPTIMIZER_SWITCH_INDEX_MERGE_UNION 2
 #define OPTIMIZER_SWITCH_INDEX_MERGE_SORT_UNION 4
 #define OPTIMIZER_SWITCH_INDEX_MERGE_INTERSECT 8
-
+#define OPTIMIZER_SWITCH_INDEX_MERGE_SORT_INTERSECT 16
+#define OPTIMIZER_SWITCH_INDEX_COND_PUSHDOWN 32
+
+#define OPTIMIZER_SWITCH_FIRSTMATCH 64
+#define OPTIMIZER_SWITCH_LOOSE_SCAN 128
+#define OPTIMIZER_SWITCH_MATERIALIZATION 256
+#define OPTIMIZER_SWITCH_SEMIJOIN 512
+#define OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE 1024
+#define OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN (1<<11)
+#define OPTIMIZER_SWITCH_SUBQUERY_CACHE (1<<12)
+#define OPTIMIZER_SWITCH_MRR_SORT_KEYS (1<<13)
+#define OPTIMIZER_SWITCH_OUTER_JOIN_WITH_CACHE (1<<14)
+#define OPTIMIZER_SWITCH_SEMIJOIN_WITH_CACHE (1<<15)
+#define OPTIMIZER_SWITCH_JOIN_CACHE_INCREMENTAL (1<<16)
+#define OPTIMIZER_SWITCH_JOIN_CACHE_HASHED (1<<17)
+#define OPTIMIZER_SWITCH_JOIN_CACHE_BKA (1<<18)
+#define OPTIMIZER_SWITCH_OPTIMIZE_JOIN_BUFFER_SIZE (1<<19)
 #ifdef DBUG_OFF
-#  define OPTIMIZER_SWITCH_LAST 16
+#  define OPTIMIZER_SWITCH_LAST (1<<20)
 #else
-#  define OPTIMIZER_SWITCH_TABLE_ELIMINATION 16
-#  define OPTIMIZER_SWITCH_LAST 32
+#  define OPTIMIZER_SWITCH_TABLE_ELIMINATION (1<<20)
+#  define OPTIMIZER_SWITCH_LAST (1<<21)
 #endif
 
 #ifdef DBUG_OFF 
@@ -568,13 +593,40 @@ protected:
 #  define OPTIMIZER_SWITCH_DEFAULT (OPTIMIZER_SWITCH_INDEX_MERGE | \
                                     OPTIMIZER_SWITCH_INDEX_MERGE_UNION | \
                                     OPTIMIZER_SWITCH_INDEX_MERGE_SORT_UNION | \
-                                    OPTIMIZER_SWITCH_INDEX_MERGE_INTERSECT)
-#else 
+                                    OPTIMIZER_SWITCH_INDEX_MERGE_INTERSECT | \
+                                    OPTIMIZER_SWITCH_INDEX_COND_PUSHDOWN | \
+                                    OPTIMIZER_SWITCH_FIRSTMATCH | \
+                                    OPTIMIZER_SWITCH_LOOSE_SCAN | \
+                                    OPTIMIZER_SWITCH_MATERIALIZATION | \
+                                    OPTIMIZER_SWITCH_SEMIJOIN | \
+                                    OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE|\
+                                    OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN|\
+                                    OPTIMIZER_SWITCH_SUBQUERY_CACHE|\
+                                    OPTIMIZER_SWITCH_MRR_SORT_KEYS|\
+                                    OPTIMIZER_SWITCH_SUBQUERY_CACHE | \
+                                    OPTIMIZER_SWITCH_JOIN_CACHE_INCREMENTAL | \
+                                    OPTIMIZER_SWITCH_JOIN_CACHE_HASHED | \
+                                    OPTIMIZER_SWITCH_JOIN_CACHE_BKA | \
+                                    OPTIMIZER_SWITCH_OPTIMIZE_JOIN_BUFFER_SIZE)
+#else
 #  define OPTIMIZER_SWITCH_DEFAULT (OPTIMIZER_SWITCH_INDEX_MERGE | \
                                     OPTIMIZER_SWITCH_INDEX_MERGE_UNION | \
                                     OPTIMIZER_SWITCH_INDEX_MERGE_SORT_UNION | \
                                     OPTIMIZER_SWITCH_INDEX_MERGE_INTERSECT | \
-                                    OPTIMIZER_SWITCH_TABLE_ELIMINATION)
+                                    OPTIMIZER_SWITCH_INDEX_COND_PUSHDOWN | \
+                                    OPTIMIZER_SWITCH_TABLE_ELIMINATION | \
+                                    OPTIMIZER_SWITCH_FIRSTMATCH | \
+                                    OPTIMIZER_SWITCH_LOOSE_SCAN | \
+                                    OPTIMIZER_SWITCH_MATERIALIZATION | \
+                                    OPTIMIZER_SWITCH_SEMIJOIN | \
+                                    OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE|\
+                                    OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN|\
+                                    OPTIMIZER_SWITCH_SUBQUERY_CACHE|\
+                                    OPTIMIZER_SWITCH_MRR_SORT_KEYS|\
+                                    OPTIMIZER_SWITCH_JOIN_CACHE_INCREMENTAL | \
+                                    OPTIMIZER_SWITCH_JOIN_CACHE_HASHED | \
+                                    OPTIMIZER_SWITCH_JOIN_CACHE_BKA | \
+                                    OPTIMIZER_SWITCH_OPTIMIZE_JOIN_BUFFER_SIZE)
 #endif
 
 /*
@@ -679,7 +731,8 @@ enum enum_parsing_place
   SELECT_LIST,
   IN_WHERE,
   IN_ON,
-  IN_GROUP_BY
+  IN_GROUP_BY,
+  PARSING_PLACE_SIZE /* always should be the last */
 };
 
 struct st_table;
@@ -878,6 +931,7 @@ bool general_log_write(THD *thd, enum enum_server_command command,
 #ifdef MYSQL_SERVER
 #include "sql_servers.h"
 #include "opt_range.h"
+#include "sql_expression_cache.h"
 
 #ifdef HAVE_QUERY_CACHE
 struct Query_cache_query_flags
@@ -1212,6 +1266,9 @@ bool mysql_select(THD *thd, Item ***rref_pointer_array,
                   Item *having, ORDER *proc_param, ulonglong select_type, 
                   select_result *result, SELECT_LEX_UNIT *unit, 
                   SELECT_LEX *select_lex);
+
+int join_read_key2(THD *thd, struct st_join_table *tab, TABLE *table,
+                   struct st_table_ref *table_ref);
 void free_underlaid_joins(THD *thd, SELECT_LEX *select);
 bool mysql_explain_union(THD *thd, SELECT_LEX_UNIT *unit,
                          select_result *result);
@@ -1231,6 +1288,7 @@ Field *create_tmp_field(THD *thd, TABLE *table,Item *item, Item::Type type,
 			bool table_cant_handle_bit_fields,
                         bool make_copy_field,
                         uint convert_blob_length);
+bool open_tmp_table(TABLE *table);
 void sp_prepare_create_field(THD *thd, Create_field *sql_field);
 int prepare_create_field(Create_field *sql_field, 
 			 uint *blob_columns, 
@@ -1460,15 +1518,6 @@ void mysqld_stmt_reset(THD *thd, char *packet);
 void mysql_stmt_get_longdata(THD *thd, char *pos, ulong packet_length);
 void reinit_stmt_before_use(THD *thd, LEX *lex);
 
-/* sql_handler.cc */
-bool mysql_ha_open(THD *thd, TABLE_LIST *tables, bool reopen);
-bool mysql_ha_close(THD *thd, TABLE_LIST *tables);
-bool mysql_ha_read(THD *, TABLE_LIST *,enum enum_ha_read_modes,char *,
-                   List<Item> *,enum ha_rkey_function,Item *,ha_rows,ha_rows);
-void mysql_ha_flush(THD *thd);
-void mysql_ha_rm_tables(THD *thd, TABLE_LIST *tables, bool is_locked);
-void mysql_ha_cleanup(THD *thd);
-
 /* sql_base.cc */
 #define TMP_TABLE_KEY_EXTRA 8
 void set_item_name(Item *item,char *pos,uint length);
@@ -1787,7 +1836,7 @@ bool close_cached_connection_tables(THD *thd, bool wait_for_refresh,
                                     bool have_lock = FALSE);
 void copy_field_from_tmp_record(Field *field,int offset);
 bool fill_record(THD *thd, Field **field, List<Item> &values,
-                 bool ignore_errors);
+                 bool ignore_errors, bool use_value);
 bool fill_record_n_invoke_before_triggers(THD *thd, List<Item> &fields,
                                           List<Item> &values,
                                           bool ignore_errors,
@@ -1842,19 +1891,26 @@ void print_cached_tables(void);
 void TEST_filesort(SORT_FIELD *sortorder,uint s_length);
 void print_plan(JOIN* join,uint idx, double record_count, double read_time,
                 double current_read_time, const char *info);
+void print_keyuse_array(DYNAMIC_ARRAY *keyuse_array);
+void print_sjm(SJ_MATERIALIZATION_INFO *sjm);
 #endif
 void mysql_print_status();
 /* key.cc */
 int find_ref_key(KEY *key, uint key_count, uchar *record, Field *field,
                  uint *key_length, uint *keypart);
-void key_copy(uchar *to_key, uchar *from_record, KEY *key_info, uint key_length);
+void key_copy(uchar *to_key, uchar *from_record, KEY *key_info, uint key_length,
+              bool with_zerofill= FALSE);
 void key_restore(uchar *to_record, uchar *from_key, KEY *key_info,
                  uint key_length);
 bool key_cmp_if_same(TABLE *form,const uchar *key,uint index,uint key_length);
 void key_unpack(String *to,TABLE *form,uint index);
 bool is_key_used(TABLE *table, uint idx, const MY_BITMAP *fields);
 int key_cmp(KEY_PART_INFO *key_part, const uchar *key, uint key_length);
+ulong key_hashnr(KEY *key_info, uint used_key_parts, const uchar *key);
+bool key_buf_cmp(KEY *key_info, uint used_key_parts,
+                 const uchar *key1, const uchar *key2);
 extern "C" int key_rec_cmp(void *key_info, uchar *a, uchar *b);
+int key_tuple_cmp(KEY_PART_INFO *part, uchar *key1, uchar *key2, uint tuple_length);
 
 bool init_errmessage(void);
 #endif /* MYSQL_SERVER */
@@ -2161,6 +2217,10 @@ extern struct st_VioSSLFd * ssl_acceptor_fd;
 
 MYSQL_LOCK *mysql_lock_tables(THD *thd, TABLE **table, uint count,
                               uint flags, bool *need_reopen);
+bool mysql_lock_tables(THD *thd, MYSQL_LOCK *sql_lock,
+                       bool write_lock_used,
+                       uint flags, bool *need_reopen);
+
 /* mysql_lock_tables() and open_table() flags bits */
 #define MYSQL_LOCK_IGNORE_GLOBAL_READ_LOCK      0x0001
 #define MYSQL_LOCK_IGNORE_FLUSH                 0x0002
@@ -2168,8 +2228,12 @@ MYSQL_LOCK *mysql_lock_tables(THD *thd, TABLE **table, uint count,
 #define MYSQL_OPEN_TEMPORARY_ONLY               0x0008
 #define MYSQL_LOCK_IGNORE_GLOBAL_READ_ONLY      0x0010
 #define MYSQL_LOCK_PERF_SCHEMA                  0x0020
+#define MYSQL_LOCK_NOT_TEMPORARY		0x0040
+/* flags for get_lock_data */
+#define GET_LOCK_UNLOCK         1
+#define GET_LOCK_STORE_LOCKS    2
 
-void mysql_unlock_tables(THD *thd, MYSQL_LOCK *sql_lock);
+void mysql_unlock_tables(THD *thd, MYSQL_LOCK *sql_lock, bool free_lock= 1);
 void mysql_unlock_read_tables(THD *thd, MYSQL_LOCK *sql_lock);
 void mysql_unlock_some_tables(THD *thd, TABLE **table,uint count);
 void mysql_lock_remove(THD *thd, MYSQL_LOCK *locked,TABLE *table,
@@ -2181,6 +2245,7 @@ bool mysql_lock_abort_for_thread(THD *thd, TABLE *table);
 MYSQL_LOCK *mysql_lock_merge(MYSQL_LOCK *a,MYSQL_LOCK *b);
 TABLE_LIST *mysql_lock_have_duplicate(THD *thd, TABLE_LIST *needle,
                                       TABLE_LIST *haystack);
+void reset_lock_data(MYSQL_LOCK *sql_lock, bool unlock);
 bool lock_global_read_lock(THD *thd);
 void unlock_global_read_lock(THD *thd);
 bool wait_if_global_read_lock(THD *thd, bool abort_on_refresh,
@@ -2190,6 +2255,8 @@ bool make_global_read_lock_block_commit(THD *thd);
 bool set_protect_against_global_read_lock(void);
 void unset_protect_against_global_read_lock(void);
 void broadcast_refresh(void);
+MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table_ptr, uint count,
+                          uint flags, TABLE **write_lock_used);
 
 /* Lock based on name */
 int lock_and_wait_for_table_name(THD *thd, TABLE_LIST *table_list);
@@ -2297,6 +2364,8 @@ ha_rows filesort(THD *thd, TABLE *form,struct st_sort_field *sortorder,
 		 ha_rows max_rows, bool sort_positions,
                  ha_rows *examined_rows);
 void filesort_free_buffers(TABLE *table, bool full);
+double get_merge_many_buffs_cost(uint *buffer, uint last_n_elems,
+                                 int elem_size);
 void change_double_for_sort(double nr,uchar *to);
 double my_double_round(double value, longlong dec, bool dec_unsigned,
                        bool truncate);
@@ -2520,6 +2589,19 @@ inline bool is_user_table(TABLE * table)
   return strncmp(name, tmp_file_prefix, tmp_file_prefix_length);
 }
 
+
+#ifndef HAVE_LOG2
+/*
+  Fallback used when HAVE_LOG2 is not defined: computes log(x) / M_LN2.
+  Slightly slower and perhaps a tiny bit less accurate than a native log2().
+*/
+inline double log2(double x)
+{
+  return (log(x) / M_LN2);
+}
+#endif
+
+
 /*
   Some functions that are different in the embedded library and the normal
   server
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index 047534e6173..a601411250a 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -333,7 +333,19 @@ TYPELIB sql_mode_typelib= { array_elements(sql_mode_names)-1,"",
 static const char *optimizer_switch_names[]=
 {
   "index_merge","index_merge_union","index_merge_sort_union",
-  "index_merge_intersection",
+  "index_merge_intersection","index_merge_sort_intersection",
+  "index_condition_pushdown",
+  "firstmatch","loosescan","materialization", "semijoin",
+  "partial_match_rowid_merge",
+  "partial_match_table_scan",
+  "subquery_cache",
+  "mrr_sort_keys",
+  "outer_join_with_cache",
+  "semijoin_with_cache",
+  "join_cache_incremental",
+  "join_cache_hashed",
+  "join_cache_bka",
+  "optimize_join_buffer_size",
 #ifndef DBUG_OFF
   "table_elimination",
 #endif
@@ -347,6 +359,22 @@ static const unsigned int optimizer_switch_names_len[]=
   sizeof("index_merge_union") - 1,
   sizeof("index_merge_sort_union") - 1,
   sizeof("index_merge_intersection") - 1,
+  sizeof("index_merge_sort_intersection") - 1,
+  sizeof("index_condition_pushdown") - 1,
+  sizeof("firstmatch") - 1,
+  sizeof("loosescan") - 1,
+  sizeof("materialization") - 1,
+  sizeof("semijoin") - 1,
+  sizeof("partial_match_rowid_merge") - 1,
+  sizeof("partial_match_table_scan") - 1,
+  sizeof("subquery_cache") - 1,
+  sizeof("mrr_sort_keys") - 1,
+  sizeof("outer_join_with_cache") - 1,
+  sizeof("semijoin_with_cache") - 1,
+  sizeof("join_cache_incremental") - 1,
+  sizeof("join_cache_hashed") - 1,
+  sizeof("join_cache_bka") - 1,
+  sizeof("optimize_join_buffer_size") - 1,
 #ifndef DBUG_OFF
   sizeof("table_elimination") - 1,
 #endif
@@ -426,7 +454,7 @@ static bool volatile ready_to_exit;
 static my_bool opt_debugging= 0, opt_external_locking= 0, opt_console= 0;
 static my_bool opt_short_log_format= 0;
 static my_bool opt_ignore_wrong_options= 0, opt_expect_abort= 0;
-static my_bool opt_sync= 0;
+static my_bool opt_sync= 0, opt_thread_alarm;
 static uint kill_cached_threads, wake_thread;
 ulong thread_created;
 uint thread_handling;
@@ -437,8 +465,22 @@ static const char *sql_mode_str= "OFF";
 /* Text representation for OPTIMIZER_SWITCH_DEFAULT */
 static const char *optimizer_switch_str="index_merge=on,index_merge_union=on,"
                                         "index_merge_sort_union=on,"
-                                        "index_merge_intersection=on"
-#ifndef DBUG_OFF                                        
+                                        "index_merge_intersection=on,"
+                                        "index_merge_sort_intersection=off,"
+                                        "index_condition_pushdown=on,"
+                                        "firstmatch=on,"
+                                        "loosescan=on,"
+                                        "materialization=on,"
+                                        "semijoin=on,"
+                                        "partial_match_rowid_merge=on,"
+                                        "partial_match_table_scan=on,"
+                                        "subquery_cache=on,"
+                                        "mrr_sort_keys=on,"
+                                        "join_cache_incremental=on,"
+                                        "join_cache_hashed=on,"
+                                        "join_cache_bka=on,"
+                                        "optimize_join_buffer_size=on"
+#ifndef DBUG_OFF
                                         ",table_elimination=on";
 #else
                                         ;
@@ -3638,6 +3680,8 @@ static int init_common_variables(const char *conf_file_name, int argc,
   global_system_variables.character_set_results= default_charset_info;
   global_system_variables.character_set_client= default_charset_info;
 
+  global_system_variables.optimizer_use_mrr= 1;
+
   if (!(character_set_filesystem=
         get_charset_by_csname(character_set_filesystem_name,
                               MY_CS_PRIMARY, MYF(MY_WME))))
@@ -5903,6 +5947,7 @@ enum options_mysqld
   OPT_FLUSH_TIME, OPT_FT_MIN_WORD_LEN, OPT_FT_BOOLEAN_SYNTAX,
   OPT_FT_MAX_WORD_LEN, OPT_FT_QUERY_EXPANSION_LIMIT, OPT_FT_STOPWORD_FILE,
   OPT_INTERACTIVE_TIMEOUT, OPT_JOIN_BUFF_SIZE,
+  OPT_JOIN_BUFF_SPACE_LIMIT, OPT_JOIN_CACHE_LEVEL,
   OPT_KEY_BUFFER_SIZE, OPT_KEY_CACHE_BLOCK_SIZE,
   OPT_KEY_CACHE_DIVISION_LIMIT, OPT_KEY_CACHE_AGE_THRESHOLD,
   OPT_KEY_CACHE_PARTITIONS,
@@ -5916,7 +5961,7 @@ enum options_mysqld
   OPT_MAX_SEEKS_FOR_KEY, OPT_MAX_TMP_TABLES, OPT_MAX_USER_CONNECTIONS,
   OPT_MAX_LENGTH_FOR_SORT_DATA,
   OPT_MAX_WRITE_LOCK_COUNT, OPT_BULK_INSERT_BUFFER_SIZE,
-  OPT_MAX_ERROR_COUNT, OPT_MULTI_RANGE_COUNT, OPT_MYISAM_DATA_POINTER_SIZE,
+  OPT_MAX_ERROR_COUNT, OPT_MRR_BUFFER_SIZE, OPT_MYISAM_DATA_POINTER_SIZE,
 
   OPT_MYISAM_BLOCK_SIZE, OPT_MYISAM_MAX_EXTRA_SORT_FILE_SIZE,
   OPT_MYISAM_MAX_SORT_FILE_SIZE, OPT_MYISAM_SORT_BUFFER_SIZE,
@@ -5936,7 +5981,10 @@ enum options_mysqld
   OPT_RECORD_RND_BUFFER, OPT_DIV_PRECINCREMENT, OPT_RELAY_LOG_SPACE_LIMIT,
   OPT_RELAY_LOG_PURGE,
   OPT_SLAVE_NET_TIMEOUT, OPT_SLAVE_COMPRESSED_PROTOCOL, OPT_SLOW_LAUNCH_TIME,
-  OPT_SLAVE_TRANS_RETRIES, OPT_READONLY, OPT_DEBUGGING, OPT_DEBUG_FLUSH,
+  OPT_SLAVE_TRANS_RETRIES,
+  OPT_SUBQUERY_CACHE,
+  OPT_READONLY, OPT_ROWID_MERGE_BUFF_SIZE,
+  OPT_DEBUGGING, OPT_DEBUG_FLUSH,
   OPT_SORT_BUFFER, OPT_TABLE_OPEN_CACHE, OPT_TABLE_DEF_CACHE,
   OPT_THREAD_CONCURRENCY, OPT_THREAD_CACHE_SIZE,
   OPT_TMP_TABLE_SIZE, OPT_THREAD_STACK,
@@ -5946,7 +5994,7 @@ enum options_mysqld
   OPT_RANGE_ALLOC_BLOCK_SIZE, OPT_ALLOW_SUSPICIOUS_UDFS,
   OPT_QUERY_ALLOC_BLOCK_SIZE, OPT_QUERY_PREALLOC_SIZE,
   OPT_TRANS_ALLOC_BLOCK_SIZE, OPT_TRANS_PREALLOC_SIZE,
-  OPT_SYNC_FRM, OPT_SYNC_BINLOG, OPT_SYNC,
+  OPT_SYNC_FRM, OPT_SYNC_BINLOG, OPT_SYNC, OPT_THREAD_ALARM,
   OPT_SYNC_REPLICATION,
   OPT_SYNC_REPLICATION_SLAVE_ID,
   OPT_SYNC_REPLICATION_TIMEOUT,
@@ -6271,7 +6319,7 @@ struct my_option my_long_options[] =
    "Disable initialization of builtin InnoDB plugin.",
    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
   {"init-connect", OPT_INIT_CONNECT, 
-   "Command(s) that are executed for each new connection.",
+   "Command(s) that are executed for each new connection (but not for SUPER users).",
    &opt_init_connect, &opt_init_connect, 0, GET_STR_ALLOC,
    REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
 #ifndef DISABLE_GRANT_OPTIONS
@@ -7044,11 +7092,22 @@ thread is in the relay logs.",
    &max_system_variables.net_interactive_timeout, 0,
    GET_ULONG, REQUIRED_ARG, NET_WAIT_TIMEOUT, 1, LONG_TIMEOUT, 0, 1, 0},
   {"join_buffer_size", OPT_JOIN_BUFF_SIZE,
-   "The size of the buffer that is used for full joins.",
-   &global_system_variables.join_buff_size,
-   &max_system_variables.join_buff_size, 0, GET_ULONG,
-   REQUIRED_ARG, 128*1024L, IO_SIZE*2+MALLOC_OVERHEAD, (longlong) ULONG_MAX,
-   MALLOC_OVERHEAD, IO_SIZE, 0},
+   "The size of the buffer that is used for joins.",
+    &global_system_variables.join_buff_size,
+    &max_system_variables.join_buff_size, 0, GET_ULONG,
+   REQUIRED_ARG, 128*1024L, 128+MALLOC_OVERHEAD, (longlong) ULONG_MAX,
+   MALLOC_OVERHEAD, 128, 0},
+  {"join_buffer_space_limit", OPT_JOIN_BUFF_SPACE_LIMIT,
+   "The limit of the space for all join buffers used by a query.",
+   &global_system_variables.join_buff_space_limit,
+   &max_system_variables.join_buff_space_limit, 0, GET_ULL,
+   REQUIRED_ARG, 16*128*1024L, 2048+MALLOC_OVERHEAD, (longlong) ULONGLONG_MAX,
+   MALLOC_OVERHEAD, 2048, 0},
+   {"join_cache_level", OPT_JOIN_CACHE_LEVEL,
+   "Controls what join operations can be executed with join buffers. Odd numbers are used for plain join buffers while even numbers are used for linked buffers",
+   &global_system_variables.join_cache_level,
+   &max_system_variables.join_cache_level,
+   0, GET_ULONG, REQUIRED_ARG, 1, 0, 8, 0, 1, 0},
   {"keep_files_on_create", OPT_KEEP_FILES_ON_CREATE,
    "Don't overwrite stale .MYD and .MYI even if no directory is specified.",
    &global_system_variables.keep_files_on_create,
@@ -7221,11 +7280,12 @@ thread is in the relay logs.",
    &global_system_variables.min_examined_row_limit,
    &max_system_variables.min_examined_row_limit, 0, GET_ULONG,
   REQUIRED_ARG, 0, 0, (longlong) ULONG_MAX, 0, 1L, 0},
-  {"multi_range_count", OPT_MULTI_RANGE_COUNT,
-   "Number of key ranges to request at once.",
-   &global_system_variables.multi_range_count,
-   &max_system_variables.multi_range_count, 0,
-   GET_ULONG, REQUIRED_ARG, 256, 1, (longlong) ULONG_MAX, 0, 1, 0},
+  {"mrr_buffer_size", OPT_MRR_BUFFER_SIZE,
+   "Size of buffer to use when using MRR with range access",
+   (uchar**) &global_system_variables.mrr_buff_size,
+   (uchar**) &max_system_variables.mrr_buff_size, 0,
+   GET_ULONG, REQUIRED_ARG, 256*1024L, IO_SIZE*2+MALLOC_OVERHEAD,
+   INT_MAX32, MALLOC_OVERHEAD, 1 /* Small to be able to do tests */ , 0},
   {"myisam_block_size", OPT_MYISAM_BLOCK_SIZE,
    "Block size to be used for MyISAM index pages.",
    &opt_myisam_block_size, &opt_myisam_block_size, 0, GET_ULONG, REQUIRED_ARG,
@@ -7331,7 +7391,13 @@ thread is in the relay logs.",
    0, GET_ULONG, OPT_ARG, MAX_TABLES+1, 0, MAX_TABLES+2, 0, 1, 0},
   {"optimizer_switch", OPT_OPTIMIZER_SWITCH,
    "optimizer_switch=option=val[,option=val...], where option={index_merge, "
-   "index_merge_union, index_merge_sort_union, index_merge_intersection"
+   "index_merge_union, index_merge_sort_union, index_merge_intersection, "
+   "index_merge_sort_intersection, "
+   "index_condition_pushdown, firstmatch, loosescan, materialization, "
+   "semijoin, partial_match_rowid_merge, partial_match_table_scan, "
+   "subquery_cache, outer_join_with_cache, semijoin_with_cache, "
+   "join_cache_incremental, join_cache_hashed, join_cache_bka, "
+   "optimize_join_buffer_size"
 #ifndef DBUG_OFF
    ", table_elimination"
 #endif 
@@ -7404,6 +7470,11 @@ thread is in the relay logs.",
    &max_system_variables.range_alloc_block_size, 0, GET_ULONG,
    REQUIRED_ARG, RANGE_ALLOC_BLOCK_SIZE, RANGE_ALLOC_BLOCK_SIZE,
    (longlong) ULONG_MAX, 0, 1024, 0},
+  {"rowid_merge_buff_size", OPT_ROWID_MERGE_BUFF_SIZE,
+   "The size of the buffers used [NOT] IN evaluation via partial matching.",
+   (uchar**) &global_system_variables.rowid_merge_buff_size,
+   (uchar**) &max_system_variables.rowid_merge_buff_size, 0, GET_ULONG,
+   REQUIRED_ARG, 8*1024*1024L, 0, MAX_MEM_TABLE_SIZE/2, 0, 1, 0},
   {"read_buffer_size", OPT_RECORD_BUFFER,
    "Each thread that does a sequential scan allocates a buffer of this size "
    "for each table it scans. If you do many sequential scans, you may want "
@@ -7498,6 +7569,10 @@ thread is in the relay logs.",
    "error. Used only if the connection has active cursors.",
    &table_lock_wait_timeout, &table_lock_wait_timeout,
    0, GET_ULONG, REQUIRED_ARG, 50, 1, 1024 * 1024 * 1024, 0, 1, 0},
+  {"thread-alarm", OPT_THREAD_ALARM,
+   "Enable/disable system thread alarm calls. Should only be turned off when running tests or debugging!!",
+   &opt_thread_alarm, &opt_thread_alarm, 0, GET_BOOL, NO_ARG, 1, 0, 0, 0, 0,
+   0},
   {"thread_cache_size", OPT_THREAD_CACHE_SIZE,
    "How many threads we should keep in a cache for reuse.",
    &thread_cache_size, &thread_cache_size, 0, GET_ULONG,
@@ -8094,6 +8169,12 @@ SHOW_VAR status_vars[]= {
   {"Ssl_version",              (char*) &show_ssl_get_version, SHOW_FUNC},
 #endif /* HAVE_OPENSSL */
   {"Syncs",                    (char*) &my_sync_count,          SHOW_LONG_NOFLUSH},
+  /*
+    The expression cache is currently used only for caching subqueries, so
+    its statistics variables are named subquery_cache*.
+  */
+  {"Subquery_cache_hit",       (char*) &subquery_cache_hit, SHOW_LONG},
+  {"Subquery_cache_miss",      (char*) &subquery_cache_miss, SHOW_LONG},
   {"Table_locks_immediate",    (char*) &locks_immediate,        SHOW_LONG},
   {"Table_locks_waited",       (char*) &locks_waited,           SHOW_LONG},
 #ifdef HAVE_MMAP
@@ -8233,6 +8314,7 @@ static int mysql_init_variables(void)
   abort_loop= select_thread_in_use= signal_thread_in_use= 0;
   ready_to_exit= shutdown_in_progress= grant_option= 0;
   aborted_threads= aborted_connects= 0;
+  subquery_cache_miss= subquery_cache_hit= 0;
   delayed_insert_threads= delayed_insert_writes= delayed_rows_in_use= 0;
   delayed_insert_errors= thread_created= 0;
   specialflag= 0;
@@ -9268,6 +9350,7 @@ static int get_options(int *argc,char **argv)
   */
   my_disable_locking= myisam_single_user= test(opt_external_locking == 0);
   my_disable_sync= opt_sync == 0;
+  my_disable_thr_alarm= opt_thread_alarm == 0;
   my_default_record_cache_size=global_system_variables.read_buff_size;
   myisam_max_temp_length=
     (my_off_t) global_system_variables.myisam_max_sort_file_size;
diff --git a/sql/net_serv.cc b/sql/net_serv.cc
index 272b1759151..df392616676 100644
--- a/sql/net_serv.cc
+++ b/sql/net_serv.cc
@@ -258,18 +258,20 @@ static int net_data_is_ready(my_socket sd)
 #endif /* EMBEDDED_LIBRARY */
 
 /**
-  Remove unwanted characters from connection
-  and check if disconnected.
+   Initialize NET handler for new reads:
 
-    Read from socket until there is nothing more to read. Discard
-    what is read.
+   - Read from socket until there is nothing more to read. Discard
+     what is read.
+   - Initialize net for new net_read/net_write calls.
 
-    If there is anything when to read 'net_clear' is called this
-    normally indicates an error in the protocol.
+   If there is anything to read when 'net_clear' is called, this
+   normally indicates an error in the protocol. Normally one should not
+   need to clear the communication buffer. If one compiles without
+   -DUSE_NET_CLEAR then one saves one read call per query.
 
-    When connection is properly closed (for TCP it means with
-    a FIN packet), then select() considers a socket "ready to read",
-    in the sense that there's EOF to read, but read() returns 0.
+   When connection is properly closed (for TCP it means with
+   a FIN packet), then select() considers a socket "ready to read",
+   in the sense that there's EOF to read, but read() returns 0.
 
   @param net			NET handler
   @param clear_buffer           if <> 0, then clear all data from comm buff
@@ -277,20 +279,18 @@ static int net_data_is_ready(my_socket sd)
 
 void net_clear(NET *net, my_bool clear_buffer __attribute__((unused)))
 {
-#if !defined(EMBEDDED_LIBRARY) && defined(DBUG_OFF)
-  size_t count;
-  int ready;
-#endif
   DBUG_ENTER("net_clear");
 
 /*
-  We don't do a clear in case of DBUG_OFF to catch bugs
-  in the protocol handling
+  We don't do a clear in debug builds (DBUG_OFF not defined), unless
+  USE_NET_CLEAR is defined, to catch bugs in the protocol handling.
 */
 
-#if !defined(EMBEDDED_LIBRARY) && defined(DBUG_OFF)
+#if (!defined(EMBEDDED_LIBRARY) && defined(DBUG_OFF)) || defined(USE_NET_CLEAR)
   if (clear_buffer)
   {
+    size_t count;
+    int ready;
     while ((ready= net_data_is_ready(net->vio->sd)) > 0)
     {
       /* The socket is ready */
@@ -687,7 +687,8 @@ net_real_write(NET *net,const uchar *packet, size_t len)
   {
     my_bool old_mode;
     thr_end_alarm(&alarmed);
-    vio_blocking(net->vio, net_blocking, &old_mode);
+    if (!net_blocking)
+      vio_blocking(net->vio, net_blocking, &old_mode);
   }
   net->reading_or_writing=0;
   DBUG_RETURN(((int) (pos != end)));
@@ -978,7 +979,8 @@ end:
   {
     my_bool old_mode;
     thr_end_alarm(&alarmed);
-    vio_blocking(net->vio, net_blocking, &old_mode);
+    if (!net_blocking)
+      vio_blocking(net->vio, net_blocking, &old_mode);
   }
   net->reading_or_writing=0;
 #ifdef DEBUG_DATA_PACKETS
diff --git a/sql/opt_index_cond_pushdown.cc b/sql/opt_index_cond_pushdown.cc
new file mode 100644
index 00000000000..e0a2d3b1f30
--- /dev/null
+++ b/sql/opt_index_cond_pushdown.cc
@@ -0,0 +1,395 @@
+#include "mysql_priv.h"
+#include "sql_select.h"
+
+/****************************************************************************
+ * Index Condition Pushdown code starts
+ ***************************************************************************/
+/* 
+  Check if given expression uses only table fields covered by the given index
+
+  SYNOPSIS
+    uses_index_fields_only()
+      item           Expression to check
+      tbl            The table having the index
+      keyno          The index number
+      other_tbls_ok  TRUE <=> Fields of other non-const tables are allowed
+
+  DESCRIPTION
+    Check if given expression only uses fields covered by index #keyno in the
+    table tbl. Constant items always qualify. An item that does not refer to
+    tbl at all qualifies only if other_tbls_ok is TRUE.
+    Top-level AND/OR is handled by the caller (see make_cond_for_index());
+    AND/OR nested inside another item is checked here argument by argument.
+
+  RETURN
+    TRUE   The expression can be evaluated using index fields only
+    FALSE  Otherwise (also for triggered conditions and unknown item types)
+*/
+
+bool uses_index_fields_only(Item *item, TABLE *tbl, uint keyno,
+                            bool other_tbls_ok)
+{
+  if (item->const_item())
+    return TRUE;
+
+  /* 
+    Don't push down the triggered conditions. Nested outer joins execution 
+    code may need to evaluate a condition several times (both triggered and
+    untriggered), so such a condition is never reported as index-only.
+    TODO: Consider cloning the triggered condition and using the copies for:
+      1. push the first copy down, to have most restrictive index condition
+         possible
+      2. Put the second copy into tab->select_cond. 
+  */
+  if (item->type() == Item::FUNC_ITEM && 
+      ((Item_func*)item)->functype() == Item_func::TRIG_COND_FUNC)
+    return FALSE;
+
+  if (!(item->used_tables() & tbl->map))
+    return other_tbls_ok;      /* The item does not refer to tbl at all */
+
+  Item::Type item_type= item->type();
+  switch (item_type) {
+  case Item::FUNC_ITEM:
+    {
+      /* This is a function, apply condition recursively to arguments */
+      Item_func *item_func= (Item_func*)item;
+      Item **child;
+      Item **item_end= (item_func->arguments()) + item_func->argument_count();
+      for (child= item_func->arguments(); child != item_end; child++)
+      {
+        if (!uses_index_fields_only(*child, tbl, keyno, other_tbls_ok))
+          return FALSE;
+      }
+      return TRUE;
+    }
+  case Item::COND_ITEM:
+    {
+      /*
+        This is a AND/OR condition. Regular AND/OR clauses are handled by
+        make_cond_for_index() which will chop off the part that can be
+        checked with index. This code is for handling non-top-level AND/ORs,
+        e.g. func(x AND y).
+      */
+      List_iterator<Item> li(*((Item_cond*)item)->argument_list());
+      Item *item;                /* NB: shadows the function parameter */
+      while ((item=li++))
+      {
+        if (!uses_index_fields_only(item, tbl, keyno, other_tbls_ok))
+          return FALSE;
+      }
+      return TRUE;
+    }
+  case Item::FIELD_ITEM:
+    {
+      Item_field *item_field= (Item_field*)item;
+      if (item_field->field->table != tbl) 
+        return TRUE;             /* A field of some other table */
+      /*
+        The two type checks are probably redundant (the part_of_key test
+        should already cover them), but let's play it safe:
+      */
+      return item_field->field->part_of_key.is_set(keyno) &&
+             item_field->field->type() != MYSQL_TYPE_GEOMETRY &&
+             item_field->field->type() != MYSQL_TYPE_BLOB;
+    }
+  case Item::REF_ITEM:
+    return uses_index_fields_only(item->real_item(), tbl, keyno,
+                                  other_tbls_ok);
+  default:
+    return FALSE; /* Play it safe, don't push unknown non-const items */
+  }
+}
+
+#define ICP_COND_USES_INDEX_ONLY 10
+
+/*
+  Get a part of the condition that can be checked using only index fields
+
+  SYNOPSIS
+    make_cond_for_index()
+      cond           The source condition
+      table          The table that is partially available
+      keyno          The index in the above table. Only fields covered by the index
+                     are available
+      other_tbls_ok  TRUE <=> Fields of other non-const tables are allowed
+
+  DESCRIPTION
+    Get the part of the condition that can be checked when, for the given
+    table, we only have values of fields covered by the index. Fields of
+    other tables are assumed to be available. cond->marker is set to
+    ICP_COND_USES_INDEX_ONLY when all of cond qualifies (arguments' markers are reset).
+
+    Example:
+      make_cond_for_index(
+         "cond(t1.field) AND cond(t2.key1) AND cond(t2.non_key) AND cond(t2.key2)",
+          t2, keyno(t2.key1)) 
+      will return
+        "cond(t1.field) AND cond(t2.key2)"
+
+  RETURN
+    Index condition, or NULL if no condition could be inferred.
+*/
+
+Item *make_cond_for_index(Item *cond, TABLE *table, uint keyno,
+                          bool other_tbls_ok)
+{
+  if (!cond)
+    return NULL;
+  if (cond->type() == Item::COND_ITEM)
+  {
+    uint n_marked= 0;  /* Number of arguments that are entirely index-only */
+    if (((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC)
+    {
+      table_map used_tables= 0;
+      Item_cond_and *new_cond=new Item_cond_and;
+      if (!new_cond)
+	return (COND*) 0;
+      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+	Item *fix= make_cond_for_index(item, table, keyno, other_tbls_ok);
+	if (fix)            /* For AND, keep whatever part is usable */
+        {
+	  new_cond->argument_list()->push_back(fix);
+          used_tables|= fix->used_tables();
+        }
+        if (test(item->marker == ICP_COND_USES_INDEX_ONLY))
+        {
+          n_marked++;
+          item->marker= 0;
+        } 
+      }
+      if (n_marked ==((Item_cond*)cond)->argument_list()->elements)
+        cond->marker= ICP_COND_USES_INDEX_ONLY;
+      switch (new_cond->argument_list()->elements) {
+      case 0:
+	return (COND*) 0;
+      case 1:             /* A single conjunct: return it without the AND */
+        new_cond->used_tables_cache= used_tables;
+	return new_cond->argument_list()->head();
+      default:
+	new_cond->quick_fix_field();
+        new_cond->used_tables_cache= used_tables;
+	return new_cond;
+      }
+    }
+    else /* It's OR */
+    {
+      Item_cond_or *new_cond=new Item_cond_or;
+      if (!new_cond)
+	return (COND*) 0;
+      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+	Item *fix= make_cond_for_index(item, table, keyno, other_tbls_ok);
+	if (!fix)           /* For OR, every argument must yield a part */
+	  return (COND*) 0;
+	new_cond->argument_list()->push_back(fix);
+        if (test(item->marker == ICP_COND_USES_INDEX_ONLY))
+        {
+          n_marked++;
+          item->marker= 0;
+        } 
+      }
+      if (n_marked ==((Item_cond*)cond)->argument_list()->elements)
+        cond->marker= ICP_COND_USES_INDEX_ONLY;
+      new_cond->quick_fix_field();
+      new_cond->used_tables_cache= ((Item_cond_or*) cond)->used_tables_cache;
+      new_cond->top_level_item();
+      return new_cond;
+    }
+  }
+
+  /* Not an AND/OR: the item is either taken as a whole or not at all */
+  if (!uses_index_fields_only(cond, table, keyno, other_tbls_ok))
+    return (COND*) 0;
+  cond->marker= ICP_COND_USES_INDEX_ONLY;
+  return cond;
+}
+
+/* Return cond without the parts marked ICP_COND_USES_INDEX_ONLY (if exclude_index) */
+Item *make_cond_remainder(Item *cond, bool exclude_index)
+{
+  if (exclude_index && cond->marker == ICP_COND_USES_INDEX_ONLY)
+    return 0; /* Already checked */
+
+  if (cond->type() == Item::COND_ITEM)
+  {
+    table_map tbl_map= 0;  /* Union of used_tables() of the kept arguments */
+    if (((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC)
+    {
+      /* Create new top level AND item */
+      Item_cond_and *new_cond=new Item_cond_and;
+      if (!new_cond)
+	return (COND*) 0;
+      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+	Item *fix= make_cond_remainder(item, exclude_index);
+	if (fix)
+        {
+	  new_cond->argument_list()->push_back(fix);
+          tbl_map |= fix->used_tables();
+        }
+      }
+      switch (new_cond->argument_list()->elements) {
+      case 0:
+	return (COND*) 0;
+      case 1:             /* A single conjunct: return it without the AND */
+	return new_cond->argument_list()->head();
+      default:
+	new_cond->quick_fix_field();
+        ((Item_cond*)new_cond)->used_tables_cache= tbl_map;
+	return new_cond;
+      }
+    }
+    else /* It's OR */
+    {
+      Item_cond_or *new_cond=new Item_cond_or;
+      if (!new_cond)
+	return (COND*) 0;
+      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+	/* exclude_index is FALSE here: marked arguments of an OR are kept */
+	Item *fix= make_cond_remainder(item, FALSE);
+	if (!fix)
+	  return (COND*) 0;
+	new_cond->argument_list()->push_back(fix);
+        tbl_map |= fix->used_tables();
+      }
+      new_cond->quick_fix_field();
+      ((Item_cond*)new_cond)->used_tables_cache= tbl_map;
+      new_cond->top_level_item();
+      return new_cond;
+    }
+  }
+  return cond;
+}
+
+
+/*
+  Try to extract and push the index condition
+
+  SYNOPSIS
+    push_index_cond()
+      tab            A join tab that has tab->table->file and its condition
+                     in tab->select_cond
+      keyno          Index for which to extract and push the condition
+
+  DESCRIPTION
+    Try to extract the index condition and push it down to the table handler
+*/
+
+void push_index_cond(JOIN_TAB *tab, uint keyno)
+{
+  DBUG_ENTER("push_index_cond");
+  Item *idx_cond;
+  bool do_index_cond_pushdown=
+    ((tab->table->file->index_flags(keyno, 0, 1) &
+      HA_DO_INDEX_COND_PUSHDOWN) &&
+     optimizer_flag(tab->join->thd, OPTIMIZER_SWITCH_INDEX_COND_PUSHDOWN));
+
+  /*
+    Do not try index condition pushdown on indexes which have partially-covered
+    columns. Unpacking from a column prefix into index tuple is not a supported 
+    operation in some engines, see e.g. MySQL BUG#42991.
+    TODO: a better solution would be not to consider partially-covered columns
+    as parts of the index and still produce/check index condition for
+    fully-covered index columns.
+  */
+  KEY *key_info= tab->table->key_info + keyno;
+  for (uint kp= 0; kp < key_info->key_parts; kp++)
+  {
+    if ((key_info->key_part[kp].key_part_flag & HA_PART_KEY_SEG))
+    {
+      do_index_cond_pushdown= FALSE;
+      break;
+    }
+  }
+
+  if (do_index_cond_pushdown)
+  {
+    DBUG_EXECUTE("where",
+                 print_where(tab->select_cond, "full cond", QT_ORDINARY););
+
+    idx_cond= make_cond_for_index(tab->select_cond, tab->table, keyno,
+                                  tab->icp_other_tables_ok);
+
+    DBUG_EXECUTE("where",
+                 print_where(idx_cond, "idx cond", QT_ORDINARY););
+
+    if (idx_cond)
+    {
+      Item *idx_remainder_cond= 0;
+      /* tab->select may be NULL (it is checked below): don't dereference it */
+      tab->pre_idx_push_select_cond= tab->select ? tab->select->cond :
+                                                   tab->select_cond;
+      /*
+        For BKA cache we store the condition in a special BKA cache field,
+        because extra operations are needed before it can be evaluated. It
+        is used in JOIN_CACHE_BKA[_UNIQUE]::skip_index_tuple() functions.
+      */
+      if (tab->use_join_cache &&
+          /*
+            if cache is used then the value is TRUE only 
+            for BKA[_UNIQUE] cache (see check_join_cache_usage func).
+          */
+          tab->icp_other_tables_ok &&
+          (idx_cond->used_tables() &
+           ~(tab->table->map | tab->join->const_table_map)))
+        tab->cache_idx_cond= idx_cond;
+      else
+        idx_remainder_cond= tab->table->file->idx_cond_push(keyno, idx_cond);
+
+      /*
+        Disable eq_ref's "lookup cache" if we've pushed down an index condition.
+        TODO: This check happens to work on current ICP implementations, but
+        there may exist a compliant implementation that will not work 
+        correctly with it. Sort this out when we stabilize the condition
+        pushdown APIs.
+      */
+      if (idx_remainder_cond != idx_cond)
+        tab->ref.disable_cache= TRUE;
+
+      Item *row_cond= tab->idx_cond_fact_out ? 
+                        make_cond_remainder(tab->select_cond, TRUE) :
+	                tab->pre_idx_push_select_cond;
+
+      DBUG_EXECUTE("where",
+                   print_where(row_cond, "remainder cond", QT_ORDINARY););
+      
+      if (row_cond)
+      {
+        if (!idx_remainder_cond)
+          tab->select_cond= row_cond;
+        else
+        {
+          COND *new_cond= new Item_cond_and(row_cond, idx_remainder_cond);
+          tab->select_cond= new_cond;
+	  tab->select_cond->quick_fix_field();
+          ((Item_cond_and*)tab->select_cond)->used_tables_cache= 
+            row_cond->used_tables() | idx_remainder_cond->used_tables();
+        }
+      }
+      else
+        tab->select_cond= idx_remainder_cond;
+      if (tab->select)
+      {
+        DBUG_EXECUTE("where",
+                     print_where(tab->select->cond,
+                                 "select_cond",
+                                 QT_ORDINARY););
+
+        tab->select->cond= tab->select_cond;
+        tab->select->pre_idx_push_select_cond= tab->pre_idx_push_select_cond;
+      }
+    }
+  }
+  DBUG_VOID_RETURN;
+}
+
+
diff --git a/sql/opt_range.cc b/sql/opt_range.cc
index 2e39f367a70..24720e48861 100644
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -34,7 +34,7 @@
     
     The lists are returned in form of complicated structure of interlinked
     SEL_TREE/SEL_IMERGE/SEL_ARG objects.
-    See check_quick_keys, find_used_partitions for examples of how to walk 
+    See quick_range_seq_next, find_used_partitions for examples of how to walk 
     this structure.
     All direct "users" of this module are located within this file, too.
 
@@ -59,6 +59,48 @@
 
   Record retrieval code for range/index_merge/groupby-min-max.
     Implementations of QUICK_*_SELECT classes.
+
+  KeyTupleFormat
+  ~~~~~~~~~~~~~~
+  The code in this file (and elsewhere) makes operations on key value tuples.
+  Those tuples are stored in the following format:
+  
+  The tuple is a sequence of key part values. The length of key part value
+  depends only on its type (and does not depend on what value is stored)
+  
+    KeyTuple: keypart1-data, keypart2-data, ...
+  
+  The value of each keypart is stored in the following format:
+  
+    keypart_data: [isnull_byte] keypart-value-bytes
+
+  If a keypart may have a NULL value (key_part->field->real_maybe_null() can
+  be used to check this), then the first byte is a NULL indicator with the 
+  following valid values:
+    1  - keypart has NULL value.
+    0  - keypart has non-NULL value.
+
+  <questionable-statement> If isnull_byte==1 (NULL value), then the following
+  keypart->length bytes must be 0.
+  </questionable-statement>
+
+  keypart-value-bytes holds the value. Its format depends on the field type.
+  The length of keypart-value-bytes may or may not depend on the value being
+  stored. The default is that length is static and equal to 
+  KEY_PART_INFO::length.
+  
+  Key parts with (key_part_flag & HA_BLOB_PART) have length depending on the
+  value:
+  
+     keypart-value-bytes: value_length value_bytes
+
+  The value_length part itself occupies HA_KEY_BLOB_LENGTH=2 bytes.
+
+  See key_copy() and key_restore() for code to move data between index tuple
+  and table record
+
+  CAUTION: the above description is only sergefp's understanding of the 
+           subject and may omit some details.
 */
 
 #ifdef USE_PRAGMA_IMPLEMENTATION
@@ -262,6 +304,11 @@ public:
   uint8 part;					// Which key part
   uint8 maybe_null;
   /* 
+    The ordinal number of the least significant component encountered in
+    the ranges of the SEL_ARG tree (the first component has number 1)
+  */
+  uint16 max_part_no; 
+  /* 
     Number of children of this element in the RB-tree, plus 1 for this
     element itself.
   */
@@ -403,6 +450,7 @@ public:
   /* returns a number of keypart values (0 or 1) appended to the key buffer */
   int store_min(uint length, uchar **min_key,uint min_key_flag)
   {
+    /* "(kp1 > c1) AND (kp2 OP c2) AND ..." -> (kp1 > c1) */
     if ((min_flag & GEOM_FLAG) ||
         (!(min_flag & NO_MIN_RANGE) &&
 	!(min_key_flag & (NO_MIN_RANGE | NEAR_MIN))))
@@ -497,6 +545,11 @@ public:
 	  pos->increment_use_count(count);
     }
   }
+  void incr_refs()
+  {
+    increment_use_count(1);
+    use_count++;
+  }
   void free_tree()
   {
     for (SEL_ARG *pos=first(); pos ; pos=pos->next)
@@ -558,7 +611,100 @@ public:
 
 class SEL_IMERGE;
 
+#define CLONE_KEY1_MAYBE 1
+#define CLONE_KEY2_MAYBE 2
+/* Swap the CLONE_KEY1_MAYBE and CLONE_KEY2_MAYBE bits of A */
+#define swap_clone_flag(A) ((((A) & 1) << 1) | (((A) & 2) >> 1))
 
+/*
+  While objects of the class SEL_ARG represent ranges for indexes or
+  index infixes (including ranges for index prefixes and index suffixes),
+  objects of the class SEL_TREE represent AND/OR formulas of such ranges.
+  Currently an AND/OR formula represented by a SEL_TREE object can have
+  at most three levels: 
+
+    <SEL_TREE formula> ::= 
+      [ <SEL_RANGE_TREE formula> AND ]
+      [ <SEL_IMERGE formula> [ AND <SEL_IMERGE formula> ...] ]
+
+    <SEL_RANGE_TREE formula> ::=
+      <SEL_ARG formula> [ AND  <SEL_ARG_formula> ... ]
+
+    <SEL_IMERGE formula> ::=  
+      <SEL_RANGE_TREE formula> [ OR <SEL_RANGE_TREE formula> ]
+
+  As we can see from the above definitions:
+   - SEL_RANGE_TREE formula is a conjunction of SEL_ARG formulas
+   - SEL_IMERGE formula is a disjunction of SEL_RANGE_TREE formulas
+   - SEL_TREE formula is a conjunction of a SEL_RANGE_TREE formula
+     and SEL_IMERGE formulas. 
+  It's required above that a SEL_TREE formula has at least one conjunct.
+
+  Usually we will consider normalized SEL_RANGE_TREE formulas where we use
+  TRUE as conjunct members for those indexes whose SEL_ARG trees are empty.
+  
+  We will call an SEL_TREE object simply 'tree'. 
+  The part of a tree that represents SEL_RANGE_TREE formula is called
+  'range part' of the tree while the remaining part is called 'imerge part'. 
+  If a tree contains only a range part then we call such a tree 'range tree'.
+  Components of a range tree that represent SEL_ARG formulas are called ranges.
+  If a tree does not contain any range part we call such a tree 'imerge tree'.
+  Components of the imerge part of a tree that represent SEL_IMERGE formula
+  are called imerges.
+
+  Usually we'll designate:
+    SEL_TREE formulas         by T_1,...,T_k
+    SEL_ARG formulas          by R_1,...,R_k
+    SEL_RANGE_TREE formulas   by RT_1,...,RT_k
+    SEL_IMERGE formulas       by M_1,...,M_k
+  Accordingly we'll use:
+    t_1,...,t_k - to designate trees representing T_1,...,T_k
+    r_1,...,r_k - to designate ranges representing R_1,...,R_k 
+    rt_1,...,rt_k - to designate range trees representing RT_1,...,RT_k
+    m_1,...,m_k - to designate imerges representing M_1,...,M_k
+
+  SEL_TREE objects are usually built from WHERE conditions or
+  ON expressions.
+  A SEL_TREE object always represents an inference of the condition it is
+  built from. Therefore, if a row satisfies the condition a tree is built
+  from, it also satisfies the SEL_TREE formula represented by that tree.
+
+  The following transformations of tree t representing SEL_TREE formula T 
+  yield a new tree t1 that represents an inference of T: T=>T1.
+    (1) remove any of SEL_ARG tree from the range part of t
+    (2) remove any imerge from the tree t 
+    (3) remove any of SEL_ARG tree from any range tree contained
+        in any imerge of tree   
+ 
+  Since the basic blocks of any SEL_TREE objects are ranges, SEL_TREE
+  objects in many cases can be effectively used to filter out a big part
+  of table rows that do not satisfy WHERE/ON conditions utilizing
+  only single or multiple range index scans.
+
+  A single range index scan is constructed for a range tree that contains
+  only one SEL_ARG object for an index or an index prefix.
+  An index intersection scan can be constructed for a range tree
+  that contains several SEL_ARG objects. Currently index intersection
+  scans are constructed only for single-point ranges.
+  An index merge scan is constructed for an imerge tree that contains only
+  one imerge. If range trees of this imerge contain only single-point ranges
+  then a union of index intersections can be built.
+
+  Usually the tree built by the range optimizer for a query table contains
+  more than one range in the range part, and additionally may contain some
+  imerges in the imerge part. The range optimizer evaluates all of them one
+  by one and chooses the range or the imerge that provides the cheapest
+  single or multiple range index scan of the table.  According to rules 
+  (1)-(3) this scan always filters out only those rows that do not satisfy
+  the query conditions. 
+
+  For any condition the SEL_TREE object for it is built in a bottom up
+  manner starting from the range trees for the predicates. The tree_and
+  function builds a tree for any conjunction of formulas from the trees
+  for its conjuncts. The tree_or function builds a tree for any disjunction
+  of formulas from the trees for its disjuncts.    
+*/ 
+  
 class SEL_TREE :public Sql_alloc
 {
 public:
@@ -574,7 +720,7 @@ public:
     keys_map.clear_all();
     bzero((char*) keys,sizeof(keys));
   }
-  SEL_TREE(SEL_TREE *arg, RANGE_OPT_PARAM *param);
+  SEL_TREE(SEL_TREE *arg, bool without_merges, RANGE_OPT_PARAM *param);
   /*
     Note: there may exist SEL_TREE objects with sel_tree->type=KEY and
     keys[i]=0 for all i. (SergeyP: it is not clear whether there is any
@@ -594,9 +740,15 @@ public:
   key_map ror_scans_map;   /* bitmask of ROR scan-able elements in keys */
   uint    n_ror_scans;     /* number of set bits in ror_scans_map */
 
+  struct st_index_scan_info **index_scans;     /* list of index scans */
+  struct st_index_scan_info **index_scans_end; /* last index scan */
+
   struct st_ror_scan_info **ror_scans;     /* list of ROR key scans */
   struct st_ror_scan_info **ror_scans_end; /* last ROR scan */
   /* Note that #records for each key scan is stored in table->quick_rows */
+
+  bool without_ranges() { return keys_map.is_clear_all(); }
+  bool without_imerges() { return merges.is_empty(); }
 };
 
 class RANGE_OPT_PARAM
@@ -627,6 +779,11 @@ public:
   */
   bool using_real_indexes;
   
+  /*
+    Aggressively remove "scans" that do not have conditions on first
+    keyparts. Such scans are usable when doing partition pruning but not
+    regular range optimization.
+  */
   bool remove_jump_scans;
   
   /*
@@ -634,19 +791,27 @@ public:
     using_real_indexes==TRUE
   */
   uint real_keynr[MAX_KEY];
+
+  /*
+    Used to store 'current key tuples', in both range analysis and
+    partitioning (list) analysis
+  */
+  uchar min_key[MAX_KEY_LENGTH+MAX_FIELD_WIDTH],
+    max_key[MAX_KEY_LENGTH+MAX_FIELD_WIDTH];
+
   /* Number of SEL_ARG objects allocated by SEL_ARG::clone_tree operations */
   uint alloced_sel_args; 
+  bool force_default_mrr;
+  KEY_PART *key[MAX_KEY]; /* First key parts of keys used in the query */
 };
 
 class PARAM : public RANGE_OPT_PARAM
 {
 public:
-  KEY_PART *key[MAX_KEY]; /* First key parts of keys used in the query */
+  ha_rows quick_rows[MAX_KEY];
   longlong baseflag;
   uint max_key_part, range_count;
 
-  uchar min_key[MAX_KEY_LENGTH+MAX_FIELD_WIDTH],
-    max_key[MAX_KEY_LENGTH+MAX_FIELD_WIDTH];
   bool quick;				// Don't calulate possible keys
 
   uint fields_bitmap_size;
@@ -665,13 +830,16 @@ public:
   uint8 first_null_comp; /* first null component if any, 0 - otherwise */
 };
 
+
 class TABLE_READ_PLAN;
   class TRP_RANGE;
   class TRP_ROR_INTERSECT;
   class TRP_ROR_UNION;
-  class TRP_ROR_INDEX_MERGE;
+  class TRP_INDEX_INTERSECT;
+  class TRP_INDEX_MERGE;
   class TRP_GROUP_MIN_MAX;
 
+struct st_index_scan_info;
 struct st_ror_scan_info;
 
 static SEL_TREE * get_mm_parts(RANGE_OPT_PARAM *param,COND *cond_func,Field *field,
@@ -683,20 +851,22 @@ static SEL_ARG *get_mm_leaf(RANGE_OPT_PARAM *param,COND *cond_func,Field *field,
 static SEL_TREE *get_mm_tree(RANGE_OPT_PARAM *param,COND *cond);
 
 static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts);
-static ha_rows check_quick_select(PARAM *param,uint index,SEL_ARG *key_tree,
-                                  bool update_tbl_stats);
-static ha_rows check_quick_keys(PARAM *param,uint index,SEL_ARG *key_tree,
-                                uchar *min_key, uint min_key_flag, int,
-                                uchar *max_key, uint max_key_flag, int);
+static ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
+                                  SEL_ARG *tree, bool update_tbl_stats, 
+                                  uint *mrr_flags, uint *bufsize,
+                                  COST_VECT *cost);
 
 QUICK_RANGE_SELECT *get_quick_select(PARAM *param,uint index,
-                                     SEL_ARG *key_tree,
-                                     MEM_ROOT *alloc = NULL);
+                                     SEL_ARG *key_tree, uint mrr_flags, 
+                                     uint mrr_buf_size, MEM_ROOT *alloc);
 static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree,
                                        bool index_read_must_be_used,
                                        bool update_tbl_stats,
                                        double read_time);
 static
+TRP_INDEX_INTERSECT *get_best_index_intersect(PARAM *param, SEL_TREE *tree,
+                                              double read_time);
+static
 TRP_ROR_INTERSECT *get_best_ror_intersect(const PARAM *param, SEL_TREE *tree,
                                           double read_time,
                                           bool *are_all_covering);
@@ -707,7 +877,12 @@ TRP_ROR_INTERSECT *get_best_covering_ror_intersect(PARAM *param,
 static
 TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge,
                                          double read_time);
-static TRP_GROUP_MIN_MAX *get_best_group_min_max(PARAM *param, SEL_TREE *tree);
+static
+TABLE_READ_PLAN *merge_same_index_scans(PARAM *param, SEL_IMERGE *imerge,
+                                        TRP_INDEX_MERGE *imerge_trp,
+                                        double read_time);
+static
+TRP_GROUP_MIN_MAX *get_best_group_min_max(PARAM *param, SEL_TREE *tree);
 
 #ifndef DBUG_OFF
 static void print_sel_tree(PARAM *param, SEL_TREE *tree, key_map *tree_map,
@@ -718,11 +893,15 @@ static void print_ror_scans_arr(TABLE *table, const char *msg,
 static void print_quick(QUICK_SELECT_I *quick, const key_map *needed_reg);
 #endif
 
-static SEL_TREE *tree_and(RANGE_OPT_PARAM *param,SEL_TREE *tree1,SEL_TREE *tree2);
-static SEL_TREE *tree_or(RANGE_OPT_PARAM *param,SEL_TREE *tree1,SEL_TREE *tree2);
+static SEL_TREE *tree_and(RANGE_OPT_PARAM *param,
+                          SEL_TREE *tree1, SEL_TREE *tree2);
+static SEL_TREE *tree_or(RANGE_OPT_PARAM *param,
+                         SEL_TREE *tree1,SEL_TREE *tree2);
 static SEL_ARG *sel_add(SEL_ARG *key1,SEL_ARG *key2);
-static SEL_ARG *key_or(RANGE_OPT_PARAM *param, SEL_ARG *key1, SEL_ARG *key2);
-static SEL_ARG *key_and(RANGE_OPT_PARAM *param, SEL_ARG *key1, SEL_ARG *key2,
+static SEL_ARG *key_or(RANGE_OPT_PARAM *param,
+                       SEL_ARG *key1, SEL_ARG *key2);
+static SEL_ARG *key_and(RANGE_OPT_PARAM *param,
+                        SEL_ARG *key1, SEL_ARG *key2,
                         uint clone_flag);
 static bool get_range(SEL_ARG **e1,SEL_ARG **e2,SEL_ARG *root1);
 bool get_quick_keys(PARAM *param,QUICK_RANGE_SELECT *quick,KEY_PART *key,
@@ -733,7 +912,25 @@ static bool eq_tree(SEL_ARG* a,SEL_ARG *b);
 static SEL_ARG null_element(SEL_ARG::IMPOSSIBLE);
 static bool null_part_in_key(KEY_PART *key_part, const uchar *key,
                              uint length);
-bool sel_trees_can_be_ored(SEL_TREE *tree1, SEL_TREE *tree2, RANGE_OPT_PARAM* param);
+static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts);
+
+#include "opt_range_mrr.cc"
+
+static bool sel_trees_have_common_keys(SEL_TREE *tree1, SEL_TREE *tree2, 
+                                       key_map *common_keys);
+static void eliminate_single_tree_imerges(RANGE_OPT_PARAM *param,
+                                          SEL_TREE *tree);
+
+static bool sel_trees_can_be_ored(RANGE_OPT_PARAM* param,
+                                  SEL_TREE *tree1, SEL_TREE *tree2, 
+                                  key_map *common_keys);
+static bool sel_trees_must_be_ored(RANGE_OPT_PARAM* param,
+                                   SEL_TREE *tree1, SEL_TREE *tree2,
+                                   key_map common_keys);
+static int and_range_trees(RANGE_OPT_PARAM *param,
+                           SEL_TREE *tree1, SEL_TREE *tree2,
+                           SEL_TREE *result);
+static bool remove_nonrange_trees(RANGE_OPT_PARAM *param, SEL_TREE *tree);
 
 
 /*
@@ -765,23 +962,39 @@ public:
     trees_next(trees),
     trees_end(trees + PREALLOCED_TREES)
   {}
-  SEL_IMERGE (SEL_IMERGE *arg, RANGE_OPT_PARAM *param);
+  SEL_IMERGE (SEL_IMERGE *arg, uint cnt, RANGE_OPT_PARAM *param);
   int or_sel_tree(RANGE_OPT_PARAM *param, SEL_TREE *tree);
-  int or_sel_tree_with_checks(RANGE_OPT_PARAM *param, SEL_TREE *new_tree);
-  int or_sel_imerge_with_checks(RANGE_OPT_PARAM *param, SEL_IMERGE* imerge);
+  bool have_common_keys(RANGE_OPT_PARAM *param, SEL_TREE *tree);
+  int and_sel_tree(RANGE_OPT_PARAM *param, SEL_TREE *tree, 
+                   SEL_IMERGE *new_imerge);
+  int or_sel_tree_with_checks(RANGE_OPT_PARAM *param,
+                              uint n_init_trees, 
+                              SEL_TREE *new_tree,
+                              bool is_first_check_pass,
+                              bool *is_last_check_pass);
+  int or_sel_imerge_with_checks(RANGE_OPT_PARAM *param,
+                                uint n_init_trees,
+                                SEL_IMERGE* imerge,
+                                bool is_first_check_pass,
+                                bool *is_last_check_pass);
 };
 
 
 /*
-  Add SEL_TREE to this index_merge without any checks,
+  Add a range tree to the range trees of this imerge 
 
-  NOTES
-    This function implements the following:
-      (x_1||...||x_N) || t = (x_1||...||x_N||t), where x_i, t are SEL_TREEs
+  SYNOPSIS
+    or_sel_tree()
+      param                  Context info for the operation         
+      tree                   SEL_TREE to add to this imerge 
+
+  DESCRIPTION 
+    The function just adds the range tree 'tree' to the range trees
+    of this imerge.
 
   RETURN
-     0 - OK
-    -1 - Out of memory.
+     0   if the operation is a success
+    -1   if the function runs out of memory
 */
 
 int SEL_IMERGE::or_sel_tree(RANGE_OPT_PARAM *param, SEL_TREE *tree)
@@ -806,96 +1019,303 @@ int SEL_IMERGE::or_sel_tree(RANGE_OPT_PARAM *param, SEL_TREE *tree)
 
 
 /*
-  Perform OR operation on this SEL_IMERGE and supplied SEL_TREE new_tree,
-  combining new_tree with one of the trees in this SEL_IMERGE if they both
-  have SEL_ARGs for the same key.
+  Check if any of the range trees of this imerge intersects with a given tree 
 
   SYNOPSIS
-    or_sel_tree_with_checks()
-      param    PARAM from SQL_SELECT::test_quick_select
-      new_tree SEL_TREE with type KEY or KEY_SMALLER.
+    have_common_keys()
+      param    Context info for the function
+      tree     SEL_TREE whose intersection with the imerge range trees is checked
 
-  NOTES
-    This does the following:
-    (t_1||...||t_k)||new_tree =
-     either
-       = (t_1||...||t_k||new_tree)
-     or
-       = (t_1||....||(t_j|| new_tree)||...||t_k),
-
-     where t_i, y are SEL_TREEs.
-    new_tree is combined with the first t_j it has a SEL_ARG on common
-    key with. As a consequence of this, choice of keys to do index_merge
-    read may depend on the order of conditions in WHERE part of the query.
+  DESCRIPTION
+    The function checks whether there is any range tree rt_i in this imerge
+    such that there are some indexes for which ranges are defined in both
+    rt_i and the range part of the SEL_TREE tree.  
+    To check this the function calls the function sel_trees_have_common_keys.
 
+  RETURN 
+    TRUE    if there are such range trees in this imerge
+    FALSE   otherwise
+*/
+
+bool SEL_IMERGE::have_common_keys(RANGE_OPT_PARAM *param, SEL_TREE *tree)
+{
+  for (SEL_TREE** or_tree= trees, **bound= trees_next;
+       or_tree != bound; or_tree++)
+  {
+    key_map common_keys;
+    if (sel_trees_have_common_keys(*or_tree, tree, &common_keys))
+      return TRUE;
+  }
+  return FALSE;
+}
+
+
+/* 
+  Perform AND operation for this imerge and the range part of a tree
+
+  SYNOPSIS
+    and_sel_tree()
+      param           Context info for the operation
+      tree            SEL_TREE for the second operand of the operation
+      new_imerge  OUT imerge for the result of the operation
+
+  DESCRIPTION
+    This function performs AND operation for this imerge m and the
+    range part of the SEL_TREE tree rt. In other words the function
+    pushes rt into this imerge. The resulting imerge is returned in
+    the parameter new_imerge.
+    If this imerge m represents the formula
+      RT_1 OR ... OR RT_k
+    then the resulting imerge of the function represents the formula
+      (RT_1 AND RT) OR ... OR (RT_k AND RT)
+    The function calls the function and_range_trees to construct the
+    range tree representing (RT_i AND RT).
+    
+  NOTE
+    The function may return an empty imerge without any range trees.
+    This happens when each call of and_range_trees returns an 
+    impossible range tree (SEL_TREE::IMPOSSIBLE).
+    Example: (key1 < 2 AND key2 > 10) AND (key1 > 4 OR key2 < 6).
+         
   RETURN
-    0  OK
-    1  One of the trees was combined with new_tree to SEL_TREE::ALWAYS,
-       and (*this) should be discarded.
-   -1  An error occurred.
+     0  if the operation is a success
+    -1  otherwise: there is not enough memory to perform the operation
 */
 
-int SEL_IMERGE::or_sel_tree_with_checks(RANGE_OPT_PARAM *param, SEL_TREE *new_tree)
+int SEL_IMERGE::and_sel_tree(RANGE_OPT_PARAM *param, SEL_TREE *tree,
+                             SEL_IMERGE *new_imerge)
 {
-  for (SEL_TREE** tree = trees;
-       tree != trees_next;
-       tree++)
+  for (SEL_TREE** or_tree= trees; or_tree != trees_next; or_tree++) 
   {
-    if (sel_trees_can_be_ored(*tree, new_tree, param))
+    SEL_TREE *res_or_tree= 0;
+    if (!(res_or_tree= new SEL_TREE()))
+      return (-1);
+    if (!and_range_trees(param, *or_tree, tree, res_or_tree))
     {
-      *tree = tree_or(param, *tree, new_tree);
-      if (!*tree)
-        return 1;
-      if (((*tree)->type == SEL_TREE::MAYBE) ||
-          ((*tree)->type == SEL_TREE::ALWAYS))
+      if (new_imerge->or_sel_tree(param, res_or_tree))
+        return (-1);
+    }        
+  }
+  return 0;
+}      
+
+
+/*
+  Perform OR operation on this imerge and the range part of a tree
+
+  SYNOPSIS
+    or_sel_tree_with_checks()
+      param                  Context info for the operation 
+      n_trees                Number of trees in this imerge to check for oring        
+      tree                   SEL_TREE whose range part is to be ored 
+      is_first_check_pass    <=> the first call of the function for this imerge  
+      is_last_check_pass OUT <=> no more calls of the function for this imerge
+
+  DESCRIPTION
+    The function performs OR operation on this imerge m and the range part
+    of the SEL_TREE tree rt. It always replaces this imerge with the result
+    of the operation.
+ 
+    The operation can be performed in two different modes: with
+    is_first_check_pass==TRUE and is_first_check_pass==FALSE, transforming
+    this imerge differently.
+
+    Given this imerge represents the formula
+      RT_1 OR ... OR RT_k:
+
+    1. In the first mode, when is_first_check_pass==TRUE :
+      1.1. If rt must be ored (see the function sel_trees_must_be_ored) with
+           some rt_j (there may be only one such range tree in the imerge)
+           then the function produces an imerge representing the formula
+             RT_1 OR ... OR (RT_j OR RT) OR ... OR RT_k,
+           where the tree for (RT_j OR RT) is built by oring the pairs
+           of SEL_ARG trees for the corresponding indexes
+      1.2. Otherwise the function produces the imerge representing the formula:
+           RT_1 OR ... OR RT_k OR RT.
+
+    2. In the second mode, when is_first_check_pass==FALSE :
+      2.1. For each rt_j in the imerge that can be ored (see the function
+           sel_trees_can_be_ored), but not must be ored, with rt the function
+           replaces rt_j with a range tree such that for each index for which
+           ranges are defined in both rt_j and rt the tree contains the
+           result of oring of these ranges.
+      2.2. In other cases the function does not produce any imerge.
+
+    When is_first_check_pass==TRUE the function returns FALSE in the parameter
+    is_last_check_pass if there is an rt_j that can be ored with rt but does
+    not have to be ored with it, and there is no rt_j that must be ored with rt.
+    When is_first_check_pass==FALSE the function always returns TRUE in the
+    parameter is_last_check_pass.
+          
+  RETURN
+    1  The result of oring of rt_j and rt that must be ored returns the
+       range tree with type==SEL_TREE::ALWAYS
+       (in this case the imerge m should be discarded)
+   -1  The function runs out of memory
+    0  in all other cases 
+*/
+
+int SEL_IMERGE::or_sel_tree_with_checks(RANGE_OPT_PARAM *param,
+                                        uint n_trees,
+                                        SEL_TREE *tree,
+                                        bool is_first_check_pass,
+                                        bool *is_last_check_pass)
+{
+  bool was_ored= FALSE;
+  *is_last_check_pass= TRUE;
+  SEL_TREE** or_tree = trees;
+  for (uint i= 0; i < n_trees; i++, or_tree++)
+  {
+    SEL_TREE *result= 0;
+    key_map result_keys;
+    key_map ored_keys;
+    if (sel_trees_can_be_ored(param, *or_tree, tree, &ored_keys))
+    {
+      bool must_be_ored= sel_trees_must_be_ored(param, *or_tree, tree,
+                                                ored_keys);
+      if (must_be_ored || !is_first_check_pass) 
+      {
+        result_keys.clear_all();
+        result= *or_tree;
+        for (uint key_no= 0; key_no < param->keys; key_no++)
+        {
+          if (!ored_keys.is_set(key_no))
+	  {
+            result->keys[key_no]= 0;
+	    continue;
+          }
+          SEL_ARG *key1= (*or_tree)->keys[key_no];
+          SEL_ARG *key2= tree->keys[key_no];
+          key2->incr_refs();
+          if ((result->keys[key_no]= key_or(param, key1, key2)))
+          {
+            
+            result_keys.set_bit(key_no);
+#ifdef EXTRA_DEBUG
+            if (param->alloced_sel_args < SEL_ARG::MAX_SEL_ARGS)
+	    {
+              key1= result->keys[key_no]; 
+              (key1)->test_use_count(key1);
+            }
+#endif
+          }       
+        }
+      }
+      else if(is_first_check_pass) 
+        *is_last_check_pass= FALSE;
+    } 
+
+    if (result)
+    {
+      if (result_keys.is_clear_all())
+        result->type= SEL_TREE::ALWAYS;
+      *is_last_check_pass= TRUE;
+      if ((result->type == SEL_TREE::MAYBE) ||
+          (result->type == SEL_TREE::ALWAYS))
         return 1;
       /* SEL_TREE::IMPOSSIBLE is impossible here */
-      return 0;
+      result->keys_map= result_keys; 
+      *or_tree= result;
+      if (is_first_check_pass)
+        return 0;
+      was_ored= TRUE;
     }
   }
+  if (was_ored)
+    return 0;
 
-  /* New tree cannot be combined with any of existing trees. */
-  return or_sel_tree(param, new_tree);
+  if (!*is_last_check_pass && 
+      !(tree= new SEL_TREE(tree, FALSE, param)))
+    return (-1);
+  return or_sel_tree(param, tree);
 }
 
 
 /*
-  Perform OR operation on this index_merge and supplied index_merge list.
+  Perform OR operation on this imerge and another imerge
 
+  SYNOPSIS
+    or_sel_imerge_with_checks()
+      param                  Context info for the operation 
+      n_trees           Number of trees in this imerge to check for oring        
+      imerge                 The second operand of the operation 
+      is_first_check_pass    <=> the first call of the function for this imerge  
+      is_last_check_pass OUT <=> no more calls of the function for this imerge
+
+  DESCRIPTION
+    For each range tree rt from 'imerge' the function calls the method
+    SEL_IMERGE::or_sel_tree_with_checks that performs OR operation on this
+    SEL_IMERGE object m and the tree rt. The mode of the operation is
+    specified by the parameter is_first_check_pass. Each call of
+    SEL_IMERGE::or_sel_tree_with_checks transforms this SEL_IMERGE object m.
+    The function returns FALSE in the parameter is_last_check_pass if
+    at least one of the calls of SEL_IMERGE::or_sel_tree_with_checks
+    returns FALSE as the value of its last parameter. 
+    
   RETURN
-    0 - OK
-    1 - One of conditions in result is always TRUE and this SEL_IMERGE
-        should be discarded.
-   -1 - An error occurred
+    1  One of the calls of SEL_IMERGE::or_sel_tree_with_checks returns 1.
+       (in this case the imerge m should be discarded)
+   -1  The function runs out of memory
+    0  in all other cases 
 */
 
-int SEL_IMERGE::or_sel_imerge_with_checks(RANGE_OPT_PARAM *param, SEL_IMERGE* imerge)
-{
-  for (SEL_TREE** tree= imerge->trees;
-       tree != imerge->trees_next;
-       tree++)
-  {
-    if (or_sel_tree_with_checks(param, *tree))
-      return 1;
+int SEL_IMERGE::or_sel_imerge_with_checks(RANGE_OPT_PARAM *param,
+                                          uint n_trees,
+                                          SEL_IMERGE* imerge,
+                                          bool is_first_check_pass,
+                                          bool *is_last_check_pass)
+{
+  *is_last_check_pass= TRUE;
+  SEL_TREE** tree= imerge->trees;
+  SEL_TREE** tree_end= imerge->trees_next;
+  for ( ; tree < tree_end; tree++)
+  {
+    int rc;                /* signed: or_sel_tree_with_checks returns an int */
+    bool is_last= TRUE;
+    rc= or_sel_tree_with_checks(param, n_trees, *tree,
+                               is_first_check_pass, &is_last);
+    if (!is_last)          /* any FALSE from a call makes the result FALSE */
+      *is_last_check_pass= FALSE;
+    if (rc)                /* non-zero result is passed to the caller as is */
+      return rc;
   }
   return 0;
 }
 
 
-SEL_TREE::SEL_TREE(SEL_TREE *arg, RANGE_OPT_PARAM *param): Sql_alloc()
+/*
+  Copy constructor for SEL_TREE objects
+
+  SYNOPSIS
+    SEL_TREE
+      arg            The source tree for the constructor
+      without_merges <=> only the range part of the tree arg is copied
+      param          Context info for the operation
+
+  DESCRIPTION
+    The constructor creates a full copy of the SEL_TREE arg if
+    the parameter without_merges==FALSE. Otherwise a tree is created
+    that contains the copy only of the range part of the tree arg. 
+*/ 
+
+SEL_TREE::SEL_TREE(SEL_TREE *arg, bool without_merges,
+                   RANGE_OPT_PARAM *param): Sql_alloc()
 {
   keys_map= arg->keys_map;
   type= arg->type;
-  for (int idx= 0; idx < MAX_KEY; idx++)
+  for (uint idx= 0; idx < param->keys; idx++)
   {
     if ((keys[idx]= arg->keys[idx]))
-      keys[idx]->increment_use_count(1);
+      keys[idx]->incr_refs();
   }
 
+  if (without_merges)
+    return;
+
   List_iterator<SEL_IMERGE> it(arg->merges);
   for (SEL_IMERGE *el= it++; el; el= it++)
   {
-    SEL_IMERGE *merge= new SEL_IMERGE(el, param);
+    SEL_IMERGE *merge= new SEL_IMERGE(el, 0, param);
     if (!merge || merge->trees == merge->trees_next)
     {
       merges.empty();
@@ -906,7 +1326,23 @@ SEL_TREE::SEL_TREE(SEL_TREE *arg, RANGE_OPT_PARAM *param): Sql_alloc()
 }
 
 
-SEL_IMERGE::SEL_IMERGE (SEL_IMERGE *arg, RANGE_OPT_PARAM *param) : Sql_alloc()
+/*
+  Copy constructor for SEL_IMERGE objects
+
+  SYNOPSIS
+    SEL_IMERGE
+      arg         The source imerge for the constructor
+      cnt         How many trees from arg are to be copied
+      param       Context info for the operation
+
+  DESCRIPTION
+    If cnt==0 then the constructor creates a full copy of the
+    imerge arg. Otherwise only the first cnt trees of the imerge
+    are copied.
+*/ 
+
+SEL_IMERGE::SEL_IMERGE(SEL_IMERGE *arg, uint cnt,
+                       RANGE_OPT_PARAM *param) : Sql_alloc()
 {
   uint elements= (arg->trees_end - arg->trees);
   if (elements > PREALLOCED_TREES)
@@ -918,13 +1354,13 @@ SEL_IMERGE::SEL_IMERGE (SEL_IMERGE *arg, RANGE_OPT_PARAM *param) : Sql_alloc()
   else
     trees= &trees_prealloced[0];
 
-  trees_next= trees;
+  trees_next= trees + (cnt ? cnt : arg->trees_next-arg->trees);
   trees_end= trees + elements;
 
-  for (SEL_TREE **tree = trees, **arg_tree= arg->trees; tree < trees_end; 
+  for (SEL_TREE **tree = trees, **arg_tree= arg->trees; tree < trees_next; 
        tree++, arg_tree++)
   {
-    if (!(*tree= new SEL_TREE(*arg_tree, param)))
+    if (!(*tree= new SEL_TREE(*arg_tree, FALSE, param)))
       goto mem_err;
   }
 
@@ -938,7 +1374,19 @@ mem_err:
 
 
 /*
-  Perform AND operation on two index_merge lists and store result in *im1.
+  Perform AND operation on two imerge lists
+
+  SYNOPSIS
+    imerge_list_and_list()
+      im1               The first imerge list for the operation; on return
+                        it also holds the imerges appended from im2
+      im2               The second imerge list for the operation
+
+  DESCRIPTION
+    The function just appends the imerge list im2 to the imerge list im1  
+    
+  RETURN VALUE
+    none
 */
 
 inline void imerge_list_and_list(List<SEL_IMERGE> *im1, List<SEL_IMERGE> *im2)
@@ -948,73 +1396,242 @@ inline void imerge_list_and_list(List<SEL_IMERGE> *im1, List<SEL_IMERGE> *im2)
 
 
 /*
-  Perform OR operation on 2 index_merge lists, storing result in first list.
-
-  NOTES
-    The following conversion is implemented:
-     (a_1 &&...&& a_N)||(b_1 &&...&& b_K) = AND_i,j(a_i || b_j) =>
-      => (a_1||b_1).
-
-    i.e. all conjuncts except the first one are currently dropped.
-    This is done to avoid producing N*K ways to do index_merge.
-
-    If (a_1||b_1) produce a condition that is always TRUE, NULL is returned
-    and index_merge is discarded (while it is actually possible to try
-    harder).
-
-    As a consequence of this, choice of keys to do index_merge read may depend
-    on the order of conditions in WHERE part of the query.
+  Perform OR operation on two imerge lists
 
+  SYNOPSIS
+    imerge_list_or_list()
+      param             Context info for the operation         
+      im1               The first imerge list for the operation
+      im2               The second imerge list for the operation
+     
+  DESCRIPTION
+    Assuming that the first imerge list represents the formula
+      F1= M1_1 AND ... AND M1_k1 
+    while the second imerge list represents the formula 
+      F2= M2_1 AND ... AND M2_k2,
+    where M1_i= RT1_i_1 OR ... OR RT1_i_l1i (i in [1..k1])
+    and M2_i = RT2_i_1 OR ... OR RT2_i_l2i (i in [1..k2]),
+    the function builds a list of imerges for some formula that can be 
+    inferred from the formula (F1 OR F2).
+
+    More exactly the function builds imerges for the formula (M1_1 OR M2_1).
+    Note that
+      (F1 OR F2) = (M1_1 AND ... AND M1_k1) OR (M2_1 AND ... AND M2_k2) =
+      AND (M1_i OR M2_j) (i in [1..k1], j in [1..k2]) =>
+      M1_1 OR M2_1.
+    So (M1_1 OR M2_1) is indeed an inference formula for (F1 OR F2).
+
+    To build imerges for the formula (M1_1 OR M2_1) the function invokes,
+    possibly twice, the method SEL_IMERGE::or_sel_imerge_with_checks
+    for the imerge m1_1.
+    At its first invocation the method SEL_IMERGE::or_sel_imerge_with_checks
+    performs OR operation on the imerge m1_1 and the range tree rt2_1_1 by
+    calling SEL_IMERGE::or_sel_tree_with_checks with is_first_pass_check==TRUE.
+    The resulting imerge of the operation is ored with the next range tree of
+    the imerge m2_1. This oring continues until the last range tree from
+    m2_1 has been ored. 
+    At its second invocation the method SEL_IMERGE::or_sel_imerge_with_checks
+    performs the same sequence of OR operations, but now calling
+    SEL_IMERGE::or_sel_tree_with_checks with is_first_pass_check==FALSE.
+
+    The imerges that the operation produces replace those in the list im1   
+       
   RETURN
-    0     OK, result is stored in *im1
-    other Error, both passed lists are unusable
+    0     if the operation is a success 
+   -1     if the function has run out of memory 
 */
 
 int imerge_list_or_list(RANGE_OPT_PARAM *param,
                         List<SEL_IMERGE> *im1,
                         List<SEL_IMERGE> *im2)
 {
+  /* Signed: the callee's status is returned as is, and it may be -1 */
+  int rc;
+  bool is_last_check_pass= FALSE;
+
   SEL_IMERGE *imerge= im1->head();
+  uint elems= imerge->trees_next-imerge->trees;
   im1->empty();
   im1->push_back(imerge);
 
-  return imerge->or_sel_imerge_with_checks(param, im2->head());
+  rc= imerge->or_sel_imerge_with_checks(param, elems, im2->head(),
+                                        TRUE, &is_last_check_pass);
+  if (rc)
+  {
+    if (rc == 1)        /* callee returned 1: drop all imerges, report success */
+    {
+      im1->empty();
+      rc= 0;
+    }
+    return rc;
+  }
+
+  if (!is_last_check_pass)      /* a second pass over a copy of imerge is needed */
+  {
+    SEL_IMERGE* new_imerge= new SEL_IMERGE(imerge, elems, param);
+    if (new_imerge)
+    {
+      is_last_check_pass= TRUE;
+      rc= new_imerge->or_sel_imerge_with_checks(param, elems, im2->head(),
+                                                 FALSE, &is_last_check_pass);
+      if (!rc)
+        im1->push_back(new_imerge);
+    }
+  }
+  return rc;
 }
 
 
 /*
-  Perform OR operation on index_merge list and key tree.
+  Perform OR operation for each imerge from a list and the range part of a tree
+
+  SYNOPSIS
+    imerge_list_or_tree()
+      param       Context info for the operation
+      merges      The list of imerges to be ored with the range part of tree          
+      tree        SEL_TREE whose range part is to be ored with the imerges
+
+  DESCRIPTION
+    For each imerge mi from the list 'merges' the function performs OR
+    operation with mi and the range part of 'tree' rt, producing one or
+    two imerges.
 
+    Given that the imerge mi represents the formula RTi_1 OR ... OR RTi_k,
+    the function forms the merges by the following rules:
+ 
+    1. If rt cannot be ored with any of the trees rti the function just
+       produces an imerge that represents the formula
+         RTi_1 OR ... OR RTi_k OR RT.
+    2. If there exists a tree rti_j that must be ored with rt the function
+       produces an imerge that represents the formula
+         RTi_1 OR ... OR (RTi_j OR RT) OR ... OR RTi_k,
+       where the range tree for (RTi_j OR RT) is constructed by oring the
+       SEL_ARG trees that must be ored.
+    3. For each rti_j that can be ored with rt the function produces
+       the new tree rti_j' and substitutes rti_j for this new range tree.
+
+    In any case the function removes mi from the list and then adds all
+    produced imerges.
+
+    To build imerges by rules 1-3 the function calls the method
+    SEL_IMERGE::or_sel_tree_with_checks, possibly twice. With the first
+    call it passes TRUE as the fourth argument (the first-pass flag).
+    At this first call imerges by rules 1-2 are built. If the call
+    returns FALSE through its fifth argument (is_last_check_pass) then the
+    method is called for the second time. At this call the imerge
+    of rule 3 is produced.
+
+    If a call of SEL_IMERGE::or_sel_tree_with_checks returns 1
+    then it means that the produced tree contains an always true
+    range tree and the whole imerge can be discarded.
+    
   RETURN
-    0     OK, result is stored in *im1.
-    other Error
+    1     if no imerges are produced
+    0     otherwise
 */
 
+static
 int imerge_list_or_tree(RANGE_OPT_PARAM *param,
-                        List<SEL_IMERGE> *im1,
+                        List<SEL_IMERGE> *merges,
                         SEL_TREE *tree)
 {
+
   SEL_IMERGE *imerge;
-  List_iterator<SEL_IMERGE> it(*im1);
-  bool tree_used= FALSE;
+  List<SEL_IMERGE> additional_merges;
+  List_iterator<SEL_IMERGE> it(*merges);
+  
   while ((imerge= it++))
   {
-    SEL_TREE *or_tree;
-    if (tree_used)
+    bool is_last_check_pass;
+    int rc= 0;
+    int rc1= 0;
+    SEL_TREE *or_tree= new SEL_TREE (tree, FALSE, param);
+    if (or_tree)
     {
-      or_tree= new SEL_TREE (tree, param);
-      if (!or_tree ||
-          (or_tree->keys_map.is_clear_all() && or_tree->merges.is_empty()))
-        return FALSE;
+      uint elems= imerge->trees_next-imerge->trees;
+      rc= imerge->or_sel_tree_with_checks(param, elems, or_tree,
+                                          TRUE, &is_last_check_pass);
+      if (!is_last_check_pass)
+      {
+        SEL_IMERGE *new_imerge= new SEL_IMERGE(imerge, elems, param);
+        if (new_imerge)
+	{ 
+          rc1= new_imerge->or_sel_tree_with_checks(param, elems, or_tree,
+                                                   FALSE, &is_last_check_pass);
+          if (!rc1)
+            additional_merges.push_back(new_imerge);
+        }
+      }
     }
-    else
-      or_tree= tree;
-
-    if (imerge->or_sel_tree_with_checks(param, or_tree))
+    if (rc || rc1 || !or_tree)
       it.remove();
-    tree_used= TRUE;
   }
-  return im1->is_empty();
+
+  merges->concat(&additional_merges);  
+  return merges->is_empty();
+}
+
+
+/*
+  Perform pushdown operation of the range part of a tree into given imerges 
+
+  SYNOPSIS
+    imerge_list_and_tree()
+      param           Context info for the operation
+      merges   IN/OUT List of imerges to push the range part of 'tree' into
+      tree            SEL_TREE whose range part is to be pushed into imerges
+
+  DESCRIPTION
+    For each imerge from the list merges the function pushes the range part
+    rt of 'tree' into the imerge. 
+    More exactly if the imerge mi from the list represents the formula
+      RTi_1 OR ... OR RTi_k 
+    the function builds a new imerge that represents the formula
+      (RTi_1 AND RT) OR ... OR (RTi_k AND RT)
+    and adds this imerge to the list merges.
+    To perform this pushdown operation the function calls the method
+    SEL_IMERGE::and_sel_tree. 
+    For any imerge mi the new imerge is not created if for each pair of
+    trees rti_j and rt the intersection of the indexes with defined ranges
+    is empty.
+    If the result of the pushdown operation for the imerge mi returns an
+    imerge with no trees then not only nothing is added to the list
+    merges but mi itself is removed from the list. 
+     
+  RETURN
+    1    if no imerges are left in the list merges             
+    0    otherwise
+*/
+
+static
+int imerge_list_and_tree(RANGE_OPT_PARAM *param,
+                         List<SEL_IMERGE> *merges,
+                         SEL_TREE *tree)
+{
+  SEL_IMERGE *imerge;
+  SEL_IMERGE *new_imerge= NULL;
+  List<SEL_IMERGE> new_merges;
+  List_iterator<SEL_IMERGE> it(*merges);
+  
+  while ((imerge= it++))
+  {
+    if (!new_imerge)
+       new_imerge= new SEL_IMERGE();
+    if (imerge->have_common_keys(param, tree) && 
+        new_imerge && !imerge->and_sel_tree(param, tree, new_imerge))
+    {
+      if (new_imerge->trees == new_imerge->trees_next)
+        it.remove();
+      else
+      {         
+        new_merges.push_back(new_imerge);
+        new_imerge= NULL;
+      }
+    }
+  }
+  imerge_list_and_list(&new_merges, merges);
+  *merges= new_merges;
+  return merges->is_empty();
 }
 
 
@@ -1062,7 +1679,7 @@ SQL_SELECT *make_select(TABLE *head, table_map const_tables,
 }
 
 
-SQL_SELECT::SQL_SELECT() :quick(0),cond(0),free_cond(0)
+SQL_SELECT::SQL_SELECT() :quick(0),cond(0),pre_idx_push_select_cond(NULL),free_cond(0)
 {
   quick_keys.clear_all(); needed_reg.clear_all();
   my_b_clear(&file);
@@ -1096,25 +1713,22 @@ QUICK_SELECT_I::QUICK_SELECT_I()
 {}
 
 QUICK_RANGE_SELECT::QUICK_RANGE_SELECT(THD *thd, TABLE *table, uint key_nr,
-                                       bool no_alloc, MEM_ROOT *parent_alloc)
-  :dont_free(0),doing_key_read(0),error(0),free_file(0),in_range(0),cur_range(NULL),last_range(0)
+                                       bool no_alloc, MEM_ROOT *parent_alloc,
+                                       bool *create_error)
+  :doing_key_read(0),/*error(0),*/free_file(0),/*in_range(0),*/cur_range(NULL),last_range(0),dont_free(0)
 {
   my_bitmap_map *bitmap;
   DBUG_ENTER("QUICK_RANGE_SELECT::QUICK_RANGE_SELECT");
 
   in_ror_merged_scan= 0;
-  sorted= 0;
   index= key_nr;
   head=  table;
   key_part_info= head->key_info[index].key_part;
   my_init_dynamic_array(&ranges, sizeof(QUICK_RANGE*), 16, 16);
 
   /* 'thd' is not accessible in QUICK_RANGE_SELECT::reset(). */
-  multi_range_bufsiz= thd->variables.read_rnd_buff_size;
-  multi_range_count= thd->variables.multi_range_count;
-  multi_range_length= 0;
-  multi_range= NULL;
-  multi_range_buff= NULL;
+  mrr_buf_size= thd->variables.mrr_buff_size;
+  mrr_buf_desc= NULL;
 
   if (!no_alloc && !parent_alloc)
   {
@@ -1129,12 +1743,12 @@ QUICK_RANGE_SELECT::QUICK_RANGE_SELECT(THD *thd, TABLE *table, uint key_nr,
   save_read_set= head->read_set;
   save_write_set= head->write_set;
 
-  /* Allocate a bitmap for used columns */
+  /* Allocate a bitmap for used columns (Q: why not on MEM_ROOT?) */
   if (!(bitmap= (my_bitmap_map*) my_malloc(head->s->column_bitmap_size,
                                            MYF(MY_WME))))
   {
     column_bitmap.bitmap= 0;
-    error= 1;
+    *create_error= 1;
   }
   else
     bitmap_init(&column_bitmap, bitmap, head->s->fields, FALSE);
@@ -1142,6 +1756,20 @@ QUICK_RANGE_SELECT::QUICK_RANGE_SELECT(THD *thd, TABLE *table, uint key_nr,
 }
 
 
+void QUICK_RANGE_SELECT::need_sorted_output()
+{
+  if (!(mrr_flags & HA_MRR_SORTED))
+  {
+    /*
+      Native implementation can't produce sorted output. We'll have to
+      switch to default
+    */
+    mrr_flags |= HA_MRR_USE_DEFAULT_IMPL; 
+  }
+  mrr_flags |= HA_MRR_SORTED;
+}
+
+
 int QUICK_RANGE_SELECT::init()
 {
   DBUG_ENTER("QUICK_RANGE_SELECT::init");
@@ -1184,17 +1812,16 @@ QUICK_RANGE_SELECT::~QUICK_RANGE_SELECT()
     my_free((char*) column_bitmap.bitmap, MYF(MY_ALLOW_ZERO_PTR));
   }
   head->column_bitmaps_set(save_read_set, save_write_set);
-  x_free(multi_range);
-  x_free(multi_range_buff);
+  x_free(mrr_buf_desc);
   DBUG_VOID_RETURN;
 }
 
 
-QUICK_INDEX_MERGE_SELECT::QUICK_INDEX_MERGE_SELECT(THD *thd_param,
-                                                   TABLE *table)
+QUICK_INDEX_SORT_SELECT::QUICK_INDEX_SORT_SELECT(THD *thd_param,
+                                                 TABLE *table)
   :unique(NULL), pk_quick_select(NULL), thd(thd_param)
 {
-  DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::QUICK_INDEX_MERGE_SELECT");
+  DBUG_ENTER("QUICK_INDEX_SORT_SELECT::QUICK_INDEX_SORT_SELECT");
   index= MAX_KEY;
   head= table;
   bzero(&read_record, sizeof(read_record));
@@ -1202,38 +1829,37 @@ QUICK_INDEX_MERGE_SELECT::QUICK_INDEX_MERGE_SELECT(THD *thd_param,
   DBUG_VOID_RETURN;
 }
 
-int QUICK_INDEX_MERGE_SELECT::init()
+int QUICK_INDEX_SORT_SELECT::init()
 {
-  DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::init");
+  DBUG_ENTER("QUICK_INDEX_SORT_SELECT::init");
   DBUG_RETURN(0);
 }
 
-int QUICK_INDEX_MERGE_SELECT::reset()
+int QUICK_INDEX_SORT_SELECT::reset()
 {
-  DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::reset");
+  DBUG_ENTER("QUICK_INDEX_SORT_SELECT::reset");
   DBUG_RETURN(read_keys_and_merge());
 }
 
 bool
-QUICK_INDEX_MERGE_SELECT::push_quick_back(QUICK_RANGE_SELECT *quick_sel_range)
+QUICK_INDEX_SORT_SELECT::push_quick_back(QUICK_RANGE_SELECT *quick_sel_range)
 {
-  /*
-    Save quick_select that does scan on clustered primary key as it will be
-    processed separately.
-  */
+  DBUG_ENTER("QUICK_INDEX_SORT_SELECT::push_quick_back");
   if (head->file->primary_key_is_clustered() &&
       quick_sel_range->index == head->s->primary_key)
+  {
+    /* A quick_select over a clustered primary key is handled separately */
     pk_quick_select= quick_sel_range;
-  else
-    return quick_selects.push_back(quick_sel_range);
-  return 0;
+    DBUG_RETURN(0);
+  }
+  DBUG_RETURN(quick_selects.push_back(quick_sel_range));
 }
 
-QUICK_INDEX_MERGE_SELECT::~QUICK_INDEX_MERGE_SELECT()
+QUICK_INDEX_SORT_SELECT::~QUICK_INDEX_SORT_SELECT()
 {
   List_iterator_fast<QUICK_RANGE_SELECT> quick_it(quick_selects);
   QUICK_RANGE_SELECT* quick;
-  DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::~QUICK_INDEX_MERGE_SELECT");
+  DBUG_ENTER("QUICK_INDEX_SORT_SELECT::~QUICK_INDEX_SORT_SELECT");
   delete unique;
   quick_it.rewind();
   while ((quick= quick_it++))
@@ -1247,7 +1873,6 @@ QUICK_INDEX_MERGE_SELECT::~QUICK_INDEX_MERGE_SELECT()
   DBUG_VOID_RETURN;
 }
 
-
 QUICK_ROR_INTERSECT_SELECT::QUICK_ROR_INTERSECT_SELECT(THD *thd_param,
                                                        TABLE *table,
                                                        bool retrieve_full_rows,
@@ -1535,7 +2160,7 @@ int QUICK_ROR_UNION_SELECT::init()
   DBUG_ENTER("QUICK_ROR_UNION_SELECT::init");
   if (init_queue(&queue, quick_selects.elements, 0,
                  FALSE , QUICK_ROR_UNION_SELECT::queue_cmp,
-                 (void*) this))
+                 (void*) this, 0, 0))
   {
     bzero(&queue, sizeof(QUEUE));
     DBUG_RETURN(1);
@@ -1659,6 +2284,7 @@ SEL_ARG::SEL_ARG(SEL_ARG &arg) :Sql_alloc()
   min_value=arg.min_value;
   max_value=arg.max_value;
   next_key_part=arg.next_key_part;
+  max_part_no= arg.max_part_no;
   use_count=1; elements=1;
 }
 
@@ -1676,9 +2302,10 @@ SEL_ARG::SEL_ARG(Field *f,const uchar *min_value_arg,
   :min_flag(0), max_flag(0), maybe_flag(0), maybe_null(f->real_maybe_null()),
    elements(1), use_count(1), field(f), min_value((uchar*) min_value_arg),
    max_value((uchar*) max_value_arg), next(0),prev(0),
-   next_key_part(0),color(BLACK),type(KEY_RANGE)
+   next_key_part(0), color(BLACK), type(KEY_RANGE)
 {
   left=right= &null_element;
+  max_part_no= 1;
 }
 
 SEL_ARG::SEL_ARG(Field *field_,uint8 part_,
@@ -1689,6 +2316,7 @@ SEL_ARG::SEL_ARG(Field *field_,uint8 part_,
    field(field_), min_value(min_value_), max_value(max_value_),
    next(0),prev(0),next_key_part(0),color(BLACK),type(KEY_RANGE)
 {
+  max_part_no= part+1;
   left=right= &null_element;
 }
 
@@ -1732,6 +2360,7 @@ SEL_ARG *SEL_ARG::clone(RANGE_OPT_PARAM *param, SEL_ARG *new_parent,
   increment_use_count(1);
   tmp->color= color;
   tmp->elements= this->elements;
+  tmp->max_part_no= max_part_no;
   return tmp;
 }
 
@@ -1975,9 +2604,11 @@ class TRP_RANGE : public TABLE_READ_PLAN
 public:
   SEL_ARG *key; /* set of intervals to be used in "range" method retrieval */
   uint     key_idx; /* key number in PARAM::key */
+  uint     mrr_flags; 
+  uint     mrr_buf_size;
 
-  TRP_RANGE(SEL_ARG *key_arg, uint idx_arg)
-   : key(key_arg), key_idx(idx_arg)
+  TRP_RANGE(SEL_ARG *key_arg, uint idx_arg, uint mrr_flags_arg)
+   : key(key_arg), key_idx(idx_arg), mrr_flags(mrr_flags_arg)
   {}
   virtual ~TRP_RANGE() {}                     /* Remove gcc warning */
 
@@ -1986,7 +2617,8 @@ public:
   {
     DBUG_ENTER("TRP_RANGE::make_quick");
     QUICK_RANGE_SELECT *quick;
-    if ((quick= get_quick_select(param, key_idx, key, parent_alloc)))
+    if ((quick= get_quick_select(param, key_idx, key,  mrr_flags, 
+                                 mrr_buf_size, parent_alloc)))
     {
       quick->records= records;
       quick->read_time= read_cost;
@@ -2034,6 +2666,26 @@ public:
 
 
 /*
+  Plan for QUICK_INDEX_INTERSECT_SELECT scan.
+  QUICK_INDEX_INTERSECT_SELECT always retrieves full rows, so retrieve_full_rows
+  is ignored by make_quick.
+*/
+
+class TRP_INDEX_INTERSECT : public TABLE_READ_PLAN
+{
+public:
+  TRP_INDEX_INTERSECT() {}                        /* Remove gcc warning */
+  virtual ~TRP_INDEX_INTERSECT() {}               /* Remove gcc warning */
+  QUICK_SELECT_I *make_quick(PARAM *param, bool retrieve_full_rows,
+                             MEM_ROOT *parent_alloc);
+  TRP_RANGE **range_scans; /* array of ptrs to plans of intersected scans */
+  TRP_RANGE **range_scans_end; /* end of the array */
+  /* keys whose scans are to be filtered by cpk conditions */
+  key_map filtered_scans;  
+};
+
+
+/*
   Plan for QUICK_INDEX_MERGE_SELECT scan.
   QUICK_ROR_INTERSECT_SELECT always retrieves full rows, so retrieve_full_rows
   is ignored by make_quick.
@@ -2100,6 +2752,38 @@ public:
 };
 
 
+typedef struct st_index_scan_info
+{
+  uint      idx;      /* # of used key in param->keys */
+  uint      keynr;    /* # of used key in table */
+  uint      range_count;
+  ha_rows   records;  /* estimate of # records this scan will return */
+
+  /* Set of intervals over key fields that will be used for row retrieval. */
+  SEL_ARG   *sel_arg;
+
+  KEY *key_info;
+  uint used_key_parts;
+
+  /* Estimate of # records filtered out by intersection with cpk */
+  ha_rows   filtered_out;
+  /* Bitmap of fields used in index intersection */ 
+  MY_BITMAP used_fields;
+
+  /* Fields used in the query and covered by ROR scan. */
+  MY_BITMAP covered_fields;
+  uint      used_fields_covered; /* # of set bits in covered_fields */
+  int       key_rec_length; /* length of key record (including rowid) */
+
+  /*
+    Cost of reading all index records with values in sel_arg intervals set
+    (assuming there is no need to access full table records)
+  */
+  double    index_read_cost;
+  uint      first_uncovered_field; /* first unused bit in covered_fields */
+  uint      key_components; /* # of parts in the key */
+} INDEX_SCAN_INFO;
+
 /*
   Fill param->needed_fields with bitmap of fields used in the query.
   SYNOPSIS
@@ -2211,7 +2895,8 @@ static int fill_used_fields_bitmap(PARAM *param)
 
 int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use,
 				  table_map prev_tables,
-				  ha_rows limit, bool force_quick_range)
+				  ha_rows limit, bool force_quick_range, 
+                                  bool ordered_output)
 {
   uint idx;
   double scan_time;
@@ -2269,6 +2954,7 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use,
     param.imerge_cost_buff_size= 0;
     param.using_real_indexes= TRUE;
     param.remove_jump_scans= TRUE;
+    param.force_default_mrr= ordered_output;
 
     thd->no_errors=1;				// Don't warn about NULL
     init_sql_alloc(&alloc, thd->variables.range_alloc_block_size, 0);
@@ -2375,72 +3061,90 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use,
         It is possible to use a range-based quick select (but it might be
         slower than 'all' table scan).
       */
-      if (tree->merges.is_empty())
-      {
-        TRP_RANGE         *range_trp;
-        TRP_ROR_INTERSECT *rori_trp;
-        bool can_build_covering= FALSE;
+      TRP_RANGE         *range_trp;
+      TRP_ROR_INTERSECT *rori_trp;
+      TRP_INDEX_INTERSECT *intersect_trp;
+      bool can_build_covering= FALSE;
+      
+      remove_nonrange_trees(&param, tree);
 
-        /* Get best 'range' plan and prepare data for making other plans */
-        if ((range_trp= get_key_scans_params(&param, tree, FALSE, TRUE,
-                                             best_read_time)))
-        {
-          best_trp= range_trp;
-          best_read_time= best_trp->read_cost;
-        }
+      /* Get best 'range' plan and prepare data for making other plans */
+      if ((range_trp= get_key_scans_params(&param, tree, FALSE, TRUE,
+                                           best_read_time)))
+      {
+        best_trp= range_trp;
+        best_read_time= best_trp->read_cost;
+      }
 
+      /*
+        Simultaneous key scans and row deletes on several handler
+        objects are not allowed so don't use ROR-intersection for
+        table deletes.
+      */
+      if ((thd->lex->sql_command != SQLCOM_DELETE) && 
+           optimizer_flag(thd, OPTIMIZER_SWITCH_INDEX_MERGE))
+      {
         /*
-          Simultaneous key scans and row deletes on several handler
-          objects are not allowed so don't use ROR-intersection for
-          table deletes.
+          Get best non-covering ROR-intersection plan and prepare data for
+          building covering ROR-intersection.
         */
-        if ((thd->lex->sql_command != SQLCOM_DELETE) && 
-             optimizer_flag(thd, OPTIMIZER_SWITCH_INDEX_MERGE))
+        if ((rori_trp= get_best_ror_intersect(&param, tree, best_read_time,
+                                              &can_build_covering)))
         {
+          best_trp= rori_trp;
+          best_read_time= best_trp->read_cost;
           /*
-            Get best non-covering ROR-intersection plan and prepare data for
-            building covering ROR-intersection.
+            Try constructing covering ROR-intersect only if it looks possible
+            and worth doing.
           */
-          if ((rori_trp= get_best_ror_intersect(&param, tree, best_read_time,
-                                                &can_build_covering)))
-          {
+          if (!rori_trp->is_covering && can_build_covering &&
+              (rori_trp= get_best_covering_ror_intersect(&param, tree,
+                                                         best_read_time)))
             best_trp= rori_trp;
-            best_read_time= best_trp->read_cost;
-            /*
-              Try constructing covering ROR-intersect only if it looks possible
-              and worth doing.
-            */
-            if (!rori_trp->is_covering && can_build_covering &&
-                (rori_trp= get_best_covering_ror_intersect(&param, tree,
-                                                           best_read_time)))
-              best_trp= rori_trp;
-          }
         }
       }
-      else
+      /*
+        Do not look for an index intersection  plan if there is a covering
+        index. The scan by this covering index will be always cheaper than
+        any index intersection.
+      */
+      if (param.table->covering_keys.is_clear_all() &&
+          optimizer_flag(thd, OPTIMIZER_SWITCH_INDEX_MERGE) &&
+          optimizer_flag(thd, OPTIMIZER_SWITCH_INDEX_MERGE_SORT_INTERSECT))
       {
-        if (optimizer_flag(thd, OPTIMIZER_SWITCH_INDEX_MERGE))
+        if ((intersect_trp= get_best_index_intersect(&param, tree,
+                                                    best_read_time)))
         {
-          /* Try creating index_merge/ROR-union scan. */
-          SEL_IMERGE *imerge;
-          TABLE_READ_PLAN *best_conj_trp= NULL, *new_conj_trp;
-          LINT_INIT(new_conj_trp); /* no empty index_merge lists possible */
-          DBUG_PRINT("info",("No range reads possible,"
-                             " trying to construct index_merge"));
-          List_iterator_fast<SEL_IMERGE> it(tree->merges);
-          while ((imerge= it++))
+          best_trp= intersect_trp;
+          best_read_time= best_trp->read_cost;         
+        }
+      }
+
+      if (optimizer_flag(thd, OPTIMIZER_SWITCH_INDEX_MERGE))
+      {
+        /* Try creating index_merge/ROR-union scan. */
+        SEL_IMERGE *imerge;
+        TABLE_READ_PLAN *best_conj_trp= NULL, *new_conj_trp;
+        LINT_INIT(new_conj_trp); /* no empty index_merge lists possible */
+        DBUG_PRINT("info",("No range reads possible,"
+                           " trying to construct index_merge"));
+        List_iterator_fast<SEL_IMERGE> it(tree->merges);
+        while ((imerge= it++))
+        {
+          new_conj_trp= get_best_disjunct_quick(&param, imerge, best_read_time);
+          if (new_conj_trp)
+            set_if_smaller(param.table->quick_condition_rows, 
+                           new_conj_trp->records);
+          if (new_conj_trp &&
+              (!best_conj_trp || 
+               new_conj_trp->read_cost < best_conj_trp->read_cost))
           {
-            new_conj_trp= get_best_disjunct_quick(&param, imerge, best_read_time);
-            if (new_conj_trp)
-              set_if_smaller(param.table->quick_condition_rows, 
-                             new_conj_trp->records);
-            if (!best_conj_trp || (new_conj_trp && new_conj_trp->read_cost <
-                                   best_conj_trp->read_cost))
-              best_conj_trp= new_conj_trp;
+            best_conj_trp= new_conj_trp;
+            best_read_time= best_conj_trp->read_cost;
           }
-          if (best_conj_trp)
-            best_trp= best_conj_trp;
         }
+        if (best_conj_trp)
+          best_trp= best_conj_trp;
       }
     }
 
@@ -3704,7 +4408,6 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge,
 {
   SEL_TREE **ptree;
   TRP_INDEX_MERGE *imerge_trp= NULL;
-  uint n_child_scans= imerge->trees_next - imerge->trees;
   TRP_RANGE **range_scans;
   TRP_RANGE **cur_child;
   TRP_RANGE **cpk_scan= NULL;
@@ -3724,6 +4427,24 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge,
   DBUG_ENTER("get_best_disjunct_quick");
   DBUG_PRINT("info", ("Full table scan cost: %g", read_time));
 
+  /*
+    In every tree of imerge remove SEL_ARG trees that do not make ranges.
+    If after this removal some SEL_ARG tree becomes empty discard imerge.  
+  */
+  for (ptree= imerge->trees; ptree != imerge->trees_next; ptree++)
+  {
+    if (remove_nonrange_trees(param, *ptree))
+    {
+      imerge->trees_next= imerge->trees;
+      break;
+    }
+  }
+
+  uint n_child_scans= imerge->trees_next - imerge->trees;
+  
+  if (!n_child_scans)
+    DBUG_RETURN(NULL);
+
   if (!(range_scans= (TRP_RANGE**)alloc_root(param->mem_root,
                                              sizeof(TRP_RANGE*)*
                                              n_child_scans)))
@@ -3828,7 +4549,9 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge,
   imerge_cost +=
     Unique::get_use_cost(param->imerge_cost_buff, (uint)non_cpk_scan_records,
                          param->table->file->ref_length,
-                         param->thd->variables.sortbuff_size);
+                         param->thd->variables.sortbuff_size,
+                         TIME_FOR_COMPARE_ROWID,
+                         FALSE, NULL);
   DBUG_PRINT("info",("index_merge total cost: %g (wanted: less then %g)",
                      imerge_cost, read_time));
   if (imerge_cost < read_time)
@@ -3843,6 +4566,13 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge,
       imerge_trp->range_scans_end= range_scans + n_child_scans;
       read_time= imerge_cost;
     }
+    if (imerge_trp)
+    {
+      TABLE_READ_PLAN *trp= merge_same_index_scans(param, imerge, imerge_trp,
+                                                   read_time);
+      if (trp != imerge_trp)
+        DBUG_RETURN(trp);
+    }
   }
 
 build_ror_index_merge:
@@ -3858,6 +4588,7 @@ build_ror_index_merge:
                                         sizeof(TABLE_READ_PLAN*)*
                                         n_child_scans)))
     DBUG_RETURN(imerge_trp);
+
 skip_to_ror_scan:
   roru_index_costs= 0.0;
   roru_total_records= 0;
@@ -3941,30 +4672,990 @@ skip_to_ror_scan:
       DBUG_RETURN(roru);
     }
   }
-  DBUG_RETURN(imerge_trp);
+    DBUG_RETURN(imerge_trp);
 }
 
-typedef struct st_ror_scan_info
+
+/*
+  Merge index scans for the same indexes in an index merge plan
+
+  SYNOPSIS
+    merge_same_index_scans()
+      param           Context info for the operation
+      imerge   IN/OUT SEL_IMERGE from which imerge_trp has been extracted
+      imerge_trp      The index merge plan where index scans for the same
+                      indexes are to be merged
+      read_time       The upper bound for the cost of the plan to be evaluated
+
+  DESCRIPTION
+    For the given index merge plan imerge_trp extracted from the SEL_IMERGE
+    imerge the function looks for range scans with the same indexes and merges
+    them into SEL_ARG trees. Then for each such SEL_ARG tree r_i the function
+    creates a range tree rt_i that contains only r_i. All rt_i are joined
+    into one index merge that replaces the original index merge imerge.
+    The function calls get_best_disjunct_quick for the new index merge to
+    get a new index merge plan that contains index scans only for different
+    indexes.
+    If there are no index scans for the same index in the original index
+    merge plan the function does not change the original imerge and returns
+    imerge_trp as its result.
+
+  RETURN
+    imerge_trp if nothing was merged, else the plan built for the merged scans
+*/
+
+static
+TABLE_READ_PLAN *merge_same_index_scans(PARAM *param, SEL_IMERGE *imerge,
+                                        TRP_INDEX_MERGE *imerge_trp,
+                                        double read_time)
 {
-  uint      idx;      /* # of used key in param->keys */
-  uint      keynr;    /* # of used key in table */
-  ha_rows   records;  /* estimate of # records this scan will return */
+  uint16 first_scan_tree_idx[MAX_KEY]; /* per key: 1-based tree index, 0=none */
+  SEL_TREE **tree;
+  TRP_RANGE **cur_child;
+  uint removed_cnt= 0;                 /* trees folded into an earlier tree */
 
-  /* Set of intervals over key fields that will be used for row retrieval. */
-  SEL_ARG   *sel_arg;
+  DBUG_ENTER("merge_same_index_scans");
 
-  /* Fields used in the query and covered by this ROR scan. */
-  MY_BITMAP covered_fields;
-  uint      used_fields_covered; /* # of set bits in covered_fields */
-  int       key_rec_length; /* length of key record (including rowid) */
+  bzero(first_scan_tree_idx, sizeof(first_scan_tree_idx[0])*param->keys);
 
-  /*
-    Cost of reading all index records with values in sel_arg intervals set
-    (assuming there is no need to access full table records)
-  */
-  double    index_read_cost;
-  uint      first_uncovered_field; /* first unused bit in covered_fields */
-  uint      key_components; /* # of parts in the key */
+  for (tree= imerge->trees, cur_child= imerge_trp->range_scans;
+       tree != imerge->trees_next;
+       tree++, cur_child++)
+  {
+    DBUG_ASSERT(tree);
+    uint key_idx= (*cur_child)->key_idx;
+    uint16 *tree_idx_ptr= &first_scan_tree_idx[key_idx];
+    if (!*tree_idx_ptr)
+      *tree_idx_ptr= (uint16) (tree-imerge->trees+1);
+    else
+    {
+      SEL_TREE **changed_tree= imerge->trees+(*tree_idx_ptr-1);
+      SEL_ARG *key= (*changed_tree)->keys[key_idx];
+      bzero((*changed_tree)->keys,
+            sizeof((*changed_tree)->keys[0])*param->keys);
+      (*changed_tree)->keys_map.clear_all();
+      if (((*changed_tree)->keys[key_idx]=
+             key_or(param, key, (*tree)->keys[key_idx])))
+        (*changed_tree)->keys_map.set_bit(key_idx);
+      *tree= NULL;                     /* emptied slot, compacted below */
+      removed_cnt++;
+    }
+  }
+  if (!removed_cnt)
+    DBUG_RETURN(imerge_trp);
+
+  TABLE_READ_PLAN *trp= NULL;
+  SEL_TREE **new_trees_next= imerge->trees;
+  for (tree= new_trees_next; tree != imerge->trees_next; tree++)
+  {
+    if (!*tree)
+      continue;
+    if (tree > new_trees_next)
+      *new_trees_next= *tree;
+    new_trees_next++;
+  }
+  imerge->trees_next= new_trees_next;
+
+  DBUG_ASSERT(imerge->trees_next>imerge->trees);
+
+  if (imerge->trees_next-imerge->trees > 1)
+    trp= get_best_disjunct_quick(param, imerge, read_time);
+  else
+  {
+    /*
+      This alternative theoretically can be reached when the cost
+      of the index merge for such a formula as
+        (key1 BETWEEN c1_1 AND c1_2) AND key2 > c2 OR
+        (key1 BETWEEN c1_3 AND c1_4) AND key3 > c3
+      is estimated as being cheaper than the cost of index scan for
+      the formula
+        (key1 BETWEEN c1_1 AND c1_2) OR (key1 BETWEEN c1_3 AND c1_4)
+      
+      In the current code this may happen for two reasons:
+      1. for a single index range scan data records are accessed in
+         a random order
+      2. the functions that estimate the cost of a range scan and an
+         index merge retrievals are not well calibrated
+    */
+    trp= get_key_scans_params(param, *imerge->trees, FALSE, TRUE,
+                              read_time);
+  }
+
+  DBUG_RETURN(trp); 
+}
+
+
+/*
+  This structure contains the info common for all steps of a partial
+  index intersection plan. Moreover, it also contains the info common
+  for index intersect plans. This info is filled in by the function
+  prepare_search_best_index_intersect just before searching for the
+  best index intersection plan.
+*/
+
+typedef struct st_common_index_intersect_info
+{
+  PARAM *param;           /* context info for range optimizations            */
+  uint key_size;          /* size of a ROWID element stored in Unique object */
+  uint compare_factor;         /* 1/compare - cost to compare two ROWIDs     */
+  ulonglong max_memory_size;   /* maximum space allowed for Unique objects   */   
+  ha_rows table_cardinality;   /* estimate of the number of records in table */
+  double cutoff_cost;        /* discard index intersects with greater costs  */ 
+  INDEX_SCAN_INFO *cpk_scan;  /* clustered primary key used in intersection  */
+
+  bool in_memory;  /* unique object for intersection is completely in memory */
+
+  INDEX_SCAN_INFO **search_scans;    /* scans possibly included in intersect */ 
+  uint n_search_scans;               /* number of elements in search_scans   */
+
+  bool best_uses_cpk;   /* current best intersect uses clustered primary key */
+  double best_cost;       /* cost of the current best index intersection     */
+  /* estimate of the number of records in the current best intersection      */
+  ha_rows best_records;
+  uint best_length;    /* number of indexes in the current best intersection */
+  INDEX_SCAN_INFO **best_intersect;  /* the current best index intersection  */
+  /* scans from the best intersect to be filtered by cpk conditions          */
+  key_map filtered_scans; 
+
+  uint *buff_elems;        /* buffer to calculate cost of index intersection */
+  
+} COMMON_INDEX_INTERSECT_INFO;
+
+
+/*
+  This structure contains the info specific for one step of an index
+  intersection plan. The structure is filled in by the function
+  check_index_intersect_extension.
+*/
+
+typedef struct st_partial_index_intersect_info
+{
+  COMMON_INDEX_INTERSECT_INFO *common_info;    /* shared by index intersects */
+  uint length;         /* number of index scans in the partial intersection  */
+  ha_rows records;     /* estimate of the number of records in intersection  */
+  double cost;         /* cost of the partial index intersection             */
+
+  /* estimate of total number of records of all scans of the partial index
+     intersect sent to the Unique object used for the intersection  */
+  ha_rows records_sent_to_unique;
+
+  /* total cost of the scans of indexes from the partial index intersection  */
+  double index_read_cost; 
+
+  bool use_cpk_filter;      /* cpk filter is to be used for this scan        */  
+  bool in_memory;            /* uses unique object in memory                 */
+  double in_memory_cost;     /* cost of using unique object in memory        */
+
+  key_map filtered_scans;    /* scans to be filtered by cpk conditions       */
+         
+  MY_BITMAP *intersect_fields;     /* bitmap of fields used in intersection  */
+} PARTIAL_INDEX_INTERSECT_INFO;
+
+
+/* Check whether key1 and key2 share the same first used_parts key fields */
+
+static
+bool same_index_prefix(KEY *key1, KEY *key2, uint used_parts)
+{
+  KEY_PART_INFO *part1= key1->key_part;
+  KEY_PART_INFO *part2= key2->key_part;
+  for(uint i= 0; i < used_parts; i++, part1++, part2++)
+  {
+    if (part1->fieldnr != part2->fieldnr)
+      return FALSE;
+  }
+  return TRUE;
+}
+
+
+/* Create a bitmap for all fields of a table; returns TRUE on failure */
+
+static
+bool create_fields_bitmap(PARAM *param, MY_BITMAP *fields_bitmap)
+{
+  my_bitmap_map *bitmap_buf;
+
+  if (!(bitmap_buf= (my_bitmap_map *) alloc_root(param->mem_root,
+                                                 param->fields_bitmap_size)))
+    return TRUE;
+  if (bitmap_init(fields_bitmap, bitmap_buf, param->table->s->fields, FALSE))
+    return TRUE;
+  
+  return FALSE;
+}
+
+/* Order two index scans by ascending 'records'; used to sort search scans */
+
+static
+int cmp_intersect_index_scan(INDEX_SCAN_INFO **a, INDEX_SCAN_INFO **b)
+{
+  return (*a)->records < (*b)->records ?
+          -1 : (*a)->records == (*b)->records ? 0 : 1;
+}
+
+/* Clear field_bitmap, then set the bits of the first used_key_parts parts */
+static inline
+void set_field_bitmap_for_index_prefix(MY_BITMAP *field_bitmap,
+                                       KEY_PART_INFO *key_part,
+                                       uint used_key_parts)
+{
+  bitmap_clear_all(field_bitmap);
+  for (KEY_PART_INFO *key_part_end= key_part+used_key_parts;
+       key_part < key_part_end; key_part++)
+  {
+    bitmap_set_bit(field_bitmap, key_part->fieldnr-1);
+  }
+}
+
+
+/*
+  Round the table cardinality read from engine statistics to one
+  significant decimal digit, unless HA_STATS_RECORDS_IS_EXACT is set.
+  This function should go away when the mysql test suites can easily
+  tolerate deviations in InnoDB statistical data.
+*/
+ 
+static inline
+ha_rows get_table_cardinality_for_index_intersect(TABLE *table)
+{
+  if (table->file->ha_table_flags() & HA_STATS_RECORDS_IS_EXACT)
+    return table->file->stats.records;
+  else
+  {
+    ha_rows d;
+    double q;
+    for (q= (double)table->file->stats.records, d= 1 ; q >= 10; q/= 10, d*= 10 ) ;
+    return (ha_rows) (floor(q+0.5) * d);
+  } 
+}
+
+  
+static
+ha_rows records_in_index_intersect_extension(PARTIAL_INDEX_INTERSECT_INFO *curr,
+                                             INDEX_SCAN_INFO *ext_index_scan);
+
+/*
+  Prepare to search for the best index intersection
+
+  SYNOPSIS
+    prepare_search_best_index_intersect()
+      param         common info about index ranges
+      tree          tree of ranges for indexes that can be intersected
+      common    OUT info needed for search to be filled by the function
+      init      OUT info for an initial pseudo step of the intersection plans
+      cutoff_cost   cut off cost of the interesting index intersection
+
+  DESCRIPTION
+    The function initializes all fields of the structure 'common' to be used
+    when searching for the best intersection plan. It also allocates
+    memory to store the cheapest index intersection.
+
+  NOTES
+    When selecting candidates for index intersection we always take only
+    one representative out of any set of indexes that share the same range
+    conditions. These indexes always have the same prefixes and the
+    components of these prefixes are exactly those used in these range
+    conditions.
+    Range conditions over clustered primary key (cpk) are always used only
+    as the condition that filters out some rowids retrieved by the scans
+    for secondary indexes. The cpk index will be handled in special way by
+    the function that searches for the best index intersection.
+
+  RETURN
+    FALSE  in the case of success
+    TRUE   otherwise
+*/
+
+static
+bool prepare_search_best_index_intersect(PARAM *param, 
+                                         SEL_TREE *tree,
+                                         COMMON_INDEX_INTERSECT_INFO *common,
+                                         PARTIAL_INDEX_INTERSECT_INFO *init,
+                                         double cutoff_cost)
+{
+  uint i;
+  uint n_search_scans;
+  double cost;
+  INDEX_SCAN_INFO **index_scan;
+  INDEX_SCAN_INFO **scan_ptr;
+  INDEX_SCAN_INFO *cpk_scan= NULL;
+  TABLE *table= param->table;
+  uint n_index_scans= tree->index_scans_end - tree->index_scans;
+
+  if (!n_index_scans)
+    return 1;
+
+  bzero(init, sizeof(*init));
+  init->common_info= common;
+  init->cost= cutoff_cost;
+
+  common->param= param;
+  common->key_size= table->file->ref_length;
+  common->compare_factor= TIME_FOR_COMPARE_ROWID;
+  common->max_memory_size= param->thd->variables.sortbuff_size;
+  common->cutoff_cost= cutoff_cost;
+  common->cpk_scan= NULL;
+  common->table_cardinality= 
+    get_table_cardinality_for_index_intersect(table);
+
+  if (n_index_scans <= 1)
+    return TRUE;
+
+  if (table->file->primary_key_is_clustered())
+  {
+    INDEX_SCAN_INFO **index_scan_end;
+    index_scan= tree->index_scans;
+    index_scan_end= index_scan+n_index_scans;
+    for ( ; index_scan < index_scan_end; index_scan++)
+    {  
+      if ((*index_scan)->keynr == table->s->primary_key)
+      {
+        common->cpk_scan= cpk_scan= *index_scan;
+          break;
+      }
+    }
+  }
+
+  i= n_index_scans - test(cpk_scan != NULL) + 1; /* +1: NULL terminator slot */
+
+  if (!(common->search_scans =
+	(INDEX_SCAN_INFO **) alloc_root (param->mem_root,
+                                         sizeof(INDEX_SCAN_INFO *) * i)))
+    return TRUE;
+  bzero(common->search_scans, sizeof(INDEX_SCAN_INFO *) * i);
+
+  INDEX_SCAN_INFO **selected_index_scans= common->search_scans;
+    
+  for (i=0, index_scan= tree->index_scans; i < n_index_scans; i++, index_scan++)
+  {
+    uint used_key_parts= (*index_scan)->used_key_parts;
+    KEY *key_info= (*index_scan)->key_info;
+
+    if (*index_scan == cpk_scan)
+      continue;
+    if (cpk_scan && cpk_scan->used_key_parts >= used_key_parts &&
+        same_index_prefix(cpk_scan->key_info, key_info, used_key_parts))
+      continue;
+
+    cost= table->file->keyread_time((*index_scan)->keynr,
+                                    (*index_scan)->range_count,
+                                    (*index_scan)->records);
+    if (cost >= cutoff_cost)
+      continue;
+   
+    for (scan_ptr= selected_index_scans; *scan_ptr ; scan_ptr++)
+    {
+      /*
+        When we have range conditions for two different indexes with the same
+        beginning it does not make sense to consider both of them for index 
+        intersection if the range conditions are covered by common initial
+        components of the indexes. Actually in this case the indexes are
+        guaranteed to have the same range conditions.
+      */
+      if ((*scan_ptr)->used_key_parts == used_key_parts &&
+          same_index_prefix((*scan_ptr)->key_info, key_info, used_key_parts))
+        break;
+    }
+    if (!*scan_ptr || cost < (*scan_ptr)->index_read_cost)
+    {
+      *scan_ptr= *index_scan;
+      (*scan_ptr)->index_read_cost= cost;
+    }
+  } 
+
+  ha_rows records_in_scans= 0;
+
+  for (scan_ptr=selected_index_scans, i= 0; *scan_ptr; scan_ptr++, i++)
+  {
+    if (create_fields_bitmap(param, &(*scan_ptr)->used_fields))
+      return TRUE;
+    records_in_scans+= (*scan_ptr)->records;
+  }
+  n_search_scans= i;
+
+  if (cpk_scan && create_fields_bitmap(param, &cpk_scan->used_fields))
+    return TRUE;
+  
+  if (!(common->n_search_scans= n_search_scans))
+    return TRUE;
+    
+  common->best_uses_cpk= FALSE;
+  common->best_cost= cutoff_cost + COST_EPS;
+  common->best_length= 0;
+
+  if (!(common->best_intersect=
+	(INDEX_SCAN_INFO **) alloc_root (param->mem_root,
+                                         sizeof(INDEX_SCAN_INFO *) *
+                                         (i + test(cpk_scan != NULL)))))
+    return TRUE;
+
+  size_t calc_cost_buff_size=
+         Unique::get_cost_calc_buff_size((size_t)records_in_scans,
+                                         common->key_size,
+				         common->max_memory_size);
+  if (!(common->buff_elems= (uint *) alloc_root(param->mem_root,
+                                                calc_cost_buff_size)))
+    return TRUE;
+
+  my_qsort(selected_index_scans, n_search_scans, sizeof(INDEX_SCAN_INFO *),
+           (qsort_cmp) cmp_intersect_index_scan);
+
+  if (cpk_scan)
+  {
+    PARTIAL_INDEX_INTERSECT_INFO curr;
+    set_field_bitmap_for_index_prefix(&cpk_scan->used_fields,
+                                      cpk_scan->key_info->key_part,
+                                      cpk_scan->used_key_parts);
+    curr.common_info= common;
+    curr.intersect_fields= &cpk_scan->used_fields;
+    curr.records= cpk_scan->records;
+    curr.length= 1;
+    for (scan_ptr=selected_index_scans; *scan_ptr; scan_ptr++)
+    {
+      ha_rows scan_records= (*scan_ptr)->records;
+      ha_rows records= records_in_index_intersect_extension(&curr, *scan_ptr);
+      (*scan_ptr)->filtered_out= records >= scan_records ?
+                                   0 : scan_records-records; 
+    }
+  } 
+  else
+  {
+    for (scan_ptr=selected_index_scans; *scan_ptr; scan_ptr++)
+      (*scan_ptr)->filtered_out= 0;
+  }
+
+  return FALSE;
+}
+
+
+/*
+  On Estimation of the Number of Records in an Index Intersection 
+  ===============================================================
+
+  Consider query Q over table t. Let C be the WHERE condition of  this query,
+  and, idx1(a1_1,...,a1_k1) and idx2(a2_1,...,a2_k2) be some indexes defined
+  on table t.
+  Let rt1 and rt2 be the range trees extracted by the range optimizer from C
+  for idx1 and idx2 respectively.
+  Let #t be the estimate of the number of records in table t provided for the
+  optimizer. 
+  Let #r1 and #r2 be the estimates of the number of records in the range trees
+  rt1 and rt2, respectively, obtained by the range optimizer.
+
+  We need to get an estimate for the number of records in the index 
+  intersection of rt1 and rt2. In other words, we need to estimate the
+  cardinality of the set of records that are in both trees. Let's designate
+  this number by #r.
+
+  If we do not make any assumptions then we can only state that
+     #r<=min(#r1,#r2).
+  With this estimate we can't say that the index intersection scan will be 
+  cheaper than the cheapest index scan.
+
+  Let Rt1 and Rt2 be AND/OR conditions representing rt1 and rt2 respectively.
+  The probability that a record belongs to rt1 is sel(Rt1)=#r1/#t.
+  The probability that a record belongs to rt2 is sel(Rt2)=#r2/#t.
+
+  If we assume that the values in columns of idx1 and idx2 are independent
+  then #r/#t=sel(Rt1&Rt2)=sel(Rt1)*sel(Rt2)=(#r1/#t)*(#r2/#t).
+  So in this case we have: #r=#r1*#r2/#t.
+
+  The above assumption of independence of the columns in idx1 and idx2 means
+  that:
+  - all columns are different
+  - values from one column do not correlate with values from any other column.
+
+  We can't help with the case when columns correlate with each other.
+  Yet, if they are assumed to be uncorrelated the value of #r theoretically can
+  be evaluated. Unfortunately this evaluation, in general, is rather complex.
+
+  Let's consider two indexes idx1:(dept, manager),  idx2:(dept, building)
+  over table 'employee' and two range conditions over these indexes:
+    Rt1: dept=10 AND manager LIKE 'S%'
+    Rt2: dept=10 AND building LIKE 'L%'.
+  We can state that:
+    sel(Rt1&Rt2)=sel(dept=10)*sel(manager LIKE 'S%')*sel(building LIKE 'L%')
+    =sel(Rt1)*sel(Rt2)/sel(dept=10).
+  sel(Rt1/2_0:dept=10) can be estimated if we know the cardinality #r1_0 of
+  the range for sub-index idx1_0 (dept) of the index idx1 or the cardinality
+  #r2_0 of the same range for sub-index idx2_0 (dept) of the index idx2.
+  The current code does not make an estimate either for #r1_0, or for #r2_0,
+  but it can be adjusted to provide those numbers.
+  Alternatively, min(rec_per_key) for (dept) could be used to get an upper
+  bound for the value of sel(Rt1&Rt2). Yet this statistic is not provided
+  now.
+ 
+  Let's consider two other indexes idx1:(dept, last_name), 
+  idx2:(first_name, last_name) and two range conditions over these indexes:
+    Rt1: dept=5 AND last_name='Sm%'
+    Rt2: first_name='Robert' AND last_name='Sm%'.
+
+  sel(Rt1&Rt2)=sel(dept=5)*sel(last_name='Sm%')*sel(first_name='Robert')
+  =sel(Rt2)*sel(dept=5)
+  Here max(rec_per_key) for (dept) could be used to get an upper bound for
+  the value of sel(Rt1&Rt2).
+  
+  When the intersected indexes have different major columns, but some
+  minor column are common the picture may be more complicated.
+
+  Let's consider the following range conditions for the same indexes as in
+  the previous example:
+    Rt1: (Rt11: dept=5 AND last_name='So%') 
+         OR 
+         (Rt12: dept=7 AND last_name='Saw%')
+    Rt2: (Rt21: first_name='Robert' AND last_name='Saw%')
+         OR
+         (Rt22: first_name='Bob' AND last_name='So%')
+  Here we have:
+  sel(Rt1&Rt2)= sel(Rt11)*sel(Rt21)+sel(Rt22)*sel(dept=5) +
+                sel(Rt21)*sel(dept=7)+sel(Rt12)*sel(Rt22)
+  Now consider the range condition:
+    Rt1_0: (dept=5 OR dept=7)
+  For this condition we can state that:
+  sel(Rt1_0&Rt2)=(sel(dept=5)+sel(dept=7))*(sel(Rt21)+sel(Rt22))=
+  sel(dept=5)*sel(Rt21)+sel(dept=7)*sel(Rt21)+
+  sel(dept=5)*sel(Rt22)+sel(dept=7)*sel(Rt22)=
+  sel(dept=5)*sel(Rt21)+sel(Rt21)*sel(dept=7)+
+  sel(Rt22)*sel(dept=5)+sel(dept=7)*sel(Rt22) >
+  sel(Rt11)*sel(Rt21)+sel(Rt22)*sel(dept=5)+
+  sel(Rt21)*sel(dept=7)+sel(Rt12)*sel(Rt22) >
+  sel(Rt1 & Rt2) 
+
+  We've just demonstrated for an example what is intuitively almost obvious
+  in general. We can remove the ending parts from range trees getting less
+  selective range conditions for sub-indexes.
+  So if a component with the number k of an index idx, which is not its most
+  major component, is encountered in the index with which we intersect, we
+  can use the sub-index idx_k-1 that includes the components of idx up to the
+  (k-1)-th component and the range tree for idx_k-1 to make an upper bound
+  estimate for the number of records in the index intersection.
+  The range tree for idx_k-1 we use here is the subtree of the original range
+  tree for idx that contains only parts from the first k-1 components.
+
+  As it was mentioned above the range optimizer currently does not provide
+  an estimate for the number of records in the ranges for sub-indexes.
+  However, some reasonable upper bound estimate can be obtained.
+
+  Let's consider the following range tree:
+    Rt: (first_name='Robert' AND last_name='Saw%')
+        OR
+        (first_name='Bob' AND last_name='So%')
+  Let #r be the number of records in Rt. Let f_1 be the fan-out of column
+  last_name:
+    f_1 = rec_per_key[first_name]/rec_per_key[last_name].
+  Then the number of records in the range tree:
+    Rt_0:  (first_name='Robert' OR first_name='Bob')
+  for the sub-index (first_name) is not greater than max(#r*f_1, #t).
+  Strictly speaking, we can state only that it's not greater than 
+  max(#r*max_f_1, #t), where
+    max_f_1= max_rec_per_key[first_name]/min_rec_per_key[last_name].
+  Yet, if #r/#t is big enough (and this is the case of an index intersection,
+  because using this index range with a single index scan is cheaper than
+  the cost of the intersection when #r/#t is small) then almost safely we
+  can use here f_1 instead of max_f_1.
+
+  The above considerations can be used in future development. Now, they are
+  used partly in the function that provides a rough upper bound estimate for
+  the number of records in an index intersection that follows below.
+*/
+
+/*
+  Estimate the number of records selected by an extension of a partial
+  intersection
+  SYNOPSIS
+    records_in_index_intersect_extension()
+     curr            partial intersection plan to be extended
+     ext_index_scan  the evaluated extension of this partial plan
+
+  DESCRIPTION
+    The function provides an estimate for the number of records in the
+    intersection of the partial index intersection curr with the index
+    ext_index_scan. If the intersected indexes do not have common columns
+    then the function returns an exact estimate (assuming there are no
+    correlations between values in the columns). If the intersected indexes
+    have common columns the function returns an upper bound for the number
+    of records in the intersection provided that the intersection of curr
+    with ext_index_scan is expected to have less records than the expected
+    number of records in the partial intersection curr. In this case the
+    function also assigns the bitmap of the columns in the extended
+    intersection to ext_index_scan->used_fields.
+    If the function cannot expect that the number of records in the extended
+    intersection is less than the expected number of records #r in curr then
+    the function returns a number bigger than #r.
+
+  NOTES
+   See the comment before the description of the function that explains the
+   reasoning used by this function.
+
+  RETURN
+    The expected number of rows in the extended index intersection
+*/
+
+static
+ha_rows records_in_index_intersect_extension(PARTIAL_INDEX_INTERSECT_INFO *curr,
+                                             INDEX_SCAN_INFO *ext_index_scan)
+{
+  KEY *key_info= ext_index_scan->key_info;
+  KEY_PART_INFO* key_part= key_info->key_part;
+  uint used_key_parts= ext_index_scan->used_key_parts;
+  MY_BITMAP *used_fields= &ext_index_scan->used_fields;
+  
+  if (!curr->length)
+  {
+    /* 
+      If this is the first index in the intersection just mark the
+      fields in the used_fields bitmap and return the expected
+      number of records in the range scan for the index provided
+      by the range optimizer.
+    */ 
+    set_field_bitmap_for_index_prefix(used_fields, key_part, used_key_parts);
+    return ext_index_scan->records;
+  }
+
+  uint i;
+  bool better_selectivity= FALSE;
+  ha_rows records= curr->records;
+  MY_BITMAP *curr_intersect_fields= curr->intersect_fields; 
+  for (i= 0; i < used_key_parts; i++, key_part++)
+  {
+    if (bitmap_is_set(curr_intersect_fields, key_part->fieldnr-1))
+      break;
+  }
+  if (i)
+  {
+    ha_rows table_cardinality= curr->common_info->table_cardinality;
+    ha_rows ext_records= ext_index_scan->records;
+    if (i < used_key_parts)
+    {
+      ulong *rec_per_key= key_info->rec_per_key+i-1;
+      ulong f1= rec_per_key[0] ? rec_per_key[0] : 1;
+      ulong f2= rec_per_key[1] ? rec_per_key[1] : 1;
+      ext_records= (ha_rows) ((double) ext_records / f2 * f1);
+    }
+    if (ext_records < table_cardinality)
+    {
+      better_selectivity= TRUE;
+      records= (ha_rows) ((double) records / table_cardinality *
+			  ext_records);
+      bitmap_copy(used_fields, curr_intersect_fields);
+      key_part= key_info->key_part;
+      for (uint j= 0; j < used_key_parts; j++, key_part++)
+        bitmap_set_bit(used_fields, key_part->fieldnr-1);
+    }
+  }
+  return !better_selectivity ? records+1 :
+                               !records ? 1 : records;
+}
+
+
+/*
+  Estimate the cost of a binary search within disjoint cpk range intervals
+
+  Number of comparisons to check whether a cpk value satisfies
+  the cpk range condition is taken as log2(cpk_scan->range_count+1).
+*/
+
+static inline
+double get_cpk_filter_cost(ha_rows filtered_records, 
+                           INDEX_SCAN_INFO *cpk_scan,
+                           double compare_factor)
+{
+  return log((double) (cpk_scan->range_count+1)) / (compare_factor * M_LN2) *
+           filtered_records;
+}
+
+
+/*
+  Check whether a partial index intersection plan can be extended
+
+  SYNOPSIS
+    check_index_intersect_extension()
+     curr            partial intersection plan to be extended
+     ext_index_scan  a possible extension of this plan to be checked
+     next       OUT  the structure to be filled for the extended plan
+
+  DESCRIPTION
+    The function checks whether it makes sense to extend the index
+    intersection plan adding the index ext_index_scan, and, if this is
+    the case, the function fills in the structure for the extended plan.
+
+  RETURN
+    TRUE      if it makes sense to extend the given plan
+    FALSE     otherwise
+*/
+
+static
+bool check_index_intersect_extension(PARTIAL_INDEX_INTERSECT_INFO *curr,
+                                     INDEX_SCAN_INFO *ext_index_scan,
+                                     PARTIAL_INDEX_INTERSECT_INFO *next)
+{
+  ha_rows records;
+  ha_rows records_sent_to_unique;
+  double cost;
+  ha_rows ext_index_scan_records= ext_index_scan->records;
+  ha_rows records_filtered_out_by_cpk= ext_index_scan->filtered_out;
+  COMMON_INDEX_INTERSECT_INFO *common_info= curr->common_info;
+  double cutoff_cost= common_info->cutoff_cost;
+  uint idx= curr->length;     /* number of scans already in the partial plan */
+  next->index_read_cost= curr->index_read_cost+ext_index_scan->index_read_cost;
+  if (next->index_read_cost > cutoff_cost)
+    return FALSE; 
+
+  if ((next->in_memory= curr->in_memory))
+    next->in_memory_cost= curr->in_memory_cost;
+
+  next->intersect_fields= &ext_index_scan->used_fields;
+  next->filtered_scans= curr->filtered_scans;
+
+  records_sent_to_unique= curr->records_sent_to_unique;
+
+  next->use_cpk_filter= FALSE;
+
+  /* Calculate the cost of using a Unique object for index intersection */
+  if (idx && next->in_memory)
+  { 
+    /* 
+      All rowids received from the first scan are expected in one unique tree
+    */
+    ha_rows elems_in_tree= common_info->search_scans[0]->records-
+                           common_info->search_scans[0]->filtered_out ;
+    next->in_memory_cost+= Unique::get_search_cost(elems_in_tree,
+                                                   common_info->compare_factor)* 
+                             ext_index_scan_records;
+    cost= next->in_memory_cost;
+  }
+  else
+  {
+    uint *buff_elems= common_info->buff_elems;
+    uint key_size= common_info->key_size;
+    uint compare_factor= common_info->compare_factor;         
+    ulonglong max_memory_size= common_info->max_memory_size; 
+    
+    records_sent_to_unique+= ext_index_scan_records;
+    cost= Unique::get_use_cost(buff_elems, (size_t) records_sent_to_unique, key_size,
+                               max_memory_size, compare_factor, TRUE,
+                               &next->in_memory);
+    if (records_filtered_out_by_cpk)
+    {
+      /* Check whether using cpk filter for this scan is beneficial */
+
+      double cost2;
+      bool in_memory2;
+      ha_rows records2= records_sent_to_unique-records_filtered_out_by_cpk;
+      cost2=  Unique::get_use_cost(buff_elems, (size_t) records2, key_size,
+                                   max_memory_size, compare_factor, TRUE,
+                                   &in_memory2);
+      cost2+= get_cpk_filter_cost(ext_index_scan_records, common_info->cpk_scan,
+                                  compare_factor);
+      if (cost > cost2 + COST_EPS)
+      {
+        cost= cost2;
+        next->in_memory= in_memory2;
+        next->use_cpk_filter= TRUE;
+        records_sent_to_unique= records2;
+      }
+
+    }   
+    if (next->in_memory)
+      next->in_memory_cost= cost;
+  }
+
+  if (next->use_cpk_filter)
+  {
+    next->filtered_scans.set_bit(ext_index_scan->keynr);
+    bitmap_union(&ext_index_scan->used_fields,
+                 &common_info->cpk_scan->used_fields);
+  }
+  next->records_sent_to_unique= records_sent_to_unique;
+       
+  records= records_in_index_intersect_extension(curr, ext_index_scan);
+  if (idx && records > curr->records)
+    return FALSE;
+  if (next->use_cpk_filter && curr->filtered_scans.is_clear_all())
+    records-= records_filtered_out_by_cpk;
+  next->records= records;
+
+  cost+= next->index_read_cost;
+  if (cost >= cutoff_cost)
+    return FALSE;
+
+  cost+= get_sweep_read_cost(common_info->param, records);
+
+  next->cost= cost;
+  next->length= curr->length+1;
+
+  return TRUE;
+}
+
+
+/*
+  Search for the cheapest extensions of range scans used to access a table
+
+  SYNOPSIS
+    find_index_intersect_best_extension()
+      curr        partial intersection to evaluate all possible extensions for
+
+  DESCRIPTION
+    The function tries to extend the partial plan curr in all possible ways
+    to look for the cheapest index intersection whose cost is less than the
+    cut off value set in curr->common_info->cutoff_cost.
+*/
+
+static 
+void find_index_intersect_best_extension(PARTIAL_INDEX_INTERSECT_INFO *curr)
+{
+  PARTIAL_INDEX_INTERSECT_INFO next;
+  COMMON_INDEX_INTERSECT_INFO *common_info= curr->common_info;
+  INDEX_SCAN_INFO **index_scans= common_info->search_scans;
+  uint idx= curr->length;
+  INDEX_SCAN_INFO **rem_first_index_scan_ptr= &index_scans[idx];
+  double cost= curr->cost;
+  /* Record curr as the new best plan if it beats the best cost so far */
+  if (cost + COST_EPS < common_info->best_cost)
+  {
+    common_info->best_cost= cost;
+    common_info->best_length= curr->length;
+    common_info->best_records= curr->records;
+    common_info->filtered_scans= curr->filtered_scans;
+    /* common_info->best_uses_cpk <=> at least one scan uses a cpk filter */
+    common_info->best_uses_cpk= !curr->filtered_scans.is_clear_all();
+    uint sz= sizeof(INDEX_SCAN_INFO *) * curr->length;
+    memcpy(common_info->best_intersect, common_info->search_scans, sz);
+    common_info->cutoff_cost= cost;
+  }   
+  /* A NULL entry ends search_scans: no scan is left to extend curr with */
+  if (!(*rem_first_index_scan_ptr))
+    return;  
+
+  next.common_info= common_info;
+  /* Swap each remaining scan into slot idx, try it, then swap it back */
+  INDEX_SCAN_INFO *rem_first_index_scan= *rem_first_index_scan_ptr;
+  for (INDEX_SCAN_INFO **index_scan_ptr= rem_first_index_scan_ptr;
+       *index_scan_ptr; index_scan_ptr++)
+  {
+    *rem_first_index_scan_ptr= *index_scan_ptr;
+    *index_scan_ptr= rem_first_index_scan;
+    if (check_index_intersect_extension(curr, *rem_first_index_scan_ptr, &next))
+      find_index_intersect_best_extension(&next);
+    *index_scan_ptr= *rem_first_index_scan_ptr;
+    *rem_first_index_scan_ptr= rem_first_index_scan;
+  }
+}
+
+
+/*
+  Get the plan of the best intersection of range scans used to access a table
+
+  SYNOPSIS
+    get_best_index_intersect()
+      param         common info about index ranges
+      tree          tree of ranges for indexes that can be intersected
+      read_time     cut off value for the evaluated plans
+
+  DESCRIPTION
+    The function looks for the cheapest index intersection of the range
+    scans to access a table. The info about the ranges for all indexes
+    is provided by the range optimizer and is passed through the
+    parameters param and tree. Any plan whose cost is greater than read_time
+    is rejected.
+    After the best index intersection is found the function constructs
+    the structure that manages the execution by the chosen plan.
+
+  RETURN
+    Pointer to the generated execution structure if a success,
+    0 - otherwise.
+*/
+
+static
+TRP_INDEX_INTERSECT *get_best_index_intersect(PARAM *param, SEL_TREE *tree,
+                                              double read_time)
+{
+  uint i;
+  uint count;
+  TRP_RANGE **cur_range;
+  TRP_RANGE **range_scans;
+  INDEX_SCAN_INFO *index_scan;
+  COMMON_INDEX_INTERSECT_INFO common;
+  PARTIAL_INDEX_INTERSECT_INFO init;
+  TRP_INDEX_INTERSECT *intersect_trp= NULL;
+  TABLE *table= param->table;
+  
+  
+  DBUG_ENTER("get_best_index_intersect");
+  /* Set up the search state; a non-zero result means there is no plan */
+  if (prepare_search_best_index_intersect(param, tree, &common, &init,
+                                          read_time))
+    DBUG_RETURN(NULL);
+  /* Search the extensions of the initial partial plan 'init' */
+  find_index_intersect_best_extension(&init);
+  /* Give up on a plan with at most one scan unless it uses a cpk filter */
+  if (common.best_length <= 1 && !common.best_uses_cpk)
+    DBUG_RETURN(NULL);
+  /* Put the cpk scan in front of the chosen scans */
+  if (common.best_uses_cpk)
+  {
+    memmove((char *) (common.best_intersect+1), (char *) common.best_intersect,
+            sizeof(INDEX_SCAN_INFO *) * common.best_length);
+    common.best_intersect[0]= common.cpk_scan;
+    common.best_length++;
+  }
+
+  count= common.best_length;
+  /* Allocate the array of range plans, one slot per chosen scan */
+  if (!(range_scans= (TRP_RANGE**)alloc_root(param->mem_root,
+                                            sizeof(TRP_RANGE *)*
+                                            count)))
+    DBUG_RETURN(NULL);
+  /* Build a TRP_RANGE per chosen scan (a failed allocation skips the scan) */
+  for (i= 0, cur_range= range_scans; i < count; i++)
+  {
+    index_scan= common.best_intersect[i];
+    if ((*cur_range= new (param->mem_root) TRP_RANGE(index_scan->sel_arg,
+                                                     index_scan->idx, 0)))
+    {  
+      TRP_RANGE *trp= *cur_range;  
+      trp->read_cost= index_scan->index_read_cost;  
+      trp->records= index_scan->records;        
+      trp->is_ror= FALSE;
+      trp->mrr_buf_size= 0;
+      table->intersect_keys.set_bit(index_scan->keynr);
+      cur_range++;
+    }
+  }
+  /* Also mark scans for which same_index_prefix() matches a chosen scan */
+  count= tree->index_scans_end - tree->index_scans;
+  for (i= 0; i < count; i++)
+  {
+    index_scan= tree->index_scans[i]; 
+    if (!table->intersect_keys.is_set(index_scan->keynr))
+    {
+      for (uint j= 0; j < common.best_length; j++)
+      {
+	INDEX_SCAN_INFO *scan= common.best_intersect[j];
+        if (same_index_prefix(index_scan->key_info, scan->key_info,
+                              scan->used_key_parts))
+	{
+          table->intersect_keys.set_bit(index_scan->keynr);
+          break;
+        } 
+      }
+    }
+  }
+  /* Assemble the resulting plan from the best cost, records and scans */
+  if ((intersect_trp= new (param->mem_root)TRP_INDEX_INTERSECT))
+  {
+    intersect_trp->read_cost= common.best_cost;
+    intersect_trp->records= common.best_records;
+    intersect_trp->range_scans= range_scans;
+    intersect_trp->range_scans_end= cur_range;
+    intersect_trp->filtered_scans= common.filtered_scans;
+  }
+  DBUG_RETURN(intersect_trp);
+}
+
+
+typedef struct st_ror_scan_info : INDEX_SCAN_INFO
+{ 
 } ROR_SCAN_INFO;
 
 
@@ -4000,7 +5691,7 @@ ROR_SCAN_INFO *make_ror_scan(const PARAM *param, int idx, SEL_ARG *sel_arg)
   ror_scan->key_rec_length= (param->table->key_info[keynr].key_length +
                              param->table->file->ref_length);
   ror_scan->sel_arg= sel_arg;
-  ror_scan->records= param->table->quick_rows[keynr];
+  ror_scan->records= param->quick_rows[keynr];
 
   if (!(bitmap_buf= (my_bitmap_map*) alloc_root(param->mem_root,
                                                 param->fields_bitmap_size)))
@@ -4020,8 +5711,7 @@ ROR_SCAN_INFO *make_ror_scan(const PARAM *param, int idx, SEL_ARG *sel_arg)
       bitmap_set_bit(&ror_scan->covered_fields, key_part->fieldnr-1);
   }
   ror_scan->index_read_cost=
-    param->table->file->keyread_time(ror_scan->keynr, 1,
-                                     param->table->quick_rows[ror_scan->keynr]);
+    param->table->file->keyread_time(ror_scan->keynr, 1, ror_scan->records);
   DBUG_RETURN(ror_scan);
 }
 
@@ -4306,7 +5996,7 @@ static double ror_scan_selectivity(const ROR_INTERSECT_INFO *info,
   }
   if (!prev_covered)
   {
-    double tmp= rows2double(info->param->table->quick_rows[scan->keynr]) /
+    double tmp= rows2double(info->param->quick_rows[scan->keynr]) /
                 rows2double(prev_records);
     DBUG_PRINT("info", ("Selectivity multiplier: %g", tmp));
     selectivity_mult *= tmp;
@@ -4385,7 +6075,7 @@ static bool ror_intersect_add(ROR_INTERSECT_INFO *info,
   }
   else
   {
-    info->index_records += info->param->table->quick_rows[ror_scan->keynr];
+    info->index_records += info->param->quick_rows[ror_scan->keynr];
     info->index_scan_costs += ror_scan->index_read_cost;
     bitmap_union(&info->covered_fields, &ror_scan->covered_fields);
     if (!info->is_covering && bitmap_is_subset(&info->param->needed_fields,
@@ -4636,7 +6326,7 @@ TRP_ROR_INTERSECT *get_best_ror_intersect(const PARAM *param, SEL_TREE *tree,
 /*
   Get best covering ROR-intersection.
   SYNOPSIS
-    get_best_covering_ror_intersect()
+    get_best_covering_ror_intersect()
       param     Parameter from test_quick_select function.
       tree      SEL_TREE with sets of intervals for different keys.
       read_time Don't return table read plans with cost > read_time.
@@ -4808,11 +6498,12 @@ static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree,
                                        bool update_tbl_stats,
                                        double read_time)
 {
-  int idx;
+  uint idx;
   SEL_ARG **key,**end, **key_to_read= NULL;
   ha_rows UNINIT_VAR(best_records);              /* protected by key_to_read */
+  uint    UNINIT_VAR(best_mrr_flags),            /* protected by key_to_read */
+          UNINIT_VAR(best_buf_size);             /* protected by key_to_read */
   TRP_RANGE* read_plan= NULL;
-  bool pk_is_clustered= param->table->file->primary_key_is_clustered();
   DBUG_ENTER("get_key_scans_params");
   /*
     Note that there may be trees that have type SEL_TREE::KEY but contain no
@@ -4823,14 +6514,23 @@ static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree,
                                       "tree scans"););
   tree->ror_scans_map.clear_all();
   tree->n_ror_scans= 0;
-  for (idx= 0,key=tree->keys, end=key+param->keys;
-       key != end ;
-       key++,idx++)
+  tree->index_scans= 0;
+  if (!tree->keys_map.is_clear_all())
+  {
+    tree->index_scans=
+      (INDEX_SCAN_INFO **) alloc_root(param->mem_root,
+                                      sizeof(INDEX_SCAN_INFO *) * param->keys);
+  }
+  tree->index_scans_end= tree->index_scans;                                                  
+  for (idx= 0,key=tree->keys, end=key+param->keys; key != end; key++,idx++)
   {
-    ha_rows found_records;
-    double found_read_time;
     if (*key)
     {
+      ha_rows found_records;
+      COST_VECT cost;
+      double found_read_time;
+      uint mrr_flags, buf_size;
+      INDEX_SCAN_INFO *index_scan;
       uint keynr= param->real_keynr[idx];
       if ((*key)->type == SEL_ARG::MAYBE_KEY ||
           (*key)->maybe_flag)
@@ -4839,48 +6539,37 @@ static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree,
       bool read_index_only= index_read_must_be_used ? TRUE :
                             (bool) param->table->covering_keys.is_set(keynr);
 
-      found_records= check_quick_select(param, idx, *key, update_tbl_stats);
-      if (param->is_ror_scan)
+      found_records= check_quick_select(param, idx, read_index_only, *key,
+                                        update_tbl_stats, &mrr_flags,
+                                        &buf_size, &cost);
+
+      if (found_records != HA_POS_ERROR && tree->index_scans &&
+          (index_scan= (INDEX_SCAN_INFO *)alloc_root(param->mem_root,
+						     sizeof(INDEX_SCAN_INFO))))
+      {
+        index_scan->idx= idx;
+        index_scan->keynr= keynr;
+        index_scan->key_info= &param->table->key_info[keynr];
+        index_scan->used_key_parts= param->max_key_part+1;
+        index_scan->range_count= param->range_count;
+        index_scan->records= found_records;
+        index_scan->sel_arg= *key;
+        *tree->index_scans_end++= index_scan;
+      }        
+      if ((found_records != HA_POS_ERROR) && param->is_ror_scan)
       {
         tree->n_ror_scans++;
         tree->ror_scans_map.set_bit(idx);
       }
-      double cpu_cost= (double) found_records / TIME_FOR_COMPARE;
-      if (found_records != HA_POS_ERROR && found_records > 2 &&
-          read_index_only &&
-          (param->table->file->index_flags(keynr, param->max_key_part,1) &
-           HA_KEYREAD_ONLY) &&
-          !(pk_is_clustered && keynr == param->table->s->primary_key))
-      {
-        /*
-          We can resolve this by only reading through this key. 
-          0.01 is added to avoid races between range and 'index' scan.
-        */
-        found_read_time= param->table->file->keyread_time(keynr, 1, found_records) +
-                         cpu_cost + 0.01;
-      }
-      else
-      {
-        /*
-          cost(read_through_index) = cost(disk_io) + cost(row_in_range_checks)
-          The row_in_range check is in QUICK_RANGE_SELECT::cmp_next function.
-        */
-	found_read_time= param->table->file->read_time(keynr,
-                                                       param->range_count,
-                                                       found_records) +
-			 cpu_cost + 0.01;
-      }
-      DBUG_PRINT("info",("key %s: found_read_time: %g (cur. read_time: %g)",
-                         param->table->key_info[keynr].name, found_read_time,
-                         read_time));
-
-      if (read_time > found_read_time && found_records != HA_POS_ERROR)
+      if (found_records != HA_POS_ERROR &&
+          read_time > (found_read_time= cost.total_cost()))
       {
         read_time=    found_read_time;
         best_records= found_records;
         key_to_read=  key;
+        best_mrr_flags= mrr_flags;
+        best_buf_size=  buf_size;
       }
-
     }
   }
 
@@ -4889,11 +6578,13 @@ static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree,
   if (key_to_read)
   {
     idx= key_to_read - tree->keys;
-    if ((read_plan= new (param->mem_root) TRP_RANGE(*key_to_read, idx)))
+    if ((read_plan= new (param->mem_root) TRP_RANGE(*key_to_read, idx,
+                                                    best_mrr_flags)))
     {
       read_plan->records= best_records;
       read_plan->is_ror= tree->ror_scans_map.is_set(idx);
       read_plan->read_cost= read_time;
+      read_plan->mrr_buf_size= best_buf_size;
       DBUG_PRINT("info",
                  ("Returning range plan for key %s, cost %g, records %lu",
                   param->table->key_info[param->real_keynr[idx]].name,
@@ -4934,6 +6625,36 @@ QUICK_SELECT_I *TRP_INDEX_MERGE::make_quick(PARAM *param,
   return quick_imerge;
 }
 
+/* Build the quick select that executes this plan; returns NULL on failure */
+QUICK_SELECT_I *TRP_INDEX_INTERSECT::make_quick(PARAM *param,
+                                                bool retrieve_full_rows,
+                                                MEM_ROOT *parent_alloc)
+{
+  QUICK_INDEX_INTERSECT_SELECT *quick_intersect;
+  QUICK_RANGE_SELECT *quick;
+  /* retrieve_full_rows and parent_alloc are not used by this function */
+  if (!(quick_intersect= new QUICK_INDEX_INTERSECT_SELECT(param->thd, param->table)))
+    return NULL;
+  /* Copy the plan estimates, then add one quick range select per scan */
+  quick_intersect->records= records;
+  quick_intersect->read_time= read_cost;
+  quick_intersect->filtered_scans= filtered_scans;
+  for (TRP_RANGE **range_scan= range_scans; range_scan != range_scans_end;
+       range_scan++)
+  {
+    if (!(quick= (QUICK_RANGE_SELECT*)
+          ((*range_scan)->make_quick(param, FALSE, &quick_intersect->alloc)))||
+        quick_intersect->push_quick_back(quick))
+    {
+      delete quick;              /* NULL here if make_quick() itself failed */
+      delete quick_intersect;
+      return NULL;
+    }
+  }
+  return quick_intersect;
+}
+
+
 QUICK_SELECT_I *TRP_ROR_INTERSECT::make_quick(PARAM *param,
                                               bool retrieve_full_rows,
                                               MEM_ROOT *parent_alloc)
@@ -4956,7 +6677,9 @@ QUICK_SELECT_I *TRP_ROR_INTERSECT::make_quick(PARAM *param,
     for (; first_scan != last_scan;++first_scan)
     {
       if (!(quick= get_quick_select(param, (*first_scan)->idx,
-                                    (*first_scan)->sel_arg, alloc)) ||
+                                    (*first_scan)->sel_arg,
+                                    HA_MRR_USE_DEFAULT_IMPL | HA_MRR_SORTED,
+                                    0, alloc)) ||
           quick_intrsect->push_quick_back(alloc, quick))
       {
         delete quick_intrsect;
@@ -4966,7 +6689,9 @@ QUICK_SELECT_I *TRP_ROR_INTERSECT::make_quick(PARAM *param,
     if (cpk_scan)
     {
       if (!(quick= get_quick_select(param, cpk_scan->idx,
-                                    cpk_scan->sel_arg, alloc)))
+                                    cpk_scan->sel_arg,
+                                    HA_MRR_USE_DEFAULT_IMPL | HA_MRR_SORTED,
+                                    0, alloc)))
       {
         delete quick_intrsect;
         DBUG_RETURN(NULL);
@@ -5446,7 +7171,7 @@ static SEL_TREE *get_mm_tree(RANGE_OPT_PARAM *param,COND *cond)
     DBUG_RETURN(tree);
   }
   /* Here when simple cond */
-  if (cond->const_item())
+  if (cond->const_item() && !cond->is_expensive())
   {
     /*
       During the cond->val_int() evaluation we can come across a subselect 
@@ -6062,13 +7787,138 @@ sel_add(SEL_ARG *key1,SEL_ARG *key2)
   return root;
 }
 
-#define CLONE_KEY1_MAYBE 1
-#define CLONE_KEY2_MAYBE 2
-#define swap_clone_flag(A) ((A & 1) << 1) | ((A & 2) >> 1)
 
+/* 
+  Build a range tree for the conjunction of the range parts of two trees
 
-static SEL_TREE *
-tree_and(RANGE_OPT_PARAM *param,SEL_TREE *tree1,SEL_TREE *tree2)
+  SYNOPSIS
+    and_range_trees()
+      param           Context info for the operation
+      tree1           SEL_TREE for the first conjunct          
+      tree2           SEL_TREE for the second conjunct
+      result          SEL_TREE for the result
+
+  DESCRIPTION
+    This function takes range parts of two trees tree1 and tree2 and builds
+    a range tree for the conjunction of the formulas that these two range parts
+    represent.
+    More exactly: 
+    if the range part of tree1 represents the normalized formula 
+      R1_1 AND ... AND R1_k,
+    and the range part of tree2 represents the normalized formula
+      R2_1 AND ... AND R2_k,
+    then the range part of the result represents the formula:
+     RT = R_1 AND ... AND R_k, where R_i=(R1_i AND R2_i) for each i from [1..k]
+
+    The function assumes that tree1 is never equal to tree2. At the same
+    time the tree result can be the same as tree1 (but never as tree2).
+    If result==tree1 then RT replaces the range part of tree1 leaving
+    imerges as they are.
+    If result!=tree1 then it is assumed that the SEL_ARG trees in tree1 and
+    tree2 should be preserved. Otherwise they can be destroyed.
+
+  RETURN
+    1    if the type of the result tree is SEL_TREE::IMPOSSIBLE
+    0    otherwise
+*/
+
+static
+int and_range_trees(RANGE_OPT_PARAM *param, SEL_TREE *tree1, SEL_TREE *tree2,
+                    SEL_TREE *result)
+{
+  DBUG_ENTER("and_range_trees");       /* tag must match the function name */
+  key_map  result_keys;                /* keys with a non-NULL anded SEL_ARG */
+  result_keys.clear_all();
+  key_map anded_keys= tree1->keys_map; /* tree1 keys merged with tree2 keys */
+  anded_keys.merge(tree2->keys_map);
+  int key_no;
+  key_map::Iterator it(anded_keys);
+  while ((key_no= it++) != key_map::Iterator::BITMAP_END)
+  {
+    uint flag=0;
+    SEL_ARG *key1= tree1->keys[key_no];
+    SEL_ARG *key2= tree2->keys[key_no];
+    if (key1 && !key1->simple_key())
+      flag|= CLONE_KEY1_MAYBE;
+    if (key2 && !key2->simple_key())
+      flag|=CLONE_KEY2_MAYBE;
+    if (result != tree1)               /* inputs are to be preserved */
+    { 
+      if (key1)
+        key1->incr_refs();
+      if (key2)
+        key2->incr_refs();
+    }
+    SEL_ARG *key;
+    if ((result->keys[key_no]= key =key_and(param, key1, key2, flag)))
+    {
+      if (key->type == SEL_ARG::IMPOSSIBLE)  /* key is non-NULL in here */
+      {
+        result->type= SEL_TREE::IMPOSSIBLE;
+        DBUG_RETURN(1);                /* result->keys_map is not updated */
+      }
+      result_keys.set_bit(key_no);
+#ifdef EXTRA_DEBUG
+      if (param->alloced_sel_args < SEL_ARG::MAX_SEL_ARGS) 
+        key->test_use_count(key);
+#endif
+    }
+  }
+  result->keys_map= result_keys;
+  DBUG_RETURN(0);
+}
+  
+
+/*
+  Build a SEL_TREE for a conjunction out of such trees for the conjuncts
+
+  SYNOPSIS
+    tree_and()
+      param           Context info for the operation
+      tree1           SEL_TREE for the first conjunct          
+      tree2           SEL_TREE for the second conjunct
+
+  DESCRIPTION
+    This function builds a tree for the formula (A AND B) out of the trees
+    tree1 and tree2 that has been built for the formulas A and B respectively.
+
+    In a general case
+      tree1 represents the formula RT1 AND MT1,
+        where RT1 = R1_1 AND ... AND R1_k1, MT1=M1_1 AND ... AND M1_l1;
+      tree2 represents the formula RT2 AND MT2 
+        where RT2 = R2_1 AND ... AND R2_k2, MT2=M2_1 AND ... AND M2_l2.
+
+    The result tree will represent the formula of the following structure:
+      RT AND MT1 AND MT2 AND RT1MT2 AND RT2MT1, such that
+        RT is a tree obtained by range intersection of trees tree1 and tree2,
+        RT1MT2 = RT1M2_1 AND ... AND RT1M2_l2,
+        RT2MT1 = RT2M1_1 AND ... AND RT2M1_l1,
+        where RT1M2_i (i=1,...,l2) is the result of the pushdown operation
+        of range tree RT1 into imerge M2_i, while RT2M1_j (j=1,...,l1) is the
+        result of the pushdown operation of range tree RT2 into imerge M1_j.
+
+    RT1MT2/RT2MT1 is empty if MT2/MT1 is empty.
+
+    The range intersection of two range trees is produced by the function
+    and_range_trees. The pushdown of a range tree to an imerge is performed
+    by the function imerge_list_and_tree. This function may produce imerges
+    containing only one range tree. Such trees are intersected with RT and
+    the result of intersection is returned as the range part of the result
+    tree, while the corresponding imerges are removed altogether from its
+    imerge part.
+    
+  NOTE.
+    The pushdown operation of range trees into imerges is needed to be able
+    to construct valid imerges for the condition like this:
+      key1_p1=c1 AND (key1_p2 BETWEEN c21 AND c22 OR key2 < c2)
+
+  RETURN
+    The result tree, if a success
+    0 - otherwise.        
+*/
+
+static 
+SEL_TREE *tree_and(RANGE_OPT_PARAM *param, SEL_TREE *tree1, SEL_TREE *tree2)
 {
   DBUG_ENTER("tree_and");
   if (!tree1)
@@ -6090,87 +7940,216 @@ tree_and(RANGE_OPT_PARAM *param,SEL_TREE *tree1,SEL_TREE *tree2)
     tree1->type=SEL_TREE::KEY_SMALLER;
     DBUG_RETURN(tree1);
   }
-  key_map  result_keys;
-  result_keys.clear_all();
-  
-  /* Join the trees key per key */
-  SEL_ARG **key1,**key2,**end;
-  for (key1= tree1->keys,key2= tree2->keys,end=key1+param->keys ;
-       key1 != end ; key1++,key2++)
+
+  if (!tree1->merges.is_empty())
+    imerge_list_and_tree(param, &tree1->merges, tree2);
+  if (!tree2->merges.is_empty())
+    imerge_list_and_tree(param, &tree2->merges, tree1);
+  if (and_range_trees(param, tree1, tree2, tree1))
+    DBUG_RETURN(tree1);
+  imerge_list_and_list(&tree1->merges, &tree2->merges);
+  eliminate_single_tree_imerges(param, tree1);
+  DBUG_RETURN(tree1);
+}
+
+
+/*
+  Eliminate single tree imerges in a SEL_TREE object
+
+  SYNOPSIS
+    eliminate_single_tree_imerges()
+      param      Context info for the function
+      tree       SEL_TREE where single tree imerges are to be eliminated
+
+  DESCRIPTION
+    For each imerge in 'tree' that contains only one disjunct tree, i.e.
+    for any imerge of the form m=rt, the function ANDs rt with the range
+    part of 'tree' (by calling tree_and), keeps the result of anding and
+    removes imerge m from the merge part of 'tree'.
+
+  RETURN VALUE
+    none          
+*/
+
+static
+void eliminate_single_tree_imerges(RANGE_OPT_PARAM *param, SEL_TREE *tree)
+{
+  SEL_IMERGE *imerge;
+  List<SEL_IMERGE> merges= tree->merges;
+  List_iterator<SEL_IMERGE> it(merges);
+  tree->merges.empty();
+  while ((imerge= it++))
   {
-    uint flag=0;
-    if (*key1 || *key2)
-    {
-      if (*key1 && !(*key1)->simple_key())
-	flag|=CLONE_KEY1_MAYBE;
-      if (*key2 && !(*key2)->simple_key())
-	flag|=CLONE_KEY2_MAYBE;
-      *key1=key_and(param, *key1, *key2, flag);
-      if (*key1 && (*key1)->type == SEL_ARG::IMPOSSIBLE)
-      {
-	tree1->type= SEL_TREE::IMPOSSIBLE;
-        DBUG_RETURN(tree1);
-      }
-      result_keys.set_bit(key1 - tree1->keys);
-#ifdef EXTRA_DEBUG
-        if (*key1 && param->alloced_sel_args < SEL_ARG::MAX_SEL_ARGS) 
-          (*key1)->test_use_count(*key1);
-#endif
+    if (imerge->trees+1 == imerge->trees_next)
+    {
+      tree= tree_and(param, tree, *imerge->trees);
+      it.remove();
     }
   }
-  tree1->keys_map= result_keys;
-  /* dispose index_merge if there is a "range" option */
-  if (!result_keys.is_clear_all())
-  {
-    tree1->merges.empty();
-    DBUG_RETURN(tree1);
-  }
+  tree->merges= merges;
+} 
 
-  /* ok, both trees are index_merge trees */
-  imerge_list_and_list(&tree1->merges, &tree2->merges);
-  DBUG_RETURN(tree1);
+
+/*
+  For two trees check that there are indexes with ranges in both of them
+
+  SYNOPSIS
+    sel_trees_have_common_keys()
+      tree1           SEL_TREE for the first tree
+      tree2           SEL_TREE for the second tree
+      common_keys OUT bitmap of all indexes with ranges in both trees
+
+  DESCRIPTION
+    For two trees tree1 and tree2 the function checks if there are indexes
+    in their range parts such that SEL_ARG trees are defined for them in the
+    range parts of both trees. The function returns the bitmap of such
+    indexes in the parameter common_keys.
+
+  RETURN
+    TRUE    if there are such indexes (common_keys is not empty)
+    FALSE   otherwise
+*/
+
+static
+bool sel_trees_have_common_keys(SEL_TREE *tree1, SEL_TREE *tree2, 
+                                key_map *common_keys)
+{
+  *common_keys= tree1->keys_map;            /* start from the keys of tree1 */
+  common_keys->intersect(tree2->keys_map);  /* intersect with keys of tree2 */
+  return !common_keys->is_clear_all();
 }
 
 
 /*
-  Check if two SEL_TREES can be combined into one (i.e. a single key range
-  read can be constructed for "cond_of_tree1 OR cond_of_tree2" ) without
-  using index_merge.
+  Check whether range parts of two trees can be ored for some indexes
+
+  SYNOPSIS
+    sel_trees_can_be_ored()
+      param              Context info for the function
+      tree1              SEL_TREE for the first tree
+      tree2              SEL_TREE for the second tree
+      common_keys    OUT bitmap of all indexes whose SEL_ARG trees can be
+                         ored (any value passed in is overwritten)
+
+  DESCRIPTION
+    For two trees tree1 and tree2 the function first fills the bitmap
+    common_keys, by calling sel_trees_have_common_keys, with bits for the
+    indexes that have SEL_ARG trees in the range parts of both trees. Then
+    it checks for which of these indexes the SEL_ARG trees can be ored. Two
+    SEL_ARG trees for the same index can be ored if they refer to the same
+    key part (SEL_ARG::part). If the SEL_ARG trees for an index cannot be
+    ored the function clears the bit for this index in the bitmap common_keys.
+
+    The function asserts (DBUG_ASSERT) that every index marked in common_keys
+    has SEL_ARG trees in both tree1 and tree2.
+
+  NOTE
+    The bitmap common_keys is built here via sel_trees_have_common_keys, so
+    callers do not have to call that function beforehand.
+
+  RETURN
+    TRUE    if there are indexes for which SEL_ARG trees can be ored 
+    FALSE   otherwise
 */
 
-bool sel_trees_can_be_ored(SEL_TREE *tree1, SEL_TREE *tree2, 
-                           RANGE_OPT_PARAM* param)
+static
+bool sel_trees_can_be_ored(RANGE_OPT_PARAM* param,
+                           SEL_TREE *tree1, SEL_TREE *tree2, 
+                           key_map *common_keys)
 {
-  key_map common_keys= tree1->keys_map;
   DBUG_ENTER("sel_trees_can_be_ored");
-  common_keys.intersect(tree2->keys_map);
+  if (!sel_trees_have_common_keys(tree1, tree2, common_keys))
+    DBUG_RETURN(FALSE);
+  int key_no;
+  key_map::Iterator it(*common_keys);
+  while ((key_no= it++) != key_map::Iterator::BITMAP_END)
+  {
+    DBUG_ASSERT(tree1->keys[key_no] && tree2->keys[key_no]);
+    /* Trees have a common key, check if they refer to the same key part */
+    if (tree1->keys[key_no]->part != tree2->keys[key_no]->part)
+      common_keys->clear_bit(key_no);
+  }
+  DBUG_RETURN(!common_keys->is_clear_all());
+}
 
-  if (common_keys.is_clear_all())
+/*
+  Check whether range parts of two trees must be ored for some indexes
+
+  SYNOPSIS
+    sel_trees_must_be_ored()
+      param              Context info for the function
+      tree1              SEL_TREE for the first tree
+      tree2              SEL_TREE for the second tree
+      oredable_keys      bitmap of SEL_ARG trees that can be ored
+
+  DESCRIPTION
+    For two trees tree1 and tree2 the function checks whether they must be
+    ored. The function assumes that the bitmap oredable_keys contains bits for
+    those corresponding pairs of SEL_ARG trees from tree1 and tree2 that can
+    be ored.
+    We believe that tree1 and tree2 must be ored if any pair of SEL_ARG trees
+    r1 and r2, such that r1 is from tree1 and r2 is from tree2 and both
+    of them are marked in oredable_keys, can be merged.
+    
+  NOTE
+    The function sel_trees_must_be_ored as a rule is used in pair with the
+    function sel_trees_can_be_ored.
+
+  RETURN
+    TRUE    if there are indexes for which SEL_ARG trees must be ored 
+    FALSE   otherwise
+*/
+
+static
+bool sel_trees_must_be_ored(RANGE_OPT_PARAM* param,
+                            SEL_TREE *tree1, SEL_TREE *tree2,
+                            key_map oredable_keys)
+{
+  key_map tmp;
+  DBUG_ENTER("sel_trees_must_be_ored");
+
+  tmp= tree1->keys_map;
+  tmp.merge(tree2->keys_map);
+  tmp.subtract(oredable_keys);
+  if (!tmp.is_clear_all())
     DBUG_RETURN(FALSE);
 
-  /* trees have a common key, check if they refer to same key part */
-  SEL_ARG **key1,**key2;
-  for (uint key_no=0; key_no < param->keys; key_no++)
+  int idx1, idx2;
+  key_map::Iterator it1(oredable_keys);
+  while ((idx1= it1++) != key_map::Iterator::BITMAP_END)
   {
-    if (common_keys.is_set(key_no))
+    KEY_PART *key1_init= param->key[idx1]+tree1->keys[idx1]->part;
+    KEY_PART *key1_end= param->key[idx1]+tree1->keys[idx1]->max_part_no;
+    key_map::Iterator it2(oredable_keys);
+    while ((idx2= it2++) != key_map::Iterator::BITMAP_END)
     {
-      key1= tree1->keys + key_no;
-      key2= tree2->keys + key_no;
-      if ((*key1)->part == (*key2)->part)
-      {
-        DBUG_RETURN(TRUE);
+      if (idx2 <= idx1)
+        continue;
+      
+      KEY_PART *key2_init= param->key[idx2]+tree2->keys[idx2]->part;
+      KEY_PART *key2_end= param->key[idx2]+tree2->keys[idx2]->max_part_no;
+      KEY_PART *part1, *part2;
+      for (part1= key1_init, part2= key2_init;
+           part1 < key1_end && part2 < key2_end;
+           part1++, part2++)
+      { 
+        if (!part1->field->eq(part2->field))
+          DBUG_RETURN(FALSE);
       }
     }
   }
-  DBUG_RETURN(FALSE);
-}
+      
+  DBUG_RETURN(TRUE);
+}  
 
 
 /*
-  Remove the trees that are not suitable for record retrieval.
+  Remove the trees that are not suitable for record retrieval
+
   SYNOPSIS
-    param  Range analysis parameter
-    tree   Tree to be processed, tree->type is KEY or KEY_SMALLER
+    remove_nonrange_trees()
+      param  Context info for the function
+      tree   Tree to be processed, tree->type is KEY or KEY_SMALLER
  
   DESCRIPTION
     This function walks through tree->keys[] and removes the SEL_ARG* trees
@@ -6181,41 +8160,36 @@ bool sel_trees_can_be_ored(SEL_TREE *tree1, SEL_TREE *tree2,
 
     A SEL_ARG* tree cannot be used to construct quick select if it has
     tree->part != 0. (e.g. it could represent "keypart2 < const").
-
-    WHY THIS FUNCTION IS NEEDED
     
     Normally we allow construction of SEL_TREE objects that have SEL_ARG
-    trees that do not allow quick range select construction. For example for
-    " keypart1=1 AND keypart2=2 " the execution will proceed as follows:
+    trees that do not allow quick range select construction.
+    For example:
+    for " keypart1=1 AND keypart2=2 " the execution will proceed as follows:
     tree1= SEL_TREE { SEL_ARG{keypart1=1} }
     tree2= SEL_TREE { SEL_ARG{keypart2=2} } -- can't make quick range select
                                                from this
     call tree_and(tree1, tree2) -- this joins SEL_ARGs into a usable SEL_ARG
                                    tree.
-    
-    There is an exception though: when we construct index_merge SEL_TREE,
-    any SEL_ARG* tree that cannot be used to construct quick range select can
-    be removed, because current range analysis code doesn't provide any way
-    that tree could be later combined with another tree.
-    Consider an example: we should not construct
-    st1 = SEL_TREE { 
-      merges = SEL_IMERGE { 
-                            SEL_TREE(t.key1part1 = 1), 
-                            SEL_TREE(t.key2part2 = 2)   -- (*)
-                          } 
-                   };
-    because 
-     - (*) cannot be used to construct quick range select, 
-     - There is no execution path that would cause (*) to be converted to 
-       a tree that could be used.
-
-    The latter is easy to verify: first, notice that the only way to convert
-    (*) into a usable tree is to call tree_and(something, (*)).
-
-    Second look at what tree_and/tree_or function would do when passed a
-    SEL_TREE that has the structure like st1 tree has, and conlcude that 
-    tree_and(something, (*)) will not be called.
 
+    Another example:
+    tree3= SEL_TREE { SEL_ARG{key1part1 = 1} }
+    tree4= SEL_TREE { SEL_ARG{key2part2 = 2} }  -- can't make quick range select
+                                               from this
+    call tree_or(tree3, tree4) -- creates a SEL_IMERGE out of which no index
+    merge can be constructed, but it is potentially useful, as anding it with
+    tree5= SEL_TREE { SEL_ARG{key2part1 = 3} } creates an index merge that
+    represents the formula
+      key1part1=1 AND key2part1=3 OR key2part1=3 AND key2part2=2 
+    for which an index merge can be built. 
+
+    Any final SEL_TREE may contain SEL_ARG trees for which no quick select
+    can be built. Such SEL_ARG trees should be removed from the range part
+    before different range scans are evaluated. Such SEL_ARG trees also should
+    be removed from all range trees of each index merge before different
+    possible index merge plans are evaluated. If after this removal one
+    of the range trees in the index merge becomes empty the whole index merge
+    must be discarded.
+       
   RETURN
     0  Ok, some suitable trees left
     1  No tree->keys[] left.
@@ -6241,6 +8215,74 @@ static bool remove_nonrange_trees(RANGE_OPT_PARAM *param, SEL_TREE *tree)
 }
 
 
+/*
+  Build a SEL_TREE for a disjunction out of such trees for the disjuncts
+
+  SYNOPSIS
+    tree_or()
+      param           Context info for the operation
+      tree1           SEL_TREE for the first disjunct          
+      tree2           SEL_TREE for the second disjunct
+
+  DESCRIPTION
+    This function builds a tree for the formula (A OR B) out of the trees
+    tree1 and tree2 that have been built for the formulas A and B respectively.
+
+    In a general case
+      tree1 represents the formula RT1 AND MT1,
+        where RT1=R1_1 AND ... AND R1_k1, MT1=M1_1 AND ... AND M1_l1;
+      tree2 represents the formula RT2 AND MT2 
+        where RT2=R2_1 AND ... AND R2_k2, MT2=M2_1 AND ... AND M2_l2.
+
+    The function constructs the result tree according to the formula
+      (RT1 OR RT2) AND (MT1 OR RT2) AND (MT2 OR RT1) AND (MT1 OR MT2)
+    that is equivalent to the formula (RT1 AND MT1) OR (RT2 AND MT2).
+
+    To limit the number of produced imerges the function considers
+    a weaker formula than the original one:
+      (RT1 AND M1_1) OR (RT2 AND M2_1) 
+    that is equivalent to:
+      (RT1 OR RT2)                  (1)
+        AND 
+      (M1_1 OR M2_1)                (2)
+        AND
+      (M1_1 OR RT2)                 (3)
+        AND
+      (M2_1 OR RT1)                 (4)
+
+    For the first conjunct (1) the function builds a tree with a range part
+    and, possibly, one imerge. For the other conjuncts (2-4) the function
+    produces sets of imerges. All constructed imerges are included into the
+    result tree.
+    
+    For the formula (1) the function produces the tree representing a formula  
+    of the structure RT [AND M], such that:
+     - the range tree rt contains the result of oring SEL_ARG trees from rt1
+       and rt2
+     - the imerge m consists of two range trees rt1 and rt2.
+    The imerge m is added if it's not true that rt1 and rt2 must be ored.
+    If rt1 and rt2 can't be ored, rt is empty and only m is produced for (1).
+
+    To produce imerges for the formula (2) the function calls the function
+    imerge_list_or_list passing it the merge parts of tree1 and tree2 as
+    parameters.
+
+    To produce imerges for the formula (3) the function calls the function
+    imerge_list_or_tree passing it the imerge m1_1 and the range tree rt2 as
+    parameters. Similarly, to produce imerges for the formula (4) the function
+    calls the function imerge_list_or_tree passing it the imerge m2_1 and the
+    range tree rt1.
+
+    If rt1 is empty then the trees for (1) and (4) are empty.
+    If rt2 is empty then the trees for (1) and (3) are empty.
+    If mt1 is empty then the trees for (2) and (3) are empty.
+    If mt2 is empty then the trees for (2) and (4) are empty.
+
+  RETURN
+    The result tree for the operation if a success
+    0 - otherwise
+*/
+
 static SEL_TREE *
 tree_or(RANGE_OPT_PARAM *param,SEL_TREE *tree1,SEL_TREE *tree2)
 {
@@ -6256,74 +8298,100 @@ tree_or(RANGE_OPT_PARAM *param,SEL_TREE *tree1,SEL_TREE *tree2)
   if (tree2->type == SEL_TREE::MAYBE)
     DBUG_RETURN(tree2);
 
-  SEL_TREE *result= 0;
-  key_map  result_keys;
-  result_keys.clear_all();
-  if (sel_trees_can_be_ored(tree1, tree2, param))
+  SEL_TREE *result= NULL;
+  key_map result_keys;
+  key_map ored_keys;
+  SEL_TREE *rtree[2]= {NULL,NULL};
+  SEL_IMERGE *imerge[2]= {NULL, NULL};
+  bool no_ranges1= tree1->without_ranges();
+  bool no_ranges2= tree2->without_ranges();
+  bool no_merges1= tree1->without_imerges();
+  bool no_merges2= tree2->without_imerges();
+  if (!no_ranges1 && !no_merges2)
   {
-    /* Join the trees key per key */
-    SEL_ARG **key1,**key2,**end;
-    for (key1= tree1->keys,key2= tree2->keys,end= key1+param->keys ;
-         key1 != end ; key1++,key2++)
-    {
-      *key1=key_or(param, *key1, *key2);
-      if (*key1)
-      {
-        result=tree1;				// Added to tree1
-        result_keys.set_bit(key1 - tree1->keys);
-#ifdef EXTRA_DEBUG
-        if (param->alloced_sel_args < SEL_ARG::MAX_SEL_ARGS) 
-          (*key1)->test_use_count(*key1);
-#endif
-      }
-    }
-    if (result)
-      result->keys_map= result_keys;
+    rtree[0]= new SEL_TREE(tree1, TRUE, param);
+    imerge[1]= new SEL_IMERGE(tree2->merges.head(), 0, param);
   }
-  else
+  if (!no_ranges2 && !no_merges1)
   {
-    /* ok, two trees have KEY type but cannot be used without index merge */
-    if (tree1->merges.is_empty() && tree2->merges.is_empty())
+    rtree[1]= new SEL_TREE(tree2, TRUE, param);
+    imerge[0]= new SEL_IMERGE(tree1->merges.head(), 0, param);
+  }
+  bool no_imerge_from_ranges= FALSE;
+  if (!(result= new SEL_TREE()))
+    DBUG_RETURN(result);
+
+  /* Build the range part of the tree for the formula (1) */ 
+  if (sel_trees_can_be_ored(param, tree1, tree2, &ored_keys))
+  {
+    bool must_be_ored= sel_trees_must_be_ored(param, tree1, tree2, ored_keys);
+    no_imerge_from_ranges= must_be_ored;
+    key_map::Iterator it(ored_keys);
+    int key_no;
+    while ((key_no= it++) != key_map::Iterator::BITMAP_END)
     {
-      if (param->remove_jump_scans)
+      SEL_ARG *key1= tree1->keys[key_no];
+      SEL_ARG *key2= tree2->keys[key_no];
+      if (!must_be_ored)
       {
-        bool no_trees= remove_nonrange_trees(param, tree1);
-        no_trees= no_trees || remove_nonrange_trees(param, tree2);
-        if (no_trees)
-          DBUG_RETURN(new SEL_TREE(SEL_TREE::ALWAYS));
+        key1->incr_refs();
+        key2->incr_refs();
       }
-      SEL_IMERGE *merge;
-      /* both trees are "range" trees, produce new index merge structure */
-      if (!(result= new SEL_TREE()) || !(merge= new SEL_IMERGE()) ||
-          (result->merges.push_back(merge)) ||
-          (merge->or_sel_tree(param, tree1)) ||
-          (merge->or_sel_tree(param, tree2)))
-        result= NULL;
-      else
-        result->type= tree1->type;
+      if ((result->keys[key_no]= key_or(param, key1, key2)))
+        result->keys_map.set_bit(key_no);
     }
-    else if (!tree1->merges.is_empty() && !tree2->merges.is_empty())
-    {
-      if (imerge_list_or_list(param, &tree1->merges, &tree2->merges))
-        result= new SEL_TREE(SEL_TREE::ALWAYS);
-      else
-        result= tree1;
-    }
-    else
-    {
-      /* one tree is index merge tree and another is range tree */
-      if (tree1->merges.is_empty())
-        swap_variables(SEL_TREE*, tree1, tree2);
+    result->type= tree1->type;
+  }
       
-      if (param->remove_jump_scans && remove_nonrange_trees(param, tree2))
-         DBUG_RETURN(new SEL_TREE(SEL_TREE::ALWAYS));
-      /* add tree2 to tree1->merges, checking if it collapses to ALWAYS */
-      if (imerge_list_or_tree(param, &tree1->merges, tree2))
-        result= new SEL_TREE(SEL_TREE::ALWAYS);
-      else
-        result= tree1;
-    }
+  if (no_imerge_from_ranges && no_merges1 && no_merges2)
+  {
+    if (result->keys_map.is_clear_all())
+      result->type= SEL_TREE::ALWAYS;
+    DBUG_RETURN(result);
   }
+
+  SEL_IMERGE *imerge_from_ranges;
+  if (!(imerge_from_ranges= new SEL_IMERGE()))
+    result= NULL;
+  else if (!no_ranges1 && !no_ranges2 && !no_imerge_from_ranges)
+  {
+    /* Build the imerge part of the tree for the formula (1) */
+    SEL_TREE *rt1= tree1;
+    SEL_TREE *rt2= tree2;
+    if (!no_merges1)
+      rt1= new SEL_TREE(tree1, TRUE, param);
+    if (!no_merges2)
+      rt2= new SEL_TREE(tree2, TRUE, param);
+    if (!rt1 || !rt2 ||
+        result->merges.push_back(imerge_from_ranges) ||
+        imerge_from_ranges->or_sel_tree(param, rt1) ||
+        imerge_from_ranges->or_sel_tree(param, rt2))
+      result= NULL;
+  }
+  if (!result)
+    DBUG_RETURN(result);
+
+  result->type= tree1->type;
+
+  if (!no_merges1 && !no_merges2 && 
+      !imerge_list_or_list(param, &tree1->merges, &tree2->merges))
+  {
+    /* Build the imerges for the formula (2) */
+    imerge_list_and_list(&result->merges, &tree1->merges);
+  }
+
+  /* Build the imerges for the formulas (3) and (4) */
+  for (uint i=0; i < 2; i++)
+  {
+    List<SEL_IMERGE> merges;
+    SEL_TREE *rt= rtree[i];
+    SEL_IMERGE *im= imerge[1-i];
+    
+    if (rt && im && !merges.push_back(im) && 
+        !imerge_list_or_tree(param, &merges, rt))
+      imerge_list_and_list(&result->merges, &merges);
+  }
+ 
   DBUG_RETURN(result);
 }
 
@@ -6369,6 +8437,7 @@ and_all_keys(RANGE_OPT_PARAM *param, SEL_ARG *key1, SEL_ARG *key2,
   if (!key1)
     return &null_element;			// Impossible ranges
   key1->use_count++;
+  key1->max_part_no= max(key2->max_part_no, key2->part+1);
   return key1;
 }
 
@@ -6461,6 +8530,7 @@ key_and(RANGE_OPT_PARAM *param, SEL_ARG *key1, SEL_ARG *key2, uint clone_flag)
   key1->use_count--;
   key2->use_count--;
   SEL_ARG *e1=key1->first(), *e2=key2->first(), *new_tree=0;
+  uint max_part_no= max(key1->max_part_no, key2->max_part_no);
 
   while (e1 && e2)
   {
@@ -6498,6 +8568,7 @@ key_and(RANGE_OPT_PARAM *param, SEL_ARG *key1, SEL_ARG *key2, uint clone_flag)
   key2->free_tree();
   if (!new_tree)
     return &null_element;			// Impossible range
+  new_tree->max_part_no= max_part_no;
   return new_tree;
 }
 
@@ -6626,7 +8697,7 @@ key_or(RANGE_OPT_PARAM *param, SEL_ARG *key1,SEL_ARG *key2)
     {
       swap_variables(SEL_ARG *,key1,key2);
     }
-    if (key1->use_count > 0 || !(key1=key1->clone_tree(param)))
+    if (key1->use_count > 0 && !(key1=key1->clone_tree(param)))
       return 0;					// OOM
   }
 
@@ -6634,6 +8705,8 @@ key_or(RANGE_OPT_PARAM *param, SEL_ARG *key1,SEL_ARG *key2)
   bool key2_shared=key2->use_count != 0;
   key1->maybe_flag|=key2->maybe_flag;
 
+  uint max_part_no= max(key1->max_part_no, key2->max_part_no);
+
   for (key2=key2->first(); key2; )
   {
     SEL_ARG *tmp=key1->find_range(key2);	// Find key1.min <= key2.min
@@ -6867,6 +8940,7 @@ end:
     key2=next;
   }
   key1->use_count++;
+  key1->max_part_no= max_part_no;
   return key1;
 }
 
@@ -7342,11 +9416,7 @@ static ulong count_key_part_usage(SEL_ARG *root, SEL_ARG *key)
 void SEL_ARG::test_use_count(SEL_ARG *root)
 {
   uint e_count=0;
-  if (this == root && use_count != 1)
-  {
-    sql_print_information("Use_count: Wrong count %lu for root",use_count);
-    return;
-  }
+
   if (this->type != SEL_ARG::KEY_RANGE)
     return;
   for (SEL_ARG *pos=first(); pos ; pos=pos->next)
@@ -7371,324 +9441,125 @@ void SEL_ARG::test_use_count(SEL_ARG *root)
 }
 
 #endif
-
-
 /*
-  Calculate estimate of number records that will be retrieved by a range
-  scan on given index using given SEL_ARG intervals tree.
+  Calculate cost and E(#rows) for a given index and intervals tree 
+
   SYNOPSIS
-    check_quick_select
-      param  Parameter from test_quick_select
-      idx               Number of index to use in tree->keys
-      tree              Transformed selection condition, tree->keys[idx]
-                        holds the range tree to be used for scanning.
-      update_tbl_stats  If true, update table->quick_keys with information
+    check_quick_select()
+      param             Parameter from test_quick_select
+      idx               Number of index to use in PARAM::key SEL_TREE::key
+      index_only        TRUE  - assume only index tuples will be accessed
+                        FALSE - assume full table rows will be read
+      tree              Transformed selection condition, tree->key[idx] holds
+                        the intervals for the given index.
+      update_tbl_stats  TRUE <=> update table->quick_* with information
                         about range scan we've evaluated.
+      mrr_flags   INOUT MRR access flags
+      cost        OUT   Scan cost
 
   NOTES
     param->is_ror_scan is set to reflect if the key scan is a ROR (see
     is_key_scan_ror function for more info)
     param->table->quick_*, param->range_count (and maybe others) are
-    updated with data of given key scan, see check_quick_keys for details.
+    updated with data of given key scan, see quick_range_seq_next for details.
 
   RETURN
     Estimate # of records to be retrieved.
     HA_POS_ERROR if estimate calculation failed due to table handler problems.
-
 */
 
-static ha_rows
-check_quick_select(PARAM *param,uint idx,SEL_ARG *tree, bool update_tbl_stats)
-{
-  ha_rows records;
-  bool    cpk_scan;
-  uint key;
+static
+ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
+                           SEL_ARG *tree, bool update_tbl_stats, 
+                           uint *mrr_flags, uint *bufsize, COST_VECT *cost)
+{
+  SEL_ARG_RANGE_SEQ seq;
+  RANGE_SEQ_IF seq_if = {NULL, sel_arg_range_seq_init, sel_arg_range_seq_next, 0, 0};
+  handler *file= param->table->file;
+  ha_rows rows;
+  uint keynr= param->real_keynr[idx];
   DBUG_ENTER("check_quick_select");
+  
+  /* Handle cases when we don't have a valid non-empty list of ranges */
+  if (!tree)
+    DBUG_RETURN(HA_POS_ERROR);
+  if (tree->type == SEL_ARG::IMPOSSIBLE)
+    DBUG_RETURN(0L);
+  if (tree->type != SEL_ARG::KEY_RANGE || tree->part != 0)
+    DBUG_RETURN(HA_POS_ERROR);
 
-  param->is_ror_scan= FALSE;
-  param->first_null_comp= 0;
+  seq.keyno= idx;
+  seq.real_keyno= keynr;
+  seq.param= param;
+  seq.start= tree;
 
-  if (!tree)
-    DBUG_RETURN(HA_POS_ERROR);			// Can't use it
-  param->max_key_part=0;
   param->range_count=0;
-  key= param->real_keynr[idx];
+  param->max_key_part=0;
 
-  if (tree->type == SEL_ARG::IMPOSSIBLE)
-    DBUG_RETURN(0L);				// Impossible select. return
-  if (tree->type != SEL_ARG::KEY_RANGE || tree->part != 0)
-    DBUG_RETURN(HA_POS_ERROR);				// Don't use tree
+  param->is_ror_scan= TRUE;
+  if (file->index_flags(keynr, 0, TRUE) & HA_KEY_SCAN_NOT_ROR)
+    param->is_ror_scan= FALSE;
+  
+  *mrr_flags= param->force_default_mrr? HA_MRR_USE_DEFAULT_IMPL: 0;
+  /*
+    Pass HA_MRR_SORTED to see if MRR implementation can handle sorting.
+  */
+  *mrr_flags|= HA_MRR_NO_ASSOCIATION | HA_MRR_SORTED;
 
-  enum ha_key_alg key_alg= param->table->key_info[key].algorithm;
-  if ((key_alg != HA_KEY_ALG_BTREE) && (key_alg!= HA_KEY_ALG_UNDEF))
-  {
-    /* Records are not ordered by rowid for other types of indexes. */
-    cpk_scan= FALSE;
-  }
-  else
-  {
-    /*
-      Clustered PK scan is a special case, check_quick_keys doesn't recognize
-      CPK scans as ROR scans (while actually any CPK scan is a ROR scan).
-    */
-    cpk_scan= ((param->table->s->primary_key == param->real_keynr[idx]) &&
-               param->table->file->primary_key_is_clustered());
-    param->is_ror_scan= !cpk_scan;
-  }
-  param->n_ranges= 0;
+  bool pk_is_clustered= file->primary_key_is_clustered();
+  if (index_only && 
+      (file->index_flags(keynr, param->max_key_part, 1) & HA_KEYREAD_ONLY) &&
+      !(pk_is_clustered && keynr == param->table->s->primary_key))
+     *mrr_flags |= HA_MRR_INDEX_ONLY;
+  
+  if (current_thd->lex->sql_command != SQLCOM_SELECT)
+    *mrr_flags |= HA_MRR_USE_DEFAULT_IMPL;
 
-  records= check_quick_keys(param, idx, tree,
-                            param->min_key, 0, -1,
-                            param->max_key, 0, -1);
-  if (records != HA_POS_ERROR)
+  *bufsize= param->thd->variables.mrr_buff_size;
+  rows= file->multi_range_read_info_const(keynr, &seq_if, (void*)&seq, 0,
+                                          bufsize, mrr_flags, cost);
+  if (rows != HA_POS_ERROR)
   {
+    param->quick_rows[keynr]= rows;
     if (update_tbl_stats)
     {
-      param->table->quick_keys.set_bit(key);
-      param->table->quick_key_parts[key]=param->max_key_part+1;
-      param->table->quick_n_ranges[key]= param->n_ranges;
+      param->table->quick_keys.set_bit(keynr);
+      param->table->quick_key_parts[keynr]= param->max_key_part+1;
+      param->table->quick_n_ranges[keynr]= param->range_count;
       param->table->quick_condition_rows=
-        min(param->table->quick_condition_rows, records);
+        min(param->table->quick_condition_rows, rows);
+      param->table->quick_rows[keynr]= rows;
     }
-    /*
-      Need to save quick_rows in any case as it is used when calculating
-      cost of ROR intersection:
-    */
-    param->table->quick_rows[key]=records;
-    if (cpk_scan)
-      param->is_ror_scan= TRUE;
   }
-  if (param->table->file->index_flags(key, 0, TRUE) & HA_KEY_SCAN_NOT_ROR)
-    param->is_ror_scan= FALSE;
-  DBUG_PRINT("exit", ("Records: %lu", (ulong) records));
-  DBUG_RETURN(records);
-}
-
-
-/*
-  Recursively calculate estimate of # rows that will be retrieved by
-  key scan on key idx.
-  SYNOPSIS
-    check_quick_keys()
-      param         Parameter from test_quick select function.
-      idx           Number of key to use in PARAM::keys in list of used keys
-                    (param->real_keynr[idx] holds the key number in table)
-      key_tree      SEL_ARG tree being examined.
-      min_key       Buffer with partial min key value tuple
-      min_key_flag
-      max_key       Buffer with partial max key value tuple
-      max_key_flag
-
-  NOTES
-    The function does the recursive descent on the tree via SEL_ARG::left,
-    SEL_ARG::right, and SEL_ARG::next_key_part edges. The #rows estimates
-    are calculated using records_in_range calls at the leaf nodes and then
-    summed.
-
-    param->min_key and param->max_key are used to hold prefixes of key value
-    tuples.
-
-    The side effects are:
-
-    param->max_key_part is updated to hold the maximum number of key parts used
-      in scan minus 1.
-
-    param->range_count is incremented if the function finds a range that
-      wasn't counted by the caller.
-
-    param->is_ror_scan is cleared if the function detects that the key scan is
-      not a Rowid-Ordered Retrieval scan ( see comments for is_key_scan_ror
-      function for description of which key scans are ROR scans)
-
-  RETURN
-    #records      E(#records) for given subtree
-    HA_POS_ERROR  if subtree cannot be used for record retrieval
-
-*/
-
-static ha_rows
-check_quick_keys(PARAM *param, uint idx, SEL_ARG *key_tree,
-		 uchar *min_key, uint min_key_flag, int min_keypart,
-                 uchar *max_key, uint max_key_flag, int max_keypart)
-{
-  ha_rows records=0, tmp;
-  uint tmp_min_flag, tmp_max_flag, keynr, min_key_length, max_key_length;
-  uint tmp_min_keypart= min_keypart, tmp_max_keypart= max_keypart;
-  uchar *tmp_min_key, *tmp_max_key;
-  uint8 save_first_null_comp= param->first_null_comp;
-
-  param->max_key_part=max(param->max_key_part,key_tree->part);
-  if (key_tree->left != &null_element)
+  /* Figure out if the key scan is ROR (returns rows in ROWID order) or not */
+  enum ha_key_alg key_alg= param->table->key_info[seq.real_keyno].algorithm;
+  if ((key_alg != HA_KEY_ALG_BTREE) && (key_alg!= HA_KEY_ALG_UNDEF))
   {
-    /*
-      There are at least two intervals for current key part, i.e. condition
-      was converted to something like
-        (keyXpartY less/equals c1) OR (keyXpartY more/equals c2).
-      This is not a ROR scan if the key is not Clustered Primary Key.
+    /* 
+      All scans are non-ROR scans for those index types.
+      TODO: Don't have this logic here, make table engines return 
+      appropriate flags instead.
     */
     param->is_ror_scan= FALSE;
-    records=check_quick_keys(param, idx, key_tree->left,
-                             min_key, min_key_flag, min_keypart,
-			     max_key, max_key_flag, max_keypart);
-    if (records == HA_POS_ERROR)			// Impossible
-      return records;
   }
-
-  tmp_min_key= min_key;
-  tmp_max_key= max_key;
-  tmp_min_keypart+= key_tree->store_min(param->key[idx][key_tree->part].store_length,
-                                        &tmp_min_key, min_key_flag);
-  tmp_max_keypart+= key_tree->store_max(param->key[idx][key_tree->part].store_length,
-                                        &tmp_max_key, max_key_flag);
-  min_key_length= (uint) (tmp_min_key - param->min_key);
-  max_key_length= (uint) (tmp_max_key - param->max_key);
-
-  if (param->is_ror_scan)
+  else if (param->table->s->primary_key == keynr && pk_is_clustered)
   {
-    /*
-      If the index doesn't cover entire key, mark the scan as non-ROR scan.
-      Actually we're cutting off some ROR scans here.
-    */
-    uint16 fieldnr= param->table->key_info[param->real_keynr[idx]].
-                    key_part[key_tree->part].fieldnr - 1;
-    if (param->table->field[fieldnr]->key_length() !=
-        param->key[idx][key_tree->part].length)
-      param->is_ror_scan= FALSE;
+    /* Clustered PK scan is always a ROR scan (TODO: same as above) */
+    param->is_ror_scan= TRUE;
   }
-
-  if (!param->first_null_comp && key_tree->is_null_interval())
-    param->first_null_comp= key_tree->part+1;
-
-  if (key_tree->next_key_part &&
-      key_tree->next_key_part->type == SEL_ARG::KEY_RANGE &&
-      key_tree->next_key_part->part == key_tree->part+1)
-  {						// const key as prefix
-    if (min_key_length == max_key_length &&
-	!memcmp(min_key, max_key, (uint) (tmp_max_key - max_key)) &&
-	!key_tree->min_flag && !key_tree->max_flag)
-    {
-      tmp=check_quick_keys(param,idx,key_tree->next_key_part, tmp_min_key,
-                           min_key_flag | key_tree->min_flag, tmp_min_keypart,
-                           tmp_max_key, max_key_flag | key_tree->max_flag,
-                           tmp_max_keypart);
-      goto end;					// Ugly, but efficient
-    }
-    else
-    {
-      /* The interval for current key part is not c1 <= keyXpartY <= c1 */
-      param->is_ror_scan= FALSE;
-    }
-
-    tmp_min_flag=key_tree->min_flag;
-    tmp_max_flag=key_tree->max_flag;
-    if (!tmp_min_flag)
-      tmp_min_keypart+=
-      key_tree->next_key_part->store_min_key(param->key[idx], &tmp_min_key,
-					     &tmp_min_flag);
-    if (!tmp_max_flag)
-      tmp_max_keypart+=
-      key_tree->next_key_part->store_max_key(param->key[idx], &tmp_max_key,
-					     &tmp_max_flag);
-    min_key_length= (uint) (tmp_min_key - param->min_key);
-    max_key_length= (uint) (tmp_max_key - param->max_key);
-  }
-  else
-  {
-    tmp_min_flag= min_key_flag | key_tree->min_flag;
-    tmp_max_flag= max_key_flag | key_tree->max_flag;
-  }
-
-  if (unlikely(param->thd->killed != 0))
-    return HA_POS_ERROR;
-
-  keynr=param->real_keynr[idx];
-  param->range_count++;
-  if (!tmp_min_flag && ! tmp_max_flag &&
-      (uint) key_tree->part+1 == param->table->key_info[keynr].key_parts &&
-      (param->table->key_info[keynr].flags & HA_NOSAME) &&
-      min_key_length == max_key_length &&
-      !memcmp(param->min_key, param->max_key, min_key_length) &&
-      !param->first_null_comp)
-  {
-    tmp=1;					// Max one record
-    param->n_ranges++;
-  }
-  else
+  else if (param->range_count > 1)
   {
-    if (param->is_ror_scan)
-    {
-      /*
-        If we get here, the condition on the key was converted to form
-        "(keyXpart1 = c1) AND ... AND (keyXpart{key_tree->part - 1} = cN) AND
-          somecond(keyXpart{key_tree->part})"
-        Check if
-          somecond is "keyXpart{key_tree->part} = const" and
-          uncovered "tail" of KeyX parts is either empty or is identical to
-          first members of clustered primary key.
-      */
-      if (!(min_key_length == max_key_length &&
-            !memcmp(min_key, max_key, (uint) (tmp_max_key - max_key)) &&
-            !key_tree->min_flag && !key_tree->max_flag &&
-            is_key_scan_ror(param, keynr, key_tree->part + 1)))
-        param->is_ror_scan= FALSE;
-    }
-    param->n_ranges++;
-
-    if (tmp_min_flag & GEOM_FLAG)
-    {
-      key_range min_range;
-      min_range.key=    param->min_key;
-      min_range.length= min_key_length;
-      min_range.keypart_map= make_keypart_map(tmp_min_keypart);
-      /* In this case tmp_min_flag contains the handler-read-function */
-      min_range.flag=   (ha_rkey_function) (tmp_min_flag ^ GEOM_FLAG);
-
-      tmp= param->table->file->records_in_range(keynr,
-                                                &min_range, (key_range*) 0);
-    }
-    else
-    {
-      key_range min_range, max_range;
-
-      min_range.key=    param->min_key;
-      min_range.length= min_key_length;
-      min_range.flag=   (tmp_min_flag & NEAR_MIN ? HA_READ_AFTER_KEY :
-                         HA_READ_KEY_EXACT);
-      min_range.keypart_map= make_keypart_map(tmp_min_keypart);
-      max_range.key=    param->max_key;
-      max_range.length= max_key_length;
-      max_range.flag=   (tmp_max_flag & NEAR_MAX ?
-                         HA_READ_BEFORE_KEY : HA_READ_AFTER_KEY);
-      max_range.keypart_map= make_keypart_map(tmp_max_keypart);
-      tmp=param->table->file->records_in_range(keynr,
-                                               (min_key_length ? &min_range :
-                                                (key_range*) 0),
-                                               (max_key_length ? &max_range :
-                                                (key_range*) 0));
-    }
-  }
- end:
-  if (tmp == HA_POS_ERROR)			// Impossible range
-    return tmp;
-  records+=tmp;
-  if (key_tree->right != &null_element)
-  {
-    /*
-      There are at least two intervals for current key part, i.e. condition
-      was converted to something like
-        (keyXpartY less/equals c1) OR (keyXpartY more/equals c2).
-      This is not a ROR scan if the key is not Clustered Primary Key.
+    /* 
+      Scanning multiple key values in the index: the records are ROR
+      for each value, but not between values. E.g, "SELECT ... x IN
+      (1,3)" returns ROR order for all records with x=1, then ROR
+      order for records with x=3
     */
     param->is_ror_scan= FALSE;
-    tmp=check_quick_keys(param, idx, key_tree->right,
-                         min_key, min_key_flag, min_keypart,
-                         max_key, max_key_flag, max_keypart);
-    if (tmp == HA_POS_ERROR)
-      return tmp;
-    records+=tmp;
   }
-  param->first_null_comp= save_first_null_comp;
-  return records;
+
+  DBUG_PRINT("exit", ("Records: %lu", (ulong) rows));
+  DBUG_RETURN(rows); //psergey-merge:todo: maintain first_null_comp.
 }
 
 
@@ -7716,13 +9587,14 @@ check_quick_keys(PARAM *param, uint idx, SEL_ARG *key_tree,
     where the index is defined on (key1_1, ..., key1_N [,a_1, ..., a_n])
 
     and the table has a clustered Primary Key defined as 
-
       PRIMARY KEY(a_1, ..., a_n, b1, ..., b_k) 
     
     i.e. the first key parts of it are identical to uncovered parts ot the 
     key being scanned. This function assumes that the index flags do not
     include HA_KEY_SCAN_NOT_ROR flag (that is checked elsewhere).
 
+    Check (1) is made in quick_range_seq_next()
+
   RETURN
     TRUE   The scan is ROR-scan
     FALSE  Otherwise
@@ -7735,9 +9607,19 @@ static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts)
   KEY_PART_INFO *key_part_end= (table_key->key_part +
                                 table_key->key_parts);
   uint pk_number;
+  
+  for (KEY_PART_INFO *kp= table_key->key_part; kp < key_part; kp++)
+  {
+    uint16 fieldnr= param->table->key_info[keynr].
+                    key_part[kp - table_key->key_part].fieldnr - 1;
+    if (param->table->field[fieldnr]->key_length() != kp->length)
+      return FALSE;
+  }
 
   if (key_part == key_part_end)
     return TRUE;
+
+  key_part= table_key->key_part + nparts;
   pk_number= param->table->s->primary_key;
   if (!param->table->file->primary_key_is_clustered() || pk_number == MAX_KEY)
     return FALSE;
@@ -7762,12 +9644,14 @@ static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts)
   SYNOPSIS
     get_quick_select()
       param
-      idx          Index of used key in param->key.
-      key_tree     SEL_ARG tree for the used key
-      parent_alloc If not NULL, use it to allocate memory for
-                   quick select data. Otherwise use quick->alloc.
+      idx            Index of used key in param->key.
+      key_tree       SEL_ARG tree for the used key
+      mrr_flags      MRR parameter for quick select
+      mrr_buf_size   MRR parameter for quick select
+      parent_alloc   If not NULL, use it to allocate memory for
+                     quick select data. Otherwise use quick->alloc.
   NOTES
-    The caller must call QUICK_SELECT::init for returned quick select
+    The caller must call QUICK_SELECT::init for returned quick select.
 
     CAUTION! This function may change thd->mem_root to a MEM_ROOT which will be
     deallocated when the returned quick select is deleted.
@@ -7778,25 +9662,26 @@ static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts)
 */
 
 QUICK_RANGE_SELECT *
-get_quick_select(PARAM *param,uint idx,SEL_ARG *key_tree,
-                 MEM_ROOT *parent_alloc)
+get_quick_select(PARAM *param,uint idx,SEL_ARG *key_tree, uint mrr_flags,
+                 uint mrr_buf_size, MEM_ROOT *parent_alloc)
 {
   QUICK_RANGE_SELECT *quick;
+  bool create_err= FALSE;
   DBUG_ENTER("get_quick_select");
 
   if (param->table->key_info[param->real_keynr[idx]].flags & HA_SPATIAL)
     quick=new QUICK_RANGE_SELECT_GEOM(param->thd, param->table,
                                       param->real_keynr[idx],
                                       test(parent_alloc),
-                                      parent_alloc);
+                                      parent_alloc, &create_err);
   else
     quick=new QUICK_RANGE_SELECT(param->thd, param->table,
                                  param->real_keynr[idx],
-                                 test(parent_alloc));
+                                 test(parent_alloc), NULL, &create_err);
 
   if (quick)
   {
-    if (quick->error ||
+    if (create_err ||
 	get_quick_keys(param,quick,param->key[idx],key_tree,param->min_key,0,
 		       param->max_key,0))
     {
@@ -7805,6 +9690,8 @@ get_quick_select(PARAM *param,uint idx,SEL_ARG *key_tree,
     }
     else
     {
+      quick->mrr_flags= mrr_flags;
+      quick->mrr_buf_size= mrr_buf_size;
       quick->key_parts=(KEY_PART*)
         memdup_root(parent_alloc? parent_alloc : &quick->alloc,
                     (char*) param->key[idx],
@@ -7953,7 +9840,20 @@ bool QUICK_RANGE_SELECT::unique_key_range()
 }
 
 
-/* Returns TRUE if any part of the key is NULL */
+
+/*
+  Return TRUE if any part of the key is NULL
+
+  SYNOPSIS
+    null_part_in_key()    
+      key_part  Array of key parts (index description)
+      key       Key values tuple
+      length    Length of key values tuple in bytes.
+
+  RETURN
+    TRUE   The tuple has at least one "keypartX is NULL"
+    FALSE  Otherwise
+*/
 
 static bool null_part_in_key(KEY_PART *key_part, const uchar *key, uint length)
 {
@@ -7973,7 +9873,7 @@ bool QUICK_SELECT_I::is_keys_used(const MY_BITMAP *fields)
   return is_key_used(head, index, fields);
 }
 
-bool QUICK_INDEX_MERGE_SELECT::is_keys_used(const MY_BITMAP *fields)
+bool QUICK_INDEX_SORT_SELECT::is_keys_used(const MY_BITMAP *fields)
 {
   QUICK_RANGE_SELECT *quick;
   List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
@@ -8010,6 +9910,19 @@ bool QUICK_ROR_UNION_SELECT::is_keys_used(const MY_BITMAP *fields)
 }
 
 
+/* Build an FT_SELECT; yields NULL when its constructor flags an error. */
+FT_SELECT *get_ft_select(THD *thd, TABLE *table, uint key)
+{
+  bool ctor_failed= FALSE;
+  FT_SELECT *ft_quick= new FT_SELECT(thd, table, key, &ctor_failed);
+  if (!ctor_failed)
+    return ft_quick;
+
+  /* Construction failed: dispose of the object and report failure. */
+  delete ft_quick;
+  return NULL;
+}
+
 /*
   Create quick select from ref/ref_or_null scan.
 
@@ -8038,10 +9951,12 @@ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
   KEY_PART *key_part;
   QUICK_RANGE *range;
   uint part;
+  bool create_err= FALSE;
+  COST_VECT cost;
 
   old_root= thd->mem_root;
   /* The following call may change thd->mem_root */
-  quick= new QUICK_RANGE_SELECT(thd, table, ref->key, 0);
+  quick= new QUICK_RANGE_SELECT(thd, table, ref->key, 0, 0, &create_err);
   /* save mem_root set by QUICK_RANGE_SELECT constructor */
   alloc= thd->mem_root;
   /*
@@ -8050,7 +9965,13 @@ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
   */
   thd->mem_root= old_root;
 
-  if (!quick)
+  /*
+    If the constructor reported an error the object has still been
+    allocated, so it is destroyed at err: instead of being leaked.
+  */
+  if (create_err)
+    goto err;
+  if (!quick)
     return 0;			/* no ranges found */
   if (quick->init())
     goto err;
@@ -8104,8 +10019,25 @@ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
       goto err;
   }
 
-  return quick;
+  /* Call multi_range_read_info() to get the MRR flags and buffer size */
+  quick->mrr_flags= HA_MRR_NO_ASSOCIATION | 
+                    (table->key_read ? HA_MRR_INDEX_ONLY : 0);
+  if (thd->lex->sql_command != SQLCOM_SELECT)
+    quick->mrr_flags |= HA_MRR_USE_DEFAULT_IMPL;
+#ifdef WITH_NDBCLUSTER_STORAGE_ENGINE
+  if (!ref->null_ref_key && !key_has_nulls(key_info, range->min_key,
+                                           ref->key_length))
+    quick->mrr_flags |= HA_MRR_NO_NULL_ENDPOINTS;
+#endif
 
+  quick->mrr_buf_size= thd->variables.mrr_buff_size;
+  if (table->file->multi_range_read_info(quick->index, 1, (uint)records,
+                                         ~0, 
+                                         &quick->mrr_buf_size,
+                                         &quick->mrr_flags, &cost))
+    goto err;
+
+  return quick;
 err:
   delete quick;
   return 0;
@@ -8129,13 +10061,23 @@ err:
     other error
 */
 
-int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge()
+int read_keys_and_merge_scans(THD *thd,
+                              TABLE *head,
+                              List<QUICK_RANGE_SELECT> quick_selects,
+                              QUICK_RANGE_SELECT *pk_quick_select,
+                              READ_RECORD *read_record,
+                              bool intersection,
+                              key_map *filtered_scans,
+                              Unique **unique_ptr)
 {
   List_iterator_fast<QUICK_RANGE_SELECT> cur_quick_it(quick_selects);
   QUICK_RANGE_SELECT* cur_quick;
   int result;
+  Unique *unique= *unique_ptr;
   handler *file= head->file;
-  DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::read_keys_and_merge");
+  bool with_cpk_filter= pk_quick_select != NULL;
+
+  DBUG_ENTER("read_keys_and_merge_scans");
 
   /* We're going to just read rowids. */
   if (!head->key_read)
@@ -8146,6 +10088,7 @@ int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge()
 
   cur_quick_it.rewind();
   cur_quick= cur_quick_it++;
+  bool first_quick= TRUE;
   DBUG_ASSERT(cur_quick != 0);
   
   /*
@@ -8163,9 +10106,11 @@ int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge()
 
     unique= new Unique(refpos_order_cmp, (void *)file,
                        file->ref_length,
-                       thd->variables.sortbuff_size);
+                       thd->variables.sortbuff_size,
+                       intersection ? quick_selects.elements : 0);
     if (!unique)
       goto err;
+    *unique_ptr= unique;
   }
   else
     unique->reset();
@@ -8177,6 +10122,14 @@ int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge()
   {
     while ((result= cur_quick->get_next()) == HA_ERR_END_OF_FILE)
     {
+      if (intersection)
+        with_cpk_filter= filtered_scans->is_set(cur_quick->index);
+      if (first_quick)
+      {
+        first_quick= FALSE;
+        if (intersection && unique->is_in_memory())
+          unique->close_for_expansion();
+      }
       cur_quick->range_end();
       cur_quick= cur_quick_it++;
       if (!cur_quick)
@@ -8201,8 +10154,8 @@ int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge()
     if (thd->killed)
       goto err;
 
-    /* skip row if it will be retrieved by clustered PK scan */
-    if (pk_quick_select && pk_quick_select->row_in_ranges())
+    if (with_cpk_filter &&
+        pk_quick_select->row_in_ranges() != intersection )
       continue;
 
     cur_quick->file->position(cur_quick->record);
@@ -8216,14 +10169,13 @@ int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge()
     sequence.
   */
   result= unique->get(head);
-  doing_pk_scan= FALSE;
   /*
-    index_merge currently doesn't support "using index" at all
+    index merge currently doesn't support "using index" at all
   */
   head->disable_keyread();
-  if (init_read_record(&read_record, thd, head, (SQL_SELECT*) 0, 1 , 1, TRUE))
+  if (init_read_record(read_record, thd, head, (SQL_SELECT*) 0, 1 , 1, TRUE))
     result= 1;
-  DBUG_RETURN(result);
+ DBUG_RETURN(result);
 
 err:
   head->disable_keyread();
@@ -8231,6 +10183,17 @@ err:
 }
 
 
+/* Union flavour: run the shared helper with intersection == FALSE. */
+int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge()
+{
+  int rc;
+  DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::read_keys_and_merge");
+  rc= read_keys_and_merge_scans(thd, head, quick_selects, pk_quick_select,
+                                &read_record, FALSE, NULL, &unique);
+  doing_pk_scan= FALSE;
+  DBUG_RETURN(rc);
+}
+
 /*
   Get next row for index_merge.
   NOTES
@@ -8267,6 +10230,32 @@ int QUICK_INDEX_MERGE_SELECT::get_next()
   DBUG_RETURN(result);
 }
 
+/* Intersection flavour: run the shared helper with intersection == TRUE. */
+int QUICK_INDEX_INTERSECT_SELECT::read_keys_and_merge()
+{
+  int rc;
+  DBUG_ENTER("QUICK_INDEX_INTERSECT_SELECT::read_keys_and_merge");
+  rc= read_keys_and_merge_scans(thd, head, quick_selects, pk_quick_select,
+                                &read_record, TRUE, &filtered_scans,
+                                &unique);
+  DBUG_RETURN(rc);
+}
+
+/* Fetch the next row; a -1 from the reader is mapped to end of file. */
+int QUICK_INDEX_INTERSECT_SELECT::get_next()
+{
+  int rc;
+  DBUG_ENTER("QUICK_INDEX_INTERSECT_SELECT::get_next");
+  rc= read_record.read_record(&read_record);
+  if (rc != -1)
+    DBUG_RETURN(rc);
+
+  /* Reader returned -1: shut it down and release the IO cache. */
+  end_read_record(&read_record);
+  free_io_cache(head);
+  DBUG_RETURN(HA_ERR_END_OF_FILE);
+}
+
 
 /*
   Retrieve next record.
@@ -8427,12 +10416,12 @@ int QUICK_ROR_UNION_SELECT::get_next()
       {
         if (error != HA_ERR_END_OF_FILE)
           DBUG_RETURN(error);
-        queue_remove(&queue, 0);
+        queue_remove_top(&queue);
       }
       else
       {
         quick->save_last_pos();
-        queue_replaced(&queue);
+        queue_replace_top(&queue);
       }
 
       if (!have_prev_rowid)
@@ -8457,12 +10446,12 @@ int QUICK_ROR_UNION_SELECT::get_next()
 
 int QUICK_RANGE_SELECT::reset()
 {
-  uint  mrange_bufsiz;
+  uint  buf_size;
   uchar *mrange_buff;
+  int   error;
+  HANDLER_BUFFER empty_buf;
   DBUG_ENTER("QUICK_RANGE_SELECT::reset");
-  next=0;
   last_range= NULL;
-  in_range= FALSE;
   cur_range= (QUICK_RANGE**) ranges.buffer;
 
   if (file->inited == handler::NONE)
@@ -8473,69 +10462,43 @@ int QUICK_RANGE_SELECT::reset()
         DBUG_RETURN(error);
   }
 
-  /* Do not allocate the buffers twice. */
-  if (multi_range_length)
-  {
-    DBUG_ASSERT(multi_range_length == min(multi_range_count, ranges.elements));
-    DBUG_RETURN(0);
-  }
-
-  /* Allocate the ranges array. */
-  DBUG_ASSERT(ranges.elements);
-  multi_range_length= min(multi_range_count, ranges.elements);
-  DBUG_ASSERT(multi_range_length > 0);
-  while (multi_range_length && ! (multi_range= (KEY_MULTI_RANGE*)
-                                  my_malloc(multi_range_length *
-                                            sizeof(KEY_MULTI_RANGE),
-                                            MYF(MY_WME))))
-  {
-    /* Try to shrink the buffers until it is 0. */
-    multi_range_length/= 2;
-  }
-  if (! multi_range)
+  /* Allocate buffer if we need one but haven't allocated it yet */
+  if (mrr_buf_size && !mrr_buf_desc)
   {
-    multi_range_length= 0;
-    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
-  }
-
-  /* Allocate the handler buffer if necessary.  */
-  if (file->ha_table_flags() & HA_NEED_READ_RANGE_BUFFER)
-  {
-    mrange_bufsiz= min(multi_range_bufsiz,
-                       ((uint)QUICK_SELECT_I::records + 1)* head->s->reclength);
-
-    while (mrange_bufsiz &&
-           ! my_multi_malloc(MYF(MY_WME),
-                             &multi_range_buff,
-                             (uint) sizeof(*multi_range_buff),
-                             &mrange_buff, (uint) mrange_bufsiz,
-                             NullS))
+    buf_size= mrr_buf_size;
+    while (buf_size && !my_multi_malloc(MYF(MY_WME),
+                                        &mrr_buf_desc, sizeof(*mrr_buf_desc),
+                                        &mrange_buff, buf_size,
+                                        NullS))
     {
       /* Try to shrink the buffers until both are 0. */
-      mrange_bufsiz/= 2;
+      buf_size/= 2;
     }
-    if (! multi_range_buff)
-    {
-      my_free((char*) multi_range, MYF(0));
-      multi_range= NULL;
-      multi_range_length= 0;
+    if (!mrr_buf_desc)
       DBUG_RETURN(HA_ERR_OUT_OF_MEM);
-    }
 
     /* Initialize the handler buffer. */
-    multi_range_buff->buffer= mrange_buff;
-    multi_range_buff->buffer_end= mrange_buff + mrange_bufsiz;
-    multi_range_buff->end_of_used_area= mrange_buff;
+    mrr_buf_desc->buffer= mrange_buff;
+    mrr_buf_desc->buffer_end= mrange_buff + buf_size;
+    mrr_buf_desc->end_of_used_area= mrange_buff;
 #ifdef HAVE_valgrind
     /*
       We need this until ndb will use the buffer efficiently
       (Now ndb stores  complete row in here, instead of only the used fields
       which gives us valgrind warnings in compare_record[])
     */
-    bzero((char*) mrange_buff, mrange_bufsiz);
+    bzero((char*) mrange_buff, buf_size);
 #endif
   }
-  DBUG_RETURN(0);
+
+  if (!mrr_buf_desc)
+    empty_buf.buffer= empty_buf.buffer_end= empty_buf.end_of_used_area= NULL;
+  RANGE_SEQ_IF seq_funcs= {NULL, quick_range_seq_init,
+                           quick_range_seq_next, 0, 0};
+  error= file->multi_range_read_init(&seq_funcs, (void*)this, ranges.elements,
+                                     mrr_flags,
+                                     mrr_buf_desc ? mrr_buf_desc : &empty_buf);
+  DBUG_RETURN(error);
 }
 
 
@@ -8556,13 +10519,8 @@ int QUICK_RANGE_SELECT::reset()
 
 int QUICK_RANGE_SELECT::get_next()
 {
-  int             result;
-  KEY_MULTI_RANGE *mrange;
+  range_id_t dummy;
   DBUG_ENTER("QUICK_RANGE_SELECT::get_next");
-  DBUG_ASSERT(multi_range_length && multi_range &&
-              (cur_range >= (QUICK_RANGE**) ranges.buffer) &&
-              (cur_range <= (QUICK_RANGE**) ranges.buffer + ranges.elements));
-
   if (in_ror_merged_scan)
   {
     /*
@@ -8572,46 +10530,8 @@ int QUICK_RANGE_SELECT::get_next()
     head->column_bitmaps_set_no_signal(&column_bitmap, &column_bitmap);
   }
 
-  for (;;)
-  {
-    if (in_range)
-    {
-      /* We did already start to read this key. */
-      result= file->read_multi_range_next(&mrange);
-      if (result != HA_ERR_END_OF_FILE)
-        goto end;
-    }
-
-    uint count= min(multi_range_length, ranges.elements -
-                    (cur_range - (QUICK_RANGE**) ranges.buffer));
-    if (count == 0)
-    {
-      /* Ranges have already been used up before. None is left for read. */
-      in_range= FALSE;
-      if (in_ror_merged_scan)
-        head->column_bitmaps_set_no_signal(save_read_set, save_write_set);
-      DBUG_RETURN(HA_ERR_END_OF_FILE);
-    }
-    KEY_MULTI_RANGE *mrange_slot, *mrange_end;
-    for (mrange_slot= multi_range, mrange_end= mrange_slot+count;
-         mrange_slot < mrange_end;
-         mrange_slot++)
-    {
-      last_range= *(cur_range++);
-      last_range->make_min_endpoint(&mrange_slot->start_key);
-      last_range->make_max_endpoint(&mrange_slot->end_key);
-      mrange_slot->range_flag= last_range->flag;
-    }
-
-    result= file->read_multi_range_first(&mrange, multi_range, count,
-                                         sorted, multi_range_buff);
-    if (result != HA_ERR_END_OF_FILE)
-      goto end;
-    in_range= FALSE; /* No matching rows; go to next set of ranges. */
-  }
+  int result= file->multi_range_read_next(&dummy);
 
-end:
-  in_range= ! result;
   if (in_ror_merged_scan)
   {
     /* Restore bitmaps set on entry */
@@ -8620,6 +10540,7 @@ end:
   DBUG_RETURN(result);
 }
 
+
 /*
   Get the next record with a different prefix.
 
@@ -8658,9 +10579,7 @@ int QUICK_RANGE_SELECT::get_next_prefix(uint prefix_length,
     int result;
     if (last_range)
     {
-      /*
-        Read the next record in the same range with prefix after cur_prefix.
-      */
+      /* Read the next record in the same range with prefix after cur_prefix. */
       DBUG_ASSERT(cur_prefix != NULL);
       result= file->ha_index_read_map(record, cur_prefix, keypart_map,
                                       HA_READ_AFTER_KEY);
@@ -8789,7 +10708,8 @@ bool QUICK_RANGE_SELECT::row_in_ranges()
  */
 
 QUICK_SELECT_DESC::QUICK_SELECT_DESC(QUICK_RANGE_SELECT *q,
-                                     uint used_key_parts_arg)
+                                     uint used_key_parts_arg,
+                                     bool *create_err)
  :QUICK_RANGE_SELECT(*q), rev_it(rev_ranges),
   used_key_parts (used_key_parts_arg)
 {
@@ -8798,9 +10718,9 @@ QUICK_SELECT_DESC::QUICK_SELECT_DESC(QUICK_RANGE_SELECT *q,
     Use default MRR implementation for reverse scans. No table engine
     currently can do an MRR scan with output in reverse index order.
   */
-  multi_range_length= 0;
-  multi_range= NULL;
-  multi_range_buff= NULL;
+  mrr_buf_desc= NULL;
+  mrr_flags |= HA_MRR_USE_DEFAULT_IMPL;
+  mrr_buf_size= 0;
 
   QUICK_RANGE **pr= (QUICK_RANGE**)ranges.buffer;
   QUICK_RANGE **end_range= pr + ranges.elements;
@@ -8909,6 +10829,7 @@ int QUICK_SELECT_DESC::get_next()
 /*
   Compare if found key is over max-value
   Returns 0 if key <= range->max_key
+  TODO: Figure out why this function can't be as simple as cmp_prev().
 */
 
 int QUICK_RANGE_SELECT::cmp_next(QUICK_RANGE *range_arg)
@@ -8978,30 +10899,53 @@ bool QUICK_SELECT_DESC::range_reads_after_key(QUICK_RANGE *range_arg)
 }
 
 
-void QUICK_RANGE_SELECT::add_info_string(String *str)
+void QUICK_SELECT_I::add_key_name(String *str, bool *first)
 {
   KEY *key_info= head->key_info + index;
+
+  if (*first)
+    *first= FALSE;
+  else
+    str->append(',');
   str->append(key_info->name);
 }
+ 
+
+void QUICK_RANGE_SELECT::add_info_string(String *str)
+{
+  bool first= TRUE;
+  
+  add_key_name(str, &first);
+}
 
 void QUICK_INDEX_MERGE_SELECT::add_info_string(String *str)
 {
   QUICK_RANGE_SELECT *quick;
   bool first= TRUE;
   List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
+
   str->append(STRING_WITH_LEN("sort_union("));
   while ((quick= it++))
   {
-    if (!first)
-      str->append(',');
-    else
-      first= FALSE;
-    quick->add_info_string(str);
+    quick->add_key_name(str, &first);
   }
   if (pk_quick_select)
+    pk_quick_select->add_key_name(str, &first);
+  str->append(')');
+}
+
+void QUICK_INDEX_INTERSECT_SELECT::add_info_string(String *str)
+{
+  QUICK_RANGE_SELECT *quick;
+  bool first= TRUE;
+  List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
+
+  str->append(STRING_WITH_LEN("sort_intersect("));
+  if (pk_quick_select)
+    pk_quick_select->add_key_name(str, &first);
+  while ((quick= it++))
   {
-    str->append(',');
-    pk_quick_select->add_info_string(str);
+    quick->add_key_name(str, &first);
   }
   str->append(')');
 }
@@ -9011,130 +10955,125 @@ void QUICK_ROR_INTERSECT_SELECT::add_info_string(String *str)
   bool first= TRUE;
   QUICK_SELECT_WITH_RECORD *qr;
   List_iterator_fast<QUICK_SELECT_WITH_RECORD> it(quick_selects);
+
   str->append(STRING_WITH_LEN("intersect("));
   while ((qr= it++))
   {
-    KEY *key_info= head->key_info + qr->quick->index;
-    if (!first)
-      str->append(',');
-    else
-      first= FALSE;
-    str->append(key_info->name);
+    qr->quick->add_key_name(str, &first);
   }
   if (cpk_quick)
-  {
-    KEY *key_info= head->key_info + cpk_quick->index;
-    str->append(',');
-    str->append(key_info->name);
-  }
+    cpk_quick->add_key_name(str, &first);
   str->append(')');
 }
 
+
 void QUICK_ROR_UNION_SELECT::add_info_string(String *str)
 {
-  bool first= TRUE;
   QUICK_SELECT_I *quick;
+  bool first= TRUE;
   List_iterator_fast<QUICK_SELECT_I> it(quick_selects);
+
   str->append(STRING_WITH_LEN("union("));
   while ((quick= it++))
   {
-    if (!first)
-      str->append(',');
-    else
+    if (first)
       first= FALSE;
+    else
+      str->append(',');
     quick->add_info_string(str);
   }
   str->append(')');
 }
 
 
-void QUICK_RANGE_SELECT::add_keys_and_lengths(String *key_names,
-                                              String *used_lengths)
+void QUICK_SELECT_I::add_key_and_length(String *key_names,
+                                        String *used_lengths,
+                                        bool *first)
 {
   char buf[64];
   uint length;
   KEY *key_info= head->key_info + index;
+
+  if (*first)
+    *first= FALSE;
+  else
+  {
+    key_names->append(',');
+    used_lengths->append(',');
+  }
   key_names->append(key_info->name);
   length= longlong10_to_str(max_used_key_length, buf, 10) - buf;
   used_lengths->append(buf, length);
 }
 
+
+void QUICK_RANGE_SELECT::add_keys_and_lengths(String *key_names,
+                                              String *used_lengths)
+{
+  bool first= TRUE;
+
+  add_key_and_length(key_names, used_lengths, &first);
+}
+
 void QUICK_INDEX_MERGE_SELECT::add_keys_and_lengths(String *key_names,
                                                     String *used_lengths)
 {
-  char buf[64];
-  uint length;
-  bool first= TRUE;
   QUICK_RANGE_SELECT *quick;
+  bool first= TRUE;
 
   List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
+
   while ((quick= it++))
   {
-    if (first)
-      first= FALSE;
-    else
-    {
-      key_names->append(',');
-      used_lengths->append(',');
-    }
-
-    KEY *key_info= head->key_info + quick->index;
-    key_names->append(key_info->name);
-    length= longlong10_to_str(quick->max_used_key_length, buf, 10) - buf;
-    used_lengths->append(buf, length);
+    quick->add_key_and_length(key_names, used_lengths, &first);
   }
+
+  if (pk_quick_select)
+    pk_quick_select->add_key_and_length(key_names, used_lengths, &first);
+}
+
+
+void QUICK_INDEX_INTERSECT_SELECT::add_keys_and_lengths(String *key_names,
+                                                        String *used_lengths)
+{
+  QUICK_RANGE_SELECT *quick;
+  bool first= TRUE;
+
+  List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
+
   if (pk_quick_select)
+    pk_quick_select->add_key_and_length(key_names, used_lengths, &first);
+
+  while ((quick= it++))
   {
-    KEY *key_info= head->key_info + pk_quick_select->index;
-    key_names->append(',');
-    key_names->append(key_info->name);
-    length= (longlong10_to_str(pk_quick_select->max_used_key_length, buf, 10)
-             - buf);
-    used_lengths->append(',');
-    used_lengths->append(buf, length);
+    quick->add_key_and_length(key_names, used_lengths, &first);
   }
 }
 
 void QUICK_ROR_INTERSECT_SELECT::add_keys_and_lengths(String *key_names,
                                                       String *used_lengths)
 {
-  char buf[64];
-  uint length;
-  bool first= TRUE;
   QUICK_SELECT_WITH_RECORD *qr;
+  bool first= TRUE;
+
   List_iterator_fast<QUICK_SELECT_WITH_RECORD> it(quick_selects);
+
   while ((qr= it++))
   {
-    KEY *key_info= head->key_info + qr->quick->index;
-    if (first)
-      first= FALSE;
-    else
-    {
-      key_names->append(',');
-      used_lengths->append(',');
-    }
-    key_names->append(key_info->name);
-    length= longlong10_to_str(qr->quick->max_used_key_length, buf, 10) - buf;
-    used_lengths->append(buf, length);
+    qr->quick->add_key_and_length(key_names, used_lengths, &first);
   }
-
   if (cpk_quick)
-  {
-    KEY *key_info= head->key_info + cpk_quick->index;
-    key_names->append(',');
-    key_names->append(key_info->name);
-    length= longlong10_to_str(cpk_quick->max_used_key_length, buf, 10) - buf;
-    used_lengths->append(',');
-    used_lengths->append(buf, length);
-  }
+    cpk_quick->add_key_and_length(key_names, used_lengths, &first);
 }
 
 void QUICK_ROR_UNION_SELECT::add_keys_and_lengths(String *key_names,
                                                   String *used_lengths)
 {
-  bool first= TRUE;
   QUICK_SELECT_I *quick;
+  bool first= TRUE;
+
   List_iterator_fast<QUICK_SELECT_I> it(quick_selects);
+
   while ((quick= it++))
   {
     if (first)
@@ -9641,8 +11580,14 @@ get_best_group_min_max(PARAM *param, SEL_TREE *tree)
       cur_index_tree= get_index_range_tree(cur_index, tree, param,
                                            &cur_param_idx);
       /* Check if this range tree can be used for prefix retrieval. */
+      COST_VECT dummy_cost;
+      uint mrr_flags= HA_MRR_USE_DEFAULT_IMPL;
+      uint mrr_bufsize=0;
       cur_quick_prefix_records= check_quick_select(param, cur_param_idx,
-                                                    cur_index_tree, TRUE);
+                                                   FALSE /*don't care*/,
+                                                   cur_index_tree, TRUE,
+                                                   &mrr_flags, &mrr_bufsize,
+                                                   &dummy_cost);
     }
     cost_group_min_max(table, cur_index_info, cur_used_key_parts,
                        cur_group_key_parts, tree, cur_index_tree,
@@ -9762,7 +11707,9 @@ check_group_min_max_predicates(COND *cond, Item_field *min_max_arg_item,
     the MIN/MAX argument field, and disallow the optimization only if this is
     so.
   */
-  if (cond_type == Item::SUBSELECT_ITEM)
+  if (cond_type == Item::SUBSELECT_ITEM ||
+      (cond->get_cached_item() &&
+       cond->get_cached_item()->type() == Item::SUBSELECT_ITEM))
     DBUG_RETURN(FALSE);
 
   /*
@@ -9910,13 +11857,14 @@ get_constant_key_infix(KEY *index_info, SEL_ARG *index_range_tree,
       Find the range tree for the current keypart. We assume that
       index_range_tree points to the leftmost keypart in the index.
     */
-    for (cur_range= index_range_tree; cur_range;
+    for (cur_range= index_range_tree; 
+         cur_range && cur_range->type == SEL_ARG::KEY_RANGE;
          cur_range= cur_range->next_key_part)
     {
       if (cur_range->field->eq(cur_part->field))
         break;
     }
-    if (!cur_range)
+    if (!cur_range || cur_range->type != SEL_ARG::KEY_RANGE)
     {
       if (min_max_arg_part)
         return FALSE; /* The current keypart has no range predicates at all. */
@@ -10230,6 +12178,7 @@ TRP_GROUP_MIN_MAX::make_quick(PARAM *param, bool retrieve_full_rows,
       /* Make a QUICK_RANGE_SELECT to be used for group prefix retrieval. */
       quick->quick_prefix_select= get_quick_select(param, param_idx,
                                                    index_tree,
+                                                   HA_MRR_USE_DEFAULT_IMPL, 0,
                                                    &quick->alloc);
 
     /*
@@ -11285,11 +13234,9 @@ void QUICK_GROUP_MIN_MAX_SELECT::update_max_result()
 void QUICK_GROUP_MIN_MAX_SELECT::add_keys_and_lengths(String *key_names,
                                                       String *used_lengths)
 {
-  char buf[64];
-  uint length;
-  key_names->append(index_info->name);
-  length= longlong10_to_str(max_used_key_length, buf, 10) - buf;
-  used_lengths->append(buf, length);
+  bool first= TRUE;
+
+  add_key_and_length(key_names, used_lengths, &first);
 }
 
 
@@ -11321,7 +13268,7 @@ static void print_sel_tree(PARAM *param, SEL_TREE *tree, key_map *tree_map,
     tmp.append(STRING_WITH_LEN("(empty)"));
 
   DBUG_PRINT("info", ("SEL_TREE: 0x%lx (%s)  scans: %s", (long) tree, msg,
-                      tmp.c_ptr()));
+                      tmp.c_ptr_safe()));
 
   DBUG_VOID_RETURN;
 }
@@ -11348,6 +13295,7 @@ static void print_ror_scans_arr(TABLE *table, const char *msg,
   DBUG_VOID_RETURN;
 }
 
+
 /*****************************************************************************
 ** Print a quick range for debugging
 ** TODO:
@@ -11456,7 +13404,7 @@ void QUICK_RANGE_SELECT::dbug_dump(int indent, bool verbose)
   /* purecov: end */    
 }
 
-void QUICK_INDEX_MERGE_SELECT::dbug_dump(int indent, bool verbose)
+void QUICK_INDEX_SORT_SELECT::dbug_dump(int indent, bool verbose)
 {
   List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
   QUICK_RANGE_SELECT *quick;
diff --git a/sql/opt_range.h b/sql/opt_range.h
index 1321b3886ee..d7a0c1e2f8f 100644
--- a/sql/opt_range.h
+++ b/sql/opt_range.h
@@ -193,13 +193,16 @@ class QUICK_RANGE :public Sql_alloc {
 
   4. Delete the select:
     delete quick;
-
+  
+  NOTE 
+    quick select doesn't use Sql_alloc/MEM_ROOT allocation because "range
+    checked for each record" functionality may create/destroy
+    O(#records_in_some_table) quick selects during query execution.
 */
 
 class QUICK_SELECT_I
 {
 public:
-  bool sorted;
   ha_rows records;  /* estimate of # of records to be retrieved */
   double  read_time; /* time to perform this retrieval          */
   TABLE   *head;
@@ -273,14 +276,21 @@ public:
   virtual bool unique_key_range() { return false; }
   virtual bool clustered_pk_range() { return false; }
 
+  /*
+    Request that this quick select produces sorted output. Not all quick
+    selects can do it, the caller is responsible for calling this function
+    only for those quick selects that can.
+  */
+  virtual void need_sorted_output() = 0;
   enum {
     QS_TYPE_RANGE = 0,
-    QS_TYPE_INDEX_MERGE = 1,
-    QS_TYPE_RANGE_DESC = 2,
-    QS_TYPE_FULLTEXT   = 3,
-    QS_TYPE_ROR_INTERSECT = 4,
-    QS_TYPE_ROR_UNION = 5,
-    QS_TYPE_GROUP_MIN_MAX = 6
+    QS_TYPE_INDEX_INTERSECT = 1,
+    QS_TYPE_INDEX_MERGE = 2,
+    QS_TYPE_RANGE_DESC = 3,
+    QS_TYPE_FULLTEXT   = 4,
+    QS_TYPE_ROR_INTERSECT = 5,
+    QS_TYPE_ROR_UNION = 6,
+    QS_TYPE_GROUP_MIN_MAX = 7
   };
 
   /* Get type of this quick select - one of the QS_TYPE_* values */
@@ -306,6 +316,10 @@ public:
     Save ROWID of last retrieved row in file->ref. This used in ROR-merging.
   */
   virtual void save_last_pos(){};
+  
+  void add_key_and_length(String *key_names,
+                          String *used_lengths,
+                          bool *first);
 
   /*
     Append comma-separated list of keys this quick select uses to key_names;
@@ -315,13 +329,16 @@ public:
   virtual void add_keys_and_lengths(String *key_names,
                                     String *used_lengths)=0;
 
+  void add_key_name(String *str, bool *first);
+
   /*
     Append text representation of quick select structure (what and how is
     merged) to str. The result is added to "Extra" field in EXPLAIN output.
     This function is implemented only by quick selects that merge other quick
     selects output and/or can produce output suitable for merging.
   */
-  virtual void add_info_string(String *str) {};
+  virtual void add_info_string(String *str) {}
+
   /*
     Return 1 if any index used by this quick select
     uses field which is marked in passed bitmap.
@@ -352,6 +369,22 @@ struct st_qsel_param;
 class PARAM;
 class SEL_ARG;
 
+
+/*
+  MRR range sequence, array<QUICK_RANGE> implementation: sequence traversal
+  context.
+*/
+typedef struct st_quick_range_seq_ctx
+{
+  QUICK_RANGE **first;
+  QUICK_RANGE **cur;
+  QUICK_RANGE **last;
+} QUICK_RANGE_SEQ_CTX;
+
+range_seq_t quick_range_seq_init(void *init_param, uint n_ranges, uint flags);
+bool quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
+
+
 /*
   Quick select that does a range scan on a single key. The records are
   returned in key order.
@@ -359,62 +392,47 @@ class SEL_ARG;
 class QUICK_RANGE_SELECT : public QUICK_SELECT_I
 {
 protected:
-  bool next,dont_free,in_ror_merged_scan;
   /* true if we enabled key only reads */
   bool doing_key_read;
-public:
-  int error;
-protected:
   handler *file;
-  /*
-    If true, this quick select has its "own" handler object which should be
-    closed no later then this quick select is deleted.
-  */
-  bool free_file;
-  bool in_range;
-  uint multi_range_count; /* copy from thd->variables.multi_range_count */
-  uint multi_range_length; /* the allocated length for the array */
-  uint multi_range_bufsiz; /* copy from thd->variables.read_rnd_buff_size */
-  KEY_MULTI_RANGE *multi_range; /* the multi-range array (allocated and
-                                       freed by QUICK_RANGE_SELECT) */
-  HANDLER_BUFFER *multi_range_buff; /* the handler buffer (allocated and
-                                       freed by QUICK_RANGE_SELECT) */
+
+  /* Members to deal with case when this quick select is a ROR-merged scan */
+  bool in_ror_merged_scan;
   MY_BITMAP column_bitmap, *save_read_set, *save_write_set;
+  bool free_file;   /* TRUE <=> this->file is "owned" by this quick select */
 
-  friend class TRP_ROR_INTERSECT;
-  friend
-  QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
-                                               struct st_table_ref *ref,
-                                               ha_rows records);
-  friend bool get_quick_keys(PARAM *param,
-                             QUICK_RANGE_SELECT *quick,KEY_PART *key,
-                             SEL_ARG *key_tree,
-                             uchar *min_key, uint min_key_flag,
-                             uchar *max_key, uint max_key_flag);
-  friend QUICK_RANGE_SELECT *get_quick_select(PARAM*,uint idx,
-                                              SEL_ARG *key_tree,
-                                              MEM_ROOT *alloc);
-  friend class QUICK_SELECT_DESC;
-  friend class QUICK_INDEX_MERGE_SELECT;
-  friend class QUICK_ROR_INTERSECT_SELECT;
-  friend class QUICK_GROUP_MIN_MAX_SELECT;
+  /* Range pointers to be used when not using MRR interface */
+  /* Members needed to use the MRR interface */
+  QUICK_RANGE_SEQ_CTX qr_traversal_ctx;
+public:
+  uint mrr_flags; /* Flags to be used with MRR interface */
+protected:
+  uint mrr_buf_size; /* copy from thd->variables.mrr_buff_size */  
+  HANDLER_BUFFER *mrr_buf_desc; /* the handler buffer */
 
+  /* Info about index we're scanning */
+  
   DYNAMIC_ARRAY ranges;     /* ordered array of range ptrs */
   QUICK_RANGE **cur_range;  /* current element in ranges  */
-
+  
   QUICK_RANGE *last_range;
+  
   KEY_PART *key_parts;
   KEY_PART_INFO *key_part_info;
+  
+  bool dont_free; /* Used by QUICK_SELECT_DESC */
+
   int cmp_next(QUICK_RANGE *range);
   int cmp_prev(QUICK_RANGE *range);
   bool row_in_ranges();
 public:
   MEM_ROOT alloc;
 
-  QUICK_RANGE_SELECT(THD *thd, TABLE *table,uint index_arg,bool no_alloc=0,
-                     MEM_ROOT *parent_alloc=NULL);
+  QUICK_RANGE_SELECT(THD *thd, TABLE *table,uint index_arg,bool no_alloc,
+                     MEM_ROOT *parent_alloc, bool *create_err);
   ~QUICK_RANGE_SELECT();
-
+  
+  void need_sorted_output();
   int init();
   int reset(void);
   int get_next();
@@ -434,6 +452,38 @@ public:
 #endif
 private:
   /* Default copy ctor used by QUICK_SELECT_DESC */
+  friend class TRP_ROR_INTERSECT;
+  friend
+  QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
+                                               struct st_table_ref *ref,
+                                               ha_rows records);
+  friend bool get_quick_keys(PARAM *param, QUICK_RANGE_SELECT *quick, 
+                             KEY_PART *key, SEL_ARG *key_tree, 
+                             uchar *min_key, uint min_key_flag,
+                             uchar *max_key, uint max_key_flag);
+  friend QUICK_RANGE_SELECT *get_quick_select(PARAM*,uint idx,
+                                              SEL_ARG *key_tree,
+                                              uint mrr_flags,
+                                              uint mrr_buf_size,
+                                              MEM_ROOT *alloc);
+  friend class QUICK_SELECT_DESC;
+  friend class QUICK_INDEX_SORT_SELECT;
+  friend class QUICK_INDEX_MERGE_SELECT;
+  friend class QUICK_ROR_INTERSECT_SELECT;
+  friend class QUICK_INDEX_INTERSECT_SELECT;
+  friend class QUICK_GROUP_MIN_MAX_SELECT;
+  friend bool quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
+  friend range_seq_t quick_range_seq_init(void *init_param,
+                                          uint n_ranges, uint flags);
+  friend 
+  int read_keys_and_merge_scans(THD *thd, TABLE *head,
+                                List<QUICK_RANGE_SELECT> quick_selects,
+                                QUICK_RANGE_SELECT *pk_quick_select,
+                                READ_RECORD *read_record,
+                                bool intersection,
+                                key_map *filtered_scans,
+                                Unique **unique_ptr);
+
 };
 
 
@@ -441,48 +491,53 @@ class QUICK_RANGE_SELECT_GEOM: public QUICK_RANGE_SELECT
 {
 public:
   QUICK_RANGE_SELECT_GEOM(THD *thd, TABLE *table, uint index_arg,
-                          bool no_alloc, MEM_ROOT *parent_alloc)
-    :QUICK_RANGE_SELECT(thd, table, index_arg, no_alloc, parent_alloc)
+                          bool no_alloc, MEM_ROOT *parent_alloc, 
+                          bool *create_err)
+    :QUICK_RANGE_SELECT(thd, table, index_arg, no_alloc, parent_alloc,
+    create_err)
     {};
   virtual int get_next();
 };
 
 
 /*
-  QUICK_INDEX_MERGE_SELECT - index_merge access method quick select.
+  QUICK_INDEX_SORT_SELECT is the base class for the common functionality of:
+  - QUICK_INDEX_MERGE_SELECT, access based on multi-index merge/union 
+  - QUICK_INDEX_INTERSECT_SELECT, access based on  multi-index intersection 
+    
 
-    QUICK_INDEX_MERGE_SELECT uses
+    QUICK_INDEX_SORT_SELECT uses
      * QUICK_RANGE_SELECTs to get rows
-     * Unique class to remove duplicate rows
+     * Unique class
+       - to remove duplicate rows for QUICK_INDEX_MERGE_SELECT
+       - to intersect rows for QUICK_INDEX_INTERSECT_SELECT
 
   INDEX MERGE OPTIMIZER
-    Current implementation doesn't detect all cases where index_merge could
+    Current implementation doesn't detect all cases where index merge could
     be used, in particular:
-     * index_merge will never be used if range scan is possible (even if
-       range scan is more expensive)
 
-     * index_merge+'using index' is not supported (this the consequence of
-       the above restriction)
+     * index_merge+'using index' is not supported
 
      * If WHERE part contains complex nested AND and OR conditions, some ways
-       to retrieve rows using index_merge will not be considered. The choice
+       to retrieve rows using index merge will not be considered. The choice
        of read plan may depend on the order of conjuncts/disjuncts in WHERE
        part of the query, see comments near imerge_list_or_list and
        SEL_IMERGE::or_sel_tree_with_checks functions for details.
 
-     * There is no "index_merge_ref" method (but index_merge on non-first
+     * There is no "index_merge_ref" method (but index merge on non-first
        table in join is possible with 'range checked for each record').
 
-    See comments around SEL_IMERGE class and test_quick_select for more
-    details.
 
   ROW RETRIEVAL ALGORITHM
 
-    index_merge uses Unique class for duplicates removal.  index_merge takes
-    advantage of Clustered Primary Key (CPK) if the table has one.
-    The index_merge algorithm consists of two phases:
+    Index merge uses the Unique class to remove duplicate rows; index
+    intersection uses it to intersect rows. Both take advantage of the
+    Clustered Primary Key (CPK) if the table has one.
+    The index merge/intersection algorithm consists of two phases:
+
+    Phase 1 
+    (implemented by a QUICK_INDEX_MERGE_SELECT::read_keys_and_merge call):
 
-    Phase 1 (implemented in QUICK_INDEX_MERGE_SELECT::prepare_unique):
     prepare()
     {
       activate 'index only';
@@ -496,32 +551,32 @@ public:
       deactivate 'index only';
     }
 
-    Phase 2 (implemented as sequence of QUICK_INDEX_MERGE_SELECT::get_next
-    calls):
+    Phase 2 
+    (implemented as sequence of QUICK_INDEX_MERGE_SELECT::get_next calls):
 
     fetch()
     {
-      retrieve all rows from row pointers stored in Unique;
+      retrieve all rows from row pointers stored in Unique
+      (merging/intersecting them);
       free Unique;
-      retrieve all rows for CPK scan;
+      if (! intersection) 
+        retrieve all rows for CPK scan;
     }
 */
 
-class QUICK_INDEX_MERGE_SELECT : public QUICK_SELECT_I
+class QUICK_INDEX_SORT_SELECT : public QUICK_SELECT_I
 {
+protected:
   Unique *unique;
 public:
-  QUICK_INDEX_MERGE_SELECT(THD *thd, TABLE *table);
-  ~QUICK_INDEX_MERGE_SELECT();
+  QUICK_INDEX_SORT_SELECT(THD *thd, TABLE *table);
+  ~QUICK_INDEX_SORT_SELECT();
 
   int  init();
+  void need_sorted_output() { DBUG_ASSERT(0); /* Can't do it */ }
   int  reset(void);
-  int  get_next();
   bool reverse_sorted() { return false; }
   bool unique_key_range() { return false; }
-  int get_type() { return QS_TYPE_INDEX_MERGE; }
-  void add_keys_and_lengths(String *key_names, String *used_lengths);
-  void add_info_string(String *str);
   bool is_keys_used(const MY_BITMAP *fields);
 #ifndef DBUG_OFF
   void dbug_dump(int indent, bool verbose);
@@ -529,18 +584,15 @@ public:
 
   bool push_quick_back(QUICK_RANGE_SELECT *quick_sel_range);
 
-  /* range quick selects this index_merge read consists of */
+  /* range quick selects this index merge/intersect consists of */
   List<QUICK_RANGE_SELECT> quick_selects;
 
   /* quick select that uses clustered primary key (NULL if none) */
   QUICK_RANGE_SELECT* pk_quick_select;
 
-  /* true if this select is currently doing a clustered PK scan */
-  bool  doing_pk_scan;
-
   MEM_ROOT alloc;
   THD *thd;
-  int read_keys_and_merge();
+  virtual int read_keys_and_merge()= 0;
 
   bool clustered_pk_range() { return test(pk_quick_select); }
 
@@ -549,6 +601,42 @@ public:
 };
 
 
+/* Multi-index merge/union variant of QUICK_INDEX_SORT_SELECT */
+class QUICK_INDEX_MERGE_SELECT : public QUICK_INDEX_SORT_SELECT
+{
+private:
+  /* true if this select is currently doing a clustered PK scan */
+  bool  doing_pk_scan; /* NOTE(review): not set by the inline ctor below */
+protected:
+  int read_keys_and_merge(); /* pure virtual in QUICK_INDEX_SORT_SELECT */
+
+public:
+  QUICK_INDEX_MERGE_SELECT(THD *thd, TABLE *table)
+    :QUICK_INDEX_SORT_SELECT(thd, table) {}
+
+  int get_next();
+  int get_type() { return QS_TYPE_INDEX_MERGE; }
+  void add_keys_and_lengths(String *key_names, String *used_lengths);
+  void add_info_string(String *str);
+};
+/* Multi-index intersection variant of QUICK_INDEX_SORT_SELECT */
+class QUICK_INDEX_INTERSECT_SELECT : public QUICK_INDEX_SORT_SELECT
+{
+protected:
+  int read_keys_and_merge(); /* pure virtual in QUICK_INDEX_SORT_SELECT */
+
+public:
+  QUICK_INDEX_INTERSECT_SELECT(THD *thd, TABLE *table)
+    :QUICK_INDEX_SORT_SELECT(thd, table) {}
+
+  key_map filtered_scans; /* NOTE(review): meaning not visible here - confirm */
+  int get_next();
+  int get_type() { return QS_TYPE_INDEX_INTERSECT; }
+  void add_keys_and_lengths(String *key_names, String *used_lengths);
+  void add_info_string(String *str);
+};
+
+
 /*
   Rowid-Ordered Retrieval (ROR) index intersection quick select.
   This quick select produces intersection of row sequences returned
@@ -576,6 +664,7 @@ public:
   ~QUICK_ROR_INTERSECT_SELECT();
 
   int  init();
+  void need_sorted_output() { DBUG_ASSERT(0); /* Can't do it */ }
   int  reset(void);
   int  get_next();
   bool reverse_sorted() { return false; }
@@ -638,6 +727,7 @@ public:
   ~QUICK_ROR_UNION_SELECT();
 
   int  init();
+  void need_sorted_output() { DBUG_ASSERT(0); /* Can't do it */ }
   int  reset(void);
   int  get_next();
   bool reverse_sorted() { return false; }
@@ -759,6 +849,7 @@ public:
   void adjust_prefix_ranges();
   bool alloc_buffers();
   int init();
+  void need_sorted_output() { /* always do it */ }
   int reset();
   int get_next();
   bool reverse_sorted() { return false; }
@@ -774,7 +865,8 @@ public:
 class QUICK_SELECT_DESC: public QUICK_RANGE_SELECT
 {
 public:
-  QUICK_SELECT_DESC(QUICK_RANGE_SELECT *q, uint used_key_parts);
+  QUICK_SELECT_DESC(QUICK_RANGE_SELECT *q, uint used_key_parts, 
+                    bool *create_err);
   int get_next();
   bool reverse_sorted() { return 1; }
   int get_type() { return QS_TYPE_RANGE_DESC; }
@@ -791,6 +883,13 @@ class SQL_SELECT :public Sql_alloc {
  public:
   QUICK_SELECT_I *quick;	// If quick-select used
   COND		*cond;		// where condition
+
+  /*
+    When using Index Condition Pushdown: condition that we've had before
+    extracting and pushing index condition.
+    In other cases, NULL.
+  */
+  Item *pre_idx_push_select_cond;
   TABLE	*head;
   IO_CACHE file;		// Positions to used records
   ha_rows records;		// Records in use if read from file
@@ -807,7 +906,7 @@ class SQL_SELECT :public Sql_alloc {
   {
     key_map tmp;
     tmp.set_all();
-    return test_quick_select(thd, tmp, 0, limit, force_quick_range) < 0;
+    return test_quick_select(thd, tmp, 0, limit, force_quick_range, FALSE) < 0;
   }
   /* 
     RETURN
@@ -823,21 +922,25 @@ class SQL_SELECT :public Sql_alloc {
     return rc;
   }
   int test_quick_select(THD *thd, key_map keys, table_map prev_tables,
-			ha_rows limit, bool force_quick_range);
+			ha_rows limit, bool force_quick_range, 
+                        bool ordered_output);
 };
 
 
-class FT_SELECT: public QUICK_RANGE_SELECT {
+class FT_SELECT: public QUICK_RANGE_SELECT 
+{
 public:
-  FT_SELECT(THD *thd, TABLE *table, uint key) :
-      QUICK_RANGE_SELECT (thd, table, key, 1) { VOID(init()); }
+  FT_SELECT(THD *thd, TABLE *table, uint key, bool *create_err) :
+      QUICK_RANGE_SELECT (thd, table, key, 1, NULL, create_err) 
+  { (void) init(); }
   ~FT_SELECT() { file->ft_end(); }
-  int init() { return error=file->ft_init(); }
+  int init() { return file->ft_init(); }
   int reset() { return 0; }
-  int get_next() { return error= file->ha_ft_read(record); }
+  int get_next() { return file->ha_ft_read(record); }
   int get_type() { return QS_TYPE_FULLTEXT; }
 };
 
+FT_SELECT *get_ft_select(THD *thd, TABLE *table, uint key);
 QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
                                              struct st_table_ref *ref,
                                              ha_rows records);
diff --git a/sql/opt_range_mrr.cc b/sql/opt_range_mrr.cc
new file mode 100644
index 00000000000..da6086d6cdc
--- /dev/null
+++ b/sql/opt_range_mrr.cc
@@ -0,0 +1,346 @@
+
+/****************************************************************************
+  MRR Range Sequence Interface implementation that walks a SEL_ARG* tree.
+ ****************************************************************************/
+
+/* MRR range sequence, SEL_ARG* implementation: traversal stack entry */
+typedef struct st_range_seq_entry 
+{
+  /* 
+    Pointers into the min and max key buffers. They point right after the
+    end of the key images. The 0-th entry has them at the key tuple start.
+  */
+  uchar *min_key, *max_key;
+  
+  /* 
+    Flags for the {keypart0, keypart1, ... this_keypart} subtuple.
+    min_key_flag may have NULL_RANGE set.
+  */
+  uint min_key_flag, max_key_flag;
+  
+  /* Number of key parts accumulated in the min and max key images */
+  uint min_key_parts, max_key_parts;
+  SEL_ARG *key_tree; /* SEL_ARG node of this entry (NULL in the 0-th entry) */
+} RANGE_SEQ_ENTRY;
+
+
+/*
+  MRR range sequence, SEL_ARG* implementation: SEL_ARG graph traversal context
+*/
+typedef struct st_sel_arg_range_seq
+{
+  uint keyno;      /* index of used tree in SEL_TREE structure */
+  uint real_keyno; /* Number of the index in the table (table->key_info[]) */
+  PARAM *param;    /* Supplies min_key/max_key buffers; gets traversal stats */
+  SEL_ARG *start; /* Root node of the traversed SEL_ARG* graph */
+  
+  RANGE_SEQ_ENTRY stack[MAX_REF_PARTS]; /* stack[0] is the base (empty) entry */
+  int i; /* Index of last used element in the above array */
+  
+  bool at_start; /* TRUE <=> The traversal has just started */
+} SEL_ARG_RANGE_SEQ;
+
+
+/*
+  Range sequence interface, SEL_ARG* implementation: Initialize the traversal
+
+  SYNOPSIS
+    sel_arg_range_seq_init()
+      init_param   SEL_ARG tree traversal context (a SEL_ARG_RANGE_SEQ*)
+      n_ranges     [ignored] The number of ranges obtained 
+      flags        [ignored] HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY
+
+  RETURN
+    Value of init_param
+*/
+
+range_seq_t sel_arg_range_seq_init(void *init_param, uint n_ranges, uint flags)
+{
+  SEL_ARG_RANGE_SEQ *seq= (SEL_ARG_RANGE_SEQ*)init_param;
+  seq->at_start= TRUE;
+  seq->stack[0].key_tree= NULL; /* base entry: no SEL_ARG node, empty keys */
+  seq->stack[0].min_key= seq->param->min_key;
+  seq->stack[0].min_key_flag= 0;
+  seq->stack[0].min_key_parts= 0;
+
+  seq->stack[0].max_key= seq->param->max_key;
+  seq->stack[0].max_key_flag= 0;
+  seq->stack[0].max_key_parts= 0;
+  seq->i= 0; /* only the base entry is on the stack */
+  return init_param;
+}
+
+/* Push key_tree as a new stack entry built on top of the current top entry */
+static void step_down_to(SEL_ARG_RANGE_SEQ *arg, SEL_ARG *key_tree)
+{
+  RANGE_SEQ_ENTRY *cur= &arg->stack[arg->i+1];
+  RANGE_SEQ_ENTRY *prev= &arg->stack[arg->i];
+  /* NOTE(review): stack[i+1] is not bounds-checked against MAX_REF_PARTS */
+  cur->key_tree= key_tree;
+  cur->min_key= prev->min_key;
+  cur->max_key= prev->max_key;
+  cur->min_key_parts= prev->min_key_parts;
+  cur->max_key_parts= prev->max_key_parts;
+  /* store_min/store_max presumably append to the image, return #parts added */
+  uint16 stor_length= arg->param->key[arg->keyno][key_tree->part].store_length;
+  cur->min_key_parts += key_tree->store_min(stor_length, &cur->min_key,
+                                            prev->min_key_flag);
+  cur->max_key_parts += key_tree->store_max(stor_length, &cur->max_key,
+                                            prev->max_key_flag);
+  /* Flags accumulate along the path: previous entry's flags OR this node's */
+  cur->min_key_flag= prev->min_key_flag | key_tree->min_flag;
+  cur->max_key_flag= prev->max_key_flag | key_tree->max_flag;
+
+  if (key_tree->is_null_interval())
+    cur->min_key_flag |= NULL_RANGE;
+  (arg->i)++; /* the new entry is now the top of the stack */
+}
+
+
+/*
+  Range sequence interface, SEL_ARG* implementation: get the next interval
+  
+  SYNOPSIS
+    sel_arg_range_seq_next()
+      rseq        Value returned from sel_arg_range_seq_init
+      range  OUT  Store information about the range here
+
+  DESCRIPTION
+    This is the "get_next" function of the Range sequence interface
+    implementation for a SEL_ARG* tree.
+
+  IMPLEMENTATION
+    The traversal also updates these param members:
+      - is_ror_scan
+      - range_count
+      - max_key_part
+
+  RETURN
+    FALSE  Ok
+    TRUE   No more ranges in the sequence
+*/
+
+#if (_MSC_FULL_VER == 160030319)
+/*
+   Workaround for a Visual Studio 2010 RTM compiler backend bug: the function 
+   enters an infinite loop.
+ */
+#pragma optimize("g", off)
+#endif
+
+bool sel_arg_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+{
+  SEL_ARG *key_tree;
+  SEL_ARG_RANGE_SEQ *seq= (SEL_ARG_RANGE_SEQ*)rseq;
+  if (seq->at_start)
+  {
+    key_tree= seq->start;
+    seq->at_start= FALSE;
+    goto walk_up_n_right;
+  }
+
+  key_tree= seq->stack[seq->i].key_tree;
+  /* Ok, we're at some "full tuple" position in the tree */
+ 
+  /* Step down if we can */
+  if (key_tree->next && key_tree->next != &null_element)
+  {
+    // Step down (update the tuple, we'll step right and stay there)
+    seq->i--; /* pop: step_down_to() re-fills this same stack slot */
+    step_down_to(seq, key_tree->next);
+    key_tree= key_tree->next;
+    seq->param->is_ror_scan= FALSE;
+    goto walk_right_n_up;
+  }
+
+  /* Ok, can't step down, walk left until we can step down */
+  while (1)
+  {
+    if (seq->i == 1) // can't step left: stack[0] is only the base entry
+      return 1;
+    /* Step left */
+    seq->i--;
+    key_tree= seq->stack[seq->i].key_tree;
+
+    /* Step down if we can */
+    if (key_tree->next && key_tree->next != &null_element)
+    {
+      // Step down; update the tuple
+      seq->i--; /* pop: step_down_to() re-fills this same stack slot */
+      step_down_to(seq, key_tree->next);
+      key_tree= key_tree->next;
+      break;
+    }
+  }
+
+  /*
+    Ok, we've stepped down from the path to previous tuple.
+    Walk right-up while we can
+  */
+walk_right_n_up:
+  while (key_tree->next_key_part && key_tree->next_key_part != &null_element && 
+         key_tree->next_key_part->part == key_tree->part + 1 &&
+         key_tree->next_key_part->type == SEL_ARG::KEY_RANGE)
+  {
+    {
+      RANGE_SEQ_ENTRY *cur= &seq->stack[seq->i];
+      uint min_key_length= cur->min_key - seq->param->min_key;
+      uint max_key_length= cur->max_key - seq->param->max_key;
+      uint len= cur->min_key - cur[-1].min_key;
+      if (!(min_key_length == max_key_length &&
+            !memcmp(cur[-1].min_key, cur[-1].max_key, len) &&
+            !key_tree->min_flag && !key_tree->max_flag))
+      {
+        seq->param->is_ror_scan= FALSE;
+        if (!key_tree->min_flag)
+          cur->min_key_parts += 
+            key_tree->next_key_part->store_min_key(seq->param->key[seq->keyno],
+                                                   &cur->min_key,
+                                                   &cur->min_key_flag);
+        if (!key_tree->max_flag)
+          cur->max_key_parts += 
+            key_tree->next_key_part->store_max_key(seq->param->key[seq->keyno],
+                                                   &cur->max_key,
+                                                   &cur->max_key_flag);
+        break;
+      }
+    }
+  
+    /*
+      Ok, current atomic interval is in form "t.field=const" and there is
+      next_key_part interval. Step right, and walk up from there.
+    */
+    key_tree= key_tree->next_key_part;
+
+walk_up_n_right:
+    while (key_tree->prev && key_tree->prev != &null_element)
+    {
+      /* Step up */
+      key_tree= key_tree->prev;
+    }
+    step_down_to(seq, key_tree);
+  }
+
+  /* Ok got a tuple */
+  RANGE_SEQ_ENTRY *cur= &seq->stack[seq->i];
+  uint min_key_length= cur->min_key - seq->param->min_key;
+  /* range->ptr carries the number of the last key part of this tuple */
+  range->ptr= (char*)(intptr)(key_tree->part);
+  if (cur->min_key_flag & GEOM_FLAG)
+  {
+    range->range_flag= cur->min_key_flag;
+    /* NOTE(review): keypart_map and end_key are left unset here - confirm */
+    /* Here minimum contains also function code bits, and maximum is +inf */
+    range->start_key.key=    seq->param->min_key;
+    range->start_key.length= min_key_length;
+    range->start_key.flag=  (ha_rkey_function) (cur->min_key_flag ^ GEOM_FLAG);
+  }
+  else
+  {
+    range->range_flag= cur->min_key_flag | cur->max_key_flag;
+    
+    range->start_key.key=    seq->param->min_key;
+    range->start_key.length= cur->min_key - seq->param->min_key;
+    range->start_key.keypart_map= make_prev_keypart_map(cur->min_key_parts);
+    range->start_key.flag= (cur->min_key_flag & NEAR_MIN ? HA_READ_AFTER_KEY : 
+                                                           HA_READ_KEY_EXACT);
+
+    range->end_key.key=    seq->param->max_key;
+    range->end_key.length= cur->max_key - seq->param->max_key;
+    range->end_key.flag= (cur->max_key_flag & NEAR_MAX ? HA_READ_BEFORE_KEY : 
+                                                         HA_READ_AFTER_KEY);
+    range->end_key.keypart_map= make_prev_keypart_map(cur->max_key_parts);
+
+    if (!(cur->min_key_flag & ~NULL_RANGE) && !cur->max_key_flag &&
+        (uint)key_tree->part+1 == seq->param->table->key_info[seq->real_keyno].key_parts &&
+        (seq->param->table->key_info[seq->real_keyno].flags & HA_NOSAME) &&
+        range->start_key.length == range->end_key.length &&
+        !memcmp(seq->param->min_key,seq->param->max_key,range->start_key.length))
+      range->range_flag= UNIQUE_RANGE | (cur->min_key_flag & NULL_RANGE);
+      
+    if (seq->param->is_ror_scan)
+    {
+      /*
+        If we get here, the condition on the key was converted to form
+        "(keyXpart1 = c1) AND ... AND (keyXpart{key_tree->part - 1} = cN) AND
+          somecond(keyXpart{key_tree->part})"
+        Check if
+          somecond is "keyXpart{key_tree->part} = const" and
+          uncovered "tail" of KeyX parts is either empty or is identical to
+          first members of clustered primary key.
+      */
+      if (!(!(cur->min_key_flag & ~NULL_RANGE) && !cur->max_key_flag &&
+            (range->start_key.length == range->end_key.length) &&
+            !memcmp(range->start_key.key, range->end_key.key, range->start_key.length) &&
+            is_key_scan_ror(seq->param, seq->real_keyno, key_tree->part + 1)))
+        seq->param->is_ror_scan= FALSE;
+    }
+  }
+  seq->param->range_count++;
+  seq->param->max_key_part=max(seq->param->max_key_part,key_tree->part);
+  return 0;
+}
+
+#if (_MSC_FULL_VER == 160030319)
+/* VS2010 compiler bug workaround: restore the optimization setting */
+#pragma optimize("g", on)
+#endif
+
+
+/****************************************************************************
+  MRR Range Sequence Interface implementation that walks array<QUICK_RANGE>
+ ****************************************************************************/
+
+/*
+  Range sequence interface implementation for array<QUICK_RANGE>: initialize
+  
+  SYNOPSIS
+    quick_range_seq_init()
+      init_param  Caller-opaque parameter: QUICK_RANGE_SELECT* pointer
+      n_ranges    Number of ranges in the sequence (ignored)
+      flags       MRR flags (currently not used) 
+
+  RETURN
+    Opaque value to be passed to quick_range_seq_next
+*/
+
+range_seq_t quick_range_seq_init(void *init_param, uint n_ranges, uint flags)
+{
+  QUICK_RANGE_SELECT *quick= (QUICK_RANGE_SELECT*)init_param;
+  /* Point the traversal cursor at the start of the quick select's ranges */
+  quick->qr_traversal_ctx.first=  (QUICK_RANGE**)quick->ranges.buffer;
+  quick->qr_traversal_ctx.cur=    (QUICK_RANGE**)quick->ranges.buffer;
+  quick->qr_traversal_ctx.last=   quick->qr_traversal_ctx.cur + 
+                                  quick->ranges.elements;
+  return &quick->qr_traversal_ctx;
+}
+
+
+/*
+  Range sequence interface implementation for array<QUICK_RANGE>: get next
+  
+  SYNOPSIS
+    quick_range_seq_next()
+      rseq        Value returned from quick_range_seq_init
+      range  OUT  Store information about the range here
+
+  RETURN
+    0  Ok
+    1  No more ranges in the sequence
+*/
+
+bool quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+{
+  QUICK_RANGE_SEQ_CTX *ctx= (QUICK_RANGE_SEQ_CTX*)rseq;
+
+  if (ctx->cur == ctx->last)
+    return 1; /* no more ranges */
+  /* Copy both endpoints and the flag of the current range into *range */
+  QUICK_RANGE *cur= *(ctx->cur);
+  cur->make_min_endpoint(&range->start_key);
+  cur->make_max_endpoint(&range->end_key);
+  range->range_flag= cur->flag;
+  ctx->cur++; /* the following call will return the next range */
+  return 0;
+}
+
+
diff --git a/sql/opt_subselect.cc b/sql/opt_subselect.cc
new file mode 100644
index 00000000000..319704a6e8f
--- /dev/null
+++ b/sql/opt_subselect.cc
@@ -0,0 +1,3502 @@
+/**
+  @file
+
+  @brief
+    Subquery optimization code here.
+
+*/
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation				// gcc: Class implementation
+#endif
+
+#include "mysql_priv.h"
+#include "sql_select.h"
+#include "opt_subselect.h"
+
+#include <my_bit.h>
+
+// Forward declarations (the ones marked static have file-local linkage):
+static
+bool subquery_types_allow_materialization(Item_in_subselect *in_subs);
+static bool replace_where_subcondition(JOIN *join, Item **expr, 
+                                       Item *old_cond, Item *new_cond,
+                                       bool do_fix_fields);
+static int subq_sj_candidate_cmp(Item_in_subselect* const *el1, 
+                                 Item_in_subselect* const *el2);
+static bool convert_subq_to_sj(JOIN *parent_join, Item_in_subselect *subq_pred);
+static TABLE_LIST *alloc_join_nest(THD *thd);
+static 
+void fix_list_after_tbl_changes(SELECT_LEX *new_parent, List<TABLE_LIST> *tlist);
+static uint get_tmp_table_rec_length(List<Item> &items);
+bool find_eq_ref_candidate(TABLE *table, table_map sj_inner_tables);
+static SJ_MATERIALIZATION_INFO *
+at_sjmat_pos(const JOIN *join, table_map remaining_tables, const JOIN_TAB *tab,
+             uint idx, bool *loose_scan);
+void best_access_path(JOIN *join, JOIN_TAB *s, 
+                             table_map remaining_tables, uint idx, 
+                             bool disable_jbuf, double record_count,
+                             POSITION *pos, POSITION *loose_scan_pos);
+
+static Item *create_subq_in_equalities(THD *thd, SJ_MATERIALIZATION_INFO *sjm, 
+                                Item_in_subselect *subq_pred);
+static void remove_sj_conds(Item **tree);
+static bool is_cond_sj_in_equality(Item *item);
+static bool sj_table_is_included(JOIN *join, JOIN_TAB *join_tab);
+static Item *remove_additional_cond(Item* conds);
+static void remove_subq_pushed_predicates(JOIN *join, Item **where);
+
+
+/*
+  Check if we need JOIN::prepare()-phase subquery rewrites and if yes, do them
+
+  DESCRIPTION
+    Check if we need to do
+     - subquery->semi-join rewrite
+     - if the subquery can be handled with materialization
+     - 'substitution' rewrite for table-less subqueries like "(select 1)"
+
+    and mark appropriately
+
+  RETURN
+     0    - OK
+    -1, 1 - Some sort of query error (1 if select_transformer() gave RES_ERROR)
+*/
+
+int check_and_do_in_subquery_rewrites(JOIN *join)
+{
+  THD *thd=join->thd;
+  st_select_lex *select_lex= join->select_lex;
+  DBUG_ENTER("check_and_do_in_subquery_rewrites");
+  /*
+    If 
+      1) this join is inside a subquery (of any type except FROM-clause 
+         subquery) and
+      2) we aren't just normalizing a VIEW
+
+    Then perform early unconditional subquery transformations:
+     - Convert subquery predicate into semi-join, or
+     - Mark the subquery for execution using materialization, or
+     - Perform IN->EXISTS transformation, or
+     - Perform more/less ALL/ANY -> MIN/MAX rewrite
+     - Substitute trivial scalar-context subquery with its value
+
+    TODO: for PS, make the whole block execute only on the first execution
+  */
+  Item_subselect *subselect;
+  if (!(thd->lex->context_analysis_only & CONTEXT_ANALYSIS_ONLY_VIEW) && // (2)
+    (subselect= select_lex->master_unit()->item))      // (1)
+  {
+    Item_in_subselect *in_subs= NULL;
+    if (subselect->substype() == Item_subselect::IN_SUBS)
+      in_subs= (Item_in_subselect*)subselect;
+
+    /* Resolve expressions and perform semantic analysis for IN query */
+    if (in_subs != NULL)
+      /*
+        TODO: Add the condition below to this if statement when we have proper
+        support for is_correlated handling for materialized semijoins.
+        If we were to add this condition now, the fix_fields() call in
+        convert_subq_to_sj() would force the flag is_correlated to be set
+        erroneously for prepared queries.
+
+        thd->stmt_arena->state != Query_arena::PREPARED)
+      */
+    {
+      /*
+        Check if the left and right expressions have the same # of
+        columns, i.e. we don't have a case like 
+          (oe1, oe2) IN (SELECT ie1, ie2, ie3 ...)
+
+        TODO why do we have this duplicated in IN->EXISTS transformers?
+        psergey-todo: fix these: grep for duplicated_subselect_card_check
+      */
+      if (select_lex->item_list.elements != in_subs->left_expr->cols())
+      {
+        my_error(ER_OPERAND_COLUMNS, MYF(0), in_subs->left_expr->cols());
+        DBUG_RETURN(-1);
+      }
+
+      SELECT_LEX *current= thd->lex->current_select;
+      thd->lex->current_select= current->return_after_parsing();
+      char const *save_where= thd->where;
+      thd->where= "IN/ALL/ANY subquery";
+        
+      bool failure= !in_subs->left_expr->fixed &&
+                     in_subs->left_expr->fix_fields(thd, &in_subs->left_expr);
+      thd->lex->current_select= current;
+      thd->where= save_where;
+      if (failure)
+        DBUG_RETURN(-1); /* purecov: deadcode */
+    }
+    DBUG_PRINT("info", ("Checking if subq can be converted to semi-join"));
+    /*
+      Check if we're in subquery that is a candidate for flattening into a
+      semi-join (which is done in flatten_subqueries()). The
+      requirements are:
+        1. Subquery predicate is an IN/=ANY subq predicate
+        2. Subquery is a single SELECT (not a UNION)
+        3. Subquery does not have GROUP BY or ORDER BY
+        4. Subquery does not use aggregate functions or HAVING
+        5. Subquery predicate is at the AND-top-level of ON/WHERE clause
+        6. We are not in a subquery of a single table UPDATE/DELETE that 
+             doesn't have a JOIN (TODO: We should handle this at some
+             point by switching to multi-table UPDATE/DELETE)
+        7. We're not in a table-less subquery like "SELECT 1"
+        8. No execution method was already chosen (by a prepared statement)
+        9. Parent select is not a table-less select
+        10. Neither parent nor child select have STRAIGHT_JOIN option.
+    */
+    if (optimizer_flag(thd, OPTIMIZER_SWITCH_SEMIJOIN) &&
+        in_subs &&                                                    // 1
+        !select_lex->is_part_of_union() &&                            // 2
+        !select_lex->group_list.elements && !join->order &&           // 3
+        !join->having && !select_lex->with_sum_func &&                // 4
+        thd->thd_marker.emb_on_expr_nest &&                           // 5
+        select_lex->outer_select()->join &&                           // 6
+        select_lex->master_unit()->first_select()->leaf_tables &&     // 7
+        in_subs->exec_method == Item_in_subselect::NOT_TRANSFORMED && // 8
+        select_lex->outer_select()->leaf_tables &&                    // 9
+        !((join->select_options |                                     // 10
+           select_lex->outer_select()->join->select_options)          // 10
+          & SELECT_STRAIGHT_JOIN))                                    // 10
+    {
+      DBUG_PRINT("info", ("Subquery is semi-join conversion candidate"));
+      /* Return value is ignored; the call records its verdict in in_subs */
+      (void)subquery_types_allow_materialization(in_subs);
+
+      in_subs->emb_on_expr_nest= thd->thd_marker.emb_on_expr_nest;
+
+      /* Register the subquery for further processing in flatten_subqueries() */
+      select_lex->
+        outer_select()->join->sj_subselects.append(thd->mem_root, in_subs);
+      in_subs->expr_join_nest= thd->thd_marker.emb_on_expr_nest;
+    }
+    else
+    {
+      DBUG_PRINT("info", ("Subquery can't be converted to semi-join"));
+      /*
+        Check if the subquery predicate can be executed via materialization.
+        The required conditions are:
+        1. Subquery predicate is an IN/=ANY subq predicate
+        2. Subquery is a single SELECT (not a UNION)
+        3. Subquery is not a table-less query. In this case there is no
+           point in materializing.
+          3A The upper query is not a table-less SELECT ... FROM DUAL. We
+             can't do materialization for SELECT .. FROM DUAL because it
+             does not call setup_subquery_materialization(). We could make 
+             SELECT ... FROM DUAL call that function but that doesn't seem
+             to be the case that is worth handling.
+        4. Either the subquery predicate is a top-level predicate, or at
+           least one partial match strategy is enabled. If no partial match
+           strategy is enabled, then materialization cannot be used for
+           non-top-level queries because it cannot handle NULLs correctly.
+        5. Subquery is non-correlated
+           TODO:
+           This is an overly restrictive condition. It can be extended to:
+           (Subquery is non-correlated ||
+            Subquery is correlated to any query outer to IN predicate ||
+            (Subquery is correlated to the immediate outer query &&
+             Subquery !contains {GROUP BY, ORDER BY [LIMIT],
+             aggregate functions}) && subquery predicate is not under "NOT IN"))
+        6. No execution method was already chosen (by a prepared statement).
+
+        (*) The subquery must be part of a SELECT statement. The current
+             condition also excludes multi-table update statements.
+
+        Determine whether we will perform subquery materialization before
+        calling the IN=>EXISTS transformation, so that we know whether to
+        perform the whole transformation or only that part of it which wraps
+        Item_in_subselect in an Item_in_optimizer.
+      */
+      if (optimizer_flag(thd, OPTIMIZER_SWITCH_MATERIALIZATION)  && 
+          in_subs  &&                                                   // 1
+          !select_lex->is_part_of_union() &&                            // 2
+          select_lex->master_unit()->first_select()->leaf_tables &&     // 3
+          thd->lex->sql_command == SQLCOM_SELECT &&                     // *
+          select_lex->outer_select()->leaf_tables &&                    // 3A
+          subquery_types_allow_materialization(in_subs) &&
+          // psergey-todo: duplicated_subselect_card_check: where it's done?
+          (in_subs->is_top_level_item() ||
+           optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE) ||
+           optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN)) &&//4
+          !in_subs->is_correlated &&                                  // 5
+          in_subs->exec_method == Item_in_subselect::NOT_TRANSFORMED) // 6
+      {
+          in_subs->exec_method= Item_in_subselect::MATERIALIZATION;
+      }
+
+      Item_subselect::trans_res trans_res;
+      if ((trans_res= subselect->select_transformer(join)) !=
+          Item_subselect::RES_OK)
+      {
+        DBUG_RETURN((trans_res == Item_subselect::RES_ERROR)); // 1 on error, else 0
+      }
+    }
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Check if subquery's compared types allow materialization.
+
+  @param in_subs Subquery predicate, updated as follows:
+    types_allow_materialization TRUE if subquery materialization is allowed.
+    sjm_scan_allowed            If types_allow_materialization is TRUE,
+                                indicates whether it is possible to use subquery
+                                materialization and scan the materialized table.
+
+  @retval TRUE   If subquery types allow materialization.
+  @retval FALSE  Otherwise.
+
+  @details
+    This is a temporary fix for BUG#36752.
+    
+    There are two subquery materialization strategies:
+
+    1. Materialize and do index lookups in the materialized table. See 
+       BUG#36752 for description of restrictions we need to put on the
+       compared expressions.
+
+    2. Materialize and then do a full scan of the materialized table. At the
+       moment, this strategy's applicability criteria are even stricter than
+       in #1.
+
+       This is so because of the following: consider an uncorrelated subquery
+       
+       ...WHERE (ot1.col1, ot2.col2 ...) IN (SELECT ie1,ie2,... FROM it1 ...)
+
+       and a join order that could be used to do sjm-materialization: 
+          
+          SJM-Scan(it1, it1), ot1, ot2
+       
+       IN-equalities will be parts of conditions attached to the outer tables:
+
+         ot1:  ot1.col1 = ie1 AND ... (C1)
+         ot2:  ot2.col2 = ie2 AND ... (C2)
+       
+       besides those there may be additional references to ie1 and ie2
+       generated by equality propagation. The problem with evaluating C1 and
+       C2 is that ie{1,2} refer to subquery tables' columns, while we only have 
+       current value of materialization temptable. Our solution is to 
+        * require that all ie{N} are table column references. This allows 
+          to copy the values of materialization temptable columns to the
+          original table's columns (see setup_sj_materialization for more
+          details)
+        * require that compared columns have exactly the same type. This is
+          a temporary measure to avoid BUG#36752-type problems.
+*/
+
+static 
+bool subquery_types_allow_materialization(Item_in_subselect *in_subs)
+{
+  DBUG_ENTER("subquery_types_allow_materialization");
+
+  DBUG_ASSERT(in_subs->left_expr->fixed);
+
+  List_iterator<Item> it(in_subs->unit->first_select()->item_list);
+  uint elements= in_subs->unit->first_select()->item_list.elements;
+
+  in_subs->types_allow_materialization= FALSE;  // Assign default values
+  in_subs->sjm_scan_allowed= FALSE;
+  
+  bool all_are_fields= TRUE;
+  for (uint i= 0; i < elements; i++)
+  {
+    Item *outer= in_subs->left_expr->element_index(i);
+    Item *inner= it++;
+    all_are_fields &= (outer->real_item()->type() == Item::FIELD_ITEM && 
+                       inner->real_item()->type() == Item::FIELD_ITEM);
+    if (outer->result_type() != inner->result_type())
+      DBUG_RETURN(FALSE);
+    switch (outer->result_type()) {
+    case STRING_RESULT:
+      if (outer->is_datetime() != inner->is_datetime())
+        DBUG_RETURN(FALSE);
+
+      if (!(outer->collation.collation == inner->collation.collation 
+          /*&& outer->max_length <= inner->max_length */))
+        DBUG_RETURN(FALSE);
+    /*case INT_RESULT:
+      if (!(outer->unsigned_flag ^ inner->unsigned_flag))
+        DBUG_RETURN(FALSE); */
+    default:
+      ;/* suitable for materialization (STRING_RESULT falls through to here) */
+    }
+
+    // Materialization does not work with BLOB or GEOMETRY columns
+    if (inner->field_type() == MYSQL_TYPE_BLOB || 
+	inner->field_type() == MYSQL_TYPE_GEOMETRY)
+        DBUG_RETURN(FALSE);
+  }
+    
+  in_subs->types_allow_materialization= TRUE;
+  in_subs->sjm_scan_allowed= all_are_fields;
+  DBUG_PRINT("info",("subquery_types_allow_materialization: ok, allowed"));
+  DBUG_RETURN(TRUE);
+}
+
+
+/*
+  Convert semi-join subquery predicates into semi-join join nests
+
+  SYNOPSIS
+    convert_join_subqueries_to_semijoins()
+ 
+  DESCRIPTION
+
+    Convert candidate subquery predicates into semi-join join nests. This 
+    transformation is performed once in query lifetime and is irreversible.
+    
+    Conversion of one subquery predicate
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    We start with a join that has a semi-join subquery:
+
+      SELECT ...
+      FROM ot, ...
+      WHERE oe IN (SELECT ie FROM it1 ... itN WHERE subq_where) AND outer_where
+
+    and convert it into a semi-join nest:
+
+      SELECT ...
+      FROM ot SEMI JOIN (it1 ... itN), ...
+      WHERE outer_where AND subq_where AND oe=ie
+
+    that is, in order to do the conversion, we need to 
+
+     * Create the "SEMI JOIN (it1 .. itN)" part and add it into the parent
+       query's FROM structure.
+     * Add "AND subq_where AND oe=ie" into parent query's WHERE (or ON if
+       the subquery predicate was in an ON expression)
+     * Remove the subquery predicate from the parent query's WHERE
+
+    Considerations when converting many predicates
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    A join may have at most MAX_TABLES tables. This may prevent us from
+    flattening all subqueries when the total number of tables in parent and
+    child selects exceeds MAX_TABLES.
+    We deal with this problem by flattening children's subqueries first and
+    then using a heuristic rule to determine each subquery predicate's
+    "priority".
+
+  RETURN 
+    FALSE  OK
+    TRUE   Error
+*/
+
+bool convert_join_subqueries_to_semijoins(JOIN *join)
+{
+  Query_arena *arena, backup;
+  Item_in_subselect **in_subq;
+  Item_in_subselect **in_subq_end, **convert_end;
+  uint tables_total;  /* #tables the parent join has after each flattening */
+  THD *thd= join->thd;
+  DBUG_ENTER("convert_join_subqueries_to_semijoins");
+
+  if (join->sj_subselects.elements() == 0)
+    DBUG_RETURN(FALSE);
+
+  /* First, convert child join's subqueries. We proceed bottom-up here */
+  for (in_subq= join->sj_subselects.front(), 
+       in_subq_end= join->sj_subselects.back(); 
+       in_subq != in_subq_end; 
+       in_subq++)
+  {
+    st_select_lex *child_select= (*in_subq)->get_select_lex();
+    JOIN *child_join= child_select->join;
+    child_join->outer_tables = child_join->tables;
+
+    /*
+      child_select->where contains only the WHERE predicate of the
+      subquery itself here. We may be selecting from a VIEW, which has its
+      own predicate. The combined predicates are available in child_join->conds,
+      which was built by setup_conds() doing prepare_where() for all views.
+    */
+    child_select->where= child_join->conds;
+
+    if (convert_join_subqueries_to_semijoins(child_join))
+      DBUG_RETURN(TRUE);     /* this invocation has not activated an arena yet */
+    (*in_subq)->sj_convert_priority= 
+      (*in_subq)->is_correlated * MAX_TABLES + child_join->outer_tables;
+  }
+  
+  // Temporary measure: disable semi-joins when they are together with outer
+  // joins.
+  for (TABLE_LIST *tbl= join->select_lex->leaf_tables; tbl; tbl=tbl->next_leaf)
+  {
+    TABLE_LIST *embedding= tbl->embedding;
+    if (tbl->on_expr || (tbl->embedding && !(embedding->sj_on_expr && 
+                                            !embedding->embedding)))
+    {
+      in_subq= join->sj_subselects.front();
+      arena= thd->activate_stmt_arena_if_needed(&backup);
+      goto skip_conversion;
+    }
+  }
+
+  /* 
+    2. Pick which subqueries to convert:
+      sort the subquery array
+      - prefer correlated subqueries over uncorrelated;
+      - prefer subqueries that have greater number of outer tables;
+  */
+  join->sj_subselects.sort(subq_sj_candidate_cmp);
+  /* Replace all subqueries to be flattened with Item_int(1). The table count
+     is accumulated so that exactly the replaced prefix fits and is converted */
+  arena= thd->activate_stmt_arena_if_needed(&backup);
+  tables_total= join->tables;
+  for (in_subq= join->sj_subselects.front(); in_subq != in_subq_end; in_subq++)
+  {
+    uint subq_tables= (*in_subq)->unit->first_select()->join->tables;
+    if (tables_total + subq_tables >= MAX_TABLES)
+      break;                 /* this one and the rest are handled in step 3 */
+    tables_total+= subq_tables;
+    Item **tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)?
+                   &join->conds : &((*in_subq)->emb_on_expr_nest->on_expr);
+    if (replace_where_subcondition(join, tree, *in_subq, new Item_int(1),
+                                   FALSE))
+      goto restore_arena_and_fail; /* purecov: inspected */
+  }
+  convert_end= in_subq;
+  for (in_subq= join->sj_subselects.front(); in_subq != convert_end; in_subq++)
+    if (convert_subq_to_sj(join, *in_subq))
+      goto restore_arena_and_fail;
+skip_conversion:
+  /* 
+    3. Finalize (perform IN->EXISTS rewrite) the subqueries that we didn't
+    convert:
+  */
+  for (; in_subq!= in_subq_end; in_subq++)
+  {
+    JOIN *child_join= (*in_subq)->unit->first_select()->join;
+    Item_subselect::trans_res res;
+    (*in_subq)->changed= 0;
+    (*in_subq)->fixed= 0;
+
+    SELECT_LEX *save_select_lex= thd->lex->current_select;
+    thd->lex->current_select= (*in_subq)->unit->first_select();
+
+    res= (*in_subq)->select_transformer(child_join);
+
+    thd->lex->current_select= save_select_lex;
+
+    if (res == Item_subselect::RES_ERROR)
+      goto restore_arena_and_fail;
+
+    (*in_subq)->changed= 1;
+    (*in_subq)->fixed= 1;
+
+    Item *substitute= (*in_subq)->substitution;
+    bool do_fix_fields= !(*in_subq)->substitution->fixed;
+    Item **tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)?
+                   &join->conds : &((*in_subq)->emb_on_expr_nest->on_expr);
+    if (replace_where_subcondition(join, tree, *in_subq, substitute, 
+                                   do_fix_fields))
+      goto restore_arena_and_fail;
+    (*in_subq)->substitution= NULL;
+    if (!thd->stmt_arena->is_conventional())
+    {
+      tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)?
+             &join->select_lex->prep_where : 
+             &((*in_subq)->emb_on_expr_nest->prep_on_expr);
+      if (replace_where_subcondition(join, tree, *in_subq, substitute, 
+                                     FALSE))
+        goto restore_arena_and_fail;
+    }
+  }
+
+  if (arena)
+    thd->restore_active_arena(arena, &backup);
+  join->sj_subselects.clear();
+  DBUG_RETURN(FALSE);
+
+restore_arena_and_fail:      /* error exit: do not leave the arena activated */
+  if (arena)
+    thd->restore_active_arena(arena, &backup);
+  DBUG_RETURN(TRUE);
+}
+
+/**
+   @brief Replaces an expression destructively inside the expression tree of
+   the WHERE clause.
+
+   @note Because of current requirements for semijoin flattening, we do not
+   need to recurse here, hence this function will only examine the top-level
+   AND conditions. (see JOIN::prepare, comment starting with "Check if the 
+   subquery predicate can be executed via materialization").
+   
+   @param join The top-level query.
+   @param expr Root of the condition tree (a WHERE or ON expression) in which
+   the replacement is made.
+   @param old_cond The expression to be replaced.
+   @param new_cond The expression to be substituted.
+   @param do_fix_fields If true, Item::fix_fields(THD*, Item**) is called for
+   the new expression.
+   @return <code>true</code> if there was an error (old_cond was not found or
+   fix_fields() failed), <code>false</code> if successful.
+*/
+static bool replace_where_subcondition(JOIN *join, Item **expr, 
+                                       Item *old_cond, Item *new_cond,
+                                       bool do_fix_fields)
+{
+  if (*expr == old_cond)
+  {
+    *expr= new_cond;
+    /* fix_fields() returns TRUE on failure: pass that on to the caller */
+    return do_fix_fields && new_cond->fix_fields(join->thd, expr);
+  }
+  
+  /* Not the root: look among the direct arguments of a root Item_cond */
+  if (*expr && (*expr)->type() == Item::COND_ITEM) 
+  {
+    List_iterator<Item> li(*((Item_cond*)(*expr))->argument_list());
+    Item *item;
+    while ((item= li++))
+    {
+      if (item == old_cond) 
+      {
+        li.replace(new_cond);
+        return do_fix_fields && new_cond->fix_fields(join->thd, li.ref());
+      }
+    }
+  }
+  // If we came here it means there were an error during prerequisites check
+  // (old_cond is neither *expr nor a direct argument of it, or *expr is NULL).
+  DBUG_ASSERT(0);
+  return TRUE;
+}
+
+static int subq_sj_candidate_cmp(Item_in_subselect* const *el1, 
+                                 Item_in_subselect* const *el2)
+{ /* Inverted sign: the element with the larger priority compares as smaller */
+  if ((*el1)->sj_convert_priority == (*el2)->sj_convert_priority) return 0;
+  return ((*el1)->sj_convert_priority < (*el2)->sj_convert_priority) ? 1 : -1;
+}
+
+
+/*
+  Convert a subquery predicate into a TABLE_LIST semi-join nest
+
+  SYNOPSIS
+    convert_subq_to_sj()
+       parent_join  Parent join, the one that has subq_pred in its WHERE/ON 
+                    clause
+       subq_pred    Subquery predicate to be converted
+  
+  DESCRIPTION
+    Convert a subquery predicate into a TABLE_LIST semi-join nest. All the 
+    prerequisites are already checked, so the conversion is always successful.
+
+    Prepared Statements: the transformation is permanent:
+     - Changes in TABLE_LIST structures are naturally permanent
+     - Item tree changes are performed on statement MEM_ROOT:
+        = we activate statement MEM_ROOT 
+        = this function is called before the first fix_prepare_information
+          call.
+
+    This is intended because the criteria for subquery-to-sj conversion remain
+    constant for the lifetime of the Prepared Statement.
+
+  RETURN
+    FALSE  OK
+    TRUE   Out of memory error
+*/
+
+static bool convert_subq_to_sj(JOIN *parent_join, Item_in_subselect *subq_pred)
+{
+  SELECT_LEX *parent_lex= parent_join->select_lex;
+  TABLE_LIST *emb_tbl_nest= NULL;
+  List<TABLE_LIST> *emb_join_list= &parent_lex->top_join_list;
+  THD *thd= parent_join->thd;
+  DBUG_ENTER("convert_subq_to_sj");
+
+  /*
+    1. Find out where to put the predicate into.
+     Note: for "t1 LEFT JOIN t2" this will be t2, a leaf.
+  */
+  if ((void*)subq_pred->expr_join_nest != (void*)1)
+  {
+    if (subq_pred->expr_join_nest->nested_join)
+    {
+      /*
+        We're dealing with
+
+          ... [LEFT] JOIN  ( ... ) ON (subquery AND whatever) ...
+
+        The sj-nest will be inserted into the brackets nest.
+      */
+      emb_tbl_nest=  subq_pred->expr_join_nest;
+      emb_join_list= &emb_tbl_nest->nested_join->join_list;
+    }
+    else if (!subq_pred->expr_join_nest->outer_join)
+    {
+      /*
+        We're dealing with
+
+          ... INNER JOIN tblX ON (subquery AND whatever) ...
+
+        The sj-nest will be tblX's "sibling", i.e. another child of its
+        parent. This is ok because tblX is joined as an inner join.
+      */
+      emb_tbl_nest= subq_pred->expr_join_nest->embedding;
+      if (emb_tbl_nest)
+        emb_join_list= &emb_tbl_nest->nested_join->join_list;
+    }
+    else if (!subq_pred->expr_join_nest->nested_join)
+    {
+      TABLE_LIST *outer_tbl= subq_pred->expr_join_nest;      
+      TABLE_LIST *wrap_nest;
+      /*
+        We're dealing with
+
+          ... LEFT JOIN tbl ON (on_expr AND subq_pred) ...
+
+        we'll need to convert it into:
+
+          ... LEFT JOIN ( tbl SJ (subq_tables) ) ON (on_expr AND subq_pred) ...
+                        |                      |
+                        |<----- wrap_nest ---->|
+        
+        Q:  other subqueries may be pointing to this element. What to do?
+        A1: simple solution: copy *subq_pred->expr_join_nest= *parent_nest.
+            But we'll need to fix other pointers.
+        A2: Another way: have TABLE_LIST::next_ptr so the following
+            subqueries know the table has been nested.
+        A3: changes in the TABLE_LIST::outer_join will make everything work
+            automatically.
+      */
+      if (!(wrap_nest= alloc_join_nest(parent_join->thd)))
+      {
+        DBUG_RETURN(TRUE);
+      }
+      wrap_nest->embedding= outer_tbl->embedding;
+      wrap_nest->join_list= outer_tbl->join_list;
+      wrap_nest->alias= (char*) "(sj-wrap)";
+
+      wrap_nest->nested_join->join_list.empty();
+      wrap_nest->nested_join->join_list.push_back(outer_tbl);
+
+      outer_tbl->embedding= wrap_nest;
+      outer_tbl->join_list= &wrap_nest->nested_join->join_list;
+
+      /*
+        wrap_nest will take place of outer_tbl, so move the outer join flag
+        and on_expr
+      */
+      wrap_nest->outer_join= outer_tbl->outer_join;
+      outer_tbl->outer_join= 0;
+
+      wrap_nest->on_expr= outer_tbl->on_expr;
+      outer_tbl->on_expr= NULL;
+
+      List_iterator<TABLE_LIST> li(*wrap_nest->join_list);
+      TABLE_LIST *tbl;
+      while ((tbl= li++))
+      {
+        if (tbl == outer_tbl)
+        {
+          li.replace(wrap_nest);
+          break;
+        }
+      }
+      /*
+        Ok now wrap_nest 'contains' outer_tbl and we're ready to add the 
+        semi-join nest into it
+      */
+      emb_join_list= &wrap_nest->nested_join->join_list;
+      emb_tbl_nest=  wrap_nest;
+    }
+  }
+
+  TABLE_LIST *sj_nest;
+  NESTED_JOIN *nested_join;
+  if (!(sj_nest= alloc_join_nest(parent_join->thd)))
+  {
+    DBUG_RETURN(TRUE);
+  }
+  nested_join= sj_nest->nested_join;
+
+  sj_nest->join_list= emb_join_list;
+  sj_nest->embedding= emb_tbl_nest;
+  sj_nest->alias= (char*) "(sj-nest)";
+  sj_nest->sj_subq_pred= subq_pred;
+  /* Nests do not participate in those 'chains', so: */
+  /* sj_nest->next_leaf= sj_nest->next_local= sj_nest->next_global == NULL*/
+  emb_join_list->push_back(sj_nest);
+
+  /* 
+    nested_join->used_tables and nested_join->not_null_tables are
+    initialized in simplify_joins().
+  */
+  
+  /* 
+    2. Walk through subquery's top list and set 'embedding' to point to the
+       sj-nest.
+  */
+  st_select_lex *subq_lex= subq_pred->unit->first_select();
+  nested_join->join_list.empty();
+  List_iterator_fast<TABLE_LIST> li(subq_lex->top_join_list);
+  TABLE_LIST *tl, *last_leaf;
+  while ((tl= li++))
+  {
+    tl->embedding= sj_nest;
+    tl->join_list= &nested_join->join_list;
+    nested_join->join_list.push_back(tl);
+  }
+  
+  /*
+    Reconnect the next_leaf chain.
+    TODO: Do we have to put subquery's tables at the end of the chain?
+          Inserting them at the beginning would be a bit faster.
+    NOTE: We actually insert them at the front! That's because the order is
+          reversed in this list.
+  */
+  for (tl= parent_lex->leaf_tables; tl->next_leaf; tl= tl->next_leaf) ;
+  tl->next_leaf= subq_lex->leaf_tables;
+  last_leaf= tl;
+
+  /*
+    Same as above for next_local chain
+    (a theory: a next_local chain always starts with ::leaf_tables
+     because view's tables are inserted after the view)
+  */
+  for (tl= parent_lex->leaf_tables; tl->next_local; tl= tl->next_local) ;
+  tl->next_local= subq_lex->leaf_tables;
+
+  /* A theory: no need to re-connect the next_global chain */
+
+  /* 3. Remove the original subquery predicate from the WHERE/ON */
+
+  // The subqueries were replaced for Item_int(1) earlier
+  subq_pred->exec_method=
+    Item_in_subselect::SEMI_JOIN;         // for subsequent executions
+  /*TODO: also reset the 'with_subselect' there. */
+
+  /* n. Adjust the parent_join->tables counter */
+  uint table_no= parent_join->tables;
+  /* n. Walk through child's tables and adjust table->map */
+  for (tl= subq_lex->leaf_tables; tl; tl= tl->next_leaf, table_no++)
+  {
+    tl->table->tablenr= table_no;
+    tl->table->map= ((table_map)1) << table_no;
+    SELECT_LEX *old_sl= tl->select_lex;
+    tl->select_lex= parent_join->select_lex; 
+    for (TABLE_LIST *emb= tl->embedding;
+         emb && emb->select_lex == old_sl;
+         emb= emb->embedding)
+      emb->select_lex= parent_join->select_lex;
+  }
+  parent_join->tables += subq_lex->join->tables;
+
+  /* 
+    Put the subquery's WHERE into semi-join's sj_on_expr
+    Add the subquery-induced equalities too.
+  */
+  SELECT_LEX *save_lex= thd->lex->current_select;
+  thd->lex->current_select=subq_lex;
+  if (!subq_pred->left_expr->fixed &&
+       subq_pred->left_expr->fix_fields(thd, &subq_pred->left_expr))
+    DBUG_RETURN(TRUE);
+  thd->lex->current_select=save_lex;
+
+  sj_nest->nested_join->sj_corr_tables= subq_pred->used_tables();
+  sj_nest->nested_join->sj_depends_on=  subq_pred->used_tables() |
+                                        subq_pred->left_expr->used_tables();
+  sj_nest->sj_on_expr= subq_lex->join->conds;
+
+  /*
+    Create the IN-equalities and inject them into semi-join's ON expression.
+    Additionally, for LooseScan strategy
+     - Record the number of IN-equalities.
+     - Create list of pointers to (oe1, ..., ieN). We'll need the list to
+       see which of the expressions are bound and which are not (for those
+       we'll produce a distinct stream of (ie_i1,...ie_ik).
+
+       (TODO: can we just create a list of pointers and hope the expressions
+       will not substitute themselves on fix_fields()? or we need to wrap
+       them into Item_direct_view_refs and store pointers to those. The
+       pointers to Item_direct_view_refs are guaranteed to be stable as 
+       Item_direct_view_refs doesn't substitute itself with anything in 
+       Item_direct_view_ref::fix_fields.
+  */
+  sj_nest->sj_in_exprs= subq_pred->left_expr->cols();
+  sj_nest->nested_join->sj_outer_expr_list.empty();
+
+  if (subq_pred->left_expr->cols() == 1)
+  {
+    nested_join->sj_outer_expr_list.push_back(subq_pred->left_expr);
+    Item_func_eq *item_eq=
+      new Item_func_eq(subq_pred->left_expr, subq_lex->ref_pointer_array[0]);
+    item_eq->in_equality_no= 0;
+    sj_nest->sj_on_expr= and_items(sj_nest->sj_on_expr, item_eq);
+  }
+  else
+  {
+    for (uint i= 0; i < subq_pred->left_expr->cols(); i++)
+    {
+      nested_join->sj_outer_expr_list.push_back(subq_pred->left_expr->
+                                                element_index(i));
+      Item_func_eq *item_eq= 
+        new Item_func_eq(subq_pred->left_expr->element_index(i), 
+                         subq_lex->ref_pointer_array[i]);
+      item_eq->in_equality_no= i;
+      sj_nest->sj_on_expr= and_items(sj_nest->sj_on_expr, item_eq);
+    }
+  }
+  /* Fix the created equality and AND */
+  sj_nest->sj_on_expr->fix_fields(parent_join->thd, &sj_nest->sj_on_expr);
+
+  /*
+    Walk through sj nest's WHERE and ON expressions and call
+    item->fix_table_changes() for all items.
+  */
+  sj_nest->sj_on_expr->fix_after_pullout(parent_lex, &sj_nest->sj_on_expr);
+  fix_list_after_tbl_changes(parent_lex, &sj_nest->nested_join->join_list);
+
+
+  /* Unlink the child select_lex so it doesn't show up in EXPLAIN: */
+  subq_lex->master_unit()->exclude_level();
+
+  DBUG_EXECUTE("where",
+               print_where(sj_nest->sj_on_expr,"SJ-EXPR", QT_ORDINARY););
+
+  /* Inject sj_on_expr into the parent's WHERE or ON */
+  if (emb_tbl_nest)
+  {
+    emb_tbl_nest->on_expr= and_items(emb_tbl_nest->on_expr, 
+                                     sj_nest->sj_on_expr);
+    emb_tbl_nest->on_expr->fix_fields(parent_join->thd, &emb_tbl_nest->on_expr);
+  }
+  else
+  {
+    /* Inject into the WHERE */
+    parent_join->conds= and_items(parent_join->conds, sj_nest->sj_on_expr);
+    parent_join->conds->fix_fields(parent_join->thd, &parent_join->conds);
+    parent_join->select_lex->where= parent_join->conds;
+  }
+
+  if (subq_lex->ftfunc_list->elements)
+  {
+    Item_func_match *ifm;
+    List_iterator_fast<Item_func_match> li(*(subq_lex->ftfunc_list));
+    while ((ifm= li++))
+      parent_lex->ftfunc_list->push_front(ifm);
+  }
+
+  DBUG_RETURN(FALSE);
+}
+
+/*
+  Allocate a TABLE_LIST object together with its NESTED_JOIN
+
+  SYNOPSIS
+    alloc_join_nest()
+      thd  Thread handle; the memory is obtained with thd->calloc()
+
+  DESCRIPTION
+    Both objects are carved out of one memory chunk: the TABLE_LIST comes
+    first, and the NESTED_JOIN starts at the next aligned offset after it.
+    TABLE_LIST::nested_join is set to point at the second part.
+
+  RETURN
+    Pointer to the new TABLE_LIST
+    NULL if the allocation failed
+*/
+static TABLE_LIST *alloc_join_nest(THD *thd)
+{
+  const size_t nested_join_offset= ALIGN_SIZE(sizeof(TABLE_LIST));
+  uchar *chunk= (uchar*) thd->calloc(nested_join_offset + sizeof(NESTED_JOIN));
+  if (chunk == NULL)
+    return NULL;
+
+  TABLE_LIST *nest= (TABLE_LIST*) chunk;
+  nest->nested_join= (NESTED_JOIN*) (chunk + nested_join_offset);
+  return nest;
+}
+
+
+/*
+  Call fix_after_pullout() for all ON expressions in a join list
+
+  SYNOPSIS
+    fix_list_after_tbl_changes()
+      new_parent  Select that is passed on to Item::fix_after_pullout()
+      tlist       Join list to walk
+
+  DESCRIPTION
+    For every element of the list that has an ON expression, invoke
+    fix_after_pullout() on that expression. Elements that are join nests
+    are processed recursively through their nested_join->join_list.
+*/
+static
+void fix_list_after_tbl_changes(SELECT_LEX *new_parent, List<TABLE_LIST> *tlist)
+{
+  List_iterator<TABLE_LIST> tbl_it(*tlist);
+  for (TABLE_LIST *cur= tbl_it++; cur != NULL; cur= tbl_it++)
+  {
+    if (cur->on_expr != NULL)
+      cur->on_expr->fix_after_pullout(new_parent, &cur->on_expr);
+
+    /* Descend into nested joins after the element's own ON expression */
+    NESTED_JOIN *nested= cur->nested_join;
+    if (nested != NULL)
+      fix_list_after_tbl_changes(new_parent, &nested->join_list);
+  }
+}
+
+
+/*
+  Pull tables out of semi-join nests, if possible
+
+  SYNOPSIS
+    pull_out_semijoin_tables()
+      join  The join where to do the semi-join flattening
+
+  DESCRIPTION
+    Try to pull tables out of each semi-join nest listed in
+    join->select_lex->sj_nests.
+
+    PRECONDITIONS
+    When this function is called, the join may have several semi-join nests
+    but it is guaranteed that one semi-join nest does not contain another.
+
+    ACTION
+    A table is pulled out of the semi-join nest if it is accessed via
+    eq_ref(outer_tables), as reported by find_eq_ref_candidate(). Tables
+    that have already been pulled out count as outer tables, so pulling one
+    table out may allow pulling out further tables; the search is repeated
+    until no more tables can be pulled out.
+
+    Pulling out tables merely because they are constant is disabled: the
+    code for it is still present but is compiled out with #if 0 (see the
+    comment inside that block, BUG#43768).
+
+    POSTCONDITIONS
+     * Tables that were pulled out have JOIN_TAB::emb_sj_nest == NULL
+     * Tables that were not pulled out have JOIN_TAB::emb_sj_nest pointing
+       to semi-join nest they are in.
+     * Semi-join nests' TABLE_LIST::sj_inner_tables is updated accordingly
+     * For every pulled out table, the nest's subquery predicate is marked
+       as correlated and the table is added to the nest's sj_corr_tables
+       and sj_depends_on bitmaps.
+     * A nest that has lost all of its tables is removed from its parent
+       join list and from select_lex->sj_nests.
+
+    This operation is (and should be) performed at each PS execution since
+    tables may become/cease to be constant across PS reexecutions.
+
+  NOTE
+    Table pullout may make uncorrelated subquery correlated. Consider this
+    example:
+
+     ... WHERE oe IN (SELECT it1.primary_key WHERE p(it1, it2) ... )
+
+    here table it1 can be pulled out (we have it1.primary_key=oe which gives
+    us functional dependency). Once it1 is pulled out, all references to it1
+    from p(it1, it2) become references to outside of the subquery and thus
+    make the subquery (i.e. its semi-join nest) correlated.
+    Making the subquery (i.e. its semi-join nest) correlated prevents us from
+    using Materialization or LooseScan to execute it.
+
+  RETURN
+    0 - OK
+    1 - Out of memory error
+    NOTE(review): the body below has a single return statement and it
+    returns 0; the result of push_back() is not checked, so the error value
+    is never actually produced.
+*/
+
+int pull_out_semijoin_tables(JOIN *join)
+{
+  TABLE_LIST *sj_nest;
+  DBUG_ENTER("pull_out_semijoin_tables");
+  List_iterator<TABLE_LIST> sj_list_it(join->select_lex->sj_nests);
+   
+  /* Try pulling tables out of each of the semi-join nests */
+  while ((sj_nest= sj_list_it++))
+  {
+    /*
+      Action #1: Record the enclosing semi-join nest in the JOIN_TAB of
+      every base table of the nest. Marking constant tables as pulled out
+      is disabled (see the #if 0 block below), so pulled_tables is still 0
+      when this loop finishes.
+    */
+    table_map pulled_tables= 0;
+    List_iterator<TABLE_LIST> child_li(sj_nest->nested_join->join_list);
+    TABLE_LIST *tbl;
+    while ((tbl= child_li++))
+    {
+      if (tbl->table)
+      {
+        tbl->table->reginfo.join_tab->emb_sj_nest= sj_nest;
+#if 0 
+        /* 
+          Do not pull out tables because they are constant. This operation has
+          a problem:
+          - Some constant tables may become/cease to be constant across PS
+            re-executions
+          - Contrary to our initial assumption, it turned out that table pullout 
+            operation is not easily undoable.
+
+          The solution is to leave constant tables where they are. This will
+          affect only constant tables that are 1-row or empty, tables that are
+          constant because they are accessed via eq_ref(const) access will
+          still be pulled out as functionally-dependent.
+
+          This will cause us to miss the chance to flatten some of the 
+          subqueries, but since const tables do not generate many duplicates,
+          it really doesn't matter that much whether they were pulled out or
+          not.
+
+          All of this was done as fix for BUG#43768.
+        */
+        if (tbl->table->map & join->const_table_map)
+        {
+          pulled_tables |= tbl->table->map;
+          DBUG_PRINT("info", ("Table %s pulled out (reason: constant)",
+                              tbl->table->alias));
+        }
+#endif
+      }
+    }
+    
+    /*
+      Action #2: Find which tables we can pull out based on
+      update_ref_and_keys() data. Note that pulling one table out can allow
+      us to pull out some other tables too, which is why the scan over the
+      nest's tables is repeated until a full pass pulls out nothing.
+      The second argument of find_eq_ref_candidate() is the set of tables
+      that are still inside the nest.
+    */
+    bool pulled_a_table;
+    do 
+    {
+      pulled_a_table= FALSE;
+      child_li.rewind();
+      while ((tbl= child_li++))
+      {
+        if (tbl->table && !(pulled_tables & tbl->table->map))
+        {
+          if (find_eq_ref_candidate(tbl->table, 
+                                    sj_nest->nested_join->used_tables & 
+                                    ~pulled_tables))
+          {
+            pulled_a_table= TRUE;
+            pulled_tables |= tbl->table->map;
+            DBUG_PRINT("info", ("Table %s pulled out (reason: func dep)",
+                                tbl->table->alias.c_ptr()));
+            /*
+              Pulling a table out of an uncorrelated subquery in general
+              makes it correlated. See the NOTE to this function.
+            */
+            sj_nest->sj_subq_pred->is_correlated= TRUE;
+            sj_nest->nested_join->sj_corr_tables|= tbl->table->map;
+            sj_nest->nested_join->sj_depends_on|= tbl->table->map;
+          }
+        }
+      }
+    } while (pulled_a_table);
+ 
+    child_li.rewind();
+    /*
+      Action #3: Move the pulled out TABLE_LIST elements to the parents.
+      The destination is the join list of the nest that embeds sj_nest, or
+      the select's top-level join list if sj_nest is not embedded.
+    */
+    table_map inner_tables= sj_nest->nested_join->used_tables & 
+                            ~pulled_tables;
+    /* Record the bitmap of inner tables (those that stay in the nest) */
+    sj_nest->sj_inner_tables= inner_tables;
+    if (pulled_tables)
+    {
+      List<TABLE_LIST> *upper_join_list= (sj_nest->embedding != NULL)?
+                                           (&sj_nest->embedding->nested_join->join_list): 
+                                           (&join->select_lex->top_join_list);
+      Query_arena *arena, backup;
+      arena= join->thd->activate_stmt_arena_if_needed(&backup);
+      while ((tbl= child_li++))
+      {
+        if (tbl->table)
+        {
+          if (inner_tables & tbl->table->map)
+          {
+            /* This table is not pulled out, it remains in the nest */
+            tbl->table->reginfo.join_tab->emb_sj_nest= sj_nest;
+          }
+          else
+          {
+            /* This table has been pulled out of the semi-join nest */
+            tbl->table->reginfo.join_tab->emb_sj_nest= NULL;
+            /*
+              Pull the table up in the same way as simplify_joins() does:
+              update join_list and embedding pointers but keep next[_local]
+              pointers. The element is unlinked from the nest's join list
+              through the iterator and appended to the upper join list.
+            */
+            child_li.remove();
+            sj_nest->nested_join->used_tables &= ~tbl->table->map;
+            upper_join_list->push_back(tbl);
+            tbl->join_list= upper_join_list;
+            tbl->embedding= sj_nest->embedding;
+          }
+        }
+      }
+
+      /* Remove the sj-nest itself if we've removed everything from it */
+      if (!inner_tables)
+      {
+        List_iterator<TABLE_LIST> li(*upper_join_list);
+        /*
+          Find the sj_nest in the list. The empty-bodied loop stops with the
+          iterator positioned on sj_nest, so that remove() unlinks it.
+        */
+        while (sj_nest != li++) ;
+        li.remove();
+        /* Also remove it from the list of SJ-nests (select_lex->sj_nests): */
+        sj_list_it.remove();
+      }
+
+      if (arena)
+        join->thd->restore_active_arena(arena, &backup);
+    }
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Optimize semi-join nests that could be run with sj-materialization
+
+  SYNOPSIS
+    optimize_semijoin_nests()
+      join           The join to optimize semi-join nests for
+      all_table_map  Bitmap of all tables in the join
+
+  DESCRIPTION
+    Optimize each of the semi-join nests that can be run with
+    materialization. For each of the nests, we
+     - Generate the best join order for this "sub-join" and remember it;
+     - Remember the sub-join execution cost (it's part of materialization
+       cost);
+     - Calculate other costs that will be incurred if we decide
+       to use materialization strategy for this semi-join nest.
+
+    All obtained information is saved in an SJ_MATERIALIZATION_INFO object
+    attached to the nest as TABLE_LIST::sj_mat_info, and will be used by the
+    main join optimization pass. Nests that are not eligible end up with
+    sj_mat_info == NULL.
+
+    While a nest is being optimized join->emb_sjm_nest points to it; the
+    member is reset to NULL before the normal return.
+    NOTE(review): on the early TRUE returns join->emb_sjm_nest is left
+    pointing at the current nest - confirm that callers do not depend on it
+    after a failure.
+
+  RETURN
+    FALSE  Ok
+    TRUE   choose_plan() reported failure, or out of memory error
+*/
+
+bool optimize_semijoin_nests(JOIN *join, table_map all_table_map)
+{
+  DBUG_ENTER("optimize_semijoin_nests");
+  List_iterator<TABLE_LIST> sj_list_it(join->select_lex->sj_nests);
+  TABLE_LIST *sj_nest;
+  while ((sj_nest= sj_list_it++))
+  {
+    /*
+      semi-join nests with only constant tables are not valid
+      (the assertion for this is commented out below)
+    */
+   /// DBUG_ASSERT(sj_nest->sj_inner_tables & ~join->const_table_map);
+
+    sj_nest->sj_mat_info= NULL;
+    /*
+      The statement may have been executed with 'semijoin=on' earlier.
+      We need to verify that 'semijoin=on' still holds. Materialization is
+      considered only if the 'materialization' optimizer switch is on as
+      well.
+     */
+    if (optimizer_flag(join->thd, OPTIMIZER_SWITCH_SEMIJOIN) &&
+        optimizer_flag(join->thd, OPTIMIZER_SWITCH_MATERIALIZATION))
+    {
+      if ((sj_nest->sj_inner_tables  & ~join->const_table_map) && /* not everything was pulled out */
+          !sj_nest->sj_subq_pred->is_correlated && 
+           sj_nest->sj_subq_pred->types_allow_materialization)
+      {
+        join->emb_sjm_nest= sj_nest;
+        if (choose_plan(join, all_table_map &~join->const_table_map))
+          DBUG_RETURN(TRUE); /* purecov: inspected */
+        /*
+          The best plan to run the subquery is now in join->best_positions,
+          save it. n_tables is the number of non-constant inner tables of
+          the nest; that many POSITION elements are allocated here and
+          filled further below.
+        */
+        uint n_tables= my_count_bits(sj_nest->sj_inner_tables & ~join->const_table_map);
+        SJ_MATERIALIZATION_INFO* sjm;
+        if (!(sjm= new SJ_MATERIALIZATION_INFO) ||
+            !(sjm->positions= (POSITION*)join->thd->alloc(sizeof(POSITION)*
+                                                          n_tables)))
+          DBUG_RETURN(TRUE); /* purecov: inspected */
+        sjm->tables= n_tables;
+        sjm->is_used= FALSE;
+        double subjoin_out_rows, subjoin_read_time;
+        get_partial_join_cost(join, n_tables,
+                              &subjoin_read_time, &subjoin_out_rows);
+
+        sjm->materialization_cost.convert_from_cost(subjoin_read_time);
+        sjm->rows= subjoin_out_rows;
+
+        List<Item> &right_expr_list= 
+          sj_nest->sj_subq_pred->unit->first_select()->item_list;
+        /*
+          Adjust output cardinality estimates. If the subquery has form
+
+           ... oe IN (SELECT t1.colX, t2.colY, func(X,Y,Z) )
+
+           then the number of distinct output record combinations has an
+           upper bound of product of number of records matching the tables 
+           that are used by the SELECT clause.
+
+           The block below first refreshes join->map2table for the constant
+           tables and the nest's tables found in join->best_positions, then
+           collects the tables used by the select list items (pseudo table
+           bits are masked out), multiplies their quick_condition_rows and
+           caps sjm->rows with the product.
+           TODO:
+             We can get a more precise estimate if we
+              - use rec_per_key cardinality estimates. For simple cases like 
+                "oe IN (SELECT t.key ...)" it is trivial. 
+              - Functional dependencies between the tables in the semi-join
+                nest (the payoff is probably less here?)
+        */
+        {
+          for (uint i=0 ; i < join->const_tables + sjm->tables ; i++)
+          {
+            JOIN_TAB *tab= join->best_positions[i].table;
+            join->map2table[tab->table->tablenr]= tab;
+          }
+          List_iterator<Item> it(right_expr_list);
+          Item *item;
+          table_map map= 0;
+          while ((item= it++))
+            map |= item->used_tables();
+          map= map & ~PSEUDO_TABLE_BITS;
+          Table_map_iterator tm_it(map);
+          int tableno;
+          double rows= 1.0;
+          while ((tableno = tm_it.next_bit()) != Table_map_iterator::BITMAP_END)
+            rows *= join->map2table[tableno]->table->quick_condition_rows;
+          sjm->rows= min(sjm->rows, rows);
+        }
+        memcpy(sjm->positions, join->best_positions + join->const_tables, 
+               sizeof(POSITION) * n_tables);
+
+        /*
+          Calculate temporary table parameters and usage costs.
+          The choice between the heap and the disk lookup cost is made by
+          comparing the estimated table size (record length multiplied by
+          the unadjusted subjoin_out_rows estimate) with
+          max_heap_table_size.
+        */
+        uint rowlen= get_tmp_table_rec_length(right_expr_list);
+        double lookup_cost;
+        if (rowlen * subjoin_out_rows< join->thd->variables.max_heap_table_size)
+          lookup_cost= HEAP_TEMPTABLE_LOOKUP_COST;
+        else
+          lookup_cost= DISK_TEMPTABLE_LOOKUP_COST;
+
+        /*
+          Let materialization cost include the cost to write the data into the
+          temporary table (one lookup_cost per subjoin output row):
+        */ 
+        sjm->materialization_cost.add_io(subjoin_out_rows, lookup_cost);
+        
+        /*
+          Set the cost to do a full scan of the temptable (will need this to 
+          consider doing sjm-scan):
+        */ 
+        sjm->scan_cost.zero();
+        sjm->scan_cost.add_io(sjm->rows, lookup_cost);
+
+        sjm->lookup_cost.convert_from_cost(lookup_cost);
+        sj_nest->sj_mat_info= sjm;
+        DBUG_EXECUTE("opt", print_sjm(sjm););
+      }
+    }
+  }
+  join->emb_sjm_nest= NULL;
+  DBUG_RETURN(FALSE);
+}
+
+/*
+  Estimate the record length of a semi-join materialization temptable
+
+  SYNOPSIS
+    get_tmp_table_rec_length()
+      items  IN subquery's select list.
+
+  DESCRIPTION
+    Sum up a per-item size estimate over the select list:
+      REAL_RESULT     sizeof(double)
+      INT_RESULT      8 if max_length >= MY_INT32_NUM_DECIMAL_DIGITS - 1,
+                      otherwise 4
+      STRING_RESULT   8 for DATETIME/TIME/DATE/TIMESTAMP/GEOMETRY field
+                      types, otherwise the item's max_length
+      DECIMAL_RESULT  10
+    This is only an estimate, we don't follow every bit of
+    create_tmp_table()'s logic. That isn't necessary as the return value of
+    this function is used only for cost calculations.
+
+  RETURN
+    Length of the temptable record, in bytes
+*/
+
+static uint get_tmp_table_rec_length(List<Item> &items)
+{
+  uint rec_length= 0;
+  List_iterator<Item> item_it(items);
+  for (Item *cur= item_it++; cur != NULL; cur= item_it++)
+  {
+    switch (cur->result_type()) {
+    case REAL_RESULT:
+      rec_length+= sizeof(double);
+      break;
+    case INT_RESULT:
+      rec_length+= (cur->max_length >= (MY_INT32_NUM_DECIMAL_DIGITS - 1)) ?
+                   8 : 4;
+      break;
+    case STRING_RESULT:
+    {
+      /* DATE/TIME and GEOMETRY fields have STRING_RESULT result type.  */
+      const enum enum_field_types ftype= cur->field_type();
+      const bool counted_as_8_bytes= (ftype == MYSQL_TYPE_DATETIME ||
+                                      ftype == MYSQL_TYPE_TIME ||
+                                      ftype == MYSQL_TYPE_DATE ||
+                                      ftype == MYSQL_TYPE_TIMESTAMP ||
+                                      ftype == MYSQL_TYPE_GEOMETRY);
+      if (counted_as_8_bytes)
+        rec_length+= 8;
+      else
+        rec_length+= cur->max_length;
+      break;
+    }
+    case DECIMAL_RESULT:
+      rec_length+= 10;
+      break;
+    case ROW_RESULT:
+    default:
+      DBUG_ASSERT(0); /* purecov: deadcode */
+      break;
+    }
+  }
+  return rec_length;
+}
+
+//psergey-todo: is the below a kind of table elimination??
+/*
+  Check if table's KEYUSE elements have an eq_ref(outer_tables) candidate
+
+  SYNOPSIS
+    find_eq_ref_candidate()
+      table             Table to be checked
+      sj_inner_tables   Bitmap of inner tables. eq_ref(inner_table) doesn't
+                        count.
+
+  DESCRIPTION
+    Walk the table's KEYUSE array (table->reginfo.join_tab->keyuse) one key
+    at a time; the elements of one key are expected to be adjacent.
+
+    Keys whose KEYUSE elements are for hash join, and keys that are not
+    unique (no HA_NOSAME flag), are skipped. For every other key we collect
+    the set of key parts that have an equality "t.keypart = expr" where expr
+    does not refer to any of sj_inner_tables and the equality is not a
+    "ref or null" one. If every part of the key is bound this way, the table
+    can be accessed with eq_ref(outer_tables).
+
+    NOTE(review): the loops stop at the first element that belongs to another
+    table, so this assumes the KEYUSE array always ends with such an element
+    (or a sentinel) - confirm against the code that builds the array.
+
+  TODO
+    Check again if it is feasible to factor common parts with constant table
+    search
+
+  RETURN
+    TRUE  - There exists an eq_ref(outer-tables) candidate
+    FALSE - Otherwise
+*/
+
+bool find_eq_ref_candidate(TABLE *table, table_map sj_inner_tables)
+{
+  KEYUSE *keyuse= table->reginfo.join_tab->keyuse;
+
+  if (keyuse)
+  {
+    do
+    {
+      uint key= keyuse->key;
+      /* Set only for keys that are actually examined (see below) */
+      KEY *keyinfo= NULL;
+      key_part_map bound_parts= 0;
+      bool is_excluded_key= keyuse->is_for_hash_join(); 
+      if (!is_excluded_key)
+      {
+        /* Only unique keys can produce an eq_ref access */
+        keyinfo= table->key_info + key;
+        is_excluded_key= !test(keyinfo->flags & HA_NOSAME);
+      }
+      if (!is_excluded_key)
+      {
+        do  /* For all equalities on all key parts */
+        {
+          /* Check if this is "t.keypart = expr(outer_tables)" */
+          if (!(keyuse->used_tables & sj_inner_tables) &&
+              !(keyuse->optimize & KEY_OPTIMIZE_REF_OR_NULL))
+          {
+            /*
+              Shift in the width of key_part_map: a plain "1 << keypart"
+              would be computed as int before being widened.
+            */
+            bound_parts |= (key_part_map) 1 << keyuse->keypart;
+          }
+          keyuse++;
+        } while (keyuse->key == key && keyuse->table == table);
+
+        /* All key parts bound by outer expressions => eq_ref candidate */
+        if (bound_parts == PREV_BITS(uint, keyinfo->key_parts))
+          return TRUE;
+      }
+      else
+      {
+        /* Skip all KEYUSE elements of the excluded key */
+        do
+        {
+          keyuse++;
+        } while (keyuse->key == key && keyuse->table == table);
+      }
+    } while (keyuse->table == table);
+  }
+  return FALSE;
+}
+
+/*
+  Do semi-join optimization step after we've added a new tab to join prefix
+
+  SYNOPSIS
+    advance_sj_state()
+      join                        The join we're optimizing
+      remaining_tables            Tables not in the join prefix
+      new_join_tab                Join tab we've just added to the join prefix
+      idx                         Index of this join tab (i.e. number of tables
+                                  in the prefix minus one)
+      current_record_count INOUT  Estimate of #records in join prefix's output
+      current_read_time    INOUT  Cost to execute the join prefix
+      loose_scan_pos       IN     A POSITION with LooseScan plan to access 
+                                  table new_join_tab
+                                  (produced by the last best_access_path call)
+
+  DESCRIPTION
+    Update semi-join optimization state after we've added another tab (table 
+    and access method) to the join prefix.
+    
+    The state is maintained in join->positions[#prefix_size]. Each of the
+    available strategies has its own state variables.
+    
+    for each semi-join strategy
+    {
+      update strategy's state variables;
+
+      if (join prefix has all the tables that are needed to consider
+          using this strategy for the semi-join(s))
+      {
+        calculate cost of using the strategy
+        if ((this is the first strategy to handle the semi-join nest(s)  ||
+            the cost is less than other strategies))
+        {
+          // Pick this strategy
+          pos->sj_strategy= ..
+          ..
+        }
+      }
+
+    Most of the new state is saved join->positions[idx] (and hence no undo
+    is necessary). Several members of class JOIN are updated also, these
+    changes can be rolled back with restore_prev_sj_state().
+
+    See setup_semijoin_dups_elimination() for a description of what kinds of
+    join prefixes each strategy can handle.
+*/
+
+void advance_sj_state(JOIN *join, table_map remaining_tables, 
+                      const JOIN_TAB *new_join_tab, uint idx, 
+                      double *current_record_count, double *current_read_time, 
+                      POSITION *loose_scan_pos)
+{
+  TABLE_LIST *emb_sj_nest;
+  POSITION *pos= join->positions + idx;
+  remaining_tables &= ~new_join_tab->table->map;
+
+  pos->prefix_cost.convert_from_cost(*current_read_time);
+  pos->prefix_record_count= *current_record_count;
+  pos->sj_strategy= SJ_OPT_NONE;
+  
+  /* Initialize the state or copy it from prev. tables */
+  if (idx == join->const_tables)
+  {
+    pos->first_firstmatch_table= MAX_TABLES;
+    pos->first_loosescan_table= MAX_TABLES; 
+    pos->dupsweedout_tables= 0;
+    pos->sjm_scan_need_tables= 0;
+    LINT_INIT(pos->sjm_scan_last_inner);
+  }
+  else
+  {
+    // FirstMatch
+    pos->first_firstmatch_table=
+      (pos[-1].sj_strategy == SJ_OPT_FIRST_MATCH) ?
+      MAX_TABLES : pos[-1].first_firstmatch_table;
+    pos->first_firstmatch_rtbl= pos[-1].first_firstmatch_rtbl;
+    pos->firstmatch_need_tables= pos[-1].firstmatch_need_tables;
+
+    // LooseScan
+    pos->first_loosescan_table=
+      (pos[-1].sj_strategy == SJ_OPT_LOOSE_SCAN) ?
+      MAX_TABLES : pos[-1].first_loosescan_table;
+    pos->loosescan_need_tables= pos[-1].loosescan_need_tables;
+
+    // SJ-Materialization Scan
+    pos->sjm_scan_need_tables=
+      (pos[-1].sj_strategy == SJ_OPT_MATERIALIZE_SCAN) ?
+      0 : pos[-1].sjm_scan_need_tables;
+    pos->sjm_scan_last_inner= pos[-1].sjm_scan_last_inner;
+
+    // Duplicate Weedout
+    pos->dupsweedout_tables=      pos[-1].dupsweedout_tables;
+    pos->first_dupsweedout_table= pos[-1].first_dupsweedout_table;
+  }
+  
+  table_map handled_by_fm_or_ls= 0;
+  /* FirstMatch Strategy */
+  if (new_join_tab->emb_sj_nest &&
+      optimizer_flag(join->thd, OPTIMIZER_SWITCH_FIRSTMATCH))
+  {
+    const table_map outer_corr_tables=
+      new_join_tab->emb_sj_nest->nested_join->sj_corr_tables |
+      new_join_tab->emb_sj_nest->nested_join->sj_depends_on;
+    const table_map sj_inner_tables=
+      new_join_tab->emb_sj_nest->sj_inner_tables & ~join->const_table_map;
+
+    /* 
+      Enter condition:
+       1. The next join tab belongs to semi-join nest
+          (verified for the encompassing code block above).
+       2. We're not in a duplicate producer range yet
+       3. All outer tables that
+           - the subquery is correlated with, or
+           - referred to from the outer_expr 
+          are in the join prefix
+       4. All inner tables are still part of remaining_tables.
+    */
+    if (!join->cur_sj_inner_tables &&              // (2)
+        !(remaining_tables & outer_corr_tables) && // (3)
+        (sj_inner_tables ==                        // (4)
+         ((remaining_tables | new_join_tab->table->map) & sj_inner_tables)))
+    {
+      /* Start tracking potential FirstMatch range */
+      pos->first_firstmatch_table= idx;
+      pos->firstmatch_need_tables= sj_inner_tables;
+      pos->first_firstmatch_rtbl= remaining_tables;
+    }
+
+    if (pos->first_firstmatch_table != MAX_TABLES)
+    {
+      if (outer_corr_tables & pos->first_firstmatch_rtbl)
+      {
+        /*
+          Trying to add an sj-inner table whose sj-nest has an outer correlated 
+          table that was not in the prefix. This means FirstMatch can't be used.
+        */
+        pos->first_firstmatch_table= MAX_TABLES;
+      }
+      else
+      {
+        /* Record that we need all of this semi-join's inner tables, too */
+        pos->firstmatch_need_tables|= sj_inner_tables;
+      }
+    
+      if (!(pos->firstmatch_need_tables & remaining_tables))
+      {
+        /*
+          Got a complete FirstMatch range.
+            Calculate correct costs and fanout
+        */
+        optimize_wo_join_buffering(join, pos->first_firstmatch_table, idx,
+                                   remaining_tables, FALSE, idx,
+                                   current_record_count, 
+                                   current_read_time);
+        /*
+          We don't yet know what are the other strategies, so pick the
+          FirstMatch.
+
+          We ought to save the alternate POSITIONs produced by
+          optimize_wo_join_buffering but the problem is that providing save
+          space uses too much space. Instead, we will re-calculate the
+          alternate POSITIONs after we've picked the best QEP.
+        */
+        pos->sj_strategy= SJ_OPT_FIRST_MATCH;
+        handled_by_fm_or_ls=  pos->firstmatch_need_tables;
+      }
+    }
+  }
+
+  /* LooseScan Strategy */
+  {
+    POSITION *first=join->positions+pos->first_loosescan_table; 
+    /* 
+      LooseScan strategy can't handle interleaving between tables from the 
+      semi-join that LooseScan is handling and any other tables.
+
+      If we were considering LooseScan for the join prefix (1)
+         and the table we're adding creates an interleaving (2)
+      then 
+         stop considering loose scan
+    */
+    if ((pos->first_loosescan_table != MAX_TABLES) &&   // (1)
+        (first->table->emb_sj_nest->sj_inner_tables & remaining_tables) && //(2)
+        new_join_tab->emb_sj_nest != first->table->emb_sj_nest) //(2)
+    {
+      pos->first_loosescan_table= MAX_TABLES;
+    }
+
+    /*
+      If we got an option to use LooseScan for the current table, start
+      considering using LooseScan strategy
+    */
+    if (loose_scan_pos->read_time != DBL_MAX)
+    {
+      pos->first_loosescan_table= idx;
+      pos->loosescan_need_tables=
+        new_join_tab->emb_sj_nest->sj_inner_tables | 
+        new_join_tab->emb_sj_nest->nested_join->sj_depends_on |
+        new_join_tab->emb_sj_nest->nested_join->sj_corr_tables;
+    }
+    
+    if ((pos->first_loosescan_table != MAX_TABLES) && 
+        !(remaining_tables & pos->loosescan_need_tables))
+    {
+      /* 
+        Ok we have LooseScan plan and also have all LooseScan sj-nest's
+        inner tables and outer correlated tables into the prefix.
+      */
+
+      first=join->positions + pos->first_loosescan_table; 
+      uint n_tables= my_count_bits(first->table->emb_sj_nest->sj_inner_tables);
+      /* Got a complete LooseScan range. Calculate its cost */
+      /*
+        The same problem as with FirstMatch - we need to save POSITIONs
+        somewhere but reserving space for all cases would require too
+        much space. We will re-calculate POSITION structures later on. 
+      */
+      optimize_wo_join_buffering(join, pos->first_loosescan_table, idx,
+                                 remaining_tables, 
+                                 TRUE,  //first_alt
+                                 pos->first_loosescan_table + n_tables,
+                                 current_record_count,
+                                 current_read_time);
+      /*
+        We don't yet have any other strategies that could handle this
+        semi-join nest (the other options are Duplicate Elimination or
+        Materialization, which need at least the same set of tables in 
+        the join prefix to be considered) so unconditionally pick the 
+        LooseScan.
+      */
+      pos->sj_strategy= SJ_OPT_LOOSE_SCAN;
+      handled_by_fm_or_ls= first->table->emb_sj_nest->sj_inner_tables;
+    }
+  }
+
+  /* 
+    Update join->cur_sj_inner_tables (Used by FirstMatch in this function and
+    LooseScan detector in best_access_path)
+  */
+  if ((emb_sj_nest= new_join_tab->emb_sj_nest))
+  {
+    join->cur_sj_inner_tables |= emb_sj_nest->sj_inner_tables;
+    join->cur_dups_producing_tables |= emb_sj_nest->sj_inner_tables;
+
+    /* Remove the sj_nest if all of its SJ-inner tables are in cur_table_map */
+    if (!(remaining_tables &
+          emb_sj_nest->sj_inner_tables & ~new_join_tab->table->map))
+      join->cur_sj_inner_tables &= ~emb_sj_nest->sj_inner_tables;
+  }
+  join->cur_dups_producing_tables &= ~handled_by_fm_or_ls;
+
+  /* 4. SJ-Materialization and SJ-Materialization-scan strategy handler */
+  bool sjm_scan;
+  SJ_MATERIALIZATION_INFO *mat_info;
+  if ((mat_info= at_sjmat_pos(join, remaining_tables,
+                              new_join_tab, idx, &sjm_scan)))
+  {
+    if (sjm_scan)
+    {
+      /*
+        We can't yet evaluate this option yet. This is because we can't
+        accout for fanout of sj-inner tables yet:
+
+          ntX  SJM-SCAN(it1 ... itN) | ot1 ... otN  |
+                                     ^(1)           ^(2)
+
+        we're now at position (1). SJM temptable in general has multiple
+        records, so at point (1) we'll get the fanout from sj-inner tables (ie
+        there will be multiple record combinations).
+
+        The final join result will not contain any semi-join produced
+        fanout, i.e. tables within SJM-SCAN(...) will not contribute to
+        the cardinality of the join output.  Extra fanout produced by 
+        SJM-SCAN(...) will be 'absorbed' into fanout produced by ot1 ...  otN.
+
+        The simple way to model this is to remove SJM-SCAN(...) fanout once
+        we reach the point #2.
+      */
+      pos->sjm_scan_need_tables=
+        new_join_tab->emb_sj_nest->sj_inner_tables | 
+        new_join_tab->emb_sj_nest->nested_join->sj_depends_on |
+        new_join_tab->emb_sj_nest->nested_join->sj_corr_tables;
+      pos->sjm_scan_last_inner= idx;
+    }
+    else
+    {
+      /* This is SJ-Materialization with lookups */
+      COST_VECT prefix_cost; 
+      signed int first_tab= (int)idx - mat_info->tables;
+      double prefix_rec_count;
+      if (first_tab < (int)join->const_tables)
+      {
+        prefix_cost.zero();
+        prefix_rec_count= 1.0;
+      }
+      else
+      {
+        prefix_cost= join->positions[first_tab].prefix_cost;
+        prefix_rec_count= join->positions[first_tab].prefix_record_count;
+      }
+
+      double mat_read_time= prefix_cost.total_cost();
+      mat_read_time += mat_info->materialization_cost.total_cost() +
+                       prefix_rec_count * mat_info->lookup_cost.total_cost();
+
+      if (mat_read_time < *current_read_time || join->cur_dups_producing_tables)
+      {
+        /*
+          NOTE: When we pick to use SJM[-Scan] we don't memcpy its POSITION
+          elements to join->positions as that makes it hard to return things
+          back when making one step back in join optimization. That's done 
+          after the QEP has been chosen.
+        */
+        pos->sj_strategy= SJ_OPT_MATERIALIZE;
+        *current_read_time=    mat_read_time;
+        *current_record_count= prefix_rec_count;
+        join->cur_dups_producing_tables&=
+          ~new_join_tab->emb_sj_nest->sj_inner_tables;
+      }
+    }
+  }
+  
+  /* 4.A SJM-Scan second phase check */
+  if (pos->sjm_scan_need_tables && /* Have SJM-Scan prefix */
+      !(pos->sjm_scan_need_tables & remaining_tables))
+  {
+    TABLE_LIST *mat_nest= 
+      join->positions[pos->sjm_scan_last_inner].table->emb_sj_nest;
+    SJ_MATERIALIZATION_INFO *mat_info= mat_nest->sj_mat_info;
+
+    double prefix_cost;
+    double prefix_rec_count;
+    int first_tab= pos->sjm_scan_last_inner + 1 - mat_info->tables;
+    /* Get the prefix cost */
+    if (first_tab == (int)join->const_tables)
+    {
+      prefix_rec_count= 1.0;
+      prefix_cost= 0.0;
+    }
+    else
+    {
+      prefix_cost= join->positions[first_tab - 1].prefix_cost.total_cost();
+      prefix_rec_count= join->positions[first_tab - 1].prefix_record_count;
+    }
+
+    /* Add materialization cost */
+    prefix_cost += mat_info->materialization_cost.total_cost() +
+                   prefix_rec_count * mat_info->scan_cost.total_cost();
+    prefix_rec_count *= mat_info->rows;
+    
+    uint i;
+    table_map rem_tables= remaining_tables;
+    for (i= idx; i != (first_tab + mat_info->tables - 1); i--)
+      rem_tables |= join->positions[i].table->table->map;
+
+    POSITION curpos, dummy;
+    /* Need to re-run best_access_path as prefix_rec_count has changed */
+    for (i= first_tab + mat_info->tables; i <= idx; i++)
+    {
+      best_access_path(join, join->positions[i].table, rem_tables, i, FALSE,
+                       prefix_rec_count, &curpos, &dummy);
+      prefix_rec_count *= curpos.records_read;
+      prefix_cost += curpos.read_time;
+    }
+
+    /*
+      Use the strategy if 
+       * it is cheaper then what we've had, or
+       * we haven't picked any other semi-join strategy yet
+      In the second case, we pick this strategy unconditionally because
+      comparing cost without semi-join duplicate removal with cost with
+      duplicate removal is not an apples-to-apples comparison.
+    */
+    if (prefix_cost < *current_read_time || join->cur_dups_producing_tables)
+    {
+      pos->sj_strategy= SJ_OPT_MATERIALIZE_SCAN;
+      *current_read_time=    prefix_cost;
+      *current_record_count= prefix_rec_count;
+      join->cur_dups_producing_tables&= ~mat_nest->sj_inner_tables;
+
+    }
+  }
+
+  /* 5. Duplicate Weedout strategy handler */
+  {
+    /* 
+       Duplicate weedout can be applied after all ON-correlated and 
+       correlated 
+    */
+    TABLE_LIST *nest;
+    if ((nest= new_join_tab->emb_sj_nest))
+    {
+      if (!pos->dupsweedout_tables)
+        pos->first_dupsweedout_table= idx;
+
+      pos->dupsweedout_tables |= nest->sj_inner_tables |
+                                 nest->nested_join->sj_depends_on |
+                                 nest->nested_join->sj_corr_tables;
+    }
+
+    if (pos->dupsweedout_tables && 
+        !(remaining_tables &
+          ~new_join_tab->table->map & pos->dupsweedout_tables))
+    {
+      /*
+        Ok, reached a state where we could put a dups weedout point.
+        Walk back and calculate
+          - the join cost (this is needed as the accumulated cost may assume 
+            some other duplicate elimination method)
+          - extra fanout that will be removed by duplicate elimination
+          - duplicate elimination cost
+        There are two cases:
+          1. We have other strategy/ies to remove all of the duplicates.
+          2. We don't.
+        
+        We need to calculate the cost in case #2 also because we need to make
+        choice between this join order and others.
+      */
+      uint first_tab= pos->first_dupsweedout_table;
+      double dups_cost;
+      double prefix_rec_count;
+      double sj_inner_fanout= 1.0;
+      double sj_outer_fanout= 1.0;
+      uint temptable_rec_size;
+      if (first_tab == join->const_tables)
+      {
+        prefix_rec_count= 1.0;
+        temptable_rec_size= 0;
+        dups_cost= 0.0;
+      }
+      else
+      {
+        dups_cost= join->positions[first_tab - 1].prefix_cost.total_cost();
+        prefix_rec_count= join->positions[first_tab - 1].prefix_record_count;
+        temptable_rec_size= 8; /* This is not true but we'll make it so */
+      }
+      
+      table_map dups_removed_fanout= 0;
+      for (uint j= pos->first_dupsweedout_table; j <= idx; j++)
+      {
+        POSITION *p= join->positions + j;
+        dups_cost += p->read_time;
+        if (p->table->emb_sj_nest)
+        {
+          sj_inner_fanout *= p->records_read;
+          dups_removed_fanout |= p->table->table->map;
+        }
+        else
+        {
+          sj_outer_fanout *= p->records_read;
+          temptable_rec_size += p->table->table->file->ref_length;
+        }
+      }
+
+      /*
+        Add the cost of temptable use. The table will have sj_outer_fanout
+        records, and we will make 
+        - sj_outer_fanout table writes
+        - sj_inner_fanout*sj_outer_fanout  lookups.
+
+      */
+      double one_lookup_cost;
+      if (sj_outer_fanout*temptable_rec_size > 
+          join->thd->variables.max_heap_table_size)
+        one_lookup_cost= DISK_TEMPTABLE_LOOKUP_COST;
+      else
+        one_lookup_cost= HEAP_TEMPTABLE_LOOKUP_COST;
+
+      double write_cost= join->positions[first_tab].prefix_record_count* 
+                         sj_outer_fanout * one_lookup_cost;
+      double full_lookup_cost= join->positions[first_tab].prefix_record_count* 
+                               sj_outer_fanout* sj_inner_fanout * 
+                               one_lookup_cost;
+      dups_cost += write_cost + full_lookup_cost;
+      
+      /*
+        Use the strategy if 
+         * it is cheaper then what we've had, or
+         * we haven't picked any other semi-join strategy yet
+        The second part is necessary because this strategy is the last one
+        to consider (it needs "the most" tables in the prefix) and we can't
+        leave duplicate-producing tables not handled by any strategy.
+      */
+      if (dups_cost < *current_read_time || join->cur_dups_producing_tables)
+      {
+        pos->sj_strategy= SJ_OPT_DUPS_WEEDOUT;
+        *current_read_time= dups_cost;
+        *current_record_count= prefix_rec_count * sj_outer_fanout;
+        join->cur_dups_producing_tables &= ~dups_removed_fanout;
+      }
+    }
+  }
+}
+
+
+/*
+  Undo the semi-join nest bookkeeping for a table leaving the join prefix
+
+  SYNOPSIS
+    restore_prev_sj_state()
+      remaining_tables  Tables outside the join prefix; it is assumed that
+                        this bitmap does not contain @tab
+      tab               Join tab that is being removed from the prefix
+      idx               Not used by this function
+
+  DESCRIPTION
+    Nothing is done unless @tab is inside a semi-join nest. If it is, and
+    all other inner tables of that nest are in remaining_tables (i.e. @tab
+    is the only inner table of the nest that was in the prefix), the nest's
+    inner tables are cleared from join->cur_sj_inner_tables.
+*/
+
+void restore_prev_sj_state(const table_map remaining_tables, 
+                                  const JOIN_TAB *tab, uint idx)
+{
+  TABLE_LIST *const nest= tab->emb_sj_nest;
+  if (!nest)
+    return;                                     /* Not an SJ-inner table */
+
+  const table_map nest_tables= nest->sj_inner_tables;
+  const table_map nest_without_tab= nest_tables & ~tab->table->map;
+
+  /* Removing the last SJ-inner table of the prefix removes the sj-nest */
+  if ((remaining_tables & nest_tables) == nest_without_tab)
+    tab->join->cur_sj_inner_tables &= ~nest_tables;
+}
+
+
+/*
+  Given a semi-join nest, find out which of the IN-equalities are bound
+
+  SYNOPSIS
+    get_bound_sj_equalities()
+      sj_nest           Semi-join nest
+      remaining_tables  Tables that are not yet bound
+
+  DESCRIPTION
+    Given a semi-join nest, find out which of the IN-equalities have their
+    left part expression bound (i.e. the said expression doesn't refer to
+    any of remaining_tables and can be evaluated).
+
+    The left part expressions are taken from
+    sj_nest->nested_join->sj_outer_expr_list; the expression at list
+    position N (counting from 0) corresponds to bit N of the result.
+
+  NOTES
+    The result is a 64-bit bitmap, so this assumes the list has no more than
+    64 elements (a bigger list would make the shift go out of range) - TODO
+    confirm the limit is enforced by the callers.
+
+  RETURN
+    Bitmap of bound IN-equalities: bit N is set <=> the N-th left part
+    expression doesn't refer to any of remaining_tables.
+*/
+
+ulonglong get_bound_sj_equalities(TABLE_LIST *sj_nest, 
+                                  table_map remaining_tables)
+{
+  List_iterator<Item> li(sj_nest->nested_join->sj_outer_expr_list);
+  Item *item;
+  uint i= 0;
+  ulonglong res= 0;
+  while ((item= li++))
+  {
+    /*
+      Q: should this take into account equality propagation and how?
+      A: If e->outer_side is an Item_field, walk over the equality
+         class and see if there is an element that is bound?
+      (this is an optional feature)
+    */
+    if (!(item->used_tables() & remaining_tables))
+    {
+      res |= 1ULL << i;
+    }
+    /*
+      Move on to the next equality's bit. Without this every bound equality
+      would be recorded in bit 0 and the result wouldn't be a bitmap.
+    */
+    i++;
+  }
+  return res;
+}
+
+
+/*
+  Check if the last tables of the partial join order allow to use
+  sj-materialization strategy for them
+
+  SYNOPSIS
+    at_sjmat_pos()
+      join              Join whose positions[] holds the partial join order
+      remaining_tables  Tables that are not in the join prefix (the bit of
+                        @tab is masked out before use)
+      tab               The last table's join tab
+      idx               Last table's index in join->positions
+      loose_scan    OUT Written only if the nest and adjacency checks have
+                        passed. TRUE <=> some table that comes after @tab is
+                        among the nest's sj_inner_tables or sj_depends_on.
+                        (Presumably this means the materialized table has to
+                        be scanned rather than used for lookups - confirm
+                        against the callers.)
+
+  DESCRIPTION
+    The strategy is applicable if
+     1. @tab is in a semi-join nest that has sj_mat_info,
+     2. none of the nest's inner tables are left to be added after @tab,
+     3. the tables immediately preceding @tab in the join order are the
+        other inner tables of the same nest,
+     4. if *loose_scan was set to TRUE, the subquery predicate has
+        sjm_scan_allowed set.
+
+  RETURN
+    The nest's SJ_MATERIALIZATION_INFO  Yes, can apply sj-materialization
+    NULL                                No, some of the requirements are
+                                        not met
+*/
+
+static SJ_MATERIALIZATION_INFO *
+at_sjmat_pos(const JOIN *join, table_map remaining_tables, const JOIN_TAB *tab,
+             uint idx, bool *loose_scan)
+{
+  TABLE_LIST *const nest= tab->emb_sj_nest;
+  const table_map suffix= remaining_tables & ~tab->table->map;
+
+  /* Requirements 1 and 2 */
+  if (!nest || !nest->sj_mat_info || (suffix & nest->sj_inner_tables))
+    return NULL;
+
+  /* Requirement 3: step back over the other inner tables of the nest */
+  const uint n_inner= my_count_bits(nest->sj_inner_tables);
+  for (uint back= 1; back < n_inner; back++)
+  {
+    if (join->positions[idx - back].table->emb_sj_nest != nest)
+      return NULL;
+  }
+
+  *loose_scan= test(suffix & (nest->sj_inner_tables |
+                              nest->nested_join->sj_depends_on));
+
+  /* Requirement 4 */
+  if (*loose_scan && !nest->sj_subq_pred->sjm_scan_allowed)
+    return NULL;
+
+  return nest->sj_mat_info;
+}
+
+
+
+/*
+  Fix semi-join strategies for the picked join order
+
+  SYNOPSIS
+    fix_semijoin_strategies_for_picked_join_order()
+      join  The join with the picked join order
+
+  DESCRIPTION
+    Fix semi-join strategies for the picked join order. This is a step that
+    needs to be done right after we have fixed the join order. What we do
+    here is switch join's semi-join strategy description from backward-based
+    to forwards based.
+    
+    When join optimization is in progress, we re-consider semi-join
+    strategies after we've added another table. Here's an illustration.
+    Suppose the join optimization is underway:
+
+    1) ot1  it1  it2 
+                 sjX  -- looking at (ot1, it1, it2) join prefix, we decide
+                         to use semi-join strategy sjX.
+
+    2) ot1  it1  it2  ot2 
+                 sjX  sjY -- Having added table ot2, we now may consider
+                             another semi-join strategy and decide to use a 
+                             different strategy sjY. Note that the record
+                             of sjX has remained under it2. That is
+                             necessary because we need to be able to get
+                             back to (ot1, it1, it2) join prefix.
+      what makes things even worse is that there are cases where the choice
+      of sjY changes the way we should access it2. 
+
+    3) [ot1  it1  it2  ot2  ot3]
+                  sjX  sjY  -- This means that after join optimization is
+                               finished, semi-join info should be read
+                               right-to-left (while nearly all plan refinement
+                               functions, EXPLAIN, etc proceed from left to 
+                               right)
+
+    This function does the needed reversal, making it possible to read the
+    join and semi-join order from left to right.
+
+  IMPLEMENTATION
+    join->best_positions[] is walked from the last table down to the first
+    non-const table. A position is skipped if its sj_strategy is SJ_OPT_NONE
+    or if its table has already been covered by a strategy found further to
+    the right. Otherwise the code determines 'first', the position where the
+    strategy's table range starts, and
+     - stores the strategy and the number of tables it covers (n_sj_tables)
+       in best_positions[first],
+     - for SJ-Materialization variants copies the saved sjm->positions over
+       the nest's range in best_positions[],
+     - for SJM-Scan, FirstMatch and LooseScan re-runs best_access_path() for
+       tables in the range (see the individual branches),
+     - resets sj_strategy to SJ_OPT_NONE at all other positions of the range
+       and marks the range's tables as handled,
+     - copies the strategy to join->join_tab[first].sj_strategy.
+
+  NOTES
+    NOTE(review): the FirstMatch and LooseScan branches take the initial
+    record_count from best_positions[tablenr - 1] while the re-calculation
+    loop starts at 'first'; the SJM-Scan branch uses best_positions[first-1]
+    for the same purpose. Confirm whether [first - 1] was intended.
+*/    
+
+void fix_semijoin_strategies_for_picked_join_order(JOIN *join)
+{
+  uint table_count=join->tables;
+  uint tablenr;
+  table_map remaining_tables= 0; /* Tables at positions after tablenr */
+  table_map handled_tabs= 0; /* Tables inside already processed sj ranges */
+  for (tablenr= table_count - 1 ; tablenr != join->const_tables - 1; tablenr--)
+  { /* Right-to-left walk over the non-const part of the join order */
+    POSITION *pos= join->best_positions + tablenr;
+    JOIN_TAB *s= pos->table;
+    uint first; /* Start of the range covered by the strategy found at pos */
+    LINT_INIT(first); // Set by every branch except SJ_OPT_NONE which doesn't use it
+
+    if ((handled_tabs & s->table->map) || pos->sj_strategy == SJ_OPT_NONE)
+    { /* Nothing to fix here, just account for the table */
+      remaining_tables |= s->table->map;
+      continue;
+    }
+    
+    if (pos->sj_strategy == SJ_OPT_MATERIALIZE)
+    { /* The nest's tables occupy the sjm->tables positions ending at pos */
+      SJ_MATERIALIZATION_INFO *sjm= s->emb_sj_nest->sj_mat_info;
+      sjm->is_used= TRUE;
+      sjm->is_sj_scan= FALSE;
+      memcpy(pos - sjm->tables + 1, sjm->positions, 
+             sizeof(POSITION) * sjm->tables);
+      first= tablenr - sjm->tables + 1;
+      join->best_positions[first].n_sj_tables= sjm->tables;
+      join->best_positions[first].sj_strategy= SJ_OPT_MATERIALIZE;
+    }
+    else if (pos->sj_strategy == SJ_OPT_MATERIALIZE_SCAN)
+    { /* The nest ends at sjm_scan_last_inner, which may be before pos */
+      POSITION *first_inner= join->best_positions + pos->sjm_scan_last_inner;
+      SJ_MATERIALIZATION_INFO *sjm= first_inner->table->emb_sj_nest->sj_mat_info;
+      sjm->is_used= TRUE;
+      sjm->is_sj_scan= TRUE;
+      first= pos->sjm_scan_last_inner - sjm->tables + 1;
+      memcpy(join->best_positions + first, 
+             sjm->positions, sizeof(POSITION) * sjm->tables);
+      join->best_positions[first].sj_strategy= SJ_OPT_MATERIALIZE_SCAN;
+      join->best_positions[first].n_sj_tables= sjm->tables;
+      /* 
+        Do what advance_sj_state did: re-run best_access_path for every table
+        in the [last_inner_table + 1; pos..) range
+      */
+      double prefix_rec_count;
+      /* Get the prefix record count */
+      if (first == join->const_tables)
+        prefix_rec_count= 1.0;
+      else
+        prefix_rec_count= join->best_positions[first-1].prefix_record_count;
+      
+      /* Add materialization record count*/
+      prefix_rec_count *= sjm->rows;
+      
+      uint i;
+      table_map rem_tables= remaining_tables;
+      /* Add back the tables that follow the nest, up to and including pos */
+      for (i= tablenr; i != (first + sjm->tables - 1); i--)
+        rem_tables |= join->best_positions[i].table->table->map;
+
+      POSITION dummy;
+      join->cur_sj_inner_tables= 0;
+      for (i= first + sjm->tables; i <= tablenr; i++)
+      { /* The new access method is written right into best_positions[i] */
+        best_access_path(join, join->best_positions[i].table, rem_tables, i, FALSE,
+                         prefix_rec_count, join->best_positions + i, &dummy);
+        prefix_rec_count *= join->best_positions[i].records_read;
+        rem_tables &= ~join->best_positions[i].table->table->map;
+      }
+    }
+ 
+    if (pos->sj_strategy == SJ_OPT_FIRST_MATCH)
+    { /* The range is [first_firstmatch_table, tablenr] */
+      first= pos->first_firstmatch_table;
+      join->best_positions[first].sj_strategy= SJ_OPT_FIRST_MATCH;
+      join->best_positions[first].n_sj_tables= tablenr - first + 1;
+      POSITION dummy; // For loose scan paths
+      double record_count= (first== join->const_tables)? 1.0: 
+                           join->best_positions[tablenr - 1].prefix_record_count;
+      /* NOTE(review): [tablenr - 1] above - was [first - 1] intended? */
+      table_map rem_tables= remaining_tables;
+      uint idx;
+      for (idx= first; idx <= tablenr; idx++)
+      {
+        rem_tables |= join->best_positions[idx].table->table->map;
+      }
+      /*
+        Re-run best_access_path to produce best access methods that do not use
+        join buffering
+      */ 
+      join->cur_sj_inner_tables= 0;
+      for (idx= first; idx <= tablenr; idx++)
+      { /* Only positions that were planned with join buffering are redone */
+        if (join->best_positions[idx].use_join_buffer)
+        {
+           best_access_path(join, join->best_positions[idx].table, 
+                            rem_tables, idx, TRUE /* no jbuf */,
+                            record_count, join->best_positions + idx, &dummy);
+        }
+        record_count *= join->best_positions[idx].records_read;
+        rem_tables &= ~join->best_positions[idx].table->table->map;
+      }
+    }
+
+    if (pos->sj_strategy == SJ_OPT_LOOSE_SCAN) 
+    { /* The range starts at first_loosescan_table */
+      first= pos->first_loosescan_table;
+      POSITION *first_pos= join->best_positions + first;
+      POSITION loose_scan_pos; // For loose scan paths
+      double record_count= (first== join->const_tables)? 1.0: 
+                           join->best_positions[tablenr - 1].prefix_record_count;
+      /* NOTE(review): [tablenr - 1] above - was [first - 1] intended? */
+      table_map rem_tables= remaining_tables;
+      uint idx;
+      for (idx= first; idx <= tablenr; idx++)
+        rem_tables |= join->best_positions[idx].table->table->map;
+      /*
+        Re-run best_access_path to produce best access methods that do not use
+        join buffering
+      */ 
+      join->cur_sj_inner_tables= 0;
+      for (idx= first; idx <= tablenr; idx++)
+      { /* The first table is always redone to obtain loose_scan_pos */
+        if (join->best_positions[idx].use_join_buffer || (idx == first))
+        {
+           best_access_path(join, join->best_positions[idx].table,
+                            rem_tables, idx, TRUE /* no jbuf */,
+                            record_count, join->best_positions + idx,
+                            &loose_scan_pos);
+           if (idx==first) /* Use the loose scan access for the first table */
+             join->best_positions[idx]= loose_scan_pos;
+        }
+        rem_tables &= ~join->best_positions[idx].table->table->map;
+        record_count *= join->best_positions[idx].records_read;
+      }
+      first_pos->sj_strategy= SJ_OPT_LOOSE_SCAN;
+      first_pos->n_sj_tables= my_count_bits(first_pos->table->emb_sj_nest->sj_inner_tables);
+    }
+
+    if (pos->sj_strategy == SJ_OPT_DUPS_WEEDOUT)
+    {
+      /* 
+        Duplicate Weedout starting at pos->first_dupsweedout_table, ending at
+        this table.
+      */
+      first= pos->first_dupsweedout_table;
+      join->best_positions[first].sj_strategy= SJ_OPT_DUPS_WEEDOUT;
+      join->best_positions[first].n_sj_tables= tablenr - first + 1;
+    }
+    /* Keep the strategy only at 'first'; mark the whole range as handled */
+    uint i_end= first + join->best_positions[first].n_sj_tables;
+    for (uint i= first; i < i_end; i++)
+    {
+      if (i != first)
+        join->best_positions[i].sj_strategy= SJ_OPT_NONE;
+      handled_tabs |= join->best_positions[i].table->table->map;
+    }
+
+    if (tablenr != first) /* pos may be outside of [first, i_end) */
+      pos->sj_strategy= SJ_OPT_NONE;
+    remaining_tables |= s->table->map;
+    //s->sj_strategy= pos->sj_strategy;
+    join->join_tab[first].sj_strategy= join->best_positions[first].sj_strategy;
+  }
+}
+
+/*
+  Setup semi-join materialization strategy for one semi-join nest
+  
+  SYNOPSIS
+
+  setup_sj_materialization()
+    tab  The first tab in the semi-join
+
+  DESCRIPTION
+    Setup execution structures for one semi-join materialization nest:
+    - Create the materialization temporary table
+    - If we're going to do index lookups
+        create TABLE_REF structure to make the lookups
+    - else (if we're going to do a full scan of the temptable)
+        create Copy_field structures to do copying.
+
+    The temporary table has one column per element of the subquery's select
+    list and is created with the 'distinct' flag. The table is added to
+    join->sj_tmp_tables and the nest's SJ_MATERIALIZATION_INFO is added to
+    join->sjm_info_list.
+
+    In the lookup case the function also removes the injected IN-equalities
+    from the conditions of the nest's join tabs and creates
+    sjm->in_equality, which refers to the temptable columns.
+
+  NOTES
+    NOTE(review): the results of "new store_key_item(...)" and
+    "new Copy_field[...]" are used without a check, and the return values
+    of the push_back() calls are ignored, while other allocations in this
+    function do report failure. Confirm whether that is intended.
+
+  RETURN
+    FALSE  Ok
+    TRUE   Error
+*/
+
+bool setup_sj_materialization(JOIN_TAB *tab)
+{
+  uint i;
+  DBUG_ENTER("setup_sj_materialization");
+  TABLE_LIST *emb_sj_nest= tab->table->pos_in_table_list->embedding;
+  SJ_MATERIALIZATION_INFO *sjm= emb_sj_nest->sj_mat_info;
+  THD *thd= tab->join->thd;
+  /* First the calls come to the materialization function */
+  List<Item> &item_list= emb_sj_nest->sj_subq_pred->unit->first_select()->item_list;
+
+  /* 
+    Set up the table to write to, do as select_union::create_result_table does
+  */
+  sjm->sjm_table_param.init();
+  sjm->sjm_table_param.field_count= item_list.elements;
+  sjm->sjm_table_param.bit_fields_as_long= TRUE;
+  List_iterator<Item> it(item_list);
+  Item *right_expr;
+  while((right_expr= it++)) /* The temptable columns are the select list */
+    sjm->sjm_table_cols.push_back(right_expr);
+
+  if (!(sjm->table= create_tmp_table(thd, &sjm->sjm_table_param, 
+                                     sjm->sjm_table_cols, (ORDER*) 0, 
+                                     TRUE /* distinct */, 
+                                     1, /*save_sum_fields*/
+                                     thd->options | TMP_TABLE_ALL_COLUMNS, 
+                                     HA_POS_ERROR /*rows_limit */, 
+                                     (char*)"sj-materialize")))
+    DBUG_RETURN(TRUE); /* purecov: inspected */
+  sjm->table->file->extra(HA_EXTRA_WRITE_CACHE);
+  sjm->table->file->extra(HA_EXTRA_IGNORE_DUP_KEY);
+  tab->join->sj_tmp_tables.push_back(sjm->table);
+  tab->join->sjm_info_list.push_back(sjm);
+  
+  sjm->materialized= FALSE;
+  if (!sjm->is_sj_scan)
+  { /* Lookup variant: the temptable will be probed through its index */
+    KEY           *tmp_key; /* The only index on the temporary table. */
+    uint          tmp_key_parts; /* Number of keyparts in tmp_key. */
+    tmp_key= sjm->table->key_info;
+    tmp_key_parts= tmp_key->key_parts;
+    
+    /*
+      Create/initialize everything we will need to index lookups into the
+      temptable.
+    */
+    TABLE_REF *tab_ref;
+    if (!(tab_ref= (TABLE_REF*) thd->alloc(sizeof(TABLE_REF))))
+      DBUG_RETURN(TRUE); /* purecov: inspected */
+    tab_ref->key= 0; /* The only temp table index. */
+    tab_ref->key_length= tmp_key->key_length;
+    /*
+      key_buff gets room for two aligned copies of the key (the second one
+      becomes key_buff2); key_copy gets one extra slot for the end marker.
+    */
+    if (!(tab_ref->key_buff=
+          (uchar*) thd->calloc(ALIGN_SIZE(tmp_key->key_length) * 2)) ||
+        !(tab_ref->key_copy=
+          (store_key**) thd->alloc((sizeof(store_key*) *
+                                    (tmp_key_parts + 1)))) ||
+        !(tab_ref->items=
+          (Item**) thd->alloc(sizeof(Item*) * tmp_key_parts)))
+      DBUG_RETURN(TRUE); /* purecov: inspected */
+
+    tab_ref->key_buff2=tab_ref->key_buff+ALIGN_SIZE(tmp_key->key_length);
+    tab_ref->key_err=1;
+    tab_ref->null_rejecting= 1;
+    tab_ref->disable_cache= FALSE;
+
+    KEY_PART_INFO *cur_key_part= tmp_key->key_part;
+    store_key **ref_key= tab_ref->key_copy;
+    uchar *cur_ref_buff= tab_ref->key_buff;
+    /* One store_key per keypart, fed by the i-th left expression element */
+    for (i= 0; i < tmp_key_parts; i++, cur_key_part++, ref_key++)
+    {
+      tab_ref->items[i]= emb_sj_nest->sj_subq_pred->left_expr->element_index(i);
+      int null_count= test(cur_key_part->field->real_maybe_null());
+      *ref_key= new store_key_item(thd, cur_key_part->field,
+                                   /* TODO:
+                                      the NULL byte is taken into account in
+                                      cur_key_part->store_length, so instead of
+                                      cur_ref_buff + test(maybe_null), we could
+                                      use that information instead.
+                                   */
+                                   cur_ref_buff + null_count,
+                                   null_count ? tab_ref->key_buff : 0,
+                                   cur_key_part->length, tab_ref->items[i],
+                                   FALSE);
+      cur_ref_buff+= cur_key_part->store_length;
+    }
+    *ref_key= NULL; /* End marker. */
+    tab_ref->key_err= 1; /* NOTE(review): already set to 1 above */
+    tab_ref->key_parts= tmp_key_parts;
+    sjm->tab_ref= tab_ref;
+
+    /*
+      Remove the injected semi-join IN-equalities from join_tab conds. This
+      needs to be done because the IN-equalities refer to columns of
+      sj-inner tables which are not available after the materialization
+      has been finished.
+    */
+    for (i= 0; i < sjm->tables; i++)
+    { /* The nest's join tabs are tab[0] .. tab[sjm->tables - 1] */
+      remove_sj_conds(&tab[i].select_cond);
+      if (tab[i].select)
+        remove_sj_conds(&tab[i].select->cond);
+    }
+    if (!(sjm->in_equality= create_subq_in_equalities(thd, sjm,
+                                                      emb_sj_nest->sj_subq_pred)))
+      DBUG_RETURN(TRUE); /* purecov: inspected */
+  }
+  else
+  {
+    /*
+      We'll be doing full scan of the temptable.  
+      Setup copying of temptable columns back to the record buffers
+      for their source tables. We need this because IN-equalities
+      refer to the original tables.
+
+      EXAMPLE
+
+      Consider the query:
+        SELECT * FROM ot WHERE ot.col1 IN (SELECT it.col2 FROM it)
+      
+      Suppose it's executed with SJ-Materialization-scan. We choose to do scan
+      if we can't do the lookup, i.e. the join order is (it, ot). The plan
+      would look as follows:
+
+        table    access method      condition
+         it      materialize+scan    -
+         ot      (whatever)          ot1.col1=it.col2 (C2)
+
+      The condition C2 refers to current row of table it. The problem is
+      that by the time we evaluate C2, we would have finished with scanning
+      it itself and will be scanning the temptable. 
+
+      At the moment, our solution is to copy back: when we get the next
+      temptable record, we copy its columns to their corresponding columns
+      in the record buffers for the source tables. 
+    */
+    sjm->copy_field= new Copy_field[sjm->sjm_table_cols.elements];
+    it.rewind(); /* Walk the subquery's select list once more */
+    for (uint i=0; i < sjm->sjm_table_cols.elements; i++)
+    { /* Note: this 'i' shadows the function-level 'i' */
+      bool dummy;
+      Item_equal *item_eq;
+      Item *item= (it++)->real_item();
+      DBUG_ASSERT(item->type() == Item::FIELD_ITEM);
+      Field *copy_to= ((Item_field*)item)->field;
+      /*
+        Tricks with Item_equal are due to the following: suppose we have a
+        query:
+        
+        ... WHERE cond(ot.col) AND ot.col IN (SELECT it2.col FROM it1,it2
+                                               WHERE it1.col= it2.col)
+         then equality propagation will create an 
+         
+           Item_equal(it1.col, it2.col, ot.col) 
+         
+         then substitute_for_best_equal_field() will change the conditions
+         according to the join order:
+
+           it1
+           it2    it1.col=it2.col
+           ot     cond(it1.col)
+
+         although we've originally had "SELECT it2.col", conditions attached 
+         to subsequent outer tables will refer to it1.col, so SJM-Scan will
+         need to unpack data to there. 
+         That is, if an element from subquery's select list participates in 
+         equality propagation, then we need to unpack it to the first
+         element equality propagation member that refers to table that is
+         within the subquery.
+      */
+      item_eq= find_item_equal(tab->join->cond_equal, copy_to, &dummy);
+
+      if (item_eq)
+      { /* The 'it' and 'item' declared below shadow the outer ones */
+        List_iterator<Item_field> it(item_eq->fields);
+        Item_field *item;
+        while ((item= it++))
+        { /* Take the first member that uses only the nest's inner tables */
+          if (!(item->used_tables() & ~emb_sj_nest->sj_inner_tables))
+          {
+            copy_to= item->field;
+            break;
+          }
+        }
+      }
+      sjm->copy_field[i].set(copy_to, sjm->table->field[i], FALSE);
+      /* The write_set for source tables must be set up to allow the copying */
+      bitmap_set_bit(copy_to->table->write_set, copy_to->field_index);
+    }
+  }
+
+  DBUG_RETURN(FALSE);
+}
+
+
+
+/*
+  Create subquery IN-equalities assuming use of materialization strategy
+  
+  SYNOPSIS
+    create_subq_in_equalities()
+      thd        Thread handle
+      sjm        Semi-join materialization structure; sjm->table must
+                 already point to the materialization temporary table
+      subq_pred  The subquery predicate
+
+  DESCRIPTION
+    Create subquery IN-equality predicates. That is, for a subquery
+    
+      (oe1, oe2, ...) IN (SELECT ie1, ie2, ... FROM ...)
+    
+    create "oe1=ie1 AND oe2=ie2 AND ..." expression, such that ie1, ie2, ..
+    refer to the columns of the table that's used to materialize the
+    subquery (the i-th equality uses the i-th field of sjm->table).
+
+    If the left expression has one column, it is used as is; otherwise its
+    elements are obtained with element_index(). fix_fields() is called for
+    the result before it is returned.
+
+  RETURN 
+    Created condition
+    NULL on error
+*/
+
+static Item *create_subq_in_equalities(THD *thd, SJ_MATERIALIZATION_INFO *sjm, 
+                                Item_in_subselect *subq_pred)
+{
+  Item *cond= NULL;
+
+  if (subq_pred->left_expr->cols() != 1)
+  {
+    /* Row IN (...): AND together one equality per column */
+    for (uint col= 0; col < subq_pred->left_expr->cols(); col++)
+    {
+      Item *eq= new Item_func_eq(subq_pred->left_expr->element_index(col), 
+                                 new Item_field(sjm->table->field[col]));
+      if (!eq)
+        return NULL; /* purecov: inspected */
+      cond= and_items(cond, eq);
+      if (!cond)
+        return NULL; /* purecov: inspected */
+    }
+  }
+  else
+  {
+    /* Scalar IN (...): a single equality with the first temptable column */
+    cond= new Item_func_eq(subq_pred->left_expr,
+                           new Item_field(sjm->table->field[0]));
+    if (!cond)
+      return NULL; /* purecov: inspected */
+  }
+
+  if (cond->fix_fields(thd, &cond))
+    return NULL; /* purecov: inspected */
+  return cond;
+}
+
+
+
+
+/*
+  Remove injected semi-join IN-equalities from a condition
+
+  SYNOPSIS
+    remove_sj_conds()
+      tree  IN/OUT  The condition to process, *tree may be NULL
+
+  DESCRIPTION
+    - If *tree itself is an injected IN-equality, *tree is set to NULL.
+    - If *tree is an Item_cond, each of its immediate arguments that is an
+      injected IN-equality is replaced with Item_int(1). Arguments are not
+      descended into.
+    - Otherwise nothing is changed.
+*/
+
+static void remove_sj_conds(Item **tree)
+{
+  Item *cond= *tree;
+  if (!cond)
+    return;
+
+  if (is_cond_sj_in_equality(cond))
+  {
+    *tree= NULL;
+    return;
+  }
+
+  if (cond->type() != Item::COND_ITEM) 
+    return;
+
+  List_iterator<Item> arg_it(*(((Item_cond*) cond)->argument_list()));
+  Item *arg;
+  while ((arg= arg_it++))
+  {
+    if (is_cond_sj_in_equality(arg))
+      arg_it.replace(new Item_int(1));
+  }
+}
+
+/*
+  Check if given Item was injected by semi-join equality
+
+  RETURN
+    TRUE   item is an equality function (EQ_FUNC) whose in_equality_no
+           is not UINT_MAX
+    FALSE  otherwise
+*/
+static bool is_cond_sj_in_equality(Item *item)
+{
+  if (item->type() != Item::FUNC_ITEM)
+    return FALSE;
+  if (((Item_func*) item)->functype() != Item_func::EQ_FUNC)
+    return FALSE;
+
+  return test(((Item_func_eq*) item)->in_equality_no != UINT_MAX);
+}
+
+
+/*
+  Create a temporary table to weed out duplicate rowid combinations
+
+  SYNOPSIS
+
+    create_duplicate_weedout_tmp_table()
+      thd                    Thread handle
+      uniq_tuple_length_arg  Length of the table's column
+      sjtbl                  Update sjtbl->[start_]recinfo values which 
+                             will be needed if we'll need to convert the 
+                             created temptable from HEAP to MyISAM/Maria.
+
+  DESCRIPTION
+    Create a temporary table to weed out duplicate rowid combinations. The
+    table has a single column that is a concatenation of all rowids in the
+    combination. 
+
+    Depending on the needed length, there are two cases:
+
+    1. When the length of the column < max_key_length:
+
+      CREATE TABLE tmp (col VARBINARY(n) NOT NULL, UNIQUE KEY(col));
+
+    2. Otherwise (not a valid SQL syntax but internally supported):
+
+      CREATE TABLE tmp (col VARBINARY NOT NULL, UNIQUE CONSTRAINT(col));
+
+    The code in this function was produced by extraction of relevant parts
+    from create_tmp_table().
+
+  RETURN
+    created table
+    NULL on error
+*/
+
+TABLE *create_duplicate_weedout_tmp_table(THD *thd, 
+                                          uint uniq_tuple_length_arg,
+                                          SJ_TMP_TABLE *sjtbl)
+{
+  MEM_ROOT *mem_root_save, own_root;
+  TABLE *table;
+  TABLE_SHARE *share;
+  uint  temp_pool_slot=MY_BIT_NONE;
+  char	*tmpname,path[FN_REFLEN];
+  Field **reg_field;
+  KEY_PART_INFO *key_part_info;
+  KEY *keyinfo;
+  uchar *group_buff;
+  uchar *bitmaps;
+  uint *blob_field;
+  ENGINE_COLUMNDEF *recinfo, *start_recinfo;
+  bool using_unique_constraint=FALSE;
+  bool use_packed_rows= FALSE;
+  Field *field, *key_field;
+  uint blob_count, null_pack_length, null_count;
+  uchar *null_flags;
+  uchar *pos;
+  DBUG_ENTER("create_duplicate_weedout_tmp_table");
+  DBUG_ASSERT(!sjtbl->is_degenerate);
+  /*
+    STEP 1: Get temporary table name
+  */
+  statistic_increment(thd->status_var.created_tmp_tables, &LOCK_status);
+  if (use_temp_pool && !(test_flags & TEST_KEEP_TMP_TABLES))
+    temp_pool_slot = bitmap_lock_set_next(&temp_pool);
+
+  if (temp_pool_slot != MY_BIT_NONE) // we got a slot
+    sprintf(path, "%s_%lx_%i", tmp_file_prefix,
+	    current_pid, temp_pool_slot);
+  else
+  {
+    /* if we run out of slots or we are not using tempool */
+    sprintf(path,"%s%lx_%lx_%x", tmp_file_prefix,current_pid,
+            thd->thread_id, thd->tmp_table++);
+  }
+  fn_format(path, path, mysql_tmpdir, "", MY_REPLACE_EXT|MY_UNPACK_FILENAME);
+
+  /* STEP 2: Figure if we'll be using a key or blob+constraint */
+  if (uniq_tuple_length_arg >= CONVERT_IF_BIGGER_TO_BLOB)
+    using_unique_constraint= TRUE;
+
+  /* STEP 3: Allocate memory for temptable description */
+  init_sql_alloc(&own_root, TABLE_ALLOC_BLOCK_SIZE, 0);
+  if (!multi_alloc_root(&own_root,
+                        &table, sizeof(*table),
+                        &share, sizeof(*share),
+                        &reg_field, sizeof(Field*) * (1+1),
+                        &blob_field, sizeof(uint)*2,
+                        &keyinfo, sizeof(*keyinfo),
+                        &key_part_info, sizeof(*key_part_info) * 2,
+                        &start_recinfo,
+                        sizeof(*recinfo)*(1*2+4),
+                        &tmpname, (uint) strlen(path)+1,
+                        &group_buff, (!using_unique_constraint ?
+                                      uniq_tuple_length_arg : 0),
+                        &bitmaps, bitmap_buffer_size(1)*3,
+                        NullS))
+  {
+    if (temp_pool_slot != MY_BIT_NONE)
+      bitmap_lock_clear_bit(&temp_pool, temp_pool_slot);
+    DBUG_RETURN(NULL);
+  }
+  strmov(tmpname,path);
+  
+
+  /* STEP 4: Create TABLE description */
+  bzero((char*) table,sizeof(*table));
+  bzero((char*) reg_field,sizeof(Field*)*2);
+
+  table->mem_root= own_root;
+  mem_root_save= thd->mem_root;
+  thd->mem_root= &table->mem_root;
+
+  table->field=reg_field;
+  table->alias.set("weedout-tmp", sizeof("weedout-tmp")-1,
+                   table_alias_charset);
+  table->reginfo.lock_type=TL_WRITE;	/* Will be updated */
+  table->db_stat=HA_OPEN_KEYFILE+HA_OPEN_RNDFILE;
+  table->map=1;
+  table->temp_pool_slot = temp_pool_slot;
+  table->copy_blobs= 1;
+  table->in_use= thd;
+  table->quick_keys.init();
+  table->covering_keys.init();
+  table->keys_in_use_for_query.init();
+
+  table->s= share;
+  init_tmp_table_share(thd, share, "", 0, tmpname, tmpname);
+  share->blob_field= blob_field;
+  share->blob_ptr_size= portable_sizeof_char_ptr;
+  share->db_low_byte_first=1;                // True for HEAP and MyISAM
+  share->table_charset= NULL;
+  share->primary_key= MAX_KEY;               // Indicate no primary key
+  share->keys_for_keyread.init();
+  share->keys_in_use.init();
+
+  blob_count= 0;
+
+  /* Create the field */
+  {
+    /*
+      For the sake of uniformity, always use Field_varstring (altough we could
+      use Field_string for shorter keys)
+    */
+    field= new Field_varstring(uniq_tuple_length_arg, FALSE, "rowids", share,
+                               &my_charset_bin);
+    if (!field)
+      DBUG_RETURN(0);
+    field->table= table;
+    field->key_start.init(0);
+    field->part_of_key.init(0);
+    field->part_of_sortkey.init(0);
+    field->unireg_check= Field::NONE;
+    field->flags= (NOT_NULL_FLAG | BINARY_FLAG | NO_DEFAULT_VALUE_FLAG);
+    field->reset_fields();
+    field->init(table);
+    field->orig_table= NULL;
+     
+    field->field_index= 0;
+    
+    *(reg_field++)= field;
+    *blob_field= 0;
+    *reg_field= 0;
+
+    share->fields= 1;
+    share->blob_fields= 0;
+  }
+
+  uint reclength= field->pack_length();
+  if (using_unique_constraint)
+  { 
+    share->db_plugin= ha_lock_engine(0, TMP_ENGINE_HTON);
+    table->file= get_new_handler(share, &table->mem_root,
+                                 share->db_type());
+    DBUG_ASSERT(uniq_tuple_length_arg <= table->file->max_key_length());
+  }
+  else
+  {
+    share->db_plugin= ha_lock_engine(0, heap_hton);
+    table->file= get_new_handler(share, &table->mem_root,
+                                 share->db_type());
+  }
+  if (!table->file)
+    goto err;
+
+  null_count=1;
+  
+  null_pack_length= 1;
+  reclength += null_pack_length;
+
+  share->reclength= reclength;
+  {
+    uint alloc_length=ALIGN_SIZE(share->reclength + MI_UNIQUE_HASH_LENGTH+1);
+    share->rec_buff_length= alloc_length;
+    if (!(table->record[0]= (uchar*)
+                            alloc_root(&table->mem_root, alloc_length*3)))
+      goto err;
+    table->record[1]= table->record[0]+alloc_length;
+    share->default_values= table->record[1]+alloc_length;
+  }
+  setup_tmp_table_column_bitmaps(table, bitmaps);
+
+  recinfo= start_recinfo;
+  null_flags=(uchar*) table->record[0];
+  pos=table->record[0]+ null_pack_length;
+  if (null_pack_length)
+  {
+    bzero((uchar*) recinfo,sizeof(*recinfo));
+    recinfo->type=FIELD_NORMAL;
+    recinfo->length=null_pack_length;
+    recinfo++;
+    bfill(null_flags,null_pack_length,255);	// Set null fields
+
+    table->null_flags= (uchar*) table->record[0];
+    share->null_fields= null_count;
+    share->null_bytes= null_pack_length;
+  }
+  null_count=1;
+
+  {
+    //Field *field= *reg_field;
+    uint length;
+    bzero((uchar*) recinfo,sizeof(*recinfo));
+    field->move_field(pos,(uchar*) 0,0);
+
+    field->reset();
+    /*
+      Test if there is a default field value. The test for ->ptr is to skip
+      'offset' fields generated by initalize_tables
+    */
+    // Initialize the table field:
+    bzero(field->ptr, field->pack_length());
+
+    length=field->pack_length();
+    pos+= length;
+
+    /* Make entry for create table */
+    recinfo->length=length;
+    if (field->flags & BLOB_FLAG)
+      recinfo->type= FIELD_BLOB;
+    else if (use_packed_rows &&
+             field->real_type() == MYSQL_TYPE_STRING &&
+	     length >= MIN_STRING_LENGTH_TO_PACK_ROWS)
+      recinfo->type=FIELD_SKIP_ENDSPACE;
+    else
+      recinfo->type=FIELD_NORMAL;
+
+    field->set_table_name(&table->alias);
+  }
+
+  if (thd->variables.tmp_table_size == ~ (ulonglong) 0)		// No limit
+    share->max_rows= ~(ha_rows) 0;
+  else
+    share->max_rows= (ha_rows) (((share->db_type() == heap_hton) ?
+                                 min(thd->variables.tmp_table_size,
+                                     thd->variables.max_heap_table_size) :
+                                 thd->variables.tmp_table_size) /
+			         share->reclength);
+  set_if_bigger(share->max_rows,1);		// For dummy start options
+
+
+  //// keyinfo= param->keyinfo;
+  if (TRUE)
+  {
+    DBUG_PRINT("info",("Creating group key in temporary table"));
+    share->keys=1;
+    share->uniques= test(using_unique_constraint);
+    table->key_info=keyinfo;
+    keyinfo->key_part=key_part_info;
+    keyinfo->flags=HA_NOSAME;
+    keyinfo->usable_key_parts= keyinfo->key_parts= 1;
+    keyinfo->key_length=0;
+    keyinfo->rec_per_key=0;
+    keyinfo->algorithm= HA_KEY_ALG_UNDEF;
+    keyinfo->name= (char*) "weedout_key";
+    {
+      key_part_info->null_bit=0;
+      key_part_info->field=  field;
+      key_part_info->offset= field->offset(table->record[0]);
+      key_part_info->length= (uint16) field->key_length();
+      key_part_info->type=   (uint8) field->key_type();
+      key_part_info->key_type = FIELDFLAG_BINARY;
+      if (!using_unique_constraint)
+      {
+	if (!(key_field= field->new_key_field(thd->mem_root, table,
+                                              group_buff,
+                                              field->null_ptr,
+                                              field->null_bit)))
+	  goto err;
+        key_part_info->key_part_flag|= HA_END_SPACE_ARE_EQUAL; //todo need this?
+      }
+      keyinfo->key_length+=  key_part_info->length;
+    }
+  }
+
+  if (thd->is_fatal_error)			// If end of memory
+    goto err;
+  share->db_record_offset= 1;
+  table->no_rows= 1;              		// We don't need the data
+
+  // recinfo must point after last field
+  recinfo++;
+  if (share->db_type() == TMP_ENGINE_HTON)
+  {
+    if (create_internal_tmp_table(table, keyinfo, start_recinfo, &recinfo, 0))
+      goto err;
+  }
+  sjtbl->start_recinfo= start_recinfo;
+  sjtbl->recinfo=       recinfo;
+  if (open_tmp_table(table))
+    goto err;
+
+  thd->mem_root= mem_root_save;
+  DBUG_RETURN(table);
+
+err:
+  thd->mem_root= mem_root_save;
+  free_tmp_table(thd,table);                    /* purecov: inspected */
+  if (temp_pool_slot != MY_BIT_NONE)
+    bitmap_lock_clear_bit(&temp_pool, temp_pool_slot);
+  DBUG_RETURN(NULL);				/* purecov: inspected */
+}
+
+
+/*
+  SemiJoinDuplicateElimination: Reset the temporary table
+*/
+
+int do_sj_reset(SJ_TMP_TABLE *sj_tbl)
+{
+  DBUG_ENTER("do_sj_reset");
+  if (!sj_tbl->tmp_table)
+  {
+    sj_tbl->have_degenerate_row= FALSE; /* no temptable: just clear the flag */
+    DBUG_RETURN(0);
+  }
+  int rc= sj_tbl->tmp_table->file->ha_delete_all_rows();
+  DBUG_RETURN(rc);
+}
+
+/*
+  SemiJoinDuplicateElimination: Weed out duplicate row combinations
+
+  SYNOPSIS
+    do_sj_dups_weedout()
+      thd    Thread handle
+      sjtbl  Duplicate weedout table
+
+  DESCRIPTION
+    Try storing current record combination of outer tables (i.e. their
+    rowids) in the temporary table. This records the fact that we've seen 
+    this record combination and also tells us if we've seen it before.
+
+  RETURN
+    -1  Error
+    1   The row combination is a duplicate (discard it)
+    0   The row combination is not a duplicate (continue)
+*/
+
+int do_sj_dups_weedout(THD *thd, SJ_TMP_TABLE *sjtbl) 
+{
+  int error;
+  SJ_TMP_TABLE::TAB *tab= sjtbl->tabs;
+  SJ_TMP_TABLE::TAB *tab_end= sjtbl->tabs_end;
+  uchar *ptr;
+  uchar *nulls_ptr;
+
+  DBUG_ENTER("do_sj_dups_weedout");
+
+  if (sjtbl->is_degenerate)
+  {
+    /* No rowids to store: the first call passes, later ones are duplicates */
+    if (sjtbl->have_degenerate_row) 
+      DBUG_RETURN(1);
+    sjtbl->have_degenerate_row= TRUE;
+    DBUG_RETURN(0);
+  }
+
+  /* Put the rowids tuple into table->record[0], after its first byte: */
+  ptr= sjtbl->tmp_table->record[0] + 1;
+
+  // 1. Store the length 
+  if (((Field_varstring*)(sjtbl->tmp_table->field[0]))->length_bytes == 1)
+  {
+    *ptr= (uchar)(sjtbl->rowid_len + sjtbl->null_bytes);
+    ptr++;
+  }
+  else
+  {
+    int2store(ptr, sjtbl->rowid_len + sjtbl->null_bytes);
+    ptr += 2;
+  }
+
+  // 2. Zero the null bytes. They follow the length prefix, so nulls_ptr has
+  //    to be taken here; taking it earlier would point at the length bytes.
+  nulls_ptr= ptr;
+  if (sjtbl->null_bytes)
+  {
+    bzero(ptr, sjtbl->null_bytes);
+    ptr += sjtbl->null_bytes; 
+  }
+
+  // 3. Put the rowids
+  for (; tab != tab_end; tab++)
+  {
+    handler *h= tab->join_tab->table->file;
+    if (tab->join_tab->table->maybe_null && tab->join_tab->table->null_row)
+    {
+      /* It's a NULL-complemented row: flag it and store an all-zero rowid */
+      *(nulls_ptr + tab->null_byte) |= tab->null_bit;
+      bzero(ptr + tab->rowid_offset, h->ref_length);
+    }
+    else
+    {
+      /* Copy the rowid value */
+      memcpy(ptr + tab->rowid_offset, h->ref, h->ref_length);
+    }
+  }
+
+  error= sjtbl->tmp_table->file->ha_write_tmp_row(sjtbl->tmp_table->record[0]);
+  if (error)
+  {
+    /* create_internal_tmp_table_from_heap will generate error if needed */
+    if (!sjtbl->tmp_table->file->is_fatal_error(error, HA_CHECK_DUP))
+      DBUG_RETURN(1); /* Duplicate */
+    if (create_internal_tmp_table_from_heap(thd, sjtbl->tmp_table,
+                                            sjtbl->start_recinfo,
+                                            &sjtbl->recinfo, error, 1))
+      DBUG_RETURN(-1);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Setup the strategies to eliminate semi-join duplicates.
+  
+  SYNOPSIS
+    setup_semijoin_dups_elimination()
+      join           Join to process
+      options        Join options (needed to see if join buffering will be 
+                     used or not)
+      no_jbuf_after  Another bit of information re where join buffering will
+                     be used.
+
+  DESCRIPTION
+    Setup the strategies to eliminate semi-join duplicates. ATM there are 4
+    strategies:
+
+    1. DuplicateWeedout (use of temptable to remove duplicates based on rowids
+                         of row combinations)
+    2. FirstMatch (pick only the 1st matching row combination of inner tables)
+    3. LooseScan (scanning the sj-inner table in a way that groups duplicates
+                  together and picking the 1st one)
+    4. SJ-Materialization.
+    
+    The join order has "duplicate-generating ranges", and every range is
+    served by one strategy or a combination of FirstMatch with some
+    other strategy.
+    
+    "Duplicate-generating range" is defined as a range within the join order
+    that contains all of the inner tables of a semi-join. All ranges must be
+    disjoint, if tables of several semi-joins are interleaved, then the ranges
+    are joined together, which is equivalent to converting
+      SELECT ... WHERE oe1 IN (SELECT ie1 ...) AND oe2 IN (SELECT ie2 )
+    to
+      SELECT ... WHERE (oe1, oe2) IN (SELECT ie1, ie2 ... ...)
+    .
+
+    Applicability conditions are as follows:
+
+    DuplicateWeedout strategy
+    ~~~~~~~~~~~~~~~~~~~~~~~~~
+
+      (ot|nt)*  [ it ((it|ot|nt)* (it|ot))]  (nt)*
+      +------+  +=========================+  +---+
+        (1)                 (2)               (3)
+
+       (1) - Prefix of OuterTables (those that participate in 
+             IN-equality and/or are correlated with subquery) and outer 
+             Non-correlated tables.
+       (2) - The handled range. The range starts with the first sj-inner
+             table, and covers all sj-inner and outer tables 
+             Within the range,  Inner, Outer, outer non-correlated tables
+             may follow in any order.
+       (3) - The suffix of outer non-correlated tables.
+    
+    FirstMatch strategy
+    ~~~~~~~~~~~~~~~~~~~
+
+      (ot|nt)*  [ it ((it|nt)* it) ]  (nt)*
+      +------+  +==================+  +---+
+        (1)             (2)          (3)
+
+      (1) - Prefix of outer and non-correlated tables
+      (2) - The handled range, which may contain only inner and
+            non-correlated tables.
+      (3) - The suffix of outer non-correlated tables.
+
+    LooseScan strategy 
+    ~~~~~~~~~~~~~~~~~~
+
+     (ot|ct|nt) [ loosescan_tbl (ot|nt|it)* it ]  (ot|nt)*
+     +--------+   +===========+ +=============+   +------+
+        (1)           (2)          (3)              (4)
+     
+      (1) - Prefix that may contain any outer tables. The prefix must contain
+            all the non-trivially correlated outer tables. (non-trivially means
+            that the correlation is not just through the IN-equality).
+      
+      (2) - Inner table for which the LooseScan scan is performed.
+
+      (3) - The remainder of the duplicate-generating range. It is served by 
+            application of FirstMatch strategy, with the exception that
+            outer IN-correlated tables are considered to be non-correlated.
+
+      (4) - The suffix of outer and outer non-correlated tables.
+
+  
+  The choice between the strategies is made by the join optimizer (see
+  advance_sj_state() and fix_semijoin_strategies_for_picked_join_order()).
+  This function sets up all fields/structures/etc needed for execution except
+  for setup/initialization of semi-join materialization which is done in 
+  setup_sj_materialization() (todo: can't we move that to here also?)
+
+  RETURN
+    FALSE  OK 
+    TRUE   Out of memory error
+*/
+
+int setup_semijoin_dups_elimination(JOIN *join, ulonglong options, 
+                                    uint no_jbuf_after)
+{
+  uint i;
+  THD *thd= join->thd;
+  DBUG_ENTER("setup_semijoin_dups_elimination");
+
+  for (i= join->const_tables ; i < join->tables; )
+  {
+    JOIN_TAB *tab=join->join_tab + i;
+    POSITION *pos= join->best_positions + i;
+    uint keylen, keyno;
+    switch (pos->sj_strategy) {
+      case SJ_OPT_MATERIALIZE:
+      case SJ_OPT_MATERIALIZE_SCAN:
+        /* Nothing to set up here, just step over the range */
+        i+= pos->n_sj_tables;
+        break;
+      case SJ_OPT_LOOSE_SCAN:
+      {
+        /* We jump from the last table to the first one */
+        tab->loosescan_match_tab= tab + pos->n_sj_tables - 1;
+
+        /* Key length = sum of store_length of the used loose scan parts */
+        keylen= 0;
+        keyno= pos->loosescan_key;
+        for (uint kp=0; kp < pos->loosescan_parts; kp++)
+          keylen += tab->table->key_info[keyno].key_part[kp].store_length;
+
+        tab->loosescan_key_len= keylen;
+        if (pos->n_sj_tables > 1) 
+          tab[pos->n_sj_tables - 1].do_firstmatch= tab;
+        i+= pos->n_sj_tables;
+        break;
+      }
+      case SJ_OPT_DUPS_WEEDOUT:
+      {
+        /*
+          Check for join buffering. If there is one, move the first table
+          forwards, but do not destroy other duplicate elimination methods.
+        */
+        uint first_table= i;
+        uint join_cache_level= join->thd->variables.join_cache_level;
+        for (uint j= i; j < i + pos->n_sj_tables; j++)
+        {
+          /*
+            When we'll properly take join buffering into account during
+            join optimization, the below check should be changed to 
+            "if (join->best_positions[j].use_join_buffer && 
+                 j <= no_jbuf_after)".
+            For now, use a rough criteria on table j's own access type:
+          */
+          JOIN_TAB *js_tab=join->join_tab + j; 
+          if (j != join->const_tables && js_tab->use_quick != 2 &&
+              j <= no_jbuf_after &&
+              ((js_tab->type == JT_ALL && join_cache_level != 0) ||
+               (join_cache_level > 4 && (js_tab->type == JT_REF || 
+                                         js_tab->type == JT_EQ_REF))))
+          {
+            /* Looks like we'll be using join buffer */
+            first_table= join->const_tables;
+            break;
+          }
+        }
+
+        SJ_TMP_TABLE::TAB sjtabs[MAX_TABLES];
+        SJ_TMP_TABLE::TAB *last_tab= sjtabs;
+        uint jt_rowid_offset= 0; // # tuple bytes are already occupied (w/o NULL bytes)
+        uint jt_null_bits= 0;    // # null bits in tuple bytes
+        /*
+          Walk through the range and remember the tables whose rowids go into
+          the temptable. NOTE(review): null_bit gets a bit index here but
+          do_sj_dups_weedout() ORs it in as a mask; confirm the encoding.
+        */
+        for (JOIN_TAB *j=join->join_tab + first_table; 
+             j < join->join_tab + i + pos->n_sj_tables; j++)
+        {
+          if (sj_table_is_included(join, j))
+          {
+            last_tab->join_tab= j;
+            last_tab->rowid_offset= jt_rowid_offset;
+            jt_rowid_offset += j->table->file->ref_length;
+            if (j->table->maybe_null)
+            {
+              last_tab->null_byte= jt_null_bits / 8;
+              last_tab->null_bit= jt_null_bits++;
+            }
+            last_tab++;
+            j->table->prepare_for_position();
+            j->keep_current_rowid= TRUE;
+          }
+        }
+
+        SJ_TMP_TABLE *sjtbl;
+        if (jt_rowid_offset) /* Temptable has at least one rowid */
+        {
+          uint tabs_size= (last_tab - sjtabs) * sizeof(SJ_TMP_TABLE::TAB);
+          if (!(sjtbl= (SJ_TMP_TABLE*)thd->alloc(sizeof(SJ_TMP_TABLE))) ||
+              !(sjtbl->tabs= (SJ_TMP_TABLE::TAB*) thd->alloc(tabs_size)))
+            DBUG_RETURN(TRUE); /* purecov: inspected */
+          memcpy(sjtbl->tabs, sjtabs, tabs_size);
+          sjtbl->is_degenerate= FALSE;
+          sjtbl->tabs_end= sjtbl->tabs + (last_tab - sjtabs);
+          sjtbl->rowid_len= jt_rowid_offset;
+          sjtbl->null_bits= jt_null_bits;
+          sjtbl->null_bytes= (jt_null_bits + 7)/8;
+          uint tuple_len= sjtbl->rowid_len + sjtbl->null_bytes;
+          sjtbl->tmp_table= create_duplicate_weedout_tmp_table(thd, tuple_len,
+                                                               sjtbl);
+          if (!sjtbl->tmp_table)
+            DBUG_RETURN(TRUE); /* creation failed: never list a NULL table */
+          join->sj_tmp_tables.push_back(sjtbl->tmp_table);
+        }
+        else
+        {
+          /* 
+            This is a special case where the entire subquery predicate does 
+            not depend on anything at all, ie this is 
+              WHERE const IN (uncorrelated select)
+          */
+          if (!(sjtbl= (SJ_TMP_TABLE*)thd->alloc(sizeof(SJ_TMP_TABLE))))
+            DBUG_RETURN(TRUE); /* purecov: inspected */
+          sjtbl->tmp_table= NULL;
+          sjtbl->is_degenerate= TRUE;
+          sjtbl->have_degenerate_row= FALSE;
+        }
+        join->join_tab[first_table].flush_weedout_table= sjtbl;
+        join->join_tab[i + pos->n_sj_tables - 1].check_weed_out_table= sjtbl;
+
+        i+= pos->n_sj_tables;
+        break;
+      }
+      case SJ_OPT_FIRST_MATCH:
+      {
+        JOIN_TAB *j, *jump_to= tab-1;
+        for (j= tab; j != tab + pos->n_sj_tables; j++)
+        {
+          /*
+            NOTE: this loop probably doesn't do the right thing for the case 
+            where FirstMatch's duplicate-generating range is interleaved with
+            "unrelated" tables (as specified in WL#3750, section 2.2).
+          */
+          if (!j->emb_sj_nest)
+            jump_to= tab;
+          else
+          {
+            j->first_sj_inner_tab= tab;
+            j->last_sj_inner_tab= tab + pos->n_sj_tables - 1;
+          }
+        }
+        j[-1].do_firstmatch= jump_to;
+        i+= pos->n_sj_tables;
+        break;
+      }
+      case SJ_OPT_NONE:
+        i++;
+        break;
+    }
+  }
+  DBUG_RETURN(FALSE);
+}
+
+
+/*
+  Destroy all temporary tables created by NL-semijoin runtime
+*/
+
+void destroy_sj_tmp_tables(JOIN *join)
+{
+  THD *thd= join->thd;
+  List_iterator<TABLE> tmp_tables_it(join->sj_tmp_tables);
+  for (TABLE *tmp_tbl= tmp_tables_it++; tmp_tbl; tmp_tbl= tmp_tables_it++)
+  {
+    /*
+      SJ-Materialization tables are set up for a sequential or an index
+      read while DuplicateWeedout tables are only written to, hence the
+      generic ha_index_or_rnd_end() call before freeing the table.
+    */
+    tmp_tbl->file->ha_index_or_rnd_end();
+    free_tmp_table(thd, tmp_tbl);
+  }
+  join->sj_tmp_tables.empty();
+  join->sjm_info_list.empty();
+}
+
+
+/*
+  Remove all records from all temp tables used by NL-semijoin runtime
+
+  SYNOPSIS
+    clear_sj_tmp_tables()
+      join  The join to remove tables for
+
+  DESCRIPTION
+    Remove all records from all temp tables used by NL-semijoin runtime. This 
+    must be done before every join re-execution.
+*/
+
+int clear_sj_tmp_tables(JOIN *join)
+{
+  /* Empty every temptable; stop at the first handler error and return it */
+  List_iterator<TABLE> tables_it(join->sj_tmp_tables);
+  for (TABLE *tmp_tbl= tables_it++; tmp_tbl; tmp_tbl= tables_it++)
+  {
+    int err= tmp_tbl->file->ha_delete_all_rows();
+    if (err)
+      return err; /* purecov: inspected */
+  }
+
+  /* Mark every SJ-materialization info entry as not materialized */
+  List_iterator<SJ_MATERIALIZATION_INFO> sjm_it(join->sjm_info_list);
+  SJ_MATERIALIZATION_INFO *sjm_info;
+  while ((sjm_info= sjm_it++))
+    sjm_info->materialized= FALSE;
+
+  return 0;
+}
+
+
+/*
+  Check if the table's rowid is included in the temptable
+
+  SYNOPSIS
+    sj_table_is_included()
+      join      The join
+      join_tab  The table to be checked
+
+  DESCRIPTION
+    SemiJoinDuplicateElimination: check if the table's rowid should be
+    included in the temptable. This is so if both of these hold:
+
+    1. The table is not embedded within some semi-join nest (i.e. it is an
+       outer table or has been pulled out of a semi-join nest), and
+    2. The table is not functionally dependent (through eq_ref access) on
+       tables that are within the same join nest
+
+    [3. This is also true for constant tables that can't be
+        NULL-complemented but this function is not called for such tables]
+
+  RETURN
+    TRUE  - Include table's rowid
+    FALSE - Don't
+*/
+
+static bool sj_table_is_included(JOIN *join, JOIN_TAB *join_tab)
+{
+  /* Tables that sit inside a semi-join nest do not contribute a rowid */
+  if (join_tab->emb_sj_nest)
+    return FALSE;
+
+  TABLE_LIST *own_nest= join_tab->table->pos_in_table_list->embedding;
+  /* Without eq_ref access the table is not functionally dependent */
+  if (join_tab->type != JT_EQ_REF)
+    return TRUE;
+
+  /* Collect the tables that the eq_ref lookup key is computed from */
+  table_map key_tables= 0;
+  for (uint part= 0; part < join_tab->ref.key_parts; part++)
+    key_tables |= join_tab->ref.items[part]->used_tables();
+
+  /*
+    The table counts as functionally dependent only when every table used
+    by the key has the same embedding nest as this table; one table from
+    another nest is enough to require the rowid.
+  */
+  uint tbl_no;
+  Table_map_iterator bits(key_tables & ~PSEUDO_TABLE_BITS);
+  while ((tbl_no= bits.next_bit()) != Table_map_iterator::BITMAP_END)
+  {
+    JOIN_TAB *key_tab= join->map2table[tbl_no];
+    if (key_tab->table->pos_in_table_list->embedding != own_nest)
+      return TRUE;
+  }
+  return FALSE;
+}
+
+
+/*
+  Index lookup-based subquery: save some flags for EXPLAIN output
+
+  SYNOPSIS
+    save_index_subquery_explain_info()
+      join_tab  Subquery's join tab (there is only one as index lookup is
+                only used for subqueries that are single-table SELECTs)
+      where     Subquery's WHERE clause
+
+  DESCRIPTION
+    For index lookup-based subquery (i.e. one executed with
+    subselect_uniquesubquery_engine or subselect_indexsubquery_engine),
+    check whether its EXPLAIN output row should contain
+      "Using index" (TAB_INFO_USING_INDEX)
+      "Using Where" (TAB_INFO_USING_WHERE)
+      "Full scan on NULL key" (TAB_INFO_FULL_SCAN_ON_NULL)
+    and set appropriate flags in join_tab->packed_info.
+*/
+
+static void save_index_subquery_explain_info(JOIN_TAB *join_tab, Item* where)
+{
+  /* Start from TAB_INFO_HAVE_VALUE, then OR in every flag that applies */
+  join_tab->packed_info= TAB_INFO_HAVE_VALUE;
+  if (where)
+    join_tab->packed_info |= TAB_INFO_USING_WHERE;
+  if (join_tab->table->covering_keys.is_set(join_tab->ref.key))
+    join_tab->packed_info |= TAB_INFO_USING_INDEX;
+
+  /* A single key part with a cond guard is enough for the last flag */
+  uint part= 0;
+  while (part < join_tab->ref.key_parts && !join_tab->ref.cond_guards[part])
+    part++;
+  if (part < join_tab->ref.key_parts)
+    join_tab->packed_info |= TAB_INFO_FULL_SCAN_ON_NULL;
+}
+
+
+/*
+  Check if the join can be rewritten to [unique_]indexsubquery_engine
+
+  DESCRIPTION
+    Check if the join can be changed into [unique_]indexsubquery_engine.
+
+    The check is done after join optimization, the idea is that if the join
+    has only one table and uses a [eq_]ref access generated from subselect's
+    IN-equality then we replace it with a subselect_indexsubquery_engine or a
+    subselect_uniquesubquery_engine.
+
+  RETURN 
+    0 - Ok, rewrite done (stop join optimization and return)
+    1 - Fatal error (stop join optimization and return)
+   -1 - No rewrite performed, continue with join optimization
+*/
+
+int rewrite_to_index_subquery_engine(JOIN *join)
+{
+  THD *thd= join->thd;
+  JOIN_TAB* join_tab=join->join_tab;
+  SELECT_LEX_UNIT *unit= join->unit;
+  DBUG_ENTER("rewrite_to_index_subquery_engine");
+
+  /* Only a simple IN subquery over one table with a WHERE is considered */
+  if (join->group_list || join->order ||
+      !join->unit->item ||
+      join->unit->item->substype() != Item_subselect::IN_SUBS ||
+      join->tables != 1 || !join->conds ||
+      join->unit->is_union())
+    DBUG_RETURN(-1); /* Haven't done the rewrite */
+
+  if (join->having)
+  {
+    /* With HAVING: needs ref_or_null access and the in_having_cond item */
+    if (join_tab[0].type != JT_REF_OR_NULL ||
+        join_tab[0].ref.items[0]->name != in_left_expr_name ||
+        join->having->name != in_having_cond)
+      DBUG_RETURN(-1); /* Haven't done the rewrite */
+
+    join_tab[0].type= JT_INDEX_SUBQUERY;
+    join->error= 0;
+    join->conds= remove_additional_cond(join->conds);
+    save_index_subquery_explain_info(join_tab, join->conds);
+    DBUG_RETURN(unit->item->
+                change_engine(new
+                              subselect_indexsubquery_engine(thd,
+                                                             join_tab,
+                                                             unit->item,
+                                                             join->conds,
+                                                             join->having,
+                                                             1)));
+  }
+
+  /* No HAVING: accept eq_ref or ref access keyed on the IN left expression */
+  bool unique_lookup= (join_tab[0].type == JT_EQ_REF);
+  if ((!unique_lookup && join_tab[0].type != JT_REF) ||
+      join_tab[0].ref.items[0]->name != in_left_expr_name)
+    DBUG_RETURN(-1); /* Haven't done the rewrite */
+
+  Item *where= join->conds;
+  remove_subq_pushed_predicates(join, &where);
+  save_index_subquery_explain_info(join_tab, where);
+  join->error= 0;
+
+  if (unique_lookup)
+  {
+    join_tab[0].type= JT_UNIQUE_SUBQUERY;
+    DBUG_RETURN(unit->item->
+                change_engine(new
+                              subselect_uniquesubquery_engine(thd,
+                                                              join_tab,
+                                                              unit->item,
+                                                              where)));
+  }
+  join_tab[0].type= JT_INDEX_SUBQUERY;
+  DBUG_RETURN(unit->item->
+              change_engine(new
+                            subselect_indexsubquery_engine(thd,
+                                                           join_tab,
+                                                           unit->item,
+                                                           where,
+                                                           NULL,
+                                                           0)));
+}
+
+
+/**
+  Remove additional condition inserted by IN/ALL/ANY transformation.
+
+  @param conds   condition for processing
+
+  @return
+    new conditions
+*/
+
+static Item *remove_additional_cond(Item* conds)
+{
+  /* The whole condition is the additional one: nothing remains */
+  if (conds->name == in_additional_cond)
+    return 0;
+
+  if (conds->type() != Item::COND_ITEM)
+    return conds;
+
+  /* Search the direct arguments of the Item_cond for the additional one */
+  List<Item> *args= ((Item_cond*) conds)->argument_list();
+  List_iterator<Item> args_it(*args);
+  for (Item *arg= args_it++; arg; arg= args_it++)
+  {
+    if (arg->name != in_additional_cond)
+      continue;
+    /* Unlink it; if one argument is left, that argument is the result */
+    args_it.remove();
+    return (args->elements == 1) ? args->head() : conds;
+  }
+  return conds;
+}
+
+
+/*
+  Remove the predicates pushed down into the subquery
+
+  SYNOPSIS
+    remove_subq_pushed_predicates()
+      where   IN  The subquery's WHERE condition (callers pass join->conds)
+              OUT The remaining WHERE condition, or NULL
+
+  DESCRIPTION
+    Given that this join will be executed using (unique|index)_subquery,
+    without "checking NULL", remove the predicates that were pushed down
+    into the subquery.
+
+    If the subquery compares scalar values, we can remove the condition that
+    was wrapped into trig_cond (it will be checked when needed by the subquery
+    engine)
+
+    If the subquery compares row values, we need to keep the wrapped
+    equalities in the WHERE clause: when the left (outer) tuple has both NULL
+    and non-NULL values, we'll do a full table scan and will rely on the
+    equalities corresponding to non-NULL parts of left tuple to filter out
+    non-matching records.
+
+    TODO: We can remove the equalities that will be guaranteed to be true by the
+    fact that subquery engine will be using index lookup. This must be done only
+    for cases where there are no conversion errors of significance, e.g. 257
+    that is searched in a byte. But this requires homogenization of the return 
+    codes of all Field*::store() methods.
+*/
+
+static void remove_subq_pushed_predicates(JOIN *join, Item **where)
+{
+  /* Only a WHERE that is one "ref = field" equality can be dropped */
+  if (join->conds->type() != Item::FUNC_ITEM)
+    return;
+  Item_func *eq_func= (Item_func *) join->conds;
+  if (eq_func->functype() != Item_func::EQ_FUNC)
+    return;
+  Item **eq_args= eq_func->arguments();
+  if (eq_args[0]->type() == Item::REF_ITEM &&
+      eq_args[1]->type() == Item::FIELD_ITEM &&
+      test_if_ref(join->conds, (Item_field *) eq_args[1], eq_args[0]))
+    *where= 0; /* test_if_ref() accepted it: no WHERE is left to check */
+}
+
+
diff --git a/sql/opt_subselect.h b/sql/opt_subselect.h
new file mode 100644
index 00000000000..47d85d5c38d
--- /dev/null
+++ b/sql/opt_subselect.h
@@ -0,0 +1,369 @@
+/* */
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface			/* gcc class implementation */
+#endif
+
+int check_and_do_in_subquery_rewrites(JOIN *join);
+bool convert_join_subqueries_to_semijoins(JOIN *join);
+int pull_out_semijoin_tables(JOIN *join);
+bool optimize_semijoin_nests(JOIN *join, table_map all_table_map);
+
+// used by Loose_scan_opt
+ulonglong get_bound_sj_equalities(TABLE_LIST *sj_nest, 
+                                  table_map remaining_tables);
+
+/*
+  This is a class for considering possible loose index scan optimizations.
+  Its usage pattern is as follows:
+    best_access_path()
+    {
+       Loose_scan_opt opt;
+
+       opt.init()
+       for each index we can do ref access with
+       {
+         opt.next_ref_key();
+         for each keyuse 
+           opt.add_keyuse();
+         opt.check_ref_access_part1(); opt.check_ref_access_part2();
+       }
+
+       if (some criteria for range scans)
+         opt.check_range_access();
+       
+       opt.save_to_position();
+    }
+*/
+
+class Loose_scan_opt
+{
+public:
+  /* All methods must check this before doing anything else */
+  bool try_loosescan;
+
+  /*
+    If we consider (oe1, .. oeN) IN (SELECT ie1, .. ieN) then ieK=oeK is
+    called sj-equality. If oeK depends only on preceding tables then such
+    equality is called 'bound'.
+  */
+  ulonglong bound_sj_equalities;
+ 
+  /* Properties of the ref access now considered (see next_ref_key()): */
+  ulonglong handled_sj_equalities;
+  key_part_map loose_scan_keyparts;
+  uint max_loose_keypart;
+  bool part1_conds_met;
+
+  /*
+    Quick (range) select is a special case, noted in check_ref_access_part1():
+  */
+  uint quick_uses_applicable_index;
+  uint quick_max_loose_keypart;
+  
+  /* Best loose scan so far; valid only if best_loose_scan_cost != DBL_MAX */
+  uint   best_loose_scan_key;
+  double best_loose_scan_cost;
+  double best_loose_scan_records;
+  KEYUSE *best_loose_scan_start_key;  /* NULL when chosen by range access */
+
+  uint best_max_loose_keypart;
+
+  Loose_scan_opt():
+    try_loosescan(FALSE),
+    bound_sj_equalities(0),
+    quick_uses_applicable_index(FALSE)
+  {
+    UNINIT_VAR(quick_max_loose_keypart); /* Protected by quick_uses_applicable_index */
+    /* The following are protected by best_loose_scan_cost!= DBL_MAX */
+    UNINIT_VAR(best_loose_scan_key);
+    UNINIT_VAR(best_loose_scan_records);
+    UNINIT_VAR(best_max_loose_keypart);
+    UNINIT_VAR(best_loose_scan_start_key);
+  }
+  /* Reset the best cost and decide whether LooseScan is tried for table s */
+  void init(JOIN *join, JOIN_TAB *s, table_map remaining_tables)
+  {
+    /*
+      Discover the bound equalities. We need to do this if
+        1. The next table is an SJ-inner table, and
+        2. It is the first table from that semijoin, and
+        3. We're not within a semi-join range (i.e. all semi-joins either have
+           all or none of their tables in join_table_map), except
+           s->emb_sj_nest (which we've just entered, see #2).
+        4. All non-IN-equality correlation references from this sj-nest are 
+           bound
+        5. But some of the IN-equalities aren't (so this can't be handled by 
+           FirstMatch strategy)
+    */
+    best_loose_scan_cost= DBL_MAX;
+    if (!join->emb_sjm_nest && s->emb_sj_nest &&                        // (1)
+        s->emb_sj_nest->sj_in_exprs < 64 && 
+        ((remaining_tables & s->emb_sj_nest->sj_inner_tables) ==        // (2)
+         s->emb_sj_nest->sj_inner_tables) &&                            // (2)
+        join->cur_sj_inner_tables == 0 &&                                  // (3)
+        !(remaining_tables & 
+          s->emb_sj_nest->nested_join->sj_corr_tables) &&               // (4)
+        remaining_tables & s->emb_sj_nest->nested_join->sj_depends_on &&// (5)
+        optimizer_flag(join->thd, OPTIMIZER_SWITCH_LOOSE_SCAN))
+    {
+      /* This table is a LooseScan candidate */
+      bound_sj_equalities= get_bound_sj_equalities(s->emb_sj_nest, 
+                                                   remaining_tables);
+      try_loosescan= TRUE;
+      DBUG_PRINT("info", ("Will try LooseScan scan, bound_map=%llx",
+                          (longlong)bound_sj_equalities));
+    }
+  }
+  /* Clear the per-key accumulators before examining another ref key */
+  void next_ref_key()
+  {
+    handled_sj_equalities=0;
+    loose_scan_keyparts= 0;
+    max_loose_keypart= 0;
+    part1_conds_met= FALSE;
+  }
+  /* Classify the sj-equality behind one keyuse as 'bound' or 'handled' */
+  void add_keyuse(table_map remaining_tables, KEYUSE *keyuse)
+  {
+    if (try_loosescan && keyuse->sj_pred_no != UINT_MAX)
+    {
+      if (!(remaining_tables & keyuse->used_tables))
+      {
+        /* 
+          This allows to use equality propagation to infer that some 
+          sj-equalities are bound.
+        */
+        bound_sj_equalities |= 1ULL << keyuse->sj_pred_no;
+      }
+      else
+      {
+        handled_sj_equalities |= 1ULL << keyuse->sj_pred_no;
+        loose_scan_keyparts |= ((key_part_map)1) << keyuse->keypart;
+        set_if_bigger(max_loose_keypart, keyuse->keypart);
+      }
+    }
+  }
+  /* Has the current ref key handled any sj-equality (see add_keyuse())? */
+  bool have_a_case() { return test(handled_sj_equalities); }
+
+  void check_ref_access_part1(JOIN_TAB *s, uint key, KEYUSE *start_key, 
+                              table_map found_part)
+  {
+    /*
+      Check if we can use LooseScan semi-join strategy. We can if
+      1. This is the right table at right location
+      2. All IN-equalities are either
+         - "bound", ie. the outer_expr part refers to the preceding tables
+         - "handled", ie. covered by the index we're considering
+      3. Index order allows to enumerate subquery's duplicate groups in
+         order. This happens when the index definition matches this
+         pattern:
+
+           (handled_col|bound_col)* (other_col|bound_col)
+
+    */
+    if (try_loosescan &&                                       // (1)
+        (handled_sj_equalities | bound_sj_equalities) ==         // (2)
+        PREV_BITS(ulonglong, s->emb_sj_nest->sj_in_exprs) &&     // (2)
+        (PREV_BITS(key_part_map, max_loose_keypart+1) &        // (3)
+         (found_part | loose_scan_keyparts)) ==                // (3)
+         (found_part | loose_scan_keyparts) &&                 // (3)
+        !key_uses_partial_cols(s->table, key))
+    {
+      /* Ok, can use the strategy */
+      part1_conds_met= TRUE;
+      if (s->quick && s->quick->index == key && 
+          s->quick->get_type() == QUICK_SELECT_I::QS_TYPE_RANGE)
+      {
+        quick_uses_applicable_index= TRUE;
+        quick_max_loose_keypart= max_loose_keypart;
+      }
+      DBUG_PRINT("info", ("Can use LooseScan scan"));
+
+      /* 
+        Check if this is a special case where there are no usable bound
+        IN-equalities, i.e. we have
+
+          outer_expr IN (SELECT innertbl.key FROM ...) 
+        
+        and outer_expr cannot be evaluated yet, so it's actually full
+        index scan and not a ref access
+      */
+      if (!(found_part & 1 ) && /* no usable ref access for 1st key part */
+          s->table->covering_keys.is_set(key))
+      {
+        DBUG_PRINT("info", ("Can use full index scan for LooseScan"));
+        
+        /* Calculate the cost of complete loose index scan.  */
+        double records= rows2double(s->table->file->stats.records);
+
+        /* The cost is that of the entire index scan (not divided by 2) */
+        double read_time= s->table->file->keyread_time(key, 1,
+                                                       (ha_rows) records);
+
+        /*
+          Now find out how many different keys we will get (for now we
+          ignore the fact that we have "keypart_i=const" restriction for
+          some key components, that may make us think that loose
+          scan will produce more distinct records than it actually will)
+        */
+        ulong rpc;
+        if ((rpc= s->table->key_info[key].rec_per_key[max_loose_keypart]))
+          records= records / rpc;
+
+        // TODO: previous version also did /2
+        if (read_time < best_loose_scan_cost)
+        {
+          best_loose_scan_key= key;
+          best_loose_scan_cost= read_time;
+          best_loose_scan_records= records;
+          best_max_loose_keypart= max_loose_keypart;
+          best_loose_scan_start_key= start_key;
+        }
+      }
+    }
+  }
+  /* Record ref access on 'key' as the best plan if part1 passed and cheaper */
+  void check_ref_access_part2(uint key, KEYUSE *start_key, double records, 
+                              double read_time)
+  {
+    if (part1_conds_met && read_time < best_loose_scan_cost)
+    {
+      /* TODO use rec-per-key-based fanout calculations */
+      best_loose_scan_key= key;
+      best_loose_scan_cost= read_time;
+      best_loose_scan_records= records;
+      best_max_loose_keypart= max_loose_keypart;
+      best_loose_scan_start_key= start_key;
+    }
+  }
+  /* Record the quick select as the best plan if it is applicable and cheaper */
+  void check_range_access(JOIN *join, uint idx, QUICK_SELECT_I *quick)
+  {
+    /* TODO: is this the right restriction? (idx == join->const_tables) */
+    if (quick_uses_applicable_index && idx == join->const_tables && 
+        quick->read_time < best_loose_scan_cost)
+    {
+      best_loose_scan_key= quick->index;
+      best_loose_scan_cost= quick->read_time;
+      /* this is ok because idx == join->const_tables */
+      best_loose_scan_records= rows2double(quick->records);
+      best_max_loose_keypart= quick_max_loose_keypart;
+      best_loose_scan_start_key= NULL;
+    }
+  }
+  /* Copy the best plan to *pos; only read_time is set if none was found */
+  void save_to_position(JOIN_TAB *tab, POSITION *pos)
+  {
+    pos->read_time=       best_loose_scan_cost;
+    if (best_loose_scan_cost != DBL_MAX)
+    {
+      pos->records_read=    best_loose_scan_records;
+      pos->key=             best_loose_scan_start_key;
+      pos->loosescan_key=   best_loose_scan_key;
+      pos->loosescan_parts= best_max_loose_keypart + 1;
+      pos->use_join_buffer= FALSE;
+      pos->table=           tab;
+      // todo need ref_depend_map ?
+      DBUG_PRINT("info", ("Produced a LooseScan plan, key %s, %s",
+                          tab->table->key_info[best_loose_scan_key].name,
+                          best_loose_scan_start_key? "(ref access)":
+                                                     "(range/index access)"));
+    }
+  }
+};
+
+
+void advance_sj_state(JOIN *join, const table_map remaining_tables, 
+                      const JOIN_TAB *new_join_tab, uint idx, 
+                      double *current_record_count, double *current_read_time,
+                      POSITION *loose_scan_pos);
+void restore_prev_sj_state(const table_map remaining_tables, 
+                                  const JOIN_TAB *tab, uint idx);
+
+void fix_semijoin_strategies_for_picked_join_order(JOIN *join);
+bool setup_sj_materialization(JOIN_TAB *tab);
+
+TABLE *create_duplicate_weedout_tmp_table(THD *thd, uint uniq_tuple_length_arg,
+                                          SJ_TMP_TABLE *sjtbl);
+int do_sj_reset(SJ_TMP_TABLE *sj_tbl);
+int do_sj_dups_weedout(THD *thd, SJ_TMP_TABLE *sjtbl);
+
+/*
+  Temporary table used by semi-join DuplicateElimination strategy
+
+  This consists of the temptable itself and data needed to put records
+  into it. The table's DDL is as follows:
+
+    CREATE TABLE tmptable (col VARCHAR(n) BINARY, PRIMARY KEY(col));
+
+  where the primary key can be replaced with unique constraint if n exceeds
+  the limit (as it is always done for query execution-time temptables).
+
+  The record value is a concatenation of rowids of tables from the join we're
+  executing. If a join table is on the inner side of the outer join, we
+  assume that its rowid can be NULL and provide means to store this rowid in
+  the tuple.
+*/
+
+class SJ_TMP_TABLE : public Sql_alloc
+{
+public:
+  /*
+    Descriptor of one join table whose rowid is a part of the temporary
+    table record; 'tabs' below points to an array of these descriptors.
+  */
+  class TAB
+  {
+  public:
+    JOIN_TAB *join_tab;
+    uint rowid_offset;  /* presumably the rowid's offset in the record */
+    ushort null_byte;   /* presumably locates the NULL flag - TODO confirm */
+    uchar null_bit;     /* presumably the flag's bit in null_byte - confirm */
+  };
+  TAB *tabs;
+  TAB *tabs_end;        /* presumably one past the last TAB - TODO confirm */
+  
+  /* 
+    is_degenerate==TRUE means this is a special case where the temptable record
+    has zero length (and presence of a unique key means that the temptable can
+    have either 0 or 1 records). 
+    In this case we don't create the physical temptable but instead record
+    its state in SJ_TMP_TABLE::have_degenerate_row.
+  */
+  bool is_degenerate;
+
+  /* 
+    When is_degenerate==TRUE: the contents of the table (whether it has the
+    record or not).
+  */
+  bool have_degenerate_row;
+  
+  /* table record parameters */
+  uint null_bits;
+  uint null_bytes;
+  uint rowid_len;
+
+  /* The temporary table itself (NULL means not created yet) */
+  TABLE *tmp_table;
+  
+  /*
+    These are the members we got from temptable creation code. We'll need
+    them if we'll need to convert table from HEAP to MyISAM/Maria.
+  */
+  ENGINE_COLUMNDEF *start_recinfo;
+  ENGINE_COLUMNDEF *recinfo;
+
+  /* Next table in the list (old note: next->start_idx > this->end_idx) */
+  SJ_TMP_TABLE *next; 
+};
+
+int setup_semijoin_dups_elimination(JOIN *join, ulonglong options, 
+                                    uint no_jbuf_after);
+void destroy_sj_tmp_tables(JOIN *join);
+int clear_sj_tmp_tables(JOIN *join);
+int rewrite_to_index_subquery_engine(JOIN *join);
+
+
+
diff --git a/sql/opt_table_elimination.cc b/sql/opt_table_elimination.cc
index fdf818abb8e..7497395d628 100644
--- a/sql/opt_table_elimination.cc
+++ b/sql/opt_table_elimination.cc
@@ -725,7 +725,11 @@ eliminate_tables_for_list(JOIN *join, List<TABLE_LIST> *join_list,
     }
     else
     {
-      DBUG_ASSERT(!tbl->nested_join);
+      DBUG_ASSERT(!tbl->nested_join || tbl->sj_on_expr);
+      //psergey-todo: is the following really correct or we'll need to descend
+      //down all ON clauses: ? 
+      if (tbl->sj_on_expr)
+        tables_used_on_left |= tbl->sj_on_expr->used_tables();
     }
   }
 
@@ -918,8 +922,9 @@ public:
   Field_dependency_recorder(Dep_analysis_context *ctx_arg): ctx(ctx_arg)
   {}
   
-  void visit_field(Field *field)
+  void visit_field(Item_field *item)
   {
+    Field *field= item->field;
     Dep_value_table *tbl_dep;
     if ((tbl_dep= ctx->table_deps[field->table->tablenr]))
     {
@@ -1227,7 +1232,7 @@ void build_eq_mods_for_cond(Dep_analysis_context *ctx,
     if (fvl->elements)
     {
       
-      exchange_sort<Dep_value_field>(fvl, compare_field_values, NULL);
+      bubble_sort<Dep_value_field>(fvl, compare_field_values, NULL);
       add_module_expr(ctx, eq_mod, *and_level, NULL, bound_item, fvl);
     }
     break;
diff --git a/sql/procedure.h b/sql/procedure.h
index 30a8a0efccb..488d461905e 100644
--- a/sql/procedure.h
+++ b/sql/procedure.h
@@ -13,6 +13,8 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
+#ifndef PROCEDURE_INCLUDED
+#define PROCEDURE_INCLUDED
 
 /* When using sql procedures */
 
@@ -153,3 +155,5 @@ public:
 
 Procedure *setup_procedure(THD *thd,ORDER *proc_param,select_result *result,
 			   List<Item> &field_list,int *error);
+
+#endif /* PROCEDURE_INCLUDED */
diff --git a/sql/set_var.cc b/sql/set_var.cc
index 5a30bd1c0f4..2d199366d5a 100644
--- a/sql/set_var.cc
+++ b/sql/set_var.cc
@@ -321,6 +321,11 @@ static sys_var_thd_ulong	sys_interactive_timeout(&vars, "interactive_timeout",
 						&SV::net_interactive_timeout);
 static sys_var_thd_ulong	sys_join_buffer_size(&vars, "join_buffer_size",
 					     &SV::join_buff_size);
+static sys_var_thd_ulonglong    sys_join_buffer_space_limit(&vars,
+                                                "join_buffer_space_limit",
+					        &SV::join_buff_space_limit);
+static sys_var_thd_ulong	sys_join_cache_level(&vars, "join_cache_level",
+					             &SV::join_cache_level);
 static sys_var_key_buffer_size	sys_key_buffer_size(&vars, "key_buffer_size");
 static sys_var_key_cache_long  sys_key_cache_block_size(&vars,
                                    "key_cache_block_size",
@@ -450,8 +455,6 @@ static sys_var_long_ptr	sys_max_write_lock_count(&vars, "max_write_lock_count",
 						 &max_write_lock_count);
 static sys_var_thd_ulong       sys_min_examined_row_limit(&vars, "min_examined_row_limit",
                                                           &SV::min_examined_row_limit);
-static sys_var_thd_ulong       sys_multi_range_count(&vars, "multi_range_count",
-                                              &SV::multi_range_count);
 static sys_var_long_ptr	sys_myisam_data_pointer_size(&vars, "myisam_data_pointer_size",
                                                     &myisam_data_pointer_size);
 static sys_var_thd_ulonglong	sys_myisam_max_sort_file_size(&vars, "myisam_max_sort_file_size", &SV::myisam_max_sort_file_size, fix_myisam_max_sort_file_size, 1);
@@ -504,6 +507,18 @@ static sys_var_thd_ulong        sys_optimizer_search_depth(&vars, "optimizer_sea
                                                    &SV::optimizer_search_depth);
 static sys_var_thd_optimizer_switch   sys_optimizer_switch(&vars, "optimizer_switch",
                                      &SV::optimizer_switch);
+
+const char *optimizer_use_mrr_names[] = {"auto", "force", "disable", NullS};
+TYPELIB optimizer_use_mrr_typelib= {
+  array_elements(optimizer_use_mrr_names) - 1, "",
+  optimizer_use_mrr_names, NULL
+};
+
+static sys_var_thd_enum        sys_optimizer_use_mrr(&vars, "optimizer_use_mrr",
+                                              &SV::optimizer_use_mrr,
+                                              &optimizer_use_mrr_typelib,
+                                              NULL);
+
 static sys_var_const            sys_pid_file(&vars, "pid_file",
                                              OPT_GLOBAL, SHOW_CHAR,
                                              (uchar*) pidfile_name);
@@ -527,6 +542,8 @@ static sys_var_bool_ptr	        sys_userstat(&vars, "userstat",
 
 static sys_var_thd_ulong	sys_read_rnd_buff_size(&vars, "read_rnd_buffer_size",
 					       &SV::read_rnd_buff_size);
+static sys_var_thd_ulong	sys_mrr_buff_size(&vars, "mrr_buffer_size",
+					          &SV::mrr_buff_size);
 static sys_var_thd_ulong	sys_div_precincrement(&vars, "div_precision_increment",
                                               &SV::div_precincrement);
 static sys_var_long_ptr	sys_rpl_recovery_rank(&vars, "rpl_recovery_rank",
@@ -537,6 +554,9 @@ static sys_var_long_ptr	sys_query_cache_size(&vars, "query_cache_size",
 
 static sys_var_thd_ulong	sys_range_alloc_block_size(&vars, "range_alloc_block_size",
 						   &SV::range_alloc_block_size);
+static sys_var_thd_ulong	sys_rowid_merge_buff_size(&vars, "rowid_merge_buff_size",
+					   &SV::rowid_merge_buff_size);
+
 static sys_var_thd_ulong	sys_query_alloc_block_size(&vars, "query_alloc_block_size",
 						   &SV::query_alloc_block_size,
 						   0, fix_thd_mem_root);
@@ -4026,7 +4046,7 @@ bool
 sys_var_thd_optimizer_switch::
 symbolic_mode_representation(THD *thd, ulonglong val, LEX_STRING *rep)
 {
-  char buff[STRING_BUFFER_USUAL_SIZE*8];
+  char buff[STRING_BUFFER_USUAL_SIZE*18];
   String tmp(buff, sizeof(buff), &my_charset_latin1);
   int i;
   ulonglong bit;
diff --git a/sql/sql_array.h b/sql/sql_array.h
index e1b22921519..233b3f24263 100644
--- a/sql/sql_array.h
+++ b/sql/sql_array.h
@@ -66,3 +66,75 @@ public:
   }
 };
 
+
+/* 
+  Array of pointers to Elem that uses memory from MEM_ROOT
+
+  MEM_ROOT has no realloc() so this is supposed to be used for cases when
+  reallocations are rare.
+*/
+
+template <class Elem> class Array
+{
+  enum {alloc_increment = 16};
+  Elem **buffer;                 /* slots holding Elem pointers */
+  uint n_elements, max_element;  /* slots in use / slots allocated */
+public:
+  /* Pre-allocate 'prealloc' slots; a failed allocation leaves capacity 0 */
+  Array(MEM_ROOT *mem_root, uint prealloc=16)
+  {
+    buffer= (Elem**)alloc_root(mem_root, prealloc * sizeof(Elem*));
+    max_element = buffer? prealloc : 0;
+    n_elements= 0;
+  }
+
+  /*
+    Return the element whose pointer is stored in slot idx. The slots hold
+    Elem pointers, so the slot is dereferenced; indexing the buffer as if
+    it stored Elem objects by value would read the wrong memory.
+  */
+  Elem& at(int idx)
+  {
+    return *buffer[idx];
+  }
+
+  Elem **front() { return buffer; }              /* first slot */
+
+  /* One past the last used slot (an end marker, not the last element) */
+  Elem **back() { return buffer + n_elements; }
+
+  /*
+    Append a pointer, growing the buffer by alloc_increment slots when full.
+    The old buffer is not released here.
+    RETURN
+      FALSE  OK
+      TRUE   Out of memory, nothing was appended
+  */
+  bool append(MEM_ROOT *mem_root, Elem *el)
+  {
+    if (n_elements == max_element)
+    {
+      uint new_max= n_elements + alloc_increment;
+      Elem **newbuf= (Elem**)alloc_root(mem_root, new_max * sizeof(Elem*));
+      if (!newbuf)
+        return TRUE;
+      if (n_elements)            /* buffer may be NULL when nothing is stored */
+        memcpy(newbuf, buffer, n_elements*sizeof(Elem*));
+      buffer= newbuf;
+      max_element= new_max;      /* else the next overflow is never detected */
+    }
+    buffer[n_elements++]= el;
+    return FALSE;
+  }
+
+  int elements() { return n_elements; }
+
+  void clear() { n_elements= 0; }
+
+  typedef int (*CMP_FUNC)(Elem * const *el1, Elem *const *el2);
+
+  void sort(CMP_FUNC cmp_func)
+  { my_qsort(buffer, n_elements, sizeof(Elem*), (qsort_cmp)cmp_func); }
+};
+
+
diff --git a/sql/sql_base.cc b/sql/sql_base.cc
index 8816e6672f8..435dd84dca9 100644
--- a/sql/sql_base.cc
+++ b/sql/sql_base.cc
@@ -22,6 +22,7 @@
 #include "sp_head.h"
 #include "sp.h"
 #include "sql_trigger.h"
+#include "sql_handler.h"
 #include <m_ctype.h>
 #include <my_dir.h>
 #include <hash.h>
@@ -1381,10 +1382,10 @@ bool close_thread_table(THD *thd, TABLE **table_ptr)
   bool found_old_table= 0;
   TABLE *table= *table_ptr;
   DBUG_ENTER("close_thread_table");
-  DBUG_ASSERT(table->key_read == 0);
-  DBUG_ASSERT(!table->file || table->file->inited == handler::NONE);
   DBUG_PRINT("tcache", ("table: '%s'.'%s' 0x%lx", table->s->db.str,
                         table->s->table_name.str, (long) table));
+  DBUG_ASSERT(table->key_read == 0);
+  DBUG_ASSERT(!table->file || table->file->inited == handler::NONE);
 
  if (table->file)
  {
@@ -3033,13 +3034,19 @@ TABLE *open_table(THD *thd, TABLE_LIST *table_list, MEM_ROOT *mem_root,
   table->pos_in_table_list= table_list;
   table_list->updatable= 1; // It is not derived table nor non-updatable VIEW
   table->clear_column_bitmaps();
-#if !defined(DBUG_OFF) && !defined(HAVE_valgrind)
   /*
     Fill record with random values to find bugs where we access fields
     without first reading them.
   */
-  bfill(table->record[0], table->s->reclength, 254);
-#endif
+  TRASH(table->record[0], table->s->reclength);
+  /*
+    Initialize the null marker bits, to ensure that if we are doing a read
+    of only selected columns (like in keyread), all null markers are
+    initialized.
+  */
+  bfill(table->record[0], table->s->null_bytes, 255); 
+  bfill(table->record[1], table->s->null_bytes, 255); 
+
   DBUG_ASSERT(table->key_read == 0);
   DBUG_RETURN(table);
 }
@@ -6428,12 +6435,21 @@ find_field_in_tables(THD *thd, Item_ident *item,
           sub query as dependent on the outer query
         */
         if (current_sel != last_select)
+        {
           mark_select_range_as_dependent(thd, last_select, current_sel,
                                          found, *ref, item);
+          if (item->can_be_depended)
+          {
+            DBUG_ASSERT((*ref) == (Item*)item);
+            current_sel->register_dependency_item(last_select, ref);
+          }
+        }
       }
       return found;
     }
   }
+  else
+    item->can_be_depended= TRUE;
 
   if (db && lower_case_table_names)
   {
@@ -8148,6 +8164,7 @@ int setup_conds(THD *thd, TABLE_LIST *tables, TABLE_LIST *leaves,
   SELECT_LEX *select_lex= thd->lex->current_select;
   Query_arena *arena= thd->stmt_arena, backup;
   TABLE_LIST *table= NULL;	// For HP compilers
+  TABLE_LIST *save_emb_on_expr_nest= thd->thd_marker.emb_on_expr_nest;
   /*
     it_is_update set to TRUE when tables of primary SELECT_LEX (SELECT_LEX
     which belong to LEX, i.e. most up SELECT) will be updated by
@@ -8178,13 +8195,19 @@ int setup_conds(THD *thd, TABLE_LIST *tables, TABLE_LIST *leaves,
       goto err_no_arena;
   }
 
+  thd->thd_marker.emb_on_expr_nest= (TABLE_LIST*)1;
   if (*conds)
   {
     thd->where="where clause";
+    DBUG_EXECUTE("where",
+                 print_where(*conds,
+                             "WHERE in setup_conds",
+                             QT_ORDINARY););
     if ((!(*conds)->fixed && (*conds)->fix_fields(thd, conds)) ||
 	(*conds)->check_cols(1))
       goto err_no_arena;
   }
+  thd->thd_marker.emb_on_expr_nest= save_emb_on_expr_nest;
 
   /*
     Apply fix_fields() to all ON clauses at all levels of nesting,
@@ -8200,6 +8223,7 @@ int setup_conds(THD *thd, TABLE_LIST *tables, TABLE_LIST *leaves,
       if (embedded->on_expr)
       {
         /* Make a join an a expression */
+        thd->thd_marker.emb_on_expr_nest= embedded;
         thd->where="on clause";
         if ((!embedded->on_expr->fixed &&
              embedded->on_expr->fix_fields(thd, &embedded->on_expr)) ||
@@ -8224,6 +8248,7 @@ int setup_conds(THD *thd, TABLE_LIST *tables, TABLE_LIST *leaves,
       }
     }
   }
+  thd->thd_marker.emb_on_expr_nest= save_emb_on_expr_nest;
 
   if (!thd->stmt_arena->is_conventional())
   {
@@ -8422,6 +8447,7 @@ fill_record_n_invoke_before_triggers(THD *thd, List<Item> &fields,
     ptr           pointer on pointer to record
     values        list of fields
     ignore_errors TRUE if we should ignore errors
+    use_value     forces usage of value of the items instead of result
 
   NOTE
     fill_record() may set table->auto_increment_field_not_null and a
@@ -8434,7 +8460,8 @@ fill_record_n_invoke_before_triggers(THD *thd, List<Item> &fields,
 */
 
 bool
-fill_record(THD *thd, Field **ptr, List<Item> &values, bool ignore_errors)
+fill_record(THD *thd, Field **ptr, List<Item> &values, bool ignore_errors,
+            bool use_value)
 {
   List_iterator_fast<Item> v(values);
   List<TABLE> tbl_list;
@@ -8476,8 +8503,11 @@ fill_record(THD *thd, Field **ptr, List<Item> &values, bool ignore_errors)
                           field->field_name, table->s->table_name.str);
       thd->abort_on_warning= abort_on_warning_saved;
     }
-    if (value->save_in_field(field, 0) < 0)
-      goto err;
+    if (use_value)
+      value->save_val(field);
+    else
+      if (value->save_in_field(field, 0) < 0)
+        goto err;
     tbl_list.push_back(table);
   }
   /* Update virtual fields*/
@@ -8546,7 +8576,7 @@ fill_record_n_invoke_before_triggers(THD *thd, Field **ptr,
                                      enum trg_event_type event)
 {
   bool result;
-  result= (fill_record(thd, ptr, values, ignore_errors) ||
+  result= (fill_record(thd, ptr, values, ignore_errors, FALSE) ||
            (triggers && triggers->process_triggers(thd, event,
                                                    TRG_ACTION_BEFORE, TRUE)));
   /*
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index a02ab2bba9e..7d9f5b5b3ed 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -42,7 +42,9 @@
 
 #include "sp_rcontext.h"
 #include "sp_cache.h"
+#include "sql_select.h" /* declares create_tmp_table() */
 #include "debug_sync.h"
+#include "sql_handler.h"
 
 /*
   The following is used to initialise Table_ident with a internal
@@ -701,7 +703,7 @@ THD::THD()
   catalog= (char*)"std"; // the only catalog we have for now
   main_security_ctx.init();
   security_ctx= &main_security_ctx;
-  locked=some_tables_deleted=no_errors=password= 0;
+  some_tables_deleted=no_errors=password= 0;
   query_start_used= 0;
   count_cuted_fields= CHECK_FIELD_IGNORE;
   killed= NOT_KILLED;
@@ -2398,7 +2400,6 @@ bool select_export::send_data(List<Item> &items)
     {						// Fill with space
       if (item->max_length > used_length)
       {
-	/* QQ:  Fix by adding a my_b_fill() function */
 	if (!space_inited)
 	{
 	  space_inited=1;
@@ -3034,6 +3035,92 @@ bool select_dumpvar::send_eof()
   return 0;
 }
 
+
+/* Create the result temp table and zeroed per-column NULL statistics. */
+bool select_materialize_with_stats::
+create_result_table(THD *thd_arg, List<Item> *column_types,
+                    bool is_union_distinct, ulonglong options,
+                    const char *table_alias, bool bit_fields_as_long)
+{
+  DBUG_ASSERT(table == 0);
+  tmp_table_param.field_count= column_types->elements;
+  tmp_table_param.bit_fields_as_long= bit_fields_as_long;
+
+  table= create_tmp_table(thd_arg, &tmp_table_param, *column_types,
+                          (ORDER*) 0, is_union_distinct, 1,
+                          options, HA_POS_ERROR, (char*) table_alias);
+  if (!table)
+    return TRUE;
+
+  size_t stats_size= table->s->fields * sizeof(Column_statistics);
+  if (!(col_stat= (Column_statistics*) table->in_use->alloc(stats_size)))
+    return TRUE;
+
+  reset();                        /* zero the statistics just allocated */
+  table->file->extra(HA_EXTRA_WRITE_CACHE);
+  table->file->extra(HA_EXTRA_IGNORE_DUP_KEY);
+  return FALSE;
+}
+
+
+void select_materialize_with_stats::reset()
+{
+  /* Zero the row counters and one Column_statistics entry per table field */
+  count_rows= max_nulls_in_row= 0;
+  memset(col_stat, 0, table->s->fields * sizeof(Column_statistics));
+}
+
+
+void select_materialize_with_stats::cleanup()
+{
+  reset();                  /* clear the statistics before the parent cleanup */
+  select_union::cleanup();
+}
+
+
+/**
+  Override select_union::send_data: pass the row to the parent, then count
+  its NULLs in the per-column statistics unless it was a duplicate row.
+
+  @return TRUE if select_union::send_data() reported an error
+  @return FALSE on success
+*/
+
+bool select_materialize_with_stats::send_data(List<Item> &items)
+{
+  List_iterator_fast<Item> column_it(items);
+  uint row_nulls= 0;
+  Column_statistics *stat= col_stat;
+
+  /* Let the parent class handle the row first; give up on its errors. */
+  if (select_union::send_data(items))
+    return 1;
+  /* Rows flagged as duplicates by write_err are left out of the stats. */
+  const bool is_duplicate= (write_err == HA_ERR_FOUND_DUPP_KEY ||
+                            write_err == HA_ERR_FOUND_DUPP_UNIQUE);
+  if (is_duplicate)
+    return 0;
+
+  ++count_rows;
+
+  for (Item *column= column_it++; column; column= column_it++, ++stat)
+  {
+    if (!column->is_null())
+      continue;
+    /* Track how many NULLs the column has and the first/last row number. */
+    ++stat->null_count;
+    stat->max_null_row= count_rows;
+    if (!stat->min_null_row)
+      stat->min_null_row= count_rows;
+    ++row_nulls;
+  }
+
+  if (row_nulls > max_nulls_in_row)
+    max_nulls_in_row= row_nulls;
+  return 0;
+}
+
+
 /****************************************************************************
   TMP_TABLE_PARAM
 ****************************************************************************/
@@ -3047,6 +3134,8 @@ void TMP_TABLE_PARAM::init()
   quick_group= 1;
   table_charset= 0;
   precomputed_group_by= 0;
+  bit_fields_as_long= 0;
+  skip_create_table= 0;
   DBUG_VOID_RETURN;
 }
 
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 8a5cc6da741..477ae9bf751 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -24,6 +24,76 @@
 #include "log.h"
 #include "rpl_tblmap.h"
 
+
+/**
+  Interface for Item iterator
+*/
+
+class Item_iterator
+{
+public:
+  /**
+    Positions the iterator before the first item.
+
+    @note
+    Implementations may also perform other initialization here, such as
+    allocating the resources they need.
+  */
+  virtual void open()= 0;
+  /**
+    Returns the next Item, or NULL if there is no next item, and moves
+    the iterator to the position after it.
+  */
+  virtual Item *next()= 0;
+  /**
+    Makes the iterator free its resources (if it holds any).
+
+    @note
+    After close() the iterator must not be used until open() is called.
+  */
+  virtual void close()= 0;
+
+  virtual ~Item_iterator() {}
+};
+
+
+/**
+  Item iterator over a List_iterator of Item references (Item*)
+*/
+
+class Item_iterator_ref_list: public Item_iterator
+{
+  List_iterator<Item*> list;  /* private copy of the caller's iterator */
+public:
+  Item_iterator_ref_list(List_iterator<Item*> &arg_list): list(arg_list) {}
+  void open() { list.rewind(); }
+  /* list++ yields NULL past the last node: return NULL, don't dereference */
+  Item *next() { Item **ref= list++; return ref ? *ref : NULL; }
+  void close() {}
+};
+
+
+/**
+  Item iterator over Item interface for rows
+*/
+
+class Item_iterator_row: public Item_iterator
+{
+  Item *base_item;   /* the row item whose columns are enumerated */
+  uint current;      /* index of the column next() will return */
+public:
+  Item_iterator_row(Item *base) : base_item(base), current(0) {}
+  void open() { current= 0; }
+  /* Return column 'current' and advance, or NULL once all were returned */
+  Item *next()
+  {
+    return (current < base_item->cols()) ?
+           base_item->element_index(current++) : NULL;
+  }
+  void close() {}
+};
+
+
 /**
   An interface that is used to take an action when
   the locking module notices that a table version has changed
@@ -311,11 +381,13 @@ struct system_variables
   ulonglong max_heap_table_size;
   ulonglong tmp_table_size;
   ulonglong long_query_time;
+  ulonglong join_buff_space_limit;
   ha_rows select_limit;
   ha_rows max_join_size;
   ulong auto_increment_increment, auto_increment_offset;
   ulong bulk_insert_buff_size;
   ulong join_buff_size;
+  ulong join_cache_level;
   ulong max_allowed_packet;
   ulong max_error_count;
   ulong max_length_for_sort_data;
@@ -323,7 +395,6 @@ struct system_variables
   ulong max_tmp_tables;
   ulong max_insert_delayed_threads;
   ulong min_examined_row_limit;
-  ulong multi_range_count;
   ulong myisam_repair_threads;
   ulong myisam_sort_buff_size;
   ulong myisam_stats_method;
@@ -337,13 +408,25 @@ struct system_variables
   ulong optimizer_search_depth;
   /* A bitmap for switching optimizations on/off */
   ulong optimizer_switch;
+  /*
+    Controls use of Engine-MRR:
+      0 - auto, based on cost
+      1 - force MRR when the storage engine is capable of doing it
+      2 - disable MRR.
+  */
+  ulong optimizer_use_mrr; 
+
   ulong preload_buff_size;
   ulong profiling_history_size;
   ulong query_cache_type;
   ulong read_buff_size;
   ulong read_rnd_buff_size;
+  ulong mrr_buff_size;
   ulong div_precincrement;
   ulong sortbuff_size;
+  /* Total size of all buffers used by the subselect_rowid_merge_engine. */
+  ulong rowid_merge_buff_size;
+  ulong thread_handling;
   ulong tx_isolation;
   ulong completion_type;
   /* Determines which non-standard SQL behaviour should be enabled */
@@ -444,6 +527,13 @@ typedef struct system_status_var
   ulong ha_read_prev_count;
   ulong ha_read_rnd_count;
   ulong ha_read_rnd_next_count;
+  /*
+    This number doesn't include calls to the default implementation and
+    calls made by range access. The intent is to count only calls made by
+    BatchedKeyAccess.
+  */
+  ulong ha_multi_range_read_init_count;
+
   ulong ha_rollback_count;
   ulong ha_update_count;
   ulong ha_write_count;
@@ -679,7 +769,7 @@ public:
   Server_side_cursor *cursor;
 
   inline char *query() { return query_string.str; }
-  inline uint32 query_length() { return query_string.length; }
+  inline uint32 query_length() { return (uint32)query_string.length; }
   void set_query_inner(char *query_arg, uint32 query_length_arg);
 
   /**
@@ -1471,6 +1561,14 @@ public:
   /* container for handler's private per-connection data */
   Ha_data ha_data[MAX_HA];
 
+  /* Place to store various things */
+  union 
+  { 
+    /*
+      Used by subquery optimizations, see Item_in_subselect::emb_on_expr_nest.
+    */
+    TABLE_LIST *emb_on_expr_nest;
+  } thd_marker;
 #ifndef MYSQL_CLIENT
   int binlog_setup_trx_data();
 
@@ -1847,7 +1945,7 @@ public:
   bool       slave_thread, one_shot_set;
   /* tells if current statement should binlog row-based(1) or stmt-based(0) */
   bool       current_stmt_binlog_row_based;
-  bool	     locked, some_tables_deleted;
+  bool	     some_tables_deleted;
   bool       last_cuted_field;
   bool	     no_errors, password;
   bool       extra_port;                        /* If extra connection */
@@ -2860,11 +2958,23 @@ public:
   */
   bool precomputed_group_by;
   bool force_copy_fields;
+  /*
+    If TRUE, create_tmp_field called from create_tmp_table will convert
+    all BIT fields to 64-bit longs. This is a workaround for the limitation
+    that MEMORY tables cannot index BIT columns.
+  */
+  bool bit_fields_as_long;
+  /*
+    Whether to create or postpone actual creation of this temporary table.
+    TRUE <=> create_tmp_table will create only the TABLE structure.
+  */
+  bool skip_create_table;
 
   TMP_TABLE_PARAM()
     :copy_field(0), group_parts(0),
      group_length(0), group_null_parts(0), convert_blob_length(0),
-     schema_table(0), precomputed_group_by(0), force_copy_fields(0)
+     schema_table(0), precomputed_group_by(0), force_copy_fields(0),
+     bit_fields_as_long(0), skip_create_table(0)
   {}
   ~TMP_TABLE_PARAM()
   {
@@ -2883,19 +2993,21 @@ public:
 
 class select_union :public select_result_interceptor
 {
+protected:
   TMP_TABLE_PARAM tmp_table_param;
+  int write_err; /* Error code from the last send_data->ha_write_row call. */
 public:
   TABLE *table;
 
-  select_union() :table(0) {}
+  select_union() :write_err(0),table(0) { tmp_table_param.init(); }
   int prepare(List<Item> &list, SELECT_LEX_UNIT *u);
   bool send_data(List<Item> &items);
   bool send_eof();
   bool flush();
-
-  bool create_result_table(THD *thd, List<Item> *column_types,
-                           bool is_distinct, ulonglong options,
-                           const char *alias);
+  void cleanup();
+  virtual bool create_result_table(THD *thd, List<Item> *column_types,
+                                   bool is_distinct, ulonglong options,
+                                   const char *alias, bool bit_fields_as_long);
 };
 
 /* Base subselect interface class */
@@ -2919,6 +3031,72 @@ public:
   bool send_data(List<Item> &items);
 };
 
+
+/*
+  This class specializes select_union to collect statistics about the
+  data stored in the temp table. Currently the class collects statistics
+  about NULLs.
+*/
+
+class select_materialize_with_stats : public select_union
+{
+protected:
+  class Column_statistics
+  {
+  public:
+    /* Count of NULLs per column. */
+    ha_rows null_count;
+    /* The row number that contains the first NULL in a column. */
+    ha_rows min_null_row;
+    /* The row number that contains the last NULL in a column. */
+    ha_rows max_null_row;
+  };
+
+  /* Array of statistics data per column. */
+  Column_statistics* col_stat;
+
+  /*
+    Number of columns in the biggest sub-row consisting of only NULLs.
+  */
+  uint max_nulls_in_row;
+  /*
+    Count of rows written to the temp table. This is redundant as it is
+    already stored in handler::stats.records, however that one is relatively
+    expensive to compute (given we need that for every row).
+  */
+  ha_rows count_rows;
+
+protected:
+  void reset();
+
+public:
+  /* Start from a defined state: no statistics array, zeroed counters. */
+  select_materialize_with_stats()
+    :col_stat(NULL), max_nulls_in_row(0), count_rows(0) {}
+  virtual bool create_result_table(THD *thd, List<Item> *column_types,
+                                   bool is_distinct, ulonglong options,
+                                   const char *alias, bool bit_fields_as_long);
+  bool init_result_table(ulonglong select_options);
+  bool send_data(List<Item> &items);
+  void cleanup();
+  ha_rows get_null_count_of_col(uint idx)
+  {
+    DBUG_ASSERT(idx < table->s->fields);
+    return col_stat[idx].null_count;
+  }
+  ha_rows get_max_null_of_col(uint idx)
+  {
+    DBUG_ASSERT(idx < table->s->fields);
+    return col_stat[idx].max_null_row;
+  }
+  ha_rows get_min_null_of_col(uint idx)
+  {
+    DBUG_ASSERT(idx < table->s->fields);
+    return col_stat[idx].min_null_row;
+  }
+  uint get_max_nulls_in_row() { return max_nulls_in_row; }
+};
+
+
 /* used in independent ALL/ANY optimisation */
 class select_max_min_finder_subselect :public select_subselect
 {
@@ -2946,6 +3124,67 @@ public:
   bool send_data(List<Item> &items);
 };
 
+
+
+
+/*
+  Optimizer and executor info for a materialized semi-join nest. Contains
+   - The sj-materialization temporary table
+   - Members needed to make index lookup or a full scan of the temptable.
+  No constructor is declared here, so this class initializes no members.
+*/
+class SJ_MATERIALIZATION_INFO : public Sql_alloc
+{
+public:
+  /* Optimal join sub-order */
+  struct st_position *positions;
+
+  uint tables; /* Number of tables in the sj-nest */
+
+  /* Expected #rows in the materialized table */
+  double rows;
+
+  /* 
+    Cost to materialize: execute the sub-join and write rows into the temptable
+  */
+  COST_VECT materialization_cost;
+
+  /* Cost to make one lookup in the temptable */
+  COST_VECT lookup_cost;
+  
+  /* Cost of scanning the materialized table */
+  COST_VECT scan_cost;
+
+  /* --- Execution structures ---------- */
+  
+  /*
+    TRUE <=> This structure is used for execution. We don't necessarily pick
+    sj-materialization, so some SJ_MATERIALIZATION_INFO structures end up
+    not being used by materialization.
+  */
+  bool is_used;
+  
+  bool materialized; /* TRUE <=> materialization already performed */
+  /*
+    TRUE  - the temptable is read with a full scan
+    FALSE - the temptable is used for index lookups
+  */
+  bool is_sj_scan; 
+  
+  /* The temptable and its related info */
+  TMP_TABLE_PARAM sjm_table_param;
+  List<Item> sjm_table_cols;
+  TABLE *table;
+
+  /* Structure used to make index lookups */
+  struct st_table_ref *tab_ref;
+  Item *in_equality; /* See create_subq_in_equalities() */
+
+  Item *join_cond; /* See comments in make_join_select() */
+  Copy_field *copy_field; /* Needed for SJ_Materialization scan */
+};
+
+
 /* Structs used when sorting */
 
 typedef struct st_sort_field {
@@ -3029,6 +3268,7 @@ class user_var_entry
   DTCollation collation;
 };
 
+
 /*
    Unique -- class for unique (removing of duplicates).
    Puts all values to the TREE. If the tree becomes too big,
@@ -3045,28 +3285,44 @@ class Unique :public Sql_alloc
   IO_CACHE file;
   TREE tree;
   uchar *record_pointers;
+  ulong filtered_out_elems;
   bool flush();
   uint size;
+  uint full_size;
+  uint min_dupl_count;   /* always 0 for unions, > 0 for intersections */
 
 public:
   ulong elements;
   Unique(qsort_cmp2 comp_func, void *comp_func_fixed_arg,
-	 uint size_arg, ulonglong max_in_memory_size_arg);
+	 uint size_arg, ulonglong max_in_memory_size_arg,
+         uint min_dupl_count_arg= 0);
   ~Unique();
   ulong elements_in_tree() { return tree.elements_in_tree; }
   inline bool unique_add(void *ptr)
   {
     DBUG_ENTER("unique_add");
     DBUG_PRINT("info", ("tree %u - %lu", tree.elements_in_tree, max_elements));
-    if (tree.elements_in_tree > max_elements && flush())
+    if (!(tree.flag & TREE_ONLY_DUPS) && 
+        tree.elements_in_tree >= max_elements && flush())
       DBUG_RETURN(1);
     DBUG_RETURN(!tree_insert(&tree, ptr, 0, tree.custom_arg));
   }
 
+  bool is_in_memory() { return (my_b_tell(&file) == 0); } /* file position still 0 */
+  void close_for_expansion() { tree.flag= TREE_ONLY_DUPS; } /* unique_add() stops flushing */
+
   bool get(TABLE *table);
-  static double get_use_cost(uint *buffer, uint nkeys, uint key_size,
-                             ulonglong max_in_memory_size);
-  inline static int get_cost_calc_buff_size(ulong nkeys, uint key_size,
+  
+  /* Cost of searching the tree: log2(tree_elems) / compare_factor */
+  inline static double get_search_cost(ulonglong tree_elems, uint compare_factor)
+  {
+    return log((double) tree_elems) / (compare_factor * M_LN2);
+  }  
+
+  static double get_use_cost(uint *buffer, size_t nkeys, uint key_size,
+                             ulonglong max_in_memory_size, uint compare_factor,
+                             bool intersect_fl, bool *in_memory);
+  inline static int get_cost_calc_buff_size(size_t nkeys, uint key_size,
                                             ulonglong max_in_memory_size)
   {
     register ulonglong max_elems_in_tree=
@@ -3082,6 +3338,11 @@ public:
 
   friend int unique_write_to_file(uchar* key, element_count count, Unique *unique);
   friend int unique_write_to_ptrs(uchar* key, element_count count, Unique *unique);
+
+  friend int unique_write_to_file_with_count(uchar* key, element_count count,
+                                             Unique *unique);
+  friend int unique_intersect_write_to_ptrs(uchar* key, element_count count, 
+				            Unique *unique);
 };
 
 
@@ -3377,5 +3638,10 @@ inline int handler::ha_read_first_row(uchar *buf, uint primary_key)
   return error;
 }
 
+inline int handler::ha_write_tmp_row(uchar *buf)
+{
+  increment_statistics(&SSV::ha_write_count);  /* bump the ha_write_count status counter */
+  return write_row(buf);                       /* write_row() result is returned as is */
+}
 
 #endif /* MYSQL_SERVER */
diff --git a/sql/sql_cursor.cc b/sql/sql_cursor.cc
index 77be6eaff34..308c49fc15c 100644
--- a/sql/sql_cursor.cc
+++ b/sql/sql_cursor.cc
@@ -719,7 +719,8 @@ bool Select_materialize::send_fields(List<Item> &list, uint flags)
 {
   DBUG_ASSERT(table == 0);
   if (create_result_table(unit->thd, unit->get_unit_column_types(),
-                          FALSE, thd->options | TMP_TABLE_ALL_COLUMNS, ""))
+                          FALSE, thd->options | TMP_TABLE_ALL_COLUMNS, "",
+                          FALSE))
     return TRUE;
 
   materialized_cursor= new (&table->mem_root)
diff --git a/sql/sql_db.cc b/sql/sql_db.cc
index 655268da637..35823408ef2 100644
--- a/sql/sql_db.cc
+++ b/sql/sql_db.cc
@@ -20,6 +20,7 @@
 #include <mysys_err.h>
 #include "sp.h"
 #include "events.h"
+#include "sql_handler.h"
 #include <my_dir.h>
 #include <m_ctype.h>
 #include "log.h"
diff --git a/sql/sql_delete.cc b/sql/sql_delete.cc
index 5564d628594..e2cb17090a1 100644
--- a/sql/sql_delete.cc
+++ b/sql/sql_delete.cc
@@ -23,6 +23,7 @@
 #include "sql_select.h"
 #include "sp_head.h"
 #include "sql_trigger.h"
+#include "sql_handler.h"
 
 /**
   Implement DELETE SQL word.
diff --git a/sql/sql_derived.cc b/sql/sql_derived.cc
index 3214c756bc7..e92b8f83c95 100644
--- a/sql/sql_derived.cc
+++ b/sql/sql_derived.cc
@@ -169,7 +169,8 @@ bool mysql_derived_prepare(THD *thd, LEX *lex, TABLE_LIST *orig_table_list)
     */
     if ((res= derived_result->create_result_table(thd, &unit->types, FALSE,
                                                  create_options,
-                                                 orig_table_list->alias)))
+                                                 orig_table_list->alias,
+                                                 FALSE)))
       goto exit;
 
     table= derived_result->table;
diff --git a/sql/sql_expression_cache.cc b/sql/sql_expression_cache.cc
new file mode 100644
index 00000000000..188d93e8b8a
--- /dev/null
+++ b/sql/sql_expression_cache.cc
@@ -0,0 +1,276 @@
+
+#include "mysql_priv.h"
+#include "sql_select.h"
+
+/*
+  The expression cache is currently used only for caching subqueries, so
+  its statistics variables are named subquery_cache*.
+*/
+ulong subquery_cache_miss, subquery_cache_hit;
+
+Expression_cache_tmptable::Expression_cache_tmptable(THD *thd,
+                                                 List<Item*> &dependants,
+                                                 Item *value)
+  :cache_table(NULL), table_thd(thd), cached_result(NULL),
+   list(&dependants), val(value), inited (0)
+{ /* Only records the arguments; check_value() runs init() on first use */
+  DBUG_ENTER("Expression_cache_tmptable::Expression_cache_tmptable");
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Callback handed to TABLE::add_tmp_key to number the key fields
+
+  @param arg             address of a uint counter holding the field number
+
+  @return the counter value before this call; the counter is then incremented
+*/
+
+static uint field_enumerator(uchar *arg)
+{
+  return (*(uint*) arg)++;
+}
+
+
+/**
+  Initialize temporary table and auxiliary structures for the expression
+  cache
+
+  @details
+  The function creates a temporary table for the expression cache, defines
+  the search index and initializes the auxiliary search structure used to
+  check whether a given set of parameter values is in some cache entry.
+  On any failure cache_table is left NULL, which switches the cache off.
+*/
+
+void Expression_cache_tmptable::init()
+{
+  List_iterator<Item*> li(*list);
+  Item_iterator_ref_list it(li);
+  Item **item;
+  uint field_counter;
+  DBUG_ENTER("Expression_cache_tmptable::init");
+  DBUG_ASSERT(!inited);
+  inited= TRUE;
+  cache_table= NULL;
+
+  while ((item= li++))
+  {
+    DBUG_ASSERT(item);
+    if (*item)
+    {
+      DBUG_ASSERT((*item)->fixed);
+      items.push_back((*item));
+    }
+    else
+    {
+      /*
+        This is possible when the optimizer has already executed this
+        subquery and optimized out the condition predicate.
+      */
+      li.remove();
+    }
+  }
+
+  if (list->elements == 0)
+  {
+    DBUG_PRINT("info", ("All parameters were removed by optimizer."));
+    DBUG_VOID_RETURN;
+  }
+
+  cache_table_param.init();
+  /* dependent items and result */
+  cache_table_param.field_count= list->elements + 1;
+  /* postpone table creation until the index has been described */
+  cache_table_param.skip_create_table= 1;
+  cache_table= NULL;
+
+  items.push_front(val);
+
+  if (!(cache_table= create_tmp_table(table_thd, &cache_table_param,
+                                      items, (ORDER*) NULL,
+                                      FALSE, FALSE,
+                                      ((table_thd->options |
+                                        TMP_TABLE_ALL_COLUMNS) &
+                                       ~(OPTION_BIG_TABLES |
+                                         TMP_TABLE_FORCE_MYISAM)),
+                                      HA_POS_ERROR,
+                                      (char *)"subquery-cache-table")))
+  {
+    DBUG_PRINT("error", ("create_tmp_table failed, caching switched off"));
+    DBUG_VOID_RETURN;
+  }
+
+  if (cache_table->s->db_type() != heap_hton)
+  {
+    DBUG_PRINT("error", ("we need only heap table"));
+    goto error;
+  }
+
+  /* This list does not contain the result field */
+  it.open();
+
+  field_counter=1;
+
+  if (cache_table->alloc_keys(1) ||
+      cache_table->add_tmp_key(0, items.elements - 1, &field_enumerator,
+                                (uchar*)&field_counter, TRUE) ||
+      ref.tmp_table_index_lookup_init(table_thd, cache_table->key_info, it,
+                                      TRUE))
+  {
+    DBUG_PRINT("error", ("creating index failed"));
+    goto error;
+  }
+  cache_table->s->keys= 1;
+  ref.null_rejecting= 1;
+  ref.disable_cache= FALSE;
+  ref.has_record= 0;
+  ref.use_count= 0;
+
+
+  if (open_tmp_table(cache_table))
+  {
+    DBUG_PRINT("error", ("Opening (creating) temporary table failed"));
+    goto error;
+  }
+
+  if (!(cached_result= new Item_field(cache_table->field[0])))
+  {
+    DBUG_PRINT("error", ("Creating Item_field failed"));
+    goto error;
+  }
+
+  DBUG_VOID_RETURN;
+
+error:
+  /* switch off cache */
+  free_tmp_table(table_thd, cache_table);
+  cache_table= NULL;
+  DBUG_VOID_RETURN;
+}
+
+
+Expression_cache_tmptable::~Expression_cache_tmptable()
+{
+  if (cache_table)              /* NULL if never created or switched off */
+    free_tmp_table(table_thd, cache_table);
+}
+
+
+/**
+  Check if a given set of parameters of the expression is in the cache
+
+  @param [out] value     the expression value found in the cache if any
+
+  @details
+  Looks up the current set of expression parameters in the cache and, if
+  an entry is found, returns the cached result through 'value'. A miss is
+  also reported when the cache is switched off (no cache table).
+
+  @retval Expression_cache::ERROR if the lookup (join_read_key2) returned 1
+  @retval Expression_cache::HIT   if the set of parameters is in the cache
+  @retval Expression_cache::MISS  otherwise
+*/
+
+Expression_cache::result Expression_cache_tmptable::check_value(Item **value)
+{
+  int res;
+  DBUG_ENTER("Expression_cache_tmptable::check_value");
+
+  /*
+    We defer cache initialization to get item references that are
+    used at the execution phase.
+  */
+  if (!inited)
+    init();
+
+  if (cache_table)
+  {
+    DBUG_PRINT("info", ("status: %u  has_record %u",
+                        (uint)cache_table->status, (uint)ref.has_record));
+    if ((res= join_read_key2(table_thd, NULL, cache_table, &ref)) == 1)
+      DBUG_RETURN(ERROR);
+    if (res)
+    {
+      subquery_cache_miss++;
+      DBUG_RETURN(MISS);
+    }
+
+    subquery_cache_hit++;
+    *value= cached_result;
+    DBUG_RETURN(Expression_cache::HIT);
+  }
+  DBUG_RETURN(Expression_cache::MISS);
+}
+
+
+/**
+  Put a new entry into the expression cache
+
+  @param value     the result of the expression to be put into the cache
+
+  @details
+  The function evaluates 'value' and puts the result into the cache as the
+  result of the expression for the current set of parameters.
+
+  @retval FALSE OK (also when there is no cache table to write to)
+  @retval TRUE  Error (the cache table is freed and caching is switched off)
+*/
+
+my_bool Expression_cache_tmptable::put_value(Item *value)
+{
+  int error;
+  DBUG_ENTER("Expression_cache_tmptable::put_value");
+  DBUG_ASSERT(inited);
+
+  if (!cache_table)
+  {
+    DBUG_PRINT("info", ("No table so behave as we successfully put value"));
+    DBUG_RETURN(FALSE);
+  }
+
+  *(items.head_ref())= value;
+  fill_record(table_thd, cache_table->field, items, TRUE, TRUE);
+  if (table_thd->is_error())
+    goto err;;
+
+  if ((error= cache_table->file->ha_write_row(cache_table->record[0])))
+  {
+    /* create_internal_tmp_table_from_heap will generate error if needed */
+    if (cache_table->file->is_fatal_error(error, HA_CHECK_DUP) &&
+        create_internal_tmp_table_from_heap(table_thd, cache_table,
+                                            cache_table_param.start_recinfo,
+                                            &cache_table_param.recinfo,
+                                            error, 1))
+      goto err;
+  }
+  cache_table->status= 0; /* cache_table->record contains an existing record */
+  ref.has_record= TRUE; /* the same as above */
+  DBUG_PRINT("info", ("has_record: TRUE  status: 0"));
+
+  DBUG_RETURN(FALSE);
+
+err:
+  free_tmp_table(table_thd, cache_table);
+  cache_table= NULL;
+  DBUG_RETURN(TRUE);
+}
+
+
+void Expression_cache_tmptable::print(String *str, enum_query_type query_type)
+{
+  /* Emits '<' p1 ',' p2 ... '>': each parameter prints itself via print() */
+  List_iterator<Item*> param_it(*list);
+  Item **param_ref;
+  bool need_comma= FALSE;
+  str->append('<');
+  while ((param_ref= param_it++))
+  {
+    if (need_comma)
+      str->append(',');
+    need_comma= TRUE;
+    (*param_ref)->print(str, query_type);
+  }
+  str->append('>');
+}
diff --git a/sql/sql_expression_cache.h b/sql/sql_expression_cache.h
new file mode 100644
index 00000000000..88f71e0cf32
--- /dev/null
+++ b/sql/sql_expression_cache.h
@@ -0,0 +1,84 @@
+#ifndef SQL_EXPRESSION_CACHE_INCLUDED
+#define SQL_EXPRESSION_CACHE_INCLUDED
+
+#include "sql_select.h"
+
+/**
+  Interface for expression cache
+
+  @note
+  Parameters of an expression cache interface are set on the creation of the
+  cache. They are passed when a cache object of the implementation class is
+  constructed. That's why they are not visible in this interface.
+*/
+
+extern ulong subquery_cache_miss, subquery_cache_hit;
+
+class Expression_cache :public Sql_alloc
+{
+public:
+  enum result {ERROR, HIT, MISS};       /* outcome of check_value() */
+
+  Expression_cache(){};
+  virtual ~Expression_cache() {};
+  /**
+    Shall check the presence of the expression value in the cache for a
+    given set of values of the expression parameters and, if it is found,
+    return the cached result of the expression through 'value'.
+  */
+  virtual result check_value(Item **value)= 0;
+  /**
+    Shall put the value of an expression for the given set of its
+    parameters into the expression cache
+  */
+  virtual my_bool put_value(Item *value)= 0;
+
+  /**
+    Print cache parameters
+  */
+  virtual void print(String *str, enum_query_type query_type)= 0;
+};
+
+struct st_table_ref;
+struct st_join_table;
+class Item_field;
+
+
+/**
+  Implementation of expression cache over a temporary table
+*/
+
+class Expression_cache_tmptable :public Expression_cache
+{
+public:
+  Expression_cache_tmptable(THD *thd, List<Item*> &dependants, Item *value);
+  virtual ~Expression_cache_tmptable();
+  virtual result check_value(Item **value);
+  virtual my_bool put_value(Item *value);
+
+  void print(String *str, enum_query_type query_type);
+
+private:
+  void init();
+
+  /* tmp table parameters */
+  TMP_TABLE_PARAM cache_table_param;
+  /* temporary table to store this cache; NULL while caching is off */
+  TABLE *cache_table;
+  /* Thread handle for the temporary table */
+  THD *table_thd;
+  /* TABLE_REF for index lookup */
+  struct st_table_ref ref;
+  /* Cached result: Item_field over field 0 of cache_table */
+  Item_field *cached_result;
+  /* List of references to the parameters of the expression */
+  List<Item*> *list;
+  /* Items of a cache row: the result (head) followed by the parameters */
+  List<Item> items;
+  /* Value Item example */
+  Item *val;
+  /* Set once init() has been entered, whether or not it then succeeded */
+  bool inited;
+};
+
+#endif /* SQL_EXPRESSION_CACHE_INCLUDED */
diff --git a/sql/sql_handler.cc b/sql/sql_handler.cc
index 85b7e46a313..7e651a2b4e2 100644
--- a/sql/sql_handler.cc
+++ b/sql/sql_handler.cc
@@ -56,9 +56,13 @@
   second container. When the table is flushed, the pointer is cleared.
 */
 
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation				// gcc: Class implementation
+#endif
+
 #include "mysql_priv.h"
 #include "sql_select.h"
-#include <assert.h>
+#include "sql_handler.h"
 
 #define HANDLER_TABLES_HASH_SIZE 120
 
@@ -66,6 +70,28 @@ static enum enum_ha_read_modes rkey_to_rnext[]=
 { RNEXT_SAME, RNEXT, RPREV, RNEXT, RPREV, RNEXT, RPREV, RPREV };
 
 /*
+  Set handler to state after create, but keep base information about
+  which table is used
+*/
+
+void SQL_HANDLER::reset()
+{
+  fields.empty();                       /* drop the field list built on open */
+  arena.free_items();                   /* free the items held by the arena */
+  free_root(&mem_root, MYF(0));
+  my_free(lock, MYF(MY_ALLOW_ZERO_PTR)); /* a NULL lock is tolerated */
+  init();                               /* back to the state after create */
+}  
+  
+/* Free all allocated data, including the name strings in base_data */
+
+SQL_HANDLER::~SQL_HANDLER()
+{
+  reset();
+  my_free(base_data, MYF(MY_ALLOW_ZERO_PTR));
+}
+
+/*
   Get hash key and hash key length.
 
   SYNOPSIS
@@ -84,11 +110,11 @@ static enum enum_ha_read_modes rkey_to_rnext[]=
     Pointer to the TABLE_LIST struct.
 */
 
-static char *mysql_ha_hash_get_key(TABLE_LIST *tables, size_t *key_len_p,
+static char *mysql_ha_hash_get_key(SQL_HANDLER *table, size_t *key_len,
                                    my_bool first __attribute__((unused)))
 {
-  *key_len_p= strlen(tables->alias) + 1 ; /* include '\0' in comparisons */
-  return tables->alias;
+  *key_len= table->handler_name.length + 1 ; /* include '\0' in comparisons */
+  return table->handler_name.str;
 }
 
 
@@ -106,9 +132,9 @@ static char *mysql_ha_hash_get_key(TABLE_LIST *tables, size_t *key_len_p,
     Nothing
 */
 
-static void mysql_ha_hash_free(TABLE_LIST *tables)
+static void mysql_ha_hash_free(SQL_HANDLER *table)
 {
-  my_free((char*) tables, MYF(0));
+  delete table;
 }
 
 /**
@@ -120,14 +146,21 @@ static void mysql_ha_hash_free(TABLE_LIST *tables)
 
   @note Though this function takes a list of tables, only the first list entry
   will be closed.
+  @note The handler object itself is not deleted!
   @note Broadcasts refresh if it closed a table with old version.
 */
 
-static void mysql_ha_close_table(THD *thd, TABLE_LIST *tables,
+static void mysql_ha_close_table(SQL_HANDLER *handler,
                                  bool is_locked)
 {
+  THD *thd= handler->thd;
+  TABLE *table= handler->table;
   TABLE **table_ptr;
 
+  /* check if table was already closed */
+  if (!table)
+    return;
+
   /*
     Though we could take the table pointer from hash_tables->table,
     we must follow the thd->handler_tables chain anyway, as we need the
@@ -135,13 +168,18 @@ static void mysql_ha_close_table(THD *thd, TABLE_LIST *tables,
     for close_thread_table().
   */
   for (table_ptr= &(thd->handler_tables);
-       *table_ptr && (*table_ptr != tables->table);
+       *table_ptr && (*table_ptr != table);
          table_ptr= &(*table_ptr)->next)
     ;
 
   if (*table_ptr)
   {
-    (*table_ptr)->file->ha_index_or_rnd_end();
+    if (handler->lock)
+    {
+      // Mark it unlocked, like in reset_lock_data()
+      reset_lock_data(handler->lock, 1);
+    }
+    table->file->ha_index_or_rnd_end();
     if (! is_locked)
       VOID(pthread_mutex_lock(&LOCK_open));
     if (close_thread_table(thd, table_ptr))
@@ -152,17 +190,15 @@ static void mysql_ha_close_table(THD *thd, TABLE_LIST *tables,
     if (! is_locked)
       VOID(pthread_mutex_unlock(&LOCK_open));
   }
-  else if (tables->table)
+  else
   {
     /* Must be a temporary table */
-    TABLE *table= tables->table;
     table->file->ha_index_or_rnd_end();
     table->query_id= thd->query_id;
     table->open_by_handler= 0;
   }
-
-  /* Mark table as closed, ready for re-open if necessary. */
-  tables->table= NULL;
+  my_free(handler->lock, MYF(MY_ALLOW_ZERO_PTR));
+  handler->init();
 }
 
 /*
@@ -178,7 +214,7 @@ static void mysql_ha_close_table(THD *thd, TABLE_LIST *tables,
     Though this function takes a list of tables, only the first list entry
     will be opened.
     'reopen' is set when a handler table is to be re-opened. In this case,
-    'tables' is the pointer to the hashed TABLE_LIST object which has been
+    'tables' is the pointer to the hashed SQL_HANDLER object which has been
     saved on the original open.
     'reopen' is also used to suppress the sending of an 'ok' message.
 
@@ -187,17 +223,17 @@ static void mysql_ha_close_table(THD *thd, TABLE_LIST *tables,
     TRUE  Error
 */
 
-bool mysql_ha_open(THD *thd, TABLE_LIST *tables, bool reopen)
+bool mysql_ha_open(THD *thd, TABLE_LIST *tables, SQL_HANDLER *reopen)
 {
-  TABLE_LIST    *hash_tables = NULL;
-  char          *db, *name, *alias;
-  uint          dblen, namelen, aliaslen, counter;
+  SQL_HANDLER   *sql_handler= 0;
+  uint          counter;
   int           error;
-  TABLE         *backup_open_tables;
+  TABLE         *table, *backup_open_tables, *write_lock_used;
+  Query_arena backup_arena;
   DBUG_ENTER("mysql_ha_open");
   DBUG_PRINT("enter",("'%s'.'%s' as '%s'  reopen: %d",
                       tables->db, tables->table_name, tables->alias,
-                      (int) reopen));
+                      reopen != 0));
 
   if (tables->schema_table)
   {
@@ -210,7 +246,7 @@ bool mysql_ha_open(THD *thd, TABLE_LIST *tables, bool reopen)
   if (! hash_inited(&thd->handler_tables_hash))
   {
     /*
-      HASH entries are of type TABLE_LIST.
+      HASH entries are of type SQL_HANDLER
     */
     if (hash_init(&thd->handler_tables_hash, &my_charset_latin1,
                   HANDLER_TABLES_HASH_SIZE, 0, 0,
@@ -288,8 +324,10 @@ bool mysql_ha_open(THD *thd, TABLE_LIST *tables, bool reopen)
   if (error)
     goto err;
 
+  table= tables->table;
+
   /* There can be only one table in '*tables'. */
-  if (! (tables->table->file->ha_table_flags() & HA_CAN_SQL_HANDLER))
+  if (! (table->file->ha_table_flags() & HA_CAN_SQL_HANDLER))
   {
     my_error(ER_ILLEGAL_HA, MYF(0), tables->alias);
     goto err;
@@ -297,36 +335,69 @@ bool mysql_ha_open(THD *thd, TABLE_LIST *tables, bool reopen)
 
   if (! reopen)
   {
-    /* copy the TABLE_LIST struct */
-    dblen= strlen(tables->db) + 1;
-    namelen= strlen(tables->table_name) + 1;
-    aliaslen= strlen(tables->alias) + 1;
-    if (!(my_multi_malloc(MYF(MY_WME),
-                          &hash_tables, (uint) sizeof(*hash_tables),
-                          &db, (uint) dblen,
-                          &name, (uint) namelen,
-                          &alias, (uint) aliaslen,
+    /* copy data to sql_handler */
+    if (!(sql_handler= new SQL_HANDLER(thd)))
+      goto err;
+    init_alloc_root(&sql_handler->mem_root, 1024, 0);
+
+    sql_handler->table= table;
+    sql_handler->db.length= strlen(tables->db);
+    sql_handler->table_name.length= strlen(tables->table_name);
+    sql_handler->handler_name.length= strlen(tables->alias);
+
+    if (!(my_multi_malloc(MY_WME,
+                          &sql_handler->db.str,
+                          (uint) sql_handler->db.length + 1,
+                          &sql_handler->table_name.str,
+                          (uint) sql_handler->table_name.length + 1,
+                          &sql_handler->handler_name.str,
+                          (uint) sql_handler->handler_name.length + 1,
                           NullS)))
       goto err;
-    /* structure copy */
-    *hash_tables= *tables;
-    hash_tables->db= db;
-    hash_tables->table_name= name;
-    hash_tables->alias= alias;
-    memcpy(hash_tables->db, tables->db, dblen);
-    memcpy(hash_tables->table_name, tables->table_name, namelen);
-    memcpy(hash_tables->alias, tables->alias, aliaslen);
+    sql_handler->base_data= sql_handler->db.str;  // Free this
+    memcpy(sql_handler->db.str, tables->db, sql_handler->db.length +1);
+    memcpy(sql_handler->table_name.str, tables->table_name,
+           sql_handler->table_name.length+1);
+    memcpy(sql_handler->handler_name.str, tables->alias,
+           sql_handler->handler_name.length +1);
 
     /* add to hash */
-    if (my_hash_insert(&thd->handler_tables_hash, (uchar*) hash_tables))
+    if (my_hash_insert(&thd->handler_tables_hash, (uchar*) sql_handler))
       goto err;
   }
+  else
+  {
+    sql_handler= reopen;
+    sql_handler->reset();
+  }    
+  sql_handler->table= table;
+
+  if (!(sql_handler->lock= get_lock_data(thd, &sql_handler->table, 1,
+                                         GET_LOCK_STORE_LOCKS,
+                                         &write_lock_used)))
+    goto err;
+
+  /* Get a list of all fields for send_fields */
+  thd->set_n_backup_active_arena(&sql_handler->arena, &backup_arena);
+  error= table->fill_item_list(&sql_handler->fields);
+  thd->restore_active_arena(&sql_handler->arena, &backup_arena);
+
+  if (error)
+  {
+    if (reopen)
+      sql_handler= 0;
+    goto err;
+  }
+
+  /* Always read all columns */
+  table->read_set= &table->s->all_set;
+  table->vcol_set= &table->s->all_set;
 
   /*
     If it's a temp table, don't reset table->query_id as the table is
     being used by this handler. Otherwise, no meaning at all.
   */
-  tables->table->open_by_handler= 1;
+  table->open_by_handler= 1;
 
   if (! reopen)
     my_ok(thd);
@@ -334,10 +405,13 @@ bool mysql_ha_open(THD *thd, TABLE_LIST *tables, bool reopen)
   DBUG_RETURN(FALSE);
 
 err:
-  if (hash_tables)
-    my_free((char*) hash_tables, MYF(0));
+  delete sql_handler;
   if (tables->table)
-    mysql_ha_close_table(thd, tables, FALSE);
+  {
+    SQL_HANDLER tmp_sql_handler(thd);
+    tmp_sql_handler.table= tables->table;
+    mysql_ha_close_table(&tmp_sql_handler, FALSE);
+  }
   DBUG_PRINT("exit",("ERROR"));
   DBUG_RETURN(TRUE);
 }
@@ -362,17 +436,17 @@ err:
 
 bool mysql_ha_close(THD *thd, TABLE_LIST *tables)
 {
-  TABLE_LIST    *hash_tables;
+  SQL_HANDLER *handler;
   DBUG_ENTER("mysql_ha_close");
   DBUG_PRINT("enter",("'%s'.'%s' as '%s'",
                       tables->db, tables->table_name, tables->alias));
 
-  if ((hash_tables= (TABLE_LIST*) hash_search(&thd->handler_tables_hash,
-                                              (uchar*) tables->alias,
-                                              strlen(tables->alias) + 1)))
+  if ((handler= (SQL_HANDLER*) hash_search(&thd->handler_tables_hash,
+                                           (uchar*) tables->alias,
+                                           strlen(tables->alias) + 1)))
   {
-    mysql_ha_close_table(thd, hash_tables, FALSE);
-    hash_delete(&thd->handler_tables_hash, (uchar*) hash_tables);
+    mysql_ha_close_table(handler, FALSE);
+    hash_delete(&thd->handler_tables_hash, (uchar*) handler);
   }
   else
   {
@@ -387,6 +461,167 @@ bool mysql_ha_close(THD *thd, TABLE_LIST *tables)
 }
 
 
+/**
+   Find an open HANDLER by name, re-opening its table if it was closed.
+
+   @param thd   Thread whose handler_tables_hash is searched
+   @param name  Name (alias) of the handler to find
+   @return 0        handler unknown (error raised) or re-open failed
+   @return handler  the SQL_HANDLER found in the hash
+*/
+
+SQL_HANDLER *mysql_ha_find_handler(THD *thd, const char *name)
+{
+  SQL_HANDLER *handler;
+  if ((handler= (SQL_HANDLER*) hash_search(&thd->handler_tables_hash,
+                                           (uchar*) name,
+                                           strlen(name) + 1)))
+  {
+    DBUG_PRINT("info-in-hash",("'%s'.'%s' as '%s' table: %p",
+                               handler->db.str,
+                               handler->table_name.str,
+                               handler->handler_name.str, handler->table));
+    if (!handler->table)
+    {
+      /* The handler table has been closed. Re-open it. */
+      TABLE_LIST tmp;
+      tmp.init_one_table(handler->db.str, handler->table_name.str,
+                         TL_READ);
+      tmp.alias= handler->handler_name.str;
+      /* Passing 'handler' makes mysql_ha_open() reuse it instead of a new one */
+      if (mysql_ha_open(thd, &tmp, handler))
+      {
+        DBUG_PRINT("exit",("reopen failed"));
+        return 0;
+      }
+    }
+  }
+  else
+  {
+    my_error(ER_UNKNOWN_TABLE, MYF(0), name, "HANDLER");
+    return 0;
+  }
+  return handler;
+}
+
+
+/**
+   Check that condition and key name are ok
+
+   @param handler       Open handler; receives keyno, key_len, keypart_map, mode
+   @param mode          Read mode (RFIRST, RNEXT etc...)
+   @param keyname       Key to use.
+   @param key_expr      List of key column values
+   @param cond          Where clause
+   @param in_prepare    If we are in prepare phase (we can't evaluate items yet)
+
+   @return 0 ok
+   @return 1 error
+
+   If ok, then values of used key and mode are stored in sql_handler
+*/
+
+static bool
+mysql_ha_fix_cond_and_key(SQL_HANDLER *handler,
+                          enum enum_ha_read_modes mode, char *keyname,
+                          List<Item> *key_expr,
+                          Item *cond, bool in_prepare)
+{
+  THD *thd= handler->thd;
+  TABLE *table= handler->table;
+  if (cond)
+  {
+    /* This can only be true for temp tables */
+    if (table->query_id != thd->query_id)
+      cond->cleanup();                          // File was reopened
+    if ((!cond->fixed &&
+         cond->fix_fields(thd, &cond)) || cond->check_cols(1))
+      return 1;
+  }
+
+  if (keyname)
+  {
+    /* Check if same as last keyname. If not, do a full lookup */
+    if (handler->keyno < 0 ||
+        my_strcasecmp(&my_charset_latin1,
+                      keyname,
+                      table->s->key_info[handler->keyno].name))
+    {
+      if ((handler->keyno= find_type(keyname, &table->s->keynames, 1+2)-1)<0)
+      {
+        /* handler_name is a LEX_STRING; the message needs its char* part */
+        my_error(ER_KEY_DOES_NOT_EXITS, MYF(0), keyname,
+                 handler->handler_name.str);
+        return 1;
+      }
+    }
+
+    /* Check key parts */
+    if (mode == RKEY)
+    {
+      KEY *keyinfo= table->key_info + handler->keyno;
+      KEY_PART_INFO *key_part= keyinfo->key_part;
+      List_iterator<Item> it_ke(*key_expr);
+      Item *item;
+      key_part_map keypart_map;
+      uint key_len;
+
+      if (key_expr->elements > keyinfo->key_parts)
+      {
+        my_error(ER_TOO_MANY_KEY_PARTS, MYF(0), keyinfo->key_parts);
+        return 1;
+      }
+      for (keypart_map= key_len=0 ; (item=it_ke++) ; key_part++)
+      {
+        my_bitmap_map *old_map;
+        /* note that 'item' can be changed by fix_fields() call */
+        if ((!item->fixed &&
+             item->fix_fields(thd, it_ke.ref())) ||
+            (item= *it_ke.ref())->check_cols(1))
+          return 1;
+        if (item->used_tables() & ~(RAND_TABLE_BIT | PARAM_TABLE_BIT))
+        {
+          my_error(ER_WRONG_ARGUMENTS,MYF(0),"HANDLER ... READ");
+          return 1;
+        }
+        if (!in_prepare)
+        {
+          old_map= dbug_tmp_use_all_columns(table, table->write_set);
+          (void) item->save_in_field(key_part->field, 1);
+          dbug_tmp_restore_column_map(table->write_set, old_map);
+        }
+        key_len+= key_part->store_length;
+        keypart_map= (keypart_map << 1) | 1;
+      }
+      handler->keypart_map= keypart_map;
+      handler->key_len= key_len;
+    }
+    else
+    {
+      /*
+        Check if the same index involved.
+        We need to always do this check because we may not have yet
+        called the handler since the last keyno change.
+      */
+      if ((uint) handler->keyno != table->file->get_index())
+      {
+        if (mode == RNEXT)
+          mode= RFIRST;
+        else if (mode == RPREV)
+          mode= RLAST;
+      }
+    }
+  }
+  else if (table->file->inited != handler::RND)
+  {
+    /* Convert RNEXT to RFIRST if we haven't started row scan */
+    if (mode == RNEXT)
+      mode= RFIRST;
+  }
+  handler->mode= mode;                          // Store adjusted mode
+  return 0;
+}
+
 /*
   Read from a HANDLER table.
 
@@ -413,153 +648,76 @@ bool mysql_ha_read(THD *thd, TABLE_LIST *tables,
                    enum ha_rkey_function ha_rkey_mode, Item *cond,
                    ha_rows select_limit_cnt, ha_rows offset_limit_cnt)
 {
-  TABLE_LIST    *hash_tables;
-  TABLE         *table, *backup_open_tables;
-  MYSQL_LOCK    *lock;
+  SQL_HANDLER   *handler;
+  TABLE         *table;
   List<Item>	list;
   Protocol	*protocol= thd->protocol;
   char		buff[MAX_FIELD_WIDTH];
   String	buffer(buff, sizeof(buff), system_charset_info);
-  int           error, keyno= -1;
+  int           error, keyno;
   uint          num_rows;
   uchar		*UNINIT_VAR(key);
-  uint		UNINIT_VAR(key_len);
   bool          need_reopen;
+  List_iterator<Item> it;
   DBUG_ENTER("mysql_ha_read");
   DBUG_PRINT("enter",("'%s'.'%s' as '%s'",
                       tables->db, tables->table_name, tables->alias));
 
-  thd->lex->select_lex.context.resolve_in_table_list_only(tables);
-  list.push_front(new Item_field(&thd->lex->select_lex.context,
-                                 NULL, NULL, "*"));
-  List_iterator<Item> it(list);
-  it++;
 
 retry:
-  if ((hash_tables= (TABLE_LIST*) hash_search(&thd->handler_tables_hash,
-                                              (uchar*) tables->alias,
-                                              strlen(tables->alias) + 1)))
+  if (!(handler= mysql_ha_find_handler(thd, tables->alias)))
+    goto err0;
+
+  table= handler->table;
+  tables->table= table;                         // This is used by fix_fields
+
+  /* save open_tables state */
+  if (handler->lock->lock_count > 0)
   {
-    table= hash_tables->table;
-    DBUG_PRINT("info-in-hash",("'%s'.'%s' as '%s' table: 0x%lx",
-                               hash_tables->db, hash_tables->table_name,
-                               hash_tables->alias, (long) table));
-    if (!table)
+    bool lock_error;
+
+    handler->lock->locks[0]->type= handler->lock->locks[0]->org_type;
+    lock_error= mysql_lock_tables(thd, handler->lock, 0, 
+                                  (MYSQL_LOCK_NOTIFY_IF_NEED_REOPEN |
+                                   (handler->table->s->tmp_table ==
+                                    NO_TMP_TABLE ?
+                                    MYSQL_LOCK_NOT_TEMPORARY : 0)),
+                                  &need_reopen);
+    if (need_reopen)
     {
-      /*
-        The handler table has been closed. Re-open it.
-      */
-      if (mysql_ha_open(thd, hash_tables, 1))
+      mysql_ha_close_table(handler, FALSE);
+      if (thd->stmt_arena->is_stmt_execute())
       {
-        DBUG_PRINT("exit",("reopen failed"));
+        /*
+          As we have already sent field list and types to the client, we can't
+          handle any changes in the table format for prepared statements.
+          Better to force a reprepare.
+        */
+        my_error(ER_NEED_REPREPARE, MYF(0));
         goto err0;
       }
 
-      table= hash_tables->table;
-      DBUG_PRINT("info",("re-opened '%s'.'%s' as '%s' tab %p",
-                         hash_tables->db, hash_tables->table_name,
-                         hash_tables->alias, table));
-    }
-
-#if MYSQL_VERSION_ID < 40100
-    if (*tables->db && strcmp(table->table_cache_key, tables->db))
-    {
-      DBUG_PRINT("info",("wrong db"));
-      table= NULL;
+      /*
+        The lock might have been aborted, we need to manually reset
+        thd->some_tables_deleted because handler's tables are closed
+        in a non-standard way. Otherwise we might loop indefinitely.
+      */
+      thd->some_tables_deleted= 0;
+      goto retry;
     }
-#endif
-  }
-  else
-    table= NULL;
-
-  if (!table)
-  {
-#if MYSQL_VERSION_ID < 40100
-    char buff[MAX_DBKEY_LENGTH];
-    if (*tables->db)
-      strxnmov(buff, sizeof(buff)-1, tables->db, ".", tables->table_name,
-               NullS);
-    else
-      strncpy(buff, tables->alias, sizeof(buff));
-    my_error(ER_UNKNOWN_TABLE, MYF(0), buff, "HANDLER");
-#else
-    my_error(ER_UNKNOWN_TABLE, MYF(0), tables->alias, "HANDLER");
-#endif
-    goto err0;
-  }
-  tables->table=table;
-
-  /* save open_tables state */
-  backup_open_tables= thd->open_tables;
-  /*
-    mysql_lock_tables() needs thd->open_tables to be set correctly to
-    be able to handle aborts properly. When the abort happens, it's
-    safe to not protect thd->handler_tables because it won't close any
-    tables.
-  */
-  thd->open_tables= thd->handler_tables;
-
-  lock= mysql_lock_tables(thd, &tables->table, 1,
-                          MYSQL_LOCK_NOTIFY_IF_NEED_REOPEN, &need_reopen);
-
-  /* restore previous context */
-  thd->open_tables= backup_open_tables;
-
-  if (need_reopen)
-  {
-    mysql_ha_close_table(thd, hash_tables, FALSE);
-    /*
-      The lock might have been aborted, we need to manually reset
-      thd->some_tables_deleted because handler's tables are closed
-      in a non-standard way. Otherwise we might loop indefinitely.
-    */
-    thd->some_tables_deleted= 0;
-    goto retry;
-  }
-
-  if (!lock)
-    goto err0; // mysql_lock_tables() printed error message already
 
-  // Always read all columns
-  tables->table->read_set= &tables->table->s->all_set;
-
-  if (cond)
-  {
-    if (table->query_id != thd->query_id)
-      cond->cleanup();                          // File was reopened
-    if ((!cond->fixed &&
-	 cond->fix_fields(thd, &cond)) || cond->check_cols(1))
-      goto err;
+    if (lock_error)
+      goto err0; // mysql_lock_tables() printed error message already
   }
 
-  if (keyname)
-  {
-    if ((keyno=find_type(keyname, &table->s->keynames, 1+2)-1)<0)
-    {
-      my_error(ER_KEY_DOES_NOT_EXITS, MYF(0), keyname, tables->alias);
-      goto err;
-    }
-    /* Check if the same index involved. */
-    if ((uint) keyno != table->file->get_index())
-    {
-      if (mode == RNEXT)
-        mode= RFIRST;
-      else if (mode == RPREV)
-        mode= RLAST;
-    }
-  }
-  else if (table->file->inited != handler::RND)
-  {
-    /* Convert RNEXT to RFIRST if we haven't started row scan */
-    if (mode == RNEXT)
-      mode= RFIRST;
-  }
-
-  if (insert_fields(thd, &thd->lex->select_lex.context,
-                    tables->db, tables->alias, &it, 0))
+  if (mysql_ha_fix_cond_and_key(handler, mode, keyname, key_expr, cond, 0))
     goto err;
+  mode= handler->mode;
+  keyno= handler->keyno;
 
-  protocol->send_fields(&list, Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF);
+  it.init(handler->fields);
+  protocol->send_fields(&handler->fields,
+                        Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF);
 
   /*
     In ::external_lock InnoDB resets the fields which tell it that
@@ -584,9 +742,7 @@ retry:
           error= table->file->ha_index_next(table->record[0]);
         }
         else
-        {
           error= table->file->ha_rnd_next(table->record[0]);
-        }
         break;
       }
       /* else fall through */
@@ -603,7 +759,7 @@ retry:
 	if (!(error= table->file->ha_rnd_init(1)))
           error= table->file->ha_rnd_next(table->record[0]);
       }
-      mode=RNEXT;
+      mode= RNEXT;
       break;
     case RPREV:
       DBUG_ASSERT(keyname != 0);
@@ -622,54 +778,28 @@ retry:
       table->file->ha_index_or_rnd_end();
       table->file->ha_index_init(keyno, 1);
       error= table->file->ha_index_last(table->record[0]);
-      mode=RPREV;
+      mode= RPREV;
       break;
     case RNEXT_SAME:
       /* Continue scan on "(keypart1,keypart2,...)=(c1, c2, ...)  */
       DBUG_ASSERT(keyname != 0);
-      error= table->file->ha_index_next_same(table->record[0], key, key_len);
+      error= table->file->ha_index_next_same(table->record[0], key,
+                                             handler->key_len);
       break;
     case RKEY:
     {
       DBUG_ASSERT(keyname != 0);
-      KEY *keyinfo=table->key_info+keyno;
-      KEY_PART_INFO *key_part=keyinfo->key_part;
-      if (key_expr->elements > keyinfo->key_parts)
-      {
-	my_error(ER_TOO_MANY_KEY_PARTS, MYF(0), keyinfo->key_parts);
-	goto err;
-      }
-      List_iterator<Item> it_ke(*key_expr);
-      Item *item;
-      key_part_map keypart_map;
-      for (keypart_map= key_len=0 ; (item=it_ke++) ; key_part++)
-      {
-        my_bitmap_map *old_map;
-	// 'item' can be changed by fix_fields() call
-        if ((!item->fixed &&
-             item->fix_fields(thd, it_ke.ref())) ||
-	    (item= *it_ke.ref())->check_cols(1))
-	  goto err;
-	if (item->used_tables() & ~RAND_TABLE_BIT)
-        {
-          my_error(ER_WRONG_ARGUMENTS,MYF(0),"HANDLER ... READ");
-	  goto err;
-        }
-        old_map= dbug_tmp_use_all_columns(table, table->write_set);
-	(void) item->save_in_field(key_part->field, 1);
-        dbug_tmp_restore_column_map(table->write_set, old_map);
-	key_len+=key_part->store_length;
-        keypart_map= (keypart_map << 1) | 1;
-      }
 
-      if (!(key= (uchar*) thd->calloc(ALIGN_SIZE(key_len))))
+      if (!(key= (uchar*) thd->calloc(ALIGN_SIZE(handler->key_len))))
 	goto err;
       table->file->ha_index_or_rnd_end();
       table->file->ha_index_init(keyno, 1);
-      key_copy(key, table->record[0], table->key_info + keyno, key_len);
+      key_copy(key, table->record[0], table->key_info + keyno,
+               handler->key_len);
       error= table->file->ha_index_read_map(table->record[0],
-                                            key, keypart_map, ha_rkey_mode);
-      mode=rkey_to_rnext[(int)ha_rkey_mode];
+                                            key, handler->keypart_map,
+                                            ha_rkey_mode);
+      mode= rkey_to_rnext[(int)ha_rkey_mode];
       break;
     }
     default:
@@ -689,6 +819,7 @@ retry:
                           "table '%s'",
                           error, tables->table_name);
         table->file->print_error(error,MYF(0));
+        table->file->ha_index_or_rnd_end();
         goto err;
       }
       goto ok;
@@ -716,13 +847,13 @@ retry:
     num_rows++;
   }
 ok:
-  mysql_unlock_tables(thd,lock);
+  mysql_unlock_tables(thd, handler->lock, 0);
   my_eof(thd);
   DBUG_PRINT("exit",("OK"));
   DBUG_RETURN(FALSE);
 
 err:
-  mysql_unlock_tables(thd,lock);
+  mysql_unlock_tables(thd, handler->lock, 0);
 err0:
   DBUG_PRINT("exit",("ERROR"));
   DBUG_RETURN(TRUE);
@@ -730,6 +861,28 @@ err0:
 
 
 /**
+   Prepare for handler read: find the handler and check condition and key.
+   For parameters, see mysql_ha_read().
+   @return the handler, or 0 if it was not found or a check failed
+*/
+
+SQL_HANDLER *mysql_ha_read_prepare(THD *thd, TABLE_LIST *tables,
+                                   enum enum_ha_read_modes mode, char *keyname,
+                                   List<Item> *key_expr, Item *cond)
+{
+  SQL_HANDLER *handler;
+  DBUG_ENTER("mysql_ha_read_prepare");
+  if (!(handler= mysql_ha_find_handler(thd, tables->alias)))
+    DBUG_RETURN(0);
+  tables->table= handler->table;         // This is used by fix_fields
+  if (mysql_ha_fix_cond_and_key(handler, mode, keyname, key_expr, cond, 1))
+    DBUG_RETURN(0);
+  DBUG_RETURN(handler);
+}
+
+  
+
+/**
   Scan the handler tables hash for matching tables.
 
   @param thd Thread identifier.
@@ -740,30 +893,32 @@ err0:
           table was matched.
 */
 
-static TABLE_LIST *mysql_ha_find(THD *thd, TABLE_LIST *tables)
+static SQL_HANDLER *mysql_ha_find_match(THD *thd, TABLE_LIST *tables)
 {
-  TABLE_LIST *hash_tables, *head= NULL, *first= tables;
-  DBUG_ENTER("mysql_ha_find");
+  SQL_HANDLER *hash_tables, *head= NULL;
+  TABLE_LIST *first= tables;
+  DBUG_ENTER("mysql_ha_find_match");
 
   /* search for all handlers with matching table names */
   for (uint i= 0; i < thd->handler_tables_hash.records; i++)
   {
-    hash_tables= (TABLE_LIST*) hash_element(&thd->handler_tables_hash, i);
+    hash_tables= (SQL_HANDLER*) hash_element(&thd->handler_tables_hash, i);
+    /* An empty tables->db matches a handler in any database */
     for (tables= first; tables; tables= tables->next_local)
     {
       if ((! *tables->db ||
-          ! my_strcasecmp(&my_charset_latin1, hash_tables->db, tables->db)) &&
-          ! my_strcasecmp(&my_charset_latin1, hash_tables->table_name,
+          ! my_strcasecmp(&my_charset_latin1, hash_tables->db.str,
+                          tables->db)) &&
+          ! my_strcasecmp(&my_charset_latin1, hash_tables->table_name.str,
                           tables->table_name))
+      {
+        /* Prepend to the result list, chained through SQL_HANDLER::next */
+        hash_tables->next= head;
+        head= hash_tables;
         break;
-    }
-    if (tables)
-    {
-      hash_tables->next_local= head;
-      head= hash_tables;
+      }
     }
   }
-
   DBUG_RETURN(head);
 }
 
@@ -780,18 +935,18 @@ static TABLE_LIST *mysql_ha_find(THD *thd, TABLE_LIST *tables)
 
 void mysql_ha_rm_tables(THD *thd, TABLE_LIST *tables, bool is_locked)
 {
-  TABLE_LIST *hash_tables, *next;
+  SQL_HANDLER *hash_tables, *next;
   DBUG_ENTER("mysql_ha_rm_tables");
 
   DBUG_ASSERT(tables);
 
-  hash_tables= mysql_ha_find(thd, tables);
+  hash_tables= mysql_ha_find_match(thd, tables);
 
   while (hash_tables)
   {
-    next= hash_tables->next_local;
+    next= hash_tables->next;
     if (hash_tables->table)
-      mysql_ha_close_table(thd, hash_tables, is_locked);
+      mysql_ha_close_table(hash_tables, is_locked);
     hash_delete(&thd->handler_tables_hash, (uchar*) hash_tables);
     hash_tables= next;
   }
@@ -811,16 +966,16 @@ void mysql_ha_rm_tables(THD *thd, TABLE_LIST *tables, bool is_locked)
 
 void mysql_ha_flush(THD *thd)
 {
-  TABLE_LIST *hash_tables;
+  SQL_HANDLER *hash_tables;
   DBUG_ENTER("mysql_ha_flush");
 
   safe_mutex_assert_owner(&LOCK_open);
 
   for (uint i= 0; i < thd->handler_tables_hash.records; i++)
   {
-    hash_tables= (TABLE_LIST*) hash_element(&thd->handler_tables_hash, i);
+    hash_tables= (SQL_HANDLER*) hash_element(&thd->handler_tables_hash, i);
     if (hash_tables->table && hash_tables->table->needs_reopen_or_name_lock())
-      mysql_ha_close_table(thd, hash_tables, TRUE);
+      mysql_ha_close_table(hash_tables, TRUE);
   }
 
   DBUG_VOID_RETURN;
@@ -837,14 +992,14 @@ void mysql_ha_flush(THD *thd)
 
 void mysql_ha_cleanup(THD *thd)
 {
-  TABLE_LIST *hash_tables;
+  SQL_HANDLER *hash_tables;
   DBUG_ENTER("mysql_ha_cleanup");
 
   for (uint i= 0; i < thd->handler_tables_hash.records; i++)
   {
-    hash_tables= (TABLE_LIST*) hash_element(&thd->handler_tables_hash, i);
+    hash_tables= (SQL_HANDLER*) hash_element(&thd->handler_tables_hash, i);
     if (hash_tables->table)
-      mysql_ha_close_table(thd, hash_tables, FALSE);
+      mysql_ha_close_table(hash_tables, FALSE);
    }
 
   hash_free(&thd->handler_tables_hash);
diff --git a/sql/sql_handler.h b/sql/sql_handler.h
new file mode 100644
index 00000000000..54e72e9f50e
--- /dev/null
+++ b/sql/sql_handler.h
@@ -0,0 +1,79 @@
+/* Copyright (C) 2010 Monty Program Ab
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef SQL_HANDLER_INCLUDED
+#define SQL_HANDLER_INCLUDED
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface			/* gcc class implementation */
+#endif
+
+/*
+  State of one open HANDLER. Objects for open handlers are kept in
+  thd->handler_tables_hash and are found by handler name.
+*/
+
+class SQL_HANDLER {
+public:
+  TABLE *table;                                 /* 0 if table is closed */
+  List<Item> fields;                            /* Fields, set on open */
+  THD *thd;
+  LEX_STRING handler_name;                      /* Alias used at open */
+  LEX_STRING db;
+  LEX_STRING table_name;
+  MEM_ROOT mem_root;                            /* Used by 'arena' */
+  MYSQL_LOCK *lock;
+
+  /* Used key and read mode, stored when a HANDLER READ is checked */
+  key_part_map keypart_map;
+  int keyno;                                    /* Used key, -1 if none */
+  uint key_len;
+  enum enum_ha_read_modes mode;
+
+  /* This is only used when deleting many handler objects */
+  SQL_HANDLER *next;
+
+  Query_arena arena;
+  char *base_data;                              /* Name strings buffer */
+  SQL_HANDLER(THD *thd_arg) :
+    thd(thd_arg), arena(&mem_root, Query_arena::INITIALIZED)
+  {
+    init();
+    clear_alloc_root(&mem_root);
+    base_data= 0;
+    /* Don't leave the remaining scalar members with undefined values */
+    keypart_map= 0;
+    key_len= 0;
+    mode= RFIRST;
+    next= 0;
+  }
+  void init() { keyno= -1; table= 0; lock= 0; }
+  void reset();
+
+  ~SQL_HANDLER();
+};
+
+bool mysql_ha_open(THD *thd, TABLE_LIST *tables, SQL_HANDLER *reopen);
+bool mysql_ha_close(THD *thd, TABLE_LIST *tables);
+bool mysql_ha_read(THD *, TABLE_LIST *,enum enum_ha_read_modes,char *,
+                   List<Item> *,enum ha_rkey_function,Item *,ha_rows,ha_rows);
+void mysql_ha_flush(THD *thd);
+void mysql_ha_rm_tables(THD *thd, TABLE_LIST *tables, bool is_locked);
+void mysql_ha_cleanup(THD *thd);
+
+SQL_HANDLER *mysql_ha_read_prepare(THD *thd, TABLE_LIST *tables,
+                                   enum enum_ha_read_modes mode, char *keyname,
+                                   List<Item> *key_expr, Item *cond);
+
+#endif /* SQL_HANDLER_INCLUDED */
diff --git a/sql/sql_insert.cc b/sql/sql_insert.cc
index 81f658b6a77..adc82d791b6 100644
--- a/sql/sql_insert.cc
+++ b/sql/sql_insert.cc
@@ -622,8 +622,7 @@ bool mysql_insert(THD *thd,TABLE_LIST *table_list,
   /*
     We can't write-delayed into a table locked with LOCK TABLES:
     this will lead to a deadlock, since the delayed thread will
-    never be able to get a lock on the table. QQQ: why not
-    upgrade the lock here instead?
+    never be able to get a lock on the table.
   */
   if (table_list->lock_type == TL_WRITE_DELAYED && thd->locked_tables &&
       find_locked_table(thd, table_list->db, table_list->table_name))
@@ -808,7 +807,12 @@ bool mysql_insert(THD *thd,TABLE_LIST *table_list,
           be overwritten by fill_record() anyway (and fill_record() does not
           use default values in this case).
         */
-        table->record[0][0]= share->default_values[0];
+#ifdef HAVE_valgrind
+        if (table->file->ha_table_flags() & HA_RECORD_MUST_BE_CLEAN_ON_WRITE)
+          restore_record(table,s->default_values);	// Get empty record
+        else
+#endif
+          table->record[0][0]= share->default_values[0];
 
         /* Fix undefined null_bits. */
         if (share->null_bytes > 1 && share->last_null_bit_pos)
diff --git a/sql/sql_join_cache.cc b/sql/sql_join_cache.cc
new file mode 100644
index 00000000000..e952cf3e2ef
--- /dev/null
+++ b/sql/sql_join_cache.cc
@@ -0,0 +1,4475 @@
+/* Copyright (C) 2000-2006 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file
+
+  @brief
+  join cache optimizations
+
+  @defgroup Query_Optimizer  Query Optimizer
+  @{
+*/
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation				// gcc: Class implementation
+#endif
+
+#include "mysql_priv.h"
+#include "sql_select.h"
+#include "opt_subselect.h"
+
+#define NO_MORE_RECORDS_IN_BUFFER  (uint)(-1)
+
+
+/*****************************************************************************
+ *  Join cache module
+******************************************************************************/
+
+/*
+  Fill in the descriptor of a flag field associated with a join cache
+
+  SYNOPSIS
+    add_flag_field_to_join_cache()
+      str           position in a record buffer to copy the field from/to
+      length        length of the field
+      field  IN/OUT pointer to the field descriptor to fill in
+
+  DESCRIPTION
+    The function fills in the descriptor of a cache flag field to which
+    the parameter 'field' points. The function uses the first two
+    parameters to set the position in the record buffer from/to which
+    the field value is to be copied and the length of the copied fragment.
+    Before returning the result the function increments the value of
+    *field by 1.
+    The function ignores the fields 'blob_length' and 'offset' of the
+    descriptor.
+
+  RETURN VALUE
+    the length of the field
+*/
+
+static
+uint add_flag_field_to_join_cache(uchar *str, uint length, CACHE_FIELD **field)
+{
+  CACHE_FIELD *copy= *field;
+  copy->str= str;
+  copy->length= length;
+  copy->type= 0;
+  copy->field= 0;
+  copy->referenced_field_no= 0;
+  (*field)++;
+  return length;    
+}
+
+
+/*
+  Fill in the descriptors of table data fields associated with a join cache
+
+  SYNOPSIS
+    add_table_data_fields_to_join_cache()
+      tab              descriptors of fields from this table are to be filled
+      field_set        descriptors for only these fields are to be created
+      field_cnt IN/OUT     counter of data fields
+      descr  IN/OUT        pointer to the first descriptor to be filled
+      field_ptr_cnt IN/OUT counter of pointers added to 'descr_ptr'
+      descr_ptr IN/OUT     pointer to the first pointer to blob descriptors
+
+  DESCRIPTION
+    The function fills in the descriptors of cache data fields from the table
+    'tab'. The descriptors are filled only for the fields marked in the
+    bitmap 'field_set'.
+    The function fills the descriptors starting from the position pointed
+    by 'descr'. If an added field is of a BLOB type then a pointer to
+    its descriptor is added to the array descr_ptr.
+    At the return 'descr' points to the position after the last added
+    descriptor while 'descr_ptr' points to the position right after the
+    last added pointer.
+
+  RETURN VALUE
+    the total length of the added fields
+*/
+
+static
+uint add_table_data_fields_to_join_cache(JOIN_TAB *tab, 
+                                         MY_BITMAP *field_set,
+                                         uint *field_cnt, 
+                                         CACHE_FIELD **descr,
+                                         uint *field_ptr_cnt,
+                                         CACHE_FIELD ***descr_ptr)
+{
+  Field **fld_ptr;
+  uint len= 0;
+  CACHE_FIELD *copy= *descr;
+  CACHE_FIELD **copy_ptr= *descr_ptr;
+  uint used_fields= bitmap_bits_set(field_set);
+  for (fld_ptr= tab->table->field; used_fields; fld_ptr++)
+  {
+    if (bitmap_is_set(field_set, (*fld_ptr)->field_index))
+    {
+      len+= (*fld_ptr)->fill_cache_field(copy);
+      if (copy->type == CACHE_BLOB)
+      {
+        *copy_ptr= copy;
+        copy_ptr++;
+        (*field_ptr_cnt)++;
+      }
+      copy->field= *fld_ptr;
+      copy->referenced_field_no= 0;
+      copy++;
+      (*field_cnt)++;
+      used_fields--;
+    }
+  }
+  *descr= copy;
+  *descr_ptr= copy_ptr;
+  return len;
+}
+
+/*
+  Get the next table whose records are stored in the join buffer of this cache
+
+  SYNOPSIS
+    get_next_table()
+      tab     the table for which the next table is to be returned
+
+  DESCRIPTION
+    For a given table whose records are stored in this cache the function
+    returns the next such table if there is any.
+    The function takes into account that the tables whose records are
+    stored in the same cache now can interleave with tables from
+    materialized semijoin subqueries.
+
+  TODO
+    This function should be modified/simplified after the new code for
+    materialized semijoins is merged.
+
+  RETURN
+    The next join table whose records are stored in the buffer of this cache
+    if such table exists, 0 - otherwise
+*/
+
+JOIN_TAB *JOIN_CACHE::get_next_table(JOIN_TAB *tab)
+{
+  /* The tables of this cache end right before join_tab */
+  if (++tab == join_tab)
+    return NULL;
+  if (join_tab->first_sjm_sibling)
+    return tab;
+  uint i= tab-join->join_tab;
+  /*
+  Temporary measure before MWL#90 refactorings are there: if 'tab' is at upper
+  level (i.e. it's not inside an SJM nest), still include into the join buffer
+  the tables from within SJM nest.  We might need the subquery's select list
+  columns, because SJ-Materialization-Scan unpacks data to those.
+
+  while (sj_is_materialize_strategy(join->best_positions[i].sj_strategy) &&
+         i < join->tables)
+    i+= join->best_positions[i].n_sj_tables;
+
+  */
+  return join->join_tab+i < join_tab ? join->join_tab+i : NULL; 
+}
+
+
+/*
+  Determine different counters of fields associated with a record in the cache
+
+  SYNOPSIS
+    calc_record_fields()
+
+  DESCRIPTION
+    The function counts the total number of fields stored in a record of
+    the cache and saves it in 'fields'; it also sets 'tables', 'blobs' and
+    'flag_fields' and zeroes the data field and referenced field counters.
+    The function sets 'with_match_flag' on if 'join_tab' needs a match flag
+    i.e. if it is the first inner table of an outer join or a semi-join.
+
+  RETURN VALUE
+    none
+*/
+
+void JOIN_CACHE::calc_record_fields()
+{
+  JOIN_TAB *tab = prev_cache ? prev_cache->join_tab :
+                                (join_tab->first_sjm_sibling ?
+			         join_tab->first_sjm_sibling :
+			         join->join_tab+join->const_tables);
+  tables= join_tab-tab;
+
+  fields= 0;
+  blobs= 0;
+  flag_fields= 0;
+  data_field_count= 0;
+  data_field_ptr_count= 0;
+  referenced_fields= 0;
+
+  for ( ; tab ; tab= get_next_table(tab))
+  {	    
+    tab->calc_used_field_length(FALSE);
+    flag_fields+= test(tab->used_null_fields || tab->used_uneven_bit_fields);
+    flag_fields+= test(tab->table->maybe_null);
+    fields+= tab->used_fields;
+    blobs+= tab->used_blobs;
+
+    fields+= tab->check_rowid_field();
+  }
+  if ((with_match_flag= join_tab->use_match_flag()))
+    flag_fields++;
+  fields+= flag_fields;
+}
+
+
+/* 
+  Collect information on join key arguments  
+
+  SYNOPSIS
+    collect_info_on_key_args()
+
+  DESCRIPTION
+    The function traverses the ref expressions that are used to access the
+    joined table join_tab. For each table 'tab' whose fields are to be stored
+    in the join buffer of the cache the function finds the fields from 'tab'
+    that occur in the ref expressions and marks these fields in the bitmap
+    tab->table->tmp_set. The function counts the number of them stored in
+    this cache and saves it in 'local_key_arg_fields'; the total number of them
+    stored in the previous caches is saved in 'external_key_arg_fields'.
+
+  NOTES
+    The function does not do anything if no key is used to join the records
+    from join_tab.
+    
+  RETURN VALUE
+    none 
+*/  
+
+void JOIN_CACHE::collect_info_on_key_args()
+{
+  local_key_arg_fields= 0;
+  external_key_arg_fields= 0;
+
+  /* Without key access there are no key arguments to look for */
+  if (!is_key_access())
+    return;
+
+  TABLE_REF *ref= &join_tab->ref;
+  /*
+    Visit this cache and then every previous cache in the chain. Key
+    arguments found in this cache are counted as local, those found in
+    the previous caches - as external.
+  */
+  for (JOIN_CACHE *cache= this; cache; cache= cache->prev_cache)
+  {
+    JOIN_TAB *cur= cache->join_tab - cache->tables;
+    while (cur)
+    {
+      TABLE *table= cur->table;
+
+      /* Mark in tmp_set the fields of the table occurring in ref items */
+      bitmap_clear_all(&table->tmp_set);
+      for (uint part= 0; part < ref->key_parts; part++)
+      {
+        Item *key_arg= ref->items[part];
+        if (table->map & key_arg->used_tables())
+          key_arg->walk(&Item::add_field_to_set_processor, 1,
+                        (uchar *) table);
+      }
+
+      uint key_args= bitmap_bits_set(&table->tmp_set);
+      if (cache == this)
+        local_key_arg_fields+= key_args;
+      else
+        external_key_arg_fields+= key_args;
+      cur= cache->get_next_table(cur);
+    }
+  }
+}
+
+
+/* 
+  Allocate memory for descriptors and pointers to them associated with the cache  
+
+  SYNOPSIS
+    alloc_fields()
+
+  DESCRIPTION
+    The function allocates memory for the array of fields descriptors
+    and the array of pointers to the field descriptors used to copy
+    join record data from record buffers into the join buffer and
+    backward. Some pointers refer to the field descriptor associated
+    with previous caches. They are placed at the beginning of the array
+    of pointers and its total number is stored in external_key_arg_fields.
+    The pointer of the first array is assigned to field_descr and the number
+    of the elements in it is precalculated by the function calc_record_fields. 
+    The allocated arrays are adjacent.
+  
+  NOTES
+    The memory is allocated in join->thd->memroot
+
+  RETURN VALUE
+    0 if the arrays have been allocated, 1 otherwise
+*/
+
+int JOIN_CACHE::alloc_fields()
+{
+  uint descr_bytes= sizeof(CACHE_FIELD)*fields;      /* descriptor array   */
+  uint n_ptrs= external_key_arg_fields+blobs+1;      /* pointer array size */
+  uchar *chunk= (uchar *) sql_alloc(descr_bytes + sizeof(CACHE_FIELD*)*n_ptrs);
+  field_descr= (CACHE_FIELD *) chunk;
+  blob_ptr= (CACHE_FIELD **) (chunk + descr_bytes);  /* right after descr. */
+  return (chunk == NULL);
+}
+
+
+/* 
+  Create descriptors of the record flag fields stored in the join buffer 
+
+  SYNOPSIS
+    create_flag_fields()
+
+  DESCRIPTION
+    The function creates descriptors of the record flag fields stored
+    in the join buffer. These are descriptors for:
+    - an optional match flag field,
+    - table null bitmap fields, 
+    - table null row fields.
+    The match flag field is created when 'join_tab' is the first inner
+    table of an outer join or a semi-join. A null bitmap field is
+    created for any table whose fields are to be stored in the join
+    buffer if at least one of these fields is nullable or is a BIT field
+    whose bits are partially stored with null bits. A null row flag
+    is created for any table assigned to the cache if it is an inner
+    table of an outer join.
+    The descriptor for flag fields are placed one after another at the
+    beginning of the array of field descriptors 'field_descr' that
+    contains 'fields' elements. If there is a match flag field the 
+    descriptor for it is always first in the sequence of flag fields.
+    The descriptors for other flag fields can follow in an arbitrary
+    order. 
+    The flag field values follow in a record stored in the join buffer
+    in the same order as field descriptors, with the match flag always
+    following first.
+    The function sets the value of 'flag_fields' to the total number
+    of the descriptors created for the flag fields.
+    The function sets the value of 'length' to the total length of the
+    flag fields.
+  
+  RETURN VALUE
+    none
+*/
+
+void JOIN_CACHE::create_flag_fields()
+{
+  /* Flag descriptors are laid out from the very start of field_descr */
+  CACHE_FIELD *next_descr= field_descr;
+
+  length= 0;
+
+  /* The match flag, when present, always occupies the first descriptor */
+  if (with_match_flag)
+    length+= add_flag_field_to_join_cache((uchar*) &join_tab->found,
+                                          sizeof(join_tab->found),
+                                          &next_descr);
+
+  /* Add the needed null bitmap and null row flag fields table by table */
+  JOIN_TAB *cur= join_tab-tables;
+  while (cur)
+  {
+    TABLE *table= cur->table;
+
+    /* A field for the null bitmap of the table, if it is needed */
+    if (cur->used_null_fields || cur->used_uneven_bit_fields)
+      length+= add_flag_field_to_join_cache(table->null_flags,
+                                            table->s->null_bytes,
+                                            &next_descr);
+
+    /* A field for the null row flag of the table, if it is needed */
+    if (table->maybe_null)
+      length+= add_flag_field_to_join_cache((uchar*) &table->null_row,
+                                            sizeof(table->null_row),
+                                            &next_descr);
+    cur= get_next_table(cur);
+  }
+
+  /* Theoretically the new value of flag_fields can be less than the old one */
+  flag_fields= next_descr-field_descr;
+}
+
+
+/* 
+  Create descriptors of the fields used to build access keys to the joined table
+
+  SYNOPSIS
+    create_key_arg_fields()
+
+  DESCRIPTION
+    The function creates descriptors of the record fields stored in the join
+    buffer that are used to build access keys to the joined table. These
+    fields are put into the buffer ahead of other records fields stored in
+    the buffer. Such placement helps to optimize construction of access keys.
+    For each field that is used to build access keys to the joined table but
+    is stored in some other join cache buffer the function saves a pointer
+    to the field descriptor. The array of such pointers is placed in
+    the join cache structure just before the array of pointers to the
+    blob fields blob_ptr.
+    Any field stored in a join cache buffer that is used to construct keys
+    to access tables associated with other join caches is called a referenced
+    field. It receives a unique number that is saved by the function in the
+    member 'referenced_field_no' of the CACHE_FIELD descriptor for the field.
+    This number is used as index to the array of offsets to the referenced
+    fields that are saved and put in the join cache buffer after all record
+    fields.
+    The function also finds out whether the keys to access join_tab
+    can be considered as embedded and, if so, sets the flag 'use_emb_key' in
+    this join cache appropriately. 
+     
+  NOTES
+    When a key to access the joined table 'join_tab' is constructed the array
+    of pointers to the field descriptors for the external fields is looked
+    through. For each of these pointers we find out in what previous key cache
+    the referenced field is stored. The value of 'referenced_field_no'
+    provides us with the index into the array of offsets for referenced
+    fields stored in the join cache. The offset read by the index allows
+    us to read the field without reading all other fields of the record
+    stored in the join cache buffer. This optimizes the construction of keys
+    to access 'join_tab' when some key arguments are stored in the previous
+    join caches.
+
+  NOTES
+    The function does not do anything if no key is used to join the records
+    from join_tab.
+ 
+  RETURN VALUE
+    none
+*/
+void JOIN_CACHE::create_key_arg_fields()
+{
+  JOIN_TAB *tab;
+  JOIN_CACHE *cache;
+  /* Descriptors of key arguments are needed only when a key is used */
+  if (!is_key_access())
+    return;
+
+  /* 
+    Save pointers to the cache fields in previous caches
+    that are used to build keys for this key access.
+  */
+  cache= this;
+  uint ext_key_arg_cnt= external_key_arg_fields;
+  CACHE_FIELD *copy;
+  CACHE_FIELD **copy_ptr= blob_ptr;
+  while (ext_key_arg_cnt)
+  {
+    cache= cache->prev_cache;
+    for (tab= cache->join_tab-cache->tables; tab;
+         tab= cache->get_next_table(tab))
+    { 
+      CACHE_FIELD *copy_end;
+      MY_BITMAP *key_read_set= &tab->table->tmp_set;
+      /* key_read_set contains the bitmap of tab's fields referenced by ref */
+      if (bitmap_is_clear_all(key_read_set))
+        continue;
+      copy_end= cache->field_descr+cache->fields;
+      for (copy= cache->field_descr+cache->flag_fields; copy < copy_end; copy++)
+      {
+        /*
+          (1) - when we store rowids for DuplicateWeedout, they have
+                copy->field==NULL
+        */
+        if (copy->field &&  // (1)
+            copy->field->table == tab->table &&
+            bitmap_is_set(key_read_set, copy->field->field_index))
+        {
+          *copy_ptr++= copy; 
+          ext_key_arg_cnt--;
+          if (!copy->referenced_field_no)
+          {
+            /* 
+              Register the referenced field 'copy' in the cache that stores it:
+              - set the offset number in copy->referenced_field_no,
+              - adjust the value of the flag 'with_length',
+              - adjust the values of 'base_prefix_length', 'pack_length'
+                and of 'pack_length_with_blob_ptrs'.
+	    */
+            copy->referenced_field_no= ++cache->referenced_fields;
+            if (!cache->with_length)
+            {
+              cache->with_length= TRUE;
+              uint sz= cache->get_size_of_rec_length();
+              cache->base_prefix_length+= sz;
+              cache->pack_length+= sz;
+              cache->pack_length_with_blob_ptrs+= sz;
+            }
+	    cache->pack_length+= cache->get_size_of_fld_offset();
+            cache->pack_length_with_blob_ptrs+= cache->get_size_of_fld_offset();
+          }        
+        }
+      }
+    } 
+  }
+  /* After this 'blob_ptr' shall not be changed */
+  blob_ptr= copy_ptr;
+  
+  /* Now create local fields that are used to build ref for this key access */
+  copy= field_descr+flag_fields;
+  for (tab= join_tab-tables; tab; tab= get_next_table(tab))
+  {
+    length+= add_table_data_fields_to_join_cache(tab, &tab->table->tmp_set,
+                                                 &data_field_count, &copy,
+                                                 &data_field_ptr_count, 
+                                                 &copy_ptr);
+  }
+  /* Record whether the keys can be treated as embedded in the join buffer */
+  use_emb_key= check_emb_key_usage();
+
+  return;
+}
+
+
+/* 
+  Create descriptors of all remaining data fields stored in the join buffer    
+
+  SYNOPSIS
+    create_remaining_fields()
+
+  DESCRIPTION
+    The function creates descriptors for all remaining data fields of a
+    record from the join buffer. If the value returned by is_key_access() is
+    false the function creates fields for all read record fields that
+    comprise the partial join record joined with join_tab. Otherwise, 
+    for each table tab, the set of the read fields for which the descriptors
+    have to be added is determined as the difference between all read fields
+    and those for which the descriptors have been already created.
+    The latter are supposed to be marked in the bitmap tab->table->tmp_set.
+    The function increases the value of 'length' by the total length of
+    the added fields.
+   
+  NOTES
+    If is_key_access() returns true the function modifies the value of
+    tab->table->tmp_set for each table whose fields are stored in the cache.
+    The function calls the method Field::fill_cache_field to figure out
+    the type of the cache field and the maximal length of its representation
+    in the join buffer. If this is a blob field then additionally a pointer
+    to this field is added as an element of the array blob_ptr. For a blob
+    field only the size of the length of the blob data is taken into account.
+    It is assumed that 'data_field_count' contains the number of descriptors
+    for data fields that have been already created and 'data_field_ptr_count'
+    contains the number of the pointers to such descriptors having been
+    stored up to the moment.
+
+  RETURN VALUE
+    none 
+*/
+
+void JOIN_CACHE::create_remaining_fields()
+{
+  const bool key_access= is_key_access();
+  CACHE_FIELD *descr= field_descr+flag_fields+data_field_count;
+  CACHE_FIELD **descr_ptr= blob_ptr+data_field_ptr_count;
+
+  JOIN_TAB *cur= join_tab-tables;
+  while (cur)
+  {
+    TABLE *table= cur->table;
+    MY_BITMAP *rem_field_set= table->read_set;
+
+    if (key_access)
+    {
+      /* Leave in tmp_set the read fields that were not marked in it before */
+      bitmap_invert(&table->tmp_set);
+      bitmap_intersect(&table->tmp_set, table->read_set);
+      rem_field_set= &table->tmp_set;
+    }
+
+    length+= add_table_data_fields_to_join_cache(cur, rem_field_set,
+                                                 &data_field_count, &descr,
+                                                 &data_field_ptr_count,
+                                                 &descr_ptr);
+
+    /* SemiJoinDuplicateElimination: allocate space for rowid if needed */
+    if (cur->keep_current_rowid)
+    {
+      descr->str= table->file->ref;
+      descr->length= table->file->ref_length;
+      descr->type= 0;
+      descr->field= 0;
+      descr->referenced_field_no= 0;
+      length+= descr->length;
+      data_field_count++;
+      descr++;
+    }
+    cur= get_next_table(cur);
+  }
+}
+
+
+
+/* 
+  Calculate and set all cache constants      
+
+  SYNOPSIS
+    set_constants()
+
+  DESCRIPTION
+    The function calculates and set all precomputed constants that are used
+    when writing records into the join buffer and reading them from it.
+    It calculates the size of offsets of a record within the join buffer
+    and of a field within a record. It also calculates the number of bytes
+    used to store record lengths.
+    The function also calculates the maximal length of the representation
+    of record in the cache excluding blob_data. This value is used when
+    making a decision whether more records should be added into the join
+    buffer or not.
+  
+  RETURN VALUE
+    none 
+*/
+
+void JOIN_CACHE::set_constants()
+{ 
+  /* 
+    Any record from a BKA cache is prepended with the record length.
+    We use the record length when reading the buffer and building key values
+    for each record. The length allows us not to read the fields that are
+    not needed for keys.
+    If a record has a match flag it also may be skipped when the match flag
+    is on. It happens if the cache is used for a semi-join operation or
+    for outer join when the 'not exist' optimization can be applied.
+    If some of the fields are referenced from other caches then
+    the record length allows us to easily reach the saved offsets for
+    these fields since the offsets are stored at the very end of the record.
+    However at this moment we don't know whether we have referenced fields for
+    the cache or not. Later when a referenced field is registered for the cache
+    we adjust the value of the flag 'with_length'.
+  */ 
+  with_length= is_key_access() || 
+               join_tab->is_inner_table_of_semi_join_with_first_match() ||
+               join_tab->is_inner_table_of_outer_join();
+  /* 
+     Upper estimate of a record size without blob data. 'referenced_fields'
+     is not known yet, but it can't exceed 'fields': count 'fields' offsets.
+  */
+  uint len= length + fields*sizeof(uint)+blobs*sizeof(uchar *) +
+            (prev_cache ? prev_cache->get_size_of_rec_offset() : 0) +
+            sizeof(ulong);
+  buff_size= max(join->thd->variables.join_buff_size, 2*len);
+  size_of_rec_ofs= offset_size(buff_size);  /* size of a record offset */
+  size_of_rec_len= blobs ? size_of_rec_ofs : offset_size(len); 
+  size_of_fld_ofs= size_of_rec_len;         /* size of a field offset */
+  base_prefix_length= (with_length ? size_of_rec_len : 0) +
+                      (prev_cache ? prev_cache->get_size_of_rec_offset() : 0);
+  /* 
+    The size of the offsets for referenced fields will be added later.
+    The values of 'pack_length' and 'pack_length_with_blob_ptrs' are adjusted
+    every time when the first reference to the referenced field is registered.
+  */
+  pack_length= (with_length ? size_of_rec_len : 0) +
+               (prev_cache ? prev_cache->get_size_of_rec_offset() : 0) + 
+               length;
+  pack_length_with_blob_ptrs= pack_length + blobs*sizeof(uchar *);
+}
+
+
+/* 
+  Get maximum total length of all affixes of a record in the join cache buffer
+
+  SYNOPSIS
+    get_record_max_affix_length()
+
+  DESCRIPTION
+    The function calculates the maximum possible total length of all affixes
+    of a record in the join cache buffer, that is made of:
+      - the length of all prefixes used in this cache,
+      - the length of the match flag if it's needed
+      - the total length of the maximum possible offsets to the fields of
+        a record in the buffer.
+
+  RETURN VALUE
+    The maximum total length of all affixes of a record in the join buffer  
+*/ 
+     
+uint JOIN_CACHE::get_record_max_affix_length()
+{
+  uint prefix_len= get_prefix_length();
+  uint match_flag_len= test(with_match_flag);
+  uint max_offsets_len= size_of_fld_ofs * data_field_count;
+  return prefix_len + match_flag_len + max_offsets_len;
+}
+
+
+/* 
+  Get the minimum possible size of the cache join buffer 
+
+  SYNOPSIS
+    get_min_join_buffer_size()
+
+  DESCRIPTION
+    At its first invocation for the cache the function calculates the
+    minimum possible size of the join buffer of the cache. This value depends
+    on the minimal number of records 'min_records' to be stored in the join
+    buffer. The number is supposed to be determined by the procedure that 
+    chooses the best access path to the joined table join_tab in the execution
+    plan. After the calculation of the interesting size the function saves it
+    in the field 'min_buff_size' in order to use it directly at the next     
+    invocations of the function.
+
+  NOTES
+    Currently the number of minimal records is just set to 1.
+
+  RETURN VALUE
+    The minimal possible size of the join buffer of this cache 
+*/
+
+ulong JOIN_CACHE::get_min_join_buffer_size()
+{
+  /* A non-zero min_buff_size means the size has been calculated already */
+  if (min_buff_size)
+    return min_buff_size;
+  size_t rec_len= 0;
+  for (JOIN_TAB *cur= join_tab-tables; cur < join_tab; cur++)
+    rec_len+= cur->get_max_used_fieldlength();
+  rec_len+= get_record_max_affix_length() +
+            get_max_key_addon_space_per_record();
+
+  size_t aux_sz= 0;
+  for (uint rec= 0; rec < min_records; rec++)
+    aux_sz+= join_tab_scan->aux_buffer_incr(rec+1);
+  avg_aux_buffer_incr= aux_sz/min_records;
+  size_t total_sz= rec_len*min_records + aux_sz + pack_length_with_blob_ptrs;
+  set_if_bigger(total_sz, 1);
+  min_buff_size= total_sz;
+  return min_buff_size;
+}
+
+
+/* 
+  Get the maximum possible size of the cache join buffer 
+
+  SYNOPSIS
+    get_max_join_buffer_size()
+
+    optimize_buff_size  TRUE <-> do not take more memory than needed for
+                        the estimated number of records in the partial join 
+
+  DESCRIPTION
+    At its first invocation for the cache the function calculates the
+    maximum possible size of join buffer for the cache. If the parameter
+    optimize_buff_size is true then this value does not exceed the size of the
+    space needed for the estimated number of records 'max_records' in the
+    partial join that joins tables from the first one through join_tab. This
+    value is also capped off by the value of join_tab->join_buffer_size_limit,
+    if it has been set to a non-zero value, and by the value of the system
+    parameter join_buffer_size - otherwise. After the calculation of the
+    interesting size the function saves the value in the field 'max_buff_size'
+    in order to use it directly at the next  invocations of the function.
+
+  NOTES
+    Currently the value of join_tab->join_buffer_size_limit is initialized
+    to 0 and is never reset.
+
+  RETURN VALUE
+    The maximum possible size of the join buffer of this cache 
+*/
+
+ulong JOIN_CACHE::get_max_join_buffer_size(bool optimize_buff_size)
+{
+  if (!max_buff_size)   /* the size has not been calculated yet */
+  {
+    size_t max_sz;
+    size_t min_sz= get_min_join_buffer_size(); 
+    size_t len= 0;
+    for (JOIN_TAB *tab= join_tab-tables; tab < join_tab; tab++)
+      len+= tab->get_used_fieldlength();
+    len+= get_record_max_affix_length();
+    avg_record_length= len;
+    len+= get_max_key_addon_space_per_record() + avg_aux_buffer_incr;
+    space_per_record= len;
+    /* The cap: join_buff_size, lowered by join_buffer_size_limit if set */
+    size_t limit_sz= join->thd->variables.join_buff_size;
+    if (join_tab->join_buffer_size_limit)
+      set_if_smaller(limit_sz, join_tab->join_buffer_size_limit);
+    if (!optimize_buff_size)
+      max_sz= limit_sz;
+    else
+    {    
+      /* Take only the space needed for max_records if it is below the cap */
+      if (limit_sz / max_records > space_per_record)
+        max_sz= space_per_record * max_records;
+      else
+        max_sz= limit_sz;
+      max_sz+= pack_length_with_blob_ptrs;
+      set_if_smaller(max_sz, limit_sz);
+    }
+    set_if_bigger(max_sz, min_sz);  /* never below the minimal size */
+    max_buff_size= max_sz;
+  }
+  return max_buff_size;
+}    
+      
+
+/* 
+  Allocate memory for a join buffer      
+
+  SYNOPSIS
+    alloc_buffer()
+
+  DESCRIPTION
+    The function allocates a lump of memory for the cache join buffer. 
+    Initially the function sets the size of the buffer buff_size equal to
+    the value returned by get_max_join_buffer_size(). If the total size of
+    the space intended to be used for the join buffers employed by the
+    tables from the first one through join_tab exceeds the value of the
+    system parameter join_buff_space_limit, then the function first tries
+    to shrink the used buffers to make the occupied space fit the maximum
+    memory allowed to be used for all join buffers in total. After
+    this the function tries to allocate a join buffer for join_tab.
+    If it fails to do so, it decrements the requested size of the join
+    buffer, shrinks proportionally the join buffers used for the previous
+    tables and tries to allocate a buffer for join_tab. In the case of a
+    failure the function repeats its attempts with smaller and smaller
+    requested sizes of the buffer, but not more than 4 times.
+  
+  RETURN VALUE
+    0   if the memory has been successfully allocated
+    1   otherwise
+*/
+
+int JOIN_CACHE::alloc_buffer()
+{
+  JOIN_TAB *tab;
+  JOIN_CACHE *cache;
+  ulonglong curr_buff_space_sz= 0;
+  ulonglong curr_min_buff_space_sz= 0;
+  ulonglong join_buff_space_limit=
+    join->thd->variables.join_buff_space_limit;
+  bool optimize_buff_size= 
+         optimizer_flag(join->thd, OPTIMIZER_SWITCH_OPTIMIZE_JOIN_BUFFER_SIZE);
+  double partial_join_cardinality=  (join_tab-1)->get_partial_join_cardinality();
+  buff= NULL;
+  min_buff_size= 0;
+  max_buff_size= 0;
+  min_records= 1;
+  max_records= (size_t) (partial_join_cardinality <= join_buff_space_limit ?
+                 (ulonglong) partial_join_cardinality : join_buff_space_limit);
+  set_if_bigger(max_records, 10);
+  min_buff_size= get_min_join_buffer_size();
+  buff_size= get_max_join_buffer_size(optimize_buff_size);
+  for (tab= join->join_tab+join->const_tables; tab <= join_tab; tab++)
+  {
+    cache= tab->cache;
+    if (cache)
+    {
+      curr_min_buff_space_sz+= cache->get_min_join_buffer_size();
+      curr_buff_space_sz+= cache->get_join_buffer_size();
+    }
+  }
+  /* Give up if the limit on the total join buffer space cannot be met */
+  if (curr_min_buff_space_sz > join_buff_space_limit ||
+      (curr_buff_space_sz > join_buff_space_limit &&
+       (!optimize_buff_size || 
+        join->shrink_join_buffers(join_tab, curr_buff_space_sz,
+                                  join_buff_space_limit))))
+    goto fail;
+  /* Try to allocate, requesting a smaller buffer after every failure */
+  for (ulong buff_size_decr= (buff_size-min_buff_size)/4 + 1; ; )
+  {
+    ulong next_buff_size;
+
+    if ((buff= (uchar*) my_malloc(buff_size, MYF(0))))
+      break;
+    /* Allocation failed: decrement the request, shrink the other buffers */
+    next_buff_size= buff_size > buff_size_decr ? buff_size-buff_size_decr : 0;
+    if (next_buff_size < min_buff_size ||
+        join->shrink_join_buffers(join_tab, curr_buff_space_sz,
+                                  curr_buff_space_sz-buff_size_decr))
+      goto fail;
+    buff_size= next_buff_size;
+    /* Sum up again the sizes of the join buffers of the caches */
+    curr_buff_space_sz= 0;
+    for (tab= join->join_tab+join->const_tables; tab <= join_tab; tab++)
+    {
+      cache= tab->cache;
+      if (cache)
+        curr_buff_space_sz+= cache->get_join_buffer_size();
+    } 
+  }
+  return 0;
+
+fail:
+  buff_size= 0;
+  return 1;
+}
+
+ 
+/*
+  Shrink the size of the cache join buffer in a given ratio
+
+  SYNOPSIS
+    shrink_join_buffer_in_ratio()
+      n           numerator of the ratio to shrink the buffer in
+      d           denominator of the ratio
+
+  DESCRIPTION
+    The function first deallocates the join buffer of the cache. Then
+    it allocates a buffer that is (n/d) times smaller.
+    
+  RETURN VALUE
+    FALSE   on success with allocation of the smaller join buffer 
+    TRUE    otherwise       
+*/
+
+bool JOIN_CACHE::shrink_join_buffer_in_ratio(ulonglong n, ulonglong d)
+{
+  /* With a ratio n/d below 1 there is nothing to shrink */
+  if (n < d)
+    return FALSE;
+  size_t shrunk_size= (size_t) ((double) buff_size / n * d);
+  set_if_bigger(shrunk_size, min_buff_size);  /* respect the minimal size */
+  buff_size= shrunk_size;
+  return realloc_buffer();
+}
+
+
+/*
+  Reallocate the join buffer of a join cache
+ 
+  SYNOPSIS
+    realloc_buffer()
+
+  DESCRIPTION
+    The function reallocates the join buffer of the join cache. After this
+    it resets the buffer for writing.
+
+  NOTES
+    The function assumes that buff_size contains the new value for the join
+    buffer size.  
+
+  RETURN VALUE
+    0   if the buffer has been successfully reallocated
+    1   otherwise
+*/
+
+int JOIN_CACHE::realloc_buffer()
+{
+  free();
+  buff= (uchar*) my_malloc(buff_size, MYF(0));
+  int alloc_failed= test(!buff);
+  reset(TRUE);                   /* prepare the buffer for writing */
+  return alloc_failed;
+}
+  
+
+/* 
+  Initialize a join cache       
+
+  SYNOPSIS
+    init()
+
+  DESCRIPTION
+    The function initializes the join cache structure. It is supposed to be called
+    by init methods for classes derived from the JOIN_CACHE.
+    The function allocates memory for the join buffer and for descriptors of
+    the record fields stored in the buffer.
+
+  NOTES
+    The code of this function should have been included into the constructor
+    code itself. However the new operator for the class JOIN_CACHE would
+    never fail while memory allocation for the join buffer is not absolutely
+    unlikely to fail. That's why this memory allocation has to be placed in a
+    separate function that is called in a couple with a cache constructor.
+    It is quite natural to put almost all other constructor actions into
+    this function.     
+  
+  RETURN VALUE
+    0   initialization with buffer allocations has been succeeded
+    1   otherwise
+*/
+
+int JOIN_CACHE::init()
+{
+  DBUG_ENTER("JOIN_CACHE::init");
+
+  /* Count the fields of a cache record and the key arguments among them */
+  calc_record_fields();
+  collect_info_on_key_args();
+
+  /* Build the field descriptors: flags, key arguments, then all the rest */
+  if (!alloc_fields())
+  {
+    create_flag_fields();
+    create_key_arg_fields();
+    create_remaining_fields();
+    set_constants();
+
+    /* The descriptors are ready: allocate the join buffer itself */
+    if (!alloc_buffer())
+    {
+      reset(TRUE);
+      DBUG_RETURN(0);
+    }
+  }
+
+  DBUG_RETURN(1);
+}
+
+
+/* 
+  Check the possibility to read the access keys directly from the join buffer       
+  SYNOPSIS
+    check_emb_key_usage()
+
+  DESCRIPTION
+    The function checks some conditions at which the key values can be read
+    directly from the join buffer. This is possible when the key values can be
+    composed by concatenation of the record fields stored in the join buffer.
+    Sometimes when the access key is multi-component the function has to re-order
+    the fields written into the join buffer to make keys embedded. If key 
+    values for the key access are detected as embedded then 'use_emb_key'
+    is set to TRUE.
+
+  EXAMPLE
+    Let table t2 have an index defined on the columns a,b . Let's assume also
+    that the columns t2.a, t2.b as well as the columns t1.a, t1.b are all
+    of the integer type. Then if the query
+      SELECT COUNT(*) FROM t1, t2 WHERE t1.a=t2.a and t1.b=t2.b  
+    is executed with a join cache in such a way that t1 is the driving
+    table then the key values to access table t2 can be read directly
+    from the join buffer.
+  
+  NOTES
+    In some cases key values could be read directly from the join buffer but
+    we still do not consider them embedded. In the future we'll expand
+    the class of keys which we identify as embedded.
+
+  NOTES
+    The function returns FALSE if no key is used to join the records
+    from join_tab.
+
+  RETURN VALUE
+    TRUE    key values will be considered as embedded,
+    FALSE   otherwise.
+*/
+
+bool JOIN_CACHE::check_emb_key_usage()
+{
+  /* Keys can be embedded only if join_tab is accessed by a key at all */
+  if (!is_key_access())
+    return FALSE;
+
+  uint i;
+  Item *item; 
+  KEY_PART_INFO *key_part;
+  CACHE_FIELD *copy;
+  CACHE_FIELD *copy_end;
+  uint len= 0;
+  TABLE_REF *ref= &join_tab->ref;
+  KEY *keyinfo= join_tab->get_keyinfo_by_key_no(ref->key);
+
+  /* 
+    If some of the key arguments are not from the local cache the key
+    is not considered as embedded.
+    TODO:
+    Expand it to the case when ref->key_parts=1 and local_key_arg_fields=0.
+  */  
+  if (external_key_arg_fields != 0)
+    return FALSE;
+  /* 
+    If the number of the local key arguments is not equal to the number
+    of key parts the key value cannot be read directly from the join buffer.   
+  */
+  if (local_key_arg_fields != ref->key_parts)
+    return FALSE;
+
+  /* 
+    A key is not considered embedded if one of the following is true:
+    - one of its key parts is not equal to a field
+    - it is a partial key
+    - definition of the argument field does not coincide with the
+      definition of the corresponding key component
+    - some of the key components are nullable
+  */  
+  for (i=0; i < ref->key_parts; i++)
+  {
+    item= ref->items[i]->real_item();
+    if (item->type() != Item::FIELD_ITEM)
+      return FALSE;
+    key_part= keyinfo->key_part+i;
+    if (key_part->key_part_flag & HA_PART_KEY_SEG)
+      return FALSE;
+    if (!key_part->field->eq_def(((Item_field *) item)->field))
+      return FALSE;
+    if (key_part->field->maybe_null())
+      return FALSE;
+  }
+  
+  copy= field_descr+flag_fields;
+  copy_end= copy+local_key_arg_fields;
+  for ( ; copy < copy_end; copy++)
+  {
+    /* 
+      If some of the key arguments are of variable length the key
+      is not considered as embedded.
+    */
+    if (copy->type != 0)
+      return FALSE;
+    /* 
+      If some of the key arguments are bit fields whose bits are partially
+      stored with null bits the key is not considered as embedded.
+    */
+    if (copy->field->type() == MYSQL_TYPE_BIT &&
+        ((Field_bit*) (copy->field))->bit_len)
+      return FALSE;
+    len+= copy->length;
+  }
+
+  emb_key_length= len;
+
+  /* 
+    Make sure that key fields follow the order of the corresponding
+    key components these fields are equal to. For this the descriptors
+    of the fields that comprise the key might be re-ordered.
+  */
+  for (i= 0; i < ref->key_parts; i++)
+  {
+    uint j;      /* position of the examined descriptor; 'i' must stay intact */
+    Item *item= ref->items[i]->real_item();
+    Field *fld= ((Item_field *) item)->field;
+    CACHE_FIELD *init_copy= field_descr+flag_fields+i; 
+    for (j= i, copy= init_copy; j < local_key_arg_fields;  j++, copy++)
+    {
+      if (fld->eq(copy->field))
+      {
+        if (j != i)
+        {
+          CACHE_FIELD key_part_copy= *copy;
+          *copy= *init_copy;
+          *init_copy= key_part_copy;
+        }
+        break;
+      }
+    }
+  }
+
+  return TRUE;
+}    
+
+
+/* 
+  Write record fields and their required offsets into the join cache buffer
+
+  SYNOPSIS
+    write_record_data()
+      link        a reference to the associated info in the previous cache
+      is_full OUT true if it has been decided that no more records will be
+                  added to the join buffer
+
+  DESCRIPTION
+    This function put into the cache buffer the following info that it reads
+    from the join record buffers or computes somehow:
+    (1) the length of all fields written for the record (optional)
+    (2) an offset to the associated info in the previous cache (if there is any)
+        determined by the link parameter
+    (3) all flag fields of the tables whose data field are put into the cache:
+        - match flag (optional),
+        - null bitmaps for all tables,
+        - null row flags for all tables
+    (4) values of all data fields including
+        - full images of those fixed length data fields that cannot have 
+          trailing spaces
+        - significant part of fixed length fields that can have trailing spaces
+          with the prepended length 
+        - data of non-blob variable length fields with the prepended data length  
+        - blob data from blob fields with the prepended data length
+    (5) record offset values for the data fields that are referred to from 
+        other caches
+ 
+    The record is written at the current position stored in the field 'pos'.
+    At the end of the function 'pos' points at the position right after the 
+    written record data.
+    The function increments the number of records in the cache that is stored
+    in the 'records' field by 1. The function also modifies the values of
+    'curr_rec_pos' and 'last_rec_pos' to point to the written record.
+    The 'end_pos' cursor is modified accordingly.
+    The 'last_rec_blob_data_is_in_rec_buff' is set on if the blob data 
+    remains in the record buffers and not copied to the join buffer. It may
+    happen only to the blob data from the last record added into the cache.
+    If on_precond is attached to join_tab and it is not evaluated to TRUE
+    then MATCH_IMPOSSIBLE is placed in the match flag field of the record
+    written into the join buffer.
+       
+  RETURN VALUE
+    length of the written record data
+*/
+
+uint JOIN_CACHE::write_record_data(uchar * link, bool *is_full)
+{
+  uint len;  /* record length estimate, later reused as per-field length */
+  bool last_record;  /* TRUE if no further record is expected to fit */
+  CACHE_FIELD *copy;
+  CACHE_FIELD *copy_end;
+  uchar *flags_pos;
+  uchar *cp= pos;
+  uchar *init_pos= cp;
+  uchar *rec_len_ptr= 0;
+  uint key_extra= extra_key_length();
+ 
+  records++;  /* Increment the counter of records in the cache */
+
+  len= pack_length + key_extra;
+
+  /* Grow the auxiliary buffer by 'incr', or by all remaining space if short */
+  uint incr= aux_buffer_incr(records);
+  size_t rem= rem_space();
+  aux_buff_size+= len+incr < rem ? incr : rem;
+
+  /*
+    For each blob to be put into cache save its length and a pointer
+    to the value in the corresponding element of the blob_ptr array.
+    Blobs with null values are skipped.
+    Increment 'len' by the total length of all these blobs. 
+  */    
+  if (blobs)
+  {
+    CACHE_FIELD **copy_ptr= blob_ptr;
+    CACHE_FIELD **copy_ptr_end= copy_ptr+blobs;
+    for ( ; copy_ptr < copy_ptr_end; copy_ptr++)
+    {
+      Field_blob *blob_field= (Field_blob *) (*copy_ptr)->field;
+      if (!blob_field->is_null())
+      {
+        uint blob_len= blob_field->get_length();
+        (*copy_ptr)->blob_length= blob_len;
+        len+= blob_len;
+        blob_field->get_ptr(&(*copy_ptr)->str);
+      }
+    }
+  }
+
+  /*
+    Check whether we won't be able to add any new record into the cache after
+    this one because the cache will be full. Set last_record to TRUE if it's so.
+    We assume that the cache will be full after the record has been written
+    into it if either the remaining space of the cache is not big enough for the 
+    record's blob values or if there is a chance that not all non-blob fields
+    of the next record can be placed there.
+    This function is called only in the case when there is enough space left in
+    the cache to store at least non-blob parts of the current record.
+  */
+  last_record= (len+pack_length_with_blob_ptrs+key_extra) > rem_space();
+  
+  /* 
+    Save the position for the length of the record in the cache if it's needed.
+    The length of the record will be inserted here when all fields of the record
+    are put into the cache.  
+  */
+  if (with_length)
+  {
+    rec_len_ptr= cp;   
+    cp+= size_of_rec_len;
+  }
+
+  /*
+    Put a reference to the fields of the record that are stored in the previous
+    cache if there is any. This reference is passed by the 'link' parameter.     
+  */
+  if (prev_cache)
+  {
+    cp+= prev_cache->get_size_of_rec_offset();
+    prev_cache->store_rec_ref(cp, link);
+  } 
+
+  curr_rec_pos= cp;
+  
+  /* If there is a match flag, set its value to 0 */
+  copy= field_descr;
+  if (with_match_flag)
+    *copy[0].str= 0;
+
+  /* First put into the cache the values of all flag fields */
+  copy_end= field_descr+flag_fields;
+  flags_pos= cp;
+  for ( ; copy < copy_end; copy++)
+  {
+    memcpy(cp, copy->str, copy->length);
+    cp+= copy->length;
+  } 
+  
+  /* Now put the values of the remaining fields as long as they are not nulls */ 
+  copy_end= field_descr+fields;
+  for ( ; copy < copy_end; copy++)
+  {
+    Field *field= copy->field;
+    if (field && field->maybe_null() && field->is_null())
+    {
+      /* Do not copy a field if its value is null */
+      if (copy->referenced_field_no)
+        copy->offset= 0;
+      continue;              
+    }
+    /* Save the offset of the field to put it later at the end of the record */ 
+    if (copy->referenced_field_no)
+      copy->offset= cp-curr_rec_pos;
+
+    if (copy->type == CACHE_BLOB)
+    {
+      Field_blob *blob_field= (Field_blob *) copy->field;
+      if (last_record)
+      {
+        last_rec_blob_data_is_in_rec_buff= 1;
+        /* Put down the length of the blob and the pointer to the data */  
+	blob_field->get_image(cp, copy->length+sizeof(char*),
+                              blob_field->charset());
+	cp+= copy->length+sizeof(char*);
+      }
+      else
+      {
+        /* First put down the length of the blob and then copy the data */ 
+	blob_field->get_image(cp, copy->length, 
+			      blob_field->charset());
+	memcpy(cp+copy->length, copy->str, copy->blob_length);               
+	cp+= copy->length+copy->blob_length;
+      }
+    }
+    else
+    {
+      switch (copy->type) {
+      case CACHE_VARSTR1:
+        /* Copy the 1-byte length and the significant part of the varstring */ 
+        len= (uint) copy->str[0] + 1;
+        memcpy(cp, copy->str, len);
+        cp+= len;
+        break;
+      case CACHE_VARSTR2:
+        /* Copy the 2-byte length and the significant part of the varstring */
+        len= uint2korr(copy->str) + 2;
+        memcpy(cp, copy->str, len);
+        cp+= len;
+        break;
+      case CACHE_STRIPPED:
+      {
+        /* 
+          Strip all trailing spaces off the field value, then write the
+          2-byte length of the remaining bytes followed by those bytes.
+        */ 
+	uchar *str, *end;
+	for (str= copy->str, end= str+copy->length;
+	     end > str && end[-1] == ' ';
+	     end--) ;
+	len=(uint) (end-str);
+        int2store(cp, len);
+	memcpy(cp+2, str, len);
+	cp+= len+2;
+        break;
+      }
+      default:      
+        /* Copy the entire image of the field from the record buffer */
+	memcpy(cp, copy->str, copy->length);
+	cp+= copy->length;
+      }
+    }
+  }
+  
+  /* Add the offsets of the fields that are referenced from other caches */ 
+  if (referenced_fields)
+  {
+    uint cnt= 0;
+    for (copy= field_descr+flag_fields; copy < copy_end ; copy++)
+    {
+      if (copy->referenced_field_no)
+      {
+        store_fld_offset(cp+size_of_fld_ofs*(copy->referenced_field_no-1),
+                         copy->offset);
+        cnt++;
+      }
+    }
+    cp+= size_of_fld_ofs*cnt;
+  }
+
+  if (rec_len_ptr)
+    store_rec_length(rec_len_ptr, (ulong) (cp-rec_len_ptr-size_of_rec_len));
+  last_rec_pos= curr_rec_pos; 
+  end_pos= pos= cp;
+  *is_full= last_record;
+
+  last_written_is_null_compl= 0;   
+  if (!join_tab->first_unmatched && join_tab->on_precond)
+  { /* Evaluate on_precond now; a false result rules out any match */
+    join_tab->found= 0;
+    join_tab->not_null_compl= 1;
+    if (!join_tab->on_precond->val_int())
+    {
+      flags_pos[0]= MATCH_IMPOSSIBLE;     
+      last_written_is_null_compl= 1;
+    }
+  } 
+      
+  return (uint) (cp-init_pos);
+}
+
+
+/* 
+  Reset the join buffer for reading/writing: default implementation
+
+  SYNOPSIS
+    reset()
+      for_writing  if it's TRUE the function reset the buffer for writing
+
+  DESCRIPTION
+    This default implementation of the virtual function reset() resets 
+    the join buffer for reading or writing.
+    If the buffer is reset for reading only the 'pos' value is reset
+    to point to the very beginning of the join buffer. If the buffer is
+    reset for writing additionally: 
+    - the counter of the records in the buffer is set to 0,
+    - the value of 'last_rec_pos' is set to point to the beginning of
+      the join buffer, 
+    - 'end_pos' is set to point to the beginning of the join buffer,
+    - the size of the auxiliary buffer is reset to 0,
+    - the flag 'last_rec_blob_data_is_in_rec_buff' is set to 0.
+    
+  RETURN VALUE
+    none
+*/
+
+void JOIN_CACHE::reset(bool for_writing)
+{
+  /* Both modes rewind the cursor to the start of the join buffer */
+  curr_rec_link= 0;
+  pos= buff;
+  if (!for_writing)
+    return;
+  /* Writing mode: additionally forget everything stored in the buffer */
+  last_rec_blob_data_is_in_rec_buff= 0;
+  aux_buff_size= 0;
+  records= 0;
+  end_pos= last_rec_pos= buff;
+}
+
+
+/* 
+  Add a record into the join buffer: the default implementation
+
+  SYNOPSIS
+    put_record()
+
+  DESCRIPTION
+    This default implementation of the virtual function put_record writes
+    the next matching record into the join buffer.
+    It also links the record having been written into the join buffer with
+    the matched record in the previous cache if there is any.
+    The implementation assumes that the function get_curr_link() 
+    will return exactly the pointer to this matched record.
+
+  RETURN VALUE
+    TRUE    if it has been decided that it should be the last record
+            in the join buffer,
+    FALSE   otherwise
+*/
+
+bool JOIN_CACHE::put_record()
+{
+  /* Reference to the matched record in the previous cache, if there is one */
+  uchar *prev_link= prev_cache ? prev_cache->get_curr_rec_link() : 0;
+  bool buffer_is_full;
+  /* The written length is not needed here, only the 'full' indicator */
+  write_record_data(prev_link, &buffer_is_full);
+  return buffer_is_full;
+}
+  
+
+/* 
+  Read the next record from the join buffer: the default implementation
+
+  SYNOPSIS
+    get_record()
+
+  DESCRIPTION
+    This default implementation of the virtual function get_record
+    reads fields of the next record from the join buffer of this cache.
+    The function also reads all other fields associated with this record
+    from the join buffers of the previous caches. The fields are read
+    into the corresponding record buffers.
+    It is assumed that 'pos' points to the position in the buffer
+    right after the previous record when the function is called.
+    When the function returns, the value of 'pos' has been updated to
+    point to the position after the read record.
+    The value of 'curr_rec_pos' is also updated by the function to
+    point to the beginning of the first field of the record in the
+    join buffer.    
+
+  RETURN VALUE
+    TRUE    there are no more records to read from the join buffer
+    FALSE   otherwise
+*/
+
+bool JOIN_CACHE::get_record()
+{
+  uchar *linked_rec= 0;
+  /* Step over the record length prefix when records are stored with it */
+  if (with_length)
+    pos+= size_of_rec_len;
+  /* Step over the link to the previous cache, remembering its target */
+  if (prev_cache)
+  {
+    pos+= prev_cache->get_size_of_rec_offset();
+    linked_rec= prev_cache->get_rec_ref(pos);
+  }
+  curr_rec_pos= pos;
+  if (read_all_record_fields() == NO_MORE_RECORDS_IN_BUFFER)
+    return TRUE;
+  pos+= referenced_fields*size_of_fld_ofs;  /* skip trailing field offsets */
+  if (prev_cache)
+    prev_cache->get_record_by_pos(linked_rec);
+  return FALSE;
+}
+
+
+/* 
+  Read a positioned record from the join buffer: the default implementation
+
+  SYNOPSIS
+    get_record_by_pos()
+      rec_ptr  position of the first field of the record in the join buffer
+
+  DESCRIPTION
+    This default implementation of the virtual function get_record_by_pos
+    reads the fields of the record positioned at 'rec_ptr' from the join buffer.
+    The function also reads all other fields associated with this record 
+    from the join buffers of the previous caches. The fields are read
+    into the corresponding record buffers.
+
+  RETURN VALUE
+    none
+*/
+
+void JOIN_CACHE::get_record_by_pos(uchar *rec_ptr)
+{
+  /* Read at 'rec_ptr' without disturbing the sequential read cursor */
+  uchar *const cursor= pos;
+  pos= rec_ptr;
+  read_all_record_fields();
+  pos= cursor;
+  if (!prev_cache)
+    return;
+  /* Follow the link to the associated record in the previous cache */
+  prev_cache->get_record_by_pos(prev_cache->get_rec_ref(rec_ptr));
+}
+
+
+/* 
+  Get the match flag from the referenced record: the default implementation
+
+  SYNOPSIS
+    get_match_flag_by_pos()
+      rec_ptr  position of the first field of the record in the join buffer
+
+  DESCRIPTION
+    This default implementation of the virtual function get_match_flag_by_pos
+    gets the match flag for the record pointed to by the reference at the
+    position rec_ptr. If the match flag is placed in one of the previous
+    buffers the function first reaches the linked record fields in that buffer.
+
+  RETURN VALUE
+    match flag for the record at the position rec_ptr
+*/
+
+enum JOIN_CACHE::Match_flag JOIN_CACHE::get_match_flag_by_pos(uchar *rec_ptr)
+{
+  /* The flag, when kept in this cache, is the first byte of the record */
+  if (with_match_flag)
+    return (enum Match_flag) rec_ptr[0];
+
+  /* Otherwise look for it by following the link to the previous cache */
+  if (prev_cache)
+  {
+    uchar *linked_rec= prev_cache->get_rec_ref(rec_ptr);
+    return prev_cache->get_match_flag_by_pos(linked_rec);
+  }
+  /* Callers are expected never to get here (asserted in debug builds) */
+  DBUG_ASSERT(0);
+  return MATCH_NOT_FOUND;
+}
+
+
+/* 
+  Calculate the increment of the auxiliary buffer for a record write
+
+  SYNOPSIS
+    aux_buffer_incr()
+      recno   the number of the record the increment to be calculated for
+
+  DESCRIPTION
+    This function calls the aux_buffer_incr method of the
+    companion member join_tab_scan to calculate the growth of the
+    auxiliary buffer when the recno-th record is added to the
+    join_buffer of this cache.
+
+  RETURN VALUE
+    the number of bytes in the increment 
+*/
+
+uint JOIN_CACHE::aux_buffer_incr(ulong recno)
+{ /* Pure delegation: the result comes from the join_tab_scan member */
+  return join_tab_scan->aux_buffer_incr(recno);
+}
+
+/* 
+  Read all flag and data fields of a record from the join buffer
+
+  SYNOPSIS
+    read_all_record_fields()
+
+  DESCRIPTION
+    The function reads all flag and data fields of a record from the join
+    buffer into the corresponding record buffers.
+    The fields are read starting from the position 'pos' which is
+    supposed to point to the beginning of the first record field.
+    The function increments the value of 'pos' by the length of the
+    read data. 
+
+  RETURN VALUE
+    (-1)   if there are no more records in the join buffer
+    length of the data read from the join buffer - otherwise
+*/
+
+uint JOIN_CACHE::read_all_record_fields()
+{
+  uchar *const rec_start= pos;
+
+  /* An empty buffer or a cursor past the last record: nothing to read */
+  if (!records || pos > last_rec_pos)
+    return NO_MORE_RECORDS_IN_BUFFER;
+
+  /* The flag fields (match flag, null bitmaps, null_row flags) come first */
+  read_flag_fields();
+
+  /* The data fields follow; read_record_field() advances 'pos' itself */
+  CACHE_FIELD *fld= field_descr+flag_fields;
+  CACHE_FIELD *const data_end= field_descr+fields;
+  const bool blobs_in_rec_buff= blob_data_is_in_rec_buff(rec_start);
+  while (fld < data_end)
+    read_record_field(fld++, blobs_in_rec_buff);
+  return (uint) (pos-rec_start);
+}
+
+
+/* 
+  Read all flag fields of a record from the join buffer
+
+  SYNOPSIS
+    read_flag_fields()
+
+  DESCRIPTION
+    The function reads all flag fields of a record from the join
+    buffer into the corresponding record buffers.
+    The fields are read starting from the position 'pos'.
+    The function increments the value of 'pos' by the length of the
+    read data. 
+
+  RETURN VALUE
+    length of the data read from the join buffer
+*/
+
+uint JOIN_CACHE::read_flag_fields()
+{
+  uchar *const flags_start= pos;
+  CACHE_FIELD *flag= field_descr;
+  CACHE_FIELD *const flags_end= field_descr+flag_fields;
+  if (with_match_flag)
+  {
+    /* Only a found/not-found indicator is stored at the destination */
+    flag->str[0]= test((Match_flag) pos[0] == MATCH_FOUND);
+    pos+= flag->length;
+    flag++;
+  }
+  /* All other flag fields are restored as raw byte images */
+  for ( ; flag < flags_end; pos+= flag->length, flag++)
+    memcpy(flag->str, pos, flag->length);
+  /* Report the number of bytes consumed from the join buffer */
+  return (pos-flags_start);
+}
+
+
+/* 
+  Read a data record field from the join buffer
+
+  SYNOPSIS
+    read_record_field()
+      copy             the descriptor of the data field to be read
+      blob_in_rec_buff indicates whether this is the field from the record
+                       whose blob data are in record buffers
+
+  DESCRIPTION
+    The function reads the data field specified by the parameter copy
+    from the join buffer into the corresponding record buffer. 
+    The field is read starting from the position 'pos'.
+    The data of blob values is not copied from the join buffer.
+    The function increments the value of 'pos' by the length of the
+    read data. 
+
+  RETURN VALUE
+    length of the data read from the join buffer
+*/
+
+uint JOIN_CACHE::read_record_field(CACHE_FIELD *copy, bool blob_in_rec_buff)
+{
+  Field *field= copy->field;
+  /* A field whose value is null has no image in the join buffer */
+  if (field && field->maybe_null() && field->is_null())
+    return 0;
+  uint consumed;
+  if (copy->type == CACHE_BLOB)
+  {
+    Field_blob *blob= (Field_blob *) field;
+    if (blob_in_rec_buff)
+    {
+      /* The buffer holds the blob length followed by a data pointer */
+      blob->set_image(pos, copy->length+sizeof(char*), blob->charset());
+      consumed= copy->length+sizeof(char*);
+    }
+    else
+    {
+      /* The blob data stays in the join buffer; only point the field at it */
+      blob->set_ptr(pos, pos+copy->length);
+      consumed= copy->length+blob->get_length();
+    }
+    pos+= consumed;
+    return consumed;
+  }
+  uchar *dst= copy->str;
+  switch (copy->type) {
+  case CACHE_VARSTR1:
+    /* 1-byte length prefix followed by the significant bytes */
+    consumed= (uint) pos[0] + 1;
+    memcpy(dst, pos, consumed);
+    break;
+  case CACHE_VARSTR2:
+    /* 2-byte length prefix followed by the significant bytes */
+    consumed= uint2korr(pos) + 2;
+    memcpy(dst, pos, consumed);
+    break;
+  case CACHE_STRIPPED:
+  {
+    /* Re-append the trailing spaces that were stripped off when writing */
+    uint stored= uint2korr(pos);
+    memcpy(dst, pos+2, stored);
+    memset(dst+stored, ' ', copy->length-stored);
+    consumed= stored+2;
+    break;
+  }
+  default:
+    /* Fixed-size image copied verbatim */
+    consumed= copy->length;
+    memcpy(dst, pos, consumed);
+  }
+  pos+= consumed;
+  return consumed;
+}
+
+
+/* 
+  Read a referenced field from the join buffer
+
+  SYNOPSIS
+    read_referenced_field()
+      copy         pointer to the descriptor of the referenced field
+      rec_ptr      pointer to the record that may contain this field
+      len  IN/OUT  total length of the record fields 
+
+  DESCRIPTION
+    The function checks whether copy points to a data field descriptor
+    for this cache object. If it does not then the function returns
+    FALSE. Otherwise the function reads the field of the record in
+    the join buffer pointed by 'rec_ptr' into the corresponding record
+    buffer and returns TRUE.
+    If the value of *len is 0 then the function sets it to the total
+    length of the record fields including possible trailing offset
+    values. Otherwise *len is supposed to provide this value that
+    has been obtained earlier. 
+
+  NOTE
+    If the value of the referenced field is null then the offset
+    for the value is set to 0. If the value of a field can be null
+    then the value of flag_fields is always positive. So the offset
+    for any non-null value cannot be 0 in this case. 
+
+  RETURN VALUE
+    TRUE   'copy' points to a data descriptor of this join cache
+    FALSE  otherwise
+*/
+
+bool JOIN_CACHE::read_referenced_field(CACHE_FIELD *copy,
+                                       uchar *rec_ptr, 
+                                       uint *len)
+{
+  /* Descriptors outside [field_descr, field_descr+fields) are not ours */
+  if (copy < field_descr || copy >= field_descr+fields)
+    return FALSE;
+  if (*len == 0)
+  {
+    /* The record length is stored just before the link to prev_cache */
+    uchar *len_ptr= rec_ptr;
+    if (prev_cache)
+      len_ptr-= prev_cache->get_size_of_rec_offset();
+    *len= get_rec_length(len_ptr-size_of_rec_len);
+  }
+
+  /* Locate the slot of this field in the offset table ending the record */
+  uchar *rec_start= rec_ptr;
+  if (prev_cache)
+    rec_start-= prev_cache->get_size_of_rec_offset();
+  uint offset= get_fld_offset(rec_start+ *len -
+                              size_of_fld_ofs*
+                              (referenced_fields+1-copy->referenced_field_no));
+
+  Field *field= copy->field;
+  /* A zero offset denotes a null value whenever flag fields are present */
+  if (offset == 0 && flag_fields)
+  {
+    field->set_null();
+    if (!field->real_maybe_null())
+      field->table->null_row= 1;
+    return TRUE;
+  }
+  field->set_notnull();
+  if (!field->real_maybe_null())
+    field->table->null_row= 0;
+  /* Read the value in place, then put the sequential cursor back */
+  uchar *const cursor= pos;
+  pos= rec_ptr+offset;
+  read_record_field(copy, blob_data_is_in_rec_buff(rec_ptr));
+  pos= cursor;
+  return TRUE;
+}
+   
+
+/* 
+  Skip record from join buffer if it's already matched: default implementation
+
+  SYNOPSIS
+    skip_if_matched()
+
+  DESCRIPTION
+    This default implementation of the virtual function skip_if_matched
+    skips the next record from the join buffer if its  match flag is set to 
+    MATCH_FOUND.
+    If the record is skipped the value of 'pos' is set to point to the position
+    right after the record.
+
+  RETURN VALUE
+    TRUE   the match flag is set to MATCH_FOUND and the record has been skipped
+    FALSE  otherwise
+*/
+
+bool JOIN_CACHE::skip_if_matched()
+{
+  DBUG_ASSERT(with_length);
+  /* Distance from 'pos' to the first field of the record */
+  uint hdr_len= size_of_rec_len;
+  if (prev_cache)
+    hdr_len+= prev_cache->get_size_of_rec_offset();
+  /* A record whose flag is not MATCH_FOUND is left for the caller to read */
+  if (get_match_flag_by_pos(pos+hdr_len) != MATCH_FOUND)
+    return FALSE;
+  /* Already matched: jump over the length prefix and the record body */
+  pos+= size_of_rec_len + get_rec_length(pos);
+  return TRUE;
+}
+
+
+/* 
+  Skip record from join buffer if the match isn't needed: default implementation
+
+  SYNOPSIS
+    skip_if_not_needed_match()
+
+  DESCRIPTION
+    This default implementation of the virtual function skip_if_not_needed_match
+    skips the next record from the join buffer if its match flag is not 
+    MATCH_NOT_FOUND, and, either its value is MATCH_FOUND and join_tab is the
+    first inner table of an inner join, or, its value is MATCH_IMPOSSIBLE
+    and join_tab is the first inner table of an outer join.
+    If the record is skipped the value of 'pos' is set to point to the position
+    right after the record.
+
+  RETURN VALUE
+    TRUE    the record has to be skipped
+    FALSE   otherwise 
+*/
+
+bool JOIN_CACHE::skip_if_not_needed_match()
+{
+  DBUG_ASSERT(with_length);
+  uint hdr_len= size_of_rec_len;
+  if (prev_cache)
+    hdr_len+= prev_cache->get_size_of_rec_offset();
+
+  enum Match_flag flag= get_match_flag_by_pos(pos+hdr_len);
+  /* A record whose flag is MATCH_NOT_FOUND is never skipped */
+  if (flag == MATCH_NOT_FOUND)
+    return FALSE;
+  if (join_tab->check_only_first_match() != (flag == MATCH_FOUND))
+    return FALSE;
+  pos+= size_of_rec_len + get_rec_length(pos);
+  return TRUE;
+}
+
+
+/* 
+  Restore the fields of the last record from the join buffer
+ 
+  SYNOPSIS
+    restore_last_record()
+
+  DESCRIPTION
+    This function restores the values of the fields of the last record put
+    into join buffer in record buffers. The values most probably have been
+    overwritten by the field values from other records when they were read
+    from the join buffer into the record buffer in order to check pushdown
+    predicates.
+
+  RETURN
+    none
+*/
+
+void JOIN_CACHE::restore_last_record()
+{
+  if (!records) return;   /* empty buffer: there is nothing to restore */
+  get_record_by_pos(last_rec_pos);
+}
+
+
+/*
+  Join records from the join buffer with records from the next join table    
+
+  SYNOPSIS
+    join_records()
+      skip_last    do not find matches for the last record from the buffer
+
+  DESCRIPTION
+    The function extends all records from the join buffer by the matched
+    records from join_tab. In the case of outer join operation it also
+    adds null complementing extensions for the records from the join buffer
+    that have no match. 
+    No extensions are generated for the last record from the buffer if
+    skip_last is true.  
+
+  NOTES
+    The function must make sure that if linked join buffers are used then
+    a join buffer cannot be refilled again until all extensions in the
+    buffers chained to this one are generated.
+    Currently an outer join operation with several inner tables always uses
+    at least two linked buffers with the match join flags placed in the
+    first buffer. Any record composed of rows of the inner tables that
+    matches a record in this buffer must refer to the position of the
+    corresponding match flag.
+
+  IMPLEMENTATION
+    When generating extensions for outer tables of an outer join operation
+    first we generate all extensions for those records from the join buffer
+    that have matches, after which null complementing extension for all
+    unmatched records from the join buffer are generated.  
+      
+  RETURN VALUE
+    return one of enum_nested_loop_state, except NESTED_LOOP_NO_MORE_ROWS.
+*/ 
+
+enum_nested_loop_state JOIN_CACHE::join_records(bool skip_last)
+{
+  JOIN_TAB *tab;
+  enum_nested_loop_state rc= NESTED_LOOP_OK;
+  bool outer_join_first_inner= join_tab->is_first_inner_for_outer_join();
+  /* Not in the null-complementing phase yet: mark join_tab accordingly */
+  if (outer_join_first_inner && !join_tab->first_unmatched)
+    join_tab->not_null_compl= TRUE;   
+
+  if (!join_tab->first_unmatched)
+  {
+    /* Find all records from join_tab that match records from join buffer */
+    rc= join_matching_records(skip_last);   
+    if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
+      goto finish;
+    if (outer_join_first_inner)
+    {
+      if (next_cache)
+      {
+        /* 
+          Ensure that all matches for outer records from join buffer are to be
+          found. Now we ensure that all full records are found for records from
+          join buffer. Generally this is an overkill.
+          TODO: Ensure that only matches of the inner table records have to be
+          found for the records from join buffer.
+	*/ 
+        rc= next_cache->join_records(skip_last);
+        if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
+          goto finish;
+      }
+      join_tab->not_null_compl= FALSE;
+      /* Prepare for generation of null complementing extensions */
+      for (tab= join_tab->first_inner; tab <= join_tab->last_inner; tab++)
+        tab->first_unmatched= join_tab->first_inner;
+    }
+  }
+  if (join_tab->first_unmatched)
+  {
+    if (is_key_access())
+      restore_last_record();
+
+    /* 
+      Generate all null complementing extensions for the records from
+      join buffer that don't have any matching rows from the inner tables.
+    */
+    reset(FALSE);
+    rc= join_null_complements(skip_last);   
+    if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
+      goto finish;
+  }
+  if(next_cache)
+  {
+    /* 
+      When using linked caches we must ensure the records in the next caches
+      that refer to the records in the join buffer are fully extended.
+      Otherwise we could have references to the records that have been
+      already erased from the join buffer and replaced for new records. 
+    */ 
+    rc= next_cache->join_records(skip_last);
+    if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
+      goto finish;
+  }
+  if (outer_join_first_inner)
+  {
+    /* 
+      All null complemented rows have been already generated for all
+      outer records from join buffer. Restore the state of the
+      first_unmatched values to 0 to avoid another null complementing.
+    */
+    for (tab= join_tab->first_inner; tab <= join_tab->last_inner; tab++)
+      tab->first_unmatched= 0;
+  } 
+ 
+  if (skip_last)
+  {
+    DBUG_ASSERT(!is_key_access());
+    /*
+       Restore the last record from the join buffer to generate
+       all extensions for it.
+    */
+    get_record();		               
+  }
+
+finish:  /* Reached on success and from every early exit above */
+  restore_last_record();
+  reset(TRUE);  /* set the buffer up for writing again */
+  return rc;
+}
+
+
+/*   
+  Find matches from the next table for records from the join buffer 
+
+  SYNOPSIS
+    join_matching_records()
+      skip_last    do not look for matches for the last partial join record 
+
+  DESCRIPTION
+    The function retrieves rows of the join_tab table and checks whether they
+    match partial join records from the join buffer. If a match is found
+    the function will call the sub_select function trying to look for matches
+    for the remaining join operations.
+    This function currently is called only from the function join_records.    
+    If the value of skip_last is true the function writes the partial join
+    record from the record buffer into the join buffer to save its value for
+    the future processing in the caller function.
+
+  NOTES
+    If employed by BNL or BNLH join algorithms the function performs a full
+    scan of join_tab for each refill of the join buffer. If BKA or BKAH
+    algorithms are used then the function iterates only over those records
+    from join_tab that can be accessed by keys built over records in the join
+    buffer. To apply a proper method of iteration the function just calls
+    virtual iterator methods (open, next, close) of the member join_tab_scan.
+    The member can be either of the JOIN_TAB_SCAN or JOIN_TAB_SCAN_MRR type.
+    The class JOIN_TAB_SCAN provides the iterator methods for BNL/BNLH join
+    algorithms. The class JOIN_TAB_SCAN_MRR provides the iterator methods
+    for BKA/BKAH join algorithms.
+    When the function looks for records from the join buffer that would
+    match a record from join_tab it iterates either over all records in
+    the buffer or only over selected records. If BNL join operation is
+    performed all records are checked for the match. If BNLH or BKAH
+    algorithm is employed to join join_tab then the function looks only
+    through the records with the same join key as the record from join_tab.
+    With the BKA join algorithm only one record from the join buffer is checked
+    for a match for any record from join_tab. To iterate over the candidates
+    for a match the virtual function get_next_candidate_for_match is used,
+    while the virtual function prepare_look_for_matches is called to prepare
+    for such an iteration process.
+
+  NOTES
+    The function produces all matching extensions for the records in the 
+    join buffer following the path of the employed blocked algorithm. 
+    When an outer join operation is performed all unmatched records from
+    the join buffer must be extended by null values. The function 
+    'join_null_complements' serves this purpose.  
+      
+  RETURN VALUE
+    return one of enum_nested_loop_state
+*/ 
+
+enum_nested_loop_state JOIN_CACHE::join_matching_records(bool skip_last)
+{
+  int error;  /* status of the last join_tab_scan call, decoded at 'finish' */
+  enum_nested_loop_state rc= NESTED_LOOP_OK;
+  join_tab->table->null_row= 0;
+  bool check_only_first_match= join_tab->check_only_first_match();
+  bool outer_join_first_inner= join_tab->is_first_inner_for_outer_join();
+
+  /* Return at once if there are no records in the join buffer */
+  if (!records)
+    return NESTED_LOOP_OK;
+
+  /*
+    When joining we read records from the join buffer back into record buffers.
+    If matches for the last partial join record are found through a call to
+    the sub_select function then this partial join record must be saved in the
+    join buffer in order to be restored just before the sub_select call.
+  */
+  if (skip_last)
+    put_record();
+
+  if (join_tab->use_quick == 2 && join_tab->select->quick)
+  {
+    /* A dynamic range access was used last. Clean up after it */
+    delete join_tab->select->quick;
+    join_tab->select->quick= 0;
+  }
+
+  /* Prepare to retrieve all records of the joined table */
+  if ((error= join_tab_scan->open()))
+    goto finish; /* psergey-note: if open() fails, we will assert in net_send_statement() */
+
+  while (!(error= join_tab_scan->next()))
+  {
+    if (join->thd->killed)
+    {
+      /* The user has aborted the execution of the query */
+      join->thd->send_kill_message();
+      rc= NESTED_LOOP_KILLED;
+      goto finish;
+    }
+
+    if (join_tab->keep_current_rowid)
+      join_tab->table->file->position(join_tab->table->record[0]);
+
+    /* Prepare to read matching candidates; a TRUE result skips this record */
+    if (prepare_look_for_matches(skip_last))
+      continue;
+
+    uchar *rec_ptr;
+    /* Read each possible candidate from the buffer and look for matches */
+    while ((rec_ptr= get_next_candidate_for_match()))
+    {
+      /*
+        If only the first match is needed, and it has been already found for
+        the next record read from the join buffer, then the record is skipped.
+        Also those records that must be null complemented are not considered
+        as candidates for matches.
+      */
+      if ((!check_only_first_match && !outer_join_first_inner) ||
+          !skip_next_candidate_for_match(rec_ptr))
+      {
+	read_next_candidate_for_match(rec_ptr);
+        rc= generate_full_extensions(rec_ptr);
+        if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
+	  goto finish;
+      }
+    }
+  }
+
+finish:
+  if (error)  /* a negative status becomes NO_MORE_ROWS, a positive one ERROR */
+    rc= error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR;
+  join_tab_scan->close();
+  return rc;
+}
+
+
+/*
+  Turn on the match flag of a buffered record unless it is on already
+
+  SYNOPSIS
+    set_match_flag_if_none()
+      first_inner     the join table the match flag is attached to
+      rec_ptr         pointer to the record in the join buffer
+
+  DESCRIPTION
+    When the records of first_inner are accumulated in a join buffer the
+    function follows the references from the record of this cache located
+    at 'rec_ptr' back to the record kept in the cache of first_inner and
+    turns on the match flag of that record if it is not on yet.
+    The flag 'found' of first_inner is turned on whenever TRUE is returned.
+
+  NOTES
+    The function assumes that the match flag for any record in any cache
+    is placed in the first byte occupied by the record fields.
+
+  RETURN VALUE
+    TRUE   the match flag is set by this call for the first time
+    FALSE  the match flag has been set before this call
+*/
+
+bool JOIN_CACHE::set_match_flag_if_none(JOIN_TAB *first_inner,
+                                        uchar *rec_ptr)
+{
+  if (!first_inner->cache)
+  {
+    /*
+      The records of first_inner are not accumulated in a join buffer,
+      so the only flag to maintain is first_inner->found.
+    */
+    bool newly_set= !first_inner->found;
+    if (newly_set)
+      first_inner->found= 1;
+    return newly_set;
+  }
+
+  /*
+    Walk back through the chain of caches until the one attached to
+    first_inner is reached, translating rec_ptr at every step.
+  */
+  for (JOIN_CACHE *owner= this; owner->join_tab != first_inner; )
+  {
+    owner= owner->prev_cache;
+    DBUG_ASSERT(owner);
+    rec_ptr= owner->get_rec_ref(rec_ptr);
+  }
+
+  if ((Match_flag) rec_ptr[0] == MATCH_FOUND)
+    return FALSE;
+  rec_ptr[0]= MATCH_FOUND;
+  first_inner->found= 1;
+  return TRUE;
+}
+
+
+/*
+  Generate all full extensions for a partial join record in the buffer
+
+  SYNOPSIS
+    generate_full_extensions()
+      rec_ptr     pointer to the record from join buffer to be extended
+
+  DESCRIPTION
+    The function first checks whether the current record of 'join_tab' matches
+    the partial join record from the join buffer located at 'rec_ptr'. If so,
+    and the match is not rejected by the weed-out table of 'join_tab', the
+    function calls join_tab->next_select to generate all full extensions.
+
+  RETURN VALUE
+    one of enum_nested_loop_state
+*/
+
+enum_nested_loop_state JOIN_CACHE::generate_full_extensions(uchar *rec_ptr)
+{
+  /*
+    Nothing is generated unless the extended partial join record
+    meets the pushdown conditions.
+  */
+  if (!check_match(rec_ptr))
+    return NESTED_LOOP_OK;
+
+  /*
+    When a weed-out table is attached to join_tab the record is passed
+    through it first: -1 is reported as an error, any other non-zero
+    result means that the record must not be extended.
+  */
+  if (join_tab->check_weed_out_table)
+  {
+    int weedout_res= do_sj_dups_weedout(join->thd,
+                                        join_tab->check_weed_out_table);
+    if (weedout_res == -1)
+      return NESTED_LOOP_ERROR;
+    if (weedout_res)
+      return NESTED_LOOP_OK;
+  }
+
+  /* Pass the record on to the next step of the join */
+  set_curr_rec_link(rec_ptr);
+  enum_nested_loop_state rc= (join_tab->next_select)(join, join_tab+1, 0);
+  if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
+    reset(TRUE);
+  return rc;
+}
+
+
+/*
+  Check matching to a partial join record from the join buffer
+
+  SYNOPSIS
+    check_match()
+      rec_ptr     pointer to the record from join buffer to check matching to
+
+  DESCRIPTION
+    The function checks whether the current record of 'join_tab' matches
+    the partial join record from join buffer located at 'rec_ptr'. If this is
+    the case and 'join_tab' is the last inner table of a semi-join or an outer
+    join the function turns on the match flag for the 'rec_ptr' record unless
+    it has been already set.
+
+  NOTES
+    Setting the match flag on can trigger re-evaluation of pushdown conditions
+    for the record when join_tab is the last inner table of an outer join.
+
+  RETURN VALUE
+    TRUE   there is a match
+    FALSE  there is no match
+*/
+
+inline bool JOIN_CACHE::check_match(uchar *rec_ptr)
+{
+  /* Check the pushdown conditions; any skip_record() result <= 0 rejects */
+  if (join_tab->select && join_tab->select->skip_record(join->thd) <= 0)
+    return FALSE;
+
+  if (!join_tab->is_last_inner_table())
+    return TRUE;
+
+  /*
+     This is the last inner table of an outer join,
+     and maybe of other embedding outer joins, or
+     this is the last inner table of a semi-join.
+  */
+  JOIN_TAB *first_inner= join_tab->get_first_inner_table();
+  do
+  {
+    set_match_flag_if_none(first_inner, rec_ptr);
+    if (first_inner->check_only_first_match() &&
+        !join_tab->first_inner)
+      return TRUE;
+    /*
+      This is the first match for the outer table row.
+      The function set_match_flag_if_none has turned the flag
+      first_inner->found on. The pushdown predicates for
+      inner tables must be re-evaluated with this flag on.
+      Note that, if first_inner is the first inner table
+      of a semi-join, but is not an inner table of an outer join
+      such that 'not exists' optimization can be applied to it,
+      the re-evaluation of the pushdown predicates is not needed.
+    */
+    for (JOIN_TAB *tab= first_inner; tab <= join_tab; tab++)
+    {
+      if (tab->select && tab->select->skip_record(join->thd) <= 0)
+        return FALSE;
+    }
+  }
+  while ((first_inner= first_inner->first_upper) &&
+         first_inner->last_inner == join_tab);
+
+  return TRUE;
+}
+
+
+/*
+  Add null complements for unmatched outer records from join buffer
+
+  SYNOPSIS
+    join_null_complements()
+      skip_last    do not add null complements for the last record
+
+  DESCRIPTION
+    This function is called only for inner tables of outer joins.
+    The function goes through the rows of the join buffer and adds null
+    complements for those of them that do not have matches for outer
+    table records.
+    If the 'join_tab' is the last inner table of the embedding outer
+    join and the null complemented record satisfies the outer join
+    condition then the corresponding match flag is turned on
+    unless it has been set earlier. This setting may trigger
+    re-evaluation of pushdown conditions for the record.
+
+  NOTES
+    The same implementation of the virtual method join_null_complements
+    is used for the BNL/BNLH/BKA/BKAH join algorithms.
+
+  RETURN VALUE
+    one of enum_nested_loop_state
+*/
+
+enum_nested_loop_state JOIN_CACHE::join_null_complements(bool skip_last)
+{
+  enum_nested_loop_state rc= NESTED_LOOP_OK;
+  const bool is_first_inner= join_tab == join_tab->first_unmatched;
+
+  /* An empty join buffer leaves nothing to complement */
+  if (!records)
+    return rc;
+
+  /* Unless key access is used skip_last leaves one record out */
+  ulonglong left= records;
+  if (!is_key_access() && skip_last)
+    left--;
+
+  /* This function may be called only for inner tables of outer joins */
+  DBUG_ASSERT(join_tab->first_inner);
+
+  for ( ; left; left--)
+  {
+    if (join->thd->killed)
+    {
+      /* The user has aborted the execution of the query */
+      join->thd->send_kill_message();
+      return NESTED_LOOP_KILLED;
+    }
+
+    /* Skip the whole record when a match for it has been found already */
+    if (is_first_inner && skip_if_matched())
+      continue;
+
+    get_record();
+    /* The outer row is complemented by nulls for each inner table */
+    restore_record(join_tab->table, s->default_values);
+    mark_as_null_row(join_tab->table);
+    rc= generate_full_extensions(get_curr_rec());
+    if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
+      break;
+  }
+  return rc;
+}
+
+
+/*
+  Describe the join algorithm employed by the join cache
+
+  SYNOPSIS
+    print_explain_comment()
+      str  string the description is appended to
+
+  DESCRIPTION
+    The function appends to the end of the string str the type of the used
+    join buffer (flat or incremental) and the name of the employed join
+    algorithm (BNL, BNLH, BKA or BKAH), e.g. " (flat, BNL join)".
+
+  RETURN VALUE
+    none
+*/
+
+void JOIN_CACHE::print_explain_comment(String *str)
+{
+  /*
+    A cache that is linked to a previous cache is reported as
+    incremental, any other cache as flat.
+  */
+  const char *buf_kind= "flat";
+  if (prev_cache)
+    buf_kind= "incremental";
+
+  const char *alg_name= "";
+  switch (get_join_alg()) {
+  case BNL_JOIN_ALG:  alg_name= "BNL";  break;
+  case BNLH_JOIN_ALG: alg_name= "BNLH"; break;
+  case BKA_JOIN_ALG:  alg_name= "BKA";  break;
+  case BKAH_JOIN_ALG: alg_name= "BKAH"; break;
+  default:
+    DBUG_ASSERT(0);
+  }
+
+  /*
+    Compose the text " (<buffer type>, <algorithm> join)"
+  */
+  str->append(STRING_WITH_LEN(" ("));
+  str->append(buf_kind);
+  str->append(STRING_WITH_LEN(", "));
+  str->append(alg_name);
+  str->append(STRING_WITH_LEN(" join"));
+  str->append(STRING_WITH_LEN(")"));
+}
+   
+
+/*
+  Initialize a hashed join cache
+
+  SYNOPSIS
+    init()
+
+  DESCRIPTION
+    The function initializes the cache structure with a hash table in it.
+    The hash table will be used to store key values for the records from
+    the join buffer.
+    The function allocates memory for the join buffer and for descriptors of
+    the record fields stored in the buffer.
+    The function also initializes a hash table for record keys within the join
+    buffer space.
+
+  NOTES
+    The function is supposed to be called by the init methods of the classes
+    derived from JOIN_CACHE_HASHED.
+
+  RETURN VALUE
+    0   initialization with buffer allocations has succeeded
+    1   otherwise
+*/
+
+int JOIN_CACHE_HASHED::init()
+{
+  int rc= 0;
+  TABLE_REF *ref= &join_tab->ref;
+
+  DBUG_ENTER("JOIN_CACHE_HASHED::init");
+
+  hash_table= 0;
+  key_entries= 0;
+
+  key_length= ref->key_length;
+
+  if ((rc= JOIN_CACHE::init()))
+    DBUG_RETURN (rc);
+
+  if (!(key_buff= (uchar*) sql_alloc(key_length)))
+    DBUG_RETURN(1);
+
+  /* Take into account a reference to the next record in the key chain */
+  pack_length+= get_size_of_rec_offset();
+  pack_length_with_blob_ptrs+= get_size_of_rec_offset();
+
+  ref_key_info= join_tab->get_keyinfo_by_key_no(join_tab->ref.key);
+  ref_used_key_parts= join_tab->ref.key_parts;
+  /* Hash and compare keys byte-wise unless some key part forbids it */
+  hash_func= &JOIN_CACHE_HASHED::get_hash_idx_simple;
+  hash_cmp_func= &JOIN_CACHE_HASHED::equal_keys_simple;
+
+  KEY_PART_INFO *key_part= ref_key_info->key_part;
+  KEY_PART_INFO *key_part_end= key_part+ref_used_key_parts;
+  for ( ; key_part < key_part_end; key_part++)
+  {
+    if (!key_part->field->eq_cmp_as_binary())
+    {
+      hash_func= &JOIN_CACHE_HASHED::get_hash_idx_complex;
+      hash_cmp_func= &JOIN_CACHE_HASHED::equal_keys_complex;
+      break;
+    }
+  }
+
+  init_hash_table();
+
+  rec_fields_offset= get_size_of_rec_offset()+get_size_of_rec_length()+
+                     (prev_cache ? prev_cache->get_size_of_rec_offset() : 0);
+  /* With an embedded key sum up the lengths of the flag fields */
+  data_fields_offset= 0;
+  if (use_emb_key)
+  {
+    CACHE_FIELD *copy= field_descr;
+    CACHE_FIELD *copy_end= copy+flag_fields;
+    for ( ; copy < copy_end; copy++)
+      data_fields_offset+= copy->length;
+  }
+
+  DBUG_RETURN(rc);
+}
+
+
+/*
+  Initialize the hash table of a hashed join cache
+
+  SYNOPSIS
+    init_hash_table()
+
+  DESCRIPTION
+    The function estimates the number of hash table entries in the hash
+    table to be used and initializes this hash table within the join buffer
+    space.
+
+  RETURN VALUE
+    Currently the function always returns 0.
+*/
+
+int JOIN_CACHE_HASHED::init_hash_table()
+{
+  hash_table= 0;
+  key_entries= 0;
+
+  /* Calculate the minimal possible value of size_of_key_ofs greater than 1 */
+  uint max_size_of_key_ofs= max(2, get_size_of_rec_offset());
+  for (size_of_key_ofs= 2;
+       size_of_key_ofs <= max_size_of_key_ofs;
+       size_of_key_ofs+= 2)
+  {
+    key_entry_length= get_size_of_rec_offset() + // key chain header
+                      size_of_key_ofs +          // reference to the next key
+                      (use_emb_key ?  get_size_of_rec_offset() : key_length);
+
+    ulong space_per_rec= avg_record_length +
+                         avg_aux_buffer_incr +
+                         key_entry_length+size_of_key_ofs;
+    uint n= buff_size / space_per_rec;
+
+    /*
+      TODO: Make a better estimate for this upper bound of
+            the number of records in the join buffer.
+    */
+    uint max_n= buff_size / (pack_length-length+
+                             key_entry_length+size_of_key_ofs);
+    /* Take about n/0.7 hash entries for the n records expected to fit */
+    hash_entries= (uint) (n / 0.7);
+    set_if_bigger(hash_entries, 1);
+    /* Stop once this offset size suffices (assuming offset_size() = bytes) */
+    if (offset_size(max_n*key_entry_length) <=
+        size_of_key_ofs)
+      break;
+  }
+
+  /* Place the hash table at the very end of the join buffer and empty it */
+  hash_table= buff + (buff_size-hash_entries*size_of_key_ofs);
+  cleanup_hash_table();
+  curr_key_entry= hash_table;
+
+  return 0;
+}
+
+
+/*
+  Reallocate the join buffer of a hashed join cache
+
+  SYNOPSIS
+    realloc_buffer()
+
+  DESCRIPTION
+    The function frees the join buffer and allocates a new one of buff_size
+    bytes, so buff_size must already hold the new size. On success it sets up
+    a hash table within the buffer and resets the cache for writing. On
+    failure neither is done, as there is no buffer to lay the table out in.
+
+  RETURN VALUE
+    0   if the buffer has been successfully reallocated, 1 otherwise
+*/
+
+int JOIN_CACHE_HASHED::realloc_buffer()
+{
+  free();
+  if (!(buff= (uchar*) my_malloc(buff_size, MYF(0))))
+  {
+    /* No buffer to host the hash table: leave the cache without one */
+    hash_table= 0;
+    return 1;
+  }
+  init_hash_table();
+  reset(TRUE);
+  return 0;
+}
+
+
+/*
+  Get maximum size of the additional space per record used for record keys
+
+  SYNOPSIS
+    get_max_key_addon_space_per_record()
+
+  DESCRIPTION
+    The function returns an upper bound for the space taken by one key entry
+    together with the hash table entries accounted for one record.
+
+  RETURN VALUE
+    maximum size of the additional space per record that is used to store
+    record keys in the hash table
+*/
+
+uint JOIN_CACHE_HASHED::get_max_key_addon_space_per_record()
+{
+  /* The key entry keeps either a reference to an embedded key or the key */
+  ulong key_space= use_emb_key ? get_size_of_rec_offset() :
+                                 join_tab->ref.key_length;
+
+  /*
+    A hash table for N records has at most ceiling(N/0.7) entries, so two
+    hash table entries per record is a safe upper bound.
+  */
+  ulong links_space= size_of_rec_ofs +    // size of the key chain header
+                     size_of_rec_ofs +    // >= size of the ref to the next key
+                     2*size_of_rec_ofs;   // >= 2*(size of hash table entry)
+  return key_space + links_space;
+}
+
+
+/*
+  Reset the buffer of a hashed join cache for reading/writing
+
+  SYNOPSIS
+    reset()
+      for_writing  if it's TRUE the function resets the buffer for writing
+
+  DESCRIPTION
+    This implementation of the virtual function reset() first performs the
+    reset done by JOIN_CACHE::reset(). When the buffer is reset for writing
+    and a hash table exists it also cleans up the hash table. In all cases
+    the iteration over key entries is rewound to the start of the hash table.
+
+  RETURN VALUE
+    none
+*/
+
+void JOIN_CACHE_HASHED::reset(bool for_writing)
+{
+  JOIN_CACHE::reset(for_writing);
+  if (hash_table && for_writing)
+    cleanup_hash_table();
+  curr_key_entry= hash_table;
+}
+
+
+/*
+  Add a record into the buffer of a hashed join cache
+
+  SYNOPSIS
+    put_record()
+
+  DESCRIPTION
+    This implementation of the virtual function put_record writes the next
+    matching record into the join buffer of the JOIN_CACHE_HASHED class.
+    In addition to what the default implementation does this function
+    performs the following.
+    It extracts from the record the key value used in lookups for matching
+    records and searches for this key in the hash table of the join cache.
+    If it finds the key in the hash table it joins the record to the chain
+    of records with this key. If the key is not found in the hash table the
+    key is placed into it and a chain containing only the newly added record
+    is attached to the key entry. The key value is either placed in the hash
+    element added for the key or, if the use_emb_key flag is set, remains in
+    the record from the partial join.
+    No key is created when last_written_is_null_compl is set after the write
+    (presumably the MATCH_IMPOSSIBLE case - to be confirmed).
+
+  RETURN VALUE
+    TRUE    if it has been decided that it should be the last record
+            in the join buffer,
+    FALSE   otherwise
+*/
+
+bool JOIN_CACHE_HASHED::put_record()
+{
+  bool is_full;
+  uchar *key;
+  uint key_len= key_length;
+  uchar *key_ref_ptr;
+  uchar *link= 0;
+  TABLE_REF *ref= &join_tab->ref;
+  uchar *next_ref_ptr= pos;
+  /* Reserve room for the link to the next record of the same key chain */
+  pos+= get_size_of_rec_offset();
+  /* Write the record into the join buffer */
+  if (prev_cache)
+    link= prev_cache->get_curr_rec_link();
+  write_record_data(link, &is_full);
+
+  if (last_written_is_null_compl)
+    return is_full;
+
+  if (use_emb_key)
+    key= get_curr_emb_key();
+  else
+  {
+    /* Build the key over the fields read into the record buffers */
+    cp_buffer_from_ref(join->thd, join_tab->table, ref);
+    key= ref->key_buff;
+  }
+
+  /* Look for the key in the hash table */
+  if (key_search(key, key_len, &key_ref_ptr))
+  {
+    uchar *last_next_ref_ptr;
+    /*
+      The key is found in the hash table.
+      Add the record to the circular list of the records attached to this key.
+      Below 'rec' is the record to be added into the record chain for the found
+      key, 'key_ref_ptr' points to a flattened representation of the key entry
+      that contains the key and the head of the record chain.
+    */
+    last_next_ref_ptr= get_next_rec_ref(key_ref_ptr+get_size_of_key_offset());
+    /* rec->next_rec= key_entry->last_rec->next_rec */
+    memcpy(next_ref_ptr, last_next_ref_ptr, get_size_of_rec_offset());
+    /* key_entry->last_rec->next_rec= rec */
+    store_next_rec_ref(last_next_ref_ptr, next_ref_ptr);
+    /* key_entry->last_rec= rec */
+    store_next_rec_ref(key_ref_ptr+get_size_of_key_offset(), next_ref_ptr);
+  }
+  else
+  {
+    /*
+      The key is not found in the hash table.
+      Put the key into the join buffer linking it with the keys for the
+      corresponding hash entry. Create a circular list with one element
+      referencing the record and attach the list to the key in the buffer.
+    */
+    uchar *cp= last_key_entry;
+    cp-= get_size_of_rec_offset()+get_size_of_key_offset();
+    store_next_key_ref(key_ref_ptr, cp);
+    store_null_key_ref(cp);
+    store_next_rec_ref(next_ref_ptr, next_ref_ptr);
+    store_next_rec_ref(cp+get_size_of_key_offset(), next_ref_ptr);
+    if (use_emb_key)
+    {
+      cp-= get_size_of_rec_offset();
+      store_emb_key_ref(cp, key);
+    }
+    else
+    {
+      cp-= key_len;
+      memcpy(cp, key, key_len);
+    }
+    last_key_entry= cp;
+    DBUG_ASSERT(last_key_entry >= end_pos);
+    /* Increment the counter of key_entries in the hash table */
+    key_entries++;
+  }
+  return is_full;
+}
+
+
+/*
+  Read the next record from the buffer of a hashed join cache
+
+  SYNOPSIS
+    get_record()
+
+  DESCRIPTION
+    In a hashed join cache every record is preceded by the link element that
+    connects the records with the same key into a chain. The function steps
+    over this element and then reads the record as JOIN_CACHE::get_record does.
+
+  RETURN VALUE
+    TRUE    there are no more records to read from the join buffer
+    FALSE   otherwise
+*/
+
+bool JOIN_CACHE_HASHED::get_record()
+{
+  pos= pos + get_size_of_rec_offset();
+  return JOIN_CACHE::get_record();
+}
+
+
+/*
+  Skip record from a hashed join buffer if its match flag is set to MATCH_FOUND
+
+  SYNOPSIS
+    skip_if_matched()
+
+  DESCRIPTION
+    The function does what JOIN_CACHE::skip_if_matched does after stepping
+    over the link element that connects the records with the same key into
+    a chain. If the record is not skipped the read position is put back.
+
+  RETURN VALUE
+    TRUE    the match flag is MATCH_FOUND and the record has been skipped
+    FALSE   otherwise
+*/
+
+bool JOIN_CACHE_HASHED::skip_if_matched()
+{
+  uchar *link_pos= pos;
+  /* The generic check must start right after the key chain link */
+  pos= link_pos + get_size_of_rec_offset();
+  if (JOIN_CACHE::skip_if_matched())
+    return TRUE;
+  /* Not skipped: step back onto the link element */
+  pos= link_pos;
+  return FALSE;
+}
+
+
+/*
+  Skip record from a hashed join buffer if its match flag dictates to do so
+
+  SYNOPSIS
+    skip_if_not_needed_match()
+
+  DESCRIPTION
+    The function does what JOIN_CACHE::skip_if_not_needed_match does after
+    stepping over the link element that connects the records with the same
+    key into a chain. If the record is not skipped the position is restored.
+
+  RETURN VALUE
+    TRUE    the match flag dictates to skip the record
+    FALSE   otherwise
+*/
+
+bool JOIN_CACHE_HASHED::skip_if_not_needed_match()
+{
+  uchar *link_pos= pos;
+  /* The generic check must start right after the key chain link */
+  pos= link_pos + get_size_of_rec_offset();
+  if (JOIN_CACHE::skip_if_not_needed_match())
+    return TRUE;
+  /* Not skipped: step back onto the link element */
+  pos= link_pos;
+  return FALSE;
+}
+
+
+/*
+  Search for a key in the hash table of the join buffer
+
+  SYNOPSIS
+    key_search()
+      key             pointer to the key value
+      key_len         key value length
+      key_ref_ptr OUT position of the reference to the next key from
+                      the hash element for the found key, or
+                      a position where the reference to the hash
+                      element for the key is to be added in the
+                      case when the key has not been found
+
+  DESCRIPTION
+    The function looks for a key in the hash table of the join buffer.
+    If the key is found the function returns the position of the reference
+    to the next key from the hash element for the given key.
+    Otherwise the function returns the position where the reference to the
+    newly created hash element for the given key is to be added.
+
+  RETURN VALUE
+    TRUE    the key is found in the hash table
+    FALSE   otherwise
+*/
+
+bool JOIN_CACHE_HASHED::key_search(uchar *key, uint key_len,
+                                   uchar **key_ref_ptr)
+{
+  /* Start from the hash table slot the key is mapped to */
+  uint slot= (this->*hash_func)(key, key_length);
+  uchar *ref_ptr= hash_table+size_of_key_ofs*slot;
+  bool is_found= FALSE;
+
+  /* Follow the chain of key entries until a null reference is met */
+  while (!is_found && !is_null_key_ref(ref_ptr))
+  {
+    ref_ptr= get_next_key_ref(ref_ptr);
+    uchar *entry_key;
+    if (use_emb_key)
+      entry_key= get_emb_key(ref_ptr-get_size_of_rec_offset());
+    else
+      entry_key= ref_ptr-key_length;
+    is_found= (this->*hash_cmp_func)(entry_key, key, key_len);
+  }
+  *key_ref_ptr= ref_ptr;
+  return is_found;
+}
+
+
+/*
+  Hash function that considers a key in the hash table as byte array
+
+  SYNOPSIS
+    get_hash_idx_simple()
+      key             pointer to the key value
+      key_len         key value length
+
+  DESCRIPTION
+    The function maps the given key to an index of a hash entry in the hash
+    table of the join buffer. The key is processed byte by byte as a plain
+    sequence of key_len bytes; the result is taken modulo hash_entries.
+
+  RETURN VALUE
+    the calculated index of the hash entry for the given key
+*/
+
+inline
+uint JOIN_CACHE_HASHED::get_hash_idx_simple(uchar* key, uint key_len)
+{
+  ulong acc= 1;
+  ulong step= 4;
+  const uchar *key_end= key+key_len;
+  for (const uchar *cur= key; cur < key_end; cur++, step+= 3)
+  {
+    /* Mix the next byte of the key into the accumulated value */
+    ulong mixed= (ulong) ((((uint) acc & 63)+step)*((uint) *cur));
+    acc^= mixed + (acc << 8);
+  }
+  return acc % hash_entries;
+}
+
+
+/*
+  Hash function that takes into account collations of the components of the key
+
+  SYNOPSIS
+    get_hash_idx_complex()
+      key             pointer to the key value
+      key_len         key value length (not used by this function)
+
+  DESCRIPTION
+    The function calculates an index of the hash entry in the hash table
+    of the join buffer for the given key. It takes into account that the
+    components of the key may be of a varchar type with different collations.
+    The function guarantees the same hash value for any two equal keys
+    even when they differ as byte sequences.
+    The function takes the info about the components of the key, their
+    types and used collations from the class member ref_key_info containing
+    a pointer to the descriptor of the index that can be used for the join
+    operation.
+
+  RETURN VALUE
+    the calculated index of the hash entry for the given key
+*/
+
+inline
+uint JOIN_CACHE_HASHED::get_hash_idx_complex(uchar *key, uint key_len)
+{
+  return (uint) (key_hashnr(ref_key_info, ref_used_key_parts, key) %
+                 hash_entries);
+}
+
+
+/*
+  Compare two key entries in the hash table as sequences of bytes
+
+  SYNOPSIS
+    equal_keys_simple()
+      key1            pointer to the first key entry
+      key2            pointer to the second key entry
+      key_len         the length of the key values
+
+  DESCRIPTION
+    The function compares the key entries key1 and key2 from the hash table
+    as two sequences of bytes of the length key_len.
+
+  RETURN VALUE
+    TRUE       key1 coincides with key2
+    FALSE      otherwise
+*/
+
+inline
+bool JOIN_CACHE_HASHED::equal_keys_simple(uchar *key1, uchar *key2,
+                                          uint key_len)
+{
+  return !memcmp(key1, key2, key_len);
+}
+
+
+/*
+  Compare two key entries taking into account the used collation
+
+  SYNOPSIS
+    equal_keys_complex()
+      key1            pointer to the first key entry
+      key2            pointer to the second key entry
+      key_len         the length of the key values (not used by this function)
+
+  DESCRIPTION
+    The function checks whether two key entries in the hash table
+    key1 and key2 are equal as, possibly, compound keys of a certain
+    structure whose components may be of a varchar type and may
+    employ different collations.
+    The descriptor of the key structure is taken from the class
+    member ref_key_info.
+
+  RETURN VALUE
+    TRUE       key1 is equal to key2
+    FALSE      otherwise
+*/
+
+inline
+bool JOIN_CACHE_HASHED::equal_keys_complex(uchar *key1, uchar *key2,
+                                          uint key_len)
+{
+  return !key_buf_cmp(ref_key_info, ref_used_key_parts, key1, key2);
+}
+
+
+/*
+  Clean up the hash table of the join buffer
+
+  SYNOPSIS
+    cleanup_hash_table()
+
+  DESCRIPTION
+    The function removes all hash elements from the hash table in the join
+    buffer: it sets the counter of key entries to 0, makes last_key_entry
+    refer to the start of the hash table and zeroes the buffer space from
+    hash_table up to the end of the buffer.
+
+  RETURN VALUE
+    none
+*/
+
+void JOIN_CACHE_HASHED::cleanup_hash_table()
+{
+  key_entries= 0;
+  last_key_entry= hash_table;
+  bzero(hash_table, (buff+buff_size)-hash_table);
+}
+
+
+/*
+  Check whether all records in a key chain have their match flags set on
+
+  SYNOPSIS
+    check_all_match_flags_for_key()
+      key_chain_ptr     position of the reference to the last chain element
+
+  DESCRIPTION
+    This function goes once around the given circular chain of records and
+    checks whether their match flags are set on. The parameter key_chain_ptr
+    shall point to the position in the join buffer storing the reference to
+    the last element of this chain.
+
+  RETURN VALUE
+    TRUE   if each retrieved record has its match flag set to MATCH_FOUND
+    FALSE  otherwise
+*/
+
+bool JOIN_CACHE_HASHED::check_all_match_flags_for_key(uchar *key_chain_ptr)
+{
+  uchar *const last_ref= get_next_rec_ref(key_chain_ptr);
+  uchar *curr_ref= last_ref;
+  /* Starting from the record after the last one go around the chain once */
+  for (;;)
+  {
+    curr_ref= get_next_rec_ref(curr_ref);
+    if (get_match_flag_by_pos(curr_ref+rec_fields_offset) != MATCH_FOUND)
+      return FALSE;
+    if (curr_ref == last_ref)
+      return TRUE;
+  }
+}
+  
+
+/*
+  Get the next key built for the records from the buffer of a hashed join cache
+
+  SYNOPSIS
+    get_next_key()
+      key    OUT  pointer to the key value
+
+  DESCRIPTION
+    The function moves curr_key_entry to the next key entry stored below it
+    in the join buffer and makes *key point to the key value. Depending on
+    the use_emb_key flag the value resides either in the key entry itself
+    or in the record field where it occurs.
+
+  RETURN VALUE
+    0 - if 'curr_key_entry' has already reached 'last_key_entry', i.e.
+    there are no more keys to return,
+    length of the key value - otherwise.
+*/
+
+uint JOIN_CACHE_HASHED::get_next_key(uchar ** key)
+{
+  if (curr_key_entry == last_key_entry)
+    return 0;
+
+  curr_key_entry-= key_entry_length;
+  if (use_emb_key)
+    *key= get_emb_key(curr_key_entry);
+  else
+    *key= curr_key_entry;
+  DBUG_ASSERT(*key >= buff && *key < hash_table);
+  return key_length;
+}
+
+
+/*
+  Initiate an iteration process over records in the joined table
+
+  SYNOPSIS
+    open()
+
+  DESCRIPTION
+    The function saves and clears the status of the tables right before
+    join_tab and starts the scan of join_tab done by the BNL/BNLH algorithm.
+
+  RETURN VALUE
+    0            the initiation is a success
+    error code   otherwise
+*/
+
+int JOIN_TAB_SCAN::open()
+{
+  JOIN_TAB *const stop= join_tab-cache->tables;
+  for (JOIN_TAB *prev= join_tab; --prev != stop && !prev->cache; )
+  {
+    prev->status= prev->table->status;
+    prev->table->status= 0;
+  }
+  is_first_record= TRUE;
+  return join_init_read_record(join_tab);
+}
+
+
+/* 
+  Read the next record that can match while scanning the joined table
+
+  SYNOPSIS
+    next()
+
+  DESCRIPTION
+    The function reads the next record from the joined table that can
+    match some records in the buffer of the join cache 'cache'. To do
+    this the function calls the function that scans table records and
+    looks for the next one that meets the condition pushed to the
+    joined table join_tab.
+
+  NOTES
+    The function catches the signal that kills the query.
+
+  RETURN VALUE   
+    0            the next record exists and has been successfully read 
+    error code   otherwise     
+*/
+
+int JOIN_TAB_SCAN::next()
+{
+  READ_RECORD *rr= &join_tab->read_record;
+  SQL_SELECT *pushed_select= join_tab->cache_select;
+  int rc= 0;
+  if (is_first_record)
+    is_first_record= FALSE;
+  else
+    rc= rr->read_record(rr);
+  for (;;)
+  {
+    if (rc)
+      return rc;
+    update_virtual_fields(join->thd, join_tab->table);
+    if (!pushed_select)
+      return 0;
+    int skip_rc= pushed_select->skip_record(join->thd);
+    if (skip_rc > 0)
+      return 0;
+    /* Give up if the query is killed or the check has returned < 0 */
+    if (join->thd->killed || skip_rc < 0)
+      return 1;
+    /* The record fails the pushed condition: move to the next one */
+    rc= rr->read_record(rr);
+  }
+}
+
+
+/* 
+  Perform finalizing actions for a scan over the table records
+
+  SYNOPSIS
+    close()
+
+  DESCRIPTION
+    The function performs the necessary restoring actions after
+    the table scan over the joined table has been finished.
+
+  RETURN VALUE   
+    none      
+*/
+
+void JOIN_TAB_SCAN::close()
+{
+  JOIN_TAB *stop= join_tab - cache->tables;
+  for (JOIN_TAB *prev= join_tab - 1; prev != stop && !prev->cache; prev--)
+    prev->table->status= prev->status;    /* put the saved status back */
+}
+
+
+/*
+  Prepare to iterate over the BNL join cache buffer to look for matches 
+
+  SYNOPSIS
+    prepare_look_for_matches()
+      skip_last   <-> ignore the last record in the buffer
+
+  DESCRIPTION
+    The function prepares the join cache for an iteration over the
+    records in the join buffer. The iteration is performed when looking
+    for matches for the record from the joined table join_tab that 
+    has been placed into the record buffer of the joined table.
+    If the value of the parameter skip_last is TRUE then the last
+    record from the join buffer is ignored.
+    The function initializes the counter of the records that have been
+    not iterated over yet.
+    
+  RETURN VALUE   
+    TRUE    there are no records in the buffer to iterate over 
+    FALSE   otherwise
+*/
+    
+bool JOIN_CACHE_BNL::prepare_look_for_matches(bool skip_last)
+{
+  if (records == 0)
+    return TRUE;
+  reset(FALSE);
+  rem_records= skip_last ? records-1 : records; /* drop the last if asked */
+  return !rem_records;
+}
+
+
+/*
+  Get next record from the BNL join cache buffer when looking for matches 
+
+  SYNOPSIS
+    get_next_candidate_for_match
+
+  DESCRIPTION
+    This method is used for iterations over the records from the join
+    cache buffer when looking for matches for records from join_tab.
+    The method performs the necessary preparations to read the next record
+    from the join buffer into the record buffer by the method
+    read_next_candidate_for_match, or, to skip the next record from the join
+    buffer by the method skip_next_candidate_for_match.
+    This implementation of the virtual method get_next_candidate_for_match
+    just  decrements the counter of the records that are to be iterated over
+    and returns the current value of the cursor 'pos' as the position of 
+    the record to be processed. 
+    
+  RETURN VALUE    
+    pointer to the position right after the prefix of the current record
+    in the join buffer if there is another record to iterate over,
+    0 - otherwise.  
+*/
+
+uchar *JOIN_CACHE_BNL::get_next_candidate_for_match()
+{
+  if (rem_records == 0)
+    return 0;                       /* nothing is left to iterate over */
+  --rem_records;
+  return pos + base_prefix_length;
+}
+
+
+/*
+  Check whether the matching record from the BNL cache is to be skipped 
+
+  SYNOPSIS
+    skip_next_candidate_for_match
+    rec_ptr  pointer to the position in the join buffer right after the prefix 
+             of the current record
+
+  DESCRIPTION
+    This implementation of the virtual function just calls the
+    method skip_if_not_needed_match to check whether the record referenced by
+    rec_ptr has its match flag set either to MATCH_FOUND and join_tab is the
+    first inner table of a semi-join, or it's set to MATCH_IMPOSSIBLE and
+    join_tab is the first inner table of an outer join.
+    If so, the function just skips this record setting the value of the
+    cursor 'pos' to the position right after it.
+
+  RETURN VALUE    
+    TRUE   the record referenced by rec_ptr has been skipped
+    FALSE  otherwise  
+*/
+
+bool JOIN_CACHE_BNL::skip_next_candidate_for_match(uchar *rec_ptr)
+{
+  pos= rec_ptr-base_prefix_length; /* rec_ptr is past the record prefix */
+  return skip_if_not_needed_match();
+}
+
+
+/*
+  Read next record from the BNL join cache buffer when looking for matches 
+
+  SYNOPSIS
+    read_next_candidate_for_match
+    rec_ptr  pointer to the position in the join buffer right after the prefix
+             of the current record
+
+  DESCRIPTION
+    This implementation of the virtual method read_next_candidate_for_match
+    calls the method get_record to read the record referenced by rec_ptr from
+    the join buffer into the record buffer. If this record refers to the
+    fields in the other join buffers the call of get_record ensures that
+    these fields are read into the corresponding record buffers as well.
+    This function is supposed to be called after a successful call of
+    the method get_next_candidate_for_match.
+    
+  RETURN VALUE   
+    none
+*/
+
+void JOIN_CACHE_BNL::read_next_candidate_for_match(uchar *rec_ptr)
+{
+  pos= rec_ptr-base_prefix_length; /* step back over the record prefix */
+  get_record();
+} 
+
+
+/*
+  Initialize the BNL join cache 
+
+  SYNOPSIS
+    init
+
+  DESCRIPTION
+    The function initializes the cache structure. It is supposed to be called
+    right after a constructor for the JOIN_CACHE_BNL.
+
+  NOTES
+    The function first constructs a companion object of the type JOIN_TAB_SCAN,
+    then it calls the init method of the parent class.
+    
+  RETURN VALUE  
+    0   initialization with buffer allocations has succeeded
+    1   otherwise
+*/
+
+int JOIN_CACHE_BNL::init()
+{
+  DBUG_ENTER("JOIN_CACHE_BNL::init");
+  /* The companion scan object is created before the parent class init */
+  join_tab_scan= new JOIN_TAB_SCAN(join, join_tab);
+  if (!join_tab_scan)
+    DBUG_RETURN(1);
+  DBUG_RETURN(JOIN_CACHE::init());
+}
+
+
+/*
+  Get the chain of records from buffer matching the current candidate for join
+
+  SYNOPSIS
+    get_matching_chain_by_join_key()
+
+  DESCRIPTION
+    This function first builds a join key for the record of join_tab that
+    currently is in the record buffer for this table. Then it looks for
+    the key entry with this key in the hash table of the join cache.
+    If such a key entry is found the function returns the pointer to
+    the head of the chain of records in the join_buffer that match this
+    key.
+
+  RETURN VALUE
+    The pointer to the corresponding circular list of records if
+    the key entry with the join key is found, 0 - otherwise.
+*/  
+
+uchar *JOIN_CACHE_BNLH::get_matching_chain_by_join_key()
+{
+  uchar *found_key_ref;
+  KEY *join_key= join_tab->get_keyinfo_by_key_no(join_tab->ref.key);
+  /* Construct the join key from the record in the table's record buffer */
+  key_copy(key_buff, join_tab->table->record[0], join_key, key_length, TRUE);
+  /* Search the join buffer for a key entry with the constructed key */
+  bool is_found= key_search(key_buff, key_length, &found_key_ref);
+  if (is_found)
+    return found_key_ref + get_size_of_key_offset();
+  /* There is no key entry with this key value */
+  return 0;
+}
+
+
+/*
+  Prepare to iterate over the BNLH join cache buffer to look for matches 
+
+  SYNOPSIS
+    prepare_look_for_matches()
+      skip_last   <-> ignore the last record in the buffer
+
+  DESCRIPTION
+    The function prepares the join cache for an iteration over the
+    records in the join buffer. The iteration is performed when looking
+    for matches for the record from the joined table join_tab that 
+    has been placed into the record buffer of the joined table.
+    If the value of the parameter skip_last is TRUE then the last
+    record from the join buffer is ignored.
+    The function builds the hashed key from the join fields of join_tab
+    and uses this key to look in the hash table of the join cache for
+    the chain of matching records in the join buffer. If it finds
+    such a chain it sets the member last_matching_rec_ref_ptr to point to the
+    last link of the chain while setting next_matching_rec_ref_ptr to 0.
+    
+  RETURN VALUE    
+    TRUE    there are no matching records in the buffer to iterate over 
+    FALSE   otherwise
+*/
+    
+bool JOIN_CACHE_BNLH::prepare_look_for_matches(bool skip_last)
+{
+  next_matching_rec_ref_ptr= last_matching_rec_ref_ptr= 0;
+  uchar *chain= get_matching_chain_by_join_key();
+  if (!chain)
+    return TRUE;                 /* no chain for the built join key */
+  last_matching_rec_ref_ptr= get_next_rec_ref(chain);
+  return FALSE;
+}
+
+
+/*
+  Get next record from the BNLH join cache buffer when looking for matches 
+
+  SYNOPSIS
+    get_next_candidate_for_match
+
+  DESCRIPTION
+    This method is used for iterations over the records from the join
+    cache buffer when looking for matches for records from join_tab.
+    The method performs the necessary preparations to read the next record
+    from the join buffer into the record buffer by the method
+    read_next_candidate_for_match, or, to skip the next record from the join 
+    buffer by the method skip_next_candidate_for_match.    
+    This implementation of the virtual method moves to the next record
+    in the chain of all records from the join buffer that are to be
+    equi-joined with the current record from join_tab.
+    
+  RETURN VALUE   
+    pointer to the beginning of the record fields in the join buffer
+    if there is another record to iterate over, 0 - otherwise.
+*/
+
+uchar *JOIN_CACHE_BNLH::get_next_candidate_for_match()
+{
+  uchar *link= next_matching_rec_ref_ptr;
+  if (link == last_matching_rec_ref_ptr)
+    return 0;                       /* nothing (more) to iterate over */
+  next_matching_rec_ref_ptr= get_next_rec_ref(link ? link :
+                                              last_matching_rec_ref_ptr);
+  return next_matching_rec_ref_ptr + rec_fields_offset;
+}
+
+
+/*
+  Check whether the matching record from the BNLH cache is to be skipped 
+
+  SYNOPSIS
+    skip_next_candidate_for_match
+    rec_ptr  pointer to the position in the join buffer right after 
+             the previous record
+
+  DESCRIPTION
+    This implementation of the virtual function just calls the
+    method get_match_flag_by_pos to check whether the record referenced
+    by rec_ptr has its match flag set to MATCH_FOUND.
+
+  RETURN VALUE    
+    TRUE   the record referenced by rec_ptr has its match flag set to 
+           MATCH_FOUND
+    FALSE  otherwise  
+*/
+
+bool JOIN_CACHE_BNLH::skip_next_candidate_for_match(uchar *rec_ptr)
+{
+ return  join_tab->check_only_first_match() && /* flag is read only then */
+          (get_match_flag_by_pos(rec_ptr) == MATCH_FOUND);
+}
+
+
+/*
+  Read next record from the BNLH join cache buffer when looking for matches 
+
+  SYNOPSIS
+    read_next_candidate_for_match
+    rec_ptr  pointer to the position in the join buffer right after 
+             the previous record
+
+  DESCRIPTION
+    This implementation of the virtual method read_next_candidate_for_match
+    calls the method get_record_by_pos to read the record referenced by rec_ptr
+    from the join buffer into the record buffer. If this record refers to
+    fields in the other join buffers the call of get_record_by_pos ensures that
+    these fields are read into the corresponding record buffers as well.
+    This function is supposed to be called after a successful call of
+    the method get_next_candidate_for_match.
+    
+  RETURN VALUE   
+    none
+*/
+
+void JOIN_CACHE_BNLH::read_next_candidate_for_match(uchar *rec_ptr)
+{
+  get_record_by_pos(rec_ptr);   /* read the record referenced by rec_ptr */
+} 
+
+
+/*
+  Initialize the BNLH join cache 
+
+  SYNOPSIS
+    init
+
+  DESCRIPTION
+    The function initializes the cache structure. It is supposed to be called
+    right after a constructor for the JOIN_CACHE_BNLH.
+
+  NOTES
+    The function first constructs a companion object of the type JOIN_TAB_SCAN,
+    then it calls the init method of the parent class.
+    
+  RETURN VALUE  
+    0   initialization with buffer allocations has succeeded
+    1   otherwise
+*/
+
+int JOIN_CACHE_BNLH::init()
+{
+  DBUG_ENTER("JOIN_CACHE_BNLH::init");
+  /* The companion scan object is created before the parent class init */
+  join_tab_scan= new JOIN_TAB_SCAN(join, join_tab);
+  if (!join_tab_scan)
+    DBUG_RETURN(1);
+  DBUG_RETURN(JOIN_CACHE_HASHED::init());
+}
+
+
+/* 
+  Calculate the increment of the MRR buffer for a record write       
+
+  SYNOPSIS
+    aux_buffer_incr()
+
+  DESCRIPTION
+    This implementation of the virtual function aux_buffer_incr determines
+    for how much the size of the MRR buffer should be increased when another
+    record is added to the cache.   
+
+  RETURN VALUE
+    the increment of the size of the MRR buffer for the next record
+*/
+
+uint JOIN_TAB_SCAN_MRR::aux_buffer_incr(ulong recno)
+{
+  TABLE *table= join_tab->table;
+  TABLE_REF *ref= &join_tab->ref;
+  uint rows_per_key= table->key_info[ref->key].rec_per_key[ref->key_parts-1];
+  if (rows_per_key < 1)
+    rows_per_key= 1;               /* count at least one row per key */
+  uint incr= table->file->stats.mrr_length_per_rec * rows_per_key;
+  if (recno == 1)                  /* extra part only for record number 1 */
+    incr+= ref->key_length + table->file->ref_length;
+  return incr;
+}
+
+
+/* 
+  Initiate iteration over records returned by MRR for the current join buffer
+
+  SYNOPSIS
+    open()
+
+  DESCRIPTION
+    The function initiates the process of iteration over the records from 
+    join_tab returned by the MRR interface functions for records from
+    the join buffer. Such an iteration is performed by the BKA/BKAH join
+    algorithm for each new refill of the join buffer.
+    The function calls the MRR handler function multi_range_read_init to
+    initiate this process.
+
+  RETURN VALUE   
+    0            the initiation is a success 
+    error code   otherwise     
+*/
+
+int JOIN_TAB_SCAN_MRR::open()
+{
+  TABLE *table= join_tab->table;
+  handler *file= table->file;
+
+  table->null_row= 0;
+  /* Dynamic range access is never used with BKA */
+  DBUG_ASSERT(join_tab->use_quick != 2);
+
+  /* Save and clear the status of preceding tables up to the nearest cache */
+  JOIN_TAB *stop= join_tab - cache->tables;
+  for (JOIN_TAB *prev= join_tab - 1; prev != stop && !prev->cache; prev--)
+  {
+    prev->status= prev->table->status;
+    prev->table->status= 0;
+  }
+  init_mrr_buff();
+
+  /*
+    Prepare to iterate over keys from the join buffer and to get
+    matching candidates obtained with MRR handler functions.
+    NOTE(review): the value returned by ha_index_init() is ignored here.
+  */
+  if (!file->inited)
+    file->ha_index_init(join_tab->ref.key, 1);
+  ranges= cache->get_number_of_ranges_for_mrr();
+  if (!join_tab->cache_idx_cond)
+    range_seq_funcs.skip_index_tuple= 0;
+  return file->multi_range_read_init(&range_seq_funcs, (void*) cache,
+                                     ranges, mrr_mode, &mrr_buff);
+}
+
+
+/* 
+  Read the next record returned by MRR for the current join buffer
+
+  SYNOPSIS
+    next()
+
+  DESCRIPTION
+    The function reads the next record from the joined table join_tab
+    returned by the MRR handler function multi_range_read_next for
+    the current refill of the join buffer. The record is read into
+    the record buffer used for join_tab records in join operations.
+
+  RETURN VALUE   
+    0            the next record exists and has been successfully read 
+    error code   otherwise     
+*/
+
+int JOIN_TAB_SCAN_MRR::next()
+{
+  char **assoc_ptr= (char **) cache->get_curr_association_ptr();
+
+  DBUG_ASSERT(sizeof(range_id_t) == sizeof(*assoc_ptr));
+  /* Every non-zero result of the MRR read is reported as -1 */
+  if (join_tab->table->file->multi_range_read_next((range_id_t*) assoc_ptr))
+    return -1;
+
+  /*
+    If a record in an incremental cache contains no fields then the
+    association for the last record in cache will be equal to cache->end_pos
+  */
+  DBUG_ASSERT(cache->buff <= (uchar *) (*assoc_ptr) &&
+              (uchar *) (*assoc_ptr) <= cache->end_pos);
+  update_virtual_fields(join->thd, join_tab->table);
+  return 0;
+}
+
+
+static void bka_range_seq_key_info(void *init_params, uint *length,
+                                   key_part_map *map)
+{
+  /* init_params is interpreted as a pointer to the join cache object */
+  JOIN_CACHE *cache= (JOIN_CACHE*) init_params;
+  *length= cache->join_tab->ref.key_length;
+  *map= (key_part_map(1) << cache->join_tab->ref.key_parts) - 1;
+}
+
+
+/*
+  Initialize retrieval of range sequence for BKA join algorithm
+    
+  SYNOPSIS
+    bka_range_seq_init()
+     init_param    pointer to the BKA join cache object
+     n_ranges      the number of ranges obtained 
+     flags         combination of MRR flags
+
+  DESCRIPTION
+    The function interprets init_param as a pointer to a JOIN_CACHE_BKA
+    object. The function prepares for an iteration over the join keys
+    built for all records from the cache join buffer.
+
+  NOTE
+    This function is used only as a callback function.
+
+  RETURN VALUE
+    init_param value that is to be used as a parameter of bka_range_seq_next()
+*/    
+
+static 
+range_seq_t bka_range_seq_init(void *init_param, uint n_ranges, uint flags)
+{
+  DBUG_ENTER("bka_range_seq_init");
+  JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) init_param;
+  cache->reset(0);               /* n_ranges and flags are not used here */
+  DBUG_RETURN((range_seq_t) init_param);
+}
+
+
+/*
+  Get the next range/key over records from the join buffer used by a BKA cache
+    
+  SYNOPSIS
+    bka_range_seq_next()
+      rseq       the value returned by bka_range_seq_init
+      range  OUT reference to the next range
+  
+  DESCRIPTION
+    The function interprets seq as a pointer to a JOIN_CACHE_BKA
+    object. The function returns a pointer to the range descriptor
+    for the key built over the next record from the join buffer.
+
+  NOTE
+    This function is used only as a callback function.
+   
+  RETURN VALUE
+    FALSE   ok, the range structure filled with info about the next range/key
+    TRUE    no more ranges
+*/    
+
+static
+bool bka_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+{
+  DBUG_ENTER("bka_range_seq_next");
+  JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
+  TABLE_REF *ref= &cache->join_tab->ref;
+  key_range *start_key= &range->start_key;
+  /* A zero key length means that there are no more keys in the buffer */
+  if (!(start_key->length= cache->get_next_key((uchar **) &start_key->key)))
+    DBUG_RETURN(1);
+  /* Shift in key_part_map width as bka_range_seq_key_info() does */
+  start_key->keypart_map= (key_part_map(1) << ref->key_parts) - 1;
+  start_key->flag= HA_READ_KEY_EXACT;
+  range->end_key= *start_key;
+  range->end_key.flag= HA_READ_AFTER_KEY;
+  range->ptr= (char *) cache->get_curr_rec();
+  range->range_flag= EQ_RANGE;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Check whether range_info orders to skip the next record from BKA buffer
+
+  SYNOPSIS
+    bka_range_seq_skip_record()
+      seq              value returned by bka_range_seq_init()
+      range_info       information about the next range
+      rowid [NOT USED] rowid of the record to be checked 
+
+    
+  DESCRIPTION
+    The function interprets seq as a pointer to a JOIN_CACHE_BKA object.
+    The function returns TRUE if the record with this range_info 
+    is to be filtered out from the stream of records returned by 
+    multi_range_read_next(). 
+
+  NOTE
+    This function is used only as a callback function.
+
+  RETURN VALUE
+    1    record with this range_info is to be filtered out from the stream
+         of records returned by multi_range_read_next()
+    0    the record is to be left in the stream
+*/ 
+
+static bool bka_range_seq_skip_record(range_seq_t rseq, range_id_t range_info,
+                                      uchar *rowid)
+{
+  DBUG_ENTER("bka_range_seq_skip_record");
+  uchar *rec_ptr= (uchar *) range_info;
+  bool res= ((JOIN_CACHE_BKA *) rseq)->get_match_flag_by_pos(rec_ptr) ==
+            JOIN_CACHE::MATCH_FOUND;
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Check if the record combination from BKA cache matches the index condition
+
+  SYNOPSIS
+    bka_skip_index_tuple()
+      rseq             value returned by bka_range_seq_init()
+      range_info       record chain for the next range/key returned by MRR
+    
+  DESCRIPTION
+    This is wrapper for JOIN_CACHE_BKA::skip_index_tuple method,
+    see comments there.
+
+  NOTE
+    This function is used as a RANGE_SEQ_IF::skip_index_tuple callback.
+ 
+  RETURN VALUE
+    0    The record combination satisfies the index condition
+    1    Otherwise
+*/
+
+static 
+bool bka_skip_index_tuple(range_seq_t rseq, range_id_t range_info)
+{
+  DBUG_ENTER("bka_skip_index_tuple");
+  JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
+  bool res= cache->skip_index_tuple(range_info); /* delegate to the cache */
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Prepare to read the record from BKA cache matching the current joined record   
+
+  SYNOPSIS
+    prepare_look_for_matches()
+      skip_last <-> ignore the last record in the buffer (always unused here)
+
+  DESCRIPTION
+    The function prepares to iterate over records in the join cache buffer
+    matching the record loaded into the record buffer for join_tab when
+    performing join operation by BKA join algorithm. With BKA algorithms the
+    record loaded into the record buffer for join_tab always has a direct
+    reference to the matching records from the join buffer. When the regular
+    BKA join algorithm is employed the record from join_tab can refer to
+    only one such record.   
+    The function sets the counter of the remaining records from the cache 
+    buffer that would match the current join_tab record to 1.
+    
+  RETURN VALUE   
+    TRUE    there are no records in the buffer to iterate over 
+    FALSE   otherwise
+*/
+    
+bool JOIN_CACHE_BKA::prepare_look_for_matches(bool skip_last)
+{
+  bool no_records= (records == 0);
+  if (!no_records)
+    rem_records= 1;              /* a single candidate per joined record */
+  return no_records;
+}
+
+
+/*
+  Get the record from the BKA cache matching the current joined record   
+
+  SYNOPSIS
+    get_next_candidate_for_match
+
+  DESCRIPTION
+    This method is used for iterations over the records from the join
+    cache buffer when looking for matches for records from join_tab.
+    The method performs the necessary preparations to read the next record
+    from the join buffer into the record buffer by the method
+    read_next_candidate_for_match, or, to skip the next record from the join 
+    buffer by the method skip_if_not_needed_match.    
+    This implementation of the virtual method get_next_candidate_for_match
+    just  decrements the counter of the records that are to be iterated over
+    and returns the value of curr_association as a reference to the position
+    of the beginning of the record fields in the buffer.
+    
+  RETURN VALUE   
+    pointer to the start of the record fields in the join buffer
+    if there is another record to iterate over, 0 - otherwise.
+*/
+
+uchar *JOIN_CACHE_BKA::get_next_candidate_for_match()
+{
+  if (rem_records == 0)
+    return 0;                       /* no candidate is left */
+  --rem_records;
+  return curr_association;
+}
+
+
+/*
+  Check whether the matching record from the BKA cache is to be skipped 
+
+  SYNOPSIS
+    skip_next_candidate_for_match
+    rec_ptr  pointer to the position in the join buffer right after 
+             the previous record
+
+  DESCRIPTION
+    This implementation of the virtual function just calls the
+    method get_match_flag_by_pos to check whether the record referenced
+    by rec_ptr has its match flag set to MATCH_FOUND.
+
+  RETURN VALUE   
+    TRUE   the record referenced by rec_ptr has its match flag set to
+           MATCH_FOUND
+    FALSE  otherwise  
+*/
+
+bool JOIN_CACHE_BKA::skip_next_candidate_for_match(uchar *rec_ptr)
+{
+  return join_tab->check_only_first_match() && /* flag is read only then */
+         (get_match_flag_by_pos(rec_ptr) == MATCH_FOUND);
+}
+
+
+/*
+  Read the next record from the BKA join cache buffer when looking for matches 
+
+  SYNOPSIS
+    read_next_candidate_for_match
+    rec_ptr  pointer to the position in the join buffer right after 
+             the previous record
+
+  DESCRIPTION
+    This implementation of the virtual method read_next_candidate_for_match
+    calls the method get_record_by_pos to read the record referenced by rec_ptr
+    from the join buffer into the record buffer. If this record refers to
+    fields in the other join buffers the call of get_record_by_pos ensures that
+    these fields are read into the corresponding record buffers as well.
+    This function is supposed to be called after a successful call of
+    the method get_next_candidate_for_match.
+    
+  RETURN VALUE   
+    none
+*/
+
+void JOIN_CACHE_BKA::read_next_candidate_for_match(uchar *rec_ptr)
+{
+  get_record_by_pos(rec_ptr);   /* read the record referenced by rec_ptr */
+} 
+
+
+/*
+  Initialize the BKA join cache 
+
+  SYNOPSIS
+    init
+
+  DESCRIPTION
+    The function initializes the cache structure. It is supposed to be called
+    right after a constructor for the JOIN_CACHE_BKA.
+
+  NOTES
+    The function first constructs a companion object of the type 
+    JOIN_TAB_SCAN_MRR, then it calls the init method of the parent class.
+    
+  RETURN VALUE   
+    0   initialization with buffer allocations has succeeded
+    1   otherwise
+*/
+
+int JOIN_CACHE_BKA::init()
+{
+  int res;
+  /* The skip_record callback is installed only for first-match checks */
+  bool first_match_only= join_tab->check_only_first_match();
+  RANGE_SEQ_IF rs_funcs= { bka_range_seq_key_info,
+                           bka_range_seq_init,
+                           bka_range_seq_next,
+                           first_match_only ? bka_range_seq_skip_record : 0,
+                           bka_skip_index_tuple };
+
+  DBUG_ENTER("JOIN_CACHE_BKA::init");
+
+  /* Create the companion JOIN_TAB_SCAN_MRR object of the cache */
+  JOIN_TAB_SCAN_MRR *jsm= new JOIN_TAB_SCAN_MRR(join, join_tab,
+                                                mrr_mode, rs_funcs);
+  join_tab_scan= jsm;
+  if (!jsm)
+    DBUG_RETURN(1);
+  res= JOIN_CACHE::init();
+  if (res)
+    DBUG_RETURN(res);
+
+  if (use_emb_key)
+    jsm->mrr_mode |= HA_MRR_MATERIALIZED_KEYS;
+  DBUG_RETURN(0);
+}
+
+
+/* 
+  Get the key built over the next record from BKA join buffer
+
+  SYNOPSIS
+    get_next_key()
+      key    pointer to the buffer where the key value is to be placed
+
+  DESCRIPTION
+    The function reads key fields from the current record in the join buffer.
+    and builds the key value out of these fields that will be used to access
+    the 'join_tab' table. Some of key fields may belong to previous caches.
+    They are accessed via record references to the record parts stored in the
+    previous join buffers. The other key fields always are placed right after
+    the flag fields of the record.
+    If the key is embedded, which means that its value can be read directly
+    from the join buffer, then *key is set to the beginning of the key in
+    this buffer. Otherwise the key is built in the join_tab->ref->key_buff.
+    The function returns the length of the key if it succeeds to read it.
+    It is assumed that the function starts reading at the position of
+    the record length which is provided for each record in a BKA cache.
+    After the key is built the 'pos' value points to the first position after
+    the current record.
+    The function just skips the records with MATCH_IMPOSSIBLE in the
+    match flag field if there is any. 
+    The function returns 0 if the initial position is after the beginning
+    of the record fields for last record from the join buffer. 
+
+  RETURN VALUE
+    length of the key value - if the starting value of 'pos' points to
+    the position before the fields for the last record,
+    0 - otherwise.     
+*/
+
+uint JOIN_CACHE_BKA::get_next_key(uchar ** key)
+{
+  uint len;
+  uint32 rec_len;
+  uchar *init_pos;
+  JOIN_CACHE *cache;
+  /* Records flagged MATCH_IMPOSSIBLE are skipped by jumping back here */
+start:
+
+  /* Any record in a BKA cache is prepended with its length */
+  DBUG_ASSERT(with_length);
+  /* No key is returned for an empty buffer or past the last record */
+  if ((pos+size_of_rec_len) > last_rec_pos || !records)
+    return 0;
+
+  /* Read the length of the record */
+  rec_len= get_rec_length(pos);
+  pos+= size_of_rec_len; 
+  init_pos= pos;
+
+  /* Step over the reference to the previous cache if any */
+  if (prev_cache)
+    pos+= prev_cache->get_size_of_rec_offset();
+
+  curr_rec_pos= pos;
+
+  /* Read all flag fields of the record */
+  read_flag_fields();
+
+  if (with_match_flag && 
+      (Match_flag) curr_rec_pos[0] == MATCH_IMPOSSIBLE )
+  {
+    pos= init_pos+rec_len;
+    goto start;
+  }
+  /* Take the key directly from the buffer or build it from the fields */
+  if (use_emb_key)
+  {
+    /* An embedded key is taken directly from the join buffer */
+    *key= pos;
+    len= emb_key_length;
+  }
+  else
+  {
+    /* Read key arguments from previous caches if there are any such fields */
+    if (external_key_arg_fields)
+    {
+      uchar *rec_ptr= curr_rec_pos;
+      uint key_arg_count= external_key_arg_fields;
+      CACHE_FIELD **copy_ptr= blob_ptr-key_arg_count;
+      for (cache= prev_cache; key_arg_count; cache= cache->prev_cache)
+      { 
+        uint len= 0;             /* NOTE(review): shadows the outer 'len' */
+        DBUG_ASSERT(cache);
+        rec_ptr= cache->get_rec_ref(rec_ptr);
+        while (!cache->referenced_fields) /* pass caches without such fields */
+        {
+          cache= cache->prev_cache;
+          DBUG_ASSERT(cache);
+          rec_ptr= cache->get_rec_ref(rec_ptr);
+        }
+        while (key_arg_count && 
+               cache->read_referenced_field(*copy_ptr, rec_ptr, &len))
+        {
+          copy_ptr++;
+          --key_arg_count;
+        }
+      }
+    }
+    
+    /* 
+      Read the other key arguments from the current record. The fields for
+      these arguments are always first in the sequence of the record's fields.
+    */     
+    CACHE_FIELD *copy= field_descr+flag_fields;
+    CACHE_FIELD *copy_end= copy+local_key_arg_fields;
+    bool blob_in_rec_buff= blob_data_is_in_rec_buff(curr_rec_pos);
+    for ( ; copy < copy_end; copy++)
+      read_record_field(copy, blob_in_rec_buff);
+    
+    /* Build the key over the fields read into the record buffers */ 
+    TABLE_REF *ref= &join_tab->ref;
+    cp_buffer_from_ref(join->thd, join_tab->table, ref);
+    *key= ref->key_buff;
+    len= ref->key_length;
+  }
+  /* Set the cursor to the first position after the current record */
+  pos= init_pos+rec_len;
+
+  return len;
+} 
+
+
+/*
+  Check the index condition of the joined table for a record from the BKA cache
+
+  SYNOPSIS
+    skip_index_tuple()
+      range_info       pointer to the record returned by MRR 
+    
+  DESCRIPTION
+    This function is invoked from MRR implementation to check if an index
+    tuple matches the index condition. It is used in the case where the index
+    condition actually depends on both columns of the used index and columns
+    from previous tables.
+   
+  NOTES 
+    Accessing columns of the previous tables requires special handling with
+    BKA. The idea of BKA is to collect record combinations in a buffer and 
+    then do a batch of ref access lookups, i.e. by the time we're doing a
+    lookup its previous-records-combination is not in prev_table->record[0]
+    but somewhere in the join buffer.    
+    We need to get it from there back into prev_table(s)->record[0] before we
+    can evaluate the index condition, and that's why we need this function
+    instead of regular IndexConditionPushdown.
+
+  NOTES
+    Possible optimization:
+    Before we unpack the record from a previous table
+    check if this table is used in the condition.
+    If so then unpack the record otherwise skip the unpacking.
+    This should be done by a special virtual method
+    get_partial_record_by_pos().
+
+  RETURN VALUE
+    1    the record combination does not satisfy the index condition
+    0    otherwise
+*/
+
+bool JOIN_CACHE_BKA::skip_index_tuple(range_id_t range_info)
+{
+  DBUG_ENTER("JOIN_CACHE_BKA::skip_index_tuple");
+  get_record_by_pos((uchar*)range_info); /* read the buffered record back */
+  DBUG_RETURN(!join_tab->cache_idx_cond->val_int()); /* skip if cond is 0 */
+}
+
+
+
+/*
+  Start the enumeration of ranges (join keys) for the BKAH join algorithm
+
+  SYNOPSIS
+    bkah_range_seq_init()
+      init_param    pointer to the BKAH join cache object
+      n_ranges      the number of ranges obtained (not used here)
+      flags         combination of MRR flags (not used here)
+
+  DESCRIPTION
+    The function treats init_param as a pointer to a JOIN_CACHE_BKAH
+    object and resets this cache for reading, so that the distinct join
+    keys built over the records from the cache join buffer can be
+    iterated over afterwards.
+
+  NOTE
+    This function is used only as a callback function.
+
+  RETURN VALUE
+    init_param    value that is to be used as a parameter of
+                  bkah_range_seq_next()
+*/
+
+static 
+range_seq_t bkah_range_seq_init(void *init_param, uint n_ranges, uint flags)
+{
+  DBUG_ENTER("bkah_range_seq_init");
+  ((JOIN_CACHE_BKAH *) init_param)->reset(FALSE);
+  DBUG_RETURN((range_seq_t) init_param);
+}
+
+
+/*
+  Fetch the next range/key built over records from a BKAH join buffer
+
+  SYNOPSIS
+    bkah_range_seq_next()
+      rseq       value returned by bkah_range_seq_init()
+      range  OUT descriptor of the next range
+
+  DESCRIPTION
+    The function treats rseq as a pointer to a JOIN_CACHE_BKAH object.
+    It fills the range descriptor with an equality range for the next
+    unique key built over records from the join buffer.
+
+  NOTE
+    This function is used only as a callback function.
+
+  RETURN VALUE
+    FALSE  ok, the range structure filled with info about the next range/key
+    TRUE   no more ranges
+*/
+
+static 
+bool bkah_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+{
+  DBUG_ENTER("bkah_range_seq_next");
+  JOIN_CACHE_BKAH *cache= (JOIN_CACHE_BKAH *) rseq;
+  TABLE_REF *tab_ref= &cache->join_tab->ref;
+  key_range *key= &range->start_key;
+  key->length= cache->get_next_key((uchar **) &key->key);
+  if (!key->length)
+    DBUG_RETURN(1);                        /* no more keys in the buffer */
+  /* Describe an equality range over all key parts of the ref access */
+  key->keypart_map= (1 << tab_ref->key_parts) - 1;
+  key->flag= HA_READ_KEY_EXACT;
+  range->end_key= *key;
+  range->end_key.flag= HA_READ_AFTER_KEY;
+  range->ptr= (char *) cache->get_curr_key_chain();
+  range->range_flag= EQ_RANGE;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Check whether the record with the given range_info is to be skipped
+
+  SYNOPSIS
+    bkah_range_seq_skip_record()
+      rseq             value returned by bkah_range_seq_init()
+      range_info       information about the next range/key returned by MRR
+      rowid [NOT USED] rowid of the record to be checked (not used)
+
+  DESCRIPTION
+    The function treats rseq as a pointer to a JOIN_CACHE_BKAH object.
+    It returns TRUE if the record with this range_info is to be filtered
+    out from the stream of records returned by multi_range_read_next().
+
+  NOTE
+    This function is used only as a callback function.
+
+  RETURN VALUE
+    1    record with this range_info is to be filtered out from the stream
+         of records returned by multi_range_read_next()
+    0    the record is to be left in the stream
+*/
+
+static 
+bool bkah_range_seq_skip_record(range_seq_t rseq, range_id_t range_info,
+                                uchar *rowid)
+{
+  DBUG_ENTER("bkah_range_seq_skip_record");
+  JOIN_CACHE_BKAH *bkah_cache= (JOIN_CACHE_BKAH *) rseq;
+  uchar *key_chain= (uchar *) range_info;
+  bool skip_it= bkah_cache->check_all_match_flags_for_key(key_chain);
+  DBUG_RETURN(skip_it);
+}
+
+ 
+/*
+  Check if the record combination from BKAH cache matches the index condition
+
+  SYNOPSIS
+    bkah_skip_index_tuple()
+      rseq             value returned by bkah_range_seq_init()
+      range_info       record chain for the next range/key returned by MRR
+
+  DESCRIPTION
+    This is a wrapper for the JOIN_CACHE_BKAH::skip_index_tuple method,
+    see comments there.
+
+  NOTE
+    This function is used as a RANGE_SEQ_IF::skip_index_tuple callback.
+
+  RETURN VALUE
+    0    some records from the chain satisfy the index condition
+    1    otherwise
+*/
+
+static 
+bool bkah_skip_index_tuple(range_seq_t rseq, range_id_t range_info)
+{
+  DBUG_ENTER("bkah_skip_index_tuple");
+  JOIN_CACHE_BKAH *cache= (JOIN_CACHE_BKAH *) rseq;
+  DBUG_RETURN(cache->skip_index_tuple(range_info));
+}
+
+
+/*
+  Prepare to read record from BKAH cache matching the current joined record
+
+  SYNOPSIS
+    prepare_look_for_matches()
+      skip_last <-> ignore the last record in the buffer (always unused here)
+
+  DESCRIPTION
+    The function prepares to iterate over records in the join cache buffer
+    matching the record loaded into the record buffer for join_tab when
+    performing join operation by BKAH join algorithm. With BKAH algorithm, if
+    association labels are used, then record loaded into the record buffer
+    for join_tab always has a direct reference to the chain of the matching
+    records from the join buffer. If association labels are not used then
+    the chain of the matching records is obtained by the call of the
+    get_matching_chain_by_join_key function.
+
+  RETURN VALUE
+    TRUE    there are no records in the buffer to iterate over
+    FALSE   otherwise
+*/
+
+bool JOIN_CACHE_BKAH::prepare_look_for_matches(bool skip_last)
+{
+  last_matching_rec_ref_ptr= next_matching_rec_ref_ptr= 0;
+  if (no_association &&
+      !(curr_matching_chain= get_matching_chain_by_join_key()))
+    return 1;               /* no chain for the join key: nothing to iterate */
+  last_matching_rec_ref_ptr= get_next_rec_ref(curr_matching_chain);
+  return 0;
+}
+
+/*
+  Set up a join cache that employs the BKAH join algorithm
+
+  SYNOPSIS
+    init()
+
+  DESCRIPTION
+    The function initializes the cache structure. It is supposed to be called
+    right after a constructor for the JOIN_CACHE_BKAH.
+
+  NOTES
+    The function first builds the set of MRR callbacks and a companion object
+    of the type JOIN_TAB_SCAN_MRR, then it calls JOIN_CACHE_HASHED::init().
+
+  RETURN VALUE
+    0   initialization with buffer allocations has succeeded
+    1   otherwise
+*/
+
+int JOIN_CACHE_BKAH::init()
+{
+  bool first_match_only= join_tab->check_only_first_match();
+
+  no_association= test(mrr_mode & HA_MRR_NO_ASSOCIATION);
+
+  /* skip_record is set only if first-match check is on and labels are used */
+  RANGE_SEQ_IF seq_funcs= { bka_range_seq_key_info,
+                            bkah_range_seq_init,
+                            bkah_range_seq_next,
+                            first_match_only && !no_association ?
+                              bkah_range_seq_skip_record : 0,
+                            bkah_skip_index_tuple };
+
+  DBUG_ENTER("JOIN_CACHE_BKAH::init");
+
+  join_tab_scan= new JOIN_TAB_SCAN_MRR(join, join_tab, mrr_mode, seq_funcs);
+  if (!join_tab_scan)
+    DBUG_RETURN(1);
+  DBUG_RETURN(JOIN_CACHE_HASHED::init());
+}
+
+
+/*
+  Check the index condition of the joined table for a record from BKAH cache
+
+  SYNOPSIS
+    skip_index_tuple()
+      range_info       record chain returned by MRR
+
+  DESCRIPTION
+    See JOIN_CACHE_BKA::skip_index_tuple().
+    This function is the variant for use with the class JOIN_CACHE_BKAH.
+    The difference from JOIN_CACHE_BKA case is that there may be multiple
+    previous table record combinations that share the same key(MRR range).
+    As a consequence, we need to loop through the chain of all table record
+    combinations that match the given MRR range key range_info until we find
+    one that satisfies the index condition.
+
+  NOTE
+    Possible optimization:
+    Before we unpack the record from a previous table
+    check if this table is used in the condition.
+    If so then unpack the record otherwise skip the unpacking.
+    This should be done by a special virtual method
+    get_partial_record_by_pos().
+
+  RETURN VALUE
+    1    no record combination from the chain referred by range_info
+         satisfies the index condition
+    0    otherwise
+*/
+
+bool JOIN_CACHE_BKAH::skip_index_tuple(range_id_t range_info)
+{
+  uchar *chain_tail= get_next_rec_ref((uchar*) range_info);
+  uchar *rec_ref= chain_tail;
+  DBUG_ENTER("JOIN_CACHE_BKAH::skip_index_tuple");
+  for (;;)
+  {
+    /* Step to the next record of the circular chain and unpack it */
+    rec_ref= get_next_rec_ref(rec_ref);
+    get_record_by_pos(rec_ref + rec_fields_offset);
+    if (join_tab->cache_idx_cond->val_int())
+      DBUG_RETURN(FALSE);
+    if (rec_ref == chain_tail)
+      break;                          /* the whole chain has been walked */
+  }
+  DBUG_RETURN(TRUE);
+}
diff --git a/sql/sql_join_cache.h b/sql/sql_join_cache.h
new file mode 100644
index 00000000000..5498192122f
--- /dev/null
+++ b/sql/sql_join_cache.h
@@ -0,0 +1,1407 @@
+/*
+  This file contains declarations for implementations
+  of block based join algorithms
+*/
+
+/* Bit masks (1, 2, 4); presumably OR-ed into one value - TODO confirm */
+#define JOIN_CACHE_INCREMENTAL_BIT           1
+#define JOIN_CACHE_HASHED_BIT                2
+#define JOIN_CACHE_BKA_BIT                   4
+/* 
+  Categories of data fields of variable length written into join cache buffers.
+  The value of any of these fields is written into cache together with the
+  prepended length of the value.     
+*/
+#define CACHE_BLOB      1        /* blob field  */
+#define CACHE_STRIPPED  2        /* field stripped of trailing spaces */
+#define CACHE_VARSTR1   3        /* short string value (length takes 1 byte) */ 
+#define CACHE_VARSTR2   4        /* long string value (length takes 2 bytes) */
+
+/*
+  The CACHE_FIELD structure is used to describe fields of records that
+  are written into a join cache buffer from record buffers and backward.
+*/
+typedef struct st_cache_field {
+  uchar *str;   /**< buffer from/to where the field is to be copied */ 
+  uint length;  /**< maximal number of bytes to be copied from/to str */
+  /* 
+    Field object for the moved field
+    (0 - for a flag field, see JOIN_CACHE::create_flag_fields).
+  */
+  Field *field;
+  uint type;    /**< category of the copied field (CACHE_BLOB et al.) */
+  /* 
+    The number of the record offset value for the field in the sequence
+    of offsets placed after the last field of the record. These
+    offset values are used to access fields referred to from other caches.
+    If the value is 0 then no offset for the field is saved in the
+    trailing sequence of offsets.
+  */ 
+  uint referenced_field_no; 
+  /* The remaining structure fields are used as containers for temp values */
+  uint blob_length; /**< length of the blob to be copied */
+  uint offset;      /**< field offset to be saved in cache buffer */
+} CACHE_FIELD;
+
+
+class JOIN_TAB_SCAN;
+
+
+/*
+  JOIN_CACHE is the base class to support the implementations of 
+  - Block Nested Loop (BNL) Join Algorithm,
+  - Block Nested Loop Hash (BNLH) Join Algorithm,
+  - Batched Key Access (BKA) Join Algorithm.
+  The first algorithm is supported by the derived class JOIN_CACHE_BNL,
+  the second algorithm is supported by the derived class JOIN_CACHE_BNLH,
+  while the third algorithm is implemented in two variants supported by
+  the classes JOIN_CACHE_BKA and JOIN_CACHE_BKAH.
+  These three algorithms have a lot in common. Each of them first accumulates
+  the records of the left join operand in a join buffer and then searches for
+  matching rows of the second operand for all accumulated records.
+  For the first two algorithms this strategy saves on logical I/O operations:
+  the entire set of records from the join buffer requires only one look-through
+  of the records provided by the second operand. 
+  For the third algorithm the accumulation of records allows to optimize
+  fetching rows of the second operand from disk for some engines (MyISAM, 
+  InnoDB), or to minimize the number of round-trips between the Server and
+  the engine nodes (NDB Cluster).        
+*/ 
+
+class JOIN_CACHE :public Sql_alloc
+{
+private:
+
+  /* Size of the offset of a record from the cache */
+  uint size_of_rec_ofs;
+  /* Size of the length of a record in the cache */
+  uint size_of_rec_len;
+  /* Size of the offset of a field within a record in the cache */
+  uint size_of_fld_ofs;
+
+protected:
+
+  /* 3 functions below actually do not use the hidden parameter 'this' */
+
+  /* Calculate the number of bytes used to store an offset value */
+  uint offset_size(uint len)
+  { return (len < 256 ? 1 : len < 256*256 ? 2 : 4); }
+
+  /* Get the offset value that takes ofs_sz bytes at the position ptr */
+  ulong get_offset(uint ofs_sz, uchar *ptr)
+  {
+    switch (ofs_sz) {
+    case 1: return uint(*ptr);
+    case 2: return uint2korr(ptr);
+    case 4: return uint4korr(ptr);
+    }
+    return 0;
+  }
+
+  /* Set the offset value ofs that takes ofs_sz bytes at the position ptr */
+  void store_offset(uint ofs_sz, uchar *ptr, ulong ofs)
+  {
+    switch (ofs_sz) {
+    case 1: *ptr= (uchar) ofs; return;
+    case 2: int2store(ptr, (uint16) ofs); return;
+    case 4: int4store(ptr, (uint32) ofs); return;
+    }
+  }
+
+  /*
+    The maximum total length of the fields stored for a record in the cache.
+    For blob fields only the sizes of the blob lengths are taken into account.
+  */
+  uint length;
+
+  /*
+    Representation of the executed multi-way join through which all needed
+    context can be accessed.
+  */
+  JOIN *join;
+
+  /*
+    Cardinality of the range of join tables whose fields can be put into the
+    cache. A table from the range not necessarily contributes to the cache.
+  */
+  uint tables;
+
+  /*
+    The total number of flag and data fields that can appear in a record
+    written into the cache. Fields with null values are always skipped
+    to save space.
+  */
+  uint fields;
+
+  /*
+    The total number of flag fields in a record put into the cache. They are
+    used for table null bitmaps, table null row flags, and an optional match
+    flag. Flag fields go before other fields in a cache record with the match
+    flag field placed always at the very beginning of the record.
+  */
+  uint flag_fields;
+
+  /* The total number of blob fields that are written into the cache */
+  uint blobs;
+
+  /*
+    The total number of fields referenced from field descriptors for other join
+    caches. These fields are used to construct key values.
+    When BKA join algorithm is employed the constructed key values serve to
+    access matching rows with index lookups.
+    The key values are put into a hash table when the BNLH join algorithm
+    is employed and when BKAH is used for the join operation.
+  */
+  uint referenced_fields;
+
+  /*
+    The current number of already created data field descriptors.
+    This number can be useful for implementations of the init methods.
+  */
+  uint data_field_count;
+
+  /*
+    The current number of already created pointers to the data field
+    descriptors. This number can be useful for implementations of
+    the init methods.
+  */
+  uint data_field_ptr_count;
+
+  /*
+    Array of the descriptors of fields containing 'fields' elements.
+    These are all fields that are stored for a record in the cache.
+  */
+  CACHE_FIELD *field_descr;
+
+  /*
+    Array of pointers to the blob descriptors that contains 'blobs' elements.
+  */
+  CACHE_FIELD **blob_ptr;
+
+  /*
+    This flag indicates that records written into the join buffer contain
+    a match flag field. The flag must be set by the init method.
+  */
+  bool with_match_flag;
+  /*
+    This flag indicates that any record is prepended with the length of the
+    record which allows us to skip the record or part of it without reading.
+  */
+  bool with_length;
+
+  /*
+    The maximal number of bytes used for a record representation in
+    the cache excluding the space for blob data.
+    For future derived classes this representation may contain some
+    redundant info such as a key value associated with the record.
+  */
+  uint pack_length;
+  /*
+    The value of pack_length incremented by the total size of all
+    pointers of a record in the cache to the blob data.
+  */
+  uint pack_length_with_blob_ptrs;
+
+  /*
+    The total size of the record base prefix. The base prefix of record may
+    include the following components:
+     - the length of the record
+     - the link to a record in a previous buffer.
+    Each record in the buffer is supplied with the same set of the components.
+  */
+  uint base_prefix_length;
+
+  /*
+    The expected length of a record in the join buffer together with
+    all prefixes and postfixes
+  */
+  size_t avg_record_length;
+
+  /* The expected size of the space per record in the auxiliary buffer */
+  size_t avg_aux_buffer_incr;
+
+  /* Expected join buffer space used for one record */
+  size_t space_per_record;
+
+  /* Pointer to the beginning of the join buffer */
+  uchar *buff;
+  /*
+    Size of the entire memory allocated for the join buffer.
+    Part of this memory may be reserved for the auxiliary buffer.
+  */
+  size_t buff_size;
+  /* The minimal join buffer size when join buffer still makes sense to use */
+  size_t min_buff_size;
+  /* The maximum expected size of the join buffer to be used */
+  size_t max_buff_size;
+  /* Size of the auxiliary buffer */
+  size_t aux_buff_size;
+
+  /* The number of records put into the join buffer */
+  size_t records;
+  /*
+    The number of records in the fully refilled join buffer of
+    the minimal size equal to min_buff_size
+  */
+  size_t min_records;
+  /*
+    The maximum expected number of records to be put in the join buffer
+    at one refill
+  */
+  size_t max_records;
+
+  /*
+    Pointer to the current position in the join buffer.
+    This member is used both when writing to buffer and
+    when reading from it.
+  */
+  uchar *pos;
+  /*
+    Pointer to the first free position in the join buffer,
+    right after the last record into it.
+  */
+  uchar *end_pos;
+
+  /*
+    Pointer to the beginning of the first field of the current read/write
+    record from the join buffer. The value is adjusted by the
+    get_record/put_record functions.
+  */
+  uchar *curr_rec_pos;
+  /*
+    Pointer to the beginning of the first field of the last record
+    from the join buffer.
+  */
+  uchar *last_rec_pos;
+
+  /*
+    Flag is set if the blob data for the last record in the join buffer
+    is in record buffers rather than in the join cache.
+  */
+  bool last_rec_blob_data_is_in_rec_buff;
+
+  /*
+    Pointer to the position to the current record link.
+    Record links are used only with linked caches. Record links allow to set
+    connections between parts of one join record that are stored in different
+    join buffers.
+    In the simplest case a record link is just a pointer to the beginning of
+    the record stored in the buffer.
+    In a more general case a link could be a reference to an array of pointers
+    to records in the buffer.
+  */
+  uchar *curr_rec_link;
+
+  /*
+    This flag is set to TRUE if join_tab is the first inner table of an outer
+    join and  the latest record written to the join buffer is detected to be
+    null complemented after checking on conditions over the outer tables for
+    this outer join operation
+  */
+  bool last_written_is_null_compl;
+
+  /*
+    The number of fields put in the join buffer of the join cache that are
+    used in building keys to access the table join_tab
+  */
+  uint local_key_arg_fields;
+  /*
+    The total number of the fields in the previous caches that are used
+    in building keys to access the table join_tab
+  */
+  uint external_key_arg_fields;
+
+  /*
+    This flag indicates that the key values will be read directly from the join
+    buffer. It will save us building key values in the key buffer.
+  */
+  bool use_emb_key;
+  /* The length of an embedded key value */
+  uint emb_key_length;
+
+  /*
+    This object provides the methods to iterate over records of
+    the joined table join_tab when looking for join matches between
+    records from join buffer and records from join_tab.
+    BNL and BNLH join algorithms retrieve all records from join_tab,
+    while BKA/BKAH algorithm iterates only over those records from
+    join_tab that can be accessed by look-ups with join keys built
+    from records in join buffer.
+  */
+  JOIN_TAB_SCAN *join_tab_scan;
+
+  void calc_record_fields();
+  void collect_info_on_key_args();
+  int alloc_fields();
+  void create_flag_fields();
+  void create_key_arg_fields();
+  void create_remaining_fields();
+  void set_constants();
+  int alloc_buffer();
+
+  /* Shall reallocate the join buffer */
+  virtual int realloc_buffer();
+
+  /* Check the possibility to read the access keys directly from join buffer */
+  bool check_emb_key_usage();
+
+  uint get_size_of_rec_offset() { return size_of_rec_ofs; }
+  uint get_size_of_rec_length() { return size_of_rec_len; }
+  uint get_size_of_fld_offset() { return size_of_fld_ofs; }
+
+  uchar *get_rec_ref(uchar *ptr)
+  {
+    return buff+get_offset(size_of_rec_ofs, ptr-size_of_rec_ofs);
+  }
+  ulong get_rec_length(uchar *ptr)
+  {
+    return (ulong) get_offset(size_of_rec_len, ptr);
+  }
+  ulong get_fld_offset(uchar *ptr)
+  {
+    return (ulong) get_offset(size_of_fld_ofs, ptr);
+  }
+
+  void store_rec_ref(uchar *ptr, uchar* ref)
+  {
+    store_offset(size_of_rec_ofs, ptr-size_of_rec_ofs, (ulong) (ref-buff));
+  }
+  void store_rec_length(uchar *ptr, ulong len)
+  {
+    store_offset(size_of_rec_len, ptr, len);
+  }
+  void store_fld_offset(uchar *ptr, ulong ofs)
+  {
+    store_offset(size_of_fld_ofs, ptr, ofs);
+  }
+
+  /* Write record fields and their required offsets into the join buffer */
+  uint write_record_data(uchar *link, bool *is_full);
+
+  /* Get the total length of all prefixes of a record in the join buffer */
+  virtual uint get_prefix_length() { return base_prefix_length; }
+  /* Get maximum total length of all affixes of a record in the join buffer */
+  virtual uint get_record_max_affix_length();
+
+  /*
+    Shall get maximum size of the additional space per record used for
+    record keys
+  */
+  virtual uint get_max_key_addon_space_per_record() { return 0; }
+
+  /*
+    This method must determine for how much the auxiliary buffer should be
+    incremented when a new record is added to the join buffer.
+    If no auxiliary buffer is needed the function should return 0.
+  */
+  virtual uint aux_buffer_incr(ulong recno);
+
+  /* Shall calculate the remaining join buffer space (0 if none is left) */
+  virtual size_t rem_space()
+  {
+    size_t used= (size_t) (end_pos-buff) + aux_buff_size; /* unsigned math */
+    return buff_size > used ? buff_size-used : 0;
+  }
+
+  /*
+    Shall calculate how much space is taken by allocation of the key
+    for a record in the join buffer
+  */
+  virtual uint extra_key_length() { return 0; }
+
+  /*  Read all flag and data fields of a record from the join buffer */
+  uint read_all_record_fields();
+
+  /* Read all flag fields of a record from the join buffer */
+  uint read_flag_fields();
+
+  /* Read a data record field from the join buffer */
+  uint read_record_field(CACHE_FIELD *copy, bool last_record);
+
+  /* Read a referenced field from the join buffer */
+  bool read_referenced_field(CACHE_FIELD *copy, uchar *rec_ptr, uint *len);
+
+  /*
+    Shall skip record from the join buffer if its match flag
+    is set to MATCH_FOUND
+ */
+  virtual bool skip_if_matched();
+
+  /*
+    Shall skip record from the join buffer if its match flag
+    commands to do so
+  */
+  virtual bool skip_if_not_needed_match();
+
+  /*
+    True if rec_ptr points to the record whose blob data stay in
+    record buffers
+  */
+  bool blob_data_is_in_rec_buff(uchar *rec_ptr)
+  {
+    return rec_ptr == last_rec_pos && last_rec_blob_data_is_in_rec_buff;
+  }
+
+  /* Find matches from the next table for records from the join buffer */
+  virtual enum_nested_loop_state join_matching_records(bool skip_last);
+
+  /* Shall set an auxiliary buffer up (currently used only by BKA joins) */
+  virtual int setup_aux_buffer(HANDLER_BUFFER &aux_buff)
+  {
+    DBUG_ASSERT(0);
+    return 0;
+  }
+
+  /*
+    Shall get the number of ranges in the cache buffer passed
+    to the MRR interface
+  */
+  virtual uint get_number_of_ranges_for_mrr() { return 0; };
+
+  /*
+    Shall prepare to look for records from the join cache buffer that would
+    match the record of the joined table read into the record buffer
+  */
+  virtual bool prepare_look_for_matches(bool skip_last)= 0;
+  /*
+    Shall return a pointer to the record from join buffer that is checked
+    as the next candidate for a match with the current record from join_tab.
+    Each implementation of this virtual function should bear in mind
+    that the record position it returns shall be exactly the position
+    passed as the parameter to the implementations of the virtual functions
+    skip_next_candidate_for_match and read_next_candidate_for_match.
+  */
+  virtual uchar *get_next_candidate_for_match()= 0;
+  /*
+    Shall check whether the match flag settings of the given record from
+    the join buffer command to skip the record in the buffer.
+  */
+  virtual bool skip_next_candidate_for_match(uchar *rec_ptr)= 0;
+  /*
+    Shall read the given record from the join buffer into
+    the corresponding record buffer
+  */
+  virtual void read_next_candidate_for_match(uchar *rec_ptr)= 0;
+
+  /*
+    Shall return the location of the association label returned by
+    the multi_read_range_next function for the current record loaded
+    into join_tab's record buffer
+  */
+  virtual uchar **get_curr_association_ptr() { return 0; };
+
+  /* Add null complements for unmatched outer records from the join buffer */
+  virtual enum_nested_loop_state join_null_complements(bool skip_last);
+
+  /* Restore the fields of the last record from the join buffer */
+  virtual void restore_last_record();
+
+  /* Set match flag for a record in join buffer if it has not been set yet */
+  bool set_match_flag_if_none(JOIN_TAB *first_inner, uchar *rec_ptr);
+
+  enum_nested_loop_state generate_full_extensions(uchar *rec_ptr);
+
+  /* Check matching to a partial join record from the join buffer */
+  bool check_match(uchar *rec_ptr);
+
+  /*
+    This constructor creates an unlinked join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables
+    specified by the 'j' parameter.
+  */
+  JOIN_CACHE(JOIN *j, JOIN_TAB *tab)
+  {
+    join= j;
+    join_tab= tab;
+    prev_cache= next_cache= 0;
+    buff= 0;
+  }
+
+  /*
+    This constructor creates a linked join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables
+    specified by the 'j' parameter. The parameter 'prev' specifies the previous
+    cache object to which this cache is linked.
+  */
+  JOIN_CACHE(JOIN *j, JOIN_TAB *tab, JOIN_CACHE *prev)
+  {
+    join= j;
+    join_tab= tab;
+    next_cache= 0;
+    prev_cache= prev;
+    buff= 0;
+    if (prev)
+      prev->next_cache= this;
+  }
+
+public:
+
+  /*
+    The enumeration type Join_algorithm includes a mnemonic constant for
+    each join algorithm that employs join buffers
+  */
+
+  enum Join_algorithm
+  {
+    BNL_JOIN_ALG,     /* Block Nested Loop Join algorithm                  */
+    BNLH_JOIN_ALG,    /* Block Nested Loop Hash Join algorithm             */
+    BKA_JOIN_ALG,     /* Batched Key Access Join algorithm                 */
+    BKAH_JOIN_ALG,    /* Batched Key Access with Hash Table Join Algorithm */
+  };
+
+  /*
+    The enumeration type Match_flag describes possible states of the match flag
+    field  stored for the records of the first inner tables of outer joins and
+    semi-joins in the cases when the first match strategy is used for them.
+    When a record with match flag field is written into the join buffer the
+    state of the field usually is MATCH_NOT_FOUND unless this is a record of the
+    first inner table of the outer join for which the on precondition (the
+    condition from on expression over outer tables)  has turned out not to be
+    true. In the last case the state of the match flag is MATCH_IMPOSSIBLE.
+    The state of the match flag field is changed to MATCH_FOUND as soon as
+    the first full matching combination of inner tables of the outer join or
+    the semi-join is discovered.
+  */
+  enum Match_flag { MATCH_NOT_FOUND, MATCH_FOUND, MATCH_IMPOSSIBLE };
+
+  /* Table to be joined with the partial join records from the cache */
+  JOIN_TAB *join_tab;
+
+  /* Pointer to the previous join cache if there is any */
+  JOIN_CACHE *prev_cache;
+  /* Pointer to the next join cache if there is any */
+  JOIN_CACHE *next_cache;
+
+  /* Shall initialize the join cache structure */
+  virtual int init();
+
+  /* Get the current size of the cache join buffer */
+  size_t get_join_buffer_size() { return buff_size; }
+  /* Set the size of the cache join buffer to a new value */
+  void set_join_buffer_size(size_t sz) { buff_size= sz; }
+
+  /* Get the minimum possible size of the cache join buffer */
+  virtual ulong get_min_join_buffer_size();
+  /* Get the maximum possible size of the cache join buffer */
+  virtual ulong get_max_join_buffer_size(bool optimize_buff_size);
+
+  /* Shrink the size of the cache join buffer in a given ratio */
+  bool shrink_join_buffer_in_ratio(ulonglong n, ulonglong d);
+
+  /*  Shall return the type of the employed join algorithm */
+  virtual enum Join_algorithm get_join_alg()= 0;
+
+  /*
+    The function shall return TRUE only when there is a key access
+    to the join table
+  */
+  virtual bool is_key_access()= 0;
+
+  /* Shall reset the join buffer for reading/writing */
+  virtual void reset(bool for_writing);
+
+  /*
+    This function shall add a record into the join buffer and return TRUE
+    if it has been decided that it should be the last record in the buffer.
+  */
+  virtual bool put_record();
+
+  /*
+    This function shall read the next record from the join buffer and return
+    TRUE if there are no more records.
+  */
+  virtual bool get_record();
+
+  /*
+    This function shall read the record at the position rec_ptr
+    in the join buffer
+  */
+  virtual void get_record_by_pos(uchar *rec_ptr);
+
+  /* Shall return the value of the match flag for the positioned record */
+  virtual enum Match_flag get_match_flag_by_pos(uchar *rec_ptr);
+
+  /* Shall return the position of the current record */
+  virtual uchar *get_curr_rec() { return curr_rec_pos; }
+
+  /* Shall set the current record link */
+  virtual void set_curr_rec_link(uchar *link) { curr_rec_link= link; }
+
+  /* Shall return the current record link */
+  virtual uchar *get_curr_rec_link()
+  {
+    return (curr_rec_link ? curr_rec_link : get_curr_rec());
+  }
+
+  /* Join records from the join buffer with records from the next join table */
+  enum_nested_loop_state join_records(bool skip_last);
+
+  /* Add a comment on the join algorithm employed by the join cache */
+  void print_explain_comment(String *str);
+
+  virtual ~JOIN_CACHE() {}
+  void reset_join(JOIN *j) { join= j; }
+  void free()
+  {
+    x_free(buff);
+    buff= 0;
+  }
+
+  JOIN_TAB *get_next_table(JOIN_TAB *tab);
+
+  friend class JOIN_CACHE_HASHED;
+  friend class JOIN_CACHE_BNL;
+  friend class JOIN_CACHE_BKA;
+  friend class JOIN_TAB_SCAN;
+  friend class JOIN_TAB_SCAN_MRR;
+
+};
+
+
+/*
+  The class JOIN_CACHE_HASHED is the base class for the classes
+  JOIN_CACHE_BNLH and JOIN_CACHE_BKAH. The first of them supports
+  an implementation of Block Nested Loop Hash (BNLH) Join Algorithm,
+  while the second is used for a variant of the BKA Join algorithm that performs
+  only one lookup for any records from join buffer with the same key value. 
+  For a join cache of this class the records from the join buffer that have
+  the same access key are linked into a chain attached to a key entry structure
+  that either itself contains the key value, or, in the case when the keys are
+  embedded, refers to its occurrence in one of the records from the chain.
+  To build the chains with the same keys a hash table is employed. It is placed
+  at the very end of the join buffer. The array of hash entries is allocated
+  first at the very bottom of the join buffer, while key entries are placed
+  before this array.
+  A hash entry contains a header of the list of the key entries with the same
+  hash value. 
+  Each key entry is a structure of the following type:
+    struct st_join_cache_key_entry {
+      union { 
+        uchar[] value;
+        cache_ref *value_ref; // offset from the beginning of the buffer
+      } hash_table_key;
+      key_ref next_key; // offset backward from the beginning of hash table
+      cache_ref *last_rec; // offset from the beginning of the buffer
+    }
+  The references linking the records in a chain are always placed at the very
+  beginning of the record info stored in the join buffer. The records are 
+  linked in a circular list. A new record is always added to the end of this 
+  list.
+
+  The following picture represents a typical layout for the info stored in the
+  join buffer of a join cache object of the JOIN_CACHE_HASHED class.
+    
+  buff
+  V
+  +----------------------------------------------------------------------------+
+  |     |[*]record_1_1|                                                        |
+  |     ^ |                                                                    |
+  |     | +--------------------------------------------------+                 |
+  |     |                           |[*]record_2_1|          |                 |
+  |     |                           ^ |                      V                 |
+  |     |                           | +------------------+   |[*]record_1_2|   |
+  |     |                           +--------------------+-+   |               |
+  |+--+ +---------------------+                          | |   +-------------+ |
+  ||  |                       |                          V |                 | |
+  |||[*]record_3_1|         |[*]record_1_3|              |[*]record_2_2|     | |
+  ||^                       ^                            ^                   | |
+  ||+----------+            |                            |                   | |
+  ||^          |            |<---------------------------+-------------------+ |
+  |++          | | ... mrr  |   buffer ...           ... |     |               |
+  |            |            |                            |                     |
+  |      +-----+--------+   |                      +-----|-------+             |
+  |      V     |        |   |                      V     |       |             |
+  ||key_3|[/]|[*]|      |   |                |key_2|[/]|[*]|     |             |
+  |                   +-+---|-----------------------+            |             |
+  |                   V |   |                       |            |             |
+  |             |key_1|[*]|[*]|         |   | ... |[*]|   ...  |[*]|  ...  |   |
+  +----------------------------------------------------------------------------+
+                                        ^           ^            ^
+                                        |           i-th entry   j-th entry
+                                        hash table
+
+  i-th hash entry:
+    circular record chain for key_1:
+      record_1_1
+      record_1_2
+      record_1_3 (points to record_1_1)
+    circular record chain for key_3:
+      record_3_1 (points to itself)
+
+  j-th hash entry:
+    circular record chain for key_2:
+      record_2_1
+      record_2_2 (points to record_2_1)
+
+*/
+
+class JOIN_CACHE_HASHED: public JOIN_CACHE
+{
+
+  typedef uint (JOIN_CACHE_HASHED::*Hash_func) (uchar *key, uint key_len);
+  typedef bool (JOIN_CACHE_HASHED::*Hash_cmp_func) (uchar *key1, uchar *key2,
+                                                    uint key_len);
+  
+private:
+
+  /* Size of the offset of a key entry in the hash table */
+  uint size_of_key_ofs;
+
+  /* 
+    Length of the key entry in the hash table.
+    A key entry either contains the key value, or it contains a reference
+    to the key value if use_emb_key flag is set for the cache.
+  */ 
+  uint key_entry_length;
+ 
+  /* The beginning of the hash table in the join buffer */
+  uchar *hash_table;
+  /* Number of hash entries in the hash table */
+  uint hash_entries;
+
+
+  /* The position of the currently retrieved key entry in the hash table */
+  uchar *curr_key_entry;
+
+  /* The offset of the data fields from the beginning of the record fields */
+  uint data_fields_offset;
+
+  inline uint get_hash_idx_simple(uchar *key, uint key_len);
+  inline uint get_hash_idx_complex(uchar *key, uint key_len);
+
+  inline bool equal_keys_simple(uchar *key1, uchar *key2, uint key_len);
+  inline bool equal_keys_complex(uchar *key1, uchar *key2, uint key_len);
+
+  int init_hash_table();
+  void cleanup_hash_table();
+  
+protected:
+
+  /* 
+    Index info on the TABLE_REF object used by the hash join
+    to look for matching records
+  */    
+  KEY *ref_key_info;
+  /* 
+    Number of the key parts the TABLE_REF object used by the hash join
+    to look for matching records
+  */    
+  uint ref_used_key_parts;
+
+  /*
+    The hash function used in the hash table,
+    usually set by the init() method
+  */ 
+  Hash_func hash_func;
+  /*
+    The function to check whether two key entries in the hash table
+    are equal or not, usually set by the init() method
+  */ 
+  Hash_cmp_func hash_cmp_func;
+
+  /* 
+    Length of a key value.
+    It is assumed that all key values have the same length.
+  */
+  uint key_length;
+  /* Buffer to store key values for probing */
+  uchar *key_buff;
+
+  /* Number of key entries in the hash table (number of distinct keys) */
+  uint key_entries;
+
+  /* The position of the last key entry in the hash table */
+  uchar *last_key_entry;
+
+  /* 
+    The offset of the record fields from the beginning of the record
+    representation. The record representation starts with a reference to
+    the next record in the key record chain followed by the length of
+    the trailing record data followed by a reference to the record segment
+    in the previous cache, if any, followed by the record fields.
+  */ 
+  uint rec_fields_offset;
+
+  uint get_size_of_key_offset() { return size_of_key_ofs; }
+
+  /* 
+    Get the position of the next_key_ptr field pointed to by 
+    a linking reference stored at the position key_ref_ptr. 
+    This reference is actually the offset backward from the
+    beginning of hash table.
+  */  
+  uchar *get_next_key_ref(uchar *key_ref_ptr)
+  {
+    return hash_table-get_offset(size_of_key_ofs, key_ref_ptr);
+  }
+
+  /* 
+    Store the linking reference to the next_key_ptr field at 
+    the position key_ref_ptr. The position of the next_key_ptr
+    field is pointed to by ref. The stored reference is actually
+    the offset backward from the beginning of the hash table.
+  */  
+  void store_next_key_ref(uchar *key_ref_ptr, uchar *ref)
+  {
+    store_offset(size_of_key_ofs, key_ref_ptr, (ulong) (hash_table-ref));
+  }     
+  
+  /* 
+    Check whether the reference to the next_key_ptr field at the position
+    key_ref_ptr contains  a nil value.
+  */
+  bool is_null_key_ref(uchar *key_ref_ptr)
+  {
+    ulong nil= 0;
+    return memcmp(key_ref_ptr, &nil, size_of_key_ofs ) == 0;
+  } 
+
+  /* 
+    Set the reference to the next_key_ptr field at the position
+    key_ref_ptr equal to nil.
+  */
+  void store_null_key_ref(uchar *key_ref_ptr)
+  {
+    ulong nil= 0;
+    store_offset(size_of_key_ofs, key_ref_ptr, nil);
+  } 
+
+  uchar *get_next_rec_ref(uchar *ref_ptr)
+  {
+    return buff+get_offset(get_size_of_rec_offset(), ref_ptr);
+  }
+
+  void store_next_rec_ref(uchar *ref_ptr, uchar *ref)
+  {
+    store_offset(get_size_of_rec_offset(), ref_ptr, (ulong) (ref-buff));
+  } 
+
+  /*
+    Get the position of the embedded key value for the current
+    record pointed to by get_curr_rec().
+  */ 
+  uchar *get_curr_emb_key()
+  {
+    return get_curr_rec()+data_fields_offset;
+  }
+
+  /*
+    Get the position of the embedded key value pointed to by a reference
+    stored at ref_ptr. The stored reference is actually the offset from
+    the beginning of the join buffer.
+  */  
+  uchar *get_emb_key(uchar *ref_ptr)
+  {
+    return buff+get_offset(get_size_of_rec_offset(), ref_ptr);
+  }
+
+  /* 
+    Store the reference to an embedded key at the position key_ref_ptr.
+    The position of the embedded key is pointed to by ref. The stored
+    reference is actually the offset from the beginning of the join buffer.
+  */  
+  void store_emb_key_ref(uchar *ref_ptr, uchar *ref)
+  {
+    store_offset(get_size_of_rec_offset(), ref_ptr, (ulong) (ref-buff));
+  }
+  
+  /* Get the total length of all prefixes of a record in hashed join buffer */ 
+  uint get_prefix_length() 
+  { 
+    return base_prefix_length + get_size_of_rec_offset();
+  }
+
+  /* 
+    Get maximum size of the additional space per record used for
+    the hash table with record keys
+  */
+  uint get_max_key_addon_space_per_record();
+
+  /* Calculate how much space in the buffer would not be occupied by
+     records, key entries and additional memory for the MRR buffer.
+     The difference is taken as a signed value so it can be clamped at 0. */
+  size_t rem_space()
+  {
+    ptrdiff_t rest= (last_key_entry-end_pos) - (ptrdiff_t) aux_buff_size;
+    return rest > 0 ? (size_t) rest : 0;
+  }
+
+  /* 
+    Calculate how much space is taken by allocation of the key
+    entry for a record in the join buffer
+  */
+  uint extra_key_length() { return key_entry_length; }
+
+  /* 
+    Skip record from a hashed join buffer if its match flag
+    is set to MATCH_FOUND
+  */
+  bool skip_if_matched();
+
+  /*
+    Skip record from a hashed join buffer if its match flag setting 
+    commands to do so
+  */
+  bool skip_if_not_needed_match();
+
+  /* Search for a key in the hash table of the join buffer */
+  bool key_search(uchar *key, uint key_len, uchar **key_ref_ptr);
+
+  /* Reallocate the join buffer of a hashed join cache */
+  int realloc_buffer();
+
+  /* 
+    This constructor creates an unlinked hashed join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter.
+  */   
+  JOIN_CACHE_HASHED(JOIN *j, JOIN_TAB *tab) :JOIN_CACHE(j, tab) {}
+
+  /* 
+    This constructor creates a linked hashed join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter. The parameter 'prev' specifies the previous
+    cache object to which this cache is linked.
+  */   
+  JOIN_CACHE_HASHED(JOIN *j, JOIN_TAB *tab, JOIN_CACHE *prev) 
+		    :JOIN_CACHE(j, tab, prev) {}
+
+public:
+
+  /* Initialize a hashed join cache */       
+  int init();
+
+  /* Reset the buffer of a hashed join cache for reading/writing */
+  void reset(bool for_writing);
+
+  /* Add a record into the buffer of a hashed join cache */
+  bool put_record();
+
+  /* Read the next record from the buffer of a hashed join cache */
+  bool get_record();
+
+  /*
+    Shall check whether all records in a key chain have 
+    their match flags set on
+  */   
+  virtual bool check_all_match_flags_for_key(uchar *key_chain_ptr);
+
+  uint get_next_key(uchar **key); 
+  
+  /* Get the head of the record chain attached to the current key entry */ 
+  uchar *get_curr_key_chain()
+  {
+    return get_next_rec_ref(curr_key_entry+key_entry_length-
+                            get_size_of_rec_offset());
+  }
+  
+};
+
+
+/*
+  The class JOIN_TAB_SCAN is a companion class for the classes JOIN_CACHE_BNL
+  and JOIN_CACHE_BNLH. Actually the class implements the iterator over the
+  table joined by the BNL/BNLH join algorithm.
+  The virtual functions open, next and close are called for any iteration over
+  the table. The function open is called to initiate the process of the 
+  iteration. The function next shall read the next record from the joined
+  table. The record is read into the record buffer of the joined table.
+  The record is to be matched with records from the join cache buffer. 
+  The function close shall perform the finalizing actions for the iteration.
+*/
+   
+class JOIN_TAB_SCAN: public Sql_alloc
+{
+
+private:
+  /* TRUE if this is the first record from the joined table to iterate over */
+  bool is_first_record;
+
+protected:
+
+  /* The joined table to be iterated over */
+  JOIN_TAB *join_tab;
+  /* The join cache used to join the table join_tab */ 
+  JOIN_CACHE *cache;
+  /* 
+    Representation of the executed multi-way join through which
+    all needed context can be accessed.  
+  */   
+  JOIN *join;
+
+public:
+  
+  JOIN_TAB_SCAN(JOIN *j, JOIN_TAB *tab)
+  {
+    join= j;
+    join_tab= tab;
+    cache= join_tab->cache;
+  }
+
+  virtual ~JOIN_TAB_SCAN() {}
+ 
+  /* 
+    Shall calculate the increment of the auxiliary buffer for a record
+    write if such a buffer is used by the table scan object 
+  */
+  virtual uint aux_buffer_incr(ulong recno) { return 0; }
+
+  /* Initiate the process of iteration over the joined table */
+  virtual int open();
+  /* 
+    Shall read the next candidate for matches with records from 
+    the join buffer.
+  */
+  virtual int next();
+  /* 
+    Perform the finalizing actions for the process of iteration
+    over the joined_table.
+  */ 
+  virtual void close();
+
+};
+
+/*
+  The class JOIN_CACHE_BNL is used when the BNL join algorithm is
+  employed to perform a join operation   
+*/
+
+class JOIN_CACHE_BNL :public JOIN_CACHE
+{
+private:
+  /* 
+    The number of the records in the join buffer that have to be
+    checked yet for a match with the current record of join_tab 
+    read into the record buffer.
+  */
+  uint rem_records;
+
+protected:
+
+  bool prepare_look_for_matches(bool skip_last);
+
+  uchar *get_next_candidate_for_match();
+
+  bool skip_next_candidate_for_match(uchar *rec_ptr);
+
+  void read_next_candidate_for_match(uchar *rec_ptr);
+
+public:
+
+  /* 
+    This constructor creates an unlinked BNL join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter.
+  */   
+  JOIN_CACHE_BNL(JOIN *j, JOIN_TAB *tab) :JOIN_CACHE(j, tab) {}
+
+  /* 
+    This constructor creates a linked BNL join cache. The cache is to be 
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter. The parameter 'prev' specifies the previous
+    cache object to which this cache is linked.
+  */   
+  JOIN_CACHE_BNL(JOIN *j, JOIN_TAB *tab, JOIN_CACHE *prev) 
+    :JOIN_CACHE(j, tab, prev) {}
+
+  /* Initialize the BNL cache */       
+  int init();
+
+  enum Join_algorithm get_join_alg() { return BNL_JOIN_ALG; }
+
+  bool is_key_access() { return FALSE; }
+
+};
+
+
+/*
+  The class JOIN_CACHE_BNLH is used when the BNLH join algorithm is
+  employed to perform a join operation   
+*/
+
+class JOIN_CACHE_BNLH :public JOIN_CACHE_HASHED
+{
+
+protected:
+
+  /* 
+    The pointer to the last record from the circular list of the records
+    that  match the join key built out of the record in the join buffer for
+    the join_tab table
+  */
+  uchar *last_matching_rec_ref_ptr;
+  /*
+    The pointer to the next current  record from the circular list of the
+    records that match the join key built out of the record in the join buffer
+    for the join_tab table. This pointer is used by the class method 
+    get_next_candidate_for_match to iterate over records from the circular
+    list.
+  */
+  uchar *next_matching_rec_ref_ptr;
+
+  /*
+    Get the chain of records from buffer matching the current candidate
+    record for join
+  */
+  uchar *get_matching_chain_by_join_key();
+
+  bool prepare_look_for_matches(bool skip_last);
+
+  uchar *get_next_candidate_for_match();
+
+  bool skip_next_candidate_for_match(uchar *rec_ptr);
+
+  void read_next_candidate_for_match(uchar *rec_ptr);
+
+public:
+
+  /* 
+    This constructor creates an unlinked BNLH join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter.
+  */   
+  JOIN_CACHE_BNLH(JOIN *j, JOIN_TAB *tab) : JOIN_CACHE_HASHED(j, tab) {}
+
+  /* 
+    This constructor creates a linked BNLH join cache. The cache is to be 
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter. The parameter 'prev' specifies the previous
+    cache object to which this cache is linked.
+  */   
+  JOIN_CACHE_BNLH(JOIN *j, JOIN_TAB *tab, JOIN_CACHE *prev) 
+    : JOIN_CACHE_HASHED(j, tab, prev) {}
+
+  /* Initialize the BNLH cache */       
+  int init();
+
+  enum Join_algorithm get_join_alg() { return BNLH_JOIN_ALG; }
+
+  bool is_key_access() { return TRUE; }
+
+};
+
+
+/*
+  The class JOIN_TAB_SCAN_MRR is a companion class for the classes
+  JOIN_CACHE_BKA and JOIN_CACHE_BKAH. Actually the class implements the
+  iterator over the records from join_tab selected by BKA/BKAH join
+  algorithm as the candidates to be joined. 
+  The virtual functions open, next and close are called for any iteration over
+  join_tab record candidates. The function open is called to initiate the
+  process of the iteration. The function next shall read the next record from
+  the set of the record candidates. The record is read into the record buffer
+  of the joined table. The function close shall perform the finalizing actions
+  for the iteration.
+*/
+   
+class JOIN_TAB_SCAN_MRR: public JOIN_TAB_SCAN
+{
+  /* Interface object to generate key ranges for MRR */
+  RANGE_SEQ_IF range_seq_funcs;
+
+  /* Number of ranges to be processed by the MRR interface */
+  uint ranges;
+
+  /* Flag to be passed to the MRR interface */
+  uint mrr_mode;
+
+  /* MRR buffer associated with this join cache */
+  HANDLER_BUFFER mrr_buff;
+
+  /* Shall initialize the MRR buffer */
+  virtual void init_mrr_buff()
+  {
+    cache->setup_aux_buffer(mrr_buff);
+  }
+
+public:
+
+  JOIN_TAB_SCAN_MRR(JOIN *j, JOIN_TAB *tab, uint flags, RANGE_SEQ_IF rs_funcs)
+    :JOIN_TAB_SCAN(j, tab), range_seq_funcs(rs_funcs), mrr_mode(flags) {}
+
+  uint aux_buffer_incr(ulong recno);
+
+  int open();
+ 
+  int next();
+
+  friend class JOIN_CACHE_BKA; /* it needs to add an mrr_mode flag after JOIN_CACHE::init() call */
+};
+
+/*
+  The class JOIN_CACHE_BKA is used when the BKA join algorithm is
+  employed to perform a join operation   
+*/
+
+class JOIN_CACHE_BKA :public JOIN_CACHE
+{
+private:
+
+  /* Flag to be passed to the companion JOIN_TAB_SCAN_MRR object */
+  uint mrr_mode;
+
+  /* 
+    This value is set to 1 by the class prepare_look_for_matches method
+    and back to 0 by the class get_next_candidate_for_match method
+  */
+  uint rem_records;
+
+  /*
+    This field contains the current association label set by a call of
+    the multi_range_read_next handler function.
+    See the function JOIN_CACHE_BKA::get_curr_key_association()
+  */
+  uchar *curr_association;
+
+protected:
+
+  /* 
+    Get the number of ranges in the cache buffer passed to the MRR
+    interface. For each record its own range is passed.
+  */
+  uint get_number_of_ranges_for_mrr() { return (uint)records; }
+
+ /*
+   Setup the MRR buffer as the space between the last record put
+   into the join buffer and the very end of the join buffer 
+ */
+  int setup_aux_buffer(HANDLER_BUFFER &aux_buff)
+  {
+    aux_buff.buffer= end_pos;
+    aux_buff.buffer_end= buff+buff_size;
+    return 0;
+  }
+
+  bool prepare_look_for_matches(bool skip_last);
+
+  uchar *get_next_candidate_for_match();
+
+  bool skip_next_candidate_for_match(uchar *rec_ptr);
+
+  void read_next_candidate_for_match(uchar *rec_ptr);
+
+public:
+
+  /* 
+    This constructor creates an unlinked BKA join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter.
+    The MRR mode initially is set to 'flags'.
+  */   
+  JOIN_CACHE_BKA(JOIN *j, JOIN_TAB *tab, uint flags)
+    :JOIN_CACHE(j, tab), mrr_mode(flags) {}
+  /* 
+    This constructor creates a linked BKA join cache. The cache is to be 
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter. The parameter 'prev' specifies the previous
+    cache object to which this cache is linked.
+    The MRR mode initially is set to 'flags'.
+  */   
+  JOIN_CACHE_BKA(JOIN *j, JOIN_TAB *tab, uint flags, JOIN_CACHE *prev)
+    :JOIN_CACHE(j, tab, prev), mrr_mode(flags) {}
+  
+  uchar **get_curr_association_ptr() { return &curr_association; }
+
+  /* Initialize the BKA cache */       
+  int init();
+
+  enum Join_algorithm get_join_alg() { return BKA_JOIN_ALG; }
+
+  bool is_key_access() { return TRUE; }
+
+  /* Get the key built over the next record from the join buffer */
+  uint get_next_key(uchar **key);
+
+  /* Check index condition of the joined table for a record from BKA cache */
+  bool skip_index_tuple(range_id_t range_info);
+
+};
+
+
+
+/*
+  The class JOIN_CACHE_BKAH is used when the BKAH join algorithm is
+  employed to perform a join operation   
+*/
+
+class JOIN_CACHE_BKAH :public JOIN_CACHE_BNLH
+{
+
+private:
+  /* Flag to be passed to the companion JOIN_TAB_SCAN_MRR object */
+  uint mrr_mode;
+
+  /* 
+    This flag is set to TRUE if the implementation of the MRR interface cannot
+    handle range association labels and does not return them to the caller of
+    the multi_range_read_next handler function. E.g. the implementation of
+    the MRR interface for the Falcon engine could not return association
+    labels to the caller of multi_range_read_next.
+    The flag is set by JOIN_CACHE_BKA::init() and is not ever changed.
+  */       
+  bool no_association;
+
+  /* 
+    This field contains the association label returned by the 
+    multi_range_read_next function.
+    See the function JOIN_CACHE_BKAH::get_curr_key_association()
+  */
+  uchar *curr_matching_chain;
+
+protected:
+
+  uint get_number_of_ranges_for_mrr() { return key_entries; }
+
+  /* 
+    Initialize the MRR buffer allocating some space within the join buffer.
+    The entire space between the last record put into the join buffer and the
+    last key entry added to the hash table is used for the MRR buffer.
+  */
+  int setup_aux_buffer(HANDLER_BUFFER &aux_buff)
+  {
+    aux_buff.buffer= end_pos;
+    aux_buff.buffer_end= last_key_entry;
+    return 0;
+  }
+
+  bool prepare_look_for_matches(bool skip_last);
+
+  /*
+    The implementations of the methods
+    - get_next_candidate_for_match
+    - skip_next_candidate_for_match
+    - read_next_candidate_for_match
+    are inherited from the JOIN_CACHE_BNLH class
+  */
+
+public:
+
+  /* 
+    This constructor creates an unlinked BKAH join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter.
+    The MRR mode initially is set to 'flags'.
+  */   
+  JOIN_CACHE_BKAH(JOIN *j, JOIN_TAB *tab, uint flags) 
+    :JOIN_CACHE_BNLH(j, tab), mrr_mode(flags) {}
+
+  /* 
+    This constructor creates a linked BKAH join cache. The cache is to be 
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter. The parameter 'prev' specifies the previous
+    cache object to which this cache is linked.
+    The MRR mode initially is set to 'flags'.
+  */   
+  JOIN_CACHE_BKAH(JOIN *j, JOIN_TAB *tab, uint flags, JOIN_CACHE *prev)
+    :JOIN_CACHE_BNLH(j, tab, prev), mrr_mode(flags)  {}
+
+  uchar **get_curr_association_ptr() { return &curr_matching_chain; }
+
+  /* Initialize the BKAH cache */       
+  int init();
+
+  enum Join_algorithm get_join_alg() { return BKAH_JOIN_ALG; }
+
+  /* Check index condition of the joined table for a record from BKAH cache */
+  bool skip_index_tuple(range_id_t range_info);
+};
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
index aea9796ba56..d0f9b5cd58d 100644
--- a/sql/sql_lex.cc
+++ b/sql/sql_lex.cc
@@ -1633,11 +1633,14 @@ void st_select_lex::init_query()
   nest_level= 0;
   link_next= 0;
   lock_option= TL_READ_DEFAULT;
+
+  bzero((char*) expr_cache_may_be_used, sizeof(expr_cache_may_be_used));
 }
 
 void st_select_lex::init_select()
 {
   st_select_lex_node::init_select();
+  sj_nests.empty();
   group_list.empty();
   type= db= 0;
   having= 0;
@@ -1829,6 +1832,55 @@ void st_select_lex_unit::exclude_tree()
 }
 
 
+/**
+  Register a reference to an item which the subqueries depend on
+
+  @param def_sel         select against which the item is resolved
+  @param dependency      reference to the item
+
+  @details
+  This function puts the reference dependency to an item that is either an
+  outer field or an aggregate function resolved against an outer select into
+  the list 'depends_on'. It adds it to the 'depends_on' lists for each
+  subquery between this one and 'def_sel' - the subquery against which the
+  item is resolved.
+*/
+
+void st_select_lex::register_dependency_item(st_select_lex *def_sel,
+                                             Item **dependency)
+{
+  SELECT_LEX *s= this;
+  DBUG_ENTER("st_select_lex::register_dependency_item");
+  DBUG_ASSERT(this != def_sel);
+  DBUG_ASSERT(*dependency);
+  do
+  {
+    /* check duplicates */
+    List_iterator_fast<Item*> li(s->master_unit()->item->depends_on);
+    Item **dep;
+    while ((dep= li++))
+    {
+      if ((*dep)->eq(*dependency, FALSE))
+      {
+         DBUG_PRINT("info", ("dependency %s already present",
+                             ((*dependency)->name ?
+                              (*dependency)->name :
+                              "<no name>")));
+         DBUG_VOID_RETURN;
+      }
+    }
+
+    s->master_unit()->item->depends_on.push_back(dependency);
+    DBUG_PRINT("info", ("depends_on: Select: %d  added: %s",
+                        s->select_number,
+                        ((*dependency)->name ?
+                         (*dependency)->name :
+                         "<no name>")));
+  } while ((s= s->outer_select()) != def_sel);
+  DBUG_VOID_RETURN;
+}
+
+
 /*
   st_select_lex_node::mark_as_dependent mark all st_select_lex struct from 
   this to 'last' as dependent
@@ -1841,9 +1893,8 @@ void st_select_lex_unit::exclude_tree()
     'last' should be reachable from this st_select_lex_node
 */
 
-void st_select_lex::mark_as_dependent(st_select_lex *last, Item *dependency)
+bool st_select_lex::mark_as_dependent(THD *thd, st_select_lex *last, Item *dependency)
 {
-  SELECT_LEX *next_to_last;
 
   DBUG_ASSERT(this != last);
 
@@ -1869,13 +1920,15 @@ void st_select_lex::mark_as_dependent(st_select_lex *last, Item *dependency)
           sl->uncacheable|= UNCACHEABLE_UNITED;
       }
     }
-    next_to_last= s;
-  } while ((s= s->outer_select()) != last && s != 0);
 
+    Item_subselect *subquery_expr= s->master_unit()->item;
+    if (subquery_expr && subquery_expr->mark_as_dependent(thd, last, 
+                                                          dependency))
+      return TRUE;
+  } while ((s= s->outer_select()) != last && s != 0);
   is_correlated= TRUE;
   this->master_unit()->item->is_correlated= TRUE;
-  if (dependency)
-    next_to_last->master_unit()->item->refers_to.push_back(dependency);
+  return FALSE;
 }
 
 bool st_select_lex_node::set_braces(bool value)      { return 1; }
@@ -2078,16 +2131,28 @@ void st_select_lex::print_limit(THD *thd,
 {
   SELECT_LEX_UNIT *unit= master_unit();
   Item_subselect *item= unit->item;
-  if (item && unit->global_parameters == this &&
-      (item->substype() == Item_subselect::EXISTS_SUBS ||
-       item->substype() == Item_subselect::IN_SUBS ||
-       item->substype() == Item_subselect::ALL_SUBS))
+
+  if (item && unit->global_parameters == this)
   {
-    DBUG_ASSERT(!item->fixed ||
-                (select_limit->val_int() == LL(1) && offset_limit == 0));
-    return;
+    Item_subselect::subs_type subs_type= item->substype();
+    if (subs_type == Item_subselect::EXISTS_SUBS ||
+        subs_type == Item_subselect::IN_SUBS ||
+        subs_type == Item_subselect::ALL_SUBS)
+    {
+      DBUG_ASSERT(!item->fixed ||
+                  /*
+                    If not using materialization both:
+                    select_limit == 1, and there should be no offset_limit.
+                  */
+                  (((subs_type == Item_subselect::IN_SUBS) &&
+                    ((Item_in_subselect*)item)->exec_method ==
+                    Item_in_subselect::MATERIALIZATION) ?
+                   TRUE :
+                   (select_limit->val_int() == 1LL) &&
+                   offset_limit == 0));
+      return;
+    }
   }
-
   if (explicit_limit)
   {
     str->append(STRING_WITH_LEN(" limit "));
@@ -2100,6 +2165,7 @@ void st_select_lex::print_limit(THD *thd,
   }
 }
 
+
 /**
   @brief Restore the LEX and THD in case of a parse error.
 
diff --git a/sql/sql_lex.h b/sql/sql_lex.h
index 64e2edba0a6..06e99cd60b6 100644
--- a/sql/sql_lex.h
+++ b/sql/sql_lex.h
@@ -566,7 +566,8 @@ public:
   bool add_fake_select_lex(THD *thd);
   void init_prepare_fake_select_lex(THD *thd);
   inline bool is_prepared() { return prepared; }
-  bool change_result(select_subselect *result, select_subselect *old_result);
+  bool change_result(select_result_interceptor *result,
+                     select_result_interceptor *old_result);
   void set_limit(st_select_lex *values);
   void set_thd(THD *thd_arg) { thd= thd_arg; }
   inline bool is_union (); 
@@ -611,6 +612,7 @@ public:
   List<TABLE_LIST> top_join_list; /* join list of the top level          */
   List<TABLE_LIST> *join_list;    /* list for the currently parsed join  */
   TABLE_LIST *embedding;          /* table embedding to the above list   */
+  List<TABLE_LIST> sj_nests;      /* Semi-join nests within this join */
   /*
     Beginning of the list of leaves in a FROM clause, where the leaves
     inlcude all base tables including view tables. The tables are connected
@@ -666,6 +668,11 @@ public:
   /* explicit LIMIT clause was used */
   bool explicit_limit;
   /*
+    This array is used to note  whether we have any candidates for
+    expression caching in the corresponding clauses
+  */
+  bool expr_cache_may_be_used[PARSING_PLACE_SIZE];
+  /*
     there are subquery in HAVING clause => we can't close tables before
     query processing end even if we use temporary table
   */
@@ -753,7 +760,8 @@ public:
   }
   inline bool is_subquery_function() { return master_unit()->item != 0; }
 
-  void mark_as_dependent(st_select_lex *last, Item *dependency);
+  bool mark_as_dependent(THD *thd, st_select_lex *last, Item *dependency);
+  void register_dependency_item(st_select_lex *last, Item **dependency);
 
   bool set_braces(bool value);
   bool inc_in_sum_expr();
@@ -837,7 +845,7 @@ public:
   }
 
   void clear_index_hints(void) { index_hints= NULL; }
-
+  bool is_part_of_union() { return master_unit()->is_union(); }
 private:  
   /* current index hint kind. used in filling up index_hints */
   enum index_hint_type current_index_hint_type;
diff --git a/sql/sql_lifo_buffer.h b/sql/sql_lifo_buffer.h
new file mode 100644
index 00000000000..34f9624436d
--- /dev/null
+++ b/sql/sql_lifo_buffer.h
@@ -0,0 +1,342 @@
+/**
+  @defgroup Bi-directional LIFO buffers used by DS-MRR implementation
+  @{
+*/
+
+class Forward_lifo_buffer;
+class Backward_lifo_buffer;
+
+
+/*
+  A base class for in-memory buffer used by DS-MRR implementation. Common
+  properties:
+  - The buffer is last-in-first-out, i.e. elements that are written last are
+    read first.
+  - The buffer contains fixed-size elements. The elements are either atomic
+    byte sequences or pairs of them.
+  - The buffer resides in the memory provided by the user. It is possible to
+     = dynamically (i.e. between write operations) add adjacent memory space
+       to the buffer
+     = dynamically remove unused space from the buffer.
+    The intent of this is to allow to have two buffers on adjacent memory
+    space, one is being read from (and so its space shrinks), while the other 
+    is being written to (and so it needs more and more space).
+
+  There are two concrete classes, Forward_lifo_buffer and Backward_lifo_buffer.
+*/
+
+class Lifo_buffer 
+{
+protected:
+  size_t size1; /**< element bytes taken from write_ptr1 by write() */
+  size_t size2; /**< element bytes taken from write_ptr2; 0 = no pairs */
+
+public:
+  /**
+    write() will put into buffer size1 bytes pointed by write_ptr1. If
+    size2!=0, then they will be accompanied by size2 bytes pointed by
+    write_ptr2.
+  */
+  uchar *write_ptr1;
+  uchar *write_ptr2;
+
+  /**
+    read() will do reading by storing pointers to read data into read_ptr1 or
+    into (read_ptr1, read_ptr2), depending on whether the buffer was set to
+    store single objects or pairs.
+  */
+  uchar *read_ptr1;
+  uchar *read_ptr2;
+
+protected:
+  uchar *start; /**< points to start of buffer space */
+  uchar *end;   /**< points to just beyond the end of buffer space */
+public:
+
+  enum enum_direction {
+    BACKWARD=-1, /**< buffer is filled/read from bigger to smaller memory addresses */
+    FORWARD=1  /**< buffer is filled/read from smaller to bigger memory addresses */
+  };
+
+  virtual enum_direction type() = 0;
+
+  /* Buffer space control functions */
+
+  /** Let the buffer store data in [start_arg, end_arg); calls reset(). */
+  void set_buffer_space(uchar *start_arg, uchar *end_arg) 
+  {
+    start= start_arg;
+    end= end_arg;
+    TRASH(start, end - start);
+    reset();
+  }
+  
+  /** 
+    Set the sizes of the element parts that write() takes from write_ptr1
+    and write_ptr2. The source pointers themselves are not set here.
+  */
+  void setup_writing(size_t len1, size_t len2)
+  {
+    size1= len1;
+    size2= len2;
+  }
+
+  /** 
+    Declare the element part sizes the reader expects. Nothing is stored:
+    this only DBUG_ASSERTs that they match those given to setup_writing().
+  */
+  void setup_reading(size_t len1, size_t len2)
+  {
+    DBUG_ASSERT(len1 == size1);
+    DBUG_ASSERT(len2 == size2);
+  }
+  /* Ask have_space_for() whether one more element (size1 + size2) fits */
+  bool can_write()
+  {
+    return have_space_for(size1 + size2);
+  }
+  virtual void write() = 0; /* store one element from write_ptr1/2 */
+
+  bool is_empty() { return used_size() == 0; }
+  virtual bool read() = 0;  /* TRUE means no element was read */
+  /* Sort the used area as an array of (size1 + size2)-byte elements */
+  void sort(qsort2_cmp cmp_func, void *cmp_func_arg)
+  {
+    size_t elem_size= size1 + size2;
+    size_t n_elements= used_size() / elem_size;
+    my_qsort2(used_area(), n_elements, elem_size, cmp_func, cmp_func_arg);
+  }
+
+  virtual void reset() = 0;
+  virtual uchar *end_of_space() = 0;
+protected:
+  virtual size_t used_size() = 0;
+  
+  /* To be used only by iterator class: */
+  virtual uchar *get_pos()= 0;
+  virtual bool read(uchar **position, uchar **ptr1, uchar **ptr2)= 0;
+  friend class Lifo_buffer_iterator;
+public:
+  virtual bool have_space_for(size_t bytes) = 0;
+
+  virtual void remove_unused_space(uchar **unused_start, uchar **unused_end)=0;
+  virtual uchar *used_area() = 0; 
+  virtual ~Lifo_buffer() {};
+};
+
+
+/**
+  Forward LIFO buffer
+
+  The buffer that is being written to from start to end and read in the
+  reverse.  'pos' points to just beyond the end of used space.
+
+  It is possible to grow the buffer at the end bound (shrinking is a stub).
+
+     used space      unused space  
+   *==============*-----------------*
+   ^              ^                 ^
+   |              |                 +--- end
+   |              +---- pos              
+   +--- start           
+*/
+
+class Forward_lifo_buffer: public Lifo_buffer
+{
+  uchar *pos; /**< one past the last written byte; next write goes here */
+public:
+  enum_direction type() { return FORWARD; }
+  size_t used_size()
+  {
+    return (size_t)(pos - start);
+  }
+  void reset()
+  {
+    pos= start;
+  }
+  uchar *end_of_space() { return pos; }
+  bool have_space_for(size_t bytes) /* TRUE if 'bytes' more bytes fit */
+  {
+    return (bytes <= (size_t)(end - pos)); /* exact fit ok, no ptr overflow */
+  }
+  /* Append one element: size1 bytes, then size2 bytes if storing pairs */
+  void write()
+  {
+    write_bytes(write_ptr1, size1);
+    if (size2)
+      write_bytes(write_ptr2, size2);
+  }
+  void write_bytes(const uchar *data, size_t bytes)
+  {
+    DBUG_ASSERT(have_space_for(bytes));
+    memcpy(pos, data, bytes);
+    pos += bytes;
+  }
+  bool have_data(uchar *position, size_t bytes)
+  {
+    return ((position - start) >= (ptrdiff_t)bytes);
+  }
+  uchar *read_bytes(uchar **position, size_t bytes)
+  {
+    DBUG_ASSERT(have_data(*position, bytes));
+    *position= (*position) - bytes; /* step back; data stays in the buffer */
+    return *position;
+  }
+  bool read() { return read(&pos, &read_ptr1, &read_ptr2); }
+  bool read(uchar **position, uchar **ptr1, uchar **ptr2)
+  { /* returns TRUE (and reads nothing) if no whole element is left */
+    if (!have_data(*position, size1 + size2))
+      return TRUE;
+    if (size2) /* the second part was written last, so it is read first */
+      *ptr2= read_bytes(position, size2);
+    *ptr1= read_bytes(position, size1);
+    return FALSE;
+  }
+  void remove_unused_space(uchar **unused_start, uchar **unused_end)
+  {
+    DBUG_ASSERT(0); /* Don't need this yet */
+  }
+  /**
+    Add more space to the buffer. The caller is responsible that the space
+    being added is adjacent to the end of the buffer.
+
+    @param unused_start Start of space
+    @param unused_end   End of space
+  */
+  void grow(uchar *unused_start, uchar *unused_end)
+  {
+    DBUG_ASSERT(unused_end >= unused_start);
+    DBUG_ASSERT(end == unused_start);
+    TRASH(unused_start, unused_end - unused_start);
+    end= unused_end;
+  }
+  /* Return pointer to start of the memory area that is occupied by the data */
+  uchar *used_area() { return start; }
+  friend class Lifo_buffer_iterator;
+  uchar *get_pos() { return pos; }
+};
+
+
+
+/**
+  Backward LIFO buffer
+
+  The buffer that is being written to from end to start and read in the
+  reverse.  'pos' points to the start of used space.
+
+  It is possible to shrink the buffer at the start (growing is a stub).
+
+     unused space      used space  
+   *--------------*=================*
+   ^              ^                 ^
+   |              |                 +--- end
+   |              +---- pos              
+   +--- start           
+*/
+class Backward_lifo_buffer: public Lifo_buffer
+{
+  uchar *pos; /**< first used byte; the next write ends just below it */
+public:
+  enum_direction type() { return BACKWARD; }
+ 
+  size_t used_size()
+  {
+    return (size_t)(end - pos);
+  }
+  void reset()
+  {
+    pos= end;
+  }
+  uchar *end_of_space() { return end; }
+  bool have_space_for(size_t bytes) /* TRUE if 'bytes' more bytes fit */
+  {
+    return (bytes <= (size_t)(pos - start)); /* never forms ptr < start */
+  }
+  void write()
+  {
+    if (size2) /* pairs: written first so that it lands after part 1 */
+      write_bytes(write_ptr2, size2);
+    write_bytes(write_ptr1, size1);
+  }
+  void write_bytes(const uchar *data, size_t bytes)
+  {
+    DBUG_ASSERT(have_space_for(bytes));
+    pos -= bytes;
+    memcpy(pos, data, bytes);
+  }
+  bool read()
+  {
+    return read(&pos, &read_ptr1, &read_ptr2);
+  }
+  bool read(uchar **position, uchar **ptr1, uchar **ptr2)
+  { /* returns TRUE (and reads nothing) if no whole element is left */
+    if (!have_data(*position, size1 + size2))
+      return TRUE;
+    *ptr1= read_bytes(position, size1);
+    if (size2)
+      *ptr2= read_bytes(position, size2);
+    return FALSE;
+  }
+  bool have_data(uchar *position, size_t bytes)
+  {
+    return ((end - position) >= (ptrdiff_t)bytes);
+  }
+  uchar *read_bytes(uchar **position, size_t bytes)
+  {
+    DBUG_ASSERT(have_data(*position, bytes));
+    uchar *ret= *position;
+    *position= *position + bytes; /* step forward; data stays in place */
+    return ret;
+  }
+  /**
+    Stop using/return the unused part of the space
+    @param unused_start  OUT Start of the unused space
+    @param unused_end    OUT End of the unused space
+  */
+  void remove_unused_space(uchar **unused_start, uchar **unused_end)
+  {
+    *unused_start= start;
+    *unused_end= pos;
+    start= pos;
+  }
+  void grow(uchar *unused_start, uchar *unused_end)
+  {
+    DBUG_ASSERT(0); /* Not used for backward buffers */
+  }
+  /* Return pointer to start of the memory area that is occupied by the data */
+  uchar *used_area() { return pos; }
+  friend class Lifo_buffer_iterator;
+  uchar *get_pos() { return pos; }
+};
+
+
+/** Iterator to walk over contents of the buffer without consuming them */
+class Lifo_buffer_iterator
+{
+  uchar *pos;       /* iterator's own cursor, copied from the buffer */
+  Lifo_buffer *buf; /* buffer being walked; set by init() */
+  
+public:
+  /* read() stores pointers to the current element's parts here */
+  uchar *read_ptr1;
+  uchar *read_ptr2;
+  /* Attach to buf_arg and start from its current position */
+  void init(Lifo_buffer *buf_arg)
+  {
+    buf= buf_arg;
+    pos= buf->get_pos();
+  }
+  /*
+    Read the next value. The calling convention is the same as buf->read()
+    has, but only this iterator's cursor is advanced.
+
+    Returns FALSE - ok, read_ptr1 (and read_ptr2 for pairs) are set
+    Returns TRUE  - EOF, reached the end of the buffer
+  */
+  bool read() 
+  {
+    return buf->read(&pos, &read_ptr1, &read_ptr2);
+  }
+};
+
+
diff --git a/sql/sql_list.h b/sql/sql_list.h
index dc840cefc66..2dade14f211 100644
--- a/sql/sql_list.h
+++ b/sql/sql_list.h
@@ -235,6 +235,11 @@ public:
   {
     if (!list->is_empty())
     {
+      if (is_empty())
+      {
+        *this= *list;
+        return;
+      }
       *last= list->first;
       last= list->last;
       elements+= list->elements;
@@ -510,36 +515,40 @@ public:
 
 
 /*
-  Exchange sort algorithm for List<T>.
+  Bubble sort algorithm for List<T>.
+  This sort function is supposed to be used only for very short lists.
+  Currently it is used for the lists of Item_equal objects and
+  for some lists in the table elimination algorithms. In both
+  cases the sorted lists are very short.
 */
+
 template <class T> 
-inline void exchange_sort(List<T> *list_to_sort,
-                          int (*sort_func)(T *a, T *b, void *arg), void *arg)
+inline void bubble_sort(List<T> *list_to_sort,
+                        int (*sort_func)(T *a, T *b, void *arg), void *arg)
 {
   bool swap;
+  T **ref1= 0;
+  T **ref2= 0;
   List_iterator<T> it(*list_to_sort);
   do
   {
+    T **last_ref= ref1;
     T *item1= it++;
-    T **ref1= it.ref();
+    ref1= it.ref();
     T *item2;
 
     swap= FALSE;
-    while ((item2= it++))
+    while ((item2= it++) && (ref2= it.ref()) != last_ref)
     {
-      T **ref2= it.ref();
       if (sort_func(item1, item2, arg) < 0)
       {
-        T *item= *ref1;
-        *ref1= *ref2;
-        *ref2= item;
+        *ref1= item2;
+        *ref2= item1;
         swap= TRUE;
       }
       else
-      {
         item1= item2;
-        ref1= ref2;
-      }
+      ref1= ref2;
     }
     it.rewind();
   } while (swap);
diff --git a/sql/sql_load.cc b/sql/sql_load.cc
index 3483a8226d4..d6276d2f47c 100644
--- a/sql/sql_load.cc
+++ b/sql/sql_load.cc
@@ -983,7 +983,7 @@ read_sep_field(THD *thd, COPY_INFO &info, TABLE_LIST *table_list,
           if (!field->maybe_null() && field->type() == FIELD_TYPE_TIMESTAMP)
               ((Field_timestamp*) field)->set_time();
           /*
-            QQ: We probably should not throw warning for each field.
+            TODO: We probably should not throw warning for each field.
             But how about intention to always have the same number
             of warnings in THD::cuted_fields (and get rid of cuted_fields
             in the end ?)
diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc
index a215644bce3..3088d807549 100644
--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@ -28,6 +28,7 @@
 #include "events.h"
 #include "sql_trigger.h"
 #include "debug_sync.h"
+#include "sql_handler.h"
 
 #ifdef WITH_ARIA_STORAGE_ENGINE
 #include "../storage/maria/ha_maria.h"
@@ -1199,6 +1200,7 @@ bool dispatch_command(enum enum_server_command command, THD *thd,
 
     general_log_write(thd, command, thd->query(), thd->query_length());
     DBUG_PRINT("query",("%-.4096s",thd->query()));
+    
 #if defined(ENABLED_PROFILING) && defined(COMMUNITY_SERVER)
     thd->profiling.set_query_source(thd->query(), thd->query_length());
 #endif
@@ -1350,54 +1352,6 @@ bool dispatch_command(enum enum_server_command command, THD *thd,
     error=TRUE;					// End server
     break;
 
-#ifdef REMOVED
-  case COM_CREATE_DB:				// QQ: To be removed
-    {
-      LEX_STRING db, alias;
-      HA_CREATE_INFO create_info;
-
-      status_var_increment(thd->status_var.com_stat[SQLCOM_CREATE_DB]);
-      if (thd->make_lex_string(&db, packet, packet_length, FALSE) ||
-          thd->make_lex_string(&alias, db.str, db.length, FALSE) ||
-          check_db_name(&db))
-      {
-	my_error(ER_WRONG_DB_NAME, MYF(0), db.str ? db.str : "NULL");
-	break;
-      }
-      if (check_access(thd, CREATE_ACL, db.str , 0, 1, 0,
-                       is_schema_db(db.str, db.length)))
-	break;
-      general_log_print(thd, command, "%.*s", db.length, db.str);
-      bzero(&create_info, sizeof(create_info));
-      mysql_create_db(thd, (lower_case_table_names == 2 ? alias.str : db.str),
-                      &create_info, 0);
-      break;
-    }
-  case COM_DROP_DB:				// QQ: To be removed
-    {
-      status_var_increment(thd->status_var.com_stat[SQLCOM_DROP_DB]);
-      LEX_STRING db;
-
-      if (thd->make_lex_string(&db, packet, packet_length, FALSE) ||
-          check_db_name(&db))
-      {
-	my_error(ER_WRONG_DB_NAME, MYF(0), db.str ? db.str : "NULL");
-	break;
-      }
-      if (check_access(thd, DROP_ACL, db.str, 0, 1, 0,
-                            is_schema_db(db.str, db.length)))
-	break;
-      if (thd->locked_tables || thd->active_transaction())
-      {
-	my_message(ER_LOCK_OR_ACTIVE_TRANSACTION,
-                   ER(ER_LOCK_OR_ACTIVE_TRANSACTION), MYF(0));
-	break;
-      }
-      general_log_write(thd, command, "%.*s", db.length, db.str);
-      mysql_rm_db(thd, db.str, 0, 0);
-      break;
-    }
-#endif
 #ifndef EMBEDDED_LIBRARY
   case COM_BINLOG_DUMP:
     {
@@ -5098,6 +5052,7 @@ static bool execute_sqlcom_select(THD *thd, TABLE_LIST *all_tables)
       param->select_limit=
         new Item_int((ulonglong) thd->variables.select_limit);
   }
+  thd->thd_marker.emb_on_expr_nest= NULL;
   if (!(res= open_and_lock_tables(thd, all_tables)))
   {
     if (lex->describe)
@@ -5870,6 +5825,7 @@ void mysql_reset_thd_for_next_command(THD *thd, my_bool calculate_userstat)
 
   thd->query_plan_flags= QPLAN_INIT;
   thd->query_plan_fsort_passes= 0;
+  thd->thd_marker.emb_on_expr_nest= NULL;
 
   /*
     Because we come here only for start of top-statements, binlog format is
diff --git a/sql/sql_plugin_services.h b/sql/sql_plugin_services.h
index 14a2a16561a..8d4055dd764 100644
--- a/sql/sql_plugin_services.h
+++ b/sql/sql_plugin_services.h
@@ -22,6 +22,8 @@ struct st_service_ref {
   void *service;
 };
 
+#ifdef HAVE_DLOPEN
+
 static struct my_snprintf_service_st my_snprintf_handler = {
   my_snprintf,
   my_vsnprintf
@@ -41,4 +43,4 @@ static struct st_service_ref list_of_services[] __attribute__((unused)) =
   { "my_snprintf_service", VERSION_my_snprintf, &my_snprintf_handler },
   { "thd_alloc_service",   VERSION_thd_alloc,   &thd_alloc_handler }
 };
-
+#endif
diff --git a/sql/sql_prepare.cc b/sql/sql_prepare.cc
index c7109982f93..db623133cb1 100644
--- a/sql/sql_prepare.cc
+++ b/sql/sql_prepare.cc
@@ -95,6 +95,7 @@ When one supplies long data for a placeholder:
 #else
 #include <mysql_com.h>
 #endif
+#include "sql_handler.h"
 
 /**
   A result class used to send cursor rows using the binary protocol.
@@ -243,6 +244,8 @@ static bool send_prep_stmt(Prepared_statement *stmt, uint columns)
   int error;
   THD *thd= stmt->thd;
   DBUG_ENTER("send_prep_stmt");
+  DBUG_PRINT("enter",("stmt->id: %lu  columns: %d  param_count: %d",
+                      stmt->id, columns, stmt->param_count));
 
   buff[0]= 0;                                   /* OK packet indicator */
   int4store(buff+1, stmt->id);
@@ -1385,6 +1388,7 @@ static int mysql_test_select(Prepared_statement *stmt,
     goto error;
 
   thd->used_tables= 0;                        // Updated by setup_fields
+  thd->thd_marker.emb_on_expr_nest= 0;
 
   /*
     JOIN::prepare calls
@@ -1834,6 +1838,56 @@ static bool mysql_test_insert_select(Prepared_statement *stmt,
   return res;
 }
 
+/**
+  Validate HANDLER READ statement.
+
+    In case of success, unless this is an SQL-level PREPARE, send the
+    statement metadata and the column list info back to the client.
+
+  @param stmt               prepared statement
+  @param tables             list of tables used in the query
+
+  @retval 0 success
+  @retval 1 error, error message is set in THD
+  @retval 2 success, and statement metadata has been sent
+*/
+
+static int mysql_test_handler_read(Prepared_statement *stmt,
+                                   TABLE_LIST *tables)
+{
+  THD *thd= stmt->thd;
+  LEX *lex= stmt->lex;
+  SQL_HANDLER *ha_table;
+  DBUG_ENTER("mysql_test_handler_read");
+
+  lex->select_lex.context.resolve_in_select_list= TRUE;
+
+  /*
+    We don't have to test for permissions as this is already done during
+    HANDLER OPEN
+  */
+  if (!(ha_table= mysql_ha_read_prepare(thd, tables, lex->ha_read_mode,
+                                        lex->ident.str,
+                                        lex->insert_list,
+                                        lex->select_lex.where)))
+    DBUG_RETURN(1);
+  /* Not an SQL-level PREPARE: reply with statement and column metadata */
+  if (!stmt->is_sql_prepare())
+  {
+    if (!lex->result && !(lex->result= new (stmt->mem_root) select_send))
+    {
+      my_error(ER_OUTOFMEMORY, MYF(0), (int) sizeof(select_send));
+      DBUG_RETURN(1);
+    }
+    if (send_prep_stmt(stmt, ha_table->fields.elements) ||
+        lex->result->send_fields(ha_table->fields, Protocol::SEND_EOF) ||
+        thd->protocol->flush())
+      DBUG_RETURN(1);
+    DBUG_RETURN(2); /* tells the caller the reply has already been sent */
+  }
+  DBUG_RETURN(0);
+}
+
 
 /**
   Perform semantic analysis of the parsed tree and send a response packet
@@ -1948,6 +2002,11 @@ static bool check_prepared_statement(Prepared_statement *stmt)
     res= mysql_test_insert_select(stmt, tables);
     break;
 
+  case SQLCOM_HA_READ:
+    res= mysql_test_handler_read(stmt, tables);
+    /* Statement and field info has already been sent */
+    DBUG_RETURN(res == 1 ? TRUE : FALSE);
+
     /*
       Note that we don't need to have cases in this list if they are
       marked with CF_STATUS_COMMAND in sql_command_flags
diff --git a/sql/sql_rename.cc b/sql/sql_rename.cc
index df7054c94d0..3da0fafb830 100644
--- a/sql/sql_rename.cc
+++ b/sql/sql_rename.cc
@@ -19,7 +19,7 @@
 
 #include "mysql_priv.h"
 #include "sql_trigger.h"
-
+#include "sql_handler.h"
 
 static TABLE_LIST *rename_tables(THD *thd, TABLE_LIST *table_list,
 				 bool skip_error);
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index 052c01725de..bc31fd62dfd 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -32,23 +32,20 @@
 #include "mysql_priv.h"
 #include "sql_select.h"
 #include "sql_cursor.h"
+#include "opt_subselect.h"
 
 #include <m_ctype.h>
 #include <my_bit.h>
 #include <hash.h>
 #include <ft_global.h>
-#if defined(WITH_ARIA_STORAGE_ENGINE) && defined(USE_MARIA_FOR_TMP_TABLES)
-#include "../storage/maria/ha_maria.h"
-#define TMP_ENGINE_HTON maria_hton
-#else
-#define TMP_ENGINE_HTON myisam_hton
-#endif
 
 const char *join_type_str[]={ "UNKNOWN","system","const","eq_ref","ref",
 			      "MAYBE_REF","ALL","range","index","fulltext",
 			      "ref_or_null","unique_subquery","index_subquery",
-                              "index_merge"
-};
+                              "index_merge", "hash_ALL", "hash_range",
+                              "hash_index", "hash_index_merge" };
+
+const char *copy_to_tmp_table= "Copying to tmp table";
 
 struct st_sargable_param;
 
@@ -64,11 +61,12 @@ static bool update_ref_and_keys(THD *thd, DYNAMIC_ARRAY *keyuse,
 static int sort_keyuse(KEYUSE *a,KEYUSE *b);
 static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
 			       table_map used_tables);
-static bool choose_plan(JOIN *join,table_map join_tables);
+bool choose_plan(JOIN *join,table_map join_tables);
 
-static void best_access_path(JOIN *join, JOIN_TAB *s, THD *thd,
-                             table_map remaining_tables, uint idx,
-                             double record_count, double read_time);
+void best_access_path(JOIN *join, JOIN_TAB *s, 
+                             table_map remaining_tables, uint idx, 
+                             bool disable_jbuf, double record_count,
+                             POSITION *pos, POSITION *loose_scan_pos);
 static void optimize_straight_join(JOIN *join, table_map join_tables);
 static bool greedy_search(JOIN *join, table_map remaining_tables,
                              uint depth, uint prune_level);
@@ -78,8 +76,9 @@ static bool best_extension_by_limited_search(JOIN *join,
                                              double read_time, uint depth,
                                              uint prune_level);
 static uint determine_search_depth(JOIN* join);
-static int join_tab_cmp(const void* ptr1, const void* ptr2);
-static int join_tab_cmp_straight(const void* ptr1, const void* ptr2);
+static int join_tab_cmp(const void *dummy, const void* ptr1, const void* ptr2);
+static int join_tab_cmp_straight(const void *dummy, const void* ptr1, const void* ptr2);
+static int join_tab_cmp_embedded_first(const void *emb, const void* ptr1, const void *ptr2);
 /*
   TODO: 'find_best' is here only temporarily until 'greedy_search' is
   tested and approved.
@@ -94,8 +93,11 @@ static store_key *get_store_key(THD *thd,
 				KEY_PART_INFO *key_part, uchar *key_buff,
 				uint maybe_null);
 static void make_outerjoin_info(JOIN *join);
+static Item*
+make_cond_after_sjm(Item *root_cond, Item *cond, table_map tables, table_map sjm_tables);
 static bool make_join_select(JOIN *join,SQL_SELECT *select,COND *item);
-static void make_join_readinfo(JOIN *join, ulonglong options);
+static void revise_cache_usage(JOIN_TAB *join_tab);
+static bool make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after);
 static bool only_eq_ref_tables(JOIN *join, ORDER *order, table_map tables);
 static void update_depend_map(JOIN *join);
 static void update_depend_map(JOIN *join, ORDER *order);
@@ -113,7 +115,7 @@ static COND* substitute_for_best_equal_field(COND *cond,
                                              COND_EQUAL *cond_equal,
                                              void *table_join_idx);
 static COND *simplify_joins(JOIN *join, List<TABLE_LIST> *join_list,
-                            COND *conds, bool top);
+                            COND *conds, bool top, bool in_sj);
 static bool check_interleaving_with_nj(JOIN_TAB *next);
 static void restore_prev_nj_state(JOIN_TAB *last);
 static uint reset_nj_counters(JOIN *join, List<TABLE_LIST> *join_list);
@@ -122,16 +124,12 @@ static uint build_bitmap_for_nested_joins(List<TABLE_LIST> *join_list,
 
 static COND *optimize_cond(JOIN *join, COND *conds,
                            List<TABLE_LIST> *join_list,
-			   Item::cond_result *cond_value);
+			   Item::cond_result *cond_value, 
+                           COND_EQUAL **cond_equal);
 static bool const_expression_in_where(COND *conds,Item *item, Item **comp_item);
-static bool open_tmp_table(TABLE *table);
-static bool create_internal_tmp_table(TABLE *,TMP_TABLE_PARAM *, ulonglong);
-static bool create_internal_tmp_table_from_heap2(THD *thd, TABLE *table,
-                                                 TMP_TABLE_PARAM *param,
-                                                 int error,
-                                                 bool ignore_last_dupp,
-                                                 handlerton *hton,
-                                                 const char *proc_info);
+static bool create_internal_tmp_table_from_heap2(THD *, TABLE *,
+                                     ENGINE_COLUMNDEF *, ENGINE_COLUMNDEF **, 
+                                     int, bool, handlerton *, const char *);
 static int do_select(JOIN *join,List<Item> *fields,TABLE *tmp_table,
 		     Procedure *proc);
 
@@ -141,10 +139,8 @@ evaluate_join_record(JOIN *join, JOIN_TAB *join_tab,
 static enum_nested_loop_state
 evaluate_null_complemented_join_record(JOIN *join, JOIN_TAB *join_tab);
 static enum_nested_loop_state
-flush_cached_records(JOIN *join, JOIN_TAB *join_tab, bool skip_last);
-static enum_nested_loop_state
 end_send(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
-static enum_nested_loop_state
+enum_nested_loop_state
 end_send_group(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
 static enum_nested_loop_state
 end_write(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
@@ -152,7 +148,7 @@ static enum_nested_loop_state
 end_update(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
 static enum_nested_loop_state
 end_unique_update(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
-static enum_nested_loop_state
+enum_nested_loop_state
 end_write_group(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
 
 static int test_if_group_changed(List<Cached_item> &list);
@@ -167,7 +163,7 @@ static int join_no_more_records(READ_RECORD *info);
 static int join_read_next(READ_RECORD *info);
 static int join_init_quick_read_record(JOIN_TAB *tab);
 static int test_if_quick_select(JOIN_TAB *tab);
-static int join_init_read_record(JOIN_TAB *tab);
+static bool test_if_use_dynamic_range_scan(JOIN_TAB *join_tab);
 static int join_read_first(JOIN_TAB *tab);
 static int join_read_next(READ_RECORD *info);
 static int join_read_next_same(READ_RECORD *info);
@@ -178,13 +174,21 @@ static int join_ft_read_first(JOIN_TAB *tab);
 static int join_ft_read_next(READ_RECORD *info);
 int join_read_always_key_or_null(JOIN_TAB *tab);
 int join_read_next_same_or_null(READ_RECORD *info);
-static COND *make_cond_for_table(COND *cond,table_map table,
-				 table_map used_table);
+static COND *make_cond_for_table(Item *cond,table_map table,
+                                 table_map used_table,
+                                 bool exclude_expensive_cond,
+                                 bool retain_ref_cond);
+static COND *make_cond_for_table_from_pred(Item *root_cond, Item *cond,
+                                           table_map tables,
+                                           table_map used_table,
+                                           bool exclude_expensive_cond,
+                                           bool retain_ref_cond);
+
 static Item* part_of_refkey(TABLE *form,Field *field);
 uint find_shortest_key(TABLE *table, const key_map *usable_keys);
 static bool test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,
 				    ha_rows select_limit, bool no_changes,
-                                    key_map *map);
+                                    const key_map *map);
 static bool list_contains_unique_index(TABLE *table,
                           bool (*find_func) (Field *, void *), void *data);
 static bool find_field_in_item_list (Field *field, void *data);
@@ -198,15 +202,8 @@ static int remove_dup_with_compare(THD *thd, TABLE *entry, Field **field,
 				   ulong offset,Item *having);
 static int remove_dup_with_hash_index(THD *thd,TABLE *table,
 				      uint field_count, Field **first_field,
-
 				      ulong key_length,Item *having);
-static int join_init_cache(THD *thd,JOIN_TAB *tables,uint table_count);
-static ulong used_blob_length(CACHE_FIELD **ptr);
-static bool store_record_in_cache(JOIN_CACHE *cache);
-static void reset_cache_read(JOIN_CACHE *cache);
-static void reset_cache_write(JOIN_CACHE *cache);
-static void read_cached_record(JOIN_TAB *tab);
-static bool cmp_buffer_with_ref(JOIN_TAB *tab);
+static bool cmp_buffer_with_ref(THD *thd, TABLE *table, TABLE_REF *tab_ref);
 static bool setup_new_fields(THD *thd, List<Item> &fields,
 			     List<Item> &all_fields, ORDER *new_order);
 static ORDER *create_distinct_group(THD *thd, Item **ref_pointer_array,
@@ -237,10 +234,15 @@ static bool init_sum_functions(Item_sum **func, Item_sum **end);
 static bool update_sum_func(Item_sum **func);
 static void select_describe(JOIN *join, bool need_tmp_table,bool need_order,
 			    bool distinct, const char *message=NullS);
-static Item *remove_additional_cond(Item* conds);
 static void add_group_and_distinct_keys(JOIN *join, JOIN_TAB *join_tab);
-static bool test_if_ref(Item_field *left_item,Item *right_item);
+void get_partial_join_cost(JOIN *join, uint idx, double *read_time_arg,
+                           double *record_count_arg);
+static uint make_join_orderinfo(JOIN *join);
+static int
+join_read_record_no_init(JOIN_TAB *tab);
 
+Item_equal *find_item_equal(COND_EQUAL *cond_equal, Field *field,
+                            bool *inherited_fl);
 
 /**
   This handles SELECT with and without UNION.
@@ -461,6 +463,7 @@ inline int setup_without_group(THD *thd, Item **ref_pointer_array,
   mysql_select assumes that all tables are already opened
 *****************************************************************************/
 
+
 /**
   Prepare of whole select (including sub queries in future).
 
@@ -547,25 +550,13 @@ JOIN::prepare(Item ***rref_pointer_array,
       DBUG_RETURN(-1);				/* purecov: inspected */
     thd->lex->allow_sum_func= save_allow_sum_func;
   }
-
-  if (!(thd->lex->context_analysis_only & CONTEXT_ANALYSIS_ONLY_VIEW) &&
-      !(select_options & SELECT_DESCRIBE))
-  {
-    Item_subselect *subselect;
-    /* Is it subselect? */
-    if ((subselect= select_lex->master_unit()->item))
-    {
-      Item_subselect::trans_res res;
-      if ((res= subselect->select_transformer(this)) !=
-	  Item_subselect::RES_OK)
-      {
-        select_lex->fix_prepare_information(thd, &conds, &having);
-	DBUG_RETURN((res == Item_subselect::RES_ERROR));
-      }
-    }
-  }
+  
+  int res= check_and_do_in_subquery_rewrites(this);
 
   select_lex->fix_prepare_information(thd, &conds, &having);
+  
+  if (res)
+    DBUG_RETURN(res);
 
   if (order)
   {
@@ -732,88 +723,6 @@ err:
 }
 
 
-/*
-  Remove the predicates pushed down into the subquery
-
-  SYNOPSIS
-    JOIN::remove_subq_pushed_predicates()
-      where   IN  Must be NULL
-              OUT The remaining WHERE condition, or NULL
-
-  DESCRIPTION
-    Given that this join will be executed using (unique|index)_subquery,
-    without "checking NULL", remove the predicates that were pushed down
-    into the subquery.
-
-    If the subquery compares scalar values, we can remove the condition that
-    was wrapped into trig_cond (it will be checked when needed by the subquery
-    engine)
-
-    If the subquery compares row values, we need to keep the wrapped
-    equalities in the WHERE clause: when the left (outer) tuple has both NULL
-    and non-NULL values, we'll do a full table scan and will rely on the
-    equalities corresponding to non-NULL parts of left tuple to filter out
-    non-matching records.
-
-    TODO: We can remove the equalities that will be guaranteed to be true by the
-    fact that subquery engine will be using index lookup. This must be done only
-    for cases where there are no conversion errors of significance, e.g. 257
-    that is searched in a byte. But this requires homogenization of the return 
-    codes of all Field*::store() methods.
-*/
-
-void JOIN::remove_subq_pushed_predicates(Item **where)
-{
-  if (conds->type() == Item::FUNC_ITEM &&
-      ((Item_func *)this->conds)->functype() == Item_func::EQ_FUNC &&
-      ((Item_func *)conds)->arguments()[0]->type() == Item::REF_ITEM &&
-      ((Item_func *)conds)->arguments()[1]->type() == Item::FIELD_ITEM &&
-      test_if_ref ((Item_field *)((Item_func *)conds)->arguments()[1],
-                   ((Item_func *)conds)->arguments()[0]))
-  {
-    *where= 0;
-    return;
-  }
-}
-
-
-/*
-  Index lookup-based subquery: save some flags for EXPLAIN output
-
-  SYNOPSIS
-    save_index_subquery_explain_info()
-      join_tab  Subquery's join tab (there is only one as index lookup is
-                only used for subqueries that are single-table SELECTs)
-      where     Subquery's WHERE clause
-
-  DESCRIPTION
-    For index lookup-based subquery (i.e. one executed with
-    subselect_uniquesubquery_engine or subselect_indexsubquery_engine),
-    check its EXPLAIN output row should contain 
-      "Using index" (TAB_INFO_FULL_SCAN_ON_NULL) 
-      "Using Where" (TAB_INFO_USING_WHERE)
-      "Full scan on NULL key" (TAB_INFO_FULL_SCAN_ON_NULL)
-    and set appropriate flags in join_tab->packed_info.
-*/
-
-static void save_index_subquery_explain_info(JOIN_TAB *join_tab, Item* where)
-{
-  join_tab->packed_info= TAB_INFO_HAVE_VALUE;
-  if (join_tab->table->covering_keys.is_set(join_tab->ref.key))
-    join_tab->packed_info |= TAB_INFO_USING_INDEX;
-  if (where)
-    join_tab->packed_info |= TAB_INFO_USING_WHERE;
-  for (uint i = 0; i < join_tab->ref.key_parts; i++)
-  {
-    if (join_tab->ref.cond_guards[i])
-    {
-      join_tab->packed_info |= TAB_INFO_FULL_SCAN_ON_NULL;
-      break;
-    }
-  }
-}
-
-
 /**
   global select optimisation.
 
@@ -829,6 +738,9 @@ static void save_index_subquery_explain_info(JOIN_TAB *join_tab, Item* where)
 int
 JOIN::optimize()
 {
+  ulonglong select_opts_for_readinfo;
+  uint no_jbuf_after;
+
   DBUG_ENTER("JOIN::optimize");
   // to prevent double initialization on EXPLAIN
   if (optimized)
@@ -836,6 +748,14 @@ JOIN::optimize()
   optimized= 1;
 
   thd_proc_info(thd, "optimizing");
+
+  set_allowed_join_cache_types();
+
+  /* dump_TABLE_LIST_graph(select_lex, select_lex->leaf_tables); */
+  if (convert_join_subqueries_to_semijoins(this))
+    DBUG_RETURN(1); /* purecov: inspected */
+  /* dump_TABLE_LIST_graph(select_lex, select_lex->leaf_tables); */
+
   row_limit= ((select_distinct || order || group_list) ? HA_POS_ERROR :
 	      unit->select_limit_cnt);
   /* select_limit is used to decide if we are likely to scan the whole table */
@@ -885,7 +805,7 @@ JOIN::optimize()
     sel->first_cond_optimization= 0;
 
     /* Convert all outer joins to inner joins if possible */
-    conds= simplify_joins(this, join_list, conds, TRUE);
+    conds= simplify_joins(this, join_list, conds, TRUE, FALSE);
     build_bitmap_for_nested_joins(join_list, 0);
 
     sel->prep_where= conds ? conds->copy_andor_structure(thd) : 0;
@@ -894,7 +814,7 @@ JOIN::optimize()
       thd->restore_active_arena(arena, &backup);
   }
 
-  conds= optimize_cond(this, conds, join_list, &cond_value);   
+  conds= optimize_cond(this, conds, join_list, &cond_value, &cond_equal);   
   if (thd->is_error())
   {
     error= 1;
@@ -903,7 +823,7 @@ JOIN::optimize()
   }
 
   {
-    having= optimize_cond(this, having, join_list, &having_value);
+    having= optimize_cond(this, having, join_list, &having_value, &having_equal);
     if (thd->is_error())
     {
       error= 1;
@@ -924,7 +844,7 @@ JOIN::optimize()
                            "Impossible HAVING" : "Impossible WHERE";
       tables= 0;
       error= 0;
-      DBUG_RETURN(0);
+      goto setup_subq_exit;
     }
   }
 
@@ -974,7 +894,7 @@ JOIN::optimize()
 	zero_result_cause= "No matching min/max row";
         tables= 0;
 	error=0;
-	DBUG_RETURN(0);
+        goto setup_subq_exit;
       }
       if (res > 1)
       {
@@ -988,7 +908,7 @@ JOIN::optimize()
         zero_result_cause= "No matching min/max row";
         tables= 0;
         error=0;
-        DBUG_RETURN(0);
+        goto setup_subq_exit;
       }
       DBUG_PRINT("info",("Select tables optimized away"));
       zero_result_cause= "Select tables optimized away";
@@ -1007,19 +927,23 @@ JOIN::optimize()
       if (conds && !(thd->lex->describe & DESCRIBE_EXTENDED))
       {
         COND *table_independent_conds=
-          make_cond_for_table(conds, PSEUDO_TABLE_BITS, 0);
+          make_cond_for_table(conds, PSEUDO_TABLE_BITS, 0, FALSE, FALSE);
         DBUG_EXECUTE("where",
                      print_where(table_independent_conds,
                                  "where after opt_sum_query()",
                                  QT_ORDINARY););
         conds= table_independent_conds;
       }
+      goto setup_subq_exit;
     }
   }
   if (!tables_list)
   {
     DBUG_PRINT("info",("No tables"));
     error= 0;
+    /* Create all structures needed for materialized subquery execution. */
+    if (setup_subquery_materialization())
+      DBUG_RETURN(1);
     DBUG_RETURN(0);
   }
   error= -1;					// Error is sent to client
@@ -1063,7 +987,7 @@ JOIN::optimize()
     zero_result_cause= "no matching row in const table";
     DBUG_PRINT("error",("Error: %s", zero_result_cause));
     error= 0;
-    DBUG_RETURN(0);
+    goto setup_subq_exit;
   }
   if (!(thd->options & OPTION_BIG_SELECTS) &&
       best_read > (double) thd->variables.max_join_size &&
@@ -1110,7 +1034,7 @@ JOIN::optimize()
   }
 
   /*
-    Permorm the the optimization on fields evaluation mentioned above
+    Perform the optimization on fields evaluation mentioned above
     for all on expressions.
   */ 
   for (JOIN_TAB *tab= join_tab + const_tables; tab < join_tab + tables ; tab++)
@@ -1122,8 +1046,43 @@ JOIN::optimize()
                                                          map2table);
       (*tab->on_expr_ref)->update_used_tables();
     }
+
+    
   }
 
+  /*
+    Perform the optimization on fields evaluation mentioned above
+    for all used ref items.
+  */
+  for (JOIN_TAB *tab= join_tab + const_tables; tab < join_tab + tables; tab++)
+  {
+    uint key_copy_index=0;
+    for (uint i=0; i < tab->ref.key_parts; i++)
+    {
+      
+      Item **ref_item_ptr= tab->ref.items+i;
+      Item *ref_item= *ref_item_ptr;
+      if (!ref_item->used_tables() && !(select_options & SELECT_DESCRIBE))
+        continue;
+      COND_EQUAL *equals= tab->first_inner ? tab->first_inner->cond_equal : 
+	                                     cond_equal;
+      ref_item= substitute_for_best_equal_field(ref_item, equals, map2table);
+      ref_item->update_used_tables();
+      if (*ref_item_ptr != ref_item)
+      {
+        *ref_item_ptr= ref_item;
+        Item *item= ref_item->real_item();
+        store_key *key_copy= tab->ref.key_copy[key_copy_index];
+        if (key_copy->type() == store_key::FIELD_STORE_KEY)
+	{
+          store_key_field *field_copy= ((store_key_field *)key_copy);
+          field_copy->change_source_field((Item_field *) item);
+        }
+      }
+      key_copy_index++;
+    }
+  }   
+
   if (conds && const_table_map != found_const_table_map &&
       (select_options & SELECT_DESCRIBE))
   {
@@ -1134,7 +1093,7 @@ JOIN::optimize()
   {
     zero_result_cause=
       "Impossible WHERE noticed after reading const tables";
-    DBUG_RETURN(0);				// error == 0
+    goto setup_subq_exit;
   }
 
   error= -1;					/* if goto err */
@@ -1355,76 +1314,38 @@ JOIN::optimize()
 	      test(select_options & OPTION_BUFFER_RESULT))) ||
              (rollup.state != ROLLUP::STATE_NONE && select_distinct));
 
-  // No cache for MATCH
-  make_join_readinfo(this,
-		     (select_options & (SELECT_DESCRIBE |
-					SELECT_NO_JOIN_CACHE)) |
-		     (select_lex->ftfunc_list->elements ?
-		      SELECT_NO_JOIN_CACHE : 0));
+  /*
+    If the hint FORCE INDEX FOR ORDER BY/GROUP BY is used for the table
+    whose columns are required to be returned in a sorted order, then
+    the proper value for no_jbuf_after should be yielded by a call to
+    the make_join_orderinfo function.
+    Yet the current implementation of FORCE INDEX hints does not
+    allow us to do it in a clean manner.
+  */
+  no_jbuf_after= 1 ? tables : make_join_orderinfo(this);
+
+  select_opts_for_readinfo=
+    (select_options & (SELECT_DESCRIBE | SELECT_NO_JOIN_CACHE)) |
+    (select_lex->ftfunc_list->elements ?  SELECT_NO_JOIN_CACHE : 0);
+
+  // No cache for MATCH == 'Don't use join buffering when we use MATCH'.
+  if (make_join_readinfo(this, select_opts_for_readinfo, no_jbuf_after))
+    DBUG_RETURN(1);
 
   /* Perform FULLTEXT search before all regular searches */
   if (!(select_options & SELECT_DESCRIBE))
     init_ftfuncs(thd, select_lex, test(order));
 
-  /*
-    is this simple IN subquery?
-  */
-  if (!group_list && !order &&
-      unit->item && unit->item->substype() == Item_subselect::IN_SUBS &&
-      tables == 1 && conds &&
-      !unit->is_union())
-  {
-    if (!having)
-    {
-      Item *where= conds;
-      if (join_tab[0].type == JT_EQ_REF &&
-	  join_tab[0].ref.items[0]->name == in_left_expr_name)
-      {
-        remove_subq_pushed_predicates(&where);
-        save_index_subquery_explain_info(join_tab, where);
-        join_tab[0].type= JT_UNIQUE_SUBQUERY;
-        error= 0;
-        DBUG_RETURN(unit->item->
-                    change_engine(new
-                                  subselect_uniquesubquery_engine(thd,
-                                                                  join_tab,
-                                                                  unit->item,
-                                                                  where)));
-      }
-      else if (join_tab[0].type == JT_REF &&
-	       join_tab[0].ref.items[0]->name == in_left_expr_name)
-      {
-	remove_subq_pushed_predicates(&where);
-        save_index_subquery_explain_info(join_tab, where);
-        join_tab[0].type= JT_INDEX_SUBQUERY;
-        error= 0;
-        DBUG_RETURN(unit->item->
-                    change_engine(new
-                                  subselect_indexsubquery_engine(thd,
-                                                                 join_tab,
-                                                                 unit->item,
-                                                                 where,
-                                                                 NULL,
-                                                                 0)));
-      }
-    } else if (join_tab[0].type == JT_REF_OR_NULL &&
-	       join_tab[0].ref.items[0]->name == in_left_expr_name &&
-               having->name == in_having_cond)
-    {
-      join_tab[0].type= JT_INDEX_SUBQUERY;
-      error= 0;
-      conds= remove_additional_cond(conds);
-      save_index_subquery_explain_info(join_tab, conds);
-      DBUG_RETURN(unit->item->
-		  change_engine(new subselect_indexsubquery_engine(thd,
-								   join_tab,
-								   unit->item,
-								   conds,
-                                                                   having,
-								   1)));
-    }
+  /* Create all structures needed for materialized subquery execution. */
+  if (setup_subquery_materialization())
+    DBUG_RETURN(1);
+  
+  int res;
+  if ((res= rewrite_to_index_subquery_engine(this)) != -1)
+    DBUG_RETURN(res);
+  if (setup_subquery_caches())
+    DBUG_RETURN(-1);
 
-  }
   /*
     Need to tell handlers that to play it safe, it should fetch all
     columns of the primary key of the tables: this is because MySQL may
@@ -1488,7 +1409,7 @@ JOIN::optimize()
       for (ORDER *tmp_order= order; tmp_order ; tmp_order=tmp_order->next)
       {
         Item *item= *tmp_order->item;
-        if (item->walk(&Item::is_expensive_processor, 0, (uchar*)0))
+        if (item->is_expensive())
         {
           /* Force tmp table without sort */
           need_tmp=1; simple_order=simple_group=0;
@@ -1640,6 +1561,109 @@ JOIN::optimize()
 
   error= 0;
   DBUG_RETURN(0);
+
+setup_subq_exit:
+  /*
+    Even with zero matching rows, subqueries in the HAVING clause may
+    need to be evaluated if there are aggregate functions in the
+    query. If we have planned to materialize the subquery, we need to
+    set it up properly before prematurely leaving optimize().
+  */
+  if (setup_subquery_materialization())
+    DBUG_RETURN(1);
+  error= 0;
+  DBUG_RETURN(0);
+}
+
+
+/**
+  Setup expression caches for subqueries that need them
+
+  @details
+  The function wraps correlated subquery expressions that return one value
+  into objects of the class Item_cache_wrapper setting up an expression
+  cache for each of them. The result values of the subqueries are to be
+  cached together with the corresponding sets of the parameters - outer
+  references of the subqueries.
+
+  @retval FALSE OK
+  @retval TRUE  Error: one of the transform() calls returned NULL
+*/
+
+bool JOIN::setup_subquery_caches()
+{
+  DBUG_ENTER("JOIN::setup_subquery_caches");
+  /*
+    We have to check all these conditions together because items created in
+    one of these clauses can be moved to another one by the optimizer.
+  */
+  if (select_lex->expr_cache_may_be_used[IN_WHERE] ||
+      select_lex->expr_cache_may_be_used[IN_HAVING] ||
+      select_lex->expr_cache_may_be_used[IN_ON] ||
+      select_lex->expr_cache_may_be_used[NO_MATTER])
+  {
+    if (conds &&
+        !(conds= conds->transform(&Item::expr_cache_insert_transformer,
+                                  (uchar*) thd)))
+      DBUG_RETURN(TRUE);
+    for (JOIN_TAB *tab= join_tab + const_tables;
+         tab < join_tab + tables; tab++)
+    {
+      if (tab->select_cond &&
+          !(tab->select_cond=
+            tab->select_cond->transform(&Item::expr_cache_insert_transformer,
+                                        (uchar*) thd)))
+        DBUG_RETURN(TRUE);
+      if (tab->cache_select && tab->cache_select->cond &&
+          !(tab->cache_select->cond=
+            tab->cache_select->
+            cond->transform(&Item::expr_cache_insert_transformer,
+                            (uchar*) thd)))
+        DBUG_RETURN(TRUE);
+    }
+    if (having &&
+        !(having= having->transform(&Item::expr_cache_insert_transformer,
+                                    (uchar*) thd)))
+      DBUG_RETURN(TRUE);
+    if (tmp_having)
+    {
+      DBUG_ASSERT(having == NULL);
+      if (!(tmp_having=
+            tmp_having->transform(&Item::expr_cache_insert_transformer,
+                                  (uchar*) thd)))
+        DBUG_RETURN(TRUE);
+    }
+  }
+  if (select_lex->expr_cache_may_be_used[SELECT_LIST] ||
+      select_lex->expr_cache_may_be_used[IN_GROUP_BY] ||
+      select_lex->expr_cache_may_be_used[NO_MATTER])
+  {
+    List_iterator<Item> li(all_fields);
+    Item *item;
+    while ((item= li++))
+    {
+      Item *new_item=
+        item->transform(&Item::expr_cache_insert_transformer, (uchar*) thd);
+      if (!new_item)
+        DBUG_RETURN(TRUE);
+      if (new_item != item)
+        thd->change_item_tree(li.ref(), new_item);
+    }
+    for (ORDER *group= group_list; group ; group= group->next)
+      if (!(*group->item=
+            (*group->item)->transform(&Item::expr_cache_insert_transformer,
+                                      (uchar*) thd)))
+        DBUG_RETURN(TRUE);
+  }
+  if (select_lex->expr_cache_may_be_used[NO_MATTER])
+  {
+    for (ORDER *ord= order; ord; ord= ord->next)
+      if (!(*ord->item=
+            (*ord->item)->transform(&Item::expr_cache_insert_transformer,
+                                    (uchar*) thd)))
+        DBUG_RETURN(TRUE);
+  }
+  DBUG_RETURN(FALSE);
 }
 
 
@@ -1652,6 +1676,62 @@ void JOIN::restore_tmp()
 }
 
 
+/*
+  Shrink join buffers used for preceding tables to reduce the occupied space
+
+  SYNOPSIS
+    shrink_join_buffers()
+      jt           table up to which the buffers are to be shrunk
+      curr_space   the size of the space used by the buffers for tables 1..jt
+      needed_space the size of the space that has to be used by these buffers
+
+  DESCRIPTION
+    The function makes an attempt to shrink all join buffers used for the
+    tables starting from the first non-constant one up to jt to reduce the
+    total size of the space occupied by these buffers from curr_space to
+    needed_space. It gives up when the space still available is less than
+    the minimal size of the next buffer. The function assumes that the
+    buffer for the table jt has not been allocated yet.
+
+  RETURN
+    FALSE     if all buffers have been successfully shrunk
+    TRUE      otherwise
+*/
+  
+bool JOIN::shrink_join_buffers(JOIN_TAB *jt, 
+                               ulonglong curr_space,
+                               ulonglong needed_space)
+{
+  JOIN_CACHE *cache;
+  for (JOIN_TAB *tab= join_tab+const_tables; tab < jt; tab++)
+  {
+    cache= tab->cache;
+    if (cache)
+    { 
+      size_t buff_size;
+      if (needed_space < cache->get_min_join_buffer_size())
+        return TRUE;
+      if (cache->shrink_join_buffer_in_ratio(curr_space, needed_space))
+      { 
+        revise_cache_usage(tab);
+        return TRUE;
+      }
+      buff_size= cache->get_join_buffer_size();
+      curr_space-= buff_size;
+      needed_space-= buff_size;
+    }
+  }
+
+  cache= jt->cache;
+  DBUG_ASSERT(cache);
+  if (needed_space < cache->get_min_join_buffer_size())
+    return TRUE;
+  cache->set_join_buffer_size((size_t)needed_space);
+  
+  return FALSE;
+}
+
+
 int
 JOIN::reinit()
 {
@@ -1677,6 +1757,7 @@ JOIN::reinit()
     free_io_cache(exec_tmp_table2);
     filesort_free_buffers(exec_tmp_table2,0);
   }
+  clear_sj_tmp_tables(this);
   if (items0)
     set_items_ref_array(items0);
 
@@ -1915,11 +1996,14 @@ JOIN::exec()
     curr_tmp_table= exec_tmp_table1;
 
     /* Copy data to the temporary table */
-    thd_proc_info(thd, "Copying to tmp table");
+    thd_proc_info(thd, copy_to_tmp_table);
     DBUG_PRINT("info", ("%s", thd->proc_info));
     if (!curr_join->sort_and_group &&
         curr_join->const_tables != curr_join->tables)
-      curr_join->join_tab[curr_join->const_tables].sorted= 0;
+    {
+      JOIN_TAB *first_tab= curr_join->join_tab + curr_join->const_tables;
+      first_tab->sorted= test(first_tab->loosescan_match_tab);
+    }
     if ((tmp_error= do_select(curr_join, (List<Item> *) 0, curr_tmp_table, 0)))
     {
       error= tmp_error;
@@ -2087,7 +2171,10 @@ JOIN::exec()
       curr_join->group_list= 0;
       if (!curr_join->sort_and_group &&
           curr_join->const_tables != curr_join->tables)
-        curr_join->join_tab[curr_join->const_tables].sorted= 0;
+      {
+        JOIN_TAB *first_tab= curr_join->join_tab + curr_join->const_tables;
+        first_tab->sorted= test(first_tab->loosescan_match_tab);
+      }
       if (setup_sum_funcs(curr_join->thd, curr_join->sum_funcs) ||
 	  (tmp_error= do_select(curr_join, (List<Item> *) 0, curr_tmp_table,
 				0)))
@@ -2214,7 +2301,7 @@ JOIN::exec()
 
       Item* sort_table_cond= make_cond_for_table(curr_join->tmp_having,
 						 used_tables,
-						 (table_map) 0);
+						 (table_map)0, FALSE, FALSE);
       if (sort_table_cond)
       {
 	if (!curr_table->select)
@@ -2230,14 +2317,22 @@ JOIN::exec()
 	    DBUG_VOID_RETURN;
 	  curr_table->select->cond->fix_fields(thd, 0);
 	}
-	curr_table->select_cond= curr_table->select->cond;
+        if (curr_table->pre_idx_push_select_cond)
+	{
+          if (!(curr_table->pre_idx_push_select_cond= 
+                new Item_cond_and(curr_table->pre_idx_push_select_cond,
+                                  sort_table_cond)))
+            DBUG_VOID_RETURN;            
+          curr_table->pre_idx_push_select_cond->fix_fields(thd, 0);
+        }
+        curr_table->set_select_cond(curr_table->select->cond, __LINE__);
 	curr_table->select_cond->top_level_item();
 	DBUG_EXECUTE("where",print_where(curr_table->select->cond,
 					 "select and having",
                                          QT_ORDINARY););
 	curr_join->tmp_having= make_cond_for_table(curr_join->tmp_having,
 						   ~ (table_map) 0,
-						   ~used_tables);
+						   ~used_tables, FALSE, FALSE);
 	DBUG_EXECUTE("where",print_where(curr_join->tmp_having,
                                          "having after sort",
                                          QT_ORDINARY););
@@ -2395,6 +2490,7 @@ JOIN::destroy()
     DBUG_RETURN(tmp_join->destroy());
   }
   cond_equal= 0;
+  having_equal= 0;
 
   cleanup(1);
  /* Cleanup items referencing temporary table columns */
@@ -2405,6 +2501,7 @@ JOIN::destroy()
   if (exec_tmp_table2)
     free_tmp_table(thd, exec_tmp_table2);
   delete select;
+  destroy_sj_tmp_tables(this);
   delete_dynamic(&keyuse);
   delete procedure;
   DBUG_RETURN(error);
@@ -2582,6 +2679,52 @@ err:
   DBUG_RETURN(join->error);
 }
 
+
+/**
+  Set up for execution all IN subqueries of a query for which the optimizer
+  chose materialization (hash semi-join).
+
+  @details Iterate over all subqueries of the query, and if they are under an
+  IN predicate, and the optimizer chose to compute it via materialization:
+  - initialize all data structures needed for the materialized execution
+    of the IN predicate by calling Item_in_subselect::setup_engine(),
+  - if this fails, return an error. NOTE(review): any fallback to the
+    IN=>EXISTS transformation is not visible here - confirm in setup_engine().
+
+  This method is part of the "code generation" query processing phase.
+
+  This phase must be called after substitute_for_best_equal_field() because
+  that function may replace items with other items from a multiple equality,
+  and we need to reference the correct items in the index access method of the
+  IN predicate.
+
+  @return Operation status
+  @retval FALSE     success.
+  @retval TRUE      error occurred.
+*/
+
+bool JOIN::setup_subquery_materialization()
+{
+  for (SELECT_LEX_UNIT *un= select_lex->first_inner_unit(); un;
+       un= un->next_unit())
+  {
+    for (SELECT_LEX *sl= un->first_select(); sl; sl= sl->next_select())
+    {
+      Item_subselect *subquery_predicate= sl->master_unit()->item;
+      if (subquery_predicate &&
+          subquery_predicate->substype() == Item_subselect::IN_SUBS)
+      {
+        Item_in_subselect *in_subs= (Item_in_subselect*) subquery_predicate;
+        if (in_subs->exec_method == Item_in_subselect::MATERIALIZATION &&
+            in_subs->setup_engine())
+          return TRUE;
+      }
+    }
+  }
+  return FALSE;
+}
+
+
 /*****************************************************************************
   Create JOIN_TABS, make a guess about the table types,
   Approximate how many records will be used in each table
@@ -2603,7 +2746,7 @@ static ha_rows get_quick_record_count(THD *thd, SQL_SELECT *select,
     select->head=table;
     table->reginfo.impossible_range=0;
     if ((error= select->test_quick_select(thd, *(key_map *)keys,(table_map) 0,
-                                          limit, 0)) == 1)
+                                          limit, 0, FALSE)) == 1)
       DBUG_RETURN(select->quick->records);
     if (error == -1)
     {
@@ -2629,6 +2772,7 @@ typedef struct st_sargable_param
   uint num_values;           /* number of values in the above array      */
 } SARGABLE_PARAM;  
 
+
 /**
   Calculate the best possible join and initialize the join structure.
 
@@ -2689,6 +2833,7 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
       goto error;
     }
     table->quick_keys.clear_all();
+    table->intersect_keys.clear_all();
     table->reginfo.join_tab=s;
     table->reginfo.not_exists_optimize=0;
     bzero((char*) table->const_key_parts, sizeof(key_part_map)*table->s->keys);
@@ -2723,7 +2868,7 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
         s->embedding_map|= embedding->nested_join->nj_map;
       continue;
     }
-    if (embedding)
+    if (embedding && !(embedding->sj_on_expr && ! embedding->embedding))
     {
       /* s belongs to a nested join, maybe to several embedded joins */
       s->embedding_map= 0;
@@ -2875,7 +3020,8 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
 	*/              
         while (keyuse->table == table)
         {
-          if (!(keyuse->val->used_tables() & ~join->const_table_map) &&
+          if (!keyuse->is_for_hash_join() && 
+              !(keyuse->val->used_tables() & ~join->const_table_map) &&
               keyuse->val->is_null() && keyuse->null_rejecting)
           {
             s->type= JT_CONST;
@@ -2918,9 +3064,14 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
 	s->type= JT_REF;
 	while (keyuse->table == table)
 	{
+          if (keyuse->is_for_hash_join())
+	  {
+            keyuse++;
+            continue;
+          }
 	  start_keyuse=keyuse;
 	  key=keyuse->key;
-	  s->keys.set_bit(key);               // QQ: remove this ?
+	  s->keys.set_bit(key);               // TODO: remove this ?
 
 	  refs=0;
           const_ref.clear_all();
@@ -2938,9 +3089,16 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
 	    keyuse++;
 	  } while (keyuse->table == table && keyuse->key == key);
 
+          TABLE_LIST *embedding= table->pos_in_table_list->embedding;
+          /*
+            TODO (low priority): currently we ignore the const tables that
+            are within a semi-join nest which is within an outer join nest.
+            The effect of this is that we don't do const substitution for
+            such tables.
+          */
 	  if (eq_part.is_prefix(table->key_info[key].key_parts) &&
               !table->fulltext_searched && 
-              !table->pos_in_table_list->embedding)
+              (!embedding || (embedding->sj_on_expr && !embedding->embedding)))
 	  {
             if (table->key_info[key].flags & HA_NOSAME)
             {
@@ -2974,7 +3132,7 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
       }
     }
   } while (join->const_table_map & found_ref && ref_changed);
-
+ 
   /* 
     Update info on indexes that can be used for search lookups as
     reading const tables may has added new sargable predicates. 
@@ -3024,9 +3182,16 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
       all select distinct fields participate in one index.
     */
     add_group_and_distinct_keys(join, s);
-
-    if (!s->const_keys.is_clear_all() &&
-        !s->table->pos_in_table_list->embedding)
+    
+    /*
+      Perform range analysis if there are keys it could use (1). 
+      Don't do range analysis if we're on the inner side of an outer join (2).
+      Do range analysis if we're on the inner side of a semi-join (3).
+    */
+    if (!s->const_keys.is_clear_all() &&                        // (1)
+        (!s->table->pos_in_table_list->embedding ||             // (2)
+         (s->table->pos_in_table_list->embedding &&             // (3)
+          s->table->pos_in_table_list->embedding->sj_on_expr))) // (3)
     {
       ha_rows records;
       SQL_SELECT *select;
@@ -3070,16 +3235,24 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
     }
   }
 
+  if (pull_out_semijoin_tables(join))
+    DBUG_RETURN(TRUE);
+
   join->join_tab=stat;
   join->map2table=stat_ref;
   join->table= join->all_tables=table_vector;
   join->const_tables=const_count;
   join->found_const_table_map=found_const_table_map;
 
+  if (join->const_tables != join->tables)
+    optimize_keyuse(join, keyuse_array);
+   
+  if (optimize_semijoin_nests(join, all_table_map))
+    DBUG_RETURN(TRUE); /* purecov: inspected */
+
   /* Find an optimal join order of the non-constant tables. */
   if (join->const_tables != join->tables)
   {
-    optimize_keyuse(join, keyuse_array);
     if (choose_plan(join, all_table_map & ~join->const_table_map))
       goto error;
   }
@@ -3126,12 +3299,9 @@ typedef struct key_field_t {
   */
   bool          null_rejecting; 
   bool         *cond_guard; /* See KEYUSE::cond_guard */
+  uint          sj_pred_no; /* See KEYUSE::sj_pred_no */
 } KEY_FIELD;
 
-/* Values in optimize */
-#define KEY_OPTIMIZE_EXISTS		1
-#define KEY_OPTIMIZE_REF_OR_NULL	2
-
 /**
   Merge new key definitions to old ones, remove those not used in both.
 
@@ -3295,6 +3465,52 @@ merge_key_fields(KEY_FIELD *start,KEY_FIELD *new_fields,KEY_FIELD *end,
 }
 
 
+/*
+  Given a field, return its index in semi-join's select list, or UINT_MAX
+
+  DESCRIPTION
+    Given a field, we find its table and check whether the table's immediate
+    embedding nest is a semi-join nest (sj_on_expr is set). If so, we search
+    the first left_expr->cols() items of the subselect's select list for this
+    field and return its index. The value is used by LooseScan strategy.
+*/
+
+static uint get_semi_join_select_list_index(Field *field)
+{
+  uint res= UINT_MAX;
+  TABLE_LIST *emb_sj_nest;
+  if ((emb_sj_nest= field->table->pos_in_table_list->embedding) &&
+      emb_sj_nest->sj_on_expr)
+  {
+    Item_in_subselect *subq_pred= emb_sj_nest->sj_subq_pred;
+    st_select_lex *subq_lex= subq_pred->unit->first_select();
+    if (subq_pred->left_expr->cols() == 1)
+    {
+      Item *sel_item= subq_lex->ref_pointer_array[0];
+      if (sel_item->type() == Item::FIELD_ITEM &&
+          ((Item_field*)sel_item)->field->eq(field))
+      {
+        res= 0;
+      }
+    }
+    else
+    {
+      for (uint i= 0; i < subq_pred->left_expr->cols(); i++)
+      {
+        Item *sel_item= subq_lex->ref_pointer_array[i];
+        if (sel_item->type() == Item::FIELD_ITEM &&
+            ((Item_field*)sel_item)->field->eq(field))
+        {
+          res= i;
+          break;
+        }
+      }
+    }
+  }
+  return res;
+}
+
+
 /**
   Add a possible key to array of possible keys if it's usable as a key
 
@@ -3316,21 +3532,27 @@ merge_key_fields(KEY_FIELD *start,KEY_FIELD *new_fields,KEY_FIELD *end,
 */
 
 static void
-add_key_field(KEY_FIELD **key_fields,uint and_level, Item_func *cond,
+add_key_field(JOIN *join,
+              KEY_FIELD **key_fields,uint and_level, Item_func *cond,
               Field *field, bool eq_func, Item **value, uint num_values,
               table_map usable_tables, SARGABLE_PARAM **sargables)
 {
-  uint exists_optimize= 0;
-  if (!(field->flags & PART_KEY_FLAG))
+  uint optimize= 0;  
+  if (eq_func && join->is_allowed_hash_join_access() &&
+      field->hash_join_is_possible())
+  {
+    optimize= KEY_OPTIMIZE_EQ;
+  }   
+  else if (!(field->flags & PART_KEY_FLAG))
   {
     // Don't remove column IS NULL on a LEFT JOIN table
     if (!eq_func || (*value)->type() != Item::NULL_ITEM ||
         !field->table->maybe_null || field->null_ptr)
       return;					// Not a key. Skip it
-    exists_optimize= KEY_OPTIMIZE_EXISTS;
+    optimize= KEY_OPTIMIZE_EXISTS;
     DBUG_ASSERT(num_values == 1);
   }
-  else
+  if (optimize != KEY_OPTIMIZE_EXISTS)
   {
     table_map used_tables=0;
     bool optimizable=0;
@@ -3347,7 +3569,7 @@ add_key_field(KEY_FIELD **key_fields,uint and_level, Item_func *cond,
       if (!eq_func || (*value)->type() != Item::NULL_ITEM ||
           !field->table->maybe_null || field->null_ptr)
 	return;					// Can't use left join optimize
-      exists_optimize= KEY_OPTIMIZE_EXISTS;
+      optimize= KEY_OPTIMIZE_EXISTS;
     }
     else
     {
@@ -3367,7 +3589,8 @@ add_key_field(KEY_FIELD **key_fields,uint and_level, Item_func *cond,
          Field BETWEEN ...
          Field IN ...
       */
-      stat[0].key_dependent|=used_tables;
+      if (field->flags & PART_KEY_FLAG)
+        stat[0].key_dependent|=used_tables;
 
       bool is_const=1;
       for (uint i=0; i<num_values; i++)
@@ -3445,8 +3668,8 @@ add_key_field(KEY_FIELD **key_fields,uint and_level, Item_func *cond,
   (*key_fields)->field=		field;
   (*key_fields)->eq_func=	eq_func;
   (*key_fields)->val=		*value;
-  (*key_fields)->level=		and_level;
-  (*key_fields)->optimize=	exists_optimize;
+  (*key_fields)->level=         and_level;
+  (*key_fields)->optimize=      optimize;
   /*
     If the condition has form "tbl.keypart = othertbl.field" and 
     othertbl.field can be NULL, there will be no matches if othertbl.field 
@@ -3454,11 +3677,19 @@ add_key_field(KEY_FIELD **key_fields,uint and_level, Item_func *cond,
     We use null_rejecting in add_not_null_conds() to add
     'othertbl.field IS NOT NULL' to tab->select_cond.
   */
-  (*key_fields)->null_rejecting= ((cond->functype() == Item_func::EQ_FUNC ||
-                                   cond->functype() == Item_func::MULT_EQUAL_FUNC) &&
-                                  ((*value)->type() == Item::FIELD_ITEM) &&
-                                  ((Item_field*)*value)->field->maybe_null());
+  {
+    Item *real= (*value)->real_item();
+    if (((cond->functype() == Item_func::EQ_FUNC) ||
+         (cond->functype() == Item_func::MULT_EQUAL_FUNC)) &&
+        (real->type() == Item::FIELD_ITEM) &&
+        ((Item_field*)real)->field->maybe_null())
+      (*key_fields)->null_rejecting= true;
+    else
+      (*key_fields)->null_rejecting= false;
+  }
   (*key_fields)->cond_guard= NULL;
+
+  (*key_fields)->sj_pred_no= get_semi_join_select_list_index(field);
   (*key_fields)++;
 }
 
@@ -3485,14 +3716,14 @@ add_key_field(KEY_FIELD **key_fields,uint and_level, Item_func *cond,
 */
 
 static void
-add_key_equal_fields(KEY_FIELD **key_fields, uint and_level,
+add_key_equal_fields(JOIN *join, KEY_FIELD **key_fields, uint and_level,
                      Item_func *cond, Item_field *field_item,
                      bool eq_func, Item **val,
                      uint num_values, table_map usable_tables,
                      SARGABLE_PARAM **sargables)
 {
   Field *field= field_item->field;
-  add_key_field(key_fields, and_level, cond, field,
+  add_key_field(join, key_fields, and_level, cond, field,
                 eq_func, val, num_values, usable_tables, sargables);
   Item_equal *item_equal= field_item->item_equal;
   if (item_equal)
@@ -3507,7 +3738,7 @@ add_key_equal_fields(KEY_FIELD **key_fields, uint and_level,
     {
       if (!field->eq(item->field))
       {
-        add_key_field(key_fields, and_level, cond, item->field,
+        add_key_field(join, key_fields, and_level, cond, item->field,
                       eq_func, val, num_values, usable_tables,
                       sargables);
       }
@@ -3532,8 +3763,8 @@ static bool
 is_local_field (Item *field)
 {
   return field->real_item()->type() == Item::FIELD_ITEM
-    && !(field->used_tables() & OUTER_REF_TABLE_BIT)
-    && !((Item_field *)field->real_item())->depended_from;
+     && !(field->used_tables() & OUTER_REF_TABLE_BIT)
+     && !((Item_field *)field->real_item())->depended_from;
 }
 
 
@@ -3640,7 +3871,7 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
         values--;
       DBUG_ASSERT(cond_func->functype() != Item_func::IN_FUNC ||
                   cond_func->argument_count() != 2);
-      add_key_equal_fields(key_fields, *and_level, cond_func,
+      add_key_equal_fields(join, key_fields, *and_level, cond_func,
                            (Item_field*) (cond_func->key_item()->real_item()),
                            0, values, 
                            cond_func->argument_count()-1,
@@ -3655,7 +3886,7 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
         if (is_local_field (cond_func->arguments()[i]))
         {
           field_item= (Item_field *) (cond_func->arguments()[i]->real_item());
-          add_key_equal_fields(key_fields, *and_level, cond_func,
+          add_key_equal_fields(join, key_fields, *and_level, cond_func,
                                field_item, 0, values, 1, usable_tables, 
                                sargables);
         }
@@ -3670,7 +3901,7 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
 
     if (is_local_field (cond_func->arguments()[0]))
     {
-      add_key_equal_fields(key_fields, *and_level, cond_func,
+      add_key_equal_fields(join, key_fields, *and_level, cond_func,
 	                (Item_field*) (cond_func->arguments()[0])->real_item(),
 		           equal_func,
                            cond_func->arguments()+1, 1, usable_tables,
@@ -3679,7 +3910,7 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
     if (is_local_field (cond_func->arguments()[1]) &&
 	cond_func->functype() != Item_func::LIKE_FUNC)
     {
-      add_key_equal_fields(key_fields, *and_level, cond_func, 
+      add_key_equal_fields(join, key_fields, *and_level, cond_func, 
                        (Item_field*) (cond_func->arguments()[1])->real_item(),
 		           equal_func,
                            cond_func->arguments(),1,usable_tables,
@@ -3695,7 +3926,7 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
       Item *tmp=new Item_null;
       if (unlikely(!tmp))                       // Should never be true
 	return;
-      add_key_equal_fields(key_fields, *and_level, cond_func,
+      add_key_equal_fields(join, key_fields, *and_level, cond_func,
 		    (Item_field*) (cond_func->arguments()[0])->real_item(),
 		    cond_func->functype() == Item_func::ISNULL_FUNC,
 			   &tmp, 1, usable_tables, sargables);
@@ -3715,7 +3946,7 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
       */   
       while ((item= it++))
       {
-        add_key_field(key_fields, *and_level, cond_func, item->field,
+        add_key_field(join, key_fields, *and_level, cond_func, item->field,
                       TRUE, &const_item, 1, usable_tables, sargables);
       }
     }
@@ -3736,7 +3967,7 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
           if (!field->eq(item->field))
           {
             Item *tmp_item= item;
-            add_key_field(key_fields, *and_level, cond_func, field,
+            add_key_field(join, key_fields, *and_level, cond_func, field,
                           TRUE, &tmp_item, 1, usable_tables,
                           sargables);
           }
@@ -3757,6 +3988,59 @@ max_part_bit(key_part_map bits)
   return found;
 }
 
+
+/**
+  Add a new keyuse to the specified array of KEYUSE objects
+
+  @param[in,out]  keyuse_array  array of keyuses to be extended
+  @param[in]      key_field     info on the key use occurrence
+  @param[in]      key           key number for the keyuse to be added
+  @param[in]      part          key part for the keyuse to be added
+
+  @note
+  The function builds a new KEYUSE object for a key use utilizing the info
+  on the left and right parts of the given key use extracted from the
+  structure key_field, the key number and key part for this key use.
+  The built object is added to the dynamic array keyuse_array.
+
+  @retval         0             the built object is successfully added
+  @retval         1             otherwise
+*/
+
+static bool
+add_keyuse(DYNAMIC_ARRAY *keyuse_array, KEY_FIELD *key_field,
+          uint key, uint part)
+{
+  KEYUSE keyuse;
+  Field *field= key_field->field;
+
+  keyuse.table= field->table;
+  keyuse.val= key_field->val;
+  keyuse.key= key;
+  if (!is_hash_join_key_no(key))
+  {
+    keyuse.keypart=part;
+    keyuse.keypart_map= (key_part_map) 1 << part;
+  }
+  else
+  {
+    /*
+      If this is a key use for hash join then 'part' is ignored: keypart
+      of the added element holds the field number and the map is empty.
+    */
+    keyuse.keypart= field->field_index;
+    keyuse.keypart_map= (key_part_map) 0;
+  }
+  keyuse.used_tables= key_field->val->used_tables();
+  keyuse.optimize= key_field->optimize & KEY_OPTIMIZE_REF_OR_NULL;
+  keyuse.ref_table_rows= 0;
+  keyuse.null_rejecting= key_field->null_rejecting;
+  keyuse.cond_guard= key_field->cond_guard;
+  keyuse.sj_pred_no= key_field->sj_pred_no;
+  return (insert_dynamic(keyuse_array,(uchar*) &keyuse));
+}
+
+
 /*
   Add all keys with uses 'field' for some keypart
   If field->and_level != and_level then only mark key_part as const_part
@@ -3767,11 +4051,10 @@ max_part_bit(key_part_map bits)
 */
 
 static bool
-add_key_part(DYNAMIC_ARRAY *keyuse_array,KEY_FIELD *key_field)
+add_key_part(DYNAMIC_ARRAY *keyuse_array, KEY_FIELD *key_field)
 {
   Field *field=key_field->field;
   TABLE *form= field->table;
-  KEYUSE keyuse;
 
   if (key_field->eq_func && !(key_field->optimize & KEY_OPTIMIZE_EXISTS))
   {
@@ -3787,20 +4070,24 @@ add_key_part(DYNAMIC_ARRAY *keyuse_array,KEY_FIELD *key_field)
       {
 	if (field->eq(form->key_info[key].key_part[part].field))
 	{
-	  keyuse.table= field->table;
-	  keyuse.val =  key_field->val;
-	  keyuse.key =  key;
-	  keyuse.keypart=part;
-	  keyuse.keypart_map= (key_part_map) 1 << part;
-	  keyuse.used_tables=key_field->val->used_tables();
-	  keyuse.optimize= key_field->optimize & KEY_OPTIMIZE_REF_OR_NULL;
-          keyuse.null_rejecting= key_field->null_rejecting;
-          keyuse.cond_guard= key_field->cond_guard;
-	  if (insert_dynamic(keyuse_array,(uchar*) &keyuse))
+          if (add_keyuse(keyuse_array, key_field, key, part))
             return TRUE;
 	}
       }
     }
+    if (field->hash_join_is_possible() &&
+        (key_field->optimize & KEY_OPTIMIZE_EQ) &&
+        key_field->val->used_tables())
+    {
+      /* 
+        If a key use is extracted from an equi-join predicate then it is
+        added not only as a key use for every index whose component can
+        be evaluated utilizing this key use, but also as a key use for
+        hash join. Such key uses are marked with a special key number. 
+      */    
+      if (add_keyuse(keyuse_array, key_field, get_hash_join_key_no(), 0))
+        return TRUE;
+    }
   }
   return FALSE;
 }
@@ -3868,6 +4155,7 @@ add_ft_keys(DYNAMIC_ARRAY *keyuse_array,
   keyuse.used_tables=cond_func->key_item()->used_tables();
   keyuse.optimize= 0;
   keyuse.keypart_map= 0;
+  keyuse.sj_pred_no= UINT_MAX;
   return insert_dynamic(keyuse_array,(uchar*) &keyuse);
 }
 
@@ -3932,20 +4220,34 @@ static void add_key_fields_for_nj(JOIN *join, TABLE_LIST *nested_join_table,
                                   SARGABLE_PARAM **sargables)
 {
   List_iterator<TABLE_LIST> li(nested_join_table->nested_join->join_list);
+  List_iterator<TABLE_LIST> li2(nested_join_table->nested_join->join_list);
+  bool have_another = FALSE;
   table_map tables= 0;
   TABLE_LIST *table;
   DBUG_ASSERT(nested_join_table->nested_join);
 
-  while ((table= li++))
+  while ((table= li++) || (have_another && (li=li2, have_another=FALSE,
+                                            (table= li++))))
   {
     if (table->nested_join)
-      add_key_fields_for_nj(join, table, end, and_level, sargables);
+    {
+      if (!table->on_expr)
+      {
+        /* It's a semi-join nest. Walk into it as if it wasn't a nest */
+        have_another= TRUE;
+        li2= li;
+        li= List_iterator<TABLE_LIST>(table->nested_join->join_list); 
+      }
+      else
+        add_key_fields_for_nj(join, table, end, and_level, sargables);
+    }
     else
       if (!table->on_expr)
         tables |= table->table->map;
   }
-  add_key_fields(join, end, and_level, nested_join_table->on_expr, tables,
-                 sargables);
+  if (nested_join_table->on_expr)
+    add_key_fields(join, end, and_level, nested_join_table->on_expr, tables,
+                   sargables);
 }
 
 
@@ -4098,20 +4400,26 @@ update_ref_and_keys(THD *thd, DYNAMIC_ARRAY *keyuse,JOIN_TAB *join_tab,
     found_eq_constant=0;
     for (i=0 ; i < keyuse->elements-1 ; i++,use++)
     {
-      if (!use->used_tables && use->optimize != KEY_OPTIMIZE_REF_OR_NULL)
-	use->table->const_key_parts[use->key]|= use->keypart_map;
-      if (use->keypart != FT_KEYPART)
+      if (!use->is_for_hash_join())
       {
-	if (use->key == prev->key && use->table == prev->table)
-	{
-	  if (prev->keypart+1 < use->keypart ||
-	      (prev->keypart == use->keypart && found_eq_constant))
-	    continue;				/* remove */
-	}
-	else if (use->keypart != 0)		// First found must be 0
-	  continue;
-      }
+        if (!use->used_tables && use->optimize != KEY_OPTIMIZE_REF_OR_NULL)
+	  use->table->const_key_parts[use->key]|= use->keypart_map;
+        if (use->keypart != FT_KEYPART)
+        {
+	  if (use->key == prev->key && use->table == prev->table)
+	  {
+	    if (prev->keypart+1 < use->keypart ||
+	        (prev->keypart == use->keypart && found_eq_constant))
+	      continue;				/* remove */
+	  }
+	  else if (use->keypart != 0)		// First found must be 0
+	    continue;
+        }
 
+        prev= use;
+        found_eq_constant= !use->used_tables;
+        use->table->reginfo.join_tab->checked_keys.set_bit(use->key);
+      }
       /*
         Old gcc used a memcpy(), which is undefined if save_pos==use:
         http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19410
@@ -4120,18 +4428,16 @@ update_ref_and_keys(THD *thd, DYNAMIC_ARRAY *keyuse,JOIN_TAB *join_tab,
       */
       if (save_pos != use)
         *save_pos= *use;
-      prev=use;
-      found_eq_constant= !use->used_tables;
       /* Save ptr to first use */
       if (!use->table->reginfo.join_tab->keyuse)
-	use->table->reginfo.join_tab->keyuse=save_pos;
-      use->table->reginfo.join_tab->checked_keys.set_bit(use->key);
+	use->table->reginfo.join_tab->keyuse= save_pos;
       save_pos++;
     }
     i=(uint) (save_pos-(KEYUSE*) keyuse->buffer);
     VOID(set_dynamic(keyuse,(uchar*) &key_end,i));
     keyuse->elements=i;
   }
+  DBUG_EXECUTE("opt", print_keyuse_array(keyuse););
   return FALSE;
 }
 
@@ -4252,6 +4558,10 @@ void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key)
   join->positions[idx].records_read=1.0;	/* This is a const table */
   join->positions[idx].ref_depend_map= 0;
 
+  join->positions[idx].loosescan_key= MAX_KEY; /* Not a LooseScan */
+  join->positions[idx].sj_strategy= SJ_OPT_NONE;
+  join->positions[idx].use_join_buffer= FALSE;
+
   /* Move the const table as down as possible in best_ref */
   JOIN_TAB **pos=join->best_ref+idx+1;
   JOIN_TAB *next=join->best_ref[idx];
@@ -4265,6 +4575,35 @@ void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key)
 }
 
 
+/* Estimate the number of matching candidate rows in the joined table 's' */
+
+inline
+ha_rows matching_candidates_in_table(JOIN_TAB *s, bool with_found_constraint)
+{
+  ha_rows records= s->found_records;
+  /*
+    If there is a filtering condition on the table (i.e. ref analyzer found
+    at least one "table.keyXpartY= exprZ", where exprZ refers only to tables
+    preceding this table in the join order we're now considering), then
+    assume that 25% of the rows will be filtered out by this condition.
+
+    This heuristic is supposed to force tables used in exprZ to be before
+    this table in join order.
+  */
+  if (with_found_constraint)
+    records-= records/4;
+
+  /*
+    If quick_condition_rows differs from found_records, use it as the more
+    accurate estimate. It replaces the 25% guess: the two are not combined.
+  */
+  if (s->table->quick_condition_rows != s->found_records)
+    records= s->table->quick_condition_rows;
+
+  return records;
+}
+
+
 /**
   Find the best access path for an extension of a partial execution
   plan and add this path to the plan.
@@ -4282,23 +4621,28 @@ void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key)
   @param thd              thread for the connection that submitted the query
   @param remaining_tables set of tables not included into the partial plan yet
   @param idx              the length of the partial plan
+  @param disable_jbuf     TRUE<=> Don't use join buffering
   @param record_count     estimate for the number of records returned by the
                           partial plan
-  @param read_time        the cost of the partial plan
+  @param pos              OUT Table access plan
+  @param loose_scan_pos   OUT Table plan that uses loosescan, or set cost to 
+                              DBL_MAX if not possible.
 
   @return
     None
 */
 
-static void
+void
 best_access_path(JOIN      *join,
                  JOIN_TAB  *s,
-                 THD       *thd,
                  table_map remaining_tables,
                  uint      idx,
+                 bool      disable_jbuf,
                  double    record_count,
-                 double    read_time)
+                 POSITION *pos,
+                 POSITION *loose_scan_pos)
 {
+  THD *thd= join->thd;
   KEYUSE *best_key=         0;
   uint best_max_key_part=   0;
   my_bool found_constraint= 0;
@@ -4308,12 +4652,22 @@ best_access_path(JOIN      *join,
   table_map best_ref_depends_map= 0;
   double tmp;
   ha_rows rec;
+  bool best_uses_jbuf= FALSE;
+  MY_BITMAP *eq_join_set= &s->table->eq_join_set;
+  KEYUSE *hj_start_key= 0;
+
+  Loose_scan_opt loose_scan_opt;
   DBUG_ENTER("best_access_path");
+  
+  bitmap_clear_all(eq_join_set);
 
+  loose_scan_opt.init(join, s, remaining_tables);
+  
   if (s->keyuse)
   {                                            /* Use key if possible */
+    KEYUSE *keyuse;
+    KEYUSE *start_key=0;
     TABLE *table= s->table;
-    KEYUSE *keyuse,*start_key=0;
     double best_records= DBL_MAX;
     uint max_key_part=0;
 
@@ -4321,19 +4675,41 @@ best_access_path(JOIN      *join,
     rec= s->records/MATCHING_ROWS_IN_OTHER_TABLE;  // Assumed records/key
     for (keyuse=s->keyuse ; keyuse->table == table ;)
     {
+      KEY *keyinfo;
       key_part_map found_part= 0;
       table_map found_ref= 0;
       uint key= keyuse->key;
-      KEY *keyinfo= table->key_info+key;
       bool ft_key=  (keyuse->keypart == FT_KEYPART);
       /* Bitmap of keyparts where the ref access is over 'keypart=const': */
       key_part_map const_part= 0;
       /* The or-null keypart in ref-or-null access: */
       key_part_map ref_or_null_part= 0;
+      if (is_hash_join_key_no(key))
+      {
+        /* 
+          Hash join as any join employing join buffer can be used to join
+          only those tables that are joined after the first non const table
+	*/  
+        if (!(remaining_tables & keyuse->used_tables) &&
+            idx > join->const_tables)
+        {
+          if (!hj_start_key)
+            hj_start_key= keyuse;
+          bitmap_set_bit(eq_join_set, keyuse->keypart);
+        }
+        keyuse++;
+        continue;
+      }
+
+      keyinfo= table->key_info+key;
 
       /* Calculate how many key segments of the current key we can use */
       start_key= keyuse;
 
+      loose_scan_opt.next_ref_key();
+      DBUG_PRINT("info", ("Considering ref access on key %s",
+                          keyuse->table->key_info[keyuse->key].name));
+
       do /* For each keypart */
       {
         uint keypart= keyuse->keypart;
@@ -4342,7 +4718,6 @@ best_access_path(JOIN      *join,
         
         do /* For each way to access the keypart */
         {
-
           /*
             if 1. expression doesn't refer to forward tables
                2. we won't get two ref-or-null's
@@ -4371,6 +4746,7 @@ best_access_path(JOIN      *join,
             if (keyuse->optimize & KEY_OPTIMIZE_REF_OR_NULL)
               ref_or_null_part |= keyuse->keypart_map;
           }
+          loose_scan_opt.add_keyuse(remaining_tables, keyuse);
           keyuse++;
         } while (keyuse->table == table && keyuse->key == key &&
                  keyuse->keypart == keypart);
@@ -4380,7 +4756,7 @@ best_access_path(JOIN      *join,
       /*
         Assume that that each key matches a proportional part of table.
       */
-      if (!found_part && !ft_key)
+      if (!found_part && !ft_key && !loose_scan_opt.have_a_case())
         continue;                               // Nothing usable found
 
       if (rec < MATCHING_ROWS_IN_OTHER_TABLE)
@@ -4400,10 +4776,10 @@ best_access_path(JOIN      *join,
       }
       else
       {
-        found_constraint= 1;
-        /*
-          Check if we found full key
-        */
+        found_constraint= test(found_part);
+        loose_scan_opt.check_ref_access_part1(s, key, start_key, found_part);
+
+        /* Check if we found full key */
         if (found_part == PREV_BITS(uint,keyinfo->key_parts) &&
             !ref_or_null_part)
         {                                         /* use eq key */
@@ -4465,7 +4841,8 @@ best_access_path(JOIN      *join,
                 in ReuseRangeEstimateForRef-3.
               */
               if (table->quick_keys.is_set(key) &&
-                  const_part & (1 << table->quick_key_parts[key]) &&
+                  (const_part & ((1 << table->quick_key_parts[key])-1)) ==
+                  (((key_part_map)1 << table->quick_key_parts[key])-1) &&
                   table->quick_n_ranges[key] == 1 &&
                   records > (double) table->quick_rows[key])
               {
@@ -4476,14 +4853,11 @@ best_access_path(JOIN      *join,
             tmp= records;
             set_if_smaller(tmp, (double) thd->variables.max_seeks_for_key);
             if (table->covering_keys.is_set(key))
-            {
-              /* we can use only index tree */
-              uint keys_per_block= table->file->stats.block_size/2/
-                (keyinfo->key_length+table->file->ref_length)+1;
-              tmp= record_count*(tmp+keys_per_block-1)/keys_per_block;
-            }
+              tmp= table->file->keyread_time(key, 1, (ha_rows) tmp);
             else
-              tmp= record_count*min(tmp,s->worst_seeks);
+              tmp= table->file->read_time(key, 1,
+                                          (ha_rows) min(tmp,s->worst_seeks)-1);
+            tmp*= record_count;
           }
         }
         else
@@ -4643,20 +5017,19 @@ best_access_path(JOIN      *join,
             /* Limit the number of matched rows */
             set_if_smaller(tmp, (double) thd->variables.max_seeks_for_key);
             if (table->covering_keys.is_set(key))
-            {
-              /* we can use only index tree */
-              uint keys_per_block= table->file->stats.block_size/2/
-                (keyinfo->key_length+table->file->ref_length)+1;
-              tmp= record_count*(tmp+keys_per_block-1)/keys_per_block;
-            }
+              tmp= table->file->keyread_time(key, 1, (ha_rows) tmp);
             else
-              tmp= record_count*min(tmp,s->worst_seeks);
+              tmp= table->file->read_time(key, 1,
+                                          (ha_rows) min(tmp,s->worst_seeks)-1);
+            tmp*= record_count;
           }
           else
             tmp= best_time;                    // Do nothing
         }
+        loose_scan_opt.check_ref_access_part2(key, start_key, records, tmp);
+
       } /* not ft_key */
-      if (tmp < best_time - records/(double) TIME_FOR_COMPARE)
+      if (tmp + 0.0001 < best_time - records/(double) TIME_FOR_COMPARE)
       {
         best_time= tmp + records/(double) TIME_FOR_COMPARE;
         best= tmp;
@@ -4665,10 +5038,45 @@ best_access_path(JOIN      *join,
         best_max_key_part= max_key_part;
         best_ref_depends_map= found_ref;
       }
-    }
+    } /* for each key */
     records= best_records;
   }
 
+  /* 
+    If there is no key to access the table, but there is an equi-join
+    predicate connecting the table with the previous tables then we
+    consider the possibility of using hash join.
+    We need also to check that:
+    (1) s is inner table of semi-join -> join cache is allowed for semijoins
+    (2) s is inner table of outer join -> join cache is allowed for outer joins
+  */  
+  if (idx > join->const_tables && best_key == 0 && 
+     !bitmap_is_clear_all(eq_join_set) &&  !disable_jbuf &&
+      (!s->emb_sj_nest ||                     
+       join->allowed_semijoin_with_cache) &&    // (1)
+      (!(s->table->map & join->outer_join) ||
+       join->allowed_outer_join_with_cache))    // (2)
+  {
+    double join_sel= 0.1;
+    /* Estimate the cost of  the hash join access to the table */
+    ha_rows rnd_records= matching_candidates_in_table(s, found_constraint);
+
+    tmp= s->quick ? s->quick->read_time : s->table->file->scan_time();
+    tmp+= (s->records - rnd_records)/(double) TIME_FOR_COMPARE;
+
+    /* We read the table as many times as join buffer becomes full. */
+    tmp*= (1.0 + floor((double) cache_record_length(join,idx) *
+                          record_count /
+                          (double) thd->variables.join_buff_size));
+    best_time= tmp + 
+               (record_count*join_sel) / TIME_FOR_COMPARE * rnd_records;
+    best= tmp;
+    records= rows2double(rnd_records);
+    best_key= hj_start_key;
+    best_ref_depends_map= 0;
+    best_uses_jbuf= TRUE;
+   }
+
   /*
     Don't test table scan if it can't be better.
     Prefer key lookup if we would use the same key for scanning.
@@ -4678,7 +5086,7 @@ best_access_path(JOIN      *join,
     This is because table scans uses index and we would not win
     anything by using a table scan.
 
-    A word for word translation of the below if-statement in psergey's
+    A word for word translation of the below if-statement in sergefp's
     understanding: we check if we should use table scan if:
     (1) The found 'ref' access produces more records than a table scan
         (or index scan, or quick select), or 'ref' is more expensive than
@@ -4704,31 +5112,14 @@ best_access_path(JOIN      *join,
         ! s->table->covering_keys.is_clear_all() && best_key && !s->quick) &&// (3)
       !(s->table->force_index && best_key && !s->quick))                 // (4)
   {                                             // Check full join
-    ha_rows rnd_records= s->found_records;
-    /*
-      If there is a filtering condition on the table (i.e. ref analyzer found
-      at least one "table.keyXpartY= exprZ", where exprZ refers only to tables
-      preceding this table in the join order we're now considering), then 
-      assume that 25% of the rows will be filtered out by this condition.
-
-      This heuristic is supposed to force tables used in exprZ to be before
-      this table in join order.
-    */
-    if (found_constraint)
-      rnd_records-= rnd_records/4;
-
-    /*
-      If applicable, get a more accurate estimate. Don't use the two
-      heuristics at once.
-    */
-    if (s->table->quick_condition_rows != s->found_records)
-      rnd_records= s->table->quick_condition_rows;
+    ha_rows rnd_records= matching_candidates_in_table(s, found_constraint);
 
     /*
       Range optimizer never proposes a RANGE if it isn't better
       than FULL: so if RANGE is present, it's always preferred to FULL.
       Here we estimate its cost.
     */
+
     if (s->quick)
     {
       /*
@@ -4743,12 +5134,14 @@ best_access_path(JOIN      *join,
       tmp= record_count *
         (s->quick->read_time +
          (s->found_records - rnd_records)/(double) TIME_FOR_COMPARE);
+
+      loose_scan_opt.check_range_access(join, idx, s->quick);
     }
     else
     {
       /* Estimate cost of reading table. */
       tmp= s->table->file->scan_time();
-      if (s->table->map & join->outer_join)     // Can't use join cache
+      if ((s->table->map & join->outer_join) || disable_jbuf)     // Can't use join cache
       {
         /*
           For each record we have to:
@@ -4783,7 +5176,8 @@ best_access_path(JOIN      *join,
     */
     if (best == DBL_MAX ||
         (tmp  + record_count/(double) TIME_FOR_COMPARE*rnd_records <
-         best + record_count/(double) TIME_FOR_COMPARE*records))
+         (best_key->is_for_hash_join() ? best_time :
+          best + record_count/(double) TIME_FOR_COMPARE*records)))
     {
       /*
         If the table has a range (s->quick is set) make_join_select()
@@ -4794,15 +5188,21 @@ best_access_path(JOIN      *join,
       best_key= 0;
       /* range/index_merge/ALL/index access method are "independent", so: */
       best_ref_depends_map= 0;
+      best_uses_jbuf= test(!disable_jbuf && !((s->table->map & 
+                                               join->outer_join)));
     }
   }
-
+  
   /* Update the cost information for the current partial plan */
-  join->positions[idx].records_read= records;
-  join->positions[idx].read_time=    best;
-  join->positions[idx].key=          best_key;
-  join->positions[idx].table=        s;
-  join->positions[idx].ref_depend_map= best_ref_depends_map;
+  pos->records_read= records;
+  pos->read_time=    best;
+  pos->key=          best_key;
+  pos->table=        s;
+  pos->ref_depend_map= best_ref_depends_map;
+  pos->loosescan_key= MAX_KEY;
+  pos->use_join_buffer= best_uses_jbuf;
+   
+  loose_scan_opt.save_to_position(s, loose_scan_pos);
 
   if (!best_key &&
       idx == join->const_tables &&
@@ -4837,7 +5237,7 @@ best_access_path(JOIN      *join,
     TRUE        Fatal error
 */
 
-static bool
+bool
 choose_plan(JOIN *join, table_map join_tables)
 {
   uint search_depth= join->thd->variables.optimizer_search_depth;
@@ -4846,19 +5246,34 @@ choose_plan(JOIN *join, table_map join_tables)
   DBUG_ENTER("choose_plan");
 
   join->cur_embedding_map= 0;
+  join->cur_dups_producing_tables= 0;
   reset_nj_counters(join, join->join_list);
-  /*
-    if (SELECT_STRAIGHT_JOIN option is set)
-      reorder tables so dependent tables come after tables they depend 
-      on, otherwise keep tables in the order they were specified in the query 
-    else
-      Apply heuristic: pre-sort all access plans with respect to the number of
-      records accessed.
-  */
-  my_qsort(join->best_ref + join->const_tables,
-           join->tables - join->const_tables, sizeof(JOIN_TAB*),
-           straight_join ? join_tab_cmp_straight : join_tab_cmp);
-  
+  qsort2_cmp jtab_sort_func;
+
+  if (join->emb_sjm_nest)
+  {
+    /* We're optimizing semi-join materialization nest, so put the 
+       tables from this semi-join as first
+    */
+    jtab_sort_func= join_tab_cmp_embedded_first;
+  }
+  else
+  {
+    /*
+      if (SELECT_STRAIGHT_JOIN option is set)
+        reorder tables so dependent tables come after tables they depend 
+        on, otherwise keep tables in the order they were specified in the query 
+      else
+        Apply heuristic: pre-sort all access plans with respect to the number of
+        records accessed.
+    */
+    jtab_sort_func= straight_join ? join_tab_cmp_straight : join_tab_cmp;
+  }
+  my_qsort2(join->best_ref + join->const_tables,
+            join->tables - join->const_tables, sizeof(JOIN_TAB*),
+            jtab_sort_func, (void*)join->emb_sjm_nest);
+  join->cur_sj_inner_tables= 0;
+
   if (straight_join)
   {
     optimize_straight_join(join, join_tables);
@@ -4922,7 +5337,7 @@ choose_plan(JOIN *join, table_map join_tables)
 */
 
 static int
-join_tab_cmp(const void* ptr1, const void* ptr2)
+join_tab_cmp(const void *dummy, const void* ptr1, const void* ptr2)
 {
   JOIN_TAB *jt1= *(JOIN_TAB**) ptr1;
   JOIN_TAB *jt2= *(JOIN_TAB**) ptr2;
@@ -4944,11 +5359,19 @@ join_tab_cmp(const void* ptr1, const void* ptr2)
 */
 
 static int
-join_tab_cmp_straight(const void* ptr1, const void* ptr2)
+join_tab_cmp_straight(const void *dummy, const void* ptr1, const void* ptr2)
 {
   JOIN_TAB *jt1= *(JOIN_TAB**) ptr1;
   JOIN_TAB *jt2= *(JOIN_TAB**) ptr2;
 
+  /*
+    We don't do subquery flattening if the parent or child select has
+    STRAIGHT_JOIN modifier. It is complicated to implement and the semantics
+    is hardly useful.
+  */
+  DBUG_ASSERT(!jt1->emb_sj_nest);
+  DBUG_ASSERT(!jt2->emb_sj_nest);
+
   if (jt1->dependent & jt2->table->map)
     return 1;
   if (jt2->dependent & jt1->table->map)
@@ -4956,6 +5379,38 @@ join_tab_cmp_straight(const void* ptr1, const void* ptr2)
   return jt1 > jt2 ? 1 : (jt1 < jt2 ? -1 : 0);
 }
 
+
+/*
+  my_qsort2 comparator: same as join_tab_cmp, but tables from within the
+  semi-join nest 'emb' go first. Used when optimizing sj-materialization nests.
+*/
+
+static int
+join_tab_cmp_embedded_first(const void *emb,  const void* ptr1, const void* ptr2)
+{
+  const TABLE_LIST *emb_nest= (TABLE_LIST*) emb;
+  JOIN_TAB *jt1= *(JOIN_TAB**) ptr1;
+  JOIN_TAB *jt2= *(JOIN_TAB**) ptr2;
+  /* First: a table inside the nest 'emb' precedes a table outside of it. */
+  if (jt1->emb_sj_nest == emb_nest && jt2->emb_sj_nest != emb_nest)
+    return -1;
+  if (jt1->emb_sj_nest != emb_nest && jt2->emb_sj_nest == emb_nest)
+    return 1;
+  /* Next: a table whose 'dependent' map has the other table sorts after it. */
+  if (jt1->dependent & jt2->table->map)
+    return 1;
+  if (jt2->dependent & jt1->table->map)
+    return -1;
+  /* Next: the table with fewer found_records sorts first. */
+  if (jt1->found_records > jt2->found_records)
+    return 1;
+  if (jt1->found_records < jt2->found_records)
+    return -1; 
+  /* Last: tie-break on the JOIN_TAB addresses themselves. */
+  return jt1 > jt2 ? 1 : (jt1 < jt2 ? -1 : 0);
+}
+
+
 /**
   Heuristic procedure to automatically guess a reasonable degree of
   exhaustiveness for the greedy search procedure.
@@ -5039,15 +5494,20 @@ optimize_straight_join(JOIN *join, table_map join_tables)
   uint idx= join->const_tables;
   double    record_count= 1.0;
   double    read_time=    0.0;
- 
+  POSITION  loose_scan_pos;
+
   for (JOIN_TAB **pos= join->best_ref + idx ; (s= *pos) ; pos++)
   {
     /* Find the best access method from 's' to the current partial plan */
-    best_access_path(join, s, join->thd, join_tables, idx,
-                     record_count, read_time);
+    best_access_path(join, s, join_tables, idx, FALSE, record_count,
+                     join->positions + idx, &loose_scan_pos);
+
     /* compute the cost of the new plan extended with 's' */
     record_count*= join->positions[idx].records_read;
     read_time+=    join->positions[idx].read_time;
+    advance_sj_state(join, join_tables, s, idx, &record_count, &read_time,
+                     &loose_scan_pos);
+
     join_tables&= ~(s->table->map);
     ++idx;
   }
@@ -5156,11 +5616,17 @@ greedy_search(JOIN      *join,
   uint      size_remain;    // cardinality of remaining_tables
   POSITION  best_pos;
   JOIN_TAB  *best_table; // the next plan node to be added to the curr QEP
+  uint      n_tables; // ==join->tables or # tables in the sj-mat nest we're optimizing
 
   DBUG_ENTER("greedy_search");
 
   /* number of tables that remain to be optimized */
-  size_remain= my_count_bits(remaining_tables);
+  n_tables= size_remain= my_count_bits(remaining_tables &
+                                       (join->emb_sjm_nest? 
+                                         (join->emb_sjm_nest->sj_inner_tables &
+                                          ~join->const_table_map)
+                                         :
+                                         ~(table_map)0));
 
   do {
     /* Find the extension of the current QEP with the lowest cost */
@@ -5180,7 +5646,7 @@ greedy_search(JOIN      *join,
         'join->best_positions' contains a complete optimal extension of the
         current partial QEP.
       */
-      DBUG_EXECUTE("opt", print_plan(join, join->tables,
+      DBUG_EXECUTE("opt", print_plan(join, n_tables,
                                      record_count, read_time, read_time,
                                      "optimal"););
       DBUG_RETURN(FALSE);
@@ -5203,11 +5669,12 @@ greedy_search(JOIN      *join,
       the interleaving state to the one of the non-extended partial plan 
       on exit.
     */
-    IF_DBUG(bool is_interleave_error= )
-    check_interleaving_with_nj (best_table);
+    bool is_interleave_error __attribute__((unused))= 
+      check_interleaving_with_nj(best_table);
     /* This has been already checked by best_extension_by_limited_search */
     DBUG_ASSERT(!is_interleave_error);
 
+
     /* find the position of 'best_table' in 'join->best_ref' */
     best_idx= idx;
     JOIN_TAB *pos= join->best_ref[best_idx];
@@ -5232,6 +5699,49 @@ greedy_search(JOIN      *join,
 }
 
 
+/*
+  Calculate the cost of a given partial join order
+ 
+  SYNOPSIS
+    get_partial_join_cost()
+      join               IN    Join to use. join->best_positions holds the
+                               join order that is read
+      n_tables           IN    # tables to cost, counted after the const tables
+      read_time_arg      OUT   Store read time here 
+      record_count_arg   OUT   Store record count here
+
+  DESCRIPTION
+
+    This is needed for semi-join materialization code. The idea is that 
+    we detect sj-materialization after we've put all sj-inner tables into
+    the join prefix
+
+      prefix-tables semi-join-inner-tables  tN
+                                             ^--we're here
+
+    and we'll need to get the cost of the prefix-tables prefix again.
+*/
+
+void get_partial_join_cost(JOIN *join, uint n_tables, double *read_time_arg,
+                           double *record_count_arg)
+{
+  double record_count= 1;    // running product of records_read
+  double read_time= 0.0;     // running sum of read_time
+  for (uint i= join->const_tables; i < n_tables + join->const_tables ; i++)
+  {
+    if (join->best_positions[i].records_read)  // zero records_read: skipped
+    {
+      record_count *= join->best_positions[i].records_read;
+      read_time += join->best_positions[i].read_time;
+    }
+  }
+  *read_time_arg= read_time;// record_count / TIME_FOR_COMPARE is not added
+  *record_count_arg= record_count;
+}
+
+
+
+
 /**
   Find a good, possibly optimal, query execution plan (QEP) by a possibly
   exhaustive search.
@@ -5378,21 +5888,37 @@ best_extension_by_limited_search(JOIN      *join,
   DBUG_EXECUTE("opt", print_plan(join, idx, record_count, read_time, read_time,
                                 "part_plan"););
 
+  /* 
+    If we are searching for the execution plan of a materialized semi-join nest
+    then allowed_tables contains bits only for the tables from this nest.
+  */
+  table_map allowed_tables= ~(table_map)0;
+  if (join->emb_sjm_nest)
+    allowed_tables= join->emb_sjm_nest->sj_inner_tables & ~join->const_table_map;
+
   for (JOIN_TAB **pos= join->best_ref + idx ; (s= *pos) ; pos++)
   {
     table_map real_table_bit= s->table->map;
     if ((remaining_tables & real_table_bit) && 
+        (allowed_tables & real_table_bit) &&
         !(remaining_tables & s->dependent) && 
         (!idx || !check_interleaving_with_nj(s)))
     {
       double current_record_count, current_read_time;
+      POSITION *position= join->positions + idx;
 
       /* Find the best access method from 's' to the current partial plan */
-      best_access_path(join, s, thd, remaining_tables, idx,
-                       record_count, read_time);
+      POSITION loose_scan_pos;
+      best_access_path(join, s, remaining_tables, idx, FALSE, record_count, 
+                       join->positions + idx, &loose_scan_pos);
+
       /* Compute the cost of extending the plan with 's' */
-      current_record_count= record_count * join->positions[idx].records_read;
-      current_read_time=    read_time + join->positions[idx].read_time;
+
+      current_record_count= record_count * position->records_read;
+      current_read_time=    read_time + position->read_time;
+
+      advance_sj_state(join, remaining_tables, s, idx, &current_record_count,
+                       &current_read_time, &loose_scan_pos);
 
       /* Expand only partial plans with lower cost than the best QEP so far */
       if ((current_read_time +
@@ -5406,6 +5932,7 @@ best_extension_by_limited_search(JOIN      *join,
                                         (double) TIME_FOR_COMPARE),
                                        "prune_by_cost"););
         restore_prev_nj_state(s);
+        restore_prev_sj_state(remaining_tables, s, idx);
         continue;
       }
 
@@ -5418,12 +5945,12 @@ best_extension_by_limited_search(JOIN      *join,
         if (best_record_count > current_record_count ||
             best_read_time > current_read_time ||
             (idx == join->const_tables &&  // 's' is the first table in the QEP
-             s->table == join->sort_by_table))
+            s->table == join->sort_by_table))
         {
           if (best_record_count >= current_record_count &&
               best_read_time >= current_read_time &&
               /* TODO: What is the reasoning behind this condition? */
-              (!(s->key_dependent & remaining_tables) ||
+              (!(s->key_dependent & allowed_tables & remaining_tables) ||
                join->positions[idx].records_read < 2.0))
           {
             best_record_count= current_record_count;
@@ -5438,11 +5965,12 @@ best_extension_by_limited_search(JOIN      *join,
                                          current_read_time,
                                          "pruned_by_heuristic"););
           restore_prev_nj_state(s);
+          restore_prev_sj_state(remaining_tables, s, idx);
           continue;
         }
       }
 
-      if ( (search_depth > 1) && (remaining_tables & ~real_table_bit) )
+      if ( (search_depth > 1) && (remaining_tables & ~real_table_bit) & allowed_tables )
       { /* Recursively expand the current partial plan */
         swap_variables(JOIN_TAB*, join->best_ref[idx], *pos);
         if (best_extension_by_limited_search(join,
@@ -5479,6 +6007,7 @@ best_extension_by_limited_search(JOIN      *join,
                                        "full_plan"););
       }
       restore_prev_nj_state(s);
+      restore_prev_sj_state(remaining_tables, s, idx);
     }
   }
   DBUG_RETURN(FALSE);
@@ -5532,8 +6061,9 @@ find_best(JOIN *join,table_map rest_tables,uint idx,double record_count,
         (!idx|| !check_interleaving_with_nj(s)))
     {
       double records, best;
-      best_access_path(join, s, thd, rest_tables, idx, record_count, 
-                       read_time);
+      POSITION loose_scan_pos;
+      best_access_path(join, s, rest_tables, idx, FALSE, record_count, 
+                       join->positions + idx, &loose_scan_pos);
       records= join->positions[idx].records_read;
       best= join->positions[idx].read_time;
       /*
@@ -5542,6 +6072,9 @@ find_best(JOIN *join,table_map rest_tables,uint idx,double record_count,
       */
       double current_record_count=record_count*records;
       double current_read_time=read_time+best;
+      advance_sj_state(join, rest_tables, s, idx, &current_record_count, 
+                       &current_read_time, &loose_scan_pos);
+
       if (best_record_count > current_record_count ||
 	  best_read_time > current_read_time ||
 	  (idx == join->const_tables && s->table == join->sort_by_table))
@@ -5560,6 +6093,7 @@ find_best(JOIN *join,table_map rest_tables,uint idx,double record_count,
 	swap_variables(JOIN_TAB*, join->best_ref[idx], *pos);
       }
       restore_prev_nj_state(s);
+      restore_prev_sj_state(rest_tables, s, idx);
       if (join->select_options & SELECT_STRAIGHT_JOIN)
 	break;				// Don't test all combinations
     }
@@ -5572,14 +6106,15 @@ find_best(JOIN *join,table_map rest_tables,uint idx,double record_count,
   Find how much space the prevous read not const tables takes in cache.
 */
 
-static void calc_used_field_length(THD *thd, JOIN_TAB *join_tab)
+void JOIN_TAB::calc_used_field_length(bool max_fl)
 {
   uint null_fields,blobs,fields,rec_length;
   Field **f_ptr,*field;
-  MY_BITMAP *read_set= join_tab->table->read_set;
+  uint uneven_bit_fields;
+  MY_BITMAP *read_set= table->read_set;
 
-  null_fields= blobs= fields= rec_length=0;
-  for (f_ptr=join_tab->table->field ; (field= *f_ptr) ; f_ptr++)
+  uneven_bit_fields= null_fields= blobs= fields= rec_length=0;
+  for (f_ptr=table->field ; (field= *f_ptr) ; f_ptr++)
   {
     if (bitmap_is_set(read_set, field->field_index))
     {
@@ -5590,21 +6125,109 @@ static void calc_used_field_length(THD *thd, JOIN_TAB *join_tab)
 	blobs++;
       if (!(flags & NOT_NULL_FLAG))
 	null_fields++;
+      if (field->type() == MYSQL_TYPE_BIT &&
+          ((Field_bit*)field)->bit_len)
+        uneven_bit_fields++;
     }
   }
-  if (null_fields)
-    rec_length+=(join_tab->table->s->null_fields+7)/8;
-  if (join_tab->table->maybe_null)
+  if (null_fields || uneven_bit_fields)
+    rec_length+=(table->s->null_fields+7)/8;
+  if (table->maybe_null)
     rec_length+=sizeof(my_bool);
-  if (blobs)
+  if (max_fl)
   {
-    uint blob_length=(uint) (join_tab->table->file->stats.mean_rec_length-
-			     (join_tab->table->s->reclength- rec_length));
-    rec_length+=(uint) max(4,blob_length);
+    // TODO: to improve this estimate for max expected length 
+    if (blobs)
+    {
+      uint blob_length=(uint) (table->file->stats.mean_rec_length-
+			      (table->s->reclength-rec_length));
+      rec_length+=(uint) max(sizeof(void*) * blobs, blob_length);
+    }
+    max_used_fieldlength= rec_length;
+  } 
+  else if (table->file->stats.mean_rec_length)           
+    set_if_smaller(rec_length, table->file->stats.mean_rec_length);
+      
+  /*
+    psergey-todo: why we don't count here rowid that we might need to store
+    when using DuplicateElimination?
+  */
+  used_fields=fields;
+  used_fieldlength=rec_length;
+  used_blobs=blobs;
+  used_null_fields= null_fields;
+  used_uneven_bit_fields= uneven_bit_fields;
+}
+
+
+/* 
+  @brief
+  Extract pushdown conditions for a table scan
+
+  @details
+  This function extracts pushdown conditions usable when this table is scanned.
+  The conditions are extracted either from WHERE or from ON expressions.
+  The conditions are attached to the field cache_select of this table.
+
+  @note 
+  Currently the extracted conditions are used only by BNL and BNLH join
+  algorithms.
+ 
+  @retval  0   on success (also when no condition could be extracted)
+           1   otherwise (the copy of 'select' could not be allocated)
+*/ 
+
+int JOIN_TAB::make_scan_filter()
+{
+  COND *tmp;
+  DBUG_ENTER("make_scan_filter");
+  /* Inner table of an outer join: use the ON expression, else join->conds */
+  Item *cond= is_inner_table_of_outer_join() ?
+                *get_first_inner_table()->on_expr_ref : join->conds;
+  /* cache_select is set only if make_cond_for_table() returns a condition */
+  if (cond &&
+      (tmp=make_cond_for_table(cond, join->const_table_map | table->map,
+			       table->map, FALSE, TRUE)))
+  {
+     DBUG_EXECUTE("where",print_where(tmp,"cache", QT_ORDINARY););
+     if (!(cache_select=     // byte-wise copy of *select made with memdup()
+          (SQL_SELECT*) join->thd->memdup((uchar*) select, sizeof(SQL_SELECT))))
+	DBUG_RETURN(1);
+     cache_select->cond= tmp;          // the copy gets the extracted condition
+     cache_select->read_tables=join->const_table_map;
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief
+  Check whether hash join algorithm can be used to join this table
+
+  @details
+  This function finds out whether the ref items that have been chosen
+  by the planner to access this table can be used for hash join algorithms.
+  The answer depends on a certain property of the fields of the
+  joined tables on which the hash join key is built.
+  
+  @note
+  At present the function is supposed to be called only after the function
+  get_best_combination has been called.
+
+  @retval TRUE    it's possible to use hash join to join this table
+  @retval FALSE   otherwise
+*/
+
+bool JOIN_TAB::hash_join_is_possible()
+{
+  if (type != JT_REF && type != JT_EQ_REF)  // only ref and eq_ref qualify
+    return FALSE;
+  if (!is_ref_for_hash_join())
+  {
+    KEY *keyinfo= table->key_info + ref.key;  // ref.key indexes key_info here
+    return keyinfo->key_part[0].field->hash_join_is_possible(); // 1st part only
   }
-  join_tab->used_fields=fields;
-  join_tab->used_fieldlength=rec_length;
-  join_tab->used_blobs=blobs;
+  return TRUE;  // is_ref_for_hash_join() held: no per-field check is made
 }
 
 
@@ -5613,16 +6236,13 @@ cache_record_length(JOIN *join,uint idx)
 {
   uint length=0;
   JOIN_TAB **pos,**end;
-  THD *thd=join->thd;
 
   for (pos=join->best_ref+join->const_tables,end=join->best_ref+idx ;
        pos != end ;
        pos++)
   {
     JOIN_TAB *join_tab= *pos;
-    if (!join_tab->used_fieldlength)		/* Not calced yet */
-      calc_used_field_length(thd, join_tab);
-    length+=join_tab->used_fieldlength;
+    length+= join_tab->get_used_fieldlength();
   }
   return length;
 }
@@ -5713,8 +6333,24 @@ prev_record_reads(JOIN *join, uint idx, table_map found_ref)
 }
 
 
-/**
-  Set up join struct according to best position.
+/*
+  Set up join struct according to the picked join order
+  
+  SYNOPSIS
+    get_best_combination()
+      join  The join to process (the picked join order is mainly in
+            join->best_positions)
+
+  DESCRIPTION
+    Set up join structures according to the picked join order
+    - finalize semi-join strategy choices (see
+        fix_semijoin_strategies_for_picked_join_order)
+    - create join->join_tab array and put there the JOIN_TABs in the join order
+    - create data structures describing ref access methods.
+
+  RETURN 
+    FALSE  OK
+    TRUE   Out of memory
 */
 
 static bool
@@ -5736,6 +6372,9 @@ get_best_combination(JOIN *join)
   join->full_join=0;
 
   used_tables= OUTER_REF_TABLE_BIT;		// Outer row is already read
+
+  fix_semijoin_strategies_for_picked_join_order(join);
+  
   for (j=join_tab, tablenr=0 ; tablenr < table_count ; tablenr++,j++)
   {
     TABLE *form;
@@ -5749,14 +6388,17 @@ get_best_combination(JOIN *join)
     if (j->type == JT_CONST)
       continue;					// Handled in make_join_stat..
 
+    j->loosescan_match_tab= NULL;  //non-nulls will be set later
     j->ref.key = -1;
     j->ref.key_parts=0;
 
     if (j->type == JT_SYSTEM)
       continue;
-    if (j->keys.is_clear_all() || !(keyuse= join->best_positions[tablenr].key))
+    if ( !(keyuse= join->best_positions[tablenr].key) || 
+        (join->best_positions[tablenr].sj_strategy == SJ_OPT_LOOSE_SCAN))
     {
       j->type=JT_ALL;
+      j->index= join->best_positions[tablenr].loosescan_key;
       if (tablenr != join->const_tables)
 	join->full_join=1;
     }
@@ -5770,22 +6412,108 @@ get_best_combination(JOIN *join)
   DBUG_RETURN(0);
 }
 
+/**
+  Create a descriptor of hash join key to access a given join table
+
+  @param   join         join which the join table belongs to
+  @param   join_tab     the join table to access
+  @param   org_keyuse   beginning of the key uses to join this table
+  @param   used_tables  bitmap of the previous tables
 
-static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
-			       table_map used_tables)
+  @details
+  This function first finds key uses that can be utilized by the hash join
+  algorithm to join join_tab to the previous tables marked in the bitmap 
+  used_tables.  The tested key uses are taken from the array of all key uses
+  for 'join' starting from the position org_keyuse. After all interesting key
+  uses have been found the function builds a descriptor of the corresponding
+  key that is used by the hash join algorithm should it be chosen to join
+  the table join_tab. The descriptor is stored in join_tab->hj_key.
+
+  @retval  FALSE  the descriptor for a hash join key is successfully created
+  @retval  TRUE   no usable key use was found or a memory allocation failed
+*/
+
+static bool create_hj_key_for_table(JOIN *join, JOIN_TAB *join_tab,
+                                    KEYUSE *org_keyuse, table_map used_tables)
 {
-  KEYUSE *keyuse=org_keyuse;
-  bool ftkey=(keyuse->keypart == FT_KEYPART);
+  KEY *keyinfo;
+  KEY_PART_INFO *key_part_info;
+  KEYUSE *keyuse= org_keyuse;
+  uint key_parts= 0;
   THD  *thd= join->thd;
-  uint keyparts,length,key;
+  TABLE *table= join_tab->table;
+  bool first_keyuse= TRUE;
+  DBUG_ENTER("create_hj_key_for_table");
+  /* Pass 1: count usable key uses, skipping a repeat of the previous keypart */
+  do
+  {
+    if (!(~used_tables & keyuse->used_tables) &&
+	(first_keyuse || keyuse->keypart != (keyuse-1)->keypart))
+      key_parts++;
+    first_keyuse= FALSE;     // cleared for every key use, counted or not
+    keyuse++;
+  } while (keyuse->table == table && keyuse->is_for_hash_join());
+  if (!key_parts)
+    DBUG_RETURN(TRUE);
+  /* This memory is allocated only once for the joined table join_tab */
+  if (!(keyinfo= (KEY *) thd->alloc(sizeof(KEY))) ||
+      !(key_part_info = (KEY_PART_INFO *) thd->alloc(sizeof(KEY_PART_INFO)*
+                                                     key_parts)))
+    DBUG_RETURN(TRUE);
+  keyinfo->usable_key_parts= keyinfo->key_parts = key_parts;
+  keyinfo->key_part= key_part_info;
+  keyinfo->key_length=0;
+  keyinfo->algorithm= HA_KEY_ALG_UNDEF;
+  keyinfo->flags= HA_GENERATED_KEY;
+  keyinfo->name= (char *) "$hj";
+  keyinfo->rec_per_key= (ulong*) thd->calloc(sizeof(ulong)*key_parts);
+  if (!keyinfo->rec_per_key)
+    DBUG_RETURN(TRUE);
+  keyinfo->key_part= key_part_info;   // same value as assigned a few lines up
+  /* Pass 2: create a key part for each key use that passes the test below */
+  first_keyuse= TRUE;
+  keyuse= org_keyuse;
+  do
+  {
+    if (!(~used_tables & keyuse->used_tables) &&
+        (first_keyuse || keyuse->keypart != (keyuse-1)->keypart))
+    {
+      Field *field= table->field[keyuse->keypart];  // keypart indexes table->field
+      table->create_key_part_by_field(keyinfo, key_part_info, field);
+      first_keyuse= FALSE;   // NOTE(review): cleared only on a match, unlike pass 1
+      key_part_info++;       // NOTE(review): confirm this cannot exceed key_parts
+    }
+    keyuse++;
+  } while (keyuse->table == table && keyuse->is_for_hash_join());
+
+  join_tab->hj_key= keyinfo;
+
+  DBUG_RETURN(FALSE);
+}
+
+
+static bool create_ref_for_key(JOIN *join, JOIN_TAB *j,
+                               KEYUSE *org_keyuse, table_map used_tables)
+{
+  uint keyparts, length, key;
   TABLE *table;
   KEY *keyinfo;
+  KEYUSE *keyuse= org_keyuse;
+  bool ftkey= (keyuse->keypart == FT_KEYPART);
+  THD *thd= join->thd;
   DBUG_ENTER("create_ref_for_key");
 
   /*  Use best key from find_best */
-  table=j->table;
-  key=keyuse->key;
-  keyinfo=table->key_info+key;
+  table= j->table;
+  key= keyuse->key;
+  if (!is_hash_join_key_no(key))
+    keyinfo= table->key_info+key;
+  else
+  {
+    if (create_hj_key_for_table(join, j, org_keyuse, used_tables))
+      DBUG_RETURN(TRUE);
+    keyinfo= j->hj_key;
+  }
 
   if (ftkey)
   {
@@ -5808,27 +6536,28 @@ static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
     {
       if (!(~used_tables & keyuse->used_tables))
       {
-	if (keyparts == keyuse->keypart &&
-	    !(found_part_ref_or_null & keyuse->optimize))
-	{
-	  keyparts++;
-	  length+= keyinfo->key_part[keyuse->keypart].store_length;
-	  found_part_ref_or_null|= keyuse->optimize;
-	}
+        if ((is_hash_join_key_no(key) && 
+            (keyparts == 0 || keyuse->keypart != (keyuse-1)->keypart)) ||
+            (!is_hash_join_key_no(key) && keyparts == keyuse->keypart &&
+             !(found_part_ref_or_null & keyuse->optimize)))
+        {
+           length+= keyinfo->key_part[keyparts].store_length;
+           keyparts++;
+           found_part_ref_or_null|= keyuse->optimize & ~KEY_OPTIMIZE_EQ;
+        }
       }
       keyuse++;
     } while (keyuse->table == table && keyuse->key == key);
   } /* not ftkey */
 
   /* set up fieldref */
-  keyinfo=table->key_info+key;
-  j->ref.key_parts=keyparts;
-  j->ref.key_length=length;
-  j->ref.key=(int) key;
+  j->ref.key_parts= keyparts;
+  j->ref.key_length= length;
+  j->ref.key= (int) key;
   if (!(j->ref.key_buff= (uchar*) thd->calloc(ALIGN_SIZE(length)*2)) ||
       !(j->ref.key_copy= (store_key**) thd->alloc((sizeof(store_key*) *
-						   (keyparts+1)))) ||
-      !(j->ref.items=    (Item**) thd->alloc(sizeof(Item*)*keyparts)) ||
+						          (keyparts+1)))) ||
+      !(j->ref.items=(Item**) thd->alloc(sizeof(Item*)*keyparts)) ||
       !(j->ref.cond_guards= (bool**) thd->alloc(sizeof(uint*)*keyparts)))
   {
     DBUG_RETURN(TRUE);
@@ -5837,7 +6566,7 @@ static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
   j->ref.key_err=1;
   j->ref.has_record= FALSE;
   j->ref.null_rejecting= 0;
-  j->ref.use_count= 0;
+  j->ref.disable_cache= FALSE;
   keyuse=org_keyuse;
 
   store_key **ref_key= j->ref.key_copy;
@@ -5858,9 +6587,11 @@ static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
     uint i;
     for (i=0 ; i < keyparts ; keyuse++,i++)
     {
-      while (keyuse->keypart != i ||
-	     ((~used_tables) & keyuse->used_tables))
-	keyuse++;				/* Skip other parts */
+      while (((~used_tables) & keyuse->used_tables) || 
+	     (keyuse->keypart != 
+              (is_hash_join_key_no(key) ?
+                 keyinfo->key_part[i].field->field_index : i))) 
+	 keyuse++;                              	/* Skip other parts */ 
 
       uint maybe_null= test(keyinfo->key_part[i].null_bit);
       j->ref.items[i]=keyuse->val;		// Save for cond removal
@@ -5871,10 +6602,13 @@ static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
       if (!keyuse->used_tables &&
 	  !(join->select_options & SELECT_DESCRIBE))
       {					// Compare against constant
-	store_key_item tmp(thd, keyinfo->key_part[i].field,
+	store_key_item tmp(thd, 
+                           keyinfo->key_part[i].field,
                            key_buff + maybe_null,
                            maybe_null ?  key_buff : 0,
-                           keyinfo->key_part[i].length, keyuse->val);
+                           keyinfo->key_part[i].length,
+                           keyuse->val,
+                           FALSE);
 	if (thd->is_fatal_error)
 	  DBUG_RETURN(TRUE);
 	tmp.copy();
@@ -5891,7 +6625,7 @@ static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
       */
       if ((keyuse->optimize & KEY_OPTIMIZE_REF_OR_NULL) && maybe_null)
 	null_ref_key= key_buff;
-      key_buff+=keyinfo->key_part[i].store_length;
+      key_buff+= keyinfo->key_part[i].store_length;
     }
   } /* not ftkey */
   *ref_key=0;				// end_marker
@@ -5955,7 +6689,7 @@ get_store_key(THD *thd, KEYUSE *keyuse, table_map used_tables,
 			    key_buff + maybe_null,
 			    maybe_null ? key_buff : 0,
 			    key_part->length,
-			    keyuse->val);
+			    keyuse->val, FALSE);
 }
 
 /**
@@ -6033,10 +6767,13 @@ JOIN::make_simple_join(JOIN *parent, TABLE *temp_table)
   row_limit= unit->select_limit_cnt;
   do_send_rows= row_limit ? 1 : 0;
 
-  join_tab->cache.buff=0;			/* No caching */
+  join_tab->use_join_cache= FALSE;
+  join_tab->cache=0;			        /* No caching */
   join_tab->table=temp_table;
+  join_tab->cache_select= 0;
   join_tab->select=0;
-  join_tab->select_cond=0;
+  join_tab->select_cond= 0;                     // Avoid valgrind warning
+  join_tab->set_select_cond(NULL, __LINE__);
   join_tab->quick=0;
   join_tab->type= JT_ALL;			/* Map through all records */
   join_tab->keys.init();
@@ -6050,6 +6787,12 @@ JOIN::make_simple_join(JOIN *parent, TABLE *temp_table)
   join_tab->read_first_record= join_init_read_record;
   join_tab->join= this;
   join_tab->ref.key_parts= 0;
+  join_tab->keep_current_rowid= FALSE;
+  join_tab->flush_weedout_table= join_tab->check_weed_out_table= NULL;
+  join_tab->do_firstmatch= NULL;
+  join_tab->loosescan_match_tab= NULL;
+  join_tab->emb_sj_nest= NULL;
+  join_tab->pre_idx_push_select_cond= NULL;
   bzero((char*) &join_tab->read_record,sizeof(join_tab->read_record));
   temp_table->status=0;
   temp_table->null_row=0;
@@ -6061,6 +6804,8 @@ inline void add_cond_and_fix(Item **e1, Item *e2)
 {
   if (*e1)
   {
+    if (!e2)
+      return;
     Item *res;
     if ((res= new Item_cond_and(*e1, e2)))
     {
@@ -6131,9 +6876,8 @@ static void add_not_null_conds(JOIN *join)
   for (uint i=join->const_tables ; i < join->tables ; i++)
   {
     JOIN_TAB *tab=join->join_tab+i;
-    if ((tab->type == JT_REF || tab->type == JT_EQ_REF || 
-         tab->type == JT_REF_OR_NULL) &&
-        !tab->table->maybe_null)
+    if (tab->type == JT_REF || tab->type == JT_EQ_REF || 
+        tab->type == JT_REF_OR_NULL)
     {
       for (uint keypart= 0; keypart < tab->ref.key_parts; keypart++)
       {
@@ -6141,8 +6885,9 @@ static void add_not_null_conds(JOIN *join)
         {
           Item *item= tab->ref.items[keypart];
           Item *notnull;
-          DBUG_ASSERT(item->type() == Item::FIELD_ITEM);
-          Item_field *not_null_item= (Item_field*)item;
+          Item *real= item->real_item();
+          DBUG_ASSERT(real->type() == Item::FIELD_ITEM);
+          Item_field *not_null_item= (Item_field*)real;
           JOIN_TAB *referred_tab= not_null_item->field->table->reginfo.join_tab;
           /*
             For UPDATE queries such as:
@@ -6161,11 +6906,17 @@ static void add_not_null_conds(JOIN *join)
           */
           if (notnull->fix_fields(join->thd, &notnull))
             DBUG_VOID_RETURN;
-          DBUG_EXECUTE("where",
-                       print_where(notnull,
-                                   referred_tab->table->alias.c_ptr(),
-                                   QT_ORDINARY););
-          add_cond_and_fix(&referred_tab->select_cond, notnull);
+          DBUG_EXECUTE("where",print_where(notnull,
+                                           referred_tab->table->alias.c_ptr(),
+                                           QT_ORDINARY););
+          if (!tab->first_inner)
+	  {
+            COND *new_cond= referred_tab->select_cond;
+            add_cond_and_fix(&new_cond, notnull);
+            referred_tab->set_select_cond(new_cond, __LINE__);
+          }
+          else
+            add_cond_and_fix(tab->first_inner->on_expr_ref, notnull);
         }
       }
     }
@@ -6248,7 +6999,6 @@ add_found_match_trig_cond(JOIN_TAB *tab, COND *cond, JOIN_TAB *root_tab)
     This function can be called only after the execution plan
     has been chosen.
 */
-
 static void
 make_outerjoin_info(JOIN *join)
 {
@@ -6275,6 +7025,9 @@ make_outerjoin_info(JOIN *join)
     }    
     for ( ; embedding ; embedding= embedding->embedding)
     {
+      /* Ignore sj-nests: */
+      if (!embedding->on_expr)
+        continue;
       NESTED_JOIN *nested_join= embedding->nested_join;
       if (!nested_join->counter)
       {
@@ -6290,10 +7043,17 @@ make_outerjoin_info(JOIN *join)
       }
       if (!tab->first_inner)  
         tab->first_inner= nested_join->first_nested;
+      if (tab->table->reginfo.not_exists_optimize)
+        tab->first_inner->table->reginfo.not_exists_optimize= 1;         
       if (++nested_join->counter < nested_join->n_tables)
         break;
       /* Table tab is the last inner table for nested join. */
       nested_join->first_nested->last_inner= tab;
+      if (tab->first_inner->table->reginfo.not_exists_optimize)
+      {
+        for (JOIN_TAB *join_tab= tab->first_inner; join_tab <= tab; join_tab++)
+          join_tab->table->reginfo.not_exists_optimize= 1;
+      } 
     }
   }
   DBUG_VOID_RETURN;
@@ -6309,6 +7069,12 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
   {
     add_not_null_conds(join);
     table_map used_tables;
+    /*
+      Step #1: Extract constant condition
+       - Extract and check the constant part of the WHERE 
+       - Extract constant parts of ON expressions from outer 
+         joins and attach them appropriately.
+    */
     if (cond)                /* Because of QUICK_GROUP_MIN_MAX_SELECT */
     {                        /* there may be a select without a cond. */    
       if (join->tables > 1)
@@ -6317,11 +7083,23 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
 	  thd->lex->current_select->master_unit() ==
 	  &thd->lex->unit)		// not upper level SELECT
         join->const_table_map|=RAND_TABLE_BIT;
+
+      /*
+        Extract expressions that depend on constant tables
+        1. Const part of the join's WHERE clause can be checked immediately
+           and if it is not satisfied then the join has empty result
+        2. Constant parts of outer joins' ON expressions must be attached 
+           there inside the triggers.
+      */
       {						// Check const tables
         COND *const_cond=
 	  make_cond_for_table(cond,
                               join->const_table_map,
-                              (table_map) 0);
+                              (table_map) 0, TRUE, FALSE);
+        /* Add conditions added by add_not_null_conds(). */
+        for (uint i= 0 ; i < join->const_tables ; i++)
+          add_cond_and_fix(&const_cond, join->join_tab[i].select_cond);
+
         DBUG_EXECUTE("where",print_where(const_cond,"constants", QT_ORDINARY););
         for (JOIN_TAB *tab= join->join_tab+join->const_tables;
              tab < join->join_tab+join->tables ; tab++)
@@ -6331,18 +7109,19 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
             JOIN_TAB *cond_tab= tab->first_inner;
             COND *tmp= make_cond_for_table(*tab->on_expr_ref,
                                            join->const_table_map,
-                                         (  table_map) 0);
+					   (table_map) 0, FALSE, FALSE);
             if (!tmp)
               continue;
             tmp= new Item_func_trig_cond(tmp, &cond_tab->not_null_compl);
             if (!tmp)
               DBUG_RETURN(1);
             tmp->quick_fix_field();
-            cond_tab->select_cond= !cond_tab->select_cond ? tmp :
-	                            new Item_cond_and(cond_tab->select_cond,
-                                                      tmp);
+            COND *new_cond= !cond_tab->select_cond ? tmp :
+              new Item_cond_and(cond_tab->select_cond, tmp);
+            cond_tab->set_select_cond(new_cond, __LINE__);
             if (!cond_tab->select_cond)
 	      DBUG_RETURN(1);
+            cond_tab->select_cond->update_used_tables();
             cond_tab->select_cond->quick_fix_field();
           }       
         }
@@ -6353,20 +7132,43 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
         }
       }
     }
+
+    /*
+      Step #2: Extract WHERE/ON parts
+    */
+    table_map save_used_tables= 0;
     used_tables=((select->const_tables=join->const_table_map) |
 		 OUTER_REF_TABLE_BIT | RAND_TABLE_BIT);
+    JOIN_TAB *tab;
+    table_map current_map;
     for (uint i=join->const_tables ; i < join->tables ; i++)
     {
-      JOIN_TAB *tab=join->join_tab+i;
+      bool is_hj;
+      tab= join->join_tab+i;
       /*
         first_inner is the X in queries like:
         SELECT * FROM t1 LEFT OUTER JOIN (t2 JOIN t3) ON X
       */
       JOIN_TAB *first_inner_tab= tab->first_inner; 
-      table_map current_map= tab->table->map;
+      current_map= tab->table->map;
       bool use_quick_range=0;
       COND *tmp;
 
+      /* 
+        Tables that are within SJ-Materialization nests cannot have their
+        conditions referring to preceding non-const tables.
+         - If we're looking at the first SJM table, reset used_tables
+           to refer to only allowed tables
+      */
+      if (tab->emb_sj_nest && tab->emb_sj_nest->sj_mat_info && 
+          tab->emb_sj_nest->sj_mat_info->is_used &&
+          !(used_tables & tab->emb_sj_nest->sj_inner_tables))
+      {
+        save_used_tables= used_tables;
+        used_tables= join->const_table_map | OUTER_REF_TABLE_BIT | 
+                     RAND_TABLE_BIT;
+      }
+
       /*
 	Following force including random expression in last table condition.
 	It solve problem with select like SELECT * FROM t1 WHERE rand() > 0.5
@@ -6376,8 +7178,9 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
       used_tables|=current_map;
 
       if (tab->type == JT_REF && tab->quick &&
-	  (uint) tab->ref.key == tab->quick->index &&
-	  tab->ref.key_length < tab->quick->max_used_key_length)
+	  (((uint) tab->ref.key == tab->quick->index &&
+	    tab->ref.key_length < tab->quick->max_used_key_length) ||
+	    tab->table->intersect_keys.is_set(tab->ref.key)))
       {
 	/* Range uses longer key;  Use this instead of ref on key */
 	tab->type=JT_ALL;
@@ -6396,10 +7199,24 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
 
       tmp= NULL;
       if (cond)
-        tmp= make_cond_for_table(cond,used_tables,current_map);
+        tmp= make_cond_for_table(cond, used_tables, current_map, FALSE, FALSE);
+      /* Add conditions added by add_not_null_conds(). */
+      if (tab->select_cond)
+        add_cond_and_fix(&tmp, tab->select_cond);
+
+      is_hj= (tab->type == JT_REF || tab->type == JT_EQ_REF) &&
+             (join->allowed_join_cache_types & JOIN_CACHE_HASHED_BIT) &&
+	     ((join->max_allowed_join_cache_level+1)/2 == 2 ||
+              ((join->max_allowed_join_cache_level+1)/2 > 2 &&
+	       is_hash_join_key_no(tab->ref.key))) &&
+              (!tab->emb_sj_nest ||                     
+               join->allowed_semijoin_with_cache) && 
+              (!(tab->table->map & join->outer_join) ||
+               join->allowed_outer_join_with_cache);
+
       if (cond && !tmp && tab->quick)
       {						// Outer join
-        if (tab->type != JT_ALL)
+        if (tab->type != JT_ALL && !is_hj)
         {
           /*
             Don't use the quick method
@@ -6421,7 +7238,8 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
         }
 
       }
-      if (tmp || !cond || tab->type == JT_REF)
+      if (tmp || !cond || tab->type == JT_REF || tab->type == JT_REF_OR_NULL ||
+          tab->type == JT_EQ_REF || first_inner_tab)
       {
         DBUG_EXECUTE("where",
                      print_where(tmp,tab->table->alias.c_ptr(),
@@ -6445,13 +7263,14 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
           */
           if (!(tmp= add_found_match_trig_cond(first_inner_tab, tmp, 0)))
             DBUG_RETURN(1);
-          tab->select_cond=sel->cond=tmp;
+          sel->cond= tmp;
+          tab->set_select_cond(tmp, __LINE__);
           /* Push condition to storage engine if this is enabled
              and the condition is not guarded */
-	  if (thd->variables.engine_condition_pushdown)
+	  if (thd->variables.engine_condition_pushdown && !first_inner_tab)
           {
             COND *push_cond= 
-              make_cond_for_table(tmp, current_map, current_map);
+              make_cond_for_table(tmp, current_map, current_map, FALSE, FALSE);
             if (push_cond)
             {
               /* Push condition to handler */
@@ -6461,7 +7280,10 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
           }
         }
         else
-          tab->select_cond= sel->cond= NULL;
+        {
+          sel->cond= NULL;
+          tab->set_select_cond(NULL, __LINE__);
+        }
 
 	sel->head=tab->table;
         DBUG_EXECUTE("where",
@@ -6471,9 +7293,10 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
 	{
 	  /* Use quick key read if it's a constant and it's not used
 	     with key reading */
-	  if (tab->needed_reg.is_clear_all() && tab->type != JT_EQ_REF
-	      && tab->type != JT_FT && (tab->type != JT_REF ||
-               (uint) tab->ref.key == tab->quick->index))
+	  if ((tab->needed_reg.is_clear_all() && tab->type != JT_EQ_REF
+	       && tab->type != JT_FT &&
+               (tab->type != JT_REF ||
+                (uint) tab->ref.key == tab->quick->index)) || is_hj)
 	  {
 	    sel->quick=tab->quick;		// Use value from get_quick_...
 	    sel->quick_keys.clear_all();
@@ -6526,11 +7349,12 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
 	      sel->cond->quick_fix_field();
 
 	    if (sel->test_quick_select(thd, tab->keys,
-				       used_tables & ~ current_map,
+				       (used_tables & ~ current_map) | OUTER_REF_TABLE_BIT,
 				       (join->select_options &
 					OPTION_FOUND_ROWS ?
 					HA_POS_ERROR :
-					join->unit->select_limit_cnt), 0) < 0)
+					join->unit->select_limit_cnt), 0,
+                                        FALSE) < 0)
             {
 	      /*
 		Before reporting "Impossible WHERE" for the whole query
@@ -6543,7 +7367,8 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
                                          (join->select_options &
                                           OPTION_FOUND_ROWS ?
                                           HA_POS_ERROR :
-                                          join->unit->select_limit_cnt),0) < 0)
+                                          join->unit->select_limit_cnt),0,
+                                          FALSE) < 0)
 		DBUG_RETURN(1);			// Impossible WHERE
             }
             else
@@ -6570,21 +7395,12 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
 	      2 : 1;
 	    sel->read_tables= used_tables & ~current_map;
 	  }
-	  if (i != join->const_tables && tab->use_quick != 2)
+	  if (i != join->const_tables && tab->use_quick != 2 &&
+              !tab->first_inner)
 	  {					/* Read with cache */
-	    if (cond &&
-                (tmp=make_cond_for_table(cond,
-					 join->const_table_map |
-					 current_map,
-					 current_map)))
-	    {
-              DBUG_EXECUTE("where",print_where(tmp,"cache", QT_ORDINARY););
-	      tab->cache.select=(SQL_SELECT*)
-		thd->memdup((uchar*) sel, sizeof(SQL_SELECT));
-	      tab->cache.select->cond=tmp;
-	      tab->cache.select->read_tables=join->const_table_map;
-	    }
-	  }
+            if (tab->make_scan_filter())
+              DBUG_RETURN(1);
+          }
 	}
       }
       
@@ -6606,7 +7422,7 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
           JOIN_TAB *cond_tab= join_tab->first_inner;
           COND *tmp= make_cond_for_table(*join_tab->on_expr_ref,
                                          join->const_table_map,
-                                         (table_map) 0);
+                                         (table_map) 0, FALSE, FALSE);
           if (!tmp)
             continue;
           tmp= new Item_func_trig_cond(tmp, &cond_tab->not_null_compl);
@@ -6618,6 +7434,9 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
           if (!cond_tab->select_cond)
 	    DBUG_RETURN(1);
           cond_tab->select_cond->quick_fix_field();
+          cond_tab->select_cond->update_used_tables();
+          if (cond_tab->select)
+            cond_tab->select->cond= cond_tab->select_cond; 
         }       
       }
 
@@ -6638,7 +7457,9 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
           current_map= tab->table->map;
           used_tables2|= current_map;
           COND *tmp_cond= make_cond_for_table(on_expr, used_tables2,
-                                             current_map);
+                                              current_map, FALSE, FALSE);
+          if (tab == first_inner_tab && tab->on_precond)
+            add_cond_and_fix(&tmp_cond, tab->on_precond);
           if (tmp_cond)
           {
             JOIN_TAB *cond_tab= tab < first_inner_tab ? first_inner_tab : tab;
@@ -6675,16 +7496,133 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
             if (!(*sel_cond_ref))
               DBUG_RETURN(1);
             (*sel_cond_ref)->quick_fix_field();
+            (*sel_cond_ref)->update_used_tables();
+            if (cond_tab->select)
+              cond_tab->select->cond= cond_tab->select_cond;
           }              
         }
         first_inner_tab= first_inner_tab->first_upper;       
       }
+
+      if (save_used_tables && !(used_tables & 
+                                ~(tab->emb_sj_nest->sj_inner_tables |
+                                  join->const_table_map | PSEUDO_TABLE_BITS)))
+      {
+        /*
+          We have reached the end of semi join nest. That is, the join order
+          looks like this:
+
+           outer_tbl1 SJ-Materialize(inner_tbl1 ... inner_tblN) outer_tbl ...
+                                                               ^
+                                                                \-we're here
+          At this point, we need to produce two conditions
+           - A condition that can be checked when we have all of the sj-inner
+             tables (inner_tbl1 ... inner_tblN). This will be used while doing
+             materialization.
+           - A condition that can be checked when we have all of the tables
+             in the prefix (both inner and outer).
+        */
+        tab->emb_sj_nest->sj_mat_info->join_cond= 
+          cond ?
+             make_cond_after_sjm(cond, cond, save_used_tables, used_tables):
+            NULL;
+        used_tables= save_used_tables | used_tables;
+        save_used_tables= 0;
+      }
+
     }
   }
   DBUG_RETURN(0);
 }
 
 
+/*
+  Find the position in the join order after which rows come out ordered
+
+  SYNOPSIS
+    make_join_orderinfo()
+      join    the join whose plan is examined
+
+  DESCRIPTION
+    Join buffering changes the order in which row combinations are
+    produced, so it has to be disabled where the plan relies on an already
+    ordered set for ORDER BY, i.e. when the first non-const table alone is
+    ordered. The sort table is handled at any position in the join order
+    (not only first after the const tables) even though anything else is
+    currently prohibited.
+    If a temporary table is needed the ordering is done as the last
+    operation and does not prevent join cache usage.
+
+  RETURN
+    Number of the table after which the set will be ordered
+    join->tables if we don't need an ordered set
+*/
+
+static uint make_join_orderinfo(JOIN *join)
+{
+  JOIN_TAB *sort_tab= join->need_tmp ? NULL : join->get_sort_by_join_tab();
+
+  if (!sort_tab)
+    return join->tables;
+  return sort_tab - join->join_tab;
+}
+
+/*
+  Forbid the use of a join buffer for the specified table
+
+  SYNOPSIS
+    set_join_cache_denial()
+      join_tab    join table for which join buffer usage is to be denied
+
+  DESCRIPTION
+    The function marks the table 'join_tab' as not employing any join
+    buffer. A join cache object already allocated for the table is unlinked
+    from the preceding cache and destroyed. If the table was marked as
+    using a join cache, the mark and the cache level are reset and
+    join->return_tab is set to point to 'join_tab'.
+
+  RETURN
+    none
+*/
+
+static
+void set_join_cache_denial(JOIN_TAB *join_tab)
+{
+  if (join_tab->cache)
+  {
+    /* Cut the link from the preceding cache, if there is one */
+    if (join_tab->cache->prev_cache)
+      join_tab->cache->prev_cache->next_cache= 0;
+    /*
+      Nothing can follow this cache: denial is performed backwards,
+      starting from the latest cache in the linked list (see
+      revise_cache_usage()), so next_cache needs no handling.
+    */
+    DBUG_ASSERT(!join_tab->cache->next_cache);
+    join_tab->cache->free();
+    join_tab->cache= 0;
+  }
+
+  if (!join_tab->use_join_cache)
+    return;
+
+  join_tab->use_join_cache= FALSE;
+  join_tab->used_join_cache_level= 0;
+  /*
+    The previous table can only go back to sub_select(). It cannot be
+    sub_select_sjm() because no join buffering is done for the first table
+    in an sjm nest.
+  */
+  (join_tab - 1)->next_select= sub_select;
+  if (join_tab->type == JT_REF && join_tab->is_ref_for_hash_join())
+  {
+    join_tab->type= JT_ALL;
+    join_tab->ref.key_parts= 0;
+  }
+  join_tab->join->return_tab= join_tab;
+}
+
+
 /**
   The default implementation of unlock-row method of READ_RECORD,
   used in all access methods.
@@ -6697,7 +7635,6 @@ void rr_unlock_row(st_join_table *tab)
 }
 
 
-
 /**
   Pick the appropriate access method functions
 
@@ -6748,47 +7685,631 @@ pick_table_access_method(JOIN_TAB *tab)
 }
 
 
-static void
-make_join_readinfo(JOIN *join, ulonglong options)
+/*
+  Revise usage of join buffer for the specified table and the whole nest
+
+  SYNOPSIS
+    revise_cache_usage()
+      join_tab    join table for which join buffer usage is to be revised
+
+  DESCRIPTION
+    Revokes the decision to use a join buffer for 'join_tab'. If the table
+    is an inner table of a nested outer join/semi-join, join buffers are
+    denied for it and for all inner tables of the nest(s) preceding it.
+
+  RETURN
+    none
+*/
+
+static
+void revise_cache_usage(JOIN_TAB *join_tab)
+{
+  JOIN_TAB *tab;
+  JOIN_TAB *first_inner;
+
+  if (join_tab->first_inner)
+  {
+    /* Deny join_tab itself first: set_join_cache_denial() must be called
+       starting from the latest cache in the linked list. */
+    JOIN_TAB *end_tab= join_tab;
+    for (first_inner= join_tab->first_inner;
+         first_inner;
+         first_inner= first_inner->first_upper)
+    {
+      for (tab= end_tab; tab >= first_inner; tab--)
+        set_join_cache_denial(tab);
+      end_tab= first_inner - 1;
+    }
+  }
+  else if (join_tab->first_sj_inner_tab)
+  {
+    first_inner= join_tab->first_sj_inner_tab;
+    for (tab= join_tab; tab >= first_inner; tab--)
+      if (tab->first_sj_inner_tab == first_inner)
+        set_join_cache_denial(tab);
+  }
+  else set_join_cache_denial(join_tab);
+}
+
+
+/*
+  end_select-compatible function that stores a row in the sjm temptable
+
+  SYNOPSIS
+    end_sj_materialize()
+      join            The join
+      join_tab        Last join table
+      end_of_records  FALSE <=> This call is made to pass another record
+                                combination
+                      TRUE  <=> EOF (no action)
+
+  DESCRIPTION
+    This function is used by semi-join materialization to capture the
+    subquery's resultset and write it into the temptable (that is, to
+    materialize it). Record combinations with a NULL in any of the
+    materialized columns are skipped.
+
+  NOTE
+    This function is used only for semi-join materialization. Non-semijoin
+    materialization uses a different mechanism.
+
+  RETURN
+    NESTED_LOOP_OK
+    NESTED_LOOP_ERROR
+*/
+
+static enum_nested_loop_state 
+end_sj_materialize(JOIN *join, JOIN_TAB *join_tab, bool end_of_records)
+{
+  int write_err;
+  THD *thd= join->thd;
+  SJ_MATERIALIZATION_INFO *sjm= join_tab[-1].emb_sj_nest->sj_mat_info;
+  DBUG_ENTER("end_sj_materialize");
+
+  if (end_of_records)
+    DBUG_RETURN(NESTED_LOOP_OK);
+  TABLE *tmp_table= sjm->table;
+  List_iterator<Item> col_it(sjm->sjm_table_cols);
+  for (Item *col= col_it++; col; col= col_it++)
+  {
+    if (col->is_null())
+      DBUG_RETURN(NESTED_LOOP_OK);
+  }
+  fill_record(thd, tmp_table->field, sjm->sjm_table_cols, TRUE, FALSE);
+  if (thd->is_error())
+    DBUG_RETURN(NESTED_LOOP_ERROR); /* purecov: inspected */
+  /* create_internal_tmp_table_from_heap will generate error if needed */
+  write_err= tmp_table->file->ha_write_tmp_row(tmp_table->record[0]);
+  if (write_err &&
+      tmp_table->file->is_fatal_error(write_err, HA_CHECK_DUP) &&
+      create_internal_tmp_table_from_heap(thd, tmp_table,
+                                          sjm->sjm_table_param.start_recinfo,
+                                          &sjm->sjm_table_param.recinfo,
+                                          write_err, 1))
+    DBUG_RETURN(NESTED_LOOP_ERROR); /* purecov: inspected */
+  DBUG_RETURN(NESTED_LOOP_OK);
+}
+
+
+/* 
+  Check whether a join buffer can be used to join the specified table   
+
+  SYNOPSIS
+    check_join_cache_usage()
+      tab                 joined table to check join buffer usage for
+      options             options of the join
+      no_jbuf_after       don't use join buffering after table with this number
+      prev_tab            previous join table
+
+  DESCRIPTION
+    The function finds out whether the table 'tab' can be joined using a join
+    buffer. This check is performed after the best execution plan for 'join'
+    has been chosen. If the function decides that a join buffer can be employed
+    then it selects the most appropriate join cache object that contains this
+    join buffer.
+    The result of the check and the type of the join buffer to be used
+    depend on:
+      - the access method to access rows of the joined table
+      - whether the join table is an inner table of an outer join or semi-join
+      - whether the optimizer switches
+          outer_join_with_cache, semijoin_with_cache, join_cache_incremental,
+          join_cache_hashed, join_cache_bka,
+        are set on or off
+      - the join cache level set for the query
+      - the join 'options'.
+
+    In any case join buffer is not used if the number of the joined table is
+    greater than 'no_jbuf_after'. It's also never used if the value of
+    join_cache_level is equal to 0.
+    If the optimizer switch outer_join_with_cache is off no join buffer is
+    used for outer join operations.
+    If the optimizer switch semijoin_with_cache is off no join buffer is used
+    for semi-join operations.
+    If the optimizer switch join_cache_incremental is off no incremental join
+    buffers are used.
+    If the optimizer switch join_cache_hashed is off then the optimizer uses
+    neither BNLH algorithm, nor BKAH algorithm to perform join operations.
+
+    If the optimizer switch join_cache_bka is off then the optimizer uses
+    neither BKA algorithm, nor BKAH algorithm to perform join operation.
+    The valid settings for join_cache_level lie in the interval 0..8.
+    If it is set to 0 no join buffers are used to perform join operations.
+    Currently we differentiate between join caches of 8 levels:
+      1 : non-incremental join cache used for BNL join algorithm
+      2 : incremental join cache used for BNL join algorithm
+      3 : non-incremental join cache used for BNLH join algorithm
+      4 : incremental join cache used for BNLH join algorithm
+      5 : non-incremental join cache used for BKA join algorithm
+      6 : incremental join cache used for BKA join algorithm 
+      7 : non-incremental join cache used for BKAH join algorithm 
+      8 : incremental join cache used for BKAH join algorithm
+    If the value of join_cache_level is set to n then no join caches of
+    levels higher than n can be employed.
+
+    If the optimizer switches outer_join_with_cache, semijoin_with_cache,
+    join_cache_incremental, join_cache_hashed, join_cache_bka are all on
+    the following rules are applied.
+    If join_cache_level==1|2 then join buffer is used for inner joins, outer
+    joins and semi-joins with 'JT_ALL' access method. In this case a
+    JOIN_CACHE_BNL object is employed.
+    If join_cache_level==3|4 and a join buffer is used for a join operation
+    (inner join, outer join, semi-join) with 'JT_REF'/'JT_EQ_REF' access method
+    then a JOIN_CACHE_BNLH object is employed.
+    If an index is used to access rows of the joined table and the value of
+    join_cache_level==5|6 then a JOIN_CACHE_BKA object is employed. 
+    If an index is used to access rows of the joined table and the value of
+    join_cache_level==7|8 then a JOIN_CACHE_BKAH object is employed. 
+    If the value of join_cache_level is odd then creation of a non-linked 
+    join cache is forced.
+
+    Currently for any join operation a join cache of the  level of the
+    highest allowed and applicable level is used.
+    For example, if join_cache_level is set to 6 and the optimizer switch
+    join_cache_bka is off, while the optimizer switch join_cache_hashed is
+    on then for any inner join operation with JT_REF/JT_EQREF access method
+    to the joined table the BNLH join algorithm will be used, while for
+    the table accessed by the JT_ALL methods the BNL algorithm will be used.
+
+    If the function decides that a join buffer can be used to join the table
+    'tab' then it assigns the selected join cache object to the field 'cache'
+    of this table and returns the level of the cache. The caller sets the
+    value of tab->use_join_cache from the returned level.
+    If the function creates a join cache object it tries to initialize it. The
+    failure to do this results in an invocation of the function that destructs
+    the created object.
+    If the function decides that for some reason no join buffer can be used
+    for a table it calls the function revise_cache_usage that checks
+    whether join cache should be denied for some previous tables. In this case
+    a pointer to the first table for which join cache usage has been denied
+    is passed in join->return_tab (see the function set_join_cache_denial).
+    
+    The function changes the values of the fields tab->icp_other_tables_ok
+    and tab->idx_cond_fact_out to FALSE if the chosen join cache algorithm
+    requires it.
+ 
+  NOTES
+    An inner table of a nested outer join or a nested semi-join can be currently
+    joined only when a linked cache object is employed. In these cases setting
+    join_cache_incremental to 'off' results in denial of usage of any join
+    buffer when joining the table.
+    For a nested outer join/semi-join, currently, we either use join buffers for
+    all inner tables or for none of them. 
+    Some engines (e.g. Falcon) currently allow to use only a join cache
+    of the type JOIN_CACHE_BKAH when the joined table is accessed through
+    an index. For these engines setting the value of join_cache_level to 5 or 6
+    results in that no join buffer is used to join the table. 
+  
+  RETURN VALUE
+    cache level if cache is used, otherwise returns 0
+
+  TODO
+    Support BKA inside SJ-Materialization nests. When doing this, we'll need
+    to only store sj-inner tables in the join buffer.
+#if 0
+        JOIN_TAB *first_tab= join->join_tab+join->const_tables;
+        uint n_tables= i-join->const_tables;
+        / *
+          We normally put all preceding tables into the join buffer, except
+          for the constant tables.
+          If we're inside a semi-join materialization nest, e.g.
+
+             outer_tbl1  outer_tbl2  ( inner_tbl1, inner_tbl2 ) ...
+                                                       ^-- we're here
+
+          then we need to put into the join buffer only the tables from
+          within the nest.
+        * /
+        if (i >= first_sjm_table && i < last_sjm_table)
+        {
+          n_tables= i - first_sjm_table; // will be >0 if we got here
+          first_tab= join->join_tab + first_sjm_table;
+        }
+#endif
+*/
+
+static
+uint check_join_cache_usage(JOIN_TAB *tab,
+                            ulonglong options,
+                            uint no_jbuf_after,
+                            JOIN_TAB *prev_tab)
+{
+  COST_VECT cost;
+  uint flags= 0;
+  ha_rows rows= 0;
+  uint bufsz= 4096;
+  JOIN_CACHE *prev_cache=0;
+  JOIN *join= tab->join;
+  uint cache_level= tab->used_join_cache_level;
+  bool force_unlinked_cache=
+         !(join->allowed_join_cache_types & JOIN_CACHE_INCREMENTAL_BIT);
+  bool no_hashed_cache=
+         !(join->allowed_join_cache_types & JOIN_CACHE_HASHED_BIT);
+  bool no_bka_cache= 
+         !(join->allowed_join_cache_types & JOIN_CACHE_BKA_BIT);
+  uint i= tab - join->join_tab;
+
+  join->return_tab= 0;
+
+  if (cache_level == 0 || i == join->const_tables || !prev_tab)
+    return 0;
+
+  if (force_unlinked_cache && (cache_level%2 == 0))
+    cache_level--;                     /* even levels are the incremental ones */
+
+  if (options & SELECT_NO_JOIN_CACHE)
+    goto no_join_cache;
+  /* 
+    psergey-todo: why is join buffering denied here when the execution code
+    seems to handle the "range checked for each record" case?
+  */
+  if (tab->use_quick == 2)
+    goto no_join_cache;
+
+  if (tab->is_inner_table_of_semi_join_with_first_match() &&
+      !join->allowed_semijoin_with_cache)
+    goto no_join_cache;
+  if (tab->is_inner_table_of_outer_join() &&
+      !join->allowed_outer_join_with_cache)
+    goto no_join_cache;
+
+  /*
+    Non-linked join buffers can't guarantee one match
+  */
+  if (tab->is_nested_inner())
+  {
+    if (force_unlinked_cache || cache_level == 1)
+      goto no_join_cache;
+    if (cache_level & 1)
+      cache_level--;                   /* use the incremental level below */
+  }
+    
+  /*
+    Don't use join buffering past the table number no_jbuf_after, for a table
+    with loosescan_match_tab set, or with a sj-materialization strategy.
+  */
+  if (!(i <= no_jbuf_after) || tab->loosescan_match_tab || 
+      sj_is_materialize_strategy(join->best_positions[i].sj_strategy))
+    goto no_join_cache;
+
+  for (JOIN_TAB *first_inner= tab->first_inner; first_inner;
+       first_inner= first_inner->first_upper)
+  {
+    if (first_inner != tab && !first_inner->use_join_cache)
+      goto no_join_cache;
+  }
+  if (tab->first_sj_inner_tab && tab->first_sj_inner_tab != tab &&
+      !tab->first_sj_inner_tab->use_join_cache)
+    goto no_join_cache;
+  if (!prev_tab->use_join_cache)
+  {
+    /* 
+      Check whether table tab and the previous one belong to the same nest of
+      inner tables and if so do not use join buffer when joining table tab. 
+    */
+    if (tab->first_inner)
+    {
+      for (JOIN_TAB *first_inner= tab[-1].first_inner;
+           first_inner;
+           first_inner= first_inner->first_upper)
+      {
+        if (first_inner == tab->first_inner)
+          goto no_join_cache;
+      }
+    }
+    else if (tab->first_sj_inner_tab &&
+             tab->first_sj_inner_tab == tab[-1].first_sj_inner_tab)
+      goto no_join_cache; 
+  }       
+
+  prev_cache= prev_tab->cache;
+
+  switch (tab->type) {
+  case JT_ALL:
+    if (cache_level == 1)
+      prev_cache= 0;
+    if ((tab->cache= new JOIN_CACHE_BNL(join, tab, prev_cache)) &&
+        ((options & SELECT_DESCRIBE) || !tab->cache->init()))
+    {
+      tab->icp_other_tables_ok= FALSE;
+      return (2-test(!prev_cache));    /* 1 without prev_cache, else 2 */
+    }
+    goto no_join_cache;
+  case JT_SYSTEM:
+  case JT_CONST:
+  case JT_REF:
+  case JT_EQ_REF:
+    if (cache_level <=2 || (no_hashed_cache && no_bka_cache))
+      goto no_join_cache;
+    if (!tab->is_ref_for_hash_join())
+    {
+      flags= HA_MRR_NO_NULL_ENDPOINTS | HA_MRR_SINGLE_POINT;
+      if (tab->table->covering_keys.is_set(tab->ref.key))
+        flags|= HA_MRR_INDEX_ONLY;
+      rows= tab->table->file->multi_range_read_info(tab->ref.key, 10, 20,
+                                                    tab->ref.key_parts,
+                                                    &bufsz, &flags, &cost);
+    }
+
+    if ((cache_level <=4 && !no_hashed_cache) || no_bka_cache ||
+        tab->is_ref_for_hash_join() ||
+	((flags & HA_MRR_NO_ASSOCIATION) && cache_level <=6))
+    {
+      if (!tab->hash_join_is_possible() ||
+          tab->make_scan_filter())
+        goto no_join_cache;
+      if (cache_level == 3)
+        prev_cache= 0;
+      if ((tab->cache= new JOIN_CACHE_BNLH(join, tab, prev_cache)) &&
+          ((options & SELECT_DESCRIBE) || !tab->cache->init()))
+      {
+        tab->icp_other_tables_ok= FALSE;        
+        return (4-test(!prev_cache));  /* 3 without prev_cache, else 4 */
+      }
+      goto no_join_cache;
+    }
+    if (cache_level > 4 && no_bka_cache)
+      goto no_join_cache;
+    
+    if ((flags & HA_MRR_NO_ASSOCIATION) &&
+	(cache_level <= 6 || no_hashed_cache))
+      goto no_join_cache;
+
+    if ((rows != HA_POS_ERROR) && !(flags & HA_MRR_USE_DEFAULT_IMPL))
+    {
+      if (cache_level <= 6 || no_hashed_cache)
+      {
+        if (cache_level == 5)
+          prev_cache= 0;
+        if ((tab->cache= new JOIN_CACHE_BKA(join, tab, flags, prev_cache)) &&
+            ((options & SELECT_DESCRIBE) || !tab->cache->init()))
+          return (6-test(!prev_cache)); /* 5 without prev_cache, else 6 */
+        goto no_join_cache;
+      }
+      else
+      {
+        if (cache_level == 7)
+          prev_cache= 0;
+        if ((tab->cache= new JOIN_CACHE_BKAH(join, tab, flags, prev_cache)) &&
+            ((options & SELECT_DESCRIBE) || !tab->cache->init()))
+	{
+         tab->idx_cond_fact_out= FALSE;
+          return (8-test(!prev_cache)); /* 7 without prev_cache, else 8 */
+        }
+        goto no_join_cache;
+      }
+    }
+    goto no_join_cache;
+  default : ;
+  }
+
+no_join_cache:
+  if (tab->type != JT_ALL && tab->is_ref_for_hash_join())
+    tab->type= JT_ALL;
+  revise_cache_usage(tab); 
+  return 0;
+}
+
+
+/*
+  Check whether join buffers can be used to join tables of a join
+
+  SYNOPSIS
+    check_join_cache_usage_for_tables()
+      join                join whose tables are to be checked
+      options             options of the join
+      no_jbuf_after       don't use join buffering after table with this number
+
+  DESCRIPTION
+    For each non-constant table the function checks whether the table can be
+    joined using a join buffer by calling check_join_cache_usage(). The level
+    that call returns is stored in tab->used_join_cache_level and
+    tab->use_join_cache is set accordingly. Every table starts out with the
+    level join->max_allowed_join_cache_level.
+    The function also records in first_sjm_sibling, for each table of a
+    semi-join materialization nest, the first table of that nest.
+
+  NOTES
+    In some situations (e.g. for nested outer joins, for nested semi-joins)
+    only incremental buffers can be used. If it turns out that for some inner
+    table no join buffer can be used then no inner table of the outer/semi-join
+    nest can use a join buffer. When an already chosen buffer has to be denied
+    for a table, check_join_cache_usage() is called again starting from that
+    table. The pointer to the table from which the check has to be restarted
+    is returned in join->return_tab (see check_join_cache_usage()).
+*/
+
+void check_join_cache_usage_for_tables(JOIN *join, ulonglong options,
+                                       uint no_jbuf_after)
+{
+  JOIN_TAB *sjm_first= NULL;
+  JOIN_TAB *sjm_end= NULL;
+  uint idx;
+
+  for (idx= join->const_tables; idx < join->tables; idx++)
+    join->join_tab[idx].used_join_cache_level=
+      join->max_allowed_join_cache_level;
+
+  for (idx= join->const_tables; idx < join->tables; idx++)
+  {
+    JOIN_TAB *tab= &join->join_tab[idx];
+
+    if (sj_is_materialize_strategy(join->best_positions[idx].sj_strategy))
+    {
+      /* An SJ-Materialization nest starts here: tag all of its tables */
+      sjm_first= tab;
+      sjm_end= tab + join->best_positions[idx].n_sj_tables;
+      for (JOIN_TAB *member= sjm_first; member != sjm_end; member++)
+        member->first_sjm_sibling= sjm_first;
+    }
+    if (tab < sjm_first || tab >= sjm_end)
+      tab->first_sjm_sibling= NULL;
+
+    tab->icp_other_tables_ok= TRUE;
+    tab->idx_cond_fact_out= TRUE;
+
+    if (tab->type != JT_SYSTEM && tab->type != JT_CONST &&
+        tab->type != JT_EQ_REF && tab->type != JT_REF &&
+        tab->type != JT_REF_OR_NULL && tab->type != JT_ALL)
+    {
+      tab->used_join_cache_level= 0;
+      continue;
+    }
+
+    /* Right after an sjm nest the nest's first table is the previous one */
+    JOIN_TAB *prev_tab= (tab == sjm_end) ? sjm_first : tab - 1;
+    tab->used_join_cache_level= check_join_cache_usage(tab, options,
+                                                       no_jbuf_after, prev_tab);
+    tab->use_join_cache= test(tab->used_join_cache_level);
+    /* If join cache was denied for earlier tables, go back to return_tab */
+    if (join->return_tab)
+      idx= join->return_tab - join->join_tab - 1;   // always >= 0
+  }
+}
+
+
+/*
+  Plan refinement stage: do various setup things for the executor
+
+  SYNOPSIS
+    make_join_readinfo()
+      join           Join being processed
+      options        Join's options (checking for SELECT_DESCRIBE, 
+                     SELECT_NO_JOIN_CACHE)
+      no_jbuf_after  Don't use join buffering after table with this number.
+
+  DESCRIPTION
+    Plan refinement stage: do various setup things for the executor
+      - set up use of join buffering
+      - push index conditions
+      - increment relevant counters
+      - etc
+
+  RETURN 
+    FALSE - OK
+    TRUE  - Out of memory
+*/
+
+static bool
+make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
 {
   uint i;
+
+  DBUG_ENTER("make_join_readinfo");
+
   bool statistics= test(!(join->select_options & SELECT_DESCRIBE));
-  bool ordered_set= 0;
   bool sorted= 1;
-  DBUG_ENTER("make_join_readinfo");
+
+  if (!join->select_lex->sj_nests.is_empty() &&
+      setup_semijoin_dups_elimination(join, options, no_jbuf_after))
+    DBUG_RETURN(TRUE); /* purecov: inspected */
+
+  for (i= 0; i < join->const_tables; i++)
+    join->join_tab[i].partial_join_cardinality= 1; 
+
+  for (i=join->const_tables ; i < join->tables ; i++)
+  {
+    /*
+      The approximation below for partial join cardinality is not good because
+        - it does not take into account some pushdown predicates
+        - it does not differentiate between inner joins, outer joins and semi-joins.
+      Later it should be improved.
+    */
+    JOIN_TAB *tab=join->join_tab+i;
+    tab->partial_join_cardinality= join->best_positions[i].records_read *
+                                   (i ? (tab-1)->partial_join_cardinality : 1);
+  }
+ 
+  check_join_cache_usage_for_tables(join, options, no_jbuf_after);
 
   for (i=join->const_tables ; i < join->tables ; i++)
   {
     JOIN_TAB *tab=join->join_tab+i;
     TABLE *table=tab->table;
+    uint jcl= tab->used_join_cache_level;
     tab->read_record.table= table;
     tab->read_record.file=table->file;
     tab->read_record.unlock_row= rr_unlock_row;
     tab->next_select=sub_select;		/* normal select */
+    tab->sorted= sorted;
+    sorted= 0;                                  // only first must be sorted
+
+    if (tab->loosescan_match_tab)
+    {
+      if (!(tab->loosescan_buf= (uchar*)join->thd->alloc(tab->
+                                                         loosescan_key_len)))
+        return TRUE; /* purecov: inspected */
+      tab->sorted= TRUE;
+    }
 
     /*
-      Determine if the set is already ordered for ORDER BY, so it can 
-      disable join cache because it will change the ordering of the results.
-      Code handles sort table that is at any location (not only first after 
-      the const tables) despite the fact that it's currently prohibited.
-      We must disable join cache if the first non-const table alone is
-      ordered. If there is a temp table the ordering is done as a last
-      operation and doesn't prevent join cache usage.
+     SJ-Materialization
     */
-    if (!ordered_set && !join->need_tmp && 
-        (table == join->sort_by_table ||
-         (join->sort_by_table == (TABLE *) 1 && i != join->const_tables)))
-      ordered_set= 1;
+    if (sj_is_materialize_strategy(join->best_positions[i].sj_strategy))
+    {
+      if (i == join->const_tables)
+        join->first_select= sub_select_sjm;
+      else
+       tab[-1].next_select= sub_select_sjm;
 
-    tab->sorted= sorted;
-    sorted= 0;                                  // only first must be sorted
+      if (setup_sj_materialization(tab))
+        return TRUE;
+    }
     table->status=STATUS_NO_RECORD;
     pick_table_access_method (tab);
 
+    if (jcl)
+       tab[-1].next_select=sub_select_cache;
+
+    if (tab->cache && tab->cache->get_join_alg() == JOIN_CACHE::BNLH_JOIN_ALG)
+      tab->type= JT_HASH;
+      
     switch (tab->type) {
+    case JT_SYSTEM:				// Only happens with left join 
+    case JT_CONST:				// Only happens with left join
+      /* Only happens with outer joins */
+      tab->read_first_record= tab->type == JT_SYSTEM ?
+                                join_read_system :join_read_const;
+      if (table->covering_keys.is_set(tab->ref.key) &&
+          !table->no_keyread)
+      {
+        table->key_read=1;
+        table->file->extra(HA_EXTRA_KEYREAD);
+      }
+      else if (!jcl || jcl > 4) 
+        push_index_cond(tab, tab->ref.key);
+        break;
     case JT_EQ_REF:
       tab->read_record.unlock_row= join_read_key_unlock_row;
       /* fall through */
+      if (table->covering_keys.is_set(tab->ref.key) &&
+	  !table->no_keyread)
+      {
+	table->key_read=1;
+	table->file->extra(HA_EXTRA_KEYREAD);
+      }
+      else if (!jcl || jcl > 4) 
+        push_index_cond(tab, tab->ref.key);
+      break;
     case JT_REF_OR_NULL:
     case JT_REF:
       if (tab->select)
@@ -6798,27 +8319,20 @@ make_join_readinfo(JOIN *join, ulonglong options)
       }
       delete tab->quick;
       tab->quick=0;
-      /* fall through */
-    case JT_CONST:				// Only happens with left join
       if (table->covering_keys.is_set(tab->ref.key) &&
 	  !table->no_keyread)
         table->enable_keyread();
+      else if (!jcl || jcl > 4)
+        push_index_cond(tab, tab->ref.key);
       break;
     case JT_ALL:
+    case JT_HASH:
       /*
 	If previous table use cache
         If the incoming data set is already sorted don't use cache.
+        Also don't use cache if this is the first table in semi-join
+          materialization nest.
       */
-      if (i != join->const_tables && !(options & SELECT_NO_JOIN_CACHE) &&
-          tab->use_quick != 2 && !tab->first_inner && !ordered_set)
-      {
-	if ((options & SELECT_DESCRIBE) ||
-	    !join_init_cache(join->thd,join->join_tab+join->const_tables,
-			     i-join->const_tables))
-	{
-	  tab[-1].next_select=sub_select_cache; /* Patch previous */
-	}
-      }
       /* These init changes read_record */
       if (tab->use_quick == 2)
       {
@@ -6886,13 +8400,16 @@ make_join_readinfo(JOIN *join, ulonglong options)
 	    */
               tab->index=find_shortest_key(table, & table->covering_keys);
 	    tab->read_first_record= join_read_first;
-	    tab->type=JT_NEXT;		// Read with index_first / index_next
+            /* Read with index_first / index_next */
+	    tab->type= tab->type == JT_ALL ? JT_NEXT : JT_HASH_NEXT;		
 	  }
 	}
+        if (tab->select && tab->select->quick &&
+            tab->select->quick->index != MAX_KEY && ! tab->table->key_read)
+          push_index_cond(tab, tab->select->quick->index);
       }
       break;
     case JT_FT:
-    case JT_SYSTEM: 
       break;
     default:
       DBUG_PRINT("error",("Table type %d found",tab->type)); /* purecov: deadcode */
@@ -6903,7 +8420,40 @@ make_join_readinfo(JOIN *join, ulonglong options)
     }
   }
   join->join_tab[join->tables-1].next_select=0; /* Set by do_select */
-  DBUG_VOID_RETURN;
+  
+/*
+    If a join buffer is used to join a table the ordering by an index
+    for the first non-constant table cannot be employed anymore.
+  */
+  for (i=join->const_tables ; i < join->tables ; i++)
+  {
+    JOIN_TAB *tab=join->join_tab+i;
+    if (tab->use_join_cache)
+    {
+       JOIN_TAB *sort_by_tab= join->group && join->simple_group &&
+                              join->group_list ?
+			       join->join_tab+join->const_tables :
+                               join->get_sort_by_join_tab();
+     if (sort_by_tab)
+      {
+        join->need_tmp= 1;
+        join->simple_order= join->simple_group= 0;
+        if (sort_by_tab->type == JT_NEXT)
+        {
+          sort_by_tab->type= JT_ALL;
+          sort_by_tab->read_first_record= join_init_read_record;
+        }
+        else if (sort_by_tab->type == JT_HASH_NEXT)
+        {
+          sort_by_tab->type= JT_HASH;
+          sort_by_tab->read_first_record= join_init_read_record;
+        }
+      }
+      break;
+    }
+  }
+
+  DBUG_RETURN(FALSE);
 }
 
 
@@ -6950,8 +8500,11 @@ void JOIN_TAB::cleanup()
   select= 0;
   delete quick;
   quick= 0;
-  x_free(cache.buff);
-  cache.buff= 0;
+  if (cache)
+  {
+    cache->free();
+    cache= 0;
+  }
   limit= 0;
   if (table)
   {
@@ -6968,6 +8521,72 @@ void JOIN_TAB::cleanup()
 
 
 /**
+  Build a TABLE_REF structure for index lookup in the temporary table
+
+  @param thd             Thread handle
+  @param tmp_key         The temporary table key
+  @param it              The iterator of items for lookup in the key
+  @param value           Passed through to each store_key_item created here
+  @details
+  Build TABLE_REF object for lookup in the key 'tmp_key' using items
+  accessible via item iterator 'it'.
+
+  @retval TRUE  Error (one of the thd->calloc()/thd->alloc() calls failed)
+  @retval FALSE OK
+*/
+
+bool TABLE_REF::tmp_table_index_lookup_init(THD *thd,
+                                            KEY *tmp_key,
+                                            Item_iterator &it,
+                                            bool value)
+{
+  uint tmp_key_parts= tmp_key->key_parts;
+  DBUG_ENTER("TABLE_REF::tmp_table_index_lookup_init");
+
+  key= 0; /* The only temp table index. */
+  key_length= tmp_key->key_length;
+  if (!(key_buff=
+        (uchar*) thd->calloc(ALIGN_SIZE(tmp_key->key_length) * 2)) ||
+      !(key_copy=
+        (store_key**) thd->alloc((sizeof(store_key*) *
+                                  (tmp_key_parts + 1)))) || /* +1: end marker */
+      !(items=
+        (Item**) thd->alloc(sizeof(Item*) * tmp_key_parts)))
+    DBUG_RETURN(TRUE);
+
+  key_buff2= key_buff + ALIGN_SIZE(tmp_key->key_length); /* 2nd half of buffer */
+
+  KEY_PART_INFO *cur_key_part= tmp_key->key_part;
+  store_key **ref_key= key_copy;
+  uchar *cur_ref_buff= key_buff;
+
+  it.open();
+  for (uint i= 0; i < tmp_key_parts; i++, cur_key_part++, ref_key++)
+  {
+    Item *item= it.next();
+    DBUG_ASSERT(item);
+    items[i]= item;
+    int null_count= test(cur_key_part->field->real_maybe_null());
+    *ref_key= new store_key_item(thd, cur_key_part->field,
+                                 /* TIMOUR:
+                                    the NULL byte is taken into account in
+                                    cur_key_part->store_length, so instead of
+                                    cur_ref_buff + test(maybe_null), we could
+                                    use that information instead.
+                                 */
+                                 cur_ref_buff + null_count,
+                                 null_count ? cur_ref_buff : 0,
+                                 cur_key_part->length, items[i], value);
+    cur_ref_buff+= cur_key_part->store_length; /* move to next key part's slot */
+  }
+  *ref_key= NULL; /* End marker. */
+  key_err= 1;
+  key_parts= tmp_key_parts;
+  DBUG_RETURN(FALSE);
+}
+
+
+/**
   Partially cleanup JOIN after it has executed: close index or rnd read
   (table cursors), free quick selects.
 
@@ -7999,7 +9618,7 @@ static COND *build_equal_items_for_cond(THD *thd, COND *cond,
       List_iterator_fast<Item_equal> it(cond_equal.current_level);
       while ((item_equal= it++))
       {
-        item_equal->fix_length_and_dec();
+        item_equal->fix_fields(thd, NULL);
         item_equal->update_used_tables();
         set_if_bigger(thd->lex->current_select->max_equal_elems,
                       item_equal->members());  
@@ -8184,6 +9803,7 @@ static COND *build_equal_items(THD *thd, COND *cond,
     if (cond->type() == Item::COND_ITEM &&
         ((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC)
       cond_equal= &((Item_cond_and*) cond)->cond_equal;
+
     else if (cond->type() == Item::FUNC_ITEM &&
              ((Item_cond*) cond)->functype() == Item_func::MULT_EQUAL_FUNC)
     {
@@ -8227,10 +9847,14 @@ static COND *build_equal_items(THD *thd, COND *cond,
 /**
   Compare field items by table order in the execution plan.
 
+    If field1 and field2 belong to different tables then
     field1 considered as better than field2 if the table containing
     field1 is accessed earlier than the table containing field2.   
     The function finds out what of two fields is better according
     this criteria.
+    If field1 and field2 belong to the same table then the result
+    of comparison depends on whether the fields are parts of
+    the key that are used to access this table.  
 
   @param field1          first field item to compare
   @param field2          second field item to compare
@@ -8245,8 +9869,8 @@ static COND *build_equal_items(THD *thd, COND *cond,
 */
 
 static int compare_fields_by_table_order(Item_field *field1,
-                                  Item_field *field2,
-                                  void *table_join_idx)
+                                         Item_field *field2,
+                                         void *table_join_idx)
 {
   int cmp= 0;
   bool outer_ref= 0;
@@ -8255,7 +9879,7 @@ static int compare_fields_by_table_order(Item_field *field1,
     outer_ref= 1;
     cmp= -1;
   }
-  if (field2->used_tables() & OUTER_REF_TABLE_BIT)
+  if (field1->used_tables() & OUTER_REF_TABLE_BIT)
   {
     outer_ref= 1;
     cmp++;
@@ -8264,10 +9888,55 @@ static int compare_fields_by_table_order(Item_field *field1,
     return cmp;
   JOIN_TAB **idx= (JOIN_TAB **) table_join_idx;
   cmp= idx[field2->field->table->tablenr]-idx[field1->field->table->tablenr];
+  if (!cmp)
+  {
+    JOIN_TAB *tab= idx[field1->field->table->tablenr];
+    uint keyno= MAX_KEY;
+    if (tab->ref.key_parts)
+      keyno= tab->ref.key;
+    else if (tab->select && tab->select->quick)
+       keyno = tab->select->quick->index;
+    if (keyno != MAX_KEY)
+    {
+      if (field2->field->part_of_key.is_set(keyno))
+        cmp= -1;
+      if (field1->field->part_of_key.is_set(keyno))
+        cmp++;
+      if (!cmp)
+      {
+        KEY *key_info= tab->table->key_info + keyno;
+        for (uint i= 0; i < key_info->key_parts; i++)
+	{
+          Field *fld= key_info->key_part[i].field;
+          if (fld->eq(field2->field))
+	  {
+	    cmp= -1;
+            break;
+          }
+          if (fld->eq(field1->field))
+	  {
+	    cmp= 1;
+            break;
+          }
+        }
+      }              
+    }              
+    else   
+      cmp= field2->field->field_index-field1->field->field_index;
+  }
   return cmp < 0 ? -1 : (cmp ? 1 : 0);
 }
 
 
+static TABLE_LIST* embedding_sjm(Item_field *item_field)
+{
+  TABLE_LIST *nest= item_field->field->table->pos_in_table_list->embedding;
+  if (nest && nest->sj_mat_info && nest->sj_mat_info->is_used)
+    return nest;
+  else
+    return NULL;
+}
+
 /**
   Generate minimal set of simple equalities equivalent to a multiple equality.
 
@@ -8301,6 +9970,23 @@ static int compare_fields_by_table_order(Item_field *field1,
     So only t1.a=t3.c should be left in the lower level.
     If cond is equal to 0, then not more then one equality is generated
     and a pointer to it is returned as the result of the function.
+    
+    Equality substitution and semi-join materialization nests:
+
+       In case join order looks like this:
+
+          outer_tbl1 outer_tbl2 SJM (inner_tbl1 inner_tbl2) outer_tbl3 
+
+        We must not construct equalities like 
+
+           outer_tbl1.col = inner_tbl1.col 
+
+        because they would get attached to inner_tbl1 and will get evaluated
+        during materialization phase, when we don't have current value of
+        outer_tbl1.col.
+
+        Item_equal::get_first() also takes similar measures for dealing with
+        equality substitution in the presence of SJM nests.
 
   @return
     - The condition with generated simple equalities or
@@ -8308,8 +9994,8 @@ static int compare_fields_by_table_order(Item_field *field1,
     - 0, otherwise.
 */
 
-static Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
-                                  Item_equal *item_equal)
+Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
+                           Item_equal *item_equal)
 {
   List<Item> eq_list;
   Item_func_eq *eq_item= 0;
@@ -8320,18 +10006,49 @@ static Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
   Item *head;
   DBUG_ASSERT(!cond || cond->type() == Item::COND_ITEM);
 
+  TABLE_LIST *current_sjm= NULL;
+  Item *current_sjm_head= NULL;
+
+  /* 
+    Pick the "head" item: the constant one or the first in the join order
+    that's not inside some SJM nest.
+  */
   if (item_const)
     head= item_const;
   else
   {
-    head= item_equal->get_first();
+    TABLE_LIST *emb_nest;
+    Item_field *item_field;
+    head= item_field= item_equal->get_first(NULL);
     it++;
+    if ((emb_nest= embedding_sjm(item_field)))
+    {
+      current_sjm= emb_nest;
+      current_sjm_head= head;
+    }
   }
+
   Item_field *item_field;
+  /*
+    For each other item, generate "item=head" equality (except the tables that 
+    are within SJ-Materialization nests, for those "head" is defined
+    differently)
+  */
   while ((item_field= it++))
   {
     Item_equal *upper= item_field->find_item_equal(upper_levels);
     Item_field *item= item_field;
+    TABLE_LIST *field_sjm= embedding_sjm(item_field);
+    if (!field_sjm)
+    { 
+      current_sjm= NULL;
+      current_sjm_head= NULL;
+    }      
+
+    /* 
+      Check if "item_field=head" equality is already guaranteed to be true 
+      on upper AND-levels.
+    */
     if (upper)
     { 
       if (item_const && upper->get_const())
@@ -8346,16 +10063,29 @@ static Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
         }
       }
     }
-    if (item == item_field)
+    
+    bool produce_equality= test(item == item_field);
+    if (!item_const && field_sjm && field_sjm != current_sjm)
+    {
+      /* Entering an SJM nest */
+      current_sjm_head= item_field;
+      if (!field_sjm->sj_mat_info->is_sj_scan)
+        produce_equality= FALSE;
+    }
+
+    if (produce_equality)
     {
       if (eq_item)
         eq_list.push_back(eq_item);
-      eq_item= new Item_func_eq(item_field, head);
+      
+      eq_item= new Item_func_eq(item_field, current_sjm? current_sjm_head: head);
+
       if (!eq_item)
         return 0;
       eq_item->set_cmp_func();
       eq_item->quick_fix_field();
     }
+    current_sjm= field_sjm;
   }
 
   if (!cond)
@@ -8385,7 +10115,6 @@ static Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
   return cond;
 }
 
-
 /**
   Substitute every field reference in a condition by the best equal field
   and eliminate all multiple equality predicates.
@@ -8483,8 +10212,18 @@ static COND* substitute_for_best_equal_field(COND *cond,
     cond= eliminate_item_equal(0, cond_equal, item_equal);
     return cond ? cond : org_cond;
   }
-  else
-    cond->transform(&Item::replace_equal_field, 0);
+  else 
+  {
+    while (cond_equal)
+    {
+      List_iterator_fast<Item_equal> it(cond_equal->current_level);
+      while((item_equal= it++))
+      {
+        cond= cond->transform(&Item::replace_equal_field, (uchar *) item_equal);
+      }
+      cond_equal= cond_equal->upper_levels;
+    }
+  }
   return cond;
 }
 
@@ -8544,7 +10283,7 @@ static void update_const_equal_items(COND *cond, JOIN_TAB *tab)
           TABLE *tab= field->table;
           KEYUSE *use;
           for (use= stat->keyuse; use && use->table == tab; use++)
-            if (possible_keys.is_set(use->key) && 
+            if (!use->is_for_hash_join() && possible_keys.is_set(use->key) && 
                 tab->key_info[use->key].key_part[use->keypart].field ==
                 field)
               tab->const_key_parts[use->key]|= use->keypart_map;
@@ -8638,37 +10377,6 @@ change_cond_ref_to_const(THD *thd, I_List<COND_CMP> *save_list,
   }
 }
 
-/**
-  Remove additional condition inserted by IN/ALL/ANY transformation.
-
-  @param conds   condition for processing
-
-  @return
-    new conditions
-*/
-
-static Item *remove_additional_cond(Item* conds)
-{
-  if (conds->name == in_additional_cond)
-    return 0;
-  if (conds->type() == Item::COND_ITEM)
-  {
-    Item_cond *cnd= (Item_cond*) conds;
-    List_iterator<Item> li(*(cnd->argument_list()));
-    Item *item;
-    while ((item= li++))
-    {
-      if (item->name == in_additional_cond)
-      {
-	li.remove();
-	if (cnd->argument_list()->elements == 1)
-	  return cnd->argument_list()->head();
-	return conds;
-      }
-    }
-  }
-  return conds;
-}
 
 static void
 propagate_cond_constants(THD *thd, I_List<COND_CMP> *save_list,
@@ -8730,7 +10438,6 @@ propagate_cond_constants(THD *thd, I_List<COND_CMP> *save_list,
   }
 }
 
-
 /**
   Simplify joins replacing outer joins by inner joins whenever it's
   possible.
@@ -8825,13 +10532,18 @@ propagate_cond_constants(THD *thd, I_List<COND_CMP> *save_list,
     consider any plan where one of the inner tables is before some of outer
     tables.
 
-
+  IMPLEMENTATION
     The function is implemented by a recursive procedure.  On the recursive
     ascent all attributes are calculated, all outer joins that can be
     converted are replaced and then all unnecessary braces are removed.
     As join list contains join tables in the reverse order sequential
     elimination of outer joins does not require extra recursive calls.
 
+  SEMI-JOIN NOTES
+    Remove all semi-joins that are within another semi-join (i.e. have
+    an "ancestor" semi-join nest)
+
+  EXAMPLES
     Here is an example of a join query with invalid cross references:
     @code
       SELECT * FROM t1 LEFT JOIN t2 ON t2.a=t3.a LEFT JOIN t3 ON t3.b=t1.b 
@@ -8841,14 +10553,15 @@ propagate_cond_constants(THD *thd, I_List<COND_CMP> *save_list,
   @param join_list   list representation of the join to be converted
   @param conds       conditions to add on expressions for converted joins
   @param top         true <=> conds is the where condition
-
+  @param in_sj       TRUE <=> processing semi-join nest's children
   @return
     - The new condition, if success
     - 0, otherwise
 */
 
 static COND *
-simplify_joins(JOIN *join, List<TABLE_LIST> *join_list, COND *conds, bool top)
+simplify_joins(JOIN *join, List<TABLE_LIST> *join_list, COND *conds, bool top,
+               bool in_sj)
 {
   TABLE_LIST *table;
   NESTED_JOIN *nested_join;
@@ -8884,7 +10597,7 @@ simplify_joins(JOIN *join, List<TABLE_LIST> *join_list, COND *conds, bool top)
            the corresponding on expression is added to E. 
 	*/ 
         expr= simplify_joins(join, &nested_join->join_list,
-                             expr, FALSE);
+                             expr, FALSE, in_sj || table->sj_on_expr);
 
         if (!table->prep_on_expr || expr != table->on_expr)
         {
@@ -8896,7 +10609,8 @@ simplify_joins(JOIN *join, List<TABLE_LIST> *join_list, COND *conds, bool top)
       }
       nested_join->used_tables= (table_map) 0;
       nested_join->not_null_tables=(table_map) 0;
-      conds= simplify_joins(join, &nested_join->join_list, conds, top);
+      conds= simplify_joins(join, &nested_join->join_list, conds, top, 
+                            in_sj || table->sj_on_expr);
       used_tables= nested_join->used_tables;
       not_null_tables= nested_join->not_null_tables;  
       /* The following two might become unequal after table elimination: */
@@ -8928,7 +10642,7 @@ simplify_joins(JOIN *join, List<TABLE_LIST> *join_list, COND *conds, bool top)
       table->outer_join= 0;
       if (table->on_expr)
       {
-        /* Add on expression to the where condition. */
+        /* Add ON expression to the WHERE or upper-level ON condition. */
         if (conds)
         {
           conds= and_conds(conds, table->on_expr);
@@ -8997,14 +10711,25 @@ simplify_joins(JOIN *join, List<TABLE_LIST> *join_list, COND *conds, bool top)
     prev_table= table;
   }
     
-  /* Flatten nested joins that can be flattened. */
+  /* 
+    Flatten nested joins that can be flattened.
+    no ON expression and not a semi-join => can be flattened.
+  */
   TABLE_LIST *right_neighbor= NULL;
   li.rewind();
   while ((table= li++))
   {
     bool fix_name_res= FALSE;
     nested_join= table->nested_join;
-    if (nested_join && !table->on_expr)
+    if (table->sj_on_expr && !in_sj)
+    {
+       /*
+         If this is a semi-join that is not contained within another semi-join, 
+         leave it intact (otherwise it is flattened)
+       */
+      join->select_lex->sj_nests.push_back(table);
+    }
+    else if (nested_join && !table->on_expr)
     {
       TABLE_LIST *tbl;
       List_iterator<TABLE_LIST> it(nested_join->join_list);
@@ -9033,8 +10758,8 @@ simplify_joins(JOIN *join, List<TABLE_LIST> *join_list, COND *conds, bool top)
 /**
   Assign each nested join structure a bit in nested_join_map.
 
-    Assign each nested join structure (except "confluent" ones - those that
-    embed only one element) a bit in nested_join_map.
+    Assign each nested join structure (except ones that embed only one element
+    and so are redundant) a bit in nested_join_map.
 
   @param join          Join being processed
   @param join_list     List of tables
@@ -9043,7 +10768,7 @@ simplify_joins(JOIN *join, List<TABLE_LIST> *join_list, COND *conds, bool top)
 
   @note
     This function is called after simplify_joins(), when there are no
-    redundant nested joins, #non_confluent_nested_joins <= #tables_in_join so
+    redundant nested joins, #non_redundant_nested_joins <= #tables_in_join so
     we will not run out of bits in nested_join_map.
 
   @return
@@ -9072,7 +10797,9 @@ static uint build_bitmap_for_nested_joins(List<TABLE_LIST> *join_list,
       */
       if (nested_join->n_tables != 1)
       {
-        nested_join->nj_map= (nested_join_map) 1 << first_unused++;
+        /* Don't assign bits to sj-nests */
+        if (table->on_expr)
+          nested_join->nj_map= (nested_join_map) 1 << first_unused++;
         first_unused= build_bitmap_for_nested_joins(&nested_join->join_list,
                                                     first_unused);
       }
@@ -9101,14 +10828,16 @@ static uint reset_nj_counters(JOIN *join, List<TABLE_LIST> *join_list)
   while ((table= li++))
   {
     NESTED_JOIN *nested_join;
+    bool is_eliminated_nest= FALSE;
     if ((nested_join= table->nested_join))
     {
       nested_join->counter= 0;
-      //nested_join->n_tables= my_count_bits(nested_join->used_tables & 
-      //                                     ~join->eliminated_tables);
       nested_join->n_tables= reset_nj_counters(join, &nested_join->join_list);
+      if (!nested_join->n_tables)
+        is_eliminated_nest= TRUE;
     }
-    if (!table->table || (table->table->map & ~join->eliminated_tables))
+    if ((!table->table && !is_eliminated_nest) || 
+        (table->table && (table->table->map & ~join->eliminated_tables)))
       n++;
   }
   DBUG_RETURN(n);
@@ -9223,28 +10952,31 @@ static bool check_interleaving_with_nj(JOIN_TAB *next_tab)
     Do update counters for "pairs of brackets" that we've left (marked as
     X,Y,Z in the above picture)
   */
-  for (;next_emb; next_emb= next_emb->embedding)
+  for (;next_emb && next_emb != join->emb_sjm_nest; next_emb= next_emb->embedding)
   {
-    next_emb->nested_join->counter++;
-    if (next_emb->nested_join->counter == 1)
+    if (!next_emb->sj_on_expr)
     {
-      /* 
-        next_emb is the first table inside a nested join we've "entered". In
-        the picture above, we're looking at the 'X' bracket. Don't exit yet as
-        X bracket might have Y pair bracket.
+      next_emb->nested_join->counter++;
+      if (next_emb->nested_join->counter == 1)
+      {
+        /* 
+          next_emb is the first table inside a nested join we've "entered". In
+          the picture above, we're looking at the 'X' bracket. Don't exit yet as
+          X bracket might have Y pair bracket.
+        */
+        join->cur_embedding_map |= next_emb->nested_join->nj_map;
+      }
+      
+      if (next_emb->nested_join->n_tables !=
+          next_emb->nested_join->counter)
+        break;
+
+      /*
+        We're currently at Y or Z-bracket as depicted in the above picture.
+        Mark that we've left it and continue walking up the brackets hierarchy.
       */
-      join->cur_embedding_map |= next_emb->nested_join->nj_map;
+      join->cur_embedding_map &= ~next_emb->nested_join->nj_map;
     }
-    
-    if (next_emb->nested_join->n_tables !=
-        next_emb->nested_join->counter)
-      break;
-
-    /*
-      We're currently at Y or Z-bracket as depicted in the above picture.
-      Mark that we've left it and continue walking up the brackets hierarchy.
-    */
-    join->cur_embedding_map &= ~next_emb->nested_join->nj_map;
   }
   return FALSE;
 }
@@ -9306,27 +11038,113 @@ static void restore_prev_nj_state(JOIN_TAB *last)
 {
   TABLE_LIST *last_emb= last->table->pos_in_table_list->embedding;
   JOIN *join= last->join;
-  for (;last_emb != NULL; last_emb= last_emb->embedding)
+  for (;last_emb != NULL && last_emb != join->emb_sjm_nest; 
+       last_emb= last_emb->embedding)
   {
-    NESTED_JOIN *nest= last_emb->nested_join;
-    DBUG_ASSERT(nest->counter > 0);
-    
-    bool was_fully_covered= nest->is_fully_covered();
-    
-    if (--nest->counter == 0)
-      join->cur_embedding_map&= ~nest->nj_map;
-    
-    if (!was_fully_covered)
-      break;
+    if (!last_emb->sj_on_expr)
+    {
+      NESTED_JOIN *nest= last_emb->nested_join;
+      DBUG_ASSERT(nest->counter > 0);
+      
+      bool was_fully_covered= nest->is_fully_covered();
+      
+      if (--nest->counter == 0)
+        join->cur_embedding_map&= ~nest->nj_map;
+      
+      if (!was_fully_covered)
+        break;
+      
+      join->cur_embedding_map|= nest->nj_map;
+    }
+  }
+}
+
+
+
+/*
+  Change access methods not to use join buffering and adjust costs accordingly
+
+  SYNOPSIS
+    optimize_wo_join_buffering()
+      join
+      first_tab               The first tab to do re-optimization for
+      last_tab                The last tab to do re-optimization for
+      last_remaining_tables   Bitmap of tables that are not in the
+                              [0...last_tab] join prefix
+      first_alt               TRUE <=> Use the LooseScan plan for the first_tab
+      no_jbuf_before          Don't allow to use join buffering before this
+                              table (not referenced in the function body)
+      outer_rec_count     OUT New record count (tables with emb_sj_nest skipped)
+      reopt_cost          OUT New join prefix cost
+
+  DESCRIPTION
+    Given a join prefix [0; ... first_tab], change the access to the tables
+    in the [first_tab; last_tab] not to use join buffering. This is needed
+    because some semi-join strategies cannot be used together with the join
+    buffering.
+    In general case the best table order in [first_tab; last_tab] range with
+    join buffering is different from the best order without join buffering but
+    we don't try finding a better join order. (TODO ask Igor why did we
+    choose not to do this in the end. That's actually the difference from the
+    forking approach)
+*/
+
+void optimize_wo_join_buffering(JOIN *join, uint first_tab, uint last_tab, 
+                                table_map last_remaining_tables, 
+                                bool first_alt, uint no_jbuf_before,
+                                double *outer_rec_count, double *reopt_cost)
+{
+  double cost, rec_count;
+  table_map reopt_remaining_tables= last_remaining_tables;
+  uint i;
+
+  if (first_tab > join->const_tables)
+  {
+    cost=      join->positions[first_tab - 1].prefix_cost.total_cost();
+    rec_count= join->positions[first_tab - 1].prefix_record_count;
+  }
+  else
+  {
+    cost= 0.0;
+    rec_count= 1;
+  }
+
+  *outer_rec_count= rec_count;
+  for (i= first_tab; i <= last_tab; i++)
+    reopt_remaining_tables |= join->positions[i].table->table->map;
+
+  for (i= first_tab; i <= last_tab; i++)
+  {
+    JOIN_TAB *rs= join->positions[i].table;
+    POSITION pos, loose_scan_pos;
     
-    join->cur_embedding_map|= nest->nj_map;
+    if ((i == first_tab && first_alt) || join->positions[i].use_join_buffer)
+    {
+      /* Find the best access method that would not use join buffering */
+      best_access_path(join, rs, reopt_remaining_tables, i, 
+                       TRUE, rec_count,
+                       &pos, &loose_scan_pos);
+    }
+    else 
+      pos= join->positions[i];
+
+    if ((i == first_tab && first_alt))
+      pos= loose_scan_pos; /* take the alternative filled by best_access_path */
+
+    reopt_remaining_tables &= ~rs->table->map;
+    rec_count *= pos.records_read;
+    cost += pos.read_time;
+
+    if (!rs->emb_sj_nest)
+      *outer_rec_count *= pos.records_read;
   }
+  *reopt_cost= cost;
 }
 
 
 static COND *
 optimize_cond(JOIN *join, COND *conds, List<TABLE_LIST> *join_list,
-              Item::cond_result *cond_value)
+              Item::cond_result *cond_value, COND_EQUAL **cond_equal)
 {
   THD *thd= join->thd;
   DBUG_ENTER("optimize_cond");
@@ -9334,7 +11152,7 @@ optimize_cond(JOIN *join, COND *conds, List<TABLE_LIST> *join_list,
   if (!conds)
   {
     *cond_value= Item::COND_TRUE;
-    build_equal_items(join->thd, NULL, NULL, join_list, &join->cond_equal);
+    build_equal_items(join->thd, NULL, NULL, join_list, cond_equal);
   }  
   else
   {
@@ -9508,7 +11326,7 @@ remove_eq_conds(THD *thd, COND *cond, Item::cond_result *cond_value)
       return (COND*) 0;
     }
   }
-  else if (cond->const_item())
+  else if (cond->const_item() && !cond->is_expensive())
   {
     *cond_value= eval_const_cond(cond) ? Item::COND_TRUE : Item::COND_FALSE;
     return (COND*) 0;
@@ -9973,6 +11791,8 @@ Field *create_tmp_field(THD *thd, TABLE *table,Item *item, Item::Type type,
   case Item::REF_ITEM:
   case Item::NULL_ITEM:
   case Item::VARBIN_ITEM:
+  case Item::CACHE_ITEM:
+  case Item::EXPR_CACHE_ITEM:
     if (make_copy_field)
     {
       DBUG_ASSERT(((Item_result_field*)item)->result_field);
@@ -10009,7 +11829,9 @@ void setup_tmp_table_column_bitmaps(TABLE *table, uchar *bitmaps)
   bitmap_init(&table->tmp_set,
               (my_bitmap_map*) (bitmaps+ 2*bitmap_buffer_size(field_count)),
               field_count, FALSE);
-
+  bitmap_init(&table->eq_join_set,
+              (my_bitmap_map*) (bitmaps+ 3*bitmap_buffer_size(field_count)),
+              field_count, FALSE);
   /* write_set and all_set are copies of read_set */
   table->def_write_set= table->def_read_set;
   table->s->all_set= table->def_read_set;
@@ -10044,11 +11866,6 @@ void setup_tmp_table_column_bitmaps(TABLE *table, uchar *bitmaps)
                               be used for name resolving; can be "".
 */
 
-#define STRING_TOTAL_LENGTH_TO_PACK_ROWS 128
-#define AVG_STRING_LENGTH_TO_PACK_ROWS   64
-#define RATIO_TO_PACK_ROWS	       2
-#define MIN_STRING_LENGTH_TO_PACK_ROWS   10
-
 TABLE *
 create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
 		 ORDER *group, bool distinct, bool save_sum_fields,
@@ -10121,6 +11938,12 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
       group=0;					// Can't use group key
     else for (ORDER *tmp=group ; tmp ; tmp=tmp->next)
     {
+      /*
+        marker == 4 means two things:
+        - store NULLs in the key, and
+        - convert BIT fields to 64-bit long, needed because MEMORY tables
+          can't index BIT fields.
+      */
       (*tmp->item)->marker=4;			// Store null in key
       if ((*tmp->item)->max_length >= CONVERT_IF_BIGGER_TO_BLOB)
 	using_unique_constraint=1;
@@ -10162,7 +11985,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
                         &tmpname, (uint) strlen(tmp_table_name)+1,
                         &group_buff, (group && ! using_unique_constraint ?
                                       param->group_length : 0),
-                        &bitmaps, bitmap_buffer_size(field_count)*3,
+                        &bitmaps, bitmap_buffer_size(field_count)*4,
                         NullS))
   {
     if (temp_pool_slot != MY_BIT_NONE)
@@ -10194,7 +12017,6 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
   table->alias.set(table_alias, strlen(table_alias), table_alias_charset);
 
   table->reginfo.lock_type=TL_WRITE;	/* Will be updated */
-  table->db_stat=HA_OPEN_KEYFILE+HA_OPEN_RNDFILE;
   table->map=1;
   table->temp_pool_slot = temp_pool_slot;
   table->copy_blobs= 1;
@@ -10202,6 +12024,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
   table->quick_keys.init();
   table->covering_keys.init();
   table->merge_keys.init();
+  table->intersect_keys.init();
   table->keys_in_use_for_query.init();
 
   table->s= share;
@@ -10233,6 +12056,8 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
         if (item->used_tables() & OUTER_REF_TABLE_BIT)
           item->update_used_tables();
         if (type == Item::SUBSELECT_ITEM ||
+            (item->get_cached_item() &&
+             item->get_cached_item()->type() == Item::SUBSELECT_ITEM ) ||
             (item->used_tables() & ~OUTER_REF_TABLE_BIT))
         {
 	  /*
@@ -10318,7 +12143,15 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
                          group != 0,
                          !force_copy_fields &&
                            (not_all_columns || group !=0),
-                         item->marker == 4, force_copy_fields,
+                         /*
+                           If item->marker == 4 then we force create_tmp_field
+                           to create 64-bit longs for BIT fields because HEAP
+                           tables can't index BIT fields directly. We do the same
+                           for distinct, as we want the distinct index to be
+                           usable in this case too.
+                         */
+                         item->marker == 4  || param->bit_fields_as_long, // psergey-feb17
+                         force_copy_fields,
                          param->convert_blob_length);
 
       if (!new_field)
@@ -10562,7 +12395,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
   }
 
   param->copy_field_end=copy;
-  param->recinfo=recinfo;
+  param->recinfo= recinfo;              	// Pointer to after last field
   store_record(table,s->default_values);        // Make empty default record
 
   if (thd->variables.tmp_table_size == ~ (ulonglong) 0)		// No limit
@@ -10590,7 +12423,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
     param->group_buff=group_buff;
     share->keys=1;
     share->uniques= test(using_unique_constraint);
-    table->key_info=keyinfo;
+    table->key_info= table->s->key_info= keyinfo;
     keyinfo->key_part=key_part_info;
     keyinfo->flags=HA_NOSAME;
     keyinfo->usable_key_parts=keyinfo->key_parts= param->group_parts;
@@ -10656,29 +12489,39 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
     */
     DBUG_PRINT("info",("hidden_field_count: %d", param->hidden_field_count));
 
-    null_pack_length-=hidden_null_pack_length;
-    keyinfo->key_parts= ((field_count-param->hidden_field_count)+
-			 test(null_pack_length));
-    table->distinct= 1;
-    share->keys= 1;
     if (blob_count)
     {
-      using_unique_constraint=1;
+      /*
+        Special mode for index creation in MyISAM used to support unique
+        indexes on blobs with arbitrary length. Such indexes cannot be
+        used for lookups.
+      */
       share->uniques= 1;
     }
+    null_pack_length-=hidden_null_pack_length;
+    keyinfo->key_parts= ((field_count-param->hidden_field_count)+
+			 (share->uniques ? test(null_pack_length) : 0));
+    table->distinct= 1;
+    share->keys= 1;
     if (!(key_part_info= (KEY_PART_INFO*)
           alloc_root(&table->mem_root,
                      keyinfo->key_parts * sizeof(KEY_PART_INFO))))
       goto err;
     bzero((void*) key_part_info, keyinfo->key_parts * sizeof(KEY_PART_INFO));
-    table->key_info=keyinfo;
+    table->key_info= table->s->key_info= keyinfo;
     keyinfo->key_part=key_part_info;
     keyinfo->flags=HA_NOSAME | HA_NULL_ARE_EQUAL;
-    keyinfo->key_length=(uint16) reclength;
+    keyinfo->key_length= 0;  // Will compute the sum of the parts below.
     keyinfo->name= (char*) "distinct_key";
     keyinfo->algorithm= HA_KEY_ALG_UNDEF;
     keyinfo->rec_per_key=0;
-    if (null_pack_length)
+
+    /*
+      Create an extra field to hold NULL bits so that unique indexes on
+      blobs can distinguish NULL from 0. This extra field is not needed
+      when we do not use UNIQUE indexes for blobs.
+    */
+    if (null_pack_length && share->uniques)
     {
       key_part_info->null_bit=0;
       key_part_info->offset=hidden_null_pack_length;
@@ -10705,6 +12548,24 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
       key_part_info->field=    *reg_field;
       key_part_info->offset=   (*reg_field)->offset(table->record[0]);
       key_part_info->length=   (uint16) (*reg_field)->pack_length();
+      /* TODO:
+        The below method of computing the key format length of the
+        key part is a copy/paste from opt_range.cc, and table.cc.
+        This should be factored out, e.g. as a method of Field.
+        In addition it is not clear if any of the Field::*_length
+        methods is supposed to compute the same length. If so, it
+        might be reused.
+      */
+      key_part_info->store_length= key_part_info->length;
+
+      if ((*reg_field)->real_maybe_null())
+        key_part_info->store_length+= HA_KEY_NULL_LENGTH;
+      if ((*reg_field)->type() == MYSQL_TYPE_BLOB || 
+          (*reg_field)->real_type() == MYSQL_TYPE_VARCHAR)
+        key_part_info->store_length+= HA_KEY_BLOB_LENGTH;
+
+      keyinfo->key_length+= key_part_info->store_length;
+
       key_part_info->type=     (uint8) (*reg_field)->key_type();
       key_part_info->key_type =
 	((ha_base_keytype) key_part_info->type == HA_KEYTYPE_TEXT ||
@@ -10719,11 +12580,17 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
   share->db_record_offset= 1;
   if (share->db_type() == TMP_ENGINE_HTON)
   {
-    if (create_internal_tmp_table(table,param,select_options))
+    if (create_internal_tmp_table(table, param->keyinfo, param->start_recinfo,
+                                  &param->recinfo, select_options))
       goto err;
   }
-  if (open_tmp_table(table))
-    goto err;
+  DBUG_PRINT("info", ("skip_create_table: %d", (int)param->skip_create_table));
+  if (!param->skip_create_table)
+  {
+    if (open_tmp_table(table))
+      goto err;
+    table->db_stat= HA_OPEN_KEYFILE+HA_OPEN_RNDFILE;
+  }
 
   thd->mem_root= mem_root_save;
 
@@ -10738,6 +12605,7 @@ err:
 }
 
 
+
 /****************************************************************************/
 
 /**
@@ -10778,7 +12646,7 @@ TABLE *create_virtual_tmp_table(THD *thd, List<Create_field> &field_list)
                         &share, sizeof(*share),
                         &field, (field_count + 1) * sizeof(Field*),
                         &blob_field, (field_count+1) *sizeof(uint),
-                        &bitmaps, bitmap_buffer_size(field_count)*3,
+                        &bitmaps, bitmap_buffer_size(field_count)*4,
                         NullS))
     return 0;
 
@@ -10866,7 +12734,7 @@ error:
 }
 
 
-static bool open_tmp_table(TABLE *table)
+bool open_tmp_table(TABLE *table)
 {
   int error;
   if ((error= table->file->ha_open(table, table->s->table_name.str, O_RDWR,
@@ -10884,15 +12752,45 @@ static bool open_tmp_table(TABLE *table)
 
 #if defined(WITH_ARIA_STORAGE_ENGINE) && defined(USE_MARIA_FOR_TMP_TABLES)
 
-/* Create internal Aria temporary table */
+/*
+  Create internal (MyISAM or Maria) temporary table
+
+  SYNOPSIS
+    create_internal_tmp_table()
+      table           Table object that describes the table to be created
+      keyinfo         Description of the index (there is always one index)
+      start_recinfo   engine's column descriptions
+      recinfo INOUT   End of engine's column descriptions
+      options         Option bits
+   
+  DESCRIPTION
+    Create an internal temporary table according to the passed description.
+    The table is assumed to have one unique index or constraint.
+
+    The passed array of ENGINE_COLUMNDEF structures must have this form:
+
+      1. 1-byte column (afaiu for 'deleted' flag) (note maybe not 1-byte
+         when there are many nullable columns)
+      2. Table columns
+      3. One free ENGINE_COLUMNDEF element (*recinfo points here)
+   
+    This function may use the free element to create hash column for unique
+    constraint.
+
+   RETURN
+     FALSE - OK
+     TRUE  - Error
+*/
+
 
-static bool create_internal_tmp_table(TABLE *table,TMP_TABLE_PARAM *param,
-                                      ulonglong options)
+bool create_internal_tmp_table(TABLE *table, KEY *keyinfo, 
+                               ENGINE_COLUMNDEF *start_recinfo,
+                               ENGINE_COLUMNDEF **recinfo, 
+                               ulonglong options)
 {
   int error;
   MARIA_KEYDEF keydef;
   MARIA_UNIQUEDEF uniquedef;
-  KEY *keyinfo=param->keyinfo;
   TABLE_SHARE *share= table->s;
   MARIA_CREATE_INFO create_info;
   DBUG_ENTER("create_internal_tmp_table");
@@ -10920,10 +12818,10 @@ static bool create_internal_tmp_table(TABLE *table,TMP_TABLE_PARAM *param,
       uniquedef.null_are_equal=1;
 
       /* Create extra column for hash value */
-      bzero((uchar*) param->recinfo,sizeof(*param->recinfo));
-      param->recinfo->type=   FIELD_CHECK;
-      param->recinfo->length= MARIA_UNIQUE_HASH_LENGTH;
-      param->recinfo++;
+      bzero((uchar*) *recinfo,sizeof(**recinfo));
+      (*recinfo)->type=   FIELD_CHECK;
+      (*recinfo)->length= MARIA_UNIQUE_HASH_LENGTH;
+      (*recinfo)++;
       share->reclength+=      MARIA_UNIQUE_HASH_LENGTH;
     }
     else
@@ -10979,12 +12877,13 @@ static bool create_internal_tmp_table(TABLE *table,TMP_TABLE_PARAM *param,
     create_info.data_file_length= ~(ulonglong) 0;
 
   if ((error= maria_create(share->table_name.str,
-                           share->reclength < 64 &&
-                           !share->blob_fields ? STATIC_RECORD :
-                           BLOCK_RECORD,
+                           table->no_rows ? NO_RECORD :
+                           (share->reclength < 64 &&
+                            !share->blob_fields ? STATIC_RECORD :
+                            BLOCK_RECORD),
                            share->keys, &keydef,
-                           (uint) (param->recinfo-param->start_recinfo),
-                           param->start_recinfo,
+                           (uint) (*recinfo-start_recinfo),
+                           start_recinfo,
                            share->uniques, &uniquedef,
                            &create_info,
                            HA_CREATE_TMP_TABLE)))
@@ -11003,11 +12902,13 @@ static bool create_internal_tmp_table(TABLE *table,TMP_TABLE_PARAM *param,
 
 
 bool create_internal_tmp_table_from_heap(THD *thd, TABLE *table,
-                                         TMP_TABLE_PARAM *param,
+                                         ENGINE_COLUMNDEF *start_recinfo,
+                                         ENGINE_COLUMNDEF **recinfo, 
                                          int error,
                                          bool ignore_last_dupp_key_error)
 {
-  return create_internal_tmp_table_from_heap2(thd, table, param, error,
+  return create_internal_tmp_table_from_heap2(thd, table, 
+                                              start_recinfo, recinfo, error,
                                               ignore_last_dupp_key_error,
                                               maria_hton,
                                               "converting HEAP to Aria");
@@ -11015,15 +12916,46 @@ bool create_internal_tmp_table_from_heap(THD *thd, TABLE *table,
 
 #else
 
+/*
+  Create internal (MyISAM or Maria) temporary table
+
+  SYNOPSIS
+    create_internal_tmp_table()
+      table           Table object that describes the table to be created
+      keyinfo         Description of the index (there is always one index)
+      start_recinfo   engine's column descriptions
+      recinfo INOUT   End of engine's column descriptions
+      options         Option bits
+   
+  DESCRIPTION
+    Create an internal temporary table according to the passed description.
+    The table is assumed to have one unique index or constraint.
+
+    The passed array of ENGINE_COLUMNDEF structures must have this form:
+
+      1. 1-byte column (afaiu for 'deleted' flag) (note maybe not 1-byte
+         when there are many nullable columns)
+      2. Table columns
+      3. One free ENGINE_COLUMNDEF element (*recinfo points here)
+   
+    This function may use the free element to create hash column for unique
+    constraint.
+
+   RETURN
+     FALSE - OK
+     TRUE  - Error
+*/
+
 /* Create internal MyISAM temporary table */
 
-static bool create_internal_tmp_table(TABLE *table,TMP_TABLE_PARAM *param,
-                                      ulonglong options)
+bool create_internal_tmp_table(TABLE *table, KEY *keyinfo, 
+                               ENGINE_COLUMNDEF *start_recinfo,
+                               ENGINE_COLUMNDEF **recinfo,
+                               ulonglong options)
 {
   int error;
   MI_KEYDEF keydef;
   MI_UNIQUEDEF uniquedef;
-  KEY *keyinfo=param->keyinfo;
   TABLE_SHARE *share= table->s;
   DBUG_ENTER("create_internal_tmp_table");
 
@@ -11050,10 +12982,10 @@ static bool create_internal_tmp_table(TABLE *table,TMP_TABLE_PARAM *param,
       uniquedef.null_are_equal=1;
 
       /* Create extra column for hash value */
-      bzero((uchar*) param->recinfo,sizeof(*param->recinfo));
-      param->recinfo->type= FIELD_CHECK;
-      param->recinfo->length=MI_UNIQUE_HASH_LENGTH;
-      param->recinfo++;
+      bzero((uchar*) *recinfo,sizeof(**recinfo));
+      (*recinfo)->type= FIELD_CHECK;
+      (*recinfo)->length=MI_UNIQUE_HASH_LENGTH;
+      (*recinfo)++;
       share->reclength+=MI_UNIQUE_HASH_LENGTH;
     }
     else
@@ -11110,8 +13042,8 @@ static bool create_internal_tmp_table(TABLE *table,TMP_TABLE_PARAM *param,
     create_info.data_file_length= ~(ulonglong) 0;
 
   if ((error=mi_create(share->table_name.str, share->keys, &keydef,
-		       (uint) (param->recinfo-param->start_recinfo),
-		       param->start_recinfo,
+		       (uint) (*recinfo-start_recinfo),
+		       start_recinfo,
 		       share->uniques, &uniquedef,
 		       &create_info,
 		       HA_CREATE_TMP_TABLE)))
@@ -11133,11 +13065,13 @@ static bool create_internal_tmp_table(TABLE *table,TMP_TABLE_PARAM *param,
 */
 
 bool create_internal_tmp_table_from_heap(THD *thd, TABLE *table,
-                                         TMP_TABLE_PARAM *param,
+                                         ENGINE_COLUMNDEF *start_recinfo,
+                                         ENGINE_COLUMNDEF **recinfo, 
                                          int error,
                                          bool ignore_last_dupp_key_error)
 {
-  return create_internal_tmp_table_from_heap2(thd, table, param, error,
+  return create_internal_tmp_table_from_heap2(thd, table, 
+                                              start_recinfo, recinfo, error,
                                               ignore_last_dupp_key_error,
                                               myisam_hton,
                                               "converting HEAP to MyISAM");
@@ -11154,7 +13088,8 @@ bool create_internal_tmp_table_from_heap(THD *thd, TABLE *table,
 
 static bool
 create_internal_tmp_table_from_heap2(THD *thd, TABLE *table,
-                                     TMP_TABLE_PARAM *param,
+                                     ENGINE_COLUMNDEF *start_recinfo,
+                                     ENGINE_COLUMNDEF **recinfo, 
                                      int error,
                                      bool ignore_last_dupp_key_error,
                                      handlerton *hton,
@@ -11163,9 +13098,8 @@ create_internal_tmp_table_from_heap2(THD *thd, TABLE *table,
   TABLE new_table;
   TABLE_SHARE share;
   const char *save_proc_info;
-  int write_err;
+  int write_err= 0;
   DBUG_ENTER("create_internal_tmp_table_from_heap2");
-  LINT_INIT(write_err);
 
   if (table->s->db_type() != heap_hton || 
       error != HA_ERR_RECORD_FILE_FULL)
@@ -11189,8 +13123,10 @@ create_internal_tmp_table_from_heap2(THD *thd, TABLE *table,
   save_proc_info=thd->proc_info;
   thd_proc_info(thd, proc_info);
 
-  if (create_internal_tmp_table(&new_table, param,
-			      thd->lex->select_lex.options | thd->options))
+  new_table.no_rows= table->no_rows;
+  if (create_internal_tmp_table(&new_table, table->key_info, start_recinfo,
+                                recinfo, thd->lex->select_lex.options | 
+                                thd->options))
     goto err2;
   if (open_tmp_table(&new_table))
     goto err1;
@@ -11199,24 +13135,15 @@ create_internal_tmp_table_from_heap2(THD *thd, TABLE *table,
   table->file->ha_index_or_rnd_end();
   if (table->file->ha_rnd_init_with_error(1))
     DBUG_RETURN(1);
-  if (table->no_rows)
-  {
+  if (new_table.no_rows)
     new_table.file->extra(HA_EXTRA_NO_ROWS);
-    new_table.no_rows=1;
+  else
+  {
+    /* update table->file->stats.records */
+    table->file->info(HA_STATUS_VARIABLE);
+    new_table.file->ha_start_bulk_insert(table->file->stats.records);
   }
 
-#ifdef TO_BE_DONE_LATER_IN_4_1
-  /*
-    To use start_bulk_insert() (which is new in 4.1) we need to find
-    all places where a corresponding end_bulk_insert() should be put.
-  */
-  table->file->info(HA_STATUS_VARIABLE); /* update table->file->stats.records */
-  new_table.file->ha_start_bulk_insert(table->file->stats.records);
-#else
-  /* HA_EXTRA_WRITE_CACHE can stay until close, no need to disable it */
-  new_table.file->extra(HA_EXTRA_WRITE_CACHE);
-#endif
-
   /*
     copy all old rows from heap table to MyISAM table
     This is the only code that uses record[1] to read/write but this
@@ -11225,7 +13152,7 @@ create_internal_tmp_table_from_heap2(THD *thd, TABLE *table,
   */
   while (!table->file->ha_rnd_next(new_table.record[1]))
   {
-    write_err= new_table.file->ha_write_row(new_table.record[1]);
+    write_err= new_table.file->ha_write_tmp_row(new_table.record[1]);
     DBUG_EXECUTE_IF("raise_error", write_err= HA_ERR_FOUND_DUPP_KEY ;);
     if (write_err)
       goto err;
@@ -11235,8 +13162,10 @@ create_internal_tmp_table_from_heap2(THD *thd, TABLE *table,
       goto err_killed;
     }
   }
+  if (!new_table.no_rows && new_table.file->ha_end_bulk_insert())
+    goto err;
   /* copy row that filled HEAP table */
-  if ((write_err=new_table.file->ha_write_row(table->record[0])))
+  if ((write_err=new_table.file->ha_write_tmp_row(table->record[0])))
   {
     if (new_table.file->is_fatal_error(write_err, HA_CHECK_DUP) ||
 	!ignore_last_dupp_key_error)
@@ -11257,8 +13186,8 @@ create_internal_tmp_table_from_heap2(THD *thd, TABLE *table,
   table->file->change_table_ptr(table, table->s);
   table->use_all_columns();
   if (save_proc_info)
-    thd_proc_info(thd, (!strcmp(save_proc_info,"Copying to tmp table") ?
-                     "Copying to tmp table on disk" : save_proc_info));
+    thd_proc_info(thd, save_proc_info == copy_to_tmp_table ?
+                  "Copying to tmp table on disk" : save_proc_info);
   DBUG_RETURN(0);
 
  err:
@@ -11403,7 +13332,6 @@ Next_select_func setup_end_select_func(JOIN *join)
   @retval
     -1  if error should be sent
 */
-
 static int
 do_select(JOIN *join,List<Item> *fields,TABLE *table,Procedure *procedure)
 {
@@ -11464,17 +13392,20 @@ do_select(JOIN *join,List<Item> *fields,TABLE *table,Procedure *procedure)
     }
     else if (join->send_row_on_empty_set())
     {
-      List<Item> *columns_list= (procedure ? &join->procedure_fields_list :
-                                 fields);
-      rc= join->result->send_data(*columns_list);
+      if (!join->having || join->having->val_int())
+      {
+        List<Item> *columns_list= (procedure ? &join->procedure_fields_list :
+                                   fields);
+        rc= join->result->send_data(*columns_list);
+      }
     }
   }
   else
   {
     DBUG_ASSERT(join->tables);
-    error= sub_select(join,join_tab,0);
+    error= join->first_select(join,join_tab,0);
     if (error == NESTED_LOOP_OK || error == NESTED_LOOP_NO_MORE_ROWS)
-      error= sub_select(join,join_tab,1);
+      error= join->first_select(join,join_tab,1);
     if (error == NESTED_LOOP_QUERY_LIMIT)
       error= NESTED_LOOP_OK;                    /* select_limit used */
   }
@@ -11527,33 +13458,235 @@ do_select(JOIN *join,List<Item> *fields,TABLE *table,Procedure *procedure)
 }
 
 
+int rr_sequential_and_unpack(READ_RECORD *info)
+{
+  int error;
+  if ((error= rr_sequential(info)))
+    return error;
+  
+  for (Copy_field *cp= info->copy_field; cp != info->copy_field_end; cp++)
+    (*cp->do_copy)(cp);
+
+  return error;
+}
+
+
+/*
+  Semi-join materialization join function
+
+  SYNOPSIS
+    sub_select_sjm()
+      join            The join
+      join_tab        The first table in the materialization nest
+      end_of_records  FALSE <=> This call is made to pass another record 
+                                combination
+                      TRUE  <=> EOF
+
+  DESCRIPTION
+    This is a join execution function that does materialization of a join
+    suborder before joining it to the rest of the join.
+
+    The table pointed by join_tab is the first of the materialized tables.
+    This function first creates the materialized table and then switches to
+    joining the materialized table with the rest of the join.
+
+    The materialized table can be accessed in two ways:
+     - index lookups
+     - full table scan
+
+  RETURN
+    One of enum_nested_loop_state values
+*/
+
 enum_nested_loop_state
-sub_select_cache(JOIN *join,JOIN_TAB *join_tab,bool end_of_records)
+sub_select_sjm(JOIN *join, JOIN_TAB *join_tab, bool end_of_records)
 {
+  int res;
   enum_nested_loop_state rc;
 
+  DBUG_ENTER("sub_select_sjm");
+
+  if (!join_tab->emb_sj_nest)
+  {
+    /*
+      We're handling GROUP BY/ORDER BY, this is the first table, and we've
+      actually executed the join already and now we're just reading the
+      result of the join from the temporary table.
+      Bypass to regular join handling.
+      Yes, it would be nicer if sub_select_sjm wasn't called at all in this
+      case but there's no easy way to arrange this.
+    */
+    rc= sub_select(join, join_tab, end_of_records);
+    DBUG_RETURN(rc);
+  }
+
+  SJ_MATERIALIZATION_INFO *sjm= join_tab->emb_sj_nest->sj_mat_info;
   if (end_of_records)
   {
-    rc= flush_cached_records(join,join_tab,FALSE);
+    rc= (*join_tab[sjm->tables - 1].next_select)(join,
+                                                 join_tab + sjm->tables,
+                                                 end_of_records);
+    DBUG_RETURN(rc);
+  }
+  if (!sjm->materialized)
+  {
+    /*
+      Do the materialization. First, put end_sj_materialize after the last
+      inner table so we can catch record combinations of sj-inner tables.
+    */
+    Next_select_func next_func= join_tab[sjm->tables - 1].next_select;
+    join_tab[sjm->tables - 1].next_select= end_sj_materialize;
+
+    /*
+      Now run the join for the inner tables. The first call is to run the
+      join, the second one is to signal EOF (this is essential for some
+      join strategies, e.g. it will make join buffering flush the records)
+    */
+    if ((rc= sub_select(join, join_tab, FALSE)) < 0 ||
+        (rc= sub_select(join, join_tab, TRUE/*EOF*/)) < 0)
+    {
+      join_tab[sjm->tables - 1].next_select= next_func;
+      DBUG_RETURN(rc); /* it's NESTED_LOOP_(ERROR|KILLED)*/
+    }
+    join_tab[sjm->tables - 1].next_select= next_func;
+
+    /*
+      Ok, materialization finished. Initialize the access to the temptable
+    */
+    sjm->materialized= TRUE;
+    join_tab->read_record.read_record= join_no_more_records;
+    if (sjm->is_sj_scan)
+    {
+      /* Initialize full scan */
+      JOIN_TAB *last_tab= join_tab + (sjm->tables - 1);
+      init_read_record(&last_tab->read_record, join->thd,
+                       sjm->table, NULL, TRUE, TRUE, FALSE);
+
+      DBUG_ASSERT(last_tab->read_record.read_record == rr_sequential);
+      last_tab->read_first_record= join_read_record_no_init;
+      last_tab->read_record.copy_field= sjm->copy_field;
+      last_tab->read_record.copy_field_end= sjm->copy_field +
+                                            sjm->sjm_table_cols.elements;
+      last_tab->read_record.read_record= rr_sequential_and_unpack;
+    }
+  }
+  else
+  {
+    if (sjm->is_sj_scan)
+    {
+      /* Reset the cursor for a new scan over the table */
+      if (sjm->table->file->ha_rnd_init(TRUE))
+        DBUG_RETURN(NESTED_LOOP_ERROR);
+    }
+  }
+
+  if (sjm->is_sj_scan)
+  {
+    /* Do full scan of the materialized table */
+    JOIN_TAB *last_tab= join_tab + (sjm->tables - 1);
+
+    Item *save_cond= last_tab->select_cond;
+    last_tab->set_select_cond(sjm->join_cond, __LINE__);
+
+    rc= sub_select(join, last_tab, end_of_records);
+    last_tab->set_select_cond(save_cond, __LINE__);
+    DBUG_RETURN(rc);
+  }
+  else
+  {
+    /* Do index lookup in the materialized table */
+    if ((res= join_read_key2(join_tab->join->thd, join_tab,
+                             sjm->table, sjm->tab_ref)) == 1)
+      DBUG_RETURN(NESTED_LOOP_ERROR); /* purecov: inspected */
+    if (res || !sjm->in_equality->val_int())
+      DBUG_RETURN(NESTED_LOOP_NO_MORE_ROWS);
+  }
+  rc= (*join_tab[sjm->tables - 1].next_select)(join,
+                                               join_tab + sjm->tables,
+                                               end_of_records);
+  DBUG_RETURN(rc);
+}
+
+
+/*
+  Fill the join buffer with partial records, retrieve all full matches for them
+
+  SYNOPSIS
+    sub_select_cache()
+      join     pointer to the structure providing all context info for the query
+      join_tab the first next table of the execution plan to be retrieved
+      end_of_records  true when we need to perform final steps of the retrieval
+
+  DESCRIPTION
+    For a given table Ti= join_tab from the sequence of tables of the chosen 
+    execution plan T1,...,Ti,...,Tn the function just puts the partial record
+    t1,...,t[i-1] into the join buffer associated with table Ti unless this
+    is the last record added into the buffer. In this case,  the function 
+    additionally finds all matching full records for all partial
+    records accumulated in the buffer, after which it cleans the buffer up.
+    If a partial join record t1,...,ti is extended utilizing a dynamic
+    range scan then it is not put into the join buffer. Rather all matching
+    records are found for it at once by the function sub_select.
+
+  NOTES
+    The function implements the algorithmic schema for both Blocked Nested
+    Loop Join and Batched Key Access Join. The difference can be seen only at
+    the level of the implementation of the put_record and join_records
+    virtual methods for the cache object associated with the join_tab.
+    The put_record method accumulates records in the cache, while the 
+    join_records method builds all matching join records and sends them into
+    the output stream.  
+      
+  RETURN
+    return one of enum_nested_loop_state, except NESTED_LOOP_NO_MORE_ROWS.
+*/ 
+
+enum_nested_loop_state
+sub_select_cache(JOIN *join, JOIN_TAB *join_tab, bool end_of_records)
+{
+  enum_nested_loop_state rc;
+  JOIN_CACHE *cache= join_tab->cache;
+
+  DBUG_ENTER("sub_select_cache");
+
+  /* This function cannot be called if join_tab has no associated join buffer */
+  DBUG_ASSERT(cache != NULL);
+
+  join_tab->cache->reset_join(join);
+
+  if (end_of_records)
+  {
+    rc= cache->join_records(FALSE);
     if (rc == NESTED_LOOP_OK || rc == NESTED_LOOP_NO_MORE_ROWS)
-      rc= sub_select(join,join_tab,end_of_records);
-    return rc;
+      rc= sub_select(join, join_tab, end_of_records);
+    DBUG_RETURN(rc);
   }
-  if (join->thd->killed)		// If aborted by user
+  if (join->thd->killed)
   {
+    /* The user has aborted the execution of the query */
     join->thd->send_kill_message();
-    return NESTED_LOOP_KILLED;                   /* purecov: inspected */
+    DBUG_RETURN(NESTED_LOOP_KILLED);
   }
-  if (join_tab->use_quick != 2 || test_if_quick_select(join_tab) <= 0)
+  if (!test_if_use_dynamic_range_scan(join_tab))
   {
-    if (!store_record_in_cache(&join_tab->cache))
-      return NESTED_LOOP_OK;                     // There is more room in cache
-    return flush_cached_records(join,join_tab,FALSE);
+    if (!cache->put_record())
+      DBUG_RETURN(NESTED_LOOP_OK); 
+    /* 
+      We have decided that, after the record we've just put into the buffer,
+      we won't add any more records. Now try to find all the matching
+      extensions for all records in the buffer.
+    */ 
+    rc= cache->join_records(FALSE);
+    DBUG_RETURN(rc);
   }
-  rc= flush_cached_records(join, join_tab, TRUE);
+  /*
+     TODO: Check whether we really need the call below and we can't do
+           without it. If it's not the case remove it.
+  */ 
+  rc= cache->join_records(TRUE);
   if (rc == NESTED_LOOP_OK || rc == NESTED_LOOP_NO_MORE_ROWS)
     rc= sub_select(join, join_tab, end_of_records);
-  return rc;
+  DBUG_RETURN(rc);
 }
 
 /**
@@ -11679,14 +13812,24 @@ sub_select_cache(JOIN *join,JOIN_TAB *join_tab,bool end_of_records)
 enum_nested_loop_state
 sub_select(JOIN *join,JOIN_TAB *join_tab,bool end_of_records)
 {
+  DBUG_ENTER("sub_select");
+
   join_tab->table->null_row=0;
   if (end_of_records)
-    return (*join_tab->next_select)(join,join_tab+1,end_of_records);
-
+  {
+    enum_nested_loop_state nls=
+      (*join_tab->next_select)(join,join_tab+1,end_of_records);
+    DBUG_RETURN(nls);
+  }
   int error;
   enum_nested_loop_state rc= NESTED_LOOP_OK;
   READ_RECORD *info= &join_tab->read_record;
 
+  if (join_tab->flush_weedout_table)
+  {
+    do_sj_reset(join_tab->flush_weedout_table);
+  }
+
   if (join->resume_nested_loop)
   {
     /* If not the last table, plunge down the nested loop */
@@ -11714,19 +13857,57 @@ sub_select(JOIN *join,JOIN_TAB *join_tab,bool end_of_records)
       join_tab->last_inner->first_unmatched= join_tab;
       if (join_tab->on_precond && !join_tab->on_precond->val_int())
         rc= NESTED_LOOP_NO_MORE_ROWS;
-    }
+     }
     join->thd->row_count= 0;
+    
+    if (join_tab->loosescan_match_tab)
+      join_tab->loosescan_match_tab->found_match= FALSE;
 
     if (rc != NESTED_LOOP_NO_MORE_ROWS)
     {
       error= (*join_tab->read_first_record)(join_tab);
+      if (join_tab->keep_current_rowid)
+        join_tab->table->file->position(join_tab->table->record[0]);    
       rc= evaluate_join_record(join, join_tab, error);
     }
   }
-
-  while (rc == NESTED_LOOP_OK)
+  
+  /* 
+    Note: psergey has added the 2nd part of the following condition; the 
+    change should probably be made in 5.1, too.
+  */
+  bool skip_over= FALSE;
+  while (rc == NESTED_LOOP_OK && join->return_tab >= join_tab)
   {
+    if (join_tab->loosescan_match_tab && 
+        join_tab->loosescan_match_tab->found_match)
+    {
+      KEY *key= join_tab->table->key_info + join_tab->index;
+      key_copy(join_tab->loosescan_buf, info->record, key, 
+               join_tab->loosescan_key_len);
+      skip_over= TRUE;
+    }
+
     error= info->read_record(info);
+
+    if (skip_over && !error) 
+    {
+      if(!key_cmp(join_tab->table->key_info[join_tab->index].key_part,
+                  join_tab->loosescan_buf, join_tab->loosescan_key_len))
+      {
+        /* 
+          This is the LooseScan action: skip over records with the same key
+          value if we already had a match for them.
+        */
+        continue;
+      }
+      join_tab->loosescan_match_tab->found_match= FALSE;
+      skip_over= FALSE;
+    }
+
+    if (join_tab->keep_current_rowid)
+      join_tab->table->file->position(join_tab->table->record[0]);
+    
     rc= evaluate_join_record(join, join_tab, error);
   }
 
@@ -11736,16 +13917,23 @@ sub_select(JOIN *join,JOIN_TAB *join_tab,bool end_of_records)
 
   if (rc == NESTED_LOOP_NO_MORE_ROWS)
     rc= NESTED_LOOP_OK;
-  return rc;
+  DBUG_RETURN(rc);
 }
 
 
 /**
-  Process one record of the nested loop join.
-
-    This function will evaluate parts of WHERE/ON clauses that are
-    applicable to the partial record on hand and in case of success
-    submit this record to the next level of the nested loop.
+  @brief Process one row of the nested loop join.
+
+  This function will evaluate parts of WHERE/ON clauses that are
+  applicable to the partial row on hand and in case of success
+  submit this row to the next level of the nested loop.
+
+  @param  join     - The join object
+  @param  join_tab - The most inner join_tab being processed
+  @param  error > 0: Error, terminate processing
+                = 0: (Partial) row is available
+                < 0: No more rows available at this level
+  @return Nested loop state (Ok, No_more_rows, Error, Killed)
 */
 
 static enum_nested_loop_state
@@ -11757,16 +13945,19 @@ evaluate_join_record(JOIN *join, JOIN_TAB *join_tab,
   COND *select_cond= join_tab->select_cond;
   bool select_cond_result= TRUE;
 
+  DBUG_ENTER("evaluate_join_record");
+  DBUG_PRINT("enter",
+             ("evaluate_join_record join: %p join_tab: %p"
+              " cond: %p error: %d", join, join_tab, select_cond, error));
   if (error > 0 || (join->thd->is_error()))     // Fatal error
-    return NESTED_LOOP_ERROR;
+    DBUG_RETURN(NESTED_LOOP_ERROR);
   if (error < 0)
-    return NESTED_LOOP_NO_MORE_ROWS;
+    DBUG_RETURN(NESTED_LOOP_NO_MORE_ROWS);
   if (join->thd->killed)			// Aborted by user
   {
     join->thd->send_kill_message();
-    return NESTED_LOOP_KILLED;               /* purecov: inspected */
+    DBUG_RETURN(NESTED_LOOP_KILLED);            /* purecov: inspected */
   }
-  DBUG_PRINT("info", ("select cond 0x%lx", (ulong)select_cond));
 
   update_virtual_fields(join->thd, join_tab->table);
 
@@ -11776,7 +13967,7 @@ evaluate_join_record(JOIN *join, JOIN_TAB *join_tab,
 
     /* check for errors evaluating the condition */
     if (join->thd->is_error())
-      return NESTED_LOOP_ERROR;
+      DBUG_RETURN(NESTED_LOOP_ERROR);
   }
 
   if (!select_cond || select_cond_result)
@@ -11822,7 +14013,7 @@ evaluate_join_record(JOIN *join, JOIN_TAB *join_tab,
               not to the last table of the current nest level.
             */
             join->return_tab= tab;
-            return NESTED_LOOP_OK;
+            DBUG_RETURN(NESTED_LOOP_OK);
           }
         }
       }
@@ -11837,7 +14028,26 @@ evaluate_join_record(JOIN *join, JOIN_TAB *join_tab,
     }
 
     if (use_not_exists_opt)
-      return NESTED_LOOP_NO_MORE_ROWS;
+      DBUG_RETURN(NESTED_LOOP_NO_MORE_ROWS);
+    JOIN_TAB *return_tab= join->return_tab;
+    join_tab->found_match= TRUE;
+
+    if (join_tab->check_weed_out_table && found)
+    {
+      int res= do_sj_dups_weedout(join->thd, join_tab->check_weed_out_table);
+      if (res == -1)
+        DBUG_RETURN(NESTED_LOOP_ERROR);
+      else if (res == 1)
+        found= FALSE;
+    }
+    else if (join_tab->do_firstmatch)
+    {
+      /* 
+        We should return to the join_tab->do_firstmatch after we have 
+        enumerated all the suffixes for current prefix row combination
+      */
+      return_tab= join_tab->do_firstmatch;
+    }
 
     /*
       It was not just a return to lower loop level when one
@@ -11855,16 +14065,19 @@ evaluate_join_record(JOIN *join, JOIN_TAB *join_tab,
       /* A match from join_tab is found for the current partial join. */
       rc= (*join_tab->next_select)(join, join_tab+1, 0);
       if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
-        return rc;
+        DBUG_RETURN(rc);
+      if (return_tab < join->return_tab)
+        join->return_tab= return_tab;
+
       if (join->return_tab < join_tab)
-        return NESTED_LOOP_OK;
+        DBUG_RETURN(NESTED_LOOP_OK);
       /*
         Test if this was a SELECT DISTINCT query on a table that
         was not in the field list;  In this case we can abort if
         we found a row, as no new rows can be added to the result.
       */
       if (not_used_in_distinct && found_records != join->found_records)
-        return NESTED_LOOP_NO_MORE_ROWS;
+        DBUG_RETURN(NESTED_LOOP_NO_MORE_ROWS);
     }
     else
       join_tab->read_record.unlock_row(join_tab);
@@ -11879,10 +14092,9 @@ evaluate_join_record(JOIN *join, JOIN_TAB *join_tab,
     join->thd->row_count++;
     join_tab->read_record.unlock_row(join_tab);
   }
-  return NESTED_LOOP_OK;
+  DBUG_RETURN(NESTED_LOOP_OK);
 }
 
-
 /**
 
   @details
@@ -11949,103 +14161,6 @@ evaluate_null_complemented_join_record(JOIN *join, JOIN_TAB *join_tab)
   return (*join_tab->next_select)(join, join_tab+1, 0);
 }
 
-
-static enum_nested_loop_state
-flush_cached_records(JOIN *join,JOIN_TAB *join_tab,bool skip_last)
-{
-  enum_nested_loop_state rc= NESTED_LOOP_OK;
-  int error;
-  READ_RECORD *info;
-  SQL_SELECT *select;
-
-  join_tab->table->null_row= 0;
-  if (!join_tab->cache.records)
-    return NESTED_LOOP_OK;                      /* Nothing to do */
-  if (skip_last)
-    (void) store_record_in_cache(&join_tab->cache); // Must save this for later
-  if (join_tab->use_quick == 2)
-  {
-    if (join_tab->select->quick)
-    {					/* Used quick select last. reset it */
-      delete join_tab->select->quick;
-      join_tab->select->quick=0;
-    }
-  }
-  /* read through all records */
-  if ((error=join_init_read_record(join_tab)))
-  {
-    reset_cache_write(&join_tab->cache);
-    return error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR;
-  }
-
-  for (JOIN_TAB *tmp= join_tab-1;
-       tmp >= join->join_tab && !tmp->cache.buff; tmp--)
-  {
-    tmp->status=tmp->table->status;
-    tmp->table->status=0;
-  }
-
-  info= &join_tab->read_record;
-  select= join_tab->select;
-
-  do
-  {
-    int err= 0;
-    if (join->thd->killed)
-    {
-      join->thd->send_kill_message();
-      return NESTED_LOOP_KILLED; // Aborted by user /* purecov: inspected */
-    }
-    if (rc == NESTED_LOOP_OK)
-      update_virtual_fields(join->thd, join_tab->table);
-    if (rc == NESTED_LOOP_OK &&
-        (!join_tab->cache.select ||
-         (err= join_tab->cache.select->skip_record(join->thd)) != 0 ))
-    {
-      if (err < 0)
-      {
-        reset_cache_write(&join_tab->cache);
-        return NESTED_LOOP_ERROR;
-      }
-
-      reset_cache_read(&join_tab->cache);
-      for (uint i= (join_tab->cache.records- (skip_last ? 1 : 0)) ; i-- > 0 ;)
-      {
-	read_cached_record(join_tab);
-        err= 0;
-	if (!select || (err= select->skip_record(join->thd)) != 0)
-        {
-          if (err < 0)
-          {
-            reset_cache_write(&join_tab->cache);
-            return NESTED_LOOP_ERROR;
-          }
-          rc= (join_tab->next_select)(join,join_tab+1,0);
-	  if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
-          {
-            reset_cache_write(&join_tab->cache);
-            return rc;
-          }
-        }
-      }
-    }
-      
-    rc= NESTED_LOOP_OK;
-
-  } while (!(error=info->read_record(info)));
-
-  if (skip_last)
-    read_cached_record(join_tab);		// Restore current record
-  reset_cache_write(&join_tab->cache);
-  if (error > 0)				// Fatal error
-    return NESTED_LOOP_ERROR;                   /* purecov: inspected */
-  for (JOIN_TAB *tmp2= join_tab-1;
-       tmp2 >= join->join_tab && !tmp2->cache.buff; tmp2--)
-    tmp2->table->status=tmp2->status;
-  return NESTED_LOOP_OK;
-}
-
-
 /*****************************************************************************
   The different ways to read a record
   Returns -1 if row was not found, 0 if row was found and 1 on errors
@@ -12133,7 +14248,14 @@ join_read_const_table(JOIN_TAB *tab, POSITION *pos)
 	DBUG_RETURN(error);
     }
   }
-  if (*tab->on_expr_ref && !table->null_row)
+  /* 
+     Evaluate an on-expression only if it is not considered expensive.
+     This mainly prevents executing subqueries in optimization phase.
+     This is necessary since proper setup for such execution has not been
+     done at this stage.
+  */
+  if (*tab->on_expr_ref && !table->null_row && 
+      !(*tab->on_expr_ref)->is_expensive())
   {
 #if !defined(DBUG_OFF) && defined(NOT_USING_ITEM_EQUAL)
     /*
@@ -12253,25 +14375,55 @@ join_read_const(JOIN_TAB *tab)
   return table->status ? -1 : 0;
 }
 
+/*
+  eq_ref access method implementation: "read_first" function
+
+  SYNOPSIS
+    join_read_key()
+      tab  JOIN_TAB of the accessed table
+
+  DESCRIPTION
+    This is the "read_first" function for the eq_ref access method. It differs
+    from the ref access function in that it has a one-element lookup
+    cache (see cmp_buffer_with_ref)
+
+  RETURN
+    0  - Ok
+   -1  - Row not found 
+    1  - Error
+*/
+
 
 static int
 join_read_key(JOIN_TAB *tab)
 {
-  int error;
-  TABLE *table= tab->table;
+  return join_read_key2(tab->join->thd, tab, tab->table, &tab->ref);
+}
 
+
+/*
+  eq_ref access handler but generalized a bit to support TABLE and TABLE_REF
+  not from the join_tab. See join_read_key for detailed synopsis.
+*/
+int join_read_key2(THD *thd, JOIN_TAB *tab, TABLE *table, TABLE_REF *table_ref)
+{
+  int error;
   if (!table->file->inited)
   {
-    if ((error= table->file->ha_index_init(tab->ref.key, tab->sorted)))
+    /* tab may be NULL here; in that case sorted index access is requested */
+    if ((error= table->file->ha_index_init(table_ref->key,
+                                           (tab ? tab->sorted : TRUE))))
     {
       table->file->print_error(error, MYF(0));/* purecov: inspected */
       return 1;                               /* purecov: inspected */
     }
   }
-  if (cmp_buffer_with_ref(tab) ||
+
+  /* TODO: Why don't we do "Late NULLs Filtering" here? */
+  if (cmp_buffer_with_ref(thd, table, table_ref) ||
       (table->status & (STATUS_GARBAGE | STATUS_NO_PARENT | STATUS_NULL_ROW)))
   {
-    if (tab->ref.key_err)
+    if (table_ref->key_err)
     {
       table->status=STATUS_NOT_FOUND;
       return -1;
@@ -12280,28 +14426,28 @@ join_read_key(JOIN_TAB *tab)
       Moving away from the current record. Unlock the row
       in the handler if it did not match the partial WHERE.
     */
-    if (tab->ref.has_record && tab->ref.use_count == 0)
+    if (tab && tab->ref.has_record && tab->ref.use_count == 0)
     {
       tab->read_record.file->unlock_row();
-      tab->ref.has_record= FALSE;
+      table_ref->has_record= FALSE;
     }
-    error= table->file->ha_index_read_map(table->record[0],
-                                          tab->ref.key_buff,
-                                          make_prev_keypart_map(tab->ref.key_parts),
-                                          HA_READ_KEY_EXACT);
+    error=table->file->ha_index_read_map(table->record[0],
+                                  table_ref->key_buff,
+                                  make_prev_keypart_map(table_ref->key_parts),
+                                  HA_READ_KEY_EXACT);
     if (error && error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE)
       return report_error(table, error);
 
     if (! error)
     {
-      tab->ref.has_record= TRUE;
-      tab->ref.use_count= 1;
+      table_ref->has_record= TRUE;
+      table_ref->use_count= 1;
     }
   }
   else if (table->status == 0)
   {
-    DBUG_ASSERT(tab->ref.has_record);
-    tab->ref.use_count++;
+    DBUG_ASSERT(table_ref->has_record);
+    table_ref->use_count++;
   }
   table->null_row=0;
   return table->status ? -1 : 0;
@@ -12360,13 +14506,6 @@ join_read_always_key(JOIN_TAB *tab)
     }
   }
 
-  /* Perform "Late NULLs Filtering" (see internals manual for explanations) */
-  for (uint i= 0 ; i < tab->ref.key_parts ; i++)
-  {
-    if ((tab->ref.null_rejecting & 1 << i) && tab->ref.items[i]->is_null())
-        return -1;
-  }
-
   if (cp_buffer_from_ref(tab->join->thd, table, &tab->ref))
     return -1;
   if ((error= table->file->ha_index_read_map(table->record[0],
@@ -12472,7 +14611,6 @@ join_init_quick_read_record(JOIN_TAB *tab)
 }
 
 
-int rr_sequential(READ_RECORD *info);
 int init_read_record_seq(JOIN_TAB *tab)
 {
   tab->read_record.read_record= rr_sequential;
@@ -12487,12 +14625,18 @@ test_if_quick_select(JOIN_TAB *tab)
   delete tab->select->quick;
   tab->select->quick=0;
   return tab->select->test_quick_select(tab->join->thd, tab->keys,
-					(table_map) 0, HA_POS_ERROR, 0);
+					(table_map) 0, HA_POS_ERROR, 0,
+                                        FALSE);
 }
 
 
-static int
-join_init_read_record(JOIN_TAB *tab)
+static bool
+test_if_use_dynamic_range_scan(JOIN_TAB *join_tab)
+{
+  return join_tab->use_quick == 2 && test_if_quick_select(join_tab) > 0;
+}
+
+int join_init_read_record(JOIN_TAB *tab)
 {
   if (tab->select && tab->select->quick && tab->select->quick->reset())
     return 1;
@@ -12502,6 +14646,12 @@ join_init_read_record(JOIN_TAB *tab)
   return (*tab->read_record.read_record)(&tab->read_record);
 }
 
+static int
+join_read_record_no_init(JOIN_TAB *tab)
+{
+  return (*tab->read_record.read_record)(&tab->read_record);
+}
+
 
 static int
 join_read_first(JOIN_TAB *tab)
@@ -12747,7 +14897,7 @@ end_send(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
 
 
 	/* ARGSUSED */
-static enum_nested_loop_state
+enum_nested_loop_state
 end_send_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
 	       bool end_of_records)
 {
@@ -12902,12 +15052,14 @@ end_write(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
     {
       int error;
       join->found_records++;
-      if ((error= table->file->ha_write_row(table->record[0])))
+      if ((error= table->file->ha_write_tmp_row(table->record[0])))
       {
         if (!table->file->is_fatal_error(error, HA_CHECK_DUP))
 	  goto end;
-	if (create_internal_tmp_table_from_heap(join->thd, table, &join->tmp_table_param,
-				    error,1))
+	if (create_internal_tmp_table_from_heap(join->thd, table, 
+                                                join->tmp_table_param.start_recinfo,
+                                                &join->tmp_table_param.recinfo,
+                                                error,1))
 	  DBUG_RETURN(NESTED_LOOP_ERROR);        // Not a table_is_full error
 	table->s->uniques=0;			// To ensure rows are the same
       }
@@ -12989,10 +15141,11 @@ end_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
   init_tmptable_sum_functions(join->sum_funcs);
   if (copy_funcs(join->tmp_table_param.items_to_copy, join->thd))
     DBUG_RETURN(NESTED_LOOP_ERROR);           /* purecov: inspected */
-  if ((error= table->file->ha_write_row(table->record[0])))
+  if ((error= table->file->ha_write_tmp_row(table->record[0])))
   {
     if (create_internal_tmp_table_from_heap(join->thd, table,
-                                            &join->tmp_table_param,
+                                            join->tmp_table_param.start_recinfo,
+                                            &join->tmp_table_param.recinfo,
                                             error, 0))
       DBUG_RETURN(NESTED_LOOP_ERROR);            // Not a table_is_full error
     /* Change method to update rows */
@@ -13031,7 +15184,7 @@ end_unique_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
   if (copy_funcs(join->tmp_table_param.items_to_copy, join->thd))
     DBUG_RETURN(NESTED_LOOP_ERROR);           /* purecov: inspected */
 
-  if (!(error= table->file->ha_write_row(table->record[0])))
+  if (!(error= table->file->ha_write_tmp_row(table->record[0])))
     join->send_records++;			// New group
   else
   {
@@ -13059,7 +15212,7 @@ end_unique_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
 
 
 	/* ARGSUSED */
-static enum_nested_loop_state
+enum_nested_loop_state
 end_write_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
 		bool end_of_records)
 {
@@ -13091,10 +15244,12 @@ end_write_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
                        join->sum_funcs_end[send_group_parts]);
 	if (!join->having || join->having->val_int())
 	{
-          int error= table->file->ha_write_row(table->record[0]);
-          if (error && create_internal_tmp_table_from_heap(join->thd, table,
-                                               &join->tmp_table_param,
-                                               error, 0))
+          int error= table->file->ha_write_tmp_row(table->record[0]);
+          if (error && 
+              create_internal_tmp_table_from_heap(join->thd, table,
+                                                  join->tmp_table_param.start_recinfo,
+                                                  &join->tmp_table_param.recinfo,
+                                                  error, 0))
 	    DBUG_RETURN(NESTED_LOOP_ERROR);
         }
         if (join->rollup.state != ROLLUP::STATE_NONE)
@@ -13146,12 +15301,25 @@ end_write_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
     1 if right_item is used removable reference key on left_item
 */
 
-static bool test_if_ref(Item_field *left_item,Item *right_item)
+bool test_if_ref(Item *root_cond, Item_field *left_item,Item *right_item)
 {
   Field *field=left_item->field;
-  // No need to change const test. We also have to keep tests on LEFT JOIN
-  if (!field->table->const_table && !field->table->maybe_null)
+  JOIN_TAB *join_tab= field->table->reginfo.join_tab;
+  // No need to change const test
+  if (!field->table->const_table && join_tab &&
+      !join_tab->is_ref_for_hash_join() &&
+      (!join_tab->first_inner ||
+       *join_tab->first_inner->on_expr_ref == root_cond))
   {
+    // Cond guards
+    for (uint i = 0; i < join_tab->ref.key_parts; i++)
+    {
+      if (join_tab->ref.cond_guards[i])
+      {
+        return FALSE;
+      }
+    }
+    //
     Item *ref_item=part_of_refkey(field->table,field);
     if (ref_item && ref_item->eq(right_item,1))
     {
@@ -13182,10 +15350,181 @@ static bool test_if_ref(Item_field *left_item,Item *right_item)
 }
 
 
+
+/*
+  Extract a condition that can be checked after reading given table
+  
+  SYNOPSIS
+    make_cond_for_table()
+      cond         Condition to analyze
+      tables       Tables for which "current field values" are available
+      used_table   Table that we're extracting the condition for (may
+                   also include PSEUDO_TABLE_BITS)
+      exclude_expensive_cond  Do not push expensive conditions
+      retain_ref_cond         Retain ref conditions
+
+  DESCRIPTION
+    Extract the condition that can be checked after reading the table
+    specified in 'used_table', given that current-field values for tables
+    specified in 'tables' bitmap are available.
+
+    The function assumes that
+      - Constant parts of the condition have already been checked.
+      - Conditions that could be checked for tables in 'tables' have already
+        been checked.
+        
+    The function takes into account that some parts of the condition are
+    guaranteed to be true by employed 'ref' access methods (the code that
+    does this is located at the end, search down for "EQ_FUNC").
+
+
+  SEE ALSO 
+    make_cond_for_info_schema uses similar algorithm
+
+  RETURN
+    Extracted condition
+*/
+
+static Item *
+make_cond_for_table(Item *cond, table_map tables, table_map used_table,
+                    bool exclude_expensive_cond, bool retain_ref_cond)
+{
+  return make_cond_for_table_from_pred(cond, cond, tables, used_table,
+                                       exclude_expensive_cond,
+                                       retain_ref_cond);
+}
+               
+static Item *
+make_cond_for_table_from_pred(Item *root_cond, Item *cond,
+                              table_map tables, table_map used_table,
+                              bool exclude_expensive_cond,
+                              bool retain_ref_cond)
+
+{
+  if (used_table && !(cond->used_tables() & used_table) &&
+      /*
+        Exclude constant conditions not checked at optimization time if
+        the table we are pushing conditions to is the first one.
+        As a result, such conditions are not considered as already checked
+        and will be checked at execution time, attached to the first table.
+
+        psergey: TODO: "used_table & 1" doesn't make sense in nearly any
+        context. Look at setup_table_map(), table bits reflect the order 
+        the tables were encountered by the parser. Check what we should
+        replace this condition with.
+      */
+      !((used_table & 1) && cond->is_expensive()))
+    return (COND*) 0;				// Already checked
+  if (cond->type() == Item::COND_ITEM)
+  {
+    if (((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC)
+    {
+      /* Create new top level AND item */
+      Item_cond_and *new_cond=new Item_cond_and;
+      if (!new_cond)
+	return (COND*) 0;			// OOM /* purecov: inspected */
+      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+	Item *fix=make_cond_for_table_from_pred(root_cond, item, 
+                                                tables, used_table,
+                                                exclude_expensive_cond,
+                                                retain_ref_cond);
+	if (fix)
+	  new_cond->argument_list()->push_back(fix);
+      }
+      switch (new_cond->argument_list()->elements) {
+      case 0:
+	return (COND*) 0;			// Always true
+      case 1:
+	return new_cond->argument_list()->head();
+      default:
+	/*
+	  Item_cond_and do not need fix_fields for execution, its parameters
+	  are fixed or do not need fix_fields, too
+	*/
+	new_cond->quick_fix_field();
+	new_cond->used_tables_cache=
+	  ((Item_cond_and*) cond)->used_tables_cache &
+	  tables;
+	return new_cond;
+      }
+    }
+    else
+    {						// Or list
+      Item_cond_or *new_cond=new Item_cond_or;
+      if (!new_cond)
+	return (COND*) 0;			// OOM /* purecov: inspected */
+      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+	Item *fix=make_cond_for_table_from_pred(root_cond, item,
+                                                tables, 0L,
+                                                exclude_expensive_cond,
+                                                retain_ref_cond);
+	if (!fix)
+	  return (COND*) 0;			// Always true
+	new_cond->argument_list()->push_back(fix);
+      }
+      /*
+	Item_cond_or does not need fix_fields for execution, its parameters
+	are fixed or do not need fix_fields, too
+      */
+      new_cond->quick_fix_field();
+      new_cond->used_tables_cache= ((Item_cond_or*) cond)->used_tables_cache;
+      new_cond->top_level_item();
+      return new_cond;
+    }
+  }
+
+  /*
+    Because the following test takes a while and it can be done
+    table_count times, we mark each item that we have examined with the result
+    of the test
+  */
+  if ((cond->marker == 3 && !retain_ref_cond) || 
+      (cond->used_tables() & ~tables) ||
+      /*
+        When extracting constant conditions, treat expensive conditions as
+        non-constant, so that they are not evaluated at optimization time.
+      */
+      (!used_table && exclude_expensive_cond && cond->is_expensive()))
+    return (COND*) 0;				// Can't check this yet
+  if (cond->marker == 2 || cond->eq_cmp_result() == Item::COND_OK)
+    return cond;				// Not boolean op
+
+  if (cond->type() == Item::FUNC_ITEM && 
+      ((Item_func*) cond)->functype() == Item_func::EQ_FUNC)
+  {
+    Item *left_item=	((Item_func*) cond)->arguments()[0]->real_item();
+    Item *right_item= ((Item_func*) cond)->arguments()[1]->real_item();
+    if (left_item->type() == Item::FIELD_ITEM && !retain_ref_cond &&
+	test_if_ref(root_cond, (Item_field*) left_item,right_item))
+    {
+      cond->marker=3;			// Checked when read
+      return (COND*) 0;
+    }
+    if (right_item->type() == Item::FIELD_ITEM && !retain_ref_cond &&
+	test_if_ref(root_cond, (Item_field*) right_item,left_item))
+    {
+      cond->marker=3;			// Checked when read
+      return (COND*) 0;
+    }
+  }
+  cond->marker=2;
+  return cond;
+}
+
+
+
 static COND *
-make_cond_for_table(COND *cond, table_map tables, table_map used_table)
+make_cond_after_sjm(Item *root_cond, Item *cond, table_map tables, 
+                    table_map sjm_tables)
 {
-  if (used_table && !(cond->used_tables() & used_table))
+  if ((!(cond->used_tables() & ~tables) || 
+       !(cond->used_tables() & ~sjm_tables)))
     return (COND*) 0;				// Already checked
   if (cond->type() == Item::COND_ITEM)
   {
@@ -13199,7 +15538,7 @@ make_cond_for_table(COND *cond, table_map tables, table_map used_table)
       Item *item;
       while ((item=li++))
       {
-	Item *fix=make_cond_for_table(item,tables,used_table);
+	Item *fix=make_cond_after_sjm(root_cond, item, tables, sjm_tables);
 	if (fix)
 	  new_cond->argument_list()->push_back(fix);
       }
@@ -13229,7 +15568,7 @@ make_cond_for_table(COND *cond, table_map tables, table_map used_table)
       Item *item;
       while ((item=li++))
       {
-	Item *fix=make_cond_for_table(item,tables,0L);
+	Item *fix= make_cond_after_sjm(root_cond, item, tables, 0L);
 	if (!fix)
 	  return (COND*) 0;			// Always true
 	new_cond->argument_list()->push_back(fix);
@@ -13251,23 +15590,27 @@ make_cond_for_table(COND *cond, table_map tables, table_map used_table)
     of the test
   */
 
-  if (cond->marker == 3 || (cond->used_tables() & ~tables))
+  if (cond->marker == 3 || (cond->used_tables() & ~(tables | sjm_tables)))
     return (COND*) 0;				// Can't check this yet
   if (cond->marker == 2 || cond->eq_cmp_result() == Item::COND_OK)
     return cond;				// Not boolean op
 
+  /* 
+    Remove equalities that are guaranteed to be true by use of 'ref' access
+    method
+  */
   if (((Item_func*) cond)->functype() == Item_func::EQ_FUNC)
   {
-    Item *left_item=	((Item_func*) cond)->arguments()[0];
-    Item *right_item= ((Item_func*) cond)->arguments()[1];
+    Item *left_item= ((Item_func*) cond)->arguments()[0]->real_item();
+    Item *right_item= ((Item_func*) cond)->arguments()[1]->real_item();
     if (left_item->type() == Item::FIELD_ITEM &&
-	test_if_ref((Item_field*) left_item,right_item))
+	test_if_ref(root_cond, (Item_field*) left_item,right_item))
     {
       cond->marker=3;			// Checked when read
       return (COND*) 0;
     }
     if (right_item->type() == Item::FIELD_ITEM &&
-	test_if_ref((Item_field*) right_item,left_item))
+	test_if_ref(root_cond, (Item_field*) right_item,left_item))
     {
       cond->marker=3;			// Checked when read
       return (COND*) 0;
@@ -13277,22 +15620,26 @@ make_cond_for_table(COND *cond, table_map tables, table_map used_table)
   return cond;
 }
 
+
 static Item *
 part_of_refkey(TABLE *table,Field *field)
 {
-  if (!table->reginfo.join_tab)
+  JOIN_TAB *join_tab= table->reginfo.join_tab;
+  if (!join_tab)
     return (Item*) 0;             // field from outer non-select (UPDATE,...)
 
-  uint ref_parts=table->reginfo.join_tab->ref.key_parts;
+  uint ref_parts= join_tab->ref.key_parts;
   if (ref_parts)
   {
-    KEY_PART_INFO *key_part=
-      table->key_info[table->reginfo.join_tab->ref.key].key_part;
+    
+    uint key= join_tab->ref.key;
+    KEY *key_info= join_tab->get_keyinfo_by_key_no(key);
+    KEY_PART_INFO *key_part= key_info->key_part;
 
     for (uint part=0 ; part < ref_parts ; part++,key_part++)
       if (field->eq(key_part->field) &&
 	  !(key_part->key_part_flag & (HA_PART_KEY_SEG | HA_NULL_PART)))
-	return table->reginfo.join_tab->ref.items[part];
+	return join_tab->ref.items[part];
   }
   return (Item*) 0;
 }
@@ -13664,7 +16011,7 @@ find_field_in_item_list (Field *field, void *data)
 
 static bool
 test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
-			bool no_changes, key_map *map)
+			bool no_changes, const key_map *map)
 {
   int ref_key;
   uint ref_key_parts;
@@ -13674,6 +16021,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
   SQL_SELECT *select=tab->select;
   key_map usable_keys;
   QUICK_SELECT_I *save_quick= 0;
+  COND *orig_select_cond= 0;
   DBUG_ENTER("test_if_skip_sort_order");
   LINT_INIT(ref_key_parts);
 
@@ -13693,7 +16041,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
     }
     usable_keys.intersect(((Item_field*) item)->field->part_of_sortkey);
     if (usable_keys.is_clear_all())
-      DBUG_RETURN(0);					// No usable keys
+      goto use_filesort;					// No usable keys
   }
 
   ref_key= -1;
@@ -13703,7 +16051,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
     ref_key=	   tab->ref.key;
     ref_key_parts= tab->ref.key_parts;
     if (tab->type == JT_REF_OR_NULL || tab->type == JT_FT)
-      DBUG_RETURN(0);
+      goto use_filesort;
   }
   else if (select && select->quick)		// Range found by opt_range
   {
@@ -13715,10 +16063,11 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
       by clustered PK values.
     */
   
-    if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE || 
+    if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE ||
+        quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_INTERSECT || 
         quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION || 
         quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT)
-      DBUG_RETURN(0);
+      goto use_filesort;
     ref_key=	   select->quick->index;
     ref_key_parts= select->quick->used_key_parts;
   }
@@ -13740,10 +16089,15 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
       */
       if (table->covering_keys.is_set(ref_key))
 	usable_keys.intersect(table->covering_keys);
+      if (tab->pre_idx_push_select_cond)
+        orig_select_cond= tab->set_cond(tab->pre_idx_push_select_cond);
+
       if ((new_ref_key= test_if_subkey(order, table, ref_key, ref_key_parts,
 				       &usable_keys)) < MAX_KEY)
       {
 	/* Found key that can be used to retrieve data in sorted order */
+        //psergey-mrr:if (tab->pre_idx_push_select_cond)
+        //  tab->select_cond= tab->select->cond= tab->pre_idx_push_select_cond;
 	if (tab->ref.key >= 0)
 	{
           /*
@@ -13757,9 +16111,10 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
           KEYUSE *keyuse= tab->keyuse;
           while (keyuse->key != new_ref_key && keyuse->table == tab->table)
             keyuse++;
+
           if (create_ref_for_key(tab->join, tab, keyuse, 
                                  tab->join->const_table_map))
-            DBUG_RETURN(0);
+            goto use_filesort;
 
           pick_table_access_method(tab);
 	}
@@ -13781,9 +16136,10 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
                                         (tab->join->select_options &
                                          OPTION_FOUND_ROWS) ?
                                         HA_POS_ERROR :
-                                        tab->join->unit->select_limit_cnt,0) <=
+                                        tab->join->unit->select_limit_cnt,0,
+                                        TRUE) <=
               0)
-            DBUG_RETURN(0);
+            goto use_filesort;
 	}
         ref_key= new_ref_key;
       }
@@ -14041,7 +16397,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
                                     join->select_options & OPTION_FOUND_ROWS ?
                                     HA_POS_ERROR :
                                     join->unit->select_limit_cnt,
-                                    0) > 0;
+                                    TRUE, FALSE) > 0;
       }
       if (!no_changes)
       {
@@ -14064,6 +16420,18 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
           }
           if (table->covering_keys.is_set(best_key) && ! table->key_read)
             table->enable_keyread();
+          if (tab->pre_idx_push_select_cond)
+          {
+            COND *tmp_cond= tab->pre_idx_push_select_cond;
+            if (orig_select_cond)
+            {
+              tmp_cond= and_conds(tmp_cond, orig_select_cond);
+              tmp_cond->quick_fix_field();
+            }
+            tab->set_cond(tmp_cond);
+            /* orig_select_cond was merged, no need to restore original one. */
+            orig_select_cond= 0;
+          }
           table->file->ha_index_or_rnd_end();
           if (join->select_options & SELECT_DESCRIBE)
           {
@@ -14104,7 +16472,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
         saved_best_key_parts :  best_key_parts;
     }
     else
-      DBUG_RETURN(0); 
+      goto use_filesort; 
   } 
 
 check_reverse_order:                  
@@ -14119,26 +16487,28 @@ check_reverse_order:
       if (!select->quick->reverse_sorted())
       {
         QUICK_SELECT_DESC *tmp;
+        bool error= FALSE;
         int quick_type= select->quick->get_type();
         if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE ||
+            quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_INTERSECT ||
             quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT ||
             quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION ||
             quick_type == QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX)
         {
           tab->limit= 0;
           select->quick= save_quick;
-          DBUG_RETURN(0);                   // Use filesort
+          goto use_filesort;                   // Use filesort
         }
             
         /* ORDER BY range_key DESC */
 	tmp= new QUICK_SELECT_DESC((QUICK_RANGE_SELECT*)(select->quick),
-                                    used_key_parts);
-	if (!tmp || tmp->error)
+                                    used_key_parts, &error);
+	if (!tmp || error)
 	{
 	  delete tmp;
           select->quick= save_quick;
           tab->limit= 0;
-	  DBUG_RETURN(0);		// Reverse sort not supported
+	  goto use_filesort;		// Reverse sort not supported
 	}
 	select->quick=tmp;
       }
@@ -14157,8 +16527,14 @@ check_reverse_order:
     }
   }
   else if (select && select->quick)
-    select->quick->sorted= 1;
+    select->quick->need_sorted_output();
+  if (orig_select_cond)
+    tab->set_cond(orig_select_cond);
   DBUG_RETURN(1);
+use_filesort:
+  if (orig_select_cond)
+    tab->set_cond(orig_select_cond);
+  DBUG_RETURN(0);
 }
 
 
@@ -14256,7 +16632,7 @@ create_sort_index(THD *thd, JOIN *join, ORDER *order,
 	field, quick will contain an empty record set.
       */
       if (!(select->quick= (tab->type == JT_FT ?
-			    new FT_SELECT(thd, table, tab->ref.key) :
+			    get_ft_select(thd, table, tab->ref.key) :
 			    get_quick_select_for_ref(thd, table, &tab->ref, 
                                                      tab->found_records))))
 	goto err;
@@ -14293,9 +16669,10 @@ create_sort_index(THD *thd, JOIN *join, ORDER *order,
     select->cleanup();				// filesort did select
     tab->select= 0;
     table->quick_keys.clear_all();  // as far as we cleanup select->quick
+    table->intersect_keys.clear_all();
     table->sort.io_cache= tablesort_result_cache;
   }
-  tab->select_cond=0;
+  tab->set_select_cond(NULL, __LINE__);
   tab->last_inner= 0;
   tab->first_unmatched= 0;
   tab->type=JT_ALL;				// Read with normal read_record
@@ -14319,7 +16696,8 @@ static bool fix_having(JOIN *join, Item **having)
   table_map used_tables= join->const_table_map | table->table->map;
 
   DBUG_EXECUTE("where",print_where(*having,"having", QT_ORDINARY););
-  Item* sort_table_cond=make_cond_for_table(*having,used_tables,used_tables);
+  Item* sort_table_cond=make_cond_for_table(*having, used_tables, used_tables,
+                                            FALSE, FALSE);
   if (sort_table_cond)
   {
     if (!table->select)
@@ -14332,12 +16710,13 @@ static bool fix_having(JOIN *join, Item **having)
 						   sort_table_cond)) ||
 	  table->select->cond->fix_fields(join->thd, &table->select->cond))
 	return 1;
-    table->select_cond=table->select->cond;
+    table->set_select_cond(table->select->cond, __LINE__);
     table->select_cond->top_level_item();
     DBUG_EXECUTE("where",print_where(table->select_cond,
 				     "select and having",
                                      QT_ORDINARY););
-    *having=make_cond_for_table(*having,~ (table_map) 0,~used_tables);
+    *having= make_cond_for_table(*having,~ (table_map) 0,~used_tables,
+                                 FALSE, FALSE);
     DBUG_EXECUTE("where",
                  print_where(*having,"having after make_cond", QT_ORDINARY););
   }
@@ -14685,277 +17064,53 @@ SORT_FIELD *make_unireg_sortorder(ORDER *order, uint *length,
 }
 
 
-/*****************************************************************************
-  Fill join cache with packed records
-  Records are stored in tab->cache.buffer and last record in
-  last record is stored with pointers to blobs to support very big
-  records
-******************************************************************************/
 
-static int
-join_init_cache(THD *thd,JOIN_TAB *tables,uint table_count)
-{
-  reg1 uint i;
-  uint length, blobs;
-  size_t size;
-  CACHE_FIELD *copy,**blob_ptr;
-  JOIN_CACHE  *cache;
-  JOIN_TAB *join_tab;
-  DBUG_ENTER("join_init_cache");
-
-  cache= &tables[table_count].cache;
-  cache->fields=blobs=0;
-
-  join_tab=tables;
-  for (i=0 ; i < table_count ; i++,join_tab++)
-  {
-    if (!join_tab->used_fieldlength)		/* Not calced yet */
-      calc_used_field_length(thd, join_tab);
-    cache->fields+=join_tab->used_fields;
-    blobs+=join_tab->used_blobs;
-  }
-  if (!(cache->field=(CACHE_FIELD*)
-	sql_alloc(sizeof(CACHE_FIELD)*(cache->fields+table_count*2)+(blobs+1)*
-
-		  sizeof(CACHE_FIELD*))))
-  {
-    my_free((uchar*) cache->buff,MYF(0));		/* purecov: inspected */
-    cache->buff=0;				/* purecov: inspected */
-    DBUG_RETURN(1);				/* purecov: inspected */
-  }
-  copy=cache->field;
-  blob_ptr=cache->blob_ptr=(CACHE_FIELD**)
-    (cache->field+cache->fields+table_count*2);
+/*
+  eq_ref: Create the lookup key and check if it is the same as saved key
 
-  length=0;
-  for (i=0 ; i < table_count ; i++)
-  {
-    bool have_bit_fields= FALSE;
-    uint null_fields=0,used_fields;
-    Field **f_ptr,*field;
-    MY_BITMAP *read_set= tables[i].table->read_set;
-    for (f_ptr=tables[i].table->field,used_fields=tables[i].used_fields ;
-	 used_fields ;
-	 f_ptr++)
-    {
-      field= *f_ptr;
-      if (bitmap_is_set(read_set, field->field_index))
-      {
-	used_fields--;
-	length+=field->fill_cache_field(copy);
-	if (copy->type == CACHE_BLOB)
-	  (*blob_ptr++)=copy;
-	if (field->real_maybe_null())
-	  null_fields++;
-        if (field->type() == MYSQL_TYPE_BIT &&
-            ((Field_bit*)field)->bit_len)
-          have_bit_fields= TRUE;    
-	copy++;
-      }
-    }
-    /* Copy null bits from table */
-    if (null_fields || have_bit_fields)
-    {						/* must copy null bits */
-      copy->str= tables[i].table->null_flags;
-      copy->length= tables[i].table->s->null_bytes;
-      copy->type=0;
-      copy->field=0;
-      length+=copy->length;
-      copy++;
-      cache->fields++;
-    }
-    /* If outer join table, copy null_row flag */
-    if (tables[i].table->maybe_null)
-    {
-      copy->str= (uchar*) &tables[i].table->null_row;
-      copy->length=sizeof(tables[i].table->null_row);
-      copy->type=0;
-      copy->field=0;
-      length+=copy->length;
-      copy++;
-      cache->fields++;
-    }
-  }
 
-  cache->length=length+blobs*sizeof(char*);
-  cache->blobs=blobs;
-  *blob_ptr=0;					/* End sequentel */
-  size=max(thd->variables.join_buff_size, cache->length);
-  if (!(cache->buff=(uchar*) my_malloc(size,MYF(0))))
-    DBUG_RETURN(1);				/* Don't use cache */ /* purecov: inspected */
-  cache->end=cache->buff+size;
-  reset_cache_write(cache);
-  DBUG_RETURN(0);
-}
 
 
-static ulong
-used_blob_length(CACHE_FIELD **ptr)
-{
-  uint length,blob_length;
-  for (length=0 ; *ptr ; ptr++)
-  {
-    Field_blob *field_blob= (Field_blob *) (*ptr)->field;
-    (*ptr)->blob_length=blob_length= field_blob->get_length();
-    length+=blob_length;
-    field_blob->get_ptr(&(*ptr)->str);
-  }
-  return length;
-}
+  SYNOPSIS
+    cmp_buffer_with_ref()
+      thd      Thread descriptor, passed on to cp_buffer_from_ref()
+      table    The table to read.  This is usually the join tab's table,
+               except for semi-join when we might need to make a lookup in a
+               temptable instead.
+      tab_ref  The structure with methods to collect index lookup tuple.
+               This is usually the join tab's ref, except for the case of when
+               we're doing lookup into semi-join materialization table.
+
+  DESCRIPTION 
+    Used by eq_ref access method: create the index lookup key and check if 
+    we've used this key at previous lookup (If yes, we don't need to repeat
+    the lookup - the record has been already fetched)
 
+  RETURN 
+    TRUE   No cached record for the key, or failed to create the key (due to
+           out-of-domain error)
+    FALSE  The created key is the same as the previous one (and the record 
+           is already in table->record)
+*/
 
 static bool
-store_record_in_cache(JOIN_CACHE *cache)
+cmp_buffer_with_ref(THD *thd, TABLE *table, TABLE_REF *tab_ref)
 {
-  uint length;
-  uchar *pos;
-  CACHE_FIELD *copy,*end_field;
-  bool last_record;
-
-  pos=cache->pos;
-  end_field=cache->field+cache->fields;
-
-  length=cache->length;
-  if (cache->blobs)
-    length+=used_blob_length(cache->blob_ptr);
-  if ((last_record= (length + cache->length > (size_t) (cache->end - pos))))
-    cache->ptr_record=cache->records;
-
-  /*
-    There is room in cache. Put record there
-  */
-  cache->records++;
-  for (copy=cache->field ; copy < end_field; copy++)
+  bool no_prev_key;
+  if (!tab_ref->disable_cache)
   {
-    if (copy->type == CACHE_BLOB)
+    if (!(no_prev_key= tab_ref->key_err))
     {
-      Field_blob *blob_field= (Field_blob *) copy->field;
-      if (last_record)
-      {
-	blob_field->get_image(pos, copy->length+sizeof(char*), 
-                              blob_field->charset());
-	pos+=copy->length+sizeof(char*);
-      }
-      else
-      {
-	blob_field->get_image(pos, copy->length, // blob length
-                              blob_field->charset());
-	memcpy(pos+copy->length,copy->str,copy->blob_length);  // Blob data
-	pos+=copy->length+copy->blob_length;
-      }
-    }
-    else
-    {
-      if (copy->type == CACHE_STRIPPED)
-      {
-	uchar *str,*end;
-        Field *field= copy->field;
-        if (field && field->is_null())
-          end= str= copy->str;
-        else
-        {
-          for (str=copy->str,end= str+copy->length;
-               end > str && end[-1] == ' ' ;
-               end--)
-            ;
-        }
-	length=(uint) (end-str);
-	memcpy(pos+2, str, length);
-        int2store(pos, length);
-	pos+= length+2;
-      }
-      else
-      {
-	memcpy(pos,copy->str,copy->length);
-	pos+=copy->length;
-      }
+      /* Previous access found a row. Copy its key */
+      memcpy(tab_ref->key_buff2, tab_ref->key_buff, tab_ref->key_length);
     }
   }
-  cache->pos=pos;
-  return last_record || (size_t) (cache->end - pos) < cache->length;
-}
-
-
-static void
-reset_cache_read(JOIN_CACHE *cache)
-{
-  cache->record_nr=0;
-  cache->pos=cache->buff;
-}
-
-
-static void reset_cache_write(JOIN_CACHE *cache)
-{
-  reset_cache_read(cache);
-  cache->records= 0;
-  cache->ptr_record= (uint) ~0;
-}
-
-
-static void
-read_cached_record(JOIN_TAB *tab)
-{
-  uchar *pos;
-  uint length;
-  bool last_record;
-  CACHE_FIELD *copy,*end_field;
-
-  last_record=tab->cache.record_nr++ == tab->cache.ptr_record;
-  pos=tab->cache.pos;
-
-  for (copy=tab->cache.field,end_field=copy+tab->cache.fields ;
-       copy < end_field;
-       copy++)
-  {
-    if (copy->type == CACHE_BLOB)
-    {
-      Field_blob *blob_field= (Field_blob *) copy->field;
-      if (last_record)
-      {
-	blob_field->set_image(pos, copy->length+sizeof(char*),
-                              blob_field->charset());
-	pos+=copy->length+sizeof(char*);
-      }
-      else
-      {
-	blob_field->set_ptr(pos, pos+copy->length);
-	pos+=copy->length + blob_field->get_length();
-      }
-    }
-    else
-    {
-      if (copy->type == CACHE_STRIPPED)
-      {
-        length= uint2korr(pos);
-	memcpy(copy->str, pos+2, length);
-	memset(copy->str+length, ' ', copy->length-length);
-	pos+= 2 + length;
-      }
-      else
-      {
-	memcpy(copy->str,pos,copy->length);
-	pos+=copy->length;
-      }
-    }
-  }
-  tab->cache.pos=pos;
-  return;
-}
-
-
-static bool
-cmp_buffer_with_ref(JOIN_TAB *tab)
-{
-  bool diff;
-  if (!(diff=tab->ref.key_err))
-  {
-    memcpy(tab->ref.key_buff2, tab->ref.key_buff, tab->ref.key_length);
-  }
-  if ((tab->ref.key_err= cp_buffer_from_ref(tab->join->thd, tab->table,
-                                            &tab->ref)) ||
-      diff)
+  else 
+    no_prev_key= TRUE;
+  if ((tab_ref->key_err= cp_buffer_from_ref(thd, table, tab_ref)) ||
+      no_prev_key)
     return 1;
-  return memcmp(tab->ref.key_buff2, tab->ref.key_buff, tab->ref.key_length)
+  return memcmp(tab_ref->key_buff2, tab_ref->key_buff, tab_ref->key_length)
     != 0;
 }
 
@@ -15653,7 +17808,7 @@ alloc_group_fields(JOIN *join,ORDER *group)
   {
     for (; group ; group=group->next)
     {
-      Cached_item *tmp=new_Cached_item(join->thd, *group->item);
+      Cached_item *tmp=new_Cached_item(join->thd, *group->item, TRUE);
       if (!tmp || join->group_fields.push_front(tmp))
 	return TRUE;
     }
@@ -15663,6 +17818,38 @@ alloc_group_fields(JOIN *join,ORDER *group)
 }
 
 
+
+/*
+  Test if a single-row cache of items changed, and update the cache.
+
+  @details Test if a list of items that typically represents a result
+  row has changed. cmp() is called on every element (no early exit); it
+  is expected to update the cached value of each item that changed.
+  
+  @param list list of <item, cached_value> pairs stored as Cached_item.
+
+  @return -1 if no item changed, otherwise the index of the changed item
+          closest to the list tail, counted from the tail (tail is 0)
+*/
+
+int test_if_item_cache_changed(List<Cached_item> &list)
+{
+  DBUG_ENTER("test_if_item_cache_changed");
+  List_iterator<Cached_item> li(list);
+  int idx= -1,i;
+  Cached_item *buff;
+
+  for (i=(int) list.elements-1 ; (buff=li++) ; i--)
+  {
+    if (buff->cmp())
+      idx=i;                            // no break: later hits overwrite
+  }
+  DBUG_PRINT("info", ("idx: %d", idx));
+  DBUG_RETURN(idx);
+}
+
+
+
 static int
 test_if_group_changed(List<Cached_item> &list)
 {
@@ -15803,6 +17990,8 @@ setup_copy_fields(THD *thd, TMP_TABLE_PARAM *param,
     }
     else if ((real_pos->type() == Item::FUNC_ITEM ||
 	      real_pos->type() == Item::SUBSELECT_ITEM ||
+              (real_pos->get_cached_item() &&
+               real_pos->get_cached_item()->type() == Item::SUBSELECT_ITEM) ||
 	      real_pos->type() == Item::CACHE_ITEM ||
 	      real_pos->type() == Item::COND_ITEM) &&
 	     !real_pos->with_sum_func)
@@ -16276,11 +18465,12 @@ static bool add_ref_to_table_cond(THD *thd, JOIN_TAB *join_tab)
   {
     if (join_tab->select->cond)
       error=(int) cond->add(join_tab->select->cond);
-    join_tab->select_cond=join_tab->select->cond=cond;
+    join_tab->select->cond= cond;
+    join_tab->set_select_cond(cond, __LINE__);
   }
   else if ((join_tab->select= make_select(join_tab->table, 0, 0, cond, 0,
                                           &error)))
-    join_tab->select_cond=cond;
+    join_tab->set_select_cond(cond, __LINE__);
 
   DBUG_RETURN(error ? TRUE : FALSE);
 }
@@ -16735,10 +18925,12 @@ int JOIN::rollup_write_data(uint idx, TABLE *table_arg)
           item->save_in_result_field(1);
       }
       copy_sum_funcs(sum_funcs_end[i+1], sum_funcs_end[i]);
-      if ((write_error= table_arg->file->ha_write_row(table_arg->record[0])))
+      if ((write_error= table_arg->file->ha_write_tmp_row(table_arg->record[0])))
       {
-	if (create_internal_tmp_table_from_heap(thd, table_arg, &tmp_table_param,
-                                    write_error, 0))
+	if (create_internal_tmp_table_from_heap(thd, table_arg, 
+                                                tmp_table_param.start_recinfo,
+                                                &tmp_table_param.recinfo,
+                                                write_error, 0))
 	  return 1;		     
       }
     }
@@ -16884,24 +19076,39 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
   else
   {
     table_map used_tables=0;
-    for (uint i=0 ; i < join->tables ; i++)
+
+    uchar sjm_nests[MAX_TABLES];
+    uint sjm_nests_cur=0;
+    uint sjm_nests_end= 0;
+    uint end_table= join->tables;
+    bool printing_materialize_nest= FALSE;
+    uint select_id= join->select_lex->select_number;
+
+    for (uint i=0 ; i < end_table ; i++)
     {
       JOIN_TAB *tab=join->join_tab+i;
       TABLE *table=tab->table;
       TABLE_LIST *table_list= tab->table->pos_in_table_list;
       char buff[512]; 
-      char buff1[512], buff2[512], buff3[512];
+      char buff1[512], buff2[512], buff3[512], buff4[512];
       char keylen_str_buf[64];
+      my_bool key_read;
       String extra(buff, sizeof(buff),cs);
       char table_name_buffer[SAFE_NAME_LEN];
       String tmp1(buff1,sizeof(buff1),cs);
       String tmp2(buff2,sizeof(buff2),cs);
       String tmp3(buff3,sizeof(buff3),cs);
+      String tmp4(buff4,sizeof(buff4),cs);
+      char hash_key_prefix[]= "#hash#";
+      KEY *key_info= 0;
+      uint key_len= 0;
+      bool is_hj= tab->type == JT_HASH || tab->type ==JT_HASH_NEXT;
+
       extra.length(0);
       tmp1.length(0);
       tmp2.length(0);
       tmp3.length(0);
-
+      tmp4.length(0);
       quick_type= -1;
 
       /* Don't show eliminated tables */
@@ -16913,22 +19120,103 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
 
       item_list.empty();
       /* id */
-      item_list.push_back(new Item_uint((uint32)
-				       join->select_lex->select_number));
+      item_list.push_back(new Item_uint((uint32)select_id));
       /* select_type */
-      item_list.push_back(new Item_string(join->select_lex->type,
-					  strlen(join->select_lex->type),
-					  cs));
-      if (tab->type == JT_ALL && tab->select && tab->select->quick)
+      const char* stype= printing_materialize_nest? "SUBQUERY" : 
+                                                    join->select_lex->type;
+      item_list.push_back(new Item_string(stype, strlen(stype), cs));
+      
+      /* 
+        Special processing for SJ-Materialization nests: print the fake table
+        and delay printing of the SJM nest contents until later.
+      */
+      uint sj_strategy= join->best_positions[i].sj_strategy;
+      if (sj_is_materialize_strategy(sj_strategy) &&
+          !printing_materialize_nest)
+      {
+        /* table */
+        int len= my_snprintf(table_name_buffer, 
+                             sizeof(table_name_buffer)-1,
+                             "subselect%d", 
+                             tab->emb_sj_nest->sj_subq_pred->get_identifier());
+	item_list.push_back(new Item_string(table_name_buffer, len, cs));
+        /* partitions */
+        if (join->thd->lex->describe & DESCRIBE_PARTITIONS)
+          item_list.push_back(item_null);
+        /* type */
+        uint type= (sj_strategy == SJ_OPT_MATERIALIZE_SCAN)? JT_ALL : JT_EQ_REF;
+        item_list.push_back(new Item_string(join_type_str[type],
+                                            strlen(join_type_str[type]),
+                                            cs));
+        /* possible_keys */
+	item_list.push_back(new Item_string("unique_key", 
+                                            strlen("unique_key"), cs));
+        if (sj_strategy == SJ_OPT_MATERIALIZE_SCAN)
+        {
+          item_list.push_back(item_null); /* key */
+          item_list.push_back(item_null); /* key_len */
+          item_list.push_back(item_null); /* ref */
+        }
+        else
+        {
+          /* key */
+          item_list.push_back(new Item_string("unique_key", strlen("unique_key"), cs));
+          /* key_len */
+          uint klen= tab->emb_sj_nest->sj_mat_info->table->key_info[0].key_length;
+          uint buflen= longlong10_to_str(klen, keylen_str_buf, 10) - keylen_str_buf;
+          item_list.push_back(new Item_string(keylen_str_buf, buflen, cs));
+          /* ref */
+          item_list.push_back(new Item_string("func", strlen("func"), cs));
+        }
+        /* rows */
+        ha_rows rows= (ha_rows) ((sj_strategy == SJ_OPT_MATERIALIZE_SCAN)?
+                                 tab->emb_sj_nest->sj_mat_info->rows : 1.0);
+        item_list.push_back(new Item_int((longlong)rows, 
+                                         MY_INT64_NUM_DECIMAL_DIGITS));
+        /* filtered */
+        if (join->thd->lex->describe & DESCRIBE_EXTENDED)
+          item_list.push_back(new Item_float(1.0, 2));
+        
+        /* Extra */
+	if (need_tmp_table)
+	{
+	  need_tmp_table=0;
+	  extra.append(STRING_WITH_LEN("; Using temporary"));
+	}
+	if (need_order)
+	{
+	  need_order=0;
+	  extra.append(STRING_WITH_LEN("; Using filesort"));
+	}
+        /* Skip initial "; "*/
+        const char *str= extra.ptr();
+        uint32 extra_len= extra.length();
+        if (extra_len)
+        {
+          str += 2;
+          extra_len -= 2;
+        }
+	item_list.push_back(new Item_string(str, extra_len, cs));
+
+        /* Register the nest for further processing: */
+        sjm_nests[sjm_nests_end++]= i;
+        i += join->best_positions[i].n_sj_tables-1;
+        goto loop_end;
+      }
+
+      if ((tab->type == JT_ALL || tab->type == JT_HASH) &&
+           tab->select && tab->select->quick)
       {
         quick_type= tab->select->quick->get_type();
         if ((quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE) ||
+            (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_INTERSECT) ||
             (quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT) ||
             (quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION))
-          tab->type = JT_INDEX_MERGE;
+          tab->type= tab->type == JT_ALL ? JT_INDEX_MERGE : JT_HASH_INDEX_MERGE;
         else
-	  tab->type = JT_RANGE;
+	  tab->type= tab->type == JT_ALL ? JT_RANGE : JT_HASH_RANGE;
       }
+
       /* table */
       if (table->derived_select_number)
       {
@@ -16990,45 +19278,66 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
 	item_list.push_back(item_null);
 
       /* Build "key", "key_len", and "ref" values and add them to item_list */
-      if (tab->ref.key_parts)
+      if (tab->type == JT_NEXT)
+      {
+	key_info= table->key_info+tab->index;
+        key_len= key_info->key_length;
+      }
+      else if (tab->ref.key_parts)
+      {
+	key_info= tab->get_keyinfo_by_key_no(tab->ref.key);
+        key_len= tab->ref.key_length;
+      }
+      if (key_info)
       {
-	KEY *key_info=table->key_info+ tab->ref.key;
         register uint length;
-	item_list.push_back(new Item_string(key_info->name,
-					    strlen(key_info->name),
-					    system_charset_info));
-        length= (longlong10_to_str(tab->ref.key_length, keylen_str_buf, 10) - 
+        if (is_hj)
+          tmp2.append(hash_key_prefix, strlen(hash_key_prefix), cs);
+        tmp2.append(key_info->name,  strlen(key_info->name), cs);
+        length= (longlong10_to_str(key_len, keylen_str_buf, 10) - 
                  keylen_str_buf);
-        item_list.push_back(new Item_string(keylen_str_buf, length,
-                                            system_charset_info));
-	for (store_key **ref=tab->ref.key_copy ; *ref ; ref++)
+        tmp3.append(keylen_str_buf, length, cs);
+        if (tab->ref.key_parts)
 	{
-	  if (tmp2.length())
-	    tmp2.append(',');
-	  tmp2.append((*ref)->name(), strlen((*ref)->name()),
-		      system_charset_info);
-	}
-	item_list.push_back(new Item_string(tmp2.ptr(),tmp2.length(),cs));
+	  for (store_key **ref=tab->ref.key_copy ; *ref ; ref++)
+	  {
+	    if (tmp4.length())
+	      tmp4.append(',');
+	    tmp4.append((*ref)->name(), strlen((*ref)->name()), cs);
+          }
+        }
       }
-      else if (tab->type == JT_NEXT)
+      if (is_hj && tab->type != JT_HASH)
       {
-	KEY *key_info=table->key_info+ tab->index;
-        register uint length;
-	item_list.push_back(new Item_string(key_info->name,
-					    strlen(key_info->name),cs));
-        length= (longlong10_to_str(key_info->key_length, keylen_str_buf, 10) - 
-                 keylen_str_buf);
-        item_list.push_back(new Item_string(keylen_str_buf, 
-                                            length,
-                                            system_charset_info));
-	item_list.push_back(item_null);
+        tmp2.append(':');
+        tmp3.append(':');
       }
-      else if (tab->select && tab->select->quick)
+      if (tab->type == JT_HASH_NEXT)
       {
+        register uint length;
+	key_info= table->key_info+tab->index;
+        key_len= key_info->key_length;
+        tmp2.append(key_info->name,  strlen(key_info->name), cs);
+        length= (longlong10_to_str(key_len, keylen_str_buf, 10) - 
+                 keylen_str_buf);
+        tmp3.append(keylen_str_buf, length, cs);
+      }         
+      if (tab->select && tab->select->quick)
         tab->select->quick->add_keys_and_lengths(&tmp2, &tmp3);
-	item_list.push_back(new Item_string(tmp2.ptr(),tmp2.length(),cs));
-	item_list.push_back(new Item_string(tmp3.ptr(),tmp3.length(),cs));
-	item_list.push_back(item_null);
+      if (key_info || (tab->select && tab->select->quick))
+      {
+        if (tmp2.length())
+          item_list.push_back(new Item_string(tmp2.ptr(),tmp2.length(),cs));
+        else
+          item_list.push_back(item_null);
+        if (tmp3.length())
+          item_list.push_back(new Item_string(tmp3.ptr(),tmp3.length(),cs));
+        else
+          item_list.push_back(item_null);
+        if (key_info && tab->type != JT_NEXT)
+          item_list.push_back(new Item_string(tmp4.ptr(),tmp4.length(),cs));
+        else
+          item_list.push_back(item_null);
       }
       else
       {
@@ -17076,7 +19385,7 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
         ha_rows examined_rows;
         if (tab->select && tab->select->quick)
           examined_rows= tab->select->quick->records;
-        else if (tab->type == JT_NEXT || tab->type == JT_ALL)
+        else if (tab->type == JT_NEXT || tab->type == JT_ALL || is_hj)
         {
           if (tab->limit)
             examined_rows= tab->limit;
@@ -17095,6 +19404,17 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
         /* Add "filtered" field to item_list. */
         if (join->thd->lex->describe & DESCRIBE_EXTENDED)
         {
+          /*
+            psergey-todo: 
+              in the code above, we cast to integer when assigning to
+              examined_rows.
+              In the code below, we may divide the original value by the
+              result of converting that same value to integer, which may
+              produce a value that's greater than 100%, which looks very odd.
+              I'm not fixing this right away because that might trigger a wave
+              of small EXPLAIN EXTENDED output changes, which I don't have time
+              to deal with right now.
+          */
           float f= 0.0; 
           if (examined_rows)
             f= (float) (100.0 * join->best_positions[i].records_read /
@@ -17104,7 +19424,7 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
       }
 
       /* Build "Extra" field and add it to item_list. */
-      my_bool key_read=table->key_read;
+      key_read=table->key_read;
       if ((tab->type == JT_NEXT || tab->type == JT_CONST) &&
           table->covering_keys.is_set(tab->index))
 	key_read=1;
@@ -17134,8 +19454,21 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
       }
       else
       {
+        uint keyno= MAX_KEY;
+        if (tab->ref.key_parts)
+          keyno= tab->ref.key;
+        else if (tab->select && tab->select->quick)
+          keyno = tab->select->quick->index;
+
+        if (keyno != MAX_KEY && keyno == table->file->pushed_idx_cond_keyno &&
+            table->file->pushed_idx_cond)
+          extra.append(STRING_WITH_LEN("; Using index condition"));
+        else if (tab->cache_idx_cond)
+          extra.append(STRING_WITH_LEN("; Using index condition(BKA)"));
+
         if (quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION || 
             quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT ||
+            quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_INTERSECT ||
             quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE)
         {
           extra.append(STRING_WITH_LEN("; Using "));
@@ -17197,6 +19530,14 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
         }
 	if (table->reginfo.not_exists_optimize)
 	  extra.append(STRING_WITH_LEN("; Not exists"));
+
+        if (quick_type == QUICK_SELECT_I::QS_TYPE_RANGE &&
+            !(((QUICK_RANGE_SELECT*)(tab->select->quick))->mrr_flags &
+             HA_MRR_USE_DEFAULT_IMPL))
+        {
+	  extra.append(STRING_WITH_LEN("; Using MRR"));
+        }
+
 	if (need_tmp_table)
 	{
 	  need_tmp_table=0;
@@ -17209,6 +19550,56 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
 	}
 	if (distinct & test_all_bits(used_tables,thd->used_tables))
 	  extra.append(STRING_WITH_LEN("; Distinct"));
+        if (tab->loosescan_match_tab)
+        {
+          extra.append(STRING_WITH_LEN("; LooseScan"));
+        }
+
+        if (tab->flush_weedout_table)
+          extra.append(STRING_WITH_LEN("; Start temporary"));
+        if (tab->check_weed_out_table)
+          extra.append(STRING_WITH_LEN("; End temporary"));
+        else if (tab->do_firstmatch)
+        {
+          if (tab->do_firstmatch == join->join_tab - 1)
+            extra.append(STRING_WITH_LEN("; FirstMatch"));
+          else
+          {
+            extra.append(STRING_WITH_LEN("; FirstMatch("));
+            TABLE *prev_table=tab->do_firstmatch->table;
+            if (prev_table->derived_select_number)
+            {
+              char namebuf[NAME_LEN];
+              /* Derived table name generation */
+              int len= my_snprintf(namebuf, sizeof(namebuf)-1,
+                                   "<derived%u>",
+                                   prev_table->derived_select_number);
+              extra.append(namebuf, len);
+            }
+            else
+              extra.append(prev_table->pos_in_table_list->alias);
+            extra.append(STRING_WITH_LEN(")"));
+          }
+        }
+
+        /*
+        if (sj_is_materialize_strategy(sj_strategy))
+        {
+          if (join->best_positions[i].n_sj_tables == 1)
+            extra.append(STRING_WITH_LEN("; Materialize"));
+          else
+          {
+            last_sjm_table= i + join->best_positions[i].n_sj_tables - 1;
+            extra.append(STRING_WITH_LEN("; Start materialize"));
+          }
+          if (sj_strategy == SJ_OPT_MATERIALIZE_SCAN)
+              extra.append(STRING_WITH_LEN("; Scan"));
+        }
+        else if (last_sjm_table == i)
+        {
+          extra.append(STRING_WITH_LEN("; End materialize"));
+        }
+        */
 
         for (uint part= 0; part < tab->ref.key_parts; part++)
         {
@@ -17218,8 +19609,12 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
             break;
           }
         }
-        if (i > 0 && tab[-1].next_select == sub_select_cache)
+
+        if (tab->cache)
+	{
           extra.append(STRING_WITH_LEN("; Using join buffer"));
+          tab->cache->print_explain_comment(&extra);
+        }
         
         /* Skip initial "; "*/
         const char *str= extra.ptr();
@@ -17231,6 +19626,15 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
         }
 	item_list.push_back(new Item_string(str, len, cs));
       }
+    loop_end:
+       if (i+1 == end_table && sjm_nests_cur != sjm_nests_end)
+       {
+         printing_materialize_nest= TRUE;
+         i= sjm_nests[sjm_nests_cur++] - 1;
+         end_table= (i+1) + join->best_positions[i+1].n_sj_tables;
+         select_id= join->join_tab[i+1].emb_sj_nest->sj_subq_pred->get_identifier();
+       }
+      
       // For next iteration
       used_tables|=table->map;
       if (result->send_data(item_list))
@@ -17241,6 +19645,26 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
        unit;
        unit= unit->next_unit())
   {
+    /*
+      This fix_fields() call is to handle an edge case like this:
+       
+        SELECT ... UNION SELECT ... ORDER BY (SELECT ...)
+      
+      For such queries we get here before subquery_expr->fix_fields() has been
+      called, which would cause a failure; therefore fix the item here first.
+    */
+    if (unit->item && !unit->item->fixed)
+    {
+      Item *ref= unit->item;
+      if (unit->item->fix_fields(thd, &ref))
+        DBUG_VOID_RETURN;
+      DBUG_ASSERT(ref == unit->item);
+    }
+
+    /* 
+      Display subqueries only if they are not parts of eliminated WHERE/ON
+      clauses.
+    */
     if (!(unit->item && unit->item->eliminated))
     {
       if (mysql_explain_union(thd, unit, result))
@@ -17308,6 +19732,53 @@ bool mysql_explain_union(THD *thd, SELECT_LEX_UNIT *unit, select_result *result)
 }
 
 
+static void print_table_array(THD *thd, 
+                              table_map eliminated_tables,
+                              String *str, TABLE_LIST **table, 
+                              TABLE_LIST **end,
+                              enum_query_type query_type)
+{
+  (*table)->print(thd, eliminated_tables, str, query_type);
+
+  for (TABLE_LIST **tbl= table + 1; tbl < end; tbl++)
+  {
+    TABLE_LIST *curr= *tbl;
+    
+    /*
+      The "eliminated_tables &&" check guards against the case of 
+      printing the query for CREATE VIEW. We do that without having run 
+      JOIN::optimize() and so will have nested_join->used_tables==0.
+    */
+    if (eliminated_tables &&
+        ((curr->table && (curr->table->map & eliminated_tables)) ||
+         (curr->nested_join && !(curr->nested_join->used_tables &
+                                ~eliminated_tables))))
+    {
+      continue;
+    }
+
+    if (curr->outer_join)
+    {
+      /* MySQL converts right to left joins */
+      str->append(STRING_WITH_LEN(" left join "));
+    }
+    else if (curr->straight)
+      str->append(STRING_WITH_LEN(" straight_join "));
+    else if (curr->sj_inner_tables)
+      str->append(STRING_WITH_LEN(" semi join "));
+    else
+      str->append(STRING_WITH_LEN(" join "));
+    curr->print(thd, eliminated_tables, str, query_type);
+    if (curr->on_expr)
+    {
+      str->append(STRING_WITH_LEN(" on("));
+      curr->on_expr->print(str, query_type);
+      str->append(')');
+    }
+  }
+}
+
+
 /**
   Print joins from the FROM clause.
 
@@ -17342,45 +19813,28 @@ static void print_join(THD *thd,
               !(((*table)->table && ((*table)->table->map & eliminated_tables)) ||
                 ((*table)->nested_join && !((*table)->nested_join->used_tables &
                                            ~eliminated_tables))));
-  (*table)->print(thd, eliminated_tables, str, query_type);
-
-  TABLE_LIST **end= table + tables->elements;
-  for (TABLE_LIST **tbl= table + 1; tbl < end; tbl++)
+  /* 
+    If the first table is a semi-join nest, swap it with something that is
+    not a semi-join nest.
+  */
+  if ((*table)->sj_inner_tables)
   {
-    TABLE_LIST *curr= *tbl;
-    /*
-      The "eliminated_tables &&" check guards againist the case of 
-      printing the query for CREATE VIEW. We do that without having run 
-      JOIN::optimize() and so will have nested_join->used_tables==0.
-    */
-    if (eliminated_tables &&
-        ((curr->table && (curr->table->map & eliminated_tables)) ||
-         (curr->nested_join && !(curr->nested_join->used_tables &
-                                ~eliminated_tables))))
+    TABLE_LIST **end= table + tables->elements;
+    for (TABLE_LIST **t2= table; t2!=end; t2++)
     {
-      continue;
-    }
-
-    if (curr->outer_join)
-    {
-      /* MySQL converts right to left joins */
-      str->append(STRING_WITH_LEN(" left join "));
-    }
-    else if (curr->straight)
-      str->append(STRING_WITH_LEN(" straight_join "));
-    else
-      str->append(STRING_WITH_LEN(" join "));
-    curr->print(thd, eliminated_tables, str, query_type);
-    if (curr->on_expr)
-    {
-      str->append(STRING_WITH_LEN(" on("));
-      curr->on_expr->print(str, query_type);
-      str->append(')');
+      if (!(*t2)->sj_inner_tables)
+      {
+        TABLE_LIST *tmp= *t2;
+        *t2= *table;
+        *table= tmp;
+        break;
+      }
     }
   }
+  print_table_array(thd, eliminated_tables, str, table, 
+                    table + tables->elements, query_type);
 }
 
-
 /**
   @brief Print an index hint
 
@@ -17515,9 +19969,7 @@ void TABLE_LIST::print(THD *thd, table_map eliminated_tables, String *str,
 
 void st_select_lex::print(THD *thd, String *str, enum_query_type query_type)
 {
-  /* QQ: thd may not be set for sub queries, but this should be fixed */
-  if (!thd)
-    thd= current_thd;
+  DBUG_ASSERT(thd);
 
   str->append(STRING_WITH_LEN("select "));
 
@@ -17674,6 +20126,40 @@ bool JOIN::change_result(select_result *res)
   DBUG_RETURN(FALSE);
 }
 
+
+/**
+  @brief
+  Set allowed types of join caches that can be used for join operations
+
+  @details
+  The function sets a bitmap of allowed join buffers types in the field
+  allowed_join_cache_types of this JOIN structure:
+    bit 1 is set if the join buffers are allowed to be incremental
+    bit 2 is set if the join buffers are allowed to be hashed
+    bit 3 is set if the join buffers are allowed to be used for BKA
+  join algorithms.
+  The allowed types are read from system variables.
+  Besides the function sets maximum allowed join cache level that is
+  also read from a system variable.
+*/
+
+void JOIN::set_allowed_join_cache_types()
+{
+  allowed_join_cache_types= 0;
+  if (optimizer_flag(thd, OPTIMIZER_SWITCH_JOIN_CACHE_INCREMENTAL))
+    allowed_join_cache_types|= JOIN_CACHE_INCREMENTAL_BIT;
+  if (optimizer_flag(thd, OPTIMIZER_SWITCH_JOIN_CACHE_HASHED))
+    allowed_join_cache_types|= JOIN_CACHE_HASHED_BIT;
+  if (optimizer_flag(thd, OPTIMIZER_SWITCH_JOIN_CACHE_BKA))
+    allowed_join_cache_types|= JOIN_CACHE_BKA_BIT;
+  allowed_semijoin_with_cache=
+    optimizer_flag(thd, OPTIMIZER_SWITCH_SEMIJOIN_WITH_CACHE);
+  allowed_outer_join_with_cache=
+    optimizer_flag(thd, OPTIMIZER_SWITCH_OUTER_JOIN_WITH_CACHE);
+  max_allowed_join_cache_level= thd->variables.join_cache_level;
+}
+
+
 /**
   @} (end of group Query_Optimizer)
 */
diff --git a/sql/sql_select.h b/sql/sql_select.h
index ea06b26a229..06eaa8c9485 100644
--- a/sql/sql_select.h
+++ b/sql/sql_select.h
@@ -14,6 +14,8 @@
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
 
+#ifndef SQL_SELECT_INCLUDED
+#define SQL_SELECT_INCLUDED
 /**
   @file
 
@@ -28,6 +30,21 @@
 #include "procedure.h"
 #include <myisam.h>
 
+#if defined(WITH_ARIA_STORAGE_ENGINE) && defined(USE_MARIA_FOR_TMP_TABLES)
+#include "../storage/maria/ha_maria.h"
+#define TMP_ENGINE_HTON maria_hton
+#else
+#define TMP_ENGINE_HTON myisam_hton
+#endif
+/* Values in optimize */
+#define KEY_OPTIMIZE_EXISTS		1
+#define KEY_OPTIMIZE_REF_OR_NULL	2
+#define KEY_OPTIMIZE_EQ	                4
+
+inline uint get_hash_join_key_no() { return MAX_KEY; }
+
+inline bool is_hash_join_key_no(uint key) { return key == MAX_KEY; }
+
 typedef struct keyuse_t {
   TABLE *table;
   Item	*val;				/**< or value if no field */
@@ -51,6 +68,13 @@ typedef struct keyuse_t {
     NULL  - Otherwise (the source equality can't be turned off)
   */
   bool *cond_guard;
+  /*
+     0..64    <=> This was created from semi-join IN-equality # sj_pred_no.
+     MAX_UINT  Otherwise
+  */
+  uint         sj_pred_no;
+
+  bool is_for_hash_join() { return is_hash_join_key_no(key); }
 } KEYUSE;
 
 class store_key;
@@ -92,32 +116,17 @@ typedef struct st_table_ref
     in the join.
   */
   ha_rows       use_count;
-} TABLE_REF;
-
-
-
-#define CACHE_BLOB      1        /* blob field  */
-#define CACHE_STRIPPED  2        /* field stripped of trailing spaces */
 
-/**
-  CACHE_FIELD and JOIN_CACHE is used on full join to cache records in outer
-  table
-*/
-
-typedef struct st_cache_field {
-  uchar *str;
-  uint length, blob_length;
-  Field *field;
-  uint type;    /**< category of the of the copied field (CACHE_BLOB et al.) */
-} CACHE_FIELD;
+  /*
+    TRUE <=> disable the "cache" as doing lookup with the same key value may
+    produce different results (because of Index Condition Pushdown)
 
+  */
+  bool          disable_cache;
 
-typedef struct st_join_cache {
-  uchar *buff,*pos,*end;
-  uint records,record_nr,ptr_record,fields,length,blobs;
-  CACHE_FIELD *field,**blob_ptr;
-  SQL_SELECT *select;
-} JOIN_CACHE;
+  bool tmp_table_index_lookup_init(THD *thd, KEY *tmp_key, Item_iterator &it,
+                                   bool value);
+} TABLE_REF;
 
 
 /*
@@ -125,7 +134,8 @@ typedef struct st_join_cache {
 */
 enum join_type { JT_UNKNOWN,JT_SYSTEM,JT_CONST,JT_EQ_REF,JT_REF,JT_MAYBE_REF,
 		 JT_ALL, JT_RANGE, JT_NEXT, JT_FT, JT_REF_OR_NULL,
-		 JT_UNIQUE_SUBQUERY, JT_INDEX_SUBQUERY, JT_INDEX_MERGE};
+		 JT_UNIQUE_SUBQUERY, JT_INDEX_SUBQUERY, JT_INDEX_MERGE,
+                 JT_HASH, JT_HASH_RANGE, JT_HASH_NEXT, JT_HASH_INDEX_MERGE};
 
 class JOIN;
 
@@ -147,17 +157,31 @@ typedef enum_nested_loop_state
 (*Next_select_func)(JOIN *, struct st_join_table *, bool);
 typedef int (*Read_record_func)(struct st_join_table *tab);
 Next_select_func setup_end_select_func(JOIN *join);
+int rr_sequential(READ_RECORD *info);
 
 
+class JOIN_CACHE;
+class SJ_TMP_TABLE;
+
 typedef struct st_join_table {
   st_join_table() {}                          /* Remove gcc warning */
   TABLE		*table;
   KEYUSE	*keyuse;			/**< pointer to first used key */
+  KEY           *hj_key;       /**< descriptor of the used best hash join key
+				    not supported by any index                 */
   SQL_SELECT	*select;
-  COND          *select_cond;
+  COND		*select_cond;
   COND          *on_precond;    /**< part of on condition to check before
 				     accessing the first inner table           */  
   QUICK_SELECT_I *quick;
+  /* 
+    The value of select_cond before we've attempted to do Index Condition
+    Pushdown. We may need to restore everything back if we first choose one
+    index but then reconsider (see test_if_skip_sort_order() for such
+    scenarios).
+    NULL means no index condition pushdown was performed.
+  */
+  Item          *pre_idx_push_select_cond;
   Item	       **on_expr_ref;   /**< pointer to the associated on expression   */
   COND_EQUAL    *cond_equal;    /**< multiple equalities for the on expression */
   st_join_table *first_inner;   /**< first inner table for including outerjoin */
@@ -204,11 +228,19 @@ typedef struct st_join_table {
     E(#records) is in found_records.
   */
   ha_rows       read_time;
-  
+    
+  double        partial_join_cardinality;
+
   table_map	dependent,key_dependent;
   uint		use_quick,index;
   uint		status;				///< Save status for cache
-  uint		used_fields,used_fieldlength,used_blobs;
+  uint		used_fields;
+  ulong         used_fieldlength;
+  ulong         max_used_fieldlength;
+  uint          used_blobs;
+  uint          used_null_fields;
+  uint          used_rowid_fields;
+  uint          used_uneven_bit_fields;
   enum join_type type;
   bool		cached_eq_ref_table,eq_ref_table,not_used_in_distinct;
   bool		sorted;
@@ -219,11 +251,81 @@ typedef struct st_join_table {
   */ 
   ha_rows       limit; 
   TABLE_REF	ref;
-  JOIN_CACHE	cache;
+  /* TRUE <=> condition pushdown supports other tables presence */
+  bool          icp_other_tables_ok;
+  /* 
+    TRUE <=> condition pushed to the index has to be factored out of
+    the condition pushed to the table
+  */
+  bool          idx_cond_fact_out;
+  bool          use_join_cache;
+  uint          used_join_cache_level;
+  ulong         join_buffer_size_limit;
+  JOIN_CACHE	*cache;
+  /*
+    Index condition for BKA access join
+  */
+  Item          *cache_idx_cond;
+  SQL_SELECT    *cache_select;
   JOIN		*join;
-  /** Bitmap of nested joins this table is part of */
+  /*
+    Embedding SJ-nest (may be not the direct parent), or NULL if none.
+    This variable holds the result of table pullout.
+  */
+  TABLE_LIST    *emb_sj_nest;
+
+  /* FirstMatch variables (final QEP) */
+  struct st_join_table *first_sj_inner_tab;
+  struct st_join_table *last_sj_inner_tab;
+
+  /* Variables for semi-join duplicate elimination */
+  SJ_TMP_TABLE  *flush_weedout_table;
+  SJ_TMP_TABLE  *check_weed_out_table;
+  
+  /*
+    If set, means we should stop join enumeration after we've got the first
+    match and return to the specified join tab. May point to
+    join->join_tab[-1] which means stop join execution after the first
+    match.
+  */
+  struct st_join_table  *do_firstmatch;
+ 
+  /* 
+     ptr  - We're doing a LooseScan, this join tab is the first (i.e.
+            "driving") join tab, and ptr points to the last join tab
+            handled by the strategy. loosescan_match_tab->found_match
+            should be checked to see if the current value group had a match.
+     NULL - Not doing a loose scan on this join tab.
+  */
+  struct st_join_table *loosescan_match_tab;
+
+  /* Buffer to save index tuple to be able to skip duplicates */
+  uchar *loosescan_buf;
+  
+  /* Length of key tuple (depends on #keyparts used) to store in the above */
+  uint loosescan_key_len;
+
+  /* Used by LooseScan. TRUE<=> there has been a matching record combination */
+  bool found_match;
+  
+  /*
+    Used by DuplicateElimination. tab->table->ref must have the rowid
+    whenever we have a current record.
+  */
+  int  keep_current_rowid;
+
+  /* NestedOuterJoins: Bitmap of nested joins this table is part of */
   nested_join_map embedding_map;
 
+  /*
+    Semi-join strategy to be used for this join table. This is a copy of
+    POSITION::sj_strategy field. This field is set up by
+    fix_semijoin_strategies_for_picked_join_order().
+  */
+  uint sj_strategy;
+
+  struct st_join_table *first_sjm_sibling;
+
   void cleanup();
   inline bool is_using_loose_index_scan()
   {
@@ -231,12 +333,123 @@ typedef struct st_join_table {
             (select->quick->get_type() ==
              QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX));
   }
+  bool check_rowid_field()
+  {
+    if (keep_current_rowid && !used_rowid_fields)
+    {
+      used_rowid_fields= 1;
+      used_fieldlength+= table->file->ref_length;
+    }
+    return test(used_rowid_fields);
+  }
+  bool is_inner_table_of_semi_join_with_first_match()
+  {
+    return first_sj_inner_tab != NULL;
+  }
+  bool is_inner_table_of_outer_join()
+  {
+    return first_inner != NULL;
+  }
+  bool is_single_inner_of_semi_join_with_first_match()
+  {
+    return first_sj_inner_tab == this && last_sj_inner_tab == this;            
+  }
+  bool is_single_inner_of_outer_join()
+  {
+    return first_inner == this && first_inner->last_inner == this;
+  }
+  bool is_first_inner_for_outer_join()
+  {
+    return first_inner && first_inner == this;
+  }
+  bool use_match_flag()
+  {
+    return is_first_inner_for_outer_join() || first_sj_inner_tab == this ; 
+  }
+  bool check_only_first_match()
+  {
+    return is_inner_table_of_semi_join_with_first_match() ||
+           (is_inner_table_of_outer_join() &&
+            table->reginfo.not_exists_optimize);
+  }
+  bool is_last_inner_table()
+  {
+    return (first_inner && first_inner->last_inner == this) ||
+           last_sj_inner_tab == this;
+  }
+  /*
+    Check whether the table belongs to a nest of inner tables of an
+    outer join or to a nest of inner tables of a semi-join
+  */
+  bool is_nested_inner()
+  {
+    if (first_inner && 
+        (first_inner != first_inner->last_inner || first_inner->first_upper))
+      return TRUE;
+    if (first_sj_inner_tab && first_sj_inner_tab != last_sj_inner_tab)
+      return TRUE;
+    return FALSE;
+  }
+  struct st_join_table *get_first_inner_table()
+  {
+    if (first_inner)
+      return first_inner;
+    return first_sj_inner_tab; 
+  }
+  void set_select_cond(COND *to, uint line)
+  {
+    DBUG_PRINT("info", ("select_cond changes %p -> %p at line %u tab %p",
+                        select_cond, to, line, this));
+    select_cond= to;
+  }
+  COND *set_cond(COND *new_cond)
+  {
+    COND *tmp_select_cond= select_cond;
+    set_select_cond(new_cond, __LINE__);
+    if (select)
+      select->cond= new_cond;
+    return tmp_select_cond;
+  }
+  void calc_used_field_length(bool max_fl);
+  ulong get_used_fieldlength()
+  {
+    if (!used_fieldlength)
+      calc_used_field_length(FALSE);
+    return used_fieldlength;
+  }
+  ulong get_max_used_fieldlength()
+  {
+    if (!max_used_fieldlength)
+      calc_used_field_length(TRUE);
+    return max_used_fieldlength;
+  }
+  double get_partial_join_cardinality() { return partial_join_cardinality; }
+  bool hash_join_is_possible();
+  int make_scan_filter();
+  bool is_ref_for_hash_join() { return is_hash_join_key_no(ref.key); }
+  KEY *get_keyinfo_by_key_no(uint key) 
+  {
+    return (is_hash_join_key_no(key) ? hj_key : table->key_info+key);
+  }
 } JOIN_TAB;
 
+
+#include "sql_join_cache.h"
+
 enum_nested_loop_state sub_select_cache(JOIN *join, JOIN_TAB *join_tab, bool
                                         end_of_records);
 enum_nested_loop_state sub_select(JOIN *join,JOIN_TAB *join_tab, bool
                                   end_of_records);
+enum_nested_loop_state sub_select_sjm(JOIN *join, JOIN_TAB *join_tab, 
+                                      bool end_of_records);
+
+enum_nested_loop_state
+end_send_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
+	       bool end_of_records);
+enum_nested_loop_state
+end_write_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
+		bool end_of_records);
+
 
 /**
   Information about a position of table within a join order. Used in join
@@ -267,6 +480,89 @@ typedef struct st_position
 
   /* If ref-based access is used: bitmap of tables this table depends on  */
   table_map ref_depend_map;
+
+  bool use_join_buffer; 
+  
+  
+  /* These form a stack of partial join order costs and output sizes */
+  COST_VECT prefix_cost;
+  double    prefix_record_count;
+
+  /*
+    Current optimization state: Semi-join strategy to be used for this
+    and preceding join tables.
+    
+    Join optimizer sets this for the *last* join_tab in the
+    duplicate-generating range. That is, in order to interpret this field, 
+    one needs to traverse join->[best_]positions array from right to left.
+    When you see a join table with sj_strategy!= SJ_OPT_NONE, some other
+    field (depending on the strategy) tells how many preceding positions 
+    this applies to. The values of covered_preceding_positions->sj_strategy
+    must be ignored.
+  */
+  uint sj_strategy;
+  /*
+    Valid only after fix_semijoin_strategies_for_picked_join_order() call:
+    if sj_strategy!=SJ_OPT_NONE, this is the number of subsequent tables that
+    are covered by the specified semi-join strategy
+  */
+  uint n_sj_tables;
+
+/* LooseScan strategy members */
+
+  /* The first (i.e. driving) table we're doing loose scan for */
+  uint        first_loosescan_table;
+  /* 
+     Tables that need to be in the prefix before we can calculate the cost
+     of using LooseScan strategy.
+  */
+  table_map   loosescan_need_tables;
+
+  /*
+    keyno  -  Planning to do LooseScan on this key. If keyuse is NULL then 
+              this is a full index scan, otherwise this is a ref+loosescan
+              scan (and keyno matches the KEYUSE's)
+    MAX_KEY - Not doing a LooseScan
+  */
+  uint loosescan_key;  // final (one for strategy instance )
+  uint loosescan_parts; /* Number of keyparts to be kept distinct */
+  
+/* FirstMatch strategy */
+  /*
+    Index of the first inner table that we intend to handle with this
+    strategy
+  */
+  uint first_firstmatch_table;
+  /*
+    Tables that were not in the join prefix when we've started considering 
+    FirstMatch strategy.
+  */
+  table_map first_firstmatch_rtbl;
+  /* 
+    Tables that need to be in the prefix before we can calculate the cost
+    of using FirstMatch strategy.
+   */
+  table_map firstmatch_need_tables;
+
+
+/* Duplicate Weedout strategy */
+  /* The first table that the strategy will need to handle */
+  uint  first_dupsweedout_table;
+  /*
+    Tables that we will need to have in the prefix to do the weedout step
+    (all inner and all outer that the involved semi-joins are correlated with)
+  */
+  table_map dupsweedout_tables;
+
+/* SJ-Materialization-Scan strategy */
+  /* The last inner table (valid once we're after it) */
+  uint      sjm_scan_last_inner;
+  /*
+    Tables that we need to have in the prefix to calculate the correct cost.
+    Basically, we need all inner tables and outer tables mentioned in the
+    semi-join's ON expression so we can correctly account for fanout.
+  */
+  table_map sjm_scan_need_tables;
 } POSITION;
 
 
@@ -280,6 +576,19 @@ typedef struct st_rollup
 } ROLLUP;
 
 
+#define SJ_OPT_NONE 0
+#define SJ_OPT_DUPS_WEEDOUT 1
+#define SJ_OPT_LOOSE_SCAN   2
+#define SJ_OPT_FIRST_MATCH  3
+#define SJ_OPT_MATERIALIZE  4
+#define SJ_OPT_MATERIALIZE_SCAN  5
+
+inline bool sj_is_materialize_strategy(uint strategy)
+{
+  return strategy >= SJ_OPT_MATERIALIZE;
+}
+
+
 class JOIN :public Sql_alloc
 {
   JOIN(const JOIN &rhs);                        /**< not implemented */
@@ -288,9 +597,19 @@ public:
   JOIN_TAB *join_tab,**best_ref;
   JOIN_TAB **map2table;    ///< mapping between table indexes and JOIN_TABs
   JOIN_TAB *join_tab_save; ///< saved join_tab for subquery reexecution
-  TABLE    **table,**all_tables,*sort_by_table;
-  uint	   tables,const_tables;
+  TABLE    **table;
+  TABLE    **all_tables;
+  /**
+    The table which has an index that allows to produce the required ordering.
+    A special value of 0x1 means that the ordering will be produced by
+    passing 1st non-const table to filesort(). NULL means no such table exists.
+  */
+  TABLE    *sort_by_table;
+  uint	   tables;        /**< Number of tables in the join */
+  uint     outer_tables;  /**< Number of tables that are not inside semijoin */
+  uint     const_tables;
   uint	   send_group_parts;
+  bool	   group;          /**< If query contains GROUP BY clause */
   /**
     Indicates that grouping will be performed on the result set during
     query execution. This field belongs to query execution.
@@ -298,7 +617,7 @@ public:
     @see make_group_fields, alloc_group_fields, JOIN::exec
   */
   bool     sort_and_group; 
-  bool     first_record,full_join,group, no_field_update;
+  bool     first_record,full_join, no_field_update;
   bool	   do_send_rows;
   /**
     TRUE when we want to resume nested loop iterations when
@@ -329,14 +648,47 @@ public:
       - on each fetch iteration we add num_rows to fetch to fetch_limit
   */
   ha_rows  fetch_limit;
-  POSITION positions[MAX_TABLES+1],best_positions[MAX_TABLES+1];
+  /* Finally picked QEP. This is result of join optimization */
+  POSITION best_positions[MAX_TABLES+1];
+
+/******* Join optimization state members start *******/
+  /*
+    pointer - we're doing optimization for a semi-join materialization nest.
+    NULL    - otherwise
+  */
+  TABLE_LIST *emb_sjm_nest;
+  
+  /* Current join optimization state */
+  POSITION positions[MAX_TABLES+1];
   
-  /* *
+  /*
     Bitmap of nested joins embedding the position at the end of the current 
     partial join (valid only during join optimizer run).
   */
   nested_join_map cur_embedding_map;
+  
+  /*
+    Bitmap of inner tables of semi-join nests that have a proper subset of
+    their tables in the current join prefix. That is, of those semi-join
+    nests that have their tables both in and outside of the join prefix.
+  */
+  table_map cur_sj_inner_tables;
+  
+  /*
+    Bitmap of semi-join inner tables that are in the join prefix and for
+    which there's no provision for how to eliminate semi-join duplicates
+    they produce.
+  */
+  table_map cur_dups_producing_tables;
 
+  /* We also maintain a stack of join optimization states in join->positions[] */
+/******* Join optimization state members end *******/
+  Next_select_func first_select;
+  /*
+    The cost of best complete join plan found so far during optimization,
+    after optimization phase - cost of picked join order (not taking into
+    account the changes made by test_if_skip_sort_order()).
+  */
   double   best_read;
   List<Item> *fields;
   List<Cached_item> group_fields, group_fields_cache;
@@ -352,6 +704,15 @@ public:
   Item      *tmp_having; ///< To store having when processed temporary table
   Item      *having_history; ///< Store having for explain
   ulonglong  select_options;
+  /* 
+    Bitmap of allowed types of the join caches that
+    can be used for join operations
+  */
+  uint allowed_join_cache_types;
+  bool allowed_semijoin_with_cache;
+  bool allowed_outer_join_with_cache;
+  /* Maximum level of the join caches that can be used for join operations */ 
+  uint max_allowed_join_cache_level;
   select_result *result;
   TMP_TABLE_PARAM tmp_table_param;
   MYSQL_LOCK *lock;
@@ -440,6 +801,7 @@ public:
   TABLE_LIST *tables_list;           ///<hold 'tables' parameter of mysql_select
   List<TABLE_LIST> *join_list;       ///< list of joined tables in reverse order
   COND_EQUAL *cond_equal;
+  COND_EQUAL *having_equal;
   SQL_SELECT *select;                ///<created in optimisation phase
   JOIN_TAB *return_tab;              ///<used only for outer joins
   Item **ref_pointer_array; ///<used pointer reference for this select
@@ -451,6 +813,13 @@ public:
   bool union_part; ///< this subselect is part of union 
   bool optimized; ///< flag to avoid double optimization in EXPLAIN
 
+  Array<Item_in_subselect> sj_subselects;
+
+  /* Temporary tables used to weed-out semi-join duplicates */
+  List<TABLE> sj_tmp_tables;
+  /* SJM nests that are executed with SJ-Materialization strategy */
+  List<SJ_MATERIALIZATION_INFO> sjm_info_list;
+
   /* 
     storage for caching buffers allocated during query execution. 
     These buffers allocations need to be cached as the thread memory pool is
@@ -468,7 +837,7 @@ public:
 
   JOIN(THD *thd_arg, List<Item> &fields_arg, ulonglong select_options_arg,
        select_result *result_arg)
-    :fields_list(fields_arg)
+    :fields_list(fields_arg), sj_subselects(thd_arg->mem_root, 4)
   {
     init(thd_arg, fields_arg, select_options_arg, result_arg);
   }
@@ -520,6 +889,7 @@ public:
     zero_result_cause= 0;
     optimized= 0;
     cond_equal= 0;
+    having_equal= 0;
     group_optimized_away= 0;
     no_rows_in_result_called= 0;
 
@@ -532,6 +902,7 @@ public:
     rollup.state= ROLLUP::STATE_NONE;
 
     no_const_tables= FALSE;
+    first_select= sub_select;
   }
 
   int prepare(Item ***rref_pointer_array, TABLE_LIST *tables, uint wind_num,
@@ -544,6 +915,8 @@ public:
   int destroy();
   void restore_tmp();
   bool alloc_func_list();
+  bool flatten_subqueries();
+  bool setup_subquery_materialization();
   bool make_sum_func_list(List<Item> &all_fields, List<Item> &send_fields,
 			  bool before_group_by, bool recompute= FALSE);
 
@@ -565,7 +938,6 @@ public:
 			  Item_sum ***func);
   int rollup_send_data(uint idx);
   int rollup_write_data(uint idx, TABLE *table);
-  void remove_subq_pushed_predicates(Item **where);
   /**
     Release memory and, if possible, the open tables held by this execution
     plan (and nested plans). It's used to release some tables before
@@ -593,6 +965,27 @@ public:
   {
     return (table_map(1) << tables) - 1;
   }
+  /* 
+    Return the table for which an index scan can be used to satisfy 
+    the sort order needed by the ORDER BY/(implicit) GROUP BY clause 
+  */
+  JOIN_TAB *get_sort_by_join_tab()
+  {
+    return (need_tmp || !sort_by_table || skip_sort_order ||
+            ((group || tmp_table_param.sum_func_count) && !group_list)) ?
+              NULL : join_tab+const_tables;
+  }
+  bool setup_subquery_caches();
+  bool shrink_join_buffers(JOIN_TAB *jt, 
+                           ulonglong curr_space,
+                           ulonglong needed_space);
+  void set_allowed_join_cache_types();
+  bool is_allowed_hash_join_access()
+  { 
+    return test(allowed_join_cache_types & JOIN_CACHE_HASHED_BIT) &&
+           max_allowed_join_cache_level > JOIN_CACHE_HASHED_BIT;
+  }
+
 private:
   /**
     TRUE if the query contains an aggregate function but has no GROUP
@@ -613,11 +1006,6 @@ void TEST_join(JOIN *join);
 
 /* Extern functions in sql_select.cc */
 bool store_val_in_field(Field *field, Item *val, enum_check_fields check_flag);
-TABLE *create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
-			ORDER *group, bool distinct, bool save_sum_fields,
-			ulonglong select_options, ha_rows rows_limit,
-			char* alias);
-void free_tmp_table(THD *thd, TABLE *entry);
 void count_field_types(SELECT_LEX *select_lex, TMP_TABLE_PARAM *param, 
                        List<Item> &fields, bool reset_with_sum_func);
 bool setup_copy_fields(THD *thd, TMP_TABLE_PARAM *param,
@@ -647,6 +1035,7 @@ class store_key :public Sql_alloc
 public:
   bool null_key; /* TRUE <=> the value of the key has a null part */
   enum store_key_result { STORE_KEY_OK, STORE_KEY_FATAL, STORE_KEY_CONV };
+  enum Type { FIELD_STORE_KEY, ITEM_STORE_KEY, CONST_ITEM_STORE_KEY };
   store_key(THD *thd, Field *field_arg, uchar *ptr, uchar *null, uint length)
     :null_key(0), null_ptr(null), err(0)
   {
@@ -667,6 +1056,7 @@ public:
                                         ptr, null, 1);
   }
   virtual ~store_key() {}			/** Not actually needed */
+  virtual enum Type type() const=0;
   virtual const char *name() const=0;
 
   /**
@@ -718,15 +1108,32 @@ class store_key_field: public store_key
     {
       copy_field.set(to_field,from_field,0);
     }
-  }
+  }  
+
+  enum Type type() const { return FIELD_STORE_KEY; }
   const char *name() const { return field_name; }
 
+  void change_source_field(Item_field *fld_item)
+  {
+    copy_field.set(to_field, fld_item->field, 0);
+    field_name= fld_item->full_name();
+  }
+
  protected: 
   enum store_key_result copy_inner()
   {
     TABLE *table= copy_field.to_field->table;
     my_bitmap_map *old_map= dbug_tmp_use_all_columns(table,
                                                      table->write_set);
+
+    /* 
+      It looks like the next statement is needed only for a simplified
+      hash function over key values used now in BNLH join.
+      Once the implementation of that function is replaced by a proper
+      full version, this statement should probably be removed.
+    */  
+    bzero(copy_field.to_ptr,copy_field.to_length);
+
     copy_field.do_copy(&copy_field);
     dbug_tmp_restore_column_map(table->write_set, old_map);
     null_key= to_field->is_null();
@@ -739,13 +1146,20 @@ class store_key_item :public store_key
 {
  protected:
   Item *item;
+  /*
+    Flag that forces use of the save_val() method, which saves the value of
+    the item, instead of the save_in_field() method, which saves the result.
+  */
+  bool use_value;
 public:
   store_key_item(THD *thd, Field *to_field_arg, uchar *ptr,
-                 uchar *null_ptr_arg, uint length, Item *item_arg)
+                 uchar *null_ptr_arg, uint length, Item *item_arg, bool val)
     :store_key(thd, to_field_arg, ptr,
 	       null_ptr_arg ? null_ptr_arg : item_arg->maybe_null ?
-	       &err : (uchar*) 0, length), item(item_arg)
+	       &err : (uchar*) 0, length), item(item_arg), use_value(val)
   {}
+
+  enum Type type() const { return ITEM_STORE_KEY; }
   const char *name() const { return "func"; }
 
  protected:  
@@ -754,7 +1168,20 @@ public:
     TABLE *table= to_field->table;
     my_bitmap_map *old_map= dbug_tmp_use_all_columns(table,
                                                      table->write_set);
-    int res= item->save_in_field(to_field, 1);
+    int res= FALSE;
+
+    /* 
+      It looks like the next statement is needed only for the simplified
+      hash function over key values that is currently used in BNLH join.
+      When the implementation of this function is replaced with a proper
+      full version, this statement should probably be removed.
+    */  
+    to_field->reset();
+
+    if (use_value)
+      item->save_val(to_field);
+    else
+      res= item->save_in_field(to_field, 1);
     /*
      Item::save_in_field() may call Item::val_xxx(). And if this is a subquery
      we need to check for errors executing it and react accordingly
@@ -778,9 +1205,11 @@ public:
 		       Item *item_arg)
     :store_key_item(thd, to_field_arg,ptr,
 		    null_ptr_arg ? null_ptr_arg : item_arg->maybe_null ?
-		    &err : (uchar*) 0, length, item_arg), inited(0)
+		    &err : (uchar*) 0, length, item_arg, FALSE), inited(0)
   {
   }
+
+  enum Type type() const { return CONST_ITEM_STORE_KEY; }
   const char *name() const { return "const"; }
 
 protected:  
@@ -812,12 +1241,59 @@ bool error_if_full_join(JOIN *join);
 int report_error(TABLE *table, int error);
 int safe_index_read(JOIN_TAB *tab);
 COND *remove_eq_conds(THD *thd, COND *cond, Item::cond_result *cond_value);
+int test_if_item_cache_changed(List<Cached_item> &list);
+int join_init_read_record(JOIN_TAB *tab);
 void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key);
+inline Item * and_items(Item* cond, Item *item)
+{ /* Returns 'item' when 'cond' is NULL, else a new Item_cond_and of both */
+  return !cond ? item : new Item_cond_and(cond, item);
+}
+bool choose_plan(JOIN *join,table_map join_tables);
+void get_partial_join_cost(JOIN *join, uint n_tables, double *read_time_arg,
+                           double *record_count_arg);
+void optimize_wo_join_buffering(JOIN *join, uint first_tab, uint last_tab, 
+                                table_map last_remaining_tables, 
+                                bool first_alt, uint no_jbuf_before,
+                                double *outer_rec_count, double *reopt_cost);
+Item_equal *find_item_equal(COND_EQUAL *cond_equal, Field *field,
+                            bool *inherited_fl);
+bool test_if_ref(COND *root_cond, 
+                 Item_field *left_item,Item *right_item);
 
 inline bool optimizer_flag(THD *thd, uint flag)
 { 
   return (thd->variables.optimizer_switch & flag);
 }
 
+/* Table elimination entry point function */
 void eliminate_tables(JOIN *join);
 
+/* Index Condition Pushdown entry point function */
+void push_index_cond(JOIN_TAB *tab, uint keyno);
+
+/****************************************************************************
+  Temporary table support for SQL Runtime
+ ***************************************************************************/
+
+#define STRING_TOTAL_LENGTH_TO_PACK_ROWS 128
+#define AVG_STRING_LENGTH_TO_PACK_ROWS   64
+#define RATIO_TO_PACK_ROWS	       2
+#define MIN_STRING_LENGTH_TO_PACK_ROWS   10
+
+TABLE *create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
+			ORDER *group, bool distinct, bool save_sum_fields,
+			ulonglong select_options, ha_rows rows_limit,
+			char* alias);
+void free_tmp_table(THD *thd, TABLE *entry);
+bool create_internal_tmp_table_from_heap(THD *thd, TABLE *table,
+                                         ENGINE_COLUMNDEF *start_recinfo,
+                                         ENGINE_COLUMNDEF **recinfo, 
+                                         int error, bool ignore_last_dupp_key_error);
+bool create_internal_tmp_table(TABLE *table, KEY *keyinfo, 
+                               ENGINE_COLUMNDEF *start_recinfo,
+                               ENGINE_COLUMNDEF **recinfo, 
+                               ulonglong options);
+bool open_tmp_table(TABLE *table);
+void setup_tmp_table_column_bitmaps(TABLE *table, uchar *bitmaps);
+
+#endif /* SQL_SELECT_INCLUDED */
diff --git a/sql/sql_show.cc b/sql/sql_show.cc
index fa84be7bc17..15db2d05dc2 100644
--- a/sql/sql_show.cc
+++ b/sql/sql_show.cc
@@ -1937,8 +1937,7 @@ void mysqld_list_processes(THD *thd,const char *user, bool verbose)
           pthread_mutex_lock(&mysys_var->mutex);
         thd_info->proc_info= (char*) (tmp->killed == THD::KILL_CONNECTION? "Killed" : 0);
 #ifndef EMBEDDED_LIBRARY
-        thd_info->state_info= (char*) (tmp->locked ? "Locked" :
-                                       tmp->net.reading_or_writing ?
+        thd_info->state_info= (char*) (tmp->net.reading_or_writing ?
                                        (tmp->net.reading_or_writing == 2 ?
                                         "Writing to net" :
                                         thd_info->command == COM_SLEEP ? "" :
@@ -2063,8 +2062,7 @@ int fill_schema_processlist(THD* thd, TABLE_LIST* tables, COND* cond)
       table->field[5]->store(utime / 1000000, TRUE);
       /* STATE */
 #ifndef EMBEDDED_LIBRARY
-      val= (char*) (tmp->locked ? "Locked" :
-                    tmp->net.reading_or_writing ?
+      val= (char*) (tmp->net.reading_or_writing ?
                     (tmp->net.reading_or_writing == 2 ?
                      "Writing to net" :
                      tmp->command == COM_SLEEP ? "" :
@@ -2814,9 +2812,10 @@ bool schema_table_store_record(THD *thd, TABLE *table)
   int error;
   if ((error= table->file->ha_write_row(table->record[0])))
   {
-    if (create_internal_tmp_table_from_heap(thd, table,
-                                table->pos_in_table_list->schema_table_param,
-                                error, 0))
+    TMP_TABLE_PARAM *param= table->pos_in_table_list->schema_table_param;
+    if (create_internal_tmp_table_from_heap(thd, table, param->start_recinfo, 
+                                            &param->recinfo, error, 0))
+
       return 1;
   }
   return 0;
@@ -3004,7 +3003,10 @@ bool uses_only_table_name_fields(Item *item, TABLE_LIST *table)
   else if (item->type() == Item::REF_ITEM)
     return uses_only_table_name_fields(item->real_item(), table);
 
-  if (item->type() == Item::SUBSELECT_ITEM && !item->const_item())
+  if ((item->type() == Item::SUBSELECT_ITEM ||
+      (item->get_cached_item() &&
+       item->get_cached_item()->type() == Item::SUBSELECT_ITEM))
+      && !item->const_item())
     return 0;
 
   return 1;
@@ -3718,7 +3720,6 @@ int get_all_tables(THD *thd, TABLE_LIST *tables, COND *cond)
   uint table_open_method;
   DBUG_ENTER("get_all_tables");
 
-  lex->context_analysis_only|= CONTEXT_ANALYSIS_ONLY_VIEW;
   lex->reset_n_backup_query_tables_list(&query_tables_list_backup);
 
   /*
@@ -3850,8 +3851,13 @@ int get_all_tables(THD *thd, TABLE_LIST *tables, COND *cond)
             if (!(table_open_method & ~OPEN_FRM_ONLY) &&
                 !with_i_schema)
             {
-              if (!fill_schema_table_from_frm(thd, table, schema_table, db_name,
-                                              table_name, schema_table_idx))
+              my_bool res;
+              lex->context_analysis_only|= CONTEXT_ANALYSIS_ONLY_VIEW;
+              res= fill_schema_table_from_frm(thd, table, schema_table,
+                                              db_name,
+                                              table_name, schema_table_idx);
+              lex->context_analysis_only= save_context_analysis_only;
+              if (!res)
                 continue;
             }
 
diff --git a/sql/sql_sort.h b/sql/sql_sort.h
index f54b085eeda..fc4117e2767 100644
--- a/sql/sql_sort.h
+++ b/sql/sql_sort.h
@@ -57,6 +57,7 @@ typedef struct st_sort_param {
   uint addon_length;        /* Length of added packed fields */
   uint res_length;          /* Length of records in final sorted file/buffer */
   uint keys;				/* Max keys / buffer */
+  element_count min_dupl_count;
   ha_rows max_rows,examined_rows;
   TABLE *sort_form;			/* For quicker make_sortkey */
   SORT_FIELD *local_sortorder;
@@ -80,4 +81,9 @@ int merge_buffers(SORTPARAM *param,IO_CACHE *from_file,
 		  IO_CACHE *to_file, uchar *sort_buffer,
 		  BUFFPEK *lastbuff,BUFFPEK *Fb,
 		  BUFFPEK *Tb,int flag);
+int merge_index(SORTPARAM *param, uchar *sort_buffer,
+		BUFFPEK *buffpek, uint maxbuffer,
+		IO_CACHE *tempfile, IO_CACHE *outfile);
+
 void reuse_freed_buff(QUEUE *queue, BUFFPEK *reuse, uint key_length);
+
diff --git a/sql/sql_string.h b/sql/sql_string.h
index 5da0d38f24d..c9eaf924e4d 100644
--- a/sql/sql_string.h
+++ b/sql/sql_string.h
@@ -363,7 +363,7 @@ public:
 
   void qs_append(const char *str)
   {
-    qs_append(str, strlen(str));
+    qs_append(str, (uint32)strlen(str));
   }
   void qs_append(const char *str, uint32 len);
   void qs_append(double d);
diff --git a/sql/sql_table.cc b/sql/sql_table.cc
index 98de5e95da9..f2b267eadb1 100644
--- a/sql/sql_table.cc
+++ b/sql/sql_table.cc
@@ -24,6 +24,7 @@
 #include "sql_trigger.h"
 #include "sql_show.h"
 #include "debug_sync.h"
+#include "sql_handler.h"
 
 #ifdef __WIN__
 #include <io.h>
diff --git a/sql/sql_test.cc b/sql/sql_test.cc
index 8e945d0893f..af89b3cd525 100644
--- a/sql/sql_test.cc
+++ b/sql/sql_test.cc
@@ -224,6 +224,48 @@ TEST_join(JOIN *join)
 }
 
 
+#define FT_KEYPART   (MAX_REF_PARTS+10)
+
+void print_keyuse(KEYUSE *keyuse)
+{ /* Write a one-line description of a single KEYUSE element to DBUG_FILE */
+  char buff[256];
+  char buf2[64]; 
+  const char *fieldname;
+  JOIN_TAB *join_tab= keyuse->table->reginfo.join_tab;
+  KEY *key_info= join_tab->get_keyinfo_by_key_no(keyuse->key);
+  String str(buff,(uint32) sizeof(buff), system_charset_info);
+  str.length(0);
+  keyuse->val->print(&str, QT_ORDINARY);
+  str.append('\0');                  /* str.ptr() is printed with %s below */
+  if (keyuse->is_for_hash_join())    /* keypart indexes table->field here */
+    fieldname= keyuse->table->field[keyuse->keypart]->field_name;
+  else if (keyuse->keypart == FT_KEYPART)
+    fieldname= "FT_KEYPART";
+  else                               /* keypart indexes key_info->key_part */
+    fieldname= key_info->key_part[keyuse->keypart].field->field_name;
+  longlong2str(keyuse->used_tables, buf2, 16, 0); 
+  DBUG_LOCK_FILE;
+  fprintf(DBUG_FILE, "KEYUSE: %s.%s=%s  optimize: %u  used_tables: %s "
+          "ref_table_rows: %lu  keypart_map: %0lx\n",
+          keyuse->table->alias.c_ptr(), fieldname, str.ptr(),
+          (uint) keyuse->optimize, buf2, (ulong) keyuse->ref_table_rows, 
+          (ulong) keyuse->keypart_map);
+  DBUG_UNLOCK_FILE;
+  //key_part_map keypart_map; --?? there can be several? 
+}
+
+
+/* purecov: begin inspected */
+void print_keyuse_array(DYNAMIC_ARRAY *keyuse_array)
+{ /* Print the element count, then every KEYUSE element of the array */
+  DBUG_LOCK_FILE;
+  fprintf(DBUG_FILE, "KEYUSE array (%u elements)\n", keyuse_array->elements);
+  DBUG_UNLOCK_FILE;      /* print_keyuse() locks DBUG_FILE on its own */
+  for (uint i= 0; i < keyuse_array->elements; i++)
+    print_keyuse((KEYUSE*) dynamic_array_ptr(keyuse_array, i));
+}
+
+
 /* 
   Print the current state during query optimization.
 
@@ -324,6 +366,27 @@ print_plan(JOIN* join, uint idx, double record_count, double read_time,
   DBUG_UNLOCK_FILE;
 }
 
+
+void print_sjm(SJ_MATERIALIZATION_INFO *sjm)
+{ /* Dump a semi-join nest (tables, materialize_cost, rows) to DBUG_FILE */
+  DBUG_LOCK_FILE;
+  fprintf(DBUG_FILE, "\nsemi-join nest{\n");
+  fprintf(DBUG_FILE, "  tables { \n");
+  for (uint i= 0;i < sjm->tables; i++)
+  { /* One table alias per line; a comma follows all but the last one */
+    fprintf(DBUG_FILE, "    %s%s\n", 
+            sjm->positions[i].table->table->alias.c_ptr(),
+            (i == sjm->tables -1)? "": ",");
+  }
+  fprintf(DBUG_FILE, "  }\n");
+  fprintf(DBUG_FILE, "  materialize_cost= %g\n",
+          sjm->materialization_cost.total_cost());
+  fprintf(DBUG_FILE, "  rows= %g\n", sjm->rows);
+  fprintf(DBUG_FILE, "}\n");
+  DBUG_UNLOCK_FILE;
+}
+/* purecov: end */
+
 #endif
 
 typedef struct st_debug_lock
diff --git a/sql/sql_trigger.cc b/sql/sql_trigger.cc
index 1c4add27e57..8b95a36bc35 100644
--- a/sql/sql_trigger.cc
+++ b/sql/sql_trigger.cc
@@ -19,6 +19,7 @@
 #include "sp_head.h"
 #include "sql_trigger.h"
 #include "parse_file.h"
+#include "sql_handler.h"
 
 /*************************************************************************/
 
diff --git a/sql/sql_trigger.h b/sql/sql_trigger.h
index f6754a75284..08125f94b43 100644
--- a/sql/sql_trigger.h
+++ b/sql/sql_trigger.h
@@ -17,7 +17,7 @@
 /**
   This class holds all information about triggers of table.
 
-  QQ: Will it be merged into TABLE in the future ?
+  TODO: Will it be merged into TABLE in the future ?
 */
 
 class Table_triggers_list: public Sql_alloc
diff --git a/sql/sql_union.cc b/sql/sql_union.cc
index 2cedce497b6..a94ad9f3b4b 100644
--- a/sql/sql_union.cc
+++ b/sql/sql_union.cc
@@ -51,21 +51,23 @@ int select_union::prepare(List<Item> &list, SELECT_LEX_UNIT *u)
 
 bool select_union::send_data(List<Item> &values)
 {
-  int error= 0;
   if (unit->offset_limit_cnt)
   {						// using limit offset,count
     unit->offset_limit_cnt--;
     return 0;
   }
-  fill_record(thd, table->field, values, 1);
+  fill_record(thd, table->field, values, TRUE, FALSE);
   if (thd->is_error())
     return 1;
 
-  if ((error= table->file->ha_write_row(table->record[0])))
+  if ((write_err= table->file->ha_write_row(table->record[0])))
   {
     /* create_internal_tmp_table_from_heap will generate error if needed */
-    if (table->file->is_fatal_error(error, HA_CHECK_DUP) &&
-        create_internal_tmp_table_from_heap(thd, table, &tmp_table_param, error, 1))
+    if (table->file->is_fatal_error(write_err, HA_CHECK_DUP) &&
+        create_internal_tmp_table_from_heap(thd, table,
+                                            tmp_table_param.start_recinfo, 
+                                            &tmp_table_param.recinfo,
+                                            write_err, 1))
       return 1;
   }
   return 0;
@@ -100,6 +102,8 @@ bool select_union::flush()
       is_union_distinct  if set, the temporary table will eliminate
                          duplicates on insert
       options            create options
+      alias              name of the temporary table
+      bit_fields_as_long convert bit fields to ulonglong
 
   DESCRIPTION
     Create a temporary table that is used to store the result of a UNION,
@@ -113,11 +117,13 @@ bool select_union::flush()
 bool
 select_union::create_result_table(THD *thd_arg, List<Item> *column_types,
                                   bool is_union_distinct, ulonglong options,
-                                  const char *alias)
+                                  const char *alias,
+                                   bool bit_fields_as_long)
 {
   DBUG_ASSERT(table == 0);
   tmp_table_param.init();
   tmp_table_param.field_count= column_types->elements;
+  tmp_table_param.bit_fields_as_long= bit_fields_as_long;
 
   if (! (table= create_tmp_table(thd_arg, &tmp_table_param, *column_types,
                                  (ORDER*) 0, is_union_distinct, 1,
@@ -129,6 +135,22 @@ select_union::create_result_table(THD *thd_arg, List<Item> *column_types,
 }
 
 
+/**
+  Reset and empty the temporary table that stores the materialized query result.
+
+  @note The cleanup performed here is exactly the same as for the two temp
+  tables of JOIN - exec_tmp_table_[1 | 2].
+*/
+
+void select_union::cleanup()
+{ /* The results of extra() and ha_delete_all_rows() are not checked here */
+  table->file->extra(HA_EXTRA_RESET_STATE);
+  table->file->ha_delete_all_rows();
+  free_io_cache(table);
+  filesort_free_buffers(table,0);
+}
+
+
 /*
   initialization procedures before fake_select_lex preparation()
 
@@ -366,7 +388,7 @@ bool st_select_lex_unit::prepare(THD *thd_arg, select_result *sel_result,
       create_options= create_options | TMP_TABLE_FORCE_MYISAM;
 
     if (union_result->create_result_table(thd, &types, test(union_distinct),
-                                          create_options, ""))
+                                          create_options, "", FALSE))
       goto err;
     bzero((char*) &result_table_list, sizeof(result_table_list));
     result_table_list.db= (char*) "";
@@ -730,8 +752,8 @@ void st_select_lex_unit::reinit_exec_mechanism()
     TRUE  - error
 */
 
-bool st_select_lex_unit::change_result(select_subselect *new_result,
-                                       select_subselect *old_result)
+bool st_select_lex_unit::change_result(select_result_interceptor *new_result,
+                                       select_result_interceptor *old_result)
 {
   bool res= FALSE;
   for (SELECT_LEX *sl= first_select(); sl; sl= sl->next_select())
diff --git a/sql/sql_update.cc b/sql/sql_update.cc
index 41ff4e998e2..0a4d3a4ba80 100644
--- a/sql/sql_update.cc
+++ b/sql/sql_update.cc
@@ -1829,7 +1829,7 @@ bool multi_update::send_data(List<Item> &not_used_values)
       /* Store regular updated fields in the row. */
       fill_record(thd,
                   tmp_table->field + 1 + unupdated_check_opt_tables.elements,
-                  *values_for_table[offset], 1);
+                  *values_for_table[offset], TRUE, FALSE);
 
       /* Write row, ignoring duplicated updates to a row */
       error= tmp_table->file->ha_write_row(tmp_table->record[0]);
@@ -1837,7 +1837,9 @@ bool multi_update::send_data(List<Item> &not_used_values)
       {
         if (error &&
             create_internal_tmp_table_from_heap(thd, tmp_table,
-                                    tmp_table_param + offset, error, 1))
+                                         tmp_table_param[offset].start_recinfo,
+                                         &tmp_table_param[offset].recinfo,
+                                         error, 1))
         {
           do_update= 0;
           DBUG_RETURN(1);			// Not a table_is_full error
@@ -2081,8 +2083,8 @@ int multi_update::do_updates()
 err:
   {
     thd->fatal_error();
-    prepare_record_for_error_message(local_error, table);
-    table->file->print_error(local_error,MYF(0));
+    prepare_record_for_error_message(local_error, err_table);
+    err_table->file->print_error(local_error,MYF(0));
   }
 
 err2:
diff --git a/sql/structs.h b/sql/structs.h
index 64e69fca0d0..86d1f12424e 100644
--- a/sql/structs.h
+++ b/sql/structs.h
@@ -53,7 +53,8 @@ typedef struct st_key_part_info {	/* Info about a key part */
   Field *field;
   uint	offset;				/* offset in record (from 0) */
   uint	null_offset;			/* Offset to null_bit in record */
-  uint16 length;                        /* Length of keypart value in bytes */
+  /* Length of key part in bytes, excluding NULL flag and length bytes */
+  uint16 length;
   /* 
     Number of bytes required to store the keypart value. This may be
     different from the "length" field as it also counts
@@ -74,7 +75,6 @@ typedef struct st_key {
   uint	key_length;			/* Tot length of key */
   ulong flags;                          /* dupp key and pack flags */
   uint	key_parts;			/* How many key_parts */
-  uint  extra_length;
   uint	usable_key_parts;		/* Should normally be = key_parts */
   uint  block_size;
   uint  name_length;
@@ -126,6 +126,21 @@ class SQL_SELECT;
 class THD;
 class handler;
 struct st_join_table;
+class Copy_field;
+
+/**
+  A context for reading through a single table using a chosen access method:
+  index read, scan, etc, use of cache, etc.
+
+  Usage:
+  READ_RECORD read_record;
+  init_read_record(&read_record, ...);
+  while (read_record.read_record())
+  {
+    ...
+  }
+  end_read_record();
+*/
 
 void rr_unlock_row(st_join_table *tab);
 
@@ -149,6 +164,12 @@ struct READ_RECORD {			/* Parameter to read_record */
   uchar	*cache,*cache_pos,*cache_end,*read_positions;
   IO_CACHE *io_cache;
   bool print_error, ignore_not_found_rows;
+  /* 
+    SJ-Materialization runtime may need to read fields from the materialized
+    table and unpack them into original table fields:
+  */
+  Copy_field *copy_field;
+  Copy_field *copy_field_end;
 };
 
 
diff --git a/sql/table.cc b/sql/table.cc
index d52c6bb085d..d3295f9dc48 100644
--- a/sql/table.cc
+++ b/sql/table.cc
@@ -21,6 +21,8 @@
 #include "create_options.h"
 #include <m_ctype.h>
 #include "my_md5.h"
+#include "my_bit.h"
+#include "sql_select.h"
 
 /* INFORMATION_SCHEMA name */
 LEX_STRING INFORMATION_SCHEMA_NAME= {C_STRING_WITH_LEN("information_schema")};
@@ -1491,7 +1493,6 @@ static int open_binary_frm(THD *thd, TABLE_SHARE *share, uchar *head,
           key_part->null_bit= field->null_bit;
           key_part->store_length+=HA_KEY_NULL_LENGTH;
           keyinfo->flags|=HA_NULL_PART_KEY;
-          keyinfo->extra_length+= HA_KEY_NULL_LENGTH;
           keyinfo->key_length+= HA_KEY_NULL_LENGTH;
         }
         if (field->type() == MYSQL_TYPE_BLOB ||
@@ -1503,7 +1504,6 @@ static int open_binary_frm(THD *thd, TABLE_SHARE *share, uchar *head,
             key_part->key_part_flag|= HA_BLOB_PART;
           else
             key_part->key_part_flag|= HA_VAR_LENGTH_PART;
-          keyinfo->extra_length+=HA_KEY_BLOB_LENGTH;
           key_part->store_length+=HA_KEY_BLOB_LENGTH;
           keyinfo->key_length+= HA_KEY_BLOB_LENGTH;
         }
@@ -1576,21 +1576,11 @@ static int open_binary_frm(THD *thd, TABLE_SHARE *share, uchar *head,
                                 share->table_name.str,
                                 share->table_name.str);
             share->crashed= 1;                // Marker for CHECK TABLE
-            goto to_be_deleted;
+            continue;
           }
 #endif
           key_part->key_part_flag|= HA_PART_KEY_SEG;
         }
-
-	to_be_deleted:
-
-        /*
-          If the field can be NULL, don't optimize away the test
-          key_part_column = expression from the WHERE clause
-          as we need to test for NULL = NULL.
-        */
-        if (field->real_maybe_null())
-          key_part->key_part_flag|= HA_NULL_PART;
         /*
           Sometimes we can compare key parts for equality with memcmp.
           But not always.
@@ -2349,7 +2339,7 @@ partititon_err:
   /* Allocate bitmaps */
 
   bitmap_size= share->column_bitmap_size;
-  if (!(bitmaps= (uchar*) alloc_root(&outparam->mem_root, bitmap_size*4)))
+  if (!(bitmaps= (uchar*) alloc_root(&outparam->mem_root, bitmap_size*5)))
     goto err;
   bitmap_init(&outparam->def_read_set,
               (my_bitmap_map*) bitmaps, share->fields, FALSE);
@@ -2359,6 +2349,8 @@ partititon_err:
               (my_bitmap_map*) (bitmaps+bitmap_size*2), share->fields, FALSE);
   bitmap_init(&outparam->tmp_set,
               (my_bitmap_map*) (bitmaps+bitmap_size*3), share->fields, FALSE);
+  bitmap_init(&outparam->eq_join_set,
+              (my_bitmap_map*) (bitmaps+bitmap_size*4), share->fields, FALSE);
   outparam->default_column_bitmaps();
 
   /* The table struct is now initialized;  Open the table */
@@ -2944,7 +2936,7 @@ File create_frm(THD *thd, const char *name, const char *db,
   if (create_info->options & HA_LEX_CREATE_TMP_TABLE)
     create_flags|= O_EXCL | O_NOFOLLOW;
 
-  /* Fix this when we have new .frm files;  Current limit is 4G rows (QQ) */
+  /* Fix this when we have new .frm files;  Current limit is 4G rows (TODO) */
   if (create_info->max_rows > UINT_MAX32)
     create_info->max_rows= UINT_MAX32;
   if (create_info->min_rows > UINT_MAX32)
@@ -5192,6 +5184,142 @@ void st_table::mark_virtual_columns_for_write(bool insert_fl)
     file->column_bitmaps_signal();
 }
 
+
+/**
+  Allocate space for keys
+
+  @param key_count  number of keys to allocate.
+
+  @details
+  Allocates enough space to fit 'key_count' keys for this table.
+
+  @return FALSE space was successfully allocated.
+  @return TRUE an error occurred (alloc_root() returned NULL).
+*/
+
+bool TABLE::alloc_keys(uint key_count)
+{
+  DBUG_ASSERT(!s->keys);
+  key_info= s->key_info= (KEY*) alloc_root(&mem_root, sizeof(KEY)*key_count);
+  max_keys= key_count;   /* set even if the allocation above failed */
+  return !(key_info);
+}
+
+
+void TABLE::create_key_part_by_field(KEY *keyinfo,
+                                     KEY_PART_INFO *key_part_info,
+                                     Field *field)
+{ /* Fill *key_part_info from 'field'; add its length to keyinfo->key_length */
+  field->flags|= PART_KEY_FLAG;
+  key_part_info->null_bit= field->null_bit;
+  key_part_info->null_offset= (uint) (field->null_ptr -
+                                      (uchar*) record[0]);
+  key_part_info->field= field;
+  key_part_info->offset= field->offset(record[0]);
+  key_part_info->length=   (uint16) field->pack_length();
+  keyinfo->key_length+= key_part_info->length;
+  key_part_info->key_part_flag= 0;
+  /* TODO:
+    The below method of computing the key format length of the
+    key part is a copy/paste from opt_range.cc, and table.cc.
+    This should be factored out, e.g. as a method of Field.
+    In addition it is not clear if any of the Field::*_length
+    methods is supposed to compute the same length. If so, it
+    might be reused.
+  */
+  key_part_info->store_length= key_part_info->length;
+
+  if (field->real_maybe_null())
+  { /* Nullable field: HA_KEY_NULL_LENGTH extra bytes in the key format */
+    key_part_info->store_length+= HA_KEY_NULL_LENGTH;
+    keyinfo->key_length+= HA_KEY_NULL_LENGTH;
+  }
+  if (field->type() == MYSQL_TYPE_BLOB || 
+      field->real_type() == MYSQL_TYPE_VARCHAR)
+  { /* BLOB/VARCHAR: HA_KEY_BLOB_LENGTH extra bytes plus a part flag */
+    key_part_info->store_length+= HA_KEY_BLOB_LENGTH;
+    keyinfo->key_length+= HA_KEY_BLOB_LENGTH; // ???
+    key_part_info->key_part_flag|=
+      field->type() == MYSQL_TYPE_BLOB ? HA_BLOB_PART: HA_VAR_LENGTH_PART;
+  }
+
+  key_part_info->type=     (uint8) field->key_type();
+  key_part_info->key_type =  /* 0 for the three text key types listed below */
+    ((ha_base_keytype) key_part_info->type == HA_KEYTYPE_TEXT ||
+    (ha_base_keytype) key_part_info->type == HA_KEYTYPE_VARTEXT1 ||
+    (ha_base_keytype) key_part_info->type == HA_KEYTYPE_VARTEXT2) ?
+    0 : FIELDFLAG_BINARY;
+}
+
+
+/**
+  Add a key to a temporary table
+
+  @param key            the number of the key (must be less than max_keys)
+  @param key_parts      number of components of the key
+  @param next_field_no  the call-back function that returns the number of
+                        the field used as the next component of the key
+  @param arg            the argument for the above function
+  @param unique         TRUE if the key is a unique index (sets HA_NOSAME)
+
+  @details
+  The function adds a new key to the table, which is assumed to be a
+  temporary table. On each call the call-back function must return the
+  number of the field that is used as the next component of this key.
+
+  @return FALSE on success
+  @return TRUE on failure (alloc_root() or strdup_root() returned NULL)
+*/
+
+bool TABLE::add_tmp_key(uint key, uint key_parts,
+                        uint (*next_field_no) (uchar *), uchar *arg,
+                        bool unique)
+{
+  DBUG_ASSERT(key < max_keys);
+
+  char buf[NAME_CHAR_LEN];
+  KEY* keyinfo;
+  Field **reg_field;
+  uint i;
+  bool key_start= TRUE;            /* TRUE only for the first key component */
+  KEY_PART_INFO* key_part_info=
+      (KEY_PART_INFO*) alloc_root(&mem_root, sizeof(KEY_PART_INFO)*key_parts);
+  if (!key_part_info)
+    return TRUE;
+  keyinfo= key_info + key;
+  keyinfo->key_part= key_part_info;
+  keyinfo->usable_key_parts= keyinfo->key_parts = key_parts;
+  keyinfo->key_length=0;           /* grown by create_key_part_by_field() */
+  keyinfo->algorithm= HA_KEY_ALG_UNDEF;
+  keyinfo->flags= HA_GENERATED_KEY;
+  if (unique)
+    keyinfo->flags|= HA_NOSAME;
+  sprintf(buf, "key%u", key);      /* 'key' is unsigned, so %u rather than %i */
+  if (!(keyinfo->name= strdup_root(&mem_root, buf)))
+    return TRUE;
+  keyinfo->rec_per_key= (ulong*) alloc_root(&mem_root,
+                                            sizeof(ulong)*key_parts);
+  if (!keyinfo->rec_per_key)
+    return TRUE;
+  bzero(keyinfo->rec_per_key, sizeof(ulong)*key_parts);
+
+  for (i= 0; i < key_parts; i++)
+  {
+    reg_field= field + next_field_no(arg);
+    if (key_start)
+      (*reg_field)->key_start.set_bit(key);
+    (*reg_field)->part_of_key.set_bit(key);
+    create_key_part_by_field(keyinfo, key_part_info, *reg_field);
+    key_start= FALSE;
+    key_part_info++;
+  }
+
+  set_if_bigger(s->max_key_length, keyinfo->key_length);
+  s->keys++;
+  return FALSE;
+}
+
+
 /**
   @brief Check if this is part of a MERGE table with attached children.
 
diff --git a/sql/table.h b/sql/table.h
index afe1f64dae1..d43f5c85849 100644
--- a/sql/table.h
+++ b/sql/table.h
@@ -693,7 +693,7 @@ struct st_table {
     needed by the query without reading the row.
   */
   key_map covering_keys;
-  key_map quick_keys, merge_keys;
+  key_map quick_keys, merge_keys,intersect_keys;
   /*
     A set of keys that can be used in the query that references this
     table.
@@ -725,6 +725,7 @@ struct st_table {
   uchar		*null_flags;
   my_bitmap_map	*bitmap_init_value;
   MY_BITMAP     def_read_set, def_write_set, def_vcol_set, tmp_set; 
+  MY_BITMAP     eq_join_set;         /* used to mark equi-joined fields */
   MY_BITMAP     *read_set, *write_set, *vcol_set; /* Active column sets */
   /*
    The ID of the query that opened and is using this table. Has different
@@ -790,6 +791,7 @@ struct st_table {
   uint          temp_pool_slot;		/* Used by intern temp tables */
   uint		status;                 /* What's in record[0] */
   uint		db_stat;		/* mode of file as in handler.h */
+  uint          max_keys;               /* Size of allocated key_info array. */
   /* number of select if it is derived table */
   uint          derived_select_number;
   int		current_lock;           /* Type of lock on table */
@@ -950,6 +952,12 @@ struct st_table {
   */
   inline bool needs_reopen_or_name_lock()
   { return s->version != refresh_version; }
+  bool alloc_keys(uint key_count);
+  bool add_tmp_key(uint key, uint key_parts,
+                   uint (*next_field_no) (uchar *), uchar *arg,
+                   bool unique);
+  void create_key_part_by_field(KEY *keyinfo, KEY_PART_INFO *key_part_info,
+                                Field *field);
   bool is_children_attached(void);
   inline void enable_keyread()
   {
@@ -1171,6 +1179,11 @@ public:
 };
 
 
+class SJ_MATERIALIZATION_INFO;
+class Index_hint;
+class Item_in_subselect;
+
+
 /*
   Table reference in the FROM clause.
 
@@ -1233,6 +1246,20 @@ struct TABLE_LIST
   char		*db, *alias, *table_name, *schema_table_name;
   char          *option;                /* Used by cache index  */
   Item		*on_expr;		/* Used with outer join */
+
+  Item          *sj_on_expr;
+  /*
+    (Valid only for semi-join nests) Bitmap of tables that are within the
+    semi-join (this is different from bitmap of all nest's children because
+    tables that were pulled out of the semi-join nest remain listed as
+    nest's children).
+  */
+  table_map     sj_inner_tables;
+  /* Number of IN-compared expressions */
+  uint          sj_in_exprs; 
+  Item_in_subselect  *sj_subq_pred;
+  SJ_MATERIALIZATION_INFO *sj_mat_info;
+
   /*
     The structure of ON expression presented in the member above
     can be changed during certain optimizations. This member
@@ -1773,6 +1800,14 @@ typedef struct st_nested_join
   */
   uint              n_tables;
   nested_join_map   nj_map;          /* Bit used to identify this nested join*/
+  /*
+    (Valid only for semi-join nests) Bitmap of tables outside the semi-join
+    that are used within the semi-join's ON condition.
+  */
+  table_map         sj_depends_on;
+  /* Outer non-trivially correlated tables */
+  table_map         sj_corr_tables;
+  List<Item>        sj_outer_expr_list;
   /**
      True if this join nest node is completely covered by the query execution
      plan. This means two things.
diff --git a/sql/uniques.cc b/sql/uniques.cc
index 3d1ea9243b9..cfce53b70d7 100644
--- a/sql/uniques.cc
+++ b/sql/uniques.cc
@@ -33,7 +33,6 @@
 #include "mysql_priv.h"
 #include "sql_sort.h"
 
-
 int unique_write_to_file(uchar* key, element_count count, Unique *unique)
 {
   /*
@@ -45,6 +44,12 @@ int unique_write_to_file(uchar* key, element_count count, Unique *unique)
   return my_b_write(&unique->file, key, unique->size) ? 1 : 0;
 }
 
+int unique_write_to_file_with_count(uchar* key, element_count count, Unique *unique)
+{ /* Write key, then count; returns 1 if either my_b_write() is non-zero */
+  return my_b_write(&unique->file, key, unique->size) ||
+         my_b_write(&unique->file, &count, sizeof(element_count)) ? 1 : 0;
+}
+
 int unique_write_to_ptrs(uchar* key, element_count count, Unique *unique)
 {
   memcpy(unique->record_pointers, key, unique->size);
@@ -52,10 +57,28 @@ int unique_write_to_ptrs(uchar* key, element_count count, Unique *unique)
   return 0;
 }
 
+int unique_intersect_write_to_ptrs(uchar* key, element_count count, Unique *unique)
+{ /* Tree-walk action used by Unique::get(): keep keys with enough dups */
+  if (count >= unique->min_dupl_count)
+  {
+    memcpy(unique->record_pointers, key, unique->size);
+    unique->record_pointers+=unique->size;   /* advance the output cursor */
+  }
+  else
+    unique->filtered_out_elems++;   /* get() subtracts it from found_records */
+  return 0;
+}
+
+
 Unique::Unique(qsort_cmp2 comp_func, void * comp_func_fixed_arg,
-	       uint size_arg, ulonglong max_in_memory_size_arg)
+	       uint size_arg, ulonglong max_in_memory_size_arg,
+               uint min_dupl_count_arg)
   :max_in_memory_size(max_in_memory_size_arg), size(size_arg), elements(0)
 {
+  min_dupl_count= min_dupl_count_arg;
+  full_size= size;
+  if (min_dupl_count_arg)
+    full_size+= sizeof(element_count);
   my_b_clear(&file);
   init_tree(&tree, (ulong) (max_in_memory_size / 16), 0, size, comp_func, 0,
             NULL, comp_func_fixed_arg);
@@ -123,7 +146,8 @@ inline double log2_n_fact(double x)
 */
 
 static double get_merge_buffers_cost(uint *buff_elems, uint elem_size,
-                                     uint *first, uint *last)
+                                     uint *first, uint *last,
+                                     uint compare_factor)
 {
   uint total_buf_elems= 0;
   for (uint *pbuf= first; pbuf <= last; pbuf++)
@@ -134,7 +158,7 @@ static double get_merge_buffers_cost(uint *buff_elems, uint elem_size,
 
   /* Using log2(n)=log(n)/log(2) formula */
   return 2*((double)total_buf_elems*elem_size) / IO_SIZE +
-     total_buf_elems*log((double) n_buffers) / (TIME_FOR_COMPARE_ROWID * M_LN2);
+     total_buf_elems*log((double) n_buffers) / (compare_factor * M_LN2);
 }
 
 
@@ -167,7 +191,8 @@ static double get_merge_buffers_cost(uint *buff_elems, uint elem_size,
 
 static double get_merge_many_buffs_cost(uint *buffer,
                                         uint maxbuffer, uint max_n_elems,
-                                        uint last_n_elems, int elem_size)
+                                        uint last_n_elems, int elem_size,
+                                        uint compare_factor)
 {
   register int i;
   double total_cost= 0.0;
@@ -194,19 +219,22 @@ static double get_merge_many_buffs_cost(uint *buffer,
       {
         total_cost+=get_merge_buffers_cost(buff_elems, elem_size,
                                            buff_elems + i,
-                                           buff_elems + i + MERGEBUFF-1);
+                                           buff_elems + i + MERGEBUFF-1,
+                                           compare_factor);
 	lastbuff++;
       }
       total_cost+=get_merge_buffers_cost(buff_elems, elem_size,
                                          buff_elems + i,
-                                         buff_elems + maxbuffer);
+                                         buff_elems + maxbuffer,
+                                         compare_factor);
       maxbuffer= lastbuff;
     }
   }
 
   /* Simulate final merge_buff call. */
   total_cost += get_merge_buffers_cost(buff_elems, elem_size,
-                                       buff_elems, buff_elems + maxbuffer);
+                                       buff_elems, buff_elems + maxbuffer,
+                                       compare_factor);
   return total_cost;
 }
 
@@ -221,7 +249,11 @@ static double get_merge_many_buffs_cost(uint *buffer,
                 to get # bytes needed.
       nkeys     #of elements in Unique
       key_size  size of each elements in bytes
-      max_in_memory_size amount of memory Unique will be allowed to use
+      max_in_memory_size   amount of memory Unique will be allowed to use
+      compare_factor   used to calculate cost of one comparison
+      intersect_fl  TRUE if merge cost must include an element_count per key
+      in_memory  OUT (may be NULL) TRUE if no full tree has to be flushed,
+                     i.e. disk is not used
 
   RETURN
     Cost in disk seeks.
@@ -258,15 +290,17 @@ static double get_merge_many_buffs_cost(uint *buffer,
       these will be random seeks.
 */
 
-double Unique::get_use_cost(uint *buffer, uint nkeys, uint key_size,
-                            ulonglong max_in_memory_size)
+double Unique::get_use_cost(uint *buffer, size_t nkeys, uint key_size,
+                            ulonglong max_in_memory_size,
+                            uint compare_factor,
+                            bool intersect_fl, bool *in_memory)
 {
-  ulong max_elements_in_tree;
-  ulong last_tree_elems;
+  size_t max_elements_in_tree;
+  size_t last_tree_elems;
   int   n_full_trees; /* number of trees in unique - 1 */
   double result;
 
-  max_elements_in_tree= ((ulong) max_in_memory_size /
+  max_elements_in_tree= ((size_t) max_in_memory_size /
                          ALIGN_SIZE(sizeof(TREE_ELEMENT)+key_size));
 
   n_full_trees=    nkeys / max_elements_in_tree;
@@ -276,11 +310,15 @@ double Unique::get_use_cost(uint *buffer, uint nkeys, uint key_size,
   result= 2*log2_n_fact(last_tree_elems + 1.0);
   if (n_full_trees)
     result+= n_full_trees * log2_n_fact(max_elements_in_tree + 1.0);
-  result /= TIME_FOR_COMPARE_ROWID;
+  result /= compare_factor;
+
+  DBUG_PRINT("info",("unique trees sizes: %u=%u*%u + %u", (uint)nkeys,
+                     (uint)n_full_trees, 
+                     (uint)(n_full_trees?max_elements_in_tree:0),
+                     (uint)last_tree_elems));
 
-  DBUG_PRINT("info",("unique trees sizes: %u=%u*%lu + %lu", nkeys,
-                     n_full_trees, n_full_trees?max_elements_in_tree:0,
-                     last_tree_elems));
+  if (in_memory)
+    *in_memory= !n_full_trees;
 
   if (!n_full_trees)
     return result;
@@ -295,12 +333,12 @@ double Unique::get_use_cost(uint *buffer, uint nkeys, uint key_size,
   result += DISK_SEEK_BASE_COST * ceil(((double) key_size)*last_tree_elems / IO_SIZE);
 
   /* Cost of merge */
+  if (intersect_fl)
+    key_size+= sizeof(element_count);
   double merge_cost= get_merge_many_buffs_cost(buffer, n_full_trees,
                                                max_elements_in_tree,
-                                               last_tree_elems, key_size);
-  if (merge_cost < 0.0)
-    return merge_cost;
-
+                                               last_tree_elems, key_size,
+                                               compare_factor);
   result += merge_cost;
   /*
     Add cost of reading the resulting sequence, assuming there were no
@@ -327,7 +365,10 @@ bool Unique::flush()
   file_ptr.count=tree.elements_in_tree;
   file_ptr.file_pos=my_b_tell(&file);
 
-  if (tree_walk(&tree, (tree_walk_action) unique_write_to_file,
+  tree_walk_action action= min_dupl_count ?
+		           (tree_walk_action) unique_write_to_file_with_count :
+		           (tree_walk_action) unique_write_to_file;
+  if (tree_walk(&tree, action,
 		(void*) this, left_root_right) ||
       insert_dynamic(&file_ptrs, (uchar*) &file_ptr))
     return 1;
@@ -357,6 +398,7 @@ Unique::reset()
     reinit_io_cache(&file, WRITE_CACHE, 0L, 0, 1);
   }
   elements= 0;
+  tree.flag= 0;
 }
 
 /*
@@ -423,7 +465,7 @@ static bool merge_walk(uchar *merge_buffer, ulong merge_buffer_size,
   if (end <= begin ||
       merge_buffer_size < (ulong) (key_length * (end - begin + 1)) ||
       init_queue(&queue, (uint) (end - begin), offsetof(BUFFPEK, key), 0,
-                 buffpek_compare, &compare_context))
+                 buffpek_compare, &compare_context, 0, 0))
     return 1;
   /* we need space for one key when a piece of merge buffer is re-read */
   merge_buffer_size-= key_length;
@@ -468,7 +510,7 @@ static bool merge_walk(uchar *merge_buffer, ulong merge_buffer_size,
     */
     top->key+= key_length;
     if (--top->mem_count)
-      queue_replaced(&queue);
+      queue_replace_top(&queue);
     else /* next piece should be read */
     {
       /* save old_key not to overwrite it in read_to_buffer */
@@ -478,14 +520,14 @@ static bool merge_walk(uchar *merge_buffer, ulong merge_buffer_size,
       if (bytes_read == (uint) (-1))
         goto end;
       else if (bytes_read > 0)      /* top->key, top->mem_count are reset */
-        queue_replaced(&queue);     /* in read_to_buffer */
+        queue_replace_top(&queue);             /* in read_to_buffer */
       else
       {
         /*
           Tree for old 'top' element is empty: remove it from the queue and
           give all its memory to the nearest tree.
         */
-        queue_remove(&queue, 0);
+        queue_remove_top(&queue);
         reuse_freed_buff(&queue, top, key_length);
       }
     }
@@ -576,15 +618,19 @@ bool Unique::get(TABLE *table)
 {
   SORTPARAM sort_param;
   table->sort.found_records=elements+tree.elements_in_tree;
-
   if (my_b_tell(&file) == 0)
   {
     /* Whole tree is in memory;  Don't use disk if you don't need to */
     if ((record_pointers=table->sort.record_pointers= (uchar*)
 	 my_malloc(size * tree.elements_in_tree, MYF(0))))
     {
-      (void) tree_walk(&tree, (tree_walk_action) unique_write_to_ptrs,
+      tree_walk_action action= min_dupl_count ?
+		         (tree_walk_action) unique_intersect_write_to_ptrs :
+		         (tree_walk_action) unique_write_to_ptrs;
+      filtered_out_elems= 0;
+      (void) tree_walk(&tree, action,
 		       this, left_root_right);
+      table->sort.found_records-= filtered_out_elems;
       return 0;
     }
   }
@@ -614,7 +660,9 @@ bool Unique::get(TABLE *table)
   sort_param.max_rows= elements;
   sort_param.sort_form=table;
   sort_param.rec_length= sort_param.sort_length= sort_param.ref_length=
-    size;
+   full_size;
+  sort_param.min_dupl_count= min_dupl_count;
+  sort_param.res_length= 0;
   sort_param.keys= (uint) (max_in_memory_size / sort_param.sort_length);
   sort_param.not_killable=1;
 
@@ -635,8 +683,9 @@ bool Unique::get(TABLE *table)
   if (flush_io_cache(&file) ||
       reinit_io_cache(&file,READ_CACHE,0L,0,0))
     goto err;
-  if (merge_buffers(&sort_param, &file, outfile, sort_buffer, file_ptr,
-		    file_ptr, file_ptr+maxbuffer,0))
+  sort_param.res_length= sort_param.rec_length-
+                         (min_dupl_count ? sizeof(min_dupl_count) : 0);
+  if (merge_index(&sort_param, sort_buffer, file_ptr, maxbuffer, &file, outfile))
     goto err;
   error=0;
 err:
@@ -651,3 +700,5 @@ err:
   outfile->end_of_file=save_pos;
   return error;
 }
+
+
diff --git a/sql/unireg.h b/sql/unireg.h
index 57a9038d5f7..ccdbb650485 100644
--- a/sql/unireg.h
+++ b/sql/unireg.h
@@ -89,7 +89,7 @@
 #define MAX_SELECT_NESTING (sizeof(nesting_map)*8-1)
 
 #define MAX_SORT_MEMORY (2048*1024-MALLOC_OVERHEAD)
-#define MIN_SORT_MEMORY (32*1024-MALLOC_OVERHEAD)
+#define MIN_SORT_MEMORY (1024-MALLOC_OVERHEAD)
 
 /* Memory allocated when parsing a statement / saving a statement */
 #define MEM_ROOT_BLOCK_SIZE       8192